From c044bef6a77edeea44d35ac908841cbe49610b75 Mon Sep 17 00:00:00 2001 From: elsid Date: Mon, 14 Feb 2022 22:26:01 +0100 Subject: [PATCH] Add StatelessUtf8Encoder to support caller provided buffer for output --- apps/opencs/editor.cpp | 2 +- .../contentselector/model/contentmodel.cpp | 3 +- components/to_utf8/to_utf8.cpp | 131 ++++++++++-------- components/to_utf8/to_utf8.hpp | 41 ++++-- 4 files changed, 105 insertions(+), 72 deletions(-) diff --git a/apps/opencs/editor.cpp b/apps/opencs/editor.cpp index 1d5934fe5d..07154f6d55 100644 --- a/apps/opencs/editor.cpp +++ b/apps/opencs/editor.cpp @@ -370,7 +370,7 @@ int CS::Editor::run() else { ESM::ESMReader fileReader; - ToUTF8::Utf8Encoder encoder = ToUTF8::calculateEncoding(mEncodingName); + ToUTF8::Utf8Encoder encoder(ToUTF8::calculateEncoding(mEncodingName)); fileReader.setEncoder(&encoder); fileReader.open(mFileToLoad.string()); diff --git a/components/contentselector/model/contentmodel.cpp b/components/contentselector/model/contentmodel.cpp index 57dfe0f87e..f7cedc83a4 100644 --- a/components/contentselector/model/contentmodel.cpp +++ b/components/contentselector/model/contentmodel.cpp @@ -445,8 +445,7 @@ void ContentSelectorModel::ContentModel::addFiles(const QString &path) try { ESM::ESMReader fileReader; - ToUTF8::Utf8Encoder encoder = - ToUTF8::calculateEncoding(mEncoding.toStdString()); + ToUTF8::Utf8Encoder encoder(ToUTF8::calculateEncoding(mEncoding.toStdString())); fileReader.setEncoder(&encoder); fileReader.open(std::string(dir.absoluteFilePath(path2).toUtf8().constData())); diff --git a/components/to_utf8/to_utf8.cpp b/components/to_utf8/to_utf8.cpp index 1f0a81ad10..a193e6375d 100644 --- a/components/to_utf8/to_utf8.cpp +++ b/components/to_utf8/to_utf8.cpp @@ -51,42 +51,52 @@ namespace { return std::find_if(input.begin(), input.end(), [] (unsigned char v) { return v == 0 || v >= 128; }); } -} -Utf8Encoder::Utf8Encoder(const FromType sourceEncoding): - mOutput(50*1024) -{ - switch (sourceEncoding) + std::basic_string_view getTranslationArray(FromType sourceEncoding) { - case ToUTF8::WINDOWS_1252: + switch (sourceEncoding) { - translationArray = ToUTF8::windows_1252; - break; - } - case ToUTF8::WINDOWS_1250: - { - translationArray = ToUTF8::windows_1250; - break; - } - case ToUTF8::WINDOWS_1251: - { - translationArray = ToUTF8::windows_1251; - break; - } - case ToUTF8::CP437: - { - translationArray = ToUTF8::cp437; - break; + case ToUTF8::WINDOWS_1252: + return ToUTF8::windows_1252; + case ToUTF8::WINDOWS_1250: + return ToUTF8::windows_1250; + case ToUTF8::WINDOWS_1251: + return ToUTF8::windows_1251; + case ToUTF8::CP437: + return ToUTF8::cp437; } + throw std::logic_error("Invalid source encoding: " + std::to_string(sourceEncoding)); + } - default: + // Make sure the output vector is large enough for 'size' bytes, + // including a terminating zero after it. + void resize(std::size_t size, BufferAllocationPolicy bufferAllocationPolicy, std::string& buffer) + { + if (buffer.size() >= size) + return; + + switch (bufferAllocationPolicy) { - assert(0); + case BufferAllocationPolicy::FitToRequiredSize: + buffer.resize(size); + break; + case BufferAllocationPolicy::UseGrowFactor: + // Add some extra padding to reduce the chance of having to resize + // again later. + buffer.resize(3 * size); + // And make sure the string is zero terminated + buffer[size] = 0; + break; } } } -std::string_view Utf8Encoder::getUtf8(std::string_view input) +StatelessUtf8Encoder::StatelessUtf8Encoder(FromType sourceEncoding) + : mTranslationArray(getTranslationArray(sourceEncoding)) +{ +} + +std::string_view StatelessUtf8Encoder::getUtf8(std::string_view input, BufferAllocationPolicy bufferAllocationPolicy, std::string& buffer) const { if (input.empty()) return input; @@ -106,24 +116,24 @@ std::string_view Utf8Encoder::getUtf8(std::string_view input) return std::string_view(input.data(), outlen); // Make sure the output is large enough - resize(outlen); - char *out = &mOutput[0]; + resize(outlen, bufferAllocationPolicy, buffer); + char *out = buffer.data(); // Translate for (auto it = input.begin(); it != input.end() && *it != 0; ++it) copyFromArray(*it, out); // Make sure that we wrote the correct number of bytes - assert((out-&mOutput[0]) == (int)outlen); + assert((out - buffer.data()) == (int)outlen); // And make extra sure the output is null terminated - assert(mOutput.size() > outlen); - assert(mOutput[outlen] == 0); + assert(buffer.size() >= outlen); + assert(buffer[outlen] == 0); - return std::string_view(mOutput.data(), outlen); + return std::string_view(buffer.data(), outlen); } -std::string_view Utf8Encoder::getLegacyEnc(std::string_view input) +std::string_view StatelessUtf8Encoder::getLegacyEnc(std::string_view input, BufferAllocationPolicy bufferAllocationPolicy, std::string& buffer) const { if (input.empty()) return input; @@ -143,34 +153,21 @@ std::string_view Utf8Encoder::getLegacyEnc(std::string_view input) return std::string_view(input.data(), outlen); // Make sure the output is large enough - resize(outlen); - char *out = &mOutput[0]; + resize(outlen, bufferAllocationPolicy, buffer); + char *out = buffer.data(); // Translate for (auto it = input.begin(); it != input.end() && *it != 0;) copyFromArrayLegacyEnc(it, input.end(), out); // Make sure that we wrote the correct number of bytes - assert((out-&mOutput[0]) == (int)outlen); + assert((out - buffer.data()) == static_cast(outlen)); // And make extra sure the output is null terminated - assert(mOutput.size() > outlen); - assert(mOutput[outlen] == 0); + assert(buffer.size() >= outlen); + assert(buffer[outlen] == 0); - return std::string_view(mOutput.data(), outlen); -} - -// Make sure the output vector is large enough for 'size' bytes, -// including a terminating zero after it. -void Utf8Encoder::resize(size_t size) -{ - if (mOutput.size() <= size) - // Add some extra padding to reduce the chance of having to resize - // again later. - mOutput.resize(3*size); - - // And make sure the string is zero terminated - mOutput[size] = 0; + return std::string_view(buffer.data(), outlen); } /** Get the total length length needed to decode the given string with @@ -183,7 +180,7 @@ void Utf8Encoder::resize(size_t size) is the case, then the ascii parameter is set to true, and the caller can optimize for this case. */ -std::pair Utf8Encoder::getLength(std::string_view input) const +std::pair StatelessUtf8Encoder::getLength(std::string_view input) const { // Do away with the ascii part of the string first (this is almost // always the entire string.) @@ -201,7 +198,7 @@ std::pair Utf8Encoder::getLength(std::string_view input) cons { // Find the translated length of this character in the // lookup table. - len += translationArray[static_cast(*it) * 6]; + len += mTranslationArray[static_cast(*it) * 6]; ++it; } while (it != input.end() && *it != 0); @@ -211,7 +208,7 @@ std::pair Utf8Encoder::getLength(std::string_view input) cons // Translate one character 'ch' using the translation array 'arr', and // advance the output pointer accordingly. -void Utf8Encoder::copyFromArray(unsigned char ch, char* &out) const +void StatelessUtf8Encoder::copyFromArray(unsigned char ch, char* &out) const { // Optimize for ASCII values if (ch < 128) @@ -220,13 +217,13 @@ void Utf8Encoder::copyFromArray(unsigned char ch, char* &out) const return; } - const signed char *in = translationArray + ch*6; + const signed char *in = &mTranslationArray[ch * 6]; int len = *(in++); memcpy(out, in, len); out += len; } -std::pair Utf8Encoder::getLengthLegacyEnc(std::string_view input) const +std::pair StatelessUtf8Encoder::getLengthLegacyEnc(std::string_view input) const { // Do away with the ascii part of the string first (this is almost // always the entire string.) @@ -271,7 +268,7 @@ std::pair Utf8Encoder::getLengthLegacyEnc(std::string_view in return {len, false}; } -void Utf8Encoder::copyFromArrayLegacyEnc(std::string_view::iterator& chp, std::string_view::iterator end, char* &out) const +void StatelessUtf8Encoder::copyFromArrayLegacyEnc(std::string_view::iterator& chp, std::string_view::iterator end, char* &out) const { unsigned char ch = *(chp++); // Optimize for ASCII values @@ -316,7 +313,7 @@ void Utf8Encoder::copyFromArrayLegacyEnc(std::string_view::iterator& chp, std::s for (int i = 128; i < 256; i++) { - unsigned char b1 = translationArray[i*6 + 1], b2 = translationArray[i*6 + 2], b3 = translationArray[i*6 + 3]; + unsigned char b1 = mTranslationArray[i*6 + 1], b2 = mTranslationArray[i*6 + 2], b3 = mTranslationArray[i*6 + 3]; if (b1 == ch && b2 == ch2 && (len != 3 || b3 == ch3)) { *(out++) = (char)i; @@ -329,6 +326,22 @@ void Utf8Encoder::copyFromArrayLegacyEnc(std::string_view::iterator& chp, std::s *(out++) = ch; // Could not find glyph, just put whatever } +Utf8Encoder::Utf8Encoder(FromType sourceEncoding) + : mBuffer(50 * 1024, '\0') + , mImpl(sourceEncoding) +{ +} + +std::string_view Utf8Encoder::getUtf8(std::string_view input) +{ + return mImpl.getUtf8(input, BufferAllocationPolicy::UseGrowFactor, mBuffer); +} + +std::string_view Utf8Encoder::getLegacyEnc(std::string_view input) +{ + return mImpl.getLegacyEnc(input, BufferAllocationPolicy::UseGrowFactor, mBuffer); +} + ToUTF8::FromType ToUTF8::calculateEncoding(const std::string& encodingName) { if (encodingName == "win1250") diff --git a/components/to_utf8/to_utf8.hpp b/components/to_utf8/to_utf8.hpp index 794c9148e5..037e3ea3bf 100644 --- a/components/to_utf8/to_utf8.hpp +++ b/components/to_utf8/to_utf8.hpp @@ -18,34 +18,55 @@ namespace ToUTF8 CP437 // Used for fonts (*.fnt) if data files encoding is 1252. Otherwise, uses the same encoding as the data files. }; + enum class BufferAllocationPolicy + { + FitToRequiredSize, + UseGrowFactor, + }; + FromType calculateEncoding(const std::string& encodingName); std::string encodingUsingMessage(const std::string& encodingName); - // class + class StatelessUtf8Encoder + { + public: + explicit StatelessUtf8Encoder(FromType sourceEncoding); + + /// Convert to UTF8 from the previously given code page. + /// Returns a view to passed buffer that will be resized to fit output if it's too small. + std::string_view getUtf8(std::string_view input, BufferAllocationPolicy bufferAllocationPolicy, std::string& buffer) const; + + /// Convert from UTF-8 to sourceEncoding. + /// Returns a view to passed buffer that will be resized to fit output if it's too small. + std::string_view getLegacyEnc(std::string_view input, BufferAllocationPolicy bufferAllocationPolicy, std::string& buffer) const; + + private: + inline std::pair getLength(std::string_view input) const; + inline void copyFromArray(unsigned char chp, char* &out) const; + inline std::pair getLengthLegacyEnc(std::string_view input) const; + inline void copyFromArrayLegacyEnc(std::string_view::iterator& chp, std::string_view::iterator end, char* &out) const; + + const std::basic_string_view mTranslationArray; + }; class Utf8Encoder { public: - Utf8Encoder(FromType sourceEncoding); + explicit Utf8Encoder(FromType sourceEncoding); /// Convert to UTF8 from the previously given code page. /// Returns a view to internal buffer invalidate by next getUtf8 or getLegacyEnc call if input is not /// ASCII-only string. Otherwise returns a view to the input. std::string_view getUtf8(std::string_view input); + /// Convert from UTF-8 to sourceEncoding. /// Returns a view to internal buffer invalidate by next getUtf8 or getLegacyEnc call if input is not /// ASCII-only string. Otherwise returns a view to the input. std::string_view getLegacyEnc(std::string_view input); private: - inline void resize(std::size_t size); - inline std::pair getLength(std::string_view input) const; - inline void copyFromArray(unsigned char chp, char* &out) const; - inline std::pair getLengthLegacyEnc(std::string_view input) const; - inline void copyFromArrayLegacyEnc(std::string_view::iterator& chp, std::string_view::iterator end, char* &out) const; - - std::vector mOutput; - const signed char* translationArray; + std::string mBuffer; + StatelessUtf8Encoder mImpl; }; }