Add StatelessUtf8Encoder to support caller provided buffer for output

2025-07-12 07:41:42 +00:00 · 2022-02-14 22:26:01 +01:00 · 2022-02-14 22:26:01 +01:00 · c044bef6a7
commit c044bef6a7
parent b617d0aec4
4 changed files with 105 additions and 72 deletions
--- a/apps/opencs/editor.cpp
+++ b/apps/opencs/editor.cpp
@ -370,7 +370,7 @@ int CS::Editor::run()
    else
    {
        ESM::ESMReader fileReader;
-        ToUTF8::Utf8Encoder encoder = ToUTF8::calculateEncoding(mEncodingName);
+        ToUTF8::Utf8Encoder encoder(ToUTF8::calculateEncoding(mEncodingName));
        fileReader.setEncoder(&encoder);
        fileReader.open(mFileToLoad.string());
--- a/components/contentselector/model/contentmodel.cpp
+++ b/components/contentselector/model/contentmodel.cpp
@ -445,8 +445,7 @@ void ContentSelectorModel::ContentModel::addFiles(const QString &path)
        try {
            ESM::ESMReader fileReader;
-            ToUTF8::Utf8Encoder encoder =
+            ToUTF8::Utf8Encoder encoder(ToUTF8::calculateEncoding(mEncoding.toStdString()));
            ToUTF8::calculateEncoding(mEncoding.toStdString());
            fileReader.setEncoder(&encoder);
            fileReader.open(std::string(dir.absoluteFilePath(path2).toUtf8().constData()));
--- a/components/to_utf8/to_utf8.cpp
+++ b/components/to_utf8/to_utf8.cpp
@ -51,42 +51,52 @@ namespace
    {
        return std::find_if(input.begin(), input.end(), [] (unsigned char v) { return v == 0 || v >= 128; });
    }
 }
-Utf8Encoder::Utf8Encoder(const FromType sourceEncoding):
+    std::basic_string_view<signed char> getTranslationArray(FromType sourceEncoding)
    mOutput(50*1024)
 {
    switch (sourceEncoding)
    {
-        case ToUTF8::WINDOWS_1252:
+        switch (sourceEncoding)
        {
-            translationArray = ToUTF8::windows_1252;
+            case ToUTF8::WINDOWS_1252:
-            break;
+                return ToUTF8::windows_1252;
-        }
+            case ToUTF8::WINDOWS_1250:
-        case ToUTF8::WINDOWS_1250:
+                return ToUTF8::windows_1250;
-        {
+            case ToUTF8::WINDOWS_1251:
-            translationArray = ToUTF8::windows_1250;
+                return ToUTF8::windows_1251;
-            break;
+            case ToUTF8::CP437:
-        }
+                return ToUTF8::cp437;
        case ToUTF8::WINDOWS_1251:
        {
            translationArray = ToUTF8::windows_1251;
            break;
        }
        case ToUTF8::CP437:
        {
            translationArray = ToUTF8::cp437;
            break;
        }
        throw std::logic_error("Invalid source encoding: " + std::to_string(sourceEncoding));
    }
-        default:
+    // Make sure the output buffer is large enough for 'size' bytes,
    // including a terminating zero after it.
    /// Ensures @p buffer can hold @p size bytes of output followed by a zero byte.
    /// @param size number of payload bytes about to be written at the start of the buffer.
    /// @param bufferAllocationPolicy how far to grow the buffer when it is too small.
    /// @param buffer caller provided storage; it is only ever grown here, never shrunk.
    void resize(std::size_t size, BufferAllocationPolicy bufferAllocationPolicy, std::string& buffer)
    {
        if (buffer.size() >= size)
        {
            // A reused buffer may still hold a byte of an earlier, longer result at
            // this position, so the terminator has to be written on this path too.
            // When size == buffer.size() this rewrites std::string's own terminator
            // with the same value, which is permitted.
            buffer[size] = '\0';
            return;
        }
        switch (bufferAllocationPolicy)
        {
            case BufferAllocationPolicy::FitToRequiredSize:
                // std::string keeps a zero right after its last character, so
                // buffer[size] already is the terminator after this call.
                buffer.resize(size);
                break;
            case BufferAllocationPolicy::UseGrowFactor:
                // Add some extra padding to reduce the chance of having to resize
                // again later.
                buffer.resize(3 * size);
                // And make sure the string is zero terminated
                buffer[size] = '\0';
                break;
        }
    }
 }
-std::string_view Utf8Encoder::getUtf8(std::string_view input)
+StatelessUtf8Encoder::StatelessUtf8Encoder(FromType sourceEncoding)
    : mTranslationArray(getTranslationArray(sourceEncoding))
 {
 }
 std::string_view StatelessUtf8Encoder::getUtf8(std::string_view input, BufferAllocationPolicy bufferAllocationPolicy, std::string& buffer) const
 {
    if (input.empty())
        return input;
@ -106,24 +116,24 @@ std::string_view Utf8Encoder::getUtf8(std::string_view input)
        return std::string_view(input.data(), outlen);
    // Make sure the output is large enough
-    resize(outlen);
+    resize(outlen, bufferAllocationPolicy, buffer);
-    char *out = &mOutput[0];
+    char *out = buffer.data();
    // Translate
    for (auto it = input.begin(); it != input.end() && *it != 0; ++it)
        copyFromArray(*it, out);
    // Make sure that we wrote the correct number of bytes
-    assert((out-&mOutput[0]) == (int)outlen);
+    assert((out - buffer.data()) == (int)outlen);
    // And make extra sure the output is null terminated
-    assert(mOutput.size() > outlen);
+    assert(buffer.size() >= outlen);
-    assert(mOutput[outlen] == 0);
+    assert(buffer[outlen] == 0);
-    return std::string_view(mOutput.data(), outlen);
+    return std::string_view(buffer.data(), outlen);
 }
-std::string_view Utf8Encoder::getLegacyEnc(std::string_view input)
+std::string_view StatelessUtf8Encoder::getLegacyEnc(std::string_view input, BufferAllocationPolicy bufferAllocationPolicy, std::string& buffer) const
 {
    if (input.empty())
        return input;
@ -143,34 +153,21 @@ std::string_view Utf8Encoder::getLegacyEnc(std::string_view input)
        return std::string_view(input.data(), outlen);
    // Make sure the output is large enough
-    resize(outlen);
+    resize(outlen, bufferAllocationPolicy, buffer);
-    char *out = &mOutput[0];
+    char *out = buffer.data();
    // Translate
    for (auto it = input.begin(); it != input.end() && *it != 0;)
        copyFromArrayLegacyEnc(it, input.end(), out);
    // Make sure that we wrote the correct number of bytes
-    assert((out-&mOutput[0]) == (int)outlen);
+    assert((out - buffer.data()) == static_cast<int>(outlen));
    // And make extra sure the output is null terminated
-    assert(mOutput.size() > outlen);
+    assert(buffer.size() >= outlen);
-    assert(mOutput[outlen] == 0);
+    assert(buffer[outlen] == 0);
-    return std::string_view(mOutput.data(), outlen);
+    return std::string_view(buffer.data(), outlen);
 }
 // Guarantee that mOutput has room for 'size' bytes plus the terminating
 // zero that is stored right behind them.
 void Utf8Encoder::resize(size_t size)
 {
    const bool tooSmall = !(mOutput.size() > size);
    if (tooSmall)
    {
        // Over-allocate (three times the request) to reduce the chance of
        // having to grow again on a later call.
        mOutput.resize(3 * size);
    }
    // Terminate the output at the requested length.
    mOutput[size] = 0;
 }
 /** Get the total length needed to decode the given string with
@ -183,7 +180,7 @@ void Utf8Encoder::resize(size_t size)
  is the case, then the ascii parameter is set to true, and the
  caller can optimize for this case.
 */
-std::pair<std::size_t, bool> Utf8Encoder::getLength(std::string_view input) const
+std::pair<std::size_t, bool> StatelessUtf8Encoder::getLength(std::string_view input) const
 {
    // Do away with the ascii part of the string first (this is almost
    // always the entire string.)
@ -201,7 +198,7 @@ std::pair<std::size_t, bool> Utf8Encoder::getLength(std::string_view input) cons
    {
        // Find the translated length of this character in the
        // lookup table.
-        len += translationArray[static_cast<unsigned char>(*it) * 6];
+        len += mTranslationArray[static_cast<unsigned char>(*it) * 6];
        ++it;
    }
    while (it != input.end() && *it != 0);
@ -211,7 +208,7 @@ std::pair<std::size_t, bool> Utf8Encoder::getLength(std::string_view input) cons
 // Translate one character 'ch' using the translation array 'arr', and
 // advance the output pointer accordingly.
-void Utf8Encoder::copyFromArray(unsigned char ch, char* &out) const
+void StatelessUtf8Encoder::copyFromArray(unsigned char ch, char* &out) const
 {
    // Optimize for ASCII values
    if (ch < 128)
@ -220,13 +217,13 @@ void Utf8Encoder::copyFromArray(unsigned char ch, char* &out) const
        return;
    }
-    const signed char *in = translationArray + ch*6;
+    const signed char *in = &mTranslationArray[ch * 6];
    int len = *(in++);
    memcpy(out, in, len);
    out += len;
 }
-std::pair<std::size_t, bool> Utf8Encoder::getLengthLegacyEnc(std::string_view input) const
+std::pair<std::size_t, bool> StatelessUtf8Encoder::getLengthLegacyEnc(std::string_view input) const
 {
    // Do away with the ascii part of the string first (this is almost
    // always the entire string.)
@ -271,7 +268,7 @@ std::pair<std::size_t, bool> Utf8Encoder::getLengthLegacyEnc(std::string_view in
    return {len, false};
 }
-void Utf8Encoder::copyFromArrayLegacyEnc(std::string_view::iterator& chp, std::string_view::iterator end, char* &out) const
+void StatelessUtf8Encoder::copyFromArrayLegacyEnc(std::string_view::iterator& chp, std::string_view::iterator end, char* &out) const
 {
    unsigned char ch = *(chp++);
    // Optimize for ASCII values
@ -316,7 +313,7 @@ void Utf8Encoder::copyFromArrayLegacyEnc(std::string_view::iterator& chp, std::s
    for (int i = 128; i < 256; i++)
    {
-        unsigned char b1 = translationArray[i*6 + 1], b2 = translationArray[i*6 + 2], b3 = translationArray[i*6 + 3];
+        unsigned char b1 = mTranslationArray[i*6 + 1], b2 = mTranslationArray[i*6 + 2], b3 = mTranslationArray[i*6 + 3];
        if (b1 == ch && b2 == ch2 && (len != 3 || b3 == ch3))
        {
            *(out++) = (char)i;
@ -329,6 +326,22 @@ void Utf8Encoder::copyFromArrayLegacyEnc(std::string_view::iterator& chp, std::s
    *(out++) = ch; // Could not find glyph, just put whatever
 }
 Utf8Encoder::Utf8Encoder(FromType sourceEncoding)
    : mBuffer(50 * 1024, '\0')
    , mImpl(sourceEncoding)
 {
 }
 std::string_view Utf8Encoder::getUtf8(std::string_view input)
 {
    return mImpl.getUtf8(input, BufferAllocationPolicy::UseGrowFactor, mBuffer);
 }
 std::string_view Utf8Encoder::getLegacyEnc(std::string_view input)
 {
    return mImpl.getLegacyEnc(input, BufferAllocationPolicy::UseGrowFactor, mBuffer);
 }
 ToUTF8::FromType ToUTF8::calculateEncoding(const std::string& encodingName)
 {
    if (encodingName == "win1250")
--- a/components/to_utf8/to_utf8.hpp
+++ b/components/to_utf8/to_utf8.hpp
@ -18,34 +18,55 @@ namespace ToUTF8
        CP437           // Used for fonts (*.fnt) if data files encoding is 1252. Otherwise, uses the same encoding as the data files.
    };
    /// Selects how a caller provided output buffer is grown when it is too small
    /// for a conversion result.
    enum class BufferAllocationPolicy
    {
        /// Grow the buffer to exactly the required size.
        FitToRequiredSize,
        /// Grow the buffer to three times the required size so that later
        /// conversions are less likely to need another resize.
        UseGrowFactor,
    };
    FromType calculateEncoding(const std::string& encodingName);
    std::string encodingUsingMessage(const std::string& encodingName);
-    // class
+    class StatelessUtf8Encoder
    {
        public:
            explicit StatelessUtf8Encoder(FromType sourceEncoding);
            /// Convert to UTF8 from the previously given code page.
            /// Returns a view to the passed buffer, which is resized to fit the output if it's too small.
            std::string_view getUtf8(std::string_view input, BufferAllocationPolicy bufferAllocationPolicy, std::string& buffer) const;
            /// Convert from UTF-8 to sourceEncoding.
            /// Returns a view to the passed buffer, which is resized to fit the output if it's too small.
            std::string_view getLegacyEnc(std::string_view input, BufferAllocationPolicy bufferAllocationPolicy, std::string& buffer) const;
        private:
            inline std::pair<std::size_t, bool> getLength(std::string_view input) const;
            inline void copyFromArray(unsigned char chp, char* &out) const;
            inline std::pair<std::size_t, bool> getLengthLegacyEnc(std::string_view input) const;
            inline void copyFromArrayLegacyEnc(std::string_view::iterator& chp, std::string_view::iterator end, char* &out) const;
            const std::basic_string_view<signed char> mTranslationArray;
    };
    class Utf8Encoder
    {
        public:
-            Utf8Encoder(FromType sourceEncoding);
+            explicit Utf8Encoder(FromType sourceEncoding);
            /// Convert to UTF8 from the previously given code page.
            /// Returns a view to the internal buffer, invalidated by the next getUtf8 or getLegacyEnc call, if input
            /// is not an ASCII-only string. Otherwise returns a view to the input.
            std::string_view getUtf8(std::string_view input);
            /// Convert from UTF-8 to sourceEncoding.
            /// Returns a view to the internal buffer, invalidated by the next getUtf8 or getLegacyEnc call, if input
            /// is not an ASCII-only string. Otherwise returns a view to the input.
            std::string_view getLegacyEnc(std::string_view input);
        private:
-            inline void resize(std::size_t size);
+            std::string mBuffer;
-            inline std::pair<std::size_t, bool> getLength(std::string_view input) const;
+            StatelessUtf8Encoder mImpl;
            inline void copyFromArray(unsigned char chp, char* &out) const;
            inline std::pair<std::size_t, bool> getLengthLegacyEnc(std::string_view input) const;
            inline void copyFromArrayLegacyEnc(std::string_view::iterator& chp, std::string_view::iterator end, char* &out) const;
            std::vector<char> mOutput;
            const signed char* translationArray;
    };
 }