Merge branch 'utf8_encoder_string_view' into 'master'

Use std::string_view for argument and return type of Utf8Encoder functions. See merge request OpenMW/openmw!1652
2026-02-11 10:38:29 +00:00 · 2022-02-12 18:07:26 +00:00 · 2022-02-12 18:07:26 +00:00 · cd8967e265
commit cd8967e265
parent 17ecaf177a c75e938c46
5 changed files with 45 additions and 37 deletions
--- a/components/esm3/esmreader.cpp
+++ b/components/esm3/esmreader.cpp
@ -320,7 +320,7 @@ std::string ESMReader::getString(int size)

    // Convert to UTF8 and return
    if (mEncoder)
-        return mEncoder->getUtf8(ptr, size);
+        return std::string(mEncoder->getUtf8(std::string_view(ptr, size)));

    return std::string (ptr, size);
 }
--- a/components/esm3/esmwriter.cpp
+++ b/components/esm3/esmwriter.cpp
@ -193,9 +193,9 @@ namespace ESM
        else
        {
            // Convert to UTF8 and return
-            std::string string = mEncoder ? mEncoder->getLegacyEnc(data) : data;
+            const std::string_view string = mEncoder != nullptr ? mEncoder->getLegacyEnc(data) : data;

-            write(string.c_str(), string.size());
+            write(string.data(), string.size());
        }
    }

--- a/components/fontloader/fontloader.cpp
+++ b/components/fontloader/fontloader.cpp
@ -1,6 +1,8 @@
 #include "fontloader.hpp"

 #include <stdexcept>
+#include <string_view>
+#include <array>

 #include <osg/Image>

@ -26,7 +28,7 @@

 namespace
 {
-    unsigned long utf8ToUnicode(const std::string& utf8)
+    unsigned long utf8ToUnicode(std::string_view utf8)
    {
        size_t i = 0;
        unsigned long unicode;
@ -116,16 +118,21 @@ namespace
        }
    }

-    // getUtf8, aka the worst function ever written.
-    // This includes various hacks for dealing with Morrowind's .fnt files that are *mostly*
+    // getUnicode includes various hacks for dealing with Morrowind's .fnt files that are *mostly*
    // in the expected win12XX encoding, but also have randomly swapped characters sometimes.
    // Looks like the Morrowind developers found standard encodings too boring and threw in some twists for fun.
-    std::string getUtf8 (unsigned char c, ToUTF8::Utf8Encoder& encoder, ToUTF8::FromType encoding)
+    unsigned long getUnicode(unsigned char c, ToUTF8::Utf8Encoder& encoder, ToUTF8::FromType encoding)
    {
        if (encoding == ToUTF8::WINDOWS_1250) // Hack for polish font
-            return encoder.getUtf8(std::string(1, mapUtf8Char(c)));
+        {
+            const std::array<char, 2> str {static_cast<char>(mapUtf8Char(c)), '\0'};
+            return utf8ToUnicode(encoder.getUtf8(std::string_view(str.data(), 1)));
+        }
        else
-            return encoder.getUtf8(std::string(1, c));
+        {
+            const std::array<char, 2> str {static_cast<char>(c), '\0'};
+            return utf8ToUnicode(encoder.getUtf8(std::string_view(str.data(), 1)));
+        }
    }

    [[noreturn]] void fail (Files::IStreamPtr file, const std::string& fileName, const std::string& message)
@ -355,7 +362,7 @@ namespace Gui
            float h  = data[i].bottom_left.y*height - y1;

            ToUTF8::Utf8Encoder encoder(mEncoding);
-            unsigned long unicodeVal = utf8ToUnicode(getUtf8(i, encoder, mEncoding));
+            unsigned long unicodeVal = getUnicode(i, encoder, mEncoding);

            MyGUI::xml::ElementPtr code = codes->createChild("Code");
            code->addAttribute("index", unicodeVal);
--- a/components/to_utf8/to_utf8.cpp
+++ b/components/to_utf8/to_utf8.cpp
@ -77,12 +77,15 @@ Utf8Encoder::Utf8Encoder(const FromType sourceEncoding):
    }
 }

-std::string Utf8Encoder::getUtf8(const char* input, size_t size)
+std::string_view Utf8Encoder::getUtf8(std::string_view input)
 {
+    if (input.empty())
+        return input;
+
    // Double check that the input string stops at some point (it might
    // contain zero terminators before this, inside its own data, which
    // is also ok.)
-    assert(input[size] == 0);
+    assert(input[input.size()] == 0);

    // Note: The rest of this function is designed for single-character
    // input encodings only. It also assumes that the input encoding
@ -93,19 +96,19 @@ std::string Utf8Encoder::getUtf8(const char* input, size_t size)
    // Compute output length, and check for pure ascii input at the same
    // time.
    bool ascii;
-    size_t outlen = getLength(input, ascii);
+    size_t outlen = getLength(input.data(), ascii);

    // If we're pure ascii, then don't bother converting anything.
    if(ascii)
-        return std::string(input, outlen);
+        return std::string_view(input.data(), outlen);

    // Make sure the output is large enough
    resize(outlen);
    char *out = &mOutput[0];

    // Translate
-    while (*input)
-        copyFromArray(*(input++), out);
+    for (const char* ptr = input.data(); *ptr;)
+        copyFromArray(*(ptr++), out);

    // Make sure that we wrote the correct number of bytes
    assert((out-&mOutput[0]) == (int)outlen);
@ -114,16 +117,18 @@ std::string Utf8Encoder::getUtf8(const char* input, size_t size)
    assert(mOutput.size() > outlen);
    assert(mOutput[outlen] == 0);

-    // Return a string
-    return std::string(&mOutput[0], outlen);
+    return std::string_view(mOutput.data(), outlen);
 }

-std::string Utf8Encoder::getLegacyEnc(const char *input, size_t size)
+std::string_view Utf8Encoder::getLegacyEnc(std::string_view input)
 {
+    if (input.empty())
+        return input;
+
    // Double check that the input string stops at some point (it might
    // contain zero terminators before this, inside its own data, which
    // is also ok.)
-    assert(input[size] == 0);
+    assert(input[input.size()] == 0);

    // TODO: The rest of this function is designed for single-character
    // input encodings only. It also assumes that the input the input
@ -134,19 +139,19 @@ std::string Utf8Encoder::getLegacyEnc(const char *input, size_t size)
    // Compute output length, and check for pure ascii input at the same
    // time.
    bool ascii;
-    size_t outlen = getLength2(input, ascii);
+    size_t outlen = getLength2(input.data(), ascii);

    // If we're pure ascii, then don't bother converting anything.
    if(ascii)
-        return std::string(input, outlen);
+        return std::string_view(input.data(), outlen);

    // Make sure the output is large enough
    resize(outlen);
    char *out = &mOutput[0];

    // Translate
-    while(*input)
-        copyFromArray2(input, out);
+    for (const char* ptr = input.data(); *ptr;)
+        copyFromArray2(ptr, out);

    // Make sure that we wrote the correct number of bytes
    assert((out-&mOutput[0]) == (int)outlen);
@ -155,8 +160,7 @@ std::string Utf8Encoder::getLegacyEnc(const char *input, size_t size)
    assert(mOutput.size() > outlen);
    assert(mOutput[outlen] == 0);

-    // Return a string
-    return std::string(&mOutput[0], outlen);
+    return std::string_view(mOutput.data(), outlen);
 }

 // Make sure the output vector is large enough for 'size' bytes,
--- a/components/to_utf8/to_utf8.hpp
+++ b/components/to_utf8/to_utf8.hpp
@ -4,6 +4,7 @@
 #include <string>
 #include <cstring>
 #include <vector>
+#include <string_view>

 namespace ToUTF8
 {
@ -27,18 +28,14 @@ namespace ToUTF8
        public:
            Utf8Encoder(FromType sourceEncoding);

-            // Convert to UTF8 from the previously given code page.
-            std::string getUtf8(const char *input, size_t size);
-            inline std::string getUtf8(const std::string &str)
-            {
-                return getUtf8(str.c_str(), str.size());
-            }
+            /// Convert to UTF8 from the previously given code page.
+            /// If the input is not an ASCII-only string, returns a view to an internal buffer that is invalidated by
+            /// the next getUtf8 or getLegacyEnc call. Otherwise returns a view to the input.
+            std::string_view getUtf8(std::string_view input);

-            std::string getLegacyEnc(const char *input, size_t size);
-            inline std::string getLegacyEnc(const std::string &str)
-            {
-                return getLegacyEnc(str.c_str(), str.size());
-            }
+            /// If the input is not an ASCII-only string, returns a view to an internal buffer that is invalidated by
+            /// the next getUtf8 or getLegacyEnc call. Otherwise returns a view to the input.
+            std::string_view getLegacyEnc(std::string_view input);

        private:
            void resize(size_t size);