Support UTF-8 in StringRefId::toDebugString

2025-12-15 06:43:06 +00:00 · 2023-06-01 11:35:34 +02:00 · 2023-06-01 11:35:34 +02:00 · 78b3f7288a
commit 78b3f7288a
parent 8e3e351015
3 changed files with 67 additions and 16 deletions
--- a/apps/openmw_test_suite/esm/testrefid.cpp
+++ b/apps/openmw_test_suite/esm/testrefid.cpp
@ -287,7 +287,11 @@ namespace ESM
            { RefId(), "Empty{}" },
            { RefId::stringRefId("foo"), "\"foo\"" },
            { RefId::stringRefId("BAR"), "\"BAR\"" },
-            { RefId::stringRefId(std::string({ 'a', 0, -1, '\n', '\t' })), "\"a\\x0\\xFF\\xA\\x9\"" },
+            { RefId::stringRefId(std::string({ 'a', 0, -1, '\n', '\t' })), "\"a\\x0\\xff\\xa\\x9\"" },
            { RefId::stringRefId("Логово дракона"), "\"Логово дракона\"" },
            { RefId::stringRefId("\xd0\x9b"), "\"Л\"" },
            { RefId::stringRefId("\xff\x9b"), "\"\\xff\\x9b\"" },
            { RefId::stringRefId("\xd0\xd0"), "\"\\xd0\\xd0\"" },
            { RefId::formIdRefId({ .mIndex = 42, .mContentFile = 0 }), "FormId:0x2a" },
            { RefId::formIdRefId({ .mIndex = 0xffffff, .mContentFile = std::numeric_limits<std::int32_t>::min() }),
                "FormId:0xff80000000ffffff" },
--- a/components/esm/stringrefid.cpp
+++ b/components/esm/stringrefid.cpp
@ -1,13 +1,17 @@
 #include "stringrefid.hpp"
 #include "serializerefid.hpp"
 #include <charconv>
 #include <iomanip>
 #include <mutex>
 #include <ostream>
 #include <sstream>
 #include <system_error>
 #include <unordered_set>
 #include "components/misc/guarded.hpp"
 #include "components/misc/strings/algorithm.hpp"
 #include "components/misc/utf8stream.hpp"
 namespace ESM
 {
@ -26,6 +30,18 @@ namespace ESM
                it = locked->emplace(id).first;
            return &*it;
        }
        // Appends a hexadecimal escape for one byte to result: the two characters "\x"
        // followed by the base-16 digits of value as produced by std::to_chars
        // (lowercase, no leading zeros).
        // Throws std::system_error if std::to_chars reports a failure.
        // NOTE(review): getHexIntegralSize is presumed to return the exact number of hex
        // digits std::to_chars will write for value; confirm against serializerefid.hpp.
        void addHex(unsigned char value, std::string& result)
        {
            const std::size_t offset = result.size();
            // Grow the string first so the digits can be written in place.
            result.resize(offset + 2 + getHexIntegralSize(value));
            char* const out = result.data() + offset;
            out[0] = '\\';
            out[1] = 'x';
            const std::to_chars_result written = std::to_chars(out + 2, result.data() + result.size(), value, 16);
            if (written.ec != std::errc())
                throw std::system_error(std::make_error_code(written.ec));
        }
    }
    StringRefId::StringRefId()
@ -60,20 +76,43 @@ namespace ESM
    std::ostream& operator<<(std::ostream& stream, StringRefId value)
    {
-        stream << '"';
+        return stream << value.toDebugString();
        for (char c : *value.mValue)
            if (std::isprint(c) && c != '\t' && c != '\n' && c != '\r')
                stream << c;
            else
                stream << "\\x" << std::hex << std::uppercase << static_cast<unsigned>(static_cast<unsigned char>(c));
        return stream << '"';
    }
    // Builds a double-quoted, human-readable form of the referenced string.
    //  - Printable ASCII bytes are copied as is.
    //  - Other ASCII bytes (control characters, including tab, newline and carriage return)
    //    are written as "\x" hexadecimal escapes via addHex.
    //  - Multi-byte sequences accepted by Utf8Stream::decode are copied verbatim.
    //  - Bytes starting a sequence that Utf8Stream::decode rejects are written as hexadecimal
    //    escapes, after which scanning resumes with the following bytes.
    std::string StringRefId::toDebugString() const
    {
        std::string result;
        // Two extra characters for the surrounding quotes; escapes may grow the string further.
        result.reserve(2 + mValue->size());
        result.push_back('"');
        const unsigned char* ptr = reinterpret_cast<const unsigned char*>(mValue->data());
        const unsigned char* const end = reinterpret_cast<const unsigned char*>(mValue->data() + mValue->size());
        while (ptr != end)
        {
            if (Utf8Stream::isAscii(*ptr))
            {
                if (std::isprint(*ptr) && *ptr != '\t' && *ptr != '\n' && *ptr != '\r')
                    result.push_back(*ptr);
                else
                    addHex(*ptr, result);
                ++ptr;
                continue;
            }
            const auto [octets, first] = Utf8Stream::getOctetCount(*ptr);
            const auto [chr, next] = Utf8Stream::decode(ptr + 1, end, first, octets);
            if (chr == Utf8Stream::sBadChar())
            {
                // The stop position has to be fixed before ptr starts moving: a bound that is
                // recomputed from the advancing ptr is only ever reached at the end of the
                // string, which would escape every remaining byte, valid text included.
                // Clamp to the bytes that are left and always consume at least one byte so
                // the outer loop is guaranteed to make progress.
                const std::size_t remaining = static_cast<std::size_t>(end - ptr);
                const std::size_t count = std::max<std::size_t>(1, std::min<std::size_t>(octets, remaining));
                const unsigned char* const stop = ptr + count;
                while (ptr != stop)
                {
                    addHex(*ptr, result);
                    ++ptr;
                }
                continue;
            }
            // Valid sequence: copy the lead byte together with its continuation bytes.
            result.append(ptr, next);
            ptr = next;
        }
        result.push_back('"');
        return result;
    }
    bool StringRefId::startsWith(std::string_view prefix) const
--- a/components/misc/utf8stream.hpp
+++ b/components/misc/utf8stream.hpp
@ -1,6 +1,7 @@
 #ifndef MISC_UTF8ITER_HPP
 #define MISC_UTF8ITER_HPP
 #include <cstdint>
 #include <cstring>
 #include <string>
 #include <string_view>
@ -63,9 +64,11 @@ public:
        return val;
    }
    // Returns true when value is a single-byte (7-bit) code unit, i.e. its top bit is clear.
    static bool isAscii(unsigned char value) { return value < 0x80; }
    static std::pair<UnicodeChar, Point> decode(Point cur, Point end)
    {
-        if ((*cur & 0x80) == 0)
+        if (isAscii(*cur))
        {
            UnicodeChar chr = *cur++;
@ -75,8 +78,13 @@ public:
        int octets;
        UnicodeChar chr;
-        std::tie(octets, chr) = octet_count(*cur++);
+        std::tie(octets, chr) = getOctetCount(*cur++);
        return decode(cur, end, chr, octets);
    }
    static std::pair<UnicodeChar, Point> decode(Point cur, Point end, UnicodeChar chr, std::size_t octets)
    {
        if (octets > 5)
            return std::make_pair(sBadChar(), cur);
@ -161,10 +169,9 @@ public:
        return out;
    }
-private:
+    static std::pair<std::size_t, UnicodeChar> getOctetCount(unsigned char octet)
    static std::pair<int, UnicodeChar> octet_count(unsigned char octet)
    {
-        int octets;
+        std::size_t octets;
        unsigned char mark = 0xC0;
        unsigned char mask = 0xE0;
@ -181,6 +188,7 @@ private:
        return std::make_pair(octets, octet & ~mask);
    }
 private:
    void next() { std::tie(val, nxt) = decode(nxt, end); }
    Point cur;