Merge branch 'luautflib' into 'master'

Lua utf-8 support Closes #6505 See merge request OpenMW/openmw!3327
2025-07-02 23:41:34 +00:00 · 2023-09-03 17:05:27 +00:00 · 2023-09-03 17:05:27 +00:00 · 0b74146b05
commit 0b74146b05
parent ccc465db27 7eb456a169
6 changed files with 331 additions and 3 deletions
--- a/components/CMakeLists.txt
+++ b/components/CMakeLists.txt
@ -42,7 +42,7 @@ list (APPEND COMPONENT_FILES "${OpenMW_BINARY_DIR}/${VERSION_CPP_FILE}")
 # source files

 add_component_dir (lua
-    luastate scriptscontainer asyncpackage utilpackage serialization configuration l10n storage
+    luastate scriptscontainer asyncpackage utilpackage serialization configuration l10n storage utf8
    shapes/box
    )

--- a/components/lua/luastate.cpp
+++ b/components/lua/luastate.cpp
@ -12,6 +12,7 @@
 #include <components/vfs/manager.hpp>

 #include "scriptscontainer.hpp"
+#include "utf8.hpp"

 namespace LuaUtil
 {
@ -51,7 +52,7 @@ namespace LuaUtil

    static const std::string safeFunctions[] = { "assert", "error", "ipairs", "next", "pairs", "pcall", "select",
        "tonumber", "tostring", "type", "unpack", "xpcall", "rawequal", "rawget", "rawset", "setmetatable" };
-    static const std::string safePackages[] = { "coroutine", "math", "string", "table" };
+    static const std::string safePackages[] = { "coroutine", "math", "string", "table", "utf8" };

    static constexpr int64_t countHookStep = 1000;

@ -181,6 +182,8 @@ namespace LuaUtil
        mSol["math"]["randomseed"](static_cast<unsigned>(std::time(nullptr)));
        mSol["math"]["randomseed"] = [] {};

+        mSol["utf8"] = LuaUtf8::initUtf8Package(mSol);
+
        mSol["writeToLog"] = [](std::string_view s) { Log(Debug::Level::Info) << s; };

        mSol["setEnvironment"]
--- a/components/lua/utf8.cpp
+++ b/components/lua/utf8.cpp
@ -0,0 +1,233 @@
+#include <algorithm>
+#include <cmath>
+#include <codecvt>
+#include <cstdint>
+#include <stdexcept>
+#include <string>
+#include <string_view>
+#include <utility>
+#include <variant>
+#include <vector>
+
+#include <components/misc/strings/format.hpp>
+
+#include "utf8.hpp"
+
+namespace
+{
+    constexpr std::string_view UTF8PATT = "[%z\x01-\x7F\xC2-\xF4][\x80-\xBF]*"; // exposed as utf8.charpattern; %z is deprecated in Lua 5.2
+    constexpr uint32_t MAXUTF = 0x7FFFFFFFu; // largest value utf8.char accepts
+    // constexpr uint32_t MAXUNICODE = 0x10FFFFu;
+
+    // Tells whether a Lua argument is absent (none) or was passed explicitly as nil.
+    inline bool isNilOrNone(const sol::stack_proxy arg)
+    {
+        const sol::type kind = arg.get_type();
+        return kind == sol::type::none || kind == sol::type::lua_nil;
+    }
+
+    // Reads the Lua argument `arg` as a number that holds an integral value.
+    //   arg  - the stack slot to read
+    //   n    - 1-based argument index, used only for the error message
+    //   name - name of the Lua function being called, used only for the error message
+    // Returns the value as a double without fractional part that is guaranteed to fit into int64_t.
+    // Throws std::runtime_error if the argument is not a number, has a fractional part, is NaN/infinite
+    // or lies outside the int64_t range.
+    inline double getInteger(const sol::stack_proxy arg, const size_t n, std::string_view name)
+    {
+        // The messages are assembled by hand so that they do not depend on a placeholder syntax.
+        const std::string prefix = "bad argument #" + std::to_string(n) + " to '" + std::string(name) + "' ";
+
+        if (!arg.is<double>())
+            throw std::runtime_error(
+                prefix + "(number expected, got " + sol::type_name(arg.lua_state(), arg.get_type()) + ")");
+
+        double integer;
+        const double fraction = std::modf(arg.get<double>(), &integer);
+
+        // NaN yields a NaN fraction (compares unequal to 0), infinity yields an infinite integral part.
+        // Callers store the result in int64_t, so anything outside [-2^63, 2^63) has to be rejected as well.
+        constexpr double int64Limit = 9223372036854775808.0; // 2^63
+        if (fraction != 0 || !(integer >= -int64Limit && integer < int64Limit))
+            throw std::runtime_error(prefix + "(number has no integer representation)");
+
+        return integer;
+    }
+
+    // Turns a negative, end-relative position into an absolute 1-based one: -1 becomes `len`
+    // (the last byte), -2 becomes `len - 1`, and so forth. A negative position that points before
+    // the start of the string becomes 0. Zero and positive positions are left untouched.
+    inline void relativePosition(int64_t& pos, const size_t len)
+    {
+        if (pos >= 0)
+            return;
+
+        const int64_t fromEnd = pos + static_cast<int64_t>(len) + 1;
+        pos = fromEnd > 0 ? fromEnd : 0;
+    }
+
+    // Decodes the UTF-8 sequence of `s` that starts at the 1-based byte position pos_byte.back().
+    //
+    // On success returns {start position in bytes, code point} and appends the start position of the
+    // following character (which may be s.size() + 1) to pos_byte.
+    // On failure returns {start position, -1} and leaves pos_byte untouched, so a caller can report
+    // pos_byte.back() as the offending position and can never advance by zero bytes.
+    //
+    // A sequence is rejected when the start position lies outside of the string, the first byte is not a
+    // valid lead byte of a 1-4 byte sequence, a continuation byte is missing, the encoding is overlong
+    // or the decoded value is above U+10FFFF.
+    std::pair<int64_t, int64_t> decodeNextUTF8Character(std::string_view s, std::vector<int64_t>& pos_byte)
+    {
+        const int64_t start = pos_byte.back();
+        const int64_t size = static_cast<int64_t>(s.size());
+        const std::pair<int64_t, int64_t> invalid{ start, -1 };
+
+        if (start < 1 || start > size)
+            return invalid;
+
+        const int64_t pos = start - 1;
+        const unsigned char ch = static_cast<unsigned char>(s[pos]);
+        int64_t codepoint = 0;
+        int64_t byteSize = 0;
+        int64_t minimum = 0; // smallest code point that really needs `byteSize` bytes
+
+        if ((ch & 0b10000000) == 0)
+        {
+            codepoint = ch;
+            byteSize = 1;
+        }
+        else if ((ch & 0b11100000) == 0b11000000)
+        {
+            codepoint = ch & 0b00011111;
+            byteSize = 2;
+            minimum = 0x80;
+        }
+        else if ((ch & 0b11110000) == 0b11100000)
+        {
+            codepoint = ch & 0b00001111;
+            byteSize = 3;
+            minimum = 0x800;
+        }
+        else if ((ch & 0b11111000) == 0b11110000)
+        {
+            codepoint = ch & 0b00000111;
+            byteSize = 4;
+            minimum = 0x10000;
+        }
+        else
+            return invalid; // stray continuation byte or a lead byte of an unsupported length
+
+        if (pos + byteSize > size)
+            return invalid; // sequence is cut off by the end of the string
+
+        // construct codepoint for non-ascii
+        for (int64_t i = 1; i < byteSize; ++i)
+        {
+            const unsigned char next = static_cast<unsigned char>(s[pos + i]);
+            if ((next & 0b11000000) != 0b10000000)
+                return invalid; // not a continuation byte
+            codepoint = (codepoint << 6) | (next & 0b00111111);
+        }
+
+        if (codepoint < minimum || codepoint > 0x10FFFF)
+            return invalid; // overlong encoding or value outside of the Unicode range
+
+        pos_byte.push_back(start + byteSize); /* the next character (if exists) starts at this byte */
+
+        return { start, codepoint };
+    }
+
+}
+
+namespace LuaUtf8
+{
+    // Creates the table that provides the Lua 5.3 style UTF-8 helpers:
+    // charpattern, char, codes, len, codepoint and offset.
+    // All positions taken and returned by these helpers are 1-based byte positions;
+    // negative positions count from the end of the string.
+    // Every helper reports misuse by throwing std::runtime_error.
+    sol::table initUtf8Package(sol::state_view& lua)
+    {
+        sol::table utf8(lua, sol::create);
+
+        utf8["charpattern"] = UTF8PATT;
+
+        // utf8.char(...): concatenates the UTF-8 encodings of all given code points (0 .. MAXUTF).
+        utf8["char"] = [](const sol::variadic_args args) -> std::string {
+            // Encoded by hand: std::wstring_convert is deprecated, depends on the width of wchar_t
+            // and cannot produce sequences for values above U+10FFFF.
+            const auto appendEncoded = [](std::string& out, const uint32_t code) {
+                if (code < 0x80)
+                {
+                    out.push_back(static_cast<char>(code));
+                    return;
+                }
+                // number of continuation bytes that follow the lead byte
+                const int extra = code < 0x800 ? 1
+                    : code < 0x10000           ? 2
+                    : code < 0x200000          ? 3
+                    : code < 0x4000000         ? 4
+                                               : 5;
+                constexpr unsigned char leadMarker[] = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
+                out.push_back(static_cast<char>(static_cast<unsigned char>(leadMarker[extra] | (code >> (6 * extra)))));
+                for (int shift = 6 * (extra - 1); shift >= 0; shift -= 6)
+                    out.push_back(static_cast<char>(static_cast<unsigned char>(0x80 | ((code >> shift) & 0x3F))));
+            };
+
+            std::string result{};
+            for (size_t i = 0; i < args.size(); ++i)
+            {
+                // compared as double so that huge values are rejected before any narrowing takes place
+                const double value = getInteger(args[i], (i + 1), "char");
+                if (value < 0 || value > MAXUTF)
+                    throw std::runtime_error(
+                        "bad argument #" + std::to_string(i + 1) + " to 'char' (value out of range)");
+
+                appendEncoded(result, static_cast<uint32_t>(value));
+            }
+            return result;
+        };
+
+        // utf8.codes(s): returns an iterator that yields (byte position, code point) for every character.
+        utf8["codes"] = [](std::string_view s) {
+            // The iterator outlives this call, so it owns a copy of the text
+            // instead of a view into memory managed by Lua.
+            return sol::as_function([text = std::string(s), pos_byte = std::vector<int64_t>{ 1 }]() mutable
+                -> sol::optional<std::pair<int64_t, int64_t>> {
+                if (pos_byte.back() <= static_cast<int64_t>(text.size()))
+                {
+                    const auto pair = decodeNextUTF8Character(text, pos_byte);
+                    if (pair.second == -1)
+                        throw std::runtime_error("Invalid UTF-8 code at position " + std::to_string(pos_byte.back()));
+
+                    return pair;
+                }
+                return sol::nullopt;
+            });
+        };
+
+        // utf8.len(s [, i [, j]]): number of characters that start between byte positions i and j.
+        // Returns nil plus the position of the first invalid byte if the text is not valid UTF-8.
+        utf8["len"] = [](std::string_view s,
+                          const sol::variadic_args args) -> std::variant<size_t, std::pair<sol::object, int64_t>> {
+            const size_t len = s.size();
+            int64_t iv = isNilOrNone(args[0]) ? 1 : getInteger(args[0], 2, "len");
+            int64_t fv = isNilOrNone(args[1]) ? -1 : getInteger(args[1], 3, "len");
+
+            relativePosition(iv, len);
+            relativePosition(fv, len);
+
+            if (iv <= 0)
+                throw std::runtime_error("bad argument #2 to 'len' (initial position out of bounds)");
+            if (fv > static_cast<int64_t>(len))
+                throw std::runtime_error("bad argument #3 to 'len' (final position out of bounds)");
+
+            if (len == 0)
+                return len;
+
+            std::vector<int64_t> pos_byte = { iv };
+
+            while (pos_byte.back() <= fv)
+            {
+                if (decodeNextUTF8Character(s, pos_byte).second == -1)
+                    return std::pair(sol::lua_nil, pos_byte.back());
+            }
+            // the last entry is the start of the character after the range, hence "- 1"
+            return pos_byte.size() - 1;
+        };
+
+        // utf8.codepoint(s [, i [, j]]): code points of all characters that start between i and j.
+        utf8["codepoint"]
+            = [](std::string_view s, const sol::variadic_args args) -> sol::as_returns_t<std::vector<int64_t>> {
+            size_t len = s.size();
+            int64_t iv = isNilOrNone(args[0]) ? 1 : getInteger(args[0], 2, "codepoint");
+            int64_t fv = isNilOrNone(args[1]) ? iv : getInteger(args[1], 3, "codepoint");
+
+            relativePosition(iv, len);
+            relativePosition(fv, len);
+
+            if (iv <= 0)
+                throw std::runtime_error("bad argument #2 to 'codepoint' (initial position out of bounds)");
+            if (fv > static_cast<int64_t>(len))
+                throw std::runtime_error("bad argument #3 to 'codepoint' (final position out of bounds)");
+
+            if (iv > fv)
+                return sol::as_returns(std::vector<int64_t>{}); /* empty interval; return nothing */
+
+            std::vector<int64_t> pos_byte = { iv };
+            std::vector<int64_t> codepoints;
+
+            while (pos_byte.back() <= fv)
+            {
+                codepoints.push_back(decodeNextUTF8Character(s, pos_byte).second);
+                if (codepoints.back() == -1)
+                    throw std::runtime_error("Invalid UTF-8 code at position " + std::to_string(pos_byte.back()));
+            }
+
+            return sol::as_returns(std::move(codepoints));
+        };
+
+        // utf8.offset(s, n [, i]): byte position where the n-th character counted from position i starts.
+        // n == 0 yields the start of the character that contains byte i.
+        // Returns nil if the requested character is neither in the text nor right after its end.
+        utf8["offset"]
+            = [](std::string_view s, const int64_t n, const sol::variadic_args args) -> sol::optional<int64_t> {
+            const size_t len = s.size();
+            const int64_t end = static_cast<int64_t>(len) + 1; // position right after the last byte
+            int64_t iv;
+
+            if (isNilOrNone(args[0]))
+                iv = (n >= 0) ? 1 : end;
+            else
+                iv = getInteger(args[0], 3, "offset");
+
+            relativePosition(iv, len);
+
+            // position 0 (also the result of a too small negative position) is as invalid as one past `end`
+            if (iv < 1 || iv > end)
+                throw std::runtime_error("bad argument #3 to 'offset' (position out of bounds)");
+
+            // start positions of all characters, followed by `end`
+            std::vector<int64_t> pos_byte = { 1 };
+            while (pos_byte.back() < end)
+            {
+                // a failed decode does not advance, so it must stop the scan
+                if (decodeNextUTF8Character(s, pos_byte).second == -1)
+                    throw std::runtime_error("Invalid UTF-8 code at position " + std::to_string(pos_byte.back()));
+            }
+
+            const int64_t count = static_cast<int64_t>(pos_byte.size());
+            for (int64_t idx = 0; idx < count; ++idx)
+            {
+                const int64_t start = pos_byte[idx];
+                if (start == iv)
+                {
+                    // the bounds are checked on indices so that a huge |n| can neither overflow nor leave the vector
+                    if (n <= 0)
+                    {
+                        if (n >= -idx)
+                            return pos_byte[idx + n];
+                    }
+                    else if (n - 1 < count - idx)
+                        return pos_byte[idx + n - 1];
+                    break;
+                }
+                if (start > iv) /* a continuation byte */
+                {
+                    // idx > 0 here because the first entry is 1 and iv >= 1
+                    if (n == 0)
+                        return pos_byte[idx - 1]; /* special case */
+                    throw std::runtime_error("initial position is a continuation byte");
+                }
+            }
+
+            return sol::nullopt;
+        };
+
+        return utf8;
+    }
+}
--- a/components/lua/utf8.hpp
+++ b/components/lua/utf8.hpp
@ -0,0 +1,11 @@
+#ifndef COMPONENTS_LUA_UTF8_H
+#define COMPONENTS_LUA_UTF8_H
+
+#include <sol/sol.hpp>
+
+namespace LuaUtf8
+{
+    // Creates, in the given Lua state, the table holding the UTF-8 helpers
+    // (charpattern, char, codes, len, codepoint, offset) and returns it.
+    // Storing the table (e.g. as the global `utf8`) is left to the caller.
+    sol::table initUtf8Package(sol::state_view& lua);
+}
+
+#endif
--- a/docs/source/reference/lua-scripting/overview.rst
+++ b/docs/source/reference/lua-scripting/overview.rst
@ -6,7 +6,7 @@ Overview of Lua scripting
 Language and sandboxing
 =======================

-OpenMW supports scripts written in Lua 5.1 with some extensions (see below) from Lua 5.2.
+OpenMW supports scripts written in Lua 5.1 with some extensions (see below) from Lua 5.2 and Lua 5.3.
 There are no plans to switch to any newer version of the language, because newer versions are not supported by LuaJIT.

 .. note::
@ -40,6 +40,10 @@ Supported Lua 5.2 features:
 - ``__pairs`` and ``__ipairs`` metamethods;
 - Function ``table.unpack`` (alias to Lua 5.1 ``unpack``).

+Supported Lua 5.3 features:
+
+- All functions in the `UTF-8 Library <https://www.lua.org/manual/5.3/manual.html#6.5>`__
+
 Loading libraries with ``require('library_name')`` is allowed, but limited. It works this way:

 1. If `library_name` is one of the standard libraries, then return the library.
--- a/files/lua_api/utf8.doclua
+++ b/files/lua_api/utf8.doclua
@ -0,0 +1,77 @@
+-------------------------------------------------------------------------------
+-- UTF-8 Support. 
+-- This library provides basic support for UTF-8 encoding.
+-- It provides all its functions inside the table utf8.
+-- This library does not provide any support for Unicode other than the handling of the encoding.
+-- Any operation that needs the meaning of a character, such as character classification, is outside its scope.
+--
+-- Unless stated otherwise, all functions that expect a byte position as a parameter assume that 
+-- the given position is either the start of a byte sequence or one plus the length of the subject string.
+-- As in the string library, negative indices count from the end of the string.
+-- @module utf8
+
+-------------------------------------------------------------------------------
+-- Receives zero or more integers, converts each one to its
+-- corresponding UTF-8 byte sequence, and returns a string with the concatenation
+-- of all these sequences.
+-- @function [parent=#utf8] char
+-- @param ... zero or more integers.
+-- @return #string
+
+-------------------------------------------------------------------------------
+-- The pattern which matches exactly one UTF-8 byte sequence, assuming that
+-- the subject is a valid UTF-8 string.
+-- @field [parent=#utf8] #string charpattern
+
+-------------------------------------------------------------------------------
+-- Returns values so that the construction
+--
+--     for p, c in utf8.codes(s) do body end
+--
+-- will iterate over all characters in string s, with p being the position (in bytes)
+-- and c the code point of each character.
+-- It raises an error if it meets any invalid byte sequence.
+-- @function [parent=#utf8] codes
+-- @param #string s string to handle.
+
+-------------------------------------------------------------------------------
+-- Returns the codepoints (as integers) from all characters in s that start
+-- between byte position i and j (both included). The default for i is 1 and for j is i.
+-- It raises an error if it meets any invalid byte sequence.
+-- @function [parent=#utf8] codepoint
+-- @param #string s string to handle
+-- @param #number i the initial position (default value is 1)
+-- @param #number j the final position (default value is i)
+-- @return #number the codepoints of each character in s
+
+-------------------------------------------------------------------------------
+-- Returns the number of UTF-8 characters in string s that start
+-- between positions i and j (both inclusive).
+-- The default for i is 1 and for j is -1.
+-- If it finds any invalid byte sequence,
+-- returns a false value plus the position of the first invalid byte.
+-- @function [parent=#utf8] len
+-- @param #string s string to handle
+-- @param #number i the initial position (default value is 1)
+-- @param #number j the final position (default value is -1)
+-- @return #number the number of utf8 characters in s
+
+-------------------------------------------------------------------------------
+-- Returns the position (in bytes) where the encoding of the n-th character of s
+-- (counting from position i) starts. A negative n gets characters before position i.
+-- The default for i is 1 when n is non-negative and #s + 1 otherwise,
+-- so that utf8.offset(s, -n) gets the offset of the n-th character from the end of the string.
+-- If the specified character is neither in the subject nor right after its end, the function returns nil.
+--
+-- As a special case, when n is 0 the function returns the
+-- start of the encoding of the character that contains the i-th byte of s.
+--
+-- This function assumes that s is a valid UTF-8 string.
+-- @function [parent=#utf8] offset
+-- @param #string s string to handle
+-- @param #number n the n-th character
+-- @param #number i the initial position (default value is 1 if n is non-negative and #s + 1 otherwise)
+-- @return #number
+
+return nil