mirror of
https://github.com/OpenMW/openmw.git
synced 2025-02-06 07:15:36 +00:00
Merge branch 'luautflib' into 'master'
Lua utf-8 support Closes #6505 See merge request OpenMW/openmw!3327
This commit is contained in:
commit
0b74146b05
6 changed files with 331 additions and 3 deletions
|
@ -42,7 +42,7 @@ list (APPEND COMPONENT_FILES "${OpenMW_BINARY_DIR}/${VERSION_CPP_FILE}")
|
|||
# source files
|
||||
|
||||
add_component_dir (lua
|
||||
luastate scriptscontainer asyncpackage utilpackage serialization configuration l10n storage
|
||||
luastate scriptscontainer asyncpackage utilpackage serialization configuration l10n storage utf8
|
||||
shapes/box
|
||||
)
|
||||
|
||||
|
|
|
@ -12,6 +12,7 @@
|
|||
#include <components/vfs/manager.hpp>
|
||||
|
||||
#include "scriptscontainer.hpp"
|
||||
#include "utf8.hpp"
|
||||
|
||||
namespace LuaUtil
|
||||
{
|
||||
|
@ -51,7 +52,7 @@ namespace LuaUtil
|
|||
|
||||
static const std::string safeFunctions[] = { "assert", "error", "ipairs", "next", "pairs", "pcall", "select",
|
||||
"tonumber", "tostring", "type", "unpack", "xpcall", "rawequal", "rawget", "rawset", "setmetatable" };
|
||||
static const std::string safePackages[] = { "coroutine", "math", "string", "table" };
|
||||
static const std::string safePackages[] = { "coroutine", "math", "string", "table", "utf8" };
|
||||
|
||||
static constexpr int64_t countHookStep = 1000;
|
||||
|
||||
|
@ -181,6 +182,8 @@ namespace LuaUtil
|
|||
mSol["math"]["randomseed"](static_cast<unsigned>(std::time(nullptr)));
|
||||
mSol["math"]["randomseed"] = [] {};
|
||||
|
||||
mSol["utf8"] = LuaUtf8::initUtf8Package(mSol);
|
||||
|
||||
mSol["writeToLog"] = [](std::string_view s) { Log(Debug::Level::Info) << s; };
|
||||
|
||||
mSol["setEnvironment"]
|
||||
|
|
233
components/lua/utf8.cpp
Normal file
233
components/lua/utf8.cpp
Normal file
|
@ -0,0 +1,233 @@
|
|||
#include <codecvt>
|
||||
#include <components/misc/strings/format.hpp>
|
||||
|
||||
#include "utf8.hpp"
|
||||
|
||||
namespace
|
||||
{
|
||||
// Lua pattern exposed to scripts as utf8.charpattern: one lead byte (NUL, ASCII or 0xC2-0xF4)
// followed by any number of continuation bytes (0x80-0xBF). It matches exactly one UTF-8 byte
// sequence provided the subject is valid UTF-8. NUL is written as %z inside the pattern.
constexpr std::string_view UTF8PATT = "[%z\x01-\x7F\xC2-\xF4][\x80-\xBF]*"; // %z is deprecated in Lua5.2
|
||||
// Largest value utf8.char accepts as a code point (31 bits); anything above is reported as out of range.
constexpr uint32_t MAXUTF = 0x7FFFFFFFu;
|
||||
// constexpr uint32_t MAXUNICODE = 0x10FFFFu;
|
||||
|
||||
// Tells whether a Lua argument was left out entirely (none) or passed explicitly as nil.
// Used to decide when an optional argument falls back to its default value.
inline bool isNilOrNone(const sol::stack_proxy arg)
{
    switch (arg.get_type())
    {
        case sol::type::lua_nil:
        case sol::type::none:
            return true;
        default:
            return false;
    }
}
|
||||
|
||||
// Reads a Lua argument that must be a number holding an integral value.
//
// arg  - stack slot of the argument
// n    - 1-based index of the argument, only used in error messages
// name - name of the Lua function being served, only used in error messages
//
// Returns the value as a double without fractional part.
// Throws std::runtime_error if the argument is not a number or has a fractional part.
inline double getInteger(const sol::stack_proxy arg, const size_t n, std::string_view name)
{
    // Messages are built by concatenation so they do not depend on the placeholder syntax of a
    // formatting helper and are only assembled on the error path.
    const auto badArgument = [&](const std::string& reason) {
        return std::runtime_error(
            "bad argument #" + std::to_string(n) + " to '" + std::string(name) + "' (" + reason + ")");
    };

    if (!arg.is<double>())
        throw badArgument(
            std::string("number expected, got ") + sol::type_name(arg.lua_state(), arg.get_type()));

    double integer;
    // modf stores the integral part in 'integer' and returns the fractional part.
    if (std::modf(arg, &integer) != 0)
        throw badArgument("number has no integer representation");

    return integer;
}
|
||||
|
||||
// Converts a Lua-style relative position into an absolute 1-based one, in place.
//
// A negative 'pos' counts from the end of a string of 'len' bytes: -1 is the last byte,
// -2 the second-to-last, and so on. A negative position that reaches before the start of
// the string becomes 0. A non-negative 'pos' is left as-is.
inline void relativePosition(int64_t& pos, const size_t len)
{
    if (pos >= 0)
        return;

    // Keep the arithmetic signed; mixing int64_t with size_t would silently go through unsigned wraparound.
    const int64_t fromEnd = static_cast<int64_t>(len) + pos + 1;
    pos = std::max<int64_t>(0, fromEnd);
}
|
||||
|
||||
// Decodes the UTF-8 sequence whose first byte sits at the 1-based byte position pos_byte.back().
//
// On success: returns {start position in bytes, code point} and appends the 1-based position of
// the following character to pos_byte.
// On failure: returns a pair whose second element is -1 and leaves pos_byte untouched, so
// pos_byte.back() still designates the offending sequence. Failure covers
//   - an empty pos_byte or a start position outside of s,
//   - a first byte that cannot start a 1-4 byte sequence (stray continuation byte, 0xF8-0xFF),
//   - a sequence that is cut short or interrupted by a non-continuation byte.
std::pair<int64_t, int64_t> decodeNextUTF8Character(std::string_view s, std::vector<int64_t>& pos_byte)
{
    const std::pair<int64_t, int64_t> invalid{ 0, -1 };

    if (pos_byte.empty())
        return invalid;

    const int64_t start = pos_byte.back();
    if (start < 1 || static_cast<uint64_t>(start) > s.size())
        return invalid;

    const size_t index = static_cast<size_t>(start - 1);
    const unsigned char lead = static_cast<unsigned char>(s[index]);

    int64_t codepoint = 0;
    size_t byteSize = 0;

    // The high bits of the first byte give the length of the sequence; the rest are payload.
    if ((lead & 0b10000000) == 0)
    {
        codepoint = lead;
        byteSize = 1;
    }
    else if ((lead & 0b11100000) == 0b11000000)
    {
        codepoint = lead & 0b00011111;
        byteSize = 2;
    }
    else if ((lead & 0b11110000) == 0b11100000)
    {
        codepoint = lead & 0b00001111;
        byteSize = 3;
    }
    else if ((lead & 0b11111000) == 0b11110000)
    {
        codepoint = lead & 0b00000111;
        byteSize = 4;
    }
    else
    {
        // Not a valid first byte. Bail out here instead of recording a zero-length character.
        return invalid;
    }

    // index < s.size() is guaranteed above, so the subtraction cannot wrap.
    if (byteSize > s.size() - index)
        return invalid;

    // Each continuation byte contributes its low six bits.
    for (size_t i = 1; i < byteSize; ++i)
    {
        const unsigned char next = static_cast<unsigned char>(s[index + i]);
        if ((next & 0b11000000) != 0b10000000)
            return invalid;
        codepoint = (codepoint << 6) | (next & 0b00111111);
    }

    pos_byte.push_back(start + static_cast<int64_t>(byteSize)); /* the next character (if exists) starts here */

    return { start, codepoint };
}
|
||||
|
||||
}
|
||||
|
||||
namespace LuaUtf8
|
||||
{
|
||||
// Creates the table that implements the Lua 5.3 style `utf8` library for the given Lua state.
//
// The returned table holds the field `charpattern` and the functions `char`, `codes`, `len`,
// `codepoint` and `offset`. All positions exchanged with Lua are 1-based byte positions; negative
// positions count from the end of the string. Errors are reported by throwing std::runtime_error.
sol::table initUtf8Package(sol::state_view& lua)
{
    sol::table utf8(lua, sol::create);

    utf8["charpattern"] = UTF8PATT;

    // utf8.char(...): concatenation of the UTF-8 encoding of every integer argument.
    utf8["char"] = [](const sol::variadic_args args) -> std::string {
        // Encoded by hand so the result does not depend on the width of wchar_t and so that every
        // value up to MAXUTF (1 to 6 bytes) can be represented.
        const auto appendEncoded = [](std::string& out, const uint32_t codepoint) {
            if (codepoint < 0x80)
            {
                out += static_cast<char>(codepoint);
                return;
            }

            int continuationBytes = 5;
            if (codepoint < 0x800)
                continuationBytes = 1;
            else if (codepoint < 0x10000)
                continuationBytes = 2;
            else if (codepoint < 0x200000)
                continuationBytes = 3;
            else if (codepoint < 0x4000000)
                continuationBytes = 4;

            // First byte: (continuationBytes + 1) set bits, a zero bit, then the topmost payload bits.
            const uint32_t leadMarker = (0xFFu << (7 - continuationBytes)) & 0xFFu;
            out += static_cast<char>(
                static_cast<unsigned char>(leadMarker | (codepoint >> (6 * continuationBytes))));

            // Remaining bytes: 10xxxxxx, six payload bits each, most significant group first.
            for (int shift = 6 * (continuationBytes - 1); shift >= 0; shift -= 6)
                out += static_cast<char>(static_cast<unsigned char>(0x80u | ((codepoint >> shift) & 0x3Fu)));
        };

        std::string result{};
        for (size_t i = 0; i < args.size(); ++i)
        {
            const int64_t codepoint = getInteger(args[i], (i + 1), "char");
            if (codepoint < 0 || codepoint > MAXUTF)
                throw std::runtime_error(
                    "bad argument #" + std::to_string(i + 1) + " to 'char' (value out of range)");

            appendEncoded(result, static_cast<uint32_t>(codepoint));
        }
        return result;
    };

    // utf8.codes(s): iterator yielding (byte position, code point) for every character of s.
    utf8["codes"] = [](std::string_view s) {
        // The iterator is handed to Lua and outlives this call, so it keeps its own copy of the
        // text instead of a view into memory owned by the Lua string.
        return sol::as_function([text = std::string(s), pos_byte = std::vector<int64_t>{ 1 }]() mutable
                                    -> sol::optional<std::pair<int64_t, int64_t>> {
            if (pos_byte.back() > static_cast<int64_t>(text.size()))
                return sol::nullopt; // end of string: stops the generic for loop

            const auto pair = decodeNextUTF8Character(text, pos_byte);
            if (pair.second == -1)
                throw std::runtime_error("Invalid UTF-8 code at position " + std::to_string(pos_byte.back()));

            return pair;
        });
    };

    // utf8.len(s [, i [, j]]): number of characters starting between byte positions i and j,
    // or nil plus the position of the first invalid sequence.
    utf8["len"] = [](std::string_view s,
                      const sol::variadic_args args) -> std::variant<size_t, std::pair<sol::object, int64_t>> {
        const size_t len = s.size();
        int64_t iv = isNilOrNone(args[0]) ? 1 : getInteger(args[0], 2, "len");
        int64_t fv = isNilOrNone(args[1]) ? -1 : getInteger(args[1], 3, "len");

        relativePosition(iv, len);
        relativePosition(fv, len);

        if (iv <= 0)
            throw std::runtime_error("bad argument #2 to 'len' (initial position out of bounds)");
        if (fv > static_cast<int64_t>(len))
            throw std::runtime_error("bad argument #3 to 'len' (final position out of bounds)");

        if (len == 0)
            return len;

        std::vector<int64_t> pos_byte = { iv };

        while (pos_byte.back() <= fv)
        {
            // On failure the last recorded position is the start of the invalid sequence.
            if (decodeNextUTF8Character(s, pos_byte).second == -1)
                return std::pair(sol::lua_nil, pos_byte.back());
        }

        // One entry per decoded character plus the initial position.
        return pos_byte.size() - 1;
    };

    // utf8.codepoint(s [, i [, j]]): code points of all characters starting between i and j.
    utf8["codepoint"]
        = [](std::string_view s, const sol::variadic_args args) -> sol::as_returns_t<std::vector<int64_t>> {
        const size_t len = s.size();
        int64_t iv = isNilOrNone(args[0]) ? 1 : getInteger(args[0], 2, "codepoint");
        int64_t fv = isNilOrNone(args[1]) ? iv : getInteger(args[1], 3, "codepoint");

        relativePosition(iv, len);
        relativePosition(fv, len);

        if (iv <= 0)
            throw std::runtime_error("bad argument #2 to 'codepoint' (initial position out of bounds)");
        if (fv > static_cast<int64_t>(len))
            throw std::runtime_error("bad argument #3 to 'codepoint' (final position out of bounds)");

        if (iv > fv)
            return sol::as_returns(std::vector<int64_t>{}); /* empty interval; return nothing */

        std::vector<int64_t> pos_byte = { iv };
        std::vector<int64_t> codepoints;

        while (pos_byte.back() <= fv)
        {
            codepoints.push_back(decodeNextUTF8Character(s, pos_byte).second);
            if (codepoints.back() == -1)
                throw std::runtime_error("Invalid UTF-8 code at position " + std::to_string(pos_byte.back()));
        }

        return sol::as_returns(std::move(codepoints));
    };

    // utf8.offset(s, n [, i]): byte position of the n-th character counted from position i,
    // or nil if there is no such character. n == 0 yields the start of the character containing byte i.
    utf8["offset"]
        = [](std::string_view s, const int64_t n, const sol::variadic_args args) -> sol::optional<int64_t> {
        const size_t len = s.size();
        int64_t iv;

        if (isNilOrNone(args[0]))
            iv = (n >= 0) ? 1 : static_cast<int64_t>(len) + 1;
        else
            iv = getInteger(args[0], 3, "offset");

        relativePosition(iv, len);

        // Position 0 must be rejected too: nothing precedes the first character.
        if (iv < 1 || iv > static_cast<int64_t>(len) + 1)
            throw std::runtime_error("bad argument #3 to 'offset' (position out of bounds)");

        // Start position of every character, followed by len + 1.
        std::vector<int64_t> pos_byte = { 1 };
        while (pos_byte.back() <= static_cast<int64_t>(len))
        {
            // A failed decode does not advance, so it has to end the scan.
            if (decodeNextUTF8Character(s, pos_byte).second == -1)
                throw std::runtime_error("Invalid UTF-8 code at position " + std::to_string(pos_byte.back()));
        }

        const int64_t count = static_cast<int64_t>(pos_byte.size());
        for (int64_t idx = 0; idx < count; ++idx)
        {
            const int64_t current = pos_byte[static_cast<size_t>(idx)];
            if (current == iv)
            {
                // Index arithmetic is range-checked before use, so no out-of-range access can occur.
                if (n <= 0 && n >= -idx)
                    return pos_byte[static_cast<size_t>(idx + n)];
                if (n > 0 && n - 1 < count - idx)
                    return pos_byte[static_cast<size_t>(idx + n - 1)];
                break;
            }
            else if (current > iv) /* a continuation byte */
            {
                // idx >= 1 here because pos_byte[0] == 1 and iv >= 1.
                if (n == 0)
                    return pos_byte[static_cast<size_t>(idx - 1)]; /* special case */
                else
                    throw std::runtime_error("initial position is a continuation byte");
            }
        }

        return sol::nullopt;
    };

    return utf8;
}
|
||||
}
|
11
components/lua/utf8.hpp
Normal file
11
components/lua/utf8.hpp
Normal file
|
@ -0,0 +1,11 @@
|
|||
#ifndef COMPONENTS_LUA_UTF8_H
|
||||
#define COMPONENTS_LUA_UTF8_H
|
||||
|
||||
#include <sol/sol.hpp>
|
||||
|
||||
namespace LuaUtf8
|
||||
{
|
||||
// Creates and returns the table holding the `utf8` library for the given Lua state.
sol::table initUtf8Package(sol::state_view&);
|
||||
}
|
||||
|
||||
#endif
|
|
@ -6,7 +6,7 @@ Overview of Lua scripting
|
|||
Language and sandboxing
|
||||
=======================
|
||||
|
||||
OpenMW supports scripts written in Lua 5.1 with some extensions (see below) from Lua 5.2.
|
||||
OpenMW supports scripts written in Lua 5.1 with some extensions (see below) from Lua 5.2 and Lua 5.3.
|
||||
There are no plans to switch to any newer version of the language, because newer versions are not supported by LuaJIT.
|
||||
|
||||
.. note::
|
||||
|
@ -40,6 +40,10 @@ Supported Lua 5.2 features:
|
|||
- ``__pairs`` and ``__ipairs`` metamethods;
|
||||
- Function ``table.unpack`` (alias to Lua 5.1 ``unpack``).
|
||||
|
||||
Supported Lua 5.3 features:
|
||||
|
||||
- All functions in the `UTF-8 Library <https://www.lua.org/manual/5.3/manual.html#6.5>`__.
|
||||
|
||||
Loading libraries with ``require('library_name')`` is allowed, but limited. It works this way:
|
||||
|
||||
1. If `library_name` is one of the standard libraries, then return the library.
|
||||
|
|
77
files/lua_api/utf8.doclua
Normal file
77
files/lua_api/utf8.doclua
Normal file
|
@ -0,0 +1,77 @@
|
|||
-------------------------------------------------------------------------------
|
||||
-- UTF-8 Support.
|
||||
-- This library provides basic support for UTF-8 encoding.
|
||||
-- It provides all its functions inside the table utf8.
|
||||
-- This library does not provide any support for Unicode other than the handling of the encoding.
|
||||
-- Any operation that needs the meaning of a character, such as character classification, is outside its scope.
|
||||
--
|
||||
-- Unless stated otherwise, all functions that expect a byte position as a parameter assume that
|
||||
-- the given position is either the start of a byte sequence or one plus the length of the subject string.
|
||||
-- As in the string library, negative indices count from the end of the string.
|
||||
-- @module utf8
|
||||
|
||||
-------------------------------------------------------------------------------
|
||||
-- Receives zero or more integers, converts each one to its
|
||||
-- corresponding UTF-8 byte sequence, and returns a string with the concatenation
|
||||
-- of all these sequences.
|
||||
-- @function [parent=#utf8] char
|
||||
-- @param ... zero or more integers.
|
||||
-- @return #string
|
||||
|
||||
-------------------------------------------------------------------------------
|
||||
-- The pattern which matches exactly one UTF-8 byte sequence, assuming that
|
||||
-- the subject is a valid UTF-8 string.
|
||||
-- @function [parent=#utf8] charpattern
|
||||
-- @return #string
|
||||
|
||||
-------------------------------------------------------------------------------
|
||||
-- Returns values so that the construction
|
||||
--
|
||||
-- for p, c in utf8.codes(s) do body end
|
||||
--
|
||||
-- will iterate over all characters in string s, with p being the position (in bytes)
|
||||
-- and c the code point of each character.
|
||||
-- It raises an error if it meets any invalid byte sequence.
|
||||
-- @function [parent=#utf8] codes
|
||||
-- @param #string s string to handle.
|
||||
|
||||
-------------------------------------------------------------------------------
|
||||
-- Returns the codepoints (as integers) from all characters in s that start
|
||||
-- between byte position i and j (both included). The default for i is 1 and for j is i.
|
||||
-- It raises an error if it meets any invalid byte sequence.
|
||||
-- @function [parent=#utf8] codepoint
|
||||
-- @param #string s string to handle
|
||||
-- @param #number i the initial position (default value is 1)
|
||||
-- @param #number j the final position (default value is i)
|
||||
-- @return #number the codepoints of each character in s
|
||||
|
||||
-------------------------------------------------------------------------------
|
||||
-- Returns the number of UTF-8 characters in string s that start
|
||||
-- between positions i and j (both inclusive).
|
||||
-- The default for i is 1 and for j is -1.
|
||||
-- If it finds any invalid byte sequence,
|
||||
-- returns a false value plus the position of the first invalid byte.
|
||||
-- @function [parent=#utf8] len
|
||||
-- @param #string s string to handle
|
||||
-- @param #number i the initial position (default value is 1)
|
||||
-- @param #number j the final position (default value is -1)
|
||||
-- @return #number the number of utf8 characters in s
|
||||
|
||||
-------------------------------------------------------------------------------
|
||||
-- Returns the position (in bytes) where the encoding of the n-th character of s
|
||||
-- (counting from position i) starts. A negative n gets characters before position i.
|
||||
-- The default for i is 1 when n is non-negative and #s + 1 otherwise,
|
||||
-- so that utf8.offset(s, -n) gets the offset of the n-th character from the end of the string.
|
||||
-- If the specified character is neither in the subject nor right after its end, the function returns nil.
|
||||
--
|
||||
-- As a special case, when n is 0 the function returns the
|
||||
-- start of the encoding of the character that contains the i-th byte of s.
|
||||
--
|
||||
-- This function assumes that s is a valid UTF-8 string.
|
||||
-- @function [parent=#utf8] offset
|
||||
-- @param #string s string to handle
|
||||
-- @param #number n the n-th character
|
||||
-- @param #number i the initial position (default value is 1 if n is non-negative and #s + 1 otherwise)
|
||||
-- @return #number
|
||||
|
||||
return nil
|
Loading…
Reference in a new issue