From 92842cedf55995009106c726fe130b88b1cce735 Mon Sep 17 00:00:00 2001 From: Kindi Date: Sun, 27 Aug 2023 16:12:12 +0800 Subject: [PATCH] len,codepoint,offset --- components/lua/utf8.cpp | 119 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 114 insertions(+), 5 deletions(-) diff --git a/components/lua/utf8.cpp b/components/lua/utf8.cpp index 926e43e84b..6a80505411 100644 --- a/components/lua/utf8.cpp +++ b/components/lua/utf8.cpp @@ -4,11 +4,16 @@ namespace { - static constexpr std::string_view UTF8PATT = "[%z\x01-\x7F\xC2-\xF4][\x80-\xBF]*"; // %z is deprecated in Lua5.2 - static constexpr uint32_t MAXUTF = 0x7FFFFFFFu; - static constexpr uint32_t MAXUNICODE = 0x10FFFFu; + constexpr std::string_view UTF8PATT = "[%z\x01-\x7F\xC2-\xF4][\x80-\xBF]*"; // %z is deprecated in Lua5.2 + constexpr uint32_t MAXUTF = 0x7FFFFFFFu; + constexpr uint32_t MAXUNICODE = 0x10FFFFu; - inline static double getInteger(const sol::stack_proxy arg, const size_t& n, const std::string_view& name) + inline bool isNilOrNone(const sol::stack_proxy arg) + { + return (arg.get_type() == sol::type::lua_nil || arg.get_type() == sol::type::none); + } + + inline double getInteger(const sol::stack_proxy arg, const size_t& n, const std::string_view& name) { double integer; if (!arg.is()) @@ -22,8 +27,18 @@ namespace return integer; } + inline void posrelat(int64_t& pos, const size_t& len) + { + if (pos >= 0) + /* no change */; + else if (0u - pos > static_cast(len)) + pos = 0; + else + pos = len + pos + 1; + } + // returns: first - character pos in bytes, second - character codepoint - static std::pair poscodes(const std::string_view& s, std::vector& pos_byte) + std::pair poscodes(const std::string_view& s, std::vector& pos_byte) { const int64_t pos = pos_byte.back() - 1; const unsigned char ch = static_cast(s[pos]); @@ -106,6 +121,100 @@ namespace LuaUtf8 return sol::nullopt; }); }; + + utf8["len"] = [](const std::string_view& s, + const sol::variadic_args args) -> std::variant> { + size_t len = s.size(); + int64_t iv = isNilOrNone(args[0]) ? 1 : getInteger(args[0], 2, "len"); + int64_t fv = isNilOrNone(args[1]) ? -1 : getInteger(args[1], 3, "len"); + + posrelat(iv, len); + posrelat(fv, len); + + if (iv <= 0) + throw std::runtime_error("bad argument #2 to 'len' (initial position out of bounds)"); + if (fv > static_cast(len)) + throw std::runtime_error("bad argument #3 to 'len' (final position out of bounds)"); + + if (len == 0) + return len; + + std::vector pos_byte = { iv }; + + while (pos_byte.back() <= fv) + { + if (poscodes(s, pos_byte).second == -1) + return std::pair(sol::lua_nil, pos_byte.back()); + } + return pos_byte.size() - 1; + }; + + utf8["codepoint"] + = [](const std::string_view& s, const sol::variadic_args args) -> sol::as_returns_t> { + size_t len = s.size(); + int64_t iv = isNilOrNone(args[0]) ? 1 : getInteger(args[0], 2, "codepoint"); + int64_t fv = isNilOrNone(args[1]) ? iv : getInteger(args[1], 3, "codepoint"); + + posrelat(iv, len); + posrelat(fv, len); + + if (iv <= 0) + throw std::runtime_error("bad argument #2 to 'codepoint' (initial position out of bounds)"); + if (fv > static_cast(len)) + throw std::runtime_error("bad argument #3 to 'codepoint' (final position out of bounds)"); + + if (iv > fv) + return sol::as_returns(std::vector{}); /* empty interval; return nothing */ + + std::vector pos_byte = { iv }; + std::vector codepoints; + + while (pos_byte.back() <= fv) + { + codepoints.push_back(poscodes(s, pos_byte).second); + if (codepoints.back() == -1) + throw std::runtime_error("Invalid UTF-8 code at position " + std::to_string(pos_byte.size())); + } + + return sol::as_returns(std::move(codepoints)); + }; + + utf8["offset"] + = [](const std::string_view& s, const int64_t n, const sol::variadic_args args) -> sol::optional { + size_t len = s.size(); + int64_t iv = isNilOrNone(args[0]) ? ((n >= 0) ? 1 : s.size() + 1) : getInteger(args[0], 3, "offset"); + std::vector pos_byte = { 1 }; + + posrelat(iv, len); + + if (iv > static_cast(len) + 1) + throw std::runtime_error("bad argument #3 to 'offset' (position out of bounds)"); + + while (pos_byte.back() <= static_cast(len)) + poscodes(s, pos_byte); + + for (auto it = pos_byte.begin(); it != pos_byte.end(); ++it) + if (*it == iv) + { + if (n <= 0) + if ((it + n) >= pos_byte.begin()) + return *(it + n); + if (n > 0) + if ((it + n - 1) < pos_byte.end()) + return *(it + n - 1); + break; + } + else if (*it > iv) /* a continuation byte */ + { + if (n == 0) + return *(it - 1); /* special case */ + else + throw std::runtime_error("initial position is a continuation byte"); + } + + return sol::nullopt; + }; + return utf8; } }