From 7884a0102632694b524a19ebe21d3e1438742907 Mon Sep 17 00:00:00 2001 From: elsid Date: Sun, 13 Feb 2022 16:38:49 +0100 Subject: [PATCH] Add tests for Utf8Encoder --- apps/openmw_test_suite/CMakeLists.txt | 6 +- .../toutf8/data}/french-utf8.txt | 0 .../toutf8/data}/french-win1252.txt | 0 .../toutf8/data}/russian-utf8.txt | 0 .../toutf8/data}/russian-win1251.txt | 0 apps/openmw_test_suite/toutf8/toutf8.cpp | 139 ++++++++++++++++++ components/to_utf8/tests/.gitignore | 1 - .../to_utf8/tests/output/to_utf8_test.out | 4 - components/to_utf8/tests/test.sh | 18 --- components/to_utf8/tests/to_utf8_test.cpp | 59 -------- components/to_utf8/to_utf8.cpp | 139 +++++++++--------- components/to_utf8/to_utf8.hpp | 10 +- 12 files changed, 219 insertions(+), 157 deletions(-) rename {components/to_utf8/tests/test_data => apps/openmw_test_suite/toutf8/data}/french-utf8.txt (100%) rename {components/to_utf8/tests/test_data => apps/openmw_test_suite/toutf8/data}/french-win1252.txt (100%) rename {components/to_utf8/tests/test_data => apps/openmw_test_suite/toutf8/data}/russian-utf8.txt (100%) rename {components/to_utf8/tests/test_data => apps/openmw_test_suite/toutf8/data}/russian-win1251.txt (100%) create mode 100644 apps/openmw_test_suite/toutf8/toutf8.cpp delete mode 100644 components/to_utf8/tests/.gitignore delete mode 100644 components/to_utf8/tests/output/to_utf8_test.out delete mode 100755 components/to_utf8/tests/test.sh delete mode 100644 components/to_utf8/tests/to_utf8_test.cpp diff --git a/apps/openmw_test_suite/CMakeLists.txt b/apps/openmw_test_suite/CMakeLists.txt index 2ee34186d8..16d820c4f9 100644 --- a/apps/openmw_test_suite/CMakeLists.txt +++ b/apps/openmw_test_suite/CMakeLists.txt @@ -70,6 +70,8 @@ if (GTEST_FOUND AND GMOCK_FOUND) esmloader/esmdata.cpp files/hash.cpp + + toutf8/toutf8.cpp ) source_group(apps\\openmw_test_suite FILES openmw_test_suite.cpp ${UNITTEST_SRC_FILES}) @@ -93,6 +95,8 @@ if (GTEST_FOUND AND GMOCK_FOUND) EXPECTED_MD5 
bf3691034a38611534c74c3b89a7d2c3 ) - target_compile_definitions(openmw_test_suite PRIVATE OPENMW_DATA_DIR="${CMAKE_CURRENT_BINARY_DIR}/data") + target_compile_definitions(openmw_test_suite + PRIVATE OPENMW_DATA_DIR="${CMAKE_CURRENT_BINARY_DIR}/data" + OPENMW_TEST_SUITE_SOURCE_DIR="${CMAKE_CURRENT_SOURCE_DIR}") endif() diff --git a/components/to_utf8/tests/test_data/french-utf8.txt b/apps/openmw_test_suite/toutf8/data/french-utf8.txt similarity index 100% rename from components/to_utf8/tests/test_data/french-utf8.txt rename to apps/openmw_test_suite/toutf8/data/french-utf8.txt diff --git a/components/to_utf8/tests/test_data/french-win1252.txt b/apps/openmw_test_suite/toutf8/data/french-win1252.txt similarity index 100% rename from components/to_utf8/tests/test_data/french-win1252.txt rename to apps/openmw_test_suite/toutf8/data/french-win1252.txt diff --git a/components/to_utf8/tests/test_data/russian-utf8.txt b/apps/openmw_test_suite/toutf8/data/russian-utf8.txt similarity index 100% rename from components/to_utf8/tests/test_data/russian-utf8.txt rename to apps/openmw_test_suite/toutf8/data/russian-utf8.txt diff --git a/components/to_utf8/tests/test_data/russian-win1251.txt b/apps/openmw_test_suite/toutf8/data/russian-win1251.txt similarity index 100% rename from components/to_utf8/tests/test_data/russian-win1251.txt rename to apps/openmw_test_suite/toutf8/data/russian-win1251.txt diff --git a/apps/openmw_test_suite/toutf8/toutf8.cpp b/apps/openmw_test_suite/toutf8/toutf8.cpp new file mode 100644 index 0000000000..bad4f34fd5 --- /dev/null +++ b/apps/openmw_test_suite/toutf8/toutf8.cpp @@ -0,0 +1,139 @@ +#include + +#include + +#include + +#ifndef OPENMW_TEST_SUITE_SOURCE_DIR +#define OPENMW_TEST_SUITE_SOURCE_DIR "" +#endif + +namespace +{ + using namespace testing; + using namespace ToUTF8; + + struct Params + { + FromType mLegacyEncoding; + std::string mLegacyEncodingFileName; + std::string mUtf8FileName; + }; + + std::string readContent(const std::string& 
fileName) + { + std::ifstream file; + file.exceptions(std::ios::failbit | std::ios::badbit); + file.open(std::string(OPENMW_TEST_SUITE_SOURCE_DIR) + "/toutf8/data/" + fileName); + std::stringstream buffer; + buffer << file.rdbuf(); + return buffer.str(); + } + + struct Utf8EncoderTest : TestWithParam<Params> {}; + + TEST(Utf8EncoderTest, getUtf8ShouldReturnEmptyAsIs) + { + Utf8Encoder encoder(FromType::CP437); + EXPECT_EQ(encoder.getUtf8(std::string_view()), std::string_view()); + } + + TEST(Utf8EncoderTest, getUtf8ShouldReturnAsciiOnlyAsIs) + { + std::string input; + for (int c = 1; c <= std::numeric_limits<char>::max(); ++c) + input.push_back(c); + Utf8Encoder encoder(FromType::CP437); + const std::string_view result = encoder.getUtf8(input); + EXPECT_EQ(result.data(), input.data()); + EXPECT_EQ(result.size(), input.size()); + } + + TEST(Utf8EncoderTest, getUtf8ShouldLookUpUntilZero) + { + const std::string input("a\0b"); + Utf8Encoder encoder(FromType::CP437); + const std::string_view result = encoder.getUtf8(input); + EXPECT_EQ(result, "a"); + } + + TEST(Utf8EncoderTest, getUtf8ShouldLookUpUntilEndOfInputForAscii) + { + const std::string input("abc"); + Utf8Encoder encoder(FromType::CP437); + const std::string_view result = encoder.getUtf8(std::string_view(input.data(), 2)); + EXPECT_EQ(result, "ab"); + } + + TEST(Utf8EncoderTest, getUtf8ShouldLookUpUntilEndOfInputForNonAscii) + { + const std::string input("a\x92" "b"); + Utf8Encoder encoder(FromType::WINDOWS_1252); + const std::string_view result = encoder.getUtf8(std::string_view(input.data(), 2)); + EXPECT_EQ(result, "a\xE2\x80\x99"); + } + + TEST_P(Utf8EncoderTest, getUtf8ShouldConvertFromLegacyEncodingToUtf8) + { + const std::string input(readContent(GetParam().mLegacyEncodingFileName)); + const std::string expected(readContent(GetParam().mUtf8FileName)); + Utf8Encoder encoder(GetParam().mLegacyEncoding); + const std::string_view result = encoder.getUtf8(input); + EXPECT_EQ(result, expected); + } + + 
TEST(Utf8EncoderTest, getLegacyEncShouldReturnEmptyAsIs) + { + Utf8Encoder encoder(FromType::CP437); + EXPECT_EQ(encoder.getLegacyEnc(std::string_view()), std::string_view()); + } + + TEST(Utf8EncoderTest, getLegacyEncShouldReturnAsciiOnlyAsIs) + { + std::string input; + for (int c = 1; c <= std::numeric_limits<char>::max(); ++c) + input.push_back(c); + Utf8Encoder encoder(FromType::CP437); + const std::string_view result = encoder.getLegacyEnc(input); + EXPECT_EQ(result.data(), input.data()); + EXPECT_EQ(result.size(), input.size()); + } + + TEST(Utf8EncoderTest, getLegacyEncShouldLookUpUntilZero) + { + const std::string input("a\0b"); + Utf8Encoder encoder(FromType::CP437); + const std::string_view result = encoder.getLegacyEnc(input); + EXPECT_EQ(result, "a"); + } + + TEST(Utf8EncoderTest, getLegacyEncShouldLookUpUntilEndOfInputForAscii) + { + const std::string input("abc"); + Utf8Encoder encoder(FromType::CP437); + const std::string_view result = encoder.getLegacyEnc(std::string_view(input.data(), 2)); + EXPECT_EQ(result, "ab"); + } + + TEST(Utf8EncoderTest, getLegacyEncShouldStripIncompleteCharacters) + { + const std::string input("a\xc3\xa2\xe2\x80\x99"); + Utf8Encoder encoder(FromType::WINDOWS_1252); + const std::string_view result = encoder.getLegacyEnc(std::string_view(input.data(), 5)); + EXPECT_EQ(result, "a\xe2"); + } + + TEST_P(Utf8EncoderTest, getLegacyEncShouldConvertFromUtf8ToLegacyEncoding) + { + const std::string input(readContent(GetParam().mUtf8FileName)); + const std::string expected(readContent(GetParam().mLegacyEncodingFileName)); + Utf8Encoder encoder(GetParam().mLegacyEncoding); + const std::string_view result = encoder.getLegacyEnc(input); + EXPECT_EQ(result, expected); + } + + INSTANTIATE_TEST_SUITE_P(Files, Utf8EncoderTest, Values( + Params {ToUTF8::WINDOWS_1251, "russian-win1251.txt", "russian-utf8.txt"}, + Params {ToUTF8::WINDOWS_1252, "french-win1252.txt", "french-utf8.txt"} + )); +} diff --git a/components/to_utf8/tests/.gitignore 
b/components/to_utf8/tests/.gitignore deleted file mode 100644 index 8144904045..0000000000 --- a/components/to_utf8/tests/.gitignore +++ /dev/null @@ -1 +0,0 @@ -*_test diff --git a/components/to_utf8/tests/output/to_utf8_test.out b/components/to_utf8/tests/output/to_utf8_test.out deleted file mode 100644 index dcb32359ab..0000000000 --- a/components/to_utf8/tests/output/to_utf8_test.out +++ /dev/null @@ -1,4 +0,0 @@ -original: Без вопросов отдаете ему рулет, зная, что позже вы сможете привести с собой своих друзей и тогда он получит по заслугам? -converted: Без вопросов отдаете ему рулет, зная, что позже вы сможете привести с собой своих друзей и тогда он получит по заслугам? -original: Vous lui donnez le gâteau sans protester avant d’aller chercher tous vos amis et de revenir vous venger. -converted: Vous lui donnez le gâteau sans protester avant d’aller chercher tous vos amis et de revenir vous venger. diff --git a/components/to_utf8/tests/test.sh b/components/to_utf8/tests/test.sh deleted file mode 100755 index 2d07708adc..0000000000 --- a/components/to_utf8/tests/test.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/bin/bash - -make || exit - -mkdir -p output - -PROGS=*_test - -for a in $PROGS; do - if [ -f "output/$a.out" ]; then - echo "Running $a:" - ./$a | diff output/$a.out - - else - echo "Creating $a.out" - ./$a > "output/$a.out" - git add "output/$a.out" - fi -done diff --git a/components/to_utf8/tests/to_utf8_test.cpp b/components/to_utf8/tests/to_utf8_test.cpp deleted file mode 100644 index 3fcddd1581..0000000000 --- a/components/to_utf8/tests/to_utf8_test.cpp +++ /dev/null @@ -1,59 +0,0 @@ -#include -#include -#include -#include - -#include "../to_utf8.hpp" - -std::string getFirstLine(const std::string &filename); -void testEncoder(ToUTF8::FromType encoding, const std::string &legacyEncFile, - const std::string &utf8File); - -/// Test character encoding conversion to and from UTF-8 -void testEncoder(ToUTF8::FromType encoding, const std::string &legacyEncFile, 
- const std::string &utf8File) -{ - // get some test data - std::string legacyEncLine = getFirstLine(legacyEncFile); - std::string utf8Line = getFirstLine(utf8File); - - // create an encoder for specified character encoding - ToUTF8::Utf8Encoder encoder (encoding); - - // convert text to UTF-8 - std::string convertedUtf8Line = encoder.getUtf8(legacyEncLine); - - std::cout << "original: " << utf8Line << std::endl; - std::cout << "converted: " << convertedUtf8Line << std::endl; - - // check correctness - assert(convertedUtf8Line == utf8Line); - - // convert UTF-8 text to legacy encoding - std::string convertedLegacyEncLine = encoder.getLegacyEnc(utf8Line); - // check correctness - assert(convertedLegacyEncLine == legacyEncLine); -} - -std::string getFirstLine(const std::string &filename) -{ - std::string line; - std::ifstream text (filename.c_str()); - - if (!text.is_open()) - { - throw std::runtime_error("Unable to open file " + filename); - } - - std::getline(text, line); - text.close(); - - return line; -} - -int main() -{ - testEncoder(ToUTF8::WINDOWS_1251, "test_data/russian-win1251.txt", "test_data/russian-utf8.txt"); - testEncoder(ToUTF8::WINDOWS_1252, "test_data/french-win1252.txt", "test_data/french-utf8.txt"); - return 0; -} diff --git a/components/to_utf8/to_utf8.cpp b/components/to_utf8/to_utf8.cpp index 7fd0e3cd88..1f0a81ad10 100644 --- a/components/to_utf8/to_utf8.cpp +++ b/components/to_utf8/to_utf8.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include @@ -44,6 +45,14 @@ using namespace ToUTF8; +namespace +{ + std::string_view::iterator skipAscii(std::string_view input) + { + return std::find_if(input.begin(), input.end(), [] (unsigned char v) { return v == 0 || v >= 128; }); + } +} + Utf8Encoder::Utf8Encoder(const FromType sourceEncoding): mOutput(50*1024) { @@ -82,11 +91,6 @@ std::string_view Utf8Encoder::getUtf8(std::string_view input) if (input.empty()) return input; - // Double check that the input string stops at some point (it might 
- // contain zero terminators before this, inside its own data, which - // is also ok.) - assert(input[input.size()] == 0); - // Note: The rest of this function is designed for single-character // input encodings only. It also assumes that the input encoding // shares its first 128 values (0-127) with ASCII. There are no plans @@ -95,8 +99,7 @@ std::string_view Utf8Encoder::getUtf8(std::string_view input) // Compute output length, and check for pure ascii input at the same // time. - bool ascii; - size_t outlen = getLength(input.data(), ascii); + const auto [outlen, ascii] = getLength(input); // If we're pure ascii, then don't bother converting anything. if(ascii) @@ -107,8 +110,8 @@ std::string_view Utf8Encoder::getUtf8(std::string_view input) char *out = &mOutput[0]; // Translate - for (const char* ptr = input.data(); *ptr;) - copyFromArray(*(ptr++), out); + for (auto it = input.begin(); it != input.end() && *it != 0; ++it) + copyFromArray(*it, out); // Make sure that we wrote the correct number of bytes assert((out-&mOutput[0]) == (int)outlen); @@ -125,11 +128,6 @@ std::string_view Utf8Encoder::getLegacyEnc(std::string_view input) if (input.empty()) return input; - // Double check that the input string stops at some point (it might - // contain zero terminators before this, inside its own data, which - // is also ok.) - assert(input[input.size()] == 0); - // TODO: The rest of this function is designed for single-character // input encodings only. It also assumes that the input the input // encoding shares its first 128 values (0-127) with ASCII. These @@ -138,8 +136,7 @@ std::string_view Utf8Encoder::getLegacyEnc(std::string_view input) // Compute output length, and check for pure ascii input at the same // time. - bool ascii; - size_t outlen = getLength2(input.data(), ascii); + const auto [outlen, ascii] = getLengthLegacyEnc(input); // If we're pure ascii, then don't bother converting anything. 
if(ascii) @@ -150,8 +147,8 @@ std::string_view Utf8Encoder::getLegacyEnc(std::string_view input) char *out = &mOutput[0]; // Translate - for (const char* ptr = input.data(); *ptr;) - copyFromArray2(ptr, out); + for (auto it = input.begin(); it != input.end() && *it != 0;) + copyFromArrayLegacyEnc(it, input.end(), out); // Make sure that we wrote the correct number of bytes assert((out-&mOutput[0]) == (int)outlen); @@ -186,34 +183,30 @@ void Utf8Encoder::resize(size_t size) is the case, then the ascii parameter is set to true, and the caller can optimize for this case. */ -size_t Utf8Encoder::getLength(const char* input, bool &ascii) const +std::pair<std::size_t, bool> Utf8Encoder::getLength(std::string_view input) const { - ascii = true; - size_t len = 0; - const char* ptr = input; - unsigned char inp = *ptr; - // Do away with the ascii part of the string first (this is almost // always the entire string.) - while (inp && inp < 128) - inp = *(++ptr); - len += (ptr-input); + auto it = skipAscii(input); // If we're not at the null terminator at this point, then there // were some non-ascii characters to deal with. Go to slow-mode for // the rest of the string. - if (inp) + if (it == input.end() || *it == 0) + return {it - input.begin(), true}; + + std::size_t len = it - input.begin(); + + do { - ascii = false; - while (inp) - { - // Find the translated length of this character in the - // lookup table. - len += translationArray[inp*6]; - inp = *(++ptr); - } + // Find the translated length of this character in the + // lookup table. 
+ len += translationArray[static_cast<unsigned char>(*it) * 6]; + ++it; } - return len; + while (it != input.end() && *it != 0); + + return {len, false}; } // Translate one character 'ch' using the translation array 'arr', and @@ -233,51 +226,52 @@ void Utf8Encoder::copyFromArray(unsigned char ch, char* &out) const out += len; } -size_t Utf8Encoder::getLength2(const char* input, bool &ascii) const +std::pair<std::size_t, bool> Utf8Encoder::getLengthLegacyEnc(std::string_view input) const { - ascii = true; - size_t len = 0; - const char* ptr = input; - unsigned char inp = *ptr; - // Do away with the ascii part of the string first (this is almost // always the entire string.) - while (inp && inp < 128) - inp = *(++ptr); - len += (ptr-input); + auto it = skipAscii(input); // If we're not at the null terminator at this point, then there // were some non-ascii characters to deal with. Go to slow-mode for // the rest of the string. - if (inp) + if (it == input.end() || *it == 0) + return {it - input.begin(), true}; + + std::size_t len = it - input.begin(); + std::size_t symbolLen = 0; + + do { - ascii = false; - while(inp) + symbolLen += 1; + // Find the translated length of this character in the + // lookup table. + switch (static_cast<unsigned char>(*it)) { - len += 1; - // Find the translated length of this character in the - // lookup table. 
- switch(inp) - { - case 0xe2: len -= 2; break; - case 0xc2: - case 0xcb: - case 0xc4: - case 0xc6: - case 0xc3: - case 0xd0: - case 0xd1: - case 0xd2: - case 0xc5: len -= 1; break; - } - - inp = *(++ptr); + case 0xe2: symbolLen -= 2; break; + case 0xc2: + case 0xcb: + case 0xc4: + case 0xc6: + case 0xc3: + case 0xd0: + case 0xd1: + case 0xd2: + case 0xc5: symbolLen -= 1; break; + default: + len += symbolLen; + symbolLen = 0; + break; } + + ++it; } - return len; + while (it != input.end() && *it != 0); + + return {len, false}; } -void Utf8Encoder::copyFromArray2(const char*& chp, char* &out) const +void Utf8Encoder::copyFromArrayLegacyEnc(std::string_view::iterator& chp, std::string_view::iterator end, char* &out) const { unsigned char ch = *(chp++); // Optimize for ASCII values @@ -308,10 +302,17 @@ void Utf8Encoder::copyFromArray2(const char*& chp, char* &out) const return; } + if (chp == end) + return; + unsigned char ch2 = *(chp++); unsigned char ch3 = '\0'; if (len == 3) + { + if (chp == end) + return; ch3 = *(chp++); + } for (int i = 128; i < 256; i++) { diff --git a/components/to_utf8/to_utf8.hpp b/components/to_utf8/to_utf8.hpp index 0e9db01e1d..794c9148e5 100644 --- a/components/to_utf8/to_utf8.hpp +++ b/components/to_utf8/to_utf8.hpp @@ -38,11 +38,11 @@ namespace ToUTF8 std::string_view getLegacyEnc(std::string_view input); private: - void resize(size_t size); - size_t getLength(const char* input, bool &ascii) const; - void copyFromArray(unsigned char chp, char* &out) const; - size_t getLength2(const char* input, bool &ascii) const; - void copyFromArray2(const char*& chp, char* &out) const; + inline void resize(std::size_t size); + inline std::pair<std::size_t, bool> getLength(std::string_view input) const; + inline void copyFromArray(unsigned char chp, char* &out) const; + inline std::pair<std::size_t, bool> getLengthLegacyEnc(std::string_view input) const; + inline void copyFromArrayLegacyEnc(std::string_view::iterator& chp, std::string_view::iterator end, char* &out) const; std::vector<char> 
mOutput; const signed char* translationArray;