From 7884a0102632694b524a19ebe21d3e1438742907 Mon Sep 17 00:00:00 2001
From: elsid <elsid.mail@gmail.com>
Date: Sun, 13 Feb 2022 16:38:49 +0100
Subject: [PATCH] Add tests for Utf8Encoder

---
 apps/openmw_test_suite/CMakeLists.txt         |   6 +-
 .../toutf8/data}/french-utf8.txt              |   0
 .../toutf8/data}/french-win1252.txt           |   0
 .../toutf8/data}/russian-utf8.txt             |   0
 .../toutf8/data}/russian-win1251.txt          |   0
 apps/openmw_test_suite/toutf8/toutf8.cpp      | 139 +++++++++++++++++
 components/to_utf8/tests/.gitignore           |   1 -
 .../to_utf8/tests/output/to_utf8_test.out     |   4 -
 components/to_utf8/tests/test.sh              |  18 ---
 components/to_utf8/tests/to_utf8_test.cpp     |  59 --------
 components/to_utf8/to_utf8.cpp                | 141 +++++++++---------
 components/to_utf8/to_utf8.hpp                |  10 +-
 12 files changed, 220 insertions(+), 158 deletions(-)
 rename {components/to_utf8/tests/test_data => apps/openmw_test_suite/toutf8/data}/french-utf8.txt (100%)
 rename {components/to_utf8/tests/test_data => apps/openmw_test_suite/toutf8/data}/french-win1252.txt (100%)
 rename {components/to_utf8/tests/test_data => apps/openmw_test_suite/toutf8/data}/russian-utf8.txt (100%)
 rename {components/to_utf8/tests/test_data => apps/openmw_test_suite/toutf8/data}/russian-win1251.txt (100%)
 create mode 100644 apps/openmw_test_suite/toutf8/toutf8.cpp
 delete mode 100644 components/to_utf8/tests/.gitignore
 delete mode 100644 components/to_utf8/tests/output/to_utf8_test.out
 delete mode 100755 components/to_utf8/tests/test.sh
 delete mode 100644 components/to_utf8/tests/to_utf8_test.cpp
diff --git a/apps/openmw_test_suite/CMakeLists.txt b/apps/openmw_test_suite/CMakeLists.txt
index 2ee34186d8..16d820c4f9 100644
--- a/apps/openmw_test_suite/CMakeLists.txt
+++ b/apps/openmw_test_suite/CMakeLists.txt
@@ -70,6 +70,8 @@ if (GTEST_FOUND AND GMOCK_FOUND)
         esmloader/esmdata.cpp
 
         files/hash.cpp
+
+        toutf8/toutf8.cpp
     )
 
     source_group(apps\\openmw_test_suite FILES openmw_test_suite.cpp ${UNITTEST_SRC_FILES})
@@ -93,6 +95,8 @@ if (GTEST_FOUND AND GMOCK_FOUND)
         EXPECTED_MD5 bf3691034a38611534c74c3b89a7d2c3
     )
 
-    target_compile_definitions(openmw_test_suite PRIVATE OPENMW_DATA_DIR="${CMAKE_CURRENT_BINARY_DIR}/data")
+    target_compile_definitions(openmw_test_suite
+        PRIVATE OPENMW_DATA_DIR="${CMAKE_CURRENT_BINARY_DIR}/data"
+                OPENMW_TEST_SUITE_SOURCE_DIR="${CMAKE_CURRENT_SOURCE_DIR}")
 
 endif()
diff --git a/components/to_utf8/tests/test_data/french-utf8.txt b/apps/openmw_test_suite/toutf8/data/french-utf8.txt
similarity index 100%
rename from components/to_utf8/tests/test_data/french-utf8.txt
rename to apps/openmw_test_suite/toutf8/data/french-utf8.txt
diff --git a/components/to_utf8/tests/test_data/french-win1252.txt b/apps/openmw_test_suite/toutf8/data/french-win1252.txt
similarity index 100%
rename from components/to_utf8/tests/test_data/french-win1252.txt
rename to apps/openmw_test_suite/toutf8/data/french-win1252.txt
diff --git a/components/to_utf8/tests/test_data/russian-utf8.txt b/apps/openmw_test_suite/toutf8/data/russian-utf8.txt
similarity index 100%
rename from components/to_utf8/tests/test_data/russian-utf8.txt
rename to apps/openmw_test_suite/toutf8/data/russian-utf8.txt
diff --git a/components/to_utf8/tests/test_data/russian-win1251.txt b/apps/openmw_test_suite/toutf8/data/russian-win1251.txt
similarity index 100%
rename from components/to_utf8/tests/test_data/russian-win1251.txt
rename to apps/openmw_test_suite/toutf8/data/russian-win1251.txt
diff --git a/apps/openmw_test_suite/toutf8/toutf8.cpp b/apps/openmw_test_suite/toutf8/toutf8.cpp
new file mode 100644
index 0000000000..bad4f34fd5
--- /dev/null
+++ b/apps/openmw_test_suite/toutf8/toutf8.cpp
@@ -0,0 +1,139 @@
+#include <components/to_utf8/to_utf8.hpp>
+
+#include <gtest/gtest.h>
+
+#include <fstream>
+
+#ifndef OPENMW_TEST_SUITE_SOURCE_DIR
+#define OPENMW_TEST_SUITE_SOURCE_DIR ""
+#endif
+
+namespace
+{
+    using namespace testing;
+    using namespace ToUTF8;
+
+    struct Params // One parameterized case: a legacy encoding plus a pair of fixture files.
+    {
+        FromType mLegacyEncoding; // Encoding handed to the Utf8Encoder constructor.
+        std::string mLegacyEncodingFileName; // Fixture name passed to readContent(); text in the legacy encoding.
+        std::string mUtf8FileName; // Fixture name passed to readContent(); the UTF-8 counterpart it is compared with.
+    };
+
+    std::string readContent(const std::string& fileName) // Returns the raw bytes of <source dir>/toutf8/data/<fileName>.
+    {
+        std::ifstream file;
+        file.exceptions(std::ios::failbit | std::ios::badbit); // A fixture that cannot be opened throws instead of reading as "".
+        file.open(std::string(OPENMW_TEST_SUITE_SOURCE_DIR) + "/toutf8/data/" + fileName, std::ios::binary); // Binary: no newline translation of encoded bytes.
+        std::stringstream buffer;
+        buffer << file.rdbuf(); // Copy the whole file into memory.
+        return buffer.str();
+    }
+
+    struct Utf8EncoderTest : TestWithParam<Params> {}; // Parameterized fixture used by the TEST_P cases below.
+
+    TEST(Utf8EncoderTest, getUtf8ShouldReturnEmptyAsIs) // Covers the input.empty() early return of getUtf8().
+    {
+        Utf8Encoder encoder(FromType::CP437);
+        EXPECT_EQ(encoder.getUtf8(std::string_view()), std::string_view());
+    }
+
+    TEST(Utf8EncoderTest, getUtf8ShouldReturnAsciiOnlyAsIs) // Pure-ASCII input must come back as a view of the input itself.
+    {
+        std::string input;
+        for (int c = 1; c <= std::numeric_limits<signed char>::max(); ++c) // 1..127 even where plain char is unsigned.
+            input.push_back(static_cast<char>(c));
+        Utf8Encoder encoder(FromType::CP437);
+        const std::string_view result = encoder.getUtf8(input);
+        EXPECT_EQ(result.data(), input.data()); // Pointer comparison: the result must alias the input.
+        EXPECT_EQ(result.size(), input.size());
+    }
+
+    TEST(Utf8EncoderTest, getUtf8ShouldLookUpUntilZero) // Conversion must stop at the first embedded NUL byte.
+    {
+        const std::string input("a\0b", 3); // Explicit length keeps all 3 bytes; the const char* constructor alone stops at the NUL.
+        Utf8Encoder encoder(FromType::CP437);
+        const std::string_view result = encoder.getUtf8(input);
+        EXPECT_EQ(result, "a");
+    }
+
+    TEST(Utf8EncoderTest, getUtf8ShouldLookUpUntilEndOfInputForAscii) // The view's size, not the buffer's terminator, bounds the scan.
+    {
+        const std::string input("abc");
+        Utf8Encoder encoder(FromType::CP437);
+        const std::string_view result = encoder.getUtf8(std::string_view(input.data(), 2)); // Only "ab" is visible to the encoder.
+        EXPECT_EQ(result, "ab");
+    }
+
+    TEST(Utf8EncoderTest, getUtf8ShouldLookUpUntilEndOfInputForNonAscii) // Same bound check, on the non-ASCII conversion path.
+    {
+        const std::string input("a\x92" "b"); // Literal is split so that 'b' is not parsed as part of the hex escape.
+        Utf8Encoder encoder(FromType::WINDOWS_1252);
+        const std::string_view result = encoder.getUtf8(std::string_view(input.data(), 2)); // Only 'a' and byte 0x92 are visible.
+        EXPECT_EQ(result, "a\xE2\x80\x99"); // E2 80 99 is the UTF-8 encoding of U+2019.
+    }
+
+    TEST_P(Utf8EncoderTest, getUtf8ShouldConvertFromLegacyEncodingToUtf8) // Legacy fixture converted to UTF-8 must equal the UTF-8 fixture.
+    {
+        const std::string input(readContent(GetParam().mLegacyEncodingFileName));
+        const std::string expected(readContent(GetParam().mUtf8FileName));
+        Utf8Encoder encoder(GetParam().mLegacyEncoding);
+        const std::string_view result = encoder.getUtf8(input);
+        EXPECT_EQ(result, expected); // Compares the whole file content, not just its first line.
+    }
+
+    TEST(Utf8EncoderTest, getLegacyEncShouldReturnEmptyAsIs) // Covers the input.empty() early return of getLegacyEnc().
+    {
+        Utf8Encoder encoder(FromType::CP437);
+        EXPECT_EQ(encoder.getLegacyEnc(std::string_view()), std::string_view());
+    }
+
+    TEST(Utf8EncoderTest, getLegacyEncShouldReturnAsciiOnlyAsIs) // Pure-ASCII input must come back as a view of the input itself.
+    {
+        std::string input;
+        for (int c = 1; c <= std::numeric_limits<signed char>::max(); ++c) // 1..127 even where plain char is unsigned.
+            input.push_back(static_cast<char>(c));
+        Utf8Encoder encoder(FromType::CP437);
+        const std::string_view result = encoder.getLegacyEnc(input);
+        EXPECT_EQ(result.data(), input.data()); // Pointer comparison: the result must alias the input.
+        EXPECT_EQ(result.size(), input.size());
+    }
+
+    TEST(Utf8EncoderTest, getLegacyEncShouldLookUpUntilZero) // Conversion must stop at the first embedded NUL byte.
+    {
+        const std::string input("a\0b", 3); // Explicit length keeps all 3 bytes; the const char* constructor alone stops at the NUL.
+        Utf8Encoder encoder(FromType::CP437);
+        const std::string_view result = encoder.getLegacyEnc(input);
+        EXPECT_EQ(result, "a");
+    }
+
+    TEST(Utf8EncoderTest, getLegacyEncShouldLookUpUntilEndOfInputForAscii) // The view's size, not the buffer's terminator, bounds the scan.
+    {
+        const std::string input("abc");
+        Utf8Encoder encoder(FromType::CP437);
+        const std::string_view result = encoder.getLegacyEnc(std::string_view(input.data(), 2)); // Only "ab" is visible to the encoder.
+        EXPECT_EQ(result, "ab");
+    }
+
+    TEST(Utf8EncoderTest, getLegacyEncShouldStripIncompleteCharacters) // A UTF-8 sequence cut off by the end of the view yields no output.
+    {
+        const std::string input("a\xc3\xa2\xe2\x80\x99"); // 'a', a 2-byte sequence (C3 A2), then a 3-byte sequence (E2 80 99).
+        Utf8Encoder encoder(FromType::WINDOWS_1252);
+        const std::string_view result = encoder.getLegacyEnc(std::string_view(input.data(), 5)); // Length 5 cuts off the final byte 0x99.
+        EXPECT_EQ(result, "a\xe2"); // C3 A2 becomes the single byte E2; the truncated sequence is dropped.
+    }
+
+    TEST_P(Utf8EncoderTest, getLegacyEncShouldConvertFromUtf8ToLegacyEncoding) // UTF-8 fixture converted back must equal the legacy fixture.
+    {
+        const std::string input(readContent(GetParam().mUtf8FileName));
+        const std::string expected(readContent(GetParam().mLegacyEncodingFileName));
+        Utf8Encoder encoder(GetParam().mLegacyEncoding);
+        const std::string_view result = encoder.getLegacyEnc(input);
+        EXPECT_EQ(result, expected); // Compares the whole file content, not just its first line.
+    }
+
+    INSTANTIATE_TEST_SUITE_P(Files, Utf8EncoderTest, Values( // Runs every TEST_P case once per fixture pair below.
+        Params {ToUTF8::WINDOWS_1251, "russian-win1251.txt", "russian-utf8.txt"}, // Russian text fixture pair.
+        Params {ToUTF8::WINDOWS_1252, "french-win1252.txt", "french-utf8.txt"} // French text fixture pair.
+    ));
+}
diff --git a/components/to_utf8/tests/.gitignore b/components/to_utf8/tests/.gitignore
deleted file mode 100644
index 8144904045..0000000000
--- a/components/to_utf8/tests/.gitignore
+++ /dev/null
@@ -1 +0,0 @@
-*_test
diff --git a/components/to_utf8/tests/output/to_utf8_test.out b/components/to_utf8/tests/output/to_utf8_test.out
deleted file mode 100644
index dcb32359ab..0000000000
--- a/components/to_utf8/tests/output/to_utf8_test.out
+++ /dev/null
@@ -1,4 +0,0 @@
-original:  Без вопросов отдаете ему рулет, зная, что позже вы сможете привести с собой своих друзей и тогда он получит по заслугам?
-converted: Без вопросов отдаете ему рулет, зная, что позже вы сможете привести с собой своих друзей и тогда он получит по заслугам?
-original:  Vous lui donnez le gâteau sans protester avant d’aller chercher tous vos amis et de revenir vous venger.
-converted: Vous lui donnez le gâteau sans protester avant d’aller chercher tous vos amis et de revenir vous venger.
diff --git a/components/to_utf8/tests/test.sh b/components/to_utf8/tests/test.sh
deleted file mode 100755
index 2d07708adc..0000000000
--- a/components/to_utf8/tests/test.sh
+++ /dev/null
@@ -1,18 +0,0 @@
-#!/bin/bash
-
-make || exit
-
-mkdir -p output
-
-PROGS=*_test
-
-for a in $PROGS; do
-    if [ -f "output/$a.out" ]; then
-        echo "Running $a:"
-        ./$a | diff output/$a.out -
-    else
-        echo "Creating $a.out"
-        ./$a > "output/$a.out"
-        git add "output/$a.out"
-    fi
-done
diff --git a/components/to_utf8/tests/to_utf8_test.cpp b/components/to_utf8/tests/to_utf8_test.cpp
deleted file mode 100644
index 3fcddd1581..0000000000
--- a/components/to_utf8/tests/to_utf8_test.cpp
+++ /dev/null
@@ -1,59 +0,0 @@
-#include <iostream>
-#include <fstream>
-#include <cassert>
-#include <stdexcept>
-
-#include "../to_utf8.hpp"
-
-std::string getFirstLine(const std::string &filename);
-void testEncoder(ToUTF8::FromType encoding, const std::string &legacyEncFile,
-                 const std::string &utf8File);
-
-/// Test character encoding conversion to and from UTF-8
-void testEncoder(ToUTF8::FromType encoding, const std::string &legacyEncFile,
-                 const std::string &utf8File)
-{
-    // get some test data
-    std::string legacyEncLine = getFirstLine(legacyEncFile);
-    std::string utf8Line = getFirstLine(utf8File);
-
-    // create an encoder for specified character encoding
-    ToUTF8::Utf8Encoder encoder (encoding);
-
-    // convert text to UTF-8
-    std::string convertedUtf8Line = encoder.getUtf8(legacyEncLine);
-
-    std::cout << "original:  " << utf8Line          << std::endl;
-    std::cout << "converted: " << convertedUtf8Line << std::endl;
-
-    // check correctness
-    assert(convertedUtf8Line == utf8Line);
-
-    // convert UTF-8 text to legacy encoding
-    std::string convertedLegacyEncLine = encoder.getLegacyEnc(utf8Line);
-    // check correctness
-    assert(convertedLegacyEncLine == legacyEncLine);
-}
-
-std::string getFirstLine(const std::string &filename)
-{
-    std::string line;
-    std::ifstream text (filename.c_str());
-
-    if (!text.is_open())
-    {
-        throw std::runtime_error("Unable to open file " + filename);
-    }
-
-    std::getline(text, line);
-    text.close();
-
-    return line;
-}
-
-int main()
-{
-    testEncoder(ToUTF8::WINDOWS_1251, "test_data/russian-win1251.txt", "test_data/russian-utf8.txt");
-    testEncoder(ToUTF8::WINDOWS_1252, "test_data/french-win1252.txt", "test_data/french-utf8.txt");
-    return 0;
-}
diff --git a/components/to_utf8/to_utf8.cpp b/components/to_utf8/to_utf8.cpp
index 7fd0e3cd88..1f0a81ad10 100644
--- a/components/to_utf8/to_utf8.cpp
+++ b/components/to_utf8/to_utf8.cpp
@@ -3,6 +3,7 @@
 #include <vector>
 #include <cassert>
 #include <stdexcept>
+#include <algorithm>
 
 #include <components/debug/debuglog.hpp>
 
@@ -44,6 +45,14 @@
 
 using namespace ToUTF8;
 
+namespace
+{
+    std::string_view::iterator skipAscii(std::string_view input) // Returns the first position holding NUL or a byte >= 128, or input.end().
+    {
+        return std::find_if(input.begin(), input.end(), [] (unsigned char v) { return v == 0 || v >= 128; }); // Bytes are compared as unsigned char so high bytes are 128..255.
+    }
+}
+
 Utf8Encoder::Utf8Encoder(const FromType sourceEncoding):
     mOutput(50*1024)
 {
@@ -82,11 +91,6 @@ std::string_view Utf8Encoder::getUtf8(std::string_view input)
     if (input.empty())
         return input;
 
-    // Double check that the input string stops at some point (it might
-    // contain zero terminators before this, inside its own data, which
-    // is also ok.)
-    assert(input[input.size()] == 0);
-
     // Note: The rest of this function is designed for single-character
     // input encodings only. It also assumes that the input encoding
     // shares its first 128 values (0-127) with ASCII. There are no plans
@@ -95,8 +99,7 @@ std::string_view Utf8Encoder::getUtf8(std::string_view input)
 
     // Compute output length, and check for pure ascii input at the same
     // time.
-    bool ascii;
-    size_t outlen = getLength(input.data(), ascii);
+    const auto [outlen, ascii] = getLength(input);
 
     // If we're pure ascii, then don't bother converting anything.
     if(ascii)
@@ -107,8 +110,8 @@ std::string_view Utf8Encoder::getUtf8(std::string_view input)
     char *out = &mOutput[0];
 
     // Translate
-    for (const char* ptr = input.data(); *ptr;)
-        copyFromArray(*(ptr++), out);
+    for (auto it = input.begin(); it != input.end() && *it != 0; ++it)
+        copyFromArray(*it, out);
 
     // Make sure that we wrote the correct number of bytes
     assert((out-&mOutput[0]) == (int)outlen);
@@ -125,11 +128,6 @@ std::string_view Utf8Encoder::getLegacyEnc(std::string_view input)
     if (input.empty())
         return input;
 
-    // Double check that the input string stops at some point (it might
-    // contain zero terminators before this, inside its own data, which
-    // is also ok.)
-    assert(input[input.size()] == 0);
-
     // TODO: The rest of this function is designed for single-character
     // input encodings only. It also assumes that the input the input
     // encoding shares its first 128 values (0-127) with ASCII. These
@@ -138,8 +136,7 @@ std::string_view Utf8Encoder::getLegacyEnc(std::string_view input)
 
     // Compute output length, and check for pure ascii input at the same
     // time.
-    bool ascii;
-    size_t outlen = getLength2(input.data(), ascii);
+    const auto [outlen, ascii] = getLengthLegacyEnc(input);
 
     // If we're pure ascii, then don't bother converting anything.
     if(ascii)
@@ -150,8 +147,8 @@ std::string_view Utf8Encoder::getLegacyEnc(std::string_view input)
     char *out = &mOutput[0];
 
     // Translate
-    for (const char* ptr = input.data(); *ptr;)
-        copyFromArray2(ptr, out);
+    for (auto it = input.begin(); it != input.end() && *it != 0;)
+        copyFromArrayLegacyEnc(it, input.end(), out);
 
     // Make sure that we wrote the correct number of bytes
     assert((out-&mOutput[0]) == (int)outlen);
@@ -186,34 +183,30 @@ void Utf8Encoder::resize(size_t size)
   is the case, then the ascii parameter is set to true, and the
   caller can optimize for this case.
  */
-size_t Utf8Encoder::getLength(const char* input, bool &ascii) const
+std::pair<std::size_t, bool> Utf8Encoder::getLength(std::string_view input) const // Returns {converted length up to end of input or first NUL, true if that range is pure ASCII}.
 {
-    ascii = true;
-    size_t len = 0;
-    const char* ptr = input;
-    unsigned char inp = *ptr;
-
     // Do away with the ascii part of the string first (this is almost
     // always the entire string.)
-    while (inp && inp < 128)
-        inp = *(++ptr);
-    len += (ptr-input);
+    auto it = skipAscii(input);
 
     // If we're not at the null terminator at this point, then there
     // were some non-ascii characters to deal with. Go to slow-mode for
     // the rest of the string.
-    if (inp)
+    if (it == input.end() || *it == 0)
+        return {it - input.begin(), true}; // Nothing to translate: the length is just the ASCII prefix.
+
+    std::size_t len = it - input.begin(); // Start from the ASCII prefix, whose bytes count one each.
+
+    do
     {
-        ascii = false;
-        while (inp)
-        {
-            // Find the translated length of this character in the
-            // lookup table.
-            len += translationArray[inp*6];
-            inp = *(++ptr);
-        }
+        // Find the translated length of this character in the
+        // lookup table.
+        len += translationArray[static_cast<unsigned char>(*it) * 6]; // Rows are 6 entries wide; the first entry is the length.
+        ++it;
     }
-    return len;
+    while (it != input.end() && *it != 0); // Stop at the end of the view or at the first NUL, whichever comes first.
+
+    return {len, false};
 }
 
 // Translate one character 'ch' using the translation array 'arr', and
@@ -233,51 +226,52 @@ void Utf8Encoder::copyFromArray(unsigned char ch, char* &out) const
     out += len;
 }
 
-size_t Utf8Encoder::getLength2(const char* input, bool &ascii) const
+std::pair<std::size_t, bool> Utf8Encoder::getLengthLegacyEnc(std::string_view input) const // Returns {legacy-encoded length up to end of input or first NUL, true if that range is pure ASCII}.
 {
-    ascii = true;
-    size_t len = 0;
-    const char* ptr = input;
-    unsigned char inp = *ptr;
-
     // Do away with the ascii part of the string first (this is almost
     // always the entire string.)
-    while (inp && inp < 128)
-        inp = *(++ptr);
-    len += (ptr-input);
+    auto it = skipAscii(input);
 
     // If we're not at the null terminator at this point, then there
     // were some non-ascii characters to deal with. Go to slow-mode for
     // the rest of the string.
-    if (inp)
-    {
-        ascii = false;
-        while(inp)
-        {
-            len += 1;
-            // Find the translated length of this character in the
-            // lookup table.
-            switch(inp)
-            {
-                case 0xe2: len -= 2; break;
-                case 0xc2:
-                case 0xcb:
-                case 0xc4:
-                case 0xc6:
-                case 0xc3:
-                case 0xd0:
-                case 0xd1:
-                case 0xd2:
-                case 0xc5: len -= 1; break;
-            }
+    if (it == input.end() || *it == 0)
+        return {it - input.begin(), true}; // Nothing to translate: the length is just the ASCII prefix.
 
-            inp = *(++ptr);
+    std::size_t len = it - input.begin(); // Start from the ASCII prefix, whose bytes count one each.
+    std::size_t symbolLen = 0; // Pending count for the current sequence; may wrap below zero, which unsigned arithmetic undoes on later increments.
+
+    do
+    {
+        symbolLen += 1;
+        // Lead bytes listed below subtract their extra bytes, so a complete
+        // 2- or 3-byte sequence contributes exactly one output byte.
+        switch (static_cast<unsigned char>(*it))
+        {
+            case 0xe2: symbolLen -= 2; break; // Lead byte treated as starting a 3-byte sequence.
+            case 0xc2:
+            case 0xcb:
+            case 0xc4:
+            case 0xc6:
+            case 0xc3:
+            case 0xd0:
+            case 0xd1:
+            case 0xd2:
+            case 0xc5: symbolLen -= 1; break; // Lead bytes treated as starting a 2-byte sequence.
+            default:
+                len += symbolLen; // Any other byte commits the pending count.
+                symbolLen = 0; // A sequence still pending when the loop ends is never added.
+                break;
         }
+
+        ++it;
     }
-    return len;
+    while (it != input.end() && *it != 0); // Stop at the end of the view or at the first NUL, whichever comes first.
+
+    return {len, false};
 }
 
-void Utf8Encoder::copyFromArray2(const char*& chp, char* &out) const
+void Utf8Encoder::copyFromArrayLegacyEnc(std::string_view::iterator& chp, std::string_view::iterator end, char* &out) const
 {
     unsigned char ch = *(chp++);
     // Optimize for ASCII values
@@ -308,10 +302,17 @@ void Utf8Encoder::copyFromArray2(const char*& chp, char* &out) const
         return;
     }
 
+    if (chp == end)
+        return;
+
     unsigned char ch2 = *(chp++);
     unsigned char ch3 = '\0';
     if (len == 3)
+    {
+        if (chp == end)
+            return;
         ch3 = *(chp++);
+    }
 
     for (int i = 128; i < 256; i++)
     {
diff --git a/components/to_utf8/to_utf8.hpp b/components/to_utf8/to_utf8.hpp
index 0e9db01e1d..794c9148e5 100644
--- a/components/to_utf8/to_utf8.hpp
+++ b/components/to_utf8/to_utf8.hpp
@@ -38,11 +38,11 @@ namespace ToUTF8
             std::string_view getLegacyEnc(std::string_view input);
 
         private:
-            void resize(size_t size);
-            size_t getLength(const char* input, bool &ascii) const;
-            void copyFromArray(unsigned char chp, char* &out) const;
-            size_t getLength2(const char* input, bool &ascii) const;
-            void copyFromArray2(const char*& chp, char* &out) const;
+            inline void resize(std::size_t size); // NOTE(review): these inline members are defined in to_utf8.cpp, so they are only callable from that file - confirm no other users.
+            inline std::pair<std::size_t, bool> getLength(std::string_view input) const; // Returns {UTF-8 output length, true if the input is pure ASCII}.
+            inline void copyFromArray(unsigned char chp, char* &out) const; // Translates one input byte and advances out.
+            inline std::pair<std::size_t, bool> getLengthLegacyEnc(std::string_view input) const; // Returns {legacy output length, true if the input is pure ASCII}.
+            inline void copyFromArrayLegacyEnc(std::string_view::iterator& chp, std::string_view::iterator end, char* &out) const; // Consumes one UTF-8 sequence at chp; returns early if end is reached mid-sequence.
 
             std::vector<char> mOutput;
             const signed char* translationArray;