Add tests for Utf8Encoder

C++20
elsid 3 years ago
parent d8127fdad2
commit 7884a01026
No known key found for this signature in database
GPG Key ID: B845CB9FEE18AB40

@ -70,6 +70,8 @@ if (GTEST_FOUND AND GMOCK_FOUND)
esmloader/esmdata.cpp esmloader/esmdata.cpp
files/hash.cpp files/hash.cpp
toutf8/toutf8.cpp
) )
source_group(apps\\openmw_test_suite FILES openmw_test_suite.cpp ${UNITTEST_SRC_FILES}) source_group(apps\\openmw_test_suite FILES openmw_test_suite.cpp ${UNITTEST_SRC_FILES})
@ -93,6 +95,8 @@ if (GTEST_FOUND AND GMOCK_FOUND)
EXPECTED_MD5 bf3691034a38611534c74c3b89a7d2c3 EXPECTED_MD5 bf3691034a38611534c74c3b89a7d2c3
) )
target_compile_definitions(openmw_test_suite PRIVATE OPENMW_DATA_DIR="${CMAKE_CURRENT_BINARY_DIR}/data") target_compile_definitions(openmw_test_suite
PRIVATE OPENMW_DATA_DIR="${CMAKE_CURRENT_BINARY_DIR}/data"
OPENMW_TEST_SUITE_SOURCE_DIR="${CMAKE_CURRENT_SOURCE_DIR}")
endif() endif()

@ -0,0 +1,139 @@
#include <components/to_utf8/to_utf8.hpp>
#include <gtest/gtest.h>
#include <fstream>
#include <limits>
#include <sstream>
#include <string>
#include <string_view>
#ifndef OPENMW_TEST_SUITE_SOURCE_DIR
#define OPENMW_TEST_SUITE_SOURCE_DIR ""
#endif
namespace
{
using namespace testing;
using namespace ToUTF8;
// One file-based conversion case for the value-parameterized tests: the
// legacy encoding under test plus the names of the two fixture files that
// readContent() loads.
struct Params
{
// Encoding handed to the Utf8Encoder constructor.
FromType mLegacyEncoding;
// Name of the fixture file with the text in the legacy encoding.
std::string mLegacyEncodingFileName;
// Name of the fixture file with the text in UTF-8; the parameterized tests
// expect each file to convert into the other.
std::string mUtf8FileName;
};
// Returns the full content of the fixture file
// OPENMW_TEST_SUITE_SOURCE_DIR "/toutf8/data/" + fileName.
// Throws std::ios_base::failure when the stream reports failbit or badbit,
// e.g. when the file cannot be opened.
std::string readContent(const std::string& fileName)
{
    std::ifstream file;
    // Enable exceptions before open() so a missing fixture fails loudly
    // instead of producing an empty string.
    file.exceptions(std::ios::failbit | std::ios::badbit);
    // Binary mode: the fixtures hold legacy-encoded bytes that must reach the
    // encoder unchanged, without any platform newline or EOF translation.
    file.open(std::string(OPENMW_TEST_SUITE_SOURCE_DIR) + "/toutf8/data/" + fileName, std::ios::binary);
    std::stringstream buffer;
    buffer << file.rdbuf();
    return buffer.str();
}
// Fixture for the TEST_P cases below; GetParam() yields one Params value.
struct Utf8EncoderTest : TestWithParam<Params> {};
TEST(Utf8EncoderTest, getUtf8ShouldReturnEmptyAsIs)
{
    // An empty view must come back as an empty view.
    const std::string_view empty;
    Utf8Encoder encoder(FromType::CP437);
    const std::string_view result = encoder.getUtf8(empty);
    EXPECT_EQ(result, empty);
}
TEST(Utf8EncoderTest, getUtf8ShouldReturnAsciiOnlyAsIs)
{
    // Every non-zero 7-bit ASCII code. The bound is spelled out instead of
    // using std::numeric_limits<char>::max(): that is 255 on platforms where
    // plain char is unsigned, which would put non-ASCII bytes into the input.
    constexpr int lastAscii = 0x7F;
    std::string input;
    for (int c = 1; c <= lastAscii; ++c)
        input.push_back(static_cast<char>(c));
    Utf8Encoder encoder(FromType::CP437);
    const std::string_view result = encoder.getUtf8(input);
    // Same pointer and size means the result is a view of the input itself,
    // not a converted copy.
    EXPECT_EQ(result.data(), input.data());
    EXPECT_EQ(result.size(), input.size());
}
TEST(Utf8EncoderTest, getUtf8ShouldLookUpUntilZero)
{
    // The explicit length keeps the embedded '\0' and the trailing 'b' in the
    // string. std::string("a\0b") would stop at the terminator and hold only
    // "a", so the zero-stop behaviour would never be exercised.
    const std::string input("a\0b", 3);
    Utf8Encoder encoder(FromType::CP437);
    const std::string_view result = encoder.getUtf8(input);
    EXPECT_EQ(result, "a");
}
TEST(Utf8EncoderTest, getUtf8ShouldLookUpUntilEndOfInputForAscii)
{
    // Only the first two characters are handed over; the third one sits right
    // behind the view and must not be read.
    const std::string source = "abc";
    const std::string_view prefix = std::string_view(source).substr(0, 2);
    Utf8Encoder encoder(FromType::CP437);
    EXPECT_EQ(encoder.getUtf8(prefix), "ab");
}
TEST(Utf8EncoderTest, getUtf8ShouldLookUpUntilEndOfInputForNonAscii)
{
    // The literal is split so that 'b' is not parsed as part of the hex escape.
    const std::string source("a\x92" "b");
    // Pass only "a\x92"; the 'b' behind the view must not be converted.
    const std::string_view prefix = std::string_view(source).substr(0, 2);
    Utf8Encoder encoder(FromType::WINDOWS_1252);
    EXPECT_EQ(encoder.getUtf8(prefix), "a\xE2\x80\x99");
}
TEST_P(Utf8EncoderTest, getUtf8ShouldConvertFromLegacyEncodingToUtf8)
{
    // Convert the legacy-encoded fixture and compare with the UTF-8 fixture.
    const Params& params = GetParam();
    const std::string legacyText = readContent(params.mLegacyEncodingFileName);
    const std::string utf8Text = readContent(params.mUtf8FileName);
    Utf8Encoder encoder(params.mLegacyEncoding);
    EXPECT_EQ(encoder.getUtf8(legacyText), utf8Text);
}
TEST(Utf8EncoderTest, getLegacyEncShouldReturnEmptyAsIs)
{
    // An empty view must come back as an empty view.
    const std::string_view empty;
    Utf8Encoder encoder(FromType::CP437);
    const std::string_view result = encoder.getLegacyEnc(empty);
    EXPECT_EQ(result, empty);
}
TEST(Utf8EncoderTest, getLegacyEncShouldReturnAsciiOnlyAsIs)
{
    // Every non-zero 7-bit ASCII code. The bound is spelled out instead of
    // using std::numeric_limits<char>::max(): that is 255 on platforms where
    // plain char is unsigned, which would put non-ASCII bytes into the input.
    constexpr int lastAscii = 0x7F;
    std::string input;
    for (int c = 1; c <= lastAscii; ++c)
        input.push_back(static_cast<char>(c));
    Utf8Encoder encoder(FromType::CP437);
    const std::string_view result = encoder.getLegacyEnc(input);
    // Same pointer and size means the result is a view of the input itself,
    // not a converted copy.
    EXPECT_EQ(result.data(), input.data());
    EXPECT_EQ(result.size(), input.size());
}
TEST(Utf8EncoderTest, getLegacyEncShouldLookUpUntilZero)
{
    // The explicit length keeps the embedded '\0' and the trailing 'b' in the
    // string. std::string("a\0b") would stop at the terminator and hold only
    // "a", so the zero-stop behaviour would never be exercised.
    const std::string input("a\0b", 3);
    Utf8Encoder encoder(FromType::CP437);
    const std::string_view result = encoder.getLegacyEnc(input);
    EXPECT_EQ(result, "a");
}
TEST(Utf8EncoderTest, getLegacyEncShouldLookUpUntilEndOfInputForAscii)
{
    // Only the first two characters are handed over; the third one sits right
    // behind the view and must not be read.
    const std::string source = "abc";
    const std::string_view prefix = std::string_view(source).substr(0, 2);
    Utf8Encoder encoder(FromType::CP437);
    EXPECT_EQ(encoder.getLegacyEnc(prefix), "ab");
}
TEST(Utf8EncoderTest, getLegacyEncShouldStripIncompleteCharacters)
{
    // 'a', a two-byte sequence and a three-byte sequence. Passing only the
    // first five bytes cuts the last sequence one byte short; the expected
    // output contains nothing for it.
    const std::string utf8("a\xc3\xa2\xe2\x80\x99");
    const std::string_view truncated = std::string_view(utf8).substr(0, 5);
    Utf8Encoder encoder(FromType::WINDOWS_1252);
    EXPECT_EQ(encoder.getLegacyEnc(truncated), "a\xe2");
}
TEST_P(Utf8EncoderTest, getLegacyEncShouldConvertFromUtf8ToLegacyEncoding)
{
    // Convert the UTF-8 fixture and compare with the legacy-encoded fixture.
    const Params& params = GetParam();
    const std::string utf8Text = readContent(params.mUtf8FileName);
    const std::string legacyText = readContent(params.mLegacyEncodingFileName);
    Utf8Encoder encoder(params.mLegacyEncoding);
    EXPECT_EQ(encoder.getLegacyEnc(utf8Text), legacyText);
}
// Runs every TEST_P case once per fixture pair (legacy-encoded file, UTF-8 file).
INSTANTIATE_TEST_SUITE_P(Files, Utf8EncoderTest, Values(
    Params {FromType::WINDOWS_1251, "russian-win1251.txt", "russian-utf8.txt"},
    Params {FromType::WINDOWS_1252, "french-win1252.txt", "french-utf8.txt"}
));
}

@ -1,4 +0,0 @@
original: Без вопросов отдаете ему рулет, зная, что позже вы сможете привести с собой своих друзей и тогда он получит по заслугам?
converted: Без вопросов отдаете ему рулет, зная, что позже вы сможете привести с собой своих друзей и тогда он получит по заслугам?
original: Vous lui donnez le gâteau sans protester avant daller chercher tous vos amis et de revenir vous venger.
converted: Vous lui donnez le gâteau sans protester avant daller chercher tous vos amis et de revenir vous venger.

@ -1,18 +0,0 @@
#!/bin/bash
# Regression-test driver. Builds the test programs, then for every *_test
# binary in the current directory either compares its output with the
# recorded output/<name>.out, or records that file (and stages it in git)
# when no recording exists yet.

# Build first; stop with make's exit status if the build fails.
make || exit

mkdir -p output

for a in *_test; do
    # When nothing matches, the glob stays literal. Skip it instead of trying
    # to run, record and "git add" a program called "*_test".
    [ -e "$a" ] || continue

    if [ -f "output/$a.out" ]; then
        echo "Running $a:"
        # Quoted so that names containing spaces or glob characters survive.
        "./$a" | diff "output/$a.out" -
    else
        echo "Creating $a.out"
        "./$a" > "output/$a.out"
        git add "output/$a.out"
    fi
done

@ -1,59 +0,0 @@
#include <iostream>
#include <fstream>
#include <cassert>
#include <stdexcept>
#include "../to_utf8.hpp"
// Forward declarations of the helpers defined further down in this file.
std::string getFirstLine(const std::string &filename);
void testEncoder(ToUTF8::FromType encoding, const std::string &legacyEncFile,
const std::string &utf8File);
/// Round-trip check for one legacy encoding: the first line of legacyEncFile
/// must convert to the first line of utf8File, and back again.
void testEncoder(ToUTF8::FromType encoding, const std::string &legacyEncFile,
    const std::string &utf8File)
{
    // Reference text, taken from the first line of each file.
    const std::string legacyText = getFirstLine(legacyEncFile);
    const std::string utf8Text = getFirstLine(utf8File);

    ToUTF8::Utf8Encoder encoder(encoding);

    // Legacy -> UTF-8. Both strings are printed before the check so a
    // mismatch can be inspected in the program output.
    const std::string asUtf8 = encoder.getUtf8(legacyText);
    std::cout << "original: " << utf8Text << std::endl;
    std::cout << "converted: " << asUtf8 << std::endl;
    assert(asUtf8 == utf8Text);

    // UTF-8 -> legacy.
    const std::string asLegacy = encoder.getLegacyEnc(utf8Text);
    assert(asLegacy == legacyText);
}
/// Returns the first line of the file `filename`, without the line terminator.
/// Throws std::runtime_error when the file cannot be opened.
std::string getFirstLine(const std::string &filename)
{
    std::ifstream stream(filename);
    if (!stream.is_open())
        throw std::runtime_error("Unable to open file " + filename);

    std::string firstLine;
    std::getline(stream, firstLine);
    // The stream closes itself when it goes out of scope.
    return firstLine;
}
int main()
{
testEncoder(ToUTF8::WINDOWS_1251, "test_data/russian-win1251.txt", "test_data/russian-utf8.txt");
testEncoder(ToUTF8::WINDOWS_1252, "test_data/french-win1252.txt", "test_data/french-utf8.txt");
return 0;
}

@ -3,6 +3,7 @@
#include <vector> #include <vector>
#include <cassert> #include <cassert>
#include <stdexcept> #include <stdexcept>
#include <algorithm>
#include <components/debug/debuglog.hpp> #include <components/debug/debuglog.hpp>
@ -44,6 +45,14 @@
using namespace ToUTF8; using namespace ToUTF8;
namespace
{
std::string_view::iterator skipAscii(std::string_view input)
{
return std::find_if(input.begin(), input.end(), [] (unsigned char v) { return v == 0 || v >= 128; });
}
}
Utf8Encoder::Utf8Encoder(const FromType sourceEncoding): Utf8Encoder::Utf8Encoder(const FromType sourceEncoding):
mOutput(50*1024) mOutput(50*1024)
{ {
@ -82,11 +91,6 @@ std::string_view Utf8Encoder::getUtf8(std::string_view input)
if (input.empty()) if (input.empty())
return input; return input;
// Double check that the input string stops at some point (it might
// contain zero terminators before this, inside its own data, which
// is also ok.)
assert(input[input.size()] == 0);
// Note: The rest of this function is designed for single-character // Note: The rest of this function is designed for single-character
// input encodings only. It also assumes that the input encoding // input encodings only. It also assumes that the input encoding
// shares its first 128 values (0-127) with ASCII. There are no plans // shares its first 128 values (0-127) with ASCII. There are no plans
@ -95,8 +99,7 @@ std::string_view Utf8Encoder::getUtf8(std::string_view input)
// Compute output length, and check for pure ascii input at the same // Compute output length, and check for pure ascii input at the same
// time. // time.
bool ascii; const auto [outlen, ascii] = getLength(input);
size_t outlen = getLength(input.data(), ascii);
// If we're pure ascii, then don't bother converting anything. // If we're pure ascii, then don't bother converting anything.
if(ascii) if(ascii)
@ -107,8 +110,8 @@ std::string_view Utf8Encoder::getUtf8(std::string_view input)
char *out = &mOutput[0]; char *out = &mOutput[0];
// Translate // Translate
for (const char* ptr = input.data(); *ptr;) for (auto it = input.begin(); it != input.end() && *it != 0; ++it)
copyFromArray(*(ptr++), out); copyFromArray(*it, out);
// Make sure that we wrote the correct number of bytes // Make sure that we wrote the correct number of bytes
assert((out-&mOutput[0]) == (int)outlen); assert((out-&mOutput[0]) == (int)outlen);
@ -125,11 +128,6 @@ std::string_view Utf8Encoder::getLegacyEnc(std::string_view input)
if (input.empty()) if (input.empty())
return input; return input;
// Double check that the input string stops at some point (it might
// contain zero terminators before this, inside its own data, which
// is also ok.)
assert(input[input.size()] == 0);
// TODO: The rest of this function is designed for single-character // TODO: The rest of this function is designed for single-character
// input encodings only. It also assumes that the input // input encodings only. It also assumes that the input
// encoding shares its first 128 values (0-127) with ASCII. These // encoding shares its first 128 values (0-127) with ASCII. These
@ -138,8 +136,7 @@ std::string_view Utf8Encoder::getLegacyEnc(std::string_view input)
// Compute output length, and check for pure ascii input at the same // Compute output length, and check for pure ascii input at the same
// time. // time.
bool ascii; const auto [outlen, ascii] = getLengthLegacyEnc(input);
size_t outlen = getLength2(input.data(), ascii);
// If we're pure ascii, then don't bother converting anything. // If we're pure ascii, then don't bother converting anything.
if(ascii) if(ascii)
@ -150,8 +147,8 @@ std::string_view Utf8Encoder::getLegacyEnc(std::string_view input)
char *out = &mOutput[0]; char *out = &mOutput[0];
// Translate // Translate
for (const char* ptr = input.data(); *ptr;) for (auto it = input.begin(); it != input.end() && *it != 0;)
copyFromArray2(ptr, out); copyFromArrayLegacyEnc(it, input.end(), out);
// Make sure that we wrote the correct number of bytes // Make sure that we wrote the correct number of bytes
assert((out-&mOutput[0]) == (int)outlen); assert((out-&mOutput[0]) == (int)outlen);
@ -186,34 +183,30 @@ void Utf8Encoder::resize(size_t size)
is the case, then the ascii parameter is set to true, and the is the case, then the ascii parameter is set to true, and the
caller can optimize for this case. caller can optimize for this case.
*/ */
size_t Utf8Encoder::getLength(const char* input, bool &ascii) const std::pair<std::size_t, bool> Utf8Encoder::getLength(std::string_view input) const
{ {
ascii = true;
size_t len = 0;
const char* ptr = input;
unsigned char inp = *ptr;
// Do away with the ascii part of the string first (this is almost // Do away with the ascii part of the string first (this is almost
// always the entire string.) // always the entire string.)
while (inp && inp < 128) auto it = skipAscii(input);
inp = *(++ptr);
len += (ptr-input);
// If we're not at the null terminator at this point, then there // If we're not at the null terminator at this point, then there
// were some non-ascii characters to deal with. Go to slow-mode for // were some non-ascii characters to deal with. Go to slow-mode for
// the rest of the string. // the rest of the string.
if (inp) if (it == input.end() || *it == 0)
return {it - input.begin(), true};
std::size_t len = it - input.begin();
do
{ {
ascii = false; // Find the translated length of this character in the
while (inp) // lookup table.
{ len += translationArray[static_cast<unsigned char>(*it) * 6];
// Find the translated length of this character in the ++it;
// lookup table.
len += translationArray[inp*6];
inp = *(++ptr);
}
} }
return len; while (it != input.end() && *it != 0);
return {len, false};
} }
// Translate one character 'ch' using the translation array 'arr', and // Translate one character 'ch' using the translation array 'arr', and
@ -233,51 +226,52 @@ void Utf8Encoder::copyFromArray(unsigned char ch, char* &out) const
out += len; out += len;
} }
size_t Utf8Encoder::getLength2(const char* input, bool &ascii) const std::pair<std::size_t, bool> Utf8Encoder::getLengthLegacyEnc(std::string_view input) const
{ {
ascii = true;
size_t len = 0;
const char* ptr = input;
unsigned char inp = *ptr;
// Do away with the ascii part of the string first (this is almost // Do away with the ascii part of the string first (this is almost
// always the entire string.) // always the entire string.)
while (inp && inp < 128) auto it = skipAscii(input);
inp = *(++ptr);
len += (ptr-input);
// If we're not at the null terminator at this point, then there // If we're not at the null terminator at this point, then there
// were some non-ascii characters to deal with. Go to slow-mode for // were some non-ascii characters to deal with. Go to slow-mode for
// the rest of the string. // the rest of the string.
if (inp) if (it == input.end() || *it == 0)
return {it - input.begin(), true};
std::size_t len = it - input.begin();
std::size_t symbolLen = 0;
do
{ {
ascii = false; symbolLen += 1;
while(inp) // Find the translated length of this character in the
// lookup table.
switch (static_cast<unsigned char>(*it))
{ {
len += 1; case 0xe2: symbolLen -= 2; break;
// Find the translated length of this character in the case 0xc2:
// lookup table. case 0xcb:
switch(inp) case 0xc4:
{ case 0xc6:
case 0xe2: len -= 2; break; case 0xc3:
case 0xc2: case 0xd0:
case 0xcb: case 0xd1:
case 0xc4: case 0xd2:
case 0xc6: case 0xc5: symbolLen -= 1; break;
case 0xc3: default:
case 0xd0: len += symbolLen;
case 0xd1: symbolLen = 0;
case 0xd2: break;
case 0xc5: len -= 1; break;
}
inp = *(++ptr);
} }
++it;
} }
return len; while (it != input.end() && *it != 0);
return {len, false};
} }
void Utf8Encoder::copyFromArray2(const char*& chp, char* &out) const void Utf8Encoder::copyFromArrayLegacyEnc(std::string_view::iterator& chp, std::string_view::iterator end, char* &out) const
{ {
unsigned char ch = *(chp++); unsigned char ch = *(chp++);
// Optimize for ASCII values // Optimize for ASCII values
@ -308,10 +302,17 @@ void Utf8Encoder::copyFromArray2(const char*& chp, char* &out) const
return; return;
} }
if (chp == end)
return;
unsigned char ch2 = *(chp++); unsigned char ch2 = *(chp++);
unsigned char ch3 = '\0'; unsigned char ch3 = '\0';
if (len == 3) if (len == 3)
{
if (chp == end)
return;
ch3 = *(chp++); ch3 = *(chp++);
}
for (int i = 128; i < 256; i++) for (int i = 128; i < 256; i++)
{ {

@ -38,11 +38,11 @@ namespace ToUTF8
std::string_view getLegacyEnc(std::string_view input); std::string_view getLegacyEnc(std::string_view input);
private: private:
void resize(size_t size); inline void resize(std::size_t size);
size_t getLength(const char* input, bool &ascii) const; inline std::pair<std::size_t, bool> getLength(std::string_view input) const;
void copyFromArray(unsigned char chp, char* &out) const; inline void copyFromArray(unsigned char chp, char* &out) const;
size_t getLength2(const char* input, bool &ascii) const; inline std::pair<std::size_t, bool> getLengthLegacyEnc(std::string_view input) const;
void copyFromArray2(const char*& chp, char* &out) const; inline void copyFromArrayLegacyEnc(std::string_view::iterator& chp, std::string_view::iterator end, char* &out) const;
std::vector<char> mOutput; std::vector<char> mOutput;
const signed char* translationArray; const signed char* translationArray;

Loading…
Cancel
Save