From 740e2b5769d85bec22b8478846bd3800d2552049 Mon Sep 17 00:00:00 2001 From: Emanuel Guevel Date: Wed, 2 Jan 2013 23:02:13 +0100 Subject: [PATCH] components/to_utf8: add class Utf8Encoder --- components/to_utf8/to_utf8.cpp | 795 ++++++++++++++++++++++----------- components/to_utf8/to_utf8.hpp | 67 ++- 2 files changed, 589 insertions(+), 273 deletions(-) diff --git a/components/to_utf8/to_utf8.cpp b/components/to_utf8/to_utf8.cpp index 7db6112475..8ac582b81d 100644 --- a/components/to_utf8/to_utf8.cpp +++ b/components/to_utf8/to_utf8.cpp @@ -2,6 +2,8 @@ #include #include +#include +#include /* This file contains the code to translate from WINDOWS-1252 (native charset used in English version of Morrowind) to UTF-8. The library @@ -46,334 +48,611 @@ static std::vector buf (50*1024); static std::vector output (50*1024); static int size; -// Make sure the given vector is large enough for 'size' bytes, +using namespace ToUTF8; + +Utf8Encoder::Utf8Encoder(void): + mOutput(50*1024) +{ +} + +void Utf8Encoder::setEncoding(const FromType sourceEncoding) +{ + mEncoding = sourceEncoding; + + switch (mEncoding) + { + case ToUTF8::WINDOWS_1252: + { + translationArray = ToUTF8::windows_1252; + break; + } + case ToUTF8::WINDOWS_1250: + { + translationArray = ToUTF8::windows_1250; + break; + } + case ToUTF8::WINDOWS_1251: + { + translationArray = ToUTF8::windows_1251; + break; + } + default: + { + assert(0); + } + } +} + +std::string Utf8Encoder::getUtf8(const char* input, int size) +{ + // Double check that the input string stops at some point (it might + // contain zero terminators before this, inside its own data, which + // is also ok.) + assert(input[size] == 0); + + // TODO: The rest of this function is designed for single-character + // input encodings only. It also assumes that the input + // encoding shares its first 128 values (0-127) with ASCII. These + // conditions must be checked again if you add more input encodings + // later.
+ + // Compute output length, and check for pure ascii input at the same + // time. + bool ascii; + size_t outlen = getLength(input, ascii); + + // If we're pure ascii, then don't bother converting anything. + if(ascii) + return std::string(input, outlen); + + // Make sure the output is large enough + resize(outlen); + char *out = &mOutput[0]; + + // Translate + while (*input) + copyFromArray(*(input++), out); + + // Make sure that we wrote the correct number of bytes + assert((out-&mOutput[0]) == (int)outlen); + + // And make extra sure the output is null terminated + assert(mOutput.size() > outlen); + assert(mOutput[outlen] == 0); + + // Return a string + return std::string(&mOutput[0], outlen); +} + +std::string Utf8Encoder::getLegacyEnc(const char *input, int size) +{ + // Double check that the input string stops at some point (it might + // contain zero terminators before this, inside its own data, which + // is also ok.) + assert(input[size] == 0); + + // TODO: The rest of this function is designed for single-character + // input encodings only. It also assumes that the input + // encoding shares its first 128 values (0-127) with ASCII. These + // conditions must be checked again if you add more input encodings + // later. + + // Compute output length, and check for pure ascii input at the same + // time. + bool ascii; + size_t outlen = getLength2(input, ascii); + + // If we're pure ascii, then don't bother converting anything.
+ if(ascii) + return std::string(input, outlen); + + // Make sure the output is large enough + resize(outlen); + char *out = &mOutput[0]; + + // Translate + while(*input) + copyFromArray2(input, out); + + // Make sure that we wrote the correct number of bytes + assert((out-&mOutput[0]) == (int)outlen); + + // And make extra sure the output is null terminated + assert(mOutput.size() > outlen); + assert(mOutput[outlen] == 0); + + // Return a string + return std::string(&mOutput[0], outlen); +} + +// Make sure the output vector is large enough for 'size' bytes, // including a terminating zero after it. +void Utf8Encoder::resize(size_t size) +{ + if (mOutput.size() <= size) + // Add some extra padding to reduce the chance of having to resize + // again later. + mOutput.resize(3*size); + + // And make sure the string is zero terminated + mOutput[size] = 0; +} + +/** Get the total length needed to decode the given string with + the given translation array. The arrays are encoded with 6 bytes + per character, with the first giving the length and the next 5 the + actual data. + + The function serves a dual purpose for optimization reasons: it + checks if the input is pure ascii (all values are <= 127). If this + is the case, then the ascii parameter is set to true, and the + caller can optimize for this case. + */ +size_t Utf8Encoder::getLength(const char* input, bool &ascii) +{ + ascii = true; + size_t len = 0; + const char* ptr = input; + unsigned char inp = *ptr; + + // Do away with the ascii part of the string first (this is almost + // always the entire string.) + while (inp && inp < 128) + inp = *(++ptr); + len += (ptr-input); + + // If we're not at the null terminator at this point, then there + // were some non-ascii characters to deal with. Go to slow-mode for + // the rest of the string. + if (inp) + { + ascii = false; + while (inp) + { + // Find the translated length of this character in the + // lookup table.
+ len += translationArray[inp*6]; + inp = *(++ptr); + } + } + return len; +} + +// Translate one character 'ch' using the translation array 'arr', and +// advance the output pointer accordingly. +void Utf8Encoder::copyFromArray(unsigned char ch, char* &out) +{ + // Optimize for ASCII values + if (ch < 128) + { + *(out++) = ch; + return; + } + + const char *in = translationArray + ch*6; + int len = *(in++); + for (int i=0; i &buf, size_t size) { - if(buf.size() <= size) - // Add some extra padding to reduce the chance of having to resize - // again later. - buf.resize(3*size); + if(buf.size() <= size) + // Add some extra padding to reduce the chance of having to resize + // again later. + buf.resize(3*size); - // And make sure the string is zero terminated - buf[size] = 0; + // And make sure the string is zero terminated + buf[size] = 0; } // This is just used to spew out a reusable input buffer for the // conversion process. char *ToUTF8::getBuffer(int s) { - // Remember the requested size - size = s; - resize(buf, size); - return &buf[0]; + // Remember the requested size + size = s; + resize(buf, size); + return &buf[0]; } /** Get the total length length needed to decode the given string with - the given translation array. The arrays are encoded with 6 bytes - per character, with the first giving the length and the next 5 the - actual data. - - The function serves a dual purpose for optimization reasons: it - checks if the input is pure ascii (all values are <= 127). If this - is the case, then the ascii parameter is set to true, and the - caller can optimize for this case. + the given translation array. The arrays are encoded with 6 bytes + per character, with the first giving the length and the next 5 the + actual data. + + The function serves a dual purpose for optimization reasons: it + checks if the input is pure ascii (all values are <= 127). If this + is the case, then the ascii parameter is set to true, and the + caller can optimize for this case. 
*/ static size_t getLength(const char *arr, const char* input, bool &ascii) { - ascii = true; - size_t len = 0; - const char* ptr = input; - unsigned char inp = *ptr; - - // Do away with the ascii part of the string first (this is almost - // always the entire string.) - while(inp && inp < 128) - inp = *(++ptr); - len += (ptr-input); - - // If we're not at the null terminator at this point, then there - // were some non-ascii characters to deal with. Go to slow-mode for - // the rest of the string. - if(inp) + ascii = true; + size_t len = 0; + const char* ptr = input; + unsigned char inp = *ptr; + + // Do away with the ascii part of the string first (this is almost + // always the entire string.) + while(inp && inp < 128) + inp = *(++ptr); + len += (ptr-input); + + // If we're not at the null terminator at this point, then there + // were some non-ascii characters to deal with. Go to slow-mode for + // the rest of the string. + if(inp) { - ascii = false; - while(inp) + ascii = false; + while(inp) { - // Find the translated length of this character in the - // lookup table. - len += arr[inp*6]; - inp = *(++ptr); + // Find the translated length of this character in the + // lookup table. + len += arr[inp*6]; + inp = *(++ptr); } } - return len; + return len; } // Translate one character 'ch' using the translation array 'arr', and // advance the output pointer accordingly. static void copyFromArray(const char *arr, unsigned char ch, char* &out) { - // Optimize for ASCII values - if(ch < 128) + // Optimize for ASCII values + if(ch < 128) { - *(out++) = ch; - return; + *(out++) = ch; + return; } - const char *in = arr + ch*6; - int len = *(in++); - for(int i=0; i outlen); - assert(output[outlen] == 0); - - // Return a string - return std::string(&output[0], outlen); + + // Double check that the input string stops at some point (it might + // contain zero terminators before this, inside its own data, which + // is also ok.) 
+ const char* input = &buf[0]; + assert(input[size] == 0); + + // TODO: The rest of this function is designed for single-character + // input encodings only. It also assumes that the input the input + // encoding shares its first 128 values (0-127) with ASCII. These + // conditions must be checked again if you add more input encodings + // later. + + // Compute output length, and check for pure ascii input at the same + // time. + bool ascii; + size_t outlen = getLength(arr, input, ascii); + + // If we're pure ascii, then don't bother converting anything. + if(ascii) + return std::string(input, outlen); + + // Make sure the output is large enough + resize(output, outlen); + char *out = &output[0]; + + // Translate + while(*input) + copyFromArray(arr, *(input++), out); + + // Make sure that we wrote the correct number of bytes + assert((out-&output[0]) == (int)outlen); + + // And make extra sure the output is null terminated + assert(output.size() > outlen); + assert(output[outlen] == 0); + + // Return a string + return std::string(&output[0], outlen); } static size_t getLength2(const char *arr, const char* input, bool &ascii) { - ascii = true; - size_t len = 0; - const char* ptr = input; - unsigned char inp = *ptr; - - // Do away with the ascii part of the string first (this is almost - // always the entire string.) - while(inp && inp < 128) - inp = *(++ptr); - len += (ptr-input); - - // If we're not at the null terminator at this point, then there - // were some non-ascii characters to deal with. Go to slow-mode for - // the rest of the string. - if(inp) + ascii = true; + size_t len = 0; + const char* ptr = input; + unsigned char inp = *ptr; + + // Do away with the ascii part of the string first (this is almost + // always the entire string.) + while(inp && inp < 128) + inp = *(++ptr); + len += (ptr-input); + + // If we're not at the null terminator at this point, then there + // were some non-ascii characters to deal with. 
Go to slow-mode for + // the rest of the string. + if(inp) { - ascii = false; - while(inp) + ascii = false; + while(inp) { len += 1; - // Find the translated length of this character in the - // lookup table. + // Find the translated length of this character in the + // lookup table. switch(inp) { - case 0xe2: len -= 2; break; - case 0xc2: - case 0xcb: - case 0xc4: - case 0xc6: - case 0xc3: - case 0xd0: - case 0xd1: - case 0xd2: - case 0xc5: len -= 1; break; + case 0xe2: len -= 2; break; + case 0xc2: + case 0xcb: + case 0xc4: + case 0xc6: + case 0xc3: + case 0xd0: + case 0xd1: + case 0xd2: + case 0xc5: len -= 1; break; } - inp = *(++ptr); + inp = *(++ptr); } } - return len; + return len; } -#include -#include - static void copyFromArray2(const char *arr, char*& chp, char* &out) { unsigned char ch = *(chp++); - // Optimize for ASCII values - if(ch < 128) + // Optimize for ASCII values + if(ch < 128) { - *(out++) = ch; - return; + *(out++) = ch; + return; } - int len = 1; - switch (ch) - { - case 0xe2: len = 3; break; - case 0xc2: - case 0xcb: - case 0xc4: - case 0xc6: - case 0xc3: - case 0xd0: - case 0xd1: - case 0xd2: - case 0xc5: len = 2; break; - } - - if (len == 1) // There is no 1 length utf-8 glyph that is not 0x20 (empty space) - { - *(out++) = ch; - return; - } - - unsigned char ch2 = *(chp++); - unsigned char ch3 = '\0'; - if (len == 3) - ch3 = *(chp++); - - for (int i = 128; i < 256; i++) - { - unsigned char b1 = arr[i*6 + 1], b2 = arr[i*6 + 2], b3 = arr[i*6 + 3]; - if (b1 == ch && b2 == ch2 && (len != 3 || b3 == ch3)) - { - *(out++) = (char)i; - return; - } - } - - std::cout << "Could not find glyph " << std::hex << (int)ch << " " << (int)ch2 << " " << (int)ch3 << std::endl; - - *(out++) = ch; // Could not find glyph, just put whatever -} - -std::string ToUTF8::getLegacyEnc(ToUTF8::FromType to) -{ - // Pick translation array - const char *arr; - switch (to) - { - case ToUTF8::WINDOWS_1252: + int len = 1; + switch (ch) { - arr = ToUTF8::windows_1252; - 
break; + case 0xe2: len = 3; break; + case 0xc2: + case 0xcb: + case 0xc4: + case 0xc6: + case 0xc3: + case 0xd0: + case 0xd1: + case 0xd2: + case 0xc5: len = 2; break; } - case ToUTF8::WINDOWS_1250: + + if (len == 1) // There is no 1 length utf-8 glyph that is not 0x20 (empty space) { - arr = ToUTF8::windows_1250; - break; + *(out++) = ch; + return; } - case ToUTF8::WINDOWS_1251: + + unsigned char ch2 = *(chp++); + unsigned char ch3 = '\0'; + if (len == 3) + ch3 = *(chp++); + + for (int i = 128; i < 256; i++) { - arr = ToUTF8::windows_1251; - break; + unsigned char b1 = arr[i*6 + 1], b2 = arr[i*6 + 2], b3 = arr[i*6 + 3]; + if (b1 == ch && b2 == ch2 && (len != 3 || b3 == ch3)) + { + *(out++) = (char)i; + return; + } } - default: + + std::cout << "Could not find glyph " << std::hex << (int)ch << " " << (int)ch2 << " " << (int)ch3 << std::endl; + + *(out++) = ch; // Could not find glyph, just put whatever +} + +std::string ToUTF8::getLegacyEnc(ToUTF8::FromType to) +{ + // Pick translation array + const char *arr; + switch (to) { - assert(0); + case ToUTF8::WINDOWS_1252: + { + arr = ToUTF8::windows_1252; + break; + } + case ToUTF8::WINDOWS_1250: + { + arr = ToUTF8::windows_1250; + break; + } + case ToUTF8::WINDOWS_1251: + { + arr = ToUTF8::windows_1251; + break; + } + default: + { + assert(0); + } } - } - - // Double check that the input string stops at some point (it might - // contain zero terminators before this, inside its own data, which - // is also ok.) - char* input = &buf[0]; - assert(input[size] == 0); - - // TODO: The rest of this function is designed for single-character - // input encodings only. It also assumes that the input the input - // encoding shares its first 128 values (0-127) with ASCII. These - // conditions must be checked again if you add more input encodings - // later. - - // Compute output length, and check for pure ascii input at the same - // time. 
- bool ascii; - size_t outlen = getLength2(arr, input, ascii); - - // If we're pure ascii, then don't bother converting anything. - if(ascii) - return std::string(input, outlen); - - // Make sure the output is large enough - resize(output, outlen); - char *out = &output[0]; - - // Translate - while(*input) - copyFromArray2(arr, input, out); - - // Make sure that we wrote the correct number of bytes - assert((out-&output[0]) == (int)outlen); - - // And make extra sure the output is null terminated - assert(output.size() > outlen); - assert(output[outlen] == 0); - - // Return a string - return std::string(&output[0], outlen); + + // Double check that the input string stops at some point (it might + // contain zero terminators before this, inside its own data, which + // is also ok.) + char* input = &buf[0]; + assert(input[size] == 0); + + // TODO: The rest of this function is designed for single-character + // input encodings only. It also assumes that the input the input + // encoding shares its first 128 values (0-127) with ASCII. These + // conditions must be checked again if you add more input encodings + // later. + + // Compute output length, and check for pure ascii input at the same + // time. + bool ascii; + size_t outlen = getLength2(arr, input, ascii); + + // If we're pure ascii, then don't bother converting anything. 
+ if(ascii) + return std::string(input, outlen); + + // Make sure the output is large enough + resize(output, outlen); + char *out = &output[0]; + + // Translate + while(*input) + copyFromArray2(arr, input, out); + + // Make sure that we wrote the correct number of bytes + assert((out-&output[0]) == (int)outlen); + + // And make extra sure the output is null terminated + assert(output.size() > outlen); + assert(output[outlen] == 0); + + // Return a string + return std::string(&output[0], outlen); } ToUTF8::FromType ToUTF8::calculateEncoding(const std::string& encodingName) { - if (encodingName == "win1250") - return ToUTF8::WINDOWS_1250; - else if (encodingName == "win1251") - return ToUTF8::WINDOWS_1251; - else - return ToUTF8::WINDOWS_1252; + if (encodingName == "win1250") + return ToUTF8::WINDOWS_1250; + else if (encodingName == "win1251") + return ToUTF8::WINDOWS_1251; + else + return ToUTF8::WINDOWS_1252; } std::string ToUTF8::encodingUsingMessage(const std::string& encodingName) { - if (encodingName == "win1250") - return "Using Central and Eastern European font encoding."; - else if (encodingName == "win1251") - return "Using Cyrillic font encoding."; - else - return "Using default (English) font encoding."; + if (encodingName == "win1250") + return "Using Central and Eastern European font encoding."; + else if (encodingName == "win1251") + return "Using Cyrillic font encoding."; + else + return "Using default (English) font encoding."; } diff --git a/components/to_utf8/to_utf8.hpp b/components/to_utf8/to_utf8.hpp index f52ae73bd8..6877e2dc17 100644 --- a/components/to_utf8/to_utf8.hpp +++ b/components/to_utf8/to_utf8.hpp @@ -2,29 +2,66 @@ #define COMPONENTS_TOUTF8_H #include +#include +#include namespace ToUTF8 { - // These are all the currently supported code pages - enum FromType + // These are all the currently supported code pages + enum FromType { - WINDOWS_1250, // Central ane Eastern European languages - WINDOWS_1251, // Cyrillic languages - 
WINDOWS_1252 // Used by English version of Morrowind (and - // probably others) + WINDOWS_1250, // Central and Eastern European languages + WINDOWS_1251, // Cyrillic languages + WINDOWS_1252 // Used by English version of Morrowind (and + // probably others) }; - // Return a writable buffer of at least 'size' bytes. The buffer - // does not have to be freed. - char* getBuffer(int size); + // Return a writable buffer of at least 'size' bytes. The buffer + // does not have to be freed. + char* getBuffer(int size); - // Convert the previously written buffer to UTF8 from the given code - // page. - std::string getUtf8(FromType from); - std::string getLegacyEnc(FromType to); + // Convert the previously written buffer to UTF8 from the given code + // page. + std::string getUtf8(FromType from); + std::string getLegacyEnc(FromType to); - FromType calculateEncoding(const std::string& encodingName); - std::string encodingUsingMessage(const std::string& encodingName); + FromType calculateEncoding(const std::string& encodingName); + std::string encodingUsingMessage(const std::string& encodingName); + + // class + + class Utf8Encoder + { + public: + Utf8Encoder(void); + + void setEncoding(const FromType sourceEncoding); + + // Convert to UTF8 from the previously given code page. + std::string getUtf8(const char *input, int size); + inline std::string getUtf8(const std::string &str) + { + return getUtf8(str.c_str(), str.size()); + } + + std::string getLegacyEnc(const char *input, int size); + inline std::string getLegacyEnc(const std::string &str) + { + return getLegacyEnc(str.c_str(), str.size()); + } + + private: + void resize(size_t size); + size_t getLength(const char* input, bool &ascii); + void copyFromArray(unsigned char chp, char* &out); + size_t getLength2(const char* input, bool &ascii); + void copyFromArray2(const char*& chp, char* &out); + + FromType mEncoding; + std::vector mOutput; + int mSize; + char* translationArray; + }; } #endif