From 0bdf52a0719a756f3be97a988ff33784d02f7fcc Mon Sep 17 00:00:00 2001 From: Emanuel Guevel Date: Thu, 3 Jan 2013 23:21:14 +0100 Subject: [PATCH] components/to_utf8: keep only Utf8Encoder --- components/to_utf8/to_utf8.cpp | 314 --------------------------------- components/to_utf8/to_utf8.hpp | 9 - 2 files changed, 323 deletions(-) diff --git a/components/to_utf8/to_utf8.cpp b/components/to_utf8/to_utf8.cpp index 8ac582b81..5efec36a4 100644 --- a/components/to_utf8/to_utf8.cpp +++ b/components/to_utf8/to_utf8.cpp @@ -41,13 +41,6 @@ // Generated tables #include "tables_gen.hpp" -// Shared global buffers, we love you. These initial sizes are large -// enough to hold the largest books in Morrowind.esm, but we will -// resize automaticall if necessary. -static std::vector buf (50*1024); -static std::vector output (50*1024); -static int size; - using namespace ToUTF8; Utf8Encoder::Utf8Encoder(void): @@ -330,313 +323,6 @@ void Utf8Encoder::copyFromArray2(const char*& chp, char* &out) *(out++) = ch; // Could not find glyph, just put whatever } -static void resize(std::vector &buf, size_t size) -{ - if(buf.size() <= size) - // Add some extra padding to reduce the chance of having to resize - // again later. - buf.resize(3*size); - - // And make sure the string is zero terminated - buf[size] = 0; -} - -// This is just used to spew out a reusable input buffer for the -// conversion process. -char *ToUTF8::getBuffer(int s) -{ - // Remember the requested size - size = s; - resize(buf, size); - return &buf[0]; -} - -/** Get the total length length needed to decode the given string with - the given translation array. The arrays are encoded with 6 bytes - per character, with the first giving the length and the next 5 the - actual data. - - The function serves a dual purpose for optimization reasons: it - checks if the input is pure ascii (all values are <= 127). If this - is the case, then the ascii parameter is set to true, and the - caller can optimize for this case. - */ -static size_t getLength(const char *arr, const char* input, bool &ascii) -{ - ascii = true; - size_t len = 0; - const char* ptr = input; - unsigned char inp = *ptr; - - // Do away with the ascii part of the string first (this is almost - // always the entire string.) - while(inp && inp < 128) - inp = *(++ptr); - len += (ptr-input); - - // If we're not at the null terminator at this point, then there - // were some non-ascii characters to deal with. Go to slow-mode for - // the rest of the string. - if(inp) - { - ascii = false; - while(inp) - { - // Find the translated length of this character in the - // lookup table. - len += arr[inp*6]; - inp = *(++ptr); - } - } - return len; -} - -// Translate one character 'ch' using the translation array 'arr', and -// advance the output pointer accordingly. -static void copyFromArray(const char *arr, unsigned char ch, char* &out) -{ - // Optimize for ASCII values - if(ch < 128) - { - *(out++) = ch; - return; - } - - const char *in = arr + ch*6; - int len = *(in++); - for(int i=0; i outlen); - assert(output[outlen] == 0); - - // Return a string - return std::string(&output[0], outlen); -} - -static size_t getLength2(const char *arr, const char* input, bool &ascii) -{ - ascii = true; - size_t len = 0; - const char* ptr = input; - unsigned char inp = *ptr; - - // Do away with the ascii part of the string first (this is almost - // always the entire string.) - while(inp && inp < 128) - inp = *(++ptr); - len += (ptr-input); - - // If we're not at the null terminator at this point, then there - // were some non-ascii characters to deal with. Go to slow-mode for - // the rest of the string. - if(inp) - { - ascii = false; - while(inp) - { - len += 1; - // Find the translated length of this character in the - // lookup table. - switch(inp) - { - case 0xe2: len -= 2; break; - case 0xc2: - case 0xcb: - case 0xc4: - case 0xc6: - case 0xc3: - case 0xd0: - case 0xd1: - case 0xd2: - case 0xc5: len -= 1; break; - } - - inp = *(++ptr); - } - } - return len; -} - -static void copyFromArray2(const char *arr, char*& chp, char* &out) -{ - unsigned char ch = *(chp++); - // Optimize for ASCII values - if(ch < 128) - { - *(out++) = ch; - return; - } - - int len = 1; - switch (ch) - { - case 0xe2: len = 3; break; - case 0xc2: - case 0xcb: - case 0xc4: - case 0xc6: - case 0xc3: - case 0xd0: - case 0xd1: - case 0xd2: - case 0xc5: len = 2; break; - } - - if (len == 1) // There is no 1 length utf-8 glyph that is not 0x20 (empty space) - { - *(out++) = ch; - return; - } - - unsigned char ch2 = *(chp++); - unsigned char ch3 = '\0'; - if (len == 3) - ch3 = *(chp++); - - for (int i = 128; i < 256; i++) - { - unsigned char b1 = arr[i*6 + 1], b2 = arr[i*6 + 2], b3 = arr[i*6 + 3]; - if (b1 == ch && b2 == ch2 && (len != 3 || b3 == ch3)) - { - *(out++) = (char)i; - return; - } - } - - std::cout << "Could not find glyph " << std::hex << (int)ch << " " << (int)ch2 << " " << (int)ch3 << std::endl; - - *(out++) = ch; // Could not find glyph, just put whatever -} - -std::string ToUTF8::getLegacyEnc(ToUTF8::FromType to) -{ - // Pick translation array - const char *arr; - switch (to) - { - case ToUTF8::WINDOWS_1252: - { - arr = ToUTF8::windows_1252; - break; - } - case ToUTF8::WINDOWS_1250: - { - arr = ToUTF8::windows_1250; - break; - } - case ToUTF8::WINDOWS_1251: - { - arr = ToUTF8::windows_1251; - break; - } - default: - { - assert(0); - } - } - - // Double check that the input string stops at some point (it might - // contain zero terminators before this, inside its own data, which - // is also ok.) - char* input = &buf[0]; - assert(input[size] == 0); - - // TODO: The rest of this function is designed for single-character - // input encodings only. It also assumes that the input the input - // encoding shares its first 128 values (0-127) with ASCII. These - // conditions must be checked again if you add more input encodings - // later. - - // Compute output length, and check for pure ascii input at the same - // time. - bool ascii; - size_t outlen = getLength2(arr, input, ascii); - - // If we're pure ascii, then don't bother converting anything. - if(ascii) - return std::string(input, outlen); - - // Make sure the output is large enough - resize(output, outlen); - char *out = &output[0]; - - // Translate - while(*input) - copyFromArray2(arr, input, out); - - // Make sure that we wrote the correct number of bytes - assert((out-&output[0]) == (int)outlen); - - // And make extra sure the output is null terminated - assert(output.size() > outlen); - assert(output[outlen] == 0); - - // Return a string - return std::string(&output[0], outlen); -} - ToUTF8::FromType ToUTF8::calculateEncoding(const std::string& encodingName) { if (encodingName == "win1250") diff --git a/components/to_utf8/to_utf8.hpp b/components/to_utf8/to_utf8.hpp index 6877e2dc1..bfba8a1ac 100644 --- a/components/to_utf8/to_utf8.hpp +++ b/components/to_utf8/to_utf8.hpp @@ -16,15 +16,6 @@ namespace ToUTF8 // probably others) }; - // Return a writable buffer of at least 'size' bytes. The buffer - // does not have to be freed. - char* getBuffer(int size); - - // Convert the previously written buffer to UTF8 from the given code - // page. - std::string getUtf8(FromType from); - std::string getLegacyEnc(FromType to); - FromType calculateEncoding(const std::string& encodingName); std::string encodingUsingMessage(const std::string& encodingName);