components/to_utf8: keep only Utf8Encoder

actorid
Emanuel Guevel 12 years ago
parent 02bf02f288
commit 0bdf52a071

@ -41,13 +41,6 @@
// Generated tables
#include "tables_gen.hpp"
// Shared global buffers, we love you. These initial sizes are large
// enough to hold the largest books in Morrowind.esm, but we will
// resize automatically if necessary.
// Input buffer: handed out to callers by getBuffer() and read back by
// getUtf8() / getLegacyEnc().
static std::vector<char> buf (50*1024);
// Output buffer: scratch space the converters write the translated
// text into before copying it into the returned std::string.
static std::vector<char> output (50*1024);
// Size passed to the most recent getBuffer() call; the converters
// assert that buf[size] is a zero terminator.
static int size;
using namespace ToUTF8;
Utf8Encoder::Utf8Encoder(void):
@ -330,313 +323,6 @@ void Utf8Encoder::copyFromArray2(const char*& chp, char* &out)
*(out++) = ch; // Could not find glyph, just put whatever
}
// Make sure 'buf' can hold 'size' bytes followed by a zero terminator
// at index 'size', growing the vector if needed, and write that
// terminator. Existing contents below 'size' are left untouched.
static void resize(std::vector<char> &buf, size_t size)
{
    if(buf.size() <= size)
        // Add some extra padding to reduce the chance of having to resize
        // again later. A request for zero bytes still needs one slot for
        // the terminator, which 3*0 would not provide.
        buf.resize(size > 0 ? 3*size : 1);

    // And make sure the string is zero terminated
    buf[size] = 0;
}
// Hands out the shared, reusable input buffer that the conversion
// functions later read from. The requested size is recorded so the
// converters can verify the zero terminator placed at that offset.
char *ToUTF8::getBuffer(int s)
{
    // Record the request before touching the buffer
    size = s;

    // Grow if necessary and drop a terminator at index 's'
    std::vector<char> &shared = buf;
    resize(shared, s);

    return &shared.front();
}
/** Compute how many output bytes are needed to decode 'input' with the
    translation table 'arr'. The table stores 6 bytes per character:
    one length byte followed by up to 5 bytes of data.

    As a side effect, 'ascii' reports whether the whole input is plain
    ASCII (every byte <= 127), so the caller can skip the conversion
    entirely in that case.
*/
static size_t getLength(const char *arr, const char* input, bool &ascii)
{
    const unsigned char *const start = reinterpret_cast<const unsigned char*>(input);
    const unsigned char *cursor = start;

    // Fast path: skip over the leading ASCII run (this is almost
    // always the entire string). Each of these bytes maps to itself.
    while(*cursor != 0 && *cursor < 128)
        ++cursor;

    size_t total = 0;
    total += (cursor - start);

    // Stopping on the terminator means nothing needed translating.
    ascii = (*cursor == 0);

    // Slow path: from the first non-ASCII byte onwards, look every
    // byte's translated length up in the table.
    for(; *cursor != 0; ++cursor)
        total += arr[*cursor * 6];

    return total;
}
// Write the translation of the single input byte 'ch' to 'out', using
// the 6-bytes-per-entry table 'arr', and move 'out' past what was
// written.
static void copyFromArray(const char *arr, unsigned char ch, char* &out)
{
    if(ch >= 128)
    {
        // Table entry layout: [0] = byte count, [1..count] = data.
        const char *entry = arr + ch*6;
        const int count = entry[0];
        for(int k = 1; k <= count; ++k)
            *(out++) = entry[k];
        return;
    }

    // ASCII values translate to themselves
    *(out++) = ch;
}
// Convert the text previously written into the shared input buffer
// (see getBuffer) from the code page 'from' to UTF-8 and return it.
// An unsupported 'from' value asserts in debug builds and yields an
// empty string otherwise.
std::string ToUTF8::getUtf8(ToUTF8::FromType from)
{
    // Pick translation array. Start from a null pointer so that an
    // unexpected enum value can never leave it uninitialised when
    // asserts are compiled out.
    const char *arr = 0;
    switch (from)
    {
        case ToUTF8::WINDOWS_1252:
        {
            arr = ToUTF8::windows_1252;
            break;
        }
        case ToUTF8::WINDOWS_1250:
        {
            arr = ToUTF8::windows_1250;
            break;
        }
        case ToUTF8::WINDOWS_1251:
        {
            arr = ToUTF8::windows_1251;
            break;
        }
        default:
        {
            assert(0);
            // With NDEBUG the assert vanishes; bail out rather than
            // translate through an invalid table pointer.
            return std::string();
        }
    }

    // Double check that the input string stops at some point (it might
    // contain zero terminators before this, inside its own data, which
    // is also ok.)
    const char* input = &buf[0];
    assert(input[size] == 0);

    // TODO: The rest of this function is designed for single-character
    // input encodings only. It also assumes that the input encoding
    // shares its first 128 values (0-127) with ASCII. These conditions
    // must be checked again if you add more input encodings later.

    // Compute output length, and check for pure ascii input at the same
    // time.
    bool ascii;
    size_t outlen = getLength(arr, input, ascii);

    // If we're pure ascii, then don't bother converting anything.
    if(ascii)
        return std::string(input, outlen);

    // Make sure the output is large enough
    resize(output, outlen);
    char *out = &output[0];

    // Translate
    while(*input)
        copyFromArray(arr, *(input++), out);

    // Make sure that we wrote the correct number of bytes
    assert((out-&output[0]) == (int)outlen);

    // And make extra sure the output is null terminated
    assert(output.size() > outlen);
    assert(output[outlen] == 0);

    // Return a string
    return std::string(&output[0], outlen);
}
/** Compute how many legacy (single-byte) characters the UTF-8 string
    'input' will occupy once converted, and report through 'ascii'
    whether the input is pure ASCII (in which case no conversion is
    needed). 'arr' is accepted for symmetry with getLength but is not
    consulted.

    The input is walked one UTF-8 sequence at a time, consuming bytes
    the same way copyFromArray2 does: lead byte 0xe2 starts a 3-byte
    sequence, the listed 0xc2..0xd2 lead bytes start a 2-byte sequence,
    and anything else is a single byte. Every complete sequence yields
    one output byte. A sequence cut short by the terminator yields
    none, so the running total can never underflow on malformed input.
*/
static size_t getLength2(const char *arr, const char* input, bool &ascii)
{
    ascii = true;
    size_t len = 0;
    const char* ptr = input;
    unsigned char inp = *ptr;

    // Do away with the ascii part of the string first (this is almost
    // always the entire string.)
    while(inp && inp < 128)
        inp = *(++ptr);
    len += (ptr-input);

    // If we're not at the null terminator at this point, then there
    // were some non-ascii characters to deal with. Go to slow-mode for
    // the rest of the string.
    if(inp)
    {
        ascii = false;
        while(inp)
        {
            // How many input bytes does the sequence starting here span?
            int seqLen = 1;
            switch(inp)
            {
                case 0xe2: seqLen = 3; break;
                case 0xc2:
                case 0xcb:
                case 0xc4:
                case 0xc6:
                case 0xc3:
                case 0xd0:
                case 0xd1:
                case 0xd2:
                case 0xc5: seqLen = 2; break;
            }

            // Step over the sequence, but never past the terminator.
            int consumed = 0;
            while(consumed < seqLen && *ptr)
            {
                ++ptr;
                ++consumed;
            }

            // Only a complete sequence produces an output byte.
            if(consumed == seqLen)
                len += 1;

            inp = *ptr;
        }
    }
    return len;
}
// Translate the UTF-8 sequence starting at 'chp' into a single legacy
// byte by searching the upper half (128-255) of translation table
// 'arr' for a matching entry. 'chp' is moved past the consumed input
// and 'out' past the written byte.
//
// ASCII bytes and unrecognised lead bytes are copied through as-is. If
// no table entry matches, a message is printed and the lead byte is
// written instead. A sequence cut short by the zero terminator is
// dropped: nothing is written and 'chp' is left ON the terminator, so
// the caller's scan can never run past the end of the string.
static void copyFromArray2(const char *arr, char*& chp, char* &out)
{
    unsigned char ch = *(chp++);
    // Optimize for ASCII values
    if(ch < 128)
    {
        *(out++) = ch;
        return;
    }

    // Sequence length implied by the lead byte
    int len = 1;
    switch (ch)
    {
        case 0xe2: len = 3; break;
        case 0xc2:
        case 0xcb:
        case 0xc4:
        case 0xc6:
        case 0xc3:
        case 0xd0:
        case 0xd1:
        case 0xd2:
        case 0xc5: len = 2; break;
    }

    if (len == 1) // Not a lead byte we know how to translate; pass it through
    {
        *(out++) = ch;
        return;
    }

    // Fetch the continuation bytes, only advancing over non-zero bytes
    // so the terminator is never skipped.
    unsigned char ch2 = *chp;
    unsigned char ch3 = '\0';
    if (ch2 != 0)
    {
        ++chp;
        if (len == 3)
        {
            ch3 = *chp;
            if (ch3 != 0)
                ++chp;
        }
    }

    if (ch2 == 0 || (len == 3 && ch3 == 0))
    {
        std::cout << "Truncated UTF-8 sequence starting with " << std::hex << (int)ch << std::dec << std::endl;
        return;
    }

    for (int i = 128; i < 256; i++)
    {
        unsigned char b1 = arr[i*6 + 1], b2 = arr[i*6 + 2], b3 = arr[i*6 + 3];
        if (b1 == ch && b2 == ch2 && (len != 3 || b3 == ch3))
        {
            *(out++) = (char)i;
            return;
        }
    }

    // std::hex is sticky, so switch the stream back to decimal afterwards.
    std::cout << "Could not find glyph " << std::hex << (int)ch << " " << (int)ch2 << " " << (int)ch3 << std::dec << std::endl;

    *(out++) = ch; // Could not find glyph, just put whatever
}
// Convert the UTF-8 text previously written into the shared input
// buffer (see getBuffer) to the single-byte code page 'to' and return
// it. An unsupported 'to' value asserts in debug builds and yields an
// empty string otherwise.
std::string ToUTF8::getLegacyEnc(ToUTF8::FromType to)
{
    // Pick translation array. Start from a null pointer so that an
    // unexpected enum value can never leave it uninitialised when
    // asserts are compiled out.
    const char *arr = 0;
    switch (to)
    {
        case ToUTF8::WINDOWS_1252:
        {
            arr = ToUTF8::windows_1252;
            break;
        }
        case ToUTF8::WINDOWS_1250:
        {
            arr = ToUTF8::windows_1250;
            break;
        }
        case ToUTF8::WINDOWS_1251:
        {
            arr = ToUTF8::windows_1251;
            break;
        }
        default:
        {
            assert(0);
            // With NDEBUG the assert vanishes; bail out rather than
            // search through an invalid table pointer.
            return std::string();
        }
    }

    // Double check that the input string stops at some point (it might
    // contain zero terminators before this, inside its own data, which
    // is also ok.)
    char* input = &buf[0];
    assert(input[size] == 0);

    // TODO: The rest of this function is designed for single-character
    // output encodings only. It also assumes that the output encoding
    // shares its first 128 values (0-127) with ASCII. These conditions
    // must be checked again if you add more encodings later.

    // Compute output length, and check for pure ascii input at the same
    // time.
    bool ascii;
    size_t outlen = getLength2(arr, input, ascii);

    // If we're pure ascii, then don't bother converting anything.
    if(ascii)
        return std::string(input, outlen);

    // Make sure the output is large enough
    resize(output, outlen);
    char *out = &output[0];

    // Translate; copyFromArray2 advances 'input' itself
    while(*input)
        copyFromArray2(arr, input, out);

    // Make sure that we wrote the correct number of bytes
    assert((out-&output[0]) == (int)outlen);

    // And make extra sure the output is null terminated
    assert(output.size() > outlen);
    assert(output[outlen] == 0);

    // Return a string
    return std::string(&output[0], outlen);
}
ToUTF8::FromType ToUTF8::calculateEncoding(const std::string& encodingName)
{
if (encodingName == "win1250")

@ -16,15 +16,6 @@ namespace ToUTF8
// probably others)
};
// Return a writable buffer of at least 'size' bytes, plus one extra
// byte at index 'size' that is already set to zero. The buffer is
// shared internal storage: it does not have to be freed, and a later
// call may grow it and so invalidate the pointer returned earlier.
char* getBuffer(int size);
// Convert the previously written buffer to UTF8 from the given code
// page.
std::string getUtf8(FromType from);
// Convert the previously written buffer, holding UTF8 text, to the
// given single-byte code page.
std::string getLegacyEnc(FromType to);
FromType calculateEncoding(const std::string& encodingName);
std::string encodingUsingMessage(const std::string& encodingName);

Loading…
Cancel
Save