components/to_utf8: add class Utf8Encoder

actorid
Emanuel Guevel 12 years ago
parent 5c007cd527
commit 740e2b5769

@ -2,6 +2,8 @@
#include <vector> #include <vector>
#include <cassert> #include <cassert>
#include <iostream>
#include <iomanip>
/* This file contains the code to translate from WINDOWS-1252 (native /* This file contains the code to translate from WINDOWS-1252 (native
charset used in English version of Morrowind) to UTF-8. The library charset used in English version of Morrowind) to UTF-8. The library
@ -46,334 +48,611 @@ static std::vector<char> buf (50*1024);
static std::vector<char> output (50*1024); static std::vector<char> output (50*1024);
static int size; static int size;
// Make sure the given vector is large enough for 'size' bytes, using namespace ToUTF8;
// Construct an encoder with a 50 KiB output buffer.
//
// Every member is given a defined value here. The encoder starts out
// on WINDOWS_1252 (the same code page calculateEncoding() falls back
// to), so a conversion requested before setEncoding() has been called
// uses a valid translation table instead of an indeterminate pointer.
// Initialisers are listed in member declaration order.
Utf8Encoder::Utf8Encoder(void):
    mEncoding(ToUTF8::WINDOWS_1252),
    mOutput(50*1024),
    mSize(0),
    translationArray(ToUTF8::windows_1252)
{
}
// Select the legacy code page used by later conversions.
//
// The requested encoding is stored in mEncoding and the matching
// lookup table is installed in translationArray. A value that is not
// one of the known code pages trips the assertion; with assertions
// compiled out the previously installed table is left untouched.
void Utf8Encoder::setEncoding(const FromType sourceEncoding)
{
    mEncoding = sourceEncoding;

    switch (sourceEncoding)
    {
        case ToUTF8::WINDOWS_1250:
            translationArray = ToUTF8::windows_1250;
            return;

        case ToUTF8::WINDOWS_1251:
            translationArray = ToUTF8::windows_1251;
            return;

        case ToUTF8::WINDOWS_1252:
            translationArray = ToUTF8::windows_1252;
            return;
    }

    // Not a code page we have a table for.
    assert(0);
}
// Convert 'input' from the currently selected code page to UTF-8.
//
// 'size' is only used to sanity-check that input[size] is a zero
// terminator; the conversion itself runs up to the first zero byte.
//
// NOTE: this routine handles single-byte source encodings only, and
// it relies on the source encoding sharing values 0-127 with ASCII.
// Re-check both conditions before adding further input encodings.
std::string Utf8Encoder::getUtf8(const char* input, int size)
{
    // The caller must hand us a terminated buffer.
    assert(input[size] == 0);

    // One pass computes the UTF-8 length and tells us whether the
    // text is plain ASCII.
    bool pureAscii;
    const size_t utf8Len = getLength(input, pureAscii);

    // Plain ASCII is already valid UTF-8: return it unchanged.
    if (pureAscii)
        return std::string(input, utf8Len);

    // Grow (and terminate) the shared output buffer.
    resize(utf8Len);
    char* const begin = &mOutput[0];
    char* cursor = begin;

    // Translate byte by byte up to the terminator.
    for (const char* src = input; *src != 0; ++src)
        copyFromArray(*src, cursor);

    // The number of bytes written must match the precomputed length,
    // and the terminator placed by resize() must still be in place.
    assert((cursor - begin) == (int)utf8Len);
    assert(mOutput.size() > utf8Len);
    assert(mOutput[utf8Len] == 0);

    return std::string(begin, utf8Len);
}
// Convert UTF-8 'input' back to the currently selected legacy code
// page.
//
// 'size' is only used to sanity-check that input[size] is a zero
// terminator; the conversion itself runs up to the first zero byte.
//
// NOTE: this routine targets single-byte legacy encodings only, and
// it relies on the legacy encoding sharing values 0-127 with ASCII.
// Re-check both conditions before adding further encodings.
std::string Utf8Encoder::getLegacyEnc(const char *input, int size)
{
    // The caller must hand us a terminated buffer.
    assert(input[size] == 0);

    // One pass computes the converted length and tells us whether the
    // text is plain ASCII.
    bool pureAscii;
    const size_t legacyLen = getLength2(input, pureAscii);

    // Plain ASCII needs no conversion at all.
    if (pureAscii)
        return std::string(input, legacyLen);

    // Grow (and terminate) the shared output buffer.
    resize(legacyLen);
    char* const begin = &mOutput[0];
    char* cursor = begin;

    // copyFromArray2() advances the read position itself, because one
    // output byte may consume several input bytes.
    const char* src = input;
    while (*src != 0)
        copyFromArray2(src, cursor);

    // The number of bytes written must match the precomputed length,
    // and the terminator placed by resize() must still be in place.
    assert((cursor - begin) == (int)legacyLen);
    assert(mOutput.size() > legacyLen);
    assert(mOutput[legacyLen] == 0);

    return std::string(begin, legacyLen);
}
// Make sure the output vector is large enough for 'size' bytes,
// including a terminating zero after it. // including a terminating zero after it.
// Ensure mOutput can hold 'size' bytes plus a terminating zero, and
// write that terminator at index 'size'.
void Utf8Encoder::resize(size_t size)
{
    if (mOutput.size() <= size)
        // Over-allocate to reduce the chance of having to resize again
        // later. The extra "+ 1" keeps room for the terminator even
        // when size is 0, where 3*size alone would leave the buffer
        // empty and make the write below out of bounds.
        mOutput.resize(3*size + 1);

    // And make sure the string is zero terminated
    mOutput[size] = 0;
}
/** Compute the number of bytes needed to hold the given string once
    it has been translated through the current translation array. The
    array stores 6 bytes per input character: the first is the length
    of the translated sequence, the following 5 hold its data.

    As an optimization the function also reports whether the input is
    pure ascii (every value <= 127). In that case 'ascii' is set to
    true and the caller can skip the conversion entirely.
*/
size_t Utf8Encoder::getLength(const char* input, bool &ascii)
{
    const unsigned char* const start = reinterpret_cast<const unsigned char*>(input);
    const unsigned char* cur = start;

    // Fast path: skip over the leading ascii run (this is almost
    // always the whole string). Each such byte maps to one byte.
    while (*cur != 0 && *cur < 128)
        ++cur;

    size_t total = cur - start;

    // Reaching the terminator here means nothing but ascii was seen.
    ascii = (*cur == 0);

    // Slow path for the remainder: add up the translated length of
    // each character as recorded in the lookup table.
    for (; *cur != 0; ++cur)
        total += translationArray[*cur * 6];

    return total;
}
// Write the translation of the single input character 'ch' to 'out',
// using the current translation array, and advance 'out' past the
// bytes written.
void Utf8Encoder::copyFromArray(unsigned char ch, char* &out)
{
    if (ch >= 128)
    {
        // Each table entry is 6 bytes: a length followed by the data.
        const char* const entry = translationArray + ch*6;
        const int count = entry[0];

        for (int k = 1; k <= count; ++k)
            *out++ = entry[k];
        return;
    }

    // ASCII values are copied through unchanged.
    *out++ = ch;
}
/** Compute the number of bytes needed to hold the UTF-8 string
    'input' once it has been converted back to a single-byte legacy
    encoding.

    The input is walked exactly the way copyFromArray2() consumes it:
    a recognised lead byte swallows its continuation bytes (two after
    0xe2, one after the other lead bytes listed below) and the whole
    sequence counts as one output byte; any other byte counts as one
    output byte on its own. A sequence cut short by the terminator
    still counts as one byte, so the running total never goes below
    zero and cannot wrap around.

    As an optimization the function also reports whether the input is
    pure ascii (every value <= 127). In that case 'ascii' is set to
    true and the caller can skip the conversion entirely.
*/
size_t Utf8Encoder::getLength2(const char* input, bool &ascii)
{
    ascii = true;
    const char* ptr = input;
    unsigned char inp = *ptr;

    // Do away with the ascii part of the string first (this is almost
    // always the entire string.)
    while (inp && inp < 128)
        inp = *(++ptr);
    size_t len = ptr - input;

    // If we're not at the null terminator at this point, then there
    // were some non-ascii characters to deal with. Go to slow-mode for
    // the rest of the string.
    if (inp)
    {
        ascii = false;
        while (inp)
        {
            // Number of continuation bytes that follow this byte.
            int trailing = 0;
            switch (inp)
            {
                case 0xe2: trailing = 2; break;
                case 0xc2:
                case 0xcb:
                case 0xc4:
                case 0xc6:
                case 0xc3:
                case 0xd0:
                case 0xd1:
                case 0xd2:
                case 0xc5: trailing = 1; break;
            }

            // One output byte per sequence (or per stand-alone byte).
            len += 1;
            inp = *(++ptr);

            // Skip the continuation bytes, but never step over the
            // terminator.
            for (; trailing > 0 && inp; --trailing)
                inp = *(++ptr);
        }
    }
    return len;
}
// Convert one UTF-8 sequence starting at 'chp' into a single byte of
// the current legacy encoding, write it to 'out', and advance both
// pointers.
//
// ASCII bytes, and bytes that are not one of the recognised lead
// bytes, are copied through unchanged. For a recognised lead byte the
// translation array is searched (entries 128-255) for the entry whose
// UTF-8 data matches the sequence; the index of that entry is the
// legacy byte. If no entry matches, a diagnostic is printed and the
// lead byte itself is written.
//
// The terminating zero of the input is never consumed: a sequence
// that is cut short by the terminator is reported like an unknown
// glyph, and 'chp' is left pointing at the terminator so the caller's
// loop stops there.
void Utf8Encoder::copyFromArray2(const char*& chp, char* &out)
{
    unsigned char ch = *(chp++);

    // Optimize for ASCII values
    if (ch < 128)
    {
        *(out++) = ch;
        return;
    }

    // Total length of the UTF-8 sequence introduced by this byte.
    int len = 1;
    switch (ch)
    {
        case 0xe2: len = 3; break;
        case 0xc2:
        case 0xcb:
        case 0xc4:
        case 0xc6:
        case 0xc3:
        case 0xd0:
        case 0xd1:
        case 0xd2:
        case 0xc5: len = 2; break;
    }

    // Not a lead byte we know how to handle: pass it through as is.
    if (len == 1)
    {
        *(out++) = ch;
        return;
    }

    // Fetch the continuation bytes, stopping at the terminator.
    unsigned char ch2 = '\0';
    unsigned char ch3 = '\0';
    bool complete = (*chp != 0);
    if (complete)
        ch2 = *(chp++);
    if (len == 3)
    {
        complete = complete && (*chp != 0);
        if (complete)
            ch3 = *(chp++);
    }

    if (complete)
    {
        // Reverse lookup: find the legacy byte whose UTF-8 form is
        // this sequence.
        for (int i = 128; i < 256; i++)
        {
            unsigned char b1 = translationArray[i*6 + 1], b2 = translationArray[i*6 + 2], b3 = translationArray[i*6 + 3];
            if (b1 == ch && b2 == ch2 && (len != 3 || b3 == ch3))
            {
                *(out++) = (char)i;
                return;
            }
        }
    }

    // std::hex is sticky, so save and restore the stream's format
    // flags; otherwise all later output on std::cout would be printed
    // in hexadecimal.
    const std::ios_base::fmtflags savedFlags = std::cout.flags();
    std::cout << "Could not find glyph " << std::hex << (int)ch << " " << (int)ch2 << " " << (int)ch3 << std::endl;
    std::cout.flags(savedFlags);

    *(out++) = ch; // Could not find glyph, just put whatever
}
static void resize(std::vector<char> &buf, size_t size) static void resize(std::vector<char> &buf, size_t size)
{ {
if(buf.size() <= size) if(buf.size() <= size)
// Add some extra padding to reduce the chance of having to resize // Add some extra padding to reduce the chance of having to resize
// again later. // again later.
buf.resize(3*size); buf.resize(3*size);
// And make sure the string is zero terminated // And make sure the string is zero terminated
buf[size] = 0; buf[size] = 0;
} }
// This is just used to spew out a reusable input buffer for the // This is just used to spew out a reusable input buffer for the
// conversion process. // conversion process.
char *ToUTF8::getBuffer(int s) char *ToUTF8::getBuffer(int s)
{ {
// Remember the requested size // Remember the requested size
size = s; size = s;
resize(buf, size); resize(buf, size);
return &buf[0]; return &buf[0];
} }
/** Get the total length needed to decode the given string with /** Get the total length needed to decode the given string with
the given translation array. The arrays are encoded with 6 bytes the given translation array. The arrays are encoded with 6 bytes
per character, with the first giving the length and the next 5 the per character, with the first giving the length and the next 5 the
actual data. actual data.
The function serves a dual purpose for optimization reasons: it The function serves a dual purpose for optimization reasons: it
checks if the input is pure ascii (all values are <= 127). If this checks if the input is pure ascii (all values are <= 127). If this
is the case, then the ascii parameter is set to true, and the is the case, then the ascii parameter is set to true, and the
caller can optimize for this case. caller can optimize for this case.
*/ */
static size_t getLength(const char *arr, const char* input, bool &ascii) static size_t getLength(const char *arr, const char* input, bool &ascii)
{ {
ascii = true; ascii = true;
size_t len = 0; size_t len = 0;
const char* ptr = input; const char* ptr = input;
unsigned char inp = *ptr; unsigned char inp = *ptr;
// Do away with the ascii part of the string first (this is almost // Do away with the ascii part of the string first (this is almost
// always the entire string.) // always the entire string.)
while(inp && inp < 128) while(inp && inp < 128)
inp = *(++ptr); inp = *(++ptr);
len += (ptr-input); len += (ptr-input);
// If we're not at the null terminator at this point, then there // If we're not at the null terminator at this point, then there
// were some non-ascii characters to deal with. Go to slow-mode for // were some non-ascii characters to deal with. Go to slow-mode for
// the rest of the string. // the rest of the string.
if(inp) if(inp)
{ {
ascii = false; ascii = false;
while(inp) while(inp)
{ {
// Find the translated length of this character in the // Find the translated length of this character in the
// lookup table. // lookup table.
len += arr[inp*6]; len += arr[inp*6];
inp = *(++ptr); inp = *(++ptr);
} }
} }
return len; return len;
} }
// Translate one character 'ch' using the translation array 'arr', and // Translate one character 'ch' using the translation array 'arr', and
// advance the output pointer accordingly. // advance the output pointer accordingly.
static void copyFromArray(const char *arr, unsigned char ch, char* &out) static void copyFromArray(const char *arr, unsigned char ch, char* &out)
{ {
// Optimize for ASCII values // Optimize for ASCII values
if(ch < 128) if(ch < 128)
{ {
*(out++) = ch; *(out++) = ch;
return; return;
} }
const char *in = arr + ch*6; const char *in = arr + ch*6;
int len = *(in++); int len = *(in++);
for(int i=0; i<len; i++) for(int i=0; i<len; i++)
*(out++) = *(in++); *(out++) = *(in++);
} }
std::string ToUTF8::getUtf8(ToUTF8::FromType from) std::string ToUTF8::getUtf8(ToUTF8::FromType from)
{ {
// Pick translation array // Pick translation array
const char *arr; const char *arr;
switch (from) switch (from)
{
case ToUTF8::WINDOWS_1252:
{
arr = ToUTF8::windows_1252;
break;
}
case ToUTF8::WINDOWS_1250:
{
arr = ToUTF8::windows_1250;
break;
}
case ToUTF8::WINDOWS_1251:
{
arr = ToUTF8::windows_1251;
break;
}
default:
{ {
assert(0); case ToUTF8::WINDOWS_1252:
{
arr = ToUTF8::windows_1252;
break;
}
case ToUTF8::WINDOWS_1250:
{
arr = ToUTF8::windows_1250;
break;
}
case ToUTF8::WINDOWS_1251:
{
arr = ToUTF8::windows_1251;
break;
}
default:
{
assert(0);
}
} }
}
// Double check that the input string stops at some point (it might
// Double check that the input string stops at some point (it might // contain zero terminators before this, inside its own data, which
// contain zero terminators before this, inside its own data, which // is also ok.)
// is also ok.) const char* input = &buf[0];
const char* input = &buf[0]; assert(input[size] == 0);
assert(input[size] == 0);
// TODO: The rest of this function is designed for single-character
// TODO: The rest of this function is designed for single-character // input encodings only. It also assumes that the input the input
// input encodings only. It also assumes that the input the input // encoding shares its first 128 values (0-127) with ASCII. These
// encoding shares its first 128 values (0-127) with ASCII. These // conditions must be checked again if you add more input encodings
// conditions must be checked again if you add more input encodings // later.
// later.
// Compute output length, and check for pure ascii input at the same
// Compute output length, and check for pure ascii input at the same // time.
// time. bool ascii;
bool ascii; size_t outlen = getLength(arr, input, ascii);
size_t outlen = getLength(arr, input, ascii);
// If we're pure ascii, then don't bother converting anything.
// If we're pure ascii, then don't bother converting anything. if(ascii)
if(ascii) return std::string(input, outlen);
return std::string(input, outlen);
// Make sure the output is large enough
// Make sure the output is large enough resize(output, outlen);
resize(output, outlen); char *out = &output[0];
char *out = &output[0];
// Translate
// Translate while(*input)
while(*input) copyFromArray(arr, *(input++), out);
copyFromArray(arr, *(input++), out);
// Make sure that we wrote the correct number of bytes
// Make sure that we wrote the correct number of bytes assert((out-&output[0]) == (int)outlen);
assert((out-&output[0]) == (int)outlen);
// And make extra sure the output is null terminated
// And make extra sure the output is null terminated assert(output.size() > outlen);
assert(output.size() > outlen); assert(output[outlen] == 0);
assert(output[outlen] == 0);
// Return a string
// Return a string return std::string(&output[0], outlen);
return std::string(&output[0], outlen);
} }
static size_t getLength2(const char *arr, const char* input, bool &ascii) static size_t getLength2(const char *arr, const char* input, bool &ascii)
{ {
ascii = true; ascii = true;
size_t len = 0; size_t len = 0;
const char* ptr = input; const char* ptr = input;
unsigned char inp = *ptr; unsigned char inp = *ptr;
// Do away with the ascii part of the string first (this is almost // Do away with the ascii part of the string first (this is almost
// always the entire string.) // always the entire string.)
while(inp && inp < 128) while(inp && inp < 128)
inp = *(++ptr); inp = *(++ptr);
len += (ptr-input); len += (ptr-input);
// If we're not at the null terminator at this point, then there // If we're not at the null terminator at this point, then there
// were some non-ascii characters to deal with. Go to slow-mode for // were some non-ascii characters to deal with. Go to slow-mode for
// the rest of the string. // the rest of the string.
if(inp) if(inp)
{ {
ascii = false; ascii = false;
while(inp) while(inp)
{ {
len += 1; len += 1;
// Find the translated length of this character in the // Find the translated length of this character in the
// lookup table. // lookup table.
switch(inp) switch(inp)
{ {
case 0xe2: len -= 2; break; case 0xe2: len -= 2; break;
case 0xc2: case 0xc2:
case 0xcb: case 0xcb:
case 0xc4: case 0xc4:
case 0xc6: case 0xc6:
case 0xc3: case 0xc3:
case 0xd0: case 0xd0:
case 0xd1: case 0xd1:
case 0xd2: case 0xd2:
case 0xc5: len -= 1; break; case 0xc5: len -= 1; break;
} }
inp = *(++ptr); inp = *(++ptr);
} }
} }
return len; return len;
} }
#include <iostream>
#include <iomanip>
static void copyFromArray2(const char *arr, char*& chp, char* &out) static void copyFromArray2(const char *arr, char*& chp, char* &out)
{ {
unsigned char ch = *(chp++); unsigned char ch = *(chp++);
// Optimize for ASCII values // Optimize for ASCII values
if(ch < 128) if(ch < 128)
{ {
*(out++) = ch; *(out++) = ch;
return; return;
} }
int len = 1; int len = 1;
switch (ch) switch (ch)
{
case 0xe2: len = 3; break;
case 0xc2:
case 0xcb:
case 0xc4:
case 0xc6:
case 0xc3:
case 0xd0:
case 0xd1:
case 0xd2:
case 0xc5: len = 2; break;
}
if (len == 1) // There is no 1 length utf-8 glyph that is not 0x20 (empty space)
{
*(out++) = ch;
return;
}
unsigned char ch2 = *(chp++);
unsigned char ch3 = '\0';
if (len == 3)
ch3 = *(chp++);
for (int i = 128; i < 256; i++)
{
unsigned char b1 = arr[i*6 + 1], b2 = arr[i*6 + 2], b3 = arr[i*6 + 3];
if (b1 == ch && b2 == ch2 && (len != 3 || b3 == ch3))
{
*(out++) = (char)i;
return;
}
}
std::cout << "Could not find glyph " << std::hex << (int)ch << " " << (int)ch2 << " " << (int)ch3 << std::endl;
*(out++) = ch; // Could not find glyph, just put whatever
}
std::string ToUTF8::getLegacyEnc(ToUTF8::FromType to)
{
// Pick translation array
const char *arr;
switch (to)
{
case ToUTF8::WINDOWS_1252:
{ {
arr = ToUTF8::windows_1252; case 0xe2: len = 3; break;
break; case 0xc2:
case 0xcb:
case 0xc4:
case 0xc6:
case 0xc3:
case 0xd0:
case 0xd1:
case 0xd2:
case 0xc5: len = 2; break;
} }
case ToUTF8::WINDOWS_1250:
if (len == 1) // There is no 1 length utf-8 glyph that is not 0x20 (empty space)
{ {
arr = ToUTF8::windows_1250; *(out++) = ch;
break; return;
} }
case ToUTF8::WINDOWS_1251:
unsigned char ch2 = *(chp++);
unsigned char ch3 = '\0';
if (len == 3)
ch3 = *(chp++);
for (int i = 128; i < 256; i++)
{ {
arr = ToUTF8::windows_1251; unsigned char b1 = arr[i*6 + 1], b2 = arr[i*6 + 2], b3 = arr[i*6 + 3];
break; if (b1 == ch && b2 == ch2 && (len != 3 || b3 == ch3))
{
*(out++) = (char)i;
return;
}
} }
default:
std::cout << "Could not find glyph " << std::hex << (int)ch << " " << (int)ch2 << " " << (int)ch3 << std::endl;
*(out++) = ch; // Could not find glyph, just put whatever
}
std::string ToUTF8::getLegacyEnc(ToUTF8::FromType to)
{
// Pick translation array
const char *arr;
switch (to)
{ {
assert(0); case ToUTF8::WINDOWS_1252:
{
arr = ToUTF8::windows_1252;
break;
}
case ToUTF8::WINDOWS_1250:
{
arr = ToUTF8::windows_1250;
break;
}
case ToUTF8::WINDOWS_1251:
{
arr = ToUTF8::windows_1251;
break;
}
default:
{
assert(0);
}
} }
}
// Double check that the input string stops at some point (it might
// Double check that the input string stops at some point (it might // contain zero terminators before this, inside its own data, which
// contain zero terminators before this, inside its own data, which // is also ok.)
// is also ok.) char* input = &buf[0];
char* input = &buf[0]; assert(input[size] == 0);
assert(input[size] == 0);
// TODO: The rest of this function is designed for single-character
// TODO: The rest of this function is designed for single-character // input encodings only. It also assumes that the input the input
// input encodings only. It also assumes that the input the input // encoding shares its first 128 values (0-127) with ASCII. These
// encoding shares its first 128 values (0-127) with ASCII. These // conditions must be checked again if you add more input encodings
// conditions must be checked again if you add more input encodings // later.
// later.
// Compute output length, and check for pure ascii input at the same
// Compute output length, and check for pure ascii input at the same // time.
// time. bool ascii;
bool ascii; size_t outlen = getLength2(arr, input, ascii);
size_t outlen = getLength2(arr, input, ascii);
// If we're pure ascii, then don't bother converting anything.
// If we're pure ascii, then don't bother converting anything. if(ascii)
if(ascii) return std::string(input, outlen);
return std::string(input, outlen);
// Make sure the output is large enough
// Make sure the output is large enough resize(output, outlen);
resize(output, outlen); char *out = &output[0];
char *out = &output[0];
// Translate
// Translate while(*input)
while(*input) copyFromArray2(arr, input, out);
copyFromArray2(arr, input, out);
// Make sure that we wrote the correct number of bytes
// Make sure that we wrote the correct number of bytes assert((out-&output[0]) == (int)outlen);
assert((out-&output[0]) == (int)outlen);
// And make extra sure the output is null terminated
// And make extra sure the output is null terminated assert(output.size() > outlen);
assert(output.size() > outlen); assert(output[outlen] == 0);
assert(output[outlen] == 0);
// Return a string
// Return a string return std::string(&output[0], outlen);
return std::string(&output[0], outlen);
} }
ToUTF8::FromType ToUTF8::calculateEncoding(const std::string& encodingName) ToUTF8::FromType ToUTF8::calculateEncoding(const std::string& encodingName)
{ {
if (encodingName == "win1250") if (encodingName == "win1250")
return ToUTF8::WINDOWS_1250; return ToUTF8::WINDOWS_1250;
else if (encodingName == "win1251") else if (encodingName == "win1251")
return ToUTF8::WINDOWS_1251; return ToUTF8::WINDOWS_1251;
else else
return ToUTF8::WINDOWS_1252; return ToUTF8::WINDOWS_1252;
} }
std::string ToUTF8::encodingUsingMessage(const std::string& encodingName) std::string ToUTF8::encodingUsingMessage(const std::string& encodingName)
{ {
if (encodingName == "win1250") if (encodingName == "win1250")
return "Using Central and Eastern European font encoding."; return "Using Central and Eastern European font encoding.";
else if (encodingName == "win1251") else if (encodingName == "win1251")
return "Using Cyrillic font encoding."; return "Using Cyrillic font encoding.";
else else
return "Using default (English) font encoding."; return "Using default (English) font encoding.";
} }

@ -2,29 +2,66 @@
#define COMPONENTS_TOUTF8_H #define COMPONENTS_TOUTF8_H
#include <string> #include <string>
#include <cstring>
#include <vector>
namespace ToUTF8 namespace ToUTF8
{ {
// These are all the currently supported code pages // These are all the currently supported code pages
enum FromType enum FromType
{ {
WINDOWS_1250, // Central and Eastern European languages WINDOWS_1250, // Central and Eastern European languages
WINDOWS_1251, // Cyrillic languages WINDOWS_1251, // Cyrillic languages
WINDOWS_1252 // Used by English version of Morrowind (and WINDOWS_1252 // Used by English version of Morrowind (and
// probably others) // probably others)
}; };
// Return a writable buffer of at least 'size' bytes. The buffer // Return a writable buffer of at least 'size' bytes. The buffer
// does not have to be freed. // does not have to be freed.
char* getBuffer(int size); char* getBuffer(int size);
// Convert the previously written buffer to UTF8 from the given code // Convert the previously written buffer to UTF8 from the given code
// page. // page.
std::string getUtf8(FromType from); std::string getUtf8(FromType from);
std::string getLegacyEnc(FromType to); std::string getLegacyEnc(FromType to);
FromType calculateEncoding(const std::string& encodingName); FromType calculateEncoding(const std::string& encodingName);
std::string encodingUsingMessage(const std::string& encodingName); std::string encodingUsingMessage(const std::string& encodingName);
// class
// Converts text between a single-byte legacy code page and UTF-8.
// The code page is chosen with setEncoding(); converted text is built
// in an internal buffer that is reused between calls.
class Utf8Encoder
{
public:
    Utf8Encoder();

    // Select the code page used by the conversion functions below.
    void setEncoding(const FromType sourceEncoding);

    // Convert to UTF8 from the previously given code page.
    // 'input' must have a zero terminator at input[size].
    std::string getUtf8(const char *input, int size);

    // Convenience overload for std::string input.
    std::string getUtf8(const std::string &str)
    {
        return getUtf8(str.c_str(), static_cast<int>(str.size()));
    }

    // Convert from UTF8 to the previously given code page.
    // 'input' must have a zero terminator at input[size].
    std::string getLegacyEnc(const char *input, int size);

    // Convenience overload for std::string input.
    std::string getLegacyEnc(const std::string &str)
    {
        return getLegacyEnc(str.c_str(), static_cast<int>(str.size()));
    }

private:
    // Make room in mOutput for 'size' bytes plus a terminating zero.
    void resize(size_t size);

    // Legacy code page -> UTF8: output length, and per-character copy.
    size_t getLength(const char* input, bool &ascii);
    void copyFromArray(unsigned char ch, char* &out);

    // UTF8 -> legacy code page: output length, and per-sequence copy.
    size_t getLength2(const char* input, bool &ascii);
    void copyFromArray2(const char*& chp, char* &out);

    FromType mEncoding;          // code page given to setEncoding()
    std::vector<char> mOutput;   // reusable conversion output buffer
    int mSize;
    char* translationArray;      // lookup table for mEncoding
};
} }
#endif #endif

Loading…
Cancel
Save