|
|
|
@ -2,6 +2,8 @@
|
|
|
|
|
|
|
|
|
|
#include <vector>
|
|
|
|
|
#include <cassert>
|
|
|
|
|
#include <iostream>
|
|
|
|
|
#include <iomanip>
|
|
|
|
|
|
|
|
|
|
/* This file contains the code to translate from WINDOWS-1252 (native
|
|
|
|
|
charset used in English version of Morrowind) to UTF-8. The library
|
|
|
|
@ -46,334 +48,611 @@ static std::vector<char> buf (50*1024);
|
|
|
|
|
static std::vector<char> output (50*1024);
|
|
|
|
|
static int size;
|
|
|
|
|
|
|
|
|
|
// Make sure the given vector is large enough for 'size' bytes,
|
|
|
|
|
using namespace ToUTF8;
|
|
|
|
|
|
|
|
|
|
// Construct an encoder whose output scratch buffer starts out at 50 KiB.
Utf8Encoder::Utf8Encoder()
    : mOutput(50 * 1024)
{
}
|
|
|
|
|
|
|
|
|
|
void Utf8Encoder::setEncoding(const FromType sourceEncoding)
|
|
|
|
|
{
|
|
|
|
|
mEncoding = sourceEncoding;
|
|
|
|
|
|
|
|
|
|
switch (mEncoding)
|
|
|
|
|
{
|
|
|
|
|
case ToUTF8::WINDOWS_1252:
|
|
|
|
|
{
|
|
|
|
|
translationArray = ToUTF8::windows_1252;
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
case ToUTF8::WINDOWS_1250:
|
|
|
|
|
{
|
|
|
|
|
translationArray = ToUTF8::windows_1250;
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
case ToUTF8::WINDOWS_1251:
|
|
|
|
|
{
|
|
|
|
|
translationArray = ToUTF8::windows_1251;
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
default:
|
|
|
|
|
{
|
|
|
|
|
assert(0);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
std::string Utf8Encoder::getUtf8(const char* input, int size)
|
|
|
|
|
{
|
|
|
|
|
// Double check that the input string stops at some point (it might
|
|
|
|
|
// contain zero terminators before this, inside its own data, which
|
|
|
|
|
// is also ok.)
|
|
|
|
|
assert(input[size] == 0);
|
|
|
|
|
|
|
|
|
|
// TODO: The rest of this function is designed for single-character
|
|
|
|
|
// input encodings only. It also assumes that the input the input
|
|
|
|
|
// encoding shares its first 128 values (0-127) with ASCII. These
|
|
|
|
|
// conditions must be checked again if you add more input encodings
|
|
|
|
|
// later.
|
|
|
|
|
|
|
|
|
|
// Compute output length, and check for pure ascii input at the same
|
|
|
|
|
// time.
|
|
|
|
|
bool ascii;
|
|
|
|
|
size_t outlen = getLength(input, ascii);
|
|
|
|
|
|
|
|
|
|
// If we're pure ascii, then don't bother converting anything.
|
|
|
|
|
if(ascii)
|
|
|
|
|
return std::string(input, outlen);
|
|
|
|
|
|
|
|
|
|
// Make sure the output is large enough
|
|
|
|
|
resize(outlen);
|
|
|
|
|
char *out = &mOutput[0];
|
|
|
|
|
|
|
|
|
|
// Translate
|
|
|
|
|
while (*input)
|
|
|
|
|
copyFromArray(*(input++), out);
|
|
|
|
|
|
|
|
|
|
// Make sure that we wrote the correct number of bytes
|
|
|
|
|
assert((out-&mOutput[0]) == (int)outlen);
|
|
|
|
|
|
|
|
|
|
// And make extra sure the output is null terminated
|
|
|
|
|
assert(mOutput.size() > outlen);
|
|
|
|
|
assert(mOutput[outlen] == 0);
|
|
|
|
|
|
|
|
|
|
// Return a string
|
|
|
|
|
return std::string(&mOutput[0], outlen);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
std::string Utf8Encoder::getLegacyEnc(const char *input, int size)
|
|
|
|
|
{
|
|
|
|
|
// Double check that the input string stops at some point (it might
|
|
|
|
|
// contain zero terminators before this, inside its own data, which
|
|
|
|
|
// is also ok.)
|
|
|
|
|
assert(input[size] == 0);
|
|
|
|
|
|
|
|
|
|
// TODO: The rest of this function is designed for single-character
|
|
|
|
|
// input encodings only. It also assumes that the input the input
|
|
|
|
|
// encoding shares its first 128 values (0-127) with ASCII. These
|
|
|
|
|
// conditions must be checked again if you add more input encodings
|
|
|
|
|
// later.
|
|
|
|
|
|
|
|
|
|
// Compute output length, and check for pure ascii input at the same
|
|
|
|
|
// time.
|
|
|
|
|
bool ascii;
|
|
|
|
|
size_t outlen = getLength2(input, ascii);
|
|
|
|
|
|
|
|
|
|
// If we're pure ascii, then don't bother converting anything.
|
|
|
|
|
if(ascii)
|
|
|
|
|
return std::string(input, outlen);
|
|
|
|
|
|
|
|
|
|
// Make sure the output is large enough
|
|
|
|
|
resize(outlen);
|
|
|
|
|
char *out = &mOutput[0];
|
|
|
|
|
|
|
|
|
|
// Translate
|
|
|
|
|
while(*input)
|
|
|
|
|
copyFromArray2(input, out);
|
|
|
|
|
|
|
|
|
|
// Make sure that we wrote the correct number of bytes
|
|
|
|
|
assert((out-&mOutput[0]) == (int)outlen);
|
|
|
|
|
|
|
|
|
|
// And make extra sure the output is null terminated
|
|
|
|
|
assert(mOutput.size() > outlen);
|
|
|
|
|
assert(mOutput[outlen] == 0);
|
|
|
|
|
|
|
|
|
|
// Return a string
|
|
|
|
|
return std::string(&mOutput[0], outlen);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Make sure the output vector is large enough for 'size' bytes,
|
|
|
|
|
// including a terminating zero after it.
|
|
|
|
|
void Utf8Encoder::resize(size_t size)
|
|
|
|
|
{
|
|
|
|
|
if (mOutput.size() <= size)
|
|
|
|
|
// Add some extra padding to reduce the chance of having to resize
|
|
|
|
|
// again later.
|
|
|
|
|
mOutput.resize(3*size);
|
|
|
|
|
|
|
|
|
|
// And make sure the string is zero terminated
|
|
|
|
|
mOutput[size] = 0;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/** Get the total length length needed to decode the given string with
|
|
|
|
|
the given translation array. The arrays are encoded with 6 bytes
|
|
|
|
|
per character, with the first giving the length and the next 5 the
|
|
|
|
|
actual data.
|
|
|
|
|
|
|
|
|
|
The function serves a dual purpose for optimization reasons: it
|
|
|
|
|
checks if the input is pure ascii (all values are <= 127). If this
|
|
|
|
|
is the case, then the ascii parameter is set to true, and the
|
|
|
|
|
caller can optimize for this case.
|
|
|
|
|
*/
|
|
|
|
|
size_t Utf8Encoder::getLength(const char* input, bool &ascii)
|
|
|
|
|
{
|
|
|
|
|
ascii = true;
|
|
|
|
|
size_t len = 0;
|
|
|
|
|
const char* ptr = input;
|
|
|
|
|
unsigned char inp = *ptr;
|
|
|
|
|
|
|
|
|
|
// Do away with the ascii part of the string first (this is almost
|
|
|
|
|
// always the entire string.)
|
|
|
|
|
while (inp && inp < 128)
|
|
|
|
|
inp = *(++ptr);
|
|
|
|
|
len += (ptr-input);
|
|
|
|
|
|
|
|
|
|
// If we're not at the null terminator at this point, then there
|
|
|
|
|
// were some non-ascii characters to deal with. Go to slow-mode for
|
|
|
|
|
// the rest of the string.
|
|
|
|
|
if (inp)
|
|
|
|
|
{
|
|
|
|
|
ascii = false;
|
|
|
|
|
while (inp)
|
|
|
|
|
{
|
|
|
|
|
// Find the translated length of this character in the
|
|
|
|
|
// lookup table.
|
|
|
|
|
len += translationArray[inp*6];
|
|
|
|
|
inp = *(++ptr);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
return len;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Translate one character 'ch' using the translation array 'arr', and
|
|
|
|
|
// advance the output pointer accordingly.
|
|
|
|
|
void Utf8Encoder::copyFromArray(unsigned char ch, char* &out)
|
|
|
|
|
{
|
|
|
|
|
// Optimize for ASCII values
|
|
|
|
|
if (ch < 128)
|
|
|
|
|
{
|
|
|
|
|
*(out++) = ch;
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
const char *in = translationArray + ch*6;
|
|
|
|
|
int len = *(in++);
|
|
|
|
|
for (int i=0; i<len; i++)
|
|
|
|
|
*(out++) = *(in++);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/** Compute the number of bytes 'input' (UTF-8) occupies after conversion
    to a single-byte legacy code page, i.e. the number of sequences that
    copyFromArray2 will turn into one output byte each.

    A byte is treated as the start of a 3-byte sequence if it is 0xe2, as
    the start of a 2-byte sequence if it is one of 0xc2-0xc6, 0xcb or
    0xd0-0xd2, and as a 1-byte sequence otherwise.

    Sequences are counted as a whole rather than byte by byte, so a
    sequence cut short by the string terminator contributes nothing and
    the running total can never wrap around below zero.

    'ascii' is set to true when every byte of the input is <= 127, in
    which case the caller can skip the conversion altogether.
 */
size_t Utf8Encoder::getLength2(const char* input, bool &ascii)
{
    const char* ptr = input;

    // Skip the leading ASCII run; this usually covers the whole string.
    while (*ptr != 0 && static_cast<unsigned char>(*ptr) < 128)
        ++ptr;

    size_t len = ptr - input;

    // Stopping anywhere but on the terminator means a non-ASCII byte
    // was found.
    ascii = (*ptr == 0);

    while (*ptr != 0)
    {
        // Number of input bytes the sequence starting here should span.
        int expected = 1;
        switch (static_cast<unsigned char>(*ptr))
        {
            case 0xe2:
                expected = 3;
                break;
            case 0xc2:
            case 0xcb:
            case 0xc4:
            case 0xc6:
            case 0xc3:
            case 0xd0:
            case 0xd1:
            case 0xd2:
            case 0xc5:
                expected = 2;
                break;
            default:
                break;
        }

        // Consume the sequence, but never step over the terminator.
        int consumed = 0;
        while (consumed < expected && *ptr != 0)
        {
            ++ptr;
            ++consumed;
        }

        // Only complete sequences produce an output byte.
        if (consumed == expected)
            ++len;
    }

    return len;
}
|
|
|
|
|
|
|
|
|
|
void Utf8Encoder::copyFromArray2(const char*& chp, char* &out)
|
|
|
|
|
{
|
|
|
|
|
unsigned char ch = *(chp++);
|
|
|
|
|
// Optimize for ASCII values
|
|
|
|
|
if (ch < 128)
|
|
|
|
|
{
|
|
|
|
|
*(out++) = ch;
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
int len = 1;
|
|
|
|
|
switch (ch)
|
|
|
|
|
{
|
|
|
|
|
case 0xe2: len = 3; break;
|
|
|
|
|
case 0xc2:
|
|
|
|
|
case 0xcb:
|
|
|
|
|
case 0xc4:
|
|
|
|
|
case 0xc6:
|
|
|
|
|
case 0xc3:
|
|
|
|
|
case 0xd0:
|
|
|
|
|
case 0xd1:
|
|
|
|
|
case 0xd2:
|
|
|
|
|
case 0xc5: len = 2; break;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (len == 1) // There is no 1 length utf-8 glyph that is not 0x20 (empty space)
|
|
|
|
|
{
|
|
|
|
|
*(out++) = ch;
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
unsigned char ch2 = *(chp++);
|
|
|
|
|
unsigned char ch3 = '\0';
|
|
|
|
|
if (len == 3)
|
|
|
|
|
ch3 = *(chp++);
|
|
|
|
|
|
|
|
|
|
for (int i = 128; i < 256; i++)
|
|
|
|
|
{
|
|
|
|
|
unsigned char b1 = translationArray[i*6 + 1], b2 = translationArray[i*6 + 2], b3 = translationArray[i*6 + 3];
|
|
|
|
|
if (b1 == ch && b2 == ch2 && (len != 3 || b3 == ch3))
|
|
|
|
|
{
|
|
|
|
|
*(out++) = (char)i;
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
std::cout << "Could not find glyph " << std::hex << (int)ch << " " << (int)ch2 << " " << (int)ch3 << std::endl;
|
|
|
|
|
|
|
|
|
|
*(out++) = ch; // Could not find glyph, just put whatever
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Make sure 'buf' can hold 'size' bytes plus a terminating zero, and write
// that zero at index 'size'.
static void resize(std::vector<char> &buf, size_t size)
{
    if (buf.size() <= size)
    {
        // Add some extra padding to reduce the chance of having to resize
        // again later. For size == 0 tripling yields 0, so fall back to
        // size + 1 to keep room for the terminator.
        const size_t padded = 3 * size;
        buf.resize(padded > size ? padded : size + 1);
    }

    // And make sure the string is zero terminated
    buf[size] = 0;
}
|
|
|
|
|
|
|
|
|
|
// This is just used to spew out a reusable input buffer for the
|
|
|
|
|
// conversion process.
|
|
|
|
|
char *ToUTF8::getBuffer(int s)
|
|
|
|
|
{
|
|
|
|
|
// Remember the requested size
|
|
|
|
|
size = s;
|
|
|
|
|
resize(buf, size);
|
|
|
|
|
return &buf[0];
|
|
|
|
|
// Remember the requested size
|
|
|
|
|
size = s;
|
|
|
|
|
resize(buf, size);
|
|
|
|
|
return &buf[0];
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/** Compute the number of bytes needed to hold 'input' once it has been
    translated through the translation array 'arr'. The array stores 6
    bytes per character: the first is the length of the translation, the
    remaining 5 hold the translated bytes.

    As an optimization the function also reports whether the input is
    pure ASCII (every byte is <= 127). In that case 'ascii' is set to
    true and the caller can skip the conversion altogether.
 */
static size_t getLength(const char *arr, const char* input, bool &ascii)
{
    ascii = true;
    size_t len = 0;
    const char* ptr = input;
    unsigned char inp = *ptr;

    // Do away with the ascii part of the string first (this is almost
    // always the entire string.)
    while (inp && inp < 128)
        inp = *(++ptr);
    len += (ptr - input);

    // If we're not at the null terminator at this point, then there
    // were some non-ascii characters to deal with. Go to slow-mode for
    // the rest of the string.
    if (inp)
    {
        ascii = false;
        while (inp)
        {
            // Find the translated length of this character in the
            // lookup table.
            len += arr[inp*6];
            inp = *(++ptr);
        }
    }
    return len;
}
|
|
|
|
|
|
|
|
|
|
// Translate one character 'ch' using the translation array 'arr', and
// advance the output pointer accordingly. Values below 128 are copied
// as-is; everything else is taken from the 6-byte table entry for 'ch'
// (length byte followed by the translated bytes).
static void copyFromArray(const char *arr, unsigned char ch, char* &out)
{
    // Optimize for ASCII values
    if (ch < 128)
    {
        *(out++) = ch;
        return;
    }

    const char *in = arr + ch*6;
    int len = *(in++);
    for (int i = 0; i < len; i++)
        *(out++) = *(in++);
}
|
|
|
|
|
|
|
|
|
|
std::string ToUTF8::getUtf8(ToUTF8::FromType from)
|
|
|
|
|
{
|
|
|
|
|
// Pick translation array
|
|
|
|
|
const char *arr;
|
|
|
|
|
switch (from)
|
|
|
|
|
{
|
|
|
|
|
case ToUTF8::WINDOWS_1252:
|
|
|
|
|
{
|
|
|
|
|
arr = ToUTF8::windows_1252;
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
case ToUTF8::WINDOWS_1250:
|
|
|
|
|
{
|
|
|
|
|
arr = ToUTF8::windows_1250;
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
case ToUTF8::WINDOWS_1251:
|
|
|
|
|
{
|
|
|
|
|
arr = ToUTF8::windows_1251;
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
default:
|
|
|
|
|
// Pick translation array
|
|
|
|
|
const char *arr;
|
|
|
|
|
switch (from)
|
|
|
|
|
{
|
|
|
|
|
assert(0);
|
|
|
|
|
case ToUTF8::WINDOWS_1252:
|
|
|
|
|
{
|
|
|
|
|
arr = ToUTF8::windows_1252;
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
case ToUTF8::WINDOWS_1250:
|
|
|
|
|
{
|
|
|
|
|
arr = ToUTF8::windows_1250;
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
case ToUTF8::WINDOWS_1251:
|
|
|
|
|
{
|
|
|
|
|
arr = ToUTF8::windows_1251;
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
default:
|
|
|
|
|
{
|
|
|
|
|
assert(0);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Double check that the input string stops at some point (it might
|
|
|
|
|
// contain zero terminators before this, inside its own data, which
|
|
|
|
|
// is also ok.)
|
|
|
|
|
const char* input = &buf[0];
|
|
|
|
|
assert(input[size] == 0);
|
|
|
|
|
|
|
|
|
|
// TODO: The rest of this function is designed for single-character
|
|
|
|
|
// input encodings only. It also assumes that the input the input
|
|
|
|
|
// encoding shares its first 128 values (0-127) with ASCII. These
|
|
|
|
|
// conditions must be checked again if you add more input encodings
|
|
|
|
|
// later.
|
|
|
|
|
|
|
|
|
|
// Compute output length, and check for pure ascii input at the same
|
|
|
|
|
// time.
|
|
|
|
|
bool ascii;
|
|
|
|
|
size_t outlen = getLength(arr, input, ascii);
|
|
|
|
|
|
|
|
|
|
// If we're pure ascii, then don't bother converting anything.
|
|
|
|
|
if(ascii)
|
|
|
|
|
return std::string(input, outlen);
|
|
|
|
|
|
|
|
|
|
// Make sure the output is large enough
|
|
|
|
|
resize(output, outlen);
|
|
|
|
|
char *out = &output[0];
|
|
|
|
|
|
|
|
|
|
// Translate
|
|
|
|
|
while(*input)
|
|
|
|
|
copyFromArray(arr, *(input++), out);
|
|
|
|
|
|
|
|
|
|
// Make sure that we wrote the correct number of bytes
|
|
|
|
|
assert((out-&output[0]) == (int)outlen);
|
|
|
|
|
|
|
|
|
|
// And make extra sure the output is null terminated
|
|
|
|
|
assert(output.size() > outlen);
|
|
|
|
|
assert(output[outlen] == 0);
|
|
|
|
|
|
|
|
|
|
// Return a string
|
|
|
|
|
return std::string(&output[0], outlen);
|
|
|
|
|
|
|
|
|
|
// Double check that the input string stops at some point (it might
|
|
|
|
|
// contain zero terminators before this, inside its own data, which
|
|
|
|
|
// is also ok.)
|
|
|
|
|
const char* input = &buf[0];
|
|
|
|
|
assert(input[size] == 0);
|
|
|
|
|
|
|
|
|
|
// TODO: The rest of this function is designed for single-character
|
|
|
|
|
// input encodings only. It also assumes that the input the input
|
|
|
|
|
// encoding shares its first 128 values (0-127) with ASCII. These
|
|
|
|
|
// conditions must be checked again if you add more input encodings
|
|
|
|
|
// later.
|
|
|
|
|
|
|
|
|
|
// Compute output length, and check for pure ascii input at the same
|
|
|
|
|
// time.
|
|
|
|
|
bool ascii;
|
|
|
|
|
size_t outlen = getLength(arr, input, ascii);
|
|
|
|
|
|
|
|
|
|
// If we're pure ascii, then don't bother converting anything.
|
|
|
|
|
if(ascii)
|
|
|
|
|
return std::string(input, outlen);
|
|
|
|
|
|
|
|
|
|
// Make sure the output is large enough
|
|
|
|
|
resize(output, outlen);
|
|
|
|
|
char *out = &output[0];
|
|
|
|
|
|
|
|
|
|
// Translate
|
|
|
|
|
while(*input)
|
|
|
|
|
copyFromArray(arr, *(input++), out);
|
|
|
|
|
|
|
|
|
|
// Make sure that we wrote the correct number of bytes
|
|
|
|
|
assert((out-&output[0]) == (int)outlen);
|
|
|
|
|
|
|
|
|
|
// And make extra sure the output is null terminated
|
|
|
|
|
assert(output.size() > outlen);
|
|
|
|
|
assert(output[outlen] == 0);
|
|
|
|
|
|
|
|
|
|
// Return a string
|
|
|
|
|
return std::string(&output[0], outlen);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/** Compute the number of bytes 'input' (UTF-8) occupies after conversion
    to a single-byte legacy code page, i.e. the number of sequences that
    copyFromArray2 will turn into one output byte each. The translation
    array 'arr' is not consulted.

    A byte is treated as the start of a 3-byte sequence if it is 0xe2, as
    the start of a 2-byte sequence if it is one of 0xc2-0xc6, 0xcb or
    0xd0-0xd2, and as a 1-byte sequence otherwise.

    Sequences are counted as a whole rather than byte by byte, so a
    sequence cut short by the string terminator contributes nothing and
    the running total can never wrap around below zero.

    'ascii' is set to true when every byte of the input is <= 127, in
    which case the caller can skip the conversion altogether.
 */
static size_t getLength2(const char *arr, const char* input, bool &ascii)
{
    (void)arr; // kept for signature symmetry with getLength

    const char* ptr = input;

    // Do away with the ascii part of the string first (this is almost
    // always the entire string.)
    while (*ptr != 0 && static_cast<unsigned char>(*ptr) < 128)
        ++ptr;

    size_t len = ptr - input;

    // Stopping anywhere but on the terminator means a non-ASCII byte
    // was found.
    ascii = (*ptr == 0);

    while (*ptr != 0)
    {
        // Number of input bytes the sequence starting here should span.
        int expected = 1;
        switch (static_cast<unsigned char>(*ptr))
        {
            case 0xe2:
                expected = 3;
                break;
            case 0xc2:
            case 0xcb:
            case 0xc4:
            case 0xc6:
            case 0xc3:
            case 0xd0:
            case 0xd1:
            case 0xd2:
            case 0xc5:
                expected = 2;
                break;
            default:
                break;
        }

        // Consume the sequence, but never step over the terminator.
        int consumed = 0;
        while (consumed < expected && *ptr != 0)
        {
            ++ptr;
            ++consumed;
        }

        // Only complete sequences produce an output byte.
        if (consumed == expected)
            ++len;
    }

    return len;
}
|
|
|
|
|
|
|
|
|
|
#include <iostream>
|
|
|
|
|
#include <iomanip>
|
|
|
|
|
|
|
|
|
|
// Convert the UTF-8 sequence starting at 'chp' into one byte of the legacy
// code page described by 'arr', append it to 'out', and advance both
// pointers.
//
// ASCII bytes, and bytes >= 128 that are not one of the recognised lead
// bytes, are copied through unchanged. For a recognised lead byte the whole
// 2- or 3-byte sequence is consumed and searched for in the upper half
// (entries 128-255) of 'arr'. If no entry matches, a diagnostic is printed
// to std::cout and the lead byte itself is written.
//
// A sequence cut short by the string terminator is dropped without output
// and 'chp' is left pointing at the terminator, so a caller looping on
// '*chp' cannot run past the end of the input.
static void copyFromArray2(const char *arr, char*& chp, char* &out)
{
    unsigned char ch = *(chp++);

    // Optimize for ASCII values
    if (ch < 128)
    {
        *(out++) = ch;
        return;
    }

    // Length of the UTF-8 sequence announced by the lead byte.
    int len = 1;
    switch (ch)
    {
        case 0xe2: len = 3; break;
        case 0xc2:
        case 0xcb:
        case 0xc4:
        case 0xc6:
        case 0xc3:
        case 0xd0:
        case 0xd1:
        case 0xd2:
        case 0xc5: len = 2; break;
    }

    // Not a lead byte we know how to translate: pass it through.
    if (len == 1)
    {
        *(out++) = ch;
        return;
    }

    // Do not read beyond the terminator if the sequence is truncated.
    if (*chp == 0)
        return;
    unsigned char ch2 = *(chp++);

    unsigned char ch3 = '\0';
    if (len == 3)
    {
        if (*chp == 0)
            return;
        ch3 = *(chp++);
    }

    // Reverse lookup: find the legacy byte whose UTF-8 form matches.
    for (int i = 128; i < 256; i++)
    {
        unsigned char b1 = arr[i*6 + 1], b2 = arr[i*6 + 2], b3 = arr[i*6 + 3];
        if (b1 == ch && b2 == ch2 && (len != 3 || b3 == ch3))
        {
            *(out++) = (char)i;
            return;
        }
    }

    // std::hex is sticky, so restore the stream's formatting afterwards;
    // otherwise every later integer written to std::cout would be in hex.
    const std::ios_base::fmtflags savedFlags = std::cout.flags();
    std::cout << "Could not find glyph " << std::hex << (int)ch << " " << (int)ch2 << " " << (int)ch3 << std::endl;
    std::cout.flags(savedFlags);

    *(out++) = ch; // Could not find glyph, just put whatever
}
|
|
|
|
|
|
|
|
|
|
std::string ToUTF8::getLegacyEnc(ToUTF8::FromType to)
|
|
|
|
|
{
|
|
|
|
|
// Pick translation array
|
|
|
|
|
const char *arr;
|
|
|
|
|
switch (to)
|
|
|
|
|
{
|
|
|
|
|
case ToUTF8::WINDOWS_1252:
|
|
|
|
|
int len = 1;
|
|
|
|
|
switch (ch)
|
|
|
|
|
{
|
|
|
|
|
arr = ToUTF8::windows_1252;
|
|
|
|
|
break;
|
|
|
|
|
case 0xe2: len = 3; break;
|
|
|
|
|
case 0xc2:
|
|
|
|
|
case 0xcb:
|
|
|
|
|
case 0xc4:
|
|
|
|
|
case 0xc6:
|
|
|
|
|
case 0xc3:
|
|
|
|
|
case 0xd0:
|
|
|
|
|
case 0xd1:
|
|
|
|
|
case 0xd2:
|
|
|
|
|
case 0xc5: len = 2; break;
|
|
|
|
|
}
|
|
|
|
|
case ToUTF8::WINDOWS_1250:
|
|
|
|
|
|
|
|
|
|
if (len == 1) // There is no 1 length utf-8 glyph that is not 0x20 (empty space)
|
|
|
|
|
{
|
|
|
|
|
arr = ToUTF8::windows_1250;
|
|
|
|
|
break;
|
|
|
|
|
*(out++) = ch;
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
case ToUTF8::WINDOWS_1251:
|
|
|
|
|
|
|
|
|
|
unsigned char ch2 = *(chp++);
|
|
|
|
|
unsigned char ch3 = '\0';
|
|
|
|
|
if (len == 3)
|
|
|
|
|
ch3 = *(chp++);
|
|
|
|
|
|
|
|
|
|
for (int i = 128; i < 256; i++)
|
|
|
|
|
{
|
|
|
|
|
arr = ToUTF8::windows_1251;
|
|
|
|
|
break;
|
|
|
|
|
unsigned char b1 = arr[i*6 + 1], b2 = arr[i*6 + 2], b3 = arr[i*6 + 3];
|
|
|
|
|
if (b1 == ch && b2 == ch2 && (len != 3 || b3 == ch3))
|
|
|
|
|
{
|
|
|
|
|
*(out++) = (char)i;
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
default:
|
|
|
|
|
|
|
|
|
|
std::cout << "Could not find glyph " << std::hex << (int)ch << " " << (int)ch2 << " " << (int)ch3 << std::endl;
|
|
|
|
|
|
|
|
|
|
*(out++) = ch; // Could not find glyph, just put whatever
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
std::string ToUTF8::getLegacyEnc(ToUTF8::FromType to)
|
|
|
|
|
{
|
|
|
|
|
// Pick translation array
|
|
|
|
|
const char *arr;
|
|
|
|
|
switch (to)
|
|
|
|
|
{
|
|
|
|
|
assert(0);
|
|
|
|
|
case ToUTF8::WINDOWS_1252:
|
|
|
|
|
{
|
|
|
|
|
arr = ToUTF8::windows_1252;
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
case ToUTF8::WINDOWS_1250:
|
|
|
|
|
{
|
|
|
|
|
arr = ToUTF8::windows_1250;
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
case ToUTF8::WINDOWS_1251:
|
|
|
|
|
{
|
|
|
|
|
arr = ToUTF8::windows_1251;
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
default:
|
|
|
|
|
{
|
|
|
|
|
assert(0);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Double check that the input string stops at some point (it might
|
|
|
|
|
// contain zero terminators before this, inside its own data, which
|
|
|
|
|
// is also ok.)
|
|
|
|
|
char* input = &buf[0];
|
|
|
|
|
assert(input[size] == 0);
|
|
|
|
|
|
|
|
|
|
// TODO: The rest of this function is designed for single-character
|
|
|
|
|
// input encodings only. It also assumes that the input the input
|
|
|
|
|
// encoding shares its first 128 values (0-127) with ASCII. These
|
|
|
|
|
// conditions must be checked again if you add more input encodings
|
|
|
|
|
// later.
|
|
|
|
|
|
|
|
|
|
// Compute output length, and check for pure ascii input at the same
|
|
|
|
|
// time.
|
|
|
|
|
bool ascii;
|
|
|
|
|
size_t outlen = getLength2(arr, input, ascii);
|
|
|
|
|
|
|
|
|
|
// If we're pure ascii, then don't bother converting anything.
|
|
|
|
|
if(ascii)
|
|
|
|
|
return std::string(input, outlen);
|
|
|
|
|
|
|
|
|
|
// Make sure the output is large enough
|
|
|
|
|
resize(output, outlen);
|
|
|
|
|
char *out = &output[0];
|
|
|
|
|
|
|
|
|
|
// Translate
|
|
|
|
|
while(*input)
|
|
|
|
|
copyFromArray2(arr, input, out);
|
|
|
|
|
|
|
|
|
|
// Make sure that we wrote the correct number of bytes
|
|
|
|
|
assert((out-&output[0]) == (int)outlen);
|
|
|
|
|
|
|
|
|
|
// And make extra sure the output is null terminated
|
|
|
|
|
assert(output.size() > outlen);
|
|
|
|
|
assert(output[outlen] == 0);
|
|
|
|
|
|
|
|
|
|
// Return a string
|
|
|
|
|
return std::string(&output[0], outlen);
|
|
|
|
|
|
|
|
|
|
// Double check that the input string stops at some point (it might
|
|
|
|
|
// contain zero terminators before this, inside its own data, which
|
|
|
|
|
// is also ok.)
|
|
|
|
|
char* input = &buf[0];
|
|
|
|
|
assert(input[size] == 0);
|
|
|
|
|
|
|
|
|
|
// TODO: The rest of this function is designed for single-character
|
|
|
|
|
// input encodings only. It also assumes that the input the input
|
|
|
|
|
// encoding shares its first 128 values (0-127) with ASCII. These
|
|
|
|
|
// conditions must be checked again if you add more input encodings
|
|
|
|
|
// later.
|
|
|
|
|
|
|
|
|
|
// Compute output length, and check for pure ascii input at the same
|
|
|
|
|
// time.
|
|
|
|
|
bool ascii;
|
|
|
|
|
size_t outlen = getLength2(arr, input, ascii);
|
|
|
|
|
|
|
|
|
|
// If we're pure ascii, then don't bother converting anything.
|
|
|
|
|
if(ascii)
|
|
|
|
|
return std::string(input, outlen);
|
|
|
|
|
|
|
|
|
|
// Make sure the output is large enough
|
|
|
|
|
resize(output, outlen);
|
|
|
|
|
char *out = &output[0];
|
|
|
|
|
|
|
|
|
|
// Translate
|
|
|
|
|
while(*input)
|
|
|
|
|
copyFromArray2(arr, input, out);
|
|
|
|
|
|
|
|
|
|
// Make sure that we wrote the correct number of bytes
|
|
|
|
|
assert((out-&output[0]) == (int)outlen);
|
|
|
|
|
|
|
|
|
|
// And make extra sure the output is null terminated
|
|
|
|
|
assert(output.size() > outlen);
|
|
|
|
|
assert(output[outlen] == 0);
|
|
|
|
|
|
|
|
|
|
// Return a string
|
|
|
|
|
return std::string(&output[0], outlen);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Map an encoding name to its FromType value: "win1250" and "win1251"
// select the corresponding code pages, any other name selects
// WINDOWS_1252.
ToUTF8::FromType ToUTF8::calculateEncoding(const std::string& encodingName)
{
    if (encodingName == "win1250")
        return ToUTF8::WINDOWS_1250;
    else if (encodingName == "win1251")
        return ToUTF8::WINDOWS_1251;
    else
        return ToUTF8::WINDOWS_1252;
}
|
|
|
|
|
|
|
|
|
|
// Return the human-readable message announcing which font encoding is in
// use for the given encoding name. Names other than "win1250" and
// "win1251" yield the default (English) message.
std::string ToUTF8::encodingUsingMessage(const std::string& encodingName)
{
    if (encodingName == "win1250")
        return "Using Central and Eastern European font encoding.";
    else if (encodingName == "win1251")
        return "Using Cyrillic font encoding.";
    else
        return "Using default (English) font encoding.";
}
|
|
|
|
|