From 740e2b5769d85bec22b8478846bd3800d2552049 Mon Sep 17 00:00:00 2001 From: Emanuel Guevel Date: Wed, 2 Jan 2013 23:02:13 +0100 Subject: [PATCH 1/9] components/to_utf8: add class Utf8Encoder --- components/to_utf8/to_utf8.cpp | 795 ++++++++++++++++++++++----------- components/to_utf8/to_utf8.hpp | 67 ++- 2 files changed, 589 insertions(+), 273 deletions(-) diff --git a/components/to_utf8/to_utf8.cpp b/components/to_utf8/to_utf8.cpp index 7db6112475..8ac582b81d 100644 --- a/components/to_utf8/to_utf8.cpp +++ b/components/to_utf8/to_utf8.cpp @@ -2,6 +2,8 @@ #include #include +#include +#include /* This file contains the code to translate from WINDOWS-1252 (native charset used in English version of Morrowind) to UTF-8. The library @@ -46,334 +48,611 @@ static std::vector buf (50*1024); static std::vector output (50*1024); static int size; -// Make sure the given vector is large enough for 'size' bytes, +using namespace ToUTF8; + +Utf8Encoder::Utf8Encoder(void): + mOutput(50*1024) +{ +} + +void Utf8Encoder::setEncoding(const FromType sourceEncoding) +{ + mEncoding = sourceEncoding; + + switch (mEncoding) + { + case ToUTF8::WINDOWS_1252: + { + translationArray = ToUTF8::windows_1252; + break; + } + case ToUTF8::WINDOWS_1250: + { + translationArray = ToUTF8::windows_1250; + break; + } + case ToUTF8::WINDOWS_1251: + { + translationArray = ToUTF8::windows_1251; + break; + } + default: + { + assert(0); + } + } +} + +std::string Utf8Encoder::getUtf8(const char* input, int size) +{ + // Double check that the input string stops at some point (it might + // contain zero terminators before this, inside its own data, which + // is also ok.) + assert(input[size] == 0); + + // TODO: The rest of this function is designed for single-character + // input encodings only. It also assumes that the input the input + // encoding shares its first 128 values (0-127) with ASCII. These + // conditions must be checked again if you add more input encodings + // later. + + // Compute output length, and check for pure ascii input at the same + // time. + bool ascii; + size_t outlen = getLength(input, ascii); + + // If we're pure ascii, then don't bother converting anything. + if(ascii) + return std::string(input, outlen); + + // Make sure the output is large enough + resize(outlen); + char *out = &mOutput[0]; + + // Translate + while (*input) + copyFromArray(*(input++), out); + + // Make sure that we wrote the correct number of bytes + assert((out-&mOutput[0]) == (int)outlen); + + // And make extra sure the output is null terminated + assert(mOutput.size() > outlen); + assert(mOutput[outlen] == 0); + + // Return a string + return std::string(&mOutput[0], outlen); +} + +std::string Utf8Encoder::getLegacyEnc(const char *input, int size) +{ + // Double check that the input string stops at some point (it might + // contain zero terminators before this, inside its own data, which + // is also ok.) + assert(input[size] == 0); + + // TODO: The rest of this function is designed for single-character + // input encodings only. It also assumes that the input the input + // encoding shares its first 128 values (0-127) with ASCII. These + // conditions must be checked again if you add more input encodings + // later. + + // Compute output length, and check for pure ascii input at the same + // time. + bool ascii; + size_t outlen = getLength2(input, ascii); + + // If we're pure ascii, then don't bother converting anything. + if(ascii) + return std::string(input, outlen); + + // Make sure the output is large enough + resize(outlen); + char *out = &mOutput[0]; + + // Translate + while(*input) + copyFromArray2(input, out); + + // Make sure that we wrote the correct number of bytes + assert((out-&mOutput[0]) == (int)outlen); + + // And make extra sure the output is null terminated + assert(mOutput.size() > outlen); + assert(mOutput[outlen] == 0); + + // Return a string + return std::string(&mOutput[0], outlen); +} + +// Make sure the output vector is large enough for 'size' bytes, // including a terminating zero after it. +void Utf8Encoder::resize(size_t size) +{ + if (mOutput.size() <= size) + // Add some extra padding to reduce the chance of having to resize + // again later. + mOutput.resize(3*size); + + // And make sure the string is zero terminated + mOutput[size] = 0; +} + +/** Get the total length length needed to decode the given string with + the given translation array. The arrays are encoded with 6 bytes + per character, with the first giving the length and the next 5 the + actual data. + + The function serves a dual purpose for optimization reasons: it + checks if the input is pure ascii (all values are <= 127). If this + is the case, then the ascii parameter is set to true, and the + caller can optimize for this case. + */ +size_t Utf8Encoder::getLength(const char* input, bool &ascii) +{ + ascii = true; + size_t len = 0; + const char* ptr = input; + unsigned char inp = *ptr; + + // Do away with the ascii part of the string first (this is almost + // always the entire string.) + while (inp && inp < 128) + inp = *(++ptr); + len += (ptr-input); + + // If we're not at the null terminator at this point, then there + // were some non-ascii characters to deal with. Go to slow-mode for + // the rest of the string. + if (inp) + { + ascii = false; + while (inp) + { + // Find the translated length of this character in the + // lookup table. + len += translationArray[inp*6]; + inp = *(++ptr); + } + } + return len; +} + +// Translate one character 'ch' using the translation array 'arr', and +// advance the output pointer accordingly. +void Utf8Encoder::copyFromArray(unsigned char ch, char* &out) +{ + // Optimize for ASCII values + if (ch < 128) + { + *(out++) = ch; + return; + } + + const char *in = translationArray + ch*6; + int len = *(in++); + for (int i=0; i &buf, size_t size) { - if(buf.size() <= size) - // Add some extra padding to reduce the chance of having to resize - // again later. - buf.resize(3*size); + if(buf.size() <= size) + // Add some extra padding to reduce the chance of having to resize + // again later. + buf.resize(3*size); - // And make sure the string is zero terminated - buf[size] = 0; + // And make sure the string is zero terminated + buf[size] = 0; } // This is just used to spew out a reusable input buffer for the // conversion process. char *ToUTF8::getBuffer(int s) { - // Remember the requested size - size = s; - resize(buf, size); - return &buf[0]; + // Remember the requested size + size = s; + resize(buf, size); + return &buf[0]; } /** Get the total length length needed to decode the given string with - the given translation array. The arrays are encoded with 6 bytes - per character, with the first giving the length and the next 5 the - actual data. - - The function serves a dual purpose for optimization reasons: it - checks if the input is pure ascii (all values are <= 127). If this - is the case, then the ascii parameter is set to true, and the - caller can optimize for this case. + the given translation array. The arrays are encoded with 6 bytes + per character, with the first giving the length and the next 5 the + actual data. + + The function serves a dual purpose for optimization reasons: it + checks if the input is pure ascii (all values are <= 127). If this + is the case, then the ascii parameter is set to true, and the + caller can optimize for this case. */ static size_t getLength(const char *arr, const char* input, bool &ascii) { - ascii = true; - size_t len = 0; - const char* ptr = input; - unsigned char inp = *ptr; - - // Do away with the ascii part of the string first (this is almost - // always the entire string.) - while(inp && inp < 128) - inp = *(++ptr); - len += (ptr-input); - - // If we're not at the null terminator at this point, then there - // were some non-ascii characters to deal with. Go to slow-mode for - // the rest of the string. - if(inp) + ascii = true; + size_t len = 0; + const char* ptr = input; + unsigned char inp = *ptr; + + // Do away with the ascii part of the string first (this is almost + // always the entire string.) + while(inp && inp < 128) + inp = *(++ptr); + len += (ptr-input); + + // If we're not at the null terminator at this point, then there + // were some non-ascii characters to deal with. Go to slow-mode for + // the rest of the string. + if(inp) { - ascii = false; - while(inp) + ascii = false; + while(inp) { - // Find the translated length of this character in the - // lookup table. - len += arr[inp*6]; - inp = *(++ptr); + // Find the translated length of this character in the + // lookup table. + len += arr[inp*6]; + inp = *(++ptr); } } - return len; + return len; } // Translate one character 'ch' using the translation array 'arr', and // advance the output pointer accordingly. static void copyFromArray(const char *arr, unsigned char ch, char* &out) { - // Optimize for ASCII values - if(ch < 128) + // Optimize for ASCII values + if(ch < 128) { - *(out++) = ch; - return; + *(out++) = ch; + return; } - const char *in = arr + ch*6; - int len = *(in++); - for(int i=0; i outlen); - assert(output[outlen] == 0); - - // Return a string - return std::string(&output[0], outlen); + + // Double check that the input string stops at some point (it might + // contain zero terminators before this, inside its own data, which + // is also ok.) + const char* input = &buf[0]; + assert(input[size] == 0); + + // TODO: The rest of this function is designed for single-character + // input encodings only. It also assumes that the input the input + // encoding shares its first 128 values (0-127) with ASCII. These + // conditions must be checked again if you add more input encodings + // later. + + // Compute output length, and check for pure ascii input at the same + // time. + bool ascii; + size_t outlen = getLength(arr, input, ascii); + + // If we're pure ascii, then don't bother converting anything. + if(ascii) + return std::string(input, outlen); + + // Make sure the output is large enough + resize(output, outlen); + char *out = &output[0]; + + // Translate + while(*input) + copyFromArray(arr, *(input++), out); + + // Make sure that we wrote the correct number of bytes + assert((out-&output[0]) == (int)outlen); + + // And make extra sure the output is null terminated + assert(output.size() > outlen); + assert(output[outlen] == 0); + + // Return a string + return std::string(&output[0], outlen); } static size_t getLength2(const char *arr, const char* input, bool &ascii) { - ascii = true; - size_t len = 0; - const char* ptr = input; - unsigned char inp = *ptr; - - // Do away with the ascii part of the string first (this is almost - // always the entire string.) - while(inp && inp < 128) - inp = *(++ptr); - len += (ptr-input); - - // If we're not at the null terminator at this point, then there - // were some non-ascii characters to deal with. Go to slow-mode for - // the rest of the string. - if(inp) + ascii = true; + size_t len = 0; + const char* ptr = input; + unsigned char inp = *ptr; + + // Do away with the ascii part of the string first (this is almost + // always the entire string.) + while(inp && inp < 128) + inp = *(++ptr); + len += (ptr-input); + + // If we're not at the null terminator at this point, then there + // were some non-ascii characters to deal with. Go to slow-mode for + // the rest of the string. + if(inp) { - ascii = false; - while(inp) + ascii = false; + while(inp) { len += 1; - // Find the translated length of this character in the - // lookup table. + // Find the translated length of this character in the + // lookup table. switch(inp) { - case 0xe2: len -= 2; break; - case 0xc2: - case 0xcb: - case 0xc4: - case 0xc6: - case 0xc3: - case 0xd0: - case 0xd1: - case 0xd2: - case 0xc5: len -= 1; break; + case 0xe2: len -= 2; break; + case 0xc2: + case 0xcb: + case 0xc4: + case 0xc6: + case 0xc3: + case 0xd0: + case 0xd1: + case 0xd2: + case 0xc5: len -= 1; break; } - inp = *(++ptr); + inp = *(++ptr); } } - return len; + return len; } -#include -#include - static void copyFromArray2(const char *arr, char*& chp, char* &out) { unsigned char ch = *(chp++); - // Optimize for ASCII values - if(ch < 128) + // Optimize for ASCII values + if(ch < 128) { - *(out++) = ch; - return; + *(out++) = ch; + return; } - int len = 1; - switch (ch) - { - case 0xe2: len = 3; break; - case 0xc2: - case 0xcb: - case 0xc4: - case 0xc6: - case 0xc3: - case 0xd0: - case 0xd1: - case 0xd2: - case 0xc5: len = 2; break; - } - - if (len == 1) // There is no 1 length utf-8 glyph that is not 0x20 (empty space) - { - *(out++) = ch; - return; - } - - unsigned char ch2 = *(chp++); - unsigned char ch3 = '\0'; - if (len == 3) - ch3 = *(chp++); - - for (int i = 128; i < 256; i++) - { - unsigned char b1 = arr[i*6 + 1], b2 = arr[i*6 + 2], b3 = arr[i*6 + 3]; - if (b1 == ch && b2 == ch2 && (len != 3 || b3 == ch3)) - { - *(out++) = (char)i; - return; - } - } - - std::cout << "Could not find glyph " << std::hex << (int)ch << " " << (int)ch2 << " " << (int)ch3 << std::endl; - - *(out++) = ch; // Could not find glyph, just put whatever -} - -std::string ToUTF8::getLegacyEnc(ToUTF8::FromType to) -{ - // Pick translation array - const char *arr; - switch (to) - { - case ToUTF8::WINDOWS_1252: + int len = 1; + switch (ch) { - arr = ToUTF8::windows_1252; - break; + case 0xe2: len = 3; break; + case 0xc2: + case 0xcb: + case 0xc4: + case 0xc6: + case 0xc3: + case 0xd0: + case 0xd1: + case 0xd2: + case 0xc5: len = 2; break; } - case ToUTF8::WINDOWS_1250: + + if (len == 1) // There is no 1 length utf-8 glyph that is not 0x20 (empty space) { - arr = ToUTF8::windows_1250; - break; + *(out++) = ch; + return; } - case ToUTF8::WINDOWS_1251: + + unsigned char ch2 = *(chp++); + unsigned char ch3 = '\0'; + if (len == 3) + ch3 = *(chp++); + + for (int i = 128; i < 256; i++) { - arr = ToUTF8::windows_1251; - break; + unsigned char b1 = arr[i*6 + 1], b2 = arr[i*6 + 2], b3 = arr[i*6 + 3]; + if (b1 == ch && b2 == ch2 && (len != 3 || b3 == ch3)) + { + *(out++) = (char)i; + return; + } } - default: + + std::cout << "Could not find glyph " << std::hex << (int)ch << " " << (int)ch2 << " " << (int)ch3 << std::endl; + + *(out++) = ch; // Could not find glyph, just put whatever +} + +std::string ToUTF8::getLegacyEnc(ToUTF8::FromType to) +{ + // Pick translation array + const char *arr; + switch (to) { - assert(0); + case ToUTF8::WINDOWS_1252: + { + arr = ToUTF8::windows_1252; + break; + } + case ToUTF8::WINDOWS_1250: + { + arr = ToUTF8::windows_1250; + break; + } + case ToUTF8::WINDOWS_1251: + { + arr = ToUTF8::windows_1251; + break; + } + default: + { + assert(0); + } } - } - - // Double check that the input string stops at some point (it might - // contain zero terminators before this, inside its own data, which - // is also ok.) - char* input = &buf[0]; - assert(input[size] == 0); - - // TODO: The rest of this function is designed for single-character - // input encodings only. It also assumes that the input the input - // encoding shares its first 128 values (0-127) with ASCII. These - // conditions must be checked again if you add more input encodings - // later. - - // Compute output length, and check for pure ascii input at the same - // time. - bool ascii; - size_t outlen = getLength2(arr, input, ascii); - - // If we're pure ascii, then don't bother converting anything. - if(ascii) - return std::string(input, outlen); - - // Make sure the output is large enough - resize(output, outlen); - char *out = &output[0]; - - // Translate - while(*input) - copyFromArray2(arr, input, out); - - // Make sure that we wrote the correct number of bytes - assert((out-&output[0]) == (int)outlen); - - // And make extra sure the output is null terminated - assert(output.size() > outlen); - assert(output[outlen] == 0); - - // Return a string - return std::string(&output[0], outlen); + + // Double check that the input string stops at some point (it might + // contain zero terminators before this, inside its own data, which + // is also ok.) + char* input = &buf[0]; + assert(input[size] == 0); + + // TODO: The rest of this function is designed for single-character + // input encodings only. It also assumes that the input the input + // encoding shares its first 128 values (0-127) with ASCII. These + // conditions must be checked again if you add more input encodings + // later. + + // Compute output length, and check for pure ascii input at the same + // time. + bool ascii; + size_t outlen = getLength2(arr, input, ascii); + + // If we're pure ascii, then don't bother converting anything. + if(ascii) + return std::string(input, outlen); + + // Make sure the output is large enough + resize(output, outlen); + char *out = &output[0]; + + // Translate + while(*input) + copyFromArray2(arr, input, out); + + // Make sure that we wrote the correct number of bytes + assert((out-&output[0]) == (int)outlen); + + // And make extra sure the output is null terminated + assert(output.size() > outlen); + assert(output[outlen] == 0); + + // Return a string + return std::string(&output[0], outlen); } ToUTF8::FromType ToUTF8::calculateEncoding(const std::string& encodingName) { - if (encodingName == "win1250") - return ToUTF8::WINDOWS_1250; - else if (encodingName == "win1251") - return ToUTF8::WINDOWS_1251; - else - return ToUTF8::WINDOWS_1252; + if (encodingName == "win1250") + return ToUTF8::WINDOWS_1250; + else if (encodingName == "win1251") + return ToUTF8::WINDOWS_1251; + else + return ToUTF8::WINDOWS_1252; } std::string ToUTF8::encodingUsingMessage(const std::string& encodingName) { - if (encodingName == "win1250") - return "Using Central and Eastern European font encoding."; - else if (encodingName == "win1251") - return "Using Cyrillic font encoding."; - else - return "Using default (English) font encoding."; + if (encodingName == "win1250") + return "Using Central and Eastern European font encoding."; + else if (encodingName == "win1251") + return "Using Cyrillic font encoding."; + else + return "Using default (English) font encoding."; } diff --git a/components/to_utf8/to_utf8.hpp b/components/to_utf8/to_utf8.hpp index f52ae73bd8..6877e2dc17 100644 --- a/components/to_utf8/to_utf8.hpp +++ b/components/to_utf8/to_utf8.hpp @@ -2,29 +2,66 @@ #define COMPONENTS_TOUTF8_H #include +#include +#include namespace ToUTF8 { - // These are all the currently supported code pages - enum FromType + // These are all the currently supported code pages + enum FromType { - WINDOWS_1250, // Central ane Eastern European languages - WINDOWS_1251, // Cyrillic languages - WINDOWS_1252 // Used by English version of Morrowind (and - // probably others) + WINDOWS_1250, // Central ane Eastern European languages + WINDOWS_1251, // Cyrillic languages + WINDOWS_1252 // Used by English version of Morrowind (and + // probably others) }; - // Return a writable buffer of at least 'size' bytes. The buffer - // does not have to be freed. - char* getBuffer(int size); + // Return a writable buffer of at least 'size' bytes. The buffer + // does not have to be freed. + char* getBuffer(int size); - // Convert the previously written buffer to UTF8 from the given code - // page. - std::string getUtf8(FromType from); - std::string getLegacyEnc(FromType to); + // Convert the previously written buffer to UTF8 from the given code + // page. + std::string getUtf8(FromType from); + std::string getLegacyEnc(FromType to); - FromType calculateEncoding(const std::string& encodingName); - std::string encodingUsingMessage(const std::string& encodingName); + FromType calculateEncoding(const std::string& encodingName); + std::string encodingUsingMessage(const std::string& encodingName); + + // class + + class Utf8Encoder + { + public: + Utf8Encoder(void); + + void setEncoding(const FromType sourceEncoding); + + // Convert to UTF8 from the previously given code page. + std::string getUtf8(const char *input, int size); + inline std::string getUtf8(const std::string &str) + { + return getUtf8(str.c_str(), str.size()); + } + + std::string getLegacyEnc(const char *input, int size); + inline std::string getLegacyEnc(const std::string &str) + { + return getLegacyEnc(str.c_str(), str.size()); + } + + private: + void resize(size_t size); + size_t getLength(const char* input, bool &ascii); + void copyFromArray(unsigned char chp, char* &out); + size_t getLength2(const char* input, bool &ascii); + void copyFromArray2(const char*& chp, char* &out); + + FromType mEncoding; + std::vector mOutput; + int mSize; + char* translationArray; + }; } #endif From 67273fc1777b45d9860fe114689cbdc9836c1f48 Mon Sep 17 00:00:00 2001 From: Emanuel Guevel Date: Wed, 2 Jan 2013 23:39:21 +0100 Subject: [PATCH 2/9] mwiniimporter: use Utf8Encoder --- apps/mwiniimporter/importer.cpp | 12 +++--------- apps/mwiniimporter/importer.hpp | 2 +- 2 files changed, 4 insertions(+), 10 deletions(-) diff --git a/apps/mwiniimporter/importer.cpp b/apps/mwiniimporter/importer.cpp index def70615bc..5c3dedd047 100644 --- a/apps/mwiniimporter/importer.cpp +++ b/apps/mwiniimporter/importer.cpp @@ -649,11 +649,13 @@ MwIniImporter::multistrmap MwIniImporter::loadIniFile(std::string filename) { std::string section(""); MwIniImporter::multistrmap map; boost::iostreams::streamfile(filename.c_str()); + ToUTF8::Utf8Encoder encoder; + encoder.setEncoding(mEncoding); std::string line; while (std::getline(file, line)) { - line = toUTF8(line); + line = encoder.getUtf8(line); // unify Unix-style and Windows file ending if (!(line.empty()) && (line[line.length()-1]) == '\r') { @@ -829,14 +831,6 @@ void MwIniImporter::writeToFile(boost::iostreams::stream #include -#include "../../components/to_utf8/to_utf8.hpp" +#include class MwIniImporter { public: From 9906c3051dddaaf435d55841f5e9d0d392d575c5 Mon Sep 17 00:00:00 2001 From: Emanuel Guevel Date: Thu, 3 Jan 2013 15:01:08 +0100 Subject: [PATCH 3/9] components/translation: use Utf8Encoder --- components/translation/translation.cpp | 6 ++---- components/translation/translation.hpp | 1 + 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/components/translation/translation.cpp b/components/translation/translation.cpp index fb5b038612..002446e4f9 100644 --- a/components/translation/translation.cpp +++ b/components/translation/translation.cpp @@ -50,10 +50,7 @@ namespace Translation if (!line.empty()) { - char* buffer = ToUTF8::getBuffer(line.size() + 1); - //buffer has at least line.size() + 1 bytes, so it must be safe - strcpy(buffer, line.c_str()); - line = ToUTF8::getUtf8(mEncoding); + line = mEncoder.getUtf8(line); size_t tab_pos = line.find('\t'); if (tab_pos != std::string::npos && tab_pos > 0 && tab_pos < line.size() - 1) @@ -107,6 +104,7 @@ namespace Translation void Storage::setEncoding (const ToUTF8::FromType& encoding) { mEncoding = encoding; + mEncoder.setEncoding(encoding); } bool Storage::hasTranslation() const diff --git a/components/translation/translation.hpp b/components/translation/translation.hpp index 80d44d871d..6c3e4df868 100644 --- a/components/translation/translation.hpp +++ b/components/translation/translation.hpp @@ -35,6 +35,7 @@ namespace Translation ToUTF8::FromType mEncoding; + ToUTF8::Utf8Encoder mEncoder; ContainerType mCellNamesTranslations, mTopicIDs, mPhraseForms; }; } From 02bf02f288c407ad639ad15c8bc3a72ae56597b4 Mon Sep 17 00:00:00 2001 From: Emanuel Guevel Date: Thu, 3 Jan 2013 21:15:18 +0100 Subject: [PATCH 4/9] ESMReader, ESMWriter: use Utf8Encoder --- components/esm/esmreader.cpp | 22 +++++++++++++++++++--- components/esm/esmreader.hpp | 6 ++++++ components/esm/esmwriter.cpp | 8 +++----- components/esm/esmwriter.hpp | 1 + 4 files changed, 29 insertions(+), 8 deletions(-) diff --git a/components/esm/esmreader.cpp b/components/esm/esmreader.cpp index 580e006dfe..5cd99a64a2 100644 --- a/components/esm/esmreader.cpp +++ b/components/esm/esmreader.cpp @@ -13,6 +13,11 @@ ESM_Context ESMReader::getContext() return mCtx; } +ESMReader::ESMReader(void): + mBuffer(50*1024) +{ +} + void ESMReader::restoreContext(const ESM_Context &rc) { // Reopen the file if necessary @@ -325,11 +330,21 @@ void ESMReader::getExact(void*x, int size) std::string ESMReader::getString(int size) { - char *ptr = ToUTF8::getBuffer(size); - mEsm->read(ptr, size); + size_t s = size; + if (mBuffer.size() <= s) + // Add some extra padding to reduce the chance of having to resize + // again later. + mBuffer.resize(3*s); + + // And make sure the string is zero terminated + mBuffer[s] = 0; + + // read ESM data + char *ptr = &mBuffer[0]; + getExact(ptr, size); // Convert to UTF8 and return - return ToUTF8::getUtf8(mEncoding); + return mEncoder.getUtf8(ptr, size); } void ESMReader::fail(const std::string &msg) @@ -350,6 +365,7 @@ void ESMReader::fail(const std::string &msg) void ESMReader::setEncoding(const ToUTF8::FromType& encoding) { mEncoding = encoding; + mEncoder.setEncoding(encoding); } } diff --git a/components/esm/esmreader.hpp b/components/esm/esmreader.hpp index 1d0f6f5806..57503aea77 100644 --- a/components/esm/esmreader.hpp +++ b/components/esm/esmreader.hpp @@ -20,6 +20,8 @@ class ESMReader { public: + ESMReader(void); + /************************************************************************* * * Public type definitions @@ -244,9 +246,13 @@ private: // Special file signifier (see SpecialFile enum above) int mSpf; + // Buffer for ESM strings + std::vector mBuffer; + SaveData mSaveData; MasterList mMasters; ToUTF8::FromType mEncoding; + ToUTF8::Utf8Encoder mEncoder; }; } #endif diff --git a/components/esm/esmwriter.cpp b/components/esm/esmwriter.cpp index c1ae064903..a00c7971d0 100644 --- a/components/esm/esmwriter.cpp +++ b/components/esm/esmwriter.cpp @@ -157,12 +157,8 @@ void ESMWriter::writeHString(const std::string& data) write("\0", 1); else { - char *ptr = ToUTF8::getBuffer(data.size()+1); - strncpy(ptr, &data[0], data.size()); - ptr[data.size()] = '\0'; - // Convert to UTF8 and return - std::string ascii = ToUTF8::getLegacyEnc(m_encoding); + std::string ascii = m_encoder.getLegacyEnc(data); write(ascii.c_str(), ascii.size()); } @@ -207,6 +203,8 @@ void ESMWriter::setEncoding(const std::string& encoding) // Default Latin encoding m_encoding = ToUTF8::WINDOWS_1252; } + + m_encoder.setEncoding(m_encoding); } } diff --git a/components/esm/esmwriter.hpp b/components/esm/esmwriter.hpp index d3777be813..20bc5da128 100644 --- a/components/esm/esmwriter.hpp +++ b/components/esm/esmwriter.hpp @@ -95,6 +95,7 @@ private: std::ostream* m_stream; std::streampos m_headerPos; ToUTF8::FromType m_encoding; + ToUTF8::Utf8Encoder m_encoder; int m_recordCount; HEDRstruct m_header; From 0bdf52a0719a756f3be97a988ff33784d02f7fcc Mon Sep 17 00:00:00 2001 From: Emanuel Guevel Date: Thu, 3 Jan 2013 23:21:14 +0100 Subject: [PATCH 5/9] components/to_utf8: keep only Utf8Encoder --- components/to_utf8/to_utf8.cpp | 314 --------------------------------- components/to_utf8/to_utf8.hpp | 9 - 2 files changed, 323 deletions(-) diff --git a/components/to_utf8/to_utf8.cpp b/components/to_utf8/to_utf8.cpp index 8ac582b81d..5efec36a4d 100644 --- a/components/to_utf8/to_utf8.cpp +++ b/components/to_utf8/to_utf8.cpp @@ -41,13 +41,6 @@ // Generated tables #include "tables_gen.hpp" -// Shared global buffers, we love you. These initial sizes are large -// enough to hold the largest books in Morrowind.esm, but we will -// resize automaticall if necessary. -static std::vector buf (50*1024); -static std::vector output (50*1024); -static int size; - using namespace ToUTF8; Utf8Encoder::Utf8Encoder(void): @@ -330,313 +323,6 @@ void Utf8Encoder::copyFromArray2(const char*& chp, char* &out) *(out++) = ch; // Could not find glyph, just put whatever } -static void resize(std::vector &buf, size_t size) -{ - if(buf.size() <= size) - // Add some extra padding to reduce the chance of having to resize - // again later. - buf.resize(3*size); - - // And make sure the string is zero terminated - buf[size] = 0; -} - -// This is just used to spew out a reusable input buffer for the -// conversion process. -char *ToUTF8::getBuffer(int s) -{ - // Remember the requested size - size = s; - resize(buf, size); - return &buf[0]; -} - -/** Get the total length length needed to decode the given string with - the given translation array. The arrays are encoded with 6 bytes - per character, with the first giving the length and the next 5 the - actual data. - - The function serves a dual purpose for optimization reasons: it - checks if the input is pure ascii (all values are <= 127). If this - is the case, then the ascii parameter is set to true, and the - caller can optimize for this case. - */ -static size_t getLength(const char *arr, const char* input, bool &ascii) -{ - ascii = true; - size_t len = 0; - const char* ptr = input; - unsigned char inp = *ptr; - - // Do away with the ascii part of the string first (this is almost - // always the entire string.) - while(inp && inp < 128) - inp = *(++ptr); - len += (ptr-input); - - // If we're not at the null terminator at this point, then there - // were some non-ascii characters to deal with. Go to slow-mode for - // the rest of the string. - if(inp) - { - ascii = false; - while(inp) - { - // Find the translated length of this character in the - // lookup table. - len += arr[inp*6]; - inp = *(++ptr); - } - } - return len; -} - -// Translate one character 'ch' using the translation array 'arr', and -// advance the output pointer accordingly. -static void copyFromArray(const char *arr, unsigned char ch, char* &out) -{ - // Optimize for ASCII values - if(ch < 128) - { - *(out++) = ch; - return; - } - - const char *in = arr + ch*6; - int len = *(in++); - for(int i=0; i outlen); - assert(output[outlen] == 0); - - // Return a string - return std::string(&output[0], outlen); -} - -static size_t getLength2(const char *arr, const char* input, bool &ascii) -{ - ascii = true; - size_t len = 0; - const char* ptr = input; - unsigned char inp = *ptr; - - // Do away with the ascii part of the string first (this is almost - // always the entire string.) - while(inp && inp < 128) - inp = *(++ptr); - len += (ptr-input); - - // If we're not at the null terminator at this point, then there - // were some non-ascii characters to deal with. Go to slow-mode for - // the rest of the string. - if(inp) - { - ascii = false; - while(inp) - { - len += 1; - // Find the translated length of this character in the - // lookup table. - switch(inp) - { - case 0xe2: len -= 2; break; - case 0xc2: - case 0xcb: - case 0xc4: - case 0xc6: - case 0xc3: - case 0xd0: - case 0xd1: - case 0xd2: - case 0xc5: len -= 1; break; - } - - inp = *(++ptr); - } - } - return len; -} - -static void copyFromArray2(const char *arr, char*& chp, char* &out) -{ - unsigned char ch = *(chp++); - // Optimize for ASCII values - if(ch < 128) - { - *(out++) = ch; - return; - } - - int len = 1; - switch (ch) - { - case 0xe2: len = 3; break; - case 0xc2: - case 0xcb: - case 0xc4: - case 0xc6: - case 0xc3: - case 0xd0: - case 0xd1: - case 0xd2: - case 0xc5: len = 2; break; - } - - if (len == 1) // There is no 1 length utf-8 glyph that is not 0x20 (empty space) - { - *(out++) = ch; - return; - } - - unsigned char ch2 = *(chp++); - unsigned char ch3 = '\0'; - if (len == 3) - ch3 = *(chp++); - - for (int i = 128; i < 256; i++) - { - unsigned char b1 = arr[i*6 + 1], b2 = arr[i*6 + 2], b3 = arr[i*6 + 3]; - if (b1 == ch && b2 == ch2 && (len != 3 || b3 == ch3)) - { - *(out++) = (char)i; - return; - } - } - - std::cout << "Could not find glyph " << std::hex << (int)ch << " " << (int)ch2 << " " << (int)ch3 << std::endl; - - *(out++) = ch; // Could not find glyph, just put whatever -} - -std::string ToUTF8::getLegacyEnc(ToUTF8::FromType to) -{ - // Pick translation array - const char *arr; - switch (to) - { - case ToUTF8::WINDOWS_1252: - { - arr = ToUTF8::windows_1252; - break; - } - case ToUTF8::WINDOWS_1250: - { - arr = ToUTF8::windows_1250; - break; - } - case ToUTF8::WINDOWS_1251: - { - arr = ToUTF8::windows_1251; - break; - } - default: - { - assert(0); - } - } - - // Double check that the input string stops at some point (it might - // contain zero terminators before this, inside its own data, which - // is also ok.) - char* input = &buf[0]; - assert(input[size] == 0); - - // TODO: The rest of this function is designed for single-character - // input encodings only. It also assumes that the input the input - // encoding shares its first 128 values (0-127) with ASCII. These - // conditions must be checked again if you add more input encodings - // later. - - // Compute output length, and check for pure ascii input at the same - // time. - bool ascii; - size_t outlen = getLength2(arr, input, ascii); - - // If we're pure ascii, then don't bother converting anything. - if(ascii) - return std::string(input, outlen); - - // Make sure the output is large enough - resize(output, outlen); - char *out = &output[0]; - - // Translate - while(*input) - copyFromArray2(arr, input, out); - - // Make sure that we wrote the correct number of bytes - assert((out-&output[0]) == (int)outlen); - - // And make extra sure the output is null terminated - assert(output.size() > outlen); - assert(output[outlen] == 0); - - // Return a string - return std::string(&output[0], outlen); -} - ToUTF8::FromType ToUTF8::calculateEncoding(const std::string& encodingName) { if (encodingName == "win1250") diff --git a/components/to_utf8/to_utf8.hpp b/components/to_utf8/to_utf8.hpp index 6877e2dc17..bfba8a1ac4 100644 --- a/components/to_utf8/to_utf8.hpp +++ b/components/to_utf8/to_utf8.hpp @@ -16,15 +16,6 @@ namespace ToUTF8 // probably others) }; - // Return a writable buffer of at least 'size' bytes. The buffer - // does not have to be freed. - char* getBuffer(int size); - - // Convert the previously written buffer to UTF8 from the given code - // page. - std::string getUtf8(FromType from); - std::string getLegacyEnc(FromType to); - FromType calculateEncoding(const std::string& encodingName); std::string encodingUsingMessage(const std::string& encodingName); From c947d87ab9e9510e322d8cb030f42b64f6f07dc4 Mon Sep 17 00:00:00 2001 From: Emanuel Guevel Date: Fri, 4 Jan 2013 15:10:30 +0100 Subject: [PATCH 6/9] Add a test for to_utf8 component --- components/to_utf8/tests/.gitignore | 1 + .../to_utf8/tests/output/to_utf8_test.out | 4 ++ components/to_utf8/tests/test.sh | 18 ++++++ components/to_utf8/tests/to_utf8_test.cpp | 61 +++++++++++++++++++ 4 files changed, 84 insertions(+) create mode 100644 components/to_utf8/tests/.gitignore create mode 100644 components/to_utf8/tests/output/to_utf8_test.out create mode 100755 components/to_utf8/tests/test.sh create mode 100644 components/to_utf8/tests/to_utf8_test.cpp diff --git a/components/to_utf8/tests/.gitignore b/components/to_utf8/tests/.gitignore new file mode 100644 index 0000000000..8144904045 --- /dev/null +++ b/components/to_utf8/tests/.gitignore @@ -0,0 +1 @@ +*_test diff --git a/components/to_utf8/tests/output/to_utf8_test.out b/components/to_utf8/tests/output/to_utf8_test.out new file mode 100644 index 0000000000..dcb32359ab --- /dev/null +++ b/components/to_utf8/tests/output/to_utf8_test.out @@ -0,0 +1,4 @@ +original: Без вопросов отдаете ему рулет, зная, что позже вы сможете привести с собой своих друзей и тогда он получит по заслугам? +converted: Без вопросов отдаете ему рулет, зная, что позже вы сможете привести с собой своих друзей и тогда он получит по заслугам? +original: Vous lui donnez le gâteau sans protester avant d’aller chercher tous vos amis et de revenir vous venger. +converted: Vous lui donnez le gâteau sans protester avant d’aller chercher tous vos amis et de revenir vous venger. diff --git a/components/to_utf8/tests/test.sh b/components/to_utf8/tests/test.sh new file mode 100755 index 0000000000..2d07708adc --- /dev/null +++ b/components/to_utf8/tests/test.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +make || exit + +mkdir -p output + +PROGS=*_test + +for a in $PROGS; do + if [ -f "output/$a.out" ]; then + echo "Running $a:" + ./$a | diff output/$a.out - + else + echo "Creating $a.out" + ./$a > "output/$a.out" + git add "output/$a.out" + fi +done diff --git a/components/to_utf8/tests/to_utf8_test.cpp b/components/to_utf8/tests/to_utf8_test.cpp new file mode 100644 index 0000000000..8c25c483e0 --- /dev/null +++ b/components/to_utf8/tests/to_utf8_test.cpp @@ -0,0 +1,61 @@ +#include +#include +#include +#include +#include + +#include "../to_utf8.hpp" + +std::string getFirstLine(const std::string &filename); +void testEncoder(ToUTF8::FromType encoding, const std::string &legacyEncFile, + const std::string &utf8File); + +/// Test character encoding conversion to and from UTF-8 +void testEncoder(ToUTF8::FromType encoding, const std::string &legacyEncFile, + const std::string &utf8File) +{ + // get some test data + std::string legacyEncLine = getFirstLine(legacyEncFile); + std::string utf8Line = getFirstLine(utf8File); + + // create an encoder for specified character encoding + ToUTF8::Utf8Encoder encoder; + encoder.setEncoding(encoding); + + // convert text to UTF-8 + std::string convertedUtf8Line = encoder.getUtf8(legacyEncLine); + + std::cout << "original: " << utf8Line << std::endl; + std::cout << "converted: " << convertedUtf8Line << std::endl; + + // check correctness + assert(convertedUtf8Line == utf8Line); + + // convert UTF-8 text to legacy encoding + std::string convertedLegacyEncLine = encoder.getLegacyEnc(utf8Line); + // check correctness + assert(convertedLegacyEncLine == legacyEncLine); +} + +std::string getFirstLine(const std::string &filename) +{ + std::string line; + std::ifstream text (filename.c_str()); + + if (!text.is_open()) + { + throw std::runtime_error("Unable to open file " + filename); + } + + std::getline(text, line); + text.close(); + + return line; +} + +int main() +{ + testEncoder(ToUTF8::WINDOWS_1251, "data/russian-win1251.txt", "data/russian-utf8.txt"); + testEncoder(ToUTF8::WINDOWS_1252, "data/french-win1252.txt", "data/french-utf8.txt"); + return 0; +} From cc792da85895c85db8b76a5c9692bc260f35f649 Mon Sep 17 00:00:00 2001 From: Emanuel Guevel Date: Fri, 4 Jan 2013 15:24:07 +0100 Subject: [PATCH 7/9] Fix to_utf8 test: add test data directory and remove unused include --- components/to_utf8/tests/test_data/french-utf8.txt | 1 + components/to_utf8/tests/test_data/french-win1252.txt | 1 + components/to_utf8/tests/test_data/russian-utf8.txt | 1 + components/to_utf8/tests/test_data/russian-win1251.txt | 1 + components/to_utf8/tests/to_utf8_test.cpp | 5 ++--- 5 files changed, 6 insertions(+), 3 deletions(-) create mode 100644 components/to_utf8/tests/test_data/french-utf8.txt create mode 100644 components/to_utf8/tests/test_data/french-win1252.txt create mode 100644 components/to_utf8/tests/test_data/russian-utf8.txt create mode 100644 components/to_utf8/tests/test_data/russian-win1251.txt diff --git a/components/to_utf8/tests/test_data/french-utf8.txt b/components/to_utf8/tests/test_data/french-utf8.txt new file mode 100644 index 0000000000..aaaccac737 --- /dev/null +++ b/components/to_utf8/tests/test_data/french-utf8.txt @@ -0,0 +1 @@ +Vous lui donnez le gâteau sans protester avant d’aller chercher tous vos amis et de revenir vous venger. \ No newline at end of file diff --git a/components/to_utf8/tests/test_data/french-win1252.txt b/components/to_utf8/tests/test_data/french-win1252.txt new file mode 100644 index 0000000000..1de4593e94 --- /dev/null +++ b/components/to_utf8/tests/test_data/french-win1252.txt @@ -0,0 +1 @@ +Vous lui donnez le gteau sans protester avant daller chercher tous vos amis et de revenir vous venger. \ No newline at end of file diff --git a/components/to_utf8/tests/test_data/russian-utf8.txt b/components/to_utf8/tests/test_data/russian-utf8.txt new file mode 100644 index 0000000000..eb20b32dd6 --- /dev/null +++ b/components/to_utf8/tests/test_data/russian-utf8.txt @@ -0,0 +1 @@ +Без вопросов отдаете ему рулет, зная, что позже вы сможете привести с собой своих друзей и тогда он получит по заслугам? \ No newline at end of file diff --git a/components/to_utf8/tests/test_data/russian-win1251.txt b/components/to_utf8/tests/test_data/russian-win1251.txt new file mode 100644 index 0000000000..086e57edd6 --- /dev/null +++ b/components/to_utf8/tests/test_data/russian-win1251.txt @@ -0,0 +1 @@ + , , ? \ No newline at end of file diff --git a/components/to_utf8/tests/to_utf8_test.cpp b/components/to_utf8/tests/to_utf8_test.cpp index 8c25c483e0..4bba0cf90d 100644 --- a/components/to_utf8/tests/to_utf8_test.cpp +++ b/components/to_utf8/tests/to_utf8_test.cpp @@ -2,7 +2,6 @@ #include #include #include -#include #include "../to_utf8.hpp" @@ -55,7 +54,7 @@ std::string getFirstLine(const std::string &filename) int main() { - testEncoder(ToUTF8::WINDOWS_1251, "data/russian-win1251.txt", "data/russian-utf8.txt"); - testEncoder(ToUTF8::WINDOWS_1252, "data/french-win1252.txt", "data/french-utf8.txt"); + testEncoder(ToUTF8::WINDOWS_1251, "test_data/russian-win1251.txt", "test_data/russian-utf8.txt"); + testEncoder(ToUTF8::WINDOWS_1252, "test_data/french-win1252.txt", "test_data/french-utf8.txt"); return 0; } From 63f09462fd1395f1550df11eb400e0e81959c14b Mon Sep 17 00:00:00 2001 From: Emanuel Guevel Date: Sun, 6 Jan 2013 01:37:58 +0100 Subject: [PATCH 8/9] to_utf8, Utf8Encoder: pass encoding as constructor parameter Edit other files accordingly. --- apps/esmtool/esmtool.cpp | 25 ++++++++----------------- apps/launcher/model/datafilesmodel.cpp | 6 ++++-- apps/mwiniimporter/importer.cpp | 3 +-- apps/openmw/engine.cpp | 7 +++++-- apps/openmw/mwworld/worldimp.cpp | 4 ++-- apps/openmw/mwworld/worldimp.hpp | 2 +- components/esm/esmreader.cpp | 7 +++---- components/esm/esmreader.hpp | 7 +++---- components/esm/esmwriter.cpp | 20 +++----------------- components/esm/esmwriter.hpp | 8 +++----- components/to_utf8/to_utf8.cpp | 10 ++-------- components/to_utf8/to_utf8.hpp | 6 +----- components/translation/translation.cpp | 7 +++---- components/translation/translation.hpp | 5 ++--- 14 files changed, 41 insertions(+), 76 deletions(-) diff --git a/apps/esmtool/esmtool.cpp b/apps/esmtool/esmtool.cpp index 0cd6e39053..fbfc884d70 100644 --- a/apps/esmtool/esmtool.cpp +++ b/apps/esmtool/esmtool.cpp @@ -165,23 +165,12 @@ bool parseOptions (int argc, char** argv, Arguments &info) // Font encoding settings info.encoding = variables["encoding"].as(); - if (info.encoding == "win1250") + if(info.encoding != "win1250" && info.encoding != "win1251" && info.encoding != "win1252") { - std::cout << "Using Central and Eastern European font encoding." << std::endl; - } - else if (info.encoding == "win1251") - { - std::cout << "Using Cyrillic font encoding." << std::endl; - } - else - { - if(info.encoding != "win1252") - { - std::cout << info.encoding << " is not a valid encoding option." << std::endl; - info.encoding = "win1252"; - } - std::cout << "Using default (English) font encoding." << std::endl; + std::cout << info.encoding << " is not a valid encoding option." << std::endl; + info.encoding = "win1252"; } + std::cout << ToUTF8::encodingUsingMessage(info.encoding) << std::endl; return true; } @@ -262,7 +251,8 @@ void printRaw(ESM::ESMReader &esm) int load(Arguments& info) { ESM::ESMReader& esm = info.reader; - esm.setEncoding(ToUTF8::calculateEncoding(info.encoding)); + ToUTF8::Utf8Encoder encoder (ToUTF8::calculateEncoding(info.encoding)); + esm.setEncoder(&encoder); std::string filename = info.filename; std::cout << "Loading file: " << filename << std::endl; @@ -432,7 +422,8 @@ int clone(Arguments& info) std::cout << std::endl << "Saving records to: " << info.outname << "..." << std::endl; ESM::ESMWriter& esm = info.writer; - esm.setEncoding(info.encoding); + ToUTF8::Utf8Encoder encoder (ToUTF8::calculateEncoding(info.encoding)); + esm.setEncoder(&encoder); esm.setAuthor(info.data.author); esm.setDescription(info.data.description); esm.setVersion(info.data.version); diff --git a/apps/launcher/model/datafilesmodel.cpp b/apps/launcher/model/datafilesmodel.cpp index 716c9e9026..e84dbe0acc 100644 --- a/apps/launcher/model/datafilesmodel.cpp +++ b/apps/launcher/model/datafilesmodel.cpp @@ -272,7 +272,8 @@ void DataFilesModel::addMasters(const QString &path) foreach (const QString &path, dir.entryList()) { try { ESM::ESMReader fileReader; - fileReader.setEncoding(ToUTF8::calculateEncoding(mEncoding.toStdString())); + ToUTF8::Utf8Encoder encoder (ToUTF8::calculateEncoding(mEncoding.toStdString())); + fileReader.setEncoder(&encoder); fileReader.open(dir.absoluteFilePath(path).toStdString()); ESM::ESMReader::MasterList mlist = fileReader.getMasters(); @@ -335,7 +336,8 @@ void DataFilesModel::addPlugins(const QString &path) try { ESM::ESMReader fileReader; - fileReader.setEncoding(ToUTF8::calculateEncoding(mEncoding.toStdString())); + ToUTF8::Utf8Encoder encoder (ToUTF8::calculateEncoding(mEncoding.toStdString())); + fileReader.setEncoder(&encoder); fileReader.open(dir.absoluteFilePath(path).toStdString()); ESM::ESMReader::MasterList mlist = fileReader.getMasters(); diff --git a/apps/mwiniimporter/importer.cpp b/apps/mwiniimporter/importer.cpp index 5c3dedd047..6a7274e0a1 100644 --- a/apps/mwiniimporter/importer.cpp +++ b/apps/mwiniimporter/importer.cpp @@ -649,8 +649,7 @@ MwIniImporter::multistrmap MwIniImporter::loadIniFile(std::string filename) { std::string section(""); MwIniImporter::multistrmap map; boost::iostreams::streamfile(filename.c_str()); - ToUTF8::Utf8Encoder encoder; - encoder.setEncoding(mEncoding); + ToUTF8::Utf8Encoder encoder(mEncoding); std::string line; while (std::getline(file, line)) { diff --git a/apps/openmw/engine.cpp b/apps/openmw/engine.cpp index e2d28a808d..aadeb7f3a2 100644 --- a/apps/openmw/engine.cpp +++ b/apps/openmw/engine.cpp @@ -331,11 +331,15 @@ void OMW::Engine::go() // cursor replacer (converts the cursor from the bsa so they can be used by mygui) MWGui::CursorReplace replacer; + // Create encoder + ToUTF8::Utf8Encoder encoder (mEncoding); + // Create the world mEnvironment.setWorld (new MWWorld::World (*mOgre, mFileCollections, mMaster, - mResDir, mCfgMgr.getCachePath(), mNewGame, mEncoding, mFallbackMap)); + mResDir, mCfgMgr.getCachePath(), mNewGame, &encoder, mFallbackMap)); //Load translation data + mTranslationDataStorage.setEncoder(&encoder); mTranslationDataStorage.loadTranslationData(mFileCollections, mMaster); // Create window manager - this manages all the MW-specific GUI windows @@ -494,7 +498,6 @@ void OMW::Engine::showFPS(int level) void OMW::Engine::setEncoding(const ToUTF8::FromType& encoding) { mEncoding = encoding; - mTranslationDataStorage.setEncoding (encoding); } void OMW::Engine::setFallbackValues(std::map fallbackMap) diff --git a/apps/openmw/mwworld/worldimp.cpp b/apps/openmw/mwworld/worldimp.cpp index 3995123b0e..4f60f6ef42 100644 --- a/apps/openmw/mwworld/worldimp.cpp +++ b/apps/openmw/mwworld/worldimp.cpp @@ -170,7 +170,7 @@ namespace MWWorld World::World (OEngine::Render::OgreRenderer& renderer, const Files::Collections& fileCollections, const std::string& master, const boost::filesystem::path& resDir, const boost::filesystem::path& cacheDir, bool newGame, - const ToUTF8::FromType& encoding, std::map fallbackMap) + ToUTF8::Utf8Encoder* encoder, std::map fallbackMap) : mPlayer (0), mLocalScripts (mStore), mGlobalVariables (0), mSky (true), mCells (mStore, mEsm), mNumFacing(0) @@ -187,7 +187,7 @@ namespace MWWorld std::cout << "Loading ESM " << masterPath.string() << "\n"; // This parses the ESM file and loads a sample cell - mEsm.setEncoding(encoding); + mEsm.setEncoder(encoder); mEsm.open (masterPath.string()); mStore.load (mEsm); diff --git a/apps/openmw/mwworld/worldimp.hpp b/apps/openmw/mwworld/worldimp.hpp index 71ae9597f8..d29adebf44 100644 --- a/apps/openmw/mwworld/worldimp.hpp +++ b/apps/openmw/mwworld/worldimp.hpp @@ -95,7 +95,7 @@ namespace MWWorld World (OEngine::Render::OgreRenderer& renderer, const Files::Collections& fileCollections, const std::string& master, const boost::filesystem::path& resDir, const boost::filesystem::path& cacheDir, bool newGame, - const ToUTF8::FromType& encoding, std::map fallbackMap); + ToUTF8::Utf8Encoder* encoder, std::map fallbackMap); virtual ~World(); diff --git a/components/esm/esmreader.cpp b/components/esm/esmreader.cpp index 5cd99a64a2..99f7971b18 100644 --- a/components/esm/esmreader.cpp +++ b/components/esm/esmreader.cpp @@ -344,7 +344,7 @@ std::string ESMReader::getString(int size) getExact(ptr, size); // Convert to UTF8 and return - return mEncoder.getUtf8(ptr, size); + return mEncoder->getUtf8(ptr, size); } void ESMReader::fail(const std::string &msg) @@ -362,10 +362,9 @@ void ESMReader::fail(const std::string &msg) throw std::runtime_error(ss.str()); } -void ESMReader::setEncoding(const ToUTF8::FromType& encoding) +void ESMReader::setEncoder(ToUTF8::Utf8Encoder* encoder) { - mEncoding = encoding; - mEncoder.setEncoding(encoding); + mEncoder = encoder; } } diff --git a/components/esm/esmreader.hpp b/components/esm/esmreader.hpp index 57503aea77..d52be25aa2 100644 --- a/components/esm/esmreader.hpp +++ b/components/esm/esmreader.hpp @@ -235,8 +235,8 @@ public: /// Used for error handling void fail(const std::string &msg); - /// Sets font encoding for ESM strings - void setEncoding(const ToUTF8::FromType& encoding); + /// Sets font encoder for ESM strings + void setEncoder(ToUTF8::Utf8Encoder* encoder); private: Ogre::DataStreamPtr mEsm; @@ -251,8 +251,7 @@ private: SaveData mSaveData; MasterList mMasters; - ToUTF8::FromType mEncoding; - ToUTF8::Utf8Encoder mEncoder; + ToUTF8::Utf8Encoder* mEncoder; }; } #endif diff --git a/components/esm/esmwriter.cpp b/components/esm/esmwriter.cpp index a00c7971d0..e2f878a257 100644 --- a/components/esm/esmwriter.cpp +++ b/components/esm/esmwriter.cpp @@ -158,7 +158,7 @@ void ESMWriter::writeHString(const std::string& data) else { // Convert to UTF8 and return - std::string ascii = m_encoder.getLegacyEnc(data); + std::string ascii = m_encoder->getLegacyEnc(data); write(ascii.c_str(), ascii.size()); } @@ -188,23 +188,9 @@ void ESMWriter::write(const char* data, int size) m_stream->write(data, size); } -void ESMWriter::setEncoding(const std::string& encoding) +void ESMWriter::setEncoder(ToUTF8::Utf8Encoder* encoder) { - if (encoding == "win1250") - { - m_encoding = ToUTF8::WINDOWS_1250; - } - else if (encoding == "win1251") - { - m_encoding = ToUTF8::WINDOWS_1251; - } - else - { - // Default Latin encoding - m_encoding = ToUTF8::WINDOWS_1252; - } - - m_encoder.setEncoding(m_encoding); + m_encoder = encoder; } } diff --git a/components/esm/esmwriter.hpp b/components/esm/esmwriter.hpp index 20bc5da128..b557a29ad8 100644 --- a/components/esm/esmwriter.hpp +++ b/components/esm/esmwriter.hpp @@ -6,7 +6,7 @@ #include #include "esmcommon.hpp" -#include "../to_utf8/to_utf8.hpp" +#include namespace ESM { @@ -24,7 +24,7 @@ public: void setVersion(int ver); int getType(); void setType(int type); - void setEncoding(const std::string& encoding); // Write strings as UTF-8? + void setEncoder(ToUTF8::Utf8Encoder *encoding); // Write strings as UTF-8? void setAuthor(const std::string& author); void setDescription(const std::string& desc); @@ -94,12 +94,10 @@ private: std::list m_records; std::ostream* m_stream; std::streampos m_headerPos; - ToUTF8::FromType m_encoding; - ToUTF8::Utf8Encoder m_encoder; + ToUTF8::Utf8Encoder* m_encoder; int m_recordCount; HEDRstruct m_header; - SaveData m_saveData; }; } diff --git a/components/to_utf8/to_utf8.cpp b/components/to_utf8/to_utf8.cpp index 5efec36a4d..275f5483f3 100644 --- a/components/to_utf8/to_utf8.cpp +++ b/components/to_utf8/to_utf8.cpp @@ -43,16 +43,10 @@ using namespace ToUTF8; -Utf8Encoder::Utf8Encoder(void): +Utf8Encoder::Utf8Encoder(const FromType sourceEncoding): mOutput(50*1024) { -} - -void Utf8Encoder::setEncoding(const FromType sourceEncoding) -{ - mEncoding = sourceEncoding; - - switch (mEncoding) + switch (sourceEncoding) { case ToUTF8::WINDOWS_1252: { diff --git a/components/to_utf8/to_utf8.hpp b/components/to_utf8/to_utf8.hpp index bfba8a1ac4..e150cf17ba 100644 --- a/components/to_utf8/to_utf8.hpp +++ b/components/to_utf8/to_utf8.hpp @@ -24,9 +24,7 @@ namespace ToUTF8 class Utf8Encoder { public: - Utf8Encoder(void); - - void setEncoding(const FromType sourceEncoding); + Utf8Encoder(FromType sourceEncoding); // Convert to UTF8 from the previously given code page. std::string getUtf8(const char *input, int size); @@ -48,9 +46,7 @@ namespace ToUTF8 size_t getLength2(const char* input, bool &ascii); void copyFromArray2(const char*& chp, char* &out); - FromType mEncoding; std::vector mOutput; - int mSize; char* translationArray; }; } diff --git a/components/translation/translation.cpp b/components/translation/translation.cpp index 002446e4f9..184cf399cd 100644 --- a/components/translation/translation.cpp +++ b/components/translation/translation.cpp @@ -50,7 +50,7 @@ namespace Translation if (!line.empty()) { - line = mEncoder.getUtf8(line); + line = mEncoder->getUtf8(line); size_t tab_pos = line.find('\t'); if (tab_pos != std::string::npos && tab_pos > 0 && tab_pos < line.size() - 1) @@ -101,10 +101,9 @@ namespace Translation return phrase; } - void Storage::setEncoding (const ToUTF8::FromType& encoding) + void Storage::setEncoder(ToUTF8::Utf8Encoder* encoder) { - mEncoding = encoding; - mEncoder.setEncoding(encoding); + mEncoder = encoder; } bool Storage::hasTranslation() const diff --git a/components/translation/translation.hpp b/components/translation/translation.hpp index 6c3e4df868..bca9ea255c 100644 --- a/components/translation/translation.hpp +++ b/components/translation/translation.hpp @@ -19,7 +19,7 @@ namespace Translation // Standard form usually means nominative case std::string topicStandardForm(const std::string& phrase) const; - void setEncoding (const ToUTF8::FromType& encoding); + void setEncoder(ToUTF8::Utf8Encoder* encoder); bool hasTranslation() const; @@ -34,8 +34,7 @@ namespace Translation void loadDataFromStream(ContainerType& container, std::istream& stream); - ToUTF8::FromType mEncoding; - ToUTF8::Utf8Encoder mEncoder; + ToUTF8::Utf8Encoder* mEncoder; ContainerType mCellNamesTranslations, mTopicIDs, mPhraseForms; }; } From 0b7d11d38d99033a09bed6b22c75378f692653a5 Mon Sep 17 00:00:00 2001 From: Emanuel Guevel Date: Sun, 6 Jan 2013 11:39:18 +0100 Subject: [PATCH 9/9] to_utf8 test: fix Utf8Encoder constructor --- components/to_utf8/tests/to_utf8_test.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/components/to_utf8/tests/to_utf8_test.cpp b/components/to_utf8/tests/to_utf8_test.cpp index 4bba0cf90d..3fcddd1581 100644 --- a/components/to_utf8/tests/to_utf8_test.cpp +++ b/components/to_utf8/tests/to_utf8_test.cpp @@ -18,8 +18,7 @@ void testEncoder(ToUTF8::FromType encoding, const std::string &legacyEncFile, std::string utf8Line = getFirstLine(utf8File); // create an encoder for specified character encoding - ToUTF8::Utf8Encoder encoder; - encoder.setEncoding(encoding); + ToUTF8::Utf8Encoder encoder (encoding); // convert text to UTF-8 std::string convertedUtf8Line = encoder.getUtf8(legacyEncLine);