Merge remote-tracking branch 'potatoesmaster/to_utf8-rewrite'

2013-01-06 13:31:01 +01:00 · 2013-01-06 13:31:01 +01:00 · 25815ab8f7
commit 25815ab8f7
parent 043e6c09fe 0b7d11d38d
23 changed files with 423 additions and 365 deletions
--- a/apps/esmtool/esmtool.cpp
+++ b/apps/esmtool/esmtool.cpp
@ -165,23 +165,12 @@ bool parseOptions (int argc, char** argv, Arguments &info)
    // Font encoding settings
    info.encoding = variables["encoding"].as<std::string>();
-    if (info.encoding == "win1250")
+    if(info.encoding != "win1250" && info.encoding != "win1251" && info.encoding != "win1252")
    {
-        std::cout << "Using Central and Eastern European font encoding." << std::endl;
+        std::cout << info.encoding << " is not a valid encoding option." << std::endl;
-    }
+        info.encoding = "win1252";
    else if (info.encoding == "win1251")
    {
        std::cout << "Using Cyrillic font encoding." << std::endl;
    }
    else
    {
        if(info.encoding != "win1252")
        {
            std::cout << info.encoding << " is not a valid encoding option." << std::endl;
            info.encoding = "win1252";
        }
        std::cout << "Using default (English) font encoding." << std::endl;
    }
    std::cout << ToUTF8::encodingUsingMessage(info.encoding) << std::endl;
    return true;
 }
@ -262,7 +251,8 @@ void printRaw(ESM::ESMReader &esm)
 int load(Arguments& info)
 {
    ESM::ESMReader& esm = info.reader;
-    esm.setEncoding(ToUTF8::calculateEncoding(info.encoding));
+    ToUTF8::Utf8Encoder encoder (ToUTF8::calculateEncoding(info.encoding));
    esm.setEncoder(&encoder);
    std::string filename = info.filename;
    std::cout << "Loading file: " << filename << std::endl;
@ -432,7 +422,8 @@ int clone(Arguments& info)
    std::cout << std::endl << "Saving records to: " << info.outname << "..." << std::endl;
    ESM::ESMWriter& esm = info.writer;
-    esm.setEncoding(info.encoding);
+    ToUTF8::Utf8Encoder encoder (ToUTF8::calculateEncoding(info.encoding));
    esm.setEncoder(&encoder);
    esm.setAuthor(info.data.author);
    esm.setDescription(info.data.description);
    esm.setVersion(info.data.version);
--- a/apps/launcher/model/datafilesmodel.cpp
+++ b/apps/launcher/model/datafilesmodel.cpp
@ -272,7 +272,8 @@ void DataFilesModel::addMasters(const QString &path)
    foreach (const QString &path, dir.entryList()) {
        try {
            ESM::ESMReader fileReader;
-            fileReader.setEncoding(ToUTF8::calculateEncoding(mEncoding.toStdString()));
+            ToUTF8::Utf8Encoder encoder (ToUTF8::calculateEncoding(mEncoding.toStdString()));
            fileReader.setEncoder(&encoder);
            fileReader.open(dir.absoluteFilePath(path).toStdString());
            ESM::ESMReader::MasterList mlist = fileReader.getMasters();
@ -335,7 +336,8 @@ void DataFilesModel::addPlugins(const QString &path)
        try {
            ESM::ESMReader fileReader;
-            fileReader.setEncoding(ToUTF8::calculateEncoding(mEncoding.toStdString()));
+            ToUTF8::Utf8Encoder encoder (ToUTF8::calculateEncoding(mEncoding.toStdString()));
            fileReader.setEncoder(&encoder);
            fileReader.open(dir.absoluteFilePath(path).toStdString());
            ESM::ESMReader::MasterList mlist = fileReader.getMasters();
--- a/apps/mwiniimporter/importer.cpp
+++ b/apps/mwiniimporter/importer.cpp
@ -649,11 +649,12 @@ MwIniImporter::multistrmap MwIniImporter::loadIniFile(std::string filename) {
    std::string section("");
    MwIniImporter::multistrmap map;
    boost::iostreams::stream<boost::iostreams::file_source>file(filename.c_str());
    ToUTF8::Utf8Encoder encoder(mEncoding);
    std::string line;
    while (std::getline(file, line)) {
-        line = toUTF8(line);
+        line = encoder.getUtf8(line);
        // unify Unix-style and Windows file ending
        if (!(line.empty()) && (line[line.length()-1]) == '\r') {
@ -829,14 +830,6 @@ void MwIniImporter::writeToFile(boost::iostreams::stream<boost::iostreams::file_
    }
 }
 std::string MwIniImporter::toUTF8(const std::string &str) {
    char *ptr = ToUTF8::getBuffer(str.length());
    strncpy(ptr, str.c_str(), str.length());
    // Convert to UTF8 and return
    return ToUTF8::getUtf8(mEncoding);
 }
 void MwIniImporter::setInputEncoding(const ToUTF8::FromType &encoding)
 {
  mEncoding = encoding;
--- a/apps/mwiniimporter/importer.hpp
+++ b/apps/mwiniimporter/importer.hpp
@ -8,7 +8,7 @@
 #include <vector>
 #include <exception>
-#include "../../components/to_utf8/to_utf8.hpp"
+#include <components/to_utf8/to_utf8.hpp>
 class MwIniImporter {
  public:
--- a/apps/openmw/engine.cpp
+++ b/apps/openmw/engine.cpp
@ -331,11 +331,15 @@ void OMW::Engine::go()
    // cursor replacer (converts the cursor from the bsa so they can be used by mygui)
    MWGui::CursorReplace replacer;
    // Create encoder
    ToUTF8::Utf8Encoder encoder (mEncoding);
    // Create the world
    mEnvironment.setWorld (new MWWorld::World (*mOgre, mFileCollections, mMaster,
-        mResDir, mCfgMgr.getCachePath(), mNewGame, mEncoding, mFallbackMap));
+        mResDir, mCfgMgr.getCachePath(), mNewGame, &encoder, mFallbackMap));
    //Load translation data
    mTranslationDataStorage.setEncoder(&encoder);
    mTranslationDataStorage.loadTranslationData(mFileCollections, mMaster);
    // Create window manager - this manages all the MW-specific GUI windows
@ -494,7 +498,6 @@ void OMW::Engine::showFPS(int level)
 void OMW::Engine::setEncoding(const ToUTF8::FromType& encoding)
 {
    mEncoding = encoding;
    mTranslationDataStorage.setEncoding (encoding);
 }
 void OMW::Engine::setFallbackValues(std::map<std::string,std::string> fallbackMap)
--- a/apps/openmw/mwworld/worldimp.cpp
+++ b/apps/openmw/mwworld/worldimp.cpp
@ -170,7 +170,7 @@ namespace MWWorld
    World::World (OEngine::Render::OgreRenderer& renderer,
        const Files::Collections& fileCollections,
        const std::string& master, const boost::filesystem::path& resDir, const boost::filesystem::path& cacheDir, bool newGame,
-        const ToUTF8::FromType& encoding, std::map<std::string,std::string> fallbackMap)
+        ToUTF8::Utf8Encoder* encoder, std::map<std::string,std::string> fallbackMap)
    : mPlayer (0), mLocalScripts (mStore), mGlobalVariables (0),
      mSky (true), mCells (mStore, mEsm),
      mNumFacing(0)
@ -187,7 +187,7 @@ namespace MWWorld
        std::cout << "Loading ESM " << masterPath.string() << "\n";
        // This parses the ESM file and loads a sample cell
-        mEsm.setEncoding(encoding);
+        mEsm.setEncoder(encoder);
        mEsm.open (masterPath.string());
        mStore.load (mEsm);
--- a/apps/openmw/mwworld/worldimp.hpp
+++ b/apps/openmw/mwworld/worldimp.hpp
@ -95,7 +95,7 @@ namespace MWWorld
            World (OEngine::Render::OgreRenderer& renderer,
                const Files::Collections& fileCollections,
                const std::string& master, const boost::filesystem::path& resDir, const boost::filesystem::path& cacheDir, bool newGame,
-                const ToUTF8::FromType& encoding, std::map<std::string,std::string> fallbackMap);
+                ToUTF8::Utf8Encoder* encoder, std::map<std::string,std::string> fallbackMap);
            virtual ~World();
--- a/components/esm/esmreader.cpp
+++ b/components/esm/esmreader.cpp
@ -15,6 +15,11 @@ ESM_Context ESMReader::getContext()
    return mCtx;
 }
 /// Construct a reader with a pre-sized string buffer and no encoder attached.
 ///
 /// mBuffer starts at 50 KiB; getString() grows it when a larger string is
 /// requested. mEncoder is explicitly nulled so that a reader on which
 /// setEncoder() was never called holds a well-defined null pointer rather
 /// than an indeterminate one (getString() dereferences mEncoder).
 ESMReader::ESMReader(void):
    mBuffer(50*1024),
    mEncoder(0)
 {
 }
 void ESMReader::restoreContext(const ESM_Context &rc)
 {
    // Reopen the file if necessary
@ -323,11 +328,21 @@ void ESMReader::getExact(void*x, int size)
 std::string ESMReader::getString(int size)
 {
-    char *ptr = ToUTF8::getBuffer(size);
+    size_t s = size;
-    mEsm->read(ptr, size);
+    if (mBuffer.size() <= s)
        // Add some extra padding to reduce the chance of having to resize
        // again later.
        mBuffer.resize(3*s);
    // And make sure the string is zero terminated
    mBuffer[s] = 0;
    // read ESM data
    char *ptr = &mBuffer[0];
    getExact(ptr, size);
    // Convert to UTF8 and return
-    return ToUTF8::getUtf8(mEncoding);
+    return mEncoder->getUtf8(ptr, size);
 }
 void ESMReader::fail(const std::string &msg)
@ -345,9 +360,9 @@ void ESMReader::fail(const std::string &msg)
    throw std::runtime_error(ss.str());
 }
-void ESMReader::setEncoding(const ToUTF8::FromType& encoding)
+void ESMReader::setEncoder(ToUTF8::Utf8Encoder* encoder)
 {
-  mEncoding = encoding;
+    mEncoder = encoder;
 }
 }
--- a/components/esm/esmreader.hpp
+++ b/components/esm/esmreader.hpp
@ -20,6 +20,8 @@ class ESMReader
 {
 public:
  ESMReader(void);
  /*************************************************************************
   *
   *  Public type definitions
@ -233,8 +235,8 @@ public:
  /// Used for error handling
  void fail(const std::string &msg);
-  /// Sets font encoding for ESM strings
+  /// Sets font encoder for ESM strings
-  void setEncoding(const ToUTF8::FromType& encoding);
+  void setEncoder(ToUTF8::Utf8Encoder* encoder);
 private:
  Ogre::DataStreamPtr mEsm;
@ -244,9 +246,12 @@ private:
  // Special file signifier (see SpecialFile enum above)
  int mSpf;
  // Buffer for ESM strings
  std::vector<char> mBuffer;
  SaveData mSaveData;
  MasterList mMasters;
-  ToUTF8::FromType mEncoding;
+  ToUTF8::Utf8Encoder* mEncoder;
 };
 }
 #endif
--- a/components/esm/esmwriter.cpp
+++ b/components/esm/esmwriter.cpp
@ -157,12 +157,8 @@ void ESMWriter::writeHString(const std::string& data)
        write("\0", 1);
    else
    {
        char *ptr = ToUTF8::getBuffer(data.size()+1);
        strncpy(ptr, &data[0], data.size());
        ptr[data.size()] = '\0';
        // Convert to UTF8 and return
-        std::string ascii = ToUTF8::getLegacyEnc(m_encoding);
+        std::string ascii = m_encoder->getLegacyEnc(data);
        write(ascii.c_str(), ascii.size());
    }
@ -192,21 +188,9 @@ void ESMWriter::write(const char* data, int size)
    m_stream->write(data, size);
 }
-void ESMWriter::setEncoding(const std::string& encoding)
+void ESMWriter::setEncoder(ToUTF8::Utf8Encoder* encoder)
 {
-    if (encoding == "win1250")
+    m_encoder = encoder;
    {
        m_encoding = ToUTF8::WINDOWS_1250;
    }
    else if (encoding == "win1251")
    {
        m_encoding = ToUTF8::WINDOWS_1251;
    }
    else
    {
        // Default Latin encoding
        m_encoding = ToUTF8::WINDOWS_1252;
    }
 }
 }
--- a/components/esm/esmwriter.hpp
+++ b/components/esm/esmwriter.hpp
@ -6,7 +6,7 @@
 #include <assert.h>
 #include "esmcommon.hpp"
-#include "../to_utf8/to_utf8.hpp"
+#include <components/to_utf8/to_utf8.hpp>
 namespace ESM {
@ -24,7 +24,7 @@ public:
    void setVersion(int ver);
    int getType();
    void setType(int type);
-    void setEncoding(const std::string& encoding); // Write strings as UTF-8?
+    void setEncoder(ToUTF8::Utf8Encoder *encoding); // Write strings as UTF-8?
    void setAuthor(const std::string& author);
    void setDescription(const std::string& desc);
@ -94,11 +94,10 @@ private:
    std::list<RecordData> m_records;
    std::ostream* m_stream;
    std::streampos m_headerPos;
-    ToUTF8::FromType m_encoding;
+    ToUTF8::Utf8Encoder* m_encoder;
    int m_recordCount;
    HEDRstruct m_header;
    SaveData m_saveData;
 };
 }
--- a/components/to_utf8/tests/.gitignore
+++ b/components/to_utf8/tests/.gitignore
@ -0,0 +1 @@
 *_test
--- a/components/to_utf8/tests/output/to_utf8_test.out
+++ b/components/to_utf8/tests/output/to_utf8_test.out
@ -0,0 +1,4 @@
 original:  Без вопросов отдаете ему рулет, зная, что позже вы сможете привести с собой своих друзей и тогда он получит по заслугам?
 converted: Без вопросов отдаете ему рулет, зная, что позже вы сможете привести с собой своих друзей и тогда он получит по заслугам?
 original:  Vous lui donnez le gâteau sans protester avant d’aller chercher tous vos amis et de revenir vous venger.
 converted: Vous lui donnez le gâteau sans protester avant d’aller chercher tous vos amis et de revenir vous venger.
--- a/components/to_utf8/tests/test.sh
+++ b/components/to_utf8/tests/test.sh
@ -0,0 +1,18 @@
 #!/bin/bash
 # Build every *_test program, then either compare its output with the stored
 # transcript in output/<name>.out or, when no transcript exists yet, record
 # one and stage it with git. Exits non-zero if any comparison fails.
 make || exit
 mkdir -p output
 status=0
 for a in *_test; do
    # An unmatched glob stays literal ("*_test"); skip anything that is not a file.
    [ -f "$a" ] || continue
    if [ -f "output/$a.out" ]; then
        echo "Running $a:"
        # Remember the mismatch but keep going so every test is reported.
        "./$a" | diff "output/$a.out" - || status=1
    else
        echo "Creating $a.out"
        "./$a" > "output/$a.out"
        git add "output/$a.out"
    fi
 done
 exit $status
--- a/components/to_utf8/tests/test_data/french-utf8.txt
+++ b/components/to_utf8/tests/test_data/french-utf8.txt
@ -0,0 +1 @@
 Vous lui donnez le gâteau sans protester avant d’aller chercher tous vos amis et de revenir vous venger.
--- a/components/to_utf8/tests/test_data/french-win1252.txt
+++ b/components/to_utf8/tests/test_data/french-win1252.txt
@ -0,0 +1 @@
 Vous lui donnez le gâteau sans protester avant d’aller chercher tous vos amis et de revenir vous venger.
--- a/components/to_utf8/tests/test_data/russian-utf8.txt
+++ b/components/to_utf8/tests/test_data/russian-utf8.txt
@ -0,0 +1 @@
 Без вопросов отдаете ему рулет, зная, что позже вы сможете привести с собой своих друзей и тогда он получит по заслугам?
--- a/components/to_utf8/tests/test_data/russian-win1251.txt
+++ b/components/to_utf8/tests/test_data/russian-win1251.txt
@ -0,0 +1 @@
 Без вопросов отдаете ему рулет, зная, что позже вы сможете привести с собой своих друзей и тогда он получит по заслугам?
--- a/components/to_utf8/tests/to_utf8_test.cpp
+++ b/components/to_utf8/tests/to_utf8_test.cpp
@ -0,0 +1,59 @@
 #include <iostream>
 #include <fstream>
 #include <cassert>
 #include <stdexcept>
 #include "../to_utf8.hpp"
 std::string getFirstLine(const std::string &filename);
 void testEncoder(ToUTF8::FromType encoding, const std::string &legacyEncFile,
                 const std::string &utf8File);
 /// Test character encoding conversion to and from UTF-8
 void testEncoder(ToUTF8::FromType encoding, const std::string &legacyEncFile,
                 const std::string &utf8File)
 {
    // get some test data
    std::string legacyEncLine = getFirstLine(legacyEncFile);
    std::string utf8Line = getFirstLine(utf8File);
    // create an encoder for specified character encoding
    ToUTF8::Utf8Encoder encoder (encoding);
    // convert text to UTF-8
    std::string convertedUtf8Line = encoder.getUtf8(legacyEncLine);
    std::cout << "original:  " << utf8Line          << std::endl;
    std::cout << "converted: " << convertedUtf8Line << std::endl;
    // check correctness
    assert(convertedUtf8Line == utf8Line);
    // convert UTF-8 text to legacy encoding
    std::string convertedLegacyEncLine = encoder.getLegacyEnc(utf8Line);
    // check correctness
    assert(convertedLegacyEncLine == legacyEncLine);
 }
 /// Return the first line of the file at 'filename' (without the line break).
 /// Throws std::runtime_error if the file cannot be opened; an empty file
 /// yields an empty string.
 std::string getFirstLine(const std::string &filename)
 {
    std::ifstream in (filename.c_str());
    if (!in.is_open())
        throw std::runtime_error("Unable to open file " + filename);
    std::string firstLine;
    std::getline(in, firstLine);
    // The stream closes itself when it goes out of scope.
    return firstLine;
 }
 int main()
 {
    testEncoder(ToUTF8::WINDOWS_1251, "test_data/russian-win1251.txt", "test_data/russian-utf8.txt");
    testEncoder(ToUTF8::WINDOWS_1252, "test_data/french-win1252.txt", "test_data/french-utf8.txt");
    return 0;
 }
--- a/components/to_utf8/to_utf8.cpp
+++ b/components/to_utf8/to_utf8.cpp
@ -2,6 +2,8 @@
 #include <vector>
 #include <cassert>
 #include <iostream>
 #include <iomanip>
 /* This file contains the code to translate from WINDOWS-1252 (native
   charset used in English version of Morrowind) to UTF-8. The library
@ -39,341 +41,298 @@
 // Generated tables
 #include "tables_gen.hpp"
-// Shared global buffers, we love you. These initial sizes are large
+using namespace ToUTF8;
 // enough to hold the largest books in Morrowind.esm, but we will
 // resize automatically if necessary.
 static std::vector<char> buf    (50*1024);
 static std::vector<char> output (50*1024);
 static int size;
-// Make sure the given vector is large enough for 'size' bytes,
+Utf8Encoder::Utf8Encoder(const FromType sourceEncoding):
-// including a terminating zero after it.
+    mOutput(50*1024)
 static void resize(std::vector<char> &buf, size_t size)
 {
-  if(buf.size() <= size)
+    switch (sourceEncoding)
-    // Add some extra padding to reduce the chance of having to resize
+    {
-    // again later.
+        case ToUTF8::WINDOWS_1252:
-    buf.resize(3*size);
+        {
-
+            translationArray = ToUTF8::windows_1252;
-  // And make sure the string is zero terminated
+            break;
-  buf[size] = 0;
+        }
        case ToUTF8::WINDOWS_1250:
        {
            translationArray = ToUTF8::windows_1250;
            break;
        }
        case ToUTF8::WINDOWS_1251:
        {
            translationArray = ToUTF8::windows_1251;
            break;
        }
        default:
        {
            assert(0);
        }
    }
 }
-// This is just used to spew out a reusable input buffer for the
+std::string Utf8Encoder::getUtf8(const char* input, int size)
 // conversion process.
 char *ToUTF8::getBuffer(int s)
 {
-  // Remember the requested size
+    // Double check that the input string stops at some point (it might
-  size = s;
+    // contain zero terminators before this, inside its own data, which
-  resize(buf, size);
+    // is also ok.)
-  return &buf[0];
+    assert(input[size] == 0);
    // TODO: The rest of this function is designed for single-character
    // input encodings only. It also assumes that the input
    // encoding shares its first 128 values (0-127) with ASCII. These
    // conditions must be checked again if you add more input encodings
    // later.
    // Compute output length, and check for pure ascii input at the same
    // time.
    bool ascii;
    size_t outlen = getLength(input, ascii);
    // If we're pure ascii, then don't bother converting anything.
    if(ascii)
        return std::string(input, outlen);
    // Make sure the output is large enough
    resize(outlen);
    char *out = &mOutput[0];
    // Translate
    while (*input)
        copyFromArray(*(input++), out);
    // Make sure that we wrote the correct number of bytes
    assert((out-&mOutput[0]) == (int)outlen);
    // And make extra sure the output is null terminated
    assert(mOutput.size() > outlen);
    assert(mOutput[outlen] == 0);
    // Return a string
    return std::string(&mOutput[0], outlen);
 }
 // Convert UTF-8 text back to the legacy code page this encoder was built for.
 // 'input' must have a zero byte at input[size]; translation itself stops at
 // the first zero byte it meets.
 std::string Utf8Encoder::getLegacyEnc(const char *input, int size)
 {
    // Sanity check: the buffer has to be terminated where the caller says
    // it ends (zero bytes earlier in the data do not trip this check).
    assert(input[size] == 0);
    // TODO: The rest of this function is designed for single-character
    // legacy encodings only. It also assumes that the legacy encoding
    // shares its first 128 values (0-127) with ASCII. These conditions
    // must be checked again if you add more encodings later.
    // One pass yields both the converted length and whether the text is
    // plain ascii.
    bool pureAscii;
    const size_t legacyLen = getLength2(input, pureAscii);
    // Plain ascii is identical in both encodings, so hand it back as is.
    if (pureAscii)
        return std::string(input, legacyLen);
    // Grow the output buffer if needed (this also writes the terminator).
    resize(legacyLen);
    char *const begin = &mOutput[0];
    char *cursor = begin;
    // copyFromArray2 advances both the read and the write position.
    for (const char *src = input; *src; )
        copyFromArray2(src, cursor);
    // The number of bytes written must match the precomputed length...
    assert((cursor - begin) == (int)legacyLen);
    // ...and the terminator written by resize() must still be in place.
    assert(mOutput.size() > legacyLen);
    assert(mOutput[legacyLen] == 0);
    return std::string(begin, legacyLen);
 }
 // Ensure mOutput can hold 'size' bytes followed by a terminating zero,
 // and write that zero at mOutput[size].
 void Utf8Encoder::resize(size_t size)
 {
    const bool tooSmall = !(mOutput.size() > size);
    if (tooSmall)
    {
        // Over-allocate to three times the request so that later, slightly
        // larger strings are less likely to trigger another reallocation.
        mOutput.resize(size * 3);
    }
    // Terminate the region the caller is about to fill.
    mOutput[size] = 0;
 }
 /** Get the total length needed to decode the given string with
-    the given translation array. The arrays are encoded with 6 bytes
+  the given translation array. The arrays are encoded with 6 bytes
-    per character, with the first giving the length and the next 5 the
+  per character, with the first giving the length and the next 5 the
-    actual data.
+  actual data.
-    The function serves a dual purpose for optimization reasons: it
+  The function serves a dual purpose for optimization reasons: it
-    checks if the input is pure ascii (all values are <= 127). If this
+  checks if the input is pure ascii (all values are <= 127). If this
-    is the case, then the ascii parameter is set to true, and the
+  is the case, then the ascii parameter is set to true, and the
-    caller can optimize for this case.
+  caller can optimize for this case.
 */
-static size_t getLength(const char *arr, const char* input, bool &ascii)
+size_t Utf8Encoder::getLength(const char* input, bool &ascii)
 {
-  ascii = true;
+    ascii = true;
-  size_t len = 0;
+    size_t len = 0;
-  const char* ptr = input;
+    const char* ptr = input;
-  unsigned char inp = *ptr;
+    unsigned char inp = *ptr;
-  // Do away with the ascii part of the string first (this is almost
+    // Do away with the ascii part of the string first (this is almost
-  // always the entire string.)
+    // always the entire string.)
-  while(inp && inp < 128)
+    while (inp && inp < 128)
-    inp = *(++ptr);
+        inp = *(++ptr);
-  len += (ptr-input);
+    len += (ptr-input);
-  // If we're not at the null terminator at this point, then there
+    // If we're not at the null terminator at this point, then there
-  // were some non-ascii characters to deal with. Go to slow-mode for
+    // were some non-ascii characters to deal with. Go to slow-mode for
-  // the rest of the string.
+    // the rest of the string.
-  if(inp)
+    if (inp)
    {
-      ascii = false;
+        ascii = false;
-      while(inp)
+        while (inp)
        {
-          // Find the translated length of this character in the
+            // Find the translated length of this character in the
-          // lookup table.
+            // lookup table.
-          len += arr[inp*6];
+            len += translationArray[inp*6];
-          inp = *(++ptr);
+            inp = *(++ptr);
        }
    }
-  return len;
+    return len;
 }
 // Translate one character 'ch' using the translation array 'arr', and
 // advance the output pointer accordingly.
-static void copyFromArray(const char *arr, unsigned char ch, char* &out)
+void Utf8Encoder::copyFromArray(unsigned char ch, char* &out)
 {
-  // Optimize for ASCII values
+    // Optimize for ASCII values
-  if(ch < 128)
+    if (ch < 128)
    {
-      *(out++) = ch;
+        *(out++) = ch;
-      return;
+        return;
    }
-  const char *in = arr + ch*6;
+    const char *in = translationArray + ch*6;
-  int len = *(in++);
+    int len = *(in++);
-  for(int i=0; i<len; i++)
+    for (int i=0; i<len; i++)
-    *(out++) = *(in++);
+        *(out++) = *(in++);
 }
-std::string ToUTF8::getUtf8(ToUTF8::FromType from)
+size_t Utf8Encoder::getLength2(const char* input, bool &ascii)
 {
-  // Pick translation array
+    ascii = true;
-  const char *arr;
+    size_t len = 0;
-  switch (from)
+    const char* ptr = input;
-  {
+    unsigned char inp = *ptr;
-    case ToUTF8::WINDOWS_1252:
+
    // Do away with the ascii part of the string first (this is almost
    // always the entire string.)
    while (inp && inp < 128)
        inp = *(++ptr);
    len += (ptr-input);
    // If we're not at the null terminator at this point, then there
    // were some non-ascii characters to deal with. Go to slow-mode for
    // the rest of the string.
    if (inp)
    {
-      arr = ToUTF8::windows_1252;
+        ascii = false;
-      break;
+        while(inp)
    }
    case ToUTF8::WINDOWS_1250:
    {
      arr = ToUTF8::windows_1250;
      break;
    }
    case ToUTF8::WINDOWS_1251:
    {
      arr = ToUTF8::windows_1251;
      break;
    }
    default:
    {
      assert(0);
    }
  }
  // Double check that the input string stops at some point (it might
  // contain zero terminators before this, inside its own data, which
  // is also ok.)
  const char* input = &buf[0];
  assert(input[size] == 0);
  // TODO: The rest of this function is designed for single-character
  // input encodings only. It also assumes that the input the input
  // encoding shares its first 128 values (0-127) with ASCII. These
  // conditions must be checked again if you add more input encodings
  // later.
  // Compute output length, and check for pure ascii input at the same
  // time.
  bool ascii;
  size_t outlen = getLength(arr, input, ascii);
  // If we're pure ascii, then don't bother converting anything.
  if(ascii)
    return std::string(input, outlen);
  // Make sure the output is large enough
  resize(output, outlen);
  char *out = &output[0];
  // Translate
  while(*input)
    copyFromArray(arr, *(input++), out);
  // Make sure that we wrote the correct number of bytes
  assert((out-&output[0]) == (int)outlen);
  // And make extra sure the output is null terminated
  assert(output.size() > outlen);
  assert(output[outlen] == 0);
  // Return a string
  return std::string(&output[0], outlen);
 }
 static size_t getLength2(const char *arr, const char* input, bool &ascii)
 {
  ascii = true;
  size_t len = 0;
  const char* ptr = input;
  unsigned char inp = *ptr;
  // Do away with the ascii part of the string first (this is almost
  // always the entire string.)
  while(inp && inp < 128)
    inp = *(++ptr);
  len += (ptr-input);
  // If we're not at the null terminator at this point, then there
  // were some non-ascii characters to deal with. Go to slow-mode for
  // the rest of the string.
  if(inp)
    {
      ascii = false;
      while(inp)
        {
            len += 1;
-          // Find the translated length of this character in the
+            // Find the translated length of this character in the
-          // lookup table.
+            // lookup table.
            switch(inp)
            {
-            case 0xe2: len -= 2; break;
+                case 0xe2: len -= 2; break;
-            case 0xc2:
+                case 0xc2:
-            case 0xcb:
+                case 0xcb:
-            case 0xc4:
+                case 0xc4:
-            case 0xc6:
+                case 0xc6:
-            case 0xc3:
+                case 0xc3:
-            case 0xd0:
+                case 0xd0:
-            case 0xd1:
+                case 0xd1:
-            case 0xd2:
+                case 0xd2:
-            case 0xc5: len -= 1; break;
+                case 0xc5: len -= 1; break;
            }
-          inp = *(++ptr);
+            inp = *(++ptr);
        }
    }
-  return len;
+    return len;
 }
-#include <iostream>
+void Utf8Encoder::copyFromArray2(const char*& chp, char* &out)
 #include <iomanip>
 static void copyFromArray2(const char *arr, char*& chp, char* &out)
 {
    unsigned char ch = *(chp++);
-  // Optimize for ASCII values
+    // Optimize for ASCII values
-  if(ch < 128)
+    if (ch < 128)
    {
-      *(out++) = ch;
+        *(out++) = ch;
-      return;
+        return;
    }
-  int len = 1;
+    int len = 1;
-  switch (ch)
+    switch (ch)
  {
  case 0xe2: len = 3; break;
  case 0xc2:
  case 0xcb:
  case 0xc4:
  case 0xc6:
  case 0xc3:
  case 0xd0:
  case 0xd1:
  case 0xd2:
  case 0xc5: len = 2; break;
  }
  if (len == 1) // There is no 1 length utf-8 glyph that is not 0x20 (empty space)
  {
      *(out++) = ch;
      return;
  }
  unsigned char ch2 = *(chp++);
  unsigned char ch3 = '\0';
  if (len == 3)
      ch3 = *(chp++);
  for (int i = 128; i < 256; i++)
  {
      unsigned char b1 = arr[i*6 + 1], b2 = arr[i*6 + 2], b3 = arr[i*6 + 3];
      if (b1 == ch && b2 == ch2 && (len != 3 || b3 == ch3))
      {
          *(out++) = (char)i;
          return;
      }
  }
  std::cout << "Could not find glyph " << std::hex << (int)ch << " " << (int)ch2 << " " << (int)ch3 << std::endl;
  *(out++) = ch; // Could not find glyph, just put whatever
 }
 std::string ToUTF8::getLegacyEnc(ToUTF8::FromType to)
 {
  // Pick translation array
  const char *arr;
  switch (to)
  {
    case ToUTF8::WINDOWS_1252:
    {
-      arr = ToUTF8::windows_1252;
+        case 0xe2: len = 3; break;
-      break;
+        case 0xc2:
        case 0xcb:
        case 0xc4:
        case 0xc6:
        case 0xc3:
        case 0xd0:
        case 0xd1:
        case 0xd2:
        case 0xc5: len = 2; break;
    }
-    case ToUTF8::WINDOWS_1250:
+
    if (len == 1) // There is no 1 length utf-8 glyph that is not 0x20 (empty space)
    {
-      arr = ToUTF8::windows_1250;
+        *(out++) = ch;
-      break;
+        return;
    }
-    case ToUTF8::WINDOWS_1251:
+
    unsigned char ch2 = *(chp++);
    unsigned char ch3 = '\0';
    if (len == 3)
        ch3 = *(chp++);
    for (int i = 128; i < 256; i++)
    {
-      arr = ToUTF8::windows_1251;
+        unsigned char b1 = translationArray[i*6 + 1], b2 = translationArray[i*6 + 2], b3 = translationArray[i*6 + 3];
-      break;
+        if (b1 == ch && b2 == ch2 && (len != 3 || b3 == ch3))
        {
            *(out++) = (char)i;
            return;
        }
    }
    default:
    {
      assert(0);
    }
  }
-  // Double check that the input string stops at some point (it might
+    std::cout << "Could not find glyph " << std::hex << (int)ch << " " << (int)ch2 << " " << (int)ch3 << std::endl;
  // contain zero terminators before this, inside its own data, which
  // is also ok.)
  char* input = &buf[0];
  assert(input[size] == 0);
-  // TODO: The rest of this function is designed for single-character
+    *(out++) = ch; // Could not find glyph, just put whatever
  // input encodings only. It also assumes that the input the input
  // encoding shares its first 128 values (0-127) with ASCII. These
  // conditions must be checked again if you add more input encodings
  // later.
  // Compute output length, and check for pure ascii input at the same
  // time.
  bool ascii;
  size_t outlen = getLength2(arr, input, ascii);
  // If we're pure ascii, then don't bother converting anything.
  if(ascii)
      return std::string(input, outlen);
  // Make sure the output is large enough
  resize(output, outlen);
  char *out = &output[0];
  // Translate
  while(*input)
    copyFromArray2(arr, input, out);
  // Make sure that we wrote the correct number of bytes
  assert((out-&output[0]) == (int)outlen);
  // And make extra sure the output is null terminated
  assert(output.size() > outlen);
  assert(output[outlen] == 0);
  // Return a string
  return std::string(&output[0], outlen);
 }
 ToUTF8::FromType ToUTF8::calculateEncoding(const std::string& encodingName)
 {
-  if (encodingName == "win1250")
+    if (encodingName == "win1250")
-    return ToUTF8::WINDOWS_1250;
+        return ToUTF8::WINDOWS_1250;
-  else if (encodingName == "win1251")
+    else if (encodingName == "win1251")
-    return ToUTF8::WINDOWS_1251;
+        return ToUTF8::WINDOWS_1251;
-  else
+    else
-    return ToUTF8::WINDOWS_1252;
+        return ToUTF8::WINDOWS_1252;
 }
 std::string ToUTF8::encodingUsingMessage(const std::string& encodingName)
 {
-  if (encodingName == "win1250")
+    if (encodingName == "win1250")
-    return "Using Central and Eastern European font encoding.";
+        return "Using Central and Eastern European font encoding.";
-  else if (encodingName == "win1251")
+    else if (encodingName == "win1251")
-    return "Using Cyrillic font encoding.";
+        return "Using Cyrillic font encoding.";
-  else
+    else
-    return "Using default (English) font encoding.";
+        return "Using default (English) font encoding.";
 }
--- a/components/to_utf8/to_utf8.hpp
+++ b/components/to_utf8/to_utf8.hpp
@ -2,29 +2,53 @@
 #define COMPONENTS_TOUTF8_H
 #include <string>
 #include <cstring>
 #include <vector>
 namespace ToUTF8
 {
    // Helpers for converting text from legacy Windows code pages to UTF-8.
-  // These are all the currently supported code pages
+    // These are all the currently supported code pages
-  enum FromType
+    enum FromType
    {
-      WINDOWS_1250,      // Central ane Eastern European languages
+        WINDOWS_1250,      // Central and Eastern European languages
-      WINDOWS_1251,      // Cyrillic languages
+        WINDOWS_1251,      // Cyrillic languages
-      WINDOWS_1252       // Used by English version of Morrowind (and
+        WINDOWS_1252       // Used by English version of Morrowind (and
-                         // probably others)
+            // probably others)
    };
-  // Return a writable buffer of at least 'size' bytes. The buffer
+    FromType calculateEncoding(const std::string& encodingName);
-  // does not have to be freed.
+    std::string encodingUsingMessage(const std::string& encodingName);
  char* getBuffer(int size);
-  // Convert the previously written buffer to UTF8 from the given code
+    // class
  // page.
  std::string getUtf8(FromType from);
  std::string getLegacyEnc(FromType to);
-  FromType calculateEncoding(const std::string& encodingName);
+    class Utf8Encoder
-  std::string encodingUsingMessage(const std::string& encodingName);
+    {
        public:
            // Create an encoder for the given source code page.
            Utf8Encoder(FromType sourceEncoding);
            // Convert to UTF8 from the previously given code page.
            std::string getUtf8(const char *input, int size);
            // Convenience overload: forwards the string's data and size to
            // the pointer/size version above.
            inline std::string getUtf8(const std::string &str)
            {
                return getUtf8(str.c_str(), str.size());
            }
            // NOTE(review): presumably the reverse conversion (UTF8 back to
            // the legacy code page) - confirm against the implementation.
            std::string getLegacyEnc(const char *input, int size);
            // Convenience overload: forwards the string's data and size to
            // the pointer/size version above.
            inline std::string getLegacyEnc(const std::string &str)
            {
                return getLegacyEnc(str.c_str(), str.size());
            }
        private:
            // Internal helpers for the conversion routines.
            // NOTE(review): the "2" variants presumably belong to the
            // getLegacyEnc direction - confirm in the .cpp file.
            void resize(size_t size);
            size_t getLength(const char* input, bool &ascii);
            void copyFromArray(unsigned char chp, char* &out);
            size_t getLength2(const char* input, bool &ascii);
            void copyFromArray2(const char*& chp, char* &out);
            // NOTE(review): presumably the output buffer reused between
            // conversions - confirm.
            std::vector<char> mOutput;
            // NOTE(review): presumably points at the translation table for
            // the code page chosen in the constructor - confirm.
            char* translationArray;
    };
 }
 #endif
--- a/components/translation/translation.cpp
+++ b/components/translation/translation.cpp
@ -50,10 +50,7 @@ namespace Translation
            if (!line.empty())
            {
-                char* buffer = ToUTF8::getBuffer(line.size() + 1);
+                line = mEncoder->getUtf8(line);
                //buffer has at least line.size() + 1 bytes, so it must be safe
                strcpy(buffer, line.c_str());
                line = ToUTF8::getUtf8(mEncoding);
                size_t tab_pos = line.find('\t');
                if (tab_pos != std::string::npos && tab_pos > 0 && tab_pos < line.size() - 1)
@ -104,9 +101,9 @@ namespace Translation
            return phrase;
    }
-    void Storage::setEncoding (const ToUTF8::FromType& encoding)
+    void Storage::setEncoder(ToUTF8::Utf8Encoder* encoder)
    {
        // Only the pointer is stored; the encoder object is not copied.
        // NOTE(review): the encoder presumably has to outlive this Storage
        // instance - confirm at the call sites.
-        mEncoding = encoding;
+        mEncoder = encoder;
    }
    bool Storage::hasTranslation() const
--- a/components/translation/translation.hpp
+++ b/components/translation/translation.hpp
@ -19,7 +19,7 @@ namespace Translation
        // Standard form usually means nominative case
        std::string topicStandardForm(const std::string& phrase) const;
-        void setEncoding (const ToUTF8::FromType& encoding);
+        void setEncoder(ToUTF8::Utf8Encoder* encoder);
        bool hasTranslation() const;
@ -34,7 +34,7 @@ namespace Translation
        void loadDataFromStream(ContainerType& container, std::istream& stream);
-        ToUTF8::FromType mEncoding;
+        ToUTF8::Utf8Encoder* mEncoder;
        ContainerType mCellNamesTranslations, mTopicIDs, mPhraseForms;
    };
 }
		`@ -0,0 +1 @@`
							`Vous lui donnez le gâteau sans protester avant d’aller chercher tous vos amis et de revenir vous venger.`
		`@ -0,0 +1 @@`
							`Без вопросов отдаете ему рулет, зная, что позже вы сможете привести с собой своих друзей и тогда он получит по заслугам?`