From 740e2b5769d85bec22b8478846bd3800d2552049 Mon Sep 17 00:00:00 2001
From: Emanuel Guevel <guevel.emanuel@gmail.com>
Date: Wed, 2 Jan 2013 23:02:13 +0100
Subject: [PATCH 1/9] components/to_utf8: add class Utf8Encoder

---
 components/to_utf8/to_utf8.cpp | 743 +++++++++++++++++++++++----------
 components/to_utf8/to_utf8.hpp |  67 ++-
 2 files changed, 563 insertions(+), 247 deletions(-)
diff --git a/components/to_utf8/to_utf8.cpp b/components/to_utf8/to_utf8.cpp
index 7db6112475..8ac582b81d 100644
--- a/components/to_utf8/to_utf8.cpp
+++ b/components/to_utf8/to_utf8.cpp
@@ -2,6 +2,8 @@
 
 #include <vector>
 #include <cassert>
+#include <iostream>
+#include <iomanip>
 
 /* This file contains the code to translate from WINDOWS-1252 (native
    charset used in English version of Morrowind) to UTF-8. The library
@@ -46,334 +48,611 @@ static std::vector<char> buf    (50*1024);
 static std::vector<char> output (50*1024);
 static int size;
 
-// Make sure the given vector is large enough for 'size' bytes,
+using namespace ToUTF8;
+
+// Start on WINDOWS-1252 so a conversion before setEncoding() reads a valid table.
+Utf8Encoder::Utf8Encoder(void):
+    mEncoding(ToUTF8::WINDOWS_1252), mOutput(50*1024), mSize(0), translationArray(ToUTF8::windows_1252)
+{}
+
+// Select the source code page and point the encoder at its lookup table.
+// An unrecognised value trips the assertion and leaves the table untouched.
+void Utf8Encoder::setEncoding(const FromType sourceEncoding)
+{
+    mEncoding = sourceEncoding;
+
+    switch (sourceEncoding)
+    {
+        // Default (English)
+        case ToUTF8::WINDOWS_1252:
+            translationArray = ToUTF8::windows_1252;
+            break;
+
+        // Central and Eastern European
+        case ToUTF8::WINDOWS_1250:
+            translationArray = ToUTF8::windows_1250;
+            break;
+
+        // Cyrillic
+        case ToUTF8::WINDOWS_1251:
+            translationArray = ToUTF8::windows_1251;
+            break;
+
+        default:
+            assert(0);
+    }
+}
+
+/** Convert a NUL-terminated string from the configured code page to
+  UTF-8 and return it as a new std::string.
+
+  'input' must hold a zero byte at index 'size'. Conversion stops at
+  the first zero byte found, which may come earlier than that.
+
+  Note: the routine assumes a single-byte source encoding whose
+  values 0-127 coincide with ASCII. Both assumptions must be checked
+  again before adding further input encodings.
+ */
+std::string Utf8Encoder::getUtf8(const char* input, int size)
+{
+    // The caller guarantees a terminator at the stated length.
+    assert(input[size] == 0);
+
+    // One pass yields both the UTF-8 byte count and the ASCII verdict.
+    bool pureAscii;
+    const size_t utf8Len = getLength(input, pureAscii);
+
+    // ASCII is already valid UTF-8: hand the bytes back unchanged.
+    if (pureAscii)
+        return std::string(input, utf8Len);
+
+    // Make room in the scratch buffer, terminator included.
+    resize(utf8Len);
+    char* const begin = &mOutput[0];
+    char* cursor = begin;
+
+    // Translate byte by byte; copyFromArray() advances 'cursor'.
+    for (const char* src = input; *src; ++src)
+        copyFromArray(*src, cursor);
+
+    // Exactly the predicted number of bytes must have been written,
+    // and the terminator placed by resize() must still be there.
+    assert((cursor - begin) == (int)utf8Len);
+    assert(mOutput.size() > utf8Len);
+    assert(mOutput[utf8Len] == 0);
+
+    return std::string(begin, utf8Len);
+}
+
+/** Convert a NUL-terminated UTF-8 string to the configured legacy
+  code page and return it as a new std::string.
+
+  'input' must hold a zero byte at index 'size'. Conversion stops at
+  the first zero byte found, which may come earlier than that.
+
+  Note: bytes 0-127 are copied through unchanged, i.e. the target code
+  page is assumed to coincide with ASCII in that range.
+ */
+std::string Utf8Encoder::getLegacyEnc(const char *input, int size)
+{
+    // The caller guarantees a terminator at the stated length.
+    assert(input[size] == 0);
+
+    // One pass yields both the output byte count and the ASCII verdict.
+    bool pureAscii;
+    const size_t legacyLen = getLength2(input, pureAscii);
+
+    // ASCII needs no translation: hand the bytes back unchanged.
+    if (pureAscii)
+        return std::string(input, legacyLen);
+
+    // Make room in the scratch buffer, terminator included.
+    resize(legacyLen);
+    char* const begin = &mOutput[0];
+    char* cursor = begin;
+
+    // Translate glyph by glyph; copyFromArray2() advances both pointers.
+    const char* src = input;
+    while (*src)
+        copyFromArray2(src, cursor);
+
+    // Exactly the predicted number of bytes must have been written,
+    // and the terminator placed by resize() must still be there.
+    assert((cursor - begin) == (int)legacyLen);
+    assert(mOutput.size() > legacyLen);
+    assert(mOutput[legacyLen] == 0);
+
+    return std::string(begin, legacyLen);
+}
+
+// Make sure the output vector is large enough for 'size' bytes,
 // including a terminating zero after it.
+void Utf8Encoder::resize(size_t size)
+{
+    // Index 'size' must exist for the terminator; when it does not, grow
+    // to three times the request so later calls rarely reallocate.
+    const bool tooSmall = !(size < mOutput.size());
+    if (tooSmall)
+        mOutput.resize(size * 3);
+
+    mOutput[size] = 0; // terminator directly after the payload
+}
+
+/** Get the total length needed to hold the given string once it has
+  been translated through the current translation array. The arrays
+  are encoded with 6 bytes per character: the first gives the length
+  of the translated sequence and the next 5 hold the actual data.
+  Counting stops at the first zero byte.
+
+  The function serves a dual purpose for optimization reasons: the
+  same pass also checks whether the input is pure ascii (all values
+  are <= 127). If so, the ascii parameter is set to true and the
+  caller can skip the conversion; otherwise it is set to false.
+ */
+size_t Utf8Encoder::getLength(const char* input, bool &ascii)
+{
+    size_t total = 0;
+    const char* cur = input;
+
+    // Fast path: walk the leading run of ascii bytes, each of which
+    // counts as one output byte (this is almost always the entire
+    // string.)
+    while (*cur && (unsigned char)*cur < 128)
+        ++cur;
+    total += (cur - input);
+
+    // Stopping on the terminator means nothing needs translating.
+    ascii = (*cur == 0);
+    if (ascii)
+        return total;
+
+    // Slow path: from the first non-ascii byte onwards, add up the
+    // translated length stored at the head of each table entry.
+    for (; *cur; ++cur)
+    {
+        // The entry for a byte starts at offset byte*6.
+        const unsigned char byte = *cur;
+        total += translationArray[byte*6];
+    }
+
+    return total;
+}
+
+// Append the UTF-8 form of the single source byte 'ch' at 'out' and
+// move 'out' past whatever was written.
+void Utf8Encoder::copyFromArray(unsigned char ch, char* &out)
+{
+    if (ch >= 128)
+    {
+        // Table entry layout: one length byte, then the data bytes.
+        const char* src = translationArray + ch*6;
+        int remaining = *(src++);
+        while (remaining-- > 0)
+            *(out++) = *(src++);
+        return;
+    }
+
+    *(out++) = ch; // ASCII passes through untouched
+}
+
+// Count the bytes 'input' (UTF-8) needs in the legacy code page: one
+// per glyph. 'ascii' reports pure ascii input, as in getLength().
+// Continuation bytes are skipped only up to the terminating zero, so
+// a truncated sequence can neither wrap the count nor overrun input.
+size_t Utf8Encoder::getLength2(const char* input, bool &ascii)
+{
+    size_t len = 0;
+    const char* ptr = input;
+    unsigned char inp = *ptr;
+
+    // Do away with the ascii part of the string first (this is almost
+    // always the entire string.)
+    while (inp && inp < 128)
+        inp = *(++ptr);
+    len += (ptr-input);
+
+    // Anything left over contains non-ascii data: go to slow-mode for
+    // the rest of the string.
+    ascii = (inp == 0);
+    while (inp)
+    {
+        // Number of bytes expected to follow this lead byte.
+        int extra = 0;
+        switch (inp)
+        {
+            case 0xe2: extra = 2; break;
+            case 0xc2: case 0xcb: case 0xc4:
+            case 0xc6: case 0xc3: case 0xd0:
+            case 0xd1: case 0xd2: case 0xc5: extra = 1; break;
+        }
+
+        // Step over the whole sequence, but never past the terminator.
+        ++ptr;
+        while (extra-- > 0 && *ptr)
+            ++ptr;
+
+        // Each sequence becomes exactly one byte of output.
+        len += 1;
+        inp = *ptr;
+    }
+
+    return len;
+}
+
+// Convert the UTF-8 sequence at 'chp' into one legacy code page byte
+// at 'out', advancing both pointers. A sequence without a table entry
+// is reported on stdout and its lead byte is copied through.
+void Utf8Encoder::copyFromArray2(const char*& chp, char* &out)
+{
+    unsigned char ch = *(chp++);
+    // Sequence length implied by the lead byte; 1 covers ASCII as well
+    // as any byte that is not one of the lead bytes handled here.
+    int len = 1;
+    switch (ch)
+    {
+        case 0xe2: len = 3; break;
+        case 0xc2: case 0xcb: case 0xc4:
+        case 0xc6: case 0xc3: case 0xd0:
+        case 0xd1: case 0xd2: case 0xc5: len = 2; break;
+    }
+
+    if (len == 1) // Nothing to look up: copy the byte through
+    {
+        *(out++) = ch;
+        return;
+    }
+
+    // Fetch the continuation bytes, but never step over the terminating
+    // zero: a truncated sequence must not drag 'chp' out of the input.
+    unsigned char ch2 = '\0';
+    unsigned char ch3 = '\0';
+    if (*chp)
+        ch2 = *(chp++);
+    if (len == 3 && *chp)
+        ch3 = *(chp++);
+
+    // Reverse lookup: find the code page value whose UTF-8 form matches.
+    for (int i = 128; i < 256; i++)
+    {
+        unsigned char b1 = translationArray[i*6 + 1], b2 = translationArray[i*6 + 2], b3 = translationArray[i*6 + 3];
+        if (b1 == ch && b2 == ch2 && (len != 3 || b3 == ch3))
+        {
+            *(out++) = (char)i;
+            return;
+        }
+    }
+
+    // Print in hex, then put the stream's formatting back as it was.
+    const std::ios_base::fmtflags saved = std::cout.flags();
+    std::cout << "Could not find glyph " << std::hex << (int)ch << " " << (int)ch2 << " " << (int)ch3 << std::endl;
+    std::cout.flags(saved);
+
+    *(out++) = ch; // Could not find glyph, just put whatever
+}
+
 static void resize(std::vector<char> &buf, size_t size)
 {
-  if(buf.size() <= size)
-    // Add some extra padding to reduce the chance of having to resize
-    // again later.
-    buf.resize(3*size);
+    if(buf.size() <= size)
+        // Add some extra padding to reduce the chance of having to resize
+        // again later.
+        buf.resize(3*size);
 
-  // And make sure the string is zero terminated
-  buf[size] = 0;
+    // And make sure the string is zero terminated
+    buf[size] = 0;
 }
 
 // This is just used to spew out a reusable input buffer for the
 // conversion process.
 char *ToUTF8::getBuffer(int s)
 {
-  // Remember the requested size
-  size = s;
-  resize(buf, size);
-  return &buf[0];
+    // Remember the requested size
+    size = s;
+    resize(buf, size);
+    return &buf[0];
 }
 
 /** Get the total length length needed to decode the given string with
-    the given translation array. The arrays are encoded with 6 bytes
-    per character, with the first giving the length and the next 5 the
-    actual data.
+  the given translation array. The arrays are encoded with 6 bytes
+  per character, with the first giving the length and the next 5 the
+  actual data.
 
-    The function serves a dual purpose for optimization reasons: it
-    checks if the input is pure ascii (all values are <= 127). If this
-    is the case, then the ascii parameter is set to true, and the
-    caller can optimize for this case.
+  The function serves a dual purpose for optimization reasons: it
+  checks if the input is pure ascii (all values are <= 127). If this
+  is the case, then the ascii parameter is set to true, and the
+  caller can optimize for this case.
  */
 static size_t getLength(const char *arr, const char* input, bool &ascii)
 {
-  ascii = true;
-  size_t len = 0;
-  const char* ptr = input;
-  unsigned char inp = *ptr;
+    ascii = true;
+    size_t len = 0;
+    const char* ptr = input;
+    unsigned char inp = *ptr;
 
-  // Do away with the ascii part of the string first (this is almost
-  // always the entire string.)
-  while(inp && inp < 128)
-    inp = *(++ptr);
-  len += (ptr-input);
+    // Do away with the ascii part of the string first (this is almost
+    // always the entire string.)
+    while(inp && inp < 128)
+        inp = *(++ptr);
+    len += (ptr-input);
 
-  // If we're not at the null terminator at this point, then there
-  // were some non-ascii characters to deal with. Go to slow-mode for
-  // the rest of the string.
-  if(inp)
+    // If we're not at the null terminator at this point, then there
+    // were some non-ascii characters to deal with. Go to slow-mode for
+    // the rest of the string.
+    if(inp)
     {
-      ascii = false;
-      while(inp)
+        ascii = false;
+        while(inp)
         {
-          // Find the translated length of this character in the
-          // lookup table.
-          len += arr[inp*6];
-          inp = *(++ptr);
+            // Find the translated length of this character in the
+            // lookup table.
+            len += arr[inp*6];
+            inp = *(++ptr);
         }
     }
-  return len;
+    return len;
 }
 
 // Translate one character 'ch' using the translation array 'arr', and
 // advance the output pointer accordingly.
 static void copyFromArray(const char *arr, unsigned char ch, char* &out)
 {
-  // Optimize for ASCII values
-  if(ch < 128)
+    // Optimize for ASCII values
+    if(ch < 128)
     {
-      *(out++) = ch;
-      return;
+        *(out++) = ch;
+        return;
     }
 
-  const char *in = arr + ch*6;
-  int len = *(in++);
-  for(int i=0; i<len; i++)
-    *(out++) = *(in++);
+    const char *in = arr + ch*6;
+    int len = *(in++);
+    for(int i=0; i<len; i++)
+        *(out++) = *(in++);
 }
 
 std::string ToUTF8::getUtf8(ToUTF8::FromType from)
 {
-  // Pick translation array
-  const char *arr;
-  switch (from)
-  {
-    case ToUTF8::WINDOWS_1252:
+    // Pick translation array
+    const char *arr;
+    switch (from)
     {
-      arr = ToUTF8::windows_1252;
-      break;
+        case ToUTF8::WINDOWS_1252:
+        {
+            arr = ToUTF8::windows_1252;
+            break;
+        }
+        case ToUTF8::WINDOWS_1250:
+        {
+            arr = ToUTF8::windows_1250;
+            break;
+        }
+        case ToUTF8::WINDOWS_1251:
+        {
+            arr = ToUTF8::windows_1251;
+            break;
+        }
+        default:
+        {
+            assert(0);
+        }
     }
-    case ToUTF8::WINDOWS_1250:
-    {
-      arr = ToUTF8::windows_1250;
-      break;
-    }
-    case ToUTF8::WINDOWS_1251:
-    {
-      arr = ToUTF8::windows_1251;
-      break;
-    }
-    default:
-    {
-      assert(0);
-    }
-  }
 
-  // Double check that the input string stops at some point (it might
-  // contain zero terminators before this, inside its own data, which
-  // is also ok.)
-  const char* input = &buf[0];
-  assert(input[size] == 0);
+    // Double check that the input string stops at some point (it might
+    // contain zero terminators before this, inside its own data, which
+    // is also ok.)
+    const char* input = &buf[0];
+    assert(input[size] == 0);
 
-  // TODO: The rest of this function is designed for single-character
-  // input encodings only. It also assumes that the input the input
-  // encoding shares its first 128 values (0-127) with ASCII. These
-  // conditions must be checked again if you add more input encodings
-  // later.
+    // TODO: The rest of this function is designed for single-character
+    // input encodings only. It also assumes that the input the input
+    // encoding shares its first 128 values (0-127) with ASCII. These
+    // conditions must be checked again if you add more input encodings
+    // later.
 
-  // Compute output length, and check for pure ascii input at the same
-  // time.
-  bool ascii;
-  size_t outlen = getLength(arr, input, ascii);
+    // Compute output length, and check for pure ascii input at the same
+    // time.
+    bool ascii;
+    size_t outlen = getLength(arr, input, ascii);
 
-  // If we're pure ascii, then don't bother converting anything.
-  if(ascii)
-    return std::string(input, outlen);
+    // If we're pure ascii, then don't bother converting anything.
+    if(ascii)
+        return std::string(input, outlen);
 
-  // Make sure the output is large enough
-  resize(output, outlen);
-  char *out = &output[0];
+    // Make sure the output is large enough
+    resize(output, outlen);
+    char *out = &output[0];
 
-  // Translate
-  while(*input)
-    copyFromArray(arr, *(input++), out);
+    // Translate
+    while(*input)
+        copyFromArray(arr, *(input++), out);
 
-  // Make sure that we wrote the correct number of bytes
-  assert((out-&output[0]) == (int)outlen);
+    // Make sure that we wrote the correct number of bytes
+    assert((out-&output[0]) == (int)outlen);
 
-  // And make extra sure the output is null terminated
-  assert(output.size() > outlen);
-  assert(output[outlen] == 0);
+    // And make extra sure the output is null terminated
+    assert(output.size() > outlen);
+    assert(output[outlen] == 0);
 
-  // Return a string
-  return std::string(&output[0], outlen);
+    // Return a string
+    return std::string(&output[0], outlen);
 }
 
 static size_t getLength2(const char *arr, const char* input, bool &ascii)
 {
-  ascii = true;
-  size_t len = 0;
-  const char* ptr = input;
-  unsigned char inp = *ptr;
+    ascii = true;
+    size_t len = 0;
+    const char* ptr = input;
+    unsigned char inp = *ptr;
 
-  // Do away with the ascii part of the string first (this is almost
-  // always the entire string.)
-  while(inp && inp < 128)
-    inp = *(++ptr);
-  len += (ptr-input);
+    // Do away with the ascii part of the string first (this is almost
+    // always the entire string.)
+    while(inp && inp < 128)
+        inp = *(++ptr);
+    len += (ptr-input);
 
-  // If we're not at the null terminator at this point, then there
-  // were some non-ascii characters to deal with. Go to slow-mode for
-  // the rest of the string.
-  if(inp)
+    // If we're not at the null terminator at this point, then there
+    // were some non-ascii characters to deal with. Go to slow-mode for
+    // the rest of the string.
+    if(inp)
     {
-      ascii = false;
-      while(inp)
+        ascii = false;
+        while(inp)
         {
             len += 1;
-          // Find the translated length of this character in the
-          // lookup table.
+            // Find the translated length of this character in the
+            // lookup table.
             switch(inp)
             {
-            case 0xe2: len -= 2; break;
-            case 0xc2:
-            case 0xcb:
-            case 0xc4:
-            case 0xc6:
-            case 0xc3:
-            case 0xd0:
-            case 0xd1:
-            case 0xd2:
-            case 0xc5: len -= 1; break;
+                case 0xe2: len -= 2; break;
+                case 0xc2:
+                case 0xcb:
+                case 0xc4:
+                case 0xc6:
+                case 0xc3:
+                case 0xd0:
+                case 0xd1:
+                case 0xd2:
+                case 0xc5: len -= 1; break;
             }
 
-          inp = *(++ptr);
+            inp = *(++ptr);
         }
     }
-  return len;
+    return len;
 }
 
-#include <iostream>
-#include <iomanip>
-
 static void copyFromArray2(const char *arr, char*& chp, char* &out)
 {
     unsigned char ch = *(chp++);
-  // Optimize for ASCII values
-  if(ch < 128)
+    // Optimize for ASCII values
+    if(ch < 128)
     {
-      *(out++) = ch;
-      return;
+        *(out++) = ch;
+        return;
     }
 
-  int len = 1;
-  switch (ch)
-  {
-  case 0xe2: len = 3; break;
-  case 0xc2:
-  case 0xcb:
-  case 0xc4:
-  case 0xc6:
-  case 0xc3:
-  case 0xd0:
-  case 0xd1:
-  case 0xd2:
-  case 0xc5: len = 2; break;
-  }
+    int len = 1;
+    switch (ch)
+    {
+        case 0xe2: len = 3; break;
+        case 0xc2:
+        case 0xcb:
+        case 0xc4:
+        case 0xc6:
+        case 0xc3:
+        case 0xd0:
+        case 0xd1:
+        case 0xd2:
+        case 0xc5: len = 2; break;
+    }
 
-  if (len == 1) // There is no 1 length utf-8 glyph that is not 0x20 (empty space)
-  {
-      *(out++) = ch;
-      return;
-  }
+    if (len == 1) // There is no 1 length utf-8 glyph that is not 0x20 (empty space)
+    {
+        *(out++) = ch;
+        return;
+    }
 
-  unsigned char ch2 = *(chp++);
-  unsigned char ch3 = '\0';
-  if (len == 3)
-      ch3 = *(chp++);
+    unsigned char ch2 = *(chp++);
+    unsigned char ch3 = '\0';
+    if (len == 3)
+        ch3 = *(chp++);
 
-  for (int i = 128; i < 256; i++)
-  {
-      unsigned char b1 = arr[i*6 + 1], b2 = arr[i*6 + 2], b3 = arr[i*6 + 3];
-      if (b1 == ch && b2 == ch2 && (len != 3 || b3 == ch3))
-      {
-          *(out++) = (char)i;
-          return;
-      }
-  }
+    for (int i = 128; i < 256; i++)
+    {
+        unsigned char b1 = arr[i*6 + 1], b2 = arr[i*6 + 2], b3 = arr[i*6 + 3];
+        if (b1 == ch && b2 == ch2 && (len != 3 || b3 == ch3))
+        {
+            *(out++) = (char)i;
+            return;
+        }
+    }
 
-  std::cout << "Could not find glyph " << std::hex << (int)ch << " " << (int)ch2 << " " << (int)ch3 << std::endl;
+    std::cout << "Could not find glyph " << std::hex << (int)ch << " " << (int)ch2 << " " << (int)ch3 << std::endl;
 
-  *(out++) = ch; // Could not find glyph, just put whatever
+    *(out++) = ch; // Could not find glyph, just put whatever
 }
 
 std::string ToUTF8::getLegacyEnc(ToUTF8::FromType to)
 {
-  // Pick translation array
-  const char *arr;
-  switch (to)
-  {
-    case ToUTF8::WINDOWS_1252:
+    // Pick translation array
+    const char *arr;
+    switch (to)
     {
-      arr = ToUTF8::windows_1252;
-      break;
+        case ToUTF8::WINDOWS_1252:
+        {
+            arr = ToUTF8::windows_1252;
+            break;
+        }
+        case ToUTF8::WINDOWS_1250:
+        {
+            arr = ToUTF8::windows_1250;
+            break;
+        }
+        case ToUTF8::WINDOWS_1251:
+        {
+            arr = ToUTF8::windows_1251;
+            break;
+        }
+        default:
+        {
+            assert(0);
+        }
     }
-    case ToUTF8::WINDOWS_1250:
-    {
-      arr = ToUTF8::windows_1250;
-      break;
-    }
-    case ToUTF8::WINDOWS_1251:
-    {
-      arr = ToUTF8::windows_1251;
-      break;
-    }
-    default:
-    {
-      assert(0);
-    }
-  }
 
-  // Double check that the input string stops at some point (it might
-  // contain zero terminators before this, inside its own data, which
-  // is also ok.)
-  char* input = &buf[0];
-  assert(input[size] == 0);
+    // Double check that the input string stops at some point (it might
+    // contain zero terminators before this, inside its own data, which
+    // is also ok.)
+    char* input = &buf[0];
+    assert(input[size] == 0);
 
-  // TODO: The rest of this function is designed for single-character
-  // input encodings only. It also assumes that the input the input
-  // encoding shares its first 128 values (0-127) with ASCII. These
-  // conditions must be checked again if you add more input encodings
-  // later.
+    // TODO: The rest of this function is designed for single-character
+    // input encodings only. It also assumes that the input the input
+    // encoding shares its first 128 values (0-127) with ASCII. These
+    // conditions must be checked again if you add more input encodings
+    // later.
 
-  // Compute output length, and check for pure ascii input at the same
-  // time.
-  bool ascii;
-  size_t outlen = getLength2(arr, input, ascii);
+    // Compute output length, and check for pure ascii input at the same
+    // time.
+    bool ascii;
+    size_t outlen = getLength2(arr, input, ascii);
 
-  // If we're pure ascii, then don't bother converting anything.
-  if(ascii)
-      return std::string(input, outlen);
+    // If we're pure ascii, then don't bother converting anything.
+    if(ascii)
+        return std::string(input, outlen);
 
-  // Make sure the output is large enough
-  resize(output, outlen);
-  char *out = &output[0];
+    // Make sure the output is large enough
+    resize(output, outlen);
+    char *out = &output[0];
 
-  // Translate
-  while(*input)
-    copyFromArray2(arr, input, out);
+    // Translate
+    while(*input)
+        copyFromArray2(arr, input, out);
 
-  // Make sure that we wrote the correct number of bytes
-  assert((out-&output[0]) == (int)outlen);
+    // Make sure that we wrote the correct number of bytes
+    assert((out-&output[0]) == (int)outlen);
 
-  // And make extra sure the output is null terminated
-  assert(output.size() > outlen);
-  assert(output[outlen] == 0);
+    // And make extra sure the output is null terminated
+    assert(output.size() > outlen);
+    assert(output[outlen] == 0);
 
-  // Return a string
-  return std::string(&output[0], outlen);
+    // Return a string
+    return std::string(&output[0], outlen);
 }
 
 ToUTF8::FromType ToUTF8::calculateEncoding(const std::string& encodingName)
 {
-  if (encodingName == "win1250")
-    return ToUTF8::WINDOWS_1250;
-  else if (encodingName == "win1251")
-    return ToUTF8::WINDOWS_1251;
-  else
-    return ToUTF8::WINDOWS_1252;
+    if (encodingName == "win1250")
+        return ToUTF8::WINDOWS_1250;
+    else if (encodingName == "win1251")
+        return ToUTF8::WINDOWS_1251;
+    else
+        return ToUTF8::WINDOWS_1252;
 }
 
 std::string ToUTF8::encodingUsingMessage(const std::string& encodingName)
 {
-  if (encodingName == "win1250")
-    return "Using Central and Eastern European font encoding.";
-  else if (encodingName == "win1251")
-    return "Using Cyrillic font encoding.";
-  else
-    return "Using default (English) font encoding.";
+    if (encodingName == "win1250")
+        return "Using Central and Eastern European font encoding.";
+    else if (encodingName == "win1251")
+        return "Using Cyrillic font encoding.";
+    else
+        return "Using default (English) font encoding.";
 }
diff --git a/components/to_utf8/to_utf8.hpp b/components/to_utf8/to_utf8.hpp
index f52ae73bd8..6877e2dc17 100644
--- a/components/to_utf8/to_utf8.hpp
+++ b/components/to_utf8/to_utf8.hpp
@@ -2,29 +2,66 @@
 #define COMPONENTS_TOUTF8_H
 
 #include <string>
+#include <cstring>
+#include <vector>
 
 namespace ToUTF8
 {
-  // These are all the currently supported code pages
-  enum FromType
+    // These are all the currently supported code pages
+    enum FromType
     {
-      WINDOWS_1250,      // Central ane Eastern European languages
-      WINDOWS_1251,      // Cyrillic languages
-      WINDOWS_1252       // Used by English version of Morrowind (and
-                         // probably others)
+        WINDOWS_1250,      // Central ane Eastern European languages
+        WINDOWS_1251,      // Cyrillic languages
+        WINDOWS_1252       // Used by English version of Morrowind (and
+            // probably others)
     };
 
-  // Return a writable buffer of at least 'size' bytes. The buffer
-  // does not have to be freed.
-  char* getBuffer(int size);
+    // Return a writable buffer of at least 'size' bytes. The buffer
+    // does not have to be freed.
+    char* getBuffer(int size);
 
-  // Convert the previously written buffer to UTF8 from the given code
-  // page.
-  std::string getUtf8(FromType from);
-  std::string getLegacyEnc(FromType to);
+    // Convert the previously written buffer to UTF8 from the given code
+    // page.
+    std::string getUtf8(FromType from);
+    std::string getLegacyEnc(FromType to);
 
-  FromType calculateEncoding(const std::string& encodingName);
-  std::string encodingUsingMessage(const std::string& encodingName);
+    FromType calculateEncoding(const std::string& encodingName);
+    std::string encodingUsingMessage(const std::string& encodingName);
+
+    // class
+
+    // Converts text between one Windows code page and UTF-8, using a
+    // private scratch buffer. Select the code page with setEncoding().
+    class Utf8Encoder
+    {
+    public:
+        Utf8Encoder(void);
+
+        // Choose the code page used by all later conversions.
+        void setEncoding(const FromType sourceEncoding);
+
+        // Code page -> UTF-8. 'input' must have a zero byte at index 'size'.
+        std::string getUtf8(const char *input, int size);
+        std::string getUtf8(const std::string &str)
+        { return getUtf8(str.c_str(), str.size()); }
+
+        // UTF-8 -> code page. 'input' must have a zero byte at index 'size'.
+        std::string getLegacyEnc(const char *input, int size);
+        std::string getLegacyEnc(const std::string &str)
+        { return getLegacyEnc(str.c_str(), str.size()); }
+
+    private:
+        void resize(size_t size);                            // grow mOutput and terminate it
+        size_t getLength(const char* input, bool &ascii);    // output size for getUtf8
+        void copyFromArray(unsigned char chp, char* &out);   // one byte -> UTF-8
+        size_t getLength2(const char* input, bool &ascii);   // output size for getLegacyEnc
+        void copyFromArray2(const char*& chp, char* &out);   // one glyph -> code page
+
+        FromType mEncoding;             // code page chosen via setEncoding()
+        std::vector<char> mOutput;      // scratch buffer holding the result
+        int mSize;                      // not read by any conversion routine
+        char* translationArray;         // lookup table matching mEncoding
+    };
 }
 
 #endif

From 67273fc1777b45d9860fe114689cbdc9836c1f48 Mon Sep 17 00:00:00 2001
From: Emanuel Guevel <guevel.emanuel@gmail.com>
Date: Wed, 2 Jan 2013 23:39:21 +0100
Subject: [PATCH 2/9] mwiniimporter: use Utf8Encoder

---
 apps/mwiniimporter/importer.cpp | 12 +++---------
 apps/mwiniimporter/importer.hpp |  2 +-
 2 files changed, 4 insertions(+), 10 deletions(-)

diff --git a/apps/mwiniimporter/importer.cpp b/apps/mwiniimporter/importer.cpp
index def70615bc..5c3dedd047 100644
--- a/apps/mwiniimporter/importer.cpp
+++ b/apps/mwiniimporter/importer.cpp
@@ -649,11 +649,13 @@ MwIniImporter::multistrmap MwIniImporter::loadIniFile(std::string filename) {
     std::string section("");
     MwIniImporter::multistrmap map;
     boost::iostreams::stream<boost::iostreams::file_source>file(filename.c_str());
+    ToUTF8::Utf8Encoder encoder;
+    encoder.setEncoding(mEncoding);
 
     std::string line;
     while (std::getline(file, line)) {
 
-        line = toUTF8(line);
+        line = encoder.getUtf8(line);
 
         // unify Unix-style and Windows file ending
         if (!(line.empty()) && (line[line.length()-1]) == '\r') {
@@ -829,14 +831,6 @@ void MwIniImporter::writeToFile(boost::iostreams::stream<boost::iostreams::file_
     }
 }
 
-std::string MwIniImporter::toUTF8(const std::string &str) {
-    char *ptr = ToUTF8::getBuffer(str.length());
-    strncpy(ptr, str.c_str(), str.length());
-
-    // Convert to UTF8 and return
-    return ToUTF8::getUtf8(mEncoding);
-}
-
 void MwIniImporter::setInputEncoding(const ToUTF8::FromType &encoding)
 {
   mEncoding = encoding;
diff --git a/apps/mwiniimporter/importer.hpp b/apps/mwiniimporter/importer.hpp
index 55742bc090..c87fd3e164 100644
--- a/apps/mwiniimporter/importer.hpp
+++ b/apps/mwiniimporter/importer.hpp
@@ -8,7 +8,7 @@
 #include <vector>
 #include <exception>
 
-#include "../../components/to_utf8/to_utf8.hpp"
+#include <components/to_utf8/to_utf8.hpp>
 
 class MwIniImporter {
   public:

From 9906c3051dddaaf435d55841f5e9d0d392d575c5 Mon Sep 17 00:00:00 2001
From: Emanuel Guevel <guevel.emanuel@gmail.com>
Date: Thu, 3 Jan 2013 15:01:08 +0100
Subject: [PATCH 3/9] components/translation: use Utf8Encoder

---
 components/translation/translation.cpp | 6 ++----
 components/translation/translation.hpp | 1 +
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/components/translation/translation.cpp b/components/translation/translation.cpp
index fb5b038612..002446e4f9 100644
--- a/components/translation/translation.cpp
+++ b/components/translation/translation.cpp
@@ -50,10 +50,7 @@ namespace Translation
 
             if (!line.empty())
             {
-                char* buffer = ToUTF8::getBuffer(line.size() + 1);
-                //buffer has at least line.size() + 1 bytes, so it must be safe
-                strcpy(buffer, line.c_str());
-                line = ToUTF8::getUtf8(mEncoding);
+                line = mEncoder.getUtf8(line);
 
                 size_t tab_pos = line.find('\t');
                 if (tab_pos != std::string::npos && tab_pos > 0 && tab_pos < line.size() - 1)
@@ -107,6 +104,7 @@ namespace Translation
     void Storage::setEncoding (const ToUTF8::FromType& encoding)
     {
         mEncoding = encoding;
+        mEncoder.setEncoding(encoding);
     }
 
     bool Storage::hasTranslation() const
diff --git a/components/translation/translation.hpp b/components/translation/translation.hpp
index 80d44d871d..6c3e4df868 100644
--- a/components/translation/translation.hpp
+++ b/components/translation/translation.hpp
@@ -35,6 +35,7 @@ namespace Translation
 
 
         ToUTF8::FromType mEncoding;
+        ToUTF8::Utf8Encoder mEncoder;
         ContainerType mCellNamesTranslations, mTopicIDs, mPhraseForms;
     };
 }

From 02bf02f288c407ad639ad15c8bc3a72ae56597b4 Mon Sep 17 00:00:00 2001
From: Emanuel Guevel <guevel.emanuel@gmail.com>
Date: Thu, 3 Jan 2013 21:15:18 +0100
Subject: [PATCH 4/9] ESMReader, ESMWriter: use Utf8Encoder

---
 components/esm/esmreader.cpp | 22 +++++++++++++++++++---
 components/esm/esmreader.hpp |  6 ++++++
 components/esm/esmwriter.cpp |  8 +++-----
 components/esm/esmwriter.hpp |  1 +
 4 files changed, 29 insertions(+), 8 deletions(-)

diff --git a/components/esm/esmreader.cpp b/components/esm/esmreader.cpp
index 580e006dfe..5cd99a64a2 100644
--- a/components/esm/esmreader.cpp
+++ b/components/esm/esmreader.cpp
@@ -13,6 +13,11 @@ ESM_Context ESMReader::getContext()
     return mCtx;
 }
 
+ESMReader::ESMReader(void):
+    mBuffer(50*1024)
+{
+}
+
 void ESMReader::restoreContext(const ESM_Context &rc)
 {
     // Reopen the file if necessary
@@ -325,11 +330,21 @@ void ESMReader::getExact(void*x, int size)
 
 std::string ESMReader::getString(int size)
 {
-    char *ptr = ToUTF8::getBuffer(size);
-    mEsm->read(ptr, size);
+    size_t s = size;
+    if (mBuffer.size() <= s)
+        // Add some extra padding to reduce the chance of having to resize
+        // again later.
+        mBuffer.resize(3*s);
+
+    // And make sure the string is zero terminated
+    mBuffer[s] = 0;
+
+    // read ESM data
+    char *ptr = &mBuffer[0];
+    getExact(ptr, size);
 
     // Convert to UTF8 and return
-    return ToUTF8::getUtf8(mEncoding);
+    return mEncoder.getUtf8(ptr, size);
 }
 
 void ESMReader::fail(const std::string &msg)
@@ -350,6 +365,7 @@ void ESMReader::fail(const std::string &msg)
 void ESMReader::setEncoding(const ToUTF8::FromType& encoding)
 {
   mEncoding = encoding;
+  mEncoder.setEncoding(encoding);
 }
 
 }
diff --git a/components/esm/esmreader.hpp b/components/esm/esmreader.hpp
index 1d0f6f5806..57503aea77 100644
--- a/components/esm/esmreader.hpp
+++ b/components/esm/esmreader.hpp
@@ -20,6 +20,8 @@ class ESMReader
 {
 public:
 
+  ESMReader(void);
+
   /*************************************************************************
    *
    *  Public type definitions
@@ -244,9 +246,13 @@ private:
   // Special file signifier (see SpecialFile enum above)
   int mSpf;
 
+  // Buffer for ESM strings
+  std::vector<char> mBuffer;
+
   SaveData mSaveData;
   MasterList mMasters;
   ToUTF8::FromType mEncoding;
+  ToUTF8::Utf8Encoder mEncoder;
 };
 }
 #endif
diff --git a/components/esm/esmwriter.cpp b/components/esm/esmwriter.cpp
index c1ae064903..a00c7971d0 100644
--- a/components/esm/esmwriter.cpp
+++ b/components/esm/esmwriter.cpp
@@ -157,12 +157,8 @@ void ESMWriter::writeHString(const std::string& data)
         write("\0", 1);
     else
     {
-        char *ptr = ToUTF8::getBuffer(data.size()+1);
-        strncpy(ptr, &data[0], data.size());
-        ptr[data.size()] = '\0';
-
         // Convert to UTF8 and return
-        std::string ascii = ToUTF8::getLegacyEnc(m_encoding);
+        std::string ascii = m_encoder.getLegacyEnc(data);
 
         write(ascii.c_str(), ascii.size());
     }
@@ -207,6 +203,8 @@ void ESMWriter::setEncoding(const std::string& encoding)
         // Default Latin encoding
         m_encoding = ToUTF8::WINDOWS_1252;
     }
+
+    m_encoder.setEncoding(m_encoding);
 }
 
 }
diff --git a/components/esm/esmwriter.hpp b/components/esm/esmwriter.hpp
index d3777be813..20bc5da128 100644
--- a/components/esm/esmwriter.hpp
+++ b/components/esm/esmwriter.hpp
@@ -95,6 +95,7 @@ private:
     std::ostream* m_stream;
     std::streampos m_headerPos;
     ToUTF8::FromType m_encoding;
+    ToUTF8::Utf8Encoder m_encoder;
     int m_recordCount;
 
     HEDRstruct m_header;

From 0bdf52a0719a756f3be97a988ff33784d02f7fcc Mon Sep 17 00:00:00 2001
From: Emanuel Guevel <guevel.emanuel@gmail.com>
Date: Thu, 3 Jan 2013 23:21:14 +0100
Subject: [PATCH 5/9] components/to_utf8: keep only Utf8Encoder

---
 components/to_utf8/to_utf8.cpp | 314 ---------------------------------
 components/to_utf8/to_utf8.hpp |   9 -
 2 files changed, 323 deletions(-)

diff --git a/components/to_utf8/to_utf8.cpp b/components/to_utf8/to_utf8.cpp
index 8ac582b81d..5efec36a4d 100644
--- a/components/to_utf8/to_utf8.cpp
+++ b/components/to_utf8/to_utf8.cpp
@@ -41,13 +41,6 @@
 // Generated tables
 #include "tables_gen.hpp"
 
-// Shared global buffers, we love you. These initial sizes are large
-// enough to hold the largest books in Morrowind.esm, but we will
-// resize automaticall if necessary.
-static std::vector<char> buf    (50*1024);
-static std::vector<char> output (50*1024);
-static int size;
-
 using namespace ToUTF8;
 
 Utf8Encoder::Utf8Encoder(void):
@@ -330,313 +323,6 @@ void Utf8Encoder::copyFromArray2(const char*& chp, char* &out)
     *(out++) = ch; // Could not find glyph, just put whatever
 }
 
-static void resize(std::vector<char> &buf, size_t size)
-{
-    if(buf.size() <= size)
-        // Add some extra padding to reduce the chance of having to resize
-        // again later.
-        buf.resize(3*size);
-
-    // And make sure the string is zero terminated
-    buf[size] = 0;
-}
-
-// This is just used to spew out a reusable input buffer for the
-// conversion process.
-char *ToUTF8::getBuffer(int s)
-{
-    // Remember the requested size
-    size = s;
-    resize(buf, size);
-    return &buf[0];
-}
-
-/** Get the total length length needed to decode the given string with
-  the given translation array. The arrays are encoded with 6 bytes
-  per character, with the first giving the length and the next 5 the
-  actual data.
-
-  The function serves a dual purpose for optimization reasons: it
-  checks if the input is pure ascii (all values are <= 127). If this
-  is the case, then the ascii parameter is set to true, and the
-  caller can optimize for this case.
- */
-static size_t getLength(const char *arr, const char* input, bool &ascii)
-{
-    ascii = true;
-    size_t len = 0;
-    const char* ptr = input;
-    unsigned char inp = *ptr;
-
-    // Do away with the ascii part of the string first (this is almost
-    // always the entire string.)
-    while(inp && inp < 128)
-        inp = *(++ptr);
-    len += (ptr-input);
-
-    // If we're not at the null terminator at this point, then there
-    // were some non-ascii characters to deal with. Go to slow-mode for
-    // the rest of the string.
-    if(inp)
-    {
-        ascii = false;
-        while(inp)
-        {
-            // Find the translated length of this character in the
-            // lookup table.
-            len += arr[inp*6];
-            inp = *(++ptr);
-        }
-    }
-    return len;
-}
-
-// Translate one character 'ch' using the translation array 'arr', and
-// advance the output pointer accordingly.
-static void copyFromArray(const char *arr, unsigned char ch, char* &out)
-{
-    // Optimize for ASCII values
-    if(ch < 128)
-    {
-        *(out++) = ch;
-        return;
-    }
-
-    const char *in = arr + ch*6;
-    int len = *(in++);
-    for(int i=0; i<len; i++)
-        *(out++) = *(in++);
-}
-
-std::string ToUTF8::getUtf8(ToUTF8::FromType from)
-{
-    // Pick translation array
-    const char *arr;
-    switch (from)
-    {
-        case ToUTF8::WINDOWS_1252:
-        {
-            arr = ToUTF8::windows_1252;
-            break;
-        }
-        case ToUTF8::WINDOWS_1250:
-        {
-            arr = ToUTF8::windows_1250;
-            break;
-        }
-        case ToUTF8::WINDOWS_1251:
-        {
-            arr = ToUTF8::windows_1251;
-            break;
-        }
-        default:
-        {
-            assert(0);
-        }
-    }
-
-    // Double check that the input string stops at some point (it might
-    // contain zero terminators before this, inside its own data, which
-    // is also ok.)
-    const char* input = &buf[0];
-    assert(input[size] == 0);
-
-    // TODO: The rest of this function is designed for single-character
-    // input encodings only. It also assumes that the input the input
-    // encoding shares its first 128 values (0-127) with ASCII. These
-    // conditions must be checked again if you add more input encodings
-    // later.
-
-    // Compute output length, and check for pure ascii input at the same
-    // time.
-    bool ascii;
-    size_t outlen = getLength(arr, input, ascii);
-
-    // If we're pure ascii, then don't bother converting anything.
-    if(ascii)
-        return std::string(input, outlen);
-
-    // Make sure the output is large enough
-    resize(output, outlen);
-    char *out = &output[0];
-
-    // Translate
-    while(*input)
-        copyFromArray(arr, *(input++), out);
-
-    // Make sure that we wrote the correct number of bytes
-    assert((out-&output[0]) == (int)outlen);
-
-    // And make extra sure the output is null terminated
-    assert(output.size() > outlen);
-    assert(output[outlen] == 0);
-
-    // Return a string
-    return std::string(&output[0], outlen);
-}
-
-static size_t getLength2(const char *arr, const char* input, bool &ascii)
-{
-    ascii = true;
-    size_t len = 0;
-    const char* ptr = input;
-    unsigned char inp = *ptr;
-
-    // Do away with the ascii part of the string first (this is almost
-    // always the entire string.)
-    while(inp && inp < 128)
-        inp = *(++ptr);
-    len += (ptr-input);
-
-    // If we're not at the null terminator at this point, then there
-    // were some non-ascii characters to deal with. Go to slow-mode for
-    // the rest of the string.
-    if(inp)
-    {
-        ascii = false;
-        while(inp)
-        {
-            len += 1;
-            // Find the translated length of this character in the
-            // lookup table.
-            switch(inp)
-            {
-                case 0xe2: len -= 2; break;
-                case 0xc2:
-                case 0xcb:
-                case 0xc4:
-                case 0xc6:
-                case 0xc3:
-                case 0xd0:
-                case 0xd1:
-                case 0xd2:
-                case 0xc5: len -= 1; break;
-            }
-
-            inp = *(++ptr);
-        }
-    }
-    return len;
-}
-
-static void copyFromArray2(const char *arr, char*& chp, char* &out)
-{
-    unsigned char ch = *(chp++);
-    // Optimize for ASCII values
-    if(ch < 128)
-    {
-        *(out++) = ch;
-        return;
-    }
-
-    int len = 1;
-    switch (ch)
-    {
-        case 0xe2: len = 3; break;
-        case 0xc2:
-        case 0xcb:
-        case 0xc4:
-        case 0xc6:
-        case 0xc3:
-        case 0xd0:
-        case 0xd1:
-        case 0xd2:
-        case 0xc5: len = 2; break;
-    }
-
-    if (len == 1) // There is no 1 length utf-8 glyph that is not 0x20 (empty space)
-    {
-        *(out++) = ch;
-        return;
-    }
-
-    unsigned char ch2 = *(chp++);
-    unsigned char ch3 = '\0';
-    if (len == 3)
-        ch3 = *(chp++);
-
-    for (int i = 128; i < 256; i++)
-    {
-        unsigned char b1 = arr[i*6 + 1], b2 = arr[i*6 + 2], b3 = arr[i*6 + 3];
-        if (b1 == ch && b2 == ch2 && (len != 3 || b3 == ch3))
-        {
-            *(out++) = (char)i;
-            return;
-        }
-    }
-
-    std::cout << "Could not find glyph " << std::hex << (int)ch << " " << (int)ch2 << " " << (int)ch3 << std::endl;
-
-    *(out++) = ch; // Could not find glyph, just put whatever
-}
-
-std::string ToUTF8::getLegacyEnc(ToUTF8::FromType to)
-{
-    // Pick translation array
-    const char *arr;
-    switch (to)
-    {
-        case ToUTF8::WINDOWS_1252:
-        {
-            arr = ToUTF8::windows_1252;
-            break;
-        }
-        case ToUTF8::WINDOWS_1250:
-        {
-            arr = ToUTF8::windows_1250;
-            break;
-        }
-        case ToUTF8::WINDOWS_1251:
-        {
-            arr = ToUTF8::windows_1251;
-            break;
-        }
-        default:
-        {
-            assert(0);
-        }
-    }
-
-    // Double check that the input string stops at some point (it might
-    // contain zero terminators before this, inside its own data, which
-    // is also ok.)
-    char* input = &buf[0];
-    assert(input[size] == 0);
-
-    // TODO: The rest of this function is designed for single-character
-    // input encodings only. It also assumes that the input the input
-    // encoding shares its first 128 values (0-127) with ASCII. These
-    // conditions must be checked again if you add more input encodings
-    // later.
-
-    // Compute output length, and check for pure ascii input at the same
-    // time.
-    bool ascii;
-    size_t outlen = getLength2(arr, input, ascii);
-
-    // If we're pure ascii, then don't bother converting anything.
-    if(ascii)
-        return std::string(input, outlen);
-
-    // Make sure the output is large enough
-    resize(output, outlen);
-    char *out = &output[0];
-
-    // Translate
-    while(*input)
-        copyFromArray2(arr, input, out);
-
-    // Make sure that we wrote the correct number of bytes
-    assert((out-&output[0]) == (int)outlen);
-
-    // And make extra sure the output is null terminated
-    assert(output.size() > outlen);
-    assert(output[outlen] == 0);
-
-    // Return a string
-    return std::string(&output[0], outlen);
-}
-
 ToUTF8::FromType ToUTF8::calculateEncoding(const std::string& encodingName)
 {
     if (encodingName == "win1250")
diff --git a/components/to_utf8/to_utf8.hpp b/components/to_utf8/to_utf8.hpp
index 6877e2dc17..bfba8a1ac4 100644
--- a/components/to_utf8/to_utf8.hpp
+++ b/components/to_utf8/to_utf8.hpp
@@ -16,15 +16,6 @@ namespace ToUTF8
             // probably others)
     };
 
-    // Return a writable buffer of at least 'size' bytes. The buffer
-    // does not have to be freed.
-    char* getBuffer(int size);
-
-    // Convert the previously written buffer to UTF8 from the given code
-    // page.
-    std::string getUtf8(FromType from);
-    std::string getLegacyEnc(FromType to);
-
     FromType calculateEncoding(const std::string& encodingName);
     std::string encodingUsingMessage(const std::string& encodingName);
 

From c947d87ab9e9510e322d8cb030f42b64f6f07dc4 Mon Sep 17 00:00:00 2001
From: Emanuel Guevel <guevel.emanuel@gmail.com>
Date: Fri, 4 Jan 2013 15:10:30 +0100
Subject: [PATCH 6/9] Add a test for to_utf8 component

---
 components/to_utf8/tests/.gitignore           |  1 +
 .../to_utf8/tests/output/to_utf8_test.out     |  4 ++
 components/to_utf8/tests/test.sh              | 18 ++++++
 components/to_utf8/tests/to_utf8_test.cpp     | 61 +++++++++++++++++++
 4 files changed, 84 insertions(+)
 create mode 100644 components/to_utf8/tests/.gitignore
 create mode 100644 components/to_utf8/tests/output/to_utf8_test.out
 create mode 100755 components/to_utf8/tests/test.sh
 create mode 100644 components/to_utf8/tests/to_utf8_test.cpp

diff --git a/components/to_utf8/tests/.gitignore b/components/to_utf8/tests/.gitignore
new file mode 100644
index 0000000000..8144904045
--- /dev/null
+++ b/components/to_utf8/tests/.gitignore
@@ -0,0 +1 @@
+*_test
diff --git a/components/to_utf8/tests/output/to_utf8_test.out b/components/to_utf8/tests/output/to_utf8_test.out
new file mode 100644
index 0000000000..dcb32359ab
--- /dev/null
+++ b/components/to_utf8/tests/output/to_utf8_test.out
@@ -0,0 +1,4 @@
+original:  Без вопросов отдаете ему рулет, зная, что позже вы сможете привести с собой своих друзей и тогда он получит по заслугам?
+converted: Без вопросов отдаете ему рулет, зная, что позже вы сможете привести с собой своих друзей и тогда он получит по заслугам?
+original:  Vous lui donnez le gâteau sans protester avant d’aller chercher tous vos amis et de revenir vous venger.
+converted: Vous lui donnez le gâteau sans protester avant d’aller chercher tous vos amis et de revenir vous venger.
diff --git a/components/to_utf8/tests/test.sh b/components/to_utf8/tests/test.sh
new file mode 100755
index 0000000000..2d07708adc
--- /dev/null
+++ b/components/to_utf8/tests/test.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+# Build the *_test programs; diff each against output/<name>.out, or record it if absent.
+make || exit
+mkdir -p output
+status=0
+for a in *_test; do
+    [ -f "$a" ] && [ -x "$a" ] || continue  # an unmatched glob stays the literal "*_test"
+    if [ -f "output/$a.out" ]; then
+        echo "Running $a:"
+        "./$a" | diff "output/$a.out" - || status=1
+    else
+        echo "Creating $a.out"
+        "./$a" > "output/$a.out"
+        git add "output/$a.out"
+    fi
+done
+
+exit $status
diff --git a/components/to_utf8/tests/to_utf8_test.cpp b/components/to_utf8/tests/to_utf8_test.cpp
new file mode 100644
index 0000000000..8c25c483e0
--- /dev/null
+++ b/components/to_utf8/tests/to_utf8_test.cpp
@@ -0,0 +1,61 @@
+#include <iostream>
+#include <fstream>
+#include <cassert>
+#include <stdexcept>
+#include <typeinfo>
+
+#include "../to_utf8.hpp"
+
+std::string getFirstLine(const std::string &filename);
+void testEncoder(ToUTF8::FromType encoding, const std::string &legacyEncFile,
+                 const std::string &utf8File);
+
+/// Test character encoding conversion to and from UTF-8 (first line of each file); throws on mismatch
+void testEncoder(ToUTF8::FromType encoding, const std::string &legacyEncFile,
+                 const std::string &utf8File)
+{
+    // get some test data
+    std::string legacyEncLine = getFirstLine(legacyEncFile);
+    std::string utf8Line = getFirstLine(utf8File);
+
+    // create an encoder for specified character encoding
+    ToUTF8::Utf8Encoder encoder;
+    encoder.setEncoding(encoding);
+
+    // convert text to UTF-8
+    std::string convertedUtf8Line = encoder.getUtf8(legacyEncLine);
+
+    std::cout << "original:  " << utf8Line          << std::endl;
+    std::cout << "converted: " << convertedUtf8Line << std::endl;
+
+    // check correctness (explicit throw: assert() compiles away under NDEBUG)
+    if (convertedUtf8Line != utf8Line) throw std::runtime_error("getUtf8 mismatch for " + legacyEncFile);
+
+    // convert UTF-8 text to legacy encoding
+    std::string convertedLegacyEncLine = encoder.getLegacyEnc(utf8Line);
+    // check correctness of the reverse conversion
+    if (convertedLegacyEncLine != legacyEncLine) throw std::runtime_error("getLegacyEnc mismatch for " + utf8File);
+}
+
+std::string getFirstLine(const std::string &filename)
+{
+    std::string line;
+    std::ifstream text (filename.c_str());
+
+    if (!text.is_open())
+    {
+        throw std::runtime_error("Unable to open file " + filename);
+    }
+
+    std::getline(text, line);
+    text.close();
+
+    return line;
+}
+
+int main()
+{
+    testEncoder(ToUTF8::WINDOWS_1251, "data/russian-win1251.txt", "data/russian-utf8.txt");
+    testEncoder(ToUTF8::WINDOWS_1252, "data/french-win1252.txt", "data/french-utf8.txt");
+    return 0;
+}

From cc792da85895c85db8b76a5c9692bc260f35f649 Mon Sep 17 00:00:00 2001
From: Emanuel Guevel <guevel.emanuel@gmail.com>
Date: Fri, 4 Jan 2013 15:24:07 +0100
Subject: [PATCH 7/9] Fix to_utf8 test: add test data directory and remove
 unused include

---
 components/to_utf8/tests/test_data/french-utf8.txt     | 1 +
 components/to_utf8/tests/test_data/french-win1252.txt  | 1 +
 components/to_utf8/tests/test_data/russian-utf8.txt    | 1 +
 components/to_utf8/tests/test_data/russian-win1251.txt | 1 +
 components/to_utf8/tests/to_utf8_test.cpp              | 5 ++---
 5 files changed, 6 insertions(+), 3 deletions(-)
 create mode 100644 components/to_utf8/tests/test_data/french-utf8.txt
 create mode 100644 components/to_utf8/tests/test_data/french-win1252.txt
 create mode 100644 components/to_utf8/tests/test_data/russian-utf8.txt
 create mode 100644 components/to_utf8/tests/test_data/russian-win1251.txt

diff --git a/components/to_utf8/tests/test_data/french-utf8.txt b/components/to_utf8/tests/test_data/french-utf8.txt
new file mode 100644
index 0000000000..aaaccac737
--- /dev/null
+++ b/components/to_utf8/tests/test_data/french-utf8.txt
@@ -0,0 +1 @@
+Vous lui donnez le gâteau sans protester avant d’aller chercher tous vos amis et de revenir vous venger.
\ No newline at end of file
diff --git a/components/to_utf8/tests/test_data/french-win1252.txt b/components/to_utf8/tests/test_data/french-win1252.txt
new file mode 100644
index 0000000000..1de4593e94
--- /dev/null
+++ b/components/to_utf8/tests/test_data/french-win1252.txt
@@ -0,0 +1 @@
+Vous lui donnez le g�teau sans protester avant d�aller chercher tous vos amis et de revenir vous venger.
\ No newline at end of file
diff --git a/components/to_utf8/tests/test_data/russian-utf8.txt b/components/to_utf8/tests/test_data/russian-utf8.txt
new file mode 100644
index 0000000000..eb20b32dd6
--- /dev/null
+++ b/components/to_utf8/tests/test_data/russian-utf8.txt
@@ -0,0 +1 @@
+Без вопросов отдаете ему рулет, зная, что позже вы сможете привести с собой своих друзей и тогда он получит по заслугам?
\ No newline at end of file
diff --git a/components/to_utf8/tests/test_data/russian-win1251.txt b/components/to_utf8/tests/test_data/russian-win1251.txt
new file mode 100644
index 0000000000..086e57edd6
--- /dev/null
+++ b/components/to_utf8/tests/test_data/russian-win1251.txt
@@ -0,0 +1 @@
+��� �������� ������� ��� �����, ����, ��� ����� �� ������� �������� � ����� ����� ������ � ����� �� ������� �� ��������?
\ No newline at end of file
diff --git a/components/to_utf8/tests/to_utf8_test.cpp b/components/to_utf8/tests/to_utf8_test.cpp
index 8c25c483e0..4bba0cf90d 100644
--- a/components/to_utf8/tests/to_utf8_test.cpp
+++ b/components/to_utf8/tests/to_utf8_test.cpp
@@ -2,7 +2,6 @@
 #include <fstream>
 #include <cassert>
 #include <stdexcept>
-#include <typeinfo>
 
 #include "../to_utf8.hpp"
 
@@ -55,7 +54,7 @@ std::string getFirstLine(const std::string &filename)
 
 int main()
 {
-    testEncoder(ToUTF8::WINDOWS_1251, "data/russian-win1251.txt", "data/russian-utf8.txt");
-    testEncoder(ToUTF8::WINDOWS_1252, "data/french-win1252.txt", "data/french-utf8.txt");
+    testEncoder(ToUTF8::WINDOWS_1251, "test_data/russian-win1251.txt", "test_data/russian-utf8.txt");
+    testEncoder(ToUTF8::WINDOWS_1252, "test_data/french-win1252.txt", "test_data/french-utf8.txt");
     return 0;
 }

From 63f09462fd1395f1550df11eb400e0e81959c14b Mon Sep 17 00:00:00 2001
From: Emanuel Guevel <guevel.emanuel@gmail.com>
Date: Sun, 6 Jan 2013 01:37:58 +0100
Subject: [PATCH 8/9] to_utf8, Utf8Encoder: pass encoding as constructor
 parameter

Edit other files accordingly.
---
 apps/esmtool/esmtool.cpp               | 25 ++++++++-----------------
 apps/launcher/model/datafilesmodel.cpp |  6 ++++--
 apps/mwiniimporter/importer.cpp        |  3 +--
 apps/openmw/engine.cpp                 |  7 +++++--
 apps/openmw/mwworld/worldimp.cpp       |  4 ++--
 apps/openmw/mwworld/worldimp.hpp       |  2 +-
 components/esm/esmreader.cpp           |  7 +++----
 components/esm/esmreader.hpp           |  7 +++----
 components/esm/esmwriter.cpp           | 20 +++-----------------
 components/esm/esmwriter.hpp           |  8 +++-----
 components/to_utf8/to_utf8.cpp         | 10 ++--------
 components/to_utf8/to_utf8.hpp         |  6 +-----
 components/translation/translation.cpp |  7 +++----
 components/translation/translation.hpp |  5 ++---
 14 files changed, 41 insertions(+), 76 deletions(-)

diff --git a/apps/esmtool/esmtool.cpp b/apps/esmtool/esmtool.cpp
index 0cd6e39053..fbfc884d70 100644
--- a/apps/esmtool/esmtool.cpp
+++ b/apps/esmtool/esmtool.cpp
@@ -165,23 +165,12 @@ bool parseOptions (int argc, char** argv, Arguments &info)
 
     // Font encoding settings
     info.encoding = variables["encoding"].as<std::string>();
-    if (info.encoding == "win1250")
+    if(info.encoding != "win1250" && info.encoding != "win1251" && info.encoding != "win1252")
     {
-        std::cout << "Using Central and Eastern European font encoding." << std::endl;
-    }
-    else if (info.encoding == "win1251")
-    {
-        std::cout << "Using Cyrillic font encoding." << std::endl;
-    }
-    else
-    {
-        if(info.encoding != "win1252")
-        {
-            std::cout << info.encoding << " is not a valid encoding option." << std::endl;
-            info.encoding = "win1252";
-        }
-        std::cout << "Using default (English) font encoding." << std::endl;
+        std::cout << info.encoding << " is not a valid encoding option." << std::endl;
+        info.encoding = "win1252";
     }
+    std::cout << ToUTF8::encodingUsingMessage(info.encoding) << std::endl;
 
     return true;
 }
@@ -262,7 +251,8 @@ void printRaw(ESM::ESMReader &esm)
 int load(Arguments& info)
 {
     ESM::ESMReader& esm = info.reader;
-    esm.setEncoding(ToUTF8::calculateEncoding(info.encoding));
+    ToUTF8::Utf8Encoder encoder (ToUTF8::calculateEncoding(info.encoding));
+    esm.setEncoder(&encoder);
 
     std::string filename = info.filename;
     std::cout << "Loading file: " << filename << std::endl;
@@ -432,7 +422,8 @@ int clone(Arguments& info)
     std::cout << std::endl << "Saving records to: " << info.outname << "..." << std::endl;
 
     ESM::ESMWriter& esm = info.writer;
-    esm.setEncoding(info.encoding);
+    ToUTF8::Utf8Encoder encoder (ToUTF8::calculateEncoding(info.encoding));
+    esm.setEncoder(&encoder);
     esm.setAuthor(info.data.author);
     esm.setDescription(info.data.description);
     esm.setVersion(info.data.version);
diff --git a/apps/launcher/model/datafilesmodel.cpp b/apps/launcher/model/datafilesmodel.cpp
index 716c9e9026..e84dbe0acc 100644
--- a/apps/launcher/model/datafilesmodel.cpp
+++ b/apps/launcher/model/datafilesmodel.cpp
@@ -272,7 +272,8 @@ void DataFilesModel::addMasters(const QString &path)
     foreach (const QString &path, dir.entryList()) {
         try {
             ESM::ESMReader fileReader;
-            fileReader.setEncoding(ToUTF8::calculateEncoding(mEncoding.toStdString()));
+            ToUTF8::Utf8Encoder encoder (ToUTF8::calculateEncoding(mEncoding.toStdString()));
+            fileReader.setEncoder(&encoder);
             fileReader.open(dir.absoluteFilePath(path).toStdString());
 
             ESM::ESMReader::MasterList mlist = fileReader.getMasters();
@@ -335,7 +336,8 @@ void DataFilesModel::addPlugins(const QString &path)
 
         try {
             ESM::ESMReader fileReader;
-            fileReader.setEncoding(ToUTF8::calculateEncoding(mEncoding.toStdString()));
+            ToUTF8::Utf8Encoder encoder (ToUTF8::calculateEncoding(mEncoding.toStdString()));
+            fileReader.setEncoder(&encoder);
             fileReader.open(dir.absoluteFilePath(path).toStdString());
 
             ESM::ESMReader::MasterList mlist = fileReader.getMasters();
diff --git a/apps/mwiniimporter/importer.cpp b/apps/mwiniimporter/importer.cpp
index 5c3dedd047..6a7274e0a1 100644
--- a/apps/mwiniimporter/importer.cpp
+++ b/apps/mwiniimporter/importer.cpp
@@ -649,8 +649,7 @@ MwIniImporter::multistrmap MwIniImporter::loadIniFile(std::string filename) {
     std::string section("");
     MwIniImporter::multistrmap map;
     boost::iostreams::stream<boost::iostreams::file_source>file(filename.c_str());
-    ToUTF8::Utf8Encoder encoder;
-    encoder.setEncoding(mEncoding);
+    ToUTF8::Utf8Encoder encoder(mEncoding);
 
     std::string line;
     while (std::getline(file, line)) {
diff --git a/apps/openmw/engine.cpp b/apps/openmw/engine.cpp
index e2d28a808d..aadeb7f3a2 100644
--- a/apps/openmw/engine.cpp
+++ b/apps/openmw/engine.cpp
@@ -331,11 +331,15 @@ void OMW::Engine::go()
     // cursor replacer (converts the cursor from the bsa so they can be used by mygui)
     MWGui::CursorReplace replacer;
 
+    // Create encoder
+    ToUTF8::Utf8Encoder encoder (mEncoding);
+
     // Create the world
     mEnvironment.setWorld (new MWWorld::World (*mOgre, mFileCollections, mMaster,
-        mResDir, mCfgMgr.getCachePath(), mNewGame, mEncoding, mFallbackMap));
+        mResDir, mCfgMgr.getCachePath(), mNewGame, &encoder, mFallbackMap));
 
     //Load translation data
+    mTranslationDataStorage.setEncoder(&encoder);
     mTranslationDataStorage.loadTranslationData(mFileCollections, mMaster);
 
     // Create window manager - this manages all the MW-specific GUI windows
@@ -494,7 +498,6 @@ void OMW::Engine::showFPS(int level)
 void OMW::Engine::setEncoding(const ToUTF8::FromType& encoding)
 {
     mEncoding = encoding;
-    mTranslationDataStorage.setEncoding (encoding);
 }
 
 void OMW::Engine::setFallbackValues(std::map<std::string,std::string> fallbackMap)
diff --git a/apps/openmw/mwworld/worldimp.cpp b/apps/openmw/mwworld/worldimp.cpp
index 3995123b0e..4f60f6ef42 100644
--- a/apps/openmw/mwworld/worldimp.cpp
+++ b/apps/openmw/mwworld/worldimp.cpp
@@ -170,7 +170,7 @@ namespace MWWorld
     World::World (OEngine::Render::OgreRenderer& renderer,
         const Files::Collections& fileCollections,
         const std::string& master, const boost::filesystem::path& resDir, const boost::filesystem::path& cacheDir, bool newGame,
-        const ToUTF8::FromType& encoding, std::map<std::string,std::string> fallbackMap)
+        ToUTF8::Utf8Encoder* encoder, std::map<std::string,std::string> fallbackMap)
     : mPlayer (0), mLocalScripts (mStore), mGlobalVariables (0),
       mSky (true), mCells (mStore, mEsm),
       mNumFacing(0)
@@ -187,7 +187,7 @@ namespace MWWorld
         std::cout << "Loading ESM " << masterPath.string() << "\n";
 
         // This parses the ESM file and loads a sample cell
-        mEsm.setEncoding(encoding);
+        mEsm.setEncoder(encoder);
         mEsm.open (masterPath.string());
         mStore.load (mEsm);
 
diff --git a/apps/openmw/mwworld/worldimp.hpp b/apps/openmw/mwworld/worldimp.hpp
index 71ae9597f8..d29adebf44 100644
--- a/apps/openmw/mwworld/worldimp.hpp
+++ b/apps/openmw/mwworld/worldimp.hpp
@@ -95,7 +95,7 @@ namespace MWWorld
             World (OEngine::Render::OgreRenderer& renderer,
                 const Files::Collections& fileCollections,
                 const std::string& master, const boost::filesystem::path& resDir, const boost::filesystem::path& cacheDir, bool newGame,
-                const ToUTF8::FromType& encoding, std::map<std::string,std::string> fallbackMap);
+                ToUTF8::Utf8Encoder* encoder, std::map<std::string,std::string> fallbackMap);
 
             virtual ~World();
 
diff --git a/components/esm/esmreader.cpp b/components/esm/esmreader.cpp
index 5cd99a64a2..99f7971b18 100644
--- a/components/esm/esmreader.cpp
+++ b/components/esm/esmreader.cpp
@@ -344,7 +344,7 @@ std::string ESMReader::getString(int size)
     getExact(ptr, size);
 
     // Convert to UTF8 and return
-    return mEncoder.getUtf8(ptr, size);
+    return mEncoder->getUtf8(ptr, size);
 }
 
 void ESMReader::fail(const std::string &msg)
@@ -362,10 +362,9 @@ void ESMReader::fail(const std::string &msg)
     throw std::runtime_error(ss.str());
 }
 
-void ESMReader::setEncoding(const ToUTF8::FromType& encoding)
+void ESMReader::setEncoder(ToUTF8::Utf8Encoder* encoder)
 {
-  mEncoding = encoding;
-  mEncoder.setEncoding(encoding);
+    mEncoder = encoder;
 }
 
 }
diff --git a/components/esm/esmreader.hpp b/components/esm/esmreader.hpp
index 57503aea77..d52be25aa2 100644
--- a/components/esm/esmreader.hpp
+++ b/components/esm/esmreader.hpp
@@ -235,8 +235,8 @@ public:
   /// Used for error handling
   void fail(const std::string &msg);
 
-  /// Sets font encoding for ESM strings
-  void setEncoding(const ToUTF8::FromType& encoding);
+  /// Sets font encoder for ESM strings
+  void setEncoder(ToUTF8::Utf8Encoder* encoder);
 
 private:
   Ogre::DataStreamPtr mEsm;
@@ -251,8 +251,7 @@ private:
 
   SaveData mSaveData;
   MasterList mMasters;
-  ToUTF8::FromType mEncoding;
-  ToUTF8::Utf8Encoder mEncoder;
+  ToUTF8::Utf8Encoder* mEncoder;
 };
 }
 #endif
diff --git a/components/esm/esmwriter.cpp b/components/esm/esmwriter.cpp
index a00c7971d0..e2f878a257 100644
--- a/components/esm/esmwriter.cpp
+++ b/components/esm/esmwriter.cpp
@@ -158,7 +158,7 @@ void ESMWriter::writeHString(const std::string& data)
     else
     {
         // Convert to UTF8 and return
-        std::string ascii = m_encoder.getLegacyEnc(data);
+        std::string ascii = m_encoder->getLegacyEnc(data);
 
         write(ascii.c_str(), ascii.size());
     }
@@ -188,23 +188,9 @@ void ESMWriter::write(const char* data, int size)
     m_stream->write(data, size);
 }
 
-void ESMWriter::setEncoding(const std::string& encoding)
+void ESMWriter::setEncoder(ToUTF8::Utf8Encoder* encoder)
 {
-    if (encoding == "win1250")
-    {
-        m_encoding = ToUTF8::WINDOWS_1250;
-    }
-    else if (encoding == "win1251")
-    {
-        m_encoding = ToUTF8::WINDOWS_1251;
-    }
-    else
-    {
-        // Default Latin encoding
-        m_encoding = ToUTF8::WINDOWS_1252;
-    }
-
-    m_encoder.setEncoding(m_encoding);
+    m_encoder = encoder;
 }
 
 }
diff --git a/components/esm/esmwriter.hpp b/components/esm/esmwriter.hpp
index 20bc5da128..b557a29ad8 100644
--- a/components/esm/esmwriter.hpp
+++ b/components/esm/esmwriter.hpp
@@ -6,7 +6,7 @@
 #include <assert.h>
 
 #include "esmcommon.hpp"
-#include "../to_utf8/to_utf8.hpp"
+#include <components/to_utf8/to_utf8.hpp>
 
 namespace ESM {
 
@@ -24,7 +24,7 @@ public:
     void setVersion(int ver);
     int getType();
     void setType(int type);
-    void setEncoding(const std::string& encoding); // Write strings as UTF-8?
+    void setEncoder(ToUTF8::Utf8Encoder *encoding); // Encoder used when writing strings (see writeHString)
     void setAuthor(const std::string& author);
     void setDescription(const std::string& desc);
 
@@ -94,12 +94,10 @@ private:
     std::list<RecordData> m_records;
     std::ostream* m_stream;
     std::streampos m_headerPos;
-    ToUTF8::FromType m_encoding;
-    ToUTF8::Utf8Encoder m_encoder;
+    ToUTF8::Utf8Encoder* m_encoder;
     int m_recordCount;
 
     HEDRstruct m_header;
-    SaveData m_saveData;
 };
 
 }
diff --git a/components/to_utf8/to_utf8.cpp b/components/to_utf8/to_utf8.cpp
index 5efec36a4d..275f5483f3 100644
--- a/components/to_utf8/to_utf8.cpp
+++ b/components/to_utf8/to_utf8.cpp
@@ -43,16 +43,10 @@
 
 using namespace ToUTF8;
 
-Utf8Encoder::Utf8Encoder(void):
+Utf8Encoder::Utf8Encoder(const FromType sourceEncoding):
     mOutput(50*1024)
 {
-}
-
-void Utf8Encoder::setEncoding(const FromType sourceEncoding)
-{
-    mEncoding = sourceEncoding;
-
-    switch (mEncoding)
+    switch (sourceEncoding)
     {
         case ToUTF8::WINDOWS_1252:
         {
diff --git a/components/to_utf8/to_utf8.hpp b/components/to_utf8/to_utf8.hpp
index bfba8a1ac4..e150cf17ba 100644
--- a/components/to_utf8/to_utf8.hpp
+++ b/components/to_utf8/to_utf8.hpp
@@ -24,9 +24,7 @@ namespace ToUTF8
     class Utf8Encoder
     {
         public:
-            Utf8Encoder(void);
-
-            void setEncoding(const FromType sourceEncoding);
+            Utf8Encoder(FromType sourceEncoding);
 
             // Convert to UTF8 from the previously given code page.
             std::string getUtf8(const char *input, int size);
@@ -48,9 +46,7 @@ namespace ToUTF8
             size_t getLength2(const char* input, bool &ascii);
             void copyFromArray2(const char*& chp, char* &out);
 
-            FromType mEncoding;
             std::vector<char> mOutput;
-            int mSize;
             char* translationArray;
     };
 }
diff --git a/components/translation/translation.cpp b/components/translation/translation.cpp
index 002446e4f9..184cf399cd 100644
--- a/components/translation/translation.cpp
+++ b/components/translation/translation.cpp
@@ -50,7 +50,7 @@ namespace Translation
 
             if (!line.empty())
             {
-                line = mEncoder.getUtf8(line);
+                line = mEncoder->getUtf8(line);
 
                 size_t tab_pos = line.find('\t');
                 if (tab_pos != std::string::npos && tab_pos > 0 && tab_pos < line.size() - 1)
@@ -101,10 +101,9 @@ namespace Translation
             return phrase;
     }
 
-    void Storage::setEncoding (const ToUTF8::FromType& encoding)
+    void Storage::setEncoder(ToUTF8::Utf8Encoder* encoder)
     {
-        mEncoding = encoding;
-        mEncoder.setEncoding(encoding);
+        mEncoder = encoder;
     }
 
     bool Storage::hasTranslation() const
diff --git a/components/translation/translation.hpp b/components/translation/translation.hpp
index 6c3e4df868..bca9ea255c 100644
--- a/components/translation/translation.hpp
+++ b/components/translation/translation.hpp
@@ -19,7 +19,7 @@ namespace Translation
         // Standard form usually means nominative case
         std::string topicStandardForm(const std::string& phrase) const;
 
-        void setEncoding (const ToUTF8::FromType& encoding);
+        void setEncoder(ToUTF8::Utf8Encoder* encoder);
 
         bool hasTranslation() const;
 
@@ -34,8 +34,7 @@ namespace Translation
         void loadDataFromStream(ContainerType& container, std::istream& stream);
 
 
-        ToUTF8::FromType mEncoding;
-        ToUTF8::Utf8Encoder mEncoder;
+        ToUTF8::Utf8Encoder* mEncoder;
         ContainerType mCellNamesTranslations, mTopicIDs, mPhraseForms;
     };
 }

From 0b7d11d38d99033a09bed6b22c75378f692653a5 Mon Sep 17 00:00:00 2001
From: Emanuel Guevel <guevel.emanuel@gmail.com>
Date: Sun, 6 Jan 2013 11:39:18 +0100
Subject: [PATCH 9/9] to_utf8 test: fix Utf8Encoder constructor

---
 components/to_utf8/tests/to_utf8_test.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/components/to_utf8/tests/to_utf8_test.cpp b/components/to_utf8/tests/to_utf8_test.cpp
index 4bba0cf90d..3fcddd1581 100644
--- a/components/to_utf8/tests/to_utf8_test.cpp
+++ b/components/to_utf8/tests/to_utf8_test.cpp
@@ -18,8 +18,7 @@ void testEncoder(ToUTF8::FromType encoding, const std::string &legacyEncFile,
     std::string utf8Line = getFirstLine(utf8File);
 
     // create an encoder for specified character encoding
-    ToUTF8::Utf8Encoder encoder;
-    encoder.setEncoding(encoding);
+    ToUTF8::Utf8Encoder encoder (encoding);
 
     // convert text to UTF-8
     std::string convertedUtf8Line = encoder.getUtf8(legacyEncLine);