Merge remote-tracking branch 'potatoesmaster/to_utf8-rewrite'

pull/16/head
Marc Zinnschlag 12 years ago
commit 25815ab8f7

@ -165,23 +165,12 @@ bool parseOptions (int argc, char** argv, Arguments &info)
// Font encoding settings // Font encoding settings
info.encoding = variables["encoding"].as<std::string>(); info.encoding = variables["encoding"].as<std::string>();
if (info.encoding == "win1250") if(info.encoding != "win1250" && info.encoding != "win1251" && info.encoding != "win1252")
{ {
std::cout << "Using Central and Eastern European font encoding." << std::endl; std::cout << info.encoding << " is not a valid encoding option." << std::endl;
} info.encoding = "win1252";
else if (info.encoding == "win1251")
{
std::cout << "Using Cyrillic font encoding." << std::endl;
}
else
{
if(info.encoding != "win1252")
{
std::cout << info.encoding << " is not a valid encoding option." << std::endl;
info.encoding = "win1252";
}
std::cout << "Using default (English) font encoding." << std::endl;
} }
std::cout << ToUTF8::encodingUsingMessage(info.encoding) << std::endl;
return true; return true;
} }
@ -262,7 +251,8 @@ void printRaw(ESM::ESMReader &esm)
int load(Arguments& info) int load(Arguments& info)
{ {
ESM::ESMReader& esm = info.reader; ESM::ESMReader& esm = info.reader;
esm.setEncoding(ToUTF8::calculateEncoding(info.encoding)); ToUTF8::Utf8Encoder encoder (ToUTF8::calculateEncoding(info.encoding));
esm.setEncoder(&encoder);
std::string filename = info.filename; std::string filename = info.filename;
std::cout << "Loading file: " << filename << std::endl; std::cout << "Loading file: " << filename << std::endl;
@ -432,7 +422,8 @@ int clone(Arguments& info)
std::cout << std::endl << "Saving records to: " << info.outname << "..." << std::endl; std::cout << std::endl << "Saving records to: " << info.outname << "..." << std::endl;
ESM::ESMWriter& esm = info.writer; ESM::ESMWriter& esm = info.writer;
esm.setEncoding(info.encoding); ToUTF8::Utf8Encoder encoder (ToUTF8::calculateEncoding(info.encoding));
esm.setEncoder(&encoder);
esm.setAuthor(info.data.author); esm.setAuthor(info.data.author);
esm.setDescription(info.data.description); esm.setDescription(info.data.description);
esm.setVersion(info.data.version); esm.setVersion(info.data.version);

@ -272,7 +272,8 @@ void DataFilesModel::addMasters(const QString &path)
foreach (const QString &path, dir.entryList()) { foreach (const QString &path, dir.entryList()) {
try { try {
ESM::ESMReader fileReader; ESM::ESMReader fileReader;
fileReader.setEncoding(ToUTF8::calculateEncoding(mEncoding.toStdString())); ToUTF8::Utf8Encoder encoder (ToUTF8::calculateEncoding(mEncoding.toStdString()));
fileReader.setEncoder(&encoder);
fileReader.open(dir.absoluteFilePath(path).toStdString()); fileReader.open(dir.absoluteFilePath(path).toStdString());
ESM::ESMReader::MasterList mlist = fileReader.getMasters(); ESM::ESMReader::MasterList mlist = fileReader.getMasters();
@ -335,7 +336,8 @@ void DataFilesModel::addPlugins(const QString &path)
try { try {
ESM::ESMReader fileReader; ESM::ESMReader fileReader;
fileReader.setEncoding(ToUTF8::calculateEncoding(mEncoding.toStdString())); ToUTF8::Utf8Encoder encoder (ToUTF8::calculateEncoding(mEncoding.toStdString()));
fileReader.setEncoder(&encoder);
fileReader.open(dir.absoluteFilePath(path).toStdString()); fileReader.open(dir.absoluteFilePath(path).toStdString());
ESM::ESMReader::MasterList mlist = fileReader.getMasters(); ESM::ESMReader::MasterList mlist = fileReader.getMasters();

@ -649,11 +649,12 @@ MwIniImporter::multistrmap MwIniImporter::loadIniFile(std::string filename) {
std::string section(""); std::string section("");
MwIniImporter::multistrmap map; MwIniImporter::multistrmap map;
boost::iostreams::stream<boost::iostreams::file_source>file(filename.c_str()); boost::iostreams::stream<boost::iostreams::file_source>file(filename.c_str());
ToUTF8::Utf8Encoder encoder(mEncoding);
std::string line; std::string line;
while (std::getline(file, line)) { while (std::getline(file, line)) {
line = toUTF8(line); line = encoder.getUtf8(line);
// unify Unix-style and Windows file ending // unify Unix-style and Windows file ending
if (!(line.empty()) && (line[line.length()-1]) == '\r') { if (!(line.empty()) && (line[line.length()-1]) == '\r') {
@ -829,14 +830,6 @@ void MwIniImporter::writeToFile(boost::iostreams::stream<boost::iostreams::file_
} }
} }
std::string MwIniImporter::toUTF8(const std::string &str) {
char *ptr = ToUTF8::getBuffer(str.length());
strncpy(ptr, str.c_str(), str.length());
// Convert to UTF8 and return
return ToUTF8::getUtf8(mEncoding);
}
void MwIniImporter::setInputEncoding(const ToUTF8::FromType &encoding) void MwIniImporter::setInputEncoding(const ToUTF8::FromType &encoding)
{ {
mEncoding = encoding; mEncoding = encoding;

@ -8,7 +8,7 @@
#include <vector> #include <vector>
#include <exception> #include <exception>
#include "../../components/to_utf8/to_utf8.hpp" #include <components/to_utf8/to_utf8.hpp>
class MwIniImporter { class MwIniImporter {
public: public:

@ -331,11 +331,15 @@ void OMW::Engine::go()
// cursor replacer (converts the cursor from the bsa so they can be used by mygui) // cursor replacer (converts the cursor from the bsa so they can be used by mygui)
MWGui::CursorReplace replacer; MWGui::CursorReplace replacer;
// Create encoder
ToUTF8::Utf8Encoder encoder (mEncoding);
// Create the world // Create the world
mEnvironment.setWorld (new MWWorld::World (*mOgre, mFileCollections, mMaster, mEnvironment.setWorld (new MWWorld::World (*mOgre, mFileCollections, mMaster,
mResDir, mCfgMgr.getCachePath(), mNewGame, mEncoding, mFallbackMap)); mResDir, mCfgMgr.getCachePath(), mNewGame, &encoder, mFallbackMap));
//Load translation data //Load translation data
mTranslationDataStorage.setEncoder(&encoder);
mTranslationDataStorage.loadTranslationData(mFileCollections, mMaster); mTranslationDataStorage.loadTranslationData(mFileCollections, mMaster);
// Create window manager - this manages all the MW-specific GUI windows // Create window manager - this manages all the MW-specific GUI windows
@ -494,7 +498,6 @@ void OMW::Engine::showFPS(int level)
void OMW::Engine::setEncoding(const ToUTF8::FromType& encoding) void OMW::Engine::setEncoding(const ToUTF8::FromType& encoding)
{ {
mEncoding = encoding; mEncoding = encoding;
mTranslationDataStorage.setEncoding (encoding);
} }
void OMW::Engine::setFallbackValues(std::map<std::string,std::string> fallbackMap) void OMW::Engine::setFallbackValues(std::map<std::string,std::string> fallbackMap)

@ -170,7 +170,7 @@ namespace MWWorld
World::World (OEngine::Render::OgreRenderer& renderer, World::World (OEngine::Render::OgreRenderer& renderer,
const Files::Collections& fileCollections, const Files::Collections& fileCollections,
const std::string& master, const boost::filesystem::path& resDir, const boost::filesystem::path& cacheDir, bool newGame, const std::string& master, const boost::filesystem::path& resDir, const boost::filesystem::path& cacheDir, bool newGame,
const ToUTF8::FromType& encoding, std::map<std::string,std::string> fallbackMap) ToUTF8::Utf8Encoder* encoder, std::map<std::string,std::string> fallbackMap)
: mPlayer (0), mLocalScripts (mStore), mGlobalVariables (0), : mPlayer (0), mLocalScripts (mStore), mGlobalVariables (0),
mSky (true), mCells (mStore, mEsm), mSky (true), mCells (mStore, mEsm),
mNumFacing(0) mNumFacing(0)
@ -187,7 +187,7 @@ namespace MWWorld
std::cout << "Loading ESM " << masterPath.string() << "\n"; std::cout << "Loading ESM " << masterPath.string() << "\n";
// This parses the ESM file and loads a sample cell // This parses the ESM file and loads a sample cell
mEsm.setEncoding(encoding); mEsm.setEncoder(encoder);
mEsm.open (masterPath.string()); mEsm.open (masterPath.string());
mStore.load (mEsm); mStore.load (mEsm);

@ -95,7 +95,7 @@ namespace MWWorld
World (OEngine::Render::OgreRenderer& renderer, World (OEngine::Render::OgreRenderer& renderer,
const Files::Collections& fileCollections, const Files::Collections& fileCollections,
const std::string& master, const boost::filesystem::path& resDir, const boost::filesystem::path& cacheDir, bool newGame, const std::string& master, const boost::filesystem::path& resDir, const boost::filesystem::path& cacheDir, bool newGame,
const ToUTF8::FromType& encoding, std::map<std::string,std::string> fallbackMap); ToUTF8::Utf8Encoder* encoder, std::map<std::string,std::string> fallbackMap);
virtual ~World(); virtual ~World();

@ -15,6 +15,11 @@ ESM_Context ESMReader::getContext()
return mCtx; return mCtx;
} }
/// Construct a reader with a pre-sized string buffer and no encoder attached.
///
/// getString() converts through mEncoder, so setEncoder() has to be called
/// before any string is read. The pointer is explicitly nulled here so that a
/// missing setEncoder() call fails deterministically instead of dereferencing
/// an indeterminate pointer.
ESMReader::ESMReader():
    mBuffer(50*1024), // initial capacity; getString() grows it on demand
    mEncoder(0)
{
}
void ESMReader::restoreContext(const ESM_Context &rc) void ESMReader::restoreContext(const ESM_Context &rc)
{ {
// Reopen the file if necessary // Reopen the file if necessary
@ -323,11 +328,21 @@ void ESMReader::getExact(void*x, int size)
std::string ESMReader::getString(int size) std::string ESMReader::getString(int size)
{ {
char *ptr = ToUTF8::getBuffer(size); size_t s = size;
mEsm->read(ptr, size); if (mBuffer.size() <= s)
// Add some extra padding to reduce the chance of having to resize
// again later.
mBuffer.resize(3*s);
// And make sure the string is zero terminated
mBuffer[s] = 0;
// read ESM data
char *ptr = &mBuffer[0];
getExact(ptr, size);
// Convert to UTF8 and return // Convert to UTF8 and return
return ToUTF8::getUtf8(mEncoding); return mEncoder->getUtf8(ptr, size);
} }
void ESMReader::fail(const std::string &msg) void ESMReader::fail(const std::string &msg)
@ -345,9 +360,9 @@ void ESMReader::fail(const std::string &msg)
throw std::runtime_error(ss.str()); throw std::runtime_error(ss.str());
} }
void ESMReader::setEncoding(const ToUTF8::FromType& encoding) void ESMReader::setEncoder(ToUTF8::Utf8Encoder* encoder)
{ {
mEncoding = encoding; mEncoder = encoder;
} }
} }

@ -20,6 +20,8 @@ class ESMReader
{ {
public: public:
ESMReader(void);
/************************************************************************* /*************************************************************************
* *
* Public type definitions * Public type definitions
@ -233,8 +235,8 @@ public:
/// Used for error handling /// Used for error handling
void fail(const std::string &msg); void fail(const std::string &msg);
/// Sets font encoding for ESM strings /// Sets font encoder for ESM strings
void setEncoding(const ToUTF8::FromType& encoding); void setEncoder(ToUTF8::Utf8Encoder* encoder);
private: private:
Ogre::DataStreamPtr mEsm; Ogre::DataStreamPtr mEsm;
@ -244,9 +246,12 @@ private:
// Special file signifier (see SpecialFile enum above) // Special file signifier (see SpecialFile enum above)
int mSpf; int mSpf;
// Buffer for ESM strings
std::vector<char> mBuffer;
SaveData mSaveData; SaveData mSaveData;
MasterList mMasters; MasterList mMasters;
ToUTF8::FromType mEncoding; ToUTF8::Utf8Encoder* mEncoder;
}; };
} }
#endif #endif

@ -157,12 +157,8 @@ void ESMWriter::writeHString(const std::string& data)
write("\0", 1); write("\0", 1);
else else
{ {
char *ptr = ToUTF8::getBuffer(data.size()+1);
strncpy(ptr, &data[0], data.size());
ptr[data.size()] = '\0';
// Convert to UTF8 and return // Convert to UTF8 and return
std::string ascii = ToUTF8::getLegacyEnc(m_encoding); std::string ascii = m_encoder->getLegacyEnc(data);
write(ascii.c_str(), ascii.size()); write(ascii.c_str(), ascii.size());
} }
@ -192,21 +188,9 @@ void ESMWriter::write(const char* data, int size)
m_stream->write(data, size); m_stream->write(data, size);
} }
void ESMWriter::setEncoding(const std::string& encoding) void ESMWriter::setEncoder(ToUTF8::Utf8Encoder* encoder)
{ {
if (encoding == "win1250") m_encoder = encoder;
{
m_encoding = ToUTF8::WINDOWS_1250;
}
else if (encoding == "win1251")
{
m_encoding = ToUTF8::WINDOWS_1251;
}
else
{
// Default Latin encoding
m_encoding = ToUTF8::WINDOWS_1252;
}
} }
} }

@ -6,7 +6,7 @@
#include <assert.h> #include <assert.h>
#include "esmcommon.hpp" #include "esmcommon.hpp"
#include "../to_utf8/to_utf8.hpp" #include <components/to_utf8/to_utf8.hpp>
namespace ESM { namespace ESM {
@ -24,7 +24,7 @@ public:
void setVersion(int ver); void setVersion(int ver);
int getType(); int getType();
void setType(int type); void setType(int type);
void setEncoding(const std::string& encoding); // Write strings as UTF-8? void setEncoder(ToUTF8::Utf8Encoder *encoding); // Write strings as UTF-8?
void setAuthor(const std::string& author); void setAuthor(const std::string& author);
void setDescription(const std::string& desc); void setDescription(const std::string& desc);
@ -94,11 +94,10 @@ private:
std::list<RecordData> m_records; std::list<RecordData> m_records;
std::ostream* m_stream; std::ostream* m_stream;
std::streampos m_headerPos; std::streampos m_headerPos;
ToUTF8::FromType m_encoding; ToUTF8::Utf8Encoder* m_encoder;
int m_recordCount; int m_recordCount;
HEDRstruct m_header; HEDRstruct m_header;
SaveData m_saveData;
}; };
} }

@ -0,0 +1,4 @@
original: Без вопросов отдаете ему рулет, зная, что позже вы сможете привести с собой своих друзей и тогда он получит по заслугам?
converted: Без вопросов отдаете ему рулет, зная, что позже вы сможете привести с собой своих друзей и тогда он получит по заслугам?
original: Vous lui donnez le gâteau sans protester avant daller chercher tous vos amis et de revenir vous venger.
converted: Vous lui donnez le gâteau sans protester avant daller chercher tous vos amis et de revenir vous venger.

@ -0,0 +1,18 @@
#!/bin/bash
# Build and run every *_test program found in the current directory.
#
# For each test program:
#   - if output/<name>.out already exists, run the program and diff its
#     standard output against that stored reference;
#   - otherwise run the program, store its standard output as the new
#     reference in output/<name>.out and stage that file with git.

# Build first; on failure stop with make's exit status.
make || exit

mkdir -p output

for a in *_test; do
    # When nothing matches, bash leaves the pattern '*_test' unexpanded.
    # Skip that literal (and anything that is not an executable regular
    # file) so no bogus "output/*_test.out" gets created and staged.
    if [ ! -f "$a" ] || [ ! -x "$a" ]; then
        continue
    fi

    if [ -f "output/$a.out" ]; then
        echo "Running $a:"
        "./$a" | diff "output/$a.out" -
    else
        echo "Creating $a.out"
        "./$a" > "output/$a.out"
        git add "output/$a.out"
    fi
done

@ -0,0 +1 @@
Vous lui donnez le gâteau sans protester avant daller chercher tous vos amis et de revenir vous venger.

@ -0,0 +1 @@
Vous lui donnez le gâteau sans protester avant daller chercher tous vos amis et de revenir vous venger.

@ -0,0 +1 @@
Без вопросов отдаете ему рулет, зная, что позже вы сможете привести с собой своих друзей и тогда он получит по заслугам?

@ -0,0 +1 @@
Без вопросов отдаете ему рулет, зная, что позже вы сможете привести с собой своих друзей и тогда он получит по заслугам?

@ -0,0 +1,59 @@
#include <iostream>
#include <fstream>
#include <cassert>
#include <stdexcept>
#include "../to_utf8.hpp"
std::string getFirstLine(const std::string &filename);
void testEncoder(ToUTF8::FromType encoding, const std::string &legacyEncFile,
const std::string &utf8File);
/// Test character encoding conversion to and from UTF-8
///
/// Reads the first line of \a legacyEncFile and of \a utf8File, converts the
/// legacy line to UTF-8 and the UTF-8 line back to the legacy encoding, and
/// checks that each conversion reproduces the other file's line. The expected
/// and converted UTF-8 lines are printed to standard output.
///
/// \param encoding      code page used to construct the encoder
/// \param legacyEncFile path of the file holding the legacy-encoded text
/// \param utf8File      path of the file holding the same text in UTF-8
/// \throws std::runtime_error if a file cannot be opened (from getFirstLine)
///         or if a conversion result does not match the expected text
void testEncoder(ToUTF8::FromType encoding, const std::string &legacyEncFile,
                 const std::string &utf8File)
{
    // get some test data
    std::string legacyEncLine = getFirstLine(legacyEncFile);
    std::string utf8Line = getFirstLine(utf8File);

    // create an encoder for specified character encoding
    ToUTF8::Utf8Encoder encoder (encoding);

    // convert text to UTF-8
    std::string convertedUtf8Line = encoder.getUtf8(legacyEncLine);

    std::cout << "original:  " << utf8Line          << std::endl;
    std::cout << "converted: " << convertedUtf8Line << std::endl;

    // check correctness; an explicit check is used instead of assert() so the
    // test still fails when the build defines NDEBUG
    if (convertedUtf8Line != utf8Line)
        throw std::runtime_error("Conversion to UTF-8 does not match " + utf8File);

    // convert UTF-8 text to legacy encoding
    std::string convertedLegacyEncLine = encoder.getLegacyEnc(utf8Line);

    // check correctness
    if (convertedLegacyEncLine != legacyEncLine)
        throw std::runtime_error("Conversion from UTF-8 does not match " + legacyEncFile);
}
/// Return the first line of the file at \a filename, without its line
/// terminator. An empty file yields an empty string.
///
/// \throws std::runtime_error if the file cannot be opened
std::string getFirstLine(const std::string &filename)
{
    std::ifstream input(filename.c_str());
    if (!input.is_open())
        throw std::runtime_error("Unable to open file " + filename);

    std::string firstLine;
    std::getline(input, firstLine);

    // the stream closes the file when it goes out of scope
    return firstLine;
}
int main()
{
testEncoder(ToUTF8::WINDOWS_1251, "test_data/russian-win1251.txt", "test_data/russian-utf8.txt");
testEncoder(ToUTF8::WINDOWS_1252, "test_data/french-win1252.txt", "test_data/french-utf8.txt");
return 0;
}

@ -2,6 +2,8 @@
#include <vector> #include <vector>
#include <cassert> #include <cassert>
#include <iostream>
#include <iomanip>
/* This file contains the code to translate from WINDOWS-1252 (native /* This file contains the code to translate from WINDOWS-1252 (native
charset used in English version of Morrowind) to UTF-8. The library charset used in English version of Morrowind) to UTF-8. The library
@ -39,341 +41,298 @@
// Generated tables // Generated tables
#include "tables_gen.hpp" #include "tables_gen.hpp"
// Shared global buffers, we love you. These initial sizes are large using namespace ToUTF8;
// enough to hold the largest books in Morrowind.esm, but we will
// resize automaticall if necessary.
static std::vector<char> buf (50*1024);
static std::vector<char> output (50*1024);
static int size;
// Make sure the given vector is large enough for 'size' bytes, Utf8Encoder::Utf8Encoder(const FromType sourceEncoding):
// including a terminating zero after it. mOutput(50*1024)
static void resize(std::vector<char> &buf, size_t size)
{ {
if(buf.size() <= size) switch (sourceEncoding)
// Add some extra padding to reduce the chance of having to resize {
// again later. case ToUTF8::WINDOWS_1252:
buf.resize(3*size); {
translationArray = ToUTF8::windows_1252;
break;
}
case ToUTF8::WINDOWS_1250:
{
translationArray = ToUTF8::windows_1250;
break;
}
case ToUTF8::WINDOWS_1251:
{
translationArray = ToUTF8::windows_1251;
break;
}
default:
{
assert(0);
}
}
}
// And make sure the string is zero terminated std::string Utf8Encoder::getUtf8(const char* input, int size)
buf[size] = 0; {
// Double check that the input string stops at some point (it might
// contain zero terminators before this, inside its own data, which
// is also ok.)
assert(input[size] == 0);
// TODO: The rest of this function is designed for single-character
// input encodings only. It also assumes that the input
// encoding shares its first 128 values (0-127) with ASCII. These
// conditions must be checked again if you add more input encodings
// later.
// Compute output length, and check for pure ascii input at the same
// time.
bool ascii;
size_t outlen = getLength(input, ascii);
// If we're pure ascii, then don't bother converting anything.
if(ascii)
return std::string(input, outlen);
// Make sure the output is large enough
resize(outlen);
char *out = &mOutput[0];
// Translate
while (*input)
copyFromArray(*(input++), out);
// Make sure that we wrote the correct number of bytes
assert((out-&mOutput[0]) == (int)outlen);
// And make extra sure the output is null terminated
assert(mOutput.size() > outlen);
assert(mOutput[outlen] == 0);
// Return a string
return std::string(&mOutput[0], outlen);
} }
// This is just used to spew out a reusable input buffer for the std::string Utf8Encoder::getLegacyEnc(const char *input, int size)
// conversion process.
char *ToUTF8::getBuffer(int s)
{ {
// Remember the requested size // Double check that the input string stops at some point (it might
size = s; // contain zero terminators before this, inside its own data, which
resize(buf, size); // is also ok.)
return &buf[0]; assert(input[size] == 0);
// TODO: The rest of this function is designed for single-character
// input encodings only. It also assumes that the input
// encoding shares its first 128 values (0-127) with ASCII. These
// conditions must be checked again if you add more input encodings
// later.
// Compute output length, and check for pure ascii input at the same
// time.
bool ascii;
size_t outlen = getLength2(input, ascii);
// If we're pure ascii, then don't bother converting anything.
if(ascii)
return std::string(input, outlen);
// Make sure the output is large enough
resize(outlen);
char *out = &mOutput[0];
// Translate
while(*input)
copyFromArray2(input, out);
// Make sure that we wrote the correct number of bytes
assert((out-&mOutput[0]) == (int)outlen);
// And make extra sure the output is null terminated
assert(mOutput.size() > outlen);
assert(mOutput[outlen] == 0);
// Return a string
return std::string(&mOutput[0], outlen);
}
// Grow the output vector so it can hold 'size' bytes followed by a
// terminating zero, then write that terminator at index 'size'.
void Utf8Encoder::resize(size_t size)
{
    const bool needsGrowth = !(size < mOutput.size());
    if (needsGrowth)
    {
        // Over-allocate to three times the request so that later requests
        // are less likely to force another resize.
        mOutput.resize(3*size);
    }

    // Zero-terminate the string at the requested length.
    mOutput[size] = 0;
}
/** Get the total length length needed to decode the given string with /** Get the total length length needed to decode the given string with
the given translation array. The arrays are encoded with 6 bytes the given translation array. The arrays are encoded with 6 bytes
per character, with the first giving the length and the next 5 the per character, with the first giving the length and the next 5 the
actual data. actual data.
The function serves a dual purpose for optimization reasons: it The function serves a dual purpose for optimization reasons: it
checks if the input is pure ascii (all values are <= 127). If this checks if the input is pure ascii (all values are <= 127). If this
is the case, then the ascii parameter is set to true, and the is the case, then the ascii parameter is set to true, and the
caller can optimize for this case. caller can optimize for this case.
*/ */
static size_t getLength(const char *arr, const char* input, bool &ascii) size_t Utf8Encoder::getLength(const char* input, bool &ascii)
{ {
ascii = true; ascii = true;
size_t len = 0; size_t len = 0;
const char* ptr = input; const char* ptr = input;
unsigned char inp = *ptr; unsigned char inp = *ptr;
// Do away with the ascii part of the string first (this is almost // Do away with the ascii part of the string first (this is almost
// always the entire string.) // always the entire string.)
while(inp && inp < 128) while (inp && inp < 128)
inp = *(++ptr); inp = *(++ptr);
len += (ptr-input); len += (ptr-input);
// If we're not at the null terminator at this point, then there // If we're not at the null terminator at this point, then there
// were some non-ascii characters to deal with. Go to slow-mode for // were some non-ascii characters to deal with. Go to slow-mode for
// the rest of the string. // the rest of the string.
if(inp) if (inp)
{ {
ascii = false; ascii = false;
while(inp) while (inp)
{ {
// Find the translated length of this character in the // Find the translated length of this character in the
// lookup table. // lookup table.
len += arr[inp*6]; len += translationArray[inp*6];
inp = *(++ptr); inp = *(++ptr);
} }
} }
return len; return len;
} }
// Translate one character 'ch' using the translation array 'arr', and // Translate one character 'ch' using the translation array 'arr', and
// advance the output pointer accordingly. // advance the output pointer accordingly.
static void copyFromArray(const char *arr, unsigned char ch, char* &out) void Utf8Encoder::copyFromArray(unsigned char ch, char* &out)
{ {
// Optimize for ASCII values // Optimize for ASCII values
if(ch < 128) if (ch < 128)
{ {
*(out++) = ch; *(out++) = ch;
return; return;
} }
const char *in = arr + ch*6; const char *in = translationArray + ch*6;
int len = *(in++); int len = *(in++);
for(int i=0; i<len; i++) for (int i=0; i<len; i++)
*(out++) = *(in++); *(out++) = *(in++);
}
std::string ToUTF8::getUtf8(ToUTF8::FromType from)
{
// Pick translation array
const char *arr;
switch (from)
{
case ToUTF8::WINDOWS_1252:
{
arr = ToUTF8::windows_1252;
break;
}
case ToUTF8::WINDOWS_1250:
{
arr = ToUTF8::windows_1250;
break;
}
case ToUTF8::WINDOWS_1251:
{
arr = ToUTF8::windows_1251;
break;
}
default:
{
assert(0);
}
}
// Double check that the input string stops at some point (it might
// contain zero terminators before this, inside its own data, which
// is also ok.)
const char* input = &buf[0];
assert(input[size] == 0);
// TODO: The rest of this function is designed for single-character
// input encodings only. It also assumes that the input the input
// encoding shares its first 128 values (0-127) with ASCII. These
// conditions must be checked again if you add more input encodings
// later.
// Compute output length, and check for pure ascii input at the same
// time.
bool ascii;
size_t outlen = getLength(arr, input, ascii);
// If we're pure ascii, then don't bother converting anything.
if(ascii)
return std::string(input, outlen);
// Make sure the output is large enough
resize(output, outlen);
char *out = &output[0];
// Translate
while(*input)
copyFromArray(arr, *(input++), out);
// Make sure that we wrote the correct number of bytes
assert((out-&output[0]) == (int)outlen);
// And make extra sure the output is null terminated
assert(output.size() > outlen);
assert(output[outlen] == 0);
// Return a string
return std::string(&output[0], outlen);
} }
static size_t getLength2(const char *arr, const char* input, bool &ascii) size_t Utf8Encoder::getLength2(const char* input, bool &ascii)
{ {
ascii = true; ascii = true;
size_t len = 0; size_t len = 0;
const char* ptr = input; const char* ptr = input;
unsigned char inp = *ptr; unsigned char inp = *ptr;
// Do away with the ascii part of the string first (this is almost // Do away with the ascii part of the string first (this is almost
// always the entire string.) // always the entire string.)
while(inp && inp < 128) while (inp && inp < 128)
inp = *(++ptr); inp = *(++ptr);
len += (ptr-input); len += (ptr-input);
// If we're not at the null terminator at this point, then there // If we're not at the null terminator at this point, then there
// were some non-ascii characters to deal with. Go to slow-mode for // were some non-ascii characters to deal with. Go to slow-mode for
// the rest of the string. // the rest of the string.
if(inp) if (inp)
{ {
ascii = false; ascii = false;
while(inp) while(inp)
{ {
len += 1; len += 1;
// Find the translated length of this character in the // Find the translated length of this character in the
// lookup table. // lookup table.
switch(inp) switch(inp)
{ {
case 0xe2: len -= 2; break; case 0xe2: len -= 2; break;
case 0xc2: case 0xc2:
case 0xcb: case 0xcb:
case 0xc4: case 0xc4:
case 0xc6: case 0xc6:
case 0xc3: case 0xc3:
case 0xd0: case 0xd0:
case 0xd1: case 0xd1:
case 0xd2: case 0xd2:
case 0xc5: len -= 1; break; case 0xc5: len -= 1; break;
} }
inp = *(++ptr); inp = *(++ptr);
} }
} }
return len; return len;
} }
#include <iostream> void Utf8Encoder::copyFromArray2(const char*& chp, char* &out)
#include <iomanip>
static void copyFromArray2(const char *arr, char*& chp, char* &out)
{ {
unsigned char ch = *(chp++); unsigned char ch = *(chp++);
// Optimize for ASCII values // Optimize for ASCII values
if(ch < 128) if (ch < 128)
{ {
*(out++) = ch; *(out++) = ch;
return; return;
} }
int len = 1; int len = 1;
switch (ch) switch (ch)
{
case 0xe2: len = 3; break;
case 0xc2:
case 0xcb:
case 0xc4:
case 0xc6:
case 0xc3:
case 0xd0:
case 0xd1:
case 0xd2:
case 0xc5: len = 2; break;
}
if (len == 1) // There is no 1 length utf-8 glyph that is not 0x20 (empty space)
{
*(out++) = ch;
return;
}
unsigned char ch2 = *(chp++);
unsigned char ch3 = '\0';
if (len == 3)
ch3 = *(chp++);
for (int i = 128; i < 256; i++)
{
unsigned char b1 = arr[i*6 + 1], b2 = arr[i*6 + 2], b3 = arr[i*6 + 3];
if (b1 == ch && b2 == ch2 && (len != 3 || b3 == ch3))
{
*(out++) = (char)i;
return;
}
}
std::cout << "Could not find glyph " << std::hex << (int)ch << " " << (int)ch2 << " " << (int)ch3 << std::endl;
*(out++) = ch; // Could not find glyph, just put whatever
}
std::string ToUTF8::getLegacyEnc(ToUTF8::FromType to)
{
// Pick translation array
const char *arr;
switch (to)
{
case ToUTF8::WINDOWS_1252:
{
arr = ToUTF8::windows_1252;
break;
}
case ToUTF8::WINDOWS_1250:
{ {
arr = ToUTF8::windows_1250; case 0xe2: len = 3; break;
break; case 0xc2:
case 0xcb:
case 0xc4:
case 0xc6:
case 0xc3:
case 0xd0:
case 0xd1:
case 0xd2:
case 0xc5: len = 2; break;
} }
case ToUTF8::WINDOWS_1251:
if (len == 1) // There is no 1 length utf-8 glyph that is not 0x20 (empty space)
{ {
arr = ToUTF8::windows_1251; *(out++) = ch;
break; return;
} }
default:
unsigned char ch2 = *(chp++);
unsigned char ch3 = '\0';
if (len == 3)
ch3 = *(chp++);
for (int i = 128; i < 256; i++)
{ {
assert(0); unsigned char b1 = translationArray[i*6 + 1], b2 = translationArray[i*6 + 2], b3 = translationArray[i*6 + 3];
if (b1 == ch && b2 == ch2 && (len != 3 || b3 == ch3))
{
*(out++) = (char)i;
return;
}
} }
}
std::cout << "Could not find glyph " << std::hex << (int)ch << " " << (int)ch2 << " " << (int)ch3 << std::endl;
// Double check that the input string stops at some point (it might
// contain zero terminators before this, inside its own data, which *(out++) = ch; // Could not find glyph, just put whatever
// is also ok.)
char* input = &buf[0];
assert(input[size] == 0);
// TODO: The rest of this function is designed for single-character
// input encodings only. It also assumes that the input the input
// encoding shares its first 128 values (0-127) with ASCII. These
// conditions must be checked again if you add more input encodings
// later.
// Compute output length, and check for pure ascii input at the same
// time.
bool ascii;
size_t outlen = getLength2(arr, input, ascii);
// If we're pure ascii, then don't bother converting anything.
if(ascii)
return std::string(input, outlen);
// Make sure the output is large enough
resize(output, outlen);
char *out = &output[0];
// Translate
while(*input)
copyFromArray2(arr, input, out);
// Make sure that we wrote the correct number of bytes
assert((out-&output[0]) == (int)outlen);
// And make extra sure the output is null terminated
assert(output.size() > outlen);
assert(output[outlen] == 0);
// Return a string
return std::string(&output[0], outlen);
} }
ToUTF8::FromType ToUTF8::calculateEncoding(const std::string& encodingName) ToUTF8::FromType ToUTF8::calculateEncoding(const std::string& encodingName)
{ {
if (encodingName == "win1250") if (encodingName == "win1250")
return ToUTF8::WINDOWS_1250; return ToUTF8::WINDOWS_1250;
else if (encodingName == "win1251") else if (encodingName == "win1251")
return ToUTF8::WINDOWS_1251; return ToUTF8::WINDOWS_1251;
else else
return ToUTF8::WINDOWS_1252; return ToUTF8::WINDOWS_1252;
} }
std::string ToUTF8::encodingUsingMessage(const std::string& encodingName) std::string ToUTF8::encodingUsingMessage(const std::string& encodingName)
{ {
if (encodingName == "win1250") if (encodingName == "win1250")
return "Using Central and Eastern European font encoding."; return "Using Central and Eastern European font encoding.";
else if (encodingName == "win1251") else if (encodingName == "win1251")
return "Using Cyrillic font encoding."; return "Using Cyrillic font encoding.";
else else
return "Using default (English) font encoding."; return "Using default (English) font encoding.";
} }

@ -2,29 +2,53 @@
#define COMPONENTS_TOUTF8_H #define COMPONENTS_TOUTF8_H
#include <string> #include <string>
#include <cstring>
#include <vector>
namespace ToUTF8 namespace ToUTF8
{ {
// These are all the currently supported code pages // These are all the currently supported code pages
enum FromType enum FromType
{ {
WINDOWS_1250, // Central ane Eastern European languages WINDOWS_1250, // Central ane Eastern European languages
WINDOWS_1251, // Cyrillic languages WINDOWS_1251, // Cyrillic languages
WINDOWS_1252 // Used by English version of Morrowind (and WINDOWS_1252 // Used by English version of Morrowind (and
// probably others) // probably others)
}; };
// Return a writable buffer of at least 'size' bytes. The buffer FromType calculateEncoding(const std::string& encodingName);
// does not have to be freed. std::string encodingUsingMessage(const std::string& encodingName);
char* getBuffer(int size);
// Convert the previously written buffer to UTF8 from the given code // class
// page.
std::string getUtf8(FromType from);
std::string getLegacyEnc(FromType to);
FromType calculateEncoding(const std::string& encodingName); class Utf8Encoder
std::string encodingUsingMessage(const std::string& encodingName); {
public:
Utf8Encoder(FromType sourceEncoding);
// Convert to UTF8 from the previously given code page.
std::string getUtf8(const char *input, int size);
inline std::string getUtf8(const std::string &str)
{
return getUtf8(str.c_str(), str.size());
}
std::string getLegacyEnc(const char *input, int size);
inline std::string getLegacyEnc(const std::string &str)
{
return getLegacyEnc(str.c_str(), str.size());
}
private:
void resize(size_t size);
size_t getLength(const char* input, bool &ascii);
void copyFromArray(unsigned char chp, char* &out);
size_t getLength2(const char* input, bool &ascii);
void copyFromArray2(const char*& chp, char* &out);
std::vector<char> mOutput;
char* translationArray;
};
} }
#endif #endif

@ -50,10 +50,7 @@ namespace Translation
if (!line.empty()) if (!line.empty())
{ {
char* buffer = ToUTF8::getBuffer(line.size() + 1); line = mEncoder->getUtf8(line);
//buffer has at least line.size() + 1 bytes, so it must be safe
strcpy(buffer, line.c_str());
line = ToUTF8::getUtf8(mEncoding);
size_t tab_pos = line.find('\t'); size_t tab_pos = line.find('\t');
if (tab_pos != std::string::npos && tab_pos > 0 && tab_pos < line.size() - 1) if (tab_pos != std::string::npos && tab_pos > 0 && tab_pos < line.size() - 1)
@ -104,9 +101,9 @@ namespace Translation
return phrase; return phrase;
} }
void Storage::setEncoding (const ToUTF8::FromType& encoding) void Storage::setEncoder(ToUTF8::Utf8Encoder* encoder)
{ {
mEncoding = encoding; mEncoder = encoder;
} }
bool Storage::hasTranslation() const bool Storage::hasTranslation() const

@ -19,7 +19,7 @@ namespace Translation
// Standard form usually means nominative case // Standard form usually means nominative case
std::string topicStandardForm(const std::string& phrase) const; std::string topicStandardForm(const std::string& phrase) const;
void setEncoding (const ToUTF8::FromType& encoding); void setEncoder(ToUTF8::Utf8Encoder* encoder);
bool hasTranslation() const; bool hasTranslation() const;
@ -34,7 +34,7 @@ namespace Translation
void loadDataFromStream(ContainerType& container, std::istream& stream); void loadDataFromStream(ContainerType& container, std::istream& stream);
ToUTF8::FromType mEncoding; ToUTF8::Utf8Encoder* mEncoder;
ContainerType mCellNamesTranslations, mTopicIDs, mPhraseForms; ContainerType mCellNamesTranslations, mTopicIDs, mPhraseForms;
}; };
} }

Loading…
Cancel
Save