Merge remote-tracking branch 'potatoesmaster/to_utf8-rewrite'

This commit is contained in:
Marc Zinnschlag 2013-01-06 13:31:01 +01:00
commit 25815ab8f7
23 changed files with 423 additions and 365 deletions

View file

@ -165,23 +165,12 @@ bool parseOptions (int argc, char** argv, Arguments &info)
// Font encoding settings // Font encoding settings
info.encoding = variables["encoding"].as<std::string>(); info.encoding = variables["encoding"].as<std::string>();
if (info.encoding == "win1250") if(info.encoding != "win1250" && info.encoding != "win1251" && info.encoding != "win1252")
{ {
std::cout << "Using Central and Eastern European font encoding." << std::endl; std::cout << info.encoding << " is not a valid encoding option." << std::endl;
} info.encoding = "win1252";
else if (info.encoding == "win1251")
{
std::cout << "Using Cyrillic font encoding." << std::endl;
}
else
{
if(info.encoding != "win1252")
{
std::cout << info.encoding << " is not a valid encoding option." << std::endl;
info.encoding = "win1252";
}
std::cout << "Using default (English) font encoding." << std::endl;
} }
std::cout << ToUTF8::encodingUsingMessage(info.encoding) << std::endl;
return true; return true;
} }
@ -262,7 +251,8 @@ void printRaw(ESM::ESMReader &esm)
int load(Arguments& info) int load(Arguments& info)
{ {
ESM::ESMReader& esm = info.reader; ESM::ESMReader& esm = info.reader;
esm.setEncoding(ToUTF8::calculateEncoding(info.encoding)); ToUTF8::Utf8Encoder encoder (ToUTF8::calculateEncoding(info.encoding));
esm.setEncoder(&encoder);
std::string filename = info.filename; std::string filename = info.filename;
std::cout << "Loading file: " << filename << std::endl; std::cout << "Loading file: " << filename << std::endl;
@ -432,7 +422,8 @@ int clone(Arguments& info)
std::cout << std::endl << "Saving records to: " << info.outname << "..." << std::endl; std::cout << std::endl << "Saving records to: " << info.outname << "..." << std::endl;
ESM::ESMWriter& esm = info.writer; ESM::ESMWriter& esm = info.writer;
esm.setEncoding(info.encoding); ToUTF8::Utf8Encoder encoder (ToUTF8::calculateEncoding(info.encoding));
esm.setEncoder(&encoder);
esm.setAuthor(info.data.author); esm.setAuthor(info.data.author);
esm.setDescription(info.data.description); esm.setDescription(info.data.description);
esm.setVersion(info.data.version); esm.setVersion(info.data.version);

View file

@ -272,7 +272,8 @@ void DataFilesModel::addMasters(const QString &path)
foreach (const QString &path, dir.entryList()) { foreach (const QString &path, dir.entryList()) {
try { try {
ESM::ESMReader fileReader; ESM::ESMReader fileReader;
fileReader.setEncoding(ToUTF8::calculateEncoding(mEncoding.toStdString())); ToUTF8::Utf8Encoder encoder (ToUTF8::calculateEncoding(mEncoding.toStdString()));
fileReader.setEncoder(&encoder);
fileReader.open(dir.absoluteFilePath(path).toStdString()); fileReader.open(dir.absoluteFilePath(path).toStdString());
ESM::ESMReader::MasterList mlist = fileReader.getMasters(); ESM::ESMReader::MasterList mlist = fileReader.getMasters();
@ -335,7 +336,8 @@ void DataFilesModel::addPlugins(const QString &path)
try { try {
ESM::ESMReader fileReader; ESM::ESMReader fileReader;
fileReader.setEncoding(ToUTF8::calculateEncoding(mEncoding.toStdString())); ToUTF8::Utf8Encoder encoder (ToUTF8::calculateEncoding(mEncoding.toStdString()));
fileReader.setEncoder(&encoder);
fileReader.open(dir.absoluteFilePath(path).toStdString()); fileReader.open(dir.absoluteFilePath(path).toStdString());
ESM::ESMReader::MasterList mlist = fileReader.getMasters(); ESM::ESMReader::MasterList mlist = fileReader.getMasters();

View file

@ -649,11 +649,12 @@ MwIniImporter::multistrmap MwIniImporter::loadIniFile(std::string filename) {
std::string section(""); std::string section("");
MwIniImporter::multistrmap map; MwIniImporter::multistrmap map;
boost::iostreams::stream<boost::iostreams::file_source>file(filename.c_str()); boost::iostreams::stream<boost::iostreams::file_source>file(filename.c_str());
ToUTF8::Utf8Encoder encoder(mEncoding);
std::string line; std::string line;
while (std::getline(file, line)) { while (std::getline(file, line)) {
line = toUTF8(line); line = encoder.getUtf8(line);
// unify Unix-style and Windows file ending // unify Unix-style and Windows file ending
if (!(line.empty()) && (line[line.length()-1]) == '\r') { if (!(line.empty()) && (line[line.length()-1]) == '\r') {
@ -829,14 +830,6 @@ void MwIniImporter::writeToFile(boost::iostreams::stream<boost::iostreams::file_
} }
} }
std::string MwIniImporter::toUTF8(const std::string &str) {
char *ptr = ToUTF8::getBuffer(str.length());
strncpy(ptr, str.c_str(), str.length());
// Convert to UTF8 and return
return ToUTF8::getUtf8(mEncoding);
}
void MwIniImporter::setInputEncoding(const ToUTF8::FromType &encoding) void MwIniImporter::setInputEncoding(const ToUTF8::FromType &encoding)
{ {
mEncoding = encoding; mEncoding = encoding;

View file

@ -8,7 +8,7 @@
#include <vector> #include <vector>
#include <exception> #include <exception>
#include "../../components/to_utf8/to_utf8.hpp" #include <components/to_utf8/to_utf8.hpp>
class MwIniImporter { class MwIniImporter {
public: public:

View file

@ -331,11 +331,15 @@ void OMW::Engine::go()
// cursor replacer (converts the cursor from the bsa so they can be used by mygui) // cursor replacer (converts the cursor from the bsa so they can be used by mygui)
MWGui::CursorReplace replacer; MWGui::CursorReplace replacer;
// Create encoder
ToUTF8::Utf8Encoder encoder (mEncoding);
// Create the world // Create the world
mEnvironment.setWorld (new MWWorld::World (*mOgre, mFileCollections, mMaster, mEnvironment.setWorld (new MWWorld::World (*mOgre, mFileCollections, mMaster,
mResDir, mCfgMgr.getCachePath(), mNewGame, mEncoding, mFallbackMap)); mResDir, mCfgMgr.getCachePath(), mNewGame, &encoder, mFallbackMap));
//Load translation data //Load translation data
mTranslationDataStorage.setEncoder(&encoder);
mTranslationDataStorage.loadTranslationData(mFileCollections, mMaster); mTranslationDataStorage.loadTranslationData(mFileCollections, mMaster);
// Create window manager - this manages all the MW-specific GUI windows // Create window manager - this manages all the MW-specific GUI windows
@ -494,7 +498,6 @@ void OMW::Engine::showFPS(int level)
void OMW::Engine::setEncoding(const ToUTF8::FromType& encoding) void OMW::Engine::setEncoding(const ToUTF8::FromType& encoding)
{ {
mEncoding = encoding; mEncoding = encoding;
mTranslationDataStorage.setEncoding (encoding);
} }
void OMW::Engine::setFallbackValues(std::map<std::string,std::string> fallbackMap) void OMW::Engine::setFallbackValues(std::map<std::string,std::string> fallbackMap)

View file

@ -170,7 +170,7 @@ namespace MWWorld
World::World (OEngine::Render::OgreRenderer& renderer, World::World (OEngine::Render::OgreRenderer& renderer,
const Files::Collections& fileCollections, const Files::Collections& fileCollections,
const std::string& master, const boost::filesystem::path& resDir, const boost::filesystem::path& cacheDir, bool newGame, const std::string& master, const boost::filesystem::path& resDir, const boost::filesystem::path& cacheDir, bool newGame,
const ToUTF8::FromType& encoding, std::map<std::string,std::string> fallbackMap) ToUTF8::Utf8Encoder* encoder, std::map<std::string,std::string> fallbackMap)
: mPlayer (0), mLocalScripts (mStore), mGlobalVariables (0), : mPlayer (0), mLocalScripts (mStore), mGlobalVariables (0),
mSky (true), mCells (mStore, mEsm), mSky (true), mCells (mStore, mEsm),
mNumFacing(0) mNumFacing(0)
@ -187,7 +187,7 @@ namespace MWWorld
std::cout << "Loading ESM " << masterPath.string() << "\n"; std::cout << "Loading ESM " << masterPath.string() << "\n";
// This parses the ESM file and loads a sample cell // This parses the ESM file and loads a sample cell
mEsm.setEncoding(encoding); mEsm.setEncoder(encoder);
mEsm.open (masterPath.string()); mEsm.open (masterPath.string());
mStore.load (mEsm); mStore.load (mEsm);

View file

@ -95,7 +95,7 @@ namespace MWWorld
World (OEngine::Render::OgreRenderer& renderer, World (OEngine::Render::OgreRenderer& renderer,
const Files::Collections& fileCollections, const Files::Collections& fileCollections,
const std::string& master, const boost::filesystem::path& resDir, const boost::filesystem::path& cacheDir, bool newGame, const std::string& master, const boost::filesystem::path& resDir, const boost::filesystem::path& cacheDir, bool newGame,
const ToUTF8::FromType& encoding, std::map<std::string,std::string> fallbackMap); ToUTF8::Utf8Encoder* encoder, std::map<std::string,std::string> fallbackMap);
virtual ~World(); virtual ~World();

View file

@ -15,6 +15,11 @@ ESM_Context ESMReader::getContext()
return mCtx; return mCtx;
} }
// Construct a reader with a 50 KiB string buffer (getString enlarges it when
// a longer string is requested) and with no encoder attached.
//
// mEncoder is a raw pointer that getString dereferences; it is set to null
// here so that a reader used before setEncoder() fails on a null pointer
// instead of on an indeterminate one.
ESMReader::ESMReader(void):
    mBuffer(50*1024),
    mEncoder(0)
{
}
void ESMReader::restoreContext(const ESM_Context &rc) void ESMReader::restoreContext(const ESM_Context &rc)
{ {
// Reopen the file if necessary // Reopen the file if necessary
@ -323,11 +328,21 @@ void ESMReader::getExact(void*x, int size)
std::string ESMReader::getString(int size) std::string ESMReader::getString(int size)
{ {
char *ptr = ToUTF8::getBuffer(size); size_t s = size;
mEsm->read(ptr, size); if (mBuffer.size() <= s)
// Add some extra padding to reduce the chance of having to resize
// again later.
mBuffer.resize(3*s);
// And make sure the string is zero terminated
mBuffer[s] = 0;
// read ESM data
char *ptr = &mBuffer[0];
getExact(ptr, size);
// Convert to UTF8 and return // Convert to UTF8 and return
return ToUTF8::getUtf8(mEncoding); return mEncoder->getUtf8(ptr, size);
} }
void ESMReader::fail(const std::string &msg) void ESMReader::fail(const std::string &msg)
@ -345,9 +360,9 @@ void ESMReader::fail(const std::string &msg)
throw std::runtime_error(ss.str()); throw std::runtime_error(ss.str());
} }
void ESMReader::setEncoding(const ToUTF8::FromType& encoding) void ESMReader::setEncoder(ToUTF8::Utf8Encoder* encoder)
{ {
mEncoding = encoding; mEncoder = encoder;
} }
} }

View file

@ -20,6 +20,8 @@ class ESMReader
{ {
public: public:
ESMReader(void);
/************************************************************************* /*************************************************************************
* *
* Public type definitions * Public type definitions
@ -233,8 +235,8 @@ public:
/// Used for error handling /// Used for error handling
void fail(const std::string &msg); void fail(const std::string &msg);
/// Sets font encoding for ESM strings /// Sets font encoder for ESM strings
void setEncoding(const ToUTF8::FromType& encoding); void setEncoder(ToUTF8::Utf8Encoder* encoder);
private: private:
Ogre::DataStreamPtr mEsm; Ogre::DataStreamPtr mEsm;
@ -244,9 +246,12 @@ private:
// Special file signifier (see SpecialFile enum above) // Special file signifier (see SpecialFile enum above)
int mSpf; int mSpf;
// Buffer for ESM strings
std::vector<char> mBuffer;
SaveData mSaveData; SaveData mSaveData;
MasterList mMasters; MasterList mMasters;
ToUTF8::FromType mEncoding; ToUTF8::Utf8Encoder* mEncoder;
}; };
} }
#endif #endif

View file

@ -157,12 +157,8 @@ void ESMWriter::writeHString(const std::string& data)
write("\0", 1); write("\0", 1);
else else
{ {
char *ptr = ToUTF8::getBuffer(data.size()+1);
strncpy(ptr, &data[0], data.size());
ptr[data.size()] = '\0';
// Convert to UTF8 and return // Convert to UTF8 and return
std::string ascii = ToUTF8::getLegacyEnc(m_encoding); std::string ascii = m_encoder->getLegacyEnc(data);
write(ascii.c_str(), ascii.size()); write(ascii.c_str(), ascii.size());
} }
@ -192,21 +188,9 @@ void ESMWriter::write(const char* data, int size)
m_stream->write(data, size); m_stream->write(data, size);
} }
void ESMWriter::setEncoding(const std::string& encoding) void ESMWriter::setEncoder(ToUTF8::Utf8Encoder* encoder)
{ {
if (encoding == "win1250") m_encoder = encoder;
{
m_encoding = ToUTF8::WINDOWS_1250;
}
else if (encoding == "win1251")
{
m_encoding = ToUTF8::WINDOWS_1251;
}
else
{
// Default Latin encoding
m_encoding = ToUTF8::WINDOWS_1252;
}
} }
} }

View file

@ -6,7 +6,7 @@
#include <assert.h> #include <assert.h>
#include "esmcommon.hpp" #include "esmcommon.hpp"
#include "../to_utf8/to_utf8.hpp" #include <components/to_utf8/to_utf8.hpp>
namespace ESM { namespace ESM {
@ -24,7 +24,7 @@ public:
void setVersion(int ver); void setVersion(int ver);
int getType(); int getType();
void setType(int type); void setType(int type);
void setEncoding(const std::string& encoding); // Write strings as UTF-8? void setEncoder(ToUTF8::Utf8Encoder *encoding); // Write strings as UTF-8?
void setAuthor(const std::string& author); void setAuthor(const std::string& author);
void setDescription(const std::string& desc); void setDescription(const std::string& desc);
@ -94,11 +94,10 @@ private:
std::list<RecordData> m_records; std::list<RecordData> m_records;
std::ostream* m_stream; std::ostream* m_stream;
std::streampos m_headerPos; std::streampos m_headerPos;
ToUTF8::FromType m_encoding; ToUTF8::Utf8Encoder* m_encoder;
int m_recordCount; int m_recordCount;
HEDRstruct m_header; HEDRstruct m_header;
SaveData m_saveData;
}; };
} }

1
components/to_utf8/tests/.gitignore vendored Normal file
View file

@ -0,0 +1 @@
*_test

View file

@ -0,0 +1,4 @@
original: Без вопросов отдаете ему рулет, зная, что позже вы сможете привести с собой своих друзей и тогда он получит по заслугам?
converted: Без вопросов отдаете ему рулет, зная, что позже вы сможете привести с собой своих друзей и тогда он получит по заслугам?
original: Vous lui donnez le gâteau sans protester avant daller chercher tous vos amis et de revenir vous venger.
converted: Vous lui donnez le gâteau sans protester avant daller chercher tous vos amis et de revenir vous venger.

View file

@ -0,0 +1,18 @@
#!/bin/bash
# Build the test programs in the current directory and run every *_test
# executable found there.
#
# For each program:
#   - if output/<name>.out exists, the program's stdout is diffed against it;
#   - otherwise the program's stdout is recorded as output/<name>.out and the
#     new file is staged with git.
#
# Exit status: non-zero if the build fails or if any diff reports a mismatch.

make || exit

mkdir -p output

status=0

# Iterate over the glob directly (not through an unquoted variable) so that
# file names are never word-split.
for a in *_test; do
    # When nothing matches, bash leaves the pattern as the literal "*_test";
    # skip that and anything else that is not an executable regular file.
    [ -f "$a" ] && [ -x "$a" ] || continue

    if [ -f "output/$a.out" ]; then
        echo "Running $a:"
        # Remember a mismatch but keep going so every program is exercised.
        "./$a" | diff "output/$a.out" - || status=1
    else
        echo "Creating $a.out"
        "./$a" > "output/$a.out"
        git add "output/$a.out"
    fi
done

exit "$status"

View file

@ -0,0 +1 @@
Vous lui donnez le gâteau sans protester avant daller chercher tous vos amis et de revenir vous venger.

View file

@ -0,0 +1 @@
Vous lui donnez le gâteau sans protester avant daller chercher tous vos amis et de revenir vous venger.

View file

@ -0,0 +1 @@
Без вопросов отдаете ему рулет, зная, что позже вы сможете привести с собой своих друзей и тогда он получит по заслугам?

View file

@ -0,0 +1 @@
Без вопросов отдаете ему рулет, зная, что позже вы сможете привести с собой своих друзей и тогда он получит по заслугам?

View file

@ -0,0 +1,59 @@
#include <iostream>
#include <fstream>
#include <cassert>
#include <stdexcept>
#include "../to_utf8.hpp"
std::string getFirstLine(const std::string &filename);
void testEncoder(ToUTF8::FromType encoding, const std::string &legacyEncFile,
const std::string &utf8File);
/// Test character encoding conversion to and from UTF-8.
///
/// Reads the first line of \a legacyEncFile and of \a utf8File, converts each
/// one to the other representation with a ToUTF8::Utf8Encoder constructed for
/// \a encoding, and compares the results against the reference lines.
///
/// \param encoding       code page handed to the Utf8Encoder constructor
/// \param legacyEncFile  file whose first line is the sample in the legacy encoding
/// \param utf8File       file whose first line is the same sample in UTF-8
/// \throws std::runtime_error if a file cannot be opened (raised by
///         getFirstLine) or if a converted line differs from its reference
void testEncoder(ToUTF8::FromType encoding, const std::string &legacyEncFile,
                 const std::string &utf8File)
{
    // get some test data
    const std::string legacyEncLine = getFirstLine(legacyEncFile);
    const std::string utf8Line = getFirstLine(utf8File);

    // create an encoder for specified character encoding
    ToUTF8::Utf8Encoder encoder (encoding);

    // convert text to UTF-8
    const std::string convertedUtf8Line = encoder.getUtf8(legacyEncLine);

    std::cout << "original: " << utf8Line << std::endl;
    std::cout << "converted: " << convertedUtf8Line << std::endl;

    // check correctness
    // Explicit checks rather than assert(): assert() expands to nothing when
    // NDEBUG is defined, which would let a broken conversion pass unnoticed.
    if (convertedUtf8Line != utf8Line)
        throw std::runtime_error("Conversion to UTF-8 does not match " + utf8File);

    // convert UTF-8 text to legacy encoding
    const std::string convertedLegacyEncLine = encoder.getLegacyEnc(utf8Line);

    // check correctness
    if (convertedLegacyEncLine != legacyEncLine)
        throw std::runtime_error("Conversion from UTF-8 does not match " + legacyEncFile);
}
/// Return the first line of a text file, without its trailing newline.
///
/// \param filename  path of the file to read
/// \return the text up to the first '\n'; empty if nothing could be read
/// \throws std::runtime_error if the file cannot be opened
std::string getFirstLine(const std::string &filename)
{
    std::ifstream input (filename.c_str());
    if (!input.is_open())
        throw std::runtime_error("Unable to open file " + filename);

    std::string firstLine;
    std::getline(input, firstLine);

    // The stream closes the file when it goes out of scope.
    return firstLine;
}
/// Run the encoder round-trip test for each sample data set.
///
/// \return 0 when both tests ran to completion, 1 if either of them threw
int main()
{
    try
    {
        testEncoder(ToUTF8::WINDOWS_1251, "test_data/russian-win1251.txt", "test_data/russian-utf8.txt");
        testEncoder(ToUTF8::WINDOWS_1252, "test_data/french-win1252.txt", "test_data/french-utf8.txt");
    }
    catch (const std::exception &e)
    {
        // getFirstLine() throws std::runtime_error when a data file cannot be
        // opened. Report it on stderr and fail with a normal exit code
        // instead of letting the exception escape main().
        std::cerr << "Test failed: " << e.what() << std::endl;
        return 1;
    }
    return 0;
}

View file

@ -2,6 +2,8 @@
#include <vector> #include <vector>
#include <cassert> #include <cassert>
#include <iostream>
#include <iomanip>
/* This file contains the code to translate from WINDOWS-1252 (native /* This file contains the code to translate from WINDOWS-1252 (native
charset used in English version of Morrowind) to UTF-8. The library charset used in English version of Morrowind) to UTF-8. The library
@ -39,341 +41,298 @@
// Generated tables // Generated tables
#include "tables_gen.hpp" #include "tables_gen.hpp"
// Shared global buffers, we love you. These initial sizes are large using namespace ToUTF8;
// enough to hold the largest books in Morrowind.esm, but we will
// resize automaticall if necessary.
static std::vector<char> buf (50*1024);
static std::vector<char> output (50*1024);
static int size;
// Make sure the given vector is large enough for 'size' bytes, Utf8Encoder::Utf8Encoder(const FromType sourceEncoding):
// including a terminating zero after it. mOutput(50*1024)
static void resize(std::vector<char> &buf, size_t size)
{ {
if(buf.size() <= size) switch (sourceEncoding)
// Add some extra padding to reduce the chance of having to resize {
// again later. case ToUTF8::WINDOWS_1252:
buf.resize(3*size); {
translationArray = ToUTF8::windows_1252;
// And make sure the string is zero terminated break;
buf[size] = 0; }
case ToUTF8::WINDOWS_1250:
{
translationArray = ToUTF8::windows_1250;
break;
}
case ToUTF8::WINDOWS_1251:
{
translationArray = ToUTF8::windows_1251;
break;
}
default:
{
assert(0);
}
}
} }
// This is just used to spew out a reusable input buffer for the std::string Utf8Encoder::getUtf8(const char* input, int size)
// conversion process.
char *ToUTF8::getBuffer(int s)
{ {
// Remember the requested size // Double check that the input string stops at some point (it might
size = s; // contain zero terminators before this, inside its own data, which
resize(buf, size); // is also ok.)
return &buf[0]; assert(input[size] == 0);
// TODO: The rest of this function is designed for single-character
// input encodings only. It also assumes that the input the input
// encoding shares its first 128 values (0-127) with ASCII. These
// conditions must be checked again if you add more input encodings
// later.
// Compute output length, and check for pure ascii input at the same
// time.
bool ascii;
size_t outlen = getLength(input, ascii);
// If we're pure ascii, then don't bother converting anything.
if(ascii)
return std::string(input, outlen);
// Make sure the output is large enough
resize(outlen);
char *out = &mOutput[0];
// Translate
while (*input)
copyFromArray(*(input++), out);
// Make sure that we wrote the correct number of bytes
assert((out-&mOutput[0]) == (int)outlen);
// And make extra sure the output is null terminated
assert(mOutput.size() > outlen);
assert(mOutput[outlen] == 0);
// Return a string
return std::string(&mOutput[0], outlen);
}
std::string Utf8Encoder::getLegacyEnc(const char *input, int size)
{
// Double check that the input string stops at some point (it might
// contain zero terminators before this, inside its own data, which
// is also ok.)
assert(input[size] == 0);
// TODO: The rest of this function is designed for single-character
// input encodings only. It also assumes that the input the input
// encoding shares its first 128 values (0-127) with ASCII. These
// conditions must be checked again if you add more input encodings
// later.
// Compute output length, and check for pure ascii input at the same
// time.
bool ascii;
size_t outlen = getLength2(input, ascii);
// If we're pure ascii, then don't bother converting anything.
if(ascii)
return std::string(input, outlen);
// Make sure the output is large enough
resize(outlen);
char *out = &mOutput[0];
// Translate
while(*input)
copyFromArray2(input, out);
// Make sure that we wrote the correct number of bytes
assert((out-&mOutput[0]) == (int)outlen);
// And make extra sure the output is null terminated
assert(mOutput.size() > outlen);
assert(mOutput[outlen] == 0);
// Return a string
return std::string(&mOutput[0], outlen);
}
// Make sure the output vector is large enough for 'size' bytes,
// including a terminating zero after it.
void Utf8Encoder::resize(size_t size)
{
if (mOutput.size() <= size)
// Add some extra padding to reduce the chance of having to resize
// again later.
mOutput.resize(3*size);
// And make sure the string is zero terminated
mOutput[size] = 0;
} }
/** Get the total length length needed to decode the given string with /** Get the total length length needed to decode the given string with
the given translation array. The arrays are encoded with 6 bytes the given translation array. The arrays are encoded with 6 bytes
per character, with the first giving the length and the next 5 the per character, with the first giving the length and the next 5 the
actual data. actual data.
The function serves a dual purpose for optimization reasons: it The function serves a dual purpose for optimization reasons: it
checks if the input is pure ascii (all values are <= 127). If this checks if the input is pure ascii (all values are <= 127). If this
is the case, then the ascii parameter is set to true, and the is the case, then the ascii parameter is set to true, and the
caller can optimize for this case. caller can optimize for this case.
*/ */
static size_t getLength(const char *arr, const char* input, bool &ascii) size_t Utf8Encoder::getLength(const char* input, bool &ascii)
{ {
ascii = true; ascii = true;
size_t len = 0; size_t len = 0;
const char* ptr = input; const char* ptr = input;
unsigned char inp = *ptr; unsigned char inp = *ptr;
// Do away with the ascii part of the string first (this is almost // Do away with the ascii part of the string first (this is almost
// always the entire string.) // always the entire string.)
while(inp && inp < 128) while (inp && inp < 128)
inp = *(++ptr); inp = *(++ptr);
len += (ptr-input); len += (ptr-input);
// If we're not at the null terminator at this point, then there // If we're not at the null terminator at this point, then there
// were some non-ascii characters to deal with. Go to slow-mode for // were some non-ascii characters to deal with. Go to slow-mode for
// the rest of the string. // the rest of the string.
if(inp) if (inp)
{ {
ascii = false; ascii = false;
while(inp) while (inp)
{ {
// Find the translated length of this character in the // Find the translated length of this character in the
// lookup table. // lookup table.
len += arr[inp*6]; len += translationArray[inp*6];
inp = *(++ptr); inp = *(++ptr);
} }
} }
return len; return len;
} }
// Translate one character 'ch' using the translation array 'arr', and // Translate one character 'ch' using the translation array 'arr', and
// advance the output pointer accordingly. // advance the output pointer accordingly.
static void copyFromArray(const char *arr, unsigned char ch, char* &out) void Utf8Encoder::copyFromArray(unsigned char ch, char* &out)
{ {
// Optimize for ASCII values // Optimize for ASCII values
if(ch < 128) if (ch < 128)
{ {
*(out++) = ch; *(out++) = ch;
return; return;
} }
const char *in = arr + ch*6; const char *in = translationArray + ch*6;
int len = *(in++); int len = *(in++);
for(int i=0; i<len; i++) for (int i=0; i<len; i++)
*(out++) = *(in++); *(out++) = *(in++);
} }
std::string ToUTF8::getUtf8(ToUTF8::FromType from) size_t Utf8Encoder::getLength2(const char* input, bool &ascii)
{ {
// Pick translation array ascii = true;
const char *arr; size_t len = 0;
switch (from) const char* ptr = input;
{ unsigned char inp = *ptr;
case ToUTF8::WINDOWS_1252:
// Do away with the ascii part of the string first (this is almost
// always the entire string.)
while (inp && inp < 128)
inp = *(++ptr);
len += (ptr-input);
// If we're not at the null terminator at this point, then there
// were some non-ascii characters to deal with. Go to slow-mode for
// the rest of the string.
if (inp)
{ {
arr = ToUTF8::windows_1252; ascii = false;
break; while(inp)
}
case ToUTF8::WINDOWS_1250:
{
arr = ToUTF8::windows_1250;
break;
}
case ToUTF8::WINDOWS_1251:
{
arr = ToUTF8::windows_1251;
break;
}
default:
{
assert(0);
}
}
// Double check that the input string stops at some point (it might
// contain zero terminators before this, inside its own data, which
// is also ok.)
const char* input = &buf[0];
assert(input[size] == 0);
// TODO: The rest of this function is designed for single-character
// input encodings only. It also assumes that the input the input
// encoding shares its first 128 values (0-127) with ASCII. These
// conditions must be checked again if you add more input encodings
// later.
// Compute output length, and check for pure ascii input at the same
// time.
bool ascii;
size_t outlen = getLength(arr, input, ascii);
// If we're pure ascii, then don't bother converting anything.
if(ascii)
return std::string(input, outlen);
// Make sure the output is large enough
resize(output, outlen);
char *out = &output[0];
// Translate
while(*input)
copyFromArray(arr, *(input++), out);
// Make sure that we wrote the correct number of bytes
assert((out-&output[0]) == (int)outlen);
// And make extra sure the output is null terminated
assert(output.size() > outlen);
assert(output[outlen] == 0);
// Return a string
return std::string(&output[0], outlen);
}
static size_t getLength2(const char *arr, const char* input, bool &ascii)
{
ascii = true;
size_t len = 0;
const char* ptr = input;
unsigned char inp = *ptr;
// Do away with the ascii part of the string first (this is almost
// always the entire string.)
while(inp && inp < 128)
inp = *(++ptr);
len += (ptr-input);
// If we're not at the null terminator at this point, then there
// were some non-ascii characters to deal with. Go to slow-mode for
// the rest of the string.
if(inp)
{
ascii = false;
while(inp)
{ {
len += 1; len += 1;
// Find the translated length of this character in the // Find the translated length of this character in the
// lookup table. // lookup table.
switch(inp) switch(inp)
{ {
case 0xe2: len -= 2; break; case 0xe2: len -= 2; break;
case 0xc2: case 0xc2:
case 0xcb: case 0xcb:
case 0xc4: case 0xc4:
case 0xc6: case 0xc6:
case 0xc3: case 0xc3:
case 0xd0: case 0xd0:
case 0xd1: case 0xd1:
case 0xd2: case 0xd2:
case 0xc5: len -= 1; break; case 0xc5: len -= 1; break;
} }
inp = *(++ptr); inp = *(++ptr);
} }
} }
return len; return len;
} }
#include <iostream> void Utf8Encoder::copyFromArray2(const char*& chp, char* &out)
#include <iomanip>
static void copyFromArray2(const char *arr, char*& chp, char* &out)
{ {
unsigned char ch = *(chp++); unsigned char ch = *(chp++);
// Optimize for ASCII values // Optimize for ASCII values
if(ch < 128) if (ch < 128)
{ {
*(out++) = ch; *(out++) = ch;
return; return;
} }
int len = 1; int len = 1;
switch (ch) switch (ch)
{
case 0xe2: len = 3; break;
case 0xc2:
case 0xcb:
case 0xc4:
case 0xc6:
case 0xc3:
case 0xd0:
case 0xd1:
case 0xd2:
case 0xc5: len = 2; break;
}
if (len == 1) // There is no 1 length utf-8 glyph that is not 0x20 (empty space)
{
*(out++) = ch;
return;
}
unsigned char ch2 = *(chp++);
unsigned char ch3 = '\0';
if (len == 3)
ch3 = *(chp++);
for (int i = 128; i < 256; i++)
{
unsigned char b1 = arr[i*6 + 1], b2 = arr[i*6 + 2], b3 = arr[i*6 + 3];
if (b1 == ch && b2 == ch2 && (len != 3 || b3 == ch3))
{
*(out++) = (char)i;
return;
}
}
std::cout << "Could not find glyph " << std::hex << (int)ch << " " << (int)ch2 << " " << (int)ch3 << std::endl;
*(out++) = ch; // Could not find glyph, just put whatever
}
std::string ToUTF8::getLegacyEnc(ToUTF8::FromType to)
{
// Pick translation array
const char *arr;
switch (to)
{
case ToUTF8::WINDOWS_1252:
{ {
arr = ToUTF8::windows_1252; case 0xe2: len = 3; break;
break; case 0xc2:
case 0xcb:
case 0xc4:
case 0xc6:
case 0xc3:
case 0xd0:
case 0xd1:
case 0xd2:
case 0xc5: len = 2; break;
} }
case ToUTF8::WINDOWS_1250:
if (len == 1) // There is no 1 length utf-8 glyph that is not 0x20 (empty space)
{ {
arr = ToUTF8::windows_1250; *(out++) = ch;
break; return;
} }
case ToUTF8::WINDOWS_1251:
unsigned char ch2 = *(chp++);
unsigned char ch3 = '\0';
if (len == 3)
ch3 = *(chp++);
for (int i = 128; i < 256; i++)
{ {
arr = ToUTF8::windows_1251; unsigned char b1 = translationArray[i*6 + 1], b2 = translationArray[i*6 + 2], b3 = translationArray[i*6 + 3];
break; if (b1 == ch && b2 == ch2 && (len != 3 || b3 == ch3))
{
*(out++) = (char)i;
return;
}
} }
default:
{
assert(0);
}
}
// Double check that the input string stops at some point (it might std::cout << "Could not find glyph " << std::hex << (int)ch << " " << (int)ch2 << " " << (int)ch3 << std::endl;
// contain zero terminators before this, inside its own data, which
// is also ok.)
char* input = &buf[0];
assert(input[size] == 0);
// TODO: The rest of this function is designed for single-character *(out++) = ch; // Could not find glyph, just put whatever
// input encodings only. It also assumes that the input the input
// encoding shares its first 128 values (0-127) with ASCII. These
// conditions must be checked again if you add more input encodings
// later.
// Compute output length, and check for pure ascii input at the same
// time.
bool ascii;
size_t outlen = getLength2(arr, input, ascii);
// If we're pure ascii, then don't bother converting anything.
if(ascii)
return std::string(input, outlen);
// Make sure the output is large enough
resize(output, outlen);
char *out = &output[0];
// Translate
while(*input)
copyFromArray2(arr, input, out);
// Make sure that we wrote the correct number of bytes
assert((out-&output[0]) == (int)outlen);
// And make extra sure the output is null terminated
assert(output.size() > outlen);
assert(output[outlen] == 0);
// Return a string
return std::string(&output[0], outlen);
} }
ToUTF8::FromType ToUTF8::calculateEncoding(const std::string& encodingName) ToUTF8::FromType ToUTF8::calculateEncoding(const std::string& encodingName)
{ {
if (encodingName == "win1250") if (encodingName == "win1250")
return ToUTF8::WINDOWS_1250; return ToUTF8::WINDOWS_1250;
else if (encodingName == "win1251") else if (encodingName == "win1251")
return ToUTF8::WINDOWS_1251; return ToUTF8::WINDOWS_1251;
else else
return ToUTF8::WINDOWS_1252; return ToUTF8::WINDOWS_1252;
} }
std::string ToUTF8::encodingUsingMessage(const std::string& encodingName) std::string ToUTF8::encodingUsingMessage(const std::string& encodingName)
{ {
if (encodingName == "win1250") if (encodingName == "win1250")
return "Using Central and Eastern European font encoding."; return "Using Central and Eastern European font encoding.";
else if (encodingName == "win1251") else if (encodingName == "win1251")
return "Using Cyrillic font encoding."; return "Using Cyrillic font encoding.";
else else
return "Using default (English) font encoding."; return "Using default (English) font encoding.";
} }

View file

@ -2,29 +2,53 @@
#define COMPONENTS_TOUTF8_H #define COMPONENTS_TOUTF8_H
#include <string> #include <string>
#include <cstring>
#include <vector>
namespace ToUTF8 namespace ToUTF8
{ {
// These are all the currently supported code pages // These are all the currently supported code pages
enum FromType enum FromType
{ {
WINDOWS_1250, // Central ane Eastern European languages WINDOWS_1250, // Central ane Eastern European languages
WINDOWS_1251, // Cyrillic languages WINDOWS_1251, // Cyrillic languages
WINDOWS_1252 // Used by English version of Morrowind (and WINDOWS_1252 // Used by English version of Morrowind (and
// probably others) // probably others)
}; };
// Return a writable buffer of at least 'size' bytes. The buffer FromType calculateEncoding(const std::string& encodingName);
// does not have to be freed. std::string encodingUsingMessage(const std::string& encodingName);
char* getBuffer(int size);
// Convert the previously written buffer to UTF8 from the given code // class
// page.
std::string getUtf8(FromType from);
std::string getLegacyEnc(FromType to);
FromType calculateEncoding(const std::string& encodingName); class Utf8Encoder
std::string encodingUsingMessage(const std::string& encodingName); {
public:
Utf8Encoder(FromType sourceEncoding);
// Convert to UTF8 from the previously given code page.
std::string getUtf8(const char *input, int size);
inline std::string getUtf8(const std::string &str)
{
return getUtf8(str.c_str(), str.size());
}
std::string getLegacyEnc(const char *input, int size);
inline std::string getLegacyEnc(const std::string &str)
{
return getLegacyEnc(str.c_str(), str.size());
}
private:
void resize(size_t size);
size_t getLength(const char* input, bool &ascii);
void copyFromArray(unsigned char chp, char* &out);
size_t getLength2(const char* input, bool &ascii);
void copyFromArray2(const char*& chp, char* &out);
std::vector<char> mOutput;
char* translationArray;
};
} }
#endif #endif

View file

@ -50,10 +50,7 @@ namespace Translation
if (!line.empty()) if (!line.empty())
{ {
char* buffer = ToUTF8::getBuffer(line.size() + 1); line = mEncoder->getUtf8(line);
//buffer has at least line.size() + 1 bytes, so it must be safe
strcpy(buffer, line.c_str());
line = ToUTF8::getUtf8(mEncoding);
size_t tab_pos = line.find('\t'); size_t tab_pos = line.find('\t');
if (tab_pos != std::string::npos && tab_pos > 0 && tab_pos < line.size() - 1) if (tab_pos != std::string::npos && tab_pos > 0 && tab_pos < line.size() - 1)
@ -104,9 +101,9 @@ namespace Translation
return phrase; return phrase;
} }
void Storage::setEncoding (const ToUTF8::FromType& encoding) void Storage::setEncoder(ToUTF8::Utf8Encoder* encoder)
{ {
mEncoding = encoding; mEncoder = encoder;
} }
bool Storage::hasTranslation() const bool Storage::hasTranslation() const

View file

@ -19,7 +19,7 @@ namespace Translation
// Standard form usually means nominative case // Standard form usually means nominative case
std::string topicStandardForm(const std::string& phrase) const; std::string topicStandardForm(const std::string& phrase) const;
void setEncoding (const ToUTF8::FromType& encoding); void setEncoder(ToUTF8::Utf8Encoder* encoder);
bool hasTranslation() const; bool hasTranslation() const;
@ -34,7 +34,7 @@ namespace Translation
void loadDataFromStream(ContainerType& container, std::istream& stream); void loadDataFromStream(ContainerType& container, std::istream& stream);
ToUTF8::FromType mEncoding; ToUTF8::Utf8Encoder* mEncoder;
ContainerType mCellNamesTranslations, mTopicIDs, mPhraseForms; ContainerType mCellNamesTranslations, mTopicIDs, mPhraseForms;
}; };
} }