1
0
Fork 0
mirror of https://github.com/OpenMW/openmw.git synced 2025-01-19 20:23:54 +00:00

Merge remote-tracking branch 'potatoesmaster/to_utf8-rewrite'

This commit is contained in:
Marc Zinnschlag 2013-01-06 13:31:01 +01:00
commit 25815ab8f7
23 changed files with 423 additions and 365 deletions

View file

@ -165,23 +165,12 @@ bool parseOptions (int argc, char** argv, Arguments &info)
// Font encoding settings // Font encoding settings
info.encoding = variables["encoding"].as<std::string>(); info.encoding = variables["encoding"].as<std::string>();
if (info.encoding == "win1250") if(info.encoding != "win1250" && info.encoding != "win1251" && info.encoding != "win1252")
{
std::cout << "Using Central and Eastern European font encoding." << std::endl;
}
else if (info.encoding == "win1251")
{
std::cout << "Using Cyrillic font encoding." << std::endl;
}
else
{
if(info.encoding != "win1252")
{ {
std::cout << info.encoding << " is not a valid encoding option." << std::endl; std::cout << info.encoding << " is not a valid encoding option." << std::endl;
info.encoding = "win1252"; info.encoding = "win1252";
} }
std::cout << "Using default (English) font encoding." << std::endl; std::cout << ToUTF8::encodingUsingMessage(info.encoding) << std::endl;
}
return true; return true;
} }
@ -262,7 +251,8 @@ void printRaw(ESM::ESMReader &esm)
int load(Arguments& info) int load(Arguments& info)
{ {
ESM::ESMReader& esm = info.reader; ESM::ESMReader& esm = info.reader;
esm.setEncoding(ToUTF8::calculateEncoding(info.encoding)); ToUTF8::Utf8Encoder encoder (ToUTF8::calculateEncoding(info.encoding));
esm.setEncoder(&encoder);
std::string filename = info.filename; std::string filename = info.filename;
std::cout << "Loading file: " << filename << std::endl; std::cout << "Loading file: " << filename << std::endl;
@ -432,7 +422,8 @@ int clone(Arguments& info)
std::cout << std::endl << "Saving records to: " << info.outname << "..." << std::endl; std::cout << std::endl << "Saving records to: " << info.outname << "..." << std::endl;
ESM::ESMWriter& esm = info.writer; ESM::ESMWriter& esm = info.writer;
esm.setEncoding(info.encoding); ToUTF8::Utf8Encoder encoder (ToUTF8::calculateEncoding(info.encoding));
esm.setEncoder(&encoder);
esm.setAuthor(info.data.author); esm.setAuthor(info.data.author);
esm.setDescription(info.data.description); esm.setDescription(info.data.description);
esm.setVersion(info.data.version); esm.setVersion(info.data.version);

View file

@ -272,7 +272,8 @@ void DataFilesModel::addMasters(const QString &path)
foreach (const QString &path, dir.entryList()) { foreach (const QString &path, dir.entryList()) {
try { try {
ESM::ESMReader fileReader; ESM::ESMReader fileReader;
fileReader.setEncoding(ToUTF8::calculateEncoding(mEncoding.toStdString())); ToUTF8::Utf8Encoder encoder (ToUTF8::calculateEncoding(mEncoding.toStdString()));
fileReader.setEncoder(&encoder);
fileReader.open(dir.absoluteFilePath(path).toStdString()); fileReader.open(dir.absoluteFilePath(path).toStdString());
ESM::ESMReader::MasterList mlist = fileReader.getMasters(); ESM::ESMReader::MasterList mlist = fileReader.getMasters();
@ -335,7 +336,8 @@ void DataFilesModel::addPlugins(const QString &path)
try { try {
ESM::ESMReader fileReader; ESM::ESMReader fileReader;
fileReader.setEncoding(ToUTF8::calculateEncoding(mEncoding.toStdString())); ToUTF8::Utf8Encoder encoder (ToUTF8::calculateEncoding(mEncoding.toStdString()));
fileReader.setEncoder(&encoder);
fileReader.open(dir.absoluteFilePath(path).toStdString()); fileReader.open(dir.absoluteFilePath(path).toStdString());
ESM::ESMReader::MasterList mlist = fileReader.getMasters(); ESM::ESMReader::MasterList mlist = fileReader.getMasters();

View file

@ -649,11 +649,12 @@ MwIniImporter::multistrmap MwIniImporter::loadIniFile(std::string filename) {
std::string section(""); std::string section("");
MwIniImporter::multistrmap map; MwIniImporter::multistrmap map;
boost::iostreams::stream<boost::iostreams::file_source>file(filename.c_str()); boost::iostreams::stream<boost::iostreams::file_source>file(filename.c_str());
ToUTF8::Utf8Encoder encoder(mEncoding);
std::string line; std::string line;
while (std::getline(file, line)) { while (std::getline(file, line)) {
line = toUTF8(line); line = encoder.getUtf8(line);
// unify Unix-style and Windows file ending // unify Unix-style and Windows file ending
if (!(line.empty()) && (line[line.length()-1]) == '\r') { if (!(line.empty()) && (line[line.length()-1]) == '\r') {
@ -829,14 +830,6 @@ void MwIniImporter::writeToFile(boost::iostreams::stream<boost::iostreams::file_
} }
} }
std::string MwIniImporter::toUTF8(const std::string &str) {
char *ptr = ToUTF8::getBuffer(str.length());
strncpy(ptr, str.c_str(), str.length());
// Convert to UTF8 and return
return ToUTF8::getUtf8(mEncoding);
}
void MwIniImporter::setInputEncoding(const ToUTF8::FromType &encoding) void MwIniImporter::setInputEncoding(const ToUTF8::FromType &encoding)
{ {
mEncoding = encoding; mEncoding = encoding;

View file

@ -8,7 +8,7 @@
#include <vector> #include <vector>
#include <exception> #include <exception>
#include "../../components/to_utf8/to_utf8.hpp" #include <components/to_utf8/to_utf8.hpp>
class MwIniImporter { class MwIniImporter {
public: public:

View file

@ -331,11 +331,15 @@ void OMW::Engine::go()
// cursor replacer (converts the cursor from the bsa so they can be used by mygui) // cursor replacer (converts the cursor from the bsa so they can be used by mygui)
MWGui::CursorReplace replacer; MWGui::CursorReplace replacer;
// Create encoder
ToUTF8::Utf8Encoder encoder (mEncoding);
// Create the world // Create the world
mEnvironment.setWorld (new MWWorld::World (*mOgre, mFileCollections, mMaster, mEnvironment.setWorld (new MWWorld::World (*mOgre, mFileCollections, mMaster,
mResDir, mCfgMgr.getCachePath(), mNewGame, mEncoding, mFallbackMap)); mResDir, mCfgMgr.getCachePath(), mNewGame, &encoder, mFallbackMap));
//Load translation data //Load translation data
mTranslationDataStorage.setEncoder(&encoder);
mTranslationDataStorage.loadTranslationData(mFileCollections, mMaster); mTranslationDataStorage.loadTranslationData(mFileCollections, mMaster);
// Create window manager - this manages all the MW-specific GUI windows // Create window manager - this manages all the MW-specific GUI windows
@ -494,7 +498,6 @@ void OMW::Engine::showFPS(int level)
void OMW::Engine::setEncoding(const ToUTF8::FromType& encoding) void OMW::Engine::setEncoding(const ToUTF8::FromType& encoding)
{ {
mEncoding = encoding; mEncoding = encoding;
mTranslationDataStorage.setEncoding (encoding);
} }
void OMW::Engine::setFallbackValues(std::map<std::string,std::string> fallbackMap) void OMW::Engine::setFallbackValues(std::map<std::string,std::string> fallbackMap)

View file

@ -170,7 +170,7 @@ namespace MWWorld
World::World (OEngine::Render::OgreRenderer& renderer, World::World (OEngine::Render::OgreRenderer& renderer,
const Files::Collections& fileCollections, const Files::Collections& fileCollections,
const std::string& master, const boost::filesystem::path& resDir, const boost::filesystem::path& cacheDir, bool newGame, const std::string& master, const boost::filesystem::path& resDir, const boost::filesystem::path& cacheDir, bool newGame,
const ToUTF8::FromType& encoding, std::map<std::string,std::string> fallbackMap) ToUTF8::Utf8Encoder* encoder, std::map<std::string,std::string> fallbackMap)
: mPlayer (0), mLocalScripts (mStore), mGlobalVariables (0), : mPlayer (0), mLocalScripts (mStore), mGlobalVariables (0),
mSky (true), mCells (mStore, mEsm), mSky (true), mCells (mStore, mEsm),
mNumFacing(0) mNumFacing(0)
@ -187,7 +187,7 @@ namespace MWWorld
std::cout << "Loading ESM " << masterPath.string() << "\n"; std::cout << "Loading ESM " << masterPath.string() << "\n";
// This parses the ESM file and loads a sample cell // This parses the ESM file and loads a sample cell
mEsm.setEncoding(encoding); mEsm.setEncoder(encoder);
mEsm.open (masterPath.string()); mEsm.open (masterPath.string());
mStore.load (mEsm); mStore.load (mEsm);

View file

@ -95,7 +95,7 @@ namespace MWWorld
World (OEngine::Render::OgreRenderer& renderer, World (OEngine::Render::OgreRenderer& renderer,
const Files::Collections& fileCollections, const Files::Collections& fileCollections,
const std::string& master, const boost::filesystem::path& resDir, const boost::filesystem::path& cacheDir, bool newGame, const std::string& master, const boost::filesystem::path& resDir, const boost::filesystem::path& cacheDir, bool newGame,
const ToUTF8::FromType& encoding, std::map<std::string,std::string> fallbackMap); ToUTF8::Utf8Encoder* encoder, std::map<std::string,std::string> fallbackMap);
virtual ~World(); virtual ~World();

View file

@ -15,6 +15,11 @@ ESM_Context ESMReader::getContext()
return mCtx; return mCtx;
} }
/// Construct a reader with a 50 KiB string buffer and no encoder attached.
///
/// getString() converts through mEncoder, so setEncoder() has to be called
/// before any string is read. The pointer is zero-initialised here so that a
/// forgotten setEncoder() fails on a null pointer rather than on an
/// indeterminate one.
ESMReader::ESMReader(void):
    mBuffer(50*1024),
    mEncoder(0)
{
}
void ESMReader::restoreContext(const ESM_Context &rc) void ESMReader::restoreContext(const ESM_Context &rc)
{ {
// Reopen the file if necessary // Reopen the file if necessary
@ -323,11 +328,21 @@ void ESMReader::getExact(void*x, int size)
std::string ESMReader::getString(int size) std::string ESMReader::getString(int size)
{ {
char *ptr = ToUTF8::getBuffer(size); size_t s = size;
mEsm->read(ptr, size); if (mBuffer.size() <= s)
// Add some extra padding to reduce the chance of having to resize
// again later.
mBuffer.resize(3*s);
// And make sure the string is zero terminated
mBuffer[s] = 0;
// read ESM data
char *ptr = &mBuffer[0];
getExact(ptr, size);
// Convert to UTF8 and return // Convert to UTF8 and return
return ToUTF8::getUtf8(mEncoding); return mEncoder->getUtf8(ptr, size);
} }
void ESMReader::fail(const std::string &msg) void ESMReader::fail(const std::string &msg)
@ -345,9 +360,9 @@ void ESMReader::fail(const std::string &msg)
throw std::runtime_error(ss.str()); throw std::runtime_error(ss.str());
} }
void ESMReader::setEncoding(const ToUTF8::FromType& encoding) void ESMReader::setEncoder(ToUTF8::Utf8Encoder* encoder)
{ {
mEncoding = encoding; mEncoder = encoder;
} }
} }

View file

@ -20,6 +20,8 @@ class ESMReader
{ {
public: public:
ESMReader(void);
/************************************************************************* /*************************************************************************
* *
* Public type definitions * Public type definitions
@ -233,8 +235,8 @@ public:
/// Used for error handling /// Used for error handling
void fail(const std::string &msg); void fail(const std::string &msg);
/// Sets font encoding for ESM strings /// Sets font encoder for ESM strings
void setEncoding(const ToUTF8::FromType& encoding); void setEncoder(ToUTF8::Utf8Encoder* encoder);
private: private:
Ogre::DataStreamPtr mEsm; Ogre::DataStreamPtr mEsm;
@ -244,9 +246,12 @@ private:
// Special file signifier (see SpecialFile enum above) // Special file signifier (see SpecialFile enum above)
int mSpf; int mSpf;
// Buffer for ESM strings
std::vector<char> mBuffer;
SaveData mSaveData; SaveData mSaveData;
MasterList mMasters; MasterList mMasters;
ToUTF8::FromType mEncoding; ToUTF8::Utf8Encoder* mEncoder;
}; };
} }
#endif #endif

View file

@ -157,12 +157,8 @@ void ESMWriter::writeHString(const std::string& data)
write("\0", 1); write("\0", 1);
else else
{ {
char *ptr = ToUTF8::getBuffer(data.size()+1);
strncpy(ptr, &data[0], data.size());
ptr[data.size()] = '\0';
// Convert to UTF8 and return // Convert to UTF8 and return
std::string ascii = ToUTF8::getLegacyEnc(m_encoding); std::string ascii = m_encoder->getLegacyEnc(data);
write(ascii.c_str(), ascii.size()); write(ascii.c_str(), ascii.size());
} }
@ -192,21 +188,9 @@ void ESMWriter::write(const char* data, int size)
m_stream->write(data, size); m_stream->write(data, size);
} }
void ESMWriter::setEncoding(const std::string& encoding) void ESMWriter::setEncoder(ToUTF8::Utf8Encoder* encoder)
{ {
if (encoding == "win1250") m_encoder = encoder;
{
m_encoding = ToUTF8::WINDOWS_1250;
}
else if (encoding == "win1251")
{
m_encoding = ToUTF8::WINDOWS_1251;
}
else
{
// Default Latin encoding
m_encoding = ToUTF8::WINDOWS_1252;
}
} }
} }

View file

@ -6,7 +6,7 @@
#include <assert.h> #include <assert.h>
#include "esmcommon.hpp" #include "esmcommon.hpp"
#include "../to_utf8/to_utf8.hpp" #include <components/to_utf8/to_utf8.hpp>
namespace ESM { namespace ESM {
@ -24,7 +24,7 @@ public:
void setVersion(int ver); void setVersion(int ver);
int getType(); int getType();
void setType(int type); void setType(int type);
void setEncoding(const std::string& encoding); // Write strings as UTF-8? void setEncoder(ToUTF8::Utf8Encoder *encoding); // Write strings as UTF-8?
void setAuthor(const std::string& author); void setAuthor(const std::string& author);
void setDescription(const std::string& desc); void setDescription(const std::string& desc);
@ -94,11 +94,10 @@ private:
std::list<RecordData> m_records; std::list<RecordData> m_records;
std::ostream* m_stream; std::ostream* m_stream;
std::streampos m_headerPos; std::streampos m_headerPos;
ToUTF8::FromType m_encoding; ToUTF8::Utf8Encoder* m_encoder;
int m_recordCount; int m_recordCount;
HEDRstruct m_header; HEDRstruct m_header;
SaveData m_saveData;
}; };
} }

1
components/to_utf8/tests/.gitignore vendored Normal file
View file

@ -0,0 +1 @@
*_test

View file

@ -0,0 +1,4 @@
original: Без вопросов отдаете ему рулет, зная, что позже вы сможете привести с собой своих друзей и тогда он получит по заслугам?
converted: Без вопросов отдаете ему рулет, зная, что позже вы сможете привести с собой своих друзей и тогда он получит по заслугам?
original: Vous lui donnez le gâteau sans protester avant daller chercher tous vos amis et de revenir vous venger.
converted: Vous lui donnez le gâteau sans protester avant daller chercher tous vos amis et de revenir vous venger.

View file

@ -0,0 +1,18 @@
#!/bin/bash
# Run make, then execute every *_test program found in this directory.
#
# For each test program:
#   - if output/<name>.out already exists, run the program and diff its
#     stdout against that reference file;
#   - otherwise store the current stdout as the new reference file and
#     stage it with git.

make || exit

mkdir -p output

for a in *_test; do
    # When nothing matches, the pattern stays literal ("*_test"). Skip it
    # instead of creating and staging a bogus "output/*_test.out".
    [ -f "$a" ] || continue

    if [ -f "output/$a.out" ]; then
        echo "Running $a:"
        "./$a" | diff "output/$a.out" -
    else
        echo "Creating $a.out"
        "./$a" > "output/$a.out"
        git add "output/$a.out"
    fi
done

View file

@ -0,0 +1 @@
Vous lui donnez le gâteau sans protester avant daller chercher tous vos amis et de revenir vous venger.

View file

@ -0,0 +1 @@
Vous lui donnez le gâteau sans protester avant daller chercher tous vos amis et de revenir vous venger.

View file

@ -0,0 +1 @@
Без вопросов отдаете ему рулет, зная, что позже вы сможете привести с собой своих друзей и тогда он получит по заслугам?

View file

@ -0,0 +1 @@
Без вопросов отдаете ему рулет, зная, что позже вы сможете привести с собой своих друзей и тогда он получит по заслугам?

View file

@ -0,0 +1,59 @@
#include <iostream>
#include <fstream>
#include <cassert>
#include <stdexcept>
#include "../to_utf8.hpp"
std::string getFirstLine(const std::string &filename);
void testEncoder(ToUTF8::FromType encoding, const std::string &legacyEncFile,
const std::string &utf8File);
/// Round-trip check for one legacy code page.
///
/// Reads the first line of each file, converts the legacy-encoded line to
/// UTF-8 and the UTF-8 line back to the legacy encoding, prints the expected
/// and the converted UTF-8 text to stdout, and asserts that both conversions
/// reproduce the reference text exactly.
void testEncoder(ToUTF8::FromType encoding, const std::string &legacyEncFile,
                 const std::string &utf8File)
{
    // Reference text in both encodings (the legacy file is read first).
    const std::string legacyReference = getFirstLine(legacyEncFile);
    const std::string utf8Reference = getFirstLine(utf8File);

    ToUTF8::Utf8Encoder converter(encoding);

    // Legacy code page -> UTF-8
    const std::string utf8Result = converter.getUtf8(legacyReference);
    std::cout << "original: " << utf8Reference << std::endl
              << "converted: " << utf8Result << std::endl;
    assert(utf8Result == utf8Reference);

    // UTF-8 -> legacy code page
    const std::string legacyResult = converter.getLegacyEnc(utf8Reference);
    assert(legacyResult == legacyReference);
}
/// Return the first line of the file `filename`, without its trailing newline.
/// An empty file yields an empty string.
///
/// Throws std::runtime_error if the file cannot be opened.
std::string getFirstLine(const std::string &filename)
{
    std::ifstream input(filename.c_str());
    if (!input.is_open())
        throw std::runtime_error("Unable to open file " + filename);

    std::string firstLine;
    std::getline(input, firstLine);

    // The stream is closed by its destructor on return.
    return firstLine;
}
int main()
{
testEncoder(ToUTF8::WINDOWS_1251, "test_data/russian-win1251.txt", "test_data/russian-utf8.txt");
testEncoder(ToUTF8::WINDOWS_1252, "test_data/french-win1252.txt", "test_data/french-utf8.txt");
return 0;
}

View file

@ -2,6 +2,8 @@
#include <vector> #include <vector>
#include <cassert> #include <cassert>
#include <iostream>
#include <iomanip>
/* This file contains the code to translate from WINDOWS-1252 (native /* This file contains the code to translate from WINDOWS-1252 (native
charset used in English version of Morrowind) to UTF-8. The library charset used in English version of Morrowind) to UTF-8. The library
@ -39,34 +41,128 @@
// Generated tables // Generated tables
#include "tables_gen.hpp" #include "tables_gen.hpp"
// Shared global buffers, we love you. These initial sizes are large using namespace ToUTF8;
// enough to hold the largest books in Morrowind.esm, but we will
// resize automaticall if necessary.
static std::vector<char> buf (50*1024);
static std::vector<char> output (50*1024);
static int size;
// Make sure the given vector is large enough for 'size' bytes, Utf8Encoder::Utf8Encoder(const FromType sourceEncoding):
// including a terminating zero after it. mOutput(50*1024)
static void resize(std::vector<char> &buf, size_t size)
{ {
if(buf.size() <= size) switch (sourceEncoding)
// Add some extra padding to reduce the chance of having to resize {
// again later. case ToUTF8::WINDOWS_1252:
buf.resize(3*size); {
translationArray = ToUTF8::windows_1252;
// And make sure the string is zero terminated break;
buf[size] = 0; }
case ToUTF8::WINDOWS_1250:
{
translationArray = ToUTF8::windows_1250;
break;
}
case ToUTF8::WINDOWS_1251:
{
translationArray = ToUTF8::windows_1251;
break;
}
default:
{
assert(0);
}
}
} }
// This is just used to spew out a reusable input buffer for the std::string Utf8Encoder::getUtf8(const char* input, int size)
// conversion process.
char *ToUTF8::getBuffer(int s)
{ {
// Remember the requested size // Double check that the input string stops at some point (it might
size = s; // contain zero terminators before this, inside its own data, which
resize(buf, size); // is also ok.)
return &buf[0]; assert(input[size] == 0);
// TODO: The rest of this function is designed for single-byte
// input encodings only. It also assumes that the input
// encoding shares its first 128 values (0-127) with ASCII. These
// conditions must be checked again if you add more input encodings
// later.
// Compute output length, and check for pure ascii input at the same
// time.
bool ascii;
size_t outlen = getLength(input, ascii);
// If we're pure ascii, then don't bother converting anything.
if(ascii)
return std::string(input, outlen);
// Make sure the output is large enough
resize(outlen);
char *out = &mOutput[0];
// Translate
while (*input)
copyFromArray(*(input++), out);
// Make sure that we wrote the correct number of bytes
assert((out-&mOutput[0]) == (int)outlen);
// And make extra sure the output is null terminated
assert(mOutput.size() > outlen);
assert(mOutput[outlen] == 0);
// Return a string
return std::string(&mOutput[0], outlen);
}
/// Convert UTF-8 text to the legacy code page selected at construction.
///
/// `input` must hold a zero byte at index `size` (checked by assert). The
/// translation loop itself stops at the first zero byte it meets.
std::string Utf8Encoder::getLegacyEnc(const char *input, int size)
{
    // The caller guarantees a terminator at input[size]; zero bytes earlier
    // in the data do not trip this check.
    assert(input[size] == 0);

    // NOTE: the length computation and the translation below only handle
    // legacy code pages that use one byte per character and agree with ASCII
    // for the values 0-127. Revisit both if further code pages are added.

    // One pass over the input yields the converted length and tells us
    // whether anything needs translating at all.
    bool pureAscii;
    const size_t convertedLen = getLength2(input, pureAscii);

    // Pure ASCII input needs no translation: return it unchanged.
    if (pureAscii)
        return std::string(input, convertedLen);

    // Grow the scratch buffer; resize() also writes the terminating zero.
    resize(convertedLen);
    char *cursor = &mOutput[0];

    // Translate one input character per call; both pointers are advanced by
    // copyFromArray2().
    for (const char *src = input; *src; )
        copyFromArray2(src, cursor);

    // Sanity checks: exact number of bytes written, terminator still intact.
    assert((cursor - &mOutput[0]) == (int)convertedLen);
    assert(mOutput.size() > convertedLen);
    assert(mOutput[convertedLen] == 0);

    return std::string(&mOutput[0], convertedLen);
}
// Make sure the output vector is large enough for 'size' bytes,
// including a terminating zero after it.
void Utf8Encoder::resize(size_t size)
{
if (mOutput.size() <= size)
// Add some extra padding to reduce the chance of having to resize
// again later.
mOutput.resize(3*size);
// And make sure the string is zero terminated
mOutput[size] = 0;
} }
/** Get the total length length needed to decode the given string with /** Get the total length length needed to decode the given string with
@ -79,7 +175,7 @@ char *ToUTF8::getBuffer(int s)
is the case, then the ascii parameter is set to true, and the is the case, then the ascii parameter is set to true, and the
caller can optimize for this case. caller can optimize for this case.
*/ */
static size_t getLength(const char *arr, const char* input, bool &ascii) size_t Utf8Encoder::getLength(const char* input, bool &ascii)
{ {
ascii = true; ascii = true;
size_t len = 0; size_t len = 0;
@ -102,7 +198,7 @@ static size_t getLength(const char *arr, const char* input, bool &ascii)
{ {
// Find the translated length of this character in the // Find the translated length of this character in the
// lookup table. // lookup table.
len += arr[inp*6]; len += translationArray[inp*6];
inp = *(++ptr); inp = *(++ptr);
} }
} }
@ -111,7 +207,7 @@ static size_t getLength(const char *arr, const char* input, bool &ascii)
// Translate one character 'ch' using the translation array 'arr', and // Translate one character 'ch' using the translation array 'arr', and
// advance the output pointer accordingly. // advance the output pointer accordingly.
static void copyFromArray(const char *arr, unsigned char ch, char* &out) void Utf8Encoder::copyFromArray(unsigned char ch, char* &out)
{ {
// Optimize for ASCII values // Optimize for ASCII values
if (ch < 128) if (ch < 128)
@ -120,80 +216,13 @@ static void copyFromArray(const char *arr, unsigned char ch, char* &out)
return; return;
} }
const char *in = arr + ch*6; const char *in = translationArray + ch*6;
int len = *(in++); int len = *(in++);
for (int i=0; i<len; i++) for (int i=0; i<len; i++)
*(out++) = *(in++); *(out++) = *(in++);
} }
std::string ToUTF8::getUtf8(ToUTF8::FromType from) size_t Utf8Encoder::getLength2(const char* input, bool &ascii)
{
// Pick translation array
const char *arr;
switch (from)
{
case ToUTF8::WINDOWS_1252:
{
arr = ToUTF8::windows_1252;
break;
}
case ToUTF8::WINDOWS_1250:
{
arr = ToUTF8::windows_1250;
break;
}
case ToUTF8::WINDOWS_1251:
{
arr = ToUTF8::windows_1251;
break;
}
default:
{
assert(0);
}
}
// Double check that the input string stops at some point (it might
// contain zero terminators before this, inside its own data, which
// is also ok.)
const char* input = &buf[0];
assert(input[size] == 0);
// TODO: The rest of this function is designed for single-character
// input encodings only. It also assumes that the input the input
// encoding shares its first 128 values (0-127) with ASCII. These
// conditions must be checked again if you add more input encodings
// later.
// Compute output length, and check for pure ascii input at the same
// time.
bool ascii;
size_t outlen = getLength(arr, input, ascii);
// If we're pure ascii, then don't bother converting anything.
if(ascii)
return std::string(input, outlen);
// Make sure the output is large enough
resize(output, outlen);
char *out = &output[0];
// Translate
while(*input)
copyFromArray(arr, *(input++), out);
// Make sure that we wrote the correct number of bytes
assert((out-&output[0]) == (int)outlen);
// And make extra sure the output is null terminated
assert(output.size() > outlen);
assert(output[outlen] == 0);
// Return a string
return std::string(&output[0], outlen);
}
static size_t getLength2(const char *arr, const char* input, bool &ascii)
{ {
ascii = true; ascii = true;
size_t len = 0; size_t len = 0;
@ -237,10 +266,7 @@ static size_t getLength2(const char *arr, const char* input, bool &ascii)
return len; return len;
} }
#include <iostream> void Utf8Encoder::copyFromArray2(const char*& chp, char* &out)
#include <iomanip>
static void copyFromArray2(const char *arr, char*& chp, char* &out)
{ {
unsigned char ch = *(chp++); unsigned char ch = *(chp++);
// Optimize for ASCII values // Optimize for ASCII values
@ -278,7 +304,7 @@ static void copyFromArray2(const char *arr, char*& chp, char* &out)
for (int i = 128; i < 256; i++) for (int i = 128; i < 256; i++)
{ {
unsigned char b1 = arr[i*6 + 1], b2 = arr[i*6 + 2], b3 = arr[i*6 + 3]; unsigned char b1 = translationArray[i*6 + 1], b2 = translationArray[i*6 + 2], b3 = translationArray[i*6 + 3];
if (b1 == ch && b2 == ch2 && (len != 3 || b3 == ch3)) if (b1 == ch && b2 == ch2 && (len != 3 || b3 == ch3))
{ {
*(out++) = (char)i; *(out++) = (char)i;
@ -291,73 +317,6 @@ static void copyFromArray2(const char *arr, char*& chp, char* &out)
*(out++) = ch; // Could not find glyph, just put whatever *(out++) = ch; // Could not find glyph, just put whatever
} }
std::string ToUTF8::getLegacyEnc(ToUTF8::FromType to)
{
// Pick translation array
const char *arr;
switch (to)
{
case ToUTF8::WINDOWS_1252:
{
arr = ToUTF8::windows_1252;
break;
}
case ToUTF8::WINDOWS_1250:
{
arr = ToUTF8::windows_1250;
break;
}
case ToUTF8::WINDOWS_1251:
{
arr = ToUTF8::windows_1251;
break;
}
default:
{
assert(0);
}
}
// Double check that the input string stops at some point (it might
// contain zero terminators before this, inside its own data, which
// is also ok.)
char* input = &buf[0];
assert(input[size] == 0);
// TODO: The rest of this function is designed for single-character
// input encodings only. It also assumes that the input the input
// encoding shares its first 128 values (0-127) with ASCII. These
// conditions must be checked again if you add more input encodings
// later.
// Compute output length, and check for pure ascii input at the same
// time.
bool ascii;
size_t outlen = getLength2(arr, input, ascii);
// If we're pure ascii, then don't bother converting anything.
if(ascii)
return std::string(input, outlen);
// Make sure the output is large enough
resize(output, outlen);
char *out = &output[0];
// Translate
while(*input)
copyFromArray2(arr, input, out);
// Make sure that we wrote the correct number of bytes
assert((out-&output[0]) == (int)outlen);
// And make extra sure the output is null terminated
assert(output.size() > outlen);
assert(output[outlen] == 0);
// Return a string
return std::string(&output[0], outlen);
}
ToUTF8::FromType ToUTF8::calculateEncoding(const std::string& encodingName) ToUTF8::FromType ToUTF8::calculateEncoding(const std::string& encodingName)
{ {
if (encodingName == "win1250") if (encodingName == "win1250")

View file

@ -2,6 +2,8 @@
#define COMPONENTS_TOUTF8_H #define COMPONENTS_TOUTF8_H
#include <string> #include <string>
#include <cstring>
#include <vector>
namespace ToUTF8 namespace ToUTF8
{ {
@ -14,17 +16,39 @@ namespace ToUTF8
// probably others) // probably others)
}; };
// Return a writable buffer of at least 'size' bytes. The buffer
// does not have to be freed.
char* getBuffer(int size);
// Convert the previously written buffer to UTF8 from the given code
// page.
std::string getUtf8(FromType from);
std::string getLegacyEnc(FromType to);
FromType calculateEncoding(const std::string& encodingName); FromType calculateEncoding(const std::string& encodingName);
std::string encodingUsingMessage(const std::string& encodingName); std::string encodingUsingMessage(const std::string& encodingName);
/// Converts text between one legacy Windows code page and UTF-8.
///
/// The code page is chosen once, at construction. Each instance owns the
/// output buffer that the conversion functions reuse from call to call.
class Utf8Encoder
{
public:
    /// @param sourceEncoding legacy code page this encoder converts from
    ///        (getUtf8) and back to (getLegacyEnc).
    Utf8Encoder(FromType sourceEncoding);

    /// Convert `input` from the code page given at construction to UTF-8.
    /// `input[size]` must be a zero byte; conversion stops at the first
    /// zero byte in the data.
    std::string getUtf8(const char *input, int size);

    /// Convenience overload for std::string input.
    inline std::string getUtf8(const std::string &str)
    {
        return getUtf8(str.c_str(), str.size());
    }

    /// Convert UTF-8 `input` to the code page given at construction.
    /// `input[size]` must be a zero byte; conversion stops at the first
    /// zero byte in the data.
    std::string getLegacyEnc(const char *input, int size);

    /// Convenience overload for std::string input.
    inline std::string getLegacyEnc(const std::string &str)
    {
        return getLegacyEnc(str.c_str(), str.size());
    }

private:
    /// Make sure mOutput can hold `size` bytes plus a terminating zero.
    void resize(size_t size);

    // Length computation / per-character copy for legacy -> UTF-8.
    size_t getLength(const char* input, bool &ascii);
    void copyFromArray(unsigned char chp, char* &out);

    // Length computation / per-character copy for UTF-8 -> legacy.
    size_t getLength2(const char* input, bool &ascii);
    void copyFromArray2(const char*& chp, char* &out);

    /// Scratch buffer holding the result of the most recent conversion.
    std::vector<char> mOutput;

    /// Lookup table for the selected code page. It is only ever read, so it
    /// is held through a pointer-to-const.
    const char* translationArray;
};
} }
#endif #endif

View file

@ -50,10 +50,7 @@ namespace Translation
if (!line.empty()) if (!line.empty())
{ {
char* buffer = ToUTF8::getBuffer(line.size() + 1); line = mEncoder->getUtf8(line);
//buffer has at least line.size() + 1 bytes, so it must be safe
strcpy(buffer, line.c_str());
line = ToUTF8::getUtf8(mEncoding);
size_t tab_pos = line.find('\t'); size_t tab_pos = line.find('\t');
if (tab_pos != std::string::npos && tab_pos > 0 && tab_pos < line.size() - 1) if (tab_pos != std::string::npos && tab_pos > 0 && tab_pos < line.size() - 1)
@ -104,9 +101,9 @@ namespace Translation
return phrase; return phrase;
} }
void Storage::setEncoding (const ToUTF8::FromType& encoding) void Storage::setEncoder(ToUTF8::Utf8Encoder* encoder)
{ {
mEncoding = encoding; mEncoder = encoder;
} }
bool Storage::hasTranslation() const bool Storage::hasTranslation() const

View file

@ -19,7 +19,7 @@ namespace Translation
// Standard form usually means nominative case // Standard form usually means nominative case
std::string topicStandardForm(const std::string& phrase) const; std::string topicStandardForm(const std::string& phrase) const;
void setEncoding (const ToUTF8::FromType& encoding); void setEncoder(ToUTF8::Utf8Encoder* encoder);
bool hasTranslation() const; bool hasTranslation() const;
@ -34,7 +34,7 @@ namespace Translation
void loadDataFromStream(ContainerType& container, std::istream& stream); void loadDataFromStream(ContainerType& container, std::istream& stream);
ToUTF8::FromType mEncoding; ToUTF8::Utf8Encoder* mEncoder;
ContainerType mCellNamesTranslations, mTopicIDs, mPhraseForms; ContainerType mCellNamesTranslations, mTopicIDs, mPhraseForms;
}; };
} }