mirror of
https://github.com/OpenMW/openmw.git
synced 2025-01-19 20:23:54 +00:00
Merge remote-tracking branch 'potatoesmaster/to_utf8-rewrite'
This commit is contained in:
commit
25815ab8f7
23 changed files with 423 additions and 365 deletions
|
@ -165,23 +165,12 @@ bool parseOptions (int argc, char** argv, Arguments &info)
|
||||||
|
|
||||||
// Font encoding settings
|
// Font encoding settings
|
||||||
info.encoding = variables["encoding"].as<std::string>();
|
info.encoding = variables["encoding"].as<std::string>();
|
||||||
if (info.encoding == "win1250")
|
if(info.encoding != "win1250" && info.encoding != "win1251" && info.encoding != "win1252")
|
||||||
{
|
|
||||||
std::cout << "Using Central and Eastern European font encoding." << std::endl;
|
|
||||||
}
|
|
||||||
else if (info.encoding == "win1251")
|
|
||||||
{
|
|
||||||
std::cout << "Using Cyrillic font encoding." << std::endl;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
if(info.encoding != "win1252")
|
|
||||||
{
|
{
|
||||||
std::cout << info.encoding << " is not a valid encoding option." << std::endl;
|
std::cout << info.encoding << " is not a valid encoding option." << std::endl;
|
||||||
info.encoding = "win1252";
|
info.encoding = "win1252";
|
||||||
}
|
}
|
||||||
std::cout << "Using default (English) font encoding." << std::endl;
|
std::cout << ToUTF8::encodingUsingMessage(info.encoding) << std::endl;
|
||||||
}
|
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
@ -262,7 +251,8 @@ void printRaw(ESM::ESMReader &esm)
|
||||||
int load(Arguments& info)
|
int load(Arguments& info)
|
||||||
{
|
{
|
||||||
ESM::ESMReader& esm = info.reader;
|
ESM::ESMReader& esm = info.reader;
|
||||||
esm.setEncoding(ToUTF8::calculateEncoding(info.encoding));
|
ToUTF8::Utf8Encoder encoder (ToUTF8::calculateEncoding(info.encoding));
|
||||||
|
esm.setEncoder(&encoder);
|
||||||
|
|
||||||
std::string filename = info.filename;
|
std::string filename = info.filename;
|
||||||
std::cout << "Loading file: " << filename << std::endl;
|
std::cout << "Loading file: " << filename << std::endl;
|
||||||
|
@ -432,7 +422,8 @@ int clone(Arguments& info)
|
||||||
std::cout << std::endl << "Saving records to: " << info.outname << "..." << std::endl;
|
std::cout << std::endl << "Saving records to: " << info.outname << "..." << std::endl;
|
||||||
|
|
||||||
ESM::ESMWriter& esm = info.writer;
|
ESM::ESMWriter& esm = info.writer;
|
||||||
esm.setEncoding(info.encoding);
|
ToUTF8::Utf8Encoder encoder (ToUTF8::calculateEncoding(info.encoding));
|
||||||
|
esm.setEncoder(&encoder);
|
||||||
esm.setAuthor(info.data.author);
|
esm.setAuthor(info.data.author);
|
||||||
esm.setDescription(info.data.description);
|
esm.setDescription(info.data.description);
|
||||||
esm.setVersion(info.data.version);
|
esm.setVersion(info.data.version);
|
||||||
|
|
|
@ -272,7 +272,8 @@ void DataFilesModel::addMasters(const QString &path)
|
||||||
foreach (const QString &path, dir.entryList()) {
|
foreach (const QString &path, dir.entryList()) {
|
||||||
try {
|
try {
|
||||||
ESM::ESMReader fileReader;
|
ESM::ESMReader fileReader;
|
||||||
fileReader.setEncoding(ToUTF8::calculateEncoding(mEncoding.toStdString()));
|
ToUTF8::Utf8Encoder encoder (ToUTF8::calculateEncoding(mEncoding.toStdString()));
|
||||||
|
fileReader.setEncoder(&encoder);
|
||||||
fileReader.open(dir.absoluteFilePath(path).toStdString());
|
fileReader.open(dir.absoluteFilePath(path).toStdString());
|
||||||
|
|
||||||
ESM::ESMReader::MasterList mlist = fileReader.getMasters();
|
ESM::ESMReader::MasterList mlist = fileReader.getMasters();
|
||||||
|
@ -335,7 +336,8 @@ void DataFilesModel::addPlugins(const QString &path)
|
||||||
|
|
||||||
try {
|
try {
|
||||||
ESM::ESMReader fileReader;
|
ESM::ESMReader fileReader;
|
||||||
fileReader.setEncoding(ToUTF8::calculateEncoding(mEncoding.toStdString()));
|
ToUTF8::Utf8Encoder encoder (ToUTF8::calculateEncoding(mEncoding.toStdString()));
|
||||||
|
fileReader.setEncoder(&encoder);
|
||||||
fileReader.open(dir.absoluteFilePath(path).toStdString());
|
fileReader.open(dir.absoluteFilePath(path).toStdString());
|
||||||
|
|
||||||
ESM::ESMReader::MasterList mlist = fileReader.getMasters();
|
ESM::ESMReader::MasterList mlist = fileReader.getMasters();
|
||||||
|
|
|
@ -649,11 +649,12 @@ MwIniImporter::multistrmap MwIniImporter::loadIniFile(std::string filename) {
|
||||||
std::string section("");
|
std::string section("");
|
||||||
MwIniImporter::multistrmap map;
|
MwIniImporter::multistrmap map;
|
||||||
boost::iostreams::stream<boost::iostreams::file_source>file(filename.c_str());
|
boost::iostreams::stream<boost::iostreams::file_source>file(filename.c_str());
|
||||||
|
ToUTF8::Utf8Encoder encoder(mEncoding);
|
||||||
|
|
||||||
std::string line;
|
std::string line;
|
||||||
while (std::getline(file, line)) {
|
while (std::getline(file, line)) {
|
||||||
|
|
||||||
line = toUTF8(line);
|
line = encoder.getUtf8(line);
|
||||||
|
|
||||||
// unify Unix-style and Windows file ending
|
// unify Unix-style and Windows file ending
|
||||||
if (!(line.empty()) && (line[line.length()-1]) == '\r') {
|
if (!(line.empty()) && (line[line.length()-1]) == '\r') {
|
||||||
|
@ -829,14 +830,6 @@ void MwIniImporter::writeToFile(boost::iostreams::stream<boost::iostreams::file_
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string MwIniImporter::toUTF8(const std::string &str) {
|
|
||||||
char *ptr = ToUTF8::getBuffer(str.length());
|
|
||||||
strncpy(ptr, str.c_str(), str.length());
|
|
||||||
|
|
||||||
// Convert to UTF8 and return
|
|
||||||
return ToUTF8::getUtf8(mEncoding);
|
|
||||||
}
|
|
||||||
|
|
||||||
void MwIniImporter::setInputEncoding(const ToUTF8::FromType &encoding)
|
void MwIniImporter::setInputEncoding(const ToUTF8::FromType &encoding)
|
||||||
{
|
{
|
||||||
mEncoding = encoding;
|
mEncoding = encoding;
|
||||||
|
|
|
@ -8,7 +8,7 @@
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <exception>
|
#include <exception>
|
||||||
|
|
||||||
#include "../../components/to_utf8/to_utf8.hpp"
|
#include <components/to_utf8/to_utf8.hpp>
|
||||||
|
|
||||||
class MwIniImporter {
|
class MwIniImporter {
|
||||||
public:
|
public:
|
||||||
|
|
|
@ -331,11 +331,15 @@ void OMW::Engine::go()
|
||||||
// cursor replacer (converts the cursor from the bsa so they can be used by mygui)
|
// cursor replacer (converts the cursor from the bsa so they can be used by mygui)
|
||||||
MWGui::CursorReplace replacer;
|
MWGui::CursorReplace replacer;
|
||||||
|
|
||||||
|
// Create encoder
|
||||||
|
ToUTF8::Utf8Encoder encoder (mEncoding);
|
||||||
|
|
||||||
// Create the world
|
// Create the world
|
||||||
mEnvironment.setWorld (new MWWorld::World (*mOgre, mFileCollections, mMaster,
|
mEnvironment.setWorld (new MWWorld::World (*mOgre, mFileCollections, mMaster,
|
||||||
mResDir, mCfgMgr.getCachePath(), mNewGame, mEncoding, mFallbackMap));
|
mResDir, mCfgMgr.getCachePath(), mNewGame, &encoder, mFallbackMap));
|
||||||
|
|
||||||
//Load translation data
|
//Load translation data
|
||||||
|
mTranslationDataStorage.setEncoder(&encoder);
|
||||||
mTranslationDataStorage.loadTranslationData(mFileCollections, mMaster);
|
mTranslationDataStorage.loadTranslationData(mFileCollections, mMaster);
|
||||||
|
|
||||||
// Create window manager - this manages all the MW-specific GUI windows
|
// Create window manager - this manages all the MW-specific GUI windows
|
||||||
|
@ -494,7 +498,6 @@ void OMW::Engine::showFPS(int level)
|
||||||
void OMW::Engine::setEncoding(const ToUTF8::FromType& encoding)
|
void OMW::Engine::setEncoding(const ToUTF8::FromType& encoding)
|
||||||
{
|
{
|
||||||
mEncoding = encoding;
|
mEncoding = encoding;
|
||||||
mTranslationDataStorage.setEncoding (encoding);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void OMW::Engine::setFallbackValues(std::map<std::string,std::string> fallbackMap)
|
void OMW::Engine::setFallbackValues(std::map<std::string,std::string> fallbackMap)
|
||||||
|
|
|
@ -170,7 +170,7 @@ namespace MWWorld
|
||||||
World::World (OEngine::Render::OgreRenderer& renderer,
|
World::World (OEngine::Render::OgreRenderer& renderer,
|
||||||
const Files::Collections& fileCollections,
|
const Files::Collections& fileCollections,
|
||||||
const std::string& master, const boost::filesystem::path& resDir, const boost::filesystem::path& cacheDir, bool newGame,
|
const std::string& master, const boost::filesystem::path& resDir, const boost::filesystem::path& cacheDir, bool newGame,
|
||||||
const ToUTF8::FromType& encoding, std::map<std::string,std::string> fallbackMap)
|
ToUTF8::Utf8Encoder* encoder, std::map<std::string,std::string> fallbackMap)
|
||||||
: mPlayer (0), mLocalScripts (mStore), mGlobalVariables (0),
|
: mPlayer (0), mLocalScripts (mStore), mGlobalVariables (0),
|
||||||
mSky (true), mCells (mStore, mEsm),
|
mSky (true), mCells (mStore, mEsm),
|
||||||
mNumFacing(0)
|
mNumFacing(0)
|
||||||
|
@ -187,7 +187,7 @@ namespace MWWorld
|
||||||
std::cout << "Loading ESM " << masterPath.string() << "\n";
|
std::cout << "Loading ESM " << masterPath.string() << "\n";
|
||||||
|
|
||||||
// This parses the ESM file and loads a sample cell
|
// This parses the ESM file and loads a sample cell
|
||||||
mEsm.setEncoding(encoding);
|
mEsm.setEncoder(encoder);
|
||||||
mEsm.open (masterPath.string());
|
mEsm.open (masterPath.string());
|
||||||
mStore.load (mEsm);
|
mStore.load (mEsm);
|
||||||
|
|
||||||
|
|
|
@ -95,7 +95,7 @@ namespace MWWorld
|
||||||
World (OEngine::Render::OgreRenderer& renderer,
|
World (OEngine::Render::OgreRenderer& renderer,
|
||||||
const Files::Collections& fileCollections,
|
const Files::Collections& fileCollections,
|
||||||
const std::string& master, const boost::filesystem::path& resDir, const boost::filesystem::path& cacheDir, bool newGame,
|
const std::string& master, const boost::filesystem::path& resDir, const boost::filesystem::path& cacheDir, bool newGame,
|
||||||
const ToUTF8::FromType& encoding, std::map<std::string,std::string> fallbackMap);
|
ToUTF8::Utf8Encoder* encoder, std::map<std::string,std::string> fallbackMap);
|
||||||
|
|
||||||
virtual ~World();
|
virtual ~World();
|
||||||
|
|
||||||
|
|
|
@ -15,6 +15,11 @@ ESM_Context ESMReader::getContext()
|
||||||
return mCtx;
|
return mCtx;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
ESMReader::ESMReader(void):
|
||||||
|
mBuffer(50*1024)
|
||||||
|
{
|
||||||
|
}
|
||||||
|
|
||||||
void ESMReader::restoreContext(const ESM_Context &rc)
|
void ESMReader::restoreContext(const ESM_Context &rc)
|
||||||
{
|
{
|
||||||
// Reopen the file if necessary
|
// Reopen the file if necessary
|
||||||
|
@ -323,11 +328,21 @@ void ESMReader::getExact(void*x, int size)
|
||||||
|
|
||||||
std::string ESMReader::getString(int size)
|
std::string ESMReader::getString(int size)
|
||||||
{
|
{
|
||||||
char *ptr = ToUTF8::getBuffer(size);
|
size_t s = size;
|
||||||
mEsm->read(ptr, size);
|
if (mBuffer.size() <= s)
|
||||||
|
// Add some extra padding to reduce the chance of having to resize
|
||||||
|
// again later.
|
||||||
|
mBuffer.resize(3*s);
|
||||||
|
|
||||||
|
// And make sure the string is zero terminated
|
||||||
|
mBuffer[s] = 0;
|
||||||
|
|
||||||
|
// read ESM data
|
||||||
|
char *ptr = &mBuffer[0];
|
||||||
|
getExact(ptr, size);
|
||||||
|
|
||||||
// Convert to UTF8 and return
|
// Convert to UTF8 and return
|
||||||
return ToUTF8::getUtf8(mEncoding);
|
return mEncoder->getUtf8(ptr, size);
|
||||||
}
|
}
|
||||||
|
|
||||||
void ESMReader::fail(const std::string &msg)
|
void ESMReader::fail(const std::string &msg)
|
||||||
|
@ -345,9 +360,9 @@ void ESMReader::fail(const std::string &msg)
|
||||||
throw std::runtime_error(ss.str());
|
throw std::runtime_error(ss.str());
|
||||||
}
|
}
|
||||||
|
|
||||||
void ESMReader::setEncoding(const ToUTF8::FromType& encoding)
|
void ESMReader::setEncoder(ToUTF8::Utf8Encoder* encoder)
|
||||||
{
|
{
|
||||||
mEncoding = encoding;
|
mEncoder = encoder;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -20,6 +20,8 @@ class ESMReader
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
|
|
||||||
|
ESMReader(void);
|
||||||
|
|
||||||
/*************************************************************************
|
/*************************************************************************
|
||||||
*
|
*
|
||||||
* Public type definitions
|
* Public type definitions
|
||||||
|
@ -233,8 +235,8 @@ public:
|
||||||
/// Used for error handling
|
/// Used for error handling
|
||||||
void fail(const std::string &msg);
|
void fail(const std::string &msg);
|
||||||
|
|
||||||
/// Sets font encoding for ESM strings
|
/// Sets font encoder for ESM strings
|
||||||
void setEncoding(const ToUTF8::FromType& encoding);
|
void setEncoder(ToUTF8::Utf8Encoder* encoder);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
Ogre::DataStreamPtr mEsm;
|
Ogre::DataStreamPtr mEsm;
|
||||||
|
@ -244,9 +246,12 @@ private:
|
||||||
// Special file signifier (see SpecialFile enum above)
|
// Special file signifier (see SpecialFile enum above)
|
||||||
int mSpf;
|
int mSpf;
|
||||||
|
|
||||||
|
// Buffer for ESM strings
|
||||||
|
std::vector<char> mBuffer;
|
||||||
|
|
||||||
SaveData mSaveData;
|
SaveData mSaveData;
|
||||||
MasterList mMasters;
|
MasterList mMasters;
|
||||||
ToUTF8::FromType mEncoding;
|
ToUTF8::Utf8Encoder* mEncoder;
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -157,12 +157,8 @@ void ESMWriter::writeHString(const std::string& data)
|
||||||
write("\0", 1);
|
write("\0", 1);
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
char *ptr = ToUTF8::getBuffer(data.size()+1);
|
|
||||||
strncpy(ptr, &data[0], data.size());
|
|
||||||
ptr[data.size()] = '\0';
|
|
||||||
|
|
||||||
// Convert to UTF8 and return
|
// Convert to UTF8 and return
|
||||||
std::string ascii = ToUTF8::getLegacyEnc(m_encoding);
|
std::string ascii = m_encoder->getLegacyEnc(data);
|
||||||
|
|
||||||
write(ascii.c_str(), ascii.size());
|
write(ascii.c_str(), ascii.size());
|
||||||
}
|
}
|
||||||
|
@ -192,21 +188,9 @@ void ESMWriter::write(const char* data, int size)
|
||||||
m_stream->write(data, size);
|
m_stream->write(data, size);
|
||||||
}
|
}
|
||||||
|
|
||||||
void ESMWriter::setEncoding(const std::string& encoding)
|
void ESMWriter::setEncoder(ToUTF8::Utf8Encoder* encoder)
|
||||||
{
|
{
|
||||||
if (encoding == "win1250")
|
m_encoder = encoder;
|
||||||
{
|
|
||||||
m_encoding = ToUTF8::WINDOWS_1250;
|
|
||||||
}
|
|
||||||
else if (encoding == "win1251")
|
|
||||||
{
|
|
||||||
m_encoding = ToUTF8::WINDOWS_1251;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
// Default Latin encoding
|
|
||||||
m_encoding = ToUTF8::WINDOWS_1252;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -6,7 +6,7 @@
|
||||||
#include <assert.h>
|
#include <assert.h>
|
||||||
|
|
||||||
#include "esmcommon.hpp"
|
#include "esmcommon.hpp"
|
||||||
#include "../to_utf8/to_utf8.hpp"
|
#include <components/to_utf8/to_utf8.hpp>
|
||||||
|
|
||||||
namespace ESM {
|
namespace ESM {
|
||||||
|
|
||||||
|
@ -24,7 +24,7 @@ public:
|
||||||
void setVersion(int ver);
|
void setVersion(int ver);
|
||||||
int getType();
|
int getType();
|
||||||
void setType(int type);
|
void setType(int type);
|
||||||
void setEncoding(const std::string& encoding); // Write strings as UTF-8?
|
void setEncoder(ToUTF8::Utf8Encoder *encoding); // Write strings as UTF-8?
|
||||||
void setAuthor(const std::string& author);
|
void setAuthor(const std::string& author);
|
||||||
void setDescription(const std::string& desc);
|
void setDescription(const std::string& desc);
|
||||||
|
|
||||||
|
@ -94,11 +94,10 @@ private:
|
||||||
std::list<RecordData> m_records;
|
std::list<RecordData> m_records;
|
||||||
std::ostream* m_stream;
|
std::ostream* m_stream;
|
||||||
std::streampos m_headerPos;
|
std::streampos m_headerPos;
|
||||||
ToUTF8::FromType m_encoding;
|
ToUTF8::Utf8Encoder* m_encoder;
|
||||||
int m_recordCount;
|
int m_recordCount;
|
||||||
|
|
||||||
HEDRstruct m_header;
|
HEDRstruct m_header;
|
||||||
SaveData m_saveData;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
1
components/to_utf8/tests/.gitignore
vendored
Normal file
1
components/to_utf8/tests/.gitignore
vendored
Normal file
|
@ -0,0 +1 @@
|
||||||
|
*_test
|
4
components/to_utf8/tests/output/to_utf8_test.out
Normal file
4
components/to_utf8/tests/output/to_utf8_test.out
Normal file
|
@ -0,0 +1,4 @@
|
||||||
|
original: Без вопросов отдаете ему рулет, зная, что позже вы сможете привести с собой своих друзей и тогда он получит по заслугам?
|
||||||
|
converted: Без вопросов отдаете ему рулет, зная, что позже вы сможете привести с собой своих друзей и тогда он получит по заслугам?
|
||||||
|
original: Vous lui donnez le gâteau sans protester avant d’aller chercher tous vos amis et de revenir vous venger.
|
||||||
|
converted: Vous lui donnez le gâteau sans protester avant d’aller chercher tous vos amis et de revenir vous venger.
|
18
components/to_utf8/tests/test.sh
Executable file
18
components/to_utf8/tests/test.sh
Executable file
|
@ -0,0 +1,18 @@
|
||||||
|
#!/bin/bash
# Build and run every *_test program found in the current directory.
#
# The output of each test program is compared against the recorded
# reference in output/<name>.out. When no reference exists yet, one is
# created from the current output and staged with git.
#
# Exits non-zero when the build fails or any test output differs from
# its reference.

make || exit

mkdir -p output

# Without nullglob an unmatched pattern would be iterated literally as
# "*_test", and the loop would try to run and record a bogus program.
shopt -s nullglob

status=0

for a in *_test; do
    # Only regular, executable files can be test programs.
    if [ ! -f "$a" ] || [ ! -x "$a" ]; then
        continue
    fi

    if [ -f "output/$a.out" ]; then
        echo "Running $a:"
        # diff prints every deviation from the recorded output; remember
        # the failure so it is not masked by later, passing tests.
        "./$a" | diff "output/$a.out" - || status=1
    else
        echo "Creating $a.out"
        "./$a" > "output/$a.out"
        git add "output/$a.out"
    fi
done

exit $status
|
1
components/to_utf8/tests/test_data/french-utf8.txt
Normal file
1
components/to_utf8/tests/test_data/french-utf8.txt
Normal file
|
@ -0,0 +1 @@
|
||||||
|
Vous lui donnez le gâteau sans protester avant d’aller chercher tous vos amis et de revenir vous venger.
|
1
components/to_utf8/tests/test_data/french-win1252.txt
Normal file
1
components/to_utf8/tests/test_data/french-win1252.txt
Normal file
|
@ -0,0 +1 @@
|
||||||
|
Vous lui donnez le gâteau sans protester avant d’aller chercher tous vos amis et de revenir vous venger.
|
1
components/to_utf8/tests/test_data/russian-utf8.txt
Normal file
1
components/to_utf8/tests/test_data/russian-utf8.txt
Normal file
|
@ -0,0 +1 @@
|
||||||
|
Без вопросов отдаете ему рулет, зная, что позже вы сможете привести с собой своих друзей и тогда он получит по заслугам?
|
1
components/to_utf8/tests/test_data/russian-win1251.txt
Normal file
1
components/to_utf8/tests/test_data/russian-win1251.txt
Normal file
|
@ -0,0 +1 @@
|
||||||
|
Без вопросов отдаете ему рулет, зная, что позже вы сможете привести с собой своих друзей и тогда он получит по заслугам?
|
59
components/to_utf8/tests/to_utf8_test.cpp
Normal file
59
components/to_utf8/tests/to_utf8_test.cpp
Normal file
|
@ -0,0 +1,59 @@
|
||||||
|
#include <iostream>
|
||||||
|
#include <fstream>
|
||||||
|
#include <cassert>
|
||||||
|
#include <stdexcept>
|
||||||
|
|
||||||
|
#include "../to_utf8.hpp"
|
||||||
|
|
||||||
|
std::string getFirstLine(const std::string &filename);
|
||||||
|
void testEncoder(ToUTF8::FromType encoding, const std::string &legacyEncFile,
|
||||||
|
const std::string &utf8File);
|
||||||
|
|
||||||
|
/// Test character encoding conversion to and from UTF-8
|
||||||
|
void testEncoder(ToUTF8::FromType encoding, const std::string &legacyEncFile,
|
||||||
|
const std::string &utf8File)
|
||||||
|
{
|
||||||
|
// get some test data
|
||||||
|
std::string legacyEncLine = getFirstLine(legacyEncFile);
|
||||||
|
std::string utf8Line = getFirstLine(utf8File);
|
||||||
|
|
||||||
|
// create an encoder for specified character encoding
|
||||||
|
ToUTF8::Utf8Encoder encoder (encoding);
|
||||||
|
|
||||||
|
// convert text to UTF-8
|
||||||
|
std::string convertedUtf8Line = encoder.getUtf8(legacyEncLine);
|
||||||
|
|
||||||
|
std::cout << "original: " << utf8Line << std::endl;
|
||||||
|
std::cout << "converted: " << convertedUtf8Line << std::endl;
|
||||||
|
|
||||||
|
// check correctness
|
||||||
|
assert(convertedUtf8Line == utf8Line);
|
||||||
|
|
||||||
|
// convert UTF-8 text to legacy encoding
|
||||||
|
std::string convertedLegacyEncLine = encoder.getLegacyEnc(utf8Line);
|
||||||
|
// check correctness
|
||||||
|
assert(convertedLegacyEncLine == legacyEncLine);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Read the first line of a text file.
///
/// \param filename path of the file to read
/// \return the first line of the file, without the line terminator
/// \throws std::runtime_error if the file cannot be opened or no line can
///         be read from it
std::string getFirstLine(const std::string &filename)
{
    std::ifstream text (filename.c_str());

    if (!text.is_open())
    {
        throw std::runtime_error("Unable to open file " + filename);
    }

    std::string line;

    // An empty or unreadable file must not be reported as an empty line:
    // a test comparing two such "lines" would succeed on no data at all.
    if (!std::getline(text, line))
    {
        throw std::runtime_error("Unable to read a line from file " + filename);
    }

    // No explicit close needed, the stream's destructor closes the file.
    return line;
}
|
||||||
|
|
||||||
|
int main()
|
||||||
|
{
|
||||||
|
testEncoder(ToUTF8::WINDOWS_1251, "test_data/russian-win1251.txt", "test_data/russian-utf8.txt");
|
||||||
|
testEncoder(ToUTF8::WINDOWS_1252, "test_data/french-win1252.txt", "test_data/french-utf8.txt");
|
||||||
|
return 0;
|
||||||
|
}
|
|
@ -2,6 +2,8 @@
|
||||||
|
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <cassert>
|
#include <cassert>
|
||||||
|
#include <iostream>
|
||||||
|
#include <iomanip>
|
||||||
|
|
||||||
/* This file contains the code to translate from WINDOWS-1252 (native
|
/* This file contains the code to translate from WINDOWS-1252 (native
|
||||||
charset used in English version of Morrowind) to UTF-8. The library
|
charset used in English version of Morrowind) to UTF-8. The library
|
||||||
|
@ -39,34 +41,128 @@
|
||||||
// Generated tables
|
// Generated tables
|
||||||
#include "tables_gen.hpp"
|
#include "tables_gen.hpp"
|
||||||
|
|
||||||
// Shared global buffers, we love you. These initial sizes are large
|
using namespace ToUTF8;
|
||||||
// enough to hold the largest books in Morrowind.esm, but we will
|
|
||||||
// resize automaticall if necessary.
|
|
||||||
static std::vector<char> buf (50*1024);
|
|
||||||
static std::vector<char> output (50*1024);
|
|
||||||
static int size;
|
|
||||||
|
|
||||||
// Make sure the given vector is large enough for 'size' bytes,
|
Utf8Encoder::Utf8Encoder(const FromType sourceEncoding):
|
||||||
// including a terminating zero after it.
|
mOutput(50*1024)
|
||||||
static void resize(std::vector<char> &buf, size_t size)
|
|
||||||
{
|
{
|
||||||
if(buf.size() <= size)
|
switch (sourceEncoding)
|
||||||
// Add some extra padding to reduce the chance of having to resize
|
{
|
||||||
// again later.
|
case ToUTF8::WINDOWS_1252:
|
||||||
buf.resize(3*size);
|
{
|
||||||
|
translationArray = ToUTF8::windows_1252;
|
||||||
// And make sure the string is zero terminated
|
break;
|
||||||
buf[size] = 0;
|
}
|
||||||
|
case ToUTF8::WINDOWS_1250:
|
||||||
|
{
|
||||||
|
translationArray = ToUTF8::windows_1250;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
case ToUTF8::WINDOWS_1251:
|
||||||
|
{
|
||||||
|
translationArray = ToUTF8::windows_1251;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
default:
|
||||||
|
{
|
||||||
|
assert(0);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// This is just used to spew out a reusable input buffer for the
|
std::string Utf8Encoder::getUtf8(const char* input, int size)
|
||||||
// conversion process.
|
|
||||||
char *ToUTF8::getBuffer(int s)
|
|
||||||
{
|
{
|
||||||
// Remember the requested size
|
// Double check that the input string stops at some point (it might
|
||||||
size = s;
|
// contain zero terminators before this, inside its own data, which
|
||||||
resize(buf, size);
|
// is also ok.)
|
||||||
return &buf[0];
|
assert(input[size] == 0);
|
||||||
|
|
||||||
|
// TODO: The rest of this function is designed for single-character
|
||||||
|
// input encodings only. It also assumes that the input
|
||||||
|
// encoding shares its first 128 values (0-127) with ASCII. These
|
||||||
|
// conditions must be checked again if you add more input encodings
|
||||||
|
// later.
|
||||||
|
|
||||||
|
// Compute output length, and check for pure ascii input at the same
|
||||||
|
// time.
|
||||||
|
bool ascii;
|
||||||
|
size_t outlen = getLength(input, ascii);
|
||||||
|
|
||||||
|
// If we're pure ascii, then don't bother converting anything.
|
||||||
|
if(ascii)
|
||||||
|
return std::string(input, outlen);
|
||||||
|
|
||||||
|
// Make sure the output is large enough
|
||||||
|
resize(outlen);
|
||||||
|
char *out = &mOutput[0];
|
||||||
|
|
||||||
|
// Translate
|
||||||
|
while (*input)
|
||||||
|
copyFromArray(*(input++), out);
|
||||||
|
|
||||||
|
// Make sure that we wrote the correct number of bytes
|
||||||
|
assert((out-&mOutput[0]) == (int)outlen);
|
||||||
|
|
||||||
|
// And make extra sure the output is null terminated
|
||||||
|
assert(mOutput.size() > outlen);
|
||||||
|
assert(mOutput[outlen] == 0);
|
||||||
|
|
||||||
|
// Return a string
|
||||||
|
return std::string(&mOutput[0], outlen);
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string Utf8Encoder::getLegacyEnc(const char *input, int size)
|
||||||
|
{
|
||||||
|
// Double check that the input string stops at some point (it might
|
||||||
|
// contain zero terminators before this, inside its own data, which
|
||||||
|
// is also ok.)
|
||||||
|
assert(input[size] == 0);
|
||||||
|
|
||||||
|
// TODO: The rest of this function is designed for single-character
|
||||||
|
// input encodings only. It also assumes that the input
|
||||||
|
// encoding shares its first 128 values (0-127) with ASCII. These
|
||||||
|
// conditions must be checked again if you add more input encodings
|
||||||
|
// later.
|
||||||
|
|
||||||
|
// Compute output length, and check for pure ascii input at the same
|
||||||
|
// time.
|
||||||
|
bool ascii;
|
||||||
|
size_t outlen = getLength2(input, ascii);
|
||||||
|
|
||||||
|
// If we're pure ascii, then don't bother converting anything.
|
||||||
|
if(ascii)
|
||||||
|
return std::string(input, outlen);
|
||||||
|
|
||||||
|
// Make sure the output is large enough
|
||||||
|
resize(outlen);
|
||||||
|
char *out = &mOutput[0];
|
||||||
|
|
||||||
|
// Translate
|
||||||
|
while(*input)
|
||||||
|
copyFromArray2(input, out);
|
||||||
|
|
||||||
|
// Make sure that we wrote the correct number of bytes
|
||||||
|
assert((out-&mOutput[0]) == (int)outlen);
|
||||||
|
|
||||||
|
// And make extra sure the output is null terminated
|
||||||
|
assert(mOutput.size() > outlen);
|
||||||
|
assert(mOutput[outlen] == 0);
|
||||||
|
|
||||||
|
// Return a string
|
||||||
|
return std::string(&mOutput[0], outlen);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Make sure the output vector is large enough for 'size' bytes,
|
||||||
|
// including a terminating zero after it.
|
||||||
|
void Utf8Encoder::resize(size_t size)
|
||||||
|
{
|
||||||
|
if (mOutput.size() <= size)
|
||||||
|
// Add some extra padding to reduce the chance of having to resize
|
||||||
|
// again later.
|
||||||
|
mOutput.resize(3*size);
|
||||||
|
|
||||||
|
// And make sure the string is zero terminated
|
||||||
|
mOutput[size] = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Get the total length needed to decode the given string with
|
/** Get the total length needed to decode the given string with
|
||||||
|
@ -79,7 +175,7 @@ char *ToUTF8::getBuffer(int s)
|
||||||
is the case, then the ascii parameter is set to true, and the
|
is the case, then the ascii parameter is set to true, and the
|
||||||
caller can optimize for this case.
|
caller can optimize for this case.
|
||||||
*/
|
*/
|
||||||
static size_t getLength(const char *arr, const char* input, bool &ascii)
|
size_t Utf8Encoder::getLength(const char* input, bool &ascii)
|
||||||
{
|
{
|
||||||
ascii = true;
|
ascii = true;
|
||||||
size_t len = 0;
|
size_t len = 0;
|
||||||
|
@ -102,7 +198,7 @@ static size_t getLength(const char *arr, const char* input, bool &ascii)
|
||||||
{
|
{
|
||||||
// Find the translated length of this character in the
|
// Find the translated length of this character in the
|
||||||
// lookup table.
|
// lookup table.
|
||||||
len += arr[inp*6];
|
len += translationArray[inp*6];
|
||||||
inp = *(++ptr);
|
inp = *(++ptr);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -111,7 +207,7 @@ static size_t getLength(const char *arr, const char* input, bool &ascii)
|
||||||
|
|
||||||
// Translate one character 'ch' using the translation array 'arr', and
|
// Translate one character 'ch' using the translation array 'arr', and
|
||||||
// advance the output pointer accordingly.
|
// advance the output pointer accordingly.
|
||||||
static void copyFromArray(const char *arr, unsigned char ch, char* &out)
|
void Utf8Encoder::copyFromArray(unsigned char ch, char* &out)
|
||||||
{
|
{
|
||||||
// Optimize for ASCII values
|
// Optimize for ASCII values
|
||||||
if (ch < 128)
|
if (ch < 128)
|
||||||
|
@ -120,80 +216,13 @@ static void copyFromArray(const char *arr, unsigned char ch, char* &out)
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
const char *in = arr + ch*6;
|
const char *in = translationArray + ch*6;
|
||||||
int len = *(in++);
|
int len = *(in++);
|
||||||
for (int i=0; i<len; i++)
|
for (int i=0; i<len; i++)
|
||||||
*(out++) = *(in++);
|
*(out++) = *(in++);
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string ToUTF8::getUtf8(ToUTF8::FromType from)
|
size_t Utf8Encoder::getLength2(const char* input, bool &ascii)
|
||||||
{
|
|
||||||
// Pick translation array
|
|
||||||
const char *arr;
|
|
||||||
switch (from)
|
|
||||||
{
|
|
||||||
case ToUTF8::WINDOWS_1252:
|
|
||||||
{
|
|
||||||
arr = ToUTF8::windows_1252;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
case ToUTF8::WINDOWS_1250:
|
|
||||||
{
|
|
||||||
arr = ToUTF8::windows_1250;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
case ToUTF8::WINDOWS_1251:
|
|
||||||
{
|
|
||||||
arr = ToUTF8::windows_1251;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
default:
|
|
||||||
{
|
|
||||||
assert(0);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Double check that the input string stops at some point (it might
|
|
||||||
// contain zero terminators before this, inside its own data, which
|
|
||||||
// is also ok.)
|
|
||||||
const char* input = &buf[0];
|
|
||||||
assert(input[size] == 0);
|
|
||||||
|
|
||||||
// TODO: The rest of this function is designed for single-character
|
|
||||||
// input encodings only. It also assumes that the input the input
|
|
||||||
// encoding shares its first 128 values (0-127) with ASCII. These
|
|
||||||
// conditions must be checked again if you add more input encodings
|
|
||||||
// later.
|
|
||||||
|
|
||||||
// Compute output length, and check for pure ascii input at the same
|
|
||||||
// time.
|
|
||||||
bool ascii;
|
|
||||||
size_t outlen = getLength(arr, input, ascii);
|
|
||||||
|
|
||||||
// If we're pure ascii, then don't bother converting anything.
|
|
||||||
if(ascii)
|
|
||||||
return std::string(input, outlen);
|
|
||||||
|
|
||||||
// Make sure the output is large enough
|
|
||||||
resize(output, outlen);
|
|
||||||
char *out = &output[0];
|
|
||||||
|
|
||||||
// Translate
|
|
||||||
while(*input)
|
|
||||||
copyFromArray(arr, *(input++), out);
|
|
||||||
|
|
||||||
// Make sure that we wrote the correct number of bytes
|
|
||||||
assert((out-&output[0]) == (int)outlen);
|
|
||||||
|
|
||||||
// And make extra sure the output is null terminated
|
|
||||||
assert(output.size() > outlen);
|
|
||||||
assert(output[outlen] == 0);
|
|
||||||
|
|
||||||
// Return a string
|
|
||||||
return std::string(&output[0], outlen);
|
|
||||||
}
|
|
||||||
|
|
||||||
static size_t getLength2(const char *arr, const char* input, bool &ascii)
|
|
||||||
{
|
{
|
||||||
ascii = true;
|
ascii = true;
|
||||||
size_t len = 0;
|
size_t len = 0;
|
||||||
|
@ -237,10 +266,7 @@ static size_t getLength2(const char *arr, const char* input, bool &ascii)
|
||||||
return len;
|
return len;
|
||||||
}
|
}
|
||||||
|
|
||||||
#include <iostream>
|
void Utf8Encoder::copyFromArray2(const char*& chp, char* &out)
|
||||||
#include <iomanip>
|
|
||||||
|
|
||||||
static void copyFromArray2(const char *arr, char*& chp, char* &out)
|
|
||||||
{
|
{
|
||||||
unsigned char ch = *(chp++);
|
unsigned char ch = *(chp++);
|
||||||
// Optimize for ASCII values
|
// Optimize for ASCII values
|
||||||
|
@ -278,7 +304,7 @@ static void copyFromArray2(const char *arr, char*& chp, char* &out)
|
||||||
|
|
||||||
for (int i = 128; i < 256; i++)
|
for (int i = 128; i < 256; i++)
|
||||||
{
|
{
|
||||||
unsigned char b1 = arr[i*6 + 1], b2 = arr[i*6 + 2], b3 = arr[i*6 + 3];
|
unsigned char b1 = translationArray[i*6 + 1], b2 = translationArray[i*6 + 2], b3 = translationArray[i*6 + 3];
|
||||||
if (b1 == ch && b2 == ch2 && (len != 3 || b3 == ch3))
|
if (b1 == ch && b2 == ch2 && (len != 3 || b3 == ch3))
|
||||||
{
|
{
|
||||||
*(out++) = (char)i;
|
*(out++) = (char)i;
|
||||||
|
@ -291,73 +317,6 @@ static void copyFromArray2(const char *arr, char*& chp, char* &out)
|
||||||
*(out++) = ch; // Could not find glyph, just put whatever
|
*(out++) = ch; // Could not find glyph, just put whatever
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string ToUTF8::getLegacyEnc(ToUTF8::FromType to)
|
|
||||||
{
|
|
||||||
// Pick translation array
|
|
||||||
const char *arr;
|
|
||||||
switch (to)
|
|
||||||
{
|
|
||||||
case ToUTF8::WINDOWS_1252:
|
|
||||||
{
|
|
||||||
arr = ToUTF8::windows_1252;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
case ToUTF8::WINDOWS_1250:
|
|
||||||
{
|
|
||||||
arr = ToUTF8::windows_1250;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
case ToUTF8::WINDOWS_1251:
|
|
||||||
{
|
|
||||||
arr = ToUTF8::windows_1251;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
default:
|
|
||||||
{
|
|
||||||
assert(0);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Double check that the input string stops at some point (it might
|
|
||||||
// contain zero terminators before this, inside its own data, which
|
|
||||||
// is also ok.)
|
|
||||||
char* input = &buf[0];
|
|
||||||
assert(input[size] == 0);
|
|
||||||
|
|
||||||
// TODO: The rest of this function is designed for single-character
|
|
||||||
// input encodings only. It also assumes that the input
|
|
||||||
// encoding shares its first 128 values (0-127) with ASCII. These
|
|
||||||
// conditions must be checked again if you add more input encodings
|
|
||||||
// later.
|
|
||||||
|
|
||||||
// Compute output length, and check for pure ascii input at the same
|
|
||||||
// time.
|
|
||||||
bool ascii;
|
|
||||||
size_t outlen = getLength2(arr, input, ascii);
|
|
||||||
|
|
||||||
// If we're pure ascii, then don't bother converting anything.
|
|
||||||
if(ascii)
|
|
||||||
return std::string(input, outlen);
|
|
||||||
|
|
||||||
// Make sure the output is large enough
|
|
||||||
resize(output, outlen);
|
|
||||||
char *out = &output[0];
|
|
||||||
|
|
||||||
// Translate
|
|
||||||
while(*input)
|
|
||||||
copyFromArray2(arr, input, out);
|
|
||||||
|
|
||||||
// Make sure that we wrote the correct number of bytes
|
|
||||||
assert((out-&output[0]) == (int)outlen);
|
|
||||||
|
|
||||||
// And make extra sure the output is null terminated
|
|
||||||
assert(output.size() > outlen);
|
|
||||||
assert(output[outlen] == 0);
|
|
||||||
|
|
||||||
// Return a string
|
|
||||||
return std::string(&output[0], outlen);
|
|
||||||
}
|
|
||||||
|
|
||||||
ToUTF8::FromType ToUTF8::calculateEncoding(const std::string& encodingName)
|
ToUTF8::FromType ToUTF8::calculateEncoding(const std::string& encodingName)
|
||||||
{
|
{
|
||||||
if (encodingName == "win1250")
|
if (encodingName == "win1250")
|
||||||
|
|
|
@ -2,6 +2,8 @@
|
||||||
#define COMPONENTS_TOUTF8_H
|
#define COMPONENTS_TOUTF8_H
|
||||||
|
|
||||||
#include <string>
|
#include <string>
|
||||||
|
#include <cstring>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
namespace ToUTF8
|
namespace ToUTF8
|
||||||
{
|
{
|
||||||
|
@ -14,17 +16,39 @@ namespace ToUTF8
|
||||||
// probably others)
|
// probably others)
|
||||||
};
|
};
|
||||||
|
|
||||||
// Return a writable buffer of at least 'size' bytes. The buffer
|
|
||||||
// does not have to be freed.
|
|
||||||
char* getBuffer(int size);
|
|
||||||
|
|
||||||
// Convert the previously written buffer to UTF8 from the given code
|
|
||||||
// page.
|
|
||||||
std::string getUtf8(FromType from);
|
|
||||||
std::string getLegacyEnc(FromType to);
|
|
||||||
|
|
||||||
FromType calculateEncoding(const std::string& encodingName);
|
FromType calculateEncoding(const std::string& encodingName);
|
||||||
std::string encodingUsingMessage(const std::string& encodingName);
|
std::string encodingUsingMessage(const std::string& encodingName);
|
||||||
|
|
||||||
|
// class
|
||||||
|
|
||||||
|
class Utf8Encoder
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
Utf8Encoder(FromType sourceEncoding);
|
||||||
|
|
||||||
|
// Convert to UTF8 from the previously given code page.
|
||||||
|
std::string getUtf8(const char *input, int size);
|
||||||
|
inline std::string getUtf8(const std::string &str)
|
||||||
|
{
|
||||||
|
return getUtf8(str.c_str(), str.size());
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string getLegacyEnc(const char *input, int size);
|
||||||
|
inline std::string getLegacyEnc(const std::string &str)
|
||||||
|
{
|
||||||
|
return getLegacyEnc(str.c_str(), str.size());
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
void resize(size_t size);
|
||||||
|
size_t getLength(const char* input, bool &ascii);
|
||||||
|
void copyFromArray(unsigned char chp, char* &out);
|
||||||
|
size_t getLength2(const char* input, bool &ascii);
|
||||||
|
void copyFromArray2(const char*& chp, char* &out);
|
||||||
|
|
||||||
|
std::vector<char> mOutput;
|
||||||
|
char* translationArray;
|
||||||
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -50,10 +50,7 @@ namespace Translation
|
||||||
|
|
||||||
if (!line.empty())
|
if (!line.empty())
|
||||||
{
|
{
|
||||||
char* buffer = ToUTF8::getBuffer(line.size() + 1);
|
line = mEncoder->getUtf8(line);
|
||||||
//buffer has at least line.size() + 1 bytes, so it must be safe
|
|
||||||
strcpy(buffer, line.c_str());
|
|
||||||
line = ToUTF8::getUtf8(mEncoding);
|
|
||||||
|
|
||||||
size_t tab_pos = line.find('\t');
|
size_t tab_pos = line.find('\t');
|
||||||
if (tab_pos != std::string::npos && tab_pos > 0 && tab_pos < line.size() - 1)
|
if (tab_pos != std::string::npos && tab_pos > 0 && tab_pos < line.size() - 1)
|
||||||
|
@ -104,9 +101,9 @@ namespace Translation
|
||||||
return phrase;
|
return phrase;
|
||||||
}
|
}
|
||||||
|
|
||||||
void Storage::setEncoding (const ToUTF8::FromType& encoding)
|
void Storage::setEncoder(ToUTF8::Utf8Encoder* encoder)
|
||||||
{
|
{
|
||||||
mEncoding = encoding;
|
mEncoder = encoder;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool Storage::hasTranslation() const
|
bool Storage::hasTranslation() const
|
||||||
|
|
|
@ -19,7 +19,7 @@ namespace Translation
|
||||||
// Standard form usually means nominative case
|
// Standard form usually means nominative case
|
||||||
std::string topicStandardForm(const std::string& phrase) const;
|
std::string topicStandardForm(const std::string& phrase) const;
|
||||||
|
|
||||||
void setEncoding (const ToUTF8::FromType& encoding);
|
void setEncoder(ToUTF8::Utf8Encoder* encoder);
|
||||||
|
|
||||||
bool hasTranslation() const;
|
bool hasTranslation() const;
|
||||||
|
|
||||||
|
@ -34,7 +34,7 @@ namespace Translation
|
||||||
void loadDataFromStream(ContainerType& container, std::istream& stream);
|
void loadDataFromStream(ContainerType& container, std::istream& stream);
|
||||||
|
|
||||||
|
|
||||||
ToUTF8::FromType mEncoding;
|
ToUTF8::Utf8Encoder* mEncoder;
|
||||||
ContainerType mCellNamesTranslations, mTopicIDs, mPhraseForms;
|
ContainerType mCellNamesTranslations, mTopicIDs, mPhraseForms;
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue