Added custom UTF8 converter. Removed iconv dependency.

actorid
Nicolay Korslund 15 years ago
parent 9a5e7816eb
commit 358e1ca5a5

@ -44,6 +44,12 @@ set(NIFOGRE_HEADER
${COMP_DIR}/nifogre/ogre_nif_loader.hpp)
source_group(components\\nifogre FILES ${NIFOGRE} ${NIFOGRE_HEADER})
set(TO_UTF8
${COMP_DIR}/to_utf8/to_utf8.cpp)
set(TO_UTF8_HEADER
${COMP_DIR}/to_utf8/to_utf8.hpp)
source_group(components\\to_utf8 FILES ${TO_UTF8} ${TO_UTF8_HEADER})
set(ESM_STORE
${COMP_DIR}/esm_store/store.cpp)
set(ESM_STORE_HEADER
@ -75,10 +81,10 @@ file(GLOB INTERPRETER ${COMP_DIR}/interpreter/*.cpp)
file(GLOB INTERPRETER_HEADER ${COMP_DIR}/interpreter/*.hpp)
source_group(components\\interpreter FILES ${INTERPRETER} ${INTERPRETER_HEADER})
set(COMPONENTS ${BSA} ${NIF} ${NIFOGRE} ${ESM_STORE} ${MISC}
set(COMPONENTS ${BSA} ${NIF} ${NIFOGRE} ${ESM_STORE} ${MISC} ${TO_UTF8}
${COMPILER} ${INTERPRETER} ${ESM})
set(COMPONENTS_HEADER ${BSA_HEADER} ${NIF_HEADER} ${NIFOGRE_HEADER} ${ESM_STORE_HEADER}
${ESM_HEADER} ${MISC_HEADER} ${COMPILER_HEADER}
${ESM_HEADER} ${MISC_HEADER} ${COMPILER_HEADER} ${TO_UTF8_HEADER}
${INTERPRETER_HEADER})
# source directory: libs
@ -158,12 +164,10 @@ endif (WIN32)
find_package(OGRE REQUIRED)
find_package(Boost REQUIRED COMPONENTS system filesystem program_options thread)
find_package(OIS REQUIRED)
find_package(Iconv REQUIRED)
find_package(OpenAL REQUIRED)
include_directories("."
${OGRE_INCLUDE_DIR} ${OGRE_INCLUDE_DIR}/Ogre
${OIS_INCLUDE_DIR} ${Boost_INCLUDE_DIR}
${ICONV_INCLUDE_DIR}
${PLATFORM_INCLUDE_DIR}
${CMAKE_HOME_DIRECTORY}/extern/caelum/include
${CMAKE_HOME_DIRECTORY}/extern/mygui_3.0.1/MyGUIEngine/include

@ -9,6 +9,7 @@ source_group(apps\\esmtool FILES ${ESMTOOL})
add_executable(esmtool
${ESMTOOL}
${MISC} ${MISC_HEADER}
${TO_UTF8}
)
target_link_libraries(esmtool

@ -199,7 +199,6 @@ target_link_libraries(openmw
${Boost_LIBRARIES}
${OPENAL_LIBRARY}
${SOUND_INPUT_LIBRARY}
${ICONV_LIBRARIES}
caelum
MyGUIEngine
MyGUI.OgrePlatform

@ -1,69 +0,0 @@
# - Try to find Iconv
# Once done this will define
#
# ICONV_FOUND - system has Iconv
# ICONV_INCLUDE_DIR - the Iconv include directory
# ICONV_LIBRARIES - Link these to use Iconv
# ICONV_SECOND_ARGUMENT_IS_CONST - the second argument for iconv() is const
#
include(CheckCCompilerFlag)
# check_c_source_compiles() is defined by CheckCSourceCompiles. Include it
# explicitly instead of relying on another module pulling it in.
include(CheckCSourceCompiles)
include(CheckCXXSourceCompiles)

if(ICONV_INCLUDE_DIR AND ICONV_LIBRARIES)
  # Already in cache, be silent
  set(ICONV_FIND_QUIETLY TRUE)
endif()

if(WIN32)
  set(ICONV_INCLUDE_DIR $ENV{ICONV_INCLUDE_DIR})
  set(ICONV_LIBRARIES $ENV{ICONV_LIBRARIES})
endif()

find_path(ICONV_INCLUDE_DIR iconv.h)
find_library(ICONV_LIBRARIES NAMES iconv libiconv c)

if(ICONV_INCLUDE_DIR AND ICONV_LIBRARIES)
  set(ICONV_FOUND TRUE)
endif()

if(ICONV_FOUND)
  # A find module runs in the caller's variable scope, so remember the
  # caller's check settings and put them back afterwards instead of
  # wiping them.
  set(ICONV_SAVED_REQUIRED_INCLUDES ${CMAKE_REQUIRED_INCLUDES})
  set(ICONV_SAVED_REQUIRED_LIBRARIES ${CMAKE_REQUIRED_LIBRARIES})
  set(CMAKE_REQUIRED_INCLUDES ${ICONV_INCLUDE_DIR})
  set(CMAKE_REQUIRED_LIBRARIES ${ICONV_LIBRARIES})

  # With -Werror the probe below fails when iconv() takes a non-const
  # second argument, instead of merely warning about it.
  check_c_compiler_flag("-Werror" ICONV_HAVE_WERROR)
  set(CMAKE_C_FLAGS_BACKUP "${CMAKE_C_FLAGS}")
  if(ICONV_HAVE_WERROR)
    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Werror")
  endif()
  check_c_source_compiles("
#include <iconv.h>
int main(){
iconv_t conv = 0;
const char* in = 0;
size_t ilen = 0;
char* out = 0;
size_t olen = 0;
iconv(conv, &in, &ilen, &out, &olen);
return 0;
}
" ICONV_SECOND_ARGUMENT_IS_CONST)
  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS_BACKUP}")

  # Unquoted on purpose: an empty saved value unsets the variable again.
  set(CMAKE_REQUIRED_INCLUDES ${ICONV_SAVED_REQUIRED_INCLUDES})
  set(CMAKE_REQUIRED_LIBRARIES ${ICONV_SAVED_REQUIRED_LIBRARIES})
  unset(ICONV_SAVED_REQUIRED_INCLUDES)
  unset(ICONV_SAVED_REQUIRED_LIBRARIES)
endif()

if(ICONV_FOUND)
  # Iconv_FIND_QUIETLY is what find_package(Iconv QUIET) sets;
  # ICONV_FIND_QUIETLY is the "already cached" flag set above.
  if(NOT ICONV_FIND_QUIETLY AND NOT Iconv_FIND_QUIETLY)
    message(STATUS "Found Iconv: ${ICONV_LIBRARIES}")
  endif()
elseif(Iconv_FIND_REQUIRED)
  message(FATAL_ERROR "Could not find Iconv")
endif()

mark_as_advanced(
  ICONV_INCLUDE_DIR
  ICONV_LIBRARIES
  ICONV_SECOND_ARGUMENT_IS_CONST
)

@ -3,22 +3,18 @@
#include <string>
#include <libs/platform/stdint.h>
#include <string.h>
#include <assert.h>
#include <vector>
#include <sstream>
#include <iomanip>
#include <errno.h>
#ifndef __WIN32__
#include <iconv.h>
#endif
#include <string.h>
#include <libs/mangle/stream/stream.hpp>
#include <libs/mangle/stream/servers/file_stream.hpp>
#include <libs/mangle/tools/str_exception.hpp>
#include <components/misc/stringops.hpp>
#include <components/to_utf8/to_utf8.hpp>
#ifdef __APPLE__
// need our own implementation of strnlen
static size_t strnlen(const char *s, size_t n)
@ -603,112 +599,17 @@ public:
void getName(NAME &name) { getT(name); }
void getUint(uint32_t &u) { getT(u); }
// Read the next size bytes and return them as a string
// Read the next 'size' bytes and return them as a string. Converts
// them from native encoding to UTF8 in the process.
std::string getString(int size)
{
// Not very optimized, but we can fix that later
char *ptr = new char[size];
char *ptr = ToUTF8::getBuffer(size);
esm->read(ptr,size);
// Remove any zero terminators
for(int i=0; i<size; i++)
if(ptr[i] == 0)
size = i;
// Convert to std::string and return
std::string res(ptr,size);
delete[] ptr;
return convertToUTF8(res);
// Convert to UTF8 and return
return ToUTF8::getUtf8(ToUTF8::WINDOWS_1252);
}
// Convert a string from the encoding used by Morrowind to UTF-8
std::string convertToUTF8 (std::string input)
{
#ifdef __WIN32__
return input;
#else
std::string output = "";
//create convert description
iconv_t cd = iconv_open ("UTF-8", "WINDOWS-1252");
if (cd == (iconv_t)-1) //error handling
{
std::string errMsg = "Creating description for UTF-8 converting failed: ";
switch (errno) //detailed error messages (maybe it contains too much detail :)
{
case EMFILE:
errMsg += "{OPEN_MAX} files descriptors are currently open in the calling process.";
case ENFILE:
errMsg += "Too many files are currently open in the system.";
case ENOMEM:
errMsg +="Insufficient storage space is available.";
case EINVAL:
errMsg += "The conversion specified by fromcode and tocode is not supported by the implementation.";
default:
errMsg += "Unknown Error\n";
}
fail (errMsg);
}
else
{
const size_t inputSize = input.size();
if (inputSize) //input is not empty
{
//convert function doesn't accept const char *, therefore copy content into an char *
std::vector<char> inputBuffer (input.begin(), input.end());
char *inputBufferBegin = &inputBuffer[0];
size_t inputBytesLeft = inputSize; //bytes to convert
static const size_t outputSize = 1000;
size_t outputBytesLeft;
char outputBuffer[outputSize];
char *outputBufferBegin;
while (inputBytesLeft > 0)
{
outputBytesLeft = outputSize;
outputBufferBegin = outputBuffer;
if (iconv (cd, &inputBufferBegin, &inputBytesLeft, &outputBufferBegin, &outputBytesLeft) == (size_t)-1)
{
switch (errno)
{
case E2BIG: //outputBuffer is full
output += std::string (outputBuffer, outputSize);
break;
case EILSEQ:
fail ("Iconv: Invalid multibyte sequence.\n");
break;
case EINVAL:
fail ("Iconv: Incomplete multibyte sequence.\n");
break;
default:
fail ("Iconv: Unknown Error\n");
}
}
}
//read only relevant bytes from outputBuffer
output += std::string (outputBuffer, outputSize - outputBytesLeft);
}
}
iconv_close (cd);
return output;
}
#endif
void skip(int bytes) { esm->seek(esm->tell()+bytes); }
uint64_t getOffset() { return esm->tell(); }

@ -0,0 +1 @@
gen_iconv

@ -0,0 +1,5 @@
# Regenerate the translation tables by running the generator. The
# explicit ./ is needed because the current directory is normally not
# on PATH.
tables_gen.hpp: gen_iconv
	./gen_iconv > tables_gen.hpp

# Build the table generator itself
gen_iconv: gen_iconv.cpp
	g++ -Wall $^ -o $@

@ -0,0 +1,86 @@
// This program generates the file tables_gen.hpp
#include <iostream>
#include <iomanip>
using namespace std;
#include <iconv.h>
#include <assert.h>
// Emit the indentation that starts every row of the generated table
void tab()
{
  static const char indent[] = " ";
  cout << indent;
}
// Print one byte of a table row: a space, the value prefixed with
// "0x", and a trailing comma unless 'last' is set. The digits are
// written in whatever base the stream is currently using (main()
// switches cout to hexadecimal).
void num(unsigned char i, bool last)
{
  const unsigned value = i;
  cout << " 0x" << value << (last ? "" : ",");
}
// Write one table row: the length of the UTF8 sequence (1-5),
// followed by the five value bytes, an optional trailing comment and
// a newline. When 'last' is set the final byte gets no comma.
void writeChar(char *value, int length, bool last, const std::string &comment="")
{
  assert(length >= 1 && length <= 5);

  tab();
  num(length, false);

  // All five slots are always written, the unused ones included. Only
  // the very last slot can end the table.
  int idx = 0;
  while(idx < 4)
    num(value[idx++], false);
  num(value[4], last);

  if(!comment.empty())
    cout << " // " << comment;
  cout << endl;
}
// Row written for bytes that have no mapping in the source charset
void writeMissing(bool last)
{
  // Substitute a single space character, padded with zeros
  char value[5] = { ' ', 0, 0, 0, 0 };
  writeChar(value, 1, last, "not part of this charset");
}
// Write the translation table for one source charset to stdout as a
// C array called 'tableName'. The table has one row per input byte
// (0-255); each row is the length of the UTF8 sequence followed by
// five data bytes.
//
// Returns 0 on success, or 1 (with nothing written to stdout) if
// iconv cannot convert from 'charset' to UTF-8.
int write_table(const std::string &charset, const std::string &tableName)
{
  // Open conversion system. Do this before writing anything so that
  // a failure does not leave half a table in the output.
  iconv_t cd = iconv_open ("UTF-8", charset.c_str());
  if(cd == (iconv_t)-1)
    {
      std::cerr << "iconv_open failed for charset " << charset << std::endl;
      return 1;
    }

  // Write table header
  cout << "static char " << tableName << "[] =\n{\n";

  // Convert each character from 0 to 255
  for(int i=0; i<256; i++)
    {
      bool last = (i==255);

      char input = i;
      char *iptr = &input;
      size_t ileft = 1;

      char output[5];
      for(int k=0; k<5; k++) output[k] = 0;
      char *optr = output;
      size_t oleft = 5;

      size_t res = iconv(cd, &iptr, &ileft, &optr, &oleft);

      // Anything but a clean conversion is written as a missing
      // character
      if(res) writeMissing(last);
      else writeChar(output, 5-oleft, last);
    }

  iconv_close (cd);

  // Finish table
  cout << "};\n";

  // The function is declared to return int, so it must return a
  // value; falling off the end is undefined behaviour.
  return 0;
}
// Entry point: print every supported translation table to stdout
int main()
{
  // All numbers in the tables are written in hexadecimal
  cout.setf(ios::hex, ios::basefield);

  // English
  write_table("WINDOWS-1252", "windows_1252");

  return 0;
}

@ -0,0 +1,259 @@
static char windows_1252[] =
{
0x1, 0x0, 0x0, 0x0, 0x0, 0x0,
0x1, 0x1, 0x0, 0x0, 0x0, 0x0,
0x1, 0x2, 0x0, 0x0, 0x0, 0x0,
0x1, 0x3, 0x0, 0x0, 0x0, 0x0,
0x1, 0x4, 0x0, 0x0, 0x0, 0x0,
0x1, 0x5, 0x0, 0x0, 0x0, 0x0,
0x1, 0x6, 0x0, 0x0, 0x0, 0x0,
0x1, 0x7, 0x0, 0x0, 0x0, 0x0,
0x1, 0x8, 0x0, 0x0, 0x0, 0x0,
0x1, 0x9, 0x0, 0x0, 0x0, 0x0,
0x1, 0xa, 0x0, 0x0, 0x0, 0x0,
0x1, 0xb, 0x0, 0x0, 0x0, 0x0,
0x1, 0xc, 0x0, 0x0, 0x0, 0x0,
0x1, 0xd, 0x0, 0x0, 0x0, 0x0,
0x1, 0xe, 0x0, 0x0, 0x0, 0x0,
0x1, 0xf, 0x0, 0x0, 0x0, 0x0,
0x1, 0x10, 0x0, 0x0, 0x0, 0x0,
0x1, 0x11, 0x0, 0x0, 0x0, 0x0,
0x1, 0x12, 0x0, 0x0, 0x0, 0x0,
0x1, 0x13, 0x0, 0x0, 0x0, 0x0,
0x1, 0x14, 0x0, 0x0, 0x0, 0x0,
0x1, 0x15, 0x0, 0x0, 0x0, 0x0,
0x1, 0x16, 0x0, 0x0, 0x0, 0x0,
0x1, 0x17, 0x0, 0x0, 0x0, 0x0,
0x1, 0x18, 0x0, 0x0, 0x0, 0x0,
0x1, 0x19, 0x0, 0x0, 0x0, 0x0,
0x1, 0x1a, 0x0, 0x0, 0x0, 0x0,
0x1, 0x1b, 0x0, 0x0, 0x0, 0x0,
0x1, 0x1c, 0x0, 0x0, 0x0, 0x0,
0x1, 0x1d, 0x0, 0x0, 0x0, 0x0,
0x1, 0x1e, 0x0, 0x0, 0x0, 0x0,
0x1, 0x1f, 0x0, 0x0, 0x0, 0x0,
0x1, 0x20, 0x0, 0x0, 0x0, 0x0,
0x1, 0x21, 0x0, 0x0, 0x0, 0x0,
0x1, 0x22, 0x0, 0x0, 0x0, 0x0,
0x1, 0x23, 0x0, 0x0, 0x0, 0x0,
0x1, 0x24, 0x0, 0x0, 0x0, 0x0,
0x1, 0x25, 0x0, 0x0, 0x0, 0x0,
0x1, 0x26, 0x0, 0x0, 0x0, 0x0,
0x1, 0x27, 0x0, 0x0, 0x0, 0x0,
0x1, 0x28, 0x0, 0x0, 0x0, 0x0,
0x1, 0x29, 0x0, 0x0, 0x0, 0x0,
0x1, 0x2a, 0x0, 0x0, 0x0, 0x0,
0x1, 0x2b, 0x0, 0x0, 0x0, 0x0,
0x1, 0x2c, 0x0, 0x0, 0x0, 0x0,
0x1, 0x2d, 0x0, 0x0, 0x0, 0x0,
0x1, 0x2e, 0x0, 0x0, 0x0, 0x0,
0x1, 0x2f, 0x0, 0x0, 0x0, 0x0,
0x1, 0x30, 0x0, 0x0, 0x0, 0x0,
0x1, 0x31, 0x0, 0x0, 0x0, 0x0,
0x1, 0x32, 0x0, 0x0, 0x0, 0x0,
0x1, 0x33, 0x0, 0x0, 0x0, 0x0,
0x1, 0x34, 0x0, 0x0, 0x0, 0x0,
0x1, 0x35, 0x0, 0x0, 0x0, 0x0,
0x1, 0x36, 0x0, 0x0, 0x0, 0x0,
0x1, 0x37, 0x0, 0x0, 0x0, 0x0,
0x1, 0x38, 0x0, 0x0, 0x0, 0x0,
0x1, 0x39, 0x0, 0x0, 0x0, 0x0,
0x1, 0x3a, 0x0, 0x0, 0x0, 0x0,
0x1, 0x3b, 0x0, 0x0, 0x0, 0x0,
0x1, 0x3c, 0x0, 0x0, 0x0, 0x0,
0x1, 0x3d, 0x0, 0x0, 0x0, 0x0,
0x1, 0x3e, 0x0, 0x0, 0x0, 0x0,
0x1, 0x3f, 0x0, 0x0, 0x0, 0x0,
0x1, 0x40, 0x0, 0x0, 0x0, 0x0,
0x1, 0x41, 0x0, 0x0, 0x0, 0x0,
0x1, 0x42, 0x0, 0x0, 0x0, 0x0,
0x1, 0x43, 0x0, 0x0, 0x0, 0x0,
0x1, 0x44, 0x0, 0x0, 0x0, 0x0,
0x1, 0x45, 0x0, 0x0, 0x0, 0x0,
0x1, 0x46, 0x0, 0x0, 0x0, 0x0,
0x1, 0x47, 0x0, 0x0, 0x0, 0x0,
0x1, 0x48, 0x0, 0x0, 0x0, 0x0,
0x1, 0x49, 0x0, 0x0, 0x0, 0x0,
0x1, 0x4a, 0x0, 0x0, 0x0, 0x0,
0x1, 0x4b, 0x0, 0x0, 0x0, 0x0,
0x1, 0x4c, 0x0, 0x0, 0x0, 0x0,
0x1, 0x4d, 0x0, 0x0, 0x0, 0x0,
0x1, 0x4e, 0x0, 0x0, 0x0, 0x0,
0x1, 0x4f, 0x0, 0x0, 0x0, 0x0,
0x1, 0x50, 0x0, 0x0, 0x0, 0x0,
0x1, 0x51, 0x0, 0x0, 0x0, 0x0,
0x1, 0x52, 0x0, 0x0, 0x0, 0x0,
0x1, 0x53, 0x0, 0x0, 0x0, 0x0,
0x1, 0x54, 0x0, 0x0, 0x0, 0x0,
0x1, 0x55, 0x0, 0x0, 0x0, 0x0,
0x1, 0x56, 0x0, 0x0, 0x0, 0x0,
0x1, 0x57, 0x0, 0x0, 0x0, 0x0,
0x1, 0x58, 0x0, 0x0, 0x0, 0x0,
0x1, 0x59, 0x0, 0x0, 0x0, 0x0,
0x1, 0x5a, 0x0, 0x0, 0x0, 0x0,
0x1, 0x5b, 0x0, 0x0, 0x0, 0x0,
0x1, 0x5c, 0x0, 0x0, 0x0, 0x0,
0x1, 0x5d, 0x0, 0x0, 0x0, 0x0,
0x1, 0x5e, 0x0, 0x0, 0x0, 0x0,
0x1, 0x5f, 0x0, 0x0, 0x0, 0x0,
0x1, 0x60, 0x0, 0x0, 0x0, 0x0,
0x1, 0x61, 0x0, 0x0, 0x0, 0x0,
0x1, 0x62, 0x0, 0x0, 0x0, 0x0,
0x1, 0x63, 0x0, 0x0, 0x0, 0x0,
0x1, 0x64, 0x0, 0x0, 0x0, 0x0,
0x1, 0x65, 0x0, 0x0, 0x0, 0x0,
0x1, 0x66, 0x0, 0x0, 0x0, 0x0,
0x1, 0x67, 0x0, 0x0, 0x0, 0x0,
0x1, 0x68, 0x0, 0x0, 0x0, 0x0,
0x1, 0x69, 0x0, 0x0, 0x0, 0x0,
0x1, 0x6a, 0x0, 0x0, 0x0, 0x0,
0x1, 0x6b, 0x0, 0x0, 0x0, 0x0,
0x1, 0x6c, 0x0, 0x0, 0x0, 0x0,
0x1, 0x6d, 0x0, 0x0, 0x0, 0x0,
0x1, 0x6e, 0x0, 0x0, 0x0, 0x0,
0x1, 0x6f, 0x0, 0x0, 0x0, 0x0,
0x1, 0x70, 0x0, 0x0, 0x0, 0x0,
0x1, 0x71, 0x0, 0x0, 0x0, 0x0,
0x1, 0x72, 0x0, 0x0, 0x0, 0x0,
0x1, 0x73, 0x0, 0x0, 0x0, 0x0,
0x1, 0x74, 0x0, 0x0, 0x0, 0x0,
0x1, 0x75, 0x0, 0x0, 0x0, 0x0,
0x1, 0x76, 0x0, 0x0, 0x0, 0x0,
0x1, 0x77, 0x0, 0x0, 0x0, 0x0,
0x1, 0x78, 0x0, 0x0, 0x0, 0x0,
0x1, 0x79, 0x0, 0x0, 0x0, 0x0,
0x1, 0x7a, 0x0, 0x0, 0x0, 0x0,
0x1, 0x7b, 0x0, 0x0, 0x0, 0x0,
0x1, 0x7c, 0x0, 0x0, 0x0, 0x0,
0x1, 0x7d, 0x0, 0x0, 0x0, 0x0,
0x1, 0x7e, 0x0, 0x0, 0x0, 0x0,
0x1, 0x7f, 0x0, 0x0, 0x0, 0x0,
0x3, 0xe2, 0x82, 0xac, 0x0, 0x0,
0x1, 0x20, 0x0, 0x0, 0x0, 0x0, // not part of this charset
0x3, 0xe2, 0x80, 0x9a, 0x0, 0x0,
0x2, 0xc6, 0x92, 0x0, 0x0, 0x0,
0x3, 0xe2, 0x80, 0x9e, 0x0, 0x0,
0x3, 0xe2, 0x80, 0xa6, 0x0, 0x0,
0x3, 0xe2, 0x80, 0xa0, 0x0, 0x0,
0x3, 0xe2, 0x80, 0xa1, 0x0, 0x0,
0x2, 0xcb, 0x86, 0x0, 0x0, 0x0,
0x3, 0xe2, 0x80, 0xb0, 0x0, 0x0,
0x2, 0xc5, 0xa0, 0x0, 0x0, 0x0,
0x3, 0xe2, 0x80, 0xb9, 0x0, 0x0,
0x2, 0xc5, 0x92, 0x0, 0x0, 0x0,
0x1, 0x20, 0x0, 0x0, 0x0, 0x0, // not part of this charset
0x2, 0xc5, 0xbd, 0x0, 0x0, 0x0,
0x1, 0x20, 0x0, 0x0, 0x0, 0x0, // not part of this charset
0x1, 0x20, 0x0, 0x0, 0x0, 0x0, // not part of this charset
0x3, 0xe2, 0x80, 0x98, 0x0, 0x0,
0x3, 0xe2, 0x80, 0x99, 0x0, 0x0,
0x3, 0xe2, 0x80, 0x9c, 0x0, 0x0,
0x3, 0xe2, 0x80, 0x9d, 0x0, 0x0,
0x3, 0xe2, 0x80, 0xa2, 0x0, 0x0,
0x3, 0xe2, 0x80, 0x93, 0x0, 0x0,
0x3, 0xe2, 0x80, 0x94, 0x0, 0x0,
0x2, 0xcb, 0x9c, 0x0, 0x0, 0x0,
0x3, 0xe2, 0x84, 0xa2, 0x0, 0x0,
0x2, 0xc5, 0xa1, 0x0, 0x0, 0x0,
0x3, 0xe2, 0x80, 0xba, 0x0, 0x0,
0x2, 0xc5, 0x93, 0x0, 0x0, 0x0,
0x1, 0x20, 0x0, 0x0, 0x0, 0x0, // not part of this charset
0x2, 0xc5, 0xbe, 0x0, 0x0, 0x0,
0x2, 0xc5, 0xb8, 0x0, 0x0, 0x0,
0x2, 0xc2, 0xa0, 0x0, 0x0, 0x0,
0x2, 0xc2, 0xa1, 0x0, 0x0, 0x0,
0x2, 0xc2, 0xa2, 0x0, 0x0, 0x0,
0x2, 0xc2, 0xa3, 0x0, 0x0, 0x0,
0x2, 0xc2, 0xa4, 0x0, 0x0, 0x0,
0x2, 0xc2, 0xa5, 0x0, 0x0, 0x0,
0x2, 0xc2, 0xa6, 0x0, 0x0, 0x0,
0x2, 0xc2, 0xa7, 0x0, 0x0, 0x0,
0x2, 0xc2, 0xa8, 0x0, 0x0, 0x0,
0x2, 0xc2, 0xa9, 0x0, 0x0, 0x0,
0x2, 0xc2, 0xaa, 0x0, 0x0, 0x0,
0x2, 0xc2, 0xab, 0x0, 0x0, 0x0,
0x2, 0xc2, 0xac, 0x0, 0x0, 0x0,
0x2, 0xc2, 0xad, 0x0, 0x0, 0x0,
0x2, 0xc2, 0xae, 0x0, 0x0, 0x0,
0x2, 0xc2, 0xaf, 0x0, 0x0, 0x0,
0x2, 0xc2, 0xb0, 0x0, 0x0, 0x0,
0x2, 0xc2, 0xb1, 0x0, 0x0, 0x0,
0x2, 0xc2, 0xb2, 0x0, 0x0, 0x0,
0x2, 0xc2, 0xb3, 0x0, 0x0, 0x0,
0x2, 0xc2, 0xb4, 0x0, 0x0, 0x0,
0x2, 0xc2, 0xb5, 0x0, 0x0, 0x0,
0x2, 0xc2, 0xb6, 0x0, 0x0, 0x0,
0x2, 0xc2, 0xb7, 0x0, 0x0, 0x0,
0x2, 0xc2, 0xb8, 0x0, 0x0, 0x0,
0x2, 0xc2, 0xb9, 0x0, 0x0, 0x0,
0x2, 0xc2, 0xba, 0x0, 0x0, 0x0,
0x2, 0xc2, 0xbb, 0x0, 0x0, 0x0,
0x2, 0xc2, 0xbc, 0x0, 0x0, 0x0,
0x2, 0xc2, 0xbd, 0x0, 0x0, 0x0,
0x2, 0xc2, 0xbe, 0x0, 0x0, 0x0,
0x2, 0xc2, 0xbf, 0x0, 0x0, 0x0,
0x2, 0xc3, 0x80, 0x0, 0x0, 0x0,
0x2, 0xc3, 0x81, 0x0, 0x0, 0x0,
0x2, 0xc3, 0x82, 0x0, 0x0, 0x0,
0x2, 0xc3, 0x83, 0x0, 0x0, 0x0,
0x2, 0xc3, 0x84, 0x0, 0x0, 0x0,
0x2, 0xc3, 0x85, 0x0, 0x0, 0x0,
0x2, 0xc3, 0x86, 0x0, 0x0, 0x0,
0x2, 0xc3, 0x87, 0x0, 0x0, 0x0,
0x2, 0xc3, 0x88, 0x0, 0x0, 0x0,
0x2, 0xc3, 0x89, 0x0, 0x0, 0x0,
0x2, 0xc3, 0x8a, 0x0, 0x0, 0x0,
0x2, 0xc3, 0x8b, 0x0, 0x0, 0x0,
0x2, 0xc3, 0x8c, 0x0, 0x0, 0x0,
0x2, 0xc3, 0x8d, 0x0, 0x0, 0x0,
0x2, 0xc3, 0x8e, 0x0, 0x0, 0x0,
0x2, 0xc3, 0x8f, 0x0, 0x0, 0x0,
0x2, 0xc3, 0x90, 0x0, 0x0, 0x0,
0x2, 0xc3, 0x91, 0x0, 0x0, 0x0,
0x2, 0xc3, 0x92, 0x0, 0x0, 0x0,
0x2, 0xc3, 0x93, 0x0, 0x0, 0x0,
0x2, 0xc3, 0x94, 0x0, 0x0, 0x0,
0x2, 0xc3, 0x95, 0x0, 0x0, 0x0,
0x2, 0xc3, 0x96, 0x0, 0x0, 0x0,
0x2, 0xc3, 0x97, 0x0, 0x0, 0x0,
0x2, 0xc3, 0x98, 0x0, 0x0, 0x0,
0x2, 0xc3, 0x99, 0x0, 0x0, 0x0,
0x2, 0xc3, 0x9a, 0x0, 0x0, 0x0,
0x2, 0xc3, 0x9b, 0x0, 0x0, 0x0,
0x2, 0xc3, 0x9c, 0x0, 0x0, 0x0,
0x2, 0xc3, 0x9d, 0x0, 0x0, 0x0,
0x2, 0xc3, 0x9e, 0x0, 0x0, 0x0,
0x2, 0xc3, 0x9f, 0x0, 0x0, 0x0,
0x2, 0xc3, 0xa0, 0x0, 0x0, 0x0,
0x2, 0xc3, 0xa1, 0x0, 0x0, 0x0,
0x2, 0xc3, 0xa2, 0x0, 0x0, 0x0,
0x2, 0xc3, 0xa3, 0x0, 0x0, 0x0,
0x2, 0xc3, 0xa4, 0x0, 0x0, 0x0,
0x2, 0xc3, 0xa5, 0x0, 0x0, 0x0,
0x2, 0xc3, 0xa6, 0x0, 0x0, 0x0,
0x2, 0xc3, 0xa7, 0x0, 0x0, 0x0,
0x2, 0xc3, 0xa8, 0x0, 0x0, 0x0,
0x2, 0xc3, 0xa9, 0x0, 0x0, 0x0,
0x2, 0xc3, 0xaa, 0x0, 0x0, 0x0,
0x2, 0xc3, 0xab, 0x0, 0x0, 0x0,
0x2, 0xc3, 0xac, 0x0, 0x0, 0x0,
0x2, 0xc3, 0xad, 0x0, 0x0, 0x0,
0x2, 0xc3, 0xae, 0x0, 0x0, 0x0,
0x2, 0xc3, 0xaf, 0x0, 0x0, 0x0,
0x2, 0xc3, 0xb0, 0x0, 0x0, 0x0,
0x2, 0xc3, 0xb1, 0x0, 0x0, 0x0,
0x2, 0xc3, 0xb2, 0x0, 0x0, 0x0,
0x2, 0xc3, 0xb3, 0x0, 0x0, 0x0,
0x2, 0xc3, 0xb4, 0x0, 0x0, 0x0,
0x2, 0xc3, 0xb5, 0x0, 0x0, 0x0,
0x2, 0xc3, 0xb6, 0x0, 0x0, 0x0,
0x2, 0xc3, 0xb7, 0x0, 0x0, 0x0,
0x2, 0xc3, 0xb8, 0x0, 0x0, 0x0,
0x2, 0xc3, 0xb9, 0x0, 0x0, 0x0,
0x2, 0xc3, 0xba, 0x0, 0x0, 0x0,
0x2, 0xc3, 0xbb, 0x0, 0x0, 0x0,
0x2, 0xc3, 0xbc, 0x0, 0x0, 0x0,
0x2, 0xc3, 0xbd, 0x0, 0x0, 0x0,
0x2, 0xc3, 0xbe, 0x0, 0x0, 0x0,
0x2, 0xc3, 0xbf, 0x0, 0x0, 0x0
};

@ -0,0 +1,159 @@
#include "to_utf8.hpp"
#include <vector>
#include <assert.h>
/* This file contains the code to translate from WINDOWS-1252 (native
charset used in English version of Morrowind) to UTF-8. The library
is designed to be extended to support more source encodings later,
which means that we may add support for Russian, Polish and Chinese
files and so on.
The code does not depend on any external library at
runtime. Instead, it uses a pregenerated table made with iconv (see
gen_iconv.cpp and the Makefile) which is located in tables_gen.hpp.
This is both faster and uses less dependencies. The tables would
only need to be regenerated if we are adding support for more input
encodings. As such, there is no need to make the generator code
platform independent.
The library is optimized for the case of pure ASCII input strings,
which is the vast majority of cases at least for the English
version. A test of my version of Morrowind.esm got 130 non-ASCII vs
236195 ASCII strings, or less than 0.06% of strings containing
non-ASCII characters.
To optimize for this, if the first pass of the string does not find
any non-ASCII characters, the entire string is passed along without
any modification.
Most of the non-ASCII strings are books, and are quite large. (The
non-ASCII characters are typically starting and ending quotation
marks.) Within these, almost all the characters are ASCII. For this
purpose, the library is also optimized for mostly-ASCII contents
even in the cases where some conversion is necessary.
*/
// Generated tables
#include "tables_gen.hpp"
// Shared global buffers, we love you.
static std::vector<char> buf;
static std::vector<char> output;
static int size;
// Make sure the given vector can hold 'size' bytes plus a terminating
// zero, which is stored at index 'size'. The vector is only ever
// grown, never shrunk.
static void resize(std::vector<char> &buf, size_t size)
{
  if(buf.size() <= size)
    // Add some extra padding to reduce the chance of having to resize
    // again later. The '+ 1' guarantees room for the terminator even
    // when 'size' is 0: plain 3*size would leave the vector empty in
    // that case and the write below would land out of bounds.
    buf.resize(3*size + 1);

  // And make sure the string is zero terminated
  buf[size] = 0;
}
// Hand out the shared, reusable input buffer for the conversion
// process. The requested length is recorded for the later call to
// getUtf8().
char *ToUTF8::getBuffer(int s)
{
  // Grow the buffer if needed; this also zero terminates it at 's'
  resize(buf, s);

  // Remember the requested size
  size = s;

  return &buf[0];
}
/** Compute the number of bytes needed to hold the zero-terminated
    string 'input' once it has been translated with the table
    'arr'. The table holds 6 bytes per input character: the first is
    the length of the translated sequence and the next 5 are the
    sequence itself.

    For optimization reasons the same pass also detects pure ASCII
    input: 'ascii' is set to true if every byte is <= 127 and to false
    otherwise, so that the caller can skip the translation entirely.
*/
static size_t getLength(const char *arr, const char* input, bool &ascii)
{
  size_t total = 0;
  bool pure = true;

  for(const unsigned char *p = reinterpret_cast<const unsigned char*>(input);
      *p; ++p)
    {
      if(*p >= 128) pure = false;

      // First byte of the table entry is the translated length
      total += arr[*p * 6];
    }

  ascii = pure;
  return total;
}
// Append the translation of the single input character 'ch' to the
// output, looking it up in the translation array 'arr'. 'out' is
// moved past the bytes that were written.
static void copyFromArray(const char *arr, unsigned char ch, char* &out)
{
  if(ch >= 128)
    {
      // Table entry: one length byte followed by the UTF8 bytes
      const char *entry = arr + ch*6;
      const int count = entry[0];
      for(int k=1; k<=count; k++)
        *(out++) = entry[k];
      return;
    }

  // ASCII values are copied straight through without a table lookup
  *(out++) = ch;
}
// Convert the contents of the shared input buffer (the one handed out
// by getBuffer() and filled in by the caller) from the code page
// 'from' to UTF8, and return the result as a new string. Conversion
// stops at the first zero byte in the buffer. Returns an empty string
// if no buffer has been requested yet.
std::string ToUTF8::getUtf8(ToUTF8::FromType from)
{
  // Pick translation array. WINDOWS_1252 is the only table there is,
  // so it doubles as the initial value: that way 'arr' is never read
  // uninitialized when asserts are compiled out. Extend this when
  // more code pages are added.
  const char *arr = windows_1252;
  assert(from == ToUTF8::WINDOWS_1252);
  (void)from; // only used by the assert above

  // Without a prior getBuffer() call there is no storage to point
  // into, and nothing to convert.
  if(buf.empty())
    return std::string();

  // Double check that the input string stops at some point (it might
  // contain zero terminators before this, inside its own data, which
  // is also ok.)
  const char* input = &buf[0];
  assert(input[size] == 0);

  // TODO: The rest of this function is designed for single-character
  // input encodings only. It also assumes that the input encoding
  // shares its first 128 values (0-127) with ASCII. These conditions
  // must be checked again if you add more input encodings later.

  // Compute output length, and check for pure ascii input at the same
  // time.
  bool ascii;
  size_t outlen = getLength(arr, input, ascii);

  // If we're pure ascii, then don't bother converting anything.
  if(ascii)
    return std::string(input, outlen);

  // Make sure the output is large enough
  resize(output, outlen);
  char *out = &output[0];

  // Translate
  while(*input)
    copyFromArray(arr, *(input++), out);

  // Make sure that we wrote the correct number of bytes
  assert((out-&output[0]) == (int)outlen);

  // And make extra sure the output is null terminated
  assert(output.size() > outlen);
  assert(output[outlen] == 0);

  // Return a string
  return std::string(&output[0], outlen);
}

@ -0,0 +1,24 @@
#ifndef COMPONENTS_TOUTF8_H
#define COMPONENTS_TOUTF8_H
#include <string>
// Conversion of text from legacy code pages to UTF8. Use it in two
// steps: fetch a buffer with getBuffer() and fill it with the raw
// input bytes, then call getUtf8() to get the converted string.
namespace ToUTF8
{
// These are all the currently supported code pages
enum FromType
{
WINDOWS_1252 // Used by English version of Morrowind (and
// probably others)
};
// Return a writable buffer of at least 'size' bytes. The buffer
// does not have to be freed.
//
// The buffer is shared: every call hands out the same storage, which
// may be reallocated by a later call, so do not keep the pointer
// around once getBuffer() has been called again. The byte at index
// 'size' is set to zero before the buffer is returned.
char* getBuffer(int size);
// Convert the previously written buffer to UTF8 from the given code
// page.
//
// Conversion stops at the first zero byte found in the buffer, so the
// returned string never contains embedded zeros.
std::string getUtf8(FromType from);
}
#endif
Loading…
Cancel
Save