From 358e1ca5a5529ec0a59050f015dbd706c53b95ee Mon Sep 17 00:00:00 2001 From: Nicolay Korslund Date: Wed, 18 Aug 2010 18:45:44 +0200 Subject: [PATCH] Added custom UTF8 converter. Removed iconv dependency. --- CMakeLists.txt | 12 +- apps/esmtool/CMakeLists.txt | 1 + apps/openmw/CMakeLists.txt | 1 - cmake/FindIconv.cmake | 69 -------- components/esm/esm_reader.hpp | 115 +------------ components/to_utf8/.gitignore | 1 + components/to_utf8/Makefile | 5 + components/to_utf8/gen_iconv.cpp | 86 ++++++++++ components/to_utf8/tables_gen.hpp | 259 ++++++++++++++++++++++++++++++ components/to_utf8/to_utf8.cpp | 159 ++++++++++++++++++ components/to_utf8/to_utf8.hpp | 24 +++ 11 files changed, 551 insertions(+), 181 deletions(-) delete mode 100644 cmake/FindIconv.cmake create mode 100644 components/to_utf8/.gitignore create mode 100644 components/to_utf8/Makefile create mode 100644 components/to_utf8/gen_iconv.cpp create mode 100644 components/to_utf8/tables_gen.hpp create mode 100644 components/to_utf8/to_utf8.cpp create mode 100644 components/to_utf8/to_utf8.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 32f4e483a..b4d2900b1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -44,6 +44,12 @@ set(NIFOGRE_HEADER ${COMP_DIR}/nifogre/ogre_nif_loader.hpp) source_group(components\\nifogre FILES ${NIFOGRE} ${NIFOGRE_HEADER}) +set(TO_UTF8 + ${COMP_DIR}/to_utf8/to_utf8.cpp) +set(TO_UTF8_HEADER + ${COMP_DIR}/to_utf8/to_utf8.hpp) +source_group(components\\to_utf8 FILES ${TO_UTF8} ${TO_UTF8_HEADER}) + set(ESM_STORE ${COMP_DIR}/esm_store/store.cpp) set(ESM_STORE_HEADER @@ -75,10 +81,10 @@ file(GLOB INTERPRETER ${COMP_DIR}/interpreter/*.cpp) file(GLOB INTERPRETER_HEADER ${COMP_DIR}/interpreter/*.hpp) source_group(components\\interpreter FILES ${INTERPRETER} ${INTERPRETER_HEADER}) -set(COMPONENTS ${BSA} ${NIF} ${NIFOGRE} ${ESM_STORE} ${MISC} +set(COMPONENTS ${BSA} ${NIF} ${NIFOGRE} ${ESM_STORE} ${MISC} ${TO_UTF8} ${COMPILER} ${INTERPRETER} ${ESM}) set(COMPONENTS_HEADER ${BSA_HEADER} 
${NIF_HEADER} ${NIFOGRE_HEADER} ${ESM_STORE_HEADER} - ${ESM_HEADER} ${MISC_HEADER} ${COMPILER_HEADER} + ${ESM_HEADER} ${MISC_HEADER} ${COMPILER_HEADER} ${TO_UTF8_HEADER} ${INTERPRETER_HEADER}) # source directory: libs @@ -158,12 +164,10 @@ endif (WIN32) find_package(OGRE REQUIRED) find_package(Boost REQUIRED COMPONENTS system filesystem program_options thread) find_package(OIS REQUIRED) -find_package(Iconv REQUIRED) find_package(OpenAL REQUIRED) include_directories("." ${OGRE_INCLUDE_DIR} ${OGRE_INCLUDE_DIR}/Ogre ${OIS_INCLUDE_DIR} ${Boost_INCLUDE_DIR} - ${ICONV_INCLUDE_DIR} ${PLATFORM_INCLUDE_DIR} ${CMAKE_HOME_DIRECTORY}/extern/caelum/include ${CMAKE_HOME_DIRECTORY}/extern/mygui_3.0.1/MyGUIEngine/include diff --git a/apps/esmtool/CMakeLists.txt b/apps/esmtool/CMakeLists.txt index a5ff810af..5b2f846e2 100644 --- a/apps/esmtool/CMakeLists.txt +++ b/apps/esmtool/CMakeLists.txt @@ -9,6 +9,7 @@ source_group(apps\\esmtool FILES ${ESMTOOL}) add_executable(esmtool ${ESMTOOL} ${MISC} ${MISC_HEADER} + ${TO_UTF8} ) target_link_libraries(esmtool diff --git a/apps/openmw/CMakeLists.txt b/apps/openmw/CMakeLists.txt index 35fc597fe..b5948aaea 100644 --- a/apps/openmw/CMakeLists.txt +++ b/apps/openmw/CMakeLists.txt @@ -199,7 +199,6 @@ target_link_libraries(openmw ${Boost_LIBRARIES} ${OPENAL_LIBRARY} ${SOUND_INPUT_LIBRARY} - ${ICONV_LIBRARIES} caelum MyGUIEngine MyGUI.OgrePlatform diff --git a/cmake/FindIconv.cmake b/cmake/FindIconv.cmake deleted file mode 100644 index 571a959af..000000000 --- a/cmake/FindIconv.cmake +++ /dev/null @@ -1,69 +0,0 @@ -# - Try to find Iconv -# Once done this will define -# -# ICONV_FOUND - system has Iconv -# ICONV_INCLUDE_DIR - the Iconv include directory -# ICONV_LIBRARIES - Link these to use Iconv -# ICONV_SECOND_ARGUMENT_IS_CONST - the second argument for iconv() is const -# -include(CheckCCompilerFlag) -include(CheckCXXSourceCompiles) - -IF (ICONV_INCLUDE_DIR AND ICONV_LIBRARIES) - # Already in cache, be silent - SET(ICONV_FIND_QUIETLY TRUE) 
-ENDIF (ICONV_INCLUDE_DIR AND ICONV_LIBRARIES) - -IF(WIN32) - SET(ICONV_INCLUDE_DIR $ENV{ICONV_INCLUDE_DIR}) - SET(ICONV_LIBRARIES $ENV{ICONV_LIBRARIES}) -ENDIF(WIN32) - -FIND_PATH(ICONV_INCLUDE_DIR iconv.h) - -FIND_LIBRARY(ICONV_LIBRARIES NAMES iconv libiconv c) - -IF(ICONV_INCLUDE_DIR AND ICONV_LIBRARIES) - SET(ICONV_FOUND TRUE) -ENDIF(ICONV_INCLUDE_DIR AND ICONV_LIBRARIES) - -set(CMAKE_REQUIRED_INCLUDES ${ICONV_INCLUDE_DIR}) -set(CMAKE_REQUIRED_LIBRARIES ${ICONV_LIBRARIES}) -IF(ICONV_FOUND) - check_c_compiler_flag("-Werror" ICONV_HAVE_WERROR) - set (CMAKE_C_FLAGS_BACKUP "${CMAKE_C_FLAGS}") - if(ICONV_HAVE_WERROR) - set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Werror") - endif(ICONV_HAVE_WERROR) - check_c_source_compiles(" - #include - int main(){ - iconv_t conv = 0; - const char* in = 0; - size_t ilen = 0; - char* out = 0; - size_t olen = 0; - iconv(conv, &in, &ilen, &out, &olen); - return 0; - } -" ICONV_SECOND_ARGUMENT_IS_CONST ) - set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS_BACKUP}") -ENDIF(ICONV_FOUND) -set(CMAKE_REQUIRED_INCLUDES) -set(CMAKE_REQUIRED_LIBRARIES) - -IF(ICONV_FOUND) - IF(NOT ICONV_FIND_QUIETLY) - MESSAGE(STATUS "Found Iconv: ${ICONV_LIBRARIES}") - ENDIF(NOT ICONV_FIND_QUIETLY) -ELSE(ICONV_FOUND) - IF(Iconv_FIND_REQUIRED) - MESSAGE(FATAL_ERROR "Could not find Iconv") - ENDIF(Iconv_FIND_REQUIRED) -ENDIF(ICONV_FOUND) - -MARK_AS_ADVANCED( - ICONV_INCLUDE_DIR - ICONV_LIBRARIES - ICONV_SECOND_ARGUMENT_IS_CONST -) diff --git a/components/esm/esm_reader.hpp b/components/esm/esm_reader.hpp index 32f5570ad..6e214e1bd 100644 --- a/components/esm/esm_reader.hpp +++ b/components/esm/esm_reader.hpp @@ -3,22 +3,18 @@ #include #include -#include #include #include #include -#include -#include - -#ifndef __WIN32__ - #include -#endif +#include #include #include #include #include +#include + #ifdef __APPLE__ // need our own implementation of strnlen static size_t strnlen(const char *s, size_t n) @@ -603,112 +599,17 @@ public: void getName(NAME &name) { getT(name); } void 
getUint(uint32_t &u) { getT(u); } - // Read the next size bytes and return them as a string + // Read the next 'size' bytes and return them as a string. Converts + // them from native encoding to UTF8 in the process. std::string getString(int size) { - // Not very optimized, but we can fix that later - char *ptr = new char[size]; + char *ptr = ToUTF8::getBuffer(size); esm->read(ptr,size); - // Remove any zero terminators - for(int i=0; i inputBuffer (input.begin(), input.end()); - char *inputBufferBegin = &inputBuffer[0]; - - size_t inputBytesLeft = inputSize; //bytes to convert - - static const size_t outputSize = 1000; - size_t outputBytesLeft; - - char outputBuffer[outputSize]; - char *outputBufferBegin; - - while (inputBytesLeft > 0) - { - outputBytesLeft = outputSize; - outputBufferBegin = outputBuffer; - - if (iconv (cd, &inputBufferBegin, &inputBytesLeft, &outputBufferBegin, &outputBytesLeft) == (size_t)-1) - { - switch (errno) - { - case E2BIG: //outputBuffer is full - output += std::string (outputBuffer, outputSize); - break; - case EILSEQ: - fail ("Iconv: Invalid multibyte sequence.\n"); - break; - case EINVAL: - fail ("Iconv: Incomplete multibyte sequence.\n"); - break; - default: - fail ("Iconv: Unknown Error\n"); - } - - } - } - - //read only relevant bytes from outputBuffer - output += std::string (outputBuffer, outputSize - outputBytesLeft); - - } - } - - iconv_close (cd); - - return output; - } -#endif - void skip(int bytes) { esm->seek(esm->tell()+bytes); } uint64_t getOffset() { return esm->tell(); } diff --git a/components/to_utf8/.gitignore b/components/to_utf8/.gitignore new file mode 100644 index 000000000..4e0357749 --- /dev/null +++ b/components/to_utf8/.gitignore @@ -0,0 +1 @@ +gen_iconv diff --git a/components/to_utf8/Makefile b/components/to_utf8/Makefile new file mode 100644 index 000000000..a84cc240e --- /dev/null +++ b/components/to_utf8/Makefile @@ -0,0 +1,5 @@ +tables_gen.hpp: gen_iconv + gen_iconv > tables_gen.hpp + +gen_iconv: 
gen_iconv.cpp + g++ -Wall $^ -o $@ diff --git a/components/to_utf8/gen_iconv.cpp b/components/to_utf8/gen_iconv.cpp new file mode 100644 index 000000000..42e997783 --- /dev/null +++ b/components/to_utf8/gen_iconv.cpp @@ -0,0 +1,86 @@ +// This program generates the file tables_gen.hpp + +#include +#include +using namespace std; + +#include +#include + +void tab() { cout << " "; } + +// write one number with a space in front of it and a comma after it +void num(unsigned char i, bool last) +{ + cout << " 0x" << (unsigned)i; + if(!last) cout << ","; +} + +// Write one table entry (UTF8 value), 1-5 bytes +void writeChar(char *value, int length, bool last, const std::string &comment="") +{ + assert(length >= 1 && length <= 5); + tab(); + num(length, false); + for(int i=0;i<5;i++) + num(value[i], last && i==4); + + if(comment != "") + cout << " // " << comment; + + cout << endl; +} + +// What to write on missing characters +void writeMissing(bool last) +{ + // Just write a space character + char value[5]; + value[0] = ' '; + for(int i=1; i<5; i++) + value[i] = 0; + writeChar(value, 1, last, "not part of this charset"); +} + +int write_table(const std::string &charset, const std::string &tableName) +{ + // Write table header + cout << "static char " << tableName << "[] =\n{\n"; + + // Open conversion system + iconv_t cd = iconv_open ("UTF-8", charset.c_str()); + + // Convert each character from 0 to 255 + for(int i=0; i<256; i++) + { + bool last = (i==255); + + char input = i; + char *iptr = &input; + size_t ileft = 1; + + char output[5]; + for(int k=0; k<5; k++) output[k] = 0; + char *optr = output; + size_t oleft = 5; + + size_t res = iconv(cd, &iptr, &ileft, &optr, &oleft); + + if(res) writeMissing(last); + else writeChar(output, 5-oleft, last); + } + + iconv_close (cd); + + // Finish table + cout << "};\n"; +} + +int main() +{ + cout << hex; + + // English + write_table("WINDOWS-1252", "windows_1252"); + return 0; +} diff --git a/components/to_utf8/tables_gen.hpp 
b/components/to_utf8/tables_gen.hpp new file mode 100644 index 000000000..55a06cd94 --- /dev/null +++ b/components/to_utf8/tables_gen.hpp @@ -0,0 +1,259 @@ +static char windows_1252[] = +{ + 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x1, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x2, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x3, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x4, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x5, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x6, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x7, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x8, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x9, 0x0, 0x0, 0x0, 0x0, + 0x1, 0xa, 0x0, 0x0, 0x0, 0x0, + 0x1, 0xb, 0x0, 0x0, 0x0, 0x0, + 0x1, 0xc, 0x0, 0x0, 0x0, 0x0, + 0x1, 0xd, 0x0, 0x0, 0x0, 0x0, + 0x1, 0xe, 0x0, 0x0, 0x0, 0x0, + 0x1, 0xf, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x10, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x11, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x12, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x13, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x14, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x15, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x16, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x17, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x18, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x19, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x1a, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x1b, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x1c, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x1d, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x1e, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x1f, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x20, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x21, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x22, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x23, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x24, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x25, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x26, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x27, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x28, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x29, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x2a, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x2b, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x2c, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x2d, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x2e, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x2f, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x30, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x31, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x32, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x33, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x34, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x35, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x36, 0x0, 0x0, 0x0, 0x0, + 0x1, 
0x37, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x38, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x39, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x3a, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x3b, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x3c, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x3d, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x3e, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x3f, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x40, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x41, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x42, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x43, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x44, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x45, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x46, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x47, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x48, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x49, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x4a, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x4b, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x4c, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x4d, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x4e, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x4f, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x50, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x51, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x52, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x53, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x54, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x55, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x56, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x57, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x58, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x59, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x5a, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x5b, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x5c, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x5d, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x5e, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x5f, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x60, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x61, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x62, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x63, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x64, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x65, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x66, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x67, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x68, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x69, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x6a, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x6b, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x6c, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x6d, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x6e, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x6f, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x70, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x71, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x72, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x73, 0x0, 0x0, 
0x0, 0x0, + 0x1, 0x74, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x75, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x76, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x77, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x78, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x79, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x7a, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x7b, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x7c, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x7d, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x7e, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x7f, 0x0, 0x0, 0x0, 0x0, + 0x3, 0xe2, 0x82, 0xac, 0x0, 0x0, + 0x1, 0x20, 0x0, 0x0, 0x0, 0x0, // not part of this charset + 0x3, 0xe2, 0x80, 0x9a, 0x0, 0x0, + 0x2, 0xc6, 0x92, 0x0, 0x0, 0x0, + 0x3, 0xe2, 0x80, 0x9e, 0x0, 0x0, + 0x3, 0xe2, 0x80, 0xa6, 0x0, 0x0, + 0x3, 0xe2, 0x80, 0xa0, 0x0, 0x0, + 0x3, 0xe2, 0x80, 0xa1, 0x0, 0x0, + 0x2, 0xcb, 0x86, 0x0, 0x0, 0x0, + 0x3, 0xe2, 0x80, 0xb0, 0x0, 0x0, + 0x2, 0xc5, 0xa0, 0x0, 0x0, 0x0, + 0x3, 0xe2, 0x80, 0xb9, 0x0, 0x0, + 0x2, 0xc5, 0x92, 0x0, 0x0, 0x0, + 0x1, 0x20, 0x0, 0x0, 0x0, 0x0, // not part of this charset + 0x2, 0xc5, 0xbd, 0x0, 0x0, 0x0, + 0x1, 0x20, 0x0, 0x0, 0x0, 0x0, // not part of this charset + 0x1, 0x20, 0x0, 0x0, 0x0, 0x0, // not part of this charset + 0x3, 0xe2, 0x80, 0x98, 0x0, 0x0, + 0x3, 0xe2, 0x80, 0x99, 0x0, 0x0, + 0x3, 0xe2, 0x80, 0x9c, 0x0, 0x0, + 0x3, 0xe2, 0x80, 0x9d, 0x0, 0x0, + 0x3, 0xe2, 0x80, 0xa2, 0x0, 0x0, + 0x3, 0xe2, 0x80, 0x93, 0x0, 0x0, + 0x3, 0xe2, 0x80, 0x94, 0x0, 0x0, + 0x2, 0xcb, 0x9c, 0x0, 0x0, 0x0, + 0x3, 0xe2, 0x84, 0xa2, 0x0, 0x0, + 0x2, 0xc5, 0xa1, 0x0, 0x0, 0x0, + 0x3, 0xe2, 0x80, 0xba, 0x0, 0x0, + 0x2, 0xc5, 0x93, 0x0, 0x0, 0x0, + 0x1, 0x20, 0x0, 0x0, 0x0, 0x0, // not part of this charset + 0x2, 0xc5, 0xbe, 0x0, 0x0, 0x0, + 0x2, 0xc5, 0xb8, 0x0, 0x0, 0x0, + 0x2, 0xc2, 0xa0, 0x0, 0x0, 0x0, + 0x2, 0xc2, 0xa1, 0x0, 0x0, 0x0, + 0x2, 0xc2, 0xa2, 0x0, 0x0, 0x0, + 0x2, 0xc2, 0xa3, 0x0, 0x0, 0x0, + 0x2, 0xc2, 0xa4, 0x0, 0x0, 0x0, + 0x2, 0xc2, 0xa5, 0x0, 0x0, 0x0, + 0x2, 0xc2, 0xa6, 0x0, 0x0, 0x0, + 0x2, 0xc2, 0xa7, 0x0, 0x0, 0x0, + 0x2, 0xc2, 0xa8, 0x0, 0x0, 0x0, + 0x2, 0xc2, 0xa9, 0x0, 0x0, 0x0, + 0x2, 0xc2, 
0xaa, 0x0, 0x0, 0x0, + 0x2, 0xc2, 0xab, 0x0, 0x0, 0x0, + 0x2, 0xc2, 0xac, 0x0, 0x0, 0x0, + 0x2, 0xc2, 0xad, 0x0, 0x0, 0x0, + 0x2, 0xc2, 0xae, 0x0, 0x0, 0x0, + 0x2, 0xc2, 0xaf, 0x0, 0x0, 0x0, + 0x2, 0xc2, 0xb0, 0x0, 0x0, 0x0, + 0x2, 0xc2, 0xb1, 0x0, 0x0, 0x0, + 0x2, 0xc2, 0xb2, 0x0, 0x0, 0x0, + 0x2, 0xc2, 0xb3, 0x0, 0x0, 0x0, + 0x2, 0xc2, 0xb4, 0x0, 0x0, 0x0, + 0x2, 0xc2, 0xb5, 0x0, 0x0, 0x0, + 0x2, 0xc2, 0xb6, 0x0, 0x0, 0x0, + 0x2, 0xc2, 0xb7, 0x0, 0x0, 0x0, + 0x2, 0xc2, 0xb8, 0x0, 0x0, 0x0, + 0x2, 0xc2, 0xb9, 0x0, 0x0, 0x0, + 0x2, 0xc2, 0xba, 0x0, 0x0, 0x0, + 0x2, 0xc2, 0xbb, 0x0, 0x0, 0x0, + 0x2, 0xc2, 0xbc, 0x0, 0x0, 0x0, + 0x2, 0xc2, 0xbd, 0x0, 0x0, 0x0, + 0x2, 0xc2, 0xbe, 0x0, 0x0, 0x0, + 0x2, 0xc2, 0xbf, 0x0, 0x0, 0x0, + 0x2, 0xc3, 0x80, 0x0, 0x0, 0x0, + 0x2, 0xc3, 0x81, 0x0, 0x0, 0x0, + 0x2, 0xc3, 0x82, 0x0, 0x0, 0x0, + 0x2, 0xc3, 0x83, 0x0, 0x0, 0x0, + 0x2, 0xc3, 0x84, 0x0, 0x0, 0x0, + 0x2, 0xc3, 0x85, 0x0, 0x0, 0x0, + 0x2, 0xc3, 0x86, 0x0, 0x0, 0x0, + 0x2, 0xc3, 0x87, 0x0, 0x0, 0x0, + 0x2, 0xc3, 0x88, 0x0, 0x0, 0x0, + 0x2, 0xc3, 0x89, 0x0, 0x0, 0x0, + 0x2, 0xc3, 0x8a, 0x0, 0x0, 0x0, + 0x2, 0xc3, 0x8b, 0x0, 0x0, 0x0, + 0x2, 0xc3, 0x8c, 0x0, 0x0, 0x0, + 0x2, 0xc3, 0x8d, 0x0, 0x0, 0x0, + 0x2, 0xc3, 0x8e, 0x0, 0x0, 0x0, + 0x2, 0xc3, 0x8f, 0x0, 0x0, 0x0, + 0x2, 0xc3, 0x90, 0x0, 0x0, 0x0, + 0x2, 0xc3, 0x91, 0x0, 0x0, 0x0, + 0x2, 0xc3, 0x92, 0x0, 0x0, 0x0, + 0x2, 0xc3, 0x93, 0x0, 0x0, 0x0, + 0x2, 0xc3, 0x94, 0x0, 0x0, 0x0, + 0x2, 0xc3, 0x95, 0x0, 0x0, 0x0, + 0x2, 0xc3, 0x96, 0x0, 0x0, 0x0, + 0x2, 0xc3, 0x97, 0x0, 0x0, 0x0, + 0x2, 0xc3, 0x98, 0x0, 0x0, 0x0, + 0x2, 0xc3, 0x99, 0x0, 0x0, 0x0, + 0x2, 0xc3, 0x9a, 0x0, 0x0, 0x0, + 0x2, 0xc3, 0x9b, 0x0, 0x0, 0x0, + 0x2, 0xc3, 0x9c, 0x0, 0x0, 0x0, + 0x2, 0xc3, 0x9d, 0x0, 0x0, 0x0, + 0x2, 0xc3, 0x9e, 0x0, 0x0, 0x0, + 0x2, 0xc3, 0x9f, 0x0, 0x0, 0x0, + 0x2, 0xc3, 0xa0, 0x0, 0x0, 0x0, + 0x2, 0xc3, 0xa1, 0x0, 0x0, 0x0, + 0x2, 0xc3, 0xa2, 0x0, 0x0, 0x0, + 0x2, 0xc3, 0xa3, 0x0, 0x0, 0x0, + 0x2, 0xc3, 0xa4, 0x0, 0x0, 0x0, + 0x2, 
0xc3, 0xa5, 0x0, 0x0, 0x0, + 0x2, 0xc3, 0xa6, 0x0, 0x0, 0x0, + 0x2, 0xc3, 0xa7, 0x0, 0x0, 0x0, + 0x2, 0xc3, 0xa8, 0x0, 0x0, 0x0, + 0x2, 0xc3, 0xa9, 0x0, 0x0, 0x0, + 0x2, 0xc3, 0xaa, 0x0, 0x0, 0x0, + 0x2, 0xc3, 0xab, 0x0, 0x0, 0x0, + 0x2, 0xc3, 0xac, 0x0, 0x0, 0x0, + 0x2, 0xc3, 0xad, 0x0, 0x0, 0x0, + 0x2, 0xc3, 0xae, 0x0, 0x0, 0x0, + 0x2, 0xc3, 0xaf, 0x0, 0x0, 0x0, + 0x2, 0xc3, 0xb0, 0x0, 0x0, 0x0, + 0x2, 0xc3, 0xb1, 0x0, 0x0, 0x0, + 0x2, 0xc3, 0xb2, 0x0, 0x0, 0x0, + 0x2, 0xc3, 0xb3, 0x0, 0x0, 0x0, + 0x2, 0xc3, 0xb4, 0x0, 0x0, 0x0, + 0x2, 0xc3, 0xb5, 0x0, 0x0, 0x0, + 0x2, 0xc3, 0xb6, 0x0, 0x0, 0x0, + 0x2, 0xc3, 0xb7, 0x0, 0x0, 0x0, + 0x2, 0xc3, 0xb8, 0x0, 0x0, 0x0, + 0x2, 0xc3, 0xb9, 0x0, 0x0, 0x0, + 0x2, 0xc3, 0xba, 0x0, 0x0, 0x0, + 0x2, 0xc3, 0xbb, 0x0, 0x0, 0x0, + 0x2, 0xc3, 0xbc, 0x0, 0x0, 0x0, + 0x2, 0xc3, 0xbd, 0x0, 0x0, 0x0, + 0x2, 0xc3, 0xbe, 0x0, 0x0, 0x0, + 0x2, 0xc3, 0xbf, 0x0, 0x0, 0x0 +}; diff --git a/components/to_utf8/to_utf8.cpp b/components/to_utf8/to_utf8.cpp new file mode 100644 index 000000000..701b7fe56 --- /dev/null +++ b/components/to_utf8/to_utf8.cpp @@ -0,0 +1,159 @@ +#include "to_utf8.hpp" + +#include +#include + +/* This file contains the code to translate from WINDOWS-1252 (native + charset used in English version of Morrowind) to UTF-8. The library + is designed to be extended to support more source encodings later, + which means that we may add support for Russian, Polish and Chinese + files and so on. + + The code does not depend on any external library at + runtime. Instead, it uses a pregenerated table made with iconv (see + gen_iconv.cpp and the Makefile) which is located in tables_gen.hpp. + + This is both faster and uses fewer dependencies. The tables would + only need to be regenerated if we are adding support for more input + encodings. As such, there is no need to make the generator code + platform independent. 
+ + The library is optimized for the case of pure ASCII input strings, + which is the vast majority of cases at least for the English + version. A test of my version of Morrowind.esm got 130 non-ASCII vs + 236195 ASCII strings, or less than 0.06% of strings containing + non-ASCII characters. + + To optimize for this, if the first pass of the string does not find + any non-ASCII characters, the entire string is passed along without + any modification. + + Most of the non-ASCII strings are books, and are quite large. (The + non-ASCII characters are typically starting and ending quotation + marks.) Within these, almost all the characters are ASCII. For this + purpose, the library is also optimized for mostly-ASCII contents + even in the cases where some conversion is necessary. + */ + + +// Generated tables +#include "tables_gen.hpp" + +// Shared global buffers, we love you. +static std::vector buf; +static std::vector output; +static int size; + +// Make sure the given vector is large enough for 'size' bytes, +// including a terminating zero after it. +static void resize(std::vector &buf, size_t size) +{ + if(buf.size() <= size) + // Add some extra padding to reduce the chance of having to resize + // again later. + buf.resize(3*size); + + // And make sure the string is zero terminated + buf[size] = 0; +} + +// This is just used to spew out a reusable input buffer for the +// conversion process. +char *ToUTF8::getBuffer(int s) +{ + // Remember the requested size + size = s; + resize(buf, size); + return &buf[0]; +} + +/** Get the total length needed to decode the given string with + the given translation array. The arrays are encoded with 6 bytes + per character, with the first giving the length and the next 5 the + actual data. + + The function serves a dual purpose for optimization reasons: it + checks if the input is pure ascii (all values are <= 127). 
If this + is the case, then the ascii parameter is set to true, and the + caller can optimize for this case. + */ +static size_t getLength(const char *arr, const char* input, bool &ascii) +{ + ascii = true; + size_t len = 0; + unsigned char inp = *input; + while(inp) + { + if(inp > 127) ascii = false; + len += arr[inp*6]; + inp = *(++input); + } + return len; +} + +// Translate one character 'ch' using the translation array 'arr', and +// advance the output pointer accordingly. +static void copyFromArray(const char *arr, unsigned char ch, char* &out) +{ + // Optimize for ASCII values + if(ch < 128) + { + *(out++) = ch; + return; + } + + const char *in = arr + ch*6; + int len = *(in++); + for(int i=0; i outlen); + assert(output[outlen] == 0); + + // Return a string + return std::string(&output[0], outlen); +} + diff --git a/components/to_utf8/to_utf8.hpp b/components/to_utf8/to_utf8.hpp new file mode 100644 index 000000000..ec2231be7 --- /dev/null +++ b/components/to_utf8/to_utf8.hpp @@ -0,0 +1,24 @@ +#ifndef COMPONENTS_TOUTF8_H +#define COMPONENTS_TOUTF8_H + +#include + +namespace ToUTF8 +{ + // These are all the currently supported code pages + enum FromType + { + WINDOWS_1252 // Used by English version of Morrowind (and + // probably others) + }; + + // Return a writable buffer of at least 'size' bytes. The buffer + // does not have to be freed. + char* getBuffer(int size); + + // Convert the previously written buffer to UTF8 from the given code + // page. + std::string getUtf8(FromType from); +} + +#endif