From 27b68f6f5ba68d1ef86e9a05bbd99f5194d4c955 Mon Sep 17 00:00:00 2001 From: nkorslund Date: Wed, 23 Jul 2008 15:19:37 +0000 Subject: [PATCH] Made a better workaround for the UTF encoding problem git-svn-id: https://openmw.svn.sourceforge.net/svnroot/openmw/trunk@34 ea6a568a-9f4f-0410-981a-c910a81bb256 --- esm/filereader.d | 24 ++++++++++--- esmtool.d | 25 ++++--------- monster/util/string.d | 84 +++++++++++++++++++++++++++++++++++++++---- nif/extra.d | 6 ++-- nif/misc.d | 17 --------- 5 files changed, 107 insertions(+), 49 deletions(-) diff --git a/esm/filereader.d b/esm/filereader.d index b730a0778..ffe627764 100644 --- a/esm/filereader.d +++ b/esm/filereader.d @@ -29,7 +29,7 @@ import std.stream; import std.string; import util.regions; - +import monster.util.string; import core.resource; import esm.listkeeper; @@ -378,12 +378,16 @@ struct TES3File // This should be more than big enough for references. private char lookupBuffer[200]; + // Get a temporary string. This is faster and more memory efficient + // than the other string functions (because it is allocation free), + // but the returned string is only valid until tmpHString() is + // called again. char[] tmpHString() { getSubHeader(); + assert(leftSub <= lookupBuffer.length, "lookupBuffer wasn't large enough"); // Use this to test the difference in memory consumption. - //return getString(region.getString(leftSub)); return getString(lookupBuffer[0..leftSub]); } @@ -581,13 +585,17 @@ struct TES3File { getSubHeader(); - // Hack to make MultiMark.esp load + // Hack to make MultiMark.esp load. Zero-length strings do not + // occur in any of the official mods, but MultiMark makes use of + // them. For some reason, they break the rules, and contain a + // byte (value 0) even if the header says there is no data. If + // Morrowind accepts it, so should we. if(leftSub == 0) { // Skip the following zero byte leftRec--; assert(file.getc() == 0); - // Report this by setting a flag or something? 
+ // TODO: Report this by setting a flag or something? return null; } @@ -729,13 +737,19 @@ struct TES3File // Fill buffer of predefined length. If actual string is shorter // (ie. null terminated), the buffer length is set - // accordingly. Chopped string is returned. + // accordingly. Chopped string is returned. All strings pass through + // this function, so any character encoding conversions should + // happen here. char[] getString(char[] str) { if(str.length != file.readBlock(str.ptr,str.length)) fail("getString() could not find enough data in stream"); str = stripz(str); + makeUTF8(str); // TODO: A hack. Will replace non-utf characters + // with question marks. This is neither a very + // desirable result nor a very optimized + // implementation of it. return str; } diff --git a/esmtool.d b/esmtool.d index 07c37089e..b4a951808 100644 --- a/esmtool.d +++ b/esmtool.d @@ -27,12 +27,11 @@ import std.stdio; import core.memory; import esm.esmmain; +import monster.util.string; -import std.utf; import std.gc; import gcstats; -import monster.util.string; // Not used, but we have to link it in along with the C++ stuff. 
import input.events; @@ -129,8 +128,7 @@ void main(char[][] args) try loadTESFiles(files); catch(Exception e) { - try {writefln(e);} - catch {writefln("(Invalid UTF in error message)");} + writefln(e); } catch { writefln("Error: Unkown failure"); } @@ -156,8 +154,7 @@ void main(char[][] args) case WT.Bolt: writef("Bolt"); break; default: assert(0); } - try writefln(" id '%s': name '%s'", n, m.name); - catch {writefln("(Invalid UTF string)");} + writefln(" id '%s': name '%s'", n, m.name); if(m.data.flags & Weapon.Flags.Magical) writefln("Magical"); @@ -228,20 +225,14 @@ void main(char[][] args) if(scptList) foreach(a, b; scripts.names) writefln(a); if(ciList) foreach(a, b; cells.in_cells) - { - try writefln(a); - catch {writefln("(Invalid UTF string)");} - } + writefln(a); if(ceList) foreach(uint i, c; .cells.ex_cells) { int x, y; CellList.decompound(i, x, y); if(c.name.length) - { - try writefln("%s,%s: %s", x, y, c.name); - catch {writefln("(Invalid UTF string)");} - } + writefln("%s,%s: %s", x, y, c.name); } if(scptShow) @@ -346,11 +337,7 @@ void printRaw() //subName == "SCTX") // For script text //getHString(); { - try{writefln("'%s'", getHString());} - catch(UtfException e) - { - writefln("Got an UTF-ie, ", e); - } + writefln("'%s'", getHString()); } else if(subName == "FLTV" || subName == "XSCL") writefln(getHFloat()); else if(subName == "INTV" /*|| subName == "NAM0"*/ || subName == "FRMR") diff --git a/monster/util/string.d b/monster/util/string.d index 0dded5359..f8c21181c 100644 --- a/monster/util/string.d +++ b/monster/util/string.d @@ -24,8 +24,8 @@ module monster.util.string; -import std.utf; import std.string; +import std.utf; bool begins(char[] str, char[] start) { @@ -137,6 +137,81 @@ unittest assert("".iEnds("")); } +// A specialized version of std.utf.decode() +private bool fdecode(char[] s, inout size_t idx) + { + size_t len = s.length; + dchar V; + size_t i = idx; + char u = s[i]; + + if (u & 0x80) + { uint n; + char u2; + + /* The following 
encodings are valid, except for the 5 and 6 byte + * combinations: + * 0xxxxxxx + * 110xxxxx 10xxxxxx + * 1110xxxx 10xxxxxx 10xxxxxx + * 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + * 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx + * 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx + */ + for (n = 1; ; n++) + { + if (n > 4) + return false; // only do the first 4 of 6 encodings + if (((u << n) & 0x80) == 0) + { + if (n == 1) + return false; + break; + } + } + + // Pick off (7 - n) significant bits of B from first byte of octet + V = cast(dchar)(u & ((1 << (7 - n)) - 1)); + + if (i + (n - 1) >= len) + return false; // off end of string + + /* The following combinations are overlong, and illegal: + * 1100000x (10xxxxxx) + * 11100000 100xxxxx (10xxxxxx) + * 11110000 1000xxxx (10xxxxxx 10xxxxxx) + * 11111000 10000xxx (10xxxxxx 10xxxxxx 10xxxxxx) + * 11111100 100000xx (10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx) + */ + u2 = s[i + 1]; + if ((u & 0xFE) == 0xC0 || + (u == 0xE0 && (u2 & 0xE0) == 0x80) || + (u == 0xF0 && (u2 & 0xF0) == 0x80) || + (u == 0xF8 && (u2 & 0xF8) == 0x80) || + (u == 0xFC && (u2 & 0xFC) == 0x80)) + return false; // overlong combination + + for (uint j = 1; j != n; j++) + { + u = s[i + j]; + if ((u & 0xC0) != 0x80) + return false; // trailing bytes are 10xxxxxx + V = (V << 6) | (u & 0x3F); + } + if (!isValidDchar(V)) + return false; + i += n; + } + else + { + V = cast(dchar) u; + i++; + } + + idx = i; + return true; + } + // Converts any string to valid UTF8 so it can be safely printed. It // does not translate from other encodings but simply replaces invalid // characters with 'replace'. Does everything in place. 
@@ -144,11 +219,8 @@ char[] makeUTF8(char[] str, char replace = '?') { size_t idx = 0; while(idx < str.length) - { - try decode(str, idx); - catch(UtfException ue) - str[idx++] = replace; - } + if(!fdecode(str, idx)) + str[idx++] = replace; return str; } diff --git a/nif/extra.d b/nif/extra.d index 559266c47..ba8e7663c 100644 --- a/nif/extra.d +++ b/nif/extra.d @@ -25,6 +25,8 @@ module nif.extra; import nif.record; import nif.controlled; +import monster.util.string; + abstract class Extra : Record { Extra extra; @@ -98,7 +100,7 @@ class NiTextKeyExtraData : Extra k.string = nifFile.getString; debug(verbose) - writefln(" %d: %s @ %f ", i, k.string.clean().chomp(), k.time); + writefln(" %d: %s @ %f ", i, makeUTF8(k.string).chomp(), k.time); } } } @@ -120,7 +122,7 @@ class NiStringExtraData : Extra debug(verbose) { // "NCO" means 'no collision', I think - writefln("String: %s", string.clean()); + writefln("String: %s", makeUTF8(string)); } } } diff --git a/nif/misc.d b/nif/misc.d index 03686c416..a01d1be5b 100644 --- a/nif/misc.d +++ b/nif/misc.d @@ -24,26 +24,9 @@ // This doesn't have to be part of the nif package at all. module nif.misc; -import std.utf; import std.string; import monster.util.string; -// Find an alternative to this -char[] clean(char[] s) -{ - try{validate(s);} - catch(UtfException e) - { - return "(invalid utf-string)"; - } - return s; -} - -unittest -{ - assert(clean("abc æøå") == "abc æøå"); -} - struct Vector { float array[3];