Made a better workaround for the UTF encoding problem

git-svn-id: https://openmw.svn.sourceforge.net/svnroot/openmw/trunk@34 ea6a568a-9f4f-0410-981a-c910a81bb256
actorid
nkorslund 17 years ago
parent 4bc7d1b6d9
commit 27b68f6f5b

@ -29,7 +29,7 @@ import std.stream;
import std.string;
import util.regions;
import monster.util.string;
import core.resource;
import esm.listkeeper;
@ -378,12 +378,16 @@ struct TES3File
// This should be more than big enough for references.
private char lookupBuffer[200];
// Get a temporary string. This is faster and more memory efficient
// than the other string functions (because it is allocation free),
// but the returned string is only valid until tmpHString() is
// called again: the result is a slice of the shared lookupBuffer.
char[] tmpHString()
{
  getSubHeader();

  // Check the size at run time rather than with an assert. Asserts are
  // compiled out of release builds, which would leave the slice below
  // unguarded. (leftSub is presumably taken from the sub-record header
  // by getSubHeader() - confirm.)
  if(leftSub > lookupBuffer.length)
    fail("lookupBuffer wasn't large enough");

  // Use this to test the difference in memory consumption.
  //return getString(region.getString(leftSub));
  return getString(lookupBuffer[0..leftSub]);
}
@ -581,13 +585,17 @@ struct TES3File
{
getSubHeader();
// Hack to make MultiMark.esp load
// Hack to make MultiMark.esp load. Zero-length strings do not
// occur in any of the official mods, but MultiMark makes use of
// them. For some reason, they break the rules, and contain a
// byte (value 0) even if the header says there is no data. If
// Morrowind accepts it, so should we.
if(leftSub == 0)
{
// Skip the following zero byte
leftRec--;
assert(file.getc() == 0);
// Report this by setting a flag or something?
// TODO: Report this by setting a flag or something?
return null;
}
@ -729,13 +737,19 @@ struct TES3File
// Fill buffer of predefined length. If actual string is shorter
// (ie. null terminated), the buffer length is set
// accordingly. Chopped string is returned.
// accordingly. Chopped string is returned. All strings pass through
// this function, so any character encoding conversions should
// happen here.
char[] getString(char[] str)
{
  // The whole buffer must be filled; a short read is an error.
  size_t got = file.readBlock(str.ptr, str.length);
  if(got != str.length)
    fail("getString() could not find enough data in stream");

  // Chop the string with stripz(), then patch it up in place so it is
  // safe to print.
  // TODO: A hack. Will replace non-utf characters with question
  // marks. This is neither a very desirable result nor a very
  // optimized implementation of it.
  return makeUTF8(stripz(str));
}

@ -27,12 +27,11 @@ import std.stdio;
import core.memory;
import esm.esmmain;
import monster.util.string;
import std.utf;
import std.gc;
import gcstats;
import monster.util.string;
// Not used, but we have to link it in along with the C++ stuff.
import input.events;
@ -129,8 +128,7 @@ void main(char[][] args)
try loadTESFiles(files);
catch(Exception e)
{
try {writefln(e);}
catch {writefln("(Invalid UTF in error message)");}
writefln(e);
}
catch { writefln("Error: Unkown failure"); }
@ -156,8 +154,7 @@ void main(char[][] args)
case WT.Bolt: writef("Bolt"); break;
default: assert(0);
}
try writefln(" id '%s': name '%s'", n, m.name);
catch {writefln("(Invalid UTF string)");}
writefln(" id '%s': name '%s'", n, m.name);
if(m.data.flags & Weapon.Flags.Magical)
writefln("Magical");
@ -228,20 +225,14 @@ void main(char[][] args)
if(scptList) foreach(a, b; scripts.names) writefln(a);
if(ciList)
foreach(a, b; cells.in_cells)
{
try writefln(a);
catch {writefln("(Invalid UTF string)");}
}
writefln(a);
if(ceList)
foreach(uint i, c; .cells.ex_cells)
{
int x, y;
CellList.decompound(i, x, y);
if(c.name.length)
{
try writefln("%s,%s: %s", x, y, c.name);
catch {writefln("(Invalid UTF string)");}
}
writefln("%s,%s: %s", x, y, c.name);
}
if(scptShow)
@ -346,11 +337,7 @@ void printRaw()
//subName == "SCTX") // For script text
//getHString();
{
try{writefln("'%s'", getHString());}
catch(UtfException e)
{
writefln("Got an UTF-ie, ", e);
}
writefln("'%s'", getHString());
}
else if(subName == "FLTV" || subName == "XSCL") writefln(getHFloat());
else if(subName == "INTV" /*|| subName == "NAM0"*/ || subName == "FRMR")

@ -24,8 +24,8 @@
module monster.util.string;
import std.utf;
import std.string;
import std.utf;
bool begins(char[] str, char[] start)
{
@ -137,6 +137,81 @@ unittest
assert("".iEnds(""));
}
// A specialized version of std.utf.decode() that reports malformed
// input through its return value.
//
// Checks whether a well-formed UTF-8 sequence starts at s[idx]. On
// success, idx is advanced past the sequence and true is returned. On
// failure, false is returned and idx is left unchanged. The decoded
// code point is only used for validation and is not handed back.
//
// s[idx] is read unconditionally, so idx must be a valid index into s.
private bool fdecode(char[] s, inout size_t idx)
{
  size_t len = s.length;
  dchar V;
  size_t i = idx;
  char u = s[i];

  if (u & 0x80)
    {
      uint n;
      char u2;

      /* The following encodings are valid, except for the 5 and 6 byte
       * combinations:
       *  0xxxxxxx
       *  110xxxxx 10xxxxxx
       *  1110xxxx 10xxxxxx 10xxxxxx
       *  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
       *  111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
       *  1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
       */
      // Count the leading 1 bits of the first byte; that is the length
      // n of the whole sequence.
      for (n = 1; ; n++)
        {
          if (n > 4)
            return false; // only do the first 4 of 6 encodings
          if (((u << n) & 0x80) == 0)
            {
              // A lone 10xxxxxx byte is a continuation byte and cannot
              // start a sequence.
              if (n == 1)
                return false;
              break;
            }
        }

      // Pick off (7 - n) significant bits of B from first byte of octet
      V = cast(dchar)(u & ((1 << (7 - n)) - 1));

      if (i + (n - 1) >= len)
        return false; // off end of string

      /* The following combinations are overlong, and illegal:
       *  1100000x (10xxxxxx)
       *  11100000 100xxxxx (10xxxxxx)
       *  11110000 1000xxxx (10xxxxxx 10xxxxxx)
       * The overlong 5 and 6 byte forms (lead bytes 0xF8 and 0xFC) need
       * no test here: the loop above has already rejected every lead
       * byte that announces more than 4 bytes.
       */
      u2 = s[i + 1];
      if ((u & 0xFE) == 0xC0 ||
          (u == 0xE0 && (u2 & 0xE0) == 0x80) ||
          (u == 0xF0 && (u2 & 0xF0) == 0x80))
        return false; // overlong combination

      // Fold in the payload bits of the remaining n - 1 bytes.
      for (uint j = 1; j != n; j++)
        {
          u = s[i + j];
          if ((u & 0xC0) != 0x80)
            return false; // trailing bytes are 10xxxxxx
          V = (V << 6) | (u & 0x3F);
        }

      if (!isValidDchar(V))
        return false;
      i += n;
    }
  else
    {
      // Plain 7-bit ASCII occupies a single byte.
      V = cast(dchar) u;
      i++;
    }

  idx = i;
  return true;
}
// Converts any string to valid UTF8 so it can be safely printed. It
// does not translate from other encodings but simply replaces invalid
// characters with 'replace'. Does everything in place.
@ -144,11 +219,8 @@ char[] makeUTF8(char[] str, char replace = '?')
{
size_t idx = 0;
while(idx < str.length)
{
try decode(str, idx);
catch(UtfException ue)
if(!fdecode(str, idx))
str[idx++] = replace;
}
return str;
}

@ -25,6 +25,8 @@ module nif.extra;
import nif.record;
import nif.controlled;
import monster.util.string;
abstract class Extra : Record
{
Extra extra;
@ -98,7 +100,7 @@ class NiTextKeyExtraData : Extra
k.string = nifFile.getString;
debug(verbose)
writefln(" %d: %s @ %f ", i, k.string.clean().chomp(), k.time);
writefln(" %d: %s @ %f ", i, makeUTF8(k.string).chomp(), k.time);
}
}
}
@ -120,7 +122,7 @@ class NiStringExtraData : Extra
debug(verbose)
{
// "NCO" means 'no collision', I think
writefln("String: %s", string.clean());
writefln("String: %s", makeUTF8(string));
}
}
}

@ -24,26 +24,9 @@
// This doesn't have to be part of the nif package at all.
module nif.misc;
import std.utf;
import std.string;
import monster.util.string;
// Find an alternative to this
//
// Returns s unchanged when validate() accepts it. When validate()
// throws a UtfException, a fixed placeholder text is returned instead.
char[] clean(char[] s)
{
  char[] result = s;

  try
    validate(s);
  catch(UtfException e)
    result = "(invalid utf-string)";

  return result;
}
// Checks that clean() hands back a string containing non-ASCII
// letters unchanged.
unittest
{
assert(clean("abc æøå") == "abc æøå");
}
struct Vector
{
float array[3];

Loading…
Cancel
Save