mirror of
https://github.com/OpenMW/openmw.git
synced 2025-02-28 16:39:43 +00:00
Made a better workaround for the UTF encoding problem
git-svn-id: https://openmw.svn.sourceforge.net/svnroot/openmw/trunk@34 ea6a568a-9f4f-0410-981a-c910a81bb256
This commit is contained in:
parent
4bc7d1b6d9
commit
27b68f6f5b
5 changed files with 107 additions and 49 deletions
|
@ -29,7 +29,7 @@ import std.stream;
|
|||
import std.string;
|
||||
|
||||
import util.regions;
|
||||
|
||||
import monster.util.string;
|
||||
import core.resource;
|
||||
|
||||
import esm.listkeeper;
|
||||
|
@ -378,12 +378,16 @@ struct TES3File
|
|||
// This should be more than big enough for references.
|
||||
private char lookupBuffer[200];
|
||||
|
||||
// Get a temporary string. This is faster and more memory efficient
|
||||
// than the other string functions (because it is allocation free),
|
||||
// but the returned string is only valid until tmpHString() is
|
||||
// called again.
|
||||
// Read the next sub-record into the shared lookupBuffer and return it
// as a string. No memory is allocated; the result is whatever
// getString() returns for a slice of lookupBuffer, so it is overwritten
// by the next call to tmpHString().
char[] tmpHString()
{
  getSubHeader();

  // leftSub is compared against the fixed buffer size before slicing.
  // NOTE(review): leftSub is presumably filled in by getSubHeader() from
  // the file being read -- confirm. Because that makes it external
  // data, it is checked with fail() (the same error path getString()
  // uses) instead of assert(): asserts are compiled out of release
  // builds, which would leave the slice below unchecked.
  if(leftSub > lookupBuffer.length)
    fail("lookupBuffer wasn't large enough");

  // Use this to test the difference in memory consumption.
  //return getString(region.getString(leftSub));
  return getString(lookupBuffer[0..leftSub]);
}
|
||||
|
||||
|
@ -581,13 +585,17 @@ struct TES3File
|
|||
{
|
||||
getSubHeader();
|
||||
|
||||
// Hack to make MultiMark.esp load
|
||||
// Hack to make MultiMark.esp load. Zero-length strings do not
|
||||
// occur in any of the official mods, but MultiMark makes use of
|
||||
// them. For some reason, they break the rules, and contain a
|
||||
// byte (value 0) even if the header says there is no data. If
|
||||
// Morrowind accepts it, so should we.
|
||||
if(leftSub == 0)
|
||||
{
|
||||
// Skip the following zero byte
|
||||
leftRec--;
|
||||
assert(file.getc() == 0);
|
||||
// Report this by setting a flag or something?
|
||||
// TODO: Report this by setting a flag or something?
|
||||
return null;
|
||||
}
|
||||
|
||||
|
@ -729,13 +737,19 @@ struct TES3File
|
|||
|
||||
// Fill buffer of predefined length. If actual string is shorter
|
||||
// (i.e. null terminated), the buffer length is set
|
||||
// accordingly. Chopped string is returned.
|
||||
// accordingly. Chopped string is returned. All strings pass through
|
||||
// this function, so any character encoding conversions should
|
||||
// happen here.
|
||||
// Fill the caller-supplied buffer 'str' from the file, then return the
// part of it that remains after stripz() and makeUTF8() have been
// applied. Calls fail() when the stream cannot supply str.length bytes.
char[] getString(char[] str)
{
  // Read exactly str.length bytes straight into the caller's buffer.
  auto got = file.readBlock(str.ptr, str.length);
  if(str.length != got)
    fail("getString() could not find enough data in stream");

  // TODO: makeUTF8 is a hack. It replaces non-UTF characters with
  // question marks, which is neither a very desirable result nor a
  // very optimized implementation of it. It works in place and hands
  // back the slice it was given.
  return makeUTF8(stripz(str));
}
|
||||
|
||||
|
|
25
esmtool.d
25
esmtool.d
|
@ -27,12 +27,11 @@ import std.stdio;
|
|||
|
||||
import core.memory;
|
||||
import esm.esmmain;
|
||||
import monster.util.string;
|
||||
|
||||
import std.utf;
|
||||
import std.gc;
|
||||
import gcstats;
|
||||
|
||||
import monster.util.string;
|
||||
|
||||
// Not used, but we have to link it in along with the C++ stuff.
|
||||
import input.events;
|
||||
|
@ -129,8 +128,7 @@ void main(char[][] args)
|
|||
try loadTESFiles(files);
|
||||
catch(Exception e)
|
||||
{
|
||||
try {writefln(e);}
|
||||
catch {writefln("(Invalid UTF in error message)");}
|
||||
writefln(e);
|
||||
}
|
||||
catch { writefln("Error: Unkown failure"); }
|
||||
|
||||
|
@ -156,8 +154,7 @@ void main(char[][] args)
|
|||
case WT.Bolt: writef("Bolt"); break;
|
||||
default: assert(0);
|
||||
}
|
||||
try writefln(" id '%s': name '%s'", n, m.name);
|
||||
catch {writefln("(Invalid UTF string)");}
|
||||
writefln(" id '%s': name '%s'", n, m.name);
|
||||
|
||||
if(m.data.flags & Weapon.Flags.Magical)
|
||||
writefln("Magical");
|
||||
|
@ -228,20 +225,14 @@ void main(char[][] args)
|
|||
if(scptList) foreach(a, b; scripts.names) writefln(a);
|
||||
if(ciList)
|
||||
foreach(a, b; cells.in_cells)
|
||||
{
|
||||
try writefln(a);
|
||||
catch {writefln("(Invalid UTF string)");}
|
||||
}
|
||||
writefln(a);
|
||||
if(ceList)
|
||||
foreach(uint i, c; .cells.ex_cells)
|
||||
{
|
||||
int x, y;
|
||||
CellList.decompound(i, x, y);
|
||||
if(c.name.length)
|
||||
{
|
||||
try writefln("%s,%s: %s", x, y, c.name);
|
||||
catch {writefln("(Invalid UTF string)");}
|
||||
}
|
||||
writefln("%s,%s: %s", x, y, c.name);
|
||||
}
|
||||
|
||||
if(scptShow)
|
||||
|
@ -346,11 +337,7 @@ void printRaw()
|
|||
//subName == "SCTX") // For script text
|
||||
//getHString();
|
||||
{
|
||||
try{writefln("'%s'", getHString());}
|
||||
catch(UtfException e)
|
||||
{
|
||||
writefln("Got an UTF-ie, ", e);
|
||||
}
|
||||
writefln("'%s'", getHString());
|
||||
}
|
||||
else if(subName == "FLTV" || subName == "XSCL") writefln(getHFloat());
|
||||
else if(subName == "INTV" /*|| subName == "NAM0"*/ || subName == "FRMR")
|
||||
|
|
|
@ -24,8 +24,8 @@
|
|||
|
||||
module monster.util.string;
|
||||
|
||||
import std.utf;
|
||||
import std.string;
|
||||
import std.utf;
|
||||
|
||||
bool begins(char[] str, char[] start)
|
||||
{
|
||||
|
@ -137,6 +137,81 @@ unittest
|
|||
assert("".iEnds(""));
|
||||
}
|
||||
|
||||
// A specialized version of std.utf.decode()
|
||||
// Checks one UTF-8 sequence starting at s[idx], following the same
// rules as std.utf.decode() but signalling failure through the return
// value. The decoded code point is only used for validation and is not
// handed back.
//
// Returns true and advances idx past the sequence when it is well
// formed. Returns false and leaves idx untouched otherwise.
private bool fdecode(char[] s, inout size_t idx)
{
  size_t pos = idx;
  char lead = s[pos];

  // 0xxxxxxx: plain ASCII, a complete one-byte sequence.
  if ((lead & 0x80) == 0)
    {
      idx = pos + 1;
      return true;
    }

  /* Multi-byte layouts. Only the first three are accepted; the 5 and
   * 6 byte forms are rejected:
   *  110xxxxx 10xxxxxx
   *  1110xxxx 10xxxxxx 10xxxxxx
   *  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
   *  111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
   *  1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
   */

  // The number of leading one bits in the first byte is the total
  // length of the sequence.
  uint count = 1;
  while (((lead << count) & 0x80) != 0)
    {
      count++;
      if (count > 4)
        return false; // only do the first 4 of 6 encodings
    }
  if (count == 1)
    return false; // 10xxxxxx is a trailing byte, not a lead byte

  // Pick off the (7 - count) payload bits of the lead byte.
  dchar point = cast(dchar)(lead & ((1 << (7 - count)) - 1));

  if (pos + (count - 1) >= s.length)
    return false; // off end of string

  /* The following combinations are overlong, and illegal:
   *  1100000x (10xxxxxx)
   *  11100000 100xxxxx (10xxxxxx)
   *  11110000 1000xxxx (10xxxxxx 10xxxxxx)
   *  11111000 10000xxx (10xxxxxx 10xxxxxx 10xxxxxx)
   *  11111100 100000xx (10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx)
   * The 0xF8 and 0xFC lead bytes never get this far because of the
   * count > 4 rejection above; the tests are kept for completeness.
   */
  char second = s[pos + 1];
  if ((lead & 0xFE) == 0xC0)
    return false; // overlong combination
  if (lead == 0xE0 && (second & 0xE0) == 0x80)
    return false; // overlong combination
  if (lead == 0xF0 && (second & 0xF0) == 0x80)
    return false; // overlong combination
  if (lead == 0xF8 && (second & 0xF8) == 0x80)
    return false; // overlong combination
  if (lead == 0xFC && (second & 0xFC) == 0x80)
    return false; // overlong combination

  // Fold in the trailing bytes, each of which must look like 10xxxxxx.
  for (uint k = 1; k < count; k++)
    {
      char trail = s[pos + k];
      if ((trail & 0xC0) != 0x80)
        return false; // trailing bytes are 10xxxxxx
      point = (point << 6) | (trail & 0x3F);
    }

  if (!isValidDchar(point))
    return false;

  idx = pos + count;
  return true;
}
|
||||
|
||||
// Converts any string to valid UTF8 so it can be safely printed. It
|
||||
// does not translate from other encodings but simply replaces invalid
|
||||
// characters with 'replace'. Does everything in place.
|
||||
|
@ -144,11 +219,8 @@ char[] makeUTF8(char[] str, char replace = '?')
|
|||
{
|
||||
size_t idx = 0;
|
||||
while(idx < str.length)
|
||||
{
|
||||
try decode(str, idx);
|
||||
catch(UtfException ue)
|
||||
str[idx++] = replace;
|
||||
}
|
||||
if(!fdecode(str, idx))
|
||||
str[idx++] = replace;
|
||||
return str;
|
||||
}
|
||||
|
||||
|
|
|
@ -25,6 +25,8 @@ module nif.extra;
|
|||
import nif.record;
|
||||
import nif.controlled;
|
||||
|
||||
import monster.util.string;
|
||||
|
||||
abstract class Extra : Record
|
||||
{
|
||||
Extra extra;
|
||||
|
@ -98,7 +100,7 @@ class NiTextKeyExtraData : Extra
|
|||
k.string = nifFile.getString;
|
||||
|
||||
debug(verbose)
|
||||
writefln(" %d: %s @ %f ", i, k.string.clean().chomp(), k.time);
|
||||
writefln(" %d: %s @ %f ", i, makeUTF8(k.string).chomp(), k.time);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -120,7 +122,7 @@ class NiStringExtraData : Extra
|
|||
debug(verbose)
|
||||
{
|
||||
// "NCO" means 'no collision', I think
|
||||
writefln("String: %s", string.clean());
|
||||
writefln("String: %s", makeUTF8(string));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
17
nif/misc.d
17
nif/misc.d
|
@ -24,26 +24,9 @@
|
|||
// This doesn't have to be part of the nif package at all.
|
||||
module nif.misc;
|
||||
|
||||
import std.utf;
|
||||
import std.string;
|
||||
import monster.util.string;
|
||||
|
||||
// Find an alternative to this
|
||||
char[] clean(char[] s)
|
||||
{
|
||||
try{validate(s);}
|
||||
catch(UtfException e)
|
||||
{
|
||||
return "(invalid utf-string)";
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
unittest
|
||||
{
|
||||
assert(clean("abc æøå") == "abc æøå");
|
||||
}
|
||||
|
||||
struct Vector
|
||||
{
|
||||
float array[3];
|
||||
|
|
Loading…
Reference in a new issue