Made a better workaround for the UTF encoding problem

git-svn-id: https://openmw.svn.sourceforge.net/svnroot/openmw/trunk@34 ea6a568a-9f4f-0410-981a-c910a81bb256
2025-10-05 04:56:30 +00:00 · 2008-07-23 15:19:37 +00:00 · 2008-07-23 15:19:37 +00:00 · 27b68f6f5b
commit 27b68f6f5b
parent 4bc7d1b6d9
5 changed files with 107 additions and 49 deletions
--- a/esm/filereader.d
+++ b/esm/filereader.d
@ -29,7 +29,7 @@ import std.stream;
 import std.string;

 import util.regions;
-
+import monster.util.string;
 import core.resource;

 import esm.listkeeper;
@ -378,12 +378,16 @@ struct TES3File
  // This should be more than big enough for references.
  private char lookupBuffer[200];

+  // Get a temporary string. This is faster and more memory efficient
+  // than the other string functions (because it is allocation free),
+  // but the returned string is only valid until tmpHString() is
+  // called again.
  char[] tmpHString()
  {
    getSubHeader();
+    assert(leftSub <= lookupBuffer.length, "lookupBuffer wasn't large enough");

    // Use this to test the difference in memory consumption.
-    //return getString(region.getString(leftSub));
    return getString(lookupBuffer[0..leftSub]);
  }

@ -581,13 +585,17 @@ struct TES3File
    {
      getSubHeader();

-      // Hack to make MultiMark.esp load
+      // Hack to make MultiMark.esp load. Zero-length strings do not
+      // occur in any of the official mods, but MultiMark makes use of
+      // them. For some reason, they break the rules, and contain a
+      // byte (value 0) even if the header says there is no data. If
+      // Morrowind accepts it, so should we.
      if(leftSub == 0)
 	{
 	  // Skip the following zero byte
 	  leftRec--;
 	  assert(file.getc() == 0);
-	  // Report this by setting a flag or something?
+	  // TODO: Report this by setting a flag or something?
 	  return null;
 	}

@ -729,13 +737,19 @@ struct TES3File

  // Fill buffer of predefined length. If actual string is shorter
  // (ie. null terminated), the buffer length is set
-  // accordingly. Chopped string is returned.
+  // accordingly. Chopped string is returned. All strings pass through
+  // this function, so any character encoding conversions should
+  // happen here.
  char[] getString(char[] str)
    {
      if(str.length != file.readBlock(str.ptr,str.length))
 	fail("getString() could not find enough data in stream");

      str = stripz(str);
+      makeUTF8(str); // TODO: A hack. Will replace non-utf characters
+                     // with question marks. This is neither a very
+                     // desirable result nor a very optimized
+                     // implementation of it.
      return str;
    }

--- a/esmtool.d
+++ b/esmtool.d
@ -27,12 +27,11 @@ import std.stdio;

 import core.memory;
 import esm.esmmain;
+import monster.util.string;

-import std.utf;
 import std.gc;
 import gcstats;

-import monster.util.string;

 // Not used, but we have to link it in along with the C++ stuff.
 import input.events;
@ -129,8 +128,7 @@ void main(char[][] args)
  try loadTESFiles(files);
  catch(Exception e)
    {
-      try {writefln(e);}
-      catch {writefln("(Invalid UTF in error message)");}
+      writefln(e);
    }
  catch { writefln("Error: Unkown failure"); }

@ -156,8 +154,7 @@ void main(char[][] args)
 	case WT.Bolt: writef("Bolt"); break;
        default: assert(0);
 	}
-      try writefln(" id '%s': name '%s'", n, m.name);
-      catch {writefln("(Invalid UTF string)");}
+      writefln(" id '%s': name '%s'", n, m.name);

      if(m.data.flags & Weapon.Flags.Magical)
 	writefln("Magical");
@ -228,20 +225,14 @@ void main(char[][] args)
  if(scptList) foreach(a, b; scripts.names) writefln(a);
  if(ciList)
    foreach(a, b; cells.in_cells)
-    {
-      try writefln(a);
-      catch {writefln("(Invalid UTF string)");}
-    }
+      writefln(a);
  if(ceList)
    foreach(uint i, c; .cells.ex_cells)
    {
      int x, y;
      CellList.decompound(i, x, y);
      if(c.name.length)
-        {
-          try writefln("%s,%s: %s", x, y, c.name);
-          catch {writefln("(Invalid UTF string)");}
-        }
+        writefln("%s,%s: %s", x, y, c.name);
    }

  if(scptShow)
@ -346,11 +337,7 @@ void printRaw()
 		//subName == "SCTX") // For script text
 		//getHString();
 		{
-		  try{writefln("'%s'", getHString());}
-		  catch(UtfException e)
-		    {
-		      writefln("Got an UTF-ie, ", e);
-		    }
+		  writefln("'%s'", getHString());
 		}
 	      else if(subName == "FLTV" || subName == "XSCL") writefln(getHFloat());
 	      else if(subName == "INTV" /*|| subName == "NAM0"*/ || subName == "FRMR")
--- a/monster/util/string.d
+++ b/monster/util/string.d
@ -24,8 +24,8 @@

 module monster.util.string;

-import std.utf;
 import std.string;
+import std.utf;

 bool begins(char[] str, char[] start)
 {
@ -137,6 +137,81 @@ unittest
  assert("".iEnds(""));
 }

+// A specialized version of std.utf.decode(): returns false on invalid input instead of throwing, and advances idx only on success.
+private bool fdecode(char[] s, inout size_t idx)
+    {
+	size_t len = s.length;
+	dchar V;
+	size_t i = idx;
+	char u = s[i];
+
+	if (u & 0x80)
+	{   uint n;
+	    char u2;
+
+	    /* The following encodings are valid, except for the 5 and 6 byte
+	     * combinations:
+	     *	0xxxxxxx
+	     *	110xxxxx 10xxxxxx
+	     *	1110xxxx 10xxxxxx 10xxxxxx
+	     *	11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+	     *	111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+	     *	1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+	     */
+	    for (n = 1; ; n++)
+	    {
+		if (n > 4)
+		    return false;		// only do the first 4 of 6 encodings
+		if (((u << n) & 0x80) == 0)
+		{
+		    if (n == 1)
+			return false;
+		    break;
+		}
+	    }
+
+	    // Pick off (7 - n) significant bits of B from first byte of octet
+	    V = cast(dchar)(u & ((1 << (7 - n)) - 1));
+
+	    if (i + (n - 1) >= len)
+		return false;			// off end of string
+
+	    /* The following combinations are overlong, and illegal:
+	     *	1100000x (10xxxxxx)
+	     *	11100000 100xxxxx (10xxxxxx)
+	     *	11110000 1000xxxx (10xxxxxx 10xxxxxx)
+	     *	11111000 10000xxx (10xxxxxx 10xxxxxx 10xxxxxx)
+	     *	11111100 100000xx (10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx)
+	     */
+	    u2 = s[i + 1];
+	    if ((u & 0xFE) == 0xC0 ||
+		(u == 0xE0 && (u2 & 0xE0) == 0x80) ||
+		(u == 0xF0 && (u2 & 0xF0) == 0x80) ||
+		(u == 0xF8 && (u2 & 0xF8) == 0x80) ||
+		(u == 0xFC && (u2 & 0xFC) == 0x80))
+		return false;			// overlong combination
+
+	    for (uint j = 1; j != n; j++)
+	    {
+		u = s[i + j];
+		if ((u & 0xC0) != 0x80)
+		    return false;			// trailing bytes are 10xxxxxx
+		V = (V << 6) | (u & 0x3F);
+	    }
+	    if (!isValidDchar(V))
+		return false;
+	    i += n;
+	}
+	else
+	{
+	    V = cast(dchar) u;
+	    i++;
+	}
+
+	idx = i;
+	return true;
+    }
+
 // Converts any string to valid UTF8 so it can be safely printed. It
 // does not translate from other encodings but simply replaces invalid
 // characters with 'replace'. Does everything in place.
@ -144,11 +219,8 @@ char[] makeUTF8(char[] str, char replace = '?')
 {
  size_t idx = 0;
  while(idx < str.length)
-    {
-      try decode(str, idx);
-      catch(UtfException ue)
+    if(!fdecode(str, idx))
      str[idx++] = replace;
-    }
  return str;
 }

--- a/nif/extra.d
+++ b/nif/extra.d
@ -25,6 +25,8 @@ module nif.extra;
 import nif.record;
 import nif.controlled;

+import monster.util.string;
+
 abstract class Extra : Record
 {
  Extra extra;
@ -98,7 +100,7 @@ class NiTextKeyExtraData : Extra
 	  k.string = nifFile.getString;

 	  debug(verbose)
-	    writefln("  %d: %s @ %f ", i, k.string.clean().chomp(), k.time);
+	    writefln("  %d: %s @ %f ", i, makeUTF8(k.string).chomp(), k.time);
 	}
    }
 }
@ -120,7 +122,7 @@ class NiStringExtraData : Extra
      debug(verbose)
 	{
 	  // "NCO" means 'no collision', I think
-	  writefln("String: %s", string.clean());
+	  writefln("String: %s", makeUTF8(string));
 	}
    }
 }
--- a/nif/misc.d
+++ b/nif/misc.d
@ -24,26 +24,9 @@
 // This doesn't have to be part of the nif package at all.
 module nif.misc;

-import std.utf;
 import std.string;
 import monster.util.string;

-// Find an alternative to this
-char[] clean(char[] s)
-{
-  try{validate(s);}
-  catch(UtfException e)
-    {
-      return "(invalid utf-string)";
-    }
-  return s;
-}
-
-unittest
-{
-  assert(clean("abc æøå") == "abc æøå");
-}
-
 struct Vector
 {
  float array[3];