From 45ab6e1430348eb9c838384bd9a97a5a9cdfa658 Mon Sep 17 00:00:00 2001
From: Andrei Kortunov <andrei.kortunov@yandex.ru>
Date: Tue, 29 Oct 2019 11:05:18 +0400
Subject: [PATCH] Implement UTF-8 support for script parser (bug #4598)

---
 CHANGELOG.md                                 |   1 +
 apps/opencs/view/world/scripthighlighter.cpp |   5 +-
 components/compiler/scanner.cpp              | 117 +++++--------
 components/compiler/scanner.hpp              | 169 ++++++++++++++++++-
 4 files changed, 210 insertions(+), 82 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index a2e13e3f0a..590d7b616c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -35,6 +35,7 @@
     Bug #4456: AiActivate should not be cancelled after target activation
     Bug #4540: Rain delay when exiting water
     Bug #4594: Actors without AI packages don't use Hello dialogue
+    Bug #4598: Script parser does not support non-ASCII characters
     Bug #4600: Crash when no sound output is available or --no-sound is used.
     Bug #4639: Black screen after completing first mages guild mission + training
     Bug #4650: Focus is lost after pressing ESC in confirmation dialog inside savegame dialog
diff --git a/apps/opencs/view/world/scripthighlighter.cpp b/apps/opencs/view/world/scripthighlighter.cpp
index 3fb82fad8f..147beb82a8 100644
--- a/apps/opencs/view/world/scripthighlighter.cpp
+++ b/apps/opencs/view/world/scripthighlighter.cpp
@@ -65,7 +65,10 @@ void CSVWorld::ScriptHighlighter::parseEOF (Compiler::Scanner& scanner)
 
 void CSVWorld::ScriptHighlighter::highlight (const Compiler::TokenLoc& loc, Type type)
 {
-    int length = static_cast<int> (loc.mLiteral.size());
+    // Count UTF-8 characters, not bytes: continuation bytes (10xxxxxx) do not add to the length
+    int length = 0;
+    const char* token = loc.mLiteral.c_str();
+    while (*token) length += (*token++ & 0xc0) != 0x80;
 
     int index = loc.mColumn;
 
diff --git a/components/compiler/scanner.cpp b/components/compiler/scanner.cpp
index 65c050ce01..6d66b0493c 100644
--- a/components/compiler/scanner.cpp
+++ b/components/compiler/scanner.cpp
@@ -1,8 +1,6 @@
 #include "scanner.hpp"
 
 #include <cassert>
-#include <cctype>
-#include <sstream>
 #include <iterator>
 
 #include "exception.hpp"
@@ -14,14 +12,12 @@
 
 namespace Compiler
 {
-    bool Scanner::get (char& c)
+    bool Scanner::get (MultiChar& c)
     {
-        mStream.get (c);
-
-        if (!mStream.good())
+        if (!c.getFrom(mStream))
             return false;
 
-        mPrevLoc =mLoc;
+        mPrevLoc = mLoc;
 
         if (c=='\n')
         {
@@ -34,15 +30,15 @@ namespace Compiler
         else
         {
             ++mLoc.mColumn;
-            mLoc.mLiteral += c;
+            c.appendTo(mLoc.mLiteral);
         }
 
         return true;
     }
 
-    void Scanner::putback (char c)
+    void Scanner::putback (MultiChar& c)
     {
-        mStream.putback (c);
+        c.putback(mStream); // hands every byte of the character back to the stream
         mLoc = mPrevLoc;
     }
 
@@ -80,7 +76,7 @@ namespace Compiler
                 break;
         }
 
-        char c;
+        MultiChar c;
 
         if (!get (c))
         {
@@ -91,7 +87,7 @@ namespace Compiler
         {
             std::string comment;
 
-            comment += c;
+            c.appendTo(comment);
 
             while (get (c))
             {
@@ -101,7 +97,7 @@ namespace Compiler
                     break;
                 }
                 else
-                    comment += c;
+                    c.appendTo(comment);
             }
 
             TokenLoc loc (mLoc);
@@ -109,7 +105,7 @@ namespace Compiler
 
             return parser.parseComment (comment, loc, *this);
         }
-        else if (isWhitespace (c))
+        else if (c.isWhitespace())
         {
             mLoc.mLiteral.clear();
             return true;
@@ -120,7 +116,7 @@ namespace Compiler
             mLoc.mLiteral.clear();
             return true;
         }
-        else if (std::isalpha (c) || c=='_' || c=='"')
+        else if (c.isAlpha() || c=='_' || c=='"')
         {
             bool cont = false;
 
@@ -130,7 +126,7 @@ namespace Compiler
                 return cont;
             }
         }
-        else if (std::isdigit (c))
+        else if (c.isDigit())
         {
             bool cont = false;
 
@@ -162,24 +158,24 @@ namespace Compiler
         throw SourceException();
     }
 
-    bool Scanner::scanInt (char c, Parser& parser, bool& cont)
+    bool Scanner::scanInt (MultiChar& c, Parser& parser, bool& cont)
     {
         assert(c != '\0');
         std::string value;
-        value += c;
+        c.appendTo(value);
 
         bool error = false;
 
         while (get (c))
         {
-            if (std::isdigit (c))
+            if (c.isDigit())
             {
-                value += c;
+                c.appendTo(value);
             }
-            else if (c!='-' && isStringCharacter (c))
+            else if (!c.isMinusSign() && isStringCharacter (c))
             {
                 error = true;
-                value += c;
+                c.appendTo(value);
             }
             else if (c=='.')
             {
@@ -224,19 +220,19 @@ namespace Compiler
     {
         std::string value = intValue + ".";
 
-        char c;
+        MultiChar c;
 
         bool empty = intValue.empty() || intValue=="-";
         bool error = false;
 
         while (get (c))
         {
-            if (std::isdigit (c))
+            if (c.isDigit())
             {
-                value += c;
+                c.appendTo(value);
                 empty = false;
             }
-            else if (std::isalpha (c) || c=='_')
+            else if (c.isAlpha() || c=='_')
                 error = true;
             else
             {
@@ -279,10 +275,10 @@ namespace Compiler
         0
     };
 
-    bool Scanner::scanName (char c, Parser& parser, bool& cont)
+    bool Scanner::scanName (MultiChar& c, Parser& parser, bool& cont)
     {
         std::string name;
-        name += c;
+        c.appendTo(name);
 
         if (!scanName (name))
             return false;
@@ -315,8 +311,8 @@ namespace Compiler
 
         // Russian localization and some mods use a quirk - add newline character directly
         // to compiled bytecode via HEX-editor to implement multiline messageboxes.
-        // Of course, original editor will not compile such script.
-        // Allow messageboxes to bybass the "incomplete string or name" error.
+        // Of course, original editor can not compile such script.
+        // Allow messageboxes to bypass the "incomplete string or name" error.
         if (lowerCase == "messagebox")
             enableIgnoreNewlines();
         else if (isKeyword)
@@ -344,7 +340,7 @@ namespace Compiler
 
     bool Scanner::scanName (std::string& name)
     {
-        char c;
+        MultiChar c;
         bool error = false;
 
         while (get (c))
@@ -353,7 +349,7 @@ namespace Compiler
             {
                 if (c=='"')
                 {
-                    name += c;
+                    c.appendTo(name);
                     break;
                 }
 // ignoring escape sequences for now, because they are messing up stupid Windows path names.
@@ -380,20 +376,20 @@ namespace Compiler
             }
             else if (!(c=='"' && name.empty()))
             {
-                if (!isStringCharacter (c) && !(mTolerantNames && (c=='.' || c=='-')))
+                if (!isStringCharacter (c) && !(mTolerantNames && (c=='.' || c == '-')))
                 {
                     putback (c);
                     break;
                 }
             }
 
-            name += c;
+            c.appendTo(name);
         }
 
         return !error;
     }
 
-    bool Scanner::scanSpecial (char c, Parser& parser, bool& cont)
+    bool Scanner::scanSpecial (MultiChar& c, Parser& parser, bool& cont)
     {
         int special = -1;
 
@@ -410,7 +406,7 @@ namespace Compiler
             {
                 putback (c);
 
-                if (std::isdigit (c))
+                if (c.isDigit())
                     return scanFloat ("", parser, cont);
             }
 
@@ -428,7 +424,7 @@ namespace Compiler
                 else if (c == '>' || c == '<')  // Treat => and =< as ==
                 {
                     special = S_cmpEQ;
-                    mErrorHandler.warning (std::string("invalid operator =") + c + ", treating it as ==", mLoc);
+                    mErrorHandler.warning (std::string("invalid operator =") + c.data() + ", treating it as ==", mLoc);
                 }
                 else
                 {
@@ -463,7 +459,7 @@ namespace Compiler
             else
                 return false;
         }
-        else if (c=='-')
+        else if (c.isMinusSign())
         {
             if (get (c))
             {
@@ -478,32 +474,6 @@ namespace Compiler
             else
                 special = S_minus;
         }
-        else if (static_cast<unsigned char> (c)==0xe2)
-        {
-            /// Workaround for some translator who apparently can't keep his minus in order
-            /// \todo disable for later script formats
-            if (get (c) && static_cast<unsigned char> (c)==0x80 &&
-                get (c) && static_cast<unsigned char> (c)==0x93)
-            {
-                if (get (c))
-                {
-                    if (c=='>')
-                        special = S_ref;
-                    else
-                    {
-                        putback (c);
-                        special = S_minus;
-                    }
-                }
-                else
-                    special = S_minus;
-            }
-            else
-            {
-                mErrorHandler.error ("Invalid character", mLoc);
-                return false;
-            }
-        }
         else if (c=='<')
         {
             if (get (c))
@@ -582,20 +552,21 @@ namespace Compiler
         return true;
     }
 
-    bool Scanner::isStringCharacter (char c, bool lookAhead)
+    bool Scanner::isStringCharacter (MultiChar& c, bool lookAhead)
     {
-        return std::isalpha (c) || std::isdigit (c) || c=='_' ||
-            /// \todo disable this when doing more stricter compiling
-            c=='`' || c=='\'' ||
+        if (lookAhead && c.isMinusSign()) // a minus sign counts only if a name character follows it
+        {
             /// \todo disable this when doing more stricter compiling. Also, find out who is
             /// responsible for allowing it in the first place and meet up with that person in
             /// a dark alley.
-            (c=='-' && (!lookAhead || isStringCharacter (mStream.peek(), false)));
-    }
+            MultiChar next; // filled by peek(), which rewinds the stream after a successful read
+            if (next.peek(mStream) && isStringCharacter (next, false))
+                return true;
+        }
 
-    bool Scanner::isWhitespace (char c)
-    {
-        return c==' ' || c=='\t';
+        return c.isAlpha() || c.isDigit() || c=='_' ||
+            /// \todo disable this when doing more stricter compiling
+            c=='`' || c=='\'';
     }
 
     // constructor
diff --git a/components/compiler/scanner.hpp b/components/compiler/scanner.hpp
index a431cabb29..c8a528348a 100644
--- a/components/compiler/scanner.hpp
+++ b/components/compiler/scanner.hpp
@@ -1,9 +1,11 @@
 #ifndef COMPILER_SCANNER_H_INCLUDED
 #define COMPILER_SCANNER_H_INCLUDED
 
+#include <cctype>
 #include <string>
 #include <iosfwd>
 #include <vector>
+#include <sstream>
 
 #include "tokenloc.hpp"
 
@@ -18,6 +20,159 @@ namespace Compiler
     /// This class translate a char-stream to a token stream (delivered via
     /// parser-callbacks).
 
+    /// \brief A single UTF-8 encoded character (one to four bytes) as read by the scanner.
+    /// Bytes of the buffer that are not in use are kept at zero, which is how the comparison
+    /// helpers tell a plain ASCII character from a multibyte sequence.
+    class MultiChar
+    {
+    public:
+        MultiChar()
+        {
+            blank();
+        }
+
+        /// Build a character from its first byte; any continuation bytes stay zero.
+        MultiChar(const char ch)
+        {
+            blank();
+            mData[0] = ch;
+            mLength = getCharLength(ch);
+        }
+
+        /// \return Number of continuation bytes announced by the lead byte \a ch
+        /// (0 for ASCII), or -1 if \a ch can not start a UTF-8 sequence.
+        static int getCharLength(const char ch)
+        {
+            unsigned char c = ch;
+            if (c<=127) return 0;
+            else if ((c & 0xE0) == 0xC0) return 1;
+            else if ((c & 0xF0) == 0xE0) return 2;
+            else if ((c & 0xF8) == 0xF0) return 3;
+            else return -1;
+        }
+
+        bool operator== (const char ch) const
+        {
+            return mData[0]==ch && isSingleByte();
+        }
+
+        bool operator== (const MultiChar& ch) const
+        {
+            return mData[0]==ch.mData[0] && mData[1]==ch.mData[1] && mData[2]==ch.mData[2] && mData[3]==ch.mData[3];
+        }
+
+        bool operator!= (const char ch) const
+        {
+            return !(*this==ch);
+        }
+
+        bool isWhitespace() const
+        {
+            return (mData[0]==' ' || mData[0]=='\t') && isSingleByte();
+        }
+
+        bool isDigit() const
+        {
+            // <cctype> functions are undefined for negative values, so go through unsigned char.
+            return std::isdigit(static_cast<unsigned char>(mData[0])) && isSingleByte();
+        }
+
+        /// \return true for the ASCII hyphen-minus and for the en dash (U+2013, bytes E2 80 93),
+        /// which the scanner accepts in place of a minus sign.
+        bool isMinusSign() const
+        {
+            if (mData[0] == '-' && isSingleByte())
+                return true;
+            return mData[0] == '\xe2' && mData[1] == '\x80' && mData[2] == '\x93' && mData[3] == 0;
+        }
+
+        /// \return true for ASCII letters and for every multibyte character except the en dash.
+        bool isAlpha() const
+        {
+            if (isMinusSign())
+                return false;
+            return std::isalpha(static_cast<unsigned char>(mData[0])) || !isSingleByte();
+        }
+
+        void appendTo(std::string& str) const
+        {
+            for (int i = 0; i <= mLength; i++)
+                str += mData[i];
+        }
+
+        /// Return the bytes to \a in, last byte first, so that they are read again in order.
+        void putback (std::istream& in) const
+        {
+            for (int i = mLength; i >= 0; i--)
+                in.putback (mData[i]);
+        }
+
+        /// Read one complete character from \a in.
+        /// \return false on end of stream, read failure or an invalid lead byte.
+        bool getFrom(std::istream& in)
+        {
+            blank();
+
+            char ch = static_cast<char>(in.peek());
+
+            if (!in.good())
+                return false;
+
+            int length = getCharLength(ch);
+            if (length < 0) return false;
+
+            for (int i = 0; i <= length; i++)
+            {
+                in.get (ch);
+
+                if (!in.good())
+                    return false;
+
+                mData[i] = ch;
+            }
+
+            mLength = length;
+
+            return true;
+        }
+
+        /// Read the next character like getFrom(), then rewind \a in to where it was.
+        /// \note The stream is not rewound if reading fails.
+        bool peek(std::istream& in)
+        {
+            std::streampos p_orig = in.tellg();
+
+            if (!getFrom(in))
+                return false;
+
+            in.seekg(p_orig);
+            return true;
+        }
+
+        void blank()
+        {
+            for (unsigned int i = 0; i < sizeof(mData); i++)
+                mData[i] = 0;
+            mLength = -1;
+        }
+
+        std::string data() const
+        {
+            // mData is not null-terminated when all four bytes are in use, so the length
+            // has to be given explicitly (mLength is the index of the last byte).
+            return std::string(mData, mLength + 1);
+        }
+
+    private:
+        bool isSingleByte() const
+        {
+            return mData[1]==0 && mData[2]==0 && mData[3]==0;
+        }
+
+        char mData[4];
+        int mLength;
+    };
+
     class Scanner
     {
             enum putback_type
@@ -79,26 +234,24 @@ namespace Compiler
             Scanner (const Scanner&);
             Scanner& operator= (const Scanner&);
 
-            bool get (char& c);
+            bool get (MultiChar& c);
 
-            void putback (char c);
+            void putback (MultiChar& c);
 
             bool scanToken (Parser& parser);
 
-            bool scanInt (char c, Parser& parser, bool& cont);
+            bool scanInt (MultiChar& c, Parser& parser, bool& cont);
 
             bool scanFloat (const std::string& intValue, Parser& parser, bool& cont);
 
-            bool scanName (char c, Parser& parser, bool& cont);
+            bool scanName (MultiChar& c, Parser& parser, bool& cont);
 
             /// \param name May contain the start of the name (one or more characters)
             bool scanName (std::string& name);
 
-            bool scanSpecial (char c, Parser& parser, bool& cont);
+            bool scanSpecial (MultiChar& c, Parser& parser, bool& cont);
 
-            bool isStringCharacter (char c, bool lookAhead = true);
-
-            static bool isWhitespace (char c);
+            bool isStringCharacter (MultiChar& c, bool lookAhead = true);
 
         public: