From 45ab6e1430348eb9c838384bd9a97a5a9cdfa658 Mon Sep 17 00:00:00 2001 From: Andrei Kortunov Date: Tue, 29 Oct 2019 11:05:18 +0400 Subject: [PATCH] Implement UTF-8 support for script parser (bug #4598) --- CHANGELOG.md | 1 + apps/opencs/view/world/scripthighlighter.cpp | 5 +- components/compiler/scanner.cpp | 117 +++++-------- components/compiler/scanner.hpp | 169 ++++++++++++++++++- 4 files changed, 210 insertions(+), 82 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a2e13e3f0..590d7b616 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -35,6 +35,7 @@ Bug #4456: AiActivate should not be cancelled after target activation Bug #4540: Rain delay when exiting water Bug #4594: Actors without AI packages don't use Hello dialogue + Bug #4598: Script parser does not support non-ASCII characters Bug #4600: Crash when no sound output is available or --no-sound is used. Bug #4639: Black screen after completing first mages guild mission + training Bug #4650: Focus is lost after pressing ESC in confirmation dialog inside savegame dialog diff --git a/apps/opencs/view/world/scripthighlighter.cpp b/apps/opencs/view/world/scripthighlighter.cpp index 3fb82fad8..147beb82a 100644 --- a/apps/opencs/view/world/scripthighlighter.cpp +++ b/apps/opencs/view/world/scripthighlighter.cpp @@ -65,7 +65,10 @@ void CSVWorld::ScriptHighlighter::parseEOF (Compiler::Scanner& scanner) void CSVWorld::ScriptHighlighter::highlight (const Compiler::TokenLoc& loc, Type type) { - int length = static_cast<int> (loc.mLiteral.size()); + // We should take into account multibyte characters + int length = 0; + const char* token = loc.mLiteral.c_str(); + while (*token) length += (*token++ & 0xc0) != 0x80; int index = loc.mColumn; diff --git a/components/compiler/scanner.cpp b/components/compiler/scanner.cpp index 65c050ce0..6d66b0493 100644 --- a/components/compiler/scanner.cpp +++ b/components/compiler/scanner.cpp @@ -1,8 +1,6 @@ #include "scanner.hpp" #include -#include -#include #include #include 
"exception.hpp" @@ -14,14 +12,12 @@ namespace Compiler { - bool Scanner::get (char& c) + bool Scanner::get (MultiChar& c) { - mStream.get (c); - - if (!mStream.good()) + if (!c.getFrom(mStream)) return false; - mPrevLoc =mLoc; + mPrevLoc = mLoc; if (c=='\n') { @@ -34,15 +30,15 @@ namespace Compiler else { ++mLoc.mColumn; - mLoc.mLiteral += c; + c.appendTo(mLoc.mLiteral); } return true; } - void Scanner::putback (char c) + void Scanner::putback (MultiChar& c) { - mStream.putback (c); + c.putback(mStream); mLoc = mPrevLoc; } @@ -80,7 +76,7 @@ namespace Compiler break; } - char c; + MultiChar c; if (!get (c)) { @@ -91,7 +87,7 @@ namespace Compiler { std::string comment; - comment += c; + c.appendTo(comment); while (get (c)) { @@ -101,7 +97,7 @@ namespace Compiler break; } else - comment += c; + c.appendTo(comment); } TokenLoc loc (mLoc); @@ -109,7 +105,7 @@ namespace Compiler return parser.parseComment (comment, loc, *this); } - else if (isWhitespace (c)) + else if (c.isWhitespace()) { mLoc.mLiteral.clear(); return true; @@ -120,7 +116,7 @@ namespace Compiler mLoc.mLiteral.clear(); return true; } - else if (std::isalpha (c) || c=='_' || c=='"') + else if (c.isAlpha() || c=='_' || c=='"') { bool cont = false; @@ -130,7 +126,7 @@ namespace Compiler return cont; } } - else if (std::isdigit (c)) + else if (c.isDigit()) { bool cont = false; @@ -162,24 +158,24 @@ namespace Compiler throw SourceException(); } - bool Scanner::scanInt (char c, Parser& parser, bool& cont) + bool Scanner::scanInt (MultiChar& c, Parser& parser, bool& cont) { assert(c != '\0'); std::string value; - value += c; + c.appendTo(value); bool error = false; while (get (c)) { - if (std::isdigit (c)) + if (c.isDigit()) { - value += c; + c.appendTo(value); } - else if (c!='-' && isStringCharacter (c)) + else if (!c.isMinusSign() && isStringCharacter (c)) { error = true; - value += c; + c.appendTo(value); } else if (c=='.') { @@ -224,19 +220,19 @@ namespace Compiler { std::string value = intValue + "."; - 
char c; + MultiChar c; bool empty = intValue.empty() || intValue=="-"; bool error = false; while (get (c)) { - if (std::isdigit (c)) + if (c.isDigit()) { - value += c; + c.appendTo(value); empty = false; } - else if (std::isalpha (c) || c=='_') + else if (c.isAlpha() || c=='_') error = true; else { @@ -279,10 +275,10 @@ namespace Compiler 0 }; - bool Scanner::scanName (char c, Parser& parser, bool& cont) + bool Scanner::scanName (MultiChar& c, Parser& parser, bool& cont) { std::string name; - name += c; + c.appendTo(name); if (!scanName (name)) return false; @@ -315,8 +311,8 @@ namespace Compiler // Russian localization and some mods use a quirk - add newline character directly // to compiled bytecode via HEX-editor to implement multiline messageboxes. - // Of course, original editor will not compile such script. - // Allow messageboxes to bybass the "incomplete string or name" error. + // Of course, the original editor cannot compile such a script. + // Allow messageboxes to bypass the "incomplete string or name" error. if (lowerCase == "messagebox") enableIgnoreNewlines(); else if (isKeyword) @@ -344,7 +340,7 @@ namespace Compiler bool Scanner::scanName (std::string& name) { - char c; + MultiChar c; bool error = false; while (get (c)) @@ -353,7 +349,7 @@ namespace Compiler { if (c=='"') { - name += c; + c.appendTo(name); break; } // ignoring escape sequences for now, because they are messing up stupid Windows path names. @@ -380,20 +376,20 @@ namespace Compiler } else if (!(c=='"' && name.empty())) { - if (!isStringCharacter (c) && !(mTolerantNames && (c=='.' || c=='-'))) + if (!isStringCharacter (c) && !(mTolerantNames && (c=='.' 
|| c == '-'))) { putback (c); break; } } - name += c; + c.appendTo(name); } return !error; } - bool Scanner::scanSpecial (char c, Parser& parser, bool& cont) + bool Scanner::scanSpecial (MultiChar& c, Parser& parser, bool& cont) { int special = -1; @@ -410,7 +406,7 @@ namespace Compiler { putback (c); - if (std::isdigit (c)) + if (c.isDigit()) return scanFloat ("", parser, cont); } @@ -428,7 +424,7 @@ namespace Compiler else if (c == '>' || c == '<') // Treat => and =< as == { special = S_cmpEQ; - mErrorHandler.warning (std::string("invalid operator =") + c + ", treating it as ==", mLoc); + mErrorHandler.warning (std::string("invalid operator =") + c.data() + ", treating it as ==", mLoc); } else { @@ -463,7 +459,7 @@ namespace Compiler else return false; } - else if (c=='-') + else if (c.isMinusSign()) { if (get (c)) { @@ -478,32 +474,6 @@ namespace Compiler else special = S_minus; } - else if (static_cast (c)==0xe2) - { - /// Workaround for some translator who apparently can't keep his minus in order - /// \todo disable for later script formats - if (get (c) && static_cast (c)==0x80 && - get (c) && static_cast (c)==0x93) - { - if (get (c)) - { - if (c=='>') - special = S_ref; - else - { - putback (c); - special = S_minus; - } - } - else - special = S_minus; - } - else - { - mErrorHandler.error ("Invalid character", mLoc); - return false; - } - } else if (c=='<') { if (get (c)) @@ -582,20 +552,21 @@ namespace Compiler return true; } - bool Scanner::isStringCharacter (char c, bool lookAhead) + bool Scanner::isStringCharacter (MultiChar& c, bool lookAhead) { - return std::isalpha (c) || std::isdigit (c) || c=='_' || - /// \todo disable this when doing more stricter compiling - c=='`' || c=='\'' || + if (lookAhead && c.isMinusSign()) + { /// \todo disable this when doing more stricter compiling. Also, find out who is /// responsible for allowing it in the first place and meet up with that person in /// a dark alley. 
- (c=='-' && (!lookAhead || isStringCharacter (mStream.peek(), false))); - } + MultiChar next; + if (next.peek(mStream) && isStringCharacter (next, false)) + return true; + } - bool Scanner::isWhitespace (char c) - { - return c==' ' || c=='\t'; + return c.isAlpha() || c.isDigit() || c=='_' || + /// \todo disable this when doing more stricter compiling + c=='`' || c=='\''; } // constructor diff --git a/components/compiler/scanner.hpp b/components/compiler/scanner.hpp index a431cabb2..c8a528348 100644 --- a/components/compiler/scanner.hpp +++ b/components/compiler/scanner.hpp @@ -1,9 +1,11 @@ #ifndef COMPILER_SCANNER_H_INCLUDED #define COMPILER_SCANNER_H_INCLUDED +#include #include #include #include +#include #include "tokenloc.hpp" @@ -18,6 +20,159 @@ namespace Compiler /// This class translate a char-stream to a token stream (delivered via /// parser-callbacks). + class MultiChar + { + public: + MultiChar() + { + blank(); + } + + MultiChar(const char ch) + { + blank(); + mData[0] = ch; + + mLength = getCharLength(ch); + } + + int getCharLength(const char ch) + { + unsigned char c = ch; + if (c<=127) return 0; + else if ((c & 0xE0) == 0xC0) return 1; + else if ((c & 0xF0) == 0xE0) return 2; + else if ((c & 0xF8) == 0xF0) return 3; + else return -1; + } + + bool operator== (const char ch) + { + return mData[0]==ch && mData[1]==0 && mData[2]==0 && mData[3]==0; + } + + bool operator== (const MultiChar& ch) + { + return mData[0]==ch.mData[0] && mData[1]==ch.mData[1] && mData[2]==ch.mData[2] && mData[3]==ch.mData[3]; + } + + bool operator!= (const char ch) + { + return mData[0]!=ch || mData[1]!=0 || mData[2]!=0 || mData[3]!=0; + } + + bool isWhitespace() + { + return (mData[0]==' ' || mData[0]=='\t') && mData[1]==0 && mData[2]==0 && mData[3]==0; + } + + bool isDigit() + { + return std::isdigit(mData[0]) && mData[1]==0 && mData[2]==0 && mData[3]==0; + } + + bool isMinusSign() + { + if (mData[0] == '-' && mData[1] == 0 && mData[2] == 0 && mData[3] == 0) + return true; + + 
return mData[0] == '\xe2' && mData[1] == '\x80' && mData[2] == '\x93' && mData[3] == 0; + } + + bool isAlpha() + { + if (isMinusSign()) + return false; + + return std::isalpha(mData[0]) || mData[1]!=0 || mData[2]!=0 || mData[3]!=0; + } + + void appendTo(std::string& str) + { + for (int i = 0; i <= mLength; i++) + str += mData[i]; + } + + void putback (std::istream& in) + { + for (int i = mLength; i >= 0; i--) + in.putback (mData[i]); + } + + bool getFrom(std::istream& in) + { + blank(); + + char ch = in.peek(); + + if (!in.good()) + return false; + + int length = getCharLength(ch); + if (length < 0) return false; + + for (int i = 0; i <= length; i++) + { + in.get (ch); + + if (!in.good()) + return false; + + mData[i] = ch; + } + + mLength = length; + + return true; + } + + bool peek(std::istream& in) + { + std::streampos p_orig = in.tellg(); + + char ch = in.peek(); + + if (!in.good()) + return false; + + int length = getCharLength(ch); + if (length < 0) return false; + + for (int i = 0; i <= length; i++) + { + if (length >= i) + { + in.get (ch); + + if (!in.good()) + return false; + + mData[i] = ch; + } + } + + mLength = length; + + in.seekg(p_orig); + return true; + }; + + void blank() + { + std::fill(mData, mData + sizeof(mData), 0); + mLength = -1; + } + + std::string data() + { + return mData; + } + + private: + char mData[4]; + int mLength; + }; + class Scanner { enum putback_type @@ -79,26 +234,24 @@ namespace Compiler Scanner (const Scanner&); Scanner& operator= (const Scanner&); - bool get (char& c); + bool get (MultiChar& c); - void putback (char c); + void putback (MultiChar& c); bool scanToken (Parser& parser); - bool scanInt (char c, Parser& parser, bool& cont); + bool scanInt (MultiChar& c, Parser& parser, bool& cont); bool scanFloat (const std::string& intValue, Parser& parser, bool& cont); - bool scanName (char c, Parser& parser, bool& cont); + bool scanName (MultiChar& c, Parser& parser, bool& cont); /// \param name May contain the start of the 
name (one or more characters) bool scanName (std::string& name); - bool scanSpecial (char c, Parser& parser, bool& cont); - - bool isStringCharacter (char c, bool lookAhead = true); + bool scanSpecial (MultiChar& c, Parser& parser, bool& cont); - static bool isWhitespace (char c); + bool isStringCharacter (MultiChar& c, bool lookAhead = true); public: