Implement UTF-8 support for script parser (bug #4598)

pull/2575/head
Andrei Kortunov 5 years ago
parent fe2f4bcedc
commit 45ab6e1430

@ -35,6 +35,7 @@
Bug #4456: AiActivate should not be cancelled after target activation Bug #4456: AiActivate should not be cancelled after target activation
Bug #4540: Rain delay when exiting water Bug #4540: Rain delay when exiting water
Bug #4594: Actors without AI packages don't use Hello dialogue Bug #4594: Actors without AI packages don't use Hello dialogue
Bug #4598: Script parser does not support non-ASCII characters
Bug #4600: Crash when no sound output is available or --no-sound is used. Bug #4600: Crash when no sound output is available or --no-sound is used.
Bug #4639: Black screen after completing first mages guild mission + training Bug #4639: Black screen after completing first mages guild mission + training
Bug #4650: Focus is lost after pressing ESC in confirmation dialog inside savegame dialog Bug #4650: Focus is lost after pressing ESC in confirmation dialog inside savegame dialog

@ -65,7 +65,10 @@ void CSVWorld::ScriptHighlighter::parseEOF (Compiler::Scanner& scanner)
void CSVWorld::ScriptHighlighter::highlight (const Compiler::TokenLoc& loc, Type type) void CSVWorld::ScriptHighlighter::highlight (const Compiler::TokenLoc& loc, Type type)
{ {
int length = static_cast<int> (loc.mLiteral.size()); // We should take into account multibyte characters
int length = 0;
const char* token = loc.mLiteral.c_str();
while (*token) length += (*token++ & 0xc0) != 0x80;
int index = loc.mColumn; int index = loc.mColumn;

@ -1,8 +1,6 @@
#include "scanner.hpp" #include "scanner.hpp"
#include <cassert> #include <cassert>
#include <cctype>
#include <sstream>
#include <iterator> #include <iterator>
#include "exception.hpp" #include "exception.hpp"
@ -14,14 +12,12 @@
namespace Compiler namespace Compiler
{ {
bool Scanner::get (char& c) bool Scanner::get (MultiChar& c)
{ {
mStream.get (c); if (!c.getFrom(mStream))
if (!mStream.good())
return false; return false;
mPrevLoc =mLoc; mPrevLoc = mLoc;
if (c=='\n') if (c=='\n')
{ {
@ -34,15 +30,15 @@ namespace Compiler
else else
{ {
++mLoc.mColumn; ++mLoc.mColumn;
mLoc.mLiteral += c; c.appendTo(mLoc.mLiteral);
} }
return true; return true;
} }
void Scanner::putback (char c) void Scanner::putback (MultiChar& c)
{ {
mStream.putback (c); c.putback(mStream);
mLoc = mPrevLoc; mLoc = mPrevLoc;
} }
@ -80,7 +76,7 @@ namespace Compiler
break; break;
} }
char c; MultiChar c;
if (!get (c)) if (!get (c))
{ {
@ -91,7 +87,7 @@ namespace Compiler
{ {
std::string comment; std::string comment;
comment += c; c.appendTo(comment);
while (get (c)) while (get (c))
{ {
@ -101,7 +97,7 @@ namespace Compiler
break; break;
} }
else else
comment += c; c.appendTo(comment);
} }
TokenLoc loc (mLoc); TokenLoc loc (mLoc);
@ -109,7 +105,7 @@ namespace Compiler
return parser.parseComment (comment, loc, *this); return parser.parseComment (comment, loc, *this);
} }
else if (isWhitespace (c)) else if (c.isWhitespace())
{ {
mLoc.mLiteral.clear(); mLoc.mLiteral.clear();
return true; return true;
@ -120,7 +116,7 @@ namespace Compiler
mLoc.mLiteral.clear(); mLoc.mLiteral.clear();
return true; return true;
} }
else if (std::isalpha (c) || c=='_' || c=='"') else if (c.isAlpha() || c=='_' || c=='"')
{ {
bool cont = false; bool cont = false;
@ -130,7 +126,7 @@ namespace Compiler
return cont; return cont;
} }
} }
else if (std::isdigit (c)) else if (c.isDigit())
{ {
bool cont = false; bool cont = false;
@ -162,24 +158,24 @@ namespace Compiler
throw SourceException(); throw SourceException();
} }
bool Scanner::scanInt (char c, Parser& parser, bool& cont) bool Scanner::scanInt (MultiChar& c, Parser& parser, bool& cont)
{ {
assert(c != '\0'); assert(c != '\0');
std::string value; std::string value;
value += c; c.appendTo(value);
bool error = false; bool error = false;
while (get (c)) while (get (c))
{ {
if (std::isdigit (c)) if (c.isDigit())
{ {
value += c; c.appendTo(value);
} }
else if (c!='-' && isStringCharacter (c)) else if (!c.isMinusSign() && isStringCharacter (c))
{ {
error = true; error = true;
value += c; c.appendTo(value);
} }
else if (c=='.') else if (c=='.')
{ {
@ -224,19 +220,19 @@ namespace Compiler
{ {
std::string value = intValue + "."; std::string value = intValue + ".";
char c; MultiChar c;
bool empty = intValue.empty() || intValue=="-"; bool empty = intValue.empty() || intValue=="-";
bool error = false; bool error = false;
while (get (c)) while (get (c))
{ {
if (std::isdigit (c)) if (c.isDigit())
{ {
value += c; c.appendTo(value);
empty = false; empty = false;
} }
else if (std::isalpha (c) || c=='_') else if (c.isAlpha() || c=='_')
error = true; error = true;
else else
{ {
@ -279,10 +275,10 @@ namespace Compiler
0 0
}; };
bool Scanner::scanName (char c, Parser& parser, bool& cont) bool Scanner::scanName (MultiChar& c, Parser& parser, bool& cont)
{ {
std::string name; std::string name;
name += c; c.appendTo(name);
if (!scanName (name)) if (!scanName (name))
return false; return false;
@ -315,8 +311,8 @@ namespace Compiler
// Russian localization and some mods use a quirk - add newline character directly // Russian localization and some mods use a quirk - add newline character directly
// to compiled bytecode via HEX-editor to implement multiline messageboxes. // to compiled bytecode via HEX-editor to implement multiline messageboxes.
// Of course, original editor will not compile such script. // Of course, original editor can not compile such script.
// Allow messageboxes to bybass the "incomplete string or name" error. // Allow messageboxes to bypass the "incomplete string or name" error.
if (lowerCase == "messagebox") if (lowerCase == "messagebox")
enableIgnoreNewlines(); enableIgnoreNewlines();
else if (isKeyword) else if (isKeyword)
@ -344,7 +340,7 @@ namespace Compiler
bool Scanner::scanName (std::string& name) bool Scanner::scanName (std::string& name)
{ {
char c; MultiChar c;
bool error = false; bool error = false;
while (get (c)) while (get (c))
@ -353,7 +349,7 @@ namespace Compiler
{ {
if (c=='"') if (c=='"')
{ {
name += c; c.appendTo(name);
break; break;
} }
// ignoring escape sequences for now, because they are messing up stupid Windows path names. // ignoring escape sequences for now, because they are messing up stupid Windows path names.
@ -380,20 +376,20 @@ namespace Compiler
} }
else if (!(c=='"' && name.empty())) else if (!(c=='"' && name.empty()))
{ {
if (!isStringCharacter (c) && !(mTolerantNames && (c=='.' || c=='-'))) if (!isStringCharacter (c) && !(mTolerantNames && (c=='.' || c == '-')))
{ {
putback (c); putback (c);
break; break;
} }
} }
name += c; c.appendTo(name);
} }
return !error; return !error;
} }
bool Scanner::scanSpecial (char c, Parser& parser, bool& cont) bool Scanner::scanSpecial (MultiChar& c, Parser& parser, bool& cont)
{ {
int special = -1; int special = -1;
@ -410,7 +406,7 @@ namespace Compiler
{ {
putback (c); putback (c);
if (std::isdigit (c)) if (c.isDigit())
return scanFloat ("", parser, cont); return scanFloat ("", parser, cont);
} }
@ -428,7 +424,7 @@ namespace Compiler
else if (c == '>' || c == '<') // Treat => and =< as == else if (c == '>' || c == '<') // Treat => and =< as ==
{ {
special = S_cmpEQ; special = S_cmpEQ;
mErrorHandler.warning (std::string("invalid operator =") + c + ", treating it as ==", mLoc); mErrorHandler.warning (std::string("invalid operator =") + c.data() + ", treating it as ==", mLoc);
} }
else else
{ {
@ -463,7 +459,7 @@ namespace Compiler
else else
return false; return false;
} }
else if (c=='-') else if (c.isMinusSign())
{ {
if (get (c)) if (get (c))
{ {
@ -478,32 +474,6 @@ namespace Compiler
else else
special = S_minus; special = S_minus;
} }
else if (static_cast<unsigned char> (c)==0xe2)
{
/// Workaround for some translator who apparently can't keep his minus in order
/// \todo disable for later script formats
if (get (c) && static_cast<unsigned char> (c)==0x80 &&
get (c) && static_cast<unsigned char> (c)==0x93)
{
if (get (c))
{
if (c=='>')
special = S_ref;
else
{
putback (c);
special = S_minus;
}
}
else
special = S_minus;
}
else
{
mErrorHandler.error ("Invalid character", mLoc);
return false;
}
}
else if (c=='<') else if (c=='<')
{ {
if (get (c)) if (get (c))
@ -582,20 +552,21 @@ namespace Compiler
return true; return true;
} }
bool Scanner::isStringCharacter (char c, bool lookAhead) bool Scanner::isStringCharacter (MultiChar& c, bool lookAhead)
{ {
return std::isalpha (c) || std::isdigit (c) || c=='_' || if (lookAhead && c.isMinusSign())
/// \todo disable this when doing more stricter compiling {
c=='`' || c=='\'' ||
/// \todo disable this when doing more stricter compiling. Also, find out who is /// \todo disable this when doing more stricter compiling. Also, find out who is
/// responsible for allowing it in the first place and meet up with that person in /// responsible for allowing it in the first place and meet up with that person in
/// a dark alley. /// a dark alley.
(c=='-' && (!lookAhead || isStringCharacter (mStream.peek(), false))); MultiChar next;
} if (next.peek(mStream) && isStringCharacter (next, false))
return true;
}
bool Scanner::isWhitespace (char c) return c.isAlpha() || c.isDigit() || c=='_' ||
{ /// \todo disable this when doing more stricter compiling
return c==' ' || c=='\t'; c=='`' || c=='\'';
} }
// constructor // constructor

@ -1,9 +1,11 @@
#ifndef COMPILER_SCANNER_H_INCLUDED #ifndef COMPILER_SCANNER_H_INCLUDED
#define COMPILER_SCANNER_H_INCLUDED #define COMPILER_SCANNER_H_INCLUDED
#include <cctype>
#include <string> #include <string>
#include <iosfwd> #include <iosfwd>
#include <vector> #include <vector>
#include <sstream>
#include "tokenloc.hpp" #include "tokenloc.hpp"
@ -18,6 +20,159 @@ namespace Compiler
/// This class translate a char-stream to a token stream (delivered via /// This class translate a char-stream to a token stream (delivered via
/// parser-callbacks). /// parser-callbacks).
/// \brief One UTF-8 encoded character (one to four bytes) read from a script stream.
///
/// The bytes are kept in a fixed four byte buffer. Bytes that are not part of the
/// character are always zero, which is why every single-byte (ASCII) comparison
/// below also checks that bytes 1..3 are zero.
class MultiChar
{
public:
    /// Creates a blank character (all bytes zero, no length).
    MultiChar()
    {
        blank();
    }

    /// Creates a character from its first byte.
    /// Kept implicit so that call sites passing a plain char keep compiling.
    MultiChar(const char ch)
    {
        blank();
        mData[0] = ch;
        mLength = getCharLength(ch);
    }

    /// \param ch First byte of a UTF-8 sequence.
    /// \return Number of bytes that follow \a ch in its sequence (0..3),
    ///         or -1 if \a ch can not start a sequence.
    static int getCharLength(const char ch)
    {
        const unsigned char c = static_cast<unsigned char>(ch);
        if (c <= 127) return 0;
        else if ((c & 0xE0) == 0xC0) return 1;
        else if ((c & 0xF0) == 0xE0) return 2;
        else if ((c & 0xF8) == 0xF0) return 3;
        else return -1;
    }

    /// True if this is the single-byte character \a ch.
    bool operator== (const char ch) const
    {
        return mData[0] == ch && hasEmptyTail();
    }

    /// True if both characters consist of the same four bytes.
    bool operator== (const MultiChar& ch) const
    {
        for (int i = 0; i < sBufferSize; i++)
        {
            if (mData[i] != ch.mData[i])
                return false;
        }
        return true;
    }

    /// True if this is anything other than the single-byte character \a ch.
    bool operator!= (const char ch) const
    {
        return !(*this == ch);
    }

    /// True for a single-byte space or tab.
    bool isWhitespace() const
    {
        return (mData[0] == ' ' || mData[0] == '\t') && hasEmptyTail();
    }

    /// True for a single-byte decimal digit.
    bool isDigit() const
    {
        // The <cctype> functions require a value representable as unsigned char;
        // passing a negative char (any UTF-8 lead byte where char is signed) is
        // undefined behaviour, hence the cast.
        return hasEmptyTail() && std::isdigit(static_cast<unsigned char>(mData[0])) != 0;
    }

    /// True for an ASCII '-' and for the three byte sequence E2 80 93 (U+2013, en dash).
    bool isMinusSign() const
    {
        if (mData[0] == '-' && hasEmptyTail())
            return true;

        return mData[0] == '\xe2' && mData[1] == '\x80' && mData[2] == '\x93' && mData[3] == 0;
    }

    /// True for an alphabetic single-byte character and for every multi-byte
    /// character except the en dash accepted by isMinusSign().
    bool isAlpha() const
    {
        if (isMinusSign())
            return false;

        // Same unsigned char requirement as in isDigit().
        return std::isalpha(static_cast<unsigned char>(mData[0])) != 0 || !hasEmptyTail();
    }

    /// Appends all bytes of this character to \a str (nothing for a blank character).
    void appendTo(std::string& str) const
    {
        for (int i = 0; i <= mLength; i++)
            str += mData[i];
    }

    /// Puts all bytes of this character back into \a in, last byte first,
    /// so that they are read again in their original order.
    void putback (std::istream& in) const
    {
        for (int i = mLength; i >= 0; i--)
            in.putback (mData[i]);
    }

    /// Reads the next character from \a in into this object.
    /// \return false if the stream is not readable, the next byte can not start a
    ///         UTF-8 sequence, or the stream ends in the middle of a sequence.
    ///         The object is blank in all of these cases.
    bool getFrom(std::istream& in)
    {
        blank();

        char ch = static_cast<char>(in.peek());
        if (!in.good())
            return false;

        const int length = getCharLength(ch);
        if (length < 0) return false;

        for (int i = 0; i <= length; i++)
        {
            in.get (ch);
            if (!in.good())
            {
                // Do not expose a half-read sequence.
                blank();
                return false;
            }
            mData[i] = ch;
        }

        mLength = length;

        return true;
    }

    /// Reads the next character from \a in into this object without consuming it:
    /// the read position is moved back to where it was before the call.
    /// \return false under the same conditions as getFrom(); the object is blank then.
    bool peek(std::istream& in)
    {
        // Start from a clean buffer, otherwise bytes of a previously held longer
        // character would survive and break the single-byte comparisons.
        blank();

        const std::streampos origin = in.tellg();

        char ch = static_cast<char>(in.peek());
        if (!in.good())
            return false;

        const int length = getCharLength(ch);
        if (length < 0) return false;

        for (int i = 0; i <= length; i++)
        {
            in.get (ch);
            if (!in.good())
            {
                // The stream ended inside the sequence. Undo the partial read so
                // that a look-ahead never consumes input; a genuine I/O error
                // (badbit) is left in place.
                blank();
                in.clear(in.rdstate() & std::ios_base::badbit);
                in.seekg(origin);
                return false;
            }
            mData[i] = ch;
        }

        mLength = length;

        in.seekg(origin);
        return true;
    }

    /// Resets this object to the blank state (all bytes zero, no length).
    void blank()
    {
        for (int i = 0; i < sBufferSize; i++)
            mData[i] = 0;
        mLength = -1;
    }

    /// \return The bytes of this character up to (not including) the first zero byte.
    std::string data() const
    {
        // mData has no terminator when it holds a four byte sequence, so it must
        // not be handed to the C-string constructor of std::string.
        std::string::size_type size = 0;
        while (size < static_cast<std::string::size_type>(sBufferSize) && mData[size] != 0)
            ++size;

        return std::string(mData, size);
    }

private:
    /// Size of the byte buffer; the longest sequence accepted by getCharLength().
    enum { sBufferSize = 4 };

    /// True if bytes 1..3 are zero, i.e. at most the first byte is in use.
    bool hasEmptyTail() const
    {
        return mData[1] == 0 && mData[2] == 0 && mData[3] == 0;
    }

    char mData[sBufferSize];
    int mLength; ///< Number of bytes after the first one (0..3), -1 if blank or invalid.
};
class Scanner class Scanner
{ {
enum putback_type enum putback_type
@ -79,26 +234,24 @@ namespace Compiler
Scanner (const Scanner&); Scanner (const Scanner&);
Scanner& operator= (const Scanner&); Scanner& operator= (const Scanner&);
bool get (char& c); bool get (MultiChar& c);
void putback (char c); void putback (MultiChar& c);
bool scanToken (Parser& parser); bool scanToken (Parser& parser);
bool scanInt (char c, Parser& parser, bool& cont); bool scanInt (MultiChar& c, Parser& parser, bool& cont);
bool scanFloat (const std::string& intValue, Parser& parser, bool& cont); bool scanFloat (const std::string& intValue, Parser& parser, bool& cont);
bool scanName (char c, Parser& parser, bool& cont); bool scanName (MultiChar& c, Parser& parser, bool& cont);
/// \param name May contain the start of the name (one or more characters) /// \param name May contain the start of the name (one or more characters)
bool scanName (std::string& name); bool scanName (std::string& name);
bool scanSpecial (char c, Parser& parser, bool& cont); bool scanSpecial (MultiChar& c, Parser& parser, bool& cont);
bool isStringCharacter (char c, bool lookAhead = true);
static bool isWhitespace (char c); bool isStringCharacter (MultiChar& c, bool lookAhead = true);
public: public:

Loading…
Cancel
Save