Implement UTF-8 support for script parser (bug #4598)

pull/2575/head
Andrei Kortunov 5 years ago
parent fe2f4bcedc
commit 45ab6e1430

@ -35,6 +35,7 @@
Bug #4456: AiActivate should not be cancelled after target activation
Bug #4540: Rain delay when exiting water
Bug #4594: Actors without AI packages don't use Hello dialogue
Bug #4598: Script parser does not support non-ASCII characters
Bug #4600: Crash when no sound output is available or --no-sound is used.
Bug #4639: Black screen after completing first mages guild mission + training
Bug #4650: Focus is lost after pressing ESC in confirmation dialog inside savegame dialog

@ -65,7 +65,10 @@ void CSVWorld::ScriptHighlighter::parseEOF (Compiler::Scanner& scanner)
void CSVWorld::ScriptHighlighter::highlight (const Compiler::TokenLoc& loc, Type type)
{
int length = static_cast<int> (loc.mLiteral.size());
// We should take multibyte characters into account
int length = 0;
const char* token = loc.mLiteral.c_str();
while (*token) length += (*token++ & 0xc0) != 0x80;
int index = loc.mColumn;

@ -1,8 +1,6 @@
#include "scanner.hpp"
#include <cassert>
#include <cctype>
#include <sstream>
#include <iterator>
#include "exception.hpp"
@ -14,14 +12,12 @@
namespace Compiler
{
bool Scanner::get (char& c)
bool Scanner::get (MultiChar& c)
{
mStream.get (c);
if (!mStream.good())
if (!c.getFrom(mStream))
return false;
mPrevLoc =mLoc;
mPrevLoc = mLoc;
if (c=='\n')
{
@ -34,15 +30,15 @@ namespace Compiler
else
{
++mLoc.mColumn;
mLoc.mLiteral += c;
c.appendTo(mLoc.mLiteral);
}
return true;
}
void Scanner::putback (char c)
void Scanner::putback (MultiChar& c)
{
mStream.putback (c);
c.putback(mStream);
mLoc = mPrevLoc;
}
@ -80,7 +76,7 @@ namespace Compiler
break;
}
char c;
MultiChar c;
if (!get (c))
{
@ -91,7 +87,7 @@ namespace Compiler
{
std::string comment;
comment += c;
c.appendTo(comment);
while (get (c))
{
@ -101,7 +97,7 @@ namespace Compiler
break;
}
else
comment += c;
c.appendTo(comment);
}
TokenLoc loc (mLoc);
@ -109,7 +105,7 @@ namespace Compiler
return parser.parseComment (comment, loc, *this);
}
else if (isWhitespace (c))
else if (c.isWhitespace())
{
mLoc.mLiteral.clear();
return true;
@ -120,7 +116,7 @@ namespace Compiler
mLoc.mLiteral.clear();
return true;
}
else if (std::isalpha (c) || c=='_' || c=='"')
else if (c.isAlpha() || c=='_' || c=='"')
{
bool cont = false;
@ -130,7 +126,7 @@ namespace Compiler
return cont;
}
}
else if (std::isdigit (c))
else if (c.isDigit())
{
bool cont = false;
@ -162,24 +158,24 @@ namespace Compiler
throw SourceException();
}
bool Scanner::scanInt (char c, Parser& parser, bool& cont)
bool Scanner::scanInt (MultiChar& c, Parser& parser, bool& cont)
{
assert(c != '\0');
std::string value;
value += c;
c.appendTo(value);
bool error = false;
while (get (c))
{
if (std::isdigit (c))
if (c.isDigit())
{
value += c;
c.appendTo(value);
}
else if (c!='-' && isStringCharacter (c))
else if (!c.isMinusSign() && isStringCharacter (c))
{
error = true;
value += c;
c.appendTo(value);
}
else if (c=='.')
{
@ -224,19 +220,19 @@ namespace Compiler
{
std::string value = intValue + ".";
char c;
MultiChar c;
bool empty = intValue.empty() || intValue=="-";
bool error = false;
while (get (c))
{
if (std::isdigit (c))
if (c.isDigit())
{
value += c;
c.appendTo(value);
empty = false;
}
else if (std::isalpha (c) || c=='_')
else if (c.isAlpha() || c=='_')
error = true;
else
{
@ -279,10 +275,10 @@ namespace Compiler
0
};
bool Scanner::scanName (char c, Parser& parser, bool& cont)
bool Scanner::scanName (MultiChar& c, Parser& parser, bool& cont)
{
std::string name;
name += c;
c.appendTo(name);
if (!scanName (name))
return false;
@ -315,8 +311,8 @@ namespace Compiler
// Russian localization and some mods use a quirk - add newline character directly
// to compiled bytecode via HEX-editor to implement multiline messageboxes.
// Of course, original editor will not compile such script.
// Allow messageboxes to bybass the "incomplete string or name" error.
// Of course, original editor can not compile such script.
// Allow messageboxes to bypass the "incomplete string or name" error.
if (lowerCase == "messagebox")
enableIgnoreNewlines();
else if (isKeyword)
@ -344,7 +340,7 @@ namespace Compiler
bool Scanner::scanName (std::string& name)
{
char c;
MultiChar c;
bool error = false;
while (get (c))
@ -353,7 +349,7 @@ namespace Compiler
{
if (c=='"')
{
name += c;
c.appendTo(name);
break;
}
// ignoring escape sequences for now, because they are messing up stupid Windows path names.
@ -380,20 +376,20 @@ namespace Compiler
}
else if (!(c=='"' && name.empty()))
{
if (!isStringCharacter (c) && !(mTolerantNames && (c=='.' || c=='-')))
if (!isStringCharacter (c) && !(mTolerantNames && (c=='.' || c == '-')))
{
putback (c);
break;
}
}
name += c;
c.appendTo(name);
}
return !error;
}
bool Scanner::scanSpecial (char c, Parser& parser, bool& cont)
bool Scanner::scanSpecial (MultiChar& c, Parser& parser, bool& cont)
{
int special = -1;
@ -410,7 +406,7 @@ namespace Compiler
{
putback (c);
if (std::isdigit (c))
if (c.isDigit())
return scanFloat ("", parser, cont);
}
@ -428,7 +424,7 @@ namespace Compiler
else if (c == '>' || c == '<') // Treat => and =< as ==
{
special = S_cmpEQ;
mErrorHandler.warning (std::string("invalid operator =") + c + ", treating it as ==", mLoc);
mErrorHandler.warning (std::string("invalid operator =") + c.data() + ", treating it as ==", mLoc);
}
else
{
@ -463,27 +459,7 @@ namespace Compiler
else
return false;
}
else if (c=='-')
{
if (get (c))
{
if (c=='>')
special = S_ref;
else
{
putback (c);
special = S_minus;
}
}
else
special = S_minus;
}
else if (static_cast<unsigned char> (c)==0xe2)
{
/// Workaround for some translator who apparently can't keep his minus in order
/// \todo disable for later script formats
if (get (c) && static_cast<unsigned char> (c)==0x80 &&
get (c) && static_cast<unsigned char> (c)==0x93)
else if (c.isMinusSign())
{
if (get (c))
{
@ -498,12 +474,6 @@ namespace Compiler
else
special = S_minus;
}
else
{
mErrorHandler.error ("Invalid character", mLoc);
return false;
}
}
else if (c=='<')
{
if (get (c))
@ -582,20 +552,21 @@ namespace Compiler
return true;
}
bool Scanner::isStringCharacter (char c, bool lookAhead)
bool Scanner::isStringCharacter (MultiChar& c, bool lookAhead)
{
if (lookAhead && c.isMinusSign())
{
return std::isalpha (c) || std::isdigit (c) || c=='_' ||
/// \todo disable this when doing more stricter compiling
c=='`' || c=='\'' ||
/// \todo disable this when doing more stricter compiling. Also, find out who is
/// responsible for allowing it in the first place and meet up with that person in
/// a dark alley.
(c=='-' && (!lookAhead || isStringCharacter (mStream.peek(), false)));
MultiChar next;
if (next.peek(mStream) && isStringCharacter (next, false))
return true;
}
bool Scanner::isWhitespace (char c)
{
return c==' ' || c=='\t';
return c.isAlpha() || c.isDigit() || c=='_' ||
/// \todo disable this when doing stricter compiling
c=='`' || c=='\'';
}
// constructor

@ -1,9 +1,11 @@
#ifndef COMPILER_SCANNER_H_INCLUDED
#define COMPILER_SCANNER_H_INCLUDED
#include <algorithm>
#include <cctype>
#include <iosfwd>
#include <sstream>
#include <string>
#include <vector>
#include "tokenloc.hpp"
@ -18,6 +20,159 @@ namespace Compiler
/// \brief One UTF-8 encoded character: a lead byte plus up to three trailing bytes.
///
/// Unused bytes of the internal buffer are always zero. The comparison and
/// classification helpers rely on this to tell single-byte characters from
/// multi-byte ones.
class MultiChar
{
public:
    /// Create a blank character (all bytes zero, no length).
    MultiChar()
    {
        blank();
    }

    /// Create a character from a single byte. Kept non-explicit to preserve the
    /// existing interface.
    MultiChar(const char ch)
    {
        blank();
        mData[0] = ch;
        mLength = getCharLength(ch);
    }

    /// \return Number of bytes that follow the lead byte \a ch (0 to 3), or -1 if
    /// \a ch can not start a UTF-8 sequence.
    static int getCharLength(const char ch)
    {
        const unsigned char c = static_cast<unsigned char>(ch);
        if (c <= 127) return 0;
        else if ((c & 0xE0) == 0xC0) return 1;
        else if ((c & 0xF0) == 0xE0) return 2;
        else if ((c & 0xF8) == 0xF0) return 3;
        else return -1;
    }

    /// True if this is exactly the single-byte character \a ch.
    bool operator== (const char ch) const
    {
        return mData[0] == ch && isSingleByte();
    }

    /// True if all four bytes match those of \a ch.
    bool operator== (const MultiChar& ch) const
    {
        return std::equal(mData, mData + sizeof(mData), ch.mData);
    }

    /// True unless this is exactly the single-byte character \a ch.
    bool operator!= (const char ch) const
    {
        return !(*this == ch);
    }

    /// True for a single-byte space or tab.
    bool isWhitespace() const
    {
        return (mData[0] == ' ' || mData[0] == '\t') && isSingleByte();
    }

    /// True for a single-byte decimal digit.
    bool isDigit() const
    {
        // The <cctype> functions are undefined for negative arguments, so the
        // byte has to go through unsigned char first.
        return std::isdigit(static_cast<unsigned char>(mData[0])) && isSingleByte();
    }

    /// True for an ASCII '-' or for the byte sequence E2 80 93 (en dash, U+2013).
    bool isMinusSign() const
    {
        if (mData[0] == '-' && isSingleByte())
            return true;

        return mData[0] == '\xe2' && mData[1] == '\x80' && mData[2] == '\x93' && mData[3] == 0;
    }

    /// True for a single-byte letter and for every multi-byte character that is
    /// not a minus sign.
    bool isAlpha() const
    {
        if (isMinusSign())
            return false;

        // Same unsigned char requirement as in isDigit().
        return std::isalpha(static_cast<unsigned char>(mData[0])) || !isSingleByte();
    }

    /// Append all bytes of this character to \a str (nothing if blank).
    void appendTo(std::string& str) const
    {
        if (mLength >= 0)
            str.append(mData, static_cast<std::string::size_type>(mLength) + 1);
    }

    /// Put all bytes of this character back into \a in, last byte first.
    void putback (std::istream& in) const
    {
        for (int i = mLength; i >= 0; i--)
            in.putback (mData[i]);
    }

    /// Read one character from \a in.
    /// \return false if the stream is not readable, the next byte is not a valid
    /// lead byte, or the sequence is cut short. The character is blank in that case.
    bool getFrom(std::istream& in)
    {
        blank();

        const int first = in.peek();
        if (!in.good())
            return false;

        const int length = getCharLength(static_cast<char>(first));
        if (length < 0) return false;

        // Read into a scratch buffer so a truncated sequence can not leave this
        // object half-filled.
        char buffer[sizeof(mData)] = { 0, 0, 0, 0 };
        for (int i = 0; i <= length; i++)
        {
            in.get (buffer[i]);
            if (!in.good())
                return false;
        }

        std::copy(buffer, buffer + sizeof(buffer), mData);
        mLength = length;
        return true;
    }

    /// Read the next character from \a in without consuming it.
    /// \return false under the same conditions as getFrom(); this object is left
    /// untouched in that case.
    bool peek(std::istream& in)
    {
        const std::streampos origin = in.tellg();

        const int first = in.peek();
        if (!in.good())
            return false;

        const int length = getCharLength(static_cast<char>(first));
        if (length < 0) return false;

        char buffer[sizeof(mData)] = { 0, 0, 0, 0 };
        for (int i = 0; i <= length; i++)
        {
            in.get (buffer[i]);
            if (!in.good())
            {
                // Bytes of a truncated sequence were consumed: undo that so a
                // failed peek does not move the read position.
                in.clear();
                in.seekg(origin);
                return false;
            }
        }

        // Copy the whole buffer (including zeroed tail) so no bytes of a longer,
        // previously stored character survive.
        std::copy(buffer, buffer + sizeof(buffer), mData);
        mLength = length;

        in.seekg(origin);
        return true;
    }

    /// Reset to the blank state: all bytes zero, length -1.
    void blank()
    {
        std::fill(mData, mData + sizeof(mData), 0);
        mLength = -1;
    }

    /// \return The bytes of this character up to (not including) the first zero byte.
    std::string data() const
    {
        // mData has no terminator when all four bytes are in use, so it must
        // not be handed to std::string as a C string.
        return std::string(mData, std::find(mData, mData + sizeof(mData), '\0'));
    }

private:
    /// True if no trailing bytes are set.
    bool isSingleByte() const
    {
        return mData[1] == 0 && mData[2] == 0 && mData[3] == 0;
    }

    char mData[4]; ///< Raw bytes; unused positions are zero.
    int mLength;   ///< Number of trailing bytes (0 to 3), or -1 when blank/invalid.
};

/// \brief Scanner
///
/// This class translates a char-stream to a token stream (delivered via
/// parser-callbacks).
class Scanner
{
enum putback_type
@ -79,26 +234,24 @@ namespace Compiler
Scanner (const Scanner&);
Scanner& operator= (const Scanner&);
bool get (char& c);
bool get (MultiChar& c);
void putback (char c);
void putback (MultiChar& c);
bool scanToken (Parser& parser);
bool scanInt (char c, Parser& parser, bool& cont);
bool scanInt (MultiChar& c, Parser& parser, bool& cont);
bool scanFloat (const std::string& intValue, Parser& parser, bool& cont);
bool scanName (char c, Parser& parser, bool& cont);
bool scanName (MultiChar& c, Parser& parser, bool& cont);
/// \param name May contain the start of the name (one or more characters)
bool scanName (std::string& name);
bool scanSpecial (char c, Parser& parser, bool& cont);
bool isStringCharacter (char c, bool lookAhead = true);
bool scanSpecial (MultiChar& c, Parser& parser, bool& cont);
static bool isWhitespace (char c);
bool isStringCharacter (MultiChar& c, bool lookAhead = true);
public:

Loading…
Cancel
Save