mirror of
https://github.com/TES3MP/openmw-tes3mp.git
synced 2025-01-30 16:15:31 +00:00
Implement UTF-8 support for script parser (bug #4598)
This commit is contained in:
parent
fe2f4bcedc
commit
45ab6e1430
4 changed files with 210 additions and 82 deletions
|
@ -35,6 +35,7 @@
|
|||
Bug #4456: AiActivate should not be cancelled after target activation
|
||||
Bug #4540: Rain delay when exiting water
|
||||
Bug #4594: Actors without AI packages don't use Hello dialogue
|
||||
Bug #4598: Script parser does not support non-ASCII characters
|
||||
Bug #4600: Crash when no sound output is available or --no-sound is used.
|
||||
Bug #4639: Black screen after completing first mages guild mission + training
|
||||
Bug #4650: Focus is lost after pressing ESC in confirmation dialog inside savegame dialog
|
||||
|
|
|
@ -65,7 +65,10 @@ void CSVWorld::ScriptHighlighter::parseEOF (Compiler::Scanner& scanner)
|
|||
|
||||
void CSVWorld::ScriptHighlighter::highlight (const Compiler::TokenLoc& loc, Type type)
|
||||
{
|
||||
int length = static_cast<int> (loc.mLiteral.size());
|
||||
// We should take multibyte characters into account
|
||||
int length = 0;
|
||||
const char* token = loc.mLiteral.c_str();
|
||||
while (*token) length += (*token++ & 0xc0) != 0x80;
|
||||
|
||||
int index = loc.mColumn;
|
||||
|
||||
|
|
|
@ -1,8 +1,6 @@
|
|||
#include "scanner.hpp"
|
||||
|
||||
#include <cassert>
|
||||
#include <cctype>
|
||||
#include <sstream>
|
||||
#include <iterator>
|
||||
|
||||
#include "exception.hpp"
|
||||
|
@ -14,14 +12,12 @@
|
|||
|
||||
namespace Compiler
|
||||
{
|
||||
bool Scanner::get (char& c)
|
||||
bool Scanner::get (MultiChar& c)
|
||||
{
|
||||
mStream.get (c);
|
||||
|
||||
if (!mStream.good())
|
||||
if (!c.getFrom(mStream))
|
||||
return false;
|
||||
|
||||
mPrevLoc =mLoc;
|
||||
mPrevLoc = mLoc;
|
||||
|
||||
if (c=='\n')
|
||||
{
|
||||
|
@ -34,15 +30,15 @@ namespace Compiler
|
|||
else
|
||||
{
|
||||
++mLoc.mColumn;
|
||||
mLoc.mLiteral += c;
|
||||
c.appendTo(mLoc.mLiteral);
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
void Scanner::putback (char c)
|
||||
void Scanner::putback (MultiChar& c)
|
||||
{
|
||||
mStream.putback (c);
|
||||
c.putback(mStream);
|
||||
mLoc = mPrevLoc;
|
||||
}
|
||||
|
||||
|
@ -80,7 +76,7 @@ namespace Compiler
|
|||
break;
|
||||
}
|
||||
|
||||
char c;
|
||||
MultiChar c;
|
||||
|
||||
if (!get (c))
|
||||
{
|
||||
|
@ -91,7 +87,7 @@ namespace Compiler
|
|||
{
|
||||
std::string comment;
|
||||
|
||||
comment += c;
|
||||
c.appendTo(comment);
|
||||
|
||||
while (get (c))
|
||||
{
|
||||
|
@ -101,7 +97,7 @@ namespace Compiler
|
|||
break;
|
||||
}
|
||||
else
|
||||
comment += c;
|
||||
c.appendTo(comment);
|
||||
}
|
||||
|
||||
TokenLoc loc (mLoc);
|
||||
|
@ -109,7 +105,7 @@ namespace Compiler
|
|||
|
||||
return parser.parseComment (comment, loc, *this);
|
||||
}
|
||||
else if (isWhitespace (c))
|
||||
else if (c.isWhitespace())
|
||||
{
|
||||
mLoc.mLiteral.clear();
|
||||
return true;
|
||||
|
@ -120,7 +116,7 @@ namespace Compiler
|
|||
mLoc.mLiteral.clear();
|
||||
return true;
|
||||
}
|
||||
else if (std::isalpha (c) || c=='_' || c=='"')
|
||||
else if (c.isAlpha() || c=='_' || c=='"')
|
||||
{
|
||||
bool cont = false;
|
||||
|
||||
|
@ -130,7 +126,7 @@ namespace Compiler
|
|||
return cont;
|
||||
}
|
||||
}
|
||||
else if (std::isdigit (c))
|
||||
else if (c.isDigit())
|
||||
{
|
||||
bool cont = false;
|
||||
|
||||
|
@ -162,24 +158,24 @@ namespace Compiler
|
|||
throw SourceException();
|
||||
}
|
||||
|
||||
bool Scanner::scanInt (char c, Parser& parser, bool& cont)
|
||||
bool Scanner::scanInt (MultiChar& c, Parser& parser, bool& cont)
|
||||
{
|
||||
assert(c != '\0');
|
||||
std::string value;
|
||||
value += c;
|
||||
c.appendTo(value);
|
||||
|
||||
bool error = false;
|
||||
|
||||
while (get (c))
|
||||
{
|
||||
if (std::isdigit (c))
|
||||
if (c.isDigit())
|
||||
{
|
||||
value += c;
|
||||
c.appendTo(value);
|
||||
}
|
||||
else if (c!='-' && isStringCharacter (c))
|
||||
else if (!c.isMinusSign() && isStringCharacter (c))
|
||||
{
|
||||
error = true;
|
||||
value += c;
|
||||
c.appendTo(value);
|
||||
}
|
||||
else if (c=='.')
|
||||
{
|
||||
|
@ -224,19 +220,19 @@ namespace Compiler
|
|||
{
|
||||
std::string value = intValue + ".";
|
||||
|
||||
char c;
|
||||
MultiChar c;
|
||||
|
||||
bool empty = intValue.empty() || intValue=="-";
|
||||
bool error = false;
|
||||
|
||||
while (get (c))
|
||||
{
|
||||
if (std::isdigit (c))
|
||||
if (c.isDigit())
|
||||
{
|
||||
value += c;
|
||||
c.appendTo(value);
|
||||
empty = false;
|
||||
}
|
||||
else if (std::isalpha (c) || c=='_')
|
||||
else if (c.isAlpha() || c=='_')
|
||||
error = true;
|
||||
else
|
||||
{
|
||||
|
@ -279,10 +275,10 @@ namespace Compiler
|
|||
0
|
||||
};
|
||||
|
||||
bool Scanner::scanName (char c, Parser& parser, bool& cont)
|
||||
bool Scanner::scanName (MultiChar& c, Parser& parser, bool& cont)
|
||||
{
|
||||
std::string name;
|
||||
name += c;
|
||||
c.appendTo(name);
|
||||
|
||||
if (!scanName (name))
|
||||
return false;
|
||||
|
@ -315,8 +311,8 @@ namespace Compiler
|
|||
|
||||
// Russian localization and some mods use a quirk - add newline character directly
|
||||
// to compiled bytecode via HEX-editor to implement multiline messageboxes.
|
||||
// Of course, original editor will not compile such script.
|
||||
// Allow messageboxes to bybass the "incomplete string or name" error.
|
||||
// Of course, original editor can not compile such script.
|
||||
// Allow messageboxes to bypass the "incomplete string or name" error.
|
||||
if (lowerCase == "messagebox")
|
||||
enableIgnoreNewlines();
|
||||
else if (isKeyword)
|
||||
|
@ -344,7 +340,7 @@ namespace Compiler
|
|||
|
||||
bool Scanner::scanName (std::string& name)
|
||||
{
|
||||
char c;
|
||||
MultiChar c;
|
||||
bool error = false;
|
||||
|
||||
while (get (c))
|
||||
|
@ -353,7 +349,7 @@ namespace Compiler
|
|||
{
|
||||
if (c=='"')
|
||||
{
|
||||
name += c;
|
||||
c.appendTo(name);
|
||||
break;
|
||||
}
|
||||
// ignoring escape sequences for now, because they are messing up stupid Windows path names.
|
||||
|
@ -380,20 +376,20 @@ namespace Compiler
|
|||
}
|
||||
else if (!(c=='"' && name.empty()))
|
||||
{
|
||||
if (!isStringCharacter (c) && !(mTolerantNames && (c=='.' || c=='-')))
|
||||
if (!isStringCharacter (c) && !(mTolerantNames && (c=='.' || c == '-')))
|
||||
{
|
||||
putback (c);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
name += c;
|
||||
c.appendTo(name);
|
||||
}
|
||||
|
||||
return !error;
|
||||
}
|
||||
|
||||
bool Scanner::scanSpecial (char c, Parser& parser, bool& cont)
|
||||
bool Scanner::scanSpecial (MultiChar& c, Parser& parser, bool& cont)
|
||||
{
|
||||
int special = -1;
|
||||
|
||||
|
@ -410,7 +406,7 @@ namespace Compiler
|
|||
{
|
||||
putback (c);
|
||||
|
||||
if (std::isdigit (c))
|
||||
if (c.isDigit())
|
||||
return scanFloat ("", parser, cont);
|
||||
}
|
||||
|
||||
|
@ -428,7 +424,7 @@ namespace Compiler
|
|||
else if (c == '>' || c == '<') // Treat => and =< as ==
|
||||
{
|
||||
special = S_cmpEQ;
|
||||
mErrorHandler.warning (std::string("invalid operator =") + c + ", treating it as ==", mLoc);
|
||||
mErrorHandler.warning (std::string("invalid operator =") + c.data() + ", treating it as ==", mLoc);
|
||||
}
|
||||
else
|
||||
{
|
||||
|
@ -463,7 +459,7 @@ namespace Compiler
|
|||
else
|
||||
return false;
|
||||
}
|
||||
else if (c=='-')
|
||||
else if (c.isMinusSign())
|
||||
{
|
||||
if (get (c))
|
||||
{
|
||||
|
@ -478,32 +474,6 @@ namespace Compiler
|
|||
else
|
||||
special = S_minus;
|
||||
}
|
||||
else if (static_cast<unsigned char> (c)==0xe2)
|
||||
{
|
||||
/// Workaround for some translator who apparently can't keep his minus in order
|
||||
/// \todo disable for later script formats
|
||||
if (get (c) && static_cast<unsigned char> (c)==0x80 &&
|
||||
get (c) && static_cast<unsigned char> (c)==0x93)
|
||||
{
|
||||
if (get (c))
|
||||
{
|
||||
if (c=='>')
|
||||
special = S_ref;
|
||||
else
|
||||
{
|
||||
putback (c);
|
||||
special = S_minus;
|
||||
}
|
||||
}
|
||||
else
|
||||
special = S_minus;
|
||||
}
|
||||
else
|
||||
{
|
||||
mErrorHandler.error ("Invalid character", mLoc);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
else if (c=='<')
|
||||
{
|
||||
if (get (c))
|
||||
|
@ -582,20 +552,21 @@ namespace Compiler
|
|||
return true;
|
||||
}
|
||||
|
||||
bool Scanner::isStringCharacter (char c, bool lookAhead)
|
||||
bool Scanner::isStringCharacter (MultiChar& c, bool lookAhead)
|
||||
{
|
||||
return std::isalpha (c) || std::isdigit (c) || c=='_' ||
|
||||
/// \todo disable this when doing more stricter compiling
|
||||
c=='`' || c=='\'' ||
|
||||
if (lookAhead && c.isMinusSign())
|
||||
{
|
||||
/// \todo disable this when doing more stricter compiling. Also, find out who is
|
||||
/// responsible for allowing it in the first place and meet up with that person in
|
||||
/// a dark alley.
|
||||
(c=='-' && (!lookAhead || isStringCharacter (mStream.peek(), false)));
|
||||
}
|
||||
MultiChar next;
|
||||
if (next.peek(mStream) && isStringCharacter (next, false))
|
||||
return true;
|
||||
}
|
||||
|
||||
bool Scanner::isWhitespace (char c)
|
||||
{
|
||||
return c==' ' || c=='\t';
|
||||
return c.isAlpha() || c.isDigit() || c=='_' ||
|
||||
/// \todo disable this when doing more stricter compiling
|
||||
c=='`' || c=='\'';
|
||||
}
|
||||
|
||||
// constructor
|
||||
|
|
|
@ -1,9 +1,11 @@
|
|||
#ifndef COMPILER_SCANNER_H_INCLUDED
|
||||
#define COMPILER_SCANNER_H_INCLUDED
|
||||
|
||||
#include <cctype>
|
||||
#include <string>
|
||||
#include <iosfwd>
|
||||
#include <vector>
|
||||
#include <sstream>
|
||||
|
||||
#include "tokenloc.hpp"
|
||||
|
||||
|
@ -18,6 +20,159 @@ namespace Compiler
|
|||
/// This class translates a char-stream to a token stream (delivered via
|
||||
/// parser-callbacks).
|
||||
|
||||
/// \brief One UTF-8 encoded character (1 to 4 bytes) read from a script source stream.
///
/// The bytes are kept in a fixed 4-byte buffer. Bytes that are not part of the
/// current character are always zero, which is what the single-byte tests
/// (operator==, isDigit, isWhitespace, ...) rely on.
///
/// \note The buffer is NOT NUL-terminated when the character occupies all 4 bytes.
class MultiChar
{
    public:

        /// Construct a blank (empty) character.
        MultiChar()
        {
            blank();
        }

        /// Construct from a single byte. Intentionally not explicit, so that plain
        /// chars keep converting implicitly as before.
        MultiChar(const char ch)
        {
            blank();
            mData[0] = ch;

            mLength = getCharLength(ch);
        }

        /// \return Number of bytes that follow the lead byte \a ch in its UTF-8
        /// sequence (0 for ASCII, 1 to 3 for multi-byte lead bytes), or -1 if
        /// \a ch is not a valid lead byte.
        static int getCharLength(const char ch)
        {
            const unsigned char c = static_cast<unsigned char> (ch);
            if (c<=127) return 0;
            else if ((c & 0xE0) == 0xC0) return 1;
            else if ((c & 0xF0) == 0xE0) return 2;
            else if ((c & 0xF8) == 0xF0) return 3;
            else return -1;
        }

        /// True if this is exactly the single-byte character \a ch.
        bool operator== (const char ch) const
        {
            return mData[0]==ch && isSingleByte();
        }

        /// True if all four buffer bytes are equal.
        bool operator== (const MultiChar& ch) const
        {
            return mData[0]==ch.mData[0] && mData[1]==ch.mData[1] && mData[2]==ch.mData[2] && mData[3]==ch.mData[3];
        }

        /// Negation of operator== (const char).
        bool operator!= (const char ch) const
        {
            return !(*this==ch);
        }

        /// True for a single-byte space or tab.
        bool isWhitespace() const
        {
            return (mData[0]==' ' || mData[0]=='\t') && isSingleByte();
        }

        /// True for a single-byte decimal digit.
        bool isDigit() const
        {
            // The <cctype> functions require a value representable as unsigned char;
            // passing a negative char is undefined behaviour.
            return std::isdigit(static_cast<unsigned char> (mData[0])) && isSingleByte();
        }

        /// True for an ASCII hyphen-minus, or for the 3-byte sequence E2 80 93
        /// (U+2013 EN DASH) that some scripts use in place of a minus.
        bool isMinusSign() const
        {
            if (mData[0] == '-' && isSingleByte())
                return true;

            return mData[0] == '\xe2' && mData[1] == '\x80' && mData[2] == '\x93' && mData[3] == 0;
        }

        /// True for an alphabetic first byte or for any multi-byte character,
        /// except those accepted by isMinusSign().
        bool isAlpha() const
        {
            if (isMinusSign())
                return false;

            return std::isalpha(static_cast<unsigned char> (mData[0])) || !isSingleByte();
        }

        /// Append the bytes of this character to \a str. Appends nothing if blank.
        void appendTo(std::string& str) const
        {
            for (int i = 0; i <= mLength; i++)
                str += mData[i];
        }

        /// Put the bytes of this character back into \a in, last byte first, so that
        /// they are read again in their original order.
        void putback (std::istream& in) const
        {
            for (int i = mLength; i >= 0; i--)
                in.putback (mData[i]);
        }

        /// Read the next character from \a in, replacing the current content.
        ///
        /// \return false if the stream is not good, the next byte is not a valid
        /// lead byte (nothing is consumed in that case), or the stream ends inside
        /// the sequence. On failure this object is left without a valid length.
        bool getFrom(std::istream& in)
        {
            blank();

            const std::istream::int_type first = in.peek();

            // peek() at end-of-stream sets eofbit, so this also covers EOF
            if (!in.good())
                return false;

            const int length = getCharLength(std::istream::traits_type::to_char_type(first));
            if (length < 0) return false;

            for (int i = 0; i <= length; i++)
            {
                char ch;
                in.get (ch);

                if (!in.good())
                    return false;

                mData[i] = ch;
            }

            mLength = length;

            return true;
        }

        /// Like getFrom(), but restores the read position of \a in on success, so
        /// the character is not consumed.
        ///
        /// The previous content is cleared first; otherwise stale bytes from an
        /// earlier, longer character would survive and break the comparisons.
        bool peek(std::istream& in)
        {
            const std::streampos p_orig = in.tellg();

            if (!getFrom(in))
                return false;

            in.seekg(p_orig);
            return true;
        }

        /// Reset to the blank state: all bytes zero, no valid length.
        void blank()
        {
            // plain loop rather than std::fill: <algorithm> is not included here
            for (unsigned int i = 0; i < sizeof(mData); ++i)
                mData[i] = 0;

            mLength = -1;
        }

        /// \return The stored bytes up to (not including) the first zero byte.
        ///
        /// The length is bounded by the buffer size, because a 4-byte character
        /// leaves no room for a terminating NUL.
        std::string data() const
        {
            unsigned int size = 0;
            while (size < sizeof(mData) && mData[size] != 0)
                ++size;

            return std::string(mData, size);
        }

    private:

        /// True if only the first buffer byte may be non-zero.
        bool isSingleByte() const
        {
            return mData[1]==0 && mData[2]==0 && mData[3]==0;
        }

        char mData[4];
        int mLength; ///< index of the last used byte (bytes used minus one), -1 if blank/invalid
};
|
||||
|
||||
class Scanner
|
||||
{
|
||||
enum putback_type
|
||||
|
@ -79,26 +234,24 @@ namespace Compiler
|
|||
Scanner (const Scanner&);
|
||||
Scanner& operator= (const Scanner&);
|
||||
|
||||
bool get (char& c);
|
||||
bool get (MultiChar& c);
|
||||
|
||||
void putback (char c);
|
||||
void putback (MultiChar& c);
|
||||
|
||||
bool scanToken (Parser& parser);
|
||||
|
||||
bool scanInt (char c, Parser& parser, bool& cont);
|
||||
bool scanInt (MultiChar& c, Parser& parser, bool& cont);
|
||||
|
||||
bool scanFloat (const std::string& intValue, Parser& parser, bool& cont);
|
||||
|
||||
bool scanName (char c, Parser& parser, bool& cont);
|
||||
bool scanName (MultiChar& c, Parser& parser, bool& cont);
|
||||
|
||||
/// \param name May contain the start of the name (one or more characters)
|
||||
bool scanName (std::string& name);
|
||||
|
||||
bool scanSpecial (char c, Parser& parser, bool& cont);
|
||||
bool scanSpecial (MultiChar& c, Parser& parser, bool& cont);
|
||||
|
||||
bool isStringCharacter (char c, bool lookAhead = true);
|
||||
|
||||
static bool isWhitespace (char c);
|
||||
bool isStringCharacter (MultiChar& c, bool lookAhead = true);
|
||||
|
||||
public:
|
||||
|
||||
|
|
Loading…
Reference in a new issue