mirror of
https://github.com/OpenMW/openmw.git
synced 2025-03-03 15:09:39 +00:00
Add StatelessUtf8Encoder to support caller provided buffer for output
This commit is contained in:
parent
b617d0aec4
commit
c044bef6a7
4 changed files with 105 additions and 72 deletions
|
@ -370,7 +370,7 @@ int CS::Editor::run()
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
ESM::ESMReader fileReader;
|
ESM::ESMReader fileReader;
|
||||||
ToUTF8::Utf8Encoder encoder = ToUTF8::calculateEncoding(mEncodingName);
|
ToUTF8::Utf8Encoder encoder(ToUTF8::calculateEncoding(mEncodingName));
|
||||||
fileReader.setEncoder(&encoder);
|
fileReader.setEncoder(&encoder);
|
||||||
fileReader.open(mFileToLoad.string());
|
fileReader.open(mFileToLoad.string());
|
||||||
|
|
||||||
|
|
|
@ -445,8 +445,7 @@ void ContentSelectorModel::ContentModel::addFiles(const QString &path)
|
||||||
|
|
||||||
try {
|
try {
|
||||||
ESM::ESMReader fileReader;
|
ESM::ESMReader fileReader;
|
||||||
ToUTF8::Utf8Encoder encoder =
|
ToUTF8::Utf8Encoder encoder(ToUTF8::calculateEncoding(mEncoding.toStdString()));
|
||||||
ToUTF8::calculateEncoding(mEncoding.toStdString());
|
|
||||||
fileReader.setEncoder(&encoder);
|
fileReader.setEncoder(&encoder);
|
||||||
fileReader.open(std::string(dir.absoluteFilePath(path2).toUtf8().constData()));
|
fileReader.open(std::string(dir.absoluteFilePath(path2).toUtf8().constData()));
|
||||||
|
|
||||||
|
|
|
@ -51,42 +51,52 @@ namespace
|
||||||
{
|
{
|
||||||
return std::find_if(input.begin(), input.end(), [] (unsigned char v) { return v == 0 || v >= 128; });
|
return std::find_if(input.begin(), input.end(), [] (unsigned char v) { return v == 0 || v >= 128; });
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
Utf8Encoder::Utf8Encoder(const FromType sourceEncoding):
|
std::basic_string_view<signed char> getTranslationArray(FromType sourceEncoding)
|
||||||
mOutput(50*1024)
|
|
||||||
{
|
|
||||||
switch (sourceEncoding)
|
|
||||||
{
|
{
|
||||||
case ToUTF8::WINDOWS_1252:
|
switch (sourceEncoding)
|
||||||
{
|
{
|
||||||
translationArray = ToUTF8::windows_1252;
|
case ToUTF8::WINDOWS_1252:
|
||||||
break;
|
return ToUTF8::windows_1252;
|
||||||
}
|
case ToUTF8::WINDOWS_1250:
|
||||||
case ToUTF8::WINDOWS_1250:
|
return ToUTF8::windows_1250;
|
||||||
{
|
case ToUTF8::WINDOWS_1251:
|
||||||
translationArray = ToUTF8::windows_1250;
|
return ToUTF8::windows_1251;
|
||||||
break;
|
case ToUTF8::CP437:
|
||||||
}
|
return ToUTF8::cp437;
|
||||||
case ToUTF8::WINDOWS_1251:
|
|
||||||
{
|
|
||||||
translationArray = ToUTF8::windows_1251;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
case ToUTF8::CP437:
|
|
||||||
{
|
|
||||||
translationArray = ToUTF8::cp437;
|
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
|
throw std::logic_error("Invalid source encoding: " + std::to_string(sourceEncoding));
|
||||||
|
}
|
||||||
|
|
||||||
default:
|
// Make sure the output vector is large enough for 'size' bytes,
|
||||||
|
// including a terminating zero after it.
|
||||||
|
void resize(std::size_t size, BufferAllocationPolicy bufferAllocationPolicy, std::string& buffer)
|
||||||
|
{
|
||||||
|
if (buffer.size() >= size)
|
||||||
|
return;
|
||||||
|
|
||||||
|
switch (bufferAllocationPolicy)
|
||||||
{
|
{
|
||||||
assert(0);
|
case BufferAllocationPolicy::FitToRequiredSize:
|
||||||
|
buffer.resize(size);
|
||||||
|
break;
|
||||||
|
case BufferAllocationPolicy::UseGrowFactor:
|
||||||
|
// Add some extra padding to reduce the chance of having to resize
|
||||||
|
// again later.
|
||||||
|
buffer.resize(3 * size);
|
||||||
|
// And make sure the string is zero terminated
|
||||||
|
buffer[size] = 0;
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string_view Utf8Encoder::getUtf8(std::string_view input)
|
StatelessUtf8Encoder::StatelessUtf8Encoder(FromType sourceEncoding)
|
||||||
|
: mTranslationArray(getTranslationArray(sourceEncoding))
|
||||||
|
{
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string_view StatelessUtf8Encoder::getUtf8(std::string_view input, BufferAllocationPolicy bufferAllocationPolicy, std::string& buffer) const
|
||||||
{
|
{
|
||||||
if (input.empty())
|
if (input.empty())
|
||||||
return input;
|
return input;
|
||||||
|
@ -106,24 +116,24 @@ std::string_view Utf8Encoder::getUtf8(std::string_view input)
|
||||||
return std::string_view(input.data(), outlen);
|
return std::string_view(input.data(), outlen);
|
||||||
|
|
||||||
// Make sure the output is large enough
|
// Make sure the output is large enough
|
||||||
resize(outlen);
|
resize(outlen, bufferAllocationPolicy, buffer);
|
||||||
char *out = &mOutput[0];
|
char *out = buffer.data();
|
||||||
|
|
||||||
// Translate
|
// Translate
|
||||||
for (auto it = input.begin(); it != input.end() && *it != 0; ++it)
|
for (auto it = input.begin(); it != input.end() && *it != 0; ++it)
|
||||||
copyFromArray(*it, out);
|
copyFromArray(*it, out);
|
||||||
|
|
||||||
// Make sure that we wrote the correct number of bytes
|
// Make sure that we wrote the correct number of bytes
|
||||||
assert((out-&mOutput[0]) == (int)outlen);
|
assert((out - buffer.data()) == (int)outlen);
|
||||||
|
|
||||||
// And make extra sure the output is null terminated
|
// And make extra sure the output is null terminated
|
||||||
assert(mOutput.size() > outlen);
|
assert(buffer.size() >= outlen);
|
||||||
assert(mOutput[outlen] == 0);
|
assert(buffer[outlen] == 0);
|
||||||
|
|
||||||
return std::string_view(mOutput.data(), outlen);
|
return std::string_view(buffer.data(), outlen);
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string_view Utf8Encoder::getLegacyEnc(std::string_view input)
|
std::string_view StatelessUtf8Encoder::getLegacyEnc(std::string_view input, BufferAllocationPolicy bufferAllocationPolicy, std::string& buffer) const
|
||||||
{
|
{
|
||||||
if (input.empty())
|
if (input.empty())
|
||||||
return input;
|
return input;
|
||||||
|
@ -143,34 +153,21 @@ std::string_view Utf8Encoder::getLegacyEnc(std::string_view input)
|
||||||
return std::string_view(input.data(), outlen);
|
return std::string_view(input.data(), outlen);
|
||||||
|
|
||||||
// Make sure the output is large enough
|
// Make sure the output is large enough
|
||||||
resize(outlen);
|
resize(outlen, bufferAllocationPolicy, buffer);
|
||||||
char *out = &mOutput[0];
|
char *out = buffer.data();
|
||||||
|
|
||||||
// Translate
|
// Translate
|
||||||
for (auto it = input.begin(); it != input.end() && *it != 0;)
|
for (auto it = input.begin(); it != input.end() && *it != 0;)
|
||||||
copyFromArrayLegacyEnc(it, input.end(), out);
|
copyFromArrayLegacyEnc(it, input.end(), out);
|
||||||
|
|
||||||
// Make sure that we wrote the correct number of bytes
|
// Make sure that we wrote the correct number of bytes
|
||||||
assert((out-&mOutput[0]) == (int)outlen);
|
assert((out - buffer.data()) == static_cast<int>(outlen));
|
||||||
|
|
||||||
// And make extra sure the output is null terminated
|
// And make extra sure the output is null terminated
|
||||||
assert(mOutput.size() > outlen);
|
assert(buffer.size() >= outlen);
|
||||||
assert(mOutput[outlen] == 0);
|
assert(buffer[outlen] == 0);
|
||||||
|
|
||||||
return std::string_view(mOutput.data(), outlen);
|
return std::string_view(buffer.data(), outlen);
|
||||||
}
|
|
||||||
|
|
||||||
// Make sure the output vector is large enough for 'size' bytes,
|
|
||||||
// including a terminating zero after it.
|
|
||||||
void Utf8Encoder::resize(size_t size)
|
|
||||||
{
|
|
||||||
if (mOutput.size() <= size)
|
|
||||||
// Add some extra padding to reduce the chance of having to resize
|
|
||||||
// again later.
|
|
||||||
mOutput.resize(3*size);
|
|
||||||
|
|
||||||
// And make sure the string is zero terminated
|
|
||||||
mOutput[size] = 0;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Get the total length length needed to decode the given string with
|
/** Get the total length length needed to decode the given string with
|
||||||
|
@ -183,7 +180,7 @@ void Utf8Encoder::resize(size_t size)
|
||||||
is the case, then the ascii parameter is set to true, and the
|
is the case, then the ascii parameter is set to true, and the
|
||||||
caller can optimize for this case.
|
caller can optimize for this case.
|
||||||
*/
|
*/
|
||||||
std::pair<std::size_t, bool> Utf8Encoder::getLength(std::string_view input) const
|
std::pair<std::size_t, bool> StatelessUtf8Encoder::getLength(std::string_view input) const
|
||||||
{
|
{
|
||||||
// Do away with the ascii part of the string first (this is almost
|
// Do away with the ascii part of the string first (this is almost
|
||||||
// always the entire string.)
|
// always the entire string.)
|
||||||
|
@ -201,7 +198,7 @@ std::pair<std::size_t, bool> Utf8Encoder::getLength(std::string_view input) cons
|
||||||
{
|
{
|
||||||
// Find the translated length of this character in the
|
// Find the translated length of this character in the
|
||||||
// lookup table.
|
// lookup table.
|
||||||
len += translationArray[static_cast<unsigned char>(*it) * 6];
|
len += mTranslationArray[static_cast<unsigned char>(*it) * 6];
|
||||||
++it;
|
++it;
|
||||||
}
|
}
|
||||||
while (it != input.end() && *it != 0);
|
while (it != input.end() && *it != 0);
|
||||||
|
@ -211,7 +208,7 @@ std::pair<std::size_t, bool> Utf8Encoder::getLength(std::string_view input) cons
|
||||||
|
|
||||||
// Translate one character 'ch' using the translation array 'arr', and
|
// Translate one character 'ch' using the translation array 'arr', and
|
||||||
// advance the output pointer accordingly.
|
// advance the output pointer accordingly.
|
||||||
void Utf8Encoder::copyFromArray(unsigned char ch, char* &out) const
|
void StatelessUtf8Encoder::copyFromArray(unsigned char ch, char* &out) const
|
||||||
{
|
{
|
||||||
// Optimize for ASCII values
|
// Optimize for ASCII values
|
||||||
if (ch < 128)
|
if (ch < 128)
|
||||||
|
@ -220,13 +217,13 @@ void Utf8Encoder::copyFromArray(unsigned char ch, char* &out) const
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
const signed char *in = translationArray + ch*6;
|
const signed char *in = &mTranslationArray[ch * 6];
|
||||||
int len = *(in++);
|
int len = *(in++);
|
||||||
memcpy(out, in, len);
|
memcpy(out, in, len);
|
||||||
out += len;
|
out += len;
|
||||||
}
|
}
|
||||||
|
|
||||||
std::pair<std::size_t, bool> Utf8Encoder::getLengthLegacyEnc(std::string_view input) const
|
std::pair<std::size_t, bool> StatelessUtf8Encoder::getLengthLegacyEnc(std::string_view input) const
|
||||||
{
|
{
|
||||||
// Do away with the ascii part of the string first (this is almost
|
// Do away with the ascii part of the string first (this is almost
|
||||||
// always the entire string.)
|
// always the entire string.)
|
||||||
|
@ -271,7 +268,7 @@ std::pair<std::size_t, bool> Utf8Encoder::getLengthLegacyEnc(std::string_view in
|
||||||
return {len, false};
|
return {len, false};
|
||||||
}
|
}
|
||||||
|
|
||||||
void Utf8Encoder::copyFromArrayLegacyEnc(std::string_view::iterator& chp, std::string_view::iterator end, char* &out) const
|
void StatelessUtf8Encoder::copyFromArrayLegacyEnc(std::string_view::iterator& chp, std::string_view::iterator end, char* &out) const
|
||||||
{
|
{
|
||||||
unsigned char ch = *(chp++);
|
unsigned char ch = *(chp++);
|
||||||
// Optimize for ASCII values
|
// Optimize for ASCII values
|
||||||
|
@ -316,7 +313,7 @@ void Utf8Encoder::copyFromArrayLegacyEnc(std::string_view::iterator& chp, std::s
|
||||||
|
|
||||||
for (int i = 128; i < 256; i++)
|
for (int i = 128; i < 256; i++)
|
||||||
{
|
{
|
||||||
unsigned char b1 = translationArray[i*6 + 1], b2 = translationArray[i*6 + 2], b3 = translationArray[i*6 + 3];
|
unsigned char b1 = mTranslationArray[i*6 + 1], b2 = mTranslationArray[i*6 + 2], b3 = mTranslationArray[i*6 + 3];
|
||||||
if (b1 == ch && b2 == ch2 && (len != 3 || b3 == ch3))
|
if (b1 == ch && b2 == ch2 && (len != 3 || b3 == ch3))
|
||||||
{
|
{
|
||||||
*(out++) = (char)i;
|
*(out++) = (char)i;
|
||||||
|
@ -329,6 +326,22 @@ void Utf8Encoder::copyFromArrayLegacyEnc(std::string_view::iterator& chp, std::s
|
||||||
*(out++) = ch; // Could not find glyph, just put whatever
|
*(out++) = ch; // Could not find glyph, just put whatever
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Utf8Encoder::Utf8Encoder(FromType sourceEncoding)
|
||||||
|
: mBuffer(50 * 1024, '\0')
|
||||||
|
, mImpl(sourceEncoding)
|
||||||
|
{
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string_view Utf8Encoder::getUtf8(std::string_view input)
|
||||||
|
{
|
||||||
|
return mImpl.getUtf8(input, BufferAllocationPolicy::UseGrowFactor, mBuffer);
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string_view Utf8Encoder::getLegacyEnc(std::string_view input)
|
||||||
|
{
|
||||||
|
return mImpl.getLegacyEnc(input, BufferAllocationPolicy::UseGrowFactor, mBuffer);
|
||||||
|
}
|
||||||
|
|
||||||
ToUTF8::FromType ToUTF8::calculateEncoding(const std::string& encodingName)
|
ToUTF8::FromType ToUTF8::calculateEncoding(const std::string& encodingName)
|
||||||
{
|
{
|
||||||
if (encodingName == "win1250")
|
if (encodingName == "win1250")
|
||||||
|
|
|
@ -18,34 +18,55 @@ namespace ToUTF8
|
||||||
CP437 // Used for fonts (*.fnt) if data files encoding is 1252. Otherwise, uses the same encoding as the data files.
|
CP437 // Used for fonts (*.fnt) if data files encoding is 1252. Otherwise, uses the same encoding as the data files.
|
||||||
};
|
};
|
||||||
|
|
||||||
|
enum class BufferAllocationPolicy
|
||||||
|
{
|
||||||
|
FitToRequiredSize,
|
||||||
|
UseGrowFactor,
|
||||||
|
};
|
||||||
|
|
||||||
FromType calculateEncoding(const std::string& encodingName);
|
FromType calculateEncoding(const std::string& encodingName);
|
||||||
std::string encodingUsingMessage(const std::string& encodingName);
|
std::string encodingUsingMessage(const std::string& encodingName);
|
||||||
|
|
||||||
// class
|
class StatelessUtf8Encoder
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
explicit StatelessUtf8Encoder(FromType sourceEncoding);
|
||||||
|
|
||||||
|
/// Convert to UTF8 from the previously given code page.
|
||||||
|
/// Returns a view to passed buffer that will be resized to fit output if it's too small.
|
||||||
|
std::string_view getUtf8(std::string_view input, BufferAllocationPolicy bufferAllocationPolicy, std::string& buffer) const;
|
||||||
|
|
||||||
|
/// Convert from UTF-8 to sourceEncoding.
|
||||||
|
/// Returns a view to passed buffer that will be resized to fit output if it's too small.
|
||||||
|
std::string_view getLegacyEnc(std::string_view input, BufferAllocationPolicy bufferAllocationPolicy, std::string& buffer) const;
|
||||||
|
|
||||||
|
private:
|
||||||
|
inline std::pair<std::size_t, bool> getLength(std::string_view input) const;
|
||||||
|
inline void copyFromArray(unsigned char chp, char* &out) const;
|
||||||
|
inline std::pair<std::size_t, bool> getLengthLegacyEnc(std::string_view input) const;
|
||||||
|
inline void copyFromArrayLegacyEnc(std::string_view::iterator& chp, std::string_view::iterator end, char* &out) const;
|
||||||
|
|
||||||
|
const std::basic_string_view<signed char> mTranslationArray;
|
||||||
|
};
|
||||||
|
|
||||||
class Utf8Encoder
|
class Utf8Encoder
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
Utf8Encoder(FromType sourceEncoding);
|
explicit Utf8Encoder(FromType sourceEncoding);
|
||||||
|
|
||||||
/// Convert to UTF8 from the previously given code page.
|
/// Convert to UTF8 from the previously given code page.
|
||||||
/// Returns a view to internal buffer invalidate by next getUtf8 or getLegacyEnc call if input is not
|
/// Returns a view to internal buffer invalidate by next getUtf8 or getLegacyEnc call if input is not
|
||||||
/// ASCII-only string. Otherwise returns a view to the input.
|
/// ASCII-only string. Otherwise returns a view to the input.
|
||||||
std::string_view getUtf8(std::string_view input);
|
std::string_view getUtf8(std::string_view input);
|
||||||
|
|
||||||
|
/// Convert from UTF-8 to sourceEncoding.
|
||||||
/// Returns a view to internal buffer invalidate by next getUtf8 or getLegacyEnc call if input is not
|
/// Returns a view to internal buffer invalidate by next getUtf8 or getLegacyEnc call if input is not
|
||||||
/// ASCII-only string. Otherwise returns a view to the input.
|
/// ASCII-only string. Otherwise returns a view to the input.
|
||||||
std::string_view getLegacyEnc(std::string_view input);
|
std::string_view getLegacyEnc(std::string_view input);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
inline void resize(std::size_t size);
|
std::string mBuffer;
|
||||||
inline std::pair<std::size_t, bool> getLength(std::string_view input) const;
|
StatelessUtf8Encoder mImpl;
|
||||||
inline void copyFromArray(unsigned char chp, char* &out) const;
|
|
||||||
inline std::pair<std::size_t, bool> getLengthLegacyEnc(std::string_view input) const;
|
|
||||||
inline void copyFromArrayLegacyEnc(std::string_view::iterator& chp, std::string_view::iterator end, char* &out) const;
|
|
||||||
|
|
||||||
std::vector<char> mOutput;
|
|
||||||
const signed char* translationArray;
|
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue