From 358b7ad3ec7d27f4a8291adde7f19eac904d3ddc Mon Sep 17 00:00:00 2001 From: elsid Date: Mon, 22 May 2023 18:31:08 +0200 Subject: [PATCH] Keep ESM4 localized strings in memory The size of the files is on the order of megabytes at most. Storing an offset lookup table to read from the file on demand is less efficient than reading from memory at such sizes. Read and store offsets first. Sort them to read values sequentially. Memoize the last offset and value to avoid reading the same value twice. Use seek only when the current position does not match the offset. Optimize seeks over short distances by calling read instead. --- apps/esmtool/tes4.cpp | 3 +- apps/openmw/mwworld/esmloader.cpp | 5 +- components/esm4/reader.cpp | 221 ++++++++++++++++++------------ components/esm4/reader.hpp | 47 +++---- 4 files changed, 155 insertions(+), 121 deletions(-) diff --git a/apps/esmtool/tes4.cpp b/apps/esmtool/tes4.cpp index 904040428d..8760e33dfb 100644 --- a/apps/esmtool/tes4.cpp +++ b/apps/esmtool/tes4.cpp @@ -526,8 +526,7 @@ namespace EsmTool try { const ToUTF8::StatelessUtf8Encoder encoder(ToUTF8::calculateEncoding(info.encoding)); - ESM4::Reader reader(std::move(stream), info.filename); - reader.setEncoder(&encoder); + ESM4::Reader reader(std::move(stream), info.filename, nullptr, &encoder); const Params params(info); if (!params.mQuite) diff --git a/apps/openmw/mwworld/esmloader.cpp b/apps/openmw/mwworld/esmloader.cpp index 9190d53ea5..e586a4c204 100644 --- a/apps/openmw/mwworld/esmloader.cpp +++ b/apps/openmw/mwworld/esmloader.cpp @@ -64,9 +64,8 @@ namespace MWWorld } case ESM::Format::Tes4: { - ESM4::Reader readerESM4( - std::move(stream), filepath, MWBase::Environment::get().getResourceSystem()->getVFS()); - readerESM4.setEncoder(mReaders.getStatelessEncoder()); + ESM4::Reader readerESM4(std::move(stream), filepath, + MWBase::Environment::get().getResourceSystem()->getVFS(), mReaders.getStatelessEncoder()); readerESM4.setModIndex(index); readerESM4.updateModIndices(mNameToIndex); 
mStore.loadESM4(readerESM4); diff --git a/components/esm4/reader.cpp b/components/esm4/reader.cpp index ff578b23a4..8ce5b1f4d2 100644 --- a/components/esm4/reader.cpp +++ b/components/esm4/reader.cpp @@ -46,6 +46,7 @@ #include #include +#include #include #include #include @@ -56,6 +57,23 @@ namespace ESM4 { + namespace + { + std::u8string_view getStringsSuffix(LocalizedStringType type) + { + switch (type) + { + case LocalizedStringType::Strings: + return u8"_English.STRINGS"; + case LocalizedStringType::ILStrings: + return u8"_English.ILSTRINGS"; + case LocalizedStringType::DLStrings: + return u8"_English.DLSTRINGS"; + } + + throw std::logic_error("Unsupported LocalizedStringType: " + std::to_string(static_cast(type))); + } + } ReaderContext::ReaderContext() : modIndex(0) @@ -72,9 +90,10 @@ namespace ESM4 subRecordHeader.dataSize = 0; } - Reader::Reader(Files::IStreamPtr&& esmStream, const std::filesystem::path& filename, VFS::Manager const* vfs) + Reader::Reader(Files::IStreamPtr&& esmStream, const std::filesystem::path& filename, VFS::Manager const* vfs, + const ToUTF8::StatelessUtf8Encoder* encoder) : mVFS(vfs) - , mEncoder(nullptr) + , mEncoder(encoder) , mFileSize(0) , mStream(std::move(esmStream)) { @@ -209,58 +228,120 @@ namespace ESM4 if ((mHeader.mFlags & Rec_ESM) == 0 || (mHeader.mFlags & Rec_Localized) == 0) return; - const auto filename = mCtx.filename.stem().filename().u8string(); + const std::u8string prefix = mCtx.filename.stem().filename().u8string(); - static const std::filesystem::path s("Strings"); - buildLStringIndex(s / (filename + u8"_English.STRINGS"), Type_Strings); - buildLStringIndex(s / (filename + u8"_English.ILSTRINGS"), Type_ILStrings); - buildLStringIndex(s / (filename + u8"_English.DLSTRINGS"), Type_DLStrings); + buildLStringIndex(LocalizedStringType::Strings, prefix); + buildLStringIndex(LocalizedStringType::ILStrings, prefix); + buildLStringIndex(LocalizedStringType::DLStrings, prefix); } - void Reader::buildLStringIndex(const 
std::filesystem::path& stringFile, LocalizedStringType stringType) + void Reader::buildLStringIndex(LocalizedStringType stringType, const std::u8string& prefix) { - std::uint32_t numEntries; - std::uint32_t dataSize; - std::uint32_t stringId; - LStringOffset sp; - sp.type = stringType; + static const std::filesystem::path strings("Strings"); + const std::u8string suffix(getStringsSuffix(stringType)); + std::filesystem::path path = strings / (prefix + suffix); - // TODO: possibly check if the resource exists? - Files::IStreamPtr filestream = mVFS - ? mVFS->get(stringFile.string()) - : Files::openConstrainedFileStream(mCtx.filename.parent_path() / stringFile); - - filestream->seekg(0, std::ios::end); - std::size_t fileSize = filestream->tellg(); - filestream->seekg(0, std::ios::beg); - - std::istream* stream = filestream.get(); - switch (stringType) + if (mVFS != nullptr) { - case Type_Strings: - mStrings = std::move(filestream); - break; - case Type_ILStrings: - mILStrings = std::move(filestream); - break; - case Type_DLStrings: - mDLStrings = std::move(filestream); - break; - default: - throw std::runtime_error("ESM4::Reader::unknown localised string type"); + const Files::IStreamPtr stream = mVFS->get(Files::pathToUnicodeString(path)); + buildLStringIndex(stringType, *stream); + return; } - stream->read((char*)&numEntries, sizeof(numEntries)); - stream->read((char*)&dataSize, sizeof(dataSize)); - std::size_t dataStart = fileSize - dataSize; - for (unsigned int i = 0; i < numEntries; ++i) + const Files::IStreamPtr stream = Files::openConstrainedFileStream(mCtx.filename.parent_path() / path); + buildLStringIndex(stringType, *stream); + } + + void Reader::buildLStringIndex(LocalizedStringType stringType, std::istream& stream) + { + stream.seekg(0, std::ios::end); + const std::istream::pos_type fileSize = stream.tellg(); + stream.seekg(0, std::ios::beg); + + std::uint32_t numEntries = 0; + stream.read(reinterpret_cast(&numEntries), sizeof(numEntries)); + + 
std::uint32_t dataSize = 0; + stream.read(reinterpret_cast(&dataSize), sizeof(dataSize)); + + const std::istream::pos_type dataStart = fileSize - static_cast(dataSize); + + struct LocalizedString { - stream->read((char*)&stringId, sizeof(stringId)); - stream->read((char*)&sp.offset, sizeof(sp.offset)); - sp.offset += (std::uint32_t)dataStart; - mLStringIndex[FormId::fromUint32(stringId)] = sp; + std::uint32_t mOffset = 0; + std::uint32_t mStringId = 0; + }; + + std::vector strings; + strings.reserve(numEntries); + + for (std::uint32_t i = 0; i < numEntries; ++i) + { + LocalizedString string; + + stream.read(reinterpret_cast(&string.mStringId), sizeof(string.mStringId)); + stream.read(reinterpret_cast(&string.mOffset), sizeof(string.mOffset)); + + strings.push_back(string); } - // assert (dataStart - stream->tell() == 0 && "String file start of data section mismatch"); + + std::sort(strings.begin(), strings.end(), + [](const LocalizedString& l, const LocalizedString& r) { return l.mOffset < r.mOffset; }); + + std::uint32_t lastOffset = 0; + std::string_view lastValue; + + for (const LocalizedString& string : strings) + { + if (string.mOffset == lastOffset) + { + mLStringIndex.emplace(FormId::fromUint32(string.mStringId), lastValue); + continue; + } + + const std::istream::pos_type offset = string.mOffset + dataStart; + const std::istream::pos_type pos = stream.tellg(); + if (pos != offset) + { + char buffer[4096]; + if (pos < offset && offset - pos < static_cast(sizeof(buffer))) + stream.read(buffer, offset - pos); + else + stream.seekg(offset); + } + + const auto it + = mLStringIndex.emplace(FormId::fromUint32(string.mStringId), readLocalizedString(stringType, stream)) + .first; + lastOffset = string.mOffset; + lastValue = it->second; + } + } + + std::string Reader::readLocalizedString(LocalizedStringType type, std::istream& stream) + { + if (type == LocalizedStringType::Strings) + { + std::string data; + + while (true) + { + char ch = 0; + stream.read(&ch, 
sizeof(ch)); + if (ch == 0) + break; + data.push_back(ch); + } + + return data; + } + + std::uint32_t size = 0; + stream.read(reinterpret_cast(&size), sizeof(size)); + + std::string result; + getStringImpl(result, size, stream, true); // expect null terminated string + return result; } void Reader::getLocalizedString(std::string& str) @@ -277,48 +358,13 @@ namespace ESM4 // FIXME: very messy and probably slow/inefficient void Reader::getLocalizedStringImpl(const FormId stringId, std::string& str) { - const std::map::const_iterator it = mLStringIndex.find(stringId); + const auto it = mLStringIndex.find(stringId); - if (it != mLStringIndex.end()) - { - std::istream* filestream = nullptr; + if (it == mLStringIndex.end()) + throw std::runtime_error( + "ESM4::Reader::getLocalizedString localized string not found for " + formIdToString(stringId)); - switch (it->second.type) - { - case Type_Strings: // no string size provided - { - filestream = mStrings.get(); - filestream->seekg(it->second.offset); - - char ch; - std::vector data; - do - { - filestream->read(&ch, sizeof(ch)); - data.push_back(ch); - } while (ch != 0); - - str = std::string(data.data()); - return; - } - case Type_ILStrings: - filestream = mILStrings.get(); - break; - case Type_DLStrings: - filestream = mDLStrings.get(); - break; - default: - throw std::runtime_error("ESM4::Reader::getLocalizedString unknown string type"); - } - - // get ILStrings or DLStrings (they provide string size) - filestream->seekg(it->second.offset); - std::uint32_t size = 0; - filestream->read((char*)&size, sizeof(size)); - getStringImpl(str, size, *filestream, mEncoder, true); // expect null terminated string - } - else - throw std::runtime_error("ESM4::Reader::getLocalizedString localized string not found"); + str = it->second; } bool Reader::getRecordHeader() @@ -659,19 +705,18 @@ namespace ESM4 throw std::runtime_error(ss.str()); } - bool Reader::getStringImpl(std::string& str, std::size_t size, std::istream& stream, - const 
ToUTF8::StatelessUtf8Encoder* encoder, bool hasNull) + bool Reader::getStringImpl(std::string& str, std::size_t size, std::istream& stream, bool hasNull) { std::size_t newSize = size; - if (encoder) + if (mEncoder != nullptr) { std::string input(size, '\0'); stream.read(input.data(), size); if (stream.gcount() == static_cast(size)) { const std::string_view result - = encoder->getUtf8(input, ToUTF8::BufferAllocationPolicy::FitToRequiredSize, str); + = mEncoder->getUtf8(input, ToUTF8::BufferAllocationPolicy::FitToRequiredSize, str); if (str.empty() && !result.empty()) { str = std::move(input); diff --git a/components/esm4/reader.hpp b/components/esm4/reader.hpp index 86a52115e4..2f6dd6bfdf 100644 --- a/components/esm4/reader.hpp +++ b/components/esm4/reader.hpp @@ -131,6 +131,13 @@ namespace ESM4 ReaderContext(); }; + enum class LocalizedStringType + { + Strings, + ILStrings, + DLStrings, + }; + class Reader { VFS::Manager const* mVFS; @@ -149,24 +156,15 @@ namespace ESM4 Files::IStreamPtr mILStrings; Files::IStreamPtr mDLStrings; - enum LocalizedStringType - { - Type_Strings = 0, - Type_ILStrings = 1, - Type_DLStrings = 2 - }; - - struct LStringOffset - { - LocalizedStringType type; - std::uint32_t offset; - }; - - std::map mLStringIndex; + std::unordered_map mLStringIndex; std::vector* mGlobalReaderList = nullptr; - void buildLStringIndex(const std::filesystem::path& stringFile, LocalizedStringType stringType); + void buildLStringIndex(LocalizedStringType stringType, const std::u8string& prefix); + + void buildLStringIndex(LocalizedStringType stringType, std::istream& stream); + + std::string readLocalizedString(LocalizedStringType type, std::istream& stream); inline bool hasLocalizedStrings() const { return (mHeader.mFlags & Rec_Localized) != 0; } @@ -185,11 +183,12 @@ namespace ESM4 Reader() = default; - bool getStringImpl(std::string& str, std::size_t size, std::istream& stream, - const ToUTF8::StatelessUtf8Encoder* encoder, bool hasNull = false); + bool 
getStringImpl(std::string& str, std::size_t size, std::istream& stream, bool hasNull = false); public: - Reader(Files::IStreamPtr&& esmStream, const std::filesystem::path& filename, VFS::Manager const* vfs = nullptr); + Reader(Files::IStreamPtr&& esmStream, const std::filesystem::path& filename, VFS::Manager const* vfs, + const ToUTF8::StatelessUtf8Encoder* encoder); + ~Reader(); void open(const std::filesystem::path& filename); @@ -198,8 +197,6 @@ namespace ESM4 inline bool isEsm4() const { return true; } - inline void setEncoder(const ToUTF8::StatelessUtf8Encoder* encoder) { mEncoder = encoder; } - const std::vector& getGameFiles() const { return mHeader.mMaster; } inline int getRecordCount() const { return mHeader.mData.records; } @@ -348,14 +345,8 @@ namespace ESM4 void adjustGRUPFormId(); // Note: uses the string size from the subrecord header rather than checking null termination - bool getZString(std::string& str) - { - return getStringImpl(str, mCtx.subRecordHeader.dataSize, *mStream, mEncoder, true); - } - bool getString(std::string& str) - { - return getStringImpl(str, mCtx.subRecordHeader.dataSize, *mStream, mEncoder); - } + bool getZString(std::string& str) { return getStringImpl(str, mCtx.subRecordHeader.dataSize, *mStream, true); } + bool getString(std::string& str) { return getStringImpl(str, mCtx.subRecordHeader.dataSize, *mStream); } bool getZeroTerminatedStringArray(std::vector& values);