mirror of
https://github.com/OpenMW/openmw.git
synced 2025-11-30 00:34:32 +00:00
Merge branch 'keywordsoup' into 'master'
Fix KeywordSearch Closes #8760 See merge request OpenMW/openmw!4960
This commit is contained in:
commit
429305401e
3 changed files with 77 additions and 136 deletions
|
|
@ -1,9 +1,7 @@
|
||||||
#ifndef GAME_MWDIALOGUE_KEYWORDSEARCH_H
|
#ifndef GAME_MWDIALOGUE_KEYWORDSEARCH_H
|
||||||
#define GAME_MWDIALOGUE_KEYWORDSEARCH_H
|
#define GAME_MWDIALOGUE_KEYWORDSEARCH_H
|
||||||
|
|
||||||
#include <algorithm> // std::reverse
|
#include <algorithm>
|
||||||
#include <cctype>
|
|
||||||
#include <cstring>
|
|
||||||
#include <map>
|
#include <map>
|
||||||
#include <stdexcept>
|
#include <stdexcept>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
@ -14,7 +12,7 @@
|
||||||
namespace MWDialogue
|
namespace MWDialogue
|
||||||
{
|
{
|
||||||
|
|
||||||
template <typename value_t>
|
template <typename Value>
|
||||||
class KeywordSearch
|
class KeywordSearch
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
|
|
@ -24,14 +22,14 @@ namespace MWDialogue
|
||||||
{
|
{
|
||||||
Point mBeg;
|
Point mBeg;
|
||||||
Point mEnd;
|
Point mEnd;
|
||||||
value_t mValue;
|
Value mValue;
|
||||||
};
|
};
|
||||||
|
|
||||||
void seed(std::string_view keyword, value_t value)
|
void seed(std::string_view keyword, Value value)
|
||||||
{
|
{
|
||||||
if (keyword.empty())
|
if (keyword.empty())
|
||||||
return;
|
return;
|
||||||
seed_impl(keyword, std::move(value), 0, mRoot);
|
buildTrie(keyword, value, 0, mRoot);
|
||||||
}
|
}
|
||||||
|
|
||||||
void clear()
|
void clear()
|
||||||
|
|
@ -40,37 +38,24 @@ namespace MWDialogue
|
||||||
mRoot.mKeyword.clear();
|
mRoot.mKeyword.clear();
|
||||||
}
|
}
|
||||||
|
|
||||||
bool containsKeyword(std::string_view keyword, value_t& value)
|
bool containsKeyword(std::string_view keyword, Value& value) const
|
||||||
{
|
{
|
||||||
auto it = keyword.begin();
|
const Entry* current = &mRoot;
|
||||||
auto current = mRoot.mChildren.find(Misc::StringUtils::toLower(*it));
|
for (char c : keyword)
|
||||||
if (current == mRoot.mChildren.end())
|
|
||||||
return false;
|
|
||||||
else if (Misc::StringUtils::ciEqual(current->second.mKeyword, keyword))
|
|
||||||
{
|
{
|
||||||
value = current->second.mValue;
|
auto it = current->mChildren.find(Misc::StringUtils::toLower(c));
|
||||||
return true;
|
if (it == current->mChildren.end())
|
||||||
}
|
|
||||||
|
|
||||||
for (++it; it != keyword.end(); ++it)
|
|
||||||
{
|
|
||||||
auto next = current->second.mChildren.find(Misc::StringUtils::toLower(*it));
|
|
||||||
if (next == current->second.mChildren.end())
|
|
||||||
return false;
|
return false;
|
||||||
if (Misc::StringUtils::ciEqual(next->second.mKeyword, keyword))
|
else if (Misc::StringUtils::ciEqual(it->second.mKeyword, keyword))
|
||||||
{
|
{
|
||||||
value = next->second.mValue;
|
value = it->second.mValue;
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
current = next;
|
current = &it->second;
|
||||||
}
|
}
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool isWordSeparator(char c) { return std::strchr("\n\r \t'\"", c) != nullptr; }
|
|
||||||
|
|
||||||
static bool sortMatches(const Match& left, const Match& right) { return left.mBeg < right.mBeg; }
|
|
||||||
|
|
||||||
void highlightKeywords(Point beg, Point end, std::vector<Match>& out) const
|
void highlightKeywords(Point beg, Point end, std::vector<Match>& out) const
|
||||||
{
|
{
|
||||||
std::vector<Match> matches;
|
std::vector<Match> matches;
|
||||||
|
|
@ -80,87 +65,39 @@ namespace MWDialogue
|
||||||
{
|
{
|
||||||
Point prev = i;
|
Point prev = i;
|
||||||
--prev;
|
--prev;
|
||||||
if (!isWordSeparator(*prev))
|
constexpr std::string_view wordSeparators = "\n\r \t'\"";
|
||||||
|
if (wordSeparators.find(*prev) == std::string_view::npos)
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
// check first character
|
const Entry* current = &mRoot;
|
||||||
typename Entry::childen_t::const_iterator candidate
|
for (Point it = i; it != end; ++it)
|
||||||
= mRoot.mChildren.find(Misc::StringUtils::toLower(*i));
|
|
||||||
|
|
||||||
// no match, on to next character
|
|
||||||
if (candidate == mRoot.mChildren.end())
|
|
||||||
continue;
|
|
||||||
|
|
||||||
// see how far the match goes
|
|
||||||
Point j = i;
|
|
||||||
|
|
||||||
// some keywords might be longer variations of other keywords, so we definitely need a list of
|
|
||||||
// candidates the first element in the pair is length of the match, i.e. depth from the first character
|
|
||||||
std::vector<typename std::pair<std::ptrdiff_t, typename Entry::childen_t::const_iterator>> candidates;
|
|
||||||
|
|
||||||
while ((j + 1) != end)
|
|
||||||
{
|
{
|
||||||
typename Entry::childen_t::const_iterator next
|
auto found = current->mChildren.find(Misc::StringUtils::toLower(*it));
|
||||||
= candidate->second.mChildren.find(Misc::StringUtils::toLower(*++j));
|
if (found == current->mChildren.end())
|
||||||
|
|
||||||
if (next == candidate->second.mChildren.end())
|
|
||||||
{
|
|
||||||
if (candidate->second.mKeyword.size() > 0)
|
|
||||||
candidates.push_back(std::make_pair((j - i), candidate));
|
|
||||||
break;
|
break;
|
||||||
}
|
current = &found->second;
|
||||||
|
if (!current->mKeyword.empty())
|
||||||
candidate = next;
|
|
||||||
|
|
||||||
if (candidate->second.mKeyword.size() > 0)
|
|
||||||
candidates.push_back(std::make_pair((j - i), candidate));
|
|
||||||
}
|
|
||||||
|
|
||||||
if (candidates.empty())
|
|
||||||
continue; // didn't match enough to disambiguate, on to next character
|
|
||||||
|
|
||||||
// shorter candidates will be added to the vector first. however, we want to check against longer
|
|
||||||
// candidates first
|
|
||||||
std::reverse(candidates.begin(), candidates.end());
|
|
||||||
|
|
||||||
for (const auto& [pos, c] : candidates)
|
|
||||||
{
|
|
||||||
candidate = c;
|
|
||||||
// try to match the rest of the keyword
|
|
||||||
Point k = i + pos;
|
|
||||||
Point t = candidate->second.mKeyword.begin() + (k - i);
|
|
||||||
|
|
||||||
while (k != end && t != candidate->second.mKeyword.end())
|
|
||||||
{
|
{
|
||||||
if (Misc::StringUtils::toLower(*k) != Misc::StringUtils::toLower(*t))
|
std::string_view remainingText(it + 1, end);
|
||||||
break;
|
std::string_view remainingKeyword = std::string_view(current->mKeyword).substr(it + 1 - i);
|
||||||
|
if (Misc::StringUtils::ciStartsWith(remainingText, remainingKeyword))
|
||||||
++k, ++t;
|
{
|
||||||
|
Match match;
|
||||||
|
match.mValue = current->mValue;
|
||||||
|
match.mBeg = i;
|
||||||
|
match.mEnd = i + current->mKeyword.size();
|
||||||
|
matches.push_back(match);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// didn't match full keyword, try the next candidate
|
|
||||||
if (t != candidate->second.mKeyword.end())
|
|
||||||
continue;
|
|
||||||
|
|
||||||
// found a keyword, but there might still be longer keywords that start somewhere _within_ this
|
|
||||||
// keyword we will resolve these overlapping keywords later, choosing the longest one in case of
|
|
||||||
// conflict
|
|
||||||
Match match;
|
|
||||||
match.mValue = candidate->second.mValue;
|
|
||||||
match.mBeg = i;
|
|
||||||
match.mEnd = k;
|
|
||||||
matches.push_back(match);
|
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// resolve overlapping keywords
|
// resolve overlapping keywords
|
||||||
while (!matches.empty())
|
while (!matches.empty())
|
||||||
{
|
{
|
||||||
std::size_t longestKeywordSize = 0;
|
std::size_t longestKeywordSize = 0;
|
||||||
typename std::vector<Match>::iterator longestKeyword = matches.begin();
|
auto longestKeyword = matches.begin();
|
||||||
for (typename std::vector<Match>::iterator it = matches.begin(); it != matches.end(); ++it)
|
for (auto it = matches.begin(); it != matches.end(); ++it)
|
||||||
{
|
{
|
||||||
std::size_t size = it->mEnd - it->mBeg;
|
std::size_t size = it->mEnd - it->mBeg;
|
||||||
if (size > longestKeywordSize)
|
if (size > longestKeywordSize)
|
||||||
|
|
@ -169,8 +106,7 @@ namespace MWDialogue
|
||||||
longestKeyword = it;
|
longestKeyword = it;
|
||||||
}
|
}
|
||||||
|
|
||||||
typename std::vector<Match>::iterator next = it;
|
auto next = it + 1;
|
||||||
++next;
|
|
||||||
|
|
||||||
if (next == matches.end())
|
if (next == matches.end())
|
||||||
break;
|
break;
|
||||||
|
|
@ -185,61 +121,55 @@ namespace MWDialogue
|
||||||
matches.erase(longestKeyword);
|
matches.erase(longestKeyword);
|
||||||
out.push_back(keyword);
|
out.push_back(keyword);
|
||||||
// erase anything that overlaps with the keyword we just added to the output
|
// erase anything that overlaps with the keyword we just added to the output
|
||||||
for (typename std::vector<Match>::iterator it = matches.begin(); it != matches.end();)
|
std::erase_if(matches,
|
||||||
{
|
[&](const Match& match) { return match.mBeg < keyword.mEnd && match.mEnd > keyword.mBeg; });
|
||||||
if (it->mBeg < keyword.mEnd && it->mEnd > keyword.mBeg)
|
|
||||||
it = matches.erase(it);
|
|
||||||
else
|
|
||||||
++it;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
std::sort(out.begin(), out.end(), sortMatches);
|
std::sort(
|
||||||
|
out.begin(), out.end(), [](const Match& left, const Match& right) { return left.mBeg < right.mBeg; });
|
||||||
}
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
struct Entry
|
struct Entry
|
||||||
{
|
{
|
||||||
typedef std::map<wchar_t, Entry> childen_t;
|
|
||||||
|
|
||||||
std::string mKeyword;
|
std::string mKeyword;
|
||||||
value_t mValue;
|
Value mValue;
|
||||||
childen_t mChildren;
|
std::map<char, Entry> mChildren;
|
||||||
};
|
};
|
||||||
|
|
||||||
void seed_impl(std::string_view keyword, value_t value, size_t depth, Entry& entry)
|
void buildTrie(std::string_view keyword, Value value, size_t depth, Entry& entry)
|
||||||
{
|
{
|
||||||
auto ch = Misc::StringUtils::toLower(keyword.at(depth));
|
const char ch = Misc::StringUtils::toLower(keyword[depth]);
|
||||||
|
const auto found = entry.mChildren.find(ch);
|
||||||
|
|
||||||
typename Entry::childen_t::iterator j = entry.mChildren.find(ch);
|
if (found == entry.mChildren.end())
|
||||||
|
|
||||||
if (j == entry.mChildren.end())
|
|
||||||
{
|
{
|
||||||
entry.mChildren[ch].mValue = std::move(value);
|
entry.mChildren[ch].mValue = value;
|
||||||
entry.mChildren[ch].mKeyword = keyword;
|
entry.mChildren[ch].mKeyword = keyword;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
if (j->second.mKeyword.size() > 0)
|
if (!found->second.mKeyword.empty())
|
||||||
{
|
{
|
||||||
if (keyword == j->second.mKeyword)
|
std::string_view existingKeyword = found->second.mKeyword;
|
||||||
|
if (Misc::StringUtils::ciEqual(keyword, existingKeyword))
|
||||||
throw std::runtime_error("duplicate keyword inserted");
|
throw std::runtime_error("duplicate keyword inserted");
|
||||||
|
if (depth >= existingKeyword.size())
|
||||||
const auto& pushKeyword = j->second.mKeyword;
|
|
||||||
|
|
||||||
if (depth >= pushKeyword.size())
|
|
||||||
throw std::runtime_error("unexpected");
|
throw std::runtime_error("unexpected");
|
||||||
|
// Turn this Entry into a branch and append a leaf to hold its current value
|
||||||
if (depth + 1 < pushKeyword.size())
|
if (depth + 1 < existingKeyword.size())
|
||||||
{
|
{
|
||||||
seed_impl(pushKeyword, j->second.mValue, depth + 1, j->second);
|
buildTrie(existingKeyword, found->second.mValue, depth + 1, found->second);
|
||||||
j->second.mKeyword.clear();
|
found->second.mKeyword.clear();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (depth + 1 == keyword.size())
|
if (depth + 1 == keyword.size())
|
||||||
j->second.mKeyword = value;
|
{
|
||||||
|
found->second.mValue = value;
|
||||||
|
found->second.mKeyword = keyword;
|
||||||
|
}
|
||||||
else // depth+1 < keyword.size()
|
else // depth+1 < keyword.size()
|
||||||
seed_impl(keyword, std::move(value), depth + 1, j->second);
|
buildTrie(keyword, value, depth + 1, found->second);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1173,14 +1173,8 @@ namespace MWWorld
|
||||||
{
|
{
|
||||||
mKeywordSearch.clear();
|
mKeywordSearch.clear();
|
||||||
|
|
||||||
std::vector<std::string> keywordList;
|
for (const ESM::Dialogue& topic : *this)
|
||||||
keywordList.reserve(getSize());
|
mKeywordSearch.seed(topic.mStringId, 0 /*unused*/);
|
||||||
for (const auto& it : *this)
|
|
||||||
keywordList.push_back(Misc::StringUtils::lowerCase(it.mStringId));
|
|
||||||
sort(keywordList.begin(), keywordList.end());
|
|
||||||
|
|
||||||
for (const auto& it : keywordList)
|
|
||||||
mKeywordSearch.seed(it, 0 /*unused*/);
|
|
||||||
|
|
||||||
mKeywordSearchModFlag = false;
|
mKeywordSearchModFlag = false;
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -164,3 +164,20 @@ TEST_F(KeywordSearchTest, keyword_test_french_substrings)
|
||||||
|
|
||||||
EXPECT_EQ(matches.size(), 0);
|
EXPECT_EQ(matches.size(), 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
TEST_F(KeywordSearchTest, keyword_test_single_char_strings)
|
||||||
|
{
|
||||||
|
// It should be possible to match a single character
|
||||||
|
MWDialogue::KeywordSearch<int> search;
|
||||||
|
search.seed("AB", 0);
|
||||||
|
search.seed("a", 0);
|
||||||
|
|
||||||
|
std::string text = "a ab";
|
||||||
|
|
||||||
|
std::vector<MWDialogue::KeywordSearch<int>::Match> matches;
|
||||||
|
search.highlightKeywords(text.begin(), text.end(), matches);
|
||||||
|
|
||||||
|
EXPECT_EQ(matches.size(), 2);
|
||||||
|
EXPECT_EQ(std::string(matches[0].mBeg, matches[0].mEnd), "a");
|
||||||
|
EXPECT_EQ(std::string(matches[1].mBeg, matches[1].mEnd), "ab");
|
||||||
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue