Highlight only the prefix.

Kishore Nallan 2022-01-02 15:44:20 +05:30
parent 71b57c6fd2
commit 4f961f4919
6 changed files with 174 additions and 63 deletions
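When the last token of the query matches an indexed token only as a prefix, just the matched prefix is now wrapped in the highlight tags: searching "function" against "Functions and Equations" yields "<mark>Function</mark>s and Equations" instead of highlighting the whole word. The snippet and full-value highlighting paths are consolidated into a new static Collection::highlight_text() helper, and Tokenizer::is_ascii_char() is made public so that helper can count ASCII symbol characters (e.g. the dots in "T.r.a.i.n") when deciding how many characters of a matched token to highlight.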

View File

@@ -190,7 +190,8 @@ private:
std::string get_seq_id_key(uint32_t seq_id) const;
void highlight_result(const field &search_field, const std::vector<std::vector<art_leaf *>> &searched_queries,
void highlight_result(const std::string& raw_query,
const field &search_field, const std::vector<std::vector<art_leaf *>> &searched_queries,
const std::vector<std::string>& q_tokens,
const KV* field_order_kv, const nlohmann::json &document,
StringUtils & string_utils,
@@ -432,5 +433,11 @@ public:
Option<bool> add_synonym(const synonym_t& synonym);
Option<bool> remove_synonym(const std::string & id);
static void highlight_text(const string& highlight_start_tag, const string& highlight_end_tag, const string& last_raw_q_token,
const string& text, const std::map<size_t, size_t>& token_offsets,
const std::map<size_t, std::string>& prefix_start_offsets, size_t snippet_end_offset,
std::vector<std::string>& matched_tokens, std::map<size_t, size_t>::iterator& offset_it,
std::stringstream& highlighted_text, size_t snippet_start_offset);
};

View File

@@ -47,10 +47,6 @@ private:
);
}
static inline bool is_ascii_char(char c) {
return (c & ~0x7f) == 0;
}
public:
explicit Tokenizer(const std::string& input,
@@ -77,4 +73,8 @@ public:
bool tokenize(std::string& token);
static bool is_cyrillic(const std::string& locale);
static inline bool is_ascii_char(char c) {
return (c & ~0x7f) == 0;
}
};
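is_ascii_char() moves from the private section to public so that Collection::highlight_text() can call it when counting symbol characters inside a matched token. A minimal standalone sketch of what the check does, assuming nothing beyond the function body above: a byte is ASCII exactly when its high bit is clear.

```cpp
#include <cassert>

// Same check as Tokenizer::is_ascii_char: true iff the high bit is clear.
static inline bool is_ascii_char(char c) {
    return (c & ~0x7f) == 0;
}

int main() {
    assert(is_ascii_char('T'));      // plain ASCII letter
    assert(is_ascii_char('.'));      // ASCII punctuation (a "symbol" for highlighting)
    assert(!is_ascii_char("é"[0]));  // first byte of a multi-byte UTF-8 sequence
    return 0;
}
```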

View File

@@ -1124,7 +1124,7 @@ Option<nlohmann::json> Collection::search(const std::string & raw_query, const s
/*if(document["title"] == "Quantum Quest: A Cassini Space Odyssey") {
LOG(INFO) << "here!";
}*/
highlight_result(search_field, searched_queries, q_tokens, field_order_kv, document,
highlight_result(raw_query, search_field, searched_queries, q_tokens, field_order_kv, document,
string_utils, snippet_threshold, highlight_affix_num_tokens,
highlighted_fully, highlight_start_tag, highlight_end_tag, highlight);
//LOG(INFO) << "End";
@@ -1588,7 +1588,7 @@ bool Collection::facet_value_to_string(const facet &a_facet, const facet_count_t
return true;
}
void Collection::highlight_result(const field &search_field,
void Collection::highlight_result(const std::string& raw_query, const field &search_field,
const std::vector<std::vector<art_leaf *>> &searched_queries,
const std::vector<std::string>& q_tokens,
const KV* field_order_kv, const nlohmann::json & document,
@@ -1600,9 +1600,23 @@ void Collection::highlight_result(const field &search_field,
const std::string& highlight_end_tag,
highlight_t & highlight) const {
if(q_tokens.size() == 1 && q_tokens[0] == "*") {
return;
}
std::vector<art_leaf*> query_suggestion;
std::set<std::string> query_suggestion_tokens;
bool is_cyrillic = Tokenizer::is_cyrillic(search_field.locale);
bool normalise = is_cyrillic ? false : true;
std::vector<std::string> raw_query_tokens;
Tokenizer(raw_query, normalise, false, search_field.locale, symbols_to_index, token_separators).tokenize(raw_query_tokens);
const std::string& last_raw_q_token = raw_query_tokens.back();
const std::string& last_q_token = q_tokens.back();
std::set<std::string> last_full_q_tokens;
size_t qindex = 0;
do {
@@ -1616,7 +1630,8 @@ void Collection::highlight_result(const field &search_field,
(field_order_kv->query_indices == nullptr) ? searched_queries[field_order_kv->query_index] :
searched_queries[field_order_kv->query_indices[qindex + 1]];
for (art_leaf* token_leaf : searched_query) {
for (size_t i = 0; i < searched_query.size(); i++) {
art_leaf* token_leaf = searched_query[i];
std::string token(reinterpret_cast<char*>(token_leaf->key), token_leaf->key_len - 1);
if(query_suggestion_tokens.count(token) != 0) {
@@ -1631,6 +1646,10 @@ void Collection::highlight_result(const field &search_field,
query_suggestion.push_back(actual_leaf);
query_suggestion_tokens.insert(token);
//LOG(INFO) << "field: " << search_field.name << ", key: " << token;
if(i == searched_query.size()-1 &&
(q_tokens.size() == searched_query.size() || token.rfind(last_q_token, 0) == 0)) {
last_full_q_tokens.insert(token);
}
}
}
@@ -1689,8 +1708,6 @@ void Collection::highlight_result(const field &search_field,
<< ", match.distance: " << size_t(this_match.distance);*/
}
const std::string& prefix_token = q_tokens.back();
if(match_indices.empty()) {
// none of the tokens from the query were found on this field
// let's try to look only for prefix matches
@@ -1743,8 +1760,6 @@ void Collection::highlight_result(const field &search_field,
text = document[search_field.name][match_index.index];
}
bool is_cyrillic = Tokenizer::is_cyrillic(search_field.locale);
bool normalise = is_cyrillic ? false : true;
Tokenizer tokenizer(text, normalise, false, search_field.locale, symbols_to_index, token_separators);
// word tokenizer is a secondary tokenizer used for specific languages that require transliteration
@@ -1756,6 +1771,7 @@ void Collection::highlight_result(const field &search_field,
// need an ordered map here to ensure that it is ordered by the key (start offset)
std::map<size_t, size_t> token_offsets;
std::map<size_t, std::string> prefix_start_offsets;
int match_offset_index = 0;
std::string raw_token;
@@ -1811,11 +1827,15 @@ void Collection::highlight_result(const field &search_field,
found_first_match = true;
} else if(query_suggestion_tokens.find(raw_token) != query_suggestion_tokens.end() ||
raw_token.rfind(prefix_token, 0) == 0) {
raw_token.rfind(last_raw_q_token, 0) == 0) {
token_offsets.emplace(tok_start, tok_end);
token_hits.insert(raw_token);
}
if(last_full_q_tokens.find(raw_token) != last_full_q_tokens.end()) {
prefix_start_offsets.emplace(tok_start, raw_token);
}
if(raw_token_index >= last_valid_offset + highlight_affix_num_tokens) {
// register end of highlight snippet
if(snippet_end_offset == text.size() - 1) {
@@ -1841,39 +1861,23 @@ void Collection::highlight_result(const field &search_field,
continue;
}
if(highlighted_fully || raw_token_index <= snippet_threshold-1) {
if(raw_token_index <= snippet_threshold-1) {
// fully highlight a field whose token count is below the given snippet threshold
snippet_start_offset = 0;
snippet_end_offset = text.size() - 1;
}
// `token_offsets` has a list of ranges to target for highlighting
auto offset_it = token_offsets.begin();
std::stringstream highlighted_text;
// tokens from query might occur before actual snippet start offset: we skip that
auto offset_it = token_offsets.begin();
while(offset_it != token_offsets.end() && offset_it->first < snippet_start_offset) {
offset_it++;
}
for(size_t i = snippet_start_offset; i <= snippet_end_offset; i++) {
if(offset_it != token_offsets.end()) {
if (i == offset_it->first) {
highlighted_text << highlight_start_tag;
matched_tokens.push_back(text.substr(i, (offset_it->second - i) + 1));
}
if (i == offset_it->second) {
highlighted_text << text[i];
highlighted_text << highlight_end_tag;
offset_it++;
continue;
}
}
highlighted_text << text[i];
}
std::stringstream highlighted_text;
highlight_text(highlight_start_tag, highlight_end_tag, last_raw_q_token, text, token_offsets,
prefix_start_offsets, snippet_end_offset, matched_tokens, offset_it,
highlighted_text, snippet_start_offset);
highlight.snippets.push_back(highlighted_text.str());
if(search_field.type == field_types::STRING_ARRAY) {
@@ -1883,24 +1887,10 @@ void Collection::highlight_result(const field &search_field,
if(highlighted_fully) {
std::stringstream value_stream;
offset_it = token_offsets.begin();
for(size_t i = 0; i < text.size(); i++) {
if(offset_it != token_offsets.end()) {
if (i == offset_it->first) {
value_stream << highlight_start_tag;
}
if (i == offset_it->second) {
value_stream << text[i];
value_stream << highlight_end_tag;
offset_it++;
continue;
}
}
value_stream << text[i];
}
std::vector<std::string> full_matched_tokens;
highlight_text(highlight_start_tag, highlight_end_tag, last_raw_q_token, text, token_offsets,
prefix_start_offsets, text.size()-1, full_matched_tokens, offset_it,
value_stream, 0);
highlight.values.push_back(value_stream.str());
}
}
@@ -1909,6 +1899,59 @@ void Collection::highlight_result(const field &search_field,
highlight.match_score = match_indices[0].match_score;
}
void Collection::highlight_text(const string& highlight_start_tag, const string& highlight_end_tag,
const string& last_raw_q_token, const string& text,
const std::map<size_t, size_t>& token_offsets,
const std::map<size_t, std::string>& prefix_start_offsets,
size_t snippet_end_offset, std::vector<std::string>& matched_tokens,
std::map<size_t, size_t>::iterator& offset_it,
std::stringstream& highlighted_text,
size_t snippet_start_offset) {
while(snippet_start_offset <= snippet_end_offset) {
if(offset_it != token_offsets.end()) {
if (snippet_start_offset == offset_it->first) {
highlighted_text << highlight_start_tag;
const std::string& text_token = text.substr(snippet_start_offset, (offset_it->second - snippet_start_offset) + 1);
matched_tokens.push_back(text_token);
size_t token_len = offset_it->second - snippet_start_offset + 1;
if(prefix_start_offsets.find(offset_it->first) != prefix_start_offsets.end() &&
last_raw_q_token.size() < token_len) {
// if length diff is within 2, we still might not want to highlight partially in some cases
// e.g. "samsng" vs "samsung" -> full highlight is preferred
bool within_two_chars = (abs(int64_t(last_raw_q_token.size()) - (int64_t)token_len) <= 2);
if(!within_two_chars || last_raw_q_token.back() == text_token[last_raw_q_token.size()-1]) {
// also account for the presence of ASCII symbols in the text
size_t num_symbols = 0;
for(size_t j = 0; j < text_token.size(); j++) {
char c = text_token[j];
if(Tokenizer::is_ascii_char(c) && !isalnum(c)) {
num_symbols++;
}
}
token_len = std::min(token_len, last_raw_q_token.size() + num_symbols);
}
}
for(size_t j = 0; j < token_len; j++) {
highlighted_text << text[snippet_start_offset + j];
}
highlighted_text << highlight_end_tag;
offset_it++;
snippet_start_offset += token_len;
continue;
}
}
highlighted_text << text[snippet_start_offset];
snippet_start_offset++;
}
}
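The prefix-trimming rule above is the heart of the change. Below is a condensed, standalone sketch of just that rule; highlight_len is a hypothetical name for illustration, and the real code additionally requires the token's start offset to be present in prefix_start_offsets (i.e. the token matched the last query token as a prefix) before trimming.

```cpp
#include <algorithm>
#include <cctype>
#include <cstdint>
#include <cstdlib>
#include <iostream>
#include <string>

static inline bool is_ascii_char(char c) {
    return (c & ~0x7f) == 0;
}

// How many characters of `text_token` should be wrapped in highlight tags,
// given `last_raw_q_token`, the last token of the raw query.
size_t highlight_len(const std::string& last_raw_q_token, const std::string& text_token) {
    size_t token_len = text_token.size();
    if(last_raw_q_token.size() >= token_len) {
        return token_len;  // query token is not a strict prefix: highlight fully
    }
    // When the length difference is within 2 chars, a partial highlight can look
    // wrong for typo matches (e.g. "samsng" vs "samsung"), so trim only if the
    // last query char also lines up with the same position in the token.
    bool within_two_chars = (std::abs(int64_t(last_raw_q_token.size()) - int64_t(token_len)) <= 2);
    if(!within_two_chars || last_raw_q_token.back() == text_token[last_raw_q_token.size() - 1]) {
        // ASCII symbols inside the token (e.g. the dots in "t.r.a.i.n") lengthen
        // it without adding matched letters, so they extend the highlight budget.
        size_t num_symbols = 0;
        for(char c : text_token) {
            if(is_ascii_char(c) && !isalnum(c)) {
                num_symbols++;
            }
        }
        token_len = std::min(token_len, last_raw_q_token.size() + num_symbols);
    }
    return token_len;
}

int main() {
    std::cout << highlight_len("basker", "baskervilles") << "\n";  // 6 -> "<mark>Basker</mark>villes"
    std::cout << highlight_len("samsng", "samsung") << "\n";       // 7 -> full highlight preferred
    std::cout << highlight_len("train", "t.r.a.i.n") << "\n";      // 9 -> stays fully highlighted
    return 0;
}
```

Running it prints 6, 7 and 9: "basker" trims "baskervilles" to the six matched characters, the near-miss "samsng" keeps "samsung" fully highlighted, and the dots in "t.r.a.i.n" extend the budget so the token stays fully highlighted.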
Option<nlohmann::json> Collection::get(const std::string & id) const {
std::string seq_id_str;
StoreStatus seq_id_status = store->get(get_doc_id_key(id), seq_id_str);

View File

@@ -104,7 +104,7 @@ TEST_F(CollectionLocaleTest, SearchAgainstChineseText) {
ASSERT_EQ(1, results["found"].get<size_t>());
ASSERT_EQ(1, results["hits"].size());
ASSERT_EQ("0", results["hits"][0]["document"]["id"].get<std::string>());
ASSERT_EQ("爱<mark>并</mark>会因时间而", results["hits"][0]["highlights"][0]["snippet"].get<std::string>());
ASSERT_EQ("爱<mark>并</mark>会因时间而", results["hits"][0]["highlights"][0]["snippet"].get<std::string>());
// partial token should not match as prefix when prefix is set to false
@@ -119,7 +119,7 @@ TEST_F(CollectionLocaleTest, SearchAgainstChineseText) {
ASSERT_EQ(1, results["found"].get<size_t>());
ASSERT_EQ(1, results["hits"].size());
ASSERT_EQ("3", results["hits"][0]["document"]["id"].get<std::string>());
ASSERT_EQ("看誰先跑到小山丘<mark>上</mark>。<mark>媽</mark>總是第", results["hits"][0]["highlights"][0]["snippet"].get<std::string>());
ASSERT_EQ("看誰先跑到小山丘<mark>上</mark>。<mark>媽</mark>總是第", results["hits"][0]["highlights"][0]["snippet"].get<std::string>());
// search using simplified chinese
@@ -129,7 +129,7 @@ TEST_F(CollectionLocaleTest, SearchAgainstChineseText) {
ASSERT_EQ(1, results["found"].get<size_t>());
ASSERT_EQ(1, results["hits"].size());
ASSERT_EQ("3", results["hits"][0]["document"]["id"].get<std::string>());
ASSERT_EQ("看誰先跑到小山丘上。<mark>媽</mark>總是第", results["hits"][0]["highlights"][0]["snippet"].get<std::string>());
ASSERT_EQ("看誰先跑到小山丘上。<mark>媽</mark>總是第", results["hits"][0]["highlights"][0]["snippet"].get<std::string>());
}
TEST_F(CollectionLocaleTest, SearchAgainstThaiText) {
@@ -546,9 +546,12 @@ TEST_F(CollectionLocaleTest, SearchAndFacetSearchForGreekText) {
doc["title"] = "Εμφάνιση κάθε μέρα.";
ASSERT_TRUE(coll1->add(doc.dump()).ok());
auto results = coll1->search("Εμφάν", {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}).get();
auto results = coll1->search("Εμφάν", {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true},
10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "title").get();
ASSERT_EQ(1, results["hits"].size());
ASSERT_EQ("<mark>Εμφάνιση</mark> κάθε μέρα.", results["hits"][0]["highlights"][0]["snippet"].get<std::string>());
ASSERT_EQ("<mark>Εμφάν</mark>ιση κάθε μέρα.", results["hits"][0]["highlights"][0]["snippet"].get<std::string>());
ASSERT_EQ("<mark>Εμφάν</mark>ιση κάθε μέρα.", results["hits"][0]["highlights"][0]["value"].get<std::string>());
// with typo
@@ -596,11 +599,14 @@ TEST_F(CollectionLocaleTest, SearchOnCyrillicTextWithSpecialCharacters) {
auto results = coll1->search("отсутствие", {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true},
10, spp::sparse_hash_set<std::string>(), spp::sparse_hash_set<std::string>(),
10, "", 10).get();
10, "", 10, 4, "title").get();
ASSERT_EQ(1, results["hits"].size());
ASSERT_EQ("скромности. Посыл, среди которых <mark>отсутствие</mark> мобильного страшное.",
results["hits"][0]["highlights"][0]["snippet"].get<std::string>());
ASSERT_EQ("«Сирый», «несчастный», «никчёмный» — принятое особ, сейчас, впрочем, оттенок скромности. "
"Посыл, среди которых <mark>отсутствие</mark> мобильного страшное.",
results["hits"][0]["highlights"][0]["value"].get<std::string>());
results = coll1->search("принятое", {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}).get();

View File

@@ -687,7 +687,7 @@ TEST_F(CollectionSpecificTest, HighlightSecondaryFieldWithPrefixMatch) {
ASSERT_EQ(2, results["hits"][0]["highlights"].size());
ASSERT_EQ("<mark>Functions</mark> and Equations",
ASSERT_EQ("<mark>Function</mark>s and Equations",
results["hits"][0]["highlights"][0]["snippet"].get<std::string>());
ASSERT_EQ("Use a <mark>function</mark> to solve an equation.",
@@ -2043,6 +2043,57 @@ TEST_F(CollectionSpecificTest, EmptyArrayShouldBeAcceptedAsFirstValue) {
collectionManager.drop_collection("coll1");
}
TEST_F(CollectionSpecificTest, SimplePrefixQueryHighlight) {
std::vector<field> fields = {field("title", field_types::STRING, false),
field("points", field_types::INT32, false),};
Collection* coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get();
nlohmann::json doc1;
doc1["id"] = "0";
doc1["title"] = "The Hound of the Baskervilles";
doc1["points"] = 100;
ASSERT_TRUE(coll1->add(doc1.dump()).ok());
auto results = coll1->search("basker", {"title"},
"", {}, {}, {2}, 10,
1, FREQUENCY, {true},
10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "title", 20, {}, {}, {}, 0,
"<mark>", "</mark>", {}, 1000, true).get();
ASSERT_EQ(1, results["hits"].size());
ASSERT_EQ("The Hound of the <mark>Basker</mark>villes", results["hits"][0]["highlights"][0]["snippet"].get<std::string>());
ASSERT_EQ("The Hound of the <mark>Basker</mark>villes", results["hits"][0]["highlights"][0]["value"].get<std::string>());
results = coll1->search("bassker", {"title"},
"", {}, {}, {2}, 10,
1, FREQUENCY, {true},
10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "title", 20, {}, {}, {}, 0,
"<mark>", "</mark>", {}, 1000, true).get();
ASSERT_EQ(1, results["hits"].size());
ASSERT_EQ("The Hound of the <mark>Baskerv</mark>illes", results["hits"][0]["highlights"][0]["snippet"].get<std::string>());
ASSERT_EQ("The Hound of the <mark>Baskerv</mark>illes", results["hits"][0]["highlights"][0]["value"].get<std::string>());
// multiple tokens with typo in prefix
results = coll1->search("hound of bassker", {"title"},
"", {}, {}, {2}, 10,
1, FREQUENCY, {true},
10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "title", 20, {}, {}, {}, 0,
"<mark>", "</mark>", {}, 1000, true).get();
ASSERT_EQ(1, results["hits"].size());
ASSERT_EQ("The <mark>Hound</mark> <mark>of</mark> the <mark>Baskerv</mark>illes", results["hits"][0]["highlights"][0]["snippet"].get<std::string>());
ASSERT_EQ("The <mark>Hound</mark> <mark>of</mark> the <mark>Baskerv</mark>illes", results["hits"][0]["highlights"][0]["value"].get<std::string>());
collectionManager.drop_collection("coll1");
}
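Note the second and third assertions: "bassker" is seven characters, so after typo correction the highlight covers seven characters of "Baskervilles" ("<mark>Baskerv</mark>illes"), and in the multi-token query only the final token is treated as a prefix, while "Hound" and "of" stay fully highlighted.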
TEST_F(CollectionSpecificTest, PhraseSearch) {
Collection *coll1;
std::vector<field> fields = {field("title", field_types::STRING, false, true),
@@ -2085,7 +2136,7 @@ TEST_F(CollectionSpecificTest, PhraseSearch) {
results = coll1->search(R"("by the" -train)", {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {false}, 10).get();
ASSERT_EQ(1, results["hits"].size());
ASSERT_EQ("0", results["hits"][0]["document"]["id"].get<std::string>());
ASSERT_EQ("<mark>Then</mark> and <mark>there</mark> <mark>by</mark> <mark>the</mark> down", results["hits"][0]["highlights"][0]["snippet"].get<std::string>());
ASSERT_EQ("Then and there <mark>by</mark> <mark>the</mark> down", results["hits"][0]["highlights"][0]["snippet"].get<std::string>());
// exclusion of an entire phrase
results = coll1->search(R"(-"by the down")", {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {false}, 10).get();

View File

@@ -3555,7 +3555,7 @@ TEST_F(CollectionTest, MultiFieldHighlighting) {
results["hits"][0]["highlights"][1]["snippet"].get<std::string>());
ASSERT_EQ("categories", results["hits"][0]["highlights"][2]["field"].get<std::string>());
ASSERT_EQ("Car <mark>Chargers</mark>", results["hits"][0]["highlights"][2]["snippets"][0].get<std::string>());
ASSERT_EQ("Car <mark>Charger</mark>s", results["hits"][0]["highlights"][2]["snippets"][0].get<std::string>());
results = coll1->search("John With Denver",
{"description"}, "", {}, {}, {0}, 10, 1, FREQUENCY,
@@ -3809,13 +3809,17 @@ TEST_F(CollectionTest, HighlightWithAccentedCharacters) {
ASSERT_STREQ("à", results["hits"][0]["highlights"][0]["matched_tokens"][0].get<std::string>().c_str());
ASSERT_STREQ("jour", results["hits"][0]["highlights"][0]["matched_tokens"][1].get<std::string>().c_str());
results = coll1->search("by train", {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY).get();
results = coll1->search("by train", {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY,
{true}, 10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "title").get();
ASSERT_EQ(1, results["found"].get<size_t>());
ASSERT_EQ(1, results["hits"].size());
ASSERT_STREQ("Down There <mark>by</mark> the <mark>T.r.a.i.n</mark>",
results["hits"][0]["highlights"][0]["snippet"].get<std::string>().c_str());
ASSERT_STREQ("Down There <mark>by</mark> the <mark>T.r.a.i.n</mark>",
results["hits"][0]["highlights"][0]["value"].get<std::string>().c_str());
results = coll1->search("state trooper", {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY).get();