Better highlighting for query tokens matched across fields.

This commit is contained in:
Kishore Nallan 2021-04-19 13:02:03 +05:30
parent b3b47f5651
commit 006ff75154
4 changed files with 53 additions and 6 deletions

View File

@ -334,6 +334,7 @@ private:
std::string get_seq_id_key(uint32_t seq_id) const;
void highlight_result(const field &search_field, const std::vector<std::vector<art_leaf *>> &searched_queries,
const std::vector<std::string>& q_tokens,
const KV* field_order_kv, const nlohmann::json &document,
StringUtils & string_utils,
const size_t snippet_threshold,

View File

@ -1037,7 +1037,10 @@ Option<nlohmann::json> Collection::search(const std::string & query, const std::
fields_highlighted_fully.emplace(highlight_full_field);
}
for(const std::string & field_name: search_fields) {
for(size_t i = 0; i < search_fields.size(); i++) {
const std::string& field_name = search_fields[i];
const std::vector<std::string>& q_tokens = field_query_tokens[i].q_include_tokens;
// should not pick excluded field for highlighting
if(exclude_fields.count(field_name) > 0) {
continue;
@ -1049,7 +1052,7 @@ Option<nlohmann::json> Collection::search(const std::string & query, const std::
bool highlighted_fully = (fields_highlighted_fully.find(field_name) != fields_highlighted_fully.end());
highlight_t highlight;
highlight_result(search_field, searched_queries, field_order_kv, document,
highlight_result(search_field, searched_queries, q_tokens, field_order_kv, document,
string_utils, snippet_threshold, highlight_affix_num_tokens,
highlighted_fully, highlight_start_tag, highlight_end_tag, highlight);
@ -1378,6 +1381,7 @@ bool Collection::facet_value_to_string(const facet &a_facet, const facet_count_t
void Collection::highlight_result(const field &search_field,
const std::vector<std::vector<art_leaf *>> &searched_queries,
const std::vector<std::string>& q_tokens,
const KV* field_order_kv, const nlohmann::json & document,
StringUtils & string_utils,
const size_t snippet_threshold,
@ -1412,6 +1416,24 @@ void Collection::highlight_result(const field &search_field,
}
}
if(query_suggestion.empty()) {
// can happen for compound query matched across 2 fields: try to use original query tokens
for(const std::string& q_token: q_tokens) {
Index* index = indices[field_order_kv->key % num_memory_shards];
art_leaf *actual_leaf = index->get_token_leaf(search_field.name,
reinterpret_cast<const unsigned char *>(q_token.c_str()),
q_token.size() + 1);
if(actual_leaf != nullptr) {
query_suggestion.push_back(actual_leaf);
std::vector<uint16_t> positions;
uint32_t doc_index = actual_leaf->values->ids.indexOf(field_order_kv->key);
auto doc_indices = new uint32_t[1];
doc_indices[0] = doc_index;
leaf_to_indices.push_back(doc_indices);
}
}
}
if(query_suggestion.empty()) {
// none of the tokens from the query were found on this field
free_leaf_indices(leaf_to_indices);

View File

@ -880,9 +880,9 @@ void Index::search_candidates(const uint8_t & field_id,
query_suggestion, token_bits);
/*LOG(INFO) << "n: " << n;
for(size_t i=0; i < query_suggestion.size(); i++) {
LOG(INFO) << "i: " << i << " - " << query_suggestion[i]->key << ", ids: "
<< query_suggestion[i]->values->ids.getLength() << ", total_cost: " << total_cost;
for(size_t i=0; i < actual_query_suggestion.size(); i++) {
LOG(INFO) << "i: " << i << " - " << actual_query_suggestion[i]->key << ", ids: "
<< actual_query_suggestion[i]->values->ids.getLength() << ", total_cost: " << total_cost;
}*/
// initialize results with the starting element (for further intersection)
@ -1880,7 +1880,7 @@ void Index::search_field(const uint8_t & field_id,
const std::string token_cost_hash = token + std::to_string(costs[token_index]);
std::vector<art_leaf*> leaves;
//LOG(INFO) << "\nSearching for field: " << field << ", token:" << token << " - cost: " << costs[token_index];
//LOG(INFO) << "Searching for field: " << field << ", token:" << token << " - cost: " << costs[token_index];
if(token_cost_cache.count(token_cost_hash) != 0) {
leaves = token_cost_cache[token_cost_hash];

View File

@ -3116,6 +3116,10 @@ TEST_F(CollectionTest, MultiFieldHighlighting) {
{"Best Wireless Vehicle Charger",
"Easily replenish your cell phone with this wireless charger.",
"Cell Phones > Cell Phone Accessories > Car Chargers"},
{"Annie's Song",
"John Denver",
"Album > Compilation"},
};
for(size_t i=0; i<records.size(); i++) {
@ -3152,6 +3156,26 @@ TEST_F(CollectionTest, MultiFieldHighlighting) {
ASSERT_EQ("Easily replenish your cell phone with this wireless <mark>charger.</mark>",
results["hits"][0]["highlights"][1]["snippet"].get<std::string>());
results = coll1->search("Annies song John Denver",
{"name","description"}, "", {}, {}, 0, 10, 1, FREQUENCY,
true, 1, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 40, {}, {}, {}, 0,
"<mark>", "</mark>", {1, 1}).get();
ASSERT_EQ(1, results["found"].get<size_t>());
ASSERT_EQ(1, results["hits"].size());
ASSERT_STREQ("1", results["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_EQ(2, results["hits"][0]["highlights"].size());
ASSERT_EQ("name", results["hits"][0]["highlights"][0]["field"].get<std::string>());
ASSERT_EQ("<mark>Annie's</mark> <mark>Song</mark>",
results["hits"][0]["highlights"][0]["snippet"].get<std::string>());
ASSERT_EQ("description", results["hits"][0]["highlights"][1]["field"].get<std::string>());
ASSERT_EQ("<mark>John</mark> <mark>Denver</mark>",
results["hits"][0]["highlights"][1]["snippet"].get<std::string>());
collectionManager.drop_collection("coll1");
}