Ensure that highlighting is on the best matched field.

kishorenc 2021-03-26 19:21:22 +05:30
parent 3159712ca3
commit a87d108aa5
3 changed files with 88 additions and 4 deletions
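In essence: when a document matches the query in more than one field, its per-field result entries are sorted by match score before highlighting, so the snippet comes from the field that matched best. A minimal sketch of that idea follows; MiniKV and the field names and scores are invented for illustration and merely stand in for Typesense's actual Topster::KV entries and Topster::is_greater comparator.

    // Sketch only: MiniKV is a hypothetical stand-in for the real per-field result entry.
    #include <algorithm>
    #include <cstdint>
    #include <iostream>
    #include <string>
    #include <vector>

    struct MiniKV {
        std::string field;     // which field this entry came from
        int64_t match_score;   // higher means a better match
    };

    int main() {
        // One document that matched the query in three different fields (invented scores).
        std::vector<MiniKV> kvs = {
            {"company_name", 112}, {"field_a", 48}, {"country", 565}
        };

        // Order the entries by score, descending -- the same effect the real code gets
        // by sorting KV pointers with Topster::is_greater.
        std::sort(kvs.begin(), kvs.end(), [](const MiniKV& a, const MiniKV& b) {
            return a.match_score > b.match_score;
        });

        // kvs[0] now names the best matched field; highlighting starts there.
        std::cout << "highlight field: " << kvs[0].field << "\n";
        return 0;
    }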


@@ -862,7 +862,7 @@ Option<nlohmann::json> Collection::search(const std::string & query, const std::
    std::unique_lock<std::mutex> lock_process(m_process);
    cv_process.wait(lock_process, [&](){ return num_processed == num_indices; });

    // for grouping we have re-aggregate
    // for grouping we have to re-aggregate
    const size_t topster_size = std::max((size_t)1, max_hits);
    Topster topster(topster_size, group_limit);
@@ -1391,8 +1391,9 @@ void Collection::highlight_result(const field &search_field,
        Index* index = indices[field_order_kv->key % num_memory_shards];
        art_leaf *actual_leaf = index->get_token_leaf(search_field.name, &token_leaf->key[0], token_leaf->key_len);
        //LOG(INFO) << "field: " << search_field.name << ", key: " << token_leaf->key;

        if(actual_leaf != nullptr) {
            //LOG(INFO) << "field: " << search_field.name << ", key: " << actual_leaf->key;
            query_suggestion.push_back(actual_leaf);
            std::vector<uint16_t> positions;
            uint32_t doc_index = actual_leaf->values->ids.indexOf(field_order_kv->key);
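For context on the null check above: a token only contributes to highlighting if it actually has a leaf in the index of the field being highlighted. A rough sketch of that guard, using a hypothetical nested map in place of the real per-field ART index:

    // Sketch only: a nested map stands in for the per-field ART index.
    #include <cstdint>
    #include <iostream>
    #include <string>
    #include <unordered_map>
    #include <vector>

    struct MiniLeaf {
        std::vector<uint32_t> ids;  // documents containing this token in this field
    };

    int main() {
        // field name -> (token -> leaf)
        std::unordered_map<std::string, std::unordered_map<std::string, MiniLeaf>> field_index;
        field_index["country"]["canada"] = MiniLeaf{{0, 7, 9}};

        std::vector<const MiniLeaf*> query_suggestion;
        const std::string field = "company_name";  // field being highlighted
        const std::string token = "canada";

        auto field_it = field_index.find(field);
        if(field_it != field_index.end()) {
            auto leaf_it = field_it->second.find(token);
            if(leaf_it != field_it->second.end()) {  // analogous to actual_leaf != nullptr
                query_suggestion.push_back(&leaf_it->second);
            }
        }

        // "canada" was never indexed under company_name, so it adds nothing to highlight.
        std::cout << "leaves usable for highlighting: " << query_suggestion.size() << "\n";
        return 0;
    }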


@@ -1700,9 +1700,11 @@ void Index::search(const std::vector<std::string>& q_include_tokens,
        }
    }

    for(const auto& seq_id_kvs: topster_ids) {
    for(auto& seq_id_kvs: topster_ids) {
        const uint64_t seq_id = seq_id_kvs.first;
        const auto& kvs = seq_id_kvs.second; // each `kv` can be from a different field
        auto& kvs = seq_id_kvs.second; // each `kv` can be from a different field

        std::sort(kvs.begin(), kvs.end(), Topster::is_greater);

        // LOG(INFO) << "DOC ID: " << seq_id << ", score: " << kvs[0]->scores[kvs[0]->match_score_index];
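Presumably the const is dropped on seq_id_kvs and kvs precisely so that std::sort can reorder the entries in place: after the sort, kvs[0] is the highest-scoring entry for that document (as the commented-out log line suggests), which is what lets highlighting pick the best matched field, as in the sketch near the top of this commit.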


@@ -3001,6 +3001,87 @@ TEST_F(CollectionTest, MultiFieldRelevance5) {
ASSERT_STREQ("2", results["hits"][1]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("1", results["hits"][2]["document"]["id"].get<std::string>().c_str());
results = coll1->search("Canada",
{"company_name","field_a","country"}, "", {}, {}, 2, 10, 1, FREQUENCY,
true, 10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 40, {}, {}, {}, 0,
"<mark>", "</mark>", {1, 1, 1}).get();
ASSERT_EQ(3, results["found"].get<size_t>());
ASSERT_EQ(3, results["hits"].size());
ASSERT_STREQ("0", results["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("2", results["hits"][1]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("1", results["hits"][2]["document"]["id"].get<std::string>().c_str());
ASSERT_EQ(1, results["hits"][0]["highlights"].size());
ASSERT_EQ("country", results["hits"][0]["highlights"][0]["field"].get<std::string>());
ASSERT_EQ("<mark>Canada</mark>", results["hits"][0]["highlights"][0]["snippet"].get<std::string>());
ASSERT_EQ(1, results["hits"][1]["highlights"].size());
ASSERT_EQ("field_a", results["hits"][1]["highlights"][0]["field"].get<std::string>());
ASSERT_EQ("<mark>Canadoo</mark>", results["hits"][1]["highlights"][0]["snippet"].get<std::string>());
ASSERT_EQ(1, results["hits"][2]["highlights"].size());
ASSERT_EQ("company_name", results["hits"][2]["highlights"][0]["field"].get<std::string>());
ASSERT_EQ("<mark>Canaida</mark> Corp", results["hits"][2]["highlights"][0]["snippet"].get<std::string>());
collectionManager.drop_collection("coll1");
}
TEST_F(CollectionTest, MultiFieldHighlighting) {
    Collection *coll1;

    std::vector<field> fields = {field("name", field_types::STRING, false),
                                 field("description", field_types::STRING, false),
                                 field("categories", field_types::STRING_ARRAY, false),
                                 field("points", field_types::INT32, false)};

    coll1 = collectionManager.get_collection("coll1").get();
    if(coll1 == nullptr) {
        coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get();
    }

    std::vector<std::vector<std::string>> records = {
        {"Best Wireless Vehicle Charger",
         "Easily replenish your cell phone with this wireless charger.",
         "Cell Phones > Cell Phone Accessories > Car Chargers"},
    };

    for(size_t i=0; i<records.size(); i++) {
        nlohmann::json doc;

        std::vector<std::string> categories;
        StringUtils::split(records[i][2], categories, ">");

        doc["id"] = std::to_string(i);
        doc["name"] = records[i][0];
        doc["description"] = records[i][1];
        doc["categories"] = categories;
        doc["points"] = i;

        ASSERT_TRUE(coll1->add(doc.dump()).ok());
    }

    auto results = coll1->search("charger",
                                 {"name","description","categories"}, "", {}, {}, 2, 10, 1, FREQUENCY,
                                 true, 10, spp::sparse_hash_set<std::string>(),
                                 spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 40, {}, {}, {}, 0,
                                 "<mark>", "</mark>", {1, 1, 1}).get();

    ASSERT_EQ(1, results["found"].get<size_t>());
    ASSERT_EQ(1, results["hits"].size());
    ASSERT_STREQ("0", results["hits"][0]["document"]["id"].get<std::string>().c_str());

    ASSERT_EQ(2, results["hits"][0]["highlights"].size());
    ASSERT_EQ("name", results["hits"][0]["highlights"][0]["field"].get<std::string>());
    ASSERT_EQ("Best Wireless Vehicle <mark>Charger</mark>",
              results["hits"][0]["highlights"][0]["snippet"].get<std::string>());
    ASSERT_EQ("description", results["hits"][0]["highlights"][1]["field"].get<std::string>());
    ASSERT_EQ("Easily replenish your cell phone with this wireless <mark>charger</mark>.",
              results["hits"][0]["highlights"][1]["snippet"].get<std::string>());

    collectionManager.drop_collection("coll1");
}