Ensure that highlighting is on the best matched field.

kishorenc 2021-03-26 19:21:22 +05:30
parent 3159712ca3
commit a87d108aa5
3 changed files with 88 additions and 4 deletions
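In essence: when a document matches the query in more than one field, its per-field result entries are sorted by match score before highlighting, so the snippet comes from the field that matched best. A minimal sketch of that idea follows; MiniKV and the field names and scores are invented for illustration and merely stand in for Typesense's actual Topster::KV entries and Topster::is_greater comparator.

    // Sketch only: MiniKV is a hypothetical stand-in for the real per-field result entry.
    #include <algorithm>
    #include <cstdint>
    #include <iostream>
    #include <string>
    #include <vector>

    struct MiniKV {
        std::string field;     // which field this entry came from
        int64_t match_score;   // higher means a better match
    };

    int main() {
        // One document that matched the query in three different fields (invented scores).
        std::vector<MiniKV> kvs = {
            {"company_name", 112}, {"field_a", 48}, {"country", 565}
        };

        // Order the entries by score, descending -- the same effect the real code gets
        // by sorting KV pointers with Topster::is_greater.
        std::sort(kvs.begin(), kvs.end(), [](const MiniKV& a, const MiniKV& b) {
            return a.match_score > b.match_score;
        });

        // kvs[0] now names the best matched field; highlighting starts there.
        std::cout << "highlight field: " << kvs[0].field << "\n";
        return 0;
    }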


@@ -862,7 +862,7 @@ Option<nlohmann::json> Collection::search(const std::string & query, const std::
    std::unique_lock<std::mutex> lock_process(m_process);
    cv_process.wait(lock_process, [&](){ return num_processed == num_indices; });

    // for grouping we have re-aggregate
    // for grouping we have to re-aggregate
    const size_t topster_size = std::max((size_t)1, max_hits);
    Topster topster(topster_size, group_limit);
@@ -1391,8 +1391,9 @@ void Collection::highlight_result(const field &search_field,
        Index* index = indices[field_order_kv->key % num_memory_shards];
        art_leaf *actual_leaf = index->get_token_leaf(search_field.name, &token_leaf->key[0], token_leaf->key_len);
        //LOG(INFO) << "field: " << search_field.name << ", key: " << token_leaf->key;

        if(actual_leaf != nullptr) {
            //LOG(INFO) << "field: " << search_field.name << ", key: " << actual_leaf->key;
            query_suggestion.push_back(actual_leaf);
            std::vector<uint16_t> positions;
            uint32_t doc_index = actual_leaf->values->ids.indexOf(field_order_kv->key);
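For context on the null check above: a token only contributes to highlighting if it actually has a leaf in the index of the field being highlighted. A rough sketch of that guard, using a hypothetical nested map in place of the real per-field ART index:

    // Sketch only: a nested map stands in for the per-field ART index.
    #include <cstdint>
    #include <iostream>
    #include <string>
    #include <unordered_map>
    #include <vector>

    struct MiniLeaf {
        std::vector<uint32_t> ids;  // documents containing this token in this field
    };

    int main() {
        // field name -> (token -> leaf)
        std::unordered_map<std::string, std::unordered_map<std::string, MiniLeaf>> field_index;
        field_index["country"]["canada"] = MiniLeaf{{0, 7, 9}};

        std::vector<const MiniLeaf*> query_suggestion;
        const std::string field = "company_name";  // field being highlighted
        const std::string token = "canada";

        auto field_it = field_index.find(field);
        if(field_it != field_index.end()) {
            auto leaf_it = field_it->second.find(token);
            if(leaf_it != field_it->second.end()) {  // analogous to actual_leaf != nullptr
                query_suggestion.push_back(&leaf_it->second);
            }
        }

        // "canada" was never indexed under company_name, so it adds nothing to highlight.
        std::cout << "leaves usable for highlighting: " << query_suggestion.size() << "\n";
        return 0;
    }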


@@ -1700,9 +1700,11 @@ void Index::search(const std::vector<std::string>& q_include_tokens,
        }
    }

    for(const auto& seq_id_kvs: topster_ids) {
    for(auto& seq_id_kvs: topster_ids) {
        const uint64_t seq_id = seq_id_kvs.first;
        const auto& kvs = seq_id_kvs.second; // each `kv` can be from a different field
        auto& kvs = seq_id_kvs.second; // each `kv` can be from a different field

        std::sort(kvs.begin(), kvs.end(), Topster::is_greater);

        // LOG(INFO) << "DOC ID: " << seq_id << ", score: " << kvs[0]->scores[kvs[0]->match_score_index];
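Presumably the const is dropped on seq_id_kvs and kvs precisely so that std::sort can reorder the entries in place: after the sort, kvs[0] is the highest-scoring entry for that document (as the commented-out log line suggests), which is what lets highlighting pick the best matched field, as in the sketch near the top of this commit.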


@@ -3001,6 +3001,87 @@ TEST_F(CollectionTest, MultiFieldRelevance5) {
ASSERT_STREQ("2", results["hits"][1]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("1", results["hits"][2]["document"]["id"].get<std::string>().c_str());
results = coll1->search("Canada",
{"company_name","field_a","country"}, "", {}, {}, 2, 10, 1, FREQUENCY,
true, 10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 40, {}, {}, {}, 0,
"<mark>", "</mark>", {1, 1, 1}).get();
ASSERT_EQ(3, results["found"].get<size_t>());
ASSERT_EQ(3, results["hits"].size());
ASSERT_STREQ("0", results["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("2", results["hits"][1]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("1", results["hits"][2]["document"]["id"].get<std::string>().c_str());
ASSERT_EQ(1, results["hits"][0]["highlights"].size());
ASSERT_EQ("country", results["hits"][0]["highlights"][0]["field"].get<std::string>());
ASSERT_EQ("<mark>Canada</mark>", results["hits"][0]["highlights"][0]["snippet"].get<std::string>());
ASSERT_EQ(1, results["hits"][1]["highlights"].size());
ASSERT_EQ("field_a", results["hits"][1]["highlights"][0]["field"].get<std::string>());
ASSERT_EQ("<mark>Canadoo</mark>", results["hits"][1]["highlights"][0]["snippet"].get<std::string>());
ASSERT_EQ(1, results["hits"][2]["highlights"].size());
ASSERT_EQ("company_name", results["hits"][2]["highlights"][0]["field"].get<std::string>());
ASSERT_EQ("<mark>Canaida</mark> Corp", results["hits"][2]["highlights"][0]["snippet"].get<std::string>());
collectionManager.drop_collection("coll1");
}
TEST_F(CollectionTest, MultiFieldHighlighting) {
    Collection *coll1;

    std::vector<field> fields = {field("name", field_types::STRING, false),
                                 field("description", field_types::STRING, false),
                                 field("categories", field_types::STRING_ARRAY, false),
                                 field("points", field_types::INT32, false)};

    coll1 = collectionManager.get_collection("coll1").get();
    if(coll1 == nullptr) {
        coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get();
    }

    std::vector<std::vector<std::string>> records = {
        {"Best Wireless Vehicle Charger",
         "Easily replenish your cell phone with this wireless charger.",
         "Cell Phones > Cell Phone Accessories > Car Chargers"},
    };

    for(size_t i=0; i<records.size(); i++) {
        nlohmann::json doc;

        std::vector<std::string> categories;
        StringUtils::split(records[i][2], categories, ">");

        doc["id"] = std::to_string(i);
        doc["name"] = records[i][0];
        doc["description"] = records[i][1];
        doc["categories"] = categories;
        doc["points"] = i;

        ASSERT_TRUE(coll1->add(doc.dump()).ok());
    }

    auto results = coll1->search("charger",
                                 {"name","description","categories"}, "", {}, {}, 2, 10, 1, FREQUENCY,
                                 true, 10, spp::sparse_hash_set<std::string>(),
                                 spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 40, {}, {}, {}, 0,
                                 "<mark>", "</mark>", {1, 1, 1}).get();

    ASSERT_EQ(1, results["found"].get<size_t>());
    ASSERT_EQ(1, results["hits"].size());
    ASSERT_STREQ("0", results["hits"][0]["document"]["id"].get<std::string>().c_str());

    ASSERT_EQ(2, results["hits"][0]["highlights"].size());
    ASSERT_EQ("name", results["hits"][0]["highlights"][0]["field"].get<std::string>());
    ASSERT_EQ("Best Wireless Vehicle <mark>Charger</mark>",
              results["hits"][0]["highlights"][0]["snippet"].get<std::string>());
    ASSERT_EQ("description", results["hits"][0]["highlights"][1]["field"].get<std::string>());
    ASSERT_EQ("Easily replenish your cell phone with this wireless <mark>charger</mark>.",
              results["hits"][0]["highlights"][1]["snippet"].get<std::string>());

    collectionManager.drop_collection("coll1");
}