Dropped tokens should not be prioritized as exact matches.

This commit is contained in:
Kishore Nallan 2021-09-13 16:23:56 +05:30
parent 902704887c
commit 703110264a
3 changed files with 75 additions and 4 deletions

View File

@ -1964,6 +1964,11 @@ void Index::search(std::vector<query_tokens_t>& field_query_tokens,
// num tokens present across fields including those containing typos
int64_t uniq_tokens_found = int64_t(__builtin_popcount(token_bits)) - 1;
// verbatim match should not consider dropped-token cases
if(uniq_tokens_found != field_query_tokens[0].q_include_tokens.size()) {
verbatim_match_fields = 0;
}
verbatim_match_fields = std::min<uint64_t>(255, verbatim_match_fields);
exact_match_fields = std::min<uint64_t>(255, exact_match_fields);
max_weighted_tokens_match = std::min<uint64_t>(255, max_weighted_tokens_match);

View File

@ -307,8 +307,8 @@ TEST_F(CollectionGroupingTest, GroupingWithMultiFieldRelevance) {
ASSERT_STREQ("country", results["grouped_hits"][2]["group_key"][0].get<std::string>().c_str());
ASSERT_EQ(2, results["grouped_hits"][2]["hits"].size());
ASSERT_STREQ("3", results["grouped_hits"][2]["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("8", results["grouped_hits"][2]["hits"][1]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("8", results["grouped_hits"][2]["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("3", results["grouped_hits"][2]["hits"][1]["document"]["id"].get<std::string>().c_str());
collectionManager.drop_collection("coll1");
}

View File

@ -1282,8 +1282,8 @@ TEST_F(CollectionSpecificTest, TypoCorrectionWithFaceting) {
collectionManager.drop_collection("coll1");
}
TEST_F(CollectionSpecificTest, MultiFieldMatchesShouldBeWeighted) {
// 2 matches on low weighted fields should not overpower a single match on high weighted field
TEST_F(CollectionSpecificTest, MultiFieldVerbatimMatchesShouldBeWeighted) {
// 2 exact matches on low weighted fields should not overpower a single exact match on high weighted field
std::vector<field> fields = {field("name", field_types::STRING, false),
field("category", field_types::STRING, false),
field("label", field_types::STRING, false),
@ -1493,3 +1493,69 @@ TEST_F(CollectionSpecificTest, FacetParallelizationVerification) {
collectionManager.drop_collection("coll1");
}
TEST_F(CollectionSpecificTest, VerbatimMatchShouldConsiderTokensMatchedAcrossAllFields) {
// dropped tokens on a single field cannot be deemed as verbatim match
// Regression test for the ranking change: a document that matches only a
// subset of the query tokens in one field (the rest are dropped) must not be
// scored as a verbatim/exact match, even if it has a higher `points` value.
std::vector<field> fields = {field("name", field_types::STRING, false),
field("brand", field_types::STRING, false),
field("points", field_types::INT32, false),};
Collection* coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get();
// doc1: matches only "hamburger"; the "trader" token would have to be dropped.
// Higher points (10) than doc2 — should still NOT win.
nlohmann::json doc1;
doc1["id"] = "0";
doc1["name"] = "Hamburger";
doc1["brand"] = "Burger King";
doc1["points"] = 10;
// doc2: matches "hamburger" (name) and "trader" (brand) — all query tokens
// are covered across fields, despite lower points (5).
nlohmann::json doc2;
doc2["id"] = "1";
doc2["name"] = "Hamburger Bun";
doc2["brand"] = "Trader Joes";
doc2["points"] = 5;
ASSERT_TRUE(coll1->add(doc1.dump()).ok());
ASSERT_TRUE(coll1->add(doc2.dump()).ok());
// Search both fields with typo tolerance disabled ({0, 0} num_typos) so only
// token coverage and field weighting decide the order.
auto results = coll1->search("hamburger trader", {"name", "brand"},
"", {}, {}, {0, 0}, 10,
1, FREQUENCY, {false, false},
2, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 10, {}, {}, {}, 0,
"<mark>", "</mark>", {1, 1},
1000, true).get();
// doc2 ("1") must rank first: full cross-field token coverage beats a
// dropped-token match with higher points.
ASSERT_EQ(2, results["hits"].size());
ASSERT_EQ("1", results["hits"][0]["document"]["id"].get<std::string>());
ASSERT_EQ("0", results["hits"][1]["document"]["id"].get<std::string>());
// Second scenario with a 3-token query: doc3 covers 2/3 tokens ("potato",
// "wedges"), doc4 covers all 3 ("potato", "wedges" in name + "kfc" in brand).
nlohmann::json doc3;
doc3["id"] = "2";
doc3["name"] = "Potato Wedges";
doc3["brand"] = "McDonalds";
doc3["points"] = 10;
nlohmann::json doc4;
doc4["id"] = "3";
doc4["name"] = "Hot Potato Wedges";
doc4["brand"] = "KFC Inc.";
doc4["points"] = 5;
ASSERT_TRUE(coll1->add(doc3.dump()).ok());
ASSERT_TRUE(coll1->add(doc4.dump()).ok());
results = coll1->search("potato wedges kfc", {"name", "brand"},
"", {}, {}, {0, 0}, 10,
1, FREQUENCY, {false, false},
2, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 10, {}, {}, {}, 0,
"<mark>", "</mark>", {1, 1},
1000, true).get();
// doc4 ("3") must rank first for the same reason, despite lower points.
ASSERT_EQ(2, results["hits"].size());
ASSERT_EQ("3", results["hits"][0]["document"]["id"].get<std::string>());
ASSERT_EQ("2", results["hits"][1]["document"]["id"].get<std::string>());
collectionManager.drop_collection("coll1");
}