Fix exact-match prioritization when the query has duplicate tokens.

Kishore Nallan 2022-02-14 21:15:47 +05:30
parent 4549e09063
commit f7b5cf6ada
2 changed files with 70 additions and 4 deletions
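
In short: `exact_match` previously required the match `distance` to equal `token_offsets.size()-1` and the total number of matched positions to equal the query length. When the query repeats a token (e.g. `mong mong`), the best window's distance can be smaller than the number of query tokens, so both checks rejected genuine exact matches. The relaxed conditions below keep that case alive while still rejecting near-misses.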


@@ -219,11 +219,14 @@ struct Match {
         if(check_exact_match) {
-            if(distance != token_offsets.size()-1) {
+            if(distance > token_offsets.size()-1) {
                 // we can exit early and don't have to care about other requirements
                 return;
             }
 
+            // 1) distance < num tokens when there are repeating query tokens
+            // 2) distance can be same as num tokens and still not be an exact match
+
             int last_token_index = -1;
             size_t total_offsets = 0;
@@ -231,15 +234,21 @@ struct Match {
                 if(token_positions.last_token && !token_positions.positions.empty()) {
                     last_token_index = token_positions.positions.back();
                 }
 
                 total_offsets += token_positions.positions.size();
-                if(total_offsets > token_offsets.size()) {
+                if(total_offsets > token_offsets.size() && distance == token_offsets.size()-1) {
                     // if total offsets exceed query length, there cannot possibly be an exact match
                     return;
                 }
             }
 
-            if(last_token_index == int(token_offsets.size())-1 && total_offsets == token_offsets.size()) {
-                exact_match = 1;
+            if(last_token_index == int(token_offsets.size())-1) {
+                if(total_offsets == token_offsets.size() && distance == token_offsets.size()-1) {
+                    exact_match = 1;
+                } else if(distance < token_offsets.size()-1) {
+                    exact_match = 1;
+                }
             }
         }
     }
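
To make the new control flow easier to follow, here is a minimal standalone sketch of the updated predicate. The function name `is_exact_match` and its flattened parameters are hypothetical (the real code iterates over `token_offsets` inside `struct Match`); the branch logic mirrors the hunk above, and the sample values in `main` are illustrative assumptions, not values traced from the engine.

```cpp
#include <cstddef>
#include <iostream>

// Hypothetical standalone version of the exact-match check above.
//   num_query_tokens : stands in for token_offsets.size()
//   last_token_index : position of the field's final token if it matched
//                      a query token, otherwise -1
//   total_offsets    : total matched positions summed across all tokens
//   distance         : spread of the best matching window
bool is_exact_match(int last_token_index, size_t total_offsets,
                    size_t distance, size_t num_query_tokens) {
    // A window wider than the query can never be an exact match.
    if(distance > num_query_tokens - 1) {
        return false;
    }

    // Excess matched positions only disqualify the match when the window
    // already spans the full query; the `&& distance == ...` added by this
    // commit is what keeps repeated-token matches alive here.
    if(total_offsets > num_query_tokens && distance == num_query_tokens - 1) {
        return false;
    }

    // The match must end on the field's last token, at the position the
    // final query token would occupy.
    if(last_token_index == int(num_query_tokens) - 1) {
        if(total_offsets == num_query_tokens && distance == num_query_tokens - 1) {
            return true;  // ordinary exact match, no repeated tokens
        }
        if(distance < num_query_tokens - 1) {
            return true;  // repeated query tokens compress the distance
        }
    }

    return false;
}

int main() {
    // Query "mong mong" vs. field "Mong Mong" (illustrative values): the
    // duplicated token collapses the distance to 0, so the old code would
    // have bailed out before ever setting exact_match.
    std::cout << is_exact_match(1, 4, 0, 2) << "\n";  // 1 -> exact

    // Query "mong mong" vs. field "Spencer Mong Mong": the field's last
    // token sits at position 2, not 1, so this is not an exact match.
    std::cout << is_exact_match(2, 4, 0, 2) << "\n";  // 0 -> not exact
}
```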


@@ -1299,3 +1299,60 @@ TEST_F(CollectionSortingTest, TextMatchBucketRanking) {
 
     collectionManager.drop_collection("coll1");
 }
+
+TEST_F(CollectionSortingTest, RepeatingTokenRanking) {
+    std::vector<field> fields = {field("title", field_types::STRING, false),
+                                 field("points", field_types::INT32, false),};
+
+    Collection* coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get();
+
+    nlohmann::json doc1;
+    doc1["id"] = "0";
+    doc1["title"] = "Mong Mong";
+    doc1["points"] = 100;
+
+    nlohmann::json doc2;
+    doc2["id"] = "1";
+    doc2["title"] = "Mong Spencer";
+    doc2["points"] = 200;
+
+    nlohmann::json doc3;
+    doc3["id"] = "2";
+    doc3["title"] = "Mong Mong Spencer";
+    doc3["points"] = 300;
+
+    nlohmann::json doc4;
+    doc4["id"] = "3";
+    doc4["title"] = "Spencer Mong Mong";
+    doc4["points"] = 400;
+
+    ASSERT_TRUE(coll1->add(doc1.dump()).ok());
+    ASSERT_TRUE(coll1->add(doc2.dump()).ok());
+    ASSERT_TRUE(coll1->add(doc3.dump()).ok());
+    ASSERT_TRUE(coll1->add(doc4.dump()).ok());
+
+    sort_fields = {
+        sort_by("_text_match", "DESC"),
+        sort_by("points", "DESC"),
+    };
+
+    auto results = coll1->search("mong mong", {"title"},
+                                 "", {}, sort_fields, {2}, 10,
+                                 1, FREQUENCY, {true},
+                                 10, spp::sparse_hash_set<std::string>(),
+                                 spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "title", 20, {}, {}, {}, 0,
+                                 "<mark>", "</mark>", {3}, 1000, true).get();
+
+    ASSERT_EQ(4, results["hits"].size());
+
+    ASSERT_EQ("0", results["hits"][0]["document"]["id"].get<std::string>());
+    ASSERT_EQ("3", results["hits"][1]["document"]["id"].get<std::string>());
+    ASSERT_EQ("2", results["hits"][2]["document"]["id"].get<std::string>());
+    ASSERT_EQ("1", results["hits"][3]["document"]["id"].get<std::string>());
+
+    ASSERT_EQ(50291713, results["hits"][0]["text_match"].get<uint32_t>());
+    ASSERT_EQ(50291712, results["hits"][1]["text_match"].get<uint32_t>());
+    ASSERT_EQ(50291712, results["hits"][2]["text_match"].get<uint32_t>());
+    ASSERT_EQ(50291712, results["hits"][3]["text_match"].get<uint32_t>());
+
+    collectionManager.drop_collection("coll1");
+}
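
The assertions encode the fix: only doc `0` ("Mong Mong") earns the exact-match bump (text_match 50291713), so it ranks first despite having the fewest points, while the three partial matches tie at 50291712 and fall back to the secondary `points` sort (400 → id 3, 300 → id 2, 200 → id 1).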