From f7b5cf6ada05e505f16320d6b6267c2651c74ac0 Mon Sep 17 00:00:00 2001
From: Kishore Nallan
Date: Mon, 14 Feb 2022 21:15:47 +0530
Subject: [PATCH] Fix prioritize exact match when q has duplicate tokens.

---
 include/match_score.h            | 17 +++++++---
 test/collection_sorting_test.cpp | 57 ++++++++++++++++++++++++++++++++
 2 files changed, 70 insertions(+), 4 deletions(-)

diff --git a/include/match_score.h b/include/match_score.h
index 7f9e6874..21e4a209 100644
--- a/include/match_score.h
+++ b/include/match_score.h
@@ -219,11 +219,14 @@ struct Match {
 
         if(check_exact_match) {
-            if(distance != token_offsets.size()-1) {
+            if(distance > token_offsets.size()-1) {
                 // we can exit early and don't have to care about other requirements
                 return;
             }
 
+            // 1) distance < num tokens when there are repeating query tokens
+            // 2) distance can be same as num tokens and still not be an exact match
+
             int last_token_index = -1;
             size_t total_offsets = 0;
 
@@ -231,15 +234,21 @@ struct Match {
                 if(token_positions.last_token && !token_positions.positions.empty()) {
                     last_token_index = token_positions.positions.back();
                 }
+
                 total_offsets += token_positions.positions.size();
-                if(total_offsets > token_offsets.size()) {
+
+                if(total_offsets > token_offsets.size() && distance == token_offsets.size()-1) {
                     // if total offsets exceed query length, there cannot possibly be an exact match
                     return;
                 }
             }
 
-            if(last_token_index == int(token_offsets.size())-1 && total_offsets == token_offsets.size()) {
-                exact_match = 1;
+            if(last_token_index == int(token_offsets.size())-1) {
+                if(total_offsets == token_offsets.size() && distance == token_offsets.size()-1) {
+                    exact_match = 1;
+                } else if(distance < token_offsets.size()-1) {
+                    exact_match = 1;
+                }
             }
         }
     }
diff --git a/test/collection_sorting_test.cpp b/test/collection_sorting_test.cpp
index 0f186fa0..4f7bdf48 100644
--- a/test/collection_sorting_test.cpp
+++ b/test/collection_sorting_test.cpp
@@ -1299,3 +1299,60 @@ TEST_F(CollectionSortingTest, TextMatchBucketRanking) {
 
     collectionManager.drop_collection("coll1");
 }
+
+TEST_F(CollectionSortingTest, RepeatingTokenRanking) {
+    std::vector<field> fields = {field("title", field_types::STRING, false),
+                                 field("points", field_types::INT32, false),};
+
+    Collection* coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get();
+
+    nlohmann::json doc1;
+    doc1["id"] = "0";
+    doc1["title"] = "Mong Mong";
+    doc1["points"] = 100;
+
+    nlohmann::json doc2;
+    doc2["id"] = "1";
+    doc2["title"] = "Mong Spencer";
+    doc2["points"] = 200;
+
+    nlohmann::json doc3;
+    doc3["id"] = "2";
+    doc3["title"] = "Mong Mong Spencer";
+    doc3["points"] = 300;
+
+    nlohmann::json doc4;
+    doc4["id"] = "3";
+    doc4["title"] = "Spencer Mong Mong";
+    doc4["points"] = 400;
+
+    ASSERT_TRUE(coll1->add(doc1.dump()).ok());
+    ASSERT_TRUE(coll1->add(doc2.dump()).ok());
+    ASSERT_TRUE(coll1->add(doc3.dump()).ok());
+    ASSERT_TRUE(coll1->add(doc4.dump()).ok());
+
+    sort_fields = {
+        sort_by("_text_match", "DESC"),
+        sort_by("points", "DESC"),
+    };
+
+    auto results = coll1->search("mong mong", {"title"},
+                                 "", {}, sort_fields, {2}, 10,
+                                 1, FREQUENCY, {true},
+                                 10, spp::sparse_hash_set<std::string>(),
+                                 spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "title", 20, {}, {}, {}, 0,
+                                 "", "", {3}, 1000, true).get();
+
+    ASSERT_EQ(4, results["hits"].size());
+    ASSERT_EQ("0", results["hits"][0]["document"]["id"].get<std::string>());
+    ASSERT_EQ("3", results["hits"][1]["document"]["id"].get<std::string>());
+    ASSERT_EQ("2", results["hits"][2]["document"]["id"].get<std::string>());
+    ASSERT_EQ("1", results["hits"][3]["document"]["id"].get<std::string>());
+
+    ASSERT_EQ(50291713, results["hits"][0]["text_match"].get<size_t>());
+    ASSERT_EQ(50291712, results["hits"][1]["text_match"].get<size_t>());
+    ASSERT_EQ(50291712, results["hits"][2]["text_match"].get<size_t>());
+    ASSERT_EQ(50291712, results["hits"][3]["text_match"].get<size_t>());
+
+    collectionManager.drop_collection("coll1");
+}
\ No newline at end of file
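
Note on the test expectations: of the four titles, only "Mong Mong" is a
token-for-token match of the query "mong mong", so it alone earns the
exact-match flag and the single higher _text_match score (50291713); the
other three tie and fall back to points DESC. The sketch below is a naive
reference predicate for that reading, not Typesense's actual implementation
(which works on per-token position lists, as in match_score.h above); the
tokenize and is_exact_match helpers are hypothetical names for illustration.

#include <algorithm>
#include <cctype>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

// Lowercase and split on whitespace: a crude stand-in for Typesense's tokenizer.
static std::vector<std::string> tokenize(const std::string& text) {
    std::istringstream in(text);
    std::vector<std::string> tokens;
    std::string token;
    while (in >> token) {
        std::transform(token.begin(), token.end(), token.begin(),
                       [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
        tokens.push_back(token);
    }
    return tokens;
}

// Naive reference semantics for the exact-match flag: the field must contain
// exactly the query's token sequence, duplicates included, and nothing else.
static bool is_exact_match(const std::string& query, const std::string& field) {
    return tokenize(query) == tokenize(field);
}

int main() {
    const std::string query = "mong mong";
    const std::vector<std::string> titles = {
        "Mong Mong", "Mong Spencer", "Mong Mong Spencer", "Spencer Mong Mong"};
    for (const auto& title : titles) {
        // Only "Mong Mong" prints 1. The pre-patch check required
        // distance == num_tokens - 1, which a query with repeating tokens
        // cannot satisfy (see the comment added in the patch), so none of
        // these titles was marked exact before the fix.
        std::cout << title << " => " << is_exact_match(query, title) << "\n";
    }
}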