From 133c64d2d273d09a9971ccb4cf06da57fd5f4458 Mon Sep 17 00:00:00 2001
From: Kishore Nallan
Date: Fri, 7 Jan 2022 13:41:28 +0530
Subject: [PATCH] Verbatim match must not overpower weight.

---
 src/index.cpp                     | 18 ++++----
 test/collection_specific_test.cpp | 73 +++++++++++++++++++++++++++++++
 2 files changed, 81 insertions(+), 10 deletions(-)

diff --git a/src/index.cpp b/src/index.cpp
index 1e6a427a..db589f3a 100644
--- a/src/index.cpp
+++ b/src/index.cpp
@@ -2287,6 +2287,10 @@ void Index::search(std::vector<query_tokens_t>& field_query_tokens,
         }
 
         //auto begin0 = std::chrono::high_resolution_clock::now();
+        /*size_t total_q_tokens = field_query_tokens[0].q_include_tokens.size();
+        for(const auto& phrase: field_query_tokens[0].q_phrases) {
+            total_q_tokens += phrase.size();
+        }*/
 
         for(auto& seq_id_kvs: topster_ids) {
             const uint64_t seq_id = seq_id_kvs.first;
@@ -2312,7 +2316,7 @@ void Index::search(std::vector<query_tokens_t>& field_query_tokens,
             uint32_t token_bits = (uint32_t(1) << 31);    // top most bit set to guarantee atleast 1 bit set
             uint64_t total_typos = 0, total_distances = 0, min_typos = 1000;
 
-            uint64_t verbatim_match_fields = 0;        // query matching field verbatim
+            uint64_t verbatim_match_fields = 0;        // field value *exactly* same as query tokens
             uint64_t exact_match_fields = 0;           // number of fields that contains all of query tokens
             uint64_t max_weighted_tokens_match = 0;    // weighted max number of tokens matched in a field
             uint64_t total_token_matches = 0;          // total matches across fields (including fuzzy ones)
@@ -2325,10 +2329,6 @@ void Index::search(std::vector<query_tokens_t>& field_query_tokens,
 
                 const size_t weight = search_fields[i].weight;
                 //LOG(INFO) << "--- field index: " << i << ", priority: " << priority;
 
-                // using `5` here because typo + prefix combo score range is: 0 - 5
-                //                                                            0    1    2
-                //                                                            0,1  2,3  4,5
-                int64_t MAX_SUM_TYPOS = 5 * field_query_tokens[i].q_include_tokens.size();
                 if(existing_field_kvs.count(field_id) != 0) {
                     // for existing field, we will simply sum field-wise weighted scores
@@ -2370,13 +2370,11 @@ void Index::search(std::vector<query_tokens_t>& field_query_tokens,
 
                         continue;
                     }
 
-                    const std::string& field = search_fields[i].name;
-                    const bool field_prefix = (i < prefixes.size()) ? prefixes[i] : prefixes[0];
-                    // compute approximate match score for this field from actual query
-
+                    const std::string& field = search_fields[i].name;
                     size_t words_present = 0;
 
+                    // FIXME: must consider phrase tokens also
                     for(size_t token_index=0; token_index < field_query_tokens[i].q_include_tokens.size(); token_index++) {
                         const auto& token = field_query_tokens[i].q_include_tokens[token_index];
                         const art_leaf* leaf = (art_leaf *) art_search(search_index.at(field), (const unsigned char*) token.c_str(),
@@ -2450,13 +2448,13 @@ void Index::search(std::vector<query_tokens_t>& field_query_tokens,
 
             // protect most significant byte from overflow, since topster uses int64_t
             verbatim_match_fields = std::min<uint64_t>(INT8_MAX, verbatim_match_fields);
+            exact_match_fields += verbatim_match_fields;
             exact_match_fields = std::min<uint64_t>(255, exact_match_fields);
             max_weighted_tokens_match = std::min<uint64_t>(255, max_weighted_tokens_match);
             total_typos = std::min<uint64_t>(255, total_typos);
             total_distances = std::min<uint64_t>(100, total_distances);
 
             uint64_t aggregated_score = (
-                (verbatim_match_fields << 56)     |  // field value *exactly* same as query tokens
                 (exact_match_fields << 48)        |  // number of fields that contain *all tokens* in the query
                 (max_weighted_tokens_match << 40) |  // weighted max number of tokens matched in a field
                 (uniq_tokens_found << 32)         |  // number of unique tokens found across fields including typos
diff --git a/test/collection_specific_test.cpp b/test/collection_specific_test.cpp
index c74dc1d8..a64d1840 100644
--- a/test/collection_specific_test.cpp
+++ b/test/collection_specific_test.cpp
@@ -2244,3 +2244,76 @@ TEST_F(CollectionSpecificTest, HandleLargeWeights) {
 
     collectionManager.drop_collection("coll1");
 }
+
+TEST_F(CollectionSpecificTest, DISABLED_ExactMatchOnAFieldIgnoresOtherFieldScores) {
+    std::vector<field> fields = {field("title", field_types::STRING, false),
+                                 field("description", field_types::STRING, false),
+                                 field("points", field_types::INT32, false),};
+
+    Collection* coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get();
+
+    nlohmann::json doc1;
+    doc1["id"] = "0";
+    doc1["title"] = "Mark Antony";
+    doc1["description"] = "Marriage Counsellor";
+    doc1["points"] = 100;
+
+    nlohmann::json doc2;
+    doc2["id"] = "1";
+    doc2["title"] = "Mark Spencer";
+    doc2["description"] = "Sales Expert";
+    doc2["points"] = 200;
+
+    ASSERT_TRUE(coll1->add(doc1.dump()).ok());
+    ASSERT_TRUE(coll1->add(doc2.dump()).ok());
+
+    auto results = coll1->search("mark", {"title", "description"},
+                                 "", {}, {}, {2, 2}, 10,
+                                 1, FREQUENCY, {true, true},
+                                 10, spp::sparse_hash_set<std::string>(),
+                                 spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "title", 20, {}, {}, {}, 0,
+                                 "", "", {3, 1}, 1000, true).get();
+
+    ASSERT_EQ(2, results["hits"].size());
+    ASSERT_EQ("1", results["hits"][0]["document"]["id"].get<std::string>());
+    ASSERT_EQ("0", results["hits"][1]["document"]["id"].get<std::string>());
+
+    collectionManager.drop_collection("coll1");
+}
+
+TEST_F(CollectionSpecificTest, VerbatimMatchShouldNotOverpowerHigherWeightedField) {
+    std::vector<field> fields = {field("title", field_types::STRING, false),
+                                 field("description", field_types::STRING, false),
+                                 field("points", field_types::INT32, false),};
+
+    Collection* coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get();
+
+    nlohmann::json doc1;
+    doc1["id"] = "0";
+    doc1["title"] = "Basketball Shoes";
+    doc1["description"] = "Basketball";
+    doc1["points"] = 100;
+
+    nlohmann::json doc2;
+    doc2["id"] = "1";
+    doc2["title"] = "Nike Jordan";
+    doc2["description"] = "Shoes";
+    doc2["points"] = 200;
+
+    ASSERT_TRUE(coll1->add(doc1.dump()).ok());
+    ASSERT_TRUE(coll1->add(doc2.dump()).ok());
+
+    auto results = coll1->search("shoes", {"title", "description"},
+                                 "", {}, {}, {2, 2}, 10,
+                                 1, FREQUENCY, {true, true},
+                                 10, spp::sparse_hash_set<std::string>(),
+                                 spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "title", 20, {}, {}, {}, 0,
+                                 "", "", {4, 1}, 1000, true).get();
+
+    ASSERT_EQ(2, results["hits"].size());
+    ASSERT_EQ("0", results["hits"][0]["document"]["id"].get<std::string>());
+    ASSERT_EQ("1", results["hits"][1]["document"]["id"].get<std::string>());
+
+    collectionManager.drop_collection("coll1");
+}
+
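Note (not part of the patch, added for review context): the relevance score built in Index::search() is a single 64-bit integer in which each signal is clamped to a byte and packed at a fixed bit offset, so a signal in a higher byte always dominates everything below it. Before this change a verbatim field match occupied the top byte (bit 56 and above), which meant any verbatim match outranked every other signal, including the per-field weights exercised by the new VerbatimMatchShouldNotOverpowerHigherWeightedField test. The patch drops that top byte and instead folds verbatim_match_fields into the exact_match_fields byte. The sketch below only illustrates the packing; the function name and parameter list are simplified and hypothetical, not the actual Index::search() code, and the lower tie-breaker bytes are omitted.

    #include <algorithm>
    #include <cstdint>

    // Sketch of the score packing after this patch (simplified names, assumed inputs).
    // Each signal is clamped so it cannot spill into the byte above it.
    uint64_t pack_score(uint64_t verbatim_match_fields, uint64_t exact_match_fields,
                        uint64_t max_weighted_tokens_match, uint64_t uniq_tokens_found) {
        verbatim_match_fields = std::min<uint64_t>(INT8_MAX, verbatim_match_fields);
        exact_match_fields += verbatim_match_fields;   // verbatim now only boosts this byte
        exact_match_fields = std::min<uint64_t>(255, exact_match_fields);
        max_weighted_tokens_match = std::min<uint64_t>(255, max_weighted_tokens_match);

        // Before the patch there was an additional `(verbatim_match_fields << 56)` term,
        // which sat above every other signal and could override field weights.
        return (exact_match_fields << 48) |          // fields containing all query tokens
               (max_weighted_tokens_match << 40) |   // weighted max tokens matched in a field
               (uniq_tokens_found << 32);            // unique tokens found across fields
               // (lower bytes carry further tie-breakers such as typos and distances,
               //  omitted in this sketch)
    }

With this layout, two documents that both contain all query tokens differ only in the lower bytes, so the query_by_weights of {4, 1} used in the new test can still decide the order even though the lower-weighted "description" field matches the query verbatim.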