Verbatim match must not overpower field weight.

This commit is contained in:
Kishore Nallan 2022-01-07 13:41:28 +05:30
parent 87e2d6914f
commit 133c64d2d2
2 changed files with 81 additions and 10 deletions

View File

@ -2287,6 +2287,10 @@ void Index::search(std::vector<query_tokens_t>& field_query_tokens,
}
//auto begin0 = std::chrono::high_resolution_clock::now();
/*size_t total_q_tokens = field_query_tokens[0].q_include_tokens.size();
for(const auto& phrase: field_query_tokens[0].q_phrases) {
total_q_tokens += phrase.size();
}*/
for(auto& seq_id_kvs: topster_ids) {
const uint64_t seq_id = seq_id_kvs.first;
@ -2312,7 +2316,7 @@ void Index::search(std::vector<query_tokens_t>& field_query_tokens,
uint32_t token_bits = (uint32_t(1) << 31); // top most bit set to guarantee atleast 1 bit set
uint64_t total_typos = 0, total_distances = 0, min_typos = 1000;
uint64_t verbatim_match_fields = 0; // query matching field verbatim
uint64_t verbatim_match_fields = 0; // field value *exactly* same as query tokens
uint64_t exact_match_fields = 0; // number of fields that contains all of query tokens
uint64_t max_weighted_tokens_match = 0; // weighted max number of tokens matched in a field
uint64_t total_token_matches = 0; // total matches across fields (including fuzzy ones)
@ -2325,10 +2329,6 @@ void Index::search(std::vector<query_tokens_t>& field_query_tokens,
const size_t weight = search_fields[i].weight;
//LOG(INFO) << "--- field index: " << i << ", priority: " << priority;
// using `5` here because typo + prefix combo score range is: 0 - 5
// 0 1 2
// 0,1 2,3 4,5
int64_t MAX_SUM_TYPOS = 5 * field_query_tokens[i].q_include_tokens.size();
if(existing_field_kvs.count(field_id) != 0) {
// for existing field, we will simply sum field-wise weighted scores
@ -2370,13 +2370,11 @@ void Index::search(std::vector<query_tokens_t>& field_query_tokens,
continue;
}
const std::string& field = search_fields[i].name;
const bool field_prefix = (i < prefixes.size()) ? prefixes[i] : prefixes[0];
// compute approximate match score for this field from actual query
const std::string& field = search_fields[i].name;
size_t words_present = 0;
// FIXME: must consider phrase tokens also
for(size_t token_index=0; token_index < field_query_tokens[i].q_include_tokens.size(); token_index++) {
const auto& token = field_query_tokens[i].q_include_tokens[token_index];
const art_leaf* leaf = (art_leaf *) art_search(search_index.at(field), (const unsigned char*) token.c_str(),
@ -2450,13 +2448,13 @@ void Index::search(std::vector<query_tokens_t>& field_query_tokens,
// protect most significant byte from overflow, since topster uses int64_t
verbatim_match_fields = std::min<uint64_t>(INT8_MAX, verbatim_match_fields);
exact_match_fields += verbatim_match_fields;
exact_match_fields = std::min<uint64_t>(255, exact_match_fields);
max_weighted_tokens_match = std::min<uint64_t>(255, max_weighted_tokens_match);
total_typos = std::min<uint64_t>(255, total_typos);
total_distances = std::min<uint64_t>(100, total_distances);
uint64_t aggregated_score = (
(verbatim_match_fields << 56) | // field value *exactly* same as query tokens
(exact_match_fields << 48) | // number of fields that contain *all tokens* in the query
(max_weighted_tokens_match << 40) | // weighted max number of tokens matched in a field
(uniq_tokens_found << 32) | // number of unique tokens found across fields including typos

View File

@ -2244,3 +2244,76 @@ TEST_F(CollectionSpecificTest, HandleLargeWeights) {
collectionManager.drop_collection("coll1");
}
// An exact match on one queried field should not be drowned out by scores
// contributed by the other fields. (Currently disabled via the DISABLED_ prefix.)
TEST_F(CollectionSpecificTest, DISABLED_ExactMatchOnAFieldIgnoresOtherFieldScores) {
    // Schema: two searchable string fields plus an int32 ranking field.
    std::vector<field> schema_fields = {
        field("title", field_types::STRING, false),
        field("description", field_types::STRING, false),
        field("points", field_types::INT32, false),
    };

    Collection* collection = collectionManager.create_collection("coll1", 1, schema_fields, "points").get();

    nlohmann::json first_doc;
    first_doc["id"] = "0";
    first_doc["title"] = "Mark Antony";
    first_doc["description"] = "Marriage Counsellor";
    first_doc["points"] = 100;

    nlohmann::json second_doc;
    second_doc["id"] = "1";
    second_doc["title"] = "Mark Spencer";
    second_doc["description"] = "Sales Expert";
    second_doc["points"] = 200;

    ASSERT_TRUE(collection->add(first_doc.dump()).ok());
    ASSERT_TRUE(collection->add(second_doc.dump()).ok());

    // Both titles contain "mark"; the query uses field weights {3, 1}.
    auto search_results = collection->search("mark", {"title", "description"},
                                             "", {}, {}, {2, 2}, 10,
                                             1, FREQUENCY, {true, true},
                                             10, spp::sparse_hash_set<std::string>(),
                                             spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "title", 20, {}, {}, {}, 0,
                                             "<mark>", "</mark>", {3, 1}, 1000, true).get();

    // Expected ordering: document "1" (200 points) ahead of document "0" (100 points).
    ASSERT_EQ(2, search_results["hits"].size());
    ASSERT_EQ("1", search_results["hits"][0]["document"]["id"].get<std::string>());
    ASSERT_EQ("0", search_results["hits"][1]["document"]["id"].get<std::string>());

    collectionManager.drop_collection("coll1");
}
// A verbatim match on a low-weighted field ("description" exactly equals the
// query) must not outrank a token match on a much higher-weighted field.
TEST_F(CollectionSpecificTest, VerbatimMatchShouldNotOverpowerHigherWeightedField) {
    // Schema: two searchable string fields plus an int32 ranking field.
    std::vector<field> schema_fields = {
        field("title", field_types::STRING, false),
        field("description", field_types::STRING, false),
        field("points", field_types::INT32, false),
    };

    Collection* collection = collectionManager.create_collection("coll1", 1, schema_fields, "points").get();

    // Doc "0": query token appears in the title; description does NOT equal the query.
    nlohmann::json first_doc;
    first_doc["id"] = "0";
    first_doc["title"] = "Basketball Shoes";
    first_doc["description"] = "Basketball";
    first_doc["points"] = 100;

    // Doc "1": description is verbatim "Shoes", but title does not match.
    nlohmann::json second_doc;
    second_doc["id"] = "1";
    second_doc["title"] = "Nike Jordan";
    second_doc["description"] = "Shoes";
    second_doc["points"] = 200;

    ASSERT_TRUE(collection->add(first_doc.dump()).ok());
    ASSERT_TRUE(collection->add(second_doc.dump()).ok());

    // Field weights {4, 1}: title is weighted well above description.
    auto search_results = collection->search("shoes", {"title", "description"},
                                             "", {}, {}, {2, 2}, 10,
                                             1, FREQUENCY, {true, true},
                                             10, spp::sparse_hash_set<std::string>(),
                                             spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "title", 20, {}, {}, {}, 0,
                                             "<mark>", "</mark>", {4, 1}, 1000, true).get();

    // Expected ordering: the weighted title match ("0") beats the verbatim
    // description match ("1"), despite "1" having more points.
    ASSERT_EQ(2, search_results["hits"].size());
    ASSERT_EQ("0", search_results["hits"][0]["document"]["id"].get<std::string>());
    ASSERT_EQ("1", search_results["hits"][1]["document"]["id"].get<std::string>());

    collectionManager.drop_collection("coll1");
}