Prioritize records that have a field containing all tokens in the query.

2025-05-18 04:32:38 +08:00 · 2021-08-27 20:52:51 +05:30 · 2021-08-27 20:52:51 +05:30 · ce7b6e12e9
commit ce7b6e12e9
parent 07d838e385
2 changed files with 17 additions and 6 deletions
--- a/src/index.cpp
+++ b/src/index.cpp
@@ -945,7 +945,7 @@ void Index::search_candidates(const uint8_t & field_id, bool field_is_array,
            std::string qtok(reinterpret_cast<char*>(qleaf->key),qleaf->key_len - 1);
            fullq << qtok << " ";
        }
-        LOG(INFO) << "field: " << size_t(field_id) << ", query: " << fullq.str();*/
+        LOG(INFO) << "field: " << size_t(field_id) << ", query: " << fullq.str() << ", total_cost: " << total_cost;*/

        // Prepare excluded document IDs that we can later remove from the result set
        uint32_t* excluded_result_ids = nullptr;
@@ -1803,8 +1803,8 @@ void Index::search(const std::vector<query_tokens_t>& field_query_tokens,
            max_weighted_tokens_match = std::min<uint64_t>(255, max_weighted_tokens_match);

            uint64_t aggregated_score = (
-                //(exact_match_fields << 48)  |       // number of fields that contain *all tokens* in the query
-                (verbatim_match_fields << 48)  |      // field value *exactly* same as query tokens
+                (verbatim_match_fields << 56)  |      // field value *exactly* same as query tokens
+                (exact_match_fields << 48)  |         // number of fields that contain *all tokens* in the query
                (max_weighted_tokens_match << 40) |   // weighted max number of tokens matched in a field
                (uniq_tokens_found << 32)   |         // number of unique tokens found across fields including typos
                ((255 - min_typos) << 24)   |         // minimum typo cost across all fields
--- a/test/collection_test.cpp
+++ b/test/collection_test.cpp
@@ -3014,7 +3014,7 @@ TEST_F(CollectionTest, MultiFieldRelevance2) {
    ASSERT_STREQ("1", results["hits"][0]["document"]["id"].get<std::string>().c_str());
    ASSERT_STREQ("0", results["hits"][1]["document"]["id"].get<std::string>().c_str());

-    // change weights to favor artist
+    // changing weights to favor artist still favors title because it contains all tokens of the query

    results = coll1->search("on a jetplane",
                            {"title", "artist"}, "", {}, {}, {0}, 10, 1, FREQUENCY,
@@ -3022,8 +3022,8 @@ TEST_F(CollectionTest, MultiFieldRelevance2) {
                            spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 40, {}, {}, {}, 0,
                            "<mark>", "</mark>", {1, 4}).get();

-    ASSERT_STREQ("0", results["hits"][0]["document"]["id"].get<std::string>().c_str());
-    ASSERT_STREQ("1", results["hits"][1]["document"]["id"].get<std::string>().c_str());
+    ASSERT_STREQ("1", results["hits"][0]["document"]["id"].get<std::string>().c_str());
+    ASSERT_STREQ("0", results["hits"][1]["document"]["id"].get<std::string>().c_str());

    // use same weights

@@ -3036,6 +3036,17 @@ TEST_F(CollectionTest, MultiFieldRelevance2) {
    ASSERT_STREQ("1", results["hits"][0]["document"]["id"].get<std::string>().c_str());
    ASSERT_STREQ("0", results["hits"][1]["document"]["id"].get<std::string>().c_str());

+    // weights favoring artist win when no single field contains all tokens of the query
+
+    results = coll1->search("on a helicopter",
+                            {"title", "artist"}, "", {}, {}, {0}, 10, 1, FREQUENCY,
+                            {true}, 10, spp::sparse_hash_set<std::string>(),
+                            spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 40, {}, {}, {}, 0,
+                            "<mark>", "</mark>", {1, 4}).get();
+
+    ASSERT_STREQ("0", results["hits"][0]["document"]["id"].get<std::string>().c_str());
+    ASSERT_STREQ("1", results["hits"][1]["document"]["id"].get<std::string>().c_str());
+
    collectionManager.drop_collection("coll1");
 }