From ce7b6e12e996709ade8c2d1cb03227c955fd9424 Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Fri, 27 Aug 2021 20:52:51 +0530 Subject: [PATCH] Prioritize record with a field containing all tokens in the query. --- src/index.cpp | 6 +++--- test/collection_test.cpp | 17 ++++++++++++++--- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/src/index.cpp b/src/index.cpp index 467ff762..ac0f953f 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -945,7 +945,7 @@ void Index::search_candidates(const uint8_t & field_id, bool field_is_array, std::string qtok(reinterpret_cast(qleaf->key),qleaf->key_len - 1); fullq << qtok << " "; } - LOG(INFO) << "field: " << size_t(field_id) << ", query: " << fullq.str();*/ + LOG(INFO) << "field: " << size_t(field_id) << ", query: " << fullq.str() << ", total_cost: " << total_cost;*/ // Prepare excluded document IDs that we can later remove from the result set uint32_t* excluded_result_ids = nullptr; @@ -1803,8 +1803,8 @@ void Index::search(const std::vector& field_query_tokens, max_weighted_tokens_match = std::min(255, max_weighted_tokens_match); uint64_t aggregated_score = ( - //(exact_match_fields << 48) | // number of fields that contain *all tokens* in the query - (verbatim_match_fields << 48) | // field value *exactly* same as query tokens + (verbatim_match_fields << 56) | // field value *exactly* same as query tokens + (exact_match_fields << 48) | // number of fields that contain *all tokens* in the query (max_weighted_tokens_match << 40) | // weighted max number of tokens matched in a field (uniq_tokens_found << 32) | // number of unique tokens found across fields including typos ((255 - min_typos) << 24) | // minimum typo cost across all fields diff --git a/test/collection_test.cpp b/test/collection_test.cpp index 187845df..1811c45a 100644 --- a/test/collection_test.cpp +++ b/test/collection_test.cpp @@ -3014,7 +3014,7 @@ TEST_F(CollectionTest, MultiFieldRelevance2) { ASSERT_STREQ("1", results["hits"][0]["document"]["id"].get().c_str()); ASSERT_STREQ("0", results["hits"][1]["document"]["id"].get().c_str()); - // change weights to favor artist + // changing weights to favor artist still favors title because it contains all tokens of the query results = coll1->search("on a jetplane", {"title", "artist"}, "", {}, {}, {0}, 10, 1, FREQUENCY, @@ -3022,8 +3022,8 @@ TEST_F(CollectionTest, MultiFieldRelevance2) { spp::sparse_hash_set(), 10, "", 30, 4, "", 40, {}, {}, {}, 0, "", "", {1, 4}).get(); - ASSERT_STREQ("0", results["hits"][0]["document"]["id"].get().c_str()); - ASSERT_STREQ("1", results["hits"][1]["document"]["id"].get().c_str()); + ASSERT_STREQ("1", results["hits"][0]["document"]["id"].get().c_str()); + ASSERT_STREQ("0", results["hits"][1]["document"]["id"].get().c_str()); // use same weights @@ -3036,6 +3036,17 @@ TEST_F(CollectionTest, MultiFieldRelevance2) { ASSERT_STREQ("1", results["hits"][0]["document"]["id"].get().c_str()); ASSERT_STREQ("0", results["hits"][1]["document"]["id"].get().c_str()); + // add weights to favor artist without all tokens in a query being found in a field + + results = coll1->search("on a helicopter", + {"title", "artist"}, "", {}, {}, {0}, 10, 1, FREQUENCY, + {true}, 10, spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30, 4, "", 40, {}, {}, {}, 0, + "", "", {1, 4}).get(); + + ASSERT_STREQ("0", results["hits"][0]["document"]["id"].get().c_str()); + ASSERT_STREQ("1", results["hits"][1]["document"]["id"].get().c_str()); + collectionManager.drop_collection("coll1"); }