Prioritize records with a field containing all tokens in the query.

This commit is contained in:
Kishore Nallan 2021-08-27 20:52:51 +05:30
parent 07d838e385
commit ce7b6e12e9
2 changed files with 17 additions and 6 deletions

View File

@@ -945,7 +945,7 @@ void Index::search_candidates(const uint8_t & field_id, bool field_is_array,
std::string qtok(reinterpret_cast<char*>(qleaf->key),qleaf->key_len - 1);
fullq << qtok << " ";
}
LOG(INFO) << "field: " << size_t(field_id) << ", query: " << fullq.str();*/
LOG(INFO) << "field: " << size_t(field_id) << ", query: " << fullq.str() << ", total_cost: " << total_cost;*/
// Prepare excluded document IDs that we can later remove from the result set
uint32_t* excluded_result_ids = nullptr;
@@ -1803,8 +1803,8 @@ void Index::search(const std::vector<query_tokens_t>& field_query_tokens,
max_weighted_tokens_match = std::min<uint64_t>(255, max_weighted_tokens_match);
uint64_t aggregated_score = (
//(exact_match_fields << 48) | // number of fields that contain *all tokens* in the query
(verbatim_match_fields << 48) | // field value *exactly* same as query tokens
(verbatim_match_fields << 56) | // field value *exactly* same as query tokens
(exact_match_fields << 48) | // number of fields that contain *all tokens* in the query
(max_weighted_tokens_match << 40) | // weighted max number of tokens matched in a field
(uniq_tokens_found << 32) | // number of unique tokens found across fields including typos
((255 - min_typos) << 24) | // minimum typo cost across all fields

View File

@@ -3014,7 +3014,7 @@ TEST_F(CollectionTest, MultiFieldRelevance2) {
ASSERT_STREQ("1", results["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("0", results["hits"][1]["document"]["id"].get<std::string>().c_str());
// change weights to favor artist
// changing weights to favor artist still favors title because it contains all tokens of the query
results = coll1->search("on a jetplane",
{"title", "artist"}, "", {}, {}, {0}, 10, 1, FREQUENCY,
@@ -3022,8 +3022,8 @@ TEST_F(CollectionTest, MultiFieldRelevance2) {
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 40, {}, {}, {}, 0,
"<mark>", "</mark>", {1, 4}).get();
ASSERT_STREQ("0", results["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("1", results["hits"][1]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("1", results["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("0", results["hits"][1]["document"]["id"].get<std::string>().c_str());
// use same weights
@@ -3036,6 +3036,17 @@ TEST_F(CollectionTest, MultiFieldRelevance2) {
ASSERT_STREQ("1", results["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("0", results["hits"][1]["document"]["id"].get<std::string>().c_str());
// add weights to favor artist without all tokens in a query being found in a field
results = coll1->search("on a helicopter",
{"title", "artist"}, "", {}, {}, {0}, 10, 1, FREQUENCY,
{true}, 10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 40, {}, {}, {}, 0,
"<mark>", "</mark>", {1, 4}).get();
ASSERT_STREQ("0", results["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("1", results["hits"][1]["document"]["id"].get<std::string>().c_str());
collectionManager.drop_collection("coll1");
}