Fix exact-match prioritization when the query has duplicate tokens.

Kishore Nallan 2022-02-14 21:15:47 +05:30
parent 4549e09063
commit f7b5cf6ada
2 changed files with 70 additions and 4 deletions
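
In short: `exact_match` previously required the match `distance` to equal `token_offsets.size()-1` and the total number of matched positions to equal the query length. When the query repeats a token (e.g. `mong mong`), the best window's distance can be smaller than the number of query tokens, so both checks rejected genuine exact matches. The relaxed conditions below keep that case alive while still rejecting near-misses.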


@@ -219,11 +219,14 @@ struct Match {
         if(check_exact_match) {
-            if(distance != token_offsets.size()-1) {
+            if(distance > token_offsets.size()-1) {
                 // we can exit early and don't have to care about other requirements
                 return;
             }
 
+            // 1) distance < num tokens when there are repeating query tokens
+            // 2) distance can be same as num tokens and still not be an exact match
+
             int last_token_index = -1;
             size_t total_offsets = 0;
@@ -231,15 +234,21 @@ struct Match {
                 if(token_positions.last_token && !token_positions.positions.empty()) {
                     last_token_index = token_positions.positions.back();
                 }
 
                 total_offsets += token_positions.positions.size();
-                if(total_offsets > token_offsets.size()) {
+                if(total_offsets > token_offsets.size() && distance == token_offsets.size()-1) {
                     // if total offsets exceed query length, there cannot possibly be an exact match
                     return;
                 }
             }
 
-            if(last_token_index == int(token_offsets.size())-1 && total_offsets == token_offsets.size()) {
-                exact_match = 1;
+            if(last_token_index == int(token_offsets.size())-1) {
+                if(total_offsets == token_offsets.size() && distance == token_offsets.size()-1) {
+                    exact_match = 1;
+                } else if(distance < token_offsets.size()-1) {
+                    exact_match = 1;
+                }
             }
         }
     }
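
To make the new control flow easier to follow, here is a minimal standalone sketch of the updated predicate. The function name `is_exact_match` and its flattened parameters are hypothetical (the real code iterates over `token_offsets` inside `struct Match`); the branch logic mirrors the hunk above, and the sample values in `main` are illustrative assumptions, not values traced from the engine.

```cpp
#include <cstddef>
#include <iostream>

// Hypothetical standalone version of the exact-match check above.
//   num_query_tokens : stands in for token_offsets.size()
//   last_token_index : position of the field's final token if it matched
//                      a query token, otherwise -1
//   total_offsets    : total matched positions summed across all tokens
//   distance         : spread of the best matching window
bool is_exact_match(int last_token_index, size_t total_offsets,
                    size_t distance, size_t num_query_tokens) {
    // A window wider than the query can never be an exact match.
    if(distance > num_query_tokens - 1) {
        return false;
    }

    // Excess matched positions only disqualify the match when the window
    // already spans the full query; the `&& distance == ...` added by this
    // commit is what keeps repeated-token matches alive here.
    if(total_offsets > num_query_tokens && distance == num_query_tokens - 1) {
        return false;
    }

    // The match must end on the field's last token, at the position the
    // final query token would occupy.
    if(last_token_index == int(num_query_tokens) - 1) {
        if(total_offsets == num_query_tokens && distance == num_query_tokens - 1) {
            return true;  // ordinary exact match, no repeated tokens
        }
        if(distance < num_query_tokens - 1) {
            return true;  // repeated query tokens compress the distance
        }
    }

    return false;
}

int main() {
    // Query "mong mong" vs. field "Mong Mong" (illustrative values): the
    // duplicated token collapses the distance to 0, so the old code would
    // have bailed out before ever setting exact_match.
    std::cout << is_exact_match(1, 4, 0, 2) << "\n";  // 1 -> exact

    // Query "mong mong" vs. field "Spencer Mong Mong": the field's last
    // token sits at position 2, not 1, so this is not an exact match.
    std::cout << is_exact_match(2, 4, 0, 2) << "\n";  // 0 -> not exact
}
```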


@@ -1299,3 +1299,60 @@ TEST_F(CollectionSortingTest, TextMatchBucketRanking) {
 
     collectionManager.drop_collection("coll1");
 }
+
+TEST_F(CollectionSortingTest, RepeatingTokenRanking) {
+    std::vector<field> fields = {field("title", field_types::STRING, false),
+                                 field("points", field_types::INT32, false),};
+
+    Collection* coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get();
+
+    nlohmann::json doc1;
+    doc1["id"] = "0";
+    doc1["title"] = "Mong Mong";
+    doc1["points"] = 100;
+
+    nlohmann::json doc2;
+    doc2["id"] = "1";
+    doc2["title"] = "Mong Spencer";
+    doc2["points"] = 200;
+
+    nlohmann::json doc3;
+    doc3["id"] = "2";
+    doc3["title"] = "Mong Mong Spencer";
+    doc3["points"] = 300;
+
+    nlohmann::json doc4;
+    doc4["id"] = "3";
+    doc4["title"] = "Spencer Mong Mong";
+    doc4["points"] = 400;
+
+    ASSERT_TRUE(coll1->add(doc1.dump()).ok());
+    ASSERT_TRUE(coll1->add(doc2.dump()).ok());
+    ASSERT_TRUE(coll1->add(doc3.dump()).ok());
+    ASSERT_TRUE(coll1->add(doc4.dump()).ok());
+
+    sort_fields = {
+        sort_by("_text_match", "DESC"),
+        sort_by("points", "DESC"),
+    };
+
+    auto results = coll1->search("mong mong", {"title"},
+                                 "", {}, sort_fields, {2}, 10,
+                                 1, FREQUENCY, {true},
+                                 10, spp::sparse_hash_set<std::string>(),
+                                 spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "title", 20, {}, {}, {}, 0,
+                                 "<mark>", "</mark>", {3}, 1000, true).get();
+
+    ASSERT_EQ(4, results["hits"].size());
+
+    ASSERT_EQ("0", results["hits"][0]["document"]["id"].get<std::string>());
+    ASSERT_EQ("3", results["hits"][1]["document"]["id"].get<std::string>());
+    ASSERT_EQ("2", results["hits"][2]["document"]["id"].get<std::string>());
+    ASSERT_EQ("1", results["hits"][3]["document"]["id"].get<std::string>());
+
+    ASSERT_EQ(50291713, results["hits"][0]["text_match"].get<uint32_t>());
+    ASSERT_EQ(50291712, results["hits"][1]["text_match"].get<uint32_t>());
+    ASSERT_EQ(50291712, results["hits"][2]["text_match"].get<uint32_t>());
+    ASSERT_EQ(50291712, results["hits"][3]["text_match"].get<uint32_t>());
+
+    collectionManager.drop_collection("coll1");
+}
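
The assertions encode the fix: only doc `0` ("Mong Mong") earns the exact-match bump (text_match 50291713), so it ranks first despite having the fewest points, while the three partial matches tie at 50291712 and fall back to the secondary `points` sort (400 → id 3, 300 → id 2, 200 → id 1).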