From f7b5cf6ada05e505f16320d6b6267c2651c74ac0 Mon Sep 17 00:00:00 2001
From: Kishore Nallan
Date: Mon, 14 Feb 2022 21:15:47 +0530
Subject: [PATCH] Fix prioritize exact match when q has duplicate tokens.

---
 include/match_score.h            | 17 +++++++---
 test/collection_sorting_test.cpp | 57 ++++++++++++++++++++++++++++++++
 2 files changed, 70 insertions(+), 4 deletions(-)

diff --git a/include/match_score.h b/include/match_score.h
index 7f9e6874..21e4a209 100644
--- a/include/match_score.h
+++ b/include/match_score.h
@@ -219,11 +219,14 @@ struct Match {
 
         if(check_exact_match) {
-            if(distance != token_offsets.size()-1) {
+            if(distance > token_offsets.size()-1) {
                 // we can exit early and don't have to care about other requirements
                 return;
             }
 
+            // 1) distance < num tokens when there are repeating query tokens
+            // 2) distance can be same as num tokens and still not be an exact match
+
             int last_token_index = -1;
             size_t total_offsets = 0;
 
@@ -231,15 +234,21 @@ struct Match {
                 if(token_positions.last_token && !token_positions.positions.empty()) {
                     last_token_index = token_positions.positions.back();
                 }
+
                 total_offsets += token_positions.positions.size();
-                if(total_offsets > token_offsets.size()) {
+
+                if(total_offsets > token_offsets.size() && distance == token_offsets.size()-1) {
                     // if total offsets exceed query length, there cannot possibly be an exact match
                     return;
                 }
             }
 
-            if(last_token_index == int(token_offsets.size())-1 && total_offsets == token_offsets.size()) {
-                exact_match = 1;
+            if(last_token_index == int(token_offsets.size())-1) {
+                if(total_offsets == token_offsets.size() && distance == token_offsets.size()-1) {
+                    exact_match = 1;
+                } else if(distance < token_offsets.size()-1) {
+                    exact_match = 1;
+                }
             }
         }
     }
diff --git a/test/collection_sorting_test.cpp b/test/collection_sorting_test.cpp
index 0f186fa0..4f7bdf48 100644
--- a/test/collection_sorting_test.cpp
+++ b/test/collection_sorting_test.cpp
@@ -1299,3 +1299,60 @@ TEST_F(CollectionSortingTest, TextMatchBucketRanking) {
 
     collectionManager.drop_collection("coll1");
 }
+
+TEST_F(CollectionSortingTest, RepeatingTokenRanking) {
+    std::vector<field> fields = {field("title", field_types::STRING, false),
+                                 field("points", field_types::INT32, false),};
+
+    Collection* coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get();
+
+    nlohmann::json doc1;
+    doc1["id"] = "0";
+    doc1["title"] = "Mong Mong";
+    doc1["points"] = 100;
+
+    nlohmann::json doc2;
+    doc2["id"] = "1";
+    doc2["title"] = "Mong Spencer";
+    doc2["points"] = 200;
+
+    nlohmann::json doc3;
+    doc3["id"] = "2";
+    doc3["title"] = "Mong Mong Spencer";
+    doc3["points"] = 300;
+
+    nlohmann::json doc4;
+    doc4["id"] = "3";
+    doc4["title"] = "Spencer Mong Mong";
+    doc4["points"] = 400;
+
+    ASSERT_TRUE(coll1->add(doc1.dump()).ok());
+    ASSERT_TRUE(coll1->add(doc2.dump()).ok());
+    ASSERT_TRUE(coll1->add(doc3.dump()).ok());
+    ASSERT_TRUE(coll1->add(doc4.dump()).ok());
+
+    sort_fields = {
+        sort_by("_text_match", "DESC"),
+        sort_by("points", "DESC"),
+    };
+
+    auto results = coll1->search("mong mong", {"title"},
+                                 "", {}, sort_fields, {2}, 10,
+                                 1, FREQUENCY, {true},
+                                 10, spp::sparse_hash_set<std::string>(),
+                                 spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "title", 20, {}, {}, {}, 0,
+                                 "", "", {3}, 1000, true).get();
+
+    ASSERT_EQ(4, results["hits"].size());
+    ASSERT_EQ("0", results["hits"][0]["document"]["id"].get<std::string>());
+    ASSERT_EQ("3", results["hits"][1]["document"]["id"].get<std::string>());
+    ASSERT_EQ("2", results["hits"][2]["document"]["id"].get<std::string>());
+    ASSERT_EQ("1", results["hits"][3]["document"]["id"].get<std::string>());
+
+    ASSERT_EQ(50291713, results["hits"][0]["text_match"].get<size_t>());
+    ASSERT_EQ(50291712, results["hits"][1]["text_match"].get<size_t>());
+    ASSERT_EQ(50291712, results["hits"][2]["text_match"].get<size_t>());
+    ASSERT_EQ(50291712, results["hits"][3]["text_match"].get<size_t>());
+
+    collectionManager.drop_collection("coll1");
+}
\ No newline at end of file
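
Note on the test expectations: of the four titles, only "Mong Mong" is a
token-for-token match of the query "mong mong", so it alone earns the
exact-match flag and the single higher _text_match score (50291713); the
other three tie and fall back to points DESC. The sketch below is a naive
reference predicate for that reading, not Typesense's actual implementation
(which works on per-token position lists, as in match_score.h above); the
tokenize and is_exact_match helpers are hypothetical names for illustration.

#include <algorithm>
#include <cctype>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

// Lowercase and split on whitespace: a crude stand-in for Typesense's tokenizer.
static std::vector<std::string> tokenize(const std::string& text) {
    std::istringstream in(text);
    std::vector<std::string> tokens;
    std::string token;
    while (in >> token) {
        std::transform(token.begin(), token.end(), token.begin(),
                       [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
        tokens.push_back(token);
    }
    return tokens;
}

// Naive reference semantics for the exact-match flag: the field must contain
// exactly the query's token sequence, duplicates included, and nothing else.
static bool is_exact_match(const std::string& query, const std::string& field) {
    return tokenize(query) == tokenize(field);
}

int main() {
    const std::string query = "mong mong";
    const std::vector<std::string> titles = {
        "Mong Mong", "Mong Spencer", "Mong Mong Spencer", "Spencer Mong Mong"};
    for (const auto& title : titles) {
        // Only "Mong Mong" prints 1. The pre-patch check required
        // distance == num_tokens - 1, which a query with repeating tokens
        // cannot satisfy (see the comment added in the patch), so none of
        // these titles was marked exact before the fix.
        std::cout << title << " => " << is_exact_match(query, title) << "\n";
    }
}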