Verbatim match must not overpower field weight.

This commit is contained in:
Kishore Nallan 2022-01-07 13:41:28 +05:30
parent 87e2d6914f
commit 133c64d2d2
2 changed files with 81 additions and 10 deletions

View File

@ -2287,6 +2287,10 @@ void Index::search(std::vector<query_tokens_t>& field_query_tokens,
}
//auto begin0 = std::chrono::high_resolution_clock::now();
/*size_t total_q_tokens = field_query_tokens[0].q_include_tokens.size();
for(const auto& phrase: field_query_tokens[0].q_phrases) {
total_q_tokens += phrase.size();
}*/
for(auto& seq_id_kvs: topster_ids) {
const uint64_t seq_id = seq_id_kvs.first;
@ -2312,7 +2316,7 @@ void Index::search(std::vector<query_tokens_t>& field_query_tokens,
uint32_t token_bits = (uint32_t(1) << 31); // top most bit set to guarantee atleast 1 bit set
uint64_t total_typos = 0, total_distances = 0, min_typos = 1000;
uint64_t verbatim_match_fields = 0; // query matching field verbatim
uint64_t verbatim_match_fields = 0; // field value *exactly* same as query tokens
uint64_t exact_match_fields = 0; // number of fields that contains all of query tokens
uint64_t max_weighted_tokens_match = 0; // weighted max number of tokens matched in a field
uint64_t total_token_matches = 0; // total matches across fields (including fuzzy ones)
@ -2325,10 +2329,6 @@ void Index::search(std::vector<query_tokens_t>& field_query_tokens,
const size_t weight = search_fields[i].weight;
//LOG(INFO) << "--- field index: " << i << ", priority: " << priority;
// using `5` here because typo + prefix combo score range is: 0 - 5
// 0 1 2
// 0,1 2,3 4,5
int64_t MAX_SUM_TYPOS = 5 * field_query_tokens[i].q_include_tokens.size();
if(existing_field_kvs.count(field_id) != 0) {
// for existing field, we will simply sum field-wise weighted scores
@ -2370,13 +2370,11 @@ void Index::search(std::vector<query_tokens_t>& field_query_tokens,
continue;
}
const std::string& field = search_fields[i].name;
const bool field_prefix = (i < prefixes.size()) ? prefixes[i] : prefixes[0];
// compute approximate match score for this field from actual query
const std::string& field = search_fields[i].name;
size_t words_present = 0;
// FIXME: must consider phrase tokens also
for(size_t token_index=0; token_index < field_query_tokens[i].q_include_tokens.size(); token_index++) {
const auto& token = field_query_tokens[i].q_include_tokens[token_index];
const art_leaf* leaf = (art_leaf *) art_search(search_index.at(field), (const unsigned char*) token.c_str(),
@ -2450,13 +2448,13 @@ void Index::search(std::vector<query_tokens_t>& field_query_tokens,
// protect most significant byte from overflow, since topster uses int64_t
verbatim_match_fields = std::min<uint64_t>(INT8_MAX, verbatim_match_fields);
exact_match_fields += verbatim_match_fields;
exact_match_fields = std::min<uint64_t>(255, exact_match_fields);
max_weighted_tokens_match = std::min<uint64_t>(255, max_weighted_tokens_match);
total_typos = std::min<uint64_t>(255, total_typos);
total_distances = std::min<uint64_t>(100, total_distances);
uint64_t aggregated_score = (
(verbatim_match_fields << 56) | // field value *exactly* same as query tokens
(exact_match_fields << 48) | // number of fields that contain *all tokens* in the query
(max_weighted_tokens_match << 40) | // weighted max number of tokens matched in a field
(uniq_tokens_found << 32) | // number of unique tokens found across fields including typos

View File

@ -2244,3 +2244,76 @@ TEST_F(CollectionSpecificTest, HandleLargeWeights) {
collectionManager.drop_collection("coll1");
}
// An exact match on one queried field should not be drowned out by scores
// contributed by the other fields. (Currently disabled via the DISABLED_ prefix.)
TEST_F(CollectionSpecificTest, DISABLED_ExactMatchOnAFieldIgnoresOtherFieldScores) {
    // Schema: two searchable string fields plus an int32 ranking field.
    std::vector<field> schema_fields = {
        field("title", field_types::STRING, false),
        field("description", field_types::STRING, false),
        field("points", field_types::INT32, false),
    };

    Collection* collection = collectionManager.create_collection("coll1", 1, schema_fields, "points").get();

    nlohmann::json first_doc;
    first_doc["id"] = "0";
    first_doc["title"] = "Mark Antony";
    first_doc["description"] = "Marriage Counsellor";
    first_doc["points"] = 100;

    nlohmann::json second_doc;
    second_doc["id"] = "1";
    second_doc["title"] = "Mark Spencer";
    second_doc["description"] = "Sales Expert";
    second_doc["points"] = 200;

    ASSERT_TRUE(collection->add(first_doc.dump()).ok());
    ASSERT_TRUE(collection->add(second_doc.dump()).ok());

    // Both titles contain "mark"; the query uses field weights {3, 1}.
    auto search_results = collection->search("mark", {"title", "description"},
                                             "", {}, {}, {2, 2}, 10,
                                             1, FREQUENCY, {true, true},
                                             10, spp::sparse_hash_set<std::string>(),
                                             spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "title", 20, {}, {}, {}, 0,
                                             "<mark>", "</mark>", {3, 1}, 1000, true).get();

    // Expected ordering: document "1" (200 points) ahead of document "0" (100 points).
    ASSERT_EQ(2, search_results["hits"].size());
    ASSERT_EQ("1", search_results["hits"][0]["document"]["id"].get<std::string>());
    ASSERT_EQ("0", search_results["hits"][1]["document"]["id"].get<std::string>());

    collectionManager.drop_collection("coll1");
}
// A verbatim match on a low-weighted field ("description" exactly equals the
// query) must not outrank a token match on a much higher-weighted field.
TEST_F(CollectionSpecificTest, VerbatimMatchShouldNotOverpowerHigherWeightedField) {
    // Schema: two searchable string fields plus an int32 ranking field.
    std::vector<field> schema_fields = {
        field("title", field_types::STRING, false),
        field("description", field_types::STRING, false),
        field("points", field_types::INT32, false),
    };

    Collection* collection = collectionManager.create_collection("coll1", 1, schema_fields, "points").get();

    // Doc "0": query token appears in the title; description does NOT equal the query.
    nlohmann::json first_doc;
    first_doc["id"] = "0";
    first_doc["title"] = "Basketball Shoes";
    first_doc["description"] = "Basketball";
    first_doc["points"] = 100;

    // Doc "1": description is verbatim "Shoes", but title does not match.
    nlohmann::json second_doc;
    second_doc["id"] = "1";
    second_doc["title"] = "Nike Jordan";
    second_doc["description"] = "Shoes";
    second_doc["points"] = 200;

    ASSERT_TRUE(collection->add(first_doc.dump()).ok());
    ASSERT_TRUE(collection->add(second_doc.dump()).ok());

    // Field weights {4, 1}: title is weighted well above description.
    auto search_results = collection->search("shoes", {"title", "description"},
                                             "", {}, {}, {2, 2}, 10,
                                             1, FREQUENCY, {true, true},
                                             10, spp::sparse_hash_set<std::string>(),
                                             spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "title", 20, {}, {}, {}, 0,
                                             "<mark>", "</mark>", {4, 1}, 1000, true).get();

    // Expected ordering: the weighted title match ("0") beats the verbatim
    // description match ("1"), despite "1" having more points.
    ASSERT_EQ(2, search_results["hits"].size());
    ASSERT_EQ("0", search_results["hits"][0]["document"]["id"].get<std::string>());
    ASSERT_EQ("1", search_results["hits"][1]["document"]["id"].get<std::string>());

    collectionManager.drop_collection("coll1");
}