From 133c64d2d273d09a9971ccb4cf06da57fd5f4458 Mon Sep 17 00:00:00 2001
From: Kishore Nallan
Date: Fri, 7 Jan 2022 13:41:28 +0530
Subject: [PATCH] Verbatim match must not overpower weight.

---
 src/index.cpp                     | 18 ++++----
 test/collection_specific_test.cpp | 73 +++++++++++++++++++++++++++++++
 2 files changed, 81 insertions(+), 10 deletions(-)

diff --git a/src/index.cpp b/src/index.cpp
index 1e6a427a..db589f3a 100644
--- a/src/index.cpp
+++ b/src/index.cpp
@@ -2287,6 +2287,10 @@ void Index::search(std::vector<query_tokens_t>& field_query_tokens,
         }
 
         //auto begin0 = std::chrono::high_resolution_clock::now();
+        /*size_t total_q_tokens = field_query_tokens[0].q_include_tokens.size();
+        for(const auto& phrase: field_query_tokens[0].q_phrases) {
+            total_q_tokens += phrase.size();
+        }*/
 
         for(auto& seq_id_kvs: topster_ids) {
             const uint64_t seq_id = seq_id_kvs.first;
@@ -2312,7 +2316,7 @@ void Index::search(std::vector<query_tokens_t>& field_query_tokens,
             uint32_t token_bits = (uint32_t(1) << 31);    // top most bit set to guarantee atleast 1 bit set
             uint64_t total_typos = 0, total_distances = 0, min_typos = 1000;
 
-            uint64_t verbatim_match_fields = 0;        // query matching field verbatim
+            uint64_t verbatim_match_fields = 0;        // field value *exactly* same as query tokens
             uint64_t exact_match_fields = 0;           // number of fields that contains all of query tokens
             uint64_t max_weighted_tokens_match = 0;    // weighted max number of tokens matched in a field
             uint64_t total_token_matches = 0;          // total matches across fields (including fuzzy ones)
@@ -2325,10 +2329,6 @@ void Index::search(std::vector<query_tokens_t>& field_query_tokens,
 
                 const size_t weight = search_fields[i].weight;
                 //LOG(INFO) << "--- field index: " << i << ", priority: " << priority;
 
-                // using `5` here because typo + prefix combo score range is: 0 - 5
-                //                                                            0    1    2
-                //                                                            0,1  2,3  4,5
-                int64_t MAX_SUM_TYPOS = 5 * field_query_tokens[i].q_include_tokens.size();
                 if(existing_field_kvs.count(field_id) != 0) {
                     // for existing field, we will simply sum field-wise weighted scores
@@ -2370,13 +2370,11 @@ void Index::search(std::vector<query_tokens_t>& field_query_tokens,
 
                         continue;
                     }
 
-                    const std::string& field = search_fields[i].name;
-                    const bool field_prefix = (i < prefixes.size()) ? prefixes[i] : prefixes[0];
-                    // compute approximate match score for this field from actual query
-
+                    const std::string& field = search_fields[i].name;
                     size_t words_present = 0;
 
+                    // FIXME: must consider phrase tokens also
                     for(size_t token_index=0; token_index < field_query_tokens[i].q_include_tokens.size(); token_index++) {
                         const auto& token = field_query_tokens[i].q_include_tokens[token_index];
                         const art_leaf* leaf = (art_leaf *) art_search(search_index.at(field), (const unsigned char*) token.c_str(),
@@ -2450,13 +2448,13 @@ void Index::search(std::vector<query_tokens_t>& field_query_tokens,
 
             // protect most significant byte from overflow, since topster uses int64_t
             verbatim_match_fields = std::min<uint64_t>(INT8_MAX, verbatim_match_fields);
+            exact_match_fields += verbatim_match_fields;
             exact_match_fields = std::min<uint64_t>(255, exact_match_fields);
             max_weighted_tokens_match = std::min<uint64_t>(255, max_weighted_tokens_match);
             total_typos = std::min<uint64_t>(255, total_typos);
             total_distances = std::min<uint64_t>(100, total_distances);
 
             uint64_t aggregated_score = (
-                (verbatim_match_fields << 56)     |  // field value *exactly* same as query tokens
                 (exact_match_fields << 48)        |  // number of fields that contain *all tokens* in the query
                 (max_weighted_tokens_match << 40) |  // weighted max number of tokens matched in a field
                 (uniq_tokens_found << 32)         |  // number of unique tokens found across fields including typos
diff --git a/test/collection_specific_test.cpp b/test/collection_specific_test.cpp
index c74dc1d8..a64d1840 100644
--- a/test/collection_specific_test.cpp
+++ b/test/collection_specific_test.cpp
@@ -2244,3 +2244,76 @@ TEST_F(CollectionSpecificTest, HandleLargeWeights) {
 
     collectionManager.drop_collection("coll1");
 }
+
+TEST_F(CollectionSpecificTest, DISABLED_ExactMatchOnAFieldIgnoresOtherFieldScores) {
+    std::vector<field> fields = {field("title", field_types::STRING, false),
+                                 field("description", field_types::STRING, false),
+                                 field("points", field_types::INT32, false),};
+
+    Collection* coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get();
+
+    nlohmann::json doc1;
+    doc1["id"] = "0";
+    doc1["title"] = "Mark Antony";
+    doc1["description"] = "Marriage Counsellor";
+    doc1["points"] = 100;
+
+    nlohmann::json doc2;
+    doc2["id"] = "1";
+    doc2["title"] = "Mark Spencer";
+    doc2["description"] = "Sales Expert";
+    doc2["points"] = 200;
+
+    ASSERT_TRUE(coll1->add(doc1.dump()).ok());
+    ASSERT_TRUE(coll1->add(doc2.dump()).ok());
+
+    auto results = coll1->search("mark", {"title", "description"},
+                                 "", {}, {}, {2, 2}, 10,
+                                 1, FREQUENCY, {true, true},
+                                 10, spp::sparse_hash_set<std::string>(),
+                                 spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "title", 20, {}, {}, {}, 0,
+                                 "", "", {3, 1}, 1000, true).get();
+
+    ASSERT_EQ(2, results["hits"].size());
+    ASSERT_EQ("1", results["hits"][0]["document"]["id"].get<std::string>());
+    ASSERT_EQ("0", results["hits"][1]["document"]["id"].get<std::string>());
+
+    collectionManager.drop_collection("coll1");
+}
+
+TEST_F(CollectionSpecificTest, VerbatimMatchShouldNotOverpowerHigherWeightedField) {
+    std::vector<field> fields = {field("title", field_types::STRING, false),
+                                 field("description", field_types::STRING, false),
+                                 field("points", field_types::INT32, false),};
+
+    Collection* coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get();
+
+    nlohmann::json doc1;
+    doc1["id"] = "0";
+    doc1["title"] = "Basketball Shoes";
+    doc1["description"] = "Basketball";
+    doc1["points"] = 100;
+
+    nlohmann::json doc2;
+    doc2["id"] = "1";
+    doc2["title"] = "Nike Jordan";
+    doc2["description"] = "Shoes";
+    doc2["points"] = 200;
+
+    ASSERT_TRUE(coll1->add(doc1.dump()).ok());
+    ASSERT_TRUE(coll1->add(doc2.dump()).ok());
+
+    auto results = coll1->search("shoes", {"title", "description"},
+                                 "", {}, {}, {2, 2}, 10,
+                                 1, FREQUENCY, {true, true},
+                                 10, spp::sparse_hash_set<std::string>(),
+                                 spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "title", 20, {}, {}, {}, 0,
+                                 "", "", {4, 1}, 1000, true).get();
+
+    ASSERT_EQ(2, results["hits"].size());
+    ASSERT_EQ("0", results["hits"][0]["document"]["id"].get<std::string>());
+    ASSERT_EQ("1", results["hits"][1]["document"]["id"].get<std::string>());
+
+    collectionManager.drop_collection("coll1");
+}
+
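Note (not part of the patch, added for review context): the relevance score built in Index::search() is a single 64-bit integer in which each signal is clamped to a byte and packed at a fixed bit offset, so a signal in a higher byte always dominates everything below it. Before this change a verbatim field match occupied the top byte (bit 56 and above), which meant any verbatim match outranked every other signal, including the per-field weights exercised by the new VerbatimMatchShouldNotOverpowerHigherWeightedField test. The patch drops that top byte and instead folds verbatim_match_fields into the exact_match_fields byte. The sketch below only illustrates the packing; the function name and parameter list are simplified and hypothetical, not the actual Index::search() code, and the lower tie-breaker bytes are omitted.

    #include <algorithm>
    #include <cstdint>

    // Sketch of the score packing after this patch (simplified names, assumed inputs).
    // Each signal is clamped so it cannot spill into the byte above it.
    uint64_t pack_score(uint64_t verbatim_match_fields, uint64_t exact_match_fields,
                        uint64_t max_weighted_tokens_match, uint64_t uniq_tokens_found) {
        verbatim_match_fields = std::min<uint64_t>(INT8_MAX, verbatim_match_fields);
        exact_match_fields += verbatim_match_fields;   // verbatim now only boosts this byte
        exact_match_fields = std::min<uint64_t>(255, exact_match_fields);
        max_weighted_tokens_match = std::min<uint64_t>(255, max_weighted_tokens_match);

        // Before the patch there was an additional `(verbatim_match_fields << 56)` term,
        // which sat above every other signal and could override field weights.
        return (exact_match_fields << 48) |          // fields containing all query tokens
               (max_weighted_tokens_match << 40) |   // weighted max tokens matched in a field
               (uniq_tokens_found << 32);            // unique tokens found across fields
               // (lower bytes carry further tie-breakers such as typos and distances,
               //  omitted in this sketch)
    }

With this layout, two documents that both contain all query tokens differ only in the lower bytes, so the query_by_weights of {4, 1} used in the new test can still decide the order even though the lower-weighted "description" field matches the query verbatim.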