Fix verbatim match on array.

Kishore Nallan 2022-05-29 19:24:55 +05:30
parent fa607f0013
commit a409df8dad
4 changed files with 57 additions and 21 deletions
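
The gist of the change: score_results2() gains a field_id parameter (the unused group_limit / group_by_fields parameters are dropped at every call site), and for array fields the verbatim-match flag is now granted only when every query token was found within a single array element, rather than whenever any element matched verbatim on its own. Below is a minimal standalone sketch of the new condition; the helper name and sample values are illustrative inventions, and only the shift/mask logic mirrors the diff:

    #include <cstdint>
    #include <iostream>

    // Hypothetical extraction of the fixed check: on an array field, the
    // verbatim bit survives only if all query tokens landed in one element.
    bool verbatim_match(uint64_t this_match_score, bool field_is_array, size_t num_query_tokens) {
        auto this_words_present = (this_match_score >> 24) & 0xFF; // tokens matched within the element
        auto verbatim_bit = this_match_score & 0xFF;               // raw verbatim flag from Match
        return field_is_array ? (verbatim_bit && num_query_tokens == this_words_present)
                              : bool(verbatim_bit);
    }

    int main() {
        // An element like "Thirteen Fourteen" matches 2 of 3 query tokens verbatim:
        uint64_t score = (uint64_t(2) << 24) | 1;            // words_present = 2, verbatim bit set
        std::cout << verbatim_match(score, true, 3) << '\n'; // 0: boost suppressed
        std::cout << verbatim_match(score, true, 2) << '\n'; // 1: all tokens in one element
    }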

View File

@@ -700,10 +700,9 @@ public:
     static void concat_topster_ids(Topster* topster, spp::sparse_hash_map<uint64_t, std::vector<KV*>>& topster_ids);

     int64_t score_results2(const std::vector<sort_by> & sort_fields, const uint16_t & query_index,
-                           const bool field_is_array, const uint32_t total_cost,
+                           const size_t field_id, const bool field_is_array, const uint32_t total_cost,
                            int64_t& match_score,
                            const uint32_t seq_id, const int sort_order[3],
-                           const size_t group_limit, const std::vector<std::string>& group_by_fields,
                            const bool prioritize_exact_match,
                            const bool single_exact_query_token,
                            size_t num_query_tokens,

View File

@@ -3118,9 +3118,9 @@ void Index::search_across_fields(const std::vector<token_t>& query_tokens,
                 single_exact_query_token = true;
             }

-            score_results2(sort_fields, searched_queries.size(), field_is_array,
+            score_results2(sort_fields, searched_queries.size(), fi, field_is_array,
                            total_cost, field_match_score,
-                           seq_id, sort_order, group_limit, group_by_fields,
+                           seq_id, sort_order,
                            prioritize_exact_match, single_exact_query_token,
                            query_tokens.size(), syn_orig_num_tokens, token_postings);
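
Here the new argument is fi, presumably the index of the field currently being scored in the enclosing loop over search fields; the group_limit and group_by_fields arguments disappear from the call, matching the slimmed-down signature.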
@@ -3543,9 +3543,8 @@ void Index::do_infix_search(const size_t num_search_fields, const std::vector<se
                 auto seq_id = raw_infix_ids[i];
                 int64_t match_score = 0;

-                score_results2(sort_fields, searched_queries.size(), field_is_array,
-                               0, match_score, seq_id, sort_order, group_limit, group_by_fields,
-                               false, false, 1, -1, {});
+                score_results2(sort_fields, searched_queries.size(), field_id, field_is_array,
+                               0, match_score, seq_id, sort_order, false, false, 1, -1, {});

                 int64_t scores[3] = {0};
                 int64_t match_score_index = 0;
@@ -3870,9 +3869,8 @@ void Index::search_wildcard(const std::vector<filter>& filters,
                 const uint32_t seq_id = batch_result_ids[i];
                 int64_t match_score = 0;

-                score_results2(sort_fields, (uint16_t) searched_queries.size(), false, 0,
-                               match_score, seq_id, sort_order, group_limit, group_by_fields, false,
-                               false, 1, -1, plists);
+                score_results2(sort_fields, (uint16_t) searched_queries.size(), 0, false, 0,
+                               match_score, seq_id, sort_order, false, false, 1, -1, plists);

                 int64_t scores[3] = {0};
                 int64_t match_score_index = 0;
@@ -4168,10 +4166,11 @@ void Index::log_leaves(const int cost, const std::string &token, const std::vect
 }

 int64_t Index::score_results2(const std::vector<sort_by> & sort_fields, const uint16_t & query_index,
-                              const bool field_is_array, const uint32_t total_cost,
+                              const size_t field_id,
+                              const bool field_is_array,
+                              const uint32_t total_cost,
                               int64_t& match_score,
                               const uint32_t seq_id, const int sort_order[3],
-                              const size_t group_limit, const std::vector<std::string>& group_by_fields,
                               const bool prioritize_exact_match,
                               const bool single_exact_query_token,
                               size_t num_query_tokens,
@@ -4183,8 +4182,8 @@ int64_t Index::score_results2(const std::vector<sort_by> & sort_fields, const ui
     if (posting_lists.size() <= 1) {
         const uint8_t is_verbatim_match = uint8_t(
-            prioritize_exact_match && single_exact_query_token &&
-            posting_list_t::is_single_token_verbatim_match(posting_lists[0], field_is_array)
+                prioritize_exact_match && single_exact_query_token &&
+                posting_list_t::is_single_token_verbatim_match(posting_lists[0], field_is_array)
         );

         size_t words_present = (num_query_tokens == 1 && syn_orig_num_tokens != -1) ? syn_orig_num_tokens : 1;
         size_t distance = (num_query_tokens == 1 && syn_orig_num_tokens != -1) ? syn_orig_num_tokens-1 : 0;
@@ -4203,11 +4202,18 @@ int64_t Index::score_results2(const std::vector<sort_by> & sort_fields, const ui
         const Match &match = Match(seq_id, token_positions, false, prioritize_exact_match);
         uint64_t this_match_score = match.get_match_score(total_cost, posting_lists.size());

+        // Within a field, only a subset of query tokens can match (unique_words), but even a smaller set
+        // might be available within the window used for proximity calculation (this_words_present)
         auto this_words_present = ((this_match_score >> 24) & 0xFF);
         auto unique_words = field_is_array ? this_words_present : ((this_match_score >> 32) & 0xFF);
         auto typo_score = ((this_match_score >> 16) & 0xFF);
         auto proximity = ((this_match_score >> 8) & 0xFF);
-        auto verbatim = (this_match_score & 0xFF);
+
+        // for array we have to compare with total query tokens to account for global context
+        auto verbatim = field_is_array ?
+                        (this_match_score & 0xFF) && (int64_t)(num_query_tokens == this_words_present) :
+                        (this_match_score & 0xFF);

         if(syn_orig_num_tokens != -1 && num_query_tokens == posting_lists.size()) {
             unique_words = syn_orig_num_tokens;
@@ -4215,9 +4221,6 @@ int64_t Index::score_results2(const std::vector<sort_by> & sort_fields, const ui
             proximity = 100 - (syn_orig_num_tokens - 1);
         }

-        // Within a field, only a subset of query tokens can match (unique_words), but even a smaller set
-        // might be available within the window used for proximity calculation (this_words_present)
-
         uint64_t mod_match_score = (
             (int64_t(this_words_present) << 32) |
             (int64_t(unique_words) << 24) |
@@ -4231,12 +4234,17 @@ int64_t Index::score_results2(const std::vector<sort_by> & sort_fields, const ui
         }

         /*std::ostringstream os;
-        os << "seq_id: " << seq_id
+        os << "seq_id: " << seq_id << ", field_id: " << field_id
            << ", this_words_present: " << this_words_present
            << ", unique_words: " << unique_words
            << ", typo_score: " << typo_score
            << ", proximity: " << proximity
            << ", verbatim: " << verbatim
            << ", mod_match_score: " << mod_match_score
+           << ", token_positions: " << token_positions.size()
+           << ", num_query_tokens: " << num_query_tokens
+           << ", posting_lists.size: " << posting_lists.size()
+           << ", array_index: " << kv.first
            << std::endl;
         LOG(INFO) << os.str();*/
     }
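
For context, the score returned by match.get_match_score() packs several 8-bit components into one 64-bit value, which the code above unpacks with shifts and masks and then re-packs as mod_match_score with this_words_present promoted to the most significant slot. A self-contained sketch of that decode/re-encode follows, assuming the bit layout implied by the shifts in this diff (the trailing components of mod_match_score are cut off in the hunk and assumed to follow in the same order):

    #include <cstdint>
    #include <cstdio>

    int main() {
        // Assumed layout from the shifts above: unique_words at bits 32..39,
        // words_present at 24..31, typo at 16..23, proximity at 8..15, verbatim at 0..7.
        uint64_t this_match_score = (uint64_t(3) << 32) | (uint64_t(2) << 24) |
                                    (uint64_t(255) << 16) | (uint64_t(99) << 8) | 1;

        auto this_words_present = (this_match_score >> 24) & 0xFF;
        auto unique_words       = (this_match_score >> 32) & 0xFF;
        auto typo_score         = (this_match_score >> 16) & 0xFF;
        auto proximity          = (this_match_score >> 8) & 0xFF;
        auto verbatim           = this_match_score & 0xFF;

        // Re-pack with words_present above unique_words, as mod_match_score does,
        // so tokens matched inside one field segment dominate the comparison.
        uint64_t mod_match_score = (uint64_t(this_words_present) << 32) |
                                   (uint64_t(unique_words) << 24) |
                                   (uint64_t(typo_score) << 16) |
                                   (uint64_t(proximity) << 8) |
                                   uint64_t(verbatim);

        printf("words_present=%llu unique_words=%llu mod_match_score=%llu\n",
               (unsigned long long) this_words_present,
               (unsigned long long) unique_words,
               (unsigned long long) mod_match_score);
    }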

View File

@@ -229,3 +229,33 @@ TEST_F(CollectionSpecificMoreTest, MatchedSegmentMoreImportantThanTotalMatches)
     ASSERT_EQ("2", results["hits"][1]["document"]["id"].get<std::string>());
     ASSERT_EQ("1", results["hits"][2]["document"]["id"].get<std::string>());
 }
+
+TEST_F(CollectionSpecificMoreTest, VerbatimMatchNotOnPartialTokenMatch) {
+    std::vector<field> fields = {field("title", field_types::STRING, false),
+                                 field("tags", field_types::STRING_ARRAY, false)};
+
+    Collection* coll1 = collectionManager.create_collection("coll1", 1, fields).get();
+
+    nlohmann::json doc1;
+    doc1["id"] = "0";
+    doc1["title"] = "Thirteen Fourteen";
+    doc1["tags"] = {"foo", "bar", "Hundred", "Thirteen Fourteen"};
+
+    nlohmann::json doc2;
+    doc2["id"] = "1";
+    doc2["title"] = "One Eleven Thirteen Fourteen Three";
+    doc2["tags"] = {"foo", "bar", "Hundred", "One Eleven Thirteen Fourteen Three"};
+
+    ASSERT_TRUE(coll1->add(doc1.dump()).ok());
+    ASSERT_TRUE(coll1->add(doc2.dump()).ok());
+
+    auto results = coll1->search("hundred thirteen fourteen", {"tags"},
+                                 "", {}, {}, {2}, 10,
+                                 1, FREQUENCY, {true},
+                                 1, spp::sparse_hash_set<std::string>(),
+                                 spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "title", 5, {}, {}, {}, 0,
+                                 "<mark>", "</mark>", {}, 1000, true).get();
+
+    ASSERT_EQ(2, results["hits"].size());
+    ASSERT_EQ(results["hits"][0]["text_match"].get<size_t>(), results["hits"][1]["text_match"].get<size_t>());
+}
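
Both documents carry "thirteen" and "fourteen" inside a single tags element, but neither packs all three query tokens ("hundred", "thirteen", "fourteen") into one element; "Hundred" sits in a separate entry. With the fix, neither hit earns the verbatim boost, so the test expects their text_match scores to tie.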

View File

@@ -3687,8 +3687,7 @@ TEST_F(CollectionTest, MultiFieldMatchRankingOnArray) {
     ASSERT_EQ(2, results["found"].get<size_t>());
     ASSERT_EQ(2, results["hits"].size());

     ASSERT_STREQ("0", results["hits"][0]["document"]["id"].get<std::string>().c_str());
-    ASSERT_STREQ("1", results["hits"][1]["document"]["id"].get<std::string>().c_str());
+    ASSERT_EQ(results["hits"][0]["text_match"].get<size_t>(), results["hits"][1]["text_match"].get<size_t>());

     collectionManager.drop_collection("coll1");
 }