Fix another exact filter match on array edge case.

This commit is contained in:
Kishore Nallan 2022-11-18 18:22:56 +05:30
parent f9598dfd55
commit ad4ef33fa2
2 changed files with 58 additions and 3 deletions

View File

@ -1124,10 +1124,15 @@ void posting_list_t::get_exact_matches(std::vector<iterator_t>& its, const bool
else {
// field is an array
struct token_index_meta_t {
std::bitset<32> token_index;
bool has_last_token;
};
for(size_t i = 0; i < num_ids; i++) {
uint32_t id = ids[i];
std::map<size_t, std::bitset<32>> array_index_to_token_index;
std::map<size_t, token_index_meta_t> array_index_to_token_index;
bool premature_exit = false;
for(int j = its.size()-1; j >= 0; j--) {
@ -1165,13 +1170,14 @@ void posting_list_t::get_exact_matches(std::vector<iterator_t>& its, const bool
size_t next_offset = (size_t) offsets[start_offset_index + 1];
if(next_offset == 0 && pos == its.size()) {
// indicates that token is the last token on the doc
array_index_to_token_index[array_index].has_last_token = true;
has_atleast_one_last_token = true;
start_offset_index++;
}
}
if(found_matching_index) {
array_index_to_token_index[array_index].set(j+1);
array_index_to_token_index[array_index].token_index.set(j+1);
}
start_offset_index++; // skip current value which is the array index or flag for last index
@ -1205,7 +1211,7 @@ void posting_list_t::get_exact_matches(std::vector<iterator_t>& its, const bool
if(!premature_exit) {
// iterate array index to token index to check if at least 1 array position contains all tokens
for(auto& kv: array_index_to_token_index) {
if(kv.second.count() == its.size()) {
if(kv.second.token_index.count() == its.size() && kv.second.has_last_token) {
exact_ids[exact_id_index++] = id;
break;
}

View File

@ -661,6 +661,55 @@ TEST_F(CollectionSpecificMoreTest, ExactFilteringOnArray) {
false, false).get();
ASSERT_EQ(0, results["hits"].size());
results = coll1->search("*", {"tags"}, "tags:=§ 23", {}, {}, {0}, 100, 1, MAX_SCORE, {true},
Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
"", 10, {}, {}, {}, 0,
"<mark>", "</mark>", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7, fallback,
4, {off}, 32767, 32767, 2,
false, false).get();
ASSERT_EQ(1, results["hits"].size());
results = coll1->search("*", {"tags"}, "tags:=§ 23 Satz", {}, {}, {0}, 100, 1, MAX_SCORE, {true},
Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
"", 10, {}, {}, {}, 0,
"<mark>", "</mark>", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7, fallback,
4, {off}, 32767, 32767, 2,
false, false).get();
ASSERT_EQ(0, results["hits"].size());
}
TEST_F(CollectionSpecificMoreTest, ExactFilteringOnArray2) {
auto schema = R"({
"name": "coll1",
"fields": [
{"name": "capability", "type": "string[]", "facet": true}
]
})"_json;
Collection* coll1 = collectionManager.create_collection(schema).get();
nlohmann::json doc1;
doc1["capability"] = {"Encoding capabilities for network communications",
"Obfuscation capabilities"};
coll1->add(doc1.dump());
auto results = coll1->search("*", {}, "capability:=Encoding capabilities", {}, {}, {0}, 100, 1, MAX_SCORE, {true},
Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
"", 10, {}, {}, {}, 0,
"<mark>", "</mark>", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7, fallback,
4, {off}, 32767, 32767, 2,
false, false).get();
ASSERT_EQ(0, results["hits"].size());
}
TEST_F(CollectionSpecificMoreTest, SplitTokensCrossFieldMatching) {