Fix exact filtering edge cases.

This commit is contained in:
Kishore Nallan 2021-11-05 11:29:24 +05:30
parent f37e8e9928
commit c33a8fad19
3 changed files with 104 additions and 4 deletions

View File

@ -1381,7 +1381,7 @@ void Index::do_filtering(uint32_t*& filter_ids, uint32_t& filter_ids_length,
}
// For NOT_EQUALS alone, it is okay for none of the results to match prior to negation
// e.g. field:- [RANDOM_NON_EXISTING_STRING]
// e.g. field:!= [RANDOM_NON_EXISTING_STRING]
if(a_filter.comparators[0] != NOT_EQUALS && posting_lists.size() != str_tokens.size()) {
continue;
}

View File

@ -1019,7 +1019,8 @@ void posting_list_t::get_exact_matches(std::vector<iterator_t>& its, const bool
if(j == its.size()-1) {
// check if the last query token is the last offset
if(offsets[end_offset_index-1] != 0) {
if( offsets[end_offset_index-1] != 0 ||
(end_offset_index-2 >= 0 && offsets[end_offset_index-2] != its.size())) {
// not the last token for the document, so skip
is_exact_match = false;
break;
@ -1029,6 +1030,7 @@ void posting_list_t::get_exact_matches(std::vector<iterator_t>& its, const bool
// looping handles duplicate query tokens, e.g. "hip hip hurray hurray"
while(start_offset_index < end_offset_index) {
uint32_t offset = offsets[start_offset_index];
start_offset_index++;
if(offset == (j + 1)) {
// we have found a matching index, no need to look further
@ -1094,7 +1096,7 @@ void posting_list_t::get_exact_matches(std::vector<iterator_t>& its, const bool
if(start_offset_index+1 < end_offset_index) {
size_t next_offset = (size_t) offsets[start_offset_index + 1];
if(next_offset == 0) {
if(next_offset == 0 && pos == its.size()) {
// indicates that token is the last token on the doc
has_atleast_one_last_token = true;
start_offset_index++;

View File

@ -1961,4 +1961,102 @@ TEST_F(CollectionFilteringTest, FilteringWithTokenSeparators) {
ASSERT_EQ(1, results["hits"].size());
collectionManager.drop_collection("coll2");
}
}
TEST_F(CollectionFilteringTest, ExactFilteringRepeatingTokensSingularField) {
    // Exact (`:=`) filtering on a singular string field must cope with values that
    // repeat tokens (e.g. "Cardiology" appearing twice): only a whole-value match
    // should be returned, never a partial prefix/suffix/substring token match.
    std::vector<field> fields = {field("name", field_types::STRING, true)};

    Collection* coll1 = collectionManager.create_collection(
            "coll1", 1, fields, "", 0, "", {}, {"."}
    ).get();

    // Seed four documents whose names share overlapping, repeating tokens.
    std::vector<std::string> doc_names = {
            "Cardiology - Interventional Cardiology",                // id 0
            "Cardiology - Interventional",                           // id 1
            "Cardiology - Interventional Cardiology Department",     // id 2
            "Interventional Cardiology - Interventional Cardiology", // id 3
    };

    for(size_t i = 0; i < doc_names.size(); i++) {
        nlohmann::json doc;
        doc["id"] = std::to_string(i);
        doc["name"] = doc_names[i];
        ASSERT_TRUE(coll1->add(doc.dump()).ok());
    }

    // Full-value filter: only doc 0 matches exactly (doc 2 has a trailing token,
    // doc 3 has a leading prefix).
    auto results = coll1->search("*", {},"name:=Cardiology - Interventional Cardiology", {}, {}, {0}, 10,
                                 1, FREQUENCY, {false}).get();
    ASSERT_EQ(1, results["hits"].size());
    ASSERT_EQ("0", results["hits"][0]["document"]["id"].get<std::string>());

    // A shorter full value matches only the document with exactly that value.
    results = coll1->search("*", {},"name:=Cardiology - Interventional", {}, {}, {0}, 10,
                            1, FREQUENCY, {false}).get();
    ASSERT_EQ(1, results["hits"].size());
    ASSERT_EQ("1", results["hits"][0]["document"]["id"].get<std::string>());

    // A value that only appears as part of longer names must not match.
    results = coll1->search("*", {},"name:=Interventional Cardiology", {}, {}, {0}, 10,
                            1, FREQUENCY, {false}).get();
    ASSERT_EQ(0, results["hits"].size());

    // A single repeated token must not match any document on its own.
    results = coll1->search("*", {},"name:=Cardiology", {}, {}, {0}, 10,
                            1, FREQUENCY, {false}).get();
    ASSERT_EQ(0, results["hits"].size());

    collectionManager.drop_collection("coll1");
}
TEST_F(CollectionFilteringTest, ExactFilteringRepeatingTokensArrayField) {
    // Same repeated-token exact-filter scenarios as the singular-field test,
    // but with the values stored inside a STRING_ARRAY field.
    std::vector<field> fields = {field("name", field_types::STRING_ARRAY, true)};

    Collection* coll1 = collectionManager.create_collection(
            "coll1", 1, fields, "", 0, "", {}, {"."}
    ).get();

    // Seed four documents, each with a single array element that shares
    // overlapping, repeating tokens with the others.
    std::vector<std::string> doc_names = {
            "Cardiology - Interventional Cardiology",                // id 0
            "Cardiology - Interventional",                           // id 1
            "Cardiology - Interventional Cardiology Department",     // id 2
            "Interventional Cardiology - Interventional Cardiology", // id 3
    };

    for(size_t i = 0; i < doc_names.size(); i++) {
        nlohmann::json doc;
        doc["id"] = std::to_string(i);
        doc["name"] = nlohmann::json::array({doc_names[i]});
        ASSERT_TRUE(coll1->add(doc.dump()).ok());
    }

    // Full-value filter: only doc 0's array element matches exactly.
    auto results = coll1->search("*", {},"name:=Cardiology - Interventional Cardiology", {}, {}, {0}, 10,
                                 1, FREQUENCY, {false}).get();
    ASSERT_EQ(1, results["hits"].size());
    ASSERT_EQ("0", results["hits"][0]["document"]["id"].get<std::string>());

    // A shorter full value matches only the document with exactly that element.
    results = coll1->search("*", {},"name:=Cardiology - Interventional", {}, {}, {0}, 10,
                            1, FREQUENCY, {false}).get();
    ASSERT_EQ(1, results["hits"].size());
    ASSERT_EQ("1", results["hits"][0]["document"]["id"].get<std::string>());

    // A value that only appears inside longer elements must not match.
    results = coll1->search("*", {},"name:=Interventional Cardiology", {}, {}, {0}, 10,
                            1, FREQUENCY, {false}).get();
    ASSERT_EQ(0, results["hits"].size());

    // A single repeated token must not match any document on its own.
    results = coll1->search("*", {},"name:=Cardiology", {}, {}, {0}, 10,
                            1, FREQUENCY, {false}).get();
    ASSERT_EQ(0, results["hits"].size());

    collectionManager.drop_collection("coll1");
}