diff --git a/src/index.cpp b/src/index.cpp
index 9fa6058c..6168d107 100644
--- a/src/index.cpp
+++ b/src/index.cpp
@@ -1381,7 +1381,7 @@ void Index::do_filtering(uint32_t*& filter_ids, uint32_t& filter_ids_length,
                 }
 
                 // For NOT_EQUALS alone, it is okay for none of the results to match prior to negation
-                // e.g. field:- [RANDOM_NON_EXISTING_STRING]
+                // e.g. field:!= [RANDOM_NON_EXISTING_STRING]
                 if(a_filter.comparators[0] != NOT_EQUALS && posting_lists.size() != str_tokens.size()) {
                     continue;
                 }
diff --git a/src/posting_list.cpp b/src/posting_list.cpp
index e202d689..83787abd 100644
--- a/src/posting_list.cpp
+++ b/src/posting_list.cpp
@@ -1019,7 +1019,8 @@ void posting_list_t::get_exact_matches(std::vector<iterator_t>& its, const bool
 
             if(j == its.size()-1) {
                 // check if the last query token is the last offset
-                if(offsets[end_offset_index-1] != 0) {
+                if( offsets[end_offset_index-1] != 0 ||
+                    (end_offset_index-2 >= 0 && offsets[end_offset_index-2] != its.size())) {
                     // not the last token for the document, so skip
                     is_exact_match = false;
                     break;
@@ -1029,6 +1030,7 @@ void posting_list_t::get_exact_matches(std::vector<iterator_t>& its, const bool
             // looping handles duplicate query tokens, e.g. "hip hip hurray hurray"
             while(start_offset_index < end_offset_index) {
                 uint32_t offset = offsets[start_offset_index];
+                start_offset_index++;
 
                 if(offset == (j + 1)) {
                     // we have found a matching index, no need to look further
@@ -1094,7 +1096,7 @@ void posting_list_t::get_exact_matches(std::vector<iterator_t>& its, const bool
 
             if(start_offset_index+1 < end_offset_index) {
                 size_t next_offset = (size_t) offsets[start_offset_index + 1];
-                if(next_offset == 0) {
+                if(next_offset == 0 && pos == its.size()) {
                     // indicates that token is the last token on the doc
                     has_atleast_one_last_token = true;
                     start_offset_index++;
diff --git a/test/collection_filtering_test.cpp b/test/collection_filtering_test.cpp
index 01452837..02866b76 100644
--- a/test/collection_filtering_test.cpp
+++ b/test/collection_filtering_test.cpp
@@ -1961,4 +1961,102 @@ TEST_F(CollectionFilteringTest, FilteringWithTokenSeparators) {
     ASSERT_EQ(1, results["hits"].size());
 
     collectionManager.drop_collection("coll2");
-}
\ No newline at end of file
+}
+
+TEST_F(CollectionFilteringTest, ExactFilteringRepeatingTokensSingularField) {
+    std::vector<field> fields = {field("name", field_types::STRING, true)};
+
+    Collection* coll1 = collectionManager.create_collection(
+            "coll1", 1, fields, "", 0, "", {}, {"."}
+    ).get();
+
+    nlohmann::json doc1;
+    doc1["id"] = "0";
+    doc1["name"] = "Cardiology - Interventional Cardiology";
+
+    nlohmann::json doc2;
+    doc2["id"] = "1";
+    doc2["name"] = "Cardiology - Interventional";
+
+    nlohmann::json doc3;
+    doc3["id"] = "2";
+    doc3["name"] = "Cardiology - Interventional Cardiology Department";
+
+    nlohmann::json doc4;
+    doc4["id"] = "3";
+    doc4["name"] = "Interventional Cardiology - Interventional Cardiology";
+
+    ASSERT_TRUE(coll1->add(doc1.dump()).ok());
+    ASSERT_TRUE(coll1->add(doc2.dump()).ok());
+    ASSERT_TRUE(coll1->add(doc3.dump()).ok());
+    ASSERT_TRUE(coll1->add(doc4.dump()).ok());
+
+    auto results = coll1->search("*", {},"name:=Cardiology - Interventional Cardiology", {}, {}, {0}, 10,
+                                 1, FREQUENCY, {false}).get();
+    ASSERT_EQ(1, results["hits"].size());
+    ASSERT_EQ("0", results["hits"][0]["document"]["id"].get<std::string>());
+
+    results = coll1->search("*", {},"name:=Cardiology - Interventional", {}, {}, {0}, 10,
+                            1, FREQUENCY, {false}).get();
+    ASSERT_EQ(1, results["hits"].size());
+    ASSERT_EQ("1", results["hits"][0]["document"]["id"].get<std::string>());
+
+    results = coll1->search("*", {},"name:=Interventional Cardiology", {}, {}, {0}, 10,
+                            1, FREQUENCY, {false}).get();
+    ASSERT_EQ(0, results["hits"].size());
+
+    results = coll1->search("*", {},"name:=Cardiology", {}, {}, {0}, 10,
+                            1, FREQUENCY, {false}).get();
+    ASSERT_EQ(0, results["hits"].size());
+
+    collectionManager.drop_collection("coll1");
+}
+
+TEST_F(CollectionFilteringTest, ExactFilteringRepeatingTokensArrayField) {
+    std::vector<field> fields = {field("name", field_types::STRING_ARRAY, true)};
+
+    Collection* coll1 = collectionManager.create_collection(
+            "coll1", 1, fields, "", 0, "", {}, {"."}
+    ).get();
+
+    nlohmann::json doc1;
+    doc1["id"] = "0";
+    doc1["name"] = {"Cardiology - Interventional Cardiology"};
+
+    nlohmann::json doc2;
+    doc2["id"] = "1";
+    doc2["name"] = {"Cardiology - Interventional"};
+
+    nlohmann::json doc3;
+    doc3["id"] = "2";
+    doc3["name"] = {"Cardiology - Interventional Cardiology Department"};
+
+    nlohmann::json doc4;
+    doc4["id"] = "3";
+    doc4["name"] = {"Interventional Cardiology - Interventional Cardiology"};
+
+    ASSERT_TRUE(coll1->add(doc1.dump()).ok());
+    ASSERT_TRUE(coll1->add(doc2.dump()).ok());
+    ASSERT_TRUE(coll1->add(doc3.dump()).ok());
+    ASSERT_TRUE(coll1->add(doc4.dump()).ok());
+
+    auto results = coll1->search("*", {},"name:=Cardiology - Interventional Cardiology", {}, {}, {0}, 10,
+                                 1, FREQUENCY, {false}).get();
+    ASSERT_EQ(1, results["hits"].size());
+    ASSERT_EQ("0", results["hits"][0]["document"]["id"].get<std::string>());
+
+    results = coll1->search("*", {},"name:=Cardiology - Interventional", {}, {}, {0}, 10,
+                            1, FREQUENCY, {false}).get();
+    ASSERT_EQ(1, results["hits"].size());
+    ASSERT_EQ("1", results["hits"][0]["document"]["id"].get<std::string>());
+
+    results = coll1->search("*", {},"name:=Interventional Cardiology", {}, {}, {0}, 10,
+                            1, FREQUENCY, {false}).get();
+    ASSERT_EQ(0, results["hits"].size());
+
+    results = coll1->search("*", {},"name:=Cardiology", {}, {}, {0}, 10,
+                            1, FREQUENCY, {false}).get();
+    ASSERT_EQ(0, results["hits"].size());
+
+    collectionManager.drop_collection("coll1");
+}