Fix exact filtering edge cases.

This commit is contained in:
Kishore Nallan 2021-11-05 11:29:24 +05:30
parent f37e8e9928
commit c33a8fad19
3 changed files with 104 additions and 4 deletions

View File

@ -1381,7 +1381,7 @@ void Index::do_filtering(uint32_t*& filter_ids, uint32_t& filter_ids_length,
}
// For NOT_EQUALS alone, it is okay for none of the results to match prior to negation
// e.g. field:- [RANDOM_NON_EXISTING_STRING]
// e.g. field:!= [RANDOM_NON_EXISTING_STRING]
if(a_filter.comparators[0] != NOT_EQUALS && posting_lists.size() != str_tokens.size()) {
continue;
}

View File

@ -1019,7 +1019,8 @@ void posting_list_t::get_exact_matches(std::vector<iterator_t>& its, const bool
if(j == its.size()-1) {
// check if the last query token is the last offset
if(offsets[end_offset_index-1] != 0) {
if( offsets[end_offset_index-1] != 0 ||
(end_offset_index-2 >= 0 && offsets[end_offset_index-2] != its.size())) {
// not the last token for the document, so skip
is_exact_match = false;
break;
@ -1029,6 +1030,7 @@ void posting_list_t::get_exact_matches(std::vector<iterator_t>& its, const bool
// looping handles duplicate query tokens, e.g. "hip hip hurray hurray"
while(start_offset_index < end_offset_index) {
uint32_t offset = offsets[start_offset_index];
start_offset_index++;
if(offset == (j + 1)) {
// we have found a matching index, no need to look further
@ -1094,7 +1096,7 @@ void posting_list_t::get_exact_matches(std::vector<iterator_t>& its, const bool
if(start_offset_index+1 < end_offset_index) {
size_t next_offset = (size_t) offsets[start_offset_index + 1];
if(next_offset == 0) {
if(next_offset == 0 && pos == its.size()) {
// indicates that token is the last token on the doc
has_atleast_one_last_token = true;
start_offset_index++;

View File

@ -1961,4 +1961,102 @@ TEST_F(CollectionFilteringTest, FilteringWithTokenSeparators) {
ASSERT_EQ(1, results["hits"].size());
collectionManager.drop_collection("coll2");
}
}
TEST_F(CollectionFilteringTest, ExactFilteringRepeatingTokensSingularField) {
    // Exact (`:=`) filtering on a singular string field must cope with values that
    // repeat tokens (e.g. "Cardiology" appearing twice): only a whole-value match
    // should be returned, never a partial prefix/suffix/substring token match.
    std::vector<field> fields = {field("name", field_types::STRING, true)};

    Collection* coll1 = collectionManager.create_collection(
            "coll1", 1, fields, "", 0, "", {}, {"."}
    ).get();

    // Seed four documents whose names share overlapping, repeating tokens.
    std::vector<std::string> doc_names = {
            "Cardiology - Interventional Cardiology",                // id 0
            "Cardiology - Interventional",                           // id 1
            "Cardiology - Interventional Cardiology Department",     // id 2
            "Interventional Cardiology - Interventional Cardiology", // id 3
    };

    for(size_t i = 0; i < doc_names.size(); i++) {
        nlohmann::json doc;
        doc["id"] = std::to_string(i);
        doc["name"] = doc_names[i];
        ASSERT_TRUE(coll1->add(doc.dump()).ok());
    }

    // Full-value filter: only doc 0 matches exactly (doc 2 has a trailing token,
    // doc 3 has a leading prefix).
    auto results = coll1->search("*", {},"name:=Cardiology - Interventional Cardiology", {}, {}, {0}, 10,
                                 1, FREQUENCY, {false}).get();
    ASSERT_EQ(1, results["hits"].size());
    ASSERT_EQ("0", results["hits"][0]["document"]["id"].get<std::string>());

    // A shorter full value matches only the document with exactly that value.
    results = coll1->search("*", {},"name:=Cardiology - Interventional", {}, {}, {0}, 10,
                            1, FREQUENCY, {false}).get();
    ASSERT_EQ(1, results["hits"].size());
    ASSERT_EQ("1", results["hits"][0]["document"]["id"].get<std::string>());

    // A value that only appears as part of longer names must not match.
    results = coll1->search("*", {},"name:=Interventional Cardiology", {}, {}, {0}, 10,
                            1, FREQUENCY, {false}).get();
    ASSERT_EQ(0, results["hits"].size());

    // A single repeated token must not match any document on its own.
    results = coll1->search("*", {},"name:=Cardiology", {}, {}, {0}, 10,
                            1, FREQUENCY, {false}).get();
    ASSERT_EQ(0, results["hits"].size());

    collectionManager.drop_collection("coll1");
}
TEST_F(CollectionFilteringTest, ExactFilteringRepeatingTokensArrayField) {
    // Same repeated-token exact-filter scenarios as the singular-field test,
    // but with the values stored inside a STRING_ARRAY field.
    std::vector<field> fields = {field("name", field_types::STRING_ARRAY, true)};

    Collection* coll1 = collectionManager.create_collection(
            "coll1", 1, fields, "", 0, "", {}, {"."}
    ).get();

    // Seed four documents, each with a single array element that shares
    // overlapping, repeating tokens with the others.
    std::vector<std::string> doc_names = {
            "Cardiology - Interventional Cardiology",                // id 0
            "Cardiology - Interventional",                           // id 1
            "Cardiology - Interventional Cardiology Department",     // id 2
            "Interventional Cardiology - Interventional Cardiology", // id 3
    };

    for(size_t i = 0; i < doc_names.size(); i++) {
        nlohmann::json doc;
        doc["id"] = std::to_string(i);
        doc["name"] = nlohmann::json::array({doc_names[i]});
        ASSERT_TRUE(coll1->add(doc.dump()).ok());
    }

    // Full-value filter: only doc 0's array element matches exactly.
    auto results = coll1->search("*", {},"name:=Cardiology - Interventional Cardiology", {}, {}, {0}, 10,
                                 1, FREQUENCY, {false}).get();
    ASSERT_EQ(1, results["hits"].size());
    ASSERT_EQ("0", results["hits"][0]["document"]["id"].get<std::string>());

    // A shorter full value matches only the document with exactly that element.
    results = coll1->search("*", {},"name:=Cardiology - Interventional", {}, {}, {0}, 10,
                            1, FREQUENCY, {false}).get();
    ASSERT_EQ(1, results["hits"].size());
    ASSERT_EQ("1", results["hits"][0]["document"]["id"].get<std::string>());

    // A value that only appears inside longer elements must not match.
    results = coll1->search("*", {},"name:=Interventional Cardiology", {}, {}, {0}, 10,
                            1, FREQUENCY, {false}).get();
    ASSERT_EQ(0, results["hits"].size());

    // A single repeated token must not match any document on its own.
    results = coll1->search("*", {},"name:=Cardiology", {}, {}, {0}, 10,
                            1, FREQUENCY, {false}).get();
    ASSERT_EQ(0, results["hits"].size());

    collectionManager.drop_collection("coll1");
}