Filtering must require presence of ALL tokens.

In the future, we could introduce a `:~` operator that returns documents containing only some of the filter tokens.
This commit is contained in:
Kishore Nallan 2021-04-09 18:24:36 +05:30
parent 6b793afbae
commit a120c903a1
3 changed files with 166 additions and 148 deletions

View File

@ -1247,6 +1247,7 @@ uint32_t Index::do_filtering(uint32_t** filter_ids_out, const std::vector<filter
// e.g. country: South Africa
Tokenizer tokenizer(filter_value, false, true, false, f.locale);
std::string str_token;
size_t token_index = 0;
std::vector<std::string> str_tokens;
@ -1261,7 +1262,13 @@ uint32_t Index::do_filtering(uint32_t** filter_ids_out, const std::vector<filter
}
query_suggestion.push_back(leaf);
}
if(query_suggestion.size() != str_tokens.size()) {
continue;
}
for(const auto& leaf: query_suggestion) {
if(strt_ids == nullptr) {
strt_ids = leaf->values->ids.uncompress();
strt_ids_size = leaf->values->ids.getLength();
@ -1294,8 +1301,8 @@ uint32_t Index::do_filtering(uint32_t** filter_ids_out, const std::vector<filter
uint64_t filter_hash = 1;
for(size_t sindex=0; sindex < str_tokens.size(); sindex++) {
auto& str_token = str_tokens[sindex];
uint64_t thash = facet_token_hash(f, str_token);
auto& this_str_token = str_tokens[sindex];
uint64_t thash = facet_token_hash(f, this_str_token);
filter_hash *= (1779033703 + 2*thash*(sindex+1));
}

View File

@ -34,152 +34,6 @@ protected:
}
};
// Exercises the exact (`:=`) and contains (`:`) filter operators against the
// faceted string field `starring`.
TEST_F(CollectionFacetingTest, FacetFieldStringFiltering) {
    std::vector<field> schema = {
        field("title", field_types::STRING, false),
        field("starring", field_types::STRING, true),
        field("cast", field_types::STRING_ARRAY, false),
        field("points", field_types::INT32, false)
    };

    std::vector<sort_by> ordering = { sort_by("points", "DESC") };

    // Pick up the collection if it is already registered, otherwise create it.
    Collection* coll = collectionManager.get_collection("coll_str").get();
    if(coll == nullptr) {
        coll = collectionManager.create_collection("coll_str", 1, schema, "points").get();
    }

    // Each line of the fixture file is parsed as JSON and added as one document.
    std::ifstream docs(std::string(ROOT_DIR)+"test/multi_field_documents.jsonl");
    for(std::string line; std::getline(docs, line); ) {
        coll->add(nlohmann::json::parse(line).dump());
    }
    docs.close();

    query_fields = {"title"};
    std::vector<std::string> facet_names = {"starring"};

    // Runs a wildcard search restricted by the given filter expression.
    auto filter_by = [&](const char* filter_expr) {
        return coll->search("*", query_fields, filter_expr, facet_names, ordering, 0, 10, 1, FREQUENCY, false).get();
    };

    // exact filter with only one token of the value: nothing is returned
    auto results = filter_by("starring:= samuel");
    ASSERT_EQ(0, results["hits"].size());
    ASSERT_EQ(0, results["found"].get<size_t>());

    // all tokens supplied, but one of them carries a typo
    results = filter_by("starring:= ssamuel l. Jackson");
    ASSERT_EQ(0, results["hits"].size());
    ASSERT_EQ(0, results["found"].get<size_t>());

    // the verbatim value matches
    results = filter_by("starring:= samuel l. Jackson");
    ASSERT_EQ(2, results["hits"].size());
    ASSERT_EQ(2, results["found"].get<size_t>());

    // contains filter with a single token
    results = filter_by("starring: jackson");
    ASSERT_EQ(2, results["hits"].size());
    ASSERT_EQ(2, results["found"].get<size_t>());

    results = filter_by("starring: samuel");
    ASSERT_EQ(2, results["hits"].size());
    ASSERT_EQ(2, results["found"].get<size_t>());

    // contains filter where only one of the two tokens is present.
    // NOTE(review): this expects a match on a partial set of tokens - confirm it
    // still reflects the intended filtering semantics.
    results = filter_by("starring: samuel johnson");
    ASSERT_EQ(2, results["hits"].size());
    ASSERT_EQ(2, results["found"].get<size_t>());

    collectionManager.drop_collection("coll_str");
}
// Exercises the exact (`:=`) and contains (`:`) filter operators against the
// faceted string-array field `tags`, including multi-value exact filters.
TEST_F(CollectionFacetingTest, FacetFieldStringArrayFiltering) {
    std::vector<field> schema = {
        field("name", field_types::STRING, false),
        field("name_facet", field_types::STRING, true),
        field("age", field_types::INT32, true),
        field("years", field_types::INT32_ARRAY, true),
        field("rating", field_types::FLOAT, true),
        field("timestamps", field_types::INT64_ARRAY, true),
        field("tags", field_types::STRING_ARRAY, true)
    };

    std::vector<sort_by> ordering = { sort_by("age", "DESC") };

    // Pick up the collection if it is already registered, otherwise create it.
    Collection* coll = collectionManager.get_collection("coll_array_fields").get();
    if(coll == nullptr) {
        coll = collectionManager.create_collection("coll_array_fields", 1, schema, "age").get();
    }

    // Each fixture line becomes one document; `name` is mirrored into the
    // faceted `name_facet` field before the document is added.
    std::ifstream docs(std::string(ROOT_DIR)+"test/numeric_array_documents.jsonl");
    for(std::string line; std::getline(docs, line); ) {
        nlohmann::json doc = nlohmann::json::parse(line);
        doc["name_facet"] = doc["name"];
        coll->add(doc.dump());
    }
    docs.close();

    query_fields = {"name"};
    std::vector<std::string> facet_names = {"tags"};

    // Searches for "Jeremy" restricted by the given filter expression.
    auto filter_by = [&](const char* filter_expr) {
        return coll->search("Jeremy", query_fields, filter_expr, facet_names, ordering, 0, 10, 1, FREQUENCY, false).get();
    };

    // exact filter with only one token of a tag value: nothing is returned
    auto results = filter_by("tags:= PLATINUM");
    ASSERT_EQ(0, results["hits"].size());
    ASSERT_EQ(0, results["found"].get<size_t>());

    results = filter_by("tags:= FINE");
    ASSERT_EQ(0, results["hits"].size());

    results = filter_by("tags:= FFINE PLATINUM");
    ASSERT_EQ(0, results["hits"].size());

    // without "=", a single token of the value is enough
    results = filter_by("tags: PLATINUM");
    ASSERT_EQ(1, results["hits"].size());
    ASSERT_EQ(1, results["found"].get<size_t>());

    results = filter_by("tags: FINE");
    ASSERT_EQ(1, results["hits"].size());
    ASSERT_EQ(1, results["found"].get<size_t>());

    // with "=", the tokens have to match the facet value exactly
    results = filter_by("tags:= FINE PLATINUM");
    ASSERT_EQ(1, results["hits"].size());
    ASSERT_EQ(1, results["found"].get<size_t>());

    // exact filtering on a non-faceted field is rejected with an error
    auto res_op = coll->search("Jeremy", query_fields, "name:= Jeremy Howard", facet_names, ordering, 0, 10, 1, FREQUENCY, false);
    ASSERT_FALSE(res_op.ok());
    ASSERT_STREQ("To perform exact filtering, filter field `name` must be a facet field.", res_op.error().c_str());

    // several exact values in one filter (OR condition)
    results = filter_by("tags:= [Gold, bronze]");
    ASSERT_EQ(3, results["hits"].size());
    ASSERT_EQ(3, results["found"].get<size_t>());

    results = filter_by("tags:= [Gold, bronze, fine PLATINUM]");
    ASSERT_EQ(4, results["hits"].size());
    ASSERT_EQ(4, results["found"].get<size_t>());

    // array syntax holding a single multi-token value
    results = filter_by("tags:= [fine PLATINUM]");
    ASSERT_EQ(1, results["hits"].size());
    ASSERT_EQ(1, results["found"].get<size_t>());

    collectionManager.drop_collection("coll_array_fields");
}
TEST_F(CollectionFacetingTest, FacetCounts) {
Collection *coll_array_fields;

View File

@ -78,6 +78,18 @@ TEST_F(CollectionFilteringTest, FilterOnTextFields) {
results = coll_array_fields->search("Jeremy", query_fields, "tags : fine PLATINUM", facets, sort_fields, 0, 10, 1, FREQUENCY, false).get();
ASSERT_EQ(1, results["hits"].size());
// using just ":", filtering should return documents that contain ALL tokens in the filter expression
results = coll_array_fields->search("Jeremy", query_fields, "tags : PLATINUM", facets, sort_fields, 0, 10, 1, FREQUENCY, false).get();
ASSERT_EQ(1, results["hits"].size());
// no documents contain both "white" and "platinum", so no hits are expected
results = coll_array_fields->search("Jeremy", query_fields, "tags : WHITE PLATINUM", facets, sort_fields, 0, 10, 1, FREQUENCY, false).get();
ASSERT_EQ(0, results["hits"].size());
// with exact match operator (:=) partial matches are not allowed
results = coll_array_fields->search("Jeremy", query_fields, "tags:= PLATINUM", facets, sort_fields, 0, 10, 1, FREQUENCY, false).get();
ASSERT_EQ(0, results["hits"].size());
results = coll_array_fields->search("Jeremy", query_fields, "tags : bronze", facets, sort_fields, 0, 10, 1, FREQUENCY, false).get();
ASSERT_EQ(2, results["hits"].size());
@ -120,6 +132,151 @@ TEST_F(CollectionFilteringTest, FilterOnTextFields) {
collectionManager.drop_collection("coll_array_fields");
}
// Verifies filtering on the faceted string field `starring`:
//  - the exact operator (`:=`) only matches when the whole value is given verbatim;
//  - the contains operator (`:`) matches only when every filter token is present.
TEST_F(CollectionFilteringTest, FacetFieldStringFiltering) {
    Collection *coll_str;

    std::ifstream infile(std::string(ROOT_DIR)+"test/multi_field_documents.jsonl");
    // Fail right away if the fixture is missing, instead of surfacing it later as
    // confusing hit-count mismatches on an empty collection.
    ASSERT_TRUE(infile.is_open()) << "could not open test/multi_field_documents.jsonl";

    std::vector<field> fields = {
        field("title", field_types::STRING, false),
        field("starring", field_types::STRING, true),
        field("cast", field_types::STRING_ARRAY, false),
        field("points", field_types::INT32, false)
    };

    std::vector<sort_by> sort_fields = { sort_by("points", "DESC") };

    // Reuse the collection when it already exists, otherwise create it.
    coll_str = collectionManager.get_collection("coll_str").get();
    if(coll_str == nullptr) {
        coll_str = collectionManager.create_collection("coll_str", 1, fields, "points").get();
    }

    // Every line of the fixture file is added as one document.
    std::string json_line;
    while (std::getline(infile, json_line)) {
        nlohmann::json document = nlohmann::json::parse(json_line);
        coll_str->add(document.dump());
    }

    infile.close();

    query_fields = {"title"};
    std::vector<std::string> facets = {"starring"};

    // exact filter on string field must fail when single token is used
    auto results = coll_str->search("*", query_fields, "starring:= samuel", facets, sort_fields, 0, 10, 1, FREQUENCY, false).get();
    ASSERT_EQ(0, results["hits"].size());
    ASSERT_EQ(0, results["found"].get<size_t>());

    // multiple tokens but with a typo on one of them
    results = coll_str->search("*", query_fields, "starring:= ssamuel l. Jackson", facets, sort_fields, 0, 10, 1, FREQUENCY, false).get();
    ASSERT_EQ(0, results["hits"].size());
    ASSERT_EQ(0, results["found"].get<size_t>());

    // same should succeed when verbatim filter is made
    results = coll_str->search("*", query_fields, "starring:= samuel l. Jackson", facets, sort_fields, 0, 10, 1, FREQUENCY, false).get();
    ASSERT_EQ(2, results["hits"].size());
    ASSERT_EQ(2, results["found"].get<size_t>());

    // contains filter with a single token should work as well
    results = coll_str->search("*", query_fields, "starring: jackson", facets, sort_fields, 0, 10, 1, FREQUENCY, false).get();
    ASSERT_EQ(2, results["hits"].size());
    ASSERT_EQ(2, results["found"].get<size_t>());

    results = coll_str->search("*", query_fields, "starring: samuel", facets, sort_fields, 0, 10, 1, FREQUENCY, false).get();
    ASSERT_EQ(2, results["hits"].size());
    ASSERT_EQ(2, results["found"].get<size_t>());

    // contains filter where only one of the two tokens is present: must not match
    results = coll_str->search("*", query_fields, "starring: samuel johnson", facets, sort_fields, 0, 10, 1, FREQUENCY, false).get();
    ASSERT_EQ(0, results["hits"].size());
    ASSERT_EQ(0, results["found"].get<size_t>());

    collectionManager.drop_collection("coll_str");
}
// Verifies filtering on the faceted string-array field `tags`:
//  - the exact operator (`:=`) needs the complete tag value;
//  - the contains operator (`:`) accepts a single token of a tag;
//  - exact filtering on a non-faceted field is rejected with an error;
//  - `[a, b]` lists several exact values combined as an OR condition.
TEST_F(CollectionFilteringTest, FacetFieldStringArrayFiltering) {
    Collection *coll_array_fields;

    std::ifstream infile(std::string(ROOT_DIR)+"test/numeric_array_documents.jsonl");
    // Fail right away if the fixture is missing, instead of surfacing it later as
    // confusing hit-count mismatches on an empty collection.
    ASSERT_TRUE(infile.is_open()) << "could not open test/numeric_array_documents.jsonl";

    std::vector<field> fields = {field("name", field_types::STRING, false),
                                 field("name_facet", field_types::STRING, true),
                                 field("age", field_types::INT32, true),
                                 field("years", field_types::INT32_ARRAY, true),
                                 field("rating", field_types::FLOAT, true),
                                 field("timestamps", field_types::INT64_ARRAY, true),
                                 field("tags", field_types::STRING_ARRAY, true)};

    std::vector<sort_by> sort_fields = { sort_by("age", "DESC") };

    // Reuse the collection when it already exists, otherwise create it.
    coll_array_fields = collectionManager.get_collection("coll_array_fields").get();
    if(coll_array_fields == nullptr) {
        coll_array_fields = collectionManager.create_collection("coll_array_fields", 1, fields, "age").get();
    }

    // Every fixture line becomes one document; `name` is copied into the faceted
    // `name_facet` field before the document is added.
    std::string json_line;
    while (std::getline(infile, json_line)) {
        nlohmann::json document = nlohmann::json::parse(json_line);
        document["name_facet"] = document["name"];
        const std::string & patched_json_line = document.dump();
        coll_array_fields->add(patched_json_line);
    }

    infile.close();

    query_fields = {"name"};
    std::vector<std::string> facets = {"tags"};

    // facet with filter on string array field must fail when exact token is used
    auto results = coll_array_fields->search("Jeremy", query_fields, "tags:= PLATINUM", facets, sort_fields, 0, 10, 1, FREQUENCY, false).get();
    ASSERT_EQ(0, results["hits"].size());
    ASSERT_EQ(0, results["found"].get<size_t>());

    results = coll_array_fields->search("Jeremy", query_fields, "tags:= FINE", facets, sort_fields, 0, 10, 1, FREQUENCY, false).get();
    ASSERT_EQ(0, results["hits"].size());
    ASSERT_EQ(0, results["found"].get<size_t>());

    results = coll_array_fields->search("Jeremy", query_fields, "tags:= FFINE PLATINUM", facets, sort_fields, 0, 10, 1, FREQUENCY, false).get();
    ASSERT_EQ(0, results["hits"].size());
    ASSERT_EQ(0, results["found"].get<size_t>());

    // partial token filter should be made without "=" operator
    results = coll_array_fields->search("Jeremy", query_fields, "tags: PLATINUM", facets, sort_fields, 0, 10, 1, FREQUENCY, false).get();
    ASSERT_EQ(1, results["hits"].size());
    ASSERT_EQ(1, results["found"].get<size_t>());

    results = coll_array_fields->search("Jeremy", query_fields, "tags: FINE", facets, sort_fields, 0, 10, 1, FREQUENCY, false).get();
    ASSERT_EQ(1, results["hits"].size());
    ASSERT_EQ(1, results["found"].get<size_t>());

    // to make tokens match facet value exactly, use "=" operator
    results = coll_array_fields->search("Jeremy", query_fields, "tags:= FINE PLATINUM", facets, sort_fields, 0, 10, 1, FREQUENCY, false).get();
    ASSERT_EQ(1, results["hits"].size());
    ASSERT_EQ(1, results["found"].get<size_t>());

    // don't allow exact filter on non-faceted field
    auto res_op = coll_array_fields->search("Jeremy", query_fields, "name:= Jeremy Howard", facets, sort_fields, 0, 10, 1, FREQUENCY, false);
    ASSERT_FALSE(res_op.ok());
    ASSERT_STREQ("To perform exact filtering, filter field `name` must be a facet field.", res_op.error().c_str());

    // multi match exact query (OR condition)
    results = coll_array_fields->search("Jeremy", query_fields, "tags:= [Gold, bronze]", facets, sort_fields, 0, 10, 1, FREQUENCY, false).get();
    ASSERT_EQ(3, results["hits"].size());
    ASSERT_EQ(3, results["found"].get<size_t>());

    results = coll_array_fields->search("Jeremy", query_fields, "tags:= [Gold, bronze, fine PLATINUM]", facets, sort_fields, 0, 10, 1, FREQUENCY, false).get();
    ASSERT_EQ(4, results["hits"].size());
    ASSERT_EQ(4, results["found"].get<size_t>());

    // single array multi match
    results = coll_array_fields->search("Jeremy", query_fields, "tags:= [fine PLATINUM]", facets, sort_fields, 0, 10, 1, FREQUENCY, false).get();
    ASSERT_EQ(1, results["hits"].size());
    ASSERT_EQ(1, results["found"].get<size_t>());

    collectionManager.drop_collection("coll_array_fields");
}
TEST_F(CollectionFilteringTest, FilterOnTextFieldWithColon) {
Collection *coll1;