diff --git a/include/filter_result_iterator.h b/include/filter_result_iterator.h
index da27131a..abec6254 100644
--- a/include/filter_result_iterator.h
+++ b/include/filter_result_iterator.h
@@ -280,6 +280,10 @@ private:
     /// Sample filter: [>10, !15].
    std::unordered_set<uint32_t> numerical_not_iterator_index;
 
+    /// A string filter can specify a prefix value match.
+    /// Sample filter: [Chris P*].
+    std::unordered_set<uint32_t> string_prefix_filter_index;
+
     bool delete_filter_node = false;
 
     std::unique_ptr<filter_result_iterator_timeout_info> timeout_info;
diff --git a/include/posting_list.h b/include/posting_list.h
index 13ee87ca..40033173 100644
--- a/include/posting_list.h
+++ b/include/posting_list.h
@@ -203,10 +203,19 @@ public:
     static bool is_single_token_verbatim_match(const posting_list_t::iterator_t& it, bool field_is_array);
 
+    static bool is_single_token_prefix_match(const posting_list_t::iterator_t& it, bool field_is_array);
+
+    static void get_prefix_matches(std::vector<iterator_t>& its, const bool field_is_array,
+                                   const uint32_t* ids, const uint32_t num_ids,
+                                   uint32_t*& prefix_ids, size_t& num_prefix_ids);
+
     static void get_exact_matches(std::vector<iterator_t>& its, bool field_is_array,
                                   const uint32_t* ids, const uint32_t num_ids,
                                   uint32_t*& exact_ids, size_t& num_exact_ids);
 
+    static bool has_prefix_match(std::vector<iterator_t>& posting_list_iterators,
+                                 const bool field_is_array);
+
     static bool has_exact_match(std::vector<iterator_t>& posting_list_iterators,
                                 const bool field_is_array);
diff --git a/src/filter_result_iterator.cpp b/src/filter_result_iterator.cpp
index df21fd65..e4a94e13 100644
--- a/src/filter_result_iterator.cpp
+++ b/src/filter_result_iterator.cpp
@@ -355,7 +355,7 @@ void filter_result_iterator_t::get_string_filter_next_match(const bool& field_is
     uint32_t lowest_id = UINT32_MAX;
 
     if (filter_node->filter_exp.comparators[0] == EQUALS || filter_node->filter_exp.comparators[0] == NOT_EQUALS) {
-        bool exact_match_found = false;
+        bool match_found = false;
         switch (posting_list_iterators.size()) {
             case 1:
                 while(true) {
@@ -366,28 +366,32 @@ void filter_result_iterator_t::get_string_filter_next_match(const bool& field_is
                        break;
                    }
 
-                    if (posting_list_t::has_exact_match(posting_list_iterators[0], field_is_array)) {
-                        exact_match_found = true;
-                        break;
-                    } else {
-                        // Keep advancing token iterators till exact match is not found.
-                        for (auto& iter: posting_list_iterators[0]) {
-                            if (!iter.valid()) {
-                                break;
-                            }
+                    match_found = string_prefix_filter_index.count(0) == 0 ?
+                                    posting_list_t::has_exact_match(posting_list_iterators[0], field_is_array) :
+                                    posting_list_t::has_prefix_match(posting_list_iterators[0], field_is_array);
 
-                            iter.next();
+                    if (match_found) {
+                        break;
+                    }
+
+                    // Keep advancing the token iterators until a match is found.
+                    for (auto& iter: posting_list_iterators[0]) {
+                        if (!iter.valid()) {
+                            break;
                        }
+
+                        iter.next();
                    }
                }
 
-                if (one_is_valid && exact_match_found) {
+                if (one_is_valid && match_found) {
                     lowest_id = posting_list_iterators[0][0].id();
                 }
                 break;
             default :
-                for (auto& filter_value_tokens : posting_list_iterators) {
+                for (uint32_t i = 0; i < posting_list_iterators.size(); i++) {
+                    auto& filter_value_tokens = posting_list_iterators[i];
                     bool tokens_iter_is_valid;
                     while(true) {
                         // Perform AND between tokens of a filter value.
@@ -397,24 +401,27 @@ void filter_result_iterator_t::get_string_filter_next_match(const bool& field_is
                            break;
                        }
 
-                        if (posting_list_t::has_exact_match(filter_value_tokens, field_is_array)) {
-                            exact_match_found = true;
-                            break;
-                        } else {
-                            // Keep advancing token iterators till exact match is not found.
-                            for (auto &iter: filter_value_tokens) {
-                                if (!iter.valid()) {
-                                    break;
-                                }
+                        match_found = string_prefix_filter_index.count(i) == 0 ?
+                                        posting_list_t::has_exact_match(filter_value_tokens, field_is_array) :
+                                        posting_list_t::has_prefix_match(filter_value_tokens, field_is_array);
 
-                                iter.next();
+                        if (match_found) {
+                            break;
+                        }
+
+                        // Keep advancing the token iterators until a match is found.
+                        for (auto &iter: filter_value_tokens) {
+                            if (!iter.valid()) {
+                                break;
                            }
+
+                            iter.next();
                        }
                    }
 
                     one_is_valid = tokens_iter_is_valid || one_is_valid;
 
-                    if (tokens_iter_is_valid && exact_match_found && filter_value_tokens[0].id() < lowest_id) {
+                    if (tokens_iter_is_valid && match_found && filter_value_tokens[0].id() < lowest_id) {
                         lowest_id = filter_value_tokens[0].id();
                     }
                 }
@@ -1360,7 +1367,8 @@ void filter_result_iterator_t::init() {
         } else if (f.is_string()) {
             art_tree* t = index->search_index.at(a_filter.field_name);
 
-            for (std::string filter_value : a_filter.values) {
+            for (uint32_t i = 0; i < a_filter.values.size(); i++) {
+                auto filter_value = a_filter.values[i];
                 auto is_prefix_match = filter_value.size() > 1 && filter_value[filter_value.size() - 1] == '*';
                 if (is_prefix_match) {
                     filter_value.erase(filter_value.size() - 1);
@@ -1469,6 +1477,7 @@
                         continue;
                     }
 
+                    string_prefix_filter_index.insert(posting_lists.size());
                     posting_lists.push_back(plists);
                     posting_list_iterators.emplace_back(std::vector<posting_list_t::iterator_t>());
                     for (auto const& plist: plists) {
@@ -2485,7 +2494,33 @@ void filter_result_iterator_t::compute_iterators() {
     for (uint32_t i = 0; i < posting_lists.size(); i++) {
         auto& p_list = posting_lists[i];
 
-        if (a_filter.comparators[0] == EQUALS || a_filter.comparators[0] == NOT_EQUALS) {
+        if (string_prefix_filter_index.count(i) != 0 &&
+            (a_filter.comparators[0] == EQUALS || a_filter.comparators[0] == NOT_EQUALS)) {
+            // Exact prefix match: needs intersection + prefix matching.
+            std::vector<uint32_t> result_id_vec;
+            posting_list_t::intersect(p_list, result_id_vec);
+
+            if (result_id_vec.empty()) {
+                continue;
+            }
+
+            // Need to do prefix match.
+            uint32_t* prefix_str_ids = new uint32_t[result_id_vec.size()];
+            size_t prefix_str_ids_size = 0;
+            std::unique_ptr<uint32_t[]> prefix_str_ids_guard(prefix_str_ids);
+
+            posting_list_t::get_prefix_matches(posting_list_iterators[i], f.is_array(),
+                                               result_id_vec.data(), result_id_vec.size(),
+                                               prefix_str_ids, prefix_str_ids_size);
+
+            if (prefix_str_ids_size == 0) {
+                continue;
+            }
+
+            for (size_t pi = 0; pi < prefix_str_ids_size; pi++) {
+                f_id_buff.push_back(prefix_str_ids[pi]);
+            }
+        } else if (a_filter.comparators[0] == EQUALS || a_filter.comparators[0] == NOT_EQUALS) {
             // needs intersection + exact matching (unlike CONTAINS)
             std::vector<uint32_t> result_id_vec;
             posting_list_t::intersect(p_list, result_id_vec);
diff --git a/src/posting_list.cpp b/src/posting_list.cpp
index 49d09e2c..a548f9ce 100644
--- a/src/posting_list.cpp
+++ b/src/posting_list.cpp
@@ -1111,6 +1111,173 @@ bool posting_list_t::contains_atleast_one(const uint32_t* target_ids, size_t tar
     return false;
 }
 
+bool posting_list_t::is_single_token_prefix_match(const posting_list_t::iterator_t& it, bool field_is_array) {
+    block_t* curr_block = it.block();
+    uint32_t curr_index = it.index();
+
+    if (curr_block == nullptr || curr_index == UINT32_MAX) {
+        return false;
+    }
+
+    uint32_t* offsets = it.offsets;
+    uint32_t start_offset = it.offset_index[curr_index];
+
+    // If the field value starts with the token, it's a match.
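+    // Offsets hold 1-based token positions within the field value, so a first offset
+    // of 1 means the matched token is the leading token of the value (e.g. "steve" in
+    // "Steve Jobs" for the filter `name:= S*`).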
+    return offsets[start_offset] == 1;
+}
+
+void posting_list_t::get_prefix_matches(std::vector<iterator_t>& its, const bool field_is_array,
+                                        const uint32_t* ids, const uint32_t num_ids,
+                                        uint32_t*& prefix_ids, size_t& num_prefix_ids) {
+    size_t prefix_id_index = 0;
+
+    if (its.size() == 1) {
+        for (size_t i = 0; i < num_ids; i++) {
+            auto const& id = ids[i];
+            its[0].skip_to(id);
+            if (is_single_token_prefix_match(its[0], field_is_array)) {
+                prefix_ids[prefix_id_index++] = id;
+            }
+        }
+    } else {
+
+        if (!field_is_array) {
+            for (size_t i = 0; i < num_ids; i++) {
+                uint32_t id = ids[i];
+                bool is_match = true;
+
+                for (int j = its.size()-1; j >= 0; j--) {
+                    posting_list_t::iterator_t& it = its[j];
+                    it.skip_to(id);
+
+                    block_t* curr_block = it.block();
+                    uint32_t curr_index = it.index();
+
+                    if (curr_block == nullptr || curr_index == UINT32_MAX) {
+                        is_match = false;
+                        break;
+                    }
+
+                    uint32_t* offsets = it.offsets;
+
+                    uint32_t start_offset_index = it.offset_index[curr_index];
+                    uint32_t end_offset_index = (curr_index == curr_block->size() - 1) ?
+                                                curr_block->offsets.getLength() :
+                                                it.offset_index[curr_index + 1];
+
+                    // looping handles duplicate query tokens, e.g. "hip hip hurray hurray"
+                    while (start_offset_index < end_offset_index) {
+                        uint32_t offset = offsets[start_offset_index];
+                        start_offset_index++;
+
+                        if (offset == (j + 1)) {
+                            // we have found a matching index, no need to look further for this token
+                            is_match = true;
+                            break;
+                        }
+
+                        if (offset > (j + 1)) {
+                            is_match = false;
+                            break;
+                        }
+                    }
+
+                    if (!is_match) {
+                        break;
+                    }
+                }
+
+                if (is_match) {
+                    prefix_ids[prefix_id_index++] = id;
+                }
+            }
+        }
+
+        else {
+            // field is an array
+
+            struct token_index_meta_t {
+                std::bitset<32> token_index;
+            };
+
+            for (size_t i = 0; i < num_ids; i++) {
+                uint32_t id = ids[i];
+
+                std::map<size_t, token_index_meta_t> array_index_to_token_index;
+                bool premature_exit = false;
+
+                for (int j = its.size()-1; j >= 0; j--) {
+                    posting_list_t::iterator_t& it = its[j];
+
+                    it.skip_to(id);
+
+                    block_t* curr_block = it.block();
+                    uint32_t curr_index = it.index();
+
+                    if (curr_block == nullptr || curr_index == UINT32_MAX) {
+                        premature_exit = true;
+                        break;
+                    }
+
+                    uint32_t* offsets = it.offsets;
+                    uint32_t start_offset_index = it.offset_index[curr_index];
+                    uint32_t end_offset_index = (curr_index == curr_block->size() - 1) ?
+                                                curr_block->offsets.getLength() :
+                                                it.offset_index[curr_index + 1];
+
+                    int prev_pos = -1;
+                    bool found_matching_index = false;
+                    size_t num_matching_index = 0;
+
+                    while (start_offset_index < end_offset_index) {
+                        int pos = offsets[start_offset_index];
+                        start_offset_index++;
+
+                        if (pos == prev_pos) {  // indicates end of array index
+                            size_t array_index = (size_t) offsets[start_offset_index];
+
+                            if (found_matching_index) {
+                                array_index_to_token_index[array_index].token_index.set(j+1);
+                            }
+
+                            start_offset_index++;  // skip current value which is the array index or flag for last index
+                            prev_pos = -1;
+                            found_matching_index = false;
+                            continue;
+                        }
+
+                        if (pos == (j + 1)) {
+                            // we have found a matching index
+                            found_matching_index = true;
+                            num_matching_index++;
+                        }
+
+                        prev_pos = pos;
+                    }
+
+                    if (num_matching_index == 0) {
+                        // not even a single matching index found: can never be a match
+                        premature_exit = true;
+                        break;
+                    }
+                }
+
+                if (!premature_exit) {
+                    // iterate array index to token index to check if at least 1 array position contains all tokens
+                    for (auto& kv: array_index_to_token_index) {
+                        if (kv.second.token_index.count() == its.size()) {
+                            prefix_ids[prefix_id_index++] = id;
+                            break;
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    num_prefix_ids = prefix_id_index;
+}
+
 void posting_list_t::get_exact_matches(std::vector<iterator_t>& its, const bool field_is_array,
                                        const uint32_t* ids, const uint32_t num_ids,
                                        uint32_t*& exact_ids, size_t& num_exact_ids) {
@@ -1292,6 +1459,123 @@ void posting_list_t::get_exact_matches(std::vector<iterator_t>& its, const bool
     num_exact_ids = exact_id_index;
 }
 
+bool posting_list_t::has_prefix_match(std::vector<iterator_t>& posting_list_iterators,
+                                      const bool field_is_array) {
+    if (posting_list_iterators.empty()) {
+        return false;
+    }
+
+    if (posting_list_iterators.size() == 1) {
+        return is_single_token_prefix_match(posting_list_iterators[0], field_is_array);
+    }
+
+    if (!field_is_array) {
+        for (uint32_t i = 0; i < posting_list_iterators.size(); i++) {
+            posting_list_t::iterator_t& it = posting_list_iterators[i];
+
+            block_t* curr_block = it.block();
+            uint32_t curr_index = it.index();
+
+            if (curr_block == nullptr || curr_index == UINT32_MAX) {
+                return false;
+            }
+
+            uint32_t* offsets = it.offsets;
+
+            uint32_t start_offset_index = it.offset_index[curr_index];
+            uint32_t end_offset_index = (curr_index == curr_block->size() - 1) ?
+                                        curr_block->offsets.getLength() :
+                                        it.offset_index[curr_index + 1];
+
+            // looping handles duplicate query tokens, e.g. "hip hip hurray hurray"
+            while (start_offset_index < end_offset_index) {
+                uint32_t offset = offsets[start_offset_index];
+                start_offset_index++;
+
+                if (offset == (i + 1)) {
+                    // we have found a matching index, no need to look further for this token.
+                    break;
+                }
+
+                if (offset > (i + 1)) {
+                    return false;
+                }
+            }
+        }
+    }
+
+    else {
+        // field is an array
+
+        struct token_index_meta_t {
+            std::bitset<32> token_index;
+        };
+
+        std::map<size_t, token_index_meta_t> array_index_to_token_index;
+
+        for (int i = posting_list_iterators.size() - 1; i >= 0; i--) {
+            posting_list_t::iterator_t& it = posting_list_iterators[i];
+
+            block_t* curr_block = it.block();
+            uint32_t curr_index = it.index();
+
+            if (curr_block == nullptr || curr_index == UINT32_MAX) {
+                return false;
+            }
+
+            uint32_t* offsets = it.offsets;
+            uint32_t start_offset_index = it.offset_index[curr_index];
+            uint32_t end_offset_index = (curr_index == curr_block->size() - 1) ?
+                                        curr_block->offsets.getLength() :
+                                        it.offset_index[curr_index + 1];
+
+            int prev_pos = -1;
+            bool found_matching_index = false;
+            size_t num_matching_index = 0;
+
+            while (start_offset_index < end_offset_index) {
+                int pos = offsets[start_offset_index];
+                start_offset_index++;
+
+                if (pos == prev_pos) {  // indicates end of array index
+                    size_t array_index = (size_t) offsets[start_offset_index];
+
+                    if (found_matching_index) {
+                        array_index_to_token_index[array_index].token_index.set(i + 1);
+                    }
+
+                    start_offset_index++;  // skip current value which is the array index or flag for last index
+                    prev_pos = -1;
+                    found_matching_index = false;
+                    continue;
+                }
+
+                if (pos == (i + 1)) {
+                    // we have found a matching index
+                    found_matching_index = true;
+                    num_matching_index++;
+                }
+
+                prev_pos = pos;
+            }
+
+            if (num_matching_index == 0) {
+                // not even a single matching index found: can never be a match
+                return false;
+            }
+        }
+
+        // iterate array index to token index to check if at least 1 array position contains all tokens
+        for (auto& kv: array_index_to_token_index) {
+            if (kv.second.token_index.count() == posting_list_iterators.size()) {
+                return true;
+            }
+        }
+    }
+
+    return true;
+}
+
 bool posting_list_t::has_exact_match(std::vector<iterator_t>& posting_list_iterators,
                                      const bool field_is_array) {
     if(posting_list_iterators.size() == 1) {
diff --git a/test/collection_filtering_test.cpp b/test/collection_filtering_test.cpp
index 56953d84..e704cec7 100644
--- a/test/collection_filtering_test.cpp
+++ b/test/collection_filtering_test.cpp
@@ -2563,6 +2563,368 @@ TEST_F(CollectionFilteringTest, PrefixFilterOnTextFields) {
         std::string id = ids.at(i);
         ASSERT_EQ(id, result_id);
     }
+
+    auto schema_json =
+            R"({
+                "name": "Names",
+                "fields": [
+                    {"name": "name", "type": "string", "optional": true},
+                    {"name": "names", "type": "string[]", "optional": true}
+                ]
+            })"_json;
+    std::vector<nlohmann::json> documents = {
+            R"({
+                "name": "Steve Jobs"
+            })"_json,
+            R"({
+                "name": "Adam Stator"
+            })"_json,
+    };
+
+    auto collection_create_op = collectionManager.create_collection(schema_json);
+    ASSERT_TRUE(collection_create_op.ok());
+    for (auto const &json: documents) {
+        auto add_op = collection_create_op.get()->add(json.dump());
+        ASSERT_TRUE(add_op.ok());
+    }
+
+    std::map<std::string, std::string> req_params = {
+            {"collection", "Names"},
+            {"q", "*"},
+            {"query_by", "name"},
+            {"filter_by", "name:= S*"}
+    };
+    nlohmann::json embedded_params;
+    std::string json_res;
+    auto now_ts = std::chrono::duration_cast<std::chrono::microseconds>(
+            std::chrono::system_clock::now().time_since_epoch()).count();
+
+    auto search_op = collectionManager.do_search(req_params, embedded_params, json_res, now_ts);
+    ASSERT_TRUE(search_op.ok());
+
+    auto res_obj = nlohmann::json::parse(json_res);
+    ASSERT_EQ(1, res_obj["found"].get<size_t>());
+    ASSERT_EQ(1, res_obj["hits"].size());
+    ASSERT_EQ("Steve Jobs", res_obj["hits"][0]["document"].at("name"));
+
+    req_params = {
+            {"collection", "Names"},
+            {"q", "*"},
+            {"query_by", "name"},
+            {"filter_by", "name: S*"}
+    };
+
+    search_op = collectionManager.do_search(req_params, embedded_params, json_res, now_ts);
+    ASSERT_TRUE(search_op.ok());
+
+    res_obj = nlohmann::json::parse(json_res);
+    ASSERT_EQ(2, res_obj["found"].get<size_t>());
+    ASSERT_EQ(2, res_obj["hits"].size());
+    ASSERT_EQ("Adam Stator", res_obj["hits"][0]["document"].at("name"));
+    ASSERT_EQ("Steve Jobs", res_obj["hits"][1]["document"].at("name"));
+
+    documents = {
+            R"({
+                "name": "Steve Reiley"
+            })"_json,
+            R"({
+                "name": "Storm"
+            })"_json,
+            R"({
+                "name": "Steve Rogers"
+            })"_json,
+    };
+
+    for (auto const &json: documents) {
+        auto add_op = collection_create_op.get()->add(json.dump());
+        ASSERT_TRUE(add_op.ok());
+    }
+
+    req_params = {
+            {"collection", "Names"},
+            {"q", "s"},
+            {"query_by", "name"},
+            {"filter_by", "name:= St*"}
+    };
+
+    search_op = collectionManager.do_search(req_params, embedded_params, json_res, now_ts);
+    ASSERT_TRUE(search_op.ok());
+
+    res_obj = nlohmann::json::parse(json_res);
+    ASSERT_EQ(4, res_obj["found"].get<size_t>());
+    ASSERT_EQ(4, res_obj["hits"].size());
+    ASSERT_EQ("Steve Rogers", res_obj["hits"][0]["document"].at("name"));
+    ASSERT_EQ("Storm", res_obj["hits"][1]["document"].at("name"));
+    ASSERT_EQ("Steve Reiley", res_obj["hits"][2]["document"].at("name"));
+    ASSERT_EQ("Steve Jobs", res_obj["hits"][3]["document"].at("name"));
+
+    req_params = {
+            {"collection", "Names"},
+            {"q", "s"},
+            {"query_by", "name"},
+            {"filter_by", "name: St*"}
+    };
+
+    search_op = collectionManager.do_search(req_params, embedded_params, json_res, now_ts);
+    ASSERT_TRUE(search_op.ok());
+
+    res_obj = nlohmann::json::parse(json_res);
+    ASSERT_EQ(5, res_obj["found"].get<size_t>());
+    ASSERT_EQ(5, res_obj["hits"].size());
+    ASSERT_EQ("Steve Rogers", res_obj["hits"][0]["document"].at("name"));
+    ASSERT_EQ("Storm", res_obj["hits"][1]["document"].at("name"));
+    ASSERT_EQ("Steve Reiley", res_obj["hits"][2]["document"].at("name"));
+    ASSERT_EQ("Adam Stator", res_obj["hits"][3]["document"].at("name"));
+    ASSERT_EQ("Steve Jobs", res_obj["hits"][4]["document"].at("name"));
+
+    req_params = {
+            {"collection", "Names"},
+            {"q", "s"},
+            {"query_by", "name"},
+            {"filter_by", "name:= Steve R*"}
+    };
+
+    search_op = collectionManager.do_search(req_params, embedded_params, json_res, now_ts);
+    ASSERT_TRUE(search_op.ok());
+
+    res_obj = nlohmann::json::parse(json_res);
+    ASSERT_EQ(2, res_obj["found"].get<size_t>());
+    ASSERT_EQ(2, res_obj["hits"].size());
+    ASSERT_EQ("Steve Rogers", res_obj["hits"][0]["document"].at("name"));
+    ASSERT_EQ("Steve Reiley", res_obj["hits"][1]["document"].at("name"));
+
+    req_params = {
+            {"collection", "Names"},
+            {"q", "s"},
+            {"query_by", "name"},
+            {"filter_by", "name: Steve R*"}
+    };
+
+    search_op = collectionManager.do_search(req_params, embedded_params, json_res, now_ts);
+    ASSERT_TRUE(search_op.ok());
+
+    res_obj = nlohmann::json::parse(json_res);
+    ASSERT_EQ(2, res_obj["found"].get<size_t>());
+    ASSERT_EQ(2, res_obj["hits"].size());
+    ASSERT_EQ("Steve Rogers", res_obj["hits"][0]["document"].at("name"));
+    ASSERT_EQ("Steve Reiley", res_obj["hits"][1]["document"].at("name"));
+
+    documents = {
+            R"({
+                "names": []
+            })"_json,
+            R"({
+                "names": ["Steve Jobs"]
+            })"_json,
+            R"({
+                "names": ["Adam Stator"]
+            })"_json
+    };
+
+    for (auto const &json: documents) {
+        auto add_op = collection_create_op.get()->add(json.dump());
+        ASSERT_TRUE(add_op.ok());
+    }
+
+    req_params = {
+            {"collection", "Names"},
+            {"q", "s"},
+            {"query_by", "names"},
+            {"filter_by", "names:= St*"}
+    };
+
+    search_op = collectionManager.do_search(req_params, embedded_params, json_res, now_ts);
+    ASSERT_TRUE(search_op.ok());
+
+    res_obj = nlohmann::json::parse(json_res);
+    ASSERT_EQ(1, res_obj["found"].get<size_t>());
+    ASSERT_EQ(1, res_obj["hits"].size());
+    ASSERT_EQ("Steve Jobs", res_obj["hits"][0]["document"]["names"][0]);
+
+    req_params = {
+            {"collection", "Names"},
+            {"q", "s"},
+            {"query_by", "names"},
+            {"filter_by", "names: St*"}
+    };
+
+    search_op = collectionManager.do_search(req_params, embedded_params, json_res, now_ts);
+    ASSERT_TRUE(search_op.ok());
+
+    res_obj = nlohmann::json::parse(json_res);
+    ASSERT_EQ(2, res_obj["found"].get<size_t>());
res_obj["found"].get()); + ASSERT_EQ(2, res_obj["hits"].size()); + ASSERT_EQ("Adam Stator", res_obj["hits"][0]["document"]["names"][0]); + ASSERT_EQ("Steve Jobs", res_obj["hits"][1]["document"]["names"][0]); + + documents = { + R"({ + "names": ["Steve Reiley"] + })"_json, + R"({ + "names": ["Storm"] + })"_json, + R"({ + "names": ["Adam", "Steve Rogers"] + })"_json, + }; + + for (auto const &json: documents) { + auto add_op = collection_create_op.get()->add(json.dump()); + ASSERT_TRUE(add_op.ok()); + } + + req_params = { + {"collection", "Names"}, + {"q", "s"}, + {"query_by", "names"}, + {"filter_by", "names:= St*"} + }; + + search_op = collectionManager.do_search(req_params, embedded_params, json_res, now_ts); + ASSERT_TRUE(search_op.ok()); + + res_obj = nlohmann::json::parse(json_res); + ASSERT_EQ(4, res_obj["found"].get()); + ASSERT_EQ(4, res_obj["hits"].size()); + ASSERT_EQ("Steve Rogers", res_obj["hits"][0]["document"]["names"][1]); + ASSERT_EQ("Storm", res_obj["hits"][1]["document"]["names"][0]); + ASSERT_EQ("Steve Reiley", res_obj["hits"][2]["document"]["names"][0]); + ASSERT_EQ("Steve Jobs", res_obj["hits"][3]["document"]["names"][0]); + + req_params = { + {"collection", "Names"}, + {"q", "s"}, + {"query_by", "names"}, + {"filter_by", "names: St*"} + }; + + search_op = collectionManager.do_search(req_params, embedded_params, json_res, now_ts); + ASSERT_TRUE(search_op.ok()); + + res_obj = nlohmann::json::parse(json_res); + ASSERT_EQ(5, res_obj["found"].get()); + ASSERT_EQ(5, res_obj["hits"].size()); + ASSERT_EQ("Steve Rogers", res_obj["hits"][0]["document"]["names"][1]); + ASSERT_EQ("Storm", res_obj["hits"][1]["document"]["names"][0]); + ASSERT_EQ("Steve Reiley", res_obj["hits"][2]["document"]["names"][0]); + ASSERT_EQ("Adam Stator", res_obj["hits"][3]["document"]["names"][0]); + ASSERT_EQ("Steve Jobs", res_obj["hits"][4]["document"]["names"][0]); + + req_params = { + {"collection", "Names"}, + {"q", "s"}, + {"query_by", "names"}, + {"filter_by", "names:= Steve*"} + }; + + search_op = collectionManager.do_search(req_params, embedded_params, json_res, now_ts); + ASSERT_TRUE(search_op.ok()); + + res_obj = nlohmann::json::parse(json_res); + ASSERT_EQ(3, res_obj["found"].get()); + ASSERT_EQ(3, res_obj["hits"].size()); + ASSERT_EQ("Steve Rogers", res_obj["hits"][0]["document"]["names"][1]); + ASSERT_EQ("Steve Reiley", res_obj["hits"][1]["document"]["names"][0]); + ASSERT_EQ("Steve Jobs", res_obj["hits"][2]["document"]["names"][0]); + + req_params = { + {"collection", "Names"}, + {"q", "s"}, + {"query_by", "names"}, + {"filter_by", "names: Steve*"} + }; + + search_op = collectionManager.do_search(req_params, embedded_params, json_res, now_ts); + ASSERT_TRUE(search_op.ok()); + + res_obj = nlohmann::json::parse(json_res); + ASSERT_EQ(3, res_obj["found"].get()); + ASSERT_EQ(3, res_obj["hits"].size()); + ASSERT_EQ("Steve Rogers", res_obj["hits"][0]["document"]["names"][1]); + ASSERT_EQ("Steve Reiley", res_obj["hits"][1]["document"]["names"][0]); + ASSERT_EQ("Steve Jobs", res_obj["hits"][2]["document"]["names"][0]); + + req_params = { + {"collection", "Names"}, + {"q", "s"}, + {"query_by", "names"}, + {"filter_by", "names:= Steve R*"} + }; + + search_op = collectionManager.do_search(req_params, embedded_params, json_res, now_ts); + ASSERT_TRUE(search_op.ok()); + + res_obj = nlohmann::json::parse(json_res); + ASSERT_EQ(2, res_obj["found"].get()); + ASSERT_EQ(2, res_obj["hits"].size()); + ASSERT_EQ("Steve Rogers", res_obj["hits"][0]["document"]["names"][1]); + ASSERT_EQ("Steve Reiley", 
res_obj["hits"][1]["document"]["names"][0]); + + req_params = { + {"collection", "Names"}, + {"q", "s"}, + {"query_by", "names"}, + {"filter_by", "names: Steve R*"} + }; + + search_op = collectionManager.do_search(req_params, embedded_params, json_res, now_ts); + ASSERT_TRUE(search_op.ok()); + + res_obj = nlohmann::json::parse(json_res); + ASSERT_EQ(2, res_obj["found"].get()); + ASSERT_EQ(2, res_obj["hits"].size()); + ASSERT_EQ("Steve Rogers", res_obj["hits"][0]["document"]["names"][1]); + ASSERT_EQ("Steve Reiley", res_obj["hits"][1]["document"]["names"][0]); + + documents = { + R"({ + "names": ["Steve Runner foo"] + })"_json, + R"({ + "names": ["foo Steve Runner"] + })"_json, + }; + + for (auto const &json: documents) { + auto add_op = collection_create_op.get()->add(json.dump()); + ASSERT_TRUE(add_op.ok()); + } + + req_params = { + {"collection", "Names"}, + {"q", "s"}, + {"query_by", "names"}, + {"filter_by", "names:= Steve R*"} + }; + + search_op = collectionManager.do_search(req_params, embedded_params, json_res, now_ts); + ASSERT_TRUE(search_op.ok()); + + res_obj = nlohmann::json::parse(json_res); + ASSERT_EQ(3, res_obj["found"].get()); + ASSERT_EQ(3, res_obj["hits"].size()); + ASSERT_EQ("Steve Runner foo", res_obj["hits"][0]["document"]["names"][0]); + ASSERT_EQ("Steve Rogers", res_obj["hits"][1]["document"]["names"][1]); + ASSERT_EQ("Steve Reiley", res_obj["hits"][2]["document"]["names"][0]); + + req_params = { + {"collection", "Names"}, + {"q", "s"}, + {"query_by", "names"}, + {"filter_by", "names: Steve R*"} + }; + + search_op = collectionManager.do_search(req_params, embedded_params, json_res, now_ts); + ASSERT_TRUE(search_op.ok()); + + res_obj = nlohmann::json::parse(json_res); + ASSERT_EQ(4, res_obj["found"].get()); + ASSERT_EQ(4, res_obj["hits"].size()); + ASSERT_EQ("foo Steve Runner", res_obj["hits"][0]["document"]["names"][0]); + ASSERT_EQ("Steve Runner foo", res_obj["hits"][1]["document"]["names"][0]); + ASSERT_EQ("Steve Rogers", res_obj["hits"][2]["document"]["names"][1]); + ASSERT_EQ("Steve Reiley", res_obj["hits"][3]["document"]["names"][0]); } TEST_F(CollectionFilteringTest, FilterOnStemmedField) { diff --git a/test/filter_test.cpp b/test/filter_test.cpp index e44dc6d6..e1bdada5 100644 --- a/test/filter_test.cpp +++ b/test/filter_test.cpp @@ -1503,3 +1503,266 @@ TEST_F(FilterTest, NumericFilterIterator) { delete filter_tree_root; } + +TEST_F(FilterTest, PrefixStringFilter) { + auto schema_json = + R"({ + "name": "Names", + "fields": [ + {"name": "name", "type": "string"} + ] + })"_json; + std::vector documents = { + R"({ + "name": "Steve Jobs" + })"_json, + R"({ + "name": "Adam Stator" + })"_json, + }; + + auto collection_create_op = collectionManager.create_collection(schema_json); + ASSERT_TRUE(collection_create_op.ok()); + Collection* coll = collection_create_op.get(); + for (auto const &json: documents) { + auto add_op = coll->add(json.dump()); + ASSERT_TRUE(add_op.ok()); + } + + const std::string doc_id_prefix = std::to_string(coll->get_collection_id()) + "_" + Collection::DOC_ID_PREFIX + "_"; + filter_node_t* filter_tree_root = nullptr; + + search_stop_us = UINT64_MAX; // `Index::fuzzy_search_fields` checks for timeout. 
+ Option filter_op = filter::parse_filter_query("name:= S*", coll->get_schema(), store, doc_id_prefix, + filter_tree_root); + ASSERT_TRUE(filter_op.ok()); + + auto computed_exact_prefix_test = filter_result_iterator_t(coll->get_name(), coll->_get_index(), filter_tree_root); + ASSERT_TRUE(computed_exact_prefix_test.init_status().ok()); + ASSERT_TRUE(computed_exact_prefix_test._get_is_filter_result_initialized()); + + std::vector expected = {0}; + for (auto const& i : expected) { + ASSERT_EQ(filter_result_iterator_t::valid, computed_exact_prefix_test.validity); + ASSERT_EQ(i, computed_exact_prefix_test.seq_id); + computed_exact_prefix_test.next(); + } + ASSERT_EQ(filter_result_iterator_t::invalid, computed_exact_prefix_test.validity); + + delete filter_tree_root; + filter_tree_root = nullptr; + filter_op = filter::parse_filter_query("name: S*", coll->get_schema(), store, doc_id_prefix, + filter_tree_root); + ASSERT_TRUE(filter_op.ok()); + + auto computed_contains_prefix_test = filter_result_iterator_t(coll->get_name(), coll->_get_index(), filter_tree_root); + ASSERT_TRUE(computed_contains_prefix_test.init_status().ok()); + ASSERT_TRUE(computed_contains_prefix_test._get_is_filter_result_initialized()); + + expected = {0, 1}; + for (auto const& i : expected) { + ASSERT_EQ(filter_result_iterator_t::valid, computed_contains_prefix_test.validity); + ASSERT_EQ(i, computed_contains_prefix_test.seq_id); + computed_contains_prefix_test.next(); + } + ASSERT_EQ(filter_result_iterator_t::invalid, computed_contains_prefix_test.validity); + + delete filter_tree_root; + + documents = { + R"({ + "name": "Steve Reiley" + })"_json, + R"({ + "name": "Storm" + })"_json, + R"({ + "name": "Steve Rogers" + })"_json, + }; + + for (auto const &json: documents) { + auto add_op = collection_create_op.get()->add(json.dump()); + ASSERT_TRUE(add_op.ok()); + } + + filter_tree_root = nullptr; + filter_op = filter::parse_filter_query("name:= S*", coll->get_schema(), store, doc_id_prefix, + filter_tree_root); + ASSERT_TRUE(filter_op.ok()); + + auto iter_exact_prefix_test = filter_result_iterator_t(coll->get_name(), coll->_get_index(), filter_tree_root); + ASSERT_TRUE(iter_exact_prefix_test.init_status().ok()); + ASSERT_FALSE(iter_exact_prefix_test._get_is_filter_result_initialized()); + + std::vector validate_ids = {0, 1, 2, 3, 4, 5}; + std::vector seq_ids = {2, 2, 3, 4, 4, 4}; + std::vector equals_match_seq_ids = {0, 2, 2, 3, 4, 4}; + std::vector equals_iterator_valid = {true, true, true, true, true, false}; + expected = {1, 0, 1, 1, 1, -1}; + for (uint32_t i = 0; i < validate_ids.size(); i++) { + if (i < 5) { + ASSERT_EQ(filter_result_iterator_t::valid, iter_exact_prefix_test.validity); + } else { + ASSERT_EQ(filter_result_iterator_t::invalid, iter_exact_prefix_test.validity); + } + ASSERT_EQ(expected[i], iter_exact_prefix_test.is_valid(validate_ids[i])); + ASSERT_EQ(equals_match_seq_ids[i], iter_exact_prefix_test._get_equals_iterator_id()); + ASSERT_EQ(equals_iterator_valid[i], iter_exact_prefix_test._get_is_equals_iterator_valid()); + + if (expected[i] == 1) { + iter_exact_prefix_test.next(); + } + ASSERT_EQ(seq_ids[i], iter_exact_prefix_test.seq_id); + } + ASSERT_EQ(filter_result_iterator_t::invalid, iter_exact_prefix_test.validity); + + delete filter_tree_root; + filter_tree_root = nullptr; + filter_op = filter::parse_filter_query("name: S*", coll->get_schema(), store, doc_id_prefix, + filter_tree_root); + ASSERT_TRUE(filter_op.ok()); + + auto iter_contains_prefix_test = filter_result_iterator_t(coll->get_name(), 
+    ASSERT_TRUE(iter_contains_prefix_test.init_status().ok());
+    ASSERT_FALSE(iter_contains_prefix_test._get_is_filter_result_initialized());
+
+    validate_ids = {0, 1, 2, 3, 4, 5};
+    seq_ids = {1, 2, 3, 4, 4, 4};
+    equals_match_seq_ids = {0, 1, 2, 3, 4, 4};
+    equals_iterator_valid = {true, true, true, true, true, false};
+    expected = {1, 1, 1, 1, 1, -1};
+    for (uint32_t i = 0; i < validate_ids.size(); i++) {
+        if (i < 5) {
+            ASSERT_EQ(filter_result_iterator_t::valid, iter_contains_prefix_test.validity);
+        } else {
+            ASSERT_EQ(filter_result_iterator_t::invalid, iter_contains_prefix_test.validity);
+        }
+        ASSERT_EQ(expected[i], iter_contains_prefix_test.is_valid(validate_ids[i]));
+        ASSERT_EQ(equals_match_seq_ids[i], iter_contains_prefix_test._get_equals_iterator_id());
+        ASSERT_EQ(equals_iterator_valid[i], iter_contains_prefix_test._get_is_equals_iterator_valid());
+
+        if (expected[i] == 1) {
+            iter_contains_prefix_test.next();
+        }
+        ASSERT_EQ(seq_ids[i], iter_contains_prefix_test.seq_id);
+    }
+    ASSERT_EQ(filter_result_iterator_t::invalid, iter_contains_prefix_test.validity);
+
+    delete filter_tree_root;
+    filter_tree_root = nullptr;
+    filter_op = filter::parse_filter_query("name:= Steve R*", coll->get_schema(), store, doc_id_prefix,
+                                           filter_tree_root);
+    ASSERT_TRUE(filter_op.ok());
+
+    auto computed_exact_prefix_test_2 = filter_result_iterator_t(coll->get_name(), coll->_get_index(), filter_tree_root);
+    ASSERT_TRUE(computed_exact_prefix_test_2.init_status().ok());
+    ASSERT_TRUE(computed_exact_prefix_test_2._get_is_filter_result_initialized());
+
+    expected = {2, 4};
+    for (auto const& i : expected) {
+        ASSERT_EQ(filter_result_iterator_t::valid, computed_exact_prefix_test_2.validity);
+        ASSERT_EQ(i, computed_exact_prefix_test_2.seq_id);
+        computed_exact_prefix_test_2.next();
+    }
+    ASSERT_EQ(filter_result_iterator_t::invalid, computed_exact_prefix_test_2.validity);
+
+    delete filter_tree_root;
+    filter_tree_root = nullptr;
+    filter_op = filter::parse_filter_query("name: Steve R*", coll->get_schema(), store, doc_id_prefix,
+                                           filter_tree_root);
+    ASSERT_TRUE(filter_op.ok());
+
+    auto computed_contains_prefix_test_2 = filter_result_iterator_t(coll->get_name(), coll->_get_index(), filter_tree_root);
+    ASSERT_TRUE(computed_contains_prefix_test_2.init_status().ok());
+    ASSERT_TRUE(computed_contains_prefix_test_2._get_is_filter_result_initialized());
+
+    expected = {2, 4};
+    for (auto const& i : expected) {
+        ASSERT_EQ(filter_result_iterator_t::valid, computed_contains_prefix_test_2.validity);
+        ASSERT_EQ(i, computed_contains_prefix_test_2.seq_id);
+        computed_contains_prefix_test_2.next();
+    }
+    ASSERT_EQ(filter_result_iterator_t::invalid, computed_contains_prefix_test_2.validity);
+
+    delete filter_tree_root;
+
+    documents = {
+            R"({
+                "name": "Steve Runner foo"
+            })"_json,
+            R"({
+                "name": "foo Steve Runner"
+            })"_json,
+    };
+
+    for (auto const &json: documents) {
+        auto add_op = collection_create_op.get()->add(json.dump());
+        ASSERT_TRUE(add_op.ok());
+    }
+
+    filter_tree_root = nullptr;
+    filter_op = filter::parse_filter_query("name:= Steve R*", coll->get_schema(), store, doc_id_prefix,
+                                           filter_tree_root);
+    ASSERT_TRUE(filter_op.ok());
+
+    auto iter_exact_prefix_test_2 = filter_result_iterator_t(coll->get_name(), coll->_get_index(), filter_tree_root);
+    ASSERT_TRUE(iter_exact_prefix_test_2.init_status().ok());
+    ASSERT_FALSE(iter_exact_prefix_test_2._get_is_filter_result_initialized());
+
+    validate_ids = {0, 1, 2, 3, 4, 5, 6, 7};
+    seq_ids = {2, 2, 4, 4, 5, 5, 5, 5};
+    equals_match_seq_ids = {2, 2, 2, 4, 4, 5, 5, 5};
+    equals_iterator_valid = {true, true, true, true, true, true, false, false};
+    expected = {0, 0, 1, 0, 1, 1, -1, -1};
+    for (uint32_t i = 0; i < validate_ids.size(); i++) {
+        if (i < 6) {
+            ASSERT_EQ(filter_result_iterator_t::valid, iter_exact_prefix_test_2.validity);
+        } else {
+            ASSERT_EQ(filter_result_iterator_t::invalid, iter_exact_prefix_test_2.validity);
+        }
+        ASSERT_EQ(expected[i], iter_exact_prefix_test_2.is_valid(validate_ids[i]));
+        ASSERT_EQ(equals_match_seq_ids[i], iter_exact_prefix_test_2._get_equals_iterator_id());
+        ASSERT_EQ(equals_iterator_valid[i], iter_exact_prefix_test_2._get_is_equals_iterator_valid());
+
+        if (expected[i] == 1) {
+            iter_exact_prefix_test_2.next();
+        }
+        ASSERT_EQ(seq_ids[i], iter_exact_prefix_test_2.seq_id);
+    }
+    ASSERT_EQ(filter_result_iterator_t::invalid, iter_exact_prefix_test_2.validity);
+
+    delete filter_tree_root;
+    filter_tree_root = nullptr;
+    filter_op = filter::parse_filter_query("name: Steve R*", coll->get_schema(), store, doc_id_prefix,
+                                           filter_tree_root);
+    ASSERT_TRUE(filter_op.ok());
+
+    auto iter_contains_prefix_test_2 = filter_result_iterator_t(coll->get_name(), coll->_get_index(), filter_tree_root);
+    ASSERT_TRUE(iter_contains_prefix_test_2.init_status().ok());
+    ASSERT_FALSE(iter_contains_prefix_test_2._get_is_filter_result_initialized());
+
+    validate_ids = {0, 1, 2, 3, 4, 5, 6, 7};
+    seq_ids = {2, 2, 4, 4, 5, 6, 6, 6};
+    equals_match_seq_ids = {2, 2, 2, 4, 4, 5, 6, 6};
+    equals_iterator_valid = {true, true, true, true, true, true, true, false};
+    expected = {0, 0, 1, 0, 1, 1, 1, -1};
+    for (uint32_t i = 0; i < validate_ids.size(); i++) {
+        if (i < 7) {
+            ASSERT_EQ(filter_result_iterator_t::valid, iter_contains_prefix_test_2.validity);
+        } else {
+            ASSERT_EQ(filter_result_iterator_t::invalid, iter_contains_prefix_test_2.validity);
+        }
+        ASSERT_EQ(expected[i], iter_contains_prefix_test_2.is_valid(validate_ids[i]));
+        ASSERT_EQ(equals_match_seq_ids[i], iter_contains_prefix_test_2._get_equals_iterator_id());
+        ASSERT_EQ(equals_iterator_valid[i], iter_contains_prefix_test_2._get_is_equals_iterator_valid());
+
+        if (expected[i] == 1) {
+            iter_contains_prefix_test_2.next();
+        }
+        ASSERT_EQ(seq_ids[i], iter_contains_prefix_test_2.seq_id);
+    }
+    ASSERT_EQ(filter_result_iterator_t::invalid, iter_contains_prefix_test_2.validity);
+
+    delete filter_tree_root;
+}
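
A note on the matching semantics, followed by a small illustrative sketch (not part of the patch): for a multi-token exact-prefix filter such as `name:= Steve R*`, token j of the filter value has to occur at 1-based position j + 1 of the field value, i.e. the filter tokens must form a leading run of the value's tokens. Expansion of the trailing `R*` token against indexed tokens happens earlier, via the ART index, before offsets are inspected. The sketch models only this positional check for a non-array field; all names in it are illustrative.

// Standalone sketch of the positional check behind has_prefix_match() /
// get_prefix_matches() for a non-array field. Input: one sorted list of 1-based
// token positions per filter token, as posting list offsets would yield them
// for a single document. Illustrative only; not part of the patch.
#include <cstdint>
#include <cstdio>
#include <vector>

static bool tokens_align_as_value_prefix(const std::vector<std::vector<uint32_t>>& token_positions) {
    for (size_t j = 0; j < token_positions.size(); j++) {
        bool found = false;
        for (uint32_t pos : token_positions[j]) {
            if (pos == j + 1) { found = true; break; } // token sits at its expected position
            if (pos > j + 1) { break; }                // positions are sorted; no match possible
        }
        if (!found) {
            return false;
        }
    }
    return true;
}

int main() {
    // "Steve Rogers" vs `name:= Steve R*`: "steve" at position 1, "rogers" at 2 -> match.
    std::printf("%d\n", tokens_align_as_value_prefix({{1}, {2}}));
    // "foo Steve Runner" vs the same filter: "steve" at 2, "runner" at 3. The tokens are
    // present (so the contains form `name: Steve R*` matches), but the value does not
    // start with them, so the exact form `:=` rejects it.
    std::printf("%d\n", tokens_align_as_value_prefix({{2}, {3}}));
    return 0;
}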