Exact prefix value filter. (#1763)

* Exact prefix match on `string` field.

* Exact prefix match on `string[]` field. (See the illustration below.)
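
For illustration, with values from the tests below: `name:= Steve R*` matches
"Steve Rogers" and "Steve Runner foo" but not "foo Steve Runner", since the
field value itself must begin with `Steve R`. The pre-existing `name: Steve R*`
form also matches "foo Steve Runner" (word-level prefix match).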

---------

Co-authored-by: Kishore Nallan <kishorenc@gmail.com>
Harpreet Sangar, 2024-06-07 14:57:34 +05:30, committed by GitHub
parent 25762a7c69
commit 4fee4dc286
6 changed files with 983 additions and 26 deletions

include/filter_result_iterator.h

@@ -280,6 +280,10 @@ private:
/// Sample filter: [>10, !15].
std::unordered_set<uint32_t> numerical_not_iterator_index;
/// String filter can specify prefix value match.
/// Sample filter: [Chris P*].
std::unordered_set<uint32_t> string_prefix_filter_index;
bool delete_filter_node = false;
std::unique_ptr<filter_result_iterator_timeout_info> timeout_info;
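
For orientation, a minimal sketch (condensed from the `init()` hunk below) of how
this set is populated; the real code records the `posting_lists` position of each
value, which can differ from the raw value index when values are skipped:

// Sketch: remember which filter values requested prefix matching.
for (uint32_t i = 0; i < a_filter.values.size(); i++) {
    const auto& value = a_filter.values[i];
    bool is_prefix_match = value.size() > 1 && value.back() == '*';
    if (is_prefix_match) {
        string_prefix_filter_index.insert(i); // simplified: raw value index
    }
}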

include/posting_list.h

@@ -203,10 +203,19 @@ public:
static bool is_single_token_verbatim_match(const posting_list_t::iterator_t& it, bool field_is_array);
static bool is_single_token_prefix_match(const posting_list_t::iterator_t& it, bool field_is_array);
static void get_prefix_matches(std::vector<iterator_t>& its, const bool field_is_array,
const uint32_t* ids, const uint32_t num_ids,
uint32_t*& prefix_ids, size_t& num_prefix_ids);
static void get_exact_matches(std::vector<iterator_t>& its, bool field_is_array,
const uint32_t* ids, const uint32_t num_ids,
uint32_t*& exact_ids, size_t& num_exact_ids);
static bool has_prefix_match(std::vector<posting_list_t::iterator_t>& posting_list_iterators,
const bool field_is_array);
static bool has_exact_match(std::vector<posting_list_t::iterator_t>& posting_list_iterators,
const bool field_is_array);
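
A usage sketch mirroring the `compute_iterators()` call site further down
(identifiers hypothetical): intersect the token posting lists first, then keep
only candidate ids whose field value prefix-matches every token.

std::vector<uint32_t> candidate_ids;
posting_list_t::intersect(p_list, candidate_ids); // ids containing all tokens
uint32_t* prefix_ids = new uint32_t[candidate_ids.size()];
size_t num_prefix_ids = 0;
std::unique_ptr<uint32_t[]> guard(prefix_ids); // caller owns the out-buffer
posting_list_t::get_prefix_matches(token_iterators, field_is_array,
                                   candidate_ids.data(), candidate_ids.size(),
                                   prefix_ids, num_prefix_ids);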

src/filter_result_iterator.cpp

@@ -355,7 +355,7 @@ void filter_result_iterator_t::get_string_filter_next_match(const bool& field_is
uint32_t lowest_id = UINT32_MAX;
if (filter_node->filter_exp.comparators[0] == EQUALS || filter_node->filter_exp.comparators[0] == NOT_EQUALS) {
bool exact_match_found = false;
bool match_found = false;
switch (posting_list_iterators.size()) {
case 1:
while(true) {
@@ -366,28 +366,32 @@ void filter_result_iterator_t::get_string_filter_next_match(const bool& field_is
break;
}
if (posting_list_t::has_exact_match(posting_list_iterators[0], field_is_array)) {
exact_match_found = true;
break;
} else {
// Keep advancing token iterators until an exact match is found.
for (auto& iter: posting_list_iterators[0]) {
if (!iter.valid()) {
break;
}
match_found = string_prefix_filter_index.count(0) == 0 ?
posting_list_t::has_exact_match(posting_list_iterators[0], field_is_array) :
posting_list_t::has_prefix_match(posting_list_iterators[0], field_is_array);
iter.next();
if (match_found) {
break;
}
// Keep advancing token iterators until a match is found.
for (auto& iter: posting_list_iterators[0]) {
if (!iter.valid()) {
break;
}
iter.next();
}
}
if (one_is_valid && exact_match_found) {
if (one_is_valid && match_found) {
lowest_id = posting_list_iterators[0][0].id();
}
break;
default :
for (auto& filter_value_tokens : posting_list_iterators) {
for (uint32_t i = 0; i < posting_list_iterators.size(); i++) {
auto& filter_value_tokens = posting_list_iterators[i];
bool tokens_iter_is_valid;
while(true) {
// Perform AND between tokens of a filter value.
@@ -397,24 +401,27 @@ void filter_result_iterator_t::get_string_filter_next_match(const bool& field_is
break;
}
if (posting_list_t::has_exact_match(filter_value_tokens, field_is_array)) {
exact_match_found = true;
break;
} else {
// Keep advancing token iterators until an exact match is found.
for (auto &iter: filter_value_tokens) {
if (!iter.valid()) {
break;
}
match_found = string_prefix_filter_index.count(i) == 0 ?
posting_list_t::has_exact_match(filter_value_tokens, field_is_array) :
posting_list_t::has_prefix_match(filter_value_tokens, field_is_array);
iter.next();
if (match_found) {
break;
}
// Keep advancing token iterators until a match is found.
for (auto &iter: filter_value_tokens) {
if (!iter.valid()) {
break;
}
iter.next();
}
}
one_is_valid = tokens_iter_is_valid || one_is_valid;
if (tokens_iter_is_valid && exact_match_found && filter_value_tokens[0].id() < lowest_id) {
if (tokens_iter_is_valid && match_found && filter_value_tokens[0].id() < lowest_id) {
lowest_id = filter_value_tokens[0].id();
}
}
@@ -1360,7 +1367,8 @@ void filter_result_iterator_t::init() {
} else if (f.is_string()) {
art_tree* t = index->search_index.at(a_filter.field_name);
for (std::string filter_value : a_filter.values) {
for (uint32_t i = 0; i < a_filter.values.size(); i++) {
auto filter_value = a_filter.values[i];
auto is_prefix_match = filter_value.size() > 1 && filter_value[filter_value.size() - 1] == '*';
if (is_prefix_match) {
filter_value.erase(filter_value.size() - 1);
@@ -1469,6 +1477,7 @@ void filter_result_iterator_t::init() {
continue;
}
string_prefix_filter_index.insert(posting_lists.size());
posting_lists.push_back(plists);
posting_list_iterators.emplace_back(std::vector<posting_list_t::iterator_t>());
for (auto const& plist: plists) {
@@ -2485,7 +2494,33 @@ void filter_result_iterator_t::compute_iterators() {
for (uint32_t i = 0; i < posting_lists.size(); i++) {
auto& p_list = posting_lists[i];
if (a_filter.comparators[0] == EQUALS || a_filter.comparators[0] == NOT_EQUALS) {
if (string_prefix_filter_index.count(i) != 0 &&
(a_filter.comparators[0] == EQUALS || a_filter.comparators[0] == NOT_EQUALS)) {
// Exact prefix match, needs intersection + prefix matching
std::vector<uint32_t> result_id_vec;
posting_list_t::intersect(p_list, result_id_vec);
if (result_id_vec.empty()) {
continue;
}
// Need to prefix-match the intersected ids.
uint32_t* prefix_str_ids = new uint32_t[result_id_vec.size()];
size_t prefix_str_ids_size = 0;
std::unique_ptr<uint32_t[]> prefix_str_ids_guard(prefix_str_ids);
posting_list_t::get_prefix_matches(posting_list_iterators[i], f.is_array(),
result_id_vec.data(), result_id_vec.size(),
prefix_str_ids, prefix_str_ids_size);
if (prefix_str_ids_size == 0) {
continue;
}
for (size_t pi = 0; pi < prefix_str_ids_size; pi++) {
f_id_buff.push_back(prefix_str_ids[pi]);
}
} else if (a_filter.comparators[0] == EQUALS || a_filter.comparators[0] == NOT_EQUALS) {
// needs intersection + exact matching (unlike CONTAINS)
std::vector<uint32_t> result_id_vec;
posting_list_t::intersect(p_list, result_id_vec);

src/posting_list.cpp

@@ -1111,6 +1111,173 @@ bool posting_list_t::contains_atleast_one(const uint32_t* target_ids, size_t tar
return false;
}
bool posting_list_t::is_single_token_prefix_match(const posting_list_t::iterator_t& it, bool field_is_array) {
block_t* curr_block = it.block();
uint32_t curr_index = it.index();
if (curr_block == nullptr || curr_index == UINT32_MAX) {
return false;
}
uint32_t* offsets = it.offsets;
uint32_t start_offset = it.offset_index[curr_index];
// If the field value starts with the token, it's a match.
return offsets[start_offset] == 1;
}
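A hedged reading of the check above, with offsets illustrated from the test
documents further down:

// Offsets store 1-based token positions within a field value, e.g.
//   "Steve Jobs"  -> steve at offset 1, jobs at offset 2
//   "Adam Stator" -> adam at offset 1,  stator at offset 2
// For `name:= S*`, the ART tree yields iterators for `steve` and `stator`;
// offsets[start_offset] == 1 holds only for "Steve Jobs", whose value
// actually begins with the matched token.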
void posting_list_t::get_prefix_matches(std::vector<iterator_t>& its, const bool field_is_array,
const uint32_t* ids, const uint32_t num_ids,
uint32_t*& prefix_ids, size_t& num_prefix_ids) {
size_t prefix_id_index = 0;
if (its.size() == 1) {
for (size_t i = 0; i < num_ids; i++) {
auto const& id = ids[i];
its[0].skip_to(id);
if (is_single_token_prefix_match(its[0], field_is_array)) {
prefix_ids[prefix_id_index++] = id;
}
}
} else {
if (!field_is_array) {
for (size_t i = 0; i < num_ids; i++) {
uint32_t id = ids[i];
bool is_match = true;
for (int j = its.size()-1; j >= 0; j--) {
posting_list_t::iterator_t& it = its[j];
it.skip_to(id);
block_t* curr_block = it.block();
uint32_t curr_index = it.index();
if (curr_block == nullptr || curr_index == UINT32_MAX) {
is_match = false;
break;
}
uint32_t* offsets = it.offsets;
uint32_t start_offset_index = it.offset_index[curr_index];
uint32_t end_offset_index = (curr_index == curr_block->size() - 1) ?
curr_block->offsets.getLength() :
it.offset_index[curr_index + 1];
// looping handles duplicate query tokens, e.g. "hip hip hurray hurray"
while (start_offset_index < end_offset_index) {
uint32_t offset = offsets[start_offset_index];
start_offset_index++;
if (offset == (j + 1)) {
// we have found a matching index, no need to look further for this token
is_match = true;
break;
}
if (offset > (j + 1)) {
is_match = false;
break;
}
}
if (!is_match) {
break;
}
}
if (is_match) {
prefix_ids[prefix_id_index++] = id;
}
}
}
else {
// field is an array
struct token_index_meta_t {
std::bitset<32> token_index;
};
for (size_t i = 0; i < num_ids; i++) {
uint32_t id = ids[i];
std::map<size_t, token_index_meta_t> array_index_to_token_index;
bool premature_exit = false;
for (int j = its.size()-1; j >= 0; j--) {
posting_list_t::iterator_t& it = its[j];
it.skip_to(id);
block_t* curr_block = it.block();
uint32_t curr_index = it.index();
if (curr_block == nullptr || curr_index == UINT32_MAX) {
premature_exit = true;
break;
}
uint32_t* offsets = it.offsets;
uint32_t start_offset_index = it.offset_index[curr_index];
uint32_t end_offset_index = (curr_index == curr_block->size() - 1) ?
curr_block->offsets.getLength() :
it.offset_index[curr_index + 1];
int prev_pos = -1;
bool found_matching_index = false;
size_t num_matching_index = 0;
while (start_offset_index < end_offset_index) {
int pos = offsets[start_offset_index];
start_offset_index++;
if (pos == prev_pos) { // indicates end of array index
size_t array_index = (size_t) offsets[start_offset_index];
if (found_matching_index) {
array_index_to_token_index[array_index].token_index.set(j+1);
}
start_offset_index++; // skip current value which is the array index or flag for last index
prev_pos = -1;
found_matching_index = false;
continue;
}
if (pos == (j + 1)) {
// we have found a matching index
found_matching_index = true;
num_matching_index++;
}
prev_pos = pos;
}
if (num_matching_index == 0) {
// not even a single matching index found: can never be a prefix match
premature_exit = true;
break;
}
}
if (!premature_exit) {
// iterate array index to token index to check if at least 1 array position contains all tokens
for (auto& kv: array_index_to_token_index) {
if (kv.second.token_index.count() == its.size()) {
prefix_ids[prefix_id_index++] = id;
break;
}
}
}
}
}
}
num_prefix_ids = prefix_id_index;
}
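To make the non-array branch concrete, a hedged trace using test values below
(filter `names:= Steve R*`, token j=0 is `steve`, token j=1 a prefix expansion
of `r*` such as `runner`):

// "Steve Runner foo": steve at 1 (j=0 wants offset 1), runner at 2
//                     (j=1 wants offset 2) -> prefix match.
// "foo Steve Runner": steve first appears at offset 2 > j+1 = 1, so the
//                     id is rejected without checking further tokens.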
void posting_list_t::get_exact_matches(std::vector<iterator_t>& its, const bool field_is_array,
const uint32_t* ids, const uint32_t num_ids,
uint32_t*& exact_ids, size_t& num_exact_ids) {
@@ -1292,6 +1459,123 @@ void posting_list_t::get_exact_matches(std::vector<iterator_t>& its, const bool
num_exact_ids = exact_id_index;
}
bool posting_list_t::has_prefix_match(std::vector<posting_list_t::iterator_t>& posting_list_iterators,
const bool field_is_array) {
if (posting_list_iterators.empty()) {
return false;
}
if (posting_list_iterators.size() == 1) {
return is_single_token_prefix_match(posting_list_iterators[0], field_is_array);
}
if (!field_is_array) {
for (uint32_t i = 0; i < posting_list_iterators.size(); i++) {
posting_list_t::iterator_t& it = posting_list_iterators[i];
block_t* curr_block = it.block();
uint32_t curr_index = it.index();
if (curr_block == nullptr || curr_index == UINT32_MAX) {
return false;
}
uint32_t* offsets = it.offsets;
uint32_t start_offset_index = it.offset_index[curr_index];
uint32_t end_offset_index = (curr_index == curr_block->size() - 1) ?
curr_block->offsets.getLength() :
it.offset_index[curr_index + 1];
// looping handles duplicate query tokens, e.g. "hip hip hurray hurray"
while (start_offset_index < end_offset_index) {
uint32_t offset = offsets[start_offset_index];
start_offset_index++;
if (offset == (i + 1)) {
// we have found a matching index, no need to look further for this token.
break;
}
if (offset > (i + 1)) {
return false;
}
}
}
}
else {
// field is an array
struct token_index_meta_t {
std::bitset<32> token_index;
};
std::map<size_t, token_index_meta_t> array_index_to_token_index;
for (int i = posting_list_iterators.size() - 1; i >= 0; i--) {
posting_list_t::iterator_t& it = posting_list_iterators[i];
block_t* curr_block = it.block();
uint32_t curr_index = it.index();
if (curr_block == nullptr || curr_index == UINT32_MAX) {
return false;
}
uint32_t* offsets = it.offsets;
uint32_t start_offset_index = it.offset_index[curr_index];
uint32_t end_offset_index = (curr_index == curr_block->size() - 1) ?
curr_block->offsets.getLength() :
it.offset_index[curr_index + 1];
int prev_pos = -1;
bool found_matching_index = false;
size_t num_matching_index = 0;
while (start_offset_index < end_offset_index) {
int pos = offsets[start_offset_index];
start_offset_index++;
if (pos == prev_pos) { // indicates end of array index
size_t array_index = (size_t) offsets[start_offset_index];
if (found_matching_index) {
array_index_to_token_index[array_index].token_index.set(i + 1);
}
start_offset_index++; // skip current value which is the array index or flag for last index
prev_pos = -1;
found_matching_index = false;
continue;
}
if (pos == (i + 1)) {
// we have found a matching index
found_matching_index = true;
num_matching_index++;
}
prev_pos = pos;
}
if (num_matching_index == 0) {
// not even a single matching index found: can never be a prefix match
return false;
}
}
// iterate array index to token index to check if at least 1 array position contains all tokens
for (auto& kv: array_index_to_token_index) {
if (kv.second.token_index.count() == posting_list_iterators.size()) {
return true;
}
}
}
return true;
}
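A hedged reading of the offset stream that the array branch above decodes
(encoding inferred from the loop): positions of one array element are listed in
order, a repeated position marks the element's end, and the next value is that
element's array index.

// Illustrative stream for token `steve` in names ["Adam", "Steve Rogers"]:
//   offsets = [1, 1, 1]
//              |  |  '-- array index 1 ("Steve Rogers")
//              |  '----- repeated position: end-of-element marker
//              '-------- steve at position 1 within that element
// Each token records the array indices where it sits at offset (i + 1); an id
// passes only if a single array index accumulates all tokens.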
bool posting_list_t::has_exact_match(std::vector<posting_list_t::iterator_t>& posting_list_iterators,
const bool field_is_array) {
if(posting_list_iterators.size() == 1) {

test/collection_filtering_test.cpp

@@ -2563,6 +2563,368 @@ TEST_F(CollectionFilteringTest, PrefixFilterOnTextFields) {
std::string id = ids.at(i);
ASSERT_EQ(id, result_id);
}
auto schema_json =
R"({
"name": "Names",
"fields": [
{"name": "name", "type": "string", "optional": true},
{"name": "names", "type": "string[]", "optional": true}
]
})"_json;
std::vector<nlohmann::json> documents = {
R"({
"name": "Steve Jobs"
})"_json,
R"({
"name": "Adam Stator"
})"_json,
};
auto collection_create_op = collectionManager.create_collection(schema_json);
ASSERT_TRUE(collection_create_op.ok());
for (auto const &json: documents) {
auto add_op = collection_create_op.get()->add(json.dump());
ASSERT_TRUE(add_op.ok());
}
std::map<std::string, std::string> req_params = {
{"collection", "Names"},
{"q", "*"},
{"query_by", "name"},
{"filter_by", "name:= S*"}
};
nlohmann::json embedded_params;
std::string json_res;
auto now_ts = std::chrono::duration_cast<std::chrono::microseconds>(
std::chrono::system_clock::now().time_since_epoch()).count();
auto search_op = collectionManager.do_search(req_params, embedded_params, json_res, now_ts);
ASSERT_TRUE(search_op.ok());
auto res_obj = nlohmann::json::parse(json_res);
ASSERT_EQ(1, res_obj["found"].get<size_t>());
ASSERT_EQ(1, res_obj["hits"].size());
ASSERT_EQ("Steve Jobs", res_obj["hits"][0]["document"].at("name"));
req_params = {
{"collection", "Names"},
{"q", "*"},
{"query_by", "name"},
{"filter_by", "name: S*"}
};
search_op = collectionManager.do_search(req_params, embedded_params, json_res, now_ts);
ASSERT_TRUE(search_op.ok());
res_obj = nlohmann::json::parse(json_res);
ASSERT_EQ(2, res_obj["found"].get<size_t>());
ASSERT_EQ(2, res_obj["hits"].size());
ASSERT_EQ("Adam Stator", res_obj["hits"][0]["document"].at("name"));
ASSERT_EQ("Steve Jobs", res_obj["hits"][1]["document"].at("name"));
documents = {
R"({
"name": "Steve Reiley"
})"_json,
R"({
"name": "Storm"
})"_json,
R"({
"name": "Steve Rogers"
})"_json,
};
for (auto const &json: documents) {
auto add_op = collection_create_op.get()->add(json.dump());
ASSERT_TRUE(add_op.ok());
}
req_params = {
{"collection", "Names"},
{"q", "s"},
{"query_by", "name"},
{"filter_by", "name:= St*"}
};
search_op = collectionManager.do_search(req_params, embedded_params, json_res, now_ts);
ASSERT_TRUE(search_op.ok());
res_obj = nlohmann::json::parse(json_res);
ASSERT_EQ(4, res_obj["found"].get<size_t>());
ASSERT_EQ(4, res_obj["hits"].size());
ASSERT_EQ("Steve Rogers", res_obj["hits"][0]["document"].at("name"));
ASSERT_EQ("Storm", res_obj["hits"][1]["document"].at("name"));
ASSERT_EQ("Steve Reiley", res_obj["hits"][2]["document"].at("name"));
ASSERT_EQ("Steve Jobs", res_obj["hits"][3]["document"].at("name"));
req_params = {
{"collection", "Names"},
{"q", "s"},
{"query_by", "name"},
{"filter_by", "name: St*"}
};
search_op = collectionManager.do_search(req_params, embedded_params, json_res, now_ts);
ASSERT_TRUE(search_op.ok());
res_obj = nlohmann::json::parse(json_res);
ASSERT_EQ(5, res_obj["found"].get<size_t>());
ASSERT_EQ(5, res_obj["hits"].size());
ASSERT_EQ("Steve Rogers", res_obj["hits"][0]["document"].at("name"));
ASSERT_EQ("Storm", res_obj["hits"][1]["document"].at("name"));
ASSERT_EQ("Steve Reiley", res_obj["hits"][2]["document"].at("name"));
ASSERT_EQ("Adam Stator", res_obj["hits"][3]["document"].at("name"));
ASSERT_EQ("Steve Jobs", res_obj["hits"][4]["document"].at("name"));
req_params = {
{"collection", "Names"},
{"q", "s"},
{"query_by", "name"},
{"filter_by", "name:= Steve R*"}
};
search_op = collectionManager.do_search(req_params, embedded_params, json_res, now_ts);
ASSERT_TRUE(search_op.ok());
res_obj = nlohmann::json::parse(json_res);
ASSERT_EQ(2, res_obj["found"].get<size_t>());
ASSERT_EQ(2, res_obj["hits"].size());
ASSERT_EQ("Steve Rogers", res_obj["hits"][0]["document"].at("name"));
ASSERT_EQ("Steve Reiley", res_obj["hits"][1]["document"].at("name"));
req_params = {
{"collection", "Names"},
{"q", "s"},
{"query_by", "name"},
{"filter_by", "name: Steve R*"}
};
search_op = collectionManager.do_search(req_params, embedded_params, json_res, now_ts);
ASSERT_TRUE(search_op.ok());
res_obj = nlohmann::json::parse(json_res);
ASSERT_EQ(2, res_obj["found"].get<size_t>());
ASSERT_EQ(2, res_obj["hits"].size());
ASSERT_EQ("Steve Rogers", res_obj["hits"][0]["document"].at("name"));
ASSERT_EQ("Steve Reiley", res_obj["hits"][1]["document"].at("name"));
documents = {
R"({
"names": []
})"_json,
R"({
"names": ["Steve Jobs"]
})"_json,
R"({
"names": ["Adam Stator"]
})"_json
};
for (auto const &json: documents) {
auto add_op = collection_create_op.get()->add(json.dump());
ASSERT_TRUE(add_op.ok());
}
req_params = {
{"collection", "Names"},
{"q", "s"},
{"query_by", "names"},
{"filter_by", "names:= St*"}
};
search_op = collectionManager.do_search(req_params, embedded_params, json_res, now_ts);
ASSERT_TRUE(search_op.ok());
res_obj = nlohmann::json::parse(json_res);
ASSERT_EQ(1, res_obj["found"].get<size_t>());
ASSERT_EQ(1, res_obj["hits"].size());
ASSERT_EQ("Steve Jobs", res_obj["hits"][0]["document"]["names"][0]);
req_params = {
{"collection", "Names"},
{"q", "s"},
{"query_by", "names"},
{"filter_by", "names: St*"}
};
search_op = collectionManager.do_search(req_params, embedded_params, json_res, now_ts);
ASSERT_TRUE(search_op.ok());
res_obj = nlohmann::json::parse(json_res);
ASSERT_EQ(2, res_obj["found"].get<size_t>());
ASSERT_EQ(2, res_obj["hits"].size());
ASSERT_EQ("Adam Stator", res_obj["hits"][0]["document"]["names"][0]);
ASSERT_EQ("Steve Jobs", res_obj["hits"][1]["document"]["names"][0]);
documents = {
R"({
"names": ["Steve Reiley"]
})"_json,
R"({
"names": ["Storm"]
})"_json,
R"({
"names": ["Adam", "Steve Rogers"]
})"_json,
};
for (auto const &json: documents) {
auto add_op = collection_create_op.get()->add(json.dump());
ASSERT_TRUE(add_op.ok());
}
req_params = {
{"collection", "Names"},
{"q", "s"},
{"query_by", "names"},
{"filter_by", "names:= St*"}
};
search_op = collectionManager.do_search(req_params, embedded_params, json_res, now_ts);
ASSERT_TRUE(search_op.ok());
res_obj = nlohmann::json::parse(json_res);
ASSERT_EQ(4, res_obj["found"].get<size_t>());
ASSERT_EQ(4, res_obj["hits"].size());
ASSERT_EQ("Steve Rogers", res_obj["hits"][0]["document"]["names"][1]);
ASSERT_EQ("Storm", res_obj["hits"][1]["document"]["names"][0]);
ASSERT_EQ("Steve Reiley", res_obj["hits"][2]["document"]["names"][0]);
ASSERT_EQ("Steve Jobs", res_obj["hits"][3]["document"]["names"][0]);
req_params = {
{"collection", "Names"},
{"q", "s"},
{"query_by", "names"},
{"filter_by", "names: St*"}
};
search_op = collectionManager.do_search(req_params, embedded_params, json_res, now_ts);
ASSERT_TRUE(search_op.ok());
res_obj = nlohmann::json::parse(json_res);
ASSERT_EQ(5, res_obj["found"].get<size_t>());
ASSERT_EQ(5, res_obj["hits"].size());
ASSERT_EQ("Steve Rogers", res_obj["hits"][0]["document"]["names"][1]);
ASSERT_EQ("Storm", res_obj["hits"][1]["document"]["names"][0]);
ASSERT_EQ("Steve Reiley", res_obj["hits"][2]["document"]["names"][0]);
ASSERT_EQ("Adam Stator", res_obj["hits"][3]["document"]["names"][0]);
ASSERT_EQ("Steve Jobs", res_obj["hits"][4]["document"]["names"][0]);
req_params = {
{"collection", "Names"},
{"q", "s"},
{"query_by", "names"},
{"filter_by", "names:= Steve*"}
};
search_op = collectionManager.do_search(req_params, embedded_params, json_res, now_ts);
ASSERT_TRUE(search_op.ok());
res_obj = nlohmann::json::parse(json_res);
ASSERT_EQ(3, res_obj["found"].get<size_t>());
ASSERT_EQ(3, res_obj["hits"].size());
ASSERT_EQ("Steve Rogers", res_obj["hits"][0]["document"]["names"][1]);
ASSERT_EQ("Steve Reiley", res_obj["hits"][1]["document"]["names"][0]);
ASSERT_EQ("Steve Jobs", res_obj["hits"][2]["document"]["names"][0]);
req_params = {
{"collection", "Names"},
{"q", "s"},
{"query_by", "names"},
{"filter_by", "names: Steve*"}
};
search_op = collectionManager.do_search(req_params, embedded_params, json_res, now_ts);
ASSERT_TRUE(search_op.ok());
res_obj = nlohmann::json::parse(json_res);
ASSERT_EQ(3, res_obj["found"].get<size_t>());
ASSERT_EQ(3, res_obj["hits"].size());
ASSERT_EQ("Steve Rogers", res_obj["hits"][0]["document"]["names"][1]);
ASSERT_EQ("Steve Reiley", res_obj["hits"][1]["document"]["names"][0]);
ASSERT_EQ("Steve Jobs", res_obj["hits"][2]["document"]["names"][0]);
req_params = {
{"collection", "Names"},
{"q", "s"},
{"query_by", "names"},
{"filter_by", "names:= Steve R*"}
};
search_op = collectionManager.do_search(req_params, embedded_params, json_res, now_ts);
ASSERT_TRUE(search_op.ok());
res_obj = nlohmann::json::parse(json_res);
ASSERT_EQ(2, res_obj["found"].get<size_t>());
ASSERT_EQ(2, res_obj["hits"].size());
ASSERT_EQ("Steve Rogers", res_obj["hits"][0]["document"]["names"][1]);
ASSERT_EQ("Steve Reiley", res_obj["hits"][1]["document"]["names"][0]);
req_params = {
{"collection", "Names"},
{"q", "s"},
{"query_by", "names"},
{"filter_by", "names: Steve R*"}
};
search_op = collectionManager.do_search(req_params, embedded_params, json_res, now_ts);
ASSERT_TRUE(search_op.ok());
res_obj = nlohmann::json::parse(json_res);
ASSERT_EQ(2, res_obj["found"].get<size_t>());
ASSERT_EQ(2, res_obj["hits"].size());
ASSERT_EQ("Steve Rogers", res_obj["hits"][0]["document"]["names"][1]);
ASSERT_EQ("Steve Reiley", res_obj["hits"][1]["document"]["names"][0]);
documents = {
R"({
"names": ["Steve Runner foo"]
})"_json,
R"({
"names": ["foo Steve Runner"]
})"_json,
};
for (auto const &json: documents) {
auto add_op = collection_create_op.get()->add(json.dump());
ASSERT_TRUE(add_op.ok());
}
req_params = {
{"collection", "Names"},
{"q", "s"},
{"query_by", "names"},
{"filter_by", "names:= Steve R*"}
};
search_op = collectionManager.do_search(req_params, embedded_params, json_res, now_ts);
ASSERT_TRUE(search_op.ok());
res_obj = nlohmann::json::parse(json_res);
ASSERT_EQ(3, res_obj["found"].get<size_t>());
ASSERT_EQ(3, res_obj["hits"].size());
ASSERT_EQ("Steve Runner foo", res_obj["hits"][0]["document"]["names"][0]);
ASSERT_EQ("Steve Rogers", res_obj["hits"][1]["document"]["names"][1]);
ASSERT_EQ("Steve Reiley", res_obj["hits"][2]["document"]["names"][0]);
req_params = {
{"collection", "Names"},
{"q", "s"},
{"query_by", "names"},
{"filter_by", "names: Steve R*"}
};
search_op = collectionManager.do_search(req_params, embedded_params, json_res, now_ts);
ASSERT_TRUE(search_op.ok());
res_obj = nlohmann::json::parse(json_res);
ASSERT_EQ(4, res_obj["found"].get<size_t>());
ASSERT_EQ(4, res_obj["hits"].size());
ASSERT_EQ("foo Steve Runner", res_obj["hits"][0]["document"]["names"][0]);
ASSERT_EQ("Steve Runner foo", res_obj["hits"][1]["document"]["names"][0]);
ASSERT_EQ("Steve Rogers", res_obj["hits"][2]["document"]["names"][1]);
ASSERT_EQ("Steve Reiley", res_obj["hits"][3]["document"]["names"][0]);
}
TEST_F(CollectionFilteringTest, FilterOnStemmedField) {

test/filter_test.cpp

@@ -1503,3 +1503,266 @@ TEST_F(FilterTest, NumericFilterIterator) {
delete filter_tree_root;
}
TEST_F(FilterTest, PrefixStringFilter) {
auto schema_json =
R"({
"name": "Names",
"fields": [
{"name": "name", "type": "string"}
]
})"_json;
std::vector<nlohmann::json> documents = {
R"({
"name": "Steve Jobs"
})"_json,
R"({
"name": "Adam Stator"
})"_json,
};
auto collection_create_op = collectionManager.create_collection(schema_json);
ASSERT_TRUE(collection_create_op.ok());
Collection* coll = collection_create_op.get();
for (auto const &json: documents) {
auto add_op = coll->add(json.dump());
ASSERT_TRUE(add_op.ok());
}
const std::string doc_id_prefix = std::to_string(coll->get_collection_id()) + "_" + Collection::DOC_ID_PREFIX + "_";
filter_node_t* filter_tree_root = nullptr;
search_stop_us = UINT64_MAX; // `Index::fuzzy_search_fields` checks for timeout.
Option<bool> filter_op = filter::parse_filter_query("name:= S*", coll->get_schema(), store, doc_id_prefix,
filter_tree_root);
ASSERT_TRUE(filter_op.ok());
auto computed_exact_prefix_test = filter_result_iterator_t(coll->get_name(), coll->_get_index(), filter_tree_root);
ASSERT_TRUE(computed_exact_prefix_test.init_status().ok());
ASSERT_TRUE(computed_exact_prefix_test._get_is_filter_result_initialized());
std::vector<int> expected = {0};
for (auto const& i : expected) {
ASSERT_EQ(filter_result_iterator_t::valid, computed_exact_prefix_test.validity);
ASSERT_EQ(i, computed_exact_prefix_test.seq_id);
computed_exact_prefix_test.next();
}
ASSERT_EQ(filter_result_iterator_t::invalid, computed_exact_prefix_test.validity);
delete filter_tree_root;
filter_tree_root = nullptr;
filter_op = filter::parse_filter_query("name: S*", coll->get_schema(), store, doc_id_prefix,
filter_tree_root);
ASSERT_TRUE(filter_op.ok());
auto computed_contains_prefix_test = filter_result_iterator_t(coll->get_name(), coll->_get_index(), filter_tree_root);
ASSERT_TRUE(computed_contains_prefix_test.init_status().ok());
ASSERT_TRUE(computed_contains_prefix_test._get_is_filter_result_initialized());
expected = {0, 1};
for (auto const& i : expected) {
ASSERT_EQ(filter_result_iterator_t::valid, computed_contains_prefix_test.validity);
ASSERT_EQ(i, computed_contains_prefix_test.seq_id);
computed_contains_prefix_test.next();
}
ASSERT_EQ(filter_result_iterator_t::invalid, computed_contains_prefix_test.validity);
delete filter_tree_root;
documents = {
R"({
"name": "Steve Reiley"
})"_json,
R"({
"name": "Storm"
})"_json,
R"({
"name": "Steve Rogers"
})"_json,
};
for (auto const &json: documents) {
auto add_op = collection_create_op.get()->add(json.dump());
ASSERT_TRUE(add_op.ok());
}
filter_tree_root = nullptr;
filter_op = filter::parse_filter_query("name:= S*", coll->get_schema(), store, doc_id_prefix,
filter_tree_root);
ASSERT_TRUE(filter_op.ok());
auto iter_exact_prefix_test = filter_result_iterator_t(coll->get_name(), coll->_get_index(), filter_tree_root);
ASSERT_TRUE(iter_exact_prefix_test.init_status().ok());
ASSERT_FALSE(iter_exact_prefix_test._get_is_filter_result_initialized());
std::vector<uint32_t> validate_ids = {0, 1, 2, 3, 4, 5};
std::vector<uint32_t> seq_ids = {2, 2, 3, 4, 4, 4};
std::vector<uint32_t> equals_match_seq_ids = {0, 2, 2, 3, 4, 4};
std::vector<bool> equals_iterator_valid = {true, true, true, true, true, false};
expected = {1, 0, 1, 1, 1, -1};
for (uint32_t i = 0; i < validate_ids.size(); i++) {
if (i < 5) {
ASSERT_EQ(filter_result_iterator_t::valid, iter_exact_prefix_test.validity);
} else {
ASSERT_EQ(filter_result_iterator_t::invalid, iter_exact_prefix_test.validity);
}
ASSERT_EQ(expected[i], iter_exact_prefix_test.is_valid(validate_ids[i]));
ASSERT_EQ(equals_match_seq_ids[i], iter_exact_prefix_test._get_equals_iterator_id());
ASSERT_EQ(equals_iterator_valid[i], iter_exact_prefix_test._get_is_equals_iterator_valid());
if (expected[i] == 1) {
iter_exact_prefix_test.next();
}
ASSERT_EQ(seq_ids[i], iter_exact_prefix_test.seq_id);
}
ASSERT_EQ(filter_result_iterator_t::invalid, iter_exact_prefix_test.validity);
delete filter_tree_root;
filter_tree_root = nullptr;
filter_op = filter::parse_filter_query("name: S*", coll->get_schema(), store, doc_id_prefix,
filter_tree_root);
ASSERT_TRUE(filter_op.ok());
auto iter_contains_prefix_test = filter_result_iterator_t(coll->get_name(), coll->_get_index(), filter_tree_root);
ASSERT_TRUE(iter_contains_prefix_test.init_status().ok());
ASSERT_FALSE(iter_contains_prefix_test._get_is_filter_result_initialized());
validate_ids = {0, 1, 2, 3, 4, 5};
seq_ids = {1, 2, 3, 4, 4, 4};
equals_match_seq_ids = {0, 1, 2, 3, 4, 4};
equals_iterator_valid = {true, true, true, true, true, false};
expected = {1, 1, 1, 1, 1, -1};
for (uint32_t i = 0; i < validate_ids.size(); i++) {
if (i < 5) {
ASSERT_EQ(filter_result_iterator_t::valid, iter_contains_prefix_test.validity);
} else {
ASSERT_EQ(filter_result_iterator_t::invalid, iter_contains_prefix_test.validity);
}
ASSERT_EQ(expected[i], iter_contains_prefix_test.is_valid(validate_ids[i]));
ASSERT_EQ(equals_match_seq_ids[i], iter_contains_prefix_test._get_equals_iterator_id());
ASSERT_EQ(equals_iterator_valid[i], iter_contains_prefix_test._get_is_equals_iterator_valid());
if (expected[i] == 1) {
iter_contains_prefix_test.next();
}
ASSERT_EQ(seq_ids[i], iter_contains_prefix_test.seq_id);
}
ASSERT_EQ(filter_result_iterator_t::invalid, iter_contains_prefix_test.validity);
delete filter_tree_root;
filter_tree_root = nullptr;
filter_op = filter::parse_filter_query("name:= Steve R*", coll->get_schema(), store, doc_id_prefix,
filter_tree_root);
ASSERT_TRUE(filter_op.ok());
auto computed_exact_prefix_test_2 = filter_result_iterator_t(coll->get_name(), coll->_get_index(), filter_tree_root);
ASSERT_TRUE(computed_exact_prefix_test_2.init_status().ok());
ASSERT_TRUE(computed_exact_prefix_test_2._get_is_filter_result_initialized());
expected = {2, 4};
for (auto const& i : expected) {
ASSERT_EQ(filter_result_iterator_t::valid, computed_exact_prefix_test_2.validity);
ASSERT_EQ(i, computed_exact_prefix_test_2.seq_id);
computed_exact_prefix_test_2.next();
}
ASSERT_EQ(filter_result_iterator_t::invalid, computed_exact_prefix_test_2.validity);
delete filter_tree_root;
filter_tree_root = nullptr;
filter_op = filter::parse_filter_query("name: Steve R*", coll->get_schema(), store, doc_id_prefix,
filter_tree_root);
ASSERT_TRUE(filter_op.ok());
auto computed_contains_prefix_test_2 = filter_result_iterator_t(coll->get_name(), coll->_get_index(), filter_tree_root);
ASSERT_TRUE(computed_contains_prefix_test_2.init_status().ok());
ASSERT_TRUE(computed_contains_prefix_test_2._get_is_filter_result_initialized());
expected = {2, 4};
for (auto const& i : expected) {
ASSERT_EQ(filter_result_iterator_t::valid, computed_contains_prefix_test_2.validity);
ASSERT_EQ(i, computed_contains_prefix_test_2.seq_id);
computed_contains_prefix_test_2.next();
}
ASSERT_EQ(filter_result_iterator_t::invalid, computed_contains_prefix_test_2.validity);
delete filter_tree_root;
documents = {
R"({
"name": "Steve Runner foo"
})"_json,
R"({
"name": "foo Steve Runner"
})"_json,
};
for (auto const &json: documents) {
auto add_op = collection_create_op.get()->add(json.dump());
ASSERT_TRUE(add_op.ok());
}
filter_tree_root = nullptr;
filter_op = filter::parse_filter_query("name:= Steve R*", coll->get_schema(), store, doc_id_prefix,
filter_tree_root);
ASSERT_TRUE(filter_op.ok());
auto iter_exact_prefix_test_2 = filter_result_iterator_t(coll->get_name(), coll->_get_index(), filter_tree_root);
ASSERT_TRUE(iter_exact_prefix_test_2.init_status().ok());
ASSERT_FALSE(iter_exact_prefix_test_2._get_is_filter_result_initialized());
validate_ids = {0, 1, 2, 3, 4, 5, 6, 7};
seq_ids = {2, 2, 4, 4, 5, 5, 5, 5};
equals_match_seq_ids = {2, 2, 2, 4, 4, 5, 5, 5};
equals_iterator_valid = {true, true, true, true, true, true, false, false};
expected = {0, 0, 1, 0, 1, 1, -1, -1};
for (uint32_t i = 0; i < validate_ids.size(); i++) {
if (i < 6) {
ASSERT_EQ(filter_result_iterator_t::valid, iter_exact_prefix_test_2.validity);
} else {
ASSERT_EQ(filter_result_iterator_t::invalid, iter_exact_prefix_test_2.validity);
}
ASSERT_EQ(expected[i], iter_exact_prefix_test_2.is_valid(validate_ids[i]));
ASSERT_EQ(equals_match_seq_ids[i], iter_exact_prefix_test_2._get_equals_iterator_id());
ASSERT_EQ(equals_iterator_valid[i], iter_exact_prefix_test_2._get_is_equals_iterator_valid());
if (expected[i] == 1) {
iter_exact_prefix_test_2.next();
}
ASSERT_EQ(seq_ids[i], iter_exact_prefix_test_2.seq_id);
}
ASSERT_EQ(filter_result_iterator_t::invalid, iter_exact_prefix_test_2.validity);
delete filter_tree_root;
filter_tree_root = nullptr;
filter_op = filter::parse_filter_query("name: Steve R*", coll->get_schema(), store, doc_id_prefix,
filter_tree_root);
ASSERT_TRUE(filter_op.ok());
auto iter_contains_prefix_test_2 = filter_result_iterator_t(coll->get_name(), coll->_get_index(), filter_tree_root);
ASSERT_TRUE(iter_contains_prefix_test_2.init_status().ok());
ASSERT_FALSE(iter_contains_prefix_test_2._get_is_filter_result_initialized());
validate_ids = {0, 1, 2, 3, 4, 5, 6, 7};
seq_ids = {2, 2, 4, 4, 5, 6, 6, 6};
equals_match_seq_ids = {2, 2, 2, 4, 4, 5, 6, 6};
equals_iterator_valid = {true, true, true, true, true, true, true, false};
expected = {0, 0, 1, 0, 1, 1, 1, -1};
for (uint32_t i = 0; i < validate_ids.size(); i++) {
if (i < 7) {
ASSERT_EQ(filter_result_iterator_t::valid, iter_contains_prefix_test_2.validity);
} else {
ASSERT_EQ(filter_result_iterator_t::invalid, iter_contains_prefix_test_2.validity);
}
ASSERT_EQ(expected[i], iter_contains_prefix_test_2.is_valid(validate_ids[i]));
ASSERT_EQ(equals_match_seq_ids[i], iter_contains_prefix_test_2._get_equals_iterator_id());
ASSERT_EQ(equals_iterator_valid[i], iter_contains_prefix_test_2._get_is_equals_iterator_valid());
if (expected[i] == 1) {
iter_contains_prefix_test_2.next();
}
ASSERT_EQ(seq_ids[i], iter_contains_prefix_test_2.seq_id);
}
ASSERT_EQ(filter_result_iterator_t::invalid, iter_contains_prefix_test_2.validity);
delete filter_tree_root;
}