Add filter_result_iterator_t::compute_result.

This commit is contained in:
Harpreet Sangar 2023-06-15 18:59:53 +05:30
parent 7efa690810
commit fa42191110
3 changed files with 132 additions and 0 deletions

View File

@ -160,6 +160,9 @@ public:
/// Returns the status of the initialization of iterator tree.
Option<bool> init_status();
/// Recursively computes the result of each node and stores the final result in the root node.
void compute_result();
/// Returns a tri-state:
/// 0: id is not valid
/// 1: id is valid

View File

@ -1528,3 +1528,131 @@ void filter_result_iterator_t::add_phrase_ids(filter_result_iterator_t*& filter_
root_iterator->seq_id = left_it->seq_id;
filter_result_iterator = root_iterator;
}
/// Recursively computes the filter result of this node and its children.
///
/// Operator nodes (AND/OR) first materialize both subtrees and then combine
/// them; leaf nodes are evaluated only when they refer to a string field that
/// has an entry in the search index (other field types are expected to have
/// been initialized already — NOTE(review): confirm against the iterator's
/// init path). On completion, `filter_result` holds the matching doc ids,
/// `seq_id` points at the first match (when any exist), and
/// `approx_filter_ids_length` is updated to the exact count.
void filter_result_iterator_t::compute_result() {
    if (filter_node->isOperator) {
        // Mirror the leaf-path guard below: recomputing an already
        // initialized node would overwrite `filter_result` and leak the
        // previously allocated docs array.
        if (is_filter_result_initialized) {
            return;
        }

        left_it->compute_result();
        right_it->compute_result();

        if (filter_node->filter_operator == AND) {
            filter_result_t::and_filter_results(left_it->filter_result, right_it->filter_result, filter_result);
        } else {
            filter_result_t::or_filter_results(left_it->filter_result, right_it->filter_result, filter_result);
        }

        result_index = 0;
        // The combined result can be empty, in which case `docs` may be null;
        // only dereference it when there is at least one id.
        if (filter_result.count > 0) {
            seq_id = filter_result.docs[result_index];
        }
        is_filter_result_initialized = true;
        approx_filter_ids_length = filter_result.count;
        return;
    }

    // Only string field filter needs to be evaluated.
    if (is_filter_result_initialized || index->search_index.count(filter_node->filter_exp.field_name) == 0) {
        return;
    }

    auto const& a_filter = filter_node->filter_exp;
    auto const& f = index->search_schema.at(a_filter.field_name);
    art_tree* t = index->search_index.at(a_filter.field_name);

    // Running union of ids across all filter values.
    uint32_t* or_ids = nullptr;
    size_t or_ids_size = 0;

    // aggregates IDs across array of filter values and reduces excessive ORing
    std::vector<uint32_t> f_id_buff;

    for (const std::string& filter_value : a_filter.values) {
        std::vector<void*> posting_lists;

        // there could be multiple tokens in a filter value, which we have to treat as ANDs
        // e.g. country: South Africa
        Tokenizer tokenizer(filter_value, true, false, f.locale, index->symbols_to_index, index->token_separators);
        std::string str_token;
        size_t token_index = 0;
        std::vector<std::string> str_tokens;

        while (tokenizer.next(str_token, token_index)) {
            str_tokens.push_back(str_token);

            art_leaf* leaf = (art_leaf *) art_search(t, (const unsigned char*) str_token.c_str(),
                                                    str_token.length()+1);
            if (leaf == nullptr) {
                continue;
            }

            posting_lists.push_back(leaf->values);
        }

        // A token without a posting list means this value cannot match anywhere.
        if (posting_lists.size() != str_tokens.size()) {
            continue;
        }

        if(a_filter.comparators[0] == EQUALS || a_filter.comparators[0] == NOT_EQUALS) {
            // needs intersection + exact matching (unlike CONTAINS)
            std::vector<uint32_t> result_id_vec;
            posting_t::intersect(posting_lists, result_id_vec);

            if (result_id_vec.empty()) {
                continue;
            }

            // need to do exact match
            uint32_t* exact_str_ids = new uint32_t[result_id_vec.size()];
            size_t exact_str_ids_size = 0;
            std::unique_ptr<uint32_t[]> exact_str_ids_guard(exact_str_ids);

            posting_t::get_exact_matches(posting_lists, f.is_array(), result_id_vec.data(), result_id_vec.size(),
                                         exact_str_ids, exact_str_ids_size);

            if (exact_str_ids_size == 0) {
                continue;
            }

            for (size_t ei = 0; ei < exact_str_ids_size; ei++) {
                f_id_buff.push_back(exact_str_ids[ei]);
            }
        } else {
            // CONTAINS
            size_t before_size = f_id_buff.size();
            posting_t::intersect(posting_lists, f_id_buff);
            if (f_id_buff.size() == before_size) {
                continue;
            }
        }

        // Periodically flush the buffer into `or_ids` to bound its size.
        if (f_id_buff.size() > 100000 || a_filter.values.size() == 1) {
            gfx::timsort(f_id_buff.begin(), f_id_buff.end());
            f_id_buff.erase(std::unique( f_id_buff.begin(), f_id_buff.end() ), f_id_buff.end());

            uint32_t* out = nullptr;
            or_ids_size = ArrayUtils::or_scalar(or_ids, or_ids_size, f_id_buff.data(), f_id_buff.size(), &out);
            delete[] or_ids;
            or_ids = out;

            std::vector<uint32_t>().swap(f_id_buff); // clears out memory
        }
    }

    // Flush any ids left over after the last filter value.
    if (!f_id_buff.empty()) {
        gfx::timsort(f_id_buff.begin(), f_id_buff.end());
        f_id_buff.erase(std::unique( f_id_buff.begin(), f_id_buff.end() ), f_id_buff.end());

        uint32_t* out = nullptr;
        or_ids_size = ArrayUtils::or_scalar(or_ids, or_ids_size, f_id_buff.data(), f_id_buff.size(), &out);
        delete[] or_ids;
        or_ids = out;

        std::vector<uint32_t>().swap(f_id_buff); // clears out memory
    }

    filter_result.docs = or_ids;
    filter_result.count = or_ids_size;

    if (a_filter.apply_not_equals) {
        // Invert the matched set against all sequence ids in the index.
        apply_not_equals(index->seq_ids->uncompress(), index->seq_ids->num_ids(), filter_result.docs, filter_result.count);
    }

    result_index = 0;
    // `docs` is null when no id matched; avoid a null-pointer dereference.
    if (filter_result.count > 0) {
        seq_id = filter_result.docs[result_index];
    }
    is_filter_result_initialized = true;
    approx_filter_ids_length = filter_result.count;
}

View File

@ -4549,6 +4549,7 @@ void Index::search_wildcard(filter_node_t const* const& filter_tree_root,
std::array<spp::sparse_hash_map<uint32_t, int64_t>*, 3>& field_values,
const std::vector<size_t>& geopoint_indices) const {
filter_result_iterator->compute_result();
auto const& approx_filter_ids_length = filter_result_iterator->approx_filter_ids_length;
uint32_t token_bits = 0;