Filtering performance fix.

This commit is contained in:
kishorenc 2020-08-11 17:43:14 +05:30
parent 13591fc017
commit 83e3661297
2 changed files with 113 additions and 89 deletions

View File

@ -184,6 +184,8 @@ private:
void remove_and_shift_offset_index(sorted_array &offset_index, const uint32_t *indices_sorted,
const uint32_t indices_length);
uint32_t* collate_leaf_ids(const std::vector<const art_leaf *> &leaves, size_t& result_ids_len) const;
void collate_included_ids(const std::string & query, const std::string & field, const uint8_t field_id,
const std::map<size_t, std::map<size_t, uint32_t>> & included_ids_map,
Topster* curated_topster, std::vector<std::vector<art_leaf*>> & searched_queries);

View File

@ -894,106 +894,112 @@ Option<uint32_t> Index::do_filtering(uint32_t** filter_ids_out, const std::vecto
for(size_t i = 0; i < filters.size(); i++) {
const filter & a_filter = filters[i];
if(search_index.count(a_filter.field_name) == 0) {
continue;
}
if(search_index.count(a_filter.field_name) != 0) {
art_tree* t = search_index.at(a_filter.field_name);
field f = search_schema.at(a_filter.field_name);
std::vector<std::pair<uint32_t*, size_t>> filter_result_array_pairs;
art_tree* t = search_index.at(a_filter.field_name);
field f = search_schema.at(a_filter.field_name);
if(f.is_integer()) {
std::vector<const art_leaf*> leaves;
uint32_t* result_ids = nullptr;
size_t result_ids_len = 0;
for(const std::string & filter_value: a_filter.values) {
if(f.type == field_types::INT32 || f.type == field_types::INT32_ARRAY) {
int32_t value = (int32_t) std::stoi(filter_value);
art_int32_search(t, value, a_filter.compare_operator, leaves);
} else { // int64
int64_t value = (int64_t) std::stol(filter_value);
art_int64_search(t, value, a_filter.compare_operator, leaves);
}
if(f.is_integer()) {
std::vector<const art_leaf*> leaves;
std::vector<uint32_t> ids;
for(const art_leaf* leaf: leaves) {
filter_result_array_pairs.push_back(std::make_pair(leaf->values->ids.uncompress(),
leaf->values->ids.getLength()));
}
}
} else if(f.is_float()) {
std::vector<const art_leaf*> leaves;
for(const std::string & filter_value: a_filter.values) {
float value = (float) std::atof(filter_value.c_str());
art_float_search(t, value, a_filter.compare_operator, leaves);
for(const art_leaf* leaf: leaves) {
filter_result_array_pairs.push_back(std::make_pair(leaf->values->ids.uncompress(),
leaf->values->ids.getLength()));
}
}
} else if(f.is_bool()) {
std::vector<const art_leaf*> leaves;
for(const std::string & filter_value: a_filter.values) {
art_leaf* leaf = (art_leaf *) art_search(t, (const unsigned char*) filter_value.c_str(),
filter_value.length());
if(leaf) {
filter_result_array_pairs.push_back(std::make_pair(leaf->values->ids.uncompress(),
leaf->values->ids.getLength()));
}
}
} else if(f.is_string()) {
for(const std::string & filter_value: a_filter.values) {
std::vector<std::string> str_tokens;
StringUtils::split(filter_value, str_tokens, " ");
uint32_t* filtered_ids = nullptr;
size_t filtered_size = 0;
for(size_t i = 0; i < str_tokens.size(); i++) {
std::string & str_token = str_tokens[i];
string_utils.unicode_normalize(str_token);
art_leaf* leaf = (art_leaf *) art_search(t, (const unsigned char*) str_token.c_str(),
str_token.length()+1);
if(leaf == nullptr) {
continue;
}
if(i == 0) {
filtered_ids = leaf->values->ids.uncompress();
filtered_size = leaf->values->ids.getLength();
} else {
// do AND for an exact match
uint32_t* out = nullptr;
uint32_t* leaf_ids = leaf->values->ids.uncompress();
filtered_size = ArrayUtils::and_scalar(filtered_ids, filtered_size, leaf_ids,
leaf->values->ids.getLength(), &out);
delete[] leaf_ids;
delete[] filtered_ids;
filtered_ids = out;
}
}
filter_result_array_pairs.emplace_back(filtered_ids, filtered_size);
for(const std::string & filter_value: a_filter.values) {
if(f.type == field_types::INT32 || f.type == field_types::INT32_ARRAY) {
int32_t value = (int32_t) std::stoi(filter_value);
art_int32_search(t, value, a_filter.compare_operator, leaves);
} else { // int64
int64_t value = (int64_t) std::stol(filter_value);
art_int64_search(t, value, a_filter.compare_operator, leaves);
}
}
uint32_t* result_ids = nullptr;
size_t result_ids_length = union_of_ids(filter_result_array_pairs, &result_ids);
result_ids = collate_leaf_ids(leaves, result_ids_len);
if(i == 0) {
filter_ids = result_ids;
filter_ids_length = result_ids_length;
} else {
uint32_t* filtered_results = nullptr;
filter_ids_length = ArrayUtils::and_scalar(filter_ids, filter_ids_length, result_ids,
result_ids_length, &filtered_results);
delete [] result_ids;
delete [] filter_ids;
filter_ids = filtered_results;
} else if(f.is_float()) {
std::vector<const art_leaf*> leaves;
std::vector<uint32_t> ids;
for(const std::string & filter_value: a_filter.values) {
float value = (float) std::atof(filter_value.c_str());
art_float_search(t, value, a_filter.compare_operator, leaves);
}
for(std::pair<uint32_t*, size_t> & filter_result_array_pair: filter_result_array_pairs) {
delete[] filter_result_array_pair.first;
result_ids = collate_leaf_ids(leaves, result_ids_len);
} else if(f.is_bool()) {
std::vector<const art_leaf*> leaves;
for(const std::string & filter_value: a_filter.values) {
art_leaf* leaf = (art_leaf *) art_search(t, (const unsigned char*) filter_value.c_str(),
filter_value.length());
if(leaf) {
leaves.push_back(leaf);
}
}
result_ids = collate_leaf_ids(leaves, result_ids_len);
} else if(f.is_string()) {
uint32_t* ids = nullptr;
size_t ids_size = 0;
for(const std::string & filter_value: a_filter.values) {
std::vector<std::string> str_tokens;
StringUtils::split(filter_value, str_tokens, " ");
uint32_t* strt_ids = nullptr;
size_t strt_ids_size = 0;
// there could be multiple tokens in a filter value, which we have to treat as ANDs
// e.g. country: South Africa
for(auto & str_token : str_tokens) {
string_utils.unicode_normalize(str_token);
art_leaf* leaf = (art_leaf *) art_search(t, (const unsigned char*) str_token.c_str(),
str_token.length()+1);
if(leaf == nullptr) {
continue;
}
if(strt_ids == nullptr) {
strt_ids = leaf->values->ids.uncompress();
strt_ids_size = leaf->values->ids.getLength();
} else {
// do AND for an exact match
uint32_t* out = nullptr;
uint32_t* leaf_ids = leaf->values->ids.uncompress();
strt_ids_size = ArrayUtils::and_scalar(strt_ids, strt_ids_size, leaf_ids,
leaf->values->ids.getLength(), &out);
delete[] leaf_ids;
delete[] strt_ids;
strt_ids = out;
}
}
uint32_t* out = nullptr;
ids_size = ArrayUtils::or_scalar(ids, ids_size, strt_ids, strt_ids_size, &out);
delete[] strt_ids;
delete[] ids;
ids = out;
}
result_ids = ids;
result_ids_len = ids_size;
}
if(i == 0) {
filter_ids = result_ids;
filter_ids_length = result_ids_len;
} else {
uint32_t* filtered_results = nullptr;
filter_ids_length = ArrayUtils::and_scalar(filter_ids, filter_ids_length, result_ids,
result_ids_len, &filtered_results);
delete [] result_ids;
delete [] filter_ids;
filter_ids = filtered_results;
}
}
@ -1001,6 +1007,22 @@ Option<uint32_t> Index::do_filtering(uint32_t** filter_ids_out, const std::vecto
return Option<>(filter_ids_length);
}
uint32_t* Index::collate_leaf_ids(const std::vector<const art_leaf *> &leaves, size_t& result_ids_len) const {
std::vector<uint32_t> ids;
for(const art_leaf* leaf: leaves) {
uint32_t num_ids = leaf->values->ids.getLength();
uint32_t* leaf_ids = leaf->values->ids.uncompress();
std::copy(leaf_ids, leaf_ids + num_ids, std::back_inserter(ids));
}
uint32_t* result_ids = new uint32_t[ids.size()];
std::sort(ids.begin(), ids.end());
std::copy(ids.begin(), ids.end(), result_ids);
result_ids_len = ids.size();
return result_ids;
}
void Index::run_search() {
while(true) {
// wait until main thread sends data