From eb44e3e592e3036976c910add70cc802e4cbace5 Mon Sep 17 00:00:00 2001 From: Harpreet Sangar Date: Mon, 17 Apr 2023 20:54:50 +0530 Subject: [PATCH] Add numeric field support in `filter_result_t`. --- include/ids_t.h | 8 +- include/num_tree.h | 10 ++ src/filter_result_iterator.cpp | 199 +++++++++++++++++++++++++++++++-- src/num_tree.cpp | 78 +++++++++++++ 4 files changed, 283 insertions(+), 12 deletions(-) diff --git a/include/ids_t.h b/include/ids_t.h index 15cf8c6e..949c71b8 100644 --- a/include/ids_t.h +++ b/include/ids_t.h @@ -39,11 +39,6 @@ struct compact_id_list_t { }; class ids_t { -private: - - static void to_expanded_id_lists(const std::vector& raw_id_lists, std::vector& id_lists, - std::vector& expanded_id_lists); - public: static constexpr size_t COMPACT_LIST_THRESHOLD_LENGTH = 64; static constexpr size_t MAX_BLOCK_ELEMENTS = 256; @@ -104,6 +99,9 @@ public: static uint32_t* uncompress(void*& obj); static void uncompress(void*& obj, std::vector& ids); + + static void to_expanded_id_lists(const std::vector& raw_id_lists, std::vector& id_lists, + std::vector& expanded_id_lists); }; template diff --git a/include/num_tree.h b/include/num_tree.h index 2170a30e..444f6266 100644 --- a/include/num_tree.h +++ b/include/num_tree.h @@ -30,6 +30,11 @@ public: void range_inclusive_search(int64_t start, int64_t end, uint32_t** ids, size_t& ids_len); + void range_inclusive_search_iterators(int64_t start, + int64_t end, + std::vector& id_list_iterators, + std::vector& expanded_id_lists); + void approx_range_inclusive_search_count(int64_t start, int64_t end, uint32_t& ids_len); void range_inclusive_contains(const int64_t& start, const int64_t& end, @@ -42,6 +47,11 @@ public: void search(NUM_COMPARATOR comparator, int64_t value, uint32_t** ids, size_t& ids_len); + void search_iterators(NUM_COMPARATOR comparator, + int64_t value, + std::vector& id_list_iterators, + std::vector& expanded_id_lists); + void approx_search_count(NUM_COMPARATOR comparator, int64_t value, uint32_t& ids_len); void remove(uint64_t value, uint32_t id); diff --git a/src/filter_result_iterator.cpp b/src/filter_result_iterator.cpp index f7217c0a..6076807f 100644 --- a/src/filter_result_iterator.cpp +++ b/src/filter_result_iterator.cpp @@ -1,3 +1,5 @@ +#include +#include #include "filter_result_iterator.h" #include "index.h" #include "posting.h" @@ -395,7 +397,16 @@ void filter_result_iterator_t::next() { field f = index->search_schema.at(a_filter.field_name); - if (f.is_string()) { + if (f.is_integer() || f.is_float() || f.is_bool()) { + result_index++; + if (result_index >= filter_result.count) { + is_valid = false; + return; + } + + seq_id = filter_result.docs[result_index]; + return; + } else if (f.is_string()) { if (filter_node->filter_exp.apply_not_equals) { if (++seq_id < result_index) { return; @@ -432,6 +443,40 @@ void filter_result_iterator_t::next() { } } +void merge_id_list_iterators(std::vector& id_list_iterators, + std::vector& result_ids) { + struct comp { + bool operator()(const id_list_t::iterator_t *lhs, const id_list_t::iterator_t *rhs) const { + return lhs->id() > rhs->id(); + } + }; + + std::priority_queue, comp> iter_queue; + for (auto& id_list_iterator: id_list_iterators) { + if (id_list_iterator.valid()) { + iter_queue.push(&id_list_iterator); + } + } + + if (iter_queue.empty()) { + return; + } + + // TODO: Handle != + + do { + id_list_t::iterator_t* iter = iter_queue.top(); + iter_queue.pop(); + + result_ids.push_back(iter->id()); + iter->next(); + + if (iter->valid()) { + iter_queue.push(iter); + } + } while (!iter_queue.empty()); +} + void filter_result_iterator_t::init() { if (filter_node == nullptr) { return; @@ -470,7 +515,12 @@ void filter_result_iterator_t::init() { return; } - is_valid = filter_result.count > 0; + if (filter_result.count == 0) { + is_valid = false; + return; + } + + seq_id = filter_result.docs[result_index]; return; } @@ -491,6 +541,7 @@ void filter_result_iterator_t::init() { filter_result.count = result_ids.size(); filter_result.docs = new uint32_t[result_ids.size()]; std::copy(result_ids.begin(), result_ids.end(), filter_result.docs); + seq_id = filter_result.docs[result_index]; } if (!index->field_is_indexed(a_filter.field_name)) { @@ -500,7 +551,112 @@ void filter_result_iterator_t::init() { field f = index->search_schema.at(a_filter.field_name); - if (f.is_string()) { + if (f.is_integer()) { + auto num_tree = index->numerical_index.at(a_filter.field_name); + + std::vector ids; + for (size_t fi = 0; fi < a_filter.values.size(); fi++) { + const std::string& filter_value = a_filter.values[fi]; + int64_t value = (int64_t)std::stol(filter_value); + std::vector id_list_iterators; + std::vector expanded_id_lists; + + if (a_filter.comparators[fi] == RANGE_INCLUSIVE && fi+1 < a_filter.values.size()) { + const std::string& next_filter_value = a_filter.values[fi + 1]; + auto const range_end_value = (int64_t)std::stol(next_filter_value); + num_tree->range_inclusive_search_iterators(value, range_end_value, id_list_iterators, expanded_id_lists); + fi++; + } else { + num_tree->search_iterators(a_filter.comparators[fi] == NOT_EQUALS ? EQUALS : a_filter.comparators[fi], + value, id_list_iterators, expanded_id_lists); + } + + merge_id_list_iterators(id_list_iterators, ids); + + for(id_list_t* expanded_id_list: expanded_id_lists) { + delete expanded_id_list; + } + } + + if (ids.empty()) { + is_valid = false; + return; + } + + filter_result.count = ids.size(); + filter_result.docs = new uint32_t[ids.size()]; + std::copy(ids.begin(), ids.end(), filter_result.docs); + seq_id = filter_result.docs[result_index]; + } else if (f.is_float()) { + auto num_tree = index->numerical_index.at(a_filter.field_name); + + std::vector ids; + for (size_t fi = 0; fi < a_filter.values.size(); fi++) { + const std::string& filter_value = a_filter.values[fi]; + float value = (float)std::atof(filter_value.c_str()); + int64_t float_int64 = Index::float_to_int64_t(value); + std::vector id_list_iterators; + std::vector expanded_id_lists; + + if (a_filter.comparators[fi] == RANGE_INCLUSIVE && fi+1 < a_filter.values.size()) { + const std::string& next_filter_value = a_filter.values[fi+1]; + int64_t range_end_value = Index::float_to_int64_t((float) std::atof(next_filter_value.c_str())); + num_tree->range_inclusive_search_iterators(float_int64, range_end_value, + id_list_iterators, expanded_id_lists); + fi++; + } else { + num_tree->search_iterators(a_filter.comparators[fi] == NOT_EQUALS ? EQUALS : a_filter.comparators[fi], + float_int64, id_list_iterators, expanded_id_lists); + } + + merge_id_list_iterators(id_list_iterators, ids); + + for(id_list_t* expanded_id_list: expanded_id_lists) { + delete expanded_id_list; + } + } + + if (ids.empty()) { + is_valid = false; + return; + } + + filter_result.count = ids.size(); + filter_result.docs = new uint32_t[ids.size()]; + std::copy(ids.begin(), ids.end(), filter_result.docs); + seq_id = filter_result.docs[result_index]; + } else if (f.is_bool()) { + auto num_tree = index->numerical_index.at(a_filter.field_name); + + std::vector ids; + size_t value_index = 0; + for (const std::string& filter_value : a_filter.values) { + int64_t bool_int64 = (filter_value == "1") ? 1 : 0; + std::vector id_list_iterators; + std::vector expanded_id_lists; + + num_tree->search_iterators(a_filter.comparators[value_index] == NOT_EQUALS ? EQUALS : a_filter.comparators[value_index], + bool_int64, id_list_iterators, expanded_id_lists); + + merge_id_list_iterators(id_list_iterators, ids); + + for(id_list_t* expanded_id_list: expanded_id_lists) { + delete expanded_id_list; + } + + value_index++; + } + + if (ids.empty()) { + is_valid = false; + return; + } + + filter_result.count = ids.size(); + filter_result.docs = new uint32_t[ids.size()]; + std::copy(ids.begin(), ids.end(), filter_result.docs); + seq_id = filter_result.docs[result_index]; + } else if (f.is_string()) { art_tree* t = index->search_index.at(a_filter.field_name); for (const std::string& filter_value : a_filter.values) { @@ -615,7 +771,10 @@ bool filter_result_iterator_t::valid() { field f = index->search_schema.at(a_filter.field_name); - if (f.is_string()) { + if (f.is_integer() || f.is_float() || f.is_bool()) { + is_valid = result_index < filter_result.count; + return is_valid; + } else if (f.is_string()) { if (filter_node->filter_exp.apply_not_equals) { return seq_id < result_index; } @@ -694,7 +853,17 @@ void filter_result_iterator_t::skip_to(uint32_t id) { field f = index->search_schema.at(a_filter.field_name); - if (f.is_string()) { + if (f.is_integer() || f.is_float() || f.is_bool()) { + while(result_index < filter_result.count && filter_result.docs[result_index] < id) { + result_index++; + } + + if (result_index >= filter_result.count) { + is_valid = false; + } + + return; + } else if (f.is_string()) { if (filter_node->filter_exp.apply_not_equals) { if (id < seq_id) { return; @@ -861,8 +1030,14 @@ void filter_result_iterator_t::reset() { bool is_referenced_filter = !a_filter.referenced_collection_name.empty(); if (is_referenced_filter || a_filter.field_name == "id") { + if (filter_result.count == 0) { + is_valid = false; + return; + } + result_index = 0; - is_valid = filter_result.count > 0; + seq_id = filter_result.docs[result_index]; + is_valid = true; return; } @@ -872,7 +1047,17 @@ void filter_result_iterator_t::reset() { field f = index->search_schema.at(a_filter.field_name); - if (f.is_string()) { + if (f.is_integer() || f.is_float() || f.is_bool()) { + if (filter_result.count == 0) { + is_valid = false; + return; + } + + result_index = 0; + seq_id = filter_result.docs[result_index]; + is_valid = true; + return; + } else if (f.is_string()) { posting_list_iterators.clear(); for(auto expanded_plist: expanded_plists) { delete expanded_plist; diff --git a/src/num_tree.cpp b/src/num_tree.cpp index c59cb008..89c5e3a0 100644 --- a/src/num_tree.cpp +++ b/src/num_tree.cpp @@ -43,6 +43,30 @@ void num_tree_t::range_inclusive_search(int64_t start, int64_t end, uint32_t** i *ids = out; } +void num_tree_t::range_inclusive_search_iterators(int64_t start, + int64_t end, + std::vector& id_list_iterators, + std::vector& expanded_id_lists) { + if (int64map.empty()) { + return; + } + + auto it_start = int64map.lower_bound(start); // iter values will be >= start + + std::vector raw_id_lists; + while (it_start != int64map.end() && it_start->first <= end) { + raw_id_lists.push_back(it_start->second); + it_start++; + } + + std::vector id_lists; + ids_t::to_expanded_id_lists(raw_id_lists, id_lists, expanded_id_lists); + + for (const auto &id_list: id_lists) { + id_list_iterators.emplace_back(id_list->new_iterator()); + } +} + void num_tree_t::approx_range_inclusive_search_count(int64_t start, int64_t end, uint32_t& ids_len) { if (int64map.empty()) { return; @@ -187,6 +211,60 @@ void num_tree_t::search(NUM_COMPARATOR comparator, int64_t value, uint32_t** ids } } +void num_tree_t::search_iterators(NUM_COMPARATOR comparator, + int64_t value, + std::vector& id_list_iterators, + std::vector& expanded_id_lists) { + if (int64map.empty()) { + return ; + } + + std::vector raw_id_lists; + if (comparator == EQUALS) { + const auto& it = int64map.find(value); + if (it != int64map.end()) { + raw_id_lists.emplace_back(it->second); + } + } else if (comparator == GREATER_THAN || comparator == GREATER_THAN_EQUALS) { + // iter entries will be >= value, or end() if all entries are before value + auto iter_ge_value = int64map.lower_bound(value); + + if(iter_ge_value == int64map.end()) { + return ; + } + + if(comparator == GREATER_THAN && iter_ge_value->first == value) { + iter_ge_value++; + } + + while(iter_ge_value != int64map.end()) { + raw_id_lists.emplace_back(iter_ge_value->second); + iter_ge_value++; + } + } else if(comparator == LESS_THAN || comparator == LESS_THAN_EQUALS) { + // iter entries will be >= value, or end() if all entries are before value + auto iter_ge_value = int64map.lower_bound(value); + + auto it = int64map.begin(); + while(it != iter_ge_value) { + raw_id_lists.emplace_back(it->second); + it++; + } + + // for LESS_THAN_EQUALS, check if last iter entry is equal to value + if(it != int64map.end() && comparator == LESS_THAN_EQUALS && it->first == value) { + raw_id_lists.emplace_back(it->second); + } + } + + std::vector id_lists; + ids_t::to_expanded_id_lists(raw_id_lists, id_lists, expanded_id_lists); + + for (const auto &id_list: id_lists) { + id_list_iterators.emplace_back(id_list->new_iterator()); + } +} + void num_tree_t::approx_search_count(NUM_COMPARATOR comparator, int64_t value, uint32_t& ids_len) { if (int64map.empty()) { return;