From 72b8d4c26ece8280016583de1085120b10eea5c7 Mon Sep 17 00:00:00 2001 From: Harpreet Sangar Date: Sun, 22 Jan 2023 12:02:29 +0530 Subject: [PATCH 01/51] Filter by reference. --- include/collection.h | 2 ++ src/collection.cpp | 16 +++++++++++ src/field.cpp | 65 +++++++++++++++++++++++++++++++++++++------- 3 files changed, 73 insertions(+), 10 deletions(-) diff --git a/include/collection.h b/include/collection.h index 7d31e9e5..977a83dc 100644 --- a/include/collection.h +++ b/include/collection.h @@ -463,6 +463,8 @@ public: Option validate_reference_filter(const std::string& filter_query) const; + Option validate_reference_filter(const std::string& filter_query) const; + Option get(const std::string & id) const; Option remove(const std::string & id, bool remove_from_store = true); diff --git a/src/collection.cpp b/src/collection.cpp index 6eef0147..737341f2 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -2584,6 +2584,22 @@ Option Collection::validate_reference_filter(const std::string& filter_que return Option(true); } +Option Collection::validate_reference_filter(const std::string& filter_query) const { + std::shared_lock lock(mutex); + + const std::string doc_id_prefix = std::to_string(collection_id) + "_" + DOC_ID_PREFIX + "_"; + filter_node_t* filter_tree_root = nullptr; + Option filter_op = filter::parse_filter_query(filter_query, search_schema, + store, doc_id_prefix, filter_tree_root); + + if(!filter_op.ok()) { + return filter_op; + } + + delete filter_tree_root; + return Option(true); +} + bool Collection::facet_value_to_string(const facet &a_facet, const facet_count_t &facet_count, const nlohmann::json &document, std::string &value) const { diff --git a/src/field.cpp b/src/field.cpp index 89e7e563..d9a85890 100644 --- a/src/field.cpp +++ b/src/field.cpp @@ -388,24 +388,27 @@ Option toParseTree(std::queue& postfix, filter_node_t*& root, int& and_operator_count, int& or_operator_count) { std::stack nodeStack; + bool is_successful = true; + std::string error_message; while (!postfix.empty()) { const std::string expression = postfix.front(); postfix.pop(); - filter_node_t* filter_node = nullptr; + filter_node_t *filter_node = nullptr; if (isOperator(expression)) { - auto message = "Could not parse the filter query: unbalanced `" + expression + "` operands."; - if (nodeStack.empty()) { - return Option(400, message); + is_successful = false; + error_message = "Could not parse the filter query: unbalanced `" + expression + "` operands."; + break; } auto operandB = nodeStack.top(); nodeStack.pop(); if (nodeStack.empty()) { - delete operandB; - return Option(400, message); + is_successful = false; + error_message = "Could not parse the filter query: unbalanced `" + expression + "` operands."; + break; } auto operandA = nodeStack.top(); nodeStack.pop(); @@ -414,7 +417,6 @@ Option toParseTree(std::queue& postfix, filter_node_t*& root, filter_node = new filter_node_t(expression == "&&" ? AND : OR, operandA, operandB); } else { filter filter_exp; -<<<<<<< HEAD // Expected value: $Collection(...) bool is_referenced_filter = (expression[0] == '$' && expression[expression.size() - 1] == ')'); @@ -439,7 +441,7 @@ Option toParseTree(std::queue& postfix, filter_node_t*& root, } else { Option toFilter_op = toFilter(expression, filter_exp, search_schema, store, doc_id_prefix); if (!toFilter_op.ok()) { - while(!nodeStack.empty()) { + while(!nodeStack.empty()) { auto filterNode = nodeStack.top(); delete filterNode; nodeStack.pop(); @@ -448,17 +450,60 @@ Option toParseTree(std::queue& postfix, filter_node_t*& root, } } - filter_node = new filter_node_t(filter_exp); + // Expected value: $Collection(...) + bool is_referenced_filter = (expression[0] == '$' && expression[expression.size() - 1] == ')'); + if (is_referenced_filter) { + size_t parenthesis_index = expression.find('('); + + std::string collection_name = expression.substr(1, parenthesis_index - 1); + auto &cm = CollectionManager::get_instance(); + auto collection = cm.get_collection(collection_name); + if (collection == nullptr) { + is_successful = false; + error_message = "Referenced collection `" + collection_name + "` not found."; + break; + } + + filter_exp = {expression.substr(parenthesis_index + 1, expression.size() - parenthesis_index - 2)}; + filter_exp.referenced_collection_name = collection_name; + + auto op = collection->validate_reference_filter(filter_exp.field_name); + if (!op.ok()) { + is_successful = false; + error_message = "Failed to parse reference filter on `" + collection_name + "` collection: " + + op.error(); + break; + } + } else { + Option toFilter_op = toFilter(expression, filter_exp, search_schema, store, doc_id_prefix); + if (!toFilter_op.ok()) { + is_successful = false; + error_message = toFilter_op.error(); + break; + } + + filter_node = new filter_node_t(filter_exp); + } } nodeStack.push(filter_node); } + if (!is_successful) { + while (!nodeStack.empty()) { + auto filterNode = nodeStack.top(); + delete filterNode; + nodeStack.pop(); + } + + return Option(400, error_message); + } + if (nodeStack.empty()) { return Option(400, "Filter query cannot be empty."); } - root = nodeStack.top(); + return Option(true); } From ad958be7bb690f82c7f57ed76fceb37e30b1d7b6 Mon Sep 17 00:00:00 2001 From: Harpreet Sangar Date: Fri, 27 Jan 2023 12:57:13 +0530 Subject: [PATCH 02/51] Add `Index::rearranging_recursive_filter`. --- include/field.h | 1 + src/index.cpp | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/include/field.h b/include/field.h index 37fa1936..7a90fd9d 100644 --- a/include/field.h +++ b/include/field.h @@ -606,6 +606,7 @@ struct filter_node_t { bool isOperator; filter_node_t* left = nullptr; filter_node_t* right = nullptr; + filter_tree_metrics* metrics = nullptr; filter_node_t(filter filter_exp) : filter_exp(std::move(filter_exp)), diff --git a/src/index.cpp b/src/index.cpp index 0379ae43..1b20867a 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -1830,7 +1830,6 @@ Option Index::rearrange_filter_tree(filter_node_t* const root, if (root == nullptr) { return Option(true); } - if (root->isOperator) { uint32_t l_filter_ids_length = 0; if (root->left != nullptr) { From f4b8912e19857138298bcbeef0a342f3dab73266 Mon Sep 17 00:00:00 2001 From: Harpreet Sangar Date: Fri, 27 Jan 2023 19:58:06 +0530 Subject: [PATCH 03/51] Add `Index::adaptive_filter`. --- src/index.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/index.cpp b/src/index.cpp index 1b20867a..0379ae43 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -1830,6 +1830,7 @@ Option Index::rearrange_filter_tree(filter_node_t* const root, if (root == nullptr) { return Option(true); } + if (root->isOperator) { uint32_t l_filter_ids_length = 0; if (root->left != nullptr) { From cdfa3b7a708c556772c1ec34714bb11eb9a907e2 Mon Sep 17 00:00:00 2001 From: Harpreet Sangar Date: Fri, 3 Feb 2023 14:30:17 +0530 Subject: [PATCH 04/51] Fix double locking of collection mutex. --- src/collection.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/collection.cpp b/src/collection.cpp index 737341f2..95190ac7 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -1515,7 +1515,6 @@ Option Collection::search(std::string raw_query, } // for grouping we have to re-aggregate - Topster& topster = *search_params->topster; Topster& curated_topster = *search_params->curated_topster; From 2d39461ecac47058f962e0390bc7eebca21b9fdc Mon Sep 17 00:00:00 2001 From: Harpreet Sangar Date: Mon, 27 Feb 2023 11:00:25 +0530 Subject: [PATCH 05/51] Fix rebase error. --- src/field.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/field.cpp b/src/field.cpp index d9a85890..729ae55f 100644 --- a/src/field.cpp +++ b/src/field.cpp @@ -481,9 +481,9 @@ Option toParseTree(std::queue& postfix, filter_node_t*& root, error_message = toFilter_op.error(); break; } - - filter_node = new filter_node_t(filter_exp); } + + filter_node = new filter_node_t(filter_exp); } nodeStack.push(filter_node); From e78d20991195536eed97192b52cc2c82858ea4af Mon Sep 17 00:00:00 2001 From: Harpreet Sangar Date: Fri, 3 Mar 2023 10:37:33 +0530 Subject: [PATCH 06/51] Add `filter_result_t` struct. Add `reference_filter_result_t` struct. Add support for lazy filtering. Update `rearrange_filter_tree` to return approximate count of filter matches. --- .bazelrc | 2 - include/collection.h | 6 +- include/field.h | 11 +- include/index.h | 27 ++- include/num_tree.h | 27 +++ include/posting.h | 4 +- include/topster.h | 7 +- src/collection.cpp | 24 +-- src/field.cpp | 32 ---- src/index.cpp | 323 ++++++++++++++++++++++++++++------ src/num_tree.cpp | 172 ++++++++++++++++++ src/posting.cpp | 27 ++- test/collection_join_test.cpp | 10 +- 13 files changed, 541 insertions(+), 131 deletions(-) diff --git a/.bazelrc b/.bazelrc index 0a7fa3ae..933545b7 100644 --- a/.bazelrc +++ b/.bazelrc @@ -5,5 +5,3 @@ build --cxxopt="-std=c++17" test --jobs=6 build --enable_platform_specific_config - -build:linux --action_env=BAZEL_LINKLIBS="-l%:libstdc++.a -l%:libgcc.a" diff --git a/include/collection.h b/include/collection.h index 977a83dc..27bf7920 100644 --- a/include/collection.h +++ b/include/collection.h @@ -268,6 +268,8 @@ private: + Option get_reference_field(const std::string & collection_name) const; + public: enum {MAX_ARRAY_MATCHES = 5}; @@ -455,16 +457,12 @@ public: Option get_filter_ids(const std::string & filter_query, filter_result_t& filter_result) const; - Option get_reference_field(const std::string & collection_name) const; - Option get_reference_filter_ids(const std::string & filter_query, filter_result_t& filter_result, const std::string & collection_name) const; Option validate_reference_filter(const std::string& filter_query) const; - Option validate_reference_filter(const std::string& filter_query) const; - Option get(const std::string & id) const; Option remove(const std::string & id, bool remove_from_store = true); diff --git a/include/field.h b/include/field.h index 7a90fd9d..776481d2 100644 --- a/include/field.h +++ b/include/field.h @@ -641,11 +641,18 @@ struct reference_filter_result_t { struct filter_result_t { uint32_t count = 0; uint32_t* docs = nullptr; - reference_filter_result_t* reference_filter_result = nullptr; + // Collection name -> Reference filter result + std::map reference_filter_results; + + filter_result_t() {} + + filter_result_t(uint32_t count, uint32_t* docs) : count(count), docs(docs) {} ~filter_result_t() { delete[] docs; - delete[] reference_filter_result; + for (const auto &item: reference_filter_results) { + delete[] item.second; + } } }; diff --git a/include/index.h b/include/index.h index 66f4e5de..0ce10daf 100644 --- a/include/index.h +++ b/include/index.h @@ -467,16 +467,28 @@ private: void numeric_not_equals_filter(num_tree_t* const num_tree, const int64_t value, - uint32_t*& ids, - size_t& ids_len) const; + const uint32_t& context_ids_length, + const uint32_t* context_ids, + size_t& ids_len, + uint32_t*& ids) const; + + bool field_is_indexed(const std::string& field_name) const; Option do_filtering(filter_node_t* const root, filter_result_t& result, - const std::string& collection_name = "") const; + const std::string& collection_name = "", + const uint32_t& context_ids_length = 0, + const uint32_t* context_ids = nullptr) const; - Option rearranging_recursive_filter (filter_node_t* const filter_tree_root, - filter_result_t& result, - const std::string& collection_name = "") const; + void aproximate_numerical_match(num_tree_t* const num_tree, + const NUM_COMPARATOR& comparator, + const int64_t& value, + const int64_t& range_end_value, + uint32_t& filter_ids_length) const; + + Option rearranging_recursive_filter(filter_node_t* const filter_tree_root, + filter_result_t& result, + const std::string& collection_name = "") const; Option recursive_filter(filter_node_t* const root, filter_result_t& result, @@ -687,7 +699,8 @@ public: Option do_reference_filtering_with_lock(filter_node_t* const filter_tree_root, filter_result_t& filter_result, - const std::string & reference_helper_field_name) const; + const std::string& collection_name, + const std::string& reference_helper_field_name) const; void refresh_schemas(const std::vector& new_fields, const std::vector& del_fields); diff --git a/include/num_tree.h b/include/num_tree.h index f26b72ba..280f47dd 100644 --- a/include/num_tree.h +++ b/include/num_tree.h @@ -11,6 +11,17 @@ class num_tree_t { private: std::map int64map; + [[nodiscard]] bool range_inclusive_contains(const int64_t& start, const int64_t& end, const uint32_t& id) const; + + [[nodiscard]] bool contains(const int64_t& value, const uint32_t& id) const { + if (int64map.count(value) == 0) { + return false; + } + + auto ids = int64map.at(value); + return ids_t::contains(ids, id); + } + public: ~num_tree_t(); @@ -19,11 +30,27 @@ public: void range_inclusive_search(int64_t start, int64_t end, uint32_t** ids, size_t& ids_len); + void approx_range_inclusive_search_count(int64_t start, int64_t end, uint32_t& ids_len); + + void range_inclusive_contains(const int64_t& start, const int64_t& end, + const uint32_t& context_ids_length, + const uint32_t*& context_ids, + size_t& result_ids_len, + uint32_t*& result_ids) const; + size_t get(int64_t value, std::vector& geo_result_ids); void search(NUM_COMPARATOR comparator, int64_t value, uint32_t** ids, size_t& ids_len); + void approx_search_count(NUM_COMPARATOR comparator, int64_t value, uint32_t& ids_len); + void remove(uint64_t value, uint32_t id); size_t size(); + + void contains(const NUM_COMPARATOR& comparator, const int64_t& value, + const uint32_t& context_ids_length, + const uint32_t*& context_ids, + size_t& result_ids_len, + uint32_t*& result_ids) const; }; \ No newline at end of file diff --git a/include/posting.h b/include/posting.h index 29ab8cc4..6b9e6882 100644 --- a/include/posting.h +++ b/include/posting.h @@ -91,7 +91,9 @@ public: static void merge(const std::vector& posting_lists, std::vector& result_ids); - static void intersect(const std::vector& posting_lists, std::vector& result_ids); + static void intersect(const std::vector& posting_lists, std::vector& result_ids, + const uint32_t& context_ids_length = 0, + const uint32_t* context_ids = nullptr); static void get_array_token_positions( uint32_t id, diff --git a/include/topster.h b/include/topster.h index 25022423..e59ae74c 100644 --- a/include/topster.h +++ b/include/topster.h @@ -14,14 +14,15 @@ struct KV { uint64_t key{}; uint64_t distinct_key{}; int64_t scores[3]{}; // match score + 2 custom attributes - reference_filter_result_t* reference_filter_result; + reference_filter_result_t* reference_filter_result = nullptr; // to be used only in final aggregation uint64_t* query_indices = nullptr; - KV(uint16_t queryIndex, uint64_t key, uint64_t distinct_key, uint8_t match_score_index, const int64_t *scores): + KV(uint16_t queryIndex, uint64_t key, uint64_t distinct_key, uint8_t match_score_index, const int64_t *scores, + reference_filter_result_t* reference_filter_result = nullptr): match_score_index(match_score_index), query_index(queryIndex), array_index(0), key(key), - distinct_key(distinct_key) { + distinct_key(distinct_key), reference_filter_result(reference_filter_result) { this->scores[0] = scores[0]; this->scores[1] = scores[1]; this->scores[2] = scores[2]; diff --git a/src/collection.cpp b/src/collection.cpp index 95190ac7..3766a94d 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -2519,8 +2519,6 @@ Option Collection::get_filter_ids(const std::string& filter_query, filter_ } Option Collection::get_reference_field(const std::string & collection_name) const { - std::shared_lock lock(mutex); - std::string reference_field_name; for (auto const& pair: reference_fields) { auto reference_pair = pair.second; @@ -2541,13 +2539,13 @@ Option Collection::get_reference_field(const std::string & collecti Option Collection::get_reference_filter_ids(const std::string & filter_query, filter_result_t& filter_result, const std::string & collection_name) const { + std::shared_lock lock(mutex); + auto reference_field_op = get_reference_field(collection_name); if (!reference_field_op.ok()) { return Option(reference_field_op.code(), reference_field_op.error()); } - std::shared_lock lock(mutex); - const std::string doc_id_prefix = std::to_string(collection_id) + "_" + DOC_ID_PREFIX + "_"; filter_node_t* filter_tree_root = nullptr; Option parse_op = filter::parse_filter_query(filter_query, search_schema, @@ -2558,7 +2556,7 @@ Option Collection::get_reference_filter_ids(const std::string & filter_que // Reference helper field has the sequence id of other collection's documents. auto field_name = reference_field_op.get() + REFERENCE_HELPER_FIELD_SUFFIX; - auto filter_op = index->do_reference_filtering_with_lock(filter_tree_root, filter_result, field_name); + auto filter_op = index->do_reference_filtering_with_lock(filter_tree_root, filter_result, name, field_name); if (!filter_op.ok()) { return filter_op; } @@ -2583,22 +2581,6 @@ Option Collection::validate_reference_filter(const std::string& filter_que return Option(true); } -Option Collection::validate_reference_filter(const std::string& filter_query) const { - std::shared_lock lock(mutex); - - const std::string doc_id_prefix = std::to_string(collection_id) + "_" + DOC_ID_PREFIX + "_"; - filter_node_t* filter_tree_root = nullptr; - Option filter_op = filter::parse_filter_query(filter_query, search_schema, - store, doc_id_prefix, filter_tree_root); - - if(!filter_op.ok()) { - return filter_op; - } - - delete filter_tree_root; - return Option(true); -} - bool Collection::facet_value_to_string(const facet &a_facet, const facet_count_t &facet_count, const nlohmann::json &document, std::string &value) const { diff --git a/src/field.cpp b/src/field.cpp index 729ae55f..129c7512 100644 --- a/src/field.cpp +++ b/src/field.cpp @@ -418,38 +418,6 @@ Option toParseTree(std::queue& postfix, filter_node_t*& root, } else { filter filter_exp; - // Expected value: $Collection(...) - bool is_referenced_filter = (expression[0] == '$' && expression[expression.size() - 1] == ')'); - if (is_referenced_filter) { - size_t parenthesis_index = expression.find('('); - - std::string collection_name = expression.substr(1, parenthesis_index - 1); - auto& cm = CollectionManager::get_instance(); - auto collection = cm.get_collection(collection_name); - if (collection == nullptr) { - return Option(400, "Referenced collection `" + collection_name + "` not found."); - } - - filter_exp = {expression.substr(parenthesis_index + 1, expression.size() - parenthesis_index - 2)}; - filter_exp.referenced_collection_name = collection_name; - - auto op = collection->validate_reference_filter(filter_exp.field_name); - if (!op.ok()) { - return Option(400, "Failed to parse reference filter on `" + collection_name + - "` collection: " + op.error()); - } - } else { - Option toFilter_op = toFilter(expression, filter_exp, search_schema, store, doc_id_prefix); - if (!toFilter_op.ok()) { - while(!nodeStack.empty()) { - auto filterNode = nodeStack.top(); - delete filterNode; - nodeStack.pop(); - } - return toFilter_op; - } - } - // Expected value: $Collection(...) bool is_referenced_filter = (expression[0] == '$' && expression[expression.size() - 1] == ')'); if (is_referenced_filter) { diff --git a/src/index.cpp b/src/index.cpp index 0379ae43..0891968f 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -1451,11 +1451,18 @@ void Index::search_candidates(const uint8_t & field_id, bool field_is_array, void Index::numeric_not_equals_filter(num_tree_t* const num_tree, const int64_t value, - uint32_t*& ids, - size_t& ids_len) const { + const uint32_t& context_ids_length, + const uint32_t* context_ids, + size_t& ids_len, + uint32_t*& ids) const { uint32_t* to_exclude_ids = nullptr; size_t to_exclude_ids_len = 0; - num_tree->search(EQUALS, value, &to_exclude_ids, to_exclude_ids_len); + + if (context_ids_length != 0) { + num_tree->contains(EQUALS, value, context_ids_length, context_ids, to_exclude_ids_len, to_exclude_ids); + } else { + num_tree->search(EQUALS, value, &to_exclude_ids, to_exclude_ids_len); + } auto all_ids = seq_ids->uncompress(); auto all_ids_size = seq_ids->num_ids(); @@ -1470,17 +1477,25 @@ void Index::numeric_not_equals_filter(num_tree_t* const num_tree, delete[] to_exclude_ids; uint32_t* out = nullptr; - ids_len = ArrayUtils::or_scalar(ids, ids_len, - to_include_ids, to_include_ids_len, &out); + ids_len = ArrayUtils::or_scalar(ids, ids_len, to_include_ids, to_include_ids_len, &out); + delete[] ids; delete[] to_include_ids; ids = out; } +bool Index::field_is_indexed(const std::string& field_name) const { + return search_index.count(field_name) != 0 || + numerical_index.count(field_name) != 0 || + geopoint_index.count(field_name) != 0; +} + Option Index::do_filtering(filter_node_t* const root, filter_result_t& result, - const std::string& collection_name) const { + const std::string& collection_name, + const uint32_t& context_ids_length, + const uint32_t* context_ids) const { // auto begin = std::chrono::high_resolution_clock::now(); const filter a_filter = root->filter_exp; @@ -1492,13 +1507,46 @@ Option Index::do_filtering(filter_node_t* const root, if (collection == nullptr) { return Option(400, "Referenced collection `" + a_filter.referenced_collection_name + "` not found."); } + + filter_result_t reference_filter_result; auto reference_filter_op = collection->get_reference_filter_ids(a_filter.field_name, - result, + reference_filter_result, collection_name); if (!reference_filter_op.ok()) { return reference_filter_op; } + if (context_ids_length != 0) { + std::vector include_indexes; + include_indexes.reserve(std::min(context_ids_length, reference_filter_result.count)); + + size_t context_index = 0, reference_result_index = 0; + while (context_index < context_ids_length && reference_result_index < reference_filter_result.count) { + if (context_ids[context_index] == reference_filter_result.docs[reference_result_index]) { + include_indexes.push_back(reference_result_index); + context_index++; + reference_result_index++; + } else if (context_ids[context_index] < reference_filter_result.docs[reference_result_index]) { + context_index++; + } else { + reference_result_index++; + } + } + + result.count = include_indexes.size(); + result.docs = new uint32_t[include_indexes.size()]; + auto& result_references = result.reference_filter_results[a_filter.referenced_collection_name]; + result_references = new reference_filter_result_t[include_indexes.size()]; + + for (uint32_t i = 0; i < include_indexes.size(); i++) { + result.docs[i] = reference_filter_result.docs[include_indexes[i]]; + result_references[i] = reference_filter_result.reference_filter_results[a_filter.referenced_collection_name][include_indexes[i]]; + } + + return Option(true); + } + + result = reference_filter_result; return Option(true); } @@ -1511,18 +1559,26 @@ Option Index::do_filtering(filter_node_t* const root, std::sort(result_ids.begin(), result_ids.end()); - result.docs = new uint32[result_ids.size()]; - std::copy(result_ids.begin(), result_ids.end(), result.docs); - result.count = result_ids.size(); + auto result_array = new uint32[result_ids.size()]; + std::copy(result_ids.begin(), result_ids.end(), result_array); + if (context_ids_length != 0) { + uint32_t* out = nullptr; + result.count = ArrayUtils::and_scalar(context_ids, context_ids_length, + result_array, result_ids.size(), &out); + + delete[] result_array; + + result.docs = out; + return Option(true); + } + + result.docs = result_array; + result.count = result_ids.size(); return Option(true); } - bool has_search_index = search_index.count(a_filter.field_name) != 0 || - numerical_index.count(a_filter.field_name) != 0 || - geopoint_index.count(a_filter.field_name) != 0; - - if (!has_search_index) { + if (!field_is_indexed(a_filter.field_name)) { return Option(true); } @@ -1540,13 +1596,25 @@ Option Index::do_filtering(filter_node_t* const root, if(a_filter.comparators[fi] == RANGE_INCLUSIVE && fi+1 < a_filter.values.size()) { const std::string& next_filter_value = a_filter.values[fi + 1]; - int64_t range_end_value = (int64_t)std::stol(next_filter_value); - num_tree->range_inclusive_search(value, range_end_value, &result_ids, result_ids_len); + auto const range_end_value = (int64_t)std::stol(next_filter_value); + + if (context_ids_length != 0) { + num_tree->range_inclusive_contains(value, range_end_value, context_ids_length, context_ids, + result_ids_len, result_ids); + } else { + num_tree->range_inclusive_search(value, range_end_value, &result_ids, result_ids_len); + } + fi++; } else if (a_filter.comparators[fi] == NOT_EQUALS) { - numeric_not_equals_filter(num_tree, value, result_ids, result_ids_len); + numeric_not_equals_filter(num_tree, value, context_ids_length, context_ids, result_ids_len, result_ids); } else { - num_tree->search(a_filter.comparators[fi], value, &result_ids, result_ids_len); + if (context_ids_length != 0) { + num_tree->contains(a_filter.comparators[fi], value, + context_ids_length, context_ids, result_ids_len, result_ids); + } else { + num_tree->search(a_filter.comparators[fi], value, &result_ids, result_ids_len); + } } } } else if (f.is_float()) { @@ -1560,12 +1628,25 @@ Option Index::do_filtering(filter_node_t* const root, if(a_filter.comparators[fi] == RANGE_INCLUSIVE && fi+1 < a_filter.values.size()) { const std::string& next_filter_value = a_filter.values[fi+1]; int64_t range_end_value = float_to_int64_t((float) std::atof(next_filter_value.c_str())); - num_tree->range_inclusive_search(float_int64, range_end_value, &result_ids, result_ids_len); + + if (context_ids_length != 0) { + num_tree->range_inclusive_contains(float_int64, range_end_value, context_ids_length, context_ids, + result_ids_len, result_ids); + } else { + num_tree->range_inclusive_search(float_int64, range_end_value, &result_ids, result_ids_len); + } + fi++; } else if (a_filter.comparators[fi] == NOT_EQUALS) { - numeric_not_equals_filter(num_tree, value, result_ids, result_ids_len); + numeric_not_equals_filter(num_tree, float_int64, + context_ids_length, context_ids, result_ids_len, result_ids); } else { - num_tree->search(a_filter.comparators[fi], float_int64, &result_ids, result_ids_len); + if (context_ids_length != 0) { + num_tree->contains(a_filter.comparators[fi], float_int64, + context_ids_length, context_ids, result_ids_len, result_ids); + } else { + num_tree->search(a_filter.comparators[fi], float_int64, &result_ids, result_ids_len); + } } } } else if (f.is_bool()) { @@ -1575,9 +1656,15 @@ Option Index::do_filtering(filter_node_t* const root, for (const std::string& filter_value : a_filter.values) { int64_t bool_int64 = (filter_value == "1") ? 1 : 0; if (a_filter.comparators[value_index] == NOT_EQUALS) { - numeric_not_equals_filter(num_tree, bool_int64, result_ids, result_ids_len); + numeric_not_equals_filter(num_tree, bool_int64, + context_ids_length, context_ids, result_ids_len, result_ids); } else { - num_tree->search(a_filter.comparators[value_index], bool_int64, &result_ids, result_ids_len); + if (context_ids_length != 0) { + num_tree->contains(a_filter.comparators[value_index], bool_int64, + context_ids_length, context_ids, result_ids_len, result_ids); + } else { + num_tree->search(a_filter.comparators[value_index], bool_int64, &result_ids, result_ids_len); + } } value_index++; @@ -1652,6 +1739,14 @@ Option Index::do_filtering(filter_node_t* const root, // `geo_result_ids` will contain all IDs that are within approximately within query radius // we still need to do another round of exact filtering on them + if (context_ids_length != 0) { + uint32_t *out = nullptr; + uint32_t count = ArrayUtils::and_scalar(context_ids, context_ids_length, + &geo_result_ids[0], geo_result_ids.size(), &out); + + geo_result_ids = std::vector(out, out + count); + } + std::vector exact_geo_result_ids; if (f.is_single_geopoint()) { @@ -1739,7 +1834,7 @@ Option Index::do_filtering(filter_node_t* const root, if(a_filter.comparators[0] == EQUALS || a_filter.comparators[0] == NOT_EQUALS) { // needs intersection + exact matching (unlike CONTAINS) std::vector result_id_vec; - posting_t::intersect(posting_lists, result_id_vec); + posting_t::intersect(posting_lists, result_id_vec, context_ids_length, context_ids); if (result_id_vec.empty()) { continue; @@ -1763,7 +1858,7 @@ Option Index::do_filtering(filter_node_t* const root, } else { // CONTAINS size_t before_size = f_id_buff.size(); - posting_t::intersect(posting_lists, f_id_buff); + posting_t::intersect(posting_lists, f_id_buff, context_ids_length, context_ids); if (f_id_buff.size() == before_size) { continue; } @@ -1811,6 +1906,17 @@ Option Index::do_filtering(filter_node_t* const root, result_ids = to_include_ids; result_ids_len = to_include_ids_len; + + if (context_ids_length != 0) { + uint32_t *out = nullptr; + result.count = ArrayUtils::and_scalar(context_ids, context_ids_length, + result_ids, result_ids_len, &out); + + delete[] result_ids; + + result.docs = out; + return Option(true); + } } result.docs = result_ids; @@ -1824,6 +1930,28 @@ Option Index::do_filtering(filter_node_t* const root, LOG(INFO) << "Time taken for filtering: " << timeMillis << "ms";*/ } +void Index::aproximate_numerical_match(num_tree_t* const num_tree, + const NUM_COMPARATOR& comparator, + const int64_t& value, + const int64_t& range_end_value, + uint32_t& filter_ids_length) const { + if (comparator == RANGE_INCLUSIVE) { + num_tree->approx_range_inclusive_search_count(value, range_end_value, filter_ids_length); + return; + } + + if (comparator == NOT_EQUALS) { + uint32_t to_exclude_ids_len = 0; + num_tree->approx_search_count(EQUALS, value, to_exclude_ids_len); + + auto all_ids_size = seq_ids->num_ids(); + filter_ids_length += (all_ids_size - to_exclude_ids_len); + return; + } + + num_tree->approx_search_count(comparator, value, filter_ids_length); +} + Option Index::rearrange_filter_tree(filter_node_t* const root, uint32_t& filter_ids_length, const std::string& collection_name) const { @@ -1861,13 +1989,94 @@ Option Index::rearrange_filter_tree(filter_node_t* const root, return Option(true); } - filter_result_t result; - auto filter_op = do_filtering(root, result, collection_name); - if (!filter_op.ok()) { - return filter_op; + auto a_filter = root->filter_exp; + + if (a_filter.field_name == "id") { + filter_ids_length = a_filter.values.size(); + return Option(true); + } + + if (!field_is_indexed(a_filter.field_name)) { + return Option(true); + } + + field f = search_schema.at(a_filter.field_name); + + if (f.is_integer()) { + auto num_tree = numerical_index.at(f.name); + + for (size_t fi = 0; fi < a_filter.values.size(); fi++) { + const std::string& filter_value = a_filter.values[fi]; + auto const value = (int64_t)std::stol(filter_value); + + if (a_filter.comparators[fi] == RANGE_INCLUSIVE && fi+1 < a_filter.values.size()) { + const std::string& next_filter_value = a_filter.values[fi + 1]; + auto const range_end_value = (int64_t)std::stol(next_filter_value); + + aproximate_numerical_match(num_tree, a_filter.comparators[fi], value, range_end_value, + filter_ids_length); + fi++; + } else { + aproximate_numerical_match(num_tree, a_filter.comparators[fi], value, 0, filter_ids_length); + } + } + } else if (f.is_float()) { + auto num_tree = numerical_index.at(a_filter.field_name); + + for (size_t fi = 0; fi < a_filter.values.size(); fi++) { + const std::string& filter_value = a_filter.values[fi]; + float value = (float)std::atof(filter_value.c_str()); + int64_t float_int64 = float_to_int64_t(value); + + if (a_filter.comparators[fi] == RANGE_INCLUSIVE && fi+1 < a_filter.values.size()) { + const std::string& next_filter_value = a_filter.values[fi + 1]; + auto const range_end_value = float_to_int64_t((float) std::atof(next_filter_value.c_str())); + + aproximate_numerical_match(num_tree, a_filter.comparators[fi], float_int64, range_end_value, + filter_ids_length); + fi++; + } else { + aproximate_numerical_match(num_tree, a_filter.comparators[fi], float_int64, 0, filter_ids_length); + } + } + } else if (f.is_bool()) { + auto num_tree = numerical_index.at(a_filter.field_name); + + size_t value_index = 0; + for (const std::string& filter_value : a_filter.values) { + int64_t bool_int64 = (filter_value == "1") ? 1 : 0; + + aproximate_numerical_match(num_tree, a_filter.comparators[value_index], bool_int64, 0, filter_ids_length); + value_index++; + } + } else if (f.is_geopoint()) { + filter_ids_length = 100; + } else if (f.is_string()) { + art_tree* t = search_index.at(a_filter.field_name); + + for (const std::string& filter_value : a_filter.values) { + Tokenizer tokenizer(filter_value, true, false, f.locale, symbols_to_index, token_separators); + + std::string str_token; + size_t token_index = 0; + + while (tokenizer.next(str_token, token_index)) { + auto const leaf = (art_leaf *) art_search(t, (const unsigned char*) str_token.c_str(), + str_token.length()+1); + if (leaf == nullptr) { + continue; + } + + filter_ids_length += posting_t::num_ids(leaf->values); + } + } + } + + if (a_filter.apply_not_equals) { + auto all_ids_size = seq_ids->num_ids(); + filter_ids_length = (all_ids_size - filter_ids_length); } - filter_ids_length = result.count; return Option(true); } @@ -1884,19 +2093,23 @@ Option Index::rearranging_recursive_filter(filter_node_t* const filter_tre } void copy_reference_ids(filter_result_t& from, filter_result_t& to) { - if (to.count > 0 && from.reference_filter_result != nullptr && from.reference_filter_result->count > 0) { - to.reference_filter_result = new reference_filter_result_t[to.count]; + if (to.count > 0 && !from.reference_filter_results.empty()) { + for (const auto &item: from.reference_filter_results) { + auto& from_reference_result = from.reference_filter_results[item.first]; + auto& to_reference_result = to.reference_filter_results[item.first]; + to_reference_result = new reference_filter_result_t[to.count]; - size_t to_index = 0, from_index = 0; - while (to_index < to.count && from_index < from.count) { - if (to.docs[to_index] == from.docs[from_index]) { - to.reference_filter_result[to_index] = from.reference_filter_result[from_index]; - to_index++; - from_index++; - } else if (to.docs[to_index] < from.docs[from_index]) { - to_index++; - } else { - from_index++; + size_t to_index = 0, from_index = 0; + while (to_index < to.count && from_index < from.count) { + if (to.docs[to_index] == from.docs[from_index]) { + to_reference_result[to_index] = from_reference_result[from_index]; + to_index++; + from_index++; + } else if (to.docs[to_index] < from.docs[from_index]) { + to_index++; + } else { + from_index++; + } } } } @@ -1938,8 +2151,8 @@ Option Index::recursive_filter(filter_node_t* const root, } result.docs = filtered_results; - if (l_result.reference_filter_result != nullptr || r_result.reference_filter_result != nullptr) { - copy_reference_ids(l_result.reference_filter_result != nullptr ? l_result : r_result, result); + if (!l_result.reference_filter_results.empty() || !r_result.reference_filter_results.empty()) { + copy_reference_ids(!l_result.reference_filter_results.empty() ? l_result : r_result, result); } return Option(true); @@ -1982,7 +2195,8 @@ Option Index::do_filtering_with_lock(filter_node_t* const filter_tree_root Option Index::do_reference_filtering_with_lock(filter_node_t* const filter_tree_root, filter_result_t& filter_result, - const std::string & reference_helper_field_name) const { + const std::string& collection_name, + const std::string& reference_helper_field_name) const { std::shared_lock lock(mutex); filter_result_t reference_filter_result; @@ -2002,15 +2216,17 @@ Option Index::do_reference_filtering_with_lock(filter_node_t* const filter filter_result.count = reference_map.size(); filter_result.docs = new uint32_t[reference_map.size()]; - filter_result.reference_filter_result = new reference_filter_result_t[reference_map.size()]; + filter_result.reference_filter_results[collection_name] = new reference_filter_result_t[reference_map.size()]; size_t doc_index = 0; for (auto &item: reference_map) { filter_result.docs[doc_index] = item.first; - filter_result.reference_filter_result[doc_index].count = item.second.size(); - filter_result.reference_filter_result[doc_index].docs = new uint32_t[item.second.size()]; - std::copy(item.second.begin(), item.second.end(), filter_result.reference_filter_result[doc_index].docs); + auto& reference_result = filter_result.reference_filter_results[collection_name][doc_index]; + reference_result.count = item.second.size(); + reference_result.docs = new uint32_t[item.second.size()]; + std::copy(item.second.begin(), item.second.end(), reference_result.docs); + doc_index++; } @@ -2080,7 +2296,7 @@ void Index::collate_included_ids(const std::vector& q_included_tokens, scores[1] = int64_t(1); scores[2] = int64_t(1); - KV kv(searched_queries.size(), seq_id, distinct_id, 0, scores); + KV kv(searched_queries.size(), seq_id, distinct_id, 0, scores, nullptr); curated_topster->add(&kv); } } @@ -2582,7 +2798,8 @@ Option Index::search(std::vector& field_query_tokens, cons int64_t match_score_index = -1; result_ids.push_back(seq_id); - KV kv(searched_queries.size(), seq_id, distinct_id, match_score_index, scores); + + KV kv(searched_queries.size(), seq_id, distinct_id, match_score_index, scores, nullptr); int ret = topster->add(&kv); if(group_limit != 0 && ret < 2) { @@ -2681,7 +2898,7 @@ Option Index::search(std::vector& field_query_tokens, cons //LOG(INFO) << "SEQ_ID: " << seq_id << ", score: " << dist_label.first; - KV kv(searched_queries.size(), seq_id, distinct_id, match_score_index, scores); + KV kv(searched_queries.size(), seq_id, distinct_id, match_score_index, scores, nullptr); int ret = topster->add(&kv); if(group_limit != 0 && ret < 2) { diff --git a/src/num_tree.cpp b/src/num_tree.cpp index c8ce253c..5a1b95d3 100644 --- a/src/num_tree.cpp +++ b/src/num_tree.cpp @@ -43,6 +43,61 @@ void num_tree_t::range_inclusive_search(int64_t start, int64_t end, uint32_t** i *ids = out; } +void num_tree_t::approx_range_inclusive_search_count(int64_t start, int64_t end, uint32_t& ids_len) { + if (int64map.empty()) { + return; + } + + auto it_start = int64map.lower_bound(start); // iter values will be >= start + + while (it_start != int64map.end() && it_start->first <= end) { + uint32_t val_ids = ids_t::num_ids(it_start->second); + ids_len += val_ids; + it_start++; + } +} + +bool num_tree_t::range_inclusive_contains(const int64_t& start, const int64_t& end, const uint32_t& id) const { + if (int64map.empty()) { + return false; + } + + auto it_start = int64map.lower_bound(start); // iter values will be >= start + + while (it_start != int64map.end() && it_start->first <= end) { + if (ids_t::contains(it_start->second, id)) { + return true; + } + } + + return false; +} + +void num_tree_t::range_inclusive_contains(const int64_t& start, const int64_t& end, + const uint32_t& context_ids_length, + const uint32_t*& context_ids, + size_t& result_ids_len, + uint32_t*& result_ids) const { + if (int64map.empty()) { + return; + } + + std::vector consolidated_ids; + consolidated_ids.reserve(context_ids_length); + for (uint32_t i = 0; i < context_ids_length; i++) { + if (range_inclusive_contains(start, end, context_ids[i])) { + consolidated_ids.push_back(context_ids[i]); + } + } + + uint32_t *out = nullptr; + result_ids_len = ArrayUtils::or_scalar(&consolidated_ids[0], consolidated_ids.size(), + result_ids, result_ids_len, &out); + + delete [] result_ids; + result_ids = out; +} + size_t num_tree_t::get(int64_t value, std::vector& geo_result_ids) { const auto& it = int64map.find(value); if(it == int64map.end()) { @@ -132,6 +187,54 @@ void num_tree_t::search(NUM_COMPARATOR comparator, int64_t value, uint32_t** ids } } +void num_tree_t::approx_search_count(NUM_COMPARATOR comparator, int64_t value, uint32_t& ids_len) { + if (int64map.empty()) { + return; + } + + if (comparator == EQUALS) { + const auto& it = int64map.find(value); + if (it != int64map.end()) { + uint32_t val_ids = ids_t::num_ids(it->second); + ids_len += val_ids; + } + } else if (comparator == GREATER_THAN || comparator == GREATER_THAN_EQUALS) { + // iter entries will be >= value, or end() if all entries are before value + auto iter_ge_value = int64map.lower_bound(value); + + if (iter_ge_value == int64map.end()) { + return; + } + + if (comparator == GREATER_THAN && iter_ge_value->first == value) { + iter_ge_value++; + } + + while (iter_ge_value != int64map.end()) { + uint32_t val_ids = ids_t::num_ids(iter_ge_value->second); + ids_len += val_ids; + iter_ge_value++; + } + } else if (comparator == LESS_THAN || comparator == LESS_THAN_EQUALS) { + // iter entries will be >= value, or end() if all entries are before value + auto iter_ge_value = int64map.lower_bound(value); + + auto it = int64map.begin(); + + while (it != iter_ge_value) { + uint32_t val_ids = ids_t::num_ids(it->second); + ids_len += val_ids; + it++; + } + + // for LESS_THAN_EQUALS, check if last iter entry is equal to value + if (it != int64map.end() && comparator == LESS_THAN_EQUALS && it->first == value) { + uint32_t val_ids = ids_t::num_ids(it->second); + ids_len += val_ids; + } + } +} + void num_tree_t::remove(uint64_t value, uint32_t id) { if(int64map.count(value) != 0) { void* arr = int64map[value]; @@ -146,6 +249,75 @@ void num_tree_t::remove(uint64_t value, uint32_t id) { } } +void num_tree_t::contains(const NUM_COMPARATOR& comparator, const int64_t& value, + const uint32_t& context_ids_length, + const uint32_t*& context_ids, + size_t& result_ids_len, + uint32_t*& result_ids) const { + if (int64map.empty()) { + return; + } + + std::vector consolidated_ids; + consolidated_ids.reserve(context_ids_length); + for (uint32_t i = 0; i < context_ids_length; i++) { + if (comparator == EQUALS) { + if (contains(value, context_ids[i])) { + consolidated_ids.push_back(context_ids[i]); + } + } else if (comparator == GREATER_THAN || comparator == GREATER_THAN_EQUALS) { + // iter entries will be >= value, or end() if all entries are before value + auto iter_ge_value = int64map.lower_bound(value); + + if (iter_ge_value == int64map.end()) { + continue; + } + + if (comparator == GREATER_THAN && iter_ge_value->first == value) { + iter_ge_value++; + } + + while (iter_ge_value != int64map.end()) { + if (contains(iter_ge_value->first, context_ids[i])) { + consolidated_ids.push_back(context_ids[i]); + break; + } + iter_ge_value++; + } + } else if(comparator == LESS_THAN || comparator == LESS_THAN_EQUALS) { + // iter entries will be >= value, or end() if all entries are before value + auto iter_ge_value = int64map.lower_bound(value); + auto it = int64map.begin(); + + while (it != iter_ge_value) { + if (contains(it->first, context_ids[i])) { + consolidated_ids.push_back(context_ids[i]); + break; + } + it++; + } + + // for LESS_THAN_EQUALS, check if last iter entry is equal to value + if (it != int64map.end() && comparator == LESS_THAN_EQUALS && it->first == value) { + if (contains(it->first, context_ids[i])) { + consolidated_ids.push_back(context_ids[i]); + break; + } + } + } + } + + gfx::timsort(consolidated_ids.begin(), consolidated_ids.end()); + consolidated_ids.erase(unique(consolidated_ids.begin(), consolidated_ids.end()), consolidated_ids.end()); + + uint32_t *out = nullptr; + result_ids_len = ArrayUtils::or_scalar(&consolidated_ids[0], consolidated_ids.size(), + result_ids, result_ids_len, &out); + + delete[] result_ids; + result_ids = out; +} + size_t num_tree_t::size() { return int64map.size(); } diff --git a/src/posting.cpp b/src/posting.cpp index 8b72f078..05b5b061 100644 --- a/src/posting.cpp +++ b/src/posting.cpp @@ -386,7 +386,32 @@ void posting_t::merge(const std::vector& raw_posting_lists, std::vector& raw_posting_lists, std::vector& result_ids) { +void posting_t::intersect(const std::vector& raw_posting_lists, std::vector& result_ids, + const uint32_t& context_ids_length, + const uint32_t* context_ids) { + if (context_ids_length != 0) { + if (raw_posting_lists.empty()) { + return; + } + + for (uint32_t i = 0; i < context_ids_length; i++) { + bool is_present = true; + + for (auto const& raw_posting_list: raw_posting_lists) { + if (!contains(raw_posting_list, context_ids[i])) { + is_present = false; + break; + } + } + + if (is_present) { + result_ids.push_back(context_ids[i]); + } + } + + return; + } + // we will have to convert the compact posting list (if any) to full form std::vector plists; std::vector expanded_plists; diff --git a/test/collection_join_test.cpp b/test/collection_join_test.cpp index c8ee0cfd..f302d3dc 100644 --- a/test/collection_join_test.cpp +++ b/test/collection_join_test.cpp @@ -651,11 +651,11 @@ TEST_F(CollectionJoinTest, IncludeFieldsByReference_SingleMatch) { ASSERT_FALSE(search_op.ok()); ASSERT_EQ("Invalid reference in include_fields, expected `$CollectionName(fieldA, ...)`.", search_op.error()); - req_params["include_fields"] = "$foo(bar)"; - search_op = collectionManager.do_search(req_params, embedded_params, json_res, now_ts); - ASSERT_FALSE(search_op.ok()); - ASSERT_EQ("Referenced collection `foo` not found.", search_op.error()); - +// req_params["include_fields"] = "$foo(bar)"; +// search_op = collectionManager.do_search(req_params, embedded_params, json_res, now_ts); +// ASSERT_FALSE(search_op.ok()); +// ASSERT_EQ("Referenced collection `foo` not found.", search_op.error()); +// // req_params["include_fields"] = "$Customers(bar)"; // search_op = collectionManager.do_search(req_params, embedded_params, json_res, now_ts); // ASSERT_TRUE(search_op.ok()); From 665eee9b031514f26b814a5c472ab7ad414923f4 Mon Sep 17 00:00:00 2001 From: Harpreet Sangar Date: Fri, 3 Mar 2023 11:46:40 +0530 Subject: [PATCH 07/51] Fix invalid read. --- include/field.h | 14 ++++++++++++++ src/index.cpp | 2 +- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/include/field.h b/include/field.h index 776481d2..18a1d4b7 100644 --- a/include/field.h +++ b/include/field.h @@ -648,6 +648,20 @@ struct filter_result_t { filter_result_t(uint32_t count, uint32_t* docs) : count(count), docs(docs) {} + filter_result_t& operator=(filter_result_t&& obj) noexcept { + if (&obj == this) + return *this; + + count = obj.count; + docs = obj.docs; + reference_filter_results = std::map(obj.reference_filter_results); + + obj.docs = nullptr; + obj.reference_filter_results.clear(); + + return *this; + } + ~filter_result_t() { delete[] docs; for (const auto &item: reference_filter_results) { diff --git a/src/index.cpp b/src/index.cpp index 0891968f..f9ba9f35 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -1546,7 +1546,7 @@ Option Index::do_filtering(filter_node_t* const root, return Option(true); } - result = reference_filter_result; + result = std::move(reference_filter_result); return Option(true); } From 61bdcd91fd2ba43c4393a7bf898456eb790135fd Mon Sep 17 00:00:00 2001 From: Harpreet Sangar Date: Tue, 17 Jan 2023 14:08:39 +0530 Subject: [PATCH 08/51] Abstract `foo_sequence_id` field from user. --- include/field.h | 7 +++++++ src/collection_manager.cpp | 4 ++++ test/collection_manager_test.cpp | 17 ++--------------- 3 files changed, 13 insertions(+), 15 deletions(-) diff --git a/include/field.h b/include/field.h index 18a1d4b7..44de4637 100644 --- a/include/field.h +++ b/include/field.h @@ -11,6 +11,7 @@ #include #include "json.hpp" #include "text_embedder_manager.h" +#include namespace field_types { // first field value indexed will determine the type @@ -284,11 +285,17 @@ struct field { const std::string & default_sorting_field, nlohmann::json& fields_json) { bool found_default_sorting_field = false; + const std::regex sequence_id_pattern(".*_sequence_id$"); // Check for duplicates in field names std::map> unique_fields; for(const field & field: fields) { + if (std::regex_match(field.name, sequence_id_pattern)) { + // Don't add foo_sequence_id field. + continue; + } + unique_fields[field.name].push_back(&field); if(field.name == "id") { diff --git a/src/collection_manager.cpp b/src/collection_manager.cpp index 0475dfbe..96a187b9 100644 --- a/src/collection_manager.cpp +++ b/src/collection_manager.cpp @@ -89,6 +89,10 @@ Collection* CollectionManager::init_collection(const nlohmann::json & collection } fields.push_back(f); + + if (!f.reference.empty()) { + fields.emplace_back(field(f.name + "_sequence_id", "string", false, f.optional, true)); + } } std::string default_sorting_field = collection_meta[Collection::COLLECTION_DEFAULT_SORTING_FIELD_KEY].get(); diff --git a/test/collection_manager_test.cpp b/test/collection_manager_test.cpp index 38c7e014..cdbd482e 100644 --- a/test/collection_manager_test.cpp +++ b/test/collection_manager_test.cpp @@ -227,17 +227,6 @@ TEST_F(CollectionManagerTest, CollectionCreation) { "sort":false, "type":"string", "reference":"Products.product_id" - }, - { - "facet":false, - "index":true, - "infix":false, - "locale":"", - "name":"product_id_sequence_id", - "nested":false, - "optional":true, - "sort":true, - "type":"int64" } ], "id":0, @@ -475,11 +464,9 @@ TEST_F(CollectionManagerTest, RestoreRecordsOnRestart) { ASSERT_EQ(0, collection1->get_collection_id()); ASSERT_EQ(18, collection1->get_next_seq_id()); ASSERT_EQ(facet_fields_expected, collection1->get_facet_fields()); - // product_id_sequence_id is also included - ASSERT_EQ(3, collection1->get_sort_fields().size()); + ASSERT_EQ(2, collection1->get_sort_fields().size()); ASSERT_EQ("location", collection1->get_sort_fields()[0].name); - ASSERT_EQ("product_id_sequence_id", collection1->get_sort_fields()[1].name); - ASSERT_EQ("points", collection1->get_sort_fields()[2].name); + ASSERT_EQ("points", collection1->get_sort_fields()[1].name); ASSERT_EQ(schema.size(), collection1->get_schema().size()); ASSERT_EQ("points", collection1->get_default_sorting_field()); From 0fb5e0d2a27508c20ece66d33f6b8c35905d88d1 Mon Sep 17 00:00:00 2001 From: Harpreet Sangar Date: Thu, 19 Jan 2023 11:25:43 +0530 Subject: [PATCH 09/51] Serialize sequence id. --- test/collection_join_test.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/test/collection_join_test.cpp b/test/collection_join_test.cpp index f302d3dc..9db121aa 100644 --- a/test/collection_join_test.cpp +++ b/test/collection_join_test.cpp @@ -265,7 +265,6 @@ TEST_F(CollectionJoinTest, IndexDocumentHavingReferenceField) { } ASSERT_TRUE(add_op.ok()); } - collectionManager.drop_collection("Customers"); customers_schema_json = R"({ From e27cc6e34ca600f91db9a948312b8cf11ec51979 Mon Sep 17 00:00:00 2001 From: Harpreet Sangar Date: Thu, 19 Jan 2023 11:27:52 +0530 Subject: [PATCH 10/51] Store `foo_sequence_id` in collection's meta-data. --- include/field.h | 7 ------- src/collection_manager.cpp | 4 ---- test/collection_manager_test.cpp | 17 +++++++++++++++-- 3 files changed, 15 insertions(+), 13 deletions(-) diff --git a/include/field.h b/include/field.h index 44de4637..18a1d4b7 100644 --- a/include/field.h +++ b/include/field.h @@ -11,7 +11,6 @@ #include #include "json.hpp" #include "text_embedder_manager.h" -#include namespace field_types { // first field value indexed will determine the type @@ -285,17 +284,11 @@ struct field { const std::string & default_sorting_field, nlohmann::json& fields_json) { bool found_default_sorting_field = false; - const std::regex sequence_id_pattern(".*_sequence_id$"); // Check for duplicates in field names std::map> unique_fields; for(const field & field: fields) { - if (std::regex_match(field.name, sequence_id_pattern)) { - // Don't add foo_sequence_id field. - continue; - } - unique_fields[field.name].push_back(&field); if(field.name == "id") { diff --git a/src/collection_manager.cpp b/src/collection_manager.cpp index 96a187b9..0475dfbe 100644 --- a/src/collection_manager.cpp +++ b/src/collection_manager.cpp @@ -89,10 +89,6 @@ Collection* CollectionManager::init_collection(const nlohmann::json & collection } fields.push_back(f); - - if (!f.reference.empty()) { - fields.emplace_back(field(f.name + "_sequence_id", "string", false, f.optional, true)); - } } std::string default_sorting_field = collection_meta[Collection::COLLECTION_DEFAULT_SORTING_FIELD_KEY].get(); diff --git a/test/collection_manager_test.cpp b/test/collection_manager_test.cpp index cdbd482e..38c7e014 100644 --- a/test/collection_manager_test.cpp +++ b/test/collection_manager_test.cpp @@ -227,6 +227,17 @@ TEST_F(CollectionManagerTest, CollectionCreation) { "sort":false, "type":"string", "reference":"Products.product_id" + }, + { + "facet":false, + "index":true, + "infix":false, + "locale":"", + "name":"product_id_sequence_id", + "nested":false, + "optional":true, + "sort":true, + "type":"int64" } ], "id":0, @@ -464,9 +475,11 @@ TEST_F(CollectionManagerTest, RestoreRecordsOnRestart) { ASSERT_EQ(0, collection1->get_collection_id()); ASSERT_EQ(18, collection1->get_next_seq_id()); ASSERT_EQ(facet_fields_expected, collection1->get_facet_fields()); - ASSERT_EQ(2, collection1->get_sort_fields().size()); + // product_id_sequence_id is also included + ASSERT_EQ(3, collection1->get_sort_fields().size()); ASSERT_EQ("location", collection1->get_sort_fields()[0].name); - ASSERT_EQ("points", collection1->get_sort_fields()[1].name); + ASSERT_EQ("product_id_sequence_id", collection1->get_sort_fields()[1].name); + ASSERT_EQ("points", collection1->get_sort_fields()[2].name); ASSERT_EQ(schema.size(), collection1->get_schema().size()); ASSERT_EQ("points", collection1->get_default_sorting_field()); From 72f896dc2e03242146561e65e01bde7a9c7e40f6 Mon Sep 17 00:00:00 2001 From: Harpreet Sangar Date: Sun, 22 Jan 2023 12:02:29 +0530 Subject: [PATCH 11/51] Filter by reference. --- include/collection.h | 2 ++ src/collection.cpp | 16 ++++++++++++++++ test/collection_join_test.cpp | 3 +++ 3 files changed, 21 insertions(+) diff --git a/include/collection.h b/include/collection.h index 27bf7920..f3dbc66a 100644 --- a/include/collection.h +++ b/include/collection.h @@ -463,6 +463,8 @@ public: Option validate_reference_filter(const std::string& filter_query) const; + Option validate_reference_filter(const std::string& filter_query) const; + Option get(const std::string & id) const; Option remove(const std::string & id, bool remove_from_store = true); diff --git a/src/collection.cpp b/src/collection.cpp index 3766a94d..c4a1066a 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -2581,6 +2581,22 @@ Option Collection::validate_reference_filter(const std::string& filter_que return Option(true); } +Option Collection::validate_reference_filter(const std::string& filter_query) const { + std::shared_lock lock(mutex); + + const std::string doc_id_prefix = std::to_string(collection_id) + "_" + DOC_ID_PREFIX + "_"; + filter_node_t* filter_tree_root = nullptr; + Option filter_op = filter::parse_filter_query(filter_query, search_schema, + store, doc_id_prefix, filter_tree_root); + + if(!filter_op.ok()) { + return filter_op; + } + + delete filter_tree_root; + return Option(true); +} + bool Collection::facet_value_to_string(const facet &a_facet, const facet_count_t &facet_count, const nlohmann::json &document, std::string &value) const { diff --git a/test/collection_join_test.cpp b/test/collection_join_test.cpp index 9db121aa..0cedd216 100644 --- a/test/collection_join_test.cpp +++ b/test/collection_join_test.cpp @@ -284,6 +284,9 @@ TEST_F(CollectionJoinTest, IndexDocumentHavingReferenceField) { ASSERT_TRUE(add_doc_op.ok()); ASSERT_EQ(customer_collection->get("0").get().count("reference_id_sequence_id"), 1); + // Referenced document should be accessible from Customers collection. + auto sequence_id = collectionManager.get_collection("Products")->get_seq_id_collection_prefix() + "_" + + customer_collection->get("0").get()["product_id_sequence_id"].get(); nlohmann::json document; // Referenced document's sequence_id must be valid. auto get_op = collectionManager.get_collection("Products")->get_document_from_store( From 1d3e05f2cbf6f9e342a118db2c4bd250471f34ea Mon Sep 17 00:00:00 2001 From: Harpreet Sangar Date: Tue, 24 Jan 2023 10:57:29 +0530 Subject: [PATCH 12/51] Optimize reference filtering. --- include/collection.h | 4 +++- include/index.h | 4 ++++ src/collection.cpp | 16 ---------------- src/index.cpp | 2 +- test/collection_join_test.cpp | 3 --- 5 files changed, 8 insertions(+), 21 deletions(-) diff --git a/include/collection.h b/include/collection.h index f3dbc66a..14693473 100644 --- a/include/collection.h +++ b/include/collection.h @@ -463,7 +463,9 @@ public: Option validate_reference_filter(const std::string& filter_query) const; - Option validate_reference_filter(const std::string& filter_query) const; + Option get_reference_filter_ids(const std::string & filter_query, + const std::string & collection_name, + std::pair& reference_index_ids) const; Option get(const std::string & id) const; diff --git a/include/index.h b/include/index.h index 0ce10daf..29e1292c 100644 --- a/include/index.h +++ b/include/index.h @@ -702,6 +702,10 @@ public: const std::string& collection_name, const std::string& reference_helper_field_name) const; + void do_reference_filtering_with_lock(std::pair& reference_index_ids, + filter_node_t const* const& filter_tree_root, + const std::string& reference_field_name) const; + void refresh_schemas(const std::vector& new_fields, const std::vector& del_fields); // the following methods are not synchronized because their parent calls are synchronized or they are const/static diff --git a/src/collection.cpp b/src/collection.cpp index c4a1066a..3766a94d 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -2581,22 +2581,6 @@ Option Collection::validate_reference_filter(const std::string& filter_que return Option(true); } -Option Collection::validate_reference_filter(const std::string& filter_query) const { - std::shared_lock lock(mutex); - - const std::string doc_id_prefix = std::to_string(collection_id) + "_" + DOC_ID_PREFIX + "_"; - filter_node_t* filter_tree_root = nullptr; - Option filter_op = filter::parse_filter_query(filter_query, search_schema, - store, doc_id_prefix, filter_tree_root); - - if(!filter_op.ok()) { - return filter_op; - } - - delete filter_tree_root; - return Option(true); -} - bool Collection::facet_value_to_string(const facet &a_facet, const facet_count_t &facet_count, const nlohmann::json &document, std::string &value) const { diff --git a/src/index.cpp b/src/index.cpp index f9ba9f35..840cebfd 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -1497,7 +1497,7 @@ Option Index::do_filtering(filter_node_t* const root, const uint32_t& context_ids_length, const uint32_t* context_ids) const { // auto begin = std::chrono::high_resolution_clock::now(); - const filter a_filter = root->filter_exp; +/**/ const filter a_filter = root->filter_exp; bool is_referenced_filter = !a_filter.referenced_collection_name.empty(); if (is_referenced_filter) { diff --git a/test/collection_join_test.cpp b/test/collection_join_test.cpp index 0cedd216..9db121aa 100644 --- a/test/collection_join_test.cpp +++ b/test/collection_join_test.cpp @@ -284,9 +284,6 @@ TEST_F(CollectionJoinTest, IndexDocumentHavingReferenceField) { ASSERT_TRUE(add_doc_op.ok()); ASSERT_EQ(customer_collection->get("0").get().count("reference_id_sequence_id"), 1); - // Referenced document should be accessible from Customers collection. - auto sequence_id = collectionManager.get_collection("Products")->get_seq_id_collection_prefix() + "_" + - customer_collection->get("0").get()["product_id_sequence_id"].get(); nlohmann::json document; // Referenced document's sequence_id must be valid. auto get_op = collectionManager.get_collection("Products")->get_document_from_store( From 753aa298881715ecd980050e462b3c685106ce42 Mon Sep 17 00:00:00 2001 From: Harpreet Sangar Date: Fri, 27 Jan 2023 12:57:13 +0530 Subject: [PATCH 13/51] Add `Index::rearranging_recursive_filter`. --- include/index.h | 2 +- src/index.cpp | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/include/index.h b/include/index.h index 29e1292c..600d3c6f 100644 --- a/include/index.h +++ b/include/index.h @@ -703,7 +703,7 @@ public: const std::string& reference_helper_field_name) const; void do_reference_filtering_with_lock(std::pair& reference_index_ids, - filter_node_t const* const& filter_tree_root, + filter_node_t* filter_tree_root, const std::string& reference_field_name) const; void refresh_schemas(const std::vector& new_fields, const std::vector& del_fields); diff --git a/src/index.cpp b/src/index.cpp index 840cebfd..cd307846 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -1497,7 +1497,7 @@ Option Index::do_filtering(filter_node_t* const root, const uint32_t& context_ids_length, const uint32_t* context_ids) const { // auto begin = std::chrono::high_resolution_clock::now(); -/**/ const filter a_filter = root->filter_exp; + const filter a_filter = root->filter_exp; bool is_referenced_filter = !a_filter.referenced_collection_name.empty(); if (is_referenced_filter) { @@ -1958,7 +1958,6 @@ Option Index::rearrange_filter_tree(filter_node_t* const root, if (root == nullptr) { return Option(true); } - if (root->isOperator) { uint32_t l_filter_ids_length = 0; if (root->left != nullptr) { From c4ef71f274dfbd14d502a7b44d2568be1e64f2e5 Mon Sep 17 00:00:00 2001 From: Harpreet Sangar Date: Fri, 27 Jan 2023 19:58:06 +0530 Subject: [PATCH 14/51] Add `Index::adaptive_filter`. --- src/index.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/index.cpp b/src/index.cpp index cd307846..f9ba9f35 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -1958,6 +1958,7 @@ Option Index::rearrange_filter_tree(filter_node_t* const root, if (root == nullptr) { return Option(true); } + if (root->isOperator) { uint32_t l_filter_ids_length = 0; if (root->left != nullptr) { From f15f0822b234513a08ed13e0098188a2d9890d1e Mon Sep 17 00:00:00 2001 From: Harpreet Sangar Date: Mon, 30 Jan 2023 10:47:04 +0530 Subject: [PATCH 15/51] Refactor filtering logic. --- src/index.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/index.cpp b/src/index.cpp index f9ba9f35..135d073c 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -2113,6 +2113,13 @@ void copy_reference_ids(filter_result_t& from, filter_result_t& to) { } } } + + do_filtering(root); + filter_ids_length = root->match_index_ids.first; + filter_ids = root->match_index_ids.second; + + // Prevents double deletion. We'll be deleting this array upstream and when the filter tree is destructed. + root->match_index_ids.second = nullptr; } Option Index::recursive_filter(filter_node_t* const root, From a11a899a4c8ed0677a4b373eacc4149f8fe9ffd4 Mon Sep 17 00:00:00 2001 From: Harpreet Sangar Date: Thu, 2 Feb 2023 11:23:09 +0530 Subject: [PATCH 16/51] Add `reference_fields` map in `Collection`. --- include/index.h | 2 +- src/index.cpp | 7 ------- test/collection_join_test.cpp | 3 ++- 3 files changed, 3 insertions(+), 9 deletions(-) diff --git a/include/index.h b/include/index.h index 600d3c6f..8742300f 100644 --- a/include/index.h +++ b/include/index.h @@ -704,7 +704,7 @@ public: void do_reference_filtering_with_lock(std::pair& reference_index_ids, filter_node_t* filter_tree_root, - const std::string& reference_field_name) const; + const std::string& reference_helper_field_name) const; void refresh_schemas(const std::vector& new_fields, const std::vector& del_fields); diff --git a/src/index.cpp b/src/index.cpp index 135d073c..f9ba9f35 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -2113,13 +2113,6 @@ void copy_reference_ids(filter_result_t& from, filter_result_t& to) { } } } - - do_filtering(root); - filter_ids_length = root->match_index_ids.first; - filter_ids = root->match_index_ids.second; - - // Prevents double deletion. We'll be deleting this array upstream and when the filter tree is destructed. - root->match_index_ids.second = nullptr; } Option Index::recursive_filter(filter_node_t* const root, diff --git a/test/collection_join_test.cpp b/test/collection_join_test.cpp index 9db121aa..b25439e6 100644 --- a/test/collection_join_test.cpp +++ b/test/collection_join_test.cpp @@ -265,6 +265,7 @@ TEST_F(CollectionJoinTest, IndexDocumentHavingReferenceField) { } ASSERT_TRUE(add_op.ok()); } + collectionManager.drop_collection("Customers"); customers_schema_json = R"({ @@ -708,4 +709,4 @@ TEST_F(CollectionJoinTest, IncludeFieldsByReference_SingleMatch) { // // 3 fields in Products document and 2 fields from Customers document // ASSERT_EQ(5, res_obj["hits"][0]["document"].size()); // ASSERT_EQ(1, res_obj["hits"][0]["document"].count("product_id_sequence_id")); -} \ No newline at end of file +} From 424c0f20da412fcf270553095badbd1e6a5c07b6 Mon Sep 17 00:00:00 2001 From: Harpreet Sangar Date: Fri, 3 Feb 2023 14:30:17 +0530 Subject: [PATCH 17/51] Fix double locking of collection mutex. --- test/collection_join_test.cpp | 176 ++-------------------------------- 1 file changed, 8 insertions(+), 168 deletions(-) diff --git a/test/collection_join_test.cpp b/test/collection_join_test.cpp index b25439e6..11de2367 100644 --- a/test/collection_join_test.cpp +++ b/test/collection_join_test.cpp @@ -405,11 +405,11 @@ TEST_F(CollectionJoinTest, FilterByReference_SingleMatch) { ASSERT_EQ(1, result["hits"].size()); ASSERT_EQ("soap", result["hits"][0]["document"]["product_name"].get()); - collectionManager.drop_collection("Customers"); - collectionManager.drop_collection("Products"); +// collectionManager.drop_collection("Customers"); +// collectionManager.drop_collection("Products"); } -TEST_F(CollectionJoinTest, FilterByReference_MultipleMatch) { +TEST_F(CollectionJoinTest, FilterByReferenceField_MultipleMatch) { auto schema_json = R"({ "name": "Users", @@ -535,7 +535,7 @@ TEST_F(CollectionJoinTest, FilterByReference_MultipleMatch) { ASSERT_TRUE(add_op.ok()); } - auto coll = collectionManager.get_collection_unsafe("Users"); + auto coll = collectionManager.get_collection("Users"); // Search for users linked to repo_b auto result = coll->search("R", {"user_name"}, "$Links(repo_id:=repo_b)", {}, {}, {0}, @@ -546,167 +546,7 @@ TEST_F(CollectionJoinTest, FilterByReference_MultipleMatch) { ASSERT_EQ("user_b", result["hits"][0]["document"]["user_id"].get()); ASSERT_EQ("user_a", result["hits"][1]["document"]["user_id"].get()); - collectionManager.drop_collection("Users"); - collectionManager.drop_collection("Repos"); - collectionManager.drop_collection("Links"); -} - -TEST_F(CollectionJoinTest, IncludeFieldsByReference_SingleMatch) { - auto schema_json = - R"({ - "name": "Products", - "fields": [ - {"name": "product_id", "type": "string"}, - {"name": "product_name", "type": "string"}, - {"name": "product_description", "type": "string"} - ] - })"_json; - std::vector documents = { - R"({ - "product_id": "product_a", - "product_name": "shampoo", - "product_description": "Our new moisturizing shampoo is perfect for those with dry or damaged hair." - })"_json, - R"({ - "product_id": "product_b", - "product_name": "soap", - "product_description": "Introducing our all-natural, organic soap bar made with essential oils and botanical ingredients." - })"_json - }; - auto collection_create_op = collectionManager.create_collection(schema_json); - ASSERT_TRUE(collection_create_op.ok()); - for (auto const &json: documents) { - auto add_op = collection_create_op.get()->add(json.dump()); - if (!add_op.ok()) { - LOG(INFO) << add_op.error(); - } - ASSERT_TRUE(add_op.ok()); - } - - schema_json = - R"({ - "name": "Customers", - "fields": [ - {"name": "customer_id", "type": "string"}, - {"name": "customer_name", "type": "string"}, - {"name": "product_price", "type": "float"}, - {"name": "product_id", "type": "string", "reference": "Products.product_id"} - ] - })"_json; - documents = { - R"({ - "customer_id": "customer_a", - "customer_name": "Joe", - "product_price": 143, - "product_id": "product_a" - })"_json, - R"({ - "customer_id": "customer_a", - "customer_name": "Joe", - "product_price": 73.5, - "product_id": "product_b" - })"_json, - R"({ - "customer_id": "customer_b", - "customer_name": "Dan", - "product_price": 75, - "product_id": "product_a" - })"_json, - R"({ - "customer_id": "customer_b", - "customer_name": "Dan", - "product_price": 140, - "product_id": "product_b" - })"_json - }; - collection_create_op = collectionManager.create_collection(schema_json); - ASSERT_TRUE(collection_create_op.ok()); - for (auto const &json: documents) { - auto add_op = collection_create_op.get()->add(json.dump()); - if (!add_op.ok()) { - LOG(INFO) << add_op.error(); - } - ASSERT_TRUE(add_op.ok()); - } - - std::map req_params = { - {"collection", "Products"}, - {"q", "s"}, - {"query_by", "product_name"}, - {"filter_by", "$Customers(customer_id:=customer_a && product_price:<100)"}, - }; - - nlohmann::json embedded_params; - std::string json_res; - auto now_ts = std::chrono::duration_cast( - std::chrono::system_clock::now().time_since_epoch()).count(); - - req_params["include_fields"] = "$foo.bar"; - auto search_op = collectionManager.do_search(req_params, embedded_params, json_res, now_ts); - ASSERT_FALSE(search_op.ok()); - ASSERT_EQ("Invalid reference in include_fields, expected `$CollectionName(fieldA, ...)`.", search_op.error()); - - req_params["include_fields"] = "$foo(bar"; - search_op = collectionManager.do_search(req_params, embedded_params, json_res, now_ts); - ASSERT_FALSE(search_op.ok()); - ASSERT_EQ("Invalid reference in include_fields, expected `$CollectionName(fieldA, ...)`.", search_op.error()); - -// req_params["include_fields"] = "$foo(bar)"; -// search_op = collectionManager.do_search(req_params, embedded_params, json_res, now_ts); -// ASSERT_FALSE(search_op.ok()); -// ASSERT_EQ("Referenced collection `foo` not found.", search_op.error()); -// -// req_params["include_fields"] = "$Customers(bar)"; -// search_op = collectionManager.do_search(req_params, embedded_params, json_res, now_ts); -// ASSERT_TRUE(search_op.ok()); -// -// nlohmann::json res_obj = nlohmann::json::parse(json_res); -// ASSERT_EQ(1, res_obj["found"].get()); -// ASSERT_EQ(1, res_obj["hits"].size()); -// ASSERT_EQ(0, res_obj["hits"][0]["document"].size()); -// -// req_params["include_fields"] = "$Customers(product_price)"; -// search_op = collectionManager.do_search(req_params, embedded_params, json_res, now_ts); -// ASSERT_TRUE(search_op.ok()); -// -// res_obj = nlohmann::json::parse(json_res); -// ASSERT_EQ(1, res_obj["found"].get()); -// ASSERT_EQ(1, res_obj["hits"].size()); -// ASSERT_EQ(1, res_obj["hits"][0]["document"].size()); -// ASSERT_EQ(1, res_obj["hits"][0]["document"].count("product_price")); -// ASSERT_EQ(73.5, res_obj["hits"][0]["document"].at("product_price")); -// -// req_params["include_fields"] = "$Customers(product_price, customer_id)"; -// search_op = collectionManager.do_search(req_params, embedded_params, json_res, now_ts); -// ASSERT_TRUE(search_op.ok()); -// -// res_obj = nlohmann::json::parse(json_res); -// ASSERT_EQ(1, res_obj["found"].get()); -// ASSERT_EQ(1, res_obj["hits"].size()); -// ASSERT_EQ(2, res_obj["hits"][0]["document"].size()); -// ASSERT_EQ(1, res_obj["hits"][0]["document"].count("product_price")); -// ASSERT_EQ(73.5, res_obj["hits"][0]["document"].at("product_price")); -// ASSERT_EQ(1, res_obj["hits"][0]["document"].count("customer_id")); -// ASSERT_EQ("customer_a", res_obj["hits"][0]["document"].at("customer_id")); -// -// req_params["include_fields"] = "*, $Customers(product_price, customer_id)"; -// search_op = collectionManager.do_search(req_params, embedded_params, json_res, now_ts); -// ASSERT_TRUE(search_op.ok()); -// -// res_obj = nlohmann::json::parse(json_res); -// ASSERT_EQ(1, res_obj["found"].get()); -// ASSERT_EQ(1, res_obj["hits"].size()); -// // 3 fields in Products document and 2 fields from Customers document -// ASSERT_EQ(5, res_obj["hits"][0]["document"].size()); -// -// req_params["include_fields"] = "*, $Customers(product*)"; -// search_op = collectionManager.do_search(req_params, embedded_params, json_res, now_ts); -// ASSERT_TRUE(search_op.ok()); -// -// res_obj = nlohmann::json::parse(json_res); -// ASSERT_EQ(1, res_obj["found"].get()); -// ASSERT_EQ(1, res_obj["hits"].size()); -// // 3 fields in Products document and 2 fields from Customers document -// ASSERT_EQ(5, res_obj["hits"][0]["document"].size()); -// ASSERT_EQ(1, res_obj["hits"][0]["document"].count("product_id_sequence_id")); -} +// collectionManager.drop_collection("Users"); +// collectionManager.drop_collection("Repos"); +// collectionManager.drop_collection("Links"); +} \ No newline at end of file From c77e0373ce5d10d84062290575a0112350ce38cb Mon Sep 17 00:00:00 2001 From: Harpreet Sangar Date: Tue, 7 Feb 2023 10:53:18 +0530 Subject: [PATCH 18/51] Fix tests. --- test/collection_join_test.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/test/collection_join_test.cpp b/test/collection_join_test.cpp index 11de2367..98e7663f 100644 --- a/test/collection_join_test.cpp +++ b/test/collection_join_test.cpp @@ -405,11 +405,11 @@ TEST_F(CollectionJoinTest, FilterByReference_SingleMatch) { ASSERT_EQ(1, result["hits"].size()); ASSERT_EQ("soap", result["hits"][0]["document"]["product_name"].get()); -// collectionManager.drop_collection("Customers"); -// collectionManager.drop_collection("Products"); + collectionManager.drop_collection("Customers"); + collectionManager.drop_collection("Products"); } -TEST_F(CollectionJoinTest, FilterByReferenceField_MultipleMatch) { +TEST_F(CollectionJoinTest, FilterByReference_MultipleMatch) { auto schema_json = R"({ "name": "Users", @@ -535,7 +535,7 @@ TEST_F(CollectionJoinTest, FilterByReferenceField_MultipleMatch) { ASSERT_TRUE(add_op.ok()); } - auto coll = collectionManager.get_collection("Users"); + auto coll = collectionManager.get_collection_unsafe("Users"); // Search for users linked to repo_b auto result = coll->search("R", {"user_name"}, "$Links(repo_id:=repo_b)", {}, {}, {0}, @@ -546,7 +546,7 @@ TEST_F(CollectionJoinTest, FilterByReferenceField_MultipleMatch) { ASSERT_EQ("user_b", result["hits"][0]["document"]["user_id"].get()); ASSERT_EQ("user_a", result["hits"][1]["document"]["user_id"].get()); -// collectionManager.drop_collection("Users"); -// collectionManager.drop_collection("Repos"); -// collectionManager.drop_collection("Links"); + collectionManager.drop_collection("Users"); + collectionManager.drop_collection("Repos"); + collectionManager.drop_collection("Links"); } \ No newline at end of file From e7949e650ab2ce91c6da7529ca5fa0ed137e133b Mon Sep 17 00:00:00 2001 From: Harpreet Sangar Date: Thu, 9 Feb 2023 11:50:58 +0530 Subject: [PATCH 19/51] Reference `include_fields`. --- test/collection_join_test.cpp | 160 ++++++++++++++++++++++++++++++++++ 1 file changed, 160 insertions(+) diff --git a/test/collection_join_test.cpp b/test/collection_join_test.cpp index 98e7663f..f302d3dc 100644 --- a/test/collection_join_test.cpp +++ b/test/collection_join_test.cpp @@ -549,4 +549,164 @@ TEST_F(CollectionJoinTest, FilterByReference_MultipleMatch) { collectionManager.drop_collection("Users"); collectionManager.drop_collection("Repos"); collectionManager.drop_collection("Links"); +} + +TEST_F(CollectionJoinTest, IncludeFieldsByReference_SingleMatch) { + auto schema_json = + R"({ + "name": "Products", + "fields": [ + {"name": "product_id", "type": "string"}, + {"name": "product_name", "type": "string"}, + {"name": "product_description", "type": "string"} + ] + })"_json; + std::vector documents = { + R"({ + "product_id": "product_a", + "product_name": "shampoo", + "product_description": "Our new moisturizing shampoo is perfect for those with dry or damaged hair." + })"_json, + R"({ + "product_id": "product_b", + "product_name": "soap", + "product_description": "Introducing our all-natural, organic soap bar made with essential oils and botanical ingredients." + })"_json + }; + auto collection_create_op = collectionManager.create_collection(schema_json); + ASSERT_TRUE(collection_create_op.ok()); + for (auto const &json: documents) { + auto add_op = collection_create_op.get()->add(json.dump()); + if (!add_op.ok()) { + LOG(INFO) << add_op.error(); + } + ASSERT_TRUE(add_op.ok()); + } + + schema_json = + R"({ + "name": "Customers", + "fields": [ + {"name": "customer_id", "type": "string"}, + {"name": "customer_name", "type": "string"}, + {"name": "product_price", "type": "float"}, + {"name": "product_id", "type": "string", "reference": "Products.product_id"} + ] + })"_json; + documents = { + R"({ + "customer_id": "customer_a", + "customer_name": "Joe", + "product_price": 143, + "product_id": "product_a" + })"_json, + R"({ + "customer_id": "customer_a", + "customer_name": "Joe", + "product_price": 73.5, + "product_id": "product_b" + })"_json, + R"({ + "customer_id": "customer_b", + "customer_name": "Dan", + "product_price": 75, + "product_id": "product_a" + })"_json, + R"({ + "customer_id": "customer_b", + "customer_name": "Dan", + "product_price": 140, + "product_id": "product_b" + })"_json + }; + collection_create_op = collectionManager.create_collection(schema_json); + ASSERT_TRUE(collection_create_op.ok()); + for (auto const &json: documents) { + auto add_op = collection_create_op.get()->add(json.dump()); + if (!add_op.ok()) { + LOG(INFO) << add_op.error(); + } + ASSERT_TRUE(add_op.ok()); + } + + std::map req_params = { + {"collection", "Products"}, + {"q", "s"}, + {"query_by", "product_name"}, + {"filter_by", "$Customers(customer_id:=customer_a && product_price:<100)"}, + }; + + nlohmann::json embedded_params; + std::string json_res; + auto now_ts = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()).count(); + + req_params["include_fields"] = "$foo.bar"; + auto search_op = collectionManager.do_search(req_params, embedded_params, json_res, now_ts); + ASSERT_FALSE(search_op.ok()); + ASSERT_EQ("Invalid reference in include_fields, expected `$CollectionName(fieldA, ...)`.", search_op.error()); + + req_params["include_fields"] = "$foo(bar"; + search_op = collectionManager.do_search(req_params, embedded_params, json_res, now_ts); + ASSERT_FALSE(search_op.ok()); + ASSERT_EQ("Invalid reference in include_fields, expected `$CollectionName(fieldA, ...)`.", search_op.error()); + +// req_params["include_fields"] = "$foo(bar)"; +// search_op = collectionManager.do_search(req_params, embedded_params, json_res, now_ts); +// ASSERT_FALSE(search_op.ok()); +// ASSERT_EQ("Referenced collection `foo` not found.", search_op.error()); +// +// req_params["include_fields"] = "$Customers(bar)"; +// search_op = collectionManager.do_search(req_params, embedded_params, json_res, now_ts); +// ASSERT_TRUE(search_op.ok()); +// +// nlohmann::json res_obj = nlohmann::json::parse(json_res); +// ASSERT_EQ(1, res_obj["found"].get()); +// ASSERT_EQ(1, res_obj["hits"].size()); +// ASSERT_EQ(0, res_obj["hits"][0]["document"].size()); +// +// req_params["include_fields"] = "$Customers(product_price)"; +// search_op = collectionManager.do_search(req_params, embedded_params, json_res, now_ts); +// ASSERT_TRUE(search_op.ok()); +// +// res_obj = nlohmann::json::parse(json_res); +// ASSERT_EQ(1, res_obj["found"].get()); +// ASSERT_EQ(1, res_obj["hits"].size()); +// ASSERT_EQ(1, res_obj["hits"][0]["document"].size()); +// ASSERT_EQ(1, res_obj["hits"][0]["document"].count("product_price")); +// ASSERT_EQ(73.5, res_obj["hits"][0]["document"].at("product_price")); +// +// req_params["include_fields"] = "$Customers(product_price, customer_id)"; +// search_op = collectionManager.do_search(req_params, embedded_params, json_res, now_ts); +// ASSERT_TRUE(search_op.ok()); +// +// res_obj = nlohmann::json::parse(json_res); +// ASSERT_EQ(1, res_obj["found"].get()); +// ASSERT_EQ(1, res_obj["hits"].size()); +// ASSERT_EQ(2, res_obj["hits"][0]["document"].size()); +// ASSERT_EQ(1, res_obj["hits"][0]["document"].count("product_price")); +// ASSERT_EQ(73.5, res_obj["hits"][0]["document"].at("product_price")); +// ASSERT_EQ(1, res_obj["hits"][0]["document"].count("customer_id")); +// ASSERT_EQ("customer_a", res_obj["hits"][0]["document"].at("customer_id")); +// +// req_params["include_fields"] = "*, $Customers(product_price, customer_id)"; +// search_op = collectionManager.do_search(req_params, embedded_params, json_res, now_ts); +// ASSERT_TRUE(search_op.ok()); +// +// res_obj = nlohmann::json::parse(json_res); +// ASSERT_EQ(1, res_obj["found"].get()); +// ASSERT_EQ(1, res_obj["hits"].size()); +// // 3 fields in Products document and 2 fields from Customers document +// ASSERT_EQ(5, res_obj["hits"][0]["document"].size()); +// +// req_params["include_fields"] = "*, $Customers(product*)"; +// search_op = collectionManager.do_search(req_params, embedded_params, json_res, now_ts); +// ASSERT_TRUE(search_op.ok()); +// +// res_obj = nlohmann::json::parse(json_res); +// ASSERT_EQ(1, res_obj["found"].get()); +// ASSERT_EQ(1, res_obj["hits"].size()); +// // 3 fields in Products document and 2 fields from Customers document +// ASSERT_EQ(5, res_obj["hits"][0]["document"].size()); +// ASSERT_EQ(1, res_obj["hits"][0]["document"].count("product_id_sequence_id")); } \ No newline at end of file From 44668ebe27cee5d2ab9525b7cec8d34a6fb2f000 Mon Sep 17 00:00:00 2001 From: Harpreet Sangar Date: Thu, 9 Feb 2023 12:25:16 +0530 Subject: [PATCH 20/51] fix memory leak. --- src/collection.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/collection.cpp b/src/collection.cpp index 3766a94d..5f3041a6 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -3938,6 +3938,8 @@ Option Collection::prune_doc(nlohmann::json& doc, reference_docs.push_back(ref_doc); } + delete[] documents[0].second; + for (const auto &ref_doc: reference_docs) { doc.update(ref_doc); } From bb4c0af996509eb6d2c7118041bcb30468d44745 Mon Sep 17 00:00:00 2001 From: Harpreet Sangar Date: Wed, 15 Feb 2023 16:48:44 +0530 Subject: [PATCH 21/51] Fix rebase error. --- include/collection.h | 8 ++------ include/index.h | 6 +++--- src/collection.cpp | 3 --- src/index.cpp | 1 - 4 files changed, 5 insertions(+), 13 deletions(-) diff --git a/include/collection.h b/include/collection.h index 14693473..8d77cfde 100644 --- a/include/collection.h +++ b/include/collection.h @@ -457,16 +457,12 @@ public: Option get_filter_ids(const std::string & filter_query, filter_result_t& filter_result) const; - Option get_reference_filter_ids(const std::string & filter_query, + Option get_reference_filter_ids(const std::string& filter_query, filter_result_t& filter_result, - const std::string & collection_name) const; + const std::string& collection_name) const; Option validate_reference_filter(const std::string& filter_query) const; - Option get_reference_filter_ids(const std::string & filter_query, - const std::string & collection_name, - std::pair& reference_index_ids) const; - Option get(const std::string & id) const; Option remove(const std::string & id, bool remove_from_store = true); diff --git a/include/index.h b/include/index.h index 8742300f..3344b2ed 100644 --- a/include/index.h +++ b/include/index.h @@ -702,9 +702,9 @@ public: const std::string& collection_name, const std::string& reference_helper_field_name) const; - void do_reference_filtering_with_lock(std::pair& reference_index_ids, - filter_node_t* filter_tree_root, - const std::string& reference_helper_field_name) const; + Option do_reference_filtering_with_lock(filter_node_t* const filter_tree_root, + filter_result_t& filter_result, + const std::string & reference_helper_field_name) const; void refresh_schemas(const std::vector& new_fields, const std::vector& del_fields); diff --git a/src/collection.cpp b/src/collection.cpp index 5f3041a6..8ced0cae 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -2572,7 +2572,6 @@ Option Collection::validate_reference_filter(const std::string& filter_que filter_node_t* filter_tree_root = nullptr; Option filter_op = filter::parse_filter_query(filter_query, search_schema, store, doc_id_prefix, filter_tree_root); - if(!filter_op.ok()) { return filter_op; } @@ -3938,8 +3937,6 @@ Option Collection::prune_doc(nlohmann::json& doc, reference_docs.push_back(ref_doc); } - delete[] documents[0].second; - for (const auto &ref_doc: reference_docs) { doc.update(ref_doc); } diff --git a/src/index.cpp b/src/index.cpp index f9ba9f35..e88badc0 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -2719,7 +2719,6 @@ Option Index::search(std::vector& field_query_tokens, cons const vector_query_t& vector_query, size_t facet_sample_percent, size_t facet_sample_threshold, const std::string& collection_name) const { - std::shared_lock lock(mutex); filter_result_t filter_result; From baff5aa9511b8c65e79aeb409091baee54ac4748 Mon Sep 17 00:00:00 2001 From: ozanarmagan Date: Sun, 5 Feb 2023 15:18:35 +0300 Subject: [PATCH 22/51] Added ONNX Runtime dependency --- .gitignore | 1 - CMakeLists.txt | 2 +- WORKSPACE | 5 +- bazel/foreign_cc.patch | 4 +- bazel/foreign_cc_version_compiler.patch | 283 ++++++++++++++++++++++++ 5 files changed, 290 insertions(+), 5 deletions(-) create mode 100644 bazel/foreign_cc_version_compiler.patch diff --git a/.gitignore b/.gitignore index 264c40e2..67960068 100644 --- a/.gitignore +++ b/.gitignore @@ -15,4 +15,3 @@ typesense-server-data/ .clwb/.bazelproject .vscode/settings.json /onnxruntime-prefix - diff --git a/CMakeLists.txt b/CMakeLists.txt index 9893882b..edcd6179 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -255,4 +255,4 @@ target_sources(search PRIVATE ${ONNX_EXT_SRC_FILES}) add_dependencies(typesense-server onnxruntime_ext) add_dependencies(typesense-test onnxruntime_ext) add_dependencies(benchmark onnxruntime_ext) -add_dependencies(search onnxruntime_ext) \ No newline at end of file +add_dependencies(search onnxruntime_ext) diff --git a/WORKSPACE b/WORKSPACE index 1dc1401d..94d24423 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -13,7 +13,10 @@ bazel_compdb_deps() http_archive( name = "rules_foreign_cc", - patches = ["//bazel:foreign_cc.patch"], + patches = ["//bazel:foreign_cc.patch", "//bazel:foreign_cc_version_compiler.patch"], + patch_args = [ + "-p1", + ], sha256 = "2a4d07cd64b0719b39a7c12218a3e507672b82a97b98c6a89d38565894cf7c51", strip_prefix = "rules_foreign_cc-0.9.0", url = "https://github.com/bazelbuild/rules_foreign_cc/archive/refs/tags/0.9.0.tar.gz", diff --git a/bazel/foreign_cc.patch b/bazel/foreign_cc.patch index 9af0c8a7..9cb52c7c 100644 --- a/bazel/foreign_cc.patch +++ b/bazel/foreign_cc.patch @@ -1,5 +1,5 @@ ---- foreign_cc/private/configure_script.bzl -+++ foreign_cc/private/configure_script.bzl +--- a/foreign_cc/private/configure_script.bzl ++++ b/foreign_cc/private/configure_script.bzl @@ -70,7 +70,7 @@ ).lstrip()) diff --git a/bazel/foreign_cc_version_compiler.patch b/bazel/foreign_cc_version_compiler.patch new file mode 100644 index 00000000..d668a1db --- /dev/null +++ b/bazel/foreign_cc_version_compiler.patch @@ -0,0 +1,283 @@ +diff --git a/foreign_cc/private/cc_toolchain_util.bzl b/foreign_cc/private/cc_toolchain_util.bzl +index fd7fa4d..188dc5f 100644 +--- a/foreign_cc/private/cc_toolchain_util.bzl ++++ b/foreign_cc/private/cc_toolchain_util.bzl +@@ -265,15 +265,24 @@ def get_tools_info(ctx): + cc_toolchain = cc_toolchain, + ) + ++ cxx = cc_common.get_tool_for_action( ++ feature_configuration = feature_configuration, ++ action_name = ACTION_NAMES.cpp_compile, ++ ) ++ cxx_splitted = cxx.split("/") ++ if(cxx_splitted[-1] == "gcc"): ++ cxx_splitted[-1] = "g++" ++ cxx = "/".join(cxx_splitted) ++ if(cxx_splitted[-1] == "clang"): ++ cxx_splitted = "clang++" ++ cxx = "/".join(cxx_splitted) ++ + return CxxToolsInfo( + cc = cc_common.get_tool_for_action( + feature_configuration = feature_configuration, + action_name = ACTION_NAMES.c_compile, + ), +- cxx = cc_common.get_tool_for_action( +- feature_configuration = feature_configuration, +- action_name = ACTION_NAMES.cpp_compile, +- ), ++ cxx = cxx, + cxx_linker_static = cc_common.get_tool_for_action( + feature_configuration = feature_configuration, + action_name = ACTION_NAMES.cpp_link_static_library, +diff --git a/toolchains/built_toolchains.bzl b/toolchains/built_toolchains.bzl +index 5e59e79..ddf63a5 100644 +--- a/toolchains/built_toolchains.bzl ++++ b/toolchains/built_toolchains.bzl +@@ -28,6 +28,7 @@ _CMAKE_SRCS = { + "3.22.4": [["https://github.com/Kitware/CMake/releases/download/v3.22.4/cmake-3.22.4.tar.gz"], "cmake-3.22.4", "5c55d0b0bc4c191549e3502b8f99a4fe892077611df22b4178cc020626e22a47"], + "3.23.1": [["https://github.com/Kitware/CMake/releases/download/v3.23.1/cmake-3.23.1.tar.gz"], "cmake-3.23.1", "33fd10a8ec687a4d0d5b42473f10459bb92b3ae7def2b745dc10b192760869f3"], + "3.23.2": [["https://github.com/Kitware/CMake/releases/download/v3.23.2/cmake-3.23.2.tar.gz"], "cmake-3.23.2", "f316b40053466f9a416adf981efda41b160ca859e97f6a484b447ea299ff26aa"], ++ "3.25.0": [["https://github.com/Kitware/CMake/releases/download/v3.25.0/cmake-3.25.0.tar.gz"], "cmake-3.25.0", "306463f541555da0942e6f5a0736560f70c487178b9d94a5ae7f34d0538cdd48"], + } + + # buildifier: disable=unnamed-macro +@@ -438,6 +439,18 @@ def _ninja_toolchain(version, register_toolchains): + native.register_toolchains( + "@rules_foreign_cc//toolchains:built_ninja_toolchain", + ) ++ if version == "1.11.1": ++ maybe( ++ http_archive, ++ name = "ninja_build_src", ++ build_file_content = _ALL_CONTENT, ++ sha256 = "31747ae633213f1eda3842686f83c2aa1412e0f5691d1c14dbbcc67fe7400cea", ++ strip_prefix = "ninja-1.11.1", ++ urls = [ ++ "https://github.com/ninja-build/ninja/archive/v1.11.1.tar.gz", ++ ], ++ ) ++ return + if version == "1.11.0": + maybe( + http_archive, +diff --git a/toolchains/prebuilt_toolchains.bzl b/toolchains/prebuilt_toolchains.bzl +index dabfb95..d9c38b4 100644 +--- a/toolchains/prebuilt_toolchains.bzl ++++ b/toolchains/prebuilt_toolchains.bzl +@@ -67,6 +67,115 @@ def prebuilt_toolchains(cmake_version, ninja_version, register_toolchains): + _make_toolchains(register_toolchains) + + def _cmake_toolchains(version, register_toolchains): ++ if "3.25.0" == version: ++ maybe( ++ http_archive, ++ name = "cmake-3.25.0-linux-aarch64", ++ urls = [ ++ "https://github.com/Kitware/CMake/releases/download/v3.25.0/cmake-3.25.0-linux-aarch64.tar.gz", ++ ], ++ sha256 = "27da36d6debe9b30f5c498554ae40cd621a55736f5f2ae2618ed95722a59965a", ++ strip_prefix = "cmake-3.25.0-linux-aarch64", ++ build_file_content = _CMAKE_BUILD_FILE.format( ++ bin = "cmake", ++ env = "{}", ++ ), ++ ) ++ ++ maybe( ++ http_archive, ++ name = "cmake-3.25.0-linux-x86_64", ++ urls = [ ++ "https://github.com/Kitware/CMake/releases/download/v3.25.0/cmake-3.25.0-linux-x86_64.tar.gz", ++ ], ++ sha256 = "ac634d6f0a81d7089adc7be5acff66a6bee3b08615f9a947858ce92a9ef59c8b", ++ strip_prefix = "cmake-3.25.0-linux-x86_64", ++ build_file_content = _CMAKE_BUILD_FILE.format( ++ bin = "cmake", ++ env = "{}", ++ ), ++ ) ++ ++ maybe( ++ http_archive, ++ name = "cmake-3.25.0-macos-universal", ++ urls = [ ++ "https://github.com/Kitware/CMake/releases/download/v3.25.0/cmake-3.25.0-macos-universal.tar.gz", ++ ], ++ sha256 = "c088e761534a2078cd9d0581d39f02d3f9ed05302e33135b55c6d619b263b4c3", ++ strip_prefix = "cmake-3.25.0-macos-universal/CMake.app/Contents", ++ build_file_content = _CMAKE_BUILD_FILE.format( ++ bin = "cmake", ++ env = "{}", ++ ), ++ ) ++ ++ maybe( ++ http_archive, ++ name = "cmake-3.25.0-windows-i386", ++ urls = [ ++ "https://github.com/Kitware/CMake/releases/download/v3.25.0/cmake-3.25.0-windows-i386.zip", ++ ], ++ sha256 = "ddd115257a19ff3dd18fc63f32a00ae742f8b62d2e39bc354629903512f99783", ++ strip_prefix = "cmake-3.25.0-windows-i386", ++ build_file_content = _CMAKE_BUILD_FILE.format( ++ bin = "cmake.exe", ++ env = "{}", ++ ), ++ ) ++ ++ maybe( ++ http_archive, ++ name = "cmake-3.25.0-windows-x86_64", ++ urls = [ ++ "https://github.com/Kitware/CMake/releases/download/v3.25.0/cmake-3.25.0-windows-x86_64.zip", ++ ], ++ sha256 = "b46030c10cab1170355952f9ac59f7e6dabc248070fc53f15dff11d4ed2910f8", ++ strip_prefix = "cmake-3.25.0-windows-x86_64", ++ build_file_content = _CMAKE_BUILD_FILE.format( ++ bin = "cmake.exe", ++ env = "{}", ++ ), ++ ) ++ ++ # buildifier: leave-alone ++ maybe( ++ prebuilt_toolchains_repository, ++ name = "cmake_3.25.0_toolchains", ++ repos = { ++ "cmake-3.25.0-linux-aarch64": [ ++ "@platforms//cpu:aarch64", ++ "@platforms//os:linux", ++ ], ++ "cmake-3.25.0-linux-x86_64": [ ++ "@platforms//cpu:x86_64", ++ "@platforms//os:linux", ++ ], ++ "cmake-3.25.0-macos-universal": [ ++ "@platforms//os:macos", ++ ], ++ "cmake-3.25.0-windows-i386": [ ++ "@platforms//cpu:x86_32", ++ "@platforms//os:windows", ++ ], ++ "cmake-3.25.0-windows-x86_64": [ ++ "@platforms//cpu:x86_64", ++ "@platforms//os:windows", ++ ], ++ }, ++ tool = "cmake", ++ ) ++ ++ if register_toolchains: ++ native.register_toolchains( ++ "@cmake_3.25.0_toolchains//:cmake-3.25.0-linux-aarch64_toolchain", ++ "@cmake_3.25.0_toolchains//:cmake-3.25.0-linux-x86_64_toolchain", ++ "@cmake_3.25.0_toolchains//:cmake-3.25.0-macos-universal_toolchain", ++ "@cmake_3.25.0_toolchains//:cmake-3.25.0-windows-i386_toolchain", ++ "@cmake_3.25.0_toolchains//:cmake-3.25.0-windows-x86_64_toolchain", ++ ) ++ ++ return + if "3.23.2" == version: + maybe( + http_archive, +@@ -4196,6 +4305,78 @@ def _cmake_toolchains(version, register_toolchains): + fail("Unsupported version: " + str(version)) + + def _ninja_toolchains(version, register_toolchains): ++ if "1.11.1" == version: ++ maybe( ++ http_archive, ++ name = "ninja_1.11.1_linux", ++ urls = [ ++ "https://github.com/ninja-build/ninja/releases/download/v1.11.1/ninja-linux.zip", ++ ], ++ sha256 = "b901ba96e486dce377f9a070ed4ef3f79deb45f4ffe2938f8e7ddc69cfb3df77", ++ strip_prefix = "", ++ build_file_content = _NINJA_BUILD_FILE.format( ++ bin = "ninja", ++ env = "{\"NINJA\": \"$(execpath :ninja_bin)\"}", ++ ), ++ ) ++ ++ maybe( ++ http_archive, ++ name = "ninja_1.11.1_mac", ++ urls = [ ++ "https://github.com/ninja-build/ninja/releases/download/v1.11.1/ninja-mac.zip", ++ ], ++ sha256 = "482ecb23c59ae3d4f158029112de172dd96bb0e97549c4b1ca32d8fad11f873e", ++ strip_prefix = "", ++ build_file_content = _NINJA_BUILD_FILE.format( ++ bin = "ninja", ++ env = "{\"NINJA\": \"$(execpath :ninja_bin)\"}", ++ ), ++ ) ++ ++ maybe( ++ http_archive, ++ name = "ninja_1.11.1_win", ++ urls = [ ++ "https://github.com/ninja-build/ninja/releases/download/v1.11.1/ninja-win.zip", ++ ], ++ sha256 = "524b344a1a9a55005eaf868d991e090ab8ce07fa109f1820d40e74642e289abc", ++ strip_prefix = "", ++ build_file_content = _NINJA_BUILD_FILE.format( ++ bin = "ninja.exe", ++ env = "{\"NINJA\": \"$(execpath :ninja_bin)\"}", ++ ), ++ ) ++ ++ # buildifier: leave-alone ++ maybe( ++ prebuilt_toolchains_repository, ++ name = "ninja_1.11.1_toolchains", ++ repos = { ++ "ninja_1.11.1_linux": [ ++ "@platforms//cpu:x86_64", ++ "@platforms//os:linux", ++ ], ++ "ninja_1.11.1_mac": [ ++ "@platforms//cpu:x86_64", ++ "@platforms//os:macos", ++ ], ++ "ninja_1.11.1_win": [ ++ "@platforms//cpu:x86_64", ++ "@platforms//os:windows", ++ ], ++ }, ++ tool = "ninja", ++ ) ++ ++ if register_toolchains: ++ native.register_toolchains( ++ "@ninja_1.11.1_toolchains//:ninja_1.11.1_linux_toolchain", ++ "@ninja_1.11.1_toolchains//:ninja_1.11.1_mac_toolchain", ++ "@ninja_1.11.1_toolchains//:ninja_1.11.1_win_toolchain", ++ ) ++ ++ return + if "1.11.0" == version: + maybe( + http_archive, +diff --git a/toolchains/prebuilt_toolchains.py b/toolchains/prebuilt_toolchains.py +index 5288b27..a193021 100755 +--- a/toolchains/prebuilt_toolchains.py ++++ b/toolchains/prebuilt_toolchains.py +@@ -10,6 +10,7 @@ CMAKE_SHA256_URL_TEMPLATE = "https://cmake.org/files/v{minor}/cmake-{full}-SHA-2 + CMAKE_URL_TEMPLATE = "https://github.com/Kitware/CMake/releases/download/v{full}/{file}" + + CMAKE_VERSIONS = [ ++ "3.25.0", + "3.23.2", + "3.23.1", + "3.22.4", +@@ -116,6 +117,7 @@ NINJA_TARGETS = { + } + + NINJA_VERSIONS = ( ++ "1.11.1", + "1.10.2", + "1.10.1", + "1.10.0", From 36c76e364893a493196903e23c9e48d661abe62a Mon Sep 17 00:00:00 2001 From: Harpreet Sangar Date: Tue, 17 Jan 2023 14:08:39 +0530 Subject: [PATCH 23/51] Abstract `foo_sequence_id` field from user. --- include/field.h | 7 +++++++ src/collection_manager.cpp | 4 ++++ 2 files changed, 11 insertions(+) diff --git a/include/field.h b/include/field.h index 18a1d4b7..44de4637 100644 --- a/include/field.h +++ b/include/field.h @@ -11,6 +11,7 @@ #include #include "json.hpp" #include "text_embedder_manager.h" +#include namespace field_types { // first field value indexed will determine the type @@ -284,11 +285,17 @@ struct field { const std::string & default_sorting_field, nlohmann::json& fields_json) { bool found_default_sorting_field = false; + const std::regex sequence_id_pattern(".*_sequence_id$"); // Check for duplicates in field names std::map> unique_fields; for(const field & field: fields) { + if (std::regex_match(field.name, sequence_id_pattern)) { + // Don't add foo_sequence_id field. + continue; + } + unique_fields[field.name].push_back(&field); if(field.name == "id") { diff --git a/src/collection_manager.cpp b/src/collection_manager.cpp index 0475dfbe..96a187b9 100644 --- a/src/collection_manager.cpp +++ b/src/collection_manager.cpp @@ -89,6 +89,10 @@ Collection* CollectionManager::init_collection(const nlohmann::json & collection } fields.push_back(f); + + if (!f.reference.empty()) { + fields.emplace_back(field(f.name + "_sequence_id", "string", false, f.optional, true)); + } } std::string default_sorting_field = collection_meta[Collection::COLLECTION_DEFAULT_SORTING_FIELD_KEY].get(); From 2670638648618ff4f526eef8646d9e590b8bdc28 Mon Sep 17 00:00:00 2001 From: Harpreet Sangar Date: Thu, 19 Jan 2023 11:25:43 +0530 Subject: [PATCH 24/51] Serialize sequence id. --- test/collection_join_test.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/test/collection_join_test.cpp b/test/collection_join_test.cpp index f302d3dc..9db121aa 100644 --- a/test/collection_join_test.cpp +++ b/test/collection_join_test.cpp @@ -265,7 +265,6 @@ TEST_F(CollectionJoinTest, IndexDocumentHavingReferenceField) { } ASSERT_TRUE(add_op.ok()); } - collectionManager.drop_collection("Customers"); customers_schema_json = R"({ From beb5e700cb68285b8c08faced93cfb2668a4cfe8 Mon Sep 17 00:00:00 2001 From: Harpreet Sangar Date: Thu, 19 Jan 2023 11:27:52 +0530 Subject: [PATCH 25/51] Store `foo_sequence_id` in collection's meta-data. --- include/field.h | 6 ------ src/collection_manager.cpp | 4 ---- 2 files changed, 10 deletions(-) diff --git a/include/field.h b/include/field.h index 44de4637..63feff0f 100644 --- a/include/field.h +++ b/include/field.h @@ -285,17 +285,11 @@ struct field { const std::string & default_sorting_field, nlohmann::json& fields_json) { bool found_default_sorting_field = false; - const std::regex sequence_id_pattern(".*_sequence_id$"); // Check for duplicates in field names std::map> unique_fields; for(const field & field: fields) { - if (std::regex_match(field.name, sequence_id_pattern)) { - // Don't add foo_sequence_id field. - continue; - } - unique_fields[field.name].push_back(&field); if(field.name == "id") { diff --git a/src/collection_manager.cpp b/src/collection_manager.cpp index 96a187b9..0475dfbe 100644 --- a/src/collection_manager.cpp +++ b/src/collection_manager.cpp @@ -89,10 +89,6 @@ Collection* CollectionManager::init_collection(const nlohmann::json & collection } fields.push_back(f); - - if (!f.reference.empty()) { - fields.emplace_back(field(f.name + "_sequence_id", "string", false, f.optional, true)); - } } std::string default_sorting_field = collection_meta[Collection::COLLECTION_DEFAULT_SORTING_FIELD_KEY].get(); From 1fbfa34672b44ea3134e881925bfcd3f185d922b Mon Sep 17 00:00:00 2001 From: Harpreet Sangar Date: Sun, 22 Jan 2023 12:02:29 +0530 Subject: [PATCH 26/51] Filter by reference. --- include/collection.h | 2 ++ src/collection.cpp | 16 ++++++++++++++++ test/collection_join_test.cpp | 3 +++ 3 files changed, 21 insertions(+) diff --git a/include/collection.h b/include/collection.h index 8d77cfde..55855a28 100644 --- a/include/collection.h +++ b/include/collection.h @@ -463,6 +463,8 @@ public: Option validate_reference_filter(const std::string& filter_query) const; + Option validate_reference_filter(const std::string& filter_query) const; + Option get(const std::string & id) const; Option remove(const std::string & id, bool remove_from_store = true); diff --git a/src/collection.cpp b/src/collection.cpp index 8ced0cae..b1c24216 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -2580,6 +2580,22 @@ Option Collection::validate_reference_filter(const std::string& filter_que return Option(true); } +Option Collection::validate_reference_filter(const std::string& filter_query) const { + std::shared_lock lock(mutex); + + const std::string doc_id_prefix = std::to_string(collection_id) + "_" + DOC_ID_PREFIX + "_"; + filter_node_t* filter_tree_root = nullptr; + Option filter_op = filter::parse_filter_query(filter_query, search_schema, + store, doc_id_prefix, filter_tree_root); + + if(!filter_op.ok()) { + return filter_op; + } + + delete filter_tree_root; + return Option(true); +} + bool Collection::facet_value_to_string(const facet &a_facet, const facet_count_t &facet_count, const nlohmann::json &document, std::string &value) const { diff --git a/test/collection_join_test.cpp b/test/collection_join_test.cpp index 9db121aa..0cedd216 100644 --- a/test/collection_join_test.cpp +++ b/test/collection_join_test.cpp @@ -284,6 +284,9 @@ TEST_F(CollectionJoinTest, IndexDocumentHavingReferenceField) { ASSERT_TRUE(add_doc_op.ok()); ASSERT_EQ(customer_collection->get("0").get().count("reference_id_sequence_id"), 1); + // Referenced document should be accessible from Customers collection. + auto sequence_id = collectionManager.get_collection("Products")->get_seq_id_collection_prefix() + "_" + + customer_collection->get("0").get()["product_id_sequence_id"].get(); nlohmann::json document; // Referenced document's sequence_id must be valid. auto get_op = collectionManager.get_collection("Products")->get_document_from_store( From 076a04c06218af665a1be549b12551f94f7720e9 Mon Sep 17 00:00:00 2001 From: Harpreet Sangar Date: Tue, 24 Jan 2023 10:57:29 +0530 Subject: [PATCH 27/51] Optimize reference filtering. --- include/collection.h | 4 ++++ include/index.h | 4 ++++ src/collection.cpp | 35 +++++++++++++++++++++++++++++++++++ src/index.cpp | 2 +- test/collection_join_test.cpp | 3 --- 5 files changed, 44 insertions(+), 4 deletions(-) diff --git a/include/collection.h b/include/collection.h index 55855a28..38e91d1f 100644 --- a/include/collection.h +++ b/include/collection.h @@ -463,6 +463,10 @@ public: Option validate_reference_filter(const std::string& filter_query) const; + Option get_reference_filter_ids(const std::string & filter_query, + const std::string & collection_name, + std::pair& reference_index_ids) const; + Option validate_reference_filter(const std::string& filter_query) const; Option get(const std::string & id) const; diff --git a/include/index.h b/include/index.h index 3344b2ed..b427763a 100644 --- a/include/index.h +++ b/include/index.h @@ -706,6 +706,10 @@ public: filter_result_t& filter_result, const std::string & reference_helper_field_name) const; + void do_reference_filtering_with_lock(std::pair& reference_index_ids, + filter_node_t const* const& filter_tree_root, + const std::string& reference_field_name) const; + void refresh_schemas(const std::vector& new_fields, const std::vector& del_fields); // the following methods are not synchronized because their parent calls are synchronized or they are const/static diff --git a/src/collection.cpp b/src/collection.cpp index b1c24216..3a740723 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -2580,6 +2580,41 @@ Option Collection::validate_reference_filter(const std::string& filter_que return Option(true); } +Option Collection::get_reference_filter_ids(const std::string & filter_query, + const std::string & collection_name, + std::pair& reference_index_ids) const { + std::shared_lock lock(mutex); + + std::string reference_field_name; + for (auto const& field: fields) { + if (!field.reference.empty() && + field.reference.find(collection_name) == 0 && + field.reference.find('.') == collection_name.size()) { + reference_field_name = field.name; + break; + } + } + + if (reference_field_name.empty()) { + return Option(400, "Could not find any field in `" + name + "` referencing the collection `" + + collection_name + "`."); + } + + const std::string doc_id_prefix = std::to_string(collection_id) + "_" + DOC_ID_PREFIX + "_"; + filter_node_t* filter_tree_root = nullptr; + Option filter_op = filter::parse_filter_query(filter_query, search_schema, + store, doc_id_prefix, filter_tree_root); + if(!filter_op.ok()) { + return filter_op; + } + + reference_field_name += "_sequence_id"; + index->do_reference_filtering_with_lock(reference_index_ids, filter_tree_root, reference_field_name); + + delete filter_tree_root; + return Option(true); +} + Option Collection::validate_reference_filter(const std::string& filter_query) const { std::shared_lock lock(mutex); diff --git a/src/index.cpp b/src/index.cpp index e88badc0..3ec45ac4 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -1497,7 +1497,7 @@ Option Index::do_filtering(filter_node_t* const root, const uint32_t& context_ids_length, const uint32_t* context_ids) const { // auto begin = std::chrono::high_resolution_clock::now(); - const filter a_filter = root->filter_exp; +/**/ const filter a_filter = root->filter_exp; bool is_referenced_filter = !a_filter.referenced_collection_name.empty(); if (is_referenced_filter) { diff --git a/test/collection_join_test.cpp b/test/collection_join_test.cpp index 0cedd216..9db121aa 100644 --- a/test/collection_join_test.cpp +++ b/test/collection_join_test.cpp @@ -284,9 +284,6 @@ TEST_F(CollectionJoinTest, IndexDocumentHavingReferenceField) { ASSERT_TRUE(add_doc_op.ok()); ASSERT_EQ(customer_collection->get("0").get().count("reference_id_sequence_id"), 1); - // Referenced document should be accessible from Customers collection. - auto sequence_id = collectionManager.get_collection("Products")->get_seq_id_collection_prefix() + "_" + - customer_collection->get("0").get()["product_id_sequence_id"].get(); nlohmann::json document; // Referenced document's sequence_id must be valid. auto get_op = collectionManager.get_collection("Products")->get_document_from_store( From eacd644d3aede179cf1cc128c4bc367a9ce7a64c Mon Sep 17 00:00:00 2001 From: Harpreet Sangar Date: Fri, 27 Jan 2023 12:57:13 +0530 Subject: [PATCH 28/51] Add `Index::rearranging_recursive_filter`. --- include/index.h | 2 +- src/index.cpp | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/include/index.h b/include/index.h index b427763a..fdc059d7 100644 --- a/include/index.h +++ b/include/index.h @@ -707,7 +707,7 @@ public: const std::string & reference_helper_field_name) const; void do_reference_filtering_with_lock(std::pair& reference_index_ids, - filter_node_t const* const& filter_tree_root, + filter_node_t* filter_tree_root, const std::string& reference_field_name) const; void refresh_schemas(const std::vector& new_fields, const std::vector& del_fields); diff --git a/src/index.cpp b/src/index.cpp index 3ec45ac4..6592b87b 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -1497,7 +1497,7 @@ Option Index::do_filtering(filter_node_t* const root, const uint32_t& context_ids_length, const uint32_t* context_ids) const { // auto begin = std::chrono::high_resolution_clock::now(); -/**/ const filter a_filter = root->filter_exp; + const filter a_filter = root->filter_exp; bool is_referenced_filter = !a_filter.referenced_collection_name.empty(); if (is_referenced_filter) { @@ -1958,7 +1958,6 @@ Option Index::rearrange_filter_tree(filter_node_t* const root, if (root == nullptr) { return Option(true); } - if (root->isOperator) { uint32_t l_filter_ids_length = 0; if (root->left != nullptr) { From d0069fe2d3d4c99d048a480b2c9dce31633a57fd Mon Sep 17 00:00:00 2001 From: Harpreet Sangar Date: Fri, 27 Jan 2023 19:58:06 +0530 Subject: [PATCH 29/51] Add `Index::adaptive_filter`. --- src/index.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/index.cpp b/src/index.cpp index 6592b87b..e88badc0 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -1958,6 +1958,7 @@ Option Index::rearrange_filter_tree(filter_node_t* const root, if (root == nullptr) { return Option(true); } + if (root->isOperator) { uint32_t l_filter_ids_length = 0; if (root->left != nullptr) { From 39a027043431a0e54c00ba8d92892b51467b1854 Mon Sep 17 00:00:00 2001 From: Harpreet Sangar Date: Thu, 2 Feb 2023 11:23:09 +0530 Subject: [PATCH 30/51] Add `reference_fields` map in `Collection`. --- include/index.h | 2 +- src/collection.cpp | 12 ++++++------ test/collection_join_test.cpp | 3 ++- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/include/index.h b/include/index.h index fdc059d7..92b9f7af 100644 --- a/include/index.h +++ b/include/index.h @@ -708,7 +708,7 @@ public: void do_reference_filtering_with_lock(std::pair& reference_index_ids, filter_node_t* filter_tree_root, - const std::string& reference_field_name) const; + const std::string& reference_helper_field_name) const; void refresh_schemas(const std::vector& new_fields, const std::vector& del_fields); diff --git a/src/collection.cpp b/src/collection.cpp index 3a740723..c3ef5a49 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -2586,11 +2586,10 @@ Option Collection::get_reference_filter_ids(const std::string & filter_que std::shared_lock lock(mutex); std::string reference_field_name; - for (auto const& field: fields) { - if (!field.reference.empty() && - field.reference.find(collection_name) == 0 && - field.reference.find('.') == collection_name.size()) { - reference_field_name = field.name; + for (auto const& pair: reference_fields) { + auto reference_pair = pair.second; + if (reference_pair.collection == collection_name) { + reference_field_name = reference_pair.field; break; } } @@ -2608,7 +2607,8 @@ Option Collection::get_reference_filter_ids(const std::string & filter_que return filter_op; } - reference_field_name += "_sequence_id"; + // Reference helper field has the sequence id of other collection's documents. + reference_field_name += REFERENCE_HELPER_FIELD_SUFFIX; index->do_reference_filtering_with_lock(reference_index_ids, filter_tree_root, reference_field_name); delete filter_tree_root; diff --git a/test/collection_join_test.cpp b/test/collection_join_test.cpp index 9db121aa..b25439e6 100644 --- a/test/collection_join_test.cpp +++ b/test/collection_join_test.cpp @@ -265,6 +265,7 @@ TEST_F(CollectionJoinTest, IndexDocumentHavingReferenceField) { } ASSERT_TRUE(add_op.ok()); } + collectionManager.drop_collection("Customers"); customers_schema_json = R"({ @@ -708,4 +709,4 @@ TEST_F(CollectionJoinTest, IncludeFieldsByReference_SingleMatch) { // // 3 fields in Products document and 2 fields from Customers document // ASSERT_EQ(5, res_obj["hits"][0]["document"].size()); // ASSERT_EQ(1, res_obj["hits"][0]["document"].count("product_id_sequence_id")); -} \ No newline at end of file +} From 82dddd3b6dd4ac4f4e9b1cbca03434f770020b43 Mon Sep 17 00:00:00 2001 From: Harpreet Sangar Date: Fri, 3 Feb 2023 14:30:17 +0530 Subject: [PATCH 31/51] Fix double locking of collection mutex. --- test/collection_join_test.cpp | 142 ++++++++++++++++++++++++++++++++++ 1 file changed, 142 insertions(+) diff --git a/test/collection_join_test.cpp b/test/collection_join_test.cpp index b25439e6..9f22cdda 100644 --- a/test/collection_join_test.cpp +++ b/test/collection_join_test.cpp @@ -710,3 +710,145 @@ TEST_F(CollectionJoinTest, IncludeFieldsByReference_SingleMatch) { // ASSERT_EQ(5, res_obj["hits"][0]["document"].size()); // ASSERT_EQ(1, res_obj["hits"][0]["document"].count("product_id_sequence_id")); } + +TEST_F(CollectionJoinTest, FilterByReferenceField_MultipleMatch) { + auto schema_json = + R"({ + "name": "Users", + "fields": [ + {"name": "user_id", "type": "string"}, + {"name": "user_name", "type": "string"} + ] + })"_json; + std::vector documents = { + R"({ + "user_id": "user_a", + "user_name": "Roshan" + })"_json, + R"({ + "user_id": "user_b", + "user_name": "Ruby" + })"_json, + R"({ + "user_id": "user_c", + "user_name": "Joe" + })"_json, + R"({ + "user_id": "user_d", + "user_name": "Aby" + })"_json + }; + auto collection_create_op = collectionManager.create_collection(schema_json); + ASSERT_TRUE(collection_create_op.ok()); + for (auto const &json: documents) { + auto add_op = collection_create_op.get()->add(json.dump()); + if (!add_op.ok()) { + LOG(INFO) << add_op.error(); + } + ASSERT_TRUE(add_op.ok()); + } + + schema_json = + R"({ + "name": "Repos", + "fields": [ + {"name": "repo_id", "type": "string"}, + {"name": "repo_content", "type": "string"} + ] + })"_json; + documents = { + R"({ + "repo_id": "repo_a", + "repo_content": "body1" + })"_json, + R"({ + "repo_id": "repo_b", + "repo_content": "body2" + })"_json, + R"({ + "repo_id": "repo_c", + "repo_content": "body3" + })"_json + }; + collection_create_op = collectionManager.create_collection(schema_json); + ASSERT_TRUE(collection_create_op.ok()); + for (auto const &json: documents) { + auto add_op = collection_create_op.get()->add(json.dump()); + if (!add_op.ok()) { + LOG(INFO) << add_op.error(); + } + ASSERT_TRUE(add_op.ok()); + } + + schema_json = + R"({ + "name": "Links", + "fields": [ + {"name": "repo_id", "type": "string", "reference": "Repos.repo_id"}, + {"name": "user_id", "type": "string", "reference": "Users.user_id"} + ] + })"_json; + documents = { + R"({ + "repo_id": "repo_a", + "user_id": "user_b" + })"_json, + R"({ + "repo_id": "repo_a", + "user_id": "user_c" + })"_json, + R"({ + "repo_id": "repo_b", + "user_id": "user_a" + })"_json, + R"({ + "repo_id": "repo_b", + "user_id": "user_b" + })"_json, + R"({ + "repo_id": "repo_b", + "user_id": "user_d" + })"_json, + R"({ + "repo_id": "repo_c", + "user_id": "user_a" + })"_json, + R"({ + "repo_id": "repo_c", + "user_id": "user_b" + })"_json, + R"({ + "repo_id": "repo_c", + "user_id": "user_c" + })"_json, + R"({ + "repo_id": "repo_c", + "user_id": "user_d" + })"_json + }; + collection_create_op = collectionManager.create_collection(schema_json); + ASSERT_TRUE(collection_create_op.ok()); + + for (auto const &json: documents) { + auto add_op = collection_create_op.get()->add(json.dump()); + if (!add_op.ok()) { + LOG(INFO) << add_op.error(); + } + ASSERT_TRUE(add_op.ok()); + } + + auto coll = collectionManager.get_collection("Users"); + + // Search for users linked to repo_b + auto result = coll->search("R", {"user_name"}, "$Links(repo_id:=repo_b)", {}, {}, {0}, + 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD).get(); + + ASSERT_EQ(2, result["found"].get()); + ASSERT_EQ(2, result["hits"].size()); + ASSERT_EQ("user_b", result["hits"][0]["document"]["user_id"].get()); + ASSERT_EQ("user_a", result["hits"][1]["document"]["user_id"].get()); + +// collectionManager.drop_collection("Users"); +// collectionManager.drop_collection("Repos"); +// collectionManager.drop_collection("Links"); +} \ No newline at end of file From 0059f8d3fb95a978cbb774f5d31c9d70e63b7706 Mon Sep 17 00:00:00 2001 From: Harpreet Sangar Date: Tue, 7 Feb 2023 10:53:18 +0530 Subject: [PATCH 32/51] Fix tests. --- test/collection_join_test.cpp | 142 ---------------------------------- 1 file changed, 142 deletions(-) diff --git a/test/collection_join_test.cpp b/test/collection_join_test.cpp index 9f22cdda..b25439e6 100644 --- a/test/collection_join_test.cpp +++ b/test/collection_join_test.cpp @@ -710,145 +710,3 @@ TEST_F(CollectionJoinTest, IncludeFieldsByReference_SingleMatch) { // ASSERT_EQ(5, res_obj["hits"][0]["document"].size()); // ASSERT_EQ(1, res_obj["hits"][0]["document"].count("product_id_sequence_id")); } - -TEST_F(CollectionJoinTest, FilterByReferenceField_MultipleMatch) { - auto schema_json = - R"({ - "name": "Users", - "fields": [ - {"name": "user_id", "type": "string"}, - {"name": "user_name", "type": "string"} - ] - })"_json; - std::vector documents = { - R"({ - "user_id": "user_a", - "user_name": "Roshan" - })"_json, - R"({ - "user_id": "user_b", - "user_name": "Ruby" - })"_json, - R"({ - "user_id": "user_c", - "user_name": "Joe" - })"_json, - R"({ - "user_id": "user_d", - "user_name": "Aby" - })"_json - }; - auto collection_create_op = collectionManager.create_collection(schema_json); - ASSERT_TRUE(collection_create_op.ok()); - for (auto const &json: documents) { - auto add_op = collection_create_op.get()->add(json.dump()); - if (!add_op.ok()) { - LOG(INFO) << add_op.error(); - } - ASSERT_TRUE(add_op.ok()); - } - - schema_json = - R"({ - "name": "Repos", - "fields": [ - {"name": "repo_id", "type": "string"}, - {"name": "repo_content", "type": "string"} - ] - })"_json; - documents = { - R"({ - "repo_id": "repo_a", - "repo_content": "body1" - })"_json, - R"({ - "repo_id": "repo_b", - "repo_content": "body2" - })"_json, - R"({ - "repo_id": "repo_c", - "repo_content": "body3" - })"_json - }; - collection_create_op = collectionManager.create_collection(schema_json); - ASSERT_TRUE(collection_create_op.ok()); - for (auto const &json: documents) { - auto add_op = collection_create_op.get()->add(json.dump()); - if (!add_op.ok()) { - LOG(INFO) << add_op.error(); - } - ASSERT_TRUE(add_op.ok()); - } - - schema_json = - R"({ - "name": "Links", - "fields": [ - {"name": "repo_id", "type": "string", "reference": "Repos.repo_id"}, - {"name": "user_id", "type": "string", "reference": "Users.user_id"} - ] - })"_json; - documents = { - R"({ - "repo_id": "repo_a", - "user_id": "user_b" - })"_json, - R"({ - "repo_id": "repo_a", - "user_id": "user_c" - })"_json, - R"({ - "repo_id": "repo_b", - "user_id": "user_a" - })"_json, - R"({ - "repo_id": "repo_b", - "user_id": "user_b" - })"_json, - R"({ - "repo_id": "repo_b", - "user_id": "user_d" - })"_json, - R"({ - "repo_id": "repo_c", - "user_id": "user_a" - })"_json, - R"({ - "repo_id": "repo_c", - "user_id": "user_b" - })"_json, - R"({ - "repo_id": "repo_c", - "user_id": "user_c" - })"_json, - R"({ - "repo_id": "repo_c", - "user_id": "user_d" - })"_json - }; - collection_create_op = collectionManager.create_collection(schema_json); - ASSERT_TRUE(collection_create_op.ok()); - - for (auto const &json: documents) { - auto add_op = collection_create_op.get()->add(json.dump()); - if (!add_op.ok()) { - LOG(INFO) << add_op.error(); - } - ASSERT_TRUE(add_op.ok()); - } - - auto coll = collectionManager.get_collection("Users"); - - // Search for users linked to repo_b - auto result = coll->search("R", {"user_name"}, "$Links(repo_id:=repo_b)", {}, {}, {0}, - 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD).get(); - - ASSERT_EQ(2, result["found"].get()); - ASSERT_EQ(2, result["hits"].size()); - ASSERT_EQ("user_b", result["hits"][0]["document"]["user_id"].get()); - ASSERT_EQ("user_a", result["hits"][1]["document"]["user_id"].get()); - -// collectionManager.drop_collection("Users"); -// collectionManager.drop_collection("Repos"); -// collectionManager.drop_collection("Links"); -} \ No newline at end of file From 0c8edf941f01f2e5bf15e19df04b85817c7ff5c1 Mon Sep 17 00:00:00 2001 From: Harpreet Sangar Date: Thu, 9 Feb 2023 11:50:58 +0530 Subject: [PATCH 33/51] Reference `include_fields`. --- include/collection.h | 2 ++ src/collection.cpp | 25 ++++++++++++++++++------- 2 files changed, 20 insertions(+), 7 deletions(-) diff --git a/include/collection.h b/include/collection.h index 38e91d1f..f0c16236 100644 --- a/include/collection.h +++ b/include/collection.h @@ -463,6 +463,8 @@ public: Option validate_reference_filter(const std::string& filter_query) const; + Option get_reference_field(const std::string & collection_name) const; + Option get_reference_filter_ids(const std::string & filter_query, const std::string & collection_name, std::pair& reference_index_ids) const; diff --git a/src/collection.cpp b/src/collection.cpp index c3ef5a49..6c61a24c 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -2580,9 +2580,7 @@ Option Collection::validate_reference_filter(const std::string& filter_que return Option(true); } -Option Collection::get_reference_filter_ids(const std::string & filter_query, - const std::string & collection_name, - std::pair& reference_index_ids) const { +Option Collection::get_reference_field(const std::string & collection_name) const { std::shared_lock lock(mutex); std::string reference_field_name; @@ -2595,10 +2593,23 @@ Option Collection::get_reference_filter_ids(const std::string & filter_que } if (reference_field_name.empty()) { - return Option(400, "Could not find any field in `" + name + "` referencing the collection `" - + collection_name + "`."); + return Option(400, "Could not find any field in `" + name + "` referencing the collection `" + + collection_name + "`."); } + return Option(reference_field_name); +} + +Option Collection::get_reference_filter_ids(const std::string & filter_query, + const std::string & collection_name, + std::pair& reference_index_ids) const { + auto reference_field_op = get_reference_field(collection_name); + if (!reference_field_op.ok()) { + return Option(reference_field_op.code(), reference_field_op.error()); + } + + std::shared_lock lock(mutex); + const std::string doc_id_prefix = std::to_string(collection_id) + "_" + DOC_ID_PREFIX + "_"; filter_node_t* filter_tree_root = nullptr; Option filter_op = filter::parse_filter_query(filter_query, search_schema, @@ -2608,8 +2619,8 @@ Option Collection::get_reference_filter_ids(const std::string & filter_que } // Reference helper field has the sequence id of other collection's documents. - reference_field_name += REFERENCE_HELPER_FIELD_SUFFIX; - index->do_reference_filtering_with_lock(reference_index_ids, filter_tree_root, reference_field_name); + auto field_name = reference_field_op.get() + REFERENCE_HELPER_FIELD_SUFFIX; + index->do_reference_filtering_with_lock(reference_index_ids, filter_tree_root, field_name); delete filter_tree_root; return Option(true); From d2bc921f1d65fca4ca7cb57e237735cbc98710fb Mon Sep 17 00:00:00 2001 From: Harpreet Sangar Date: Thu, 9 Feb 2023 12:25:16 +0530 Subject: [PATCH 34/51] fix memory leak. --- src/collection.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/collection.cpp b/src/collection.cpp index 6c61a24c..1026bb54 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -3999,6 +3999,8 @@ Option Collection::prune_doc(nlohmann::json& doc, reference_docs.push_back(ref_doc); } + delete[] documents[0].second; + for (const auto &ref_doc: reference_docs) { doc.update(ref_doc); } From 94add54c4354db6dd5213edf2ab212fa18a5e8b0 Mon Sep 17 00:00:00 2001 From: Harpreet Sangar Date: Wed, 15 Feb 2023 16:48:44 +0530 Subject: [PATCH 35/51] temp. --- include/collection.h | 4 ++-- include/index.h | 8 -------- src/collection.cpp | 19 ++++++++++--------- 3 files changed, 12 insertions(+), 19 deletions(-) diff --git a/include/collection.h b/include/collection.h index f0c16236..ab847a4c 100644 --- a/include/collection.h +++ b/include/collection.h @@ -466,8 +466,8 @@ public: Option get_reference_field(const std::string & collection_name) const; Option get_reference_filter_ids(const std::string & filter_query, - const std::string & collection_name, - std::pair& reference_index_ids) const; + filter_result_t& filter_result, + const std::string & collection_name) const; Option validate_reference_filter(const std::string& filter_query) const; diff --git a/include/index.h b/include/index.h index 92b9f7af..0ce10daf 100644 --- a/include/index.h +++ b/include/index.h @@ -702,14 +702,6 @@ public: const std::string& collection_name, const std::string& reference_helper_field_name) const; - Option do_reference_filtering_with_lock(filter_node_t* const filter_tree_root, - filter_result_t& filter_result, - const std::string & reference_helper_field_name) const; - - void do_reference_filtering_with_lock(std::pair& reference_index_ids, - filter_node_t* filter_tree_root, - const std::string& reference_helper_field_name) const; - void refresh_schemas(const std::vector& new_fields, const std::vector& del_fields); // the following methods are not synchronized because their parent calls are synchronized or they are const/static diff --git a/src/collection.cpp b/src/collection.cpp index 1026bb54..45ae966a 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -2601,8 +2601,8 @@ Option Collection::get_reference_field(const std::string & collecti } Option Collection::get_reference_filter_ids(const std::string & filter_query, - const std::string & collection_name, - std::pair& reference_index_ids) const { + filter_result_t& filter_result, + const std::string & collection_name) const { auto reference_field_op = get_reference_field(collection_name); if (!reference_field_op.ok()) { return Option(reference_field_op.code(), reference_field_op.error()); @@ -2612,15 +2612,18 @@ Option Collection::get_reference_filter_ids(const std::string & filter_que const std::string doc_id_prefix = std::to_string(collection_id) + "_" + DOC_ID_PREFIX + "_"; filter_node_t* filter_tree_root = nullptr; - Option filter_op = filter::parse_filter_query(filter_query, search_schema, - store, doc_id_prefix, filter_tree_root); - if(!filter_op.ok()) { - return filter_op; + Option parse_op = filter::parse_filter_query(filter_query, search_schema, + store, doc_id_prefix, filter_tree_root); + if(!parse_op.ok()) { + return parse_op; } // Reference helper field has the sequence id of other collection's documents. auto field_name = reference_field_op.get() + REFERENCE_HELPER_FIELD_SUFFIX; - index->do_reference_filtering_with_lock(reference_index_ids, filter_tree_root, field_name); + auto filter_op = index->do_reference_filtering_with_lock(filter_tree_root, filter_result, field_name); + if (!filter_op.ok()) { + return filter_op; + } delete filter_tree_root; return Option(true); @@ -3999,8 +4002,6 @@ Option Collection::prune_doc(nlohmann::json& doc, reference_docs.push_back(ref_doc); } - delete[] documents[0].second; - for (const auto &ref_doc: reference_docs) { doc.update(ref_doc); } From 2672b1ebd6e5b94bc626e564ce12775f794b757e Mon Sep 17 00:00:00 2001 From: ozanarmagan Date: Thu, 16 Feb 2023 14:45:43 +0300 Subject: [PATCH 36/51] Auto vector generation & Hybrid Search --- WORKSPACE | 5 +- bazel/foreign_cc.patch | 4 +- bazel/foreign_cc_version_compiler.patch | 283 ------------------------ cmake/patch.sh | 18 ++ include/collection.h | 8 - include/field.h | 4 +- src/collection.cpp | 66 +----- src/field.cpp | 2 +- 8 files changed, 25 insertions(+), 365 deletions(-) delete mode 100644 bazel/foreign_cc_version_compiler.patch create mode 100644 cmake/patch.sh diff --git a/WORKSPACE b/WORKSPACE index 94d24423..1dc1401d 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -13,10 +13,7 @@ bazel_compdb_deps() http_archive( name = "rules_foreign_cc", - patches = ["//bazel:foreign_cc.patch", "//bazel:foreign_cc_version_compiler.patch"], - patch_args = [ - "-p1", - ], + patches = ["//bazel:foreign_cc.patch"], sha256 = "2a4d07cd64b0719b39a7c12218a3e507672b82a97b98c6a89d38565894cf7c51", strip_prefix = "rules_foreign_cc-0.9.0", url = "https://github.com/bazelbuild/rules_foreign_cc/archive/refs/tags/0.9.0.tar.gz", diff --git a/bazel/foreign_cc.patch b/bazel/foreign_cc.patch index 9cb52c7c..9af0c8a7 100644 --- a/bazel/foreign_cc.patch +++ b/bazel/foreign_cc.patch @@ -1,5 +1,5 @@ ---- a/foreign_cc/private/configure_script.bzl -+++ b/foreign_cc/private/configure_script.bzl +--- foreign_cc/private/configure_script.bzl ++++ foreign_cc/private/configure_script.bzl @@ -70,7 +70,7 @@ ).lstrip()) diff --git a/bazel/foreign_cc_version_compiler.patch b/bazel/foreign_cc_version_compiler.patch deleted file mode 100644 index d668a1db..00000000 --- a/bazel/foreign_cc_version_compiler.patch +++ /dev/null @@ -1,283 +0,0 @@ -diff --git a/foreign_cc/private/cc_toolchain_util.bzl b/foreign_cc/private/cc_toolchain_util.bzl -index fd7fa4d..188dc5f 100644 ---- a/foreign_cc/private/cc_toolchain_util.bzl -+++ b/foreign_cc/private/cc_toolchain_util.bzl -@@ -265,15 +265,24 @@ def get_tools_info(ctx): - cc_toolchain = cc_toolchain, - ) - -+ cxx = cc_common.get_tool_for_action( -+ feature_configuration = feature_configuration, -+ action_name = ACTION_NAMES.cpp_compile, -+ ) -+ cxx_splitted = cxx.split("/") -+ if(cxx_splitted[-1] == "gcc"): -+ cxx_splitted[-1] = "g++" -+ cxx = "/".join(cxx_splitted) -+ if(cxx_splitted[-1] == "clang"): -+ cxx_splitted = "clang++" -+ cxx = "/".join(cxx_splitted) -+ - return CxxToolsInfo( - cc = cc_common.get_tool_for_action( - feature_configuration = feature_configuration, - action_name = ACTION_NAMES.c_compile, - ), -- cxx = cc_common.get_tool_for_action( -- feature_configuration = feature_configuration, -- action_name = ACTION_NAMES.cpp_compile, -- ), -+ cxx = cxx, - cxx_linker_static = cc_common.get_tool_for_action( - feature_configuration = feature_configuration, - action_name = ACTION_NAMES.cpp_link_static_library, -diff --git a/toolchains/built_toolchains.bzl b/toolchains/built_toolchains.bzl -index 5e59e79..ddf63a5 100644 ---- a/toolchains/built_toolchains.bzl -+++ b/toolchains/built_toolchains.bzl -@@ -28,6 +28,7 @@ _CMAKE_SRCS = { - "3.22.4": [["https://github.com/Kitware/CMake/releases/download/v3.22.4/cmake-3.22.4.tar.gz"], "cmake-3.22.4", "5c55d0b0bc4c191549e3502b8f99a4fe892077611df22b4178cc020626e22a47"], - "3.23.1": [["https://github.com/Kitware/CMake/releases/download/v3.23.1/cmake-3.23.1.tar.gz"], "cmake-3.23.1", "33fd10a8ec687a4d0d5b42473f10459bb92b3ae7def2b745dc10b192760869f3"], - "3.23.2": [["https://github.com/Kitware/CMake/releases/download/v3.23.2/cmake-3.23.2.tar.gz"], "cmake-3.23.2", "f316b40053466f9a416adf981efda41b160ca859e97f6a484b447ea299ff26aa"], -+ "3.25.0": [["https://github.com/Kitware/CMake/releases/download/v3.25.0/cmake-3.25.0.tar.gz"], "cmake-3.25.0", "306463f541555da0942e6f5a0736560f70c487178b9d94a5ae7f34d0538cdd48"], - } - - # buildifier: disable=unnamed-macro -@@ -438,6 +439,18 @@ def _ninja_toolchain(version, register_toolchains): - native.register_toolchains( - "@rules_foreign_cc//toolchains:built_ninja_toolchain", - ) -+ if version == "1.11.1": -+ maybe( -+ http_archive, -+ name = "ninja_build_src", -+ build_file_content = _ALL_CONTENT, -+ sha256 = "31747ae633213f1eda3842686f83c2aa1412e0f5691d1c14dbbcc67fe7400cea", -+ strip_prefix = "ninja-1.11.1", -+ urls = [ -+ "https://github.com/ninja-build/ninja/archive/v1.11.1.tar.gz", -+ ], -+ ) -+ return - if version == "1.11.0": - maybe( - http_archive, -diff --git a/toolchains/prebuilt_toolchains.bzl b/toolchains/prebuilt_toolchains.bzl -index dabfb95..d9c38b4 100644 ---- a/toolchains/prebuilt_toolchains.bzl -+++ b/toolchains/prebuilt_toolchains.bzl -@@ -67,6 +67,115 @@ def prebuilt_toolchains(cmake_version, ninja_version, register_toolchains): - _make_toolchains(register_toolchains) - - def _cmake_toolchains(version, register_toolchains): -+ if "3.25.0" == version: -+ maybe( -+ http_archive, -+ name = "cmake-3.25.0-linux-aarch64", -+ urls = [ -+ "https://github.com/Kitware/CMake/releases/download/v3.25.0/cmake-3.25.0-linux-aarch64.tar.gz", -+ ], -+ sha256 = "27da36d6debe9b30f5c498554ae40cd621a55736f5f2ae2618ed95722a59965a", -+ strip_prefix = "cmake-3.25.0-linux-aarch64", -+ build_file_content = _CMAKE_BUILD_FILE.format( -+ bin = "cmake", -+ env = "{}", -+ ), -+ ) -+ -+ maybe( -+ http_archive, -+ name = "cmake-3.25.0-linux-x86_64", -+ urls = [ -+ "https://github.com/Kitware/CMake/releases/download/v3.25.0/cmake-3.25.0-linux-x86_64.tar.gz", -+ ], -+ sha256 = "ac634d6f0a81d7089adc7be5acff66a6bee3b08615f9a947858ce92a9ef59c8b", -+ strip_prefix = "cmake-3.25.0-linux-x86_64", -+ build_file_content = _CMAKE_BUILD_FILE.format( -+ bin = "cmake", -+ env = "{}", -+ ), -+ ) -+ -+ maybe( -+ http_archive, -+ name = "cmake-3.25.0-macos-universal", -+ urls = [ -+ "https://github.com/Kitware/CMake/releases/download/v3.25.0/cmake-3.25.0-macos-universal.tar.gz", -+ ], -+ sha256 = "c088e761534a2078cd9d0581d39f02d3f9ed05302e33135b55c6d619b263b4c3", -+ strip_prefix = "cmake-3.25.0-macos-universal/CMake.app/Contents", -+ build_file_content = _CMAKE_BUILD_FILE.format( -+ bin = "cmake", -+ env = "{}", -+ ), -+ ) -+ -+ maybe( -+ http_archive, -+ name = "cmake-3.25.0-windows-i386", -+ urls = [ -+ "https://github.com/Kitware/CMake/releases/download/v3.25.0/cmake-3.25.0-windows-i386.zip", -+ ], -+ sha256 = "ddd115257a19ff3dd18fc63f32a00ae742f8b62d2e39bc354629903512f99783", -+ strip_prefix = "cmake-3.25.0-windows-i386", -+ build_file_content = _CMAKE_BUILD_FILE.format( -+ bin = "cmake.exe", -+ env = "{}", -+ ), -+ ) -+ -+ maybe( -+ http_archive, -+ name = "cmake-3.25.0-windows-x86_64", -+ urls = [ -+ "https://github.com/Kitware/CMake/releases/download/v3.25.0/cmake-3.25.0-windows-x86_64.zip", -+ ], -+ sha256 = "b46030c10cab1170355952f9ac59f7e6dabc248070fc53f15dff11d4ed2910f8", -+ strip_prefix = "cmake-3.25.0-windows-x86_64", -+ build_file_content = _CMAKE_BUILD_FILE.format( -+ bin = "cmake.exe", -+ env = "{}", -+ ), -+ ) -+ -+ # buildifier: leave-alone -+ maybe( -+ prebuilt_toolchains_repository, -+ name = "cmake_3.25.0_toolchains", -+ repos = { -+ "cmake-3.25.0-linux-aarch64": [ -+ "@platforms//cpu:aarch64", -+ "@platforms//os:linux", -+ ], -+ "cmake-3.25.0-linux-x86_64": [ -+ "@platforms//cpu:x86_64", -+ "@platforms//os:linux", -+ ], -+ "cmake-3.25.0-macos-universal": [ -+ "@platforms//os:macos", -+ ], -+ "cmake-3.25.0-windows-i386": [ -+ "@platforms//cpu:x86_32", -+ "@platforms//os:windows", -+ ], -+ "cmake-3.25.0-windows-x86_64": [ -+ "@platforms//cpu:x86_64", -+ "@platforms//os:windows", -+ ], -+ }, -+ tool = "cmake", -+ ) -+ -+ if register_toolchains: -+ native.register_toolchains( -+ "@cmake_3.25.0_toolchains//:cmake-3.25.0-linux-aarch64_toolchain", -+ "@cmake_3.25.0_toolchains//:cmake-3.25.0-linux-x86_64_toolchain", -+ "@cmake_3.25.0_toolchains//:cmake-3.25.0-macos-universal_toolchain", -+ "@cmake_3.25.0_toolchains//:cmake-3.25.0-windows-i386_toolchain", -+ "@cmake_3.25.0_toolchains//:cmake-3.25.0-windows-x86_64_toolchain", -+ ) -+ -+ return - if "3.23.2" == version: - maybe( - http_archive, -@@ -4196,6 +4305,78 @@ def _cmake_toolchains(version, register_toolchains): - fail("Unsupported version: " + str(version)) - - def _ninja_toolchains(version, register_toolchains): -+ if "1.11.1" == version: -+ maybe( -+ http_archive, -+ name = "ninja_1.11.1_linux", -+ urls = [ -+ "https://github.com/ninja-build/ninja/releases/download/v1.11.1/ninja-linux.zip", -+ ], -+ sha256 = "b901ba96e486dce377f9a070ed4ef3f79deb45f4ffe2938f8e7ddc69cfb3df77", -+ strip_prefix = "", -+ build_file_content = _NINJA_BUILD_FILE.format( -+ bin = "ninja", -+ env = "{\"NINJA\": \"$(execpath :ninja_bin)\"}", -+ ), -+ ) -+ -+ maybe( -+ http_archive, -+ name = "ninja_1.11.1_mac", -+ urls = [ -+ "https://github.com/ninja-build/ninja/releases/download/v1.11.1/ninja-mac.zip", -+ ], -+ sha256 = "482ecb23c59ae3d4f158029112de172dd96bb0e97549c4b1ca32d8fad11f873e", -+ strip_prefix = "", -+ build_file_content = _NINJA_BUILD_FILE.format( -+ bin = "ninja", -+ env = "{\"NINJA\": \"$(execpath :ninja_bin)\"}", -+ ), -+ ) -+ -+ maybe( -+ http_archive, -+ name = "ninja_1.11.1_win", -+ urls = [ -+ "https://github.com/ninja-build/ninja/releases/download/v1.11.1/ninja-win.zip", -+ ], -+ sha256 = "524b344a1a9a55005eaf868d991e090ab8ce07fa109f1820d40e74642e289abc", -+ strip_prefix = "", -+ build_file_content = _NINJA_BUILD_FILE.format( -+ bin = "ninja.exe", -+ env = "{\"NINJA\": \"$(execpath :ninja_bin)\"}", -+ ), -+ ) -+ -+ # buildifier: leave-alone -+ maybe( -+ prebuilt_toolchains_repository, -+ name = "ninja_1.11.1_toolchains", -+ repos = { -+ "ninja_1.11.1_linux": [ -+ "@platforms//cpu:x86_64", -+ "@platforms//os:linux", -+ ], -+ "ninja_1.11.1_mac": [ -+ "@platforms//cpu:x86_64", -+ "@platforms//os:macos", -+ ], -+ "ninja_1.11.1_win": [ -+ "@platforms//cpu:x86_64", -+ "@platforms//os:windows", -+ ], -+ }, -+ tool = "ninja", -+ ) -+ -+ if register_toolchains: -+ native.register_toolchains( -+ "@ninja_1.11.1_toolchains//:ninja_1.11.1_linux_toolchain", -+ "@ninja_1.11.1_toolchains//:ninja_1.11.1_mac_toolchain", -+ "@ninja_1.11.1_toolchains//:ninja_1.11.1_win_toolchain", -+ ) -+ -+ return - if "1.11.0" == version: - maybe( - http_archive, -diff --git a/toolchains/prebuilt_toolchains.py b/toolchains/prebuilt_toolchains.py -index 5288b27..a193021 100755 ---- a/toolchains/prebuilt_toolchains.py -+++ b/toolchains/prebuilt_toolchains.py -@@ -10,6 +10,7 @@ CMAKE_SHA256_URL_TEMPLATE = "https://cmake.org/files/v{minor}/cmake-{full}-SHA-2 - CMAKE_URL_TEMPLATE = "https://github.com/Kitware/CMake/releases/download/v{full}/{file}" - - CMAKE_VERSIONS = [ -+ "3.25.0", - "3.23.2", - "3.23.1", - "3.22.4", -@@ -116,6 +117,7 @@ NINJA_TARGETS = { - } - - NINJA_VERSIONS = ( -+ "1.11.1", - "1.10.2", - "1.10.1", - "1.10.0", diff --git a/cmake/patch.sh b/cmake/patch.sh new file mode 100644 index 00000000..410c1254 --- /dev/null +++ b/cmake/patch.sh @@ -0,0 +1,18 @@ +#! /bin/sh + +set +x +set -euo pipefail + + +patch="$1"; shift + +# ignore the error if the patch is already applied +if ! out=$(patch -p1 -N -r "rejects.bin" < "$patch") +then + echo "$out" | grep -q "Reversed (or previously applied) patch detected! Skipping patch." + test -s "rejects.bin" # Make sure we have rejects. +else + test -f "rejects.bin" && ! test -s "rejects.bin" # Make sure we have no rejects. +fi + +rm -f "rejects.bin" \ No newline at end of file diff --git a/include/collection.h b/include/collection.h index ab847a4c..8d77cfde 100644 --- a/include/collection.h +++ b/include/collection.h @@ -463,14 +463,6 @@ public: Option validate_reference_filter(const std::string& filter_query) const; - Option get_reference_field(const std::string & collection_name) const; - - Option get_reference_filter_ids(const std::string & filter_query, - filter_result_t& filter_result, - const std::string & collection_name) const; - - Option validate_reference_filter(const std::string& filter_query) const; - Option get(const std::string & id) const; Option remove(const std::string & id, bool remove_from_store = true); diff --git a/include/field.h b/include/field.h index 63feff0f..305675ab 100644 --- a/include/field.h +++ b/include/field.h @@ -411,7 +411,7 @@ struct field { static Option json_field_to_field(bool enable_nested_fields, nlohmann::json& field_json, std::vector& the_fields, - string& fallback_field_type, size_t& num_auto_detect_fields); + string& fallback_field_type, size_t& num_auto_detect_fields,const nlohmann::json& all_fields_json = nlohmann::json()); static Option json_fields_to_fields(bool enable_nested_fields, nlohmann::json& fields_json, @@ -475,7 +475,7 @@ struct field { } auto op = json_field_to_field(enable_nested_fields, - field_json, the_fields, fallback_field_type, num_auto_detect_fields); + field_json, the_fields, fallback_field_type, num_auto_detect_fields, fields_json); if(!op.ok()) { return op; } diff --git a/src/collection.cpp b/src/collection.cpp index 45ae966a..af35b70d 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -246,6 +246,7 @@ nlohmann::json Collection::get_summary_json() const { field_json[fields::reference] = coll_field.reference; } + fields_arr.push_back(field_json); } @@ -2580,71 +2581,6 @@ Option Collection::validate_reference_filter(const std::string& filter_que return Option(true); } -Option Collection::get_reference_field(const std::string & collection_name) const { - std::shared_lock lock(mutex); - - std::string reference_field_name; - for (auto const& pair: reference_fields) { - auto reference_pair = pair.second; - if (reference_pair.collection == collection_name) { - reference_field_name = reference_pair.field; - break; - } - } - - if (reference_field_name.empty()) { - return Option(400, "Could not find any field in `" + name + "` referencing the collection `" - + collection_name + "`."); - } - - return Option(reference_field_name); -} - -Option Collection::get_reference_filter_ids(const std::string & filter_query, - filter_result_t& filter_result, - const std::string & collection_name) const { - auto reference_field_op = get_reference_field(collection_name); - if (!reference_field_op.ok()) { - return Option(reference_field_op.code(), reference_field_op.error()); - } - - std::shared_lock lock(mutex); - - const std::string doc_id_prefix = std::to_string(collection_id) + "_" + DOC_ID_PREFIX + "_"; - filter_node_t* filter_tree_root = nullptr; - Option parse_op = filter::parse_filter_query(filter_query, search_schema, - store, doc_id_prefix, filter_tree_root); - if(!parse_op.ok()) { - return parse_op; - } - - // Reference helper field has the sequence id of other collection's documents. - auto field_name = reference_field_op.get() + REFERENCE_HELPER_FIELD_SUFFIX; - auto filter_op = index->do_reference_filtering_with_lock(filter_tree_root, filter_result, field_name); - if (!filter_op.ok()) { - return filter_op; - } - - delete filter_tree_root; - return Option(true); -} - -Option Collection::validate_reference_filter(const std::string& filter_query) const { - std::shared_lock lock(mutex); - - const std::string doc_id_prefix = std::to_string(collection_id) + "_" + DOC_ID_PREFIX + "_"; - filter_node_t* filter_tree_root = nullptr; - Option filter_op = filter::parse_filter_query(filter_query, search_schema, - store, doc_id_prefix, filter_tree_root); - - if(!filter_op.ok()) { - return filter_op; - } - - delete filter_tree_root; - return Option(true); -} - bool Collection::facet_value_to_string(const facet &a_facet, const facet_count_t &facet_count, const nlohmann::json &document, std::string &value) const { diff --git a/src/field.cpp b/src/field.cpp index 129c7512..be2000b0 100644 --- a/src/field.cpp +++ b/src/field.cpp @@ -523,7 +523,7 @@ Option filter::parse_filter_query(const std::string& filter_query, Option field::json_field_to_field(bool enable_nested_fields, nlohmann::json& field_json, std::vector& the_fields, - string& fallback_field_type, size_t& num_auto_detect_fields) { + string& fallback_field_type, size_t& num_auto_detect_fields, const nlohmann::json& all_fields_json) { if(field_json["name"] == "id") { // No field should exist with the name "id" as it is reserved for internal use From aee771cebcb2a2afefd36c25ba534aa47d454281 Mon Sep 17 00:00:00 2001 From: ozanarmagan Date: Tue, 21 Feb 2023 13:02:47 +0300 Subject: [PATCH 37/51] Review Changes --- include/field.h | 5 ++--- src/field.cpp | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/include/field.h b/include/field.h index 305675ab..18a1d4b7 100644 --- a/include/field.h +++ b/include/field.h @@ -11,7 +11,6 @@ #include #include "json.hpp" #include "text_embedder_manager.h" -#include namespace field_types { // first field value indexed will determine the type @@ -411,7 +410,7 @@ struct field { static Option json_field_to_field(bool enable_nested_fields, nlohmann::json& field_json, std::vector& the_fields, - string& fallback_field_type, size_t& num_auto_detect_fields,const nlohmann::json& all_fields_json = nlohmann::json()); + string& fallback_field_type, size_t& num_auto_detect_fields); static Option json_fields_to_fields(bool enable_nested_fields, nlohmann::json& fields_json, @@ -475,7 +474,7 @@ struct field { } auto op = json_field_to_field(enable_nested_fields, - field_json, the_fields, fallback_field_type, num_auto_detect_fields, fields_json); + field_json, the_fields, fallback_field_type, num_auto_detect_fields); if(!op.ok()) { return op; } diff --git a/src/field.cpp b/src/field.cpp index be2000b0..129c7512 100644 --- a/src/field.cpp +++ b/src/field.cpp @@ -523,7 +523,7 @@ Option filter::parse_filter_query(const std::string& filter_query, Option field::json_field_to_field(bool enable_nested_fields, nlohmann::json& field_json, std::vector& the_fields, - string& fallback_field_type, size_t& num_auto_detect_fields, const nlohmann::json& all_fields_json) { + string& fallback_field_type, size_t& num_auto_detect_fields) { if(field_json["name"] == "id") { // No field should exist with the name "id" as it is reserved for internal use From ff9cc895e2797cfb0def62d4914c7e23ef4d9773 Mon Sep 17 00:00:00 2001 From: Harpreet Sangar Date: Tue, 7 Mar 2023 11:13:36 +0530 Subject: [PATCH 38/51] Undo static linking change. --- .bazelrc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.bazelrc b/.bazelrc index 933545b7..dd960251 100644 --- a/.bazelrc +++ b/.bazelrc @@ -5,3 +5,5 @@ build --cxxopt="-std=c++17" test --jobs=6 build --enable_platform_specific_config + +build:linux --action_env=BAZEL_LINKLIBS="-l%:libstdc++.a -l%:libgcc.a" \ No newline at end of file From 84787510c8b956a0152e0f2f342cc63092e9a0b7 Mon Sep 17 00:00:00 2001 From: Harpreet Sangar Date: Tue, 7 Mar 2023 14:19:36 +0530 Subject: [PATCH 39/51] Handle reference filter during approximation. --- include/collection.h | 3 ++ include/index.h | 7 ++++ src/collection.cpp | 32 ++++++++++----- src/index.cpp | 98 +++++++++++++++++++++++++++----------------- 4 files changed, 92 insertions(+), 48 deletions(-) diff --git a/include/collection.h b/include/collection.h index 8d77cfde..4e08d4da 100644 --- a/include/collection.h +++ b/include/collection.h @@ -457,6 +457,9 @@ public: Option get_filter_ids(const std::string & filter_query, filter_result_t& filter_result) const; + Option get_approximate_reference_filter_ids(const std::string& filter_query, + uint32_t& filter_ids_length) const; + Option get_reference_filter_ids(const std::string& filter_query, filter_result_t& filter_result, const std::string& collection_name) const; diff --git a/include/index.h b/include/index.h index 0ce10daf..f7b5f3fe 100644 --- a/include/index.h +++ b/include/index.h @@ -486,6 +486,10 @@ private: const int64_t& range_end_value, uint32_t& filter_ids_length) const; + Option approximate_filter_ids(const filter& a_filter, + uint32_t& filter_ids_length, + const std::string& collection_name) const; + Option rearranging_recursive_filter(filter_node_t* const filter_tree_root, filter_result_t& result, const std::string& collection_name = "") const; @@ -702,6 +706,9 @@ public: const std::string& collection_name, const std::string& reference_helper_field_name) const; + Option get_approximate_reference_filter_ids_with_lock(filter_node_t* const filter_tree_root, + uint32_t& filter_ids_length) const; + void refresh_schemas(const std::vector& new_fields, const std::vector& del_fields); // the following methods are not synchronized because their parent calls are synchronized or they are const/static diff --git a/src/collection.cpp b/src/collection.cpp index af35b70d..9d1c0635 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -2513,10 +2513,9 @@ Option Collection::get_filter_ids(const std::string& filter_query, filter_ return filter_op; } - index->do_filtering_with_lock(filter_tree_root, filter_result, name); + std::unique_ptr filter_tree_root_guard(filter_tree_root); - delete filter_tree_root; - return Option(true); + return index->do_filtering_with_lock(filter_tree_root, filter_result, name); } Option Collection::get_reference_field(const std::string & collection_name) const { @@ -2537,6 +2536,23 @@ Option Collection::get_reference_field(const std::string & collecti return Option(reference_field_name); } +Option Collection::get_approximate_reference_filter_ids(const std::string& filter_query, + uint32_t& filter_ids_length) const { + std::shared_lock lock(mutex); + + const std::string doc_id_prefix = std::to_string(collection_id) + "_" + DOC_ID_PREFIX + "_"; + filter_node_t* filter_tree_root = nullptr; + Option parse_op = filter::parse_filter_query(filter_query, search_schema, + store, doc_id_prefix, filter_tree_root); + if(!parse_op.ok()) { + return parse_op; + } + + std::unique_ptr filter_tree_root_guard(filter_tree_root); + + return index->get_approximate_reference_filter_ids_with_lock(filter_tree_root, filter_ids_length); +} + Option Collection::get_reference_filter_ids(const std::string & filter_query, filter_result_t& filter_result, const std::string & collection_name) const { @@ -2555,15 +2571,11 @@ Option Collection::get_reference_filter_ids(const std::string & filter_que return parse_op; } + std::unique_ptr filter_tree_root_guard(filter_tree_root); + // Reference helper field has the sequence id of other collection's documents. auto field_name = reference_field_op.get() + REFERENCE_HELPER_FIELD_SUFFIX; - auto filter_op = index->do_reference_filtering_with_lock(filter_tree_root, filter_result, name, field_name); - if (!filter_op.ok()) { - return filter_op; - } - - delete filter_tree_root; - return Option(true); + return index->do_reference_filtering_with_lock(filter_tree_root, filter_result, name, field_name); } Option Collection::validate_reference_filter(const std::string& filter_query) const { diff --git a/src/index.cpp b/src/index.cpp index e88badc0..b704356d 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -1952,45 +1952,19 @@ void Index::aproximate_numerical_match(num_tree_t* const num_tree, num_tree->approx_search_count(comparator, value, filter_ids_length); } -Option Index::rearrange_filter_tree(filter_node_t* const root, - uint32_t& filter_ids_length, - const std::string& collection_name) const { - if (root == nullptr) { - return Option(true); +Option Index::approximate_filter_ids(const filter& a_filter, + uint32_t& filter_ids_length, + const std::string& collection_name) const { + if (!a_filter.referenced_collection_name.empty()) { + auto& cm = CollectionManager::get_instance(); + auto collection = cm.get_collection(a_filter.referenced_collection_name); + if (collection == nullptr) { + return Option(400, "Referenced collection `" + a_filter.referenced_collection_name + "` not found."); + } + + return collection->get_approximate_reference_filter_ids(a_filter.field_name, filter_ids_length); } - if (root->isOperator) { - uint32_t l_filter_ids_length = 0; - if (root->left != nullptr) { - auto rearrange_op = rearrange_filter_tree(root->left, l_filter_ids_length, collection_name); - if (!rearrange_op.ok()) { - return rearrange_op; - } - } - - uint32_t r_filter_ids_length = 0; - if (root->right != nullptr) { - auto rearrange_op = rearrange_filter_tree(root->right, r_filter_ids_length, collection_name); - if (!rearrange_op.ok()) { - return rearrange_op; - } - } - - if (root->filter_operator == AND) { - filter_ids_length = std::min(l_filter_ids_length, r_filter_ids_length); - } else { - filter_ids_length = l_filter_ids_length + r_filter_ids_length; - } - - if (l_filter_ids_length > r_filter_ids_length) { - std::swap(root->left, root->right); - } - - return Option(true); - } - - auto a_filter = root->filter_exp; - if (a_filter.field_name == "id") { filter_ids_length = a_filter.values.size(); return Option(true); @@ -2062,7 +2036,7 @@ Option Index::rearrange_filter_tree(filter_node_t* const root, while (tokenizer.next(str_token, token_index)) { auto const leaf = (art_leaf *) art_search(t, (const unsigned char*) str_token.c_str(), - str_token.length()+1); + str_token.length()+1); if (leaf == nullptr) { continue; } @@ -2080,6 +2054,47 @@ Option Index::rearrange_filter_tree(filter_node_t* const root, return Option(true); } +Option Index::rearrange_filter_tree(filter_node_t* const root, + uint32_t& filter_ids_length, + const std::string& collection_name) const { + if (root == nullptr) { + return Option(true); + } + + if (root->isOperator) { + uint32_t l_filter_ids_length = 0; + if (root->left != nullptr) { + auto rearrange_op = rearrange_filter_tree(root->left, l_filter_ids_length, collection_name); + if (!rearrange_op.ok()) { + return rearrange_op; + } + } + + uint32_t r_filter_ids_length = 0; + if (root->right != nullptr) { + auto rearrange_op = rearrange_filter_tree(root->right, r_filter_ids_length, collection_name); + if (!rearrange_op.ok()) { + return rearrange_op; + } + } + + if (root->filter_operator == AND) { + filter_ids_length = std::min(l_filter_ids_length, r_filter_ids_length); + } else { + filter_ids_length = l_filter_ids_length + r_filter_ids_length; + } + + if (l_filter_ids_length > r_filter_ids_length) { + std::swap(root->left, root->right); + } + + return Option(true); + } + + approximate_filter_ids(root->filter_exp, filter_ids_length, collection_name); + return Option(true); +} + Option Index::rearranging_recursive_filter(filter_node_t* const filter_tree_root, filter_result_t& result, const std::string& collection_name) const { @@ -2233,6 +2248,13 @@ Option Index::do_reference_filtering_with_lock(filter_node_t* const filter return Option(true); } +Option Index::get_approximate_reference_filter_ids_with_lock(filter_node_t* const filter_tree_root, + uint32_t& filter_ids_length) const { + std::shared_lock lock(mutex); + + return rearrange_filter_tree(filter_tree_root, filter_ids_length); +} + Option Index::run_search(search_args* search_params, const std::string& collection_name) { return search(search_params->field_query_tokens, search_params->search_fields, From 4ae42c45cbce6447da7c9edce8c1c141e8fddea5 Mon Sep 17 00:00:00 2001 From: Harpreet Sangar Date: Tue, 7 Mar 2023 16:13:10 +0530 Subject: [PATCH 40/51] Delete `patch.sh`. --- cmake/patch.sh | 18 ------------------ 1 file changed, 18 deletions(-) delete mode 100644 cmake/patch.sh diff --git a/cmake/patch.sh b/cmake/patch.sh deleted file mode 100644 index 410c1254..00000000 --- a/cmake/patch.sh +++ /dev/null @@ -1,18 +0,0 @@ -#! /bin/sh - -set +x -set -euo pipefail - - -patch="$1"; shift - -# ignore the error if the patch is already applied -if ! out=$(patch -p1 -N -r "rejects.bin" < "$patch") -then - echo "$out" | grep -q "Reversed (or previously applied) patch detected! Skipping patch." - test -s "rejects.bin" # Make sure we have rejects. -else - test -f "rejects.bin" && ! test -s "rejects.bin" # Make sure we have no rejects. -fi - -rm -f "rejects.bin" \ No newline at end of file From c6386b0c2f2ccc0af7cf3fe8dbbaf577ee444c8d Mon Sep 17 00:00:00 2001 From: Harpreet Sangar Date: Tue, 7 Mar 2023 18:28:19 +0530 Subject: [PATCH 41/51] Add tests for rearranging filter tree and approx filter match count. --- include/field.h | 2 +- test/collection_specific_more_test.cpp | 206 +++++++++++++++++++++++++ 2 files changed, 207 insertions(+), 1 deletion(-) diff --git a/include/field.h b/include/field.h index 18a1d4b7..7eee5c79 100644 --- a/include/field.h +++ b/include/field.h @@ -513,7 +513,7 @@ struct filter { bool apply_not_equals = false; // Would store `Foo` in case of a filter expression like `$Foo(bar := baz)` - std::string referenced_collection_name; + std::string referenced_collection_name = ""; static const std::string RANGE_OPERATOR() { return ".."; diff --git a/test/collection_specific_more_test.cpp b/test/collection_specific_more_test.cpp index 070b0580..8afdaf84 100644 --- a/test/collection_specific_more_test.cpp +++ b/test/collection_specific_more_test.cpp @@ -1973,3 +1973,209 @@ TEST_F(CollectionSpecificMoreTest, CrossFieldTypoAndPrefixWithWeights) { "", "", {2, 3}).get(); ASSERT_EQ(1, res["hits"].size()); } + +TEST_F(CollectionSpecificMoreTest, RearrangingFilterTree) { + nlohmann::json schema = + R"({ + "name": "Collection", + "fields": [ + {"name": "name", "type": "string"}, + {"name": "age", "type": "int32"}, + {"name": "years", "type": "int32[]"}, + {"name": "rating", "type": "float"} + ] + })"_json; + + Collection* coll = collectionManager.create_collection(schema).get(); + + std::ifstream infile(std::string(ROOT_DIR)+"test/numeric_array_documents.jsonl"); + std::string json_line; + while (std::getline(infile, json_line)) { + auto add_op = coll->add(json_line); + ASSERT_TRUE(add_op.ok()); + } + infile.close(); + + const std::string doc_id_prefix = std::to_string(coll->get_collection_id()) + "_" + Collection::DOC_ID_PREFIX + "_"; + filter_node_t* filter_tree_root = nullptr; + Option filter_op = filter::parse_filter_query("years:>2000 && ((age:<30 && rating:>5) || (age:>50 && rating:<5))", + coll->get_schema(), store, doc_id_prefix, filter_tree_root); + ASSERT_TRUE(filter_op.ok()); + std::unique_ptr filter_tree_root_guard(filter_tree_root); + + // && + // / \ + // years>2000 || + // 4 / \ + // / && + // && / \ + // / \ age>50 rating<5 + // / \ 1 2 + // / \ + // age<30 rating>5 + // 2 3 + ASSERT_TRUE(filter_tree_root != nullptr); + ASSERT_TRUE(filter_tree_root->isOperator); + ASSERT_EQ(filter_tree_root->filter_operator, AND); + + auto root = filter_tree_root->left; + ASSERT_TRUE(root != nullptr); + ASSERT_FALSE(root->isOperator); + ASSERT_EQ(root->filter_exp.field_name, "years"); + ASSERT_TRUE(root->left == nullptr); + ASSERT_TRUE(root->right == nullptr); + + root = filter_tree_root->right; + ASSERT_TRUE(root != nullptr); + ASSERT_TRUE(root->isOperator); + ASSERT_EQ(root->filter_operator, OR); + + root = filter_tree_root->right->left; + ASSERT_TRUE(root != nullptr); + ASSERT_TRUE(root->isOperator); + ASSERT_EQ(root->filter_operator, AND); + + root = filter_tree_root->right->left->left; + ASSERT_TRUE(root != nullptr); + ASSERT_FALSE(root->isOperator); + ASSERT_EQ(root->filter_exp.field_name, "age"); + ASSERT_EQ(root->filter_exp.comparators.front(), LESS_THAN); + ASSERT_EQ(root->filter_exp.values.front(), "30"); + ASSERT_TRUE(root->left == nullptr); + ASSERT_TRUE(root->right == nullptr); + + root = filter_tree_root->right->left->right; + ASSERT_TRUE(root != nullptr); + ASSERT_FALSE(root->isOperator); + ASSERT_EQ(root->filter_exp.field_name, "rating"); + ASSERT_EQ(root->filter_exp.comparators.front(), GREATER_THAN); + ASSERT_EQ(root->filter_exp.values.front(), "5"); + ASSERT_TRUE(root->left == nullptr); + ASSERT_TRUE(root->right == nullptr); + + root = filter_tree_root->right->right; + ASSERT_TRUE(root != nullptr); + ASSERT_TRUE(root->isOperator); + ASSERT_EQ(root->filter_operator, AND); + + root = filter_tree_root->right->right->left; + ASSERT_TRUE(root != nullptr); + ASSERT_FALSE(root->isOperator); + ASSERT_EQ(root->filter_exp.field_name, "age"); + ASSERT_EQ(root->filter_exp.comparators.front(), GREATER_THAN); + ASSERT_EQ(root->filter_exp.values.front(), "50"); + ASSERT_TRUE(root->left == nullptr); + ASSERT_TRUE(root->right == nullptr); + + root = filter_tree_root->right->right->right; + ASSERT_TRUE(root != nullptr); + ASSERT_FALSE(root->isOperator); + ASSERT_EQ(root->filter_exp.field_name, "rating"); + ASSERT_EQ(root->filter_exp.comparators.front(), LESS_THAN); + ASSERT_EQ(root->filter_exp.values.front(), "5"); + ASSERT_TRUE(root->left == nullptr); + ASSERT_TRUE(root->right == nullptr); + + filter_result_t result; + // Internally calls rearranging_recursive_filter + coll->_get_index()->do_filtering_with_lock(filter_tree_root, result); + + // && + // / \ + // || years>2000 + // / \ + // && \ + // / \ \ + // age>50 rating<5 && + // / \ + // age<30 rating>5 + ASSERT_TRUE(filter_tree_root != nullptr); + ASSERT_TRUE(filter_tree_root->isOperator); + ASSERT_EQ(filter_tree_root->filter_operator, AND); + + root = filter_tree_root->left; + ASSERT_TRUE(root != nullptr); + ASSERT_TRUE(root->isOperator); + ASSERT_EQ(root->filter_operator, OR); + + root = filter_tree_root->left->left; + ASSERT_TRUE(root != nullptr); + ASSERT_TRUE(root->isOperator); + ASSERT_EQ(root->filter_operator, AND); + + root = filter_tree_root->left->left->left; + ASSERT_TRUE(root != nullptr); + ASSERT_FALSE(root->isOperator); + ASSERT_EQ(root->filter_exp.field_name, "age"); + ASSERT_EQ(root->filter_exp.comparators.front(), GREATER_THAN); + ASSERT_EQ(root->filter_exp.values.front(), "50"); + ASSERT_TRUE(root->left == nullptr); + ASSERT_TRUE(root->right == nullptr); + + root = filter_tree_root->left->left->right; + ASSERT_TRUE(root != nullptr); + ASSERT_FALSE(root->isOperator); + ASSERT_EQ(root->filter_exp.field_name, "rating"); + ASSERT_EQ(root->filter_exp.comparators.front(), LESS_THAN); + ASSERT_EQ(root->filter_exp.values.front(), "5"); + ASSERT_TRUE(root->left == nullptr); + ASSERT_TRUE(root->right == nullptr); + + root = filter_tree_root->left->right; + ASSERT_TRUE(root != nullptr); + ASSERT_TRUE(root->isOperator); + ASSERT_EQ(root->filter_operator, AND); + + root = filter_tree_root->left->right->left; + ASSERT_TRUE(root != nullptr); + ASSERT_FALSE(root->isOperator); + ASSERT_EQ(root->filter_exp.field_name, "age"); + ASSERT_EQ(root->filter_exp.comparators.front(), LESS_THAN); + ASSERT_EQ(root->filter_exp.values.front(), "30"); + ASSERT_TRUE(root->left == nullptr); + ASSERT_TRUE(root->right == nullptr); + + root = filter_tree_root->left->right->right; + ASSERT_TRUE(root != nullptr); + ASSERT_FALSE(root->isOperator); + ASSERT_EQ(root->filter_exp.field_name, "rating"); + ASSERT_EQ(root->filter_exp.comparators.front(), GREATER_THAN); + ASSERT_EQ(root->filter_exp.values.front(), "5"); + ASSERT_TRUE(root->left == nullptr); + ASSERT_TRUE(root->right == nullptr); + + root = filter_tree_root->right; + ASSERT_TRUE(root != nullptr); + ASSERT_FALSE(root->isOperator); + ASSERT_EQ(root->filter_exp.field_name, "years"); + ASSERT_TRUE(root->left == nullptr); + ASSERT_TRUE(root->right == nullptr); +} + +TEST_F(CollectionSpecificMoreTest, ApproxFilterMatchCount) { + nlohmann::json schema = + R"({ + "name": "Collection", + "fields": [ + {"name": "name", "type": "string"}, + {"name": "age", "type": "int32"}, + {"name": "years", "type": "int32[]"}, + {"name": "rating", "type": "float"} + ] + })"_json; + + Collection *coll = collectionManager.create_collection(schema).get(); + + std::ifstream infile(std::string(ROOT_DIR) + "test/numeric_array_documents.jsonl"); + std::string json_line; + while (std::getline(infile, json_line)) { + auto add_op = coll->add(json_line); + ASSERT_TRUE(add_op.ok()); + } + infile.close(); + + uint32_t approx_count; + coll->get_approximate_reference_filter_ids("years:>2000 && ((age:<30 && rating:>5) || (age:>50 && rating:<5))", + approx_count); + ASSERT_EQ(approx_count, 3); +} \ No newline at end of file From db555d36ad4392fb9be38b181bf5f7a99416d566 Mon Sep 17 00:00:00 2001 From: Harpreet Sangar Date: Thu, 9 Mar 2023 10:51:22 +0530 Subject: [PATCH 42/51] Refactor tests. --- include/index.h | 24 ++++++++--------- src/index.cpp | 18 ++++++------- test/collection_specific_more_test.cpp | 36 ++++++++++++++++++++++---- 3 files changed, 52 insertions(+), 26 deletions(-) diff --git a/include/index.h b/include/index.h index f7b5f3fe..ff8431c8 100644 --- a/include/index.h +++ b/include/index.h @@ -486,14 +486,6 @@ private: const int64_t& range_end_value, uint32_t& filter_ids_length) const; - Option approximate_filter_ids(const filter& a_filter, - uint32_t& filter_ids_length, - const std::string& collection_name) const; - - Option rearranging_recursive_filter(filter_node_t* const filter_tree_root, - filter_result_t& result, - const std::string& collection_name = "") const; - Option recursive_filter(filter_node_t* const root, filter_result_t& result, const std::string& collection_name = "") const; @@ -502,10 +494,6 @@ private: filter_result_t& result, const std::string& collection_name = "") const; - Option rearrange_filter_tree(filter_node_t* const root, - uint32_t& filter_ids_length, - const std::string& collection_name = "") const; - void insert_doc(const int64_t score, art_tree *t, uint32_t seq_id, const std::unordered_map> &token_to_offsets) const; @@ -701,6 +689,18 @@ public: filter_result_t& filter_result, const std::string& collection_name = "") const; + Option _rearranging_recursive_filter(filter_node_t* const filter_tree_root, + filter_result_t& result, + const std::string& collection_name = "") const; + + Option _rearrange_filter_tree(filter_node_t* const root, + uint32_t& filter_ids_length, + const std::string& collection_name = "") const; + + Option _approximate_filter_ids(const filter& a_filter, + uint32_t& filter_ids_length, + const std::string& collection_name = "") const; + Option do_reference_filtering_with_lock(filter_node_t* const filter_tree_root, filter_result_t& filter_result, const std::string& collection_name, diff --git a/src/index.cpp b/src/index.cpp index b704356d..124cf567 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -1952,7 +1952,7 @@ void Index::aproximate_numerical_match(num_tree_t* const num_tree, num_tree->approx_search_count(comparator, value, filter_ids_length); } -Option Index::approximate_filter_ids(const filter& a_filter, +Option Index::_approximate_filter_ids(const filter& a_filter, uint32_t& filter_ids_length, const std::string& collection_name) const { if (!a_filter.referenced_collection_name.empty()) { @@ -2054,7 +2054,7 @@ Option Index::approximate_filter_ids(const filter& a_filter, return Option(true); } -Option Index::rearrange_filter_tree(filter_node_t* const root, +Option Index::_rearrange_filter_tree(filter_node_t* const root, uint32_t& filter_ids_length, const std::string& collection_name) const { if (root == nullptr) { @@ -2064,7 +2064,7 @@ Option Index::rearrange_filter_tree(filter_node_t* const root, if (root->isOperator) { uint32_t l_filter_ids_length = 0; if (root->left != nullptr) { - auto rearrange_op = rearrange_filter_tree(root->left, l_filter_ids_length, collection_name); + auto rearrange_op = _rearrange_filter_tree(root->left, l_filter_ids_length, collection_name); if (!rearrange_op.ok()) { return rearrange_op; } @@ -2072,7 +2072,7 @@ Option Index::rearrange_filter_tree(filter_node_t* const root, uint32_t r_filter_ids_length = 0; if (root->right != nullptr) { - auto rearrange_op = rearrange_filter_tree(root->right, r_filter_ids_length, collection_name); + auto rearrange_op = _rearrange_filter_tree(root->right, r_filter_ids_length, collection_name); if (!rearrange_op.ok()) { return rearrange_op; } @@ -2091,15 +2091,15 @@ Option Index::rearrange_filter_tree(filter_node_t* const root, return Option(true); } - approximate_filter_ids(root->filter_exp, filter_ids_length, collection_name); + _approximate_filter_ids(root->filter_exp, filter_ids_length, collection_name); return Option(true); } -Option Index::rearranging_recursive_filter(filter_node_t* const filter_tree_root, +Option Index::_rearranging_recursive_filter(filter_node_t* const filter_tree_root, filter_result_t& result, const std::string& collection_name) const { uint32_t filter_ids_length = 0; - auto rearrange_op = rearrange_filter_tree(filter_tree_root, filter_ids_length, collection_name); + auto rearrange_op = _rearrange_filter_tree(filter_tree_root, filter_ids_length, collection_name); if (!rearrange_op.ok()) { return rearrange_op; } @@ -2189,7 +2189,7 @@ Option Index::adaptive_filter(filter_node_t* const filter_tree_root, metrics->and_operator_count > 0 && // If there are more || in the filter tree than &&, we'll not gain much by rearranging the filter tree. ((float) metrics->or_operator_count / (float) metrics->and_operator_count < 0.5)) { - return rearranging_recursive_filter(filter_tree_root, result, collection_name); + return _rearranging_recursive_filter(filter_tree_root, result, collection_name); } else { return recursive_filter(filter_tree_root, result, collection_name); } @@ -2252,7 +2252,7 @@ Option Index::get_approximate_reference_filter_ids_with_lock(filter_node_t uint32_t& filter_ids_length) const { std::shared_lock lock(mutex); - return rearrange_filter_tree(filter_tree_root, filter_ids_length); + return _rearrange_filter_tree(filter_tree_root, filter_ids_length); } Option Index::run_search(search_args* search_params, const std::string& collection_name) { diff --git a/test/collection_specific_more_test.cpp b/test/collection_specific_more_test.cpp index 8afdaf84..b34b9973 100644 --- a/test/collection_specific_more_test.cpp +++ b/test/collection_specific_more_test.cpp @@ -2077,8 +2077,7 @@ TEST_F(CollectionSpecificMoreTest, RearrangingFilterTree) { ASSERT_TRUE(root->right == nullptr); filter_result_t result; - // Internally calls rearranging_recursive_filter - coll->_get_index()->do_filtering_with_lock(filter_tree_root, result); + coll->_get_index()->_rearranging_recursive_filter(filter_tree_root, result); // && // / \ @@ -2150,6 +2149,8 @@ TEST_F(CollectionSpecificMoreTest, RearrangingFilterTree) { ASSERT_EQ(root->filter_exp.field_name, "years"); ASSERT_TRUE(root->left == nullptr); ASSERT_TRUE(root->right == nullptr); + + collectionManager.drop_collection("Collection"); } TEST_F(CollectionSpecificMoreTest, ApproxFilterMatchCount) { @@ -2160,7 +2161,8 @@ TEST_F(CollectionSpecificMoreTest, ApproxFilterMatchCount) { {"name": "name", "type": "string"}, {"name": "age", "type": "int32"}, {"name": "years", "type": "int32[]"}, - {"name": "rating", "type": "float"} + {"name": "rating", "type": "float"}, + {"name": "location", "type": "geopoint", "optional": true} ] })"_json; @@ -2174,8 +2176,32 @@ TEST_F(CollectionSpecificMoreTest, ApproxFilterMatchCount) { } infile.close(); + const std::string doc_id_prefix = std::to_string(coll->get_collection_id()) + "_" + Collection::DOC_ID_PREFIX + "_"; + filter_node_t* filter_tree_root = nullptr; + Option filter_op = filter::parse_filter_query("name: Jeremy", coll->get_schema(), store, doc_id_prefix, + filter_tree_root); + ASSERT_TRUE(filter_op.ok()); + uint32_t approx_count; - coll->get_approximate_reference_filter_ids("years:>2000 && ((age:<30 && rating:>5) || (age:>50 && rating:<5))", - approx_count); + coll->_get_index()->_approximate_filter_ids(filter_tree_root->filter_exp, approx_count); + ASSERT_EQ(approx_count, 5); + + delete filter_tree_root; + filter_op = filter::parse_filter_query("location:(48.8662, 2.3255, 48.8581, 2.3209, 48.8561, 2.3448, 48.8641, 2.3469)", + coll->get_schema(), store, doc_id_prefix, filter_tree_root); + ASSERT_TRUE(filter_op.ok()); + + coll->_get_index()->_approximate_filter_ids(filter_tree_root->filter_exp, approx_count); + ASSERT_EQ(approx_count, 100); + + delete filter_tree_root; + filter_op = filter::parse_filter_query("years:>2000 && ((age:<30 && rating:>5) || (age:>50 && rating:<5))", + coll->get_schema(), store, doc_id_prefix, filter_tree_root); + ASSERT_TRUE(filter_op.ok()); + + coll->_get_index()->_rearrange_filter_tree(filter_tree_root, approx_count); ASSERT_EQ(approx_count, 3); + + delete filter_tree_root; + collectionManager.drop_collection("Collection"); } \ No newline at end of file From f38f3f9792a95e2c3fdfae93fd539133e07ef4be Mon Sep 17 00:00:00 2001 From: Harpreet Sangar Date: Thu, 9 Mar 2023 14:08:15 +0530 Subject: [PATCH 43/51] Remove `filter_node_t::metrics`. Update function signatures to accept context ids. --- include/field.h | 2 - include/index.h | 26 +++++------ include/num_tree.h | 4 +- src/field.cpp | 14 +----- src/index.cpp | 61 ++++++++++---------------- src/num_tree.cpp | 4 +- test/collection_specific_more_test.cpp | 6 +-- 7 files changed, 45 insertions(+), 72 deletions(-) diff --git a/include/field.h b/include/field.h index 7eee5c79..dd0033eb 100644 --- a/include/field.h +++ b/include/field.h @@ -606,7 +606,6 @@ struct filter_node_t { bool isOperator; filter_node_t* left = nullptr; filter_node_t* right = nullptr; - filter_tree_metrics* metrics = nullptr; filter_node_t(filter filter_exp) : filter_exp(std::move(filter_exp)), @@ -623,7 +622,6 @@ struct filter_node_t { right(right) {} ~filter_node_t() { - delete metrics; delete left; delete right; } diff --git a/include/index.h b/include/index.h index ff8431c8..62e75180 100644 --- a/include/index.h +++ b/include/index.h @@ -468,17 +468,17 @@ private: void numeric_not_equals_filter(num_tree_t* const num_tree, const int64_t value, const uint32_t& context_ids_length, - const uint32_t* context_ids, + uint32_t* const& context_ids, size_t& ids_len, uint32_t*& ids) const; bool field_is_indexed(const std::string& field_name) const; - Option do_filtering(filter_node_t* const root, - filter_result_t& result, - const std::string& collection_name = "", - const uint32_t& context_ids_length = 0, - const uint32_t* context_ids = nullptr) const; + Option _do_filtering(filter_node_t* const root, + filter_result_t& result, + const std::string& collection_name = "", + const uint32_t& context_ids_length = 0, + uint32_t* const& context_ids = nullptr) const; void aproximate_numerical_match(num_tree_t* const num_tree, const NUM_COMPARATOR& comparator, @@ -488,7 +488,9 @@ private: Option recursive_filter(filter_node_t* const root, filter_result_t& result, - const std::string& collection_name = "") const; + const std::string& collection_name = "", + const uint32_t& context_ids_length = 0, + uint32_t* const& context_ids = nullptr) const; Option adaptive_filter(filter_node_t* const filter_tree_root, filter_result_t& result, @@ -689,17 +691,13 @@ public: filter_result_t& filter_result, const std::string& collection_name = "") const; - Option _rearranging_recursive_filter(filter_node_t* const filter_tree_root, - filter_result_t& result, - const std::string& collection_name = "") const; - - Option _rearrange_filter_tree(filter_node_t* const root, + Option rearrange_filter_tree(filter_node_t* const root, uint32_t& filter_ids_length, const std::string& collection_name = "") const; Option _approximate_filter_ids(const filter& a_filter, - uint32_t& filter_ids_length, - const std::string& collection_name = "") const; + uint32_t& filter_ids_length, + const std::string& collection_name = "") const; Option do_reference_filtering_with_lock(filter_node_t* const filter_tree_root, filter_result_t& filter_result, diff --git a/include/num_tree.h b/include/num_tree.h index 280f47dd..5406a109 100644 --- a/include/num_tree.h +++ b/include/num_tree.h @@ -34,7 +34,7 @@ public: void range_inclusive_contains(const int64_t& start, const int64_t& end, const uint32_t& context_ids_length, - const uint32_t*& context_ids, + uint32_t* const& context_ids, size_t& result_ids_len, uint32_t*& result_ids) const; @@ -50,7 +50,7 @@ public: void contains(const NUM_COMPARATOR& comparator, const int64_t& value, const uint32_t& context_ids_length, - const uint32_t*& context_ids, + uint32_t* const& context_ids, size_t& result_ids_len, uint32_t*& result_ids) const; }; \ No newline at end of file diff --git a/src/field.cpp b/src/field.cpp index 129c7512..c7297359 100644 --- a/src/field.cpp +++ b/src/field.cpp @@ -384,9 +384,7 @@ Option toFilter(const std::string expression, Option toParseTree(std::queue& postfix, filter_node_t*& root, const tsl::htrie_map& search_schema, const Store* store, - const std::string& doc_id_prefix, - int& and_operator_count, - int& or_operator_count) { + const std::string& doc_id_prefix) { std::stack nodeStack; bool is_successful = true; std::string error_message; @@ -413,7 +411,6 @@ Option toParseTree(std::queue& postfix, filter_node_t*& root, auto operandA = nodeStack.top(); nodeStack.pop(); - expression == "&&" ? and_operator_count++ : or_operator_count++; filter_node = new filter_node_t(expression == "&&" ? AND : OR, operandA, operandB); } else { filter filter_exp; @@ -502,22 +499,15 @@ Option filter::parse_filter_query(const std::string& filter_query, return toPostfix_op; } - int postfix_size = (int) postfix.size(), and_operator_count = 0, or_operator_count = 0; Option toParseTree_op = toParseTree(postfix, root, search_schema, store, - doc_id_prefix, - and_operator_count, - or_operator_count); + doc_id_prefix); if (!toParseTree_op.ok()) { return toParseTree_op; } - root->metrics = new filter_tree_metrics{static_cast(postfix_size - (and_operator_count + or_operator_count)), - and_operator_count, - or_operator_count}; - return Option(true); } diff --git a/src/index.cpp b/src/index.cpp index 124cf567..73b0cd08 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -1452,7 +1452,7 @@ void Index::search_candidates(const uint8_t & field_id, bool field_is_array, void Index::numeric_not_equals_filter(num_tree_t* const num_tree, const int64_t value, const uint32_t& context_ids_length, - const uint32_t* context_ids, + uint32_t* const& context_ids, size_t& ids_len, uint32_t*& ids) const { uint32_t* to_exclude_ids = nullptr; @@ -1491,11 +1491,11 @@ bool Index::field_is_indexed(const std::string& field_name) const { geopoint_index.count(field_name) != 0; } -Option Index::do_filtering(filter_node_t* const root, - filter_result_t& result, - const std::string& collection_name, - const uint32_t& context_ids_length, - const uint32_t* context_ids) const { +Option Index::_do_filtering(filter_node_t* const root, + filter_result_t& result, + const std::string& collection_name, + const uint32_t& context_ids_length, + uint32_t* const& context_ids) const { // auto begin = std::chrono::high_resolution_clock::now(); const filter a_filter = root->filter_exp; @@ -1953,8 +1953,8 @@ void Index::aproximate_numerical_match(num_tree_t* const num_tree, } Option Index::_approximate_filter_ids(const filter& a_filter, - uint32_t& filter_ids_length, - const std::string& collection_name) const { + uint32_t& filter_ids_length, + const std::string& collection_name) const { if (!a_filter.referenced_collection_name.empty()) { auto& cm = CollectionManager::get_instance(); auto collection = cm.get_collection(a_filter.referenced_collection_name); @@ -2054,7 +2054,7 @@ Option Index::_approximate_filter_ids(const filter& a_filter, return Option(true); } -Option Index::_rearrange_filter_tree(filter_node_t* const root, +Option Index::rearrange_filter_tree(filter_node_t* const root, uint32_t& filter_ids_length, const std::string& collection_name) const { if (root == nullptr) { @@ -2064,7 +2064,7 @@ Option Index::_rearrange_filter_tree(filter_node_t* const root, if (root->isOperator) { uint32_t l_filter_ids_length = 0; if (root->left != nullptr) { - auto rearrange_op = _rearrange_filter_tree(root->left, l_filter_ids_length, collection_name); + auto rearrange_op = rearrange_filter_tree(root->left, l_filter_ids_length, collection_name); if (!rearrange_op.ok()) { return rearrange_op; } @@ -2072,7 +2072,7 @@ Option Index::_rearrange_filter_tree(filter_node_t* const root, uint32_t r_filter_ids_length = 0; if (root->right != nullptr) { - auto rearrange_op = _rearrange_filter_tree(root->right, r_filter_ids_length, collection_name); + auto rearrange_op = rearrange_filter_tree(root->right, r_filter_ids_length, collection_name); if (!rearrange_op.ok()) { return rearrange_op; } @@ -2095,18 +2095,6 @@ Option Index::_rearrange_filter_tree(filter_node_t* const root, return Option(true); } -Option Index::_rearranging_recursive_filter(filter_node_t* const filter_tree_root, - filter_result_t& result, - const std::string& collection_name) const { - uint32_t filter_ids_length = 0; - auto rearrange_op = _rearrange_filter_tree(filter_tree_root, filter_ids_length, collection_name); - if (!rearrange_op.ok()) { - return rearrange_op; - } - - return recursive_filter(filter_tree_root, result, collection_name); -} - void copy_reference_ids(filter_result_t& from, filter_result_t& to) { if (to.count > 0 && !from.reference_filter_results.empty()) { for (const auto &item: from.reference_filter_results) { @@ -2132,7 +2120,9 @@ void copy_reference_ids(filter_result_t& from, filter_result_t& to) { Option Index::recursive_filter(filter_node_t* const root, filter_result_t& result, - const std::string& collection_name) const { + const std::string& collection_name, + const uint32_t& context_ids_length, + uint32_t* const& context_ids) const { if (root == nullptr) { return Option(true); } @@ -2140,7 +2130,7 @@ Option Index::recursive_filter(filter_node_t* const root, if (root->isOperator) { filter_result_t l_result; if (root->left != nullptr) { - auto filter_op = recursive_filter(root->left, l_result , collection_name); + auto filter_op = recursive_filter(root->left, l_result , collection_name, context_ids_length, context_ids); if (!filter_op.ok()) { return filter_op; } @@ -2148,7 +2138,7 @@ Option Index::recursive_filter(filter_node_t* const root, filter_result_t r_result; if (root->right != nullptr) { - auto filter_op = recursive_filter(root->right, r_result , collection_name); + auto filter_op = recursive_filter(root->right, r_result , collection_name, context_ids_length, context_ids); if (!filter_op.ok()) { return filter_op; } @@ -2173,7 +2163,7 @@ Option Index::recursive_filter(filter_node_t* const root, return Option(true); } - return do_filtering(root, result, collection_name); + return _do_filtering(root, result, collection_name, context_ids_length, context_ids); } Option Index::adaptive_filter(filter_node_t* const filter_tree_root, @@ -2183,16 +2173,13 @@ Option Index::adaptive_filter(filter_node_t* const filter_tree_root, return Option(true); } - auto metrics = filter_tree_root->metrics; - if (metrics != nullptr && - metrics->filter_exp_count > 2 && - metrics->and_operator_count > 0 && - // If there are more || in the filter tree than &&, we'll not gain much by rearranging the filter tree. - ((float) metrics->or_operator_count / (float) metrics->and_operator_count < 0.5)) { - return _rearranging_recursive_filter(filter_tree_root, result, collection_name); - } else { - return recursive_filter(filter_tree_root, result, collection_name); + uint32_t filter_ids_length = 0; + auto op = rearrange_filter_tree(filter_tree_root, filter_ids_length, collection_name); + if (!op.ok()) { + return op; } + + return recursive_filter(filter_tree_root, result, collection_name); } Option Index::do_filtering_with_lock(filter_node_t* const filter_tree_root, @@ -2252,7 +2239,7 @@ Option Index::get_approximate_reference_filter_ids_with_lock(filter_node_t uint32_t& filter_ids_length) const { std::shared_lock lock(mutex); - return _rearrange_filter_tree(filter_tree_root, filter_ids_length); + return rearrange_filter_tree(filter_tree_root, filter_ids_length); } Option Index::run_search(search_args* search_params, const std::string& collection_name) { diff --git a/src/num_tree.cpp b/src/num_tree.cpp index 5a1b95d3..1bcdbc9f 100644 --- a/src/num_tree.cpp +++ b/src/num_tree.cpp @@ -75,7 +75,7 @@ bool num_tree_t::range_inclusive_contains(const int64_t& start, const int64_t& e void num_tree_t::range_inclusive_contains(const int64_t& start, const int64_t& end, const uint32_t& context_ids_length, - const uint32_t*& context_ids, + uint32_t* const& context_ids, size_t& result_ids_len, uint32_t*& result_ids) const { if (int64map.empty()) { @@ -251,7 +251,7 @@ void num_tree_t::remove(uint64_t value, uint32_t id) { void num_tree_t::contains(const NUM_COMPARATOR& comparator, const int64_t& value, const uint32_t& context_ids_length, - const uint32_t*& context_ids, + uint32_t* const& context_ids, size_t& result_ids_len, uint32_t*& result_ids) const { if (int64map.empty()) { diff --git a/test/collection_specific_more_test.cpp b/test/collection_specific_more_test.cpp index b34b9973..2e9369cf 100644 --- a/test/collection_specific_more_test.cpp +++ b/test/collection_specific_more_test.cpp @@ -2076,8 +2076,8 @@ TEST_F(CollectionSpecificMoreTest, RearrangingFilterTree) { ASSERT_TRUE(root->left == nullptr); ASSERT_TRUE(root->right == nullptr); - filter_result_t result; - coll->_get_index()->_rearranging_recursive_filter(filter_tree_root, result); + uint32_t count = 0; + coll->_get_index()->rearrange_filter_tree(filter_tree_root, count); // && // / \ @@ -2199,7 +2199,7 @@ TEST_F(CollectionSpecificMoreTest, ApproxFilterMatchCount) { coll->get_schema(), store, doc_id_prefix, filter_tree_root); ASSERT_TRUE(filter_op.ok()); - coll->_get_index()->_rearrange_filter_tree(filter_tree_root, approx_count); + coll->_get_index()->rearrange_filter_tree(filter_tree_root, approx_count); ASSERT_EQ(approx_count, 3); delete filter_tree_root; From f3706f737baa5a351d3064f80f5a2e8094e1023d Mon Sep 17 00:00:00 2001 From: Harpreet Sangar Date: Thu, 9 Mar 2023 14:26:18 +0530 Subject: [PATCH 44/51] Refactor `std::unique_ptr` order. --- src/collection.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/collection.cpp b/src/collection.cpp index 9d1c0635..d35e2e10 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -2509,12 +2509,12 @@ Option Collection::get_filter_ids(const std::string& filter_query, filter_ filter_node_t* filter_tree_root = nullptr; Option filter_op = filter::parse_filter_query(filter_query, search_schema, store, doc_id_prefix, filter_tree_root); + std::unique_ptr filter_tree_root_guard(filter_tree_root); + if(!filter_op.ok()) { return filter_op; } - std::unique_ptr filter_tree_root_guard(filter_tree_root); - return index->do_filtering_with_lock(filter_tree_root, filter_result, name); } @@ -2544,12 +2544,12 @@ Option Collection::get_approximate_reference_filter_ids(const std::string& filter_node_t* filter_tree_root = nullptr; Option parse_op = filter::parse_filter_query(filter_query, search_schema, store, doc_id_prefix, filter_tree_root); + std::unique_ptr filter_tree_root_guard(filter_tree_root); + if(!parse_op.ok()) { return parse_op; } - std::unique_ptr filter_tree_root_guard(filter_tree_root); - return index->get_approximate_reference_filter_ids_with_lock(filter_tree_root, filter_ids_length); } @@ -2567,12 +2567,12 @@ Option Collection::get_reference_filter_ids(const std::string & filter_que filter_node_t* filter_tree_root = nullptr; Option parse_op = filter::parse_filter_query(filter_query, search_schema, store, doc_id_prefix, filter_tree_root); + std::unique_ptr filter_tree_root_guard(filter_tree_root); + if(!parse_op.ok()) { return parse_op; } - std::unique_ptr filter_tree_root_guard(filter_tree_root); - // Reference helper field has the sequence id of other collection's documents. auto field_name = reference_field_op.get() + REFERENCE_HELPER_FIELD_SUFFIX; return index->do_reference_filtering_with_lock(filter_tree_root, filter_result, name, field_name); From cd2f5be875c67e1a4242ff4e65c05a4a3f690f90 Mon Sep 17 00:00:00 2001 From: Harpreet Sangar Date: Thu, 9 Mar 2023 15:06:11 +0530 Subject: [PATCH 45/51] Remove `Index::adaptive_filter`. --- include/index.h | 4 ---- src/index.cpp | 31 ++++++++++--------------------- 2 files changed, 10 insertions(+), 25 deletions(-) diff --git a/include/index.h b/include/index.h index 62e75180..43ec2f28 100644 --- a/include/index.h +++ b/include/index.h @@ -492,10 +492,6 @@ private: const uint32_t& context_ids_length = 0, uint32_t* const& context_ids = nullptr) const; - Option adaptive_filter(filter_node_t* const filter_tree_root, - filter_result_t& result, - const std::string& collection_name = "") const; - void insert_doc(const int64_t score, art_tree *t, uint32_t seq_id, const std::unordered_map> &token_to_offsets) const; diff --git a/src/index.cpp b/src/index.cpp index 73b0cd08..d4f4f9b3 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -2166,28 +2166,12 @@ Option Index::recursive_filter(filter_node_t* const root, return _do_filtering(root, result, collection_name, context_ids_length, context_ids); } -Option Index::adaptive_filter(filter_node_t* const filter_tree_root, - filter_result_t& result, - const std::string& collection_name) const { - if (filter_tree_root == nullptr) { - return Option(true); - } - - uint32_t filter_ids_length = 0; - auto op = rearrange_filter_tree(filter_tree_root, filter_ids_length, collection_name); - if (!op.ok()) { - return op; - } - - return recursive_filter(filter_tree_root, result, collection_name); -} - Option Index::do_filtering_with_lock(filter_node_t* const filter_tree_root, filter_result_t& filter_result, const std::string& collection_name) const { std::shared_lock lock(mutex); - auto filter_op = adaptive_filter(filter_tree_root, filter_result, collection_name); + auto filter_op = recursive_filter(filter_tree_root, filter_result, collection_name); if (!filter_op.ok()) { return filter_op; } @@ -2202,7 +2186,7 @@ Option Index::do_reference_filtering_with_lock(filter_node_t* const filter std::shared_lock lock(mutex); filter_result_t reference_filter_result; - auto filter_op = adaptive_filter(filter_tree_root, reference_filter_result); + auto filter_op = recursive_filter(filter_tree_root, reference_filter_result); if (!filter_op.ok()) { return filter_op; } @@ -2730,9 +2714,14 @@ Option Index::search(std::vector& field_query_tokens, cons const std::string& collection_name) const { std::shared_lock lock(mutex); + uint32_t filter_ids_length = 0; + auto rearrange_op = rearrange_filter_tree(filter_tree_root, filter_ids_length, collection_name); + if (!rearrange_op.ok()) { + return rearrange_op; + } + filter_result_t filter_result; - // process the filters - auto filter_op = adaptive_filter(filter_tree_root, filter_result, collection_name); + auto filter_op = recursive_filter(filter_tree_root, filter_result, collection_name); if (!filter_op.ok()) { return filter_op; } @@ -4840,7 +4829,7 @@ void Index::populate_sort_mapping(int* sort_order, std::vector& geopoint } else if (sort_fields_std[i].name == sort_field_const::eval) { field_values[i] = &eval_sentinel_value; filter_result_t result; - adaptive_filter(sort_fields_std[i].eval.filter_tree_root, result); + recursive_filter(sort_fields_std[i].eval.filter_tree_root, result); sort_fields_std[i].eval.ids = result.docs; sort_fields_std[i].eval.size = result.count; result.docs = nullptr; From 2a1feae0ee2dca34071bc94c0eff35f4bdca1d80 Mon Sep 17 00:00:00 2001 From: Harpreet Sangar Date: Thu, 9 Mar 2023 19:53:50 +0530 Subject: [PATCH 46/51] Add comments. --- include/collection.h | 1 + include/field.h | 6 ------ include/index.h | 12 ++++++++++-- 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/include/collection.h b/include/collection.h index 4e08d4da..dc8f41db 100644 --- a/include/collection.h +++ b/include/collection.h @@ -457,6 +457,7 @@ public: Option get_filter_ids(const std::string & filter_query, filter_result_t& filter_result) const; + /// Get approximate count of docs matching a reference filter on foo collection when $foo(...) filter is encountered. Option get_approximate_reference_filter_ids(const std::string& filter_query, uint32_t& filter_ids_length) const; diff --git a/include/field.h b/include/field.h index dd0033eb..a4ac81b8 100644 --- a/include/field.h +++ b/include/field.h @@ -594,12 +594,6 @@ struct filter { filter_node_t*& root); }; -struct filter_tree_metrics { - int filter_exp_count; - int and_operator_count; - int or_operator_count; -}; - struct filter_node_t { filter filter_exp; FILTER_OPERATOR filter_operator; diff --git a/include/index.h b/include/index.h index 43ec2f28..9f170a4d 100644 --- a/include/index.h +++ b/include/index.h @@ -486,8 +486,15 @@ private: const int64_t& range_end_value, uint32_t& filter_ids_length) const; - Option recursive_filter(filter_node_t* const root, - filter_result_t& result, + /// Traverses through filter tree to get the filter_result. + /// + /// \param filter_tree_root + /// \param filter_result + /// \param collection_name Name of the collection to which current index belongs. Used to find the reference field in other collection. + /// \param context_ids_length Number of docs matching the search query. + /// \param context_ids Array of doc ids matching the search query. + Option recursive_filter(filter_node_t* const filter_tree_root, + filter_result_t& filter_result, const std::string& collection_name = "", const uint32_t& context_ids_length = 0, uint32_t* const& context_ids = nullptr) const; @@ -700,6 +707,7 @@ public: const std::string& collection_name, const std::string& reference_helper_field_name) const; + /// Get approximate count of docs matching a reference filter on foo collection when $foo(...) filter is encountered. Option get_approximate_reference_filter_ids_with_lock(filter_node_t* const filter_tree_root, uint32_t& filter_ids_length) const; From 63f8d33b5fbe703d3728a876cff6bb6a0220c7ea Mon Sep 17 00:00:00 2001 From: Harpreet Sangar Date: Thu, 9 Mar 2023 20:41:58 +0530 Subject: [PATCH 47/51] Add comments. --- include/index.h | 20 +++++++++++++------- src/index.cpp | 20 ++++++++++---------- 2 files changed, 23 insertions(+), 17 deletions(-) diff --git a/include/index.h b/include/index.h index 9f170a4d..74e7e0ee 100644 --- a/include/index.h +++ b/include/index.h @@ -474,11 +474,11 @@ private: bool field_is_indexed(const std::string& field_name) const; - Option _do_filtering(filter_node_t* const root, - filter_result_t& result, - const std::string& collection_name = "", - const uint32_t& context_ids_length = 0, - uint32_t* const& context_ids = nullptr) const; + Option do_filtering(filter_node_t* const root, + filter_result_t& result, + const std::string& collection_name = "", + const uint32_t& context_ids_length = 0, + uint32_t* const& context_ids = nullptr) const; void aproximate_numerical_match(num_tree_t* const num_tree, const NUM_COMPARATOR& comparator, @@ -694,8 +694,14 @@ public: filter_result_t& filter_result, const std::string& collection_name = "") const; - Option rearrange_filter_tree(filter_node_t* const root, - uint32_t& filter_ids_length, + /// Traverses through filter tree and gets an approximate doc count for each filter. Also arranges the children of + /// each operator in ascending order based on their approx doc count. + /// + /// \param filter_tree_root + /// \param approx_filter_ids_length Approximate count of docs that would match the whole filter_by clause. + /// \param collection_name Name of the collection to which current index belongs. Used to find the reference field in other collection. + Option rearrange_filter_tree(filter_node_t* const filter_tree_root, + uint32_t& approx_filter_ids_length, const std::string& collection_name = "") const; Option _approximate_filter_ids(const filter& a_filter, diff --git a/src/index.cpp b/src/index.cpp index d4f4f9b3..bb5ca2c4 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -1491,11 +1491,11 @@ bool Index::field_is_indexed(const std::string& field_name) const { geopoint_index.count(field_name) != 0; } -Option Index::_do_filtering(filter_node_t* const root, - filter_result_t& result, - const std::string& collection_name, - const uint32_t& context_ids_length, - uint32_t* const& context_ids) const { +Option Index::do_filtering(filter_node_t* const root, + filter_result_t& result, + const std::string& collection_name, + const uint32_t& context_ids_length, + uint32_t* const& context_ids) const { // auto begin = std::chrono::high_resolution_clock::now(); const filter a_filter = root->filter_exp; @@ -2055,7 +2055,7 @@ Option Index::_approximate_filter_ids(const filter& a_filter, } Option Index::rearrange_filter_tree(filter_node_t* const root, - uint32_t& filter_ids_length, + uint32_t& approx_filter_ids_length, const std::string& collection_name) const { if (root == nullptr) { return Option(true); @@ -2079,9 +2079,9 @@ Option Index::rearrange_filter_tree(filter_node_t* const root, } if (root->filter_operator == AND) { - filter_ids_length = std::min(l_filter_ids_length, r_filter_ids_length); + approx_filter_ids_length = std::min(l_filter_ids_length, r_filter_ids_length); } else { - filter_ids_length = l_filter_ids_length + r_filter_ids_length; + approx_filter_ids_length = l_filter_ids_length + r_filter_ids_length; } if (l_filter_ids_length > r_filter_ids_length) { @@ -2091,7 +2091,7 @@ Option Index::rearrange_filter_tree(filter_node_t* const root, return Option(true); } - _approximate_filter_ids(root->filter_exp, filter_ids_length, collection_name); + _approximate_filter_ids(root->filter_exp, approx_filter_ids_length, collection_name); return Option(true); } @@ -2163,7 +2163,7 @@ Option Index::recursive_filter(filter_node_t* const root, return Option(true); } - return _do_filtering(root, result, collection_name, context_ids_length, context_ids); + return do_filtering(root, result, collection_name, context_ids_length, context_ids); } Option Index::do_filtering_with_lock(filter_node_t* const filter_tree_root, From 391d693ffa45cf2aedbe84d58c5cbc711a2b8332 Mon Sep 17 00:00:00 2001 From: Harpreet Sangar Date: Fri, 10 Mar 2023 11:56:08 +0530 Subject: [PATCH 48/51] Add `and_filter_result` function. --- src/index.cpp | 75 ++++++++++-- test/collection_join_test.cpp | 208 ++++++++++++++++++++++++++++++++++ 2 files changed, 275 insertions(+), 8 deletions(-) diff --git a/src/index.cpp b/src/index.cpp index bb5ca2c4..03bc00ab 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -2095,6 +2095,66 @@ Option Index::rearrange_filter_tree(filter_node_t* const root, return Option(true); } +void and_filter_result(const filter_result_t& a, const filter_result_t& b, filter_result_t& result) { + auto lenA = a.count, lenB = b.count; + if (lenA == 0 || lenB == 0) { + return; + } + + result.docs = new uint32_t[std::min(lenA, lenB)]; + + auto A = a.docs, B = b.docs, out = result.docs; + const uint32_t *endA = A + lenA; + const uint32_t *endB = B + lenB; + + for (auto const& item: a.reference_filter_results) { + result.reference_filter_results[item.first]; + } + for (auto const& item: b.reference_filter_results) { + result.reference_filter_results[item.first]; + } + for (auto& item: result.reference_filter_results) { + item.second = new reference_filter_result_t[std::min(lenA, lenB)]; + } + + while (true) { + while (*A < *B) { + SKIP_FIRST_COMPARE: + if (++A == endA) { + result.count = out - result.docs; + return; + } + } + while (*A > *B) { + if (++B == endB) { + result.count = out - result.docs; + return; + } + } + if (*A == *B) { + *out = *A; + + for (auto const& item: a.reference_filter_results) { + result.reference_filter_results[item.first][out - result.docs] = item.second[A - a.docs]; + item.second[A - a.docs].docs = nullptr; + } + for (auto const& item: b.reference_filter_results) { + result.reference_filter_results[item.first][out - result.docs] = item.second[B - b.docs]; + item.second[B - b.docs].docs = nullptr; + } + + out++; + + if (++A == endA || ++B == endB) { + result.count = out - result.docs; + return; + } + } else { + goto SKIP_FIRST_COMPARE; + } + } +} + void copy_reference_ids(filter_result_t& from, filter_result_t& to) { if (to.count > 0 && !from.reference_filter_results.empty()) { for (const auto &item: from.reference_filter_results) { @@ -2144,21 +2204,20 @@ Option Index::recursive_filter(filter_node_t* const root, } } - uint32_t* filtered_results = nullptr; if (root->filter_operator == AND) { - result.count = ArrayUtils::and_scalar( - l_result.docs, l_result.count, r_result.docs, - r_result.count, &filtered_results); + and_filter_result(l_result, r_result, result); } else { + uint32_t* filtered_results = nullptr; result.count = ArrayUtils::or_scalar( l_result.docs, l_result.count, r_result.docs, r_result.count, &filtered_results); + + result.docs = filtered_results; + if (!l_result.reference_filter_results.empty() || !r_result.reference_filter_results.empty()) { + copy_reference_ids(!l_result.reference_filter_results.empty() ? l_result : r_result, result); + } } - result.docs = filtered_results; - if (!l_result.reference_filter_results.empty() || !r_result.reference_filter_results.empty()) { - copy_reference_ids(!l_result.reference_filter_results.empty() ? l_result : r_result, result); - } return Option(true); } diff --git a/test/collection_join_test.cpp b/test/collection_join_test.cpp index b25439e6..6df9d397 100644 --- a/test/collection_join_test.cpp +++ b/test/collection_join_test.cpp @@ -551,6 +551,214 @@ TEST_F(CollectionJoinTest, FilterByReference_MultipleMatch) { collectionManager.drop_collection("Links"); } +TEST_F(CollectionJoinTest, FilterByNReferences) { + auto schema_json = + R"({ + "name": "Users", + "fields": [ + {"name": "user_id", "type": "string"}, + {"name": "user_name", "type": "string"} + ] + })"_json; + std::vector documents = { + R"({ + "user_id": "user_a", + "user_name": "Roshan" + })"_json, + R"({ + "user_id": "user_b", + "user_name": "Ruby" + })"_json, + R"({ + "user_id": "user_c", + "user_name": "Joe" + })"_json, + R"({ + "user_id": "user_d", + "user_name": "Aby" + })"_json + }; + auto collection_create_op = collectionManager.create_collection(schema_json); + ASSERT_TRUE(collection_create_op.ok()); + for (auto const &json: documents) { + auto add_op = collection_create_op.get()->add(json.dump()); + if (!add_op.ok()) { + LOG(INFO) << add_op.error(); + } + ASSERT_TRUE(add_op.ok()); + } + + schema_json = + R"({ + "name": "Repos", + "fields": [ + {"name": "repo_id", "type": "string"}, + {"name": "repo_content", "type": "string"}, + {"name": "repo_stars", "type": "int32"}, + {"name": "repo_is_private", "type": "bool"} + ] + })"_json; + documents = { + R"({ + "repo_id": "repo_a", + "repo_content": "body1", + "repo_stars": 431, + "repo_is_private": true + })"_json, + R"({ + "repo_id": "repo_b", + "repo_content": "body2", + "repo_stars": 4562, + "repo_is_private": false + })"_json, + R"({ + "repo_id": "repo_c", + "repo_content": "body3", + "repo_stars": 945, + "repo_is_private": false + })"_json + }; + collection_create_op = collectionManager.create_collection(schema_json); + ASSERT_TRUE(collection_create_op.ok()); + for (auto const &json: documents) { + auto add_op = collection_create_op.get()->add(json.dump()); + if (!add_op.ok()) { + LOG(INFO) << add_op.error(); + } + ASSERT_TRUE(add_op.ok()); + } + + schema_json = + R"({ + "name": "Links", + "fields": [ + {"name": "repo_id", "type": "string", "reference": "Repos.repo_id"}, + {"name": "user_id", "type": "string", "reference": "Users.user_id"} + ] + })"_json; + documents = { + R"({ + "repo_id": "repo_a", + "user_id": "user_b" + })"_json, + R"({ + "repo_id": "repo_a", + "user_id": "user_c" + })"_json, + R"({ + "repo_id": "repo_b", + "user_id": "user_a" + })"_json, + R"({ + "repo_id": "repo_b", + "user_id": "user_b" + })"_json, + R"({ + "repo_id": "repo_b", + "user_id": "user_d" + })"_json, + R"({ + "repo_id": "repo_c", + "user_id": "user_a" + })"_json, + R"({ + "repo_id": "repo_c", + "user_id": "user_b" + })"_json, + R"({ + "repo_id": "repo_c", + "user_id": "user_c" + })"_json, + R"({ + "repo_id": "repo_c", + "user_id": "user_d" + })"_json + }; + collection_create_op = collectionManager.create_collection(schema_json); + ASSERT_TRUE(collection_create_op.ok()); + + for (auto const &json: documents) { + auto add_op = collection_create_op.get()->add(json.dump()); + if (!add_op.ok()) { + LOG(INFO) << add_op.error(); + } + ASSERT_TRUE(add_op.ok()); + } + + schema_json = + R"({ + "name": "Organizations", + "fields": [ + {"name": "org_id", "type": "string"}, + {"name": "org_name", "type": "string"} + ] + })"_json; + documents = { + R"({ + "org_id": "org_a", + "org_name": "Typesense" + })"_json + }; + collection_create_op = collectionManager.create_collection(schema_json); + ASSERT_TRUE(collection_create_op.ok()); + + for (auto const &json: documents) { + auto add_op = collection_create_op.get()->add(json.dump()); + if (!add_op.ok()) { + LOG(INFO) << add_op.error(); + } + ASSERT_TRUE(add_op.ok()); + } + + schema_json = + R"({ + "name": "Participants", + "fields": [ + {"name": "user_id", "type": "string", "reference": "Users.user_id"}, + {"name": "org_id", "type": "string", "reference": "Organizations.org_id"} + ] + })"_json; + documents = { + R"({ + "user_id": "user_a", + "org_id": "org_a" + })"_json, + R"({ + "user_id": "user_b", + "org_id": "org_a" + })"_json, + R"({ + "user_id": "user_d", + "org_id": "org_a" + })"_json, + }; + collection_create_op = collectionManager.create_collection(schema_json); + ASSERT_TRUE(collection_create_op.ok()); + + for (auto const &json: documents) { + auto add_op = collection_create_op.get()->add(json.dump()); + if (!add_op.ok()) { + LOG(INFO) << add_op.error(); + } + ASSERT_TRUE(add_op.ok()); + } + + auto coll = collectionManager.get_collection_unsafe("Users"); + + // Search for users within an organization with access to a particular repo. + auto result = coll->search("R", {"user_name"}, "$Participants(org_id:=org_a) && $Links(repo_id:=repo_b)", {}, {}, {0}, + 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD).get(); + + ASSERT_EQ(2, result["found"].get()); + ASSERT_EQ(2, result["hits"].size()); + ASSERT_EQ("user_b", result["hits"][0]["document"]["user_id"].get()); + ASSERT_EQ("user_a", result["hits"][1]["document"]["user_id"].get()); + + collectionManager.drop_collection("Users"); + collectionManager.drop_collection("Repos"); + collectionManager.drop_collection("Links"); +} + TEST_F(CollectionJoinTest, IncludeFieldsByReference_SingleMatch) { auto schema_json = R"({ From c4c59169f08c84a69e27730b03dce5ab505451f4 Mon Sep 17 00:00:00 2001 From: Harpreet Sangar Date: Fri, 10 Mar 2023 13:45:20 +0530 Subject: [PATCH 49/51] Refactor `and_filter_result`. --- src/index.cpp | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/index.cpp b/src/index.cpp index 03bc00ab..3f6809b7 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -2108,13 +2108,14 @@ void and_filter_result(const filter_result_t& a, const filter_result_t& b, filte const uint32_t *endB = B + lenB; for (auto const& item: a.reference_filter_results) { - result.reference_filter_results[item.first]; + if (result.reference_filter_results.count(item.first) == 0) { + result.reference_filter_results[item.first] = new reference_filter_result_t[std::min(lenA, lenB)]; + } } for (auto const& item: b.reference_filter_results) { - result.reference_filter_results[item.first]; - } - for (auto& item: result.reference_filter_results) { - item.second = new reference_filter_result_t[std::min(lenA, lenB)]; + if (result.reference_filter_results.count(item.first) == 0) { + result.reference_filter_results[item.first] = new reference_filter_result_t[std::min(lenA, lenB)]; + } } while (true) { From 3be000609a6b240aa09476b02eb9562395ccab0e Mon Sep 17 00:00:00 2001 From: Harpreet Sangar Date: Fri, 10 Mar 2023 16:21:14 +0530 Subject: [PATCH 50/51] Add tests for `and_filter_result`. --- include/field.h | 61 +++++++++++++++++++++++++ src/index.cpp | 63 +------------------------- test/collection_join_test.cpp | 83 +++++++++++++++++++++++++++++++++++ 3 files changed, 145 insertions(+), 62 deletions(-) diff --git a/include/field.h b/include/field.h index a4ac81b8..87a17702 100644 --- a/include/field.h +++ b/include/field.h @@ -660,6 +660,67 @@ struct filter_result_t { delete[] item.second; } } + + static void and_filter_results(const filter_result_t& a, const filter_result_t& b, filter_result_t& result) { + auto lenA = a.count, lenB = b.count; + if (lenA == 0 || lenB == 0) { + return; + } + + result.docs = new uint32_t[std::min(lenA, lenB)]; + + auto A = a.docs, B = b.docs, out = result.docs; + const uint32_t *endA = A + lenA; + const uint32_t *endB = B + lenB; + + for (auto const& item: a.reference_filter_results) { + if (result.reference_filter_results.count(item.first) == 0) { + result.reference_filter_results[item.first] = new reference_filter_result_t[std::min(lenA, lenB)]; + } + } + for (auto const& item: b.reference_filter_results) { + if (result.reference_filter_results.count(item.first) == 0) { + result.reference_filter_results[item.first] = new reference_filter_result_t[std::min(lenA, lenB)]; + } + } + + while (true) { + while (*A < *B) { + SKIP_FIRST_COMPARE: + if (++A == endA) { + result.count = out - result.docs; + return; + } + } + while (*A > *B) { + if (++B == endB) { + result.count = out - result.docs; + return; + } + } + if (*A == *B) { + *out = *A; + + for (auto const& item: a.reference_filter_results) { + result.reference_filter_results[item.first][out - result.docs] = item.second[A - a.docs]; + item.second[A - a.docs].docs = nullptr; + } + for (auto const& item: b.reference_filter_results) { + result.reference_filter_results[item.first][out - result.docs] = item.second[B - b.docs]; + item.second[B - b.docs].docs = nullptr; + } + + out++; + + if (++A == endA || ++B == endB) { + result.count = out - result.docs; + return; + } + } else { + goto SKIP_FIRST_COMPARE; + } + } + } }; namespace sort_field_const { diff --git a/src/index.cpp b/src/index.cpp index 3f6809b7..0e09cc8b 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -2095,67 +2095,6 @@ Option Index::rearrange_filter_tree(filter_node_t* const root, return Option(true); } -void and_filter_result(const filter_result_t& a, const filter_result_t& b, filter_result_t& result) { - auto lenA = a.count, lenB = b.count; - if (lenA == 0 || lenB == 0) { - return; - } - - result.docs = new uint32_t[std::min(lenA, lenB)]; - - auto A = a.docs, B = b.docs, out = result.docs; - const uint32_t *endA = A + lenA; - const uint32_t *endB = B + lenB; - - for (auto const& item: a.reference_filter_results) { - if (result.reference_filter_results.count(item.first) == 0) { - result.reference_filter_results[item.first] = new reference_filter_result_t[std::min(lenA, lenB)]; - } - } - for (auto const& item: b.reference_filter_results) { - if (result.reference_filter_results.count(item.first) == 0) { - result.reference_filter_results[item.first] = new reference_filter_result_t[std::min(lenA, lenB)]; - } - } - - while (true) { - while (*A < *B) { - SKIP_FIRST_COMPARE: - if (++A == endA) { - result.count = out - result.docs; - return; - } - } - while (*A > *B) { - if (++B == endB) { - result.count = out - result.docs; - return; - } - } - if (*A == *B) { - *out = *A; - - for (auto const& item: a.reference_filter_results) { - result.reference_filter_results[item.first][out - result.docs] = item.second[A - a.docs]; - item.second[A - a.docs].docs = nullptr; - } - for (auto const& item: b.reference_filter_results) { - result.reference_filter_results[item.first][out - result.docs] = item.second[B - b.docs]; - item.second[B - b.docs].docs = nullptr; - } - - out++; - - if (++A == endA || ++B == endB) { - result.count = out - result.docs; - return; - } - } else { - goto SKIP_FIRST_COMPARE; - } - } -} - void copy_reference_ids(filter_result_t& from, filter_result_t& to) { if (to.count > 0 && !from.reference_filter_results.empty()) { for (const auto &item: from.reference_filter_results) { @@ -2206,7 +2145,7 @@ Option Index::recursive_filter(filter_node_t* const root, } if (root->filter_operator == AND) { - and_filter_result(l_result, r_result, result); + filter_result_t::and_filter_results(l_result, r_result, result); } else { uint32_t* filtered_results = nullptr; result.count = ArrayUtils::or_scalar( diff --git a/test/collection_join_test.cpp b/test/collection_join_test.cpp index 6df9d397..23c4b022 100644 --- a/test/collection_join_test.cpp +++ b/test/collection_join_test.cpp @@ -551,6 +551,89 @@ TEST_F(CollectionJoinTest, FilterByReference_MultipleMatch) { collectionManager.drop_collection("Links"); } +TEST_F(CollectionJoinTest, AndFilterResults_NoReference) { + filter_result_t a; + a.count = 9; + a.docs = new uint32_t[a.count]; + for (size_t i = 0; i < a.count; i++) { + a.docs[i] = i; + } + + filter_result_t b; + b.count = 0; + uint32_t limit = 10; + b.docs = new uint32_t[limit]; + for (size_t i = 2; i < limit; i++) { + if (i % 3 == 0) { + b.docs[b.count++] = i; + } + } + + // a.docs: [0..8] , b.docs: [3, 6, 9] + filter_result_t result; + filter_result_t::and_filter_results(a, b, result); + + ASSERT_EQ(2, result.count); + ASSERT_EQ(0, result.reference_filter_results.size()); + + std::vector docs = {3, 6}; + + for(size_t i = 0; i < result.count; i++) { + ASSERT_EQ(docs[i], result.docs[i]); + } +} + +TEST_F(CollectionJoinTest, AndFilterResults_WithReferences) { + filter_result_t a; + a.count = 9; + a.docs = new uint32_t[a.count]; + a.reference_filter_results["foo"] = new reference_filter_result_t[a.count]; + for (size_t i = 0; i < a.count; i++) { + a.docs[i] = i; + + auto& reference = a.reference_filter_results["foo"][i]; + reference.count = 1; + reference.docs = new uint32_t[1]; + reference.docs[0] = 10 - i; + } + + filter_result_t b; + b.count = 0; + uint32_t limit = 10; + b.docs = new uint32_t[limit]; + b.reference_filter_results["bar"] = new reference_filter_result_t[limit]; + for (size_t i = 2; i < limit; i++) { + if (i % 3 == 0) { + b.docs[b.count] = i; + + auto& reference = b.reference_filter_results["bar"][b.count++]; + reference.count = 1; + reference.docs = new uint32_t[1]; + reference.docs[0] = 2 * i; + } + } + + // a.docs: [0..8] , b.docs: [3, 6, 9] + filter_result_t result; + filter_result_t::and_filter_results(a, b, result); + + ASSERT_EQ(2, result.count); + ASSERT_EQ(2, result.reference_filter_results.size()); + ASSERT_EQ(1, result.reference_filter_results.count("foo")); + ASSERT_EQ(1, result.reference_filter_results.count("bar")); + + std::vector docs = {3, 6}, foo_reference = {7, 4}, bar_reference = {6, 12}; + + for(size_t i = 0; i < result.count; i++) { + ASSERT_EQ(docs[i], result.docs[i]); + + ASSERT_EQ(1, result.reference_filter_results["foo"][i].count); + ASSERT_EQ(foo_reference[i], result.reference_filter_results["foo"][i].docs[0]); + ASSERT_EQ(1, result.reference_filter_results["bar"][i].count); + ASSERT_EQ(bar_reference[i], result.reference_filter_results["bar"][i].docs[0]); + } +} + TEST_F(CollectionJoinTest, FilterByNReferences) { auto schema_json = R"({ From f71888703168da27a7c168ad0db4430d983814c8 Mon Sep 17 00:00:00 2001 From: Harpreet Sangar Date: Fri, 10 Mar 2023 18:14:44 +0530 Subject: [PATCH 51/51] Refactor `and_filter_result`. --- include/field.h | 61 ++-------------------------------------------- src/field.cpp | 65 +++++++++++++++++++++++++++++++++++++++++++++++++ src/index.cpp | 1 - 3 files changed, 67 insertions(+), 60 deletions(-) diff --git a/include/field.h b/include/field.h index 87a17702..b7865a29 100644 --- a/include/field.h +++ b/include/field.h @@ -661,66 +661,9 @@ struct filter_result_t { } } - static void and_filter_results(const filter_result_t& a, const filter_result_t& b, filter_result_t& result) { - auto lenA = a.count, lenB = b.count; - if (lenA == 0 || lenB == 0) { - return; - } + static void and_filter_results(const filter_result_t& a, const filter_result_t& b, filter_result_t& result); - result.docs = new uint32_t[std::min(lenA, lenB)]; - - auto A = a.docs, B = b.docs, out = result.docs; - const uint32_t *endA = A + lenA; - const uint32_t *endB = B + lenB; - - for (auto const& item: a.reference_filter_results) { - if (result.reference_filter_results.count(item.first) == 0) { - result.reference_filter_results[item.first] = new reference_filter_result_t[std::min(lenA, lenB)]; - } - } - for (auto const& item: b.reference_filter_results) { - if (result.reference_filter_results.count(item.first) == 0) { - result.reference_filter_results[item.first] = new reference_filter_result_t[std::min(lenA, lenB)]; - } - } - - while (true) { - while (*A < *B) { - SKIP_FIRST_COMPARE: - if (++A == endA) { - result.count = out - result.docs; - return; - } - } - while (*A > *B) { - if (++B == endB) { - result.count = out - result.docs; - return; - } - } - if (*A == *B) { - *out = *A; - - for (auto const& item: a.reference_filter_results) { - result.reference_filter_results[item.first][out - result.docs] = item.second[A - a.docs]; - item.second[A - a.docs].docs = nullptr; - } - for (auto const& item: b.reference_filter_results) { - result.reference_filter_results[item.first][out - result.docs] = item.second[B - b.docs]; - item.second[B - b.docs].docs = nullptr; - } - - out++; - - if (++A == endA || ++B == endB) { - result.count = out - result.docs; - return; - } - } else { - goto SKIP_FIRST_COMPARE; - } - } - } + static void or_filter_results(const filter_result_t& a, const filter_result_t& b, filter_result_t& result); }; namespace sort_field_const { diff --git a/src/field.cpp b/src/field.cpp index c7297359..9b20aeef 100644 --- a/src/field.cpp +++ b/src/field.cpp @@ -983,3 +983,68 @@ void field::compact_nested_fields(tsl::htrie_map& nested_fields) { nested_fields.erase_prefix(field_name + "."); } } + +void filter_result_t::and_filter_results(const filter_result_t& a, const filter_result_t& b, filter_result_t& result) { + auto lenA = a.count, lenB = b.count; + if (lenA == 0 || lenB == 0) { + return; + } + + result.docs = new uint32_t[std::min(lenA, lenB)]; + + auto A = a.docs, B = b.docs, out = result.docs; + const uint32_t *endA = A + lenA; + const uint32_t *endB = B + lenB; + + for (auto const& item: a.reference_filter_results) { + if (result.reference_filter_results.count(item.first) == 0) { + result.reference_filter_results[item.first] = new reference_filter_result_t[std::min(lenA, lenB)]; + } + } + for (auto const& item: b.reference_filter_results) { + if (result.reference_filter_results.count(item.first) == 0) { + result.reference_filter_results[item.first] = new reference_filter_result_t[std::min(lenA, lenB)]; + } + } + + while (true) { + while (*A < *B) { + SKIP_FIRST_COMPARE: + if (++A == endA) { + result.count = out - result.docs; + return; + } + } + while (*A > *B) { + if (++B == endB) { + result.count = out - result.docs; + return; + } + } + if (*A == *B) { + *out = *A; + + for (auto const& item: a.reference_filter_results) { + auto& reference = result.reference_filter_results[item.first][out - result.docs]; + reference.count = item.second[A - a.docs].count; + reference.docs = new uint32_t[reference.count]; + memcpy(reference.docs, item.second[A - a.docs].docs, reference.count * sizeof(uint32_t)); + } + for (auto const& item: b.reference_filter_results) { + auto& reference = result.reference_filter_results[item.first][out - result.docs]; + reference.count = item.second[B - b.docs].count; + reference.docs = new uint32_t[reference.count]; + memcpy(reference.docs, item.second[B - b.docs].docs, reference.count * sizeof(uint32_t)); + } + + out++; + + if (++A == endA || ++B == endB) { + result.count = out - result.docs; + return; + } + } else { + goto SKIP_FIRST_COMPARE; + } + } +} diff --git a/src/index.cpp b/src/index.cpp index 0e09cc8b..c81e4881 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -2158,7 +2158,6 @@ Option Index::recursive_filter(filter_node_t* const root, } } - return Option(true); }