From 6c5662bc955e2901be1e03e46f9a9869ae0a0d3b Mon Sep 17 00:00:00 2001 From: Harpreet Sangar Date: Tue, 24 Jan 2023 10:57:29 +0530 Subject: [PATCH] Optimize reference filtering. --- include/collection.h | 4 ++ include/index.h | 4 ++ src/collection.cpp | 35 +++++++++++++++ src/field.cpp | 2 +- src/index.cpp | 68 ++++++++++++----------------- test/collection_join_test.cpp | 74 ++++++++++++++++---------------- test/collection_manager_test.cpp | 16 ++++--- 7 files changed, 120 insertions(+), 83 deletions(-) diff --git a/include/collection.h b/include/collection.h index 71b879b1..6d25bcfa 100644 --- a/include/collection.h +++ b/include/collection.h @@ -436,6 +436,10 @@ public: Option get_filter_ids(const std::string & filter_query, std::vector>& index_ids) const; + Option get_reference_filter_ids(const std::string & filter_query, + const std::string & collection_name, + std::pair& reference_index_ids) const; + Option validate_reference_filter(const std::string& filter_query) const; Option get(const std::string & id) const; diff --git a/include/index.h b/include/index.h index 999993b6..a6b161cd 100644 --- a/include/index.h +++ b/include/index.h @@ -715,6 +715,10 @@ public: uint32_t& filter_ids_length, filter_node_t const* const& filter_tree_root) const; + void do_reference_filtering_with_lock(std::pair& reference_index_ids, + filter_node_t const* const& filter_tree_root, + const std::string& reference_field_name) const; + void refresh_schemas(const std::vector& new_fields, const std::vector& del_fields); // the following methods are not synchronized because their parent calls are synchronized or they are const/static diff --git a/src/collection.cpp b/src/collection.cpp index 516a8bf3..7716b5ac 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -2362,6 +2362,41 @@ Option Collection::get_filter_ids(const std::string & filter_query, return Option(true); } +Option Collection::get_reference_filter_ids(const std::string & filter_query, + const std::string & collection_name, + std::pair& reference_index_ids) const { + std::shared_lock lock(mutex); + + std::string reference_field_name; + for (auto const& field: fields) { + if (!field.reference.empty() && + field.reference.find(collection_name) == 0 && + field.reference.find('.') == collection_name.size()) { + reference_field_name = field.name; + break; + } + } + + if (reference_field_name.empty()) { + return Option(400, "Could not find any field in `" + name + "` referencing the collection `" + + collection_name + "`."); + } + + const std::string doc_id_prefix = std::to_string(collection_id) + "_" + DOC_ID_PREFIX + "_"; + filter_node_t* filter_tree_root = nullptr; + Option filter_op = filter::parse_filter_query(filter_query, search_schema, + store, doc_id_prefix, filter_tree_root); + if(!filter_op.ok()) { + return filter_op; + } + + reference_field_name += "_sequence_id"; + index->do_reference_filtering_with_lock(reference_index_ids, filter_tree_root, reference_field_name); + + delete filter_tree_root; + return Option(true); +} + Option Collection::validate_reference_filter(const std::string& filter_query) const { std::shared_lock lock(mutex); diff --git a/src/field.cpp b/src/field.cpp index f0bd4eaa..36ab953e 100644 --- a/src/field.cpp +++ b/src/field.cpp @@ -723,7 +723,7 @@ Option field::json_field_to_field(bool enable_nested_fields, nlohmann::jso if (!field_json[fields::reference].get().empty()) { the_fields.emplace_back( - field(field_json[fields::name].get() + "_sequence_id", "string", false, + field(field_json[fields::name].get() + "_sequence_id", "int64", false, field_json[fields::optional], true) ); } diff --git a/src/index.cpp b/src/index.cpp index e21dddcc..64ad5a66 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -470,7 +470,7 @@ Option Index::validate_index_in_memory(nlohmann::json& document, uint3 "Multiple documents having" + match + "found in the collection `" + tokens[0] + "`."); } - document[a_field.name + "_sequence_id"] = StringUtils::serialize_uint32_t(*(documents[0].second)); + document[a_field.name + "_sequence_id"] = *(documents[0].second); delete [] documents[0].second; } @@ -1665,7 +1665,7 @@ void Index::do_filtering(uint32_t*& filter_ids, uint32_t& filter_ids_length, filter_node_t const* const root) const { // auto begin = std::chrono::high_resolution_clock::now(); - const filter a_filter = root->filter_exp; +/**/ const filter a_filter = root->filter_exp; bool is_referenced_filter = !a_filter.referenced_collection_name.empty(); if (is_referenced_filter) { @@ -1673,48 +1673,16 @@ void Index::do_filtering(uint32_t*& filter_ids, auto& cm = CollectionManager::get_instance(); auto collection = cm.get_collection(a_filter.referenced_collection_name); - std::vector> documents; - auto op = collection->get_filter_ids(a_filter.field_name, documents); + std::pair documents; + auto op = collection->get_reference_filter_ids(a_filter.field_name, + cm.get_collection_with_id(collection_id)->get_name(), + documents); if (!op.ok()) { return; } - if (documents[0].first > 0) { - const field* reference_field = nullptr; - for (auto const& f: collection->get_fields()) { - auto this_collection_name = cm.get_collection_with_id(collection_id)->get_name(); - if (!f.reference.empty() && - f.reference.find(this_collection_name) == 0 && - f.reference.find('.') == this_collection_name.size()) { - reference_field = &f; - break; - } - } - - if (reference_field == nullptr) { - return; - } - - std::vector result_ids; - for (size_t i = 0; i < documents[0].first; i++) { - uint32_t seq_id = *(documents[0].second + i); - - nlohmann::json document; - auto op = collection->get_document_from_store(seq_id, document); - if (!op.ok()) { - return; - } - - result_ids.push_back(StringUtils::deserialize_uint32_t(document[reference_field->name + "_sequence_id"].get())); - } - - filter_ids = new uint32[result_ids.size()]; - std::sort(result_ids.begin(), result_ids.end()); - std::copy(result_ids.begin(), result_ids.end(), filter_ids); - filter_ids_length = result_ids.size(); - } - - delete [] documents[0].second; + filter_ids_length = documents.first; + filter_ids = documents.second; return; } @@ -2099,6 +2067,26 @@ void Index::do_filtering_with_lock(uint32_t*& filter_ids, recursive_filter(filter_ids, filter_ids_length, filter_tree_root, false); } +void Index::do_reference_filtering_with_lock(std::pair& reference_index_ids, + filter_node_t const* const& filter_tree_root, + const std::string& reference_field_name) const { + std::shared_lock lock(mutex); + recursive_filter(reference_index_ids.second, reference_index_ids.first, filter_tree_root, false); + + std::vector vector; + vector.reserve(reference_index_ids.first); + + for (uint32_t i = 0; i < reference_index_ids.first; i++) { + auto filtered_doc_id = *(reference_index_ids.second + i); + + // Extract the sequence_id from the reference field. + vector.push_back(sort_index.at(reference_field_name)->at(filtered_doc_id)); + } + + std::sort(vector.begin(), vector.end()); + std::copy(vector.begin(), vector.end(), reference_index_ids.second); +} + void Index::run_search(search_args* search_params) { search(search_params->field_query_tokens, search_params->search_fields, diff --git a/test/collection_join_test.cpp b/test/collection_join_test.cpp index 26a7d476..e7e5636f 100644 --- a/test/collection_join_test.cpp +++ b/test/collection_join_test.cpp @@ -101,18 +101,6 @@ TEST_F(CollectionJoinTest, SchemaReferenceField) { } TEST_F(CollectionJoinTest, IndexDocumentHavingReferenceField) { - auto products_schema_json = - R"({ - "name": "Products", - "fields": [ - {"name": "product_id", "type": "string", "index": false, "optional": true}, - {"name": "product_name", "type": "string"}, - {"name": "product_description", "type": "string"} - ] - })"_json; - auto collection_create_op = collectionManager.create_collection(products_schema_json); - ASSERT_TRUE(collection_create_op.ok()); - auto customers_schema_json = R"({ "name": "Customers", @@ -123,7 +111,7 @@ TEST_F(CollectionJoinTest, IndexDocumentHavingReferenceField) { {"name": "product_id", "type": "string", "reference": "foo"} ] })"_json; - collection_create_op = collectionManager.create_collection(customers_schema_json); + auto collection_create_op = collectionManager.create_collection(customers_schema_json); ASSERT_TRUE(collection_create_op.ok()); nlohmann::json customer_json = R"({ @@ -134,9 +122,9 @@ TEST_F(CollectionJoinTest, IndexDocumentHavingReferenceField) { })"_json; auto customer_collection = collection_create_op.get(); - auto add_op = customer_collection->add(customer_json.dump()); - ASSERT_FALSE(add_op.ok()); - ASSERT_EQ("Invalid reference `foo`.", add_op.error()); + auto add_doc_op = customer_collection->add(customer_json.dump()); + ASSERT_FALSE(add_doc_op.ok()); + ASSERT_EQ("Invalid reference `foo`.", add_doc_op.error()); collectionManager.drop_collection("Customers"); customers_schema_json = @@ -153,9 +141,9 @@ TEST_F(CollectionJoinTest, IndexDocumentHavingReferenceField) { ASSERT_TRUE(collection_create_op.ok()); customer_collection = collection_create_op.get(); - add_op = customer_collection->add(customer_json.dump()); - ASSERT_FALSE(add_op.ok()); - ASSERT_EQ("Referenced collection `products` not found.", add_op.error()); + add_doc_op = customer_collection->add(customer_json.dump()); + ASSERT_FALSE(add_doc_op.ok()); + ASSERT_EQ("Referenced collection `products` not found.", add_doc_op.error()); collectionManager.drop_collection("Customers"); customers_schema_json = @@ -170,11 +158,23 @@ TEST_F(CollectionJoinTest, IndexDocumentHavingReferenceField) { })"_json; collection_create_op = collectionManager.create_collection(customers_schema_json); ASSERT_TRUE(collection_create_op.ok()); - customer_collection = collection_create_op.get(); - add_op = customer_collection->add(customer_json.dump()); - ASSERT_FALSE(add_op.ok()); - ASSERT_EQ("Referenced field `id` not found in the collection `Products`.", add_op.error()); + + auto products_schema_json = + R"({ + "name": "Products", + "fields": [ + {"name": "product_id", "type": "string", "index": false, "optional": true}, + {"name": "product_name", "type": "string"}, + {"name": "product_description", "type": "string"} + ] + })"_json; + collection_create_op = collectionManager.create_collection(products_schema_json); + ASSERT_TRUE(collection_create_op.ok()); + + add_doc_op = customer_collection->add(customer_json.dump()); + ASSERT_FALSE(add_doc_op.ok()); + ASSERT_EQ("Referenced field `id` not found in the collection `Products`.", add_doc_op.error()); collectionManager.drop_collection("Customers"); customers_schema_json = @@ -191,9 +191,9 @@ TEST_F(CollectionJoinTest, IndexDocumentHavingReferenceField) { ASSERT_TRUE(collection_create_op.ok()); customer_collection = collection_create_op.get(); - add_op = customer_collection->add(customer_json.dump()); - ASSERT_FALSE(add_op.ok()); - ASSERT_EQ("Referenced field `product_id` in the collection `Products` must be indexed.", add_op.error()); + add_doc_op = customer_collection->add(customer_json.dump()); + ASSERT_FALSE(add_doc_op.ok()); + ASSERT_EQ("Referenced field `product_id` in the collection `Products` must be indexed.", add_doc_op.error()); collectionManager.drop_collection("Products"); products_schema_json = @@ -208,8 +208,8 @@ TEST_F(CollectionJoinTest, IndexDocumentHavingReferenceField) { collection_create_op = collectionManager.create_collection(products_schema_json); ASSERT_TRUE(collection_create_op.ok()); - add_op = customer_collection->add(customer_json.dump()); - ASSERT_EQ("Referenced document having `product_id` = `a` not found in the collection `Products`.", add_op.error()); + add_doc_op = customer_collection->add(customer_json.dump()); + ASSERT_EQ("Referenced document having `product_id` = `a` not found in the collection `Products`.", add_doc_op.error()); std::vector products = { R"({ @@ -232,8 +232,8 @@ TEST_F(CollectionJoinTest, IndexDocumentHavingReferenceField) { } customer_json["product_id"] = "product_a"; - add_op = customer_collection->add(customer_json.dump()); - ASSERT_EQ("Multiple documents having `product_id` = `product_a` found in the collection `Products`.", add_op.error()); + add_doc_op = customer_collection->add(customer_json.dump()); + ASSERT_EQ("Multiple documents having `product_id` = `product_a` found in the collection `Products`.", add_doc_op.error()); collectionManager.drop_collection("Products"); products[1]["product_id"] = "product_b"; @@ -268,15 +268,17 @@ TEST_F(CollectionJoinTest, IndexDocumentHavingReferenceField) { })"_json; collection_create_op = collectionManager.create_collection(customers_schema_json); ASSERT_TRUE(collection_create_op.ok()); - add_op = customer_collection->add(customer_json.dump()); - ASSERT_TRUE(add_op.ok()); + + customer_collection = collection_create_op.get(); + add_doc_op = customer_collection->add(customer_json.dump()); + ASSERT_TRUE(add_doc_op.ok()); ASSERT_EQ(customer_collection->get("0").get().count("product_id_sequence_id"), 1); - // Referenced document should be accessible from Customers collection. - auto sequence_id = collectionManager.get_collection("Products")->get_seq_id_collection_prefix() + "_" + - customer_collection->get("0").get()["product_id_sequence_id"].get(); nlohmann::json document; - auto get_op = customer_collection->get_document_from_store(sequence_id, document); + // Referenced document's sequence_id must be valid. + auto get_op = collectionManager.get_collection("Products")->get_document_from_store( + customer_collection->get("0").get()["product_id_sequence_id"].get(), + document); ASSERT_TRUE(get_op.ok()); ASSERT_EQ(document.count("product_id"), 1); ASSERT_EQ(document["product_id"], "product_a"); diff --git a/test/collection_manager_test.cpp b/test/collection_manager_test.cpp index 1bd74a27..b2410558 100644 --- a/test/collection_manager_test.cpp +++ b/test/collection_manager_test.cpp @@ -74,9 +74,11 @@ TEST_F(CollectionManagerTest, CollectionCreation) { ASSERT_EQ(0, collection1->get_collection_id()); ASSERT_EQ(0, collection1->get_next_seq_id()); ASSERT_EQ(facet_fields_expected, collection1->get_facet_fields()); - ASSERT_EQ(2, collection1->get_sort_fields().size()); + // product_id_sequence_id is also included + ASSERT_EQ(3, collection1->get_sort_fields().size()); ASSERT_EQ("location", collection1->get_sort_fields()[0].name); - ASSERT_EQ("points", collection1->get_sort_fields()[1].name); + ASSERT_EQ("product_id_sequence_id", collection1->get_sort_fields()[1].name); + ASSERT_EQ("points", collection1->get_sort_fields()[2].name); ASSERT_EQ(schema.size(), collection1->get_schema().size()); ASSERT_EQ("points", collection1->get_default_sorting_field()); ASSERT_EQ(false, schema.at("not_stored").index); @@ -234,8 +236,8 @@ TEST_F(CollectionManagerTest, CollectionCreation) { "name":"product_id_sequence_id", "nested":false, "optional":true, - "sort":false, - "type":"string" + "sort":true, + "type":"int64" } ], "id":0, @@ -473,9 +475,11 @@ TEST_F(CollectionManagerTest, RestoreRecordsOnRestart) { ASSERT_EQ(0, collection1->get_collection_id()); ASSERT_EQ(18, collection1->get_next_seq_id()); ASSERT_EQ(facet_fields_expected, collection1->get_facet_fields()); - ASSERT_EQ(2, collection1->get_sort_fields().size()); + // product_id_sequence_id is also included + ASSERT_EQ(3, collection1->get_sort_fields().size()); ASSERT_EQ("location", collection1->get_sort_fields()[0].name); - ASSERT_EQ("points", collection1->get_sort_fields()[1].name); + ASSERT_EQ("product_id_sequence_id", collection1->get_sort_fields()[1].name); + ASSERT_EQ("points", collection1->get_sort_fields()[2].name); ASSERT_EQ(schema.size(), collection1->get_schema().size()); ASSERT_EQ("points", collection1->get_default_sorting_field());