Optimize reference filtering.

This commit is contained in:
Harpreet Sangar 2023-01-24 10:57:29 +05:30
parent ebfbf4f48d
commit 6c5662bc95
7 changed files with 120 additions and 83 deletions

View File

@ -436,6 +436,10 @@ public:
Option<bool> get_filter_ids(const std::string & filter_query,
std::vector<std::pair<size_t, uint32_t*>>& index_ids) const;
Option<bool> get_reference_filter_ids(const std::string & filter_query,
const std::string & collection_name,
std::pair<uint32_t, uint32_t*>& reference_index_ids) const;
Option<bool> validate_reference_filter(const std::string& filter_query) const;
Option<nlohmann::json> get(const std::string & id) const;

View File

@ -715,6 +715,10 @@ public:
uint32_t& filter_ids_length,
filter_node_t const* const& filter_tree_root) const;
void do_reference_filtering_with_lock(std::pair<uint32_t, uint32_t*>& reference_index_ids,
filter_node_t const* const& filter_tree_root,
const std::string& reference_field_name) const;
void refresh_schemas(const std::vector<field>& new_fields, const std::vector<field>& del_fields);
// the following methods are not synchronized because their parent calls are synchronized or they are const/static

View File

@ -2362,6 +2362,41 @@ Option<bool> Collection::get_filter_ids(const std::string & filter_query,
return Option<bool>(true);
}
Option<bool> Collection::get_reference_filter_ids(const std::string & filter_query,
const std::string & collection_name,
std::pair<uint32_t, uint32_t*>& reference_index_ids) const {
std::shared_lock lock(mutex);
std::string reference_field_name;
for (auto const& field: fields) {
if (!field.reference.empty() &&
field.reference.find(collection_name) == 0 &&
field.reference.find('.') == collection_name.size()) {
reference_field_name = field.name;
break;
}
}
if (reference_field_name.empty()) {
return Option<bool>(400, "Could not find any field in `" + name + "` referencing the collection `"
+ collection_name + "`.");
}
const std::string doc_id_prefix = std::to_string(collection_id) + "_" + DOC_ID_PREFIX + "_";
filter_node_t* filter_tree_root = nullptr;
Option<bool> filter_op = filter::parse_filter_query(filter_query, search_schema,
store, doc_id_prefix, filter_tree_root);
if(!filter_op.ok()) {
return filter_op;
}
reference_field_name += "_sequence_id";
index->do_reference_filtering_with_lock(reference_index_ids, filter_tree_root, reference_field_name);
delete filter_tree_root;
return Option<bool>(true);
}
Option<bool> Collection::validate_reference_filter(const std::string& filter_query) const {
std::shared_lock lock(mutex);

View File

@ -723,7 +723,7 @@ Option<bool> field::json_field_to_field(bool enable_nested_fields, nlohmann::jso
if (!field_json[fields::reference].get<std::string>().empty()) {
the_fields.emplace_back(
field(field_json[fields::name].get<std::string>() + "_sequence_id", "string", false,
field(field_json[fields::name].get<std::string>() + "_sequence_id", "int64", false,
field_json[fields::optional], true)
);
}

View File

@ -470,7 +470,7 @@ Option<uint32_t> Index::validate_index_in_memory(nlohmann::json& document, uint3
"Multiple documents having" + match + "found in the collection `" + tokens[0] + "`.");
}
document[a_field.name + "_sequence_id"] = StringUtils::serialize_uint32_t(*(documents[0].second));
document[a_field.name + "_sequence_id"] = *(documents[0].second);
delete [] documents[0].second;
}
@ -1665,7 +1665,7 @@ void Index::do_filtering(uint32_t*& filter_ids,
uint32_t& filter_ids_length,
filter_node_t const* const root) const {
// auto begin = std::chrono::high_resolution_clock::now();
const filter a_filter = root->filter_exp;
/**/ const filter a_filter = root->filter_exp;
bool is_referenced_filter = !a_filter.referenced_collection_name.empty();
if (is_referenced_filter) {
@ -1673,48 +1673,16 @@ void Index::do_filtering(uint32_t*& filter_ids,
auto& cm = CollectionManager::get_instance();
auto collection = cm.get_collection(a_filter.referenced_collection_name);
std::vector<std::pair<size_t, uint32_t*>> documents;
auto op = collection->get_filter_ids(a_filter.field_name, documents);
std::pair<uint32_t, uint32_t*> documents;
auto op = collection->get_reference_filter_ids(a_filter.field_name,
cm.get_collection_with_id(collection_id)->get_name(),
documents);
if (!op.ok()) {
return;
}
if (documents[0].first > 0) {
const field* reference_field = nullptr;
for (auto const& f: collection->get_fields()) {
auto this_collection_name = cm.get_collection_with_id(collection_id)->get_name();
if (!f.reference.empty() &&
f.reference.find(this_collection_name) == 0 &&
f.reference.find('.') == this_collection_name.size()) {
reference_field = &f;
break;
}
}
if (reference_field == nullptr) {
return;
}
std::vector<uint32> result_ids;
for (size_t i = 0; i < documents[0].first; i++) {
uint32_t seq_id = *(documents[0].second + i);
nlohmann::json document;
auto op = collection->get_document_from_store(seq_id, document);
if (!op.ok()) {
return;
}
result_ids.push_back(StringUtils::deserialize_uint32_t(document[reference_field->name + "_sequence_id"].get<std::string>()));
}
filter_ids = new uint32[result_ids.size()];
std::sort(result_ids.begin(), result_ids.end());
std::copy(result_ids.begin(), result_ids.end(), filter_ids);
filter_ids_length = result_ids.size();
}
delete [] documents[0].second;
filter_ids_length = documents.first;
filter_ids = documents.second;
return;
}
@ -2099,6 +2067,26 @@ void Index::do_filtering_with_lock(uint32_t*& filter_ids,
recursive_filter(filter_ids, filter_ids_length, filter_tree_root, false);
}
void Index::do_reference_filtering_with_lock(std::pair<uint32_t, uint32_t*>& reference_index_ids,
filter_node_t const* const& filter_tree_root,
const std::string& reference_field_name) const {
std::shared_lock lock(mutex);
recursive_filter(reference_index_ids.second, reference_index_ids.first, filter_tree_root, false);
std::vector<uint32> vector;
vector.reserve(reference_index_ids.first);
for (uint32_t i = 0; i < reference_index_ids.first; i++) {
auto filtered_doc_id = *(reference_index_ids.second + i);
// Extract the sequence_id from the reference field.
vector.push_back(sort_index.at(reference_field_name)->at(filtered_doc_id));
}
std::sort(vector.begin(), vector.end());
std::copy(vector.begin(), vector.end(), reference_index_ids.second);
}
void Index::run_search(search_args* search_params) {
search(search_params->field_query_tokens,
search_params->search_fields,

View File

@ -101,18 +101,6 @@ TEST_F(CollectionJoinTest, SchemaReferenceField) {
}
TEST_F(CollectionJoinTest, IndexDocumentHavingReferenceField) {
auto products_schema_json =
R"({
"name": "Products",
"fields": [
{"name": "product_id", "type": "string", "index": false, "optional": true},
{"name": "product_name", "type": "string"},
{"name": "product_description", "type": "string"}
]
})"_json;
auto collection_create_op = collectionManager.create_collection(products_schema_json);
ASSERT_TRUE(collection_create_op.ok());
auto customers_schema_json =
R"({
"name": "Customers",
@ -123,7 +111,7 @@ TEST_F(CollectionJoinTest, IndexDocumentHavingReferenceField) {
{"name": "product_id", "type": "string", "reference": "foo"}
]
})"_json;
collection_create_op = collectionManager.create_collection(customers_schema_json);
auto collection_create_op = collectionManager.create_collection(customers_schema_json);
ASSERT_TRUE(collection_create_op.ok());
nlohmann::json customer_json = R"({
@ -134,9 +122,9 @@ TEST_F(CollectionJoinTest, IndexDocumentHavingReferenceField) {
})"_json;
auto customer_collection = collection_create_op.get();
auto add_op = customer_collection->add(customer_json.dump());
ASSERT_FALSE(add_op.ok());
ASSERT_EQ("Invalid reference `foo`.", add_op.error());
auto add_doc_op = customer_collection->add(customer_json.dump());
ASSERT_FALSE(add_doc_op.ok());
ASSERT_EQ("Invalid reference `foo`.", add_doc_op.error());
collectionManager.drop_collection("Customers");
customers_schema_json =
@ -153,9 +141,9 @@ TEST_F(CollectionJoinTest, IndexDocumentHavingReferenceField) {
ASSERT_TRUE(collection_create_op.ok());
customer_collection = collection_create_op.get();
add_op = customer_collection->add(customer_json.dump());
ASSERT_FALSE(add_op.ok());
ASSERT_EQ("Referenced collection `products` not found.", add_op.error());
add_doc_op = customer_collection->add(customer_json.dump());
ASSERT_FALSE(add_doc_op.ok());
ASSERT_EQ("Referenced collection `products` not found.", add_doc_op.error());
collectionManager.drop_collection("Customers");
customers_schema_json =
@ -170,11 +158,23 @@ TEST_F(CollectionJoinTest, IndexDocumentHavingReferenceField) {
})"_json;
collection_create_op = collectionManager.create_collection(customers_schema_json);
ASSERT_TRUE(collection_create_op.ok());
customer_collection = collection_create_op.get();
add_op = customer_collection->add(customer_json.dump());
ASSERT_FALSE(add_op.ok());
ASSERT_EQ("Referenced field `id` not found in the collection `Products`.", add_op.error());
auto products_schema_json =
R"({
"name": "Products",
"fields": [
{"name": "product_id", "type": "string", "index": false, "optional": true},
{"name": "product_name", "type": "string"},
{"name": "product_description", "type": "string"}
]
})"_json;
collection_create_op = collectionManager.create_collection(products_schema_json);
ASSERT_TRUE(collection_create_op.ok());
add_doc_op = customer_collection->add(customer_json.dump());
ASSERT_FALSE(add_doc_op.ok());
ASSERT_EQ("Referenced field `id` not found in the collection `Products`.", add_doc_op.error());
collectionManager.drop_collection("Customers");
customers_schema_json =
@ -191,9 +191,9 @@ TEST_F(CollectionJoinTest, IndexDocumentHavingReferenceField) {
ASSERT_TRUE(collection_create_op.ok());
customer_collection = collection_create_op.get();
add_op = customer_collection->add(customer_json.dump());
ASSERT_FALSE(add_op.ok());
ASSERT_EQ("Referenced field `product_id` in the collection `Products` must be indexed.", add_op.error());
add_doc_op = customer_collection->add(customer_json.dump());
ASSERT_FALSE(add_doc_op.ok());
ASSERT_EQ("Referenced field `product_id` in the collection `Products` must be indexed.", add_doc_op.error());
collectionManager.drop_collection("Products");
products_schema_json =
@ -208,8 +208,8 @@ TEST_F(CollectionJoinTest, IndexDocumentHavingReferenceField) {
collection_create_op = collectionManager.create_collection(products_schema_json);
ASSERT_TRUE(collection_create_op.ok());
add_op = customer_collection->add(customer_json.dump());
ASSERT_EQ("Referenced document having `product_id` = `a` not found in the collection `Products`.", add_op.error());
add_doc_op = customer_collection->add(customer_json.dump());
ASSERT_EQ("Referenced document having `product_id` = `a` not found in the collection `Products`.", add_doc_op.error());
std::vector<nlohmann::json> products = {
R"({
@ -232,8 +232,8 @@ TEST_F(CollectionJoinTest, IndexDocumentHavingReferenceField) {
}
customer_json["product_id"] = "product_a";
add_op = customer_collection->add(customer_json.dump());
ASSERT_EQ("Multiple documents having `product_id` = `product_a` found in the collection `Products`.", add_op.error());
add_doc_op = customer_collection->add(customer_json.dump());
ASSERT_EQ("Multiple documents having `product_id` = `product_a` found in the collection `Products`.", add_doc_op.error());
collectionManager.drop_collection("Products");
products[1]["product_id"] = "product_b";
@ -268,15 +268,17 @@ TEST_F(CollectionJoinTest, IndexDocumentHavingReferenceField) {
})"_json;
collection_create_op = collectionManager.create_collection(customers_schema_json);
ASSERT_TRUE(collection_create_op.ok());
add_op = customer_collection->add(customer_json.dump());
ASSERT_TRUE(add_op.ok());
customer_collection = collection_create_op.get();
add_doc_op = customer_collection->add(customer_json.dump());
ASSERT_TRUE(add_doc_op.ok());
ASSERT_EQ(customer_collection->get("0").get().count("product_id_sequence_id"), 1);
// Referenced document should be accessible from Customers collection.
auto sequence_id = collectionManager.get_collection("Products")->get_seq_id_collection_prefix() + "_" +
customer_collection->get("0").get()["product_id_sequence_id"].get<std::string>();
nlohmann::json document;
auto get_op = customer_collection->get_document_from_store(sequence_id, document);
// Referenced document's sequence_id must be valid.
auto get_op = collectionManager.get_collection("Products")->get_document_from_store(
customer_collection->get("0").get()["product_id_sequence_id"].get<uint32_t>(),
document);
ASSERT_TRUE(get_op.ok());
ASSERT_EQ(document.count("product_id"), 1);
ASSERT_EQ(document["product_id"], "product_a");

View File

@ -74,9 +74,11 @@ TEST_F(CollectionManagerTest, CollectionCreation) {
ASSERT_EQ(0, collection1->get_collection_id());
ASSERT_EQ(0, collection1->get_next_seq_id());
ASSERT_EQ(facet_fields_expected, collection1->get_facet_fields());
ASSERT_EQ(2, collection1->get_sort_fields().size());
// product_id_sequence_id is also included
ASSERT_EQ(3, collection1->get_sort_fields().size());
ASSERT_EQ("location", collection1->get_sort_fields()[0].name);
ASSERT_EQ("points", collection1->get_sort_fields()[1].name);
ASSERT_EQ("product_id_sequence_id", collection1->get_sort_fields()[1].name);
ASSERT_EQ("points", collection1->get_sort_fields()[2].name);
ASSERT_EQ(schema.size(), collection1->get_schema().size());
ASSERT_EQ("points", collection1->get_default_sorting_field());
ASSERT_EQ(false, schema.at("not_stored").index);
@ -234,8 +236,8 @@ TEST_F(CollectionManagerTest, CollectionCreation) {
"name":"product_id_sequence_id",
"nested":false,
"optional":true,
"sort":false,
"type":"string"
"sort":true,
"type":"int64"
}
],
"id":0,
@ -473,9 +475,11 @@ TEST_F(CollectionManagerTest, RestoreRecordsOnRestart) {
ASSERT_EQ(0, collection1->get_collection_id());
ASSERT_EQ(18, collection1->get_next_seq_id());
ASSERT_EQ(facet_fields_expected, collection1->get_facet_fields());
ASSERT_EQ(2, collection1->get_sort_fields().size());
// product_id_sequence_id is also included
ASSERT_EQ(3, collection1->get_sort_fields().size());
ASSERT_EQ("location", collection1->get_sort_fields()[0].name);
ASSERT_EQ("points", collection1->get_sort_fields()[1].name);
ASSERT_EQ("product_id_sequence_id", collection1->get_sort_fields()[1].name);
ASSERT_EQ("points", collection1->get_sort_fields()[2].name);
ASSERT_EQ(schema.size(), collection1->get_schema().size());
ASSERT_EQ("points", collection1->get_default_sorting_field());