Filter by reference.

This commit is contained in:
Harpreet Sangar 2023-01-22 12:02:29 +05:30
parent 3bc7275e23
commit ebfbf4f48d
8 changed files with 260 additions and 10 deletions

View File

@ -436,6 +436,8 @@ public:
Option<bool> get_filter_ids(const std::string & filter_query,
std::vector<std::pair<size_t, uint32_t*>>& index_ids) const;
Option<bool> validate_reference_filter(const std::string& filter_query) const;
Option<nlohmann::json> get(const std::string & id) const;
Option<std::string> remove(const std::string & id, bool remove_from_store = true);

View File

@ -77,7 +77,7 @@ struct field {
static constexpr int VAL_UNKNOWN = 2;
std::string reference; // Reference to another collection.
std::string reference; // Foo.bar (reference to bar field in Foo collection).
field() {}
@ -448,6 +448,9 @@ struct filter {
// aggregated and then this flag is checked if negation on the aggregated result is required.
bool apply_not_equals = false;
// Would store `Foo` in case of a filter expression like `$Foo(bar := baz)`
std::string referenced_collection_name;
static const std::string RANGE_OPERATOR() {
return "..";
}

View File

@ -2362,6 +2362,22 @@ Option<bool> Collection::get_filter_ids(const std::string & filter_query,
return Option<bool>(true);
}
/// Validates that `filter_query` can be parsed against this collection's
/// schema, without actually applying it.
///
/// Used when another collection references this one via `$Collection(...)`,
/// so that a bad embedded filter is reported at parse time.
///
/// @param filter_query the raw filter expression to validate.
/// @return Option<bool>(true) when the filter parses cleanly, otherwise the
///         error produced by filter::parse_filter_query.
Option<bool> Collection::validate_reference_filter(const std::string& filter_query) const {
    std::shared_lock lock(mutex);

    const std::string doc_id_prefix = std::to_string(collection_id) + "_" + DOC_ID_PREFIX + "_";

    filter_node_t* filter_tree_root = nullptr;
    Option<bool> filter_op = filter::parse_filter_query(filter_query, search_schema,
                                                        store, doc_id_prefix, filter_tree_root);

    // Free the parse tree on BOTH paths: the original only deleted it on
    // success, leaking any partially built tree when parsing failed midway.
    // `delete nullptr` is a no-op, so this is safe even if parsing failed
    // before any node was allocated.
    delete filter_tree_root;

    if (!filter_op.ok()) {
        return filter_op;
    }

    return Option<bool>(true);
}
bool Collection::facet_value_to_string(const facet &a_facet, const facet_count_t &facet_count,
const nlohmann::json &document, std::string &value) const {

View File

@ -2,6 +2,7 @@
#include "field.h"
#include "magic_enum.hpp"
#include <stack>
#include <collection_manager.h>
Option<bool> filter::parse_geopoint_filter_value(std::string& raw_value,
const std::string& format_err_msg,
@ -408,9 +409,32 @@ Option<bool> toParseTree(std::queue<std::string>& postfix, filter_node_t*& root,
filter_node = new filter_node_t(expression == "&&" ? AND : OR, operandA, operandB);
} else {
filter filter_exp;
Option<bool> toFilter_op = toFilter(expression, filter_exp, search_schema, store, doc_id_prefix);
if (!toFilter_op.ok()) {
return toFilter_op;
// Expected value: $Collection(...)
bool is_referenced_filter = (expression[0] == '$' && expression[expression.size() - 1] == ')');
if (is_referenced_filter) {
size_t parenthesis_index = expression.find('(');
std::string collection_name = expression.substr(1, parenthesis_index - 1);
auto& cm = CollectionManager::get_instance();
auto collection = cm.get_collection(collection_name);
if (collection == nullptr) {
return Option<bool>(400, "Referenced collection `" + collection_name + "` not found.");
}
filter_exp = {expression.substr(parenthesis_index + 1, expression.size() - parenthesis_index - 2)};
filter_exp.referenced_collection_name = collection_name;
auto op = collection->validate_reference_filter(filter_exp.field_name);
if (!op.ok()) {
return Option<bool>(400, "Failed to parse reference filter on `" + collection_name +
"` collection: " + op.error());
}
} else {
Option<bool> toFilter_op = toFilter(expression, filter_exp, search_schema, store, doc_id_prefix);
if (!toFilter_op.ok()) {
return toFilter_op;
}
}
filter_node = new filter_node_t(filter_exp);

View File

@ -470,8 +470,7 @@ Option<uint32_t> Index::validate_index_in_memory(nlohmann::json& document, uint3
"Multiple documents having" + match + "found in the collection `" + tokens[0] + "`.");
}
document[a_field.name + "_sequence_id"] = collection->get_seq_id_collection_prefix() + "_" +
StringUtils::serialize_uint32_t(*(documents[0].second));
document[a_field.name + "_sequence_id"] = StringUtils::serialize_uint32_t(*(documents[0].second));
delete [] documents[0].second;
}
@ -1668,6 +1667,57 @@ void Index::do_filtering(uint32_t*& filter_ids,
// auto begin = std::chrono::high_resolution_clock::now();
const filter a_filter = root->filter_exp;
bool is_referenced_filter = !a_filter.referenced_collection_name.empty();
if (is_referenced_filter) {
// Apply filter on referenced collection and get the sequence ids of current collection from the filtered documents.
auto& cm = CollectionManager::get_instance();
auto collection = cm.get_collection(a_filter.referenced_collection_name);
std::vector<std::pair<size_t, uint32_t*>> documents;
auto op = collection->get_filter_ids(a_filter.field_name, documents);
if (!op.ok()) {
return;
}
if (documents[0].first > 0) {
const field* reference_field = nullptr;
for (auto const& f: collection->get_fields()) {
auto this_collection_name = cm.get_collection_with_id(collection_id)->get_name();
if (!f.reference.empty() &&
f.reference.find(this_collection_name) == 0 &&
f.reference.find('.') == this_collection_name.size()) {
reference_field = &f;
break;
}
}
if (reference_field == nullptr) {
return;
}
std::vector<uint32> result_ids;
for (size_t i = 0; i < documents[0].first; i++) {
uint32_t seq_id = *(documents[0].second + i);
nlohmann::json document;
auto op = collection->get_document_from_store(seq_id, document);
if (!op.ok()) {
return;
}
result_ids.push_back(StringUtils::deserialize_uint32_t(document[reference_field->name + "_sequence_id"].get<std::string>()));
}
filter_ids = new uint32[result_ids.size()];
std::sort(result_ids.begin(), result_ids.end());
std::copy(result_ids.begin(), result_ids.end(), filter_ids);
filter_ids_length = result_ids.size();
}
delete [] documents[0].second;
return;
}
if (a_filter.field_name == "id") {
// we handle `ids` separately
std::vector<uint32> result_ids;

View File

@ -349,9 +349,40 @@ size_t StringUtils::get_num_chars(const std::string& s) {
return j;
}
/// Extracts a reference filter expression of the form `$Collection(...)`
/// starting at `filter_query[index]` and pushes the whole expression as a
/// single token.
///
/// @param filter_query the full filter string being tokenized.
/// @param tokens       queue that receives the `$Collection(...)` token.
/// @param index        in-out cursor; on success it is left one past the
///                     closing parenthesis of the reference filter.
/// @return Option<bool>(true) on success, a 400 error when the expression
///         does not start with `$` or the parentheses are unbalanced.
Option<bool> parse_reference_filter(const std::string& filter_query, std::queue<std::string>& tokens, size_t& index) {
    auto error = Option<bool>(400, "Could not parse the reference filter.");
    if (filter_query[index] != '$') {
        return error;
    }

    // Keep the start position as size_t: the original `int start_index`
    // narrowed a size_t and mixed signed/unsigned arithmetic in the
    // substr() length computation below.
    const size_t start_index = index;
    const auto size = filter_query.size();

    // Advance to the opening parenthesis of `$Collection(`.
    while (++index < size && filter_query[index] != '(') {}
    if (index >= size) {
        return error;
    }

    // Find the matching closing parenthesis, tolerating nested parentheses
    // inside the embedded filter, e.g. `$Customers(a && (b || c))`.
    // NOTE(review): parentheses inside backtick-quoted values are not treated
    // specially here — confirm values cannot contain unbalanced parentheses.
    int parenthesis_count = 1;
    while (++index < size && parenthesis_count > 0) {
        if (filter_query[index] == '(') {
            parenthesis_count++;
        } else if (filter_query[index] == ')') {
            parenthesis_count--;
        }
    }
    if (parenthesis_count != 0) {
        return error;
    }

    // `index` now sits one past the closing parenthesis, so this emits the
    // entire `$Collection(...)` expression as one token.
    tokens.push(filter_query.substr(start_index, index - start_index));
    return Option<bool>(true);
}
Option<bool> StringUtils::tokenize_filter_query(const std::string& filter_query, std::queue<std::string>& tokens) {
auto size = filter_query.size();
for (auto i = 0; i < size;) {
for (size_t i = 0; i < size;) {
auto c = filter_query[i];
if (c == ' ') {
i++;
@ -377,6 +408,15 @@ Option<bool> StringUtils::tokenize_filter_query(const std::string& filter_query,
tokens.push("||");
i += 2;
} else {
// Reference filter would start with $ symbol.
if (c == '$') {
auto op = parse_reference_filter(filter_query, tokens, i);
if (!op.ok()) {
return op;
}
continue;
}
std::stringstream ss;
bool inBacktick = false;
bool preceding_colon = false;

View File

@ -93,14 +93,14 @@ TEST_F(CollectionJoinTest, SchemaReferenceField) {
ASSERT_EQ(schema.at("customer_name").reference, "");
ASSERT_EQ(schema.at("product_id").reference, "Products.product_id");
// Index a `foo_sequence_id` field for `foo` reference field.
// Add a `foo_sequence_id` field in the schema for `foo` reference field.
ASSERT_EQ(schema.count("product_id_sequence_id"), 1);
ASSERT_TRUE(schema.at("product_id_sequence_id").index);
collectionManager.drop_collection("Customers");
}
TEST_F(CollectionJoinTest, IndexReferenceField) {
TEST_F(CollectionJoinTest, IndexDocumentHavingReferenceField) {
auto products_schema_json =
R"({
"name": "Products",
@ -272,12 +272,123 @@ TEST_F(CollectionJoinTest, IndexReferenceField) {
ASSERT_TRUE(add_op.ok());
ASSERT_EQ(customer_collection->get("0").get().count("product_id_sequence_id"), 1);
// Referenced document should be accessible from Customers collection.
auto sequence_id = collectionManager.get_collection("Products")->get_seq_id_collection_prefix() + "_" +
customer_collection->get("0").get()["product_id_sequence_id"].get<std::string>();
nlohmann::json document;
auto get_op = customer_collection->get_document_from_store(customer_collection->get("0").get()["product_id_sequence_id"].get<std::string>(), document);
auto get_op = customer_collection->get_document_from_store(sequence_id, document);
ASSERT_TRUE(get_op.ok());
ASSERT_EQ(document.count("product_id"), 1);
ASSERT_EQ(document["product_id"], "product_a");
ASSERT_EQ(document["product_name"], "shampoo");
collectionManager.drop_collection("Customers");
collectionManager.drop_collection("Products");
}
// End-to-end test for `$Collection(...)` reference filters: indexes a
// referenced collection (Products) and a referencing collection (Customers,
// whose `product_id` field references `Products.product_id`), then checks
// tokenizer errors, unknown-collection/unknown-field errors, and finally a
// successful join-style filter from Products through Customers.
TEST_F(CollectionJoinTest, FilterByReferenceField) {
// Schema of the referenced collection.
auto schema_json =
R"({
"name": "Products",
"fields": [
{"name": "product_id", "type": "string"},
{"name": "product_name", "type": "string"},
{"name": "product_description", "type": "string"}
]
})"_json;
std::vector<nlohmann::json> documents = {
R"({
"product_id": "product_a",
"product_name": "shampoo",
"product_description": "Our new moisturizing shampoo is perfect for those with dry or damaged hair."
})"_json,
R"({
"product_id": "product_b",
"product_name": "soap",
"product_description": "Introducing our all-natural, organic soap bar made with essential oils and botanical ingredients."
})"_json
};
auto collection_create_op = collectionManager.create_collection(schema_json);
ASSERT_TRUE(collection_create_op.ok());
// Index the Products documents; log the error before asserting so a
// failure explains itself.
for (auto const &json: documents) {
auto add_op = collection_create_op.get()->add(json.dump());
if (!add_op.ok()) {
LOG(INFO) << add_op.error();
}
ASSERT_TRUE(add_op.ok());
}
// Referencing collection: `product_id` carries a reference to
// `Products.product_id`, which makes the reference filter join possible.
schema_json =
R"({
"name": "Customers",
"fields": [
{"name": "customer_id", "type": "string"},
{"name": "customer_name", "type": "string"},
{"name": "product_price", "type": "float"},
{"name": "product_id", "type": "string", "reference": "Products.product_id"}
]
})"_json;
// Two customers, each referencing both products at different prices.
documents = {
R"({
"customer_id": "customer_a",
"customer_name": "Joe",
"product_price": 143,
"product_id": "product_a"
})"_json,
R"({
"customer_id": "customer_a",
"customer_name": "Joe",
"product_price": 73.5,
"product_id": "product_b"
})"_json,
R"({
"customer_id": "customer_b",
"customer_name": "Dan",
"product_price": 75,
"product_id": "product_a"
})"_json,
R"({
"customer_id": "customer_b",
"customer_name": "Dan",
"product_price": 140,
"product_id": "product_b"
})"_json
};
collection_create_op = collectionManager.create_collection(schema_json);
ASSERT_TRUE(collection_create_op.ok());
for (auto const &json: documents) {
auto add_op = collection_create_op.get()->add(json.dump());
if (!add_op.ok()) {
LOG(INFO) << add_op.error();
}
ASSERT_TRUE(add_op.ok());
}
auto coll = collectionManager.get_collection("Products");
// Malformed: `$` present but no parenthesized filter — tokenizer error.
auto search_op = coll->search("s", {"product_name"}, "$foo:=customer_a", {}, {}, {0},
10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD);
ASSERT_FALSE(search_op.ok());
ASSERT_EQ(search_op.error(), "Could not parse the reference filter.");
// Malformed: unbalanced parenthesis — tokenizer error.
search_op = coll->search("s", {"product_name"}, "$foo(:=customer_a", {}, {}, {0},
10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD);
ASSERT_FALSE(search_op.ok());
ASSERT_EQ(search_op.error(), "Could not parse the reference filter.");
// Well-formed token but `foo` is not an existing collection.
search_op = coll->search("s", {"product_name"}, "$foo(:=customer_a)", {}, {}, {0},
10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD);
ASSERT_FALSE(search_op.ok());
ASSERT_EQ(search_op.error(), "Referenced collection `foo` not found.");
// Valid collection, but the embedded filter references an unknown field —
// surfaced via validate_reference_filter with the collection name prefixed.
search_op = coll->search("s", {"product_name"}, "$Customers(foo:=customer_a)", {}, {}, {0},
10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD);
ASSERT_FALSE(search_op.ok());
ASSERT_EQ(search_op.error(), "Failed to parse reference filter on `Customers` collection: Could not find a filter field named `foo` in the schema.");
// Happy path: of customer_a's two purchases, only product_b (73.5) is
// under 100, so exactly one Products hit ("soap") should come back.
auto result = coll->search("s", {"product_name"}, "$Customers(customer_id:=customer_a && product_price:<100)", {}, {}, {0},
10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD).get();
ASSERT_EQ(1, result["found"].get<size_t>());
ASSERT_EQ(1, result["hits"].size());
ASSERT_EQ("soap", result["hits"][0]["document"]["product_name"].get<std::string>());
// NOTE(review): unlike the sibling tests above, this test does not drop the
// `Products`/`Customers` collections at the end — confirm the fixture's
// teardown cleans them up, otherwise state may leak across tests.
}

View File

@ -393,4 +393,8 @@ TEST(StringUtilsTest, TokenizeFilterQuery) {
filter_query = "((age:<5||age:>10)&&location:(48.906,2.343,5mi))||tags:AT&T";
tokenList = {"(", "(", "age:<5", "||", "age:>10", ")", "&&", "location:(48.906,2.343,5mi)", ")", "||", "tags:AT&T"};
tokenizeTestHelper(filter_query, tokenList);
filter_query = "((age: <5 || age: >10) && category:= [shoes]) && $Customers(customer_id:=customer_a && (product_price:>100 && product_price:<200))";
tokenList = {"(", "(", "age: <5", "||", "age: >10", ")", "&&", "category:= [shoes]", ")", "&&", "$Customers(customer_id:=customer_a && (product_price:>100 && product_price:<200))"};
tokenizeTestHelper(filter_query, tokenList);
}