diff --git a/include/collection.h b/include/collection.h index c6592b77..720631c0 100644 --- a/include/collection.h +++ b/include/collection.h @@ -229,7 +229,8 @@ private: const tsl::htrie_map& search_schema, std::vector& processed_search_fields, bool extract_only_string_fields, - bool enable_nested_fields); + bool enable_nested_fields, + const bool handle_wildcard = true); bool is_nested_array(const nlohmann::json& obj, std::vector path_parts, size_t part_i) const; diff --git a/src/collection.cpp b/src/collection.cpp index 5af7a5b6..e2d2eb4e 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -884,13 +884,19 @@ Option Collection::extract_field_name(const std::string& field_name, const tsl::htrie_map& search_schema, std::vector& processed_search_fields, const bool extract_only_string_fields, - const bool enable_nested_fields) { + const bool enable_nested_fields, + const bool handle_wildcard) { if(field_name == "id") { processed_search_fields.push_back(field_name); return Option(true); } - auto prefix_it = search_schema.equal_prefix_range(field_name); + bool is_wildcard = field_name.find('*') != std::string::npos; + if (is_wildcard && !handle_wildcard) { + return Option(400, "Pattern `" + field_name + "` is not allowed."); + } + // If wildcard, remove * + auto prefix_it = search_schema.equal_prefix_range(field_name.substr(0, field_name.size() - is_wildcard)); bool field_found = false; for(auto kv = prefix_it.first; kv != prefix_it.second; ++kv) { @@ -898,7 +904,7 @@ Option Collection::extract_field_name(const std::string& field_name, bool exact_primitive_match = exact_key_match && !kv.value().is_object(); if(extract_only_string_fields && !kv.value().is_string()) { - if(exact_primitive_match) { + if(exact_primitive_match && !is_wildcard) { // upstream needs to be returned an error return Option(400, "Field `" + field_name + "` should be a string or a string array."); } @@ -906,16 +912,19 @@ Option Collection::extract_field_name(const std::string& field_name, 
continue; } - // field_name prefix must be followed by a "." to indicate an object search - if (exact_primitive_match || (enable_nested_fields && kv.key().size() > field_name.size() && - kv.key()[field_name.size()] == '.')) { + if (exact_primitive_match || is_wildcard || + // field_name prefix must be followed by a "." to indicate an object search + (enable_nested_fields && kv.key().size() > field_name.size() && kv.key()[field_name.size()] == '.')) { processed_search_fields.push_back(kv.key()); field_found = true; } } - if(!field_found) { - std::string error = "Could not find a field named `" + field_name + "` in the schema."; + if (is_wildcard && extract_only_string_fields && !field_found) { + std::string error = "No string or string array field found matching the pattern `" + field_name + "` in the schema."; + return Option(404, error); + } else if (!field_found) { + std::string error = is_wildcard ? ("No field found matching the pattern `" + field_name + "` in the schema.") : ("Could not find a field named `" + field_name + "` in the schema."); + return Option(404, error); } @@ -1092,7 +1101,7 @@ Option Collection::search(const std::string & raw_query, std::vector group_by_fields; for(const std::string& field_name: raw_group_by_fields) { - auto field_op = extract_field_name(field_name, search_schema, group_by_fields, false, enable_nested_fields); + auto field_op = extract_field_name(field_name, search_schema, group_by_fields, false, enable_nested_fields, false); if(!field_op.ok()) { return Option(404, field_op.error()); } diff --git a/test/collection_grouping_test.cpp b/test/collection_grouping_test.cpp index 9835f44a..707eb2af 100644 --- a/test/collection_grouping_test.cpp +++ b/test/collection_grouping_test.cpp @@ -141,6 +141,15 @@ TEST_F(CollectionGroupingTest, GroupingBasics) { ASSERT_EQ(3, (int) res["facet_counts"][0]["counts"][0]["count"]); ASSERT_STREQ("Omega", res["facet_counts"][0]["counts"][0]["value"].get().c_str()); ASSERT_STREQ("Omega",
res["facet_counts"][0]["counts"][0]["highlighted"].get().c_str()); + + // Wildcard group_by is not allowed + auto error = coll_group->search("*", {}, "", {"brand"}, {}, {0}, 50, 1, FREQUENCY, + {false}, Index::DROP_TOKENS_THRESHOLD, + spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30, 5, + "", 10, + {}, {}, {"foo*"}, 2).error(); + ASSERT_EQ("Pattern `foo*` is not allowed.", error); } TEST_F(CollectionGroupingTest, GroupingCompoundKey) { diff --git a/test/collection_specific_more_test.cpp b/test/collection_specific_more_test.cpp index 57769205..5a3e16fe 100644 --- a/test/collection_specific_more_test.cpp +++ b/test/collection_specific_more_test.cpp @@ -1592,6 +1592,93 @@ TEST_F(CollectionSpecificMoreTest, IncludeExcludeUnIndexedField) { ASSERT_EQ("0", res["hits"][0]["document"]["id"].get()); } +TEST_F(CollectionSpecificMoreTest, WildcardIncludeExclude) { + nlohmann::json schema = R"({ + "name": "posts", + "enable_nested_fields": true, + "fields": [ + {"name": "username", "type": "string", "facet": true}, + {"name": "user.rank", "type": "int32", "facet": true}, + {"name": "user.bio", "type": "string"}, + {"name": "likes", "type": "int32"}, + {"name": "content", "type": "object"} + ], + "default_sorting_field": "likes" + })"_json; + + auto op = collectionManager.create_collection(schema); + ASSERT_TRUE(op.ok()); + Collection* coll = op.get(); + + std::vector json_lines = { + R"({"id": "124","username": "user_a","user": {"rank": 100,"bio": "Hi! 
I'm user_a"},"likes": 5215,"content": {"title": "title 1","body": "body 1"}})", + R"({"id": "125","username": "user_b","user": {"rank": 50,"bio": "user_b here, nice to meet you!"},"likes": 5215,"content": {"title": "title 2","body": "body 2"}})" + }; + + for (auto const& json: json_lines){ + auto add_op = coll->add(json); + if (!add_op.ok()) { + LOG(INFO) << add_op.error(); + } + ASSERT_TRUE(add_op.ok()); + } + + // include test: user* matches username, user.bio and user.rank + auto result = coll->search("user_a", {"username"}, "", {}, {}, {0}, + 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD, {"user*"}).get(); + + ASSERT_EQ(1, result["found"].get()); + ASSERT_EQ(1, result["hits"].size()); + + ASSERT_EQ(0, result["hits"][0]["document"].count("id")); + ASSERT_EQ(0, result["hits"][0]["document"].count("likes")); + ASSERT_EQ(0, result["hits"][0]["document"].count("content")); + ASSERT_EQ(1, result["hits"][0]["document"].count("user")); + ASSERT_EQ(1, result["hits"][0]["document"]["user"].count("bio")); + ASSERT_EQ(1, result["hits"][0]["document"]["user"].count("rank")); + ASSERT_EQ(1, result["hits"][0]["document"].count("username")); + + spp::sparse_hash_set include_fields; + // exclude test: user.* matches user.rank and user.bio + result = coll->search("user_a", {"username"}, "", {}, {}, {0}, + 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD, include_fields, {"user.*"}).get(); + + ASSERT_EQ(1, result["found"].get()); + ASSERT_EQ(1, result["hits"].size()); + + ASSERT_EQ(1, result["hits"][0]["document"].count("id")); + ASSERT_EQ(1, result["hits"][0]["document"].count("likes")); + ASSERT_EQ(1, result["hits"][0]["document"].count("content")); + ASSERT_EQ(1, result["hits"][0]["document"]["content"].count("title")); + ASSERT_EQ(1, result["hits"][0]["document"]["content"].count("body")); + ASSERT_EQ(0, result["hits"][0]["document"].count("user")); + ASSERT_EQ(1, result["hits"][0]["document"].count("username")); + + // No matching field for 
include_fields/exclude_fields + result = coll->search("user_a", {"username"}, "", {}, {}, {0}, + 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD, {"foo.*"}).get(); + + ASSERT_EQ(1, result["found"].get()); + ASSERT_EQ(1, result["hits"].size()); + ASSERT_EQ(0, result["hits"][0]["document"].size()); + + result = coll->search("user_a", {"username"}, "", {}, {}, {0}, + 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD, include_fields, {"foo.*"}).get(); + + ASSERT_EQ(1, result["found"].get()); + ASSERT_EQ(1, result["hits"].size()); + + ASSERT_EQ(1, result["hits"][0]["document"].count("id")); + ASSERT_EQ(1, result["hits"][0]["document"].count("likes")); + ASSERT_EQ(1, result["hits"][0]["document"].count("content")); + ASSERT_EQ(1, result["hits"][0]["document"]["content"].count("title")); + ASSERT_EQ(1, result["hits"][0]["document"]["content"].count("body")); + ASSERT_EQ(1, result["hits"][0]["document"].count("user")); + ASSERT_EQ(1, result["hits"][0]["document"]["user"].count("bio")); + ASSERT_EQ(1, result["hits"][0]["document"]["user"].count("rank")); + ASSERT_EQ(1, result["hits"][0]["document"].count("username")); +} + TEST_F(CollectionSpecificMoreTest, PhraseMatchRepeatingTokens) { nlohmann::json schema = R"({ "name": "coll1", diff --git a/test/collection_test.cpp b/test/collection_test.cpp index 2e547041..e46c8ce0 100644 --- a/test/collection_test.cpp +++ b/test/collection_test.cpp @@ -4350,3 +4350,259 @@ TEST_F(CollectionTest, QueryParsingForPhraseSearch) { collectionManager.drop_collection("coll1"); } + +TEST_F(CollectionTest, WildcardQueryBy) { + nlohmann::json schema = R"({ + "name": "posts", + "enable_nested_fields": true, + "fields": [ + {"name": "username", "type": "string", "facet": true}, + {"name": "user.rank", "type": "int32", "facet": true}, + {"name": "user.bio", "type": "string"}, + {"name": "likes", "type": "int32"}, + {"name": "content", "type": "object"} + ], + "default_sorting_field": "likes" + })"_json; + + auto op = 
collectionManager.create_collection(schema); + ASSERT_TRUE(op.ok()); + Collection* coll = op.get(); + + std::vector json_lines = { + R"({"id": "124","username": "user_a","user": {"rank": 100,"bio": "Hi! I'm user_a"},"likes": 5215,"content": {"title": "title 1","body": "body 1 user_a"}})", + R"({"id": "125","username": "user_b","user": {"rank": 50,"bio": "user_b here, nice to meet you!"},"likes": 5215,"content": {"title": "title 2","body": "body 2 user_b"}})" + }; + + for (auto const& json: json_lines){ + auto add_op = coll->add(json); + if (!add_op.ok()) { + LOG(INFO) << add_op.error(); + } + ASSERT_TRUE(add_op.ok()); + } + + // * matches username, user.bio, content.title, content.body + auto result = coll->search("user_a", {"*"}, "", {}, {}, {0}).get(); + + ASSERT_EQ(1, result["found"].get()); + ASSERT_EQ(1, result["hits"].size()); + + ASSERT_EQ("Hi! I'm user_a", + result["hits"][0]["highlight"]["user"]["bio"]["snippet"].get()); + ASSERT_EQ("user_a", + result["hits"][0]["highlight"]["username"]["snippet"].get()); +// ASSERT_EQ("body 1 user_a", +// result["hits"][0]["highlight"]["content"]["body"]["snippet"].get()); + + // user* matches username and user.bio + result = coll->search("user_a", {"user*"}, "", {}, {}, {0}).get(); + + ASSERT_EQ(1, result["found"].get()); + ASSERT_EQ(1, result["hits"].size()); + + ASSERT_EQ("Hi! I'm user_a", + result["hits"][0]["highlight"]["user"]["bio"]["snippet"].get()); + ASSERT_EQ("user_a", + result["hits"][0]["highlight"]["username"]["snippet"].get()); + + // user.* matches user.bio + result = coll->search("user_a", {"user.*"}, "", {}, {}, {0}).get(); + + ASSERT_EQ(1, result["found"].get()); + ASSERT_EQ(1, result["hits"].size()); + + ASSERT_EQ("Hi! 
I'm user_a", + result["hits"][0]["highlight"]["user"]["bio"]["snippet"].get()); + + // user.rank cannot be queried + result = coll->search("100", {"user*"}, "", {}, {}, {0}).get(); + ASSERT_EQ(0, result["found"].get()); + ASSERT_EQ(0, result["hits"].size()); + + // No matching field for query_by + auto error = coll->search("user_a", {"foo*"}, "", {}, {}, {0}).error(); + ASSERT_EQ("No string or string array field found matching the pattern `foo*` in the schema.", error); +} + +TEST_F(CollectionTest, WildcardHighlightFields) { + nlohmann::json schema = R"({ + "name": "posts", + "enable_nested_fields": true, + "fields": [ + {"name": "user_name", "type": "string", "facet": true}, + {"name": "user", "type": "object"} + ] + })"_json; + + auto op = collectionManager.create_collection(schema); + ASSERT_TRUE(op.ok()); + Collection* coll = op.get(); + + auto add_op = coll->add(R"({"id": "124","user_name": "user_a","user": {"rank": 100,"phone": "+91 123123123"}})"); + if (!add_op.ok()) { + LOG(INFO) << add_op.error(); + } + ASSERT_TRUE(add_op.ok()); + + spp::sparse_hash_set dummy_include_exclude; + std::string highlight_fields = "user*"; + // user* matches user_name, user.rank and user.phone + auto result = coll->search("+91", {"user"}, "", {}, {}, {0}, + 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD, dummy_include_exclude, dummy_include_exclude, 10, "", + 30, 4, "", Index::TYPO_TOKENS_THRESHOLD, "", "", {}, 3, "", "", {}, UINT32_MAX, + true, false, true, highlight_fields).get(); + + ASSERT_EQ(1, result["found"].get()); + ASSERT_EQ(1, result["hits"].size()); + +// ASSERT_EQ("+91 123123123", +// result["hits"][0]["highlight"]["user"]["phone"]["snippet"].get()); +// ASSERT_EQ("100", +// result["hits"][0]["highlight"]["user"]["rank"]["snippet"].get()); + ASSERT_EQ("user_a", + result["hits"][0]["highlight"]["user_name"]["snippet"].get()); + + highlight_fields = "user.*"; + // user.* matches user.rank and user.phone + result = coll->search("+91", {"user"}, "", {}, {}, 
{0}, + 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD, dummy_include_exclude, dummy_include_exclude, 10, "", + 30, 4, "", Index::TYPO_TOKENS_THRESHOLD, "", "", {}, 3, "", "", {}, UINT32_MAX, + true, false, true, highlight_fields).get(); + + ASSERT_EQ(1, result["found"].get()); + ASSERT_EQ(1, result["hits"].size()); + + ASSERT_EQ("+91 123123123", + result["hits"][0]["highlight"]["user"]["phone"]["snippet"].get()); +// ASSERT_EQ("100", +// result["hits"][0]["highlight"]["user"]["rank"]["snippet"].get()); + + highlight_fields = "user*"; + // user* matches user_name, user.rank and user.phone + result = coll->search("user_a", {"user_name"}, "", {}, {}, {0}, + 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD, dummy_include_exclude, dummy_include_exclude, 10, "", + 30, 4, "", Index::TYPO_TOKENS_THRESHOLD, "", "", {}, 3, "", "", {}, UINT32_MAX, + true, false, true, highlight_fields).get(); + + ASSERT_EQ(1, result["found"].get()); + ASSERT_EQ(1, result["hits"].size()); + +// ASSERT_EQ("+91 123123123", +// result["hits"][0]["highlight"]["user"]["phone"]["snippet"].get()); +// ASSERT_EQ("100", +// result["hits"][0]["highlight"]["user"]["rank"]["snippet"].get()); + ASSERT_EQ("user_a", + result["hits"][0]["highlight"]["user_name"]["snippet"].get()); + + highlight_fields = "user.*"; + // user.* matches user.rank and user.phone + result = coll->search("user_a", {"user_name"}, "", {}, {}, {0}, + 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD, dummy_include_exclude, dummy_include_exclude, 10, "", + 30, 4, "", Index::TYPO_TOKENS_THRESHOLD, "", "", {}, 3, "", "", {}, UINT32_MAX, + true, false, true, highlight_fields).get(); + + ASSERT_EQ(1, result["found"].get()); + ASSERT_EQ(1, result["hits"].size()); + + ASSERT_EQ("+91 123123123", + result["hits"][0]["highlight"]["user"]["phone"]["snippet"].get()); + ASSERT_EQ("100", + result["hits"][0]["highlight"]["user"]["rank"]["snippet"].get()); + + highlight_fields = "foo*"; + // No matching field for highlight_fields + 
result = coll->search("user_a", {"user_name"}, "", {}, {}, {0}, + 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD, dummy_include_exclude, dummy_include_exclude, 10, "", + 30, 4, "", Index::TYPO_TOKENS_THRESHOLD, "", "", {}, 3, "", "", {}, UINT32_MAX, + true, false, true, highlight_fields).get(); + + ASSERT_EQ(1, result["found"].get()); + ASSERT_EQ(1, result["hits"].size()); + ASSERT_EQ(0, result["hits"][0]["highlight"].size()); +} + +TEST_F(CollectionTest, WildcardHighlightFullFields) { + nlohmann::json schema = R"({ + "name": "posts", + "enable_nested_fields": true, + "fields": [ + {"name": "user_name", "type": "string", "facet": true}, + {"name": "user.rank", "type": "int32", "facet": true}, + {"name": "user.phone", "type": "string"}, + {"name": "user.bio", "type": "string"} + ] + })"_json; + + auto op = collectionManager.create_collection(schema); + ASSERT_TRUE(op.ok()); + Collection* coll = op.get(); + + auto json = R"({ + "id": "124", + "user_name": "user_a", + "user": { + "rank": 100, + "phone": "+91 123123123" + } + })"_json; + std::string bio = "Once there was a middle-aged boy named User_a who was an avid swimmer." + "He had been swimming competitively for most of his life, and had even competed in several national competitions." + "However, despite his passion and talent for the sport, he had never quite managed to win that elusive gold medal." + "Determined to change that, User_a began training harder than ever before." + "He woke up early every morning to swim laps before work and spent his evenings at the pool as well." + "Despite the grueling schedule, he never once complained." 
+ "Instead, he reminded himself of his goal: to become a national champion."; + json["user"]["bio"] = bio; + + auto add_op = coll->add(json.dump()); + if (!add_op.ok()) { + LOG(INFO) << add_op.error(); + } + ASSERT_TRUE(add_op.ok()); + + spp::sparse_hash_set dummy_include_exclude; + std::string highlight_full_fields = "user*"; + // user* matches user_name, user.bio + auto result = coll->search("user_a", {"*"}, "", {}, {}, {0}, + 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD, dummy_include_exclude, dummy_include_exclude, 10, "", + 30, 4, highlight_full_fields).get(); + + ASSERT_EQ(1, result["found"].get()); + ASSERT_EQ(1, result["hits"].size()); + + ASSERT_EQ("a middle-aged boy named User_a who was an avid", + result["hits"][0]["highlight"]["user"]["bio"]["snippet"].get()); + + std::string highlighted_value = "Once there was a middle-aged boy named User_a who was an avid swimmer." + "He had been swimming competitively for most of his life, and had even competed in several national competitions." + "However, despite his passion and talent for the sport, he had never quite managed to win that elusive gold medal." + "Determined to change that, User_a began training harder than ever before." + "He woke up early every morning to swim laps before work and spent his evenings at the pool as well." + "Despite the grueling schedule, he never once complained." 
+ "Instead, he reminded himself of his goal: to become a national champion."; + ASSERT_EQ( highlighted_value, result["hits"][0]["highlight"]["user"]["bio"]["value"].get()); + ASSERT_EQ("user_a", + result["hits"][0]["highlight"]["user_name"]["value"].get()); + + highlight_full_fields = "user.*"; + // user.* matches user.bio + result = coll->search("user_a", {"*"}, "", {}, {}, {0}, + 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD, dummy_include_exclude, dummy_include_exclude, 10, "", + 30, 4, highlight_full_fields).get(); + + ASSERT_EQ(1, result["found"].get()); + ASSERT_EQ(1, result["hits"].size()); + + ASSERT_EQ(highlighted_value, result["hits"][0]["highlight"]["user"]["bio"]["value"].get()); + ASSERT_EQ(0, result["hits"][0]["highlight"]["user_name"].count("value")); + + highlight_full_fields = "foo*"; + // No matching field for highlight_fields + result = coll->search("user_a", {"*"}, "", {}, {}, {0}, + 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD, dummy_include_exclude, dummy_include_exclude, 10, "", + 30, 4, highlight_full_fields).get(); + + ASSERT_EQ(0, result["hits"][0]["highlight"]["user"]["bio"].count("value")); + ASSERT_EQ(0, result["hits"][0]["highlight"]["user_name"].count("value")); +} \ No newline at end of file