Support wildcard. (#852)

* Support wildcard.

* Add test case for `query_by=user.*`

* Remove duplicate.

* Add WildcardHighlightFields test.

* Add WildcardHighlightFullFields test.

* Review changes.
This commit is contained in:
Harpreet Sangar 2023-01-11 20:30:07 +05:30 committed by GitHub
parent 40561b91fa
commit d7ff4bdc0e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 372 additions and 10 deletions

View File

@@ -229,7 +229,8 @@ private:
const tsl::htrie_map<char, field>& search_schema,
std::vector<std::string>& processed_search_fields,
bool extract_only_string_fields,
bool enable_nested_fields);
bool enable_nested_fields,
const bool handle_wildcard = true);
bool is_nested_array(const nlohmann::json& obj, std::vector<std::string> path_parts, size_t part_i) const;

View File

@@ -884,13 +884,19 @@ Option<bool> Collection::extract_field_name(const std::string& field_name,
const tsl::htrie_map<char, field>& search_schema,
std::vector<std::string>& processed_search_fields,
const bool extract_only_string_fields,
const bool enable_nested_fields) {
const bool enable_nested_fields,
const bool handle_wildcard) {
if(field_name == "id") {
processed_search_fields.push_back(field_name);
return Option<bool>(true);
}
auto prefix_it = search_schema.equal_prefix_range(field_name);
bool is_wildcard = field_name.find('*') != std::string::npos;
if (is_wildcard && !handle_wildcard) {
return Option<bool>(400, "Pattern `" + field_name + "` is not allowed.");
}
// If wildcard, remove *
auto prefix_it = search_schema.equal_prefix_range(field_name.substr(0, field_name.size() - is_wildcard));
bool field_found = false;
for(auto kv = prefix_it.first; kv != prefix_it.second; ++kv) {
@@ -898,7 +904,7 @@ Option<bool> Collection::extract_field_name(const std::string& field_name,
bool exact_primitive_match = exact_key_match && !kv.value().is_object();
if(extract_only_string_fields && !kv.value().is_string()) {
if(exact_primitive_match) {
if(exact_primitive_match && !is_wildcard) {
// upstream needs to be returned an error
return Option<bool>(400, "Field `" + field_name + "` should be a string or a string array.");
}
@@ -906,16 +912,19 @@ Option<bool> Collection::extract_field_name(const std::string& field_name,
continue;
}
// field_name prefix must be followed by a "." to indicate an object search
if (exact_primitive_match || (enable_nested_fields && kv.key().size() > field_name.size() &&
kv.key()[field_name.size()] == '.')) {
if (exact_primitive_match || is_wildcard ||
// field_name prefix must be followed by a "." to indicate an object search
(enable_nested_fields && kv.key().size() > field_name.size() && kv.key()[field_name.size()] == '.')) {
processed_search_fields.push_back(kv.key());
field_found = true;
}
}
if(!field_found) {
std::string error = "Could not find a field named `" + field_name + "` in the schema.";
if (is_wildcard && extract_only_string_fields && !field_found) {
std::string error = "No string or string array field found matching the pattern `" + field_name + "` in the schema.";
return Option<bool>(404, error);
} else if (!field_found) {
std::string error = is_wildcard ? "No field found matching the pattern `" : "Could not find a field named `" + field_name + "` in the schema.";
return Option<bool>(404, error);
}
@@ -1092,7 +1101,7 @@ Option<nlohmann::json> Collection::search(const std::string & raw_query,
std::vector<std::string> group_by_fields;
for(const std::string& field_name: raw_group_by_fields) {
auto field_op = extract_field_name(field_name, search_schema, group_by_fields, false, enable_nested_fields);
auto field_op = extract_field_name(field_name, search_schema, group_by_fields, false, enable_nested_fields, false);
if(!field_op.ok()) {
return Option<nlohmann::json>(404, field_op.error());
}

View File

@@ -141,6 +141,15 @@ TEST_F(CollectionGroupingTest, GroupingBasics) {
ASSERT_EQ(3, (int) res["facet_counts"][0]["counts"][0]["count"]);
ASSERT_STREQ("Omega", res["facet_counts"][0]["counts"][0]["value"].get<std::string>().c_str());
ASSERT_STREQ("<mark>Omeg</mark>a", res["facet_counts"][0]["counts"][0]["highlighted"].get<std::string>().c_str());
// Wildcard group_by is not allowed
auto error = coll_group->search("*", {}, "", {"brand"}, {}, {0}, 50, 1, FREQUENCY,
{false}, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
"", 10,
{}, {}, {"foo*"}, 2).error();
ASSERT_EQ("Pattern `foo*` is not allowed.", error);
}
TEST_F(CollectionGroupingTest, GroupingCompoundKey) {

View File

@@ -1592,6 +1592,93 @@ TEST_F(CollectionSpecificMoreTest, IncludeExcludeUnIndexedField) {
ASSERT_EQ("0", res["hits"][0]["document"]["id"].get<std::string>());
}
// Verifies wildcard support in include_fields / exclude_fields: a trailing '*'
// in a pattern selects every schema field sharing the prefix, while a pattern
// that matches nothing selects (or excludes) no fields instead of erroring.
TEST_F(CollectionSpecificMoreTest, WildcardIncludeExclude) {
// Nested fields are enabled so dotted names (user.rank, user.bio) coexist
// with the flat `username` field that shares the `user` prefix.
nlohmann::json schema = R"({
"name": "posts",
"enable_nested_fields": true,
"fields": [
{"name": "username", "type": "string", "facet": true},
{"name": "user.rank", "type": "int32", "facet": true},
{"name": "user.bio", "type": "string"},
{"name": "likes", "type": "int32"},
{"name": "content", "type": "object"}
],
"default_sorting_field": "likes"
})"_json;
auto op = collectionManager.create_collection(schema);
ASSERT_TRUE(op.ok());
Collection* coll = op.get();
std::vector<std::string> json_lines = {
R"({"id": "124","username": "user_a","user": {"rank": 100,"bio": "Hi! I'm user_a"},"likes": 5215,"content": {"title": "title 1","body": "body 1"}})",
R"({"id": "125","username": "user_b","user": {"rank": 50,"bio": "user_b here, nice to meet you!"},"likes": 5215,"content": {"title": "title 2","body": "body 2"}})"
};
for (auto const& json: json_lines){
auto add_op = coll->add(json);
if (!add_op.ok()) {
// Log the failure reason before the assertion below fails the test.
LOG(INFO) << add_op.error();
}
ASSERT_TRUE(add_op.ok());
}
// include test: user* matches username, user.bio and user.rank
auto result = coll->search("user_a", {"username"}, "", {}, {}, {0},
10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD, {"user*"}).get();
ASSERT_EQ(1, result["found"].get<size_t>());
ASSERT_EQ(1, result["hits"].size());
// Only the wildcard-selected fields survive in the returned document.
ASSERT_EQ(0, result["hits"][0]["document"].count("id"));
ASSERT_EQ(0, result["hits"][0]["document"].count("likes"));
ASSERT_EQ(0, result["hits"][0]["document"].count("content"));
ASSERT_EQ(1, result["hits"][0]["document"].count("user"));
ASSERT_EQ(1, result["hits"][0]["document"]["user"].count("bio"));
ASSERT_EQ(1, result["hits"][0]["document"]["user"].count("rank"));
ASSERT_EQ(1, result["hits"][0]["document"].count("username"));
// Empty include set: only exclude_fields is exercised below.
spp::sparse_hash_set<std::string> include_fields;
// exclude test: user.* matches user.rank and user.bio
result = coll->search("user_a", {"username"}, "", {}, {}, {0},
10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD, include_fields, {"user.*"}).get();
ASSERT_EQ(1, result["found"].get<size_t>());
ASSERT_EQ(1, result["hits"].size());
// `username` remains: `user.*` (with the dot) does not cover it, unlike
// `user*` above — everything but the excluded `user` object is returned.
ASSERT_EQ(1, result["hits"][0]["document"].count("id"));
ASSERT_EQ(1, result["hits"][0]["document"].count("likes"));
ASSERT_EQ(1, result["hits"][0]["document"].count("content"));
ASSERT_EQ(1, result["hits"][0]["document"]["content"].count("title"));
ASSERT_EQ(1, result["hits"][0]["document"]["content"].count("body"));
ASSERT_EQ(0, result["hits"][0]["document"].count("user"));
ASSERT_EQ(1, result["hits"][0]["document"].count("username"));
// No matching field for include_fields/exclude_fields
// Non-matching include pattern selects nothing, so the document is empty.
result = coll->search("user_a", {"username"}, "", {}, {}, {0},
10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD, {"foo.*"}).get();
ASSERT_EQ(1, result["found"].get<size_t>());
ASSERT_EQ(1, result["hits"].size());
ASSERT_EQ(0, result["hits"][0]["document"].size());
// Non-matching exclude pattern excludes nothing: full document is returned.
result = coll->search("user_a", {"username"}, "", {}, {}, {0},
10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD, include_fields, {"foo.*"}).get();
ASSERT_EQ(1, result["found"].get<size_t>());
ASSERT_EQ(1, result["hits"].size());
ASSERT_EQ(1, result["hits"][0]["document"].count("id"));
ASSERT_EQ(1, result["hits"][0]["document"].count("likes"));
ASSERT_EQ(1, result["hits"][0]["document"].count("content"));
ASSERT_EQ(1, result["hits"][0]["document"]["content"].count("title"));
ASSERT_EQ(1, result["hits"][0]["document"]["content"].count("body"));
ASSERT_EQ(1, result["hits"][0]["document"].count("user"));
ASSERT_EQ(1, result["hits"][0]["document"]["user"].count("bio"));
ASSERT_EQ(1, result["hits"][0]["document"]["user"].count("rank"));
ASSERT_EQ(1, result["hits"][0]["document"].count("username"));
}
TEST_F(CollectionSpecificMoreTest, PhraseMatchRepeatingTokens) {
nlohmann::json schema = R"({
"name": "coll1",

View File

@@ -4350,3 +4350,259 @@ TEST_F(CollectionTest, QueryParsingForPhraseSearch) {
collectionManager.drop_collection("coll1");
}
// Wildcard patterns in `query_by`: a trailing '*' expands to every string /
// string-array field whose name begins with the given prefix (`*` alone means
// all string fields). Non-string fields such as user.rank are never pulled in,
// and a prefix with no string-field match yields a descriptive error.
TEST_F(CollectionTest, WildcardQueryBy) {
nlohmann::json schema = R"({
"name": "posts",
"enable_nested_fields": true,
"fields": [
{"name": "username", "type": "string", "facet": true},
{"name": "user.rank", "type": "int32", "facet": true},
{"name": "user.bio", "type": "string"},
{"name": "likes", "type": "int32"},
{"name": "content", "type": "object"}
],
"default_sorting_field": "likes"
})"_json;
auto create_op = collectionManager.create_collection(schema);
ASSERT_TRUE(create_op.ok());
Collection* posts = create_op.get();
std::vector<std::string> docs = {
R"({"id": "124","username": "user_a","user": {"rank": 100,"bio": "Hi! I'm user_a"},"likes": 5215,"content": {"title": "title 1","body": "body 1 user_a"}})",
R"({"id": "125","username": "user_b","user": {"rank": 50,"bio": "user_b here, nice to meet you!"},"likes": 5215,"content": {"title": "title 2","body": "body 2 user_b"}})"
};
for (const std::string& doc_json : docs) {
const auto insert_op = posts->add(doc_json);
if (!insert_op.ok()) {
// Surface the reason before the assertion fails the test.
LOG(INFO) << insert_op.error();
}
ASSERT_TRUE(insert_op.ok());
}
// `*` expands to username, user.bio, content.title and content.body.
auto res = posts->search("user_a", {"*"}, "", {}, {}, {0}).get();
ASSERT_EQ(1, res["found"].get<size_t>());
ASSERT_EQ(1, res["hits"].size());
ASSERT_EQ("Hi! I'm <mark>user_a</mark>", res["hits"][0]["highlight"]["user"]["bio"]["snippet"].get<std::string>());
ASSERT_EQ("<mark>user_a</mark>", res["hits"][0]["highlight"]["username"]["snippet"].get<std::string>());
// Disabled pending nested content highlight:
// ASSERT_EQ("body 1 <mark>user_a</mark>",
//           res["hits"][0]["highlight"]["content"]["body"]["snippet"].get<std::string>());
// `user*` expands to username and user.bio.
res = posts->search("user_a", {"user*"}, "", {}, {}, {0}).get();
ASSERT_EQ(1, res["found"].get<size_t>());
ASSERT_EQ(1, res["hits"].size());
ASSERT_EQ("Hi! I'm <mark>user_a</mark>", res["hits"][0]["highlight"]["user"]["bio"]["snippet"].get<std::string>());
ASSERT_EQ("<mark>user_a</mark>", res["hits"][0]["highlight"]["username"]["snippet"].get<std::string>());
// `user.*` (with the dot) expands to user.bio only.
res = posts->search("user_a", {"user.*"}, "", {}, {}, {0}).get();
ASSERT_EQ(1, res["found"].get<size_t>());
ASSERT_EQ(1, res["hits"].size());
ASSERT_EQ("Hi! I'm <mark>user_a</mark>", res["hits"][0]["highlight"]["user"]["bio"]["snippet"].get<std::string>());
// user.rank is an int32, so the wildcard never makes it queryable.
res = posts->search("100", {"user*"}, "", {}, {}, {0}).get();
ASSERT_EQ(0, res["found"].get<size_t>());
ASSERT_EQ(0, res["hits"].size());
// A pattern matching no string field is rejected with a clear error message.
const auto err = posts->search("user_a", {"foo*"}, "", {}, {}, {0}).error();
ASSERT_EQ("No string or string array field found matching the pattern `foo*` in the schema.", err);
}
// Verifies wildcard support in the `highlight_fields` search parameter:
// `user*` / `user.*` expand to the matching schema fields, and a pattern with
// no match (`foo*`) produces an empty highlight object for the hit.
TEST_F(CollectionTest, WildcardHighlightFields) {
nlohmann::json schema = R"({
"name": "posts",
"enable_nested_fields": true,
"fields": [
{"name": "user_name", "type": "string", "facet": true},
{"name": "user", "type": "object"}
]
})"_json;
auto op = collectionManager.create_collection(schema);
ASSERT_TRUE(op.ok());
Collection* coll = op.get();
auto add_op = coll->add(R"({"id": "124","user_name": "user_a","user": {"rank": 100,"phone": "+91 123123123"}})");
if (!add_op.ok()) {
// Log the failure reason before the assertion below fails the test.
LOG(INFO) << add_op.error();
}
ASSERT_TRUE(add_op.ok());
// Placeholder for the include/exclude-fields parameters (not under test).
spp::sparse_hash_set<std::string> dummy_include_exclude;
std::string highlight_fields = "user*";
// user* matches user_name, user.rank and user.phone
auto result = coll->search("+91", {"user"}, "", {}, {}, {0},
10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD, dummy_include_exclude, dummy_include_exclude, 10, "",
30, 4, "", Index::TYPO_TOKENS_THRESHOLD, "", "", {}, 3, "<mark>", "</mark>", {}, UINT32_MAX,
true, false, true, highlight_fields).get();
ASSERT_EQ(1, result["found"].get<size_t>());
ASSERT_EQ(1, result["hits"].size());
// NOTE(review): nested-object highlight assertions are disabled — presumably
// pending nested-field highlight support; confirm before re-enabling.
// ASSERT_EQ("+<mark>91</mark> 123123123",
// result["hits"][0]["highlight"]["user"]["phone"]["snippet"].get<std::string>());
// ASSERT_EQ("100",
// result["hits"][0]["highlight"]["user"]["rank"]["snippet"].get<std::string>());
// user_name did not match the query "+91", so its snippet has no <mark> tags.
ASSERT_EQ("user_a",
result["hits"][0]["highlight"]["user_name"]["snippet"].get<std::string>());
highlight_fields = "user.*";
// user.* matches user.rank and user.phone
result = coll->search("+91", {"user"}, "", {}, {}, {0},
10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD, dummy_include_exclude, dummy_include_exclude, 10, "",
30, 4, "", Index::TYPO_TOKENS_THRESHOLD, "", "", {}, 3, "<mark>", "</mark>", {}, UINT32_MAX,
true, false, true, highlight_fields).get();
ASSERT_EQ(1, result["found"].get<size_t>());
ASSERT_EQ(1, result["hits"].size());
ASSERT_EQ("+<mark>91</mark> 123123123",
result["hits"][0]["highlight"]["user"]["phone"]["snippet"].get<std::string>());
// ASSERT_EQ("100",
// result["hits"][0]["highlight"]["user"]["rank"]["snippet"].get<std::string>());
highlight_fields = "user*";
// user* matches user_name, user.rank and user.phone
// Same pattern as above, but now the query matches user_name instead.
result = coll->search("user_a", {"user_name"}, "", {}, {}, {0},
10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD, dummy_include_exclude, dummy_include_exclude, 10, "",
30, 4, "", Index::TYPO_TOKENS_THRESHOLD, "", "", {}, 3, "<mark>", "</mark>", {}, UINT32_MAX,
true, false, true, highlight_fields).get();
ASSERT_EQ(1, result["found"].get<size_t>());
ASSERT_EQ(1, result["hits"].size());
// ASSERT_EQ("+91 123123123",
// result["hits"][0]["highlight"]["user"]["phone"]["snippet"].get<std::string>());
// ASSERT_EQ("100",
// result["hits"][0]["highlight"]["user"]["rank"]["snippet"].get<std::string>());
ASSERT_EQ("<mark>user_a</mark>",
result["hits"][0]["highlight"]["user_name"]["snippet"].get<std::string>());
highlight_fields = "user.*";
// user.* matches user.rank and user.phone
// Non-matching-query fields still appear in highlight, without <mark> tags.
result = coll->search("user_a", {"user_name"}, "", {}, {}, {0},
10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD, dummy_include_exclude, dummy_include_exclude, 10, "",
30, 4, "", Index::TYPO_TOKENS_THRESHOLD, "", "", {}, 3, "<mark>", "</mark>", {}, UINT32_MAX,
true, false, true, highlight_fields).get();
ASSERT_EQ(1, result["found"].get<size_t>());
ASSERT_EQ(1, result["hits"].size());
ASSERT_EQ("+91 123123123",
result["hits"][0]["highlight"]["user"]["phone"]["snippet"].get<std::string>());
ASSERT_EQ("100",
result["hits"][0]["highlight"]["user"]["rank"]["snippet"].get<std::string>());
highlight_fields = "foo*";
// No matching field for highlight_fields
// A pattern matching nothing yields an empty highlight object (no error).
result = coll->search("user_a", {"user_name"}, "", {}, {}, {0},
10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD, dummy_include_exclude, dummy_include_exclude, 10, "",
30, 4, "", Index::TYPO_TOKENS_THRESHOLD, "", "", {}, 3, "<mark>", "</mark>", {}, UINT32_MAX,
true, false, true, highlight_fields).get();
ASSERT_EQ(1, result["found"].get<size_t>());
ASSERT_EQ(1, result["hits"].size());
ASSERT_EQ(0, result["hits"][0]["highlight"].size());
}
// Verifies wildcard support in `highlight_full_fields`: fields matched by the
// pattern carry a full highlighted "value" alongside the truncated "snippet";
// fields not matched (or a pattern matching nothing) omit "value" entirely.
TEST_F(CollectionTest, WildcardHighlightFullFields) {
nlohmann::json schema = R"({
"name": "posts",
"enable_nested_fields": true,
"fields": [
{"name": "user_name", "type": "string", "facet": true},
{"name": "user.rank", "type": "int32", "facet": true},
{"name": "user.phone", "type": "string"},
{"name": "user.bio", "type": "string"}
]
})"_json;
auto op = collectionManager.create_collection(schema);
ASSERT_TRUE(op.ok());
Collection* coll = op.get();
auto json = R"({
"id": "124",
"user_name": "user_a",
"user": {
"rank": 100,
"phone": "+91 123123123"
}
})"_json;
// A long bio (with two "User_a" occurrences) so that "snippet" must truncate
// while "value" carries the whole highlighted text.
std::string bio = "Once there was a middle-aged boy named User_a who was an avid swimmer."
"He had been swimming competitively for most of his life, and had even competed in several national competitions."
"However, despite his passion and talent for the sport, he had never quite managed to win that elusive gold medal."
"Determined to change that, User_a began training harder than ever before."
"He woke up early every morning to swim laps before work and spent his evenings at the pool as well."
"Despite the grueling schedule, he never once complained."
"Instead, he reminded himself of his goal: to become a national champion.";
json["user"]["bio"] = bio;
auto add_op = coll->add(json.dump());
if (!add_op.ok()) {
// Log the failure reason before the assertion below fails the test.
LOG(INFO) << add_op.error();
}
ASSERT_TRUE(add_op.ok());
// Placeholder for the include/exclude-fields parameters (not under test).
spp::sparse_hash_set<std::string> dummy_include_exclude;
std::string highlight_full_fields = "user*";
// user* matches user_name, user.bio
auto result = coll->search("user_a", {"*"}, "", {}, {}, {0},
10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD, dummy_include_exclude, dummy_include_exclude, 10, "",
30, 4, highlight_full_fields).get();
ASSERT_EQ(1, result["found"].get<size_t>());
ASSERT_EQ(1, result["hits"].size());
// "snippet" is always truncated, regardless of highlight_full_fields.
ASSERT_EQ("a middle-aged boy named <mark>User_a</mark> who was an avid",
result["hits"][0]["highlight"]["user"]["bio"]["snippet"].get<std::string>());
// Expected full "value": the entire bio with every match wrapped in <mark>.
std::string highlighted_value = "Once there was a middle-aged boy named <mark>User_a</mark> who was an avid swimmer."
"He had been swimming competitively for most of his life, and had even competed in several national competitions."
"However, despite his passion and talent for the sport, he had never quite managed to win that elusive gold medal."
"Determined to change that, <mark>User_a</mark> began training harder than ever before."
"He woke up early every morning to swim laps before work and spent his evenings at the pool as well."
"Despite the grueling schedule, he never once complained."
"Instead, he reminded himself of his goal: to become a national champion.";
ASSERT_EQ( highlighted_value, result["hits"][0]["highlight"]["user"]["bio"]["value"].get<std::string>());
ASSERT_EQ("<mark>user_a</mark>",
result["hits"][0]["highlight"]["user_name"]["value"].get<std::string>());
highlight_full_fields = "user.*";
// user.* matches user.bio
// user_name is no longer covered (no dot), so it gets no full "value".
result = coll->search("user_a", {"*"}, "", {}, {}, {0},
10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD, dummy_include_exclude, dummy_include_exclude, 10, "",
30, 4, highlight_full_fields).get();
ASSERT_EQ(1, result["found"].get<size_t>());
ASSERT_EQ(1, result["hits"].size());
ASSERT_EQ(highlighted_value, result["hits"][0]["highlight"]["user"]["bio"]["value"].get<std::string>());
ASSERT_EQ(0, result["hits"][0]["highlight"]["user_name"].count("value"));
highlight_full_fields = "foo*";
// No matching field for highlight_fields
// A pattern matching nothing means no field carries a full "value".
result = coll->search("user_a", {"*"}, "", {}, {}, {0},
10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD, dummy_include_exclude, dummy_include_exclude, 10, "",
30, 4, highlight_full_fields).get();
ASSERT_EQ(0, result["hits"][0]["highlight"]["user"]["bio"].count("value"));
ASSERT_EQ(0, result["hits"][0]["highlight"]["user_name"].count("value"));
}