Support wildcard. (#852)

* Support wildcard.

* Add test case for `query_by=user.*`

* Remove duplicate.

* Add WildcardHighlightFields test.

* Add WildcardHighlightFullFields test.

* Review changes.
This commit is contained in:
Harpreet Sangar 2023-01-11 20:30:07 +05:30 committed by GitHub
parent 40561b91fa
commit d7ff4bdc0e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 372 additions and 10 deletions

View File

@@ -229,7 +229,8 @@ private:
const tsl::htrie_map<char, field>& search_schema,
std::vector<std::string>& processed_search_fields,
bool extract_only_string_fields,
bool enable_nested_fields);
bool enable_nested_fields,
const bool handle_wildcard = true);
bool is_nested_array(const nlohmann::json& obj, std::vector<std::string> path_parts, size_t part_i) const;

View File

@@ -884,13 +884,19 @@ Option<bool> Collection::extract_field_name(const std::string& field_name,
const tsl::htrie_map<char, field>& search_schema,
std::vector<std::string>& processed_search_fields,
const bool extract_only_string_fields,
const bool enable_nested_fields) {
const bool enable_nested_fields,
const bool handle_wildcard) {
if(field_name == "id") {
processed_search_fields.push_back(field_name);
return Option<bool>(true);
}
auto prefix_it = search_schema.equal_prefix_range(field_name);
bool is_wildcard = field_name.find('*') != std::string::npos;
if (is_wildcard && !handle_wildcard) {
return Option<bool>(400, "Pattern `" + field_name + "` is not allowed.");
}
// If wildcard, remove *
auto prefix_it = search_schema.equal_prefix_range(field_name.substr(0, field_name.size() - is_wildcard));
bool field_found = false;
for(auto kv = prefix_it.first; kv != prefix_it.second; ++kv) {
@@ -898,7 +904,7 @@ Option<bool> Collection::extract_field_name(const std::string& field_name,
bool exact_primitive_match = exact_key_match && !kv.value().is_object();
if(extract_only_string_fields && !kv.value().is_string()) {
if(exact_primitive_match) {
if(exact_primitive_match && !is_wildcard) {
// upstream needs to be returned an error
return Option<bool>(400, "Field `" + field_name + "` should be a string or a string array.");
}
@@ -906,16 +912,19 @@ Option<bool> Collection::extract_field_name(const std::string& field_name,
continue;
}
// field_name prefix must be followed by a "." to indicate an object search
if (exact_primitive_match || (enable_nested_fields && kv.key().size() > field_name.size() &&
kv.key()[field_name.size()] == '.')) {
if (exact_primitive_match || is_wildcard ||
// field_name prefix must be followed by a "." to indicate an object search
(enable_nested_fields && kv.key().size() > field_name.size() && kv.key()[field_name.size()] == '.')) {
processed_search_fields.push_back(kv.key());
field_found = true;
}
}
if(!field_found) {
std::string error = "Could not find a field named `" + field_name + "` in the schema.";
if (is_wildcard && extract_only_string_fields && !field_found) {
std::string error = "No string or string array field found matching the pattern `" + field_name + "` in the schema.";
return Option<bool>(404, error);
} else if (!field_found) {
std::string error = is_wildcard ? "No field found matching the pattern `" : "Could not find a field named `" + field_name + "` in the schema.";
return Option<bool>(404, error);
}
@@ -1092,7 +1101,7 @@ Option<nlohmann::json> Collection::search(const std::string & raw_query,
std::vector<std::string> group_by_fields;
for(const std::string& field_name: raw_group_by_fields) {
auto field_op = extract_field_name(field_name, search_schema, group_by_fields, false, enable_nested_fields);
auto field_op = extract_field_name(field_name, search_schema, group_by_fields, false, enable_nested_fields, false);
if(!field_op.ok()) {
return Option<nlohmann::json>(404, field_op.error());
}

View File

@@ -141,6 +141,15 @@ TEST_F(CollectionGroupingTest, GroupingBasics) {
ASSERT_EQ(3, (int) res["facet_counts"][0]["counts"][0]["count"]);
ASSERT_STREQ("Omega", res["facet_counts"][0]["counts"][0]["value"].get<std::string>().c_str());
ASSERT_STREQ("<mark>Omeg</mark>a", res["facet_counts"][0]["counts"][0]["highlighted"].get<std::string>().c_str());
// Wildcard group_by is not allowed
auto error = coll_group->search("*", {}, "", {"brand"}, {}, {0}, 50, 1, FREQUENCY,
{false}, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
"", 10,
{}, {}, {"foo*"}, 2).error();
ASSERT_EQ("Pattern `foo*` is not allowed.", error);
}
TEST_F(CollectionGroupingTest, GroupingCompoundKey) {

View File

@@ -1592,6 +1592,93 @@ TEST_F(CollectionSpecificMoreTest, IncludeExcludeUnIndexedField) {
ASSERT_EQ("0", res["hits"][0]["document"]["id"].get<std::string>());
}
// Verifies wildcard support in include_fields / exclude_fields: a trailing '*'
// in a pattern selects every schema field sharing the prefix, while a pattern
// that matches nothing selects (or excludes) no fields instead of erroring.
TEST_F(CollectionSpecificMoreTest, WildcardIncludeExclude) {
// Nested fields are enabled so dotted names (user.rank, user.bio) coexist
// with the flat `username` field that shares the `user` prefix.
nlohmann::json schema = R"({
"name": "posts",
"enable_nested_fields": true,
"fields": [
{"name": "username", "type": "string", "facet": true},
{"name": "user.rank", "type": "int32", "facet": true},
{"name": "user.bio", "type": "string"},
{"name": "likes", "type": "int32"},
{"name": "content", "type": "object"}
],
"default_sorting_field": "likes"
})"_json;
auto op = collectionManager.create_collection(schema);
ASSERT_TRUE(op.ok());
Collection* coll = op.get();
std::vector<std::string> json_lines = {
R"({"id": "124","username": "user_a","user": {"rank": 100,"bio": "Hi! I'm user_a"},"likes": 5215,"content": {"title": "title 1","body": "body 1"}})",
R"({"id": "125","username": "user_b","user": {"rank": 50,"bio": "user_b here, nice to meet you!"},"likes": 5215,"content": {"title": "title 2","body": "body 2"}})"
};
for (auto const& json: json_lines){
auto add_op = coll->add(json);
if (!add_op.ok()) {
// Log the failure reason before the assertion below fails the test.
LOG(INFO) << add_op.error();
}
ASSERT_TRUE(add_op.ok());
}
// include test: user* matches username, user.bio and user.rank
auto result = coll->search("user_a", {"username"}, "", {}, {}, {0},
10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD, {"user*"}).get();
ASSERT_EQ(1, result["found"].get<size_t>());
ASSERT_EQ(1, result["hits"].size());
// Only the wildcard-selected fields survive in the returned document.
ASSERT_EQ(0, result["hits"][0]["document"].count("id"));
ASSERT_EQ(0, result["hits"][0]["document"].count("likes"));
ASSERT_EQ(0, result["hits"][0]["document"].count("content"));
ASSERT_EQ(1, result["hits"][0]["document"].count("user"));
ASSERT_EQ(1, result["hits"][0]["document"]["user"].count("bio"));
ASSERT_EQ(1, result["hits"][0]["document"]["user"].count("rank"));
ASSERT_EQ(1, result["hits"][0]["document"].count("username"));
// Empty include set: only exclude_fields is exercised below.
spp::sparse_hash_set<std::string> include_fields;
// exclude test: user.* matches user.rank and user.bio
result = coll->search("user_a", {"username"}, "", {}, {}, {0},
10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD, include_fields, {"user.*"}).get();
ASSERT_EQ(1, result["found"].get<size_t>());
ASSERT_EQ(1, result["hits"].size());
// `username` remains: `user.*` (with the dot) does not cover it, unlike
// `user*` above — everything but the excluded `user` object is returned.
ASSERT_EQ(1, result["hits"][0]["document"].count("id"));
ASSERT_EQ(1, result["hits"][0]["document"].count("likes"));
ASSERT_EQ(1, result["hits"][0]["document"].count("content"));
ASSERT_EQ(1, result["hits"][0]["document"]["content"].count("title"));
ASSERT_EQ(1, result["hits"][0]["document"]["content"].count("body"));
ASSERT_EQ(0, result["hits"][0]["document"].count("user"));
ASSERT_EQ(1, result["hits"][0]["document"].count("username"));
// No matching field for include_fields/exclude_fields
// Non-matching include pattern selects nothing, so the document is empty.
result = coll->search("user_a", {"username"}, "", {}, {}, {0},
10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD, {"foo.*"}).get();
ASSERT_EQ(1, result["found"].get<size_t>());
ASSERT_EQ(1, result["hits"].size());
ASSERT_EQ(0, result["hits"][0]["document"].size());
// Non-matching exclude pattern excludes nothing: full document is returned.
result = coll->search("user_a", {"username"}, "", {}, {}, {0},
10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD, include_fields, {"foo.*"}).get();
ASSERT_EQ(1, result["found"].get<size_t>());
ASSERT_EQ(1, result["hits"].size());
ASSERT_EQ(1, result["hits"][0]["document"].count("id"));
ASSERT_EQ(1, result["hits"][0]["document"].count("likes"));
ASSERT_EQ(1, result["hits"][0]["document"].count("content"));
ASSERT_EQ(1, result["hits"][0]["document"]["content"].count("title"));
ASSERT_EQ(1, result["hits"][0]["document"]["content"].count("body"));
ASSERT_EQ(1, result["hits"][0]["document"].count("user"));
ASSERT_EQ(1, result["hits"][0]["document"]["user"].count("bio"));
ASSERT_EQ(1, result["hits"][0]["document"]["user"].count("rank"));
ASSERT_EQ(1, result["hits"][0]["document"].count("username"));
}
TEST_F(CollectionSpecificMoreTest, PhraseMatchRepeatingTokens) {
nlohmann::json schema = R"({
"name": "coll1",

View File

@@ -4350,3 +4350,259 @@ TEST_F(CollectionTest, QueryParsingForPhraseSearch) {
collectionManager.drop_collection("coll1");
}
// Wildcard patterns in `query_by`: a trailing '*' expands to every string /
// string-array field whose name begins with the given prefix (`*` alone means
// all string fields). Non-string fields such as user.rank are never pulled in,
// and a prefix with no string-field match yields a descriptive error.
TEST_F(CollectionTest, WildcardQueryBy) {
nlohmann::json schema = R"({
"name": "posts",
"enable_nested_fields": true,
"fields": [
{"name": "username", "type": "string", "facet": true},
{"name": "user.rank", "type": "int32", "facet": true},
{"name": "user.bio", "type": "string"},
{"name": "likes", "type": "int32"},
{"name": "content", "type": "object"}
],
"default_sorting_field": "likes"
})"_json;
auto create_op = collectionManager.create_collection(schema);
ASSERT_TRUE(create_op.ok());
Collection* posts = create_op.get();
std::vector<std::string> docs = {
R"({"id": "124","username": "user_a","user": {"rank": 100,"bio": "Hi! I'm user_a"},"likes": 5215,"content": {"title": "title 1","body": "body 1 user_a"}})",
R"({"id": "125","username": "user_b","user": {"rank": 50,"bio": "user_b here, nice to meet you!"},"likes": 5215,"content": {"title": "title 2","body": "body 2 user_b"}})"
};
for (const std::string& doc_json : docs) {
const auto insert_op = posts->add(doc_json);
if (!insert_op.ok()) {
// Surface the reason before the assertion fails the test.
LOG(INFO) << insert_op.error();
}
ASSERT_TRUE(insert_op.ok());
}
// `*` expands to username, user.bio, content.title and content.body.
auto res = posts->search("user_a", {"*"}, "", {}, {}, {0}).get();
ASSERT_EQ(1, res["found"].get<size_t>());
ASSERT_EQ(1, res["hits"].size());
ASSERT_EQ("Hi! I'm <mark>user_a</mark>", res["hits"][0]["highlight"]["user"]["bio"]["snippet"].get<std::string>());
ASSERT_EQ("<mark>user_a</mark>", res["hits"][0]["highlight"]["username"]["snippet"].get<std::string>());
// Disabled pending nested content highlight:
// ASSERT_EQ("body 1 <mark>user_a</mark>",
//           res["hits"][0]["highlight"]["content"]["body"]["snippet"].get<std::string>());
// `user*` expands to username and user.bio.
res = posts->search("user_a", {"user*"}, "", {}, {}, {0}).get();
ASSERT_EQ(1, res["found"].get<size_t>());
ASSERT_EQ(1, res["hits"].size());
ASSERT_EQ("Hi! I'm <mark>user_a</mark>", res["hits"][0]["highlight"]["user"]["bio"]["snippet"].get<std::string>());
ASSERT_EQ("<mark>user_a</mark>", res["hits"][0]["highlight"]["username"]["snippet"].get<std::string>());
// `user.*` (with the dot) expands to user.bio only.
res = posts->search("user_a", {"user.*"}, "", {}, {}, {0}).get();
ASSERT_EQ(1, res["found"].get<size_t>());
ASSERT_EQ(1, res["hits"].size());
ASSERT_EQ("Hi! I'm <mark>user_a</mark>", res["hits"][0]["highlight"]["user"]["bio"]["snippet"].get<std::string>());
// user.rank is an int32, so the wildcard never makes it queryable.
res = posts->search("100", {"user*"}, "", {}, {}, {0}).get();
ASSERT_EQ(0, res["found"].get<size_t>());
ASSERT_EQ(0, res["hits"].size());
// A pattern matching no string field is rejected with a clear error message.
const auto err = posts->search("user_a", {"foo*"}, "", {}, {}, {0}).error();
ASSERT_EQ("No string or string array field found matching the pattern `foo*` in the schema.", err);
}
// Verifies wildcard support in the `highlight_fields` search parameter:
// `user*` / `user.*` expand to the matching schema fields, and a pattern with
// no match (`foo*`) produces an empty highlight object for the hit.
TEST_F(CollectionTest, WildcardHighlightFields) {
nlohmann::json schema = R"({
"name": "posts",
"enable_nested_fields": true,
"fields": [
{"name": "user_name", "type": "string", "facet": true},
{"name": "user", "type": "object"}
]
})"_json;
auto op = collectionManager.create_collection(schema);
ASSERT_TRUE(op.ok());
Collection* coll = op.get();
auto add_op = coll->add(R"({"id": "124","user_name": "user_a","user": {"rank": 100,"phone": "+91 123123123"}})");
if (!add_op.ok()) {
// Log the failure reason before the assertion below fails the test.
LOG(INFO) << add_op.error();
}
ASSERT_TRUE(add_op.ok());
// Placeholder for the include/exclude-fields parameters (not under test).
spp::sparse_hash_set<std::string> dummy_include_exclude;
std::string highlight_fields = "user*";
// user* matches user_name, user.rank and user.phone
auto result = coll->search("+91", {"user"}, "", {}, {}, {0},
10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD, dummy_include_exclude, dummy_include_exclude, 10, "",
30, 4, "", Index::TYPO_TOKENS_THRESHOLD, "", "", {}, 3, "<mark>", "</mark>", {}, UINT32_MAX,
true, false, true, highlight_fields).get();
ASSERT_EQ(1, result["found"].get<size_t>());
ASSERT_EQ(1, result["hits"].size());
// NOTE(review): nested-object highlight assertions are disabled — presumably
// pending nested-field highlight support; confirm before re-enabling.
// ASSERT_EQ("+<mark>91</mark> 123123123",
// result["hits"][0]["highlight"]["user"]["phone"]["snippet"].get<std::string>());
// ASSERT_EQ("100",
// result["hits"][0]["highlight"]["user"]["rank"]["snippet"].get<std::string>());
// user_name did not match the query "+91", so its snippet has no <mark> tags.
ASSERT_EQ("user_a",
result["hits"][0]["highlight"]["user_name"]["snippet"].get<std::string>());
highlight_fields = "user.*";
// user.* matches user.rank and user.phone
result = coll->search("+91", {"user"}, "", {}, {}, {0},
10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD, dummy_include_exclude, dummy_include_exclude, 10, "",
30, 4, "", Index::TYPO_TOKENS_THRESHOLD, "", "", {}, 3, "<mark>", "</mark>", {}, UINT32_MAX,
true, false, true, highlight_fields).get();
ASSERT_EQ(1, result["found"].get<size_t>());
ASSERT_EQ(1, result["hits"].size());
ASSERT_EQ("+<mark>91</mark> 123123123",
result["hits"][0]["highlight"]["user"]["phone"]["snippet"].get<std::string>());
// ASSERT_EQ("100",
// result["hits"][0]["highlight"]["user"]["rank"]["snippet"].get<std::string>());
highlight_fields = "user*";
// user* matches user_name, user.rank and user.phone
// Same pattern as above, but now the query matches user_name instead.
result = coll->search("user_a", {"user_name"}, "", {}, {}, {0},
10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD, dummy_include_exclude, dummy_include_exclude, 10, "",
30, 4, "", Index::TYPO_TOKENS_THRESHOLD, "", "", {}, 3, "<mark>", "</mark>", {}, UINT32_MAX,
true, false, true, highlight_fields).get();
ASSERT_EQ(1, result["found"].get<size_t>());
ASSERT_EQ(1, result["hits"].size());
// ASSERT_EQ("+91 123123123",
// result["hits"][0]["highlight"]["user"]["phone"]["snippet"].get<std::string>());
// ASSERT_EQ("100",
// result["hits"][0]["highlight"]["user"]["rank"]["snippet"].get<std::string>());
ASSERT_EQ("<mark>user_a</mark>",
result["hits"][0]["highlight"]["user_name"]["snippet"].get<std::string>());
highlight_fields = "user.*";
// user.* matches user.rank and user.phone
// Non-matching-query fields still appear in highlight, without <mark> tags.
result = coll->search("user_a", {"user_name"}, "", {}, {}, {0},
10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD, dummy_include_exclude, dummy_include_exclude, 10, "",
30, 4, "", Index::TYPO_TOKENS_THRESHOLD, "", "", {}, 3, "<mark>", "</mark>", {}, UINT32_MAX,
true, false, true, highlight_fields).get();
ASSERT_EQ(1, result["found"].get<size_t>());
ASSERT_EQ(1, result["hits"].size());
ASSERT_EQ("+91 123123123",
result["hits"][0]["highlight"]["user"]["phone"]["snippet"].get<std::string>());
ASSERT_EQ("100",
result["hits"][0]["highlight"]["user"]["rank"]["snippet"].get<std::string>());
highlight_fields = "foo*";
// No matching field for highlight_fields
// A pattern matching nothing yields an empty highlight object (no error).
result = coll->search("user_a", {"user_name"}, "", {}, {}, {0},
10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD, dummy_include_exclude, dummy_include_exclude, 10, "",
30, 4, "", Index::TYPO_TOKENS_THRESHOLD, "", "", {}, 3, "<mark>", "</mark>", {}, UINT32_MAX,
true, false, true, highlight_fields).get();
ASSERT_EQ(1, result["found"].get<size_t>());
ASSERT_EQ(1, result["hits"].size());
ASSERT_EQ(0, result["hits"][0]["highlight"].size());
}
// Verifies wildcard support in `highlight_full_fields`: fields matched by the
// pattern carry a full highlighted "value" alongside the truncated "snippet";
// fields not matched (or a pattern matching nothing) omit "value" entirely.
TEST_F(CollectionTest, WildcardHighlightFullFields) {
nlohmann::json schema = R"({
"name": "posts",
"enable_nested_fields": true,
"fields": [
{"name": "user_name", "type": "string", "facet": true},
{"name": "user.rank", "type": "int32", "facet": true},
{"name": "user.phone", "type": "string"},
{"name": "user.bio", "type": "string"}
]
})"_json;
auto op = collectionManager.create_collection(schema);
ASSERT_TRUE(op.ok());
Collection* coll = op.get();
auto json = R"({
"id": "124",
"user_name": "user_a",
"user": {
"rank": 100,
"phone": "+91 123123123"
}
})"_json;
// A long bio (with two "User_a" occurrences) so that "snippet" must truncate
// while "value" carries the whole highlighted text.
std::string bio = "Once there was a middle-aged boy named User_a who was an avid swimmer."
"He had been swimming competitively for most of his life, and had even competed in several national competitions."
"However, despite his passion and talent for the sport, he had never quite managed to win that elusive gold medal."
"Determined to change that, User_a began training harder than ever before."
"He woke up early every morning to swim laps before work and spent his evenings at the pool as well."
"Despite the grueling schedule, he never once complained."
"Instead, he reminded himself of his goal: to become a national champion.";
json["user"]["bio"] = bio;
auto add_op = coll->add(json.dump());
if (!add_op.ok()) {
// Log the failure reason before the assertion below fails the test.
LOG(INFO) << add_op.error();
}
ASSERT_TRUE(add_op.ok());
// Placeholder for the include/exclude-fields parameters (not under test).
spp::sparse_hash_set<std::string> dummy_include_exclude;
std::string highlight_full_fields = "user*";
// user* matches user_name, user.bio
auto result = coll->search("user_a", {"*"}, "", {}, {}, {0},
10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD, dummy_include_exclude, dummy_include_exclude, 10, "",
30, 4, highlight_full_fields).get();
ASSERT_EQ(1, result["found"].get<size_t>());
ASSERT_EQ(1, result["hits"].size());
// "snippet" is always truncated, regardless of highlight_full_fields.
ASSERT_EQ("a middle-aged boy named <mark>User_a</mark> who was an avid",
result["hits"][0]["highlight"]["user"]["bio"]["snippet"].get<std::string>());
// Expected full "value": the entire bio with every match wrapped in <mark>.
std::string highlighted_value = "Once there was a middle-aged boy named <mark>User_a</mark> who was an avid swimmer."
"He had been swimming competitively for most of his life, and had even competed in several national competitions."
"However, despite his passion and talent for the sport, he had never quite managed to win that elusive gold medal."
"Determined to change that, <mark>User_a</mark> began training harder than ever before."
"He woke up early every morning to swim laps before work and spent his evenings at the pool as well."
"Despite the grueling schedule, he never once complained."
"Instead, he reminded himself of his goal: to become a national champion.";
ASSERT_EQ( highlighted_value, result["hits"][0]["highlight"]["user"]["bio"]["value"].get<std::string>());
ASSERT_EQ("<mark>user_a</mark>",
result["hits"][0]["highlight"]["user_name"]["value"].get<std::string>());
highlight_full_fields = "user.*";
// user.* matches user.bio
// user_name is no longer covered (no dot), so it gets no full "value".
result = coll->search("user_a", {"*"}, "", {}, {}, {0},
10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD, dummy_include_exclude, dummy_include_exclude, 10, "",
30, 4, highlight_full_fields).get();
ASSERT_EQ(1, result["found"].get<size_t>());
ASSERT_EQ(1, result["hits"].size());
ASSERT_EQ(highlighted_value, result["hits"][0]["highlight"]["user"]["bio"]["value"].get<std::string>());
ASSERT_EQ(0, result["hits"][0]["highlight"]["user_name"].count("value"));
highlight_full_fields = "foo*";
// No matching field for highlight_fields
// A pattern matching nothing means no field carries a full "value".
result = coll->search("user_a", {"*"}, "", {}, {}, {0},
10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD, dummy_include_exclude, dummy_include_exclude, 10, "",
30, 4, highlight_full_fields).get();
ASSERT_EQ(0, result["hits"][0]["highlight"]["user"]["bio"].count("value"));
ASSERT_EQ(0, result["hits"][0]["highlight"]["user_name"].count("value"));
}