Fix include/exclude fields in export for nested docs.

Kishore Nallan 2023-01-10 16:17:35 +05:30
parent b430e7fa9a
commit 8b69d524ba
6 changed files with 208 additions and 72 deletions
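
This change fixes `include_fields` / `exclude_fields` handling in the documents export endpoint for collections with nested fields. The old export path compared whole top-level JSON keys against `std::set`s of raw field names, so a nested selector such as `name.last` never matched anything: an include produced an empty document and an exclude was silently ignored. The commit reroutes export through the same `tsl::htrie_set`-backed field resolution and `Collection::prune_doc` pruning that search uses. A minimal sketch of the old failure mode (illustrative, not code from this repo):

```cpp
#include <nlohmann/json.hpp>
#include <set>
#include <string>

int main() {
    // include_fields=name.last, as a client would pass it to the export endpoint
    std::set<std::string> include_fields{"name.last"};
    auto doc = R"({"name": {"first": "John", "last": "Smith"}, "points": 100})"_json;

    nlohmann::json filtered_doc;
    for(const auto& kv: doc.items()) {
        // kv.key() is only ever "name" or "points", so this exact-match lookup
        // never fires and the exported line is "{}" instead of {"name":{"last":...}}
        if(include_fields.count(kv.key()) != 0) {
            filtered_doc[kv.key()] = kv.value();
        }
    }
    return filtered_doc.empty() ? 0 : 1;
}
```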

View File
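The `Collection` header changes below move `remove_flat_fields` from the private section to the public one (the export handlers need to call it) and add the pruning entry points: `prune_doc`, which walks a document against include/exclude tries, plus `populate_include_exclude_fields`, which resolves raw field names against the schema into `tsl::htrie_set<char>` tries, and a `_lk` variant that does the same under the collection's lock for callers outside `Collection`. The switch from flat string sets to tries is what makes nested selectors addressable, since a trie supports prefix queries. A small sketch of the two `tsl::htrie_set` operations involved (illustrative):

```cpp
#include <tsl/htrie_set.h>
#include <iostream>

int main() {
    tsl::htrie_set<char> include_names;
    include_names.insert("name.last");

    // Exact lookup, same as a std::set:
    std::cout << include_names.count("name.last") << "\n";  // 1

    // Prefix traversal is the part a std::set cannot do: a pruning routine can
    // ask "does any included field live underneath this object?"
    auto range = include_names.equal_prefix_range("name.");
    for(auto it = range.first; it != range.second; ++it) {
        std::cout << it.key() << "\n";  // name.last
    }
    return 0;
}
```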

@ -215,8 +215,6 @@ private:
void populate_text_match_info(nlohmann::json& info, uint64_t match_score, const text_match_type_t match_type) const;
static void remove_flat_fields(nlohmann::json& document);
bool handle_highlight_text(std::string& text, bool normalise, const field &search_field,
const std::vector<char>& symbols_to_index, const std::vector<char>& token_separators,
highlight_t& highlight, StringUtils & string_utils, bool use_word_tokenizer,
@ -251,6 +249,11 @@ private:
static uint64_t extract_bits(uint64_t value, unsigned lsb_offset, unsigned n);
Option<bool> populate_include_exclude_fields(const spp::sparse_hash_set<std::string>& include_fields,
const spp::sparse_hash_set<std::string>& exclude_fields,
tsl::htrie_set<char>& include_fields_full,
tsl::htrie_set<char>& exclude_fields_full) const;
public:
enum {MAX_ARRAY_MATCHES = 5};
@ -337,6 +340,8 @@ public:
Option<uint32_t> index_in_memory(nlohmann::json & document, uint32_t seq_id,
const index_operation_t op, const DIRTY_VALUES& dirty_values);
static void remove_flat_fields(nlohmann::json& document);
static void prune_doc(nlohmann::json& doc, const tsl::htrie_set<char>& include_names,
const tsl::htrie_set<char>& exclude_names, const std::string& parent_name = "", size_t depth = 0);
@ -377,6 +382,11 @@ public:
std::string& req_dirty_values,
const int batch_size = 1000);
Option<bool> populate_include_exclude_fields_lk(const spp::sparse_hash_set<std::string>& include_fields,
const spp::sparse_hash_set<std::string>& exclude_fields,
tsl::htrie_set<char>& include_fields_full,
tsl::htrie_set<char>& exclude_fields_full) const;
Option<nlohmann::json> search(const std::string & query, const std::vector<std::string> & search_fields,
const std::string & filter_query, const std::vector<std::string> & facet_fields,
const std::vector<sort_by> & sort_fields, const std::vector<uint32_t>& num_typos,

View File
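Next, `export_state_t` stops carrying raw field-name strings and instead holds the already-resolved tries, so the export loop can hand them straight to `Collection::prune_doc` without re-interpreting names per document. For orientation, a throwaway sketch of how a comma-separated `include_fields` parameter ends up as the trie this struct now stores (`csv_to_trie` is illustrative, not a repo function; the real handler goes through `StringUtils::split` and `populate_include_exclude_fields_lk`):

```cpp
#include <tsl/htrie_set.h>
#include <sstream>
#include <string>

// Illustrative helper only: split on commas and load the names into a trie.
tsl::htrie_set<char> csv_to_trie(const std::string& csv) {
    tsl::htrie_set<char> out;
    std::stringstream ss(csv);
    std::string token;
    while(std::getline(ss, token, ',')) {
        if(!token.empty()) {
            out.insert(token);
        }
    }
    return out;
}

int main() {
    auto include_fields = csv_to_trie("name.last,points");
    return include_fields.count("name.last") == 1 ? 0 : 1;
}
```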

@ -22,8 +22,8 @@ struct export_state_t: public req_state_t {
Collection* collection;
std::vector<std::pair<size_t, uint32_t*>> index_ids;
std::vector<size_t> offsets;
std::set<std::string> include_fields;
std::set<std::string> exclude_fields;
tsl::htrie_set<char> include_fields;
tsl::htrie_set<char> exclude_fields;
size_t export_batch_size = 100;
std::string* res_body;

View File
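In `Collection::search` below, the two field-name resolution loops are hoisted out into the new `populate_include_exclude_fields`, which `search` now calls where the inline loops used to be; the second hunk adds that function plus a `populate_include_exclude_fields_lk` wrapper that acquires the collection's shared lock before delegating. The `_lk` suffix marks the variant that is safe to call without already holding the lock, which is what the export handler needs. A minimal sketch of that locking convention (illustrative class, not from the repo):

```cpp
#include <mutex>
#include <shared_mutex>

class Resolver {
public:
    // For external callers: take a shared (reader) lock, then delegate.
    bool resolve_lk() const {
        std::shared_lock lock(mutex);
        return resolve();
    }

private:
    // For internal callers (e.g. search) that already hold the lock.
    bool resolve() const {
        // ... schema lookups that assume the lock is held ...
        return true;
    }

    mutable std::shared_mutex mutex;
};

int main() {
    Resolver r;
    return r.resolve_lk() ? 0 : 1;
}
```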

@ -1113,49 +1113,16 @@ Option<nlohmann::json> Collection::search(const std::string & raw_query,
}
}
std::vector<std::string> include_fields_vec;
std::vector<std::string> exclude_fields_vec;
tsl::htrie_set<char> include_fields_full;
tsl::htrie_set<char> exclude_fields_full;
for(auto& f_name: include_fields) {
auto field_op = extract_field_name(f_name, search_schema, include_fields_vec, false, enable_nested_fields);
if(!field_op.ok()) {
if(field_op.code() == 404) {
// field need not be part of schema to be included (could be a stored value in the doc)
include_fields_vec.push_back(f_name);
continue;
}
return Option<nlohmann::json>(field_op.code(), field_op.error());
}
auto include_exclude_op = populate_include_exclude_fields(include_fields, exclude_fields,
include_fields_full, exclude_fields_full);
if(!include_exclude_op.ok()) {
return Option<nlohmann::json>(include_exclude_op.code(), include_exclude_op.error());
}
for(auto& f_name: exclude_fields) {
if(f_name == "out_of") {
// `out_of` is strictly a meta-field, but we handle it since it's useful
continue;
}
auto field_op = extract_field_name(f_name, search_schema, exclude_fields_vec, false, enable_nested_fields);
if(!field_op.ok()) {
if(field_op.code() == 404) {
// field need not be part of schema to be excluded (could be a stored value in the doc)
exclude_fields_vec.push_back(f_name);
continue;
}
return Option<nlohmann::json>(field_op.code(), field_op.error());
}
}
for(auto& f_name: include_fields_vec) {
include_fields_full.insert(f_name);
}
for(auto& f_name: exclude_fields_vec) {
exclude_fields_full.insert(f_name);
}
// process weights for search fields
std::vector<std::string> reordered_search_fields;
std::vector<search_field_t> weighted_search_fields;
@ -4327,4 +4294,60 @@ Option<bool> Collection::parse_facet(const std::string& facet_field, std::vector
}
return Option<bool>(true);
}
Option<bool> Collection::populate_include_exclude_fields(const spp::sparse_hash_set<std::string>& include_fields,
const spp::sparse_hash_set<std::string>& exclude_fields,
tsl::htrie_set<char>& include_fields_full,
tsl::htrie_set<char>& exclude_fields_full) const {
std::vector<std::string> include_fields_vec;
std::vector<std::string> exclude_fields_vec;
for(auto& f_name: include_fields) {
auto field_op = extract_field_name(f_name, search_schema, include_fields_vec, false, enable_nested_fields);
if(!field_op.ok()) {
if(field_op.code() == 404) {
// field need not be part of schema to be included (could be a stored value in the doc)
include_fields_vec.push_back(f_name);
continue;
}
return Option<bool>(field_op.code(), field_op.error());
}
}
for(auto& f_name: exclude_fields) {
if(f_name == "out_of") {
// `out_of` is strictly a meta-field, but we handle it since it's useful
continue;
}
auto field_op = extract_field_name(f_name, search_schema, exclude_fields_vec, false, enable_nested_fields);
if(!field_op.ok()) {
if(field_op.code() == 404) {
// field need not be part of schema to be excluded (could be a stored value in the doc)
exclude_fields_vec.push_back(f_name);
continue;
}
return Option<bool>(field_op.code(), field_op.error());
}
}
for(auto& f_name: include_fields_vec) {
include_fields_full.insert(f_name);
}
for(auto& f_name: exclude_fields_vec) {
exclude_fields_full.insert(f_name);
}
return Option<bool>(true);
}
Option<bool> Collection::populate_include_exclude_fields_lk(const spp::sparse_hash_set<std::string>& include_fields,
const spp::sparse_hash_set<std::string>& exclude_fields,
tsl::htrie_set<char>& include_fields_full,
tsl::htrie_set<char>& exclude_fields_full) const {
std::shared_lock lock(mutex);
return populate_include_exclude_fields(include_fields, exclude_fields, include_fields_full, exclude_fields_full);
}

View File
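In the export handler below, the raw `include_fields` / `exclude_fields` params are split into `spp::sparse_hash_set`s, resolved once via `populate_include_exclude_fields_lk` into the tries stored on `export_state`, and the manual per-key filtering loop over `doc.items()` is replaced by a single `Collection::prune_doc` call. (Note that the handler discards the `Option<bool>` returned by the resolver, so an unresolvable field name is ignored rather than reported.) `prune_doc`'s implementation is not part of this diff; the sketch below is an illustrative reconstruction of the semantics the new tests pin down: a key survives when it, an ancestor, or a descendant is included, and is dropped when its full dotted path is excluded.

```cpp
#include <tsl/htrie_set.h>
#include <nlohmann/json.hpp>
#include <string>

// Illustrative recursion only -- not the repo's Collection::prune_doc.
void prune_sketch(nlohmann::json& obj,
                  const tsl::htrie_set<char>& include_names,
                  const tsl::htrie_set<char>& exclude_names,
                  const std::string& parent = "", bool subtree_included = false) {
    for(auto it = obj.begin(); it != obj.end(); ) {
        const std::string path = parent.empty() ? it.key() : parent + "." + it.key();
        // included exactly, or via an included ancestor
        const bool exact = subtree_included || include_names.count(path) != 0;
        // included because some selector lives underneath this key
        auto prefix = include_names.equal_prefix_range(path + ".");
        const bool has_included_child = prefix.first != prefix.second;
        const bool included = include_names.empty() || exact || has_included_child;

        if(!included || exclude_names.count(path) != 0) {
            it = obj.erase(it);
            continue;
        }
        if(it.value().is_object()) {  // arrays of objects omitted for brevity
            prune_sketch(it.value(), include_names, exclude_names, path, exact);
        }
        ++it;
    }
}

int main() {
    auto doc = R"({"name": {"first": "John", "last": "Smith"}, "points": 100})"_json;
    tsl::htrie_set<char> inc, exc;
    inc.insert("name.last");
    prune_sketch(doc, inc, exc);
    return doc.dump() == R"({"name":{"last":"Smith"}})" ? 0 : 1;
}
```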

@ -603,6 +603,8 @@ bool get_export_documents(const std::shared_ptr<http_req>& req, const std::share
req->data = export_state;
std::string simple_filter_query;
spp::sparse_hash_set<std::string> exclude_fields;
spp::sparse_hash_set<std::string> include_fields;
if(req->params.count(FILTER_BY) != 0) {
simple_filter_query = req->params[FILTER_BY];
@ -611,15 +613,18 @@ bool get_export_documents(const std::shared_ptr<http_req>& req, const std::share
if(req->params.count(INCLUDE_FIELDS) != 0) {
std::vector<std::string> include_fields_vec;
StringUtils::split(req->params[INCLUDE_FIELDS], include_fields_vec, ",");
export_state->include_fields = std::set<std::string>(include_fields_vec.begin(), include_fields_vec.end());
include_fields = spp::sparse_hash_set<std::string>(include_fields_vec.begin(), include_fields_vec.end());
}
if(req->params.count(EXCLUDE_FIELDS) != 0) {
std::vector<std::string> exclude_fields_vec;
StringUtils::split(req->params[EXCLUDE_FIELDS], exclude_fields_vec, ",");
export_state->exclude_fields = std::set<std::string>(exclude_fields_vec.begin(), exclude_fields_vec.end());
exclude_fields = spp::sparse_hash_set<std::string>(exclude_fields_vec.begin(), exclude_fields_vec.end());
}
collection->populate_include_exclude_fields_lk(include_fields, exclude_fields,
export_state->include_fields, export_state->exclude_fields);
if(req->params.count(BATCH_SIZE) != 0 && StringUtils::is_uint32_t(req->params[BATCH_SIZE])) {
export_state->export_batch_size = std::stoul(req->params[BATCH_SIZE]);
}
@ -659,20 +664,8 @@ bool get_export_documents(const std::shared_ptr<http_req>& req, const std::share
res->body += it->value().ToString();
} else {
nlohmann::json doc = nlohmann::json::parse(it->value().ToString());
nlohmann::json filtered_doc;
for(const auto& kv: doc.items()) {
bool must_include = export_state->include_fields.empty() ||
(export_state->include_fields.count(kv.key()) != 0);
bool must_exclude = !export_state->exclude_fields.empty() &&
(export_state->exclude_fields.count(kv.key()) != 0);
if(must_include && !must_exclude) {
filtered_doc[kv.key()] = kv.value();
}
}
res->body += filtered_doc.dump();
Collection::prune_doc(doc, export_state->include_fields, export_state->exclude_fields);
res->body += doc.dump();
}
it->Next();

View File
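The stateful (paginated) export path below gets the same treatment, with one addition: `Collection::remove_flat_fields` runs before `prune_doc`, so the dotted-key copies created at index time for nested fields do not leak into the export output. Its implementation is not shown in this diff; the sketch below assumes Typesense's convention of recording flattened field names under a `.flat` key in the stored document, and is illustrative only:

```cpp
#include <nlohmann/json.hpp>
#include <string>

// Illustrative helper, assuming flattened names are listed under ".flat".
void remove_flat_fields_sketch(nlohmann::json& document) {
    auto it = document.find(".flat");
    if(it == document.end()) {
        return;
    }
    for(const auto& flat_name: *it) {
        document.erase(flat_name.get<std::string>());  // e.g. "name.first"
    }
    document.erase(".flat");
}

int main() {
    auto doc = R"({".flat": ["name.first", "name.last"],
                   "name.first": "John", "name.last": "Smith",
                   "name": {"first": "John", "last": "Smith"}})"_json;
    remove_flat_fields_sketch(doc);
    return doc.size() == 1 && doc.count("name") == 1 ? 0 : 1;
}
```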

@ -66,20 +66,9 @@ Option<bool> stateful_export_docs(export_state_t* export_state, size_t batch_siz
if(export_state->include_fields.empty() && export_state->exclude_fields.empty()) {
export_state->res_body->append(doc.dump());
} else {
nlohmann::json filtered_doc;
for(const auto& kv: doc.items()) {
bool must_include = export_state->include_fields.empty() ||
(export_state->include_fields.count(kv.key()) != 0);
bool must_exclude = !export_state->exclude_fields.empty() &&
(export_state->exclude_fields.count(kv.key()) != 0);
if(must_include && !must_exclude) {
filtered_doc[kv.key()] = kv.value();
}
}
export_state->res_body->append(filtered_doc.dump());
Collection::remove_flat_fields(doc);
Collection::prune_doc(doc, export_state->include_fields, export_state->exclude_fields);
export_state->res_body->append(doc.dump());
}
export_state->res_body->append("\n");

View File
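Finally, two tests cover the new behaviour end to end, with and without a `filter_by` clause: including `name.last` must yield documents containing only that subfield, and excluding it must keep everything else, including the sibling `name.first`. The expectations condense to a direct `prune_doc` call (signature as declared in the header diff above; this fragment would live inside the test fixture rather than compile standalone):

```cpp
tsl::htrie_set<char> include_names, exclude_names;
include_names.insert("name.last");
nlohmann::json doc = R"({"name": {"first": "John", "last": "Smith"}, "points": 100})"_json;
Collection::prune_doc(doc, include_names, exclude_names);
// doc is now {"name": {"last": "Smith"}} -- a single top-level key, matching
// the ASSERT_EQ(1, doc.size()) checks below.
```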

@ -559,3 +559,124 @@ TEST_F(CoreAPIUtilsTest, ExportWithFilter) {
ASSERT_TRUE(done);
ASSERT_EQ('}', export_state.res_body->back());
}
TEST_F(CoreAPIUtilsTest, ExportIncludeExcludeFields) {
nlohmann::json schema = R"({
"name": "coll1",
"enable_nested_fields": true,
"fields": [
{"name": "name", "type": "object" },
{"name": "points", "type": "int32" }
]
})"_json;
auto op = collectionManager.create_collection(schema);
ASSERT_TRUE(op.ok());
Collection* coll1 = op.get();
auto doc1 = R"({
"name": {"first": "John", "last": "Smith"},
"points": 100
})"_json;
auto add_op = coll1->add(doc1.dump(), CREATE);
ASSERT_TRUE(add_op.ok());
std::shared_ptr<http_req> req = std::make_shared<http_req>();
std::shared_ptr<http_res> res = std::make_shared<http_res>(nullptr);
req->params["collection"] = "coll1";
// include fields
req->params["include_fields"] = "name.last";
get_export_documents(req, res);
std::vector<std::string> res_strs;
StringUtils::split(res->body, res_strs, "\n");
nlohmann::json doc = nlohmann::json::parse(res_strs[0]);
ASSERT_EQ(1, doc.size());
ASSERT_EQ(1, doc.count("name"));
ASSERT_EQ(1, doc["name"].count("last"));
// exclude fields
delete dynamic_cast<export_state_t*>(req->data);
req->data = nullptr;
res->body.clear();
req->params.erase("include_fields");
req->params["exclude_fields"] = "name.last";
get_export_documents(req, res);
res_strs.clear();
StringUtils::split(res->body, res_strs, "\n");
doc = nlohmann::json::parse(res_strs[0]);
ASSERT_EQ(3, doc.size());
ASSERT_EQ(1, doc.count("id"));
ASSERT_EQ(1, doc.count("points"));
ASSERT_EQ(1, doc.count("name"));
ASSERT_EQ(1, doc["name"].count("first"));
collectionManager.drop_collection("coll1");
}
TEST_F(CoreAPIUtilsTest, ExportIncludeExcludeFieldsWithFilter) {
nlohmann::json schema = R"({
"name": "coll1",
"enable_nested_fields": true,
"fields": [
{"name": "name", "type": "object" },
{"name": "points", "type": "int32" }
]
})"_json;
auto op = collectionManager.create_collection(schema);
ASSERT_TRUE(op.ok());
Collection* coll1 = op.get();
auto doc1 = R"({
"name": {"first": "John", "last": "Smith"},
"points": 100
})"_json;
auto add_op = coll1->add(doc1.dump(), CREATE);
ASSERT_TRUE(add_op.ok());
std::shared_ptr<http_req> req = std::make_shared<http_req>();
std::shared_ptr<http_res> res = std::make_shared<http_res>(nullptr);
req->params["collection"] = "coll1";
// include fields
req->params["include_fields"] = "name.last";
req->params["filter_by"] = "points:>=0";
get_export_documents(req, res);
std::vector<std::string> res_strs;
StringUtils::split(res->body, res_strs, "\n");
nlohmann::json doc = nlohmann::json::parse(res_strs[0]);
ASSERT_EQ(1, doc.size());
ASSERT_EQ(1, doc.count("name"));
ASSERT_EQ(1, doc["name"].count("last"));
// exclude fields
delete dynamic_cast<export_state_t*>(req->data);
req->data = nullptr;
res->body.clear();
req->params.erase("include_fields");
req->params["exclude_fields"] = "name.last";
get_export_documents(req, res);
res_strs.clear();
StringUtils::split(res->body, res_strs, "\n");
doc = nlohmann::json::parse(res_strs[0]);
ASSERT_EQ(3, doc.size());
ASSERT_EQ(1, doc.count("id"));
ASSERT_EQ(1, doc.count("points"));
ASSERT_EQ(1, doc.count("name"));
ASSERT_EQ(1, doc["name"].count("first"));
collectionManager.drop_collection("coll1");
}