Fix include/exclude fields in export for nested docs.

Kishore Nallan 2023-01-10 16:17:35 +05:30
parent b430e7fa9a
commit 8b69d524ba
6 changed files with 208 additions and 72 deletions
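
This change fixes `include_fields` / `exclude_fields` handling in the documents export endpoint for collections with nested fields. The old export path compared whole top-level JSON keys against `std::set`s of raw field names, so a nested selector such as `name.last` never matched anything: an include produced an empty document and an exclude was silently ignored. The commit reroutes export through the same `tsl::htrie_set`-backed field resolution and `Collection::prune_doc` pruning that search uses. A minimal sketch of the old failure mode (illustrative, not code from this repo):

```cpp
#include <nlohmann/json.hpp>
#include <set>
#include <string>

int main() {
    // include_fields=name.last, as a client would pass it to the export endpoint
    std::set<std::string> include_fields{"name.last"};
    auto doc = R"({"name": {"first": "John", "last": "Smith"}, "points": 100})"_json;

    nlohmann::json filtered_doc;
    for(const auto& kv: doc.items()) {
        // kv.key() is only ever "name" or "points", so this exact-match lookup
        // never fires and the exported line is "{}" instead of {"name":{"last":...}}
        if(include_fields.count(kv.key()) != 0) {
            filtered_doc[kv.key()] = kv.value();
        }
    }
    return filtered_doc.empty() ? 0 : 1;
}
```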

View File
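The `Collection` header changes below move `remove_flat_fields` from the private section to the public one (the export handlers need to call it) and add the pruning entry points: `prune_doc`, which walks a document against include/exclude tries, plus `populate_include_exclude_fields`, which resolves raw field names against the schema into `tsl::htrie_set<char>` tries, and a `_lk` variant that does the same under the collection's lock for callers outside `Collection`. The switch from flat string sets to tries is what makes nested selectors addressable, since a trie supports prefix queries. A small sketch of the two `tsl::htrie_set` operations involved (illustrative):

```cpp
#include <tsl/htrie_set.h>
#include <iostream>

int main() {
    tsl::htrie_set<char> include_names;
    include_names.insert("name.last");

    // Exact lookup, same as a std::set:
    std::cout << include_names.count("name.last") << "\n";  // 1

    // Prefix traversal is the part a std::set cannot do: a pruning routine can
    // ask "does any included field live underneath this object?"
    auto range = include_names.equal_prefix_range("name.");
    for(auto it = range.first; it != range.second; ++it) {
        std::cout << it.key() << "\n";  // name.last
    }
    return 0;
}
```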

@ -215,8 +215,6 @@ private:
void populate_text_match_info(nlohmann::json& info, uint64_t match_score, const text_match_type_t match_type) const;
static void remove_flat_fields(nlohmann::json& document);
bool handle_highlight_text(std::string& text, bool normalise, const field &search_field,
const std::vector<char>& symbols_to_index, const std::vector<char>& token_separators,
highlight_t& highlight, StringUtils & string_utils, bool use_word_tokenizer,
@ -251,6 +249,11 @@ private:
static uint64_t extract_bits(uint64_t value, unsigned lsb_offset, unsigned n);
Option<bool> populate_include_exclude_fields(const spp::sparse_hash_set<std::string>& include_fields,
const spp::sparse_hash_set<std::string>& exclude_fields,
tsl::htrie_set<char>& include_fields_full,
tsl::htrie_set<char>& exclude_fields_full) const;
public:
enum {MAX_ARRAY_MATCHES = 5};
@ -337,6 +340,8 @@ public:
Option<uint32_t> index_in_memory(nlohmann::json & document, uint32_t seq_id,
const index_operation_t op, const DIRTY_VALUES& dirty_values);
static void remove_flat_fields(nlohmann::json& document);
static void prune_doc(nlohmann::json& doc, const tsl::htrie_set<char>& include_names,
const tsl::htrie_set<char>& exclude_names, const std::string& parent_name = "", size_t depth = 0);
@ -377,6 +382,11 @@ public:
std::string& req_dirty_values,
const int batch_size = 1000);
Option<bool> populate_include_exclude_fields_lk(const spp::sparse_hash_set<std::string>& include_fields,
const spp::sparse_hash_set<std::string>& exclude_fields,
tsl::htrie_set<char>& include_fields_full,
tsl::htrie_set<char>& exclude_fields_full) const;
Option<nlohmann::json> search(const std::string & query, const std::vector<std::string> & search_fields,
const std::string & filter_query, const std::vector<std::string> & facet_fields,
const std::vector<sort_by> & sort_fields, const std::vector<uint32_t>& num_typos,

View File
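Next, `export_state_t` stops carrying raw field-name strings and instead holds the already-resolved tries, so the export loop can hand them straight to `Collection::prune_doc` without re-interpreting names per document. For orientation, a throwaway sketch of how a comma-separated `include_fields` parameter ends up as the trie this struct now stores (`csv_to_trie` is illustrative, not a repo function; the real handler goes through `StringUtils::split` and `populate_include_exclude_fields_lk`):

```cpp
#include <tsl/htrie_set.h>
#include <sstream>
#include <string>

// Illustrative helper only: split on commas and load the names into a trie.
tsl::htrie_set<char> csv_to_trie(const std::string& csv) {
    tsl::htrie_set<char> out;
    std::stringstream ss(csv);
    std::string token;
    while(std::getline(ss, token, ',')) {
        if(!token.empty()) {
            out.insert(token);
        }
    }
    return out;
}

int main() {
    auto include_fields = csv_to_trie("name.last,points");
    return include_fields.count("name.last") == 1 ? 0 : 1;
}
```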

@ -22,8 +22,8 @@ struct export_state_t: public req_state_t {
Collection* collection;
std::vector<std::pair<size_t, uint32_t*>> index_ids;
std::vector<size_t> offsets;
std::set<std::string> include_fields;
std::set<std::string> exclude_fields;
tsl::htrie_set<char> include_fields;
tsl::htrie_set<char> exclude_fields;
size_t export_batch_size = 100;
std::string* res_body;

View File
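In `Collection::search` below, the two field-name resolution loops are hoisted out into the new `populate_include_exclude_fields`, which `search` now calls where the inline loops used to be; the second hunk adds that function plus a `populate_include_exclude_fields_lk` wrapper that acquires the collection's shared lock before delegating. The `_lk` suffix marks the variant that is safe to call without already holding the lock, which is what the export handler needs. A minimal sketch of that locking convention (illustrative class, not from the repo):

```cpp
#include <mutex>
#include <shared_mutex>

class Resolver {
public:
    // For external callers: take a shared (reader) lock, then delegate.
    bool resolve_lk() const {
        std::shared_lock lock(mutex);
        return resolve();
    }

private:
    // For internal callers (e.g. search) that already hold the lock.
    bool resolve() const {
        // ... schema lookups that assume the lock is held ...
        return true;
    }

    mutable std::shared_mutex mutex;
};

int main() {
    Resolver r;
    return r.resolve_lk() ? 0 : 1;
}
```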

@ -1113,49 +1113,16 @@ Option<nlohmann::json> Collection::search(const std::string & raw_query,
}
}
std::vector<std::string> include_fields_vec;
std::vector<std::string> exclude_fields_vec;
tsl::htrie_set<char> include_fields_full;
tsl::htrie_set<char> exclude_fields_full;
for(auto& f_name: include_fields) {
auto field_op = extract_field_name(f_name, search_schema, include_fields_vec, false, enable_nested_fields);
if(!field_op.ok()) {
if(field_op.code() == 404) {
// field need not be part of schema to be included (could be a stored value in the doc)
include_fields_vec.push_back(f_name);
continue;
}
return Option<nlohmann::json>(field_op.code(), field_op.error());
}
auto include_exclude_op = populate_include_exclude_fields(include_fields, exclude_fields,
include_fields_full, exclude_fields_full);
if(!include_exclude_op.ok()) {
return Option<nlohmann::json>(include_exclude_op.code(), include_exclude_op.error());
}
for(auto& f_name: exclude_fields) {
if(f_name == "out_of") {
// `out_of` is strictly a meta-field, but we handle it since it's useful
continue;
}
auto field_op = extract_field_name(f_name, search_schema, exclude_fields_vec, false, enable_nested_fields);
if(!field_op.ok()) {
if(field_op.code() == 404) {
// field need not be part of schema to be excluded (could be a stored value in the doc)
exclude_fields_vec.push_back(f_name);
continue;
}
return Option<nlohmann::json>(field_op.code(), field_op.error());
}
}
for(auto& f_name: include_fields_vec) {
include_fields_full.insert(f_name);
}
for(auto& f_name: exclude_fields_vec) {
exclude_fields_full.insert(f_name);
}
// process weights for search fields
std::vector<std::string> reordered_search_fields;
std::vector<search_field_t> weighted_search_fields;
@ -4327,4 +4294,60 @@ Option<bool> Collection::parse_facet(const std::string& facet_field, std::vector
}
return Option<bool>(true);
}
Option<bool> Collection::populate_include_exclude_fields(const spp::sparse_hash_set<std::string>& include_fields,
const spp::sparse_hash_set<std::string>& exclude_fields,
tsl::htrie_set<char>& include_fields_full,
tsl::htrie_set<char>& exclude_fields_full) const {
std::vector<std::string> include_fields_vec;
std::vector<std::string> exclude_fields_vec;
for(auto& f_name: include_fields) {
auto field_op = extract_field_name(f_name, search_schema, include_fields_vec, false, enable_nested_fields);
if(!field_op.ok()) {
if(field_op.code() == 404) {
// field need not be part of schema to be included (could be a stored value in the doc)
include_fields_vec.push_back(f_name);
continue;
}
return Option<bool>(field_op.code(), field_op.error());
}
}
for(auto& f_name: exclude_fields) {
if(f_name == "out_of") {
// `out_of` is strictly a meta-field, but we handle it since it's useful
continue;
}
auto field_op = extract_field_name(f_name, search_schema, exclude_fields_vec, false, enable_nested_fields);
if(!field_op.ok()) {
if(field_op.code() == 404) {
// field need not be part of schema to be excluded (could be a stored value in the doc)
exclude_fields_vec.push_back(f_name);
continue;
}
return Option<bool>(field_op.code(), field_op.error());
}
}
for(auto& f_name: include_fields_vec) {
include_fields_full.insert(f_name);
}
for(auto& f_name: exclude_fields_vec) {
exclude_fields_full.insert(f_name);
}
return Option<bool>(true);
}
Option<bool> Collection::populate_include_exclude_fields_lk(const spp::sparse_hash_set<std::string>& include_fields,
const spp::sparse_hash_set<std::string>& exclude_fields,
tsl::htrie_set<char>& include_fields_full,
tsl::htrie_set<char>& exclude_fields_full) const {
std::shared_lock lock(mutex);
return populate_include_exclude_fields(include_fields, exclude_fields, include_fields_full, exclude_fields_full);
}

View File
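In the export handler below, the raw `include_fields` / `exclude_fields` params are split into `spp::sparse_hash_set`s, resolved once via `populate_include_exclude_fields_lk` into the tries stored on `export_state`, and the manual per-key filtering loop over `doc.items()` is replaced by a single `Collection::prune_doc` call. (Note that the handler discards the `Option<bool>` returned by the resolver, so an unresolvable field name is ignored rather than reported.) `prune_doc`'s implementation is not part of this diff; the sketch below is an illustrative reconstruction of the semantics the new tests pin down: a key survives when it, an ancestor, or a descendant is included, and is dropped when its full dotted path is excluded.

```cpp
#include <tsl/htrie_set.h>
#include <nlohmann/json.hpp>
#include <string>

// Illustrative recursion only -- not the repo's Collection::prune_doc.
void prune_sketch(nlohmann::json& obj,
                  const tsl::htrie_set<char>& include_names,
                  const tsl::htrie_set<char>& exclude_names,
                  const std::string& parent = "", bool subtree_included = false) {
    for(auto it = obj.begin(); it != obj.end(); ) {
        const std::string path = parent.empty() ? it.key() : parent + "." + it.key();
        // included exactly, or via an included ancestor
        const bool exact = subtree_included || include_names.count(path) != 0;
        // included because some selector lives underneath this key
        auto prefix = include_names.equal_prefix_range(path + ".");
        const bool has_included_child = prefix.first != prefix.second;
        const bool included = include_names.empty() || exact || has_included_child;

        if(!included || exclude_names.count(path) != 0) {
            it = obj.erase(it);
            continue;
        }
        if(it.value().is_object()) {  // arrays of objects omitted for brevity
            prune_sketch(it.value(), include_names, exclude_names, path, exact);
        }
        ++it;
    }
}

int main() {
    auto doc = R"({"name": {"first": "John", "last": "Smith"}, "points": 100})"_json;
    tsl::htrie_set<char> inc, exc;
    inc.insert("name.last");
    prune_sketch(doc, inc, exc);
    return doc.dump() == R"({"name":{"last":"Smith"}})" ? 0 : 1;
}
```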

@ -603,6 +603,8 @@ bool get_export_documents(const std::shared_ptr<http_req>& req, const std::share
req->data = export_state;
std::string simple_filter_query;
spp::sparse_hash_set<std::string> exclude_fields;
spp::sparse_hash_set<std::string> include_fields;
if(req->params.count(FILTER_BY) != 0) {
simple_filter_query = req->params[FILTER_BY];
@ -611,15 +613,18 @@ bool get_export_documents(const std::shared_ptr<http_req>& req, const std::share
if(req->params.count(INCLUDE_FIELDS) != 0) {
std::vector<std::string> include_fields_vec;
StringUtils::split(req->params[INCLUDE_FIELDS], include_fields_vec, ",");
export_state->include_fields = std::set<std::string>(include_fields_vec.begin(), include_fields_vec.end());
include_fields = spp::sparse_hash_set<std::string>(include_fields_vec.begin(), include_fields_vec.end());
}
if(req->params.count(EXCLUDE_FIELDS) != 0) {
std::vector<std::string> exclude_fields_vec;
StringUtils::split(req->params[EXCLUDE_FIELDS], exclude_fields_vec, ",");
export_state->exclude_fields = std::set<std::string>(exclude_fields_vec.begin(), exclude_fields_vec.end());
exclude_fields = spp::sparse_hash_set<std::string>(exclude_fields_vec.begin(), exclude_fields_vec.end());
}
collection->populate_include_exclude_fields_lk(include_fields, exclude_fields,
export_state->include_fields, export_state->exclude_fields);
if(req->params.count(BATCH_SIZE) != 0 && StringUtils::is_uint32_t(req->params[BATCH_SIZE])) {
export_state->export_batch_size = std::stoul(req->params[BATCH_SIZE]);
}
@ -659,20 +664,8 @@ bool get_export_documents(const std::shared_ptr<http_req>& req, const std::share
res->body += it->value().ToString();
} else {
nlohmann::json doc = nlohmann::json::parse(it->value().ToString());
nlohmann::json filtered_doc;
for(const auto& kv: doc.items()) {
bool must_include = export_state->include_fields.empty() ||
(export_state->include_fields.count(kv.key()) != 0);
bool must_exclude = !export_state->exclude_fields.empty() &&
(export_state->exclude_fields.count(kv.key()) != 0);
if(must_include && !must_exclude) {
filtered_doc[kv.key()] = kv.value();
}
}
res->body += filtered_doc.dump();
Collection::prune_doc(doc, export_state->include_fields, export_state->exclude_fields);
res->body += doc.dump();
}
it->Next();

View File
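The stateful (paginated) export path below gets the same treatment, with one addition: `Collection::remove_flat_fields` runs before `prune_doc`, so the dotted-key copies created at index time for nested fields do not leak into the export output. Its implementation is not shown in this diff; the sketch below assumes Typesense's convention of recording flattened field names under a `.flat` key in the stored document, and is illustrative only:

```cpp
#include <nlohmann/json.hpp>
#include <string>

// Illustrative helper, assuming flattened names are listed under ".flat".
void remove_flat_fields_sketch(nlohmann::json& document) {
    auto it = document.find(".flat");
    if(it == document.end()) {
        return;
    }
    for(const auto& flat_name: *it) {
        document.erase(flat_name.get<std::string>());  // e.g. "name.first"
    }
    document.erase(".flat");
}

int main() {
    auto doc = R"({".flat": ["name.first", "name.last"],
                   "name.first": "John", "name.last": "Smith",
                   "name": {"first": "John", "last": "Smith"}})"_json;
    remove_flat_fields_sketch(doc);
    return doc.size() == 1 && doc.count("name") == 1 ? 0 : 1;
}
```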

@ -66,20 +66,9 @@ Option<bool> stateful_export_docs(export_state_t* export_state, size_t batch_siz
if(export_state->include_fields.empty() && export_state->exclude_fields.empty()) {
export_state->res_body->append(doc.dump());
} else {
nlohmann::json filtered_doc;
for(const auto& kv: doc.items()) {
bool must_include = export_state->include_fields.empty() ||
(export_state->include_fields.count(kv.key()) != 0);
bool must_exclude = !export_state->exclude_fields.empty() &&
(export_state->exclude_fields.count(kv.key()) != 0);
if(must_include && !must_exclude) {
filtered_doc[kv.key()] = kv.value();
}
}
export_state->res_body->append(filtered_doc.dump());
Collection::remove_flat_fields(doc);
Collection::prune_doc(doc, export_state->include_fields, export_state->exclude_fields);
export_state->res_body->append(doc.dump());
}
export_state->res_body->append("\n");

View File
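Finally, two tests cover the new behaviour end to end, with and without a `filter_by` clause: including `name.last` must yield documents containing only that subfield, and excluding it must keep everything else, including the sibling `name.first`. The expectations condense to a direct `prune_doc` call (signature as declared in the header diff above; this fragment would live inside the test fixture rather than compile standalone):

```cpp
tsl::htrie_set<char> include_names, exclude_names;
include_names.insert("name.last");
nlohmann::json doc = R"({"name": {"first": "John", "last": "Smith"}, "points": 100})"_json;
Collection::prune_doc(doc, include_names, exclude_names);
// doc is now {"name": {"last": "Smith"}} -- a single top-level key, matching
// the ASSERT_EQ(1, doc.size()) checks below.
```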

@ -559,3 +559,124 @@ TEST_F(CoreAPIUtilsTest, ExportWithFilter) {
ASSERT_TRUE(done);
ASSERT_EQ('}', export_state.res_body->back());
}
TEST_F(CoreAPIUtilsTest, ExportIncludeExcludeFields) {
nlohmann::json schema = R"({
"name": "coll1",
"enable_nested_fields": true,
"fields": [
{"name": "name", "type": "object" },
{"name": "points", "type": "int32" }
]
})"_json;
auto op = collectionManager.create_collection(schema);
ASSERT_TRUE(op.ok());
Collection* coll1 = op.get();
auto doc1 = R"({
"name": {"first": "John", "last": "Smith"},
"points": 100
})"_json;
auto add_op = coll1->add(doc1.dump(), CREATE);
ASSERT_TRUE(add_op.ok());
std::shared_ptr<http_req> req = std::make_shared<http_req>();
std::shared_ptr<http_res> res = std::make_shared<http_res>(nullptr);
req->params["collection"] = "coll1";
// include fields
req->params["include_fields"] = "name.last";
get_export_documents(req, res);
std::vector<std::string> res_strs;
StringUtils::split(res->body, res_strs, "\n");
nlohmann::json doc = nlohmann::json::parse(res_strs[0]);
ASSERT_EQ(1, doc.size());
ASSERT_EQ(1, doc.count("name"));
ASSERT_EQ(1, doc["name"].count("last"));
// exclude fields
delete dynamic_cast<export_state_t*>(req->data);
req->data = nullptr;
res->body.clear();
req->params.erase("include_fields");
req->params["exclude_fields"] = "name.last";
get_export_documents(req, res);
res_strs.clear();
StringUtils::split(res->body, res_strs, "\n");
doc = nlohmann::json::parse(res_strs[0]);
ASSERT_EQ(3, doc.size());
ASSERT_EQ(1, doc.count("id"));
ASSERT_EQ(1, doc.count("points"));
ASSERT_EQ(1, doc.count("name"));
ASSERT_EQ(1, doc["name"].count("first"));
collectionManager.drop_collection("coll1");
}
TEST_F(CoreAPIUtilsTest, ExportIncludeExcludeFieldsWithFilter) {
nlohmann::json schema = R"({
"name": "coll1",
"enable_nested_fields": true,
"fields": [
{"name": "name", "type": "object" },
{"name": "points", "type": "int32" }
]
})"_json;
auto op = collectionManager.create_collection(schema);
ASSERT_TRUE(op.ok());
Collection* coll1 = op.get();
auto doc1 = R"({
"name": {"first": "John", "last": "Smith"},
"points": 100
})"_json;
auto add_op = coll1->add(doc1.dump(), CREATE);
ASSERT_TRUE(add_op.ok());
std::shared_ptr<http_req> req = std::make_shared<http_req>();
std::shared_ptr<http_res> res = std::make_shared<http_res>(nullptr);
req->params["collection"] = "coll1";
// include fields
req->params["include_fields"] = "name.last";
req->params["filter_by"] = "points:>=0";
get_export_documents(req, res);
std::vector<std::string> res_strs;
StringUtils::split(res->body, res_strs, "\n");
nlohmann::json doc = nlohmann::json::parse(res_strs[0]);
ASSERT_EQ(1, doc.size());
ASSERT_EQ(1, doc.count("name"));
ASSERT_EQ(1, doc["name"].count("last"));
// exclude fields
delete dynamic_cast<export_state_t*>(req->data);
req->data = nullptr;
res->body.clear();
req->params.erase("include_fields");
req->params["exclude_fields"] = "name.last";
get_export_documents(req, res);
res_strs.clear();
StringUtils::split(res->body, res_strs, "\n");
doc = nlohmann::json::parse(res_strs[0]);
ASSERT_EQ(3, doc.size());
ASSERT_EQ(1, doc.count("id"));
ASSERT_EQ(1, doc.count("points"));
ASSERT_EQ(1, doc.count("name"));
ASSERT_EQ(1, doc["name"].count("first"));
collectionManager.drop_collection("coll1");
}