Explicitly dot-separated fields should take precedence over values flattened from nested objects.

This commit is contained in:
Kishore Nallan 2022-08-08 11:47:10 +05:30
parent 134af13281
commit 9b4fb12729
4 changed files with 179 additions and 43 deletions

View File

@ -386,11 +386,11 @@ struct field {
}
static bool flatten_obj(nlohmann::json& doc, nlohmann::json& value, bool has_array, bool has_obj_array,
const std::string& flat_name, std::vector<field>& flattened_fields);
const std::string& flat_name, std::unordered_map<std::string, field>& flattened_fields);
static bool flatten_field(nlohmann::json& doc, nlohmann::json& obj, const field& the_field,
std::vector<std::string>& path_parts, size_t path_index, bool has_array,
bool has_obj_array, std::vector<field>& flattened_fields);
bool has_obj_array, std::unordered_map<std::string, field>& flattened_fields);
static Option<bool> flatten_doc(nlohmann::json& document, const std::vector<field>& nested_fields,
std::vector<field>& flattened_fields);

View File

@ -536,7 +536,7 @@ Option<bool> field::json_field_to_field(nlohmann::json& field_json, std::vector<
}
bool field::flatten_obj(nlohmann::json& doc, nlohmann::json& value, bool has_array, bool has_obj_array,
const std::string& flat_name, std::vector<field>& flattened_fields) {
const std::string& flat_name, std::unordered_map<std::string, field>& flattened_fields) {
if(value.is_object()) {
has_obj_array = has_array;
for(const auto& kv: value.items()) {
@ -546,8 +546,11 @@ bool field::flatten_obj(nlohmann::json& doc, nlohmann::json& value, bool has_arr
for(const auto& kv: value.items()) {
flatten_obj(doc, kv.value(), true, has_obj_array, flat_name, flattened_fields);
}
} else {
// must be a primitive
} else { // must be a primitive
if(doc.count(flat_name) != 0 && flattened_fields.find(flat_name) == flattened_fields.end()) {
return true;
}
if(has_array) {
doc[flat_name].push_back(value);
} else {
@ -567,7 +570,7 @@ bool field::flatten_obj(nlohmann::json& doc, nlohmann::json& value, bool has_arr
field flattened_field(flat_name, detected_type, false, true);
flattened_field.nested = true;
flattened_field.nested_array = has_obj_array;
flattened_fields.push_back(flattened_field);
flattened_fields[flat_name] = flattened_field;
}
return true;
@ -575,7 +578,7 @@ bool field::flatten_obj(nlohmann::json& doc, nlohmann::json& value, bool has_arr
bool field::flatten_field(nlohmann::json& doc, nlohmann::json& obj, const field& the_field,
std::vector<std::string>& path_parts, size_t path_index,
bool has_array, bool has_obj_array, std::vector<field>& flattened_fields) {
bool has_array, bool has_obj_array, std::unordered_map<std::string, field>& flattened_fields) {
if(path_index == path_parts.size()) {
// end of path: check if obj matches expected type
std::string detected_type;
@ -590,10 +593,21 @@ bool field::flatten_field(nlohmann::json& doc, nlohmann::json& obj, const field&
has_obj_array = has_obj_array || ((detected_type == field_types::OBJECT) && has_array);
if(detected_type == the_field.type) {
// handle differences in detection of numerical types
bool is_numericaly_valid = (detected_type != the_field.type) &&
((detected_type == field_types::INT64 &&
(the_field.type == field_types::INT32 || the_field.type == field_types::FLOAT)) ||
(detected_type == field_types::INT64_ARRAY &&
(the_field.type == field_types::INT32_ARRAY || the_field.type == field_types::FLOAT_ARRAY)));
if(detected_type == the_field.type || is_numericaly_valid) {
if(the_field.is_object()) {
flatten_obj(doc, obj, has_array, has_obj_array, the_field.name, flattened_fields);
} else {
if(doc.count(the_field.name) != 0 && flattened_fields.find(the_field.name) == flattened_fields.end()) {
return true;
}
if(has_array) {
doc[the_field.name].push_back(obj);
} else {
@ -603,30 +617,9 @@ bool field::flatten_field(nlohmann::json& doc, nlohmann::json& obj, const field&
field flattened_field(the_field.name, detected_type, false, true);
flattened_field.nested = (path_index > 1);
flattened_field.nested_array = has_obj_array;
flattened_fields.push_back(flattened_field);
flattened_fields[the_field.name] = flattened_field;
}
return true;
}
// handle differences in detection of numerical types
bool is_numericaly_valid = (detected_type == field_types::INT64 && (the_field.type == field_types::INT32 ||
the_field.type == field_types::FLOAT)) ||
(detected_type == field_types::INT64_ARRAY &&
(the_field.type == field_types::INT32_ARRAY ||
the_field.type == field_types::FLOAT_ARRAY));
if(is_numericaly_valid) {
if(has_array) {
doc[the_field.name].push_back(obj);
} else {
doc[the_field.name] = obj;
}
field flattened_field(the_field.name, the_field.type, false, true);
flattened_field.nested = (path_index > 1);
flattened_field.nested_array = has_obj_array;
flattened_fields.push_back(flattened_field);
return true;
} else {
return false;
@ -657,22 +650,27 @@ Option<bool> field::flatten_doc(nlohmann::json& document,
const std::vector<field>& nested_fields,
std::vector<field>& flattened_fields) {
std::unordered_map<std::string, field> flattened_fields_map;
for(auto& nested_field: nested_fields) {
std::vector<std::string> field_parts;
StringUtils::split(nested_field.name, field_parts, ".");
bool resolved = flatten_field(document, document, nested_field, field_parts, 0, false, false, flattened_fields);
if(field_parts.size() > 1 && document.count(nested_field.name) != 0) {
// skip explicitly present nested fields
continue;
}
bool resolved = flatten_field(document, document, nested_field, field_parts, 0, false, false, flattened_fields_map);
if(!resolved && !nested_field.optional) {
return Option<bool>(400, "Field `" + nested_field.name + "` was not found or has an incorrect type.");
}
}
std::sort(flattened_fields.begin(), flattened_fields.end());
flattened_fields.erase(std::unique(flattened_fields.begin(), flattened_fields.end()), flattened_fields.end());
document[".flat"] = nlohmann::json::array();
for(auto& f: flattened_fields) {
document[".flat"].push_back(f.name);
for(auto& kv: flattened_fields_map) {
document[".flat"].push_back(kv.second.name);
flattened_fields.push_back(kv.second);
}
return Option<bool>(true);

View File

@ -1112,8 +1112,8 @@ TEST_F(CollectionAllFieldsTest, WildcardFieldAndDictionaryField) {
ASSERT_EQ(4, schema.size());
ASSERT_EQ(".*", schema[0].name);
ASSERT_EQ("year", schema[1].name);
ASSERT_EQ("kinds.CGXX", schema[2].name);
ASSERT_EQ("kinds.ZBXX", schema[3].name);
ASSERT_EQ("kinds.ZBXX", schema[2].name);
ASSERT_EQ("kinds.CGXX", schema[3].name);
// filter on object key
results = coll1->search("*", {}, "kinds.CGXX: 13", {}, sort_fields, {0}, 10, 1, FREQUENCY, {false}).get();

View File

@ -65,8 +65,8 @@ TEST_F(CollectionNestedFieldsTest, FlattenJSONObject) {
auto expected_json = R"(
{
".flat": ["locations.address.city","locations.address.products","locations.address.street",
"locations.country","locations.pincode"],
".flat": ["locations.pincode","locations.country","locations.address.street","locations.address.products",
"locations.address.city"],
"company":{"name":"nike"},
"employees":{"num":1200},
"locations":[
@ -132,7 +132,7 @@ TEST_F(CollectionNestedFieldsTest, FlattenJSONObject) {
expected_json = R"(
{
".flat": ["locations.address.city", "locations.address.products", "locations.address.street"],
".flat": ["locations.address.street", "locations.address.products","locations.address.city"],
"company":{"name":"nike"},
"employees":{"num":1200},
"locations":[
@ -236,6 +236,7 @@ TEST_F(CollectionNestedFieldsTest, TestNestedArrayField) {
// test against deep paths
flattened_fields.clear();
doc = nlohmann::json::parse(json_str);
nested_fields = {
field("employees.details.num_tags", field_types::INT32_ARRAY, false),
field("employees.details.tags", field_types::STRING_ARRAY, false),
@ -248,10 +249,10 @@ TEST_F(CollectionNestedFieldsTest, TestNestedArrayField) {
ASSERT_EQ("employees.detail.tags",flattened_fields[0].name);
ASSERT_FALSE(flattened_fields[0].nested_array);
ASSERT_EQ("employees.details.num_tags",flattened_fields[1].name);
ASSERT_EQ("employees.details.tags",flattened_fields[1].name);
ASSERT_TRUE(flattened_fields[1].nested_array);
ASSERT_EQ("employees.details.tags",flattened_fields[2].name);
ASSERT_EQ("employees.details.num_tags",flattened_fields[2].name);
ASSERT_TRUE(flattened_fields[2].nested_array);
}
@ -1117,6 +1118,113 @@ TEST_F(CollectionNestedFieldsTest, VerifyDisableOfNestedFields) {
ASSERT_EQ(2, coll2->get_fields().size());
}
// Verifies that when a document carries both a nested object and an explicit
// top-level key written in dotted form (e.g. "company.num_employees"), searches
// match the explicit dotted value and not the value inside the nested object.
// The scenario is run twice: against a wildcard (auto) schema and against a
// schema that declares the dotted fields explicitly.
TEST_F(CollectionNestedFieldsTest, ExplicitDotSeparatedFieldsShouldHavePrecendence) {
    // Runs a wildcard query restricted by `filter_by` and returns the "found" count.
    auto count_filtered = [&](Collection* coll, const char* filter_by) -> size_t {
        auto res = coll->search("*", {}, filter_by, {}, sort_fields, {0}, 10, 1,
                                token_ordering::FREQUENCY, {true}).get();
        return res["found"].get<size_t>();
    };

    // Queries `details.name` for `query` and returns the "found" count.
    auto count_name_matches = [&](Collection* coll, const char* query) -> size_t {
        auto res = coll->search(query, {"details.name"}, "", {}, sort_fields, {0}, 10, 1,
                                token_ordering::FREQUENCY, {true}).get();
        return res["found"].get<size_t>();
    };

    // ---- wildcard (auto) schema ----
    nlohmann::json wildcard_schema = R"({
"name": "coll1",
"enable_nested_fields": true,
"fields": [
{"name": ".*", "type": "auto"}
]
})"_json;

    auto wildcard_op = collectionManager.create_collection(wildcard_schema);
    ASSERT_TRUE(wildcard_op.ok());
    Collection* coll1 = wildcard_op.get();

    // every nested value has a conflicting explicit dotted counterpart
    auto doc1 = R"({
"company": {"num_employees": 1000, "ids": [1,2]},
"details": [{"name": "bar"}],
"company.num_employees": 2000,
"company.ids": [10],
"details.name": "foo"
})"_json;

    ASSERT_TRUE(coll1->add(doc1.dump(), CREATE).ok());

    auto coll1_fields = coll1->get_fields();
    ASSERT_EQ(4, coll1->get_fields().size());

    // simple nested object: the explicit value matches, the nested one does not
    ASSERT_EQ(1, count_filtered(coll1, "company.num_employees: 2000"));
    ASSERT_EQ(0, count_filtered(coll1, "company.num_employees: 1000"));

    // nested array object
    ASSERT_EQ(1, count_name_matches(coll1, "foo"));
    ASSERT_EQ(0, count_name_matches(coll1, "bar"));

    // nested simple array
    ASSERT_EQ(1, count_filtered(coll1, "company.ids: 10"));
    ASSERT_EQ(0, count_filtered(coll1, "company.ids: 1"));

    // ---- explicit schema ----
    nlohmann::json explicit_schema = R"({
"name": "coll2",
"enable_nested_fields": true,
"fields": [
{"name": "company.num_employees", "type": "int32"},
{"name": "company.ids", "type": "int32[]"},
{"name": "details.name", "type": "string[]"}
]
})"_json;

    auto explicit_op = collectionManager.create_collection(explicit_schema);
    ASSERT_TRUE(explicit_op.ok());
    Collection* coll2 = explicit_op.get();

    auto doc2 = R"({
"company": {"num_employees": 1000, "ids": [1,2]},
"details": [{"name": "bar"}],
"company.num_employees": 2000,
"company.ids": [10],
"details.name": ["foo"]
})"_json;

    ASSERT_TRUE(coll2->add(doc2.dump(), CREATE).ok());

    // simple nested object
    ASSERT_EQ(1, count_filtered(coll2, "company.num_employees: 2000"));
    ASSERT_EQ(0, count_filtered(coll2, "company.num_employees: 1000"));

    // nested array object
    ASSERT_EQ(1, count_name_matches(coll2, "foo"));
    ASSERT_EQ(0, count_name_matches(coll2, "bar"));

    // nested simple array
    ASSERT_EQ(1, count_filtered(coll2, "company.ids: 10"));
    ASSERT_EQ(0, count_filtered(coll2, "company.ids: 1"));
}
TEST_F(CollectionNestedFieldsTest, GroupByOnNestedFieldsWithWildcardSchema) {
std::vector<field> fields = {field(".*", field_types::AUTO, false, true),
field("education.name", field_types::STRING_ARRAY, true, true),
@ -1189,6 +1297,36 @@ TEST_F(CollectionNestedFieldsTest, GroupByOnNestedFieldsWithWildcardSchema) {
ASSERT_EQ("0", results["grouped_hits"][1]["hits"][0]["document"]["id"].get<std::string>());
}
// Verifies that a schema mixing a wildcard (auto) field with explicitly declared
// nested fields can index a document holding only nested objects, and that
// filtering on the declared dotted field names finds that document.
TEST_F(CollectionNestedFieldsTest, WildcardWithExplicitSchema) {
    // Runs a wildcard query restricted by `filter_by` and returns the "found" count.
    auto count_filtered = [&](Collection* coll, const char* filter_by) -> size_t {
        auto res = coll->search("*", {}, filter_by, {}, {}, {0}, 10, 1, FREQUENCY, {false}).get();
        return res["found"].get<size_t>();
    };

    nlohmann::json schema = R"({
"name": "coll1",
"enable_nested_fields": true,
"fields": [
{"name": ".*", "type": "auto"},
{"name": "company.id", "type": "int32"},
{"name": "studies.year", "type": "int32[]"}
]
})"_json;

    auto create_op = collectionManager.create_collection(schema);
    ASSERT_TRUE(create_op.ok());
    Collection* coll1 = create_op.get();

    auto doc1 = R"({
"id": "0",
"company": {"id": 1000, "name": "Foo"},
"studies": [{"name": "College 1", "year": 1997}]
})"_json;

    ASSERT_TRUE(coll1->add(doc1.dump(), CREATE).ok());

    // value nested inside a plain object
    ASSERT_EQ(1, count_filtered(coll1, "company.id: 1000"));

    // value nested inside an array of objects
    ASSERT_EQ(1, count_filtered(coll1, "studies.year: 1997"));
}
TEST_F(CollectionNestedFieldsTest, UpdateOfNestFields) {
nlohmann::json schema = R"({
"name": "coll1",