Add more tests for schema detection.

kishorenc 2021-02-23 18:25:16 +05:30
parent f1b70384cc
commit 0a9cf4aee0
9 changed files with 302 additions and 102 deletions

View File

@@ -291,9 +291,9 @@ private:
const std::string name;
const uint32_t collection_id;
const std::atomic<uint32_t> collection_id;
const uint64_t created_at;
const std::atomic<uint64_t> created_at;
std::atomic<size_t> num_documents;
@@ -486,10 +486,6 @@ public:
bool is_exceeding_memory_threshold() const;
static void get_doc_changes(const nlohmann::json &document, nlohmann::json &old_doc,
nlohmann::json &new_doc,
nlohmann::json &del_doc);
void parse_search_query(const std::string &query, std::vector<std::string>& q_include_tokens,
std::vector<std::string>& q_exclude_tokens) const;

View File

@@ -225,6 +225,75 @@ struct field {
return Option<bool>(true);
}
static Option<bool> json_fields_to_fields(nlohmann::json& fields_json,
std::string& auto_detect_schema,
std::vector<field>& fields) {
size_t num_auto_detect_fields = 0;
for(nlohmann::json & field_json: fields_json) {
if(!field_json.is_object() ||
field_json.count(fields::name) == 0 || field_json.count(fields::type) == 0 ||
!field_json.at(fields::name).is_string() || !field_json.at(fields::type).is_string()) {
return Option<bool>(400, "Wrong format for `fields`. It should be an array of objects containing "
"`name`, `type`, `optional` and `facet` properties.");
}
if(field_json.count(fields::facet) != 0 && !field_json.at(fields::facet).is_boolean()) {
return Option<bool>(400, std::string("The `facet` property of the field `") +
field_json[fields::name].get<std::string>() + std::string("` should be a boolean."));
}
if(field_json.count(fields::optional) != 0 && !field_json.at(fields::optional).is_boolean()) {
return Option<bool>(400, std::string("The `optional` property of the field `") +
field_json[fields::name].get<std::string>() + std::string("` should be a boolean."));
}
if(field_json["name"] == "*") {
if(field_json["type"] == schema_detect_types::AUTO || field_json["type"] == schema_detect_types::STRINGIFY) {
auto_detect_schema = field_json["type"];
num_auto_detect_fields++;
} else {
return Option<bool>(400, "The `type` of field `*` is invalid.");
}
if(field_json.count("facet") == 0) {
field_json["facet"] = false;
}
if(field_json.count("optional") == 0) {
field_json["optional"] = true;
}
if(field_json["optional"] == false) {
return Option<bool>(400, "Field `*` must be an optional field.");
}
if(field_json["facet"] == true) {
return Option<bool>(400, "Field `*` cannot be a facet field.");
}
}
if(field_json.count("facet") == 0) {
field_json["facet"] = false;
}
if(field_json.count("optional") == 0) {
field_json["optional"] = false;
}
fields.emplace_back(
field(field_json["name"], field_json["type"], field_json["facet"], field_json["optional"])
);
}
if(num_auto_detect_fields > 1) {
return Option<bool>(400, "There can be only one field named `*`.");
}
return Option<bool>(true);
}
};
struct filter {

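For reference, a minimal sketch of how the new `field::json_fields_to_fields` helper is meant to be called (the payload and variable names are illustrative, not taken from the commit; the `field`, `Option` and `schema_detect_types` declarations come from this header):

nlohmann::json fields_json = nlohmann::json::parse(R"([
    {"name": "*", "type": "auto"}
])");

std::string auto_detect_schema = schema_detect_types::OFF;
std::vector<field> parsed_fields;

// On success, `auto_detect_schema` switches to "auto" or "stringify" and all
// parsed fields, including the wildcard, are appended to `parsed_fields`.
Option<bool> parse_op = field::json_fields_to_fields(fields_json, auto_detect_schema, parsed_fields);
if(!parse_op.ok()) {
    // e.g. 400, "Field `*` must be an optional field."
    std::cerr << parse_op.code() << ": " << parse_op.error() << std::endl;
}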
View File

@@ -243,6 +243,9 @@ private:
static void compute_facet_stats(facet &a_facet, uint64_t raw_value, const std::string & field_type);
static void get_doc_changes(const nlohmann::json &document, nlohmann::json &old_doc,
nlohmann::json &new_doc, nlohmann::json &del_doc);
static Option<uint32_t> coerce_string(const DIRTY_VALUES& dirty_values, const field& a_field, nlohmann::json &document,
const std::string &field_name, const int array_index);

View File

@@ -58,6 +58,7 @@ Collection::~Collection() {
}
uint32_t Collection::get_next_seq_id() {
std::shared_lock lock(mutex);
store->increment(get_next_seq_id_key(name), 1);
return next_seq_id++;
}
@@ -154,7 +155,7 @@ nlohmann::json Collection::get_summary_json() const {
json_response["name"] = name;
json_response["num_memory_shards"] = num_memory_shards.load();
json_response["num_documents"] = num_documents.load();
json_response["created_at"] = created_at;
json_response["created_at"] = created_at.load();
nlohmann::json fields_arr;
@@ -195,27 +196,6 @@ Option<nlohmann::json> Collection::add(const std::string & json_str,
return Option<nlohmann::json>(document);
}
void Collection::get_doc_changes(const nlohmann::json &document, nlohmann::json &old_doc,
nlohmann::json &new_doc, nlohmann::json &del_doc) {
for(auto it = old_doc.begin(); it != old_doc.end(); ++it) {
new_doc[it.key()] = it.value();
}
for(auto it = document.begin(); it != document.end(); ++it) {
// adds new key or overrides existing key from `old_doc`
new_doc[it.key()] = it.value();
// if the update document contains a field that exists in old, we record that (for delete + reindex)
bool field_exists_in_old_doc = (old_doc.count(it.key()) != 0);
if(field_exists_in_old_doc) {
// key exists in the stored doc, so it must be reindexed
// we need to check for this because a field can be optional
del_doc[it.key()] = old_doc[it.key()];
}
}
}
nlohmann::json Collection::add_many(std::vector<std::string>& json_lines, nlohmann::json& document,
const index_operation_t& operation, const std::string& id,
const DIRTY_VALUES& dirty_values) {
@@ -251,7 +231,6 @@ nlohmann::json Collection::add_many(std::vector<std::string>& json_lines, nlohma
record.is_update = !doc_seq_id_op.get().is_new;
if(record.is_update) {
get_document_from_store(get_seq_id_key(seq_id), record.old_doc);
get_doc_changes(document, record.old_doc, record.new_doc, record.del_doc);
}
// if `auto_detect_schema` is enabled, we will have to update schema first before indexing
@@ -313,6 +292,8 @@ void Collection::batch_index(std::vector<std::vector<index_record>> &index_batch
if(index_record.indexed.ok()) {
if(index_record.is_update) {
//get_doc_changes(index_record.doc, index_record.old_doc, index_record.new_doc, index_record.del_doc);
const std::string& serialized_json = index_record.new_doc.dump(-1, ' ', false, nlohmann::detail::error_handler_t::ignore);
bool write_ok = store->insert(get_seq_id_key(index_record.seq_id), serialized_json);
@@ -425,11 +406,9 @@ void Collection::prune_document(nlohmann::json &document, const spp::sparse_hash
const spp::sparse_hash_set<std::string>& exclude_fields) {
auto it = document.begin();
for(; it != document.end(); ) {
if(document.count(Collection::DOC_META_KEY) != 0) {
document.erase(Collection::DOC_META_KEY);
}
if(exclude_fields.count(it.key()) != 0 || (!include_fields.empty() && include_fields.count(it.key()) == 0)) {
if (exclude_fields.count(it.key()) != 0 ||
(!include_fields.empty() && include_fields.count(it.key()) == 0) ||
document.count(Collection::DOC_META_KEY) != 0) {
it = document.erase(it);
} else {
++it;
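To make the reworked pruning concrete, a hedged example (document contents invented; this assumes `prune_document` remains a static helper, as its callers in this codebase treat it):

nlohmann::json document;
document["title"] = "foo bar";
document["points"] = 100;
document[Collection::DOC_META_KEY] = "override";            // internal bookkeeping key

spp::sparse_hash_set<std::string> include_fields;           // empty set means include everything
spp::sparse_hash_set<std::string> exclude_fields = {"points"};

Collection::prune_document(document, include_fields, exclude_fields);
// document now holds only {"title": "foo bar"}: the excluded field and the
// internal meta key are both erased in the same pass.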
@@ -1586,12 +1565,15 @@ Option<nlohmann::json> Collection::get(const std::string & id) const {
}
void Collection::remove_document(const nlohmann::json & document, const uint32_t seq_id, bool remove_from_store) {
std::unique_lock lock(mutex);
const std::string& id = document["id"];
Index* index = indices[seq_id % num_memory_shards];
index->remove(seq_id, document);
num_documents -= 1;
{
std::unique_lock lock(mutex);
Index* index = indices[seq_id % num_memory_shards];
index->remove(seq_id, document);
num_documents -= 1;
}
if(remove_from_store) {
store->remove(get_doc_id_key(id));
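The scoped block introduced above narrows the critical section: the in-memory index mutation happens under the collection mutex, and the store deletion runs only after the lock is released, so disk I/O no longer blocks concurrent readers. In outline:

{
    std::unique_lock lock(mutex);    // guards only in-memory structures
    // ... mutate indices and counters ...
}                                    // mutex released here
store->remove(get_doc_id_key(id));   // disk I/O happens outside the lock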
@@ -1687,7 +1669,7 @@ Option<uint32_t> Collection::remove_override(const std::string & id) {
}
size_t Collection::get_num_memory_shards() {
return num_memory_shards;
return num_memory_shards.load();
}
uint32_t Collection::get_seq_id_from_key(const std::string & key) {
@@ -1712,11 +1694,12 @@ std::string Collection::get_doc_id_key(const std::string & doc_id) const {
}
std::string Collection::get_name() const {
std::shared_lock lock(mutex);
return name;
}
uint64_t Collection::get_created_at() const {
return created_at;
return created_at.load();
}
size_t Collection::get_num_documents() const {
@@ -1724,7 +1707,7 @@ size_t Collection::get_num_documents() const {
}
uint32_t Collection::get_collection_id() const {
return collection_id;
return collection_id.load();
}
Option<uint32_t> Collection::doc_id_to_seq_id(const std::string & doc_id) const {
@@ -1743,6 +1726,8 @@ Option<uint32_t> Collection::doc_id_to_seq_id(const std::string & doc_id) const
}
std::vector<std::string> Collection::get_facet_fields() {
std::shared_lock lock(mutex);
std::vector<std::string> facet_fields_copy;
for(auto it = facet_schema.begin(); it != facet_schema.end(); ++it) {
facet_fields_copy.push_back(it->first);
@@ -1752,6 +1737,8 @@ std::vector<std::string> Collection::get_facet_fields() {
}
std::vector<field> Collection::get_sort_fields() {
std::shared_lock lock(mutex);
std::vector<field> sort_fields_copy;
for(auto it = sort_schema.begin(); it != sort_schema.end(); ++it) {
sort_fields_copy.push_back(it->second);
@@ -1761,10 +1748,12 @@ std::vector<field> Collection::get_sort_fields() {
}
std::vector<field> Collection::get_fields() {
std::shared_lock lock(mutex);
return fields;
}
std::unordered_map<std::string, field> Collection::get_schema() {
std::shared_lock lock(mutex);
return search_schema;
};
@@ -1785,6 +1774,7 @@ std::string Collection::get_seq_id_collection_prefix() const {
}
std::string Collection::get_default_sorting_field() {
std::shared_lock lock(mutex);
return default_sorting_field;
}
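Viewed together, the getter changes in this file settle on a single concurrency pattern, sketched below with illustrative members rather than a verbatim excerpt: scalar metadata becomes std::atomic and is read with load(), while container-valued state is copied out under a shared lock.

class CollectionSketch {
    mutable std::shared_mutex mutex;
    std::atomic<uint64_t> created_at{0};    // scalar: lock-free reads via load()
    std::vector<field> fields;              // container: guarded by the mutex
public:
    uint64_t get_created_at() const { return created_at.load(); }
    std::vector<field> get_fields() {
        std::shared_lock lock(mutex);       // shared: readers do not block each other
        return fields;                      // return a copy, never a reference
    }
};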

View File

@@ -133,8 +133,6 @@ bool post_create_collection(http_req & req, http_res & res) {
// field specific validation
std::vector<field> fields;
if(!req_json["fields"].is_array() || req_json["fields"].empty()) {
res.set_400("The `fields` value should be an array of objects containing "
"`name`, `type` and optionally, `facet` properties.");
@@ -142,52 +140,11 @@ bool post_create_collection(http_req & req, http_res & res) {
}
std::string auto_detect_schema = schema_detect_types::OFF;
size_t num_auto_detect_fields = 0;
std::vector<field> fields;
auto parse_op = field::json_fields_to_fields(req_json["fields"], auto_detect_schema, fields);
for(nlohmann::json & field_json: req_json["fields"]) {
if(!field_json.is_object() ||
field_json.count(fields::name) == 0 || field_json.count(fields::type) == 0 ||
!field_json.at(fields::name).is_string() || !field_json.at(fields::type).is_string()) {
res.set_400("Wrong format for `fields`. It should be an array of objects containing "
"`name`, `type` and optionally, `facet` properties.");
return false;
}
if(field_json.count("facet") != 0 && !field_json.at(fields::facet).is_boolean()) {
res.set_400(std::string("The `facet` property of the field `") +
field_json.at(fields::name).get<std::string>() + "` should be a boolean.");
return false;
}
if(field_json.count("facet") == 0) {
field_json["facet"] = false;
}
if(field_json.count("optional") == 0) {
field_json["optional"] = false;
}
if(field_json["name"] == "*") {
if(field_json["type"] == schema_detect_types::AUTO || field_json["type"] == schema_detect_types::STRINGIFY) {
auto_detect_schema = field_json["type"];
num_auto_detect_fields++;
} else {
res.set_400(std::string("The `type` of field `") +
field_json["name"].get<std::string>() + "` is invalid.");
return false;
}
continue;
}
fields.emplace_back(
field(field_json["name"], field_json["type"], field_json["facet"], field_json["optional"])
);
}
if(num_auto_detect_fields > 1) {
res.set_400("There can be only one field with name `*`.");
if(!parse_op.ok()) {
res.set(parse_op.code(), parse_op.error());
return false;
}

View File

@@ -485,6 +485,7 @@ size_t Index::batch_memory_index(Index *index, std::vector<index_record> & iter_
if(index_rec.is_update) {
// scrub string fields to reduce delete ops
get_doc_changes(index_rec.doc, index_rec.old_doc, index_rec.new_doc, index_rec.del_doc);
index->scrub_reindex_doc(index_rec.doc, index_rec.del_doc, index_rec.old_doc);
index->remove(index_rec.seq_id, index_rec.del_doc);
}
@@ -2627,3 +2628,23 @@ Option<uint32_t> Index::coerce_float(const DIRTY_VALUES& dirty_values, const fie
return Option<uint32_t>(200);
}
void Index::get_doc_changes(const nlohmann::json &document, nlohmann::json &old_doc, nlohmann::json &new_doc,
nlohmann::json &del_doc) {
for(auto it = old_doc.begin(); it != old_doc.end(); ++it) {
new_doc[it.key()] = it.value();
}
for(auto it = document.begin(); it != document.end(); ++it) {
// adds new key or overrides existing key from `old_doc`
new_doc[it.key()] = it.value();
// if the update document contains a field that exists in old, we record that (for delete + reindex)
bool field_exists_in_old_doc = (old_doc.count(it.key()) != 0);
if(field_exists_in_old_doc) {
// key exists in the stored doc, so it must be reindexed
// we need to check for this because a field can be optional
del_doc[it.key()] = old_doc[it.key()];
}
}
}
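A worked example of what this helper computes (values invented; access control ignored here, since the method is private to Index): `new_doc` is the stored doc merged with the update, while `del_doc` collects only the keys present in both, i.e. the ones that must be de-indexed before reindexing.

nlohmann::json old_doc = { {"title", "FIRST"}, {"points", 100} };
nlohmann::json update  = { {"title", "SECOND"}, {"tags", {"a", "b"}} };
nlohmann::json new_doc, del_doc;

Index::get_doc_changes(update, old_doc, new_doc, del_doc);
// new_doc: {"points": 100, "tags": ["a", "b"], "title": "SECOND"}
// del_doc: {"title": "FIRST"}   ("points" is untouched, "tags" is brand new)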

View File

@@ -242,6 +242,94 @@ TEST_F(CollectionAllFieldsTest, NonOptionalFieldShouldNotBeDropped) {
add_op = coll1->add(doc.dump(), CREATE, "0", DIRTY_VALUES::COERCE_OR_DROP);
ASSERT_FALSE(add_op.ok());
ASSERT_EQ("Field `points` must be an int32.", add_op.error());
collectionManager.drop_collection("coll1");
}
TEST_F(CollectionAllFieldsTest, ShouldBeAbleToUpdateSchemaDetectedDocs) {
Collection *coll1;
std::vector<field> fields = {
};
coll1 = collectionManager.get_collection("coll1").get();
if (coll1 == nullptr) {
coll1 = collectionManager.create_collection("coll1", 4, fields, "", 0, schema_detect_types::AUTO).get();
}
nlohmann::json doc;
doc["title"] = "FIRST";
doc["scores"] = {100, 200, 300};
Option<nlohmann::json> add_op = coll1->add(doc.dump(), CREATE, "0", DIRTY_VALUES::REJECT);
ASSERT_TRUE(add_op.ok());
// now update both values and reinsert
doc["title"] = "SECOND";
doc["scores"] = {100, 250, "300", 400};
add_op = coll1->add(doc.dump(), UPDATE, "0", DIRTY_VALUES::COERCE_OR_DROP);
ASSERT_TRUE(add_op.ok());
auto results = coll1->search("second", {"title"}, "", {}, {}, 0, 10, 1, FREQUENCY, false).get();
ASSERT_EQ(1, results["hits"].size());
ASSERT_EQ("SECOND", results["hits"][0]["document"]["title"].get<std::string>());
ASSERT_EQ(4, results["hits"][0]["document"]["scores"].size());
ASSERT_EQ(100, results["hits"][0]["document"]["scores"][0].get<size_t>());
ASSERT_EQ(250, results["hits"][0]["document"]["scores"][1].get<size_t>());
ASSERT_EQ(300, results["hits"][0]["document"]["scores"][2].get<size_t>());
ASSERT_EQ(400, results["hits"][0]["document"]["scores"][3].get<size_t>());
// insert multiple docs at the same time
const size_t NUM_DOCS = 20;
std::vector<std::string> json_lines;
for(size_t i = 0; i < NUM_DOCS; i++) {
const std::string &i_str = std::to_string(i);
doc["title"] = std::string("upserted ") + std::to_string(StringUtils::hash_wy(i_str.c_str(), i_str.size()));
doc["scores"] = {i};
doc["max"] = i;
doc["id"] = std::to_string(i+10);
json_lines.push_back(doc.dump());
}
nlohmann::json insert_doc;
auto res = coll1->add_many(json_lines, insert_doc, UPSERT);
ASSERT_TRUE(res["success"].get<bool>());
// now we will replace all `max` values with the same value and assert that
json_lines.clear();
insert_doc.clear();
for(size_t i = 0; i < NUM_DOCS; i++) {
const std::string &i_str = std::to_string(i);
doc.clear();
doc["title"] = std::string("updated ") + std::to_string(StringUtils::hash_wy(i_str.c_str(), i_str.size()));
doc["scores"] = {1000, 2000};
doc["max"] = 2000;
doc["id"] = std::to_string(i+10);
json_lines.push_back(doc.dump());
}
res = coll1->add_many(json_lines, insert_doc, UPDATE);
ASSERT_TRUE(res["success"].get<bool>());
results = coll1->search("updated", {"title"}, "", {}, {}, 0, 50, 1, FREQUENCY, false).get();
ASSERT_EQ(20, results["hits"].size());
for(auto& hit: results["hits"]) {
ASSERT_EQ(2000, hit["document"]["max"].get<int>());
ASSERT_EQ(2, hit["document"]["scores"].size());
ASSERT_EQ(1000, hit["document"]["scores"][0].get<int>());
ASSERT_EQ(2000, hit["document"]["scores"][1].get<int>());
}
collectionManager.drop_collection("coll1");
}
TEST_F(CollectionAllFieldsTest, StringifyAllValues) {
@@ -267,4 +355,61 @@ TEST_F(CollectionAllFieldsTest, StringifyAllValues) {
ASSERT_EQ(2, results["hits"][0]["document"]["int_values"].size());
ASSERT_EQ("1", results["hits"][0]["document"]["int_values"][0].get<std::string>());
ASSERT_EQ("2", results["hits"][0]["document"]["int_values"][1].get<std::string>());
}
collectionManager.drop_collection("coll1");
}
TEST_F(CollectionAllFieldsTest, JsonFieldsToFieldsConversion) {
nlohmann::json fields_json = nlohmann::json::array();
nlohmann::json all_field;
all_field[fields::name] = "*";
all_field[fields::type] = "stringify";
fields_json.emplace_back(all_field);
std::string auto_detect_schema;
std::vector<field> fields;
auto parse_op = field::json_fields_to_fields(fields_json, auto_detect_schema, fields);
ASSERT_TRUE(parse_op.ok());
ASSERT_EQ(1, fields.size());
ASSERT_EQ("stringify", auto_detect_schema);
ASSERT_EQ(true, fields[0].optional);
ASSERT_EQ(false, fields[0].facet);
ASSERT_EQ("*", fields[0].name);
ASSERT_EQ("stringify", fields[0].type);
// reject when you try to set optional to false or facet to true
fields_json[0][fields::optional] = false;
parse_op = field::json_fields_to_fields(fields_json, auto_detect_schema, fields);
ASSERT_FALSE(parse_op.ok());
ASSERT_EQ("Field `*` must be an optional field.", parse_op.error());
fields_json[0][fields::optional] = true;
fields_json[0][fields::facet] = true;
parse_op = field::json_fields_to_fields(fields_json, auto_detect_schema, fields);
ASSERT_FALSE(parse_op.ok());
ASSERT_EQ("Field `*` cannot be a facet field.", parse_op.error());
fields_json[0][fields::facet] = false;
// can have only one "*" field
fields_json.emplace_back(all_field);
parse_op = field::json_fields_to_fields(fields_json, auto_detect_schema, fields);
ASSERT_FALSE(parse_op.ok());
ASSERT_EQ("There can be only one field named `*`.", parse_op.error());
// try with the `auto` type
fields_json.clear();
fields.clear();
all_field[fields::type] = "auto";
fields_json.emplace_back(all_field);
parse_op = field::json_fields_to_fields(fields_json, auto_detect_schema, fields);
ASSERT_TRUE(parse_op.ok());
ASSERT_EQ("auto", fields[0].type);
}

View File

@@ -169,13 +169,13 @@ TEST_F(CollectionManagerTest, RestoreRecordsOnRestart) {
override_t::parse(override_json_include, "", override_include);
nlohmann::json override_json = {
{"id", "exclude-rule"},
{
"rule", {
{"query", "of"},
{"match", override_t::MATCH_EXACT}
}
}
{"id", "exclude-rule"},
{
"rule", {
{"query", "of"},
{"match", override_t::MATCH_EXACT}
}
}
};
override_json["excludes"] = nlohmann::json::array();
override_json["excludes"][0] = nlohmann::json::object();
@@ -304,8 +304,9 @@ TEST_F(CollectionManagerTest, RestoreAutoSchemaDocsOnRestart) {
ASSERT_EQ(1, coll1->get_collection_id());
ASSERT_EQ(3, coll1->get_sort_fields().size());
// index a document with a bad field value with COERCE_OR_DROP setting
auto doc_json = R"({"title": "Unique record.", "max": 25, "scores": [22, "how", 44],
// index a document with 2 bad field values with COERCE_OR_DROP setting
// `title` is an integer and `average` is a string
auto doc_json = R"({"title": 12345, "max": 25, "scores": [22, "how", 44],
"average": "bad data", "is_valid": true})";
Option<nlohmann::json> add_op = coll1->add(doc_json, CREATE, "", DIRTY_VALUES::COERCE_OR_DROP);
@@ -362,11 +363,14 @@ TEST_F(CollectionManagerTest, RestoreAutoSchemaDocsOnRestart) {
}
// try searching for record with bad data
auto results = restored_coll->search("unique", {"title"}, "", {}, {}, 0, 10, 1, FREQUENCY, false).get();
auto results = restored_coll->search("12345", {"title"}, "", {}, {}, 0, 10, 1, FREQUENCY, false).get();
ASSERT_EQ(1, results["hits"].size());
ASSERT_STREQ("Unique record.", results["hits"][0]["document"]["title"].get<std::string>().c_str());
// int to string conversion should be done for `title` while `average` field must be dropped
ASSERT_STREQ("12345", results["hits"][0]["document"]["title"].get<std::string>().c_str());
ASSERT_EQ(0, results["hits"][0]["document"].count("average"));
ASSERT_EQ(2, results["hits"][0]["document"]["scores"].size());
ASSERT_EQ(22, results["hits"][0]["document"]["scores"][0]);
ASSERT_EQ(44, results["hits"][0]["document"]["scores"][1]);

View File

@@ -91,7 +91,7 @@ TEST_F(CollectionTest, VerifyCountOfDocuments) {
ASSERT_EQ(DIRTY_VALUES::REJECT, collection->parse_dirty_values_option(empty_dirty_values));
}
TEST_F(CollectionTest, MetaKeyIsNotReturnedAsDocumentField) {
TEST_F(CollectionTest, MetaKeyChecks) {
nlohmann::json results = collection->search("the", query_fields, "", {}, sort_fields, 0, 10).get();
ASSERT_EQ(7, results["hits"].size());
ASSERT_EQ(7, results["found"].get<int>());
@@ -100,6 +100,16 @@ TEST_F(CollectionTest, MetaKeyIsNotReturnedAsDocumentField) {
nlohmann::json doc = results["hits"].at(i)["document"];
ASSERT_EQ(0, doc.count(Collection::DOC_META_KEY));
}
// don't allow a document with meta key to be indexed since it is reserved
nlohmann::json doc;
doc["title"] = "foo bar";
doc["points"] = 100;
doc[Collection::DOC_META_KEY] = "override";
auto op = collection->add(doc.dump());
ASSERT_FALSE(op.ok());
ASSERT_EQ("Document cannot contain a `$TSM$_` key.", op.error());
}
TEST_F(CollectionTest, RetrieveADocumentById) {
@@ -114,6 +124,9 @@ TEST_F(CollectionTest, RetrieveADocumentById) {
id = doc["id"];
ASSERT_STREQ("foo", id.c_str());
// returned document should not have internal doc meta key
ASSERT_EQ(0, doc.count(Collection::DOC_META_KEY));
doc_option = collection->get("baz");
ASSERT_FALSE(doc_option.ok());
}
@@ -652,7 +665,9 @@ TEST_F(CollectionTest, MultiOccurrenceString) {
document["title"] = "The brown fox was the tallest of the lot and the quickest of the trot.";
document["points"] = 100;
coll_multi_string->add(document.dump());
auto doc = coll_multi_string->add(document.dump()).get();
ASSERT_EQ(0, doc.count(Collection::DOC_META_KEY));
query_fields = {"title"};
nlohmann::json results = coll_multi_string->search("the", query_fields, "", {}, sort_fields, 0, 10, 1,