Add more tests for schema detection.

kishorenc 2021-02-23 18:25:16 +05:30
parent f1b70384cc
commit 0a9cf4aee0
9 changed files with 302 additions and 102 deletions

View File

@@ -291,9 +291,9 @@ private:
const std::string name;
const uint32_t collection_id;
const std::atomic<uint32_t> collection_id;
const uint64_t created_at;
const std::atomic<uint64_t> created_at;
std::atomic<size_t> num_documents;
@@ -486,10 +486,6 @@ public:
bool is_exceeding_memory_threshold() const;
static void get_doc_changes(const nlohmann::json &document, nlohmann::json &old_doc,
nlohmann::json &new_doc,
nlohmann::json &del_doc);
void parse_search_query(const std::string &query, std::vector<std::string>& q_include_tokens,
std::vector<std::string>& q_exclude_tokens) const;

View File

@@ -225,6 +225,75 @@ struct field {
return Option<bool>(true);
}
static Option<bool> json_fields_to_fields(nlohmann::json& fields_json,
std::string& auto_detect_schema,
std::vector<field>& fields) {
size_t num_auto_detect_fields = 0;
for(nlohmann::json & field_json: fields_json) {
if(!field_json.is_object() ||
field_json.count(fields::name) == 0 || field_json.count(fields::type) == 0 ||
!field_json.at(fields::name).is_string() || !field_json.at(fields::type).is_string()) {
return Option<bool>(400, "Wrong format for `fields`. It should be an array of objects containing "
"`name`, `type`, `optional` and `facet` properties.");
}
if(field_json.count(fields::facet) != 0 && !field_json.at(fields::facet).is_boolean()) {
return Option<bool>(400, std::string("The `facet` property of the field `") +
field_json[fields::name].get<std::string>() + std::string("` should be a boolean."));
}
if(field_json.count(fields::optional) != 0 && !field_json.at(fields::optional).is_boolean()) {
return Option<bool>(400, std::string("The `optional` property of the field `") +
field_json[fields::name].get<std::string>() + std::string("` should be a boolean."));
}
if(field_json["name"] == "*") {
if(field_json["type"] == schema_detect_types::AUTO || field_json["type"] == schema_detect_types::STRINGIFY) {
auto_detect_schema = field_json["type"];
num_auto_detect_fields++;
} else {
return Option<bool>(400, "The `type` of field `*` is invalid.");
}
if(field_json.count("facet") == 0) {
field_json["facet"] = false;
}
if(field_json.count("optional") == 0) {
field_json["optional"] = true;
}
if(field_json["optional"] == false) {
return Option<bool>(400, "Field `*` must be an optional field.");
}
if(field_json["facet"] == true) {
return Option<bool>(400, "Field `*` cannot be a facet field.");
}
}
if(field_json.count("facet") == 0) {
field_json["facet"] = false;
}
if(field_json.count("optional") == 0) {
field_json["optional"] = false;
}
fields.emplace_back(
field(field_json["name"], field_json["type"], field_json["facet"], field_json["optional"])
);
}
if(num_auto_detect_fields > 1) {
return Option<bool>(400, "There can be only one field named `*`.");
}
return Option<bool>(true);
}
};
struct filter {

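For reference, a minimal sketch of how the new `field::json_fields_to_fields` helper is meant to be called (the payload and variable names are illustrative, not taken from the commit; the `field`, `Option` and `schema_detect_types` declarations come from this header):

nlohmann::json fields_json = nlohmann::json::parse(R"([
    {"name": "*", "type": "auto"}
])");

std::string auto_detect_schema = schema_detect_types::OFF;
std::vector<field> parsed_fields;

// On success, `auto_detect_schema` switches to "auto" or "stringify" and all
// parsed fields, including the wildcard, are appended to `parsed_fields`.
Option<bool> parse_op = field::json_fields_to_fields(fields_json, auto_detect_schema, parsed_fields);
if(!parse_op.ok()) {
    // e.g. 400, "Field `*` must be an optional field."
    std::cerr << parse_op.code() << ": " << parse_op.error() << std::endl;
}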
View File

@@ -243,6 +243,9 @@ private:
static void compute_facet_stats(facet &a_facet, uint64_t raw_value, const std::string & field_type);
static void get_doc_changes(const nlohmann::json &document, nlohmann::json &old_doc,
nlohmann::json &new_doc, nlohmann::json &del_doc);
static Option<uint32_t> coerce_string(const DIRTY_VALUES& dirty_values, const field& a_field, nlohmann::json &document,
const std::string &field_name, const int array_index);

View File

@@ -58,6 +58,7 @@ Collection::~Collection() {
}
uint32_t Collection::get_next_seq_id() {
std::shared_lock lock(mutex);
store->increment(get_next_seq_id_key(name), 1);
return next_seq_id++;
}
@@ -154,7 +155,7 @@ nlohmann::json Collection::get_summary_json() const {
json_response["name"] = name;
json_response["num_memory_shards"] = num_memory_shards.load();
json_response["num_documents"] = num_documents.load();
json_response["created_at"] = created_at;
json_response["created_at"] = created_at.load();
nlohmann::json fields_arr;
@@ -195,27 +196,6 @@ Option<nlohmann::json> Collection::add(const std::string & json_str,
return Option<nlohmann::json>(document);
}
void Collection::get_doc_changes(const nlohmann::json &document, nlohmann::json &old_doc,
nlohmann::json &new_doc, nlohmann::json &del_doc) {
for(auto it = old_doc.begin(); it != old_doc.end(); ++it) {
new_doc[it.key()] = it.value();
}
for(auto it = document.begin(); it != document.end(); ++it) {
// adds new key or overrides existing key from `old_doc`
new_doc[it.key()] = it.value();
// if the update document contains a field that exists in old, we record that (for delete + reindex)
bool field_exists_in_old_doc = (old_doc.count(it.key()) != 0);
if(field_exists_in_old_doc) {
// key exists in the stored doc, so it must be reindexed
// we need to check for this because a field can be optional
del_doc[it.key()] = old_doc[it.key()];
}
}
}
nlohmann::json Collection::add_many(std::vector<std::string>& json_lines, nlohmann::json& document,
const index_operation_t& operation, const std::string& id,
const DIRTY_VALUES& dirty_values) {
@@ -251,7 +231,6 @@ nlohmann::json Collection::add_many(std::vector<std::string>& json_lines, nlohma
record.is_update = !doc_seq_id_op.get().is_new;
if(record.is_update) {
get_document_from_store(get_seq_id_key(seq_id), record.old_doc);
get_doc_changes(document, record.old_doc, record.new_doc, record.del_doc);
}
// if `auto_detect_schema` is enabled, we will have to update schema first before indexing
@@ -313,6 +292,8 @@ void Collection::batch_index(std::vector<std::vector<index_record>> &index_batch
if(index_record.indexed.ok()) {
if(index_record.is_update) {
//get_doc_changes(index_record.doc, index_record.old_doc, index_record.new_doc, index_record.del_doc);
const std::string& serialized_json = index_record.new_doc.dump(-1, ' ', false, nlohmann::detail::error_handler_t::ignore);
bool write_ok = store->insert(get_seq_id_key(index_record.seq_id), serialized_json);
@@ -425,11 +406,9 @@ void Collection::prune_document(nlohmann::json &document, const spp::sparse_hash
const spp::sparse_hash_set<std::string>& exclude_fields) {
auto it = document.begin();
for(; it != document.end(); ) {
if(document.count(Collection::DOC_META_KEY) != 0) {
document.erase(Collection::DOC_META_KEY);
}
if(exclude_fields.count(it.key()) != 0 || (!include_fields.empty() && include_fields.count(it.key()) == 0)) {
if (exclude_fields.count(it.key()) != 0 ||
(!include_fields.empty() && include_fields.count(it.key()) == 0) ||
document.count(Collection::DOC_META_KEY) != 0) {
it = document.erase(it);
} else {
++it;
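To make the reworked pruning concrete, a hedged example (document contents invented; this assumes `prune_document` remains a static helper, as its callers in this codebase treat it):

nlohmann::json document;
document["title"] = "foo bar";
document["points"] = 100;
document[Collection::DOC_META_KEY] = "override";            // internal bookkeeping key

spp::sparse_hash_set<std::string> include_fields;           // empty set means include everything
spp::sparse_hash_set<std::string> exclude_fields = {"points"};

Collection::prune_document(document, include_fields, exclude_fields);
// document now holds only {"title": "foo bar"}: the excluded field and the
// internal meta key are both erased in the same pass.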
@@ -1586,12 +1565,15 @@ Option<nlohmann::json> Collection::get(const std::string & id) const {
}
void Collection::remove_document(const nlohmann::json & document, const uint32_t seq_id, bool remove_from_store) {
std::unique_lock lock(mutex);
const std::string& id = document["id"];
Index* index = indices[seq_id % num_memory_shards];
index->remove(seq_id, document);
num_documents -= 1;
{
std::unique_lock lock(mutex);
Index* index = indices[seq_id % num_memory_shards];
index->remove(seq_id, document);
num_documents -= 1;
}
if(remove_from_store) {
store->remove(get_doc_id_key(id));
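The scoped block introduced above narrows the critical section: the in-memory index mutation happens under the collection mutex, and the store deletion runs only after the lock is released, so disk I/O no longer blocks concurrent readers. In outline:

{
    std::unique_lock lock(mutex);    // guards only in-memory structures
    // ... mutate indices and counters ...
}                                    // mutex released here
store->remove(get_doc_id_key(id));   // disk I/O happens outside the lock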
@@ -1687,7 +1669,7 @@ Option<uint32_t> Collection::remove_override(const std::string & id) {
}
size_t Collection::get_num_memory_shards() {
return num_memory_shards;
return num_memory_shards.load();
}
uint32_t Collection::get_seq_id_from_key(const std::string & key) {
@@ -1712,11 +1694,12 @@ std::string Collection::get_doc_id_key(const std::string & doc_id) const {
}
std::string Collection::get_name() const {
std::shared_lock lock(mutex);
return name;
}
uint64_t Collection::get_created_at() const {
return created_at;
return created_at.load();
}
size_t Collection::get_num_documents() const {
@@ -1724,7 +1707,7 @@ size_t Collection::get_num_documents() const {
}
uint32_t Collection::get_collection_id() const {
return collection_id;
return collection_id.load();
}
Option<uint32_t> Collection::doc_id_to_seq_id(const std::string & doc_id) const {
@@ -1743,6 +1726,8 @@ Option<uint32_t> Collection::doc_id_to_seq_id(const std::string & doc_id) const
}
std::vector<std::string> Collection::get_facet_fields() {
std::shared_lock lock(mutex);
std::vector<std::string> facet_fields_copy;
for(auto it = facet_schema.begin(); it != facet_schema.end(); ++it) {
facet_fields_copy.push_back(it->first);
@@ -1752,6 +1737,8 @@ std::vector<std::string> Collection::get_facet_fields() {
}
std::vector<field> Collection::get_sort_fields() {
std::shared_lock lock(mutex);
std::vector<field> sort_fields_copy;
for(auto it = sort_schema.begin(); it != sort_schema.end(); ++it) {
sort_fields_copy.push_back(it->second);
@@ -1761,10 +1748,12 @@ std::vector<field> Collection::get_sort_fields() {
}
std::vector<field> Collection::get_fields() {
std::shared_lock lock(mutex);
return fields;
}
std::unordered_map<std::string, field> Collection::get_schema() {
std::shared_lock lock(mutex);
return search_schema;
};
@@ -1785,6 +1774,7 @@ std::string Collection::get_seq_id_collection_prefix() const {
}
std::string Collection::get_default_sorting_field() {
std::shared_lock lock(mutex);
return default_sorting_field;
}
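Viewed together, the getter changes in this file settle on a single concurrency pattern, sketched below with illustrative members rather than a verbatim excerpt: scalar metadata becomes std::atomic and is read with load(), while container-valued state is copied out under a shared lock.

class CollectionSketch {
    mutable std::shared_mutex mutex;
    std::atomic<uint64_t> created_at{0};    // scalar: lock-free reads via load()
    std::vector<field> fields;              // container: guarded by the mutex
public:
    uint64_t get_created_at() const { return created_at.load(); }
    std::vector<field> get_fields() {
        std::shared_lock lock(mutex);       // shared: readers do not block each other
        return fields;                      // return a copy, never a reference
    }
};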

View File

@@ -133,8 +133,6 @@ bool post_create_collection(http_req & req, http_res & res) {
// field specific validation
std::vector<field> fields;
if(!req_json["fields"].is_array() || req_json["fields"].empty()) {
res.set_400("The `fields` value should be an array of objects containing "
"`name`, `type` and optionally, `facet` properties.");
@@ -142,52 +140,11 @@ bool post_create_collection(http_req & req, http_res & res) {
}
std::string auto_detect_schema = schema_detect_types::OFF;
size_t num_auto_detect_fields = 0;
std::vector<field> fields;
auto parse_op = field::json_fields_to_fields(req_json["fields"], auto_detect_schema, fields);
for(nlohmann::json & field_json: req_json["fields"]) {
if(!field_json.is_object() ||
field_json.count(fields::name) == 0 || field_json.count(fields::type) == 0 ||
!field_json.at(fields::name).is_string() || !field_json.at(fields::type).is_string()) {
res.set_400("Wrong format for `fields`. It should be an array of objects containing "
"`name`, `type` and optionally, `facet` properties.");
return false;
}
if(field_json.count("facet") != 0 && !field_json.at(fields::facet).is_boolean()) {
res.set_400(std::string("The `facet` property of the field `") +
field_json.at(fields::name).get<std::string>() + "` should be a boolean.");
return false;
}
if(field_json.count("facet") == 0) {
field_json["facet"] = false;
}
if(field_json.count("optional") == 0) {
field_json["optional"] = false;
}
if(field_json["name"] == "*") {
if(field_json["type"] == schema_detect_types::AUTO || field_json["type"] == schema_detect_types::STRINGIFY) {
auto_detect_schema = field_json["type"];
num_auto_detect_fields++;
} else {
res.set_400(std::string("The `type` of field `") +
field_json["name"].get<std::string>() + "` is invalid.");
return false;
}
continue;
}
fields.emplace_back(
field(field_json["name"], field_json["type"], field_json["facet"], field_json["optional"])
);
}
if(num_auto_detect_fields > 1) {
res.set_400("There can be only one field with name `*`.");
if(!parse_op.ok()) {
res.set(parse_op.code(), parse_op.error());
return false;
}

View File

@@ -485,6 +485,7 @@ size_t Index::batch_memory_index(Index *index, std::vector<index_record> & iter_
if(index_rec.is_update) {
// scrub string fields to reduce delete ops
get_doc_changes(index_rec.doc, index_rec.old_doc, index_rec.new_doc, index_rec.del_doc);
index->scrub_reindex_doc(index_rec.doc, index_rec.del_doc, index_rec.old_doc);
index->remove(index_rec.seq_id, index_rec.del_doc);
}
@@ -2627,3 +2628,23 @@ Option<uint32_t> Index::coerce_float(const DIRTY_VALUES& dirty_values, const fie
return Option<uint32_t>(200);
}
void Index::get_doc_changes(const nlohmann::json &document, nlohmann::json &old_doc, nlohmann::json &new_doc,
nlohmann::json &del_doc) {
for(auto it = old_doc.begin(); it != old_doc.end(); ++it) {
new_doc[it.key()] = it.value();
}
for(auto it = document.begin(); it != document.end(); ++it) {
// adds new key or overrides existing key from `old_doc`
new_doc[it.key()] = it.value();
// if the update document contains a field that exists in old, we record that (for delete + reindex)
bool field_exists_in_old_doc = (old_doc.count(it.key()) != 0);
if(field_exists_in_old_doc) {
// key exists in the stored doc, so it must be reindexed
// we need to check for this because a field can be optional
del_doc[it.key()] = old_doc[it.key()];
}
}
}
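A worked example of what this helper computes (values invented; access control ignored here, since the method is private to Index): `new_doc` is the stored doc merged with the update, while `del_doc` collects only the keys present in both, i.e. the ones that must be de-indexed before reindexing.

nlohmann::json old_doc = { {"title", "FIRST"}, {"points", 100} };
nlohmann::json update  = { {"title", "SECOND"}, {"tags", {"a", "b"}} };
nlohmann::json new_doc, del_doc;

Index::get_doc_changes(update, old_doc, new_doc, del_doc);
// new_doc: {"points": 100, "tags": ["a", "b"], "title": "SECOND"}
// del_doc: {"title": "FIRST"}   ("points" is untouched, "tags" is brand new)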

View File

@@ -242,6 +242,94 @@ TEST_F(CollectionAllFieldsTest, NonOptionalFieldShouldNotBeDropped) {
add_op = coll1->add(doc.dump(), CREATE, "0", DIRTY_VALUES::COERCE_OR_DROP);
ASSERT_FALSE(add_op.ok());
ASSERT_EQ("Field `points` must be an int32.", add_op.error());
collectionManager.drop_collection("coll1");
}
TEST_F(CollectionAllFieldsTest, ShouldBeAbleToUpdateSchemaDetectedDocs) {
Collection *coll1;
std::vector<field> fields = {
};
coll1 = collectionManager.get_collection("coll1").get();
if (coll1 == nullptr) {
coll1 = collectionManager.create_collection("coll1", 4, fields, "", 0, schema_detect_types::AUTO).get();
}
nlohmann::json doc;
doc["title"] = "FIRST";
doc["scores"] = {100, 200, 300};
Option<nlohmann::json> add_op = coll1->add(doc.dump(), CREATE, "0", DIRTY_VALUES::REJECT);
ASSERT_TRUE(add_op.ok());
// now update both values and reinsert
doc["title"] = "SECOND";
doc["scores"] = {100, 250, "300", 400};
add_op = coll1->add(doc.dump(), UPDATE, "0", DIRTY_VALUES::COERCE_OR_DROP);
ASSERT_TRUE(add_op.ok());
auto results = coll1->search("second", {"title"}, "", {}, {}, 0, 10, 1, FREQUENCY, false).get();
ASSERT_EQ(1, results["hits"].size());
ASSERT_EQ("SECOND", results["hits"][0]["document"]["title"].get<std::string>());
ASSERT_EQ(4, results["hits"][0]["document"]["scores"].size());
ASSERT_EQ(100, results["hits"][0]["document"]["scores"][0].get<size_t>());
ASSERT_EQ(250, results["hits"][0]["document"]["scores"][1].get<size_t>());
ASSERT_EQ(300, results["hits"][0]["document"]["scores"][2].get<size_t>());
ASSERT_EQ(400, results["hits"][0]["document"]["scores"][3].get<size_t>());
// insert multiple docs at the same time
const size_t NUM_DOCS = 20;
std::vector<std::string> json_lines;
for(size_t i = 0; i < NUM_DOCS; i++) {
const std::string &i_str = std::to_string(i);
doc["title"] = std::string("upserted ") + std::to_string(StringUtils::hash_wy(i_str.c_str(), i_str.size()));
doc["scores"] = {i};
doc["max"] = i;
doc["id"] = std::to_string(i+10);
json_lines.push_back(doc.dump());
}
nlohmann::json insert_doc;
auto res = coll1->add_many(json_lines, insert_doc, UPSERT);
ASSERT_TRUE(res["success"].get<bool>());
// now we will replace all `max` values with the same value and assert that
json_lines.clear();
insert_doc.clear();
for(size_t i = 0; i < NUM_DOCS; i++) {
const std::string &i_str = std::to_string(i);
doc.clear();
doc["title"] = std::string("updated ") + std::to_string(StringUtils::hash_wy(i_str.c_str(), i_str.size()));
doc["scores"] = {1000, 2000};
doc["max"] = 2000;
doc["id"] = std::to_string(i+10);
json_lines.push_back(doc.dump());
}
res = coll1->add_many(json_lines, insert_doc, UPDATE);
ASSERT_TRUE(res["success"].get<bool>());
results = coll1->search("updated", {"title"}, "", {}, {}, 0, 50, 1, FREQUENCY, false).get();
ASSERT_EQ(20, results["hits"].size());
for(auto& hit: results["hits"]) {
ASSERT_EQ(2000, hit["document"]["max"].get<int>());
ASSERT_EQ(2, hit["document"]["scores"].size());
ASSERT_EQ(1000, hit["document"]["scores"][0].get<int>());
ASSERT_EQ(2000, hit["document"]["scores"][1].get<int>());
}
collectionManager.drop_collection("coll1");
}
TEST_F(CollectionAllFieldsTest, StringifyAllValues) {
@@ -267,4 +355,61 @@ TEST_F(CollectionAllFieldsTest, StringifyAllValues) {
ASSERT_EQ(2, results["hits"][0]["document"]["int_values"].size());
ASSERT_EQ("1", results["hits"][0]["document"]["int_values"][0].get<std::string>());
ASSERT_EQ("2", results["hits"][0]["document"]["int_values"][1].get<std::string>());
}
collectionManager.drop_collection("coll1");
}
TEST_F(CollectionAllFieldsTest, JsonFieldsToFieldsConversion) {
nlohmann::json fields_json = nlohmann::json::array();
nlohmann::json all_field;
all_field[fields::name] = "*";
all_field[fields::type] = "stringify";
fields_json.emplace_back(all_field);
std::string auto_detect_schema;
std::vector<field> fields;
auto parse_op = field::json_fields_to_fields(fields_json, auto_detect_schema, fields);
ASSERT_TRUE(parse_op.ok());
ASSERT_EQ(1, fields.size());
ASSERT_EQ("stringify", auto_detect_schema);
ASSERT_EQ(true, fields[0].optional);
ASSERT_EQ(false, fields[0].facet);
ASSERT_EQ("*", fields[0].name);
ASSERT_EQ("stringify", fields[0].type);
// reject when you try to set optional to false or facet to true
fields_json[0][fields::optional] = false;
parse_op = field::json_fields_to_fields(fields_json, auto_detect_schema, fields);
ASSERT_FALSE(parse_op.ok());
ASSERT_EQ("Field `*` must be an optional field.", parse_op.error());
fields_json[0][fields::optional] = true;
fields_json[0][fields::facet] = true;
parse_op = field::json_fields_to_fields(fields_json, auto_detect_schema, fields);
ASSERT_FALSE(parse_op.ok());
ASSERT_EQ("Field `*` cannot be a facet field.", parse_op.error());
fields_json[0][fields::facet] = false;
// can have only one "*" field
fields_json.emplace_back(all_field);
parse_op = field::json_fields_to_fields(fields_json, auto_detect_schema, fields);
ASSERT_FALSE(parse_op.ok());
ASSERT_EQ("There can be only one field named `*`.", parse_op.error());
// try with the `auto` type
fields_json.clear();
fields.clear();
all_field[fields::type] = "auto";
fields_json.emplace_back(all_field);
parse_op = field::json_fields_to_fields(fields_json, auto_detect_schema, fields);
ASSERT_TRUE(parse_op.ok());
ASSERT_EQ("auto", fields[0].type);
}

View File

@@ -169,13 +169,13 @@ TEST_F(CollectionManagerTest, RestoreRecordsOnRestart) {
override_t::parse(override_json_include, "", override_include);
nlohmann::json override_json = {
{"id", "exclude-rule"},
{
"rule", {
{"query", "of"},
{"match", override_t::MATCH_EXACT}
}
}
{"id", "exclude-rule"},
{
"rule", {
{"query", "of"},
{"match", override_t::MATCH_EXACT}
}
}
};
override_json["excludes"] = nlohmann::json::array();
override_json["excludes"][0] = nlohmann::json::object();
@@ -304,8 +304,9 @@ TEST_F(CollectionManagerTest, RestoreAutoSchemaDocsOnRestart) {
ASSERT_EQ(1, coll1->get_collection_id());
ASSERT_EQ(3, coll1->get_sort_fields().size());
// index a document with a bad field value with COERCE_OR_DROP setting
auto doc_json = R"({"title": "Unique record.", "max": 25, "scores": [22, "how", 44],
// index a document with 2 bad field values with COERCE_OR_DROP setting
// `title` is an integer and `average` is a string
auto doc_json = R"({"title": 12345, "max": 25, "scores": [22, "how", 44],
"average": "bad data", "is_valid": true})";
Option<nlohmann::json> add_op = coll1->add(doc_json, CREATE, "", DIRTY_VALUES::COERCE_OR_DROP);
@@ -362,11 +363,14 @@ TEST_F(CollectionManagerTest, RestoreAutoSchemaDocsOnRestart) {
}
// try searching for record with bad data
auto results = restored_coll->search("unique", {"title"}, "", {}, {}, 0, 10, 1, FREQUENCY, false).get();
auto results = restored_coll->search("12345", {"title"}, "", {}, {}, 0, 10, 1, FREQUENCY, false).get();
ASSERT_EQ(1, results["hits"].size());
ASSERT_STREQ("Unique record.", results["hits"][0]["document"]["title"].get<std::string>().c_str());
// int to string conversion should be done for `title` while `average` field must be dropped
ASSERT_STREQ("12345", results["hits"][0]["document"]["title"].get<std::string>().c_str());
ASSERT_EQ(0, results["hits"][0]["document"].count("average"));
ASSERT_EQ(2, results["hits"][0]["document"]["scores"].size());
ASSERT_EQ(22, results["hits"][0]["document"]["scores"][0]);
ASSERT_EQ(44, results["hits"][0]["document"]["scores"][1]);

View File

@@ -91,7 +91,7 @@ TEST_F(CollectionTest, VerifyCountOfDocuments) {
ASSERT_EQ(DIRTY_VALUES::REJECT, collection->parse_dirty_values_option(empty_dirty_values));
}
TEST_F(CollectionTest, MetaKeyIsNotReturnedAsDocumentField) {
TEST_F(CollectionTest, MetaKeyChecks) {
nlohmann::json results = collection->search("the", query_fields, "", {}, sort_fields, 0, 10).get();
ASSERT_EQ(7, results["hits"].size());
ASSERT_EQ(7, results["found"].get<int>());
@@ -100,6 +100,16 @@ TEST_F(CollectionTest, MetaKeyIsNotReturnedAsDocumentField) {
nlohmann::json doc = results["hits"].at(i)["document"];
ASSERT_EQ(0, doc.count(Collection::DOC_META_KEY));
}
// don't allow a document with meta key to be indexed since it is reserved
nlohmann::json doc;
doc["title"] = "foo bar";
doc["points"] = 100;
doc[Collection::DOC_META_KEY] = "override";
auto op = collection->add(doc.dump());
ASSERT_FALSE(op.ok());
ASSERT_EQ("Document cannot contain a `$TSM$_` key.", op.error());
}
TEST_F(CollectionTest, RetrieveADocumentById) {
@@ -114,6 +124,9 @@ TEST_F(CollectionTest, RetrieveADocumentById) {
id = doc["id"];
ASSERT_STREQ("foo", id.c_str());
// returned document should not have internal doc meta key
ASSERT_EQ(0, doc.count(Collection::DOC_META_KEY));
doc_option = collection->get("baz");
ASSERT_FALSE(doc_option.ok());
}
@@ -652,7 +665,9 @@ TEST_F(CollectionTest, MultiOccurrenceString) {
document["title"] = "The brown fox was the tallest of the lot and the quickest of the trot.";
document["points"] = 100;
coll_multi_string->add(document.dump());
auto doc = coll_multi_string->add(document.dump()).get();
ASSERT_EQ(0, doc.count(Collection::DOC_META_KEY));
query_fields = {"title"};
nlohmann::json results = coll_multi_string->search("the", query_fields, "", {}, sort_fields, 0, 10, 1,