Allow fields to be stringified automatically.

2025-05-19 13:12:22 +08:00 · 2021-02-23 12:58:14 +05:30 · 2021-02-23 12:58:14 +05:30 · f1b70384cc
commit f1b70384cc
parent c24fc02d4d
11 changed files with 132 additions and 51 deletions
--- a/include/collection.h
+++ b/include/collection.h
@ -323,7 +323,7 @@ private:

    const std::vector<Index*> indices;

-    const std::atomic<bool> index_all_fields;
+    const std::string auto_detect_schema;

    // methods

@ -409,7 +409,7 @@ public:
    static constexpr const char* COLLECTION_DEFAULT_SORTING_FIELD_KEY = "default_sorting_field";
    static constexpr const char* COLLECTION_CREATED = "created_at";
    static constexpr const char* COLLECTION_NUM_MEMORY_SHARDS = "num_memory_shards";
-    static constexpr const char* COLLECTION_INDEX_ALL_FIELDS = "index_all_fields";
+    static constexpr const char* COLLECTION_AUTO_DETECT_SCHEMA = "auto_detect_schema";

    // DON'T CHANGE THESE VALUES!
    // this key is used as namespace key to store metadata about the document
@ -423,7 +423,7 @@ public:
    Collection(const std::string& name, const uint32_t collection_id, const uint64_t created_at,
               const uint32_t next_seq_id, Store *store, const std::vector<field>& fields,
               const std::string& default_sorting_field, const size_t num_memory_shards,
-               const float max_memory_ratio, const bool index_all_fields);
+               const float max_memory_ratio, const std::string& auto_detect_schema);

    ~Collection();

--- a/include/collection_manager.h
+++ b/include/collection_manager.h
@ -129,7 +129,7 @@ public:
                                          const std::vector<field> & fields,
                                          const std::string & default_sorting_field="",
                                          const uint64_t created_at = static_cast<uint64_t>(std::time(nullptr)),
-                                          const bool index_all_fields = false);
+                                          const std::string& auto_detect_schema = schema_detect_types::OFF);

    locked_resource_view_t<Collection> get_collection(const std::string & collection_name) const;

--- a/include/field.h
+++ b/include/field.h
@ -27,6 +27,12 @@ namespace fields {
    static const std::string optional = "optional";
 }

+namespace schema_detect_types {
+    static const std::string OFF = "off";
+    static const std::string STRINGIFY = "stringify";
+    static const std::string AUTO = "auto";
+}
+
 static const uint8_t DEFAULT_GEO_RESOLUTION = 7;
 static const uint8_t FINEST_GEO_RESOLUTION = 15;

@ -54,6 +60,10 @@ struct field {

    }

+    bool is_auto() const {
+        return (type == schema_detect_types::AUTO || type == schema_detect_types::STRINGIFY);
+    }
+
    bool is_single_integer() const {
        return (type == field_types::INT32 || type == field_types::INT64);
    }
@ -119,7 +129,7 @@ struct field {
    }

    bool has_valid_type() const {
-        return is_string() || is_integer() || is_float() || is_bool() || is_geopoint();
+        return is_string() || is_integer() || is_float() || is_bool() || is_geopoint() || is_auto();
    }

    std::string faceted_name() const {
@ -128,7 +138,7 @@ struct field {

    static bool get_type(const nlohmann::json& obj, std::string& field_type) {
        if(obj.is_array()) {
-            if(obj.empty() || obj[0].is_array()) {
+            if(obj.empty()) {
                return false;
            }

@ -173,7 +183,8 @@ struct field {
    }

    static Option<bool> fields_to_json_fields(const std::vector<field> & fields,
-                                              const std::string & default_sorting_field, nlohmann::json& fields_json) {
+                                              const std::string & default_sorting_field,
+                                              nlohmann::json& fields_json) {
        bool found_default_sorting_field = false;

        for(const field & field: fields) {
--- a/include/index.h
+++ b/include/index.h
@ -367,7 +367,7 @@ public:
                                     const std::string & default_sorting_field,
                                     const std::unordered_map<std::string, field> & search_schema,
                                     const std::map<std::string, field> & facet_schema,
-                                     bool index_all_fields);
+                                     const std::string& auto_detect_schema);

    static void populate_token_positions(const std::vector<art_leaf *> &query_suggestion,
                                         const std::vector<uint32_t*>& leaf_to_indices,
@ -385,7 +385,7 @@ public:
                                                     const std::unordered_map<std::string, field> & search_schema,
                                                     const std::map<std::string, field> & facet_schema,
                                                     bool is_update,
-                                                     bool index_all_fields,
+                                                     const std::string& auto_detect_schema,
                                                     const DIRTY_VALUES& dirty_values);

    void refresh_schemas(const std::vector<field>& new_fields);
--- a/src/collection.cpp
+++ b/src/collection.cpp
@ -40,13 +40,13 @@ struct match_index_t {
 Collection::Collection(const std::string& name, const uint32_t collection_id, const uint64_t created_at,
                       const uint32_t next_seq_id, Store *store, const std::vector<field> &fields,
                       const std::string& default_sorting_field, const size_t num_memory_shards,
-                       const float max_memory_ratio, const bool index_all_fields):
+                       const float max_memory_ratio, const std::string& auto_detect_schema):
        name(name), collection_id(collection_id), created_at(created_at),
        next_seq_id(next_seq_id), store(store),
        fields(fields), default_sorting_field(default_sorting_field),
        num_memory_shards(num_memory_shards),
        max_memory_ratio(max_memory_ratio),
-        indices(init_indices()), index_all_fields(index_all_fields) {
+        indices(init_indices()), auto_detect_schema(auto_detect_schema) {

    this->num_documents = 0;
 }
@ -237,6 +237,10 @@ nlohmann::json Collection::add_many(std::vector<std::string>& json_lines, nlohma
        const uint32_t seq_id = doc_seq_id_op.ok() ? doc_seq_id_op.get().seq_id : 0;
        index_record record(i, seq_id, document, operation, dirty_values);

+        if(document.count(Collection::DOC_META_KEY) != 0) {
+            document.erase(Collection::DOC_META_KEY);
+        }
+
        // NOTE: we overwrite the input json_lines with result to avoid memory pressure

        record.is_update = false;
@ -250,8 +254,8 @@ nlohmann::json Collection::add_many(std::vector<std::string>& json_lines, nlohma
                get_doc_changes(document, record.old_doc, record.new_doc, record.del_doc);
            }

-            // if `index_all_fields` is enabled, we will have to update schema first before indexing
-            if(index_all_fields) {
+            // if `auto_detect_schema` is enabled, we will have to update schema first before indexing
+            if(auto_detect_schema != schema_detect_types::OFF) {
                Option<bool> schema_change_op = check_and_update_schema(document);
                if(!schema_change_op.ok()) {
                    record.index_failure(schema_change_op.code(), schema_change_op.error());
@ -366,7 +370,7 @@ Option<uint32_t> Collection::index_in_memory(nlohmann::json &document, uint32_t

    Option<uint32_t> validation_op = Index::validate_index_in_memory(document, seq_id, default_sorting_field,
                                                                     search_schema, facet_schema, is_update,
-                                                                     index_all_fields, dirty_values);
+                                                                     auto_detect_schema, dirty_values);

    if(!validation_op.ok()) {
        return validation_op;
@ -394,7 +398,7 @@ size_t Collection::par_index_in_memory(std::vector<std::vector<index_record>> &
        CollectionManager::get_instance().get_thread_pool()->enqueue(
        [index, index_id, &num_indexed_vec, &iter_batch, this, &m_process, &num_processed, &cv_process]() {
            size_t num_indexed = Index::batch_memory_index(index, std::ref(iter_batch[index_id]), default_sorting_field,
-                                      search_schema, facet_schema, index_all_fields);
+                                      search_schema, facet_schema, auto_detect_schema);
            std::unique_lock<std::mutex> lock(m_process);
            num_indexed_vec[index_id] = num_indexed;
            num_processed++;
@ -421,7 +425,11 @@ void Collection::prune_document(nlohmann::json &document, const spp::sparse_hash
                                const spp::sparse_hash_set<std::string>& exclude_fields) {
    auto it = document.begin();
    for(; it != document.end(); ) {
-        if(exclude_fields.count(it.key()) != 0 || (include_fields.size() != 0 && include_fields.count(it.key()) == 0)) {
+        if(document.count(Collection::DOC_META_KEY) != 0) {
+            document.erase(Collection::DOC_META_KEY);
+        }
+
+        if(exclude_fields.count(it.key()) != 0 || (!include_fields.empty() && include_fields.count(it.key()) == 0)) {
            it = document.erase(it);
        } else {
            ++it;
@ -1570,6 +1578,10 @@ Option<nlohmann::json> Collection::get(const std::string & id) const {
        return Option<nlohmann::json>(500, "Error while parsing stored document.");
    }

+    if(document.count(Collection::DOC_META_KEY) != 0) {
+        document.erase(Collection::DOC_META_KEY);
+    }
+
    return Option<nlohmann::json>(document);
 }

@ -2257,6 +2269,15 @@ Option<bool> Collection::check_and_update_schema(nlohmann::json& document) {
            if (parseable) {
                const std::string& fname = kv.key();
                field new_field(fname, field_type, false, true);
+
+                if(auto_detect_schema == schema_detect_types::STRINGIFY) {
+                    if(new_field.is_array()) {
+                        new_field.type = field_types::STRING_ARRAY;
+                    } else {
+                        new_field.type = field_types::STRING;
+                    }
+                }
+
                search_schema.emplace(fname, new_field);
                fields.emplace_back(new_field);
                new_fields.emplace_back(new_field);
@ -2332,7 +2353,8 @@ std::vector<Index *> Collection::init_indices() {
 }

 DIRTY_VALUES Collection::parse_dirty_values_option(std::string& dirty_values) const {
-    // no need for a shared lock here since `index_all_fields` is atomic
+    std::shared_lock lock(mutex);
+
    StringUtils::toupper(dirty_values);
    auto dirty_values_op = magic_enum::enum_cast<DIRTY_VALUES>(dirty_values);
    DIRTY_VALUES dirty_values_action;
@ -2340,7 +2362,8 @@ DIRTY_VALUES Collection::parse_dirty_values_option(std::string& dirty_values) co
    if(dirty_values_op.has_value()) {
        dirty_values_action = dirty_values_op.value();
    } else {
-        dirty_values_action = index_all_fields ? DIRTY_VALUES::COERCE_OR_REJECT : DIRTY_VALUES::REJECT;
+        dirty_values_action = (auto_detect_schema == schema_detect_types::OFF) ? DIRTY_VALUES::REJECT
+                                                                               : DIRTY_VALUES::COERCE_OR_REJECT;
    }

    return dirty_values_action;
--- a/src/collection_manager.cpp
+++ b/src/collection_manager.cpp
@ -38,9 +38,9 @@ Collection* CollectionManager::init_collection(const nlohmann::json & collection
                               collection_meta[Collection::COLLECTION_NUM_MEMORY_SHARDS].get<size_t>() :
                               DEFAULT_NUM_MEMORY_SHARDS;

-    size_t index_all_fields = collection_meta.count(Collection::COLLECTION_INDEX_ALL_FIELDS) != 0 ?
-                              collection_meta[Collection::COLLECTION_INDEX_ALL_FIELDS].get<bool>() :
-                              false;
+    std::string auto_detect_schema = collection_meta.count(Collection::COLLECTION_AUTO_DETECT_SCHEMA) != 0 ?
+                              collection_meta[Collection::COLLECTION_AUTO_DETECT_SCHEMA].get<std::string>() :
+                              schema_detect_types::OFF;

    LOG(INFO) << "Found collection " << this_collection_name << " with " << num_memory_shards << " memory shards.";

@ -53,7 +53,7 @@ Collection* CollectionManager::init_collection(const nlohmann::json & collection
                                            default_sorting_field,
                                            num_memory_shards,
                                            max_memory_ratio,
-                                            index_all_fields);
+                                            auto_detect_schema);

    return collection;
 }
@ -315,7 +315,7 @@ Option<Collection*> CollectionManager::create_collection(const std::string& name
                                                         const std::vector<field> & fields,
                                                         const std::string& default_sorting_field,
                                                         const uint64_t created_at,
-                                                         const bool index_all_fields) {
+                                                         const std::string& auto_detect_schema) {
    std::unique_lock lock(mutex);

    if(store->contains(Collection::get_meta_key(name))) {
@ -337,11 +337,11 @@ Option<Collection*> CollectionManager::create_collection(const std::string& name
    collection_meta[Collection::COLLECTION_DEFAULT_SORTING_FIELD_KEY] = default_sorting_field;
    collection_meta[Collection::COLLECTION_CREATED] = created_at;
    collection_meta[Collection::COLLECTION_NUM_MEMORY_SHARDS] = num_memory_shards;
-    collection_meta[Collection::COLLECTION_INDEX_ALL_FIELDS] = index_all_fields;
+    collection_meta[Collection::COLLECTION_AUTO_DETECT_SCHEMA] = auto_detect_schema;

    Collection* new_collection = new Collection(name, next_collection_id, created_at, 0, store, fields,
                                                default_sorting_field, num_memory_shards,
-                                                this->max_memory_ratio, index_all_fields);
+                                                this->max_memory_ratio, auto_detect_schema);
    next_collection_id++;

    rocksdb::WriteBatch batch;
--- a/src/core_api.cpp
+++ b/src/core_api.cpp
@ -75,7 +75,6 @@ bool get_collections(http_req & req, http_res & res) {

 bool post_create_collection(http_req & req, http_res & res) {
    const char* NUM_MEMORY_SHARDS = "num_memory_shards";
-    const char* INDEX_ALL_FIELDS = "index_all_fields";

    nlohmann::json req_json;

@ -100,8 +99,8 @@ bool post_create_collection(http_req & req, http_res & res) {
        req_json[NUM_MEMORY_SHARDS] = CollectionManager::DEFAULT_NUM_MEMORY_SHARDS;
    }

-    if(req_json.count("fields") == 0 && req_json.count(INDEX_ALL_FIELDS) == 0) {
-        res.set_400("Parameter `fields` or `index_all_fields` is required.");
+    if(req_json.count("fields") == 0) {
+        res.set_400("Parameter `fields` is required.");
        return false;
    }

@ -127,12 +126,6 @@ bool post_create_collection(http_req & req, http_res & res) {
        return false;
    }

-    bool index_all_fields = false;
-
-    if(req_json.count(INDEX_ALL_FIELDS) != 0 && req_json[INDEX_ALL_FIELDS].is_boolean()) {
-        index_all_fields = req_json[INDEX_ALL_FIELDS].get<bool>();
-    }
-
    if(collectionManager.get_collection(req_json["name"]) != nullptr) {
        res.set_409("Collection with name `" + req_json["name"].get<std::string>() + "` already exists.");
        return false;
@ -142,16 +135,15 @@ bool post_create_collection(http_req & req, http_res & res) {

    std::vector<field> fields;

-    if(req_json.count("fields") == 0) {
-        req_json["fields"] = nlohmann::json::array();
-    }
-
-    if(!req_json["fields"].is_array()) {
-        res.set_400("Wrong format for `fields`. It should be an array of objects containing "
+    if(!req_json["fields"].is_array() || req_json["fields"].empty()) {
+        res.set_400("The `fields` value should be an array of objects containing "
                    "`name`, `type` and optionally, `facet` properties.");
        return false;
    }

+    std::string auto_detect_schema = schema_detect_types::OFF;
+    size_t num_auto_detect_fields = 0;
+
    for(nlohmann::json & field_json: req_json["fields"]) {
        if(!field_json.is_object() ||
            field_json.count(fields::name) == 0 || field_json.count(fields::type) == 0 ||
@ -176,17 +168,35 @@ bool post_create_collection(http_req & req, http_res & res) {
            field_json["optional"] = false;
        }

+        if(field_json["name"] == "*") {
+            if(field_json["type"] == schema_detect_types::AUTO || field_json["type"] == schema_detect_types::STRINGIFY) {
+                auto_detect_schema = field_json["type"];
+                num_auto_detect_fields++;
+            } else {
+                res.set_400(std::string("The `type` of field `") +
+                            field_json["name"].get<std::string>() + "` is invalid.");
+                return false;
+            }
+
+            continue;
+        }
+
        fields.emplace_back(
            field(field_json["name"], field_json["type"], field_json["facet"], field_json["optional"])
        );
    }

+    if(num_auto_detect_fields > 1) {
+        res.set_400("There can be only one field with name `*`.");
+        return false;
+    }
+
    const std::string & default_sorting_field = req_json[DEFAULT_SORTING_FIELD].get<std::string>();
-    const uint64_t created_at = static_cast<uint64_t>(std::time(nullptr));
+    const auto created_at = static_cast<uint64_t>(std::time(nullptr));

    const Option<Collection*> & collection_op =
            collectionManager.create_collection(req_json["name"], req_json[NUM_MEMORY_SHARDS].get<size_t>(),
-            fields, default_sorting_field, created_at, index_all_fields);
+            fields, default_sorting_field, created_at, auto_detect_schema);

    if(collection_op.ok()) {
        nlohmann::json json_response = collection_op.get()->get_summary_json();
--- a/src/index.cpp
+++ b/src/index.cpp
@ -273,7 +273,7 @@ Option<uint32_t> Index::validate_index_in_memory(nlohmann::json& document, uint3
                                                 const std::unordered_map<std::string, field> & search_schema,
                                                 const std::map<std::string, field> & facet_schema,
                                                 bool is_update,
-                                                 bool index_all_fields,
+                                                 const std::string& auto_detect_schema,
                                                 const DIRTY_VALUES& dirty_values) {

    bool missing_default_sort_field = (!default_sorting_field.empty() && document.count(default_sorting_field) == 0);
@ -460,7 +460,7 @@ size_t Index::batch_memory_index(Index *index, std::vector<index_record> & iter_
                                 const std::string & default_sorting_field,
                                 const std::unordered_map<std::string, field> & search_schema,
                                 const std::map<std::string, field> & facet_schema,
-                                 bool index_all_fields) {
+                                 const std::string& auto_detect_schema) {

    size_t num_indexed = 0;

@ -475,7 +475,7 @@ size_t Index::batch_memory_index(Index *index, std::vector<index_record> & iter_
                                                                      default_sorting_field,
                                                                      search_schema, facet_schema,
                                                                      index_rec.is_update,
-                                                                      index_all_fields,
+                                                                      auto_detect_schema,
                                                                      index_rec.dirty_values);

            if(!validation_op.ok()) {
--- a/test/collection_all_fields_test.cpp
+++ b/test/collection_all_fields_test.cpp
@ -44,7 +44,8 @@ TEST_F(CollectionAllFieldsTest, IndexDocsWithoutSchema) {

    coll1 = collectionManager.get_collection("coll1").get();
    if(coll1 == nullptr) {
-        coll1 = collectionManager.create_collection("coll1", 1, fields, "", 0, true).get();
+        auto coll_op = collectionManager.create_collection("coll1", 1, fields, "", 0, schema_detect_types::AUTO);
+        coll1 = coll_op.get();
    }

    std::string json_line;
@ -166,7 +167,7 @@ TEST_F(CollectionAllFieldsTest, HandleArrayTypes) {

    coll1 = collectionManager.get_collection("coll1").get();
    if(coll1 == nullptr) {
-        coll1 = collectionManager.create_collection("coll1", 1, {}, "", 0, true).get();
+        coll1 = collectionManager.create_collection("coll1", 1, {}, "", 0, schema_detect_types::AUTO).get();
    }

    nlohmann::json doc;
@ -227,7 +228,7 @@ TEST_F(CollectionAllFieldsTest, NonOptionalFieldShouldNotBeDropped) {

    coll1 = collectionManager.get_collection("coll1").get();
    if (coll1 == nullptr) {
-        coll1 = collectionManager.create_collection("coll1", 1, fields, "", 0, true).get();
+        coll1 = collectionManager.create_collection("coll1", 1, fields, "", 0).get();
    }

    nlohmann::json doc;
@ -241,4 +242,29 @@ TEST_F(CollectionAllFieldsTest, NonOptionalFieldShouldNotBeDropped) {
    add_op = coll1->add(doc.dump(), CREATE, "0", DIRTY_VALUES::COERCE_OR_DROP);
    ASSERT_FALSE(add_op.ok());
    ASSERT_EQ("Field `points` must be an int32.", add_op.error());
+}
+
+TEST_F(CollectionAllFieldsTest, StringifyAllValues) {
+    Collection *coll1;
+
+    coll1 = collectionManager.get_collection("coll1").get();
+    if (coll1 == nullptr) {
+        coll1 = collectionManager.create_collection("coll1", 1, {}, "", 0, schema_detect_types::STRINGIFY).get();
+    }
+
+    nlohmann::json doc;
+    doc["title"] = "FIRST";
+    doc["int_values"] = {1, 2};
+
+    Option<nlohmann::json> add_op = coll1->add(doc.dump(), CREATE, "0");
+    ASSERT_TRUE(add_op.ok());
+
+    auto results = coll1->search("first", {"title"}, "", {}, sort_fields, 0, 10, 1, FREQUENCY, false).get();
+
+    ASSERT_EQ("FIRST", results["hits"][0]["document"]["title"].get<std::string>());
+
+    ASSERT_EQ(1, results["hits"][0]["document"].count("int_values"));
+    ASSERT_EQ(2, results["hits"][0]["document"]["int_values"].size());
+    ASSERT_EQ("1", results["hits"][0]["document"]["int_values"][0].get<std::string>());
+    ASSERT_EQ("2", results["hits"][0]["document"]["int_values"][1].get<std::string>());
 }
--- a/test/collection_manager_test.cpp
+++ b/test/collection_manager_test.cpp
@ -82,12 +82,12 @@ TEST_F(CollectionManagerTest, CollectionCreation) {
    ASSERT_EQ(3, num_keys);
    // we already call `collection1->get_next_seq_id` above, which is side-effecting
    ASSERT_EQ(1, StringUtils::deserialize_uint32_t(next_seq_id));
-    ASSERT_EQ("{\"created_at\":12345,\"default_sorting_field\":\"points\","
+    ASSERT_EQ("{\"auto_detect_schema\":\"off\",\"created_at\":12345,\"default_sorting_field\":\"points\","
              "\"fields\":[{\"facet\":false,\"name\":\"title\",\"optional\":false,\"type\":\"string\"},"
              "{\"facet\":false,\"name\":\"starring\",\"optional\":false,\"type\":\"string\"},"
              "{\"facet\":true,\"name\":\"cast\",\"optional\":true,\"type\":\"string[]\"},"
              "{\"facet\":false,\"name\":\"points\",\"optional\":false,\"type\":\"int32\"}],\"id\":0,"
-              "\"index_all_fields\":false,\"name\":\"collection1\",\"num_memory_shards\":4}",
+              "\"name\":\"collection1\",\"num_memory_shards\":4}",
              collection_meta_json);
    ASSERT_EQ("1", next_collection_id);
 }
@ -288,7 +288,7 @@ TEST_F(CollectionManagerTest, RestoreAutoSchemaDocsOnRestart) {

    coll1 = collectionManager.get_collection("coll1").get();
    if(coll1 == nullptr) {
-        coll1 = collectionManager.create_collection("coll1", 1, fields, "max", 0, true).get();
+        coll1 = collectionManager.create_collection("coll1", 1, fields, "max", 0, schema_detect_types::AUTO).get();
    }

    std::string json_line;
--- a/test/collection_test.cpp
+++ b/test/collection_test.cpp
@ -91,6 +91,17 @@ TEST_F(CollectionTest, VerifyCountOfDocuments) {
    ASSERT_EQ(DIRTY_VALUES::REJECT, collection->parse_dirty_values_option(empty_dirty_values));
 }

+TEST_F(CollectionTest, MetaKeyIsNotReturnedAsDocumentField) {
+    nlohmann::json results = collection->search("the", query_fields, "", {}, sort_fields, 0, 10).get();
+    ASSERT_EQ(7, results["hits"].size());
+    ASSERT_EQ(7, results["found"].get<int>());
+
+    for(size_t i = 0; i < results["hits"].size(); i++) {
+        nlohmann::json doc = results["hits"].at(i)["document"];
+        ASSERT_EQ(0, doc.count(Collection::DOC_META_KEY));
+    }
+}
+
 TEST_F(CollectionTest, RetrieveADocumentById) {
    Option<nlohmann::json> doc_option = collection->get("1");
    ASSERT_TRUE(doc_option.ok());