diff --git a/include/collection.h b/include/collection.h index 1a62ab52..028fa65f 100644 --- a/include/collection.h +++ b/include/collection.h @@ -323,7 +323,7 @@ private: const std::vector indices; - const std::atomic index_all_fields; + const std::string auto_detect_schema; // methods @@ -409,7 +409,7 @@ public: static constexpr const char* COLLECTION_DEFAULT_SORTING_FIELD_KEY = "default_sorting_field"; static constexpr const char* COLLECTION_CREATED = "created_at"; static constexpr const char* COLLECTION_NUM_MEMORY_SHARDS = "num_memory_shards"; - static constexpr const char* COLLECTION_INDEX_ALL_FIELDS = "index_all_fields"; + static constexpr const char* COLLECTION_AUTO_DETECT_SCHEMA = "auto_detect_schema"; // DON'T CHANGE THESE VALUES! // this key is used as namespace key to store metadata about the document @@ -423,7 +423,7 @@ public: Collection(const std::string& name, const uint32_t collection_id, const uint64_t created_at, const uint32_t next_seq_id, Store *store, const std::vector& fields, const std::string& default_sorting_field, const size_t num_memory_shards, - const float max_memory_ratio, const bool index_all_fields); + const float max_memory_ratio, const std::string& auto_detect_schema); ~Collection(); diff --git a/include/collection_manager.h b/include/collection_manager.h index 7e627368..0a8bab77 100644 --- a/include/collection_manager.h +++ b/include/collection_manager.h @@ -129,7 +129,7 @@ public: const std::vector & fields, const std::string & default_sorting_field="", const uint64_t created_at = static_cast(std::time(nullptr)), - const bool index_all_fields = false); + const std::string& auto_detect_schema = schema_detect_types::OFF); locked_resource_view_t get_collection(const std::string & collection_name) const; diff --git a/include/field.h b/include/field.h index c2c1206b..c4c2db3a 100644 --- a/include/field.h +++ b/include/field.h @@ -27,6 +27,12 @@ namespace fields { static const std::string optional = "optional"; } +namespace schema_detect_types { + static const std::string OFF = "off"; + static const std::string STRINGIFY = "stringify"; + static const std::string AUTO = "auto"; +} + static const uint8_t DEFAULT_GEO_RESOLUTION = 7; static const uint8_t FINEST_GEO_RESOLUTION = 15; @@ -54,6 +60,10 @@ struct field { } + bool is_auto() const { + return (type == schema_detect_types::AUTO || type == schema_detect_types::STRINGIFY); + } + bool is_single_integer() const { return (type == field_types::INT32 || type == field_types::INT64); } @@ -119,7 +129,7 @@ struct field { } bool has_valid_type() const { - return is_string() || is_integer() || is_float() || is_bool() || is_geopoint(); + return is_string() || is_integer() || is_float() || is_bool() || is_geopoint() || is_auto(); } std::string faceted_name() const { @@ -128,7 +138,7 @@ struct field { static bool get_type(const nlohmann::json& obj, std::string& field_type) { if(obj.is_array()) { - if(obj.empty() || obj[0].is_array()) { + if(obj.empty()) { return false; } @@ -173,7 +183,8 @@ struct field { } static Option fields_to_json_fields(const std::vector & fields, - const std::string & default_sorting_field, nlohmann::json& fields_json) { + const std::string & default_sorting_field, + nlohmann::json& fields_json) { bool found_default_sorting_field = false; for(const field & field: fields) { diff --git a/include/index.h b/include/index.h index 667f8086..a3464742 100644 --- a/include/index.h +++ b/include/index.h @@ -367,7 +367,7 @@ public: const std::string & default_sorting_field, const std::unordered_map & search_schema, const std::map & facet_schema, - bool index_all_fields); + const std::string& auto_detect_schema); static void populate_token_positions(const std::vector &query_suggestion, const std::vector& leaf_to_indices, @@ -385,7 +385,7 @@ public: const std::unordered_map & search_schema, const std::map & facet_schema, bool is_update, - bool index_all_fields, + const std::string& auto_detect_schema, const DIRTY_VALUES& dirty_values); void refresh_schemas(const std::vector& new_fields); diff --git a/src/collection.cpp b/src/collection.cpp index cfad8171..9555b23b 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -40,13 +40,13 @@ struct match_index_t { Collection::Collection(const std::string& name, const uint32_t collection_id, const uint64_t created_at, const uint32_t next_seq_id, Store *store, const std::vector &fields, const std::string& default_sorting_field, const size_t num_memory_shards, - const float max_memory_ratio, const bool index_all_fields): + const float max_memory_ratio, const std::string& auto_detect_schema): name(name), collection_id(collection_id), created_at(created_at), next_seq_id(next_seq_id), store(store), fields(fields), default_sorting_field(default_sorting_field), num_memory_shards(num_memory_shards), max_memory_ratio(max_memory_ratio), - indices(init_indices()), index_all_fields(index_all_fields) { + indices(init_indices()), auto_detect_schema(auto_detect_schema) { this->num_documents = 0; } @@ -237,6 +237,10 @@ nlohmann::json Collection::add_many(std::vector& json_lines, nlohma const uint32_t seq_id = doc_seq_id_op.ok() ? doc_seq_id_op.get().seq_id : 0; index_record record(i, seq_id, document, operation, dirty_values); + if(document.count(Collection::DOC_META_KEY) != 0) { + document.erase(Collection::DOC_META_KEY); + } + // NOTE: we overwrite the input json_lines with result to avoid memory pressure record.is_update = false; @@ -250,8 +254,8 @@ nlohmann::json Collection::add_many(std::vector& json_lines, nlohma get_doc_changes(document, record.old_doc, record.new_doc, record.del_doc); } - // if `index_all_fields` is enabled, we will have to update schema first before indexing - if(index_all_fields) { + // if `auto_detect_schema` is enabled, we will have to update schema first before indexing + if(auto_detect_schema != schema_detect_types::OFF) { Option schema_change_op = check_and_update_schema(document); if(!schema_change_op.ok()) { record.index_failure(schema_change_op.code(), schema_change_op.error()); @@ -366,7 +370,7 @@ Option Collection::index_in_memory(nlohmann::json &document, uint32_t Option validation_op = Index::validate_index_in_memory(document, seq_id, default_sorting_field, search_schema, facet_schema, is_update, - index_all_fields, dirty_values); + auto_detect_schema, dirty_values); if(!validation_op.ok()) { return validation_op; @@ -394,7 +398,7 @@ size_t Collection::par_index_in_memory(std::vector> & CollectionManager::get_instance().get_thread_pool()->enqueue( [index, index_id, &num_indexed_vec, &iter_batch, this, &m_process, &num_processed, &cv_process]() { size_t num_indexed = Index::batch_memory_index(index, std::ref(iter_batch[index_id]), default_sorting_field, - search_schema, facet_schema, index_all_fields); + search_schema, facet_schema, auto_detect_schema); std::unique_lock lock(m_process); num_indexed_vec[index_id] = num_indexed; num_processed++; @@ -421,7 +425,11 @@ void Collection::prune_document(nlohmann::json &document, const spp::sparse_hash const spp::sparse_hash_set& exclude_fields) { auto it = document.begin(); for(; it != document.end(); ) { - if(exclude_fields.count(it.key()) != 0 || (include_fields.size() != 0 && include_fields.count(it.key()) == 0)) { + if(document.count(Collection::DOC_META_KEY) != 0) { + document.erase(Collection::DOC_META_KEY); + } + + if(exclude_fields.count(it.key()) != 0 || (!include_fields.empty() && include_fields.count(it.key()) == 0)) { it = document.erase(it); } else { ++it; @@ -1570,6 +1578,10 @@ Option Collection::get(const std::string & id) const { return Option(500, "Error while parsing stored document."); } + if(document.count(Collection::DOC_META_KEY) != 0) { + document.erase(Collection::DOC_META_KEY); + } + return Option(document); } @@ -2257,6 +2269,15 @@ Option Collection::check_and_update_schema(nlohmann::json& document) { if (parseable) { const std::string& fname = kv.key(); field new_field(fname, field_type, false, true); + + if(auto_detect_schema == schema_detect_types::STRINGIFY) { + if(new_field.is_array()) { + new_field.type = field_types::STRING_ARRAY; + } else { + new_field.type = field_types::STRING; + } + } + search_schema.emplace(fname, new_field); fields.emplace_back(new_field); new_fields.emplace_back(new_field); @@ -2332,7 +2353,8 @@ std::vector Collection::init_indices() { } DIRTY_VALUES Collection::parse_dirty_values_option(std::string& dirty_values) const { - // no need for a shared lock here since `index_all_fields` is atomic + std::shared_lock lock(mutex); + StringUtils::toupper(dirty_values); auto dirty_values_op = magic_enum::enum_cast(dirty_values); DIRTY_VALUES dirty_values_action; @@ -2340,7 +2362,8 @@ DIRTY_VALUES Collection::parse_dirty_values_option(std::string& dirty_values) co if(dirty_values_op.has_value()) { dirty_values_action = dirty_values_op.value(); } else { - dirty_values_action = index_all_fields ? DIRTY_VALUES::COERCE_OR_REJECT : DIRTY_VALUES::REJECT; + dirty_values_action = (auto_detect_schema == schema_detect_types::OFF) ? DIRTY_VALUES::REJECT + : DIRTY_VALUES::COERCE_OR_REJECT; } return dirty_values_action; diff --git a/src/collection_manager.cpp b/src/collection_manager.cpp index fb7be8dc..3d96c6cd 100644 --- a/src/collection_manager.cpp +++ b/src/collection_manager.cpp @@ -38,9 +38,9 @@ Collection* CollectionManager::init_collection(const nlohmann::json & collection collection_meta[Collection::COLLECTION_NUM_MEMORY_SHARDS].get() : DEFAULT_NUM_MEMORY_SHARDS; - size_t index_all_fields = collection_meta.count(Collection::COLLECTION_INDEX_ALL_FIELDS) != 0 ? - collection_meta[Collection::COLLECTION_INDEX_ALL_FIELDS].get() : - false; + std::string auto_detect_schema = collection_meta.count(Collection::COLLECTION_AUTO_DETECT_SCHEMA) != 0 ? + collection_meta[Collection::COLLECTION_AUTO_DETECT_SCHEMA].get() : + schema_detect_types::OFF; LOG(INFO) << "Found collection " << this_collection_name << " with " << num_memory_shards << " memory shards."; @@ -53,7 +53,7 @@ Collection* CollectionManager::init_collection(const nlohmann::json & collection default_sorting_field, num_memory_shards, max_memory_ratio, - index_all_fields); + auto_detect_schema); return collection; } @@ -315,7 +315,7 @@ Option CollectionManager::create_collection(const std::string& name const std::vector & fields, const std::string& default_sorting_field, const uint64_t created_at, - const bool index_all_fields) { + const std::string& auto_detect_schema) { std::unique_lock lock(mutex); if(store->contains(Collection::get_meta_key(name))) { @@ -337,11 +337,11 @@ Option CollectionManager::create_collection(const std::string& name collection_meta[Collection::COLLECTION_DEFAULT_SORTING_FIELD_KEY] = default_sorting_field; collection_meta[Collection::COLLECTION_CREATED] = created_at; collection_meta[Collection::COLLECTION_NUM_MEMORY_SHARDS] = num_memory_shards; - collection_meta[Collection::COLLECTION_INDEX_ALL_FIELDS] = index_all_fields; + collection_meta[Collection::COLLECTION_AUTO_DETECT_SCHEMA] = auto_detect_schema; Collection* new_collection = new Collection(name, next_collection_id, created_at, 0, store, fields, default_sorting_field, num_memory_shards, - this->max_memory_ratio, index_all_fields); + this->max_memory_ratio, auto_detect_schema); next_collection_id++; rocksdb::WriteBatch batch; diff --git a/src/core_api.cpp b/src/core_api.cpp index 391d2968..1dd39c93 100644 --- a/src/core_api.cpp +++ b/src/core_api.cpp @@ -75,7 +75,6 @@ bool get_collections(http_req & req, http_res & res) { bool post_create_collection(http_req & req, http_res & res) { const char* NUM_MEMORY_SHARDS = "num_memory_shards"; - const char* INDEX_ALL_FIELDS = "index_all_fields"; nlohmann::json req_json; @@ -100,8 +99,8 @@ bool post_create_collection(http_req & req, http_res & res) { req_json[NUM_MEMORY_SHARDS] = CollectionManager::DEFAULT_NUM_MEMORY_SHARDS; } - if(req_json.count("fields") == 0 && req_json.count(INDEX_ALL_FIELDS) == 0) { - res.set_400("Parameter `fields` or `index_all_fields` is required."); + if(req_json.count("fields") == 0) { + res.set_400("Parameter `fields` is required."); return false; } @@ -127,12 +126,6 @@ bool post_create_collection(http_req & req, http_res & res) { return false; } - bool index_all_fields = false; - - if(req_json.count(INDEX_ALL_FIELDS) != 0 && req_json[INDEX_ALL_FIELDS].is_boolean()) { - index_all_fields = req_json[INDEX_ALL_FIELDS].get(); - } - if(collectionManager.get_collection(req_json["name"]) != nullptr) { res.set_409("Collection with name `" + req_json["name"].get() + "` already exists."); return false; @@ -142,16 +135,15 @@ bool post_create_collection(http_req & req, http_res & res) { std::vector fields; - if(req_json.count("fields") == 0) { - req_json["fields"] = nlohmann::json::array(); - } - - if(!req_json["fields"].is_array()) { - res.set_400("Wrong format for `fields`. It should be an array of objects containing " + if(!req_json["fields"].is_array() || req_json["fields"].empty()) { + res.set_400("The `fields` value should be an array of objects containing " "`name`, `type` and optionally, `facet` properties."); return false; } + std::string auto_detect_schema = schema_detect_types::OFF; + size_t num_auto_detect_fields = 0; + for(nlohmann::json & field_json: req_json["fields"]) { if(!field_json.is_object() || field_json.count(fields::name) == 0 || field_json.count(fields::type) == 0 || @@ -176,17 +168,35 @@ bool post_create_collection(http_req & req, http_res & res) { field_json["optional"] = false; } + if(field_json["name"] == "*") { + if(field_json["type"] == schema_detect_types::AUTO || field_json["type"] == schema_detect_types::STRINGIFY) { + auto_detect_schema = field_json["type"]; + num_auto_detect_fields++; + } else { + res.set_400(std::string("The `type` of field `") + + field_json["name"].get() + "` is invalid."); + return false; + } + + continue; + } + fields.emplace_back( field(field_json["name"], field_json["type"], field_json["facet"], field_json["optional"]) ); } + if(num_auto_detect_fields > 1) { + res.set_400("There can be only one field with name `*`."); + return false; + } + const std::string & default_sorting_field = req_json[DEFAULT_SORTING_FIELD].get(); - const uint64_t created_at = static_cast(std::time(nullptr)); + const auto created_at = static_cast(std::time(nullptr)); const Option & collection_op = collectionManager.create_collection(req_json["name"], req_json[NUM_MEMORY_SHARDS].get(), - fields, default_sorting_field, created_at, index_all_fields); + fields, default_sorting_field, created_at, auto_detect_schema); if(collection_op.ok()) { nlohmann::json json_response = collection_op.get()->get_summary_json(); diff --git a/src/index.cpp b/src/index.cpp index 9845a88b..15523fa2 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -273,7 +273,7 @@ Option Index::validate_index_in_memory(nlohmann::json& document, uint3 const std::unordered_map & search_schema, const std::map & facet_schema, bool is_update, - bool index_all_fields, + const std::string& auto_detect_schema, const DIRTY_VALUES& dirty_values) { bool missing_default_sort_field = (!default_sorting_field.empty() && document.count(default_sorting_field) == 0); @@ -460,7 +460,7 @@ size_t Index::batch_memory_index(Index *index, std::vector & iter_ const std::string & default_sorting_field, const std::unordered_map & search_schema, const std::map & facet_schema, - bool index_all_fields) { + const std::string& auto_detect_schema) { size_t num_indexed = 0; @@ -475,7 +475,7 @@ size_t Index::batch_memory_index(Index *index, std::vector & iter_ default_sorting_field, search_schema, facet_schema, index_rec.is_update, - index_all_fields, + auto_detect_schema, index_rec.dirty_values); if(!validation_op.ok()) { diff --git a/test/collection_all_fields_test.cpp b/test/collection_all_fields_test.cpp index a7514077..154ef9e1 100644 --- a/test/collection_all_fields_test.cpp +++ b/test/collection_all_fields_test.cpp @@ -44,7 +44,8 @@ TEST_F(CollectionAllFieldsTest, IndexDocsWithoutSchema) { coll1 = collectionManager.get_collection("coll1").get(); if(coll1 == nullptr) { - coll1 = collectionManager.create_collection("coll1", 1, fields, "", 0, true).get(); + auto coll_op = collectionManager.create_collection("coll1", 1, fields, "", 0, schema_detect_types::AUTO); + coll1 = coll_op.get(); } std::string json_line; @@ -166,7 +167,7 @@ TEST_F(CollectionAllFieldsTest, HandleArrayTypes) { coll1 = collectionManager.get_collection("coll1").get(); if(coll1 == nullptr) { - coll1 = collectionManager.create_collection("coll1", 1, {}, "", 0, true).get(); + coll1 = collectionManager.create_collection("coll1", 1, {}, "", 0, schema_detect_types::AUTO).get(); } nlohmann::json doc; @@ -227,7 +228,7 @@ TEST_F(CollectionAllFieldsTest, NonOptionalFieldShouldNotBeDropped) { coll1 = collectionManager.get_collection("coll1").get(); if (coll1 == nullptr) { - coll1 = collectionManager.create_collection("coll1", 1, fields, "", 0, true).get(); + coll1 = collectionManager.create_collection("coll1", 1, fields, "", 0).get(); } nlohmann::json doc; @@ -241,4 +242,29 @@ TEST_F(CollectionAllFieldsTest, NonOptionalFieldShouldNotBeDropped) { add_op = coll1->add(doc.dump(), CREATE, "0", DIRTY_VALUES::COERCE_OR_DROP); ASSERT_FALSE(add_op.ok()); ASSERT_EQ("Field `points` must be an int32.", add_op.error()); +} + +TEST_F(CollectionAllFieldsTest, StringifyAllValues) { + Collection *coll1; + + coll1 = collectionManager.get_collection("coll1").get(); + if (coll1 == nullptr) { + coll1 = collectionManager.create_collection("coll1", 1, {}, "", 0, schema_detect_types::STRINGIFY).get(); + } + + nlohmann::json doc; + doc["title"] = "FIRST"; + doc["int_values"] = {1, 2}; + + Option add_op = coll1->add(doc.dump(), CREATE, "0"); + ASSERT_TRUE(add_op.ok()); + + auto results = coll1->search("first", {"title"}, "", {}, sort_fields, 0, 10, 1, FREQUENCY, false).get(); + + ASSERT_EQ("FIRST", results["hits"][0]["document"]["title"].get()); + + ASSERT_EQ(1, results["hits"][0]["document"].count("int_values")); + ASSERT_EQ(2, results["hits"][0]["document"]["int_values"].size()); + ASSERT_EQ("1", results["hits"][0]["document"]["int_values"][0].get()); + ASSERT_EQ("2", results["hits"][0]["document"]["int_values"][1].get()); } \ No newline at end of file diff --git a/test/collection_manager_test.cpp b/test/collection_manager_test.cpp index 4e07e882..4c7f886b 100644 --- a/test/collection_manager_test.cpp +++ b/test/collection_manager_test.cpp @@ -82,12 +82,12 @@ TEST_F(CollectionManagerTest, CollectionCreation) { ASSERT_EQ(3, num_keys); // we already call `collection1->get_next_seq_id` above, which is side-effecting ASSERT_EQ(1, StringUtils::deserialize_uint32_t(next_seq_id)); - ASSERT_EQ("{\"created_at\":12345,\"default_sorting_field\":\"points\"," + ASSERT_EQ("{\"auto_detect_schema\":\"off\",\"created_at\":12345,\"default_sorting_field\":\"points\"," "\"fields\":[{\"facet\":false,\"name\":\"title\",\"optional\":false,\"type\":\"string\"}," "{\"facet\":false,\"name\":\"starring\",\"optional\":false,\"type\":\"string\"}," "{\"facet\":true,\"name\":\"cast\",\"optional\":true,\"type\":\"string[]\"}," "{\"facet\":false,\"name\":\"points\",\"optional\":false,\"type\":\"int32\"}],\"id\":0," - "\"index_all_fields\":false,\"name\":\"collection1\",\"num_memory_shards\":4}", + "\"name\":\"collection1\",\"num_memory_shards\":4}", collection_meta_json); ASSERT_EQ("1", next_collection_id); } @@ -288,7 +288,7 @@ TEST_F(CollectionManagerTest, RestoreAutoSchemaDocsOnRestart) { coll1 = collectionManager.get_collection("coll1").get(); if(coll1 == nullptr) { - coll1 = collectionManager.create_collection("coll1", 1, fields, "max", 0, true).get(); + coll1 = collectionManager.create_collection("coll1", 1, fields, "max", 0, schema_detect_types::AUTO).get(); } std::string json_line; diff --git a/test/collection_test.cpp b/test/collection_test.cpp index 033779f9..922887db 100644 --- a/test/collection_test.cpp +++ b/test/collection_test.cpp @@ -91,6 +91,17 @@ TEST_F(CollectionTest, VerifyCountOfDocuments) { ASSERT_EQ(DIRTY_VALUES::REJECT, collection->parse_dirty_values_option(empty_dirty_values)); } +TEST_F(CollectionTest, MetaKeyIsNotReturnedAsDocumentField) { + nlohmann::json results = collection->search("the", query_fields, "", {}, sort_fields, 0, 10).get(); + ASSERT_EQ(7, results["hits"].size()); + ASSERT_EQ(7, results["found"].get()); + + for(size_t i = 0; i < results["hits"].size(); i++) { + nlohmann::json doc = results["hits"].at(i)["document"]; + ASSERT_EQ(0, doc.count(Collection::DOC_META_KEY)); + } +} + TEST_F(CollectionTest, RetrieveADocumentById) { Option doc_option = collection->get("1"); ASSERT_TRUE(doc_option.ok());