Allow fields to be stringified automatically.

This commit is contained in:
kishorenc 2021-02-23 12:58:14 +05:30
parent c24fc02d4d
commit f1b70384cc
11 changed files with 132 additions and 51 deletions

View File

@ -323,7 +323,7 @@ private:
const std::vector<Index*> indices;
const std::atomic<bool> index_all_fields;
const std::string auto_detect_schema;
// methods
@ -409,7 +409,7 @@ public:
static constexpr const char* COLLECTION_DEFAULT_SORTING_FIELD_KEY = "default_sorting_field";
static constexpr const char* COLLECTION_CREATED = "created_at";
static constexpr const char* COLLECTION_NUM_MEMORY_SHARDS = "num_memory_shards";
static constexpr const char* COLLECTION_INDEX_ALL_FIELDS = "index_all_fields";
static constexpr const char* COLLECTION_AUTO_DETECT_SCHEMA = "auto_detect_schema";
// DON'T CHANGE THESE VALUES!
// this key is used as namespace key to store metadata about the document
@ -423,7 +423,7 @@ public:
Collection(const std::string& name, const uint32_t collection_id, const uint64_t created_at,
const uint32_t next_seq_id, Store *store, const std::vector<field>& fields,
const std::string& default_sorting_field, const size_t num_memory_shards,
const float max_memory_ratio, const bool index_all_fields);
const float max_memory_ratio, const std::string& auto_detect_schema);
~Collection();

View File

@ -129,7 +129,7 @@ public:
const std::vector<field> & fields,
const std::string & default_sorting_field="",
const uint64_t created_at = static_cast<uint64_t>(std::time(nullptr)),
const bool index_all_fields = false);
const std::string& auto_detect_schema = schema_detect_types::OFF);
locked_resource_view_t<Collection> get_collection(const std::string & collection_name) const;

View File

@ -27,6 +27,12 @@ namespace fields {
static const std::string optional = "optional";
}
namespace schema_detect_types {
static const std::string OFF = "off";
static const std::string STRINGIFY = "stringify";
static const std::string AUTO = "auto";
}
static const uint8_t DEFAULT_GEO_RESOLUTION = 7;
static const uint8_t FINEST_GEO_RESOLUTION = 15;
@ -54,6 +60,10 @@ struct field {
}
bool is_auto() const {
return (type == schema_detect_types::AUTO || type == schema_detect_types::STRINGIFY);
}
bool is_single_integer() const {
return (type == field_types::INT32 || type == field_types::INT64);
}
@ -119,7 +129,7 @@ struct field {
}
bool has_valid_type() const {
return is_string() || is_integer() || is_float() || is_bool() || is_geopoint();
return is_string() || is_integer() || is_float() || is_bool() || is_geopoint() || is_auto();
}
std::string faceted_name() const {
@ -128,7 +138,7 @@ struct field {
static bool get_type(const nlohmann::json& obj, std::string& field_type) {
if(obj.is_array()) {
if(obj.empty() || obj[0].is_array()) {
if(obj.empty()) {
return false;
}
@ -173,7 +183,8 @@ struct field {
}
static Option<bool> fields_to_json_fields(const std::vector<field> & fields,
const std::string & default_sorting_field, nlohmann::json& fields_json) {
const std::string & default_sorting_field,
nlohmann::json& fields_json) {
bool found_default_sorting_field = false;
for(const field & field: fields) {

View File

@ -367,7 +367,7 @@ public:
const std::string & default_sorting_field,
const std::unordered_map<std::string, field> & search_schema,
const std::map<std::string, field> & facet_schema,
bool index_all_fields);
const std::string& auto_detect_schema);
static void populate_token_positions(const std::vector<art_leaf *> &query_suggestion,
const std::vector<uint32_t*>& leaf_to_indices,
@ -385,7 +385,7 @@ public:
const std::unordered_map<std::string, field> & search_schema,
const std::map<std::string, field> & facet_schema,
bool is_update,
bool index_all_fields,
const std::string& auto_detect_schema,
const DIRTY_VALUES& dirty_values);
void refresh_schemas(const std::vector<field>& new_fields);

View File

@ -40,13 +40,13 @@ struct match_index_t {
Collection::Collection(const std::string& name, const uint32_t collection_id, const uint64_t created_at,
const uint32_t next_seq_id, Store *store, const std::vector<field> &fields,
const std::string& default_sorting_field, const size_t num_memory_shards,
const float max_memory_ratio, const bool index_all_fields):
const float max_memory_ratio, const std::string& auto_detect_schema):
name(name), collection_id(collection_id), created_at(created_at),
next_seq_id(next_seq_id), store(store),
fields(fields), default_sorting_field(default_sorting_field),
num_memory_shards(num_memory_shards),
max_memory_ratio(max_memory_ratio),
indices(init_indices()), index_all_fields(index_all_fields) {
indices(init_indices()), auto_detect_schema(auto_detect_schema) {
this->num_documents = 0;
}
@ -237,6 +237,10 @@ nlohmann::json Collection::add_many(std::vector<std::string>& json_lines, nlohma
const uint32_t seq_id = doc_seq_id_op.ok() ? doc_seq_id_op.get().seq_id : 0;
index_record record(i, seq_id, document, operation, dirty_values);
if(document.count(Collection::DOC_META_KEY) != 0) {
document.erase(Collection::DOC_META_KEY);
}
// NOTE: we overwrite the input json_lines with result to avoid memory pressure
record.is_update = false;
@ -250,8 +254,8 @@ nlohmann::json Collection::add_many(std::vector<std::string>& json_lines, nlohma
get_doc_changes(document, record.old_doc, record.new_doc, record.del_doc);
}
// if `index_all_fields` is enabled, we will have to update schema first before indexing
if(index_all_fields) {
// if `auto_detect_schema` is enabled, we will have to update schema first before indexing
if(auto_detect_schema != schema_detect_types::OFF) {
Option<bool> schema_change_op = check_and_update_schema(document);
if(!schema_change_op.ok()) {
record.index_failure(schema_change_op.code(), schema_change_op.error());
@ -366,7 +370,7 @@ Option<uint32_t> Collection::index_in_memory(nlohmann::json &document, uint32_t
Option<uint32_t> validation_op = Index::validate_index_in_memory(document, seq_id, default_sorting_field,
search_schema, facet_schema, is_update,
index_all_fields, dirty_values);
auto_detect_schema, dirty_values);
if(!validation_op.ok()) {
return validation_op;
@ -394,7 +398,7 @@ size_t Collection::par_index_in_memory(std::vector<std::vector<index_record>> &
CollectionManager::get_instance().get_thread_pool()->enqueue(
[index, index_id, &num_indexed_vec, &iter_batch, this, &m_process, &num_processed, &cv_process]() {
size_t num_indexed = Index::batch_memory_index(index, std::ref(iter_batch[index_id]), default_sorting_field,
search_schema, facet_schema, index_all_fields);
search_schema, facet_schema, auto_detect_schema);
std::unique_lock<std::mutex> lock(m_process);
num_indexed_vec[index_id] = num_indexed;
num_processed++;
@ -421,7 +425,11 @@ void Collection::prune_document(nlohmann::json &document, const spp::sparse_hash
const spp::sparse_hash_set<std::string>& exclude_fields) {
auto it = document.begin();
for(; it != document.end(); ) {
if(exclude_fields.count(it.key()) != 0 || (include_fields.size() != 0 && include_fields.count(it.key()) == 0)) {
if(document.count(Collection::DOC_META_KEY) != 0) {
document.erase(Collection::DOC_META_KEY);
}
if(exclude_fields.count(it.key()) != 0 || (!include_fields.empty() && include_fields.count(it.key()) == 0)) {
it = document.erase(it);
} else {
++it;
@ -1570,6 +1578,10 @@ Option<nlohmann::json> Collection::get(const std::string & id) const {
return Option<nlohmann::json>(500, "Error while parsing stored document.");
}
if(document.count(Collection::DOC_META_KEY) != 0) {
document.erase(Collection::DOC_META_KEY);
}
return Option<nlohmann::json>(document);
}
@ -2257,6 +2269,15 @@ Option<bool> Collection::check_and_update_schema(nlohmann::json& document) {
if (parseable) {
const std::string& fname = kv.key();
field new_field(fname, field_type, false, true);
if(auto_detect_schema == schema_detect_types::STRINGIFY) {
if(new_field.is_array()) {
new_field.type = field_types::STRING_ARRAY;
} else {
new_field.type = field_types::STRING;
}
}
search_schema.emplace(fname, new_field);
fields.emplace_back(new_field);
new_fields.emplace_back(new_field);
@ -2332,7 +2353,8 @@ std::vector<Index *> Collection::init_indices() {
}
DIRTY_VALUES Collection::parse_dirty_values_option(std::string& dirty_values) const {
// no need for a shared lock here since `index_all_fields` is atomic
std::shared_lock lock(mutex);
StringUtils::toupper(dirty_values);
auto dirty_values_op = magic_enum::enum_cast<DIRTY_VALUES>(dirty_values);
DIRTY_VALUES dirty_values_action;
@ -2340,7 +2362,8 @@ DIRTY_VALUES Collection::parse_dirty_values_option(std::string& dirty_values) co
if(dirty_values_op.has_value()) {
dirty_values_action = dirty_values_op.value();
} else {
dirty_values_action = index_all_fields ? DIRTY_VALUES::COERCE_OR_REJECT : DIRTY_VALUES::REJECT;
dirty_values_action = (auto_detect_schema == schema_detect_types::OFF) ? DIRTY_VALUES::REJECT
: DIRTY_VALUES::COERCE_OR_REJECT;
}
return dirty_values_action;

View File

@ -38,9 +38,9 @@ Collection* CollectionManager::init_collection(const nlohmann::json & collection
collection_meta[Collection::COLLECTION_NUM_MEMORY_SHARDS].get<size_t>() :
DEFAULT_NUM_MEMORY_SHARDS;
size_t index_all_fields = collection_meta.count(Collection::COLLECTION_INDEX_ALL_FIELDS) != 0 ?
collection_meta[Collection::COLLECTION_INDEX_ALL_FIELDS].get<bool>() :
false;
std::string auto_detect_schema = collection_meta.count(Collection::COLLECTION_AUTO_DETECT_SCHEMA) != 0 ?
collection_meta[Collection::COLLECTION_AUTO_DETECT_SCHEMA].get<std::string>() :
schema_detect_types::OFF;
LOG(INFO) << "Found collection " << this_collection_name << " with " << num_memory_shards << " memory shards.";
@ -53,7 +53,7 @@ Collection* CollectionManager::init_collection(const nlohmann::json & collection
default_sorting_field,
num_memory_shards,
max_memory_ratio,
index_all_fields);
auto_detect_schema);
return collection;
}
@ -315,7 +315,7 @@ Option<Collection*> CollectionManager::create_collection(const std::string& name
const std::vector<field> & fields,
const std::string& default_sorting_field,
const uint64_t created_at,
const bool index_all_fields) {
const std::string& auto_detect_schema) {
std::unique_lock lock(mutex);
if(store->contains(Collection::get_meta_key(name))) {
@ -337,11 +337,11 @@ Option<Collection*> CollectionManager::create_collection(const std::string& name
collection_meta[Collection::COLLECTION_DEFAULT_SORTING_FIELD_KEY] = default_sorting_field;
collection_meta[Collection::COLLECTION_CREATED] = created_at;
collection_meta[Collection::COLLECTION_NUM_MEMORY_SHARDS] = num_memory_shards;
collection_meta[Collection::COLLECTION_INDEX_ALL_FIELDS] = index_all_fields;
collection_meta[Collection::COLLECTION_AUTO_DETECT_SCHEMA] = auto_detect_schema;
Collection* new_collection = new Collection(name, next_collection_id, created_at, 0, store, fields,
default_sorting_field, num_memory_shards,
this->max_memory_ratio, index_all_fields);
this->max_memory_ratio, auto_detect_schema);
next_collection_id++;
rocksdb::WriteBatch batch;

View File

@ -75,7 +75,6 @@ bool get_collections(http_req & req, http_res & res) {
bool post_create_collection(http_req & req, http_res & res) {
const char* NUM_MEMORY_SHARDS = "num_memory_shards";
const char* INDEX_ALL_FIELDS = "index_all_fields";
nlohmann::json req_json;
@ -100,8 +99,8 @@ bool post_create_collection(http_req & req, http_res & res) {
req_json[NUM_MEMORY_SHARDS] = CollectionManager::DEFAULT_NUM_MEMORY_SHARDS;
}
if(req_json.count("fields") == 0 && req_json.count(INDEX_ALL_FIELDS) == 0) {
res.set_400("Parameter `fields` or `index_all_fields` is required.");
if(req_json.count("fields") == 0) {
res.set_400("Parameter `fields` is required.");
return false;
}
@ -127,12 +126,6 @@ bool post_create_collection(http_req & req, http_res & res) {
return false;
}
bool index_all_fields = false;
if(req_json.count(INDEX_ALL_FIELDS) != 0 && req_json[INDEX_ALL_FIELDS].is_boolean()) {
index_all_fields = req_json[INDEX_ALL_FIELDS].get<bool>();
}
if(collectionManager.get_collection(req_json["name"]) != nullptr) {
res.set_409("Collection with name `" + req_json["name"].get<std::string>() + "` already exists.");
return false;
@ -142,16 +135,15 @@ bool post_create_collection(http_req & req, http_res & res) {
std::vector<field> fields;
if(req_json.count("fields") == 0) {
req_json["fields"] = nlohmann::json::array();
}
if(!req_json["fields"].is_array()) {
res.set_400("Wrong format for `fields`. It should be an array of objects containing "
if(!req_json["fields"].is_array() || req_json["fields"].empty()) {
res.set_400("The `fields` value should be an array of objects containing "
"`name`, `type` and optionally, `facet` properties.");
return false;
}
std::string auto_detect_schema = schema_detect_types::OFF;
size_t num_auto_detect_fields = 0;
for(nlohmann::json & field_json: req_json["fields"]) {
if(!field_json.is_object() ||
field_json.count(fields::name) == 0 || field_json.count(fields::type) == 0 ||
@ -176,17 +168,35 @@ bool post_create_collection(http_req & req, http_res & res) {
field_json["optional"] = false;
}
if(field_json["name"] == "*") {
if(field_json["type"] == schema_detect_types::AUTO || field_json["type"] == schema_detect_types::STRINGIFY) {
auto_detect_schema = field_json["type"];
num_auto_detect_fields++;
} else {
res.set_400(std::string("The `type` of field `") +
field_json["name"].get<std::string>() + "` is invalid.");
return false;
}
continue;
}
fields.emplace_back(
field(field_json["name"], field_json["type"], field_json["facet"], field_json["optional"])
);
}
if(num_auto_detect_fields > 1) {
res.set_400("There can be only one field with name `*`.");
return false;
}
const std::string & default_sorting_field = req_json[DEFAULT_SORTING_FIELD].get<std::string>();
const uint64_t created_at = static_cast<uint64_t>(std::time(nullptr));
const auto created_at = static_cast<uint64_t>(std::time(nullptr));
const Option<Collection*> & collection_op =
collectionManager.create_collection(req_json["name"], req_json[NUM_MEMORY_SHARDS].get<size_t>(),
fields, default_sorting_field, created_at, index_all_fields);
fields, default_sorting_field, created_at, auto_detect_schema);
if(collection_op.ok()) {
nlohmann::json json_response = collection_op.get()->get_summary_json();

View File

@ -273,7 +273,7 @@ Option<uint32_t> Index::validate_index_in_memory(nlohmann::json& document, uint3
const std::unordered_map<std::string, field> & search_schema,
const std::map<std::string, field> & facet_schema,
bool is_update,
bool index_all_fields,
const std::string& auto_detect_schema,
const DIRTY_VALUES& dirty_values) {
bool missing_default_sort_field = (!default_sorting_field.empty() && document.count(default_sorting_field) == 0);
@ -460,7 +460,7 @@ size_t Index::batch_memory_index(Index *index, std::vector<index_record> & iter_
const std::string & default_sorting_field,
const std::unordered_map<std::string, field> & search_schema,
const std::map<std::string, field> & facet_schema,
bool index_all_fields) {
const std::string& auto_detect_schema) {
size_t num_indexed = 0;
@ -475,7 +475,7 @@ size_t Index::batch_memory_index(Index *index, std::vector<index_record> & iter_
default_sorting_field,
search_schema, facet_schema,
index_rec.is_update,
index_all_fields,
auto_detect_schema,
index_rec.dirty_values);
if(!validation_op.ok()) {

View File

@ -44,7 +44,8 @@ TEST_F(CollectionAllFieldsTest, IndexDocsWithoutSchema) {
coll1 = collectionManager.get_collection("coll1").get();
if(coll1 == nullptr) {
coll1 = collectionManager.create_collection("coll1", 1, fields, "", 0, true).get();
auto coll_op = collectionManager.create_collection("coll1", 1, fields, "", 0, schema_detect_types::AUTO);
coll1 = coll_op.get();
}
std::string json_line;
@ -166,7 +167,7 @@ TEST_F(CollectionAllFieldsTest, HandleArrayTypes) {
coll1 = collectionManager.get_collection("coll1").get();
if(coll1 == nullptr) {
coll1 = collectionManager.create_collection("coll1", 1, {}, "", 0, true).get();
coll1 = collectionManager.create_collection("coll1", 1, {}, "", 0, schema_detect_types::AUTO).get();
}
nlohmann::json doc;
@ -227,7 +228,7 @@ TEST_F(CollectionAllFieldsTest, NonOptionalFieldShouldNotBeDropped) {
coll1 = collectionManager.get_collection("coll1").get();
if (coll1 == nullptr) {
coll1 = collectionManager.create_collection("coll1", 1, fields, "", 0, true).get();
coll1 = collectionManager.create_collection("coll1", 1, fields, "", 0).get();
}
nlohmann::json doc;
@ -241,4 +242,29 @@ TEST_F(CollectionAllFieldsTest, NonOptionalFieldShouldNotBeDropped) {
add_op = coll1->add(doc.dump(), CREATE, "0", DIRTY_VALUES::COERCE_OR_DROP);
ASSERT_FALSE(add_op.ok());
ASSERT_EQ("Field `points` must be an int32.", add_op.error());
}
TEST_F(CollectionAllFieldsTest, StringifyAllValues) {
Collection *coll1;
coll1 = collectionManager.get_collection("coll1").get();
if (coll1 == nullptr) {
coll1 = collectionManager.create_collection("coll1", 1, {}, "", 0, schema_detect_types::STRINGIFY).get();
}
nlohmann::json doc;
doc["title"] = "FIRST";
doc["int_values"] = {1, 2};
Option<nlohmann::json> add_op = coll1->add(doc.dump(), CREATE, "0");
ASSERT_TRUE(add_op.ok());
auto results = coll1->search("first", {"title"}, "", {}, sort_fields, 0, 10, 1, FREQUENCY, false).get();
ASSERT_EQ("FIRST", results["hits"][0]["document"]["title"].get<std::string>());
ASSERT_EQ(1, results["hits"][0]["document"].count("int_values"));
ASSERT_EQ(2, results["hits"][0]["document"]["int_values"].size());
ASSERT_EQ("1", results["hits"][0]["document"]["int_values"][0].get<std::string>());
ASSERT_EQ("2", results["hits"][0]["document"]["int_values"][1].get<std::string>());
}

View File

@ -82,12 +82,12 @@ TEST_F(CollectionManagerTest, CollectionCreation) {
ASSERT_EQ(3, num_keys);
// we already call `collection1->get_next_seq_id` above, which is side-effecting
ASSERT_EQ(1, StringUtils::deserialize_uint32_t(next_seq_id));
ASSERT_EQ("{\"created_at\":12345,\"default_sorting_field\":\"points\","
ASSERT_EQ("{\"auto_detect_schema\":\"off\",\"created_at\":12345,\"default_sorting_field\":\"points\","
"\"fields\":[{\"facet\":false,\"name\":\"title\",\"optional\":false,\"type\":\"string\"},"
"{\"facet\":false,\"name\":\"starring\",\"optional\":false,\"type\":\"string\"},"
"{\"facet\":true,\"name\":\"cast\",\"optional\":true,\"type\":\"string[]\"},"
"{\"facet\":false,\"name\":\"points\",\"optional\":false,\"type\":\"int32\"}],\"id\":0,"
"\"index_all_fields\":false,\"name\":\"collection1\",\"num_memory_shards\":4}",
"\"name\":\"collection1\",\"num_memory_shards\":4}",
collection_meta_json);
ASSERT_EQ("1", next_collection_id);
}
@ -288,7 +288,7 @@ TEST_F(CollectionManagerTest, RestoreAutoSchemaDocsOnRestart) {
coll1 = collectionManager.get_collection("coll1").get();
if(coll1 == nullptr) {
coll1 = collectionManager.create_collection("coll1", 1, fields, "max", 0, true).get();
coll1 = collectionManager.create_collection("coll1", 1, fields, "max", 0, schema_detect_types::AUTO).get();
}
std::string json_line;

View File

@ -91,6 +91,17 @@ TEST_F(CollectionTest, VerifyCountOfDocuments) {
ASSERT_EQ(DIRTY_VALUES::REJECT, collection->parse_dirty_values_option(empty_dirty_values));
}
TEST_F(CollectionTest, MetaKeyIsNotReturnedAsDocumentField) {
nlohmann::json results = collection->search("the", query_fields, "", {}, sort_fields, 0, 10).get();
ASSERT_EQ(7, results["hits"].size());
ASSERT_EQ(7, results["found"].get<int>());
for(size_t i = 0; i < results["hits"].size(); i++) {
nlohmann::json doc = results["hits"].at(i)["document"];
ASSERT_EQ(0, doc.count(Collection::DOC_META_KEY));
}
}
TEST_F(CollectionTest, RetrieveADocumentById) {
Option<nlohmann::json> doc_option = collection->get("1");
ASSERT_TRUE(doc_option.ok());