#include #include "field.h" #include "magic_enum.hpp" #include "text_embedder_manager.h" #include #include #include Option field::json_field_to_field(bool enable_nested_fields, nlohmann::json& field_json, std::vector& the_fields, string& fallback_field_type, size_t& num_auto_detect_fields) { if(field_json["name"] == "id") { // No field should exist with the name "id" as it is reserved for internal use // We cannot throw an error here anymore since that will break backward compatibility! LOG(WARNING) << "Collection schema cannot contain a field with name `id`. Ignoring field."; return Option(true); } if(!field_json.is_object() || field_json.count(fields::name) == 0 || field_json.count(fields::type) == 0 || !field_json.at(fields::name).is_string() || !field_json.at(fields::type).is_string()) { return Option(400, "Wrong format for `fields`. It should be an array of objects containing " "`name`, `type`, `optional` and `facet` properties."); } if(field_json.count("drop") != 0) { return Option(400, std::string("Invalid property `drop` on field `") + field_json[fields::name].get() + std::string("`: it is allowed only " "during schema update.")); } if(field_json.count(fields::facet) != 0 && !field_json.at(fields::facet).is_boolean()) { return Option(400, std::string("The `facet` property of the field `") + field_json[fields::name].get() + std::string("` should be a boolean.")); } if(field_json.count(fields::optional) != 0 && !field_json.at(fields::optional).is_boolean()) { return Option(400, std::string("The `optional` property of the field `") + field_json[fields::name].get() + std::string("` should be a boolean.")); } if(field_json.count(fields::index) != 0 && !field_json.at(fields::index).is_boolean()) { return Option(400, std::string("The `index` property of the field `") + field_json[fields::name].get() + std::string("` should be a boolean.")); } if(field_json.count(fields::sort) != 0 && !field_json.at(fields::sort).is_boolean()) { return Option(400, std::string("The `sort` property of the field `") + field_json[fields::name].get() + std::string("` should be a boolean.")); } if(field_json.count(fields::infix) != 0 && !field_json.at(fields::infix).is_boolean()) { return Option(400, std::string("The `infix` property of the field `") + field_json[fields::name].get() + std::string("` should be a boolean.")); } if(field_json.count(fields::locale) != 0){ if(!field_json.at(fields::locale).is_string()) { return Option(400, std::string("The `locale` property of the field `") + field_json[fields::name].get() + std::string("` should be a string.")); } if(!field_json[fields::locale].get().empty() && field_json[fields::locale].get().size() != 2) { return Option(400, std::string("The `locale` value of the field `") + field_json[fields::name].get() + std::string("` is not valid.")); } } if (field_json.count(fields::reference) != 0 && !field_json.at(fields::reference).is_string()) { return Option(400, "Reference should be a string."); } else if (field_json.count(fields::reference) == 0) { field_json[fields::reference] = ""; } if (field_json.count(fields::range_index) != 0) { if (!field_json.at(fields::range_index).is_boolean()) { return Option(400, std::string("The `range_index` property of the field `") + field_json[fields::name].get() + std::string("` should be a boolean.")); } auto const& type = field_json["type"]; if (field_json[fields::range_index] && type != field_types::INT32 && type != field_types::INT32_ARRAY && type != field_types::INT64 && type != field_types::INT64_ARRAY && type != field_types::FLOAT && type != field_types::FLOAT_ARRAY) { return Option(400, std::string("The `range_index` property is only allowed for the numerical fields`")); } } else { field_json[fields::range_index] = false; } if(field_json["name"] == ".*") { if(field_json.count(fields::facet) == 0) { field_json[fields::facet] = false; } if(field_json.count(fields::optional) == 0) { field_json[fields::optional] = true; } if(field_json.count(fields::index) == 0) { field_json[fields::index] = true; } if(field_json.count(fields::locale) == 0) { field_json[fields::locale] = ""; } if(field_json.count(fields::sort) == 0) { field_json[fields::sort] = false; } if(field_json.count(fields::infix) == 0) { field_json[fields::infix] = false; } if(field_json[fields::optional] == false) { return Option(400, "Field `.*` must be an optional field."); } if(field_json[fields::facet] == true) { return Option(400, "Field `.*` cannot be a facet field."); } if(field_json[fields::index] == false) { return Option(400, "Field `.*` must be an index field."); } if (!field_json[fields::reference].get().empty()) { return Option(400, "Field `.*` cannot be a reference field."); } field fallback_field(field_json["name"], field_json["type"], field_json["facet"], field_json["optional"], field_json[fields::index], field_json[fields::locale], field_json[fields::sort], field_json[fields::infix]); if(fallback_field.has_valid_type()) { fallback_field_type = fallback_field.type; num_auto_detect_fields++; } else { return Option(400, "The `type` of field `.*` is invalid."); } the_fields.emplace_back(fallback_field); return Option(true); } if(field_json.count(fields::facet) == 0) { field_json[fields::facet] = false; } if(field_json.count(fields::index) == 0) { field_json[fields::index] = true; } if(field_json.count(fields::locale) == 0) { field_json[fields::locale] = ""; } if(field_json.count(fields::sort) == 0) { if(field_json["type"] == field_types::INT32 || field_json["type"] == field_types::INT64 || field_json["type"] == field_types::FLOAT || field_json["type"] == field_types::BOOL || field_json["type"] == field_types::GEOPOINT || field_json["type"] == field_types::GEOPOINT_ARRAY) { if((field_json.count(fields::num_dim) == 0) || (field_json[fields::facet])) { field_json[fields::sort] = true; } else { field_json[fields::sort] = false; } } else { field_json[fields::sort] = false; } } if(field_json.count(fields::infix) == 0) { field_json[fields::infix] = false; } if(field_json[fields::type] == field_types::OBJECT || field_json[fields::type] == field_types::OBJECT_ARRAY) { if(!enable_nested_fields) { return Option(400, "Type `object` or `object[]` can be used only when nested fields are enabled by " "setting` enable_nested_fields` to true."); } } if(field_json.count(fields::embed) != 0) { if(!field_json[fields::embed].is_object()) { return Option(400, "Property `" + fields::embed + "` must be an object."); } auto& embed_json = field_json[fields::embed]; if(field_json[fields::embed].count(fields::from) == 0) { return Option(400, "Property `" + fields::embed + "` must contain a `" + fields::from + "` property."); } if(!field_json[fields::embed][fields::from].is_array()) { return Option(400, "Property `" + fields::embed + "." + fields::from + "` must be an array."); } if(field_json[fields::embed][fields::from].empty()) { return Option(400, "Property `" + fields::embed + "." + fields::from + "` must have at least one element."); } if(embed_json.count(fields::model_config) == 0) { return Option(400, "Property `" + fields::embed + "." + fields::model_config + "` not found."); } auto& model_config = embed_json[fields::model_config]; if(model_config.count(fields::model_name) == 0) { return Option(400, "Property `" + fields::embed + "." + fields::model_config + "." + fields::model_name + "`not found"); } if(!model_config[fields::model_name].is_string()) { return Option(400, "Property `" + fields::embed + "." + fields::model_config + "." + fields::model_name + "` must be a string."); } if(model_config[fields::model_name].get().empty()) { return Option(400, "Property `" + fields::embed + "." + fields::model_config + "." + fields::model_name + "` cannot be empty."); } if(model_config.count(fields::indexing_prefix) != 0) { if(!model_config[fields::indexing_prefix].is_string()) { return Option(400, "Property `" + fields::embed + "." + fields::model_config + "." + fields::indexing_prefix + "` must be a string."); } } if(model_config.count(fields::query_prefix) != 0) { if(!model_config[fields::query_prefix].is_string()) { return Option(400, "Property `" + fields::embed + "." + fields::model_config + "." + fields::query_prefix + "` must be a string."); } } for(auto& embed_from_field : field_json[fields::embed][fields::from]) { if(!embed_from_field.is_string()) { return Option(400, "Property `" + fields::embed + "." + fields::from + "` must contain only field names as strings."); } } } auto DEFAULT_VEC_DIST_METRIC = magic_enum::enum_name(vector_distance_type_t::cosine); if(field_json.count(fields::num_dim) == 0) { field_json[fields::num_dim] = 0; field_json[fields::vec_dist] = DEFAULT_VEC_DIST_METRIC; } else { if(!field_json[fields::num_dim].is_number_unsigned() || field_json[fields::num_dim] == 0) { return Option(400, "Property `" + fields::num_dim + "` must be a positive integer."); } if(field_json[fields::type] != field_types::FLOAT_ARRAY) { return Option(400, "Property `" + fields::num_dim + "` is only allowed on a float array field."); } if(field_json[fields::facet].get()) { return Option(400, "Property `" + fields::facet + "` is not allowed on a vector field."); } if(field_json[fields::sort].get()) { return Option(400, "Property `" + fields::sort + "` cannot be enabled on a vector field."); } if(field_json.count(fields::vec_dist) == 0) { field_json[fields::vec_dist] = DEFAULT_VEC_DIST_METRIC; } else { if(!field_json[fields::vec_dist].is_string()) { return Option(400, "Property `" + fields::vec_dist + "` must be a string."); } auto vec_dist_op = magic_enum::enum_cast(field_json[fields::vec_dist].get()); if(!vec_dist_op.has_value()) { return Option(400, "Property `" + fields::vec_dist + "` is invalid."); } } } if(field_json.count(fields::optional) == 0) { // dynamic type fields are always optional bool is_dynamic = field::is_dynamic(field_json[fields::name], field_json[fields::type]); field_json[fields::optional] = is_dynamic; } bool is_obj = field_json[fields::type] == field_types::OBJECT || field_json[fields::type] == field_types::OBJECT_ARRAY; bool is_regexp_name = field_json[fields::name].get().find(".*") != std::string::npos; if (is_regexp_name && !field_json[fields::reference].get().empty()) { return Option(400, "Wildcard field cannot have a reference."); } if(is_obj || (!is_regexp_name && enable_nested_fields && field_json[fields::name].get().find('.') != std::string::npos)) { field_json[fields::nested] = true; field_json[fields::nested_array] = field::VAL_UNKNOWN; // unknown, will be resolved during read } else { field_json[fields::nested] = false; field_json[fields::nested_array] = 0; } if(field_json[fields::type] == field_types::GEOPOINT && field_json[fields::sort] == false) { LOG(WARNING) << "Forcing geopoint field `" << field_json[fields::name].get() << "` to be sortable."; field_json[fields::sort] = true; } auto vec_dist = magic_enum::enum_cast(field_json[fields::vec_dist].get()).value(); if (!field_json[fields::reference].get().empty()) { std::vector tokens; StringUtils::split(field_json[fields::reference].get(), tokens, "."); if (tokens.size() < 2) { return Option(400, "Invalid reference `" + field_json[fields::reference].get() + "`."); } } the_fields.emplace_back( field(field_json[fields::name], field_json[fields::type], field_json[fields::facet], field_json[fields::optional], field_json[fields::index], field_json[fields::locale], field_json[fields::sort], field_json[fields::infix], field_json[fields::nested], field_json[fields::nested_array], field_json[fields::num_dim], vec_dist, field_json[fields::reference], field_json[fields::embed], field_json[fields::range_index]) ); if (!field_json[fields::reference].get().empty()) { // Add a reference helper field in the schema. It stores the doc id of the document it references to reduce the // computation while searching. the_fields.emplace_back( field(field_json[fields::name].get() + Collection::REFERENCE_HELPER_FIELD_SUFFIX, "int64", false, field_json[fields::optional], true) ); } return Option(true); } bool field::flatten_obj(nlohmann::json& doc, nlohmann::json& value, bool has_array, bool has_obj_array, const field& the_field, const std::string& flat_name, const std::unordered_map& dyn_fields, std::unordered_map& flattened_fields) { if(value.is_object()) { has_obj_array = has_array; for(const auto& kv: value.items()) { flatten_obj(doc, kv.value(), has_array, has_obj_array, the_field, flat_name + "." + kv.key(), dyn_fields, flattened_fields); } } else if(value.is_array()) { for(const auto& kv: value.items()) { flatten_obj(doc, kv.value(), true, has_obj_array, the_field, flat_name, dyn_fields, flattened_fields); } } else { // must be a primitive if(doc.count(flat_name) != 0 && flattened_fields.find(flat_name) == flattened_fields.end()) { return true; } std::string detected_type; bool found_dynamic_field = false; for(auto dyn_field_it = dyn_fields.begin(); dyn_field_it != dyn_fields.end(); dyn_field_it++) { auto& dynamic_field = dyn_field_it->second; if(dynamic_field.is_auto() || dynamic_field.is_string_star()) { continue; } if(std::regex_match(flat_name, std::regex(flat_name))) { detected_type = dynamic_field.type; found_dynamic_field = true; break; } } if(!found_dynamic_field) { if(!field::get_type(value, detected_type)) { return false; } if(std::isalnum(detected_type.back()) && has_array) { // convert singular type to multi valued type detected_type += "[]"; } } if(has_array) { doc[flat_name].push_back(value); } else { doc[flat_name] = value; } field flattened_field = the_field; flattened_field.name = flat_name; flattened_field.type = detected_type; flattened_field.optional = true; flattened_field.nested = true; flattened_field.nested_array = has_obj_array; flattened_field.set_computed_defaults(-1, -1); flattened_fields[flat_name] = flattened_field; } return true; } Option field::flatten_field(nlohmann::json& doc, nlohmann::json& obj, const field& the_field, std::vector& path_parts, size_t path_index, bool has_array, bool has_obj_array, const std::unordered_map& dyn_fields, std::unordered_map& flattened_fields) { if(path_index == path_parts.size()) { // end of path: check if obj matches expected type std::string detected_type; bool found_dynamic_field = false; for(auto dyn_field_it = dyn_fields.begin(); dyn_field_it != dyn_fields.end(); dyn_field_it++) { auto& dynamic_field = dyn_field_it->second; if(dynamic_field.is_auto() || dynamic_field.is_string_star()) { continue; } if(std::regex_match(the_field.name, std::regex(dynamic_field.name))) { detected_type = obj.is_object() ? field_types::OBJECT : dynamic_field.type; found_dynamic_field = true; break; } } if(!found_dynamic_field) { if(!field::get_type(obj, detected_type)) { if(obj.is_null() && the_field.optional) { // null values are allowed only if field is optional return Option(true); } return Option(400, "Field `" + the_field.name + "` has an incorrect type."); } if(std::isalnum(detected_type.back()) && has_array) { // convert singular type to multi valued type detected_type += "[]"; } } has_obj_array = has_obj_array || ((detected_type == field_types::OBJECT) && has_array); // handle differences in detection of numerical types bool is_numericaly_valid = (detected_type != the_field.type) && ( (detected_type == field_types::INT64 && (the_field.type == field_types::INT32 || the_field.type == field_types::FLOAT)) || (detected_type == field_types::INT64_ARRAY && (the_field.type == field_types::INT32_ARRAY || the_field.type == field_types::FLOAT_ARRAY)) || (detected_type == field_types::FLOAT_ARRAY && the_field.type == field_types::GEOPOINT_ARRAY) || (detected_type == field_types::FLOAT_ARRAY && the_field.type == field_types::GEOPOINT && !has_obj_array) ); if(detected_type == the_field.type || is_numericaly_valid) { if(the_field.is_object()) { flatten_obj(doc, obj, has_array, has_obj_array, the_field, the_field.name, dyn_fields, flattened_fields); } else { if(doc.count(the_field.name) != 0 && flattened_fields.find(the_field.name) == flattened_fields.end()) { return Option(true); } if(has_array) { doc[the_field.name].push_back(obj); } else { doc[the_field.name] = obj; } field flattened_field = the_field; flattened_field.type = detected_type; flattened_field.nested = (path_index > 1); flattened_field.nested_array = has_obj_array; flattened_fields[the_field.name] = flattened_field; } return Option(true); } else { if(has_obj_array && !the_field.is_array()) { return Option(400, "Field `" + the_field.name + "` has an incorrect type. " "Hint: field inside an array of objects must be an array type as well."); } return Option(400, "Field `" + the_field.name + "` has an incorrect type."); } } const std::string& fragment = path_parts[path_index]; const auto& it = obj.find(fragment); if(it != obj.end()) { if(it.value().is_array()) { if(it.value().empty()) { return Option(404, "Field `" + the_field.name + "` not found."); } has_array = true; for(auto& ele: it.value()) { has_obj_array = has_obj_array || ele.is_object(); Option op = flatten_field(doc, ele, the_field, path_parts, path_index + 1, has_array, has_obj_array, dyn_fields, flattened_fields); if(!op.ok()) { return op; } } return Option(true); } else { return flatten_field(doc, it.value(), the_field, path_parts, path_index + 1, has_array, has_obj_array, dyn_fields, flattened_fields); } } { return Option(404, "Field `" + the_field.name + "` not found."); } } Option field::flatten_doc(nlohmann::json& document, const tsl::htrie_map& nested_fields, const std::unordered_map& dyn_fields, bool missing_is_ok, std::vector& flattened_fields) { std::unordered_map flattened_fields_map; for(auto& nested_field: nested_fields) { std::vector field_parts; StringUtils::split(nested_field.name, field_parts, "."); if(field_parts.size() > 1 && document.count(nested_field.name) != 0) { // skip explicitly present nested fields continue; } auto op = flatten_field(document, document, nested_field, field_parts, 0, false, false, dyn_fields, flattened_fields_map); if(op.ok()) { continue; } if(op.code() == 404 && (missing_is_ok || nested_field.optional)) { continue; } else { return op; } } document[".flat"] = nlohmann::json::array(); for(auto& kv: flattened_fields_map) { document[".flat"].push_back(kv.second.name); flattened_fields.push_back(kv.second); } return Option(true); } void field::compact_nested_fields(tsl::htrie_map& nested_fields) { std::vector nested_fields_vec; for(const auto& f: nested_fields) { nested_fields_vec.push_back(f.name); } for(auto& field_name: nested_fields_vec) { nested_fields.erase_prefix(field_name + "."); } } Option field::json_fields_to_fields(bool enable_nested_fields, nlohmann::json &fields_json, string &fallback_field_type, std::vector& the_fields) { size_t num_auto_detect_fields = 0; std::vector> embed_json_field_indices; for(size_t i = 0; i < fields_json.size(); i++) { nlohmann::json& field_json = fields_json[i]; auto op = json_field_to_field(enable_nested_fields, field_json, the_fields, fallback_field_type, num_auto_detect_fields); if(!op.ok()) { return op; } if(!the_fields.empty() && !the_fields.back().embed.empty()) { embed_json_field_indices.emplace_back(i, i); } } const tsl::htrie_map dummy_search_schema; auto validation_op = field::validate_and_init_embed_fields(embed_json_field_indices, dummy_search_schema, fields_json, the_fields); if(!validation_op.ok()) { return validation_op; } if(num_auto_detect_fields > 1) { return Option(400,"There can be only one field named `.*`."); } return Option(true); } Option field::validate_and_init_embed_fields(const std::vector>& embed_json_field_indices, const tsl::htrie_map& search_schema, nlohmann::json& fields_json, std::vector& fields_vec) { for(const auto& json_field_index: embed_json_field_indices) { auto& field_json = fields_json[json_field_index.first]; const std::string err_msg = "Property `" + fields::embed + "." + fields::from + "` can only refer to string or string array fields."; LOG(INFO) << "field_json: " << field_json; for(auto& field_name : field_json[fields::embed][fields::from].get>()) { auto embed_field = std::find_if(fields_json.begin(), fields_json.end(), [&field_name](const nlohmann::json& x) { return x["name"].get() == field_name; }); if(embed_field == fields_json.end()) { const auto& embed_field2 = search_schema.find(field_name); if (embed_field2 == search_schema.end()) { return Option(400, err_msg); } else if (embed_field2->type != field_types::STRING && embed_field2->type != field_types::STRING_ARRAY) { return Option(400, err_msg); } } else if((*embed_field)[fields::type] != field_types::STRING && (*embed_field)[fields::type] != field_types::STRING_ARRAY) { return Option(400, err_msg); } } const auto& model_config = field_json[fields::embed][fields::model_config]; size_t num_dim = 0; auto res = TextEmbedderManager::validate_and_init_model(model_config, num_dim); if(!res.ok()) { return Option(res.code(), res.error()); } LOG(INFO) << "Model init done."; field_json[fields::num_dim] = num_dim; fields_vec[json_field_index.second].num_dim = num_dim; } return Option(true); }