Extract doc validation functions out.

This commit is contained in:
Kishore Nallan 2023-03-05 10:55:42 +05:30
parent be9ba98a25
commit f48b9ffe6e
8 changed files with 635 additions and 597 deletions

View File

@ -3,6 +3,7 @@ build --action_env=BAZEL_CXXOPTS="-std=c++17"
build --define=TYPESENSE_VERSION=\"nightly\"
build --cxxopt="-std=c++17"
test --jobs=6
build --enable_platform_specific_config
build:linux --action_env=BAZEL_LINKLIBS="-l%:libstdc++.a -l%:libgcc.a"

View File

@ -6,6 +6,7 @@
#include "option.h"
#include "string_utils.h"
#include "logger.h"
#include "store.h"
#include <sparsepp.h>
#include <tsl/htrie_map.h>
#include "json.hpp"

View File

@ -13,6 +13,7 @@
#include <topster.h>
#include <json.hpp>
#include <field.h>
#include <validator.h>
#include <option.h>
#include <set>
#include "string_utils.h"
@ -186,21 +187,6 @@ struct search_args {
};
};
// Kind of write being applied to a document.
// Enumerator values are spelled out; they match the implicit numbering.
enum index_operation_t {
    CREATE = 0,
    UPSERT = 1,
    UPDATE = 2,   // document validation tolerates absent fields for this operation
    EMPLACE = 3,  // document validation tolerates absent fields for this operation
    DELETE = 4
};
// Policy applied by the coerce_* helpers when a value does not match the field's declared type.
enum class DIRTY_VALUES {
    // fail with a 400 without attempting any conversion
    REJECT = 1,
    // remove the value when the field is optional, otherwise fail with a 400
    DROP = 2,
    // attempt a conversion; fail with a 400 when that is not possible
    COERCE_OR_REJECT = 3,
    // attempt a conversion; when that is not possible remove the value if the field is optional
    COERCE_OR_DROP = 4
};
struct offsets_facet_hashes_t {
std::unordered_map<std::string, std::vector<uint32_t>> offsets;
std::vector<uint64_t> facet_hashes;
@ -520,33 +506,6 @@ private:
static void get_doc_changes(const index_operation_t op, nlohmann::json &update_doc,
const nlohmann::json &old_doc, nlohmann::json &new_doc, nlohmann::json &del_doc);
static Option<uint32_t> coerce_string(const DIRTY_VALUES& dirty_values, const std::string& fallback_field_type,
const field& a_field, nlohmann::json &document,
const std::string &field_name,
nlohmann::json::iterator& array_iter,
bool is_array,
bool& array_ele_erased);
static Option<uint32_t> coerce_int32_t(const DIRTY_VALUES& dirty_values, const field& a_field, nlohmann::json &document,
const std::string &field_name,
nlohmann::json::iterator& array_iter, bool is_array, bool& array_ele_erased);
static Option<uint32_t> coerce_int64_t(const DIRTY_VALUES& dirty_values, const field& a_field, nlohmann::json &document,
const std::string &field_name,
nlohmann::json::iterator& array_iter, bool is_array, bool& array_ele_erased);
static Option<uint32_t> coerce_float(const DIRTY_VALUES& dirty_values, const field& a_field, nlohmann::json &document,
const std::string &field_name,
nlohmann::json::iterator& array_iter, bool is_array, bool& array_ele_erased);
static Option<uint32_t> coerce_bool(const DIRTY_VALUES& dirty_values, const field& a_field, nlohmann::json &document,
const std::string &field_name,
nlohmann::json::iterator& array_iter, bool is_array, bool& array_ele_erased);
static Option<uint32_t> coerce_geopoint(const DIRTY_VALUES& dirty_values, const field& a_field, nlohmann::json &document,
const std::string &field_name,
nlohmann::json::iterator& array_iter, bool is_array, bool& array_ele_erased);
bool common_results_exist(std::vector<art_leaf*>& leaves, bool must_match_phrase) const;
static void remove_facet_token(const field& search_field, spp::sparse_hash_map<std::string, art_tree*>& search_index,
@ -719,13 +678,6 @@ public:
// the following methods are not synchronized because their parent calls are synchronized or they are const/static
static Option<uint32_t> validate_index_in_memory(nlohmann::json &document, uint32_t seq_id,
const std::string & default_sorting_field,
const tsl::htrie_map<char, field> & search_schema,
const index_operation_t op,
const std::string& fallback_field_type,
const DIRTY_VALUES& dirty_values);
void search_wildcard(filter_node_t const* const& filter_tree_root,
const std::map<size_t, std::map<size_t, uint32_t>>& included_ids_map,
const std::vector<sort_by>& sort_fields, Topster* topster, Topster* curated_topster,

67
include/validator.h Normal file
View File

@ -0,0 +1,67 @@
#pragma once
#include "option.h"
#include <cctype>
#include "json.hpp"
#include "tsl/htrie_map.h"
#include "field.h"
// Kind of write being applied to a document.
// Enumerator values are spelled out; they match the implicit numbering.
enum index_operation_t {
    CREATE = 0,
    UPSERT = 1,
    UPDATE = 2,
    EMPLACE = 3,
    DELETE = 4
};
// Policy applied by the validator's coerce_* helpers when a value does not match the field's declared type.
enum class DIRTY_VALUES {
    // fail with a 400 without attempting any conversion
    REJECT = 1,
    // remove the value when the field is optional, otherwise fail with a 400
    DROP = 2,
    // attempt a conversion; fail with a 400 when that is not possible
    COERCE_OR_REJECT = 3,
    // attempt a conversion; when that is not possible remove the value if the field is optional
    COERCE_OR_DROP = 4
};
// Stateless collection of document validation / type coercion routines.
// All members are static; every routine reports its outcome as an Option<uint32_t>
// (code 200 on success, code 400 plus a message when the document is rejected).
class validator_t {
public:
// Validates `document` against every field of `search_schema` and coerces or drops
// mismatching values according to `dirty_values`. The document may be modified in place.
// NOTE(review): presumably the former Index::validate_index_in_memory moved here unchanged -- confirm
// against validator.cpp.
static Option<uint32_t> validate_index_in_memory(nlohmann::json &document, uint32_t seq_id,
const std::string & default_sorting_field,
const tsl::htrie_map<char, field> & search_schema,
const index_operation_t op,
const std::string& fallback_field_type,
const DIRTY_VALUES& dirty_values);
// Checks the value `doc_ele` of field `a_field` against the field's declared type and
// dispatches to the matching coerce_* routine (element by element for array types).
// The coerce_* routines address the value through `document[a_field.name]`.
static Option<uint32_t> coerce_element(const field& a_field, nlohmann::json& document,
nlohmann::json& doc_ele,
const std::string& fallback_field_type,
const DIRTY_VALUES& dirty_values);
// The coerce_* routines below share one contract:
//  - `is_array == false`: the value handled is `document[field_name]`;
//  - `is_array == true` : the value handled is the element `array_iter` points at;
//  - when an array element is removed, `array_iter` is re-assigned to the iterator returned
//    by erase() and `array_ele_erased` is set to true so the caller does not advance it again.
// Converts integer, float and boolean values to their string form.
static Option<uint32_t> coerce_string(const DIRTY_VALUES& dirty_values, const std::string& fallback_field_type,
const field& a_field, nlohmann::json &document,
const std::string &field_name,
nlohmann::json::iterator& array_iter,
bool is_array,
bool& array_ele_erased);
// Converts float, boolean and int32-looking string values to an integer.
static Option<uint32_t> coerce_int32_t(const DIRTY_VALUES& dirty_values, const field& a_field, nlohmann::json &document,
const std::string &field_name,
nlohmann::json::iterator& array_iter, bool is_array, bool& array_ele_erased);
// 64-bit counterpart of coerce_int32_t.
static Option<uint32_t> coerce_int64_t(const DIRTY_VALUES& dirty_values, const field& a_field, nlohmann::json &document,
const std::string &field_name,
nlohmann::json::iterator& array_iter, bool is_array, bool& array_ele_erased);
// Coercion towards a float value.
static Option<uint32_t> coerce_float(const DIRTY_VALUES& dirty_values, const field& a_field, nlohmann::json &document,
const std::string &field_name,
nlohmann::json::iterator& array_iter, bool is_array, bool& array_ele_erased);
// Coercion towards a boolean value.
static Option<uint32_t> coerce_bool(const DIRTY_VALUES& dirty_values, const field& a_field, nlohmann::json &document,
const std::string &field_name,
nlohmann::json::iterator& array_iter, bool is_array, bool& array_ele_erased);
// Coercion of a [lat, lng] pair whose members are not both numbers.
static Option<uint32_t> coerce_geopoint(const DIRTY_VALUES& dirty_values, const field& a_field, nlohmann::json &document,
const std::string &field_name,
nlohmann::json::iterator& array_iter, bool is_array, bool& array_ele_erased);
};

View File

@ -493,7 +493,7 @@ Option<uint32_t> Collection::index_in_memory(nlohmann::json &document, uint32_t
const index_operation_t op, const DIRTY_VALUES& dirty_values) {
std::unique_lock lock(mutex);
Option<uint32_t> validation_op = Index::validate_index_in_memory(document, seq_id, default_sorting_field,
Option<uint32_t> validation_op = validator_t::validate_index_in_memory(document, seq_id, default_sorting_field,
search_schema, op,
fallback_field_type, dirty_values);
@ -3889,7 +3889,7 @@ Option<bool> Collection::validate_alter_payload(nlohmann::json& schema_changes,
}
// validate existing data on disk for compatibility via updated_search_schema
auto validate_op = Index::validate_index_in_memory(document, seq_id, default_sorting_field,
auto validate_op = validator_t::validate_index_in_memory(document, seq_id, default_sorting_field,
updated_search_schema,
index_operation_t::CREATE,
fallback_field_type,

View File

@ -708,17 +708,17 @@ bool field::flatten_obj(nlohmann::json& doc, nlohmann::json& value, bool has_arr
return true;
}
std::string detected_type;
if(!field::get_type(value, detected_type)) {
return false;
}
if(has_array) {
doc[flat_name].push_back(value);
} else {
doc[flat_name] = value;
}
std::string detected_type;
if(!field::get_type(value, detected_type)) {
return false;
}
if(std::isalnum(detected_type.back()) && has_array) {
// convert singular type to multi valued type
detected_type += "[]";
@ -746,7 +746,7 @@ Option<bool> field::flatten_field(nlohmann::json& doc, nlohmann::json& obj, cons
if(!field::get_type(obj, detected_type)) {
if(obj.is_null() && the_field.optional) {
// null values are allowed only if field is optional
return Option<bool>(false);
return Option<bool>(true);
}
return Option<bool>(400, "Field `" + the_field.name + "` has an incorrect type.");

View File

@ -406,171 +406,6 @@ bool validate_object_field(nlohmann::json& doc, const field& a_field) {
return false;
}
/**
 * Validates `document` against `search_schema`, coercing or dropping mismatching values
 * according to `dirty_values`. The document is modified in place.
 *
 * @param document              document to validate; values may be rewritten or erased
 * @param seq_id                not used by the validation itself
 * @param default_sorting_field when non-empty, must be present unless `op` is UPDATE or EMPLACE
 * @param search_schema         fields to validate (the `id` field and object fields are skipped)
 * @param op                    UPDATE and EMPLACE tolerate absent fields
 * @param fallback_field_type   forwarded to coerce_string
 * @param dirty_values          policy for values whose type does not match the schema
 * @return code 200 when the document is acceptable, code 400 with a message otherwise
 */
Option<uint32_t> Index::validate_index_in_memory(nlohmann::json& document, uint32_t seq_id,
                                                 const std::string & default_sorting_field,
                                                 const tsl::htrie_map<char, field> & search_schema,
                                                 const index_operation_t op,
                                                 const std::string& fallback_field_type,
                                                 const DIRTY_VALUES& dirty_values) {
    bool missing_default_sort_field = (!default_sorting_field.empty() && document.count(default_sorting_field) == 0);

    if((op != UPDATE && op != EMPLACE) && missing_default_sort_field) {
        return Option<>(400, "Field `" + default_sorting_field + "` has been declared as a default sorting field, "
                             "but is not found in the document.");
    }

    for(const auto& a_field: search_schema) {
        const std::string& field_name = a_field.name;

        if(field_name == "id" || a_field.is_object()) {
            continue;
        }

        if((a_field.optional || op == UPDATE || op == EMPLACE) && document.count(field_name) == 0) {
            continue;
        }

        if(document.count(field_name) == 0) {
            return Option<>(400, "Field `" + field_name + "` has been declared in the schema, "
                                 "but is not found in the document.");
        }

        if(a_field.optional && document[field_name].is_null()) {
            // we will ignore `null` on an option field
            if(op != UPDATE && op != EMPLACE) {
                // for updates, the erasure is done later since we need to keep the key for overwrite
                document.erase(field_name);
            }
            continue;
        }

        // scalar coercions do not use the iterator; it only satisfies the coerce_* signature
        nlohmann::json::iterator dummy_iter;
        bool array_ele_erased = false;

        if(a_field.type == field_types::STRING && !document[field_name].is_string()) {
            Option<uint32_t> coerce_op = coerce_string(dirty_values, fallback_field_type, a_field, document, field_name, dummy_iter, false, array_ele_erased);
            if(!coerce_op.ok()) {
                return coerce_op;
            }
        } else if(a_field.type == field_types::INT32) {
            if(!document[field_name].is_number_integer()) {
                Option<uint32_t> coerce_op = coerce_int32_t(dirty_values, a_field, document, field_name, dummy_iter, false, array_ele_erased);
                if(!coerce_op.ok()) {
                    return coerce_op;
                }
            }
        } else if(a_field.type == field_types::INT64 && !document[field_name].is_number_integer()) {
            Option<uint32_t> coerce_op = coerce_int64_t(dirty_values, a_field, document, field_name, dummy_iter, false, array_ele_erased);
            if(!coerce_op.ok()) {
                return coerce_op;
            }
        } else if(a_field.type == field_types::FLOAT && !document[field_name].is_number()) {
            // using `is_number` allows integer to be passed to a float field
            Option<uint32_t> coerce_op = coerce_float(dirty_values, a_field, document, field_name, dummy_iter, false, array_ele_erased);
            if(!coerce_op.ok()) {
                return coerce_op;
            }
        } else if(a_field.type == field_types::BOOL && !document[field_name].is_boolean()) {
            Option<uint32_t> coerce_op = coerce_bool(dirty_values, a_field, document, field_name, dummy_iter, false, array_ele_erased);
            if(!coerce_op.ok()) {
                return coerce_op;
            }
        } else if(a_field.type == field_types::GEOPOINT) {
            if(!document[field_name].is_array() || document[field_name].size() != 2) {
                return Option<>(400, "Field `" + field_name + "` must be a 2 element array: [lat, lng].");
            }

            if(!(document[field_name][0].is_number() && document[field_name][1].is_number())) {
                // one or more elements is not an number, try to coerce
                Option<uint32_t> coerce_op = coerce_geopoint(dirty_values, a_field, document, field_name, dummy_iter, false, array_ele_erased);
                if(!coerce_op.ok()) {
                    return coerce_op;
                }
            }
        } else if(a_field.is_array()) {
            if(!document[field_name].is_array()) {
                if(a_field.optional && (dirty_values == DIRTY_VALUES::DROP ||
                                        dirty_values == DIRTY_VALUES::COERCE_OR_DROP)) {
                    document.erase(field_name);
                    continue;
                } else {
                    return Option<>(400, "Field `" + field_name + "` must be an array.");
                }
            }

            nlohmann::json::iterator it = document[field_name].begin();

            // Handle a geopoint[] type inside an array of object: it won't be an array of array, so cannot iterate.
            // The size is tested before `it` is dereferenced: `begin()` of an empty array must not be dereferenced.
            if(a_field.nested && a_field.type == field_types::GEOPOINT_ARRAY &&
               document[field_name].size() == 2 && it->is_number()) {
                const auto& item = document[field_name];
                if(!(item[0].is_number() && item[1].is_number())) {
                    // one or more elements is not an number, try to coerce.
                    // The flat [lat, lng] pair is itself the geopoint, so the field is coerced as a whole
                    // (is_array == false); in array mode the coercer would index into the scalar that `it` points at.
                    Option<uint32_t> coerce_op = coerce_geopoint(dirty_values, a_field, document, field_name, dummy_iter, false, array_ele_erased);
                    if(!coerce_op.ok()) {
                        return coerce_op;
                    }
                }

                continue;
            }

            for(; it != document[field_name].end(); ) {
                const auto& item = it.value();
                array_ele_erased = false;

                if (a_field.type == field_types::STRING_ARRAY && !item.is_string()) {
                    Option<uint32_t> coerce_op = coerce_string(dirty_values, fallback_field_type, a_field, document, field_name, it, true, array_ele_erased);
                    if (!coerce_op.ok()) {
                        return coerce_op;
                    }
                } else if (a_field.type == field_types::INT32_ARRAY && !item.is_number_integer()) {
                    Option<uint32_t> coerce_op = coerce_int32_t(dirty_values, a_field, document, field_name, it, true, array_ele_erased);
                    if (!coerce_op.ok()) {
                        return coerce_op;
                    }
                } else if (a_field.type == field_types::INT64_ARRAY && !item.is_number_integer()) {
                    Option<uint32_t> coerce_op = coerce_int64_t(dirty_values, a_field, document, field_name, it, true, array_ele_erased);
                    if (!coerce_op.ok()) {
                        return coerce_op;
                    }
                } else if (a_field.type == field_types::FLOAT_ARRAY && !item.is_number()) {
                    // we check for `is_number` to allow whole numbers to be passed into float fields
                    Option<uint32_t> coerce_op = coerce_float(dirty_values, a_field, document, field_name, it, true, array_ele_erased);
                    if (!coerce_op.ok()) {
                        return coerce_op;
                    }
                } else if (a_field.type == field_types::BOOL_ARRAY && !item.is_boolean()) {
                    Option<uint32_t> coerce_op = coerce_bool(dirty_values, a_field, document, field_name, it, true, array_ele_erased);
                    if (!coerce_op.ok()) {
                        return coerce_op;
                    }
                } else if (a_field.type == field_types::GEOPOINT_ARRAY) {
                    if(!item.is_array() || item.size() != 2) {
                        return Option<>(400, "Field `" + field_name + "` must contain 2 element arrays: [ [lat, lng],... ].");
                    }

                    if(!(item[0].is_number() && item[1].is_number())) {
                        // one or more elements is not an number, try to coerce
                        Option<uint32_t> coerce_op = coerce_geopoint(dirty_values, a_field, document, field_name, it, true, array_ele_erased);
                        if(!coerce_op.ok()) {
                            return coerce_op;
                        }
                    }
                }

                if(!array_ele_erased) {
                    // if it is erased, the iterator will be reassigned
                    it++;
                }
            }
        }
    }

    return Option<>(200);
}
void Index::validate_and_preprocess(Index *index, std::vector<index_record>& iter_batch,
const size_t batch_start_index, const size_t batch_size,
const std::string& default_sorting_field,
@ -596,7 +431,7 @@ void Index::validate_and_preprocess(Index *index, std::vector<index_record>& ite
}
if(do_validation) {
Option<uint32_t> validation_op = validate_index_in_memory(index_rec.doc, index_rec.seq_id,
Option<uint32_t> validation_op = validator_t::validate_index_in_memory(index_rec.doc, index_rec.seq_id,
default_sorting_field,
search_schema,
index_rec.operation,
@ -5652,380 +5487,6 @@ void Index::refresh_schemas(const std::vector<field>& new_fields, const std::vec
}
}
/**
 * Converts a non-string value of a string field into a string, subject to `dirty_values`.
 *
 * The value handled is the element at `array_iter` when `is_array` is true, otherwise
 * `document[field_name]`. Integers, floats and booleans are convertible; anything else is
 * dropped (COERCE_OR_DROP on an optional field) or rejected.
 *
 * @param array_iter       re-assigned to the result of erase() when an array element is removed
 * @param array_ele_erased set to true when an array element is removed
 * @return code 200 when the value was converted or dropped, code 400 with a message otherwise
 */
Option<uint32_t> Index::coerce_string(const DIRTY_VALUES& dirty_values, const std::string& fallback_field_type,
                                      const field& a_field, nlohmann::json &document,
                                      const std::string &field_name, nlohmann::json::iterator& array_iter,
                                      bool is_array, bool& array_ele_erased) {
    const std::string article = is_array ? "an array of" : "a";
    auto& value = is_array ? array_iter.value() : document[field_name];

    // generic "wrong type" failure
    const auto not_a_string = [&]() {
        return Option<>(400, "Field `" + field_name + "` must be " + article + " string.");
    };

    // removes the offending value: the array element, or the whole field for scalars
    const auto discard_value = [&]() {
        if(is_array) {
            array_iter = document[field_name].erase(array_iter);
            array_ele_erased = true;
        } else {
            document.erase(field_name);
        }
    };

    switch(dirty_values) {
        case DIRTY_VALUES::REJECT:
            return not_a_string();
        case DIRTY_VALUES::DROP:
            if(!a_field.optional) {
                return not_a_string();
            }
            discard_value();
            return Option<uint32_t>(200);
        default:
            break;
    }

    // COERCE_OR_REJECT / COERCE_OR_DROP: attempt the conversion first
    if(value.is_number_integer()) {
        value = std::to_string(value.get<int64_t>());
        return Option<>(200);
    }

    if(value.is_number_float()) {
        value = StringUtils::float_to_str(value.get<float>());
        return Option<>(200);
    }

    if(value.is_boolean()) {
        value = value == true ? "true" : "false";
        return Option<>(200);
    }

    // not convertible: only an optional field under COERCE_OR_DROP may lose the value
    const bool may_discard = (dirty_values == DIRTY_VALUES::COERCE_OR_DROP) && a_field.optional;
    if(!may_discard) {
        if(a_field.nested && value.is_array()) {
            return Option<>(400, "Field `" + field_name + "` has an incorrect type. "
                                 "Hint: field inside an array of objects must be an array type as well.");
        }
        return not_a_string();
    }

    discard_value();
    return Option<>(200);
}
/**
 * Converts a non-integer value of an int32 field into an integer, subject to `dirty_values`.
 *
 * The value handled is the element at `array_iter` when `is_array` is true, otherwise
 * `document[field_name]`. Floats (truncated), booleans (1/0) and strings accepted by
 * StringUtils::is_int32_t are convertible.
 *
 * @param array_iter       re-assigned to the result of erase() when an array element is removed
 * @param array_ele_erased set to true when an array element is removed
 * @return code 200 when the value was converted or dropped, code 400 with a message otherwise
 */
Option<uint32_t> Index::coerce_int32_t(const DIRTY_VALUES& dirty_values, const field& a_field, nlohmann::json &document,
                                       const std::string &field_name,
                                       nlohmann::json::iterator& array_iter, bool is_array, bool& array_ele_erased) {
    std::string suffix = is_array ? "an array of" : "an";
    auto& item = is_array ? array_iter.value() : document[field_name];

    if(dirty_values == DIRTY_VALUES::REJECT) {
        return Option<>(400, "Field `" + field_name + "` must be " + suffix + " int32.");
    }

    if(dirty_values == DIRTY_VALUES::DROP) {
        if(!a_field.optional) {
            return Option<>(400, "Field `" + field_name + "` must be " + suffix + " int32.");
        }

        if(!is_array) {
            document.erase(field_name);
        } else {
            array_iter = document[field_name].erase(array_iter);
            array_ele_erased = true;
        }
        return Option<uint32_t>(200);
    }

    // `item` dangles once the value has been erased, so remember whether that happened
    bool value_removed = false;

    // try to value coerce into an integer
    if(item.is_number_float()) {
        // read as double: a float cannot represent every int32 exactly
        // NOTE(review): a value outside the int32 range still makes this cast undefined
        item = static_cast<int32_t>(item.get<double>());
    }
    else if(item.is_boolean()) {
        item = item == true ? 1 : 0;
    }
    else if(item.is_string() && StringUtils::is_int32_t(item)) {
        item = std::atol(item.get<std::string>().c_str());
    }
    else {
        if(dirty_values == DIRTY_VALUES::COERCE_OR_DROP) {
            if(!a_field.optional) {
                return Option<>(400, "Field `" + field_name + "` must be " + suffix + " int32.");
            }

            if(!is_array) {
                document.erase(field_name);
            } else {
                array_iter = document[field_name].erase(array_iter);
                array_ele_erased = true;
            }
            value_removed = true;
        } else {
            // COERCE_OR_REJECT / non-optional + DROP
            return Option<>(400, "Field `" + field_name + "` must be " + suffix + " int32.");
        }
    }

    // Range check on the coerced value itself. In array mode `document[field_name]` is the whole
    // array, which cannot be read as an integer, so the element is inspected instead.
    if(!value_removed && item.get<int64_t>() > INT32_MAX) {
        // NOTE(review): DROP has already returned above, so only COERCE_OR_REJECT can take the
        // erase branch here -- confirm that COERCE_OR_DROP was not the intended policy.
        if(a_field.optional && (dirty_values == DIRTY_VALUES::DROP || dirty_values == DIRTY_VALUES::COERCE_OR_REJECT)) {
            if(!is_array) {
                document.erase(field_name);
            } else {
                array_iter = document[field_name].erase(array_iter);
                array_ele_erased = true;
            }
        } else {
            return Option<>(400, "Field `" + field_name + "` exceeds maximum value of int32.");
        }
    }

    return Option<uint32_t>(200);
}
/**
 * Converts a non-integer value of an int64 field into an integer, subject to `dirty_values`.
 *
 * The value handled is the element at `array_iter` when `is_array` is true, otherwise
 * `document[field_name]`. Floats (truncated), booleans (1/0) and strings accepted by
 * StringUtils::is_int64_t are convertible.
 *
 * @param array_iter       re-assigned to the result of erase() when an array element is removed
 * @param array_ele_erased set to true when an array element is removed
 * @return code 200 when the value was converted or dropped, code 400 with a message otherwise
 */
Option<uint32_t> Index::coerce_int64_t(const DIRTY_VALUES& dirty_values, const field& a_field, nlohmann::json &document,
                                       const std::string &field_name,
                                       nlohmann::json::iterator& array_iter, bool is_array, bool& array_ele_erased) {
    std::string suffix = is_array ? "an array of" : "an";
    auto& item = is_array ? array_iter.value() : document[field_name];

    if(dirty_values == DIRTY_VALUES::REJECT) {
        return Option<>(400, "Field `" + field_name + "` must be " + suffix + " int64.");
    }

    if(dirty_values == DIRTY_VALUES::DROP) {
        if(!a_field.optional) {
            return Option<>(400, "Field `" + field_name + "` must be " + suffix + " int64.");
        }

        if(!is_array) {
            document.erase(field_name);
        } else {
            array_iter = document[field_name].erase(array_iter);
            array_ele_erased = true;
        }
        return Option<uint32_t>(200);
    }

    // try to value coerce into an integer
    if(item.is_number_float()) {
        // read as double: narrowing to float first keeps only 24 significant bits and
        // silently alters large whole numbers before they are truncated
        item = static_cast<int64_t>(item.get<double>());
    }
    else if(item.is_boolean()) {
        item = item == true ? 1 : 0;
    }
    else if(item.is_string() && StringUtils::is_int64_t(item)) {
        item = std::atoll(item.get<std::string>().c_str());
    }
    else {
        if(dirty_values == DIRTY_VALUES::COERCE_OR_DROP) {
            if(!a_field.optional) {
                return Option<>(400, "Field `" + field_name + "` must be " + suffix + " int64.");
            }

            if(!is_array) {
                document.erase(field_name);
            } else {
                array_iter = document[field_name].erase(array_iter);
                array_ele_erased = true;
            }
        } else {
            // COERCE_OR_REJECT / non-optional + DROP
            return Option<>(400, "Field `" + field_name + "` must be " + suffix + " int64.");
        }
    }

    return Option<uint32_t>(200);
}
/**
 * Converts a non-boolean value of a bool field into a boolean, subject to `dirty_values`.
 *
 * The value handled is the element at `array_iter` when `is_array` is true, otherwise
 * `document[field_name]`. The integers 1 and 0 and the strings "true"/"false" (compared
 * after StringUtils::tolowercase) are convertible. Every other value, including any other
 * string, follows the drop/reject policy like in the sibling coercers.
 *
 * @param array_iter       re-assigned to the result of erase() when an array element is removed
 * @param array_ele_erased set to true when an array element is removed
 * @return code 200 when the value was converted or dropped, code 400 with a message otherwise
 */
Option<uint32_t> Index::coerce_bool(const DIRTY_VALUES& dirty_values, const field& a_field, nlohmann::json &document,
                                    const std::string &field_name,
                                    nlohmann::json::iterator& array_iter, bool is_array, bool& array_ele_erased) {
    // "an array of" matches the wording used by the string and integer coercers
    std::string suffix = is_array ? "an array of" : "a";
    auto& item = is_array ? array_iter.value() : document[field_name];

    if(dirty_values == DIRTY_VALUES::REJECT) {
        return Option<>(400, "Field `" + field_name + "` must be " + suffix + " bool.");
    }

    if(dirty_values == DIRTY_VALUES::DROP) {
        if(!a_field.optional) {
            return Option<>(400, "Field `" + field_name + "` must be " + suffix + " bool.");
        }

        if(!is_array) {
            document.erase(field_name);
        } else {
            array_iter = document[field_name].erase(array_iter);
            array_ele_erased = true;
        }
        return Option<uint32_t>(200);
    }

    // try to value coerce into a bool
    if (item.is_number_integer() &&
        (item.get<int64_t>() == 1 || item.get<int64_t>() == 0)) {
        item = item.get<int64_t>() == 1;
        return Option<uint32_t>(200);
    }

    if(item.is_string()) {
        std::string str_val = item.get<std::string>();
        StringUtils::tolowercase(str_val);
        if(str_val == "true") {
            item = true;
            return Option<uint32_t>(200);
        }

        if(str_val == "false") {
            item = false;
            return Option<uint32_t>(200);
        }
        // any other string is not convertible: fall through to the drop/reject policy below
    }

    if(dirty_values == DIRTY_VALUES::COERCE_OR_DROP) {
        if(!a_field.optional) {
            return Option<>(400, "Field `" + field_name + "` must be " + suffix + " bool.");
        }

        if(!is_array) {
            document.erase(field_name);
        } else {
            array_iter = document[field_name].erase(array_iter);
            array_ele_erased = true;
        }
    } else {
        // COERCE_OR_REJECT / non-optional + DROP
        return Option<>(400, "Field `" + field_name + "` must be " + suffix + " bool.");
    }

    return Option<uint32_t>(200);
}
/**
 * Converts the string members of a [lat, lng] pair into numbers, subject to `dirty_values`.
 *
 * The pair handled is the element at `array_iter` when `is_array` is true, otherwise
 * `document[field_name]`. The code indexes positions 0 and 1 of the pair, so callers must
 * pass a 2 element array. A member is converted when it is a string accepted by
 * StringUtils::is_float; if either member is still not a number afterwards the pair is
 * dropped (COERCE_OR_DROP on an optional field) or rejected.
 *
 * @param array_iter       re-assigned to the result of erase() when an array element is removed
 * @param array_ele_erased set to true when an array element is removed
 * @return code 200 when the pair was converted or dropped, code 400 with a message otherwise
 */
Option<uint32_t> Index::coerce_geopoint(const DIRTY_VALUES& dirty_values, const field& a_field, nlohmann::json &document,
                                        const std::string &field_name,
                                        nlohmann::json::iterator& array_iter, bool is_array, bool& array_ele_erased) {
    std::string suffix = is_array ? "an array of" : "a";
    auto& item = is_array ? array_iter.value() : document[field_name];

    if(dirty_values == DIRTY_VALUES::REJECT) {
        return Option<>(400, "Field `" + field_name + "` must be " + suffix + " geopoint.");
    }

    if(dirty_values == DIRTY_VALUES::DROP) {
        if(!a_field.optional) {
            return Option<>(400, "Field `" + field_name + "` must be " + suffix + " geopoint.");
        }

        if(!is_array) {
            document.erase(field_name);
        } else {
            array_iter = document[field_name].erase(array_iter);
            array_ele_erased = true;
        }
        return Option<uint32_t>(200);
    }

    // try to value coerce into a geopoint: same treatment for latitude (0) and longitude (1)
    for(size_t coord_index = 0; coord_index < 2; coord_index++) {
        auto& coordinate = item[coord_index];
        if(coordinate.is_string() && StringUtils::is_float(coordinate)) {
            // parse as double: single precision would discard digits of the coordinate
            // NOTE(review): std::stod throws std::out_of_range on overflow; whether is_float
            // can accept such input is not visible here
            coordinate = std::stod(coordinate.get<std::string>());
        }
    }

    if(!item[0].is_number() || !item[1].is_number()) {
        if(dirty_values == DIRTY_VALUES::COERCE_OR_DROP) {
            if(!a_field.optional) {
                return Option<>(400, "Field `" + field_name + "` must be " + suffix + " geopoint.");
            }

            if(!is_array) {
                document.erase(field_name);
            } else {
                array_iter = document[field_name].erase(array_iter);
                array_ele_erased = true;
            }
        } else {
            // COERCE_OR_REJECT / non-optional + DROP
            return Option<>(400, "Field `" + field_name + "` must be " + suffix + " geopoint.");
        }
    }

    return Option<uint32_t>(200);
}
/**
 * Converts a non-numeric value of a float field into a float, subject to `dirty_values`.
 *
 * The value handled is the element at `array_iter` when `is_array` is true, otherwise
 * `document[field_name]`. Strings accepted by StringUtils::is_float and booleans (1.0/0.0)
 * are convertible; anything else is dropped (COERCE_OR_DROP on an optional field) or rejected.
 *
 * @param array_iter       re-assigned to the result of erase() when an array element is removed
 * @param array_ele_erased set to true when an array element is removed
 * @return code 200 when the value was converted or dropped, code 400 with a message otherwise
 */
Option<uint32_t> Index::coerce_float(const DIRTY_VALUES& dirty_values, const field& a_field, nlohmann::json &document,
                                     const std::string &field_name,
                                     nlohmann::json::iterator& array_iter, bool is_array, bool& array_ele_erased) {
    // "an array of" matches the wording used by the string and integer coercers
    std::string suffix = is_array ? "an array of" : "a";
    auto& item = is_array ? array_iter.value() : document[field_name];

    if(dirty_values == DIRTY_VALUES::REJECT) {
        return Option<>(400, "Field `" + field_name + "` must be " + suffix + " float.");
    }

    if(dirty_values == DIRTY_VALUES::DROP) {
        if(!a_field.optional) {
            return Option<>(400, "Field `" + field_name + "` must be " + suffix + " float.");
        }

        if(!is_array) {
            document.erase(field_name);
        } else {
            array_iter = document[field_name].erase(array_iter);
            array_ele_erased = true;
        }
        return Option<uint32_t>(200);
    }

    // try to value coerce into a float
    if(item.is_string() && StringUtils::is_float(item)) {
        item = std::atof(item.get<std::string>().c_str());
    }
    else if(item.is_boolean()) {
        item = item == true ? 1.0 : 0.0;
    }
    else {
        if(dirty_values == DIRTY_VALUES::COERCE_OR_DROP) {
            if(!a_field.optional) {
                return Option<>(400, "Field `" + field_name + "` must be " + suffix + " float.");
            }

            if(!is_array) {
                document.erase(field_name);
            } else {
                array_iter = document[field_name].erase(array_iter);
                array_ele_erased = true;
            }
        } else {
            // COERCE_OR_REJECT / non-optional + DROP
            return Option<>(400, "Field `" + field_name + "` must be " + suffix + " float.");
        }
    }

    return Option<uint32_t>(200);
}
void Index::get_doc_changes(const index_operation_t op, nlohmann::json& update_doc,
const nlohmann::json& old_doc, nlohmann::json& new_doc, nlohmann::json& del_doc) {

556
src/validator.cpp Normal file
View File

@ -0,0 +1,556 @@
#include "validator.h"
/**
 * Checks the value of one schema field against the field's declared type and, on a mismatch,
 * delegates to the matching coerce_* routine (element by element for array types).
 *
 * @param a_field             schema field being validated; its name addresses the value in `document`
 * @param document            document owning the value; values may be rewritten or erased
 * @param doc_ele             value inspected for the type checks.
 *                            NOTE(review): the coerce_* routines work on `document[a_field.name]`,
 *                            so this presumably has to refer to that same value -- confirm with callers.
 * @param fallback_field_type forwarded to coerce_string
 * @param dirty_values        policy for values whose type does not match the schema
 * @return code 200 when the value is acceptable (possibly after coercion or removal),
 *         code 400 with a message otherwise
 */
Option<uint32_t> validator_t::coerce_element(const field& a_field, nlohmann::json& document,
                                             nlohmann::json& doc_ele,
                                             const std::string& fallback_field_type,
                                             const DIRTY_VALUES& dirty_values) {
    const std::string& field_name = a_field.name;
    bool array_ele_erased = false;
    // scalar coercions do not use the iterator; it only satisfies the coerce_* signature
    nlohmann::json::iterator dummy_iter;

    if(a_field.type == field_types::STRING && !doc_ele.is_string()) {
        Option<uint32_t> coerce_op = coerce_string(dirty_values, fallback_field_type, a_field, document, field_name, dummy_iter, false, array_ele_erased);
        if(!coerce_op.ok()) {
            return coerce_op;
        }
    } else if(a_field.type == field_types::INT32) {
        if(!doc_ele.is_number_integer()) {
            Option<uint32_t> coerce_op = coerce_int32_t(dirty_values, a_field, document, field_name, dummy_iter, false, array_ele_erased);
            if(!coerce_op.ok()) {
                return coerce_op;
            }
        }
    } else if(a_field.type == field_types::INT64 && !doc_ele.is_number_integer()) {
        Option<uint32_t> coerce_op = coerce_int64_t(dirty_values, a_field, document, field_name, dummy_iter, false, array_ele_erased);
        if(!coerce_op.ok()) {
            return coerce_op;
        }
    } else if(a_field.type == field_types::FLOAT && !doc_ele.is_number()) {
        // using `is_number` allows integer to be passed to a float field
        Option<uint32_t> coerce_op = coerce_float(dirty_values, a_field, document, field_name, dummy_iter, false, array_ele_erased);
        if(!coerce_op.ok()) {
            return coerce_op;
        }
    } else if(a_field.type == field_types::BOOL && !doc_ele.is_boolean()) {
        Option<uint32_t> coerce_op = coerce_bool(dirty_values, a_field, document, field_name, dummy_iter, false, array_ele_erased);
        if(!coerce_op.ok()) {
            return coerce_op;
        }
    } else if(a_field.type == field_types::GEOPOINT) {
        if(!doc_ele.is_array() || doc_ele.size() != 2) {
            return Option<>(400, "Field `" + field_name + "` must be a 2 element array: [lat, lng].");
        }

        if(!(doc_ele[0].is_number() && doc_ele[1].is_number())) {
            // one or more elements is not an number, try to coerce
            Option<uint32_t> coerce_op = coerce_geopoint(dirty_values, a_field, document, field_name, dummy_iter, false, array_ele_erased);
            if(!coerce_op.ok()) {
                return coerce_op;
            }
        }
    } else if(a_field.is_array()) {
        if(!doc_ele.is_array()) {
            if(a_field.optional && (dirty_values == DIRTY_VALUES::DROP ||
                                    dirty_values == DIRTY_VALUES::COERCE_OR_DROP)) {
                document.erase(field_name);
                return Option<uint32_t>(200);
            } else {
                return Option<>(400, "Field `" + field_name + "` must be an array.");
            }
        }

        nlohmann::json::iterator it = doc_ele.begin();

        // Handle a geopoint[] type inside an array of object: it won't be an array of array, so cannot iterate.
        // The size is tested before `it` is dereferenced: `begin()` of an empty array must not be dereferenced.
        if(a_field.nested && a_field.type == field_types::GEOPOINT_ARRAY &&
           doc_ele.size() == 2 && it->is_number()) {
            const auto& item = doc_ele;
            if(!(item[0].is_number() && item[1].is_number())) {
                // one or more elements is not an number, try to coerce.
                // The flat [lat, lng] pair is itself the geopoint, so the field is coerced as a whole
                // (is_array == false); in array mode the coercer would index into the scalar that `it` points at.
                Option<uint32_t> coerce_op = coerce_geopoint(dirty_values, a_field, document, field_name, dummy_iter, false, array_ele_erased);
                if(!coerce_op.ok()) {
                    return coerce_op;
                }
            }

            return Option<uint32_t>(200);
        }

        for(; it != doc_ele.end(); ) {
            const auto& item = it.value();
            array_ele_erased = false;

            if (a_field.type == field_types::STRING_ARRAY && !item.is_string()) {
                Option<uint32_t> coerce_op = coerce_string(dirty_values, fallback_field_type, a_field, document, field_name, it, true, array_ele_erased);
                if (!coerce_op.ok()) {
                    return coerce_op;
                }
            } else if (a_field.type == field_types::INT32_ARRAY && !item.is_number_integer()) {
                Option<uint32_t> coerce_op = coerce_int32_t(dirty_values, a_field, document, field_name, it, true, array_ele_erased);
                if (!coerce_op.ok()) {
                    return coerce_op;
                }
            } else if (a_field.type == field_types::INT64_ARRAY && !item.is_number_integer()) {
                Option<uint32_t> coerce_op = coerce_int64_t(dirty_values, a_field, document, field_name, it, true, array_ele_erased);
                if (!coerce_op.ok()) {
                    return coerce_op;
                }
            } else if (a_field.type == field_types::FLOAT_ARRAY && !item.is_number()) {
                // we check for `is_number` to allow whole numbers to be passed into float fields
                Option<uint32_t> coerce_op = coerce_float(dirty_values, a_field, document, field_name, it, true, array_ele_erased);
                if (!coerce_op.ok()) {
                    return coerce_op;
                }
            } else if (a_field.type == field_types::BOOL_ARRAY && !item.is_boolean()) {
                Option<uint32_t> coerce_op = coerce_bool(dirty_values, a_field, document, field_name, it, true, array_ele_erased);
                if (!coerce_op.ok()) {
                    return coerce_op;
                }
            } else if (a_field.type == field_types::GEOPOINT_ARRAY) {
                if(!item.is_array() || item.size() != 2) {
                    return Option<>(400, "Field `" + field_name + "` must contain 2 element arrays: [ [lat, lng],... ].");
                }

                if(!(item[0].is_number() && item[1].is_number())) {
                    // one or more elements is not an number, try to coerce
                    Option<uint32_t> coerce_op = coerce_geopoint(dirty_values, a_field, document, field_name, it, true, array_ele_erased);
                    if(!coerce_op.ok()) {
                        return coerce_op;
                    }
                }
            }

            if(!array_ele_erased) {
                // if it is erased, the iterator will be reassigned
                it++;
            }
        }
    }

    return Option<uint32_t>(200);
}
/**
 * Converts a non-string value of a string field into a string, subject to `dirty_values`.
 *
 * The value handled is the element at `array_iter` when `is_array` is true, otherwise
 * `document[field_name]`. Integers, floats and booleans are convertible; anything else is
 * dropped (COERCE_OR_DROP on an optional field) or rejected.
 *
 * @param array_iter       re-assigned to the result of erase() when an array element is removed
 * @param array_ele_erased set to true when an array element is removed
 * @return code 200 when the value was converted or dropped, code 400 with a message otherwise
 */
Option<uint32_t> validator_t::coerce_string(const DIRTY_VALUES& dirty_values, const std::string& fallback_field_type,
                                            const field& a_field, nlohmann::json &document,
                                            const std::string &field_name, nlohmann::json::iterator& array_iter,
                                            bool is_array, bool& array_ele_erased) {
    const std::string article = is_array ? "an array of" : "a";
    auto& value = is_array ? array_iter.value() : document[field_name];

    // generic "wrong type" failure
    const auto not_a_string = [&]() {
        return Option<>(400, "Field `" + field_name + "` must be " + article + " string.");
    };

    // removes the offending value: the array element, or the whole field for scalars
    const auto discard_value = [&]() {
        if(is_array) {
            array_iter = document[field_name].erase(array_iter);
            array_ele_erased = true;
        } else {
            document.erase(field_name);
        }
    };

    switch(dirty_values) {
        case DIRTY_VALUES::REJECT:
            return not_a_string();
        case DIRTY_VALUES::DROP:
            if(!a_field.optional) {
                return not_a_string();
            }
            discard_value();
            return Option<uint32_t>(200);
        default:
            break;
    }

    // COERCE_OR_REJECT / COERCE_OR_DROP: attempt the conversion first
    if(value.is_number_integer()) {
        value = std::to_string(value.get<int64_t>());
        return Option<>(200);
    }

    if(value.is_number_float()) {
        value = StringUtils::float_to_str(value.get<float>());
        return Option<>(200);
    }

    if(value.is_boolean()) {
        value = value == true ? "true" : "false";
        return Option<>(200);
    }

    // not convertible: only an optional field under COERCE_OR_DROP may lose the value
    const bool may_discard = (dirty_values == DIRTY_VALUES::COERCE_OR_DROP) && a_field.optional;
    if(!may_discard) {
        if(a_field.nested && value.is_array()) {
            return Option<>(400, "Field `" + field_name + "` has an incorrect type. "
                                 "Hint: field inside an array of objects must be an array type as well.");
        }
        return not_a_string();
    }

    discard_value();
    return Option<>(200);
}
/**
 * Converts a non-integer value of an int32 field into an integer, subject to `dirty_values`.
 *
 * The value handled is the element at `array_iter` when `is_array` is true, otherwise
 * `document[field_name]`. Floats (truncated), booleans (1/0) and strings accepted by
 * StringUtils::is_int32_t are convertible.
 *
 * @param array_iter       re-assigned to the result of erase() when an array element is removed
 * @param array_ele_erased set to true when an array element is removed
 * @return code 200 when the value was converted or dropped, code 400 with a message otherwise
 */
Option<uint32_t> validator_t::coerce_int32_t(const DIRTY_VALUES& dirty_values, const field& a_field, nlohmann::json &document,
                                             const std::string &field_name,
                                             nlohmann::json::iterator& array_iter, bool is_array, bool& array_ele_erased) {
    std::string suffix = is_array ? "an array of" : "an";
    auto& item = is_array ? array_iter.value() : document[field_name];

    if(dirty_values == DIRTY_VALUES::REJECT) {
        return Option<>(400, "Field `" + field_name + "` must be " + suffix + " int32.");
    }

    if(dirty_values == DIRTY_VALUES::DROP) {
        if(!a_field.optional) {
            return Option<>(400, "Field `" + field_name + "` must be " + suffix + " int32.");
        }

        if(!is_array) {
            document.erase(field_name);
        } else {
            array_iter = document[field_name].erase(array_iter);
            array_ele_erased = true;
        }
        return Option<uint32_t>(200);
    }

    // `item` dangles once the value has been erased, so remember whether that happened
    bool value_removed = false;

    // try to value coerce into an integer
    if(item.is_number_float()) {
        // read as double: a float cannot represent every int32 exactly
        // NOTE(review): a value outside the int32 range still makes this cast undefined
        item = static_cast<int32_t>(item.get<double>());
    }
    else if(item.is_boolean()) {
        item = item == true ? 1 : 0;
    }
    else if(item.is_string() && StringUtils::is_int32_t(item)) {
        item = std::atol(item.get<std::string>().c_str());
    }
    else {
        if(dirty_values == DIRTY_VALUES::COERCE_OR_DROP) {
            if(!a_field.optional) {
                return Option<>(400, "Field `" + field_name + "` must be " + suffix + " int32.");
            }

            if(!is_array) {
                document.erase(field_name);
            } else {
                array_iter = document[field_name].erase(array_iter);
                array_ele_erased = true;
            }
            value_removed = true;
        } else {
            // COERCE_OR_REJECT / non-optional + DROP
            return Option<>(400, "Field `" + field_name + "` must be " + suffix + " int32.");
        }
    }

    // Range check on the coerced value itself. In array mode `document[field_name]` is the whole
    // array, which cannot be read as an integer, so the element is inspected instead.
    if(!value_removed && item.get<int64_t>() > INT32_MAX) {
        // NOTE(review): DROP has already returned above, so only COERCE_OR_REJECT can take the
        // erase branch here -- confirm that COERCE_OR_DROP was not the intended policy.
        if(a_field.optional && (dirty_values == DIRTY_VALUES::DROP || dirty_values == DIRTY_VALUES::COERCE_OR_REJECT)) {
            if(!is_array) {
                document.erase(field_name);
            } else {
                array_iter = document[field_name].erase(array_iter);
                array_ele_erased = true;
            }
        } else {
            return Option<>(400, "Field `" + field_name + "` exceeds maximum value of int32.");
        }
    }

    return Option<uint32_t>(200);
}
// Handles a value that was found where an int64 (or an element of an int64 array) was expected,
// according to the `dirty_values` policy:
//   REJECT            -> always a 400 error.
//   DROP              -> removes the value when the field is optional, otherwise a 400 error.
//   COERCE_OR_REJECT  -> converts floats (truncating), booleans (1 / 0) and strings accepted by
//                        `StringUtils::is_int64_t`; anything else is a 400 error.
//   COERCE_OR_DROP    -> same coercions; a value that cannot be coerced is removed when the field is
//                        optional, otherwise a 400 error.
//
// When `is_array` is true the value is the element `array_iter` points at; removing it re-assigns
// `array_iter` to the iterator returned by `erase` and sets `array_ele_erased` to true. Otherwise the
// value is `document[field_name]` and removal erases the whole key.
Option<uint32_t> validator_t::coerce_int64_t(const DIRTY_VALUES& dirty_values, const field& a_field, nlohmann::json &document,
                                             const std::string &field_name,
                                             nlohmann::json::iterator& array_iter, bool is_array, bool& array_ele_erased) {
    const std::string suffix = is_array ? "an array of" : "an";
    auto& item = is_array ? array_iter.value() : document[field_name];

    // Builds the generic "wrong type" error lazily, so the message is only assembled on failure.
    const auto type_mismatch = [&]() {
        return Option<>(400, "Field `" + field_name + "` must be " + suffix + " int64.");
    };

    // Removes the offending value: either a single array element or the whole key.
    // NOTE: `item` must not be touched after this has been called.
    const auto discard_value = [&]() {
        if(is_array) {
            array_iter = document[field_name].erase(array_iter);
            array_ele_erased = true;
        } else {
            document.erase(field_name);
        }
    };

    if(dirty_values == DIRTY_VALUES::REJECT) {
        return type_mismatch();
    }

    if(dirty_values == DIRTY_VALUES::DROP) {
        if(!a_field.optional) {
            return type_mismatch();
        }

        discard_value();
        return Option<uint32_t>(200);
    }

    // From here on the policy is COERCE_OR_REJECT or COERCE_OR_DROP: try to produce an integer.
    if(item.is_number_float()) {
        // Read the value as double: going through `float` would round anything above 2^24 (e.g.
        // millisecond timestamps) before truncation. Converting a floating point value that does not
        // fit the target integer type is undefined behaviour, so out-of-range (and NaN) values fall
        // through to the "not coercible" handling below.
        constexpr double lower_bound = static_cast<double>(INT64_MIN);   // -2^63, exactly representable
        constexpr double upper_bound = -lower_bound;                     // 2^63, first value that does not fit
        const double float_val = item.get<double>();

        if(float_val >= lower_bound && float_val < upper_bound) {
            item = static_cast<int64_t>(float_val);
            return Option<uint32_t>(200);
        }
    }

    else if(item.is_boolean()) {
        item = item.get<bool>() ? 1 : 0;
        return Option<uint32_t>(200);
    }

    else if(item.is_string() && StringUtils::is_int64_t(item)) {
        item = std::atoll(item.get<std::string>().c_str());
        return Option<uint32_t>(200);
    }

    // The value cannot be coerced: only an optional field under COERCE_OR_DROP may silently lose it.
    if(dirty_values == DIRTY_VALUES::COERCE_OR_DROP && a_field.optional) {
        discard_value();
        return Option<uint32_t>(200);
    }

    return type_mismatch();
}
// Handles a value that was found where a bool (or an element of a bool array) was expected,
// according to the `dirty_values` policy:
//   REJECT            -> always a 400 error.
//   DROP              -> removes the value when the field is optional, otherwise a 400 error.
//   COERCE_OR_REJECT  -> converts the integers 1 / 0 and the strings "true" / "false" (compared after
//                        `StringUtils::tolowercase`); anything else is a 400 error.
//   COERCE_OR_DROP    -> same coercions; a value that cannot be coerced is removed when the field is
//                        optional, otherwise a 400 error.
//
// When `is_array` is true the value is the element `array_iter` points at; removing it re-assigns
// `array_iter` to the iterator returned by `erase` and sets `array_ele_erased` to true. Otherwise the
// value is `document[field_name]` and removal erases the whole key.
Option<uint32_t> validator_t::coerce_bool(const DIRTY_VALUES& dirty_values, const field& a_field, nlohmann::json &document,
                                          const std::string &field_name,
                                          nlohmann::json::iterator& array_iter, bool is_array, bool& array_ele_erased) {
    const std::string suffix = is_array ? "an array of" : "a";
    auto& item = is_array ? array_iter.value() : document[field_name];

    // Builds the generic "wrong type" error lazily, so the message is only assembled on failure.
    const auto type_mismatch = [&]() {
        return Option<>(400, "Field `" + field_name + "` must be " + suffix + " bool.");
    };

    // Removes the offending value: either a single array element or the whole key.
    // NOTE: `item` must not be touched after this has been called.
    const auto discard_value = [&]() {
        if(is_array) {
            array_iter = document[field_name].erase(array_iter);
            array_ele_erased = true;
        } else {
            document.erase(field_name);
        }
    };

    if(dirty_values == DIRTY_VALUES::REJECT) {
        return type_mismatch();
    }

    if(dirty_values == DIRTY_VALUES::DROP) {
        if(!a_field.optional) {
            return type_mismatch();
        }

        discard_value();
        return Option<uint32_t>(200);
    }

    // From here on the policy is COERCE_OR_REJECT or COERCE_OR_DROP: try to produce a bool.
    if(item.is_number_integer()) {
        const int64_t int_val = item.get<int64_t>();
        if(int_val == 0 || int_val == 1) {
            item = (int_val == 1);
            return Option<uint32_t>(200);
        }
    }

    else if(item.is_string()) {
        std::string str_val = item.get<std::string>();
        StringUtils::tolowercase(str_val);

        if(str_val == "true" || str_val == "false") {
            item = (str_val == "true");
            return Option<uint32_t>(200);
        }
    }

    // The value cannot be coerced (this includes strings other than "true" / "false"): only an
    // optional field under COERCE_OR_DROP may silently lose it.
    if(dirty_values == DIRTY_VALUES::COERCE_OR_DROP && a_field.optional) {
        discard_value();
        return Option<uint32_t>(200);
    }

    return type_mismatch();
}
// Handles a geopoint value (or an element of a geopoint array) whose coordinates are not both
// numbers, according to the `dirty_values` policy:
//   REJECT            -> always a 400 error.
//   DROP              -> removes the value when the field is optional, otherwise a 400 error.
//   COERCE_OR_REJECT  -> parses string coordinates accepted by `StringUtils::is_float`; a 400 error
//                        if either coordinate is still not a number afterwards.
//   COERCE_OR_DROP    -> same coercion; a value that is still invalid is removed when the field is
//                        optional, otherwise a 400 error.
//
// The value is indexed at positions 0 and 1 without a shape check.
// NOTE(review): presumably the caller guarantees a two-element array - confirm.
//
// When `is_array` is true the value is the element `array_iter` points at; removing it re-assigns
// `array_iter` to the iterator returned by `erase` and sets `array_ele_erased` to true. Otherwise the
// value is `document[field_name]` and removal erases the whole key.
Option<uint32_t> validator_t::coerce_geopoint(const DIRTY_VALUES& dirty_values, const field& a_field, nlohmann::json &document,
                                              const std::string &field_name,
                                              nlohmann::json::iterator& array_iter, bool is_array, bool& array_ele_erased) {
    const std::string suffix = is_array ? "an array of" : "a";
    auto& item = is_array ? array_iter.value() : document[field_name];

    // Builds the generic "wrong type" error lazily, so the message is only assembled on failure.
    const auto type_mismatch = [&]() {
        return Option<>(400, "Field `" + field_name + "` must be " + suffix + " geopoint.");
    };

    // Removes the offending value: either a single array element or the whole key.
    const auto discard_value = [&]() {
        if(is_array) {
            array_iter = document[field_name].erase(array_iter);
            array_ele_erased = true;
        } else {
            document.erase(field_name);
        }
    };

    if(dirty_values == DIRTY_VALUES::REJECT) {
        return type_mismatch();
    }

    if(dirty_values == DIRTY_VALUES::DROP) {
        if(!a_field.optional) {
            return type_mismatch();
        }

        discard_value();
        return Option<uint32_t>(200);
    }

    // From here on the policy is COERCE_OR_REJECT or COERCE_OR_DROP: parse each coordinate that was
    // supplied as a float-looking string.
    for(size_t coord_idx = 0; coord_idx < 2; coord_idx++) {
        auto& coord = item[coord_idx];
        if(coord.is_string() && StringUtils::is_float(coord)) {
            coord = std::stof(coord.get<std::string>());
        }
    }

    if(item[0].is_number() && item[1].is_number()) {
        return Option<uint32_t>(200);
    }

    // At least one coordinate is still not numeric: only an optional field under COERCE_OR_DROP may
    // silently lose the value.
    if(dirty_values == DIRTY_VALUES::COERCE_OR_DROP && a_field.optional) {
        discard_value();
        return Option<uint32_t>(200);
    }

    return type_mismatch();
}
// Handles a value that was found where a float (or an element of a float array) was expected,
// according to the `dirty_values` policy:
//   REJECT            -> always a 400 error.
//   DROP              -> removes the value when the field is optional, otherwise a 400 error.
//   COERCE_OR_REJECT  -> converts strings accepted by `StringUtils::is_float` and booleans
//                        (1.0 / 0.0); anything else is a 400 error.
//   COERCE_OR_DROP    -> same coercions; a value that cannot be coerced is removed when the field is
//                        optional, otherwise a 400 error.
//
// When `is_array` is true the value is the element `array_iter` points at; removing it re-assigns
// `array_iter` to the iterator returned by `erase` and sets `array_ele_erased` to true. Otherwise the
// value is `document[field_name]` and removal erases the whole key.
Option<uint32_t> validator_t::coerce_float(const DIRTY_VALUES& dirty_values, const field& a_field, nlohmann::json &document,
                                           const std::string &field_name,
                                           nlohmann::json::iterator& array_iter, bool is_array, bool& array_ele_erased) {
    const std::string suffix = is_array ? "an array of" : "a";
    auto& item = is_array ? array_iter.value() : document[field_name];

    // Builds the generic "wrong type" error lazily, so the message is only assembled on failure.
    const auto type_mismatch = [&]() {
        return Option<>(400, "Field `" + field_name + "` must be " + suffix + " float.");
    };

    // Removes the offending value: either a single array element or the whole key.
    // NOTE: `item` must not be touched after this has been called.
    const auto discard_value = [&]() {
        if(is_array) {
            array_iter = document[field_name].erase(array_iter);
            array_ele_erased = true;
        } else {
            document.erase(field_name);
        }
    };

    if(dirty_values == DIRTY_VALUES::REJECT) {
        return type_mismatch();
    }

    if(dirty_values == DIRTY_VALUES::DROP) {
        if(!a_field.optional) {
            return type_mismatch();
        }

        discard_value();
        return Option<uint32_t>(200);
    }

    // From here on the policy is COERCE_OR_REJECT or COERCE_OR_DROP: try to produce a float.
    if(item.is_string() && StringUtils::is_float(item)) {
        item = std::atof(item.get<std::string>().c_str());
        return Option<uint32_t>(200);
    }

    if(item.is_boolean()) {
        item = item.get<bool>() ? 1.0 : 0.0;
        return Option<uint32_t>(200);
    }

    // The value cannot be coerced: only an optional field under COERCE_OR_DROP may silently lose it.
    if(dirty_values == DIRTY_VALUES::COERCE_OR_DROP && a_field.optional) {
        discard_value();
        return Option<uint32_t>(200);
    }

    return type_mismatch();
}
// Checks `document` against every field of `search_schema` and coerces mistyped values in place via
// `coerce_element`, following the `dirty_values` policy.
//
// - For operations other than UPDATE / EMPLACE, a non-empty `default_sorting_field` must be present
//   in the document, and so must every non-optional schema field; otherwise a 400 error is returned.
// - The `id` field and object-typed fields are skipped.
// - A `null` value on an optional field is accepted; the key is erased here unless the operation is
//   UPDATE / EMPLACE.
// - The first failing `coerce_element` result is returned as-is.
//
// `seq_id` is not used by this function.
Option<uint32_t> validator_t::validate_index_in_memory(nlohmann::json& document, uint32_t seq_id,
                                                       const std::string & default_sorting_field,
                                                       const tsl::htrie_map<char, field> & search_schema,
                                                       const index_operation_t op,
                                                       const std::string& fallback_field_type,
                                                       const DIRTY_VALUES& dirty_values) {
    // UPDATE and EMPLACE may carry only a subset of the schema fields.
    const bool is_partial_write = (op == UPDATE || op == EMPLACE);

    const bool sort_field_absent = !default_sorting_field.empty() &&
                                   document.count(default_sorting_field) == 0;

    if(!is_partial_write && sort_field_absent) {
        return Option<>(400, "Field `" + default_sorting_field  + "` has been declared as a default sorting field, "
                             "but is not found in the document.");
    }

    for(const auto& a_field: search_schema) {
        const std::string& field_name = a_field.name;

        if(field_name == "id" || a_field.is_object()) {
            continue;
        }

        if(document.count(field_name) == 0) {
            // a missing field is acceptable only when it is optional or the write is partial
            if(a_field.optional || is_partial_write) {
                continue;
            }

            return Option<>(400, "Field `" + field_name  + "` has been declared in the schema, "
                                 "but is not found in the document.");
        }

        nlohmann::json& doc_ele = document[field_name];

        if(a_field.optional && doc_ele.is_null()) {
            // `null` on an optional field is ignored; for partial writes the key is kept here because
            // the erasure is done later (the key is needed for the overwrite)
            if(!is_partial_write) {
                document.erase(field_name);
            }

            continue;
        }

        auto coerce_op = coerce_element(a_field, document, doc_ele, fallback_field_type, dirty_values);
        if(!coerce_op.ok()) {
            return coerce_op;
        }
    }

    return Option<>(200);
}