Handle indexing a document that does not have all the fields defined in the schema.

This commit is contained in:
Kishore Nallan 2017-03-25 21:45:06 +05:30
parent 7af95e7f22
commit 222e2c689a
6 changed files with 104 additions and 28 deletions

View File

@ -27,10 +27,10 @@
- ~~Assumption that all tokens match for scoring is no longer true~~
- ~~Filters~~
- ~~Facets~~
- Prevent string copy during indexing
- Schema validation during insertion
- clean special chars before indexing
- Schema validation during insertion (missing fields + type errors)
- Proper score field for ranking tokens
- Prevent string copy during indexing
- clean special chars before indexing
- Minimum results should be a variable instead of blindly going with max_results
- Pagination parameter
- Iterator

View File

@ -102,7 +102,7 @@ public:
spp::sparse_hash_map<std::string, field> get_schema();
std::string add(std::string json_str);
Option<std::string> add(std::string json_str);
nlohmann::json search(std::string query, const std::vector<std::string> search_fields,
const std::string & simple_filter_query, const std::vector<std::string> & facet_fields,
@ -115,7 +115,7 @@ public:
const std::vector<art_leaf *> & query_suggestion, const uint32_t *result_ids,
const size_t result_size) const;
void index_in_memory(const nlohmann::json &document, uint32_t seq_id);
Option<uint32_t> index_in_memory(const nlohmann::json &document, uint32_t seq_id);
enum {MAX_SEARCH_TOKENS = 20};
enum {MAX_RESULTS = 100};

View File

@ -9,7 +9,7 @@ private:
bool is_ok;
std::string error_msg;
uint32_t code;
uint32_t error_code;
public:
@ -17,11 +17,11 @@ public:
}
Option(uint32_t code, const std::string & error_msg): code(code), error_msg(error_msg), is_ok(false) {
Option(uint32_t code, const std::string & error_msg): error_code(code), error_msg(error_msg), is_ok(false) {
}
bool ok() {
bool ok() const {
return is_ok;
}
@ -29,7 +29,11 @@ public:
return value;
}
std::string error() {
std::string error() const {
return error_msg;
}
uint32_t code() const {
return error_code;
}
};

View File

@ -55,7 +55,7 @@ uint32_t Collection::get_next_seq_id() {
return next_seq_id++;
}
std::string Collection::add(std::string json_str) {
Option<std::string> Collection::add(std::string json_str) {
nlohmann::json document = nlohmann::json::parse(json_str);
uint32_t seq_id = get_next_seq_id();
@ -65,16 +65,20 @@ std::string Collection::add(std::string json_str) {
document["id"] = seq_id_str;
}
const Option<uint32_t> & index_memory_op = index_in_memory(document, seq_id);
if(!index_memory_op.ok()) {
return Option<std::string>(index_memory_op.code(), index_memory_op.error());
}
store->insert(get_seq_id_key(seq_id), document.dump());
store->insert(get_doc_id_key(document["id"]), seq_id_str);
index_in_memory(document, seq_id);
return document["id"];
std::string doc_id = document["id"];
return Option<std::string>(doc_id);
}
void Collection::index_in_memory(const nlohmann::json &document, uint32_t seq_id) {
// FIXME: field might not exist in the document or field type might be invalid - need to validate!
Option<uint32_t> Collection::index_in_memory(const nlohmann::json &document, uint32_t seq_id) {
uint32_t points = 0;
if(document.count("points") != 0) {
points = document["points"];
@ -82,6 +86,12 @@ void Collection::index_in_memory(const nlohmann::json &document, uint32_t seq_id
for(const std::pair<std::string, field> & field_pair: search_schema) {
const std::string & field_name = field_pair.first;
if(document.count(field_name) == 0) {
return Option<>(400, "Field `" + field_name + "` has been declared as a search field in the schema, "
"but is not found in the document.");
}
art_tree *t = search_index.at(field_name);
if(field_pair.second.type == field_types::STRING) {
@ -107,6 +117,12 @@ void Collection::index_in_memory(const nlohmann::json &document, uint32_t seq_id
for(const std::pair<std::string, field> & field_pair: facet_schema) {
const std::string & field_name = field_pair.first;
if(document.count(field_name) == 0) {
return Option<>(400, "Field `" + field_name + "` has been declared as a facet field in the schema, "
"but is not found in the document.");
}
art_tree *t = facet_index.at(field_name);
if(field_pair.second.type == field_types::STRING) {
const std::string & text = document[field_name];
@ -118,12 +134,16 @@ void Collection::index_in_memory(const nlohmann::json &document, uint32_t seq_id
}
for(const std::string & rank_field: rank_fields) {
if(rank_index.count(rank_field) > 0) {
spp::sparse_hash_map<uint32_t, int64_t> *doc_to_score = rank_index.at(rank_field);
doc_to_score->emplace(seq_id, document[rank_fields[0]].get<int64_t>());
if(document.count(rank_field) == 0) {
return Option<>(400, "Field `" + rank_field + "` has been declared as a rank field in the schema, "
"but is not found in the document.");
}
// FIXME: handle else (return error)
spp::sparse_hash_map<uint32_t, int64_t> *doc_to_score = rank_index.at(rank_field);
doc_to_score->emplace(seq_id, document[rank_fields[0]].get<int64_t>());
}
return Option<>(200);
}
void Collection::index_int32_field(const int32_t value, uint32_t score, art_tree *t, uint32_t seq_id) const {

View File

@ -18,6 +18,7 @@
#include "string_utils.h"
#include "collection.h"
#include "collection_manager.h"
#include "option.h"
#include <sys/resource.h>
#include "h2o.h"
@ -122,17 +123,23 @@ static int get_search(h2o_handler_t *self, h2o_req_t *req) {
static int post_add_document(h2o_handler_t *self, h2o_req_t *req) {
std::string document(req->entity.base, req->entity.len);
std::string inserted_id = collection->add(document);
static h2o_generator_t generator = {NULL, NULL};
req->res.status = 200;
req->res.reason = "OK";
h2o_add_header(&req->pool, &req->res.headers, H2O_TOKEN_CONTENT_TYPE, H2O_STRLIT("application/json; charset=utf-8"));
h2o_start_response(req, &generator);
Option<std::string> inserted_id_op = collection->add(document);
nlohmann::json json_response;
json_response["id"] = inserted_id;
json_response["status"] = "SUCCESS";
static h2o_generator_t generator = {NULL, NULL};
if(!inserted_id_op.ok()) {
req->res.status = 400;
req->res.reason = "BAD REQUEST";
json_response["message"] = inserted_id_op.error();
} else {
req->res.status = 201;
req->res.reason = "CREATED";
json_response["id"] = inserted_id_op.get();
}
h2o_add_header(&req->pool, &req->res.headers, H2O_TOKEN_CONTENT_TYPE, H2O_STRLIT("application/json; charset=utf-8"));
h2o_start_response(req, &generator);
h2o_iovec_t body = h2o_strdup(&req->pool, json_response.dump().c_str(), SIZE_MAX);
h2o_send(req, &body, 1, 1);

View File

@ -781,4 +781,49 @@ TEST_F(CollectionTest, SearchingWithMissingFields) {
ASSERT_STREQ("Could not find a rank field named `_rank` in the schema.", res["error"].get<std::string>().c_str());
collectionManager.drop_collection("coll_array_fields");
}
// Adding a document that does not match the collection schema must produce a failed
// Option carrying a descriptive message instead of crashing.
TEST_F(CollectionTest, IndexingWithBadData) {
    std::vector<field> schema_fields = {field("name", field_types::STRING), field("age", field_types::INT32)};
    // NOTE(review): `facet_fields` is not declared inside this test - presumably it is
    // provided by the CollectionTest fixture; confirm.
    facet_fields = {field("tags", field_types::STRING_ARRAY)};
    std::vector<std::string> ranking_fields = {"age", "average"};

    // Reuse the collection when it already exists, otherwise create it.
    Collection *coll = collectionManager.get_collection("sample_collection");
    if(!coll) {
        coll = collectionManager.create_collection("sample_collection", schema_fields, facet_fields, ranking_fields);
    }

    // Search field `name` is absent from the document.
    const char *no_name_doc = "{\"namezz\": \"foo\"}";
    const auto & no_name_op = coll->add(no_name_doc);
    ASSERT_FALSE(no_name_op.ok());
    ASSERT_STREQ("Field `name` has been declared as a search field in the schema, but is not found in the document.",
                 no_name_op.error().c_str());

    // Search field `age` is absent from the document.
    const char *no_age_doc = "{\"name\": \"foo\", \"agez\": 34}";
    const auto & no_age_op = coll->add(no_age_doc);
    ASSERT_FALSE(no_age_op.ok());
    ASSERT_STREQ("Field `age` has been declared as a search field in the schema, but is not found in the document.",
                 no_age_op.error().c_str());

    // Both search fields are present, but the facet field `tags` is absent.
    const char *no_tags_doc = "{\"name\": \"foo\", \"age\": 34}";
    const auto & no_tags_op = coll->add(no_tags_doc);
    ASSERT_FALSE(no_tags_op.ok());
    ASSERT_STREQ("Field `tags` has been declared as a facet field in the schema, but is not found in the document.",
                 no_tags_op.error().c_str());

    // Search and facet fields are present, but the rank field `average` is absent.
    const auto & no_average_op = coll->add("{\"name\": \"foo\", \"age\": 34, \"tags\": [\"red\", \"blue\"]}");
    ASSERT_FALSE(no_average_op.ok());
    ASSERT_STREQ("Field `average` has been declared as a rank field in the schema, but is not found in the document.",
                 no_average_op.error().c_str());

    // Type error case: `tags` is a number although the schema declares STRING_ARRAY.
    // The asserted message is still the one for the missing `average` rank field.
    const auto & bad_type_op = coll->add("{\"name\": \"foo\", \"age\": 34, \"tags\": 22}");
    ASSERT_FALSE(bad_type_op.ok());
    ASSERT_STREQ("Field `average` has been declared as a rank field in the schema, but is not found in the document.",
                 bad_type_op.error().c_str());

    collectionManager.drop_collection("sample_collection");
}