diff --git a/TODO.md b/TODO.md index 2d6f9044..56ccc9ff 100644 --- a/TODO.md +++ b/TODO.md @@ -27,10 +27,10 @@ - ~~Assumption that all tokens match for scoring is no longer true~~ - ~~Filters~~ - ~~Facets~~ -- Prevent string copy during indexing -- Schema validation during insertion -- clean special chars before indexing +- Schema validation during insertion (missing fields + type errors) - Proper score field for ranking tokens +- Prevent string copy during indexing +- clean special chars before indexing - Minimum results should be a variable instead of blindly going with max_results - Pagination parameter - Iterator diff --git a/include/collection.h b/include/collection.h index 05f7fa64..f12fd2f2 100644 --- a/include/collection.h +++ b/include/collection.h @@ -102,7 +102,7 @@ public: spp::sparse_hash_map get_schema(); - std::string add(std::string json_str); + Option add(std::string json_str); nlohmann::json search(std::string query, const std::vector search_fields, const std::string & simple_filter_query, const std::vector & facet_fields, @@ -115,7 +115,7 @@ public: const std::vector & query_suggestion, const uint32_t *result_ids, const size_t result_size) const; - void index_in_memory(const nlohmann::json &document, uint32_t seq_id); + Option index_in_memory(const nlohmann::json &document, uint32_t seq_id); enum {MAX_SEARCH_TOKENS = 20}; enum {MAX_RESULTS = 100}; diff --git a/include/option.h b/include/option.h index c05c099d..04b44935 100644 --- a/include/option.h +++ b/include/option.h @@ -9,7 +9,7 @@ private: bool is_ok; std::string error_msg; - uint32_t code; + uint32_t error_code; public: @@ -17,11 +17,11 @@ public: } - Option(uint32_t code, const std::string & error_msg): code(code), error_msg(error_msg), is_ok(false) { + Option(uint32_t code, const std::string & error_msg): error_code(code), error_msg(error_msg), is_ok(false) { } - bool ok() { + bool ok() const { return is_ok; } @@ -29,7 +29,11 @@ public: return value; } - std::string error() { + std::string error() const { return error_msg; } + + uint32_t code() const { + return error_code; + } }; \ No newline at end of file diff --git a/src/collection.cpp b/src/collection.cpp index 5d9db218..75415be8 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -55,7 +55,7 @@ uint32_t Collection::get_next_seq_id() { return next_seq_id++; } -std::string Collection::add(std::string json_str) { +Option Collection::add(std::string json_str) { nlohmann::json document = nlohmann::json::parse(json_str); uint32_t seq_id = get_next_seq_id(); @@ -65,16 +65,20 @@ std::string Collection::add(std::string json_str) { document["id"] = seq_id_str; } + const Option & index_memory_op = index_in_memory(document, seq_id); + + if(!index_memory_op.ok()) { + return Option(index_memory_op.code(), index_memory_op.error()); + } + store->insert(get_seq_id_key(seq_id), document.dump()); store->insert(get_doc_id_key(document["id"]), seq_id_str); - index_in_memory(document, seq_id); - return document["id"]; + std::string doc_id = document["id"]; + return Option(doc_id); } -void Collection::index_in_memory(const nlohmann::json &document, uint32_t seq_id) { - // FIXME: field might not exist in the document or field type might be invalid - need to validate! - +Option Collection::index_in_memory(const nlohmann::json &document, uint32_t seq_id) { uint32_t points = 0; if(document.count("points") != 0) { points = document["points"]; @@ -82,6 +86,12 @@ void Collection::index_in_memory(const nlohmann::json &document, uint32_t seq_id for(const std::pair & field_pair: search_schema) { const std::string & field_name = field_pair.first; + + if(document.count(field_name) == 0) { + return Option<>(400, "Field `" + field_name + "` has been declared as a search field in the schema, " + "but is not found in the document."); + } + art_tree *t = search_index.at(field_name); if(field_pair.second.type == field_types::STRING) { @@ -107,6 +117,12 @@ void Collection::index_in_memory(const nlohmann::json &document, uint32_t seq_id for(const std::pair & field_pair: facet_schema) { const std::string & field_name = field_pair.first; + + if(document.count(field_name) == 0) { + return Option<>(400, "Field `" + field_name + "` has been declared as a facet field in the schema, " + "but is not found in the document."); + } + art_tree *t = facet_index.at(field_name); if(field_pair.second.type == field_types::STRING) { const std::string & text = document[field_name]; @@ -118,12 +134,16 @@ void Collection::index_in_memory(const nlohmann::json &document, uint32_t seq_id } for(const std::string & rank_field: rank_fields) { - if(rank_index.count(rank_field) > 0) { - spp::sparse_hash_map *doc_to_score = rank_index.at(rank_field); - doc_to_score->emplace(seq_id, document[rank_fields[0]].get()); + if(document.count(rank_field) == 0) { + return Option<>(400, "Field `" + rank_field + "` has been declared as a rank field in the schema, " + "but is not found in the document."); } - // FIXME: handle else (return error) + + spp::sparse_hash_map *doc_to_score = rank_index.at(rank_field); + doc_to_score->emplace(seq_id, document[rank_fields[0]].get()); } + + return Option<>(200); } void Collection::index_int32_field(const int32_t value, uint32_t score, art_tree *t, uint32_t seq_id) const { diff --git a/src/main/server.cpp b/src/main/server.cpp index 87c51a0e..85bd1523 100644 --- a/src/main/server.cpp +++ b/src/main/server.cpp @@ -18,6 +18,7 @@ #include "string_utils.h" #include "collection.h" #include "collection_manager.h" +#include "option.h" #include #include "h2o.h" @@ -122,17 +123,23 @@ static int get_search(h2o_handler_t *self, h2o_req_t *req) { static int post_add_document(h2o_handler_t *self, h2o_req_t *req) { std::string document(req->entity.base, req->entity.len); - std::string inserted_id = collection->add(document); - - static h2o_generator_t generator = {NULL, NULL}; - req->res.status = 200; - req->res.reason = "OK"; - h2o_add_header(&req->pool, &req->res.headers, H2O_TOKEN_CONTENT_TYPE, H2O_STRLIT("application/json; charset=utf-8")); - h2o_start_response(req, &generator); + Option inserted_id_op = collection->add(document); nlohmann::json json_response; - json_response["id"] = inserted_id; - json_response["status"] = "SUCCESS"; + static h2o_generator_t generator = {NULL, NULL}; + + if(!inserted_id_op.ok()) { + req->res.status = 400; + req->res.reason = "BAD REQUEST"; + json_response["message"] = inserted_id_op.error(); + } else { + req->res.status = 201; + req->res.reason = "CREATED"; + json_response["id"] = inserted_id_op.get(); + } + + h2o_add_header(&req->pool, &req->res.headers, H2O_TOKEN_CONTENT_TYPE, H2O_STRLIT("application/json; charset=utf-8")); + h2o_start_response(req, &generator); h2o_iovec_t body = h2o_strdup(&req->pool, json_response.dump().c_str(), SIZE_MAX); h2o_send(req, &body, 1, 1); diff --git a/test/collection_test.cpp b/test/collection_test.cpp index c485c460..4d6659da 100644 --- a/test/collection_test.cpp +++ b/test/collection_test.cpp @@ -781,4 +781,49 @@ TEST_F(CollectionTest, SearchingWithMissingFields) { ASSERT_STREQ("Could not find a rank field named `_rank` in the schema.", res["error"].get().c_str()); collectionManager.drop_collection("coll_array_fields"); +} + +TEST_F(CollectionTest, IndexingWithBadData) { + // should not crash when document to-be-indexed doesn't match schema + Collection *sample_collection; + + std::vector fields = {field("name", field_types::STRING), field("age", field_types::INT32)}; + facet_fields = {field("tags", field_types::STRING_ARRAY)}; + std::vector rank_fields = {"age", "average"}; + + sample_collection = collectionManager.get_collection("sample_collection"); + if(sample_collection == nullptr) { + sample_collection = collectionManager.create_collection("sample_collection", fields, facet_fields, rank_fields); + } + + const Option & search_fields_missing_op1 = sample_collection->add("{\"namezz\": \"foo\"}"); + ASSERT_FALSE(search_fields_missing_op1.ok()); + ASSERT_STREQ("Field `name` has been declared as a search field in the schema, but is not found in the document.", + search_fields_missing_op1.error().c_str()); + + const Option & search_fields_missing_op2 = sample_collection->add("{\"name\": \"foo\", \"agez\": 34}"); + ASSERT_FALSE(search_fields_missing_op2.ok()); + ASSERT_STREQ("Field `age` has been declared as a search field in the schema, but is not found in the document.", + search_fields_missing_op2.error().c_str()); + + const Option & facet_fields_missing_op1 = sample_collection->add("{\"name\": \"foo\", \"age\": 34}"); + ASSERT_FALSE(facet_fields_missing_op1.ok()); + ASSERT_STREQ("Field `tags` has been declared as a facet field in the schema, but is not found in the document.", + facet_fields_missing_op1.error().c_str()); + + const char *doc_str = "{\"name\": \"foo\", \"age\": 34, \"tags\": [\"red\", \"blue\"]}"; + const Option & rank_fields_missing_op1 = sample_collection->add(doc_str); + ASSERT_FALSE(rank_fields_missing_op1.ok()); + ASSERT_STREQ("Field `average` has been declared as a rank field in the schema, but is not found in the document.", + rank_fields_missing_op1.error().c_str()); + + // handle type errors + + const char *doc_str2 = "{\"name\": \"foo\", \"age\": 34, \"tags\": 22}"; + const Option & rank_fields_missing_op2 = sample_collection->add(doc_str2); + ASSERT_FALSE(rank_fields_missing_op2.ok()); + ASSERT_STREQ("Field `average` has been declared as a rank field in the schema, but is not found in the document.", + rank_fields_missing_op2.error().c_str()); + + collectionManager.drop_collection("sample_collection"); } \ No newline at end of file