Handle indexing a document that does not have all the fields defined in the schema.

2025-05-18 20:52:50 +08:00 · 2017-03-25 21:45:06 +05:30 · 2017-03-25 21:45:06 +05:30 · 222e2c689a
commit 222e2c689a
parent 7af95e7f22
6 changed files with 104 additions and 28 deletions
--- a/TODO.md
+++ b/TODO.md
@ -27,10 +27,10 @@
 - ~~Assumption that all tokens match for scoring is no longer true~~
 - ~~Filters~~
 - ~~Facets~~
- Prevent string copy during indexing
- Schema validation during insertion
- clean special chars before indexing
+- Schema validation during insertion (missing fields + type errors)
 - Proper score field for ranking tokens
+- Prevent string copy during indexing
+- clean special chars before indexing
 - Minimum results should be a variable instead of blindly going with max_results
 - Pagination parameter
 - Iterator
--- a/include/collection.h
+++ b/include/collection.h
@ -102,7 +102,7 @@ public:

    spp::sparse_hash_map<std::string, field> get_schema();

-    std::string add(std::string json_str);
+    Option<std::string> add(std::string json_str);

    nlohmann::json search(std::string query, const std::vector<std::string> search_fields,
                          const std::string & simple_filter_query, const std::vector<std::string> & facet_fields,
@ -115,7 +115,7 @@ public:
                       const std::vector<art_leaf *> & query_suggestion, const uint32_t *result_ids,
                       const size_t result_size) const;

-    void index_in_memory(const nlohmann::json &document, uint32_t seq_id);
+    Option<uint32_t> index_in_memory(const nlohmann::json &document, uint32_t seq_id);

    enum {MAX_SEARCH_TOKENS = 20};
    enum {MAX_RESULTS = 100};
--- a/include/option.h
+++ b/include/option.h
@ -9,7 +9,7 @@ private:
    bool is_ok;

    std::string error_msg;
-    uint32_t code;
+    uint32_t error_code;

 public:

@ -17,11 +17,11 @@ public:

    }

-    Option(uint32_t code, const std::string & error_msg): code(code), error_msg(error_msg), is_ok(false) {
+    Option(uint32_t code, const std::string & error_msg): error_code(code), error_msg(error_msg), is_ok(false) {

    }

-    bool ok() {
+    bool ok() const {
        return is_ok;
    }

@ -29,7 +29,11 @@ public:
        return value;
    }

-    std::string error() {
+    std::string error() const {
        return error_msg;
    }
+
+    uint32_t code() const {
+        return error_code;
+    }
 };
--- a/src/collection.cpp
+++ b/src/collection.cpp
@ -55,7 +55,7 @@ uint32_t Collection::get_next_seq_id() {
    return next_seq_id++;
 }

-std::string Collection::add(std::string json_str) {
+Option<std::string> Collection::add(std::string json_str) {
    nlohmann::json document = nlohmann::json::parse(json_str);

    uint32_t seq_id = get_next_seq_id();
@ -65,16 +65,20 @@ std::string Collection::add(std::string json_str) {
        document["id"] = seq_id_str;
    }

+    const Option<uint32_t> & index_memory_op = index_in_memory(document, seq_id);
+
+    if(!index_memory_op.ok()) {
+        return Option<std::string>(index_memory_op.code(), index_memory_op.error());
+    }
+
    store->insert(get_seq_id_key(seq_id), document.dump());
    store->insert(get_doc_id_key(document["id"]), seq_id_str);

-    index_in_memory(document, seq_id);
-    return document["id"];
+    std::string doc_id = document["id"];
+    return Option<std::string>(doc_id);
 }

-void Collection::index_in_memory(const nlohmann::json &document, uint32_t seq_id) {
-    // FIXME: field might not exist in the document or field type might be invalid - need to validate!
-
+Option<uint32_t> Collection::index_in_memory(const nlohmann::json &document, uint32_t seq_id) {
    uint32_t points = 0;
    if(document.count("points") != 0) {
        points = document["points"];
@ -82,6 +86,12 @@ void Collection::index_in_memory(const nlohmann::json &document, uint32_t seq_id

    for(const std::pair<std::string, field> & field_pair: search_schema) {
        const std::string & field_name = field_pair.first;
+
+        if(document.count(field_name) == 0) {
+            return Option<>(400, "Field `" + field_name  + "` has been declared as a search field in the schema, "
+                            "but is not found in the document.");
+        }
+
        art_tree *t = search_index.at(field_name);

        if(field_pair.second.type == field_types::STRING) {
@ -107,6 +117,12 @@ void Collection::index_in_memory(const nlohmann::json &document, uint32_t seq_id

    for(const std::pair<std::string, field> & field_pair: facet_schema) {
        const std::string & field_name = field_pair.first;
+
+        if(document.count(field_name) == 0) {
+            return Option<>(400, "Field `" + field_name  + "` has been declared as a facet field in the schema, "
+                            "but is not found in the document.");
+        }
+
        art_tree *t = facet_index.at(field_name);
        if(field_pair.second.type == field_types::STRING) {
            const std::string & text = document[field_name];
@ -118,12 +134,16 @@ void Collection::index_in_memory(const nlohmann::json &document, uint32_t seq_id
    }

    for(const std::string & rank_field: rank_fields) {
-        if(rank_index.count(rank_field) > 0) {
-            spp::sparse_hash_map<uint32_t, int64_t> *doc_to_score = rank_index.at(rank_field);
-            doc_to_score->emplace(seq_id, document[rank_fields[0]].get<int64_t>());
+        if(document.count(rank_field) == 0) {
+            return Option<>(400, "Field `" + rank_field  + "` has been declared as a rank field in the schema, "
+                    "but is not found in the document.");
        }
-        // FIXME: handle else (return error)
+
+        spp::sparse_hash_map<uint32_t, int64_t> *doc_to_score = rank_index.at(rank_field);
+        doc_to_score->emplace(seq_id, document[rank_fields[0]].get<int64_t>());
    }
+
+    return Option<>(200);
 }

 void Collection::index_int32_field(const int32_t value, uint32_t score, art_tree *t, uint32_t seq_id) const {
--- a/src/main/server.cpp
+++ b/src/main/server.cpp
@ -18,6 +18,7 @@
 #include "string_utils.h"
 #include "collection.h"
 #include "collection_manager.h"
+#include "option.h"
 #include <sys/resource.h>

 #include "h2o.h"
@ -122,17 +123,23 @@ static int get_search(h2o_handler_t *self, h2o_req_t *req) {

 static int post_add_document(h2o_handler_t *self, h2o_req_t *req) {
    std::string document(req->entity.base, req->entity.len);
-    std::string inserted_id = collection->add(document);
-
-    static h2o_generator_t generator = {NULL, NULL};
-    req->res.status = 200;
-    req->res.reason = "OK";
-    h2o_add_header(&req->pool, &req->res.headers, H2O_TOKEN_CONTENT_TYPE, H2O_STRLIT("application/json; charset=utf-8"));
-    h2o_start_response(req, &generator);
+    Option<std::string> inserted_id_op = collection->add(document);

    nlohmann::json json_response;
-    json_response["id"] = inserted_id;
-    json_response["status"] = "SUCCESS";
+    static h2o_generator_t generator = {NULL, NULL};
+
+    if(!inserted_id_op.ok()) {
+        req->res.status = 400;
+        req->res.reason = "BAD REQUEST";
+        json_response["message"] = inserted_id_op.error();
+    } else {
+        req->res.status = 201;
+        req->res.reason = "CREATED";
+        json_response["id"] = inserted_id_op.get();
+    }
+
+    h2o_add_header(&req->pool, &req->res.headers, H2O_TOKEN_CONTENT_TYPE, H2O_STRLIT("application/json; charset=utf-8"));
+    h2o_start_response(req, &generator);

    h2o_iovec_t body = h2o_strdup(&req->pool, json_response.dump().c_str(), SIZE_MAX);
    h2o_send(req, &body, 1, 1);
--- a/test/collection_test.cpp
+++ b/test/collection_test.cpp
@ -781,4 +781,49 @@ TEST_F(CollectionTest, SearchingWithMissingFields) {
    ASSERT_STREQ("Could not find a rank field named `_rank` in the schema.", res["error"].get<std::string>().c_str());

    collectionManager.drop_collection("coll_array_fields");
+}
+
+TEST_F(CollectionTest, IndexingWithBadData) {
+    // should not crash when document to-be-indexed doesn't match schema
+    Collection *sample_collection;
+
+    std::vector<field> fields = {field("name", field_types::STRING), field("age", field_types::INT32)};
+    facet_fields = {field("tags", field_types::STRING_ARRAY)};
+    std::vector<std::string> rank_fields = {"age", "average"};
+
+    sample_collection = collectionManager.get_collection("sample_collection");
+    if(sample_collection == nullptr) {
+        sample_collection = collectionManager.create_collection("sample_collection", fields, facet_fields, rank_fields);
+    }
+
+    const Option<std::string> & search_fields_missing_op1 = sample_collection->add("{\"namezz\": \"foo\"}");
+    ASSERT_FALSE(search_fields_missing_op1.ok());
+    ASSERT_STREQ("Field `name` has been declared as a search field in the schema, but is not found in the document.",
+                 search_fields_missing_op1.error().c_str());
+
+    const Option<std::string> & search_fields_missing_op2 = sample_collection->add("{\"name\": \"foo\", \"agez\": 34}");
+    ASSERT_FALSE(search_fields_missing_op2.ok());
+    ASSERT_STREQ("Field `age` has been declared as a search field in the schema, but is not found in the document.",
+                 search_fields_missing_op2.error().c_str());
+
+    const Option<std::string> & facet_fields_missing_op1 = sample_collection->add("{\"name\": \"foo\", \"age\": 34}");
+    ASSERT_FALSE(facet_fields_missing_op1.ok());
+    ASSERT_STREQ("Field `tags` has been declared as a facet field in the schema, but is not found in the document.",
+                 facet_fields_missing_op1.error().c_str());
+
+    const char *doc_str = "{\"name\": \"foo\", \"age\": 34, \"tags\": [\"red\", \"blue\"]}";
+    const Option<std::string> & rank_fields_missing_op1 = sample_collection->add(doc_str);
+    ASSERT_FALSE(rank_fields_missing_op1.ok());
+    ASSERT_STREQ("Field `average` has been declared as a rank field in the schema, but is not found in the document.",
+                 rank_fields_missing_op1.error().c_str());
+
+    // handle type errors
+
+    const char *doc_str2 = "{\"name\": \"foo\", \"age\": 34, \"tags\": 22}";
+    const Option<std::string> & rank_fields_missing_op2 = sample_collection->add(doc_str2);
+    ASSERT_FALSE(rank_fields_missing_op2.ok());
+    ASSERT_STREQ("Field `average` has been declared as a rank field in the schema, but is not found in the document.",
+                 rank_fields_missing_op2.error().c_str());
+
+    collectionManager.drop_collection("sample_collection");
 }