mirror of
https://github.com/typesense/typesense.git
synced 2025-05-18 20:52:50 +08:00
Handle indexing document that does not have all the fields defined in the schema.
This commit is contained in:
parent
7af95e7f22
commit
222e2c689a
6
TODO.md
6
TODO.md
@ -27,10 +27,10 @@
|
||||
- ~~Assumption that all tokens match for scoring is no longer true~~
|
||||
- ~~Filters~~
|
||||
- ~~Facets~~
|
||||
- Prevent string copy during indexing
|
||||
- Schema validation during insertion
|
||||
- clean special chars before indexing
|
||||
- Schema validation during insertion (missing fields + type errors)
|
||||
- Proper score field for ranking tokens
|
||||
- Prevent string copy during indexing
|
||||
- clean special chars before indexing
|
||||
- Minimum results should be a variable instead of blindly going with max_results
|
||||
- Pagination parameter
|
||||
- Iterator
|
||||
|
@ -102,7 +102,7 @@ public:
|
||||
|
||||
spp::sparse_hash_map<std::string, field> get_schema();
|
||||
|
||||
std::string add(std::string json_str);
|
||||
Option<std::string> add(std::string json_str);
|
||||
|
||||
nlohmann::json search(std::string query, const std::vector<std::string> search_fields,
|
||||
const std::string & simple_filter_query, const std::vector<std::string> & facet_fields,
|
||||
@ -115,7 +115,7 @@ public:
|
||||
const std::vector<art_leaf *> & query_suggestion, const uint32_t *result_ids,
|
||||
const size_t result_size) const;
|
||||
|
||||
void index_in_memory(const nlohmann::json &document, uint32_t seq_id);
|
||||
Option<uint32_t> index_in_memory(const nlohmann::json &document, uint32_t seq_id);
|
||||
|
||||
enum {MAX_SEARCH_TOKENS = 20};
|
||||
enum {MAX_RESULTS = 100};
|
||||
|
@ -9,7 +9,7 @@ private:
|
||||
bool is_ok;
|
||||
|
||||
std::string error_msg;
|
||||
uint32_t code;
|
||||
uint32_t error_code;
|
||||
|
||||
public:
|
||||
|
||||
@ -17,11 +17,11 @@ public:
|
||||
|
||||
}
|
||||
|
||||
Option(uint32_t code, const std::string & error_msg): code(code), error_msg(error_msg), is_ok(false) {
|
||||
Option(uint32_t code, const std::string & error_msg): error_code(code), error_msg(error_msg), is_ok(false) {
|
||||
|
||||
}
|
||||
|
||||
bool ok() {
|
||||
bool ok() const {
|
||||
return is_ok;
|
||||
}
|
||||
|
||||
@ -29,7 +29,11 @@ public:
|
||||
return value;
|
||||
}
|
||||
|
||||
std::string error() {
|
||||
std::string error() const {
|
||||
return error_msg;
|
||||
}
|
||||
|
||||
uint32_t code() const {
|
||||
return error_code;
|
||||
}
|
||||
};
|
@ -55,7 +55,7 @@ uint32_t Collection::get_next_seq_id() {
|
||||
return next_seq_id++;
|
||||
}
|
||||
|
||||
std::string Collection::add(std::string json_str) {
|
||||
Option<std::string> Collection::add(std::string json_str) {
|
||||
nlohmann::json document = nlohmann::json::parse(json_str);
|
||||
|
||||
uint32_t seq_id = get_next_seq_id();
|
||||
@ -65,16 +65,20 @@ std::string Collection::add(std::string json_str) {
|
||||
document["id"] = seq_id_str;
|
||||
}
|
||||
|
||||
const Option<uint32_t> & index_memory_op = index_in_memory(document, seq_id);
|
||||
|
||||
if(!index_memory_op.ok()) {
|
||||
return Option<std::string>(index_memory_op.code(), index_memory_op.error());
|
||||
}
|
||||
|
||||
store->insert(get_seq_id_key(seq_id), document.dump());
|
||||
store->insert(get_doc_id_key(document["id"]), seq_id_str);
|
||||
|
||||
index_in_memory(document, seq_id);
|
||||
return document["id"];
|
||||
std::string doc_id = document["id"];
|
||||
return Option<std::string>(doc_id);
|
||||
}
|
||||
|
||||
void Collection::index_in_memory(const nlohmann::json &document, uint32_t seq_id) {
|
||||
// FIXME: field might not exist in the document or field type might be invalid - need to validate!
|
||||
|
||||
Option<uint32_t> Collection::index_in_memory(const nlohmann::json &document, uint32_t seq_id) {
|
||||
uint32_t points = 0;
|
||||
if(document.count("points") != 0) {
|
||||
points = document["points"];
|
||||
@ -82,6 +86,12 @@ void Collection::index_in_memory(const nlohmann::json &document, uint32_t seq_id
|
||||
|
||||
for(const std::pair<std::string, field> & field_pair: search_schema) {
|
||||
const std::string & field_name = field_pair.first;
|
||||
|
||||
if(document.count(field_name) == 0) {
|
||||
return Option<>(400, "Field `" + field_name + "` has been declared as a search field in the schema, "
|
||||
"but is not found in the document.");
|
||||
}
|
||||
|
||||
art_tree *t = search_index.at(field_name);
|
||||
|
||||
if(field_pair.second.type == field_types::STRING) {
|
||||
@ -107,6 +117,12 @@ void Collection::index_in_memory(const nlohmann::json &document, uint32_t seq_id
|
||||
|
||||
for(const std::pair<std::string, field> & field_pair: facet_schema) {
|
||||
const std::string & field_name = field_pair.first;
|
||||
|
||||
if(document.count(field_name) == 0) {
|
||||
return Option<>(400, "Field `" + field_name + "` has been declared as a facet field in the schema, "
|
||||
"but is not found in the document.");
|
||||
}
|
||||
|
||||
art_tree *t = facet_index.at(field_name);
|
||||
if(field_pair.second.type == field_types::STRING) {
|
||||
const std::string & text = document[field_name];
|
||||
@ -118,12 +134,16 @@ void Collection::index_in_memory(const nlohmann::json &document, uint32_t seq_id
|
||||
}
|
||||
|
||||
for(const std::string & rank_field: rank_fields) {
|
||||
if(rank_index.count(rank_field) > 0) {
|
||||
spp::sparse_hash_map<uint32_t, int64_t> *doc_to_score = rank_index.at(rank_field);
|
||||
doc_to_score->emplace(seq_id, document[rank_fields[0]].get<int64_t>());
|
||||
if(document.count(rank_field) == 0) {
|
||||
return Option<>(400, "Field `" + rank_field + "` has been declared as a rank field in the schema, "
|
||||
"but is not found in the document.");
|
||||
}
|
||||
// FIXME: handle else (return error)
|
||||
|
||||
spp::sparse_hash_map<uint32_t, int64_t> *doc_to_score = rank_index.at(rank_field);
|
||||
doc_to_score->emplace(seq_id, document[rank_fields[0]].get<int64_t>());
|
||||
}
|
||||
|
||||
return Option<>(200);
|
||||
}
|
||||
|
||||
void Collection::index_int32_field(const int32_t value, uint32_t score, art_tree *t, uint32_t seq_id) const {
|
||||
|
@ -18,6 +18,7 @@
|
||||
#include "string_utils.h"
|
||||
#include "collection.h"
|
||||
#include "collection_manager.h"
|
||||
#include "option.h"
|
||||
#include <sys/resource.h>
|
||||
|
||||
#include "h2o.h"
|
||||
@ -122,17 +123,23 @@ static int get_search(h2o_handler_t *self, h2o_req_t *req) {
|
||||
|
||||
static int post_add_document(h2o_handler_t *self, h2o_req_t *req) {
|
||||
std::string document(req->entity.base, req->entity.len);
|
||||
std::string inserted_id = collection->add(document);
|
||||
|
||||
static h2o_generator_t generator = {NULL, NULL};
|
||||
req->res.status = 200;
|
||||
req->res.reason = "OK";
|
||||
h2o_add_header(&req->pool, &req->res.headers, H2O_TOKEN_CONTENT_TYPE, H2O_STRLIT("application/json; charset=utf-8"));
|
||||
h2o_start_response(req, &generator);
|
||||
Option<std::string> inserted_id_op = collection->add(document);
|
||||
|
||||
nlohmann::json json_response;
|
||||
json_response["id"] = inserted_id;
|
||||
json_response["status"] = "SUCCESS";
|
||||
static h2o_generator_t generator = {NULL, NULL};
|
||||
|
||||
if(!inserted_id_op.ok()) {
|
||||
req->res.status = 400;
|
||||
req->res.reason = "BAD REQUEST";
|
||||
json_response["message"] = inserted_id_op.error();
|
||||
} else {
|
||||
req->res.status = 201;
|
||||
req->res.reason = "CREATED";
|
||||
json_response["id"] = inserted_id_op.get();
|
||||
}
|
||||
|
||||
h2o_add_header(&req->pool, &req->res.headers, H2O_TOKEN_CONTENT_TYPE, H2O_STRLIT("application/json; charset=utf-8"));
|
||||
h2o_start_response(req, &generator);
|
||||
|
||||
h2o_iovec_t body = h2o_strdup(&req->pool, json_response.dump().c_str(), SIZE_MAX);
|
||||
h2o_send(req, &body, 1, 1);
|
||||
|
@ -781,4 +781,49 @@ TEST_F(CollectionTest, SearchingWithMissingFields) {
|
||||
ASSERT_STREQ("Could not find a rank field named `_rank` in the schema.", res["error"].get<std::string>().c_str());
|
||||
|
||||
collectionManager.drop_collection("coll_array_fields");
|
||||
}
|
||||
|
||||
TEST_F(CollectionTest, IndexingWithBadData) {
|
||||
// should not crash when document to-be-indexed doesn't match schema
|
||||
Collection *sample_collection;
|
||||
|
||||
std::vector<field> fields = {field("name", field_types::STRING), field("age", field_types::INT32)};
|
||||
facet_fields = {field("tags", field_types::STRING_ARRAY)};
|
||||
std::vector<std::string> rank_fields = {"age", "average"};
|
||||
|
||||
sample_collection = collectionManager.get_collection("sample_collection");
|
||||
if(sample_collection == nullptr) {
|
||||
sample_collection = collectionManager.create_collection("sample_collection", fields, facet_fields, rank_fields);
|
||||
}
|
||||
|
||||
const Option<std::string> & search_fields_missing_op1 = sample_collection->add("{\"namezz\": \"foo\"}");
|
||||
ASSERT_FALSE(search_fields_missing_op1.ok());
|
||||
ASSERT_STREQ("Field `name` has been declared as a search field in the schema, but is not found in the document.",
|
||||
search_fields_missing_op1.error().c_str());
|
||||
|
||||
const Option<std::string> & search_fields_missing_op2 = sample_collection->add("{\"name\": \"foo\", \"agez\": 34}");
|
||||
ASSERT_FALSE(search_fields_missing_op2.ok());
|
||||
ASSERT_STREQ("Field `age` has been declared as a search field in the schema, but is not found in the document.",
|
||||
search_fields_missing_op2.error().c_str());
|
||||
|
||||
const Option<std::string> & facet_fields_missing_op1 = sample_collection->add("{\"name\": \"foo\", \"age\": 34}");
|
||||
ASSERT_FALSE(facet_fields_missing_op1.ok());
|
||||
ASSERT_STREQ("Field `tags` has been declared as a facet field in the schema, but is not found in the document.",
|
||||
facet_fields_missing_op1.error().c_str());
|
||||
|
||||
const char *doc_str = "{\"name\": \"foo\", \"age\": 34, \"tags\": [\"red\", \"blue\"]}";
|
||||
const Option<std::string> & rank_fields_missing_op1 = sample_collection->add(doc_str);
|
||||
ASSERT_FALSE(rank_fields_missing_op1.ok());
|
||||
ASSERT_STREQ("Field `average` has been declared as a rank field in the schema, but is not found in the document.",
|
||||
rank_fields_missing_op1.error().c_str());
|
||||
|
||||
// handle type errors
|
||||
|
||||
const char *doc_str2 = "{\"name\": \"foo\", \"age\": 34, \"tags\": 22}";
|
||||
const Option<std::string> & rank_fields_missing_op2 = sample_collection->add(doc_str2);
|
||||
ASSERT_FALSE(rank_fields_missing_op2.ok());
|
||||
ASSERT_STREQ("Field `average` has been declared as a rank field in the schema, but is not found in the document.",
|
||||
rank_fields_missing_op2.error().c_str());
|
||||
|
||||
collectionManager.drop_collection("sample_collection");
|
||||
}
|
Loading…
x
Reference in New Issue
Block a user