diff --git a/README.md b/README.md index 32c835aa..af3d5ae6 100644 --- a/README.md +++ b/README.md @@ -52,7 +52,7 @@ Here's a quick example showcasing how you can create a collection, index a docum Let's begin by starting the Typesense server via Docker: ``` -docker run -p 8108:8108 -v/tmp/data:/data typesense/typesense:0.15.0 --data-dir /data --api-key=Hu52dwsas2AdxdE +docker run -p 8108:8108 -v/tmp/data:/data typesense/typesense:0.16.0 --data-dir /data --api-key=Hu52dwsas2AdxdE ``` We have [API Clients](#api-clients) in a couple of languages, but let's use the Python client for this example. diff --git a/include/array.h b/include/array.h index cb402b96..4510a5bb 100644 --- a/include/array.h +++ b/include/array.h @@ -18,6 +18,8 @@ private: } public: + void load(const uint32_t *sorted_array, uint32_t array_length, uint32_t m, uint32_t M); + uint32_t at(uint32_t index); bool contains(uint32_t value); @@ -26,5 +28,7 @@ public: bool append(uint32_t value); + bool insert(size_t index, const uint32_t* values, size_t num_values); + void remove_index(uint32_t start_index, uint32_t end_index); }; \ No newline at end of file diff --git a/include/array_base.h b/include/array_base.h index 450e0a0e..d4eae6a8 100644 --- a/include/array_base.h +++ b/include/array_base.h @@ -36,7 +36,8 @@ public: in = nullptr; } - uint32_t* uncompress(); + // len determines length of output buffer (default: length of input) + uint32_t* uncompress(uint32_t len=0); uint32_t getSizeInBytes(); diff --git a/include/art.h b/include/art.h index 9c6276d0..b4e4eba0 100644 --- a/include/art.h +++ b/include/art.h @@ -96,9 +96,9 @@ typedef struct { * of arbitrary size, as they include the key. */ typedef struct { - art_values* values; - int32_t max_score; uint32_t key_len; + int64_t max_score; + art_values* values; unsigned char key[]; } art_leaf; diff --git a/include/collection.h b/include/collection.h index f459c2a5..ebfacb57 100644 --- a/include/collection.h +++ b/include/collection.h @@ -92,6 +92,11 @@ struct override_t { } }; +struct doc_seq_id_t { + uint32_t seq_id; + bool is_new; +}; + class Collection { private: @@ -150,7 +155,9 @@ private: void highlight_result(const field &search_field, const std::vector> &searched_queries, const KV* field_order_kv, const nlohmann::json &document, - StringUtils & string_utils, size_t snippet_threshold, + StringUtils & string_utils, + const size_t snippet_threshold, + const size_t highlight_affix_num_tokens, bool highlighted_fully, highlight_t &highlight); @@ -217,13 +224,16 @@ public: std::string get_default_sorting_field(); - Option to_doc(const std::string & json_str, nlohmann::json & document); + Option to_doc(const std::string& json_str, nlohmann::json& document, + const index_operation_t& operation, const std::string& id=""); nlohmann::json get_summary_json(); - Option add(const std::string & json_str); + Option add(const std::string & json_str, + const index_operation_t& operation=CREATE, const std::string& id=""); - nlohmann::json add_many(std::vector& json_lines); + nlohmann::json add_many(std::vector& json_lines, nlohmann::json& document, + const index_operation_t& operation=CREATE, const std::string& id=""); Option search(const std::string & query, const std::vector & search_fields, const std::string & simple_filter_query, const std::vector & facet_fields, @@ -236,6 +246,7 @@ public: size_t max_facet_values=10, const std::string & simple_facet_query = "", const size_t snippet_threshold = 30, + const size_t highlight_affix_num_tokens = 4, const std::string & highlight_full_fields 
= "", size_t typo_tokens_threshold = Index::TYPO_TOKENS_THRESHOLD, const std::map>& pinned_hits={}, @@ -263,7 +274,7 @@ public: Option get_document_from_store(const std::string & seq_id_key, nlohmann::json & document); - Option index_in_memory(const nlohmann::json & document, uint32_t seq_id); + Option index_in_memory(const nlohmann::json & document, uint32_t seq_id, bool is_update); size_t par_index_in_memory(std::vector> & iter_batch, std::vector& indexed_counts); @@ -296,5 +307,9 @@ public: size_t &num_indexed); bool is_exceeding_memory_threshold() const; + + void get_doc_changes(const nlohmann::json &document, nlohmann::json &old_doc, + nlohmann::json &new_doc, + nlohmann::json &del_doc); }; diff --git a/include/core_api.h b/include/core_api.h index f4536644..713e5dc1 100644 --- a/include/core_api.h +++ b/include/core_api.h @@ -23,6 +23,8 @@ bool get_export_documents(http_req& req, http_res& res); bool post_add_document(http_req& req, http_res& res); +bool patch_update_document(http_req& req, http_res& res); + bool post_import_documents(http_req& req, http_res& res); bool get_fetch_document(http_req& req, http_res& res); diff --git a/include/http_server.h b/include/http_server.h index 0a30e4b5..847cecef 100644 --- a/include/http_server.h +++ b/include/http_server.h @@ -127,6 +127,8 @@ public: void put(const std::string & path, bool (*handler)(http_req & req, http_res & res), bool async_req=false, bool async_res=false); + void patch(const std::string & path, bool (*handler)(http_req & req, http_res & res), bool async_req=false, bool async_res=false); + void del(const std::string & path, bool (*handler)(http_req & req, http_res & res), bool async_req=false, bool async_res=false); void on(const std::string & message, bool (*handler)(void*)); diff --git a/include/index.h b/include/index.h index 470e320a..13aa537e 100644 --- a/include/index.h +++ b/include/index.h @@ -79,15 +79,29 @@ struct search_args { }; }; +enum index_operation_t { + CREATE, + UPSERT, + UPDATE, + DELETE +}; + struct index_record { - size_t position; // position of record in the original request + size_t position; // position of record in the original request uint32_t seq_id; - nlohmann::json document; - Option indexed; // indicates if the indexing operation was a success + nlohmann::json doc; + nlohmann::json old_doc; + nlohmann::json new_doc; + nlohmann::json del_doc; - index_record(size_t record_pos, uint32_t seq_id, const nlohmann::json& doc): - position(record_pos), seq_id(seq_id), document(doc), indexed(true) { + index_operation_t operation; + bool is_update; + + Option indexed; // indicates if the indexing operation was a success + + index_record(size_t record_pos, uint32_t seq_id, const nlohmann::json& doc, index_operation_t operation): + position(record_pos), seq_id(seq_id), doc(doc), operation(operation), is_update(false), indexed(false) { } @@ -95,7 +109,7 @@ struct index_record { indexed = Option(err_code, err_msg); } - void index_success(const index_record & record) { + void index_success() { indexed = Option(true); } }; @@ -154,32 +168,32 @@ private: size_t & all_result_ids_len, const size_t typo_tokens_threshold); - void insert_doc(const uint32_t score, art_tree *t, uint32_t seq_id, + void insert_doc(const int64_t score, art_tree *t, uint32_t seq_id, const std::unordered_map> &token_to_offsets) const; - void index_string_field(const std::string & text, const uint32_t score, art_tree *t, uint32_t seq_id, + void index_string_field(const std::string & text, const int64_t score, art_tree *t, uint32_t seq_id, 
int facet_id, const field & a_field); - void index_string_array_field(const std::vector & strings, const uint32_t score, art_tree *t, + void index_string_array_field(const std::vector & strings, const int64_t score, art_tree *t, uint32_t seq_id, int facet_id, const field & a_field); - void index_int32_field(const int32_t value, const uint32_t score, art_tree *t, uint32_t seq_id) const; + void index_int32_field(const int32_t value, const int64_t score, art_tree *t, uint32_t seq_id) const; - void index_int64_field(const int64_t value, const uint32_t score, art_tree *t, uint32_t seq_id) const; + void index_int64_field(const int64_t value, const int64_t score, art_tree *t, uint32_t seq_id) const; - void index_float_field(const float value, const uint32_t score, art_tree *t, uint32_t seq_id) const; + void index_float_field(const float value, const int64_t score, art_tree *t, uint32_t seq_id) const; - void index_bool_field(const bool value, const uint32_t score, art_tree *t, uint32_t seq_id) const; + void index_bool_field(const bool value, const int64_t score, art_tree *t, uint32_t seq_id) const; - void index_int32_array_field(const std::vector & values, const uint32_t score, art_tree *t, uint32_t seq_id) const; + void index_int32_array_field(const std::vector & values, const int64_t score, art_tree *t, uint32_t seq_id) const; - void index_int64_array_field(const std::vector & values, const uint32_t score, art_tree *t, uint32_t seq_id) const; + void index_int64_array_field(const std::vector & values, const int64_t score, art_tree *t, uint32_t seq_id) const; - void index_float_array_field(const std::vector & values, const uint32_t score, art_tree *t, uint32_t seq_id) const; + void index_float_array_field(const std::vector & values, const int64_t score, art_tree *t, uint32_t seq_id) const; - void index_bool_array_field(const std::vector & values, const uint32_t score, art_tree *t, uint32_t seq_id) const; + void index_bool_array_field(const std::vector & values, const int64_t score, art_tree *t, uint32_t seq_id) const; - void remove_and_shift_offset_index(sorted_array &offset_index, const uint32_t *indices_sorted, + void remove_and_shift_offset_index(sorted_array& offset_index, const uint32_t* indices_sorted, const uint32_t indices_length); uint32_t* collate_leaf_ids(const std::vector &leaves, size_t& result_ids_len) const; @@ -238,21 +252,22 @@ public: spp::sparse_hash_set& groups_processed, const uint32_t *result_ids, const size_t result_size); - static int32_t get_points_from_doc(const nlohmann::json &document, const std::string & default_sorting_field); + static int64_t get_points_from_doc(const nlohmann::json &document, const std::string & default_sorting_field); Option index_in_memory(const nlohmann::json & document, uint32_t seq_id, - const std::string & default_sorting_field); + const std::string & default_sorting_field, bool is_update); static Option validate_index_in_memory(const nlohmann::json &document, uint32_t seq_id, const std::string & default_sorting_field, const std::unordered_map & search_schema, - const std::map & facet_schema); + const std::map & facet_schema, + bool is_update); static size_t batch_memory_index(Index *index, - std::vector & iter_batch, - const std::string & default_sorting_field, - const std::unordered_map & search_schema, - const std::map & facet_schema); + std::vector & iter_batch, + const std::string & default_sorting_field, + const std::unordered_map & search_schema, + const std::map & facet_schema); const spp::sparse_hash_map &_get_search_index() const; @@ 
-291,5 +306,10 @@ public: void eq_str_filter_plain(const uint32_t *strt_ids, size_t strt_ids_size, const std::vector &query_suggestion, uint32_t *exact_strt_ids, size_t& exact_strt_size) const; + + void scrub_reindex_doc(nlohmann::json& update_doc, nlohmann::json& del_doc, nlohmann::json& old_doc); + + void tokenize_doc_field(const nlohmann::json& document, const std::string& field_name, const field& search_field, + std::vector& tokens); }; diff --git a/include/sorted_array.h b/include/sorted_array.h index 5df7037f..b2fd07ab 100644 --- a/include/sorted_array.h +++ b/include/sorted_array.h @@ -8,6 +8,7 @@ #include #include #include "array_base.h" +#include "logger.h" class sorted_array: public array_base { private: @@ -16,7 +17,15 @@ private: uint32_t m = std::min(min, value); uint32_t M = std::max(max, value); uint32_t bnew = required_bits(M - m); - return METADATA_OVERHEAD + 4 + for_compressed_size_bits(new_length, bnew); + uint32_t size_bits = for_compressed_size_bits(new_length, bnew); + + + /*if(new_length == 15) { + LOG(INFO) << "value: " << value << ", m: " << m << ", M: " << M << ", bnew: " + << bnew << ", size_bits: " << size_bits; + }*/ + + return METADATA_OVERHEAD + 4 + size_bits; } uint32_t lower_bound_search_bits(const uint8_t *in, uint32_t imin, uint32_t imax, uint32_t base, @@ -39,7 +48,11 @@ public: void indexOf(const uint32_t *values, const size_t values_len, uint32_t* indices); // returns false if malloc fails - bool append(uint32_t value); + size_t append(uint32_t value); - void remove_values(uint32_t *sorted_values, uint32_t values_length); + bool insert(size_t index, uint32_t value); + + void remove_value(uint32_t value); + + void remove_values(uint32_t *sorted_values, uint32_t sorted_values_length); }; \ No newline at end of file diff --git a/include/string_utils.h b/include/string_utils.h index 41eab296..46baf767 100644 --- a/include/string_utils.h +++ b/include/string_utils.h @@ -199,6 +199,15 @@ struct StringUtils { return (*p == 0) && val >= std::numeric_limits::min() && val <= std::numeric_limits::max(); } + static bool is_bool(std::string &s) { + if(s.empty()) { + return false; + } + + StringUtils::tolowercase(s); + return s == "true" || s == "false"; + } + static void toupper(std::string& str) { std::transform(str.begin(), str.end(), str.begin(), ::toupper); } diff --git a/src/array.cpp b/src/array.cpp index 7a009d2c..203e0cf3 100644 --- a/src/array.cpp +++ b/src/array.cpp @@ -41,6 +41,47 @@ bool array::append(uint32_t value) { return true; } +void array::load(const uint32_t *sorted_array, const uint32_t array_length, const uint32_t m, const uint32_t M) { + min = m; + max = M; + + uint32_t size_required = (uint32_t) (unsorted_append_size_required(max, array_length) * FOR_GROWTH_FACTOR); + uint8_t *out = (uint8_t *) malloc(size_required * sizeof *out); + uint32_t actual_size = for_compress_unsorted(sorted_array, out, array_length); + + free(in); + in = nullptr; + + in = out; + length = array_length; + size_bytes = size_required; + length_bytes = actual_size; +} + +bool array::insert(size_t index, const uint32_t* values, size_t num_values) { + if(index >= length) { + return false; + } + + uint32_t *curr_array = uncompress(length+num_values); + memmove(&curr_array[index+num_values], &curr_array[index], sizeof(uint32_t)*(length-index)); + + uint32_t m = min, M = max; + + for(size_t i=0; i M) M = value; + curr_array[index+i] = value; + } + + load(curr_array, length+num_values, m, M); + + delete [] curr_array; + + return true; +} + void array::remove_index(uint32_t 
start_index, uint32_t end_index) { uint32_t *curr_array = uncompress(); diff --git a/src/array_base.cpp b/src/array_base.cpp index 4a9a29ff..2eae9dfd 100644 --- a/src/array_base.cpp +++ b/src/array_base.cpp @@ -1,7 +1,8 @@ #include "array_base.h" -uint32_t* array_base::uncompress() { - uint32_t *out = new uint32_t[length]; +uint32_t* array_base::uncompress(uint32_t len) { + uint32_t actual_len = std::max(len, length); + uint32_t *out = new uint32_t[actual_len]; for_uncompress(in, out, length); return out; } diff --git a/src/art.cpp b/src/art.cpp index 0cd2eed3..a77e60cd 100644 --- a/src/art.cpp +++ b/src/art.cpp @@ -39,6 +39,8 @@ static void art_fuzzy_recurse(unsigned char p, unsigned char c, const art_node * void art_int_fuzzy_recurse(art_node *n, int depth, const unsigned char* int_str, int int_str_len, NUM_COMPARATOR comparator, std::vector &results); +static void insert_and_shift_offset_index(sorted_array& offset_index, const uint32_t index, const uint32_t num_offsets); + bool compare_art_leaf_frequency(const art_leaf *a, const art_leaf *b) { return a->values->ids.getLength() > b->values->ids.getLength(); } @@ -408,15 +410,42 @@ art_leaf* art_maximum(art_tree *t) { static void add_document_to_leaf(const art_document *document, art_leaf *leaf) { leaf->max_score = MAX(leaf->max_score, document->score); - leaf->values->ids.append(document->id); - uint32_t curr_index = leaf->values->offsets.getLength(); - leaf->values->offset_index.append(curr_index); + size_t inserted_index = leaf->values->ids.append(document->id); - for(uint32_t i=0; ioffsets_len; i++) { - leaf->values->offsets.append(document->offsets[i]); + if(inserted_index == leaf->values->ids.getLength()-1) { + // treat as appends + uint32_t curr_index = leaf->values->offsets.getLength(); + leaf->values->offset_index.append(curr_index); + for(uint32_t i=0; ioffsets_len; i++) { + leaf->values->offsets.append(document->offsets[i]); + } + } else { + uint32_t existing_offset_index = leaf->values->offset_index.at(inserted_index); + insert_and_shift_offset_index(leaf->values->offset_index, inserted_index, document->offsets_len); + leaf->values->offsets.insert(existing_offset_index, document->offsets, document->offsets_len); } } +void insert_and_shift_offset_index(sorted_array& offset_index, const uint32_t index, const uint32_t num_offsets) { + uint32_t existing_offset_index = offset_index.at(index); + uint32_t length = offset_index.getLength(); + uint32_t new_length = length + 1; + uint32_t *curr_array = offset_index.uncompress(new_length); + + memmove(&curr_array[index+1], &curr_array[index], sizeof(uint32_t)*(length - index)); + curr_array[index] = existing_offset_index; + + uint32_t curr_index = index + 1; + while(curr_index < new_length) { + curr_array[curr_index] += num_offsets; + curr_index++; + } + + offset_index.load(curr_array, new_length); + + delete [] curr_array; +} + static art_leaf* make_leaf(const unsigned char *key, uint32_t key_len, art_document *document) { art_leaf *l = (art_leaf *) malloc(sizeof(art_leaf) + key_len); l->values = new art_values; diff --git a/src/auth_manager.cpp b/src/auth_manager.cpp index 3df010b2..14a3f407 100644 --- a/src/auth_manager.cpp +++ b/src/auth_manager.cpp @@ -5,7 +5,7 @@ constexpr const char* AuthManager::DOCUMENTS_SEARCH_ACTION; Option AuthManager::init(Store *store) { // This function must be idempotent, i.e. 
when called multiple times, must produce the same state without leaks - LOG(INFO) << "AuthManager::init()"; + //LOG(INFO) << "AuthManager::init()"; this->store = store; @@ -157,7 +157,7 @@ bool AuthManager::authenticate(const std::string& req_api_key, const std::string } // enrich params with values from embedded_params - for (const auto& it: embedded_params.items()){ + for(auto it = embedded_params.begin(); it != embedded_params.end(); ++it) { if(params.count(it.key()) == 0) { params[it.key()] = it.value(); } else if(it.key() == "filter_by") { diff --git a/src/collection.cpp b/src/collection.cpp index 4abfba74..aaa51843 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -8,7 +8,6 @@ #include #include #include -#include #include #include #include "topster.h" @@ -99,33 +98,75 @@ void Collection::increment_next_seq_id_field() { next_seq_id++; } -Option Collection::to_doc(const std::string & json_str, nlohmann::json & document) { +Option Collection::to_doc(const std::string & json_str, nlohmann::json& document, + const index_operation_t& operation, const std::string& id) { try { document = nlohmann::json::parse(json_str); } catch(const std::exception& e) { LOG(ERROR) << "JSON error: " << e.what(); - return Option(400, std::string("Bad JSON: ") + e.what()); + return Option(400, std::string("Bad JSON: ") + e.what()); } if(!document.is_object()) { - return Option(400, "Bad JSON: not a properly formed document."); + return Option(400, "Bad JSON: not a properly formed document."); } - uint32_t seq_id = get_next_seq_id(); - std::string seq_id_str = std::to_string(seq_id); + if(document.count("id") != 0 && id != "" && document["id"] != id) { + return Option(400, "The `id` of the resource does not match the `id` in the JSON body."); + } + + if(document.count("id") == 0 && !id.empty()) { + // use the explicit ID (usually from a PUT request) if document body does not have it + document["id"] = id; + } + + if(document.count("id") != 0 && document["id"] == "") { + return Option(400, "The `id` should not be empty."); + } if(document.count("id") == 0) { - document["id"] = seq_id_str; - } else if(!document["id"].is_string()) { - return Option(400, "Document's `id` field should be a string."); - } + if(operation == UPDATE) { + return Option(400, "For update, the `id` key must be provided."); + } + // for UPSERT or CREATE, if a document does not have an ID, we will treat it as a new doc + uint32_t seq_id = get_next_seq_id(); + document["id"] = std::to_string(seq_id); + return Option(doc_seq_id_t{seq_id, true}); + } else { + if(!document["id"].is_string()) { + return Option(400, "Document's `id` field should be a string."); + } - const std::string& doc_id = document["id"]; - if(doc_exists(doc_id)) { - return Option(409, std::string("A document with id ") + doc_id + " already exists."); - } + const std::string& doc_id = document["id"]; - return Option(seq_id); + // try to get the corresponding sequence id from disk if present + std::string seq_id_str; + StoreStatus seq_id_status = store->get(get_doc_id_key(doc_id), seq_id_str); + + if(seq_id_status == StoreStatus::ERROR) { + return Option(500, "Error fetching the sequence key for document with id: " + doc_id); + } + + if(seq_id_status == StoreStatus::FOUND) { + if(operation == CREATE) { + return Option(409, std::string("A document with id ") + doc_id + " already exists."); + } + + // UPSERT or UPDATE + uint32_t seq_id = (uint32_t) std::stoul(seq_id_str); + return Option(doc_seq_id_t{seq_id, false}); + + } else { + if(operation == UPDATE) { + // for 
UPDATE, a document with given ID must be found + return Option(404, "Could not find a document with id: " + doc_id); + } else { + // for UPSERT or CREATE, if a document with given ID is not found, we will treat it as a new doc + uint32_t seq_id = get_next_seq_id(); + return Option(doc_seq_id_t{seq_id, true}); + } + } + } } nlohmann::json Collection::get_summary_json() { @@ -152,45 +193,48 @@ nlohmann::json Collection::get_summary_json() { return json_response; } -Option Collection::add(const std::string & json_str) { +Option Collection::add(const std::string & json_str, + const index_operation_t& operation, const std::string& id) { nlohmann::json document; - Option doc_seq_id_op = to_doc(json_str, document); + std::vector json_lines = {json_str}; + const nlohmann::json& res = add_many(json_lines, document, operation, id); - if(!doc_seq_id_op.ok()) { - return Option(doc_seq_id_op.code(), doc_seq_id_op.error()); - } + if(!res["success"].get()) { + nlohmann::json res_doc; - /*if(is_exceeding_memory_threshold()) { - return Option(403, "Max memory ratio exceeded."); - }*/ + try { + res_doc = nlohmann::json::parse(json_lines[0]); + } catch(const std::exception& e) { + LOG(ERROR) << "JSON error: " << e.what(); + return Option(400, std::string("Bad JSON: ") + e.what()); + } - const uint32_t seq_id = doc_seq_id_op.get(); - const std::string seq_id_str = std::to_string(seq_id); - - const Option & index_memory_op = index_in_memory(document, seq_id); - - if(!index_memory_op.ok()) { - return Option(index_memory_op.code(), index_memory_op.error()); - } - - const std::string& serialized_json = document.dump(-1, ' ', false, nlohmann::detail::error_handler_t::ignore); - - rocksdb::WriteBatch batch; - batch.Put(get_doc_id_key(document["id"]), seq_id_str); - batch.Put(get_seq_id_key(seq_id), serialized_json); - bool write_ok = store->batch_write(batch); - - if(!write_ok) { - remove_document(document, seq_id, false); // remove from in-memory store too - return Option(500, "Could not write to on-disk storage."); + return Option(res_doc["code"].get(), res_doc["error"].get()); } return Option(document); } -nlohmann::json Collection::add_many(std::vector& json_lines) { - //LOG(INFO) << "Memory ratio. Max = " << max_memory_ratio << ", Used = " << SystemMetrics::used_memory_ratio(); +void Collection::get_doc_changes(const nlohmann::json &document, nlohmann::json &old_doc, + nlohmann::json &new_doc, nlohmann::json &del_doc) { + for(auto it = old_doc.begin(); it != old_doc.end(); ++it) { + new_doc[it.key()] = it.value(); + } + + for(auto it = document.begin(); it != document.end(); ++it) { + new_doc[it.key()] = it.value(); + if(old_doc.count(it.key()) != 0) { + // key exists in the stored doc, so it must be reindexed + // we need to check for this because a field can be optional + del_doc[it.key()] = old_doc[it.key()]; + } + } +} + +nlohmann::json Collection::add_many(std::vector& json_lines, nlohmann::json& document, + const index_operation_t& operation, const std::string& id) { + //LOG(INFO) << "Memory ratio. 
Max = " << max_memory_ratio << ", Used = " << SystemMetrics::used_memory_ratio(); std::vector> iter_batch; for(size_t i = 0; i < num_memory_shards; i++) { @@ -203,16 +247,23 @@ nlohmann::json Collection::add_many(std::vector& json_lines) { for(size_t i=0; i < json_lines.size(); i++) { const std::string & json_line = json_lines[i]; - nlohmann::json document; - Option doc_seq_id_op = to_doc(json_line, document); + Option doc_seq_id_op = to_doc(json_line, document, operation, id); - const uint32_t seq_id = doc_seq_id_op.ok() ? doc_seq_id_op.get() : 0; - index_record record(i, seq_id, document); + const uint32_t seq_id = doc_seq_id_op.ok() ? doc_seq_id_op.get().seq_id : 0; + index_record record(i, seq_id, document, operation); // NOTE: we overwrite the input json_lines with result to avoid memory pressure + record.is_update = false; + if(!doc_seq_id_op.ok()) { record.index_failure(doc_seq_id_op.code(), doc_seq_id_op.error()); + } else { + record.is_update = !doc_seq_id_op.get().is_new; + if(record.is_update) { + get_document_from_store(get_seq_id_key(seq_id), record.old_doc); + get_doc_changes(document, record.old_doc, record.new_doc, record.del_doc); + } } /* @@ -261,45 +312,74 @@ void Collection::batch_index(std::vector> &index_batch // store only documents that were indexed in-memory successfully for(auto& index_batch: index_batches) { for(auto& index_record: index_batch) { + nlohmann::json res; + if(index_record.indexed.ok()) { - const std::string& seq_id_str = std::to_string(index_record.seq_id); - const std::string& serialized_json = index_record.document.dump(-1, ' ', false, - nlohmann::detail::error_handler_t::ignore); + if(index_record.is_update) { + const std::string& serialized_json = index_record.new_doc.dump(-1, ' ', false, nlohmann::detail::error_handler_t::ignore); + bool write_ok = store->insert(get_seq_id_key(index_record.seq_id), serialized_json); - rocksdb::WriteBatch batch; - batch.Put(get_doc_id_key(index_record.document["id"]), seq_id_str); - batch.Put(get_seq_id_key(index_record.seq_id), serialized_json); - bool write_ok = store->batch_write(batch); + if(!write_ok) { + // we will attempt to reindex the old doc on a best-effort basis + remove_document(index_record.new_doc, index_record.seq_id, false); + index_in_memory(index_record.old_doc, index_record.seq_id, false); + index_record.index_failure(500, "Could not write to on-disk storage."); + } else { + num_indexed++; + index_record.index_success(); + } - if(!write_ok) { - index_record.indexed = Option(500, "Could not write to on-disk storage.");; - // remove from in-memory store to keep the state synced - remove_document(index_record.document, index_record.seq_id, false); + } else { + const std::string& seq_id_str = std::to_string(index_record.seq_id); + const std::string& serialized_json = index_record.doc.dump(-1, ' ', false, + nlohmann::detail::error_handler_t::ignore); + + rocksdb::WriteBatch batch; + batch.Put(get_doc_id_key(index_record.doc["id"]), seq_id_str); + batch.Put(get_seq_id_key(index_record.seq_id), serialized_json); + bool write_ok = store->batch_write(batch); + + if(!write_ok) { + // remove from in-memory store to keep the state synced + remove_document(index_record.doc, index_record.seq_id, false); + index_record.index_failure(500, "Could not write to on-disk storage."); + } else { + num_indexed++; + index_record.index_success(); + } } - json_out[index_record.position] = R"({"success": true})"; - num_indexed++; + res["success"] = index_record.indexed.ok(); + if(!index_record.indexed.ok()) { + 
res["document"] = json_out[index_record.position]; + res["error"] = index_record.indexed.error(); + res["code"] = index_record.indexed.code(); + } } else { - nlohmann::json res; res["success"] = false; - res["error"] = index_record.indexed.error(); res["document"] = json_out[index_record.position]; - json_out[index_record.position] = res.dump(); + res["error"] = index_record.indexed.error(); + res["code"] = index_record.indexed.code(); } + + json_out[index_record.position] = res.dump(); } } } -Option Collection::index_in_memory(const nlohmann::json &document, uint32_t seq_id) { - Option validation_op = Index::validate_index_in_memory(document, seq_id, default_sorting_field, - search_schema, facet_schema); +Option Collection::index_in_memory(const nlohmann::json &document, uint32_t seq_id, bool is_update) { + if(!is_update) { + // for update, validation should be done prior + Option validation_op = Index::validate_index_in_memory(document, seq_id, default_sorting_field, + search_schema, facet_schema, is_update); - if(!validation_op.ok()) { - return validation_op; + if(!validation_op.ok()) { + return validation_op; + } } Index* index = indices[seq_id % num_memory_shards]; - index->index_in_memory(document, seq_id, default_sorting_field); + index->index_in_memory(document, seq_id, default_sorting_field, is_update); num_documents += 1; return Option<>(200); @@ -418,6 +498,7 @@ Option Collection::search(const std::string & query, const std:: const size_t max_facet_values, const std::string & simple_facet_query, const size_t snippet_threshold, + const size_t highlight_affix_num_tokens, const std::string & highlight_full_fields, size_t typo_tokens_threshold, const std::map>& pinned_hits, @@ -992,7 +1073,8 @@ Option Collection::search(const std::string & query, const std:: bool highlighted_fully = (fields_highlighted_fully.find(field_name) != fields_highlighted_fully.end()); highlight_t highlight; highlight_result(search_field, searched_queries, field_order_kv, document, - string_utils, snippet_threshold, highlighted_fully, highlight); + string_utils, snippet_threshold, highlight_affix_num_tokens, + highlighted_fully, highlight); if(!highlight.snippets.empty()) { highlights.push_back(highlight); @@ -1238,7 +1320,9 @@ void Collection::facet_value_to_string(const facet &a_facet, const facet_count_t void Collection::highlight_result(const field &search_field, const std::vector> &searched_queries, const KV* field_order_kv, const nlohmann::json & document, - StringUtils & string_utils, size_t snippet_threshold, + StringUtils & string_utils, + const size_t snippet_threshold, + const size_t highlight_affix_num_tokens, bool highlighted_fully, highlight_t & highlight) { @@ -1316,6 +1400,10 @@ void Collection::highlight_result(const field &search_field, if(match.offsets[i].offset != MAX_DISPLACEMENT) { size_t token_index = (size_t)(match.offsets[i].offset); token_indices.push_back(token_index); + if(token_index >= tokens.size()) { + LOG(ERROR) << "Highlight token index " << token_index << " is greater than length of store field."; + continue; + } std::string token = tokens[token_index]; string_utils.unicode_normalize(token); token_hits.insert(token); @@ -1324,12 +1412,15 @@ void Collection::highlight_result(const field &search_field, auto minmax = std::minmax_element(token_indices.begin(), token_indices.end()); + size_t prefix_length = highlight_affix_num_tokens; + size_t suffix_length = highlight_affix_num_tokens + 1; + // For longer strings, pick surrounding tokens within 4 tokens of min_index and 
max_index for the snippet const size_t start_index = (tokens.size() <= snippet_threshold) ? 0 : - std::max(0, (int)(*(minmax.first) - 4)); + std::max(0, (int)(*(minmax.first) - prefix_length)); const size_t end_index = (tokens.size() <= snippet_threshold) ? tokens.size() : - std::min((int)tokens.size(), (int)(*(minmax.second) + 5)); + std::min((int)tokens.size(), (int)(*(minmax.second) + suffix_length)); std::stringstream snippet_stream; for(size_t snippet_index = start_index; snippet_index < end_index; snippet_index++) { @@ -1401,7 +1492,7 @@ Option Collection::get(const std::string & id) { return Option(500, "Error while fetching the document."); } - uint32_t seq_id = (uint32_t) std::stol(seq_id_str); + uint32_t seq_id = (uint32_t) std::stoul(seq_id_str); std::string parsed_document; StoreStatus doc_status = store->get(get_seq_id_key(seq_id), parsed_document); @@ -1450,7 +1541,7 @@ Option Collection::remove(const std::string & id, const bool remove return Option(500, "Error while fetching the document."); } - uint32_t seq_id = (uint32_t) std::stol(seq_id_str); + uint32_t seq_id = (uint32_t) std::stoul(seq_id_str); std::string parsed_document; StoreStatus doc_status = store->get(get_seq_id_key(seq_id), parsed_document); diff --git a/src/collection_manager.cpp b/src/collection_manager.cpp index e548659f..59ee29dc 100644 --- a/src/collection_manager.cpp +++ b/src/collection_manager.cpp @@ -173,7 +173,7 @@ Option CollectionManager::load(const size_t init_batch_size) { } num_valid_docs++; - iter_batch[seq_id % collection->get_num_memory_shards()].emplace_back(index_record(0, seq_id, document)); + iter_batch[seq_id % collection->get_num_memory_shards()].emplace_back(index_record(0, seq_id, document, CREATE)); // Peek and check for last record right here so that we handle batched indexing correctly // Without doing this, the "last batch" would have to be indexed outside the loop. 
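
The `highlight_affix_num_tokens` parameter threaded through `Collection::highlight_result` above replaces the hard-coded ±4 token window with a configurable affix. A minimal standalone sketch of that window arithmetic (hypothetical helper, not code from this patch):

```cpp
#include <algorithm>
#include <cstddef>

// Sketch of the snippet-window arithmetic used by Collection::highlight_result above.
// The snippet keeps `highlight_affix_num_tokens` tokens before the first matched token
// and after the last matched token; short fields stay fully highlighted.
struct SnippetWindow {
    std::size_t start;  // inclusive
    std::size_t end;    // exclusive
};

SnippetWindow snippet_window(std::size_t num_tokens, std::size_t min_match, std::size_t max_match,
                             std::size_t snippet_threshold, std::size_t highlight_affix_num_tokens) {
    if(num_tokens <= snippet_threshold) {
        return {0, num_tokens};                                        // short field: no snippeting
    }
    const std::size_t prefix_length = highlight_affix_num_tokens;
    const std::size_t suffix_length = highlight_affix_num_tokens + 1;  // end index is exclusive
    const std::size_t start = (min_match > prefix_length) ? (min_match - prefix_length) : 0;
    const std::size_t end   = std::min(num_tokens, max_match + suffix_length);
    return {start, end};
}

// e.g. 100 tokens, snippet_threshold 30, matches at indices 40..42, affix 4 -> window [36, 47)
```
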
@@ -195,7 +195,7 @@ Option CollectionManager::load(const size_t init_batch_size) { if(num_indexed != num_records) { const Option & index_error_op = get_first_index_error(iter_batch[i]); - if(index_error_op.ok()) { + if(!index_error_op.ok()) { return Option(false, index_error_op.get()); } } diff --git a/src/core_api.cpp b/src/core_api.cpp index f2b2f9c9..bdb36297 100644 --- a/src/core_api.cpp +++ b/src/core_api.cpp @@ -27,6 +27,18 @@ bool handle_authentication(std::map& req_params, const return collectionManager.auth_key_matches(auth_key, rpath.action, collection, req_params); } +index_operation_t get_index_operation(const std::string& action) { + if(action == "create") { + return CREATE; + } else if(action == "update") { + return UPDATE; + } else if(action == "upsert") { + return UPSERT; + } + + return CREATE; +} + bool get_collections(http_req & req, http_res & res) { CollectionManager & collectionManager = CollectionManager::get_instance(); std::vector collections = collectionManager.get_collections(); @@ -254,6 +266,9 @@ bool get_search(http_req & req, http_res & res) { // strings under this length will be fully highlighted, instead of showing a snippet of relevant portion const char *SNIPPET_THRESHOLD = "snippet_threshold"; + // the number of tokens that should surround the highlighted text + const char *HIGHLIGHT_AFFIX_NUM_TOKENS = "highlight_affix_num_tokens"; + // list of fields which will be highlighted fully without snippeting const char *HIGHLIGHT_FULL_FIELDS = "highlight_full_fields"; @@ -290,6 +305,10 @@ bool get_search(http_req & req, http_res & res) { req.params[SNIPPET_THRESHOLD] = "30"; } + if(req.params.count(HIGHLIGHT_AFFIX_NUM_TOKENS) == 0) { + req.params[HIGHLIGHT_AFFIX_NUM_TOKENS] = "4"; + } + if(req.params.count(HIGHLIGHT_FULL_FIELDS) == 0) { req.params[HIGHLIGHT_FULL_FIELDS] = ""; } @@ -362,6 +381,11 @@ bool get_search(http_req & req, http_res & res) { return false; } + if(!StringUtils::is_uint32_t(req.params[HIGHLIGHT_AFFIX_NUM_TOKENS])) { + res.set_400("Parameter `" + std::string(HIGHLIGHT_AFFIX_NUM_TOKENS) + "` must be an unsigned integer."); + return false; + } + if(!StringUtils::is_uint32_t(req.params[GROUP_LIMIT])) { res.set_400("Parameter `" + std::string(GROUP_LIMIT) + "` must be an unsigned integer."); return false; @@ -474,6 +498,7 @@ bool get_search(http_req & req, http_res & res) { static_cast(std::stol(req.params[MAX_FACET_VALUES])), req.params[FACET_QUERY], static_cast(std::stol(req.params[SNIPPET_THRESHOLD])), + static_cast(std::stol(req.params[HIGHLIGHT_AFFIX_NUM_TOKENS])), req.params[HIGHLIGHT_FULL_FIELDS], typo_tokens_threshold, pinned_hits, @@ -579,11 +604,16 @@ bool post_import_documents(http_req& req, http_res& res) { //LOG(INFO) << "post_import_documents"; //LOG(INFO) << "req.first_chunk=" << req.first_chunk_aggregate << ", last_chunk=" << req.last_chunk_aggregate; const char *BATCH_SIZE = "batch_size"; + const char *ACTION = "action"; if(req.params.count(BATCH_SIZE) == 0) { req.params[BATCH_SIZE] = "40"; } + if(req.params.count(ACTION) == 0) { + req.params[ACTION] = "create"; + } + if(!StringUtils::is_uint32_t(req.params[BATCH_SIZE])) { req.last_chunk_aggregate = true; res.final = true; @@ -592,6 +622,14 @@ bool post_import_documents(http_req& req, http_res& res) { return false; } + if(req.params[ACTION] != "create" && req.params[ACTION] != "update" && req.params[ACTION] != "upsert") { + req.last_chunk_aggregate = true; + res.final = true; + res.set_400("Parameter `" + std::string(ACTION) + "` must be a create|update|upsert."); + 
HttpServer::stream_response(req, res); + return false; + } + const size_t IMPORT_BATCH_SIZE = std::stoi(req.params[BATCH_SIZE]); if(IMPORT_BATCH_SIZE == 0) { @@ -667,8 +705,11 @@ bool post_import_documents(http_req& req, http_res& res) { //LOG(INFO) << "single_partial_record_body: " << single_partial_record_body; + const index_operation_t operation = get_index_operation(req.params[ACTION]); + if(!single_partial_record_body) { - nlohmann::json json_res = collection->add_many(json_lines); + nlohmann::json document; + nlohmann::json json_res = collection->add_many(json_lines, document, operation); //const std::string& import_summary_json = json_res.dump(); //response_stream << import_summary_json << "\n"; @@ -698,6 +739,16 @@ bool post_import_documents(http_req& req, http_res& res) { } bool post_add_document(http_req & req, http_res & res) { + const char *ACTION = "action"; + if(req.params.count(ACTION) == 0) { + req.params[ACTION] = "create"; + } + + if(req.params[ACTION] != "create" && req.params[ACTION] != "update" && req.params[ACTION] != "upsert") { + res.set_400("Parameter `" + std::string(ACTION) + "` must be a create|update|upsert."); + return false; + } + CollectionManager & collectionManager = CollectionManager::get_instance(); Collection* collection = collectionManager.get_collection(req.params["collection"]); @@ -706,7 +757,8 @@ bool post_add_document(http_req & req, http_res & res) { return false; } - Option inserted_doc_op = collection->add(req.body); + const index_operation_t operation = get_index_operation(req.params[ACTION]); + Option inserted_doc_op = collection->add(req.body, operation); if(!inserted_doc_op.ok()) { res.set(inserted_doc_op.code(), inserted_doc_op.error()); @@ -717,6 +769,28 @@ bool post_add_document(http_req & req, http_res & res) { return true; } +bool patch_update_document(http_req & req, http_res & res) { + std::string doc_id = req.params["id"]; + + CollectionManager & collectionManager = CollectionManager::get_instance(); + Collection* collection = collectionManager.get_collection(req.params["collection"]); + + if(collection == nullptr) { + res.set_404(); + return false; + } + + Option upserted_doc_op = collection->add(req.body, index_operation_t::UPDATE, doc_id); + + if(!upserted_doc_op.ok()) { + res.set(upserted_doc_op.code(), upserted_doc_op.error()); + return false; + } + + res.set_201(upserted_doc_op.get().dump()); + return true; +} + bool get_fetch_document(http_req & req, http_res & res) { std::string doc_id = req.params["id"]; @@ -1044,7 +1118,7 @@ bool get_key(http_req &req, http_res &res) { AuthManager &auth_manager = collectionManager.getAuthManager(); const std::string& key_id_str = req.params["id"]; - uint32_t key_id = (uint32_t) std::stol(key_id_str); + uint32_t key_id = (uint32_t) std::stoul(key_id_str); const Option& key_op = auth_manager.get_key(key_id); @@ -1066,7 +1140,7 @@ bool del_key(http_req &req, http_res &res) { AuthManager &auth_manager = collectionManager.getAuthManager(); const std::string& key_id_str = req.params["id"]; - uint32_t key_id = (uint32_t) std::stol(key_id_str); + uint32_t key_id = (uint32_t) std::stoul(key_id_str); const Option &del_op = auth_manager.remove_key(key_id); diff --git a/src/http_server.cpp b/src/http_server.cpp index 75b07790..9d9a035a 100644 --- a/src/http_server.cpp +++ b/src/http_server.cpp @@ -129,6 +129,7 @@ int HttpServer::create_listener() { ctx.globalconf->server_name = h2o_strdup(nullptr, "", SIZE_MAX); ctx.globalconf->http2.active_stream_window_size = ACTIVE_STREAM_WINDOW_SIZE; 
ctx.globalconf->http2.idle_timeout = REQ_TIMEOUT_MS; + ctx.globalconf->max_request_entity_size = (1024 * 1024 * 1024); // 1 GB ctx.globalconf->http1.req_timeout = REQ_TIMEOUT_MS; ctx.globalconf->http1.req_io_timeout = REQ_TIMEOUT_MS; @@ -705,6 +706,13 @@ void HttpServer::put(const std::string & path, bool (*handler)(http_req &, http_ routes.emplace_back(rpath.route_hash(), rpath); } +void HttpServer::patch(const std::string & path, bool (*handler)(http_req &, http_res &), bool async_req, bool async_res) { + std::vector path_parts; + StringUtils::split(path, path_parts, "/"); + route_path rpath("PATCH", path_parts, handler, async_req, async_res); + routes.emplace_back(rpath.route_hash(), rpath); +} + void HttpServer::del(const std::string & path, bool (*handler)(http_req &, http_res &), bool async_req, bool async_res) { std::vector path_parts; StringUtils::split(path, path_parts, "/"); diff --git a/src/index.cpp b/src/index.cpp index 21262d01..469cde0e 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -56,8 +56,8 @@ Index::~Index() { sort_index.clear(); } -int32_t Index::get_points_from_doc(const nlohmann::json &document, const std::string & default_sorting_field) { - int32_t points = 0; +int64_t Index::get_points_from_doc(const nlohmann::json &document, const std::string & default_sorting_field) { + int64_t points = 0; if(!default_sorting_field.empty()) { if(document[default_sorting_field].is_number_float()) { @@ -85,8 +85,15 @@ int64_t Index::float_to_in64_t(float f) { } Option Index::index_in_memory(const nlohmann::json &document, uint32_t seq_id, - const std::string & default_sorting_field) { - int32_t points = get_points_from_doc(document, default_sorting_field); + const std::string & default_sorting_field, bool is_update) { + + int64_t points = 0; + + if(is_update && document.count(default_sorting_field) == 0) { + points = sort_index[default_sorting_field]->at(seq_id); + } else { + points = get_points_from_doc(document, default_sorting_field); + } std::unordered_map facet_to_id; size_t i_facet = 0; @@ -104,7 +111,7 @@ Option Index::index_in_memory(const nlohmann::json &document, uint32_t for(const std::pair & field_pair: search_schema) { const std::string & field_name = field_pair.first; - if(field_pair.second.optional && document.count(field_name) == 0) { + if((field_pair.second.optional || is_update) && document.count(field_name) == 0) { continue; } @@ -212,17 +219,22 @@ Option Index::index_in_memory(const nlohmann::json &document, uint32_t Option Index::validate_index_in_memory(const nlohmann::json &document, uint32_t seq_id, const std::string & default_sorting_field, const std::unordered_map & search_schema, - const std::map & facet_schema) { - if(document.count(default_sorting_field) == 0) { + const std::map & facet_schema, + bool is_update) { + + bool has_default_sort_field = (document.count(default_sorting_field) != 0); + + if(!has_default_sort_field && !is_update) { return Option<>(400, "Field `" + default_sorting_field + "` has been declared as a default sorting field, " "but is not found in the document."); } - if(!document[default_sorting_field].is_number_integer() && !document[default_sorting_field].is_number_float()) { + if(has_default_sort_field && + !document[default_sorting_field].is_number_integer() && !document[default_sorting_field].is_number_float()) { return Option<>(400, "Default sorting field `" + default_sorting_field + "` must be a single valued numerical field."); } - if(search_schema.at(default_sorting_field).is_single_float() && + if(has_default_sort_field 
&& search_schema.at(default_sorting_field).is_single_float() && document[default_sorting_field].get() > std::numeric_limits::max()) { return Option<>(400, "Default sorting field `" + default_sorting_field + "` exceeds maximum value of a float."); } @@ -230,7 +242,7 @@ Option Index::validate_index_in_memory(const nlohmann::json &document, for(const std::pair & field_pair: search_schema) { const std::string & field_name = field_pair.first; - if(field_pair.second.optional && document.count(field_name) == 0) { + if((field_pair.second.optional || is_update) && document.count(field_name) == 0) { continue; } @@ -309,6 +321,48 @@ Option Index::validate_index_in_memory(const nlohmann::json &document, return Option<>(200); } +void Index::scrub_reindex_doc(nlohmann::json& update_doc, nlohmann::json& del_doc, nlohmann::json& old_doc) { + auto it = del_doc.cbegin(); + while(it != del_doc.cend()) { + const std::string& field_name = it.key(); + const auto& search_field_it = search_schema.find(field_name); + if(search_field_it == search_schema.end()) { + ++it; + continue; + } + + const auto& search_field = search_field_it->second; + + // Go through all the field names and find the keys+values so that they can be removed from in-memory index + std::vector reindex_tokens; + std::vector old_tokens; + tokenize_doc_field(update_doc, field_name, search_field, reindex_tokens); + tokenize_doc_field(old_doc, field_name, search_field, old_tokens); + + if(old_tokens.size() != reindex_tokens.size()) { + ++it; + continue; + } + + bool exact_match = true; + + for(size_t i=0; i & iter_batch, const std::string & default_sorting_field, const std::unordered_map & search_schema, @@ -322,29 +376,42 @@ size_t Index::batch_memory_index(Index *index, std::vector & iter_ continue; } - Option validation_op = validate_index_in_memory(index_rec.document, index_rec.seq_id, - default_sorting_field, - search_schema, facet_schema); + if(index_rec.operation != DELETE) { + Option validation_op = validate_index_in_memory(index_rec.doc, index_rec.seq_id, + default_sorting_field, + search_schema, facet_schema, index_rec.is_update); - if(!validation_op.ok()) { - index_rec.index_failure(validation_op.code(), validation_op.error()); - continue; + if(!validation_op.ok()) { + index_rec.index_failure(validation_op.code(), validation_op.error()); + continue; + } + + if(index_rec.is_update) { + // scrub string fields to reduce delete ops + index->scrub_reindex_doc(index_rec.doc, index_rec.del_doc, index_rec.old_doc); + index->remove(index_rec.seq_id, index_rec.del_doc); + } + + Option index_mem_op = index->index_in_memory(index_rec.doc, index_rec.seq_id, + default_sorting_field, index_rec.is_update); + if(!index_mem_op.ok()) { + index->index_in_memory(index_rec.del_doc, index_rec.seq_id, default_sorting_field, true); + index_rec.index_failure(index_mem_op.code(), index_mem_op.error()); + continue; + } + + index_rec.index_success(); + + if(!index_rec.is_update) { + num_indexed++; + } } - - Option index_mem_op = index->index_in_memory(index_rec.document, index_rec.seq_id, default_sorting_field); - if(!index_mem_op.ok()) { - index_rec.index_failure(index_mem_op.code(), index_mem_op.error()); - continue; - } - - index_rec.index_success(index_rec); - num_indexed++; } return num_indexed; } -void Index::insert_doc(const uint32_t score, art_tree *t, uint32_t seq_id, +void Index::insert_doc(const int64_t score, art_tree *t, uint32_t seq_id, const std::unordered_map> &token_to_offsets) const { for(auto & kv: token_to_offsets) { art_document art_doc; @@ 
-369,13 +436,14 @@ void Index::insert_doc(const uint32_t score, art_tree *t, uint32_t seq_id, art_doc.offsets[i] = kv.second[i]; } + //LOG(INFO) << "key: " << key << ", art_doc.id: " << art_doc.id; art_insert(t, key, key_len, &art_doc, num_hits); delete [] art_doc.offsets; art_doc.offsets = nullptr; } } -void Index::index_int32_field(const int32_t value, uint32_t score, art_tree *t, uint32_t seq_id) const { +void Index::index_int32_field(const int32_t value, int64_t score, art_tree *t, uint32_t seq_id) const { const int KEY_LEN = 8; unsigned char key[KEY_LEN]; @@ -398,7 +466,7 @@ void Index::index_int32_field(const int32_t value, uint32_t score, art_tree *t, art_insert(t, key, KEY_LEN, &art_doc, num_hits); } -void Index::index_int64_field(const int64_t value, uint32_t score, art_tree *t, uint32_t seq_id) const { +void Index::index_int64_field(const int64_t value, int64_t score, art_tree *t, uint32_t seq_id) const { const int KEY_LEN = 8; unsigned char key[KEY_LEN]; @@ -421,7 +489,7 @@ void Index::index_int64_field(const int64_t value, uint32_t score, art_tree *t, art_insert(t, key, KEY_LEN, &art_doc, num_hits); } -void Index::index_bool_field(const bool value, const uint32_t score, art_tree *t, uint32_t seq_id) const { +void Index::index_bool_field(const bool value, const int64_t score, art_tree *t, uint32_t seq_id) const { const int KEY_LEN = 1; unsigned char key[KEY_LEN]; key[0] = value ? '1' : '0'; @@ -443,7 +511,7 @@ void Index::index_bool_field(const bool value, const uint32_t score, art_tree *t art_insert(t, key, KEY_LEN, &art_doc, num_hits); } -void Index::index_float_field(const float value, uint32_t score, art_tree *t, uint32_t seq_id) const { +void Index::index_float_field(const float value, int64_t score, art_tree *t, uint32_t seq_id) const { const int KEY_LEN = 8; unsigned char key[KEY_LEN]; @@ -484,7 +552,7 @@ uint64_t Index::facet_token_hash(const field & a_field, const std::string &token return hash; } -void Index::index_string_field(const std::string & text, const uint32_t score, art_tree *t, +void Index::index_string_field(const std::string & text, const int64_t score, art_tree *t, uint32_t seq_id, int facet_id, const field & a_field) { std::vector tokens; StringUtils::split(text, tokens, " "); @@ -506,6 +574,10 @@ void Index::index_string_field(const std::string & text, const uint32_t score, a token_to_offsets[token].push_back(i); } + /*if(seq_id == 0) { + LOG(INFO) << "field name: " << a_field.name; + }*/ + insert_doc(score, t, seq_id, token_to_offsets); if(facet_id >= 0) { @@ -513,7 +585,7 @@ void Index::index_string_field(const std::string & text, const uint32_t score, a } } -void Index::index_string_array_field(const std::vector & strings, const uint32_t score, art_tree *t, +void Index::index_string_array_field(const std::vector & strings, const int64_t score, art_tree *t, uint32_t seq_id, int facet_id, const field & a_field) { std::unordered_map> token_positions; @@ -565,28 +637,28 @@ void Index::index_string_array_field(const std::vector & strings, c insert_doc(score, t, seq_id, token_positions); } -void Index::index_int32_array_field(const std::vector & values, const uint32_t score, art_tree *t, +void Index::index_int32_array_field(const std::vector & values, const int64_t score, art_tree *t, uint32_t seq_id) const { for(const int32_t value: values) { index_int32_field(value, score, t, seq_id); } } -void Index::index_int64_array_field(const std::vector & values, const uint32_t score, art_tree *t, +void Index::index_int64_array_field(const std::vector & values, 
const int64_t score, art_tree *t, uint32_t seq_id) const { for(const int64_t value: values) { index_int64_field(value, score, t, seq_id); } } -void Index::index_bool_array_field(const std::vector & values, const uint32_t score, art_tree *t, +void Index::index_bool_array_field(const std::vector & values, const int64_t score, art_tree *t, uint32_t seq_id) const { for(const bool value: values) { index_bool_field(value, score, t, seq_id); } } -void Index::index_float_array_field(const std::vector & values, const uint32_t score, art_tree *t, +void Index::index_float_array_field(const std::vector & values, const int64_t score, art_tree *t, uint32_t seq_id) const { for(const float value: values) { index_float_field(value, score, t, seq_id); @@ -996,7 +1068,7 @@ Option Index::do_filtering(uint32_t** filter_ids_out, const std::vecto bool found_filter = false; if(!f.is_array()) { - found_filter = (str_tokens.size() == fvalues.size()); + found_filter = (query_suggestion.size() == fvalues.size()); } else { uint64_t filter_hash = 1; @@ -1712,6 +1784,11 @@ void Index::populate_token_positions(const std::vector& query_sugges // a) last element is array_index b) second and third last elements will be largest offset // (last element is repeated to indicate end of offsets for a given array index) + /*uint32_t* offsets = token_leaf->values->offsets.uncompress(); + for(size_t ii=0; ii < token_leaf->values->offsets.getLength(); ii++) { + LOG(INFO) << "offset: " << offsets[ii]; + }*/ + uint32_t start_offset = token_leaf->values->offset_index.at(doc_index); uint32_t end_offset = (doc_index == token_leaf->values->ids.getLength() - 1) ? token_leaf->values->offsets.getLength() : @@ -1767,8 +1844,8 @@ inline std::vector Index::next_suggestion(const std::vector Index::remove(const uint32_t seq_id, const nlohmann::json & document) { - for(auto & name_field: search_schema) { - if(name_field.second.optional && document.count(name_field.first) == 0) { + std::unordered_map facet_to_index; + get_facet_to_index(facet_to_index); + + for(auto it = document.begin(); it != document.end(); ++it) { + const std::string& field_name = it.key(); + const auto& search_field_it = search_schema.find(field_name); + if(search_field_it == search_schema.end()) { continue; } + const auto& search_field = search_field_it->second; + // Go through all the field names and find the keys+values so that they can be removed from in-memory index std::vector tokens; - if(name_field.second.type == field_types::STRING) { - StringUtils::split(document[name_field.first], tokens, " "); - } else if(name_field.second.type == field_types::STRING_ARRAY) { - std::vector values = document[name_field.first].get>(); - for(const std::string & value: values) { - StringUtils::split(value, tokens, " "); - } - } else if(name_field.second.type == field_types::INT32) { - const int KEY_LEN = 8; - unsigned char key[KEY_LEN]; - int32_t value = document[name_field.first].get(); - encode_int32(value, key); - tokens.push_back(std::string((char*)key, KEY_LEN)); - } else if(name_field.second.type == field_types::INT32_ARRAY) { - std::vector values = document[name_field.first].get>(); - for(const int32_t value: values) { - const int KEY_LEN = 8; - unsigned char key[KEY_LEN]; - encode_int32(value, key); - tokens.push_back(std::string((char*)key, KEY_LEN)); - } - } else if(name_field.second.type == field_types::INT64) { - const int KEY_LEN = 8; - unsigned char key[KEY_LEN]; - int64_t value = document[name_field.first].get(); - encode_int64(value, key); - 
tokens.push_back(std::string((char*)key, KEY_LEN)); - } else if(name_field.second.type == field_types::INT64_ARRAY) { - std::vector values = document[name_field.first].get>(); - for(const int64_t value: values) { - const int KEY_LEN = 8; - unsigned char key[KEY_LEN]; - encode_int64(value, key); - tokens.push_back(std::string((char*)key, KEY_LEN)); - } - } else if(name_field.second.type == field_types::FLOAT) { - const int KEY_LEN = 8; - unsigned char key[KEY_LEN]; - int64_t value = document[name_field.first].get(); - encode_float(value, key); - tokens.push_back(std::string((char*)key, KEY_LEN)); - } else if(name_field.second.type == field_types::FLOAT_ARRAY) { - std::vector values = document[name_field.first].get>(); - for(const float value: values) { - const int KEY_LEN = 8; - unsigned char key[KEY_LEN]; - encode_float(value, key); - tokens.push_back(std::string((char*)key, KEY_LEN)); - } - } else if(name_field.second.type == field_types::BOOL) { - const int KEY_LEN = 1; - unsigned char key[KEY_LEN]; - bool value = document[name_field.first].get(); - key[0] = value ? '1' : '0'; - tokens.push_back(std::string((char*)key, KEY_LEN)); - } else if(name_field.second.type == field_types::BOOL_ARRAY) { - std::vector values = document[name_field.first].get>(); - for(const bool value: values) { - const int KEY_LEN = 1; - unsigned char key[KEY_LEN]; - key[0] = value ? '1' : '0'; - tokens.push_back(std::string((char*)key, KEY_LEN)); - } - } + tokenize_doc_field(document, field_name, search_field, tokens); for(auto & token: tokens) { const unsigned char *key; int key_len; - if(name_field.second.type == field_types::STRING_ARRAY || name_field.second.type == field_types::STRING) { + if(search_field.type == field_types::STRING_ARRAY || search_field.type == field_types::STRING) { string_utils.unicode_normalize(token); key = (const unsigned char *) token.c_str(); key_len = (int) (token.length() + 1); @@ -1886,9 +1907,8 @@ Option Index::remove(const uint32_t seq_id, const nlohmann::json & doc key_len = (int) (token.length()); } - art_leaf* leaf = (art_leaf *) art_search(search_index.at(name_field.first), key, key_len); - if(leaf != NULL) { - uint32_t seq_id_values[1] = {seq_id}; + art_leaf* leaf = (art_leaf *) art_search(search_index.at(field_name), key, key_len); + if(leaf != nullptr) { uint32_t doc_index = leaf->values->ids.indexOf(seq_id); if(doc_index == leaf->values->ids.getLength()) { @@ -1905,7 +1925,7 @@ Option Index::remove(const uint32_t seq_id, const nlohmann::json & doc remove_and_shift_offset_index(leaf->values->offset_index, doc_indices, 1); leaf->values->offsets.remove_index(start_offset, end_offset); - leaf->values->ids.remove_values(seq_id_values, 1); + leaf->values->ids.remove_value(seq_id); /*len = leaf->values->offset_index.getLength(); for(auto i=0; i Index::remove(const uint32_t seq_id, const nlohmann::json & doc LOG(INFO) << "----";*/ if(leaf->values->ids.getLength() == 0) { - art_values* values = (art_values*) art_delete(search_index.at(name_field.first), key, key_len); + art_values* values = (art_values*) art_delete(search_index.at(field_name), key, key_len); delete values; - values = nullptr; } } } - } - // remove facets if any - facet_index_v2.erase(seq_id); + // remove facets + if(facet_to_index.count(field_name) != 0 && facet_index_v2.count(seq_id) != 0) { + size_t facet_index = facet_to_index[field_name]; + std::vector>& facet_values = facet_index_v2[seq_id]; + facet_values[facet_index].clear(); + } - // remove sort index if any - for(auto & field_doc_value_map: sort_index) { - 
field_doc_value_map.second->erase(seq_id); + // remove sort field + if(sort_index.count(field_name) != 0) { + sort_index[field_name]->erase(seq_id); + } } return Option(seq_id); } +void Index::tokenize_doc_field(const nlohmann::json& document, const std::string& field_name, const field& search_field, + std::vector& tokens) { + if(search_field.type == field_types::STRING) { + StringUtils::split(document[field_name], tokens, " "); + } else if(search_field.type == field_types::STRING_ARRAY) { + const std::vector& values = document[field_name].get>(); + for(const std::string & value: values) { + StringUtils::split(value, tokens, " "); + } + } else if(search_field.type == field_types::INT32) { + const int KEY_LEN = 8; + unsigned char key[KEY_LEN]; + const int32_t& value = document[field_name].get(); + encode_int32(value, key); + tokens.emplace_back((char*)key, KEY_LEN); + } else if(search_field.type == field_types::INT32_ARRAY) { + const std::vector& values = document[field_name].get>(); + for(const int32_t value: values) { + const int KEY_LEN = 8; + unsigned char key[KEY_LEN]; + encode_int32(value, key); + tokens.emplace_back((char*)key, KEY_LEN); + } + } else if(search_field.type == field_types::INT64) { + const int KEY_LEN = 8; + unsigned char key[KEY_LEN]; + const int64_t& value = document[field_name].get(); + encode_int64(value, key); + tokens.emplace_back((char*)key, KEY_LEN); + } else if(search_field.type == field_types::INT64_ARRAY) { + const std::vector& values = document[field_name].get>(); + for(const int64_t value: values) { + const int KEY_LEN = 8; + unsigned char key[KEY_LEN]; + encode_int64(value, key); + tokens.emplace_back((char*)key, KEY_LEN); + } + } else if(search_field.type == field_types::FLOAT) { + const int KEY_LEN = 8; + unsigned char key[KEY_LEN]; + const int64_t& value = document[field_name].get(); + encode_float(value, key); + tokens.emplace_back((char*)key, KEY_LEN); + } else if(search_field.type == field_types::FLOAT_ARRAY) { + const std::vector& values = document[field_name].get>(); + for(const float value: values) { + const int KEY_LEN = 8; + unsigned char key[KEY_LEN]; + encode_float(value, key); + tokens.emplace_back((char*)key, KEY_LEN); + } + } else if(search_field.type == field_types::BOOL) { + const int KEY_LEN = 1; + unsigned char key[KEY_LEN]; + const bool& value = document[field_name].get(); + key[0] = value ? '1' : '0'; + tokens.emplace_back((char*)key, KEY_LEN); + } else if(search_field.type == field_types::BOOL_ARRAY) { + const std::vector& values = document[field_name].get>(); + for(const bool value: values) { + const int KEY_LEN = 1; + unsigned char key[KEY_LEN]; + key[0] = value ? 
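
tokenize_doc_field() above maps every field type onto plain string tokens: string fields are split on spaces, numeric fields are packed into fixed-width byte keys, and booleans become a single '1'/'0' byte, so that all of them can live in the same string-keyed trie. One common way to build such an order-preserving numeric key is sketched below; this only illustrates the idea and is not necessarily the exact encode_int32() implementation:

```
#include <cstdint>
#include <string>

// Turn a signed 32-bit integer into an 8-byte key whose memcmp/lexicographic
// order matches numeric order: flip the sign bit so negatives sort first, then
// lay the bytes out most-significant first (padded to the KEY_LEN of 8 used above).
std::string int32_to_ordered_key(int32_t value) {
    uint32_t biased = static_cast<uint32_t>(value) ^ 0x80000000u;
    unsigned char key[8] = {0, 0, 0, 0, 0, 0, 0, 0};
    key[4] = static_cast<unsigned char>((biased >> 24) & 0xFF);
    key[5] = static_cast<unsigned char>((biased >> 16) & 0xFF);
    key[6] = static_cast<unsigned char>((biased >> 8) & 0xFF);
    key[7] = static_cast<unsigned char>(biased & 0xFF);
    return std::string(reinterpret_cast<char*>(key), 8);  // usable as a trie token
}
```
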
'1' : '0'; + tokens.emplace_back((char*)key, KEY_LEN); + } + } +} + art_leaf* Index::get_token_leaf(const std::string & field_name, const unsigned char* token, uint32_t token_len) { const art_tree *t = search_index.at(field_name); return (art_leaf*) art_search(t, token, (int) token_len); diff --git a/src/main/typesense_server.cpp b/src/main/typesense_server.cpp index 4164110e..28e2cd79 100644 --- a/src/main/typesense_server.cpp +++ b/src/main/typesense_server.cpp @@ -21,6 +21,7 @@ void master_server_routes() { // document management - `/documents/:id` end-points must be placed last in the list server->post("/collections/:collection/documents", post_add_document); + server->patch("/collections/:collection/documents/:id", patch_update_document); server->get("/collections/:collection/documents/search", get_search); server->post("/collections/:collection/documents/import", post_import_documents, true, true); diff --git a/src/sorted_array.cpp b/src/sorted_array.cpp index 60f0a962..bf5f384c 100644 --- a/src/sorted_array.cpp +++ b/src/sorted_array.cpp @@ -1,5 +1,6 @@ #include "sorted_array.h" #include "array_utils.h" +#include "logger.h" void sorted_array::load(const uint32_t *sorted_array, const uint32_t array_length) { min = array_length != 0 ? sorted_array[0] : 0; @@ -18,28 +19,67 @@ void sorted_array::load(const uint32_t *sorted_array, const uint32_t array_lengt length_bytes = actual_size; } -bool sorted_array::append(uint32_t value) { - uint32_t size_required = sorted_append_size_required(value, length+1); +size_t sorted_array::append(uint32_t value) { + if(value < max) { + // we will have to re-encode the whole sequence again + uint32_t* arr = uncompress(length+1); - if(size_required+FOR_ELE_SIZE > size_bytes) { - // grow the array first - size_t new_size = (size_t) (size_required * FOR_GROWTH_FACTOR); - uint8_t *new_location = (uint8_t *) realloc(in, new_size); - if(new_location == NULL) { - abort(); + // find the index of the element which is >= to `value` + uint32_t found_val; + uint32_t gte_index = for_lower_bound_search(in, length, value, &found_val); + + for(size_t j=length; j>gte_index; j--) { + arr[j] = arr[j-1]; } - in = new_location; - size_bytes = (uint32_t) new_size; + + arr[gte_index] = value; + + load(arr, length+1); + delete [] arr; + + return gte_index; + } else { + uint32_t size_required = sorted_append_size_required(value, length+1); + size_t min_expected_size = size_required + FOR_ELE_SIZE; + + if(size_bytes < min_expected_size) { + // grow the array first + size_t new_size = min_expected_size * FOR_GROWTH_FACTOR; + uint8_t *new_location = (uint8_t *) realloc(in, new_size); + if(new_location == NULL) { + abort(); + } + in = new_location; + size_bytes = (uint32_t) new_size; + + //LOG(INFO) << "new_size: " << new_size; + } + + uint32_t new_length_bytes = for_append_sorted(in, length, value); + if(new_length_bytes == 0) return false; + + length_bytes = new_length_bytes; + length++; + + if(value < min) min = value; + if(value > max) max = value; + + return length-1; + } +} + +bool sorted_array::insert(size_t index, uint32_t value) { + if(index >= length) { + return false; } - uint32_t new_length_bytes = for_append_sorted(in, length, value); - if(new_length_bytes == 0) return false; + uint32_t *curr_array = uncompress(length+1); + memmove(&curr_array[index+1], &curr_array[index], sizeof(uint32_t)*(length-index)); + curr_array[index] = value; - length_bytes = new_length_bytes; - length++; + load(curr_array, length+1); - if(value < min) min = value; - if(value > max) max = 
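
The reworked sorted_array::append() now reports where the value landed and accepts out-of-order values by uncompressing, shifting, and re-encoding the FOR-compressed buffer. A plain std::vector analogue of that behaviour (a sketch only; the real class does the same work on the compressed representation):

```
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <vector>

// Keep the sequence sorted and return the index at which `value` was placed,
// mirroring what the new append() promises to its callers.
size_t sorted_append(std::vector<uint32_t>& arr, uint32_t value) {
    auto it = std::lower_bound(arr.begin(), arr.end(), value);  // first element >= value
    size_t index = static_cast<size_t>(it - arr.begin());
    arr.insert(it, value);                                      // shift the tail right by one
    return index;
}

// Appending 5..10 returns indices 0..5, and appending 1 afterwards returns 0,
// which is the behaviour SortedArrayTest.AppendOutOfOrder checks further below.
```
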
value; + delete [] curr_array; return true; } @@ -61,7 +101,11 @@ uint32_t sorted_array::indexOf(uint32_t value) { uint32_t actual; uint32_t index = for_lower_bound_search(in, length, value, &actual); - if(actual == value) return index; + + if(actual == value) { + return index; + } + return length; } @@ -150,20 +194,40 @@ void sorted_array::indexOf(const uint32_t *values, const size_t values_len, uint binary_search_indices(values, head, tail, low_index, high_index, base, bits, indices); } -void sorted_array::remove_values(uint32_t *sorted_values, uint32_t values_length) { +void sorted_array::remove_value(uint32_t value) { + // A lower bound search returns the first element in the sequence that is >= `value` + // So, `found_val` will be either equal or greater than `value` + uint32_t found_val; + uint32_t found_index = for_lower_bound_search(in, length, value, &found_val); + + if(found_val != value) { + return ; + } + + uint32_t *curr_array = uncompress(); + + if(found_index + 1 < length) { + memmove(&curr_array[found_index], &curr_array[found_index+1], sizeof(uint32_t) * (length - found_index - 1)); + } + + size_t new_length = (length == 0) ? 0 : (length - 1); + load(curr_array, new_length); + + delete [] curr_array; +} + +void sorted_array::remove_values(uint32_t *sorted_values, uint32_t sorted_values_length) { uint32_t *curr_array = uncompress(); uint32_t *new_array = new uint32_t[length]; uint32_t new_index = 0; - uint32_t curr_index = 0; + uint32_t sorted_values_index = 0; + uint32_t curr_index = 0; while(curr_index < length) { - if(sorted_values_index < values_length && curr_array[curr_index] >= sorted_values[sorted_values_index]) { - // skip copying - if(curr_array[curr_index] == sorted_values[sorted_values_index]) { - curr_index++; - } + if(sorted_values_index < sorted_values_length && sorted_values[sorted_values_index] == curr_array[curr_index]) { + curr_index++; sorted_values_index++; } else { new_array[new_index++] = curr_array[curr_index++]; diff --git a/test/array_test.cpp b/test/array_test.cpp index 1efa0bd2..d59a7825 100644 --- a/test/array_test.cpp +++ b/test/array_test.cpp @@ -45,6 +45,31 @@ TEST(ArrayTest, Append) { } } +TEST(ArrayTest, InsertValues) { + std::vector eles = {10, 1, 4, 5, 7}; + array arr; + + for(size_t i=0; i < eles.size(); i++) { + arr.append(eles[i]); + } + + uint32_t insert_arr[2] = {2, 3}; + arr.insert(2, insert_arr, 2); + eles = {10, 1, 2, 3, 4, 5, 7}; + + for(size_t i=0; i < eles.size(); i++) { + ASSERT_EQ(eles[i], arr.at(i)); + } + + uint32_t insert_arr2[2] = {20, 25}; + arr.insert(6, insert_arr2, 2); + + eles = {10, 1, 2, 3, 4, 5, 20, 25, 7}; + for(size_t i=0; i < eles.size(); i++) { + ASSERT_EQ(eles[i], arr.at(i)); + } +} + TEST(ArrayTest, Uncompress) { const size_t SIZE = 10*1000; diff --git a/test/collection_faceting_test.cpp b/test/collection_faceting_test.cpp index 7748cbf4..f1a9dd88 100644 --- a/test/collection_faceting_test.cpp +++ b/test/collection_faceting_test.cpp @@ -71,6 +71,11 @@ TEST_F(CollectionFacetingTest, FacetFieldStringFiltering) { ASSERT_EQ(0, results["hits"].size()); ASSERT_EQ(0, results["found"].get()); + // multiple tokens but with a typo on one of them + results = coll_str->search("*", query_fields, "starring:= ssamuel l. Jackson", facets, sort_fields, 0, 10, 1, FREQUENCY, false).get(); + ASSERT_EQ(0, results["hits"].size()); + ASSERT_EQ(0, results["found"].get()); + // same should succeed when verbatim filter is made results = coll_str->search("*", query_fields, "starring:= samuel l. 
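
remove_values() above was rewritten as a lock-step walk over the uncompressed ids and the sorted list of ids to delete, copying everything that is not an exact match. The same loop on plain vectors, for clarity (like the original, it assumes every value in the delete list actually occurs in the array):

```
#include <cstddef>
#include <cstdint>
#include <vector>

// Copy every id that is not present in `sorted_values` (both inputs sorted ascending).
std::vector<uint32_t> remove_sorted_values(const std::vector<uint32_t>& ids,
                                           const std::vector<uint32_t>& sorted_values) {
    std::vector<uint32_t> out;
    out.reserve(ids.size());

    size_t i = 0, j = 0;
    while(i < ids.size()) {
        if(j < sorted_values.size() && sorted_values[j] == ids[i]) {
            i++;  // matched: skip this id
            j++;
        } else {
            out.push_back(ids[i++]);
        }
    }
    return out;
}
```
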
Jackson", facets, sort_fields, 0, 10, 1, FREQUENCY, false).get(); ASSERT_EQ(2, results["hits"].size()); @@ -85,6 +90,11 @@ TEST_F(CollectionFacetingTest, FacetFieldStringFiltering) { ASSERT_EQ(2, results["hits"].size()); ASSERT_EQ(2, results["found"].get()); + // contains when only 1 token matches + results = coll_str->search("*", query_fields, "starring: samuel johnson", facets, sort_fields, 0, 10, 1, FREQUENCY, false).get(); + ASSERT_EQ(2, results["hits"].size()); + ASSERT_EQ(2, results["found"].get()); + collectionManager.drop_collection("coll_str"); } @@ -131,6 +141,9 @@ TEST_F(CollectionFacetingTest, FacetFieldStringArrayFiltering) { results = coll_array_fields->search("Jeremy", query_fields, "tags:= FINE", facets, sort_fields, 0, 10, 1, FREQUENCY, false).get(); ASSERT_EQ(0, results["hits"].size()); + results = coll_array_fields->search("Jeremy", query_fields, "tags:= FFINE PLATINUM", facets, sort_fields, 0, 10, 1, FREQUENCY, false).get(); + ASSERT_EQ(0, results["hits"].size()); + // partial token filter should be made without "=" operator results = coll_array_fields->search("Jeremy", query_fields, "tags: PLATINUM", facets, sort_fields, 0, 10, 1, FREQUENCY, false).get(); ASSERT_EQ(1, results["hits"].size()); diff --git a/test/collection_grouping_test.cpp b/test/collection_grouping_test.cpp index 9b06dd72..fb8a12ed 100644 --- a/test/collection_grouping_test.cpp +++ b/test/collection_grouping_test.cpp @@ -64,7 +64,7 @@ TEST_F(CollectionGroupingTest, GroupingBasics) { auto res = coll_group->search("*", {}, "", {"brand"}, {}, 0, 50, 1, FREQUENCY, false, Index::DROP_TOKENS_THRESHOLD, spp::sparse_hash_set(), - spp::sparse_hash_set(), 10, "", 30, + spp::sparse_hash_set(), 10, "", 30, 5, "", 10, {}, {}, {"size"}, 2).get(); @@ -107,7 +107,7 @@ TEST_F(CollectionGroupingTest, GroupingBasics) { res = coll_group->search("*", {}, "", {"brand"}, sort_size, 0, 50, 1, FREQUENCY, false, Index::DROP_TOKENS_THRESHOLD, spp::sparse_hash_set(), - spp::sparse_hash_set(), 10, "brand: omeg", 30, + spp::sparse_hash_set(), 10, "brand: omeg", 30, 5, "", 10, {}, {}, {"rating"}, 2).get(); @@ -147,7 +147,7 @@ TEST_F(CollectionGroupingTest, GroupingCompoundKey) { auto res = coll_group->search("*", {}, "", {"brand"}, {}, 0, 50, 1, FREQUENCY, false, Index::DROP_TOKENS_THRESHOLD, spp::sparse_hash_set(), - spp::sparse_hash_set(), 10, "", 30, + spp::sparse_hash_set(), 10, "", 30, 5, "", 10, {}, {}, {"size", "brand"}, 2).get(); @@ -194,7 +194,7 @@ TEST_F(CollectionGroupingTest, GroupingCompoundKey) { res = coll_group->search("*", {}, "", {"brand"}, {}, 0, 2, 2, FREQUENCY, false, Index::DROP_TOKENS_THRESHOLD, spp::sparse_hash_set(), - spp::sparse_hash_set(), 10, "", 30, + spp::sparse_hash_set(), 10, "", 30, 5, "", 10, {}, {}, {"size", "brand"}, 2).get(); @@ -230,7 +230,7 @@ TEST_F(CollectionGroupingTest, GroupingCompoundKey) { auto res_op = coll_group->search("*", {}, "", {"brand"}, {}, 0, 50, 1, FREQUENCY, false, Index::DROP_TOKENS_THRESHOLD, spp::sparse_hash_set(), - spp::sparse_hash_set(), 10, "brand: omeg", 30, + spp::sparse_hash_set(), 10, "brand: omeg", 30, 5, "", 10, {}, {}, {"rating"}, 100); @@ -240,7 +240,7 @@ TEST_F(CollectionGroupingTest, GroupingCompoundKey) { res_op = coll_group->search("*", {}, "", {"brand"}, {}, 0, 50, 1, FREQUENCY, false, Index::DROP_TOKENS_THRESHOLD, spp::sparse_hash_set(), - spp::sparse_hash_set(), 10, "brand: omeg", 30, + spp::sparse_hash_set(), 10, "brand: omeg", 30, 5, "", 10, {}, {}, {"rating"}, 0); @@ -252,7 +252,7 @@ TEST_F(CollectionGroupingTest, GroupingWithGropLimitOfOne) { auto 
res = coll_group->search("*", {}, "", {"brand"}, {}, 0, 50, 1, FREQUENCY, false, Index::DROP_TOKENS_THRESHOLD, spp::sparse_hash_set(), - spp::sparse_hash_set(), 10, "", 30, + spp::sparse_hash_set(), 10, "", 30, 5, "", 10, {}, {}, {"brand"}, 1).get(); @@ -322,7 +322,7 @@ TEST_F(CollectionGroupingTest, GroupingWithArrayFieldAndOverride) { auto res = coll_group->search("shirt", {"title"}, "", {"brand"}, {}, 0, 50, 1, FREQUENCY, false, Index::DROP_TOKENS_THRESHOLD, spp::sparse_hash_set(), - spp::sparse_hash_set(), 10, "", 30, + spp::sparse_hash_set(), 10, "", 30, 5, "", 10, {}, {}, {"colors"}, 2).get(); diff --git a/test/collection_manager_test.cpp b/test/collection_manager_test.cpp index 63fbea4e..da84b0d5 100644 --- a/test/collection_manager_test.cpp +++ b/test/collection_manager_test.cpp @@ -213,7 +213,13 @@ TEST_F(CollectionManagerTest, RestoreRecordsOnRestart) { // create a new collection manager to ensure that it restores the records from the disk backed store CollectionManager & collectionManager2 = CollectionManager::get_instance(); collectionManager2.init(store, 1.0, "auth_key"); - collectionManager2.load(); + auto load_op = collectionManager2.load(); + + if(!load_op.ok()) { + LOG(ERROR) << load_op.error(); + } + + ASSERT_TRUE(load_op.ok()); collection1 = collectionManager2.get_collection("collection1"); ASSERT_NE(nullptr, collection1); diff --git a/test/collection_override_test.cpp b/test/collection_override_test.cpp index 7da68249..312c297c 100644 --- a/test/collection_override_test.cpp +++ b/test/collection_override_test.cpp @@ -271,7 +271,7 @@ TEST_F(CollectionOverrideTest, IncludeExcludeHitsQuery) { auto results = coll_mul_fields->search("the", {"title"}, "", {"starring"}, {}, 0, 50, 1, FREQUENCY, false, Index::DROP_TOKENS_THRESHOLD, spp::sparse_hash_set(), - spp::sparse_hash_set(), 10, "starring: will", 30, + spp::sparse_hash_set(), 10, "starring: will", 30, 5, "", 10, pinned_hits, {}).get(); @@ -289,7 +289,7 @@ TEST_F(CollectionOverrideTest, IncludeExcludeHitsQuery) { results = coll_mul_fields->search("the", {"title"}, "", {"starring"}, {}, 0, 50, 1, FREQUENCY, false, Index::DROP_TOKENS_THRESHOLD, spp::sparse_hash_set(), - spp::sparse_hash_set(), 10, "starring: will", 30, + spp::sparse_hash_set(), 10, "starring: will", 30, 5, "", 10, pinned_hits, hidden_hits).get(); @@ -305,7 +305,7 @@ TEST_F(CollectionOverrideTest, IncludeExcludeHitsQuery) { results = coll_mul_fields->search("the", {"title"}, "", {"starring"}, {}, 0, 2, 2, FREQUENCY, false, Index::DROP_TOKENS_THRESHOLD, spp::sparse_hash_set(), - spp::sparse_hash_set(), 10, "starring: will", 30, + spp::sparse_hash_set(), 10, "starring: will", 30, 5, "", 10, pinned_hits, hidden_hits).get(); @@ -341,7 +341,7 @@ TEST_F(CollectionOverrideTest, IncludeExcludeHitsQuery) { results = coll_mul_fields->search("the", {"title"}, "", {"starring"}, {}, 0, 50, 1, FREQUENCY, false, Index::DROP_TOKENS_THRESHOLD, spp::sparse_hash_set(), - spp::sparse_hash_set(), 10, "starring: will", 30, + spp::sparse_hash_set(), 10, "starring: will", 30, 5, "", 10, {}, {hidden_hits}).get(); @@ -362,7 +362,7 @@ TEST_F(CollectionOverrideTest, PinnedHitsGrouping) { auto results = coll_mul_fields->search("the", {"title"}, "", {"starring"}, {}, 0, 50, 1, FREQUENCY, false, Index::DROP_TOKENS_THRESHOLD, spp::sparse_hash_set(), - spp::sparse_hash_set(), 10, "starring: will", 30, + spp::sparse_hash_set(), 10, "starring: will", 30, 5, "", 10, pinned_hits, {}).get(); @@ -383,7 +383,7 @@ TEST_F(CollectionOverrideTest, PinnedHitsGrouping) { results = 
coll_mul_fields->search("the", {"title"}, "", {"starring"}, {}, 0, 50, 1, FREQUENCY, false, Index::DROP_TOKENS_THRESHOLD, spp::sparse_hash_set(), - spp::sparse_hash_set(), 10, "starring: will", 30, + spp::sparse_hash_set(), 10, "starring: will", 30, 5, "", 10, pinned_hits, {}, {"cast"}, 2).get(); diff --git a/test/collection_test.cpp b/test/collection_test.cpp index dea95760..efdc33b8 100644 --- a/test/collection_test.cpp +++ b/test/collection_test.cpp @@ -14,6 +14,9 @@ protected: CollectionManager & collectionManager = CollectionManager::get_instance(); std::vector sort_fields; + // used for generating random text + std::vector words; + void setupCollection() { std::string state_dir_path = "/tmp/typesense_test/collection"; LOG(INFO) << "Truncating and creating: " << state_dir_path; @@ -48,6 +51,12 @@ protected: } infile.close(); + + std::ifstream words_file(std::string(ROOT_DIR)+"test/resources/common100_english.txt"); + std::stringstream strstream; + strstream << words_file.rdbuf(); + words_file.close(); + StringUtils::split(strstream.str(), words, "\n"); } virtual void SetUp() { @@ -59,6 +68,18 @@ protected: collectionManager.dispose(); delete store; } + + std::string get_text(size_t num_words) { + time_t t; + srand((unsigned) time(&t)); + std::vector strs; + + for(size_t i = 0 ; i < num_words ; i++ ) { + int word_index = rand() % 100; + strs.push_back(words[word_index]); + } + return StringUtils::join(strs, " "); + } }; TEST_F(CollectionTest, VerifyCountOfDocuments) { @@ -558,14 +579,14 @@ TEST_F(CollectionTest, TypoTokensThreshold) { // Query expansion should happen only based on the `typo_tokens_threshold` value auto results = collection->search("launch", {"title"}, "", {}, sort_fields, 2, 10, 1, token_ordering::FREQUENCY, true, 10, spp::sparse_hash_set(), - spp::sparse_hash_set(), 10, "", 5, "", 0).get(); + spp::sparse_hash_set(), 10, "", 5, 5, "", 0).get(); ASSERT_EQ(5, results["hits"].size()); ASSERT_EQ(5, results["found"].get()); results = collection->search("launch", {"title"}, "", {}, sort_fields, 2, 10, 1, token_ordering::FREQUENCY, true, 10, spp::sparse_hash_set(), - spp::sparse_hash_set(), 10, "", 5, "", 10).get(); + spp::sparse_hash_set(), 10, "", 5, 5, "", 10).get(); ASSERT_EQ(7, results["hits"].size()); ASSERT_EQ(7, results["found"].get()); @@ -1296,6 +1317,243 @@ std::vector import_res_to_json(const std::vector& i return out; } +TEST_F(CollectionTest, ImportDocumentsUpsert) { + Collection *coll_mul_fields; + + std::ifstream infile(std::string(ROOT_DIR)+"test/multi_field_documents.jsonl"); + std::stringstream strstream; + strstream << infile.rdbuf(); + infile.close(); + + std::vector import_records; + StringUtils::split(strstream.str(), import_records, "\n"); + + std::vector fields = { + field("title", field_types::STRING, false), + field("starring", field_types::STRING, false), + field("cast", field_types::STRING_ARRAY, false), + field("points", field_types::INT32, false) + }; + + coll_mul_fields = collectionManager.get_collection("coll_mul_fields"); + if(coll_mul_fields == nullptr) { + coll_mul_fields = collectionManager.create_collection("coll_mul_fields", 1, fields, "points").get(); + } + + // try importing records + nlohmann::json document; + nlohmann::json import_response = coll_mul_fields->add_many(import_records, document); + ASSERT_TRUE(import_response["success"].get()); + ASSERT_EQ(18, import_response["num_imported"].get()); + + // update + upsert records + std::vector more_records = {R"({"id": "0", "title": "The Fifth Harry"})", + R"({"id": "2", "cast": ["Chris 
Fisher", "Rand Alan"]})", + R"({"id": "18", "title": "Back Again Forest", "points": 45, "starring": "Ronald Wells", "cast": ["Dant Saren"]})", + R"({"id": "6", "points": 77})"}; + + import_response = coll_mul_fields->add_many(more_records, document, UPSERT); + + ASSERT_TRUE(import_response["success"].get()); + ASSERT_EQ(4, import_response["num_imported"].get()); + + std::vector import_results = import_res_to_json(more_records); + ASSERT_EQ(4, import_results.size()); + + for(size_t i=0; i<4; i++) { + ASSERT_TRUE(import_results[i]["success"].get()); + ASSERT_EQ(1, import_results[i].size()); + } + + auto results = coll_mul_fields->search("*", query_fields, "", {}, sort_fields, 0, 30, 1, FREQUENCY, false).get(); + ASSERT_EQ(19, results["hits"].size()); + + ASSERT_EQ(19, coll_mul_fields->get_num_documents()); + + results = coll_mul_fields->search("back again forest", query_fields, "", {}, sort_fields, 0, 30, 1, FREQUENCY, false).get(); + ASSERT_EQ(1, results["hits"].size()); + + ASSERT_STREQ("Back Again Forest", coll_mul_fields->get("18").get()["title"].get().c_str()); + + results = coll_mul_fields->search("fifth", query_fields, "", {}, sort_fields, 0, 10, 1, FREQUENCY, false).get(); + ASSERT_EQ(2, results["hits"].size()); + + ASSERT_STREQ("The Fifth Harry", results["hits"][0]["highlights"][0]["snippet"].get().c_str()); + ASSERT_STREQ("The Woman in the Fifth from Kristin", results["hits"][1]["highlights"][0]["snippet"].get().c_str()); + + results = coll_mul_fields->search("burgundy", query_fields, "", {}, sort_fields, 0, 10, 1, FREQUENCY, false).get(); + ASSERT_EQ(0, results["hits"].size()); + + results = coll_mul_fields->search("harry", query_fields, "", {}, sort_fields, 0, 10, 1, FREQUENCY, false).get(); + ASSERT_EQ(1, results["hits"].size()); + + results = coll_mul_fields->search("captain america", query_fields, "", {}, sort_fields, 0, 10, 1, FREQUENCY, false).get(); + ASSERT_EQ(1, results["hits"].size()); + ASSERT_EQ(77, results["hits"][0]["document"]["points"].get()); + + // upserting with some bad docs + more_records = {R"({"id": "1", "title": "Wake up, Harry"})", + R"({"id": "90", "cast": ["Kim Werrel", "Random Wake"]})", // missing fields + R"({"id": "5", "points": 60})", + R"({"id": "24", "starring": "John", "cast": ["John Kim"], "points": 11})"}; // missing fields + + import_response = coll_mul_fields->add_many(more_records, document, UPSERT); + + ASSERT_FALSE(import_response["success"].get()); + ASSERT_EQ(2, import_response["num_imported"].get()); + + import_results = import_res_to_json(more_records); + ASSERT_FALSE(import_results[1]["success"].get()); + ASSERT_FALSE(import_results[3]["success"].get()); + ASSERT_STREQ("Field `points` has been declared as a default sorting field, but is not found in the document.", import_results[1]["error"].get().c_str()); + ASSERT_STREQ("Field `title` has been declared in the schema, but is not found in the document.", import_results[3]["error"].get().c_str()); + + // try to duplicate records without upsert option + + more_records = {R"({"id": "1", "title": "Wake up, Harry"})", + R"({"id": "5", "points": 60})"}; + + import_response = coll_mul_fields->add_many(more_records, document, CREATE); + ASSERT_FALSE(import_response["success"].get()); + ASSERT_EQ(0, import_response["num_imported"].get()); + + import_results = import_res_to_json(more_records); + ASSERT_FALSE(import_results[0]["success"].get()); + ASSERT_FALSE(import_results[1]["success"].get()); + ASSERT_STREQ("A document with id 1 already exists.", import_results[0]["error"].get().c_str()); + 
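
The assertions above and below exercise the three indexing modes that add_many() now supports through its index_operation_t argument. A condensed usage sketch, assuming a Collection* obtained from the CollectionManager as in the test fixture:

```
#include <string>
#include <vector>
#include "collection.h"  // Collection, index_operation_t (CREATE / UPSERT / UPDATE)

// The same add_many() call switches behaviour purely on the operation argument:
// CREATE (the default) rejects ids that already exist, UPSERT creates missing
// documents and re-indexes existing ones, and UPDATE only patches existing
// documents, reporting unknown ids as errors.
nlohmann::json import_with_mode(Collection* coll,
                                std::vector<std::string>& records,
                                const index_operation_t operation) {
    nlohmann::json document;
    return coll->add_many(records, document, operation);
}
```
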
ASSERT_STREQ("A document with id 5 already exists.", import_results[1]["error"].get().c_str()); + + // update document with verbatim fields, except for points + more_records = {R"({"id": "3", "cast":["Matt Damon","Ben Affleck","Minnie Driver"], + "points":70,"starring":"Robin Williams","starring_facet":"Robin Williams", + "title":"Good Will Hunting"})"}; + + import_response = coll_mul_fields->add_many(more_records, document, UPDATE); + ASSERT_TRUE(import_response["success"].get()); + ASSERT_EQ(1, import_response["num_imported"].get()); + + results = coll_mul_fields->search("Good Will Hunting", query_fields, "", {}, sort_fields, 0, 10, 1, FREQUENCY, false).get(); + ASSERT_EQ(70, results["hits"][0]["document"]["points"].get()); + + // updating a document that does not exist should fail, others should succeed + more_records = {R"({"id": "20", "points": 51})", + R"({"id": "1", "points": 64})"}; + + import_response = coll_mul_fields->add_many(more_records, document, UPDATE); + ASSERT_FALSE(import_response["success"].get()); + ASSERT_EQ(1, import_response["num_imported"].get()); + + import_results = import_res_to_json(more_records); + ASSERT_FALSE(import_results[0]["success"].get()); + ASSERT_TRUE(import_results[1]["success"].get()); + ASSERT_STREQ("Could not find a document with id: 20", import_results[0]["error"].get().c_str()); + ASSERT_EQ(404, import_results[0]["code"].get()); + + results = coll_mul_fields->search("wake up harry", query_fields, "", {}, sort_fields, 0, 10, 1, FREQUENCY, false).get(); + ASSERT_EQ(64, results["hits"][0]["document"]["points"].get()); + + // trying to create documents with existing IDs should fail + more_records = {R"({"id": "2", "points": 51})", + R"({"id": "1", "points": 64})"}; + + import_response = coll_mul_fields->add_many(more_records, document, CREATE); + ASSERT_FALSE(import_response["success"].get()); + ASSERT_EQ(0, import_response["num_imported"].get()); + + import_results = import_res_to_json(more_records); + ASSERT_FALSE(import_results[0]["success"].get()); + ASSERT_FALSE(import_results[1]["success"].get()); + ASSERT_STREQ("A document with id 2 already exists.", import_results[0]["error"].get().c_str()); + ASSERT_STREQ("A document with id 1 already exists.", import_results[1]["error"].get().c_str()); + + ASSERT_EQ(409, import_results[0]["code"].get()); + ASSERT_EQ(409, import_results[1]["code"].get()); +} + + +TEST_F(CollectionTest, ImportDocumentsUpsertOptional) { + Collection *coll1; + std::vector fields = { + field("title", field_types::STRING_ARRAY, false, true), + field("points", field_types::INT32, false) + }; + + coll1 = collectionManager.get_collection("coll1"); + if(coll1 == nullptr) { + coll1 = collectionManager.create_collection("coll1", 4, fields, "points").get(); + } + + std::vector records; + + size_t NUM_RECORDS = 1000; + + for(size_t i=0; iadd_many(records, document, CREATE); + ASSERT_TRUE(import_response["success"].get()); + ASSERT_EQ(1000, import_response["num_imported"].get()); + + // upsert documents with title + + records.clear(); + + for(size_t i=0; iadd_many(records, document, UPSERT); + auto time_micros = std::chrono::duration_cast( + std::chrono::high_resolution_clock::now() - begin).count(); + + //LOG(INFO) << "Time taken for first upsert: " << time_micros; + + ASSERT_TRUE(import_response["success"].get()); + ASSERT_EQ(1000, import_response["num_imported"].get()); + + // run upsert again with title override + + records.clear(); + + for(size_t i=0; iadd_many(records, document, UPSERT); + time_micros = 
std::chrono::duration_cast( + std::chrono::high_resolution_clock::now() - begin).count(); + + //LOG(INFO) << "Time taken for second upsert: " << time_micros; + + ASSERT_TRUE(import_response["success"].get()); + ASSERT_EQ(1000, import_response["num_imported"].get()); +} + TEST_F(CollectionTest, ImportDocuments) { Collection *coll_mul_fields; @@ -1320,8 +1578,8 @@ TEST_F(CollectionTest, ImportDocuments) { } // try importing records - - nlohmann::json import_response = coll_mul_fields->add_many(import_records); + nlohmann::json document; + nlohmann::json import_response = coll_mul_fields->add_many(import_records, document); ASSERT_TRUE(import_response["success"].get()); ASSERT_EQ(18, import_response["num_imported"].get()); @@ -1346,7 +1604,7 @@ TEST_F(CollectionTest, ImportDocuments) { // verify that empty import is handled gracefully std::vector empty_records; - import_response = coll_mul_fields->add_many(empty_records); + import_response = coll_mul_fields->add_many(empty_records, document); ASSERT_TRUE(import_response["success"].get()); ASSERT_EQ(0, import_response["num_imported"].get()); @@ -1360,7 +1618,7 @@ TEST_F(CollectionTest, ImportDocuments) { "{\"title\": \"Test4\", \"points\": 55, " "\"cast\": [\"Tom Skerritt\"] }"}; - import_response = coll_mul_fields->add_many(more_records); + import_response = coll_mul_fields->add_many(more_records, document); ASSERT_FALSE(import_response["success"].get()); ASSERT_EQ(2, import_response["num_imported"].get()); @@ -1385,7 +1643,7 @@ TEST_F(CollectionTest, ImportDocuments) { "{\"id\": \"id1\", \"title\": \"Test1\", \"starring\": \"Rand Fish\", \"points\": 12, " "\"cast\": [\"Tom Skerritt\"] }"}; - import_response = coll_mul_fields->add_many(more_records); + import_response = coll_mul_fields->add_many(more_records, document); ASSERT_FALSE(import_response["success"].get()); ASSERT_EQ(1, import_response["num_imported"].get()); @@ -1403,7 +1661,7 @@ TEST_F(CollectionTest, ImportDocuments) { // valid JSON but not a document more_records = {"[]"}; - import_response = coll_mul_fields->add_many(more_records); + import_response = coll_mul_fields->add_many(more_records, document); ASSERT_FALSE(import_response["success"].get()); ASSERT_EQ(0, import_response["num_imported"].get()); @@ -1417,7 +1675,7 @@ TEST_F(CollectionTest, ImportDocuments) { // invalid JSON more_records = {"{"}; - import_response = coll_mul_fields->add_many(more_records); + import_response = coll_mul_fields->add_many(more_records, document); ASSERT_FALSE(import_response["success"].get()); ASSERT_EQ(0, import_response["num_imported"].get()); @@ -1756,7 +2014,7 @@ TEST_F(CollectionTest, IndexingWithBadData) { sample_collection = collectionManager.create_collection("sample_collection", 4, fields, "age").get(); } - const Option & search_fields_missing_op1 = sample_collection->add("{\"namezz\": \"foo\", \"age\": 29, \"average\": 78}"); + const Option & search_fields_missing_op1 = sample_collection->add("{\"name\": \"foo\", \"age\": 29, \"average\": 78}"); ASSERT_FALSE(search_fields_missing_op1.ok()); ASSERT_STREQ("Field `tags` has been declared in the schema, but is not found in the document.", search_fields_missing_op1.error().c_str()); @@ -2210,9 +2468,169 @@ TEST_F(CollectionTest, SearchHighlightShouldFollowThreshold) { ASSERT_STREQ("fox jumped over the lazy dog and ran straight", res["hits"][0]["highlights"][0]["snippet"].get().c_str()); + // specify the number of surrounding tokens to return + size_t highlight_affix_num_tokens = 2; + + res = coll1->search("lazy", {"title"}, "", {}, 
sort_fields, 0, 10, 1, + token_ordering::FREQUENCY, true, 10, spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 5, highlight_affix_num_tokens).get(); + ASSERT_STREQ("over the lazy dog and", + res["hits"][0]["highlights"][0]["snippet"].get().c_str()); + + highlight_affix_num_tokens = 0; + res = coll1->search("lazy", {"title"}, "", {}, sort_fields, 0, 10, 1, + token_ordering::FREQUENCY, true, 10, spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 5, highlight_affix_num_tokens).get(); + ASSERT_STREQ("lazy", + res["hits"][0]["highlights"][0]["snippet"].get().c_str()); + collectionManager.drop_collection("coll1"); } +TEST_F(CollectionTest, UpdateDocument) { + Collection *coll1; + + std::vector fields = {field("title", field_types::STRING, true), + field("tags", field_types::STRING_ARRAY, true), + field("points", field_types::INT32, false)}; + + std::vector sort_fields = {sort_by("points", "DESC")}; + + coll1 = collectionManager.get_collection("coll1"); + if (coll1 == nullptr) { + coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get(); + } + + nlohmann::json doc; + doc["id"] = "100"; + doc["title"] = "The quick brown fox jumped over the lazy dog and ran straight to the forest to sleep."; + doc["tags"] = {"NEWS", "LAZY"}; + doc["points"] = 25; + + auto add_op = coll1->add(doc.dump()); + ASSERT_TRUE(add_op.ok()); + + auto res = coll1->search("lazy", {"title"}, "", {}, sort_fields, 0, 10, 1, + token_ordering::FREQUENCY, true, 10, spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 5, 5, "title").get(); + + ASSERT_EQ(1, res["hits"].size()); + ASSERT_STREQ("The quick brown fox jumped over the lazy dog and ran straight to the forest to sleep.", + res["hits"][0]["document"]["title"].get().c_str()); + + // try changing the title and searching for an older token + doc["title"] = "The quick brown fox."; + add_op = coll1->add(doc.dump(), UPSERT); + ASSERT_TRUE(add_op.ok()); + + ASSERT_EQ(1, coll1->get_num_documents()); + + res = coll1->search("lazy", {"title"}, "", {}, sort_fields, 0, 10, 1, + token_ordering::FREQUENCY, true, 10, spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 5, 5, "title").get(); + + ASSERT_EQ(0, res["hits"].size()); + + res = coll1->search("quick", {"title"}, "", {}, sort_fields, 0, 10, 1, + token_ordering::FREQUENCY, true, 10, spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 5, 5, "title").get(); + + ASSERT_EQ(1, res["hits"].size()); + ASSERT_STREQ("The quick brown fox.", res["hits"][0]["document"]["title"].get().c_str()); + + // try to update document tags without `id` + nlohmann::json doc2; + doc2["tags"] = {"SENTENCE"}; + add_op = coll1->add(doc2.dump(), UPDATE); + ASSERT_FALSE(add_op.ok()); + ASSERT_STREQ("For update, the `id` key must be provided.", add_op.error().c_str()); + + // now change tags with id + doc2["id"] = "100"; + add_op = coll1->add(doc2.dump(), UPDATE); + ASSERT_TRUE(add_op.ok()); + + // check for old tag + res = coll1->search("NEWS", {"tags"}, "", {}, sort_fields, 0, 10, 1, + token_ordering::FREQUENCY, true, 10, spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 5, 5, "title").get(); + + ASSERT_EQ(0, res["hits"].size()); + + // now check for new tag and also try faceting on that field + res = coll1->search("SENTENCE", {"tags"}, "", {"tags"}, sort_fields, 0, 10, 1, + token_ordering::FREQUENCY, true, 10, spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 5, 5, "title").get(); + + ASSERT_EQ(1, res["hits"].size()); + ASSERT_STREQ("SENTENCE", 
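
The new highlight_affix_num_tokens parameter, exercised in the SearchHighlightShouldFollowThreshold assertions above, controls how many tokens on each side of a matched token are kept in the snippet. A rough, standalone illustration of that windowing (the real highlighter also merges the windows of multiple matched tokens, and the <mark> tag here is only the usual default):

```
#include <algorithm>
#include <cstddef>
#include <sstream>
#include <string>
#include <vector>

// Build a snippet of `affix_num_tokens` tokens on either side of the match.
// Assumes match_index < tokens.size().
std::string make_snippet(const std::vector<std::string>& tokens,
                         size_t match_index, size_t affix_num_tokens) {
    size_t start = (match_index > affix_num_tokens) ? match_index - affix_num_tokens : 0;
    size_t end = std::min(tokens.size() - 1, match_index + affix_num_tokens);

    std::ostringstream snippet;
    for(size_t i = start; i <= end; i++) {
        if(i != start) {
            snippet << " ";
        }
        snippet << (i == match_index ? "<mark>" + tokens[i] + "</mark>" : tokens[i]);
    }
    return snippet.str();
}

// For the title used above with the match on "lazy" and affix_num_tokens = 2,
// this produces "over the <mark>lazy</mark> dog and"; with 0 it is just the
// marked-up token, in line with the two affix assertions in that test.
```
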
res["facet_counts"][0]["counts"][0]["value"].get().c_str()); + + // try changing points + nlohmann::json doc3; + doc3["points"] = 99; + doc3["id"] = "100"; + + add_op = coll1->add(doc3.dump(), UPDATE); + ASSERT_TRUE(add_op.ok()); + + res = coll1->search("*", {"tags"}, "points: > 90", {}, sort_fields, 0, 10, 1, + token_ordering::FREQUENCY, true, 10, spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 5, 5, "title").get(); + + ASSERT_EQ(1, res["hits"].size()); + ASSERT_EQ(99, res["hits"][0]["document"]["points"].get()); + + // id can be passed by param + nlohmann::json doc4; + doc4["points"] = 105; + + add_op = coll1->add(doc4.dump(), UPSERT, "100"); + ASSERT_TRUE(add_op.ok()); + + res = coll1->search("*", {"tags"}, "points: > 101", {}, sort_fields, 0, 10, 1, + token_ordering::FREQUENCY, true, 10, spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 5, 5, "title").get(); + + ASSERT_EQ(1, res["hits"].size()); + ASSERT_EQ(105, res["hits"][0]["document"]["points"].get()); + + // try to change a field with bad value and verify that old document is put back + doc4["points"] = "abc"; + add_op = coll1->add(doc4.dump(), UPSERT, "100"); + ASSERT_FALSE(add_op.ok()); + + res = coll1->search("*", {"tags"}, "points: > 101", {}, sort_fields, 0, 10, 1, + token_ordering::FREQUENCY, true, 10, spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 5, 5, "title").get(); + + ASSERT_EQ(1, res["hits"].size()); + ASSERT_EQ(105, res["hits"][0]["document"]["points"].get()); + + // when explicit path id does not match doc id, error should be returned + nlohmann::json doc5; + doc5["id"] = "800"; + doc5["title"] = "The Secret Seven"; + doc5["points"] = 250; + doc5["tags"] = {"BOOK", "ENID BLYTON"}; + + add_op = coll1->add(doc5.dump(), UPSERT, "799"); + ASSERT_FALSE(add_op.ok()); + ASSERT_EQ(400, add_op.code()); + ASSERT_STREQ("The `id` of the resource does not match the `id` in the JSON body.", add_op.error().c_str()); + + // passing an empty id should not succeed + nlohmann::json doc6; + doc6["id"] = ""; + doc6["title"] = "The Secret Seven"; + doc6["points"] = 250; + doc6["tags"] = {"BOOK", "ENID BLYTON"}; + + add_op = coll1->add(doc6.dump(), UPDATE); + ASSERT_FALSE(add_op.ok()); + ASSERT_EQ(400, add_op.code()); + ASSERT_STREQ("The `id` should not be empty.", add_op.error().c_str()); +} + TEST_F(CollectionTest, SearchHighlightFieldFully) { Collection *coll1; @@ -2240,7 +2658,7 @@ TEST_F(CollectionTest, SearchHighlightFieldFully) { auto res = coll1->search("lazy", {"title"}, "", {}, sort_fields, 0, 10, 1, token_ordering::FREQUENCY, true, 10, spp::sparse_hash_set(), - spp::sparse_hash_set(), 10, "", 5, "title").get(); + spp::sparse_hash_set(), 10, "", 5, 5, "title").get(); ASSERT_EQ(1, res["hits"][0]["highlights"].size()); ASSERT_STREQ("The quick brown fox jumped over the lazy dog and ran straight to the forest to sleep.", @@ -2249,14 +2667,14 @@ TEST_F(CollectionTest, SearchHighlightFieldFully) { // should not return value key when highlight_full_fields is not specified res = coll1->search("lazy", {"title"}, "", {}, sort_fields, 0, 10, 1, token_ordering::FREQUENCY, true, 10, spp::sparse_hash_set(), - spp::sparse_hash_set(), 10, "", 5, "").get(); + spp::sparse_hash_set(), 10, "", 5, 5, "").get(); ASSERT_EQ(2, res["hits"][0]["highlights"][0].size()); // query multiple fields res = coll1->search("lazy", {"title", "tags"}, "", {}, sort_fields, 0, 10, 1, token_ordering::FREQUENCY, true, 10, spp::sparse_hash_set(), - spp::sparse_hash_set(), 10, "", 5, "title, tags").get(); + spp::sparse_hash_set(), 10, "", 5, 
5, "title, tags").get(); ASSERT_EQ(2, res["hits"][0]["highlights"].size()); ASSERT_STREQ("The quick brown fox jumped over the lazy dog and ran straight to the forest to sleep.", @@ -2269,7 +2687,7 @@ TEST_F(CollectionTest, SearchHighlightFieldFully) { spp::sparse_hash_set excluded_fields = {"tags"}; res = coll1->search("lazy", {"title", "tags"}, "", {}, sort_fields, 0, 10, 1, token_ordering::FREQUENCY, true, 10, spp::sparse_hash_set(), - excluded_fields, 10, "", 5, "title, tags").get(); + excluded_fields, 10, "", 5, 5, "title, tags").get(); ASSERT_EQ(1, res["hits"][0]["highlights"].size()); ASSERT_STREQ("The quick brown fox jumped over the lazy dog and ran straight to the forest to sleep.", @@ -2279,7 +2697,7 @@ TEST_F(CollectionTest, SearchHighlightFieldFully) { excluded_fields = {"tags", "title"}; res = coll1->search("lazy", {"title", "tags"}, "", {}, sort_fields, 0, 10, 1, token_ordering::FREQUENCY, true, 10, spp::sparse_hash_set(), - excluded_fields, 10, "", 5, "title, tags").get(); + excluded_fields, 10, "", 5, 5, "title, tags").get(); ASSERT_EQ(0, res["hits"][0]["highlights"].size()); collectionManager.drop_collection("coll1"); diff --git a/test/index_test.cpp b/test/index_test.cpp new file mode 100644 index 00000000..c069565e --- /dev/null +++ b/test/index_test.cpp @@ -0,0 +1,59 @@ +#include +#include "index.h" +#include + +TEST(IndexTest, ScrubReindexDoc) { + std::unordered_map search_schema; + search_schema.emplace("title", field("title", field_types::STRING, false)); + search_schema.emplace("points", field("title", field_types::INT32, false)); + search_schema.emplace("cast", field("cast", field_types::STRING_ARRAY, false)); + search_schema.emplace("movie", field("movie", field_types::BOOL, false)); + + Index index("index", search_schema, {}, {}); + nlohmann::json old_doc; + old_doc["id"] = "1"; + old_doc["title"] = "One more thing."; + old_doc["points"] = 100; + old_doc["cast"] = {"John Wick", "Jeremy Renner"}; + old_doc["movie"] = true; + + // all fields remain same + + nlohmann::json update_doc1, del_doc1; + update_doc1 = old_doc; + del_doc1 = old_doc; + + index.scrub_reindex_doc(update_doc1, del_doc1, old_doc); + ASSERT_EQ(1, del_doc1.size()); + ASSERT_STREQ("1", del_doc1["id"].get().c_str()); + + // when only some fields are updated + + nlohmann::json update_doc2, del_doc2; + update_doc2["id"] = "1"; + update_doc2["points"] = 100; + update_doc2["cast"] = {"Jack"}; + + del_doc2 = update_doc2; + + index.scrub_reindex_doc(update_doc2, del_doc2, old_doc); + ASSERT_EQ(2, del_doc2.size()); + ASSERT_STREQ("1", del_doc2["id"].get().c_str()); + std::vector cast = del_doc2["cast"].get>(); + ASSERT_EQ(1, cast.size()); + ASSERT_STREQ("Jack", cast[0].c_str()); + + // containing fields not part of search schema + + nlohmann::json update_doc3, del_doc3; + update_doc3["id"] = "1"; + update_doc3["title"] = "The Lawyer"; + update_doc3["foo"] = "Bar"; + + del_doc3 = update_doc3; + index.scrub_reindex_doc(update_doc3, del_doc3, old_doc); + ASSERT_EQ(3, del_doc3.size()); + ASSERT_STREQ("1", del_doc3["id"].get().c_str()); + ASSERT_STREQ("The Lawyer", del_doc3["title"].get().c_str()); + ASSERT_STREQ("Bar", del_doc3["foo"].get().c_str()); +} \ No newline at end of file diff --git a/test/resources/common100_english.txt b/test/resources/common100_english.txt new file mode 100644 index 00000000..af98316b --- /dev/null +++ b/test/resources/common100_english.txt @@ -0,0 +1,100 @@ +the +of +to +and +a +in +is +it +you +that +he +was +for +on +are +with +as +I +his +they +be +at +one +have +this +from +or 
+had +by +not +word +but +what +some +we +can +out +other +were +all +there +when +up +use +your +how +said +an +each +she +which +do +their +time +if +will +way +about +many +then +them +write +would +like +so +these +her +long +make +thing +see +him +two +has +look +more +day +could +go +come +did +number +sound +no +most +people +my +over +know +water +than +call +first +who +may +down +side +been +now +find \ No newline at end of file diff --git a/test/sorted_array_test.cpp b/test/sorted_array_test.cpp index dfdb9dcc..aadcd52c 100644 --- a/test/sorted_array_test.cpp +++ b/test/sorted_array_test.cpp @@ -12,7 +12,8 @@ TEST(SortedArrayTest, Append) { EXPECT_EQ(arr.indexOf(100), 0); // when not found must be equal to length (0 in this case) for(uint32_t i=0; i < SIZE; i++) { - arr.append(i); + size_t appended_index = arr.append(i); + ASSERT_EQ(i, appended_index); } EXPECT_EQ(arr.getLength(), SIZE); @@ -28,11 +29,94 @@ TEST(SortedArrayTest, Append) { EXPECT_EQ(arr.indexOf(SIZE+1), SIZE); sorted_array arr_small; - arr_small.append(100); + size_t appended_index = arr_small.append(100); + EXPECT_EQ(0, appended_index); EXPECT_EQ(arr_small.getLength(), 1); EXPECT_EQ(arr_small.at(0), 100); } +TEST(SortedArrayTest, AppendOutOfOrder) { + sorted_array arr; + for(size_t i=5; i<=10; i++) { + size_t appended_index = arr.append(i); + ASSERT_EQ(i-5, appended_index); + } + + EXPECT_EQ(6, arr.getLength()); + + int appended_index = -1; + + appended_index = arr.append(1); + ASSERT_EQ(0, appended_index); + + appended_index = arr.append(3); + ASSERT_EQ(1, appended_index); + + appended_index = arr.append(2); + ASSERT_EQ(1, appended_index); + + appended_index = arr.append(4); + ASSERT_EQ(3, appended_index); + + appended_index = arr.append(11); + ASSERT_EQ(10, appended_index); + + appended_index = arr.append(14); + ASSERT_EQ(11, appended_index); + + appended_index = arr.append(12); + ASSERT_EQ(11, appended_index); + + EXPECT_EQ(13, arr.getLength()); +} + +TEST(SortedArrayTest, InsertAtIndex) { + std::vector eles; + sorted_array arr; + for(size_t i=5; i<=9; i++) { + arr.append(i); + } + + arr.append(11); + eles = {5, 6, 7, 8, 9, 11}; + + for(size_t i=0; i < eles.size(); i++) { + ASSERT_EQ(eles[i], arr.at(i)); + } + + arr.insert(0, 1); + eles = { 1, 5, 6, 7, 8, 9, 11 }; + + for(size_t i=0; i < eles.size(); i++) { + ASSERT_EQ(eles[i], arr.at(i)); + } + + ASSERT_EQ(1, arr.at(0)); + ASSERT_EQ(5, arr.at(1)); + + arr.insert(1, 2); + eles = {1, 2, 5, 6, 7, 8, 9, 11}; + ASSERT_EQ(1, arr.at(0)); + ASSERT_EQ(2, arr.at(1)); + ASSERT_EQ(8, arr.getLength()); + + for(size_t i=0; i < eles.size(); i++) { + ASSERT_EQ(eles[i], arr.at(i)); + } + + arr.insert(7, 10); + eles = { 1, 2, 5, 6, 7, 8, 9, 10, 11}; + ASSERT_EQ(10, arr.at(7)); + ASSERT_EQ(11, arr.at(8)); + ASSERT_EQ(9, arr.getLength()); + + for(size_t i=0; i < eles.size(); i++) { + ASSERT_EQ(eles[i], arr.at(i)); + } + + ASSERT_FALSE(arr.insert(9, 12)); // index out of range +} + TEST(SortedArrayTest, Load) { sorted_array arr; @@ -70,6 +154,32 @@ TEST(SortedArrayTest, Uncompress) { delete[] raw_sorted_arr; } +TEST(SortedArrayTest, RemoveValue) { + sorted_array arr; + + const size_t SIZE = 10*1000; + for(size_t i=0; i