diff --git a/WORKSPACE b/WORKSPACE index f2d86794..80f643c9 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -179,7 +179,7 @@ new_git_repository( new_git_repository( name = "hnsw", build_file = "//bazel:hnsw.BUILD", - commit = "5aba40d4b10dd77aece2ab9a1b3fdf06e433466a", + commit = "5100d3fe41da45601875b3f395f508398cb12b8a", remote = "https://github.com/typesense/hnswlib.git", ) diff --git a/include/collection.h b/include/collection.h index 33a73d3c..59c8f9be 100644 --- a/include/collection.h +++ b/include/collection.h @@ -55,6 +55,8 @@ private: mutable std::shared_mutex mutex; + mutable std::shared_mutex index_repair_lock; + const uint8_t CURATED_RECORD_IDENTIFIER = 100; const size_t DEFAULT_TOPSTER_SIZE = 250; @@ -428,7 +430,8 @@ public: const std::vector& sort_by_fields); void batch_index(std::vector& index_records, std::vector& json_out, size_t &num_indexed, - const bool& return_doc, const bool& return_id, const size_t remote_embedding_batch_size = 200); + const bool& return_doc, const bool& return_id, const size_t remote_embedding_batch_size = 200, + const size_t remote_embedding_timeout_ms = 60000, const size_t remote_embedding_num_tries = 2); bool is_exceeding_memory_threshold() const; @@ -442,7 +445,7 @@ public: nlohmann::json get_summary_json() const; size_t batch_index_in_memory(std::vector& index_records, const size_t remote_embedding_batch_size, - const bool generate_embeddings); + const size_t remote_embedding_timeout_ms, const size_t remote_embedding_num_tries, const bool generate_embeddings); Option add(const std::string & json_str, const index_operation_t& operation=CREATE, const std::string& id="", @@ -452,7 +455,9 @@ public: const index_operation_t& operation=CREATE, const std::string& id="", const DIRTY_VALUES& dirty_values=DIRTY_VALUES::COERCE_OR_REJECT, const bool& return_doc=false, const bool& return_id=false, - const size_t remote_embedding_batch_size=200); + const size_t remote_embedding_batch_size=200, + const size_t 
remote_embedding_timeout_ms=60000, + const size_t remote_embedding_num_tries=2); Option update_matching_filter(const std::string& filter_query, const std::string & json_str, @@ -464,6 +469,8 @@ public: tsl::htrie_set& include_fields_full, tsl::htrie_set& exclude_fields_full) const; + void do_housekeeping(); + Option search(std::string query, const std::vector & search_fields, const std::string & filter_query, const std::vector & facet_fields, const std::vector & sort_fields, const std::vector& num_typos, diff --git a/include/collection_manager.h b/include/collection_manager.h index c70ce6ad..7319bb98 100644 --- a/include/collection_manager.h +++ b/include/collection_manager.h @@ -129,6 +129,8 @@ public: std::vector get_collections() const; + std::vector get_collection_names() const; + Collection* get_collection_unsafe(const std::string & collection_name) const; // PUBLICLY EXPOSED API diff --git a/include/index.h b/include/index.h index 748c24e6..40589a3b 100644 --- a/include/index.h +++ b/include/index.h @@ -296,6 +296,9 @@ struct hnsw_index_t { size_t num_dim; vector_distance_type_t distance_type; + // ensures that this index is not dropped when it's being repaired + std::mutex repair_m; + hnsw_index_t(size_t num_dim, size_t init_size, vector_distance_type_t distance_type): space(new hnswlib::InnerProductSpace(num_dim)), vecdex(new hnswlib::HierarchicalNSW(space, init_size, 16, 200, 100, true)), @@ -561,13 +564,14 @@ private: const std::string& token, uint32_t seq_id); void initialize_facet_indexes(const field& facet_field); - - static void batch_embed_fields(std::vector& documents, - const tsl::htrie_map& embedding_fields, - const tsl::htrie_map & search_schema, const size_t remote_embedding_batch_size = 200); std::vector get_group_by_field_iterators(const std::vector&, bool is_reverse=false) const; + static void batch_embed_fields(std::vector& documents, + const tsl::htrie_map& embedding_fields, + const tsl::htrie_map & search_schema, const size_t 
remote_embedding_batch_size = 200, + const size_t remote_embedding_timeout_ms = 60000, const size_t remote_embedding_num_tries = 2); + public: // for limiting number of results on multiple candidates / query rewrites enum {TYPO_TOKENS_THRESHOLD = 1}; @@ -713,7 +717,8 @@ public: const std::string& fallback_field_type, const std::vector& token_separators, const std::vector& symbols_to_index, - const bool do_validation, const size_t remote_embedding_batch_size = 200, const bool generate_embeddings = true); + const bool do_validation, const size_t remote_embedding_batch_size = 200, + const size_t remote_embedding_timeout_ms = 60000, const size_t remote_embedding_num_tries = 2, const bool generate_embeddings = true); static size_t batch_memory_index(Index *index, std::vector& iter_batch, @@ -724,9 +729,10 @@ public: const std::vector& token_separators, const std::vector& symbols_to_index, const bool do_validation, const size_t remote_embedding_batch_size = 200, - const bool generate_embeddings = true, + const size_t remote_embedding_timeout_ms = 60000, + const size_t remote_embedding_num_tries = 2, const bool generate_embeddings = true, const bool use_addition_fields = false, - const tsl::htrie_map& addition_fields = {}); + const tsl::htrie_map& addition_fields = tsl::htrie_map()); void index_field_in_memory(const field& afield, std::vector& iter_batch); @@ -1017,6 +1023,8 @@ public: const uint32_t& seq_id) const; friend class filter_result_iterator_t; + + void repair_hnsw_index(); }; template diff --git a/include/text_embedder.h b/include/text_embedder.h index ca64aa52..370d0670 100644 --- a/include/text_embedder.h +++ b/include/text_embedder.h @@ -17,7 +17,8 @@ class TextEmbedder { TextEmbedder(const nlohmann::json& model_config, size_t num_dims); ~TextEmbedder(); embedding_res_t Embed(const std::string& text, const size_t remote_embedder_timeout_ms = 30000, const size_t remote_embedding_num_tries = 2); - std::vector batch_embed(const std::vector& inputs, const size_t 
remote_embedding_batch_size = 200); + std::vector batch_embed(const std::vector& inputs, const size_t remote_embedding_batch_size = 200, + const size_t remote_embedding_timeout_ms = 60000, const size_t remote_embedding_num_tries = 2); const std::string& get_vocab_file_name() const; const size_t get_num_dim() const; bool is_remote() { diff --git a/include/text_embedder_remote.h b/include/text_embedder_remote.h index b5f219d5..1c39489a 100644 --- a/include/text_embedder_remote.h +++ b/include/text_embedder_remote.h @@ -30,7 +30,8 @@ class RemoteEmbedder { public: virtual nlohmann::json get_error_json(const nlohmann::json& req_body, long res_code, const std::string& res_body) = 0; virtual embedding_res_t Embed(const std::string& text, const size_t remote_embedder_timeout_ms = 30000, const size_t remote_embedding_num_tries = 2) = 0; - virtual std::vector batch_embed(const std::vector& inputs, const size_t remote_embedding_batch_size = 200) = 0; + virtual std::vector batch_embed(const std::vector& inputs, const size_t remote_embedding_batch_size = 200, + const size_t remote_embedding_timeout_ms = 60000, const size_t remote_embedding_num_tries = 2) = 0; static const std::string get_model_key(const nlohmann::json& model_config); static void init(ReplicationState* rs) { raft_server = rs; @@ -50,7 +51,8 @@ class OpenAIEmbedder : public RemoteEmbedder { OpenAIEmbedder(const std::string& openai_model_path, const std::string& api_key); static Option is_model_valid(const nlohmann::json& model_config, size_t& num_dims); embedding_res_t Embed(const std::string& text, const size_t remote_embedder_timeout_ms = 30000, const size_t remote_embedding_num_tries = 2) override; - std::vector batch_embed(const std::vector& inputs, const size_t remote_embedding_batch_size = 200) override; + std::vector batch_embed(const std::vector& inputs, const size_t remote_embedding_batch_size = 200, + const size_t remote_embedding_timeout_ms = 60000, const size_t remote_embedding_num_tries = 2) 
override; nlohmann::json get_error_json(const nlohmann::json& req_body, long res_code, const std::string& res_body) override; static std::string get_model_key(const nlohmann::json& model_config); }; @@ -68,7 +70,8 @@ class GoogleEmbedder : public RemoteEmbedder { GoogleEmbedder(const std::string& google_api_key); static Option is_model_valid(const nlohmann::json& model_config, size_t& num_dims); embedding_res_t Embed(const std::string& text, const size_t remote_embedder_timeout_ms = 30000, const size_t remote_embedding_num_tries = 2) override; - std::vector batch_embed(const std::vector& inputs, const size_t remote_embedding_batch_size = 200) override; + std::vector batch_embed(const std::vector& inputs, const size_t remote_embedding_batch_size = 200, + const size_t remote_embedding_timeout_ms = 60000, const size_t remote_embedding_num_tries = 2) override; nlohmann::json get_error_json(const nlohmann::json& req_body, long res_code, const std::string& res_body) override; static std::string get_model_key(const nlohmann::json& model_config); }; @@ -96,7 +99,8 @@ class GCPEmbedder : public RemoteEmbedder { const std::string& refresh_token, const std::string& client_id, const std::string& client_secret); static Option is_model_valid(const nlohmann::json& model_config, size_t& num_dims); embedding_res_t Embed(const std::string& text, const size_t remote_embedder_timeout_ms = 30000, const size_t remote_embedding_num_tries = 2) override; - std::vector batch_embed(const std::vector& inputs, const size_t remote_embedding_batch_size = 200) override; + std::vector batch_embed(const std::vector& inputs, const size_t remote_embedding_batch_size = 200, + const size_t remote_embedding_timeout_ms = 60000, const size_t remote_embedding_num_tries = 2) override; nlohmann::json get_error_json(const nlohmann::json& req_body, long res_code, const std::string& res_body) override; static std::string get_model_key(const nlohmann::json& model_config); }; diff --git a/include/tsconfig.h 
b/include/tsconfig.h index dc382e80..5e754d84 100644 --- a/include/tsconfig.h +++ b/include/tsconfig.h @@ -69,6 +69,8 @@ private: uint32_t analytics_flush_interval; + uint32_t housekeeping_interval; + protected: Config() { @@ -96,6 +98,7 @@ protected: this->enable_search_analytics = false; this->analytics_flush_interval = 3600; // in seconds + this->housekeeping_interval = 1800; // in seconds } Config(Config const&) { @@ -294,6 +297,10 @@ public: return this->analytics_flush_interval; } + size_t get_housekeeping_interval() const { + return this->housekeeping_interval; + } + size_t get_thread_pool_size() const { return this->thread_pool_size; } @@ -429,6 +436,10 @@ public: this->analytics_flush_interval = std::stoi(get_env("TYPESENSE_ANALYTICS_FLUSH_INTERVAL")); } + if(!get_env("TYPESENSE_HOUSEKEEPING_INTERVAL").empty()) { + this->housekeeping_interval = std::stoi(get_env("TYPESENSE_HOUSEKEEPING_INTERVAL")); + } + if(!get_env("TYPESENSE_THREAD_POOL_SIZE").empty()) { this->thread_pool_size = std::stoi(get_env("TYPESENSE_THREAD_POOL_SIZE")); } @@ -592,6 +603,10 @@ public: this->analytics_flush_interval = (int) reader.GetInteger("server", "analytics-flush-interval", 3600); } + if(reader.Exists("server", "housekeeping-interval")) { + this->housekeeping_interval = (int) reader.GetInteger("server", "housekeeping-interval", 1800); + } + if(reader.Exists("server", "thread-pool-size")) { this->thread_pool_size = (int) reader.GetInteger("server", "thread-pool-size", 0); } @@ -746,6 +761,10 @@ public: this->analytics_flush_interval = options.get("analytics-flush-interval"); } + if(options.exist("housekeeping-interval")) { + this->housekeeping_interval = options.get("housekeeping-interval"); + } + if(options.exist("thread-pool-size")) { this->thread_pool_size = options.get("thread-pool-size"); } diff --git a/src/collection.cpp b/src/collection.cpp index fecb2898..02220990 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -58,6 +58,7 @@ Collection::Collection(const 
std::string& name, const uint32_t collection_id, co Collection::~Collection() { std::unique_lock lock(mutex); + std::unique_lock repair_lock(index_repair_lock); delete index; delete synonym_index; } @@ -391,7 +392,9 @@ Option Collection::add(const std::string & json_str, nlohmann::json Collection::add_many(std::vector& json_lines, nlohmann::json& document, const index_operation_t& operation, const std::string& id, const DIRTY_VALUES& dirty_values, const bool& return_doc, const bool& return_id, - const size_t remote_embedding_batch_size) { + const size_t remote_embedding_batch_size, + const size_t remote_embedding_timeout_ms, + const size_t remote_embedding_num_tries) { //LOG(INFO) << "Memory ratio. Max = " << max_memory_ratio << ", Used = " << SystemMetrics::used_memory_ratio(); std::vector index_records; @@ -481,7 +484,7 @@ nlohmann::json Collection::add_many(std::vector& json_lines, nlohma if((i+1) % index_batch_size == 0 || i == json_lines.size()-1 || repeated_doc) { - batch_index(index_records, json_lines, num_indexed, return_doc, return_id, remote_embedding_batch_size); + batch_index(index_records, json_lines, num_indexed, return_doc, return_id, remote_embedding_batch_size, remote_embedding_timeout_ms, remote_embedding_num_tries); // to return the document for the single doc add cases if(index_records.size() == 1) { @@ -598,9 +601,10 @@ bool Collection::is_exceeding_memory_threshold() const { } void Collection::batch_index(std::vector& index_records, std::vector& json_out, - size_t &num_indexed, const bool& return_doc, const bool& return_id, const size_t remote_embedding_batch_size) { + size_t &num_indexed, const bool& return_doc, const bool& return_id, const size_t remote_embedding_batch_size, + const size_t remote_embedding_timeout_ms, const size_t remote_embedding_num_tries) { - batch_index_in_memory(index_records, remote_embedding_batch_size, true); + batch_index_in_memory(index_records, remote_embedding_batch_size, remote_embedding_timeout_ms, 
remote_embedding_num_tries, true); // store only documents that were indexed in-memory successfully for(auto& index_record: index_records) { @@ -704,11 +708,12 @@ Option Collection::index_in_memory(nlohmann::json &document, uint32_t return Option<>(200); } -size_t Collection::batch_index_in_memory(std::vector& index_records, const size_t remote_embedding_batch_size, const bool generate_embeddings) { +size_t Collection::batch_index_in_memory(std::vector& index_records, const size_t remote_embedding_batch_size, + const size_t remote_embedding_timeout_ms, const size_t remote_embedding_num_tries, const bool generate_embeddings) { std::unique_lock lock(mutex); size_t num_indexed = Index::batch_memory_index(index, index_records, default_sorting_field, search_schema, embedding_fields, fallback_field_type, - token_separators, symbols_to_index, true, remote_embedding_batch_size, generate_embeddings); + token_separators, symbols_to_index, true, remote_embedding_batch_size, remote_embedding_timeout_ms, remote_embedding_num_tries, generate_embeddings); num_documents += num_indexed; return num_indexed; } @@ -4291,8 +4296,9 @@ Option Collection::batch_alter_data(const std::vector& alter_fields } Index::batch_memory_index(index, iter_batch, default_sorting_field, search_schema, embedding_fields, - fallback_field_type, token_separators, symbols_to_index, true, 200, + fallback_field_type, token_separators, symbols_to_index, true, 200, 60000, 2, found_embedding_field, true, schema_additions); + if(found_embedding_field) { for(auto& index_record : iter_batch) { if(index_record.indexed.ok()) { @@ -5380,7 +5386,7 @@ bool Collection::get_enable_nested_fields() { Option Collection::parse_facet(const std::string& facet_field, std::vector& facets) const { const std::regex base_pattern(".+\\(.*\\)"); - const std::regex range_pattern("[[a-zA-Z]+:\\[([0-9]+)\\,\\s*([0-9]+)\\]"); + const std::regex range_pattern("[[a-z A-Z]+:\\[([+-]?([0-9]*[.])?[0-9]*)\\,\\s*([+-]?([0-9]*[.])?[0-9]*)\\]"); 
const std::string _alpha = "_alpha"; if ((facet_field.find(":") != std::string::npos) @@ -5469,24 +5475,49 @@ Option Collection::parse_facet(const std::string& facet_field, std::vector auto pos3 = range.find("]"); int64_t lower_range, upper_range; - auto lower_range_start = pos1 + 2; - auto lower_range_len = pos2 - lower_range_start; - auto upper_range_start = pos2 + 1; - auto upper_range_len = pos3 - upper_range_start; if(a_field.is_integer()) { - std::string lower_range_str = range.substr(lower_range_start, lower_range_len); + auto start = pos1 + 2; + auto end = pos2 - start; + auto lower_range_str = range.substr(start, end); StringUtils::trim(lower_range_str); - lower_range = std::stoll(lower_range_str); - std::string upper_range_str = range.substr(upper_range_start, upper_range_len); - StringUtils::trim(upper_range_str); - upper_range = std::stoll(upper_range_str); - } else { - float val = std::stof(range.substr(pos1 + 2, pos2)); - lower_range = Index::float_to_int64_t(val); + if(lower_range_str.empty()) { + lower_range = INT64_MIN; + } else { + lower_range = std::stoll(lower_range_str); + } - val = std::stof(range.substr(pos2 + 1, pos3)); - upper_range = Index::float_to_int64_t(val); + start = pos2 + 1; + end = pos3 - start; + auto upper_range_str = range.substr(start, end); + StringUtils::trim(upper_range_str); + if(upper_range_str.empty()) { + upper_range = INT64_MAX; + } else { + upper_range = std::stoll(upper_range_str); + } + } else { + auto start = pos1 + 2; + auto end = pos2 - start; + auto lower_range_str = range.substr(start, end); + StringUtils::trim(lower_range_str); + if(lower_range_str.empty()) { + lower_range = INT64_MIN; + } else { + float val = std::stof(lower_range_str); + lower_range = Index::float_to_int64_t(val); + } + + start = pos2 + 1; + end = pos3 - start; + auto upper_range_str = range.substr(start, end); + StringUtils::trim(upper_range_str); + if(upper_range_str.empty()) { + upper_range = INT64_MAX; + } else { + float val = 
std::stof(upper_range_str); + upper_range = Index::float_to_int64_t(val); + } } tupVec.emplace_back(lower_range, upper_range, range_val); @@ -5811,3 +5842,8 @@ void Collection::remove_embedding_field(const std::string& field_name) { tsl::htrie_map Collection::get_embedding_fields_unsafe() { return embedding_fields; } + +void Collection::do_housekeeping() { + std::unique_lock lock(index_repair_lock); + index->repair_hnsw_index(); +} diff --git a/src/collection_manager.cpp b/src/collection_manager.cpp index 8cec4c92..bf3baa70 100644 --- a/src/collection_manager.cpp +++ b/src/collection_manager.cpp @@ -527,6 +527,17 @@ std::vector CollectionManager::get_collections() const { return collection_vec; } +std::vector CollectionManager::get_collection_names() const { + std::shared_lock lock(mutex); + + std::vector collection_vec; + for(const auto& kv: collections) { + collection_vec.push_back(kv.first); + } + + return collection_vec; +} + Option CollectionManager::drop_collection(const std::string& collection_name, const bool remove_from_store) { std::shared_lock s_lock(mutex); auto collection = get_collection_unsafe(collection_name); @@ -1787,7 +1798,7 @@ Option CollectionManager::load_collection(const nlohmann::json &collection // batch must match atleast the number of shards if(exceeds_batch_mem_threshold || (num_valid_docs % batch_size == 0) || last_record) { size_t num_records = index_records.size(); - size_t num_indexed = collection->batch_index_in_memory(index_records, 200, false); + size_t num_indexed = collection->batch_index_in_memory(index_records, 200, 60000, 2, false); batch_doc_str_size = 0; if(num_indexed != num_records) { diff --git a/src/core_api.cpp b/src/core_api.cpp index 2a88eb52..5fc3f0bc 100644 --- a/src/core_api.cpp +++ b/src/core_api.cpp @@ -756,6 +756,8 @@ bool post_import_documents(const std::shared_ptr& req, const std::shar const char *RETURN_DOC = "return_doc"; const char *RETURN_ID = "return_id"; const char *REMOTE_EMBEDDING_BATCH_SIZE = 
"remote_embedding_batch_size"; + const char *REMOTE_EMBEDDING_TIMEOUT_MS = "remote_embedding_timeout_ms"; + const char *REMOTE_EMBEDDING_NUM_TRIES = "remote_embedding_num_tries"; if(req->params.count(BATCH_SIZE) == 0) { req->params[BATCH_SIZE] = "40"; @@ -810,8 +812,18 @@ bool post_import_documents(const std::shared_ptr& req, const std::shar return false; } + if(req->params.count(REMOTE_EMBEDDING_TIMEOUT_MS) == 0) { + req->params[REMOTE_EMBEDDING_TIMEOUT_MS] = "60000"; + } + + if(req->params.count(REMOTE_EMBEDDING_NUM_TRIES) == 0) { + req->params[REMOTE_EMBEDDING_NUM_TRIES] = "2"; + } + const size_t IMPORT_BATCH_SIZE = std::stoi(req->params[BATCH_SIZE]); const size_t REMOTE_EMBEDDING_BATCH_SIZE_VAL = std::stoi(req->params[REMOTE_EMBEDDING_BATCH_SIZE]); + const size_t REMOTE_EMBEDDING_TIMEOUT_MS_VAL = std::stoi(req->params[REMOTE_EMBEDDING_TIMEOUT_MS]); + const size_t REMOTE_EMBEDDING_NUM_TRIES_VAL = std::stoi(req->params[REMOTE_EMBEDDING_NUM_TRIES]); if(IMPORT_BATCH_SIZE == 0) { res->final = true; @@ -827,6 +839,20 @@ bool post_import_documents(const std::shared_ptr& req, const std::shar return false; } + if(REMOTE_EMBEDDING_TIMEOUT_MS_VAL == 0) { + res->final = true; + res->set_400("Parameter `" + std::string(REMOTE_EMBEDDING_TIMEOUT_MS) + "` must be a positive integer."); + stream_response(req, res); + return false; + } + + if(REMOTE_EMBEDDING_NUM_TRIES_VAL == 0) { + res->final = true; + res->set_400("Parameter `" + std::string(REMOTE_EMBEDDING_NUM_TRIES) + "` must be a positive integer."); + stream_response(req, res); + return false; + } + if(req->body_index == 0) { // will log for every major chunk of request body //LOG(INFO) << "Import, req->body.size=" << req->body.size() << ", batch_size=" << IMPORT_BATCH_SIZE; @@ -896,7 +922,7 @@ bool post_import_documents(const std::shared_ptr& req, const std::shar const bool& return_doc = req->params[RETURN_DOC] == "true"; const bool& return_id = req->params[RETURN_ID] == "true"; nlohmann::json json_res = 
collection->add_many(json_lines, document, operation, "", - dirty_values, return_doc, return_id, REMOTE_EMBEDDING_BATCH_SIZE_VAL); + dirty_values, return_doc, return_id, REMOTE_EMBEDDING_BATCH_SIZE_VAL, REMOTE_EMBEDDING_TIMEOUT_MS_VAL, REMOTE_EMBEDDING_NUM_TRIES_VAL); //const std::string& import_summary_json = json_res->dump(); //response_stream << import_summary_json << "\n"; @@ -940,6 +966,7 @@ bool post_add_document(const std::shared_ptr& req, const std::shared_p req->params[DIRTY_VALUES_PARAM] = ""; // set it empty as default will depend on whether schema is enabled } + CollectionManager & collectionManager = CollectionManager::get_instance(); auto collection = collectionManager.get_collection(req->params["collection"]); @@ -951,10 +978,22 @@ bool post_add_document(const std::shared_ptr& req, const std::shared_p const index_operation_t operation = get_index_operation(req->params[ACTION]); const auto& dirty_values = collection->parse_dirty_values_option(req->params[DIRTY_VALUES_PARAM]); + size_t remote_embedding_timeout_ms = 60000; + size_t remote_embedding_num_tries = 2; + + if(req->params.count("remote_embedding_timeout_ms") != 0) { + remote_embedding_timeout_ms = std::stoul(req->params["remote_embedding_timeout_ms"]); + } + + if(req->params.count("remote_embedding_num_tries") != 0) { + remote_embedding_num_tries = std::stoul(req->params["remote_embedding_num_tries"]); + } + nlohmann::json document; std::vector json_lines = {req->body}; const nlohmann::json& inserted_doc_op = collection->add_many(json_lines, document, operation, "", dirty_values, - false, false); + false, false, 200, remote_embedding_timeout_ms, + remote_embedding_num_tries); if(!inserted_doc_op["success"].get()) { nlohmann::json res_doc; diff --git a/src/index.cpp b/src/index.cpp index 1ba401c3..e7b04d1d 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -206,6 +206,7 @@ Index::~Index() { delete seq_ids; for(auto& vec_index_kv: vector_index) { + std::unique_lock 
lock(vec_index_kv.second->repair_m); delete vec_index_kv.second; } @@ -432,7 +433,8 @@ void Index::validate_and_preprocess(Index *index, const std::string& fallback_field_type, const std::vector& token_separators, const std::vector& symbols_to_index, - const bool do_validation, const size_t remote_embedding_batch_size, const bool generate_embeddings) { + const bool do_validation, const size_t remote_embedding_batch_size, + const size_t remote_embedding_timeout_ms, const size_t remote_embedding_num_tries, const bool generate_embeddings) { // runs in a partitioned thread std::vector records_to_embed; @@ -523,7 +525,7 @@ void Index::validate_and_preprocess(Index *index, } if(generate_embeddings) { - batch_embed_fields(records_to_embed, embedding_fields, search_schema, remote_embedding_batch_size); + batch_embed_fields(records_to_embed, embedding_fields, search_schema, remote_embedding_batch_size, remote_embedding_timeout_ms, remote_embedding_num_tries); } } @@ -535,7 +537,8 @@ size_t Index::batch_memory_index(Index *index, const std::string& fallback_field_type, const std::vector& token_separators, const std::vector& symbols_to_index, - const bool do_validation, const size_t remote_embedding_batch_size, const bool generate_embeddings, + const bool do_validation, const size_t remote_embedding_batch_size, + const size_t remote_embedding_timeout_ms, const size_t remote_embedding_num_tries, const bool generate_embeddings, const bool use_addition_fields, const tsl::htrie_map& addition_fields) { const size_t concurrency = 4; @@ -568,7 +571,7 @@ size_t Index::batch_memory_index(Index *index, index->thread_pool->enqueue([&, batch_index, batch_len]() { write_log_index = local_write_log_index; validate_and_preprocess(index, iter_batch, batch_index, batch_len, default_sorting_field, actual_search_schema, - embedding_fields, fallback_field_type, token_separators, symbols_to_index, do_validation, remote_embedding_batch_size, generate_embeddings); + embedding_fields, 
fallback_field_type, token_separators, symbols_to_index, do_validation, remote_embedding_batch_size, remote_embedding_timeout_ms, remote_embedding_num_tries, generate_embeddings); std::unique_lock lock(m_process); num_processed++; @@ -6660,6 +6663,7 @@ void Index::refresh_schemas(const std::vector& new_fields, const std::vec if(del_field.num_dim) { auto hnsw_index = vector_index[del_field.name]; + std::unique_lock lock(hnsw_index->repair_m); delete hnsw_index; vector_index.erase(del_field.name); } @@ -6951,8 +6955,9 @@ bool Index::common_results_exist(std::vector& leaves, bool must_match void Index::batch_embed_fields(std::vector& records, - const tsl::htrie_map& embedding_fields, - const tsl::htrie_map & search_schema, const size_t remote_embedding_batch_size) { + const tsl::htrie_map& embedding_fields, + const tsl::htrie_map & search_schema, const size_t remote_embedding_batch_size, + const size_t remote_embedding_timeout_ms, const size_t remote_embedding_num_tries) { for(const auto& field : embedding_fields) { std::vector> texts_to_embed; auto indexing_prefix = TextEmbedderManager::get_instance().get_indexing_prefix(field.embed[fields::model_config]); @@ -7023,7 +7028,8 @@ void Index::batch_embed_fields(std::vector& records, texts.push_back(text_to_embed.second); } - auto embeddings = embedder_op.get()->batch_embed(texts, remote_embedding_batch_size); + auto embeddings = embedder_op.get()->batch_embed(texts, remote_embedding_batch_size, remote_embedding_timeout_ms, + remote_embedding_num_tries); for(size_t i = 0; i < embeddings.size(); i++) { auto& embedding_res = embeddings[i]; @@ -7032,13 +7038,35 @@ void Index::batch_embed_fields(std::vector& records, texts_to_embed[i].first->index_failure(embedding_res.status_code, ""); continue; } - nlohmann::json* document; if(texts_to_embed[i].first->is_update) { - document = &texts_to_embed[i].first->new_doc; - } else { - document = &texts_to_embed[i].first->doc; - } - (*document)[field.name] = embedding_res.embedding; + 
texts_to_embed[i].first->new_doc[field.name] = embedding_res.embedding; + } + texts_to_embed[i].first->doc[field.name] = embedding_res.embedding; + } + } +} + +void Index::repair_hnsw_index() { + std::vector vector_fields; + + // this lock ensures that the `vector_index` map is not mutated during read + std::shared_lock read_lock(mutex); + + for(auto& vec_kv: vector_index) { + vector_fields.push_back(vec_kv.first); + } + + read_lock.unlock(); + + for(const auto& vector_field: vector_fields) { + read_lock.lock(); + if(vector_index.count(vector_field) != 0) { + // this lock ensures that the vector index is not dropped during repair + std::unique_lock lock(vector_index[vector_field]->repair_m); + read_lock.unlock(); // release this lock since repair is a long running operation + vector_index[vector_field]->vecdex->repair_zero_indegree(); + } else { + read_lock.unlock(); } } } diff --git a/src/text_embedder.cpp b/src/text_embedder.cpp index 126655f9..897b1c0b 100644 --- a/src/text_embedder.cpp +++ b/src/text_embedder.cpp @@ -152,7 +152,8 @@ embedding_res_t TextEmbedder::Embed(const std::string& text, const size_t remote } } -std::vector TextEmbedder::batch_embed(const std::vector& inputs, const size_t remote_embedding_batch_size) { +std::vector TextEmbedder::batch_embed(const std::vector& inputs, const size_t remote_embedding_batch_size, + const size_t remote_embedding_timeout_ms, const size_t remote_embedding_num_tries) { std::vector outputs; if(!is_remote()) { std::lock_guard lock(mutex_); @@ -235,7 +236,7 @@ std::vector TextEmbedder::batch_embed(const std::vectorbatch_embed(inputs, remote_embedding_batch_size)); + outputs = std::move(remote_embedder_->batch_embed(inputs, remote_embedding_batch_size, remote_embedding_timeout_ms, remote_embedding_num_tries)); } return outputs; diff --git a/src/text_embedder_remote.cpp b/src/text_embedder_remote.cpp index 74226db2..27409c2d 100644 --- a/src/text_embedder_remote.cpp +++ b/src/text_embedder_remote.cpp @@ -188,13 +188,14 
@@ embedding_res_t OpenAIEmbedder::Embed(const std::string& text, const size_t remo } } -std::vector OpenAIEmbedder::batch_embed(const std::vector& inputs, const size_t remote_embedding_batch_size) { +std::vector OpenAIEmbedder::batch_embed(const std::vector& inputs, const size_t remote_embedding_batch_size, + const size_t remote_embedding_timeout_ms, const size_t remote_embedding_num_tries) { // call recursively if inputs larger than remote_embedding_batch_size if(inputs.size() > remote_embedding_batch_size) { std::vector outputs; for(size_t i = 0; i < inputs.size(); i += remote_embedding_batch_size) { auto batch = std::vector(inputs.begin() + i, inputs.begin() + std::min(i + remote_embedding_batch_size, inputs.size())); - auto batch_outputs = batch_embed(batch, remote_embedding_batch_size); + auto batch_outputs = batch_embed(batch, remote_embedding_batch_size, remote_embedding_timeout_ms, remote_embedding_num_tries); outputs.insert(outputs.end(), batch_outputs.begin(), batch_outputs.end()); } return outputs; @@ -206,6 +207,8 @@ std::vector OpenAIEmbedder::batch_embed(const std::vector headers; headers["Authorization"] = "Bearer " + api_key; headers["Content-Type"] = "application/json"; + headers["timeout_ms"] = std::to_string(remote_embedding_timeout_ms); + headers["num_try"] = std::to_string(remote_embedding_num_tries); std::map res_headers; std::string res; auto res_code = call_remote_api("POST", OPENAI_CREATE_EMBEDDING, req_body.dump(), res, res_headers, headers); @@ -370,10 +373,22 @@ embedding_res_t GoogleEmbedder::Embed(const std::string& text, const size_t remo } -std::vector GoogleEmbedder::batch_embed(const std::vector& inputs, const size_t remote_embedding_batch_size) { +std::vector GoogleEmbedder::batch_embed(const std::vector& inputs, const size_t remote_embedding_batch_size, + const size_t remote_embedding_timeout_ms, const size_t remote_embedding_num_tries) { std::vector outputs; + bool timeout_prev = false; for(auto& input : inputs) { - auto res = 
Embed(input); + auto res = Embed(input, remote_embedding_timeout_ms, remote_embedding_num_tries); + if(res.status_code == 408) { + if(timeout_prev) { + // fail whole batch if two consecutive timeouts + nlohmann::json req_body; + req_body["text"] = input; + return std::vector(inputs.size(), embedding_res_t(408, get_error_json(req_body, 408, ""))); + } + timeout_prev = true; + } else { + timeout_prev = false; + } + outputs.push_back(res); } @@ -529,7 +544,8 @@ embedding_res_t GCPEmbedder::Embed(const std::string& text, const size_t remote_ } -std::vector GCPEmbedder::batch_embed(const std::vector& inputs, const size_t remote_embedding_batch_size) { +std::vector GCPEmbedder::batch_embed(const std::vector& inputs, const size_t remote_embedding_batch_size, + const size_t remote_embedding_timeout_ms, const size_t remote_embedding_num_tries) { // GCP API has a limit of 5 instances per request if(inputs.size() > 5) { std::vector res; @@ -549,6 +565,8 @@ std::vector GCPEmbedder::batch_embed(const std::vector headers; headers["Authorization"] = "Bearer " + access_token; headers["Content-Type"] = "application/json"; + headers["timeout_ms"] = std::to_string(remote_embedding_timeout_ms); + headers["num_try"] = std::to_string(remote_embedding_num_tries); std::map res_headers; std::string res; auto res_code = call_remote_api("POST", get_gcp_embedding_url(project_id, model_name), req_body.dump(), res, res_headers, headers); diff --git a/src/typesense_server_utils.cpp b/src/typesense_server_utils.cpp index 3c96e25b..97cfbd00 100644 --- a/src/typesense_server_utils.cpp +++ b/src/typesense_server_utils.cpp @@ -13,7 +13,8 @@ #include #include #include -#include +#include "analytics_manager.h" +#include "housekeeper.h" #include "core_api.h" #include "ratelimit_manager.h" @@ -104,6 +105,7 @@ void init_cmdline_options(cmdline::parser & options, int argc, char **argv) { options.add("log-slow-searches-time-ms", '\0', "When >= 0, searches that take longer than this duration are logged.", false,
30*1000); options.add("cache-num-entries", '\0', "Number of entries to cache.", false, 1000); options.add("analytics-flush-interval", '\0', "Frequency of persisting analytics data to disk (in seconds).", false, 3600); + options.add("housekeeping-interval", '\0', "Frequency of housekeeping background job (in seconds).", false, 1800); // DEPRECATED options.add("listen-address", 'h', "[DEPRECATED: use `api-address`] Address to which Typesense API service binds.", false, "0.0.0.0"); @@ -457,6 +459,11 @@ int run_server(const Config & config, const std::string & version, void (*master AnalyticsManager::get_instance().run(&replication_state); }); + HouseKeeper::get_instance().init(config.get_housekeeping_interval()); + std::thread housekeeping_thread([]() { + HouseKeeper::get_instance().run(); + }); + RemoteEmbedder::init(&replication_state); std::string path_to_nodes = config.get_nodes(); @@ -481,6 +488,10 @@ int run_server(const Config & config, const std::string & version, void (*master LOG(INFO) << "Waiting for event sink thread to be done..."; event_sink_thread.join(); + LOG(INFO) << "Waiting for housekeeping thread to be done..."; + HouseKeeper::get_instance().stop(); + housekeeping_thread.join(); + LOG(INFO) << "Shutting down server_thread_pool"; server_thread_pool.shutdown(); diff --git a/test/collection_faceting_test.cpp b/test/collection_faceting_test.cpp index 50066856..909fe676 100644 --- a/test/collection_faceting_test.cpp +++ b/test/collection_faceting_test.cpp @@ -1229,6 +1229,7 @@ TEST_F(CollectionFacetingTest, FacetParseTest){ field("grade", field_types::INT32, true), field("rank", field_types::INT32, true), field("range", field_types::INT32, true), + field("review", field_types::FLOAT, true), field("scale", field_types::INT32, false), }; @@ -1326,6 +1327,34 @@ TEST_F(CollectionFacetingTest, FacetParseTest){ ASSERT_EQ("rank", mixed_facets_ptr[2]->field_name); ASSERT_EQ("range", mixed_facets_ptr[1]->field_name); + + std::vector range_facet_float_fields { + 
"review(bad:[0, 2.5], good:[2.5, 5])" + }; + + std::vector float_facets; + for(const std::string & facet_field: range_facet_float_fields) { + auto res = coll1->parse_facet(facet_field, float_facets); + + if(!res.error().empty()) { + LOG(ERROR) << res.error(); + FAIL(); + } + } + + std::vector range_facet_negative_range { + "review(bad:[-2.5, 2.5], good:[2.5, 5])" + }; + + std::vector negative_range; + for(const std::string & facet_field: range_facet_negative_range) { + auto res = coll1->parse_facet(facet_field, negative_range); + + if(!res.error().empty()) { + LOG(ERROR) << res.error(); + FAIL(); + } + } } TEST_F(CollectionFacetingTest, RangeFacetTest) { @@ -1667,6 +1696,204 @@ TEST_F(CollectionFacetingTest, RangeFacetTypo) { collectionManager.drop_collection("coll1"); } +TEST_F(CollectionFacetingTest, RangeFacetsFloatRange) { + std::vector fields = {field("name", field_types::STRING, false), + field("inches", field_types::FLOAT, true),}; + Collection* coll1 = collectionManager.create_collection( + "coll1", 1, fields, "", 0, "", {}, {}).get(); + + nlohmann::json doc; + doc["id"] = "0"; + doc["name"] = "TV 1"; + doc["inches"] = 32.4; + + ASSERT_TRUE(coll1->add(doc.dump()).ok()); + + doc["id"] = "1"; + doc["name"] = "TV 2"; + doc["inches"] = 55; + + ASSERT_TRUE(coll1->add(doc.dump()).ok()); + + doc["id"] = "2"; + doc["name"] = "TV 3"; + doc["inches"] = 55.6; + + ASSERT_TRUE(coll1->add(doc.dump()).ok()); + + auto results = coll1->search("*", {}, + "", {"inches(small:[0, 55.5])"}, + {}, {2}, 10, + 1, FREQUENCY, {true}, + 10, spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30, 4, "", 10, {}, {}, {}, 0, + "", "", {}, 1000, + true, false, true, "", true).get(); + + ASSERT_EQ(1, results["facet_counts"][0]["counts"].size()); + ASSERT_EQ(2, (int) results["facet_counts"][0]["counts"][0]["count"]); + ASSERT_EQ("small", results["facet_counts"][0]["counts"][0]["value"]); +} + +TEST_F(CollectionFacetingTest, RangeFacetsMinMaxRange) { + std::vector fields = 
{field("name", field_types::STRING, false), + field("inches", field_types::FLOAT, true),}; + Collection* coll1 = collectionManager.create_collection( + "coll1", 1, fields, "", 0, "", {}, {}).get(); + + nlohmann::json doc; + doc["id"] = "0"; + doc["name"] = "TV 1"; + doc["inches"] = 32.4; + + ASSERT_TRUE(coll1->add(doc.dump()).ok()); + + doc["id"] = "1"; + doc["name"] = "TV 2"; + doc["inches"] = 55; + + ASSERT_TRUE(coll1->add(doc.dump()).ok()); + + doc["id"] = "2"; + doc["name"] = "TV 3"; + doc["inches"] = 55.6; + + ASSERT_TRUE(coll1->add(doc.dump()).ok()); + + auto results = coll1->search("*", {}, + "", {"inches(small:[0, 55], large:[55, ])"}, + {}, {2}, 10, + 1, FREQUENCY, {true}, + 10, spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30, 4, "", 10, {}, {}, {}, 0, + "", "", {}, 1000, + true, false, true, "", true).get(); + + ASSERT_EQ(2, results["facet_counts"][0]["counts"].size()); + ASSERT_EQ(2, (int) results["facet_counts"][0]["counts"][0]["count"]); + ASSERT_EQ("small", results["facet_counts"][0]["counts"][0]["value"]); + ASSERT_EQ(1, (int) results["facet_counts"][0]["counts"][1]["count"]); + ASSERT_EQ("large", results["facet_counts"][0]["counts"][1]["value"]); + + results = coll1->search("*", {}, + "", {"inches(small:[,55])"}, + {}, {2}, 10, + 1, FREQUENCY, {true}, + 10, spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30, 4, "", 10, {}, {}, {}, 0, + "", "", {}, 1000, + true, false, true, "", true).get(); + + ASSERT_EQ(1, results["facet_counts"][0]["counts"].size()); + ASSERT_EQ(2, (int) results["facet_counts"][0]["counts"][0]["count"]); + ASSERT_EQ("small", results["facet_counts"][0]["counts"][0]["value"]); +} + +TEST_F(CollectionFacetingTest, RangeFacetRangeLabelWithSpace) { + std::vector fields = {field("name", field_types::STRING, false), + field("inches", field_types::FLOAT, true),}; + Collection* coll1 = collectionManager.create_collection( + "coll1", 1, fields, "", 0, "", {}, {}).get(); + + nlohmann::json doc; + doc["id"] = "0"; + 
doc["name"] = "TV 1"; + doc["inches"] = 32.4; + + ASSERT_TRUE(coll1->add(doc.dump()).ok()); + + doc["id"] = "1"; + doc["name"] = "TV 2"; + doc["inches"] = 55; + + ASSERT_TRUE(coll1->add(doc.dump()).ok()); + + doc["id"] = "2"; + doc["name"] = "TV 3"; + doc["inches"] = 55.6; + + ASSERT_TRUE(coll1->add(doc.dump()).ok()); + + auto results = coll1->search("*", {}, + "", {"inches(small tvs with display size:[0,55])"}, + {}, {2}, 10, + 1, FREQUENCY, {true}, + 10, spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30, 4, "", 10, {}, {}, {}, 0, + "", "", {}, 1000, + true, false, true, "", true).get(); + + ASSERT_EQ(1, results["facet_counts"][0]["counts"].size()); + ASSERT_EQ(2, (int) results["facet_counts"][0]["counts"][0]["count"]); + ASSERT_EQ("small tvs with display size", results["facet_counts"][0]["counts"][0]["value"]); +} + +TEST_F(CollectionFacetingTest, RangeFacetRangeNegativeRanges) { + std::vector fields = {field("team", field_types::STRING, false), + field("nrr", field_types::FLOAT, true),}; + Collection* coll1 = collectionManager.create_collection( + "coll1", 1, fields, "", 0, "", + {},{}).get(); + + nlohmann::json doc; + doc["id"] = "0"; + doc["team"] = "india"; + doc["nrr"] = 1.353; + ASSERT_TRUE(coll1->add(doc.dump()).ok()); + + doc["id"] = "1"; + doc["team"] = "australia"; + doc["nrr"] = -0.193; + ASSERT_TRUE(coll1->add(doc.dump()).ok()); + + doc["id"] = "2"; + doc["team"] = "pakistan"; + doc["nrr"] = -0.400; + ASSERT_TRUE(coll1->add(doc.dump()).ok()); + + doc["id"] = "3"; + doc["team"] = "afghanistan"; + doc["nrr"] = -0.969; + ASSERT_TRUE(coll1->add(doc.dump()).ok()); + + doc["id"] = "4"; + doc["team"] = "srilanka"; + doc["nrr"] = -1.048; + ASSERT_TRUE(coll1->add(doc.dump()).ok()); + + doc["id"] = "5"; + doc["team"] = "england"; + doc["nrr"] = -1.248; + ASSERT_TRUE(coll1->add(doc.dump()).ok()); + + doc["id"] = "6"; + doc["team"] = "bangladesh"; + doc["nrr"] = -1.253; + ASSERT_TRUE(coll1->add(doc.dump()).ok()); + + doc["id"] = "7"; + doc["team"] = 
"new zealand"; + doc["nrr"] = 1.481; + ASSERT_TRUE(coll1->add(doc.dump()).ok()); + + auto results = coll1->search("*", {}, + "", {"nrr(poor:[-1.5,-1], decent:[-1,0], good:[0,2])"}, + {}, {2}, 10, + 1, FREQUENCY, {true}, + 10, spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30, 4, "", 10, {}, {}, {}, 0, + "", "", {}, 1000, + true, false, true, "", true).get(); + + ASSERT_EQ(3, results["facet_counts"][0]["counts"].size()); + ASSERT_EQ(3, (int) results["facet_counts"][0]["counts"][0]["count"]); + ASSERT_EQ("poor", results["facet_counts"][0]["counts"][0]["value"]); + ASSERT_EQ(3, (int) results["facet_counts"][0]["counts"][1]["count"]); + ASSERT_EQ("decent", results["facet_counts"][0]["counts"][1]["value"]); + ASSERT_EQ(2, (int) results["facet_counts"][0]["counts"][2]["count"]); + ASSERT_EQ("good", results["facet_counts"][0]["counts"][2]["value"]); +} + TEST_F(CollectionFacetingTest, SampleFacetCounts) { nlohmann::json schema = R"({ "name": "coll1", diff --git a/test/collection_vector_search_test.cpp b/test/collection_vector_search_test.cpp index 70e0a15b..5181c76b 100644 --- a/test/collection_vector_search_test.cpp +++ b/test/collection_vector_search_test.cpp @@ -2747,3 +2747,95 @@ TEST_F(CollectionVectorTest, TestSearchNonIndexedVectorField) { ASSERT_FALSE(search_result.ok()); ASSERT_EQ("Field `vec` is marked as a non-indexed field in the schema.", search_result.error()); } + +TEST_F(CollectionVectorTest, TestSemanticSearchAfterUpdate) { + nlohmann::json schema = R"({ + "name": "test", + "fields": [ + { + "name": "name", + "type": "string" + }, + { + "name": "embedding", + "type": "float[]", + "embed": { + "from": [ + "name" + ], + "model_config": { + "model_name": "ts/e5-small" + } + } + } + ] + })"_json; + + TextEmbedderManager::set_model_dir("/tmp/typesense_test/models"); + + auto collection_create_op = collectionManager.create_collection(schema); + ASSERT_TRUE(collection_create_op.ok()); + + auto coll = collection_create_op.get(); + + auto add_op = 
coll->add(R"({ + "name": "soccer", + "id": "0" + })"_json.dump()); + + ASSERT_TRUE(add_op.ok()); + + add_op = coll->add(R"({ + "name": "basketball", + "id": "1" + })"_json.dump()); + + ASSERT_TRUE(add_op.ok()); + + add_op = coll->add(R"({ + "name": "typesense", + "id": "2" + })"_json.dump()); + + ASSERT_TRUE(add_op.ok()); + + add_op = coll->add(R"({ + "name": "potato", + "id": "3" + })"_json.dump()); + + ASSERT_TRUE(add_op.ok()); + + auto result = coll->search("*", {}, "", {}, {}, {0}, 20, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD, + spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30, 5, + "", 10, {}, {}, {}, 0, + "", "", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7, + fallback, + 4, {off}, 32767, 32767, 2, + false, true, "embedding:([], id:0, k:1)"); + + ASSERT_TRUE(result.ok()); + ASSERT_EQ(1, result.get()["hits"].size()); + ASSERT_EQ("basketball", result.get()["hits"][0]["document"]["name"]); + + auto update_op = coll->add(R"({ + "name": "onion", + "id": "0" + })"_json.dump(), index_operation_t::UPDATE, "0"); + + ASSERT_TRUE(update_op.ok()); + + result = coll->search("*", {}, "", {}, {}, {0}, 20, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD, + spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30, 5, + "", 10, {}, {}, {}, 0, + "", "", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7, + fallback, + 4, {off}, 32767, 32767, 2, + false, true, "embedding:([], id:0, k:1)"); + + ASSERT_TRUE(result.ok()); + ASSERT_EQ(1, result.get()["hits"].size()); + ASSERT_EQ("potato", result.get()["hits"][0]["document"]["name"]); +}