Merge branch 'v0.25-join' into v0.26-facets

# Conflicts:
#	include/index.h
#	src/collection.cpp
#	test/collection_vector_search_test.cpp
Kishore Nallan 2023-10-30 21:41:47 +05:30
commit ae597f40ba
16 changed files with 566 additions and 62 deletions

View File

@ -179,7 +179,7 @@ new_git_repository(
new_git_repository(
name = "hnsw",
build_file = "//bazel:hnsw.BUILD",
commit = "5aba40d4b10dd77aece2ab9a1b3fdf06e433466a",
commit = "5100d3fe41da45601875b3f395f508398cb12b8a",
remote = "https://github.com/typesense/hnswlib.git",
)
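The pinned hnswlib fork moves from commit `5aba40d` to `5100d3f`. Presumably this bump pulls in the `repair_zero_indegree()` method that `Index::repair_hnsw_index()` calls later in this commit; the fork lives at typesense/hnswlib, and its own changes are not part of this diff.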

View File

@ -55,6 +55,8 @@ private:
mutable std::shared_mutex mutex;
mutable std::shared_mutex index_repair_lock;
const uint8_t CURATED_RECORD_IDENTIFIER = 100;
const size_t DEFAULT_TOPSTER_SIZE = 250;
@ -428,7 +430,8 @@ public:
const std::vector<sort_by>& sort_by_fields);
void batch_index(std::vector<index_record>& index_records, std::vector<std::string>& json_out, size_t &num_indexed,
const bool& return_doc, const bool& return_id, const size_t remote_embedding_batch_size = 200);
const bool& return_doc, const bool& return_id, const size_t remote_embedding_batch_size = 200,
const size_t remote_embedding_timeout_ms = 60000, const size_t remote_embedding_num_tries = 2);
bool is_exceeding_memory_threshold() const;
@ -442,7 +445,7 @@ public:
nlohmann::json get_summary_json() const;
size_t batch_index_in_memory(std::vector<index_record>& index_records, const size_t remote_embedding_batch_size,
const bool generate_embeddings);
const size_t remote_embedding_timeout_ms, const size_t remote_embedding_num_tries, const bool generate_embeddings);
Option<nlohmann::json> add(const std::string & json_str,
const index_operation_t& operation=CREATE, const std::string& id="",
@ -452,7 +455,9 @@ public:
const index_operation_t& operation=CREATE, const std::string& id="",
const DIRTY_VALUES& dirty_values=DIRTY_VALUES::COERCE_OR_REJECT,
const bool& return_doc=false, const bool& return_id=false,
const size_t remote_embedding_batch_size=200);
const size_t remote_embedding_batch_size=200,
const size_t remote_embedding_timeout_ms=60000,
const size_t remote_embedding_num_tries=2);
Option<nlohmann::json> update_matching_filter(const std::string& filter_query,
const std::string & json_str,
@ -464,6 +469,8 @@ public:
tsl::htrie_set<char>& include_fields_full,
tsl::htrie_set<char>& exclude_fields_full) const;
void do_housekeeping();
Option<nlohmann::json> search(std::string query, const std::vector<std::string> & search_fields,
const std::string & filter_query, const std::vector<std::string> & facet_fields,
const std::vector<sort_by> & sort_fields, const std::vector<uint32_t>& num_typos,

View File

@ -129,6 +129,8 @@ public:
std::vector<Collection*> get_collections() const;
std::vector<std::string> get_collection_names() const;
Collection* get_collection_unsafe(const std::string & collection_name) const;
// PUBLICLY EXPOSED API

View File

@ -296,6 +296,9 @@ struct hnsw_index_t {
size_t num_dim;
vector_distance_type_t distance_type;
// ensures that this index is not dropped when it's being repaired
std::mutex repair_m;
hnsw_index_t(size_t num_dim, size_t init_size, vector_distance_type_t distance_type):
space(new hnswlib::InnerProductSpace(num_dim)),
vecdex(new hnswlib::HierarchicalNSW<float>(space, init_size, 16, 200, 100, true)),
@ -561,13 +564,14 @@ private:
const std::string& token, uint32_t seq_id);
void initialize_facet_indexes(const field& facet_field);
static void batch_embed_fields(std::vector<index_record*>& documents,
const tsl::htrie_map<char, field>& embedding_fields,
const tsl::htrie_map<char, field> & search_schema, const size_t remote_embedding_batch_size = 200);
std::vector<group_by_field_it_t> get_group_by_field_iterators(const std::vector<std::string>&, bool is_reverse=false) const;
static void batch_embed_fields(std::vector<index_record*>& documents,
const tsl::htrie_map<char, field>& embedding_fields,
const tsl::htrie_map<char, field> & search_schema, const size_t remote_embedding_batch_size = 200,
const size_t remote_embedding_timeout_ms = 60000, const size_t remote_embedding_num_tries = 2);
public:
// for limiting number of results on multiple candidates / query rewrites
enum {TYPO_TOKENS_THRESHOLD = 1};
@ -713,7 +717,8 @@ public:
const std::string& fallback_field_type,
const std::vector<char>& token_separators,
const std::vector<char>& symbols_to_index,
const bool do_validation, const size_t remote_embedding_batch_size = 200, const bool generate_embeddings = true);
const bool do_validation, const size_t remote_embedding_batch_size = 200,
const size_t remote_embedding_timeout_ms = 60000, const size_t remote_embedding_num_tries = 2, const bool generate_embeddings = true);
static size_t batch_memory_index(Index *index,
std::vector<index_record>& iter_batch,
@ -724,9 +729,10 @@ public:
const std::vector<char>& token_separators,
const std::vector<char>& symbols_to_index,
const bool do_validation, const size_t remote_embedding_batch_size = 200,
const bool generate_embeddings = true,
const size_t remote_embedding_timeout_ms = 60000,
const size_t remote_embedding_num_tries = 2, const bool generate_embeddings = true,
const bool use_addition_fields = false,
const tsl::htrie_map<char, field>& addition_fields = {});
const tsl::htrie_map<char, field>& addition_fields = tsl::htrie_map<char, field>());
void index_field_in_memory(const field& afield, std::vector<index_record>& iter_batch);
@ -1017,6 +1023,8 @@ public:
const uint32_t& seq_id) const;
friend class filter_result_iterator_t;
void repair_hnsw_index();
};
template<class T>

View File

@ -17,7 +17,8 @@ class TextEmbedder {
TextEmbedder(const nlohmann::json& model_config, size_t num_dims);
~TextEmbedder();
embedding_res_t Embed(const std::string& text, const size_t remote_embedder_timeout_ms = 30000, const size_t remote_embedding_num_tries = 2);
std::vector<embedding_res_t> batch_embed(const std::vector<std::string>& inputs, const size_t remote_embedding_batch_size = 200);
std::vector<embedding_res_t> batch_embed(const std::vector<std::string>& inputs, const size_t remote_embedding_batch_size = 200,
const size_t remote_embedding_timeout_ms = 60000, const size_t remote_embedding_num_tries = 2);
const std::string& get_vocab_file_name() const;
const size_t get_num_dim() const;
bool is_remote() {

View File

@ -30,7 +30,8 @@ class RemoteEmbedder {
public:
virtual nlohmann::json get_error_json(const nlohmann::json& req_body, long res_code, const std::string& res_body) = 0;
virtual embedding_res_t Embed(const std::string& text, const size_t remote_embedder_timeout_ms = 30000, const size_t remote_embedding_num_tries = 2) = 0;
virtual std::vector<embedding_res_t> batch_embed(const std::vector<std::string>& inputs, const size_t remote_embedding_batch_size = 200) = 0;
virtual std::vector<embedding_res_t> batch_embed(const std::vector<std::string>& inputs, const size_t remote_embedding_batch_size = 200,
const size_t remote_embedding_timeout_ms = 60000, const size_t remote_embedding_num_tries = 2) = 0;
static const std::string get_model_key(const nlohmann::json& model_config);
static void init(ReplicationState* rs) {
raft_server = rs;
@ -50,7 +51,8 @@ class OpenAIEmbedder : public RemoteEmbedder {
OpenAIEmbedder(const std::string& openai_model_path, const std::string& api_key);
static Option<bool> is_model_valid(const nlohmann::json& model_config, size_t& num_dims);
embedding_res_t Embed(const std::string& text, const size_t remote_embedder_timeout_ms = 30000, const size_t remote_embedding_num_tries = 2) override;
std::vector<embedding_res_t> batch_embed(const std::vector<std::string>& inputs, const size_t remote_embedding_batch_size = 200) override;
std::vector<embedding_res_t> batch_embed(const std::vector<std::string>& inputs, const size_t remote_embedding_batch_size = 200,
const size_t remote_embedding_timeout_ms = 60000, const size_t remote_embedding_num_tries = 2) override;
nlohmann::json get_error_json(const nlohmann::json& req_body, long res_code, const std::string& res_body) override;
static std::string get_model_key(const nlohmann::json& model_config);
};
@ -68,7 +70,8 @@ class GoogleEmbedder : public RemoteEmbedder {
GoogleEmbedder(const std::string& google_api_key);
static Option<bool> is_model_valid(const nlohmann::json& model_config, size_t& num_dims);
embedding_res_t Embed(const std::string& text, const size_t remote_embedder_timeout_ms = 30000, const size_t remote_embedding_num_tries = 2) override;
std::vector<embedding_res_t> batch_embed(const std::vector<std::string>& inputs, const size_t remote_embedding_batch_size = 200) override;
std::vector<embedding_res_t> batch_embed(const std::vector<std::string>& inputs, const size_t remote_embedding_batch_size = 200,
const size_t remote_embedding_timeout_ms = 60000, const size_t remote_embedding_num_tries = 2) override;
nlohmann::json get_error_json(const nlohmann::json& req_body, long res_code, const std::string& res_body) override;
static std::string get_model_key(const nlohmann::json& model_config);
};
@ -96,7 +99,8 @@ class GCPEmbedder : public RemoteEmbedder {
const std::string& refresh_token, const std::string& client_id, const std::string& client_secret);
static Option<bool> is_model_valid(const nlohmann::json& model_config, size_t& num_dims);
embedding_res_t Embed(const std::string& text, const size_t remote_embedder_timeout_ms = 30000, const size_t remote_embedding_num_tries = 2) override;
std::vector<embedding_res_t> batch_embed(const std::vector<std::string>& inputs, const size_t remote_embedding_batch_size = 200) override;
std::vector<embedding_res_t> batch_embed(const std::vector<std::string>& inputs, const size_t remote_embedding_batch_size = 200,
const size_t remote_embedding_timeout_ms = 60000, const size_t remote_embedding_num_tries = 2) override;
nlohmann::json get_error_json(const nlohmann::json& req_body, long res_code, const std::string& res_body) override;
static std::string get_model_key(const nlohmann::json& model_config);
};
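The base class and all three concrete embedders (OpenAI, Google, GCP) gain the same two trailing defaulted parameters, `remote_embedding_timeout_ms = 60000` and `remote_embedding_num_tries = 2`, so existing call sites that pass only the inputs and batch size remain source-compatible.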

View File

@ -69,6 +69,8 @@ private:
uint32_t analytics_flush_interval;
uint32_t housekeeping_interval;
protected:
Config() {
@ -96,6 +98,7 @@ protected:
this->enable_search_analytics = false;
this->analytics_flush_interval = 3600; // in seconds
this->housekeeping_interval = 1800; // in seconds
}
Config(Config const&) {
@ -294,6 +297,10 @@ public:
return this->analytics_flush_interval;
}
size_t get_housekeeping_interval() const {
return this->housekeeping_interval;
}
size_t get_thread_pool_size() const {
return this->thread_pool_size;
}
@ -429,6 +436,10 @@ public:
this->analytics_flush_interval = std::stoi(get_env("TYPESENSE_ANALYTICS_FLUSH_INTERVAL"));
}
if(!get_env("TYPESENSE_HOUSEKEEPING_INTERVAL").empty()) {
this->housekeeping_interval = std::stoi(get_env("TYPESENSE_HOUSEKEEPING_INTERVAL"));
}
if(!get_env("TYPESENSE_THREAD_POOL_SIZE").empty()) {
this->thread_pool_size = std::stoi(get_env("TYPESENSE_THREAD_POOL_SIZE"));
}
@ -592,6 +603,10 @@ public:
this->analytics_flush_interval = (int) reader.GetInteger("server", "analytics-flush-interval", 3600);
}
if(reader.Exists("server", "housekeeping-interval")) {
this->housekeeping_interval = (int) reader.GetInteger("server", "housekeeping-interval", 1800);
}
if(reader.Exists("server", "thread-pool-size")) {
this->thread_pool_size = (int) reader.GetInteger("server", "thread-pool-size", 0);
}
@ -746,6 +761,10 @@ public:
this->analytics_flush_interval = options.get<uint32_t>("analytics-flush-interval");
}
if(options.exist("housekeeping-interval")) {
this->housekeeping_interval = options.get<uint32_t>("housekeeping-interval");
}
if(options.exist("thread-pool-size")) {
this->thread_pool_size = options.get<uint32_t>("thread-pool-size");
}
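Mirroring the three hunks above, the new interval can be set via the `TYPESENSE_HOUSEKEEPING_INTERVAL` environment variable, a `housekeeping-interval` key in the `server` section of the config file, or the `--housekeeping-interval` CLI flag added below in the server bootstrap; every path defaults to 1800 seconds.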

View File

@ -58,6 +58,7 @@ Collection::Collection(const std::string& name, const uint32_t collection_id, co
Collection::~Collection() {
std::unique_lock lock(mutex);
std::unique_lock repair_lock(index_repair_lock);
delete index;
delete synonym_index;
}
@ -391,7 +392,9 @@ Option<nlohmann::json> Collection::add(const std::string & json_str,
nlohmann::json Collection::add_many(std::vector<std::string>& json_lines, nlohmann::json& document,
const index_operation_t& operation, const std::string& id,
const DIRTY_VALUES& dirty_values, const bool& return_doc, const bool& return_id,
const size_t remote_embedding_batch_size) {
const size_t remote_embedding_batch_size,
const size_t remote_embedding_timeout_ms,
const size_t remote_embedding_num_tries) {
//LOG(INFO) << "Memory ratio. Max = " << max_memory_ratio << ", Used = " << SystemMetrics::used_memory_ratio();
std::vector<index_record> index_records;
@ -481,7 +484,7 @@ nlohmann::json Collection::add_many(std::vector<std::string>& json_lines, nlohma
if((i+1) % index_batch_size == 0 || i == json_lines.size()-1 || repeated_doc) {
batch_index(index_records, json_lines, num_indexed, return_doc, return_id, remote_embedding_batch_size);
batch_index(index_records, json_lines, num_indexed, return_doc, return_id, remote_embedding_batch_size, remote_embedding_timeout_ms, remote_embedding_num_tries);
// to return the document for the single doc add cases
if(index_records.size() == 1) {
@ -598,9 +601,10 @@ bool Collection::is_exceeding_memory_threshold() const {
}
void Collection::batch_index(std::vector<index_record>& index_records, std::vector<std::string>& json_out,
size_t &num_indexed, const bool& return_doc, const bool& return_id, const size_t remote_embedding_batch_size) {
size_t &num_indexed, const bool& return_doc, const bool& return_id, const size_t remote_embedding_batch_size,
const size_t remote_embedding_timeout_ms, const size_t remote_embedding_num_tries) {
batch_index_in_memory(index_records, remote_embedding_batch_size, true);
batch_index_in_memory(index_records, remote_embedding_batch_size, remote_embedding_timeout_ms, remote_embedding_num_tries, true);
// store only documents that were indexed in-memory successfully
for(auto& index_record: index_records) {
@ -704,11 +708,12 @@ Option<uint32_t> Collection::index_in_memory(nlohmann::json &document, uint32_t
return Option<>(200);
}
size_t Collection::batch_index_in_memory(std::vector<index_record>& index_records, const size_t remote_embedding_batch_size, const bool generate_embeddings) {
size_t Collection::batch_index_in_memory(std::vector<index_record>& index_records, const size_t remote_embedding_batch_size,
const size_t remote_embedding_timeout_ms, const size_t remote_embedding_num_tries, const bool generate_embeddings) {
std::unique_lock lock(mutex);
size_t num_indexed = Index::batch_memory_index(index, index_records, default_sorting_field,
search_schema, embedding_fields, fallback_field_type,
token_separators, symbols_to_index, true, remote_embedding_batch_size, generate_embeddings);
token_separators, symbols_to_index, true, remote_embedding_batch_size, remote_embedding_timeout_ms, remote_embedding_num_tries, generate_embeddings);
num_documents += num_indexed;
return num_indexed;
}
@ -4291,8 +4296,9 @@ Option<bool> Collection::batch_alter_data(const std::vector<field>& alter_fields
}
Index::batch_memory_index(index, iter_batch, default_sorting_field, search_schema, embedding_fields,
fallback_field_type, token_separators, symbols_to_index, true, 200,
fallback_field_type, token_separators, symbols_to_index, true, 200, 60000, 2,
found_embedding_field, true, schema_additions);
if(found_embedding_field) {
for(auto& index_record : iter_batch) {
if(index_record.indexed.ok()) {
@ -5380,7 +5386,7 @@ bool Collection::get_enable_nested_fields() {
Option<bool> Collection::parse_facet(const std::string& facet_field, std::vector<facet>& facets) const {
const std::regex base_pattern(".+\\(.*\\)");
const std::regex range_pattern("[[a-zA-Z]+:\\[([0-9]+)\\,\\s*([0-9]+)\\]");
const std::regex range_pattern("[[a-z A-Z]+:\\[([+-]?([0-9]*[.])?[0-9]*)\\,\\s*([+-]?([0-9]*[.])?[0-9]*)\\]");
const std::string _alpha = "_alpha";
if ((facet_field.find(":") != std::string::npos)
@ -5469,24 +5475,49 @@ Option<bool> Collection::parse_facet(const std::string& facet_field, std::vector
auto pos3 = range.find("]");
int64_t lower_range, upper_range;
auto lower_range_start = pos1 + 2;
auto lower_range_len = pos2 - lower_range_start;
auto upper_range_start = pos2 + 1;
auto upper_range_len = pos3 - upper_range_start;
if(a_field.is_integer()) {
std::string lower_range_str = range.substr(lower_range_start, lower_range_len);
auto start = pos1 + 2;
auto end = pos2 - start;
auto lower_range_str = range.substr(start, end);
StringUtils::trim(lower_range_str);
lower_range = std::stoll(lower_range_str);
std::string upper_range_str = range.substr(upper_range_start, upper_range_len);
StringUtils::trim(upper_range_str);
upper_range = std::stoll(upper_range_str);
} else {
float val = std::stof(range.substr(pos1 + 2, pos2));
lower_range = Index::float_to_int64_t(val);
if(lower_range_str.empty()) {
lower_range = INT64_MIN;
} else {
lower_range = std::stoll(lower_range_str);
}
val = std::stof(range.substr(pos2 + 1, pos3));
upper_range = Index::float_to_int64_t(val);
start = pos2 + 1;
end = pos3 - start;
auto upper_range_str = range.substr(start, end);
StringUtils::trim(upper_range_str);
if(upper_range_str.empty()) {
upper_range = INT64_MAX;
} else {
upper_range = std::stoll(upper_range_str);
}
} else {
auto start = pos1 + 2;
auto end = pos2 - start;
auto lower_range_str = range.substr(start, end);
StringUtils::trim(lower_range_str);
if(lower_range_str.empty()) {
lower_range = INT64_MIN;
} else {
float val = std::stof(lower_range_str);
lower_range = Index::float_to_int64_t(val);
}
start = pos2 + 1;
end = pos3 - start;
auto upper_range_str = range.substr(start, end);
StringUtils::trim(upper_range_str);
if(upper_range_str.empty()) {
upper_range = INT64_MAX;
} else {
float val = std::stof(upper_range_str);
upper_range = Index::float_to_int64_t(val);
}
}
tupVec.emplace_back(lower_range, upper_range, range_val);
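The net effect of the rewritten block: an empty bound now denotes an open-ended range (e.g. `small:[,55]` or `large:[55,]`, as the new tests below exercise), and float as well as negative bounds are accepted. A standalone sketch of the bound semantics for an integer facet field, assuming a trim helper like `StringUtils::trim` (in the real code, float bounds additionally pass through `Index::float_to_int64_t()` so they sort alongside encoded integers):

#include <cstdint>
#include <string>

int64_t parse_bound(std::string str, bool is_lower) {
    // stand-in for StringUtils::trim()
    const auto first = str.find_first_not_of(' ');
    str = (first == std::string::npos) ? "" : str.substr(first, str.find_last_not_of(' ') - first + 1);
    // an empty bound means the range is open on that side
    if(str.empty()) {
        return is_lower ? INT64_MIN : INT64_MAX;
    }
    return std::stoll(str);
}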
@ -5811,3 +5842,8 @@ void Collection::remove_embedding_field(const std::string& field_name) {
tsl::htrie_map<char, field> Collection::get_embedding_fields_unsafe() {
return embedding_fields;
}
void Collection::do_housekeeping() {
std::unique_lock lock(index_repair_lock);
index->repair_hnsw_index();
}

View File

@ -527,6 +527,17 @@ std::vector<Collection*> CollectionManager::get_collections() const {
return collection_vec;
}
std::vector<std::string> CollectionManager::get_collection_names() const {
std::shared_lock lock(mutex);
std::vector<std::string> collection_vec;
for(const auto& kv: collections) {
collection_vec.push_back(kv.first);
}
return collection_vec;
}
Option<nlohmann::json> CollectionManager::drop_collection(const std::string& collection_name, const bool remove_from_store) {
std::shared_lock s_lock(mutex);
auto collection = get_collection_unsafe(collection_name);
@ -1787,7 +1798,7 @@ Option<bool> CollectionManager::load_collection(const nlohmann::json &collection
// batch must match at least the number of shards
if(exceeds_batch_mem_threshold || (num_valid_docs % batch_size == 0) || last_record) {
size_t num_records = index_records.size();
size_t num_indexed = collection->batch_index_in_memory(index_records, 200, false);
size_t num_indexed = collection->batch_index_in_memory(index_records, 200, 60000, 2, false);
batch_doc_str_size = 0;
if(num_indexed != num_records) {

View File

@ -756,6 +756,8 @@ bool post_import_documents(const std::shared_ptr<http_req>& req, const std::shar
const char *RETURN_DOC = "return_doc";
const char *RETURN_ID = "return_id";
const char *REMOTE_EMBEDDING_BATCH_SIZE = "remote_embedding_batch_size";
const char *REMOTE_EMBEDDING_TIMEOUT_MS = "remote_embedding_timeout_ms";
const char *REMOTE_EMBEDDING_NUM_TRIES = "remote_embedding_num_tries";
if(req->params.count(BATCH_SIZE) == 0) {
req->params[BATCH_SIZE] = "40";
@ -810,8 +812,18 @@ bool post_import_documents(const std::shared_ptr<http_req>& req, const std::shar
return false;
}
if(req->params.count(REMOTE_EMBEDDING_TIMEOUT_MS) == 0) {
req->params[REMOTE_EMBEDDING_TIMEOUT_MS] = "60000";
}
if(req->params.count(REMOTE_EMBEDDING_NUM_TRIES) == 0) {
req->params[REMOTE_EMBEDDING_NUM_TRIES] = "2";
}
const size_t IMPORT_BATCH_SIZE = std::stoi(req->params[BATCH_SIZE]);
const size_t REMOTE_EMBEDDING_BATCH_SIZE_VAL = std::stoi(req->params[REMOTE_EMBEDDING_BATCH_SIZE]);
const size_t REMOTE_EMBEDDING_TIMEOUT_MS_VAL = std::stoi(req->params[REMOTE_EMBEDDING_TIMEOUT_MS]);
const size_t REMOTE_EMBEDDING_NUM_TRIES_VAL = std::stoi(req->params[REMOTE_EMBEDDING_NUM_TRIES]);
if(IMPORT_BATCH_SIZE == 0) {
res->final = true;
@ -827,6 +839,20 @@ bool post_import_documents(const std::shared_ptr<http_req>& req, const std::shar
return false;
}
if(REMOTE_EMBEDDING_TIMEOUT_MS_VAL == 0) {
res->final = true;
res->set_400("Parameter `" + std::string(REMOTE_EMBEDDING_TIMEOUT_MS) + "` must be a positive integer.");
stream_response(req, res);
return false;
}
if(REMOTE_EMBEDDING_NUM_TRIES_VAL == 0) {
res->final = true;
res->set_400("Parameter `" + std::string(REMOTE_EMBEDDING_NUM_TRIES) + "` must be a positive integer.");
stream_response(req, res);
return false;
}
if(req->body_index == 0) {
// will log for every major chunk of request body
//LOG(INFO) << "Import, req->body.size=" << req->body.size() << ", batch_size=" << IMPORT_BATCH_SIZE;
@ -896,7 +922,7 @@ bool post_import_documents(const std::shared_ptr<http_req>& req, const std::shar
const bool& return_doc = req->params[RETURN_DOC] == "true";
const bool& return_id = req->params[RETURN_ID] == "true";
nlohmann::json json_res = collection->add_many(json_lines, document, operation, "",
dirty_values, return_doc, return_id, REMOTE_EMBEDDING_BATCH_SIZE_VAL);
dirty_values, return_doc, return_id, REMOTE_EMBEDDING_BATCH_SIZE_VAL, REMOTE_EMBEDDING_TIMEOUT_MS_VAL, REMOTE_EMBEDDING_NUM_TRIES_VAL);
//const std::string& import_summary_json = json_res->dump();
//response_stream << import_summary_json << "\n";
@ -940,6 +966,7 @@ bool post_add_document(const std::shared_ptr<http_req>& req, const std::shared_p
req->params[DIRTY_VALUES_PARAM] = ""; // set it empty as default will depend on whether schema is enabled
}
CollectionManager & collectionManager = CollectionManager::get_instance();
auto collection = collectionManager.get_collection(req->params["collection"]);
@ -951,10 +978,22 @@ bool post_add_document(const std::shared_ptr<http_req>& req, const std::shared_p
const index_operation_t operation = get_index_operation(req->params[ACTION]);
const auto& dirty_values = collection->parse_dirty_values_option(req->params[DIRTY_VALUES_PARAM]);
size_t remote_embedding_timeout_ms = 60000;
size_t remote_embedding_num_tries = 2;
if(req->params.count("remote_embedding_timeout_ms") != 0) {
remote_embedding_timeout_ms = std::stoul(req->params["remote_embedding_timeout_ms"]);
}
if(req->params.count("remote_embedding_num_tries") != 0) {
remote_embedding_num_tries = std::stoul(req->params["remote_embedding_num_tries"]);
}
nlohmann::json document;
std::vector<std::string> json_lines = {req->body};
const nlohmann::json& inserted_doc_op = collection->add_many(json_lines, document, operation, "", dirty_values,
false, false);
false, false, 200, remote_embedding_timeout_ms,
remote_embedding_num_tries);
if(!inserted_doc_op["success"].get<bool>()) {
nlohmann::json res_doc;

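Taken together, both the bulk import and single-document add endpoints now accept the new knobs as query parameters, e.g. `?remote_embedding_timeout_ms=30000&remote_embedding_num_tries=3` (values illustrative). The import path rejects zero values with a 400, while the single-add path simply falls back to the 60000 ms / 2 tries defaults when the parameters are absent.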
View File

@ -206,6 +206,7 @@ Index::~Index() {
delete seq_ids;
for(auto& vec_index_kv: vector_index) {
std::unique_lock lock(vec_index_kv.second->repair_m);
delete vec_index_kv.second;
}
@ -432,7 +433,8 @@ void Index::validate_and_preprocess(Index *index,
const std::string& fallback_field_type,
const std::vector<char>& token_separators,
const std::vector<char>& symbols_to_index,
const bool do_validation, const size_t remote_embedding_batch_size, const bool generate_embeddings) {
const bool do_validation, const size_t remote_embedding_batch_size,
const size_t remote_embedding_timeout_ms, const size_t remote_embedding_num_tries, const bool generate_embeddings) {
// runs in a partitioned thread
std::vector<index_record*> records_to_embed;
@ -523,7 +525,7 @@ void Index::validate_and_preprocess(Index *index,
}
if(generate_embeddings) {
batch_embed_fields(records_to_embed, embedding_fields, search_schema, remote_embedding_batch_size);
batch_embed_fields(records_to_embed, embedding_fields, search_schema, remote_embedding_batch_size, remote_embedding_timeout_ms, remote_embedding_num_tries);
}
}
@ -535,7 +537,8 @@ size_t Index::batch_memory_index(Index *index,
const std::string& fallback_field_type,
const std::vector<char>& token_separators,
const std::vector<char>& symbols_to_index,
const bool do_validation, const size_t remote_embedding_batch_size, const bool generate_embeddings,
const bool do_validation, const size_t remote_embedding_batch_size,
const size_t remote_embedding_timeout_ms, const size_t remote_embedding_num_tries, const bool generate_embeddings,
const bool use_addition_fields, const tsl::htrie_map<char, field>& addition_fields) {
const size_t concurrency = 4;
@ -568,7 +571,7 @@ size_t Index::batch_memory_index(Index *index,
index->thread_pool->enqueue([&, batch_index, batch_len]() {
write_log_index = local_write_log_index;
validate_and_preprocess(index, iter_batch, batch_index, batch_len, default_sorting_field, actual_search_schema,
embedding_fields, fallback_field_type, token_separators, symbols_to_index, do_validation, remote_embedding_batch_size, generate_embeddings);
embedding_fields, fallback_field_type, token_separators, symbols_to_index, do_validation, remote_embedding_batch_size, remote_embedding_timeout_ms, remote_embedding_num_tries, generate_embeddings);
std::unique_lock<std::mutex> lock(m_process);
num_processed++;
@ -6660,6 +6663,7 @@ void Index::refresh_schemas(const std::vector<field>& new_fields, const std::vec
if(del_field.num_dim) {
auto hnsw_index = vector_index[del_field.name];
std::unique_lock lock(hnsw_index->repair_m);
delete hnsw_index;
vector_index.erase(del_field.name);
}
@ -6951,8 +6955,9 @@ bool Index::common_results_exist(std::vector<art_leaf*>& leaves, bool must_match
void Index::batch_embed_fields(std::vector<index_record*>& records,
const tsl::htrie_map<char, field>& embedding_fields,
const tsl::htrie_map<char, field> & search_schema, const size_t remote_embedding_batch_size) {
const tsl::htrie_map<char, field>& embedding_fields,
const tsl::htrie_map<char, field> & search_schema, const size_t remote_embedding_batch_size,
const size_t remote_embedding_timeout_ms, const size_t remote_embedding_num_tries) {
for(const auto& field : embedding_fields) {
std::vector<std::pair<index_record*, std::string>> texts_to_embed;
auto indexing_prefix = TextEmbedderManager::get_instance().get_indexing_prefix(field.embed[fields::model_config]);
@ -7023,7 +7028,8 @@ void Index::batch_embed_fields(std::vector<index_record*>& records,
texts.push_back(text_to_embed.second);
}
auto embeddings = embedder_op.get()->batch_embed(texts, remote_embedding_batch_size);
auto embeddings = embedder_op.get()->batch_embed(texts, remote_embedding_batch_size, remote_embedding_timeout_ms,
remote_embedding_num_tries);
for(size_t i = 0; i < embeddings.size(); i++) {
auto& embedding_res = embeddings[i];
@ -7032,13 +7038,35 @@ void Index::batch_embed_fields(std::vector<index_record*>& records,
texts_to_embed[i].first->index_failure(embedding_res.status_code, "");
continue;
}
nlohmann::json* document;
if(texts_to_embed[i].first->is_update) {
document = &texts_to_embed[i].first->new_doc;
} else {
document = &texts_to_embed[i].first->doc;
}
(*document)[field.name] = embedding_res.embedding;
texts_to_embed[i].first->new_doc[field.name] = embedding_res.embedding;
}
texts_to_embed[i].first->doc[field.name] = embedding_res.embedding;
}
}
}
void Index::repair_hnsw_index() {
std::vector<std::string> vector_fields;
// this lock ensures that the `vector_index` map is not mutated during read
std::shared_lock read_lock(mutex);
for(auto& vec_kv: vector_index) {
vector_fields.push_back(vec_kv.first);
}
read_lock.unlock();
for(const auto& vector_field: vector_fields) {
read_lock.lock();
if(vector_index.count(vector_field) != 0) {
// this lock ensures that the vector index is not dropped during repair
std::unique_lock lock(vector_index[vector_field]->repair_m);
read_lock.unlock(); // release this lock since repair is a long-running operation
vector_index[vector_field]->vecdex->repair_zero_indegree();
} else {
read_lock.unlock();
}
}
}
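Note the lock choreography in `repair_hnsw_index()`: the collection-wide shared mutex is held only to snapshot the field names and to re-check that each index still exists, and it is released before `repair_zero_indegree()` runs under the per-index `repair_m`, so searches are not blocked for the duration of a repair. The matching `unique_lock`s on `repair_m` in `~Index()` and `refresh_schemas()` above are what make it safe to drop an index that might otherwise be mid-repair.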

View File

@ -152,7 +152,8 @@ embedding_res_t TextEmbedder::Embed(const std::string& text, const size_t remote
}
}
std::vector<embedding_res_t> TextEmbedder::batch_embed(const std::vector<std::string>& inputs, const size_t remote_embedding_batch_size) {
std::vector<embedding_res_t> TextEmbedder::batch_embed(const std::vector<std::string>& inputs, const size_t remote_embedding_batch_size,
const size_t remote_embedding_timeout_ms, const size_t remote_embedding_num_tries) {
std::vector<embedding_res_t> outputs;
if(!is_remote()) {
std::lock_guard<std::mutex> lock(mutex_);
@ -235,7 +236,7 @@ std::vector<embedding_res_t> TextEmbedder::batch_embed(const std::vector<std::st
}
}
} else {
outputs = std::move(remote_embedder_->batch_embed(inputs, remote_embedding_batch_size));
outputs = std::move(remote_embedder_->batch_embed(inputs, remote_embedding_batch_size, remote_embedding_timeout_ms, remote_embedding_num_tries));
}
return outputs;

View File

@ -188,13 +188,14 @@ embedding_res_t OpenAIEmbedder::Embed(const std::string& text, const size_t remo
}
}
std::vector<embedding_res_t> OpenAIEmbedder::batch_embed(const std::vector<std::string>& inputs, const size_t remote_embedding_batch_size) {
std::vector<embedding_res_t> OpenAIEmbedder::batch_embed(const std::vector<std::string>& inputs, const size_t remote_embedding_batch_size,
const size_t remote_embedding_timeout_ms, const size_t remote_embedding_num_tries) {
// call recursively if inputs are larger than remote_embedding_batch_size
if(inputs.size() > remote_embedding_batch_size) {
std::vector<embedding_res_t> outputs;
for(size_t i = 0; i < inputs.size(); i += remote_embedding_batch_size) {
auto batch = std::vector<std::string>(inputs.begin() + i, inputs.begin() + std::min(i + remote_embedding_batch_size, inputs.size()));
auto batch_outputs = batch_embed(batch, remote_embedding_batch_size);
auto batch_outputs = batch_embed(batch, remote_embedding_batch_size, remote_embedding_timeout_ms, remote_embedding_num_tries);
outputs.insert(outputs.end(), batch_outputs.begin(), batch_outputs.end());
}
return outputs;
@ -206,6 +207,8 @@ std::vector<embedding_res_t> OpenAIEmbedder::batch_embed(const std::vector<std::
std::unordered_map<std::string, std::string> headers;
headers["Authorization"] = "Bearer " + api_key;
headers["Content-Type"] = "application/json";
headers["timeout_ms"] = std::to_string(remote_embedding_timeout_ms);
headers["num_try"] = std::to_string(remote_embedding_num_tries);
std::map<std::string, std::string> res_headers;
std::string res;
auto res_code = call_remote_api("POST", OPENAI_CREATE_EMBEDDING, req_body.dump(), res, res_headers, headers);
@ -370,10 +373,22 @@ embedding_res_t GoogleEmbedder::Embed(const std::string& text, const size_t remo
}
std::vector<embedding_res_t> GoogleEmbedder::batch_embed(const std::vector<std::string>& inputs, const size_t remote_embedding_batch_size) {
std::vector<embedding_res_t> GoogleEmbedder::batch_embed(const std::vector<std::string>& inputs, const size_t remote_embedding_batch_size,
const size_t remote_embedding_timeout_ms, const size_t remote_embedding_num_tries) {
std::vector<embedding_res_t> outputs;
bool timeout_prev = false;
for(auto& input : inputs) {
auto res = Embed(input);
auto res = Embed(input, remote_embedding_timeout_ms, remote_embedding_num_tries);
if(res.status_code == 408) {
if(timeout_prev) {
// fail the whole batch on two consecutive timeouts
nlohmann::json req_body;
req_body["text"] = input;
return std::vector<embedding_res_t>(inputs.size(), embedding_res_t(408, get_error_json(req_body, 408, "")));
}
timeout_prev = true;
} else {
// reset only on a non-timeout response, so two timeouts in a row are detected
timeout_prev = false;
}
outputs.push_back(res);
}
@ -529,7 +544,8 @@ embedding_res_t GCPEmbedder::Embed(const std::string& text, const size_t remote_
}
std::vector<embedding_res_t> GCPEmbedder::batch_embed(const std::vector<std::string>& inputs, const size_t remote_embedding_batch_size) {
std::vector<embedding_res_t> GCPEmbedder::batch_embed(const std::vector<std::string>& inputs, const size_t remote_embedding_batch_size,
const size_t remote_embedding_timeout_ms, const size_t remote_embedding_num_tries) {
// GCP API has a limit of 5 instances per request
if(inputs.size() > 5) {
std::vector<embedding_res_t> res;
@ -549,6 +565,8 @@ std::vector<embedding_res_t> GCPEmbedder::batch_embed(const std::vector<std::str
std::unordered_map<std::string, std::string> headers;
headers["Authorization"] = "Bearer " + access_token;
headers["Content-Type"] = "application/json";
headers["timeout_ms"] = std::to_string(remote_embedding_timeout_ms);
headers["num_try"] = std::to_string(remote_embedding_num_tries);
std::map<std::string, std::string> res_headers;
std::string res;
auto res_code = call_remote_api("POST", get_gcp_embedding_url(project_id, model_name), req_body.dump(), res, res_headers, headers);

View File

@ -13,7 +13,8 @@
#include <arpa/inet.h>
#include <sys/socket.h>
#include <ifaddrs.h>
#include <analytics_manager.h>
#include "analytics_manager.h"
#include "housekeeper.h"
#include "core_api.h"
#include "ratelimit_manager.h"
@ -104,6 +105,7 @@ void init_cmdline_options(cmdline::parser & options, int argc, char **argv) {
options.add<int>("log-slow-searches-time-ms", '\0', "When >= 0, searches that take longer than this duration are logged.", false, 30*1000);
options.add<int>("cache-num-entries", '\0', "Number of entries to cache.", false, 1000);
options.add<uint32_t>("analytics-flush-interval", '\0', "Frequency of persisting analytics data to disk (in seconds).", false, 3600);
options.add<uint32_t>("housekeeping-interval", '\0', "Frequency of housekeeping background job (in seconds).", false, 1800);
// DEPRECATED
options.add<std::string>("listen-address", 'h', "[DEPRECATED: use `api-address`] Address to which Typesense API service binds.", false, "0.0.0.0");
@ -457,6 +459,11 @@ int run_server(const Config & config, const std::string & version, void (*master
AnalyticsManager::get_instance().run(&replication_state);
});
HouseKeeper::get_instance().init(config.get_housekeeping_interval());
std::thread housekeeping_thread([]() {
HouseKeeper::get_instance().run();
});
RemoteEmbedder::init(&replication_state);
std::string path_to_nodes = config.get_nodes();
@ -481,6 +488,10 @@ int run_server(const Config & config, const std::string & version, void (*master
LOG(INFO) << "Waiting for event sink thread to be done...";
event_sink_thread.join();
LOG(INFO) << "Waiting for housekeeping thread to be done...";
HouseKeeper::get_instance().stop();
housekeeping_thread.join();
LOG(INFO) << "Shutting down server_thread_pool";
server_thread_pool.shutdown();

View File

@ -1229,6 +1229,7 @@ TEST_F(CollectionFacetingTest, FacetParseTest){
field("grade", field_types::INT32, true),
field("rank", field_types::INT32, true),
field("range", field_types::INT32, true),
field("review", field_types::FLOAT, true),
field("scale", field_types::INT32, false),
};
@ -1326,6 +1327,34 @@ TEST_F(CollectionFacetingTest, FacetParseTest){
ASSERT_EQ("rank", mixed_facets_ptr[2]->field_name);
ASSERT_EQ("range", mixed_facets_ptr[1]->field_name);
std::vector<std::string> range_facet_float_fields {
"review(bad:[0, 2.5], good:[2.5, 5])"
};
std::vector<facet> float_facets;
for(const std::string & facet_field: range_facet_float_fields) {
auto res = coll1->parse_facet(facet_field, float_facets);
if(!res.error().empty()) {
LOG(ERROR) << res.error();
FAIL();
}
}
std::vector<std::string> range_facet_negative_range {
"review(bad:[-2.5, 2.5], good:[2.5, 5])"
};
std::vector<facet> negative_range;
for(const std::string & facet_field: range_facet_negative_range) {
auto res = coll1->parse_facet(facet_field, negative_range);
if(!res.error().empty()) {
LOG(ERROR) << res.error();
FAIL();
}
}
}
TEST_F(CollectionFacetingTest, RangeFacetTest) {
@ -1667,6 +1696,204 @@ TEST_F(CollectionFacetingTest, RangeFacetTypo) {
collectionManager.drop_collection("coll1");
}
TEST_F(CollectionFacetingTest, RangeFacetsFloatRange) {
std::vector<field> fields = {field("name", field_types::STRING, false),
field("inches", field_types::FLOAT, true),};
Collection* coll1 = collectionManager.create_collection(
"coll1", 1, fields, "", 0, "", {}, {}).get();
nlohmann::json doc;
doc["id"] = "0";
doc["name"] = "TV 1";
doc["inches"] = 32.4;
ASSERT_TRUE(coll1->add(doc.dump()).ok());
doc["id"] = "1";
doc["name"] = "TV 2";
doc["inches"] = 55;
ASSERT_TRUE(coll1->add(doc.dump()).ok());
doc["id"] = "2";
doc["name"] = "TV 3";
doc["inches"] = 55.6;
ASSERT_TRUE(coll1->add(doc.dump()).ok());
auto results = coll1->search("*", {},
"", {"inches(small:[0, 55.5])"},
{}, {2}, 10,
1, FREQUENCY, {true},
10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 10, {}, {}, {}, 0,
"<mark>", "</mark>", {}, 1000,
true, false, true, "", true).get();
ASSERT_EQ(1, results["facet_counts"][0]["counts"].size());
ASSERT_EQ(2, (int) results["facet_counts"][0]["counts"][0]["count"]);
ASSERT_EQ("small", results["facet_counts"][0]["counts"][0]["value"]);
}
TEST_F(CollectionFacetingTest, RangeFacetsMinMaxRange) {
std::vector<field> fields = {field("name", field_types::STRING, false),
field("inches", field_types::FLOAT, true),};
Collection* coll1 = collectionManager.create_collection(
"coll1", 1, fields, "", 0, "", {}, {}).get();
nlohmann::json doc;
doc["id"] = "0";
doc["name"] = "TV 1";
doc["inches"] = 32.4;
ASSERT_TRUE(coll1->add(doc.dump()).ok());
doc["id"] = "1";
doc["name"] = "TV 2";
doc["inches"] = 55;
ASSERT_TRUE(coll1->add(doc.dump()).ok());
doc["id"] = "2";
doc["name"] = "TV 3";
doc["inches"] = 55.6;
ASSERT_TRUE(coll1->add(doc.dump()).ok());
auto results = coll1->search("*", {},
"", {"inches(small:[0, 55], large:[55, ])"},
{}, {2}, 10,
1, FREQUENCY, {true},
10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 10, {}, {}, {}, 0,
"<mark>", "</mark>", {}, 1000,
true, false, true, "", true).get();
ASSERT_EQ(2, results["facet_counts"][0]["counts"].size());
ASSERT_EQ(2, (int) results["facet_counts"][0]["counts"][0]["count"]);
ASSERT_EQ("small", results["facet_counts"][0]["counts"][0]["value"]);
ASSERT_EQ(1, (int) results["facet_counts"][0]["counts"][1]["count"]);
ASSERT_EQ("large", results["facet_counts"][0]["counts"][1]["value"]);
results = coll1->search("*", {},
"", {"inches(small:[,55])"},
{}, {2}, 10,
1, FREQUENCY, {true},
10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 10, {}, {}, {}, 0,
"<mark>", "</mark>", {}, 1000,
true, false, true, "", true).get();
ASSERT_EQ(1, results["facet_counts"][0]["counts"].size());
ASSERT_EQ(2, (int) results["facet_counts"][0]["counts"][0]["count"]);
ASSERT_EQ("small", results["facet_counts"][0]["counts"][0]["value"]);
}
TEST_F(CollectionFacetingTest, RangeFacetRangeLabelWithSpace) {
std::vector<field> fields = {field("name", field_types::STRING, false),
field("inches", field_types::FLOAT, true),};
Collection* coll1 = collectionManager.create_collection(
"coll1", 1, fields, "", 0, "", {}, {}).get();
nlohmann::json doc;
doc["id"] = "0";
doc["name"] = "TV 1";
doc["inches"] = 32.4;
ASSERT_TRUE(coll1->add(doc.dump()).ok());
doc["id"] = "1";
doc["name"] = "TV 2";
doc["inches"] = 55;
ASSERT_TRUE(coll1->add(doc.dump()).ok());
doc["id"] = "2";
doc["name"] = "TV 3";
doc["inches"] = 55.6;
ASSERT_TRUE(coll1->add(doc.dump()).ok());
auto results = coll1->search("*", {},
"", {"inches(small tvs with display size:[0,55])"},
{}, {2}, 10,
1, FREQUENCY, {true},
10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 10, {}, {}, {}, 0,
"<mark>", "</mark>", {}, 1000,
true, false, true, "", true).get();
ASSERT_EQ(1, results["facet_counts"][0]["counts"].size());
ASSERT_EQ(2, (int) results["facet_counts"][0]["counts"][0]["count"]);
ASSERT_EQ("small tvs with display size", results["facet_counts"][0]["counts"][0]["value"]);
}
TEST_F(CollectionFacetingTest, RangeFacetRangeNegativeRanges) {
std::vector<field> fields = {field("team", field_types::STRING, false),
field("nrr", field_types::FLOAT, true),};
Collection* coll1 = collectionManager.create_collection(
"coll1", 1, fields, "", 0, "",
{},{}).get();
nlohmann::json doc;
doc["id"] = "0";
doc["team"] = "india";
doc["nrr"] = 1.353;
ASSERT_TRUE(coll1->add(doc.dump()).ok());
doc["id"] = "1";
doc["team"] = "australia";
doc["nrr"] = -0.193;
ASSERT_TRUE(coll1->add(doc.dump()).ok());
doc["id"] = "2";
doc["team"] = "pakistan";
doc["nrr"] = -0.400;
ASSERT_TRUE(coll1->add(doc.dump()).ok());
doc["id"] = "3";
doc["team"] = "afghanistan";
doc["nrr"] = -0.969;
ASSERT_TRUE(coll1->add(doc.dump()).ok());
doc["id"] = "4";
doc["team"] = "srilanka";
doc["nrr"] = -1.048;
ASSERT_TRUE(coll1->add(doc.dump()).ok());
doc["id"] = "5";
doc["team"] = "england";
doc["nrr"] = -1.248;
ASSERT_TRUE(coll1->add(doc.dump()).ok());
doc["id"] = "6";
doc["team"] = "bangladesh";
doc["nrr"] = -1.253;
ASSERT_TRUE(coll1->add(doc.dump()).ok());
doc["id"] = "7";
doc["team"] = "new zealand";
doc["nrr"] = 1.481;
ASSERT_TRUE(coll1->add(doc.dump()).ok());
auto results = coll1->search("*", {},
"", {"nrr(poor:[-1.5,-1], decent:[-1,0], good:[0,2])"},
{}, {2}, 10,
1, FREQUENCY, {true},
10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 10, {}, {}, {}, 0,
"<mark>", "</mark>", {}, 1000,
true, false, true, "", true).get();
ASSERT_EQ(3, results["facet_counts"][0]["counts"].size());
ASSERT_EQ(3, (int) results["facet_counts"][0]["counts"][0]["count"]);
ASSERT_EQ("poor", results["facet_counts"][0]["counts"][0]["value"]);
ASSERT_EQ(3, (int) results["facet_counts"][0]["counts"][1]["count"]);
ASSERT_EQ("decent", results["facet_counts"][0]["counts"][1]["value"]);
ASSERT_EQ(2, (int) results["facet_counts"][0]["counts"][2]["count"]);
ASSERT_EQ("good", results["facet_counts"][0]["counts"][2]["value"]);
}
TEST_F(CollectionFacetingTest, SampleFacetCounts) {
nlohmann::json schema = R"({
"name": "coll1",

View File

@ -2747,3 +2747,95 @@ TEST_F(CollectionVectorTest, TestSearchNonIndexedVectorField) {
ASSERT_FALSE(search_result.ok());
ASSERT_EQ("Field `vec` is marked as a non-indexed field in the schema.", search_result.error());
}
TEST_F(CollectionVectorTest, TestSemanticSearchAfterUpdate) {
nlohmann::json schema = R"({
"name": "test",
"fields": [
{
"name": "name",
"type": "string"
},
{
"name": "embedding",
"type": "float[]",
"embed": {
"from": [
"name"
],
"model_config": {
"model_name": "ts/e5-small"
}
}
}
]
})"_json;
TextEmbedderManager::set_model_dir("/tmp/typesense_test/models");
auto collection_create_op = collectionManager.create_collection(schema);
ASSERT_TRUE(collection_create_op.ok());
auto coll = collection_create_op.get();
auto add_op = coll->add(R"({
"name": "soccer",
"id": "0"
})"_json.dump());
ASSERT_TRUE(add_op.ok());
add_op = coll->add(R"({
"name": "basketball",
"id": "1"
})"_json.dump());
ASSERT_TRUE(add_op.ok());
add_op = coll->add(R"({
"name": "typesense",
"id": "2"
})"_json.dump());
ASSERT_TRUE(add_op.ok());
add_op = coll->add(R"({
"name": "potato",
"id": "3"
})"_json.dump());
ASSERT_TRUE(add_op.ok());
auto result = coll->search("*", {}, "", {}, {}, {0}, 20, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
"", 10, {}, {}, {}, 0,
"<mark>", "</mark>", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7,
fallback,
4, {off}, 32767, 32767, 2,
false, true, "embedding:([], id:0, k:1)");
ASSERT_TRUE(result.ok());
ASSERT_EQ(1, result.get()["hits"].size());
ASSERT_EQ("basketball", result.get()["hits"][0]["document"]["name"]);
auto update_op = coll->add(R"({
"name": "onion",
"id": "0"
})"_json.dump(), index_operation_t::UPDATE, "0");
ASSERT_TRUE(update_op.ok());
result = coll->search("*", {}, "", {}, {}, {0}, 20, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
"", 10, {}, {}, {}, 0,
"<mark>", "</mark>", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7,
fallback,
4, {off}, 32767, 32767, 2,
false, true, "embedding:([], id:0, k:1)");
ASSERT_TRUE(result.ok());
ASSERT_EQ(1, result.get()["hits"].size());
ASSERT_EQ("potato", result.get()["hits"][0]["document"]["name"]);
}
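This test pins down the `new_doc`-vs-`doc` handling changed in `Index::batch_embed_fields` above: after document 0 is updated from "soccer" to "onion", the regenerated embedding is written into the updated document, so the nearest neighbour of id 0 (k=1, evidently excluding the query document itself) flips from "basketball" to "potato".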