mirror of
https://github.com/typesense/typesense.git
synced 2025-05-18 12:42:50 +08:00
Merge branch 'v0.25-join' into v0.26-facets
# Conflicts: # include/index.h # src/collection.cpp # test/collection_vector_search_test.cpp
This commit is contained in:
commit
ae597f40ba
@ -179,7 +179,7 @@ new_git_repository(
|
||||
new_git_repository(
|
||||
name = "hnsw",
|
||||
build_file = "//bazel:hnsw.BUILD",
|
||||
commit = "5aba40d4b10dd77aece2ab9a1b3fdf06e433466a",
|
||||
commit = "5100d3fe41da45601875b3f395f508398cb12b8a",
|
||||
remote = "https://github.com/typesense/hnswlib.git",
|
||||
)
|
||||
|
||||
|
@ -55,6 +55,8 @@ private:
|
||||
|
||||
mutable std::shared_mutex mutex;
|
||||
|
||||
mutable std::shared_mutex index_repair_lock;
|
||||
|
||||
const uint8_t CURATED_RECORD_IDENTIFIER = 100;
|
||||
|
||||
const size_t DEFAULT_TOPSTER_SIZE = 250;
|
||||
@ -428,7 +430,8 @@ public:
|
||||
const std::vector<sort_by>& sort_by_fields);
|
||||
|
||||
void batch_index(std::vector<index_record>& index_records, std::vector<std::string>& json_out, size_t &num_indexed,
|
||||
const bool& return_doc, const bool& return_id, const size_t remote_embedding_batch_size = 200);
|
||||
const bool& return_doc, const bool& return_id, const size_t remote_embedding_batch_size = 200,
|
||||
const size_t remote_embedding_timeout_ms = 60000, const size_t remote_embedding_num_tries = 2);
|
||||
|
||||
bool is_exceeding_memory_threshold() const;
|
||||
|
||||
@ -442,7 +445,7 @@ public:
|
||||
nlohmann::json get_summary_json() const;
|
||||
|
||||
size_t batch_index_in_memory(std::vector<index_record>& index_records, const size_t remote_embedding_batch_size,
|
||||
const bool generate_embeddings);
|
||||
const size_t remote_embedding_timeout_ms, const size_t remote_embedding_num_tries, const bool generate_embeddings);
|
||||
|
||||
Option<nlohmann::json> add(const std::string & json_str,
|
||||
const index_operation_t& operation=CREATE, const std::string& id="",
|
||||
@ -452,7 +455,9 @@ public:
|
||||
const index_operation_t& operation=CREATE, const std::string& id="",
|
||||
const DIRTY_VALUES& dirty_values=DIRTY_VALUES::COERCE_OR_REJECT,
|
||||
const bool& return_doc=false, const bool& return_id=false,
|
||||
const size_t remote_embedding_batch_size=200);
|
||||
const size_t remote_embedding_batch_size=200,
|
||||
const size_t remote_embedding_timeout_ms=60000,
|
||||
const size_t remote_embedding_num_tries=2);
|
||||
|
||||
Option<nlohmann::json> update_matching_filter(const std::string& filter_query,
|
||||
const std::string & json_str,
|
||||
@ -464,6 +469,8 @@ public:
|
||||
tsl::htrie_set<char>& include_fields_full,
|
||||
tsl::htrie_set<char>& exclude_fields_full) const;
|
||||
|
||||
void do_housekeeping();
|
||||
|
||||
Option<nlohmann::json> search(std::string query, const std::vector<std::string> & search_fields,
|
||||
const std::string & filter_query, const std::vector<std::string> & facet_fields,
|
||||
const std::vector<sort_by> & sort_fields, const std::vector<uint32_t>& num_typos,
|
||||
|
@ -129,6 +129,8 @@ public:
|
||||
|
||||
std::vector<Collection*> get_collections() const;
|
||||
|
||||
std::vector<std::string> get_collection_names() const;
|
||||
|
||||
Collection* get_collection_unsafe(const std::string & collection_name) const;
|
||||
|
||||
// PUBLICLY EXPOSED API
|
||||
|
@ -296,6 +296,9 @@ struct hnsw_index_t {
|
||||
size_t num_dim;
|
||||
vector_distance_type_t distance_type;
|
||||
|
||||
// ensures that this index is not dropped when it's being repaired
|
||||
std::mutex repair_m;
|
||||
|
||||
hnsw_index_t(size_t num_dim, size_t init_size, vector_distance_type_t distance_type):
|
||||
space(new hnswlib::InnerProductSpace(num_dim)),
|
||||
vecdex(new hnswlib::HierarchicalNSW<float>(space, init_size, 16, 200, 100, true)),
|
||||
@ -561,13 +564,14 @@ private:
|
||||
const std::string& token, uint32_t seq_id);
|
||||
|
||||
void initialize_facet_indexes(const field& facet_field);
|
||||
|
||||
static void batch_embed_fields(std::vector<index_record*>& documents,
|
||||
const tsl::htrie_map<char, field>& embedding_fields,
|
||||
const tsl::htrie_map<char, field> & search_schema, const size_t remote_embedding_batch_size = 200);
|
||||
|
||||
std::vector<group_by_field_it_t> get_group_by_field_iterators(const std::vector<std::string>&, bool is_reverse=false) const;
|
||||
|
||||
static void batch_embed_fields(std::vector<index_record*>& documents,
|
||||
const tsl::htrie_map<char, field>& embedding_fields,
|
||||
const tsl::htrie_map<char, field> & search_schema, const size_t remote_embedding_batch_size = 200,
|
||||
const size_t remote_embedding_timeout_ms = 60000, const size_t remote_embedding_num_tries = 2);
|
||||
|
||||
public:
|
||||
// for limiting number of results on multiple candidates / query rewrites
|
||||
enum {TYPO_TOKENS_THRESHOLD = 1};
|
||||
@ -713,7 +717,8 @@ public:
|
||||
const std::string& fallback_field_type,
|
||||
const std::vector<char>& token_separators,
|
||||
const std::vector<char>& symbols_to_index,
|
||||
const bool do_validation, const size_t remote_embedding_batch_size = 200, const bool generate_embeddings = true);
|
||||
const bool do_validation, const size_t remote_embedding_batch_size = 200,
|
||||
const size_t remote_embedding_timeout_ms = 60000, const size_t remote_embedding_num_tries = 2, const bool generate_embeddings = true);
|
||||
|
||||
static size_t batch_memory_index(Index *index,
|
||||
std::vector<index_record>& iter_batch,
|
||||
@ -724,9 +729,10 @@ public:
|
||||
const std::vector<char>& token_separators,
|
||||
const std::vector<char>& symbols_to_index,
|
||||
const bool do_validation, const size_t remote_embedding_batch_size = 200,
|
||||
const bool generate_embeddings = true,
|
||||
const size_t remote_embedding_timeout_ms = 60000,
|
||||
const size_t remote_embedding_num_tries = 2, const bool generate_embeddings = true,
|
||||
const bool use_addition_fields = false,
|
||||
const tsl::htrie_map<char, field>& addition_fields = {});
|
||||
const tsl::htrie_map<char, field>& addition_fields = tsl::htrie_map<char, field>());
|
||||
|
||||
void index_field_in_memory(const field& afield, std::vector<index_record>& iter_batch);
|
||||
|
||||
@ -1017,6 +1023,8 @@ public:
|
||||
const uint32_t& seq_id) const;
|
||||
|
||||
friend class filter_result_iterator_t;
|
||||
|
||||
void repair_hnsw_index();
|
||||
};
|
||||
|
||||
template<class T>
|
||||
|
@ -17,7 +17,8 @@ class TextEmbedder {
|
||||
TextEmbedder(const nlohmann::json& model_config, size_t num_dims);
|
||||
~TextEmbedder();
|
||||
embedding_res_t Embed(const std::string& text, const size_t remote_embedder_timeout_ms = 30000, const size_t remote_embedding_num_tries = 2);
|
||||
std::vector<embedding_res_t> batch_embed(const std::vector<std::string>& inputs, const size_t remote_embedding_batch_size = 200);
|
||||
std::vector<embedding_res_t> batch_embed(const std::vector<std::string>& inputs, const size_t remote_embedding_batch_size = 200,
|
||||
const size_t remote_embedding_timeout_ms = 60000, const size_t remote_embedding_num_tries = 2);
|
||||
const std::string& get_vocab_file_name() const;
|
||||
const size_t get_num_dim() const;
|
||||
bool is_remote() {
|
||||
|
@ -30,7 +30,8 @@ class RemoteEmbedder {
|
||||
public:
|
||||
virtual nlohmann::json get_error_json(const nlohmann::json& req_body, long res_code, const std::string& res_body) = 0;
|
||||
virtual embedding_res_t Embed(const std::string& text, const size_t remote_embedder_timeout_ms = 30000, const size_t remote_embedding_num_tries = 2) = 0;
|
||||
virtual std::vector<embedding_res_t> batch_embed(const std::vector<std::string>& inputs, const size_t remote_embedding_batch_size = 200) = 0;
|
||||
virtual std::vector<embedding_res_t> batch_embed(const std::vector<std::string>& inputs, const size_t remote_embedding_batch_size = 200,
|
||||
const size_t remote_embedding_timeout_ms = 60000, const size_t remote_embedding_num_tries = 2) = 0;
|
||||
static const std::string get_model_key(const nlohmann::json& model_config);
|
||||
static void init(ReplicationState* rs) {
|
||||
raft_server = rs;
|
||||
@ -50,7 +51,8 @@ class OpenAIEmbedder : public RemoteEmbedder {
|
||||
OpenAIEmbedder(const std::string& openai_model_path, const std::string& api_key);
|
||||
static Option<bool> is_model_valid(const nlohmann::json& model_config, size_t& num_dims);
|
||||
embedding_res_t Embed(const std::string& text, const size_t remote_embedder_timeout_ms = 30000, const size_t remote_embedding_num_tries = 2) override;
|
||||
std::vector<embedding_res_t> batch_embed(const std::vector<std::string>& inputs, const size_t remote_embedding_batch_size = 200) override;
|
||||
std::vector<embedding_res_t> batch_embed(const std::vector<std::string>& inputs, const size_t remote_embedding_batch_size = 200,
|
||||
const size_t remote_embedding_timeout_ms = 60000, const size_t remote_embedding_num_tries = 2) override;
|
||||
nlohmann::json get_error_json(const nlohmann::json& req_body, long res_code, const std::string& res_body) override;
|
||||
static std::string get_model_key(const nlohmann::json& model_config);
|
||||
};
|
||||
@ -68,7 +70,8 @@ class GoogleEmbedder : public RemoteEmbedder {
|
||||
GoogleEmbedder(const std::string& google_api_key);
|
||||
static Option<bool> is_model_valid(const nlohmann::json& model_config, size_t& num_dims);
|
||||
embedding_res_t Embed(const std::string& text, const size_t remote_embedder_timeout_ms = 30000, const size_t remote_embedding_num_tries = 2) override;
|
||||
std::vector<embedding_res_t> batch_embed(const std::vector<std::string>& inputs, const size_t remote_embedding_batch_size = 200) override;
|
||||
std::vector<embedding_res_t> batch_embed(const std::vector<std::string>& inputs, const size_t remote_embedding_batch_size = 200,
|
||||
const size_t remote_embedding_timeout_ms = 60000, const size_t remote_embedding_num_tries = 2) override;
|
||||
nlohmann::json get_error_json(const nlohmann::json& req_body, long res_code, const std::string& res_body) override;
|
||||
static std::string get_model_key(const nlohmann::json& model_config);
|
||||
};
|
||||
@ -96,7 +99,8 @@ class GCPEmbedder : public RemoteEmbedder {
|
||||
const std::string& refresh_token, const std::string& client_id, const std::string& client_secret);
|
||||
static Option<bool> is_model_valid(const nlohmann::json& model_config, size_t& num_dims);
|
||||
embedding_res_t Embed(const std::string& text, const size_t remote_embedder_timeout_ms = 30000, const size_t remote_embedding_num_tries = 2) override;
|
||||
std::vector<embedding_res_t> batch_embed(const std::vector<std::string>& inputs, const size_t remote_embedding_batch_size = 200) override;
|
||||
std::vector<embedding_res_t> batch_embed(const std::vector<std::string>& inputs, const size_t remote_embedding_batch_size = 200,
|
||||
const size_t remote_embedding_timeout_ms = 60000, const size_t remote_embedding_num_tries = 2) override;
|
||||
nlohmann::json get_error_json(const nlohmann::json& req_body, long res_code, const std::string& res_body) override;
|
||||
static std::string get_model_key(const nlohmann::json& model_config);
|
||||
};
|
||||
|
@ -69,6 +69,8 @@ private:
|
||||
|
||||
uint32_t analytics_flush_interval;
|
||||
|
||||
uint32_t housekeeping_interval;
|
||||
|
||||
protected:
|
||||
|
||||
Config() {
|
||||
@ -96,6 +98,7 @@ protected:
|
||||
|
||||
this->enable_search_analytics = false;
|
||||
this->analytics_flush_interval = 3600; // in seconds
|
||||
this->housekeeping_interval = 1800; // in seconds
|
||||
}
|
||||
|
||||
Config(Config const&) {
|
||||
@ -294,6 +297,10 @@ public:
|
||||
return this->analytics_flush_interval;
|
||||
}
|
||||
|
||||
// Returns the configured housekeeping interval. The stored field is a uint32_t
// (widened to size_t here); the default assignment annotates the value as seconds.
size_t get_housekeeping_interval() const {
|
||||
return this->housekeeping_interval;
|
||||
}
|
||||
|
||||
// Returns the configured thread pool size as currently stored in this Config.
size_t get_thread_pool_size() const {
|
||||
return this->thread_pool_size;
|
||||
}
|
||||
@ -429,6 +436,10 @@ public:
|
||||
this->analytics_flush_interval = std::stoi(get_env("TYPESENSE_ANALYTICS_FLUSH_INTERVAL"));
|
||||
}
|
||||
|
||||
if(!get_env("TYPESENSE_HOUSEKEEPING_INTERVAL").empty()) {
|
||||
this->housekeeping_interval = std::stoi(get_env("TYPESENSE_HOUSEKEEPING_INTERVAL"));
|
||||
}
|
||||
|
||||
if(!get_env("TYPESENSE_THREAD_POOL_SIZE").empty()) {
|
||||
this->thread_pool_size = std::stoi(get_env("TYPESENSE_THREAD_POOL_SIZE"));
|
||||
}
|
||||
@ -592,6 +603,10 @@ public:
|
||||
this->analytics_flush_interval = (int) reader.GetInteger("server", "analytics-flush-interval", 3600);
|
||||
}
|
||||
|
||||
if(reader.Exists("server", "housekeeping-interval")) {
|
||||
this->housekeeping_interval = (int) reader.GetInteger("server", "housekeeping-interval", 1800);
|
||||
}
|
||||
|
||||
if(reader.Exists("server", "thread-pool-size")) {
|
||||
this->thread_pool_size = (int) reader.GetInteger("server", "thread-pool-size", 0);
|
||||
}
|
||||
@ -746,6 +761,10 @@ public:
|
||||
this->analytics_flush_interval = options.get<uint32_t>("analytics-flush-interval");
|
||||
}
|
||||
|
||||
if(options.exist("housekeeping-interval")) {
|
||||
this->housekeeping_interval = options.get<uint32_t>("housekeeping-interval");
|
||||
}
|
||||
|
||||
if(options.exist("thread-pool-size")) {
|
||||
this->thread_pool_size = options.get<uint32_t>("thread-pool-size");
|
||||
}
|
||||
|
@ -58,6 +58,7 @@ Collection::Collection(const std::string& name, const uint32_t collection_id, co
|
||||
|
||||
// Tears down the collection's owned indexes.
// Both the collection mutex and `index_repair_lock` are taken exclusively (in that
// order) before anything is freed; `index_repair_lock` is the lock held by
// do_housekeeping() while it runs the HNSW repair on `index`.
Collection::~Collection() {
    std::unique_lock<std::shared_mutex> state_guard{mutex};
    std::unique_lock<std::shared_mutex> repair_guard{index_repair_lock};

    delete index;
    delete synonym_index;
}
|
||||
@ -391,7 +392,9 @@ Option<nlohmann::json> Collection::add(const std::string & json_str,
|
||||
nlohmann::json Collection::add_many(std::vector<std::string>& json_lines, nlohmann::json& document,
|
||||
const index_operation_t& operation, const std::string& id,
|
||||
const DIRTY_VALUES& dirty_values, const bool& return_doc, const bool& return_id,
|
||||
const size_t remote_embedding_batch_size) {
|
||||
const size_t remote_embedding_batch_size,
|
||||
const size_t remote_embedding_timeout_ms,
|
||||
const size_t remote_embedding_num_tries) {
|
||||
//LOG(INFO) << "Memory ratio. Max = " << max_memory_ratio << ", Used = " << SystemMetrics::used_memory_ratio();
|
||||
std::vector<index_record> index_records;
|
||||
|
||||
@ -481,7 +484,7 @@ nlohmann::json Collection::add_many(std::vector<std::string>& json_lines, nlohma
|
||||
|
||||
|
||||
if((i+1) % index_batch_size == 0 || i == json_lines.size()-1 || repeated_doc) {
|
||||
batch_index(index_records, json_lines, num_indexed, return_doc, return_id, remote_embedding_batch_size);
|
||||
batch_index(index_records, json_lines, num_indexed, return_doc, return_id, remote_embedding_batch_size, remote_embedding_timeout_ms, remote_embedding_num_tries);
|
||||
|
||||
// to return the document for the single doc add cases
|
||||
if(index_records.size() == 1) {
|
||||
@ -598,9 +601,10 @@ bool Collection::is_exceeding_memory_threshold() const {
|
||||
}
|
||||
|
||||
void Collection::batch_index(std::vector<index_record>& index_records, std::vector<std::string>& json_out,
|
||||
size_t &num_indexed, const bool& return_doc, const bool& return_id, const size_t remote_embedding_batch_size) {
|
||||
size_t &num_indexed, const bool& return_doc, const bool& return_id, const size_t remote_embedding_batch_size,
|
||||
const size_t remote_embedding_timeout_ms, const size_t remote_embedding_num_tries) {
|
||||
|
||||
batch_index_in_memory(index_records, remote_embedding_batch_size, true);
|
||||
batch_index_in_memory(index_records, remote_embedding_batch_size, remote_embedding_timeout_ms, remote_embedding_num_tries, true);
|
||||
|
||||
// store only documents that were indexed in-memory successfully
|
||||
for(auto& index_record: index_records) {
|
||||
@ -704,11 +708,12 @@ Option<uint32_t> Collection::index_in_memory(nlohmann::json &document, uint32_t
|
||||
return Option<>(200);
|
||||
}
|
||||
|
||||
size_t Collection::batch_index_in_memory(std::vector<index_record>& index_records, const size_t remote_embedding_batch_size, const bool generate_embeddings) {
|
||||
size_t Collection::batch_index_in_memory(std::vector<index_record>& index_records, const size_t remote_embedding_batch_size,
|
||||
const size_t remote_embedding_timeout_ms, const size_t remote_embedding_num_tries, const bool generate_embeddings) {
|
||||
std::unique_lock lock(mutex);
|
||||
size_t num_indexed = Index::batch_memory_index(index, index_records, default_sorting_field,
|
||||
search_schema, embedding_fields, fallback_field_type,
|
||||
token_separators, symbols_to_index, true, remote_embedding_batch_size, generate_embeddings);
|
||||
token_separators, symbols_to_index, true, remote_embedding_batch_size, remote_embedding_timeout_ms, remote_embedding_num_tries, generate_embeddings);
|
||||
num_documents += num_indexed;
|
||||
return num_indexed;
|
||||
}
|
||||
@ -4291,8 +4296,9 @@ Option<bool> Collection::batch_alter_data(const std::vector<field>& alter_fields
|
||||
}
|
||||
|
||||
Index::batch_memory_index(index, iter_batch, default_sorting_field, search_schema, embedding_fields,
|
||||
fallback_field_type, token_separators, symbols_to_index, true, 200,
|
||||
fallback_field_type, token_separators, symbols_to_index, true, 200, 60000, 2,
|
||||
found_embedding_field, true, schema_additions);
|
||||
|
||||
if(found_embedding_field) {
|
||||
for(auto& index_record : iter_batch) {
|
||||
if(index_record.indexed.ok()) {
|
||||
@ -5380,7 +5386,7 @@ bool Collection::get_enable_nested_fields() {
|
||||
|
||||
Option<bool> Collection::parse_facet(const std::string& facet_field, std::vector<facet>& facets) const {
|
||||
const std::regex base_pattern(".+\\(.*\\)");
|
||||
const std::regex range_pattern("[[a-zA-Z]+:\\[([0-9]+)\\,\\s*([0-9]+)\\]");
|
||||
const std::regex range_pattern("[[a-z A-Z]+:\\[([+-]?([0-9]*[.])?[0-9]*)\\,\\s*([+-]?([0-9]*[.])?[0-9]*)\\]");
|
||||
const std::string _alpha = "_alpha";
|
||||
|
||||
if ((facet_field.find(":") != std::string::npos)
|
||||
@ -5469,24 +5475,49 @@ Option<bool> Collection::parse_facet(const std::string& facet_field, std::vector
|
||||
auto pos3 = range.find("]");
|
||||
|
||||
int64_t lower_range, upper_range;
|
||||
auto lower_range_start = pos1 + 2;
|
||||
auto lower_range_len = pos2 - lower_range_start;
|
||||
auto upper_range_start = pos2 + 1;
|
||||
auto upper_range_len = pos3 - upper_range_start;
|
||||
|
||||
if(a_field.is_integer()) {
|
||||
std::string lower_range_str = range.substr(lower_range_start, lower_range_len);
|
||||
auto start = pos1 + 2;
|
||||
auto end = pos2 - start;
|
||||
auto lower_range_str = range.substr(start, end);
|
||||
StringUtils::trim(lower_range_str);
|
||||
lower_range = std::stoll(lower_range_str);
|
||||
std::string upper_range_str = range.substr(upper_range_start, upper_range_len);
|
||||
StringUtils::trim(upper_range_str);
|
||||
upper_range = std::stoll(upper_range_str);
|
||||
} else {
|
||||
float val = std::stof(range.substr(pos1 + 2, pos2));
|
||||
lower_range = Index::float_to_int64_t(val);
|
||||
if(lower_range_str.empty()) {
|
||||
lower_range = INT64_MIN;
|
||||
} else {
|
||||
lower_range = std::stoll(lower_range_str);
|
||||
}
|
||||
|
||||
val = std::stof(range.substr(pos2 + 1, pos3));
|
||||
upper_range = Index::float_to_int64_t(val);
|
||||
start = pos2 + 1;
|
||||
end = pos3 - start;
|
||||
auto upper_range_str = range.substr(start, end);
|
||||
StringUtils::trim(upper_range_str);
|
||||
if(upper_range_str.empty()) {
|
||||
upper_range = INT64_MAX;
|
||||
} else {
|
||||
upper_range = std::stoll(upper_range_str);
|
||||
}
|
||||
} else {
|
||||
auto start = pos1 + 2;
|
||||
auto end = pos2 - start;
|
||||
auto lower_range_str = range.substr(start, end);
|
||||
StringUtils::trim(lower_range_str);
|
||||
if(lower_range_str.empty()) {
|
||||
lower_range = INT64_MIN;
|
||||
} else {
|
||||
float val = std::stof(lower_range_str);
|
||||
lower_range = Index::float_to_int64_t(val);
|
||||
}
|
||||
|
||||
start = pos2 + 1;
|
||||
end = pos3 - start;
|
||||
auto upper_range_str = range.substr(start, end);
|
||||
StringUtils::trim(upper_range_str);
|
||||
if(upper_range_str.empty()) {
|
||||
upper_range = INT64_MAX;
|
||||
} else {
|
||||
float val = std::stof(upper_range_str);
|
||||
upper_range = Index::float_to_int64_t(val);
|
||||
}
|
||||
}
|
||||
|
||||
tupVec.emplace_back(lower_range, upper_range, range_val);
|
||||
@ -5811,3 +5842,8 @@ void Collection::remove_embedding_field(const std::string& field_name) {
|
||||
tsl::htrie_map<char, field> Collection::get_embedding_fields_unsafe() {
|
||||
return embedding_fields;
|
||||
}
|
||||
|
||||
void Collection::do_housekeeping() {
|
||||
std::unique_lock lock(index_repair_lock);
|
||||
index->repair_hnsw_index();
|
||||
}
|
||||
|
@ -527,6 +527,17 @@ std::vector<Collection*> CollectionManager::get_collections() const {
|
||||
return collection_vec;
|
||||
}
|
||||
|
||||
std::vector<std::string> CollectionManager::get_collection_names() const {
|
||||
std::shared_lock lock(mutex);
|
||||
|
||||
std::vector<std::string> collection_vec;
|
||||
for(const auto& kv: collections) {
|
||||
collection_vec.push_back(kv.first);
|
||||
}
|
||||
|
||||
return collection_vec;
|
||||
}
|
||||
|
||||
Option<nlohmann::json> CollectionManager::drop_collection(const std::string& collection_name, const bool remove_from_store) {
|
||||
std::shared_lock s_lock(mutex);
|
||||
auto collection = get_collection_unsafe(collection_name);
|
||||
@ -1787,7 +1798,7 @@ Option<bool> CollectionManager::load_collection(const nlohmann::json &collection
|
||||
// batch must match at least the number of shards
|
||||
if(exceeds_batch_mem_threshold || (num_valid_docs % batch_size == 0) || last_record) {
|
||||
size_t num_records = index_records.size();
|
||||
size_t num_indexed = collection->batch_index_in_memory(index_records, 200, false);
|
||||
size_t num_indexed = collection->batch_index_in_memory(index_records, 200, 60000, 2, false);
|
||||
batch_doc_str_size = 0;
|
||||
|
||||
if(num_indexed != num_records) {
|
||||
|
@ -756,6 +756,8 @@ bool post_import_documents(const std::shared_ptr<http_req>& req, const std::shar
|
||||
const char *RETURN_DOC = "return_doc";
|
||||
const char *RETURN_ID = "return_id";
|
||||
const char *REMOTE_EMBEDDING_BATCH_SIZE = "remote_embedding_batch_size";
|
||||
const char *REMOTE_EMBEDDING_TIMEOUT_MS = "remote_embedding_timeout_ms";
|
||||
const char *REMOTE_EMBEDDING_NUM_TRIES = "remote_embedding_num_tries";
|
||||
|
||||
if(req->params.count(BATCH_SIZE) == 0) {
|
||||
req->params[BATCH_SIZE] = "40";
|
||||
@ -810,8 +812,18 @@ bool post_import_documents(const std::shared_ptr<http_req>& req, const std::shar
|
||||
return false;
|
||||
}
|
||||
|
||||
if(req->params.count(REMOTE_EMBEDDING_TIMEOUT_MS) == 0) {
|
||||
req->params[REMOTE_EMBEDDING_TIMEOUT_MS] = "60000";
|
||||
}
|
||||
|
||||
if(req->params.count(REMOTE_EMBEDDING_NUM_TRIES) == 0) {
|
||||
req->params[REMOTE_EMBEDDING_NUM_TRIES] = "2";
|
||||
}
|
||||
|
||||
const size_t IMPORT_BATCH_SIZE = std::stoi(req->params[BATCH_SIZE]);
|
||||
const size_t REMOTE_EMBEDDING_BATCH_SIZE_VAL = std::stoi(req->params[REMOTE_EMBEDDING_BATCH_SIZE]);
|
||||
const size_t REMOTE_EMBEDDING_TIMEOUT_MS_VAL = std::stoi(req->params[REMOTE_EMBEDDING_TIMEOUT_MS]);
|
||||
const size_t REMOTE_EMBEDDING_NUM_TRIES_VAL = std::stoi(req->params[REMOTE_EMBEDDING_NUM_TRIES]);
|
||||
|
||||
if(IMPORT_BATCH_SIZE == 0) {
|
||||
res->final = true;
|
||||
@ -827,6 +839,20 @@ bool post_import_documents(const std::shared_ptr<http_req>& req, const std::shar
|
||||
return false;
|
||||
}
|
||||
|
||||
if(REMOTE_EMBEDDING_TIMEOUT_MS_VAL == 0) {
|
||||
res->final = true;
|
||||
res->set_400("Parameter `" + std::string(REMOTE_EMBEDDING_TIMEOUT_MS) + "` must be a positive integer.");
|
||||
stream_response(req, res);
|
||||
return false;
|
||||
}
|
||||
|
||||
if(REMOTE_EMBEDDING_NUM_TRIES_VAL == 0) {
|
||||
res->final = true;
|
||||
res->set_400("Parameter `" + std::string(REMOTE_EMBEDDING_NUM_TRIES) + "` must be a positive integer.");
|
||||
stream_response(req, res);
|
||||
return false;
|
||||
}
|
||||
|
||||
if(req->body_index == 0) {
|
||||
// will log for every major chunk of request body
|
||||
//LOG(INFO) << "Import, req->body.size=" << req->body.size() << ", batch_size=" << IMPORT_BATCH_SIZE;
|
||||
@ -896,7 +922,7 @@ bool post_import_documents(const std::shared_ptr<http_req>& req, const std::shar
|
||||
const bool& return_doc = req->params[RETURN_DOC] == "true";
|
||||
const bool& return_id = req->params[RETURN_ID] == "true";
|
||||
nlohmann::json json_res = collection->add_many(json_lines, document, operation, "",
|
||||
dirty_values, return_doc, return_id, REMOTE_EMBEDDING_BATCH_SIZE_VAL);
|
||||
dirty_values, return_doc, return_id, REMOTE_EMBEDDING_BATCH_SIZE_VAL, REMOTE_EMBEDDING_TIMEOUT_MS_VAL, REMOTE_EMBEDDING_NUM_TRIES_VAL);
|
||||
//const std::string& import_summary_json = json_res->dump();
|
||||
//response_stream << import_summary_json << "\n";
|
||||
|
||||
@ -940,6 +966,7 @@ bool post_add_document(const std::shared_ptr<http_req>& req, const std::shared_p
|
||||
req->params[DIRTY_VALUES_PARAM] = ""; // set it empty as default will depend on whether schema is enabled
|
||||
}
|
||||
|
||||
|
||||
CollectionManager & collectionManager = CollectionManager::get_instance();
|
||||
auto collection = collectionManager.get_collection(req->params["collection"]);
|
||||
|
||||
@ -951,10 +978,22 @@ bool post_add_document(const std::shared_ptr<http_req>& req, const std::shared_p
|
||||
const index_operation_t operation = get_index_operation(req->params[ACTION]);
|
||||
const auto& dirty_values = collection->parse_dirty_values_option(req->params[DIRTY_VALUES_PARAM]);
|
||||
|
||||
size_t remote_embedding_timeout_ms = 60000;
|
||||
size_t remote_embedding_num_tries = 2;
|
||||
|
||||
if(req->params.count("remote_embedding_timeout_ms") != 0) {
|
||||
remote_embedding_timeout_ms = std::stoul(req->params["remote_embedding_timeout_ms"]);
|
||||
}
|
||||
|
||||
if(req->params.count("remote_embedding_num_tries") != 0) {
|
||||
remote_embedding_num_tries = std::stoul(req->params["remote_embedding_num_tries"]);
|
||||
}
|
||||
|
||||
nlohmann::json document;
|
||||
std::vector<std::string> json_lines = {req->body};
|
||||
const nlohmann::json& inserted_doc_op = collection->add_many(json_lines, document, operation, "", dirty_values,
|
||||
false, false);
|
||||
false, false, 200, remote_embedding_timeout_ms,
|
||||
remote_embedding_num_tries);
|
||||
|
||||
if(!inserted_doc_op["success"].get<bool>()) {
|
||||
nlohmann::json res_doc;
|
||||
|
@ -206,6 +206,7 @@ Index::~Index() {
|
||||
delete seq_ids;
|
||||
|
||||
for(auto& vec_index_kv: vector_index) {
|
||||
std::unique_lock lock(vec_index_kv.second->repair_m);
|
||||
delete vec_index_kv.second;
|
||||
}
|
||||
|
||||
@ -432,7 +433,8 @@ void Index::validate_and_preprocess(Index *index,
|
||||
const std::string& fallback_field_type,
|
||||
const std::vector<char>& token_separators,
|
||||
const std::vector<char>& symbols_to_index,
|
||||
const bool do_validation, const size_t remote_embedding_batch_size, const bool generate_embeddings) {
|
||||
const bool do_validation, const size_t remote_embedding_batch_size,
|
||||
const size_t remote_embedding_timeout_ms, const size_t remote_embedding_num_tries, const bool generate_embeddings) {
|
||||
|
||||
// runs in a partitioned thread
|
||||
std::vector<index_record*> records_to_embed;
|
||||
@ -523,7 +525,7 @@ void Index::validate_and_preprocess(Index *index,
|
||||
}
|
||||
|
||||
if(generate_embeddings) {
|
||||
batch_embed_fields(records_to_embed, embedding_fields, search_schema, remote_embedding_batch_size);
|
||||
batch_embed_fields(records_to_embed, embedding_fields, search_schema, remote_embedding_batch_size, remote_embedding_timeout_ms, remote_embedding_num_tries);
|
||||
}
|
||||
}
|
||||
|
||||
@ -535,7 +537,8 @@ size_t Index::batch_memory_index(Index *index,
|
||||
const std::string& fallback_field_type,
|
||||
const std::vector<char>& token_separators,
|
||||
const std::vector<char>& symbols_to_index,
|
||||
const bool do_validation, const size_t remote_embedding_batch_size, const bool generate_embeddings,
|
||||
const bool do_validation, const size_t remote_embedding_batch_size,
|
||||
const size_t remote_embedding_timeout_ms, const size_t remote_embedding_num_tries, const bool generate_embeddings,
|
||||
const bool use_addition_fields, const tsl::htrie_map<char, field>& addition_fields) {
|
||||
|
||||
const size_t concurrency = 4;
|
||||
@ -568,7 +571,7 @@ size_t Index::batch_memory_index(Index *index,
|
||||
index->thread_pool->enqueue([&, batch_index, batch_len]() {
|
||||
write_log_index = local_write_log_index;
|
||||
validate_and_preprocess(index, iter_batch, batch_index, batch_len, default_sorting_field, actual_search_schema,
|
||||
embedding_fields, fallback_field_type, token_separators, symbols_to_index, do_validation, remote_embedding_batch_size, generate_embeddings);
|
||||
embedding_fields, fallback_field_type, token_separators, symbols_to_index, do_validation, remote_embedding_batch_size, remote_embedding_timeout_ms, remote_embedding_num_tries, generate_embeddings);
|
||||
|
||||
std::unique_lock<std::mutex> lock(m_process);
|
||||
num_processed++;
|
||||
@ -6660,6 +6663,7 @@ void Index::refresh_schemas(const std::vector<field>& new_fields, const std::vec
|
||||
|
||||
if(del_field.num_dim) {
|
||||
auto hnsw_index = vector_index[del_field.name];
|
||||
std::unique_lock lock(hnsw_index->repair_m);
|
||||
delete hnsw_index;
|
||||
vector_index.erase(del_field.name);
|
||||
}
|
||||
@ -6951,8 +6955,9 @@ bool Index::common_results_exist(std::vector<art_leaf*>& leaves, bool must_match
|
||||
|
||||
|
||||
void Index::batch_embed_fields(std::vector<index_record*>& records,
|
||||
const tsl::htrie_map<char, field>& embedding_fields,
|
||||
const tsl::htrie_map<char, field> & search_schema, const size_t remote_embedding_batch_size) {
|
||||
const tsl::htrie_map<char, field>& embedding_fields,
|
||||
const tsl::htrie_map<char, field> & search_schema, const size_t remote_embedding_batch_size,
|
||||
const size_t remote_embedding_timeout_ms, const size_t remote_embedding_num_tries) {
|
||||
for(const auto& field : embedding_fields) {
|
||||
std::vector<std::pair<index_record*, std::string>> texts_to_embed;
|
||||
auto indexing_prefix = TextEmbedderManager::get_instance().get_indexing_prefix(field.embed[fields::model_config]);
|
||||
@ -7023,7 +7028,8 @@ void Index::batch_embed_fields(std::vector<index_record*>& records,
|
||||
texts.push_back(text_to_embed.second);
|
||||
}
|
||||
|
||||
auto embeddings = embedder_op.get()->batch_embed(texts, remote_embedding_batch_size);
|
||||
auto embeddings = embedder_op.get()->batch_embed(texts, remote_embedding_batch_size, remote_embedding_timeout_ms,
|
||||
remote_embedding_num_tries);
|
||||
|
||||
for(size_t i = 0; i < embeddings.size(); i++) {
|
||||
auto& embedding_res = embeddings[i];
|
||||
@ -7032,13 +7038,35 @@ void Index::batch_embed_fields(std::vector<index_record*>& records,
|
||||
texts_to_embed[i].first->index_failure(embedding_res.status_code, "");
|
||||
continue;
|
||||
}
|
||||
nlohmann::json* document;
|
||||
if(texts_to_embed[i].first->is_update) {
|
||||
document = &texts_to_embed[i].first->new_doc;
|
||||
} else {
|
||||
document = &texts_to_embed[i].first->doc;
|
||||
}
|
||||
(*document)[field.name] = embedding_res.embedding;
|
||||
texts_to_embed[i].first->new_doc[field.name] = embedding_res.embedding;
|
||||
}
|
||||
texts_to_embed[i].first->doc[field.name] = embedding_res.embedding;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void Index::repair_hnsw_index() {
|
||||
std::vector<std::string> vector_fields;
|
||||
|
||||
// this lock ensures that the `vector_index` map is not mutated during read
|
||||
std::shared_lock read_lock(mutex);
|
||||
|
||||
for(auto& vec_kv: vector_index) {
|
||||
vector_fields.push_back(vec_kv.first);
|
||||
}
|
||||
|
||||
read_lock.unlock();
|
||||
|
||||
for(const auto& vector_field: vector_fields) {
|
||||
read_lock.lock();
|
||||
if(vector_index.count(vector_field) != 0) {
|
||||
// this lock ensures that the vector index is not dropped during repair
|
||||
std::unique_lock lock(vector_index[vector_field]->repair_m);
|
||||
read_lock.unlock(); // release this lock since repair is a long running operation
|
||||
vector_index[vector_field]->vecdex->repair_zero_indegree();
|
||||
} else {
|
||||
read_lock.unlock();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -152,7 +152,8 @@ embedding_res_t TextEmbedder::Embed(const std::string& text, const size_t remote
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<embedding_res_t> TextEmbedder::batch_embed(const std::vector<std::string>& inputs, const size_t remote_embedding_batch_size) {
|
||||
std::vector<embedding_res_t> TextEmbedder::batch_embed(const std::vector<std::string>& inputs, const size_t remote_embedding_batch_size,
|
||||
const size_t remote_embedding_timeout_ms, const size_t remote_embedding_num_tries) {
|
||||
std::vector<embedding_res_t> outputs;
|
||||
if(!is_remote()) {
|
||||
std::lock_guard<std::mutex> lock(mutex_);
|
||||
@ -235,7 +236,7 @@ std::vector<embedding_res_t> TextEmbedder::batch_embed(const std::vector<std::st
|
||||
}
|
||||
}
|
||||
} else {
|
||||
outputs = std::move(remote_embedder_->batch_embed(inputs, remote_embedding_batch_size));
|
||||
outputs = std::move(remote_embedder_->batch_embed(inputs, remote_embedding_batch_size, remote_embedding_timeout_ms, remote_embedding_num_tries));
|
||||
}
|
||||
|
||||
return outputs;
|
||||
|
@ -188,13 +188,14 @@ embedding_res_t OpenAIEmbedder::Embed(const std::string& text, const size_t remo
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<embedding_res_t> OpenAIEmbedder::batch_embed(const std::vector<std::string>& inputs, const size_t remote_embedding_batch_size) {
|
||||
std::vector<embedding_res_t> OpenAIEmbedder::batch_embed(const std::vector<std::string>& inputs, const size_t remote_embedding_batch_size,
|
||||
const size_t remote_embedding_timeout_ms, const size_t remote_embedding_num_tries) {
|
||||
// call recursively if inputs larger than remote_embedding_batch_size
|
||||
if(inputs.size() > remote_embedding_batch_size) {
|
||||
std::vector<embedding_res_t> outputs;
|
||||
for(size_t i = 0; i < inputs.size(); i += remote_embedding_batch_size) {
|
||||
auto batch = std::vector<std::string>(inputs.begin() + i, inputs.begin() + std::min(i + remote_embedding_batch_size, inputs.size()));
|
||||
auto batch_outputs = batch_embed(batch, remote_embedding_batch_size);
|
||||
auto batch_outputs = batch_embed(batch, remote_embedding_batch_size, remote_embedding_timeout_ms, remote_embedding_num_tries);
|
||||
outputs.insert(outputs.end(), batch_outputs.begin(), batch_outputs.end());
|
||||
}
|
||||
return outputs;
|
||||
@ -206,6 +207,8 @@ std::vector<embedding_res_t> OpenAIEmbedder::batch_embed(const std::vector<std::
|
||||
std::unordered_map<std::string, std::string> headers;
|
||||
headers["Authorization"] = "Bearer " + api_key;
|
||||
headers["Content-Type"] = "application/json";
|
||||
headers["timeout_ms"] = std::to_string(remote_embedding_timeout_ms);
|
||||
headers["num_try"] = std::to_string(remote_embedding_num_tries);
|
||||
std::map<std::string, std::string> res_headers;
|
||||
std::string res;
|
||||
auto res_code = call_remote_api("POST", OPENAI_CREATE_EMBEDDING, req_body.dump(), res, res_headers, headers);
|
||||
@ -370,10 +373,22 @@ embedding_res_t GoogleEmbedder::Embed(const std::string& text, const size_t remo
|
||||
}
|
||||
|
||||
|
||||
std::vector<embedding_res_t> GoogleEmbedder::batch_embed(const std::vector<std::string>& inputs, const size_t remote_embedding_batch_size) {
|
||||
std::vector<embedding_res_t> GoogleEmbedder::batch_embed(const std::vector<std::string>& inputs, const size_t remote_embedding_batch_size,
|
||||
const size_t remote_embedding_timeout_ms, const size_t remote_embedding_num_tries) {
|
||||
std::vector<embedding_res_t> outputs;
|
||||
bool timeout_prev = false;
|
||||
for(auto& input : inputs) {
|
||||
auto res = Embed(input);
|
||||
auto res = Embed(input, remote_embedding_timeout_ms, remote_embedding_num_tries);
|
||||
if(res.status_code == 408) {
|
||||
if(timeout_prev) {
|
||||
// fail whole batch if two consecutive timeouts,
|
||||
nlohmann::json req_body;
|
||||
req_body["text"] = input;
|
||||
return std::vector<embedding_res_t>(inputs.size(), embedding_res_t(408, get_error_json(req_body, 408, "")));
|
||||
}
|
||||
timeout_prev = true;
|
||||
}
|
||||
timeout_prev = false;
|
||||
outputs.push_back(res);
|
||||
}
|
||||
|
||||
@ -529,7 +544,8 @@ embedding_res_t GCPEmbedder::Embed(const std::string& text, const size_t remote_
|
||||
}
|
||||
|
||||
|
||||
std::vector<embedding_res_t> GCPEmbedder::batch_embed(const std::vector<std::string>& inputs, const size_t remote_embedding_batch_size) {
|
||||
std::vector<embedding_res_t> GCPEmbedder::batch_embed(const std::vector<std::string>& inputs, const size_t remote_embedding_batch_size,
|
||||
const size_t remote_embedding_timeout_ms, const size_t remote_embedding_num_tries) {
|
||||
// GCP API has a limit of 5 instances per request
|
||||
if(inputs.size() > 5) {
|
||||
std::vector<embedding_res_t> res;
|
||||
@ -549,6 +565,8 @@ std::vector<embedding_res_t> GCPEmbedder::batch_embed(const std::vector<std::str
|
||||
std::unordered_map<std::string, std::string> headers;
|
||||
headers["Authorization"] = "Bearer " + access_token;
|
||||
headers["Content-Type"] = "application/json";
|
||||
headers["timeout_ms"] = std::to_string(remote_embedding_timeout_ms);
|
||||
headers["num_try"] = std::to_string(remote_embedding_num_tries);
|
||||
std::map<std::string, std::string> res_headers;
|
||||
std::string res;
|
||||
auto res_code = call_remote_api("POST", get_gcp_embedding_url(project_id, model_name), req_body.dump(), res, res_headers, headers);
|
||||
|
@ -13,7 +13,8 @@
|
||||
#include <arpa/inet.h>
|
||||
#include <sys/socket.h>
|
||||
#include <ifaddrs.h>
|
||||
#include <analytics_manager.h>
|
||||
#include "analytics_manager.h"
|
||||
#include "housekeeper.h"
|
||||
|
||||
#include "core_api.h"
|
||||
#include "ratelimit_manager.h"
|
||||
@ -104,6 +105,7 @@ void init_cmdline_options(cmdline::parser & options, int argc, char **argv) {
|
||||
options.add<int>("log-slow-searches-time-ms", '\0', "When >= 0, searches that take longer than this duration are logged.", false, 30*1000);
|
||||
options.add<int>("cache-num-entries", '\0', "Number of entries to cache.", false, 1000);
|
||||
options.add<uint32_t>("analytics-flush-interval", '\0', "Frequency of persisting analytics data to disk (in seconds).", false, 3600);
|
||||
options.add<uint32_t>("housekeeping-interval", '\0', "Frequency of housekeeping background job (in seconds).", false, 1800);
|
||||
|
||||
// DEPRECATED
|
||||
options.add<std::string>("listen-address", 'h', "[DEPRECATED: use `api-address`] Address to which Typesense API service binds.", false, "0.0.0.0");
|
||||
@ -457,6 +459,11 @@ int run_server(const Config & config, const std::string & version, void (*master
|
||||
AnalyticsManager::get_instance().run(&replication_state);
|
||||
});
|
||||
|
||||
HouseKeeper::get_instance().init(config.get_housekeeping_interval());
|
||||
std::thread housekeeping_thread([]() {
|
||||
HouseKeeper::get_instance().run();
|
||||
});
|
||||
|
||||
RemoteEmbedder::init(&replication_state);
|
||||
|
||||
std::string path_to_nodes = config.get_nodes();
|
||||
@ -481,6 +488,10 @@ int run_server(const Config & config, const std::string & version, void (*master
|
||||
LOG(INFO) << "Waiting for event sink thread to be done...";
|
||||
event_sink_thread.join();
|
||||
|
||||
LOG(INFO) << "Waiting for housekeeping thread to be done...";
|
||||
HouseKeeper::get_instance().stop();
|
||||
housekeeping_thread.join();
|
||||
|
||||
LOG(INFO) << "Shutting down server_thread_pool";
|
||||
|
||||
server_thread_pool.shutdown();
|
||||
|
@ -1229,6 +1229,7 @@ TEST_F(CollectionFacetingTest, FacetParseTest){
|
||||
field("grade", field_types::INT32, true),
|
||||
field("rank", field_types::INT32, true),
|
||||
field("range", field_types::INT32, true),
|
||||
field("review", field_types::FLOAT, true),
|
||||
field("scale", field_types::INT32, false),
|
||||
};
|
||||
|
||||
@ -1326,6 +1327,34 @@ TEST_F(CollectionFacetingTest, FacetParseTest){
|
||||
|
||||
ASSERT_EQ("rank", mixed_facets_ptr[2]->field_name);
|
||||
ASSERT_EQ("range", mixed_facets_ptr[1]->field_name);
|
||||
|
||||
std::vector<std::string> range_facet_float_fields {
|
||||
"review(bad:[0, 2.5], good:[2.5, 5])"
|
||||
};
|
||||
|
||||
std::vector<facet> float_facets;
|
||||
for(const std::string & facet_field: range_facet_float_fields) {
|
||||
auto res = coll1->parse_facet(facet_field, float_facets);
|
||||
|
||||
if(!res.error().empty()) {
|
||||
LOG(ERROR) << res.error();
|
||||
FAIL();
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<std::string> range_facet_negative_range {
|
||||
"review(bad:[-2.5, 2.5], good:[2.5, 5])"
|
||||
};
|
||||
|
||||
std::vector<facet> negative_range;
|
||||
for(const std::string & facet_field: range_facet_negative_range) {
|
||||
auto res = coll1->parse_facet(facet_field, negative_range);
|
||||
|
||||
if(!res.error().empty()) {
|
||||
LOG(ERROR) << res.error();
|
||||
FAIL();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
TEST_F(CollectionFacetingTest, RangeFacetTest) {
|
||||
@ -1667,6 +1696,204 @@ TEST_F(CollectionFacetingTest, RangeFacetTypo) {
|
||||
collectionManager.drop_collection("coll1");
|
||||
}
|
||||
|
||||
// Range facet on a float field with a fractional upper bound (`small:[0, 55.5]`).
// Three documents are indexed; the assertions expect a single `small` bucket with count 2.
TEST_F(CollectionFacetingTest, RangeFacetsFloatRange) {
    std::vector<field> fields = {field("name", field_types::STRING, false),
                                 field("inches", field_types::FLOAT, true),};
    Collection* coll1 = collectionManager.create_collection(
            "coll1", 1, fields, "", 0, "", {}, {}).get();

    // Note that the second document carries an integer value for the float field.
    const std::vector<std::string> raw_docs = {
        R"({"id": "0", "name": "TV 1", "inches": 32.4})",
        R"({"id": "1", "name": "TV 2", "inches": 55})",
        R"({"id": "2", "name": "TV 3", "inches": 55.6})",
    };

    for(const auto& raw_doc : raw_docs) {
        ASSERT_TRUE(coll1->add(nlohmann::json::parse(raw_doc).dump()).ok());
    }

    const std::vector<std::string> facet_exprs = {"inches(small:[0, 55.5])"};

    auto search_op = coll1->search("*", {},
                                   "", facet_exprs,
                                   {}, {2}, 10,
                                   1, FREQUENCY, {true},
                                   10, spp::sparse_hash_set<std::string>(),
                                   spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 10, {}, {}, {}, 0,
                                   "<mark>", "</mark>", {}, 1000,
                                   true, false, true, "", true);
    auto results = search_op.get();

    auto& bucket_counts = results["facet_counts"][0]["counts"];

    ASSERT_EQ(1, bucket_counts.size());
    ASSERT_EQ(2, (int) bucket_counts[0]["count"]);
    ASSERT_EQ("small", bucket_counts[0]["value"]);
}
|
||||
|
||||
// Range facets where one bound is left empty: `large:[55, ]` has no upper bound and
// `small:[,55]` has no lower bound. The same three documents are used for both searches.
TEST_F(CollectionFacetingTest, RangeFacetsMinMaxRange) {
    std::vector<field> fields = {field("name", field_types::STRING, false),
                                 field("inches", field_types::FLOAT, true),};
    Collection* coll1 = collectionManager.create_collection(
            "coll1", 1, fields, "", 0, "", {}, {}).get();

    const std::vector<std::string> raw_docs = {
        R"({"id": "0", "name": "TV 1", "inches": 32.4})",
        R"({"id": "1", "name": "TV 2", "inches": 55})",
        R"({"id": "2", "name": "TV 3", "inches": 55.6})",
    };

    for(const auto& raw_doc : raw_docs) {
        ASSERT_TRUE(coll1->add(nlohmann::json::parse(raw_doc).dump()).ok());
    }

    // Both searches differ only in the facet expression, so the rest of the call is shared.
    auto facet_search = [&](const std::string& facet_expr) {
        return coll1->search("*", {},
                             "", {facet_expr},
                             {}, {2}, 10,
                             1, FREQUENCY, {true},
                             10, spp::sparse_hash_set<std::string>(),
                             spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 10, {}, {}, {}, 0,
                             "<mark>", "</mark>", {}, 1000,
                             true, false, true, "", true).get();
    };

    // open-ended upper bound
    auto open_max_results = facet_search("inches(small:[0, 55], large:[55, ])");
    auto& open_max_counts = open_max_results["facet_counts"][0]["counts"];

    ASSERT_EQ(2, open_max_counts.size());
    ASSERT_EQ(2, (int) open_max_counts[0]["count"]);
    ASSERT_EQ("small", open_max_counts[0]["value"]);
    ASSERT_EQ(1, (int) open_max_counts[1]["count"]);
    ASSERT_EQ("large", open_max_counts[1]["value"]);

    // open-ended lower bound
    auto open_min_results = facet_search("inches(small:[,55])");
    auto& open_min_counts = open_min_results["facet_counts"][0]["counts"];

    ASSERT_EQ(1, open_min_counts.size());
    ASSERT_EQ(2, (int) open_min_counts[0]["count"]);
    ASSERT_EQ("small", open_min_counts[0]["value"]);
}
|
||||
|
||||
// Range facet whose label contains spaces; the assertions expect the label to be
// returned verbatim as the facet value, with a count of 2.
TEST_F(CollectionFacetingTest, RangeFacetRangeLabelWithSpace) {
    std::vector<field> fields = {field("name", field_types::STRING, false),
                                 field("inches", field_types::FLOAT, true),};
    Collection* coll1 = collectionManager.create_collection(
            "coll1", 1, fields, "", 0, "", {}, {}).get();

    const std::vector<std::string> raw_docs = {
        R"({"id": "0", "name": "TV 1", "inches": 32.4})",
        R"({"id": "1", "name": "TV 2", "inches": 55})",
        R"({"id": "2", "name": "TV 3", "inches": 55.6})",
    };

    for(const auto& raw_doc : raw_docs) {
        ASSERT_TRUE(coll1->add(nlohmann::json::parse(raw_doc).dump()).ok());
    }

    const std::vector<std::string> facet_exprs = {"inches(small tvs with display size:[0,55])"};

    auto search_op = coll1->search("*", {},
                                   "", facet_exprs,
                                   {}, {2}, 10,
                                   1, FREQUENCY, {true},
                                   10, spp::sparse_hash_set<std::string>(),
                                   spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 10, {}, {}, {}, 0,
                                   "<mark>", "</mark>", {}, 1000,
                                   true, false, true, "", true);
    auto results = search_op.get();

    auto& bucket_counts = results["facet_counts"][0]["counts"];

    ASSERT_EQ(1, bucket_counts.size());
    ASSERT_EQ(2, (int) bucket_counts[0]["count"]);
    ASSERT_EQ("small tvs with display size", bucket_counts[0]["value"]);
}
|
||||
|
||||
// Range facets with negative bounds on a float field. Eight documents are indexed and the
// assertions expect the buckets poor / decent / good with counts 3 / 3 / 2, in that order.
TEST_F(CollectionFacetingTest, RangeFacetRangeNegativeRanges) {
    std::vector<field> fields = {field("team", field_types::STRING, false),
                                 field("nrr", field_types::FLOAT, true),};
    Collection* coll1 = collectionManager.create_collection(
            "coll1", 1, fields, "", 0, "",
            {},{}).get();

    const std::vector<std::string> raw_docs = {
        R"({"id": "0", "team": "india", "nrr": 1.353})",
        R"({"id": "1", "team": "australia", "nrr": -0.193})",
        R"({"id": "2", "team": "pakistan", "nrr": -0.400})",
        R"({"id": "3", "team": "afghanistan", "nrr": -0.969})",
        R"({"id": "4", "team": "srilanka", "nrr": -1.048})",
        R"({"id": "5", "team": "england", "nrr": -1.248})",
        R"({"id": "6", "team": "bangladesh", "nrr": -1.253})",
        R"({"id": "7", "team": "new zealand", "nrr": 1.481})",
    };

    for(const auto& raw_doc : raw_docs) {
        ASSERT_TRUE(coll1->add(nlohmann::json::parse(raw_doc).dump()).ok());
    }

    const std::vector<std::string> facet_exprs = {"nrr(poor:[-1.5,-1], decent:[-1,0], good:[0,2])"};

    auto search_op = coll1->search("*", {},
                                   "", facet_exprs,
                                   {}, {2}, 10,
                                   1, FREQUENCY, {true},
                                   10, spp::sparse_hash_set<std::string>(),
                                   spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 10, {}, {}, {}, 0,
                                   "<mark>", "</mark>", {}, 1000,
                                   true, false, true, "", true);
    auto results = search_op.get();

    auto& bucket_counts = results["facet_counts"][0]["counts"];

    ASSERT_EQ(3, bucket_counts.size());

    ASSERT_EQ(3, (int) bucket_counts[0]["count"]);
    ASSERT_EQ("poor", bucket_counts[0]["value"]);

    ASSERT_EQ(3, (int) bucket_counts[1]["count"]);
    ASSERT_EQ("decent", bucket_counts[1]["value"]);

    ASSERT_EQ(2, (int) bucket_counts[2]["count"]);
    ASSERT_EQ("good", bucket_counts[2]["value"]);
}
|
||||
|
||||
TEST_F(CollectionFacetingTest, SampleFacetCounts) {
|
||||
nlohmann::json schema = R"({
|
||||
"name": "coll1",
|
||||
|
@ -2747,3 +2747,95 @@ TEST_F(CollectionVectorTest, TestSearchNonIndexedVectorField) {
|
||||
ASSERT_FALSE(search_result.ok());
|
||||
ASSERT_EQ("Field `vec` is marked as a non-indexed field in the schema.", search_result.error());
|
||||
}
|
||||
|
||||
// Runs the same `id:0, k:1` vector query before and after document "0" is updated from
// "soccer" to "onion". The assertions expect the single hit to change from "basketball"
// to "potato".
TEST_F(CollectionVectorTest, TestSemanticSearchAfterUpdate) {
    nlohmann::json schema = R"({
        "name": "test",
        "fields": [
            {
                "name": "name",
                "type": "string"
            },
            {
                "name": "embedding",
                "type": "float[]",
                "embed": {
                    "from": [
                        "name"
                    ],
                    "model_config": {
                        "model_name": "ts/e5-small"
                    }
                }
            }
        ]
    })"_json;

    TextEmbedderManager::set_model_dir("/tmp/typesense_test/models");

    auto collection_create_op = collectionManager.create_collection(schema);
    ASSERT_TRUE(collection_create_op.ok());

    auto coll = collection_create_op.get();

    // Documents get the ids "0".."3" in the order listed here.
    const std::vector<std::string> names = {"soccer", "basketball", "typesense", "potato"};
    for(size_t doc_id = 0; doc_id < names.size(); doc_id++) {
        nlohmann::json new_doc;
        new_doc["name"] = names[doc_id];
        new_doc["id"] = std::to_string(doc_id);

        auto add_op = coll->add(new_doc.dump());
        ASSERT_TRUE(add_op.ok());
    }

    // The query is identical before and after the update, so it is issued through one helper.
    auto query_by_doc_zero = [&]() {
        return coll->search("*", {}, "", {}, {}, {0}, 20, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD,
                            spp::sparse_hash_set<std::string>(),
                            spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
                            "", 10, {}, {}, {}, 0,
                            "<mark>", "</mark>", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7,
                            fallback,
                            4, {off}, 32767, 32767, 2,
                            false, true, "embedding:([], id:0, k:1)");
    };

    auto result = query_by_doc_zero();

    ASSERT_TRUE(result.ok());
    ASSERT_EQ(1, result.get()["hits"].size());
    ASSERT_EQ("basketball", result.get()["hits"][0]["document"]["name"]);

    nlohmann::json changed_doc;
    changed_doc["name"] = "onion";
    changed_doc["id"] = "0";

    auto update_op = coll->add(changed_doc.dump(), index_operation_t::UPDATE, "0");
    ASSERT_TRUE(update_op.ok());

    result = query_by_doc_zero();

    ASSERT_TRUE(result.ok());
    ASSERT_EQ(1, result.get()["hits"].size());
    ASSERT_EQ("potato", result.get()["hits"][0]["document"]["name"]);
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user