Merge branch 'v0.25-join'

This commit is contained in:
Jason Bosco 2023-09-11 10:34:38 -05:00
commit 3bbfe20fcc
24 changed files with 908 additions and 143 deletions

View File

@ -79,7 +79,7 @@ public:
Option<bool> remove_rule(const std::string& name);
void add_suggestion(const std::string& query_collection,
std::string& query, bool live_query, const std::string& user_id);
const std::string& query, bool live_query, const std::string& user_id);
void stop();

View File

@ -162,7 +162,7 @@ private:
void remove_document(const nlohmann::json & document, const uint32_t seq_id, bool remove_from_store);
void process_remove_field_for_embedding_fields(const field& the_field, std::vector<field>& garbage_fields);
void process_remove_field_for_embedding_fields(const field& del_field, std::vector<field>& garbage_embed_fields);
void curate_results(string& actual_query, const string& filter_query, bool enable_overrides, bool already_segmented,
const std::map<size_t, std::vector<std::string>>& pinned_hits,

View File

@ -424,10 +424,11 @@ struct field {
std::string& fallback_field_type,
std::vector<field>& the_fields);
static Option<bool> validate_and_init_embed_fields(const std::vector<std::pair<size_t, size_t>>& embed_json_field_indices,
const tsl::htrie_map<char, field>& search_schema,
nlohmann::json& fields_json,
std::vector<field>& fields_vec);
static Option<bool> validate_and_init_embed_field(const tsl::htrie_map<char, field>& search_schema,
nlohmann::json& field_json,
const nlohmann::json& fields_json,
field& the_field);
static bool flatten_obj(nlohmann::json& doc, nlohmann::json& value, bool has_array, bool has_obj_array,
bool is_update, const field& the_field, const std::string& flat_name,

View File

@ -532,7 +532,7 @@ private:
static void handle_doc_ops(const tsl::htrie_map<char, field>& search_schema,
nlohmann::json& update_doc, const nlohmann::json& old_doc);
static void get_doc_changes(const index_operation_t op, const tsl::htrie_map<char, field>& search_schema,
static void get_doc_changes(const index_operation_t op, const tsl::htrie_map<char, field>& embedding_fields,
nlohmann::json &update_doc, const nlohmann::json &old_doc, nlohmann::json &new_doc,
nlohmann::json &del_doc);

View File

@ -72,6 +72,10 @@ public:
Option<bool> validate_and_init_local_model(const nlohmann::json& model_config, size_t& num_dims);
Option<bool> validate_and_init_model(const nlohmann::json& model_config, size_t& num_dims);
std::unordered_map<std::string, std::shared_ptr<TextEmbedder>> _get_text_embedders() {
return text_embedders;
}
private:
TextEmbedderManager() = default;

View File

@ -31,6 +31,7 @@ class RemoteEmbedder {
virtual nlohmann::json get_error_json(const nlohmann::json& req_body, long res_code, const std::string& res_body) = 0;
virtual embedding_res_t Embed(const std::string& text, const size_t remote_embedder_timeout_ms = 30000, const size_t remote_embedding_num_tries = 2) = 0;
virtual std::vector<embedding_res_t> batch_embed(const std::vector<std::string>& inputs, const size_t remote_embedding_batch_size = 200) = 0;
static const std::string get_model_key(const nlohmann::json& model_config);
static void init(ReplicationState* rs) {
raft_server = rs;
}
@ -51,6 +52,7 @@ class OpenAIEmbedder : public RemoteEmbedder {
embedding_res_t Embed(const std::string& text, const size_t remote_embedder_timeout_ms = 30000, const size_t remote_embedding_num_tries = 2) override;
std::vector<embedding_res_t> batch_embed(const std::vector<std::string>& inputs, const size_t remote_embedding_batch_size = 200) override;
nlohmann::json get_error_json(const nlohmann::json& req_body, long res_code, const std::string& res_body) override;
static std::string get_model_key(const nlohmann::json& model_config);
};
@ -68,6 +70,7 @@ class GoogleEmbedder : public RemoteEmbedder {
embedding_res_t Embed(const std::string& text, const size_t remote_embedder_timeout_ms = 30000, const size_t remote_embedding_num_tries = 2) override;
std::vector<embedding_res_t> batch_embed(const std::vector<std::string>& inputs, const size_t remote_embedding_batch_size = 200) override;
nlohmann::json get_error_json(const nlohmann::json& req_body, long res_code, const std::string& res_body) override;
static std::string get_model_key(const nlohmann::json& model_config);
};
@ -95,6 +98,7 @@ class GCPEmbedder : public RemoteEmbedder {
embedding_res_t Embed(const std::string& text, const size_t remote_embedder_timeout_ms = 30000, const size_t remote_embedding_num_tries = 2) override;
std::vector<embedding_res_t> batch_embed(const std::vector<std::string>& inputs, const size_t remote_embedding_batch_size = 200) override;
nlohmann::json get_error_json(const nlohmann::json& req_body, long res_code, const std::string& res_body) override;
static std::string get_model_key(const nlohmann::json& model_config);
};

View File

@ -88,5 +88,5 @@ public:
bool should_skip_char(char c);
static void normalize_ascii(std::string& text);
static std::string normalize_ascii_no_spaces(const std::string& text);
};

View File

@ -787,6 +787,10 @@ public:
cors_domains.insert(cors_values_vec.begin(), cors_values_vec.end());
}
void set_enable_search_analytics(bool enable_search_analytics) {
this->enable_search_analytics = enable_search_analytics;
}
// validation
Option<bool> is_valid() {

View File

@ -203,7 +203,7 @@ Option<bool> AnalyticsManager::remove_popular_queries_index(const std::string &n
return Option<bool>(true);
}
void AnalyticsManager::add_suggestion(const std::string &query_collection, std::string &query,
void AnalyticsManager::add_suggestion(const std::string &query_collection, const std::string& query,
const bool live_query, const std::string& user_id) {
// look up suggestion collections for the query collection
std::unique_lock lock(mutex);
@ -212,7 +212,6 @@ void AnalyticsManager::add_suggestion(const std::string &query_collection, std::
for(const auto& suggestion_collection: suggestion_collections_it->second) {
const auto& popular_queries_it = popular_queries.find(suggestion_collection);
if(popular_queries_it != popular_queries.end()) {
Tokenizer::normalize_ascii(query);
popular_queries_it->second->add(query, live_query, user_id);
}
}
@ -235,6 +234,8 @@ void AnalyticsManager::run(ReplicationState* raft_server) {
}
persist_suggestions(raft_server, prev_persistence_s);
prev_persistence_s = std::chrono::duration_cast<std::chrono::seconds>(
std::chrono::system_clock::now().time_since_epoch()).count();
lk.unlock();
}
@ -270,8 +271,6 @@ void AnalyticsManager::persist_suggestions(ReplicationState *raft_server, uint64
continue;
}
prev_persistence_s = now_ts_seconds;
std::string import_payload;
popularQueries->serialize_as_docs(import_payload);

View File

@ -52,12 +52,6 @@ Collection::Collection(const std::string& name, const uint32_t collection_id, co
symbols_to_index(to_char_array(symbols_to_index)), token_separators(to_char_array(token_separators)),
index(init_index()) {
for (auto const& field: fields) {
if (field.embed.count(fields::from) != 0) {
embedding_fields.emplace(field.name, field);
}
}
this->num_documents = 0;
}
@ -3942,7 +3936,6 @@ Option<bool> Collection::alter(nlohmann::json& alter_payload) {
}
}
// hide credentials in the alter payload return
for(auto& field_json : alter_payload["fields"]) {
if(field_json[fields::embed].count(fields::model_config) != 0) {
@ -3955,8 +3948,6 @@ Option<bool> Collection::alter(nlohmann::json& alter_payload) {
}
}
return Option<bool>(true);
}
@ -4165,7 +4156,6 @@ Option<bool> Collection::validate_alter_payload(nlohmann::json& schema_changes,
}
std::unordered_map<std::string, field> new_dynamic_fields;
std::vector<std::pair<size_t, size_t>> embed_json_field_indices;
int json_array_index = -1;
for(const auto& kv: schema_changes["fields"].items()) {
@ -4253,7 +4243,7 @@ Option<bool> Collection::validate_alter_payload(nlohmann::json& schema_changes,
return parse_op;
}
const auto& f = diff_fields.back();
auto& f = diff_fields.back();
if(f.is_dynamic()) {
new_dynamic_fields[f.name] = f;
@ -4261,6 +4251,14 @@ Option<bool> Collection::validate_alter_payload(nlohmann::json& schema_changes,
updated_search_schema[f.name] = f;
}
if(!f.embed.empty()) {
auto validate_res = field::validate_and_init_embed_field(search_schema, schema_changes["fields"][json_array_index], schema_changes["fields"], f);
if(!validate_res.ok()) {
return validate_res;
}
}
if(is_reindex) {
reindex_fields.push_back(f);
} else {
@ -4295,9 +4293,7 @@ Option<bool> Collection::validate_alter_payload(nlohmann::json& schema_changes,
}
}
if(!f.embed.empty() && !diff_fields.empty()) {
embed_json_field_indices.emplace_back(json_array_index, diff_fields.size()-1);
}
} else {
// partial update is not supported for now
@ -4307,12 +4303,6 @@ Option<bool> Collection::validate_alter_payload(nlohmann::json& schema_changes,
}
}
auto validation_op = field::validate_and_init_embed_fields(embed_json_field_indices, search_schema,
schema_changes["fields"], diff_fields);
if(!validation_op.ok()) {
return validation_op;
}
if(num_auto_detect_fields > 1) {
return Option<bool>(400, "There can be only one field named `.*`.");
}
@ -4904,27 +4894,43 @@ Option<bool> Collection::populate_include_exclude_fields_lk(const spp::sparse_ha
}
// Removes the dropped field from embed_from of all embedding fields.
void Collection::process_remove_field_for_embedding_fields(const field& the_field, std::vector<field>& garbage_fields) {
void Collection::process_remove_field_for_embedding_fields(const field& del_field,
std::vector<field>& garbage_embed_fields) {
for(auto& field : fields) {
if(field.embed.count(fields::from) == 0) {
continue;
}
auto embed_from = field.embed[fields::from].get<std::vector<std::string>>();
embed_from.erase(std::remove_if(embed_from.begin(), embed_from.end(), [&the_field](std::string field_name) {
return the_field.name == field_name;
}));
field.embed[fields::from] = std::move(embed_from);
embedding_fields[field.name] = field;
// mark this embedding field as "garbage" if it has no more embed_from fields
if(embed_from.empty()) {
embedding_fields.erase(field.name);
garbage_fields.push_back(field);
bool found_field = false;
nlohmann::json& embed_from_names = field.embed[fields::from];
for(auto it = embed_from_names.begin(); it != embed_from_names.end();) {
if(it.value() == del_field.name) {
it = embed_from_names.erase(it);
found_field = true;
} else {
it++;
}
}
if(found_field) {
// mark this embedding field as "garbage" if it has no more embed_from fields
if(embed_from_names.empty()) {
garbage_embed_fields.push_back(field);
} else {
// the dropped field was present in `embed_from`, so we have to update the field objects
field.embed[fields::from] = embed_from_names;
embedding_fields[field.name].embed[fields::from] = embed_from_names;
}
}
}
for(auto& garbage_field: garbage_embed_fields) {
embedding_fields.erase(garbage_field.name);
search_schema.erase(garbage_field.name);
fields.erase(std::remove_if(fields.begin(), fields.end(), [&garbage_field](const auto &f) {
return f.name == garbage_field.name;
}), fields.end());
}
}
void Collection::hide_credential(nlohmann::json& json, const std::string& credential_name) {
@ -4939,10 +4945,15 @@ void Collection::hide_credential(nlohmann::json& json, const std::string& creden
}
}
}
Option<bool> Collection::truncate_after_top_k(const string &field_name, size_t k) {
std::shared_lock slock(mutex);
std::vector<uint32_t> seq_ids;
auto op = index->seq_ids_outside_top_k(field_name, k, seq_ids);
slock.unlock();
if(!op.ok()) {
return op;
}

View File

@ -766,7 +766,9 @@ Option<bool> CollectionManager::do_search(std::map<std::string, std::string>& re
nlohmann::json preset;
const auto& preset_op = CollectionManager::get_instance().get_preset(preset_it->second, preset);
if(preset_op.ok()) {
// NOTE: we merge only single preset configuration because multi ("searches") preset value replaces
// the request body directly before we reach this single search request function.
if(preset_op.ok() && !preset.contains("searches")) {
if(!preset.is_object()) {
return Option<bool>(400, "Search preset is not an object.");
}
@ -1112,7 +1114,7 @@ Option<bool> CollectionManager::do_search(std::map<std::string, std::string>& re
if(Config::get_instance().get_enable_search_analytics()) {
if(result.count("found") != 0 && result["found"].get<size_t>() != 0) {
std::string analytics_query = raw_query;
std::string analytics_query = Tokenizer::normalize_ascii_no_spaces(raw_query);
AnalyticsManager::get_instance().add_suggestion(orig_coll_name, analytics_query,
true, req_params["x-typesense-user-id"]);
}

View File

@ -729,7 +729,7 @@ bool get_export_documents(const std::shared_ptr<http_req>& req, const std::share
}
}
res->content_type_header = "text/plain; charset=utf8";
res->content_type_header = "text/plain; charset=utf-8";
res->status_code = 200;
stream_response(req, res);
@ -902,7 +902,7 @@ bool post_import_documents(const std::shared_ptr<http_req>& req, const std::shar
}
}
res->content_type_header = "text/plain; charset=utf8";
res->content_type_header = "text/plain; charset=utf-8";
res->status_code = 200;
res->body = response_stream.str();

View File

@ -1083,7 +1083,7 @@ void field::compact_nested_fields(tsl::htrie_map<char, field>& nested_fields) {
Option<bool> field::json_fields_to_fields(bool enable_nested_fields, nlohmann::json &fields_json, string &fallback_field_type,
std::vector<field>& the_fields) {
size_t num_auto_detect_fields = 0;
std::vector<std::pair<size_t, size_t>> embed_json_field_indices;
const tsl::htrie_map<char, field> dummy_search_schema;
for(size_t i = 0; i < fields_json.size(); i++) {
nlohmann::json& field_json = fields_json[i];
@ -1094,17 +1094,13 @@ Option<bool> field::json_fields_to_fields(bool enable_nested_fields, nlohmann::j
}
if(!the_fields.empty() && !the_fields.back().embed.empty()) {
embed_json_field_indices.emplace_back(i, i);
auto validate_res = validate_and_init_embed_field(dummy_search_schema, field_json, fields_json, the_fields.back());
if(!validate_res.ok()) {
return validate_res;
}
}
}
const tsl::htrie_map<char, field> dummy_search_schema;
auto validation_op = field::validate_and_init_embed_fields(embed_json_field_indices, dummy_search_schema,
fields_json, the_fields);
if(!validation_op.ok()) {
return validation_op;
}
if(num_auto_detect_fields > 1) {
return Option<bool>(400,"There can be only one field named `.*`.");
}
@ -1112,49 +1108,47 @@ Option<bool> field::json_fields_to_fields(bool enable_nested_fields, nlohmann::j
return Option<bool>(true);
}
Option<bool> field::validate_and_init_embed_fields(const std::vector<std::pair<size_t, size_t>>& embed_json_field_indices,
const tsl::htrie_map<char, field>& search_schema,
nlohmann::json& fields_json,
std::vector<field>& fields_vec) {
for(const auto& json_field_index: embed_json_field_indices) {
auto& field_json = fields_json[json_field_index.first];
const std::string err_msg = "Property `" + fields::embed + "." + fields::from +
Option<bool> field::validate_and_init_embed_field(const tsl::htrie_map<char, field>& search_schema, nlohmann::json& field_json,
const nlohmann::json& fields_json,
field& the_field) {
const std::string err_msg = "Property `" + fields::embed + "." + fields::from +
"` can only refer to string or string array fields.";
for(auto& field_name : field_json[fields::embed][fields::from].get<std::vector<std::string>>()) {
auto embed_field = std::find_if(fields_json.begin(), fields_json.end(), [&field_name](const nlohmann::json& x) {
return x["name"].get<std::string>() == field_name;
});
for(auto& field_name : field_json[fields::embed][fields::from].get<std::vector<std::string>>()) {
if(embed_field == fields_json.end()) {
const auto& embed_field2 = search_schema.find(field_name);
if (embed_field2 == search_schema.end()) {
return Option<bool>(400, err_msg);
} else if (embed_field2->type != field_types::STRING && embed_field2->type != field_types::STRING_ARRAY) {
return Option<bool>(400, err_msg);
}
} else if((*embed_field)[fields::type] != field_types::STRING &&
(*embed_field)[fields::type] != field_types::STRING_ARRAY) {
auto embed_field = std::find_if(fields_json.begin(), fields_json.end(), [&field_name](const nlohmann::json& x) {
return x["name"].get<std::string>() == field_name;
});
if(embed_field == fields_json.end()) {
const auto& embed_field2 = search_schema.find(field_name);
if (embed_field2 == search_schema.end()) {
return Option<bool>(400, err_msg);
} else if (embed_field2->type != field_types::STRING && embed_field2->type != field_types::STRING_ARRAY) {
return Option<bool>(400, err_msg);
}
} else if((*embed_field)[fields::type] != field_types::STRING &&
(*embed_field)[fields::type] != field_types::STRING_ARRAY) {
return Option<bool>(400, err_msg);
}
const auto& model_config = field_json[fields::embed][fields::model_config];
size_t num_dim = 0;
auto res = TextEmbedderManager::get_instance().validate_and_init_model(model_config, num_dim);
if(!res.ok()) {
return Option<bool>(res.code(), res.error());
}
LOG(INFO) << "Model init done.";
field_json[fields::num_dim] = num_dim;
fields_vec[json_field_index.second].num_dim = num_dim;
}
const auto& model_config = field_json[fields::embed][fields::model_config];
size_t num_dim = 0;
auto res = TextEmbedderManager::get_instance().validate_and_init_model(model_config, num_dim);
if(!res.ok()) {
return Option<bool>(res.code(), res.error());
}
LOG(INFO) << "Model init done.";
field_json[fields::num_dim] = num_dim;
the_field.num_dim = num_dim;
return Option<bool>(true);
}
void filter_result_t::and_filter_results(const filter_result_t& a, const filter_result_t& b, filter_result_t& result) {
auto lenA = a.count, lenB = b.count;
if (lenA == 0 || lenB == 0) {

View File

@ -454,7 +454,7 @@ void Index::validate_and_preprocess(Index *index, std::vector<index_record>& ite
if(index_rec.is_update) {
// scrub string fields to reduce delete ops
get_doc_changes(index_rec.operation, search_schema, index_rec.doc, index_rec.old_doc,
get_doc_changes(index_rec.operation, embedding_fields, index_rec.doc, index_rec.old_doc,
index_rec.new_doc, index_rec.del_doc);
if(generate_embeddings) {
@ -870,12 +870,16 @@ void Index::index_field_in_memory(const field& afield, std::vector<index_record>
try {
const std::vector<float>& float_vals = record.doc[afield.name].get<std::vector<float>>();
if(afield.vec_dist == cosine) {
std::vector<float> normalized_vals(afield.num_dim);
hnsw_index_t::normalize_vector(float_vals, normalized_vals);
vec_index->addPoint(normalized_vals.data(), (size_t)record.seq_id, true);
if(float_vals.size() != afield.num_dim) {
record.index_failure(400, "Vector size mismatch.");
} else {
vec_index->addPoint(float_vals.data(), (size_t)record.seq_id, true);
if(afield.vec_dist == cosine) {
std::vector<float> normalized_vals(afield.num_dim);
hnsw_index_t::normalize_vector(float_vals, normalized_vals);
vec_index->addPoint(normalized_vals.data(), (size_t)record.seq_id, true);
} else {
vec_index->addPoint(float_vals.data(), (size_t)record.seq_id, true);
}
}
} catch(const std::exception &e) {
record.index_failure(400, e.what());
@ -3200,8 +3204,8 @@ Option<bool> Index::search(std::vector<query_tokens_t>& field_query_tokens, cons
for(size_t res_index = 0; res_index < vec_results.size(); res_index++) {
auto& vec_result = vec_results[res_index];
auto doc_id = vec_result.first;
auto result_it = topster->kv_map.find(doc_id);
auto seq_id = vec_result.first;
auto result_it = topster->kv_map.find(seq_id);
if(result_it != topster->kv_map.end()) {
if(result_it->second->match_score_index < 0 || result_it->second->match_score_index > 2) {
@ -3210,22 +3214,23 @@ Option<bool> Index::search(std::vector<query_tokens_t>& field_query_tokens, cons
// result overlaps with keyword search: we have to combine the scores
auto result = result_it->second;
KV* kv = result_it->second;
// old_score + (1 / rank_of_document) * WEIGHT)
result->vector_distance = vec_result.second;
result->text_match_score = result->scores[result->match_score_index];
kv->vector_distance = vec_result.second;
kv->text_match_score = kv->scores[kv->match_score_index];
int64_t match_score = float_to_int64_t(
(int64_t_to_float(result->scores[result->match_score_index])) +
(int64_t_to_float(kv->scores[kv->match_score_index])) +
((1.0 / (res_index + 1)) * VECTOR_SEARCH_WEIGHT));
int64_t match_score_index = -1;
int64_t scores[3] = {0};
compute_sort_scores(sort_fields_std, sort_order, field_values, geopoint_indices, doc_id, 0, match_score, scores, match_score_index, vec_result.second);
compute_sort_scores(sort_fields_std, sort_order, field_values, geopoint_indices, seq_id, 0,
match_score, scores, match_score_index, vec_result.second);
for(int i = 0; i < 3; i++) {
result->scores[i] = scores[i];
kv->scores[i] = scores[i];
}
result->match_score_index = match_score_index;
kv->match_score_index = match_score_index;
} else {
// Result has been found only in vector search: we have to add it to both KV and result_ids
@ -3233,12 +3238,21 @@ Option<bool> Index::search(std::vector<query_tokens_t>& field_query_tokens, cons
int64_t scores[3] = {0};
int64_t match_score = float_to_int64_t((1.0 / (res_index + 1)) * VECTOR_SEARCH_WEIGHT);
int64_t match_score_index = -1;
compute_sort_scores(sort_fields_std, sort_order, field_values, geopoint_indices, doc_id, 0, match_score, scores, match_score_index, vec_result.second);
KV kv(searched_queries.size(), doc_id, doc_id, match_score_index, scores);
compute_sort_scores(sort_fields_std, sort_order, field_values, geopoint_indices, seq_id, 0, match_score, scores, match_score_index, vec_result.second);
uint64_t distinct_id = seq_id;
if (group_limit != 0) {
distinct_id = get_distinct_id(group_by_fields, seq_id);
if(excluded_group_ids.count(distinct_id) != 0) {
continue;
}
}
KV kv(searched_queries.size(), seq_id, distinct_id, match_score_index, scores);
kv.text_match_score = 0;
kv.vector_distance = vec_result.second;
topster->add(&kv);
vec_search_ids.push_back(doc_id);
vec_search_ids.push_back(seq_id);
}
}
@ -3967,8 +3981,6 @@ void Index::search_across_fields(const std::vector<token_t>& query_tokens,
dropped_token_its.push_back(std::move(token_fields));
}
// one iterator for each token, each underlying iterator contains results of token across multiple fields
std::vector<or_iterator_t> token_its;
@ -4060,6 +4072,28 @@ void Index::search_across_fields(const std::vector<token_t>& query_tokens,
}
}
size_t query_len = query_tokens.size();
// check if seq_id exists in any of the dropped_token iters
for(size_t ti = 0; ti < dropped_token_its.size(); ti++) {
or_iterator_t& token_fields_iters = dropped_token_its[ti];
if(token_fields_iters.skip_to(seq_id) && token_fields_iters.id() == seq_id) {
query_len++;
const std::vector<posting_list_t::iterator_t>& field_iters = token_fields_iters.get_its();
for(size_t fi = 0; fi < field_iters.size(); fi++) {
const posting_list_t::iterator_t& field_iter = field_iters[fi];
if(field_iter.id() == seq_id) {
// not all fields might contain a given token
field_to_tokens[field_iter.get_field_id()].push_back(field_iter.clone());
}
}
}
}
if(syn_orig_num_tokens != -1) {
query_len = syn_orig_num_tokens;
}
int64_t best_field_match_score = 0, best_field_weight = 0;
uint32_t num_matching_fields = 0;
@ -4113,18 +4147,6 @@ void Index::search_across_fields(const std::vector<token_t>& query_tokens,
compute_sort_scores(sort_fields, sort_order, field_values, geopoint_indices, seq_id, filter_index,
best_field_match_score, scores, match_score_index);
size_t query_len = query_tokens.size();
// check if seq_id exists in any of the dropped_token iters and increment matching fields accordingly
for(auto& dropped_token_it: dropped_token_its) {
if(dropped_token_it.skip_to(seq_id) && dropped_token_it.id() == seq_id) {
query_len++;
}
}
if(syn_orig_num_tokens != -1) {
query_len = syn_orig_num_tokens;
}
query_len = std::min<size_t>(15, query_len);
// NOTE: `query_len` is total tokens matched across fields.
@ -6244,7 +6266,7 @@ void Index::handle_doc_ops(const tsl::htrie_map<char, field>& search_schema,
}
}
void Index::get_doc_changes(const index_operation_t op, const tsl::htrie_map<char, field>& search_schema,
void Index::get_doc_changes(const index_operation_t op, const tsl::htrie_map<char, field>& embedding_fields,
nlohmann::json& update_doc, const nlohmann::json& old_doc, nlohmann::json& new_doc,
nlohmann::json& del_doc) {
@ -6257,7 +6279,12 @@ void Index::get_doc_changes(const index_operation_t op, const tsl::htrie_map<cha
}
if(!update_doc.contains(it.key())) {
del_doc[it.key()] = it.value();
// embedding field won't be part of upsert doc so populate new doc with the value from old doc
if(embedding_fields.count(it.key()) != 0) {
new_doc[it.key()] = it.value();
} else {
del_doc[it.key()] = it.value();
}
}
}
} else {
@ -6311,9 +6338,10 @@ size_t Index::num_seq_ids() const {
Option<bool> Index::seq_ids_outside_top_k(const std::string& field_name, size_t k,
std::vector<uint32_t>& outside_seq_ids) {
std::shared_lock lock(mutex);
auto field_it = numerical_index.find(field_name);
if(field_it == sort_index.end()) {
if(field_it == numerical_index.end()) {
return Option<bool>(400, "Field not found in numerical index.");
}

View File

@ -117,6 +117,11 @@ embedding_res_t TextEmbedder::Embed(const std::string& text, const size_t remote
input_shapes.push_back({1, static_cast<int64_t>(encoded_input.input_ids.size())});
input_shapes.push_back({1, static_cast<int64_t>(encoded_input.attention_mask.size())});
if(session_->GetInputCount() == 3) {
// edge case: xlm_roberta does not have token_type_ids, but if the model has it as input, we need to fill it with 0s
if(encoded_input.token_type_ids.size() == 0) {
encoded_input.token_type_ids.resize(encoded_input.input_ids.size(), 0);
}
input_shapes.push_back({1, static_cast<int64_t>(encoded_input.token_type_ids.size())});
}
input_tensors.push_back(Ort::Value::CreateTensor<int64_t>(memory_info, encoded_input.input_ids.data(), encoded_input.input_ids.size(), input_shapes[0].data(), input_shapes[0].size()));

View File

@ -43,9 +43,10 @@ Option<bool> TextEmbedderManager::validate_and_init_remote_model(const nlohmann:
}
std::unique_lock<std::mutex> lock(text_embedders_mutex);
auto text_embedder_it = text_embedders.find(model_name);
std::string model_key = is_remote_model(model_name) ? RemoteEmbedder::get_model_key(model_config) : model_name;
auto text_embedder_it = text_embedders.find(model_key);
if(text_embedder_it == text_embedders.end()) {
text_embedders.emplace(model_name, std::make_shared<TextEmbedder>(model_config, num_dims));
text_embedders.emplace(model_key, std::make_shared<TextEmbedder>(model_config, num_dims));
}
return Option<bool>(true);
@ -122,7 +123,8 @@ Option<bool> TextEmbedderManager::validate_and_init_local_model(const nlohmann::
Option<TextEmbedder*> TextEmbedderManager::get_text_embedder(const nlohmann::json& model_config) {
std::unique_lock<std::mutex> lock(text_embedders_mutex);
const std::string& model_name = model_config.at("model_name");
auto text_embedder_it = text_embedders.find(model_name);
std::string model_key = is_remote_model(model_name) ? RemoteEmbedder::get_model_key(model_config) : model_name;
auto text_embedder_it = text_embedders.find(model_key);
if(text_embedder_it == text_embedders.end()) {
return Option<TextEmbedder*>(404, "Text embedder was not found.");

View File

@ -53,6 +53,21 @@ long RemoteEmbedder::call_remote_api(const std::string& method, const std::strin
proxy_call_timeout_ms, true);
}
// Dispatches model-key construction to the concrete remote embedder type,
// selected by the namespace prefix of the configured model name.
// The returned key is used to index the text-embedder cache; an empty
// string is returned for an unrecognized namespace.
const std::string RemoteEmbedder::get_model_key(const nlohmann::json& model_config) {
    const std::string model_name = model_config["model_name"].get<std::string>();
    const std::string ns = TextEmbedderManager::get_model_namespace(model_name);

    if(ns == "openai") {
        return OpenAIEmbedder::get_model_key(model_config);
    }

    if(ns == "google") {
        return GoogleEmbedder::get_model_key(model_config);
    }

    if(ns == "gcp") {
        return GCPEmbedder::get_model_key(model_config);
    }

    return "";
}
OpenAIEmbedder::OpenAIEmbedder(const std::string& openai_model_path, const std::string& api_key) : api_key(api_key), openai_model_path(openai_model_path) {
}
@ -206,6 +221,7 @@ std::vector<embedding_res_t> OpenAIEmbedder::batch_embed(const std::vector<std::
}
nlohmann::json res_json;
try {
res_json = nlohmann::json::parse(res);
} catch (const std::exception& e) {
@ -217,8 +233,21 @@ std::vector<embedding_res_t> OpenAIEmbedder::batch_embed(const std::vector<std::
}
return outputs;
}
if(res_json.count("data") == 0 || !res_json["data"].is_array() || res_json["data"].size() != inputs.size()) {
std::vector<embedding_res_t> outputs;
for(size_t i = 0; i < inputs.size(); i++) {
outputs.push_back(embedding_res_t(500, "Got malformed response from OpenAI API."));
}
return outputs;
}
std::vector<embedding_res_t> outputs;
for(auto& data : res_json["data"]) {
if(data.count("embedding") == 0 || !data["embedding"].is_array() || data["embedding"].size() == 0) {
outputs.push_back(embedding_res_t(500, "Got malformed response from OpenAI API."));
continue;
}
outputs.push_back(embedding_res_t(data["embedding"].get<std::vector<float>>()));
}
@ -255,6 +284,9 @@ nlohmann::json OpenAIEmbedder::get_error_json(const nlohmann::json& req_body, lo
return embedding_res;
}
// Builds the cache key identifying this OpenAI embedder instance:
// "<model_name>:<api_key>", so the same model configured with different
// credentials maps to distinct embedder instances.
std::string OpenAIEmbedder::get_model_key(const nlohmann::json& model_config) {
    std::string key = model_config["model_name"].get<std::string>();
    key += ":";
    key += model_config["api_key"].get<std::string>();
    return key;
}
GoogleEmbedder::GoogleEmbedder(const std::string& google_api_key) : google_api_key(google_api_key) {
@ -372,6 +404,10 @@ nlohmann::json GoogleEmbedder::get_error_json(const nlohmann::json& req_body, lo
return embedding_res;
}
// Builds the cache key identifying this Google embedder instance:
// "<model_name>:<api_key>", so the same model configured with different
// credentials maps to distinct embedder instances.
std::string GoogleEmbedder::get_model_key(const nlohmann::json& model_config) {
    std::string key = model_config["model_name"].get<std::string>();
    key += ":";
    key += model_config["api_key"].get<std::string>();
    return key;
}
GCPEmbedder::GCPEmbedder(const std::string& project_id, const std::string& model_name, const std::string& access_token,
const std::string& refresh_token, const std::string& client_id, const std::string& client_secret) :
@ -555,7 +591,20 @@ std::vector<embedding_res_t> GCPEmbedder::batch_embed(const std::vector<std::str
return outputs;
}
std::vector<embedding_res_t> outputs;
if(res_json.count("predictions") == 0 || !res_json["predictions"].is_array() || res_json["predictions"].size() != inputs.size()) {
std::vector<embedding_res_t> outputs;
for(size_t i = 0; i < inputs.size(); i++) {
outputs.push_back(embedding_res_t(500, "Got malformed response from GCP API."));
}
return outputs;
}
for(const auto& prediction : res_json["predictions"]) {
if(prediction.count("embeddings") == 0 || !prediction["embeddings"].is_object() || prediction["embeddings"].count("values") == 0 || !prediction["embeddings"]["values"].is_array() || prediction["embeddings"]["values"].size() == 0) {
outputs.push_back(embedding_res_t(500, "Got malformed response from GCP API."));
continue;
}
outputs.push_back(embedding_res_t(prediction["embeddings"]["values"].get<std::vector<float>>()));
}
@ -625,3 +674,7 @@ Option<std::string> GCPEmbedder::generate_access_token(const std::string& refres
return Option<std::string>(access_token);
}
// Builds the cache key identifying this GCP embedder instance:
// "<model_name>:<project_id>:<client_secret>", so the same model used
// under different projects/credentials maps to distinct embedder instances.
std::string GCPEmbedder::get_model_key(const nlohmann::json& model_config) {
    std::string key = model_config["model_name"].get<std::string>();
    key.append(":").append(model_config["project_id"].get<std::string>());
    key.append(":").append(model_config["client_secret"].get<std::string>());
    return key;
}

View File

@ -1,5 +1,6 @@
#include <sstream>
#include <algorithm>
#include <string_utils.h>
#include "tokenizer.h"
Tokenizer::Tokenizer(const std::string& input, bool normalize, bool no_op, const std::string& locale,
@ -349,10 +350,15 @@ bool Tokenizer::should_skip_char(char c) {
return is_ascii_char(c) && get_stream_mode(c) != INDEX;
}
void Tokenizer::normalize_ascii(std::string& text) {
for(size_t i = 0; i < text.size(); i++) {
std::string Tokenizer::normalize_ascii_no_spaces(const std::string& text) {
std::string analytics_query = text;
StringUtils::trim(analytics_query);
for(size_t i = 0; i < analytics_query.size(); i++) {
if(is_ascii_char(text[i])) {
text[i] = std::tolower(text[i]);
analytics_query[i] = std::tolower(analytics_query[i]);
}
}
return analytics_query;
}

View File

@ -118,7 +118,7 @@ int init_root_logger(Config & config, const std::string & server_version) {
if(log_dir.empty()) {
// use console logger if log dir is not specified
FLAGS_logtostdout = true;
FLAGS_logtostderr = true;
} else {
if(!directory_exists(log_dir)) {
std::cerr << "Typesense failed to start. " << "Log directory " << log_dir << " does not exist.";

View File

@ -3,6 +3,7 @@
#include <vector>
#include <fstream>
#include <collection_manager.h>
#include <analytics_manager.h>
#include "string_utils.h"
#include "collection.h"
@ -24,6 +25,8 @@ protected:
collectionManager.init(store, 1.0, "auth_key", quit);
collectionManager.load(8, 1000);
AnalyticsManager::get_instance().init(store);
schema = R"({
"name": "collection1",
"enable_nested_fields": true,
@ -587,6 +590,67 @@ TEST_F(CollectionManagerTest, VerifyEmbeddedParametersOfScopedAPIKey) {
collectionManager.drop_collection("coll1");
}
// Verifies that queries recorded by the analytics manager for popular-query
// suggestions are trimmed of surrounding whitespace (" tom " -> "tom",
// " " -> "").
TEST_F(CollectionManagerTest, QuerySuggestionsShouldBeTrimmed) {
std::vector<field> fields = {field("title", field_types::STRING, false, false, true, "", -1, 1),
field("year", field_types::INT32, false),
field("points", field_types::INT32, false),};
Collection* coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get();
nlohmann::json doc1;
doc1["id"] = "0";
doc1["title"] = "Tom Sawyer";
doc1["year"] = 1876;
doc1["points"] = 100;
ASSERT_TRUE(coll1->add(doc1.dump()).ok());
// search analytics must be enabled for suggestions to be collected
Config::get_instance().set_enable_search_analytics(true);
nlohmann::json analytics_rule = R"({
"name": "top_search_queries",
"type": "popular_queries",
"params": {
"limit": 100,
"source": {
"collections": ["coll1"]
},
"destination": {
"collection": "top_queries"
}
}
})"_json;
auto create_op = AnalyticsManager::get_instance().create_rule(analytics_rule, false, true);
ASSERT_TRUE(create_op.ok());
nlohmann::json embedded_params;
std::map<std::string, std::string> req_params;
req_params["collection"] = "coll1";
// query padded with spaces on both sides
req_params["q"] = " tom ";
req_params["query_by"] = "title";
std::string json_res;
auto now_ts = std::chrono::duration_cast<std::chrono::microseconds>(
std::chrono::system_clock::now().time_since_epoch()).count();
auto search_op = collectionManager.do_search(req_params, embedded_params, json_res, now_ts);
ASSERT_TRUE(search_op.ok());
json_res.clear();
// whitespace-only query should be recorded as an empty string
req_params["q"] = " ";
search_op = collectionManager.do_search(req_params, embedded_params, json_res, now_ts);
ASSERT_TRUE(search_op.ok());
// check that suggestions have been trimmed
auto popular_queries = AnalyticsManager::get_instance().get_popular_queries();
ASSERT_EQ(2, popular_queries["top_queries"]->get_user_prefix_queries()[""].size());
ASSERT_EQ("tom", popular_queries["top_queries"]->get_user_prefix_queries()[""][0].query);
ASSERT_EQ("", popular_queries["top_queries"]->get_user_prefix_queries()[""][1].query);
collectionManager.drop_collection("coll1");
}
TEST_F(CollectionManagerTest, RestoreAutoSchemaDocsOnRestart) {
Collection *coll1;

View File

@ -1566,6 +1566,14 @@ TEST_F(CollectionSchemaChangeTest, UpdateSchemaWithNewEmbeddingField) {
ASSERT_TRUE(res.ok());
ASSERT_EQ(1, coll->get_embedding_fields().size());
auto search_schema = coll->get_schema();
auto embedding_field_it = search_schema.find("embedding");
ASSERT_TRUE(embedding_field_it != coll->get_schema().end());
ASSERT_EQ("embedding", embedding_field_it.value().name);
ASSERT_EQ("float[]", embedding_field_it.value().type);
ASSERT_EQ(384, embedding_field_it.value().num_dim);
nlohmann::json doc;
doc["names"] = {"hello", "world"};
auto add_op = coll->add(doc.dump());
@ -1580,9 +1588,13 @@ TEST_F(CollectionSchemaChangeTest, DropFieldUsedForEmbedding) {
nlohmann::json schema = R"({
"name": "objects",
"fields": [
{"name": "names", "type": "string[]"},
{"name": "category", "type":"string"},
{"name": "embedding", "type":"float[]", "embed":{"from": ["names","category"], "model_config": {"model_name": "ts/e5-small"}}}
{"name": "title", "type": "string"},
{"name": "names", "type": "string[]"},
{"name": "category", "type":"string"},
{"name": "embedding", "type":"float[]", "embed":{"from": ["names","category"],
"model_config": {"model_name": "ts/e5-small"}}},
{"name": "embedding2", "type":"float[]", "embed":{"from": ["names"],
"model_config": {"model_name": "ts/e5-small"}}}
]
})"_json;
@ -1594,20 +1606,28 @@ TEST_F(CollectionSchemaChangeTest, DropFieldUsedForEmbedding) {
LOG(INFO) << "Created collection";
auto embedding_fields = coll->get_embedding_fields();
ASSERT_EQ(2, embedding_fields.size());
ASSERT_EQ(2, embedding_fields["embedding"].embed[fields::from].get<std::vector<std::string>>().size());
ASSERT_EQ(1, embedding_fields["embedding2"].embed[fields::from].get<std::vector<std::string>>().size());
auto coll_schema = coll->get_schema();
ASSERT_EQ(5, coll_schema.size());
auto the_fields = coll->get_fields();
ASSERT_EQ(5, the_fields.size());
auto schema_changes = R"({
"fields": [
{"name": "names", "drop": true}
]
})"_json;
auto embedding_fields = coll->get_embedding_fields();
ASSERT_EQ(2, embedding_fields["embedding"].embed[fields::from].get<std::vector<std::string>>().size());
auto alter_op = coll->alter(schema_changes);
ASSERT_TRUE(alter_op.ok());
embedding_fields = coll->get_embedding_fields();
ASSERT_EQ(1, embedding_fields.size());
ASSERT_EQ(1, embedding_fields["embedding"].embed[fields::from].get<std::vector<std::string>>().size());
ASSERT_EQ("category", embedding_fields["embedding"].embed[fields::from].get<std::vector<std::string>>()[0]);
@ -1623,6 +1643,16 @@ TEST_F(CollectionSchemaChangeTest, DropFieldUsedForEmbedding) {
embedding_fields = coll->get_embedding_fields();
ASSERT_EQ(0, embedding_fields.size());
ASSERT_EQ(0, coll->_get_index()->_get_vector_index().size());
// only title remains
coll_schema = coll->get_schema();
ASSERT_EQ(1, coll_schema.size());
ASSERT_EQ("title", coll_schema["title"].name);
the_fields = coll->get_fields();
ASSERT_EQ(1, the_fields.size());
ASSERT_EQ("title", the_fields[0].name);
}
TEST_F(CollectionSchemaChangeTest, EmbeddingFieldsMapTest) {

View File

@ -1816,6 +1816,36 @@ TEST_F(CollectionSpecificMoreTest, ConsiderDroppedTokensDuringTextMatchScoring)
ASSERT_EQ("1", res["hits"][1]["document"]["id"].get<std::string>());
}
// Text-match scoring with dropped tokens: for the query "avène eau mineral"
// the test expects doc "1" (matching "Avène" and "Mineral") to rank above
// doc "0" (matching "Eau").
TEST_F(CollectionSpecificMoreTest, ConsiderDroppedTokensDuringTextMatchScoring2) {
nlohmann::json schema = R"({
"name": "coll1",
"fields": [
{"name": "name", "type": "string"}
]
})"_json;
Collection *coll1 = collectionManager.create_collection(schema).get();
nlohmann::json doc;
doc["id"] = "0";
doc["name"] = "Elizabeth Arden 5th Avenue Eau de Parfum 125ml";
ASSERT_TRUE(coll1->add(doc.dump()).ok());
doc["id"] = "1";
doc["name"] = "Avène Sun Very High Protection Mineral Cream SPF50+ 50ml";
ASSERT_TRUE(coll1->add(doc.dump()).ok());
// drop_tokens_threshold = 5, so tokens may be dropped to find matches
auto res = coll1->search("avène eau mineral", {"name"}, "", {}, {}, {2}, 10, 1, FREQUENCY, {true}, 5,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 20, {}, {}, {}, 0,
"<mark>", "</mark>", {3}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7, fallback,
4, {off}, 0, 0, 0, 2, false, "", true, 0, max_weight).get();
ASSERT_EQ(2, res["hits"].size());
ASSERT_EQ("1", res["hits"][0]["document"]["id"].get<std::string>());
ASSERT_EQ("0", res["hits"][1]["document"]["id"].get<std::string>());
}
TEST_F(CollectionSpecificMoreTest, NonNestedFieldNameWithDot) {
nlohmann::json schema = R"({
"name": "coll1",

View File

@ -224,6 +224,137 @@ TEST_F(CollectionVectorTest, BasicVectorQuerying) {
collectionManager.drop_collection("coll1");
}
// Re-indexing an unchanged document via UPSERT and EMPLACE must keep the
// document findable through vector search (the vector index entry survives
// the rewrite).
TEST_F(CollectionVectorTest, VectorUnchangedUpsert) {
nlohmann::json schema = R"({
"name": "coll1",
"fields": [
{"name": "title", "type": "string"},
{"name": "points", "type": "int32"},
{"name": "vec", "type": "float[]", "num_dim": 3}
]
})"_json;
Collection* coll1 = collectionManager.create_collection(schema).get();
std::vector<float> vec = {0.12, 0.45, 0.64};
nlohmann::json doc;
doc["id"] = "0";
doc["title"] = "Title";
doc["points"] = 100;
doc["vec"] = vec;
auto add_op = coll1->add(doc.dump());
ASSERT_TRUE(add_op.ok());
auto results = coll1->search("*", {}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
"", 10, {}, {}, {}, 0,
"<mark>", "</mark>", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7, fallback,
4, {off}, 32767, 32767, 2,
false, true, "vec:([0.12, 0.44, 0.55])").get();
ASSERT_EQ(1, results["found"].get<size_t>());
// upsert unchanged doc
add_op = coll1->add(doc.dump(), index_operation_t::UPSERT);
ASSERT_TRUE(add_op.ok());
results = coll1->search("*", {}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
"", 10, {}, {}, {}, 0,
"<mark>", "</mark>", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7, fallback,
4, {off}, 32767, 32767, 2,
false, true, "vec:([0.12, 0.44, 0.55])").get();
ASSERT_EQ(1, results["found"].get<size_t>());
// emplace unchanged doc
add_op = coll1->add(doc.dump(), index_operation_t::EMPLACE);
ASSERT_TRUE(add_op.ok());
results = coll1->search("*", {}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
"", 10, {}, {}, {}, 0,
"<mark>", "</mark>", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7, fallback,
4, {off}, 32767, 32767, 2,
false, true, "vec:([0.12, 0.44, 0.55])").get();
ASSERT_EQ(1, results["found"].get<size_t>());
}
// Partial updates (EMPLACE / UPDATE) that omit the "vec" field must preserve
// the previously indexed vector, so the doc remains reachable via vector
// search afterwards.
TEST_F(CollectionVectorTest, VectorPartialUpdate) {
nlohmann::json schema = R"({
"name": "coll1",
"fields": [
{"name": "title", "type": "string"},
{"name": "points", "type": "int32"},
{"name": "vec", "type": "float[]", "num_dim": 3}
]
})"_json;
Collection* coll1 = collectionManager.create_collection(schema).get();
std::vector<float> vec = {0.12, 0.45, 0.64};
nlohmann::json doc;
doc["id"] = "0";
doc["title"] = "Title";
doc["points"] = 100;
doc["vec"] = vec;
auto add_op = coll1->add(doc.dump());
ASSERT_TRUE(add_op.ok());
auto results = coll1->search("*", {}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
"", 10, {}, {}, {}, 0,
"<mark>", "</mark>", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7, fallback,
4, {off}, 32767, 32767, 2,
false, true, "vec:([0.12, 0.44, 0.55])").get();
ASSERT_EQ(1, results["found"].get<size_t>());
// emplace partial doc (without "vec")
doc.erase("vec");
doc["title"] = "Random";
add_op = coll1->add(doc.dump(), index_operation_t::EMPLACE);
ASSERT_TRUE(add_op.ok());
results = coll1->search("Random", {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
"", 10, {}, {}, {}, 0,
"<mark>", "</mark>", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7, fallback,
4, {off}, 32767, 32767, 2,
false, true, "vec:([0.12, 0.44, 0.55])").get();
ASSERT_EQ(1, results["found"].get<size_t>());
// update partial doc (without "vec")
doc.erase("vec");
doc["title"] = "Random";
add_op = coll1->add(doc.dump(), index_operation_t::UPDATE);
ASSERT_TRUE(add_op.ok());
results = coll1->search("Random", {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
"", 10, {}, {}, {}, 0,
"<mark>", "</mark>", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7, fallback,
4, {off}, 32767, 32767, 2,
false, true, "vec:([0.12, 0.44, 0.55])").get();
ASSERT_EQ(1, results["found"].get<size_t>());
}
TEST_F(CollectionVectorTest, NumVectorGreaterThanNumDim) {
nlohmann::json schema = R"({
"name": "coll1",
@ -692,6 +823,88 @@ TEST_F(CollectionVectorTest, VectorWithNullValue) {
nlohmann::json::parse(json_lines[1])["error"].get<std::string>());
}
// UPSERT / UPDATE / EMPLACE of a document whose embedding-source field
// ("title") is unchanged must still yield a 384-dim embedding in the stored
// document afterwards.
TEST_F(CollectionVectorTest, EmbeddedVectorUnchangedUpsert) {
nlohmann::json schema = R"({
"name": "coll1",
"fields": [
{"name": "title", "type": "string"},
{"name": "points", "type": "int32"},
{"name": "embedding", "type":"float[]", "embed":{"from": ["title"],
"model_config": {"model_name": "ts/e5-small"}}}
]
})"_json;
TextEmbedderManager::set_model_dir("/tmp/typesense_test/models");
Collection* coll1 = collectionManager.create_collection(schema).get();
nlohmann::json doc;
doc["id"] = "0";
doc["title"] = "Title";
doc["points"] = 100;
auto add_op = coll1->add(doc.dump());
ASSERT_TRUE(add_op.ok());
auto results = coll1->search("title", {"embedding"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>()).get();
ASSERT_EQ(1, results["found"].get<size_t>());
auto embedding = results["hits"][0]["document"]["embedding"].get<std::vector<float>>();
// ts/e5-small produces 384-dim embeddings
ASSERT_EQ(384, embedding.size());
// upsert unchanged doc
doc.clear();
doc["id"] = "0";
doc["title"] = "Title";
doc["points"] = 100;
add_op = coll1->add(doc.dump(), index_operation_t::UPSERT);
ASSERT_TRUE(add_op.ok());
results = coll1->search("title", {"embedding"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>()).get();
ASSERT_EQ(1, results["found"].get<size_t>());
embedding = results["hits"][0]["document"]["embedding"].get<std::vector<float>>();
ASSERT_EQ(384, embedding.size());
// update
doc.clear();
doc["id"] = "0";
doc["title"] = "Title";
doc["points"] = 100;
add_op = coll1->add(doc.dump(), index_operation_t::UPDATE);
ASSERT_TRUE(add_op.ok());
results = coll1->search("title", {"embedding"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>()).get();
ASSERT_EQ(1, results["found"].get<size_t>());
embedding = results["hits"][0]["document"]["embedding"].get<std::vector<float>>();
ASSERT_EQ(384, embedding.size());
// emplace
doc.clear();
doc["id"] = "0";
doc["title"] = "Title";
doc["points"] = 100;
add_op = coll1->add(doc.dump(), index_operation_t::EMPLACE);
ASSERT_TRUE(add_op.ok());
results = coll1->search("title", {"embedding"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>()).get();
ASSERT_EQ(1, results["found"].get<size_t>());
embedding = results["hits"][0]["document"]["embedding"].get<std::vector<float>>();
ASSERT_EQ(384, embedding.size());
}
TEST_F(CollectionVectorTest, HybridSearchWithExplicitVector) {
nlohmann::json schema = R"({
"name": "objects",
@ -1099,7 +1312,67 @@ TEST_F(CollectionVectorTest, HideCredential) {
ASSERT_EQ("***********", coll_summary["fields"][1]["embed"]["model_config"]["project_id"].get<std::string>());
}
TEST_F(CollectionVectorTest, UpdateOfCollWithNonOptionalEmbeddingField) {
// When the field an embedding is generated from ("name") changes via
// EMPLACE / UPDATE / UPSERT, the stored embedding must be regenerated
// (i.e. differ from the original embedding).
TEST_F(CollectionVectorTest, UpdateOfFieldReferencedByEmbedding) {
nlohmann::json schema = R"({
"name": "objects",
"fields": [
{"name": "name", "type": "string"},
{"name": "embedding", "type":"float[]", "embed":{"from": ["name"],
"model_config": {"model_name": "ts/e5-small"}}}
]
})"_json;
TextEmbedderManager::set_model_dir("/tmp/typesense_test/models");
auto op = collectionManager.create_collection(schema);
ASSERT_TRUE(op.ok());
Collection* coll = op.get();
nlohmann::json object;
object["id"] = "0";
object["name"] = "butter";
auto add_op = coll->add(object.dump(), CREATE);
ASSERT_TRUE(add_op.ok());
auto results = coll->search("butter", {"embedding"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}).get();
ASSERT_EQ(1, results["found"].get<size_t>());
auto original_embedding = results["hits"][0]["document"]["embedding"].get<std::vector<float>>();
nlohmann::json update_object;
update_object["id"] = "0";
update_object["name"] = "ghee";
auto update_op = coll->add(update_object.dump(), EMPLACE);
ASSERT_TRUE(update_op.ok());
results = coll->search("ghee", {"embedding"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}).get();
ASSERT_EQ(1, results["found"].get<size_t>());
auto updated_embedding = results["hits"][0]["document"]["embedding"].get<std::vector<float>>();
// embedding must have been recomputed for the new "name" value
ASSERT_NE(original_embedding, updated_embedding);
// action = update
update_object["name"] = "milk";
update_op = coll->add(update_object.dump(), UPDATE);
ASSERT_TRUE(update_op.ok());
results = coll->search("milk", {"embedding"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}).get();
ASSERT_EQ(1, results["found"].get<size_t>());
updated_embedding = results["hits"][0]["document"]["embedding"].get<std::vector<float>>();
ASSERT_NE(original_embedding, updated_embedding);
// action = upsert
update_object["name"] = "cheese";
update_op = coll->add(update_object.dump(), UPSERT);
ASSERT_TRUE(update_op.ok());
results = coll->search("cheese", {"embedding"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}).get();
ASSERT_EQ(1, results["found"].get<size_t>());
updated_embedding = results["hits"][0]["document"]["embedding"].get<std::vector<float>>();
ASSERT_NE(original_embedding, updated_embedding);
}
TEST_F(CollectionVectorTest, UpdateOfFieldNotReferencedByEmbedding) {
// test updates to a field that's not referred by an embedding field
nlohmann::json schema = R"({
"name": "objects",
"fields": [
@ -1123,16 +1396,34 @@ TEST_F(CollectionVectorTest, UpdateOfCollWithNonOptionalEmbeddingField) {
auto add_op = coll->add(object.dump(), CREATE);
ASSERT_TRUE(add_op.ok());
auto results = coll->search("butter", {"embedding"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}).get();
ASSERT_EQ(1, results["found"].get<size_t>());
nlohmann::json update_object;
update_object["id"] = "0";
update_object["about"] = "something about butter";
auto update_op = coll->add(update_object.dump(), EMPLACE);
ASSERT_TRUE(update_op.ok());
results = coll->search("butter", {"embedding"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}).get();
ASSERT_EQ(1, results["found"].get<size_t>());
// action = update
update_object["about"] = "something about butter 2";
update_op = coll->add(update_object.dump(), UPDATE);
ASSERT_TRUE(update_op.ok());
results = coll->search("butter", {"embedding"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}).get();
ASSERT_EQ(1, results["found"].get<size_t>());
// action = upsert
update_object["name"] = "butter";
update_object["about"] = "something about butter 3";
update_op = coll->add(update_object.dump(), UPSERT);
ASSERT_TRUE(update_op.ok());
results = coll->search("butter", {"embedding"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}).get();
ASSERT_EQ(1, results["found"].get<size_t>());
}
TEST_F(CollectionVectorTest, FreshEmplaceWithOptionalEmbeddingReferencedField) {
@ -1161,6 +1452,27 @@ TEST_F(CollectionVectorTest, FreshEmplaceWithOptionalEmbeddingReferencedField) {
"or make the embedding field optional.", add_op.error());
}
// Regression test: an explicit "id" field listed before the embedding field
// must not break embedding-field initialization. Only 2 fields are expected
// back (presumably "id" is not materialized as a regular field — confirm
// against Collection::get_fields), and the embedding field must get its
// 384 dims from the ts/e5-small model.
TEST_F(CollectionVectorTest, EmbeddingFieldWithIdFieldPrecedingInSchema) {
auto schema = R"({
"name": "objects",
"fields": [
{"name": "id", "type": "string"},
{"name": "name", "type": "string"},
{"name": "embedding", "type":"float[]", "embed":{"from": ["name"], "model_config": {"model_name": "ts/e5-small"}}}
]
})"_json;
TextEmbedderManager::set_model_dir("/tmp/typesense_test/models");
auto op = collectionManager.create_collection(schema);
ASSERT_TRUE(op.ok());
Collection* coll = op.get();
auto fs = coll->get_fields();
ASSERT_EQ(2, fs.size());
ASSERT_EQ(384, fs[1].num_dim);
}
TEST_F(CollectionVectorTest, SkipEmbeddingOpWhenValueExists) {
nlohmann::json schema = R"({
"name": "objects",
@ -1306,6 +1618,58 @@ TEST_F(CollectionVectorTest, KeywordSearchReturnOnlyTextMatchInfo) {
ASSERT_EQ(1, results["hits"][0].count("text_match_info"));
}
// Group-by combined with vector search: all three docs share group "0", so a
// single grouped hit is expected, and each grouped hit must carry a
// "vector_distance" key — both for a keyword query and for a wildcard query.
TEST_F(CollectionVectorTest, GroupByWithVectorSearch) {
nlohmann::json schema = R"({
"name": "coll1",
"fields": [
{"name": "title", "type": "string"},
{"name": "group", "type": "string", "facet": true},
{"name": "vec", "type": "float[]", "num_dim": 4}
]
})"_json;
Collection* coll1 = collectionManager.create_collection(schema).get();
std::vector<std::vector<float>> values = {
{0.851758, 0.909671, 0.823431, 0.372063},
{0.97826, 0.933157, 0.39557, 0.306488},
{0.230606, 0.634397, 0.514009, 0.399594}
};
for (size_t i = 0; i < values.size(); i++) {
nlohmann::json doc;
doc["id"] = std::to_string(i);
doc["title"] = std::to_string(i) + " title";
doc["group"] = "0";
doc["vec"] = values[i];
ASSERT_TRUE(coll1->add(doc.dump()).ok());
}
// keyword query + vector query, grouped by "group" with group_limit 1
auto res = coll1->search("title", {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
"", 10, {}, {}, {"group"}, 1,
"<mark>", "</mark>", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7, fallback,
4, {off}, 32767, 32767, 2,
false, true, "vec:([0.96826, 0.94, 0.39557, 0.306488])").get();
ASSERT_EQ(1, res["grouped_hits"].size());
ASSERT_EQ(1, res["grouped_hits"][0]["hits"].size());
ASSERT_EQ(1, res["grouped_hits"][0]["hits"][0].count("vector_distance"));
// same expectations with a wildcard query
res = coll1->search("*", {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
"", 10, {}, {}, {"group"}, 1,
"<mark>", "</mark>", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7, fallback,
4, {off}, 32767, 32767, 2,
false, true, "vec:([0.96826, 0.94, 0.39557, 0.306488])").get();
ASSERT_EQ(1, res["grouped_hits"].size());
ASSERT_EQ(1, res["grouped_hits"][0]["hits"].size());
ASSERT_EQ(1, res["grouped_hits"][0]["hits"][0].count("vector_distance"));
}
TEST_F(CollectionVectorTest, HybridSearchReturnAllInfo) {
auto schema_json =
R"({
@ -1342,3 +1706,135 @@ TEST_F(CollectionVectorTest, HybridSearchReturnAllInfo) {
ASSERT_EQ(1, results["hits"][0].count("text_match_info"));
ASSERT_EQ(1, results["hits"][0].count("hybrid_search_info"));
}
// DISABLED: hybrid search sorted by _text_match:desc,_vector_distance:asc is
// expected to return the same top-4 ordering as a pure keyword search.
TEST_F(CollectionVectorTest, DISABLED_HybridSortingTest) {
auto schema_json =
R"({
"name": "TEST",
"fields": [
{"name": "name", "type": "string"},
{"name": "embedding", "type":"float[]", "embed":{"from": ["name"], "model_config": {"model_name": "ts/e5-small"}}}
]
})"_json;
TextEmbedderManager::set_model_dir("/tmp/typesense_test/models");
auto collection_create_op = collectionManager.create_collection(schema_json);
ASSERT_TRUE(collection_create_op.ok());
auto coll1 = collection_create_op.get();
auto add_op = coll1->add(R"({
"name": "john doe"
})"_json.dump());
ASSERT_TRUE(add_op.ok());
add_op = coll1->add(R"({
"name": "john legend"
})"_json.dump());
ASSERT_TRUE(add_op.ok());
add_op = coll1->add(R"({
"name": "john krasinski"
})"_json.dump());
ASSERT_TRUE(add_op.ok());
add_op = coll1->add(R"({
"name": "john abraham"
})"_json.dump());
ASSERT_TRUE(add_op.ok());
// first do keyword search
auto results = coll1->search("john", {"name"},
"", {}, {}, {2}, 10,
1, FREQUENCY, {true},
0, spp::sparse_hash_set<std::string>()).get();
ASSERT_EQ(4, results["hits"].size());
// now do hybrid search with sort_by: _text_match:desc,_vector_distance:asc
std::vector<sort_by> sort_by_list = {{"_text_match", "desc"}, {"_vector_distance", "asc"}};
auto hybrid_results = coll1->search("john", {"name", "embedding"},
"", {}, sort_by_list, {2}, 10,
1, FREQUENCY, {true},
0, spp::sparse_hash_set<std::string>()).get();
// first 4 results should be same as keyword search
ASSERT_EQ(results["hits"][0]["document"]["name"].get<std::string>(), hybrid_results["hits"][0]["document"]["name"].get<std::string>());
ASSERT_EQ(results["hits"][1]["document"]["name"].get<std::string>(), hybrid_results["hits"][1]["document"]["name"].get<std::string>());
ASSERT_EQ(results["hits"][2]["document"]["name"].get<std::string>(), hybrid_results["hits"][2]["document"]["name"].get<std::string>());
ASSERT_EQ(results["hits"][3]["document"]["name"].get<std::string>(), hybrid_results["hits"][3]["document"]["name"].get<std::string>());
}
// Two OpenAI model configs that differ only in api_key must be registered as
// two distinct embedder entries keyed by "<model>:<api_key>", with no
// key-less entry. Skips (returns early) unless api_key_1/api_key_2 env vars
// are set.
TEST_F(CollectionVectorTest, TestDifferentOpenAIApiKeys) {
if (std::getenv("api_key_1") == nullptr || std::getenv("api_key_2") == nullptr) {
LOG(INFO) << "Skipping test as api_key_1 or api_key_2 is not set";
return;
}
auto api_key1 = std::string(std::getenv("api_key_1"));
auto api_key2 = std::string(std::getenv("api_key_2"));
auto embedder_map = TextEmbedderManager::get_instance()._get_text_embedders();
// no embedders registered yet for either key
ASSERT_EQ(embedder_map.find("openai/text-embedding-ada-002:" + api_key1), embedder_map.end());
ASSERT_EQ(embedder_map.find("openai/text-embedding-ada-002:" + api_key2), embedder_map.end());
ASSERT_EQ(embedder_map.find("openai/text-embedding-ada-002"), embedder_map.end());
nlohmann::json model_config1 = R"({
"model_name": "openai/text-embedding-ada-002"
})"_json;
nlohmann::json model_config2 = model_config1;
model_config1["api_key"] = api_key1;
model_config2["api_key"] = api_key2;
size_t num_dim;
// NOTE(review): return values of validate_and_init_remote_model are not
// asserted here — consider checking .ok() to fail fast on network errors
TextEmbedderManager::get_instance().validate_and_init_remote_model(model_config1, num_dim);
TextEmbedderManager::get_instance().validate_and_init_remote_model(model_config2, num_dim);
embedder_map = TextEmbedderManager::get_instance()._get_text_embedders();
ASSERT_NE(embedder_map.find("openai/text-embedding-ada-002:" + api_key1), embedder_map.end());
ASSERT_NE(embedder_map.find("openai/text-embedding-ada-002:" + api_key2), embedder_map.end());
ASSERT_EQ(embedder_map.find("openai/text-embedding-ada-002"), embedder_map.end());
}
// Smoke test for the ts/multilingual-e5-small model: indexing a document and
// running both hybrid (keyword + embedding) and pure semantic searches must
// succeed.
TEST_F(CollectionVectorTest, TestMultilingualE5) {
auto schema_json =
R"({
"name": "TEST",
"fields": [
{"name": "name", "type": "string"},
{"name": "embedding", "type":"float[]", "embed":{"from": ["name"], "model_config": {"model_name": "ts/multilingual-e5-small"}}}
]
})"_json;
TextEmbedderManager::set_model_dir("/tmp/typesense_test/models");
auto collection_create_op = collectionManager.create_collection(schema_json);
ASSERT_TRUE(collection_create_op.ok());
auto coll1 = collection_create_op.get();
auto add_op = coll1->add(R"({
"name": "john doe"
})"_json.dump());
// fix: add_op was previously stored but never checked; a failed indexing
// would silently pass the test
ASSERT_TRUE(add_op.ok());
auto hybrid_results = coll1->search("john", {"name", "embedding"},
"", {}, {}, {2}, 10,
1, FREQUENCY, {true},
0, spp::sparse_hash_set<std::string>());
ASSERT_TRUE(hybrid_results.ok());
auto semantic_results = coll1->search("john", {"embedding"},
"", {}, {}, {2}, 10,
1, FREQUENCY, {true},
0, spp::sparse_hash_set<std::string>());
ASSERT_TRUE(semantic_results.ok());
}

View File

@ -610,7 +610,7 @@ TEST_F(CoreAPIUtilsTest, MultiSearchWithPresetShouldUsePresetForAuth) {
ASSERT_EQ(2, embedded_params_vec.size());
}
TEST_F(CoreAPIUtilsTest, PresetSingleSearch) {
TEST_F(CoreAPIUtilsTest, PresetMultiSearch) {
nlohmann::json schema = R"({
"name": "coll1",
"fields": [
@ -634,7 +634,7 @@ TEST_F(CoreAPIUtilsTest, PresetSingleSearch) {
auto search_body = R"(
{"searches":[
{"collection":"coll1","q":"apple", "query_by": "title", "preset": "single_preset"}
{"collection":"coll1","q":"apple", "query_by": "name", "preset": "single_preset"}
]}
)";
@ -644,8 +644,40 @@ TEST_F(CoreAPIUtilsTest, PresetSingleSearch) {
post_multi_search(req, res);
ASSERT_EQ("12", req->params["per_page"]);
ASSERT_EQ("coll1", req->params["collection"]);
auto res_json = nlohmann::json::parse(res->body);
ASSERT_EQ(1, res_json["results"].size());
ASSERT_EQ(0, res_json["results"][0]["found"].get<size_t>());
// with multiple "searches" preset configuration
preset_value = R"(
{"searches":[
{"collection":"coll1", "q": "*", "per_page": "8"},
{"collection":"coll1", "q": "*", "per_page": "11"}
]}
)"_json;
collectionManager.upsert_preset("multi_preset", preset_value);
embedded_params.clear();
req->params.clear();
req->params["preset"] = "multi_preset";
req->embedded_params_vec.clear();
req->embedded_params_vec.push_back(embedded_params);
req->embedded_params_vec.push_back(embedded_params);
// "preset": "multi_preset"
search_body = R"(
{"searches":[
{"collection":"coll1","q":"apple", "query_by": "title"}
]}
)";
req->body = search_body;
post_multi_search(req, res);
res_json = nlohmann::json::parse(res->body);
ASSERT_EQ(2, res_json["results"].size());
ASSERT_EQ(0, res_json["results"][0]["found"].get<size_t>());
ASSERT_EQ(0, res_json["results"][1]["found"].get<size_t>());
collectionManager.drop_collection("coll1");
}