mirror of https://github.com/typesense/typesense.git

Merge branch 'v0.25-join'

commit 3bbfe20fcc
@@ -79,7 +79,7 @@ public:

     Option<bool> remove_rule(const std::string& name);

     void add_suggestion(const std::string& query_collection,
-                        std::string& query, bool live_query, const std::string& user_id);
+                        const std::string& query, bool live_query, const std::string& user_id);

     void stop();
@@ -162,7 +162,7 @@ private:

     void remove_document(const nlohmann::json & document, const uint32_t seq_id, bool remove_from_store);

-    void process_remove_field_for_embedding_fields(const field& the_field, std::vector<field>& garbage_fields);
+    void process_remove_field_for_embedding_fields(const field& del_field, std::vector<field>& garbage_embed_fields);

     void curate_results(string& actual_query, const string& filter_query, bool enable_overrides, bool already_segmented,
                         const std::map<size_t, std::vector<std::string>>& pinned_hits,
@@ -424,10 +424,11 @@ struct field {
                                              std::string& fallback_field_type,
                                              std::vector<field>& the_fields);

-    static Option<bool> validate_and_init_embed_fields(const std::vector<std::pair<size_t, size_t>>& embed_json_field_indices,
-                                                       const tsl::htrie_map<char, field>& search_schema,
-                                                       nlohmann::json& fields_json,
-                                                       std::vector<field>& fields_vec);
+    static Option<bool> validate_and_init_embed_field(const tsl::htrie_map<char, field>& search_schema,
+                                                      nlohmann::json& field_json,
+                                                      const nlohmann::json& fields_json,
+                                                      field& the_field);

     static bool flatten_obj(nlohmann::json& doc, nlohmann::json& value, bool has_array, bool has_obj_array,
                             bool is_update, const field& the_field, const std::string& flat_name,
@@ -532,7 +532,7 @@ private:
     static void handle_doc_ops(const tsl::htrie_map<char, field>& search_schema,
                                nlohmann::json& update_doc, const nlohmann::json& old_doc);

-    static void get_doc_changes(const index_operation_t op, const tsl::htrie_map<char, field>& search_schema,
+    static void get_doc_changes(const index_operation_t op, const tsl::htrie_map<char, field>& embedding_fields,
                                 nlohmann::json &update_doc, const nlohmann::json &old_doc, nlohmann::json &new_doc,
                                 nlohmann::json &del_doc);
@@ -72,6 +72,10 @@ public:
     Option<bool> validate_and_init_local_model(const nlohmann::json& model_config, size_t& num_dims);
     Option<bool> validate_and_init_model(const nlohmann::json& model_config, size_t& num_dims);

+    std::unordered_map<std::string, std::shared_ptr<TextEmbedder>> _get_text_embedders() {
+        return text_embedders;
+    }
+
 private:
     TextEmbedderManager() = default;
@@ -31,6 +31,7 @@ class RemoteEmbedder {
     virtual nlohmann::json get_error_json(const nlohmann::json& req_body, long res_code, const std::string& res_body) = 0;
     virtual embedding_res_t Embed(const std::string& text, const size_t remote_embedder_timeout_ms = 30000, const size_t remote_embedding_num_tries = 2) = 0;
     virtual std::vector<embedding_res_t> batch_embed(const std::vector<std::string>& inputs, const size_t remote_embedding_batch_size = 200) = 0;
+    static const std::string get_model_key(const nlohmann::json& model_config);
     static void init(ReplicationState* rs) {
         raft_server = rs;
     }

@@ -51,6 +52,7 @@ class OpenAIEmbedder : public RemoteEmbedder {
     embedding_res_t Embed(const std::string& text, const size_t remote_embedder_timeout_ms = 30000, const size_t remote_embedding_num_tries = 2) override;
     std::vector<embedding_res_t> batch_embed(const std::vector<std::string>& inputs, const size_t remote_embedding_batch_size = 200) override;
     nlohmann::json get_error_json(const nlohmann::json& req_body, long res_code, const std::string& res_body) override;
+    static std::string get_model_key(const nlohmann::json& model_config);
 };

@@ -68,6 +70,7 @@ class GoogleEmbedder : public RemoteEmbedder {
     embedding_res_t Embed(const std::string& text, const size_t remote_embedder_timeout_ms = 30000, const size_t remote_embedding_num_tries = 2) override;
     std::vector<embedding_res_t> batch_embed(const std::vector<std::string>& inputs, const size_t remote_embedding_batch_size = 200) override;
     nlohmann::json get_error_json(const nlohmann::json& req_body, long res_code, const std::string& res_body) override;
+    static std::string get_model_key(const nlohmann::json& model_config);
 };

@@ -95,6 +98,7 @@ class GCPEmbedder : public RemoteEmbedder {
     embedding_res_t Embed(const std::string& text, const size_t remote_embedder_timeout_ms = 30000, const size_t remote_embedding_num_tries = 2) override;
     std::vector<embedding_res_t> batch_embed(const std::vector<std::string>& inputs, const size_t remote_embedding_batch_size = 200) override;
     nlohmann::json get_error_json(const nlohmann::json& req_body, long res_code, const std::string& res_body) override;
+    static std::string get_model_key(const nlohmann::json& model_config);
 };
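Note: the get_model_key methods introduced above let the embedder cache be keyed by model name plus credentials instead of model name alone. A minimal sketch of the idea, assuming the key layout this diff later gives OpenAIEmbedder (model_name + ":" + api_key); the helper name below is illustrative, not part of the commit:

    #include <iostream>
    #include <string>
    #include <nlohmann/json.hpp>

    // assumption: mirrors OpenAIEmbedder::get_model_key further down in this diff
    std::string openai_model_key(const nlohmann::json& model_config) {
        return model_config["model_name"].get<std::string>() + ":" +
               model_config["api_key"].get<std::string>();
    }

    int main() {
        nlohmann::json a = {{"model_name", "openai/text-embedding-ada-002"}, {"api_key", "key-A"}};
        nlohmann::json b = {{"model_name", "openai/text-embedding-ada-002"}, {"api_key", "key-B"}};
        // same model, different credentials: two distinct cache entries
        std::cout << (openai_model_key(a) != openai_model_key(b)) << std::endl; // prints 1
        return 0;
    }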
@@ -88,5 +88,5 @@ public:

     bool should_skip_char(char c);

-    static void normalize_ascii(std::string& text);
+    static std::string normalize_ascii_no_spaces(const std::string& text);
 };
@@ -787,6 +787,10 @@ public:
         cors_domains.insert(cors_values_vec.begin(), cors_values_vec.end());
     }

+    void set_enable_search_analytics(bool enable_search_analytics) {
+        this->enable_search_analytics = enable_search_analytics;
+    }
+
     // validation

     Option<bool> is_valid() {
@@ -203,7 +203,7 @@ Option<bool> AnalyticsManager::remove_popular_queries_index(const std::string &n
     return Option<bool>(true);
 }

-void AnalyticsManager::add_suggestion(const std::string &query_collection, std::string &query,
+void AnalyticsManager::add_suggestion(const std::string &query_collection, const std::string& query,
                                       const bool live_query, const std::string& user_id) {
     // look up suggestion collections for the query collection
     std::unique_lock lock(mutex);

@@ -212,7 +212,6 @@ void AnalyticsManager::add_suggestion(const std::string &query_collection, std::
     for(const auto& suggestion_collection: suggestion_collections_it->second) {
         const auto& popular_queries_it = popular_queries.find(suggestion_collection);
         if(popular_queries_it != popular_queries.end()) {
-            Tokenizer::normalize_ascii(query);
             popular_queries_it->second->add(query, live_query, user_id);
         }
     }

@@ -235,6 +234,8 @@ void AnalyticsManager::run(ReplicationState* raft_server) {
         }

         persist_suggestions(raft_server, prev_persistence_s);
+        prev_persistence_s = std::chrono::duration_cast<std::chrono::seconds>(
+                                std::chrono::system_clock::now().time_since_epoch()).count();

         lk.unlock();
     }

@@ -270,8 +271,6 @@ void AnalyticsManager::persist_suggestions(ReplicationState *raft_server, uint64
             continue;
         }

-        prev_persistence_s = now_ts_seconds;
-
         std::string import_payload;
         popularQueries->serialize_as_docs(import_payload);
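Note: two coordinated changes here. add_suggestion now takes the query as const std::string& because normalization happens in the caller (see normalize_ascii_no_spaces later in this diff), and the prev_persistence_s baseline advances once per run() sweep, right after persist_suggestions() returns, instead of inside the per-collection loop — so the interval keeps moving even on a sweep where every collection hits continue. A sketch of the timestamp expression used above; the wrapper function is illustrative only:

    #include <chrono>
    #include <cstdint>

    // same expression as the lines added in run()
    uint64_t now_epoch_seconds() {
        return std::chrono::duration_cast<std::chrono::seconds>(
                   std::chrono::system_clock::now().time_since_epoch()).count();
    }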
@@ -52,12 +52,6 @@ Collection::Collection(const std::string& name, const uint32_t collection_id, co
                        symbols_to_index(to_char_array(symbols_to_index)), token_separators(to_char_array(token_separators)),
                        index(init_index()) {

-    for (auto const& field: fields) {
-        if (field.embed.count(fields::from) != 0) {
-            embedding_fields.emplace(field.name, field);
-        }
-    }
-
     this->num_documents = 0;
 }
@@ -3942,7 +3936,6 @@ Option<bool> Collection::alter(nlohmann::json& alter_payload) {
         }
     }

-
     // hide credentials in the alter payload return
     for(auto& field_json : alter_payload["fields"]) {
         if(field_json[fields::embed].count(fields::model_config) != 0) {

@@ -3955,8 +3948,6 @@ Option<bool> Collection::alter(nlohmann::json& alter_payload) {
         }
     }

-
-
     return Option<bool>(true);
 }
@@ -4165,7 +4156,6 @@ Option<bool> Collection::validate_alter_payload(nlohmann::json& schema_changes,
     }

     std::unordered_map<std::string, field> new_dynamic_fields;
-    std::vector<std::pair<size_t, size_t>> embed_json_field_indices;
     int json_array_index = -1;

     for(const auto& kv: schema_changes["fields"].items()) {

@@ -4253,7 +4243,7 @@ Option<bool> Collection::validate_alter_payload(nlohmann::json& schema_changes,
                 return parse_op;
             }

-            const auto& f = diff_fields.back();
+            auto& f = diff_fields.back();

             if(f.is_dynamic()) {
                 new_dynamic_fields[f.name] = f;

@@ -4261,6 +4251,14 @@ Option<bool> Collection::validate_alter_payload(nlohmann::json& schema_changes,
                 updated_search_schema[f.name] = f;
             }

+            if(!f.embed.empty()) {
+                auto validate_res = field::validate_and_init_embed_field(search_schema, schema_changes["fields"][json_array_index], schema_changes["fields"], f);
+                if(!validate_res.ok()) {
+                    return validate_res;
+                }
+            }
+
             if(is_reindex) {
                 reindex_fields.push_back(f);
             } else {

@@ -4295,9 +4293,7 @@ Option<bool> Collection::validate_alter_payload(nlohmann::json& schema_changes,
                 }
             }

-            if(!f.embed.empty() && !diff_fields.empty()) {
-                embed_json_field_indices.emplace_back(json_array_index, diff_fields.size()-1);
-            }

         } else {
             // partial update is not supported for now

@@ -4307,12 +4303,6 @@ Option<bool> Collection::validate_alter_payload(nlohmann::json& schema_changes,
         }
     }

-    auto validation_op = field::validate_and_init_embed_fields(embed_json_field_indices, search_schema,
-                                                               schema_changes["fields"], diff_fields);
-    if(!validation_op.ok()) {
-        return validation_op;
-    }
-
     if(num_auto_detect_fields > 1) {
         return Option<bool>(400, "There can be only one field named `.*`.");
     }
@@ -4904,27 +4894,43 @@ Option<bool> Collection::populate_include_exclude_fields_lk(const spp::sparse_ha
 }

 // Removes the dropped field from embed_from of all embedding fields.
-void Collection::process_remove_field_for_embedding_fields(const field& the_field, std::vector<field>& garbage_fields) {
+void Collection::process_remove_field_for_embedding_fields(const field& del_field,
+                                                           std::vector<field>& garbage_embed_fields) {
     for(auto& field : fields) {
         if(field.embed.count(fields::from) == 0) {
             continue;
         }

-        auto embed_from = field.embed[fields::from].get<std::vector<std::string>>();
-        embed_from.erase(std::remove_if(embed_from.begin(), embed_from.end(), [&the_field](std::string field_name) {
-            return the_field.name == field_name;
-        }));
-        field.embed[fields::from] = std::move(embed_from);
-        embedding_fields[field.name] = field;
-
-        // mark this embedding field as "garbage" if it has no more embed_from fields
-        if(embed_from.empty()) {
-            embedding_fields.erase(field.name);
-            garbage_fields.push_back(field);
+        bool found_field = false;
+        nlohmann::json& embed_from_names = field.embed[fields::from];
+        for(auto it = embed_from_names.begin(); it != embed_from_names.end();) {
+            if(it.value() == del_field.name) {
+                it = embed_from_names.erase(it);
+                found_field = true;
+            } else {
+                it++;
+            }
+        }
+
+        if(found_field) {
+            // mark this embedding field as "garbage" if it has no more embed_from fields
+            if(embed_from_names.empty()) {
+                garbage_embed_fields.push_back(field);
+            } else {
+                // the dropped field was present in `embed_from`, so we have to update the field objects
+                field.embed[fields::from] = embed_from_names;
+                embedding_fields[field.name].embed[fields::from] = embed_from_names;
+            }
         }
     }

+    for(auto& garbage_field: garbage_embed_fields) {
+        embedding_fields.erase(garbage_field.name);
+        search_schema.erase(garbage_field.name);
+        fields.erase(std::remove_if(fields.begin(), fields.end(), [&garbage_field](const auto &f) {
+            return f.name == garbage_field.name;
+        }), fields.end());
+    }
 }

 void Collection::hide_credential(nlohmann::json& json, const std::string& credential_name) {
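Note: the removed code was a likely source of undefined behavior: embed_from.erase(std::remove_if(...)) uses the single-iterator erase overload, which erases exactly one element and is UB when nothing matched (the iterator equals end()). The rewrite walks the JSON array explicitly and records whether the dropped field was actually present. For reference, a correct erase-remove on a plain vector needs the two-iterator form, as in this minimal sketch (not part of the commit):

    #include <algorithm>
    #include <string>
    #include <vector>

    void drop_name(std::vector<std::string>& embed_from, const std::string& del_name) {
        // two-iterator erase: removes all matches, and is a no-op when none match
        embed_from.erase(std::remove_if(embed_from.begin(), embed_from.end(),
                                        [&del_name](const std::string& n) { return n == del_name; }),
                         embed_from.end());
    }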
@@ -4939,10 +4945,15 @@ void Collection::hide_credential(nlohmann::json& json, const std::string& creden
         }
     }
 }

 Option<bool> Collection::truncate_after_top_k(const string &field_name, size_t k) {
+    std::shared_lock slock(mutex);
+
     std::vector<uint32_t> seq_ids;
     auto op = index->seq_ids_outside_top_k(field_name, k, seq_ids);

+    slock.unlock();
+
     if(!op.ok()) {
         return op;
     }
@@ -766,7 +766,9 @@ Option<bool> CollectionManager::do_search(std::map<std::string, std::string>& re
         nlohmann::json preset;
         const auto& preset_op = CollectionManager::get_instance().get_preset(preset_it->second, preset);

-        if(preset_op.ok()) {
+        // NOTE: we merge only single preset configuration because multi ("searches") preset value replaces
+        // the request body directly before we reach this single search request function.
+        if(preset_op.ok() && !preset.contains("searches")) {
             if(!preset.is_object()) {
                 return Option<bool>(400, "Search preset is not an object.");
             }

@@ -1112,7 +1114,7 @@ Option<bool> CollectionManager::do_search(std::map<std::string, std::string>& re

     if(Config::get_instance().get_enable_search_analytics()) {
         if(result.count("found") != 0 && result["found"].get<size_t>() != 0) {
-            std::string analytics_query = raw_query;
+            std::string analytics_query = Tokenizer::normalize_ascii_no_spaces(raw_query);
             AnalyticsManager::get_instance().add_suggestion(orig_coll_name, analytics_query,
                                                             true, req_params["x-typesense-user-id"]);
         }
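Note: the added !preset.contains("searches") guard separates single-search presets (safe to merge here) from multi-search presets, whose "searches" array already replaced the whole request body earlier in the pipeline. The predicate in isolation, as a sketch using nlohmann::json as elsewhere in the codebase:

    #include <nlohmann/json.hpp>

    bool should_merge_preset(const nlohmann::json& preset) {
        // merge only a single-search preset object; a multi-search preset
        // (one holding a "searches" array) was already applied upstream
        return preset.is_object() && !preset.contains("searches");
    }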
@@ -729,7 +729,7 @@ bool get_export_documents(const std::shared_ptr<http_req>& req, const std::share
         }
     }

-    res->content_type_header = "text/plain; charset=utf8";
+    res->content_type_header = "text/plain; charset=utf-8";
     res->status_code = 200;

     stream_response(req, res);

@@ -902,7 +902,7 @@ bool post_import_documents(const std::shared_ptr<http_req>& req, const std::shar
         }
     }

-    res->content_type_header = "text/plain; charset=utf8";
+    res->content_type_header = "text/plain; charset=utf-8";
     res->status_code = 200;
     res->body = response_stream.str();
@@ -1083,7 +1083,7 @@ void field::compact_nested_fields(tsl::htrie_map<char, field>& nested_fields) {
 Option<bool> field::json_fields_to_fields(bool enable_nested_fields, nlohmann::json &fields_json, string &fallback_field_type,
                                           std::vector<field>& the_fields) {
     size_t num_auto_detect_fields = 0;
-    std::vector<std::pair<size_t, size_t>> embed_json_field_indices;
+    const tsl::htrie_map<char, field> dummy_search_schema;

     for(size_t i = 0; i < fields_json.size(); i++) {
         nlohmann::json& field_json = fields_json[i];

@@ -1094,17 +1094,13 @@ Option<bool> field::json_fields_to_fields(bool enable_nested_fields, nlohmann::j
         }

         if(!the_fields.empty() && !the_fields.back().embed.empty()) {
-            embed_json_field_indices.emplace_back(i, i);
+            auto validate_res = validate_and_init_embed_field(dummy_search_schema, field_json, fields_json, the_fields.back());
+            if(!validate_res.ok()) {
+                return validate_res;
+            }
         }
     }

-    const tsl::htrie_map<char, field> dummy_search_schema;
-    auto validation_op = field::validate_and_init_embed_fields(embed_json_field_indices, dummy_search_schema,
-                                                               fields_json, the_fields);
-    if(!validation_op.ok()) {
-        return validation_op;
-    }
-
     if(num_auto_detect_fields > 1) {
         return Option<bool>(400,"There can be only one field named `.*`.");
     }

@@ -1112,49 +1108,47 @@ Option<bool> field::json_fields_to_fields(bool enable_nested_fields, nlohmann::j
     return Option<bool>(true);
 }

-Option<bool> field::validate_and_init_embed_fields(const std::vector<std::pair<size_t, size_t>>& embed_json_field_indices,
-                                                   const tsl::htrie_map<char, field>& search_schema,
-                                                   nlohmann::json& fields_json,
-                                                   std::vector<field>& fields_vec) {
-
-    for(const auto& json_field_index: embed_json_field_indices) {
-        auto& field_json = fields_json[json_field_index.first];
-        const std::string err_msg = "Property `" + fields::embed + "." + fields::from +
+Option<bool> field::validate_and_init_embed_field(const tsl::htrie_map<char, field>& search_schema, nlohmann::json& field_json,
+                                                  const nlohmann::json& fields_json,
+                                                  field& the_field) {
+    const std::string err_msg = "Property `" + fields::embed + "." + fields::from +
                                 "` can only refer to string or string array fields.";

-        for(auto& field_name : field_json[fields::embed][fields::from].get<std::vector<std::string>>()) {
-            auto embed_field = std::find_if(fields_json.begin(), fields_json.end(), [&field_name](const nlohmann::json& x) {
-                return x["name"].get<std::string>() == field_name;
-            });
+    for(auto& field_name : field_json[fields::embed][fields::from].get<std::vector<std::string>>()) {

-            if(embed_field == fields_json.end()) {
-                const auto& embed_field2 = search_schema.find(field_name);
-                if (embed_field2 == search_schema.end()) {
-                    return Option<bool>(400, err_msg);
-                } else if (embed_field2->type != field_types::STRING && embed_field2->type != field_types::STRING_ARRAY) {
-                    return Option<bool>(400, err_msg);
-                }
-            } else if((*embed_field)[fields::type] != field_types::STRING &&
-                      (*embed_field)[fields::type] != field_types::STRING_ARRAY) {
+        auto embed_field = std::find_if(fields_json.begin(), fields_json.end(), [&field_name](const nlohmann::json& x) {
+            return x["name"].get<std::string>() == field_name;
+        });
+
+        if(embed_field == fields_json.end()) {
+            const auto& embed_field2 = search_schema.find(field_name);
+            if (embed_field2 == search_schema.end()) {
+                return Option<bool>(400, err_msg);
+            } else if (embed_field2->type != field_types::STRING && embed_field2->type != field_types::STRING_ARRAY) {
+                return Option<bool>(400, err_msg);
+            }
+        } else if((*embed_field)[fields::type] != field_types::STRING &&
+                  (*embed_field)[fields::type] != field_types::STRING_ARRAY) {
             return Option<bool>(400, err_msg);
         }
+    }

-        const auto& model_config = field_json[fields::embed][fields::model_config];
-        size_t num_dim = 0;
-        auto res = TextEmbedderManager::get_instance().validate_and_init_model(model_config, num_dim);
-        if(!res.ok()) {
-            return Option<bool>(res.code(), res.error());
-        }
-
-        LOG(INFO) << "Model init done.";
-        field_json[fields::num_dim] = num_dim;
-        fields_vec[json_field_index.second].num_dim = num_dim;
-    }
+    const auto& model_config = field_json[fields::embed][fields::model_config];
+    size_t num_dim = 0;
+    auto res = TextEmbedderManager::get_instance().validate_and_init_model(model_config, num_dim);
+    if(!res.ok()) {
+        return Option<bool>(res.code(), res.error());
+    }
+
+    LOG(INFO) << "Model init done.";
+    field_json[fields::num_dim] = num_dim;
+    the_field.num_dim = num_dim;

     return Option<bool>(true);
 }

 void filter_result_t::and_filter_results(const filter_result_t& a, const filter_result_t& b, filter_result_t& result) {
     auto lenA = a.count, lenB = b.count;
     if (lenA == 0 || lenB == 0) {
@@ -454,7 +454,7 @@ void Index::validate_and_preprocess(Index *index, std::vector<index_record>& ite

         if(index_rec.is_update) {
             // scrub string fields to reduce delete ops
-            get_doc_changes(index_rec.operation, search_schema, index_rec.doc, index_rec.old_doc,
+            get_doc_changes(index_rec.operation, embedding_fields, index_rec.doc, index_rec.old_doc,
                             index_rec.new_doc, index_rec.del_doc);

             if(generate_embeddings) {
@@ -870,12 +870,16 @@ void Index::index_field_in_memory(const field& afield, std::vector<index_record>

                 try {
                     const std::vector<float>& float_vals = record.doc[afield.name].get<std::vector<float>>();
-                    if(afield.vec_dist == cosine) {
-                        std::vector<float> normalized_vals(afield.num_dim);
-                        hnsw_index_t::normalize_vector(float_vals, normalized_vals);
-                        vec_index->addPoint(normalized_vals.data(), (size_t)record.seq_id, true);
+                    if(float_vals.size() != afield.num_dim) {
+                        record.index_failure(400, "Vector size mismatch.");
                     } else {
-                        vec_index->addPoint(float_vals.data(), (size_t)record.seq_id, true);
+                        if(afield.vec_dist == cosine) {
+                            std::vector<float> normalized_vals(afield.num_dim);
+                            hnsw_index_t::normalize_vector(float_vals, normalized_vals);
+                            vec_index->addPoint(normalized_vals.data(), (size_t)record.seq_id, true);
+                        } else {
+                            vec_index->addPoint(float_vals.data(), (size_t)record.seq_id, true);
+                        }
                     }
                 } catch(const std::exception &e) {
                     record.index_failure(400, e.what());
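Note: indexing now rejects vectors whose length differs from the field's num_dim instead of handing them to HNSW, and normalization still applies only under cosine distance. The diff does not show hnsw_index_t::normalize_vector itself; assuming it performs standard L2 normalization (so cosine reduces to an inner product on unit vectors), it would look roughly like this sketch:

    #include <cmath>
    #include <vector>

    // assumption: plain L2 normalization
    void normalize_vector(const std::vector<float>& in, std::vector<float>& out) {
        float norm_sq = 0.0f;
        for(float v : in) { norm_sq += v * v; }
        const float norm = std::sqrt(norm_sq);
        out.resize(in.size());
        for(size_t i = 0; i < in.size(); i++) {
            out[i] = (norm > 0.0f) ? in[i] / norm : 0.0f;
        }
    }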
@@ -3200,8 +3204,8 @@ Option<bool> Index::search(std::vector<query_tokens_t>& field_query_tokens, cons

             for(size_t res_index = 0; res_index < vec_results.size(); res_index++) {
                 auto& vec_result = vec_results[res_index];
-                auto doc_id = vec_result.first;
-                auto result_it = topster->kv_map.find(doc_id);
+                auto seq_id = vec_result.first;
+                auto result_it = topster->kv_map.find(seq_id);

                 if(result_it != topster->kv_map.end()) {
                     if(result_it->second->match_score_index < 0 || result_it->second->match_score_index > 2) {

@@ -3210,22 +3214,23 @@ Option<bool> Index::search(std::vector<query_tokens_t>& field_query_tokens, cons

                     // result overlaps with keyword search: we have to combine the scores

-                    auto result = result_it->second;
+                    KV* kv = result_it->second;
                     // old_score + (1 / rank_of_document) * WEIGHT)
-                    result->vector_distance = vec_result.second;
-                    result->text_match_score = result->scores[result->match_score_index];
+                    kv->vector_distance = vec_result.second;
+                    kv->text_match_score = kv->scores[kv->match_score_index];
                     int64_t match_score = float_to_int64_t(
-                        (int64_t_to_float(result->scores[result->match_score_index])) +
+                        (int64_t_to_float(kv->scores[kv->match_score_index])) +
                         ((1.0 / (res_index + 1)) * VECTOR_SEARCH_WEIGHT));
                     int64_t match_score_index = -1;
                     int64_t scores[3] = {0};

-                    compute_sort_scores(sort_fields_std, sort_order, field_values, geopoint_indices, doc_id, 0, match_score, scores, match_score_index, vec_result.second);
+                    compute_sort_scores(sort_fields_std, sort_order, field_values, geopoint_indices, seq_id, 0,
+                                        match_score, scores, match_score_index, vec_result.second);

                     for(int i = 0; i < 3; i++) {
-                        result->scores[i] = scores[i];
+                        kv->scores[i] = scores[i];
                     }
-                    result->match_score_index = match_score_index;
+                    kv->match_score_index = match_score_index;

                 } else {
                     // Result has been found only in vector search: we have to add it to both KV and result_ids

@@ -3233,12 +3238,21 @@ Option<bool> Index::search(std::vector<query_tokens_t>& field_query_tokens, cons
                     int64_t scores[3] = {0};
                     int64_t match_score = float_to_int64_t((1.0 / (res_index + 1)) * VECTOR_SEARCH_WEIGHT);
                     int64_t match_score_index = -1;
-                    compute_sort_scores(sort_fields_std, sort_order, field_values, geopoint_indices, doc_id, 0, match_score, scores, match_score_index, vec_result.second);
-                    KV kv(searched_queries.size(), doc_id, doc_id, match_score_index, scores);
+                    compute_sort_scores(sort_fields_std, sort_order, field_values, geopoint_indices, seq_id, 0, match_score, scores, match_score_index, vec_result.second);
+
+                    uint64_t distinct_id = seq_id;
+                    if (group_limit != 0) {
+                        distinct_id = get_distinct_id(group_by_fields, seq_id);
+                        if(excluded_group_ids.count(distinct_id) != 0) {
+                            continue;
+                        }
+                    }
+
+                    KV kv(searched_queries.size(), seq_id, distinct_id, match_score_index, scores);
                     kv.text_match_score = 0;
                     kv.vector_distance = vec_result.second;
                     topster->add(&kv);
-                    vec_search_ids.push_back(doc_id);
+                    vec_search_ids.push_back(seq_id);
                 }
             }
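Note: the fusion above combines a keyword match score with a vector hit's reciprocal rank: old_score + (1 / (rank + 1)) * VECTOR_SEARCH_WEIGHT, computed in float space and packed back into an int64 score slot. A self-contained sketch of just the arithmetic (the weight value in the comment is illustrative; this commit does not show VECTOR_SEARCH_WEIGHT's definition):

    #include <cstddef>

    // res_index is 0-based, so the top vector hit contributes weight / 1,
    // the second weight / 2, and so on
    float fuse_scores(float text_match_score, size_t res_index, float vector_weight) {
        return text_match_score + (1.0f / (res_index + 1)) * vector_weight;
    }

    // e.g. with vector_weight = 0.3: fuse_scores(0.80f, 0, 0.3f) == 1.10f,
    // fuse_scores(0.80f, 1, 0.3f) == 0.95f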
@@ -3967,8 +3981,6 @@ void Index::search_across_fields(const std::vector<token_t>& query_tokens,
         dropped_token_its.push_back(std::move(token_fields));
     }

-
-
     // one iterator for each token, each underlying iterator contains results of token across multiple fields
     std::vector<or_iterator_t> token_its;

@@ -4060,6 +4072,28 @@ void Index::search_across_fields(const std::vector<token_t>& query_tokens,
             }
         }

+        size_t query_len = query_tokens.size();
+
+        // check if seq_id exists in any of the dropped_token iters
+        for(size_t ti = 0; ti < dropped_token_its.size(); ti++) {
+            or_iterator_t& token_fields_iters = dropped_token_its[ti];
+            if(token_fields_iters.skip_to(seq_id) && token_fields_iters.id() == seq_id) {
+                query_len++;
+                const std::vector<posting_list_t::iterator_t>& field_iters = token_fields_iters.get_its();
+                for(size_t fi = 0; fi < field_iters.size(); fi++) {
+                    const posting_list_t::iterator_t& field_iter = field_iters[fi];
+                    if(field_iter.id() == seq_id) {
+                        // not all fields might contain a given token
+                        field_to_tokens[field_iter.get_field_id()].push_back(field_iter.clone());
+                    }
+                }
+            }
+        }
+
+        if(syn_orig_num_tokens != -1) {
+            query_len = syn_orig_num_tokens;
+        }
+
         int64_t best_field_match_score = 0, best_field_weight = 0;
         uint32_t num_matching_fields = 0;

@@ -4113,18 +4147,6 @@ void Index::search_across_fields(const std::vector<token_t>& query_tokens,
         compute_sort_scores(sort_fields, sort_order, field_values, geopoint_indices, seq_id, filter_index,
                             best_field_match_score, scores, match_score_index);

-        size_t query_len = query_tokens.size();
-
-        // check if seq_id exists in any of the dropped_token iters and increment matching fields accordingly
-        for(auto& dropped_token_it: dropped_token_its) {
-            if(dropped_token_it.skip_to(seq_id) && dropped_token_it.id() == seq_id) {
-                query_len++;
-            }
-        }
-
-        if(syn_orig_num_tokens != -1) {
-            query_len = syn_orig_num_tokens;
-        }
         query_len = std::min<size_t>(15, query_len);

         // NOTE: `query_len` is total tokens matched across fields.
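Note: the dropped-token check moves from after scoring to before it, and now does more than bump query_len — the matching per-field iterators are cloned into field_to_tokens, so a field matched only through a dropped token participates in text-match scoring. A simplified sketch of the counting idea with stand-in types (the real code uses or_iterator_t / posting_list_t::iterator_t):

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    struct TokenIter {
        std::vector<uint32_t> doc_ids; // sorted ids of docs containing this token
        bool matches(uint32_t seq_id) const {
            // stand-in for skip_to(seq_id) && id() == seq_id
            return std::binary_search(doc_ids.begin(), doc_ids.end(), seq_id);
        }
    };

    size_t matched_query_len(size_t num_query_tokens,
                             const std::vector<TokenIter>& dropped_token_its,
                             uint32_t seq_id) {
        size_t query_len = num_query_tokens;
        for(const auto& it : dropped_token_its) {
            if(it.matches(seq_id)) { query_len++; }
        }
        return query_len;
    }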
@@ -6244,7 +6266,7 @@ void Index::handle_doc_ops(const tsl::htrie_map<char, field>& search_schema,
     }
 }

-void Index::get_doc_changes(const index_operation_t op, const tsl::htrie_map<char, field>& search_schema,
+void Index::get_doc_changes(const index_operation_t op, const tsl::htrie_map<char, field>& embedding_fields,
                             nlohmann::json& update_doc, const nlohmann::json& old_doc, nlohmann::json& new_doc,
                             nlohmann::json& del_doc) {

@@ -6257,7 +6279,12 @@ void Index::get_doc_changes(const index_operation_t op, const tsl::htrie_map<cha
             }

             if(!update_doc.contains(it.key())) {
-                del_doc[it.key()] = it.value();
+                // embedding field won't be part of upsert doc so populate new doc with the value from old doc
+                if(embedding_fields.count(it.key()) != 0) {
+                    new_doc[it.key()] = it.value();
+                } else {
+                    del_doc[it.key()] = it.value();
+                }
             }
         }
     } else {

@@ -6311,9 +6338,10 @@ size_t Index::num_seq_ids() const {

 Option<bool> Index::seq_ids_outside_top_k(const std::string& field_name, size_t k,
                                           std::vector<uint32_t>& outside_seq_ids) {
+    std::shared_lock lock(mutex);
     auto field_it = numerical_index.find(field_name);

-    if(field_it == sort_index.end()) {
+    if(field_it == numerical_index.end()) {
         return Option<bool>(400, "Field not found in numerical index.");
     }
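Note: get_doc_changes now receives the embedding-field map instead of the whole search schema, to implement the rule visible above — on an update, a field absent from the incoming document is normally scheduled for deletion via del_doc, but an auto-generated embedding field is copied into new_doc instead, since clients never send it and deleting it would discard the stored vector. The branch in isolation, as a minimal sketch:

    #include <nlohmann/json.hpp>
    #include <set>
    #include <string>

    void classify_missing_field(const std::string& key, const nlohmann::json& old_value,
                                const std::set<std::string>& embedding_fields,
                                nlohmann::json& new_doc, nlohmann::json& del_doc) {
        if(embedding_fields.count(key) != 0) {
            new_doc[key] = old_value;  // carry the embedding forward from the old doc
        } else {
            del_doc[key] = old_value;  // field was dropped by the update: scrub it
        }
    }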
@@ -117,6 +117,11 @@ embedding_res_t TextEmbedder::Embed(const std::string& text, const size_t remote
     input_shapes.push_back({1, static_cast<int64_t>(encoded_input.input_ids.size())});
     input_shapes.push_back({1, static_cast<int64_t>(encoded_input.attention_mask.size())});
     if(session_->GetInputCount() == 3) {
+        // edge case: xlm_roberta does not have token_type_ids, but if the model has it as input, we need to fill it with 0s
+        if(encoded_input.token_type_ids.size() == 0) {
+            encoded_input.token_type_ids.resize(encoded_input.input_ids.size(), 0);
+        }
+
         input_shapes.push_back({1, static_cast<int64_t>(encoded_input.token_type_ids.size())});
     }
     input_tensors.push_back(Ort::Value::CreateTensor<int64_t>(memory_info, encoded_input.input_ids.data(), encoded_input.input_ids.size(), input_shapes[0].data(), input_shapes[0].size()));
@@ -43,9 +43,10 @@ Option<bool> TextEmbedderManager::validate_and_init_remote_model(const nlohmann:
     }

     std::unique_lock<std::mutex> lock(text_embedders_mutex);
-    auto text_embedder_it = text_embedders.find(model_name);
+    std::string model_key = is_remote_model(model_name) ? RemoteEmbedder::get_model_key(model_config) : model_name;
+    auto text_embedder_it = text_embedders.find(model_key);
     if(text_embedder_it == text_embedders.end()) {
-        text_embedders.emplace(model_name, std::make_shared<TextEmbedder>(model_config, num_dims));
+        text_embedders.emplace(model_key, std::make_shared<TextEmbedder>(model_config, num_dims));
     }

     return Option<bool>(true);

@@ -122,7 +123,8 @@ Option<bool> TextEmbedderManager::validate_and_init_local_model(const nlohmann::
 Option<TextEmbedder*> TextEmbedderManager::get_text_embedder(const nlohmann::json& model_config) {
     std::unique_lock<std::mutex> lock(text_embedders_mutex);
     const std::string& model_name = model_config.at("model_name");
-    auto text_embedder_it = text_embedders.find(model_name);
+    std::string model_key = is_remote_model(model_name) ? RemoteEmbedder::get_model_key(model_config) : model_name;
+    auto text_embedder_it = text_embedders.find(model_key);

     if(text_embedder_it == text_embedders.end()) {
         return Option<TextEmbedder*>(404, "Text embedder was not found.");
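Note: with the cache keyed by model_key, re-validating the same remote model under a different API key creates a second embedder instead of silently reusing one built with stale credentials; local models keep the plain model name as their key. A hedged usage sketch, using only calls visible in this diff (config values illustrative, and assuming both validations succeed):

    // auto& mgr = TextEmbedderManager::get_instance();
    // nlohmann::json cfg1 = {{"model_name", "openai/text-embedding-ada-002"}, {"api_key", "old-key"}};
    // nlohmann::json cfg2 = {{"model_name", "openai/text-embedding-ada-002"}, {"api_key", "new-key"}};
    // size_t dims = 0;
    // mgr.validate_and_init_model(cfg1, dims);
    // mgr.validate_and_init_model(cfg2, dims);
    // // two entries now, keyed "model:old-key" and "model:new-key"
    // assert(mgr._get_text_embedders().size() == 2);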
@@ -53,6 +53,21 @@ long RemoteEmbedder::call_remote_api(const std::string& method, const std::strin
                           proxy_call_timeout_ms, true);
 }

+const std::string RemoteEmbedder::get_model_key(const nlohmann::json& model_config) {
+    const std::string model_namespace = TextEmbedderManager::get_model_namespace(model_config["model_name"].get<std::string>());
+
+    if(model_namespace == "openai") {
+        return OpenAIEmbedder::get_model_key(model_config);
+    } else if(model_namespace == "google") {
+        return GoogleEmbedder::get_model_key(model_config);
+    } else if(model_namespace == "gcp") {
+        return GCPEmbedder::get_model_key(model_config);
+    } else {
+        return "";
+    }
+}
+
 OpenAIEmbedder::OpenAIEmbedder(const std::string& openai_model_path, const std::string& api_key) : api_key(api_key), openai_model_path(openai_model_path) {

 }

@@ -206,6 +221,7 @@ std::vector<embedding_res_t> OpenAIEmbedder::batch_embed(const std::vector<std::
     }

     nlohmann::json res_json;
+
     try {
         res_json = nlohmann::json::parse(res);
     } catch (const std::exception& e) {

@@ -217,8 +233,21 @@ std::vector<embedding_res_t> OpenAIEmbedder::batch_embed(const std::vector<std::
         }
         return outputs;
     }

+    if(res_json.count("data") == 0 || !res_json["data"].is_array() || res_json["data"].size() != inputs.size()) {
+        std::vector<embedding_res_t> outputs;
+        for(size_t i = 0; i < inputs.size(); i++) {
+            outputs.push_back(embedding_res_t(500, "Got malformed response from OpenAI API."));
+        }
+        return outputs;
+    }
+
     std::vector<embedding_res_t> outputs;
     for(auto& data : res_json["data"]) {
+        if(data.count("embedding") == 0 || !data["embedding"].is_array() || data["embedding"].size() == 0) {
+            outputs.push_back(embedding_res_t(500, "Got malformed response from OpenAI API."));
+            continue;
+        }
         outputs.push_back(embedding_res_t(data["embedding"].get<std::vector<float>>()));
     }

@@ -255,6 +284,9 @@ nlohmann::json OpenAIEmbedder::get_error_json(const nlohmann::json& req_body, lo
     return embedding_res;
 }

+std::string OpenAIEmbedder::get_model_key(const nlohmann::json& model_config) {
+    return model_config["model_name"].get<std::string>() + ":" + model_config["api_key"].get<std::string>();
+}
+
 GoogleEmbedder::GoogleEmbedder(const std::string& google_api_key) : google_api_key(google_api_key) {

@@ -372,6 +404,10 @@ nlohmann::json GoogleEmbedder::get_error_json(const nlohmann::json& req_body, lo
     return embedding_res;
 }

+std::string GoogleEmbedder::get_model_key(const nlohmann::json& model_config) {
+    return model_config["model_name"].get<std::string>() + ":" + model_config["api_key"].get<std::string>();
+}
+
 GCPEmbedder::GCPEmbedder(const std::string& project_id, const std::string& model_name, const std::string& access_token,
                          const std::string& refresh_token, const std::string& client_id, const std::string& client_secret) :

@@ -555,7 +591,20 @@ std::vector<embedding_res_t> GCPEmbedder::batch_embed(const std::vector<std::str
         return outputs;
     }
     std::vector<embedding_res_t> outputs;

+    if(res_json.count("predictions") == 0 || !res_json["predictions"].is_array() || res_json["predictions"].size() != inputs.size()) {
+        std::vector<embedding_res_t> outputs;
+        for(size_t i = 0; i < inputs.size(); i++) {
+            outputs.push_back(embedding_res_t(500, "Got malformed response from GCP API."));
+        }
+        return outputs;
+    }
+
     for(const auto& prediction : res_json["predictions"]) {
+        if(prediction.count("embeddings") == 0 || !prediction["embeddings"].is_object() || prediction["embeddings"].count("values") == 0 || !prediction["embeddings"]["values"].is_array() || prediction["embeddings"]["values"].size() == 0) {
+            outputs.push_back(embedding_res_t(500, "Got malformed response from GCP API."));
+            continue;
+        }
         outputs.push_back(embedding_res_t(prediction["embeddings"]["values"].get<std::vector<float>>()));
     }

@@ -625,3 +674,7 @@ Option<std::string> GCPEmbedder::generate_access_token(const std::string& refres

     return Option<std::string>(access_token);
 }

+std::string GCPEmbedder::get_model_key(const nlohmann::json& model_config) {
+    return model_config["model_name"].get<std::string>() + ":" + model_config["project_id"].get<std::string>() + ":" + model_config["client_secret"].get<std::string>();
+}
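Note: both batch_embed implementations above now follow the same defensive pattern before trusting a remote response — check that the expected key exists, has the right JSON type, and matches the input cardinality, then degrade to per-item 500 errors rather than throwing on a malformed payload. The shape check in isolation, as a sketch:

    #include <nlohmann/json.hpp>
    #include <string>

    bool well_formed_batch(const nlohmann::json& res_json, const std::string& key, size_t expected_count) {
        // e.g. key = "data" for the OpenAI API, "predictions" for the GCP API
        return res_json.count(key) != 0 && res_json[key].is_array() &&
               res_json[key].size() == expected_count;
    }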
@@ -1,5 +1,6 @@
 #include <sstream>
 #include <algorithm>
+#include <string_utils.h>
 #include "tokenizer.h"

 Tokenizer::Tokenizer(const std::string& input, bool normalize, bool no_op, const std::string& locale,

@@ -349,10 +350,15 @@ bool Tokenizer::should_skip_char(char c) {
     return is_ascii_char(c) && get_stream_mode(c) != INDEX;
 }

-void Tokenizer::normalize_ascii(std::string& text) {
-    for(size_t i = 0; i < text.size(); i++) {
+std::string Tokenizer::normalize_ascii_no_spaces(const std::string& text) {
+    std::string analytics_query = text;
+    StringUtils::trim(analytics_query);
+
+    for(size_t i = 0; i < analytics_query.size(); i++) {
         if(is_ascii_char(text[i])) {
-            text[i] = std::tolower(text[i]);
+            analytics_query[i] = std::tolower(analytics_query[i]);
         }
     }
+
+    return analytics_query;
 }
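Note: unlike the removed in-place normalize_ascii, the new helper takes a const reference, trims surrounding whitespace, lowercases ASCII bytes, and returns a copy. Expected behavior per the QuerySuggestionsShouldBeTrimmed test added in this commit:

    // std::string q = Tokenizer::normalize_ascii_no_spaces(" Tom ");
    // // q == "tom" — trimmed and lowercased; a whitespace-only query becomes ""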
@@ -118,7 +118,7 @@ int init_root_logger(Config & config, const std::string & server_version) {

     if(log_dir.empty()) {
         // use console logger if log dir is not specified
-        FLAGS_logtostdout = true;
+        FLAGS_logtostderr = true;
     } else {
         if(!directory_exists(log_dir)) {
             std::cerr << "Typesense failed to start. " << "Log directory " << log_dir << " does not exist.";
@@ -3,6 +3,7 @@
 #include <vector>
 #include <fstream>
 #include <collection_manager.h>
+#include <analytics_manager.h>
 #include "string_utils.h"
 #include "collection.h"

@@ -24,6 +25,8 @@ protected:
         collectionManager.init(store, 1.0, "auth_key", quit);
         collectionManager.load(8, 1000);

+        AnalyticsManager::get_instance().init(store);
+
         schema = R"({
             "name": "collection1",
             "enable_nested_fields": true,
@@ -587,6 +590,67 @@ TEST_F(CollectionManagerTest, VerifyEmbeddedParametersOfScopedAPIKey) {
     collectionManager.drop_collection("coll1");
 }

+TEST_F(CollectionManagerTest, QuerySuggestionsShouldBeTrimmed) {
+    std::vector<field> fields = {field("title", field_types::STRING, false, false, true, "", -1, 1),
+                                 field("year", field_types::INT32, false),
+                                 field("points", field_types::INT32, false),};
+
+    Collection* coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get();
+
+    nlohmann::json doc1;
+    doc1["id"] = "0";
+    doc1["title"] = "Tom Sawyer";
+    doc1["year"] = 1876;
+    doc1["points"] = 100;
+
+    ASSERT_TRUE(coll1->add(doc1.dump()).ok());
+
+    Config::get_instance().set_enable_search_analytics(true);
+
+    nlohmann::json analytics_rule = R"({
+        "name": "top_search_queries",
+        "type": "popular_queries",
+        "params": {
+            "limit": 100,
+            "source": {
+                "collections": ["coll1"]
+            },
+            "destination": {
+                "collection": "top_queries"
+            }
+        }
+    })"_json;
+
+    auto create_op = AnalyticsManager::get_instance().create_rule(analytics_rule, false, true);
+    ASSERT_TRUE(create_op.ok());
+
+    nlohmann::json embedded_params;
+    std::map<std::string, std::string> req_params;
+    req_params["collection"] = "coll1";
+    req_params["q"] = " tom ";
+    req_params["query_by"] = "title";
+
+    std::string json_res;
+    auto now_ts = std::chrono::duration_cast<std::chrono::microseconds>(
+            std::chrono::system_clock::now().time_since_epoch()).count();
+
+    auto search_op = collectionManager.do_search(req_params, embedded_params, json_res, now_ts);
+    ASSERT_TRUE(search_op.ok());
+
+    json_res.clear();
+    req_params["q"] = " ";
+    search_op = collectionManager.do_search(req_params, embedded_params, json_res, now_ts);
+    ASSERT_TRUE(search_op.ok());
+
+    // check that suggestions have been trimmed
+    auto popular_queries = AnalyticsManager::get_instance().get_popular_queries();
+    ASSERT_EQ(2, popular_queries["top_queries"]->get_user_prefix_queries()[""].size());
+    ASSERT_EQ("tom", popular_queries["top_queries"]->get_user_prefix_queries()[""][0].query);
+    ASSERT_EQ("", popular_queries["top_queries"]->get_user_prefix_queries()[""][1].query);
+
+    collectionManager.drop_collection("coll1");
+}
+
 TEST_F(CollectionManagerTest, RestoreAutoSchemaDocsOnRestart) {
     Collection *coll1;
@@ -1566,6 +1566,14 @@ TEST_F(CollectionSchemaChangeTest, UpdateSchemaWithNewEmbeddingField) {
     ASSERT_TRUE(res.ok());
     ASSERT_EQ(1, coll->get_embedding_fields().size());

+    auto search_schema = coll->get_schema();
+
+    auto embedding_field_it = search_schema.find("embedding");
+    ASSERT_TRUE(embedding_field_it != coll->get_schema().end());
+    ASSERT_EQ("embedding", embedding_field_it.value().name);
+    ASSERT_EQ("float[]", embedding_field_it.value().type);
+    ASSERT_EQ(384, embedding_field_it.value().num_dim);
+
     nlohmann::json doc;
     doc["names"] = {"hello", "world"};
     auto add_op = coll->add(doc.dump());

@@ -1580,9 +1588,13 @@ TEST_F(CollectionSchemaChangeTest, DropFieldUsedForEmbedding) {
     nlohmann::json schema = R"({
         "name": "objects",
         "fields": [
-            {"name": "names", "type": "string[]"},
-            {"name": "category", "type":"string"},
-            {"name": "embedding", "type":"float[]", "embed":{"from": ["names","category"], "model_config": {"model_name": "ts/e5-small"}}}
+            {"name": "title", "type": "string"},
+            {"name": "names", "type": "string[]"},
+            {"name": "category", "type":"string"},
+            {"name": "embedding", "type":"float[]", "embed":{"from": ["names","category"],
+                "model_config": {"model_name": "ts/e5-small"}}},
+            {"name": "embedding2", "type":"float[]", "embed":{"from": ["names"],
+                "model_config": {"model_name": "ts/e5-small"}}}
         ]
     })"_json;

@@ -1594,20 +1606,28 @@ TEST_F(CollectionSchemaChangeTest, DropFieldUsedForEmbedding) {

     LOG(INFO) << "Created collection";

+    auto embedding_fields = coll->get_embedding_fields();
+    ASSERT_EQ(2, embedding_fields.size());
+    ASSERT_EQ(2, embedding_fields["embedding"].embed[fields::from].get<std::vector<std::string>>().size());
+    ASSERT_EQ(1, embedding_fields["embedding2"].embed[fields::from].get<std::vector<std::string>>().size());
+
+    auto coll_schema = coll->get_schema();
+    ASSERT_EQ(5, coll_schema.size());
+
+    auto the_fields = coll->get_fields();
+    ASSERT_EQ(5, the_fields.size());
+
     auto schema_changes = R"({
         "fields": [
             {"name": "names", "drop": true}
         ]
     })"_json;

-    auto embedding_fields = coll->get_embedding_fields();
-    ASSERT_EQ(2, embedding_fields["embedding"].embed[fields::from].get<std::vector<std::string>>().size());
-
     auto alter_op = coll->alter(schema_changes);
     ASSERT_TRUE(alter_op.ok());

     embedding_fields = coll->get_embedding_fields();
+    ASSERT_EQ(1, embedding_fields.size());
     ASSERT_EQ(1, embedding_fields["embedding"].embed[fields::from].get<std::vector<std::string>>().size());
     ASSERT_EQ("category", embedding_fields["embedding"].embed[fields::from].get<std::vector<std::string>>()[0]);

@@ -1623,6 +1643,16 @@ TEST_F(CollectionSchemaChangeTest, DropFieldUsedForEmbedding) {
     embedding_fields = coll->get_embedding_fields();
     ASSERT_EQ(0, embedding_fields.size());
     ASSERT_EQ(0, coll->_get_index()->_get_vector_index().size());

+    // only title remains
+    coll_schema = coll->get_schema();
+    ASSERT_EQ(1, coll_schema.size());
+    ASSERT_EQ("title", coll_schema["title"].name);
+
+    the_fields = coll->get_fields();
+    ASSERT_EQ(1, the_fields.size());
+    ASSERT_EQ("title", the_fields[0].name);
 }

 TEST_F(CollectionSchemaChangeTest, EmbeddingFieldsMapTest) {
@@ -1816,6 +1816,36 @@ TEST_F(CollectionSpecificMoreTest, ConsiderDroppedTokensDuringTextMatchScoring)
     ASSERT_EQ("1", res["hits"][1]["document"]["id"].get<std::string>());
 }

+TEST_F(CollectionSpecificMoreTest, ConsiderDroppedTokensDuringTextMatchScoring2) {
+    nlohmann::json schema = R"({
+        "name": "coll1",
+        "fields": [
+            {"name": "name", "type": "string"}
+        ]
+    })"_json;
+
+    Collection *coll1 = collectionManager.create_collection(schema).get();
+
+    nlohmann::json doc;
+    doc["id"] = "0";
+    doc["name"] = "Elizabeth Arden 5th Avenue Eau de Parfum 125ml";
+    ASSERT_TRUE(coll1->add(doc.dump()).ok());
+
+    doc["id"] = "1";
+    doc["name"] = "Avène Sun Very High Protection Mineral Cream SPF50+ 50ml";
+    ASSERT_TRUE(coll1->add(doc.dump()).ok());
+
+    auto res = coll1->search("avène eau mineral", {"name"}, "", {}, {}, {2}, 10, 1, FREQUENCY, {true}, 5,
+                             spp::sparse_hash_set<std::string>(),
+                             spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 20, {}, {}, {}, 0,
+                             "<mark>", "</mark>", {3}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7, fallback,
+                             4, {off}, 0, 0, 0, 2, false, "", true, 0, max_weight).get();
+
+    ASSERT_EQ(2, res["hits"].size());
+    ASSERT_EQ("1", res["hits"][0]["document"]["id"].get<std::string>());
+    ASSERT_EQ("0", res["hits"][1]["document"]["id"].get<std::string>());
+}
+
 TEST_F(CollectionSpecificMoreTest, NonNestedFieldNameWithDot) {
     nlohmann::json schema = R"({
         "name": "coll1",
|
@ -224,6 +224,137 @@ TEST_F(CollectionVectorTest, BasicVectorQuerying) {
|
||||
collectionManager.drop_collection("coll1");
|
||||
}
|
||||
|
||||
TEST_F(CollectionVectorTest, VectorUnchangedUpsert) {
|
||||
nlohmann::json schema = R"({
|
||||
"name": "coll1",
|
||||
"fields": [
|
||||
{"name": "title", "type": "string"},
|
||||
{"name": "points", "type": "int32"},
|
||||
{"name": "vec", "type": "float[]", "num_dim": 3}
|
||||
]
|
||||
})"_json;
|
||||
|
||||
Collection* coll1 = collectionManager.create_collection(schema).get();
|
||||
|
||||
std::vector<float> vec = {0.12, 0.45, 0.64};
|
||||
|
||||
nlohmann::json doc;
|
||||
doc["id"] = "0";
|
||||
doc["title"] = "Title";
|
||||
doc["points"] = 100;
|
||||
doc["vec"] = vec;
|
||||
|
||||
auto add_op = coll1->add(doc.dump());
|
||||
ASSERT_TRUE(add_op.ok());
|
||||
|
||||
auto results = coll1->search("*", {}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD,
|
||||
spp::sparse_hash_set<std::string>(),
|
||||
spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
|
||||
"", 10, {}, {}, {}, 0,
|
||||
"<mark>", "</mark>", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7, fallback,
|
||||
4, {off}, 32767, 32767, 2,
|
||||
false, true, "vec:([0.12, 0.44, 0.55])").get();
|
||||
|
||||
ASSERT_EQ(1, results["found"].get<size_t>());
|
||||
|
||||
|
||||
// upsert unchanged doc
|
||||
add_op = coll1->add(doc.dump(), index_operation_t::UPSERT);
|
||||
ASSERT_TRUE(add_op.ok());
|
||||
|
||||
results = coll1->search("*", {}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD,
|
||||
spp::sparse_hash_set<std::string>(),
|
||||
spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
|
||||
"", 10, {}, {}, {}, 0,
|
||||
"<mark>", "</mark>", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7, fallback,
|
||||
4, {off}, 32767, 32767, 2,
|
||||
false, true, "vec:([0.12, 0.44, 0.55])").get();
|
||||
|
||||
ASSERT_EQ(1, results["found"].get<size_t>());
|
||||
|
||||
// emplace unchanged doc
|
||||
add_op = coll1->add(doc.dump(), index_operation_t::EMPLACE);
|
||||
ASSERT_TRUE(add_op.ok());
|
||||
|
||||
results = coll1->search("*", {}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD,
|
||||
spp::sparse_hash_set<std::string>(),
|
||||
spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
|
||||
"", 10, {}, {}, {}, 0,
|
||||
"<mark>", "</mark>", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7, fallback,
|
||||
4, {off}, 32767, 32767, 2,
|
||||
false, true, "vec:([0.12, 0.44, 0.55])").get();
|
||||
|
||||
ASSERT_EQ(1, results["found"].get<size_t>());
|
||||
}
|
||||
|
||||
TEST_F(CollectionVectorTest, VectorPartialUpdate) {
|
||||
nlohmann::json schema = R"({
|
||||
"name": "coll1",
|
||||
"fields": [
|
||||
{"name": "title", "type": "string"},
|
||||
{"name": "points", "type": "int32"},
|
||||
{"name": "vec", "type": "float[]", "num_dim": 3}
|
||||
]
|
||||
})"_json;
|
||||
|
||||
Collection* coll1 = collectionManager.create_collection(schema).get();
|
||||
|
||||
std::vector<float> vec = {0.12, 0.45, 0.64};
|
||||
|
||||
nlohmann::json doc;
|
||||
doc["id"] = "0";
|
||||
doc["title"] = "Title";
|
||||
doc["points"] = 100;
|
||||
doc["vec"] = vec;
|
||||
|
||||
auto add_op = coll1->add(doc.dump());
|
||||
ASSERT_TRUE(add_op.ok());
|
||||
|
||||
auto results = coll1->search("*", {}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD,
|
||||
spp::sparse_hash_set<std::string>(),
|
||||
spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
|
||||
"", 10, {}, {}, {}, 0,
|
||||
"<mark>", "</mark>", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7, fallback,
|
||||
4, {off}, 32767, 32767, 2,
|
||||
false, true, "vec:([0.12, 0.44, 0.55])").get();
|
||||
|
||||
ASSERT_EQ(1, results["found"].get<size_t>());
|
||||
|
||||
|
||||
// emplace partial doc
|
||||
doc.erase("vec");
|
||||
doc["title"] = "Random";
|
||||
add_op = coll1->add(doc.dump(), index_operation_t::EMPLACE);
|
||||
ASSERT_TRUE(add_op.ok());
|
||||
|
||||
results = coll1->search("Random", {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD,
|
||||
spp::sparse_hash_set<std::string>(),
|
||||
spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
|
||||
"", 10, {}, {}, {}, 0,
|
||||
"<mark>", "</mark>", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7, fallback,
|
||||
4, {off}, 32767, 32767, 2,
|
||||
false, true, "vec:([0.12, 0.44, 0.55])").get();
|
||||
|
||||
ASSERT_EQ(1, results["found"].get<size_t>());
|
||||
|
||||
// update portial doc
|
||||
|
||||
doc.erase("vec");
|
||||
doc["title"] = "Random";
|
||||
add_op = coll1->add(doc.dump(), index_operation_t::UPDATE);
|
||||
ASSERT_TRUE(add_op.ok());
|
||||
|
||||
results = coll1->search("Random", {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD,
|
||||
spp::sparse_hash_set<std::string>(),
|
||||
spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
|
||||
"", 10, {}, {}, {}, 0,
|
||||
"<mark>", "</mark>", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7, fallback,
|
||||
4, {off}, 32767, 32767, 2,
|
||||
false, true, "vec:([0.12, 0.44, 0.55])").get();
|
||||
|
||||
ASSERT_EQ(1, results["found"].get<size_t>());
|
||||
}
|
||||
|
||||
TEST_F(CollectionVectorTest, NumVectorGreaterThanNumDim) {
|
||||
nlohmann::json schema = R"({
|
||||
"name": "coll1",
|
||||
@@ -692,6 +823,88 @@ TEST_F(CollectionVectorTest, VectorWithNullValue) {
                      nlohmann::json::parse(json_lines[1])["error"].get<std::string>());
 }

+TEST_F(CollectionVectorTest, EmbeddedVectorUnchangedUpsert) {
+    nlohmann::json schema = R"({
+        "name": "coll1",
+        "fields": [
+            {"name": "title", "type": "string"},
+            {"name": "points", "type": "int32"},
+            {"name": "embedding", "type":"float[]", "embed":{"from": ["title"],
+                "model_config": {"model_name": "ts/e5-small"}}}
+        ]
+    })"_json;
+
+    TextEmbedderManager::set_model_dir("/tmp/typesense_test/models");
+
+    Collection* coll1 = collectionManager.create_collection(schema).get();
+
+    nlohmann::json doc;
+    doc["id"] = "0";
+    doc["title"] = "Title";
+    doc["points"] = 100;
+
+    auto add_op = coll1->add(doc.dump());
+    ASSERT_TRUE(add_op.ok());
+
+    auto results = coll1->search("title", {"embedding"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD,
+                                 spp::sparse_hash_set<std::string>(),
+                                 spp::sparse_hash_set<std::string>()).get();
+
+    ASSERT_EQ(1, results["found"].get<size_t>());
+    auto embedding = results["hits"][0]["document"]["embedding"].get<std::vector<float>>();
+    ASSERT_EQ(384, embedding.size());
+
+    // upsert unchanged doc
+    doc.clear();
+    doc["id"] = "0";
+    doc["title"] = "Title";
+    doc["points"] = 100;
+
+    add_op = coll1->add(doc.dump(), index_operation_t::UPSERT);
+    ASSERT_TRUE(add_op.ok());
+
+    results = coll1->search("title", {"embedding"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD,
+                            spp::sparse_hash_set<std::string>(),
+                            spp::sparse_hash_set<std::string>()).get();
+    ASSERT_EQ(1, results["found"].get<size_t>());
+    embedding = results["hits"][0]["document"]["embedding"].get<std::vector<float>>();
+    ASSERT_EQ(384, embedding.size());
+
+    // update
+    doc.clear();
+    doc["id"] = "0";
+    doc["title"] = "Title";
+    doc["points"] = 100;
+
+    add_op = coll1->add(doc.dump(), index_operation_t::UPDATE);
+    ASSERT_TRUE(add_op.ok());
+
+    results = coll1->search("title", {"embedding"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD,
+                            spp::sparse_hash_set<std::string>(),
+                            spp::sparse_hash_set<std::string>()).get();
+    ASSERT_EQ(1, results["found"].get<size_t>());
+    embedding = results["hits"][0]["document"]["embedding"].get<std::vector<float>>();
+    ASSERT_EQ(384, embedding.size());
+
+    // emplace
+    doc.clear();
+    doc["id"] = "0";
+    doc["title"] = "Title";
+    doc["points"] = 100;
+
+    add_op = coll1->add(doc.dump(), index_operation_t::EMPLACE);
+    ASSERT_TRUE(add_op.ok());
+
+    results = coll1->search("title", {"embedding"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD,
+                            spp::sparse_hash_set<std::string>(),
+                            spp::sparse_hash_set<std::string>()).get();
+    ASSERT_EQ(1, results["found"].get<size_t>());
+    embedding = results["hits"][0]["document"]["embedding"].get<std::vector<float>>();
+    ASSERT_EQ(384, embedding.size());
+}
+
 TEST_F(CollectionVectorTest, HybridSearchWithExplicitVector) {
     nlohmann::json schema = R"({
         "name": "objects",
@ -1099,7 +1312,67 @@ TEST_F(CollectionVectorTest, HideCredential) {
|
||||
ASSERT_EQ("***********", coll_summary["fields"][1]["embed"]["model_config"]["project_id"].get<std::string>());
|
||||
}
|
||||
|
||||
TEST_F(CollectionVectorTest, UpdateOfCollWithNonOptionalEmbeddingField) {
|
||||
TEST_F(CollectionVectorTest, UpdateOfFieldReferencedByEmbedding) {
|
||||
nlohmann::json schema = R"({
|
||||
"name": "objects",
|
||||
"fields": [
|
||||
{"name": "name", "type": "string"},
|
||||
{"name": "embedding", "type":"float[]", "embed":{"from": ["name"],
|
||||
"model_config": {"model_name": "ts/e5-small"}}}
|
||||
]
|
||||
})"_json;
|
||||
|
||||
TextEmbedderManager::set_model_dir("/tmp/typesense_test/models");
|
||||
|
||||
auto op = collectionManager.create_collection(schema);
|
||||
ASSERT_TRUE(op.ok());
|
||||
Collection* coll = op.get();
|
||||
|
||||
nlohmann::json object;
|
||||
object["id"] = "0";
|
||||
object["name"] = "butter";
|
||||
|
||||
auto add_op = coll->add(object.dump(), CREATE);
|
||||
ASSERT_TRUE(add_op.ok());
|
||||
|
||||
auto results = coll->search("butter", {"embedding"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}).get();
|
||||
ASSERT_EQ(1, results["found"].get<size_t>());
|
||||
auto original_embedding = results["hits"][0]["document"]["embedding"].get<std::vector<float>>();
|
||||
|
||||
nlohmann::json update_object;
|
||||
update_object["id"] = "0";
|
||||
update_object["name"] = "ghee";
|
||||
auto update_op = coll->add(update_object.dump(), EMPLACE);
|
||||
ASSERT_TRUE(update_op.ok());
|
||||
|
||||
results = coll->search("ghee", {"embedding"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}).get();
|
||||
ASSERT_EQ(1, results["found"].get<size_t>());
|
||||
auto updated_embedding = results["hits"][0]["document"]["embedding"].get<std::vector<float>>();
|
||||
ASSERT_NE(original_embedding, updated_embedding);
|
||||
|
||||
// action = update
|
||||
update_object["name"] = "milk";
|
||||
update_op = coll->add(update_object.dump(), UPDATE);
|
||||
ASSERT_TRUE(update_op.ok());
|
||||
|
||||
results = coll->search("milk", {"embedding"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}).get();
|
||||
ASSERT_EQ(1, results["found"].get<size_t>());
|
||||
updated_embedding = results["hits"][0]["document"]["embedding"].get<std::vector<float>>();
|
||||
ASSERT_NE(original_embedding, updated_embedding);
|
||||
|
||||
// action = upsert
|
||||
update_object["name"] = "cheese";
|
||||
update_op = coll->add(update_object.dump(), UPSERT);
|
||||
ASSERT_TRUE(update_op.ok());
|
||||
|
||||
results = coll->search("cheese", {"embedding"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}).get();
|
||||
ASSERT_EQ(1, results["found"].get<size_t>());
|
||||
updated_embedding = results["hits"][0]["document"]["embedding"].get<std::vector<float>>();
|
||||
ASSERT_NE(original_embedding, updated_embedding);
|
||||
}
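
This test and the next one together pin down when re-embedding should happen: the updates above change the embed source field ("name"), so the stored vector must change, while the test below changes a field outside `embed.from`, so the vector should be left alone. A minimal sketch of that decision follows, using hypothetical helper and parameter names; it is an illustration of the behaviour under test, not the actual Typesense update path.

// Sketch only: re-embed iff a field listed in the embedding field's
// `embed.from` array is present in the update and carries a new value.
#include <nlohmann/json.hpp>
#include <string>
#include <vector>

bool needs_reembedding(const std::vector<std::string>& embed_from,
                       const nlohmann::json& old_doc,
                       const nlohmann::json& update_doc) {
    for (const auto& field_name : embed_from) {
        if (update_doc.contains(field_name) &&
            (!old_doc.contains(field_name) ||
             old_doc[field_name] != update_doc[field_name])) {
            return true;  // a source field changed: regenerate the vector
        }
    }
    return false;  // no source field touched: keep the existing vector
}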

TEST_F(CollectionVectorTest, UpdateOfFieldNotReferencedByEmbedding) {
    // test updates to a field that's not referenced by an embedding field
    nlohmann::json schema = R"({
        "name": "objects",
        "fields": [
@ -1123,16 +1396,34 @@ TEST_F(CollectionVectorTest, UpdateOfCollWithNonOptionalEmbeddingField) {
    auto add_op = coll->add(object.dump(), CREATE);
    ASSERT_TRUE(add_op.ok());

    auto results = coll->search("butter", {"embedding"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}).get();
    ASSERT_EQ(1, results["found"].get<size_t>());

    nlohmann::json update_object;
    update_object["id"] = "0";
    update_object["about"] = "something about butter";
    auto update_op = coll->add(update_object.dump(), EMPLACE);
    ASSERT_TRUE(update_op.ok());

    results = coll->search("butter", {"embedding"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}).get();
    ASSERT_EQ(1, results["found"].get<size_t>());

    // action = update
    update_object["about"] = "something about butter 2";
    update_op = coll->add(update_object.dump(), UPDATE);
    ASSERT_TRUE(update_op.ok());

    results = coll->search("butter", {"embedding"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}).get();
    ASSERT_EQ(1, results["found"].get<size_t>());

    // action = upsert
    update_object["name"] = "butter";
    update_object["about"] = "something about butter 3";
    update_op = coll->add(update_object.dump(), UPSERT);
    ASSERT_TRUE(update_op.ok());

    results = coll->search("butter", {"embedding"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}).get();
    ASSERT_EQ(1, results["found"].get<size_t>());
}

TEST_F(CollectionVectorTest, FreshEmplaceWithOptionalEmbeddingReferencedField) {
@ -1161,6 +1452,27 @@ TEST_F(CollectionVectorTest, FreshEmplaceWithOptionalEmbeddingReferencedField) {
    "or make the embedding field optional.", add_op.error());
}

TEST_F(CollectionVectorTest, EmbeddingFieldWithIdFieldPrecedingInSchema) {
    auto schema = R"({
        "name": "objects",
        "fields": [
            {"name": "id", "type": "string"},
            {"name": "name", "type": "string"},
            {"name": "embedding", "type":"float[]", "embed":{"from": ["name"], "model_config": {"model_name": "ts/e5-small"}}}
        ]
    })"_json;

    TextEmbedderManager::set_model_dir("/tmp/typesense_test/models");

    auto op = collectionManager.create_collection(schema);
    ASSERT_TRUE(op.ok());
    Collection* coll = op.get();

    auto fs = coll->get_fields();
    ASSERT_EQ(2, fs.size());
    ASSERT_EQ(384, fs[1].num_dim);
}

TEST_F(CollectionVectorTest, SkipEmbeddingOpWhenValueExists) {
    nlohmann::json schema = R"({
        "name": "objects",
@ -1306,6 +1618,58 @@ TEST_F(CollectionVectorTest, KeywordSearchReturnOnlyTextMatchInfo) {
    ASSERT_EQ(1, results["hits"][0].count("text_match_info"));
}

TEST_F(CollectionVectorTest, GroupByWithVectorSearch) {
    nlohmann::json schema = R"({
        "name": "coll1",
        "fields": [
            {"name": "title", "type": "string"},
            {"name": "group", "type": "string", "facet": true},
            {"name": "vec", "type": "float[]", "num_dim": 4}
        ]
    })"_json;

    Collection* coll1 = collectionManager.create_collection(schema).get();

    std::vector<std::vector<float>> values = {
        {0.851758, 0.909671, 0.823431, 0.372063},
        {0.97826, 0.933157, 0.39557, 0.306488},
        {0.230606, 0.634397, 0.514009, 0.399594}
    };

    for (size_t i = 0; i < values.size(); i++) {
        nlohmann::json doc;
        doc["id"] = std::to_string(i);
        doc["title"] = std::to_string(i) + " title";
        doc["group"] = "0";
        doc["vec"] = values[i];
        ASSERT_TRUE(coll1->add(doc.dump()).ok());
    }

    auto res = coll1->search("title", {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD,
                             spp::sparse_hash_set<std::string>(),
                             spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
                             "", 10, {}, {}, {"group"}, 1,
                             "<mark>", "</mark>", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7, fallback,
                             4, {off}, 32767, 32767, 2,
                             false, true, "vec:([0.96826, 0.94, 0.39557, 0.306488])").get();

    ASSERT_EQ(1, res["grouped_hits"].size());
    ASSERT_EQ(1, res["grouped_hits"][0]["hits"].size());
    ASSERT_EQ(1, res["grouped_hits"][0]["hits"][0].count("vector_distance"));

    res = coll1->search("*", {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD,
                        spp::sparse_hash_set<std::string>(),
                        spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
                        "", 10, {}, {}, {"group"}, 1,
                        "<mark>", "</mark>", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7, fallback,
                        4, {off}, 32767, 32767, 2,
                        false, true, "vec:([0.96826, 0.94, 0.39557, 0.306488])").get();

    ASSERT_EQ(1, res["grouped_hits"].size());
    ASSERT_EQ(1, res["grouped_hits"][0]["hits"].size());
    ASSERT_EQ(1, res["grouped_hits"][0]["hits"][0].count("vector_distance"));
}

TEST_F(CollectionVectorTest, HybridSearchReturnAllInfo) {
    auto schema_json =
        R"({
@ -1342,3 +1706,135 @@ TEST_F(CollectionVectorTest, HybridSearchReturnAllInfo) {
    ASSERT_EQ(1, results["hits"][0].count("text_match_info"));
    ASSERT_EQ(1, results["hits"][0].count("hybrid_search_info"));
}

TEST_F(CollectionVectorTest, DISABLED_HybridSortingTest) {
    auto schema_json =
        R"({
        "name": "TEST",
        "fields": [
            {"name": "name", "type": "string"},
            {"name": "embedding", "type":"float[]", "embed":{"from": ["name"], "model_config": {"model_name": "ts/e5-small"}}}
        ]
    })"_json;

    TextEmbedderManager::set_model_dir("/tmp/typesense_test/models");
    auto collection_create_op = collectionManager.create_collection(schema_json);
    ASSERT_TRUE(collection_create_op.ok());
    auto coll1 = collection_create_op.get();

    auto add_op = coll1->add(R"({
        "name": "john doe"
    })"_json.dump());
    ASSERT_TRUE(add_op.ok());

    add_op = coll1->add(R"({
        "name": "john legend"
    })"_json.dump());
    ASSERT_TRUE(add_op.ok());

    add_op = coll1->add(R"({
        "name": "john krasinski"
    })"_json.dump());
    ASSERT_TRUE(add_op.ok());

    add_op = coll1->add(R"({
        "name": "john abraham"
    })"_json.dump());
    ASSERT_TRUE(add_op.ok());

    // first do keyword search
    auto results = coll1->search("john", {"name"},
                                 "", {}, {}, {2}, 10,
                                 1, FREQUENCY, {true},
                                 0, spp::sparse_hash_set<std::string>()).get();

    ASSERT_EQ(4, results["hits"].size());

    // now do hybrid search with sort_by: _text_match:desc,_vector_distance:asc
    std::vector<sort_by> sort_by_list = {{"_text_match", "desc"}, {"_vector_distance", "asc"}};

    auto hybrid_results = coll1->search("john", {"name", "embedding"},
                                        "", {}, sort_by_list, {2}, 10,
                                        1, FREQUENCY, {true},
                                        0, spp::sparse_hash_set<std::string>()).get();

    // first 4 results should be same as keyword search
    ASSERT_EQ(results["hits"][0]["document"]["name"].get<std::string>(), hybrid_results["hits"][0]["document"]["name"].get<std::string>());
    ASSERT_EQ(results["hits"][1]["document"]["name"].get<std::string>(), hybrid_results["hits"][1]["document"]["name"].get<std::string>());
    ASSERT_EQ(results["hits"][2]["document"]["name"].get<std::string>(), hybrid_results["hits"][2]["document"]["name"].get<std::string>());
    ASSERT_EQ(results["hits"][3]["document"]["name"].get<std::string>(), hybrid_results["hits"][3]["document"]["name"].get<std::string>());
}

TEST_F(CollectionVectorTest, TestDifferentOpenAIApiKeys) {
    if (std::getenv("api_key_1") == nullptr || std::getenv("api_key_2") == nullptr) {
        LOG(INFO) << "Skipping test as api_key_1 or api_key_2 is not set";
        return;
    }

    auto api_key1 = std::string(std::getenv("api_key_1"));
    auto api_key2 = std::string(std::getenv("api_key_2"));

    auto embedder_map = TextEmbedderManager::get_instance()._get_text_embedders();

    ASSERT_EQ(embedder_map.find("openai/text-embedding-ada-002:" + api_key1), embedder_map.end());
    ASSERT_EQ(embedder_map.find("openai/text-embedding-ada-002:" + api_key2), embedder_map.end());
    ASSERT_EQ(embedder_map.find("openai/text-embedding-ada-002"), embedder_map.end());

    nlohmann::json model_config1 = R"({
        "model_name": "openai/text-embedding-ada-002"
    })"_json;

    nlohmann::json model_config2 = model_config1;

    model_config1["api_key"] = api_key1;
    model_config2["api_key"] = api_key2;

    size_t num_dim;
    TextEmbedderManager::get_instance().validate_and_init_remote_model(model_config1, num_dim);
    TextEmbedderManager::get_instance().validate_and_init_remote_model(model_config2, num_dim);

    embedder_map = TextEmbedderManager::get_instance()._get_text_embedders();

    ASSERT_NE(embedder_map.find("openai/text-embedding-ada-002:" + api_key1), embedder_map.end());
    ASSERT_NE(embedder_map.find("openai/text-embedding-ada-002:" + api_key2), embedder_map.end());
    ASSERT_EQ(embedder_map.find("openai/text-embedding-ada-002"), embedder_map.end());
}
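
The assertions above imply that remote embedders are cached per model and API key: initializing the same model with two different keys yields two distinct map entries keyed "<model_name>:<api_key>", and no bare "<model_name>" entry. Below is a minimal sketch of a key scheme consistent with those assertions; the helper name is hypothetical and Typesense's actual key derivation may differ.

// Sketch only: one plausible cache-key scheme matching the test's assertions.
#include <nlohmann/json.hpp>
#include <string>

std::string embedder_cache_key(const nlohmann::json& model_config) {
    std::string key = model_config["model_name"].get<std::string>();
    if (model_config.contains("api_key")) {
        // distinct API keys must map to distinct embedder instances
        key += ":" + model_config["api_key"].get<std::string>();
    }
    return key;
}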

TEST_F(CollectionVectorTest, TestMultilingualE5) {
    auto schema_json =
        R"({
        "name": "TEST",
        "fields": [
            {"name": "name", "type": "string"},
            {"name": "embedding", "type":"float[]", "embed":{"from": ["name"], "model_config": {"model_name": "ts/multilingual-e5-small"}}}
        ]
    })"_json;

    TextEmbedderManager::set_model_dir("/tmp/typesense_test/models");

    auto collection_create_op = collectionManager.create_collection(schema_json);

    ASSERT_TRUE(collection_create_op.ok());
    auto coll1 = collection_create_op.get();

    auto add_op = coll1->add(R"({
        "name": "john doe"
    })"_json.dump());

    auto hybrid_results = coll1->search("john", {"name", "embedding"},
                                        "", {}, {}, {2}, 10,
                                        1, FREQUENCY, {true},
                                        0, spp::sparse_hash_set<std::string>());

    ASSERT_TRUE(hybrid_results.ok());

    auto semantic_results = coll1->search("john", {"embedding"},
                                          "", {}, {}, {2}, 10,
                                          1, FREQUENCY, {true},
                                          0, spp::sparse_hash_set<std::string>());

    ASSERT_TRUE(semantic_results.ok());
}
@ -610,7 +610,7 @@ TEST_F(CoreAPIUtilsTest, MultiSearchWithPresetShouldUsePresetForAuth) {
    ASSERT_EQ(2, embedded_params_vec.size());
}

TEST_F(CoreAPIUtilsTest, PresetSingleSearch) {
TEST_F(CoreAPIUtilsTest, PresetMultiSearch) {
    nlohmann::json schema = R"({
        "name": "coll1",
        "fields": [
@ -634,7 +634,7 @@ TEST_F(CoreAPIUtilsTest, PresetSingleSearch) {

    auto search_body = R"(
        {"searches":[
            {"collection":"coll1","q":"apple", "query_by": "title", "preset": "single_preset"}
            {"collection":"coll1","q":"apple", "query_by": "name", "preset": "single_preset"}
        ]}
    )";

@ -644,8 +644,40 @@ TEST_F(CoreAPIUtilsTest, PresetSingleSearch) {

    post_multi_search(req, res);

    ASSERT_EQ("12", req->params["per_page"]);
    ASSERT_EQ("coll1", req->params["collection"]);
    auto res_json = nlohmann::json::parse(res->body);
    ASSERT_EQ(1, res_json["results"].size());
    ASSERT_EQ(0, res_json["results"][0]["found"].get<size_t>());

    // with multiple "searches" preset configuration
    preset_value = R"(
        {"searches":[
            {"collection":"coll1", "q": "*", "per_page": "8"},
            {"collection":"coll1", "q": "*", "per_page": "11"}
        ]}
    )"_json;

    collectionManager.upsert_preset("multi_preset", preset_value);
    embedded_params.clear();
    req->params.clear();
    req->params["preset"] = "multi_preset";
    req->embedded_params_vec.clear();
    req->embedded_params_vec.push_back(embedded_params);
    req->embedded_params_vec.push_back(embedded_params);

    // "preset": "multi_preset"
    search_body = R"(
        {"searches":[
            {"collection":"coll1","q":"apple", "query_by": "title"}
        ]}
    )";

    req->body = search_body;

    post_multi_search(req, res);
    res_json = nlohmann::json::parse(res->body);
    ASSERT_EQ(2, res_json["results"].size());
    ASSERT_EQ(0, res_json["results"][0]["found"].get<size_t>());
    ASSERT_EQ(0, res_json["results"][1]["found"].get<size_t>());

    collectionManager.drop_collection("coll1");
}
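
Read together, the two halves of this test suggest the preset resolution rule for multi-search: a preset holding plain parameters is merged into the matching search (hence per_page "12" above), while a preset whose value carries its own "searches" array replaces the request's search list outright, which is why a one-search body yields two results. A rough sketch of that branch follows, with hypothetical names; it is an inference from the assertions, not the actual post_multi_search implementation.

// Sketch only: plausible preset handling consistent with the assertions above.
#include <nlohmann/json.hpp>

nlohmann::json resolve_searches(const nlohmann::json& preset_value,
                                const nlohmann::json& request_body) {
    if (preset_value.contains("searches")) {
        // a "searches" preset replaces the request's search list wholesale
        return preset_value["searches"];
    }
    nlohmann::json searches = request_body["searches"];
    for (auto& search : searches) {
        // plain preset parameters act as per-search defaults
        for (auto& kv : preset_value.items()) {
            if (!search.contains(kv.key())) {
                search[kv.key()] = kv.value();
            }
        }
    }
    return searches;
}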