diff --git a/include/analytics_manager.h b/include/analytics_manager.h
index 4207f7cf..0f098a7a 100644
--- a/include/analytics_manager.h
+++ b/include/analytics_manager.h
@@ -79,7 +79,7 @@ public:
     Option<bool> remove_rule(const std::string& name);

     void add_suggestion(const std::string& query_collection,
-                        std::string& query, bool live_query, const std::string& user_id);
+                        const std::string& query, bool live_query, const std::string& user_id);

     void stop();

diff --git a/include/collection.h b/include/collection.h
index f95b7c9f..e9ca2b2a 100644
--- a/include/collection.h
+++ b/include/collection.h
@@ -162,7 +162,7 @@ private:
     void remove_document(const nlohmann::json & document, const uint32_t seq_id, bool remove_from_store);

-    void process_remove_field_for_embedding_fields(const field& the_field, std::vector<field>& garbage_fields);
+    void process_remove_field_for_embedding_fields(const field& del_field, std::vector<field>& garbage_embed_fields);

     void curate_results(string& actual_query, const string& filter_query, bool enable_overrides, bool already_segmented,
                         const std::map<size_t, std::vector<std::string>>& pinned_hits,
diff --git a/include/field.h b/include/field.h
index d34d32ae..0f959b7d 100644
--- a/include/field.h
+++ b/include/field.h
@@ -424,10 +424,11 @@ struct field {
                                                std::string& fallback_field_type,
                                                std::vector<field>& the_fields);

-    static Option<bool> validate_and_init_embed_fields(const std::vector<std::pair<size_t, size_t>>& embed_json_field_indices,
-                                                       const tsl::htrie_map<char, field>& search_schema,
-                                                       nlohmann::json& fields_json,
-                                                       std::vector<field>& fields_vec);
+    static Option<bool> validate_and_init_embed_field(const tsl::htrie_map<char, field>& search_schema,
+                                                      nlohmann::json& field_json,
+                                                      const nlohmann::json& fields_json,
+                                                      field& the_field);
+
     static bool flatten_obj(nlohmann::json& doc, nlohmann::json& value, bool has_array, bool has_obj_array,
                             bool is_update, const field& the_field, const std::string& flat_name,
diff --git a/include/index.h b/include/index.h
index 6dffc38e..5656cc2f 100644
--- a/include/index.h
+++ b/include/index.h
@@ -532,7 +532,7 @@ private:
     static void handle_doc_ops(const tsl::htrie_map<char, field>& search_schema,
                                nlohmann::json& update_doc, const nlohmann::json& old_doc);

-    static void get_doc_changes(const index_operation_t op, const tsl::htrie_map<char, field>& search_schema,
+    static void get_doc_changes(const index_operation_t op, const tsl::htrie_map<char, field>& embedding_fields,
                                 nlohmann::json &update_doc, const nlohmann::json &old_doc, nlohmann::json &new_doc,
                                 nlohmann::json &del_doc);

diff --git a/include/text_embedder_manager.h b/include/text_embedder_manager.h
index 543e8f91..2681e0b9 100644
--- a/include/text_embedder_manager.h
+++ b/include/text_embedder_manager.h
@@ -72,6 +72,10 @@ public:
     Option<bool> validate_and_init_local_model(const nlohmann::json& model_config, size_t& num_dims);
     Option<bool> validate_and_init_model(const nlohmann::json& model_config, size_t& num_dims);

+    std::unordered_map<std::string, std::shared_ptr<TextEmbedder>> _get_text_embedders() {
+        return text_embedders;
+    }
+
 private:
     TextEmbedderManager() = default;
diff --git a/include/text_embedder_remote.h b/include/text_embedder_remote.h
index 8167fefd..b5f219d5 100644
--- a/include/text_embedder_remote.h
+++ b/include/text_embedder_remote.h
@@ -31,6 +31,7 @@ class RemoteEmbedder {
         virtual nlohmann::json get_error_json(const nlohmann::json& req_body, long res_code, const std::string& res_body) = 0;
         virtual embedding_res_t Embed(const std::string& text, const size_t remote_embedder_timeout_ms = 30000, const size_t remote_embedding_num_tries = 2) = 0;
         virtual std::vector<embedding_res_t> batch_embed(const std::vector<std::string>& inputs, const size_t remote_embedding_batch_size = 200) = 0;
+        static const std::string get_model_key(const nlohmann::json& model_config);
         static void init(ReplicationState* rs) {
             raft_server = rs;
         }
@@ -51,6 +52,7 @@ class OpenAIEmbedder : public RemoteEmbedder {
         embedding_res_t Embed(const std::string& text, const size_t remote_embedder_timeout_ms = 30000, const size_t remote_embedding_num_tries = 2) override;
         std::vector<embedding_res_t> batch_embed(const std::vector<std::string>& inputs, const size_t remote_embedding_batch_size = 200) override;
         nlohmann::json get_error_json(const nlohmann::json& req_body, long res_code, const std::string& res_body) override;
+        static std::string get_model_key(const nlohmann::json& model_config);
 };

@@ -68,6 +70,7 @@ class GoogleEmbedder : public RemoteEmbedder {
         embedding_res_t Embed(const std::string& text, const size_t remote_embedder_timeout_ms = 30000, const size_t remote_embedding_num_tries = 2) override;
         std::vector<embedding_res_t> batch_embed(const std::vector<std::string>& inputs, const size_t remote_embedding_batch_size = 200) override;
         nlohmann::json get_error_json(const nlohmann::json& req_body, long res_code, const std::string& res_body) override;
+        static std::string get_model_key(const nlohmann::json& model_config);
 };

@@ -95,6 +98,7 @@ class GCPEmbedder : public RemoteEmbedder {
         embedding_res_t Embed(const std::string& text, const size_t remote_embedder_timeout_ms = 30000, const size_t remote_embedding_num_tries = 2) override;
         std::vector<embedding_res_t> batch_embed(const std::vector<std::string>& inputs, const size_t remote_embedding_batch_size = 200) override;
         nlohmann::json get_error_json(const nlohmann::json& req_body, long res_code, const std::string& res_body) override;
+        static std::string get_model_key(const nlohmann::json& model_config);
 };

diff --git a/include/tokenizer.h b/include/tokenizer.h
index a6a88ab6..36de0940 100644
--- a/include/tokenizer.h
+++ b/include/tokenizer.h
@@ -88,5 +88,5 @@ public:

     bool should_skip_char(char c);

-    static void normalize_ascii(std::string& text);
+    static std::string normalize_ascii_no_spaces(const std::string& text);
 };
\ No newline at end of file
diff --git a/include/tsconfig.h b/include/tsconfig.h
index 1e6d8f62..dc382e80 100644
--- a/include/tsconfig.h
+++ b/include/tsconfig.h
@@ -787,6 +787,10 @@ public:
         cors_domains.insert(cors_values_vec.begin(), cors_values_vec.end());
     }

+    void set_enable_search_analytics(bool enable_search_analytics) {
+        this->enable_search_analytics = enable_search_analytics;
+    }
+
     // validation
     Option<bool> is_valid() {
diff --git a/src/analytics_manager.cpp b/src/analytics_manager.cpp
index b23c6cc2..31ac078a 100644
--- a/src/analytics_manager.cpp
+++ b/src/analytics_manager.cpp
@@ -203,7 +203,7 @@ Option<bool> AnalyticsManager::remove_popular_queries_index(const std::string &n
     return Option<bool>(true);
 }

-void AnalyticsManager::add_suggestion(const std::string &query_collection, std::string &query,
+void AnalyticsManager::add_suggestion(const std::string &query_collection, const std::string& query,
                                       const bool live_query, const std::string& user_id) {
     // look up suggestion collections for the query collection
     std::unique_lock lock(mutex);
@@ -212,7 +212,6 @@ void AnalyticsManager::add_suggestion(const std::string &query_collection, std::
     for(const auto& suggestion_collection: suggestion_collections_it->second) {
         const auto& popular_queries_it = popular_queries.find(suggestion_collection);
         if(popular_queries_it != popular_queries.end()) {
-            Tokenizer::normalize_ascii(query);
             popular_queries_it->second->add(query, live_query, user_id);
         }
     }
@@ -235,6 +234,8 @@ void AnalyticsManager::run(ReplicationState* raft_server) {
         }
         persist_suggestions(raft_server, prev_persistence_s);
+        prev_persistence_s = std::chrono::duration_cast<std::chrono::seconds>(
+                             std::chrono::system_clock::now().time_since_epoch()).count();

         lk.unlock();
     }
@@ -270,8 +271,6 @@ void AnalyticsManager::persist_suggestions(ReplicationState *raft_server, uint64
             continue;
         }

-        prev_persistence_s = now_ts_seconds;
-
         std::string import_payload;
         popularQueries->serialize_as_docs(import_payload);
diff --git a/src/collection.cpp b/src/collection.cpp
index 8883f4ab..9140e8bb 100644
--- a/src/collection.cpp
+++ b/src/collection.cpp
@@ -52,12 +52,6 @@ Collection::Collection(const std::string& name, const uint32_t collection_id, co
                        symbols_to_index(to_char_array(symbols_to_index)),
                        token_separators(to_char_array(token_separators)),
                        index(init_index()) {
-    for (auto const& field: fields) {
-        if (field.embed.count(fields::from) != 0) {
-            embedding_fields.emplace(field.name, field);
-        }
-    }
-
     this->num_documents = 0;
 }

@@ -3942,7 +3936,6 @@ Option<bool> Collection::alter(nlohmann::json& alter_payload) {
         }
     }

-
     // hide credentials in the alter payload return
     for(auto& field_json : alter_payload["fields"]) {
         if(field_json[fields::embed].count(fields::model_config) != 0) {
@@ -3955,8 +3948,6 @@ Option<bool> Collection::alter(nlohmann::json& alter_payload) {
         }
     }

-
-
     return Option<bool>(true);
 }

@@ -4165,7 +4156,6 @@ Option<bool> Collection::validate_alter_payload(nlohmann::json& schema_changes,
     }

     std::unordered_map<std::string, field> new_dynamic_fields;
-    std::vector<std::pair<size_t, size_t>> embed_json_field_indices;
     int json_array_index = -1;

     for(const auto& kv: schema_changes["fields"].items()) {
@@ -4253,7 +4243,7 @@ Option<bool> Collection::validate_alter_payload(nlohmann::json& schema_changes,
                 return parse_op;
             }

-            const auto& f = diff_fields.back();
+            auto& f = diff_fields.back();

             if(f.is_dynamic()) {
                 new_dynamic_fields[f.name] = f;
@@ -4261,6 +4251,14 @@ Option<bool> Collection::validate_alter_payload(nlohmann::json& schema_changes,
                 updated_search_schema[f.name] = f;
             }

+            if(!f.embed.empty()) {
+                auto validate_res = field::validate_and_init_embed_field(search_schema, schema_changes["fields"][json_array_index], schema_changes["fields"], f);
+
+                if(!validate_res.ok()) {
+                    return validate_res;
+                }
+            }
+
             if(is_reindex) {
                 reindex_fields.push_back(f);
             } else {
@@ -4295,9 +4293,7 @@ Option<bool> Collection::validate_alter_payload(nlohmann::json& schema_changes,
                 }
             }

-            if(!f.embed.empty() && !diff_fields.empty()) {
-                embed_json_field_indices.emplace_back(json_array_index, diff_fields.size()-1);
-            }
+
         } else {
             // partial update is not supported for now
@@ -4307,12 +4303,6 @@ Option<bool> Collection::validate_alter_payload(nlohmann::json& schema_changes,
         }
     }

-    auto validation_op = field::validate_and_init_embed_fields(embed_json_field_indices, search_schema,
-                                                               schema_changes["fields"], diff_fields);
-    if(!validation_op.ok()) {
-        return validation_op;
-    }
-
     if(num_auto_detect_fields > 1) {
         return Option<bool>(400, "There can be only one field named `.*`.");
     }
@@ -4904,27 +4894,43 @@ Option<bool> Collection::populate_include_exclude_fields_lk(const spp::sparse_ha
 }

 // Removes the dropped field from embed_from of all embedding fields.
-void Collection::process_remove_field_for_embedding_fields(const field& the_field, std::vector<field>& garbage_fields) {
+void Collection::process_remove_field_for_embedding_fields(const field& del_field,
+                                                           std::vector<field>& garbage_embed_fields) {
     for(auto& field : fields) {
         if(field.embed.count(fields::from) == 0) {
             continue;
         }

-        auto embed_from = field.embed[fields::from].get<std::vector<std::string>>();
-        embed_from.erase(std::remove_if(embed_from.begin(), embed_from.end(), [&the_field](std::string field_name) {
-            return the_field.name == field_name;
-        }));
-        field.embed[fields::from] = std::move(embed_from);
-        embedding_fields[field.name] = field;
-
-        // mark this embedding field as "garbage" if it has no more embed_from fields
-        if(embed_from.empty()) {
-            embedding_fields.erase(field.name);
-            garbage_fields.push_back(field);
+        bool found_field = false;
+        nlohmann::json& embed_from_names = field.embed[fields::from];
+        for(auto it = embed_from_names.begin(); it != embed_from_names.end();) {
+            if(it.value() == del_field.name) {
+                it = embed_from_names.erase(it);
+                found_field = true;
+            } else {
+                it++;
+            }
         }
-
+        if(found_field) {
+            // mark this embedding field as "garbage" if it has no more embed_from fields
+            if(embed_from_names.empty()) {
+                garbage_embed_fields.push_back(field);
+            } else {
+                // the dropped field was present in `embed_from`, so we have to update the field objects
+                field.embed[fields::from] = embed_from_names;
+                embedding_fields[field.name].embed[fields::from] = embed_from_names;
+            }
+        }
     }

+    for(auto& garbage_field: garbage_embed_fields) {
+        embedding_fields.erase(garbage_field.name);
+        search_schema.erase(garbage_field.name);
+        fields.erase(std::remove_if(fields.begin(), fields.end(), [&garbage_field](const auto &f) {
+            return f.name == garbage_field.name;
+        }), fields.end());
+    }
 }

 void Collection::hide_credential(nlohmann::json& json, const std::string& credential_name) {
@@ -4939,10 +4945,15 @@ void Collection::hide_credential(nlohmann::json& json, const std::string& creden
         }
     }
 }
+
 Option<bool> Collection::truncate_after_top_k(const string &field_name, size_t k) {
+    std::shared_lock slock(mutex);
+
     std::vector<uint32_t> seq_ids;
     auto op = index->seq_ids_outside_top_k(field_name, k, seq_ids);
+    slock.unlock();
+
     if(!op.ok()) {
         return op;
     }
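The rewritten `process_remove_field_for_embedding_fields` above erases entries from a `nlohmann::json` array while iterating, advancing via the iterator that `erase()` returns. A minimal standalone sketch of that pattern; the field names here are illustrative, not taken from any particular schema:

```cpp
#include <nlohmann/json.hpp>
#include <iostream>
#include <string>

int main() {
    // stands in for an embedding field's `embed.from` array
    nlohmann::json embed_from_names = {"names", "category"};
    const std::string dropped_field = "names";

    for(auto it = embed_from_names.begin(); it != embed_from_names.end();) {
        if(it.value() == dropped_field) {
            it = embed_from_names.erase(it);  // erase() returns the next valid iterator
        } else {
            it++;
        }
    }

    // prints ["category"]; an empty array would mark the embedding field
    // itself as garbage, as in the patch above
    std::cout << embed_from_names.dump() << std::endl;
}
```

Advancing with the iterator returned by `erase()` avoids the invalidated-iterator pitfall that a plain `it++` after an erase would cause.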
diff --git a/src/collection_manager.cpp b/src/collection_manager.cpp
index 9aec9e5c..4d045827 100644
--- a/src/collection_manager.cpp
+++ b/src/collection_manager.cpp
@@ -766,7 +766,9 @@ Option<bool> CollectionManager::do_search(std::map<std::string, std::string>& re
         nlohmann::json preset;
         const auto& preset_op = CollectionManager::get_instance().get_preset(preset_it->second, preset);

-        if(preset_op.ok()) {
+        // NOTE: we merge only a single-search preset configuration here, because a multi-search
+        // ("searches") preset value replaces the request body outright before this single-search
+        // request function is reached.
+        if(preset_op.ok() && !preset.contains("searches")) {
             if(!preset.is_object()) {
                 return Option<bool>(400, "Search preset is not an object.");
             }
@@ -1112,7 +1114,7 @@ Option<bool> CollectionManager::do_search(std::map<std::string, std::string>& re

     if(Config::get_instance().get_enable_search_analytics()) {
         if(result.count("found") != 0 && result["found"].get<size_t>() != 0) {
-            std::string analytics_query = raw_query;
+            std::string analytics_query = Tokenizer::normalize_ascii_no_spaces(raw_query);
             AnalyticsManager::get_instance().add_suggestion(orig_coll_name, analytics_query, true,
                                                             req_params["x-typesense-user-id"]);
         }
diff --git a/src/core_api.cpp b/src/core_api.cpp
index 86b85e27..68f144bf 100644
--- a/src/core_api.cpp
+++ b/src/core_api.cpp
@@ -729,7 +729,7 @@ bool get_export_documents(const std::shared_ptr<http_req>& req, const std::share
         }
     }

-    res->content_type_header = "text/plain; charset=utf8";
+    res->content_type_header = "text/plain; charset=utf-8";
     res->status_code = 200;

     stream_response(req, res);
@@ -902,7 +902,7 @@ bool post_import_documents(const std::shared_ptr<http_req>& req, const std::shar
         }
     }

-    res->content_type_header = "text/plain; charset=utf8";
+    res->content_type_header = "text/plain; charset=utf-8";
     res->status_code = 200;
     res->body = response_stream.str();
diff --git a/src/field.cpp b/src/field.cpp
index 7d5e399c..d053e578 100644
--- a/src/field.cpp
+++ b/src/field.cpp
@@ -1083,7 +1083,7 @@ void field::compact_nested_fields(tsl::htrie_map<char, field>& nested_fields) {
 Option<bool> field::json_fields_to_fields(bool enable_nested_fields, nlohmann::json &fields_json,
                                           string& fallback_field_type, std::vector<field>& the_fields) {
     size_t num_auto_detect_fields = 0;
-    std::vector<std::pair<size_t, size_t>> embed_json_field_indices;
+    const tsl::htrie_map<char, field> dummy_search_schema;

     for(size_t i = 0; i < fields_json.size(); i++) {
         nlohmann::json& field_json = fields_json[i];
@@ -1094,17 +1094,13 @@ Option<bool> field::json_fields_to_fields(bool enable_nested_fields, nlohmann::j
         }

         if(!the_fields.empty() && !the_fields.back().embed.empty()) {
-            embed_json_field_indices.emplace_back(i, i);
+            auto validate_res = validate_and_init_embed_field(dummy_search_schema, field_json, fields_json, the_fields.back());
+            if(!validate_res.ok()) {
+                return validate_res;
+            }
         }
     }

-    const tsl::htrie_map<char, field> dummy_search_schema;
-    auto validation_op = field::validate_and_init_embed_fields(embed_json_field_indices, dummy_search_schema,
-                                                               fields_json, the_fields);
-    if(!validation_op.ok()) {
-        return validation_op;
-    }
-
     if(num_auto_detect_fields > 1) {
         return Option<bool>(400, "There can be only one field named `.*`.");
     }
@@ -1112,49 +1108,47 @@ Option<bool> field::json_fields_to_fields(bool enable_nested_fields, nlohmann::j
     return Option<bool>(true);
 }

-Option<bool> field::validate_and_init_embed_fields(const std::vector<std::pair<size_t, size_t>>& embed_json_field_indices,
-                                                   const tsl::htrie_map<char, field>& search_schema,
-                                                   nlohmann::json& fields_json,
-                                                   std::vector<field>& fields_vec) {
-
-    for(const auto& json_field_index: embed_json_field_indices) {
-        auto& field_json = fields_json[json_field_index.first];
-        const std::string err_msg = "Property `" + fields::embed + "." + fields::from +
+Option<bool> field::validate_and_init_embed_field(const tsl::htrie_map<char, field>& search_schema,
+                                                  nlohmann::json& field_json,
+                                                  const nlohmann::json& fields_json,
+                                                  field& the_field) {
+    const std::string err_msg = "Property `" + fields::embed + "." + fields::from +
                                 "` can only refer to string or string array fields.";

-        for(auto& field_name : field_json[fields::embed][fields::from].get<std::vector<std::string>>()) {
-            auto embed_field = std::find_if(fields_json.begin(), fields_json.end(), [&field_name](const nlohmann::json& x) {
-                return x["name"].get<std::string>() == field_name;
-            });
+    for(auto& field_name : field_json[fields::embed][fields::from].get<std::vector<std::string>>()) {

-            if(embed_field == fields_json.end()) {
-                const auto& embed_field2 = search_schema.find(field_name);
-                if (embed_field2 == search_schema.end()) {
-                    return Option<bool>(400, err_msg);
-                } else if (embed_field2->type != field_types::STRING && embed_field2->type != field_types::STRING_ARRAY) {
-                    return Option<bool>(400, err_msg);
-                }
-            } else if((*embed_field)[fields::type] != field_types::STRING &&
-                      (*embed_field)[fields::type] != field_types::STRING_ARRAY) {
+        auto embed_field = std::find_if(fields_json.begin(), fields_json.end(), [&field_name](const nlohmann::json& x) {
+            return x["name"].get<std::string>() == field_name;
+        });
+
+
+        if(embed_field == fields_json.end()) {
+            const auto& embed_field2 = search_schema.find(field_name);
+            if (embed_field2 == search_schema.end()) {
+                return Option<bool>(400, err_msg);
+            } else if (embed_field2->type != field_types::STRING && embed_field2->type != field_types::STRING_ARRAY) {
                 return Option<bool>(400, err_msg);
             }
+        } else if((*embed_field)[fields::type] != field_types::STRING &&
+                  (*embed_field)[fields::type] != field_types::STRING_ARRAY) {
+            return Option<bool>(400, err_msg);
         }
-
-        const auto& model_config = field_json[fields::embed][fields::model_config];
-        size_t num_dim = 0;
-        auto res = TextEmbedderManager::get_instance().validate_and_init_model(model_config, num_dim);
-        if(!res.ok()) {
-            return Option<bool>(res.code(), res.error());
-        }
-
-        LOG(INFO) << "Model init done.";
-        field_json[fields::num_dim] = num_dim;
-        fields_vec[json_field_index.second].num_dim = num_dim;
     }

+    const auto& model_config = field_json[fields::embed][fields::model_config];
+    size_t num_dim = 0;
+    auto res = TextEmbedderManager::get_instance().validate_and_init_model(model_config, num_dim);
+    if(!res.ok()) {
+        return Option<bool>(res.code(), res.error());
+    }
+
+    LOG(INFO) << "Model init done.";
+    field_json[fields::num_dim] = num_dim;
+    the_field.num_dim = num_dim;
+
+    return Option<bool>(true);
 }

+
 void filter_result_t::and_filter_results(const filter_result_t& a, const filter_result_t& b, filter_result_t& result) {
     auto lenA = a.count, lenB = b.count;
     if (lenA == 0 || lenB == 0) {
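The refactor above replaces the batched `validate_and_init_embed_fields` (driven by index pairs) with a per-field `validate_and_init_embed_field` that runs as each field is parsed. The check itself is unchanged: every name in `embed.from` must resolve to a `string` or `string[]` field, either in the incoming schema JSON or in the existing search schema. A trimmed-down sketch of that lookup, using plain JSON in place of Typesense's internal types; the helper name is illustrative:

```cpp
#include <nlohmann/json.hpp>
#include <algorithm>
#include <string>

// Illustrative only: the real code also falls back to the collection's
// search schema and then initializes the embedding model.
bool embed_sources_are_strings(const nlohmann::json& field_json,
                               const nlohmann::json& fields_json) {
    for(const auto& field_name : field_json["embed"]["from"]) {
        auto it = std::find_if(fields_json.begin(), fields_json.end(),
                               [&](const nlohmann::json& f) { return f["name"] == field_name; });
        if(it == fields_json.end()) {
            return false;  // not in this schema; the patch consults `search_schema` here
        }
        const std::string type = (*it)["type"];
        if(type != "string" && type != "string[]") {
            return false;
        }
    }
    return true;
}
```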
diff --git a/src/index.cpp b/src/index.cpp
index 4d81126a..ac7be1d6 100644
--- a/src/index.cpp
+++ b/src/index.cpp
@@ -454,7 +454,7 @@ void Index::validate_and_preprocess(Index *index, std::vector<index_record>& ite

         if(index_rec.is_update) {
             // scrub string fields to reduce delete ops
-            get_doc_changes(index_rec.operation, search_schema, index_rec.doc, index_rec.old_doc,
+            get_doc_changes(index_rec.operation, embedding_fields, index_rec.doc, index_rec.old_doc,
                             index_rec.new_doc, index_rec.del_doc);

             if(generate_embeddings) {
@@ -870,12 +870,16 @@ void Index::index_field_in_memory(const field& afield, std::vector<index_record>
                 try {
                     const std::vector<float>& float_vals = record.doc[afield.name].get<std::vector<float>>();

-                    if(afield.vec_dist == cosine) {
-                        std::vector<float> normalized_vals(afield.num_dim);
-                        hnsw_index_t::normalize_vector(float_vals, normalized_vals);
-                        vec_index->addPoint(normalized_vals.data(), (size_t)record.seq_id, true);
+                    if(float_vals.size() != afield.num_dim) {
+                        record.index_failure(400, "Vector size mismatch.");
                     } else {
-                        vec_index->addPoint(float_vals.data(), (size_t)record.seq_id, true);
+                        if(afield.vec_dist == cosine) {
+                            std::vector<float> normalized_vals(afield.num_dim);
+                            hnsw_index_t::normalize_vector(float_vals, normalized_vals);
+                            vec_index->addPoint(normalized_vals.data(), (size_t)record.seq_id, true);
+                        } else {
+                            vec_index->addPoint(float_vals.data(), (size_t)record.seq_id, true);
+                        }
                     }
                 } catch(const std::exception &e) {
                     record.index_failure(400, e.what());
@@ -3200,8 +3204,8 @@ Option<bool> Index::search(std::vector<query_tokens_t>& field_query_tokens, cons

         for(size_t res_index = 0; res_index < vec_results.size(); res_index++) {
             auto& vec_result = vec_results[res_index];
-            auto doc_id = vec_result.first;
-            auto result_it = topster->kv_map.find(doc_id);
+            auto seq_id = vec_result.first;
+            auto result_it = topster->kv_map.find(seq_id);

             if(result_it != topster->kv_map.end()) {
                 if(result_it->second->match_score_index < 0 || result_it->second->match_score_index > 2) {
@@ -3210,22 +3214,23 @@ Option<bool> Index::search(std::vector<query_tokens_t>& field_query_tokens, cons

                 // result overlaps with keyword search: we have to combine the scores
-                auto result = result_it->second;
+                KV* kv = result_it->second;

                 // old_score + (1 / rank_of_document) * WEIGHT)
-                result->vector_distance = vec_result.second;
-                result->text_match_score = result->scores[result->match_score_index];
+                kv->vector_distance = vec_result.second;
+                kv->text_match_score = kv->scores[kv->match_score_index];
                 int64_t match_score = float_to_int64_t(
-                        (int64_t_to_float(result->scores[result->match_score_index])) +
+                        (int64_t_to_float(kv->scores[kv->match_score_index])) +
                         ((1.0 / (res_index + 1)) * VECTOR_SEARCH_WEIGHT));
                 int64_t match_score_index = -1;
                 int64_t scores[3] = {0};

-                compute_sort_scores(sort_fields_std, sort_order, field_values, geopoint_indices, doc_id, 0, match_score, scores, match_score_index, vec_result.second);
+                compute_sort_scores(sort_fields_std, sort_order, field_values, geopoint_indices, seq_id, 0,
+                                    match_score, scores, match_score_index, vec_result.second);

                 for(int i = 0; i < 3; i++) {
-                    result->scores[i] = scores[i];
+                    kv->scores[i] = scores[i];
                 }

-                result->match_score_index = match_score_index;
+                kv->match_score_index = match_score_index;

             } else {
                 // Result has been found only in vector search: we have to add it to both KV and result_ids
@@ -3233,12 +3238,21 @@ Option<bool> Index::search(std::vector<query_tokens_t>& field_query_tokens, cons
                 int64_t scores[3] = {0};
                 int64_t match_score = float_to_int64_t((1.0 / (res_index + 1)) * VECTOR_SEARCH_WEIGHT);
                 int64_t match_score_index = -1;
-                compute_sort_scores(sort_fields_std, sort_order, field_values, geopoint_indices, doc_id, 0, match_score, scores, match_score_index, vec_result.second);
-                KV kv(searched_queries.size(), doc_id, doc_id, match_score_index, scores);
+                compute_sort_scores(sort_fields_std, sort_order, field_values, geopoint_indices, seq_id, 0, match_score, scores, match_score_index, vec_result.second);
+
+                uint64_t distinct_id = seq_id;
+                if (group_limit != 0) {
+                    distinct_id = get_distinct_id(group_by_fields, seq_id);
+                    if(excluded_group_ids.count(distinct_id) != 0) {
+                        continue;
+                    }
+                }
+
+                KV kv(searched_queries.size(), seq_id, distinct_id, match_score_index, scores);
                 kv.text_match_score = 0;
                 kv.vector_distance = vec_result.second;

                 topster->add(&kv);
-                vec_search_ids.push_back(doc_id);
+                vec_search_ids.push_back(seq_id);
             }
         }

@@ -3967,8 +3981,6 @@ void Index::search_across_fields(const std::vector<token_t>& query_tokens,
         dropped_token_its.push_back(std::move(token_fields));
     }

-
-
     // one iterator for each token, each underlying iterator contains results of token across multiple fields
     std::vector<or_iterator_t> token_its;
@@ -4060,6 +4072,28 @@ void Index::search_across_fields(const std::vector<token_t>& query_tokens,
             }
         }

+        size_t query_len = query_tokens.size();
+
+        // check if seq_id exists in any of the dropped_token iters
+        for(size_t ti = 0; ti < dropped_token_its.size(); ti++) {
+            or_iterator_t& token_fields_iters = dropped_token_its[ti];
+            if(token_fields_iters.skip_to(seq_id) && token_fields_iters.id() == seq_id) {
+                query_len++;
+                const std::vector<posting_list_t::iterator_t>& field_iters = token_fields_iters.get_its();
+                for(size_t fi = 0; fi < field_iters.size(); fi++) {
+                    const posting_list_t::iterator_t& field_iter = field_iters[fi];
+                    if(field_iter.id() == seq_id) {
+                        // not all fields might contain a given token
+                        field_to_tokens[field_iter.get_field_id()].push_back(field_iter.clone());
+                    }
+                }
+            }
+        }
+
+        if(syn_orig_num_tokens != -1) {
+            query_len = syn_orig_num_tokens;
+        }
+
         int64_t best_field_match_score = 0, best_field_weight = 0;
         uint32_t num_matching_fields = 0;

@@ -4113,18 +4147,6 @@ void Index::search_across_fields(const std::vector<token_t>& query_tokens,
         compute_sort_scores(sort_fields, sort_order, field_values, geopoint_indices, seq_id, filter_index,
                             best_field_match_score, scores, match_score_index);

-        size_t query_len = query_tokens.size();
-
-        // check if seq_id exists in any of the dropped_token iters and increment matching fields accordingly
-        for(auto& dropped_token_it: dropped_token_its) {
-            if(dropped_token_it.skip_to(seq_id) && dropped_token_it.id() == seq_id) {
-                query_len++;
-            }
-        }
-
-        if(syn_orig_num_tokens != -1) {
-            query_len = syn_orig_num_tokens;
-        }
-
         query_len = std::min<size_t>(15, query_len);

         // NOTE: `query_len` is total tokens matched across fields.
@@ -6244,7 +6266,7 @@ void Index::handle_doc_ops(const tsl::htrie_map<char, field>& search_schema,
     }
 }

-void Index::get_doc_changes(const index_operation_t op, const tsl::htrie_map<char, field>& search_schema,
+void Index::get_doc_changes(const index_operation_t op, const tsl::htrie_map<char, field>& embedding_fields,
                             nlohmann::json& update_doc, const nlohmann::json& old_doc, nlohmann::json& new_doc,
                             nlohmann::json& del_doc) {
@@ -6257,7 +6279,12 @@ void Index::get_doc_changes(const index_operation_t op, const tsl::htrie_map

 Option<bool> Index::seq_ids_outside_top_k(const std::string& field_name, size_t k,
                                           std::vector<uint32_t>& outside_seq_ids) {
+    std::shared_lock lock(mutex);
     auto field_it = numerical_index.find(field_name);
-    if(field_it == sort_index.end()) {
+    if(field_it == numerical_index.end()) {
         return Option<bool>(400, "Field not found in numerical index.");
     }
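The `index_field_in_memory` change earlier in this file rejects vectors whose length differs from the field's `num_dim` before they reach `addPoint`, and keeps the existing cosine path that indexes a normalized copy. A self-contained sketch of that guard-then-normalize flow; `prepare_vector` is a hypothetical stand-in for the hnswlib-backed code path, and the normalization mirrors what `hnsw_index_t::normalize_vector` is expected to do:

```cpp
#include <cmath>
#include <stdexcept>
#include <vector>

std::vector<float> prepare_vector(const std::vector<float>& float_vals, size_t num_dim) {
    if(float_vals.size() != num_dim) {
        // the patch records this as a per-document 400 ("Vector size mismatch.")
        throw std::invalid_argument("Vector size mismatch.");
    }

    // cosine distance: index an L2-normalized copy of the vector
    float norm = 0.0f;
    for(float v : float_vals) {
        norm += v * v;
    }
    norm = std::sqrt(norm);

    std::vector<float> normalized_vals(num_dim);
    for(size_t i = 0; i < num_dim; i++) {
        normalized_vals[i] = (norm == 0.0f) ? 0.0f : float_vals[i] / norm;
    }
    return normalized_vals;
}
```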
diff --git a/src/text_embedder.cpp b/src/text_embedder.cpp
index 61244054..126655f9 100644
--- a/src/text_embedder.cpp
+++ b/src/text_embedder.cpp
@@ -117,6 +117,11 @@ embedding_res_t TextEmbedder::Embed(const std::string& text, const size_t remote
     input_shapes.push_back({1, static_cast<int64_t>(encoded_input.input_ids.size())});
     input_shapes.push_back({1, static_cast<int64_t>(encoded_input.attention_mask.size())});
     if(session_->GetInputCount() == 3) {
+        // edge case: xlm_roberta does not have token_type_ids, but if the model has it as input, we need to fill it with 0s
+        if(encoded_input.token_type_ids.size() == 0) {
+            encoded_input.token_type_ids.resize(encoded_input.input_ids.size(), 0);
+        }
+
         input_shapes.push_back({1, static_cast<int64_t>(encoded_input.token_type_ids.size())});
     }
     input_tensors.push_back(Ort::Value::CreateTensor<int64_t>(memory_info, encoded_input.input_ids.data(), encoded_input.input_ids.size(), input_shapes[0].data(), input_shapes[0].size()));
diff --git a/src/text_embedder_manager.cpp b/src/text_embedder_manager.cpp
index ac2c110f..89400a79 100644
--- a/src/text_embedder_manager.cpp
+++ b/src/text_embedder_manager.cpp
@@ -43,9 +43,10 @@ Option<bool> TextEmbedderManager::validate_and_init_remote_model(const nlohmann:
     }

     std::unique_lock lock(text_embedders_mutex);
-    auto text_embedder_it = text_embedders.find(model_name);
+    std::string model_key = is_remote_model(model_name) ? RemoteEmbedder::get_model_key(model_config) : model_name;
+    auto text_embedder_it = text_embedders.find(model_key);
     if(text_embedder_it == text_embedders.end()) {
-        text_embedders.emplace(model_name, std::make_shared<TextEmbedder>(model_config, num_dims));
+        text_embedders.emplace(model_key, std::make_shared<TextEmbedder>(model_config, num_dims));
     }

     return Option<bool>(true);
@@ -122,7 +123,8 @@ Option<bool> TextEmbedderManager::validate_and_init_local_model(const nlohmann::
 Option TextEmbedderManager::get_text_embedder(const nlohmann::json& model_config) {
     std::unique_lock lock(text_embedders_mutex);
     const std::string& model_name = model_config.at("model_name");
-    auto text_embedder_it = text_embedders.find(model_name);
+    std::string model_key = is_remote_model(model_name) ? RemoteEmbedder::get_model_key(model_config) : model_name;
+    auto text_embedder_it = text_embedders.find(model_key);

     if(text_embedder_it == text_embedders.end()) {
         return Option(404, "Text embedder was not found.");
diff --git a/src/text_embedder_remote.cpp b/src/text_embedder_remote.cpp
index e59f93bf..74226db2 100644
--- a/src/text_embedder_remote.cpp
+++ b/src/text_embedder_remote.cpp
@@ -53,6 +53,21 @@ long RemoteEmbedder::call_remote_api(const std::string& method, const std::strin
                              proxy_call_timeout_ms, true);
 }

+
+const std::string RemoteEmbedder::get_model_key(const nlohmann::json& model_config) {
+    const std::string model_namespace = TextEmbedderManager::get_model_namespace(model_config["model_name"].get<std::string>());
+
+    if(model_namespace == "openai") {
+        return OpenAIEmbedder::get_model_key(model_config);
+    } else if(model_namespace == "google") {
+        return GoogleEmbedder::get_model_key(model_config);
+    } else if(model_namespace == "gcp") {
+        return GCPEmbedder::get_model_key(model_config);
+    } else {
+        return "";
+    }
+}
+
 OpenAIEmbedder::OpenAIEmbedder(const std::string& openai_model_path, const std::string& api_key) : api_key(api_key), openai_model_path(openai_model_path) {

 }
@@ -206,6 +221,7 @@ std::vector<embedding_res_t> OpenAIEmbedder::batch_embed(const std::vector<std::

+    if(res_json.count("data") == 0 || !res_json["data"].is_array() || res_json["data"].size() != inputs.size()) {
+        std::vector<embedding_res_t> outputs;
+        for(size_t i = 0; i < inputs.size(); i++) {
+            outputs.push_back(embedding_res_t(500, "Got malformed response from OpenAI API."));
+        }
+        return outputs;
+    }
+
     std::vector<embedding_res_t> outputs;
     for(auto& data : res_json["data"]) {
+        if(data.count("embedding") == 0 || !data["embedding"].is_array() || data["embedding"].size() == 0) {
+            outputs.push_back(embedding_res_t(500, "Got malformed response from OpenAI API."));
+            continue;
+        }
         outputs.push_back(embedding_res_t(data["embedding"].get<std::vector<float>>()));
     }

@@ -255,6 +284,9 @@ nlohmann::json OpenAIEmbedder::get_error_json(const nlohmann::json& req_body, lo
     return embedding_res;
 }

+std::string OpenAIEmbedder::get_model_key(const nlohmann::json& model_config) {
+    return model_config["model_name"].get<std::string>() + ":" + model_config["api_key"].get<std::string>();
+}

 GoogleEmbedder::GoogleEmbedder(const std::string& google_api_key) : google_api_key(google_api_key) {

@@ -372,6 +404,10 @@ nlohmann::json GoogleEmbedder::get_error_json(const nlohmann::json& req_body, lo
     return embedding_res;
 }

+std::string GoogleEmbedder::get_model_key(const nlohmann::json& model_config) {
+    return model_config["model_name"].get<std::string>() + ":" + model_config["api_key"].get<std::string>();
+}
+
 GCPEmbedder::GCPEmbedder(const std::string& project_id, const std::string& model_name, const std::string& access_token,
                          const std::string& refresh_token, const std::string& client_id, const std::string& client_secret) :
@@ -555,7 +591,20 @@ std::vector<embedding_res_t> GCPEmbedder::batch_embed(const std::vector<std::str
     std::vector<embedding_res_t> outputs;
+
+    if(res_json.count("predictions") == 0 || !res_json["predictions"].is_array() || res_json["predictions"].size() != inputs.size()) {
+        std::vector<embedding_res_t> outputs;
+        for(size_t i = 0; i < inputs.size(); i++) {
+            outputs.push_back(embedding_res_t(500, "Got malformed response from GCP API."));
+        }
+        return outputs;
+    }
+
     for(const auto& prediction : res_json["predictions"]) {
+        if(prediction.count("embeddings") == 0 || !prediction["embeddings"].is_object() || prediction["embeddings"].count("values") == 0 || !prediction["embeddings"]["values"].is_array() || prediction["embeddings"]["values"].size() == 0) {
+            outputs.push_back(embedding_res_t(500, "Got malformed response from GCP API."));
+            continue;
+        }
         outputs.push_back(embedding_res_t(prediction["embeddings"]["values"].get<std::vector<float>>()));
     }

@@ -625,3 +674,7 @@ Option<std::string> GCPEmbedder::generate_access_token(const std::string& refres

     return Option<std::string>(access_token);
 }
+
+std::string GCPEmbedder::get_model_key(const nlohmann::json& model_config) {
+    return model_config["model_name"].get<std::string>() + ":" + model_config["project_id"].get<std::string>() + ":" + model_config["client_secret"].get<std::string>();
+}
\ No newline at end of file
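With the `get_model_key` additions above, remote embedders are cached per credential set rather than per model name, so two collections that use the same remote model with different API keys get distinct `TextEmbedder` instances. A sketch of the key composition, assuming the `model_name:api_key` shape shown in the OpenAI and Google implementations:

```cpp
#include <nlohmann/json.hpp>
#include <string>

// Illustrative composition of the cache key: the model name plus whatever
// credentials distinguish two configurations of the same remote model.
std::string remote_model_key(const nlohmann::json& model_config) {
    // e.g. "openai/text-embedding-ada-002:<api_key>"
    return model_config["model_name"].get<std::string>() + ":" +
           model_config["api_key"].get<std::string>();
}
```

Local models keep using the bare model name as the key, which is why the `TestDifferentOpenAIApiKeys` test further below asserts that an unsuffixed `openai/text-embedding-ada-002` entry never appears in the embedder map.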
diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp
index 464349ed..5688e27d 100644
--- a/src/tokenizer.cpp
+++ b/src/tokenizer.cpp
@@ -1,5 +1,6 @@
 #include
 #include
+#include <string_utils.h>
 #include "tokenizer.h"

 Tokenizer::Tokenizer(const std::string& input, bool normalize, bool no_op, const std::string& locale,
@@ -349,10 +350,15 @@ bool Tokenizer::should_skip_char(char c) {
     return is_ascii_char(c) && get_stream_mode(c) != INDEX;
 }

-void Tokenizer::normalize_ascii(std::string& text) {
-    for(size_t i = 0; i < text.size(); i++) {
-        if(is_ascii_char(text[i])) {
-            text[i] = std::tolower(text[i]);
+std::string Tokenizer::normalize_ascii_no_spaces(const std::string& text) {
+    std::string analytics_query = text;
+    StringUtils::trim(analytics_query);
+
+    for(size_t i = 0; i < analytics_query.size(); i++) {
+        // index the trimmed copy, not `text`: trimming shifts character positions
+        if(is_ascii_char(analytics_query[i])) {
+            analytics_query[i] = std::tolower(analytics_query[i]);
         }
     }
+
+    return analytics_query;
 }
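The new `normalize_ascii_no_spaces` returns a trimmed, ASCII-lowercased copy instead of mutating its argument in place, which is what lets `do_search` pass the raw query through untouched. A sketch of the expected behavior, assuming the project headers are on the include path:

```cpp
#include <cassert>
#include <string>
#include "tokenizer.h"

int main() {
    // leading/trailing whitespace is trimmed, ASCII characters are lowercased,
    // and the original string is left unmodified
    std::string raw_query = "  Tom Sawyer  ";
    std::string analytics_query = Tokenizer::normalize_ascii_no_spaces(raw_query);
    assert(analytics_query == "tom sawyer");
    assert(raw_query == "  Tom Sawyer  ");
}
```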
" << "Log directory " << log_dir << " does not exist."; diff --git a/test/collection_manager_test.cpp b/test/collection_manager_test.cpp index e821e51b..12ed2cd3 100644 --- a/test/collection_manager_test.cpp +++ b/test/collection_manager_test.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include "string_utils.h" #include "collection.h" @@ -24,6 +25,8 @@ protected: collectionManager.init(store, 1.0, "auth_key", quit); collectionManager.load(8, 1000); + AnalyticsManager::get_instance().init(store); + schema = R"({ "name": "collection1", "enable_nested_fields": true, @@ -587,6 +590,67 @@ TEST_F(CollectionManagerTest, VerifyEmbeddedParametersOfScopedAPIKey) { collectionManager.drop_collection("coll1"); } +TEST_F(CollectionManagerTest, QuerySuggestionsShouldBeTrimmed) { + std::vector fields = {field("title", field_types::STRING, false, false, true, "", -1, 1), + field("year", field_types::INT32, false), + field("points", field_types::INT32, false),}; + + Collection* coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get(); + + nlohmann::json doc1; + doc1["id"] = "0"; + doc1["title"] = "Tom Sawyer"; + doc1["year"] = 1876; + doc1["points"] = 100; + + ASSERT_TRUE(coll1->add(doc1.dump()).ok()); + + Config::get_instance().set_enable_search_analytics(true); + + nlohmann::json analytics_rule = R"({ + "name": "top_search_queries", + "type": "popular_queries", + "params": { + "limit": 100, + "source": { + "collections": ["coll1"] + }, + "destination": { + "collection": "top_queries" + } + } + })"_json; + + auto create_op = AnalyticsManager::get_instance().create_rule(analytics_rule, false, true); + ASSERT_TRUE(create_op.ok()); + + nlohmann::json embedded_params; + std::map req_params; + req_params["collection"] = "coll1"; + req_params["q"] = " tom "; + req_params["query_by"] = "title"; + + std::string json_res; + auto now_ts = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()).count(); + + auto search_op = collectionManager.do_search(req_params, embedded_params, json_res, now_ts); + ASSERT_TRUE(search_op.ok()); + + json_res.clear(); + req_params["q"] = " "; + search_op = collectionManager.do_search(req_params, embedded_params, json_res, now_ts); + ASSERT_TRUE(search_op.ok()); + + // check that suggestions have been trimmed + auto popular_queries = AnalyticsManager::get_instance().get_popular_queries(); + ASSERT_EQ(2, popular_queries["top_queries"]->get_user_prefix_queries()[""].size()); + ASSERT_EQ("tom", popular_queries["top_queries"]->get_user_prefix_queries()[""][0].query); + ASSERT_EQ("", popular_queries["top_queries"]->get_user_prefix_queries()[""][1].query); + + collectionManager.drop_collection("coll1"); +} + TEST_F(CollectionManagerTest, RestoreAutoSchemaDocsOnRestart) { Collection *coll1; diff --git a/test/collection_schema_change_test.cpp b/test/collection_schema_change_test.cpp index 7bc6f32c..c936ce78 100644 --- a/test/collection_schema_change_test.cpp +++ b/test/collection_schema_change_test.cpp @@ -1566,6 +1566,14 @@ TEST_F(CollectionSchemaChangeTest, UpdateSchemaWithNewEmbeddingField) { ASSERT_TRUE(res.ok()); ASSERT_EQ(1, coll->get_embedding_fields().size()); + auto search_schema = coll->get_schema(); + + auto embedding_field_it = search_schema.find("embedding"); + ASSERT_TRUE(embedding_field_it != coll->get_schema().end()); + ASSERT_EQ("embedding", embedding_field_it.value().name); + ASSERT_EQ("float[]", embedding_field_it.value().type); + ASSERT_EQ(384, embedding_field_it.value().num_dim); + nlohmann::json doc; 
doc["names"] = {"hello", "world"}; auto add_op = coll->add(doc.dump()); @@ -1580,9 +1588,13 @@ TEST_F(CollectionSchemaChangeTest, DropFieldUsedForEmbedding) { nlohmann::json schema = R"({ "name": "objects", "fields": [ - {"name": "names", "type": "string[]"}, - {"name": "category", "type":"string"}, - {"name": "embedding", "type":"float[]", "embed":{"from": ["names","category"], "model_config": {"model_name": "ts/e5-small"}}} + {"name": "title", "type": "string"}, + {"name": "names", "type": "string[]"}, + {"name": "category", "type":"string"}, + {"name": "embedding", "type":"float[]", "embed":{"from": ["names","category"], + "model_config": {"model_name": "ts/e5-small"}}}, + {"name": "embedding2", "type":"float[]", "embed":{"from": ["names"], + "model_config": {"model_name": "ts/e5-small"}}} ] })"_json; @@ -1594,20 +1606,28 @@ TEST_F(CollectionSchemaChangeTest, DropFieldUsedForEmbedding) { LOG(INFO) << "Created collection"; + auto embedding_fields = coll->get_embedding_fields(); + ASSERT_EQ(2, embedding_fields.size()); + ASSERT_EQ(2, embedding_fields["embedding"].embed[fields::from].get>().size()); + ASSERT_EQ(1, embedding_fields["embedding2"].embed[fields::from].get>().size()); + + auto coll_schema = coll->get_schema(); + ASSERT_EQ(5, coll_schema.size()); + + auto the_fields = coll->get_fields(); + ASSERT_EQ(5, the_fields.size()); + auto schema_changes = R"({ "fields": [ {"name": "names", "drop": true} ] })"_json; - - auto embedding_fields = coll->get_embedding_fields(); - ASSERT_EQ(2, embedding_fields["embedding"].embed[fields::from].get>().size()); - auto alter_op = coll->alter(schema_changes); ASSERT_TRUE(alter_op.ok()); embedding_fields = coll->get_embedding_fields(); + ASSERT_EQ(1, embedding_fields.size()); ASSERT_EQ(1, embedding_fields["embedding"].embed[fields::from].get>().size()); ASSERT_EQ("category", embedding_fields["embedding"].embed[fields::from].get>()[0]); @@ -1623,6 +1643,16 @@ TEST_F(CollectionSchemaChangeTest, DropFieldUsedForEmbedding) { embedding_fields = coll->get_embedding_fields(); ASSERT_EQ(0, embedding_fields.size()); ASSERT_EQ(0, coll->_get_index()->_get_vector_index().size()); + + // only title remains + + coll_schema = coll->get_schema(); + ASSERT_EQ(1, coll_schema.size()); + ASSERT_EQ("title", coll_schema["title"].name); + + the_fields = coll->get_fields(); + ASSERT_EQ(1, the_fields.size()); + ASSERT_EQ("title", the_fields[0].name); } TEST_F(CollectionSchemaChangeTest, EmbeddingFieldsMapTest) { diff --git a/test/collection_specific_more_test.cpp b/test/collection_specific_more_test.cpp index 83181283..4e7347e6 100644 --- a/test/collection_specific_more_test.cpp +++ b/test/collection_specific_more_test.cpp @@ -1816,6 +1816,36 @@ TEST_F(CollectionSpecificMoreTest, ConsiderDroppedTokensDuringTextMatchScoring) ASSERT_EQ("1", res["hits"][1]["document"]["id"].get()); } +TEST_F(CollectionSpecificMoreTest, ConsiderDroppedTokensDuringTextMatchScoring2) { + nlohmann::json schema = R"({ + "name": "coll1", + "fields": [ + {"name": "name", "type": "string"} + ] + })"_json; + + Collection *coll1 = collectionManager.create_collection(schema).get(); + + nlohmann::json doc; + doc["id"] = "0"; + doc["name"] = "Elizabeth Arden 5th Avenue Eau de Parfum 125ml"; + ASSERT_TRUE(coll1->add(doc.dump()).ok()); + + doc["id"] = "1"; + doc["name"] = "Avène Sun Very High Protection Mineral Cream SPF50+ 50ml"; + ASSERT_TRUE(coll1->add(doc.dump()).ok()); + + auto res = coll1->search("avène eau mineral", {"name"}, "", {}, {}, {2}, 10, 1, FREQUENCY, {true}, 5, + spp::sparse_hash_set(), + 
+                             spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 20, {}, {}, {}, 0,
+                             "", "", {3}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7, fallback,
+                             4, {off}, 0, 0, 0, 2, false, "", true, 0, max_weight).get();
+
+    ASSERT_EQ(2, res["hits"].size());
+    ASSERT_EQ("1", res["hits"][0]["document"]["id"].get<std::string>());
+    ASSERT_EQ("0", res["hits"][1]["document"]["id"].get<std::string>());
+}
+
 TEST_F(CollectionSpecificMoreTest, NonNestedFieldNameWithDot) {
     nlohmann::json schema = R"({
         "name": "coll1",
diff --git a/test/collection_vector_search_test.cpp b/test/collection_vector_search_test.cpp
index 059e4437..4fcc134c 100644
--- a/test/collection_vector_search_test.cpp
+++ b/test/collection_vector_search_test.cpp
@@ -224,6 +224,137 @@ TEST_F(CollectionVectorTest, BasicVectorQuerying) {
     collectionManager.drop_collection("coll1");
 }

+TEST_F(CollectionVectorTest, VectorUnchangedUpsert) {
+    nlohmann::json schema = R"({
+        "name": "coll1",
+        "fields": [
+            {"name": "title", "type": "string"},
+            {"name": "points", "type": "int32"},
+            {"name": "vec", "type": "float[]", "num_dim": 3}
+        ]
+    })"_json;
+
+    Collection* coll1 = collectionManager.create_collection(schema).get();
+
+    std::vector<float> vec = {0.12, 0.45, 0.64};
+
+    nlohmann::json doc;
+    doc["id"] = "0";
+    doc["title"] = "Title";
+    doc["points"] = 100;
+    doc["vec"] = vec;
+
+    auto add_op = coll1->add(doc.dump());
+    ASSERT_TRUE(add_op.ok());
+
+    auto results = coll1->search("*", {}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD,
+                                 spp::sparse_hash_set<std::string>(),
+                                 spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
+                                 "", 10, {}, {}, {}, 0,
+                                 "", "", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7, fallback,
+                                 4, {off}, 32767, 32767, 2,
+                                 false, true, "vec:([0.12, 0.44, 0.55])").get();
+
+    ASSERT_EQ(1, results["found"].get<size_t>());
+
+
+    // upsert unchanged doc
+    add_op = coll1->add(doc.dump(), index_operation_t::UPSERT);
+    ASSERT_TRUE(add_op.ok());
+
+    results = coll1->search("*", {}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD,
+                            spp::sparse_hash_set<std::string>(),
+                            spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
+                            "", 10, {}, {}, {}, 0,
+                            "", "", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7, fallback,
+                            4, {off}, 32767, 32767, 2,
+                            false, true, "vec:([0.12, 0.44, 0.55])").get();
+
+    ASSERT_EQ(1, results["found"].get<size_t>());
+
+    // emplace unchanged doc
+    add_op = coll1->add(doc.dump(), index_operation_t::EMPLACE);
+    ASSERT_TRUE(add_op.ok());
+
+    results = coll1->search("*", {}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD,
+                            spp::sparse_hash_set<std::string>(),
+                            spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
+                            "", 10, {}, {}, {}, 0,
+                            "", "", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7, fallback,
+                            4, {off}, 32767, 32767, 2,
+                            false, true, "vec:([0.12, 0.44, 0.55])").get();
+
+    ASSERT_EQ(1, results["found"].get<size_t>());
+}
+
+TEST_F(CollectionVectorTest, VectorPartialUpdate) {
+    nlohmann::json schema = R"({
+        "name": "coll1",
+        "fields": [
+            {"name": "title", "type": "string"},
+            {"name": "points", "type": "int32"},
+            {"name": "vec", "type": "float[]", "num_dim": 3}
+        ]
+    })"_json;
+
+    Collection* coll1 = collectionManager.create_collection(schema).get();
+
+    std::vector<float> vec = {0.12, 0.45, 0.64};
+
+    nlohmann::json doc;
+    doc["id"] = "0";
+    doc["title"] = "Title";
+    doc["points"] = 100;
+    doc["vec"] = vec;
+
+    auto add_op = coll1->add(doc.dump());
+    ASSERT_TRUE(add_op.ok());
+
+    auto results = coll1->search("*", {}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD,
+                                 spp::sparse_hash_set<std::string>(),
+                                 spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
+                                 "", 10, {}, {}, {}, 0,
+                                 "", "", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7, fallback,
+                                 4, {off}, 32767, 32767, 2,
+                                 false, true, "vec:([0.12, 0.44, 0.55])").get();
+
+    ASSERT_EQ(1, results["found"].get<size_t>());
+
+
+    // emplace partial doc
+    doc.erase("vec");
+    doc["title"] = "Random";
+    add_op = coll1->add(doc.dump(), index_operation_t::EMPLACE);
+    ASSERT_TRUE(add_op.ok());
+
+    results = coll1->search("Random", {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD,
+                            spp::sparse_hash_set<std::string>(),
+                            spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
+                            "", 10, {}, {}, {}, 0,
+                            "", "", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7, fallback,
+                            4, {off}, 32767, 32767, 2,
+                            false, true, "vec:([0.12, 0.44, 0.55])").get();
+
+    ASSERT_EQ(1, results["found"].get<size_t>());
+
+    // update partial doc
+
+    doc.erase("vec");
+    doc["title"] = "Random";
+    add_op = coll1->add(doc.dump(), index_operation_t::UPDATE);
+    ASSERT_TRUE(add_op.ok());
+
+    results = coll1->search("Random", {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD,
+                            spp::sparse_hash_set<std::string>(),
+                            spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
+                            "", 10, {}, {}, {}, 0,
+                            "", "", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7, fallback,
+                            4, {off}, 32767, 32767, 2,
+                            false, true, "vec:([0.12, 0.44, 0.55])").get();
+
+    ASSERT_EQ(1, results["found"].get<size_t>());
+}
+
 TEST_F(CollectionVectorTest, NumVectorGreaterThanNumDim) {
     nlohmann::json schema = R"({
         "name": "coll1",
@@ -692,6 +823,88 @@ TEST_F(CollectionVectorTest, VectorWithNullValue) {
               nlohmann::json::parse(json_lines[1])["error"].get<std::string>());
 }

+TEST_F(CollectionVectorTest, EmbeddedVectorUnchangedUpsert) {
+    nlohmann::json schema = R"({
+        "name": "coll1",
+        "fields": [
+            {"name": "title", "type": "string"},
+            {"name": "points", "type": "int32"},
+            {"name": "embedding", "type":"float[]", "embed":{"from": ["title"],
+                "model_config": {"model_name": "ts/e5-small"}}}
+        ]
+    })"_json;
+
+    TextEmbedderManager::set_model_dir("/tmp/typesense_test/models");
+
+    Collection* coll1 = collectionManager.create_collection(schema).get();
+
+    nlohmann::json doc;
+    doc["id"] = "0";
+    doc["title"] = "Title";
+    doc["points"] = 100;
+
+    auto add_op = coll1->add(doc.dump());
+    ASSERT_TRUE(add_op.ok());
+
+    auto results = coll1->search("title", {"embedding"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD,
+                                 spp::sparse_hash_set<std::string>(),
+                                 spp::sparse_hash_set<std::string>()).get();
+
+    ASSERT_EQ(1, results["found"].get<size_t>());
+    auto embedding = results["hits"][0]["document"]["embedding"].get<std::vector<float>>();
+    ASSERT_EQ(384, embedding.size());
+
+    // upsert unchanged doc
+    doc.clear();
+    doc["id"] = "0";
+    doc["title"] = "Title";
+    doc["points"] = 100;
+
+    add_op = coll1->add(doc.dump(), index_operation_t::UPSERT);
+    ASSERT_TRUE(add_op.ok());
+
+    results = coll1->search("title", {"embedding"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD,
+                            spp::sparse_hash_set<std::string>(),
+                            spp::sparse_hash_set<std::string>()).get();
+    ASSERT_EQ(1, results["found"].get<size_t>());
+    embedding = results["hits"][0]["document"]["embedding"].get<std::vector<float>>();
+    ASSERT_EQ(384, embedding.size());
+
+    // update
+
+    doc.clear();
+    doc["id"] = "0";
+    doc["title"] = "Title";
+    doc["points"] = 100;
+
+    add_op = coll1->add(doc.dump(), index_operation_t::UPDATE);
+    ASSERT_TRUE(add_op.ok());
+
+    results = coll1->search("title", {"embedding"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD,
+                            spp::sparse_hash_set<std::string>(),
+                            spp::sparse_hash_set<std::string>()).get();
+    ASSERT_EQ(1, results["found"].get<size_t>());
+    embedding = results["hits"][0]["document"]["embedding"].get<std::vector<float>>();
+    ASSERT_EQ(384, embedding.size());
+
+    // emplace
+
+    doc.clear();
+    doc["id"] = "0";
+    doc["title"] = "Title";
+    doc["points"] = 100;
+
+    add_op = coll1->add(doc.dump(), index_operation_t::EMPLACE);
+    ASSERT_TRUE(add_op.ok());
+
+    results = coll1->search("title", {"embedding"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD,
+                            spp::sparse_hash_set<std::string>(),
+                            spp::sparse_hash_set<std::string>()).get();
+    ASSERT_EQ(1, results["found"].get<size_t>());
+    embedding = results["hits"][0]["document"]["embedding"].get<std::vector<float>>();
+    ASSERT_EQ(384, embedding.size());
+}
+
 TEST_F(CollectionVectorTest, HybridSearchWithExplicitVector) {
     nlohmann::json schema = R"({
         "name": "objects",
@@ -1099,7 +1312,67 @@ TEST_F(CollectionVectorTest, HideCredential) {
     ASSERT_EQ("***********", coll_summary["fields"][1]["embed"]["model_config"]["project_id"].get<std::string>());
 }

-TEST_F(CollectionVectorTest, UpdateOfCollWithNonOptionalEmbeddingField) {
+TEST_F(CollectionVectorTest, UpdateOfFieldReferencedByEmbedding) {
+    nlohmann::json schema = R"({
+        "name": "objects",
+        "fields": [
+            {"name": "name", "type": "string"},
+            {"name": "embedding", "type":"float[]", "embed":{"from": ["name"],
+                "model_config": {"model_name": "ts/e5-small"}}}
+        ]
+    })"_json;
+
+    TextEmbedderManager::set_model_dir("/tmp/typesense_test/models");
+
+    auto op = collectionManager.create_collection(schema);
+    ASSERT_TRUE(op.ok());
+    Collection* coll = op.get();
+
+    nlohmann::json object;
+    object["id"] = "0";
+    object["name"] = "butter";
+
+    auto add_op = coll->add(object.dump(), CREATE);
+    ASSERT_TRUE(add_op.ok());
+
+    auto results = coll->search("butter", {"embedding"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}).get();
+    ASSERT_EQ(1, results["found"].get<size_t>());
+    auto original_embedding = results["hits"][0]["document"]["embedding"].get<std::vector<float>>();
+
+    nlohmann::json update_object;
+    update_object["id"] = "0";
+    update_object["name"] = "ghee";
+    auto update_op = coll->add(update_object.dump(), EMPLACE);
+    ASSERT_TRUE(update_op.ok());
+
+    results = coll->search("ghee", {"embedding"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}).get();
+    ASSERT_EQ(1, results["found"].get<size_t>());
+    auto updated_embedding = results["hits"][0]["document"]["embedding"].get<std::vector<float>>();
+    ASSERT_NE(original_embedding, updated_embedding);
+
+    // action = update
+    update_object["name"] = "milk";
+    update_op = coll->add(update_object.dump(), UPDATE);
+    ASSERT_TRUE(update_op.ok());
+
+    results = coll->search("milk", {"embedding"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}).get();
+    ASSERT_EQ(1, results["found"].get<size_t>());
+    updated_embedding = results["hits"][0]["document"]["embedding"].get<std::vector<float>>();
+    ASSERT_NE(original_embedding, updated_embedding);
+
+    // action = upsert
+    update_object["name"] = "cheese";
+    update_op = coll->add(update_object.dump(), UPSERT);
+    ASSERT_TRUE(update_op.ok());
+
+    results = coll->search("cheese", {"embedding"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}).get();
+    ASSERT_EQ(1, results["found"].get<size_t>());
+    updated_embedding = results["hits"][0]["document"]["embedding"].get<std::vector<float>>();
+    ASSERT_NE(original_embedding, updated_embedding);
+}
+
+TEST_F(CollectionVectorTest, UpdateOfFieldNotReferencedByEmbedding) {
+    // test updates to a field that isn't referenced by an embedding field
     nlohmann::json schema = R"({
         "name": "objects",
         "fields": [
@@ -1123,16 +1396,34 @@ TEST_F(CollectionVectorTest, UpdateOfFieldNotReferencedByEmbedding) {

     auto add_op = coll->add(object.dump(), CREATE);
     ASSERT_TRUE(add_op.ok());

+    auto results = coll->search("butter", {"embedding"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}).get();
+    ASSERT_EQ(1, results["found"].get<size_t>());
+
     nlohmann::json update_object;
     update_object["id"] = "0";
     update_object["about"] = "something about butter";
     auto update_op = coll->add(update_object.dump(), EMPLACE);
     ASSERT_TRUE(update_op.ok());

+    results = coll->search("butter", {"embedding"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}).get();
+    ASSERT_EQ(1, results["found"].get<size_t>());
+
     // action = update
     update_object["about"] = "something about butter 2";
     update_op = coll->add(update_object.dump(), UPDATE);
     ASSERT_TRUE(update_op.ok());
+
+    results = coll->search("butter", {"embedding"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}).get();
+    ASSERT_EQ(1, results["found"].get<size_t>());
+
+    // action = upsert
+    update_object["name"] = "butter";
+    update_object["about"] = "something about butter 3";
+    update_op = coll->add(update_object.dump(), UPSERT);
+    ASSERT_TRUE(update_op.ok());
+
+    results = coll->search("butter", {"embedding"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}).get();
+    ASSERT_EQ(1, results["found"].get<size_t>());
 }

 TEST_F(CollectionVectorTest, FreshEmplaceWithOptionalEmbeddingReferencedField) {
@@ -1161,6 +1452,27 @@ TEST_F(CollectionVectorTest, FreshEmplaceWithOptionalEmbeddingReferencedField) {
               "or make the embedding field optional.", add_op.error());
 }

+TEST_F(CollectionVectorTest, EmbeddingFieldWithIdFieldPrecedingInSchema) {
+    auto schema = R"({
+        "name": "objects",
+        "fields": [
+            {"name": "id", "type": "string"},
+            {"name": "name", "type": "string"},
+            {"name": "embedding", "type":"float[]", "embed":{"from": ["name"], "model_config": {"model_name": "ts/e5-small"}}}
+        ]
+    })"_json;
+
+    TextEmbedderManager::set_model_dir("/tmp/typesense_test/models");
+
+    auto op = collectionManager.create_collection(schema);
+    ASSERT_TRUE(op.ok());
+    Collection* coll = op.get();
+
+    auto fs = coll->get_fields();
+    ASSERT_EQ(2, fs.size());
+    ASSERT_EQ(384, fs[1].num_dim);
+}
+
 TEST_F(CollectionVectorTest, SkipEmbeddingOpWhenValueExists) {
     nlohmann::json schema = R"({
         "name": "objects",
@@ -1306,6 +1618,58 @@ TEST_F(CollectionVectorTest, KeywordSearchReturnOnlyTextMatchInfo) {
     ASSERT_EQ(1, results["hits"][0].count("text_match_info"));
 }

+TEST_F(CollectionVectorTest, GroupByWithVectorSearch) {
+    nlohmann::json schema = R"({
+        "name": "coll1",
+        "fields": [
+            {"name": "title", "type": "string"},
+            {"name": "group", "type": "string", "facet": true},
+            {"name": "vec", "type": "float[]", "num_dim": 4}
+        ]
+    })"_json;
+
+    Collection* coll1 = collectionManager.create_collection(schema).get();
+
+    std::vector<std::vector<float>> values = {
+        {0.851758, 0.909671, 0.823431, 0.372063},
+        {0.97826, 0.933157, 0.39557, 0.306488},
+        {0.230606, 0.634397, 0.514009, 0.399594}
+    };
+
+    for (size_t i = 0; i < values.size(); i++) {
+        nlohmann::json doc;
+        doc["id"] = std::to_string(i);
+        doc["title"] = std::to_string(i) + " title";
+        doc["group"] = "0";
+        doc["vec"] = values[i];
+        ASSERT_TRUE(coll1->add(doc.dump()).ok());
+    }
+
+    auto res = coll1->search("title", {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD,
+                             spp::sparse_hash_set<std::string>(),
+                             spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
+                             "", 10, {}, {}, {"group"}, 1,
+                             "", "", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7, fallback,
+                             4, {off}, 32767, 32767, 2,
+                             false, true, "vec:([0.96826, 0.94, 0.39557, 0.306488])").get();
+
+    ASSERT_EQ(1, res["grouped_hits"].size());
+    ASSERT_EQ(1, res["grouped_hits"][0]["hits"].size());
+    ASSERT_EQ(1, res["grouped_hits"][0]["hits"][0].count("vector_distance"));
+
+    res = coll1->search("*", {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD,
+                        spp::sparse_hash_set<std::string>(),
+                        spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
+                        "", 10, {}, {}, {"group"}, 1,
+                        "", "", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7, fallback,
+                        4, {off}, 32767, 32767, 2,
+                        false, true, "vec:([0.96826, 0.94, 0.39557, 0.306488])").get();
+
+    ASSERT_EQ(1, res["grouped_hits"].size());
+    ASSERT_EQ(1, res["grouped_hits"][0]["hits"].size());
+    ASSERT_EQ(1, res["grouped_hits"][0]["hits"][0].count("vector_distance"));
+}
+
 TEST_F(CollectionVectorTest, HybridSearchReturnAllInfo) {
     auto schema_json = R"({
@@ -1342,3 +1706,135 @@ TEST_F(CollectionVectorTest, HybridSearchReturnAllInfo) {
     ASSERT_EQ(1, results["hits"][0].count("text_match_info"));
     ASSERT_EQ(1, results["hits"][0].count("hybrid_search_info"));
 }
+
+
+TEST_F(CollectionVectorTest, DISABLED_HybridSortingTest) {
+    auto schema_json =
+        R"({
+            "name": "TEST",
+            "fields": [
+                {"name": "name", "type": "string"},
+                {"name": "embedding", "type":"float[]", "embed":{"from": ["name"], "model_config": {"model_name": "ts/e5-small"}}}
+            ]
+        })"_json;
+
+    TextEmbedderManager::set_model_dir("/tmp/typesense_test/models");
+    auto collection_create_op = collectionManager.create_collection(schema_json);
+    ASSERT_TRUE(collection_create_op.ok());
+    auto coll1 = collection_create_op.get();
+
+    auto add_op = coll1->add(R"({
+        "name": "john doe"
+    })"_json.dump());
+    ASSERT_TRUE(add_op.ok());
+
+    add_op = coll1->add(R"({
+        "name": "john legend"
+    })"_json.dump());
+    ASSERT_TRUE(add_op.ok());
+
+    add_op = coll1->add(R"({
+        "name": "john krasinski"
+    })"_json.dump());
+    ASSERT_TRUE(add_op.ok());
+
+    add_op = coll1->add(R"({
+        "name": "john abraham"
+    })"_json.dump());
+    ASSERT_TRUE(add_op.ok());
+
+    // first do keyword search
+    auto results = coll1->search("john", {"name"},
+                                 "", {}, {}, {2}, 10,
+                                 1, FREQUENCY, {true},
+                                 0, spp::sparse_hash_set<std::string>()).get();
+
+    ASSERT_EQ(4, results["hits"].size());
+
+
+    // now do hybrid search with sort_by: _text_match:desc,_vector_distance:asc
+    std::vector<sort_by> sort_by_list = {{"_text_match", "desc"}, {"_vector_distance", "asc"}};
+
+    auto hybrid_results = coll1->search("john", {"name", "embedding"},
+                                        "", {}, sort_by_list, {2}, 10,
+                                        1, FREQUENCY, {true},
+                                        0, spp::sparse_hash_set<std::string>()).get();
+
+    // first 4 results should be same as keyword search
+    ASSERT_EQ(results["hits"][0]["document"]["name"].get<std::string>(), hybrid_results["hits"][0]["document"]["name"].get<std::string>());
+    ASSERT_EQ(results["hits"][1]["document"]["name"].get<std::string>(), hybrid_results["hits"][1]["document"]["name"].get<std::string>());
+    ASSERT_EQ(results["hits"][2]["document"]["name"].get<std::string>(), hybrid_results["hits"][2]["document"]["name"].get<std::string>());
+    ASSERT_EQ(results["hits"][3]["document"]["name"].get<std::string>(), hybrid_results["hits"][3]["document"]["name"].get<std::string>());
+}
+
+TEST_F(CollectionVectorTest, TestDifferentOpenAIApiKeys) {
+    if (std::getenv("api_key_1") == nullptr || std::getenv("api_key_2") == nullptr) {
+        LOG(INFO) << "Skipping test as api_key_1 or api_key_2 is not set";
+        return;
+    }
+
+    auto api_key1 = std::string(std::getenv("api_key_1"));
+    auto api_key2 = std::string(std::getenv("api_key_2"));
+
+    auto embedder_map = TextEmbedderManager::get_instance()._get_text_embedders();
+
+    ASSERT_EQ(embedder_map.find("openai/text-embedding-ada-002:" + api_key1), embedder_map.end());
+    ASSERT_EQ(embedder_map.find("openai/text-embedding-ada-002:" + api_key2), embedder_map.end());
+    ASSERT_EQ(embedder_map.find("openai/text-embedding-ada-002"), embedder_map.end());
+
+    nlohmann::json model_config1 = R"({
+        "model_name": "openai/text-embedding-ada-002"
+    })"_json;
+
+    nlohmann::json model_config2 = model_config1;
+
+    model_config1["api_key"] = api_key1;
+    model_config2["api_key"] = api_key2;
+
+    size_t num_dim;
+    TextEmbedderManager::get_instance().validate_and_init_remote_model(model_config1, num_dim);
+    TextEmbedderManager::get_instance().validate_and_init_remote_model(model_config2, num_dim);
+
+    embedder_map = TextEmbedderManager::get_instance()._get_text_embedders();
+
+    ASSERT_NE(embedder_map.find("openai/text-embedding-ada-002:" + api_key1), embedder_map.end());
+    ASSERT_NE(embedder_map.find("openai/text-embedding-ada-002:" + api_key2), embedder_map.end());
+    ASSERT_EQ(embedder_map.find("openai/text-embedding-ada-002"), embedder_map.end());
+}
+
+
+TEST_F(CollectionVectorTest, TestMultilingualE5) {
+    auto schema_json =
+        R"({
+            "name": "TEST",
+            "fields": [
+                {"name": "name", "type": "string"},
+                {"name": "embedding", "type":"float[]", "embed":{"from": ["name"], "model_config": {"model_name": "ts/multilingual-e5-small"}}}
+            ]
+        })"_json;
+
+    TextEmbedderManager::set_model_dir("/tmp/typesense_test/models");
+
+    auto collection_create_op = collectionManager.create_collection(schema_json);
+
+    ASSERT_TRUE(collection_create_op.ok());
+    auto coll1 = collection_create_op.get();
+
+    auto add_op = coll1->add(R"({
+        "name": "john doe"
+    })"_json.dump());
+
+    auto hybrid_results = coll1->search("john", {"name", "embedding"},
+                                        "", {}, {}, {2}, 10,
+                                        1, FREQUENCY, {true},
+                                        0, spp::sparse_hash_set<std::string>());
+
+    ASSERT_TRUE(hybrid_results.ok());
+
+    auto semantic_results = coll1->search("john", {"embedding"},
+                                          "", {}, {}, {2}, 10,
+                                          1, FREQUENCY, {true},
+                                          0, spp::sparse_hash_set<std::string>());
+
+    ASSERT_TRUE(semantic_results.ok());
+}
\ No newline at end of file
diff --git a/test/core_api_utils_test.cpp b/test/core_api_utils_test.cpp
index 7a1005e0..e39ede35 100644
--- a/test/core_api_utils_test.cpp
+++ b/test/core_api_utils_test.cpp
@@ -610,7 +610,7 @@ TEST_F(CoreAPIUtilsTest, MultiSearchWithPresetShouldUsePresetForAuth) {
     ASSERT_EQ(2, embedded_params_vec.size());
 }

-TEST_F(CoreAPIUtilsTest, PresetSingleSearch) {
+TEST_F(CoreAPIUtilsTest, PresetMultiSearch) {
     nlohmann::json schema = R"({
         "name": "coll1",
         "fields": [
@@ -634,7 +634,7 @@ TEST_F(CoreAPIUtilsTest, PresetMultiSearch) {

     auto search_body = R"(
         {"searches":[
-            {"collection":"coll1","q":"apple", "query_by": "title", "preset": "single_preset"}
+            {"collection":"coll1","q":"apple", "query_by": "name", "preset": "single_preset"}
         ]}
     )";

@@ -644,8 +644,40 @@ TEST_F(CoreAPIUtilsTest, PresetMultiSearch) {

     post_multi_search(req, res);

-    ASSERT_EQ("12", req->params["per_page"]);
-    ASSERT_EQ("coll1", req->params["collection"]);
+    auto res_json = nlohmann::json::parse(res->body);
+    ASSERT_EQ(1, res_json["results"].size());
+    ASSERT_EQ(0, res_json["results"][0]["found"].get<size_t>());
+
+    // with multiple "searches" preset configuration
+    preset_value = R"(
+        {"searches":[
+            {"collection":"coll1", "q": "*", "per_page": "8"},
+            {"collection":"coll1", "q": "*", "per_page": "11"}
+        ]}
+    )"_json;
+
+    collectionManager.upsert_preset("multi_preset", preset_value);
+    embedded_params.clear();
+    req->params.clear();
+    req->params["preset"] = "multi_preset";
+    req->embedded_params_vec.clear();
+    req->embedded_params_vec.push_back(embedded_params);
+    req->embedded_params_vec.push_back(embedded_params);
+
+    // "preset": "multi_preset"
+    search_body = R"(
{"searches":[ + {"collection":"coll1","q":"apple", "query_by": "title"} + ]} + )"; + + req->body = search_body; + + post_multi_search(req, res); + res_json = nlohmann::json::parse(res->body); + ASSERT_EQ(2, res_json["results"].size()); + ASSERT_EQ(0, res_json["results"][0]["found"].get()); + ASSERT_EQ(0, res_json["results"][1]["found"].get()); collectionManager.drop_collection("coll1"); }