From c2c3c26ebfb2ca3ab0daf867630eb52b01261f3b Mon Sep 17 00:00:00 2001 From: ozanarmagan Date: Tue, 15 Aug 2023 12:01:31 +0300 Subject: [PATCH 01/28] Added test for hybrid sorting --- test/collection_vector_search_test.cpp | 60 ++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) diff --git a/test/collection_vector_search_test.cpp b/test/collection_vector_search_test.cpp index 2f55cd34..600f8fb4 100644 --- a/test/collection_vector_search_test.cpp +++ b/test/collection_vector_search_test.cpp @@ -1214,3 +1214,63 @@ TEST_F(CollectionVectorTest, HybridSearchReturnAllInfo) { ASSERT_EQ(1, results["hits"][0].count("text_match_info")); ASSERT_EQ(1, results["hits"][0].count("hybrid_search_info")); } + + +TEST_F(CollectionVectorTest, HybridSortingTest) { + auto schema_json = + R"({ + "name": "TEST", + "fields": [ + {"name": "name", "type": "string"}, + {"name": "embedding", "type":"float[]", "embed":{"from": ["name"], "model_config": {"model_name": "ts/e5-small"}}} + ] + })"_json; + + TextEmbedderManager::set_model_dir("/tmp/typesense_test/models"); + auto collection_create_op = collectionManager.create_collection(schema_json); + ASSERT_TRUE(collection_create_op.ok()); + auto coll1 = collection_create_op.get(); + + auto add_op = coll1->add(R"({ + "name": "john doe" + })"_json.dump()); + ASSERT_TRUE(add_op.ok()); + + add_op = coll1->add(R"({ + "name": "john legend" + })"_json.dump()); + ASSERT_TRUE(add_op.ok()); + + add_op = coll1->add(R"({ + "name": "john krasinski" + })"_json.dump()); + ASSERT_TRUE(add_op.ok()); + + add_op = coll1->add(R"({ + "name": "john abraham" + })"_json.dump()); + ASSERT_TRUE(add_op.ok()); + + // first do keyword search + auto results = coll1->search("john", {"name"}, + "", {}, {}, {2}, 10, + 1, FREQUENCY, {true}, + 0, spp::sparse_hash_set()).get(); + + ASSERT_EQ(4, results["hits"].size()); + + + // now do hybrid search with sort_by: _text_match:desc,_vector_distance:asc + std::vector sort_by_list = {{"_text_match", "desc"}, {"_vector_distance", "asc"}}; + + auto hybrid_results = coll1->search("john", {"name", "embedding"}, + "", {}, sort_by_list, {2}, 10, + 1, FREQUENCY, {true}, + 0, spp::sparse_hash_set()).get(); + + // first 4 results should be same as keyword search + ASSERT_EQ(results["hits"][0]["document"]["name"].get(), hybrid_results["hits"][0]["document"]["name"].get()); + ASSERT_EQ(results["hits"][1]["document"]["name"].get(), hybrid_results["hits"][1]["document"]["name"].get()); + ASSERT_EQ(results["hits"][2]["document"]["name"].get(), hybrid_results["hits"][2]["document"]["name"].get()); + ASSERT_EQ(results["hits"][3]["document"]["name"].get(), hybrid_results["hits"][3]["document"]["name"].get()); +} \ No newline at end of file From 6f4e4bc77a26d2500e3227c58c519490a96342c7 Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Sat, 19 Aug 2023 12:14:05 +0530 Subject: [PATCH 02/28] Check for vector dimensions explicitly. 
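The hunk below makes Index::index_field_in_memory reject a document whose float array length differs from the field's declared num_dim, instead of passing a wrong-sized buffer to the HNSW index. A minimal standalone sketch of the same guard, for illustration only; validate_vector_dims and its error handling are stand-ins, not code from this patch:

    #include <cstddef>
    #include <string>
    #include <vector>

    // Illustrative stand-in, not code from the patch: reject a vector whose length
    // does not match the dimension declared in the schema.
    bool validate_vector_dims(const std::vector<float>& vals, std::size_t num_dim, std::string& error) {
        if(vals.size() != num_dim) {
            error = "Vector size mismatch.";   // the patch reports this as a 400-level indexing failure
            return false;
        }
        return true;
    }

With the guard in place, cosine normalization and addPoint only ever run on vectors of the expected size.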
--- src/index.cpp | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/src/index.cpp b/src/index.cpp index 4d81126a..2f8be825 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -870,12 +870,16 @@ void Index::index_field_in_memory(const field& afield, std::vector try { const std::vector& float_vals = record.doc[afield.name].get>(); - if(afield.vec_dist == cosine) { - std::vector normalized_vals(afield.num_dim); - hnsw_index_t::normalize_vector(float_vals, normalized_vals); - vec_index->addPoint(normalized_vals.data(), (size_t)record.seq_id, true); + if(float_vals.size() != afield.num_dim) { + record.index_failure(400, "Vector size mismatch."); } else { - vec_index->addPoint(float_vals.data(), (size_t)record.seq_id, true); + if(afield.vec_dist == cosine) { + std::vector normalized_vals(afield.num_dim); + hnsw_index_t::normalize_vector(float_vals, normalized_vals); + vec_index->addPoint(normalized_vals.data(), (size_t)record.seq_id, true); + } else { + vec_index->addPoint(float_vals.data(), (size_t)record.seq_id, true); + } } } catch(const std::exception &e) { record.index_failure(400, e.what()); From 24a67140f248da9733ff7d1a496a2689ee37b980 Mon Sep 17 00:00:00 2001 From: ozanarmagan Date: Sat, 19 Aug 2023 10:59:50 +0300 Subject: [PATCH 03/28] Fix model key for remote embedders --- include/text_embedder.h | 3 +++ include/text_embedder_manager.h | 4 +++ include/text_embedder_remote.h | 4 +++ src/text_embedder_manager.cpp | 14 +++++++--- src/text_embedder_remote.cpp | 28 +++++++++++++++++++ test/collection_vector_search_test.cpp | 37 +++++++++++++++++++++++++- 6 files changed, 86 insertions(+), 4 deletions(-) diff --git a/include/text_embedder.h b/include/text_embedder.h index ca64aa52..241570a3 100644 --- a/include/text_embedder.h +++ b/include/text_embedder.h @@ -24,6 +24,9 @@ class TextEmbedder { return remote_embedder_ != nullptr; } Option validate(); + RemoteEmbedder* _get_remote_embedder() { + return remote_embedder_.get(); + } private: std::unique_ptr session_; Ort::Env env_; diff --git a/include/text_embedder_manager.h b/include/text_embedder_manager.h index 543e8f91..2681e0b9 100644 --- a/include/text_embedder_manager.h +++ b/include/text_embedder_manager.h @@ -72,6 +72,10 @@ public: Option validate_and_init_local_model(const nlohmann::json& model_config, size_t& num_dims); Option validate_and_init_model(const nlohmann::json& model_config, size_t& num_dims); + std::unordered_map> _get_text_embedders() { + return text_embedders; + } + private: TextEmbedderManager() = default; diff --git a/include/text_embedder_remote.h b/include/text_embedder_remote.h index 8167fefd..b5f219d5 100644 --- a/include/text_embedder_remote.h +++ b/include/text_embedder_remote.h @@ -31,6 +31,7 @@ class RemoteEmbedder { virtual nlohmann::json get_error_json(const nlohmann::json& req_body, long res_code, const std::string& res_body) = 0; virtual embedding_res_t Embed(const std::string& text, const size_t remote_embedder_timeout_ms = 30000, const size_t remote_embedding_num_tries = 2) = 0; virtual std::vector batch_embed(const std::vector& inputs, const size_t remote_embedding_batch_size = 200) = 0; + static const std::string get_model_key(const nlohmann::json& model_config); static void init(ReplicationState* rs) { raft_server = rs; } @@ -51,6 +52,7 @@ class OpenAIEmbedder : public RemoteEmbedder { embedding_res_t Embed(const std::string& text, const size_t remote_embedder_timeout_ms = 30000, const size_t remote_embedding_num_tries = 2) override; std::vector batch_embed(const 
std::vector& inputs, const size_t remote_embedding_batch_size = 200) override; nlohmann::json get_error_json(const nlohmann::json& req_body, long res_code, const std::string& res_body) override; + static std::string get_model_key(const nlohmann::json& model_config); }; @@ -68,6 +70,7 @@ class GoogleEmbedder : public RemoteEmbedder { embedding_res_t Embed(const std::string& text, const size_t remote_embedder_timeout_ms = 30000, const size_t remote_embedding_num_tries = 2) override; std::vector batch_embed(const std::vector& inputs, const size_t remote_embedding_batch_size = 200) override; nlohmann::json get_error_json(const nlohmann::json& req_body, long res_code, const std::string& res_body) override; + static std::string get_model_key(const nlohmann::json& model_config); }; @@ -95,6 +98,7 @@ class GCPEmbedder : public RemoteEmbedder { embedding_res_t Embed(const std::string& text, const size_t remote_embedder_timeout_ms = 30000, const size_t remote_embedding_num_tries = 2) override; std::vector batch_embed(const std::vector& inputs, const size_t remote_embedding_batch_size = 200) override; nlohmann::json get_error_json(const nlohmann::json& req_body, long res_code, const std::string& res_body) override; + static std::string get_model_key(const nlohmann::json& model_config); }; diff --git a/src/text_embedder_manager.cpp b/src/text_embedder_manager.cpp index ac2c110f..8b34b949 100644 --- a/src/text_embedder_manager.cpp +++ b/src/text_embedder_manager.cpp @@ -43,9 +43,13 @@ Option TextEmbedderManager::validate_and_init_remote_model(const nlohmann: } std::unique_lock lock(text_embedders_mutex); - auto text_embedder_it = text_embedders.find(model_name); + std::string model_key = model_name; + if(is_remote_model(model_name)) { + model_key = RemoteEmbedder::get_model_key(model_config); + } + auto text_embedder_it = text_embedders.find(model_key); if(text_embedder_it == text_embedders.end()) { - text_embedders.emplace(model_name, std::make_shared(model_config, num_dims)); + text_embedders.emplace(model_key, std::make_shared(model_config, num_dims)); } return Option(true); @@ -122,7 +126,11 @@ Option TextEmbedderManager::validate_and_init_local_model(const nlohmann:: Option TextEmbedderManager::get_text_embedder(const nlohmann::json& model_config) { std::unique_lock lock(text_embedders_mutex); const std::string& model_name = model_config.at("model_name"); - auto text_embedder_it = text_embedders.find(model_name); + std::string model_key = model_name; + if(is_remote_model(model_name)) { + model_key = RemoteEmbedder::get_model_key(model_config); + } + auto text_embedder_it = text_embedders.find(model_key); if(text_embedder_it == text_embedders.end()) { return Option(404, "Text embedder was not found."); diff --git a/src/text_embedder_remote.cpp b/src/text_embedder_remote.cpp index e59f93bf..a7a745e0 100644 --- a/src/text_embedder_remote.cpp +++ b/src/text_embedder_remote.cpp @@ -53,6 +53,21 @@ long RemoteEmbedder::call_remote_api(const std::string& method, const std::strin proxy_call_timeout_ms, true); } + +const std::string RemoteEmbedder::get_model_key(const nlohmann::json& model_config) { + const std::string model_namespace = TextEmbedderManager::get_model_namespace(model_config["model_name"].get()); + + if(model_namespace == "openai") { + return OpenAIEmbedder::get_model_key(model_config); + } else if(model_namespace == "google") { + return GoogleEmbedder::get_model_key(model_config); + } else if(model_namespace == "gcp") { + return GCPEmbedder::get_model_key(model_config); + } else { + return 
""; + } +} + OpenAIEmbedder::OpenAIEmbedder(const std::string& openai_model_path, const std::string& api_key) : api_key(api_key), openai_model_path(openai_model_path) { } @@ -255,6 +270,9 @@ nlohmann::json OpenAIEmbedder::get_error_json(const nlohmann::json& req_body, lo return embedding_res; } +std::string OpenAIEmbedder::get_model_key(const nlohmann::json& model_config) { + return model_config["model_name"].get() + ":" + model_config["api_key"].get(); +} GoogleEmbedder::GoogleEmbedder(const std::string& google_api_key) : google_api_key(google_api_key) { @@ -372,6 +390,10 @@ nlohmann::json GoogleEmbedder::get_error_json(const nlohmann::json& req_body, lo return embedding_res; } +std::string GoogleEmbedder::get_model_key(const nlohmann::json& model_config) { + return model_config["model_name"].get() + ":" + model_config["api_key"].get(); +} + GCPEmbedder::GCPEmbedder(const std::string& project_id, const std::string& model_name, const std::string& access_token, const std::string& refresh_token, const std::string& client_id, const std::string& client_secret) : @@ -625,3 +647,9 @@ Option GCPEmbedder::generate_access_token(const std::string& refres return Option(access_token); } + +std::string GCPEmbedder::get_model_key(const nlohmann::json& model_config) { + return model_config["model_name"].get() + ":" + model_config["project_id"].get() + ":" + + model_config["access_token"].get() + ":" + model_config["refresh_token"].get() + ":" + + model_config["client_id"].get() + ":" + model_config["client_secret"].get(); +} \ No newline at end of file diff --git a/test/collection_vector_search_test.cpp b/test/collection_vector_search_test.cpp index 600f8fb4..1b02cdf6 100644 --- a/test/collection_vector_search_test.cpp +++ b/test/collection_vector_search_test.cpp @@ -1216,7 +1216,7 @@ TEST_F(CollectionVectorTest, HybridSearchReturnAllInfo) { } -TEST_F(CollectionVectorTest, HybridSortingTest) { +TEST_F(CollectionVectorTest, DISABLED_HybridSortingTest) { auto schema_json = R"({ "name": "TEST", @@ -1273,4 +1273,39 @@ TEST_F(CollectionVectorTest, HybridSortingTest) { ASSERT_EQ(results["hits"][1]["document"]["name"].get(), hybrid_results["hits"][1]["document"]["name"].get()); ASSERT_EQ(results["hits"][2]["document"]["name"].get(), hybrid_results["hits"][2]["document"]["name"].get()); ASSERT_EQ(results["hits"][3]["document"]["name"].get(), hybrid_results["hits"][3]["document"]["name"].get()); +} + +TEST_F(CollectionVectorTest, TestDifferentOpenAIApiKeys) { + if (std::getenv("api_key_1") == nullptr || std::getenv("api_key_2") == nullptr) { + LOG(INFO) << "Skipping test as api_key_1 or api_key_2 is not set"; + return; + } + + auto api_key1 = std::string(std::getenv("api_key_1")); + auto api_key2 = std::string(std::getenv("api_key_2")); + + auto embedder_map = TextEmbedderManager::get_instance()._get_text_embedders(); + + ASSERT_EQ(embedder_map.find("openai/text-embedding-ada-002:" + api_key1), embedder_map.end()); + ASSERT_EQ(embedder_map.find("openai/text-embedding-ada-002:" + api_key2), embedder_map.end()); + ASSERT_EQ(embedder_map.find("openai/text-embedding-ada-002"), embedder_map.end()); + + nlohmann::json model_config1 = R"({ + "model_name": "openai/text-embedding-ada-002" + })"_json; + + nlohmann::json model_config2 = model_config1; + + model_config1["api_key"] = api_key1; + model_config2["api_key"] = api_key2; + + size_t num_dim; + TextEmbedderManager::get_instance().validate_and_init_remote_model(model_config1, num_dim); + TextEmbedderManager::get_instance().validate_and_init_remote_model(model_config2, 
num_dim); + + embedder_map = TextEmbedderManager::get_instance()._get_text_embedders(); + + ASSERT_NE(embedder_map.find("openai/text-embedding-ada-002:" + api_key1), embedder_map.end()); + ASSERT_NE(embedder_map.find("openai/text-embedding-ada-002:" + api_key2), embedder_map.end()); + ASSERT_EQ(embedder_map.find("openai/text-embedding-ada-002"), embedder_map.end()); } \ No newline at end of file From c3ec46cfba68881f3384a6325257a5237a2834f3 Mon Sep 17 00:00:00 2001 From: ozanarmagan Date: Sat, 19 Aug 2023 11:02:09 +0300 Subject: [PATCH 04/28] Remove unnecessary test function --- include/text_embedder.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/include/text_embedder.h b/include/text_embedder.h index 241570a3..ca64aa52 100644 --- a/include/text_embedder.h +++ b/include/text_embedder.h @@ -24,9 +24,6 @@ class TextEmbedder { return remote_embedder_ != nullptr; } Option validate(); - RemoteEmbedder* _get_remote_embedder() { - return remote_embedder_.get(); - } private: std::unique_ptr session_; Ort::Env env_; From 5120148b5330d604db1f7e758ab7d0e20cb17052 Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Sat, 19 Aug 2023 16:46:21 +0530 Subject: [PATCH 05/28] Dropped field should be removed properly from embed_from. --- include/collection.h | 2 +- src/collection.cpp | 44 +++++++++++++++++--------- test/collection_schema_change_test.cpp | 36 +++++++++++++++++---- 3 files changed, 59 insertions(+), 23 deletions(-) diff --git a/include/collection.h b/include/collection.h index f95b7c9f..e9ca2b2a 100644 --- a/include/collection.h +++ b/include/collection.h @@ -162,7 +162,7 @@ private: void remove_document(const nlohmann::json & document, const uint32_t seq_id, bool remove_from_store); - void process_remove_field_for_embedding_fields(const field& the_field, std::vector& garbage_fields); + void process_remove_field_for_embedding_fields(const field& del_field, std::vector& garbage_embed_fields); void curate_results(string& actual_query, const string& filter_query, bool enable_overrides, bool already_segmented, const std::map>& pinned_hits, diff --git a/src/collection.cpp b/src/collection.cpp index 8883f4ab..62e0e0a2 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -3942,7 +3942,6 @@ Option Collection::alter(nlohmann::json& alter_payload) { } } - // hide credentials in the alter payload return for(auto& field_json : alter_payload["fields"]) { if(field_json[fields::embed].count(fields::model_config) != 0) { @@ -3955,8 +3954,6 @@ Option Collection::alter(nlohmann::json& alter_payload) { } } - - return Option(true); } @@ -4904,27 +4901,44 @@ Option Collection::populate_include_exclude_fields_lk(const spp::sparse_ha } // Removes the dropped field from embed_from of all embedding fields. 
-void Collection::process_remove_field_for_embedding_fields(const field& the_field, std::vector& garbage_fields) { +void Collection::process_remove_field_for_embedding_fields(const field& del_field, + std::vector& garbage_embed_fields) { for(auto& field : fields) { if(field.embed.count(fields::from) == 0) { continue; } - auto embed_from = field.embed[fields::from].get>(); - embed_from.erase(std::remove_if(embed_from.begin(), embed_from.end(), [&the_field](std::string field_name) { - return the_field.name == field_name; - })); - field.embed[fields::from] = std::move(embed_from); - embedding_fields[field.name] = field; - // mark this embedding field as "garbage" if it has no more embed_from fields - if(embed_from.empty()) { - embedding_fields.erase(field.name); - garbage_fields.push_back(field); + bool found_field = false; + nlohmann::json& embed_from_names = field.embed[fields::from]; + for(auto it = embed_from_names.begin(); it != embed_from_names.end();) { + LOG(INFO) << it.value(); + if(it.value() == del_field.name) { + it = embed_from_names.erase(it); + found_field = true; + } else { + it++; + } } - + if(found_field) { + // mark this embedding field as "garbage" if it has no more embed_from fields + if(embed_from_names.empty()) { + garbage_embed_fields.push_back(field); + } else { + // the dropped field was present in `embed_from`, so we have to update the field objects + field.embed[fields::from] = embed_from_names; + embedding_fields[field.name].embed[fields::from] = embed_from_names; + } + } } + for(auto& garbage_field: garbage_embed_fields) { + embedding_fields.erase(garbage_field.name); + search_schema.erase(garbage_field.name); + fields.erase(std::remove_if(fields.begin(), fields.end(), [&garbage_field](const auto &f) { + return f.name == garbage_field.name; + }), fields.end()); + } } void Collection::hide_credential(nlohmann::json& json, const std::string& credential_name) { diff --git a/test/collection_schema_change_test.cpp b/test/collection_schema_change_test.cpp index 7bc6f32c..0d8364fd 100644 --- a/test/collection_schema_change_test.cpp +++ b/test/collection_schema_change_test.cpp @@ -1580,9 +1580,13 @@ TEST_F(CollectionSchemaChangeTest, DropFieldUsedForEmbedding) { nlohmann::json schema = R"({ "name": "objects", "fields": [ - {"name": "names", "type": "string[]"}, - {"name": "category", "type":"string"}, - {"name": "embedding", "type":"float[]", "embed":{"from": ["names","category"], "model_config": {"model_name": "ts/e5-small"}}} + {"name": "title", "type": "string"}, + {"name": "names", "type": "string[]"}, + {"name": "category", "type":"string"}, + {"name": "embedding", "type":"float[]", "embed":{"from": ["names","category"], + "model_config": {"model_name": "ts/e5-small"}}}, + {"name": "embedding2", "type":"float[]", "embed":{"from": ["names"], + "model_config": {"model_name": "ts/e5-small"}}} ] })"_json; @@ -1594,20 +1598,28 @@ TEST_F(CollectionSchemaChangeTest, DropFieldUsedForEmbedding) { LOG(INFO) << "Created collection"; + auto embedding_fields = coll->get_embedding_fields(); + ASSERT_EQ(2, embedding_fields.size()); + ASSERT_EQ(2, embedding_fields["embedding"].embed[fields::from].get>().size()); + ASSERT_EQ(1, embedding_fields["embedding2"].embed[fields::from].get>().size()); + + auto coll_schema = coll->get_schema(); + ASSERT_EQ(5, coll_schema.size()); + + auto the_fields = coll->get_fields(); + ASSERT_EQ(5, the_fields.size()); + auto schema_changes = R"({ "fields": [ {"name": "names", "drop": true} ] })"_json; - - auto embedding_fields = 
coll->get_embedding_fields(); - ASSERT_EQ(2, embedding_fields["embedding"].embed[fields::from].get>().size()); - auto alter_op = coll->alter(schema_changes); ASSERT_TRUE(alter_op.ok()); embedding_fields = coll->get_embedding_fields(); + ASSERT_EQ(1, embedding_fields.size()); ASSERT_EQ(1, embedding_fields["embedding"].embed[fields::from].get>().size()); ASSERT_EQ("category", embedding_fields["embedding"].embed[fields::from].get>()[0]); @@ -1623,6 +1635,16 @@ TEST_F(CollectionSchemaChangeTest, DropFieldUsedForEmbedding) { embedding_fields = coll->get_embedding_fields(); ASSERT_EQ(0, embedding_fields.size()); ASSERT_EQ(0, coll->_get_index()->_get_vector_index().size()); + + // only title remains + + coll_schema = coll->get_schema(); + ASSERT_EQ(1, coll_schema.size()); + ASSERT_EQ("title", coll_schema["title"].name); + + the_fields = coll->get_fields(); + ASSERT_EQ(1, the_fields.size()); + ASSERT_EQ("title", the_fields[0].name); } TEST_F(CollectionSchemaChangeTest, EmbeddingFieldsMapTest) { From 8ebabe9cdbc1c8d8d8c434ea969d06d714fbbdb0 Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Sat, 19 Aug 2023 17:00:55 +0530 Subject: [PATCH 06/28] Fix charset in content type header. --- src/core_api.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/core_api.cpp b/src/core_api.cpp index 86b85e27..68f144bf 100644 --- a/src/core_api.cpp +++ b/src/core_api.cpp @@ -729,7 +729,7 @@ bool get_export_documents(const std::shared_ptr& req, const std::share } } - res->content_type_header = "text/plain; charset=utf8"; + res->content_type_header = "text/plain; charset=utf-8"; res->status_code = 200; stream_response(req, res); @@ -902,7 +902,7 @@ bool post_import_documents(const std::shared_ptr& req, const std::shar } } - res->content_type_header = "text/plain; charset=utf8"; + res->content_type_header = "text/plain; charset=utf-8"; res->status_code = 200; res->body = response_stream.str(); From ef6e92192401b7ef56936c5cdaa91dd406d24d65 Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Sat, 19 Aug 2023 18:19:19 +0530 Subject: [PATCH 07/28] Remove stray log. 
--- src/collection.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/collection.cpp b/src/collection.cpp index 62e0e0a2..5d92b5b9 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -4911,7 +4911,6 @@ void Collection::process_remove_field_for_embedding_fields(const field& del_fiel bool found_field = false; nlohmann::json& embed_from_names = field.embed[fields::from]; for(auto it = embed_from_names.begin(); it != embed_from_names.end();) { - LOG(INFO) << it.value(); if(it.value() == del_field.name) { it = embed_from_names.erase(it); found_field = true; From 2d855a9f0ae48802d3edbd0d100d17bf224cfd25 Mon Sep 17 00:00:00 2001 From: ozanarmagan Date: Sun, 20 Aug 2023 15:36:36 +0300 Subject: [PATCH 08/28] Use ternary operator for model key --- src/text_embedder_manager.cpp | 14 +++----------- src/text_embedder_remote.cpp | 4 +--- 2 files changed, 4 insertions(+), 14 deletions(-) diff --git a/src/text_embedder_manager.cpp b/src/text_embedder_manager.cpp index 8b34b949..489fd208 100644 --- a/src/text_embedder_manager.cpp +++ b/src/text_embedder_manager.cpp @@ -43,13 +43,9 @@ Option TextEmbedderManager::validate_and_init_remote_model(const nlohmann: } std::unique_lock lock(text_embedders_mutex); - std::string model_key = model_name; - if(is_remote_model(model_name)) { - model_key = RemoteEmbedder::get_model_key(model_config); - } - auto text_embedder_it = text_embedders.find(model_key); + auto text_embedder_it = text_embedders.find(is_remote_model(model_name) ? RemoteEmbedder::get_model_key(model_config) : model_name); if(text_embedder_it == text_embedders.end()) { - text_embedders.emplace(model_key, std::make_shared(model_config, num_dims)); + text_embedders.emplace(is_remote_model(model_name) ? RemoteEmbedder::get_model_key(model_config) : model_name, std::make_shared(model_config, num_dims)); } return Option(true); @@ -126,11 +122,7 @@ Option TextEmbedderManager::validate_and_init_local_model(const nlohmann:: Option TextEmbedderManager::get_text_embedder(const nlohmann::json& model_config) { std::unique_lock lock(text_embedders_mutex); const std::string& model_name = model_config.at("model_name"); - std::string model_key = model_name; - if(is_remote_model(model_name)) { - model_key = RemoteEmbedder::get_model_key(model_config); - } - auto text_embedder_it = text_embedders.find(model_key); + auto text_embedder_it = text_embedders.find(is_remote_model(model_name) ? 
RemoteEmbedder::get_model_key(model_config) : model_name); if(text_embedder_it == text_embedders.end()) { return Option(404, "Text embedder was not found."); diff --git a/src/text_embedder_remote.cpp b/src/text_embedder_remote.cpp index a7a745e0..06a53a65 100644 --- a/src/text_embedder_remote.cpp +++ b/src/text_embedder_remote.cpp @@ -649,7 +649,5 @@ Option GCPEmbedder::generate_access_token(const std::string& refres } std::string GCPEmbedder::get_model_key(const nlohmann::json& model_config) { - return model_config["model_name"].get() + ":" + model_config["project_id"].get() + ":" - + model_config["access_token"].get() + ":" + model_config["refresh_token"].get() + ":" - + model_config["client_id"].get() + ":" + model_config["client_secret"].get(); + return model_config["model_name"].get() + ":" + model_config["project_id"].get() + ":" + model_config["client_secret"].get(); } \ No newline at end of file From 1d45fcf00cc14bbee95d7748608895436c1f10d1 Mon Sep 17 00:00:00 2001 From: ozanarmagan Date: Sun, 20 Aug 2023 15:52:53 +0300 Subject: [PATCH 09/28] Update model_key --- src/text_embedder_manager.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/text_embedder_manager.cpp b/src/text_embedder_manager.cpp index 489fd208..89400a79 100644 --- a/src/text_embedder_manager.cpp +++ b/src/text_embedder_manager.cpp @@ -43,9 +43,10 @@ Option TextEmbedderManager::validate_and_init_remote_model(const nlohmann: } std::unique_lock lock(text_embedders_mutex); - auto text_embedder_it = text_embedders.find(is_remote_model(model_name) ? RemoteEmbedder::get_model_key(model_config) : model_name); + std::string model_key = is_remote_model(model_name) ? RemoteEmbedder::get_model_key(model_config) : model_name; + auto text_embedder_it = text_embedders.find(model_key); if(text_embedder_it == text_embedders.end()) { - text_embedders.emplace(is_remote_model(model_name) ? RemoteEmbedder::get_model_key(model_config) : model_name, std::make_shared(model_config, num_dims)); + text_embedders.emplace(model_key, std::make_shared(model_config, num_dims)); } return Option(true); @@ -122,7 +123,8 @@ Option TextEmbedderManager::validate_and_init_local_model(const nlohmann:: Option TextEmbedderManager::get_text_embedder(const nlohmann::json& model_config) { std::unique_lock lock(text_embedders_mutex); const std::string& model_name = model_config.at("model_name"); - auto text_embedder_it = text_embedders.find(is_remote_model(model_name) ? RemoteEmbedder::get_model_key(model_config) : model_name); + std::string model_key = is_remote_model(model_name) ? RemoteEmbedder::get_model_key(model_config) : model_name; + auto text_embedder_it = text_embedders.find(model_key); if(text_embedder_it == text_embedders.end()) { return Option(404, "Text embedder was not found."); From f8e6468a7b2eb6f02026840a77fa7b9e586690b3 Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Mon, 21 Aug 2023 12:29:40 +0530 Subject: [PATCH 10/28] Better guards for top_k truncation. 
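Two guards are added below: Collection::truncate_after_top_k now takes the collection's shared lock while it asks the index for the sequence ids that fall outside the top k, releasing it before proceeding, and Index::seq_ids_outside_top_k takes its own shared lock and, more importantly, compares the lookup iterator against numerical_index.end() rather than sort_index.end(). Comparing an iterator with the end() of a different container is undefined behaviour in C++, so the old check was never a valid existence test. A minimal sketch of the corrected lookup, for illustration only; the map type and field_exists are stand-ins, not code from this patch:

    #include <map>
    #include <string>
    #include <vector>

    // Illustrative stand-in for the numerical index keyed by field name.
    std::map<std::string, std::vector<int>> numerical_index;

    bool field_exists(const std::string& field_name) {
        auto field_it = numerical_index.find(field_name);
        // Compare against the end() of the same container that was searched.
        return field_it != numerical_index.end();
    }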
--- src/collection.cpp | 4 ++++ src/index.cpp | 3 ++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/src/collection.cpp b/src/collection.cpp index 5d92b5b9..50aa82fe 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -4953,9 +4953,13 @@ void Collection::hide_credential(nlohmann::json& json, const std::string& creden } } Option Collection::truncate_after_top_k(const string &field_name, size_t k) { + std::shared_lock slock(mutex); + std::vector seq_ids; auto op = index->seq_ids_outside_top_k(field_name, k, seq_ids); + slock.unlock(); + if(!op.ok()) { return op; } diff --git a/src/index.cpp b/src/index.cpp index 2f8be825..84e9f6ed 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -6315,9 +6315,10 @@ size_t Index::num_seq_ids() const { Option Index::seq_ids_outside_top_k(const std::string& field_name, size_t k, std::vector& outside_seq_ids) { + std::shared_lock lock(mutex); auto field_it = numerical_index.find(field_name); - if(field_it == sort_index.end()) { + if(field_it == numerical_index.end()) { return Option(400, "Field not found in numerical index."); } From f1cd6038ea064c179bc80f0dfa4ad31bd88882cd Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Mon, 21 Aug 2023 16:44:37 +0530 Subject: [PATCH 11/28] Trim query suggesitons before aggregation. --- include/analytics_manager.h | 2 +- include/tokenizer.h | 2 +- include/tsconfig.h | 4 ++ src/analytics_manager.cpp | 3 +- src/collection.cpp | 1 + src/collection_manager.cpp | 2 +- src/tokenizer.cpp | 12 ++++-- test/collection_manager_test.cpp | 64 ++++++++++++++++++++++++++++++++ 8 files changed, 82 insertions(+), 8 deletions(-) diff --git a/include/analytics_manager.h b/include/analytics_manager.h index 4207f7cf..0f098a7a 100644 --- a/include/analytics_manager.h +++ b/include/analytics_manager.h @@ -79,7 +79,7 @@ public: Option remove_rule(const std::string& name); void add_suggestion(const std::string& query_collection, - std::string& query, bool live_query, const std::string& user_id); + const std::string& query, bool live_query, const std::string& user_id); void stop(); diff --git a/include/tokenizer.h b/include/tokenizer.h index a6a88ab6..36de0940 100644 --- a/include/tokenizer.h +++ b/include/tokenizer.h @@ -88,5 +88,5 @@ public: bool should_skip_char(char c); - static void normalize_ascii(std::string& text); + static std::string normalize_ascii_no_spaces(const std::string& text); }; \ No newline at end of file diff --git a/include/tsconfig.h b/include/tsconfig.h index 1e6d8f62..dc382e80 100644 --- a/include/tsconfig.h +++ b/include/tsconfig.h @@ -787,6 +787,10 @@ public: cors_domains.insert(cors_values_vec.begin(), cors_values_vec.end()); } + void set_enable_search_analytics(bool enable_search_analytics) { + this->enable_search_analytics = enable_search_analytics; + } + // validation Option is_valid() { diff --git a/src/analytics_manager.cpp b/src/analytics_manager.cpp index b23c6cc2..5385bdf9 100644 --- a/src/analytics_manager.cpp +++ b/src/analytics_manager.cpp @@ -203,7 +203,7 @@ Option AnalyticsManager::remove_popular_queries_index(const std::string &n return Option(true); } -void AnalyticsManager::add_suggestion(const std::string &query_collection, std::string &query, +void AnalyticsManager::add_suggestion(const std::string &query_collection, const std::string& query, const bool live_query, const std::string& user_id) { // look up suggestion collections for the query collection std::unique_lock lock(mutex); @@ -212,7 +212,6 @@ void AnalyticsManager::add_suggestion(const std::string &query_collection, std:: 
for(const auto& suggestion_collection: suggestion_collections_it->second) { const auto& popular_queries_it = popular_queries.find(suggestion_collection); if(popular_queries_it != popular_queries.end()) { - Tokenizer::normalize_ascii(query); popular_queries_it->second->add(query, live_query, user_id); } } diff --git a/src/collection.cpp b/src/collection.cpp index 50aa82fe..7775b367 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -4952,6 +4952,7 @@ void Collection::hide_credential(nlohmann::json& json, const std::string& creden } } } + Option Collection::truncate_after_top_k(const string &field_name, size_t k) { std::shared_lock slock(mutex); diff --git a/src/collection_manager.cpp b/src/collection_manager.cpp index 9aec9e5c..7ded7e02 100644 --- a/src/collection_manager.cpp +++ b/src/collection_manager.cpp @@ -1112,7 +1112,7 @@ Option CollectionManager::do_search(std::map& re if(Config::get_instance().get_enable_search_analytics()) { if(result.count("found") != 0 && result["found"].get() != 0) { - std::string analytics_query = raw_query; + std::string analytics_query = Tokenizer::normalize_ascii_no_spaces(raw_query); AnalyticsManager::get_instance().add_suggestion(orig_coll_name, analytics_query, true, req_params["x-typesense-user-id"]); } diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp index 464349ed..5688e27d 100644 --- a/src/tokenizer.cpp +++ b/src/tokenizer.cpp @@ -1,5 +1,6 @@ #include #include +#include #include "tokenizer.h" Tokenizer::Tokenizer(const std::string& input, bool normalize, bool no_op, const std::string& locale, @@ -349,10 +350,15 @@ bool Tokenizer::should_skip_char(char c) { return is_ascii_char(c) && get_stream_mode(c) != INDEX; } -void Tokenizer::normalize_ascii(std::string& text) { - for(size_t i = 0; i < text.size(); i++) { +std::string Tokenizer::normalize_ascii_no_spaces(const std::string& text) { + std::string analytics_query = text; + StringUtils::trim(analytics_query); + + for(size_t i = 0; i < analytics_query.size(); i++) { if(is_ascii_char(text[i])) { - text[i] = std::tolower(text[i]); + analytics_query[i] = std::tolower(analytics_query[i]); } } + + return analytics_query; } diff --git a/test/collection_manager_test.cpp b/test/collection_manager_test.cpp index e821e51b..12ed2cd3 100644 --- a/test/collection_manager_test.cpp +++ b/test/collection_manager_test.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include "string_utils.h" #include "collection.h" @@ -24,6 +25,8 @@ protected: collectionManager.init(store, 1.0, "auth_key", quit); collectionManager.load(8, 1000); + AnalyticsManager::get_instance().init(store); + schema = R"({ "name": "collection1", "enable_nested_fields": true, @@ -587,6 +590,67 @@ TEST_F(CollectionManagerTest, VerifyEmbeddedParametersOfScopedAPIKey) { collectionManager.drop_collection("coll1"); } +TEST_F(CollectionManagerTest, QuerySuggestionsShouldBeTrimmed) { + std::vector fields = {field("title", field_types::STRING, false, false, true, "", -1, 1), + field("year", field_types::INT32, false), + field("points", field_types::INT32, false),}; + + Collection* coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get(); + + nlohmann::json doc1; + doc1["id"] = "0"; + doc1["title"] = "Tom Sawyer"; + doc1["year"] = 1876; + doc1["points"] = 100; + + ASSERT_TRUE(coll1->add(doc1.dump()).ok()); + + Config::get_instance().set_enable_search_analytics(true); + + nlohmann::json analytics_rule = R"({ + "name": "top_search_queries", + "type": "popular_queries", + "params": { + "limit": 100, + "source": { + 
"collections": ["coll1"] + }, + "destination": { + "collection": "top_queries" + } + } + })"_json; + + auto create_op = AnalyticsManager::get_instance().create_rule(analytics_rule, false, true); + ASSERT_TRUE(create_op.ok()); + + nlohmann::json embedded_params; + std::map req_params; + req_params["collection"] = "coll1"; + req_params["q"] = " tom "; + req_params["query_by"] = "title"; + + std::string json_res; + auto now_ts = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()).count(); + + auto search_op = collectionManager.do_search(req_params, embedded_params, json_res, now_ts); + ASSERT_TRUE(search_op.ok()); + + json_res.clear(); + req_params["q"] = " "; + search_op = collectionManager.do_search(req_params, embedded_params, json_res, now_ts); + ASSERT_TRUE(search_op.ok()); + + // check that suggestions have been trimmed + auto popular_queries = AnalyticsManager::get_instance().get_popular_queries(); + ASSERT_EQ(2, popular_queries["top_queries"]->get_user_prefix_queries()[""].size()); + ASSERT_EQ("tom", popular_queries["top_queries"]->get_user_prefix_queries()[""][0].query); + ASSERT_EQ("", popular_queries["top_queries"]->get_user_prefix_queries()[""][1].query); + + collectionManager.drop_collection("coll1"); +} + TEST_F(CollectionManagerTest, RestoreAutoSchemaDocsOnRestart) { Collection *coll1; From 4b6ec974173980cc5b87cddfd86d63b43ac6e093 Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Wed, 23 Aug 2023 12:52:36 +0530 Subject: [PATCH 12/28] Use fields index directly during looping. --- src/field.cpp | 2 +- test/collection_vector_search_test.cpp | 21 +++++++++++++++++++++ 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/src/field.cpp b/src/field.cpp index 7d5e399c..3c4d2e1c 100644 --- a/src/field.cpp +++ b/src/field.cpp @@ -1094,7 +1094,7 @@ Option field::json_fields_to_fields(bool enable_nested_fields, nlohmann::j } if(!the_fields.empty() && !the_fields.back().embed.empty()) { - embed_json_field_indices.emplace_back(i, i); + embed_json_field_indices.emplace_back(i, the_fields.size()-1); } } diff --git a/test/collection_vector_search_test.cpp b/test/collection_vector_search_test.cpp index 02bef6ff..694e501f 100644 --- a/test/collection_vector_search_test.cpp +++ b/test/collection_vector_search_test.cpp @@ -1161,6 +1161,27 @@ TEST_F(CollectionVectorTest, FreshEmplaceWithOptionalEmbeddingReferencedField) { "or make the embedding field optional.", add_op.error()); } +TEST_F(CollectionVectorTest, EmbeddingFieldWithIdFieldPrecedingInSchema) { + auto schema = R"({ + "name": "objects", + "fields": [ + {"name": "id", "type": "string"}, + {"name": "name", "type": "string"}, + {"name": "embedding", "type":"float[]", "embed":{"from": ["name"], "model_config": {"model_name": "ts/e5-small"}}} + ] + })"_json; + + TextEmbedderManager::set_model_dir("/tmp/typesense_test/models"); + + auto op = collectionManager.create_collection(schema); + ASSERT_TRUE(op.ok()); + Collection* coll = op.get(); + + auto fs = coll->get_fields(); + ASSERT_EQ(2, fs.size()); + ASSERT_EQ(384, fs[1].num_dim); +} + TEST_F(CollectionVectorTest, SkipEmbeddingOpWhenValueExists) { nlohmann::json schema = R"({ "name": "objects", From 6087ff30d41ad5c372f27beecfe20336b80548a4 Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Wed, 23 Aug 2023 19:10:26 +0530 Subject: [PATCH 13/28] Fix bug in multi collection autosuggest aggregation. 
--- src/analytics_manager.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/analytics_manager.cpp b/src/analytics_manager.cpp index 5385bdf9..31ac078a 100644 --- a/src/analytics_manager.cpp +++ b/src/analytics_manager.cpp @@ -234,6 +234,8 @@ void AnalyticsManager::run(ReplicationState* raft_server) { } persist_suggestions(raft_server, prev_persistence_s); + prev_persistence_s = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()).count(); lk.unlock(); } @@ -269,8 +271,6 @@ void AnalyticsManager::persist_suggestions(ReplicationState *raft_server, uint64 continue; } - prev_persistence_s = now_ts_seconds; - std::string import_payload; popularQueries->serialize_as_docs(import_payload); From 6cbd4306e0a17d1272b6f4d6a036b8c33bfde8a5 Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Sat, 26 Aug 2023 17:44:27 +0530 Subject: [PATCH 14/28] Fix group by not happening on vector search. --- src/index.cpp | 34 +++++++++++------ test/collection_vector_search_test.cpp | 52 ++++++++++++++++++++++++++ 2 files changed, 74 insertions(+), 12 deletions(-) diff --git a/src/index.cpp b/src/index.cpp index 84e9f6ed..35f5c839 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -3204,8 +3204,8 @@ Option Index::search(std::vector& field_query_tokens, cons for(size_t res_index = 0; res_index < vec_results.size(); res_index++) { auto& vec_result = vec_results[res_index]; - auto doc_id = vec_result.first; - auto result_it = topster->kv_map.find(doc_id); + auto seq_id = vec_result.first; + auto result_it = topster->kv_map.find(seq_id); if(result_it != topster->kv_map.end()) { if(result_it->second->match_score_index < 0 || result_it->second->match_score_index > 2) { @@ -3214,22 +3214,23 @@ Option Index::search(std::vector& field_query_tokens, cons // result overlaps with keyword search: we have to combine the scores - auto result = result_it->second; + KV* kv = result_it->second; // old_score + (1 / rank_of_document) * WEIGHT) - result->vector_distance = vec_result.second; - result->text_match_score = result->scores[result->match_score_index]; + kv->vector_distance = vec_result.second; + kv->text_match_score = kv->scores[kv->match_score_index]; int64_t match_score = float_to_int64_t( - (int64_t_to_float(result->scores[result->match_score_index])) + + (int64_t_to_float(kv->scores[kv->match_score_index])) + ((1.0 / (res_index + 1)) * VECTOR_SEARCH_WEIGHT)); int64_t match_score_index = -1; int64_t scores[3] = {0}; - compute_sort_scores(sort_fields_std, sort_order, field_values, geopoint_indices, doc_id, 0, match_score, scores, match_score_index, vec_result.second); + compute_sort_scores(sort_fields_std, sort_order, field_values, geopoint_indices, seq_id, 0, + match_score, scores, match_score_index, vec_result.second); for(int i = 0; i < 3; i++) { - result->scores[i] = scores[i]; + kv->scores[i] = scores[i]; } - result->match_score_index = match_score_index; + kv->match_score_index = match_score_index; } else { // Result has been found only in vector search: we have to add it to both KV and result_ids @@ -3237,12 +3238,21 @@ Option Index::search(std::vector& field_query_tokens, cons int64_t scores[3] = {0}; int64_t match_score = float_to_int64_t((1.0 / (res_index + 1)) * VECTOR_SEARCH_WEIGHT); int64_t match_score_index = -1; - compute_sort_scores(sort_fields_std, sort_order, field_values, geopoint_indices, doc_id, 0, match_score, scores, match_score_index, vec_result.second); - KV kv(searched_queries.size(), doc_id, doc_id, match_score_index, scores); + 
compute_sort_scores(sort_fields_std, sort_order, field_values, geopoint_indices, seq_id, 0, match_score, scores, match_score_index, vec_result.second); + + uint64_t distinct_id = seq_id; + if (group_limit != 0) { + distinct_id = get_distinct_id(group_by_fields, seq_id); + if(excluded_group_ids.count(distinct_id) != 0) { + continue; + } + } + + KV kv(searched_queries.size(), seq_id, distinct_id, match_score_index, scores); kv.text_match_score = 0; kv.vector_distance = vec_result.second; topster->add(&kv); - vec_search_ids.push_back(doc_id); + vec_search_ids.push_back(seq_id); } } diff --git a/test/collection_vector_search_test.cpp b/test/collection_vector_search_test.cpp index 694e501f..9e057851 100644 --- a/test/collection_vector_search_test.cpp +++ b/test/collection_vector_search_test.cpp @@ -1327,6 +1327,58 @@ TEST_F(CollectionVectorTest, KeywordSearchReturnOnlyTextMatchInfo) { ASSERT_EQ(1, results["hits"][0].count("text_match_info")); } +TEST_F(CollectionVectorTest, GroupByWithVectorSearch) { + nlohmann::json schema = R"({ + "name": "coll1", + "fields": [ + {"name": "title", "type": "string"}, + {"name": "group", "type": "string", "facet": true}, + {"name": "vec", "type": "float[]", "num_dim": 4} + ] + })"_json; + + Collection* coll1 = collectionManager.create_collection(schema).get(); + + std::vector> values = { + {0.851758, 0.909671, 0.823431, 0.372063}, + {0.97826, 0.933157, 0.39557, 0.306488}, + {0.230606, 0.634397, 0.514009, 0.399594} + }; + + for (size_t i = 0; i < values.size(); i++) { + nlohmann::json doc; + doc["id"] = std::to_string(i); + doc["title"] = std::to_string(i) + " title"; + doc["group"] = "0"; + doc["vec"] = values[i]; + ASSERT_TRUE(coll1->add(doc.dump()).ok()); + } + + auto res = coll1->search("title", {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD, + spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30, 5, + "", 10, {}, {}, {"group"}, 1, + "", "", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7, fallback, + 4, {off}, 32767, 32767, 2, + false, true, "vec:([0.96826, 0.94, 0.39557, 0.306488])").get(); + + ASSERT_EQ(1, res["grouped_hits"].size()); + ASSERT_EQ(1, res["grouped_hits"][0]["hits"].size()); + ASSERT_EQ(1, res["grouped_hits"][0]["hits"][0].count("vector_distance")); + + res = coll1->search("*", {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD, + spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30, 5, + "", 10, {}, {}, {"group"}, 1, + "", "", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7, fallback, + 4, {off}, 32767, 32767, 2, + false, true, "vec:([0.96826, 0.94, 0.39557, 0.306488])").get(); + + ASSERT_EQ(1, res["grouped_hits"].size()); + ASSERT_EQ(1, res["grouped_hits"][0]["hits"].size()); + ASSERT_EQ(1, res["grouped_hits"][0]["hits"][0].count("vector_distance")); +} + TEST_F(CollectionVectorTest, HybridSearchReturnAllInfo) { auto schema_json = R"({ From ae5a90582a8df7857ea455b567f64549ebad44cc Mon Sep 17 00:00:00 2001 From: ozanarmagan Date: Sat, 26 Aug 2023 17:24:38 +0300 Subject: [PATCH 15/28] Check partial response from OpenAI API --- src/text_embedder_remote.cpp | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/src/text_embedder_remote.cpp b/src/text_embedder_remote.cpp index 06a53a65..5a559990 100644 --- a/src/text_embedder_remote.cpp +++ b/src/text_embedder_remote.cpp @@ -221,6 +221,7 @@ std::vector OpenAIEmbedder::batch_embed(const std::vector OpenAIEmbedder::batch_embed(const std::vector outputs; + for(size_t 
i = 0; i < inputs.size(); i++) { + outputs.push_back(embedding_res_t(500, "Got malformed response from OpenAI API.")); + } + return outputs; + } + std::vector outputs; for(auto& data : res_json["data"]) { + if(data.count("embedding") == 0 || !data["embedding"].is_array() || data["embedding"].size() == 0) { + nlohmann::json embedding_res = get_error_json(req_body, res_code, res); + outputs.push_back(embedding_res_t(500, "Got malformed response from OpenAI API.")); + continue; + } outputs.push_back(embedding_res_t(data["embedding"].get>())); } @@ -577,7 +593,22 @@ std::vector GCPEmbedder::batch_embed(const std::vector outputs; + + if(res_json.count("predictions") == 0 || !res_json["predictions"].is_array() || res_json["predictions"].size() != inputs.size()) { + nlohmann::json embedding_res = get_error_json(req_body, res_code, res); + std::vector outputs; + for(size_t i = 0; i < inputs.size(); i++) { + outputs.push_back(embedding_res_t(500, "Got malformed response from GCP API.")); + } + return outputs; + } + for(const auto& prediction : res_json["predictions"]) { + if(prediction.count("embeddings") == 0 || !prediction["embeddings"].is_object() || prediction["embeddings"].count("values") == 0 || !prediction["embeddings"]["values"].is_array() || prediction["embeddings"]["values"].size() == 0) { + nlohmann::json embedding_res = get_error_json(req_body, res_code, res); + outputs.push_back(embedding_res_t(500, "Got malformed response from GCP API.")); + continue; + } outputs.push_back(embedding_res_t(prediction["embeddings"]["values"].get>())); } From 57d083785d7184dae06f0f41f24bd288976e91fa Mon Sep 17 00:00:00 2001 From: ozanarmagan Date: Sat, 26 Aug 2023 17:34:08 +0300 Subject: [PATCH 16/28] Fix error log --- src/text_embedder_remote.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/text_embedder_remote.cpp b/src/text_embedder_remote.cpp index 5a559990..781fcae2 100644 --- a/src/text_embedder_remote.cpp +++ b/src/text_embedder_remote.cpp @@ -246,7 +246,6 @@ std::vector OpenAIEmbedder::batch_embed(const std::vector outputs; for(auto& data : res_json["data"]) { if(data.count("embedding") == 0 || !data["embedding"].is_array() || data["embedding"].size() == 0) { - nlohmann::json embedding_res = get_error_json(req_body, res_code, res); outputs.push_back(embedding_res_t(500, "Got malformed response from OpenAI API.")); continue; } From 8f3684f8d5d1a2b33beb05589895eb14884ae1eb Mon Sep 17 00:00:00 2001 From: ozanarmagan Date: Sat, 26 Aug 2023 17:43:13 +0300 Subject: [PATCH 17/28] Fix parsing errors --- src/text_embedder_remote.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/text_embedder_remote.cpp b/src/text_embedder_remote.cpp index 781fcae2..22c16420 100644 --- a/src/text_embedder_remote.cpp +++ b/src/text_embedder_remote.cpp @@ -594,7 +594,6 @@ std::vector GCPEmbedder::batch_embed(const std::vector outputs; if(res_json.count("predictions") == 0 || !res_json["predictions"].is_array() || res_json["predictions"].size() != inputs.size()) { - nlohmann::json embedding_res = get_error_json(req_body, res_code, res); std::vector outputs; for(size_t i = 0; i < inputs.size(); i++) { outputs.push_back(embedding_res_t(500, "Got malformed response from GCP API.")); @@ -604,7 +603,6 @@ std::vector GCPEmbedder::batch_embed(const std::vector Date: Sat, 26 Aug 2023 17:45:10 +0300 Subject: [PATCH 18/28] Fix parsing errors --- src/text_embedder_remote.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/text_embedder_remote.cpp b/src/text_embedder_remote.cpp index 22c16420..74226db2 100644 --- 
a/src/text_embedder_remote.cpp +++ b/src/text_embedder_remote.cpp @@ -235,7 +235,6 @@ std::vector OpenAIEmbedder::batch_embed(const std::vector outputs; for(size_t i = 0; i < inputs.size(); i++) { outputs.push_back(embedding_res_t(500, "Got malformed response from OpenAI API.")); From c5d2efa36dd6c76bb786d88b23a3ecceebd730ba Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Mon, 28 Aug 2023 11:59:42 +0530 Subject: [PATCH 19/28] Fix preset usage regression on multi search endpoint. --- src/collection_manager.cpp | 4 +++- test/core_api_utils_test.cpp | 40 ++++++++++++++++++++++++++++++++---- 2 files changed, 39 insertions(+), 5 deletions(-) diff --git a/src/collection_manager.cpp b/src/collection_manager.cpp index 7ded7e02..4d045827 100644 --- a/src/collection_manager.cpp +++ b/src/collection_manager.cpp @@ -766,7 +766,9 @@ Option CollectionManager::do_search(std::map& re nlohmann::json preset; const auto& preset_op = CollectionManager::get_instance().get_preset(preset_it->second, preset); - if(preset_op.ok()) { + // NOTE: we merge only single preset configuration because multi ("searches") preset value replaces + // the request body directly before we reach this single search request function. + if(preset_op.ok() && !preset.contains("searches")) { if(!preset.is_object()) { return Option(400, "Search preset is not an object."); } diff --git a/test/core_api_utils_test.cpp b/test/core_api_utils_test.cpp index 7a1005e0..e39ede35 100644 --- a/test/core_api_utils_test.cpp +++ b/test/core_api_utils_test.cpp @@ -610,7 +610,7 @@ TEST_F(CoreAPIUtilsTest, MultiSearchWithPresetShouldUsePresetForAuth) { ASSERT_EQ(2, embedded_params_vec.size()); } -TEST_F(CoreAPIUtilsTest, PresetSingleSearch) { +TEST_F(CoreAPIUtilsTest, PresetMultiSearch) { nlohmann::json schema = R"({ "name": "coll1", "fields": [ @@ -634,7 +634,7 @@ TEST_F(CoreAPIUtilsTest, PresetSingleSearch) { auto search_body = R"( {"searches":[ - {"collection":"coll1","q":"apple", "query_by": "title", "preset": "single_preset"} + {"collection":"coll1","q":"apple", "query_by": "name", "preset": "single_preset"} ]} )"; @@ -644,8 +644,40 @@ TEST_F(CoreAPIUtilsTest, PresetSingleSearch) { post_multi_search(req, res); - ASSERT_EQ("12", req->params["per_page"]); - ASSERT_EQ("coll1", req->params["collection"]); + auto res_json = nlohmann::json::parse(res->body); + ASSERT_EQ(1, res_json["results"].size()); + ASSERT_EQ(0, res_json["results"][0]["found"].get()); + + // with multiple "searches" preset configuration + preset_value = R"( + {"searches":[ + {"collection":"coll1", "q": "*", "per_page": "8"}, + {"collection":"coll1", "q": "*", "per_page": "11"} + ]} + )"_json; + + collectionManager.upsert_preset("multi_preset", preset_value); + embedded_params.clear(); + req->params.clear(); + req->params["preset"] = "multi_preset"; + req->embedded_params_vec.clear(); + req->embedded_params_vec.push_back(embedded_params); + req->embedded_params_vec.push_back(embedded_params); + + // "preset": "multi_preset" + search_body = R"( + {"searches":[ + {"collection":"coll1","q":"apple", "query_by": "title"} + ]} + )"; + + req->body = search_body; + + post_multi_search(req, res); + res_json = nlohmann::json::parse(res->body); + ASSERT_EQ(2, res_json["results"].size()); + ASSERT_EQ(0, res_json["results"][0]["found"].get()); + ASSERT_EQ(0, res_json["results"][1]["found"].get()); collectionManager.drop_collection("coll1"); } From 6a9d5efc94a5fce8e0aa26279080d7e6eb4f5da8 Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Mon, 28 Aug 2023 12:21:38 +0530 Subject: [PATCH 20/28] 
Rollback glog stdout/stderrr separation. This does not respect buffer levels yet: https://github.com/google/glog/issues/943 --- src/typesense_server_utils.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/typesense_server_utils.cpp b/src/typesense_server_utils.cpp index e43a67c9..c628c569 100644 --- a/src/typesense_server_utils.cpp +++ b/src/typesense_server_utils.cpp @@ -118,7 +118,7 @@ int init_root_logger(Config & config, const std::string & server_version) { if(log_dir.empty()) { // use console logger if log dir is not specified - FLAGS_logtostdout = true; + FLAGS_logtostderr = true; } else { if(!directory_exists(log_dir)) { std::cerr << "Typesense failed to start. " << "Log directory " << log_dir << " does not exist."; From 633ec69aed7c9cfd613ed4dbbec793afb578d86f Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Thu, 31 Aug 2023 13:38:08 +0530 Subject: [PATCH 21/28] Fix upsert on unchanged docs with embedding fields. --- include/index.h | 2 +- src/index.cpp | 11 +- test/collection_vector_search_test.cpp | 211 ++++++++++++++++++++++++- 3 files changed, 219 insertions(+), 5 deletions(-) diff --git a/include/index.h b/include/index.h index 6dffc38e..5656cc2f 100644 --- a/include/index.h +++ b/include/index.h @@ -532,7 +532,7 @@ private: static void handle_doc_ops(const tsl::htrie_map& search_schema, nlohmann::json& update_doc, const nlohmann::json& old_doc); - static void get_doc_changes(const index_operation_t op, const tsl::htrie_map& search_schema, + static void get_doc_changes(const index_operation_t op, const tsl::htrie_map& embedding_fields, nlohmann::json &update_doc, const nlohmann::json &old_doc, nlohmann::json &new_doc, nlohmann::json &del_doc); diff --git a/src/index.cpp b/src/index.cpp index 35f5c839..c05b8c5d 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -454,7 +454,7 @@ void Index::validate_and_preprocess(Index *index, std::vector& ite if(index_rec.is_update) { // scrub string fields to reduce delete ops - get_doc_changes(index_rec.operation, search_schema, index_rec.doc, index_rec.old_doc, + get_doc_changes(index_rec.operation, embedding_fields, index_rec.doc, index_rec.old_doc, index_rec.new_doc, index_rec.del_doc); if(generate_embeddings) { @@ -6258,7 +6258,7 @@ void Index::handle_doc_ops(const tsl::htrie_map& search_schema, } } -void Index::get_doc_changes(const index_operation_t op, const tsl::htrie_map& search_schema, +void Index::get_doc_changes(const index_operation_t op, const tsl::htrie_map& embedding_fields, nlohmann::json& update_doc, const nlohmann::json& old_doc, nlohmann::json& new_doc, nlohmann::json& del_doc) { @@ -6271,7 +6271,12 @@ void Index::get_doc_changes(const index_operation_t op, const tsl::htrie_map vec = {0.12, 0.45, 0.64}; + + nlohmann::json doc; + doc["id"] = "0"; + doc["title"] = "Title"; + doc["points"] = 100; + doc["vec"] = vec; + + auto add_op = coll1->add(doc.dump()); + ASSERT_TRUE(add_op.ok()); + + auto results = coll1->search("*", {}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD, + spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30, 5, + "", 10, {}, {}, {}, 0, + "", "", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7, fallback, + 4, {off}, 32767, 32767, 2, + false, true, "vec:([0.12, 0.44, 0.55])").get(); + + ASSERT_EQ(1, results["found"].get()); + + + // upsert unchanged doc + add_op = coll1->add(doc.dump(), index_operation_t::UPSERT); + ASSERT_TRUE(add_op.ok()); + + results = coll1->search("*", {}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, 
Index::DROP_TOKENS_THRESHOLD, + spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30, 5, + "", 10, {}, {}, {}, 0, + "", "", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7, fallback, + 4, {off}, 32767, 32767, 2, + false, true, "vec:([0.12, 0.44, 0.55])").get(); + + ASSERT_EQ(1, results["found"].get()); +} + TEST_F(CollectionVectorTest, NumVectorGreaterThanNumDim) { nlohmann::json schema = R"({ "name": "coll1", @@ -692,6 +741,88 @@ TEST_F(CollectionVectorTest, VectorWithNullValue) { nlohmann::json::parse(json_lines[1])["error"].get()); } +TEST_F(CollectionVectorTest, EmbeddedVectorUnchangedUpsert) { + nlohmann::json schema = R"({ + "name": "coll1", + "fields": [ + {"name": "title", "type": "string"}, + {"name": "points", "type": "int32"}, + {"name": "embedding", "type":"float[]", "embed":{"from": ["title"], + "model_config": {"model_name": "ts/e5-small"}}} + ] + })"_json; + + TextEmbedderManager::set_model_dir("/tmp/typesense_test/models"); + + Collection* coll1 = collectionManager.create_collection(schema).get(); + + nlohmann::json doc; + doc["id"] = "0"; + doc["title"] = "Title"; + doc["points"] = 100; + + auto add_op = coll1->add(doc.dump()); + ASSERT_TRUE(add_op.ok()); + + auto results = coll1->search("title", {"embedding"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD, + spp::sparse_hash_set(), + spp::sparse_hash_set()).get(); + + ASSERT_EQ(1, results["found"].get()); + auto embedding = results["hits"][0]["document"]["embedding"].get>(); + ASSERT_EQ(384, embedding.size()); + + // upsert unchanged doc + doc.clear(); + doc["id"] = "0"; + doc["title"] = "Title"; + doc["points"] = 100; + + add_op = coll1->add(doc.dump(), index_operation_t::UPSERT); + ASSERT_TRUE(add_op.ok()); + + results = coll1->search("title", {"embedding"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD, + spp::sparse_hash_set(), + spp::sparse_hash_set()).get(); + ASSERT_EQ(1, results["found"].get()); + embedding = results["hits"][0]["document"]["embedding"].get>(); + ASSERT_EQ(384, embedding.size()); + + // update + + doc.clear(); + doc["id"] = "0"; + doc["title"] = "Title"; + doc["points"] = 100; + + add_op = coll1->add(doc.dump(), index_operation_t::UPDATE); + ASSERT_TRUE(add_op.ok()); + + results = coll1->search("title", {"embedding"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD, + spp::sparse_hash_set(), + spp::sparse_hash_set()).get(); + ASSERT_EQ(1, results["found"].get()); + embedding = results["hits"][0]["document"]["embedding"].get>(); + ASSERT_EQ(384, embedding.size()); + + // emplace + + doc.clear(); + doc["id"] = "0"; + doc["title"] = "Title"; + doc["points"] = 100; + + add_op = coll1->add(doc.dump(), index_operation_t::EMPLACE); + ASSERT_TRUE(add_op.ok()); + + results = coll1->search("title", {"embedding"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD, + spp::sparse_hash_set(), + spp::sparse_hash_set()).get(); + ASSERT_EQ(1, results["found"].get()); + embedding = results["hits"][0]["document"]["embedding"].get>(); + ASSERT_EQ(384, embedding.size()); +} + TEST_F(CollectionVectorTest, HybridSearchWithExplicitVector) { nlohmann::json schema = R"({ "name": "objects", @@ -1099,7 +1230,67 @@ TEST_F(CollectionVectorTest, HideCredential) { ASSERT_EQ("***********", coll_summary["fields"][1]["embed"]["model_config"]["project_id"].get()); } -TEST_F(CollectionVectorTest, UpdateOfCollWithNonOptionalEmbeddingField) { +TEST_F(CollectionVectorTest, UpdateOfFieldReferencedByEmbedding) { + 
nlohmann::json schema = R"({ + "name": "objects", + "fields": [ + {"name": "name", "type": "string"}, + {"name": "embedding", "type":"float[]", "embed":{"from": ["name"], + "model_config": {"model_name": "ts/e5-small"}}} + ] + })"_json; + + TextEmbedderManager::set_model_dir("/tmp/typesense_test/models"); + + auto op = collectionManager.create_collection(schema); + ASSERT_TRUE(op.ok()); + Collection* coll = op.get(); + + nlohmann::json object; + object["id"] = "0"; + object["name"] = "butter"; + + auto add_op = coll->add(object.dump(), CREATE); + ASSERT_TRUE(add_op.ok()); + + auto results = coll->search("butter", {"embedding"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}).get(); + ASSERT_EQ(1, results["found"].get()); + auto original_embedding = results["hits"][0]["document"]["embedding"].get>(); + + nlohmann::json update_object; + update_object["id"] = "0"; + update_object["name"] = "ghee"; + auto update_op = coll->add(update_object.dump(), EMPLACE); + ASSERT_TRUE(update_op.ok()); + + results = coll->search("ghee", {"embedding"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}).get(); + ASSERT_EQ(1, results["found"].get()); + auto updated_embedding = results["hits"][0]["document"]["embedding"].get>(); + ASSERT_NE(original_embedding, updated_embedding); + + // action = update + update_object["name"] = "milk"; + update_op = coll->add(update_object.dump(), UPDATE); + ASSERT_TRUE(update_op.ok()); + + results = coll->search("milk", {"embedding"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}).get(); + ASSERT_EQ(1, results["found"].get()); + updated_embedding = results["hits"][0]["document"]["embedding"].get>(); + ASSERT_NE(original_embedding, updated_embedding); + + // action = upsert + update_object["name"] = "cheese"; + update_op = coll->add(update_object.dump(), UPSERT); + ASSERT_TRUE(update_op.ok()); + + results = coll->search("cheese", {"embedding"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}).get(); + ASSERT_EQ(1, results["found"].get()); + updated_embedding = results["hits"][0]["document"]["embedding"].get>(); + ASSERT_NE(original_embedding, updated_embedding); +} + +TEST_F(CollectionVectorTest, UpdateOfFieldNotReferencedByEmbedding) { + // test updates to a field that's not referred by an embedding field nlohmann::json schema = R"({ "name": "objects", "fields": [ @@ -1123,16 +1314,34 @@ TEST_F(CollectionVectorTest, UpdateOfCollWithNonOptionalEmbeddingField) { auto add_op = coll->add(object.dump(), CREATE); ASSERT_TRUE(add_op.ok()); + auto results = coll->search("butter", {"embedding"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}).get(); + ASSERT_EQ(1, results["found"].get()); + nlohmann::json update_object; update_object["id"] = "0"; update_object["about"] = "something about butter"; auto update_op = coll->add(update_object.dump(), EMPLACE); ASSERT_TRUE(update_op.ok()); + results = coll->search("butter", {"embedding"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}).get(); + ASSERT_EQ(1, results["found"].get()); + // action = update update_object["about"] = "something about butter 2"; update_op = coll->add(update_object.dump(), UPDATE); ASSERT_TRUE(update_op.ok()); + + results = coll->search("butter", {"embedding"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}).get(); + ASSERT_EQ(1, results["found"].get()); + + // action = upsert + update_object["name"] = "butter"; + update_object["about"] = "something about butter 3"; + update_op = coll->add(update_object.dump(), UPSERT); + ASSERT_TRUE(update_op.ok()); + + results = coll->search("butter", {"embedding"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}).get(); + 
ASSERT_EQ(1, results["found"].get()); } TEST_F(CollectionVectorTest, FreshEmplaceWithOptionalEmbeddingReferencedField) { From 3b157f6c61a0f4e7e58290b675f4af9f4049278e Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Thu, 31 Aug 2023 15:12:49 +0530 Subject: [PATCH 22/28] Fix duplicate embed fields init. --- src/collection.cpp | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/collection.cpp b/src/collection.cpp index 7775b367..ba00b5ab 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -52,12 +52,6 @@ Collection::Collection(const std::string& name, const uint32_t collection_id, co symbols_to_index(to_char_array(symbols_to_index)), token_separators(to_char_array(token_separators)), index(init_index()) { - for (auto const& field: fields) { - if (field.embed.count(fields::from) != 0) { - embedding_fields.emplace(field.name, field); - } - } - this->num_documents = 0; } From 29613ad05454da91f9f026862d6dcf02934e2954 Mon Sep 17 00:00:00 2001 From: ozanarmagan Date: Sat, 2 Sep 2023 01:12:21 +0300 Subject: [PATCH 23/28] Fix tokenizing XLM-RoBERTa models with 3 inputs and altering embedding fields --- src/collection.cpp | 21 +++++++++++++++++++++ src/text_embedder.cpp | 5 +++++ 2 files changed, 26 insertions(+) diff --git a/src/collection.cpp b/src/collection.cpp index ba00b5ab..6168c8f7 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -4300,6 +4300,27 @@ Option Collection::validate_alter_payload(nlohmann::json& schema_changes, auto validation_op = field::validate_and_init_embed_fields(embed_json_field_indices, search_schema, schema_changes["fields"], diff_fields); + + for(auto index : embed_json_field_indices) { + auto& field = diff_fields[index.second]; + auto is_reindex = (delete_field_names.count(field.name) != 0); + if(is_reindex) { + for(auto& reindex_field: reindex_fields) { + if(reindex_field.name == field.name) { + reindex_field.num_dim = field.num_dim; + break; + } + } + } else { + for(auto& add_field: addition_fields) { + if(add_field.name == field.name) { + add_field.num_dim = field.num_dim; + break; + } + } + } + } + if(!validation_op.ok()) { return validation_op; } diff --git a/src/text_embedder.cpp b/src/text_embedder.cpp index 61244054..126655f9 100644 --- a/src/text_embedder.cpp +++ b/src/text_embedder.cpp @@ -117,6 +117,11 @@ embedding_res_t TextEmbedder::Embed(const std::string& text, const size_t remote input_shapes.push_back({1, static_cast(encoded_input.input_ids.size())}); input_shapes.push_back({1, static_cast(encoded_input.attention_mask.size())}); if(session_->GetInputCount() == 3) { + // edge case: xlm_roberta does not have token_type_ids, but if the model has it as input, we need to fill it with 0s + if(encoded_input.token_type_ids.size() == 0) { + encoded_input.token_type_ids.resize(encoded_input.input_ids.size(), 0); + } + input_shapes.push_back({1, static_cast(encoded_input.token_type_ids.size())}); } input_tensors.push_back(Ort::Value::CreateTensor(memory_info, encoded_input.input_ids.data(), encoded_input.input_ids.size(), input_shapes[0].data(), input_shapes[0].size())); From 16a553dd3e0b9c8368bed34c3e5064dd347a70ac Mon Sep 17 00:00:00 2001 From: ozanarmagan Date: Sat, 2 Sep 2023 16:14:39 +0300 Subject: [PATCH 24/28] Add tests & add validate_and_init_embed_field --- include/field.h | 5 +++ src/collection.cpp | 18 ++++----- src/field.cpp | 54 +++++++++++++++++++++----- test/collection_schema_change_test.cpp | 8 ++++ test/collection_vector_search_test.cpp | 37 ++++++++++++++++++ 5 files changed, 104 insertions(+), 18 deletions(-) diff 
--git a/include/field.h b/include/field.h index d34d32ae..908c0cc9 100644 --- a/include/field.h +++ b/include/field.h @@ -424,6 +424,11 @@ struct field { std::string& fallback_field_type, std::vector& the_fields); + static Option validate_and_init_embed_field(const tsl::htrie_map& search_schema, + nlohmann::json& field_json, + const nlohmann::json& fields_json, + field& the_field); + static Option validate_and_init_embed_fields(const std::vector>& embed_json_field_indices, const tsl::htrie_map& search_schema, nlohmann::json& fields_json, diff --git a/src/collection.cpp b/src/collection.cpp index 6168c8f7..960ce266 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -4252,6 +4252,14 @@ Option Collection::validate_alter_payload(nlohmann::json& schema_changes, updated_search_schema[f.name] = f; } + if(!f.embed.empty() && !diff_fields.empty()) { + auto validate_res = field::validate_and_init_embed_field(search_schema, schema_changes["fields"][json_array_index], schema_changes["fields"], diff_fields.back()); + + if(!validate_res.ok()) { + return validate_res; + } + } + if(is_reindex) { reindex_fields.push_back(f); } else { @@ -4286,9 +4294,7 @@ Option Collection::validate_alter_payload(nlohmann::json& schema_changes, } } - if(!f.embed.empty() && !diff_fields.empty()) { - embed_json_field_indices.emplace_back(json_array_index, diff_fields.size()-1); - } + } else { // partial update is not supported for now @@ -4297,9 +4303,6 @@ Option Collection::validate_alter_payload(nlohmann::json& schema_changes, } } } - - auto validation_op = field::validate_and_init_embed_fields(embed_json_field_indices, search_schema, - schema_changes["fields"], diff_fields); for(auto index : embed_json_field_indices) { auto& field = diff_fields[index.second]; @@ -4321,9 +4324,6 @@ Option Collection::validate_alter_payload(nlohmann::json& schema_changes, } } - if(!validation_op.ok()) { - return validation_op; - } if(num_auto_detect_fields > 1) { return Option(400, "There can be only one field named `.*`."); diff --git a/src/field.cpp b/src/field.cpp index 3c4d2e1c..6a3151ae 100644 --- a/src/field.cpp +++ b/src/field.cpp @@ -1083,7 +1083,7 @@ void field::compact_nested_fields(tsl::htrie_map& nested_fields) { Option field::json_fields_to_fields(bool enable_nested_fields, nlohmann::json &fields_json, string &fallback_field_type, std::vector& the_fields) { size_t num_auto_detect_fields = 0; - std::vector> embed_json_field_indices; + const tsl::htrie_map dummy_search_schema; for(size_t i = 0; i < fields_json.size(); i++) { nlohmann::json& field_json = fields_json[i]; @@ -1094,17 +1094,13 @@ Option field::json_fields_to_fields(bool enable_nested_fields, nlohmann::j } if(!the_fields.empty() && !the_fields.back().embed.empty()) { - embed_json_field_indices.emplace_back(i, the_fields.size()-1); + auto validate_res = validate_and_init_embed_field(dummy_search_schema, field_json, fields_json, the_fields.back()); + if(!validate_res.ok()) { + return validate_res; + } } } - const tsl::htrie_map dummy_search_schema; - auto validation_op = field::validate_and_init_embed_fields(embed_json_field_indices, dummy_search_schema, - fields_json, the_fields); - if(!validation_op.ok()) { - return validation_op; - } - if(num_auto_detect_fields > 1) { return Option(400,"There can be only one field named `.*`."); } @@ -1112,6 +1108,46 @@ Option field::json_fields_to_fields(bool enable_nested_fields, nlohmann::j return Option(true); } +Option field::validate_and_init_embed_field(const tsl::htrie_map& search_schema, nlohmann::json& field_json, 
+ const nlohmann::json& fields_json, + field& the_field) { + const std::string err_msg = "Property `" + fields::embed + "." + fields::from + + "` can only refer to string or string array fields."; + + for(auto& field_name : field_json[fields::embed][fields::from].get>()) { + + auto embed_field = std::find_if(fields_json.begin(), fields_json.end(), [&field_name](const nlohmann::json& x) { + return x["name"].get() == field_name; + }); + + + if(embed_field == fields_json.end()) { + const auto& embed_field2 = search_schema.find(field_name); + if (embed_field2 == search_schema.end()) { + return Option(400, err_msg); + } else if (embed_field2->type != field_types::STRING && embed_field2->type != field_types::STRING_ARRAY) { + return Option(400, err_msg); + } + } else if((*embed_field)[fields::type] != field_types::STRING && + (*embed_field)[fields::type] != field_types::STRING_ARRAY) { + return Option(400, err_msg); + } + } + + const auto& model_config = field_json[fields::embed][fields::model_config]; + size_t num_dim = 0; + auto res = TextEmbedderManager::get_instance().validate_and_init_model(model_config, num_dim); + if(!res.ok()) { + return Option(res.code(), res.error()); + } + + LOG(INFO) << "Model init done."; + field_json[fields::num_dim] = num_dim; + the_field.num_dim = num_dim; + + return Option(true); +} + Option field::validate_and_init_embed_fields(const std::vector>& embed_json_field_indices, const tsl::htrie_map& search_schema, nlohmann::json& fields_json, diff --git a/test/collection_schema_change_test.cpp b/test/collection_schema_change_test.cpp index 0d8364fd..c936ce78 100644 --- a/test/collection_schema_change_test.cpp +++ b/test/collection_schema_change_test.cpp @@ -1566,6 +1566,14 @@ TEST_F(CollectionSchemaChangeTest, UpdateSchemaWithNewEmbeddingField) { ASSERT_TRUE(res.ok()); ASSERT_EQ(1, coll->get_embedding_fields().size()); + auto search_schema = coll->get_schema(); + + auto embedding_field_it = search_schema.find("embedding"); + ASSERT_TRUE(embedding_field_it != coll->get_schema().end()); + ASSERT_EQ("embedding", embedding_field_it.value().name); + ASSERT_EQ("float[]", embedding_field_it.value().type); + ASSERT_EQ(384, embedding_field_it.value().num_dim); + nlohmann::json doc; doc["names"] = {"hello", "world"}; auto add_op = coll->add(doc.dump()); diff --git a/test/collection_vector_search_test.cpp b/test/collection_vector_search_test.cpp index ddd5a16f..4b77bb95 100644 --- a/test/collection_vector_search_test.cpp +++ b/test/collection_vector_search_test.cpp @@ -1718,4 +1718,41 @@ TEST_F(CollectionVectorTest, TestDifferentOpenAIApiKeys) { ASSERT_NE(embedder_map.find("openai/text-embedding-ada-002:" + api_key1), embedder_map.end()); ASSERT_NE(embedder_map.find("openai/text-embedding-ada-002:" + api_key2), embedder_map.end()); ASSERT_EQ(embedder_map.find("openai/text-embedding-ada-002"), embedder_map.end()); +} + + +TEST_F(CollectionVectorTest, TestMultilingualE5) { + auto schema_json = + R"({ + "name": "TEST", + "fields": [ + {"name": "name", "type": "string"}, + {"name": "embedding", "type":"float[]", "embed":{"from": ["name"], "model_config": {"model_name": "ts/multilingual-e5-small"}}} + ] + })"_json; + + TextEmbedderManager::set_model_dir("/tmp/typesense_test/models"); + + auto collection_create_op = collectionManager.create_collection(schema_json); + + ASSERT_TRUE(collection_create_op.ok()); + auto coll1 = collection_create_op.get(); + + auto add_op = coll1->add(R"({ + "name": "john doe" + })"_json.dump()); + + auto hybrid_results = coll1->search("john", {"name", 
"embedding"}, + "", {}, {}, {2}, 10, + 1, FREQUENCY, {true}, + 0, spp::sparse_hash_set()); + + ASSERT_TRUE(hybrid_results.ok()); + + auto semantic_results = coll1->search("john", {"embedding"}, + "", {}, {}, {2}, 10, + 1, FREQUENCY, {true}, + 0, spp::sparse_hash_set()); + + ASSERT_TRUE(semantic_results.ok()); } \ No newline at end of file From dedb8e213dcaa75cefc0ee7a798ce09c545abe01 Mon Sep 17 00:00:00 2001 From: ozanarmagan Date: Sat, 2 Sep 2023 17:21:07 +0300 Subject: [PATCH 25/28] Remove validate_and_init_embed_fields --- include/field.h | 4 ---- src/collection.cpp | 24 +----------------------- src/field.cpp | 42 ------------------------------------------ 3 files changed, 1 insertion(+), 69 deletions(-) diff --git a/include/field.h b/include/field.h index 908c0cc9..0f959b7d 100644 --- a/include/field.h +++ b/include/field.h @@ -429,10 +429,6 @@ struct field { const nlohmann::json& fields_json, field& the_field); - static Option validate_and_init_embed_fields(const std::vector>& embed_json_field_indices, - const tsl::htrie_map& search_schema, - nlohmann::json& fields_json, - std::vector& fields_vec); static bool flatten_obj(nlohmann::json& doc, nlohmann::json& value, bool has_array, bool has_obj_array, bool is_update, const field& the_field, const std::string& flat_name, diff --git a/src/collection.cpp b/src/collection.cpp index 960ce266..08358c5e 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -4156,7 +4156,6 @@ Option Collection::validate_alter_payload(nlohmann::json& schema_changes, } std::unordered_map new_dynamic_fields; - std::vector> embed_json_field_indices; int json_array_index = -1; for(const auto& kv: schema_changes["fields"].items()) { @@ -4252,7 +4251,7 @@ Option Collection::validate_alter_payload(nlohmann::json& schema_changes, updated_search_schema[f.name] = f; } - if(!f.embed.empty() && !diff_fields.empty()) { + if(!f.embed.empty()) { auto validate_res = field::validate_and_init_embed_field(search_schema, schema_changes["fields"][json_array_index], schema_changes["fields"], diff_fields.back()); if(!validate_res.ok()) { @@ -4303,27 +4302,6 @@ Option Collection::validate_alter_payload(nlohmann::json& schema_changes, } } } - - for(auto index : embed_json_field_indices) { - auto& field = diff_fields[index.second]; - auto is_reindex = (delete_field_names.count(field.name) != 0); - if(is_reindex) { - for(auto& reindex_field: reindex_fields) { - if(reindex_field.name == field.name) { - reindex_field.num_dim = field.num_dim; - break; - } - } - } else { - for(auto& add_field: addition_fields) { - if(add_field.name == field.name) { - add_field.num_dim = field.num_dim; - break; - } - } - } - } - if(num_auto_detect_fields > 1) { return Option(400, "There can be only one field named `.*`."); diff --git a/src/field.cpp b/src/field.cpp index 6a3151ae..d053e578 100644 --- a/src/field.cpp +++ b/src/field.cpp @@ -1148,48 +1148,6 @@ Option field::validate_and_init_embed_field(const tsl::htrie_map(true); } -Option field::validate_and_init_embed_fields(const std::vector>& embed_json_field_indices, - const tsl::htrie_map& search_schema, - nlohmann::json& fields_json, - std::vector& fields_vec) { - - for(const auto& json_field_index: embed_json_field_indices) { - auto& field_json = fields_json[json_field_index.first]; - const std::string err_msg = "Property `" + fields::embed + "." 
+ fields::from + - "` can only refer to string or string array fields."; - - for(auto& field_name : field_json[fields::embed][fields::from].get>()) { - auto embed_field = std::find_if(fields_json.begin(), fields_json.end(), [&field_name](const nlohmann::json& x) { - return x["name"].get() == field_name; - }); - - if(embed_field == fields_json.end()) { - const auto& embed_field2 = search_schema.find(field_name); - if (embed_field2 == search_schema.end()) { - return Option(400, err_msg); - } else if (embed_field2->type != field_types::STRING && embed_field2->type != field_types::STRING_ARRAY) { - return Option(400, err_msg); - } - } else if((*embed_field)[fields::type] != field_types::STRING && - (*embed_field)[fields::type] != field_types::STRING_ARRAY) { - return Option(400, err_msg); - } - } - - const auto& model_config = field_json[fields::embed][fields::model_config]; - size_t num_dim = 0; - auto res = TextEmbedderManager::get_instance().validate_and_init_model(model_config, num_dim); - if(!res.ok()) { - return Option(res.code(), res.error()); - } - - LOG(INFO) << "Model init done."; - field_json[fields::num_dim] = num_dim; - fields_vec[json_field_index.second].num_dim = num_dim; - } - - return Option(true); -} void filter_result_t::and_filter_results(const filter_result_t& a, const filter_result_t& b, filter_result_t& result) { auto lenA = a.count, lenB = b.count; From 781c5348d8a66479ef92d1532838ee5857d0bf1e Mon Sep 17 00:00:00 2001 From: ozanarmagan Date: Sat, 2 Sep 2023 17:26:24 +0300 Subject: [PATCH 26/28] Use `f` instead of `diff_fields.back()` --- src/collection.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/collection.cpp b/src/collection.cpp index 08358c5e..9140e8bb 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -4243,7 +4243,7 @@ Option Collection::validate_alter_payload(nlohmann::json& schema_changes, return parse_op; } - const auto& f = diff_fields.back(); + auto& f = diff_fields.back(); if(f.is_dynamic()) { new_dynamic_fields[f.name] = f; @@ -4252,7 +4252,7 @@ Option Collection::validate_alter_payload(nlohmann::json& schema_changes, } if(!f.embed.empty()) { - auto validate_res = field::validate_and_init_embed_field(search_schema, schema_changes["fields"][json_array_index], schema_changes["fields"], diff_fields.back()); + auto validate_res = field::validate_and_init_embed_field(search_schema, schema_changes["fields"][json_array_index], schema_changes["fields"], f); if(!validate_res.ok()) { return validate_res; From dc780c0f583eca20ce37fd64911373150534202b Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Sun, 3 Sep 2023 17:27:53 +0530 Subject: [PATCH 27/28] Add more test for partial vector update --- test/collection_vector_search_test.cpp | 82 ++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) diff --git a/test/collection_vector_search_test.cpp b/test/collection_vector_search_test.cpp index 4b77bb95..4fcc134c 100644 --- a/test/collection_vector_search_test.cpp +++ b/test/collection_vector_search_test.cpp @@ -271,6 +271,88 @@ TEST_F(CollectionVectorTest, VectorUnchangedUpsert) { false, true, "vec:([0.12, 0.44, 0.55])").get(); ASSERT_EQ(1, results["found"].get()); + + // emplace unchanged doc + add_op = coll1->add(doc.dump(), index_operation_t::EMPLACE); + ASSERT_TRUE(add_op.ok()); + + results = coll1->search("*", {}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD, + spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30, 5, + "", 10, {}, {}, {}, 0, + "", "", {}, 1000, true, false, true, "", 
false, 6000 * 1000, 4, 7, fallback, + 4, {off}, 32767, 32767, 2, + false, true, "vec:([0.12, 0.44, 0.55])").get(); + + ASSERT_EQ(1, results["found"].get()); +} + +TEST_F(CollectionVectorTest, VectorPartialUpdate) { + nlohmann::json schema = R"({ + "name": "coll1", + "fields": [ + {"name": "title", "type": "string"}, + {"name": "points", "type": "int32"}, + {"name": "vec", "type": "float[]", "num_dim": 3} + ] + })"_json; + + Collection* coll1 = collectionManager.create_collection(schema).get(); + + std::vector vec = {0.12, 0.45, 0.64}; + + nlohmann::json doc; + doc["id"] = "0"; + doc["title"] = "Title"; + doc["points"] = 100; + doc["vec"] = vec; + + auto add_op = coll1->add(doc.dump()); + ASSERT_TRUE(add_op.ok()); + + auto results = coll1->search("*", {}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD, + spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30, 5, + "", 10, {}, {}, {}, 0, + "", "", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7, fallback, + 4, {off}, 32767, 32767, 2, + false, true, "vec:([0.12, 0.44, 0.55])").get(); + + ASSERT_EQ(1, results["found"].get()); + + + // emplace partial doc + doc.erase("vec"); + doc["title"] = "Random"; + add_op = coll1->add(doc.dump(), index_operation_t::EMPLACE); + ASSERT_TRUE(add_op.ok()); + + results = coll1->search("Random", {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD, + spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30, 5, + "", 10, {}, {}, {}, 0, + "", "", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7, fallback, + 4, {off}, 32767, 32767, 2, + false, true, "vec:([0.12, 0.44, 0.55])").get(); + + ASSERT_EQ(1, results["found"].get()); + + // update portial doc + + doc.erase("vec"); + doc["title"] = "Random"; + add_op = coll1->add(doc.dump(), index_operation_t::UPDATE); + ASSERT_TRUE(add_op.ok()); + + results = coll1->search("Random", {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD, + spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30, 5, + "", 10, {}, {}, {}, 0, + "", "", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7, fallback, + 4, {off}, 32767, 32767, 2, + false, true, "vec:([0.12, 0.44, 0.55])").get(); + + ASSERT_EQ(1, results["found"].get()); } TEST_F(CollectionVectorTest, NumVectorGreaterThanNumDim) { From 2af676916a7b810a07e232329e9901422f782178 Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Sat, 9 Sep 2023 14:17:44 +0530 Subject: [PATCH 28/28] Use drop token iters directly for text match scoring. 
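In broad terms, this change moves the dropped-token handling in search_across_fields() ahead of scoring: for each candidate seq_id, tokens that were dropped during query relaxation are now checked before the text match score is computed, their matching per-field iterators are merged into field_to_tokens, and query_len is incremented there, instead of only bumping query_len after compute_sort_scores(). A rough standalone sketch of the counting idea (hypothetical posting_list and contains() helpers, not the actual or_iterator_t API):

// Sketch: count how many query tokens -- kept or dropped -- occur in a document,
// using sorted posting lists; std::lower_bound stands in for skip_to().
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

using posting_list = std::vector<uint32_t>;   // sorted doc ids for one token

static bool contains(const posting_list& ids, uint32_t seq_id) {
    auto it = std::lower_bound(ids.begin(), ids.end(), seq_id);
    return it != ids.end() && *it == seq_id;
}

int main() {
    std::vector<posting_list> kept_tokens    = {{2, 5, 9}, {5, 7}};   // tokens kept in the query
    std::vector<posting_list> dropped_tokens = {{5, 11}};             // tokens dropped, still scored

    uint32_t seq_id = 5;
    size_t query_len = 0;
    for (const auto& pl : kept_tokens)    query_len += contains(pl, seq_id) ? 1 : 0;
    for (const auto& pl : dropped_tokens) query_len += contains(pl, seq_id) ? 1 : 0;

    std::cout << "tokens matched for doc " << seq_id << ": " << query_len << "\n";   // prints 3
    return 0;
}

In the actual hunk the matching per-field iterators are also cloned into field_to_tokens so that dropped tokens contribute to the per-field match score, and query_len stays capped at 15.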
--- src/index.cpp | 36 ++++++++++++++++---------- test/collection_specific_more_test.cpp | 30 +++++++++++++++++++++ 2 files changed, 52 insertions(+), 14 deletions(-) diff --git a/src/index.cpp b/src/index.cpp index c05b8c5d..ac7be1d6 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -3981,8 +3981,6 @@ void Index::search_across_fields(const std::vector& query_tokens, dropped_token_its.push_back(std::move(token_fields)); } - - // one iterator for each token, each underlying iterator contains results of token across multiple fields std::vector token_its; @@ -4074,6 +4072,28 @@ void Index::search_across_fields(const std::vector& query_tokens, } } + size_t query_len = query_tokens.size(); + + // check if seq_id exists in any of the dropped_token iters + for(size_t ti = 0; ti < dropped_token_its.size(); ti++) { + or_iterator_t& token_fields_iters = dropped_token_its[ti]; + if(token_fields_iters.skip_to(seq_id) && token_fields_iters.id() == seq_id) { + query_len++; + const std::vector& field_iters = token_fields_iters.get_its(); + for(size_t fi = 0; fi < field_iters.size(); fi++) { + const posting_list_t::iterator_t& field_iter = field_iters[fi]; + if(field_iter.id() == seq_id) { + // not all fields might contain a given token + field_to_tokens[field_iter.get_field_id()].push_back(field_iter.clone()); + } + } + } + } + + if(syn_orig_num_tokens != -1) { + query_len = syn_orig_num_tokens; + } + int64_t best_field_match_score = 0, best_field_weight = 0; uint32_t num_matching_fields = 0; @@ -4127,18 +4147,6 @@ void Index::search_across_fields(const std::vector& query_tokens, compute_sort_scores(sort_fields, sort_order, field_values, geopoint_indices, seq_id, filter_index, best_field_match_score, scores, match_score_index); - size_t query_len = query_tokens.size(); - - // check if seq_id exists in any of the dropped_token iters and increment matching fields accordingly - for(auto& dropped_token_it: dropped_token_its) { - if(dropped_token_it.skip_to(seq_id) && dropped_token_it.id() == seq_id) { - query_len++; - } - } - - if(syn_orig_num_tokens != -1) { - query_len = syn_orig_num_tokens; - } query_len = std::min(15, query_len); // NOTE: `query_len` is total tokens matched across fields. 
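For reference, a much-simplified stand-in for the "one iterator per token, spanning several fields" structure the hunk above works with (hypothetical field_iter_t/or_iter_t types; the real or_iterator_t and posting_list_t::iterator_t differ):

// Simplified sketch: one OR-iterator per token, holding a cursor per field.
// skip_to() advances every per-field cursor; a field "has" the token for a
// given seq_id when its cursor lands exactly on that id.
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <limits>
#include <vector>

struct field_iter_t {
    uint32_t field_id;
    std::vector<uint32_t> ids;   // sorted doc ids of this token in one field
    size_t pos = 0;

    bool valid() const { return pos < ids.size(); }
    uint32_t id() const { return valid() ? ids[pos] : std::numeric_limits<uint32_t>::max(); }
    void skip_to(uint32_t seq_id) {
        pos = std::lower_bound(ids.begin() + pos, ids.end(), seq_id) - ids.begin();
    }
};

struct or_iter_t {
    std::vector<field_iter_t> its;   // one entry per field containing the token

    // advance every per-field cursor; true if any field contains seq_id
    bool skip_to(uint32_t seq_id) {
        bool found = false;
        for (auto& it : its) {
            it.skip_to(seq_id);
            if (it.valid() && it.id() == seq_id) found = true;
        }
        return found;
    }
};

int main() {
    or_iter_t token_across_fields;
    token_across_fields.its.push_back({0, {3, 8, 21}});
    token_across_fields.its.push_back({1, {8, 34}});

    uint32_t seq_id = 8;
    if (token_across_fields.skip_to(seq_id)) {
        for (const auto& it : token_across_fields.its) {
            if (it.id() == seq_id) {          // not every field contains the token
                std::cout << "token found in field " << it.field_id << "\n";
            }
        }
    }
    return 0;
}

The new test below then checks the observable effect: for the query "avène eau mineral", the document that matches more of the query's tokens is expected to rank first.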
diff --git a/test/collection_specific_more_test.cpp b/test/collection_specific_more_test.cpp index 83181283..4e7347e6 100644 --- a/test/collection_specific_more_test.cpp +++ b/test/collection_specific_more_test.cpp @@ -1816,6 +1816,36 @@ TEST_F(CollectionSpecificMoreTest, ConsiderDroppedTokensDuringTextMatchScoring) ASSERT_EQ("1", res["hits"][1]["document"]["id"].get()); } +TEST_F(CollectionSpecificMoreTest, ConsiderDroppedTokensDuringTextMatchScoring2) { + nlohmann::json schema = R"({ + "name": "coll1", + "fields": [ + {"name": "name", "type": "string"} + ] + })"_json; + + Collection *coll1 = collectionManager.create_collection(schema).get(); + + nlohmann::json doc; + doc["id"] = "0"; + doc["name"] = "Elizabeth Arden 5th Avenue Eau de Parfum 125ml"; + ASSERT_TRUE(coll1->add(doc.dump()).ok()); + + doc["id"] = "1"; + doc["name"] = "Avène Sun Very High Protection Mineral Cream SPF50+ 50ml"; + ASSERT_TRUE(coll1->add(doc.dump()).ok()); + + auto res = coll1->search("avène eau mineral", {"name"}, "", {}, {}, {2}, 10, 1, FREQUENCY, {true}, 5, + spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30, 4, "", 20, {}, {}, {}, 0, + "", "", {3}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7, fallback, + 4, {off}, 0, 0, 0, 2, false, "", true, 0, max_weight).get(); + + ASSERT_EQ(2, res["hits"].size()); + ASSERT_EQ("1", res["hits"][0]["document"]["id"].get()); + ASSERT_EQ("0", res["hits"][1]["document"]["id"].get()); +} + TEST_F(CollectionSpecificMoreTest, NonNestedFieldNameWithDot) { nlohmann::json schema = R"({ "name": "coll1",