mirror of
https://github.com/typesense/typesense.git
synced 2025-05-20 21:52:23 +08:00
Remove collection_to_text_embedders
and text_embedder_to_collections
This commit is contained in:
parent
61469b4d84
commit
238c5f00b9
@ -213,7 +213,7 @@ private:
|
||||
|
||||
Option<bool> persist_collection_meta();
|
||||
|
||||
Option<bool> batch_alter_data(const std::vector<field>& alter_fields,
|
||||
Option<bool> batch_alter_data(std::vector<field>& alter_fields,
|
||||
const std::vector<field>& del_fields,
|
||||
const std::string& this_fallback_field_type);
|
||||
|
||||
@ -278,6 +278,8 @@ private:
|
||||
|
||||
static void hide_credential(nlohmann::json& json, const std::string& credential_name);
|
||||
|
||||
void remove_embedding_field(const std::string& field_name);
|
||||
|
||||
public:
|
||||
|
||||
enum {MAX_ARRAY_MATCHES = 5};
|
||||
|
@ -36,9 +36,6 @@ public:
|
||||
TextEmbedderManager& operator=(const TextEmbedderManager&) = delete;
|
||||
|
||||
Option<TextEmbedder*> get_text_embedder(const nlohmann::json& model_config);
|
||||
void add_text_embedder_to_collection(const std::string& collection_name, const std::string& model_name);
|
||||
void remove_text_embedder_from_collection(const std::string& collection_name, const std::string& model_name);
|
||||
void remove_collection(const std::string& collection_name);
|
||||
|
||||
void delete_text_embedder(const std::string& model_path);
|
||||
void delete_all_text_embedders();
|
||||
@ -86,8 +83,6 @@ private:
|
||||
std::unordered_map<std::string, text_embedding_model> public_models;
|
||||
std::mutex text_embedders_mutex;
|
||||
|
||||
std::unordered_map<std::string, std::unordered_set<std::string>> text_embedder_to_collections, collection_to_text_embedders;
|
||||
|
||||
static Option<std::string> get_namespace(const std::string& model_name);
|
||||
};
|
||||
|
||||
|
@ -3765,7 +3765,7 @@ Option<bool> Collection::persist_collection_meta() {
|
||||
return Option<bool>(true);
|
||||
}
|
||||
|
||||
Option<bool> Collection::batch_alter_data(const std::vector<field>& alter_fields,
|
||||
Option<bool> Collection::batch_alter_data(std::vector<field>& alter_fields,
|
||||
const std::vector<field>& del_fields,
|
||||
const std::string& this_fallback_field_type) {
|
||||
// Update schema with additions (deletions can only be made later)
|
||||
@ -3797,8 +3797,12 @@ Option<bool> Collection::batch_alter_data(const std::vector<field>& alter_fields
|
||||
|
||||
if(f.embed.count(fields::from) != 0) {
|
||||
found_embedding_field = true;
|
||||
auto text_embedders = TextEmbedderManager::get_instance()._get_text_embedders();
|
||||
auto model_name = f.embed[fields::model_config][fields::model_name].get<std::string>();
|
||||
if(text_embedders.count(model_name) == 0) {
|
||||
TextEmbedderManager::get_instance().validate_and_init_model(f.embed[fields::model_config], f.num_dim);
|
||||
}
|
||||
embedding_fields.emplace(f.name, f);
|
||||
TextEmbedderManager::get_instance().add_text_embedder_to_collection(name, f.embed[fields::model_config]["model_name"]);
|
||||
}
|
||||
|
||||
fields.push_back(f);
|
||||
@ -3909,8 +3913,7 @@ Option<bool> Collection::batch_alter_data(const std::vector<field>& alter_fields
|
||||
}
|
||||
|
||||
if(del_field.embed.count(fields::from) != 0) {
|
||||
embedding_fields.erase(del_field.name);
|
||||
TextEmbedderManager::get_instance().remove_text_embedder_from_collection(name, del_field.embed[fields::model_config]["model_name"]);
|
||||
remove_embedding_field(del_field.name);
|
||||
}
|
||||
|
||||
if(del_field.name == ".*") {
|
||||
@ -4226,7 +4229,6 @@ Option<bool> Collection::validate_alter_payload(nlohmann::json& schema_changes,
|
||||
|
||||
if(found_field && field_it.value().embed.count(fields::from) != 0) {
|
||||
updated_embedding_fields.erase(field_it.key());
|
||||
TextEmbedderManager::get_instance().remove_text_embedder_from_collection(name, field_it.value().embed[fields::model_config]["model_name"]);
|
||||
}
|
||||
|
||||
if(found_field) {
|
||||
@ -4236,7 +4238,6 @@ Option<bool> Collection::validate_alter_payload(nlohmann::json& schema_changes,
|
||||
|
||||
if(field_it.value().embed.count(fields::from) != 0) {
|
||||
updated_embedding_fields.erase(field_it.key());
|
||||
TextEmbedderManager::get_instance().remove_text_embedder_from_collection(name, field_it.value().embed[fields::model_config]["model_name"]);
|
||||
}
|
||||
|
||||
// should also remove children if the field being dropped is an object
|
||||
@ -4251,7 +4252,6 @@ Option<bool> Collection::validate_alter_payload(nlohmann::json& schema_changes,
|
||||
|
||||
if(prefix_kv.value().embed.count(fields::from) != 0) {
|
||||
updated_embedding_fields.erase(prefix_kv.key());
|
||||
TextEmbedderManager::get_instance().remove_text_embedder_from_collection(name, prefix_kv.value().embed[fields::model_config]["model_name"]);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -4988,12 +4988,11 @@ void Collection::process_remove_field_for_embedding_fields(const field& del_fiel
|
||||
}
|
||||
|
||||
for(auto& garbage_field: garbage_embed_fields) {
|
||||
embedding_fields.erase(garbage_field.name);
|
||||
remove_embedding_field(garbage_field.name);
|
||||
search_schema.erase(garbage_field.name);
|
||||
fields.erase(std::remove_if(fields.begin(), fields.end(), [&garbage_field](const auto &f) {
|
||||
return f.name == garbage_field.name;
|
||||
}), fields.end());
|
||||
TextEmbedderManager::get_instance().remove_text_embedder_from_collection(name, garbage_field.embed[fields::model_config]["model_name"].get<std::string>());
|
||||
}
|
||||
}
|
||||
|
||||
@ -5031,3 +5030,40 @@ Option<bool> Collection::truncate_after_top_k(const string &field_name, size_t k
|
||||
|
||||
return Option<bool>(true);
|
||||
}
|
||||
|
||||
void Collection::remove_embedding_field(const std::string& field_name) {
|
||||
field del_field;
|
||||
|
||||
if(embedding_fields.find(field_name) != embedding_fields.end()) {
|
||||
del_field = embedding_fields[field_name];
|
||||
} else {
|
||||
return;
|
||||
}
|
||||
|
||||
embedding_fields.erase(field_name);
|
||||
|
||||
auto model_name = del_field.embed[fields::model_config]["model_name"].get<std::string>();
|
||||
|
||||
auto collections = CollectionManager::get_instance().get_collections();
|
||||
|
||||
bool found = false;
|
||||
|
||||
for(auto& collection: collections) {
|
||||
auto embedding_fields_other = collection->embedding_fields;
|
||||
|
||||
for(auto& embedding_field: embedding_fields_other) {
|
||||
if(embedding_field.embed.count(fields::model_config) != 0) {
|
||||
auto model_config = embedding_field.embed[fields::model_config];
|
||||
if(model_config["model_name"].get<std::string>() == model_name) {
|
||||
found = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if(!found) {
|
||||
LOG(INFO) << "Deleting text embedder: " << model_name;
|
||||
TextEmbedderManager::get_instance().delete_text_embedder(model_name);
|
||||
}
|
||||
}
|
@ -88,7 +88,6 @@ Collection* CollectionManager::init_collection(const nlohmann::json & collection
|
||||
}
|
||||
|
||||
field_obj[fields::num_dim] = num_dim;
|
||||
TextEmbedderManager::get_instance().add_text_embedder_to_collection(this_collection_name, model_config["model_name"].get<std::string>());
|
||||
LOG(INFO) << "Model init done.";
|
||||
}
|
||||
|
||||
@ -529,11 +528,35 @@ Option<nlohmann::json> CollectionManager::drop_collection(const std::string& col
|
||||
|
||||
s_lock.unlock();
|
||||
|
||||
auto embedding_fields = collection->get_embedding_fields();
|
||||
|
||||
std::unique_lock u_lock(mutex);
|
||||
collections.erase(actual_coll_name);
|
||||
collection_id_names.erase(collection->get_collection_id());
|
||||
|
||||
TextEmbedderManager::get_instance().remove_collection(actual_coll_name);
|
||||
for(auto& embedding_field : embedding_fields) {
|
||||
bool found = false;
|
||||
auto model_name = embedding_field.embed[fields::model_config]["model_name"].get<std::string>();
|
||||
|
||||
for(auto& collection: collections) {
|
||||
auto embedding_fields_other = collection.second->get_embedding_fields();
|
||||
|
||||
for(auto& embedding_field: embedding_fields_other) {
|
||||
if(embedding_field.embed.count(fields::model_config) != 0) {
|
||||
auto model_config = embedding_field.embed[fields::model_config];
|
||||
if(model_config["model_name"].get<std::string>() == model_name) {
|
||||
found = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if(!found) {
|
||||
LOG(INFO) << "Deleting text embedder: " << model_name;
|
||||
TextEmbedderManager::get_instance().delete_text_embedder(embedding_field.embed[fields::model_config]["model_name"].get<std::string>());
|
||||
}
|
||||
}
|
||||
|
||||
u_lock.unlock();
|
||||
|
||||
|
@ -1140,7 +1140,6 @@ Option<bool> field::validate_and_init_embed_field(const tsl::htrie_map<char, fie
|
||||
if(!res.ok()) {
|
||||
return Option<bool>(res.code(), res.error());
|
||||
}
|
||||
TextEmbedderManager::get_instance().add_text_embedder_to_collection(collection_name, model_config[fields::model_name].get<std::string>());
|
||||
|
||||
LOG(INFO) << "Model init done.";
|
||||
field_json[fields::num_dim] = num_dim;
|
||||
|
@ -394,76 +394,3 @@ bool TextEmbedderManager::is_remote_model(const std::string& model_name) {
|
||||
auto model_namespace = get_namespace(model_name);
|
||||
return model_namespace.ok() && (model_namespace.get() == "openai" || model_namespace.get() == "google" || model_namespace.get() == "gcp");
|
||||
}
|
||||
|
||||
void TextEmbedderManager::add_text_embedder_to_collection(const std::string& collection_name, const std::string& model_name) {
|
||||
std::unique_lock<std::mutex> lock(text_embedders_mutex);
|
||||
auto text_embedder_it = text_embedders.find(model_name);
|
||||
if(text_embedder_it == text_embedders.end()) {
|
||||
LOG(ERROR) << "Text embedder not found: " << model_name;
|
||||
return;
|
||||
}
|
||||
|
||||
if(text_embedder_to_collections.find(model_name) == text_embedder_to_collections.end()) {
|
||||
text_embedder_to_collections.emplace(model_name, std::unordered_set<std::string>{collection_name});
|
||||
} else {
|
||||
text_embedder_to_collections[model_name].emplace(collection_name);
|
||||
}
|
||||
|
||||
if(collection_to_text_embedders.find(collection_name) == collection_to_text_embedders.end()) {
|
||||
collection_to_text_embedders.emplace(collection_name, std::unordered_set<std::string>{model_name});
|
||||
} else {
|
||||
collection_to_text_embedders[collection_name].emplace(model_name);
|
||||
}
|
||||
|
||||
LOG(INFO) << "Added text embedder: " << model_name << " to collection: " << collection_name;
|
||||
}
|
||||
|
||||
|
||||
void TextEmbedderManager::remove_text_embedder_from_collection(const std::string& collection_name, const std::string& model_name) {
|
||||
std::unique_lock<std::mutex> lock(text_embedders_mutex);
|
||||
auto text_embedder_it = text_embedders.find(model_name);
|
||||
if(text_embedder_it == text_embedders.end()) {
|
||||
LOG(ERROR) << "Text embedder not found: " << model_name;
|
||||
return;
|
||||
}
|
||||
|
||||
if(text_embedder_to_collections.find(model_name) == text_embedder_to_collections.end()) {
|
||||
LOG(ERROR) << "Text embedder not found in any collection: " << model_name;
|
||||
return;
|
||||
}
|
||||
|
||||
if(collection_to_text_embedders.find(collection_name) == collection_to_text_embedders.end()) {
|
||||
LOG(ERROR) << "Collection not found: " << collection_name;
|
||||
return;
|
||||
}
|
||||
|
||||
text_embedder_to_collections[model_name].erase(collection_name);
|
||||
collection_to_text_embedders[collection_name].erase(model_name);
|
||||
|
||||
if(text_embedder_to_collections[model_name].empty()) {
|
||||
text_embedder_to_collections.erase(model_name);
|
||||
text_embedders.erase(model_name);
|
||||
}
|
||||
|
||||
LOG(INFO) << "Removed text embedder: " << model_name << " from collection: " << collection_name;
|
||||
}
|
||||
|
||||
|
||||
void TextEmbedderManager::remove_collection(const std::string& collection_name) {
|
||||
std::unique_lock<std::mutex> lock(text_embedders_mutex);
|
||||
if(collection_to_text_embedders.find(collection_name) == collection_to_text_embedders.end()) {
|
||||
LOG(ERROR) << "Collection not found: " << collection_name;
|
||||
return;
|
||||
}
|
||||
|
||||
for(const auto& model_name : collection_to_text_embedders[collection_name]) {
|
||||
text_embedder_to_collections[model_name].erase(collection_name);
|
||||
if(text_embedder_to_collections[model_name].empty()) {
|
||||
text_embedder_to_collections.erase(model_name);
|
||||
text_embedders.erase(model_name);
|
||||
}
|
||||
}
|
||||
|
||||
collection_to_text_embedders.erase(collection_name);
|
||||
LOG(INFO) << "Removed all text embedders for collection: " << collection_name;
|
||||
}
|
@ -2306,4 +2306,100 @@ TEST_F(CollectionVectorTest, TestUnloadingModelsOnCollectionDelete) {
|
||||
|
||||
text_embedders = TextEmbedderManager::get_instance()._get_text_embedders();
|
||||
ASSERT_EQ(0, text_embedders.size());
|
||||
}
|
||||
}
|
||||
|
||||
// Checks that a text embedder stays loaded while at least one collection has
// an embedding field using it, and is unloaded once the last such field is
// dropped through a schema alter.
TEST_F(CollectionVectorTest, TestUnloadingModelsOnDrop) {
    // Number of text embedders currently held by the manager.
    const auto loaded_embedder_count = []() {
        return TextEmbedderManager::get_instance()._get_text_embedders().size();
    };

    nlohmann::json base_schema = R"({
        "name": "test",
        "fields": [
            {
                "name": "title",
                "type": "string"
            },
            {
                "name": "title_vec",
                "type": "float[]",
                "embed": {
                    "from": [
                        "title"
                    ],
                    "model_config": {
                        "model_name": "ts/e5-small"
                    }
                }
            }
        ]
    })"_json;

    // Alter payload that drops the embedding field.
    nlohmann::json drop_vec_payload = R"({
        "fields": [
            {
                "name": "title_vec",
                "drop": true
            }
        ]
    })"_json;

    // Alter payload that (re-)adds the embedding field.
    nlohmann::json add_vec_payload = R"({
        "fields": [
            {
                "name": "title_vec",
                "type": "float[]",
                "embed": {
                    "from": [
                        "title"
                    ],
                    "model_config": {
                        "model_name": "ts/e5-small"
                    }
                }
            }
        ]
    })"_json;

    TextEmbedderManager::set_model_dir("/tmp/typesense_test/models");

    // First collection: creating it loads the model.
    auto first_schema = base_schema;
    auto first_create_op = collectionManager.create_collection(first_schema);
    ASSERT_TRUE(first_create_op.ok());
    auto first_coll = first_create_op.get();

    ASSERT_EQ(1, loaded_embedder_count());

    // Dropping the only embedding field unloads the model.
    auto first_drop_op = first_coll->alter(drop_vec_payload);
    ASSERT_TRUE(first_drop_op.ok());
    ASSERT_EQ(0, loaded_embedder_count());

    // create another collection
    auto second_schema = base_schema;
    second_schema["name"] = "test2";
    auto second_create_op = collectionManager.create_collection(second_schema);
    ASSERT_TRUE(second_create_op.ok());
    auto second_coll = second_create_op.get();

    // Re-add the embedding field to the first collection: both collections now
    // share the same model.
    auto readd_op = first_coll->alter(add_vec_payload);
    ASSERT_TRUE(readd_op.ok());

    LOG(INFO) << "created second collection";

    ASSERT_EQ(1, loaded_embedder_count());

    // The first collection still uses the model, so it must stay loaded.
    auto second_drop_op = second_coll->alter(drop_vec_payload);
    ASSERT_TRUE(second_drop_op.ok());
    ASSERT_EQ(1, loaded_embedder_count());

    // Last user gone: the model is unloaded.
    auto final_drop_op = first_coll->alter(drop_vec_payload);
    ASSERT_TRUE(final_drop_op.ok());
    ASSERT_EQ(0, loaded_embedder_count());
}
|
Loading…
x
Reference in New Issue
Block a user