mirror of
https://github.com/typesense/typesense.git
synced 2025-05-20 13:42:26 +08:00
Merge pull request #1238 from ozanarmagan/v0.25-join
Fix updating old documents with embeddings on alter
This commit is contained in:
commit
44ad5fdbcf
@ -3756,6 +3756,8 @@ Option<bool> Collection::batch_alter_data(const std::vector<field>& alter_fields
|
||||
|
||||
std::vector<std::string> nested_field_names;
|
||||
|
||||
bool found_embedding_field = false;
|
||||
|
||||
for(auto& f: alter_fields) {
|
||||
if(f.name == ".*") {
|
||||
fields.push_back(f);
|
||||
@ -3776,6 +3778,7 @@ Option<bool> Collection::batch_alter_data(const std::vector<field>& alter_fields
|
||||
}
|
||||
|
||||
if(f.embed.count(fields::from) != 0) {
|
||||
found_embedding_field = true;
|
||||
embedding_fields.emplace(f.name, f);
|
||||
}
|
||||
|
||||
@ -3834,8 +3837,24 @@ Option<bool> Collection::batch_alter_data(const std::vector<field>& alter_fields
|
||||
}
|
||||
}
|
||||
|
||||
Index::batch_memory_index(index, iter_batch, default_sorting_field, schema_additions, embedding_fields,
|
||||
fallback_field_type, token_separators, symbols_to_index, true, 200, false);
|
||||
Index::batch_memory_index(index, iter_batch, default_sorting_field, search_schema, embedding_fields,
|
||||
fallback_field_type, token_separators, symbols_to_index, true, 200, found_embedding_field);
|
||||
if(found_embedding_field) {
|
||||
for(auto& index_record : iter_batch) {
|
||||
if(index_record.indexed.ok()) {
|
||||
remove_flat_fields(index_record.doc);
|
||||
const std::string& serialized_json = index_record.doc.dump(-1, ' ', false, nlohmann::detail::error_handler_t::ignore);
|
||||
bool write_ok = store->insert(get_seq_id_key(index_record.seq_id), serialized_json);
|
||||
|
||||
if(!write_ok) {
|
||||
LOG(ERROR) << "Inserting doc with new embedding field failed for seq id: " << index_record.seq_id;
|
||||
index_record.index_failure(500, "Could not write to on-disk storage.");
|
||||
} else {
|
||||
index_record.index_success();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
iter_batch.clear();
|
||||
}
|
||||
|
@ -53,8 +53,9 @@ Option<uint32_t> validator_t::coerce_element(const field& a_field, nlohmann::jso
|
||||
}
|
||||
} else if(a_field.is_array()) {
|
||||
if(!doc_ele.is_array()) {
|
||||
if(a_field.optional && (dirty_values == DIRTY_VALUES::DROP ||
|
||||
dirty_values == DIRTY_VALUES::COERCE_OR_DROP)) {
|
||||
bool is_auto_embedding = a_field.type == field_types::FLOAT_ARRAY && a_field.embed.count(fields::from) > 0;
|
||||
if((a_field.optional && (dirty_values == DIRTY_VALUES::DROP ||
|
||||
dirty_values == DIRTY_VALUES::COERCE_OR_DROP)) || is_auto_embedding) {
|
||||
document.erase(field_name);
|
||||
return Option<uint32_t>(200);
|
||||
} else {
|
||||
@ -630,7 +631,9 @@ Option<uint32_t> validator_t::validate_index_in_memory(nlohmann::json& document,
|
||||
continue;
|
||||
}
|
||||
|
||||
if(document.count(field_name) == 0) {
|
||||
bool is_auto_embedding = a_field.type == field_types::FLOAT_ARRAY && a_field.embed.count(fields::from) > 0;
|
||||
|
||||
if(document.count(field_name) == 0 && !is_auto_embedding) {
|
||||
return Option<>(400, "Field `" + field_name + "` has been declared in the schema, "
|
||||
"but is not found in the document.");
|
||||
}
|
||||
|
@ -1793,3 +1793,46 @@ TEST_F(CollectionSchemaChangeTest, EmbeddingFieldAlterDropTest) {
|
||||
ASSERT_EQ(0, vec_index.size());
|
||||
ASSERT_EQ(0, vec_index.count("embedding"));
|
||||
}
|
||||
|
||||
|
||||
// Regression test: altering a collection to add an auto-embedding field must
// back-fill documents that were indexed before the alter.
//
// Flow:
//   1. Create collection `objects` with a string field `title` and an object
//      field `nested`, with nested fields enabled.
//   2. Add one document that has a `title` and a `nested` object.
//   3. Alter the schema to add `embedding` (float[]), embedded from `title`
//      with model `ts/e5-small`.
//   4. Search with `*` and check that the returned document carries the new
//      embedding, still has its `nested` object, and exposes neither a `.flat`
//      key nor a flattened `nested.hello` key.
TEST_F(CollectionSchemaChangeTest, EmbeddingFieldAlterUpdateOldDocs) {
|
||||
nlohmann::json schema = R"({
|
||||
"name": "objects",
|
||||
"fields": [
|
||||
{"name": "title", "type": "string"},
|
||||
{"name": "nested", "type": "object"}
|
||||
],
|
||||
"enable_nested_fields": true
|
||||
})"_json;
|
||||
|
||||
// Point the embedder manager at the test model directory before any embedding field exists.
TextEmbedderManager::set_model_dir("/tmp/typesense_test/models");
|
||||
|
||||
auto op = collectionManager.create_collection(schema);
|
||||
ASSERT_TRUE(op.ok());
|
||||
Collection* coll = op.get();
|
||||
|
||||
// The "old" document: indexed before the embedding field is added to the schema.
nlohmann::json doc;
|
||||
doc["title"] = "hello";
|
||||
doc["nested"] = nlohmann::json::object();
|
||||
doc["nested"]["hello"] = "world";
|
||||
|
||||
auto add_op = coll->add(doc.dump());
|
||||
ASSERT_TRUE(add_op.ok());
|
||||
|
||||
// Schema alteration that introduces the auto-embedding field sourced from `title`.
nlohmann::json schema_change = R"({
|
||||
"fields": [
|
||||
{"name": "embedding", "type":"float[]", "embed":{"from": ["title"], "model_config": {"model_name": "ts/e5-small"}}}
|
||||
]
|
||||
})"_json;
|
||||
|
||||
auto schema_change_op = coll->alter(schema_change);
|
||||
ASSERT_TRUE(schema_change_op.ok());
|
||||
|
||||
auto search_res = coll->search("*", {}, "", {}, {}, {0}, 3, 1, FREQUENCY, {true}, 5);
|
||||
|
||||
ASSERT_EQ(1, search_res.get()["found"].get<size_t>());
|
||||
// 384 is presumably the output dimension of ts/e5-small -- TODO confirm against the model config.
ASSERT_EQ(384, search_res.get()["hits"][0]["document"]["embedding"].get<std::vector<float>>().size());
|
||||
// The nested object must keep its single original key.
ASSERT_EQ(1, search_res.get()["hits"][0]["document"]["nested"].size());
|
||||
// Neither a `.flat` key nor a flattened `nested.hello` key may appear in the returned document.
ASSERT_EQ(0, search_res.get()["hits"][0]["document"].count(".flat"));
|
||||
ASSERT_EQ(0, search_res.get()["hits"][0]["document"].count("nested.hello"));
|
||||
}
|
||||
|
@ -2077,4 +2077,49 @@ TEST_F(CollectionVectorTest, TestTwoEmbeddingFieldsSamePrefix) {
|
||||
0, spp::sparse_hash_set<std::string>());
|
||||
|
||||
ASSERT_TRUE(semantic_results.ok());
|
||||
}
|
||||
|
||||
// Checks that a keyword search succeeds when a plain string field (`title`)
// and an auto-embedding field (`title_vec`, embedded from `title` with model
// `ts/e5-small`) have names sharing the same prefix.
//
// Flow: create the collection, add one document containing only `title`,
// then run a keyword search for "john" restricted to the `title` field and
// assert that the search operation reports success.
TEST_F(CollectionVectorTest, TestOneEmbeddingOneKeywordFieldsHaveSamePrefix) {
|
||||
nlohmann::json schema = R"({
|
||||
"name": "test",
|
||||
"fields": [
|
||||
{
|
||||
"name": "title",
|
||||
"type": "string"
|
||||
},
|
||||
{
|
||||
"name": "title_vec",
|
||||
"type": "float[]",
|
||||
"embed": {
|
||||
"from": [
|
||||
"title"
|
||||
],
|
||||
"model_config": {
|
||||
"model_name": "ts/e5-small"
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
})"_json;
|
||||
|
||||
// Point the embedder manager at the test model directory before creating the collection.
TextEmbedderManager::set_model_dir("/tmp/typesense_test/models");
|
||||
|
||||
auto collection_create_op = collectionManager.create_collection(schema);
|
||||
|
||||
ASSERT_TRUE(collection_create_op.ok());
|
||||
|
||||
auto coll1 = collection_create_op.get();
|
||||
|
||||
// The document supplies only `title`; no `title_vec` value is provided by the caller.
auto add_op = coll1->add(R"({
|
||||
"title": "john doe"
|
||||
})"_json.dump());
|
||||
|
||||
ASSERT_TRUE(add_op.ok());
|
||||
|
||||
// Keyword search that names only the `title` field as the query field.
auto keyword_results = coll1->search("john", {"title"},
|
||||
"", {}, {}, {2}, 10,
|
||||
1, FREQUENCY, {true},
|
||||
0, spp::sparse_hash_set<std::string>());
|
||||
|
||||
// Only success of the operation is asserted; the hits themselves are not inspected.
ASSERT_TRUE(keyword_results.ok());
|
||||
}
|
Loading…
x
Reference in New Issue
Block a user