Merge pull request #1238 from ozanarmagan/v0.25-join

Fix updating old documents with embeddings on alter
This commit is contained in:
Kishore Nallan 2023-09-24 18:20:36 +05:30 committed by GitHub
commit 44ad5fdbcf
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 115 additions and 5 deletions

View File

@ -3756,6 +3756,8 @@ Option<bool> Collection::batch_alter_data(const std::vector<field>& alter_fields
std::vector<std::string> nested_field_names;
bool found_embedding_field = false;
for(auto& f: alter_fields) {
if(f.name == ".*") {
fields.push_back(f);
@ -3776,6 +3778,7 @@ Option<bool> Collection::batch_alter_data(const std::vector<field>& alter_fields
}
if(f.embed.count(fields::from) != 0) {
found_embedding_field = true;
embedding_fields.emplace(f.name, f);
}
@ -3834,8 +3837,24 @@ Option<bool> Collection::batch_alter_data(const std::vector<field>& alter_fields
}
}
Index::batch_memory_index(index, iter_batch, default_sorting_field, schema_additions, embedding_fields,
fallback_field_type, token_separators, symbols_to_index, true, 200, false);
Index::batch_memory_index(index, iter_batch, default_sorting_field, search_schema, embedding_fields,
fallback_field_type, token_separators, symbols_to_index, true, 200, found_embedding_field);
if(found_embedding_field) {
for(auto& index_record : iter_batch) {
if(index_record.indexed.ok()) {
remove_flat_fields(index_record.doc);
const std::string& serialized_json = index_record.doc.dump(-1, ' ', false, nlohmann::detail::error_handler_t::ignore);
bool write_ok = store->insert(get_seq_id_key(index_record.seq_id), serialized_json);
if(!write_ok) {
LOG(ERROR) << "Inserting doc with new embedding field failed for seq id: " << index_record.seq_id;
index_record.index_failure(500, "Could not write to on-disk storage.");
} else {
index_record.index_success();
}
}
}
}
iter_batch.clear();
}

View File

@ -53,8 +53,9 @@ Option<uint32_t> validator_t::coerce_element(const field& a_field, nlohmann::jso
}
} else if(a_field.is_array()) {
if(!doc_ele.is_array()) {
if(a_field.optional && (dirty_values == DIRTY_VALUES::DROP ||
dirty_values == DIRTY_VALUES::COERCE_OR_DROP)) {
bool is_auto_embedding = a_field.type == field_types::FLOAT_ARRAY && a_field.embed.count(fields::from) > 0;
if((a_field.optional && (dirty_values == DIRTY_VALUES::DROP ||
dirty_values == DIRTY_VALUES::COERCE_OR_DROP)) || is_auto_embedding) {
document.erase(field_name);
return Option<uint32_t>(200);
} else {
@ -630,7 +631,9 @@ Option<uint32_t> validator_t::validate_index_in_memory(nlohmann::json& document,
continue;
}
if(document.count(field_name) == 0) {
bool is_auto_embedding = a_field.type == field_types::FLOAT_ARRAY && a_field.embed.count(fields::from) > 0;
if(document.count(field_name) == 0 && !is_auto_embedding) {
return Option<>(400, "Field `" + field_name + "` has been declared in the schema, "
"but is not found in the document.");
}

View File

@ -1793,3 +1793,46 @@ TEST_F(CollectionSchemaChangeTest, EmbeddingFieldAlterDropTest) {
ASSERT_EQ(0, vec_index.size());
ASSERT_EQ(0, vec_index.count("embedding"));
}
// Regression test: altering a collection to add an auto-embedding field must also update
// documents that were added before the alter.
//
// Flow:
//   1. Create a collection with nested fields enabled and add one document.
//   2. Alter the schema to add an `embedding` field that is generated from `title`.
//   3. Search and check the returned document:
//        - it carries an `embedding` array with 384 values,
//        - its `nested` object still has exactly one entry,
//        - no flattened helper keys (`.flat`, `nested.hello`) appear in it.
TEST_F(CollectionSchemaChangeTest, EmbeddingFieldAlterUpdateOldDocs) {
    nlohmann::json schema = R"({
"name": "objects",
"fields": [
{"name": "title", "type": "string"},
{"name": "nested", "type": "object"}
],
"enable_nested_fields": true
})"_json;

    TextEmbedderManager::set_model_dir("/tmp/typesense_test/models");

    auto op = collectionManager.create_collection(schema);
    ASSERT_TRUE(op.ok());
    Collection* coll = op.get();

    // Document added BEFORE the embedding field exists.
    nlohmann::json doc;
    doc["title"] = "hello";
    doc["nested"] = nlohmann::json::object();
    doc["nested"]["hello"] = "world";

    auto add_op = coll->add(doc.dump());
    ASSERT_TRUE(add_op.ok());

    nlohmann::json schema_change = R"({
"fields": [
{"name": "embedding", "type":"float[]", "embed":{"from": ["title"], "model_config": {"model_name": "ts/e5-small"}}}
]
})"_json;

    auto schema_change_op = coll->alter(schema_change);
    ASSERT_TRUE(schema_change_op.ok());

    auto search_res = coll->search("*", {}, "", {}, {}, {0}, 3, 1, FREQUENCY, {true}, 5);
    // Stop here with a clear failure if the search itself failed, instead of
    // inspecting the payload of an unsuccessful result.
    ASSERT_TRUE(search_res.ok());

    // Read the result once and reuse it for every check below.
    nlohmann::json results = search_res.get();
    ASSERT_EQ(1, results["found"].get<size_t>());

    auto& hit_doc = results["hits"][0]["document"];
    ASSERT_EQ(384, hit_doc["embedding"].get<std::vector<float>>().size());
    ASSERT_EQ(1, hit_doc["nested"].size());
    ASSERT_EQ(0, hit_doc.count(".flat"));
    ASSERT_EQ(0, hit_doc.count("nested.hello"));
}

View File

@ -2077,4 +2077,49 @@ TEST_F(CollectionVectorTest, TestTwoEmbeddingFieldsSamePrefix) {
0, spp::sparse_hash_set<std::string>());
ASSERT_TRUE(semantic_results.ok());
}
// A keyword field (`title`) and an auto-embedding field (`title_vec`) share the same
// name prefix. A keyword search restricted to `title` must still succeed.
TEST_F(CollectionVectorTest, TestOneEmbeddingOneKeywordFieldsHaveSamePrefix) {
    nlohmann::json schema = R"({
"name": "test",
"fields": [
{
"name": "title",
"type": "string"
},
{
"name": "title_vec",
"type": "float[]",
"embed": {
"from": [
"title"
],
"model_config": {
"model_name": "ts/e5-small"
}
}
}
]
})"_json;

    TextEmbedderManager::set_model_dir("/tmp/typesense_test/models");

    // Create the collection holding both fields.
    auto create_op = collectionManager.create_collection(schema);
    ASSERT_TRUE(create_op.ok());
    auto coll = create_op.get();

    // Index a single document; only `title` is supplied by the caller.
    const std::string doc_json = R"({
"title": "john doe"
})"_json.dump();
    auto insert_op = coll->add(doc_json);
    ASSERT_TRUE(insert_op.ok());

    // Keyword search against `title` only.
    auto keyword_search_op = coll->search("john", {"title"}, "", {}, {}, {2}, 10, 1, FREQUENCY, {true},
                                          0, spp::sparse_hash_set<std::string>());
    ASSERT_TRUE(keyword_search_op.ok());
}