From eaef5cb8c6e4c91a21bf626917521535cedf1142 Mon Sep 17 00:00:00 2001 From: ozanarmagan Date: Thu, 21 Sep 2023 16:38:35 +0300 Subject: [PATCH 1/3] Fix reindexing old documents with embeddings on alter --- src/collection.cpp | 23 ++++++++++++++-- src/validator.cpp | 9 ++++--- test/collection_schema_change_test.cpp | 36 ++++++++++++++++++++++++++ 3 files changed, 63 insertions(+), 5 deletions(-) diff --git a/src/collection.cpp b/src/collection.cpp index 28907b33..765dbc06 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -3756,6 +3756,8 @@ Option Collection::batch_alter_data(const std::vector& alter_fields std::vector nested_field_names; + bool found_embedding_field = false; + for(auto& f: alter_fields) { if(f.name == ".*") { fields.push_back(f); @@ -3776,6 +3778,7 @@ Option Collection::batch_alter_data(const std::vector& alter_fields } if(f.embed.count(fields::from) != 0) { + found_embedding_field = true; embedding_fields.emplace(f.name, f); } @@ -3834,8 +3837,24 @@ Option Collection::batch_alter_data(const std::vector& alter_fields } } - Index::batch_memory_index(index, iter_batch, default_sorting_field, schema_additions, embedding_fields, - fallback_field_type, token_separators, symbols_to_index, true, 200, false); + Index::batch_memory_index(index, iter_batch, default_sorting_field, search_schema, embedding_fields, + fallback_field_type, token_separators, symbols_to_index, true, 200, found_embedding_field); + if(found_embedding_field) { + for(auto& index_record : iter_batch) { + if(index_record.indexed.ok()) { + remove_flat_fields(index_record.doc); + const std::string& serialized_json = index_record.doc.dump(-1, ' ', false, nlohmann::detail::error_handler_t::ignore); + bool write_ok = store->insert(get_seq_id_key(index_record.seq_id), serialized_json); + + if(!write_ok) { + LOG(ERROR) << "Inserting doc with new embedding field failed for seq id: " << index_record.seq_id; + index_record.index_failure(500, "Could not write to on-disk storage."); + } else { + index_record.index_success(); + } + } + } + } iter_batch.clear(); } diff --git a/src/validator.cpp b/src/validator.cpp index f8c23ee9..4f3df1aa 100644 --- a/src/validator.cpp +++ b/src/validator.cpp @@ -53,8 +53,9 @@ Option validator_t::coerce_element(const field& a_field, nlohmann::jso } } else if(a_field.is_array()) { if(!doc_ele.is_array()) { - if(a_field.optional && (dirty_values == DIRTY_VALUES::DROP || - dirty_values == DIRTY_VALUES::COERCE_OR_DROP)) { + bool is_embedding_field = a_field.type == field_types::FLOAT_ARRAY && a_field.embed.count(fields::from) > 0; + if((a_field.optional && (dirty_values == DIRTY_VALUES::DROP || + dirty_values == DIRTY_VALUES::COERCE_OR_DROP)) || is_embedding_field) { document.erase(field_name); return Option(200); } else { @@ -630,7 +631,9 @@ Option validator_t::validate_index_in_memory(nlohmann::json& document, continue; } - if(document.count(field_name) == 0) { + bool is_embedding_field = a_field.type == field_types::FLOAT_ARRAY && a_field.embed.count(fields::from) > 0; + + if(document.count(field_name) == 0 && !is_embedding_field) { return Option<>(400, "Field `" + field_name + "` has been declared in the schema, " "but is not found in the document."); } diff --git a/test/collection_schema_change_test.cpp b/test/collection_schema_change_test.cpp index c936ce78..858818fc 100644 --- a/test/collection_schema_change_test.cpp +++ b/test/collection_schema_change_test.cpp @@ -1793,3 +1793,39 @@ TEST_F(CollectionSchemaChangeTest, EmbeddingFieldAlterDropTest) { ASSERT_EQ(0, vec_index.size()); ASSERT_EQ(0, vec_index.count("embedding")); } + + +TEST_F(CollectionSchemaChangeTest, EmbeddingFieldAlterUpdateOldDocs) { + nlohmann::json schema = R"({ + "name": "objects", + "fields": [ + {"name": "title", "type": "string"} + ] + })"_json; + + TextEmbedderManager::set_model_dir("/tmp/typesense_test/models"); + + auto op = collectionManager.create_collection(schema); + ASSERT_TRUE(op.ok()); + Collection* coll = op.get(); + + nlohmann::json doc; + doc["title"] = "hello"; + + auto add_op = coll->add(doc.dump()); + ASSERT_TRUE(add_op.ok()); + + nlohmann::json schema_change = R"({ + "fields": [ + {"name": "embedding", "type":"float[]", "embed":{"from": ["title"], "model_config": {"model_name": "ts/e5-small"}}} + ] + })"_json; + + auto schema_change_op = coll->alter(schema_change); + ASSERT_TRUE(schema_change_op.ok()); + + auto search_res = coll->search("*", {}, "", {}, {}, {0}, 3, 1, FREQUENCY, {true}, 5); + + ASSERT_EQ(1, search_res.get()["found"].get()); + ASSERT_EQ(384, search_res.get()["hits"][0]["document"]["embedding"].get>().size()); +} From 88f53c7f32d75f6601610c208754c656ba1da976 Mon Sep 17 00:00:00 2001 From: ozanarmagan Date: Sun, 24 Sep 2023 14:33:24 +0300 Subject: [PATCH 2/3] Add assert for `.flat` --- src/validator.cpp | 8 ++++---- test/collection_schema_change_test.cpp | 11 +++++++++-- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/src/validator.cpp b/src/validator.cpp index 4f3df1aa..b18af4f1 100644 --- a/src/validator.cpp +++ b/src/validator.cpp @@ -53,9 +53,9 @@ Option validator_t::coerce_element(const field& a_field, nlohmann::jso } } else if(a_field.is_array()) { if(!doc_ele.is_array()) { - bool is_embedding_field = a_field.type == field_types::FLOAT_ARRAY && a_field.embed.count(fields::from) > 0; + bool is_auto_embedding = a_field.type == field_types::FLOAT_ARRAY && a_field.embed.count(fields::from) > 0; if((a_field.optional && (dirty_values == DIRTY_VALUES::DROP || - dirty_values == DIRTY_VALUES::COERCE_OR_DROP)) || is_embedding_field) { + dirty_values == DIRTY_VALUES::COERCE_OR_DROP)) || is_auto_embedding) { document.erase(field_name); return Option(200); } else { @@ -631,9 +631,9 @@ Option validator_t::validate_index_in_memory(nlohmann::json& document, continue; } - bool is_embedding_field = a_field.type == field_types::FLOAT_ARRAY && a_field.embed.count(fields::from) > 0; + bool is_auto_embedding = a_field.type == field_types::FLOAT_ARRAY && a_field.embed.count(fields::from) > 0; - if(document.count(field_name) == 0 && !is_embedding_field) { + if(document.count(field_name) == 0 && !is_auto_embedding) { return Option<>(400, "Field `" + field_name + "` has been declared in the schema, " "but is not found in the document."); } diff --git a/test/collection_schema_change_test.cpp b/test/collection_schema_change_test.cpp index 858818fc..7e454860 100644 --- a/test/collection_schema_change_test.cpp +++ b/test/collection_schema_change_test.cpp @@ -1799,8 +1799,10 @@ TEST_F(CollectionSchemaChangeTest, EmbeddingFieldAlterUpdateOldDocs) { nlohmann::json schema = R"({ "name": "objects", "fields": [ - {"name": "title", "type": "string"} - ] + {"name": "title", "type": "string"}, + {"name": "nested", "type": "object"} + ], + "enable_nested_fields": true })"_json; TextEmbedderManager::set_model_dir("/tmp/typesense_test/models"); @@ -1811,6 +1813,8 @@ TEST_F(CollectionSchemaChangeTest, EmbeddingFieldAlterUpdateOldDocs) { nlohmann::json doc; doc["title"] = "hello"; + doc["nested"] = nlohmann::json::object(); + doc["nested"]["hello"] = "world"; auto add_op = coll->add(doc.dump()); ASSERT_TRUE(add_op.ok()); @@ -1828,4 +1832,7 @@ TEST_F(CollectionSchemaChangeTest, EmbeddingFieldAlterUpdateOldDocs) { ASSERT_EQ(1, search_res.get()["found"].get()); ASSERT_EQ(384, search_res.get()["hits"][0]["document"]["embedding"].get>().size()); + ASSERT_EQ(1, search_res.get()["hits"][0]["document"]["nested"].size()); + ASSERT_EQ(0, search_res.get()["hits"][0]["document"].count(".flat")); + ASSERT_EQ(0, search_res.get()["hits"][0]["document"].count("nested.hello")); } From e54f680b224d3feb7c788651365c3bbafef40ecf Mon Sep 17 00:00:00 2001 From: ozanarmagan Date: Sun, 24 Sep 2023 14:52:48 +0300 Subject: [PATCH 3/3] Add test for one embedding and one keyword field that have same prefix --- test/collection_vector_search_test.cpp | 45 ++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/test/collection_vector_search_test.cpp b/test/collection_vector_search_test.cpp index cacfb978..ea22f823 100644 --- a/test/collection_vector_search_test.cpp +++ b/test/collection_vector_search_test.cpp @@ -2077,4 +2077,49 @@ TEST_F(CollectionVectorTest, TestTwoEmbeddingFieldsSamePrefix) { 0, spp::sparse_hash_set()); ASSERT_TRUE(semantic_results.ok()); +} + +TEST_F(CollectionVectorTest, TestOneEmbeddingOneKeywordFieldsHaveSamePrefix) { + nlohmann::json schema = R"({ + "name": "test", + "fields": [ + { + "name": "title", + "type": "string" + }, + { + "name": "title_vec", + "type": "float[]", + "embed": { + "from": [ + "title" + ], + "model_config": { + "model_name": "ts/e5-small" + } + } + } + ] + })"_json; + + TextEmbedderManager::set_model_dir("/tmp/typesense_test/models"); + + auto collection_create_op = collectionManager.create_collection(schema); + + ASSERT_TRUE(collection_create_op.ok()); + + auto coll1 = collection_create_op.get(); + + auto add_op = coll1->add(R"({ + "title": "john doe" + })"_json.dump()); + + ASSERT_TRUE(add_op.ok()); + + auto keyword_results = coll1->search("john", {"title"}, + "", {}, {}, {2}, 10, + 1, FREQUENCY, {true}, + 0, spp::sparse_hash_set()); + + ASSERT_TRUE(keyword_results.ok()); } \ No newline at end of file