Merge pull request #1238 from ozanarmagan/v0.25-join

Fix updating old documents with embeddings on alter
2025-05-20 21:52:23 +08:00 · 2023-09-24 18:20:36 +05:30 · 2023-09-24 18:20:36 +05:30 · 44ad5fdbcf
commit 44ad5fdbcf
parent c798966a50 e54f680b22
4 changed files with 115 additions and 5 deletions
--- a/src/collection.cpp
+++ b/src/collection.cpp
@@ -3756,6 +3756,8 @@ Option<bool> Collection::batch_alter_data(const std::vector<field>& alter_fields

    std::vector<std::string> nested_field_names;

+    bool found_embedding_field = false;
+
    for(auto& f: alter_fields) {
        if(f.name == ".*") {
            fields.push_back(f);
@@ -3776,6 +3778,7 @@ Option<bool> Collection::batch_alter_data(const std::vector<field>& alter_fields
        }

        if(f.embed.count(fields::from) != 0) {
+            found_embedding_field = true;
            embedding_fields.emplace(f.name, f);
        }

@@ -3834,8 +3837,24 @@ Option<bool> Collection::batch_alter_data(const std::vector<field>& alter_fields
                }
            }
            
-            Index::batch_memory_index(index, iter_batch, default_sorting_field, schema_additions, embedding_fields,
-                                      fallback_field_type, token_separators, symbols_to_index, true, 200, false);
+            Index::batch_memory_index(index, iter_batch, default_sorting_field, search_schema, embedding_fields,
+                                      fallback_field_type, token_separators, symbols_to_index, true, 200, found_embedding_field);
+            if(found_embedding_field) {
+                for(auto& index_record : iter_batch) {
+                    if(index_record.indexed.ok()) {
+                        remove_flat_fields(index_record.doc);
+                        const std::string& serialized_json = index_record.doc.dump(-1, ' ', false, nlohmann::detail::error_handler_t::ignore);
+                        bool write_ok = store->insert(get_seq_id_key(index_record.seq_id), serialized_json);
+
+                        if(!write_ok) {
+                            LOG(ERROR) << "Inserting doc with new embedding field failed for seq id: " << index_record.seq_id;
+                            index_record.index_failure(500, "Could not write to on-disk storage.");
+                        } else {
+                            index_record.index_success();
+                        }
+                    }
+                }
+            }

            iter_batch.clear();
        }
--- a/src/validator.cpp
+++ b/src/validator.cpp
@@ -53,8 +53,9 @@ Option<uint32_t> validator_t::coerce_element(const field& a_field, nlohmann::jso
        }
    } else if(a_field.is_array()) {
        if(!doc_ele.is_array()) {
-            if(a_field.optional && (dirty_values == DIRTY_VALUES::DROP ||
-                                    dirty_values == DIRTY_VALUES::COERCE_OR_DROP)) {
+            bool is_auto_embedding = a_field.type == field_types::FLOAT_ARRAY && a_field.embed.count(fields::from) > 0;
+            if((a_field.optional && (dirty_values == DIRTY_VALUES::DROP ||
+                                    dirty_values == DIRTY_VALUES::COERCE_OR_DROP)) || is_auto_embedding) {
                document.erase(field_name);
                return Option<uint32_t>(200);
            } else {
@@ -630,7 +631,9 @@ Option<uint32_t> validator_t::validate_index_in_memory(nlohmann::json& document,
            continue;
        }

-        if(document.count(field_name) == 0) {
+        bool is_auto_embedding = a_field.type == field_types::FLOAT_ARRAY && a_field.embed.count(fields::from) > 0;
+
+        if(document.count(field_name) == 0 && !is_auto_embedding) {
            return Option<>(400, "Field `" + field_name  + "` has been declared in the schema, "
                                                           "but is not found in the document.");
        }
--- a/test/collection_schema_change_test.cpp
+++ b/test/collection_schema_change_test.cpp
@@ -1793,3 +1793,46 @@ TEST_F(CollectionSchemaChangeTest, EmbeddingFieldAlterDropTest) {
    ASSERT_EQ(0, vec_index.size());
    ASSERT_EQ(0, vec_index.count("embedding"));
 }
+
+
+// Altering a schema to add an auto-embedding field must back-fill embeddings for documents indexed earlier,
+// and the documents returned afterwards must keep nested objects intact without exposing flattened keys.
+TEST_F(CollectionSchemaChangeTest, EmbeddingFieldAlterUpdateOldDocs) {
+    nlohmann::json schema = R"({
+            "name": "objects",
+            "fields": [
+                {"name": "title", "type": "string"},
+                {"name": "nested", "type": "object"}
+            ],
+            "enable_nested_fields": true
+        })"_json;
+
+    TextEmbedderManager::set_model_dir("/tmp/typesense_test/models");
+
+    auto op = collectionManager.create_collection(schema);
+    ASSERT_TRUE(op.ok());
+    Collection* coll = op.get();
+
+    // Added before the embedding field exists, so only the later `alter` can give it an embedding.
+    nlohmann::json doc = R"({"title": "hello", "nested": {"hello": "world"}})"_json;
+    auto add_op = coll->add(doc.dump());
+    ASSERT_TRUE(add_op.ok());
+
+    nlohmann::json schema_change = R"({
+            "fields": [
+                {"name": "embedding", "type":"float[]", "embed":{"from": ["title"], "model_config": {"model_name": "ts/e5-small"}}}
+            ]
+        })"_json;
+
+    auto schema_change_op = coll->alter(schema_change);
+    ASSERT_TRUE(schema_change_op.ok());
+
+    auto search_res = coll->search("*", {}, "", {}, {}, {0}, 3, 1, FREQUENCY, {true}, 5);
+    ASSERT_TRUE(search_res.ok());  // fail fast if the search errored, before reading its result
+    nlohmann::json results = search_res.get();
+    ASSERT_EQ(1, results["found"].get<size_t>());
+    ASSERT_EQ(384, results["hits"][0]["document"]["embedding"].get<std::vector<float>>().size());
+    ASSERT_EQ(1, results["hits"][0]["document"]["nested"].size());
+    ASSERT_EQ(0, results["hits"][0]["document"].count(".flat"));
+    ASSERT_EQ(0, results["hits"][0]["document"].count("nested.hello"));
+}
--- a/test/collection_vector_search_test.cpp
+++ b/test/collection_vector_search_test.cpp
@@ -2077,4 +2077,49 @@ TEST_F(CollectionVectorTest, TestTwoEmbeddingFieldsSamePrefix) {
                                 0, spp::sparse_hash_set<std::string>());

    ASSERT_TRUE(semantic_results.ok());
+}
+
+// Keyword search on `title` must succeed when an auto-embedding field (`title_vec`) has a name
+// that starts with the same `title` prefix.
+TEST_F(CollectionVectorTest, TestOneEmbeddingOneKeywordFieldsHaveSamePrefix) {
+    TextEmbedderManager::set_model_dir("/tmp/typesense_test/models");
+
+    nlohmann::json collection_schema = R"({
+                        "name": "test",
+                        "fields": [
+                            {
+                                "name": "title",
+                                "type": "string"
+                            },
+                            {
+                            "name": "title_vec",
+                            "type": "float[]",
+                            "embed": {
+                                "from": [
+                                    "title"
+                                ],
+                                "model_config": {
+                                    "model_name": "ts/e5-small"
+                                }
+                            }
+                            }
+                        ]
+                        })"_json;
+
+    auto create_op = collectionManager.create_collection(collection_schema);
+    ASSERT_TRUE(create_op.ok());
+
+    auto collection = create_op.get();
+
+    // Only `title` is supplied; `title_vec` is declared with `embed.from: ["title"]`.
+    const std::string doc_json = R"({
+        "title": "john doe"
+    })"_json.dump();
+    auto insert_op = collection->add(doc_json);
+    ASSERT_TRUE(insert_op.ok());
+
+    auto keyword_search_op = collection->search("john", {"title"}, "", {}, {}, {2}, 10, 1, FREQUENCY, {true},
+                                                0, spp::sparse_hash_set<std::string>());
+
+    ASSERT_TRUE(keyword_search_op.ok());
 }