Merge pull request #1119 from ozanarmagan/v0.25-join

Ignore null optional fields while generating embedding
2025-05-23 07:09:44 +08:00 · 2023-07-28 21:54:45 +05:30 · 2023-07-28 21:54:45 +05:30 · 86eb4989f1
commit 86eb4989f1
parent 79986fd0a1 dce27b918e
4 changed files with 65 additions and 8 deletions
--- a/src/index.cpp
+++ b/src/index.cpp
@ -6492,13 +6492,17 @@ void Index::batch_embed_fields(std::vector<index_record*>& records,
                continue;
            }
            std::string text = indexing_prefix;
-            auto embed_from = field.embed[fields::from].get<std::vector<std::string>>();
+            const auto& embed_from = field.embed[fields::from].get<std::vector<std::string>>();
            for(const auto& field_name : embed_from) {
                auto field_it = search_schema.find(field_name);
+                auto doc_field_it = document->find(field_name);
+                if(doc_field_it == document->end()) {
+                        continue;
+                }
                if(field_it.value().type == field_types::STRING) {
-                    text += (*document)[field_name].get<std::string>() + " ";
+                    text += doc_field_it->get<std::string>() + " ";
                } else if(field_it.value().type == field_types::STRING_ARRAY) {
-                    for(const auto& val : (*document)[field_name]) {
+                    for(const auto& val : *(doc_field_it)) {
                        text += val.get<std::string>() + " ";
                    }
                }
@ -6511,7 +6515,7 @@ void Index::batch_embed_fields(std::vector<index_record*>& records,
        if(texts_to_embed.empty()) {
            continue;
        }
-        
+
        TextEmbedderManager& embedder_manager = TextEmbedderManager::get_instance();
        auto embedder_op = embedder_manager.get_text_embedder(field.embed[fields::model_config]);

--- a/src/validator.cpp
+++ b/src/validator.cpp
@ -669,7 +669,9 @@ Option<bool> validator_t::validate_embed_fields(const nlohmann::json& document,
                                          const tsl::htrie_map<char, field> & search_schema,
                                          const bool& error_if_field_not_found) {
    for(const auto& field : embedding_fields) {
-        auto embed_from = field.embed[fields::from].get<std::vector<std::string>>();
+        const auto& embed_from = field.embed[fields::from].get<std::vector<std::string>>();
+        // flag to check if all fields to embed from are optional and null
+        bool all_optional_and_null = true;
        for(const auto& field_name : embed_from) {
            auto schema_field_it = search_schema.find(field_name);
            auto doc_field_it = document.find(field_name);
@ -677,12 +679,13 @@ Option<bool> validator_t::validate_embed_fields(const nlohmann::json& document,
                return Option<bool>(400, "Field `" + field.name + "` has invalid fields to create embeddings from.");
            }
            if(doc_field_it == document.end()) {
-                if(error_if_field_not_found) {
+                if(error_if_field_not_found && !schema_field_it->optional) {
                    return Option<bool>(400, "Field `" + field_name + "` is needed to create embedding.");
                } else {
                    continue;
                }
            }
+            all_optional_and_null = false;
            if((schema_field_it.value().type == field_types::STRING && !doc_field_it.value().is_string()) || 
                (schema_field_it.value().type == field_types::STRING_ARRAY && !doc_field_it.value().is_array())) {
                return Option<bool>(400, "Field `" + field_name + "` has malformed data.");
@ -695,6 +698,9 @@ Option<bool> validator_t::validate_embed_fields(const nlohmann::json& document,
                }
            }
        }
+        if(all_optional_and_null && !field.optional) {
+            return Option<bool>(400, "No valid fields found to create embedding for `" + field.name + "`, please provide at least one valid field or make the embedding field optional.");
+        }
    }

    return Option<bool>(true);
--- a/test/collection_test.cpp
+++ b/test/collection_test.cpp
@ -4870,8 +4870,7 @@ TEST_F(CollectionTest, MissingFieldForEmbedding) {
    doc["names"].push_back("butterball");

    auto add_op = coll->add(doc.dump());
-    ASSERT_FALSE(add_op.ok());
-    ASSERT_EQ("Field `category` is needed to create embedding.", add_op.error());
+    ASSERT_TRUE(add_op.ok());
 }

 TEST_F(CollectionTest, WrongTypeInEmbedFrom) {
--- a/test/collection_vector_search_test.cpp
+++ b/test/collection_vector_search_test.cpp
@ -987,4 +987,52 @@ TEST_F(CollectionVectorTest, HybridSearchSortByGeopoint) {
    ASSERT_EQ("butter", search_res["hits"][0]["document"]["name"].get<std::string>());
    ASSERT_EQ("butterball", search_res["hits"][1]["document"]["name"].get<std::string>());
    ASSERT_EQ("butterfly", search_res["hits"][2]["document"]["name"].get<std::string>());
+}
+
+// An empty document is rejected while `embedding` is required, and accepted once `embedding` is re-added as optional.
+TEST_F(CollectionVectorTest, EmbedFromOptionalNullField) {
+    nlohmann::json schema = R"({
+                "name": "objects",
+                "fields": [
+                {"name": "text", "type": "string", "optional": true},
+                {"name": "embedding", "type":"float[]", "embed":{"from": ["text"], "model_config": {"model_name": "ts/e5-small"}}}
+                ]
+            })"_json;
+
+    TextEmbedderManager::set_model_dir("/tmp/typesense_test/models");
+
+    auto op = collectionManager.create_collection(schema);
+
+    ASSERT_TRUE(op.ok());
+    auto coll = op.get();
+
+    nlohmann::json doc = R"({
+    })"_json;
+    // `text` (the only embed source) is absent and `embedding` is not optional, so this add is expected to fail.
+    auto add_op = coll->add(doc.dump());
+
+    ASSERT_FALSE(add_op.ok());
+    ASSERT_EQ("No valid fields found to create embedding for `embedding`, please provide at least one valid field or make the embedding field optional.", add_op.error());
+    // With `text` present, the same add is expected to succeed.
+    doc["text"] = "butter";
+    add_op = coll->add(doc.dump());
+    ASSERT_TRUE(add_op.ok());
+    // drop the embedding field and re-add it with the same definition, but marked optional
+
+    nlohmann::json alter_schema = R"({
+        "fields": [
+        {"name": "embedding", "drop": true},
+        {"name": "embedding", "type":"float[]", "embed":{"from": ["text"], "model_config": {"model_name": "ts/e5-small"}}, "optional": true}
+        ]
+    })"_json;
+
+    auto update_op = coll->alter(alter_schema);
+    ASSERT_TRUE(update_op.ok());
+
+    // Now that `embedding` is optional, a document without `text` is expected to be accepted.
+    doc = R"({
+    })"_json;
+    add_op = coll->add(doc.dump());
+
+    ASSERT_TRUE(add_op.ok());
 }