mirror of
https://github.com/typesense/typesense.git
synced 2025-05-23 07:09:44 +08:00
Merge pull request #1119 from ozanarmagan/v0.25-join
Ignore null optional fields while generating embedding
This commit is contained in:
commit
86eb4989f1
@ -6492,13 +6492,17 @@ void Index::batch_embed_fields(std::vector<index_record*>& records,
|
||||
continue;
|
||||
}
|
||||
std::string text = indexing_prefix;
|
||||
auto embed_from = field.embed[fields::from].get<std::vector<std::string>>();
|
||||
const auto& embed_from = field.embed[fields::from].get<std::vector<std::string>>();
|
||||
for(const auto& field_name : embed_from) {
|
||||
auto field_it = search_schema.find(field_name);
|
||||
auto doc_field_it = document->find(field_name);
|
||||
if(doc_field_it == document->end()) {
|
||||
continue;
|
||||
}
|
||||
if(field_it.value().type == field_types::STRING) {
|
||||
text += (*document)[field_name].get<std::string>() + " ";
|
||||
text += doc_field_it->get<std::string>() + " ";
|
||||
} else if(field_it.value().type == field_types::STRING_ARRAY) {
|
||||
for(const auto& val : (*document)[field_name]) {
|
||||
for(const auto& val : *(doc_field_it)) {
|
||||
text += val.get<std::string>() + " ";
|
||||
}
|
||||
}
|
||||
@ -6511,7 +6515,7 @@ void Index::batch_embed_fields(std::vector<index_record*>& records,
|
||||
if(texts_to_embed.empty()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
|
||||
TextEmbedderManager& embedder_manager = TextEmbedderManager::get_instance();
|
||||
auto embedder_op = embedder_manager.get_text_embedder(field.embed[fields::model_config]);
|
||||
|
||||
|
@ -669,7 +669,9 @@ Option<bool> validator_t::validate_embed_fields(const nlohmann::json& document,
|
||||
const tsl::htrie_map<char, field> & search_schema,
|
||||
const bool& error_if_field_not_found) {
|
||||
for(const auto& field : embedding_fields) {
|
||||
auto embed_from = field.embed[fields::from].get<std::vector<std::string>>();
|
||||
const auto& embed_from = field.embed[fields::from].get<std::vector<std::string>>();
|
||||
// flag to check if all fields to embed from are optional and null
|
||||
bool all_optional_and_null = true;
|
||||
for(const auto& field_name : embed_from) {
|
||||
auto schema_field_it = search_schema.find(field_name);
|
||||
auto doc_field_it = document.find(field_name);
|
||||
@ -677,12 +679,13 @@ Option<bool> validator_t::validate_embed_fields(const nlohmann::json& document,
|
||||
return Option<bool>(400, "Field `" + field.name + "` has invalid fields to create embeddings from.");
|
||||
}
|
||||
if(doc_field_it == document.end()) {
|
||||
if(error_if_field_not_found) {
|
||||
if(error_if_field_not_found && !schema_field_it->optional) {
|
||||
return Option<bool>(400, "Field `" + field_name + "` is needed to create embedding.");
|
||||
} else {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
all_optional_and_null = false;
|
||||
if((schema_field_it.value().type == field_types::STRING && !doc_field_it.value().is_string()) ||
|
||||
(schema_field_it.value().type == field_types::STRING_ARRAY && !doc_field_it.value().is_array())) {
|
||||
return Option<bool>(400, "Field `" + field_name + "` has malformed data.");
|
||||
@ -695,6 +698,9 @@ Option<bool> validator_t::validate_embed_fields(const nlohmann::json& document,
|
||||
}
|
||||
}
|
||||
}
|
||||
if(all_optional_and_null && !field.optional) {
|
||||
return Option<bool>(400, "No valid fields found to create embedding for `" + field.name + "`, please provide at least one valid field or make the embedding field optional.");
|
||||
}
|
||||
}
|
||||
|
||||
return Option<bool>(true);
|
||||
|
@ -4870,8 +4870,7 @@ TEST_F(CollectionTest, MissingFieldForEmbedding) {
|
||||
doc["names"].push_back("butterball");
|
||||
|
||||
auto add_op = coll->add(doc.dump());
|
||||
ASSERT_FALSE(add_op.ok());
|
||||
ASSERT_EQ("Field `category` is needed to create embedding.", add_op.error());
|
||||
ASSERT_TRUE(add_op.ok());
|
||||
}
|
||||
|
||||
TEST_F(CollectionTest, WrongTypeInEmbedFrom) {
|
||||
|
@ -987,4 +987,52 @@ TEST_F(CollectionVectorTest, HybridSearchSortByGeopoint) {
|
||||
ASSERT_EQ("butter", search_res["hits"][0]["document"]["name"].get<std::string>());
|
||||
ASSERT_EQ("butterball", search_res["hits"][1]["document"]["name"].get<std::string>());
|
||||
ASSERT_EQ("butterfly", search_res["hits"][2]["document"]["name"].get<std::string>());
|
||||
}
|
||||
|
||||
|
||||
// Exercises an auto-embedding field whose only `embed.from` source (`text`) is an optional field.
// With `embedding` declared without `"optional": true`, adding a document that lacks `text` must fail
// with the "No valid fields found..." error; after `embedding` is re-created as optional, the same
// empty document must be accepted.
TEST_F(CollectionVectorTest, EmbedFromOptionalNullField) {
|
||||
nlohmann::json schema = R"({
|
||||
"name": "objects",
|
||||
"fields": [
|
||||
{"name": "text", "type": "string", "optional": true},
|
||||
{"name": "embedding", "type":"float[]", "embed":{"from": ["text"], "model_config": {"model_name": "ts/e5-small"}}}
|
||||
]
|
||||
})"_json;
|
||||
|
||||
// point the embedder manager at the model directory used by this test before creating the collection
TextEmbedderManager::set_model_dir("/tmp/typesense_test/models");
|
||||
|
||||
auto op = collectionManager.create_collection(schema);
|
||||
|
||||
ASSERT_TRUE(op.ok());
|
||||
auto coll = op.get();
|
||||
|
||||
// empty document: the only source field (`text`) is missing
nlohmann::json doc = R"({
|
||||
})"_json;
|
||||
|
||||
auto add_op = coll->add(doc.dump());
|
||||
|
||||
// `embedding` is not marked optional in the schema above, so the add is expected to be rejected
ASSERT_FALSE(add_op.ok());
|
||||
ASSERT_EQ("No valid fields found to create embedding for `embedding`, please provide at least one valid field or make the embedding field optional.", add_op.error());
|
||||
|
||||
// once the source field is present, the same collection accepts the document
doc["text"] = "butter";
|
||||
add_op = coll->add(doc.dump());
|
||||
ASSERT_TRUE(add_op.ok());
|
||||
// drop the embedding field and re-add it with the same embed config, this time marked optional
|
||||
|
||||
nlohmann::json alter_schema = R"({
|
||||
"fields": [
|
||||
{"name": "embedding", "drop": true},
|
||||
{"name": "embedding", "type":"float[]", "embed":{"from": ["text"], "model_config": {"model_name": "ts/e5-small"}}, "optional": true}
|
||||
]
|
||||
})"_json;
|
||||
|
||||
auto update_op = coll->alter(alter_schema);
|
||||
ASSERT_TRUE(update_op.ok());
|
||||
|
||||
|
||||
// with `embedding` now optional, an empty document (no `text`) must be accepted
doc = R"({
|
||||
})"_json;
|
||||
add_op = coll->add(doc.dump());
|
||||
|
||||
ASSERT_TRUE(add_op.ok());
|
||||
}
|
Loading…
x
Reference in New Issue
Block a user