Merge pull request #1119 from ozanarmagan/v0.25-join

Ignore null optional fields while generating embedding
This commit is contained in:
Kishore Nallan 2023-07-28 21:54:45 +05:30 committed by GitHub
commit 86eb4989f1
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 65 additions and 8 deletions

View File

@ -6492,13 +6492,17 @@ void Index::batch_embed_fields(std::vector<index_record*>& records,
continue;
}
std::string text = indexing_prefix;
auto embed_from = field.embed[fields::from].get<std::vector<std::string>>();
const auto& embed_from = field.embed[fields::from].get<std::vector<std::string>>();
for(const auto& field_name : embed_from) {
auto field_it = search_schema.find(field_name);
auto doc_field_it = document->find(field_name);
if(doc_field_it == document->end()) {
continue;
}
if(field_it.value().type == field_types::STRING) {
text += (*document)[field_name].get<std::string>() + " ";
text += doc_field_it->get<std::string>() + " ";
} else if(field_it.value().type == field_types::STRING_ARRAY) {
for(const auto& val : (*document)[field_name]) {
for(const auto& val : *(doc_field_it)) {
text += val.get<std::string>() + " ";
}
}
@ -6511,7 +6515,7 @@ void Index::batch_embed_fields(std::vector<index_record*>& records,
if(texts_to_embed.empty()) {
continue;
}
TextEmbedderManager& embedder_manager = TextEmbedderManager::get_instance();
auto embedder_op = embedder_manager.get_text_embedder(field.embed[fields::model_config]);

View File

@ -669,7 +669,9 @@ Option<bool> validator_t::validate_embed_fields(const nlohmann::json& document,
const tsl::htrie_map<char, field> & search_schema,
const bool& error_if_field_not_found) {
for(const auto& field : embedding_fields) {
auto embed_from = field.embed[fields::from].get<std::vector<std::string>>();
const auto& embed_from = field.embed[fields::from].get<std::vector<std::string>>();
// flag to check if all fields to embed from are optional and null
bool all_optional_and_null = true;
for(const auto& field_name : embed_from) {
auto schema_field_it = search_schema.find(field_name);
auto doc_field_it = document.find(field_name);
@ -677,12 +679,13 @@ Option<bool> validator_t::validate_embed_fields(const nlohmann::json& document,
return Option<bool>(400, "Field `" + field.name + "` has invalid fields to create embeddings from.");
}
if(doc_field_it == document.end()) {
if(error_if_field_not_found) {
if(error_if_field_not_found && !schema_field_it->optional) {
return Option<bool>(400, "Field `" + field_name + "` is needed to create embedding.");
} else {
continue;
}
}
all_optional_and_null = false;
if((schema_field_it.value().type == field_types::STRING && !doc_field_it.value().is_string()) ||
(schema_field_it.value().type == field_types::STRING_ARRAY && !doc_field_it.value().is_array())) {
return Option<bool>(400, "Field `" + field_name + "` has malformed data.");
@ -695,6 +698,9 @@ Option<bool> validator_t::validate_embed_fields(const nlohmann::json& document,
}
}
}
if(all_optional_and_null && !field.optional) {
return Option<bool>(400, "No valid fields found to create embedding for `" + field.name + "`, please provide at least one valid field or make the embedding field optional.");
}
}
return Option<bool>(true);

View File

@ -4870,8 +4870,7 @@ TEST_F(CollectionTest, MissingFieldForEmbedding) {
doc["names"].push_back("butterball");
auto add_op = coll->add(doc.dump());
ASSERT_FALSE(add_op.ok());
ASSERT_EQ("Field `category` is needed to create embedding.", add_op.error());
ASSERT_TRUE(add_op.ok());
}
TEST_F(CollectionTest, WrongTypeInEmbedFrom) {

View File

@ -987,4 +987,52 @@ TEST_F(CollectionVectorTest, HybridSearchSortByGeopoint) {
ASSERT_EQ("butter", search_res["hits"][0]["document"]["name"].get<std::string>());
ASSERT_EQ("butterball", search_res["hits"][1]["document"]["name"].get<std::string>());
ASSERT_EQ("butterfly", search_res["hits"][2]["document"]["name"].get<std::string>());
}
// Exercises indexing when the only `embed.from` source field (`text`) is
// optional and missing from the document:
//   1. embedding field required -> the add is rejected with an explicit error;
//   2. source field present     -> the add succeeds;
//   3. embedding field optional -> an empty document is accepted.
TEST_F(CollectionVectorTest, EmbedFromOptionalNullField) {
    TextEmbedderManager::set_model_dir("/tmp/typesense_test/models");

    // `text` is optional, while `embedding` (embedded from `text`) is not.
    nlohmann::json collection_schema = R"({
"name": "objects",
"fields": [
{"name": "text", "type": "string", "optional": true},
{"name": "embedding", "type":"float[]", "embed":{"from": ["text"], "model_config": {"model_name": "ts/e5-small"}}}
]
})"_json;

    auto create_op = collectionManager.create_collection(collection_schema);
    ASSERT_TRUE(create_op.ok());
    auto collection = create_op.get();

    // Step 1: an empty document must be rejected with the expected message.
    nlohmann::json payload = R"({
})"_json;

    auto rejected_add = collection->add(payload.dump());
    ASSERT_FALSE(rejected_add.ok());
    ASSERT_EQ("No valid fields found to create embedding for `embedding`, please provide at least one valid field or make the embedding field optional.", rejected_add.error());

    // Step 2: once the source field is supplied the document is accepted.
    payload["text"] = "butter";
    auto accepted_add = collection->add(payload.dump());
    ASSERT_TRUE(accepted_add.ok());

    // Step 3: drop the embedding field and re-add it as optional.
    nlohmann::json optional_embedding_schema = R"({
"fields": [
{"name": "embedding", "drop": true},
{"name": "embedding", "type":"float[]", "embed":{"from": ["text"], "model_config": {"model_name": "ts/e5-small"}}, "optional": true}
]
})"_json;

    auto alter_op = collection->alter(optional_embedding_schema);
    ASSERT_TRUE(alter_op.ok());

    // With the embedding field optional, an empty document is now accepted.
    nlohmann::json empty_payload = R"({
})"_json;

    auto optional_add = collection->add(empty_payload.dump());
    ASSERT_TRUE(optional_add.ok());
}