Fix tokenization for XLM-RoBERTa models whose ONNX session expects 3 inputs, and propagate num_dim when altering embedding fields

This commit is contained in:
ozanarmagan 2023-09-02 01:12:21 +03:00
parent 3b157f6c61
commit 29613ad054
2 changed files with 26 additions and 0 deletions

View File

@ -4300,6 +4300,27 @@ Option<bool> Collection::validate_alter_payload(nlohmann::json& schema_changes,
auto validation_op = field::validate_and_init_embed_fields(embed_json_field_indices, search_schema,
schema_changes["fields"], diff_fields);
for(auto index : embed_json_field_indices) {
auto& field = diff_fields[index.second];
auto is_reindex = (delete_field_names.count(field.name) != 0);
if(is_reindex) {
for(auto& reindex_field: reindex_fields) {
if(reindex_field.name == field.name) {
reindex_field.num_dim = field.num_dim;
break;
}
}
} else {
for(auto& add_field: addition_fields) {
if(add_field.name == field.name) {
add_field.num_dim = field.num_dim;
break;
}
}
}
}
if(!validation_op.ok()) {
return validation_op;
}

View File

@ -117,6 +117,11 @@ embedding_res_t TextEmbedder::Embed(const std::string& text, const size_t remote
input_shapes.push_back({1, static_cast<int64_t>(encoded_input.input_ids.size())});
input_shapes.push_back({1, static_cast<int64_t>(encoded_input.attention_mask.size())});
if(session_->GetInputCount() == 3) {
// Edge case: XLM-RoBERTa tokenizers do not produce token_type_ids, but if the exported model still declares it as an input, fill it with zeros
if(encoded_input.token_type_ids.size() == 0) {
encoded_input.token_type_ids.resize(encoded_input.input_ids.size(), 0);
}
input_shapes.push_back({1, static_cast<int64_t>(encoded_input.token_type_ids.size())});
}
input_tensors.push_back(Ort::Value::CreateTensor<int64_t>(memory_info, encoded_input.input_ids.data(), encoded_input.input_ids.size(), input_shapes[0].data(), input_shapes[0].size()));