mirror of
https://github.com/typesense/typesense.git
synced 2025-05-18 20:52:50 +08:00
Merge pull request #959 from ozanarmagan/v0.25-join
Hybrid & semantic search improvements
This commit is contained in:
commit
0a0a2ed272
@ -120,6 +120,8 @@ private:
|
||||
|
||||
tsl::htrie_map<char, field> nested_fields;
|
||||
|
||||
tsl::htrie_map<char, field> embedding_fields;
|
||||
|
||||
bool enable_nested_fields;
|
||||
|
||||
std::vector<char> symbols_to_index;
|
||||
@ -160,6 +162,8 @@ private:
|
||||
|
||||
void remove_document(const nlohmann::json & document, const uint32_t seq_id, bool remove_from_store);
|
||||
|
||||
void process_remove_field_for_embedding_fields(const field& the_field, std::vector<field>& garbage_fields);
|
||||
|
||||
void curate_results(string& actual_query, const string& filter_query, bool enable_overrides, bool already_segmented,
|
||||
const std::map<size_t, std::vector<std::string>>& pinned_hits,
|
||||
const std::vector<std::string>& hidden_hits,
|
||||
@ -205,6 +209,7 @@ private:
|
||||
std::vector<sort_by>& sort_fields_std,
|
||||
bool is_wildcard_query, bool is_group_by_query = false) const;
|
||||
|
||||
|
||||
Option<bool> persist_collection_meta();
|
||||
|
||||
Option<bool> batch_alter_data(const std::vector<field>& alter_fields,
|
||||
@ -342,6 +347,8 @@ public:
|
||||
|
||||
tsl::htrie_map<char, field> get_nested_fields();
|
||||
|
||||
tsl::htrie_map<char, field> get_embedding_fields();
|
||||
|
||||
std::string get_default_sorting_field();
|
||||
|
||||
Option<doc_seq_id_t> to_doc(const std::string& json_str, nlohmann::json& document,
|
||||
@ -349,7 +356,6 @@ public:
|
||||
const DIRTY_VALUES dirty_values,
|
||||
const std::string& id="");
|
||||
|
||||
Option<bool> embed_fields(nlohmann::json& document);
|
||||
|
||||
static uint32_t get_seq_id_from_key(const std::string & key);
|
||||
|
||||
|
@ -50,7 +50,7 @@ namespace fields {
|
||||
static const std::string num_dim = "num_dim";
|
||||
static const std::string vec_dist = "vec_dist";
|
||||
static const std::string reference = "reference";
|
||||
static const std::string create_from = "create_from";
|
||||
static const std::string embed_from = "embed_from";
|
||||
static const std::string model_name = "model_name";
|
||||
}
|
||||
|
||||
@ -77,7 +77,7 @@ struct field {
|
||||
int nested_array;
|
||||
|
||||
size_t num_dim;
|
||||
std::vector<std::string> create_from;
|
||||
std::vector<std::string> embed_from;
|
||||
std::string model_name;
|
||||
vector_distance_type_t vec_dist;
|
||||
|
||||
@ -89,9 +89,9 @@ struct field {
|
||||
|
||||
field(const std::string &name, const std::string &type, const bool facet, const bool optional = false,
|
||||
bool index = true, std::string locale = "", int sort = -1, int infix = -1, bool nested = false,
|
||||
int nested_array = 0, size_t num_dim = 0, vector_distance_type_t vec_dist = cosine, std::string reference = "", const std::vector<std::string> &create_from = {}, const std::string& model_name = "") :
|
||||
int nested_array = 0, size_t num_dim = 0, vector_distance_type_t vec_dist = cosine, std::string reference = "", const std::vector<std::string> &embed_from = {}, const std::string& model_name = "") :
|
||||
name(name), type(type), facet(facet), optional(optional), index(index), locale(locale),
|
||||
nested(nested), nested_array(nested_array), num_dim(num_dim), vec_dist(vec_dist), reference(reference), create_from(create_from), model_name(model_name) {
|
||||
nested(nested), nested_array(nested_array), num_dim(num_dim), vec_dist(vec_dist), reference(reference), embed_from(embed_from), model_name(model_name) {
|
||||
|
||||
set_computed_defaults(sort, infix);
|
||||
}
|
||||
@ -319,8 +319,8 @@ struct field {
|
||||
if (!field.reference.empty()) {
|
||||
field_val[fields::reference] = field.reference;
|
||||
}
|
||||
if(!field.create_from.empty()) {
|
||||
field_val[fields::create_from] = field.create_from;
|
||||
if(!field.embed_from.empty()) {
|
||||
field_val[fields::embed_from] = field.embed_from;
|
||||
if(!field.model_name.empty()) {
|
||||
field_val[fields::model_name] = field.model_name;
|
||||
}
|
||||
@ -421,36 +421,36 @@ struct field {
|
||||
|
||||
for(nlohmann::json & field_json: fields_json) {
|
||||
|
||||
if(field_json.count(fields::create_from) != 0) {
|
||||
if(field_json.count(fields::embed_from) != 0) {
|
||||
if(TextEmbedderManager::model_dir.empty()) {
|
||||
return Option<bool>(400, "Text embedding is not enabled. Please set `model-dir` at startup.");
|
||||
}
|
||||
|
||||
if(!field_json[fields::create_from].is_array()) {
|
||||
return Option<bool>(400, "Property `" + fields::create_from + "` must be an array.");
|
||||
if(!field_json[fields::embed_from].is_array()) {
|
||||
return Option<bool>(400, "Property `" + fields::embed_from + "` must be an array.");
|
||||
}
|
||||
|
||||
if(field_json[fields::create_from].empty()) {
|
||||
return Option<bool>(400, "Property `" + fields::create_from + "` must have at least one element.");
|
||||
if(field_json[fields::embed_from].empty()) {
|
||||
return Option<bool>(400, "Property `" + fields::embed_from + "` must have at least one element.");
|
||||
}
|
||||
|
||||
for(auto& create_from_field : field_json[fields::create_from]) {
|
||||
if(!create_from_field.is_string()) {
|
||||
return Option<bool>(400, "Property `" + fields::create_from + "` must be an array of strings.");
|
||||
for(auto& embed_from_field : field_json[fields::embed_from]) {
|
||||
if(!embed_from_field.is_string()) {
|
||||
return Option<bool>(400, "Property `" + fields::embed_from + "` must contain only field names as strings.");
|
||||
}
|
||||
}
|
||||
|
||||
if(field_json[fields::type] != field_types::FLOAT_ARRAY) {
|
||||
return Option<bool>(400, "Property `" + fields::create_from + "` is only allowed on a float array field.");
|
||||
return Option<bool>(400, "Property `" + fields::embed_from + "` is only allowed on a float array field.");
|
||||
}
|
||||
|
||||
|
||||
for(auto& create_from_field : field_json[fields::create_from]) {
|
||||
for(auto& embed_from_field : field_json[fields::embed_from]) {
|
||||
bool flag = false;
|
||||
for(const auto& field : fields_json) {
|
||||
if(field[fields::name] == create_from_field) {
|
||||
if(field[fields::type] != field_types::STRING) {
|
||||
return Option<bool>(400, "Property `" + fields::create_from + "` can only be used with array of string fields.");
|
||||
if(field[fields::name] == embed_from_field) {
|
||||
if(field[fields::type] != field_types::STRING && field[fields::type] != field_types::STRING_ARRAY) {
|
||||
return Option<bool>(400, "Property `" + fields::embed_from + "` can only refer to string or string array fields.");
|
||||
}
|
||||
flag = true;
|
||||
break;
|
||||
@ -458,9 +458,9 @@ struct field {
|
||||
}
|
||||
if(!flag) {
|
||||
for(const auto& field : the_fields) {
|
||||
if(field.name == create_from_field) {
|
||||
if(field.type != field_types::STRING) {
|
||||
return Option<bool>(400, "Property `" + fields::create_from + "` can only be used with array of string fields.");
|
||||
if(field.name == embed_from_field) {
|
||||
if(field.type != field_types::STRING && field.type != field_types::STRING_ARRAY) {
|
||||
return Option<bool>(400, "Property `" + fields::embed_from + "` can only refer to string or string array fields.");
|
||||
}
|
||||
flag = true;
|
||||
break;
|
||||
@ -468,7 +468,7 @@ struct field {
|
||||
}
|
||||
}
|
||||
if(!flag) {
|
||||
return Option<bool>(400, "Property `" + fields::create_from + "` can only be used with array of string fields.");
|
||||
return Option<bool>(400, "Property `" + fields::embed_from + "` can only refer to string or string array fields.");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -532,6 +532,12 @@ private:
|
||||
const std::string& token, uint32_t seq_id);
|
||||
|
||||
void initialize_facet_indexes(const field& facet_field);
|
||||
|
||||
|
||||
|
||||
static Option<bool> embed_fields(nlohmann::json& document,
|
||||
const tsl::htrie_map<char, field>& embedding_fields,
|
||||
const tsl::htrie_map<char, field> & search_schema);
|
||||
|
||||
public:
|
||||
// for limiting number of results on multiple candidates / query rewrites
|
||||
@ -663,6 +669,7 @@ public:
|
||||
const size_t batch_start_index, const size_t batch_size,
|
||||
const std::string & default_sorting_field,
|
||||
const tsl::htrie_map<char, field> & search_schema,
|
||||
const tsl::htrie_map<char, field> & embedding_fields,
|
||||
const std::string& fallback_field_type,
|
||||
const std::vector<char>& token_separators,
|
||||
const std::vector<char>& symbols_to_index,
|
||||
@ -672,6 +679,7 @@ public:
|
||||
std::vector<index_record>& iter_batch,
|
||||
const std::string& default_sorting_field,
|
||||
const tsl::htrie_map<char, field>& search_schema,
|
||||
const tsl::htrie_map<char, field> & embedding_fields,
|
||||
const std::string& fallback_field_type,
|
||||
const std::vector<char>& token_separators,
|
||||
const std::vector<char>& symbols_to_index,
|
||||
|
@ -1,6 +1,6 @@
|
||||
#pragma once
|
||||
|
||||
|
||||
#include <filesystem>
|
||||
#include <unordered_map>
|
||||
#include <openssl/md5.h>
|
||||
#include <fstream>
|
||||
@ -43,6 +43,10 @@ public:
|
||||
}
|
||||
|
||||
// Records `dir` as the directory used for text embedding models.
// If nothing exists at `dir` yet, the directory is created first.
static void set_model_dir(const std::string& dir) {
    const bool already_present = std::filesystem::exists(dir);
    if(!already_present) {
        std::filesystem::create_directories(dir);
    }

    model_dir = dir;
}
|
||||
|
||||
|
@ -27,6 +27,7 @@ public:
|
||||
static Option<uint32_t> validate_index_in_memory(nlohmann::json &document, uint32_t seq_id,
|
||||
const std::string & default_sorting_field,
|
||||
const tsl::htrie_map<char, field> & search_schema,
|
||||
const tsl::htrie_map<char, field> & embedding_fields,
|
||||
const index_operation_t op,
|
||||
const bool is_update,
|
||||
const std::string& fallback_field_type,
|
||||
@ -67,4 +68,9 @@ public:
|
||||
nlohmann::json::iterator& array_iter,
|
||||
bool is_array, bool& array_ele_erased);
|
||||
|
||||
static Option<bool> validate_embed_fields(const nlohmann::json& document,
|
||||
const tsl::htrie_map<char, field>& embedding_fields,
|
||||
const tsl::htrie_map<char, field> & search_schema,
|
||||
const bool& error_if_field_not_found);
|
||||
|
||||
};
|
@ -10,6 +10,7 @@ struct vector_query_t {
|
||||
std::string field_name;
|
||||
size_t k = 0;
|
||||
size_t flat_search_cutoff = 0;
|
||||
float distance_threshold = 2.01;
|
||||
std::vector<float> values;
|
||||
|
||||
uint32_t seq_id = 0;
|
||||
@ -19,6 +20,7 @@ struct vector_query_t {
|
||||
// used for testing only
|
||||
field_name.clear();
|
||||
k = 0;
|
||||
distance_threshold = 2.01;
|
||||
values.clear();
|
||||
seq_id = 0;
|
||||
query_doc_given = false;
|
||||
|
@ -51,6 +51,12 @@ Collection::Collection(const std::string& name, const uint32_t collection_id, co
|
||||
symbols_to_index(to_char_array(symbols_to_index)), token_separators(to_char_array(token_separators)),
|
||||
index(init_index()) {
|
||||
|
||||
for (auto const& field: fields) {
|
||||
if (!field.embed_from.empty()) {
|
||||
embedding_fields.emplace(field.name, field);
|
||||
}
|
||||
}
|
||||
|
||||
this->num_documents = 0;
|
||||
}
|
||||
|
||||
@ -72,10 +78,6 @@ Option<doc_seq_id_t> Collection::to_doc(const std::string & json_str, nlohmann::
|
||||
const std::string& id) {
|
||||
try {
|
||||
document = nlohmann::json::parse(json_str);
|
||||
auto embed_res = embed_fields(document);
|
||||
if (!embed_res.ok()) {
|
||||
return Option<doc_seq_id_t>(400, embed_res.error());
|
||||
}
|
||||
} catch(const std::exception& e) {
|
||||
LOG(ERROR) << "JSON error: " << e.what();
|
||||
return Option<doc_seq_id_t>(400, std::string("Bad JSON: ") + e.what());
|
||||
@ -106,6 +108,7 @@ Option<doc_seq_id_t> Collection::to_doc(const std::string & json_str, nlohmann::
|
||||
uint32_t seq_id = get_next_seq_id();
|
||||
document["id"] = std::to_string(seq_id);
|
||||
|
||||
|
||||
// Add reference helper fields in the document.
|
||||
for (auto const& pair: reference_fields) {
|
||||
auto field_name = pair.first;
|
||||
@ -176,9 +179,12 @@ Option<doc_seq_id_t> Collection::to_doc(const std::string & json_str, nlohmann::
|
||||
if(operation == CREATE) {
|
||||
return Option<doc_seq_id_t>(409, std::string("A document with id ") + doc_id + " already exists.");
|
||||
}
|
||||
|
||||
|
||||
|
||||
// UPSERT, EMPLACE or UPDATE
|
||||
uint32_t seq_id = (uint32_t) std::stoul(seq_id_str);
|
||||
|
||||
return Option<doc_seq_id_t>(doc_seq_id_t{seq_id, false});
|
||||
|
||||
} else {
|
||||
@ -188,6 +194,7 @@ Option<doc_seq_id_t> Collection::to_doc(const std::string & json_str, nlohmann::
|
||||
} else {
|
||||
// for UPSERT, EMPLACE or CREATE, if a document with given ID is not found, we will treat it as a new doc
|
||||
uint32_t seq_id = get_next_seq_id();
|
||||
|
||||
return Option<doc_seq_id_t>(doc_seq_id_t{seq_id, true});
|
||||
}
|
||||
}
|
||||
@ -233,8 +240,8 @@ nlohmann::json Collection::get_summary_json() const {
|
||||
field_json[fields::infix] = coll_field.infix;
|
||||
field_json[fields::locale] = coll_field.locale;
|
||||
|
||||
if(coll_field.create_from.size() > 0) {
|
||||
field_json[fields::create_from] = coll_field.create_from;
|
||||
if(!coll_field.embed_from.empty()) {
|
||||
field_json[fields::embed_from] = coll_field.embed_from;
|
||||
}
|
||||
|
||||
if(coll_field.model_name.size() > 0) {
|
||||
@ -298,7 +305,6 @@ nlohmann::json Collection::add_many(std::vector<std::string>& json_lines, nlohma
|
||||
const std::string & json_line = json_lines[i];
|
||||
Option<doc_seq_id_t> doc_seq_id_op = to_doc(json_line, document, operation, dirty_values, id);
|
||||
|
||||
|
||||
const uint32_t seq_id = doc_seq_id_op.ok() ? doc_seq_id_op.get().seq_id : 0;
|
||||
index_record record(i, seq_id, document, operation, dirty_values);
|
||||
|
||||
@ -372,6 +378,7 @@ nlohmann::json Collection::add_many(std::vector<std::string>& json_lines, nlohma
|
||||
|
||||
do_batched_index:
|
||||
|
||||
|
||||
if((i+1) % index_batch_size == 0 || i == json_lines.size()-1 || repeated_doc) {
|
||||
batch_index(index_records, json_lines, num_indexed, return_doc, return_id);
|
||||
|
||||
@ -568,7 +575,7 @@ Option<uint32_t> Collection::index_in_memory(nlohmann::json &document, uint32_t
|
||||
std::unique_lock lock(mutex);
|
||||
|
||||
Option<uint32_t> validation_op = validator_t::validate_index_in_memory(document, seq_id, default_sorting_field,
|
||||
search_schema, op, false,
|
||||
search_schema, embedding_fields, op, false,
|
||||
fallback_field_type, dirty_values);
|
||||
|
||||
if(!validation_op.ok()) {
|
||||
@ -579,7 +586,7 @@ Option<uint32_t> Collection::index_in_memory(nlohmann::json &document, uint32_t
|
||||
|
||||
std::vector<index_record> index_batch;
|
||||
index_batch.emplace_back(std::move(rec));
|
||||
Index::batch_memory_index(index, index_batch, default_sorting_field, search_schema,
|
||||
Index::batch_memory_index(index, index_batch, default_sorting_field, search_schema, embedding_fields,
|
||||
fallback_field_type, token_separators, symbols_to_index, true);
|
||||
|
||||
num_documents += 1;
|
||||
@ -589,7 +596,7 @@ Option<uint32_t> Collection::index_in_memory(nlohmann::json &document, uint32_t
|
||||
size_t Collection::batch_index_in_memory(std::vector<index_record>& index_records) {
|
||||
std::unique_lock lock(mutex);
|
||||
size_t num_indexed = Index::batch_memory_index(index, index_records, default_sorting_field,
|
||||
search_schema, fallback_field_type,
|
||||
search_schema, embedding_fields, fallback_field_type,
|
||||
token_separators, symbols_to_index, true);
|
||||
num_documents += num_indexed;
|
||||
return num_indexed;
|
||||
@ -989,7 +996,7 @@ Option<bool> Collection::extract_field_name(const std::string& field_name,
|
||||
for(auto kv = prefix_it.first; kv != prefix_it.second; ++kv) {
|
||||
bool exact_key_match = (kv.key().size() == field_name.size());
|
||||
bool exact_primitive_match = exact_key_match && !kv.value().is_object();
|
||||
bool text_embedding = kv.value().type == field_types::FLOAT_ARRAY && kv.value().create_from.size() > 0;
|
||||
bool text_embedding = kv.value().type == field_types::FLOAT_ARRAY && !kv.value().embed_from.empty();
|
||||
|
||||
if(extract_only_string_fields && !kv.value().is_string() && !text_embedding) {
|
||||
if(exact_primitive_match && !is_wildcard) {
|
||||
@ -1121,10 +1128,6 @@ Option<nlohmann::json> Collection::search(std::string raw_query,
|
||||
|
||||
vector_query_t vector_query;
|
||||
if(!vector_query_str.empty()) {
|
||||
if(raw_query != "*") {
|
||||
return Option<nlohmann::json>(400, "Vector query is supported only on wildcard (q=*) searches.");
|
||||
}
|
||||
|
||||
auto parse_vector_op = VectorQueryOps::parse_vector_query_str(vector_query_str, vector_query, this);
|
||||
if(!parse_vector_op.ok()) {
|
||||
return Option<nlohmann::json>(400, parse_vector_op.error());
|
||||
@ -3457,6 +3460,11 @@ tsl::htrie_map<char, field> Collection::get_nested_fields() {
|
||||
return nested_fields;
|
||||
};
|
||||
|
||||
// Returns a copy of the collection's embedding fields.
// The copy is made while a shared lock on `mutex` is held.
tsl::htrie_map<char, field> Collection::get_embedding_fields() {
    std::shared_lock read_guard(mutex);
    return embedding_fields;
}
|
||||
|
||||
std::string Collection::get_meta_key(const std::string & collection_name) {
|
||||
return std::string(COLLECTION_META_PREFIX) + "_" + collection_name;
|
||||
}
|
||||
@ -3709,7 +3717,7 @@ Option<bool> Collection::batch_alter_data(const std::vector<field>& alter_fields
|
||||
}
|
||||
}
|
||||
|
||||
Index::batch_memory_index(index, iter_batch, default_sorting_field, schema_additions,
|
||||
Index::batch_memory_index(index, iter_batch, default_sorting_field, schema_additions, embedding_fields,
|
||||
fallback_field_type, token_separators, symbols_to_index, true);
|
||||
|
||||
iter_batch.clear();
|
||||
@ -3728,7 +3736,7 @@ Option<bool> Collection::batch_alter_data(const std::vector<field>& alter_fields
|
||||
}
|
||||
|
||||
LOG(INFO) << "Finished altering " << num_found_docs << " document(s).";
|
||||
|
||||
std::vector<field> garbage_embedding_fields_vec;
|
||||
for(auto& del_field: del_fields) {
|
||||
search_schema.erase(del_field.name);
|
||||
auto new_end = std::remove_if(fields.begin(), fields.end(), [&del_field](const field& f) {
|
||||
@ -3745,6 +3753,10 @@ Option<bool> Collection::batch_alter_data(const std::vector<field>& alter_fields
|
||||
nested_fields.erase(del_field.name);
|
||||
}
|
||||
|
||||
if(!del_field.embed_from.empty()) {
|
||||
embedding_fields.erase(del_field.name);
|
||||
}
|
||||
|
||||
if(del_field.name == ".*") {
|
||||
fallback_field_type = "";
|
||||
}
|
||||
@ -3752,9 +3764,12 @@ Option<bool> Collection::batch_alter_data(const std::vector<field>& alter_fields
|
||||
if(del_field.name == default_sorting_field) {
|
||||
default_sorting_field = "";
|
||||
}
|
||||
|
||||
process_remove_field_for_embedding_fields(del_field, garbage_embedding_fields_vec);
|
||||
}
|
||||
|
||||
index->refresh_schemas({}, del_fields);
|
||||
index->refresh_schemas({}, garbage_embedding_fields_vec);
|
||||
|
||||
auto persist_op = persist_collection_meta();
|
||||
if(!persist_op.ok()) {
|
||||
@ -3980,6 +3995,7 @@ Option<bool> Collection::validate_alter_payload(nlohmann::json& schema_changes,
|
||||
std::vector<field> diff_fields;
|
||||
tsl::htrie_map<char, field> updated_search_schema = search_schema;
|
||||
tsl::htrie_map<char, field> updated_nested_fields = nested_fields;
|
||||
tsl::htrie_map<char, field> updated_embedding_fields = embedding_fields;
|
||||
size_t num_auto_detect_fields = 0;
|
||||
|
||||
// since fields can be deleted and added in the same change set,
|
||||
@ -4036,10 +4052,18 @@ Option<bool> Collection::validate_alter_payload(nlohmann::json& schema_changes,
|
||||
return Option<bool>(400, "Field `" + field_name + "` is not part of collection schema.");
|
||||
}
|
||||
|
||||
if(found_field && !field_it.value().embed_from.empty()) {
|
||||
updated_embedding_fields.erase(field_it.key());
|
||||
}
|
||||
|
||||
if(found_field) {
|
||||
del_fields.push_back(field_it.value());
|
||||
updated_search_schema.erase(field_it.key());
|
||||
updated_nested_fields.erase(field_it.key());
|
||||
|
||||
if(!field_it.value().embed_from.empty()) {
|
||||
updated_embedding_fields.erase(field_it.key());
|
||||
}
|
||||
|
||||
// should also remove children if the field being dropped is an object
|
||||
if(field_it.value().nested && enable_nested_fields) {
|
||||
@ -4050,6 +4074,10 @@ Option<bool> Collection::validate_alter_payload(nlohmann::json& schema_changes,
|
||||
del_fields.push_back(prefix_kv.value());
|
||||
updated_search_schema.erase(prefix_kv.key());
|
||||
updated_nested_fields.erase(prefix_kv.key());
|
||||
|
||||
if(!prefix_kv.value().embed_from.empty()) {
|
||||
updated_embedding_fields.erase(prefix_kv.key());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -4100,6 +4128,10 @@ Option<bool> Collection::validate_alter_payload(nlohmann::json& schema_changes,
|
||||
addition_fields.push_back(f);
|
||||
}
|
||||
|
||||
if(!f.embed_from.empty()) {
|
||||
return Option<bool>(400, "Embedding fields can only be added at the time of collection creation.");
|
||||
}
|
||||
|
||||
if(f.nested && enable_nested_fields) {
|
||||
updated_nested_fields.emplace(f.name, f);
|
||||
|
||||
@ -4111,6 +4143,10 @@ Option<bool> Collection::validate_alter_payload(nlohmann::json& schema_changes,
|
||||
updated_search_schema.emplace(prefix_kv.key(), prefix_kv.value());
|
||||
updated_nested_fields.emplace(prefix_kv.key(), prefix_kv.value());
|
||||
|
||||
if(!prefix_kv.value().embed_from.empty()) {
|
||||
return Option<bool>(400, "Embedding fields can only be added at the time of collection creation.");
|
||||
}
|
||||
|
||||
if(is_reindex) {
|
||||
reindex_fields.push_back(prefix_kv.value());
|
||||
} else {
|
||||
@ -4181,6 +4217,7 @@ Option<bool> Collection::validate_alter_payload(nlohmann::json& schema_changes,
|
||||
// validate existing data on disk for compatibility via updated_search_schema
|
||||
auto validate_op = validator_t::validate_index_in_memory(document, seq_id, default_sorting_field,
|
||||
updated_search_schema,
|
||||
updated_embedding_fields,
|
||||
index_operation_t::CREATE,
|
||||
false,
|
||||
fallback_field_type,
|
||||
@ -4431,6 +4468,10 @@ Index* Collection::init_index() {
|
||||
nested_fields.emplace(field.name, field);
|
||||
}
|
||||
|
||||
if(!field.embed_from.empty()) {
|
||||
embedding_fields.emplace(field.name, field);
|
||||
}
|
||||
|
||||
if(!field.reference.empty()) {
|
||||
auto dot_index = field.reference.find('.');
|
||||
auto collection_name = field.reference.substr(0, dot_index);
|
||||
@ -4701,32 +4742,22 @@ Option<bool> Collection::populate_include_exclude_fields_lk(const spp::sparse_ha
|
||||
return populate_include_exclude_fields(include_fields, exclude_fields, include_fields_full, exclude_fields_full);
|
||||
}
|
||||
|
||||
// Removes the dropped field from embed_from of all embedding fields.
|
||||
void Collection::process_remove_field_for_embedding_fields(const field& the_field, std::vector<field>& garbage_fields) {
|
||||
for(auto& field : fields) {
|
||||
if(field.embed_from.empty()) {
|
||||
continue;
|
||||
}
|
||||
field.embed_from.erase(std::remove_if(field.embed_from.begin(), field.embed_from.end(), [&the_field](std::string field_name) {
|
||||
return the_field.name == field_name;
|
||||
}));
|
||||
embedding_fields[field.name] = field;
|
||||
|
||||
Option<bool> Collection::embed_fields(nlohmann::json& document) {
|
||||
for(const auto& field : fields) {
|
||||
if(field.create_from.size() > 0) {
|
||||
if(TextEmbedderManager::model_dir.empty()) {
|
||||
return Option<bool>(400, "Text embedding is not enabled. Please set `model-dir` at startup.");
|
||||
}
|
||||
std::string text_to_embed;
|
||||
for(const auto& field_name : field.create_from) {
|
||||
auto field_it = document.find(field_name);
|
||||
if(field_it != document.end()) {
|
||||
if(field_it->is_string()) {
|
||||
text_to_embed += field_it->get<std::string>() + " ";
|
||||
} else {
|
||||
return Option<bool>(400, "Field `" + field_name + "` is not a string.");
|
||||
}
|
||||
} else {
|
||||
return Option<bool>(400, "Field `" + field_name + "` not found in document.");
|
||||
}
|
||||
}
|
||||
|
||||
TextEmbedderManager& embedder_manager = TextEmbedderManager::get_instance();
|
||||
auto embedder = embedder_manager.get_text_embedder(field.model_name.size() > 0 ? field.model_name : TextEmbedderManager::DEFAULT_MODEL_NAME);
|
||||
std::vector<float> embedding = embedder->Embed(text_to_embed);
|
||||
document[field.name] = embedding;
|
||||
// mark this embedding field as "garbage" if it has no more embed_from fields
|
||||
if(field.embed_from.empty()) {
|
||||
embedding_fields.erase(field.name);
|
||||
garbage_fields.push_back(field);
|
||||
}
|
||||
}
|
||||
return Option<bool>(true);
|
||||
}
|
||||
|
||||
}
|
@ -58,8 +58,8 @@ Collection* CollectionManager::init_collection(const nlohmann::json & collection
|
||||
field_obj[fields::reference] = "";
|
||||
}
|
||||
|
||||
if(field_obj.count(fields::create_from) == 0) {
|
||||
field_obj[fields::create_from] = std::vector<std::string>();
|
||||
if(field_obj.count(fields::embed_from) == 0) {
|
||||
field_obj[fields::embed_from] = std::vector<std::string>();
|
||||
}
|
||||
|
||||
if(field_obj.count(fields::model_name) == 0) {
|
||||
@ -78,7 +78,7 @@ Collection* CollectionManager::init_collection(const nlohmann::json & collection
|
||||
field f(field_obj[fields::name], field_obj[fields::type], field_obj[fields::facet],
|
||||
field_obj[fields::optional], field_obj[fields::index], field_obj[fields::locale],
|
||||
-1, field_obj[fields::infix], field_obj[fields::nested], field_obj[fields::nested_array],
|
||||
field_obj[fields::num_dim], vec_dist_type, field_obj[fields::reference], field_obj[fields::create_from],
|
||||
field_obj[fields::num_dim], vec_dist_type, field_obj[fields::reference], field_obj[fields::embed_from],
|
||||
field_obj[fields::model_name]);
|
||||
|
||||
// value of `sort` depends on field type
|
||||
|
@ -672,11 +672,11 @@ Option<bool> field::json_field_to_field(bool enable_nested_fields, nlohmann::jso
|
||||
}
|
||||
}
|
||||
|
||||
if(field_json.count(fields::model_name) > 0 && field_json.count(fields::create_from) == 0) {
|
||||
return Option<bool>(400, "Property `" + fields::model_name + "` can only be used with `" + fields::create_from + "`.");
|
||||
if(field_json.count(fields::model_name) > 0 && field_json.count(fields::embed_from) == 0) {
|
||||
return Option<bool>(400, "Property `" + fields::model_name + "` can only be used with `" + fields::embed_from + "`.");
|
||||
}
|
||||
|
||||
if(field_json.count(fields::create_from) != 0) {
|
||||
if(field_json.count(fields::embed_from) != 0) {
|
||||
// If the model path is not specified, use the default model and set the number of dimensions to 384 (number of dimensions of the default model)
|
||||
field_json[fields::num_dim] = static_cast<unsigned int>(384);
|
||||
if(field_json.count(fields::model_name) != 0) {
|
||||
@ -695,7 +695,7 @@ Option<bool> field::json_field_to_field(bool enable_nested_fields, nlohmann::jso
|
||||
}
|
||||
}
|
||||
} else {
|
||||
field_json[fields::create_from] = std::vector<std::string>();
|
||||
field_json[fields::embed_from] = std::vector<std::string>();
|
||||
}
|
||||
|
||||
|
||||
@ -784,7 +784,7 @@ Option<bool> field::json_field_to_field(bool enable_nested_fields, nlohmann::jso
|
||||
field_json[fields::optional], field_json[fields::index], field_json[fields::locale],
|
||||
field_json[fields::sort], field_json[fields::infix], field_json[fields::nested],
|
||||
field_json[fields::nested_array], field_json[fields::num_dim], vec_dist,
|
||||
field_json[fields::reference], field_json[fields::create_from].get<std::vector<std::string>>(),
|
||||
field_json[fields::reference], field_json[fields::embed_from].get<std::vector<std::string>>(),
|
||||
field_json[fields::model_name])
|
||||
);
|
||||
|
||||
|
@ -411,6 +411,7 @@ void Index::validate_and_preprocess(Index *index, std::vector<index_record>& ite
|
||||
const size_t batch_start_index, const size_t batch_size,
|
||||
const std::string& default_sorting_field,
|
||||
const tsl::htrie_map<char, field>& search_schema,
|
||||
const tsl::htrie_map<char, field>& embedding_fields,
|
||||
const std::string& fallback_field_type,
|
||||
const std::vector<char>& token_separators,
|
||||
const std::vector<char>& symbols_to_index,
|
||||
@ -435,6 +436,7 @@ void Index::validate_and_preprocess(Index *index, std::vector<index_record>& ite
|
||||
Option<uint32_t> validation_op = validator_t::validate_index_in_memory(index_rec.doc, index_rec.seq_id,
|
||||
default_sorting_field,
|
||||
search_schema,
|
||||
embedding_fields,
|
||||
index_rec.operation,
|
||||
index_rec.is_update,
|
||||
fallback_field_type,
|
||||
@ -451,6 +453,9 @@ void Index::validate_and_preprocess(Index *index, std::vector<index_record>& ite
|
||||
get_doc_changes(index_rec.operation, index_rec.doc, index_rec.old_doc, index_rec.new_doc,
|
||||
index_rec.del_doc);
|
||||
scrub_reindex_doc(search_schema, index_rec.doc, index_rec.del_doc, index_rec.old_doc);
|
||||
embed_fields(index_rec.new_doc, embedding_fields, search_schema);
|
||||
} else {
|
||||
embed_fields(index_rec.doc, embedding_fields, search_schema);
|
||||
}
|
||||
|
||||
compute_token_offsets_facets(index_rec, search_schema, token_separators, symbols_to_index);
|
||||
@ -485,6 +490,7 @@ void Index::validate_and_preprocess(Index *index, std::vector<index_record>& ite
|
||||
size_t Index::batch_memory_index(Index *index, std::vector<index_record>& iter_batch,
|
||||
const std::string & default_sorting_field,
|
||||
const tsl::htrie_map<char, field> & search_schema,
|
||||
const tsl::htrie_map<char, field> & embedding_fields,
|
||||
const std::string& fallback_field_type,
|
||||
const std::vector<char>& token_separators,
|
||||
const std::vector<char>& symbols_to_index,
|
||||
@ -518,7 +524,7 @@ size_t Index::batch_memory_index(Index *index, std::vector<index_record>& iter_b
|
||||
index->thread_pool->enqueue([&, batch_index, batch_len]() {
|
||||
write_log_index = local_write_log_index;
|
||||
validate_and_preprocess(index, iter_batch, batch_index, batch_len, default_sorting_field, search_schema,
|
||||
fallback_field_type, token_separators, symbols_to_index, do_validation);
|
||||
embedding_fields, fallback_field_type, token_separators, symbols_to_index, do_validation);
|
||||
|
||||
std::unique_lock<std::mutex> lock(m_process);
|
||||
num_processed++;
|
||||
@ -2881,6 +2887,10 @@ Option<bool> Index::search(std::vector<query_tokens_t>& field_query_tokens, cons
|
||||
|
||||
auto vec_dist_score = (field_vector_index->distance_type == cosine) ? std::abs(dist_label.first) :
|
||||
dist_label.first;
|
||||
|
||||
if(vec_dist_score > vector_query.distance_threshold) {
|
||||
continue;
|
||||
}
|
||||
|
||||
int64_t scores[3] = {0};
|
||||
scores[0] = -float_to_int64_t(vec_dist_score);
|
||||
@ -3101,9 +3111,12 @@ Option<bool> Index::search(std::vector<query_tokens_t>& field_query_tokens, cons
|
||||
|
||||
auto vec_dist_score = (field_vector_index->distance_type == cosine) ? std::abs(dist_label.first) :
|
||||
dist_label.first;
|
||||
|
||||
if(vec_dist_score > vector_query.distance_threshold) {
|
||||
continue;
|
||||
}
|
||||
vec_results.emplace_back(seq_id, vec_dist_score);
|
||||
}
|
||||
|
||||
std::sort(vec_results.begin(), vec_results.end(), [](const auto& a, const auto& b) {
|
||||
return a.second < b.second;
|
||||
});
|
||||
@ -6250,6 +6263,30 @@ bool Index::common_results_exist(std::vector<art_leaf*>& leaves, bool must_match
|
||||
return phrase_exists;
|
||||
}
|
||||
|
||||
// Computes the embedding of every embedding field and stores it in `document`.
//
// For each field in `embedding_fields`, the values of the fields listed in its `embed_from` are
// read from `document` and concatenated, each followed by a single space. The text is passed to
// the embedder selected by the field's `model_name` (TextEmbedderManager::DEFAULT_MODEL_NAME when
// `model_name` is empty), and the resulting vector is written to `document[field.name]`.
//
// A source field is skipped when it is unknown to `search_schema`, absent from `document`, or
// holds a non-string value. `document` is looked up without being modified, so a missing source
// field no longer inserts a null entry or throws.
//
// NOTE(review): when no source value is found the embedder still runs on an empty string.
// Confirm whether partial updates should skip re-embedding instead.
//
// @param document          JSON document that is read from and updated in place
// @param embedding_fields  fields whose value is derived from other fields
// @param search_schema     schema used to look up the type of each source field
// @return always Option<bool>(true)
Option<bool> Index::embed_fields(nlohmann::json& document,
                                 const tsl::htrie_map<char, field>& embedding_fields,
                                 const tsl::htrie_map<char, field> & search_schema) {
    for(const auto& field : embedding_fields) {
        std::string text_to_embed;

        for(const auto& field_name : field.embed_from) {
            const auto schema_it = search_schema.find(field_name);
            if(schema_it == search_schema.end()) {
                // source field is not part of the schema: nothing to read
                continue;
            }

            // find() instead of operator[]: the latter would insert a null value for a missing key
            const auto doc_it = document.find(field_name);
            if(doc_it == document.end()) {
                continue;
            }

            const auto& source_type = schema_it.value().type;

            if(source_type == field_types::STRING) {
                if(doc_it->is_string()) {
                    text_to_embed += doc_it->get<std::string>() + " ";
                }
            } else if(source_type == field_types::STRING_ARRAY) {
                for(const auto& val : *doc_it) {
                    if(val.is_string()) {
                        text_to_embed += val.get<std::string>() + " ";
                    }
                }
            }
        }

        TextEmbedderManager& embedder_manager = TextEmbedderManager::get_instance();
        auto embedder = embedder_manager.get_text_embedder(field.model_name.size() > 0 ? field.model_name : TextEmbedderManager::DEFAULT_MODEL_NAME);
        std::vector<float> embedding = embedder->Embed(text_to_embed);
        document[field.name] = embedding;
    }

    return Option<bool>(true);
}
|
||||
|
||||
/*
|
||||
// https://stackoverflow.com/questions/924171/geo-fencing-point-inside-outside-polygon
|
||||
// NOTE: polygon and point should have been transformed with `transform_for_180th_meridian`
|
||||
@ -6295,3 +6332,4 @@ void Index::transform_for_180th_meridian(GeoCoord &point, double offset) {
|
||||
point.lon = point.lon < 0.0 ? point.lon + offset : point.lon;
|
||||
}
|
||||
*/
|
||||
|
||||
|
@ -425,7 +425,7 @@ int run_server(const Config & config, const std::string & version, void (*master
|
||||
|
||||
if(config.get_model_dir().size() > 0) {
|
||||
LOG(INFO) << "Loading text embedding models from " << config.get_model_dir();
|
||||
TextEmbedderManager::model_dir = config.get_model_dir();
|
||||
TextEmbedderManager::set_model_dir(config.get_model_dir());
|
||||
TextEmbedderManager::download_default_model();
|
||||
}
|
||||
|
||||
|
@ -529,6 +529,7 @@ Option<uint32_t> validator_t::coerce_float(const DIRTY_VALUES& dirty_values, con
|
||||
Option<uint32_t> validator_t::validate_index_in_memory(nlohmann::json& document, uint32_t seq_id,
|
||||
const std::string & default_sorting_field,
|
||||
const tsl::htrie_map<char, field> & search_schema,
|
||||
const tsl::htrie_map<char, field> & embedding_fields,
|
||||
const index_operation_t op,
|
||||
const bool is_update,
|
||||
const std::string& fallback_field_type,
|
||||
@ -544,6 +545,11 @@ Option<uint32_t> validator_t::validate_index_in_memory(nlohmann::json& document,
|
||||
for(const auto& a_field: search_schema) {
|
||||
const std::string& field_name = a_field.name;
|
||||
|
||||
// ignore embedding fields, they will be validated later
|
||||
if(embedding_fields.count(field_name) > 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if(field_name == "id" || a_field.is_object()) {
|
||||
continue;
|
||||
}
|
||||
@ -574,5 +580,50 @@ Option<uint32_t> validator_t::validate_index_in_memory(nlohmann::json& document,
|
||||
}
|
||||
}
|
||||
|
||||
// validate embedding fields
|
||||
auto validate_embed_op = validate_embed_fields(document, embedding_fields, search_schema, !is_update);
|
||||
if(!validate_embed_op.ok()) {
|
||||
return Option<>(validate_embed_op.code(), validate_embed_op.error());
|
||||
}
|
||||
|
||||
return Option<>(200);
|
||||
}
|
||||
|
||||
|
||||
/**
 * Checks that `document` carries usable source values for every embedding field.
 *
 * Fails with a 400 error when:
 *  - embedding fields exist but `TextEmbedderManager::model_dir` is empty,
 *  - an `embed_from` entry does not exist in `search_schema`,
 *  - a source field is missing from the document and `error_if_field_not_found` is true,
 *  - a source value does not match its declared string / string array type, or an array
 *    source value contains a non-string element.
 *
 * @param document                  Document being validated (never modified).
 * @param embedding_fields          Fields whose values are generated from other fields.
 * @param search_schema             Schema used to look up the declared type of each source field.
 * @param error_if_field_not_found  When false, source fields missing from the document are skipped.
 * @return `Option<bool>(true)` when every check passes.
 */
Option<bool> validator_t::validate_embed_fields(const nlohmann::json& document,
                                                const tsl::htrie_map<char, field>& embedding_fields,
                                                const tsl::htrie_map<char, field> & search_schema,
                                                const bool& error_if_field_not_found) {
    const bool embedding_disabled = TextEmbedderManager::model_dir.empty();
    if(embedding_disabled && !embedding_fields.empty()) {
        return Option<bool>(400, "Text embedding is not enabled. Please set `model-dir` at startup.");
    }

    for(const auto& embedding_field : embedding_fields) {
        for(const auto& source_name : embedding_field.embed_from) {
            auto schema_it = search_schema.find(source_name);
            auto value_it = document.find(source_name);

            // the referenced source field must be part of the schema
            if(schema_it == search_schema.end()) {
                return Option<bool>(400, "Field `" + embedding_field.name + "` has invalid fields to create embeddings from.");
            }

            // a missing source value is only an error when the caller asks for it
            if(value_it == document.end()) {
                if(!error_if_field_not_found) {
                    continue;
                }

                return Option<bool>(400, "Field `" + source_name + "` is needed to create embedding.");
            }

            const auto& declared_type = schema_it.value().type;
            const auto& source_value = value_it.value();

            const bool bad_string = (declared_type == field_types::STRING) && !source_value.is_string();
            const bool bad_array = (declared_type == field_types::STRING_ARRAY) && !source_value.is_array();
            if(bad_string || bad_array) {
                return Option<bool>(400, "Field `" + source_name + "` has malformed data.");
            }

            if(!source_value.is_array()) {
                continue;
            }

            // every element of an array value has to be a string
            for(const auto& element : source_value) {
                if(!element.is_string()) {
                    return Option<bool>(400, "Field `" + source_name + "` has malformed data.");
                }
            }
        }
    }

    return Option<bool>(true);
}
|
@ -145,6 +145,15 @@ Option<bool> VectorQueryOps::parse_vector_query_str(std::string vector_query_str
|
||||
|
||||
vector_query.flat_search_cutoff = std::stoi(param_kv[1]);
|
||||
}
|
||||
|
||||
if(param_kv[0] == "distance_threshold") {
|
||||
if(!StringUtils::is_float(param_kv[1]) || std::stof(param_kv[1]) < 0.0 || std::stof(param_kv[1]) > 2.0) {
|
||||
return Option<bool>(400, "Malformed vector query string: "
|
||||
"`distance_threshold` parameter must be a float between 0.0-2.0.");
|
||||
}
|
||||
|
||||
vector_query.distance_threshold = std::stof(param_kv[1]);
|
||||
}
|
||||
}
|
||||
|
||||
if(!vector_query.query_doc_given && vector_query.values.empty()) {
|
||||
|
@ -1592,12 +1592,12 @@ TEST_F(CollectionAllFieldsTest, FieldNameMatchingRegexpShouldNotBeIndexedInNonAu
|
||||
ASSERT_EQ(1, results["hits"].size());
|
||||
}
|
||||
|
||||
TEST_F(CollectionAllFieldsTest, CreateFromFieldJSONInvalidField) {
|
||||
TEST_F(CollectionAllFieldsTest, EmbedFromFieldJSONInvalidField) {
|
||||
TextEmbedderManager::model_dir = "/tmp/models";
|
||||
nlohmann::json field_json;
|
||||
field_json["name"] = "embedding";
|
||||
field_json["type"] = "float[]";
|
||||
field_json["create_from"] = {"name"};
|
||||
field_json["embed_from"] = {"name"};
|
||||
|
||||
std::vector<field> fields;
|
||||
std::string fallback_field_type;
|
||||
@ -1607,15 +1607,15 @@ TEST_F(CollectionAllFieldsTest, CreateFromFieldJSONInvalidField) {
|
||||
auto field_op = field::json_fields_to_fields(false, arr, fallback_field_type, fields);
|
||||
|
||||
ASSERT_FALSE(field_op.ok());
|
||||
ASSERT_EQ("Property `create_from` can only be used with array of string fields.", field_op.error());
|
||||
ASSERT_EQ("Property `embed_from` can only refer to string or string array fields.", field_op.error());
|
||||
}
|
||||
|
||||
TEST_F(CollectionAllFieldsTest, CreateFromFieldNoModelDir) {
|
||||
TEST_F(CollectionAllFieldsTest, EmbedFromFieldNoModelDir) {
|
||||
TextEmbedderManager::model_dir = std::string();
|
||||
nlohmann::json field_json;
|
||||
field_json["name"] = "embedding";
|
||||
field_json["type"] = "float[]";
|
||||
field_json["create_from"] = {"name"};
|
||||
field_json["embed_from"] = {"name"};
|
||||
|
||||
std::vector<field> fields;
|
||||
std::string fallback_field_type;
|
||||
@ -1628,12 +1628,12 @@ TEST_F(CollectionAllFieldsTest, CreateFromFieldNoModelDir) {
|
||||
ASSERT_EQ("Text embedding is not enabled. Please set `model-dir` at startup.", field_op.error());
|
||||
}
|
||||
|
||||
TEST_F(CollectionAllFieldsTest, CreateFromNotArray) {
|
||||
TEST_F(CollectionAllFieldsTest, EmbedFromNotArray) {
|
||||
TextEmbedderManager::model_dir = "/tmp/models";
|
||||
nlohmann::json field_json;
|
||||
field_json["name"] = "embedding";
|
||||
field_json["type"] = "float[]";
|
||||
field_json["create_from"] = "name";
|
||||
field_json["embed_from"] = "name";
|
||||
|
||||
std::vector<field> fields;
|
||||
std::string fallback_field_type;
|
||||
@ -1643,10 +1643,10 @@ TEST_F(CollectionAllFieldsTest, CreateFromNotArray) {
|
||||
auto field_op = field::json_fields_to_fields(false, arr, fallback_field_type, fields);
|
||||
|
||||
ASSERT_FALSE(field_op.ok());
|
||||
ASSERT_EQ("Property `create_from` must be an array.", field_op.error());
|
||||
ASSERT_EQ("Property `embed_from` must be an array.", field_op.error());
|
||||
}
|
||||
|
||||
TEST_F(CollectionAllFieldsTest, ModelPathWithoutCreateFrom) {
|
||||
TEST_F(CollectionAllFieldsTest, ModelPathWithoutEmbedFrom) {
|
||||
TextEmbedderManager::model_dir = "/tmp/models";
|
||||
nlohmann::json field_json;
|
||||
field_json["name"] = "embedding";
|
||||
@ -1660,17 +1660,17 @@ TEST_F(CollectionAllFieldsTest, ModelPathWithoutCreateFrom) {
|
||||
|
||||
auto field_op = field::json_fields_to_fields(false, arr, fallback_field_type, fields);
|
||||
ASSERT_FALSE(field_op.ok());
|
||||
ASSERT_EQ("Property `model_name` can only be used with `create_from`.", field_op.error());
|
||||
ASSERT_EQ("Property `model_name` can only be used with `embed_from`.", field_op.error());
|
||||
}
|
||||
|
||||
|
||||
TEST_F(CollectionAllFieldsTest, CreateFromBasicValid) {
|
||||
TEST_F(CollectionAllFieldsTest, EmbedFromBasicValid) {
|
||||
|
||||
TextEmbedderManager::model_dir = "/tmp/typesense_test/models";
|
||||
TextEmbedderManager::download_default_model();
|
||||
|
||||
field embedding = field("embedding", field_types::FLOAT_ARRAY, false);
|
||||
embedding.create_from.push_back("name");
|
||||
embedding.embed_from.push_back("name");
|
||||
std::vector<field> fields = {field("name", field_types::STRING, false),
|
||||
embedding};
|
||||
auto obj_coll_op = collectionManager.create_collection("obj_coll", 1, fields, "", 0, field_types::AUTO);
|
||||
@ -1690,3 +1690,21 @@ TEST_F(CollectionAllFieldsTest, CreateFromBasicValid) {
|
||||
|
||||
}
|
||||
|
||||
// An embedding field whose `embed_from` refers to a non-string field (`age`, int32) must be
// rejected when the field definitions are parsed.
TEST_F(CollectionAllFieldsTest, WrongDataTypeForEmbedFrom) {
    TextEmbedderManager::model_dir = "/tmp/models";

    nlohmann::json embedding_json;
    embedding_json["name"] = "embedding";
    embedding_json["type"] = "float[]";
    embedding_json["embed_from"] = {"age"};

    // Built as a separate object: reusing the embedding JSON would leave `embed_from` set on
    // the `age` field as well, so the error could be raised for the wrong field.
    nlohmann::json age_json;
    age_json["name"] = "age";
    age_json["type"] = "int32";

    auto arr = nlohmann::json::array();
    arr.push_back(embedding_json);
    arr.push_back(age_json);

    std::vector<field> fields;
    std::string fallback_field_type;
    auto field_op = field::json_fields_to_fields(false, arr, fallback_field_type, fields);

    ASSERT_FALSE(field_op.ok());
    ASSERT_EQ("Property `embed_from` can only refer to string or string array fields.", field_op.error());
}
|
@ -1446,3 +1446,116 @@ TEST_F(CollectionSchemaChangeTest, GeoFieldSchemaAddition) {
|
||||
ASSERT_TRUE(res_op.ok());
|
||||
ASSERT_EQ(2, res_op.get()["found"].get<size_t>());
|
||||
}
|
||||
|
||||
// Adding an embedding field through `alter` on an existing collection must be refused.
TEST_F(CollectionSchemaChangeTest, UpdateSchemaWithNewEmbeddingField) {
    nlohmann::json initial_schema = nlohmann::json::parse(R"({
        "name": "objects",
        "fields": [
            {"name": "names", "type": "string[]"}
        ]
    })");

    auto create_op = collectionManager.create_collection(initial_schema);
    ASSERT_TRUE(create_op.ok());
    Collection* collection = create_op.get();

    // schema change that tries to introduce an embedding field after creation
    nlohmann::json add_embedding_field = nlohmann::json::parse(R"({
        "fields": [
            {"name": "embedding", "type":"float[]", "embed_from": ["names"]}
        ]
    })");

    auto alter_op = collection->alter(add_embedding_field);

    ASSERT_FALSE(alter_op.ok());
    ASSERT_EQ("Embedding fields can only be added at the time of collection creation.", alter_op.error());
}
|
||||
|
||||
// Dropping the source fields of an embedding field one at a time: each drop removes the field
// from `embed_from`, and dropping the last source removes the embedding field and its vector
// index entry.
TEST_F(CollectionSchemaChangeTest, DropFieldUsedForEmbedding) {
    nlohmann::json collection_schema = nlohmann::json::parse(R"({
        "name": "objects",
        "fields": [
            {"name": "names", "type": "string[]"},
            {"name": "category", "type":"string"},
            {"name": "embedding", "type":"float[]", "embed_from": ["names","category"]}
        ]
    })");

    TextEmbedderManager::set_model_dir("/tmp/typesense_test/models");
    TextEmbedderManager::download_default_model();

    auto create_op = collectionManager.create_collection(collection_schema);
    ASSERT_TRUE(create_op.ok());
    Collection* collection = create_op.get();

    LOG(INFO) << "Created collection";

    // both source fields are registered right after creation
    auto fields_at_start = collection->get_embedding_fields();
    ASSERT_EQ(2, fields_at_start["embedding"].embed_from.size());

    // step 1: drop `names`, only `category` remains as a source
    nlohmann::json drop_names = nlohmann::json::parse(R"({
        "fields": [
            {"name": "names", "drop": true}
        ]
    })");

    auto drop_names_op = collection->alter(drop_names);
    ASSERT_TRUE(drop_names_op.ok());

    auto fields_after_first_drop = collection->get_embedding_fields();
    ASSERT_EQ(1, fields_after_first_drop["embedding"].embed_from.size());
    ASSERT_EQ("category", fields_after_first_drop["embedding"].embed_from[0]);

    // step 2: drop `category`, the embedding field has no source left
    nlohmann::json drop_category = nlohmann::json::parse(R"({
        "fields": [
            {"name": "category", "drop": true}
        ]
    })");

    auto drop_category_op = collection->alter(drop_category);
    ASSERT_TRUE(drop_category_op.ok());

    auto fields_after_second_drop = collection->get_embedding_fields();
    ASSERT_EQ(0, fields_after_second_drop.size());
    ASSERT_EQ(0, collection->_get_index()->_get_vector_index().size());
}
|
||||
|
||||
// The collection's embedding-field map lists the embedding field after creation and is empty
// again once that field has been dropped.
TEST_F(CollectionSchemaChangeTest, EmbeddingFieldsMapTest) {
    nlohmann::json collection_schema = nlohmann::json::parse(R"({
        "name": "objects",
        "fields": [
            {"name": "name", "type": "string"},
            {"name": "embedding", "type":"float[]", "embed_from": ["name"]}
        ]
    })");

    TextEmbedderManager::set_model_dir("/tmp/typesense_test/models");
    TextEmbedderManager::download_default_model();

    auto create_op = collectionManager.create_collection(collection_schema);
    ASSERT_TRUE(create_op.ok());
    Collection* collection = create_op.get();

    // exactly one entry, keyed by the embedding field's name
    auto fields_map = collection->get_embedding_fields();
    ASSERT_EQ(1, fields_map.size());

    auto entry = fields_map.find("embedding");
    ASSERT_TRUE(entry != fields_map.end());

    const auto& embedding_field = entry.value();
    ASSERT_EQ("embedding", embedding_field.name);
    ASSERT_EQ(1, embedding_field.embed_from.size());
    ASSERT_EQ("name", embedding_field.embed_from[0]);

    // drop the embedding field
    nlohmann::json drop_embedding = nlohmann::json::parse(R"({
        "fields": [
            {"name": "embedding", "drop": true}
        ]
    })");

    auto alter_op = collection->alter(drop_embedding);
    ASSERT_TRUE(alter_op.ok());

    auto fields_map_after_drop = collection->get_embedding_fields();
    ASSERT_EQ(0, fields_map_after_drop.size());
}
|
@ -4616,11 +4616,11 @@ TEST_F(CollectionTest, SemanticSearchTest) {
|
||||
"name": "objects",
|
||||
"fields": [
|
||||
{"name": "name", "type": "string"},
|
||||
{"name": "embedding", "type":"float[]", "create_from": ["name"]}
|
||||
{"name": "embedding", "type":"float[]", "embed_from": ["name"]}
|
||||
]
|
||||
})"_json;
|
||||
|
||||
TextEmbedderManager::model_dir = "/tmp/typesense_test/models";
|
||||
TextEmbedderManager::set_model_dir("/tmp/typesense_test/models");
|
||||
TextEmbedderManager::download_default_model();
|
||||
|
||||
auto op = collectionManager.create_collection(schema);
|
||||
@ -4651,11 +4651,11 @@ TEST_F(CollectionTest, InvalidSemanticSearch) {
|
||||
"name": "objects",
|
||||
"fields": [
|
||||
{"name": "name", "type": "string"},
|
||||
{"name": "embedding", "type":"float[]", "create_from": ["name"]}
|
||||
{"name": "embedding", "type":"float[]", "embed_from": ["name"]}
|
||||
]
|
||||
})"_json;
|
||||
|
||||
TextEmbedderManager::model_dir = "/tmp/typesense_test/models";
|
||||
TextEmbedderManager::set_model_dir("/tmp/typesense_test/models");
|
||||
TextEmbedderManager::download_default_model();
|
||||
|
||||
auto op = collectionManager.create_collection(schema);
|
||||
@ -4666,7 +4666,6 @@ TEST_F(CollectionTest, InvalidSemanticSearch) {
|
||||
object["name"] = "apple";
|
||||
auto add_op = coll->add(object.dump());
|
||||
ASSERT_TRUE(add_op.ok());
|
||||
LOG(INFO) << "add_op.get(): " << add_op.get().dump();
|
||||
ASSERT_EQ("apple", add_op.get()["name"]);
|
||||
ASSERT_EQ(384, add_op.get()["embedding"].size());
|
||||
|
||||
@ -4682,11 +4681,11 @@ TEST_F(CollectionTest, HybridSearch) {
|
||||
"name": "objects",
|
||||
"fields": [
|
||||
{"name": "name", "type": "string"},
|
||||
{"name": "embedding", "type":"float[]", "create_from": ["name"]}
|
||||
{"name": "embedding", "type":"float[]", "embed_from": ["name"]}
|
||||
]
|
||||
})"_json;
|
||||
|
||||
TextEmbedderManager::model_dir = "/tmp/typesense_test/models";
|
||||
TextEmbedderManager::set_model_dir("/tmp/typesense_test/models");
|
||||
TextEmbedderManager::download_default_model();
|
||||
|
||||
auto op = collectionManager.create_collection(schema);
|
||||
@ -4695,6 +4694,7 @@ TEST_F(CollectionTest, HybridSearch) {
|
||||
nlohmann::json object;
|
||||
object["name"] = "apple";
|
||||
auto add_op = coll->add(object.dump());
|
||||
LOG(INFO) << "add_op.error(): " << add_op.error();
|
||||
ASSERT_TRUE(add_op.ok());
|
||||
|
||||
ASSERT_EQ("apple", add_op.get()["name"]);
|
||||
@ -4710,44 +4710,44 @@ TEST_F(CollectionTest, HybridSearch) {
|
||||
ASSERT_EQ(384, search_res["hits"][0]["document"]["embedding"].size());
|
||||
}
|
||||
|
||||
TEST_F(CollectionTest, EmbedFielsTest) {
|
||||
nlohmann::json schema = R"({
|
||||
"name": "objects",
|
||||
"fields": [
|
||||
{"name": "name", "type": "string"},
|
||||
{"name": "embedding", "type":"float[]", "create_from": ["name"]}
|
||||
]
|
||||
})"_json;
|
||||
// TEST_F(CollectionTest, EmbedFielsTest) {
|
||||
// nlohmann::json schema = R"({
|
||||
// "name": "objects",
|
||||
// "fields": [
|
||||
// {"name": "name", "type": "string"},
|
||||
// {"name": "embedding", "type":"float[]", "embed_from": ["name"]}
|
||||
// ]
|
||||
// })"_json;
|
||||
|
||||
TextEmbedderManager::model_dir = "/tmp/typesense_test/models";
|
||||
TextEmbedderManager::download_default_model();
|
||||
// TextEmbedderManager::set_model_dir("/tmp/typesense_test/models");
|
||||
// TextEmbedderManager::download_default_model();
|
||||
|
||||
auto op = collectionManager.create_collection(schema);
|
||||
ASSERT_TRUE(op.ok());
|
||||
Collection* coll = op.get();
|
||||
// auto op = collectionManager.create_collection(schema);
|
||||
// ASSERT_TRUE(op.ok());
|
||||
// Collection* coll = op.get();
|
||||
|
||||
nlohmann::json object = R"({
|
||||
"name": "apple"
|
||||
})"_json;
|
||||
// nlohmann::json object = R"({
|
||||
// "name": "apple"
|
||||
// })"_json;
|
||||
|
||||
auto embed_op = coll->embed_fields(object);
|
||||
// auto embed_op = coll->embed_fields(object);
|
||||
|
||||
ASSERT_TRUE(embed_op.ok());
|
||||
// ASSERT_TRUE(embed_op.ok());
|
||||
|
||||
ASSERT_EQ("apple", object["name"]);
|
||||
ASSERT_EQ(384, object["embedding"].get<std::vector<float>>().size());
|
||||
}
|
||||
// ASSERT_EQ("apple", object["name"]);
|
||||
// ASSERT_EQ(384, object["embedding"].get<std::vector<float>>().size());
|
||||
// }
|
||||
|
||||
TEST_F(CollectionTest, HybridSearchRankFusionTest) {
|
||||
nlohmann::json schema = R"({
|
||||
"name": "objects",
|
||||
"fields": [
|
||||
{"name": "name", "type": "string"},
|
||||
{"name": "embedding", "type":"float[]", "create_from": ["name"]}
|
||||
{"name": "embedding", "type":"float[]", "embed_from": ["name"]}
|
||||
]
|
||||
})"_json;
|
||||
|
||||
TextEmbedderManager::model_dir = "/tmp/typesense_test/models";
|
||||
TextEmbedderManager::set_model_dir("/tmp/typesense_test/models");
|
||||
TextEmbedderManager::download_default_model();
|
||||
|
||||
auto op = collectionManager.create_collection(schema);
|
||||
@ -4813,15 +4813,15 @@ TEST_F(CollectionTest, HybridSearchRankFusionTest) {
|
||||
}
|
||||
|
||||
TEST_F(CollectionTest, WildcardSearchWithEmbeddingField) {
|
||||
nlohmann::json schema = R"({
|
||||
"name": "objects",
|
||||
"fields": [
|
||||
{"name": "name", "type": "string"},
|
||||
{"name": "embedding", "type":"float[]", "create_from": ["name"]}
|
||||
]
|
||||
})"_json;
|
||||
nlohmann::json schema = R"({
|
||||
"name": "objects",
|
||||
"fields": [
|
||||
{"name": "name", "type": "string"},
|
||||
{"name": "embedding", "type":"float[]", "embed_from": ["name"]}
|
||||
]
|
||||
})"_json;
|
||||
|
||||
TextEmbedderManager::model_dir = "/tmp/typesense_test/models";
|
||||
TextEmbedderManager::set_model_dir("/tmp/typesense_test/models");
|
||||
TextEmbedderManager::download_default_model();
|
||||
|
||||
auto op = collectionManager.create_collection(schema);
|
||||
@ -4833,4 +4833,163 @@ TEST_F(CollectionTest, WildcardSearchWithEmbeddingField) {
|
||||
|
||||
ASSERT_FALSE(search_res_op.ok());
|
||||
ASSERT_EQ("Wildcard query is not supported for embedding fields.", search_res_op.error());
|
||||
}
|
||||
}
|
||||
|
||||
// `TextEmbedderManager::set_model_dir` is expected to create the model directory when it does
// not exist yet.
TEST_F(CollectionTest, CreateModelDirIfNotExists) {
    const std::string model_dir = "/tmp/typesense_test/models";

    // Start from a clean slate. std::filesystem is used instead of shelling out so that a
    // failed cleanup is detected here rather than silently ignored.
    std::error_code cleanup_error;
    std::filesystem::remove_all(model_dir, cleanup_error);
    ASSERT_FALSE(cleanup_error);
    ASSERT_FALSE(std::filesystem::exists(model_dir));

    TextEmbedderManager::set_model_dir(model_dir);

    // check if model dir is created
    ASSERT_TRUE(std::filesystem::exists(model_dir));
}
|
||||
|
||||
|
||||
|
||||
// A document can be added when the embedding source is a `string[]` field.
TEST_F(CollectionTest, EmbedStringArrayField) {
    nlohmann::json collection_schema = nlohmann::json::parse(R"({
        "name": "objects",
        "fields": [
            {"name": "names", "type": "string[]"},
            {"name": "embedding", "type":"float[]", "embed_from": ["names"]}
        ]
    })");

    TextEmbedderManager::set_model_dir("/tmp/typesense_test/models");
    TextEmbedderManager::download_default_model();

    auto create_op = collectionManager.create_collection(collection_schema);
    ASSERT_TRUE(create_op.ok());
    Collection* collection = create_op.get();

    nlohmann::json document;
    document["names"] = nlohmann::json::array({"butter", "butterfly", "butterball"});

    auto insert_op = collection->add(document.dump());
    ASSERT_TRUE(insert_op.ok());
}
|
||||
|
||||
// Adding a document that lacks one of the embedding source fields (`category`) must fail,
// even though that source field is declared optional in the schema.
TEST_F(CollectionTest, MissingFieldForEmbedding) {
    nlohmann::json collection_schema = nlohmann::json::parse(R"({
        "name": "objects",
        "fields": [
            {"name": "names", "type": "string[]"},
            {"name": "category", "type": "string", "optional": true},
            {"name": "embedding", "type":"float[]", "embed_from": ["names", "category"]}
        ]
    })");

    TextEmbedderManager::set_model_dir("/tmp/typesense_test/models");
    TextEmbedderManager::download_default_model();

    auto create_op = collectionManager.create_collection(collection_schema);
    ASSERT_TRUE(create_op.ok());
    Collection* collection = create_op.get();

    // only `names` is provided; `category` is left out on purpose
    nlohmann::json document;
    document["names"] = nlohmann::json::array({"butter", "butterfly", "butterball"});

    auto insert_op = collection->add(document.dump());
    ASSERT_FALSE(insert_op.ok());
    ASSERT_EQ("Field `category` is needed to create embedding.", insert_op.error());
}
|
||||
|
||||
|
||||
// The embedding validator rejects a document whose `string` source field holds a number.
TEST_F(CollectionTest, WrongTypeForEmbedding) {
    nlohmann::json collection_schema = nlohmann::json::parse(R"({
        "name": "objects",
        "fields": [
            {"name": "category", "type": "string"},
            {"name": "embedding", "type":"float[]", "embed_from": ["category"]}
        ]
    })");

    TextEmbedderManager::set_model_dir("/tmp/typesense_test/models");
    TextEmbedderManager::download_default_model();

    auto create_op = collectionManager.create_collection(collection_schema);
    ASSERT_TRUE(create_op.ok());
    Collection* collection = create_op.get();

    // an integer where a string is declared
    nlohmann::json document;
    document["category"] = 1;

    auto validate_op = validator_t::validate_embed_fields(document, collection->get_embedding_fields(),
                                                          collection->get_schema(), true);
    ASSERT_FALSE(validate_op.ok());
    ASSERT_EQ("Field `category` has malformed data.", validate_op.error());
}
|
||||
|
||||
// The embedding validator rejects a `string[]` source field that contains a non-string element.
TEST_F(CollectionTest, WrongTypeOfElementForEmbeddingInStringArray) {
    nlohmann::json collection_schema = nlohmann::json::parse(R"({
        "name": "objects",
        "fields": [
            {"name": "category", "type": "string[]"},
            {"name": "embedding", "type":"float[]", "embed_from": ["category"]}
        ]
    })");

    TextEmbedderManager::set_model_dir("/tmp/typesense_test/models");
    TextEmbedderManager::download_default_model();

    auto create_op = collectionManager.create_collection(collection_schema);
    ASSERT_TRUE(create_op.ok());
    Collection* collection = create_op.get();

    // an array, as declared, but its only element is an integer
    nlohmann::json document;
    document["category"] = nlohmann::json::array({33});

    auto validate_op = validator_t::validate_embed_fields(document, collection->get_embedding_fields(),
                                                          collection->get_schema(), true);
    ASSERT_FALSE(validate_op.ok());
    ASSERT_EQ("Field `category` has malformed data.", validate_op.error());
}
|
||||
|
||||
// Updating the source field of a document must change its stored embedding.
TEST_F(CollectionTest, UpdateEmbeddingsForUpdatedDocument) {
    nlohmann::json collection_schema = nlohmann::json::parse(R"({
        "name": "objects",
        "fields": [
            {"name": "name", "type": "string"},
            {"name": "embedding", "type":"float[]", "embed_from": ["name"]}
        ]
    })");

    TextEmbedderManager::set_model_dir("/tmp/typesense_test/models");
    TextEmbedderManager::download_default_model();

    auto create_op = collectionManager.create_collection(collection_schema);
    ASSERT_TRUE(create_op.ok());
    Collection* collection = create_op.get();

    nlohmann::json original_doc;
    original_doc["name"] = "butter";

    auto insert_op = collection->add(original_doc.dump());
    ASSERT_TRUE(insert_op.ok());

    // remember the id and the embedding generated at insert time
    const std::string doc_id = insert_op.get()["id"].get<std::string>();
    const std::vector<float> embedding_before = insert_op.get()["embedding"].get<std::vector<float>>();
    ASSERT_EQ(384, embedding_before.size());

    // change the source field through a filter-based update
    nlohmann::json changes;
    changes["name"] = "butterball";
    std::string dirty_values;

    auto update_op = collection->update_matching_filter("id:=" + doc_id, changes.dump(), dirty_values);
    ASSERT_TRUE(update_op.ok());
    ASSERT_EQ(1, update_op.get()["num_updated"]);

    // fetch the stored document and compare the embeddings
    auto fetch_op = collection->get(doc_id);
    ASSERT_TRUE(fetch_op.ok());
    const std::vector<float> embedding_after = fetch_op.get()["embedding"].get<std::vector<float>>();

    ASSERT_NE(embedding_before, embedding_after);
}
|
||||
|
@ -184,17 +184,18 @@ TEST_F(CollectionVectorTest, BasicVectorQuerying) {
|
||||
ASSERT_FALSE(res_op.ok());
|
||||
ASSERT_EQ("Document id referenced in vector query is not found.", res_op.error());
|
||||
|
||||
// DEPRECATED: vector query is also supported on non-wildcard queries with hybrid search
|
||||
// only supported with wildcard queries
|
||||
res_op = coll1->search("title", {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD,
|
||||
spp::sparse_hash_set<std::string>(),
|
||||
spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
|
||||
"", 10, {}, {}, {}, 0,
|
||||
"<mark>", "</mark>", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7, fallback,
|
||||
4, {off}, 32767, 32767, 2,
|
||||
false, true, "zec:([0.96826, 0.94, 0.39557, 0.4542])");
|
||||
// res_op = coll1->search("title", {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD,
|
||||
// spp::sparse_hash_set<std::string>(),
|
||||
// spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
|
||||
// "", 10, {}, {}, {}, 0,
|
||||
// "<mark>", "</mark>", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7, fallback,
|
||||
// 4, {off}, 32767, 32767, 2,
|
||||
// false, true, "zec:([0.96826, 0.94, 0.39557, 0.4542])");
|
||||
|
||||
ASSERT_FALSE(res_op.ok());
|
||||
ASSERT_EQ("Vector query is supported only on wildcard (q=*) searches.", res_op.error());
|
||||
// ASSERT_FALSE(res_op.ok());
|
||||
// ASSERT_EQ("Vector query is supported only on wildcard (q=*) searches.", res_op.error());
|
||||
|
||||
// support num_dim on only float array fields
|
||||
schema = R"({
|
||||
@ -676,3 +677,136 @@ TEST_F(CollectionVectorTest, VectorWithNullValue) {
|
||||
ASSERT_EQ("Field `vec` must be an array.",
|
||||
nlohmann::json::parse(json_lines[1])["error"].get<std::string>());
|
||||
}
|
||||
|
||||
// A keyword query combined with an explicitly supplied 384-value vector on an embedding field
// succeeds and returns the single indexed document.
TEST_F(CollectionVectorTest, HybridSearchWithExplicitVector) {
    nlohmann::json collection_schema = nlohmann::json::parse(R"({
        "name": "coll1",
        "fields": [
            {"name": "name", "type": "string"},
            {"name": "vec", "type": "float[]", "embed_from": ["name"]}
        ]
    })");

    TextEmbedderManager::set_model_dir("/tmp/typesense_test/models");
    TextEmbedderManager::download_default_model();

    Collection* coll1 = collectionManager.create_collection(collection_schema).get();

    nlohmann::json person;
    person["name"] = "john doe";
    ASSERT_TRUE(coll1->add(person.dump()).ok());

    // query vector literal: "[0.9, 0.9, ..., 0.9]" with 384 values in total
    std::string query_vector = "[0.9";
    for (int value_index = 1; value_index < 384; value_index++) {
        query_vector += ", 0.9";
    }
    query_vector += "]";

    auto search_op = coll1->search("john", {"name"}, "", {}, {}, {0}, 20, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD,
                                   spp::sparse_hash_set<std::string>(),
                                   spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
                                   "", 10, {}, {}, {}, 0,
                                   "<mark>", "</mark>", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7,
                                   fallback,
                                   4, {off}, 32767, 32767, 2,
                                   false, true, "vec:(" + query_vector +")");
    ASSERT_TRUE(search_op.ok());

    ASSERT_EQ(1, search_op.get()["found"].get<size_t>());
    ASSERT_EQ(1, search_op.get()["hits"].size());
}
|
||||
|
||||
// Without `distance_threshold` a vector query returns both documents; with a threshold of 0.01
// only one of them is returned.
TEST_F(CollectionVectorTest, DistanceThresholdTest) {
    nlohmann::json collection_schema = nlohmann::json::parse(R"({
        "name": "test",
        "fields": [
            {"name": "vec", "type": "float[]", "num_dim": 3}
        ]
    })");

    Collection* coll1 = collectionManager.create_collection(collection_schema).get();

    nlohmann::json document;
    document["vec"] = {0.1, 0.2, 0.3};
    ASSERT_TRUE(coll1->add(document.dump()).ok());

    // second vector: every component is 0.5 larger than in the first one
    document["vec"] = {0.6, 0.7, 0.8};
    ASSERT_TRUE(coll1->add(document.dump()).ok());

    // wildcard search that only differs in the vector query string
    auto vector_search = [&](const std::string& vector_query) {
        return coll1->search("*", {}, "", {}, {}, {0}, 20, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD,
                             spp::sparse_hash_set<std::string>(),
                             spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
                             "", 10, {}, {}, {}, 0,
                             "<mark>", "</mark>", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7,
                             fallback,
                             4, {off}, 32767, 32767, 2,
                             false, true, vector_query);
    };

    // no threshold: both documents are hits, {0.6, 0.7, 0.8} ranked first
    auto unfiltered_op = vector_search("vec:([0.3,0.4,0.5])");

    ASSERT_TRUE(unfiltered_op.ok());
    ASSERT_EQ(2, unfiltered_op.get()["found"].get<size_t>());
    ASSERT_EQ(2, unfiltered_op.get()["hits"].size());

    const auto first_hit_vec = unfiltered_op.get()["hits"][0]["document"]["vec"].get<std::vector<float>>();
    ASSERT_FLOAT_EQ(0.6, first_hit_vec[0]);
    ASSERT_FLOAT_EQ(0.7, first_hit_vec[1]);
    ASSERT_FLOAT_EQ(0.8, first_hit_vec[2]);

    const auto second_hit_vec = unfiltered_op.get()["hits"][1]["document"]["vec"].get<std::vector<float>>();
    ASSERT_FLOAT_EQ(0.1, second_hit_vec[0]);
    ASSERT_FLOAT_EQ(0.2, second_hit_vec[1]);
    ASSERT_FLOAT_EQ(0.3, second_hit_vec[2]);

    // with the threshold only {0.6, 0.7, 0.8} remains
    auto filtered_op = vector_search("vec:([0.3,0.4,0.5], distance_threshold:0.01)");

    ASSERT_TRUE(filtered_op.ok());
    ASSERT_EQ(1, filtered_op.get()["found"].get<size_t>());
    ASSERT_EQ(1, filtered_op.get()["hits"].size());

    const auto remaining_hit_vec = filtered_op.get()["hits"][0]["document"]["vec"].get<std::vector<float>>();
    ASSERT_FLOAT_EQ(0.6, remaining_hit_vec[0]);
    ASSERT_FLOAT_EQ(0.7, remaining_hit_vec[1]);
    ASSERT_FLOAT_EQ(0.8, remaining_hit_vec[2]);
}
|
||||
|
||||
// An embedding field gets an entry in the index's vector index at creation time, and that
// entry disappears when the field is dropped.
TEST_F(CollectionVectorTest, EmbeddingFieldVectorIndexTest) {
    nlohmann::json collection_schema = nlohmann::json::parse(R"({
        "name": "objects",
        "fields": [
            {"name": "name", "type": "string"},
            {"name": "embedding", "type":"float[]", "embed_from": ["name"]}
        ]
    })");

    TextEmbedderManager::set_model_dir("/tmp/typesense_test/models");
    TextEmbedderManager::download_default_model();

    auto create_op = collectionManager.create_collection(collection_schema);
    ASSERT_TRUE(create_op.ok());
    Collection* collection = create_op.get();

    // held by reference so the assertions after `alter` observe the live container
    auto& vector_index = collection->_get_index()->_get_vector_index();
    ASSERT_EQ(1, vector_index.size());
    ASSERT_EQ(1, vector_index.count("embedding"));

    nlohmann::json drop_embedding = nlohmann::json::parse(R"({
        "fields": [
            {"name": "embedding", "drop": true}
        ]
    })");

    auto alter_op = collection->alter(drop_embedding);
    ASSERT_TRUE(alter_op.ok());

    ASSERT_EQ(0, vector_index.size());
    ASSERT_EQ(0, vector_index.count("embedding"));
}
|
Loading…
x
Reference in New Issue
Block a user