Merge branch 'v0.25-join' into v0.26-facets

Kishore Nallan 2023-09-01 12:11:31 +05:30
commit bb5720955b
8 changed files with 362 additions and 29 deletions

View File

@@ -507,7 +507,7 @@ private:
static void handle_doc_ops(const tsl::htrie_map<char, field>& search_schema,
nlohmann::json& update_doc, const nlohmann::json& old_doc);
static void get_doc_changes(const index_operation_t op, const tsl::htrie_map<char, field>& search_schema,
static void get_doc_changes(const index_operation_t op, const tsl::htrie_map<char, field>& embedding_fields,
nlohmann::json &update_doc, const nlohmann::json &old_doc, nlohmann::json &new_doc,
nlohmann::json &del_doc);

View File

@@ -53,12 +53,6 @@ Collection::Collection(const std::string& name, const uint32_t collection_id, co
symbols_to_index(to_char_array(symbols_to_index)), token_separators(to_char_array(token_separators)),
index(init_index()) {
for (auto const& field: fields) {
if (field.embed.count(fields::from) != 0) {
embedding_fields.emplace(field.name, field);
}
}
this->num_documents = 0;
}

View File

@@ -919,7 +919,9 @@ Option<bool> CollectionManager::do_search(std::map<std::string, std::string>& re
nlohmann::json preset;
const auto& preset_op = CollectionManager::get_instance().get_preset(preset_it->second, preset);
if(preset_op.ok()) {
// NOTE: we merge only a single preset configuration because a multi ("searches") preset value replaces
// the request body directly before we reach this single search request function.
if(preset_op.ok() && !preset.contains("searches")) {
if(!preset.is_object()) {
return Option<bool>(400, "Search preset is not an object.");
}
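A minimal sketch of the rule the new comment describes, kept separate from the actual CollectionManager code (the helper name, parameter layout, and precedence order below are illustrative assumptions): a preset that is a plain object is merged key by key into the single search request, while a preset carrying a "searches" array has already replaced the multi-search body and is skipped here.

#include <map>
#include <string>
#include <nlohmann/json.hpp>

// Hypothetical helper, not the real do_search() logic.
void merge_single_search_preset(const nlohmann::json& preset,
                                std::map<std::string, std::string>& req_params) {
    if(!preset.is_object() || preset.contains("searches")) {
        // multi ("searches") presets are applied earlier, to the request body
        return;
    }
    for(const auto& item : preset.items()) {
        if(req_params.count(item.key()) == 0) {
            // existing request params win over preset values (assumption)
            req_params[item.key()] = item.value().is_string() ?
                                     item.value().get<std::string>() : item.value().dump();
        }
    }
}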

View File

@@ -449,7 +449,7 @@ void Index::validate_and_preprocess(Index *index,
if(index_rec.is_update) {
// scrub string fields to reduce delete ops
get_doc_changes(index_rec.operation, search_schema, index_rec.doc, index_rec.old_doc,
get_doc_changes(index_rec.operation, embedding_fields, index_rec.doc, index_rec.old_doc,
index_rec.new_doc, index_rec.del_doc);
if(generate_embeddings) {
@@ -2689,8 +2689,8 @@ Option<bool> Index::search(std::vector<query_tokens_t>& field_query_tokens, cons
for(size_t res_index = 0; res_index < vec_results.size(); res_index++) {
auto& vec_result = vec_results[res_index];
auto doc_id = vec_result.first;
auto result_it = topster->kv_map.find(doc_id);
auto seq_id = vec_result.first;
auto result_it = topster->kv_map.find(seq_id);
if(result_it != topster->kv_map.end()) {
if(result_it->second->match_score_index < 0 || result_it->second->match_score_index > 2) {
@@ -2699,30 +2699,42 @@ Option<bool> Index::search(std::vector<query_tokens_t>& field_query_tokens, cons
// result overlaps with keyword search: we have to combine the scores
auto result = result_it->second;
KV* kv = result_it->second;
// old_score + (1 / rank_of_document) * WEIGHT
result->vector_distance = vec_result.second;
result->text_match_score = result->scores[result->match_score_index];
kv->vector_distance = vec_result.second;
kv->text_match_score = kv->scores[kv->match_score_index];
int64_t match_score = float_to_int64_t(
(int64_t_to_float(result->scores[result->match_score_index])) +
(int64_t_to_float(kv->scores[kv->match_score_index])) +
((1.0 / (res_index + 1)) * VECTOR_SEARCH_WEIGHT));
int64_t match_score_index = -1;
int64_t scores[3] = {0};
compute_sort_scores(sort_fields_std, sort_order, field_values, geopoint_indices, doc_id, 0, match_score, scores, match_score_index, vec_result.second);
compute_sort_scores(sort_fields_std, sort_order, field_values, geopoint_indices, seq_id, 0,
match_score, scores, match_score_index, vec_result.second);
for(int i = 0; i < 3; i++) {
result->scores[i] = scores[i];
kv->scores[i] = scores[i];
}
result->match_score_index = match_score_index;
kv->match_score_index = match_score_index;
} else {
// Result has been found only in vector search: we have to add it to both KV and result_ids
// (1 / rank_of_document) * WEIGHT
int64_t scores[3] = {0};
int64_t match_score = float_to_int64_t((1.0 / (res_index + 1)) * VECTOR_SEARCH_WEIGHT);
int64_t match_score_index = -1;
compute_sort_scores(sort_fields_std, sort_order, field_values, geopoint_indices, doc_id, 0, match_score, scores, match_score_index, vec_result.second);
KV kv(searched_queries.size(), doc_id, doc_id, match_score_index, scores);
compute_sort_scores(sort_fields_std, sort_order, field_values, geopoint_indices, seq_id, 0, match_score, scores, match_score_index, vec_result.second);
uint64_t distinct_id = seq_id;
if (group_limit != 0) {
distinct_id = get_distinct_id(group_by_fields, seq_id);
if(excluded_group_ids.count(distinct_id) != 0) {
continue;
}
}
KV kv(searched_queries.size(), seq_id, distinct_id, match_score_index, scores);
kv.text_match_score = 0;
kv.vector_distance = vec_result.second;
@@ -2735,7 +2747,7 @@ Option<bool> Index::search(std::vector<query_tokens_t>& field_query_tokens, cons
}
topster->add(&kv);
vec_search_ids.push_back(doc_id);
vec_search_ids.push_back(seq_id);
}
}
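The scoring change above applies the reciprocal-rank fusion already noted in the comments: a hit found by both keyword and vector search keeps its text match score and gains a rank-based bonus, while a vector-only hit gets just the bonus. A standalone sketch of that formula follows; the weight constant and names are illustrative stand-ins, not the real VECTOR_SEARCH_WEIGHT or the int64/float conversion helpers used in the hunk.

#include <cstddef>

constexpr float kVectorWeight = 0.7f;   // illustrative stand-in for VECTOR_SEARCH_WEIGHT

// res_index is the 0-based rank of the document in the vector result list.
inline float fused_score(float text_match_score, size_t res_index) {
    // old_score + (1 / rank_of_document) * WEIGHT
    return text_match_score + (1.0f / (res_index + 1)) * kVectorWeight;
}

inline float vector_only_score(size_t res_index) {
    // no keyword match: only the rank bonus contributes
    return (1.0f / (res_index + 1)) * kVectorWeight;
}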
@@ -5804,7 +5816,7 @@ void Index::handle_doc_ops(const tsl::htrie_map<char, field>& search_schema,
}
}
void Index::get_doc_changes(const index_operation_t op, const tsl::htrie_map<char, field>& search_schema,
void Index::get_doc_changes(const index_operation_t op, const tsl::htrie_map<char, field>& embedding_fields,
nlohmann::json& update_doc, const nlohmann::json& old_doc, nlohmann::json& new_doc,
nlohmann::json& del_doc) {
@@ -5817,7 +5829,12 @@ void Index::get_doc_changes(const index_operation_t op, const tsl::htrie_map<cha
}
if(!update_doc.contains(it.key())) {
del_doc[it.key()] = it.value();
// embedding field won't be part of the upsert doc, so populate the new doc with the value from the old doc
if(embedding_fields.count(it.key()) != 0) {
new_doc[it.key()] = it.value();
} else {
del_doc[it.key()] = it.value();
}
}
}
} else {
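In short: on an upsert, any old-document field missing from the update doc is staged for deletion, except embedding fields, which are carried into the new doc because the client never sends them. A hedged sketch of that rule in isolation (simplified types, hypothetical helper name, not the actual get_doc_changes signature):

#include <set>
#include <string>
#include <nlohmann/json.hpp>

// Hypothetical standalone version of the branch added above.
void stage_missing_fields(const nlohmann::json& old_doc, const nlohmann::json& update_doc,
                          const std::set<std::string>& embedding_fields,
                          nlohmann::json& new_doc, nlohmann::json& del_doc) {
    for(auto it = old_doc.begin(); it != old_doc.end(); ++it) {
        if(update_doc.contains(it.key())) {
            continue;  // field re-sent by the client, handled elsewhere
        }
        if(embedding_fields.count(it.key()) != 0) {
            new_doc[it.key()] = it.value();   // keep the stored embedding
        } else {
            del_doc[it.key()] = it.value();   // field was dropped by the upsert
        }
    }
}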

View File

@@ -221,6 +221,7 @@ std::vector<embedding_res_t> OpenAIEmbedder::batch_embed(const std::vector<std::
}
nlohmann::json res_json;
try {
res_json = nlohmann::json::parse(res);
} catch (const std::exception& e) {
@@ -232,8 +233,21 @@ std::vector<embedding_res_t> OpenAIEmbedder::batch_embed(const std::vector<std::
}
return outputs;
}
if(res_json.count("data") == 0 || !res_json["data"].is_array() || res_json["data"].size() != inputs.size()) {
std::vector<embedding_res_t> outputs;
for(size_t i = 0; i < inputs.size(); i++) {
outputs.push_back(embedding_res_t(500, "Got malformed response from OpenAI API."));
}
return outputs;
}
std::vector<embedding_res_t> outputs;
for(auto& data : res_json["data"]) {
if(data.count("embedding") == 0 || !data["embedding"].is_array() || data["embedding"].size() == 0) {
outputs.push_back(embedding_res_t(500, "Got malformed response from OpenAI API."));
continue;
}
outputs.push_back(embedding_res_t(data["embedding"].get<std::vector<float>>()));
}
@@ -577,7 +591,20 @@ std::vector<embedding_res_t> GCPEmbedder::batch_embed(const std::vector<std::str
return outputs;
}
std::vector<embedding_res_t> outputs;
if(res_json.count("predictions") == 0 || !res_json["predictions"].is_array() || res_json["predictions"].size() != inputs.size()) {
std::vector<embedding_res_t> outputs;
for(size_t i = 0; i < inputs.size(); i++) {
outputs.push_back(embedding_res_t(500, "Got malformed response from GCP API."));
}
return outputs;
}
for(const auto& prediction : res_json["predictions"]) {
if(prediction.count("embeddings") == 0 || !prediction["embeddings"].is_object() || prediction["embeddings"].count("values") == 0 || !prediction["embeddings"]["values"].is_array() || prediction["embeddings"]["values"].size() == 0) {
outputs.push_back(embedding_res_t(500, "Got malformed response from GCP API."));
continue;
}
outputs.push_back(embedding_res_t(prediction["embeddings"]["values"].get<std::vector<float>>()));
}
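The two validations above imply the response shapes the code accepts: an OpenAI-style body with one "data" entry per input, each holding an "embedding" array, and a GCP-style body with one "predictions" entry per input, each holding "embeddings" with a "values" array. A small illustration of those shapes, inferred from the checks above rather than from the providers' documentation:

#include <nlohmann/json.hpp>

// Example payloads (one input each) that would pass the added checks.
const auto openai_like = R"({
    "data": [ { "embedding": [0.12, 0.34, 0.56] } ]
})"_json;

const auto gcp_like = R"({
    "predictions": [ { "embeddings": { "values": [0.12, 0.34, 0.56] } } ]
})"_json;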

View File

@@ -119,7 +119,7 @@ int init_root_logger(Config & config, const std::string & server_version) {
if(log_dir.empty()) {
// use console logger if log dir is not specified
FLAGS_logtostdout = true;
FLAGS_logtostderr = true;
} else {
if(!directory_exists(log_dir)) {
std::cerr << "Typesense failed to start. " << "Log directory " << log_dir << " does not exist.";

View File

@@ -224,6 +224,55 @@ TEST_F(CollectionVectorTest, BasicVectorQuerying) {
collectionManager.drop_collection("coll1");
}
TEST_F(CollectionVectorTest, VectorUnchangedUpsert) {
nlohmann::json schema = R"({
"name": "coll1",
"fields": [
{"name": "title", "type": "string"},
{"name": "points", "type": "int32"},
{"name": "vec", "type": "float[]", "num_dim": 3}
]
})"_json;
Collection* coll1 = collectionManager.create_collection(schema).get();
std::vector<float> vec = {0.12, 0.45, 0.64};
nlohmann::json doc;
doc["id"] = "0";
doc["title"] = "Title";
doc["points"] = 100;
doc["vec"] = vec;
auto add_op = coll1->add(doc.dump());
ASSERT_TRUE(add_op.ok());
auto results = coll1->search("*", {}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
"", 10, {}, {}, {}, 0,
"<mark>", "</mark>", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7, fallback,
4, {off}, 32767, 32767, 2,
false, true, "vec:([0.12, 0.44, 0.55])").get();
ASSERT_EQ(1, results["found"].get<size_t>());
// upsert unchanged doc
add_op = coll1->add(doc.dump(), index_operation_t::UPSERT);
ASSERT_TRUE(add_op.ok());
results = coll1->search("*", {}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
"", 10, {}, {}, {}, 0,
"<mark>", "</mark>", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7, fallback,
4, {off}, 32767, 32767, 2,
false, true, "vec:([0.12, 0.44, 0.55])").get();
ASSERT_EQ(1, results["found"].get<size_t>());
}
TEST_F(CollectionVectorTest, NumVectorGreaterThanNumDim) {
nlohmann::json schema = R"({
"name": "coll1",
@@ -692,6 +741,88 @@ TEST_F(CollectionVectorTest, VectorWithNullValue) {
nlohmann::json::parse(json_lines[1])["error"].get<std::string>());
}
TEST_F(CollectionVectorTest, EmbeddedVectorUnchangedUpsert) {
nlohmann::json schema = R"({
"name": "coll1",
"fields": [
{"name": "title", "type": "string"},
{"name": "points", "type": "int32"},
{"name": "embedding", "type":"float[]", "embed":{"from": ["title"],
"model_config": {"model_name": "ts/e5-small"}}}
]
})"_json;
TextEmbedderManager::set_model_dir("/tmp/typesense_test/models");
Collection* coll1 = collectionManager.create_collection(schema).get();
nlohmann::json doc;
doc["id"] = "0";
doc["title"] = "Title";
doc["points"] = 100;
auto add_op = coll1->add(doc.dump());
ASSERT_TRUE(add_op.ok());
auto results = coll1->search("title", {"embedding"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>()).get();
ASSERT_EQ(1, results["found"].get<size_t>());
auto embedding = results["hits"][0]["document"]["embedding"].get<std::vector<float>>();
ASSERT_EQ(384, embedding.size());
// upsert unchanged doc
doc.clear();
doc["id"] = "0";
doc["title"] = "Title";
doc["points"] = 100;
add_op = coll1->add(doc.dump(), index_operation_t::UPSERT);
ASSERT_TRUE(add_op.ok());
results = coll1->search("title", {"embedding"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>()).get();
ASSERT_EQ(1, results["found"].get<size_t>());
embedding = results["hits"][0]["document"]["embedding"].get<std::vector<float>>();
ASSERT_EQ(384, embedding.size());
// update
doc.clear();
doc["id"] = "0";
doc["title"] = "Title";
doc["points"] = 100;
add_op = coll1->add(doc.dump(), index_operation_t::UPDATE);
ASSERT_TRUE(add_op.ok());
results = coll1->search("title", {"embedding"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>()).get();
ASSERT_EQ(1, results["found"].get<size_t>());
embedding = results["hits"][0]["document"]["embedding"].get<std::vector<float>>();
ASSERT_EQ(384, embedding.size());
// emplace
doc.clear();
doc["id"] = "0";
doc["title"] = "Title";
doc["points"] = 100;
add_op = coll1->add(doc.dump(), index_operation_t::EMPLACE);
ASSERT_TRUE(add_op.ok());
results = coll1->search("title", {"embedding"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>()).get();
ASSERT_EQ(1, results["found"].get<size_t>());
embedding = results["hits"][0]["document"]["embedding"].get<std::vector<float>>();
ASSERT_EQ(384, embedding.size());
}
TEST_F(CollectionVectorTest, HybridSearchWithExplicitVector) {
nlohmann::json schema = R"({
"name": "objects",
@@ -1099,7 +1230,67 @@ TEST_F(CollectionVectorTest, HideCredential) {
ASSERT_EQ("***********", coll_summary["fields"][1]["embed"]["model_config"]["project_id"].get<std::string>());
}
TEST_F(CollectionVectorTest, UpdateOfCollWithNonOptionalEmbeddingField) {
TEST_F(CollectionVectorTest, UpdateOfFieldReferencedByEmbedding) {
nlohmann::json schema = R"({
"name": "objects",
"fields": [
{"name": "name", "type": "string"},
{"name": "embedding", "type":"float[]", "embed":{"from": ["name"],
"model_config": {"model_name": "ts/e5-small"}}}
]
})"_json;
TextEmbedderManager::set_model_dir("/tmp/typesense_test/models");
auto op = collectionManager.create_collection(schema);
ASSERT_TRUE(op.ok());
Collection* coll = op.get();
nlohmann::json object;
object["id"] = "0";
object["name"] = "butter";
auto add_op = coll->add(object.dump(), CREATE);
ASSERT_TRUE(add_op.ok());
auto results = coll->search("butter", {"embedding"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}).get();
ASSERT_EQ(1, results["found"].get<size_t>());
auto original_embedding = results["hits"][0]["document"]["embedding"].get<std::vector<float>>();
nlohmann::json update_object;
update_object["id"] = "0";
update_object["name"] = "ghee";
auto update_op = coll->add(update_object.dump(), EMPLACE);
ASSERT_TRUE(update_op.ok());
results = coll->search("ghee", {"embedding"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}).get();
ASSERT_EQ(1, results["found"].get<size_t>());
auto updated_embedding = results["hits"][0]["document"]["embedding"].get<std::vector<float>>();
ASSERT_NE(original_embedding, updated_embedding);
// action = update
update_object["name"] = "milk";
update_op = coll->add(update_object.dump(), UPDATE);
ASSERT_TRUE(update_op.ok());
results = coll->search("milk", {"embedding"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}).get();
ASSERT_EQ(1, results["found"].get<size_t>());
updated_embedding = results["hits"][0]["document"]["embedding"].get<std::vector<float>>();
ASSERT_NE(original_embedding, updated_embedding);
// action = upsert
update_object["name"] = "cheese";
update_op = coll->add(update_object.dump(), UPSERT);
ASSERT_TRUE(update_op.ok());
results = coll->search("cheese", {"embedding"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}).get();
ASSERT_EQ(1, results["found"].get<size_t>());
updated_embedding = results["hits"][0]["document"]["embedding"].get<std::vector<float>>();
ASSERT_NE(original_embedding, updated_embedding);
}
TEST_F(CollectionVectorTest, UpdateOfFieldNotReferencedByEmbedding) {
// test updates to a field that's not referenced by an embedding field
nlohmann::json schema = R"({
"name": "objects",
"fields": [
@@ -1123,16 +1314,34 @@ TEST_F(CollectionVectorTest, UpdateOfCollWithNonOptionalEmbeddingField) {
auto add_op = coll->add(object.dump(), CREATE);
ASSERT_TRUE(add_op.ok());
auto results = coll->search("butter", {"embedding"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}).get();
ASSERT_EQ(1, results["found"].get<size_t>());
nlohmann::json update_object;
update_object["id"] = "0";
update_object["about"] = "something about butter";
auto update_op = coll->add(update_object.dump(), EMPLACE);
ASSERT_TRUE(update_op.ok());
results = coll->search("butter", {"embedding"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}).get();
ASSERT_EQ(1, results["found"].get<size_t>());
// action = update
update_object["about"] = "something about butter 2";
update_op = coll->add(update_object.dump(), UPDATE);
ASSERT_TRUE(update_op.ok());
results = coll->search("butter", {"embedding"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}).get();
ASSERT_EQ(1, results["found"].get<size_t>());
// action = upsert
update_object["name"] = "butter";
update_object["about"] = "something about butter 3";
update_op = coll->add(update_object.dump(), UPSERT);
ASSERT_TRUE(update_op.ok());
results = coll->search("butter", {"embedding"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}).get();
ASSERT_EQ(1, results["found"].get<size_t>());
}
TEST_F(CollectionVectorTest, FreshEmplaceWithOptionalEmbeddingReferencedField) {
@@ -1327,6 +1536,58 @@ TEST_F(CollectionVectorTest, KeywordSearchReturnOnlyTextMatchInfo) {
ASSERT_EQ(1, results["hits"][0].count("text_match_info"));
}
TEST_F(CollectionVectorTest, GroupByWithVectorSearch) {
nlohmann::json schema = R"({
"name": "coll1",
"fields": [
{"name": "title", "type": "string"},
{"name": "group", "type": "string", "facet": true},
{"name": "vec", "type": "float[]", "num_dim": 4}
]
})"_json;
Collection* coll1 = collectionManager.create_collection(schema).get();
std::vector<std::vector<float>> values = {
{0.851758, 0.909671, 0.823431, 0.372063},
{0.97826, 0.933157, 0.39557, 0.306488},
{0.230606, 0.634397, 0.514009, 0.399594}
};
for (size_t i = 0; i < values.size(); i++) {
nlohmann::json doc;
doc["id"] = std::to_string(i);
doc["title"] = std::to_string(i) + " title";
doc["group"] = "0";
doc["vec"] = values[i];
ASSERT_TRUE(coll1->add(doc.dump()).ok());
}
auto res = coll1->search("title", {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
"", 10, {}, {}, {"group"}, 1,
"<mark>", "</mark>", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7, fallback,
4, {off}, 32767, 32767, 2,
false, true, "vec:([0.96826, 0.94, 0.39557, 0.306488])").get();
ASSERT_EQ(1, res["grouped_hits"].size());
ASSERT_EQ(1, res["grouped_hits"][0]["hits"].size());
ASSERT_EQ(1, res["grouped_hits"][0]["hits"][0].count("vector_distance"));
res = coll1->search("*", {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
"", 10, {}, {}, {"group"}, 1,
"<mark>", "</mark>", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7, fallback,
4, {off}, 32767, 32767, 2,
false, true, "vec:([0.96826, 0.94, 0.39557, 0.306488])").get();
ASSERT_EQ(1, res["grouped_hits"].size());
ASSERT_EQ(1, res["grouped_hits"][0]["hits"].size());
ASSERT_EQ(1, res["grouped_hits"][0]["hits"][0].count("vector_distance"));
}
TEST_F(CollectionVectorTest, HybridSearchReturnAllInfo) {
auto schema_json =
R"({

View File

@@ -611,7 +611,7 @@ TEST_F(CoreAPIUtilsTest, MultiSearchWithPresetShouldUsePresetForAuth) {
ASSERT_EQ(2, embedded_params_vec.size());
}
TEST_F(CoreAPIUtilsTest, PresetSingleSearch) {
TEST_F(CoreAPIUtilsTest, PresetMultiSearch) {
nlohmann::json schema = R"({
"name": "coll1",
"fields": [
@@ -636,7 +636,7 @@ TEST_F(CoreAPIUtilsTest, PresetSingleSearch) {
auto search_body = R"(
{"searches":[
{"collection":"coll1","q":"apple", "query_by": "title", "preset": "single_preset"}
{"collection":"coll1","q":"apple", "query_by": "name", "preset": "single_preset"}
]}
)";
@@ -646,8 +646,40 @@ TEST_F(CoreAPIUtilsTest, PresetSingleSearch) {
post_multi_search(req, res);
ASSERT_EQ("12", req->params["per_page"]);
ASSERT_EQ("coll1", req->params["collection"]);
auto res_json = nlohmann::json::parse(res->body);
ASSERT_EQ(1, res_json["results"].size());
ASSERT_EQ(0, res_json["results"][0]["found"].get<size_t>());
// with a multi ("searches") preset configuration
preset_value = R"(
{"searches":[
{"collection":"coll1", "q": "*", "per_page": "8"},
{"collection":"coll1", "q": "*", "per_page": "11"}
]}
)"_json;
collectionManager.upsert_preset("multi_preset", preset_value);
embedded_params.clear();
req->params.clear();
req->params["preset"] = "multi_preset";
req->embedded_params_vec.clear();
req->embedded_params_vec.push_back(embedded_params);
req->embedded_params_vec.push_back(embedded_params);
// "preset": "multi_preset"
search_body = R"(
{"searches":[
{"collection":"coll1","q":"apple", "query_by": "title"}
]}
)";
req->body = search_body;
post_multi_search(req, res);
res_json = nlohmann::json::parse(res->body);
ASSERT_EQ(2, res_json["results"].size());
ASSERT_EQ(0, res_json["results"][0]["found"].get<size_t>());
ASSERT_EQ(0, res_json["results"][1]["found"].get<size_t>());
collectionManager.drop_collection("coll1");
}