Merge branch 'event_anaylytics' of https://github.com/krunal1313/typesense into event_anaylytics

krunal 2023-12-01 14:10:04 +05:30
commit 471dccc42e
8 changed files with 201 additions and 20 deletions

View File

@@ -38,15 +38,17 @@ jobs:
uses: bazelbuild/setup-bazelisk@v2
- name: Download bazel cache
uses: dawidd6/action-download-artifact@v2
uses: jasonbosco/action-download-artifact@709b71d3729e8980f52a5a2a9ec04261060945c1
with:
name: bazel-cache
search_artifacts: true
workflow_conclusion: ""
if_no_artifact_found: warn
skip_unpack: true
- name: Uncompress bazel cache
run: |
unzip bazel-cache.zip
mkdir -p ~/.cache/bazel
tar_file="bazel-cache.tar.gz" && \
[ -f "$tar_file" ] && \

View File

@@ -276,12 +276,26 @@ struct index_record {
class VectorFilterFunctor: public hnswlib::BaseFilterFunctor {
filter_result_iterator_t* const filter_result_iterator;
const uint32_t* excluded_ids = nullptr;
const uint32_t excluded_ids_length = 0;
public:
explicit VectorFilterFunctor(filter_result_iterator_t* const filter_result_iterator) :
filter_result_iterator(filter_result_iterator) {}
explicit VectorFilterFunctor(filter_result_iterator_t* const filter_result_iterator,
const uint32_t* excluded_ids = nullptr, const uint32_t excluded_ids_length = 0) :
filter_result_iterator(filter_result_iterator),
excluded_ids(excluded_ids), excluded_ids_length(excluded_ids_length) {}
bool operator()(hnswlib::labeltype id) override {
if (filter_result_iterator->approx_filter_ids_length == 0) {
if (filter_result_iterator->approx_filter_ids_length == 0 && excluded_ids_length == 0) {
return true;
}
if(excluded_ids_length > 0 && excluded_ids && std::binary_search(excluded_ids, excluded_ids + excluded_ids_length, id)) {
return false;
}
if(filter_result_iterator->approx_filter_ids_length == 0) {
return true;
}
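
The new overload lets the vector search skip ids that were explicitly excluded (for example, hidden hits) by probing a sorted id array with std::binary_search, which is only valid if excluded_ids is sorted ascending. A minimal, self-contained sketch of that exclusion check; the helper name below is illustrative and not part of the Typesense codebase:

#include <algorithm>
#include <cstdint>
#include <iostream>

// Illustrative stand-in for the check added to VectorFilterFunctor::operator():
// excluded_ids must be sorted ascending for std::binary_search to be valid.
static bool is_excluded(const uint32_t* excluded_ids, uint32_t excluded_ids_length, uint32_t id) {
    return excluded_ids_length > 0 && excluded_ids != nullptr &&
           std::binary_search(excluded_ids, excluded_ids + excluded_ids_length, id);
}

int main() {
    const uint32_t excluded[] = {3, 7, 42};                // already sorted
    std::cout << is_excluded(excluded, 3, 7)  << "\n";     // 1: candidate is filtered out
    std::cout << is_excluded(excluded, 3, 10) << "\n";     // 0: candidate passes through
    std::cout << is_excluded(nullptr, 0, 10)  << "\n";     // 0: no exclusions configured
    return 0;
}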

View File

@@ -10,7 +10,9 @@ embedding_res_t CLIPImageEmbedder::embed(const std::string& encoded_image) {
auto processed_image_op = image_processor_.process_image(encoded_image);
if (!processed_image_op.ok()) {
return embedding_res_t(processed_image_op.code(), processed_image_op.error());
nlohmann::json error_json;
error_json["error"] = processed_image_op.error();
return embedding_res_t(processed_image_op.code(), error_json);
}
auto processed_image = processed_image_op.get();
@@ -58,7 +60,9 @@ std::vector<embedding_res_t> CLIPImageEmbedder::batch_embed(const std::vector<st
auto processed_image_op = image_processor_.process_image(input);
if (!processed_image_op.ok()) {
results[i] = embedding_res_t(processed_image_op.code(), processed_image_op.error());
nlohmann::json error_json;
error_json["error"] = processed_image_op.error();
results[i] = embedding_res_t(processed_image_op.code(), error_json);
i++;
continue;
}
@@ -67,6 +71,17 @@ std::vector<embedding_res_t> CLIPImageEmbedder::batch_embed(const std::vector<st
i++;
}
// no valid images
if (processed_images.empty()) {
std::vector<embedding_res_t> result_vector(inputs.size());
for (int i = 0; i < inputs.size(); i++) {
result_vector[i] = results[i];
}
return result_vector;
}
// create input tensor
std::vector<int64_t> input_shape = {static_cast<int64_t>(processed_images.size()), 3, 224, 224};
std::vector<const char*> input_names = {"input_ids", "pixel_values", "attention_mask"};
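
These hunks wrap the image processor's error string in a JSON object under an "error" key before handing it to embedding_res_t, presumably so callers can consume embedder errors in a uniform JSON shape. A minimal sketch of just that payload, assuming nlohmann::json and an illustrative error string (the embedding_res_t constructor itself is not reproduced here):

#include <nlohmann/json.hpp>
#include <iostream>
#include <string>

int main() {
    // Wrap the raw processor error in an {"error": "..."} object, as the diff does.
    std::string processor_error = "Error while processing image";    // illustrative text
    nlohmann::json error_json;
    error_json["error"] = processor_error;

    std::cout << error_json.dump() << "\n";                           // {"error":"Error while processing image"}
    std::cout << error_json["error"].get<std::string>() << "\n";      // Error while processing image
    return 0;
}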

View File

@@ -36,8 +36,7 @@ Option<processed_image_t> CLIPImageProcessor::process_image(const std::string& i
LOG(INFO) << "Running image processor";
try {
output_tensors = session_->Run(Ort::RunOptions{nullptr}, input_names.data(), &input_tensor, 1, output_names.data(), output_names.size());
} catch (const std::exception& e) {
LOG(INFO) << "Error while running image processor: " << e.what();
} catch (...) {
return Option<processed_image_t>(400, "Error while processing image");
}

View File

@@ -2542,6 +2542,7 @@ Option<bool> Index::search(std::vector<query_tokens_t>& field_query_tokens, cons
k++;
}
VectorFilterFunctor filterFunctor(filter_result_iterator, excluded_result_ids, excluded_result_ids_size);
auto& field_vector_index = vector_index.at(vector_query.field_name);
std::vector<std::pair<float, single_filter_result_t>> dist_results;
@@ -2946,7 +2947,7 @@ Option<bool> Index::search(std::vector<query_tokens_t>& field_query_tokens, cons
const float VECTOR_SEARCH_WEIGHT = vector_query.alpha;
const float TEXT_MATCH_WEIGHT = 1.0 - VECTOR_SEARCH_WEIGHT;
VectorFilterFunctor filterFunctor(filter_result_iterator);
VectorFilterFunctor filterFunctor(filter_result_iterator, excluded_result_ids, excluded_result_ids_size);
auto& field_vector_index = vector_index.at(vector_query.field_name);
std::vector<std::pair<float, size_t>> dist_labels;

View File

@@ -106,7 +106,7 @@ Option<bool> OpenAIEmbedder::is_model_valid(const nlohmann::json& model_config,
if(json_res.count("error") == 0 || json_res["error"].count("message") == 0) {
return Option<bool>(400, "OpenAI API error: " + res);
}
return Option<bool>(400, "OpenAI API error: " + nlohmann::json::parse(res)["error"]["message"].get<std::string>());
return Option<bool>(400, "OpenAI API error: " + json_res["error"]["message"].get<std::string>());
}
nlohmann::json models_json;
@@ -152,7 +152,7 @@ Option<bool> OpenAIEmbedder::is_model_valid(const nlohmann::json& model_config,
if(json_res.count("error") == 0 || json_res["error"].count("message") == 0) {
return Option<bool>(400, "OpenAI API error: " + embedding_res);
}
return Option<bool>(400, "OpenAI API error: " + nlohmann::json::parse(res)["error"]["message"].get<std::string>());
return Option<bool>(400, "OpenAI API error: " + json_res["error"]["message"].get<std::string>());
}
std::vector<float> embedding;
try {
@@ -337,7 +337,7 @@ Option<bool> GoogleEmbedder::is_model_valid(const nlohmann::json& model_config,
return Option<bool>(400, "Google API error: " + res);
}
return Option<bool>(400, "Google API error: " + nlohmann::json::parse(res)["error"]["message"].get<std::string>());
return Option<bool>(400, "Google API error: " + json_res["error"]["message"].get<std::string>());
}
try {
@@ -477,7 +477,7 @@ Option<bool> GCPEmbedder::is_model_valid(const nlohmann::json& model_config, siz
if(json_res.count("error") == 0 || json_res["error"].count("message") == 0) {
return Option<bool>(400, "GCP API error: " + res);
}
return Option<bool>(400, "GCP API error: " + nlohmann::json::parse(res)["error"]["message"].get<std::string>());
return Option<bool>(400, "GCP API error: " + json_res["error"]["message"].get<std::string>());
}
nlohmann::json res_json;
try {
@@ -680,7 +680,7 @@ Option<std::string> GCPEmbedder::generate_access_token(const std::string& refres
if(res_code == 408) {
return Option<std::string>(408, "GCP API timeout.");
}
return Option<std::string>(400, "GCP API error: " + nlohmann::json::parse(res)["error"]["message"].get<std::string>());
return Option<std::string>(400, "GCP API error: " + json_res["error"]["message"].get<std::string>());
}
nlohmann::json res_json;
try {
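
Each of these hunks replaces a second nlohmann::json::parse(res) with the json_res object that was already parsed and guarded a few lines earlier, so the error check and the message lookup operate on the same value and the response body is not parsed twice. A minimal sketch of that parse-once pattern, using nlohmann::json and an illustrative response body (not the actual embedder code):

#include <nlohmann/json.hpp>
#include <iostream>
#include <string>

int main() {
    std::string res = R"({"error": {"message": "invalid api key"}})";   // illustrative response

    // Parse once; allow_exceptions=false makes a malformed body come back
    // as a discarded value instead of throwing.
    nlohmann::json json_res = nlohmann::json::parse(res, nullptr, false);

    if (json_res.is_discarded() || json_res.count("error") == 0 ||
        json_res["error"].count("message") == 0) {
        std::cout << "API error: " << res << "\n";                       // fall back to the raw body
    } else {
        std::cout << "API error: "
                  << json_res["error"]["message"].get<std::string>() << "\n";
    }
    return 0;
}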

View File

@@ -2987,7 +2987,6 @@ TEST_F(CollectionVectorTest, TestImageEmbedding) {
auto coll = collection_create_op.get();
LOG(INFO) << "Adding image to collection";
auto add_op = coll->add(R"({
"name": "dog",
@@ -3027,6 +3026,83 @@ TEST_F(CollectionVectorTest, TestImageEmbedding) {
ASSERT_EQ(results2["hits"][1]["document"]["id"], "0");
}
TEST_F(CollectionVectorTest, TestHybridSearchHiddenHits) {
nlohmann::json schema = R"({
"name": "test",
"fields": [
{
"name": "name",
"type": "string"
},
{
"name": "embedding",
"type": "float[]",
"embed": {
"from": [
"name"
],
"model_config": {
"model_name": "ts/e5-small"
}
}
}
]
})"_json;
EmbedderManager::set_model_dir("/tmp/typesense_test/models");
auto collection_create_op = collectionManager.create_collection(schema);
ASSERT_TRUE(collection_create_op.ok());
auto coll = collection_create_op.get();
auto add_op = coll->add(R"({
"name": "soccer",
"id": "0"
})"_json.dump());
ASSERT_TRUE(add_op.ok());
add_op = coll->add(R"({
"name": "guitar",
"id": "1"
})"_json.dump());
ASSERT_TRUE(add_op.ok());
add_op = coll->add(R"({
"name": "typesense",
"id": "2"
})"_json.dump());
ASSERT_TRUE(add_op.ok());
add_op = coll->add(R"({
"name": "potato",
"id": "3"
})"_json.dump());
ASSERT_TRUE(add_op.ok());
auto results = coll->search("sports", {"name", "embedding"},
"", {}, {}, {2}, 10,
1, FREQUENCY, {true},
0, spp::sparse_hash_set<std::string>()).get();
ASSERT_EQ(4, results["hits"].size());
ASSERT_STREQ("0", results["hits"][0]["document"]["id"].get<std::string>().c_str());
// do hybrid search with hidden_hits
auto hybrid_results = coll->search("sports", {"name", "embedding"},
"", {}, {}, {2}, 10,
1, FREQUENCY, {true},
0, spp::sparse_hash_set<std::string>(), spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 1, "", "0").get();
ASSERT_EQ(3, hybrid_results["hits"].size());
ASSERT_FALSE(hybrid_results["hits"][0]["document"]["id"] == 0);
}
TEST_F(CollectionVectorTest, TryAddingMultipleImageFieldToEmbedFrom) {
auto schema_json =
R"({
@@ -3045,4 +3121,34 @@ TEST_F(CollectionVectorTest, TryAddingMultipleImageFieldToEmbedFrom) {
ASSERT_FALSE(collection_create_op.ok());
ASSERT_EQ(collection_create_op.error(), "Only one field can be used in the `embed.from` property of an embed field when embedding from an image field.");
}
TEST_F(CollectionVectorTest, TestInvalidImage) {
auto schema_json =
R"({
"name": "Images",
"fields": [
{"name": "name", "type": "string"},
{"name": "image", "type": "image", "store": false},
{"name": "embedding", "type":"float[]", "embed":{"from": ["image"], "model_config": {"model_name": "ts/clip-vit-b-p32"}}}
]
})"_json;
EmbedderManager::set_model_dir("/tmp/typesense_test/models");
auto collection_create_op = collectionManager.create_collection(schema_json);
ASSERT_TRUE(collection_create_op.ok());
auto coll = collection_create_op.get();
auto add_op = coll->add(R"({
"name": "teddy bear",
"image": "invalid"
})"_json.dump());
ASSERT_FALSE(add_op.ok());
ASSERT_EQ(add_op.error(), "Error while processing image");
}

View File

@@ -3,6 +3,8 @@
TEST(FacetIndexTest, FacetValueDeletionString) {
facet_index_t findex;
findex.initialize("brand");
std::unordered_map<facet_value_id_t, std::vector<uint32_t>, facet_value_id_t::Hash> fvalue_to_seq_ids;
std::unordered_map<uint32_t, std::vector<facet_value_id_t>> seq_id_to_fvalues;
@@ -18,15 +20,54 @@ TEST(FacetIndexTest, FacetValueDeletionString) {
doc["brand"] = "nike";
findex.insert("brand", fvalue_to_seq_ids, seq_id_to_fvalues, true);
ASSERT_EQ(3, findex.facet_val_num_ids("brand", "nike"));
findex.remove(doc, brandf, 0);
findex.remove(doc, brandf, 1);
findex.remove(doc, brandf, 2);
ASSERT_EQ(1, findex.facet_val_num_ids("brand", "nike"));
findex.remove(doc, brandf, 2);
ASSERT_FALSE(findex.facet_value_exists("brand", "nike"));
}
TEST(FacetIndexTest, FacetValueDeletionOfLongString) {
facet_index_t findex;
findex.initialize("brand");
std::unordered_map<facet_value_id_t, std::vector<uint32_t>, facet_value_id_t::Hash> fvalue_to_seq_ids;
std::unordered_map<uint32_t, std::vector<facet_value_id_t>> seq_id_to_fvalues;
std::string longval;
for(size_t i = 0; i < 200; i++) {
longval += "a";
}
facet_value_id_t longfval(longval.substr(0, 100), 1);
fvalue_to_seq_ids[longfval] = {0, 1, 2};
seq_id_to_fvalues[0] = {longfval};
seq_id_to_fvalues[1] = {longfval};
seq_id_to_fvalues[2] = {longfval};
field brandf("brand", field_types::STRING, true);
nlohmann::json doc;
doc["brand"] = longval;
findex.insert("brand", fvalue_to_seq_ids, seq_id_to_fvalues, true);
ASSERT_EQ(3, findex.facet_val_num_ids("brand", longval.substr(0, 100)));
findex.remove(doc, brandf, 0);
findex.remove(doc, brandf, 1);
ASSERT_EQ(1, findex.facet_val_num_ids("brand", longval.substr(0, 100)));
findex.remove(doc, brandf, 2);
ASSERT_FALSE(findex.facet_value_exists("brand", longval.substr(0, 100)));
}
TEST(FacetIndexTest, FacetValueDeletionFloat) {
facet_index_t findex;
findex.initialize("price");
std::unordered_map<facet_value_id_t, std::vector<uint32_t>, facet_value_id_t::Hash> fvalue_to_seq_ids;
std::unordered_map<uint32_t, std::vector<facet_value_id_t>> seq_id_to_fvalues;
@@ -39,13 +80,16 @@ TEST(FacetIndexTest, FacetValueDeletionFloat) {
field pricef("price", field_types::FLOAT, true);
nlohmann::json doc;
doc["price"] = "99.95";
doc["price"] = 99.95;
findex.insert("price", fvalue_to_seq_ids, seq_id_to_fvalues, true);
ASSERT_EQ(3, findex.facet_val_num_ids("price", "99.95"));
findex.insert("brand", fvalue_to_seq_ids, seq_id_to_fvalues, true);
findex.remove(doc, pricef, 0);
findex.remove(doc, pricef, 1);
findex.remove(doc, pricef, 2);
ASSERT_EQ(1, findex.facet_val_num_ids("price", "99.95"));
findex.remove(doc, pricef, 2);
ASSERT_FALSE(findex.facet_value_exists("price", "99.95"));
}
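
In the FacetValueDeletionOfLongString test above, the lookups use longval.substr(0, 100) rather than the full 200-character value, i.e. the facet index is expected to key long string values by a truncated prefix; the 100-character cutoff is taken from the test itself, not from the facet_index_t interface. A trivial sketch of that keying assumption:

#include <iostream>
#include <string>

int main() {
    // 200-character facet value, looked up by its first 100 characters
    // (cutoff taken from the test, not from facet_index_t).
    std::string longval(200, 'a');
    std::string facet_key = longval.substr(0, 100);
    std::cout << longval.size() << " chars stored, keyed by first "
              << facet_key.size() << " chars\n";   // 200 chars stored, keyed by first 100 chars
    return 0;
}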