Merge branch 'event_anaylytics' of https://github.com/krunal1313/typesense into event_anaylytics

krunal 2023-12-01 14:10:04 +05:30
commit 471dccc42e
8 changed files with 201 additions and 20 deletions

View File

@@ -38,15 +38,17 @@ jobs:
uses: bazelbuild/setup-bazelisk@v2
- name: Download bazel cache
uses: dawidd6/action-download-artifact@v2
uses: jasonbosco/action-download-artifact@709b71d3729e8980f52a5a2a9ec04261060945c1
with:
name: bazel-cache
search_artifacts: true
workflow_conclusion: ""
if_no_artifact_found: warn
skip_unpack: true
- name: Uncompress bazel cache
run: |
unzip bazel-cache.zip
mkdir -p ~/.cache/bazel
tar_file="bazel-cache.tar.gz" && \
[ -f "$tar_file" ] && \

View File

@@ -276,12 +276,26 @@ struct index_record {
class VectorFilterFunctor: public hnswlib::BaseFilterFunctor {
filter_result_iterator_t* const filter_result_iterator;
const uint32_t* excluded_ids = nullptr;
const uint32_t excluded_ids_length = 0;
public:
explicit VectorFilterFunctor(filter_result_iterator_t* const filter_result_iterator) :
filter_result_iterator(filter_result_iterator) {}
explicit VectorFilterFunctor(filter_result_iterator_t* const filter_result_iterator,
const uint32_t* excluded_ids = nullptr, const uint32_t excluded_ids_length = 0) :
filter_result_iterator(filter_result_iterator),
excluded_ids(excluded_ids), excluded_ids_length(excluded_ids_length) {}
bool operator()(hnswlib::labeltype id) override {
if (filter_result_iterator->approx_filter_ids_length == 0) {
if (filter_result_iterator->approx_filter_ids_length == 0 && excluded_ids_length == 0) {
return true;
}
if(excluded_ids_length > 0 && excluded_ids && std::binary_search(excluded_ids, excluded_ids + excluded_ids_length, id)) {
return false;
}
if(filter_result_iterator->approx_filter_ids_length == 0) {
return true;
}
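
The new overload lets the vector search skip ids that were explicitly excluded (for example, hidden hits) by probing a sorted id array with std::binary_search, which is only valid if excluded_ids is sorted ascending. A minimal, self-contained sketch of that exclusion check; the helper name below is illustrative and not part of the Typesense codebase:

#include <algorithm>
#include <cstdint>
#include <iostream>

// Illustrative stand-in for the check added to VectorFilterFunctor::operator():
// excluded_ids must be sorted ascending for std::binary_search to be valid.
static bool is_excluded(const uint32_t* excluded_ids, uint32_t excluded_ids_length, uint32_t id) {
    return excluded_ids_length > 0 && excluded_ids != nullptr &&
           std::binary_search(excluded_ids, excluded_ids + excluded_ids_length, id);
}

int main() {
    const uint32_t excluded[] = {3, 7, 42};                // already sorted
    std::cout << is_excluded(excluded, 3, 7)  << "\n";     // 1: candidate is filtered out
    std::cout << is_excluded(excluded, 3, 10) << "\n";     // 0: candidate passes through
    std::cout << is_excluded(nullptr, 0, 10)  << "\n";     // 0: no exclusions configured
    return 0;
}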

View File

@@ -10,7 +10,9 @@ embedding_res_t CLIPImageEmbedder::embed(const std::string& encoded_image) {
auto processed_image_op = image_processor_.process_image(encoded_image);
if (!processed_image_op.ok()) {
return embedding_res_t(processed_image_op.code(), processed_image_op.error());
nlohmann::json error_json;
error_json["error"] = processed_image_op.error();
return embedding_res_t(processed_image_op.code(), error_json);
}
auto processed_image = processed_image_op.get();
@@ -58,7 +60,9 @@ std::vector<embedding_res_t> CLIPImageEmbedder::batch_embed(const std::vector<st
auto processed_image_op = image_processor_.process_image(input);
if (!processed_image_op.ok()) {
results[i] = embedding_res_t(processed_image_op.code(), processed_image_op.error());
nlohmann::json error_json;
error_json["error"] = processed_image_op.error();
results[i] = embedding_res_t(processed_image_op.code(), error_json);
i++;
continue;
}
@@ -67,6 +71,17 @@ std::vector<embedding_res_t> CLIPImageEmbedder::batch_embed(const std::vector<st
i++;
}
// no valid images
if (processed_images.empty()) {
std::vector<embedding_res_t> result_vector(inputs.size());
for (int i = 0; i < inputs.size(); i++) {
result_vector[i] = results[i];
}
return result_vector;
}
// create input tensor
std::vector<int64_t> input_shape = {static_cast<int64_t>(processed_images.size()), 3, 224, 224};
std::vector<const char*> input_names = {"input_ids", "pixel_values", "attention_mask"};
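
These hunks wrap the image processor's error string in a JSON object under an "error" key before handing it to embedding_res_t, presumably so callers can consume embedder errors in a uniform JSON shape. A minimal sketch of just that payload, assuming nlohmann::json and an illustrative error string (the embedding_res_t constructor itself is not reproduced here):

#include <nlohmann/json.hpp>
#include <iostream>
#include <string>

int main() {
    // Wrap the raw processor error in an {"error": "..."} object, as the diff does.
    std::string processor_error = "Error while processing image";    // illustrative text
    nlohmann::json error_json;
    error_json["error"] = processor_error;

    std::cout << error_json.dump() << "\n";                           // {"error":"Error while processing image"}
    std::cout << error_json["error"].get<std::string>() << "\n";      // Error while processing image
    return 0;
}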

View File

@@ -36,8 +36,7 @@ Option<processed_image_t> CLIPImageProcessor::process_image(const std::string& i
LOG(INFO) << "Running image processor";
try {
output_tensors = session_->Run(Ort::RunOptions{nullptr}, input_names.data(), &input_tensor, 1, output_names.data(), output_names.size());
} catch (const std::exception& e) {
LOG(INFO) << "Error while running image processor: " << e.what();
} catch (...) {
return Option<processed_image_t>(400, "Error while processing image");
}

View File

@@ -2542,6 +2542,7 @@ Option<bool> Index::search(std::vector<query_tokens_t>& field_query_tokens, cons
k++;
}
VectorFilterFunctor filterFunctor(filter_result_iterator, excluded_result_ids, excluded_result_ids_size);
auto& field_vector_index = vector_index.at(vector_query.field_name);
std::vector<std::pair<float, single_filter_result_t>> dist_results;
@@ -2946,7 +2947,7 @@ Option<bool> Index::search(std::vector<query_tokens_t>& field_query_tokens, cons
const float VECTOR_SEARCH_WEIGHT = vector_query.alpha;
const float TEXT_MATCH_WEIGHT = 1.0 - VECTOR_SEARCH_WEIGHT;
VectorFilterFunctor filterFunctor(filter_result_iterator);
VectorFilterFunctor filterFunctor(filter_result_iterator, excluded_result_ids, excluded_result_ids_size);
auto& field_vector_index = vector_index.at(vector_query.field_name);
std::vector<std::pair<float, size_t>> dist_labels;

View File

@@ -106,7 +106,7 @@ Option<bool> OpenAIEmbedder::is_model_valid(const nlohmann::json& model_config,
if(json_res.count("error") == 0 || json_res["error"].count("message") == 0) {
return Option<bool>(400, "OpenAI API error: " + res);
}
return Option<bool>(400, "OpenAI API error: " + nlohmann::json::parse(res)["error"]["message"].get<std::string>());
return Option<bool>(400, "OpenAI API error: " + json_res["error"]["message"].get<std::string>());
}
nlohmann::json models_json;
@@ -152,7 +152,7 @@ Option<bool> OpenAIEmbedder::is_model_valid(const nlohmann::json& model_config,
if(json_res.count("error") == 0 || json_res["error"].count("message") == 0) {
return Option<bool>(400, "OpenAI API error: " + embedding_res);
}
return Option<bool>(400, "OpenAI API error: " + nlohmann::json::parse(res)["error"]["message"].get<std::string>());
return Option<bool>(400, "OpenAI API error: " + json_res["error"]["message"].get<std::string>());
}
std::vector<float> embedding;
try {
@@ -337,7 +337,7 @@ Option<bool> GoogleEmbedder::is_model_valid(const nlohmann::json& model_config,
return Option<bool>(400, "Google API error: " + res);
}
return Option<bool>(400, "Google API error: " + nlohmann::json::parse(res)["error"]["message"].get<std::string>());
return Option<bool>(400, "Google API error: " + json_res["error"]["message"].get<std::string>());
}
try {
@@ -477,7 +477,7 @@ Option<bool> GCPEmbedder::is_model_valid(const nlohmann::json& model_config, siz
if(json_res.count("error") == 0 || json_res["error"].count("message") == 0) {
return Option<bool>(400, "GCP API error: " + res);
}
return Option<bool>(400, "GCP API error: " + nlohmann::json::parse(res)["error"]["message"].get<std::string>());
return Option<bool>(400, "GCP API error: " + json_res["error"]["message"].get<std::string>());
}
nlohmann::json res_json;
try {
@@ -680,7 +680,7 @@ Option<std::string> GCPEmbedder::generate_access_token(const std::string& refres
if(res_code == 408) {
return Option<std::string>(408, "GCP API timeout.");
}
return Option<std::string>(400, "GCP API error: " + nlohmann::json::parse(res)["error"]["message"].get<std::string>());
return Option<std::string>(400, "GCP API error: " + json_res["error"]["message"].get<std::string>());
}
nlohmann::json res_json;
try {
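
Each of these hunks replaces a second nlohmann::json::parse(res) with the json_res object that was already parsed and guarded a few lines earlier, so the error check and the message lookup operate on the same value and the response body is not parsed twice. A minimal sketch of that parse-once pattern, using nlohmann::json and an illustrative response body (not the actual embedder code):

#include <nlohmann/json.hpp>
#include <iostream>
#include <string>

int main() {
    std::string res = R"({"error": {"message": "invalid api key"}})";   // illustrative response

    // Parse once; allow_exceptions=false makes a malformed body come back
    // as a discarded value instead of throwing.
    nlohmann::json json_res = nlohmann::json::parse(res, nullptr, false);

    if (json_res.is_discarded() || json_res.count("error") == 0 ||
        json_res["error"].count("message") == 0) {
        std::cout << "API error: " << res << "\n";                       // fall back to the raw body
    } else {
        std::cout << "API error: "
                  << json_res["error"]["message"].get<std::string>() << "\n";
    }
    return 0;
}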

View File

@@ -2987,7 +2987,6 @@ TEST_F(CollectionVectorTest, TestImageEmbedding) {
auto coll = collection_create_op.get();
LOG(INFO) << "Adding image to collection";
auto add_op = coll->add(R"({
"name": "dog",
@@ -3027,6 +3026,83 @@ TEST_F(CollectionVectorTest, TestImageEmbedding) {
ASSERT_EQ(results2["hits"][1]["document"]["id"], "0");
}
TEST_F(CollectionVectorTest, TestHybridSearchHiddenHits) {
nlohmann::json schema = R"({
"name": "test",
"fields": [
{
"name": "name",
"type": "string"
},
{
"name": "embedding",
"type": "float[]",
"embed": {
"from": [
"name"
],
"model_config": {
"model_name": "ts/e5-small"
}
}
}
]
})"_json;
EmbedderManager::set_model_dir("/tmp/typesense_test/models");
auto collection_create_op = collectionManager.create_collection(schema);
ASSERT_TRUE(collection_create_op.ok());
auto coll = collection_create_op.get();
auto add_op = coll->add(R"({
"name": "soccer",
"id": "0"
})"_json.dump());
ASSERT_TRUE(add_op.ok());
add_op = coll->add(R"({
"name": "guitar",
"id": "1"
})"_json.dump());
ASSERT_TRUE(add_op.ok());
add_op = coll->add(R"({
"name": "typesense",
"id": "2"
})"_json.dump());
ASSERT_TRUE(add_op.ok());
add_op = coll->add(R"({
"name": "potato",
"id": "3"
})"_json.dump());
ASSERT_TRUE(add_op.ok());
auto results = coll->search("sports", {"name", "embedding"},
"", {}, {}, {2}, 10,
1, FREQUENCY, {true},
0, spp::sparse_hash_set<std::string>()).get();
ASSERT_EQ(4, results["hits"].size());
ASSERT_STREQ("0", results["hits"][0]["document"]["id"].get<std::string>().c_str());
// do hybrid search with hidden_hits
auto hybrid_results = coll->search("sports", {"name", "embedding"},
"", {}, {}, {2}, 10,
1, FREQUENCY, {true},
0, spp::sparse_hash_set<std::string>(), spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 1, "", "0").get();
ASSERT_EQ(3, hybrid_results["hits"].size());
ASSERT_FALSE(hybrid_results["hits"][0]["document"]["id"] == 0);
}
TEST_F(CollectionVectorTest, TryAddingMultipleImageFieldToEmbedFrom) {
auto schema_json =
R"({
@@ -3045,4 +3121,34 @@ TEST_F(CollectionVectorTest, TryAddingMultipleImageFieldToEmbedFrom) {
ASSERT_FALSE(collection_create_op.ok());
ASSERT_EQ(collection_create_op.error(), "Only one field can be used in the `embed.from` property of an embed field when embedding from an image field.");
}
TEST_F(CollectionVectorTest, TestInvalidImage) {
auto schema_json =
R"({
"name": "Images",
"fields": [
{"name": "name", "type": "string"},
{"name": "image", "type": "image", "store": false},
{"name": "embedding", "type":"float[]", "embed":{"from": ["image"], "model_config": {"model_name": "ts/clip-vit-b-p32"}}}
]
})"_json;
EmbedderManager::set_model_dir("/tmp/typesense_test/models");
auto collection_create_op = collectionManager.create_collection(schema_json);
ASSERT_TRUE(collection_create_op.ok());
auto coll = collection_create_op.get();
auto add_op = coll->add(R"({
"name": "teddy bear",
"image": "invalid"
})"_json.dump());
ASSERT_FALSE(add_op.ok());
ASSERT_EQ(add_op.error(), "Error while processing image");
}

View File

@@ -3,6 +3,8 @@
TEST(FacetIndexTest, FacetValueDeletionString) {
facet_index_t findex;
findex.initialize("brand");
std::unordered_map<facet_value_id_t, std::vector<uint32_t>, facet_value_id_t::Hash> fvalue_to_seq_ids;
std::unordered_map<uint32_t, std::vector<facet_value_id_t>> seq_id_to_fvalues;
@@ -18,15 +20,54 @@ TEST(FacetIndexTest, FacetValueDeletionString) {
doc["brand"] = "nike";
findex.insert("brand", fvalue_to_seq_ids, seq_id_to_fvalues, true);
ASSERT_EQ(3, findex.facet_val_num_ids("brand", "nike"));
findex.remove(doc, brandf, 0);
findex.remove(doc, brandf, 1);
findex.remove(doc, brandf, 2);
ASSERT_EQ(1, findex.facet_val_num_ids("brand", "nike"));
findex.remove(doc, brandf, 2);
ASSERT_FALSE(findex.facet_value_exists("brand", "nike"));
}
TEST(FacetIndexTest, FacetValueDeletionOfLongString) {
facet_index_t findex;
findex.initialize("brand");
std::unordered_map<facet_value_id_t, std::vector<uint32_t>, facet_value_id_t::Hash> fvalue_to_seq_ids;
std::unordered_map<uint32_t, std::vector<facet_value_id_t>> seq_id_to_fvalues;
std::string longval;
for(size_t i = 0; i < 200; i++) {
longval += "a";
}
facet_value_id_t longfval(longval.substr(0, 100), 1);
fvalue_to_seq_ids[longfval] = {0, 1, 2};
seq_id_to_fvalues[0] = {longfval};
seq_id_to_fvalues[1] = {longfval};
seq_id_to_fvalues[2] = {longfval};
field brandf("brand", field_types::STRING, true);
nlohmann::json doc;
doc["brand"] = longval;
findex.insert("brand", fvalue_to_seq_ids, seq_id_to_fvalues, true);
ASSERT_EQ(3, findex.facet_val_num_ids("brand", longval.substr(0, 100)));
findex.remove(doc, brandf, 0);
findex.remove(doc, brandf, 1);
ASSERT_EQ(1, findex.facet_val_num_ids("brand", longval.substr(0, 100)));
findex.remove(doc, brandf, 2);
ASSERT_FALSE(findex.facet_value_exists("brand", longval.substr(0, 100)));
}
TEST(FacetIndexTest, FacetValueDeletionFloat) {
facet_index_t findex;
findex.initialize("price");
std::unordered_map<facet_value_id_t, std::vector<uint32_t>, facet_value_id_t::Hash> fvalue_to_seq_ids;
std::unordered_map<uint32_t, std::vector<facet_value_id_t>> seq_id_to_fvalues;
@@ -39,13 +80,16 @@ TEST(FacetIndexTest, FacetValueDeletionFloat) {
field pricef("price", field_types::FLOAT, true);
nlohmann::json doc;
doc["price"] = "99.95";
doc["price"] = 99.95;
findex.insert("price", fvalue_to_seq_ids, seq_id_to_fvalues, true);
ASSERT_EQ(3, findex.facet_val_num_ids("price", "99.95"));
findex.insert("brand", fvalue_to_seq_ids, seq_id_to_fvalues, true);
findex.remove(doc, pricef, 0);
findex.remove(doc, pricef, 1);
findex.remove(doc, pricef, 2);
ASSERT_EQ(1, findex.facet_val_num_ids("price", "99.95"));
findex.remove(doc, pricef, 2);
ASSERT_FALSE(findex.facet_value_exists("price", "99.95"));
}
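
In the FacetValueDeletionOfLongString test above, the lookups use longval.substr(0, 100) rather than the full 200-character value, i.e. the facet index is expected to key long string values by a truncated prefix; the 100-character cutoff is taken from the test itself, not from the facet_index_t interface. A trivial sketch of that keying assumption:

#include <iostream>
#include <string>

int main() {
    // 200-character facet value, looked up by its first 100 characters
    // (cutoff taken from the test, not from facet_index_t).
    std::string longval(200, 'a');
    std::string facet_key = longval.substr(0, 100);
    std::cout << longval.size() << " chars stored, keyed by first "
              << facet_key.size() << " chars\n";   // 200 chars stored, keyed by first 100 chars
    return 0;
}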