From 2b154226cab4af7528c1c065d93163940c93ce2c Mon Sep 17 00:00:00 2001
From: ozanarmagan
Date: Wed, 22 Nov 2023 22:28:15 +0300
Subject: [PATCH 1/4] Fix hybrid search with filters

---
 src/index.cpp                          | 12 ++++
 test/collection_vector_search_test.cpp | 78 ++++++++++++++++++++++++++
 2 files changed, 90 insertions(+)

diff --git a/src/index.cpp b/src/index.cpp
index f883c127..8be15837 100644
--- a/src/index.cpp
+++ b/src/index.cpp
@@ -3206,6 +3206,18 @@ Option<bool> Index::search(std::vector<query_tokens_t>& field_query_tokens, cons
                 // For hybrid search, we need to give weight to text match and vector search
                 const float VECTOR_SEARCH_WEIGHT = vector_query.alpha;
                 const float TEXT_MATCH_WEIGHT = 1.0 - VECTOR_SEARCH_WEIGHT;
+
+                bool no_filters_provided = (filter_tree_root == nullptr && filter_result.count == 0);
+
+                // list of all document ids
+                if (no_filters_provided) {
+                    filter_result.count = seq_ids->num_ids();
+                    filter_result.docs = seq_ids->uncompress();
+                }
+
+                curate_filtered_ids(curated_ids, excluded_result_ids,
+                                    excluded_result_ids_size, filter_result.docs, filter_result.count, curated_ids_sorted);
+                collate_included_ids({}, included_ids_map, curated_topster, searched_queries);
 
                 VectorFilterFunctor filterFunctor(filter_result.docs, filter_result.count);
                 auto& field_vector_index = vector_index.at(vector_query.field_name);
diff --git a/test/collection_vector_search_test.cpp b/test/collection_vector_search_test.cpp
index 72244efe..299b001c 100644
--- a/test/collection_vector_search_test.cpp
+++ b/test/collection_vector_search_test.cpp
@@ -2822,4 +2822,82 @@ TEST_F(CollectionVectorTest, TestSemanticSearchAfterUpdate) {
     ASSERT_TRUE(result.ok());
     ASSERT_EQ(1, result.get()["hits"].size());
     ASSERT_EQ("potato", result.get()["hits"][0]["document"]["name"]);
+}
+
+
+TEST_F(CollectionVectorTest, TestHybridSearchHiddenHits) {
+    nlohmann::json schema = R"({
+        "name": "test",
+        "fields": [
+            {
+                "name": "name",
+                "type": "string"
+            },
+            {
+                "name": "embedding",
+                "type": "float[]",
+                "embed": {
+                    "from": [
+                        "name"
+                    ],
+                    "model_config": {
+                        "model_name": "ts/e5-small"
+                    }
+                }
+            }
+        ]
+    })"_json;
+
+    TextEmbedderManager::set_model_dir("/tmp/typesense_test/models");
+
+    auto collection_create_op = collectionManager.create_collection(schema);
+    ASSERT_TRUE(collection_create_op.ok());
+
+    auto coll = collection_create_op.get();
+
+    auto add_op = coll->add(R"({
+        "name": "soccer",
+        "id": "0"
+    })"_json.dump());
+
+    ASSERT_TRUE(add_op.ok());
+
+    add_op = coll->add(R"({
+        "name": "guitar",
+        "id": "1"
+    })"_json.dump());
+
+    ASSERT_TRUE(add_op.ok());
+
+    add_op = coll->add(R"({
+        "name": "typesense",
+        "id": "2"
+    })"_json.dump());
+
+    ASSERT_TRUE(add_op.ok());
+
+    add_op = coll->add(R"({
+        "name": "potato",
+        "id": "3"
+    })"_json.dump());
+
+    ASSERT_TRUE(add_op.ok());
+
+    auto results = coll->search("sports", {"name", "embedding"},
+                                "", {}, {}, {2}, 10,
+                                1, FREQUENCY, {true},
+                                0, spp::sparse_hash_set<std::string>()).get();
+
+    ASSERT_EQ(4, results["hits"].size());
+    ASSERT_STREQ("0", results["hits"][0]["document"]["id"].get<std::string>().c_str());
+
+
+    // do hybrid search with hidden_hits
+    auto hybrid_results = coll->search("sports", {"name", "embedding"},
+                                       "", {}, {}, {2}, 10,
+                                       1, FREQUENCY, {true},
+                                       0, spp::sparse_hash_set<std::string>(), spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 1, "", "0").get();
+
+    ASSERT_EQ(3, hybrid_results["hits"].size());
+    ASSERT_FALSE(hybrid_results["hits"][0]["document"]["id"] == "0");
 }
\ No newline at end of file
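[Illustrative aside, not part of the patch series] The src/index.cpp hunk above handles hybrid searches that arrive without a filter_by: filter_result is seeded with every sequence id, and curate_filtered_ids() then drops curated/excluded ids (such as those from hidden_hits) before the vector index is queried. A minimal standalone sketch of that intent follows; this is not Typesense code, the ids are invented for illustration, and both id lists are assumed to be sorted.

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <iterator>
#include <vector>

int main() {
    // Stand-ins: all_doc_ids plays the role of seq_ids->uncompress(),
    // excluded_ids the role of excluded_result_ids built from hidden_hits ("0").
    std::vector<uint32_t> all_doc_ids  = {0, 1, 2, 3};
    std::vector<uint32_t> excluded_ids = {0};

    // Remove excluded ids from the candidate set before the vector search runs.
    std::vector<uint32_t> candidate_ids;
    std::set_difference(all_doc_ids.begin(), all_doc_ids.end(),
                        excluded_ids.begin(), excluded_ids.end(),
                        std::back_inserter(candidate_ids));

    for (uint32_t id : candidate_ids) {
        std::cout << id << ' ';   // prints: 1 2 3
    }
    std::cout << '\n';
    return 0;
}

Under this scheme document id 0 never reaches the vector search at all, which is why the hidden hit cannot show up in the hybrid results asserted by the test above.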
From a8b936bee85b0ef01dbdc639f0a41b984fee12f1 Mon Sep 17 00:00:00 2001
From: Jason Bosco
Date: Wed, 22 Nov 2023 17:28:15 -0600
Subject: [PATCH 2/4] Try manually unzipping, to prevent 2GB file size error

---
 .github/workflows/tests.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 649da61a..0800ecc1 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -44,9 +44,11 @@ jobs:
         search_artifacts: true
         workflow_conclusion: ""
         if_no_artifact_found: warn
+        skip_unpack: true
 
     - name: Uncompress bazel cache
       run: |
+        unzip bazel-cache.zip
         mkdir -p ~/.cache/bazel
         tar_file="bazel-cache.tar.gz" && \
         [ -f "$tar_file" ] && \

From a2c5d24802d2e49884d4d51736baa66201e95997 Mon Sep 17 00:00:00 2001
From: ozanarmagan
Date: Thu, 23 Nov 2023 22:23:15 +0300
Subject: [PATCH 3/4] Refactor ```VectorFilterFunctor``` to include ```excluded_ids```

---
 include/index.h | 15 +++++++++++++--
 src/index.cpp   | 16 ++--------------
 2 files changed, 15 insertions(+), 16 deletions(-)

diff --git a/include/index.h b/include/index.h
index 1d895c02..8020c23f 100644
--- a/include/index.h
+++ b/include/index.h
@@ -269,11 +269,22 @@ class VectorFilterFunctor: public hnswlib::BaseFilterFunctor {
     const uint32_t* filter_ids = nullptr;
     const uint32_t filter_ids_length = 0;
 
+    const uint32_t* excluded_ids = nullptr;
+    const uint32_t excluded_ids_length = 0;
+
 public:
-    explicit VectorFilterFunctor(const uint32_t* filter_ids, const uint32_t filter_ids_length) :
-            filter_ids(filter_ids), filter_ids_length(filter_ids_length) {}
+    explicit VectorFilterFunctor(const uint32_t* filter_ids, const uint32_t filter_ids_length, const uint32_t* excluded_ids = nullptr, const uint32_t excluded_ids_length = 0) :
+            filter_ids(filter_ids), filter_ids_length(filter_ids_length), excluded_ids(excluded_ids), excluded_ids_length(excluded_ids_length) {}
 
     bool operator()(hnswlib::labeltype id) override {
+        if(filter_ids_length == 0 && excluded_ids_length == 0) {
+            return true;
+        }
+
+        if(excluded_ids_length > 0 && excluded_ids && std::binary_search(excluded_ids, excluded_ids + excluded_ids_length, id)) {
+            return false;
+        }
+
         if(filter_ids_length == 0) {
             return true;
         }
diff --git a/src/index.cpp b/src/index.cpp
index 8be15837..90e7131b 100644
--- a/src/index.cpp
+++ b/src/index.cpp
@@ -2901,7 +2901,7 @@ Option<bool> Index::search(std::vector<query_tokens_t>& field_query_tokens, cons
                 k++;
             }
 
-            VectorFilterFunctor filterFunctor(filter_result.docs, filter_result.count);
+            VectorFilterFunctor filterFunctor(filter_result.docs, filter_result.count, excluded_result_ids, excluded_result_ids_size);
             auto& field_vector_index = vector_index.at(vector_query.field_name);
             std::vector<std::pair<float, size_t>> dist_labels;
@@ -3206,20 +3206,8 @@ Option<bool> Index::search(std::vector<query_tokens_t>& field_query_tokens, cons
                 // For hybrid search, we need to give weight to text match and vector search
                 const float VECTOR_SEARCH_WEIGHT = vector_query.alpha;
                 const float TEXT_MATCH_WEIGHT = 1.0 - VECTOR_SEARCH_WEIGHT;
-
-                bool no_filters_provided = (filter_tree_root == nullptr && filter_result.count == 0);
-
-                // list of all document ids
-                if (no_filters_provided) {
-                    filter_result.count = seq_ids->num_ids();
-                    filter_result.docs = seq_ids->uncompress();
-                }
-
-                curate_filtered_ids(curated_ids, excluded_result_ids,
-                                    excluded_result_ids_size, filter_result.docs, filter_result.count, curated_ids_sorted);
-                collate_included_ids({}, included_ids_map, curated_topster, searched_queries);
 
-                VectorFilterFunctor filterFunctor(filter_result.docs, filter_result.count);
+                VectorFilterFunctor filterFunctor(filter_result.docs, filter_result.count, excluded_result_ids, excluded_result_ids_size);
                 auto& field_vector_index = vector_index.at(vector_query.field_name);
                 std::vector<std::pair<float, size_t>> dist_labels;
                 // use k as 100 by default for ensuring results stability in pagination

From 09de5fff202734d6e0d7a14ae8690a54120397c1 Mon Sep 17 00:00:00 2001
From: Jason Bosco
Date: Thu, 23 Nov 2023 17:09:51 -0600
Subject: [PATCH 4/4] Upgrade action-download-artifact to hopefully fix CI

---
 .github/workflows/tests.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 0800ecc1..4c7cf196 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -38,7 +38,7 @@ jobs:
       uses: bazelbuild/setup-bazelisk@v2
 
     - name: Download bazel cache
-      uses: dawidd6/action-download-artifact@v2
+      uses: dawidd6/action-download-artifact@v2.28.0
       with:
         name: bazel-cache
         search_artifacts: true
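[Illustrative aside, not part of the patch series] Taken together, patches 1 and 3 move the hidden-hits handling out of the pre-trimmed id list and into the HNSW filter functor itself. Below is a self-contained sketch of how the refactored VectorFilterFunctor in include/index.h is expected to behave. Assumptions not taken from the patch: labeltype is aliased to std::size_t so the snippet builds without hnswlib, both id arrays are sorted ascending as std::binary_search requires, and the final allow-list check stands in for the functor's pre-existing filter_ids logic, which the patch does not change.

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>

using labeltype = std::size_t;  // stand-in for hnswlib::labeltype

struct FilterFunctorSketch {
    const uint32_t* filter_ids;
    uint32_t filter_ids_length;
    const uint32_t* excluded_ids;
    uint32_t excluded_ids_length;

    bool operator()(labeltype id) const {
        // No allow-list and no exclusions: every candidate passes.
        if (filter_ids_length == 0 && excluded_ids_length == 0) {
            return true;
        }
        // Excluded ids (e.g. built from hidden_hits) are rejected even without a filter.
        if (excluded_ids_length > 0 && excluded_ids != nullptr &&
            std::binary_search(excluded_ids, excluded_ids + excluded_ids_length, id)) {
            return false;
        }
        // No allow-list: anything not excluded passes.
        if (filter_ids_length == 0) {
            return true;
        }
        // Otherwise the candidate must be present in the allow-list.
        return std::binary_search(filter_ids, filter_ids + filter_ids_length, id);
    }
};

int main() {
    const uint32_t excluded[] = {0};                 // document hidden via hidden_hits
    FilterFunctorSketch f{nullptr, 0, excluded, 1};  // no filter_by, one exclusion
    assert(!f(0));                                   // hidden document is skipped during traversal
    assert(f(3));                                    // other documents still pass
    return 0;
}

With no filter ids and document 0 excluded, candidates with id 0 are rejected while the graph is traversed and everything else passes, which matches what the TestHybridSearchHiddenHits test in patch 1 asserts.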