From 2b154226cab4af7528c1c065d93163940c93ce2c Mon Sep 17 00:00:00 2001
From: ozanarmagan <o.armagan2020@gtu.edu.tr>
Date: Wed, 22 Nov 2023 22:28:15 +0300
Subject: [PATCH 01/11] Fix hybrid search with filters

---
 src/index.cpp                          | 12 ++++
 test/collection_vector_search_test.cpp | 78 ++++++++++++++++++++++++++
 2 files changed, 90 insertions(+)
diff --git a/src/index.cpp b/src/index.cpp
index f883c127..8be15837 100644
--- a/src/index.cpp
+++ b/src/index.cpp
@@ -3206,6 +3206,18 @@ Option<bool> Index::search(std::vector<query_tokens_t>& field_query_tokens, cons
                 // For hybrid search, we need to give weight to text match and vector search
                 const float VECTOR_SEARCH_WEIGHT = vector_query.alpha;
                 const float TEXT_MATCH_WEIGHT = 1.0 - VECTOR_SEARCH_WEIGHT;
+                
+                bool no_filters_provided = (filter_tree_root == nullptr && filter_result.count == 0);
+
+                // list of all document ids
+                if (no_filters_provided) {
+                    filter_result.count = seq_ids->num_ids();
+                    filter_result.docs = seq_ids->uncompress();
+                }
+
+                curate_filtered_ids(curated_ids, excluded_result_ids,
+                                    excluded_result_ids_size, filter_result.docs, filter_result.count, curated_ids_sorted);
+                collate_included_ids({}, included_ids_map, curated_topster, searched_queries);
 
                 VectorFilterFunctor filterFunctor(filter_result.docs, filter_result.count);
                 auto& field_vector_index = vector_index.at(vector_query.field_name);
diff --git a/test/collection_vector_search_test.cpp b/test/collection_vector_search_test.cpp
index 72244efe..299b001c 100644
--- a/test/collection_vector_search_test.cpp
+++ b/test/collection_vector_search_test.cpp
@@ -2822,4 +2822,82 @@ TEST_F(CollectionVectorTest, TestSemanticSearchAfterUpdate) {
     ASSERT_TRUE(result.ok());
     ASSERT_EQ(1, result.get()["hits"].size());
     ASSERT_EQ("potato", result.get()["hits"][0]["document"]["name"]);   
+}
+
+// Verifies that a document listed in hidden_hits is excluded from hybrid (text + embedding) search results.
+TEST_F(CollectionVectorTest, TestHybridSearchHiddenHits) {
+    nlohmann::json schema = R"({
+                "name": "test",
+                "fields": [
+                    {
+                        "name": "name",
+                        "type": "string"
+                    },
+                    {
+                        "name": "embedding",
+                        "type": "float[]",
+                        "embed": {
+                            "from": [
+                                "name"
+                            ],
+                            "model_config": {
+                                "model_name": "ts/e5-small"
+                            }
+                        }
+                    }
+                ]
+                })"_json;
+    // Point the embedder manager at the model directory used by the tests.
+    TextEmbedderManager::set_model_dir("/tmp/typesense_test/models");
+
+    auto collection_create_op = collectionManager.create_collection(schema);
+    ASSERT_TRUE(collection_create_op.ok());
+
+    auto coll = collection_create_op.get();
+
+    auto add_op = coll->add(R"({
+        "name": "soccer",
+        "id": "0"
+    })"_json.dump());
+
+    ASSERT_TRUE(add_op.ok());
+
+    add_op = coll->add(R"({
+        "name": "guitar",
+        "id": "1"
+    })"_json.dump());
+
+    ASSERT_TRUE(add_op.ok());
+
+    add_op = coll->add(R"({
+        "name": "typesense",
+        "id": "2"
+    })"_json.dump());
+
+    ASSERT_TRUE(add_op.ok());
+
+    add_op = coll->add(R"({
+        "name": "potato",
+        "id": "3"
+    })"_json.dump());
+
+    ASSERT_TRUE(add_op.ok());
+    // Baseline: without hidden_hits the hybrid search returns all 4 documents, with "0" ranked first.
+    auto results = coll->search("sports", {"name", "embedding"},
+                                 "", {}, {}, {2}, 10,
+                                 1, FREQUENCY, {true},
+                                 0, spp::sparse_hash_set<std::string>()).get();
+    
+    ASSERT_EQ(4, results["hits"].size());
+    ASSERT_STREQ("0", results["hits"][0]["document"]["id"].get<std::string>().c_str());
+
+
+    // Repeat the same hybrid search, this time passing document "0" (the trailing argument) as hidden_hits.
+    auto hybrid_results = coll->search("sports", {"name", "embedding"},
+                                 "", {}, {}, {2}, 10,
+                                 1, FREQUENCY, {true},
+                                 0, spp::sparse_hash_set<std::string>(), spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 1, "", "0").get();
+    // Ids are JSON strings: compare as strings, since comparing against the integer 0 can never match.
+    ASSERT_EQ(3, hybrid_results["hits"].size());
+    ASSERT_STRNE("0", hybrid_results["hits"][0]["document"]["id"].get<std::string>().c_str());
 }
\ No newline at end of file

From a8b936bee85b0ef01dbdc639f0a41b984fee12f1 Mon Sep 17 00:00:00 2001
From: Jason Bosco <mail@jasonbos.co>
Date: Wed, 22 Nov 2023 17:28:15 -0600
Subject: [PATCH 02/11] Try manually unzipping, to prevent 2GB file size error

---
 .github/workflows/tests.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 649da61a..0800ecc1 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -44,9 +44,11 @@ jobs:
           search_artifacts: true
           workflow_conclusion: ""
           if_no_artifact_found: warn
+          skip_unpack: true
 
       - name: Uncompress bazel cache
         run: |
+          unzip bazel-cache.zip
           mkdir -p ~/.cache/bazel
           tar_file="bazel-cache.tar.gz" && \
             [ -f "$tar_file" ] && \

From a2c5d24802d2e49884d4d51736baa66201e95997 Mon Sep 17 00:00:00 2001
From: ozanarmagan <o.armagan2020@gtu.edu.tr>
Date: Thu, 23 Nov 2023 22:23:15 +0300
Subject: [PATCH 03/11] Refactor `VectorFilterFunctor` to include
 `excluded_ids`

---
 include/index.h | 15 +++++++++++++--
 src/index.cpp   | 16 ++--------------
 2 files changed, 15 insertions(+), 16 deletions(-)

diff --git a/include/index.h b/include/index.h
index 1d895c02..8020c23f 100644
--- a/include/index.h
+++ b/include/index.h
@@ -269,11 +269,22 @@ class VectorFilterFunctor: public hnswlib::BaseFilterFunctor {
     const uint32_t* filter_ids = nullptr;
     const uint32_t filter_ids_length = 0;
 
+    const uint32_t* excluded_ids = nullptr;
+    const uint32_t excluded_ids_length = 0;
+
 public:
-    explicit VectorFilterFunctor(const uint32_t* filter_ids, const uint32_t filter_ids_length) :
-            filter_ids(filter_ids), filter_ids_length(filter_ids_length) {}
+    explicit VectorFilterFunctor(const uint32_t* filter_ids, const uint32_t filter_ids_length, const uint32_t* excluded_ids = nullptr, const uint32_t excluded_ids_length = 0) :
+            filter_ids(filter_ids), filter_ids_length(filter_ids_length), excluded_ids(excluded_ids), excluded_ids_length(excluded_ids_length) {}
 
     bool operator()(hnswlib::labeltype id) override {
+        if(filter_ids_length == 0 && excluded_ids_length == 0) {
+            return true;
+        }
+
+        if(excluded_ids_length > 0 && excluded_ids && std::binary_search(excluded_ids, excluded_ids + excluded_ids_length, id)) {
+            return false;
+        }
+
         if(filter_ids_length == 0) {
             return true;
         }
diff --git a/src/index.cpp b/src/index.cpp
index 8be15837..90e7131b 100644
--- a/src/index.cpp
+++ b/src/index.cpp
@@ -2901,7 +2901,7 @@ Option<bool> Index::search(std::vector<query_tokens_t>& field_query_tokens, cons
                 k++;
             }
 
-            VectorFilterFunctor filterFunctor(filter_result.docs, filter_result.count);
+            VectorFilterFunctor filterFunctor(filter_result.docs, filter_result.count, excluded_result_ids, excluded_result_ids_size);
             auto& field_vector_index = vector_index.at(vector_query.field_name);
 
             std::vector<std::pair<float, size_t>> dist_labels;
@@ -3206,20 +3206,8 @@ Option<bool> Index::search(std::vector<query_tokens_t>& field_query_tokens, cons
                 // For hybrid search, we need to give weight to text match and vector search
                 const float VECTOR_SEARCH_WEIGHT = vector_query.alpha;
                 const float TEXT_MATCH_WEIGHT = 1.0 - VECTOR_SEARCH_WEIGHT;
-                
-                bool no_filters_provided = (filter_tree_root == nullptr && filter_result.count == 0);
 
-                // list of all document ids
-                if (no_filters_provided) {
-                    filter_result.count = seq_ids->num_ids();
-                    filter_result.docs = seq_ids->uncompress();
-                }
-
-                curate_filtered_ids(curated_ids, excluded_result_ids,
-                                    excluded_result_ids_size, filter_result.docs, filter_result.count, curated_ids_sorted);
-                collate_included_ids({}, included_ids_map, curated_topster, searched_queries);
-
-                VectorFilterFunctor filterFunctor(filter_result.docs, filter_result.count);
+                VectorFilterFunctor filterFunctor(filter_result.docs, filter_result.count, excluded_result_ids, excluded_result_ids_size);
                 auto& field_vector_index = vector_index.at(vector_query.field_name);
                 std::vector<std::pair<float, size_t>> dist_labels;
                 // use k as 100 by default for ensuring results stability in pagination

From 09de5fff202734d6e0d7a14ae8690a54120397c1 Mon Sep 17 00:00:00 2001
From: Jason Bosco <mail@jasonbos.co>
Date: Thu, 23 Nov 2023 17:09:51 -0600
Subject: [PATCH 04/11] Upgrade action-download-artifact to hopefully fix CI

---
 .github/workflows/tests.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 0800ecc1..4c7cf196 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -38,7 +38,7 @@ jobs:
         uses: bazelbuild/setup-bazelisk@v2
 
       - name: Download bazel cache
-        uses: dawidd6/action-download-artifact@v2
+        uses: dawidd6/action-download-artifact@v2.28.0
         with:
           name: bazel-cache
           search_artifacts: true

From dd215ffa49240d1bb3b0312ea0c61530755ec26b Mon Sep 17 00:00:00 2001
From: ozanarmagan <o.armagan2020@gtu.edu.tr>
Date: Mon, 27 Nov 2023 02:20:10 +0300
Subject: [PATCH 05/11] Fix error messages for remote embedders

---
 src/text_embedder_remote.cpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/text_embedder_remote.cpp b/src/text_embedder_remote.cpp
index 27409c2d..feb1900f 100644
--- a/src/text_embedder_remote.cpp
+++ b/src/text_embedder_remote.cpp
@@ -106,7 +106,7 @@ Option<bool> OpenAIEmbedder::is_model_valid(const nlohmann::json& model_config,
         if(json_res.count("error") == 0 || json_res["error"].count("message") == 0) {
             return Option<bool>(400, "OpenAI API error: " + res);
         }
-        return Option<bool>(400, "OpenAI API error: " + nlohmann::json::parse(res)["error"]["message"].get<std::string>());
+        return Option<bool>(400, "OpenAI API error: " + json_res["error"]["message"].get<std::string>());
     }
 
     nlohmann::json models_json;
@@ -152,7 +152,7 @@ Option<bool> OpenAIEmbedder::is_model_valid(const nlohmann::json& model_config,
         if(json_res.count("error") == 0 || json_res["error"].count("message") == 0) {
             return Option<bool>(400, "OpenAI API error: " + embedding_res);
         }
-        return Option<bool>(400, "OpenAI API error: " + nlohmann::json::parse(res)["error"]["message"].get<std::string>());
+        return Option<bool>(400, "OpenAI API error: " + json_res["error"]["message"].get<std::string>());
     }
     std::vector<float> embedding;
     try {
@@ -337,7 +337,7 @@ Option<bool> GoogleEmbedder::is_model_valid(const nlohmann::json& model_config,
             return Option<bool>(400, "Google API error: " + res);
         }
         
-        return Option<bool>(400, "Google API error: " + nlohmann::json::parse(res)["error"]["message"].get<std::string>());
+        return Option<bool>(400, "Google API error: " + json_res["error"]["message"].get<std::string>());
     }
 
     try {
@@ -477,7 +477,7 @@ Option<bool> GCPEmbedder::is_model_valid(const nlohmann::json& model_config, siz
         if(json_res.count("error") == 0 || json_res["error"].count("message") == 0) {
             return Option<bool>(400, "GCP API error: " + res);
         }
-        return Option<bool>(400, "GCP API error: " + nlohmann::json::parse(res)["error"]["message"].get<std::string>());
+        return Option<bool>(400, "GCP API error: " + json_res["error"]["message"].get<std::string>());
     }
     nlohmann::json res_json;
     try {
@@ -680,7 +680,7 @@ Option<std::string> GCPEmbedder::generate_access_token(const std::string& refres
         if(res_code == 408) {
             return Option<std::string>(408, "GCP API timeout.");
         }
-        return Option<std::string>(400, "GCP API error: " + nlohmann::json::parse(res)["error"]["message"].get<std::string>());
+        return Option<std::string>(400, "GCP API error: " + json_res["error"]["message"].get<std::string>());
     }
     nlohmann::json res_json;
     try {

From 34f748d9b06adb189951f91f1d11e0de683b9d9f Mon Sep 17 00:00:00 2001
From: Jason Bosco <mail@jasonbos.co>
Date: Sun, 26 Nov 2023 19:28:33 -0600
Subject: [PATCH 06/11] Attempt to fix bazel cache loading issue for files
 larger than 2GB

---
 .github/workflows/tests.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 4c7cf196..4c602a25 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -38,7 +38,7 @@ jobs:
         uses: bazelbuild/setup-bazelisk@v2
 
       - name: Download bazel cache
-        uses: dawidd6/action-download-artifact@v2.28.0
+        uses: jasonbosco/action-download-artifact@c4d5f373a5d7da32fe23a91e414baa3e81eeef42
         with:
           name: bazel-cache
           search_artifacts: true

From 59628c91b9a9adcf11ea4fd6b42759bc306f8911 Mon Sep 17 00:00:00 2001
From: Jason Bosco <mail@jasonbos.co>
Date: Sun, 26 Nov 2023 19:33:43 -0600
Subject: [PATCH 07/11] Attempt 2: fix bazel cache loading for files > 2GB

---
 .github/workflows/tests.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 4c602a25..c0d13967 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -38,7 +38,7 @@ jobs:
         uses: bazelbuild/setup-bazelisk@v2
 
       - name: Download bazel cache
-        uses: jasonbosco/action-download-artifact@c4d5f373a5d7da32fe23a91e414baa3e81eeef42
+        uses: jasonbosco/action-download-artifact@d7ab2dcce12fbef7a1565790bac7cf24a319b066
         with:
           name: bazel-cache
           search_artifacts: true

From 98f599719ea4ab3528211eab2ec7c0c000c39b9d Mon Sep 17 00:00:00 2001
From: Jason Bosco <mail@jasonbos.co>
Date: Sun, 26 Nov 2023 19:35:18 -0600
Subject: [PATCH 08/11] Attempt 3: fix bazel cache loading for files > 2GB

---
 .github/workflows/tests.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index c0d13967..280e69e9 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -38,7 +38,7 @@ jobs:
         uses: bazelbuild/setup-bazelisk@v2
 
       - name: Download bazel cache
-        uses: jasonbosco/action-download-artifact@d7ab2dcce12fbef7a1565790bac7cf24a319b066
+        uses: jasonbosco/action-download-artifact@709b71d3729e8980f52a5a2a9ec04261060945c1
         with:
           name: bazel-cache
           search_artifacts: true

From 9ad9caecc94fbe84a5e07ec679ae021089f1312e Mon Sep 17 00:00:00 2001
From: Jason Bosco <mail@jasonbos.co>
Date: Sun, 26 Nov 2023 19:38:20 -0600
Subject: [PATCH 09/11] Trigger CI


From 721d4ed7dc86900087ed3de6d83a97b7b460651a Mon Sep 17 00:00:00 2001
From: Kishore Nallan <kishorenc@gmail.com>
Date: Sat, 25 Nov 2023 21:36:08 +0530
Subject: [PATCH 10/11] Reduce noise of per-collection housekeeping log.

---
 src/housekeeper.cpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/housekeeper.cpp b/src/housekeeper.cpp
index 07150e11..370a3074 100644
--- a/src/housekeeper.cpp
+++ b/src/housekeeper.cpp
@@ -31,7 +31,10 @@ void HouseKeeper::run() {
             }
 
             coll->do_housekeeping();
-            LOG(INFO) << "Ran housekeeping.";
+        }
+
+        if(!coll_names.empty()) {
+            LOG(INFO) << "Ran housekeeping for " << coll_names.size() << " collections.";
         }
 
         prev_persistence_s = std::chrono::duration_cast<std::chrono::seconds>(

From c71245ca1410cb653a917b6cc2cf560a8f4a5ea3 Mon Sep 17 00:00:00 2001
From: Kishore Nallan <kishorenc@gmail.com>
Date: Thu, 30 Nov 2023 14:08:39 +0530
Subject: [PATCH 11/11] Fix bug with deeply nested optional array of obj field.

---
 src/field.cpp                          |  4 +-
 test/collection_nested_fields_test.cpp | 75 ++++++++++++++++++++++++++
 2 files changed, 78 insertions(+), 1 deletion(-)

diff --git a/src/field.cpp b/src/field.cpp
index ea99b72f..8c613e0c 100644
--- a/src/field.cpp
+++ b/src/field.cpp
@@ -1022,9 +1022,11 @@ Option<bool> field::flatten_field(nlohmann::json& doc, nlohmann::json& obj, cons
             return flatten_field(doc, it.value(), the_field, path_parts, path_index + 1, has_array, has_obj_array,
                                  is_update, dyn_fields, flattened_fields);
         }
-    } {
+    } else if(!the_field.optional) {
         return Option<bool>(404, "Field `" + the_field.name + "` not found.");
     }
+
+    return Option<bool>(true);
 }
 
 Option<bool> field::flatten_doc(nlohmann::json& document,
diff --git a/test/collection_nested_fields_test.cpp b/test/collection_nested_fields_test.cpp
index 1c4bfb24..bcab0f73 100644
--- a/test/collection_nested_fields_test.cpp
+++ b/test/collection_nested_fields_test.cpp
@@ -3056,6 +3056,81 @@ TEST_F(CollectionNestedFieldsTest, HighlightArrayOfObjects) {
     ASSERT_EQ(1, results["hits"][0]["highlight"]["details"][2].size());
 }
 
+TEST_F(CollectionNestedFieldsTest, DeepNestedOptionalArrayValue) {
+    nlohmann::json schema = R"({
+        "name": "coll1",
+        "enable_nested_fields": true,
+        "fields": [
+            {
+                "facet": false,
+                "index": true,
+                "infix": false,
+                "locale": "",
+                "name": "items.name",
+                "optional": true,
+                "sort": false,
+                "type": "string[]"
+            },
+            {
+                "facet": false,
+                "index": true,
+                "infix": false,
+                "locale": "",
+                "name": "items.description",
+                "optional": true,
+                "sort": false,
+                "type": "string[]"
+            },
+            {
+                "facet": false,
+                "index": true,
+                "infix": false,
+                "locale": "",
+                "name": "items.nested_items.name",
+                "optional": true,
+                "sort": false,
+                "type": "string[]"
+            }
+        ]
+    })"_json;
+    // Regression case: every field is optional, and the first "nested_items" object in the document below has no "name" key.
+    auto op = collectionManager.create_collection(schema);
+    ASSERT_TRUE(op.ok());
+    Collection* coll1 = op.get();
+
+    auto doc1 = R"({
+        "items": [
+            {
+                "description": "random description.",
+                "name": "foobar",
+                "nested_items": [
+                    {
+                        "isAvailable": true
+                    },
+                    {
+                        "description": "nested description here",
+                        "isAvailable": true,
+                        "name": "naruto"
+                    },
+                    {
+                        "description": "description again",
+                        "isAvailable": true,
+                        "name": "dragon ball"
+                    }
+                ]
+            }
+        ]
+    })"_json;
+    // Indexing must succeed even though "items.nested_items.name" is absent from one of the nested objects.
+    auto add_op = coll1->add(doc1.dump(), CREATE);
+    ASSERT_TRUE(add_op.ok());
+    // The values that are present must still be searchable through the nested field.
+    auto results = coll1->search("naruto", {"items.nested_items.name"}, "", {}, {}, {0}, 10, 1, FREQUENCY,
+                                 {true}, 1, spp::sparse_hash_set<std::string>(),
+                                 spp::sparse_hash_set<std::string>(), 10, "", 30, 4).get();
+    ASSERT_EQ(1, results["found"].get<size_t>());
+}
+
 TEST_F(CollectionNestedFieldsTest, FloatInsideNestedObject) {
     nlohmann::json schema = R"({
         "name": "coll1",