From 596c1c31f480f92c0169e864424dd2f0c93d0d99 Mon Sep 17 00:00:00 2001
From: ozanarmagan
Date: Tue, 1 Aug 2023 11:06:14 +0300
Subject: [PATCH 1/4] Fix HybridSearchWithExplicitVector test

---
 test/collection_vector_search_test.cpp | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/test/collection_vector_search_test.cpp b/test/collection_vector_search_test.cpp
index 7f1e7eef..84e54d03 100644
--- a/test/collection_vector_search_test.cpp
+++ b/test/collection_vector_search_test.cpp
@@ -775,7 +775,7 @@ TEST_F(CollectionVectorTest, HybridSearchWithExplicitVector) {
     ASSERT_EQ(2, search_res["found"].get<size_t>());
     ASSERT_EQ(2, search_res["hits"].size());
 
-    ASSERT_FLOAT_EQ(0.0462081432, search_res["hits"][0]["vector_distance"].get<float>());
+    ASSERT_FLOAT_EQ(0.046207964, search_res["hits"][0]["vector_distance"].get<float>());
     ASSERT_FLOAT_EQ(0.1213316321, search_res["hits"][1]["vector_distance"].get<float>());
 
     // to pass k param
@@ -825,10 +825,6 @@ TEST_F(CollectionVectorTest, HybridSearchWithExplicitVector) {
     ASSERT_FLOAT_EQ(2.0f, search_res["hits"][0]["vector_distance"].get<float>());
     ASSERT_FLOAT_EQ(2.0f, search_res["hits"][1]["vector_distance"].get<float>());
     ASSERT_FLOAT_EQ(2.0f, search_res["hits"][2]["vector_distance"].get<float>());
-
-    ASSERT_FLOAT_EQ(2.0f, search_res["hits"][0]["hybrid_search_info"]["vector_distance"].get<float>());
-    ASSERT_FLOAT_EQ(2.0f, search_res["hits"][1]["hybrid_search_info"]["vector_distance"].get<float>());
-    ASSERT_FLOAT_EQ(2.0f, search_res["hits"][2]["hybrid_search_info"]["vector_distance"].get<float>());
 }
 
 TEST_F(CollectionVectorTest, HybridSearchOnlyVectorMatches) {
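A note on the expectation change in PATCH 1/4: gtest's ASSERT_FLOAT_EQ compares floats to within 4 ULPs, so the assertion already tolerates rounding noise; the expected constant only needs to move when the computed distance itself changes, as it does here by roughly 1e-7. The sketch below is a hypothetical illustration of a cosine-style vector_distance, not Typesense's actual implementation (which delegates to its vector index library); it shows why such values are sensitive to the order of floating point operations.

    #include <cmath>
    #include <cstddef>
    #include <vector>

    // Hypothetical sketch: cosine distance = 1 - cosine similarity.
    // Reordering these floating point operations (e.g. normalizing the
    // vectors up front vs. dividing once at the end) can shift the result
    // in the last significant digits, on the order of the change above.
    float cosine_distance(const std::vector<float>& a, const std::vector<float>& b) {
        float dot = 0.0f, norm_a = 0.0f, norm_b = 0.0f;
        for(std::size_t i = 0; i < a.size(); i++) {
            dot    += a[i] * b[i];
            norm_a += a[i] * a[i];
            norm_b += b[i] * b[i];
        }
        return 1.0f - dot / (std::sqrt(norm_a) * std::sqrt(norm_b));
    }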
From 7998054900349bcc8c073c42316fbfc0714136b1 Mon Sep 17 00:00:00 2001
From: Kishore Nallan
Date: Tue, 1 Aug 2023 18:40:48 +0530
Subject: [PATCH 2/4] Exclude POST /health from batch indexer resource check.

---
 src/batched_indexer.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/batched_indexer.cpp b/src/batched_indexer.cpp
index 93a19382..be3c9974 100644
--- a/src/batched_indexer.cpp
+++ b/src/batched_indexer.cpp
@@ -200,7 +200,8 @@ void BatchedIndexer::run() {
                                                            config.get_disk_used_max_percentage(),
                                                            config.get_memory_used_max_percentage());
 
-            if (resource_check != cached_resource_stat_t::OK && orig_req->http_method != "DELETE") {
+            if (resource_check != cached_resource_stat_t::OK &&
+                orig_req->http_method != "DELETE" && found_rpath->handler != post_health) {
                 const std::string& err_msg = "Rejecting write: running out of resource type: " +
                                              std::string(magic_enum::enum_name(resource_check));
                 LOG(ERROR) << err_msg;
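For context on PATCH 2/4: the batched indexer replays queued writes on a background thread, and before applying each one it checks whether disk and memory usage are within the configured limits. DELETE was already exempt from the check (deletes can free resources); this patch additionally exempts the POST /health handler, so health reporting keeps working on a node that is out of disk or memory. Below is a minimal sketch of the post-patch rule; the enum and the boolean parameter are simplified stand-ins for the real cached_resource_stat_t and route-handler comparison.

    #include <string>

    enum class resource_stat { OK, OUT_OF_DISK, OUT_OF_MEMORY };

    // Simplified sketch of the gating rule after this patch: a queued write
    // is rejected only if resources are exhausted AND it is neither a DELETE
    // (which can free space) nor the POST /health handler.
    bool should_reject_write(resource_stat resource_check,
                             const std::string& http_method,
                             bool is_post_health_handler) {
        return resource_check != resource_stat::OK &&
               http_method != "DELETE" &&
               !is_post_health_handler;
    }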
From 770c192e9762e088ba217989638674c42606d48a Mon Sep 17 00:00:00 2001
From: Kishore Nallan
Date: Wed, 2 Aug 2023 07:15:40 +0530
Subject: [PATCH 3/4] Add onnx patch for arm gpu for T4

---
 bazel/onnx.patch | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/bazel/onnx.patch b/bazel/onnx.patch
index 121a941d..0ceeb6db 100644
--- a/bazel/onnx.patch
+++ b/bazel/onnx.patch
@@ -11,6 +11,19 @@ index 88b46890b7..d090499971 100644
     __cmake_contentNameLower
     __cmake_contentName
 
+
+--- cmake/CMakeLists.txt
++++ cmake/CMakeLists.txt
+@@ -1268,6 +1268,7 @@ if (onnxruntime_USE_CUDA)
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_53,code=sm_53") # TX1, Nano
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_62,code=sm_62") # TX2
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_72,code=sm_72") # AGX Xavier, NX Xavier
++   set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_75,code=sm_75") # T4
+    if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 11)
+      set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_87,code=sm_87") # AGX Orin, NX Orin
+    endif()
+
+
 diff --git a/.gitmodules b/.gitmodules
 index 8e4217162b..bb63b7d9c5 100644
 --- .gitmodules
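On PATCH 3/4: each -gencode=arch=compute_XX,code=sm_XX flag embeds device code for one GPU generation, and a CUDA binary built without a matching sm target cannot load kernels on that GPU (or must JIT from PTX, if any was embedded). The Jetson-oriented list being patched covers sm_53/62/72/87 but not Turing, and the T4 is compute capability 7.5, hence the added compute_75,code=sm_75 line. A small standalone check using the standard CUDA runtime API confirms what capability a machine's GPUs report:

    #include <cstdio>
    #include <cuda_runtime.h>

    // Print each visible GPU's compute capability; a T4 reports
    // major=7, minor=5, i.e. the sm_75 target added by the patch above.
    int main() {
        int count = 0;
        cudaGetDeviceCount(&count);
        for(int i = 0; i < count; i++) {
            cudaDeviceProp prop;
            cudaGetDeviceProperties(&prop, i);
            std::printf("device %d: %s, compute capability %d.%d\n",
                        i, prop.name, prop.major, prop.minor);
        }
        return 0;
    }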
From 06b64d8879a4a797f21974f5c11709511cc39e6b Mon Sep 17 00:00:00 2001
From: Kishore Nallan
Date: Wed, 2 Aug 2023 13:58:50 +0530
Subject: [PATCH 4/4] Deal with repeated facet values in arrays.

---
 src/index.cpp                     |  5 +++-
 test/collection_faceting_test.cpp | 74 ++++++++++++++++++++++++++++++-
 2 files changed, 77 insertions(+), 2 deletions(-)

diff --git a/src/index.cpp b/src/index.cpp
index 7192faba..6a1c99bf 100644
--- a/src/index.cpp
+++ b/src/index.cpp
@@ -1056,6 +1056,8 @@ void Index::tokenize_string_array_with_facets(const std::vector<std::string>& st
                                               std::unordered_map<std::string, std::vector<uint32_t>>& token_to_offsets,
                                               std::vector<uint64_t>& facet_hashes) {
+    std::set<uint64_t> facet_hash_set;  // required to deal with repeating phrases
+
     for(size_t array_index = 0; array_index < strings.size(); array_index++) {
         const std::string& str = strings[array_index];
         std::set<std::string> token_set;  // required to deal with repeating tokens
@@ -1089,8 +1091,9 @@
             }
         }
 
-        if(is_facet) {
+        if(is_facet && facet_hash_set.count(facet_hash) == 0) {
             facet_hashes.push_back(facet_hash);
+            facet_hash_set.insert(facet_hash);
         }
 
         if(token_set.empty()) {
diff --git a/test/collection_faceting_test.cpp b/test/collection_faceting_test.cpp
index ee9b798f..fec66c48 100644
--- a/test/collection_faceting_test.cpp
+++ b/test/collection_faceting_test.cpp
@@ -994,7 +994,7 @@ TEST_F(CollectionFacetingTest, FacetArrayValuesShouldBeNormalized) {
     ASSERT_EQ(1, results["facet_counts"][0]["counts"].size());
 
     // any document is chosen as representative
-    ASSERT_EQ("bu-qu", results["facet_counts"][0]["counts"][0]["value"].get<std::string>());
+    ASSERT_EQ("BUQU", results["facet_counts"][0]["counts"][0]["value"].get<std::string>());
 
     collectionManager.drop_collection("coll1");
 }
@@ -1057,6 +1057,78 @@ TEST_F(CollectionFacetingTest, FacetByNestedIntField) {
     ASSERT_EQ("companyRank", wildcard_facets[1].field_name);
 }
 
+TEST_F(CollectionFacetingTest, FacetByNestedArrayField) {
+    nlohmann::json schema = R"({
+        "name": "coll1",
+        "enable_nested_fields": true,
+        "fields": [
+            {"name": "data", "type": "object", "optional": false, "facet": true }
+        ]
+    })"_json;
+
+    auto op = collectionManager.create_collection(schema);
+    ASSERT_TRUE(op.ok());
+    Collection* coll1 = op.get();
+
+    auto doc1 = R"({
+        "data": {"details": [{"name": "Foo"}, {"name": "Foo"}]}
+    })"_json;
+
+    auto doc2 = R"({
+        "data": {"details": [{"name": "Foo"}, {"name": "Foo"}]}
+    })"_json;
+
+    ASSERT_TRUE(coll1->add(doc1.dump(), CREATE).ok());
+    ASSERT_TRUE(coll1->add(doc2.dump(), CREATE).ok());
+
+    auto results = coll1->search("*", {}, "", {"data.details.name"}, {}, {0}, 10, 1,
+                                 token_ordering::FREQUENCY, {true}, 10, spp::sparse_hash_set<std::string>(),
+                                 spp::sparse_hash_set<std::string>(), 10, "", 30, 4).get();
+
+    ASSERT_EQ(2, results["found"].get<size_t>());
+    ASSERT_EQ(1, results["facet_counts"].size());
+    ASSERT_EQ("data.details.name", results["facet_counts"][0]["field_name"]);
+    ASSERT_EQ(1, results["facet_counts"][0]["counts"].size());
+    ASSERT_EQ(2, results["facet_counts"][0]["counts"][0]["count"].get<size_t>());
+    ASSERT_EQ("Foo", results["facet_counts"][0]["counts"][0]["value"].get<std::string>());
+}
+
+TEST_F(CollectionFacetingTest, FacetByArrayField) {
+    nlohmann::json schema = R"({
+        "name": "coll1",
+        "enable_nested_fields": true,
+        "fields": [
+            {"name": "data", "type": "string[]", "optional": false, "facet": true }
+        ]
+    })"_json;
+
+    auto op = collectionManager.create_collection(schema);
+    ASSERT_TRUE(op.ok());
+    Collection* coll1 = op.get();
+
+    auto doc1 = R"({
+        "data": ["Foo", "Foo"]
+    })"_json;
+
+    auto doc2 = R"({
+        "data": ["Foo", "Foo"]
+    })"_json;
+
+    ASSERT_TRUE(coll1->add(doc1.dump(), CREATE).ok());
+    ASSERT_TRUE(coll1->add(doc2.dump(), CREATE).ok());
+
+    auto results = coll1->search("*", {}, "", {"data"}, {}, {0}, 10, 1,
+                                 token_ordering::FREQUENCY, {true}, 10, spp::sparse_hash_set<std::string>(),
+                                 spp::sparse_hash_set<std::string>(), 10, "", 30, 4).get();
+
+    ASSERT_EQ(2, results["found"].get<size_t>());
+    ASSERT_EQ(1, results["facet_counts"].size());
+    ASSERT_EQ("data", results["facet_counts"][0]["field_name"]);
+    ASSERT_EQ(1, results["facet_counts"][0]["counts"].size());
+    ASSERT_EQ(2, results["facet_counts"][0]["counts"][0]["count"].get<size_t>());
+    ASSERT_EQ("Foo", results["facet_counts"][0]["counts"][0]["value"].get<std::string>());
+}
+
 TEST_F(CollectionFacetingTest, FacetParseTest){
     std::vector<field> fields = {
         field("score", field_types::INT32, true),
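On PATCH 4/4: before this change, an array like ["Foo", "Foo"] pushed the same facet hash once per repetition, so a single document could inflate the facet count for its own repeated value; the new tests pin the expected count at 2, i.e. one per document. The index.cpp hunk fixes this with an order-preserving de-duplication: the output vector keeps first-occurrence order while a std::set guards against repeats. The same pattern in isolation (function name and element type hypothetical):

    #include <cstdint>
    #include <set>
    #include <vector>

    // Order-preserving de-duplication, as in the index.cpp hunk above:
    // the vector retains the first occurrence of each hash in input order,
    // while the set provides the membership check that skips repeats.
    std::vector<uint64_t> dedupe_hashes(const std::vector<uint64_t>& hashes) {
        std::set<uint64_t> seen;
        std::vector<uint64_t> out;
        for(uint64_t h : hashes) {
            if(seen.count(h) == 0) {
                out.push_back(h);
                seen.insert(h);
            }
        }
        return out;
    }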