From d2798ac00b00956d3bcf5366a10f6970e6bc7b33 Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Fri, 19 Nov 2021 19:32:00 +0530 Subject: [PATCH 01/20] Log times for operations. --- src/art.cpp | 2 ++ src/index.cpp | 13 ++++++++++--- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/src/art.cpp b/src/art.cpp index 40017d0c..311e1993 100644 --- a/src/art.cpp +++ b/src/art.cpp @@ -1024,6 +1024,8 @@ int art_topk_iter(const art_node *root, token_ordering token_order, size_t max_r } } + LOG(INFO) << "leaf results.size: " << results.size(); + printf("OUTSIDE art_topk_iter: results size: %d\n", results.size()); return 0; } diff --git a/src/index.cpp b/src/index.cpp index d77bf416..40580704 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -1212,7 +1212,7 @@ void Index::search_candidates(const uint8_t & field_id, bool field_is_array, void Index::do_filtering(uint32_t*& filter_ids, uint32_t& filter_ids_length, const std::vector& filters, const bool enable_short_circuit) const { - //auto begin = std::chrono::high_resolution_clock::now(); + auto begin = std::chrono::high_resolution_clock::now(); for(size_t i = 0; i < filters.size(); i++) { const filter & a_filter = filters[i]; @@ -1552,10 +1552,10 @@ void Index::do_filtering(uint32_t*& filter_ids, uint32_t& filter_ids_length, } } - /*long long int timeMillis = + long long int timeMillis = std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - begin).count(); - LOG(INFO) << "Time taken for filtering: " << timeMillis << "ms";*/ + LOG(INFO) << "Time taken for filtering: " << timeMillis << "ms"; } @@ -2881,11 +2881,18 @@ void Index::search_field(const uint8_t & field_id, // prefix should apply only for last token const size_t token_len = prefix_search ? (int) token.length() : (int) token.length() + 1; + auto begin = std::chrono::high_resolution_clock::now(); + // need less candidates for filtered searches since we already only pick tokens with results art_fuzzy_search(search_index.at(field_name), (const unsigned char *) token.c_str(), token_len, costs[token_index], costs[token_index], num_fuzzy_candidates, token_order, prefix_search, filter_ids, filter_ids_length, leaves, unique_tokens); + auto timeMillis = std::chrono::duration_cast( + std::chrono::high_resolution_clock::now() - begin).count(); + + LOG(INFO) << "Time taken for fuzzy search: " << timeMillis << "ms"; + if(!leaves.empty()) { token_cost_cache.emplace(token_cost_hash, leaves); for(auto leaf: leaves) { From 4cb6d7926a411025eaca45cbef1869cd97508179 Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Fri, 19 Nov 2021 22:19:25 +0530 Subject: [PATCH 02/20] Log multi-field aggregation time. 
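Both this commit and the previous one bracket hot sections with the same wall-clock timing pattern. A minimal standalone sketch of that pattern (illustration only, not part of the diff):

    #include <chrono>
    #include <iostream>

    int main() {
        auto begin = std::chrono::high_resolution_clock::now();
        // ... section under measurement ...
        long long time_millis = std::chrono::duration_cast<std::chrono::milliseconds>(
            std::chrono::high_resolution_clock::now() - begin).count();
        std::cout << "Time taken: " << time_millis << "ms" << std::endl;
    }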
--- src/index.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/index.cpp b/src/index.cpp index 40580704..3fc4bc1e 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -2175,6 +2175,8 @@ void Index::search(std::vector& field_query_tokens, } } + auto begin0 = std::chrono::high_resolution_clock::now(); + for(auto& seq_id_kvs: topster_ids) { const uint64_t seq_id = seq_id_kvs.first; auto& kvs = seq_id_kvs.second; // each `kv` can be from a different field @@ -2358,6 +2360,11 @@ void Index::search(std::vector& field_query_tokens, kvs[0]->scores[kvs[0]->match_score_index] = aggregated_score; topster->add(kvs[0]); } + + auto timeMillis0 = std::chrono::duration_cast( + std::chrono::high_resolution_clock::now() - begin0).count(); + + LOG(INFO) << "Time taken for multi-field aggregation: " << timeMillis0 << "ms"; } //LOG(INFO) << "topster size: " << topster->size; From 0626ca8cf661f0975526543c24d34e2dded43ebd Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Sat, 20 Nov 2021 11:59:46 +0530 Subject: [PATCH 03/20] Speed up search inside multi-field aggregation. --- src/index.cpp | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/src/index.cpp b/src/index.cpp index 3fc4bc1e..418dc98b 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -2266,18 +2266,14 @@ void Index::search(std::vector& field_query_tokens, for(size_t token_index=0; token_index < field_query_tokens[i].q_include_tokens.size(); token_index++) { const auto& token = field_query_tokens[i].q_include_tokens[token_index]; + const art_leaf* leaf = (art_leaf *) art_search(search_index.at(field), (const unsigned char*) token.c_str(), + token.length()+1); - std::vector leaves; - const bool prefix_search = field_prefix && (token_index == field_query_tokens[i].q_include_tokens.size()-1); - const size_t token_len = prefix_search ? (int) token.length() : (int) token.length() + 1; - art_fuzzy_search(search_index.at(field), (const unsigned char *) token.c_str(), token_len, - 0, 0, 1, token_order, prefix_search, nullptr, 0, leaves); - - if(leaves.empty()) { + if(!leaf) { continue; } - if(!posting_t::contains(leaves[0]->values, seq_id)) { + if(!posting_t::contains(leaf->values, seq_id)) { continue; } From ef47f54d111339c9aceb430035aa4844fbee034c Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Sun, 21 Nov 2021 12:51:21 +0530 Subject: [PATCH 04/20] Improve fuzzy search filtering perf. 
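The heart of this change is in compact_posting_list_t::contains_atleast_one(): the sorted filter-id array is now probed with std::lower_bound instead of being advanced one element at a time. A simplified standalone sketch of that intersection test (illustration only; the real function also walks the interleaved id/offset layout of the compact posting list):

    #include <algorithm>
    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Both inputs are sorted ascending. Returns true if they share at least one id.
    bool contains_atleast_one(const uint32_t* target_ids, size_t target_ids_size,
                              const std::vector<uint32_t>& posting_ids) {
        size_t target_index = 0;
        for(uint32_t existing_id : posting_ids) {
            size_t found_index = std::lower_bound(target_ids + target_index,
                                                  target_ids + target_ids_size,
                                                  existing_id) - target_ids;
            if(found_index == target_ids_size) {
                // every remaining target id is smaller than existing_id (and all
                // later posting ids are even larger), so no match is possible
                return false;
            }
            if(target_ids[found_index] == existing_id) {
                return true;
            }
            target_index = found_index; // resume the next probe from here
        }
        return false;
    }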
--- src/art.cpp | 10 +++++++++- src/posting.cpp | 24 +++++++++++++++--------- test/posting_list_test.cpp | 6 ++++++ 3 files changed, 30 insertions(+), 10 deletions(-) diff --git a/src/art.cpp b/src/art.cpp index 311e1993..bb373813 100644 --- a/src/art.cpp +++ b/src/art.cpp @@ -950,6 +950,8 @@ int art_topk_iter(const art_node *root, token_ordering token_order, size_t max_r q.push(root); + size_t num_large_lists = 0; + while(!q.empty() && results.size() < max_results*4) { art_node *n = (art_node *) q.top(); q.pop(); @@ -974,6 +976,10 @@ int art_topk_iter(const art_node *root, token_ordering token_order, size_t max_r results.push_back(l); } else { // we will push leaf only if filter matches with leaf IDs + if(!IS_COMPACT_POSTING(l->values)) { + num_large_lists++; + } + bool found_atleast_one = posting_t::contains_atleast_one(l->values, filter_ids, filter_ids_length); if(found_atleast_one) { results.push_back(l); @@ -1024,7 +1030,9 @@ int art_topk_iter(const art_node *root, token_ordering token_order, size_t max_r } } - LOG(INFO) << "leaf results.size: " << results.size(); + LOG(INFO) << "leaf results.size: " << results.size() + << ", filter_ids_length: " << filter_ids_length + << ", num_large_lists: " << num_large_lists; printf("OUTSIDE art_topk_iter: results size: %d\n", results.size()); return 0; diff --git a/src/posting.cpp b/src/posting.cpp index 4ec4ab00..14484af0 100644 --- a/src/posting.cpp +++ b/src/posting.cpp @@ -221,17 +221,23 @@ bool compact_posting_list_t::contains_atleast_one(const uint32_t* target_ids, si size_t num_existing_offsets = id_offsets[i]; size_t existing_id = id_offsets[i + num_existing_offsets + 1]; - if(existing_id == target_ids[target_ids_index]) { - return true; + // Returns iterator to the first element that is >= to value or last if no such element is found. + size_t found_index = std::lower_bound(target_ids + target_ids_index, + target_ids + target_ids_size, existing_id) - target_ids; + + if(found_index == target_ids_size) { + // all elements are lesser than lowest value (existing_id), so we can stop looking + return false; + } else { + if(target_ids[found_index] == existing_id) { + return true; + } + + // adjust lower bound to found_index+1 whose value is >= `existing_id` + target_ids_index = found_index; } - if(target_ids[target_ids_index] < existing_id) { - while(target_ids_index < target_ids_size && target_ids[target_ids_index] < existing_id) { - target_ids_index++; - } - } else { - i += num_existing_offsets + 2; - } + i += num_existing_offsets + 2; } return false; diff --git a/test/posting_list_test.cpp b/test/posting_list_test.cpp index d28db670..09c567ef 100644 --- a/test/posting_list_test.cpp +++ b/test/posting_list_test.cpp @@ -1325,6 +1325,12 @@ TEST_F(PostingListTest, CompactPostingListContainsAtleastOne) { ASSERT_TRUE(COMPACT_POSTING_PTR(obj)->contains_atleast_one(&target_ids3[0], target_ids3.size())); ASSERT_FALSE(COMPACT_POSTING_PTR(obj)->contains_atleast_one(&target_ids4[0], target_ids4.size())); + std::vector target_ids5 = {2, 3}; + ASSERT_TRUE(COMPACT_POSTING_PTR(obj)->contains_atleast_one(&target_ids5[0], target_ids5.size())); + + std::vector target_ids6 = {0, 1, 2}; + ASSERT_FALSE(COMPACT_POSTING_PTR(obj)->contains_atleast_one(&target_ids6[0], target_ids6.size())); + posting_t::destroy_list(obj); } From 55ad455804efc21ae48622278009c3a04b8fa44e Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Mon, 22 Nov 2021 12:29:11 +0530 Subject: [PATCH 05/20] Improve intersected filtering perf. 
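For context, take_id() decides whether a candidate document id survives the exclusion and filter lists, both of which are sorted id arrays. This commit replaces the stateful linear cursors with std::binary_search; a self-contained simplification of the new check (illustration only, raw arrays instead of result_iter_state_t):

    #include <algorithm>
    #include <cstddef>
    #include <cstdint>

    bool take_id(const uint32_t* excluded_ids, size_t excluded_size,
                 const uint32_t* filter_ids, size_t filter_size, uint32_t id) {
        if(excluded_size != 0 &&
           std::binary_search(excluded_ids, excluded_ids + excluded_size, id)) {
            return false; // id is explicitly excluded
        }
        if(filter_size != 0) {
            // id must also be present in the filter results
            return std::binary_search(filter_ids, filter_ids + filter_size, id);
        }
        return true; // no filter in effect
    }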
--- src/posting_list.cpp | 27 ++++----------------------- 1 file changed, 4 insertions(+), 23 deletions(-) diff --git a/src/posting_list.cpp b/src/posting_list.cpp index 0fcb771d..823dd028 100644 --- a/src/posting_list.cpp +++ b/src/posting_list.cpp @@ -666,37 +666,18 @@ void posting_list_t::intersect(const std::vector& posting_lists bool posting_list_t::take_id(result_iter_state_t& istate, uint32_t id) { // decide if this result id should be excluded if(istate.excluded_result_ids_size != 0) { - while(istate.excluded_result_ids_index < istate.excluded_result_ids_size && - istate.excluded_result_ids[istate.excluded_result_ids_index] < id) { - istate.excluded_result_ids_index++; - } - - if(istate.excluded_result_ids_index < istate.excluded_result_ids_size && - id == istate.excluded_result_ids[istate.excluded_result_ids_index]) { - istate.excluded_result_ids_index++; + if (std::binary_search(istate.excluded_result_ids, + istate.excluded_result_ids + istate.excluded_result_ids_size, id)) { return false; } } - bool id_found_in_filter = true; - // decide if this result be matched with filter results if(istate.filter_ids_length != 0) { - id_found_in_filter = false; - - // e.g. [1, 3] vs [2, 3] - - while(istate.filter_ids_index < istate.filter_ids_length && istate.filter_ids[istate.filter_ids_index] < id) { - istate.filter_ids_index++; - } - - if(istate.filter_ids_index < istate.filter_ids_length && istate.filter_ids[istate.filter_ids_index] == id) { - istate.filter_ids_index++; - id_found_in_filter = true; - } + return std::binary_search(istate.filter_ids, istate.filter_ids + istate.filter_ids_length, id); } - return id_found_in_filter; + return true; } bool posting_list_t::get_offsets(const std::vector& its, From 97b1981fcdda9540a6f7e017843774537c0fb9f1 Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Mon, 22 Nov 2021 15:18:37 +0530 Subject: [PATCH 06/20] Buffer IDs during exhaustive search. 
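During exhaustive search many small batches of candidate ids arrive, and OR-merging every batch into the growing result set individually is wasteful. The pattern introduced here collects ids into a buffer and folds them in with a single union once the buffer grows large. A sketch of that flush step (illustration only; the actual code merges raw arrays via ArrayUtils::or_scalar, shown here with std::set_union instead):

    #include <algorithm>
    #include <cstdint>
    #include <iterator>
    #include <vector>

    // all_result_ids is kept sorted and unique; id_buff holds unsorted candidates.
    void flush_id_buffer(std::vector<uint32_t>& id_buff, std::vector<uint32_t>& all_result_ids) {
        std::sort(id_buff.begin(), id_buff.end());
        id_buff.erase(std::unique(id_buff.begin(), id_buff.end()), id_buff.end());

        std::vector<uint32_t> merged;
        merged.reserve(all_result_ids.size() + id_buff.size());
        std::set_union(all_result_ids.begin(), all_result_ids.end(),
                       id_buff.begin(), id_buff.end(), std::back_inserter(merged));
        all_result_ids.swap(merged);
        id_buff.clear();
    }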
--- include/index.h | 3 ++- include/posting.h | 8 ++++--- src/index.cpp | 46 ++++++++++++++++++++++++++++++++++------ test/collection_test.cpp | 2 +- 4 files changed, 47 insertions(+), 12 deletions(-) diff --git a/include/index.h b/include/index.h index 9c8d0fc4..1309abea 100644 --- a/include/index.h +++ b/include/index.h @@ -510,7 +510,8 @@ private: bool prioritize_exact_match, bool exhaustive_search, size_t concurrency, - std::set& query_hashes) const; + std::set& query_hashes, + std::vector& id_buff) const; void do_filtering(uint32_t*& filter_ids, uint32_t& filter_ids_length, const std::vector& filters, const bool enable_short_circuit) const; diff --git a/include/posting.h b/include/posting.h index c5c3ef66..e8ad4be3 100644 --- a/include/posting.h +++ b/include/posting.h @@ -65,9 +65,11 @@ public: to_expanded_plists(raw_posting_lists, plists, expanded_plists); - std::sort(this->plists.begin(), this->plists.end(), [](posting_list_t* a, posting_list_t* b) { - return a->num_blocks() < b->num_blocks(); - }); + if(plists.size() > 1) { + std::sort(this->plists.begin(), this->plists.end(), [](posting_list_t* a, posting_list_t* b) { + return a->num_blocks() < b->num_blocks(); + }); + } } ~block_intersector_t() { diff --git a/src/index.cpp b/src/index.cpp index 418dc98b..1a677731 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -1075,7 +1075,8 @@ void Index::search_candidates(const uint8_t & field_id, bool field_is_array, bool prioritize_exact_match, const bool exhaustive_search, const size_t concurrency, - std::set& query_hashes) const { + std::set& query_hashes, + std::vector& id_buff) const { auto product = []( long long a, token_candidates & b ) { return a*b.candidates.size(); }; long long int N = std::accumulate(token_candidates_vec.begin(), token_candidates_vec.end(), 1LL, product); @@ -1180,11 +1181,15 @@ void Index::search_candidates(const uint8_t & field_id, bool field_is_array, for(size_t i = 0; i < concurrency; i++) { // empty vec can happen if not all threads produce results if (!result_id_vecs[i].empty()) { - uint32_t* new_all_result_ids = nullptr; - all_result_ids_len = ArrayUtils::or_scalar(*all_result_ids, all_result_ids_len, &result_id_vecs[i][0], - result_id_vecs[i].size(), &new_all_result_ids); - delete[] *all_result_ids; - *all_result_ids = new_all_result_ids; + if(exhaustive_search) { + id_buff.insert(id_buff.end(), result_id_vecs[i].begin(), result_id_vecs[i].end()); + } else { + uint32_t* new_all_result_ids = nullptr; + all_result_ids_len = ArrayUtils::or_scalar(*all_result_ids, all_result_ids_len, &result_id_vecs[i][0], + result_id_vecs[i].size(), &new_all_result_ids); + delete[] *all_result_ids; + *all_result_ids = new_all_result_ids; + } num_result_ids += result_id_vecs[i].size(); @@ -1200,6 +1205,20 @@ void Index::search_candidates(const uint8_t & field_id, bool field_is_array, } } + if(id_buff.size() > 100000) { + // prevents too many ORs during exhaustive searching + std::sort(id_buff.begin(), id_buff.end()); + id_buff.erase(std::unique( id_buff.begin(), id_buff.end() ), id_buff.end()); + + uint32_t* new_all_result_ids = nullptr; + all_result_ids_len = ArrayUtils::or_scalar(*all_result_ids, all_result_ids_len, &id_buff[0], + id_buff.size(), &new_all_result_ids); + delete[] *all_result_ids; + *all_result_ids = new_all_result_ids; + num_result_ids += id_buff.size(); + id_buff.clear(); + } + if(num_result_ids == 0) { continue; } @@ -2940,13 +2959,26 @@ void Index::search_field(const uint8_t & field_id, } if(!token_candidates_vec.empty()) { + std::vector id_buff; + // 
If atleast one token is found, go ahead and search for candidates search_candidates(field_id, the_field.is_array(), filter_ids, filter_ids_length, exclude_token_ids, exclude_token_ids_size, curated_ids, sort_fields, token_candidates_vec, searched_queries, topster, groups_processed, all_result_ids, all_result_ids_len, field_num_results, typo_tokens_threshold, group_limit, group_by_fields, query_tokens, - prioritize_exact_match, combination_limit, concurrency, query_hashes); + prioritize_exact_match, combination_limit, concurrency, query_hashes, id_buff); + + if(id_buff.size() > 1) { + std::sort(id_buff.begin(), id_buff.end()); + id_buff.erase(std::unique( id_buff.begin(), id_buff.end() ), id_buff.end()); + } + + uint32_t* new_all_result_ids = nullptr; + all_result_ids_len = ArrayUtils::or_scalar(*all_result_ids, all_result_ids_len, &id_buff[0], + id_buff.size(), &new_all_result_ids); + delete[] *all_result_ids; + *all_result_ids = new_all_result_ids; } resume_typo_loop: diff --git a/test/collection_test.cpp b/test/collection_test.cpp index 63ceba4f..09229ed0 100644 --- a/test/collection_test.cpp +++ b/test/collection_test.cpp @@ -1939,7 +1939,7 @@ TEST_F(CollectionTest, DeletionOfDocumentArrayFields) { token_ordering::FREQUENCY, {true}, 10, spp::sparse_hash_set(), spp::sparse_hash_set(), 10).get(); - ASSERT_EQ(1, res["found"]); + ASSERT_EQ(1, res["found"].get()); Option rem_op = coll1->remove("100"); From 72b40252325ed50542bcf2a880c80d93fe33a479 Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Tue, 23 Nov 2021 08:56:29 +0530 Subject: [PATCH 07/20] Remove stray logs. --- src/art.cpp | 4 ++-- src/index.cpp | 18 +++++++++--------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/src/art.cpp b/src/art.cpp index bb373813..36ff086d 100644 --- a/src/art.cpp +++ b/src/art.cpp @@ -1030,9 +1030,9 @@ int art_topk_iter(const art_node *root, token_ordering token_order, size_t max_r } } - LOG(INFO) << "leaf results.size: " << results.size() + /*LOG(INFO) << "leaf results.size: " << results.size() << ", filter_ids_length: " << filter_ids_length - << ", num_large_lists: " << num_large_lists; + << ", num_large_lists: " << num_large_lists;*/ printf("OUTSIDE art_topk_iter: results size: %d\n", results.size()); return 0; diff --git a/src/index.cpp b/src/index.cpp index 1a677731..03faac11 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -1231,7 +1231,7 @@ void Index::search_candidates(const uint8_t & field_id, bool field_is_array, void Index::do_filtering(uint32_t*& filter_ids, uint32_t& filter_ids_length, const std::vector& filters, const bool enable_short_circuit) const { - auto begin = std::chrono::high_resolution_clock::now(); + //auto begin = std::chrono::high_resolution_clock::now(); for(size_t i = 0; i < filters.size(); i++) { const filter & a_filter = filters[i]; @@ -1571,10 +1571,10 @@ void Index::do_filtering(uint32_t*& filter_ids, uint32_t& filter_ids_length, } } - long long int timeMillis = + /*long long int timeMillis = std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - begin).count(); - LOG(INFO) << "Time taken for filtering: " << timeMillis << "ms"; + LOG(INFO) << "Time taken for filtering: " << timeMillis << "ms";*/ } @@ -2194,7 +2194,7 @@ void Index::search(std::vector& field_query_tokens, } } - auto begin0 = std::chrono::high_resolution_clock::now(); + //auto begin0 = std::chrono::high_resolution_clock::now(); for(auto& seq_id_kvs: topster_ids) { const uint64_t seq_id = seq_id_kvs.first; @@ -2376,10 +2376,10 @@ void Index::search(std::vector& 
field_query_tokens, topster->add(kvs[0]); } - auto timeMillis0 = std::chrono::duration_cast( + /*auto timeMillis0 = std::chrono::duration_cast( std::chrono::high_resolution_clock::now() - begin0).count(); - LOG(INFO) << "Time taken for multi-field aggregation: " << timeMillis0 << "ms"; + LOG(INFO) << "Time taken for multi-field aggregation: " << timeMillis0 << "ms";*/ } //LOG(INFO) << "topster size: " << topster->size; @@ -2903,17 +2903,17 @@ void Index::search_field(const uint8_t & field_id, // prefix should apply only for last token const size_t token_len = prefix_search ? (int) token.length() : (int) token.length() + 1; - auto begin = std::chrono::high_resolution_clock::now(); + //auto begin = std::chrono::high_resolution_clock::now(); // need less candidates for filtered searches since we already only pick tokens with results art_fuzzy_search(search_index.at(field_name), (const unsigned char *) token.c_str(), token_len, costs[token_index], costs[token_index], num_fuzzy_candidates, token_order, prefix_search, filter_ids, filter_ids_length, leaves, unique_tokens); - auto timeMillis = std::chrono::duration_cast( + /*auto timeMillis = std::chrono::duration_cast( std::chrono::high_resolution_clock::now() - begin).count(); - LOG(INFO) << "Time taken for fuzzy search: " << timeMillis << "ms"; + LOG(INFO) << "Time taken for fuzzy search: " << timeMillis << "ms";*/ if(!leaves.empty()) { token_cost_cache.emplace(token_cost_hash, leaves); From f45416be0c20eb075712cfa4ccb946d9a7ff58f8 Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Tue, 23 Nov 2021 18:12:38 +0530 Subject: [PATCH 08/20] Only last token should prefix search during drop tokens. --- include/index.h | 5 ++++ src/index.cpp | 24 ++++++++++------- test/collection_specific_test.cpp | 44 ++++++++++++++++++++++++++++--- 3 files changed, 59 insertions(+), 14 deletions(-) diff --git a/include/index.h b/include/index.h index 1309abea..effe2bc8 100644 --- a/include/index.h +++ b/include/index.h @@ -29,6 +29,11 @@ using array_mapped_facet_t = std::array; struct token_t { size_t position; std::string value; + bool prefix; + + token_t(size_t position, const std::string& value, bool prefix): position(position), value(value), prefix(prefix) { + + } }; struct token_candidates { diff --git a/src/index.cpp b/src/index.cpp index 03faac11..19885cba 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -1923,7 +1923,8 @@ bool Index::check_for_overrides(const token_ordering& token_order, const string& std::vector window_tokens; std::set window_tokens_set; for (size_t i = start_index; i < start_index + window_len; i++) { - window_tokens.push_back({i, tokens[i]}); + bool is_prefix = (i == (start_index + window_len - 1)); + window_tokens.emplace_back(i, tokens[i], is_prefix); window_tokens_set.emplace(tokens[i]); } @@ -2100,14 +2101,16 @@ void Index::search(std::vector& field_query_tokens, for(size_t i = 0; i < num_search_fields; i++) { std::vector q_include_pos_tokens; for(size_t j=0; j < field_query_tokens[i].q_include_tokens.size(); j++) { - q_include_pos_tokens.push_back({j, field_query_tokens[i].q_include_tokens[j]}); + bool is_prefix = (j == field_query_tokens[i].q_include_tokens.size()-1); + q_include_pos_tokens.emplace_back(j, field_query_tokens[i].q_include_tokens[j], is_prefix); } std::vector> q_pos_synonyms; for(const auto& q_syn_vec: field_query_tokens[i].q_synonyms) { std::vector q_pos_syn; for(size_t j=0; j < q_syn_vec.size(); j++) { - q_pos_syn.push_back({j, q_syn_vec[j]}); + bool is_prefix = (j == q_syn_vec.size()-1); + 
q_pos_syn.emplace_back(j, q_syn_vec[j], is_prefix); } q_pos_synonyms.emplace_back(q_pos_syn); } @@ -2543,8 +2546,9 @@ void Index::compute_facet_infos(const std::vector& facets, facet_query_t& std::vector search_tokens, qtokens; for (size_t qtoken_index = 0; qtoken_index < query_tokens.size(); qtoken_index++) { - search_tokens.emplace_back(token_t{qtoken_index, query_tokens[qtoken_index]}); - qtokens.emplace_back(token_t{qtoken_index, query_tokens[qtoken_index]}); + bool is_prefix = (qtoken_index == query_tokens.size()-1); + search_tokens.emplace_back(qtoken_index, query_tokens[qtoken_index], is_prefix); + qtokens.emplace_back(qtoken_index, query_tokens[qtoken_index], is_prefix); } std::vector> searched_queries; @@ -2893,14 +2897,14 @@ void Index::search_field(const uint8_t & field_id, const std::string token_cost_hash = token + std::to_string(costs[token_index]); std::vector leaves; - //LOG(INFO) << "Searching for field: " << field << ", token:" << token << " - cost: " << costs[token_index]; + const bool prefix_search = prefix && search_tokens[token_index].prefix; - const bool prefix_search = prefix && (token_index == search_tokens.size()-1); + /*LOG(INFO) << "Searching for field: " << the_field.name << ", token:" + << token << " - cost: " << costs[token_index] << ", prefix_search: " << prefix_search;*/ if(token_cost_cache.count(token_cost_hash) != 0) { leaves = token_cost_cache[token_cost_hash]; } else { - // prefix should apply only for last token const size_t token_len = prefix_search ? (int) token.length() : (int) token.length() + 1; //auto begin = std::chrono::high_resolution_clock::now(); @@ -3008,13 +3012,13 @@ void Index::search_field(const uint8_t & field_id, // drop from right size_t end_index = (query_tokens.size() - 1) - num_tokens_dropped; for(size_t i=0; i <= end_index; i++) { - truncated_tokens.push_back({query_tokens[i].position, query_tokens[i].value}); + truncated_tokens.emplace_back(query_tokens[i].position, query_tokens[i].value, query_tokens[i].prefix); } } else { // drop from left size_t start_index = (num_tokens_dropped - mid_index); for(size_t i=start_index; i tags; - // when the first document containing a token already cannot fit compact posting list std::vector fields = {field("title", field_types::STRING, false),}; @@ -1727,8 +1725,46 @@ TEST_F(CollectionSpecificTest, HighlightOnPrefixRegression) { auto results = coll1->search("and", {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {false}).get(); ASSERT_EQ(1, results["hits"].size()); - LOG(INFO) << results; - collectionManager.drop_collection("coll1"); } +TEST_F(CollectionSpecificTest, DroppedTokensShouldNotBeUsedForPrefixSearch) { + std::vector fields = {field("title", field_types::STRING, false), + field("points", field_types::INT32, false),}; + + Collection* coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get(); + + nlohmann::json doc1; + doc1["id"] = "0"; + doc1["title"] = "Dog Shoemaker"; + doc1["points"] = 100; + + nlohmann::json doc2; + doc2["id"] = "1"; + doc2["title"] = "Shoe and Sock"; + doc2["points"] = 200; + + ASSERT_TRUE(coll1->add(doc1.dump()).ok()); + ASSERT_TRUE(coll1->add(doc2.dump()).ok()); + + auto results = coll1->search("shoe cat", {"title"}, + "", {}, {}, {2}, 10, + 1, FREQUENCY, {true}, + 10, spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30, 4, "title", 20, {}, {}, {}, 0, + "", "", {}, 1000, true).get(); + + ASSERT_EQ(1, results["hits"].size()); + ASSERT_EQ("1", results["hits"][0]["document"]["id"].get()); + + results = coll1->search("cat shoe", 
{"title"}, + "", {}, {}, {2}, 10, + 1, FREQUENCY, {true}, + 10, spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30, 4, "title", 20, {}, {}, {}, 0, + "", "", {}, 1000, true).get(); + + ASSERT_EQ(2, results["hits"].size()); + ASSERT_EQ("1", results["hits"][0]["document"]["id"].get()); + ASSERT_EQ("0", results["hits"][1]["document"]["id"].get()); +} From d67d0d6a65d7c96cc7587f5398d1762761bb9952 Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Tue, 23 Nov 2021 18:16:08 +0530 Subject: [PATCH 09/20] [RPM] Ensure typesense service is stopped before starting. Work around for a systemd bug: https://gitlab.com/gitlab-org/ci-cd/docker-machine/-/issues/41#note_688450126 --- debian-pkg/typesense-server/DEBIAN/postinst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/debian-pkg/typesense-server/DEBIAN/postinst b/debian-pkg/typesense-server/DEBIAN/postinst index 52c70f1d..a71c14c6 100755 --- a/debian-pkg/typesense-server/DEBIAN/postinst +++ b/debian-pkg/typesense-server/DEBIAN/postinst @@ -4,6 +4,7 @@ API_KEY=$(cat /dev/urandom | tr -dc 'a-zA-Z0-9' | fold -w 48 | head -n 1) sed -i "s/\$API_KEY/$API_KEY/g" /etc/typesense/typesense-server.ini +systemctl stop typesense-server systemctl enable typesense-server systemctl daemon-reload -systemctl start typesense-server \ No newline at end of file +systemctl start typesense-server From 69345d37ced62d51cc9bbc9623897bad7600fac5 Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Wed, 24 Nov 2021 12:02:08 +0530 Subject: [PATCH 10/20] Revert "[RPM] Ensure typesense service is stopped before starting." This reverts commit 514824b76b3221998f2d35816a7878ad9c9ffd66. --- debian-pkg/typesense-server/DEBIAN/postinst | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/debian-pkg/typesense-server/DEBIAN/postinst b/debian-pkg/typesense-server/DEBIAN/postinst index a71c14c6..52c70f1d 100755 --- a/debian-pkg/typesense-server/DEBIAN/postinst +++ b/debian-pkg/typesense-server/DEBIAN/postinst @@ -4,7 +4,6 @@ API_KEY=$(cat /dev/urandom | tr -dc 'a-zA-Z0-9' | fold -w 48 | head -n 1) sed -i "s/\$API_KEY/$API_KEY/g" /etc/typesense/typesense-server.ini -systemctl stop typesense-server systemctl enable typesense-server systemctl daemon-reload -systemctl start typesense-server +systemctl start typesense-server \ No newline at end of file From a21628207ea23be2cee3780bf3b111b371673d35 Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Wed, 24 Nov 2021 20:20:21 +0530 Subject: [PATCH 11/20] Refactor grouping with sorting. 
--- include/topster.h | 73 ++++---------------- src/collection.cpp | 23 +++++-- test/collection_grouping_test.cpp | 108 +++++++++++++++++++++++++++++- 3 files changed, 137 insertions(+), 67 deletions(-) diff --git a/include/topster.h b/include/topster.h index f86a7b4b..4accd374 100644 --- a/include/topster.h +++ b/include/topster.h @@ -105,10 +105,9 @@ struct Topster { KV *data; KV** kvs; - // For distinct, stores the min heap kv of each group_kv_map topster value std::unordered_map kv_map; - std::unordered_map group_kv_map; + spp::sparse_hash_map group_kv_map; size_t distinct; explicit Topster(size_t capacity): Topster(capacity, 0) { @@ -172,64 +171,17 @@ struct Topster { bool SIFT_DOWN = true; if(distinct) { - const auto& found_it = group_kv_map.find(kv->distinct_key); - bool is_duplicate_key = (found_it != group_kv_map.end()); - - if(!is_duplicate_key && less_than_min_heap) { - // for distinct, if a non duplicate kv is < than min heap we ignore - return false; - } - - if(is_duplicate_key) { - // if min heap (group_topster.kvs[0]) changes, we have to update kvs and sift - Topster* group_topster = found_it->second; - KV old_min_heap_kv = *kv_map[kv->distinct_key]; - bool added = group_topster->add(kv); - - if(!added) { - return false; - } - - // if new kv score is greater than previous min heap score we sift down, otherwise sift up - SIFT_DOWN = is_greater(kv, &old_min_heap_kv); - - // new kv is different from old_min_heap_kv so we have to sift heap - heap_op_index = old_min_heap_kv.array_index; - - // erase current min heap key from kv_map - kv_map.erase(old_min_heap_kv.distinct_key); - + // Grouping cannot be a streaming operation, so aggregate the KVs associated with every group. + auto kvs_it = group_kv_map.find(kv->distinct_key); + if(kvs_it != group_kv_map.end()) { + kvs_it->second->add(kv); } else { - // kv is guaranteed to be > current min heap: kvs[0] - // create fresh topster for this distinct group key since it does not exist - Topster* group_topster = new Topster(distinct, 0); - group_topster->add(kv); - - // add new group key to map - group_kv_map.emplace(kv->distinct_key, group_topster); - - // find heap operation index for updating kvs - - if(size < MAX_SIZE) { - // there is enough space in heap we just copy to end - SIFT_DOWN = false; - heap_op_index = size; - size++; - } else { - SIFT_DOWN = true; - - // max size is reached so we are forced to replace current min heap element (kvs[0]) - heap_op_index = 0; - - // remove current min heap group key from maps - delete group_kv_map[kvs[heap_op_index]->distinct_key]; - group_kv_map.erase(kvs[heap_op_index]->distinct_key); - kv_map.erase(kvs[heap_op_index]->distinct_key); - } + Topster* g_topster = new Topster(distinct, 0); + g_topster->add(kv); + group_kv_map.insert({kv->distinct_key, g_topster}); } - - // kv will be copied into the pointer at heap_op_index - kv_map.emplace(kv->distinct_key, kvs[heap_op_index]); + + return true; } else { // not distinct //LOG(INFO) << "Searching for key: " << kv->key; @@ -334,9 +286,8 @@ struct Topster { // topster must be sorted before iterated upon to remove dead array entries void sort() { - std::stable_sort(kvs, kvs + size, is_greater); - for(auto &group_topster: group_kv_map) { - group_topster.second->sort(); + if(!distinct) { + std::stable_sort(kvs, kvs + size, is_greater); } } diff --git a/src/collection.cpp b/src/collection.cpp index f875bb68..f690e5f3 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -1384,12 +1384,27 @@ void Collection::parse_search_query(const std::string 
&query, std::vector> &result_kvs) { if(topster->distinct) { - for(auto &group_topster_entry: topster->group_kv_map) { - Topster* group_topster = group_topster_entry.second; - const std::vector group_kvs(group_topster->kvs, group_topster->kvs+group_topster->size); - result_kvs.emplace_back(group_kvs); + // we have to pick top-K groups + Topster gtopster(topster->MAX_SIZE); + + for(auto& group_topster: topster->group_kv_map) { + group_topster.second->sort(); + if(group_topster.second->size != 0) { + KV* kv_head = group_topster.second->getKV(0); + gtopster.add(kv_head); + } } + gtopster.sort(); + + for(size_t i = 0; i < gtopster.size; i++) { + KV* kv = gtopster.getKV(i); + const std::vector group_kvs( + topster->group_kv_map[kv->distinct_key]->kvs, + topster->group_kv_map[kv->distinct_key]->kvs+topster->group_kv_map[kv->distinct_key]->size + ); + result_kvs.emplace_back(group_kvs); + } } else { for(uint32_t t = 0; t < topster->size; t++) { KV* kv = topster->getKV(t); diff --git a/test/collection_grouping_test.cpp b/test/collection_grouping_test.cpp index 09111c88..c5321b9e 100644 --- a/test/collection_grouping_test.cpp +++ b/test/collection_grouping_test.cpp @@ -154,8 +154,8 @@ TEST_F(CollectionGroupingTest, GroupingCompoundKey) { ASSERT_EQ(10, res["found"].get()); ASSERT_EQ(10, res["grouped_hits"].size()); - ASSERT_EQ(11, res["grouped_hits"][0]["group_key"][0].get()); + ASSERT_EQ(11, res["grouped_hits"][0]["group_key"][0].get()); ASSERT_STREQ("Beta", res["grouped_hits"][0]["group_key"][1].get().c_str()); // optional field should have no value in the group key component @@ -428,4 +428,108 @@ TEST_F(CollectionGroupingTest, GroupingWithArrayFieldAndOverride) { ASSERT_EQ(1, (int) res["facet_counts"][0]["counts"][3]["count"]); ASSERT_STREQ("Zeta", res["facet_counts"][0]["counts"][3]["value"].get().c_str()); -} \ No newline at end of file +} + +TEST_F(CollectionGroupingTest, GroupOrderIndependence) { + Collection *coll1; + + std::vector fields = {field("group", field_types::STRING, true), + field("points", field_types::INT32, false),}; + + coll1 = collectionManager.get_collection("coll1").get(); + if(coll1 == nullptr) { + coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get(); + } + + nlohmann::json doc; + + for(size_t i = 0; i < 256; i++) { + int64_t points = 100 + i; + doc["id"] = std::to_string(i); + doc["group"] = std::to_string(i); + doc["points"] = points; + + ASSERT_TRUE(coll1->add(doc.dump()).ok()); + } + + // doc id "255" will have points of 255 + // try to insert doc id "256" with group "256" but having lesser points than all records + + doc["id"] = "256"; + doc["group"] = "256"; + doc["points"] = 50; + ASSERT_TRUE(coll1->add(doc.dump()).ok()); + + // insert doc id "257" of same group "256" with greatest point + + doc["id"] = "257"; + doc["group"] = "256"; + doc["points"] = 500; + ASSERT_TRUE(coll1->add(doc.dump()).ok()); + + // when we search by grouping records, sorting descending on points, both records of group "256" should show up + + std::vector sort_fields = {sort_by("points", "DESC")}; + + auto res = coll1->search("*", {}, "", {}, sort_fields, {0}, 10, 1, FREQUENCY, + {false}, Index::DROP_TOKENS_THRESHOLD, + spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30, 5, + "", 10, + {}, {}, {"group"}, 10).get(); + + ASSERT_EQ(1, res["grouped_hits"][0]["group_key"].size()); + ASSERT_STREQ("256", res["grouped_hits"][0]["group_key"][0].get().c_str()); + ASSERT_EQ(2, res["grouped_hits"][0]["hits"].size()); +} + +TEST_F(CollectionGroupingTest, 
UseHighestValueInGroupForOrdering) { + Collection *coll1; + + std::vector fields = {field("group", field_types::STRING, true), + field("points", field_types::INT32, false),}; + + coll1 = collectionManager.get_collection("coll1").get(); + if(coll1 == nullptr) { + coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get(); + } + + nlohmann::json doc; + + for(size_t i = 0; i < 250; i++) { + int64_t points = 100 + i; + doc["id"] = std::to_string(i); + doc["group"] = std::to_string(i); + doc["points"] = points; + + ASSERT_TRUE(coll1->add(doc.dump()).ok()); + } + + // points: 100 -> 349 + + // group with highest point is "249" with 349 points + // insert another document for that group with 50 points + doc["id"] = "250"; + doc["group"] = "249"; + doc["points"] = 50; + ASSERT_TRUE(coll1->add(doc.dump()).ok()); + + // now insert another new group whose points is greater than 50 + doc["id"] = "251"; + doc["group"] = "1000"; + doc["points"] = 60; + ASSERT_TRUE(coll1->add(doc.dump()).ok()); + + std::vector sort_fields = {sort_by("points", "DESC")}; + + auto res = coll1->search("*", {}, "", {}, sort_fields, {0}, 10, 1, FREQUENCY, + {false}, Index::DROP_TOKENS_THRESHOLD, + spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30, 5, + "", 10, + {}, {}, {"group"}, 10).get(); + + ASSERT_EQ(1, res["grouped_hits"][0]["group_key"].size()); + ASSERT_STREQ("249", res["grouped_hits"][0]["group_key"][0].get().c_str()); + ASSERT_EQ(2, res["grouped_hits"][0]["hits"].size()); +} From b088bad83bdf8b77da2ba5ca78f4cc7ecdeff1c0 Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Thu, 25 Nov 2021 19:05:41 +0530 Subject: [PATCH 12/20] Use gzip compression for DEB. --- debian-pkg/generate_deb_rpm.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/debian-pkg/generate_deb_rpm.sh b/debian-pkg/generate_deb_rpm.sh index 42a7b191..f5c67231 100755 --- a/debian-pkg/generate_deb_rpm.sh +++ b/debian-pkg/generate_deb_rpm.sh @@ -35,7 +35,8 @@ rm -rf /tmp/typesense-server-$TS_VERSION /tmp/typesense-server-$TS_VERSION.tar.g sed -i "s/\$VERSION/$TS_VERSION/g" `find /tmp/typesense-deb-build -maxdepth 10 -type f` -dpkg -b /tmp/typesense-deb-build/typesense-server "/tmp/typesense-deb-build/typesense-server-${TS_VERSION}-amd64.deb" +dpkg-deb -Zgzip -z6 \ + -b /tmp/typesense-deb-build/typesense-server "/tmp/typesense-deb-build/typesense-server-${TS_VERSION}-amd64.deb" # Generate RPM From 8ec653c3e6ee8f0342bfe8c4bd7ce47eaad5ba8e Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Fri, 26 Nov 2021 15:11:23 +0530 Subject: [PATCH 13/20] Check for connection status before responding. 
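A generic sketch of the guard this commit adds around streamed responses (illustration only; the struct and callback names here are hypothetical, not the actual http_server types): a chunk is dispatched only while the client connection is still alive, and after the first chunk each dispatch waits for the previous one to be flushed.

    #include <atomic>
    #include <functional>

    struct streamed_response_t {
        std::atomic<bool> is_alive{true};
        bool first_chunk_sent = false;
        void wait_for_previous_chunk() { /* block until the prior chunk is flushed */ }
    };

    void stream_chunk(streamed_response_t& res, const std::function<void()>& dispatch) {
        if(!res.is_alive) {
            return; // client went away (or this is a raft log replay): don't respond
        }
        if(res.first_chunk_sent) {
            res.wait_for_previous_chunk(); // keep chunks strictly ordered
        }
        res.first_chunk_sent = true;
        dispatch();
    }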
--- src/batched_indexer.cpp | 8 +++++--- src/core_api.cpp | 6 ++++++ src/http_server.cpp | 13 ++++++++----- 3 files changed, 19 insertions(+), 8 deletions(-) diff --git a/src/batched_indexer.cpp b/src/batched_indexer.cpp index 2fdb760c..e149487e 100644 --- a/src/batched_indexer.cpp +++ b/src/batched_indexer.cpp @@ -255,9 +255,11 @@ void BatchedIndexer::run() { const std::string& req_key_prefix = get_req_prefix_key(it->second.start_ts); store->delete_range(req_key_prefix, req_key_prefix + StringUtils::serialize_uint32_t(UINT32_MAX)); - it->second.res->final = true; - async_req_res_t* async_req_res = new async_req_res_t(it->second.req, it->second.res, true); - server->get_message_dispatcher()->send_message(HttpServer::STREAM_RESPONSE_MESSAGE, async_req_res); + if(it->second.res->is_alive) { + it->second.res->final = true; + async_req_res_t* async_req_res = new async_req_res_t(it->second.req, it->second.res, true); + server->get_message_dispatcher()->send_message(HttpServer::STREAM_RESPONSE_MESSAGE, async_req_res); + } it = req_res_map.erase(it); } else { diff --git a/src/core_api.cpp b/src/core_api.cpp index cdf67adf..ada733fc 100644 --- a/src/core_api.cpp +++ b/src/core_api.cpp @@ -34,9 +34,15 @@ bool handle_authentication(std::map& req_params, const void stream_response(const std::shared_ptr& req, const std::shared_ptr& res) { if(!res->is_alive) { + // underlying request is dead or this is a raft log playback return ; } + if(req->_req->res.status != 0) { + // not the first response chunk, so wait for previous chunk to finish + res->wait(); + } + auto req_res = new async_req_res_t(req, res, true); server->get_message_dispatcher()->send_message(HttpServer::STREAM_RESPONSE_MESSAGE, req_res); } diff --git a/src/http_server.cpp b/src/http_server.cpp index a62cd477..1a9d4a5d 100644 --- a/src/http_server.cpp +++ b/src/http_server.cpp @@ -767,6 +767,7 @@ void HttpServer::stream_response(stream_response_state_t& state) { if(state.res_body.len == 0 && state.send_state != H2O_SEND_STATE_FINAL) { // without this guard, http streaming will break + state.generator->proceed(state.generator, req); return; } @@ -908,13 +909,15 @@ bool HttpServer::on_request_proceed_message(void *data) { // This callback will run concurrently to batch indexer's run() so care must be taken to protect access // to variables that are written to by the batch indexer, which for now is only: last_chunk_aggregate (atomic) deferred_req_res_t* req_res = static_cast(data); - auto stream_state = (req_res->req->last_chunk_aggregate) ? H2O_SEND_STATE_FINAL : H2O_SEND_STATE_IN_PROGRESS; + if(req_res->res->is_alive) { + auto stream_state = (req_res->req->last_chunk_aggregate) ? H2O_SEND_STATE_FINAL : H2O_SEND_STATE_IN_PROGRESS; - size_t written = req_res->req->chunk_len; - req_res->req->chunk_len = 0; + size_t written = req_res->req->chunk_len; + req_res->req->chunk_len = 0; - if(req_res->req->_req && req_res->req->_req->proceed_req) { - req_res->req->_req->proceed_req(req_res->req->_req, written, stream_state); + if(req_res->req->_req && req_res->req->_req->proceed_req) { + req_res->req->_req->proceed_req(req_res->req->_req, written, stream_state); + } } if(req_res->destroy_after_use) { From b14458efd4874fe9e6599da6ae24876a58ec37e5 Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Fri, 26 Nov 2021 19:35:50 +0530 Subject: [PATCH 14/20] Handle updates of repeating records within the same batch. 
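The rule added in add_many(): when a document id repeats within one import batch, everything queued before it is indexed first and the repeated document starts a new sub-batch, so the later occurrence sees the earlier one as a normal update instead of being rejected. A simplified standalone sketch of that splitting rule (illustration only; index_range is a hypothetical callback standing in for batch_index):

    #include <cstddef>
    #include <functional>
    #include <set>
    #include <string>
    #include <vector>

    void import_in_sub_batches(const std::vector<std::string>& doc_ids,
                               const std::function<void(size_t, size_t)>& index_range) {
        std::set<std::string> batch_doc_ids;
        size_t batch_start = 0;

        for(size_t i = 0; i < doc_ids.size(); i++) {
            if(batch_doc_ids.count(doc_ids[i]) != 0) {
                index_range(batch_start, i); // flush docs [batch_start, i)
                batch_doc_ids.clear();
                batch_start = i;
            }
            batch_doc_ids.insert(doc_ids[i]);
        }

        if(batch_start < doc_ids.size()) {
            index_range(batch_start, doc_ids.size()); // flush the tail
        }
    }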
--- src/collection.cpp | 40 +++---- test/collection_specific_test.cpp | 182 +++++++++++++++++++++++++++++- 2 files changed, 192 insertions(+), 30 deletions(-) diff --git a/src/collection.cpp b/src/collection.cpp index f690e5f3..627454a8 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -223,22 +223,28 @@ nlohmann::json Collection::add_many(std::vector& json_lines, nlohma // NOTE: we overwrite the input json_lines with result to avoid memory pressure record.is_update = false; + bool repeated_doc = false; if(!doc_seq_id_op.ok()) { record.index_failure(doc_seq_id_op.code(), doc_seq_id_op.error()); } else { + const std::string& doc_id = record.doc["id"].get(); + repeated_doc = (batch_doc_ids.find(doc_id) != batch_doc_ids.end()); + + if(repeated_doc) { + // when a document repeats, we send the batch until this document so that we can deal with conflicts + i--; + goto do_batched_index; + } + record.is_update = !doc_seq_id_op.get().is_new; + if(record.is_update) { get_document_from_store(get_seq_id_key(seq_id), record.old_doc); - } else { - const std::string& doc_id = record.doc["id"].get(); - if(batch_doc_ids.find(doc_id) != batch_doc_ids.end()) { - record.index_failure(400, "Document with `id` " + doc_id + " already exists in the batch."); - } else { - batch_doc_ids.emplace(doc_id); - } } + batch_doc_ids.insert(doc_id); + // if `fallback_field_type` or `dynamic_fields` is enabled, update schema first before indexing if(!fallback_field_type.empty() || !dynamic_fields.empty()) { Option schema_change_op = check_and_update_schema(record.doc, dirty_values); @@ -248,25 +254,11 @@ nlohmann::json Collection::add_many(std::vector& json_lines, nlohma } } - /* - // check for memory threshold before allowing subsequent batches - if(is_exceeding_memory_threshold()) { - exceeds_memory_limit = true; - } - - if(exceeds_memory_limit) { - nlohmann::json index_res; - index_res["error"] = "Max memory ratio exceeded."; - index_res["success"] = false; - index_res["document"] = json_line; - json_lines[i] = index_res.dump(); - record.index_failure(500, "Max memory ratio exceeded."); - } - */ - index_records.emplace_back(std::move(record)); - if((i+1) % index_batch_size == 0 || i == json_lines.size()-1) { + do_batched_index: + + if((i+1) % index_batch_size == 0 || i == json_lines.size()-1 || repeated_doc) { batch_index(index_records, json_lines, num_indexed); // to return the document for the single doc add cases diff --git a/test/collection_specific_test.cpp b/test/collection_specific_test.cpp index c148b659..3c11518f 100644 --- a/test/collection_specific_test.cpp +++ b/test/collection_specific_test.cpp @@ -1452,7 +1452,7 @@ TEST_F(CollectionSpecificTest, ImportDocumentWithRepeatingIDInTheSameBatch) { ASSERT_TRUE(nlohmann::json::parse(import_records[0])["success"].get()); ASSERT_FALSE(nlohmann::json::parse(import_records[1])["success"].get()); - ASSERT_EQ("Document with `id` 0 already exists in the batch.", + ASSERT_EQ("A document with id 0 already exists.", nlohmann::json::parse(import_records[1])["error"].get()); auto results = coll1->search("levis", {"name"}, @@ -1485,7 +1485,7 @@ TEST_F(CollectionSpecificTest, ImportDocumentWithRepeatingIDInTheSameBatch) { ASSERT_TRUE(import_response["success"].get()); ASSERT_EQ(2, import_response["num_imported"].get()); - // repeated ID is rejected even if the first ID is not indexed due to some error + // repeated ID is NOT rejected if the first ID is not indexed due to some error import_records.clear(); doc1.erase("name"); doc1["id"] = "100"; @@ -1496,17 +1496,187 @@ 
TEST_F(CollectionSpecificTest, ImportDocumentWithRepeatingIDInTheSameBatch) { import_response = coll1->add_many(import_records, document); + LOG(INFO) << import_response; + ASSERT_FALSE(import_response["success"].get()); - ASSERT_EQ(0, import_response["num_imported"].get()); + ASSERT_EQ(1, import_response["num_imported"].get()); ASSERT_FALSE(nlohmann::json::parse(import_records[0])["success"].get()); - ASSERT_FALSE(nlohmann::json::parse(import_records[1])["success"].get()); - ASSERT_EQ("Document with `id` 100 already exists in the batch.", - nlohmann::json::parse(import_records[1])["error"].get()); + ASSERT_EQ("Field `name` has been declared in the schema, but is not found in the document.", + nlohmann::json::parse(import_records[0])["error"].get()); + + ASSERT_TRUE(nlohmann::json::parse(import_records[1])["success"].get()); collectionManager.drop_collection("coll1"); } + +TEST_F(CollectionSpecificTest, UpdateOfTwoDocsWithSameIdWithinSameBatch) { + std::vector fields = {field("last_chance", field_types::BOOL, false, true), + field("points", field_types::INT32, false),}; + + Collection* coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get(); + + nlohmann::json doc1; + doc1["id"] = "0"; + doc1["points"] = 100; + + ASSERT_TRUE(coll1->add(doc1.dump()).ok()); + + // second update should reflect the result of first update + std::vector updates = { + R"({"id": "0", "last_chance": false})", + R"({"id": "0", "points": 200})", + }; + + nlohmann::json update_doc; + auto import_response = coll1->add_many(updates, update_doc, UPDATE); + ASSERT_TRUE(import_response["success"].get()); + ASSERT_EQ(2, import_response["num_imported"].get()); + + auto results = coll1->search("*", {}, + "", {}, {}, {0}, 10, + 1, FREQUENCY, {true}, + 10, spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30, 4, "title", 20, {}, {}, {}, 0, + "", "", {}, 1000, true).get(); + + LOG(INFO) << results; + + ASSERT_EQ(1, results["hits"].size()); + ASSERT_EQ("0", results["hits"][0]["document"]["id"].get()); + ASSERT_TRUE(results["hits"][0]["document"].contains("last_chance")); + ASSERT_EQ(false, results["hits"][0]["document"]["last_chance"].get()); + + ASSERT_EQ(1, coll1->_get_index()->_get_numerical_index().at("points")->size()); + ASSERT_EQ(1, coll1->_get_index()->_get_numerical_index().at("last_chance")->size()); +} + +TEST_F(CollectionSpecificTest, UpsertOfTwoDocsWithSameIdWithinSameBatch) { + std::vector fields = {field("last_chance", field_types::BOOL, false, true), + field("points", field_types::INT32, false, true),}; + + Collection* coll1 = collectionManager.create_collection("coll1", 1, fields, "").get(); + + nlohmann::json doc1; + doc1["id"] = "0"; + doc1["points"] = 100; + + ASSERT_TRUE(coll1->add(doc1.dump()).ok()); + + // first upsert removes both fields, so second upsert should only insert "points" + std::vector upserts = { + R"({"id": "0", "last_chance": true})", + R"({"id": "0", "points": 200})", + }; + + nlohmann::json update_doc; + auto import_response = coll1->add_many(upserts, update_doc, UPSERT); + ASSERT_TRUE(import_response["success"].get()); + ASSERT_EQ(2, import_response["num_imported"].get()); + + auto results = coll1->search("*", {}, + "", {}, {}, {0}, 10, + 1, FREQUENCY, {true}, + 10, spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30, 4, "title", 20, {}, {}, {}, 0, + "", "", {}, 1000, true).get(); + + ASSERT_EQ(1, results["hits"].size()); + ASSERT_EQ("0", results["hits"][0]["document"]["id"].get()); + ASSERT_TRUE(results["hits"][0]["document"].contains("points")); + 
ASSERT_FALSE(results["hits"][0]["document"].contains("last_chance")); + ASSERT_EQ(200, results["hits"][0]["document"]["points"].get()); + + ASSERT_EQ(1, coll1->_get_index()->_get_numerical_index().at("points")->size()); + ASSERT_EQ(0, coll1->_get_index()->_get_numerical_index().at("last_chance")->size()); + + // update without doc id + + upserts = { + R"({"last_chance": true})", + }; + + import_response = coll1->add_many(upserts, update_doc, UPDATE); + ASSERT_FALSE(import_response["success"].get()); + ASSERT_EQ(0, import_response["num_imported"].get()); +} + +TEST_F(CollectionSpecificTest, UpdateUpsertOfDocWithMissingFields) { + std::vector fields = {field("last_chance", field_types::BOOL, false, true), + field("points", field_types::INT32, false, true),}; + + Collection* coll1 = collectionManager.create_collection("coll1", 1, fields, "").get(); + + nlohmann::json doc1; + doc1["id"] = "0"; + doc1["last_chance"] = true; + doc1["points"] = 100; + + ASSERT_TRUE(coll1->add(doc1.dump()).ok()); + + // upsert doc with missing fields: should be removed from index + std::vector upserts = { + R"({"id": "0"})" + }; + + nlohmann::json update_doc; + auto import_response = coll1->add_many(upserts, update_doc, UPSERT); + ASSERT_TRUE(import_response["success"].get()); + ASSERT_EQ(1, import_response["num_imported"].get()); + + auto results = coll1->search("*", {}, + "", {}, {}, {0}, 10, + 1, FREQUENCY, {true}, + 10, spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30, 4, "title", 20, {}, {}, {}, 0, + "", "", {}, 1000, true).get(); + + ASSERT_EQ(1, results["hits"].size()); + ASSERT_EQ("0", results["hits"][0]["document"]["id"].get()); + ASSERT_EQ(1, results["hits"][0]["document"].size()); + + ASSERT_EQ(0, coll1->_get_index()->_get_numerical_index().at("points")->size()); + ASSERT_EQ(0, coll1->_get_index()->_get_numerical_index().at("last_chance")->size()); + + // put the original doc back + ASSERT_TRUE(coll1->add(doc1.dump(), UPSERT).ok()); + + results = coll1->search("*", {}, + "", {}, {}, {0}, 10, + 1, FREQUENCY, {true}, + 10, spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30, 4, "title", 20, {}, {}, {}, 0, + "", "", {}, 1000, true).get(); + + ASSERT_EQ(1, results["hits"].size()); + ASSERT_EQ(3, results["hits"][0]["document"].size()); + + // update doc with missing fields: existing fields should NOT be removed + + upserts = { + R"({"id": "0"})" + }; + + import_response = coll1->add_many(upserts, update_doc, UPDATE); + ASSERT_TRUE(import_response["success"].get()); + ASSERT_EQ(1, import_response["num_imported"].get()); + + results = coll1->search("*", {}, + "", {}, {}, {0}, 10, + 1, FREQUENCY, {true}, + 10, spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30, 4, "title", 20, {}, {}, {}, 0, + "", "", {}, 1000, true).get(); + + ASSERT_EQ(1, results["hits"].size()); + ASSERT_EQ("0", results["hits"][0]["document"]["id"].get()); + ASSERT_EQ(3, results["hits"][0]["document"].size()); + + ASSERT_EQ(1, coll1->_get_index()->_get_numerical_index().at("points")->size()); + ASSERT_EQ(1, coll1->_get_index()->_get_numerical_index().at("last_chance")->size()); +} + TEST_F(CollectionSpecificTest, FacetParallelizationVerification) { std::vector fields = {field("name", field_types::STRING, false), field("category", field_types::STRING, true), From cec0779bb227dc7d071d033c296a28596a59e84a Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Fri, 26 Nov 2021 21:08:24 +0530 Subject: [PATCH 15/20] Add coll drops. 
--- test/collection_specific_test.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/test/collection_specific_test.cpp b/test/collection_specific_test.cpp index 3c11518f..09cd6c4d 100644 --- a/test/collection_specific_test.cpp +++ b/test/collection_specific_test.cpp @@ -1550,6 +1550,8 @@ TEST_F(CollectionSpecificTest, UpdateOfTwoDocsWithSameIdWithinSameBatch) { ASSERT_EQ(1, coll1->_get_index()->_get_numerical_index().at("points")->size()); ASSERT_EQ(1, coll1->_get_index()->_get_numerical_index().at("last_chance")->size()); + + collectionManager.drop_collection("coll1"); } TEST_F(CollectionSpecificTest, UpsertOfTwoDocsWithSameIdWithinSameBatch) { @@ -1600,6 +1602,8 @@ TEST_F(CollectionSpecificTest, UpsertOfTwoDocsWithSameIdWithinSameBatch) { import_response = coll1->add_many(upserts, update_doc, UPDATE); ASSERT_FALSE(import_response["success"].get()); ASSERT_EQ(0, import_response["num_imported"].get()); + + collectionManager.drop_collection("coll1"); } TEST_F(CollectionSpecificTest, UpdateUpsertOfDocWithMissingFields) { @@ -1675,6 +1679,8 @@ TEST_F(CollectionSpecificTest, UpdateUpsertOfDocWithMissingFields) { ASSERT_EQ(1, coll1->_get_index()->_get_numerical_index().at("points")->size()); ASSERT_EQ(1, coll1->_get_index()->_get_numerical_index().at("last_chance")->size()); + + collectionManager.drop_collection("coll1"); } TEST_F(CollectionSpecificTest, FacetParallelizationVerification) { @@ -1937,4 +1943,6 @@ TEST_F(CollectionSpecificTest, DroppedTokensShouldNotBeUsedForPrefixSearch) { ASSERT_EQ(2, results["hits"].size()); ASSERT_EQ("1", results["hits"][0]["document"]["id"].get()); ASSERT_EQ("0", results["hits"][1]["document"]["id"].get()); + + collectionManager.drop_collection("coll1"); } From d9b969aab5a5cdb523024782b7514befaaac1e49 Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Tue, 30 Nov 2021 14:21:12 +0530 Subject: [PATCH 16/20] Sort queued requests by timestamp before restore. --- src/batched_indexer.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/batched_indexer.cpp b/src/batched_indexer.cpp index e149487e..04e5d1e1 100644 --- a/src/batched_indexer.cpp +++ b/src/batched_indexer.cpp @@ -385,16 +385,16 @@ void BatchedIndexer::load_state(const nlohmann::json& state) { queue_ids.push_back(queue_id); std::unique_lock qlk(qmutuxes[queue_id].mcv); queues[queue_id].emplace_back(req->start_ts); - qmutuxes[queue_id].cv.notify_one(); } num_reqs_restored++; } - // need to sort on `start_ts` to preserve original order + // need to sort on `start_ts` to preserve original order before notifying queues for(auto queue_id: queue_ids) { std::unique_lock lk(qmutuxes[queue_id].mcv); std::sort(queues[queue_id].begin(), queues[queue_id].end()); + qmutuxes[queue_id].cv.notify_one(); } LOG(INFO) << "Restored " << num_reqs_restored << " in-flight requests from snapshot."; From 6ded5c7556f75ad2edfaa80e38d48429a8c916be Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Tue, 30 Nov 2021 15:56:25 +0530 Subject: [PATCH 17/20] Fix posting list deletion edge case. 
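The edge case: erasing from a block that then holds fewer than half its capacity pulls ids from the next block, and moving too many simply flips the imbalance onto the next block (sizes 1,5 becoming 5,1). The fix caps the refill at half the block capacity and also refills an emptied root block. A simplified sketch of the refill rule on plain vectors (illustration only):

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    void refill_from_next(std::vector<uint32_t>& block, std::vector<uint32_t>& next_block,
                          size_t block_capacity) {
        if(block.size() >= block_capacity / 2 || next_block.empty()) {
            return; // block is still healthy enough, or nothing to pull from
        }

        size_t num_to_move = (block.size() + next_block.size() <= block_capacity)
                                ? next_block.size()   // next block fits entirely: merge it
                                : block_capacity / 2; // otherwise move only half the capacity

        block.insert(block.end(), next_block.begin(), next_block.begin() + num_to_move);
        next_block.erase(next_block.begin(), next_block.begin() + num_to_move);
    }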
--- src/posting_list.cpp | 35 +++++++---- test/posting_list_test.cpp | 117 +++++++++++++++++++++++++++++-------- 2 files changed, 117 insertions(+), 35 deletions(-) diff --git a/src/posting_list.cpp b/src/posting_list.cpp index 823dd028..c888225e 100644 --- a/src/posting_list.cpp +++ b/src/posting_list.cpp @@ -186,7 +186,9 @@ bool posting_list_t::block_t::contains(uint32_t id) { /* posting_list_t operations */ posting_list_t::posting_list_t(uint16_t max_block_elements): BLOCK_MAX_ELEMENTS(max_block_elements) { - + if(max_block_elements <= 1) { + throw std::invalid_argument("max_block_elements must be > 1"); + } } posting_list_t::~posting_list_t() { @@ -423,7 +425,7 @@ void posting_list_t::upsert(const uint32_t id, const std::vector& offs } else { const auto it = id_block_map.lower_bound(id); upsert_block = (it == id_block_map.end()) ? id_block_map.rbegin()->second : it->second; - before_upsert_last_id = upsert_block->ids.at(upsert_block->size() - 1); + before_upsert_last_id = upsert_block->ids.last(); } // happy path: upsert_block is not full @@ -431,7 +433,7 @@ void posting_list_t::upsert(const uint32_t id, const std::vector& offs uint32_t num_inserted = upsert_block->upsert(id, offsets); ids_length += num_inserted; - last_id_t after_upsert_last_id = upsert_block->ids.at(upsert_block->size() - 1); + last_id_t after_upsert_last_id = upsert_block->ids.last(); if(before_upsert_last_id != after_upsert_last_id) { id_block_map.erase(before_upsert_last_id); id_block_map.emplace(after_upsert_last_id, upsert_block); @@ -451,12 +453,12 @@ void posting_list_t::upsert(const uint32_t id, const std::vector& offs // evenly divide elements between both blocks split_block(upsert_block, new_block); - last_id_t after_upsert_last_id = upsert_block->ids.at(upsert_block->size() - 1); + last_id_t after_upsert_last_id = upsert_block->ids.last(); id_block_map.erase(before_upsert_last_id); id_block_map.emplace(after_upsert_last_id, upsert_block); } - last_id_t after_new_block_id = new_block->ids.at(new_block->size() - 1); + last_id_t after_new_block_id = new_block->ids.last(); id_block_map.emplace(after_new_block_id, new_block); new_block->next = upsert_block->next; @@ -485,6 +487,17 @@ void posting_list_t::erase(const uint32_t id) { // since we will be deleting the empty node, set the previous node's next pointer to null std::prev(it)->second->next = nullptr; delete erase_block; + } else { + // The root block cannot be empty if there are other blocks so we will pull some contents from next block + // This is only an issue for blocks with max size of 2 + if(root_block.next != nullptr) { + auto next_block_last_id = erase_block->next->ids.last(); + merge_adjacent_blocks(erase_block, erase_block->next, erase_block->next->size()/2); + id_block_map.erase(next_block_last_id); + + id_block_map.emplace(erase_block->next->ids.last(), erase_block->next); + id_block_map.emplace(erase_block->ids.last(), erase_block); + } } id_block_map.erase(before_last_id); @@ -493,7 +506,7 @@ void posting_list_t::erase(const uint32_t id) { } if(new_ids_length >= BLOCK_MAX_ELEMENTS/2 || erase_block->next == nullptr) { - last_id_t after_last_id = erase_block->ids.at(new_ids_length-1); + last_id_t after_last_id = erase_block->ids.last(); if(before_last_id != after_last_id) { id_block_map.erase(before_last_id); id_block_map.emplace(after_last_id, erase_block); @@ -505,7 +518,7 @@ void posting_list_t::erase(const uint32_t id) { // block is less than 50% of max capacity and contains a next node which we can refill from auto next_block = 
erase_block->next; - last_id_t next_block_last_id = next_block->ids.at(next_block->ids.getLength()-1); + last_id_t next_block_last_id = next_block->ids.last(); if(erase_block->size() + next_block->size() <= BLOCK_MAX_ELEMENTS) { // we can merge the contents of next block with `erase_block` and delete the next block @@ -515,13 +528,15 @@ void posting_list_t::erase(const uint32_t id) { id_block_map.erase(next_block_last_id); } else { - // only part of the next block can be moved over - size_t num_block2_ids = BLOCK_MAX_ELEMENTS - erase_block->size(); + // Only part of the next block can be moved over. + // We will move only 50% of max elements to ensure that we don't end up "flipping" adjacent blocks: + // 1, 5 -> 5, 1 + size_t num_block2_ids = BLOCK_MAX_ELEMENTS/2; merge_adjacent_blocks(erase_block, next_block, num_block2_ids); // NOTE: we don't have to update `id_block_map` for `next_block` as last element doesn't change } - last_id_t after_last_id = erase_block->ids.at(erase_block->ids.getLength()-1); + last_id_t after_last_id = erase_block->ids.last(); if(before_last_id != after_last_id) { id_block_map.erase(before_last_id); id_block_map.emplace(after_last_id, erase_block); diff --git a/test/posting_list_test.cpp b/test/posting_list_test.cpp index 09c567ef..b9ecec3d 100644 --- a/test/posting_list_test.cpp +++ b/test/posting_list_test.cpp @@ -447,13 +447,13 @@ TEST_F(PostingListTest, RemovalsOnLaterBlocks) { // only part of the next node contents can be moved over when we delete 8 since (1 + 5) > 5 pl.erase(8); - // [0..4], [9], [10..14] => [0..4], [9,10,11,12,13], [14] + // [0..4], [9], [10..14] => [0..4], [9,10,11], [12,13,14] ASSERT_EQ(3, pl.num_blocks()); ASSERT_EQ(11, pl.num_ids()); - ASSERT_EQ(5, pl.get_root()->next->size()); - ASSERT_EQ(1, pl.get_root()->next->next->size()); - ASSERT_EQ(13, pl.get_root()->next->ids.last()); + ASSERT_EQ(3, pl.get_root()->next->size()); + ASSERT_EQ(3, pl.get_root()->next->next->size()); + ASSERT_EQ(11, pl.get_root()->next->ids.last()); ASSERT_EQ(14, pl.get_root()->next->next->ids.last()); for(size_t i = 0; i < pl.get_root()->next->offset_index.getLength(); i++) { @@ -616,27 +616,6 @@ TEST_F(PostingListTest, SplittingOfListsSimple) { std::vector> partial_its_vec(4); intersector.split_lists(4, partial_its_vec); - /*for(size_t i = 0; i < partial_its_vec.size(); i++) { - auto& partial_its = partial_its_vec[i]; - - if (partial_its.empty()) { - continue; - } - - LOG(INFO) << "Vec " << i; - - for (auto& it: partial_its) { - while (it.valid()) { - LOG(INFO) << it.id(); - it.next(); - } - - LOG(INFO) << "---"; - } - } - - return ;*/ - std::vector>> split_ids = { {{0, 2}, {1, 3}, {2, 3}}, {{3, 20}, {1, 3, 5, 10, 20}, {2, 3, 5, 7, 20}} @@ -1384,6 +1363,94 @@ TEST_F(PostingListTest, BlockIntersectionOnMixedLists) { free(list1); } +TEST_F(PostingListTest, InsertAndEraseSequence) { + std::vector offsets = {0, 1, 3}; + posting_list_t pl(5); + + pl.upsert(0, offsets); + pl.upsert(2, offsets); + pl.upsert(4, offsets); + pl.upsert(6, offsets); + pl.upsert(8, offsets); + + // this will cause a split of the root block + pl.upsert(3, offsets); // 0,2,3 | 4,6,8 + pl.erase(0); // 2,3 | 4,6,8 + pl.upsert(5, offsets); // 2,3 | 4,5,6,8 + pl.upsert(7, offsets); // 2,3 | 4,5,6,7,8 + pl.upsert(10, offsets); // 2,3 | 4,5,6,7,8 | 10 + + // this will cause adjacent block refill + pl.erase(2); // 3,4,5,6,7 | 8 | 10 + + // deletes second block + pl.erase(8); + + // remove all elements + pl.erase(3); + pl.erase(4); + pl.erase(5); + pl.erase(6); + pl.erase(7); + pl.erase(10); + + 
ASSERT_EQ(0, pl.num_ids());
+}
+
+TEST_F(PostingListTest, InsertAndEraseSequenceWithBlockSizeTwo) {
+    std::vector offsets = {0, 1, 3};
+    posting_list_t pl(2);
+
+    pl.upsert(2, offsets);
+    pl.upsert(3, offsets);
+    pl.upsert(1, offsets); // smaller id: the full root block must split
+
+    // 1 | 2,3
+
+    pl.erase(1);
+
+    ASSERT_EQ(1, pl.get_root()->size());
+    ASSERT_EQ(2, pl.num_blocks());
+
+    pl.erase(3);
+    pl.erase(2);
+
+    ASSERT_EQ(0, pl.get_root()->size());
+}
+
+TEST_F(PostingListTest, PostingListMustHaveAtleast1Element) {
+    try {
+        std::vector offsets = {0, 1, 3};
+        posting_list_t pl(1);
+        FAIL() << "Expected std::invalid_argument";
+    }
+    catch(std::invalid_argument const & err) {
+        EXPECT_EQ(err.what(), std::string("max_block_elements must be > 1"));
+    } catch(...) {
+        FAIL() << "Expected std::invalid_argument";
+    }
+}
+
+TEST_F(PostingListTest, DISABLED_RandInsertAndErase) {
+    std::vector offsets = {0, 1, 3};
+    posting_list_t pl(5);
+
+    time_t t;
+    srand((unsigned) time(&t));
+
+    for(size_t i = 0; i < 10000; i++) {
+        LOG(INFO) << "i: " << i;
+        uint32_t add_id = rand() % 15;
+        pl.upsert(add_id, offsets);
+
+        uint32_t del_id = rand() % 15;
+        LOG(INFO) << "add: " << add_id << ", erase: " << del_id;
+        pl.erase(del_id);
+    }
+
+    LOG(INFO) << "Num ids: " << pl.num_ids() << ", num blocks: " << pl.num_blocks();
+}
+
 TEST_F(PostingListTest, DISABLED_Benchmark) {
     std::vector offsets = {0, 1, 3};
     posting_list_t pl(4096);

From 1bd028e951f1d8dee4a1de780ce8c71abc77b213 Mon Sep 17 00:00:00 2001
From: Kishore Nallan
Date: Tue, 30 Nov 2021 16:51:41 +0530
Subject: [PATCH 18/20] Fix warning.

---
 src/posting_list.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/posting_list.cpp b/src/posting_list.cpp
index c888225e..a3dc949a 100644
--- a/src/posting_list.cpp
+++ b/src/posting_list.cpp
@@ -269,7 +269,8 @@ void posting_list_t::merge_adjacent_blocks(posting_list_t::block_t* block1, post
     size_t new_block1_offsets_size = block1->offsets.getLength() + num_block2_offsets_to_move;
     uint32_t* new_block1_offsets = new uint32_t[new_block1_offsets_size];

-    uint32_t min = offsets1[0], max = offsets1[0];
+    uint32_t min = block1->offsets.getLength() != 0 ? offsets1[0] : 0;
+    uint32_t max = min;

     // we have to manually copy over so we can find the new min and max
     for(size_t i = 0; i < block1->offsets.getLength(); i++) {

From 11f919c4f8ea0cf9ba2fe4c0fc20ae23baf0c1a6 Mon Sep 17 00:00:00 2001
From: Kishore Nallan
Date: Tue, 30 Nov 2021 19:35:41 +0530
Subject: [PATCH 19/20] Use set for batch indexer request restore.
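Restoring a snapshot can queue many requests for the same collection, and the old std::vector collected one queue id per request, so the follow-up loop re-sorted the same queue and re-notified its worker once per restored request. A std::set keeps each queue id only once, so every queue is sorted on start_ts and its worker woken exactly once.

Condensed sketch of the restore flow after this change (locking around the queues is omitted for brevity; restored_reqs is a hypothetical stand-in for iterating state["req_res_map"]):

    std::set<uint64_t> queue_ids;
    for(const auto& req: restored_reqs) {
        const std::string& coll_name = get_collection_name(req);
        uint64_t queue_id = StringUtils::hash_wy(coll_name.c_str(), coll_name.size()) % num_threads;
        queue_ids.insert(queue_id);                    // duplicates collapse here
        queues[queue_id].emplace_back(req->start_ts);
    }

    for(auto queue_id: queue_ids) {
        std::sort(queues[queue_id].begin(), queues[queue_id].end());  // preserve original order
        qmutuxes[queue_id].cv.notify_one();            // wake each queue's worker once
    }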
--- src/batched_indexer.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/batched_indexer.cpp b/src/batched_indexer.cpp index 04e5d1e1..4d25f976 100644 --- a/src/batched_indexer.cpp +++ b/src/batched_indexer.cpp @@ -354,7 +354,7 @@ void BatchedIndexer::load_state(const nlohmann::json& state) { queued_writes = state["queued_writes"].get(); size_t num_reqs_restored = 0; - std::vector queue_ids; + std::set queue_ids; for(auto& kv: state["req_res_map"].items()) { std::shared_ptr req = std::make_shared(); @@ -382,7 +382,7 @@ void BatchedIndexer::load_state(const nlohmann::json& state) { const std::string& coll_name = get_collection_name(req); uint64_t queue_id = StringUtils::hash_wy(coll_name.c_str(), coll_name.size()) % num_threads; - queue_ids.push_back(queue_id); + queue_ids.insert(queue_id); std::unique_lock qlk(qmutuxes[queue_id].mcv); queues[queue_id].emplace_back(req->start_ts); } From 339300dcd00b896e2f1b1f1a39e0a13d5748e7d5 Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Tue, 30 Nov 2021 20:20:19 +0530 Subject: [PATCH 20/20] Lowercase unicode while tokenization. --- src/tokenizer.cpp | 2 +- test/collection_specific_test.cpp | 30 ++++++++++++++++++++++-------- 2 files changed, 23 insertions(+), 9 deletions(-) diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp index 1d868e01..745e47e8 100644 --- a/src/tokenizer.cpp +++ b/src/tokenizer.cpp @@ -96,7 +96,7 @@ bool Tokenizer::next(std::string &token, size_t& token_index, size_t& start_inde LOG(ERROR) << "Unicode error during parsing: " << errcode; } } else { - token = unicode_text.tempSubString(prev_position, length).toUTF8String(word); + token = unicode_text.toLower().tempSubString(prev_position, length).toUTF8String(word); } if(!token.empty()) { diff --git a/test/collection_specific_test.cpp b/test/collection_specific_test.cpp index 09cd6c4d..6ea594c4 100644 --- a/test/collection_specific_test.cpp +++ b/test/collection_specific_test.cpp @@ -1541,15 +1541,28 @@ TEST_F(CollectionSpecificTest, UpdateOfTwoDocsWithSameIdWithinSameBatch) { spp::sparse_hash_set(), 10, "", 30, 4, "title", 20, {}, {}, {}, 0, "", "", {}, 1000, true).get(); - LOG(INFO) << results; + collectionManager.drop_collection("coll1"); +} - ASSERT_EQ(1, results["hits"].size()); - ASSERT_EQ("0", results["hits"][0]["document"]["id"].get()); - ASSERT_TRUE(results["hits"][0]["document"].contains("last_chance")); - ASSERT_EQ(false, results["hits"][0]["document"]["last_chance"].get()); +TEST_F(CollectionSpecificTest, CyrillicText) { + // when the first document containing a token already cannot fit compact posting list - ASSERT_EQ(1, coll1->_get_index()->_get_numerical_index().at("points")->size()); - ASSERT_EQ(1, coll1->_get_index()->_get_numerical_index().at("last_chance")->size()); + std::vector fields = {field("title", field_types::STRING, false, false, true, "sr"),}; + + Collection* coll1 = collectionManager.create_collection("coll1", 1, fields).get(); + + nlohmann::json doc; + doc["title"] = "Test Тест"; + ASSERT_TRUE(coll1->add(doc.dump()).ok()); + + doc["title"] = "TEST ТЕСТ"; + ASSERT_TRUE(coll1->add(doc.dump()).ok()); + + auto results = coll1->search("тест", {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {false}).get(); + + ASSERT_EQ(2, results["hits"].size()); + ASSERT_EQ("1", results["hits"][0]["document"]["id"].get()); + ASSERT_EQ("0", results["hits"][1]["document"]["id"].get()); collectionManager.drop_collection("coll1"); } @@ -1779,9 +1792,10 @@ TEST_F(CollectionSpecificTest, VerbatimMatchShouldConsiderTokensMatchedAcrossAll 
ASSERT_EQ("3", results["hits"][0]["document"]["id"].get()); ASSERT_EQ("2", results["hits"][1]["document"]["id"].get()); + ASSERT_EQ(2, results["hits"].size()); + collectionManager.drop_collection("coll1"); } - TEST_F(CollectionSpecificTest, CustomNumTyposConfiguration) { // dropped tokens on a single field cannot be deemed as verbatim match