From 5b2407433f46d6c6243b8a9849920cf79181733b Mon Sep 17 00:00:00 2001 From: kishorenc Date: Fri, 5 Jun 2020 20:28:33 +0530 Subject: [PATCH 01/38] Refactor topster to support grouping. --- include/topster.h | 244 ++++++++++++++++++++++++----------------- src/index.cpp | 6 +- src/main/benchmark.cpp | 4 +- test/topster_test.cpp | 6 +- 4 files changed, 156 insertions(+), 104 deletions(-) diff --git a/include/topster.h b/include/topster.h index 6ebadece..a065bb01 100644 --- a/include/topster.h +++ b/include/topster.h @@ -5,16 +5,26 @@ #include #include #include -#include -#include struct KV { uint8_t field_id; uint16_t query_index; uint16_t array_index; uint64_t key; + uint64_t distinct_key; uint64_t match_score; - int64_t scores[3]; // match score + 2 custom attributes + int64_t scores[3]{}; // match score + 2 custom attributes + + KV(uint8_t fieldId, uint16_t queryIndex, uint16_t arrayIndex, uint64_t key, uint64_t distinct_key, + uint64_t match_score, const int64_t *scores): + field_id(fieldId), query_index(queryIndex), array_index(arrayIndex), key(key), + distinct_key(distinct_key), match_score(match_score) { + this->scores[0] = scores[0]; + this->scores[1] = scores[1]; + this->scores[2] = scores[2]; + } + + KV() {} }; /* @@ -25,12 +35,18 @@ struct Topster { uint32_t size; KV *data; - KV* *kvs; + KV** kvs; + spp::sparse_hash_map kv_map; - spp::sparse_hash_map keys; + KV* min_kv; + spp::sparse_hash_map group_kv_map; + size_t distinct; - explicit Topster(size_t capacity): MAX_SIZE(capacity), size(0) { - // we allocate data first to get contiguous memory block whose indices are then assigned to `kvs` + explicit Topster(size_t capacity): Topster(capacity, 0) { + } + + explicit Topster(size_t capacity, size_t distinct): MAX_SIZE(capacity), size(0), distinct(distinct) { + // we allocate data first to get a memory block whose indices are then assigned to `kvs` // we use separate **kvs for easier pointer swaps data = new KV[capacity]; kvs = new KV*[capacity]; @@ -38,15 +54,23 @@ struct Topster { for(size_t i=0; iarray_index = a_index; } - static inline void replace_key_values(const uint64_t &key, const uint8_t &field_id, const uint16_t &query_index, - const uint64_t &match_score, const int64_t *scores, uint32_t start, - KV* *kvs, spp::sparse_hash_map& keys) { - kvs[start]->key = key; - kvs[start]->field_id = field_id; - kvs[start]->query_index = query_index; - kvs[start]->array_index = start; - kvs[start]->match_score = match_score; - kvs[start]->scores[0] = scores[0]; - kvs[start]->scores[1] = scores[1]; - kvs[start]->scores[2] = scores[2]; - - keys.erase(kvs[start]->key); - keys[key] = kvs[start]; + static inline void copyMe(KV* a, KV* b) { + size_t b_index = b->array_index; + *b = *a; + b->array_index = b_index; } - void add(const uint64_t &key, const uint8_t &field_id, const uint16_t &query_index, const uint64_t &match_score, - const int64_t scores[3]) { - if (size >= MAX_SIZE) { - if(!is_greater(kvs[0], scores)) { - // when incoming value is less than the smallest in the heap, ignore - return; + bool add(KV* kv) { + //LOG(INFO) << "kv_map size: " << kv_map.size() << " -- kvs[0]: " << kvs[0]->match_score; + /*for(auto kv: kv_map) { + LOG(INFO) << "kv key: " << kv.first << " => " << kv.second->match_score; + }*/ + + bool less_than_min_heap = (size >= MAX_SIZE) && is_smaller_equal(kv, kvs[0]); + size_t heap_down_index = 0; + + if(!distinct && less_than_min_heap) { + // for non-distinct, if incoming value is smaller than min-heap ignore + return false; + } + + if(distinct) { + const auto& found_it 
= group_kv_map.find(kv->distinct_key); + bool is_duplicate_key = (found_it != group_kv_map.end()); + + if(!is_duplicate_key && less_than_min_heap) { + // for distinct, if a non duplicate kv is < than min heap we also ignore + return false; } - uint32_t start = 0; - - // When the key already exists and has a greater score, ignore. Otherwise, we have to replace. - // NOTE: we don't consider primary and secondary attrs here because they will be the same for a given key. - if(keys.count(key) != 0) { - const KV* existing = keys.at(key); - if(match_score <= existing->match_score) { - return ; + if(is_duplicate_key) { + // if min heap (group_topster.kvs[0]) changes, we have to update kvs and sift down + Topster* group_topster = found_it->second; + uint16_t old_min_heap_array_index = group_topster->min_kv->array_index; + bool added = group_topster->add(kv); + if(!added) { + return false; } - // replace and sift down - start = existing->array_index; - } + // if added, guaranteed to be larger than old_min_heap_ele + copyMe(kv, group_topster->min_kv); + heap_down_index = old_min_heap_array_index; + } else { + // we have to replace min heap element + // create fresh topster for this distinct group key since it does not exist - replace_key_values(key, field_id, query_index, match_score, scores, start, kvs, keys); + Topster* group_topster = new Topster(distinct, 0); + group_topster->add(kv); + copyMe(kv, group_topster->min_kv); - // sift down to maintain heap property - while ((2*start+1) < MAX_SIZE) { - uint32_t next = (2 * start + 1); - if (next+1 < MAX_SIZE && is_greater_kv(kvs[next], kvs[next+1])) { - next++; - } - - if (is_greater_kv(kvs[start], kvs[next])) { - swapMe(&kvs[start], &kvs[next]); + if(size < MAX_SIZE) { + // we just copy to end of array + heap_down_index = size; + size++; } else { - break; + // kv is guaranteed to be > current min heap (group_topster.kvs[0]) + heap_down_index = 0; + + // remove current min heap group key from map + delete group_kv_map[kvs[heap_down_index]->distinct_key]; + group_kv_map.erase(kvs[heap_down_index]->distinct_key); } - start = next; + // add new group key to map + group_kv_map.emplace(kv->distinct_key, group_topster); } - } else { - uint32_t start = size; - bool key_found = false; - // When the key already exists and has a greater score, ignore. Otherwise, we have to replace - if(keys.count(key) != 0) { - const KV* existing = keys.at(key); - if(match_score <= existing->match_score) { - return ; + } else { // not distinct + //LOG(INFO) << "Searching for key: " << kv->key; + + const auto& found_it = kv_map.find(kv->key); + bool is_duplicate_key = (found_it != kv_map.end()); + + if(is_duplicate_key) { + // Need to check if kv is greater than existing duplicate kv. 
+ KV* existing_kv = found_it->second; + //LOG(INFO) << "existing_kv: " << existing_kv->key << " -> " << existing_kv->match_score; + + if(is_smaller_equal(kv, existing_kv)) { + return false; } - // replace and sift down - start = existing->array_index; - key_found = true; + // replace existing kv and sift down + heap_down_index = existing_kv->array_index; + kv_map.erase(kvs[heap_down_index]->key); + + // kv will be swapped into heap_down_index + kv_map.emplace(kv->key, kvs[heap_down_index]); } - replace_key_values(key, field_id, query_index, match_score, scores, start, kvs, keys); - - if(key_found) { - // need to sift down if it's a replace - while ((2*start+1) < size) { - uint32_t next = (2 * start + 1); - if (next+1 < size && is_greater_kv(kvs[next], kvs[next+1])) { - next++; - } - - if (is_greater_kv(kvs[start], kvs[next])) { - swapMe(&kvs[start], &kvs[next]); - } else { - break; - } - - start = next; - } - - return ; - } - - while(start > 0) { - uint32_t parent = (start-1)/2; - if (is_greater_kv(kvs[parent], kvs[start])) { - swapMe(&kvs[start], &kvs[parent]); - start = parent; + else { + if(size < MAX_SIZE) { + // we just copy to end of array + heap_down_index = size; + size++; } else { - break; + // kv is guaranteed to be > min heap. + // we have to replace min heap element since array is full + heap_down_index = 0; + kv_map.erase(kvs[heap_down_index]->key); } - } - if(keys.count(key) != 0) { - size++; + // kv will be swapped into heap_down_index pointer + kv_map.emplace(kv->key, kvs[heap_down_index]); } } - } - static bool is_greater(const struct KV* i, const int64_t scores[3]) { - return std::tie(scores[0], scores[1], scores[2]) > - std::tie(i->scores[0], i->scores[1], i->scores[2]); + // we have to replace the existing element in the heap and sift down + copyMe(kv, kvs[heap_down_index]); + + if(size < MAX_SIZE) { + heap_down_index = 0; + } + + // sift down to maintain heap property + while ((2*heap_down_index+1) < size) { + uint32_t next = (2 * heap_down_index + 1); // left child + if (next+1 < size && is_greater_kv(kvs[next], kvs[next+1])) { + // for min heap we compare with the minimum of children + next++; // right child (2n + 2) + } + + if (is_greater_kv(kvs[heap_down_index], kvs[next])) { + swapMe(&kvs[heap_down_index], &kvs[next]); + } else { + break; + } + + heap_down_index = next; + } + + return true; } static bool is_greater_kv(const struct KV* i, const struct KV* j) { @@ -178,6 +221,11 @@ struct Topster { std::tie(j->scores[0], j->scores[1], j->scores[2], j->key); } + static bool is_smaller_equal(const struct KV* i, const struct KV* j) { + return std::tie(i->scores[0], i->scores[1], i->scores[2]) <= + std::tie(j->scores[0], j->scores[1], j->scores[2]); + } + static bool is_greater_kv_value(const struct KV & i, const struct KV & j) { return std::tie(i.scores[0], i.scores[1], i.scores[2], i.key) > std::tie(j.scores[0], j.scores[1], j.scores[2], j.key); diff --git a/src/index.cpp b/src/index.cpp index 71dd092b..96df1f78 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -1135,7 +1135,8 @@ void Index::collate_curated_ids(const std::string & query, const std::string & f scores[1] = int64_t(1); scores[2] = int64_t(1); - curated_topster.add(seq_id, field_id, searched_queries.size(), match_score, scores); + KV kv(field_id, searched_queries.size(), 0, seq_id, seq_id, match_score, scores); + curated_topster.add(&kv); searched_queries.push_back(override_query); } @@ -1540,7 +1541,8 @@ void Index::score_results(const std::vector & sort_fields, const uint16 } } - topster.add(seq_id, 
field_id, query_index, match_score, scores);
+        KV kv(field_id, query_index, 0, seq_id, seq_id, match_score, scores);
+        topster.add(&kv);
     }
 
     //long long int timeNanos = std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - begin).count();
diff --git a/src/main/benchmark.cpp b/src/main/benchmark.cpp
index 793eb00a..3000c26e 100644
--- a/src/main/benchmark.cpp
+++ b/src/main/benchmark.cpp
@@ -46,7 +46,7 @@ void benchmark_hn_titles(char* file_path) {
     Store *store = new Store("/tmp/typesense-data");
     CollectionManager & collectionManager = CollectionManager::get_instance();
-    collectionManager.init(store, 4, "abcd", "1234");
+    collectionManager.init(store, 4, "abcd");
     collectionManager.load();
     Collection *collection = collectionManager.get_collection("hnstories_direct");
@@ -116,7 +116,7 @@ void benchmark_reactjs_pages(char* file_path) {
     Store *store = new Store("/tmp/typesense-data");
     CollectionManager & collectionManager = CollectionManager::get_instance();
-    collectionManager.init(store, 4, "abcd", "1234");
+    collectionManager.init(store, 4, "abcd");
     collectionManager.load();
     Collection *collection = collectionManager.get_collection("reactjs_pages");
diff --git a/test/topster_test.cpp b/test/topster_test.cpp
index 1e9438a8..a685fcb0 100644
--- a/test/topster_test.cpp
+++ b/test/topster_test.cpp
@@ -36,7 +36,8 @@ TEST(TopsterTest, MaxIntValues) {
         scores[1] = data[i].primary_attr;
         scores[2] = data[i].secondary_attr;
-        topster.add(data[i].key, data[i].field_id, data[i].query_index, data[i].match_score, scores);
+        KV kv(data[i].field_id, data[i].query_index, 0, data[i].key, data[i].key, data[i].match_score, scores);
+        topster.add(&kv);
     }
 
     topster.sort();
@@ -87,7 +88,8 @@ TEST(TopsterTest, MaxFloatValues) {
         scores[1] = Index::float_to_in64_t(data[i].primary_attr);
         scores[2] = data[i].secondary_attr;
-        topster.add(data[i].key, data[i].field_id, data[i].query_index, data[i].match_score, scores);
+        KV kv(data[i].field_id, data[i].query_index, 0, data[i].key, data[i].key, data[i].match_score, scores);
+        topster.add(&kv);
     }
 
     topster.sort();
 
From 8f458640fd4b036e1e84a37a58f3405e9a9389cb Mon Sep 17 00:00:00 2001
From: kishorenc
Date: Sat, 6 Jun 2020 12:58:47 +0530
Subject: [PATCH 02/38] Choose sift down/up based on array size.
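
With a min-heap kept in a plain array, the direction of the repair pass after
an insert depends on where the new element lands. A duplicate key overwrites a
slot somewhere inside the array with a guaranteed-larger value, so the heap
property can only break towards the children: sift down, regardless of size. A
brand-new key is appended at index `size`, so it can only break towards the
parent: sift up. When the array is already full, the root kvs[0] is overwritten
and sifted down. A minimal standalone sketch of the two repair passes this
patch switches between (simplified: plain int64_t values stand in for KV
entries, and a raw `<` stands in for the tuple comparator):

    #include <cstdint>
    #include <utility>
    #include <vector>

    // Repair downwards after heap[i] was overwritten with a larger value.
    void sift_down(std::vector<int64_t>& heap, size_t i) {
        while (2 * i + 1 < heap.size()) {
            size_t child = 2 * i + 1;                    // left child
            if (child + 1 < heap.size() && heap[child + 1] < heap[child]) {
                child++;                                 // smaller right child (2i + 2)
            }
            if (heap[child] >= heap[i]) break;           // heap property restored
            std::swap(heap[i], heap[child]);
            i = child;
        }
    }

    // Repair upwards after a value was appended at i = heap.size() - 1.
    void sift_up(std::vector<int64_t>& heap, size_t i) {
        while (i > 0 && heap[i] < heap[(i - 1) / 2]) {
            std::swap(heap[i], heap[(i - 1) / 2]);       // rise above larger parent
            i = (i - 1) / 2;
        }
    }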
--- include/topster.h | 114 ++++++++++++++++++++++++++---------------- src/index.cpp | 4 +- test/topster_test.cpp | 4 +- 3 files changed, 75 insertions(+), 47 deletions(-) diff --git a/include/topster.h b/include/topster.h index a065bb01..e564a0e2 100644 --- a/include/topster.h +++ b/include/topster.h @@ -15,9 +15,9 @@ struct KV { uint64_t match_score; int64_t scores[3]{}; // match score + 2 custom attributes - KV(uint8_t fieldId, uint16_t queryIndex, uint16_t arrayIndex, uint64_t key, uint64_t distinct_key, + KV(uint8_t fieldId, uint16_t queryIndex, uint64_t key, uint64_t distinct_key, uint64_t match_score, const int64_t *scores): - field_id(fieldId), query_index(queryIndex), array_index(arrayIndex), key(key), + field_id(fieldId), query_index(queryIndex), array_index(0), key(key), distinct_key(distinct_key), match_score(match_score) { this->scores[0] = scores[0]; this->scores[1] = scores[1]; @@ -36,9 +36,10 @@ struct Topster { KV *data; KV** kvs; + spp::sparse_hash_map kv_map; - KV* min_kv; + KV* group_min_kv; spp::sparse_hash_map group_kv_map; size_t distinct; @@ -61,13 +62,13 @@ struct Topster { kvs[i] = &data[i]; } - min_kv = new KV(); + group_min_kv = new KV(); } ~Topster() { delete[] data; delete[] kvs; - delete min_kv; + delete group_min_kv; for(auto& kv: group_kv_map) { delete kv.second; } @@ -95,14 +96,20 @@ struct Topster { LOG(INFO) << "kv key: " << kv.first << " => " << kv.second->match_score; }*/ + /*if(kv->key == 5) { + LOG(INFO) << "Key is 5"; + }*/ + bool less_than_min_heap = (size >= MAX_SIZE) && is_smaller_equal(kv, kvs[0]); - size_t heap_down_index = 0; + size_t heap_op_index = 0; if(!distinct && less_than_min_heap) { // for non-distinct, if incoming value is smaller than min-heap ignore return false; } + bool SIFT_DOWN = true; + if(distinct) { const auto& found_it = group_kv_map.find(kv->distinct_key); bool is_duplicate_key = (found_it != group_kv_map.end()); @@ -115,34 +122,34 @@ struct Topster { if(is_duplicate_key) { // if min heap (group_topster.kvs[0]) changes, we have to update kvs and sift down Topster* group_topster = found_it->second; - uint16_t old_min_heap_array_index = group_topster->min_kv->array_index; + uint16_t old_min_heap_array_index = group_min_kv->array_index; bool added = group_topster->add(kv); if(!added) { return false; } // if added, guaranteed to be larger than old_min_heap_ele - copyMe(kv, group_topster->min_kv); - heap_down_index = old_min_heap_array_index; + copyMe(kv, group_min_kv); + heap_op_index = old_min_heap_array_index; } else { - // we have to replace min heap element // create fresh topster for this distinct group key since it does not exist Topster* group_topster = new Topster(distinct, 0); group_topster->add(kv); - copyMe(kv, group_topster->min_kv); + copyMe(kv, group_min_kv); if(size < MAX_SIZE) { // we just copy to end of array - heap_down_index = size; + heap_op_index = size; size++; } else { // kv is guaranteed to be > current min heap (group_topster.kvs[0]) - heap_down_index = 0; + // so we have to replace min heap element (kvs[0]) + heap_op_index = 0; // remove current min heap group key from map - delete group_kv_map[kvs[heap_down_index]->distinct_key]; - group_kv_map.erase(kvs[heap_down_index]->distinct_key); + delete group_kv_map[kvs[heap_op_index]->distinct_key]; + group_kv_map.erase(kvs[heap_op_index]->distinct_key); } // add new group key to map @@ -155,6 +162,13 @@ struct Topster { const auto& found_it = kv_map.find(kv->key); bool is_duplicate_key = (found_it != kv_map.end()); + /* + is_duplicate_key: SIFT_DOWN 
regardless of `size`. + Else: + Do SIFT_UP if size < max_size + Else SIFT_DOWN + */ + if(is_duplicate_key) { // Need to check if kv is greater than existing duplicate kv. KV* existing_kv = found_it->second; @@ -164,53 +178,67 @@ struct Topster { return false; } + SIFT_DOWN = true; + // replace existing kv and sift down - heap_down_index = existing_kv->array_index; - kv_map.erase(kvs[heap_down_index]->key); + heap_op_index = existing_kv->array_index; + kv_map.erase(kvs[heap_op_index]->key); - // kv will be swapped into heap_down_index - kv_map.emplace(kv->key, kvs[heap_down_index]); - } + // kv will be swapped into heap_op_index + kv_map.emplace(kv->key, kvs[heap_op_index]); - else { + } else { // not duplicate + if(size < MAX_SIZE) { // we just copy to end of array - heap_down_index = size; + SIFT_DOWN = false; + heap_op_index = size; size++; } else { // kv is guaranteed to be > min heap. // we have to replace min heap element since array is full - heap_down_index = 0; - kv_map.erase(kvs[heap_down_index]->key); + SIFT_DOWN = true; + heap_op_index = 0; + kv_map.erase(kvs[heap_op_index]->key); } - // kv will be swapped into heap_down_index pointer - kv_map.emplace(kv->key, kvs[heap_down_index]); + // kv will be swapped into heap_op_index pointer + kv_map.emplace(kv->key, kvs[heap_op_index]); } } // we have to replace the existing element in the heap and sift down - copyMe(kv, kvs[heap_down_index]); + copyMe(kv, kvs[heap_op_index]); - if(size < MAX_SIZE) { - heap_down_index = 0; - } + // sift up/down to maintain heap property - // sift down to maintain heap property - while ((2*heap_down_index+1) < size) { - uint32_t next = (2 * heap_down_index + 1); // left child - if (next+1 < size && is_greater_kv(kvs[next], kvs[next+1])) { - // for min heap we compare with the minimum of children - next++; // right child (2n + 2) + if(SIFT_DOWN) { + while ((2 * heap_op_index + 1) < size) { + uint32_t next = (2 * heap_op_index + 1); // left child + if (next+1 < size && is_greater_kv(kvs[next], kvs[next+1])) { + // for min heap we compare with the minimum of children + next++; // right child (2n + 2) + } + + if (is_greater_kv(kvs[heap_op_index], kvs[next])) { + swapMe(&kvs[heap_op_index], &kvs[next]); + } else { + break; + } + + heap_op_index = next; } - - if (is_greater_kv(kvs[heap_down_index], kvs[next])) { - swapMe(&kvs[heap_down_index], &kvs[next]); - } else { - break; + } else { + // SIFT UP + while(heap_op_index > 0) { + uint32_t parent = (heap_op_index - 1) / 2; + if (is_greater_kv(kvs[parent], kvs[heap_op_index])) { + swapMe(&kvs[heap_op_index], &kvs[parent]); + heap_op_index = parent; + } else { + break; + } } - - heap_down_index = next; } return true; diff --git a/src/index.cpp b/src/index.cpp index 96df1f78..f49097f4 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -1135,7 +1135,7 @@ void Index::collate_curated_ids(const std::string & query, const std::string & f scores[1] = int64_t(1); scores[2] = int64_t(1); - KV kv(field_id, searched_queries.size(), 0, seq_id, seq_id, match_score, scores); + KV kv(field_id, searched_queries.size(), seq_id, seq_id, match_score, scores); curated_topster.add(&kv); searched_queries.push_back(override_query); @@ -1541,7 +1541,7 @@ void Index::score_results(const std::vector & sort_fields, const uint16 } } - KV kv(field_id, query_index, 0, seq_id, seq_id, match_score, scores); + KV kv(field_id, query_index, seq_id, seq_id, match_score, scores); topster.add(&kv); } diff --git a/test/topster_test.cpp b/test/topster_test.cpp index a685fcb0..3c4b4521 100644 --- 
a/test/topster_test.cpp +++ b/test/topster_test.cpp @@ -36,7 +36,7 @@ TEST(TopsterTest, MaxIntValues) { scores[1] = data[i].primary_attr; scores[2] = data[i].secondary_attr; - KV kv(data[i].field_id, data[i].query_index, 0, data[i].key, data[i].key, data[i].match_score, scores); + KV kv(data[i].field_id, data[i].query_index, data[i].key, data[i].key, data[i].match_score, scores); topster.add(&kv); } @@ -88,7 +88,7 @@ TEST(TopsterTest, MaxFloatValues) { scores[1] = Index::float_to_in64_t(data[i].primary_attr); scores[2] = data[i].secondary_attr; - KV kv(data[i].field_id, data[i].query_index, 0, data[i].key, data[i].key, data[i].match_score, scores); + KV kv(data[i].field_id, data[i].query_index, data[i].key, data[i].key, data[i].match_score, scores); topster.add(&kv); } From b97c37215a8fac8491078357a042a0b08c15458d Mon Sep 17 00:00:00 2001 From: kishorenc Date: Sat, 6 Jun 2020 15:02:50 +0530 Subject: [PATCH 03/38] Basic distinct test is passing. --- include/topster.h | 108 +++++++++++++++++++++++------------------- test/topster_test.cpp | 60 +++++++++++++++++++++++ 2 files changed, 119 insertions(+), 49 deletions(-) diff --git a/include/topster.h b/include/topster.h index e564a0e2..ab6d752b 100644 --- a/include/topster.h +++ b/include/topster.h @@ -7,12 +7,12 @@ #include struct KV { - uint8_t field_id; - uint16_t query_index; - uint16_t array_index; - uint64_t key; - uint64_t distinct_key; - uint64_t match_score; + uint8_t field_id{}; + uint16_t query_index{}; + uint16_t array_index{}; + uint64_t key{}; + uint64_t distinct_key{}; + uint64_t match_score{}; int64_t scores[3]{}; // match score + 2 custom attributes KV(uint8_t fieldId, uint16_t queryIndex, uint64_t key, uint64_t distinct_key, @@ -24,7 +24,7 @@ struct KV { this->scores[2] = scores[2]; } - KV() {} + KV() = default; }; /* @@ -37,9 +37,9 @@ struct Topster { KV *data; KV** kvs; + // For distinct, stores the min heap kv of each group_kv_map topster value spp::sparse_hash_map kv_map; - KV* group_min_kv; spp::sparse_hash_map group_kv_map; size_t distinct; @@ -61,14 +61,11 @@ struct Topster { data[i].match_score = 0; kvs[i] = &data[i]; } - - group_min_kv = new KV(); } ~Topster() { delete[] data; delete[] kvs; - delete group_min_kv; for(auto& kv: group_kv_map) { delete kv.second; } @@ -84,22 +81,12 @@ struct Topster { (*b)->array_index = a_index; } - static inline void copyMe(KV* a, KV* b) { - size_t b_index = b->array_index; - *b = *a; - b->array_index = b_index; - } - bool add(KV* kv) { //LOG(INFO) << "kv_map size: " << kv_map.size() << " -- kvs[0]: " << kvs[0]->match_score; /*for(auto kv: kv_map) { LOG(INFO) << "kv key: " << kv.first << " => " << kv.second->match_score; }*/ - /*if(kv->key == 5) { - LOG(INFO) << "Key is 5"; - }*/ - bool less_than_min_heap = (size >= MAX_SIZE) && is_smaller_equal(kv, kvs[0]); size_t heap_op_index = 0; @@ -115,45 +102,61 @@ struct Topster { bool is_duplicate_key = (found_it != group_kv_map.end()); if(!is_duplicate_key && less_than_min_heap) { - // for distinct, if a non duplicate kv is < than min heap we also ignore + // for distinct, if a non duplicate kv is < than min heap we ignore return false; } if(is_duplicate_key) { - // if min heap (group_topster.kvs[0]) changes, we have to update kvs and sift down + // if min heap (group_topster.kvs[0]) changes, we have to update kvs and sift Topster* group_topster = found_it->second; - uint16_t old_min_heap_array_index = group_min_kv->array_index; + KV old_min_heap_kv = *kv_map[kv->distinct_key]; bool added = group_topster->add(kv); + if(!added) { return false; 
} - // if added, guaranteed to be larger than old_min_heap_ele - copyMe(kv, group_min_kv); - heap_op_index = old_min_heap_array_index; - } else { - // create fresh topster for this distinct group key since it does not exist + // if new kv score is greater than previous min heap score we sift dowm, otherwise sift up + SIFT_DOWN = is_greater_kv(kv, &old_min_heap_kv); + // new kv is different from old_min_heap_kv so we have to sift heap + heap_op_index = old_min_heap_kv.array_index; + + // erase current min heap key from kv_map + kv_map.erase(old_min_heap_kv.distinct_key); + + // kv will be copied into the pointer at heap_op_index + kv_map.emplace(kv->distinct_key, kvs[heap_op_index]); + } else { + // kv is guaranteed to be > current min heap: kvs[0] + // create fresh topster for this distinct group key since it does not exist Topster* group_topster = new Topster(distinct, 0); group_topster->add(kv); - copyMe(kv, group_min_kv); - - if(size < MAX_SIZE) { - // we just copy to end of array - heap_op_index = size; - size++; - } else { - // kv is guaranteed to be > current min heap (group_topster.kvs[0]) - // so we have to replace min heap element (kvs[0]) - heap_op_index = 0; - - // remove current min heap group key from map - delete group_kv_map[kvs[heap_op_index]->distinct_key]; - group_kv_map.erase(kvs[heap_op_index]->distinct_key); - } // add new group key to map group_kv_map.emplace(kv->distinct_key, group_topster); + + // find heap operation index for updating kvs + + if(size < MAX_SIZE) { + // there is enough space in heap we just copy to end + SIFT_DOWN = false; + heap_op_index = size; + size++; + } else { + SIFT_DOWN = true; + + // max size is reached so we are forced to replace current min heap element (kvs[0]) + heap_op_index = 0; + + // remove current min heap group key from maps + delete group_kv_map[kvs[heap_op_index]->distinct_key]; + group_kv_map.erase(kvs[heap_op_index]->distinct_key); + kv_map.erase(kvs[heap_op_index]->distinct_key); + } + + // kv will be copied into the pointer at heap_op_index + kv_map.emplace(kv->distinct_key, kvs[heap_op_index]); } } else { // not distinct @@ -184,11 +187,10 @@ struct Topster { heap_op_index = existing_kv->array_index; kv_map.erase(kvs[heap_op_index]->key); - // kv will be swapped into heap_op_index + // kv will be copied into the pointer at heap_op_index kv_map.emplace(kv->key, kvs[heap_op_index]); - } else { // not duplicate - + if(size < MAX_SIZE) { // we just copy to end of array SIFT_DOWN = false; @@ -202,13 +204,14 @@ struct Topster { kv_map.erase(kvs[heap_op_index]->key); } - // kv will be swapped into heap_op_index pointer + // kv will be copied into the pointer at heap_op_index kv_map.emplace(kv->key, kvs[heap_op_index]); } } // we have to replace the existing element in the heap and sift down - copyMe(kv, kvs[heap_op_index]); + kv->array_index = heap_op_index; + *kvs[heap_op_index] = *kv; // sift up/down to maintain heap property @@ -262,6 +265,9 @@ struct Topster { // topster must be sorted before iterated upon to remove dead array entries void sort() { std::stable_sort(kvs, kvs+size, is_greater_kv); + for(auto &group_topster: group_kv_map) { + group_topster.second->sort(); + } } void clear(){ @@ -272,6 +278,10 @@ struct Topster { return kvs[index]->key; } + uint64_t getDistinctKeyAt(uint32_t index) { + return kvs[index]->distinct_key; + } + KV* getKV(uint32_t index) { return kvs[index]; } diff --git a/test/topster_test.cpp b/test/topster_test.cpp index 3c4b4521..104e941e 100644 --- a/test/topster_test.cpp +++ 
b/test/topster_test.cpp @@ -99,4 +99,64 @@ TEST(TopsterTest, MaxFloatValues) { for(uint32_t i = 0; i < topster.size; i++) { EXPECT_EQ(ids[i], topster.getKeyAt(i)); } +} + +TEST(TopsterTest, DistinctIntValues) { + Topster dist_topster(5, 2); + + struct { + uint8_t field_id; + uint16_t query_index; + uint64_t distinct_key; + uint64_t match_score; + int64_t primary_attr; + int64_t secondary_attr; + } data[14] = { + {1, 0, 1, 11, 20, 30}, + {1, 0, 1, 12, 20, 32}, + {1, 0, 2, 4, 20, 30}, + {1, 2, 3, 7, 20, 30}, + {1, 0, 4, 14, 20, 30}, + {1, 1, 5, 9, 20, 30}, + {1, 1, 5, 10, 20, 32}, + {1, 1, 5, 9, 20, 30}, + {1, 0, 6, 6, 20, 30}, + {1, 2, 7, 6, 22, 30}, + {1, 2, 7, 6, 22, 30}, + {1, 1, 8, 9, 20, 30}, + {1, 0, 9, 8, 20, 30}, + {1, 3, 10, 5, 20, 30}, + }; + + for(int i = 0; i < 14; i++) { + int64_t scores[3]; + scores[0] = int64_t(data[i].match_score); + scores[1] = data[i].primary_attr; + scores[2] = data[i].secondary_attr; + + KV kv(data[i].field_id, data[i].query_index, i+100, data[i].distinct_key, data[i].match_score, scores); + dist_topster.add(&kv); + } + + dist_topster.sort(); + + std::vector distinct_ids = {4, 1, 5, 8, 9}; + + for(uint32_t i = 0; i < dist_topster.size; i++) { + EXPECT_EQ(distinct_ids[i], dist_topster.getDistinctKeyAt(i)); + + if(distinct_ids[i] == 1) { + EXPECT_EQ(12, (int) dist_topster.getKV(i)->match_score); + EXPECT_EQ(2, dist_topster.group_kv_map[dist_topster.getDistinctKeyAt(i)]->size); + EXPECT_EQ(12, dist_topster.group_kv_map[dist_topster.getDistinctKeyAt(i)]->getKV(0)->match_score); + EXPECT_EQ(11, dist_topster.group_kv_map[dist_topster.getDistinctKeyAt(i)]->getKV(1)->match_score); + } + + if(distinct_ids[i] == 5) { + EXPECT_EQ(10, (int) dist_topster.getKV(i)->match_score); + EXPECT_EQ(2, dist_topster.group_kv_map[dist_topster.getDistinctKeyAt(i)]->size); + EXPECT_EQ(10, dist_topster.group_kv_map[dist_topster.getDistinctKeyAt(i)]->getKV(0)->match_score); + EXPECT_EQ(9, dist_topster.group_kv_map[dist_topster.getDistinctKeyAt(i)]->getKV(1)->match_score); + } + } } \ No newline at end of file From c452fa0db17cb94fe0d4e898d4a38f5437046f96 Mon Sep 17 00:00:00 2001 From: kishorenc Date: Sat, 6 Jun 2020 20:50:32 +0530 Subject: [PATCH 04/38] Make iteration of curated IDs respect max hits. --- src/index.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/index.cpp b/src/index.cpp index f49097f4..9ce1b396 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -1110,7 +1110,11 @@ void Index::collate_curated_ids(const std::string & query, const std::string & f leaf_to_indices.emplace(token_leaf, indices); } - for(size_t j=0; j>> array_token_positions; From 8ffb9c5154e478f84055fabc9539079d277d7149 Mon Sep 17 00:00:00 2001 From: kishorenc Date: Sun, 7 Jun 2020 17:36:27 +0530 Subject: [PATCH 05/38] Reuse KV pointers instead of copying as objects. 
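
Results are now collected as KV pointers instead of KV copies: raw_result_kvs
and override_result_kvs point straight into the Topster arrays owned by each
index's search_args, which is why the params can only be freed once the
response JSON has been assembled. A sketch of the resulting ownership chain
(names as in this patch):

    // search_args owns topster / curated_topster (deleted in ~search_args()).
    // Each Topster owns its KV data[] block, and kvs[] points into data[].
    // raw_result_kvs / override_result_kvs hold KV* into those blocks, so the
    // pointers stay valid only until the params are released, which
    // collection.cpp now does after the result object has been built:
    for (Index* index : indices) {
        delete index->search_params;   // frees the topsters and every KV they own
    }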
--- include/collection.h | 2 +- include/index.h | 35 +++++++++++++++-------- include/topster.h | 5 ---- src/collection.cpp | 67 ++++++++++++++++++++++++-------------------- src/index.cpp | 62 ++++++++++++++++++++-------------------- 5 files changed, 91 insertions(+), 80 deletions(-) diff --git a/include/collection.h b/include/collection.h index c1ebc55e..4807d091 100644 --- a/include/collection.h +++ b/include/collection.h @@ -147,7 +147,7 @@ private: std::string get_seq_id_key(uint32_t seq_id); void highlight_result(const field &search_field, const std::vector> &searched_queries, - const KV &field_order_kv, const nlohmann::json &document, + const KV* field_order_kv, const nlohmann::json &document, StringUtils & string_utils, size_t snippet_threshold, bool highlighted_fully, highlight_t &highlight); diff --git a/include/index.h b/include/index.h index 9f2d2edd..994aa6e5 100644 --- a/include/index.h +++ b/include/index.h @@ -32,17 +32,18 @@ struct search_args { facet_query_t facet_query; int num_typos; size_t max_facet_values; - size_t max_hits; size_t per_page; size_t page; token_ordering token_order; bool prefix; size_t drop_tokens_threshold; size_t typo_tokens_threshold; - std::vector raw_result_kvs; size_t all_result_ids_len; std::vector> searched_queries; - std::vector override_result_kvs; + Topster* topster; + Topster* curated_topster; + std::vector raw_result_kvs; + std::vector override_result_kvs; Option outcome; search_args(): outcome(0) { @@ -56,12 +57,20 @@ struct search_args { size_t drop_tokens_threshold, size_t typo_tokens_threshold): query(query), search_fields(search_fields), filters(filters), facets(facets), included_ids(included_ids), excluded_ids(excluded_ids), sort_fields_std(sort_fields_std), facet_query(facet_query), num_typos(num_typos), - max_facet_values(max_facet_values), max_hits(max_hits), per_page(per_page), + max_facet_values(max_facet_values), per_page(per_page), page(page), token_order(token_order), prefix(prefix), drop_tokens_threshold(drop_tokens_threshold), typo_tokens_threshold(typo_tokens_threshold), all_result_ids_len(0), outcome(0) { + const size_t topster_size = std::max((size_t)1, max_hits); // needs to be atleast 1 since scoring is mandatory + topster = new Topster(topster_size); + curated_topster = new Topster(topster_size); } + + ~search_args() { + delete topster; + delete curated_topster; + }; }; struct index_record { @@ -155,7 +164,7 @@ private: const std::string & field, uint32_t *filter_ids, size_t filter_ids_length, std::vector & facets, const std::vector & sort_fields, const int num_typos, std::vector> & searched_queries, - Topster & topster, uint32_t** all_result_ids, + Topster* topster, uint32_t** all_result_ids, size_t & all_result_ids_len, const token_ordering token_order = FREQUENCY, const bool prefix = false, const size_t drop_tokens_threshold = Index::DROP_TOKENS_THRESHOLD, @@ -164,7 +173,7 @@ private: void search_candidates(const uint8_t & field_id, uint32_t* filter_ids, size_t filter_ids_length, const std::vector & sort_fields, std::vector & token_to_candidates, const token_ordering token_order, std::vector> & searched_queries, - Topster & topster, uint32_t** all_result_ids, + Topster* topster, uint32_t** all_result_ids, size_t & all_result_ids_len, const size_t typo_tokens_threshold); @@ -198,7 +207,7 @@ private: void collate_curated_ids(const std::string & query, const std::string & field, const uint8_t field_id, const std::vector & included_ids, - Topster & curated_topster, std::vector> & searched_queries); + Topster* 
curated_topster, std::vector> & searched_queries); uint64_t facet_token_hash(const field & a_field, const std::string &token); @@ -220,10 +229,12 @@ public: facet_query_t & facet_query, const std::vector & included_ids, const std::vector & excluded_ids, const std::vector & sort_fields_std, const int num_typos, - const size_t max_hits, const size_t per_page, const size_t page, const token_ordering token_order, - const bool prefix, const size_t drop_tokens_threshold, std::vector & raw_result_kvs, + Topster* topster, Topster* curated_topster, + const size_t per_page, const size_t page, const token_ordering token_order, + const bool prefix, const size_t drop_tokens_threshold, size_t & all_result_ids_len, std::vector> & searched_queries, - std::vector & override_result_kvs, const size_t typo_tokens_threshold); + std::vector & raw_result_kvs, std::vector & override_result_kvs, + const size_t typo_tokens_threshold); Option remove(const uint32_t seq_id, nlohmann::json & document); @@ -235,7 +246,7 @@ public: std::vector>> &array_token_positions); void score_results(const std::vector & sort_fields, const uint16_t & query_index, const uint8_t & field_id, - const uint32_t total_cost, Topster &topster, const std::vector & query_suggestion, + const uint32_t total_cost, Topster* topster, const std::vector & query_suggestion, const uint32_t *result_ids, const size_t result_size) const; static int32_t get_points_from_doc(const nlohmann::json &document, const std::string & default_sorting_field); @@ -278,7 +289,7 @@ public: bool processed; // prevents spurious wake up of the main thread bool terminate; // used for interrupting the thread during tear down - search_args search_params; + search_args* search_params; static void populate_array_token_positions(std::vector>> & array_token_positions, const art_leaf *token_leaf, uint32_t doc_index); diff --git a/include/topster.h b/include/topster.h index ab6d752b..bbc7f590 100644 --- a/include/topster.h +++ b/include/topster.h @@ -257,11 +257,6 @@ struct Topster { std::tie(j->scores[0], j->scores[1], j->scores[2]); } - static bool is_greater_kv_value(const struct KV & i, const struct KV & j) { - return std::tie(i.scores[0], i.scores[1], i.scores[2], i.key) > - std::tie(j.scores[0], j.scores[1], j.scores[2], j.key); - } - // topster must be sorted before iterated upon to remove dead array entries void sort() { std::stable_sort(kvs, kvs+size, is_greater_kv); diff --git a/src/collection.cpp b/src/collection.cpp index 2e7b30f2..eee23762 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -608,15 +608,15 @@ Option Collection::search(const std::string & query, const std:: const size_t max_hits = std::min((page * per_page), get_num_documents()); std::vector> searched_queries; // search queries used for generating the results - std::vector raw_result_kvs; - std::vector override_result_kvs; + std::vector raw_result_kvs; + std::vector override_result_kvs; size_t total_found = 0; // send data to individual index threads size_t index_id = 0; for(Index* index: indices) { - index->search_params = search_args(query, search_fields, filters, facets, + index->search_params = new search_args(query, search_fields, filters, facets, index_to_included_ids[index_id], index_to_excluded_ids[index_id], sort_fields_std, facet_query, num_typos, max_facet_values, max_hits, per_page, page, token_order, prefix, @@ -639,9 +639,9 @@ Option Collection::search(const std::string & query, const std:: index->cv.wait(lk, [index]{return index->processed;}); } - 
if(!index->search_params.outcome.ok()) { - index_search_op = Option(index->search_params.outcome.code(), - index->search_params.outcome.error()); + if(!index->search_params->outcome.ok()) { + index_search_op = Option(index->search_params->outcome.code(), + index->search_params->outcome.error()); } if(!index_search_op.ok()) { @@ -649,21 +649,21 @@ Option Collection::search(const std::string & query, const std:: continue; } - for(auto & field_order_kv: index->search_params.raw_result_kvs) { - field_order_kv.query_index += searched_queries.size(); + for(auto & field_order_kv: index->search_params->raw_result_kvs) { + field_order_kv->query_index += searched_queries.size(); raw_result_kvs.push_back(field_order_kv); } - for(auto & field_order_kv: index->search_params.override_result_kvs) { - field_order_kv.query_index += searched_queries.size(); + for(auto & field_order_kv: index->search_params->override_result_kvs) { + field_order_kv->query_index += searched_queries.size(); override_result_kvs.push_back(field_order_kv); } - searched_queries.insert(searched_queries.end(), index->search_params.searched_queries.begin(), - index->search_params.searched_queries.end()); + searched_queries.insert(searched_queries.end(), index->search_params->searched_queries.begin(), + index->search_params->searched_queries.end()); - for(size_t fi = 0; fi < index->search_params.facets.size(); fi++) { - auto & this_facet = index->search_params.facets[fi]; + for(size_t fi = 0; fi < index->search_params->facets.size(); fi++) { + auto & this_facet = index->search_params->facets[fi]; auto & acc_facet = facets[fi]; for(auto & facet_kv: this_facet.result_map) { @@ -690,7 +690,7 @@ Option Collection::search(const std::string & query, const std:: } } - total_found += index->search_params.all_result_ids_len; + total_found += index->search_params->all_result_ids_len; } if(!index_search_op.ok()) { @@ -698,13 +698,13 @@ Option Collection::search(const std::string & query, const std:: } // All fields are sorted descending - std::sort(raw_result_kvs.begin(), raw_result_kvs.end(), Topster::is_greater_kv_value); + std::sort(raw_result_kvs.begin(), raw_result_kvs.end(), Topster::is_greater_kv); - // Sort based on position in overriden list + // Sort based on position in overridden list std::sort( override_result_kvs.begin(), override_result_kvs.end(), - [&id_pos_map](const KV & a, const KV & b) -> bool { - return id_pos_map[a.key] < id_pos_map[b.key]; + [&id_pos_map](const KV* a, const KV* b) -> bool { + return id_pos_map[a->key] < id_pos_map[b->key]; } ); @@ -713,15 +713,15 @@ Option Collection::search(const std::string & query, const std:: result["hits"] = nlohmann::json::array(); result["found"] = total_found; - std::vector result_kvs; + std::vector result_kvs; size_t override_kv_index = 0; size_t raw_results_index = 0; // merge raw results and override results while(override_kv_index < override_result_kvs.size() && raw_results_index < raw_result_kvs.size()) { if(override_kv_index < override_result_kvs.size() && - id_pos_map.count(override_result_kvs[override_kv_index].key) != 0 && - result_kvs.size() + 1 == id_pos_map[override_result_kvs[override_kv_index].key]) { + id_pos_map.count(override_result_kvs[override_kv_index]->key) != 0 && + result_kvs.size() + 1 == id_pos_map[override_result_kvs[override_kv_index]->key]) { result_kvs.push_back(override_result_kvs[override_kv_index]); override_kv_index++; } else { @@ -746,7 +746,7 @@ Option Collection::search(const std::string & query, const std:: // construct results array for(long 
result_kvs_index = start_result_index; result_kvs_index <= end_result_index; result_kvs_index++) { const auto & field_order_kv = result_kvs[result_kvs_index]; - const std::string& seq_id_key = get_seq_id_key((uint32_t) field_order_kv.key); + const std::string& seq_id_key = get_seq_id_key((uint32_t) field_order_kv->key); nlohmann::json document; const Option & document_op = get_document_from_store(seq_id_key, document); @@ -818,8 +818,8 @@ Option Collection::search(const std::string & query, const std:: prune_document(document, include_fields, exclude_fields); wrapper_doc["document"] = document; - wrapper_doc["text_match"] = field_order_kv.match_score; - //wrapper_doc["seq_id"] = (uint32_t) field_order_kv.key; + wrapper_doc["text_match"] = field_order_kv->match_score; + //wrapper_doc["seq_id"] = (uint32_t) field_order_kv->key; result["hits"].push_back(wrapper_doc); } @@ -918,6 +918,11 @@ Option Collection::search(const std::string & query, const std:: result["facet_counts"].push_back(facet_result); } + // free search params + for(Index* index: indices) { + delete index->search_params; + } + result["request_params"] = nlohmann::json::object();; result["request_params"]["per_page"] = per_page; result["request_params"]["q"] = query; @@ -966,7 +971,7 @@ void Collection::facet_value_to_string(const facet &a_facet, const facet_count_t void Collection::highlight_result(const field &search_field, const std::vector> &searched_queries, - const KV & field_order_kv, const nlohmann::json & document, + const KV* field_order_kv, const nlohmann::json & document, StringUtils & string_utils, size_t snippet_threshold, bool highlighted_fully, highlight_t & highlight) { @@ -974,15 +979,15 @@ void Collection::highlight_result(const field &search_field, spp::sparse_hash_map leaf_to_indices; std::vector query_suggestion; - for (const art_leaf *token_leaf : searched_queries[field_order_kv.query_index]) { + for (const art_leaf *token_leaf : searched_queries[field_order_kv->query_index]) { // Must search for the token string fresh on that field for the given document since `token_leaf` // is from the best matched field and need not be present in other fields of a document. 
- Index* index = indices[field_order_kv.key % num_indices]; + Index* index = indices[field_order_kv->key % num_indices]; art_leaf *actual_leaf = index->get_token_leaf(search_field.name, &token_leaf->key[0], token_leaf->key_len); if(actual_leaf != nullptr) { query_suggestion.push_back(actual_leaf); std::vector positions; - uint32_t doc_index = actual_leaf->values->ids.indexOf(field_order_kv.key); + uint32_t doc_index = actual_leaf->values->ids.indexOf(field_order_kv->key); auto doc_indices = new uint32_t[1]; doc_indices[0] = doc_index; leaf_to_indices.emplace(actual_leaf, doc_indices); @@ -1008,8 +1013,8 @@ void Collection::highlight_result(const field &search_field, continue; } - const Match & this_match = Match::match(field_order_kv.key, token_positions); - uint64_t this_match_score = this_match.get_match_score(1, field_order_kv.field_id); + const Match & this_match = Match::match(field_order_kv->key, token_positions); + uint64_t this_match_score = this_match.get_match_score(1, field_order_kv->field_id); match_indices.emplace_back(this_match, this_match_score, array_index); } diff --git a/src/index.cpp b/src/index.cpp index 9ce1b396..1ff8f5bb 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -829,7 +829,7 @@ void Index::drop_facets(std::vector & facets, const std::vector void Index::search_candidates(const uint8_t & field_id, uint32_t* filter_ids, size_t filter_ids_length, const std::vector & sort_fields, std::vector & token_candidates_vec, const token_ordering token_order, - std::vector> & searched_queries, Topster & topster, + std::vector> & searched_queries, Topster* topster, uint32_t** all_result_ids, size_t & all_result_ids_len, const size_t typo_tokens_threshold) { const long long combination_limit = 10; @@ -1055,13 +1055,15 @@ void Index::run_search() { } // after the wait, we own the lock. 
- search(search_params.outcome, search_params.query, search_params.search_fields, - search_params.filters, search_params.facets, search_params.facet_query, search_params.included_ids, - search_params.excluded_ids, search_params.sort_fields_std, search_params.num_typos, - search_params.max_hits, search_params.per_page, search_params.page, search_params.token_order, - search_params.prefix, search_params.drop_tokens_threshold, search_params.raw_result_kvs, - search_params.all_result_ids_len, search_params.searched_queries, search_params.override_result_kvs, - search_params.typo_tokens_threshold); + search(search_params->outcome, search_params->query, search_params->search_fields, + search_params->filters, search_params->facets, search_params->facet_query, search_params->included_ids, + search_params->excluded_ids, search_params->sort_fields_std, search_params->num_typos, + search_params->topster, search_params->curated_topster, + search_params->per_page, search_params->page, search_params->token_order, + search_params->prefix, search_params->drop_tokens_threshold, + search_params->all_result_ids_len, search_params->searched_queries, + search_params->raw_result_kvs, search_params->override_result_kvs, + search_params->typo_tokens_threshold); // hand control back to main thread processed = true; @@ -1075,7 +1077,7 @@ void Index::run_search() { void Index::collate_curated_ids(const std::string & query, const std::string & field, const uint8_t field_id, const std::vector & included_ids, - Topster & curated_topster, + Topster* curated_topster, std::vector> & searched_queries) { if(included_ids.size() == 0) { @@ -1110,9 +1112,9 @@ void Index::collate_curated_ids(const std::string & query, const std::string & f leaf_to_indices.emplace(token_leaf, indices); } - // curated_topster.MAX_SIZE is initialized based on max_hits. + // curated_topster->MAX_SIZE is initialized based on max_hits. // Even if override has more IDs, we should restrict to max hits. 
- size_t iter_size = std::min((size_t)curated_topster.MAX_SIZE, included_ids.size()); + size_t iter_size = std::min((size_t)curated_topster->MAX_SIZE, included_ids.size()); for(size_t j=0; jadd(&kv); searched_queries.push_back(override_query); } @@ -1153,13 +1155,15 @@ void Index::search(Option & outcome, std::vector & facets, facet_query_t & facet_query, const std::vector & included_ids, const std::vector & excluded_ids, - const std::vector & sort_fields_std, const int num_typos, const size_t max_hits, + const std::vector & sort_fields_std, const int num_typos, + Topster* topster, + Topster* curated_topster, const size_t per_page, const size_t page, const token_ordering token_order, const bool prefix, const size_t drop_tokens_threshold, - std::vector & raw_result_kvs, size_t & all_result_ids_len, std::vector> & searched_queries, - std::vector & override_result_kvs, + std::vector & raw_result_kvs, + std::vector & override_result_kvs, const size_t typo_tokens_threshold) { const size_t num_results = (page * per_page); @@ -1179,10 +1183,6 @@ void Index::search(Option & outcome, //auto begin = std::chrono::high_resolution_clock::now(); uint32_t* all_result_ids = nullptr; - const size_t topster_size = std::max((size_t)1, max_hits); // needs to be atleast 1 since scoring is mandatory - Topster topster(topster_size); - Topster curated_topster(topster_size); - if(query == "*") { const uint8_t field_id = (uint8_t)(FIELD_LIMIT_NUM - 0); const std::string & field = search_fields[0]; @@ -1212,8 +1212,8 @@ void Index::search(Option & outcome, do_facets(facets, facet_query, &included_ids[0], included_ids.size()); // must be sorted before iterated upon to remove "empty" array entries - topster.sort(); - curated_topster.sort(); + topster->sort(); + curated_topster->sort(); std::set ids_to_remove(included_ids.begin(), included_ids.end()); ids_to_remove.insert(excluded_ids.begin(), excluded_ids.end()); @@ -1221,26 +1221,26 @@ void Index::search(Option & outcome, std::vector dropped_ids; // loop through topster and remove elements from included and excluded id lists - for(uint32_t t = 0; t < topster.size && t < num_results; t++) { - KV* kv = topster.getKV(t); + for(uint32_t t = 0; t < topster->size && t < num_results; t++) { + KV* kv = topster->getKV(t); if(ids_to_remove.count(kv->key) != 0) { dropped_ids.push_back((uint32_t)kv->key); } else { - raw_result_kvs.push_back(*kv); + raw_result_kvs.push_back(kv); } } - for(uint32_t t = 0; t < curated_topster.size && t < num_results; t++) { - KV* kv = curated_topster.getKV(t); - override_result_kvs.push_back(*kv); + for(uint32_t t = 0; t < curated_topster->size && t < num_results; t++) { + KV* kv = curated_topster->getKV(t); + override_result_kvs.push_back(kv); } // for the ids that are dropped, remove their corresponding facet components from facet results drop_facets(facets, dropped_ids); all_result_ids_len -= dropped_ids.size(); - all_result_ids_len += curated_topster.size; + all_result_ids_len += curated_topster->size; delete [] filter_ids; delete [] all_result_ids; @@ -1264,7 +1264,7 @@ void Index::search_field(const uint8_t & field_id, const std::string & query, co uint32_t *filter_ids, size_t filter_ids_length, std::vector & facets, const std::vector & sort_fields, const int num_typos, std::vector> & searched_queries, - Topster & topster, uint32_t** all_result_ids, size_t & all_result_ids_len, + Topster* topster, uint32_t** all_result_ids, size_t & all_result_ids_len, const token_ordering token_order, const bool prefix, const size_t drop_tokens_threshold, 
const size_t typo_tokens_threshold) { std::vector tokens; @@ -1438,7 +1438,7 @@ void Index::log_leaves(const int cost, const std::string &token, const std::vect } void Index::score_results(const std::vector & sort_fields, const uint16_t & query_index, - const uint8_t & field_id, const uint32_t total_cost, Topster & topster, + const uint8_t & field_id, const uint32_t total_cost, Topster* topster, const std::vector &query_suggestion, const uint32_t *result_ids, const size_t result_size) const { @@ -1546,7 +1546,7 @@ void Index::score_results(const std::vector & sort_fields, const uint16 } KV kv(field_id, query_index, seq_id, seq_id, match_score, scores); - topster.add(&kv); + topster->add(&kv); } //long long int timeNanos = std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - begin).count(); From a6a2000ddce6dbf5d74a9986f0cb8f0b0c9bd3ed Mon Sep 17 00:00:00 2001 From: kishorenc Date: Wed, 10 Jun 2020 07:00:32 +0530 Subject: [PATCH 06/38] Use proc meminfo for available memory on Linux. --- include/system_metrics.h | 17 +++++++++++++++++ src/system_metrics.cpp | 8 ++++---- 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/include/system_metrics.h b/include/system_metrics.h index fda58fdd..6ea58dee 100644 --- a/include/system_metrics.h +++ b/include/system_metrics.h @@ -118,6 +118,23 @@ private: } } + static unsigned long linux_get_mem_available_bytes() { + std::string token; + std::ifstream file("/proc/meminfo"); + while(file >> token) { + if(token == "MemAvailable:") { + unsigned long mem_kb; + if(file >> mem_kb) { + return mem_kb * 1000; + } else { + return 0; + } + } + } + + return 0; // nothing found + } + public: void get(const std::string & data_dir_path, nlohmann::json& result); diff --git a/src/system_metrics.cpp b/src/system_metrics.cpp index a0c6d01e..ac5412b7 100644 --- a/src/system_metrics.cpp +++ b/src/system_metrics.cpp @@ -29,7 +29,7 @@ void SystemMetrics::get(const std::string &data_dir_path, nlohmann::json &result getrusage(RUSAGE_SELF, &r_usage); result["memory_used_process_bytes"] = r_usage.ru_maxrss * 1000; - uint64_t memory_free_bytes = 0; + uint64_t memory_available_bytes = 0; uint64_t memory_total_bytes = 0; #ifdef __APPLE__ @@ -42,7 +42,7 @@ void SystemMetrics::get(const std::string &data_dir_path, nlohmann::json &result if (KERN_SUCCESS == host_page_size(mach_port, &mach_page_size) && KERN_SUCCESS == host_statistics64(mach_port, HOST_VM_INFO, (host_info64_t)&vm_stats, &count)) { - memory_free_bytes = (int64_t)(vm_stats.free_count) * (int64_t)mach_page_size; + memory_available_bytes = (int64_t)(vm_stats.free_count) * (int64_t)mach_page_size; } uint64_t pages = sysconf(_SC_PHYS_PAGES); @@ -51,11 +51,11 @@ void SystemMetrics::get(const std::string &data_dir_path, nlohmann::json &result #elif __linux__ struct sysinfo sys_info; sysinfo(&sys_info); - memory_free_bytes = sys_info.freeram; + memory_available_bytes = linux_get_mem_available_bytes(); memory_total_bytes = sys_info.totalram; #endif - result["memory_free_bytes"] = memory_free_bytes; + result["memory_available_bytes"] = memory_available_bytes; result["memory_total_bytes"] = memory_total_bytes; // CPU METRICS From b7dcb7367bc69945bc36ac0b28bf9756fd8a651a Mon Sep 17 00:00:00 2001 From: kishorenc Date: Wed, 10 Jun 2020 18:26:17 +0530 Subject: [PATCH 07/38] Expose group by params. 
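
The group parameters flow from the HTTP layer (core_api.cpp) through
search_args into a Topster constructed with distinct = group_limit, and a group
key is a single 64-bit value, so grouping on multiple fields needs the per-field
hashes folded into one. The hash_combine() helper added to index.h is the
building block for that; a hypothetical sketch of how a document's distinct_key
could be derived with it (the group_field_hashes container and the surrounding
wiring are illustrative only, not part of this patch):

    // Fold the facet hash of every group-by field value into one 64-bit key.
    uint64_t distinct_key = 1;
    for (uint64_t field_value_hash : group_field_hashes) {  // hypothetical input
        distinct_key = hash_combine(distinct_key, field_value_hash);
    }

    // The key rides along on the KV, placing the document into its group:
    KV kv(field_id, query_index, seq_id, distinct_key, match_score, scores);
    topster->add(&kv);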
--- TODO.md | 1 + include/collection.h | 4 +- include/index.h | 17 +++- include/topster.h | 17 ++-- src/collection.cpp | 223 +++++++++++++++++++++++++++---------------- src/core_api.cpp | 27 +++++- src/index.cpp | 66 ++++++++++--- 7 files changed, 250 insertions(+), 105 deletions(-) diff --git a/TODO.md b/TODO.md index 69298247..96f2a7e3 100644 --- a/TODO.md +++ b/TODO.md @@ -97,6 +97,7 @@ - ~~Have a LOG(ERROR) level~~ - ~~Handle SIGTERM which is sent when process is killed~~ - ~~Use snappy compression for storage~~ +- Test for overriding result on second page - atleast 1 token match for proceeding with drop tokens - support wildcard query with filters - API for optimizing on disk storage diff --git a/include/collection.h b/include/collection.h index 4807d091..48aa4938 100644 --- a/include/collection.h +++ b/include/collection.h @@ -235,7 +235,9 @@ public: const std::string & highlight_full_fields = "", size_t typo_tokens_threshold = Index::TYPO_TOKENS_THRESHOLD, const std::map& pinned_hits={}, - const std::vector& hidden_hits={}); + const std::vector& hidden_hits={}, + const std::vector& group_by_fields={}, + const size_t group_limit = 0); Option get(const std::string & id); diff --git a/include/index.h b/include/index.h index 994aa6e5..b142c9a3 100644 --- a/include/index.h +++ b/include/index.h @@ -38,11 +38,13 @@ struct search_args { bool prefix; size_t drop_tokens_threshold; size_t typo_tokens_threshold; + std::vector group_by_fields; + size_t group_limit; size_t all_result_ids_len; std::vector> searched_queries; Topster* topster; Topster* curated_topster; - std::vector raw_result_kvs; + std::vector> raw_result_kvs; std::vector override_result_kvs; Option outcome; @@ -54,16 +56,18 @@ struct search_args { std::vector facets, std::vector included_ids, std::vector excluded_ids, std::vector sort_fields_std, facet_query_t facet_query, int num_typos, size_t max_facet_values, size_t max_hits, size_t per_page, size_t page, token_ordering token_order, bool prefix, - size_t drop_tokens_threshold, size_t typo_tokens_threshold): + size_t drop_tokens_threshold, size_t typo_tokens_threshold, + const std::vector& group_by_fields, size_t group_limit): query(query), search_fields(search_fields), filters(filters), facets(facets), included_ids(included_ids), excluded_ids(excluded_ids), sort_fields_std(sort_fields_std), facet_query(facet_query), num_typos(num_typos), max_facet_values(max_facet_values), per_page(per_page), page(page), token_order(token_order), prefix(prefix), drop_tokens_threshold(drop_tokens_threshold), typo_tokens_threshold(typo_tokens_threshold), + group_by_fields(group_by_fields), group_limit(group_limit), all_result_ids_len(0), outcome(0) { const size_t topster_size = std::max((size_t)1, max_hits); // needs to be atleast 1 since scoring is mandatory - topster = new Topster(topster_size); + topster = new Topster(topster_size, group_limit); curated_topster = new Topster(topster_size); } @@ -213,6 +217,11 @@ private: void compute_facet_stats(facet &a_facet, int64_t raw_value, const std::string & field_type); + // reference: https://stackoverflow.com/a/27952689/131050 + uint64_t hash_combine(uint64_t lhs, uint64_t rhs) const { + lhs ^= rhs + 0x517cc1b727220a95 + (lhs << 6) + (lhs >> 2); + return lhs; + } public: Index() = delete; @@ -233,7 +242,7 @@ public: const size_t per_page, const size_t page, const token_ordering token_order, const bool prefix, const size_t drop_tokens_threshold, size_t & all_result_ids_len, std::vector> & searched_queries, - std::vector & raw_result_kvs, 
std::vector & override_result_kvs, + std::vector> & raw_result_kvs, std::vector & override_result_kvs, const size_t typo_tokens_threshold); Option remove(const uint32_t seq_id, nlohmann::json & document); diff --git a/include/topster.h b/include/topster.h index bbc7f590..685e6d62 100644 --- a/include/topster.h +++ b/include/topster.h @@ -117,7 +117,7 @@ struct Topster { } // if new kv score is greater than previous min heap score we sift dowm, otherwise sift up - SIFT_DOWN = is_greater_kv(kv, &old_min_heap_kv); + SIFT_DOWN = is_greater(kv, &old_min_heap_kv); // new kv is different from old_min_heap_kv so we have to sift heap heap_op_index = old_min_heap_kv.array_index; @@ -218,12 +218,12 @@ struct Topster { if(SIFT_DOWN) { while ((2 * heap_op_index + 1) < size) { uint32_t next = (2 * heap_op_index + 1); // left child - if (next+1 < size && is_greater_kv(kvs[next], kvs[next+1])) { + if (next+1 < size && is_greater(kvs[next], kvs[next + 1])) { // for min heap we compare with the minimum of children next++; // right child (2n + 2) } - if (is_greater_kv(kvs[heap_op_index], kvs[next])) { + if (is_greater(kvs[heap_op_index], kvs[next])) { swapMe(&kvs[heap_op_index], &kvs[next]); } else { break; @@ -235,7 +235,7 @@ struct Topster { // SIFT UP while(heap_op_index > 0) { uint32_t parent = (heap_op_index - 1) / 2; - if (is_greater_kv(kvs[parent], kvs[heap_op_index])) { + if (is_greater(kvs[parent], kvs[heap_op_index])) { swapMe(&kvs[heap_op_index], &kvs[parent]); heap_op_index = parent; } else { @@ -247,7 +247,7 @@ struct Topster { return true; } - static bool is_greater_kv(const struct KV* i, const struct KV* j) { + static bool is_greater(const struct KV* i, const struct KV* j) { return std::tie(i->scores[0], i->scores[1], i->scores[2], i->key) > std::tie(j->scores[0], j->scores[1], j->scores[2], j->key); } @@ -257,9 +257,14 @@ struct Topster { std::tie(j->scores[0], j->scores[1], j->scores[2]); } + static bool is_greater_kv_group(const std::vector& i, const std::vector& j) { + return std::tie(i[0]->scores[0], i[0]->scores[1], i[0]->scores[2], i[0]->key) > + std::tie(j[0]->scores[0], j[0]->scores[1], j[0]->scores[2], j[0]->key); + } + // topster must be sorted before iterated upon to remove dead array entries void sort() { - std::stable_sort(kvs, kvs+size, is_greater_kv); + std::stable_sort(kvs, kvs + size, is_greater); for(auto &group_topster: group_kv_map) { group_topster.second->sort(); } diff --git a/src/collection.cpp b/src/collection.cpp index eee23762..96a1afbc 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -349,7 +349,9 @@ Option Collection::search(const std::string & query, const std:: const std::string & highlight_full_fields, size_t typo_tokens_threshold, const std::map& pinned_hits, - const std::vector& hidden_hits) { + const std::vector& hidden_hits, + const std::vector& group_by_fields, + const size_t group_limit) { std::vector included_ids; std::vector excluded_ids; @@ -385,6 +387,22 @@ Option Collection::search(const std::string & query, const std:: } } + // validate group by fields + for(const std::string & field_name: group_by_fields) { + if(search_schema.count(field_name) == 0) { + std::string error = "Could not find a field named `" + field_name + "` in the schema."; + return Option(404, error); + } + + field search_field = search_schema.at(field_name); + + // if field is a string field then it must be a facet field as well + if(search_field.is_string() && !search_field.is_facet()) { + std::string error = "Field `" + field_name + "` should be a facet field."; + 
return Option(400, error); + } + } + // validate filter fields std::vector filter_blocks; StringUtils::split(simple_filter_query, filter_blocks, "&&"); @@ -492,7 +510,7 @@ Option Collection::search(const std::string & query, const std:: } // for a wildcard query, if filter is not specified, use default_sorting_field as a catch-all - if(query == "*" && filters.size() == 0) { + if(query == "*" && filters.empty()) { field f = search_schema.at(default_sorting_field); std::string max_value = f.is_float() ? std::to_string(std::numeric_limits::max()) : std::to_string(std::numeric_limits::max()); @@ -608,7 +626,7 @@ Option Collection::search(const std::string & query, const std:: const size_t max_hits = std::min((page * per_page), get_num_documents()); std::vector> searched_queries; // search queries used for generating the results - std::vector raw_result_kvs; + std::vector> raw_result_kvs; std::vector override_result_kvs; size_t total_found = 0; @@ -617,10 +635,11 @@ Option Collection::search(const std::string & query, const std:: size_t index_id = 0; for(Index* index: indices) { index->search_params = new search_args(query, search_fields, filters, facets, - index_to_included_ids[index_id], index_to_excluded_ids[index_id], - sort_fields_std, facet_query, num_typos, max_facet_values, max_hits, - per_page, page, token_order, prefix, - drop_tokens_threshold, typo_tokens_threshold); + index_to_included_ids[index_id], index_to_excluded_ids[index_id], + sort_fields_std, facet_query, num_typos, max_facet_values, max_hits, + per_page, page, token_order, prefix, + drop_tokens_threshold, typo_tokens_threshold, + group_by_fields, group_limit); { std::lock_guard lk(index->m); index->ready = true; @@ -649,9 +668,9 @@ Option Collection::search(const std::string & query, const std:: continue; } - for(auto & field_order_kv: index->search_params->raw_result_kvs) { - field_order_kv->query_index += searched_queries.size(); - raw_result_kvs.push_back(field_order_kv); + for(const std::vector & kv_group: index->search_params->raw_result_kvs) { + kv_group[0]->query_index += searched_queries.size(); + raw_result_kvs.push_back(kv_group); } for(auto & field_order_kv: index->search_params->override_result_kvs) { @@ -697,8 +716,35 @@ Option Collection::search(const std::string & query, const std:: return index_search_op; } + Topster* aggr_topster = nullptr; + + if(group_limit > 0) { + // group by query requires another round of topster-ing + + // needs to be atleast 1 since scoring is mandatory + const size_t topster_size = std::max((size_t)1, max_hits); + aggr_topster = new Topster(topster_size, group_limit); + + for(const auto& kv_group: raw_result_kvs) { + for(KV* kv: kv_group) { + aggr_topster->add(kv); + } + } + + aggr_topster->sort(); + + raw_result_kvs.clear(); + raw_result_kvs.shrink_to_fit(); + + for(auto &group_topster_entry: aggr_topster->group_kv_map) { + Topster* group_topster = group_topster_entry.second; + const std::vector group_kvs(group_topster->kvs, group_topster->kvs+group_topster->size); + raw_result_kvs.emplace_back(group_kvs); + } + } + // All fields are sorted descending - std::sort(raw_result_kvs.begin(), raw_result_kvs.end(), Topster::is_greater_kv); + std::sort(raw_result_kvs.begin(), raw_result_kvs.end(), Topster::is_greater_kv_group); // Sort based on position in overridden list std::sort( @@ -708,12 +754,7 @@ Option Collection::search(const std::string & query, const std:: } ); - nlohmann::json result = nlohmann::json::object(); - - result["hits"] = nlohmann::json::array(); - result["found"] = 
total_found; - - std::vector result_kvs; + std::vector> result_group_kvs; size_t override_kv_index = 0; size_t raw_results_index = 0; @@ -721,107 +762,123 @@ Option Collection::search(const std::string & query, const std:: while(override_kv_index < override_result_kvs.size() && raw_results_index < raw_result_kvs.size()) { if(override_kv_index < override_result_kvs.size() && id_pos_map.count(override_result_kvs[override_kv_index]->key) != 0 && - result_kvs.size() + 1 == id_pos_map[override_result_kvs[override_kv_index]->key]) { - result_kvs.push_back(override_result_kvs[override_kv_index]); + result_group_kvs.size() + 1 == id_pos_map[override_result_kvs[override_kv_index]->key]) { + result_group_kvs.push_back({override_result_kvs[override_kv_index]}); override_kv_index++; } else { - result_kvs.push_back(raw_result_kvs[raw_results_index]); + result_group_kvs.push_back(raw_result_kvs[raw_results_index]); raw_results_index++; } } while(override_kv_index < override_result_kvs.size()) { - result_kvs.push_back(override_result_kvs[override_kv_index]); + result_group_kvs.push_back({override_result_kvs[override_kv_index]}); override_kv_index++; } while(raw_results_index < raw_result_kvs.size()) { - result_kvs.push_back(raw_result_kvs[raw_results_index]); + result_group_kvs.push_back(raw_result_kvs[raw_results_index]); raw_results_index++; } const long start_result_index = (page - 1) * per_page; - const long end_result_index = std::min(max_hits, result_kvs.size()) - 1; // could be -1 when max_hits is 0 + const long end_result_index = std::min(max_hits, result_group_kvs.size()) - 1; // could be -1 when max_hits is 0 + + nlohmann::json result = nlohmann::json::object(); + + result["found"] = total_found; + + std::string hits_key = (group_limit > 1) ? "grouped_hits" : "hits"; + result[hits_key] = nlohmann::json::array(); // construct results array for(long result_kvs_index = start_result_index; result_kvs_index <= end_result_index; result_kvs_index++) { - const auto & field_order_kv = result_kvs[result_kvs_index]; - const std::string& seq_id_key = get_seq_id_key((uint32_t) field_order_kv->key); + const std::vector & kv_group = result_group_kvs[result_kvs_index]; - nlohmann::json document; - const Option & document_op = get_document_from_store(seq_id_key, document); + nlohmann::json group_hits_array = nlohmann::json::array(); + nlohmann::json& hits_array = (group_limit > 1) ? group_hits_array : result["hits"]; - if(!document_op.ok()) { - LOG(ERROR) << "Document fetch error. " << document_op.error(); - continue; - } + for(const KV* field_order_kv: kv_group) { + const std::string& seq_id_key = get_seq_id_key((uint32_t) field_order_kv->key); - nlohmann::json wrapper_doc; - wrapper_doc["highlights"] = nlohmann::json::array(); - std::vector highlights; - StringUtils string_utils; + nlohmann::json document; + const Option & document_op = get_document_from_store(seq_id_key, document); - // find out if fields have to be highlighted fully - std::vector fields_highlighted_fully_vec; - spp::sparse_hash_set fields_highlighted_fully; - StringUtils::split(highlight_full_fields, fields_highlighted_fully_vec, ","); - - for(std::string & highlight_full_field: fields_highlighted_fully_vec) { - StringUtils::trim(highlight_full_field); - fields_highlighted_fully.emplace(highlight_full_field); - } - - for(const std::string & field_name: search_fields) { - // should not pick excluded field for highlighting - if(exclude_fields.count(field_name) > 0) { + if(!document_op.ok()) { + LOG(ERROR) << "Document fetch error. 
" << document_op.error(); continue; } - field search_field = search_schema.at(field_name); - if(query != "*" && (search_field.type == field_types::STRING || - search_field.type == field_types::STRING_ARRAY)) { + nlohmann::json wrapper_doc; + wrapper_doc["highlights"] = nlohmann::json::array(); + std::vector highlights; + StringUtils string_utils; - bool highlighted_fully = (fields_highlighted_fully.find(field_name) != fields_highlighted_fully.end()); - highlight_t highlight; - highlight_result(search_field, searched_queries, field_order_kv, document, - string_utils, snippet_threshold, highlighted_fully, highlight); + // find out if fields have to be highlighted fully + std::vector fields_highlighted_fully_vec; + spp::sparse_hash_set fields_highlighted_fully; + StringUtils::split(highlight_full_fields, fields_highlighted_fully_vec, ","); - if(!highlight.snippets.empty()) { - highlights.push_back(highlight); - } + for(std::string & highlight_full_field: fields_highlighted_fully_vec) { + StringUtils::trim(highlight_full_field); + fields_highlighted_fully.emplace(highlight_full_field); } - } - std::sort(highlights.begin(), highlights.end()); - - for(const auto & highlight: highlights) { - nlohmann::json h_json = nlohmann::json::object(); - h_json["field"] = highlight.field; - bool highlight_fully = (fields_highlighted_fully.find(highlight.field) != fields_highlighted_fully.end()); - - if(!highlight.indices.empty()) { - h_json["indices"] = highlight.indices; - h_json["snippets"] = highlight.snippets; - if(highlight_fully) { - h_json["values"] = highlight.values; + for(const std::string & field_name: search_fields) { + // should not pick excluded field for highlighting + if(exclude_fields.count(field_name) > 0) { + continue; } - } else { - h_json["snippet"] = highlight.snippets[0]; - if(highlight_fully) { - h_json["value"] = highlight.values[0]; + field search_field = search_schema.at(field_name); + if(query != "*" && (search_field.type == field_types::STRING || + search_field.type == field_types::STRING_ARRAY)) { + + bool highlighted_fully = (fields_highlighted_fully.find(field_name) != fields_highlighted_fully.end()); + highlight_t highlight; + highlight_result(search_field, searched_queries, field_order_kv, document, + string_utils, snippet_threshold, highlighted_fully, highlight); + + if(!highlight.snippets.empty()) { + highlights.push_back(highlight); + } } } - wrapper_doc["highlights"].push_back(h_json); + std::sort(highlights.begin(), highlights.end()); + + for(const auto & highlight: highlights) { + nlohmann::json h_json = nlohmann::json::object(); + h_json["field"] = highlight.field; + bool highlight_fully = (fields_highlighted_fully.find(highlight.field) != fields_highlighted_fully.end()); + + if(!highlight.indices.empty()) { + h_json["indices"] = highlight.indices; + h_json["snippets"] = highlight.snippets; + if(highlight_fully) { + h_json["values"] = highlight.values; + } + + } else { + h_json["snippet"] = highlight.snippets[0]; + if(highlight_fully) { + h_json["value"] = highlight.values[0]; + } + } + + wrapper_doc["highlights"].push_back(h_json); + } + + prune_document(document, include_fields, exclude_fields); + wrapper_doc["document"] = document; + wrapper_doc["text_match"] = field_order_kv->match_score; + //wrapper_doc["seq_id"] = (uint32_t) field_order_kv->key; + hits_array.push_back(wrapper_doc); } - prune_document(document, include_fields, exclude_fields); - wrapper_doc["document"] = document; - wrapper_doc["text_match"] = field_order_kv->match_score; - 
//wrapper_doc["seq_id"] = (uint32_t) field_order_kv->key; - - result["hits"].push_back(wrapper_doc); + if(group_limit > 1) { + result["grouped_hits"].push_back(group_hits_array); + } } result["facet_counts"] = nlohmann::json::array(); @@ -923,6 +980,8 @@ Option Collection::search(const std::string & query, const std:: delete index->search_params; } + delete aggr_topster; + result["request_params"] = nlohmann::json::object();; result["request_params"]["per_page"] = per_page; result["request_params"]["q"] = query; diff --git a/src/core_api.cpp b/src/core_api.cpp index 56efeec7..c987443e 100644 --- a/src/core_api.cpp +++ b/src/core_api.cpp @@ -235,6 +235,9 @@ bool get_search(http_req & req, http_res & res) { const char *FACET_QUERY = "facet_query"; const char *MAX_FACET_VALUES = "max_facet_values"; + const char *GROUP_BY = "group_by"; + const char *GROUP_LIMIT = "group_limit"; + const char *PER_PAGE = "per_page"; const char *PAGE = "page"; const char *CALLBACK = "callback"; @@ -314,6 +317,18 @@ bool get_search(http_req & req, http_res & res) { req.params[EXCLUDE_FIELDS] = ""; } + if(req.params.count(GROUP_BY) == 0) { + req.params[GROUP_BY] = ""; + } + + if(req.params.count(GROUP_LIMIT) == 0) { + if(req.params[GROUP_BY] != "") { + req.params[GROUP_LIMIT] = "3"; + } else { + req.params[GROUP_LIMIT] = "0"; + } + } + if(!StringUtils::is_uint64_t(req.params[DROP_TOKENS_THRESHOLD])) { res.set_400("Parameter `" + std::string(DROP_TOKENS_THRESHOLD) + "` must be an unsigned integer."); return false; @@ -349,6 +364,11 @@ bool get_search(http_req & req, http_res & res) { return false; } + if(!StringUtils::is_uint64_t(req.params[GROUP_LIMIT])) { + res.set_400("Parameter `" + std::string(GROUP_LIMIT) + "` must be an unsigned integer."); + return false; + } + std::string filter_str = req.params.count(FILTER) != 0 ? 
req.params[FILTER] : ""; std::vector search_fields; @@ -366,6 +386,9 @@ bool get_search(http_req & req, http_res & res) { spp::sparse_hash_set include_fields(include_fields_vec.begin(), include_fields_vec.end()); spp::sparse_hash_set exclude_fields(exclude_fields_vec.begin(), exclude_fields_vec.end()); + std::vector group_by_fields; + StringUtils::split(req.params[GROUP_BY], group_by_fields, ","); + std::vector sort_fields; if(req.params.count(SORT_BY) != 0) { std::vector sort_field_strs; @@ -455,7 +478,9 @@ bool get_search(http_req & req, http_res & res) { req.params[HIGHLIGHT_FULL_FIELDS], typo_tokens_threshold, pinned_hits, - hidden_hits + hidden_hits, + group_by_fields, + static_cast(std::stoi(req.params[GROUP_LIMIT])) ); uint64_t timeMillis = std::chrono::duration_cast( diff --git a/src/index.cpp b/src/index.cpp index 1ff8f5bb..a93509d7 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -1162,12 +1162,10 @@ void Index::search(Option & outcome, const bool prefix, const size_t drop_tokens_threshold, size_t & all_result_ids_len, std::vector> & searched_queries, - std::vector & raw_result_kvs, + std::vector> & raw_result_kvs, std::vector & override_result_kvs, const size_t typo_tokens_threshold) { - const size_t num_results = (page * per_page); - // process the filters uint32_t* filter_ids = nullptr; @@ -1221,17 +1219,33 @@ void Index::search(Option & outcome, std::vector dropped_ids; // loop through topster and remove elements from included and excluded id lists - for(uint32_t t = 0; t < topster->size && t < num_results; t++) { - KV* kv = topster->getKV(t); - if(ids_to_remove.count(kv->key) != 0) { - dropped_ids.push_back((uint32_t)kv->key); - } else { - raw_result_kvs.push_back(kv); + if(topster->distinct) { + for(auto &group_topster_entry: topster->group_kv_map) { + Topster* group_topster = group_topster_entry.second; + for(uint32_t t = 0; t < group_topster->size; t++) { + KV* kv = group_topster->getKV(t); + if(ids_to_remove.count(kv->key) != 0) { + dropped_ids.push_back((uint32_t)kv->key); + } + } + + const std::vector group_kvs(group_topster->kvs, group_topster->kvs+group_topster->size); + raw_result_kvs.emplace_back(group_kvs); + } + } else { + for(uint32_t t = 0; t < topster->size; t++) { + KV* kv = topster->getKV(t); + + if(ids_to_remove.count(kv->key) != 0) { + dropped_ids.push_back((uint32_t)kv->key); + } else { + raw_result_kvs.push_back({kv}); + } } } - for(uint32_t t = 0; t < curated_topster->size && t < num_results; t++) { + for(uint32_t t = 0; t < curated_topster->size; t++) { KV* kv = curated_topster->getKV(t); override_result_kvs.push_back(kv); } @@ -1471,6 +1485,16 @@ void Index::score_results(const std::vector & sort_fields, const uint16 Match single_token_match = Match(1, 0, 0, empty_offset_diffs); const uint64_t single_token_match_score = single_token_match.get_match_score(total_cost, field_id); + std::unordered_map facet_to_id; + + if(search_params->group_limit > 0) { + size_t i_facet = 0; + for(const auto & facet: facet_schema) { + facet_to_id[facet.first] = i_facet; + i_facet++; + } + } + for(size_t i=0; i & sort_fields, const uint16 } } - KV kv(field_id, query_index, seq_id, seq_id, match_score, scores); + uint64_t distinct_id = seq_id; + + if(search_params->group_limit != 0) { + distinct_id = 1; // some constant initial value + + // calculate hash from group_by_fields + for(const auto& field: search_params->group_by_fields) { + if(facet_to_id.count(field) == 0 || facet_index_v2.count(seq_id) == 0) { + continue; + } + + size_t facet_id = facet_to_id[field]; + const 
std::vector& fhashes = facet_index_v2.at(seq_id)[facet_id]; + + for(const auto& hash: fhashes) { + distinct_id = hash_combine(distinct_id, hash); + } + } + } + + KV kv(field_id, query_index, seq_id, distinct_id, match_score, scores); topster->add(&kv); } From 07456a5c6a0a7b86a584a2264f3aa9a75069cf91 Mon Sep 17 00:00:00 2001 From: kishorenc Date: Wed, 10 Jun 2020 21:59:56 +0530 Subject: [PATCH 08/38] Handle deletion of records with optional fields. --- src/index.cpp | 4 ++++ test/collection_test.cpp | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/src/index.cpp b/src/index.cpp index a93509d7..74a96af4 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -1756,6 +1756,10 @@ void Index::remove_and_shift_offset_index(sorted_array &offset_index, const uint Option Index::remove(const uint32_t seq_id, nlohmann::json & document) { for(auto & name_field: search_schema) { + if(name_field.second.optional && document.count(name_field.first) == 0) { + continue; + } + // Go through all the field names and find the keys+values so that they can be removed from in-memory index std::vector tokens; if(name_field.second.type == field_types::STRING) { diff --git a/test/collection_test.cpp b/test/collection_test.cpp index 6066da0a..cac9d715 100644 --- a/test/collection_test.cpp +++ b/test/collection_test.cpp @@ -2251,6 +2251,10 @@ TEST_F(CollectionTest, OptionalFields) { auto res_op = coll1->search("*", {"title"}, "", {}, sort_fields, 0, 10, 1, FREQUENCY, false); ASSERT_FALSE(res_op.ok()); ASSERT_STREQ("Cannot sort by `average` as it is defined as an optional field.", res_op.error().c_str()); + + // try deleting a record having optional field + Option remove_op = coll1->remove("1"); + ASSERT_TRUE(remove_op.ok()); // default sorting field should not be declared optional fields = { From 10c22c174aa0782f21b1cd68f4cd62ae7b4ef715 Mon Sep 17 00:00:00 2001 From: kishorenc Date: Sun, 14 Jun 2020 08:14:03 +0530 Subject: [PATCH 09/38] Consider grouping when generating facet counts. 
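
When results are grouped, a facet value must be counted once per distinct group rather than once per matching document. To support this, facet_count_t gains a `groups` container and do_facets() records the group id of each hit instead of incrementing the count; a document's group id is obtained by folding the facet hashes of its group_by fields through hash_combine(). Below is a minimal stand-alone sketch of the idea — the struct and function names are simplified stand-ins, not the actual index code:

    #include <cstdint>
    #include <unordered_set>
    #include <vector>

    // boost-style combiner, same formula as Index::hash_combine
    // (https://stackoverflow.com/a/27952689/131050)
    uint64_t hash_combine(uint64_t lhs, uint64_t rhs) {
        lhs ^= rhs + 0x517cc1b727220a95 + (lhs << 6) + (lhs >> 2);
        return lhs;
    }

    // a document's group id: fold the facet hashes of its group_by field
    // values, starting from the same constant seed (1) as score_results()
    uint64_t distinct_id(const std::vector<uint64_t>& group_field_hashes) {
        uint64_t id = 1;
        for (uint64_t h : group_field_hashes) {
            id = hash_combine(id, h);
        }
        return id;
    }

    // simplified stand-in for facet_count_t
    struct facet_value_count {
        std::unordered_set<uint64_t> groups; // group ids seen for this value
        uint32_t count = 0;                  // plain per-document count
    };

    void record_hit(facet_value_count& fc, uint64_t doc_group_id, bool grouping) {
        if (grouping) {
            fc.groups.insert(doc_group_id); // reported later as groups.size()
        } else {
            fc.count += 1;
        }
    }

Turning the group sets into final counts (and aggregating them across index shards) is completed in a later commit of this series.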
--- TODO.md | 2 + include/field.h | 1 + include/index.h | 9 +- src/array_utils.cpp | 2 +- src/collection.cpp | 18 ++- src/index.cpp | 192 +++++++++++++----------------- test/collection_override_test.cpp | 4 +- 7 files changed, 108 insertions(+), 120 deletions(-) diff --git a/TODO.md b/TODO.md index 96f2a7e3..b031a788 100644 --- a/TODO.md +++ b/TODO.md @@ -98,6 +98,8 @@ - ~~Handle SIGTERM which is sent when process is killed~~ - ~~Use snappy compression for storage~~ - Test for overriding result on second page +- Fix exclude_scalar early returns +- Fix result ids length during grouped overrides - atleast 1 token match for proceeding with drop tokens - support wildcard query with filters - API for optimizing on disk storage diff --git a/include/field.h b/include/field.h index 95c1fcbe..6255aed5 100644 --- a/include/field.h +++ b/include/field.h @@ -146,6 +146,7 @@ struct token_pos_cost_t { struct facet_count_t { uint32_t count; + spp::sparse_hash_map groups; // used for faceting grouped results // used to fetch the actual document and value for representation uint32_t doc_id; diff --git a/include/index.h b/include/index.h index b142c9a3..4baa1eed 100644 --- a/include/index.h +++ b/include/index.h @@ -13,6 +13,7 @@ #include #include #include +#include #include "string_utils.h" struct token_candidates { @@ -162,10 +163,9 @@ private: void do_facets(std::vector & facets, facet_query_t & facet_query, const uint32_t* result_ids, size_t results_size); - void drop_facets(std::vector & facets, const std::vector & ids); - void search_field(const uint8_t & field_id, const std::string & query, const std::string & field, uint32_t *filter_ids, size_t filter_ids_length, + const std::vector& curated_ids, std::vector & facets, const std::vector & sort_fields, const int num_typos, std::vector> & searched_queries, Topster* topster, uint32_t** all_result_ids, @@ -175,8 +175,9 @@ private: const size_t typo_tokens_threshold = Index::TYPO_TOKENS_THRESHOLD); void search_candidates(const uint8_t & field_id, uint32_t* filter_ids, size_t filter_ids_length, + const std::vector& curated_ids, const std::vector & sort_fields, std::vector & token_to_candidates, - const token_ordering token_order, std::vector> & searched_queries, + std::vector> & searched_queries, Topster* topster, uint32_t** all_result_ids, size_t & all_result_ids_len, const size_t typo_tokens_threshold); @@ -306,5 +307,7 @@ public: int get_bounded_typo_cost(const size_t max_cost, const size_t token_len) const; static int64_t float_to_in64_t(float n); + + uint64_t get_distinct_id(const std::unordered_map &facet_to_id, const uint32_t seq_id) const; }; diff --git a/src/array_utils.cpp b/src/array_utils.cpp index 9d93fa46..62fe2d87 100644 --- a/src/array_utils.cpp +++ b/src/array_utils.cpp @@ -114,7 +114,7 @@ size_t ArrayUtils::exclude_scalar(const uint32_t *A, const size_t lenA, return 0; } - if(B == nullptr) { + if(lenB == 0 || B == nullptr) { *out = new uint32_t[lenA]; memcpy(*out, A, lenA * sizeof(uint32_t)); return lenA; diff --git a/src/collection.cpp b/src/collection.cpp index af446907..6e50fbac 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -419,9 +419,9 @@ Option Collection::search(const std::string & query, const std:: field search_field = search_schema.at(field_name); - // if field is a string field then it must be a facet field as well - if(search_field.is_string() && !search_field.is_facet()) { - std::string error = "Field `" + field_name + "` should be a facet field."; + // must be a facet field + if(!search_field.is_facet()) { + 
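+                // grouping derives a document's distinct_id from the facet hash index,
+                // so a group_by field must be faceted even when it is not a string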
std::string error = "Group by field `" + field_name + "` should be a facet field."; return Option(400, error); } } @@ -711,6 +711,18 @@ Option Collection::search(const std::string & query, const std:: for(auto & facet_kv: this_facet.result_map) { size_t count = 0; + + // for grouping we have to aggregate group counts to a count value + /*if(search_params->group_limit) { + // for every facet + for(auto& a_facet: facets) { + // for every facet value + for(auto& fvalue: a_facet.result_map) { + fvalue.second.count = fvalue.second.groups.size(); + } + } + }*/ + if(acc_facet.result_map.count(facet_kv.first) == 0) { // not found, so set it count = facet_kv.second.count; diff --git a/src/index.cpp b/src/index.cpp index 2fe6e5e6..5c0f8546 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -633,7 +633,7 @@ void Index::compute_facet_stats(facet &a_facet, int64_t raw_value, const std::st void Index::do_facets(std::vector & facets, facet_query_t & facet_query, const uint32_t* result_ids, size_t results_size) { - std::map facet_to_index; + std::unordered_map facet_to_index; size_t i_facet = 0; for(const auto & facet: facet_schema) { @@ -661,6 +661,7 @@ void Index::do_facets(std::vector & facets, facet_query_t & facet_query, std::vector query_tokens; StringUtils::split(facet_query.query, query_tokens, " "); + // for non-string fields, `faceted_name` returns their aliased stringified field name art_tree *t = search_index.at(facet_field.faceted_name()); for(size_t qtoken_index = 0; qtoken_index < query_tokens.size(); qtoken_index++) { @@ -703,7 +704,7 @@ void Index::do_facets(std::vector & facets, facet_query_t & facet_query, const std::vector & fhashes = facet_index_v2[doc_seq_id][facet_id]; int array_pos = 0; - int fvalue_found = 0; + bool fvalue_found = false; std::stringstream fvaluestream; // for hashing the entire facet value (multiple tokens) spp::sparse_hash_map query_token_positions; size_t field_token_index = -1; @@ -717,9 +718,9 @@ void Index::do_facets(std::vector & facets, facet_query_t & facet_query, // ftoken_hash is the raw value for numeric fields compute_facet_stats(a_facet, ftoken_hash, facet_field.type); + // not using facet query or this particular facet value is found in facet filter if(!use_facet_query || fhash_qtoken_pos.find(ftoken_hash) != fhash_qtoken_pos.end()) { - // not using facet query or this particular facet value is found in facet filter - fvalue_found |= 1; // bitwise to ensure only one count for a multi-token facet value + fvalue_found = true; if(use_facet_query) { // map token index to query index (used for highlighting later on) @@ -736,41 +737,38 @@ void Index::do_facets(std::vector & facets, facet_query_t & facet_query, } } - //std::cout << "j: " << j << std::endl; - // 0 indicates separator, while the second condition checks for non-array string if(fhashes[j] == FACET_ARRAY_DELIMETER || (fhashes.back() != FACET_ARRAY_DELIMETER && j == fhashes.size() - 1)) { - if(!use_facet_query || fvalue_found != 0) { + if(!use_facet_query || fvalue_found) { const std::string & fvalue_str = fvaluestream.str(); - uint64_t fhash = 0; - if(facet_field.is_string()) { - fhash = facet_token_hash(facet_field, fvalue_str); - } else { - fhash = std::atoi(fvalue_str.c_str()); - } + uint64_t fhash = facet_token_hash(facet_field, fvalue_str); if(a_facet.result_map.count(fhash) == 0) { - a_facet.result_map[fhash] = facet_count_t{0, doc_seq_id, 0, + a_facet.result_map[fhash] = facet_count_t{0, spp::sparse_hash_map(), + doc_seq_id, 0, spp::sparse_hash_map()}; } - a_facet.result_map[fhash].count 
+= 1; a_facet.result_map[fhash].doc_id = doc_seq_id; a_facet.result_map[fhash].array_pos = array_pos; + if(search_params->group_limit) { + uint64_t distinct_id = get_distinct_id(facet_to_index, doc_seq_id); + if(a_facet.result_map[fhash].groups.count(distinct_id) == 0) { + a_facet.result_map[fhash].groups.emplace(distinct_id, 0); + } + a_facet.result_map[fhash].groups[distinct_id] += 1; + } else { + a_facet.result_map[fhash].count += 1; + } + if(use_facet_query) { a_facet.result_map[fhash].query_token_pos = query_token_positions; - - /*if(j == 11) { - for(auto xx: query_token_positions) { - std::cout << xx.first << " -> " << xx.second.pos << " , " << xx.second.cost << std::endl; - } - }*/ } } array_pos++; - fvalue_found = 0; + fvalue_found = false; std::stringstream().swap(fvaluestream); spp::sparse_hash_map().swap(query_token_positions); field_token_index = -1; @@ -781,55 +779,10 @@ void Index::do_facets(std::vector & facets, facet_query_t & facet_query, } } -void Index::drop_facets(std::vector & facets, const std::vector & ids) { - std::map facet_to_index; - - size_t i_facet = 0; - for(const auto & facet: facet_schema) { - facet_to_index[facet.first] = i_facet; - i_facet++; - } - - for(auto & a_facet: facets) { - const field & facet_field = facet_schema.at(a_facet.field_name); - - // assumed that facet fields have already been validated upstream - for(const uint32_t doc_seq_id: ids) { - if(facet_index_v2.count(doc_seq_id) != 0) { - // FORMAT OF VALUES - // String: h1 h2 h3 - // String array: h1 h2 h3 0 h1 0 h1 h2 0 - const std::vector & fhashes = facet_index_v2[doc_seq_id][facet_to_index[a_facet.field_name]]; - std::stringstream fvaluestream; // for hashing the entire facet value (multiple tokens) - - for(size_t j = 0; j < fhashes.size(); j++) { - if(fhashes[j] != FACET_ARRAY_DELIMETER) { - int64_t ftoken_hash = fhashes[j]; - fvaluestream << ftoken_hash; - } - - if(fhashes[j] == FACET_ARRAY_DELIMETER || (fhashes.back() != FACET_ARRAY_DELIMETER && - j == fhashes.size() - 1)) { - const std::string & fvalue_str = fvaluestream.str(); - uint64_t fhash = facet_token_hash(facet_field, fvalue_str); - - if(a_facet.result_map.count(fhash) != 0) { - a_facet.result_map[fhash].count -= 1; - if(a_facet.result_map[fhash].count == 0) { - a_facet.result_map.erase(fhash); - } - } - std::stringstream().swap(fvaluestream); - } - } - } - } - } -} - void Index::search_candidates(const uint8_t & field_id, uint32_t* filter_ids, size_t filter_ids_length, + const std::vector& curated_ids, const std::vector & sort_fields, - std::vector & token_candidates_vec, const token_ordering token_order, + std::vector & token_candidates_vec, std::vector> & searched_queries, Topster* topster, uint32_t** all_result_ids, size_t & all_result_ids_len, const size_t typo_tokens_threshold) { @@ -874,6 +827,15 @@ void Index::search_candidates(const uint8_t & field_id, uint32_t* filter_ids, si continue; } + if(!curated_ids.empty()) { + uint32_t *excluded_result_ids = nullptr; + result_size = ArrayUtils::exclude_scalar(result_ids, result_size, &curated_ids[0], + curated_ids.size(), &excluded_result_ids); + + delete [] result_ids; + result_ids = excluded_result_ids; + } + if(filter_ids != nullptr) { // intersect once again with filter ids uint32_t* filtered_result_ids = nullptr; @@ -1176,7 +1138,14 @@ void Index::search(Option & outcome, return ; } - const uint32_t filter_ids_length = op_filter_ids_length.get(); + uint32_t filter_ids_length = op_filter_ids_length.get(); + + // we will be removing all curated IDs from organic results 
before running topster + std::set curated_ids(included_ids.begin(), included_ids.end()); + curated_ids.insert(excluded_ids.begin(), excluded_ids.end()); + + std::vector curated_ids_sorted(curated_ids.begin(), curated_ids.end()); + std::sort(curated_ids_sorted.begin(), curated_ids_sorted.end()); // Order of `fields` are used to sort results //auto begin = std::chrono::high_resolution_clock::now(); @@ -1186,11 +1155,22 @@ void Index::search(Option & outcome, const uint8_t field_id = (uint8_t)(FIELD_LIMIT_NUM - 0); const std::string & field = search_fields[0]; + if(!curated_ids.empty()) { + uint32_t *excluded_result_ids = nullptr; + filter_ids_length = ArrayUtils::exclude_scalar(filter_ids, filter_ids_length, &curated_ids_sorted[0], + curated_ids.size(), &excluded_result_ids); + + delete [] filter_ids; + filter_ids = excluded_result_ids; + } + score_results(sort_fields_std, (uint16_t) searched_queries.size(), field_id, 0, topster, {}, filter_ids, filter_ids_length); collate_curated_ids(query, field, field_id, included_ids, curated_topster, searched_queries); - do_facets(facets, facet_query, filter_ids, filter_ids_length); + all_result_ids_len = filter_ids_length; + all_result_ids = filter_ids; + filter_ids = nullptr; } else { const size_t num_search_fields = std::min(search_fields.size(), (size_t) FIELD_LIMIT_NUM); for(size_t i = 0; i < num_search_fields; i++) { @@ -1199,50 +1179,33 @@ void Index::search(Option & outcome, const uint8_t field_id = (uint8_t)(FIELD_LIMIT_NUM - i); // Order of `fields` are used to sort results const std::string & field = search_fields[i]; - search_field(field_id, query, field, filter_ids, filter_ids_length, facets, sort_fields_std, + search_field(field_id, query, field, filter_ids, filter_ids_length, curated_ids_sorted, facets, sort_fields_std, num_typos, searched_queries, topster, &all_result_ids, all_result_ids_len, token_order, prefix, drop_tokens_threshold, typo_tokens_threshold); collate_curated_ids(query, field, field_id, included_ids, curated_topster, searched_queries); } } - do_facets(facets, facet_query, all_result_ids, all_result_ids_len); } + do_facets(facets, facet_query, all_result_ids, all_result_ids_len); do_facets(facets, facet_query, &included_ids[0], included_ids.size()); // must be sorted before iterated upon to remove "empty" array entries topster->sort(); curated_topster->sort(); - std::set ids_to_remove(included_ids.begin(), included_ids.end()); - ids_to_remove.insert(excluded_ids.begin(), excluded_ids.end()); - - std::vector dropped_ids; - // loop through topster and remove elements from included and excluded id lists if(topster->distinct) { for(auto &group_topster_entry: topster->group_kv_map) { Topster* group_topster = group_topster_entry.second; - for(uint32_t t = 0; t < group_topster->size; t++) { - KV* kv = group_topster->getKV(t); - if(ids_to_remove.count(kv->key) != 0) { - dropped_ids.push_back((uint32_t)kv->key); - } - } - const std::vector group_kvs(group_topster->kvs, group_topster->kvs+group_topster->size); raw_result_kvs.emplace_back(group_kvs); } } else { for(uint32_t t = 0; t < topster->size; t++) { KV* kv = topster->getKV(t); - - if(ids_to_remove.count(kv->key) != 0) { - dropped_ids.push_back((uint32_t)kv->key); - } else { - raw_result_kvs.push_back({kv}); - } + raw_result_kvs.push_back({kv}); } } @@ -1252,9 +1215,6 @@ void Index::search(Option & outcome, } // for the ids that are dropped, remove their corresponding facet components from facet results - drop_facets(facets, dropped_ids); - - all_result_ids_len -= 
dropped_ids.size(); all_result_ids_len += curated_topster->size; delete [] filter_ids; @@ -1277,6 +1237,7 @@ void Index::search(Option & outcome, */ void Index::search_field(const uint8_t & field_id, const std::string & query, const std::string & field, uint32_t *filter_ids, size_t filter_ids_length, + const std::vector& curated_ids, std::vector & facets, const std::vector & sort_fields, const int num_typos, std::vector> & searched_queries, Topster* topster, uint32_t** all_result_ids, size_t & all_result_ids_len, @@ -1392,8 +1353,8 @@ void Index::search_field(const uint8_t & field_id, const std::string & query, co if(!token_candidates_vec.empty() && token_candidates_vec.size() == tokens.size()) { // If all tokens were found, go ahead and search for candidates with what we have so far - search_candidates(field_id, filter_ids, filter_ids_length, sort_fields, token_candidates_vec, - token_order, searched_queries, topster, all_result_ids, all_result_ids_len, + search_candidates(field_id, filter_ids, filter_ids_length, curated_ids, sort_fields, token_candidates_vec, + searched_queries, topster, all_result_ids, all_result_ids_len, typo_tokens_threshold); } @@ -1426,7 +1387,8 @@ void Index::search_field(const uint8_t & field_id, const std::string & query, co truncated_query += " " + token_count_pairs.at(i).first; } - return search_field(field_id, truncated_query, field, filter_ids, filter_ids_length, facets, sort_fields, num_typos, + return search_field(field_id, truncated_query, field, filter_ids, filter_ids_length, curated_ids, + facets, sort_fields, num_typos, searched_queries, topster, all_result_ids, all_result_ids_len, token_order, prefix); } @@ -1573,21 +1535,7 @@ void Index::score_results(const std::vector & sort_fields, const uint16 uint64_t distinct_id = seq_id; if(search_params->group_limit != 0) { - distinct_id = 1; // some constant initial value - - // calculate hash from group_by_fields - for(const auto& field: search_params->group_by_fields) { - if(facet_to_id.count(field) == 0 || facet_index_v2.count(seq_id) == 0) { - continue; - } - - size_t facet_id = facet_to_id[field]; - const std::vector& fhashes = facet_index_v2.at(seq_id)[facet_id]; - - for(const auto& hash: fhashes) { - distinct_id = hash_combine(distinct_id, hash); - } - } + distinct_id = get_distinct_id(facet_to_id, seq_id); } KV kv(field_id, query_index, seq_id, distinct_id, match_score, scores); @@ -1603,6 +1551,26 @@ void Index::score_results(const std::vector & sort_fields, const uint16 } } +uint64_t Index::get_distinct_id(const std::unordered_map &facet_to_id, + const uint32_t seq_id) const { + uint64_t distinct_id = 1; // some constant initial value + + // calculate hash from group_by_fields + for(const auto& field: search_params->group_by_fields) { + if(facet_to_id.count(field) == 0 || facet_index_v2.count(seq_id) == 0) { + continue; + } + + size_t facet_id = facet_to_id.at(field); + const std::vector& fhashes = facet_index_v2.at(seq_id)[facet_id]; + + for(const auto& hash: fhashes) { + distinct_id = hash_combine(distinct_id, hash); + } + } + return distinct_id; +} + void Index::populate_token_positions(const std::vector &query_suggestion, spp::sparse_hash_map &leaf_to_indices, size_t result_index, diff --git a/test/collection_override_test.cpp b/test/collection_override_test.cpp index a94bfbc0..49824142 100644 --- a/test/collection_override_test.cpp +++ b/test/collection_override_test.cpp @@ -219,6 +219,8 @@ TEST_F(CollectionOverrideTest, ExcludeIncludeFacetFilterQuery) { spp::sparse_hash_set(), 
spp::sparse_hash_set(), 10, "starring: scott").get(); + ASSERT_EQ(9, results["found"].get()); + // "count" would be `2` without exclusion ASSERT_EQ("Scott Glenn", results["facet_counts"][0]["counts"][0]["highlighted"].get()); ASSERT_EQ(1, results["facet_counts"][0]["counts"][0]["count"].get()); @@ -233,7 +235,7 @@ TEST_F(CollectionOverrideTest, ExcludeIncludeFacetFilterQuery) { spp::sparse_hash_set(), spp::sparse_hash_set(), 10, "starring: scott").get(); - ASSERT_EQ(10, results["found"].get()); + ASSERT_EQ(9, results["found"].get()); ASSERT_EQ(0, results["hits"].size()); coll_mul_fields->remove_override("exclude-rule"); From 3b68a815159176855808e8ff812c904ac55591f9 Mon Sep 17 00:00:00 2001 From: kishorenc Date: Sun, 14 Jun 2020 15:20:31 +0530 Subject: [PATCH 10/38] Ensure timeouts are cleared before destroying evloop. --- src/http_server.cpp | 31 ++----------------------------- 1 file changed, 2 insertions(+), 29 deletions(-) diff --git a/src/http_server.cpp b/src/http_server.cpp index 37b6aae0..8058e32d 100644 --- a/src/http_server.cpp +++ b/src/http_server.cpp @@ -186,20 +186,6 @@ std::string HttpServer::get_version() { void HttpServer::clear_timeouts(const std::vector & timers, bool trigger_callback) { for(h2o_timer_t* timer: timers) { h2o_timer_unlink(timer); - /*while (!h2o_linklist_is_empty(&timer->_link)) { - h2o_timer_t *entry = H2O_STRUCT_FROM_MEMBER(h2o_timer_t, _link, timer->_link.next); - if(entry == nullptr) { - continue; - } - - if(trigger_callback) { - entry->cb(entry); - } - - //entry->expire_at = 0; - h2o_linklist_unlink(&entry->_link); - h2o_timer_unlink(timer); - }*/ } } @@ -476,26 +462,13 @@ void HttpServer::on(const std::string & message, bool (*handler)(void*)) { HttpServer::~HttpServer() { delete message_dispatcher; - // remove all timeouts defined in: https://github.com/h2o/h2o/blob/v2.2.2/lib/core/context.c#L142 - /*std::vector timeouts = { - &ctx.zero_timeout, - &ctx.one_sec_timeout, - &ctx.hundred_ms_timeout, - &ctx.handshake_timeout, - &ctx.http1.req_timeout, - &ctx.http2.idle_timeout, - &ctx.http2.graceful_shutdown_timeout, - &ctx.proxy.io_timeout - }; - - clear_timeouts(timeouts); - */ - if(ssl_refresh_timer.timer.expire_at != 0) { // avoid callback since it recreates timeout clear_timeouts({&ssl_refresh_timer.timer}, false); } + h2o_timerwheel_run(ctx.loop->_timeouts, 9999999999999); + h2o_context_dispose(&ctx); free(ctx.globalconf->server_name.base); From 1c398fac7ef48338f8fa2b98c5db763c0536c8a7 Mon Sep 17 00:00:00 2001 From: kishorenc Date: Sun, 14 Jun 2020 15:21:54 +0530 Subject: [PATCH 11/38] Don't require query fields for wildcard query. 
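
A `q=*` query does not search any specific field, so there is nothing for `query_by` to qualify. The parameter check in get_search() is therefore skipped when the query is `*`, and a request as bare as the following now validates (collection name illustrative):

    GET /collections/companies/documents/search?q=*

`query_by` remains mandatory for every non-wildcard query.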
--- src/core_api.cpp | 2 +- test/collection_test.cpp | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/src/core_api.cpp b/src/core_api.cpp index 8ba15bc2..18c96629 100644 --- a/src/core_api.cpp +++ b/src/core_api.cpp @@ -252,7 +252,7 @@ bool get_search(http_req & req, http_res & res) { return false; } - if(req.params.count(QUERY_BY) == 0) { + if(req.params.count(QUERY_BY) == 0 && req.params[QUERY] != "*") { res.set_400(std::string("Parameter `") + QUERY_BY + "` is required."); return false; } diff --git a/test/collection_test.cpp b/test/collection_test.cpp index f5c9bf09..dc904ce5 100644 --- a/test/collection_test.cpp +++ b/test/collection_test.cpp @@ -455,6 +455,12 @@ TEST_F(CollectionTest, WildcardQuery) { std::string id = ids.at(i); ASSERT_STREQ(id.c_str(), result_id.c_str()); } + + // wildcard query should not require a search field + results_op = collection->search("*", {}, "", {}, sort_fields, 0, 3, 1, FREQUENCY, false).get(); + ASSERT_TRUE(results_op.ok()); + ASSERT_EQ(3, results["hits"].size()); + ASSERT_EQ(25, results["found"].get()); } TEST_F(CollectionTest, PrefixSearching) { From c5010a6a5f282c1a8deec3b70a741d9c551e8a9b Mon Sep 17 00:00:00 2001 From: kishorenc Date: Sun, 14 Jun 2020 17:16:01 +0530 Subject: [PATCH 12/38] Results count should match group size for group query. --- TODO.md | 5 +++-- include/field.h | 2 +- include/index.h | 21 ++++++++++------- src/array_utils.cpp | 4 +++- src/art.cpp | 2 -- src/collection.cpp | 55 ++++++++++++++++++++++++++++----------------- src/index.cpp | 53 ++++++++++++++++++++++--------------------- 7 files changed, 82 insertions(+), 60 deletions(-) diff --git a/TODO.md b/TODO.md index b031a788..adeea108 100644 --- a/TODO.md +++ b/TODO.md @@ -97,9 +97,10 @@ - ~~Have a LOG(ERROR) level~~ - ~~Handle SIGTERM which is sent when process is killed~~ - ~~Use snappy compression for storage~~ +- ~~Fix exclude_scalar early returns~~ +- ~~Fix result ids length during grouped overrides~~ +- Fix override grouping (collate_included_ids) - Test for overriding result on second page -- Fix exclude_scalar early returns -- Fix result ids length during grouped overrides - atleast 1 token match for proceeding with drop tokens - support wildcard query with filters - API for optimizing on disk storage diff --git a/include/field.h b/include/field.h index 6255aed5..44761e23 100644 --- a/include/field.h +++ b/include/field.h @@ -146,7 +146,7 @@ struct token_pos_cost_t { struct facet_count_t { uint32_t count; - spp::sparse_hash_map groups; // used for faceting grouped results + spp::sparse_hash_set groups; // used for faceting grouped results // used to fetch the actual document and value for representation uint32_t doc_id; diff --git a/include/index.h b/include/index.h index 4baa1eed..65cf951e 100644 --- a/include/index.h +++ b/include/index.h @@ -42,6 +42,7 @@ struct search_args { std::vector group_by_fields; size_t group_limit; size_t all_result_ids_len; + spp::sparse_hash_set groups_processed; std::vector> searched_queries; Topster* topster; Topster* curated_topster; @@ -168,9 +169,9 @@ private: const std::vector& curated_ids, std::vector & facets, const std::vector & sort_fields, const int num_typos, std::vector> & searched_queries, - Topster* topster, uint32_t** all_result_ids, - size_t & all_result_ids_len, const token_ordering token_order = FREQUENCY, - const bool prefix = false, + Topster* topster, spp::sparse_hash_set& groups_processed, + uint32_t** all_result_ids, size_t & all_result_ids_len, + const token_ordering token_order = 
FREQUENCY, const bool prefix = false, const size_t drop_tokens_threshold = Index::DROP_TOKENS_THRESHOLD, const size_t typo_tokens_threshold = Index::TYPO_TOKENS_THRESHOLD); @@ -178,7 +179,8 @@ private: const std::vector& curated_ids, const std::vector & sort_fields, std::vector & token_to_candidates, std::vector> & searched_queries, - Topster* topster, uint32_t** all_result_ids, + Topster* topster, spp::sparse_hash_set& groups_processed, + uint32_t** all_result_ids, size_t & all_result_ids_len, const size_t typo_tokens_threshold); @@ -210,9 +212,9 @@ private: void remove_and_shift_offset_index(sorted_array &offset_index, const uint32_t *indices_sorted, const uint32_t indices_length); - void collate_curated_ids(const std::string & query, const std::string & field, const uint8_t field_id, - const std::vector & included_ids, - Topster* curated_topster, std::vector> & searched_queries); + void collate_included_ids(const std::string & query, const std::string & field, const uint8_t field_id, + const std::vector & included_ids, + Topster* curated_topster, std::vector> & searched_queries); uint64_t facet_token_hash(const field & a_field, const std::string &token); @@ -242,7 +244,9 @@ public: Topster* topster, Topster* curated_topster, const size_t per_page, const size_t page, const token_ordering token_order, const bool prefix, const size_t drop_tokens_threshold, - size_t & all_result_ids_len, std::vector> & searched_queries, + size_t & all_result_ids_len, + spp::sparse_hash_set& groups_processed, + std::vector> & searched_queries, std::vector> & raw_result_kvs, std::vector & override_result_kvs, const size_t typo_tokens_threshold); @@ -257,6 +261,7 @@ public: void score_results(const std::vector & sort_fields, const uint16_t & query_index, const uint8_t & field_id, const uint32_t total_cost, Topster* topster, const std::vector & query_suggestion, + spp::sparse_hash_set& groups_processed, const uint32_t *result_ids, const size_t result_size) const; static int32_t get_points_from_doc(const nlohmann::json &document, const std::string & default_sorting_field); diff --git a/src/array_utils.cpp b/src/array_utils.cpp index 62fe2d87..5a88dffd 100644 --- a/src/array_utils.cpp +++ b/src/array_utils.cpp @@ -107,10 +107,12 @@ size_t ArrayUtils::exclude_scalar(const uint32_t *A, const size_t lenA, size_t indexA = 0, indexB = 0, res_index = 0; if(A == nullptr && B == nullptr) { - return 0; + *out = nullptr; + return 0; } if(A == nullptr) { + *out = nullptr; return 0; } diff --git a/src/art.cpp b/src/art.cpp index 7ac945c0..1da84cf1 100644 --- a/src/art.cpp +++ b/src/art.cpp @@ -1384,8 +1384,6 @@ int art_fuzzy_search(art_tree *t, const unsigned char *term, const int term_len, art_fuzzy_recurse(0, 0, t->root, -1, term, term_len, irow, jrow, min_cost, max_cost, prefix, nodes); } - PROCESS_NODES: - if(token_order == FREQUENCY) { std::sort(nodes.begin(), nodes.end(), compare_art_node_frequency); } else { diff --git a/src/collection.cpp b/src/collection.cpp index 6e50fbac..24547c62 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -653,6 +653,7 @@ Option Collection::search(const std::string & query, const std:: std::vector override_result_kvs; size_t total_found = 0; + spp::sparse_hash_set groups_processed; // used to calculate total_found for grouped query // send data to individual index threads size_t index_id = 0; @@ -709,28 +710,22 @@ Option Collection::search(const std::string & query, const std:: auto & acc_facet = facets[fi]; for(auto & facet_kv: this_facet.result_map) { - size_t count = 0; - - - 
// for grouping we have to aggregate group counts to a count value - /*if(search_params->group_limit) { - // for every facet - for(auto& a_facet: facets) { - // for every facet value - for(auto& fvalue: a_facet.result_map) { - fvalue.second.count = fvalue.second.groups.size(); - } - } - }*/ - - if(acc_facet.result_map.count(facet_kv.first) == 0) { - // not found, so set it - count = facet_kv.second.count; + if(index->search_params->group_limit) { + // we have to add all group sets + acc_facet.result_map[facet_kv.first].groups.insert( + facet_kv.second.groups.begin(), facet_kv.second.groups.end() + ); } else { - count = acc_facet.result_map[facet_kv.first].count + facet_kv.second.count; + size_t count = 0; + if(acc_facet.result_map.count(facet_kv.first) == 0) { + // not found, so set it + count = facet_kv.second.count; + } else { + count = acc_facet.result_map[facet_kv.first].count + facet_kv.second.count; + } + acc_facet.result_map[facet_kv.first].count = count; } - acc_facet.result_map[facet_kv.first].count = count; acc_facet.result_map[facet_kv.first].doc_id = facet_kv.second.doc_id; acc_facet.result_map[facet_kv.first].array_pos = facet_kv.second.array_pos; acc_facet.result_map[facet_kv.first].query_token_pos = facet_kv.second.query_token_pos; @@ -744,7 +739,25 @@ Option Collection::search(const std::string & query, const std:: } } - total_found += index->search_params->all_result_ids_len; + if(group_limit) { + groups_processed.insert( + index->search_params->groups_processed.begin(), + index->search_params->groups_processed.end() + ); + } else { + total_found += index->search_params->all_result_ids_len; + } + } + + // for grouping we have to aggregate group set sizes to a count value + if(group_limit) { + for(auto& acc_facet: facets) { + for(auto& facet_kv: acc_facet.result_map) { + facet_kv.second.count = facet_kv.second.groups.size(); + } + } + + total_found = groups_processed.size(); } if(!index_search_op.ok()) { @@ -753,7 +766,7 @@ Option Collection::search(const std::string & query, const std:: Topster* aggr_topster = nullptr; - if(group_limit > 0) { + if(group_limit) { // group by query requires another round of topster-ing // needs to be atleast 1 since scoring is mandatory diff --git a/src/index.cpp b/src/index.cpp index 5c0f8546..fcbbc478 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -744,7 +744,7 @@ void Index::do_facets(std::vector & facets, facet_query_t & facet_query, uint64_t fhash = facet_token_hash(facet_field, fvalue_str); if(a_facet.result_map.count(fhash) == 0) { - a_facet.result_map[fhash] = facet_count_t{0, spp::sparse_hash_map(), + a_facet.result_map[fhash] = facet_count_t{0, spp::sparse_hash_set(), doc_seq_id, 0, spp::sparse_hash_map()}; } @@ -754,10 +754,7 @@ void Index::do_facets(std::vector & facets, facet_query_t & facet_query, if(search_params->group_limit) { uint64_t distinct_id = get_distinct_id(facet_to_index, doc_seq_id); - if(a_facet.result_map[fhash].groups.count(distinct_id) == 0) { - a_facet.result_map[fhash].groups.emplace(distinct_id, 0); - } - a_facet.result_map[fhash].groups[distinct_id] += 1; + a_facet.result_map[fhash].groups.emplace(distinct_id); } else { a_facet.result_map[fhash].count += 1; } @@ -784,6 +781,7 @@ void Index::search_candidates(const uint8_t & field_id, uint32_t* filter_ids, si const std::vector & sort_fields, std::vector & token_candidates_vec, std::vector> & searched_queries, Topster* topster, + spp::sparse_hash_set& groups_processed, uint32_t** all_result_ids, size_t & all_result_ids_len, const size_t 
typo_tokens_threshold) { const long long combination_limit = 10; @@ -850,7 +848,7 @@ void Index::search_candidates(const uint8_t & field_id, uint32_t* filter_ids, si // go through each matching document id and calculate match score score_results(sort_fields, (uint16_t) searched_queries.size(), field_id, total_cost, topster, query_suggestion, - filtered_result_ids, filtered_results_size); + groups_processed, filtered_result_ids, filtered_results_size); delete[] filtered_result_ids; delete[] result_ids; @@ -862,7 +860,7 @@ void Index::search_candidates(const uint8_t & field_id, uint32_t* filter_ids, si *all_result_ids = new_all_result_ids; score_results(sort_fields, (uint16_t) searched_queries.size(), field_id, total_cost, topster, query_suggestion, - result_ids, result_size); + groups_processed, result_ids, result_size); delete[] result_ids; } @@ -1024,7 +1022,8 @@ void Index::run_search() { search_params->topster, search_params->curated_topster, search_params->per_page, search_params->page, search_params->token_order, search_params->prefix, search_params->drop_tokens_threshold, - search_params->all_result_ids_len, search_params->searched_queries, + search_params->all_result_ids_len, search_params->groups_processed, + search_params->searched_queries, search_params->raw_result_kvs, search_params->override_result_kvs, search_params->typo_tokens_threshold); @@ -1038,12 +1037,12 @@ void Index::run_search() { } } -void Index::collate_curated_ids(const std::string & query, const std::string & field, const uint8_t field_id, - const std::vector & included_ids, - Topster* curated_topster, - std::vector> & searched_queries) { +void Index::collate_included_ids(const std::string & query, const std::string & field, const uint8_t field_id, + const std::vector & included_ids, + Topster* curated_topster, + std::vector> & searched_queries) { - if(included_ids.size() == 0) { + if(included_ids.empty()) { return; } @@ -1106,9 +1105,9 @@ void Index::collate_curated_ids(const std::string & query, const std::string & f KV kv(field_id, searched_queries.size(), seq_id, seq_id, match_score, scores); curated_topster->add(&kv); - - searched_queries.push_back(override_query); } + + searched_queries.push_back(override_query); } void Index::search(Option & outcome, @@ -1124,7 +1123,8 @@ void Index::search(Option & outcome, const size_t per_page, const size_t page, const token_ordering token_order, const bool prefix, const size_t drop_tokens_threshold, size_t & all_result_ids_len, - std::vector> & searched_queries, + spp::sparse_hash_set& groups_processed, + std::vector>& searched_queries, std::vector> & raw_result_kvs, std::vector & override_result_kvs, const size_t typo_tokens_threshold) { @@ -1140,7 +1140,7 @@ void Index::search(Option & outcome, uint32_t filter_ids_length = op_filter_ids_length.get(); - // we will be removing all curated IDs from organic results before running topster + // we will be removing all curated IDs from organic result ids before running topster std::set curated_ids(included_ids.begin(), included_ids.end()); curated_ids.insert(excluded_ids.begin(), excluded_ids.end()); @@ -1165,8 +1165,8 @@ void Index::search(Option & outcome, } score_results(sort_fields_std, (uint16_t) searched_queries.size(), field_id, 0, topster, {}, - filter_ids, filter_ids_length); - collate_curated_ids(query, field, field_id, included_ids, curated_topster, searched_queries); + groups_processed, filter_ids, filter_ids_length); + collate_included_ids(query, field, field_id, included_ids, curated_topster, 
searched_queries); all_result_ids_len = filter_ids_length; all_result_ids = filter_ids; @@ -1180,9 +1180,9 @@ void Index::search(Option & outcome, const std::string & field = search_fields[i]; search_field(field_id, query, field, filter_ids, filter_ids_length, curated_ids_sorted, facets, sort_fields_std, - num_typos, searched_queries, topster, &all_result_ids, all_result_ids_len, + num_typos, searched_queries, topster, groups_processed, &all_result_ids, all_result_ids_len, token_order, prefix, drop_tokens_threshold, typo_tokens_threshold); - collate_curated_ids(query, field, field_id, included_ids, curated_topster, searched_queries); + collate_included_ids(query, field, field_id, included_ids, curated_topster, searched_queries); } } } @@ -1214,7 +1214,7 @@ void Index::search(Option & outcome, override_result_kvs.push_back(kv); } - // for the ids that are dropped, remove their corresponding facet components from facet results + // add curated IDs to result count all_result_ids_len += curated_topster->size; delete [] filter_ids; @@ -1240,7 +1240,8 @@ void Index::search_field(const uint8_t & field_id, const std::string & query, co const std::vector& curated_ids, std::vector & facets, const std::vector & sort_fields, const int num_typos, std::vector> & searched_queries, - Topster* topster, uint32_t** all_result_ids, size_t & all_result_ids_len, + Topster* topster, spp::sparse_hash_set& groups_processed, + uint32_t** all_result_ids, size_t & all_result_ids_len, const token_ordering token_order, const bool prefix, const size_t drop_tokens_threshold, const size_t typo_tokens_threshold) { std::vector tokens; @@ -1354,7 +1355,7 @@ void Index::search_field(const uint8_t & field_id, const std::string & query, co if(!token_candidates_vec.empty() && token_candidates_vec.size() == tokens.size()) { // If all tokens were found, go ahead and search for candidates with what we have so far search_candidates(field_id, filter_ids, filter_ids_length, curated_ids, sort_fields, token_candidates_vec, - searched_queries, topster, all_result_ids, all_result_ids_len, + searched_queries, topster, groups_processed, all_result_ids, all_result_ids_len, typo_tokens_threshold); } @@ -1389,7 +1390,7 @@ void Index::search_field(const uint8_t & field_id, const std::string & query, co return search_field(field_id, truncated_query, field, filter_ids, filter_ids_length, curated_ids, facets, sort_fields, num_typos, - searched_queries, topster, all_result_ids, all_result_ids_len, + searched_queries, topster, groups_processed, all_result_ids, all_result_ids_len, token_order, prefix); } } @@ -1417,6 +1418,7 @@ void Index::log_leaves(const int cost, const std::string &token, const std::vect void Index::score_results(const std::vector & sort_fields, const uint16_t & query_index, const uint8_t & field_id, const uint32_t total_cost, Topster* topster, const std::vector &query_suggestion, + spp::sparse_hash_set& groups_processed, const uint32_t *result_ids, const size_t result_size) const { spp::sparse_hash_map leaf_to_indices; @@ -1536,6 +1538,7 @@ void Index::score_results(const std::vector & sort_fields, const uint16 if(search_params->group_limit != 0) { distinct_id = get_distinct_id(facet_to_id, seq_id); + groups_processed.emplace(distinct_id); } KV kv(field_id, query_index, seq_id, distinct_id, match_score, scores); From 3f6f13baf1c7c6c39e6618167fd2735595c5c8a2 Mon Sep 17 00:00:00 2001 From: kishorenc Date: Mon, 15 Jun 2020 19:20:00 +0530 Subject: [PATCH 13/38] Support for grouping overrides. 
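
With grouping, one result position can hold several documents, so pinned_hits changes shape from `id -> position` to `position -> [ids]`, and populate_overrides() now resolves all precedence up front: hidden hits are excluded first, an override's drop_hits are excluded before its add_hits are considered, and an id is included only if it has not already been excluded. The curated topster is also constructed with group_limit so that collate_included_ids() groups pinned documents the same way organic hits are grouped. A condensed sketch of the precedence rules follows — doc-id resolution and override matching are omitted and names are simplified:

    #include <cstdint>
    #include <map>
    #include <set>
    #include <vector>

    // positions map to lists of ids so grouped results can pin
    // several documents to a single slot
    using include_map_t = std::map<size_t, std::vector<uint32_t>>;

    void resolve_overrides(const include_map_t& pinned_hits,
                           const std::vector<uint32_t>& hidden_hits,
                           include_map_t& include_ids,
                           std::vector<uint32_t>& excluded_ids) {
        // hidden hits take precedence over everything below
        std::set<uint32_t> excluded_set(hidden_hits.begin(), hidden_hits.end());
        excluded_ids.assign(excluded_set.begin(), excluded_set.end());

        // a pinned id is honoured only if it was not excluded already;
        // override add_hits follow the same rule in the real code
        for (const auto& pos_ids : pinned_hits) {
            for (uint32_t seq_id : pos_ids.second) {
                if (excluded_set.count(seq_id) == 0) {
                    include_ids[pos_ids.first].push_back(seq_id);
                }
            }
        }
    }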
--- TODO.md | 4 +- include/collection.h | 8 +- include/index.h | 16 ++-- src/collection.cpp | 129 +++++++++++++++++++----------- src/core_api.cpp | 5 +- src/index.cpp | 86 +++++++++----------- test/collection_override_test.cpp | 23 +++++- 7 files changed, 159 insertions(+), 112 deletions(-) diff --git a/TODO.md b/TODO.md index adeea108..8c3bed2d 100644 --- a/TODO.md +++ b/TODO.md @@ -99,8 +99,8 @@ - ~~Use snappy compression for storage~~ - ~~Fix exclude_scalar early returns~~ - ~~Fix result ids length during grouped overrides~~ -- Fix override grouping (collate_included_ids) -- Test for overriding result on second page +- ~~Fix override grouping (collate_included_ids)~~ +- ~~Test for overriding result on second page~~ - atleast 1 token match for proceeding with drop tokens - support wildcard query with filters - API for optimizing on disk storage diff --git a/include/collection.h b/include/collection.h index 4e06131d..648ca384 100644 --- a/include/collection.h +++ b/include/collection.h @@ -155,10 +155,10 @@ private: void remove_document(nlohmann::json & document, const uint32_t seq_id, bool remove_from_store); void populate_overrides(std::string query, - const std::map& pinned_hits, + const std::map>& pinned_hits, const std::vector& hidden_hits, - std::map & id_pos_map, - std::vector & included_ids, std::vector & excluded_ids); + std::map>& include_ids, + std::vector & excluded_ids); static bool facet_count_compare(const std::pair& a, const std::pair& b) { @@ -236,7 +236,7 @@ public: const size_t snippet_threshold = 30, const std::string & highlight_full_fields = "", size_t typo_tokens_threshold = Index::TYPO_TOKENS_THRESHOLD, - const std::map& pinned_hits={}, + const std::map>& pinned_hits={}, const std::vector& hidden_hits={}, const std::vector& group_by_fields={}, const size_t group_limit = 0); diff --git a/include/index.h b/include/index.h index 65cf951e..f2eb5fcc 100644 --- a/include/index.h +++ b/include/index.h @@ -27,7 +27,7 @@ struct search_args { std::vector search_fields; std::vector filters; std::vector facets; - std::vector included_ids; + std::map> included_ids; std::vector excluded_ids; std::vector sort_fields_std; facet_query_t facet_query; @@ -47,7 +47,7 @@ struct search_args { Topster* topster; Topster* curated_topster; std::vector> raw_result_kvs; - std::vector override_result_kvs; + std::vector> override_result_kvs; Option outcome; search_args(): outcome(0) { @@ -55,7 +55,7 @@ struct search_args { } search_args(std::string query, std::vector search_fields, std::vector filters, - std::vector facets, std::vector included_ids, std::vector excluded_ids, + std::vector facets, std::map> included_ids, std::vector excluded_ids, std::vector sort_fields_std, facet_query_t facet_query, int num_typos, size_t max_facet_values, size_t max_hits, size_t per_page, size_t page, token_ordering token_order, bool prefix, size_t drop_tokens_threshold, size_t typo_tokens_threshold, @@ -70,7 +70,7 @@ struct search_args { const size_t topster_size = std::max((size_t)1, max_hits); // needs to be atleast 1 since scoring is mandatory topster = new Topster(topster_size, group_limit); - curated_topster = new Topster(topster_size); + curated_topster = new Topster(topster_size, group_limit); } ~search_args() { @@ -213,7 +213,7 @@ private: const uint32_t indices_length); void collate_included_ids(const std::string & query, const std::string & field, const uint8_t field_id, - const std::vector & included_ids, + const std::map> & included_ids_map, Topster* curated_topster, std::vector> & 
searched_queries); uint64_t facet_token_hash(const field & a_field, const std::string &token); @@ -239,7 +239,8 @@ public: void search(Option & outcome, const std::string & query, const std::vector & search_fields, const std::vector & filters, std::vector & facets, facet_query_t & facet_query, - const std::vector & included_ids, const std::vector & excluded_ids, + const std::map> & included_ids_map, + const std::vector & excluded_ids, const std::vector & sort_fields_std, const int num_typos, Topster* topster, Topster* curated_topster, const size_t per_page, const size_t page, const token_ordering token_order, @@ -247,7 +248,8 @@ public: size_t & all_result_ids_len, spp::sparse_hash_set& groups_processed, std::vector> & searched_queries, - std::vector> & raw_result_kvs, std::vector & override_result_kvs, + std::vector> & raw_result_kvs, + std::vector> & override_result_kvs, const size_t typo_tokens_threshold); Option remove(const uint32_t seq_id, nlohmann::json & document); diff --git a/src/collection.cpp b/src/collection.cpp index 24547c62..a51a7e29 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -305,54 +305,70 @@ void Collection::prune_document(nlohmann::json &document, const spp::sparse_hash } void Collection::populate_overrides(std::string query, - const std::map& pinned_hits, + const std::map>& pinned_hits, const std::vector& hidden_hits, - std::map & id_pos_map, - std::vector & included_ids, + std::map>& include_ids, std::vector & excluded_ids) { StringUtils::tolowercase(query); + std::set excluded_set; + + // If pinned or hidden hits are provided, they take precedence over overrides + + // have to ensure that hidden hits take precedence over included hits + if(!hidden_hits.empty()) { + for(const auto & hit: hidden_hits) { + Option seq_id_op = doc_id_to_seq_id(hit); + if(seq_id_op.ok()) { + excluded_ids.push_back(seq_id_op.get()); + excluded_set.insert(seq_id_op.get()); + } + } + } for(const auto & override_kv: overrides) { const auto & override = override_kv.second; if( (override.rule.match == override_t::MATCH_EXACT && override.rule.query == query) || (override.rule.match == override_t::MATCH_CONTAINS && query.find(override.rule.query) != std::string::npos) ) { - for(const auto & hit: override.add_hits) { - Option seq_id_op = doc_id_to_seq_id(hit.doc_id); - if(seq_id_op.ok()) { - included_ids.push_back(seq_id_op.get()); - id_pos_map[seq_id_op.get()] = hit.position; - } - } + // have to ensure that dropped hits take precedence over added hits for(const auto & hit: override.drop_hits) { Option seq_id_op = doc_id_to_seq_id(hit.doc_id); if(seq_id_op.ok()) { excluded_ids.push_back(seq_id_op.get()); + excluded_set.insert(seq_id_op.get()); } } + + for(const auto & hit: override.add_hits) { + Option seq_id_op = doc_id_to_seq_id(hit.doc_id); + if(!seq_id_op.ok()) { + continue; + } + uint32_t seq_id = seq_id_op.get(); + bool excluded = (excluded_set.count(seq_id) != 0); + if(!excluded) { + include_ids[hit.position].push_back(seq_id); + } + } + + break; } } - // If pinned or hidden hits are provided, they take precedence over overrides - if(!pinned_hits.empty()) { - for(const auto & hit: pinned_hits) { - Option seq_id_op = doc_id_to_seq_id(hit.first); - if(seq_id_op.ok()) { - included_ids.push_back(seq_id_op.get()); - id_pos_map[seq_id_op.get()] = hit.second; - } - } - } - - if(!hidden_hits.empty()) { - for(const auto & hit: hidden_hits) { - Option seq_id_op = doc_id_to_seq_id(hit); - if(seq_id_op.ok()) { - included_ids.erase(std::remove(included_ids.begin(), included_ids.end(), 
seq_id_op.get()), included_ids.end()); - id_pos_map.erase(seq_id_op.get()); - excluded_ids.push_back(seq_id_op.get()); + for(const auto& pos_ids: pinned_hits) { + size_t pos = pos_ids.first; + for(const std::string& id: pos_ids.second) { + Option seq_id_op = doc_id_to_seq_id(id); + if(!seq_id_op.ok()) { + continue; + } + uint32_t seq_id = seq_id_op.get(); + bool excluded = (excluded_set.count(seq_id) != 0); + if(!excluded) { + include_ids[pos].push_back(seq_id); + } } } } @@ -371,22 +387,40 @@ Option Collection::search(const std::string & query, const std:: const size_t snippet_threshold, const std::string & highlight_full_fields, size_t typo_tokens_threshold, - const std::map& pinned_hits, + const std::map>& pinned_hits, const std::vector& hidden_hits, const std::vector& group_by_fields, const size_t group_limit) { - std::vector included_ids; std::vector excluded_ids; - std::map id_pos_map; - populate_overrides(query, pinned_hits, hidden_hits, id_pos_map, included_ids, excluded_ids); + std::map> include_ids; // position => list of IDs + populate_overrides(query, pinned_hits, hidden_hits, include_ids, excluded_ids); - std::map> index_to_included_ids; + /*for(auto kv: include_ids) { + LOG(INFO) << "key: " << kv.first; + for(auto val: kv.second) { + LOG(INFO) << val; + } + } + + LOG(INFO) << "Excludes:"; + + for(auto id: excluded_ids) { + LOG(INFO) << id; + }*/ + + //LOG(INFO) << "include_ids size: " << include_ids.size(); + //LOG(INFO) << "Pos 1: " << include_ids[1][0]; + + std::map>> index_to_included_ids; std::map> index_to_excluded_ids; - for(auto seq_id: included_ids) { - auto index_id = (seq_id % num_indices); - index_to_included_ids[index_id].push_back(seq_id); + for(const auto& pos_ids: include_ids) { + size_t position = pos_ids.first; + for(auto seq_id: pos_ids.second) { + auto index_id = (seq_id % num_indices); + index_to_included_ids[index_id][position].push_back(seq_id); + } } for(auto seq_id: excluded_ids) { @@ -650,7 +684,7 @@ Option Collection::search(const std::string & query, const std:: std::vector> searched_queries; // search queries used for generating the results std::vector> raw_result_kvs; - std::vector override_result_kvs; + std::vector> override_result_kvs; size_t total_found = 0; spp::sparse_hash_set groups_processed; // used to calculate total_found for grouped query @@ -697,9 +731,9 @@ Option Collection::search(const std::string & query, const std:: raw_result_kvs.push_back(kv_group); } - for(auto & field_order_kv: index->search_params->override_result_kvs) { - field_order_kv->query_index += searched_queries.size(); - override_result_kvs.push_back(field_order_kv); + for(const std::vector & kv_group: index->search_params->override_result_kvs) { + kv_group[0]->query_index += searched_queries.size(); + override_result_kvs.push_back(kv_group); } searched_queries.insert(searched_queries.end(), index->search_params->searched_queries.begin(), @@ -797,8 +831,8 @@ Option Collection::search(const std::string & query, const std:: // Sort based on position in overridden list std::sort( override_result_kvs.begin(), override_result_kvs.end(), - [&id_pos_map](const KV* a, const KV* b) -> bool { - return id_pos_map[a->key] < id_pos_map[b->key]; + [](const std::vector& a, std::vector& b) -> bool { + return a[0]->distinct_key < b[0]->distinct_key; } ); @@ -808,11 +842,12 @@ Option Collection::search(const std::string & query, const std:: // merge raw results and override results while(override_kv_index < override_result_kvs.size() && raw_results_index < raw_result_kvs.size()) { - 
if(override_kv_index < override_result_kvs.size() && - id_pos_map.count(override_result_kvs[override_kv_index]->key) != 0 && - result_group_kvs.size() + 1 == id_pos_map[override_result_kvs[override_kv_index]->key]) { - result_group_kvs.push_back({override_result_kvs[override_kv_index]}); - override_kv_index++; + size_t result_position = result_group_kvs.size() + 1; + uint64_t override_position = override_result_kvs[override_kv_index][0]->distinct_key; + + if(result_position == override_position) { + result_group_kvs.push_back(override_result_kvs[override_kv_index]); + override_kv_index++; } else { result_group_kvs.push_back(raw_result_kvs[raw_results_index]); raw_results_index++; diff --git a/src/core_api.cpp b/src/core_api.cpp index 18c96629..03f89a2f 100644 --- a/src/core_api.cpp +++ b/src/core_api.cpp @@ -390,7 +390,8 @@ bool get_search(http_req & req, http_res & res) { } } - std::map pinned_hits; + std::map> pinned_hits; + if(req.params.count(PINNED_HITS) != 0) { std::vector pinned_hits_strs; StringUtils::split(req.params[PINNED_HITS], pinned_hits_strs, ","); @@ -415,7 +416,7 @@ bool get_search(http_req & req, http_res & res) { return false; } - pinned_hits.emplace(expression_parts[0], position); + pinned_hits[position].emplace_back(expression_parts[0]); } } diff --git a/src/index.cpp b/src/index.cpp index fcbbc478..6fdee499 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -1038,11 +1038,11 @@ void Index::run_search() { } void Index::collate_included_ids(const std::string & query, const std::string & field, const uint8_t field_id, - const std::vector & included_ids, + const std::map> & included_ids_map, Topster* curated_topster, std::vector> & searched_queries) { - if(included_ids.empty()) { + if(included_ids_map.empty()) { return; } @@ -1061,50 +1061,28 @@ void Index::collate_included_ids(const std::string & query, const std::string & art_fuzzy_search(search_index.at(field), (const unsigned char *) token.c_str(), token_len, 0, 0, 1, token_ordering::MAX_SCORE, false, leaves); - if(leaves.size() > 0) { + if(!leaves.empty()) { override_query.push_back(leaves[0]); } } - spp::sparse_hash_map leaf_to_indices; + for(const auto& pos_ids: included_ids_map) { + const size_t pos = pos_ids.first; - for (art_leaf *token_leaf : override_query) { - uint32_t *indices = new uint32_t[included_ids.size()]; - token_leaf->values->ids.indexOf(&included_ids[0], included_ids.size(), indices); - leaf_to_indices.emplace(token_leaf, indices); - } + for(size_t i = 0; i < pos_ids.second.size(); i++) { + uint32_t seq_id = pos_ids.second[i]; - // curated_topster->MAX_SIZE is initialized based on max_hits. - // Even if override has more IDs, we should restrict to max hits. 
- size_t iter_size = std::min((size_t)curated_topster->MAX_SIZE, included_ids.size()); + uint64_t distinct_id = pos; // position is the group distinct key + uint64_t match_score = (64000 - i); // index within a group is the match score - for(size_t j=0; j>> array_token_positions; - populate_token_positions(override_query, leaf_to_indices, j, array_token_positions); - - uint64_t match_score = 0; - - for(const std::vector> & token_positions: array_token_positions) { - if(token_positions.empty()) { - continue; - } - const Match & match = Match::match(seq_id, token_positions); - uint64_t this_match_score = match.get_match_score(0, field_id); - - if(this_match_score > match_score) { - match_score = this_match_score; - } + KV kv(field_id, searched_queries.size(), seq_id, distinct_id, match_score, scores); + curated_topster->add(&kv); } - - int64_t scores[3]; - scores[0] = int64_t(match_score); - scores[1] = int64_t(1); - scores[2] = int64_t(1); - - KV kv(field_id, searched_queries.size(), seq_id, seq_id, match_score, scores); - curated_topster->add(&kv); } searched_queries.push_back(override_query); @@ -1115,7 +1093,7 @@ void Index::search(Option & outcome, const std::vector & search_fields, const std::vector & filters, std::vector & facets, facet_query_t & facet_query, - const std::vector & included_ids, + const std::map> & included_ids_map, const std::vector & excluded_ids, const std::vector & sort_fields_std, const int num_typos, Topster* topster, @@ -1126,7 +1104,7 @@ void Index::search(Option & outcome, spp::sparse_hash_set& groups_processed, std::vector>& searched_queries, std::vector> & raw_result_kvs, - std::vector & override_result_kvs, + std::vector> & override_result_kvs, const size_t typo_tokens_threshold) { // process the filters @@ -1141,7 +1119,16 @@ void Index::search(Option & outcome, uint32_t filter_ids_length = op_filter_ids_length.get(); // we will be removing all curated IDs from organic result ids before running topster - std::set curated_ids(included_ids.begin(), included_ids.end()); + std::set curated_ids; + std::vector included_ids; + + for(const auto& pos_ids: included_ids_map) { + for(const uint32_t id: pos_ids.second) { + curated_ids.insert(id); + included_ids.push_back(id); + } + } + curated_ids.insert(excluded_ids.begin(), excluded_ids.end()); std::vector curated_ids_sorted(curated_ids.begin(), curated_ids.end()); @@ -1166,7 +1153,7 @@ void Index::search(Option & outcome, score_results(sort_fields_std, (uint16_t) searched_queries.size(), field_id, 0, topster, {}, groups_processed, filter_ids, filter_ids_length); - collate_included_ids(query, field, field_id, included_ids, curated_topster, searched_queries); + collate_included_ids(query, field, field_id, included_ids_map, curated_topster, searched_queries); all_result_ids_len = filter_ids_length; all_result_ids = filter_ids; @@ -1182,7 +1169,7 @@ void Index::search(Option & outcome, search_field(field_id, query, field, filter_ids, filter_ids_length, curated_ids_sorted, facets, sort_fields_std, num_typos, searched_queries, topster, groups_processed, &all_result_ids, all_result_ids_len, token_order, prefix, drop_tokens_threshold, typo_tokens_threshold); - collate_included_ids(query, field, field_id, included_ids, curated_topster, searched_queries); + collate_included_ids(query, field, field_id, included_ids_map, curated_topster, searched_queries); } } } @@ -1202,16 +1189,23 @@ void Index::search(Option & outcome, const std::vector group_kvs(group_topster->kvs, group_topster->kvs+group_topster->size); 
raw_result_kvs.emplace_back(group_kvs); } + + for(auto &curated_topster_entry: curated_topster->group_kv_map) { + Topster* group_topster = curated_topster_entry.second; + const std::vector group_kvs(group_topster->kvs, group_topster->kvs+group_topster->size); + override_result_kvs.emplace_back(group_kvs); + } + } else { for(uint32_t t = 0; t < topster->size; t++) { KV* kv = topster->getKV(t); raw_result_kvs.push_back({kv}); } - } - for(uint32_t t = 0; t < curated_topster->size; t++) { - KV* kv = curated_topster->getKV(t); - override_result_kvs.push_back(kv); + for(uint32_t t = 0; t < curated_topster->size; t++) { + KV* kv = curated_topster->getKV(t); + override_result_kvs.push_back({kv}); + } } // add curated IDs to result count diff --git a/test/collection_override_test.cpp b/test/collection_override_test.cpp index 49824142..eaf153f7 100644 --- a/test/collection_override_test.cpp +++ b/test/collection_override_test.cpp @@ -256,10 +256,9 @@ TEST_F(CollectionOverrideTest, ExcludeIncludeFacetFilterQuery) { } TEST_F(CollectionOverrideTest, IncludeExcludeHitsQuery) { - std::map pinned_hits; - std::vector hidden_hits; - pinned_hits["13"] = 1; - pinned_hits["4"] = 2; + std::map> pinned_hits; + pinned_hits[1] = {"13"}; + pinned_hits[2] = {"4"}; // basic pinning @@ -279,6 +278,7 @@ TEST_F(CollectionOverrideTest, IncludeExcludeHitsQuery) { // both pinning and hiding + std::vector hidden_hits; hidden_hits = {"11", "16"}; results = coll_mul_fields->search("the", {"title"}, "", {"starring"}, {}, 0, 50, 1, FREQUENCY, false, Index::DROP_TOKENS_THRESHOLD, @@ -291,6 +291,21 @@ TEST_F(CollectionOverrideTest, IncludeExcludeHitsQuery) { ASSERT_STREQ("4", results["hits"][1]["document"]["id"].get().c_str()); ASSERT_STREQ("6", results["hits"][2]["document"]["id"].get().c_str()); + // paginating such that pinned hits appear on second page + pinned_hits.clear(); + pinned_hits[4] = {"13"}; + pinned_hits[5] = {"4"}; + + results = coll_mul_fields->search("the", {"title"}, "", {"starring"}, {}, 0, 2, 2, FREQUENCY, + false, Index::DROP_TOKENS_THRESHOLD, + spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "starring: will", 30, + "", 10, + pinned_hits, hidden_hits).get(); + + ASSERT_STREQ("1", results["hits"][0]["document"]["id"].get().c_str()); + ASSERT_STREQ("13", results["hits"][1]["document"]["id"].get().c_str()); + // take precedence over override rules nlohmann::json override_json_include = { From ef405167902474342b6b719ba35a271081acb6c4 Mon Sep 17 00:00:00 2001 From: kishorenc Date: Thu, 18 Jun 2020 06:17:10 +0530 Subject: [PATCH 14/38] Fixing an issue with grouping across indices. 
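Documents are partitioned across indices by `seq_id % num_indices`, so two
documents that belong to the same group can surface from different per-index
topsters, leaving one logical group split across the merged result list. The
fix re-adds every per-index KV into a single collection-level topster (the
new aggregate_topster below) so groups merge on their distinct key before the
final sort. A simplified model of that merge; SimpleKV and the plain map are
trimmed stand-ins for the real KV and Topster::group_kv_map:

    #include <cstdint>
    #include <iostream>
    #include <map>
    #include <vector>

    struct SimpleKV {
        uint64_t key;           // document seq_id
        uint64_t distinct_key;  // hash of the group_by field values
        uint64_t match_score;
    };

    int main() {
        // KVs as they arrive from two different per-index topsters;
        // seq_ids 0 and 1 share the group with distinct key 42
        const std::vector<SimpleKV> index0_kvs = {{0, 42, 10}, {4, 99, 30}};
        const std::vector<SimpleKV> index1_kvs = {{1, 42, 20}};

        // collection-level aggregation: bucket the KVs by distinct key
        std::map<uint64_t, std::vector<SimpleKV>> group_kv_map;
        for(const auto& kvs: {index0_kvs, index1_kvs}) {
            for(const auto& kv: kvs) {
                group_kv_map[kv.distinct_key].push_back(kv);
            }
        }

        // group 42 now holds members from both indices
        for(const auto& entry: group_kv_map) {
            std::cout << "group " << entry.first << " has "
                      << entry.second.size() << " hit(s)\n";
        }
    }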
---
 include/collection.h              |   4 +
 include/topster.h                 |   5 ++
 src/collection.cpp                | 120 ++++++++++++++++++------------
 src/index.cpp                     |  28 -------
 test/collection_grouping_test.cpp |  71 ++++++++++++++++++
 test/collection_override_test.cpp |  38 +++++++++-
 test/collection_test.cpp          |   1 -
 7 files changed, 189 insertions(+), 78 deletions(-)
 create mode 100644 test/collection_grouping_test.cpp

diff --git a/include/collection.h b/include/collection.h
index 648ca384..730bad63 100644
--- a/include/collection.h
+++ b/include/collection.h
@@ -282,5 +282,9 @@ public:
 
     void facet_value_to_string(const facet &a_facet, const facet_count_t &facet_count, const nlohmann::json &document,
                                std::string &value);
+
+    void aggregate_topster(size_t query_index, Topster &topster, Topster *index_topster) const;
+
+    void populate_result_kvs(Topster *topster, std::vector> &result_kvs) const;
 };
 
diff --git a/include/topster.h b/include/topster.h
index 685e6d62..2e585a6b 100644
--- a/include/topster.h
+++ b/include/topster.h
@@ -69,6 +69,11 @@ struct Topster {
         for(auto& kv: group_kv_map) {
             delete kv.second;
         }
+
+        data = nullptr;
+        kvs = nullptr;
+
+        group_kv_map.clear();
     }
 
     static inline void swapMe(KV** a, KV** b) {
diff --git a/src/collection.cpp b/src/collection.cpp
index a51a7e29..ee23aefe 100644
--- a/src/collection.cpp
+++ b/src/collection.cpp
@@ -407,19 +407,30 @@ Option Collection::search(const std::string & query, const std::
 
     for(auto id: excluded_ids) {
         LOG(INFO) << id;
-    }*/
+    }
 
-    //LOG(INFO) << "include_ids size: " << include_ids.size();
-    //LOG(INFO) << "Pos 1: " << include_ids[1][0];
+    LOG(INFO) << "include_ids size: " << include_ids.size();
+    for(auto& group: include_ids) {
+        for(uint32_t& seq_id: group.second) {
+            LOG(INFO) << "seq_id: " << seq_id;
+        }
+
+        LOG(INFO) << "----";
+    }
+    */
 
     std::map>> index_to_included_ids;
     std::map> index_to_excluded_ids;
 
     for(const auto& pos_ids: include_ids) {
         size_t position = pos_ids.first;
+        size_t ids_per_pos = std::max(size_t(1), group_limit);
 
-        for(auto seq_id: pos_ids.second) {
+        for(size_t i = 0; i < std::min(ids_per_pos, pos_ids.second.size()); i++) {
+            auto seq_id = pos_ids.second[i];
             auto index_id = (seq_id % num_indices);
             index_to_included_ids[index_id][position].push_back(seq_id);
+            //LOG(INFO) << "Adding seq_id " << seq_id << " to index_id " << index_id;
         }
     }
 
     for(auto seq_id: excluded_ids) {
@@ -709,6 +720,12 @@ Option Collection::search(const std::string & query, const std::
 
     Option index_search_op({});  // stores the last error across all index threads
 
+    // for grouping we have to re-aggregate
+
+    const size_t topster_size = std::max((size_t)1, max_hits);
+    Topster topster(topster_size, group_limit);
+    Topster curated_topster(topster_size, group_limit);
+
     for(Index* index: indices) {
         // wait for the worker
         {
@@ -726,15 +743,8 @@ Option Collection::search(const std::string & query, const std::
             continue;
         }
 
-        for(const std::vector & kv_group: index->search_params->raw_result_kvs) {
-            kv_group[0]->query_index += searched_queries.size();
-            raw_result_kvs.push_back(kv_group);
-        }
-
-        for(const std::vector & kv_group: index->search_params->override_result_kvs) {
-            kv_group[0]->query_index += searched_queries.size();
-            override_result_kvs.push_back(kv_group);
-        }
+        aggregate_topster(searched_queries.size(), topster, index->search_params->topster);
+        aggregate_topster(searched_queries.size(), curated_topster, index->search_params->curated_topster);
 
         searched_queries.insert(searched_queries.end(), index->search_params->searched_queries.begin(),
                                 index->search_params->searched_queries.end());
@@ -783,46 +793,27 @@
Option Collection::search(const std::string & query, const std:: } } + if(!index_search_op.ok()) { + return index_search_op; + } + + topster.sort(); + curated_topster.sort(); + + populate_result_kvs(&topster, raw_result_kvs); + populate_result_kvs(&curated_topster, override_result_kvs); + // for grouping we have to aggregate group set sizes to a count value if(group_limit) { + LOG(INFO) << "override_result_kvs size: " << override_result_kvs.size(); + for(auto& acc_facet: facets) { for(auto& facet_kv: acc_facet.result_map) { facet_kv.second.count = facet_kv.second.groups.size(); } } - total_found = groups_processed.size(); - } - - if(!index_search_op.ok()) { - return index_search_op; - } - - Topster* aggr_topster = nullptr; - - if(group_limit) { - // group by query requires another round of topster-ing - - // needs to be atleast 1 since scoring is mandatory - const size_t topster_size = std::max((size_t)1, max_hits); - aggr_topster = new Topster(topster_size, group_limit); - - for(const auto& kv_group: raw_result_kvs) { - for(KV* kv: kv_group) { - aggr_topster->add(kv); - } - } - - aggr_topster->sort(); - - raw_result_kvs.clear(); - raw_result_kvs.shrink_to_fit(); - - for(auto &group_topster_entry: aggr_topster->group_kv_map) { - Topster* group_topster = group_topster_entry.second; - const std::vector group_kvs(group_topster->kvs, group_topster->kvs+group_topster->size); - raw_result_kvs.emplace_back(group_kvs); - } + total_found = groups_processed.size() + override_result_kvs.size(); } // All fields are sorted descending @@ -1063,8 +1054,6 @@ Option Collection::search(const std::string & query, const std:: delete index->search_params; } - delete aggr_topster; - result["request_params"] = nlohmann::json::object();; result["request_params"]["per_page"] = per_page; result["request_params"]["q"] = query; @@ -1075,6 +1064,43 @@ Option Collection::search(const std::string & query, const std:: return result; } + +void Collection::populate_result_kvs(Topster *topster, std::vector> &result_kvs) const { + if(topster->distinct) { + for(auto &group_topster_entry: topster->group_kv_map) { + Topster* group_topster = group_topster_entry.second; + const std::vector group_kvs(group_topster->kvs, group_topster->kvs+group_topster->size); + result_kvs.emplace_back(group_kvs); + } + + } else { + for(uint32_t t = 0; t < topster->size; t++) { + KV* kv = topster->getKV(t); + result_kvs.push_back({kv}); + } + } +} + +void Collection::aggregate_topster(size_t query_index, Topster &topster, Topster *index_topster) const { + if(index_topster->distinct) { + for(auto &group_topster_entry: index_topster->group_kv_map) { + Topster* group_topster = group_topster_entry.second; + const std::vector group_kvs(group_topster->kvs, group_topster->kvs+group_topster->size); + for(KV* kv: group_kvs) { + kv->query_index += query_index; + topster.add(kv); + } + } + + } else { + for(uint32_t t = 0; t < index_topster->size; t++) { + KV* kv = index_topster->getKV(t); + kv->query_index += query_index; + topster.add(kv); + } + } +} + void Collection::facet_value_to_string(const facet &a_facet, const facet_count_t &facet_count, const nlohmann::json &document, std::string &value) { diff --git a/src/index.cpp b/src/index.cpp index 6fdee499..ce1fecbc 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -1181,34 +1181,6 @@ void Index::search(Option & outcome, topster->sort(); curated_topster->sort(); - // loop through topster and remove elements from included and excluded id lists - - if(topster->distinct) { - for(auto &group_topster_entry: 
topster->group_kv_map) { - Topster* group_topster = group_topster_entry.second; - const std::vector group_kvs(group_topster->kvs, group_topster->kvs+group_topster->size); - raw_result_kvs.emplace_back(group_kvs); - } - - for(auto &curated_topster_entry: curated_topster->group_kv_map) { - Topster* group_topster = curated_topster_entry.second; - const std::vector group_kvs(group_topster->kvs, group_topster->kvs+group_topster->size); - override_result_kvs.emplace_back(group_kvs); - } - - } else { - for(uint32_t t = 0; t < topster->size; t++) { - KV* kv = topster->getKV(t); - raw_result_kvs.push_back({kv}); - } - - for(uint32_t t = 0; t < curated_topster->size; t++) { - KV* kv = curated_topster->getKV(t); - override_result_kvs.push_back({kv}); - } - } - - // add curated IDs to result count all_result_ids_len += curated_topster->size; delete [] filter_ids; diff --git a/test/collection_grouping_test.cpp b/test/collection_grouping_test.cpp new file mode 100644 index 00000000..b6d99948 --- /dev/null +++ b/test/collection_grouping_test.cpp @@ -0,0 +1,71 @@ +#include +#include +#include +#include +#include +#include +#include "collection.h" + +class CollectionGroupingTest : public ::testing::Test { +protected: + Store *store; + CollectionManager & collectionManager = CollectionManager::get_instance(); + + std::vector query_fields; + std::vector sort_fields; + + void setupCollection() { + std::string state_dir_path = "/tmp/typesense_test/collection_sorting"; + LOG(INFO) << "Truncating and creating: " << state_dir_path; + system(("rm -rf "+state_dir_path+" && mkdir -p "+state_dir_path).c_str()); + + store = new Store(state_dir_path); + collectionManager.init(store, 4, "auth_key"); + collectionManager.load(); + } + + virtual void SetUp() { + setupCollection(); + } + + virtual void TearDown() { + delete store; + } +}; + +TEST_F(CollectionGroupingTest, GroupingOnOptionalIntegerArray) { + Collection *coll1; + + std::vector fields = { + field("title", field_types::STRING, false), + field("description", field_types::STRING, true, true), + field("max", field_types::INT32, false), + field("scores", field_types::INT64_ARRAY, true, true), + field("average", field_types::FLOAT, false, true), + field("is_valid", field_types::BOOL, false, true), + }; + + coll1 = collectionManager.get_collection("coll1"); + if(coll1 == nullptr) { + coll1 = collectionManager.create_collection("coll1", fields, "max").get(); + } + + std::ifstream infile(std::string(ROOT_DIR)+"test/optional_fields.jsonl"); + + std::string json_line; + + while (std::getline(infile, json_line)) { + auto add_op = coll1->add(json_line); + if(!add_op.ok()) { + std::cout << add_op.error() << std::endl; + } + ASSERT_TRUE(add_op.ok()); + } + + infile.close(); + + // first must be able to fetch all records (i.e. 
all must have been index) + + auto res = coll1->search("*", {"title"}, "", {}, {}, 0, 10, 1, FREQUENCY, false).get(); + ASSERT_EQ(6, res["found"].get()); +} \ No newline at end of file diff --git a/test/collection_override_test.cpp b/test/collection_override_test.cpp index eaf153f7..8aca5069 100644 --- a/test/collection_override_test.cpp +++ b/test/collection_override_test.cpp @@ -18,7 +18,7 @@ protected: system(("rm -rf "+state_dir_path+" && mkdir -p "+state_dir_path).c_str()); store = new Store(state_dir_path); - collectionManager.init(store, 1, "auth_key"); + collectionManager.init(store, 4, "auth_key"); collectionManager.load(); std::ifstream infile(std::string(ROOT_DIR)+"test/multi_field_documents.jsonl"); @@ -248,7 +248,7 @@ TEST_F(CollectionOverrideTest, ExcludeIncludeFacetFilterQuery) { spp::sparse_hash_set(), spp::sparse_hash_set(), 10, "").get(); - ASSERT_EQ(1, results["found"].get()); + ASSERT_EQ(2, results["found"].get()); ASSERT_EQ(1, results["hits"].size()); ASSERT_EQ("0", results["hits"][0]["document"]["id"].get()); @@ -342,4 +342,38 @@ TEST_F(CollectionOverrideTest, IncludeExcludeHitsQuery) { ASSERT_EQ(8, results["found"].get()); ASSERT_STREQ("8", results["hits"][0]["document"]["id"].get().c_str()); ASSERT_STREQ("6", results["hits"][1]["document"]["id"].get().c_str()); +} + +TEST_F(CollectionOverrideTest, PinnedHitsGrouping) { + std::map> pinned_hits; + pinned_hits[1] = {"6", "8"}; + pinned_hits[2] = {"1"}; + pinned_hits[3] = {"13", "4"}; + + // without any grouping parameter, only the first ID in a position should be picked + // and other IDs should appear in their original positions + + auto results = coll_mul_fields->search("the", {"title"}, "", {"starring"}, {}, 0, 50, 1, FREQUENCY, + false, Index::DROP_TOKENS_THRESHOLD, + spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "starring: will", 30, + "", 10, + pinned_hits, {}).get(); + + ASSERT_EQ(10, results["found"].get()); + ASSERT_STREQ("6", results["hits"][0]["document"]["id"].get().c_str()); + ASSERT_STREQ("1", results["hits"][1]["document"]["id"].get().c_str()); + ASSERT_STREQ("13", results["hits"][2]["document"]["id"].get().c_str()); + ASSERT_STREQ("11", results["hits"][3]["document"]["id"].get().c_str()); + + // with grouping + + results = coll_mul_fields->search("the", {"title"}, "", {"starring"}, {}, 0, 50, 1, FREQUENCY, + false, Index::DROP_TOKENS_THRESHOLD, + spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "starring: will", 30, + "", 10, + pinned_hits, {}, {"cast"}, 2).get(); + + ASSERT_EQ(8, results["found"].get()); } \ No newline at end of file diff --git a/test/collection_test.cpp b/test/collection_test.cpp index dc904ce5..51865d89 100644 --- a/test/collection_test.cpp +++ b/test/collection_test.cpp @@ -2265,7 +2265,6 @@ TEST_F(CollectionTest, OptionalFields) { // try fetching the schema (should contain optional field) nlohmann::json coll_summary = coll1->get_summary_json(); - LOG(INFO) << coll_summary; ASSERT_STREQ("title", coll_summary["fields"][0]["name"].get().c_str()); ASSERT_STREQ("string", coll_summary["fields"][0]["type"].get().c_str()); ASSERT_FALSE(coll_summary["fields"][0]["facet"].get()); From ae6ea0ba6c8076824a0b5422f842c3af2f32b41c Mon Sep 17 00:00:00 2001 From: kishorenc Date: Fri, 19 Jun 2020 06:14:37 +0530 Subject: [PATCH 15/38] Use malloc/free consistently for managing array buffer. 
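The buffer behind `in` is released with free() on some paths (sorted_array::load
already called free(in)) but was being reallocated with new[], and
array::remove_index paired the blocks with the mismatched deallocator; mixing
the two allocators is undefined behavior. Allocation now goes through malloc so
the buffer is managed by a single allocator for its whole life. The rotate
pattern in isolation, with an illustrative size and allocation checks omitted
for brevity (sizeof *out is 1 for uint8_t; the multiplication just mirrors the
patched code):

    #include <cstdint>
    #include <cstdlib>
    #include <cstring>

    int main() {
        const uint32_t size_required = 64;

        // allocate the compressed buffer with malloc, never new[] ...
        uint8_t *in = (uint8_t *) malloc(size_required * sizeof *in);
        memset(in, 0, size_required);

        // ... so that growing the buffer can free() the old block safely
        uint8_t *out = (uint8_t *) malloc(2 * size_required * sizeof *out);
        memcpy(out, in, size_required);
        free(in);
        in = out;

        free(in);
        return 0;
    }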
--- src/array.cpp | 4 ++-- src/sorted_array.cpp | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/array.cpp b/src/array.cpp index bc901ff7..7a009d2c 100644 --- a/src/array.cpp +++ b/src/array.cpp @@ -61,12 +61,12 @@ void array::remove_index(uint32_t start_index, uint32_t end_index) { } uint32_t size_required = (uint32_t) (unsorted_append_size_required(max, new_index) * FOR_GROWTH_FACTOR); - uint8_t *out = new uint8_t[size_required]; + uint8_t *out = (uint8_t *) malloc(size_required * sizeof *out); uint32_t actual_size = for_compress_unsorted(new_array, out, new_index); delete[] curr_array; delete[] new_array; - delete[] in; + free(in); in = out; length = new_index; diff --git a/src/sorted_array.cpp b/src/sorted_array.cpp index 228f6ba8..60f0a962 100644 --- a/src/sorted_array.cpp +++ b/src/sorted_array.cpp @@ -6,7 +6,7 @@ void sorted_array::load(const uint32_t *sorted_array, const uint32_t array_lengt max = array_length > 1 ? sorted_array[array_length-1] : min; uint32_t size_required = (uint32_t) (sorted_append_size_required(max, array_length) * FOR_GROWTH_FACTOR); - uint8_t *out = new uint8_t[size_required]; + uint8_t *out = (uint8_t *) malloc(size_required * sizeof *out); uint32_t actual_size = for_compress_sorted(sorted_array, out, array_length); free(in); From e9557bf233b191936151c9ca2e8217c7fe9d3203 Mon Sep 17 00:00:00 2001 From: kishorenc Date: Fri, 19 Jun 2020 07:18:21 +0530 Subject: [PATCH 16/38] Tear down test suite state properly. --- test/collection_faceting_test.cpp | 3 ++- test/collection_grouping_test.cpp | 18 ++++++++---------- test/collection_manager_test.cpp | 1 + test/collection_override_test.cpp | 1 + test/collection_sorting_test.cpp | 1 + test/collection_test.cpp | 1 + 6 files changed, 14 insertions(+), 11 deletions(-) diff --git a/test/collection_faceting_test.cpp b/test/collection_faceting_test.cpp index 5db5bac6..2df584c4 100644 --- a/test/collection_faceting_test.cpp +++ b/test/collection_faceting_test.cpp @@ -15,7 +15,7 @@ protected: std::vector sort_fields; void setupCollection() { - std::string state_dir_path = "/tmp/typesense_test/collection_sorting"; + std::string state_dir_path = "/tmp/typesense_test/collection_faceting"; LOG(INFO) << "Truncating and creating: " << state_dir_path; system(("rm -rf "+state_dir_path+" && mkdir -p "+state_dir_path).c_str()); @@ -29,6 +29,7 @@ protected: } virtual void TearDown() { + collectionManager.dispose(); delete store; } }; diff --git a/test/collection_grouping_test.cpp b/test/collection_grouping_test.cpp index b6d99948..8a9a3849 100644 --- a/test/collection_grouping_test.cpp +++ b/test/collection_grouping_test.cpp @@ -11,11 +11,8 @@ protected: Store *store; CollectionManager & collectionManager = CollectionManager::get_instance(); - std::vector query_fields; - std::vector sort_fields; - void setupCollection() { - std::string state_dir_path = "/tmp/typesense_test/collection_sorting"; + std::string state_dir_path = "/tmp/typesense_test/collection_grouping"; LOG(INFO) << "Truncating and creating: " << state_dir_path; system(("rm -rf "+state_dir_path+" && mkdir -p "+state_dir_path).c_str()); @@ -29,12 +26,13 @@ protected: } virtual void TearDown() { + collectionManager.dispose(); delete store; } }; TEST_F(CollectionGroupingTest, GroupingOnOptionalIntegerArray) { - Collection *coll1; + Collection *coll_group; std::vector fields = { field("title", field_types::STRING, false), @@ -45,9 +43,9 @@ TEST_F(CollectionGroupingTest, GroupingOnOptionalIntegerArray) { field("is_valid", field_types::BOOL, false, true), 
}; - coll1 = collectionManager.get_collection("coll1"); - if(coll1 == nullptr) { - coll1 = collectionManager.create_collection("coll1", fields, "max").get(); + coll_group = collectionManager.get_collection("coll_group"); + if(coll_group == nullptr) { + coll_group = collectionManager.create_collection("coll_group", fields, "max").get(); } std::ifstream infile(std::string(ROOT_DIR)+"test/optional_fields.jsonl"); @@ -55,7 +53,7 @@ TEST_F(CollectionGroupingTest, GroupingOnOptionalIntegerArray) { std::string json_line; while (std::getline(infile, json_line)) { - auto add_op = coll1->add(json_line); + auto add_op = coll_group->add(json_line); if(!add_op.ok()) { std::cout << add_op.error() << std::endl; } @@ -66,6 +64,6 @@ TEST_F(CollectionGroupingTest, GroupingOnOptionalIntegerArray) { // first must be able to fetch all records (i.e. all must have been index) - auto res = coll1->search("*", {"title"}, "", {}, {}, 0, 10, 1, FREQUENCY, false).get(); + auto res = coll_group->search("*", {"title"}, "", {}, {}, 0, 10, 1, FREQUENCY, false).get(); ASSERT_EQ(6, res["found"].get()); } \ No newline at end of file diff --git a/test/collection_manager_test.cpp b/test/collection_manager_test.cpp index f2b1f03a..4c8ecfbb 100644 --- a/test/collection_manager_test.cpp +++ b/test/collection_manager_test.cpp @@ -40,6 +40,7 @@ protected: virtual void TearDown() { collectionManager.drop_collection("collection1"); + collectionManager.dispose(); delete store; } }; diff --git a/test/collection_override_test.cpp b/test/collection_override_test.cpp index 8aca5069..30d223da 100644 --- a/test/collection_override_test.cpp +++ b/test/collection_override_test.cpp @@ -49,6 +49,7 @@ protected: virtual void TearDown() { collectionManager.drop_collection("coll_mul_fields"); + collectionManager.dispose(); delete store; } }; diff --git a/test/collection_sorting_test.cpp b/test/collection_sorting_test.cpp index 93caff5b..bd983a05 100644 --- a/test/collection_sorting_test.cpp +++ b/test/collection_sorting_test.cpp @@ -29,6 +29,7 @@ protected: } virtual void TearDown() { + collectionManager.dispose(); delete store; } }; diff --git a/test/collection_test.cpp b/test/collection_test.cpp index 51865d89..ec5c3649 100644 --- a/test/collection_test.cpp +++ b/test/collection_test.cpp @@ -56,6 +56,7 @@ protected: virtual void TearDown() { collectionManager.drop_collection("collection"); + collectionManager.dispose(); delete store; } }; From 1216bcb3ccdbd2ce652264aacd666d7ad4040fc1 Mon Sep 17 00:00:00 2001 From: kishorenc Date: Fri, 19 Jun 2020 18:57:49 +0530 Subject: [PATCH 17/38] Wrap grouped hits in a dictionary structure. --- src/collection.cpp | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/src/collection.cpp b/src/collection.cpp index ee23aefe..29238a88 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -869,8 +869,18 @@ Option Collection::search(const std::string & query, const std:: for(long result_kvs_index = start_result_index; result_kvs_index <= end_result_index; result_kvs_index++) { const std::vector & kv_group = result_group_kvs[result_kvs_index]; - nlohmann::json group_hits_array = nlohmann::json::array(); - nlohmann::json& hits_array = (group_limit > 1) ? 
group_hits_array : result["hits"]; + nlohmann::json group_hits; + if(group_limit > 1) { + group_hits["hits"] = nlohmann::json::array(); + std::vector group_keys; + for(const auto& group_key: group_by_fields) { + group_keys.push_back(group_key); + } + + group_hits["group_key"] = StringUtils::join(group_keys, ":"); + } + + nlohmann::json& hits_array = (group_limit > 1) ? group_hits["hits"] : result["hits"]; for(const KV* field_order_kv: kv_group) { const std::string& seq_id_key = get_seq_id_key((uint32_t) field_order_kv->key); @@ -951,7 +961,7 @@ Option Collection::search(const std::string & query, const std:: } if(group_limit > 1) { - result["grouped_hits"].push_back(group_hits_array); + result["grouped_hits"].push_back(group_hits); } } From 7fa6d5c888e5b8efdb99912f491d0a86591a5ee8 Mon Sep 17 00:00:00 2001 From: kishorenc Date: Fri, 19 Jun 2020 21:18:43 +0530 Subject: [PATCH 18/38] Fix ordering in grouped override IDs. --- include/index.h | 8 ++++---- src/collection.cpp | 10 +++++----- src/index.cpp | 30 +++++++++++++++++------------- 3 files changed, 26 insertions(+), 22 deletions(-) diff --git a/include/index.h b/include/index.h index f2eb5fcc..36cae022 100644 --- a/include/index.h +++ b/include/index.h @@ -27,7 +27,7 @@ struct search_args { std::vector search_fields; std::vector filters; std::vector facets; - std::map> included_ids; + std::map> included_ids; std::vector excluded_ids; std::vector sort_fields_std; facet_query_t facet_query; @@ -55,7 +55,7 @@ struct search_args { } search_args(std::string query, std::vector search_fields, std::vector filters, - std::vector facets, std::map> included_ids, std::vector excluded_ids, + std::vector facets, std::map> included_ids, std::vector excluded_ids, std::vector sort_fields_std, facet_query_t facet_query, int num_typos, size_t max_facet_values, size_t max_hits, size_t per_page, size_t page, token_ordering token_order, bool prefix, size_t drop_tokens_threshold, size_t typo_tokens_threshold, @@ -213,7 +213,7 @@ private: const uint32_t indices_length); void collate_included_ids(const std::string & query, const std::string & field, const uint8_t field_id, - const std::map> & included_ids_map, + const std::map> & included_ids_map, Topster* curated_topster, std::vector> & searched_queries); uint64_t facet_token_hash(const field & a_field, const std::string &token); @@ -239,7 +239,7 @@ public: void search(Option & outcome, const std::string & query, const std::vector & search_fields, const std::vector & filters, std::vector & facets, facet_query_t & facet_query, - const std::map> & included_ids_map, + const std::map> & included_ids_map, const std::vector & excluded_ids, const std::vector & sort_fields_std, const int num_typos, Topster* topster, Topster* curated_topster, diff --git a/src/collection.cpp b/src/collection.cpp index 29238a88..0f7336f7 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -419,17 +419,17 @@ Option Collection::search(const std::string & query, const std:: } */ - std::map>> index_to_included_ids; + std::map>> index_to_included_ids; std::map> index_to_excluded_ids; for(const auto& pos_ids: include_ids) { - size_t position = pos_ids.first; + size_t outer_pos = pos_ids.first; size_t ids_per_pos = std::max(size_t(1), group_limit); - for(size_t i = 0; i < std::min(ids_per_pos, pos_ids.second.size()); i++) { - auto seq_id = pos_ids.second[i]; + for(size_t inner_pos = 0; inner_pos < std::min(ids_per_pos, pos_ids.second.size()); inner_pos++) { + auto seq_id = pos_ids.second[inner_pos]; auto index_id = (seq_id % 
num_indices); - index_to_included_ids[index_id][position].push_back(seq_id); + index_to_included_ids[index_id][outer_pos][inner_pos] = seq_id; //LOG(INFO) << "Adding seq_id " << seq_id << " to index_id " << index_id; } } diff --git a/src/index.cpp b/src/index.cpp index ce1fecbc..4e50fb6d 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -1017,8 +1017,9 @@ void Index::run_search() { // after the wait, we own the lock. search(search_params->outcome, search_params->query, search_params->search_fields, - search_params->filters, search_params->facets, search_params->facet_query, search_params->included_ids, - search_params->excluded_ids, search_params->sort_fields_std, search_params->num_typos, + search_params->filters, search_params->facets, search_params->facet_query, + search_params->included_ids, search_params->excluded_ids, + search_params->sort_fields_std, search_params->num_typos, search_params->topster, search_params->curated_topster, search_params->per_page, search_params->page, search_params->token_order, search_params->prefix, search_params->drop_tokens_threshold, @@ -1038,7 +1039,7 @@ void Index::run_search() { } void Index::collate_included_ids(const std::string & query, const std::string & field, const uint8_t field_id, - const std::map> & included_ids_map, + const std::map> & included_ids_map, Topster* curated_topster, std::vector> & searched_queries) { @@ -1067,13 +1068,16 @@ void Index::collate_included_ids(const std::string & query, const std::string & } for(const auto& pos_ids: included_ids_map) { - const size_t pos = pos_ids.first; + const size_t outer_pos = pos_ids.first; - for(size_t i = 0; i < pos_ids.second.size(); i++) { - uint32_t seq_id = pos_ids.second[i]; + for(const auto& index_seq_id: pos_ids.second) { + uint32_t inner_pos = index_seq_id.first; + uint32_t seq_id = index_seq_id.second; - uint64_t distinct_id = pos; // position is the group distinct key - uint64_t match_score = (64000 - i); // index within a group is the match score + uint64_t distinct_id = outer_pos; // outer pos is the group distinct key + uint64_t match_score = (64000 - inner_pos); // inner pos within a group is the match score + + LOG(INFO) << "seq_id: " << seq_id << " - " << match_score; int64_t scores[3]; scores[0] = match_score; @@ -1093,7 +1097,7 @@ void Index::search(Option & outcome, const std::vector & search_fields, const std::vector & filters, std::vector & facets, facet_query_t & facet_query, - const std::map> & included_ids_map, + const std::map> & included_ids_map, const std::vector & excluded_ids, const std::vector & sort_fields_std, const int num_typos, Topster* topster, @@ -1122,10 +1126,10 @@ void Index::search(Option & outcome, std::set curated_ids; std::vector included_ids; - for(const auto& pos_ids: included_ids_map) { - for(const uint32_t id: pos_ids.second) { - curated_ids.insert(id); - included_ids.push_back(id); + for(const auto& outer_pos_ids: included_ids_map) { + for(const auto& inner_pos_seq_id: outer_pos_ids.second) { + curated_ids.insert(inner_pos_seq_id.second); + included_ids.push_back(inner_pos_seq_id.second); } } From de28d33f24366986a6120303e579363ec59c18fe Mon Sep 17 00:00:00 2001 From: kishorenc Date: Sun, 21 Jun 2020 22:00:08 +0530 Subject: [PATCH 19/38] More tests for grouping. 
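These tests exercise grouping basics, compound group keys, pagination, and the
override/pinning path. The ordering they assert for pinned groups relies on the
encoding from the previous patch: a pinned slot's outer position becomes the
group's distinct key, and the i-th ID inside the slot gets a match score of
64000 - i, so sorting a group's members by score descending restores the order
in which the IDs were pinned. A standalone illustration of that encoding, with
CuratedKV as a trimmed stand-in for the real KV:

    #include <algorithm>
    #include <cstdint>
    #include <iostream>
    #include <vector>

    struct CuratedKV {
        uint32_t seq_id;
        uint64_t distinct_id;   // outer position of the pinned slot
        uint64_t match_score;   // 64000 - inner position within the slot
    };

    int main() {
        // pinned_hits[1] = {6, 8}: one slot, two IDs in a fixed order
        const std::vector<uint32_t> pinned_ids = {6, 8};

        std::vector<CuratedKV> group;
        for(size_t inner_pos = 0; inner_pos < pinned_ids.size(); inner_pos++) {
            group.push_back({pinned_ids[inner_pos], 1, 64000 - inner_pos});
        }

        // sorting by match score descending recovers the pinned order: 6, then 8
        std::sort(group.begin(), group.end(),
                  [](const CuratedKV& a, const CuratedKV& b) {
                      return a.match_score > b.match_score;
                  });

        for(const auto& kv: group) {
            std::cout << "seq_id " << kv.seq_id << "\n";
        }
    }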
--- src/collection.cpp | 8 +- src/index.cpp | 2 +- test/collection_grouping_test.cpp | 289 +++++++++++++++++++++++++++--- test/collection_override_test.cpp | 12 ++ test/group_documents.jsonl | 12 ++ 5 files changed, 290 insertions(+), 33 deletions(-) create mode 100644 test/group_documents.jsonl diff --git a/src/collection.cpp b/src/collection.cpp index 0f7336f7..1c244855 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -351,8 +351,6 @@ void Collection::populate_overrides(std::string query, include_ids[hit.position].push_back(seq_id); } } - - break; } } @@ -805,8 +803,6 @@ Option Collection::search(const std::string & query, const std:: // for grouping we have to aggregate group set sizes to a count value if(group_limit) { - LOG(INFO) << "override_result_kvs size: " << override_result_kvs.size(); - for(auto& acc_facet: facets) { for(auto& facet_kv: acc_facet.result_map) { facet_kv.second.count = facet_kv.second.groups.size(); @@ -877,7 +873,7 @@ Option Collection::search(const std::string & query, const std:: group_keys.push_back(group_key); } - group_hits["group_key"] = StringUtils::join(group_keys, ":"); + group_hits["group_key"] = StringUtils::join(group_keys, ","); } nlohmann::json& hits_array = (group_limit > 1) ? group_hits["hits"] : result["hits"]; @@ -1391,7 +1387,7 @@ Option Collection::add_override(const override_t & override) { return Option(500, "Error while storing the override on disk."); } - overrides[override.id] = override; + overrides.emplace(override.id, override); return Option(200); } diff --git a/src/index.cpp b/src/index.cpp index 4e50fb6d..6384228f 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -1077,7 +1077,7 @@ void Index::collate_included_ids(const std::string & query, const std::string & uint64_t distinct_id = outer_pos; // outer pos is the group distinct key uint64_t match_score = (64000 - inner_pos); // inner pos within a group is the match score - LOG(INFO) << "seq_id: " << seq_id << " - " << match_score; + // LOG(INFO) << "seq_id: " << seq_id << " - " << match_score; int64_t scores[3]; scores[0] = match_score; diff --git a/test/collection_grouping_test.cpp b/test/collection_grouping_test.cpp index 8a9a3849..f3e0e6a2 100644 --- a/test/collection_grouping_test.cpp +++ b/test/collection_grouping_test.cpp @@ -10,6 +10,7 @@ class CollectionGroupingTest : public ::testing::Test { protected: Store *store; CollectionManager & collectionManager = CollectionManager::get_instance(); + Collection *coll_group; void setupCollection() { std::string state_dir_path = "/tmp/typesense_test/collection_grouping"; @@ -19,6 +20,33 @@ protected: store = new Store(state_dir_path); collectionManager.init(store, 4, "auth_key"); collectionManager.load(); + + std::vector fields = { + field("title", field_types::STRING, false), + field("brand", field_types::STRING, true, true), + field("size", field_types::INT32, true, false), + field("colors", field_types::STRING_ARRAY, true, false), + field("rating", field_types::FLOAT, true, false) + }; + + coll_group = collectionManager.get_collection("coll_group"); + if(coll_group == nullptr) { + coll_group = collectionManager.create_collection("coll_group", fields, "rating").get(); + } + + std::ifstream infile(std::string(ROOT_DIR)+"test/group_documents.jsonl"); + + std::string json_line; + + while (std::getline(infile, json_line)) { + auto add_op = coll_group->add(json_line); + if(!add_op.ok()) { + std::cout << add_op.error() << std::endl; + } + ASSERT_TRUE(add_op.ok()); + } + + infile.close(); } virtual void SetUp() { @@ -31,39 
+59,248 @@ protected: } }; -TEST_F(CollectionGroupingTest, GroupingOnOptionalIntegerArray) { - Collection *coll_group; +TEST_F(CollectionGroupingTest, GroupingBasics) { + // group by size (int32) + auto res = coll_group->search("*", {}, "", {"brand"}, {}, 0, 50, 1, FREQUENCY, + false, Index::DROP_TOKENS_THRESHOLD, + spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30, + "", 10, + {}, {}, {"size"}, 2).get(); - std::vector fields = { - field("title", field_types::STRING, false), - field("description", field_types::STRING, true, true), - field("max", field_types::INT32, false), - field("scores", field_types::INT64_ARRAY, true, true), - field("average", field_types::FLOAT, false, true), - field("is_valid", field_types::BOOL, false, true), + ASSERT_EQ(3, res["found"].get()); + ASSERT_EQ(3, res["grouped_hits"].size()); + ASSERT_STREQ("size", res["grouped_hits"][0]["group_key"].get().c_str()); + + ASSERT_FLOAT_EQ(4.8, res["grouped_hits"][0]["hits"][0]["document"]["rating"].get()); + ASSERT_STREQ("5", res["grouped_hits"][0]["hits"][0]["document"]["id"].get().c_str()); + ASSERT_FLOAT_EQ(4.3, res["grouped_hits"][0]["hits"][1]["document"]["rating"].get()); + ASSERT_STREQ("1", res["grouped_hits"][0]["hits"][1]["document"]["id"].get().c_str()); + + ASSERT_FLOAT_EQ(4.8, res["grouped_hits"][1]["hits"][0]["document"]["rating"].get()); + ASSERT_STREQ("4", res["grouped_hits"][1]["hits"][0]["document"]["id"].get().c_str()); + ASSERT_FLOAT_EQ(4.6, res["grouped_hits"][1]["hits"][1]["document"]["rating"].get()); + ASSERT_STREQ("3", res["grouped_hits"][1]["hits"][1]["document"]["id"].get().c_str()); + + ASSERT_FLOAT_EQ(4.6, res["grouped_hits"][2]["hits"][0]["document"]["rating"].get()); + ASSERT_STREQ("2", res["grouped_hits"][2]["hits"][0]["document"]["id"].get().c_str()); + ASSERT_FLOAT_EQ(4.4, res["grouped_hits"][2]["hits"][1]["document"]["rating"].get()); + ASSERT_STREQ("8", res["grouped_hits"][2]["hits"][1]["document"]["id"].get().c_str()); + + ASSERT_STREQ("brand", res["facet_counts"][0]["field_name"].get().c_str()); + ASSERT_EQ(3, (int) res["facet_counts"][0]["counts"][0]["count"]); + ASSERT_STREQ("Omega", res["facet_counts"][0]["counts"][0]["value"].get().c_str()); + + ASSERT_EQ(3, (int) res["facet_counts"][0]["counts"][1]["count"]); + ASSERT_STREQ("Beta", res["facet_counts"][0]["counts"][1]["value"].get().c_str()); + + ASSERT_EQ(2, (int) res["facet_counts"][0]["counts"][2]["count"]); + ASSERT_STREQ("Xorp", res["facet_counts"][0]["counts"][2]["value"].get().c_str()); + + ASSERT_EQ(1, (int) res["facet_counts"][0]["counts"][3]["count"]); + ASSERT_STREQ("Zeta", res["facet_counts"][0]["counts"][3]["value"].get().c_str()); + + // group by rating (float) and sort by size + std::vector sort_size = {sort_by("size", "DESC")}; + res = coll_group->search("*", {}, "", {"brand"}, sort_size, 0, 50, 1, FREQUENCY, + false, Index::DROP_TOKENS_THRESHOLD, + spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "brand: omeg", 30, + "", 10, + {}, {}, {"rating"}, 2).get(); + + // 7 unique ratings + ASSERT_EQ(7, res["found"].get()); + ASSERT_EQ(7, res["grouped_hits"].size()); + ASSERT_STREQ("rating", res["grouped_hits"][0]["group_key"].get().c_str()); + + ASSERT_EQ(12, res["grouped_hits"][0]["hits"][0]["document"]["size"].get()); + ASSERT_STREQ("8", res["grouped_hits"][0]["hits"][0]["document"]["id"].get().c_str()); + ASSERT_FLOAT_EQ(4.4, res["grouped_hits"][0]["hits"][0]["document"]["rating"].get()); + + ASSERT_EQ(12, res["grouped_hits"][1]["hits"][0]["document"]["size"].get()); + ASSERT_STREQ("6", 
res["grouped_hits"][1]["hits"][0]["document"]["id"].get().c_str()); + ASSERT_FLOAT_EQ(4.3, res["grouped_hits"][1]["hits"][0]["document"]["rating"].get()); + + ASSERT_EQ(11, res["grouped_hits"][1]["hits"][1]["document"]["size"].get()); + ASSERT_STREQ("1", res["grouped_hits"][1]["hits"][1]["document"]["id"].get().c_str()); + ASSERT_FLOAT_EQ(4.3, res["grouped_hits"][1]["hits"][1]["document"]["rating"].get()); + + ASSERT_EQ(10, res["grouped_hits"][5]["hits"][0]["document"]["size"].get()); + ASSERT_STREQ("9", res["grouped_hits"][5]["hits"][0]["document"]["id"].get().c_str()); + ASSERT_FLOAT_EQ(4.1, res["grouped_hits"][5]["hits"][0]["document"]["rating"].get()); + + ASSERT_EQ(10, res["grouped_hits"][6]["hits"][0]["document"]["size"].get()); + ASSERT_STREQ("0", res["grouped_hits"][6]["hits"][0]["document"]["id"].get().c_str()); + ASSERT_FLOAT_EQ(4.5, res["grouped_hits"][6]["hits"][0]["document"]["rating"].get()); + + ASSERT_STREQ("brand", res["facet_counts"][0]["field_name"].get().c_str()); + ASSERT_EQ(3, (int) res["facet_counts"][0]["counts"][0]["count"]); + ASSERT_STREQ("Omega", res["facet_counts"][0]["counts"][0]["value"].get().c_str()); + ASSERT_STREQ("Omega", res["facet_counts"][0]["counts"][0]["highlighted"].get().c_str()); +} + +TEST_F(CollectionGroupingTest, GroupingCompoundKey) { + // group by size+brand (int32, string) + auto res = coll_group->search("*", {}, "", {"brand"}, {}, 0, 50, 1, FREQUENCY, + false, Index::DROP_TOKENS_THRESHOLD, + spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30, + "", 10, + {}, {}, {"size", "brand"}, 2).get(); + + ASSERT_EQ(10, res["found"].get()); + ASSERT_EQ(10, res["grouped_hits"].size()); + ASSERT_STREQ("size,brand", res["grouped_hits"][0]["group_key"].get().c_str()); + + ASSERT_EQ(1, res["grouped_hits"][0]["hits"].size()); + ASSERT_FLOAT_EQ(4.8, res["grouped_hits"][0]["hits"][0]["document"]["rating"].get()); + ASSERT_STREQ("5", res["grouped_hits"][0]["hits"][0]["document"]["id"].get().c_str()); + + ASSERT_EQ(1, res["grouped_hits"][1]["hits"].size()); + ASSERT_FLOAT_EQ(4.8, res["grouped_hits"][1]["hits"][0]["document"]["rating"].get()); + ASSERT_STREQ("4", res["grouped_hits"][1]["hits"][0]["document"]["id"].get().c_str()); + + ASSERT_EQ(2, res["grouped_hits"][2]["hits"].size()); + ASSERT_FLOAT_EQ(4.6, res["grouped_hits"][2]["hits"][0]["document"]["rating"].get()); + ASSERT_STREQ("3", res["grouped_hits"][2]["hits"][0]["document"]["id"].get().c_str()); + ASSERT_FLOAT_EQ(4.5, res["grouped_hits"][2]["hits"][1]["document"]["rating"].get()); + ASSERT_STREQ("0", res["grouped_hits"][2]["hits"][1]["document"]["id"].get().c_str()); + + ASSERT_STREQ("brand", res["facet_counts"][0]["field_name"].get().c_str()); + ASSERT_EQ(3, (int) res["facet_counts"][0]["counts"][0]["count"]); + ASSERT_STREQ("Omega", res["facet_counts"][0]["counts"][0]["value"].get().c_str()); + + ASSERT_EQ(3, (int) res["facet_counts"][0]["counts"][1]["count"]); + ASSERT_STREQ("Beta", res["facet_counts"][0]["counts"][1]["value"].get().c_str()); + + ASSERT_EQ(2, (int) res["facet_counts"][0]["counts"][2]["count"]); + ASSERT_STREQ("Xorp", res["facet_counts"][0]["counts"][2]["value"].get().c_str()); + + ASSERT_EQ(1, (int) res["facet_counts"][0]["counts"][3]["count"]); + ASSERT_STREQ("Zeta", res["facet_counts"][0]["counts"][3]["value"].get().c_str()); + + // pagination with page=2, per_page=2 + res = coll_group->search("*", {}, "", {"brand"}, {}, 0, 2, 2, FREQUENCY, + false, Index::DROP_TOKENS_THRESHOLD, + spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30, + "", 10, + {}, {}, 
{"size", "brand"}, 2).get(); + + + // 3rd result from previous assertion will be in the first position + ASSERT_EQ(2, res["grouped_hits"][0]["hits"].size()); + ASSERT_FLOAT_EQ(4.6, res["grouped_hits"][0]["hits"][0]["document"]["rating"].get()); + ASSERT_STREQ("3", res["grouped_hits"][0]["hits"][0]["document"]["id"].get().c_str()); + ASSERT_FLOAT_EQ(4.5, res["grouped_hits"][0]["hits"][1]["document"]["rating"].get()); + ASSERT_STREQ("0", res["grouped_hits"][0]["hits"][1]["document"]["id"].get().c_str()); + + // total count and facet counts should be the same + ASSERT_EQ(10, res["found"].get()); + ASSERT_EQ(2, res["grouped_hits"].size()); + ASSERT_STREQ("size,brand", res["grouped_hits"][0]["group_key"].get().c_str()); + + ASSERT_STREQ("brand", res["facet_counts"][0]["field_name"].get().c_str()); + ASSERT_EQ(3, (int) res["facet_counts"][0]["counts"][0]["count"]); + ASSERT_STREQ("Omega", res["facet_counts"][0]["counts"][0]["value"].get().c_str()); + + ASSERT_EQ(3, (int) res["facet_counts"][0]["counts"][1]["count"]); + ASSERT_STREQ("Beta", res["facet_counts"][0]["counts"][1]["value"].get().c_str()); + + ASSERT_EQ(2, (int) res["facet_counts"][0]["counts"][2]["count"]); + ASSERT_STREQ("Xorp", res["facet_counts"][0]["counts"][2]["value"].get().c_str()); + + ASSERT_EQ(1, (int) res["facet_counts"][0]["counts"][3]["count"]); + ASSERT_STREQ("Zeta", res["facet_counts"][0]["counts"][3]["value"].get().c_str()); +} + +TEST_F(CollectionGroupingTest, GroupingWithSingleDistinct) { + auto res = coll_group->search("*", {}, "", {"brand"}, {}, 0, 50, 1, FREQUENCY, + false, Index::DROP_TOKENS_THRESHOLD, + spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30, + "", 10, + {}, {}, {"brand"}, 1).get(); + + ASSERT_EQ(5, res["found"].get()); + ASSERT_EQ(5, res["hits"].size()); + + ASSERT_STREQ("4", res["hits"][0]["document"]["id"].get().c_str()); + ASSERT_STREQ("2", res["hits"][1]["document"]["id"].get().c_str()); + ASSERT_STREQ("8", res["hits"][2]["document"]["id"].get().c_str()); + ASSERT_STREQ("10", res["hits"][3]["document"]["id"].get().c_str()); // unbranded + ASSERT_STREQ("9", res["hits"][4]["document"]["id"].get().c_str()); + + // facet counts should each be 1, including unbranded + ASSERT_STREQ("brand", res["facet_counts"][0]["field_name"].get().c_str()); + + for(size_t i=0; i < 4; i++) { + ASSERT_EQ(1, (int) res["facet_counts"][0]["counts"][i]["count"]); + } +} + +TEST_F(CollectionGroupingTest, GroupingWithArrayFieldAndOverride) { + nlohmann::json override_json_include = { + {"id", "include-rule"}, + { + "rule", { + {"query", "shirt"}, + {"match", override_t::MATCH_EXACT} + } + } }; - coll_group = collectionManager.get_collection("coll_group"); - if(coll_group == nullptr) { - coll_group = collectionManager.create_collection("coll_group", fields, "max").get(); - } + override_json_include["includes"] = nlohmann::json::array(); + override_json_include["includes"][0] = nlohmann::json::object(); + override_json_include["includes"][0]["id"] = "11"; + override_json_include["includes"][0]["position"] = 1; - std::ifstream infile(std::string(ROOT_DIR)+"test/optional_fields.jsonl"); + override_json_include["includes"][1] = nlohmann::json::object(); + override_json_include["includes"][1]["id"] = "10"; + override_json_include["includes"][1]["position"] = 1; - std::string json_line; - - while (std::getline(infile, json_line)) { - auto add_op = coll_group->add(json_line); - if(!add_op.ok()) { - std::cout << add_op.error() << std::endl; + nlohmann::json override_json_exclude = { + {"id", "exclude-rule"}, + { + "rule", 
{ + {"query", "shirt"}, + {"match", override_t::MATCH_EXACT} + } } - ASSERT_TRUE(add_op.ok()); - } + }; + override_json_exclude["excludes"] = nlohmann::json::array(); + override_json_exclude["excludes"][0] = nlohmann::json::object(); + override_json_exclude["excludes"][0]["id"] = "2"; - infile.close(); + override_t override1(override_json_include); + override_t override2(override_json_exclude); + Option ov1_op = coll_group->add_override(override1); + Option ov2_op = coll_group->add_override(override2); - // first must be able to fetch all records (i.e. all must have been index) + ASSERT_TRUE(ov1_op.ok()); + ASSERT_TRUE(ov2_op.ok()); - auto res = coll_group->search("*", {"title"}, "", {}, {}, 0, 10, 1, FREQUENCY, false).get(); - ASSERT_EQ(6, res["found"].get()); + auto res = coll_group->search("shirt", {"title"}, "", {"brand"}, {}, 0, 50, 1, FREQUENCY, + false, Index::DROP_TOKENS_THRESHOLD, + spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30, + "", 10, + {}, {}, {"colors"}, 2).get(); + + ASSERT_EQ(4, res["found"].get()); + ASSERT_EQ(4, res["grouped_hits"].size()); + ASSERT_STREQ("colors", res["grouped_hits"][0]["group_key"].get().c_str()); + + ASSERT_STREQ("11", res["grouped_hits"][0]["hits"][0]["document"]["id"].get().c_str()); + ASSERT_STREQ("10", res["grouped_hits"][0]["hits"][1]["document"]["id"].get().c_str()); + + ASSERT_STREQ("5", res["grouped_hits"][1]["hits"][0]["document"]["id"].get().c_str()); + ASSERT_STREQ("3", res["grouped_hits"][1]["hits"][1]["document"]["id"].get().c_str()); + + ASSERT_STREQ("4", res["grouped_hits"][2]["hits"][0]["document"]["id"].get().c_str()); + ASSERT_STREQ("0", res["grouped_hits"][2]["hits"][1]["document"]["id"].get().c_str()); + + ASSERT_EQ(1, res["grouped_hits"][3]["hits"].size()); + ASSERT_STREQ("8", res["grouped_hits"][3]["hits"][0]["document"]["id"].get().c_str()); } \ No newline at end of file diff --git a/test/collection_override_test.cpp b/test/collection_override_test.cpp index 30d223da..a36fb85a 100644 --- a/test/collection_override_test.cpp +++ b/test/collection_override_test.cpp @@ -377,4 +377,16 @@ TEST_F(CollectionOverrideTest, PinnedHitsGrouping) { pinned_hits, {}, {"cast"}, 2).get(); ASSERT_EQ(8, results["found"].get()); + + ASSERT_STREQ("cast", results["grouped_hits"][0]["group_key"].get().c_str()); + ASSERT_STREQ("6", results["grouped_hits"][0]["hits"][0]["document"]["id"].get().c_str()); + ASSERT_STREQ("8", results["grouped_hits"][0]["hits"][1]["document"]["id"].get().c_str()); + + ASSERT_STREQ("1", results["grouped_hits"][1]["hits"][0]["document"]["id"].get().c_str()); + + ASSERT_STREQ("13", results["grouped_hits"][2]["hits"][0]["document"]["id"].get().c_str()); + ASSERT_STREQ("4", results["grouped_hits"][2]["hits"][1]["document"]["id"].get().c_str()); + + ASSERT_STREQ("11", results["grouped_hits"][3]["hits"][0]["document"]["id"].get().c_str()); + ASSERT_STREQ("16", results["grouped_hits"][4]["hits"][0]["document"]["id"].get().c_str()); } \ No newline at end of file diff --git a/test/group_documents.jsonl b/test/group_documents.jsonl new file mode 100644 index 00000000..5d673ff0 --- /dev/null +++ b/test/group_documents.jsonl @@ -0,0 +1,12 @@ +{"title": "Omega Casual Poplin Shirt", "brand": "Omega", "size": 10, "colors": ["white", "blue"], "rating": 4.5} +{"title": "Omega Casual Poplin Shirt", "brand": "Omega", "size": 11, "colors": ["white", "blue"], "rating": 4.3} +{"title": "Omega Casual Poplin Shirt", "brand": "Omega", "size": 12, "colors": ["white", "blue"], "rating": 4.6} +{"title": "Omega Casual Poplin Shirt", 
"brand": "Omega", "size": 10, "colors": ["blue"], "rating": 4.6} +{"title": "Beta Casual Poplin Shirt", "brand": "Beta", "size": 10, "colors": ["white", "blue"], "rating": 4.8} +{"title": "Beta Casual Poplin Shirt", "brand": "Beta", "size": 11, "colors": ["blue"], "rating": 4.8} +{"title": "Beta Casual Poplin Shirt", "brand": "Beta", "size": 12, "colors": ["white", "blue"], "rating": 4.3} +{"title": "Xorp Casual Shirt", "brand": "Xorp", "size": 10, "colors": ["white", "blue"], "rating": 4.3} +{"title": "Xorp Casual Shirt", "brand": "Xorp", "size": 12, "colors": ["white", "red"], "rating": 4.4} +{"title": "Zeta Casual Shirt", "brand": "Zeta", "size": 10, "colors": ["white", "blue"], "rating": 4.1} +{"title": "White Casual Shirt", "size": 10, "colors": ["white"], "rating": 4.3} +{"title": "White Casual Shirt", "size": 10, "colors": ["white"], "rating": 3.3} \ No newline at end of file From 933cbe9bb0502e7c4e7b06f2f9782ca0e465aa1f Mon Sep 17 00:00:00 2001 From: kishorenc Date: Mon, 22 Jun 2020 07:29:20 +0530 Subject: [PATCH 20/38] More tests. --- cmake/Glog.cmake | 0 src/collection.cpp | 4 ++++ src/core_api.cpp | 5 ----- test/collection_grouping_test.cpp | 14 ++++++++++++++ test/collection_sorting_test.cpp | 1 + test/collection_test.cpp | 8 +++++++- 6 files changed, 26 insertions(+), 6 deletions(-) create mode 100644 cmake/Glog.cmake diff --git a/cmake/Glog.cmake b/cmake/Glog.cmake new file mode 100644 index 00000000..e69de29b diff --git a/src/collection.cpp b/src/collection.cpp index 1c244855..fbe0a376 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -390,6 +390,10 @@ Option Collection::search(const std::string & query, const std:: const std::vector& group_by_fields, const size_t group_limit) { + if(query != "*" && search_fields.empty()) { + return Option(400, "No search fields specified for the query."); + } + std::vector excluded_ids; std::map> include_ids; // position => list of IDs populate_overrides(query, pinned_hits, hidden_hits, include_ids, excluded_ids); diff --git a/src/core_api.cpp b/src/core_api.cpp index 03f89a2f..a52b1a7a 100644 --- a/src/core_api.cpp +++ b/src/core_api.cpp @@ -252,11 +252,6 @@ bool get_search(http_req & req, http_res & res) { return false; } - if(req.params.count(QUERY_BY) == 0 && req.params[QUERY] != "*") { - res.set_400(std::string("Parameter `") + QUERY_BY + "` is required."); - return false; - } - if(req.params.count(MAX_FACET_VALUES) == 0) { req.params[MAX_FACET_VALUES] = "10"; } diff --git a/test/collection_grouping_test.cpp b/test/collection_grouping_test.cpp index f3e0e6a2..b1fcd8b6 100644 --- a/test/collection_grouping_test.cpp +++ b/test/collection_grouping_test.cpp @@ -303,4 +303,18 @@ TEST_F(CollectionGroupingTest, GroupingWithArrayFieldAndOverride) { ASSERT_EQ(1, res["grouped_hits"][3]["hits"].size()); ASSERT_STREQ("8", res["grouped_hits"][3]["hits"][0]["document"]["id"].get().c_str()); + + // assert facet counts + ASSERT_STREQ("brand", res["facet_counts"][0]["field_name"].get().c_str()); + ASSERT_EQ(2, (int) res["facet_counts"][0]["counts"][0]["count"]); + ASSERT_STREQ("Omega", res["facet_counts"][0]["counts"][0]["value"].get().c_str()); + + ASSERT_EQ(2, (int) res["facet_counts"][0]["counts"][1]["count"]); + ASSERT_STREQ("Xorp", res["facet_counts"][0]["counts"][1]["value"].get().c_str()); + + ASSERT_EQ(2, (int) res["facet_counts"][0]["counts"][2]["count"]); + ASSERT_STREQ("Beta", res["facet_counts"][0]["counts"][2]["value"].get().c_str()); + + ASSERT_EQ(1, (int) res["facet_counts"][0]["counts"][3]["count"]); + ASSERT_STREQ("Zeta", 
res["facet_counts"][0]["counts"][3]["value"].get().c_str()); } \ No newline at end of file diff --git a/test/collection_sorting_test.cpp b/test/collection_sorting_test.cpp index bd983a05..e552f307 100644 --- a/test/collection_sorting_test.cpp +++ b/test/collection_sorting_test.cpp @@ -249,6 +249,7 @@ TEST_F(CollectionSortingTest, ThreeSortFieldsLimit) { sort_by("min", "DESC"), }; + query_fields = {"title"}; auto res_op = coll1->search("the", query_fields, "", {}, sort_fields_desc, 0, 10, 1, FREQUENCY, false); ASSERT_FALSE(res_op.ok()); diff --git a/test/collection_test.cpp b/test/collection_test.cpp index ec5c3649..12748830 100644 --- a/test/collection_test.cpp +++ b/test/collection_test.cpp @@ -458,10 +458,16 @@ TEST_F(CollectionTest, WildcardQuery) { } // wildcard query should not require a search field - results_op = collection->search("*", {}, "", {}, sort_fields, 0, 3, 1, FREQUENCY, false).get(); + results_op = collection->search("*", {}, "", {}, sort_fields, 0, 3, 1, FREQUENCY, false); ASSERT_TRUE(results_op.ok()); + results = results_op.get(); ASSERT_EQ(3, results["hits"].size()); ASSERT_EQ(25, results["found"].get()); + + // non-wildcard query should require a search field + results_op = collection->search("the", {}, "", {}, sort_fields, 0, 3, 1, FREQUENCY, false); + ASSERT_FALSE(results_op.ok()); + ASSERT_STREQ("No search fields specified for the query.", results_op.error().c_str()); } TEST_F(CollectionTest, PrefixSearching) { From e314ec23e6c6bd13cbc6a7337c40cb958e5b52e1 Mon Sep 17 00:00:00 2001 From: kishorenc Date: Mon, 22 Jun 2020 18:17:52 +0530 Subject: [PATCH 21/38] Move to glog. --- .circleci/config.yml | 2 +- CMakeLists.txt | 3 ++- cmake/Modules/FindGlog.cmake | 40 ++++++++++++++++++++++++++++++ cmake/braft.cmake | 2 +- cmake/brpc.cmake | 2 +- docker-build.sh | 8 +++--- docker/development.Dockerfile | 26 ++++++++++++------- docker/patches/brpc_cmakelists.txt | 4 +-- include/logger.h | 2 +- src/main/typesense_server.cpp | 6 ----- src/raft_server.cpp | 4 +-- src/typesense_server_utils.cpp | 21 ++++++---------- 12 files changed, 79 insertions(+), 41 deletions(-) create mode 100644 cmake/Modules/FindGlog.cmake diff --git a/.circleci/config.yml b/.circleci/config.yml index 63d2d996..8609ef77 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -2,7 +2,7 @@ version: 2 jobs: build: docker: - - image: typesense/typesense-development:22-MAR-2020-5 + - image: typesense/typesense-development:22-JUNE-2020-1 environment: - PROJECT_DIR: /typesense - TYPESENSE_VERSION: $CIRCLE_BRANCH-$CIRCLE_SHA1 diff --git a/CMakeLists.txt b/CMakeLists.txt index b8c4e9f8..358f2502 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -43,6 +43,7 @@ FIND_PACKAGE(ICU REQUIRED) FIND_PACKAGE(Protobuf REQUIRED) FIND_PACKAGE(LevelDB REQUIRED) FIND_PACKAGE(gflags REQUIRED) +FIND_PACKAGE(glog REQUIRED) message("OpenSSL library: ${OPENSSL_LIBRARIES}") @@ -152,7 +153,7 @@ set(ICU_ALL_LIBRARIES ${ICU_I18N_LIBRARIES} ${ICU_LIBRARIES} ${ICU_DATA_LIBRARIE set(CORE_LIBS h2o-evloop iconv ${CURL_LIBRARIES} for ${ICU_ALL_LIBRARIES} ${G3LOGGER_LIBRARIES} ${ROCKSDB_LIBS} ${OPENSSL_LIBRARIES} pthread dl ${STD_LIB}) -set(CORE_LIBS braft brpc ${LevelDB_LIBRARIES} ${CORE_LIBS} ${GFLAGS_LIBRARIES} ${PROTOBUF_LIBRARIES} ${SYSTEM_LIBS}) +set(CORE_LIBS braft brpc ${LevelDB_LIBRARIES} ${GLOG_LIBRARIES} ${CORE_LIBS} ${GFLAGS_LIBRARIES} ${PROTOBUF_LIBRARIES} ${SYSTEM_LIBS}) target_link_libraries(typesense-core ${CORE_LIBS}) target_link_libraries(typesense-server ${CORE_LIBS}) diff --git a/cmake/Modules/FindGlog.cmake 
b/cmake/Modules/FindGlog.cmake new file mode 100644 index 00000000..688e30ce --- /dev/null +++ b/cmake/Modules/FindGlog.cmake @@ -0,0 +1,40 @@ +# - Try to find Glog +# +# The following variables are optionally searched for defaults +# GLOG_ROOT_DIR: Base directory where all GLOG components are found +# +# The following are set after configuration is done: +# GLOG_FOUND +# GLOG_INCLUDE_DIRS +# GLOG_LIBRARIES + +include(FindPackageHandleStandardArgs) + +if (NOT DEFINED GLOG_ROOT) + message("set GLOG_ROOT========================") + set (GLOG_ROOT /usr /usr/local /usr/include/) +endif (NOT DEFINED GLOG_ROOT) + +#set(GLOG_ROOT_DIR "" CACHE PATH "Folder contains Google glog") + +find_path(GLOG_INCLUDE_DIR glog/logging.h + PATHS + ${GLOG_ROOT_DIR} + PATH_SUFFIXES + src) + +find_library(GLOG_LIBRARY glog libglog + PATHS + ${GLOG_ROOT_DIR} + PATH_SUFFIXES + .libs + lib + lib64) + +find_package_handle_standard_args(GLOG DEFAULT_MSG + GLOG_INCLUDE_DIR GLOG_LIBRARY) + +if(GLOG_FOUND) + set(GLOG_INCLUDE_DIRS ${GLOG_INCLUDE_DIR}) + set(GLOG_LIBRARIES ${GLOG_LIBRARY}) +endif() \ No newline at end of file diff --git a/cmake/braft.cmake b/cmake/braft.cmake index 082f95c9..b72a1e25 100644 --- a/cmake/braft.cmake +++ b/cmake/braft.cmake @@ -1,4 +1,4 @@ -set(BRAFT_VERSION 0a9ec3f) +set(BRAFT_VERSION fb27e63) set(BRAFT_NAME braft-${BRAFT_VERSION}) set(BRAFT_TAR_PATH ${DEP_ROOT_DIR}/${BRAFT_NAME}.tar.gz) diff --git a/cmake/brpc.cmake b/cmake/brpc.cmake index 4c97adf1..9ea1927a 100644 --- a/cmake/brpc.cmake +++ b/cmake/brpc.cmake @@ -1,4 +1,4 @@ -set(BRPC_VERSION 23c66e3) +set(BRPC_VERSION 2f8fc37d) set(BRPC_NAME brpc-${BRPC_VERSION}) set(BRPC_TAR_PATH ${DEP_ROOT_DIR}/${BRPC_NAME}.tar.gz) diff --git a/docker-build.sh b/docker-build.sh index 16bd3f26..94164a05 100755 --- a/docker-build.sh +++ b/docker-build.sh @@ -22,13 +22,13 @@ if [[ "$@" == *"--depclean"* ]]; then fi #echo "Creating development image..." -#docker build --file $PROJECT_DIR/docker/development.Dockerfile --tag typesense/typesense-development:latest $PROJECT_DIR/docker +#docker build --file $PROJECT_DIR/docker/development.Dockerfile --tag typesense/typesense-development:22-JUNE-2020-1 $PROJECT_DIR/docker echo "Building Typesense $TYPESENSE_VERSION..." -docker run -it -v $PROJECT_DIR:/typesense typesense/typesense-development cmake -DTYPESENSE_VERSION=$TYPESENSE_VERSION \ +docker run -it -v $PROJECT_DIR:/typesense typesense/typesense-development:22-JUNE-2020-1 cmake -DTYPESENSE_VERSION=$TYPESENSE_VERSION \ -DCMAKE_BUILD_TYPE=Release -H/typesense -B/typesense/$BUILD_DIR -docker run -it -v $PROJECT_DIR:/typesense typesense/typesense-development make typesense-server -C/typesense/$BUILD_DIR +docker run -it -v $PROJECT_DIR:/typesense typesense/typesense-development:22-JUNE-2020-1 make typesense-server -C/typesense/$BUILD_DIR if [[ "$@" == *"--build-deploy-image"* ]]; then echo "Creating deployment image for Typesense $TYPESENSE_VERSION server ..." 
@@ -60,7 +60,7 @@ if [[ "$@" == *"--package-libs"* ]]; then fi # #if [[ "$@" == *"--create-deb-upload"* ]]; then -# docker run -it -v $PROJECT_DIR:/typesense typesense/typesense-development cmake -DTYPESENSE_VERSION=$TYPESENSE_VERSION \ +# docker run -it -v $PROJECT_DIR:/typesense typesense/typesense-development:22-JUNE-2020-1 cmake -DTYPESENSE_VERSION=$TYPESENSE_VERSION \ # -DCMAKE_BUILD_TYPE=Debug -H/typesense -B/typesense/$BUILD_DIR #fi diff --git a/docker/development.Dockerfile b/docker/development.Dockerfile index 790a3246..8b6d0cc4 100644 --- a/docker/development.Dockerfile +++ b/docker/development.Dockerfile @@ -55,25 +55,33 @@ ADD https://github.com/protocolbuffers/protobuf/releases/download/v3.11.4/protob RUN tar -C /opt -xf /opt/protobuf-cpp-3.11.4.tar.gz && chown -R root:root /opt/protobuf-3.11.4 RUN cd /opt/protobuf-3.11.4 && ./configure --disable-shared && make -j8 && make check && make install && rm -rf /usr/local/lib/*.so* -ADD https://github.com/google/leveldb/archive/1.22.tar.gz /opt/leveldb-1.22.tar.gz.tar.gz -RUN tar -C /opt -xf /opt/leveldb-1.22.tar.gz.tar.gz +ADD https://github.com/google/leveldb/archive/1.22.tar.gz /opt/leveldb-1.22.tar.gz +RUN tar -C /opt -xf /opt/leveldb-1.22.tar.gz RUN mkdir -p /opt/leveldb-1.22/build && cd /opt/leveldb-1.22/build && cmake -DCMAKE_BUILD_TYPE=Release .. && \ cmake --build . && make install && rm -rf /usr/local/lib/*.so* +ADD https://github.com/google/glog/archive/v0.4.0.tar.gz /opt/glog-0.4.0.tar.gz +RUN tar -C /opt -xf /opt/glog-0.4.0.tar.gz +RUN mkdir -p /opt/glog-0.4.0/build && cd /opt/glog-0.4.0/build && \ + cmake -DBUILD_TESTING=0 -DWITH_GFLAGS=ON -DWITH_UNWIND=OFF .. && \ + cmake --build . && make install && rm -rf /usr/local/lib/*.so* + ADD https://github.com/apache/incubator-brpc/archive/0.9.7-rc03.tar.gz /opt/brpc-0.9.7-rc03.tar.gz RUN tar -C /opt -xf /opt/brpc-0.9.7-rc03.tar.gz COPY patches/brpc_cmakelists.txt /opt/incubator-brpc-0.9.7-rc03/src/CMakeLists.txt RUN chown root:root /opt/incubator-brpc-0.9.7-rc03/src/CMakeLists.txt RUN mkdir -p /opt/incubator-brpc-0.9.7-rc03/build && cd /opt/incubator-brpc-0.9.7-rc03/build && \ - cmake -DWITH_DEBUG_SYMBOLS=OFF .. && make -j8 && make install && rm -rf /usr/local/lib/*.so* && \ + cmake -DWITH_DEBUG_SYMBOLS=OFF -DWITH_GLOG=ON .. && \ + make -j8 && make install && rm -rf /usr/local/lib/*.so* && \ rm -rf /opt/incubator-brpc-0.9.7-rc03/build/output/bin -ADD https://github.com/baidu/braft/archive/v1.1.0.tar.gz /opt/braft-v1.1.0.tar.gz -RUN tar -C /opt -xf /opt/braft-v1.1.0.tar.gz -COPY patches/braft_cmakelists.txt /opt/braft-1.1.0/src/CMakeLists.txt -RUN chown root:root /opt/braft-1.1.0/src/CMakeLists.txt -RUN mkdir -p /opt/braft-1.1.0/build && cd /opt/braft-1.1.0/build && \ - cmake -DWITH_DEBUG_SYMBOLS=ON .. && make -j8 && make install && rm -rf /usr/local/lib/*.so* +ADD https://github.com/baidu/braft/archive/v1.1.1.tar.gz /opt/braft-v1.1.1.tar.gz +RUN tar -C /opt -xf /opt/braft-v1.1.1.tar.gz +COPY patches/braft_cmakelists.txt /opt/braft-1.1.1/src/CMakeLists.txt +RUN chown root:root /opt/braft-1.1.1/src/CMakeLists.txt +RUN mkdir -p /opt/braft-1.1.1/build && cd /opt/braft-1.1.1/build && \ + cmake -DWITH_DEBUG_SYMBOLS=ON -DBRPC_WITH_GLOG=ON .. 
&& make -j8 && make install && rm -rf /usr/local/lib/*.so* && \ + rm -rf /opt/braft-1.1.1/build/output/bin ENV CC /usr/local/gcc-6.4.0/bin/gcc ENV CXX /usr/local/gcc-6.4.0/bin/g++ diff --git a/docker/patches/brpc_cmakelists.txt b/docker/patches/brpc_cmakelists.txt index 03c05a11..687b3b7d 100644 --- a/docker/patches/brpc_cmakelists.txt +++ b/docker/patches/brpc_cmakelists.txt @@ -35,8 +35,8 @@ add_library(brpc-static STATIC $ $ $) -if(BRPC_WITH_THRIFT) - target_link_libraries(brpc-static thrift) +if(BRPC_WITH_GLOG) + target_link_libraries(brpc-static ${GLOG_LIB}) endif() SET_TARGET_PROPERTIES(brpc-static PROPERTIES OUTPUT_NAME brpc CLEAN_DIRECT_OUTPUT 1) diff --git a/include/logger.h b/include/logger.h index c8a9e62e..01adc553 100644 --- a/include/logger.h +++ b/include/logger.h @@ -2,4 +2,4 @@ #include #include -#include +#include \ No newline at end of file diff --git a/src/main/typesense_server.cpp b/src/main/typesense_server.cpp index b76942e3..5a3ef7d5 100644 --- a/src/main/typesense_server.cpp +++ b/src/main/typesense_server.cpp @@ -55,13 +55,7 @@ void replica_server_routes() { server->get("/health", get_health); } -namespace logging { - DECLARE_bool(log_year); -} - int main(int argc, char **argv) { - logging::FLAGS_log_year = true; - Config config; cmdline::parser options; diff --git a/src/raft_server.cpp b/src/raft_server.cpp index 70a0b472..dc1483e9 100644 --- a/src/raft_server.cpp +++ b/src/raft_server.cpp @@ -297,7 +297,7 @@ int ReplicationState::on_snapshot_load(braft::SnapshotReader* reader) { return -1; } - LOG(TRACE) << "rm " << store->get_state_dir_path() << " success"; + LOG(INFO) << "rm " << store->get_state_dir_path() << " success"; std::string snapshot_path = reader->get_path(); snapshot_path.append(std::string("/") + db_snapshot_name); @@ -308,7 +308,7 @@ int ReplicationState::on_snapshot_load(braft::SnapshotReader* reader) { return -1; } - LOG(TRACE) << "copy snapshot " << snapshot_path << " to " << store->get_state_dir_path() << " success"; + LOG(INFO) << "copy snapshot " << snapshot_path << " to " << store->get_state_dir_path() << " success"; return init_db(); } diff --git a/src/typesense_server_utils.cpp b/src/typesense_server_utils.cpp index 1a940518..938e2a45 100644 --- a/src/typesense_server_utils.cpp +++ b/src/typesense_server_utils.cpp @@ -133,35 +133,31 @@ int init_logger(Config & config, const std::string & server_version) { signal(SIGILL, catch_crash); signal(SIGSEGV, catch_crash); - logging::LoggingSettings log_settings; - // we can install new signal handlers only after overriding above signal(SIGINT, catch_interrupt); signal(SIGTERM, catch_interrupt); + google::InitGoogleLogging("typesense"); + std::string log_dir = config.get_log_dir(); - std::string log_path; if(log_dir.empty()) { // use console logger if log dir is not specified - log_settings.logging_dest = logging::LOG_TO_SYSTEM_DEBUG_LOG; + FLAGS_logtostderr = true; } else { if(!directory_exists(log_dir)) { std::cerr << "Typesense failed to start. " << "Log directory " << log_dir << " does not exist."; return 1; } - log_settings.logging_dest = logging::LOG_TO_FILE; - log_path = log_dir + "/" + "typesense.log"; - log_settings.log_file = log_path.c_str(); + std::cout << "Log directory is configured as: " << log_dir << std::endl; + std::string log_path_prefix = log_dir + "/" + "typesense-log-"; + google::SetLogDestination(google::INFO, log_path_prefix.c_str()); - LOG(INFO) << "Starting Typesense " << server_version << ". 
Log directory is configured as: " - << log_dir << std::endl; + // flush log levels above -1 immediately (INFO=0) + FLAGS_logbuflevel = -1; } - logging::InitLogging(log_settings); - logging::SetMinLogLevel(0); - return 0; } @@ -275,7 +271,6 @@ int start_raft_server(ReplicationState& replication_state, const std::string& st } int run_server(const Config & config, const std::string & version, void (*master_server_routes)()) { - LOG(INFO) << "Starting Typesense " << version << std::flush; quit_raft_service = false; From abc3ae2a4f1ae65511877a6a29c6a8c0bfd22873 Mon Sep 17 00:00:00 2001 From: kishorenc Date: Mon, 22 Jun 2020 18:25:17 +0530 Subject: [PATCH 22/38] Temporary clean build. --- .circleci/config.yml | 7 +++++-- CMakeLists.txt | 2 ++ cmake/Modules/FindGlog.cmake | 1 - 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 8609ef77..6bae9b30 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -16,10 +16,13 @@ jobs: keys: - external-Linux-cache-{{ .Branch }}-{{ checksum "last-changed-git-sha-for-dependency-listing" }} - external-Linux-cache-{{ .Branch }} - - external-Linux-cache + - external-Linux-cache + - run: + name: debug + command: ls -latr /usr/local/lib/ - run: name: build - command: $PROJECT_DIR/build.sh + command: $PROJECT_DIR/build.sh --clean - save_cache: key: external-Linux-cache-{{ .Branch }}-{{ checksum "last-changed-git-sha-for-dependency-listing" }} paths: diff --git a/CMakeLists.txt b/CMakeLists.txt index 358f2502..63abad81 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -153,6 +153,8 @@ set(ICU_ALL_LIBRARIES ${ICU_I18N_LIBRARIES} ${ICU_LIBRARIES} ${ICU_DATA_LIBRARIE set(CORE_LIBS h2o-evloop iconv ${CURL_LIBRARIES} for ${ICU_ALL_LIBRARIES} ${G3LOGGER_LIBRARIES} ${ROCKSDB_LIBS} ${OPENSSL_LIBRARIES} pthread dl ${STD_LIB}) +message("GLOG library is: ${GLOG_LIBRARIES}") + set(CORE_LIBS braft brpc ${LevelDB_LIBRARIES} ${GLOG_LIBRARIES} ${CORE_LIBS} ${GFLAGS_LIBRARIES} ${PROTOBUF_LIBRARIES} ${SYSTEM_LIBS}) target_link_libraries(typesense-core ${CORE_LIBS}) diff --git a/cmake/Modules/FindGlog.cmake b/cmake/Modules/FindGlog.cmake index 688e30ce..1962427f 100644 --- a/cmake/Modules/FindGlog.cmake +++ b/cmake/Modules/FindGlog.cmake @@ -11,7 +11,6 @@ include(FindPackageHandleStandardArgs) if (NOT DEFINED GLOG_ROOT) - message("set GLOG_ROOT========================") set (GLOG_ROOT /usr /usr/local /usr/include/) endif (NOT DEFINED GLOG_ROOT) From b9b3c58b2cd2a753ff2c05653f01aee6c3613a62 Mon Sep 17 00:00:00 2001 From: kishorenc Date: Mon, 22 Jun 2020 19:32:10 +0530 Subject: [PATCH 23/38] Fix build. 
--- .circleci/config.yml | 2 +- CMakeLists.txt | 6 +++--- cmake/Modules/FindGlog.cmake | 39 ------------------------------------ 3 files changed, 4 insertions(+), 43 deletions(-) delete mode 100644 cmake/Modules/FindGlog.cmake diff --git a/.circleci/config.yml b/.circleci/config.yml index 6bae9b30..a5d2dc2f 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -22,7 +22,7 @@ jobs: command: ls -latr /usr/local/lib/ - run: name: build - command: $PROJECT_DIR/build.sh --clean + command: $PROJECT_DIR/build.sh - save_cache: key: external-Linux-cache-{{ .Branch }}-{{ checksum "last-changed-git-sha-for-dependency-listing" }} paths: diff --git a/CMakeLists.txt b/CMakeLists.txt index 63abad81..c65f4e47 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -88,8 +88,10 @@ link_directories(${DEP_ROOT_DIR}/${FOR_NAME}) link_directories(${DEP_ROOT_DIR}/${H2O_NAME}/build) link_directories(${DEP_ROOT_DIR}/${ROCKSDB_NAME}) link_directories(${DEP_ROOT_DIR}/${ICONV_NAME}/lib/.libs) +if (APPLE) link_directories(${DEP_ROOT_DIR}/${BRPC_NAME}/lib) link_directories(${DEP_ROOT_DIR}/${BRAFT_NAME}/lib) +endif() # Write dependency libraries to a file file(WRITE ${DEP_ROOT_DIR}/libs.txt "") @@ -153,9 +155,7 @@ set(ICU_ALL_LIBRARIES ${ICU_I18N_LIBRARIES} ${ICU_LIBRARIES} ${ICU_DATA_LIBRARIE set(CORE_LIBS h2o-evloop iconv ${CURL_LIBRARIES} for ${ICU_ALL_LIBRARIES} ${G3LOGGER_LIBRARIES} ${ROCKSDB_LIBS} ${OPENSSL_LIBRARIES} pthread dl ${STD_LIB}) -message("GLOG library is: ${GLOG_LIBRARIES}") - -set(CORE_LIBS braft brpc ${LevelDB_LIBRARIES} ${GLOG_LIBRARIES} ${CORE_LIBS} ${GFLAGS_LIBRARIES} ${PROTOBUF_LIBRARIES} ${SYSTEM_LIBS}) +set(CORE_LIBS braft brpc ${LevelDB_LIBRARIES} glog ${CORE_LIBS} ${GFLAGS_LIBRARIES} ${PROTOBUF_LIBRARIES} ${SYSTEM_LIBS}) target_link_libraries(typesense-core ${CORE_LIBS}) target_link_libraries(typesense-server ${CORE_LIBS}) diff --git a/cmake/Modules/FindGlog.cmake b/cmake/Modules/FindGlog.cmake deleted file mode 100644 index 1962427f..00000000 --- a/cmake/Modules/FindGlog.cmake +++ /dev/null @@ -1,39 +0,0 @@ -# - Try to find Glog -# -# The following variables are optionally searched for defaults -# GLOG_ROOT_DIR: Base directory where all GLOG components are found -# -# The following are set after configuration is done: -# GLOG_FOUND -# GLOG_INCLUDE_DIRS -# GLOG_LIBRARIES - -include(FindPackageHandleStandardArgs) - -if (NOT DEFINED GLOG_ROOT) - set (GLOG_ROOT /usr /usr/local /usr/include/) -endif (NOT DEFINED GLOG_ROOT) - -#set(GLOG_ROOT_DIR "" CACHE PATH "Folder contains Google glog") - -find_path(GLOG_INCLUDE_DIR glog/logging.h - PATHS - ${GLOG_ROOT_DIR} - PATH_SUFFIXES - src) - -find_library(GLOG_LIBRARY glog libglog - PATHS - ${GLOG_ROOT_DIR} - PATH_SUFFIXES - .libs - lib - lib64) - -find_package_handle_standard_args(GLOG DEFAULT_MSG - GLOG_INCLUDE_DIR GLOG_LIBRARY) - -if(GLOG_FOUND) - set(GLOG_INCLUDE_DIRS ${GLOG_INCLUDE_DIR}) - set(GLOG_LIBRARIES ${GLOG_LIBRARY}) -endif() \ No newline at end of file From 17a6194d6e115a87e4996204170f6da065e83b07 Mon Sep 17 00:00:00 2001 From: kishorenc Date: Mon, 22 Jun 2020 19:53:12 +0530 Subject: [PATCH 24/38] Fix a compile warning. 
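The likely culprit: `scores` was declared uninitialized in Index::score_results, yet all three slots are read when results are ranked, while only the slots backed by a configured sort field are ever written. The `= {0}` aggregate initializer value-initializes every element to the default score of 0. A minimal sketch of the pattern (hypothetical function name and values, not the real score_results signature):

    #include <cstdint>
    #include <cstdio>

    int64_t sum_scores(size_t num_sort_fields) {
        int64_t scores[3] = {0};  // without `= {0}`, the unwritten tail holds garbage
        for(size_t i = 0; i < num_sort_fields && i < 3; i++) {
            scores[i] = 100;      // stand-in for the real sort field values
        }
        // all three slots are read, mirroring how the whole array is consumed
        // even when fewer than three sort fields are configured
        return scores[0] + scores[1] + scores[2];
    }

    int main() {
        std::printf("%lld\n", (long long) sum_scores(1));  // prints 100
    }
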
--- .circleci/config.yml | 3 --- src/index.cpp | 2 +- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index a5d2dc2f..765bc4ae 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -17,9 +17,6 @@ jobs: - external-Linux-cache-{{ .Branch }}-{{ checksum "last-changed-git-sha-for-dependency-listing" }} - external-Linux-cache-{{ .Branch }} - external-Linux-cache - - run: - name: debug - command: ls -latr /usr/local/lib/ - run: name: build command: $PROJECT_DIR/build.sh diff --git a/src/index.cpp b/src/index.cpp index 6384228f..66b0a299 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -1463,7 +1463,7 @@ void Index::score_results(const std::vector & sort_fields, const uint16 } const int64_t default_score = 0; - int64_t scores[3]; + int64_t scores[3] = {0}; // avoiding loop if(sort_fields.size() > 0) { From 57a43a0b210c74f449dc00d36119bd6e44a78b97 Mon Sep 17 00:00:00 2001 From: kishorenc Date: Tue, 23 Jun 2020 06:56:58 +0530 Subject: [PATCH 25/38] Upgrade glog to avail static log file name feature. --- .circleci/config.yml | 2 +- docker-build.sh | 8 ++++---- docker/development.Dockerfile | 15 ++++++++------- src/typesense_server_utils.cpp | 13 +++++++++---- 4 files changed, 22 insertions(+), 16 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 765bc4ae..a55fe2d4 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -2,7 +2,7 @@ version: 2 jobs: build: docker: - - image: typesense/typesense-development:22-JUNE-2020-1 + - image: typesense/typesense-development:23-JUNE-2020-1 environment: - PROJECT_DIR: /typesense - TYPESENSE_VERSION: $CIRCLE_BRANCH-$CIRCLE_SHA1 diff --git a/docker-build.sh b/docker-build.sh index 94164a05..e2ab7d71 100755 --- a/docker-build.sh +++ b/docker-build.sh @@ -22,13 +22,13 @@ if [[ "$@" == *"--depclean"* ]]; then fi #echo "Creating development image..." -#docker build --file $PROJECT_DIR/docker/development.Dockerfile --tag typesense/typesense-development:22-JUNE-2020-1 $PROJECT_DIR/docker +#docker build --file $PROJECT_DIR/docker/development.Dockerfile --tag typesense/typesense-development:23-JUNE-2020-1 $PROJECT_DIR/docker echo "Building Typesense $TYPESENSE_VERSION..." -docker run -it -v $PROJECT_DIR:/typesense typesense/typesense-development:22-JUNE-2020-1 cmake -DTYPESENSE_VERSION=$TYPESENSE_VERSION \ +docker run -it -v $PROJECT_DIR:/typesense typesense/typesense-development:23-JUNE-2020-1 cmake -DTYPESENSE_VERSION=$TYPESENSE_VERSION \ -DCMAKE_BUILD_TYPE=Release -H/typesense -B/typesense/$BUILD_DIR -docker run -it -v $PROJECT_DIR:/typesense typesense/typesense-development:22-JUNE-2020-1 make typesense-server -C/typesense/$BUILD_DIR +docker run -it -v $PROJECT_DIR:/typesense typesense/typesense-development:23-JUNE-2020-1 make typesense-server -C/typesense/$BUILD_DIR if [[ "$@" == *"--build-deploy-image"* ]]; then echo "Creating deployment image for Typesense $TYPESENSE_VERSION server ..." 
@@ -60,7 +60,7 @@ if [[ "$@" == *"--package-libs"* ]]; then fi # #if [[ "$@" == *"--create-deb-upload"* ]]; then -# docker run -it -v $PROJECT_DIR:/typesense typesense/typesense-development:22-JUNE-2020-1 cmake -DTYPESENSE_VERSION=$TYPESENSE_VERSION \ +# docker run -it -v $PROJECT_DIR:/typesense typesense/typesense-development:23-JUNE-2020-1 cmake -DTYPESENSE_VERSION=$TYPESENSE_VERSION \ # -DCMAKE_BUILD_TYPE=Debug -H/typesense -B/typesense/$BUILD_DIR #fi diff --git a/docker/development.Dockerfile b/docker/development.Dockerfile index 8b6d0cc4..ad2acb33 100644 --- a/docker/development.Dockerfile +++ b/docker/development.Dockerfile @@ -60,9 +60,10 @@ RUN tar -C /opt -xf /opt/leveldb-1.22.tar.gz RUN mkdir -p /opt/leveldb-1.22/build && cd /opt/leveldb-1.22/build && cmake -DCMAKE_BUILD_TYPE=Release .. && \ cmake --build . && make install && rm -rf /usr/local/lib/*.so* -ADD https://github.com/google/glog/archive/v0.4.0.tar.gz /opt/glog-0.4.0.tar.gz -RUN tar -C /opt -xf /opt/glog-0.4.0.tar.gz -RUN mkdir -p /opt/glog-0.4.0/build && cd /opt/glog-0.4.0/build && \ +ADD https://github.com/google/glog/archive/0a2e593.tar.gz /opt/glog-0a2e593.tar.gz +RUN tar -C /opt -xf /opt/glog-0a2e593.tar.gz +RUN mkdir -p /opt/glog-0a2e5931bd5ff22fd3bf8999eb8ce776f159cda6/bld && \ + cd /opt/glog-0a2e5931bd5ff22fd3bf8999eb8ce776f159cda6/bld && \ cmake -DBUILD_TESTING=0 -DWITH_GFLAGS=ON -DWITH_UNWIND=OFF .. && \ cmake --build . && make install && rm -rf /usr/local/lib/*.so* @@ -70,18 +71,18 @@ ADD https://github.com/apache/incubator-brpc/archive/0.9.7-rc03.tar.gz /opt/brpc RUN tar -C /opt -xf /opt/brpc-0.9.7-rc03.tar.gz COPY patches/brpc_cmakelists.txt /opt/incubator-brpc-0.9.7-rc03/src/CMakeLists.txt RUN chown root:root /opt/incubator-brpc-0.9.7-rc03/src/CMakeLists.txt -RUN mkdir -p /opt/incubator-brpc-0.9.7-rc03/build && cd /opt/incubator-brpc-0.9.7-rc03/build && \ +RUN mkdir -p /opt/incubator-brpc-0.9.7-rc03/bld && cd /opt/incubator-brpc-0.9.7-rc03/bld && \ cmake -DWITH_DEBUG_SYMBOLS=OFF -DWITH_GLOG=ON .. && \ make -j8 && make install && rm -rf /usr/local/lib/*.so* && \ - rm -rf /opt/incubator-brpc-0.9.7-rc03/build/output/bin + rm -rf /opt/incubator-brpc-0.9.7-rc03/bld/output/bin ADD https://github.com/baidu/braft/archive/v1.1.1.tar.gz /opt/braft-v1.1.1.tar.gz RUN tar -C /opt -xf /opt/braft-v1.1.1.tar.gz COPY patches/braft_cmakelists.txt /opt/braft-1.1.1/src/CMakeLists.txt RUN chown root:root /opt/braft-1.1.1/src/CMakeLists.txt -RUN mkdir -p /opt/braft-1.1.1/build && cd /opt/braft-1.1.1/build && \ +RUN mkdir -p /opt/braft-1.1.1/bld && cd /opt/braft-1.1.1/bld && \ cmake -DWITH_DEBUG_SYMBOLS=ON -DBRPC_WITH_GLOG=ON .. 
&& make -j8 && make install && rm -rf /usr/local/lib/*.so* && \ - rm -rf /opt/braft-1.1.1/build/output/bin + rm -rf /opt/braft-1.1.1/bld/output/bin ENV CC /usr/local/gcc-6.4.0/bin/gcc ENV CXX /usr/local/gcc-6.4.0/bin/g++ diff --git a/src/typesense_server_utils.cpp b/src/typesense_server_utils.cpp index 938e2a45..15fb9d67 100644 --- a/src/typesense_server_utils.cpp +++ b/src/typesense_server_utils.cpp @@ -150,12 +150,17 @@ int init_logger(Config & config, const std::string & server_version) { return 1; } - std::cout << "Log directory is configured as: " << log_dir << std::endl; - std::string log_path_prefix = log_dir + "/" + "typesense-log-"; - google::SetLogDestination(google::INFO, log_path_prefix.c_str()); - // flush log levels above -1 immediately (INFO=0) FLAGS_logbuflevel = -1; + + // available only on glog master (ensures that log file name is constant) + FLAGS_timestamp_in_logfile_name = false; + + std::string log_path = log_dir + "/" + "typesense.log"; + google::SetLogDestination(google::INFO, log_path.c_str()); + google::SetLogSymlink(google::INFO, ""); + + std::cout << "Log directory is configured as: " << log_dir << std::endl; } return 0; From a7fad633eb70abc7e9d65b1647a1d46387c0add2 Mon Sep 17 00:00:00 2001 From: kishorenc Date: Wed, 24 Jun 2020 07:57:22 +0530 Subject: [PATCH 26/38] Use mt19937_64 directly for consistency. --- src/main/main.cpp | 2 +- src/string_utils.cpp | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/main/main.cpp b/src/main/main.cpp index 80606fe1..fc5ad311 100644 --- a/src/main/main.cpp +++ b/src/main/main.cpp @@ -21,7 +21,7 @@ int main(int argc, char* argv[]) { Store *store = new Store(state_dir_path); CollectionManager & collectionManager = CollectionManager::get_instance(); - collectionManager.init(store, 4, "abcd", "123"); + collectionManager.init(store, 4, "abcd"); collectionManager.load(); std::vector fields_to_index = { diff --git a/src/string_utils.cpp b/src/string_utils.cpp index 2782c2b6..6de06f4c 100644 --- a/src/string_utils.cpp +++ b/src/string_utils.cpp @@ -1,8 +1,8 @@ #include "string_utils.h" #include #include -#include #include +#include std::string lower_and_no_special_chars(const std::string & str) { std::stringstream ss; @@ -57,14 +57,14 @@ std::string StringUtils::randstring(size_t length, uint64_t seed) { "abcdefghijklmnopqrstuvwxyz" "ABCDEFGHIJKLMNOPQRSTUVWXYZ"; - thread_local static std::mt19937 rg(seed); - thread_local static std::uniform_int_distribution pick(0, sizeof(chrs) - 2); + thread_local static std::mt19937_64 mt_rand(seed); std::string s; s.reserve(length); while(length--) { - s += chrs[pick(rg)]; + size_t index = (mt_rand() % (sizeof(chrs) - 1)); + s += chrs[index]; } return s; From 8e1338626e488630781c1b4e7a035b3a6a1afc76 Mon Sep 17 00:00:00 2001 From: kishorenc Date: Wed, 24 Jun 2020 19:51:07 +0530 Subject: [PATCH 27/38] Log all levels to a single file. 
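glog logs a message both to its own severity's file and to the files of all lower severities, so the INFO destination already receives WARNING, ERROR and FATAL lines; giving the higher severities an empty destination only suppresses the redundant per-level files. A self-contained sketch of the configuration (the hard-coded path stands in for the path derived from --log-dir, and FLAGS_timestamp_in_logfile_name assumes the glog master snapshot from the earlier upgrade commit):

    #include <glog/logging.h>

    int main() {
        google::InitGoogleLogging("typesense");

        FLAGS_logbuflevel = -1;                   // flush every severity immediately
        FLAGS_timestamp_in_logfile_name = false;  // keep the file name constant

        // one file for all levels: INFO's file already contains WARNING,
        // ERROR and FATAL; the blank destinations disable the extra files
        google::SetLogDestination(google::INFO, "/tmp/typesense.log");
        google::SetLogDestination(google::WARNING, "");
        google::SetLogDestination(google::ERROR, "");
        google::SetLogDestination(google::FATAL, "");

        LOG(WARNING) << "lands in /tmp/typesense.log, not in a separate file";
        return 0;
    }
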
--- src/typesense_server_utils.cpp | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/typesense_server_utils.cpp b/src/typesense_server_utils.cpp index 15fb9d67..d0ae17da 100644 --- a/src/typesense_server_utils.cpp +++ b/src/typesense_server_utils.cpp @@ -157,8 +157,14 @@ int init_logger(Config & config, const std::string & server_version) { FLAGS_timestamp_in_logfile_name = false; std::string log_path = log_dir + "/" + "typesense.log"; + + // will log level INFO and up to the given log file google::SetLogDestination(google::INFO, log_path.c_str()); - google::SetLogSymlink(google::INFO, ""); + + // don't create separate log files for each level + google::SetLogDestination(google::WARNING, ""); + google::SetLogDestination(google::ERROR, ""); + google::SetLogDestination(google::FATAL, ""); std::cout << "Log directory is configured as: " << log_dir << std::endl; } From ba80f06001453a4af50c4ba65c860d43e6115ce4 Mon Sep 17 00:00:00 2001 From: kishorenc Date: Wed, 24 Jun 2020 20:34:12 +0530 Subject: [PATCH 28/38] Generate whole key without relying on seed based generation. --- include/http_data.h | 16 +++++----------- include/string_utils.h | 2 +- src/core_api.cpp | 2 +- src/http_server.cpp | 7 +++++++ src/string_utils.cpp | 8 ++++---- src/typesense_server_utils.cpp | 5 ++++- 6 files changed, 22 insertions(+), 18 deletions(-) diff --git a/include/http_data.h b/include/http_data.h index 3a917956..cd306204 100644 --- a/include/http_data.h +++ b/include/http_data.h @@ -117,16 +117,16 @@ struct http_req { uint64_t route_hash; std::map params; std::string body; - uint64_t seed; + std::string metadata; - http_req(): route_hash(1), seed(random_uint64_t()) { + http_req(): route_hash(1) { } http_req(h2o_req_t* _req, const std::string & http_method, uint64_t route_hash, const std::map & params, std::string body): _req(_req), http_method(http_method), route_hash(route_hash), params(params), - body(body), seed(random_uint64_t()) { + body(body) { } @@ -136,7 +136,7 @@ struct http_req { nlohmann::json content = nlohmann::json::parse(serialized_content); route_hash = content["route_hash"]; body = content["body"]; - seed = content["seed"]; + metadata = content.count("metadata") != 0 ? content["metadata"] : ""; for (nlohmann::json::iterator it = content["params"].begin(); it != content["params"].end(); ++it) { params.emplace(it.key(), it.value()); @@ -150,16 +150,10 @@ struct http_req { content["route_hash"] = route_hash; content["params"] = params; content["body"] = body; - content["seed"] = seed; + content["metadata"] = metadata; return content.dump(); } - - uint64_t random_uint64_t() { - thread_local std::mt19937 rg(std::random_device{}()); - thread_local std::uniform_int_distribution pick(0, std::numeric_limits::max()); - return pick(rg); - } }; struct request_response { diff --git a/include/string_utils.h b/include/string_utils.h index 5a19b3ca..62005f7e 100644 --- a/include/string_utils.h +++ b/include/string_utils.h @@ -234,7 +234,7 @@ struct StringUtils { return hash != std::numeric_limits::max() ? 
hash : (std::numeric_limits::max()-1); } - static std::string randstring(size_t length, uint64_t seed); + static std::string randstring(size_t length); static std::string hmac(const std::string& key, const std::string& msg); }; \ No newline at end of file diff --git a/src/core_api.cpp b/src/core_api.cpp index a52b1a7a..7d5294d3 100644 --- a/src/core_api.cpp +++ b/src/core_api.cpp @@ -894,7 +894,7 @@ bool post_create_key(http_req &req, http_res &res) { return false; } - const std::string &rand_key = StringUtils::randstring(AuthManager::KEY_LEN, req.seed); + const std::string &rand_key = req.metadata; api_key_t api_key( rand_key, diff --git a/src/http_server.cpp b/src/http_server.cpp index 8058e32d..2b8b6dc1 100644 --- a/src/http_server.cpp +++ b/src/http_server.cpp @@ -6,6 +6,7 @@ #include #include #include +#include #include "raft_server.h" #include "logger.h" @@ -371,6 +372,12 @@ int HttpServer::catch_all_handler(h2o_handler_t *_self, h2o_req_t *req) { } // routes match and is an authenticated request + // do any additional pre-request middleware operations here + if(rpath->action == "keys:create") { + // we enrich incoming request with a random API key here so that leader and replicas will use the same key + request->metadata = StringUtils::randstring(AuthManager::KEY_LEN); + } + // for writes, we defer to replication_state if(http_method != "GET") { self->http_server->get_replication_state()->write(request, response); diff --git a/src/string_utils.cpp b/src/string_utils.cpp index 6de06f4c..302831dd 100644 --- a/src/string_utils.cpp +++ b/src/string_utils.cpp @@ -52,19 +52,19 @@ void StringUtils::unicode_normalize(std::string & str) const { str.assign(lower_and_no_special_chars(out.str())); } -std::string StringUtils::randstring(size_t length, uint64_t seed) { +std::string StringUtils::randstring(size_t length) { static auto& chrs = "0123456789" "abcdefghijklmnopqrstuvwxyz" "ABCDEFGHIJKLMNOPQRSTUVWXYZ"; - thread_local static std::mt19937_64 mt_rand(seed); + thread_local std::mt19937 rg(std::random_device{}()); + thread_local std::uniform_int_distribution pick(0, sizeof(chrs) - 2); std::string s; s.reserve(length); while(length--) { - size_t index = (mt_rand() % (sizeof(chrs) - 1)); - s += chrs[index]; + s += chrs[pick(rg)]; } return s; diff --git a/src/typesense_server_utils.cpp b/src/typesense_server_utils.cpp index d0ae17da..9a56416f 100644 --- a/src/typesense_server_utils.cpp +++ b/src/typesense_server_utils.cpp @@ -158,9 +158,12 @@ int init_logger(Config & config, const std::string & server_version) { std::string log_path = log_dir + "/" + "typesense.log"; - // will log level INFO and up to the given log file + // will log levels INFO **and above** to the given log file google::SetLogDestination(google::INFO, log_path.c_str()); + // don't create symlink for INFO log + google::SetLogSymlink(google::INFO, ""); + // don't create separate log files for each level google::SetLogDestination(google::WARNING, ""); google::SetLogDestination(google::ERROR, ""); From 0762f4ae296c7c2e5d1ccd40c754fb991d00dcee Mon Sep 17 00:00:00 2001 From: kishorenc Date: Sun, 28 Jun 2020 16:05:31 +0530 Subject: [PATCH 29/38] Group key should be actual values. 
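Previously, `group_key` was the joined list of field names (e.g. "size,brand"), which is identical for every group in a response; it now holds the grouped document's actual field values, with absent optional fields skipped. A self-contained sketch of the new shape, using sample values that mirror test/group_documents.jsonl:

    #include <nlohmann/json.hpp>
    #include <iostream>
    #include <string>
    #include <vector>

    int main() {
        // top-ranked document of a group
        nlohmann::json document = {{"brand", "Beta"}, {"size", 11}, {"rating", 4.8}};
        std::vector<std::string> group_by_fields = {"size", "brand"};

        nlohmann::json group_key = nlohmann::json::array();
        for(const auto& field_name : group_by_fields) {
            if(document.count(field_name) != 0) {  // absent optional fields are skipped
                group_key.push_back(document[field_name]);
            }
        }

        // old response: "group_key": "size,brand"  (same string for every group)
        // new response: "group_key": [11,"Beta"]   (identifies the group)
        std::cout << group_key << std::endl;       // prints [11,"Beta"]
    }
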
--- cmake/Glog.cmake | 0 include/logger.h | 2 -- include/typesense_server_utils.h | 2 +- src/collection.cpp | 23 ++++++++++-------- src/main/typesense_server.cpp | 2 +- test/collection_grouping_test.cpp | 40 +++++++++++++++++++++---------- test/collection_override_test.cpp | 6 ++++- 7 files changed, 48 insertions(+), 27 deletions(-) delete mode 100644 cmake/Glog.cmake diff --git a/cmake/Glog.cmake b/cmake/Glog.cmake deleted file mode 100644 index e69de29b..00000000 diff --git a/include/logger.h b/include/logger.h index 01adc553..5eee4cdf 100644 --- a/include/logger.h +++ b/include/logger.h @@ -1,5 +1,3 @@ #pragma once -#include -#include #include \ No newline at end of file diff --git a/include/typesense_server_utils.h b/include/typesense_server_utils.h index 4b0af691..34f0cff4 100644 --- a/include/typesense_server_utils.h +++ b/include/typesense_server_utils.h @@ -1,10 +1,10 @@ #pragma once +#include "logger.h" #include #include #include #include "config.h" -#include "logger.h" #include "store.h" #include "collection_manager.h" #include diff --git a/src/collection.cpp b/src/collection.cpp index fbe0a376..402892e5 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -862,7 +862,7 @@ Option Collection::search(const std::string & query, const std:: result["found"] = total_found; - std::string hits_key = (group_limit > 1) ? "grouped_hits" : "hits"; + std::string hits_key = group_limit ? "grouped_hits" : "hits"; result[hits_key] = nlohmann::json::array(); // construct results array @@ -870,17 +870,11 @@ Option Collection::search(const std::string & query, const std:: const std::vector & kv_group = result_group_kvs[result_kvs_index]; nlohmann::json group_hits; - if(group_limit > 1) { + if(group_limit) { group_hits["hits"] = nlohmann::json::array(); - std::vector group_keys; - for(const auto& group_key: group_by_fields) { - group_keys.push_back(group_key); - } - - group_hits["group_key"] = StringUtils::join(group_keys, ","); } - nlohmann::json& hits_array = (group_limit > 1) ? group_hits["hits"] : result["hits"]; + nlohmann::json& hits_array = group_limit ? 
group_hits["hits"] : result["hits"]; for(const KV* field_order_kv: kv_group) { const std::string& seq_id_key = get_seq_id_key((uint32_t) field_order_kv->key); @@ -960,7 +954,16 @@ Option Collection::search(const std::string & query, const std:: hits_array.push_back(wrapper_doc); } - if(group_limit > 1) { + if(group_limit) { + const auto& document = group_hits["hits"][0]["document"]; + + group_hits["group_key"] = nlohmann::json::array(); + for(const auto& field_name: group_by_fields) { + if(document.count(field_name) != 0) { + group_hits["group_key"].push_back(document[field_name]); + } + } + result["grouped_hits"].push_back(group_hits); } } diff --git a/src/main/typesense_server.cpp b/src/main/typesense_server.cpp index 5a3ef7d5..fd9ed722 100644 --- a/src/main/typesense_server.cpp +++ b/src/main/typesense_server.cpp @@ -1,6 +1,6 @@ +#include "typesense_server_utils.h" #include "core_api.h" #include "config.h" -#include "typesense_server_utils.h" void master_server_routes() { // collection management diff --git a/test/collection_grouping_test.cpp b/test/collection_grouping_test.cpp index b1fcd8b6..1c565bd5 100644 --- a/test/collection_grouping_test.cpp +++ b/test/collection_grouping_test.cpp @@ -70,9 +70,10 @@ TEST_F(CollectionGroupingTest, GroupingBasics) { ASSERT_EQ(3, res["found"].get()); ASSERT_EQ(3, res["grouped_hits"].size()); - ASSERT_STREQ("size", res["grouped_hits"][0]["group_key"].get().c_str()); + ASSERT_EQ(11, res["grouped_hits"][0]["group_key"][0].get()); ASSERT_FLOAT_EQ(4.8, res["grouped_hits"][0]["hits"][0]["document"]["rating"].get()); + ASSERT_EQ(11, res["grouped_hits"][0]["hits"][0]["document"]["size"].get()); ASSERT_STREQ("5", res["grouped_hits"][0]["hits"][0]["document"]["id"].get().c_str()); ASSERT_FLOAT_EQ(4.3, res["grouped_hits"][0]["hits"][1]["document"]["rating"].get()); ASSERT_STREQ("1", res["grouped_hits"][0]["hits"][1]["document"]["id"].get().c_str()); @@ -112,7 +113,7 @@ TEST_F(CollectionGroupingTest, GroupingBasics) { // 7 unique ratings ASSERT_EQ(7, res["found"].get()); ASSERT_EQ(7, res["grouped_hits"].size()); - ASSERT_STREQ("rating", res["grouped_hits"][0]["group_key"].get().c_str()); + ASSERT_FLOAT_EQ(4.4, res["grouped_hits"][0]["group_key"][0].get()); ASSERT_EQ(12, res["grouped_hits"][0]["hits"][0]["document"]["size"].get()); ASSERT_STREQ("8", res["grouped_hits"][0]["hits"][0]["document"]["id"].get().c_str()); @@ -151,7 +152,14 @@ TEST_F(CollectionGroupingTest, GroupingCompoundKey) { ASSERT_EQ(10, res["found"].get()); ASSERT_EQ(10, res["grouped_hits"].size()); - ASSERT_STREQ("size,brand", res["grouped_hits"][0]["group_key"].get().c_str()); + ASSERT_EQ(11, res["grouped_hits"][0]["group_key"][0].get()); + + ASSERT_STREQ("Beta", res["grouped_hits"][0]["group_key"][1].get().c_str()); + + // optional field should have no value in the group key component + ASSERT_EQ(1, res["grouped_hits"][5]["group_key"].size()); + ASSERT_STREQ("10", res["grouped_hits"][5]["hits"][0]["document"]["id"].get().c_str()); + ASSERT_STREQ("11", res["grouped_hits"][5]["hits"][1]["document"]["id"].get().c_str()); ASSERT_EQ(1, res["grouped_hits"][0]["hits"].size()); ASSERT_FLOAT_EQ(4.8, res["grouped_hits"][0]["hits"][0]["document"]["rating"].get()); @@ -199,7 +207,8 @@ TEST_F(CollectionGroupingTest, GroupingCompoundKey) { // total count and facet counts should be the same ASSERT_EQ(10, res["found"].get()); ASSERT_EQ(2, res["grouped_hits"].size()); - ASSERT_STREQ("size,brand", res["grouped_hits"][0]["group_key"].get().c_str()); + ASSERT_EQ(10, res["grouped_hits"][0]["group_key"][0].get()); 
+ ASSERT_STREQ("Omega", res["grouped_hits"][0]["group_key"][1].get().c_str()); ASSERT_STREQ("brand", res["facet_counts"][0]["field_name"].get().c_str()); ASSERT_EQ(3, (int) res["facet_counts"][0]["counts"][0]["count"]); @@ -215,7 +224,7 @@ TEST_F(CollectionGroupingTest, GroupingCompoundKey) { ASSERT_STREQ("Zeta", res["facet_counts"][0]["counts"][3]["value"].get().c_str()); } -TEST_F(CollectionGroupingTest, GroupingWithSingleDistinct) { +TEST_F(CollectionGroupingTest, GroupingWithGropLimitOfOne) { auto res = coll_group->search("*", {}, "", {"brand"}, {}, 0, 50, 1, FREQUENCY, false, Index::DROP_TOKENS_THRESHOLD, spp::sparse_hash_set(), @@ -224,13 +233,18 @@ TEST_F(CollectionGroupingTest, GroupingWithSingleDistinct) { {}, {}, {"brand"}, 1).get(); ASSERT_EQ(5, res["found"].get()); - ASSERT_EQ(5, res["hits"].size()); + ASSERT_EQ(5, res["grouped_hits"].size()); - ASSERT_STREQ("4", res["hits"][0]["document"]["id"].get().c_str()); - ASSERT_STREQ("2", res["hits"][1]["document"]["id"].get().c_str()); - ASSERT_STREQ("8", res["hits"][2]["document"]["id"].get().c_str()); - ASSERT_STREQ("10", res["hits"][3]["document"]["id"].get().c_str()); // unbranded - ASSERT_STREQ("9", res["hits"][4]["document"]["id"].get().c_str()); + // all hits array must be of size 1 + for(auto i=0; i<5; i++) { + ASSERT_EQ(1, res["grouped_hits"][i]["hits"].size()); + } + + ASSERT_STREQ("4", res["grouped_hits"][0]["hits"][0]["document"]["id"].get().c_str()); + ASSERT_STREQ("2", res["grouped_hits"][1]["hits"][0]["document"]["id"].get().c_str()); + ASSERT_STREQ("8", res["grouped_hits"][2]["hits"][0]["document"]["id"].get().c_str()); + ASSERT_STREQ("10", res["grouped_hits"][3]["hits"][0]["document"]["id"].get().c_str()); // unbranded + ASSERT_STREQ("9", res["grouped_hits"][4]["hits"][0]["document"]["id"].get().c_str()); // facet counts should each be 1, including unbranded ASSERT_STREQ("brand", res["facet_counts"][0]["field_name"].get().c_str()); @@ -290,7 +304,9 @@ TEST_F(CollectionGroupingTest, GroupingWithArrayFieldAndOverride) { ASSERT_EQ(4, res["found"].get()); ASSERT_EQ(4, res["grouped_hits"].size()); - ASSERT_STREQ("colors", res["grouped_hits"][0]["group_key"].get().c_str()); + + ASSERT_EQ(1, res["grouped_hits"][0]["group_key"][0].size()); + ASSERT_STREQ("white", res["grouped_hits"][0]["group_key"][0][0].get().c_str()); ASSERT_STREQ("11", res["grouped_hits"][0]["hits"][0]["document"]["id"].get().c_str()); ASSERT_STREQ("10", res["grouped_hits"][0]["hits"][1]["document"]["id"].get().c_str()); diff --git a/test/collection_override_test.cpp b/test/collection_override_test.cpp index a36fb85a..e79e2b42 100644 --- a/test/collection_override_test.cpp +++ b/test/collection_override_test.cpp @@ -378,7 +378,11 @@ TEST_F(CollectionOverrideTest, PinnedHitsGrouping) { ASSERT_EQ(8, results["found"].get()); - ASSERT_STREQ("cast", results["grouped_hits"][0]["group_key"].get().c_str()); + ASSERT_EQ(1, results["grouped_hits"][0]["group_key"].size()); + ASSERT_EQ(2, results["grouped_hits"][0]["group_key"][0].size()); + ASSERT_STREQ("Chris Evans", results["grouped_hits"][0]["group_key"][0][0].get().c_str()); + ASSERT_STREQ("Scarlett Johansson", results["grouped_hits"][0]["group_key"][0][1].get().c_str()); + ASSERT_STREQ("6", results["grouped_hits"][0]["hits"][0]["document"]["id"].get().c_str()); ASSERT_STREQ("8", results["grouped_hits"][0]["hits"][1]["document"]["id"].get().c_str()); From 2102f2323f810b88e9965bba0870edb0558255ff Mon Sep 17 00:00:00 2001 From: kishorenc Date: Sun, 28 Jun 2020 17:22:12 +0530 Subject: [PATCH 30/38] Identify curated 
results in response. --- src/collection.cpp | 10 +++++++++- test/collection_override_test.cpp | 11 +++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/src/collection.cpp b/src/collection.cpp index 402892e5..bdfd1801 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -837,6 +837,7 @@ Option Collection::search(const std::string & query, const std:: uint64_t override_position = override_result_kvs[override_kv_index][0]->distinct_key; if(result_position == override_position) { + override_result_kvs[override_kv_index][0]->match_score = 0; // to identify curated result result_group_kvs.push_back(override_result_kvs[override_kv_index]); override_kv_index++; } else { @@ -846,6 +847,7 @@ Option Collection::search(const std::string & query, const std:: } while(override_kv_index < override_result_kvs.size()) { + override_result_kvs[override_kv_index][0]->match_score = 0; // to identify curated result result_group_kvs.push_back({override_result_kvs[override_kv_index]}); override_kv_index++; } @@ -947,10 +949,16 @@ Option Collection::search(const std::string & query, const std:: wrapper_doc["highlights"].push_back(h_json); } + //wrapper_doc["seq_id"] = (uint32_t) field_order_kv->key; + prune_document(document, include_fields, exclude_fields); wrapper_doc["document"] = document; wrapper_doc["text_match"] = field_order_kv->match_score; - //wrapper_doc["seq_id"] = (uint32_t) field_order_kv->key; + + if(field_order_kv->match_score == 0) { + wrapper_doc["curated"] = true; + } + hits_array.push_back(wrapper_doc); } diff --git a/test/collection_override_test.cpp b/test/collection_override_test.cpp index e79e2b42..e08f32d2 100644 --- a/test/collection_override_test.cpp +++ b/test/collection_override_test.cpp @@ -122,6 +122,11 @@ TEST_F(CollectionOverrideTest, ExcludeIncludeExactQueryMatch) { ASSERT_STREQ("3", results["hits"][1]["document"]["id"].get().c_str()); ASSERT_STREQ("13", results["hits"][2]["document"]["id"].get().c_str()); + // curated results should be marked as such + ASSERT_EQ(true, results["hits"][0]["curated"].get()); + ASSERT_EQ(true, results["hits"][1]["curated"].get()); + ASSERT_EQ(0, results["hits"][2].count("curated")); + coll_mul_fields->remove_override("exclude-rule"); coll_mul_fields->remove_override("include-rule"); @@ -367,6 +372,12 @@ TEST_F(CollectionOverrideTest, PinnedHitsGrouping) { ASSERT_STREQ("13", results["hits"][2]["document"]["id"].get().c_str()); ASSERT_STREQ("11", results["hits"][3]["document"]["id"].get().c_str()); + // pinned hits should be marked as curated + ASSERT_EQ(true, results["hits"][0]["curated"].get()); + ASSERT_EQ(true, results["hits"][1]["curated"].get()); + ASSERT_EQ(true, results["hits"][2]["curated"].get()); + ASSERT_EQ(0, results["hits"][3].count("curated")); + // with grouping results = coll_mul_fields->search("the", {"title"}, "", {"starring"}, {}, 0, 50, 1, FREQUENCY, From 8c42d5da7bf32c305d3b145317c91d7cb675863e Mon Sep 17 00:00:00 2001 From: kishorenc Date: Sun, 28 Jun 2020 21:52:30 +0530 Subject: [PATCH 31/38] Fix url of h2o github archive. 
--- cmake/H2O.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/H2O.cmake b/cmake/H2O.cmake index 679088e8..572e51d7 100644 --- a/cmake/H2O.cmake +++ b/cmake/H2O.cmake @@ -6,7 +6,7 @@ set(H2O_TAR_PATH ${DEP_ROOT_DIR}/v${H2O_NAME}.tar.gz) if(NOT EXISTS ${H2O_TAR_PATH}) message(STATUS "Downloading ${H2O_NAME}...") - file(DOWNLOAD https://github.com/h2o/h2o/archive/${H2O_VERSION}.tar.gz ${H2O_TAR_PATH}) + file(DOWNLOAD https://github.com/h2o/h2o/archive/v${H2O_VERSION}.tar.gz ${H2O_TAR_PATH}) endif() if(NOT EXISTS ${DEP_ROOT_DIR}/${H2O_NAME}) From 0cdd58e86ce77d92b6ed49683bb57d4a5cd5786b Mon Sep 17 00:00:00 2001 From: kishorenc Date: Tue, 30 Jun 2020 06:58:06 +0530 Subject: [PATCH 32/38] Validate group limit & other numerical parameters of search. --- include/string_utils.h | 14 ++++++++++++-- src/collection.cpp | 4 ++++ src/core_api.cpp | 26 +++++++++++++------------- test/collection_grouping_test.cpp | 21 +++++++++++++++++++++ test/string_utils_test.cpp | 5 +++++ 5 files changed, 55 insertions(+), 15 deletions(-) diff --git a/include/string_utils.h b/include/string_utils.h index 62005f7e..1b12da8b 100644 --- a/include/string_utils.h +++ b/include/string_utils.h @@ -151,8 +151,18 @@ struct StringUtils { } char * p ; - strtoull(s.c_str(), &p, 10); - return (*p == 0); + unsigned long long ull = strtoull(s.c_str(), &p, 10); + return (*p == 0) && ull <= std::numeric_limits::max(); + } + + static bool is_uint32_t(const std::string &s) { + if(s.empty()) { + return false; + } + + char * p ; + unsigned long ul = strtoul(s.c_str(), &p, 10); + return (*p == 0) && ul <= std::numeric_limits::max(); } static void toupper(std::string& str) { diff --git a/src/collection.cpp b/src/collection.cpp index bdfd1801..2c12f690 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -394,6 +394,10 @@ Option Collection::search(const std::string & query, const std:: return Option(400, "No search fields specified for the query."); } + if(group_limit == 0 || group_limit >= 100) { + return Option(400, "Value of `group_limit` is invalid."); + } + std::vector excluded_ids; std::map> include_ids; // position => list of IDs populate_overrides(query, pinned_hits, hidden_hits, include_ids, excluded_ids); diff --git a/src/core_api.cpp b/src/core_api.cpp index 7d5294d3..06952a80 100644 --- a/src/core_api.cpp +++ b/src/core_api.cpp @@ -301,42 +301,42 @@ bool get_search(http_req & req, http_res & res) { } } - if(!StringUtils::is_uint64_t(req.params[DROP_TOKENS_THRESHOLD])) { + if(!StringUtils::is_uint32_t(req.params[DROP_TOKENS_THRESHOLD])) { res.set_400("Parameter `" + std::string(DROP_TOKENS_THRESHOLD) + "` must be an unsigned integer."); return false; } - if(!StringUtils::is_uint64_t(req.params[TYPO_TOKENS_THRESHOLD])) { + if(!StringUtils::is_uint32_t(req.params[TYPO_TOKENS_THRESHOLD])) { res.set_400("Parameter `" + std::string(TYPO_TOKENS_THRESHOLD) + "` must be an unsigned integer."); return false; } - if(!StringUtils::is_uint64_t(req.params[NUM_TYPOS])) { + if(!StringUtils::is_uint32_t(req.params[NUM_TYPOS])) { res.set_400("Parameter `" + std::string(NUM_TYPOS) + "` must be an unsigned integer."); return false; } - if(!StringUtils::is_uint64_t(req.params[PER_PAGE])) { + if(!StringUtils::is_uint32_t(req.params[PER_PAGE])) { res.set_400("Parameter `" + std::string(PER_PAGE) + "` must be an unsigned integer."); return false; } - if(!StringUtils::is_uint64_t(req.params[PAGE])) { + if(!StringUtils::is_uint32_t(req.params[PAGE])) { res.set_400("Parameter `" + std::string(PAGE) + "` must be an 
unsigned integer."); return false; } - if(!StringUtils::is_uint64_t(req.params[MAX_FACET_VALUES])) { + if(!StringUtils::is_uint32_t(req.params[MAX_FACET_VALUES])) { res.set_400("Parameter `" + std::string(MAX_FACET_VALUES) + "` must be an unsigned integer."); return false; } - if(!StringUtils::is_uint64_t(req.params[SNIPPET_THRESHOLD])) { + if(!StringUtils::is_uint32_t(req.params[SNIPPET_THRESHOLD])) { res.set_400("Parameter `" + std::string(SNIPPET_THRESHOLD) + "` must be an unsigned integer."); return false; } - if(!StringUtils::is_uint64_t(req.params[GROUP_LIMIT])) { + if(!StringUtils::is_uint32_t(req.params[GROUP_LIMIT])) { res.set_400("Parameter `" + std::string(GROUP_LIMIT) + "` must be an unsigned integer."); return false; } @@ -441,19 +441,19 @@ bool get_search(http_req & req, http_res & res) { Option result_op = collection->search(req.params[QUERY], search_fields, filter_str, facet_fields, sort_fields, std::stoi(req.params[NUM_TYPOS]), - static_cast(std::stoi(req.params[PER_PAGE])), - static_cast(std::stoi(req.params[PAGE])), + static_cast(std::stol(req.params[PER_PAGE])), + static_cast(std::stol(req.params[PAGE])), token_order, prefix, drop_tokens_threshold, include_fields, exclude_fields, - static_cast(std::stoi(req.params[MAX_FACET_VALUES])), + static_cast(std::stol(req.params[MAX_FACET_VALUES])), req.params[FACET_QUERY], - static_cast(std::stoi(req.params[SNIPPET_THRESHOLD])), + static_cast(std::stol(req.params[SNIPPET_THRESHOLD])), req.params[HIGHLIGHT_FULL_FIELDS], typo_tokens_threshold, pinned_hits, hidden_hits, group_by_fields, - static_cast(std::stoi(req.params[GROUP_LIMIT])) + static_cast(std::stol(req.params[GROUP_LIMIT])) ); uint64_t timeMillis = std::chrono::duration_cast( diff --git a/test/collection_grouping_test.cpp b/test/collection_grouping_test.cpp index 1c565bd5..63118a4e 100644 --- a/test/collection_grouping_test.cpp +++ b/test/collection_grouping_test.cpp @@ -222,6 +222,27 @@ TEST_F(CollectionGroupingTest, GroupingCompoundKey) { ASSERT_EQ(1, (int) res["facet_counts"][0]["counts"][3]["count"]); ASSERT_STREQ("Zeta", res["facet_counts"][0]["counts"][3]["value"].get().c_str()); + + // respect min and max grouping limit (greater than 0 and less than 99) + auto res_op = coll_group->search("*", {}, "", {"brand"}, {}, 0, 50, 1, FREQUENCY, + false, Index::DROP_TOKENS_THRESHOLD, + spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "brand: omeg", 30, + "", 10, + {}, {}, {"rating"}, 100); + + ASSERT_FALSE(res_op.ok()); + ASSERT_STREQ("Value of `group_limit` is invalid.", res_op.error().c_str()); + + res_op = coll_group->search("*", {}, "", {"brand"}, {}, 0, 50, 1, FREQUENCY, + false, Index::DROP_TOKENS_THRESHOLD, + spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "brand: omeg", 30, + "", 10, + {}, {}, {"rating"}, 0); + + ASSERT_FALSE(res_op.ok()); + ASSERT_STREQ("Value of `group_limit` is invalid.", res_op.error().c_str()); } TEST_F(CollectionGroupingTest, GroupingWithGropLimitOfOne) { diff --git a/test/string_utils_test.cpp b/test/string_utils_test.cpp index 4f8c2400..a2d64147 100644 --- a/test/string_utils_test.cpp +++ b/test/string_utils_test.cpp @@ -54,3 +54,8 @@ TEST(StringUtilsTest, HMAC) { std::string digest1 = StringUtils::hmac("KeyVal", "{\"filter_by\": \"user_id:1080\"}"); ASSERT_STREQ("IvjqWNZ5M5ElcvbMoXj45BxkQrZG4ZKEaNQoRioCx2s=", digest1.c_str()); } + +TEST(StringUtilsTest, UInt32Validation) { + std::string big_num = "99999999999999999999999999999999"; + ASSERT_FALSE(StringUtils::is_uint32_t(big_num)); +} From 2e0d8179f7da747ae1f23ec61c7f8a00ce8ffe18 
From 2e0d8179f7da747ae1f23ec61c7f8a00ce8ffe18 Mon Sep 17 00:00:00 2001
From: kishorenc
Date: Tue, 30 Jun 2020 06:59:15 +0530
Subject: [PATCH 33/38] Change typesense memory usage metric name.

---
 src/system_metrics.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/system_metrics.cpp b/src/system_metrics.cpp
index ac5412b7..eb79c42b 100644
--- a/src/system_metrics.cpp
+++ b/src/system_metrics.cpp
@@ -27,7 +27,7 @@ void SystemMetrics::get(const std::string &data_dir_path, nlohmann::json &result
     rusage r_usage;
     getrusage(RUSAGE_SELF, &r_usage);

-    result["memory_used_process_bytes"] = r_usage.ru_maxrss * 1000;
+    result["typesense_memory_used_bytes"] = r_usage.ru_maxrss * 1000;

     uint64_t memory_available_bytes = 0;
     uint64_t memory_total_bytes = 0;

From 6377d1b58c8fee58c6b8b42d98adc49b4fd4b495 Mon Sep 17 00:00:00 2001
From: kishorenc
Date: Tue, 30 Jun 2020 18:08:44 +0530
Subject: [PATCH 34/38] Fix tests.

---
 src/collection.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/collection.cpp b/src/collection.cpp
index 2c12f690..3e5a031e 100644
--- a/src/collection.cpp
+++ b/src/collection.cpp
@@ -394,7 +394,7 @@ Option<nlohmann::json> Collection::search(const std::string & query, const std::
         return Option<nlohmann::json>(400, "No search fields specified for the query.");
     }

-    if(group_limit == 0 || group_limit >= 100) {
+    if(!group_by_fields.empty() && (group_limit == 0 || group_limit >= 100)) {
         return Option<nlohmann::json>(400, "Value of `group_limit` is invalid.");
     }

From 0f38242e09e94bd33af2b4795ff8eea762e0fee3 Mon Sep 17 00:00:00 2001
From: kishorenc
Date: Wed, 1 Jul 2020 08:09:00 +0530
Subject: [PATCH 35/38] Clearer error message about group limit range.

---
 include/collection.h              | 2 ++
 src/collection.cpp                | 5 +++--
 test/collection_grouping_test.cpp | 4 ++--
 3 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/include/collection.h b/include/collection.h
index 730bad63..69f15832 100644
--- a/include/collection.h
+++ b/include/collection.h
@@ -273,6 +273,8 @@ public:
     const size_t PER_PAGE_MAX = 250;

+    const size_t GROUP_LIMIT_MAX = 99;
+
     // Using a $ prefix so that these meta keys stay above record entries in a lexicographically ordered KV store
     static constexpr const char* COLLECTION_META_PREFIX = "$CM";
     static constexpr const char* COLLECTION_NEXT_SEQ_PREFIX = "$CS";
diff --git a/src/collection.cpp b/src/collection.cpp
index 3e5a031e..e3724a11 100644
--- a/src/collection.cpp
+++ b/src/collection.cpp
@@ -394,8 +394,9 @@ Option<nlohmann::json> Collection::search(const std::string & query, const std::
         return Option<nlohmann::json>(400, "No search fields specified for the query.");
     }

-    if(!group_by_fields.empty() && (group_limit == 0 || group_limit >= 100)) {
-        return Option<nlohmann::json>(400, "Value of `group_limit` is invalid.");
+    if(!group_by_fields.empty() && (group_limit == 0 || group_limit > GROUP_LIMIT_MAX)) {
+        return Option<nlohmann::json>(400, "Value of `group_limit` must be between 1 and " +
+                                           std::to_string(GROUP_LIMIT_MAX) + ".");
     }

     std::vector<uint32_t> excluded_ids;
diff --git a/test/collection_grouping_test.cpp b/test/collection_grouping_test.cpp
index 63118a4e..9d02e0a8 100644
--- a/test/collection_grouping_test.cpp
+++ b/test/collection_grouping_test.cpp
@@ -232,7 +232,7 @@ TEST_F(CollectionGroupingTest, GroupingCompoundKey) {
                                      {}, {}, {"rating"}, 100);

     ASSERT_FALSE(res_op.ok());
-    ASSERT_STREQ("Value of `group_limit` is invalid.", res_op.error().c_str());
+    ASSERT_STREQ("Value of `group_limit` must be between 1 and 99.", res_op.error().c_str());

     res_op = coll_group->search("*", {}, "", {"brand"}, {}, 0, 50, 1, FREQUENCY,
                                 false, Index::DROP_TOKENS_THRESHOLD,
                                 spp::sparse_hash_set<std::string>(),
                                 spp::sparse_hash_set<std::string>(), 10, "brand: omeg", 30,
                                 "", 10,
@@ -242,7 +242,7 @@
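The two collection.cpp fixes above first scope the group_limit check to grouped searches, then pin the accepted range to [1, GROUP_LIMIT_MAX]. A standalone restatement of that rule, not the exact Collection::search() code:

```
#include <cstddef>
#include <string>
#include <vector>

// Mirrors the constant added to include/collection.h in this patch.
static const size_t GROUP_LIMIT_MAX = 99;

static bool validate_group_limit(const std::vector<std::string>& group_by_fields,
                                 size_t group_limit, std::string& error) {
    // the limit only applies when grouping has actually been requested
    if (!group_by_fields.empty() &&
        (group_limit == 0 || group_limit > GROUP_LIMIT_MAX)) {
        error = "Value of `group_limit` must be between 1 and " +
                std::to_string(GROUP_LIMIT_MAX) + ".";
        return false;
    }
    return true;
}
```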
TEST_F(CollectionGroupingTest, GroupingCompoundKey) { {}, {}, {"rating"}, 0); ASSERT_FALSE(res_op.ok()); - ASSERT_STREQ("Value of `group_limit` is invalid.", res_op.error().c_str()); + ASSERT_STREQ("Value of `group_limit` must be between 1 and 99.", res_op.error().c_str()); } TEST_F(CollectionGroupingTest, GroupingWithGropLimitOfOne) { From e64805320b7d85dc0ba6c4dc3ef785284622bb2a Mon Sep 17 00:00:00 2001 From: kishorenc Date: Fri, 3 Jul 2020 06:10:21 +0530 Subject: [PATCH 36/38] Address some warnings. --- src/raft_server.cpp | 1 + src/string_utils.cpp | 6 +++++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/src/raft_server.cpp b/src/raft_server.cpp index dc1483e9..73bd1a63 100644 --- a/src/raft_server.cpp +++ b/src/raft_server.cpp @@ -34,6 +34,7 @@ int ReplicationState::start(const butil::EndPoint & peering_endpoint, const int node_options.fsm = this; node_options.node_owns_fsm = false; node_options.snapshot_interval_s = snapshot_interval_s; + node_options.filter_before_copy_remote = false; std::string prefix = "local://" + raft_dir; node_options.log_uri = prefix + "/" + log_dir_name; node_options.raft_meta_uri = prefix + "/" + meta_dir_name; diff --git a/src/string_utils.cpp b/src/string_utils.cpp index 302831dd..041f7f07 100644 --- a/src/string_utils.cpp +++ b/src/string_utils.cpp @@ -19,6 +19,10 @@ std::string lower_and_no_special_chars(const std::string & str) { } void StringUtils::unicode_normalize(std::string & str) const { + if(str.empty()) { + return ; + } + std::stringstream out; for (char *s = &str[0]; *s;) { @@ -49,7 +53,7 @@ void StringUtils::unicode_normalize(std::string & str) const { } } - str.assign(lower_and_no_special_chars(out.str())); + str = lower_and_no_special_chars(out.str()); } std::string StringUtils::randstring(size_t length) { From 60a1454d8d59312066b5e53eee3ad6d7caa7d0de Mon Sep 17 00:00:00 2001 From: kishorenc Date: Sat, 4 Jul 2020 08:17:49 +0530 Subject: [PATCH 37/38] Cmake: remove remote brpc & braft builds for mac. --- CMakeLists.txt | 5 ----- 1 file changed, 5 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c65f4e47..b5afc665 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -54,11 +54,6 @@ include(cmake/GoogleTest.cmake) include(cmake/TestResources.cmake) include(cmake/Iconv.cmake) -if (APPLE) - include(cmake/brpc.cmake) - include(cmake/braft.cmake) -endif() - FILE(GLOB SRC_FILES src/*.cpp) FILE(GLOB TEST_FILES test/*.cpp) From 14d1604d493639ad2162b217755c849a17cc5aab Mon Sep 17 00:00:00 2001 From: kishorenc Date: Sat, 4 Jul 2020 08:21:29 +0530 Subject: [PATCH 38/38] Update README. --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index d4f7cbfe..c0463bbd 100644 --- a/README.md +++ b/README.md @@ -37,7 +37,7 @@ Here's a quick example showcasing how you can create a collection, index a docum Let's begin by starting the Typesense server via Docker: ``` -docker run -p 8108:8108 -v/tmp/data:/data typesense/typesense:0.13.0 --data-dir /data --api-key=Hu52dwsas2AdxdE +docker run -p 8108:8108 -v/tmp/data:/data typesense/typesense:0.14.0 --data-dir /data --api-key=Hu52dwsas2AdxdE ``` Install the Python client for Typesense (we have [clients](https://typesense.org/api/#api-clients) for other languages too): @@ -146,7 +146,7 @@ works without turning many knobs. **Speed is great, but what about the memory footprint?** -A fresh Typesense server will take less than 5 MB of memory. 
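The string_utils.cpp change in patch 36 above adds an early return for empty input and replaces assign() with plain assignment. A simplified sketch of that flow; lower_and_no_special_chars appears only by name in the diff, so its body here (keep ASCII alphanumerics, lowercase them) is an assumption:

```
#include <cctype>
#include <sstream>
#include <string>

// Assumed stand-in: the diff shows only this function's signature.
static std::string lower_and_no_special_chars(const std::string& str) {
    std::string out;
    for (unsigned char c : str) {
        if (std::isalnum(c)) {
            out += static_cast<char>(std::tolower(c));
        }
    }
    return out;
}

static void unicode_normalize_sketch(std::string& str) {
    if (str.empty()) {
        return;  // the guard added by the patch: nothing to normalize
    }
    std::stringstream out;
    out << str;  // the real code transliterates UTF-8 sequences here
    str = lower_and_no_special_chars(out.str());  // plain assignment, as patched
}
```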
As you start indexing documents, the memory use will +A fresh Typesense server will consume about 30 MB of memory. As you start indexing documents, the memory use will increase correspondingly. How much it increases depends on the number and type of fields you index. We've strived to keep the in-memory data structures lean. To give you a rough idea: when 1 million
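To observe the kind of footprint the README describes, the typesense_memory_used_bytes metric from patch 33 in this series can be reproduced in isolation. A minimal sketch; note that ru_maxrss is reported in kilobytes on Linux (the * 1000 scaling in the patch is an approximation, 1024 would be binary-exact) and in bytes on macOS, which this sketch does not handle:

```
#include <sys/resource.h>

#include <cstdint>
#include <iostream>

int main() {
    rusage r_usage{};
    if (getrusage(RUSAGE_SELF, &r_usage) != 0) {
        return 1;
    }
    // peak resident set size of this process, scaled as in the patch
    const uint64_t memory_used_bytes =
        static_cast<uint64_t>(r_usage.ru_maxrss) * 1000;
    std::cout << "typesense_memory_used_bytes: " << memory_used_bytes << "\n";
    return 0;
}
```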