Merge branch 'distinct-grouping'

kishorenc 2020-07-04 13:50:34 +05:30
commit 3e82089423
42 changed files with 1474 additions and 607 deletions

View File

@ -2,7 +2,7 @@ version: 2
jobs:
build:
docker:
- image: typesense/typesense-development:22-MAR-2020-5
- image: typesense/typesense-development:23-JUNE-2020-1
environment:
- PROJECT_DIR: /typesense
- TYPESENSE_VERSION: $CIRCLE_BRANCH-$CIRCLE_SHA1
@ -16,7 +16,7 @@ jobs:
keys:
- external-Linux-cache-{{ .Branch }}-{{ checksum "last-changed-git-sha-for-dependency-listing" }}
- external-Linux-cache-{{ .Branch }}
- external-Linux-cache
- external-Linux-cache
- run:
name: build
command: $PROJECT_DIR/build.sh

View File

@ -43,6 +43,7 @@ FIND_PACKAGE(ICU REQUIRED)
FIND_PACKAGE(Protobuf REQUIRED)
FIND_PACKAGE(LevelDB REQUIRED)
FIND_PACKAGE(gflags REQUIRED)
FIND_PACKAGE(glog REQUIRED)
message("OpenSSL library: ${OPENSSL_LIBRARIES}")
@ -53,11 +54,6 @@ include(cmake/GoogleTest.cmake)
include(cmake/TestResources.cmake)
include(cmake/Iconv.cmake)
if (APPLE)
include(cmake/brpc.cmake)
include(cmake/braft.cmake)
endif()
FILE(GLOB SRC_FILES src/*.cpp)
FILE(GLOB TEST_FILES test/*.cpp)
@ -87,8 +83,10 @@ link_directories(${DEP_ROOT_DIR}/${FOR_NAME})
link_directories(${DEP_ROOT_DIR}/${H2O_NAME}/build)
link_directories(${DEP_ROOT_DIR}/${ROCKSDB_NAME})
link_directories(${DEP_ROOT_DIR}/${ICONV_NAME}/lib/.libs)
if (APPLE)
link_directories(${DEP_ROOT_DIR}/${BRPC_NAME}/lib)
link_directories(${DEP_ROOT_DIR}/${BRAFT_NAME}/lib)
endif()
# Write dependency libraries to a file
file(WRITE ${DEP_ROOT_DIR}/libs.txt "")
@ -152,7 +150,7 @@ set(ICU_ALL_LIBRARIES ${ICU_I18N_LIBRARIES} ${ICU_LIBRARIES} ${ICU_DATA_LIBRARIE
set(CORE_LIBS h2o-evloop iconv ${CURL_LIBRARIES} for ${ICU_ALL_LIBRARIES} ${G3LOGGER_LIBRARIES}
${ROCKSDB_LIBS} ${OPENSSL_LIBRARIES} pthread dl ${STD_LIB})
set(CORE_LIBS braft brpc ${LevelDB_LIBRARIES} ${CORE_LIBS} ${GFLAGS_LIBRARIES} ${PROTOBUF_LIBRARIES} ${SYSTEM_LIBS})
set(CORE_LIBS braft brpc ${LevelDB_LIBRARIES} glog ${CORE_LIBS} ${GFLAGS_LIBRARIES} ${PROTOBUF_LIBRARIES} ${SYSTEM_LIBS})
target_link_libraries(typesense-core ${CORE_LIBS})
target_link_libraries(typesense-server ${CORE_LIBS})

View File

@ -37,7 +37,7 @@ Here's a quick example showcasing how you can create a collection, index a docum
Let's begin by starting the Typesense server via Docker:
```
docker run -p 8108:8108 -v/tmp/data:/data typesense/typesense:0.13.0 --data-dir /data --api-key=Hu52dwsas2AdxdE
docker run -p 8108:8108 -v/tmp/data:/data typesense/typesense:0.14.0 --data-dir /data --api-key=Hu52dwsas2AdxdE
```
Install the Python client for Typesense (we have [clients](https://typesense.org/api/#api-clients) for other languages too):
@ -146,7 +146,7 @@ works without turning many knobs.
**Speed is great, but what about the memory footprint?**
A fresh Typesense server will take less than 5 MB of memory. As you start indexing documents, the memory use will
A fresh Typesense server will consume about 30 MB of memory. As you start indexing documents, the memory use will
increase correspondingly. How much it increases depends on the number and type of fields you index.
We've strived to keep the in-memory data structures lean. To give you a rough idea: when 1 million

View File

@ -97,6 +97,10 @@
- ~~Have a LOG(ERROR) level~~
- ~~Handle SIGTERM which is sent when process is killed~~
- ~~Use snappy compression for storage~~
- ~~Fix exclude_scalar early returns~~
- ~~Fix result ids length during grouped overrides~~
- ~~Fix override grouping (collate_included_ids)~~
- ~~Test for overriding result on second page~~
- at least 1 token match for proceeding with drop tokens
- support wildcard query with filters
- API for optimizing on disk storage

View File

@ -1,4 +1,4 @@
set(BRAFT_VERSION 0a9ec3f)
set(BRAFT_VERSION fb27e63)
set(BRAFT_NAME braft-${BRAFT_VERSION})
set(BRAFT_TAR_PATH ${DEP_ROOT_DIR}/${BRAFT_NAME}.tar.gz)

View File

@ -1,4 +1,4 @@
set(BRPC_VERSION 23c66e3)
set(BRPC_VERSION 2f8fc37d)
set(BRPC_NAME brpc-${BRPC_VERSION})
set(BRPC_TAR_PATH ${DEP_ROOT_DIR}/${BRPC_NAME}.tar.gz)

View File

@ -22,13 +22,13 @@ if [[ "$@" == *"--depclean"* ]]; then
fi
#echo "Creating development image..."
#docker build --file $PROJECT_DIR/docker/development.Dockerfile --tag typesense/typesense-development:latest $PROJECT_DIR/docker
#docker build --file $PROJECT_DIR/docker/development.Dockerfile --tag typesense/typesense-development:23-JUNE-2020-1 $PROJECT_DIR/docker
echo "Building Typesense $TYPESENSE_VERSION..."
docker run -it -v $PROJECT_DIR:/typesense typesense/typesense-development cmake -DTYPESENSE_VERSION=$TYPESENSE_VERSION \
docker run -it -v $PROJECT_DIR:/typesense typesense/typesense-development:23-JUNE-2020-1 cmake -DTYPESENSE_VERSION=$TYPESENSE_VERSION \
-DCMAKE_BUILD_TYPE=Release -H/typesense -B/typesense/$BUILD_DIR
docker run -it -v $PROJECT_DIR:/typesense typesense/typesense-development make typesense-server -C/typesense/$BUILD_DIR
docker run -it -v $PROJECT_DIR:/typesense typesense/typesense-development:23-JUNE-2020-1 make typesense-server -C/typesense/$BUILD_DIR
if [[ "$@" == *"--build-deploy-image"* ]]; then
echo "Creating deployment image for Typesense $TYPESENSE_VERSION server ..."
@ -60,7 +60,7 @@ if [[ "$@" == *"--package-libs"* ]]; then
fi
#
#if [[ "$@" == *"--create-deb-upload"* ]]; then
# docker run -it -v $PROJECT_DIR:/typesense typesense/typesense-development cmake -DTYPESENSE_VERSION=$TYPESENSE_VERSION \
# docker run -it -v $PROJECT_DIR:/typesense typesense/typesense-development:23-JUNE-2020-1 cmake -DTYPESENSE_VERSION=$TYPESENSE_VERSION \
# -DCMAKE_BUILD_TYPE=Debug -H/typesense -B/typesense/$BUILD_DIR
#fi

View File

@ -55,25 +55,34 @@ ADD https://github.com/protocolbuffers/protobuf/releases/download/v3.11.4/protob
RUN tar -C /opt -xf /opt/protobuf-cpp-3.11.4.tar.gz && chown -R root:root /opt/protobuf-3.11.4
RUN cd /opt/protobuf-3.11.4 && ./configure --disable-shared && make -j8 && make check && make install && rm -rf /usr/local/lib/*.so*
ADD https://github.com/google/leveldb/archive/1.22.tar.gz /opt/leveldb-1.22.tar.gz.tar.gz
RUN tar -C /opt -xf /opt/leveldb-1.22.tar.gz.tar.gz
ADD https://github.com/google/leveldb/archive/1.22.tar.gz /opt/leveldb-1.22.tar.gz
RUN tar -C /opt -xf /opt/leveldb-1.22.tar.gz
RUN mkdir -p /opt/leveldb-1.22/build && cd /opt/leveldb-1.22/build && cmake -DCMAKE_BUILD_TYPE=Release .. && \
cmake --build . && make install && rm -rf /usr/local/lib/*.so*
ADD https://github.com/google/glog/archive/0a2e593.tar.gz /opt/glog-0a2e593.tar.gz
RUN tar -C /opt -xf /opt/glog-0a2e593.tar.gz
RUN mkdir -p /opt/glog-0a2e5931bd5ff22fd3bf8999eb8ce776f159cda6/bld && \
cd /opt/glog-0a2e5931bd5ff22fd3bf8999eb8ce776f159cda6/bld && \
cmake -DBUILD_TESTING=0 -DWITH_GFLAGS=ON -DWITH_UNWIND=OFF .. && \
cmake --build . && make install && rm -rf /usr/local/lib/*.so*
ADD https://github.com/apache/incubator-brpc/archive/0.9.7-rc03.tar.gz /opt/brpc-0.9.7-rc03.tar.gz
RUN tar -C /opt -xf /opt/brpc-0.9.7-rc03.tar.gz
COPY patches/brpc_cmakelists.txt /opt/incubator-brpc-0.9.7-rc03/src/CMakeLists.txt
RUN chown root:root /opt/incubator-brpc-0.9.7-rc03/src/CMakeLists.txt
RUN mkdir -p /opt/incubator-brpc-0.9.7-rc03/build && cd /opt/incubator-brpc-0.9.7-rc03/build && \
cmake -DWITH_DEBUG_SYMBOLS=OFF .. && make -j8 && make install && rm -rf /usr/local/lib/*.so* && \
rm -rf /opt/incubator-brpc-0.9.7-rc03/build/output/bin
RUN mkdir -p /opt/incubator-brpc-0.9.7-rc03/bld && cd /opt/incubator-brpc-0.9.7-rc03/bld && \
cmake -DWITH_DEBUG_SYMBOLS=OFF -DWITH_GLOG=ON .. && \
make -j8 && make install && rm -rf /usr/local/lib/*.so* && \
rm -rf /opt/incubator-brpc-0.9.7-rc03/bld/output/bin
ADD https://github.com/baidu/braft/archive/v1.1.0.tar.gz /opt/braft-v1.1.0.tar.gz
RUN tar -C /opt -xf /opt/braft-v1.1.0.tar.gz
COPY patches/braft_cmakelists.txt /opt/braft-1.1.0/src/CMakeLists.txt
RUN chown root:root /opt/braft-1.1.0/src/CMakeLists.txt
RUN mkdir -p /opt/braft-1.1.0/build && cd /opt/braft-1.1.0/build && \
cmake -DWITH_DEBUG_SYMBOLS=ON .. && make -j8 && make install && rm -rf /usr/local/lib/*.so*
ADD https://github.com/baidu/braft/archive/v1.1.1.tar.gz /opt/braft-v1.1.1.tar.gz
RUN tar -C /opt -xf /opt/braft-v1.1.1.tar.gz
COPY patches/braft_cmakelists.txt /opt/braft-1.1.1/src/CMakeLists.txt
RUN chown root:root /opt/braft-1.1.1/src/CMakeLists.txt
RUN mkdir -p /opt/braft-1.1.1/bld && cd /opt/braft-1.1.1/bld && \
cmake -DWITH_DEBUG_SYMBOLS=ON -DBRPC_WITH_GLOG=ON .. && make -j8 && make install && rm -rf /usr/local/lib/*.so* && \
rm -rf /opt/braft-1.1.1/bld/output/bin
ENV CC /usr/local/gcc-6.4.0/bin/gcc
ENV CXX /usr/local/gcc-6.4.0/bin/g++

View File

@ -35,8 +35,8 @@ add_library(brpc-static STATIC $<TARGET_OBJECTS:BUTIL_LIB>
$<TARGET_OBJECTS:SOURCES_LIB>
$<TARGET_OBJECTS:PROTO_LIB>)
if(BRPC_WITH_THRIFT)
target_link_libraries(brpc-static thrift)
if(BRPC_WITH_GLOG)
target_link_libraries(brpc-static ${GLOG_LIB})
endif()
SET_TARGET_PROPERTIES(brpc-static PROPERTIES OUTPUT_NAME brpc CLEAN_DIRECT_OUTPUT 1)

View File

@ -147,7 +147,7 @@ private:
std::string get_seq_id_key(uint32_t seq_id);
void highlight_result(const field &search_field, const std::vector<std::vector<art_leaf *>> &searched_queries,
const KV &field_order_kv, const nlohmann::json &document,
const KV* field_order_kv, const nlohmann::json &document,
StringUtils & string_utils, size_t snippet_threshold,
bool highlighted_fully,
highlight_t &highlight);
@ -155,10 +155,10 @@ private:
void remove_document(nlohmann::json & document, const uint32_t seq_id, bool remove_from_store);
void populate_overrides(std::string query,
const std::map<std::string, size_t>& pinned_hits,
const std::map<size_t, std::vector<std::string>>& pinned_hits,
const std::vector<std::string>& hidden_hits,
std::map<uint32_t, size_t> & id_pos_map,
std::vector<uint32_t> & included_ids, std::vector<uint32_t> & excluded_ids);
std::map<size_t, std::vector<uint32_t>>& include_ids,
std::vector<uint32_t> & excluded_ids);
static bool facet_count_compare(const std::pair<uint64_t, facet_count_t>& a,
const std::pair<uint64_t, facet_count_t>& b) {
@ -236,8 +236,10 @@ public:
const size_t snippet_threshold = 30,
const std::string & highlight_full_fields = "",
size_t typo_tokens_threshold = Index::TYPO_TOKENS_THRESHOLD,
const std::map<std::string, size_t>& pinned_hits={},
const std::vector<std::string>& hidden_hits={});
const std::map<size_t, std::vector<std::string>>& pinned_hits={},
const std::vector<std::string>& hidden_hits={},
const std::vector<std::string>& group_by_fields={},
const size_t group_limit = 0);
Option<nlohmann::json> get(const std::string & id);
@ -271,6 +273,8 @@ public:
const size_t PER_PAGE_MAX = 250;
const size_t GROUP_LIMIT_MAX = 99;
// Using a $ prefix so that these meta keys stay above record entries in a lexicographically ordered KV store
static constexpr const char* COLLECTION_META_PREFIX = "$CM";
static constexpr const char* COLLECTION_NEXT_SEQ_PREFIX = "$CS";
@ -280,5 +284,9 @@ public:
void facet_value_to_string(const facet &a_facet, const facet_count_t &facet_count, const nlohmann::json &document,
std::string &value);
void aggregate_topster(size_t query_index, Topster &topster, Topster *index_topster) const;
void populate_result_kvs(Topster *topster, std::vector<std::vector<KV *>> &result_kvs) const;
};

View File

@ -146,6 +146,7 @@ struct token_pos_cost_t {
struct facet_count_t {
uint32_t count;
spp::sparse_hash_set<uint64_t> groups; // used for faceting grouped results
// used to fetch the actual document and value for representation
uint32_t doc_id;

View File

@ -117,16 +117,16 @@ struct http_req {
uint64_t route_hash;
std::map<std::string, std::string> params;
std::string body;
uint64_t seed;
std::string metadata;
http_req(): route_hash(1), seed(random_uint64_t()) {
http_req(): route_hash(1) {
}
http_req(h2o_req_t* _req, const std::string & http_method, uint64_t route_hash,
const std::map<std::string, std::string> & params, std::string body):
_req(_req), http_method(http_method), route_hash(route_hash), params(params),
body(body), seed(random_uint64_t()) {
body(body) {
}
@ -136,7 +136,7 @@ struct http_req {
nlohmann::json content = nlohmann::json::parse(serialized_content);
route_hash = content["route_hash"];
body = content["body"];
seed = content["seed"];
metadata = content.count("metadata") != 0 ? content["metadata"] : "";
for (nlohmann::json::iterator it = content["params"].begin(); it != content["params"].end(); ++it) {
params.emplace(it.key(), it.value());
@ -150,16 +150,10 @@ struct http_req {
content["route_hash"] = route_hash;
content["params"] = params;
content["body"] = body;
content["seed"] = seed;
content["metadata"] = metadata;
return content.dump();
}
uint64_t random_uint64_t() {
thread_local std::mt19937 rg(std::random_device{}());
thread_local std::uniform_int_distribution<uint64_t> pick(0, std::numeric_limits<uint64_t>::max());
return pick(rg);
}
};
struct request_response {

View File

@ -13,6 +13,7 @@
#include <json.hpp>
#include <field.h>
#include <option.h>
#include <set>
#include "string_utils.h"
struct token_candidates {
@ -26,23 +27,27 @@ struct search_args {
std::vector<std::string> search_fields;
std::vector<filter> filters;
std::vector<facet> facets;
std::vector<uint32_t> included_ids;
std::map<size_t, std::map<size_t, uint32_t>> included_ids;
std::vector<uint32_t> excluded_ids;
std::vector<sort_by> sort_fields_std;
facet_query_t facet_query;
int num_typos;
size_t max_facet_values;
size_t max_hits;
size_t per_page;
size_t page;
token_ordering token_order;
bool prefix;
size_t drop_tokens_threshold;
size_t typo_tokens_threshold;
std::vector<KV> raw_result_kvs;
std::vector<std::string> group_by_fields;
size_t group_limit;
size_t all_result_ids_len;
spp::sparse_hash_set<uint64_t> groups_processed;
std::vector<std::vector<art_leaf*>> searched_queries;
std::vector<KV> override_result_kvs;
Topster* topster;
Topster* curated_topster;
std::vector<std::vector<KV*>> raw_result_kvs;
std::vector<std::vector<KV*>> override_result_kvs;
Option<uint32_t> outcome;
search_args(): outcome(0) {
@ -50,18 +55,28 @@ struct search_args {
}
search_args(std::string query, std::vector<std::string> search_fields, std::vector<filter> filters,
std::vector<facet> facets, std::vector<uint32_t> included_ids, std::vector<uint32_t> excluded_ids,
std::vector<facet> facets, std::map<size_t, std::map<size_t, uint32_t>> included_ids, std::vector<uint32_t> excluded_ids,
std::vector<sort_by> sort_fields_std, facet_query_t facet_query, int num_typos, size_t max_facet_values,
size_t max_hits, size_t per_page, size_t page, token_ordering token_order, bool prefix,
size_t drop_tokens_threshold, size_t typo_tokens_threshold):
size_t drop_tokens_threshold, size_t typo_tokens_threshold,
const std::vector<std::string>& group_by_fields, size_t group_limit):
query(query), search_fields(search_fields), filters(filters), facets(facets), included_ids(included_ids),
excluded_ids(excluded_ids), sort_fields_std(sort_fields_std), facet_query(facet_query), num_typos(num_typos),
max_facet_values(max_facet_values), max_hits(max_hits), per_page(per_page),
max_facet_values(max_facet_values), per_page(per_page),
page(page), token_order(token_order), prefix(prefix),
drop_tokens_threshold(drop_tokens_threshold), typo_tokens_threshold(typo_tokens_threshold),
group_by_fields(group_by_fields), group_limit(group_limit),
all_result_ids_len(0), outcome(0) {
const size_t topster_size = std::max((size_t)1, max_hits); // needs to be at least 1 since scoring is mandatory
topster = new Topster(topster_size, group_limit);
curated_topster = new Topster(topster_size, group_limit);
}
~search_args() {
delete topster;
delete curated_topster;
};
};
struct index_record {
@ -149,22 +164,23 @@ private:
void do_facets(std::vector<facet> & facets, facet_query_t & facet_query,
const uint32_t* result_ids, size_t results_size);
void drop_facets(std::vector<facet> & facets, const std::vector<uint32_t> & ids);
void search_field(const uint8_t & field_id, const std::string & query,
const std::string & field, uint32_t *filter_ids, size_t filter_ids_length,
const std::vector<uint32_t>& curated_ids,
std::vector<facet> & facets, const std::vector<sort_by> & sort_fields,
const int num_typos, std::vector<std::vector<art_leaf*>> & searched_queries,
Topster & topster, uint32_t** all_result_ids,
size_t & all_result_ids_len, const token_ordering token_order = FREQUENCY,
const bool prefix = false,
Topster* topster, spp::sparse_hash_set<uint64_t>& groups_processed,
uint32_t** all_result_ids, size_t & all_result_ids_len,
const token_ordering token_order = FREQUENCY, const bool prefix = false,
const size_t drop_tokens_threshold = Index::DROP_TOKENS_THRESHOLD,
const size_t typo_tokens_threshold = Index::TYPO_TOKENS_THRESHOLD);
void search_candidates(const uint8_t & field_id, uint32_t* filter_ids, size_t filter_ids_length,
const std::vector<uint32_t>& curated_ids,
const std::vector<sort_by> & sort_fields, std::vector<token_candidates> & token_to_candidates,
const token_ordering token_order, std::vector<std::vector<art_leaf*>> & searched_queries,
Topster & topster, uint32_t** all_result_ids,
std::vector<std::vector<art_leaf*>> & searched_queries,
Topster* topster, spp::sparse_hash_set<uint64_t>& groups_processed,
uint32_t** all_result_ids,
size_t & all_result_ids_len,
const size_t typo_tokens_threshold);
@ -196,14 +212,19 @@ private:
void remove_and_shift_offset_index(sorted_array &offset_index, const uint32_t *indices_sorted,
const uint32_t indices_length);
void collate_curated_ids(const std::string & query, const std::string & field, const uint8_t field_id,
const std::vector<uint32_t> & included_ids,
Topster & curated_topster, std::vector<std::vector<art_leaf*>> & searched_queries);
void collate_included_ids(const std::string & query, const std::string & field, const uint8_t field_id,
const std::map<size_t, std::map<size_t, uint32_t>> & included_ids_map,
Topster* curated_topster, std::vector<std::vector<art_leaf*>> & searched_queries);
uint64_t facet_token_hash(const field & a_field, const std::string &token);
void compute_facet_stats(facet &a_facet, int64_t raw_value, const std::string & field_type);
// reference: https://stackoverflow.com/a/27952689/131050
uint64_t hash_combine(uint64_t lhs, uint64_t rhs) const {
lhs ^= rhs + 0x517cc1b727220a95 + (lhs << 6) + (lhs >> 2);
return lhs;
}
public:
Index() = delete;
@ -218,12 +239,18 @@ public:
void search(Option<uint32_t> & outcome, const std::string & query, const std::vector<std::string> & search_fields,
const std::vector<filter> & filters, std::vector<facet> & facets,
facet_query_t & facet_query,
const std::vector<uint32_t> & included_ids, const std::vector<uint32_t> & excluded_ids,
const std::map<size_t, std::map<size_t, uint32_t>> & included_ids_map,
const std::vector<uint32_t> & excluded_ids,
const std::vector<sort_by> & sort_fields_std, const int num_typos,
const size_t max_hits, const size_t per_page, const size_t page, const token_ordering token_order,
const bool prefix, const size_t drop_tokens_threshold, std::vector<KV> & raw_result_kvs,
size_t & all_result_ids_len, std::vector<std::vector<art_leaf*>> & searched_queries,
std::vector<KV> & override_result_kvs, const size_t typo_tokens_threshold);
Topster* topster, Topster* curated_topster,
const size_t per_page, const size_t page, const token_ordering token_order,
const bool prefix, const size_t drop_tokens_threshold,
size_t & all_result_ids_len,
spp::sparse_hash_set<uint64_t>& groups_processed,
std::vector<std::vector<art_leaf*>> & searched_queries,
std::vector<std::vector<KV*>> & raw_result_kvs,
std::vector<std::vector<KV*>> & override_result_kvs,
const size_t typo_tokens_threshold);
Option<uint32_t> remove(const uint32_t seq_id, nlohmann::json & document);
@ -235,7 +262,8 @@ public:
std::vector<std::vector<std::vector<uint16_t>>> &array_token_positions);
void score_results(const std::vector<sort_by> & sort_fields, const uint16_t & query_index, const uint8_t & field_id,
const uint32_t total_cost, Topster &topster, const std::vector<art_leaf *> & query_suggestion,
const uint32_t total_cost, Topster* topster, const std::vector<art_leaf *> & query_suggestion,
spp::sparse_hash_set<uint64_t>& groups_processed,
const uint32_t *result_ids, const size_t result_size) const;
static int32_t get_points_from_doc(const nlohmann::json &document, const std::string & default_sorting_field);
@ -278,7 +306,7 @@ public:
bool processed; // prevents spurious wake up of the main thread
bool terminate; // used for interrupting the thread during tear down
search_args search_params;
search_args* search_params;
static void populate_array_token_positions(std::vector<std::vector<std::vector<uint16_t>>> & array_token_positions,
const art_leaf *token_leaf, uint32_t doc_index);
@ -286,5 +314,7 @@ public:
int get_bounded_typo_cost(const size_t max_cost, const size_t token_len) const;
static int64_t float_to_in64_t(float n);
uint64_t get_distinct_id(const std::unordered_map<std::string, size_t> &facet_to_id, const uint32_t seq_id) const;
};
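The `hash_combine` helper above (taken from the linked Stack Overflow answer) is what `get_distinct_id` builds on to reduce a document's group_by field values to a single 64-bit group key. Below is a minimal, self-contained sketch of that idea; `compute_distinct_id` and the use of `std::hash` on raw strings are illustrative assumptions for this example, not code from this commit:

```
// Sketch only: shows how hash_combine can fold several field-value hashes
// into one distinct group id. compute_distinct_id is a hypothetical helper.
#include <cstdint>
#include <functional>
#include <iostream>
#include <string>
#include <vector>

uint64_t hash_combine(uint64_t lhs, uint64_t rhs) {
    lhs ^= rhs + 0x517cc1b727220a95 + (lhs << 6) + (lhs >> 2);
    return lhs;
}

uint64_t compute_distinct_id(const std::vector<std::string>& group_field_values) {
    uint64_t distinct_id = 1;
    for (const std::string& value : group_field_values) {
        distinct_id = hash_combine(distinct_id, std::hash<std::string>{}(value));
    }
    return distinct_id;
}

int main() {
    // documents sharing the same group_by values map to the same group id
    std::cout << compute_distinct_id({"red", "nike"}) << "\n";
    std::cout << compute_distinct_id({"red", "nike"}) << "\n";  // same id
    std::cout << compute_distinct_id({"blue", "nike"}) << "\n"; // different id
}
```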

View File

@ -1,5 +1,3 @@
#pragma once
#include <string>
#include <iostream>
#include <butil/logging.h>
#include <glog/logging.h>

View File

@ -151,8 +151,18 @@ struct StringUtils {
}
char * p ;
strtoull(s.c_str(), &p, 10);
return (*p == 0);
unsigned long long ull = strtoull(s.c_str(), &p, 10);
return (*p == 0) && ull <= std::numeric_limits<uint64_t>::max();
}
static bool is_uint32_t(const std::string &s) {
if(s.empty()) {
return false;
}
char * p ;
unsigned long ul = strtoul(s.c_str(), &p, 10);
return (*p == 0) && ul <= std::numeric_limits<uint32_t>::max();
}
static void toupper(std::string& str) {
@ -234,7 +244,7 @@ struct StringUtils {
return hash != std::numeric_limits<uint64_t>::max() ? hash : (std::numeric_limits<uint64_t>::max()-1);
}
static std::string randstring(size_t length, uint64_t seed);
static std::string randstring(size_t length);
static std::string hmac(const std::string& key, const std::string& msg);
};

View File

@ -118,6 +118,23 @@ private:
}
}
static unsigned long linux_get_mem_available_bytes() {
std::string token;
std::ifstream file("/proc/meminfo");
while(file >> token) {
if(token == "MemAvailable:") {
unsigned long mem_kb;
if(file >> mem_kb) {
return mem_kb * 1000;
} else {
return 0;
}
}
}
return 0; // nothing found
}
public:
void get(const std::string & data_dir_path, nlohmann::json& result);

View File

@ -5,16 +5,26 @@
#include <cstdio>
#include <algorithm>
#include <sparsepp.h>
#include <match_score.h>
#include <number.h>
struct KV {
uint8_t field_id;
uint16_t query_index;
uint16_t array_index;
uint64_t key;
uint64_t match_score;
int64_t scores[3]; // match score + 2 custom attributes
uint8_t field_id{};
uint16_t query_index{};
uint16_t array_index{};
uint64_t key{};
uint64_t distinct_key{};
uint64_t match_score{};
int64_t scores[3]{}; // match score + 2 custom attributes
KV(uint8_t fieldId, uint16_t queryIndex, uint64_t key, uint64_t distinct_key,
uint64_t match_score, const int64_t *scores):
field_id(fieldId), query_index(queryIndex), array_index(0), key(key),
distinct_key(distinct_key), match_score(match_score) {
this->scores[0] = scores[0];
this->scores[1] = scores[1];
this->scores[2] = scores[2];
}
KV() = default;
};
/*
@ -25,12 +35,19 @@ struct Topster {
uint32_t size;
KV *data;
KV* *kvs;
KV** kvs;
spp::sparse_hash_map<uint64_t, KV*> keys;
// For distinct, stores the min heap kv of each group_kv_map topster value
spp::sparse_hash_map<uint64_t, KV*> kv_map;
explicit Topster(size_t capacity): MAX_SIZE(capacity), size(0) {
// we allocate data first to get contiguous memory block whose indices are then assigned to `kvs`
spp::sparse_hash_map<uint64_t, Topster*> group_kv_map;
size_t distinct;
explicit Topster(size_t capacity): Topster(capacity, 0) {
}
explicit Topster(size_t capacity, size_t distinct): MAX_SIZE(capacity), size(0), distinct(distinct) {
// we allocate data first to get a memory block whose indices are then assigned to `kvs`
// we use separate **kvs for easier pointer swaps
data = new KV[capacity];
kvs = new KV*[capacity];
@ -38,15 +55,25 @@ struct Topster {
for(size_t i=0; i<capacity; i++) {
data[i].field_id = 0;
data[i].query_index = 0;
data[i].array_index = i;
data[i].key = 0;
data[i].distinct_key = 0;
data[i].match_score = 0;
kvs[i] = &data[i];
}
}
~Topster() {
delete [] data;
delete [] kvs;
delete[] data;
delete[] kvs;
for(auto& kv: group_kv_map) {
delete kv.second;
}
data = nullptr;
kvs = nullptr;
group_kv_map.clear();
}
static inline void swapMe(KV** a, KV** b) {
@ -59,133 +86,193 @@ struct Topster {
(*b)->array_index = a_index;
}
static inline void replace_key_values(const uint64_t &key, const uint8_t &field_id, const uint16_t &query_index,
const uint64_t &match_score, const int64_t *scores, uint32_t start,
KV* *kvs, spp::sparse_hash_map<uint64_t, KV*>& keys) {
kvs[start]->key = key;
kvs[start]->field_id = field_id;
kvs[start]->query_index = query_index;
kvs[start]->array_index = start;
kvs[start]->match_score = match_score;
kvs[start]->scores[0] = scores[0];
kvs[start]->scores[1] = scores[1];
kvs[start]->scores[2] = scores[2];
bool add(KV* kv) {
//LOG(INFO) << "kv_map size: " << kv_map.size() << " -- kvs[0]: " << kvs[0]->match_score;
/*for(auto kv: kv_map) {
LOG(INFO) << "kv key: " << kv.first << " => " << kv.second->match_score;
}*/
keys.erase(kvs[start]->key);
keys[key] = kvs[start];
}
bool less_than_min_heap = (size >= MAX_SIZE) && is_smaller_equal(kv, kvs[0]);
size_t heap_op_index = 0;
void add(const uint64_t &key, const uint8_t &field_id, const uint16_t &query_index, const uint64_t &match_score,
const int64_t scores[3]) {
if (size >= MAX_SIZE) {
if(!is_greater(kvs[0], scores)) {
// when incoming value is less than the smallest in the heap, ignore
return;
if(!distinct && less_than_min_heap) {
// for non-distinct, if the incoming value is smaller than the min heap, ignore it
return false;
}
bool SIFT_DOWN = true;
if(distinct) {
const auto& found_it = group_kv_map.find(kv->distinct_key);
bool is_duplicate_key = (found_it != group_kv_map.end());
if(!is_duplicate_key && less_than_min_heap) {
// for distinct, if a non-duplicate kv is smaller than the min heap, we ignore it
return false;
}
uint32_t start = 0;
if(is_duplicate_key) {
// if min heap (group_topster.kvs[0]) changes, we have to update kvs and sift
Topster* group_topster = found_it->second;
KV old_min_heap_kv = *kv_map[kv->distinct_key];
bool added = group_topster->add(kv);
// When the key already exists and has a greater score, ignore. Otherwise, we have to replace.
// NOTE: we don't consider primary and secondary attrs here because they will be the same for a given key.
if(keys.count(key) != 0) {
const KV* existing = keys.at(key);
if(match_score <= existing->match_score) {
return ;
if(!added) {
return false;
}
// replace and sift down
start = existing->array_index;
}
// if new kv score is greater than previous min heap score we sift down, otherwise sift up
SIFT_DOWN = is_greater(kv, &old_min_heap_kv);
replace_key_values(key, field_id, query_index, match_score, scores, start, kvs, keys);
// new kv is different from old_min_heap_kv so we have to sift heap
heap_op_index = old_min_heap_kv.array_index;
// sift down to maintain heap property
while ((2*start+1) < MAX_SIZE) {
uint32_t next = (2 * start + 1);
if (next+1 < MAX_SIZE && is_greater_kv(kvs[next], kvs[next+1])) {
next++;
}
// erase current min heap key from kv_map
kv_map.erase(old_min_heap_kv.distinct_key);
if (is_greater_kv(kvs[start], kvs[next])) {
swapMe(&kvs[start], &kvs[next]);
// kv will be copied into the pointer at heap_op_index
kv_map.emplace(kv->distinct_key, kvs[heap_op_index]);
} else {
// kv is guaranteed to be > current min heap: kvs[0]
// create fresh topster for this distinct group key since it does not exist
Topster* group_topster = new Topster(distinct, 0);
group_topster->add(kv);
// add new group key to map
group_kv_map.emplace(kv->distinct_key, group_topster);
// find heap operation index for updating kvs
if(size < MAX_SIZE) {
// there is enough space in the heap, so we just copy to the end
SIFT_DOWN = false;
heap_op_index = size;
size++;
} else {
break;
SIFT_DOWN = true;
// max size is reached so we are forced to replace current min heap element (kvs[0])
heap_op_index = 0;
// remove current min heap group key from maps
delete group_kv_map[kvs[heap_op_index]->distinct_key];
group_kv_map.erase(kvs[heap_op_index]->distinct_key);
kv_map.erase(kvs[heap_op_index]->distinct_key);
}
start = next;
// kv will be copied into the pointer at heap_op_index
kv_map.emplace(kv->distinct_key, kvs[heap_op_index]);
}
} else {
uint32_t start = size;
bool key_found = false;
// When the key already exists and has a greater score, ignore. Otherwise, we have to replace
if(keys.count(key) != 0) {
const KV* existing = keys.at(key);
if(match_score <= existing->match_score) {
return ;
} else { // not distinct
//LOG(INFO) << "Searching for key: " << kv->key;
const auto& found_it = kv_map.find(kv->key);
bool is_duplicate_key = (found_it != kv_map.end());
/*
is_duplicate_key: SIFT_DOWN regardless of `size`.
Else:
Do SIFT_UP if size < max_size
Else SIFT_DOWN
*/
if(is_duplicate_key) {
// Need to check if kv is greater than existing duplicate kv.
KV* existing_kv = found_it->second;
//LOG(INFO) << "existing_kv: " << existing_kv->key << " -> " << existing_kv->match_score;
if(is_smaller_equal(kv, existing_kv)) {
return false;
}
// replace and sift down
start = existing->array_index;
key_found = true;
}
SIFT_DOWN = true;
replace_key_values(key, field_id, query_index, match_score, scores, start, kvs, keys);
// replace existing kv and sift down
heap_op_index = existing_kv->array_index;
kv_map.erase(kvs[heap_op_index]->key);
if(key_found) {
// need to sift down if it's a replace
while ((2*start+1) < size) {
uint32_t next = (2 * start + 1);
if (next+1 < size && is_greater_kv(kvs[next], kvs[next+1])) {
next++;
}
// kv will be copied into the pointer at heap_op_index
kv_map.emplace(kv->key, kvs[heap_op_index]);
} else { // not duplicate
if (is_greater_kv(kvs[start], kvs[next])) {
swapMe(&kvs[start], &kvs[next]);
} else {
break;
}
start = next;
}
return ;
}
while(start > 0) {
uint32_t parent = (start-1)/2;
if (is_greater_kv(kvs[parent], kvs[start])) {
swapMe(&kvs[start], &kvs[parent]);
start = parent;
if(size < MAX_SIZE) {
// we just copy to end of array
SIFT_DOWN = false;
heap_op_index = size;
size++;
} else {
break;
// kv is guaranteed to be > min heap.
// we have to replace min heap element since array is full
SIFT_DOWN = true;
heap_op_index = 0;
kv_map.erase(kvs[heap_op_index]->key);
}
}
if(keys.count(key) != 0) {
size++;
// kv will be copied into the pointer at heap_op_index
kv_map.emplace(kv->key, kvs[heap_op_index]);
}
}
// we have to replace the existing element in the heap and sift down
kv->array_index = heap_op_index;
*kvs[heap_op_index] = *kv;
// sift up/down to maintain heap property
if(SIFT_DOWN) {
while ((2 * heap_op_index + 1) < size) {
uint32_t next = (2 * heap_op_index + 1); // left child
if (next+1 < size && is_greater(kvs[next], kvs[next + 1])) {
// for min heap we compare with the minimum of children
next++; // right child (2n + 2)
}
if (is_greater(kvs[heap_op_index], kvs[next])) {
swapMe(&kvs[heap_op_index], &kvs[next]);
} else {
break;
}
heap_op_index = next;
}
} else {
// SIFT UP
while(heap_op_index > 0) {
uint32_t parent = (heap_op_index - 1) / 2;
if (is_greater(kvs[parent], kvs[heap_op_index])) {
swapMe(&kvs[heap_op_index], &kvs[parent]);
heap_op_index = parent;
} else {
break;
}
}
}
return true;
}
static bool is_greater(const struct KV* i, const int64_t scores[3]) {
return std::tie(scores[0], scores[1], scores[2]) >
std::tie(i->scores[0], i->scores[1], i->scores[2]);
}
static bool is_greater_kv(const struct KV* i, const struct KV* j) {
static bool is_greater(const struct KV* i, const struct KV* j) {
return std::tie(i->scores[0], i->scores[1], i->scores[2], i->key) >
std::tie(j->scores[0], j->scores[1], j->scores[2], j->key);
}
static bool is_greater_kv_value(const struct KV & i, const struct KV & j) {
return std::tie(i.scores[0], i.scores[1], i.scores[2], i.key) >
std::tie(j.scores[0], j.scores[1], j.scores[2], j.key);
static bool is_smaller_equal(const struct KV* i, const struct KV* j) {
return std::tie(i->scores[0], i->scores[1], i->scores[2]) <=
std::tie(j->scores[0], j->scores[1], j->scores[2]);
}
static bool is_greater_kv_group(const std::vector<KV*>& i, const std::vector<KV*>& j) {
return std::tie(i[0]->scores[0], i[0]->scores[1], i[0]->scores[2], i[0]->key) >
std::tie(j[0]->scores[0], j[0]->scores[1], j[0]->scores[2], j[0]->key);
}
// topster must be sorted before being iterated upon, to remove dead array entries
void sort() {
std::stable_sort(kvs, kvs+size, is_greater_kv);
std::stable_sort(kvs, kvs + size, is_greater);
for(auto &group_topster: group_kv_map) {
group_topster.second->sort();
}
}
void clear(){
@ -196,6 +283,10 @@ struct Topster {
return kvs[index]->key;
}
uint64_t getDistinctKeyAt(uint32_t index) {
return kvs[index]->distinct_key;
}
KV* getKV(uint32_t index) {
return kvs[index];
}
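To recap the distinct path of `add()` above: every KV carries a `distinct_key`; KVs that share a key are collected into a per-group child Topster held in `group_kv_map`, while the outer min-heap ranks groups by their best KV and evicts the weakest group once `MAX_SIZE` is reached. The following is a simplified, standalone sketch of that grouped top-K idea; the names (`Hit`, `top_k_grouped`) are illustrative, and it omits the heap mechanics and the cap on the number of groups:

```
// Sketch: keep at most group_limit best-scored hits per distinct key.
#include <cstdint>
#include <algorithm>
#include <iostream>
#include <map>
#include <vector>

struct Hit {
    uint64_t key;          // document id
    uint64_t distinct_key; // hash of the group_by field values
    int64_t score;
};

std::map<uint64_t, std::vector<Hit>> top_k_grouped(const std::vector<Hit>& hits,
                                                   size_t group_limit) {
    std::map<uint64_t, std::vector<Hit>> groups;
    for (const Hit& h : hits) {
        auto& g = groups[h.distinct_key];
        g.push_back(h);
        std::sort(g.begin(), g.end(),
                  [](const Hit& a, const Hit& b) { return a.score > b.score; });
        if (g.size() > group_limit) {
            g.pop_back(); // evict the weakest hit of the group
        }
    }
    return groups;
}

int main() {
    std::vector<Hit> hits = {{1, 100, 50}, {2, 100, 70}, {3, 200, 60}, {4, 100, 40}};
    for (const auto& [dk, group] : top_k_grouped(hits, 2)) {
        std::cout << "group " << dk << ":";
        for (const Hit& h : group) std::cout << " doc=" << h.key << "(" << h.score << ")";
        std::cout << "\n";
    }
    // group 100 keeps docs 2 and 1; group 200 keeps doc 3
}
```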

View File

@ -1,10 +1,10 @@
#pragma once
#include "logger.h"
#include <string>
#include <iostream>
#include <cmdline.h>
#include "config.h"
#include "logger.h"
#include "store.h"
#include "collection_manager.h"
#include <csignal>

View File

@ -61,12 +61,12 @@ void array::remove_index(uint32_t start_index, uint32_t end_index) {
}
uint32_t size_required = (uint32_t) (unsorted_append_size_required(max, new_index) * FOR_GROWTH_FACTOR);
uint8_t *out = new uint8_t[size_required];
uint8_t *out = (uint8_t *) malloc(size_required * sizeof *out);
uint32_t actual_size = for_compress_unsorted(new_array, out, new_index);
delete[] curr_array;
delete[] new_array;
delete[] in;
free(in);
in = out;
length = new_index;

View File

@ -107,14 +107,16 @@ size_t ArrayUtils::exclude_scalar(const uint32_t *A, const size_t lenA,
size_t indexA = 0, indexB = 0, res_index = 0;
if(A == nullptr && B == nullptr) {
return 0;
*out = nullptr;
return 0;
}
if(A == nullptr) {
*out = nullptr;
return 0;
}
if(B == nullptr) {
if(lenB == 0 || B == nullptr) {
*out = new uint32_t[lenA];
memcpy(*out, A, lenA * sizeof(uint32_t));
return lenA;

View File

@ -1384,8 +1384,6 @@ int art_fuzzy_search(art_tree *t, const unsigned char *term, const int term_len,
art_fuzzy_recurse(0, 0, t->root, -1, term, term_len, irow, jrow, min_cost, max_cost, prefix, nodes);
}
PROCESS_NODES:
if(token_order == FREQUENCY) {
std::sort(nodes.begin(), nodes.end(), compare_art_node_frequency);
} else {

View File

@ -305,54 +305,68 @@ void Collection::prune_document(nlohmann::json &document, const spp::sparse_hash
}
void Collection::populate_overrides(std::string query,
const std::map<std::string, size_t>& pinned_hits,
const std::map<size_t, std::vector<std::string>>& pinned_hits,
const std::vector<std::string>& hidden_hits,
std::map<uint32_t, size_t> & id_pos_map,
std::vector<uint32_t> & included_ids,
std::map<size_t, std::vector<uint32_t>>& include_ids,
std::vector<uint32_t> & excluded_ids) {
StringUtils::tolowercase(query);
std::set<uint32_t> excluded_set;
// If pinned or hidden hits are provided, they take precedence over overrides
// have to ensure that hidden hits take precedence over included hits
if(!hidden_hits.empty()) {
for(const auto & hit: hidden_hits) {
Option<uint32_t> seq_id_op = doc_id_to_seq_id(hit);
if(seq_id_op.ok()) {
excluded_ids.push_back(seq_id_op.get());
excluded_set.insert(seq_id_op.get());
}
}
}
for(const auto & override_kv: overrides) {
const auto & override = override_kv.second;
if( (override.rule.match == override_t::MATCH_EXACT && override.rule.query == query) ||
(override.rule.match == override_t::MATCH_CONTAINS && query.find(override.rule.query) != std::string::npos) ) {
for(const auto & hit: override.add_hits) {
Option<uint32_t> seq_id_op = doc_id_to_seq_id(hit.doc_id);
if(seq_id_op.ok()) {
included_ids.push_back(seq_id_op.get());
id_pos_map[seq_id_op.get()] = hit.position;
}
}
// have to ensure that dropped hits take precedence over added hits
for(const auto & hit: override.drop_hits) {
Option<uint32_t> seq_id_op = doc_id_to_seq_id(hit.doc_id);
if(seq_id_op.ok()) {
excluded_ids.push_back(seq_id_op.get());
excluded_set.insert(seq_id_op.get());
}
}
for(const auto & hit: override.add_hits) {
Option<uint32_t> seq_id_op = doc_id_to_seq_id(hit.doc_id);
if(!seq_id_op.ok()) {
continue;
}
uint32_t seq_id = seq_id_op.get();
bool excluded = (excluded_set.count(seq_id) != 0);
if(!excluded) {
include_ids[hit.position].push_back(seq_id);
}
}
}
}
// If pinned or hidden hits are provided, they take precedence over overrides
if(!pinned_hits.empty()) {
for(const auto & hit: pinned_hits) {
Option<uint32_t> seq_id_op = doc_id_to_seq_id(hit.first);
if(seq_id_op.ok()) {
included_ids.push_back(seq_id_op.get());
id_pos_map[seq_id_op.get()] = hit.second;
}
}
}
if(!hidden_hits.empty()) {
for(const auto & hit: hidden_hits) {
Option<uint32_t> seq_id_op = doc_id_to_seq_id(hit);
if(seq_id_op.ok()) {
included_ids.erase(std::remove(included_ids.begin(), included_ids.end(), seq_id_op.get()), included_ids.end());
id_pos_map.erase(seq_id_op.get());
excluded_ids.push_back(seq_id_op.get());
for(const auto& pos_ids: pinned_hits) {
size_t pos = pos_ids.first;
for(const std::string& id: pos_ids.second) {
Option<uint32_t> seq_id_op = doc_id_to_seq_id(id);
if(!seq_id_op.ok()) {
continue;
}
uint32_t seq_id = seq_id_op.get();
bool excluded = (excluded_set.count(seq_id) != 0);
if(!excluded) {
include_ids[pos].push_back(seq_id);
}
}
}
}
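The net effect of `populate_overrides` is a fixed precedence order: hidden hits are resolved into an exclusion set first, and both override `add_hits` and `pinned_hits` consult that set before contributing to `include_ids`. Here is a minimal sketch of just that precedence rule, with simplified types and no doc-id-to-seq-id lookup:

```
// Sketch: hidden hits win over pinned hits. Ids and positions are made up.
#include <cstdint>
#include <iostream>
#include <map>
#include <set>
#include <vector>

int main() {
    std::set<uint32_t> excluded_set = {42}; // built from hidden_hits first

    // pinned_hits: position => candidate seq ids
    std::map<size_t, std::vector<uint32_t>> pinned_hits = {{1, {42, 7}}};

    std::map<size_t, std::vector<uint32_t>> include_ids; // position => seq ids
    for (const auto& [pos, ids] : pinned_hits) {
        for (uint32_t seq_id : ids) {
            if (excluded_set.count(seq_id) == 0) { // hidden takes precedence
                include_ids[pos].push_back(seq_id);
            }
        }
    }

    for (const auto& [pos, ids] : include_ids) {
        for (uint32_t id : ids) std::cout << "pos " << pos << " -> " << id << "\n";
    }
    // prints only "pos 1 -> 7": id 42 was hidden, so pinning it has no effect
}
```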
@ -371,20 +385,60 @@ Option<nlohmann::json> Collection::search(const std::string & query, const std::
const size_t snippet_threshold,
const std::string & highlight_full_fields,
size_t typo_tokens_threshold,
const std::map<std::string, size_t>& pinned_hits,
const std::vector<std::string>& hidden_hits) {
const std::map<size_t, std::vector<std::string>>& pinned_hits,
const std::vector<std::string>& hidden_hits,
const std::vector<std::string>& group_by_fields,
const size_t group_limit) {
if(query != "*" && search_fields.empty()) {
return Option<nlohmann::json>(400, "No search fields specified for the query.");
}
if(!group_by_fields.empty() && (group_limit == 0 || group_limit > GROUP_LIMIT_MAX)) {
return Option<nlohmann::json>(400, "Value of `group_limit` must be between 1 and " +
std::to_string(GROUP_LIMIT_MAX) + ".");
}
std::vector<uint32_t> included_ids;
std::vector<uint32_t> excluded_ids;
std::map<uint32_t, size_t> id_pos_map;
populate_overrides(query, pinned_hits, hidden_hits, id_pos_map, included_ids, excluded_ids);
std::map<size_t, std::vector<uint32_t>> include_ids; // position => list of IDs
populate_overrides(query, pinned_hits, hidden_hits, include_ids, excluded_ids);
std::map<uint32_t, std::vector<uint32_t>> index_to_included_ids;
/*for(auto kv: include_ids) {
LOG(INFO) << "key: " << kv.first;
for(auto val: kv.second) {
LOG(INFO) << val;
}
}
LOG(INFO) << "Excludes:";
for(auto id: excluded_ids) {
LOG(INFO) << id;
}
LOG(INFO) << "include_ids size: " << include_ids.size();
for(auto& group: include_ids) {
for(uint32_t& seq_id: group.second) {
LOG(INFO) << "seq_id: " << seq_id;
}
LOG(INFO) << "----";
}
*/
std::map<uint32_t, std::map<size_t, std::map<size_t, uint32_t>>> index_to_included_ids;
std::map<uint32_t, std::vector<uint32_t>> index_to_excluded_ids;
for(auto seq_id: included_ids) {
auto index_id = (seq_id % num_indices);
index_to_included_ids[index_id].push_back(seq_id);
for(const auto& pos_ids: include_ids) {
size_t outer_pos = pos_ids.first;
size_t ids_per_pos = std::max(size_t(1), group_limit);
for(size_t inner_pos = 0; inner_pos < std::min(ids_per_pos, pos_ids.second.size()); inner_pos++) {
auto seq_id = pos_ids.second[inner_pos];
auto index_id = (seq_id % num_indices);
index_to_included_ids[index_id][outer_pos][inner_pos] = seq_id;
//LOG(INFO) << "Adding seq_id " << seq_id << " to index_id " << index_id;
}
}
for(auto seq_id: excluded_ids) {
@ -408,6 +462,22 @@ Option<nlohmann::json> Collection::search(const std::string & query, const std::
}
}
// validate group by fields
for(const std::string & field_name: group_by_fields) {
if(search_schema.count(field_name) == 0) {
std::string error = "Could not find a field named `" + field_name + "` in the schema.";
return Option<nlohmann::json>(404, error);
}
field search_field = search_schema.at(field_name);
// must be a facet field
if(!search_field.is_facet()) {
std::string error = "Group by field `" + field_name + "` should be a facet field.";
return Option<nlohmann::json>(400, error);
}
}
// validate filter fields
std::vector<std::string> filter_blocks;
StringUtils::split(simple_filter_query, filter_blocks, "&&");
@ -515,7 +585,7 @@ Option<nlohmann::json> Collection::search(const std::string & query, const std::
}
// for a wildcard query, if filter is not specified, use default_sorting_field as a catch-all
if(query == "*" && filters.size() == 0) {
if(query == "*" && filters.empty()) {
field f = search_schema.at(default_sorting_field);
std::string max_value = f.is_float() ? std::to_string(std::numeric_limits<float>::max()) :
std::to_string(std::numeric_limits<int32_t>::max());
@ -631,19 +701,21 @@ Option<nlohmann::json> Collection::search(const std::string & query, const std::
const size_t max_hits = std::min((page * per_page), get_num_documents());
std::vector<std::vector<art_leaf*>> searched_queries; // search queries used for generating the results
std::vector<KV> raw_result_kvs;
std::vector<KV> override_result_kvs;
std::vector<std::vector<KV*>> raw_result_kvs;
std::vector<std::vector<KV*>> override_result_kvs;
size_t total_found = 0;
spp::sparse_hash_set<uint64_t> groups_processed; // used to calculate total_found for grouped query
// send data to individual index threads
size_t index_id = 0;
for(Index* index: indices) {
index->search_params = search_args(query, search_fields, filters, facets,
index_to_included_ids[index_id], index_to_excluded_ids[index_id],
sort_fields_std, facet_query, num_typos, max_facet_values, max_hits,
per_page, page, token_order, prefix,
drop_tokens_threshold, typo_tokens_threshold);
index->search_params = new search_args(query, search_fields, filters, facets,
index_to_included_ids[index_id], index_to_excluded_ids[index_id],
sort_fields_std, facet_query, num_typos, max_facet_values, max_hits,
per_page, page, token_order, prefix,
drop_tokens_threshold, typo_tokens_threshold,
group_by_fields, group_limit);
{
std::lock_guard<std::mutex> lk(index->m);
index->ready = true;
@ -655,6 +727,12 @@ Option<nlohmann::json> Collection::search(const std::string & query, const std::
Option<nlohmann::json> index_search_op({}); // stores the last error across all index threads
// for grouping we have to re-aggregate
const size_t topster_size = std::max((size_t)1, max_hits);
Topster topster(topster_size, group_limit);
Topster curated_topster(topster_size, group_limit);
for(Index* index: indices) {
// wait for the worker
{
@ -662,9 +740,9 @@ Option<nlohmann::json> Collection::search(const std::string & query, const std::
index->cv.wait(lk, [index]{return index->processed;});
}
if(!index->search_params.outcome.ok()) {
index_search_op = Option<nlohmann::json>(index->search_params.outcome.code(),
index->search_params.outcome.error());
if(!index->search_params->outcome.ok()) {
index_search_op = Option<nlohmann::json>(index->search_params->outcome.code(),
index->search_params->outcome.error());
}
if(!index_search_op.ok()) {
@ -672,34 +750,33 @@ Option<nlohmann::json> Collection::search(const std::string & query, const std::
continue;
}
for(auto & field_order_kv: index->search_params.raw_result_kvs) {
field_order_kv.query_index += searched_queries.size();
raw_result_kvs.push_back(field_order_kv);
}
aggregate_topster(searched_queries.size(), topster, index->search_params->topster);
aggregate_topster(searched_queries.size(), curated_topster, index->search_params->curated_topster);
for(auto & field_order_kv: index->search_params.override_result_kvs) {
field_order_kv.query_index += searched_queries.size();
override_result_kvs.push_back(field_order_kv);
}
searched_queries.insert(searched_queries.end(), index->search_params->searched_queries.begin(),
index->search_params->searched_queries.end());
searched_queries.insert(searched_queries.end(), index->search_params.searched_queries.begin(),
index->search_params.searched_queries.end());
for(size_t fi = 0; fi < index->search_params.facets.size(); fi++) {
auto & this_facet = index->search_params.facets[fi];
for(size_t fi = 0; fi < index->search_params->facets.size(); fi++) {
auto & this_facet = index->search_params->facets[fi];
auto & acc_facet = facets[fi];
for(auto & facet_kv: this_facet.result_map) {
size_t count = 0;
if(acc_facet.result_map.count(facet_kv.first) == 0) {
// not found, so set it
count = facet_kv.second.count;
if(index->search_params->group_limit) {
// we have to add all group sets
acc_facet.result_map[facet_kv.first].groups.insert(
facet_kv.second.groups.begin(), facet_kv.second.groups.end()
);
} else {
count = acc_facet.result_map[facet_kv.first].count + facet_kv.second.count;
size_t count = 0;
if(acc_facet.result_map.count(facet_kv.first) == 0) {
// not found, so set it
count = facet_kv.second.count;
} else {
count = acc_facet.result_map[facet_kv.first].count + facet_kv.second.count;
}
acc_facet.result_map[facet_kv.first].count = count;
}
acc_facet.result_map[facet_kv.first].count = count;
acc_facet.result_map[facet_kv.first].doc_id = facet_kv.second.doc_id;
acc_facet.result_map[facet_kv.first].array_pos = facet_kv.second.array_pos;
acc_facet.result_map[facet_kv.first].query_token_pos = facet_kv.second.query_token_pos;
@ -713,138 +790,195 @@ Option<nlohmann::json> Collection::search(const std::string & query, const std::
}
}
total_found += index->search_params.all_result_ids_len;
if(group_limit) {
groups_processed.insert(
index->search_params->groups_processed.begin(),
index->search_params->groups_processed.end()
);
} else {
total_found += index->search_params->all_result_ids_len;
}
}
if(!index_search_op.ok()) {
return index_search_op;
}
// All fields are sorted descending
std::sort(raw_result_kvs.begin(), raw_result_kvs.end(), Topster::is_greater_kv_value);
topster.sort();
curated_topster.sort();
// Sort based on position in overridden list
populate_result_kvs(&topster, raw_result_kvs);
populate_result_kvs(&curated_topster, override_result_kvs);
// for grouping we have to aggregate group set sizes to a count value
if(group_limit) {
for(auto& acc_facet: facets) {
for(auto& facet_kv: acc_facet.result_map) {
facet_kv.second.count = facet_kv.second.groups.size();
}
}
total_found = groups_processed.size() + override_result_kvs.size();
}
// All fields are sorted descending
std::sort(raw_result_kvs.begin(), raw_result_kvs.end(), Topster::is_greater_kv_group);
// Sort based on position in overridden list
std::sort(
override_result_kvs.begin(), override_result_kvs.end(),
[&id_pos_map](const KV & a, const KV & b) -> bool {
return id_pos_map[a.key] < id_pos_map[b.key];
[](const std::vector<KV*>& a, std::vector<KV*>& b) -> bool {
return a[0]->distinct_key < b[0]->distinct_key;
}
);
nlohmann::json result = nlohmann::json::object();
result["hits"] = nlohmann::json::array();
result["found"] = total_found;
std::vector<KV> result_kvs;
std::vector<std::vector<KV*>> result_group_kvs;
size_t override_kv_index = 0;
size_t raw_results_index = 0;
// merge raw results and override results
while(override_kv_index < override_result_kvs.size() && raw_results_index < raw_result_kvs.size()) {
if(override_kv_index < override_result_kvs.size() &&
id_pos_map.count(override_result_kvs[override_kv_index].key) != 0 &&
result_kvs.size() + 1 == id_pos_map[override_result_kvs[override_kv_index].key]) {
result_kvs.push_back(override_result_kvs[override_kv_index]);
override_kv_index++;
size_t result_position = result_group_kvs.size() + 1;
uint64_t override_position = override_result_kvs[override_kv_index][0]->distinct_key;
if(result_position == override_position) {
override_result_kvs[override_kv_index][0]->match_score = 0; // to identify curated result
result_group_kvs.push_back(override_result_kvs[override_kv_index]);
override_kv_index++;
} else {
result_kvs.push_back(raw_result_kvs[raw_results_index]);
result_group_kvs.push_back(raw_result_kvs[raw_results_index]);
raw_results_index++;
}
}
while(override_kv_index < override_result_kvs.size()) {
result_kvs.push_back(override_result_kvs[override_kv_index]);
override_result_kvs[override_kv_index][0]->match_score = 0; // to identify curated result
result_group_kvs.push_back({override_result_kvs[override_kv_index]});
override_kv_index++;
}
while(raw_results_index < raw_result_kvs.size()) {
result_kvs.push_back(raw_result_kvs[raw_results_index]);
result_group_kvs.push_back(raw_result_kvs[raw_results_index]);
raw_results_index++;
}
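These three loops interleave curated and organic results: a curated entry is emitted whenever the next 1-based output position matches its pinned position (carried in `distinct_key` here), and leftovers from either list are appended afterwards. A standalone sketch of that positional merge, with hypothetical names (`curated`, `ranked`):

```
// Sketch: merge pinned results at fixed positions into a ranked list.
#include <cstddef>
#include <iostream>
#include <string>
#include <utility>
#include <vector>

int main() {
    // (requested 1-based position, id), sorted by position
    std::vector<std::pair<size_t, std::string>> curated = {{1, "C1"}, {3, "C2"}};
    std::vector<std::string> ranked = {"R1", "R2", "R3"};

    std::vector<std::string> merged;
    size_t ci = 0, ri = 0;
    while (ci < curated.size() && ri < ranked.size()) {
        if (merged.size() + 1 == curated[ci].first) {
            merged.push_back(curated[ci++].second); // pinned slot reached
        } else {
            merged.push_back(ranked[ri++]);
        }
    }
    while (ci < curated.size()) merged.push_back(curated[ci++].second);
    while (ri < ranked.size()) merged.push_back(ranked[ri++]);

    for (const auto& id : merged) std::cout << id << " ";
    std::cout << "\n"; // prints: C1 R1 C2 R2 R3
}
```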
const long start_result_index = (page - 1) * per_page;
const long end_result_index = std::min(max_hits, result_kvs.size()) - 1; // could be -1 when max_hits is 0
const long end_result_index = std::min(max_hits, result_group_kvs.size()) - 1; // could be -1 when max_hits is 0
nlohmann::json result = nlohmann::json::object();
result["found"] = total_found;
std::string hits_key = group_limit ? "grouped_hits" : "hits";
result[hits_key] = nlohmann::json::array();
// construct results array
for(long result_kvs_index = start_result_index; result_kvs_index <= end_result_index; result_kvs_index++) {
const auto & field_order_kv = result_kvs[result_kvs_index];
const std::string& seq_id_key = get_seq_id_key((uint32_t) field_order_kv.key);
const std::vector<KV*> & kv_group = result_group_kvs[result_kvs_index];
nlohmann::json document;
const Option<bool> & document_op = get_document_from_store(seq_id_key, document);
if(!document_op.ok()) {
LOG(ERROR) << "Document fetch error. " << document_op.error();
continue;
nlohmann::json group_hits;
if(group_limit) {
group_hits["hits"] = nlohmann::json::array();
}
nlohmann::json wrapper_doc;
wrapper_doc["highlights"] = nlohmann::json::array();
std::vector<highlight_t> highlights;
StringUtils string_utils;
nlohmann::json& hits_array = group_limit ? group_hits["hits"] : result["hits"];
// find out if fields have to be highlighted fully
std::vector<std::string> fields_highlighted_fully_vec;
spp::sparse_hash_set<std::string> fields_highlighted_fully;
StringUtils::split(highlight_full_fields, fields_highlighted_fully_vec, ",");
for(const KV* field_order_kv: kv_group) {
const std::string& seq_id_key = get_seq_id_key((uint32_t) field_order_kv->key);
for(std::string & highlight_full_field: fields_highlighted_fully_vec) {
StringUtils::trim(highlight_full_field);
fields_highlighted_fully.emplace(highlight_full_field);
}
nlohmann::json document;
const Option<bool> & document_op = get_document_from_store(seq_id_key, document);
for(const std::string & field_name: search_fields) {
// should not pick excluded field for highlighting
if(exclude_fields.count(field_name) > 0) {
if(!document_op.ok()) {
LOG(ERROR) << "Document fetch error. " << document_op.error();
continue;
}
field search_field = search_schema.at(field_name);
if(query != "*" && (search_field.type == field_types::STRING ||
search_field.type == field_types::STRING_ARRAY)) {
nlohmann::json wrapper_doc;
wrapper_doc["highlights"] = nlohmann::json::array();
std::vector<highlight_t> highlights;
StringUtils string_utils;
bool highlighted_fully = (fields_highlighted_fully.find(field_name) != fields_highlighted_fully.end());
highlight_t highlight;
highlight_result(search_field, searched_queries, field_order_kv, document,
string_utils, snippet_threshold, highlighted_fully, highlight);
// find out if fields have to be highlighted fully
std::vector<std::string> fields_highlighted_fully_vec;
spp::sparse_hash_set<std::string> fields_highlighted_fully;
StringUtils::split(highlight_full_fields, fields_highlighted_fully_vec, ",");
if(!highlight.snippets.empty()) {
highlights.push_back(highlight);
}
for(std::string & highlight_full_field: fields_highlighted_fully_vec) {
StringUtils::trim(highlight_full_field);
fields_highlighted_fully.emplace(highlight_full_field);
}
}
std::sort(highlights.begin(), highlights.end());
for(const auto & highlight: highlights) {
nlohmann::json h_json = nlohmann::json::object();
h_json["field"] = highlight.field;
bool highlight_fully = (fields_highlighted_fully.find(highlight.field) != fields_highlighted_fully.end());
if(!highlight.indices.empty()) {
h_json["indices"] = highlight.indices;
h_json["snippets"] = highlight.snippets;
if(highlight_fully) {
h_json["values"] = highlight.values;
for(const std::string & field_name: search_fields) {
// should not pick excluded field for highlighting
if(exclude_fields.count(field_name) > 0) {
continue;
}
} else {
h_json["snippet"] = highlight.snippets[0];
if(highlight_fully) {
h_json["value"] = highlight.values[0];
field search_field = search_schema.at(field_name);
if(query != "*" && (search_field.type == field_types::STRING ||
search_field.type == field_types::STRING_ARRAY)) {
bool highlighted_fully = (fields_highlighted_fully.find(field_name) != fields_highlighted_fully.end());
highlight_t highlight;
highlight_result(search_field, searched_queries, field_order_kv, document,
string_utils, snippet_threshold, highlighted_fully, highlight);
if(!highlight.snippets.empty()) {
highlights.push_back(highlight);
}
}
}
wrapper_doc["highlights"].push_back(h_json);
std::sort(highlights.begin(), highlights.end());
for(const auto & highlight: highlights) {
nlohmann::json h_json = nlohmann::json::object();
h_json["field"] = highlight.field;
bool highlight_fully = (fields_highlighted_fully.find(highlight.field) != fields_highlighted_fully.end());
if(!highlight.indices.empty()) {
h_json["indices"] = highlight.indices;
h_json["snippets"] = highlight.snippets;
if(highlight_fully) {
h_json["values"] = highlight.values;
}
} else {
h_json["snippet"] = highlight.snippets[0];
if(highlight_fully) {
h_json["value"] = highlight.values[0];
}
}
wrapper_doc["highlights"].push_back(h_json);
}
//wrapper_doc["seq_id"] = (uint32_t) field_order_kv->key;
prune_document(document, include_fields, exclude_fields);
wrapper_doc["document"] = document;
wrapper_doc["text_match"] = field_order_kv->match_score;
if(field_order_kv->match_score == 0) {
wrapper_doc["curated"] = true;
}
hits_array.push_back(wrapper_doc);
}
prune_document(document, include_fields, exclude_fields);
wrapper_doc["document"] = document;
wrapper_doc["text_match"] = field_order_kv.match_score;
//wrapper_doc["seq_id"] = (uint32_t) field_order_kv.key;
if(group_limit) {
const auto& document = group_hits["hits"][0]["document"];
result["hits"].push_back(wrapper_doc);
group_hits["group_key"] = nlohmann::json::array();
for(const auto& field_name: group_by_fields) {
if(document.count(field_name) != 0) {
group_hits["group_key"].push_back(document[field_name]);
}
}
result["grouped_hits"].push_back(group_hits);
}
}
result["facet_counts"] = nlohmann::json::array();
@ -941,6 +1075,11 @@ Option<nlohmann::json> Collection::search(const std::string & query, const std::
result["facet_counts"].push_back(facet_result);
}
// free search params
for(Index* index: indices) {
delete index->search_params;
}
result["request_params"] = nlohmann::json::object();;
result["request_params"]["per_page"] = per_page;
result["request_params"]["q"] = query;
@ -951,6 +1090,43 @@ Option<nlohmann::json> Collection::search(const std::string & query, const std::
return result;
}
void Collection::populate_result_kvs(Topster *topster, std::vector<std::vector<KV *>> &result_kvs) const {
if(topster->distinct) {
for(auto &group_topster_entry: topster->group_kv_map) {
Topster* group_topster = group_topster_entry.second;
const std::vector<KV*> group_kvs(group_topster->kvs, group_topster->kvs+group_topster->size);
result_kvs.emplace_back(group_kvs);
}
} else {
for(uint32_t t = 0; t < topster->size; t++) {
KV* kv = topster->getKV(t);
result_kvs.push_back({kv});
}
}
}
void Collection::aggregate_topster(size_t query_index, Topster &topster, Topster *index_topster) const {
if(index_topster->distinct) {
for(auto &group_topster_entry: index_topster->group_kv_map) {
Topster* group_topster = group_topster_entry.second;
const std::vector<KV*> group_kvs(group_topster->kvs, group_topster->kvs+group_topster->size);
for(KV* kv: group_kvs) {
kv->query_index += query_index;
topster.add(kv);
}
}
} else {
for(uint32_t t = 0; t < index_topster->size; t++) {
KV* kv = index_topster->getKV(t);
kv->query_index += query_index;
topster.add(kv);
}
}
}
void Collection::facet_value_to_string(const facet &a_facet, const facet_count_t &facet_count,
const nlohmann::json &document, std::string &value) {
@ -989,7 +1165,7 @@ void Collection::facet_value_to_string(const facet &a_facet, const facet_count_t
void Collection::highlight_result(const field &search_field,
const std::vector<std::vector<art_leaf *>> &searched_queries,
const KV & field_order_kv, const nlohmann::json & document,
const KV* field_order_kv, const nlohmann::json & document,
StringUtils & string_utils, size_t snippet_threshold,
bool highlighted_fully,
highlight_t & highlight) {
@ -997,15 +1173,15 @@ void Collection::highlight_result(const field &search_field,
spp::sparse_hash_map<const art_leaf*, uint32_t*> leaf_to_indices;
std::vector<art_leaf *> query_suggestion;
for (const art_leaf *token_leaf : searched_queries[field_order_kv.query_index]) {
for (const art_leaf *token_leaf : searched_queries[field_order_kv->query_index]) {
// Must search for the token string afresh on this field for the given document, since `token_leaf`
// comes from the best-matched field and need not be present in other fields of the document.
Index* index = indices[field_order_kv.key % num_indices];
Index* index = indices[field_order_kv->key % num_indices];
art_leaf *actual_leaf = index->get_token_leaf(search_field.name, &token_leaf->key[0], token_leaf->key_len);
if(actual_leaf != nullptr) {
query_suggestion.push_back(actual_leaf);
std::vector<uint16_t> positions;
uint32_t doc_index = actual_leaf->values->ids.indexOf(field_order_kv.key);
uint32_t doc_index = actual_leaf->values->ids.indexOf(field_order_kv->key);
auto doc_indices = new uint32_t[1];
doc_indices[0] = doc_index;
leaf_to_indices.emplace(actual_leaf, doc_indices);
@ -1031,8 +1207,8 @@ void Collection::highlight_result(const field &search_field,
continue;
}
const Match & this_match = Match::match(field_order_kv.key, token_positions);
uint64_t this_match_score = this_match.get_match_score(1, field_order_kv.field_id);
const Match & this_match = Match::match(field_order_kv->key, token_positions);
uint64_t this_match_score = this_match.get_match_score(1, field_order_kv->field_id);
match_indices.emplace_back(this_match, this_match_score, array_index);
}
@ -1231,7 +1407,7 @@ Option<uint32_t> Collection::add_override(const override_t & override) {
return Option<uint32_t>(500, "Error while storing the override on disk.");
}
overrides[override.id] = override;
overrides.emplace(override.id, override);
return Option<uint32_t>(200);
}

View File

@ -212,6 +212,9 @@ bool get_search(http_req & req, http_res & res) {
const char *FACET_QUERY = "facet_query";
const char *MAX_FACET_VALUES = "max_facet_values";
const char *GROUP_BY = "group_by";
const char *GROUP_LIMIT = "group_limit";
const char *PER_PAGE = "per_page";
const char *PAGE = "page";
const char *CALLBACK = "callback";
@ -249,11 +252,6 @@ bool get_search(http_req & req, http_res & res) {
return false;
}
if(req.params.count(QUERY_BY) == 0) {
res.set_400(std::string("Parameter `") + QUERY_BY + "` is required.");
return false;
}
if(req.params.count(MAX_FACET_VALUES) == 0) {
req.params[MAX_FACET_VALUES] = "10";
}
@ -291,41 +289,58 @@ bool get_search(http_req & req, http_res & res) {
req.params[EXCLUDE_FIELDS] = "";
}
if(!StringUtils::is_uint64_t(req.params[DROP_TOKENS_THRESHOLD])) {
if(req.params.count(GROUP_BY) == 0) {
req.params[GROUP_BY] = "";
}
if(req.params.count(GROUP_LIMIT) == 0) {
if(req.params[GROUP_BY] != "") {
req.params[GROUP_LIMIT] = "3";
} else {
req.params[GROUP_LIMIT] = "0";
}
}
if(!StringUtils::is_uint32_t(req.params[DROP_TOKENS_THRESHOLD])) {
res.set_400("Parameter `" + std::string(DROP_TOKENS_THRESHOLD) + "` must be an unsigned integer.");
return false;
}
if(!StringUtils::is_uint64_t(req.params[TYPO_TOKENS_THRESHOLD])) {
if(!StringUtils::is_uint32_t(req.params[TYPO_TOKENS_THRESHOLD])) {
res.set_400("Parameter `" + std::string(TYPO_TOKENS_THRESHOLD) + "` must be an unsigned integer.");
return false;
}
if(!StringUtils::is_uint64_t(req.params[NUM_TYPOS])) {
if(!StringUtils::is_uint32_t(req.params[NUM_TYPOS])) {
res.set_400("Parameter `" + std::string(NUM_TYPOS) + "` must be an unsigned integer.");
return false;
}
if(!StringUtils::is_uint64_t(req.params[PER_PAGE])) {
if(!StringUtils::is_uint32_t(req.params[PER_PAGE])) {
res.set_400("Parameter `" + std::string(PER_PAGE) + "` must be an unsigned integer.");
return false;
}
if(!StringUtils::is_uint64_t(req.params[PAGE])) {
if(!StringUtils::is_uint32_t(req.params[PAGE])) {
res.set_400("Parameter `" + std::string(PAGE) + "` must be an unsigned integer.");
return false;
}
if(!StringUtils::is_uint64_t(req.params[MAX_FACET_VALUES])) {
if(!StringUtils::is_uint32_t(req.params[MAX_FACET_VALUES])) {
res.set_400("Parameter `" + std::string(MAX_FACET_VALUES) + "` must be an unsigned integer.");
return false;
}
if(!StringUtils::is_uint64_t(req.params[SNIPPET_THRESHOLD])) {
if(!StringUtils::is_uint32_t(req.params[SNIPPET_THRESHOLD])) {
res.set_400("Parameter `" + std::string(SNIPPET_THRESHOLD) + "` must be an unsigned integer.");
return false;
}
if(!StringUtils::is_uint32_t(req.params[GROUP_LIMIT])) {
res.set_400("Parameter `" + std::string(GROUP_LIMIT) + "` must be an unsigned integer.");
return false;
}
std::string filter_str = req.params.count(FILTER) != 0 ? req.params[FILTER] : "";
std::vector<std::string> search_fields;
@ -343,6 +358,9 @@ bool get_search(http_req & req, http_res & res) {
spp::sparse_hash_set<std::string> include_fields(include_fields_vec.begin(), include_fields_vec.end());
spp::sparse_hash_set<std::string> exclude_fields(exclude_fields_vec.begin(), exclude_fields_vec.end());
std::vector<std::string> group_by_fields;
StringUtils::split(req.params[GROUP_BY], group_by_fields, ",");
std::vector<sort_by> sort_fields;
if(req.params.count(SORT_BY) != 0) {
std::vector<std::string> sort_field_strs;
@ -367,7 +385,8 @@ bool get_search(http_req & req, http_res & res) {
}
}
std::map<std::string, size_t> pinned_hits;
std::map<size_t, std::vector<std::string>> pinned_hits;
if(req.params.count(PINNED_HITS) != 0) {
std::vector<std::string> pinned_hits_strs;
StringUtils::split(req.params[PINNED_HITS], pinned_hits_strs, ",");
@ -392,7 +411,7 @@ bool get_search(http_req & req, http_res & res) {
return false;
}
pinned_hits.emplace(expression_parts[0], position);
pinned_hits[position].emplace_back(expression_parts[0]);
}
}
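`pinned_hits` now keys by position so that several IDs can share a slot (needed for grouped pinning). A hypothetical helper sketching how an `id:position` list could map into this structure — the helper name and separator characters are assumptions for illustration, not the server's actual parser:

```
#include <cstddef>
#include <map>
#include <sstream>
#include <string>
#include <vector>

// Hypothetical: parses "13:1,4:2" (id:position pairs) into position -> [ids].
std::map<size_t, std::vector<std::string>> parse_pinned(const std::string& raw) {
    std::map<size_t, std::vector<std::string>> pinned;
    std::stringstream ss(raw);
    std::string pair;
    while (std::getline(ss, pair, ',')) {
        const size_t colon = pair.find(':');
        if (colon == std::string::npos) {
            continue; // skip malformed entries in this sketch
        }
        const size_t position = std::stoul(pair.substr(colon + 1));
        pinned[position].emplace_back(pair.substr(0, colon));
    }
    return pinned;
}
```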
@ -422,17 +441,19 @@ bool get_search(http_req & req, http_res & res) {
Option<nlohmann::json> result_op = collection->search(req.params[QUERY], search_fields, filter_str, facet_fields,
sort_fields, std::stoi(req.params[NUM_TYPOS]),
static_cast<size_t>(std::stoi(req.params[PER_PAGE])),
static_cast<size_t>(std::stoi(req.params[PAGE])),
static_cast<size_t>(std::stol(req.params[PER_PAGE])),
static_cast<size_t>(std::stol(req.params[PAGE])),
token_order, prefix, drop_tokens_threshold,
include_fields, exclude_fields,
static_cast<size_t>(std::stoi(req.params[MAX_FACET_VALUES])),
static_cast<size_t>(std::stol(req.params[MAX_FACET_VALUES])),
req.params[FACET_QUERY],
static_cast<size_t>(std::stoi(req.params[SNIPPET_THRESHOLD])),
static_cast<size_t>(std::stol(req.params[SNIPPET_THRESHOLD])),
req.params[HIGHLIGHT_FULL_FIELDS],
typo_tokens_threshold,
pinned_hits,
hidden_hits
hidden_hits,
group_by_fields,
static_cast<size_t>(std::stol(req.params[GROUP_LIMIT]))
);
uint64_t timeMillis = std::chrono::duration_cast<std::chrono::milliseconds>(
@ -873,7 +894,7 @@ bool post_create_key(http_req &req, http_res &res) {
return false;
}
const std::string &rand_key = StringUtils::randstring(AuthManager::KEY_LEN, req.seed);
const std::string &rand_key = req.metadata;
api_key_t api_key(
rand_key,

View File

@ -6,6 +6,7 @@
#include <signal.h>
#include <h2o.h>
#include <iostream>
#include <auth_manager.h>
#include "raft_server.h"
#include "logger.h"
@ -186,20 +187,6 @@ std::string HttpServer::get_version() {
void HttpServer::clear_timeouts(const std::vector<h2o_timer_t*> & timers, bool trigger_callback) {
for(h2o_timer_t* timer: timers) {
h2o_timer_unlink(timer);
/*while (!h2o_linklist_is_empty(&timer->_link)) {
h2o_timer_t *entry = H2O_STRUCT_FROM_MEMBER(h2o_timer_t, _link, timer->_link.next);
if(entry == nullptr) {
continue;
}
if(trigger_callback) {
entry->cb(entry);
}
//entry->expire_at = 0;
h2o_linklist_unlink(&entry->_link);
h2o_timer_unlink(timer);
}*/
}
}
@ -385,6 +372,12 @@ int HttpServer::catch_all_handler(h2o_handler_t *_self, h2o_req_t *req) {
}
// routes match and is an authenticated request
// do any additional pre-request middleware operations here
if(rpath->action == "keys:create") {
// we enrich the incoming request with a random API key here so that the leader and replicas will use the same key
request->metadata = StringUtils::randstring(AuthManager::KEY_LEN);
}
// for writes, we defer to replication_state
if(http_method != "GET") {
self->http_server->get_replication_state()->write(request, response);
@ -476,26 +469,13 @@ void HttpServer::on(const std::string & message, bool (*handler)(void*)) {
HttpServer::~HttpServer() {
delete message_dispatcher;
// remove all timeouts defined in: https://github.com/h2o/h2o/blob/v2.2.2/lib/core/context.c#L142
/*std::vector<h2o_timeout_t*> timeouts = {
&ctx.zero_timeout,
&ctx.one_sec_timeout,
&ctx.hundred_ms_timeout,
&ctx.handshake_timeout,
&ctx.http1.req_timeout,
&ctx.http2.idle_timeout,
&ctx.http2.graceful_shutdown_timeout,
&ctx.proxy.io_timeout
};
clear_timeouts(timeouts);
*/
if(ssl_refresh_timer.timer.expire_at != 0) {
// avoid callback since it recreates timeout
clear_timeouts({&ssl_refresh_timer.timer}, false);
}
h2o_timerwheel_run(ctx.loop->_timeouts, 9999999999999);
h2o_context_dispose(&ctx);
free(ctx.globalconf->server_name.base);

View File

@ -633,7 +633,7 @@ void Index::compute_facet_stats(facet &a_facet, int64_t raw_value, const std::st
void Index::do_facets(std::vector<facet> & facets, facet_query_t & facet_query,
const uint32_t* result_ids, size_t results_size) {
std::map<std::string, size_t> facet_to_index;
std::unordered_map<std::string, size_t> facet_to_index;
size_t i_facet = 0;
for(const auto & facet: facet_schema) {
@ -661,6 +661,7 @@ void Index::do_facets(std::vector<facet> & facets, facet_query_t & facet_query,
std::vector<std::string> query_tokens;
StringUtils::split(facet_query.query, query_tokens, " ");
// for non-string fields, `faceted_name` returns their aliased stringified field name
art_tree *t = search_index.at(facet_field.faceted_name());
for(size_t qtoken_index = 0; qtoken_index < query_tokens.size(); qtoken_index++) {
@ -703,7 +704,7 @@ void Index::do_facets(std::vector<facet> & facets, facet_query_t & facet_query,
const std::vector<uint64_t> & fhashes = facet_index_v2[doc_seq_id][facet_id];
int array_pos = 0;
int fvalue_found = 0;
bool fvalue_found = false;
std::stringstream fvaluestream; // for hashing the entire facet value (multiple tokens)
spp::sparse_hash_map<uint32_t, token_pos_cost_t> query_token_positions;
size_t field_token_index = -1;
@ -717,9 +718,9 @@ void Index::do_facets(std::vector<facet> & facets, facet_query_t & facet_query,
// ftoken_hash is the raw value for numeric fields
compute_facet_stats(a_facet, ftoken_hash, facet_field.type);
// not using facet query or this particular facet value is found in facet filter
if(!use_facet_query || fhash_qtoken_pos.find(ftoken_hash) != fhash_qtoken_pos.end()) {
// not using facet query or this particular facet value is found in facet filter
fvalue_found |= 1; // bitwise to ensure only one count for a multi-token facet value
fvalue_found = true;
if(use_facet_query) {
// map token index to query index (used for highlighting later on)
@ -736,41 +737,35 @@ void Index::do_facets(std::vector<facet> & facets, facet_query_t & facet_query,
}
}
//std::cout << "j: " << j << std::endl;
// 0 indicates a separator, while the second condition checks for a non-array string
if(fhashes[j] == FACET_ARRAY_DELIMETER || (fhashes.back() != FACET_ARRAY_DELIMETER && j == fhashes.size() - 1)) {
if(!use_facet_query || fvalue_found != 0) {
if(!use_facet_query || fvalue_found) {
const std::string & fvalue_str = fvaluestream.str();
uint64_t fhash = 0;
if(facet_field.is_string()) {
fhash = facet_token_hash(facet_field, fvalue_str);
} else {
fhash = std::atoi(fvalue_str.c_str());
}
uint64_t fhash = facet_token_hash(facet_field, fvalue_str);
if(a_facet.result_map.count(fhash) == 0) {
a_facet.result_map[fhash] = facet_count_t{0, doc_seq_id, 0,
a_facet.result_map[fhash] = facet_count_t{0, spp::sparse_hash_set<uint64_t>(),
doc_seq_id, 0,
spp::sparse_hash_map<uint32_t, token_pos_cost_t>()};
}
a_facet.result_map[fhash].count += 1;
a_facet.result_map[fhash].doc_id = doc_seq_id;
a_facet.result_map[fhash].array_pos = array_pos;
if(search_params->group_limit) {
uint64_t distinct_id = get_distinct_id(facet_to_index, doc_seq_id);
a_facet.result_map[fhash].groups.emplace(distinct_id);
} else {
a_facet.result_map[fhash].count += 1;
}
if(use_facet_query) {
a_facet.result_map[fhash].query_token_pos = query_token_positions;
/*if(j == 11) {
for(auto xx: query_token_positions) {
std::cout << xx.first << " -> " << xx.second.pos << " , " << xx.second.cost << std::endl;
}
}*/
}
}
array_pos++;
fvalue_found = 0;
fvalue_found = false;
std::stringstream().swap(fvaluestream);
spp::sparse_hash_map<uint32_t, token_pos_cost_t>().swap(query_token_positions);
field_token_index = -1;
@ -781,56 +776,12 @@ void Index::do_facets(std::vector<facet> & facets, facet_query_t & facet_query,
}
}
void Index::drop_facets(std::vector<facet> & facets, const std::vector<uint32_t> & ids) {
std::map<std::string, size_t> facet_to_index;
size_t i_facet = 0;
for(const auto & facet: facet_schema) {
facet_to_index[facet.first] = i_facet;
i_facet++;
}
for(auto & a_facet: facets) {
const field & facet_field = facet_schema.at(a_facet.field_name);
// assumed that facet fields have already been validated upstream
for(const uint32_t doc_seq_id: ids) {
if(facet_index_v2.count(doc_seq_id) != 0) {
// FORMAT OF VALUES
// String: h1 h2 h3
// String array: h1 h2 h3 0 h1 0 h1 h2 0
const std::vector<uint64_t> & fhashes = facet_index_v2[doc_seq_id][facet_to_index[a_facet.field_name]];
std::stringstream fvaluestream; // for hashing the entire facet value (multiple tokens)
for(size_t j = 0; j < fhashes.size(); j++) {
if(fhashes[j] != FACET_ARRAY_DELIMETER) {
int64_t ftoken_hash = fhashes[j];
fvaluestream << ftoken_hash;
}
if(fhashes[j] == FACET_ARRAY_DELIMETER || (fhashes.back() != FACET_ARRAY_DELIMETER &&
j == fhashes.size() - 1)) {
const std::string & fvalue_str = fvaluestream.str();
uint64_t fhash = facet_token_hash(facet_field, fvalue_str);
if(a_facet.result_map.count(fhash) != 0) {
a_facet.result_map[fhash].count -= 1;
if(a_facet.result_map[fhash].count == 0) {
a_facet.result_map.erase(fhash);
}
}
std::stringstream().swap(fvaluestream);
}
}
}
}
}
}
void Index::search_candidates(const uint8_t & field_id, uint32_t* filter_ids, size_t filter_ids_length,
const std::vector<uint32_t>& curated_ids,
const std::vector<sort_by> & sort_fields,
std::vector<token_candidates> & token_candidates_vec, const token_ordering token_order,
std::vector<std::vector<art_leaf*>> & searched_queries, Topster & topster,
std::vector<token_candidates> & token_candidates_vec,
std::vector<std::vector<art_leaf*>> & searched_queries, Topster* topster,
spp::sparse_hash_set<uint64_t>& groups_processed,
uint32_t** all_result_ids, size_t & all_result_ids_len,
const size_t typo_tokens_threshold) {
const long long combination_limit = 10;
@ -874,6 +825,15 @@ void Index::search_candidates(const uint8_t & field_id, uint32_t* filter_ids, si
continue;
}
if(!curated_ids.empty()) {
uint32_t *excluded_result_ids = nullptr;
result_size = ArrayUtils::exclude_scalar(result_ids, result_size, &curated_ids[0],
curated_ids.size(), &excluded_result_ids);
delete [] result_ids;
result_ids = excluded_result_ids;
}
if(filter_ids != nullptr) {
// intersect once again with filter ids
uint32_t* filtered_result_ids = nullptr;
@ -888,7 +848,7 @@ void Index::search_candidates(const uint8_t & field_id, uint32_t* filter_ids, si
// go through each matching document id and calculate match score
score_results(sort_fields, (uint16_t) searched_queries.size(), field_id, total_cost, topster, query_suggestion,
filtered_result_ids, filtered_results_size);
groups_processed, filtered_result_ids, filtered_results_size);
delete[] filtered_result_ids;
delete[] result_ids;
@ -900,7 +860,7 @@ void Index::search_candidates(const uint8_t & field_id, uint32_t* filter_ids, si
*all_result_ids = new_all_result_ids;
score_results(sort_fields, (uint16_t) searched_queries.size(), field_id, total_cost, topster, query_suggestion,
result_ids, result_size);
groups_processed, result_ids, result_size);
delete[] result_ids;
}
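`ArrayUtils::exclude_scalar` (used above to subtract curated IDs from the candidate list) is not defined in this diff; a minimal sketch of the contract it appears to follow — the name, allocation behaviour, and sortedness assumption here are inferred, not taken from the source:

```
#include <algorithm>
#include <cstddef>
#include <cstdint>

// Illustrative stand-in: writes src minus excluded into a newly allocated
// *out and returns its length. Both inputs are assumed sorted ascending.
size_t exclude_scalar_sketch(const uint32_t* src, size_t src_len,
                             const uint32_t* excluded, size_t excluded_len,
                             uint32_t** out) {
    *out = new uint32_t[src_len];
    uint32_t* end = std::set_difference(src, src + src_len,
                                        excluded, excluded + excluded_len, *out);
    return static_cast<size_t>(end - *out);
}
```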
@ -1056,13 +1016,17 @@ void Index::run_search() {
}
// after the wait, we own the lock.
search(search_params.outcome, search_params.query, search_params.search_fields,
search_params.filters, search_params.facets, search_params.facet_query, search_params.included_ids,
search_params.excluded_ids, search_params.sort_fields_std, search_params.num_typos,
search_params.max_hits, search_params.per_page, search_params.page, search_params.token_order,
search_params.prefix, search_params.drop_tokens_threshold, search_params.raw_result_kvs,
search_params.all_result_ids_len, search_params.searched_queries, search_params.override_result_kvs,
search_params.typo_tokens_threshold);
search(search_params->outcome, search_params->query, search_params->search_fields,
search_params->filters, search_params->facets, search_params->facet_query,
search_params->included_ids, search_params->excluded_ids,
search_params->sort_fields_std, search_params->num_typos,
search_params->topster, search_params->curated_topster,
search_params->per_page, search_params->page, search_params->token_order,
search_params->prefix, search_params->drop_tokens_threshold,
search_params->all_result_ids_len, search_params->groups_processed,
search_params->searched_queries,
search_params->raw_result_kvs, search_params->override_result_kvs,
search_params->typo_tokens_threshold);
// hand control back to main thread
processed = true;
@ -1074,12 +1038,12 @@ void Index::run_search() {
}
}
void Index::collate_curated_ids(const std::string & query, const std::string & field, const uint8_t field_id,
const std::vector<uint32_t> & included_ids,
Topster & curated_topster,
std::vector<std::vector<art_leaf*>> & searched_queries) {
void Index::collate_included_ids(const std::string & query, const std::string & field, const uint8_t field_id,
const std::map<size_t, std::map<size_t, uint32_t>> & included_ids_map,
Topster* curated_topster,
std::vector<std::vector<art_leaf*>> & searched_queries) {
if(included_ids.size() == 0) {
if(included_ids_map.empty()) {
return;
}
@ -1098,48 +1062,34 @@ void Index::collate_curated_ids(const std::string & query, const std::string & f
art_fuzzy_search(search_index.at(field), (const unsigned char *) token.c_str(), token_len,
0, 0, 1, token_ordering::MAX_SCORE, false, leaves);
if(leaves.size() > 0) {
if(!leaves.empty()) {
override_query.push_back(leaves[0]);
}
}
spp::sparse_hash_map<const art_leaf*, uint32_t*> leaf_to_indices;
for(const auto& pos_ids: included_ids_map) {
const size_t outer_pos = pos_ids.first;
for (art_leaf *token_leaf : override_query) {
uint32_t *indices = new uint32_t[included_ids.size()];
token_leaf->values->ids.indexOf(&included_ids[0], included_ids.size(), indices);
leaf_to_indices.emplace(token_leaf, indices);
}
for(const auto& index_seq_id: pos_ids.second) {
uint32_t inner_pos = index_seq_id.first;
uint32_t seq_id = index_seq_id.second;
for(size_t j=0; j<included_ids.size(); j++) {
const uint32_t seq_id = included_ids[j];
uint64_t distinct_id = outer_pos; // outer pos is the group distinct key
uint64_t match_score = (64000 - inner_pos); // inner pos within a group is the match score
std::vector<std::vector<std::vector<uint16_t>>> array_token_positions;
populate_token_positions(override_query, leaf_to_indices, j, array_token_positions);
// LOG(INFO) << "seq_id: " << seq_id << " - " << match_score;
uint64_t match_score = 0;
int64_t scores[3];
scores[0] = match_score;
scores[1] = int64_t(1);
scores[2] = int64_t(1);
for(const std::vector<std::vector<uint16_t>> & token_positions: array_token_positions) {
if(token_positions.empty()) {
continue;
}
const Match & match = Match::match(seq_id, token_positions);
uint64_t this_match_score = match.get_match_score(0, field_id);
if(this_match_score > match_score) {
match_score = this_match_score;
}
KV kv(field_id, searched_queries.size(), seq_id, distinct_id, match_score, scores);
curated_topster->add(&kv);
}
int64_t scores[3];
scores[0] = int64_t(match_score);
scores[1] = int64_t(1);
scores[2] = int64_t(1);
curated_topster.add(seq_id, field_id, searched_queries.size(), match_score, scores);
searched_queries.push_back(override_query);
}
searched_queries.push_back(override_query);
}
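To make the bookkeeping concrete, here is a hand-built `included_ids_map` with invented seq_ids, printing the group key and the `64000 - inner_pos` score each pinned document would receive under the logic above:

```
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <map>

int main() {
    // outer position -> (inner position within the group -> seq_id);
    // the seq_id values are invented for illustration.
    std::map<size_t, std::map<size_t, uint32_t>> included_ids_map = {
        {0, {{0, 42}, {1, 97}}}, // two documents pinned into the first slot
        {1, {{0, 7}}}
    };

    for (const auto& pos_ids : included_ids_map) {
        const uint64_t distinct_id = pos_ids.first;  // group key = outer pos
        for (const auto& index_seq_id : pos_ids.second) {
            const uint64_t match_score = 64000 - index_seq_id.first;
            std::cout << "seq_id=" << index_seq_id.second
                      << " group=" << distinct_id
                      << " score=" << match_score << "\n";
        }
    }
}
```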
void Index::search(Option<uint32_t> & outcome,
@ -1147,19 +1097,20 @@ void Index::search(Option<uint32_t> & outcome,
const std::vector<std::string> & search_fields,
const std::vector<filter> & filters,
std::vector<facet> & facets, facet_query_t & facet_query,
const std::vector<uint32_t> & included_ids,
const std::map<size_t, std::map<size_t, uint32_t>> & included_ids_map,
const std::vector<uint32_t> & excluded_ids,
const std::vector<sort_by> & sort_fields_std, const int num_typos, const size_t max_hits,
const std::vector<sort_by> & sort_fields_std, const int num_typos,
Topster* topster,
Topster* curated_topster,
const size_t per_page, const size_t page, const token_ordering token_order,
const bool prefix, const size_t drop_tokens_threshold,
std::vector<KV> & raw_result_kvs,
size_t & all_result_ids_len,
std::vector<std::vector<art_leaf*>> & searched_queries,
std::vector<KV> & override_result_kvs,
spp::sparse_hash_set<uint64_t>& groups_processed,
std::vector<std::vector<art_leaf*>>& searched_queries,
std::vector<std::vector<KV*>> & raw_result_kvs,
std::vector<std::vector<KV*>> & override_result_kvs,
const size_t typo_tokens_threshold) {
const size_t num_results = (page * per_page);
// process the filters
uint32_t* filter_ids = nullptr;
@ -1169,25 +1120,48 @@ void Index::search(Option<uint32_t> & outcome,
return ;
}
const uint32_t filter_ids_length = op_filter_ids_length.get();
uint32_t filter_ids_length = op_filter_ids_length.get();
// we will be removing all curated IDs from the organic result IDs before running topster
std::set<uint32_t> curated_ids;
std::vector<uint32_t> included_ids;
for(const auto& outer_pos_ids: included_ids_map) {
for(const auto& inner_pos_seq_id: outer_pos_ids.second) {
curated_ids.insert(inner_pos_seq_id.second);
included_ids.push_back(inner_pos_seq_id.second);
}
}
curated_ids.insert(excluded_ids.begin(), excluded_ids.end());
std::vector<uint32_t> curated_ids_sorted(curated_ids.begin(), curated_ids.end());
std::sort(curated_ids_sorted.begin(), curated_ids_sorted.end());
// Order of `fields` is used to sort results
//auto begin = std::chrono::high_resolution_clock::now();
uint32_t* all_result_ids = nullptr;
const size_t topster_size = std::max((size_t)1, max_hits); // needs to be at least 1 since scoring is mandatory
Topster topster(topster_size);
Topster curated_topster(topster_size);
if(query == "*") {
const uint8_t field_id = (uint8_t)(FIELD_LIMIT_NUM - 0);
const std::string & field = search_fields[0];
if(!curated_ids.empty()) {
uint32_t *excluded_result_ids = nullptr;
filter_ids_length = ArrayUtils::exclude_scalar(filter_ids, filter_ids_length, &curated_ids_sorted[0],
curated_ids.size(), &excluded_result_ids);
delete [] filter_ids;
filter_ids = excluded_result_ids;
}
score_results(sort_fields_std, (uint16_t) searched_queries.size(), field_id, 0, topster, {},
filter_ids, filter_ids_length);
collate_curated_ids(query, field, field_id, included_ids, curated_topster, searched_queries);
do_facets(facets, facet_query, filter_ids, filter_ids_length);
groups_processed, filter_ids, filter_ids_length);
collate_included_ids(query, field, field_id, included_ids_map, curated_topster, searched_queries);
all_result_ids_len = filter_ids_length;
all_result_ids = filter_ids;
filter_ids = nullptr;
} else {
const size_t num_search_fields = std::min(search_fields.size(), (size_t) FIELD_LIMIT_NUM);
for(size_t i = 0; i < num_search_fields; i++) {
@ -1196,47 +1170,22 @@ void Index::search(Option<uint32_t> & outcome,
const uint8_t field_id = (uint8_t)(FIELD_LIMIT_NUM - i); // Order of `fields` is used to sort results
const std::string & field = search_fields[i];
search_field(field_id, query, field, filter_ids, filter_ids_length, facets, sort_fields_std,
num_typos, searched_queries, topster, &all_result_ids, all_result_ids_len,
search_field(field_id, query, field, filter_ids, filter_ids_length, curated_ids_sorted, facets, sort_fields_std,
num_typos, searched_queries, topster, groups_processed, &all_result_ids, all_result_ids_len,
token_order, prefix, drop_tokens_threshold, typo_tokens_threshold);
collate_curated_ids(query, field, field_id, included_ids, curated_topster, searched_queries);
collate_included_ids(query, field, field_id, included_ids_map, curated_topster, searched_queries);
}
}
do_facets(facets, facet_query, all_result_ids, all_result_ids_len);
}
do_facets(facets, facet_query, all_result_ids, all_result_ids_len);
do_facets(facets, facet_query, &included_ids[0], included_ids.size());
// must be sorted before iterated upon to remove "empty" array entries
topster.sort();
curated_topster.sort();
topster->sort();
curated_topster->sort();
std::set<uint32_t> ids_to_remove(included_ids.begin(), included_ids.end());
ids_to_remove.insert(excluded_ids.begin(), excluded_ids.end());
std::vector<uint32_t> dropped_ids;
// loop through topster and remove elements from included and excluded id lists
for(uint32_t t = 0; t < topster.size && t < num_results; t++) {
KV* kv = topster.getKV(t);
if(ids_to_remove.count(kv->key) != 0) {
dropped_ids.push_back((uint32_t)kv->key);
} else {
raw_result_kvs.push_back(*kv);
}
}
for(uint32_t t = 0; t < curated_topster.size && t < num_results; t++) {
KV* kv = curated_topster.getKV(t);
override_result_kvs.push_back(*kv);
}
// for the ids that are dropped, remove their corresponding facet components from facet results
drop_facets(facets, dropped_ids);
all_result_ids_len -= dropped_ids.size();
all_result_ids_len += curated_topster.size;
all_result_ids_len += curated_topster->size;
delete [] filter_ids;
delete [] all_result_ids;
@ -1258,9 +1207,11 @@ void Index::search(Option<uint32_t> & outcome,
*/
void Index::search_field(const uint8_t & field_id, const std::string & query, const std::string & field,
uint32_t *filter_ids, size_t filter_ids_length,
const std::vector<uint32_t>& curated_ids,
std::vector<facet> & facets, const std::vector<sort_by> & sort_fields, const int num_typos,
std::vector<std::vector<art_leaf*>> & searched_queries,
Topster & topster, uint32_t** all_result_ids, size_t & all_result_ids_len,
Topster* topster, spp::sparse_hash_set<uint64_t>& groups_processed,
uint32_t** all_result_ids, size_t & all_result_ids_len,
const token_ordering token_order, const bool prefix,
const size_t drop_tokens_threshold, const size_t typo_tokens_threshold) {
std::vector<std::string> tokens;
@ -1373,8 +1324,8 @@ void Index::search_field(const uint8_t & field_id, const std::string & query, co
if(!token_candidates_vec.empty() && token_candidates_vec.size() == tokens.size()) {
// If all tokens were found, go ahead and search for candidates with what we have so far
search_candidates(field_id, filter_ids, filter_ids_length, sort_fields, token_candidates_vec,
token_order, searched_queries, topster, all_result_ids, all_result_ids_len,
search_candidates(field_id, filter_ids, filter_ids_length, curated_ids, sort_fields, token_candidates_vec,
searched_queries, topster, groups_processed, all_result_ids, all_result_ids_len,
typo_tokens_threshold);
}
@ -1407,8 +1358,9 @@ void Index::search_field(const uint8_t & field_id, const std::string & query, co
truncated_query += " " + token_count_pairs.at(i).first;
}
return search_field(field_id, truncated_query, field, filter_ids, filter_ids_length, facets, sort_fields, num_typos,
searched_queries, topster, all_result_ids, all_result_ids_len,
return search_field(field_id, truncated_query, field, filter_ids, filter_ids_length, curated_ids,
facets, sort_fields, num_typos,
searched_queries, topster, groups_processed, all_result_ids, all_result_ids_len,
token_order, prefix);
}
}
@ -1434,8 +1386,9 @@ void Index::log_leaves(const int cost, const std::string &token, const std::vect
}
void Index::score_results(const std::vector<sort_by> & sort_fields, const uint16_t & query_index,
const uint8_t & field_id, const uint32_t total_cost, Topster & topster,
const uint8_t & field_id, const uint32_t total_cost, Topster* topster,
const std::vector<art_leaf *> &query_suggestion,
spp::sparse_hash_set<uint64_t>& groups_processed,
const uint32_t *result_ids, const size_t result_size) const {
spp::sparse_hash_map<const art_leaf*, uint32_t*> leaf_to_indices;
@ -1467,6 +1420,16 @@ void Index::score_results(const std::vector<sort_by> & sort_fields, const uint16
Match single_token_match = Match(1, 0, 0, empty_offset_diffs);
const uint64_t single_token_match_score = single_token_match.get_match_score(total_cost, field_id);
std::unordered_map<std::string, size_t> facet_to_id;
if(search_params->group_limit > 0) {
size_t i_facet = 0;
for(const auto & facet: facet_schema) {
facet_to_id[facet.first] = i_facet;
i_facet++;
}
}
for(size_t i=0; i<result_size; i++) {
const uint32_t seq_id = result_ids[i];
@ -1500,7 +1463,7 @@ void Index::score_results(const std::vector<sort_by> & sort_fields, const uint16
}
const int64_t default_score = 0;
int64_t scores[3];
int64_t scores[3] = {0};
// avoiding loop
if(sort_fields.size() > 0) {
@ -1541,7 +1504,15 @@ void Index::score_results(const std::vector<sort_by> & sort_fields, const uint16
}
}
topster.add(seq_id, field_id, query_index, match_score, scores);
uint64_t distinct_id = seq_id;
if(search_params->group_limit != 0) {
distinct_id = get_distinct_id(facet_to_id, seq_id);
groups_processed.emplace(distinct_id);
}
KV kv(field_id, query_index, seq_id, distinct_id, match_score, scores);
topster->add(&kv);
}
//long long int timeNanos = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - begin).count();
@ -1553,6 +1524,26 @@ void Index::score_results(const std::vector<sort_by> & sort_fields, const uint16
}
}
uint64_t Index::get_distinct_id(const std::unordered_map<std::string, size_t> &facet_to_id,
const uint32_t seq_id) const {
uint64_t distinct_id = 1; // some constant initial value
// calculate hash from group_by_fields
for(const auto& field: search_params->group_by_fields) {
if(facet_to_id.count(field) == 0 || facet_index_v2.count(seq_id) == 0) {
continue;
}
size_t facet_id = facet_to_id.at(field);
const std::vector<uint64_t>& fhashes = facet_index_v2.at(seq_id)[facet_id];
for(const auto& hash: fhashes) {
distinct_id = hash_combine(distinct_id, hash);
}
}
return distinct_id;
}
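`hash_combine` itself is not part of this hunk; a boost-style combiner is a reasonable assumption for how each group's facet hashes fold into the distinct ID:

```
#include <cstdint>

// A boost-style combiner; the actual definition in the codebase may differ.
inline uint64_t hash_combine(uint64_t seed, uint64_t value) {
    seed ^= value + 0x9e3779b9 + (seed << 6) + (seed >> 2);
    return seed;
}
```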
void Index::populate_token_positions(const std::vector<art_leaf *> &query_suggestion,
spp::sparse_hash_map<const art_leaf *, uint32_t *> &leaf_to_indices,
size_t result_index,

View File

@ -46,7 +46,7 @@ void benchmark_hn_titles(char* file_path) {
Store *store = new Store("/tmp/typesense-data");
CollectionManager & collectionManager = CollectionManager::get_instance();
collectionManager.init(store, 4, "abcd", "1234");
collectionManager.init(store, 4, "abcd");
collectionManager.load();
Collection *collection = collectionManager.get_collection("hnstories_direct");
@ -116,7 +116,7 @@ void benchmark_reactjs_pages(char* file_path) {
Store *store = new Store("/tmp/typesense-data");
CollectionManager & collectionManager = CollectionManager::get_instance();
collectionManager.init(store, 4, "abcd", "1234");
collectionManager.init(store, 4, "abcd");
collectionManager.load();
Collection *collection = collectionManager.get_collection("reactjs_pages");

View File

@ -21,7 +21,7 @@ int main(int argc, char* argv[]) {
Store *store = new Store(state_dir_path);
CollectionManager & collectionManager = CollectionManager::get_instance();
collectionManager.init(store, 4, "abcd", "123");
collectionManager.init(store, 4, "abcd");
collectionManager.load();
std::vector<field> fields_to_index = {

View File

@ -1,6 +1,6 @@
#include "typesense_server_utils.h"
#include "core_api.h"
#include "config.h"
#include "typesense_server_utils.h"
void master_server_routes() {
// collection management
@ -55,13 +55,7 @@ void replica_server_routes() {
server->get("/health", get_health);
}
namespace logging {
DECLARE_bool(log_year);
}
int main(int argc, char **argv) {
logging::FLAGS_log_year = true;
Config config;
cmdline::parser options;

View File

@ -34,6 +34,7 @@ int ReplicationState::start(const butil::EndPoint & peering_endpoint, const int
node_options.fsm = this;
node_options.node_owns_fsm = false;
node_options.snapshot_interval_s = snapshot_interval_s;
node_options.filter_before_copy_remote = false;
std::string prefix = "local://" + raft_dir;
node_options.log_uri = prefix + "/" + log_dir_name;
node_options.raft_meta_uri = prefix + "/" + meta_dir_name;
@ -297,7 +298,7 @@ int ReplicationState::on_snapshot_load(braft::SnapshotReader* reader) {
return -1;
}
LOG(TRACE) << "rm " << store->get_state_dir_path() << " success";
LOG(INFO) << "rm " << store->get_state_dir_path() << " success";
std::string snapshot_path = reader->get_path();
snapshot_path.append(std::string("/") + db_snapshot_name);
@ -308,7 +309,7 @@ int ReplicationState::on_snapshot_load(braft::SnapshotReader* reader) {
return -1;
}
LOG(TRACE) << "copy snapshot " << snapshot_path << " to " << store->get_state_dir_path() << " success";
LOG(INFO) << "copy snapshot " << snapshot_path << " to " << store->get_state_dir_path() << " success";
return init_db();
}

View File

@ -6,7 +6,7 @@ void sorted_array::load(const uint32_t *sorted_array, const uint32_t array_lengt
max = array_length > 1 ? sorted_array[array_length-1] : min;
uint32_t size_required = (uint32_t) (sorted_append_size_required(max, array_length) * FOR_GROWTH_FACTOR);
uint8_t *out = new uint8_t[size_required];
uint8_t *out = (uint8_t *) malloc(size_required * sizeof *out);
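// allocated with malloc (not new[]) since this buffer is later released with free(), as done for `in` below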
uint32_t actual_size = for_compress_sorted(sorted_array, out, array_length);
free(in);

View File

@ -1,8 +1,8 @@
#include "string_utils.h"
#include <iostream>
#include <openssl/evp.h>
#include <iomanip>
#include <openssl/hmac.h>
#include <random>
std::string lower_and_no_special_chars(const std::string & str) {
std::stringstream ss;
@ -19,6 +19,10 @@ std::string lower_and_no_special_chars(const std::string & str) {
}
void StringUtils::unicode_normalize(std::string & str) const {
if(str.empty()) {
return ;
}
std::stringstream out;
for (char *s = &str[0]; *s;) {
@ -49,16 +53,16 @@ void StringUtils::unicode_normalize(std::string & str) const {
}
}
str.assign(lower_and_no_special_chars(out.str()));
str = lower_and_no_special_chars(out.str());
}
std::string StringUtils::randstring(size_t length, uint64_t seed) {
std::string StringUtils::randstring(size_t length) {
static auto& chrs = "0123456789"
"abcdefghijklmnopqrstuvwxyz"
"ABCDEFGHIJKLMNOPQRSTUVWXYZ";
thread_local static std::mt19937 rg(seed);
thread_local static std::uniform_int_distribution<std::string::size_type> pick(0, sizeof(chrs) - 2);
thread_local std::mt19937 rg(std::random_device{}());
thread_local std::uniform_int_distribution<uint32_t> pick(0, sizeof(chrs) - 2);
std::string s;
s.reserve(length);

View File

@ -27,9 +27,9 @@ void SystemMetrics::get(const std::string &data_dir_path, nlohmann::json &result
rusage r_usage;
getrusage(RUSAGE_SELF, &r_usage);
result["memory_used_process_bytes"] = r_usage.ru_maxrss * 1000;
result["typesense_memory_used_bytes"] = r_usage.ru_maxrss * 1000;
uint64_t memory_free_bytes = 0;
uint64_t memory_available_bytes = 0;
uint64_t memory_total_bytes = 0;
#ifdef __APPLE__
@ -42,7 +42,7 @@ void SystemMetrics::get(const std::string &data_dir_path, nlohmann::json &result
if (KERN_SUCCESS == host_page_size(mach_port, &mach_page_size) &&
KERN_SUCCESS == host_statistics64(mach_port, HOST_VM_INFO,
(host_info64_t)&vm_stats, &count)) {
memory_free_bytes = (int64_t)(vm_stats.free_count) * (int64_t)mach_page_size;
memory_available_bytes = (int64_t)(vm_stats.free_count) * (int64_t)mach_page_size;
}
uint64_t pages = sysconf(_SC_PHYS_PAGES);
@ -51,11 +51,11 @@ void SystemMetrics::get(const std::string &data_dir_path, nlohmann::json &result
#elif __linux__
struct sysinfo sys_info;
sysinfo(&sys_info);
memory_free_bytes = sys_info.freeram;
memory_available_bytes = linux_get_mem_available_bytes();
memory_total_bytes = sys_info.totalram;
#endif
result["memory_free_bytes"] = memory_free_bytes;
result["memory_available_bytes"] = memory_available_bytes;
result["memory_total_bytes"] = memory_total_bytes;
// CPU METRICS

View File

@ -133,35 +133,45 @@ int init_logger(Config & config, const std::string & server_version) {
signal(SIGILL, catch_crash);
signal(SIGSEGV, catch_crash);
logging::LoggingSettings log_settings;
// we can install new signal handlers only after overriding the ones above
signal(SIGINT, catch_interrupt);
signal(SIGTERM, catch_interrupt);
google::InitGoogleLogging("typesense");
std::string log_dir = config.get_log_dir();
std::string log_path;
if(log_dir.empty()) {
// use console logger if log dir is not specified
log_settings.logging_dest = logging::LOG_TO_SYSTEM_DEBUG_LOG;
FLAGS_logtostderr = true;
} else {
if(!directory_exists(log_dir)) {
std::cerr << "Typesense failed to start. " << "Log directory " << log_dir << " does not exist.";
return 1;
}
log_settings.logging_dest = logging::LOG_TO_FILE;
log_path = log_dir + "/" + "typesense.log";
log_settings.log_file = log_path.c_str();
// flush log levels above -1 immediately (INFO=0)
FLAGS_logbuflevel = -1;
LOG(INFO) << "Starting Typesense " << server_version << ". Log directory is configured as: "
<< log_dir << std::endl;
// available only on glog master (ensures that log file name is constant)
FLAGS_timestamp_in_logfile_name = false;
std::string log_path = log_dir + "/" + "typesense.log";
// will log levels INFO **and above** to the given log file
google::SetLogDestination(google::INFO, log_path.c_str());
// don't create symlink for INFO log
google::SetLogSymlink(google::INFO, "");
// don't create separate log files for each level
google::SetLogDestination(google::WARNING, "");
google::SetLogDestination(google::ERROR, "");
google::SetLogDestination(google::FATAL, "");
std::cout << "Log directory is configured as: " << log_dir << std::endl;
}
logging::InitLogging(log_settings);
logging::SetMinLogLevel(0);
return 0;
}
@ -275,7 +285,6 @@ int start_raft_server(ReplicationState& replication_state, const std::string& st
}
int run_server(const Config & config, const std::string & version, void (*master_server_routes)()) {
LOG(INFO) << "Starting Typesense " << version << std::flush;
quit_raft_service = false;

View File

@ -15,7 +15,7 @@ protected:
std::vector<sort_by> sort_fields;
void setupCollection() {
std::string state_dir_path = "/tmp/typesense_test/collection_sorting";
std::string state_dir_path = "/tmp/typesense_test/collection_faceting";
LOG(INFO) << "Truncating and creating: " << state_dir_path;
system(("rm -rf "+state_dir_path+" && mkdir -p "+state_dir_path).c_str());
@ -29,6 +29,7 @@ protected:
}
virtual void TearDown() {
collectionManager.dispose();
delete store;
}
};

View File

@ -0,0 +1,357 @@
#include <gtest/gtest.h>
#include <string>
#include <vector>
#include <fstream>
#include <algorithm>
#include <collection_manager.h>
#include "collection.h"
class CollectionGroupingTest : public ::testing::Test {
protected:
Store *store;
CollectionManager & collectionManager = CollectionManager::get_instance();
Collection *coll_group;
void setupCollection() {
std::string state_dir_path = "/tmp/typesense_test/collection_grouping";
LOG(INFO) << "Truncating and creating: " << state_dir_path;
system(("rm -rf "+state_dir_path+" && mkdir -p "+state_dir_path).c_str());
store = new Store(state_dir_path);
collectionManager.init(store, 4, "auth_key");
collectionManager.load();
std::vector<field> fields = {
field("title", field_types::STRING, false),
field("brand", field_types::STRING, true, true),
field("size", field_types::INT32, true, false),
field("colors", field_types::STRING_ARRAY, true, false),
field("rating", field_types::FLOAT, true, false)
};
coll_group = collectionManager.get_collection("coll_group");
if(coll_group == nullptr) {
coll_group = collectionManager.create_collection("coll_group", fields, "rating").get();
}
std::ifstream infile(std::string(ROOT_DIR)+"test/group_documents.jsonl");
std::string json_line;
while (std::getline(infile, json_line)) {
auto add_op = coll_group->add(json_line);
if(!add_op.ok()) {
std::cout << add_op.error() << std::endl;
}
ASSERT_TRUE(add_op.ok());
}
infile.close();
}
virtual void SetUp() {
setupCollection();
}
virtual void TearDown() {
collectionManager.dispose();
delete store;
}
};
TEST_F(CollectionGroupingTest, GroupingBasics) {
// group by size (int32)
auto res = coll_group->search("*", {}, "", {"brand"}, {}, 0, 50, 1, FREQUENCY,
false, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30,
"", 10,
{}, {}, {"size"}, 2).get();
ASSERT_EQ(3, res["found"].get<size_t>());
ASSERT_EQ(3, res["grouped_hits"].size());
ASSERT_EQ(11, res["grouped_hits"][0]["group_key"][0].get<size_t>());
ASSERT_FLOAT_EQ(4.8, res["grouped_hits"][0]["hits"][0]["document"]["rating"].get<float>());
ASSERT_EQ(11, res["grouped_hits"][0]["hits"][0]["document"]["size"].get<size_t>());
ASSERT_STREQ("5", res["grouped_hits"][0]["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_FLOAT_EQ(4.3, res["grouped_hits"][0]["hits"][1]["document"]["rating"].get<float>());
ASSERT_STREQ("1", res["grouped_hits"][0]["hits"][1]["document"]["id"].get<std::string>().c_str());
ASSERT_FLOAT_EQ(4.8, res["grouped_hits"][1]["hits"][0]["document"]["rating"].get<float>());
ASSERT_STREQ("4", res["grouped_hits"][1]["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_FLOAT_EQ(4.6, res["grouped_hits"][1]["hits"][1]["document"]["rating"].get<float>());
ASSERT_STREQ("3", res["grouped_hits"][1]["hits"][1]["document"]["id"].get<std::string>().c_str());
ASSERT_FLOAT_EQ(4.6, res["grouped_hits"][2]["hits"][0]["document"]["rating"].get<float>());
ASSERT_STREQ("2", res["grouped_hits"][2]["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_FLOAT_EQ(4.4, res["grouped_hits"][2]["hits"][1]["document"]["rating"].get<float>());
ASSERT_STREQ("8", res["grouped_hits"][2]["hits"][1]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("brand", res["facet_counts"][0]["field_name"].get<std::string>().c_str());
ASSERT_EQ(3, (int) res["facet_counts"][0]["counts"][0]["count"]);
ASSERT_STREQ("Omega", res["facet_counts"][0]["counts"][0]["value"].get<std::string>().c_str());
ASSERT_EQ(3, (int) res["facet_counts"][0]["counts"][1]["count"]);
ASSERT_STREQ("Beta", res["facet_counts"][0]["counts"][1]["value"].get<std::string>().c_str());
ASSERT_EQ(2, (int) res["facet_counts"][0]["counts"][2]["count"]);
ASSERT_STREQ("Xorp", res["facet_counts"][0]["counts"][2]["value"].get<std::string>().c_str());
ASSERT_EQ(1, (int) res["facet_counts"][0]["counts"][3]["count"]);
ASSERT_STREQ("Zeta", res["facet_counts"][0]["counts"][3]["value"].get<std::string>().c_str());
// group by rating (float) and sort by size
std::vector<sort_by> sort_size = {sort_by("size", "DESC")};
res = coll_group->search("*", {}, "", {"brand"}, sort_size, 0, 50, 1, FREQUENCY,
false, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "brand: omeg", 30,
"", 10,
{}, {}, {"rating"}, 2).get();
// 7 unique ratings
ASSERT_EQ(7, res["found"].get<size_t>());
ASSERT_EQ(7, res["grouped_hits"].size());
ASSERT_FLOAT_EQ(4.4, res["grouped_hits"][0]["group_key"][0].get<float>());
ASSERT_EQ(12, res["grouped_hits"][0]["hits"][0]["document"]["size"].get<uint32_t>());
ASSERT_STREQ("8", res["grouped_hits"][0]["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_FLOAT_EQ(4.4, res["grouped_hits"][0]["hits"][0]["document"]["rating"].get<float>());
ASSERT_EQ(12, res["grouped_hits"][1]["hits"][0]["document"]["size"].get<uint32_t>());
ASSERT_STREQ("6", res["grouped_hits"][1]["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_FLOAT_EQ(4.3, res["grouped_hits"][1]["hits"][0]["document"]["rating"].get<float>());
ASSERT_EQ(11, res["grouped_hits"][1]["hits"][1]["document"]["size"].get<uint32_t>());
ASSERT_STREQ("1", res["grouped_hits"][1]["hits"][1]["document"]["id"].get<std::string>().c_str());
ASSERT_FLOAT_EQ(4.3, res["grouped_hits"][1]["hits"][1]["document"]["rating"].get<float>());
ASSERT_EQ(10, res["grouped_hits"][5]["hits"][0]["document"]["size"].get<uint32_t>());
ASSERT_STREQ("9", res["grouped_hits"][5]["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_FLOAT_EQ(4.1, res["grouped_hits"][5]["hits"][0]["document"]["rating"].get<float>());
ASSERT_EQ(10, res["grouped_hits"][6]["hits"][0]["document"]["size"].get<uint32_t>());
ASSERT_STREQ("0", res["grouped_hits"][6]["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_FLOAT_EQ(4.5, res["grouped_hits"][6]["hits"][0]["document"]["rating"].get<float>());
ASSERT_STREQ("brand", res["facet_counts"][0]["field_name"].get<std::string>().c_str());
ASSERT_EQ(3, (int) res["facet_counts"][0]["counts"][0]["count"]);
ASSERT_STREQ("Omega", res["facet_counts"][0]["counts"][0]["value"].get<std::string>().c_str());
ASSERT_STREQ("<mark>Omeg</mark>a", res["facet_counts"][0]["counts"][0]["highlighted"].get<std::string>().c_str());
}
TEST_F(CollectionGroupingTest, GroupingCompoundKey) {
// group by size+brand (int32, string)
auto res = coll_group->search("*", {}, "", {"brand"}, {}, 0, 50, 1, FREQUENCY,
false, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30,
"", 10,
{}, {}, {"size", "brand"}, 2).get();
ASSERT_EQ(10, res["found"].get<size_t>());
ASSERT_EQ(10, res["grouped_hits"].size());
ASSERT_EQ(11, res["grouped_hits"][0]["group_key"][0].get<size_t>());
ASSERT_STREQ("Beta", res["grouped_hits"][0]["group_key"][1].get<std::string>().c_str());
// optional field should have no value in the group key component
ASSERT_EQ(1, res["grouped_hits"][5]["group_key"].size());
ASSERT_STREQ("10", res["grouped_hits"][5]["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("11", res["grouped_hits"][5]["hits"][1]["document"]["id"].get<std::string>().c_str());
ASSERT_EQ(1, res["grouped_hits"][0]["hits"].size());
ASSERT_FLOAT_EQ(4.8, res["grouped_hits"][0]["hits"][0]["document"]["rating"].get<float>());
ASSERT_STREQ("5", res["grouped_hits"][0]["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_EQ(1, res["grouped_hits"][1]["hits"].size());
ASSERT_FLOAT_EQ(4.8, res["grouped_hits"][1]["hits"][0]["document"]["rating"].get<float>());
ASSERT_STREQ("4", res["grouped_hits"][1]["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_EQ(2, res["grouped_hits"][2]["hits"].size());
ASSERT_FLOAT_EQ(4.6, res["grouped_hits"][2]["hits"][0]["document"]["rating"].get<float>());
ASSERT_STREQ("3", res["grouped_hits"][2]["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_FLOAT_EQ(4.5, res["grouped_hits"][2]["hits"][1]["document"]["rating"].get<float>());
ASSERT_STREQ("0", res["grouped_hits"][2]["hits"][1]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("brand", res["facet_counts"][0]["field_name"].get<std::string>().c_str());
ASSERT_EQ(3, (int) res["facet_counts"][0]["counts"][0]["count"]);
ASSERT_STREQ("Omega", res["facet_counts"][0]["counts"][0]["value"].get<std::string>().c_str());
ASSERT_EQ(3, (int) res["facet_counts"][0]["counts"][1]["count"]);
ASSERT_STREQ("Beta", res["facet_counts"][0]["counts"][1]["value"].get<std::string>().c_str());
ASSERT_EQ(2, (int) res["facet_counts"][0]["counts"][2]["count"]);
ASSERT_STREQ("Xorp", res["facet_counts"][0]["counts"][2]["value"].get<std::string>().c_str());
ASSERT_EQ(1, (int) res["facet_counts"][0]["counts"][3]["count"]);
ASSERT_STREQ("Zeta", res["facet_counts"][0]["counts"][3]["value"].get<std::string>().c_str());
// pagination with page=2, per_page=2
res = coll_group->search("*", {}, "", {"brand"}, {}, 0, 2, 2, FREQUENCY,
false, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30,
"", 10,
{}, {}, {"size", "brand"}, 2).get();
// 3rd result from previous assertion will be in the first position
ASSERT_EQ(2, res["grouped_hits"][0]["hits"].size());
ASSERT_FLOAT_EQ(4.6, res["grouped_hits"][0]["hits"][0]["document"]["rating"].get<float>());
ASSERT_STREQ("3", res["grouped_hits"][0]["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_FLOAT_EQ(4.5, res["grouped_hits"][0]["hits"][1]["document"]["rating"].get<float>());
ASSERT_STREQ("0", res["grouped_hits"][0]["hits"][1]["document"]["id"].get<std::string>().c_str());
// total count and facet counts should be the same
ASSERT_EQ(10, res["found"].get<size_t>());
ASSERT_EQ(2, res["grouped_hits"].size());
ASSERT_EQ(10, res["grouped_hits"][0]["group_key"][0].get<size_t>());
ASSERT_STREQ("Omega", res["grouped_hits"][0]["group_key"][1].get<std::string>().c_str());
ASSERT_STREQ("brand", res["facet_counts"][0]["field_name"].get<std::string>().c_str());
ASSERT_EQ(3, (int) res["facet_counts"][0]["counts"][0]["count"]);
ASSERT_STREQ("Omega", res["facet_counts"][0]["counts"][0]["value"].get<std::string>().c_str());
ASSERT_EQ(3, (int) res["facet_counts"][0]["counts"][1]["count"]);
ASSERT_STREQ("Beta", res["facet_counts"][0]["counts"][1]["value"].get<std::string>().c_str());
ASSERT_EQ(2, (int) res["facet_counts"][0]["counts"][2]["count"]);
ASSERT_STREQ("Xorp", res["facet_counts"][0]["counts"][2]["value"].get<std::string>().c_str());
ASSERT_EQ(1, (int) res["facet_counts"][0]["counts"][3]["count"]);
ASSERT_STREQ("Zeta", res["facet_counts"][0]["counts"][3]["value"].get<std::string>().c_str());
// respect min and max grouping limit (greater than 0 and less than 99)
auto res_op = coll_group->search("*", {}, "", {"brand"}, {}, 0, 50, 1, FREQUENCY,
false, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "brand: omeg", 30,
"", 10,
{}, {}, {"rating"}, 100);
ASSERT_FALSE(res_op.ok());
ASSERT_STREQ("Value of `group_limit` must be between 1 and 99.", res_op.error().c_str());
res_op = coll_group->search("*", {}, "", {"brand"}, {}, 0, 50, 1, FREQUENCY,
false, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "brand: omeg", 30,
"", 10,
{}, {}, {"rating"}, 0);
ASSERT_FALSE(res_op.ok());
ASSERT_STREQ("Value of `group_limit` must be between 1 and 99.", res_op.error().c_str());
}
TEST_F(CollectionGroupingTest, GroupingWithGroupLimitOfOne) {
auto res = coll_group->search("*", {}, "", {"brand"}, {}, 0, 50, 1, FREQUENCY,
false, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30,
"", 10,
{}, {}, {"brand"}, 1).get();
ASSERT_EQ(5, res["found"].get<size_t>());
ASSERT_EQ(5, res["grouped_hits"].size());
// each hits array must be of size 1
for(auto i=0; i<5; i++) {
ASSERT_EQ(1, res["grouped_hits"][i]["hits"].size());
}
ASSERT_STREQ("4", res["grouped_hits"][0]["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("2", res["grouped_hits"][1]["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("8", res["grouped_hits"][2]["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("10", res["grouped_hits"][3]["hits"][0]["document"]["id"].get<std::string>().c_str()); // unbranded
ASSERT_STREQ("9", res["grouped_hits"][4]["hits"][0]["document"]["id"].get<std::string>().c_str());
// facet counts should each be 1, including unbranded
ASSERT_STREQ("brand", res["facet_counts"][0]["field_name"].get<std::string>().c_str());
for(size_t i=0; i < 4; i++) {
ASSERT_EQ(1, (int) res["facet_counts"][0]["counts"][i]["count"]);
}
}
TEST_F(CollectionGroupingTest, GroupingWithArrayFieldAndOverride) {
nlohmann::json override_json_include = {
{"id", "include-rule"},
{
"rule", {
{"query", "shirt"},
{"match", override_t::MATCH_EXACT}
}
}
};
override_json_include["includes"] = nlohmann::json::array();
override_json_include["includes"][0] = nlohmann::json::object();
override_json_include["includes"][0]["id"] = "11";
override_json_include["includes"][0]["position"] = 1;
override_json_include["includes"][1] = nlohmann::json::object();
override_json_include["includes"][1]["id"] = "10";
override_json_include["includes"][1]["position"] = 1;
nlohmann::json override_json_exclude = {
{"id", "exclude-rule"},
{
"rule", {
{"query", "shirt"},
{"match", override_t::MATCH_EXACT}
}
}
};
override_json_exclude["excludes"] = nlohmann::json::array();
override_json_exclude["excludes"][0] = nlohmann::json::object();
override_json_exclude["excludes"][0]["id"] = "2";
override_t override1(override_json_include);
override_t override2(override_json_exclude);
Option<uint32_t> ov1_op = coll_group->add_override(override1);
Option<uint32_t> ov2_op = coll_group->add_override(override2);
ASSERT_TRUE(ov1_op.ok());
ASSERT_TRUE(ov2_op.ok());
auto res = coll_group->search("shirt", {"title"}, "", {"brand"}, {}, 0, 50, 1, FREQUENCY,
false, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30,
"", 10,
{}, {}, {"colors"}, 2).get();
ASSERT_EQ(4, res["found"].get<size_t>());
ASSERT_EQ(4, res["grouped_hits"].size());
ASSERT_EQ(1, res["grouped_hits"][0]["group_key"][0].size());
ASSERT_STREQ("white", res["grouped_hits"][0]["group_key"][0][0].get<std::string>().c_str());
ASSERT_STREQ("11", res["grouped_hits"][0]["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("10", res["grouped_hits"][0]["hits"][1]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("5", res["grouped_hits"][1]["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("3", res["grouped_hits"][1]["hits"][1]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("4", res["grouped_hits"][2]["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("0", res["grouped_hits"][2]["hits"][1]["document"]["id"].get<std::string>().c_str());
ASSERT_EQ(1, res["grouped_hits"][3]["hits"].size());
ASSERT_STREQ("8", res["grouped_hits"][3]["hits"][0]["document"]["id"].get<std::string>().c_str());
// assert facet counts
ASSERT_STREQ("brand", res["facet_counts"][0]["field_name"].get<std::string>().c_str());
ASSERT_EQ(2, (int) res["facet_counts"][0]["counts"][0]["count"]);
ASSERT_STREQ("Omega", res["facet_counts"][0]["counts"][0]["value"].get<std::string>().c_str());
ASSERT_EQ(2, (int) res["facet_counts"][0]["counts"][1]["count"]);
ASSERT_STREQ("Xorp", res["facet_counts"][0]["counts"][1]["value"].get<std::string>().c_str());
ASSERT_EQ(2, (int) res["facet_counts"][0]["counts"][2]["count"]);
ASSERT_STREQ("Beta", res["facet_counts"][0]["counts"][2]["value"].get<std::string>().c_str());
ASSERT_EQ(1, (int) res["facet_counts"][0]["counts"][3]["count"]);
ASSERT_STREQ("Zeta", res["facet_counts"][0]["counts"][3]["value"].get<std::string>().c_str());
}

View File

@ -40,6 +40,7 @@ protected:
virtual void TearDown() {
collectionManager.drop_collection("collection1");
collectionManager.dispose();
delete store;
}
};

View File

@ -18,7 +18,7 @@ protected:
system(("rm -rf "+state_dir_path+" && mkdir -p "+state_dir_path).c_str());
store = new Store(state_dir_path);
collectionManager.init(store, 1, "auth_key");
collectionManager.init(store, 4, "auth_key");
collectionManager.load();
std::ifstream infile(std::string(ROOT_DIR)+"test/multi_field_documents.jsonl");
@ -49,6 +49,7 @@ protected:
virtual void TearDown() {
collectionManager.drop_collection("coll_mul_fields");
collectionManager.dispose();
delete store;
}
};
@ -121,6 +122,11 @@ TEST_F(CollectionOverrideTest, ExcludeIncludeExactQueryMatch) {
ASSERT_STREQ("3", results["hits"][1]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("13", results["hits"][2]["document"]["id"].get<std::string>().c_str());
// curated results should be marked as such
ASSERT_EQ(true, results["hits"][0]["curated"].get<bool>());
ASSERT_EQ(true, results["hits"][1]["curated"].get<bool>());
ASSERT_EQ(0, results["hits"][2].count("curated"));
coll_mul_fields->remove_override("exclude-rule");
coll_mul_fields->remove_override("include-rule");
@ -219,6 +225,8 @@ TEST_F(CollectionOverrideTest, ExcludeIncludeFacetFilterQuery) {
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "starring: scott").get();
ASSERT_EQ(9, results["found"].get<size_t>());
// "count" would be `2` without exclusion
ASSERT_EQ("<mark>Scott</mark> Glenn", results["facet_counts"][0]["counts"][0]["highlighted"].get<std::string>());
ASSERT_EQ(1, results["facet_counts"][0]["counts"][0]["count"].get<size_t>());
@ -233,7 +241,7 @@ TEST_F(CollectionOverrideTest, ExcludeIncludeFacetFilterQuery) {
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "starring: scott").get();
ASSERT_EQ(10, results["found"].get<size_t>());
ASSERT_EQ(9, results["found"].get<size_t>());
ASSERT_EQ(0, results["hits"].size());
coll_mul_fields->remove_override("exclude-rule");
@ -246,7 +254,7 @@ TEST_F(CollectionOverrideTest, ExcludeIncludeFacetFilterQuery) {
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "").get();
ASSERT_EQ(1, results["found"].get<size_t>());
ASSERT_EQ(2, results["found"].get<size_t>());
ASSERT_EQ(1, results["hits"].size());
ASSERT_EQ("0", results["hits"][0]["document"]["id"].get<std::string>());
@ -254,10 +262,9 @@ TEST_F(CollectionOverrideTest, ExcludeIncludeFacetFilterQuery) {
}
TEST_F(CollectionOverrideTest, IncludeExcludeHitsQuery) {
std::map<std::string, size_t> pinned_hits;
std::vector<std::string> hidden_hits;
pinned_hits["13"] = 1;
pinned_hits["4"] = 2;
std::map<size_t, std::vector<std::string>> pinned_hits;
pinned_hits[1] = {"13"};
pinned_hits[2] = {"4"};
// basic pinning
@ -277,6 +284,7 @@ TEST_F(CollectionOverrideTest, IncludeExcludeHitsQuery) {
// both pinning and hiding
std::vector<std::string> hidden_hits;
hidden_hits = {"11", "16"};
results = coll_mul_fields->search("the", {"title"}, "", {"starring"}, {}, 0, 50, 1, FREQUENCY,
false, Index::DROP_TOKENS_THRESHOLD,
@ -289,6 +297,21 @@ TEST_F(CollectionOverrideTest, IncludeExcludeHitsQuery) {
ASSERT_STREQ("4", results["hits"][1]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("6", results["hits"][2]["document"]["id"].get<std::string>().c_str());
// paginating such that pinned hits appear on second page
pinned_hits.clear();
pinned_hits[4] = {"13"};
pinned_hits[5] = {"4"};
results = coll_mul_fields->search("the", {"title"}, "", {"starring"}, {}, 0, 2, 2, FREQUENCY,
false, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "starring: will", 30,
"", 10,
pinned_hits, hidden_hits).get();
ASSERT_STREQ("1", results["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("13", results["hits"][1]["document"]["id"].get<std::string>().c_str());
// pinned and hidden hits should take precedence over override rules
nlohmann::json override_json_include = {
@ -325,4 +348,60 @@ TEST_F(CollectionOverrideTest, IncludeExcludeHitsQuery) {
ASSERT_EQ(8, results["found"].get<size_t>());
ASSERT_STREQ("8", results["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("6", results["hits"][1]["document"]["id"].get<std::string>().c_str());
}
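
A note on the pagination check above: pinned positions are absolute 1-based ranks, so which page a pinned document surfaces on is plain ceiling division. A standalone sketch (the helper name is ours, not the server's):

```
#include <cassert>
#include <cstddef>

// A pinned position is an absolute 1-based rank; the page it lands on is
// ceil(position / per_page). With per_page = 2, position 4 -> page 2.
size_t page_for_position(size_t position, size_t per_page) {
    return (position + per_page - 1) / per_page;
}

int main() {
    assert(page_for_position(4, 2) == 2);  // "13" pinned at 4 shows on page 2
    assert(page_for_position(5, 2) == 3);  // "4" pinned at 5 falls on page 3
    return 0;
}
```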
TEST_F(CollectionOverrideTest, PinnedHitsGrouping) {
std::map<size_t, std::vector<std::string>> pinned_hits;
pinned_hits[1] = {"6", "8"};
pinned_hits[2] = {"1"};
pinned_hits[3] = {"13", "4"};
// without any grouping parameter, only the first ID in a position should be picked
// and other IDs should appear in their original positions
auto results = coll_mul_fields->search("the", {"title"}, "", {"starring"}, {}, 0, 50, 1, FREQUENCY,
false, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "starring: will", 30,
"", 10,
pinned_hits, {}).get();
ASSERT_EQ(10, results["found"].get<size_t>());
ASSERT_STREQ("6", results["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("1", results["hits"][1]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("13", results["hits"][2]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("11", results["hits"][3]["document"]["id"].get<std::string>().c_str());
// pinned hits should be marked as curated
ASSERT_EQ(true, results["hits"][0]["curated"].get<bool>());
ASSERT_EQ(true, results["hits"][1]["curated"].get<bool>());
ASSERT_EQ(true, results["hits"][2]["curated"].get<bool>());
ASSERT_EQ(0, results["hits"][3].count("curated"));
// with grouping
results = coll_mul_fields->search("the", {"title"}, "", {"starring"}, {}, 0, 50, 1, FREQUENCY,
false, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "starring: will", 30,
"", 10,
pinned_hits, {}, {"cast"}, 2).get();
ASSERT_EQ(8, results["found"].get<size_t>());
ASSERT_EQ(1, results["grouped_hits"][0]["group_key"].size());
ASSERT_EQ(2, results["grouped_hits"][0]["group_key"][0].size());
ASSERT_STREQ("Chris Evans", results["grouped_hits"][0]["group_key"][0][0].get<std::string>().c_str());
ASSERT_STREQ("Scarlett Johansson", results["grouped_hits"][0]["group_key"][0][1].get<std::string>().c_str());
ASSERT_STREQ("6", results["grouped_hits"][0]["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("8", results["grouped_hits"][0]["hits"][1]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("1", results["grouped_hits"][1]["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("13", results["grouped_hits"][2]["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("4", results["grouped_hits"][2]["hits"][1]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("11", results["grouped_hits"][3]["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("16", results["grouped_hits"][4]["hits"][0]["document"]["id"].get<std::string>().c_str());
}
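
The ungrouped half of this test pins two IDs to one slot but expects only the first to claim it. A sketch of that selection rule as the assertions describe it (the helper name is ours; the server's internals may differ):

```
#include <map>
#include <string>
#include <vector>

// Sketch of the ungrouped pinning rule observed above: when several IDs are
// pinned to one position and no group_by is given, only the first ID claims
// the slot; the rest keep their organic ranks.
std::map<size_t, std::string> first_id_per_position(
        const std::map<size_t, std::vector<std::string>>& pinned_hits) {
    std::map<size_t, std::string> slots;
    for(const auto& kv : pinned_hits) {
        if(!kv.second.empty()) {
            slots[kv.first] = kv.second.front();
        }
    }
    return slots;
}
```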

View File

@ -29,6 +29,7 @@ protected:
}
virtual void TearDown() {
collectionManager.dispose();
delete store;
}
};
@ -248,6 +249,7 @@ TEST_F(CollectionSortingTest, ThreeSortFieldsLimit) {
sort_by("min", "DESC"),
};
query_fields = {"title"};
auto res_op = coll1->search("the", query_fields, "", {}, sort_fields_desc, 0, 10, 1, FREQUENCY, false);
ASSERT_FALSE(res_op.ok());
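
The failure branch above only asserts that the Option is not ok; when debugging, the error can be surfaced like this (the exact message wording is an assumption, not quoted from the source):

```
if(!res_op.ok()) {
    // Message wording assumed; the test only checks ok() == false.
    LOG(ERROR) << res_op.error();
}
```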

View File

@ -56,6 +56,7 @@ protected:
virtual void TearDown() {
collectionManager.drop_collection("collection");
collectionManager.dispose();
delete store;
}
};
@ -455,6 +456,18 @@ TEST_F(CollectionTest, WildcardQuery) {
std::string id = ids.at(i);
ASSERT_STREQ(id.c_str(), result_id.c_str());
}
// wildcard query should not require a search field
results_op = collection->search("*", {}, "", {}, sort_fields, 0, 3, 1, FREQUENCY, false);
ASSERT_TRUE(results_op.ok());
results = results_op.get();
ASSERT_EQ(3, results["hits"].size());
ASSERT_EQ(25, results["found"].get<uint32_t>());
// non-wildcard query should require a search field
results_op = collection->search("the", {}, "", {}, sort_fields, 0, 3, 1, FREQUENCY, false);
ASSERT_FALSE(results_op.ok());
ASSERT_STREQ("No search fields specified for the query.", results_op.error().c_str());
}
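
Taken together, the two checks above pin down the contract: `*` may be issued with an empty field list, while a term query must name at least one field. A usage sketch mirroring the same calls:

```
// Wildcard: list documents without term matching; no search field needed.
auto all_docs_op = collection->search("*", {}, "", {}, sort_fields, 0, 3, 1, FREQUENCY, false);

// Term query without fields: rejected with an error Option.
auto bad_op = collection->search("the", {}, "", {}, sort_fields, 0, 3, 1, FREQUENCY, false);
if(!bad_op.ok()) {
    LOG(ERROR) << bad_op.error();  // "No search fields specified for the query."
}
```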
TEST_F(CollectionTest, PrefixSearching) {
@ -2259,7 +2272,6 @@ TEST_F(CollectionTest, OptionalFields) {
// try fetching the schema (should contain optional field)
nlohmann::json coll_summary = coll1->get_summary_json();
LOG(INFO) << coll_summary;
ASSERT_STREQ("title", coll_summary["fields"][0]["name"].get<std::string>().c_str());
ASSERT_STREQ("string", coll_summary["fields"][0]["type"].get<std::string>().c_str());
ASSERT_FALSE(coll_summary["fields"][0]["facet"].get<bool>());

View File

@ -0,0 +1,12 @@
{"title": "Omega Casual Poplin Shirt", "brand": "Omega", "size": 10, "colors": ["white", "blue"], "rating": 4.5}
{"title": "Omega Casual Poplin Shirt", "brand": "Omega", "size": 11, "colors": ["white", "blue"], "rating": 4.3}
{"title": "Omega Casual Poplin Shirt", "brand": "Omega", "size": 12, "colors": ["white", "blue"], "rating": 4.6}
{"title": "Omega Casual Poplin Shirt", "brand": "Omega", "size": 10, "colors": ["blue"], "rating": 4.6}
{"title": "Beta Casual Poplin Shirt", "brand": "Beta", "size": 10, "colors": ["white", "blue"], "rating": 4.8}
{"title": "Beta Casual Poplin Shirt", "brand": "Beta", "size": 11, "colors": ["blue"], "rating": 4.8}
{"title": "Beta Casual Poplin Shirt", "brand": "Beta", "size": 12, "colors": ["white", "blue"], "rating": 4.3}
{"title": "Xorp Casual Shirt", "brand": "Xorp", "size": 10, "colors": ["white", "blue"], "rating": 4.3}
{"title": "Xorp Casual Shirt", "brand": "Xorp", "size": 12, "colors": ["white", "red"], "rating": 4.4}
{"title": "Zeta Casual Shirt", "brand": "Zeta", "size": 10, "colors": ["white", "blue"], "rating": 4.1}
{"title": "White Casual Shirt", "size": 10, "colors": ["white"], "rating": 4.3}
{"title": "White Casual Shirt", "size": 10, "colors": ["white"], "rating": 3.3}

View File

@ -54,3 +54,8 @@ TEST(StringUtilsTest, HMAC) {
std::string digest1 = StringUtils::hmac("KeyVal", "{\"filter_by\": \"user_id:1080\"}");
ASSERT_STREQ("IvjqWNZ5M5ElcvbMoXj45BxkQrZG4ZKEaNQoRioCx2s=", digest1.c_str());
}
TEST(StringUtilsTest, UInt32Validation) {
std::string big_num = "99999999999999999999999999999999";
ASSERT_FALSE(StringUtils::is_uint32_t(big_num));
}
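
A minimal sketch of the kind of validation this asserts, assuming the helper parses with `strtoull` and range-checks against `UINT32_MAX` (the real implementation may differ):

```
#include <cerrno>
#include <cstdint>
#include <cstdlib>
#include <string>

// Sketch only: accept a base-10 string that fits in 32 unsigned bits.
// The 32-digit input above overflows and must be rejected.
bool is_uint32_t_sketch(const std::string& s) {
    if(s.empty() || s[0] == '-') {
        return false;
    }
    errno = 0;
    char* end = nullptr;
    unsigned long long val = std::strtoull(s.c_str(), &end, 10);
    return errno == 0 && end == s.c_str() + s.size() && val <= UINT32_MAX;
}
```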

View File

@ -36,7 +36,8 @@ TEST(TopsterTest, MaxIntValues) {
scores[1] = data[i].primary_attr;
scores[2] = data[i].secondary_attr;
topster.add(data[i].key, data[i].field_id, data[i].query_index, data[i].match_score, scores);
KV kv(data[i].field_id, data[i].query_index, data[i].key, data[i].key, data[i].match_score, scores);
topster.add(&kv);
}
topster.sort();
@ -87,7 +88,8 @@ TEST(TopsterTest, MaxFloatValues) {
scores[1] = Index::float_to_in64_t(data[i].primary_attr);
scores[2] = data[i].secondary_attr;
topster.add(data[i].key, data[i].field_id, data[i].query_index, data[i].match_score, scores);
KV kv(data[i].field_id, data[i].query_index, data[i].key, data[i].key, data[i].match_score, scores);
topster.add(&kv);
}
topster.sort();
@ -97,4 +99,64 @@ TEST(TopsterTest, MaxFloatValues) {
for(uint32_t i = 0; i < topster.size; i++) {
EXPECT_EQ(ids[i], topster.getKeyAt(i));
}
}
TEST(TopsterTest, DistinctIntValues) {
Topster dist_topster(5, 2);
struct {
uint8_t field_id;
uint16_t query_index;
uint64_t distinct_key;
uint64_t match_score;
int64_t primary_attr;
int64_t secondary_attr;
} data[14] = {
{1, 0, 1, 11, 20, 30},
{1, 0, 1, 12, 20, 32},
{1, 0, 2, 4, 20, 30},
{1, 2, 3, 7, 20, 30},
{1, 0, 4, 14, 20, 30},
{1, 1, 5, 9, 20, 30},
{1, 1, 5, 10, 20, 32},
{1, 1, 5, 9, 20, 30},
{1, 0, 6, 6, 20, 30},
{1, 2, 7, 6, 22, 30},
{1, 2, 7, 6, 22, 30},
{1, 1, 8, 9, 20, 30},
{1, 0, 9, 8, 20, 30},
{1, 3, 10, 5, 20, 30},
};
for(int i = 0; i < 14; i++) {
int64_t scores[3];
scores[0] = int64_t(data[i].match_score);
scores[1] = data[i].primary_attr;
scores[2] = data[i].secondary_attr;
KV kv(data[i].field_id, data[i].query_index, i+100, data[i].distinct_key, data[i].match_score, scores);
dist_topster.add(&kv);
}
dist_topster.sort();
std::vector<uint64_t> distinct_ids = {4, 1, 5, 8, 9};
for(uint32_t i = 0; i < dist_topster.size; i++) {
EXPECT_EQ(distinct_ids[i], dist_topster.getDistinctKeyAt(i));
if(distinct_ids[i] == 1) {
EXPECT_EQ(12, (int) dist_topster.getKV(i)->match_score);
EXPECT_EQ(2, dist_topster.group_kv_map[dist_topster.getDistinctKeyAt(i)]->size);
EXPECT_EQ(12, dist_topster.group_kv_map[dist_topster.getDistinctKeyAt(i)]->getKV(0)->match_score);
EXPECT_EQ(11, dist_topster.group_kv_map[dist_topster.getDistinctKeyAt(i)]->getKV(1)->match_score);
}
if(distinct_ids[i] == 5) {
EXPECT_EQ(10, (int) dist_topster.getKV(i)->match_score);
EXPECT_EQ(2, dist_topster.group_kv_map[dist_topster.getDistinctKeyAt(i)]->size);
EXPECT_EQ(10, dist_topster.group_kv_map[dist_topster.getDistinctKeyAt(i)]->getKV(0)->match_score);
EXPECT_EQ(9, dist_topster.group_kv_map[dist_topster.getDistinctKeyAt(i)]->getKV(1)->match_score);
}
}
}
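
Read together, the assertions describe a two-level layout: the top-level topster ranks one representative KV per distinct key, while `group_kv_map` holds a small per-group topster (capped at the second constructor argument, 2 here) with members ordered best-first. A traversal sketch using only the accessors exercised above, assuming `group_kv_map` stores `Topster` pointers as the `->` accesses suggest:

```
dist_topster.sort();
for(uint32_t i = 0; i < dist_topster.size; i++) {
    uint64_t group_key = dist_topster.getDistinctKeyAt(i);
    Topster* group = dist_topster.group_kv_map[group_key];  // per-group topster
    for(uint32_t j = 0; j < group->size; j++) {
        LOG(INFO) << group_key << " rank " << j
                  << " score=" << group->getKV(j)->match_score;
    }
}
```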