From df8f6849fe6f4eec8da028266117385bca7cf0e9 Mon Sep 17 00:00:00 2001
From: Kishore Nallan
Date: Thu, 15 Dec 2022 14:50:09 +0530
Subject: [PATCH] Facet estimation for large result sets.

Adds two search parameters, `facet_sample_percent` and `facet_sample_threshold`.
When the result set is larger than the threshold and the sample percent is
below 100, facet counting only visits a sampled subset of the result ids
(drawn with a constant-seed RNG so that counts stay stable across identical
requests), scales the counts back up, and flags the affected facets with
`"sampled": true` in the response.
---
 include/collection.h                   |   4 +-
 include/field.h                        |   2 +
 include/index.h                        |  11 +-
 src/collection.cpp                     |  26 +--
 src/collection_manager.cpp             |  13 +-
 src/index.cpp                          |  56 ++++-
 test/collection_faceting_test.cpp      | 309 ++++++++++++++++++++++++-
 test/collection_schema_change_test.cpp |   2 +-
 test/collection_specific_test.cpp      | 236 ------------------
 9 files changed, 385 insertions(+), 274 deletions(-)

diff --git a/include/collection.h b/include/collection.h
index 72cdd76c..19d1fe9b 100644
--- a/include/collection.h
+++ b/include/collection.h
@@ -408,7 +408,9 @@ public:
                           const size_t facet_query_num_typos = 2,
                           const size_t filter_curated_hits_option = 2,
                           const bool prioritize_token_position = false,
-                          const std::string& vector_query_str = "") const;
+                          const std::string& vector_query_str = "",
+                          const size_t facet_sample_percent = 100,
+                          const size_t facet_sample_threshold = 0) const;
 
     Option<bool> get_filter_ids(const std::string & simple_filter_query, std::vector<std::pair<size_t, uint32_t*>>& index_ids);
 
diff --git a/include/field.h b/include/field.h
index c3d284ab..d0ccbe80 100644
--- a/include/field.h
+++ b/include/field.h
@@ -682,6 +682,8 @@ struct facet {
 
     bool is_range_query;
 
+    bool sampled = false;
+
     bool get_range(int64_t key, std::pair<int64_t, std::string>& range_pair) {
 
         if(facet_range_map.empty())
 
diff --git a/include/index.h b/include/index.h
index 95a8a4ff..035998e8 100644
--- a/include/index.h
+++ b/include/index.h
@@ -132,6 +132,8 @@ struct search_args {
     std::vector<std::vector<KV*>> override_result_kvs;
 
     vector_query_t& vector_query;
+    size_t facet_sample_percent;
+    size_t facet_sample_threshold;
 
     search_args(std::vector<query_tokens_t> field_query_tokens, std::vector<search_field_t> search_fields,
                 filter_node_t* filter_tree_root, std::vector<facet>& facets,
@@ -145,7 +147,8 @@
                 size_t concurrency, size_t search_cutoff_ms, size_t min_len_1typo, size_t min_len_2typo,
                 size_t max_candidates, const std::vector<enable_t>& infixes, const size_t max_extra_prefix,
                 const size_t max_extra_suffix, const size_t facet_query_num_typos,
-                const bool filter_curated_hits, const enable_t split_join_tokens, vector_query_t& vector_query) :
+                const bool filter_curated_hits, const enable_t split_join_tokens, vector_query_t& vector_query,
+                size_t facet_sample_percent, size_t facet_sample_threshold) :
             field_query_tokens(field_query_tokens), search_fields(search_fields),
             filter_tree_root(filter_tree_root), facets(facets), included_ids(included_ids),
             excluded_ids(excluded_ids), sort_fields_std(sort_fields_std),
@@ -159,7 +162,8 @@
             min_len_1typo(min_len_1typo), min_len_2typo(min_len_2typo), max_candidates(max_candidates),
             infixes(infixes), max_extra_prefix(max_extra_prefix), max_extra_suffix(max_extra_suffix),
             facet_query_num_typos(facet_query_num_typos), filter_curated_hits(filter_curated_hits),
-            split_join_tokens(split_join_tokens), vector_query(vector_query) {
+            split_join_tokens(split_join_tokens), vector_query(vector_query),
+            facet_sample_percent(facet_sample_percent), facet_sample_threshold(facet_sample_threshold) {
 
         const size_t topster_size = std::max((size_t)1, max_hits);  // needs to be atleast 1 since scoring is mandatory
         topster = new Topster(topster_size, group_limit);
@@ -357,6 +361,7 @@ private:
     void log_leaves(int cost, const std::string &token, const std::vector<art_leaf*> &leaves) const;
 
     void do_facets(std::vector<facet> & facets, facet_query_t & facet_query,
+                   bool estimate_facets, size_t facet_sample_percent,
                   const std::vector<facet_info_t>& facet_infos,
                   size_t group_limit, const std::vector<std::string>& group_by_fields,
                   const uint32_t* result_ids, size_t results_size) const;
 
@@ -645,7 +650,7 @@ public:
                 size_t max_candidates, const std::vector<enable_t>& infixes,
                 const size_t max_extra_prefix, const size_t max_extra_suffix,
                 const size_t facet_query_num_typos, const bool filter_curated_hits, enable_t split_join_tokens,
-                const vector_query_t& vector_query) const;
+                const vector_query_t& vector_query, size_t facet_sample_percent, size_t facet_sample_threshold) const;
 
     void remove_field(uint32_t seq_id, const nlohmann::json& document, const std::string& field_name);
 
diff --git a/src/collection.cpp b/src/collection.cpp
index ffacd8ca..d2c697db 100644
--- a/src/collection.cpp
+++ b/src/collection.cpp
@@ -866,7 +866,9 @@ Option<nlohmann::json> Collection::search(const std::string & raw_query,
                                           const size_t facet_query_num_typos,
                                           const size_t filter_curated_hits_option,
                                           const bool prioritize_token_position,
-                                          const std::string& vector_query_str) const {
+                                          const std::string& vector_query_str,
+                                          const size_t facet_sample_percent,
+                                          const size_t facet_sample_threshold) const {
 
     std::shared_lock lock(mutex);
 
@@ -911,6 +913,10 @@
         }
     }
 
+    if(facet_sample_percent > 100) {
+        return Option<nlohmann::json>(400, "Value of `facet_sample_percent` must be less than or equal to 100.");
+    }
+
     if(raw_group_by_fields.empty()) {
         group_limit = 0;
     }
@@ -1302,7 +1308,8 @@
                                           search_stop_millis,
                                           min_len_1typo, min_len_2typo, max_candidates, infixes,
                                           max_extra_prefix, max_extra_suffix, facet_query_num_typos,
-                                          filter_curated_hits, split_join_tokens, vector_query);
+                                          filter_curated_hits, split_join_tokens, vector_query,
+                                          facet_sample_percent, facet_sample_threshold);
 
     index->run_search(search_params);
 
@@ -1319,12 +1326,6 @@
     // for grouping we have to aggregate group set sizes to a count value
     if(group_limit) {
-        for(auto& acc_facet: facets) {
-            for(auto& facet_kv: acc_facet.result_map) {
-                facet_kv.second.count = acc_facet.hash_groups[facet_kv.first].size();
-            }
-        }
-
         total_found = search_params->groups_processed.size() + override_result_kvs.size();
     } else {
         total_found = search_params->all_result_ids_len;
@@ -1430,8 +1431,6 @@
     // handle which fields have to be highlighted
 
     std::vector<highlight_field_t> highlight_items;
-    bool has_atleast_one_fully_highlighted_field = false;
-
     std::vector<std::string> highlight_field_names;
     StringUtils::split(highlight_fields, highlight_field_names, ",");
@@ -1442,12 +1441,6 @@
         process_highlight_fields(weighted_search_fields, raw_search_fields,
                                  include_fields_full, exclude_fields_full,
                                  highlight_field_names, highlight_full_field_names,
                                  infixes, q_tokens, search_params->qtoken_set, highlight_items);
-
-        for(auto& highlight_item: highlight_items) {
-            if(highlight_item.fully_highlighted) {
-                has_atleast_one_fully_highlighted_field = true;
-            }
-        }
     }
 
     nlohmann::json result = nlohmann::json::object();
@@ -1657,6 +1650,7 @@
     for(facet & a_facet: facets) {
         nlohmann::json facet_result = nlohmann::json::object();
         facet_result["field_name"] = a_facet.field_name;
+        facet_result["sampled"] = a_facet.sampled;
         facet_result["counts"] = nlohmann::json::array();
 
         std::vector<facet_value_t> facet_values;
 
diff --git a/src/collection_manager.cpp b/src/collection_manager.cpp
index 2633d39e..e04ce8f9 100644
--- a/src/collection_manager.cpp
+++ b/src/collection_manager.cpp
@@ -695,6 +695,9 @@ Option<bool> CollectionManager::do_search(std::map<std::string, std::string>& re
     const char *EXHAUSTIVE_SEARCH = "exhaustive_search";
     const char *SPLIT_JOIN_TOKENS = "split_join_tokens";
 
+    const char *FACET_SAMPLE_PERCENT = "facet_sample_percent";
+    const char *FACET_SAMPLE_THRESHOLD = "facet_sample_threshold";
+
     // enrich params with values from embedded params
     for(auto& item: embedded_params.items()) {
         if(item.key() == "expires_at") {
@@ -720,7 +723,6 @@
     // end check for mandatory params
 
-
     const std::string& raw_query = req_params[QUERY];
     std::vector<uint32_t> num_typos = {2};
     size_t min_len_1typo = 4;
@@ -772,6 +774,9 @@
     size_t max_extra_prefix = INT16_MAX;
     size_t max_extra_suffix = INT16_MAX;
 
+    size_t facet_sample_percent = 100;
+    size_t facet_sample_threshold = 0;
+
     std::unordered_map<std::string, size_t*> unsigned_int_values = {
         {MIN_LEN_1TYPO, &min_len_1typo},
         {MIN_LEN_2TYPO, &min_len_2typo},
@@ -790,6 +795,8 @@
         {MAX_CANDIDATES, &max_candidates},
         {FACET_QUERY_NUM_TYPOS, &facet_query_num_typos},
         {FILTER_CURATED_HITS, &filter_curated_hits_option},
+        {FACET_SAMPLE_PERCENT, &facet_sample_percent},
+        {FACET_SAMPLE_THRESHOLD, &facet_sample_threshold},
     };
 
     std::unordered_map<std::string, std::string*> str_values = {
@@ -982,7 +989,9 @@
                                           facet_query_num_typos,
                                           filter_curated_hits_option,
                                           prioritize_token_position,
-                                          vector_query
+                                          vector_query,
+                                          facet_sample_percent,
+                                          facet_sample_threshold
     );
 
     uint64_t timeMillis = std::chrono::duration_cast<std::chrono::milliseconds>(
 
diff --git a/src/index.cpp b/src/index.cpp
index f5f7a2d1..f85b55d3 100644
--- a/src/index.cpp
+++ b/src/index.cpp
@@ -4,6 +4,7 @@
 #include
 #include
 #include
+#include <random>
 #include
 #include
 #include
@@ -1228,6 +1229,7 @@ void Index::compute_facet_stats(facet &a_facet, uint64_t raw_value, const std::s
 }
 
 void Index::do_facets(std::vector<facet> & facets, facet_query_t & facet_query,
+                      bool estimate_facets, size_t facet_sample_percent,
                       const std::vector<facet_info_t>& facet_infos,
                       const size_t group_limit, const std::vector<std::string>& group_by_fields,
                       const uint32_t* result_ids, size_t results_size) const {
@@ -1247,8 +1249,21 @@
 
     const auto& field_facet_mapping = field_facet_mapping_it->second;
 
+    // used for sampling facets (if enabled)
+    std::mt19937 gen(137723);  // use constant seed to make sure that counts don't jump around
+    std::uniform_int_distribution<> distr(1, 100);  // 1 to 100 inclusive
+
     for(size_t i = 0; i < results_size; i++) {
         uint32_t doc_seq_id = result_ids[i];
+
+        // if sampling is enabled, we will skip a portion of the results to speed things up
+        if(estimate_facets) {
+            size_t num = distr(gen);
+            if(num > facet_sample_percent) {
+                continue;
+            }
+        }
+
         const auto& facet_hashes_it = field_facet_mapping[doc_seq_id % ARRAY_FACET_DIM]->find(doc_seq_id);
 
         if(facet_hashes_it == field_facet_mapping[doc_seq_id % ARRAY_FACET_DIM]->end()) {
@@ -1265,7 +1280,7 @@
                 compute_facet_stats(a_facet, fhash, facet_field.type);
             }
 
-            if(a_facet.is_range_query){
+            if(a_facet.is_range_query) {
                 auto sort_index_it = sort_index.find(a_facet.field_name);
 
                 if(sort_index_it != sort_index.end()){
@@ -1285,8 +1300,7 @@
                         }
                     }
                 }
-            }
-            else if(!use_facet_query || fquery_hashes.find(fhash) != fquery_hashes.end()) {
+            } else if(!use_facet_query || fquery_hashes.find(fhash) != fquery_hashes.end()) {
                 facet_count_t& facet_count = a_facet.result_map[fhash];
 
                 //LOG(INFO) << "field: " << a_facet.field_name << ", doc id: " << doc_seq_id << ", hash: " << fhash;
@@ -1980,7 +1994,9 @@ void Index::run_search(search_args* search_params) {
                    search_params->facet_query_num_typos,
                    search_params->filter_curated_hits,
                    search_params->split_join_tokens,
-                   search_params->vector_query);
+                   search_params->vector_query,
+                   search_params->facet_sample_percent,
+                   search_params->facet_sample_threshold);
 }
 
 void Index::collate_included_ids(const std::vector<std::string>& q_included_tokens,
@@ -2430,7 +2446,8 @@ void Index::search(std::vector<query_tokens_t>& field_query_tokens, const std::v
                    size_t max_candidates, const std::vector<enable_t>& infixes,
                    const size_t max_extra_prefix, const size_t max_extra_suffix,
                    const size_t facet_query_num_typos, const bool filter_curated_hits, const enable_t split_join_tokens,
-                   const vector_query_t& vector_query) const {
+                   const vector_query_t& vector_query,
+                   size_t facet_sample_percent, size_t facet_sample_threshold) const {
 
     // process the filters
 
@@ -2784,6 +2801,8 @@
     delete [] exclude_token_ids;
     delete [] excluded_result_ids;
 
+    bool estimate_facets = (facet_sample_percent < 100 && all_result_ids_len > facet_sample_threshold);
+
     if(!facets.empty()) {
         const size_t num_threads = std::min(concurrency, all_result_ids_len);
         const size_t window_size = (num_threads == 0) ? 0 :
@@ -2820,9 +2839,11 @@
             thread_pool->enqueue([this, thread_id, &facet_batches, &facet_query, group_limit, group_by_fields,
                                          batch_result_ids, batch_res_len, &facet_infos,
+                                         estimate_facets, facet_sample_percent,
                                          &num_processed, &m_process, &cv_process]() {
                 auto fq = facet_query;
-                do_facets(facet_batches[thread_id], fq, facet_infos, group_limit, group_by_fields,
+                do_facets(facet_batches[thread_id], fq, estimate_facets, facet_sample_percent,
+                          facet_infos, group_limit, group_by_fields,
                           batch_result_ids, batch_res_len);
                 std::unique_lock<std::mutex> lock(m_process);
                 num_processed++;
@@ -2844,8 +2865,8 @@
                     if(group_limit) {
                         // we have to add all group sets
                         acc_facet.hash_groups[facet_kv.first].insert(
-                            this_facet.hash_groups[facet_kv.first].begin(),
-                            this_facet.hash_groups[facet_kv.first].end()
+                                this_facet.hash_groups[facet_kv.first].begin(),
+                                this_facet.hash_groups[facet_kv.first].end()
                         );
                     } else {
                         size_t count = 0;
@@ -2872,6 +2893,22 @@
             }
         }
     }
 
+    for(auto & acc_facet: facets) {
+        for(auto& facet_kv: acc_facet.result_map) {
+            if(group_limit) {
+                facet_kv.second.count = acc_facet.hash_groups[facet_kv.first].size();
+            }
+
+            if(estimate_facets) {
+                facet_kv.second.count = size_t(double(facet_kv.second.count) * (100.0f / facet_sample_percent));
+            }
+        }
+
+        if(estimate_facets) {
+            acc_facet.sampled = true;
+        }
+    }
+
     /*long long int timeMillisF = std::chrono::duration_cast<std::chrono::milliseconds>(
             std::chrono::high_resolution_clock::now() - beginF).count();
     LOG(INFO) << "Time for faceting: " << timeMillisF;*/
@@ -2880,7 +2917,8 @@
         std::vector<facet_info_t> facet_infos(facets.size());
         compute_facet_infos(facets, facet_query, facet_query_num_typos,
                             &included_ids_vec[0], included_ids_vec.size(), group_by_fields, max_candidates, facet_infos);
-        do_facets(facets, facet_query, facet_infos, group_limit, group_by_fields, &included_ids_vec[0], included_ids_vec.size());
+        do_facets(facets, facet_query, estimate_facets, facet_sample_percent,
+                  facet_infos, group_limit, group_by_fields, &included_ids_vec[0], included_ids_vec.size());
 
         all_result_ids_len += curated_topster->size;
 
diff --git a/test/collection_faceting_test.cpp b/test/collection_faceting_test.cpp
index 688a0512..8b0f9717 100644
--- a/test/collection_faceting_test.cpp
+++ b/test/collection_faceting_test.cpp
@@ -73,8 +73,9 @@ TEST_F(CollectionFacetingTest, FacetCounts) {
     ASSERT_EQ(5, results["hits"].size());
     ASSERT_EQ(1, results["facet_counts"].size());
-    ASSERT_EQ(3, results["facet_counts"][0].size());
+    ASSERT_EQ(4, results["facet_counts"][0].size());
     ASSERT_EQ("tags", results["facet_counts"][0]["field_name"]);
+    ASSERT_EQ(false, results["facet_counts"][0]["sampled"].get<bool>());
     ASSERT_EQ(4, results["facet_counts"][0]["counts"].size());
     ASSERT_EQ(1, results["facet_counts"][0]["stats"].size());
     ASSERT_EQ(4, results["facet_counts"][0]["stats"]["total_values"].get<size_t>());
@@ -981,7 +982,6 @@ TEST_F(CollectionFacetingTest, FacetByNestedIntField) {
 }
 
 TEST_F(CollectionFacetingTest, FacetParseTest){
-
     std::vector<field> fields = {
         field("score", field_types::INT32, true),
         field("grade", field_types::INT32, true),
@@ -1008,8 +1008,6 @@
     ASSERT_TRUE(range_facets[1].is_range_query);
     ASSERT_GT(range_facets[1].facet_range_map.size(), 0);
 
-
-
     std::vector<std::string> normal_facet_fields {
         "score",
         "grade"
@@ -1022,9 +1020,7 @@
     ASSERT_STREQ("score", normal_facets[0].field_name.c_str());
     ASSERT_STREQ("grade", normal_facets[1].field_name.c_str());
 
-
-
     std::vector<std::string> mixed_facet_fields {
         "score",
         "grade(A:[80, 100], B:[60, 80], C:[40, 60])",
@@ -1044,3 +1040,304 @@
     ASSERT_STREQ("rank", mixed_facets[2].field_name.c_str());
 }
+
+
+TEST_F(CollectionFacetingTest, RangeFacetTest) {
+    std::vector<field> fields = {field("place", field_types::STRING, false),
+                                 field("state", field_types::STRING, false),
+                                 field("visitors", field_types::INT32, true),};
+    Collection* coll1 = collectionManager.create_collection(
+            "coll1", 1, fields, "", 0, "", {}, {}
+    ).get();
+    nlohmann::json doc1;
+    doc1["id"] = "0";
+    doc1["place"] = "Mysore Palace";
+    doc1["state"] = "Karnataka";
+    doc1["visitors"] = 235486;
+
+    nlohmann::json doc2;
+    doc2["id"] = "1";
+    doc2["place"] = "Hampi";
+    doc2["state"] = "Karnataka";
+    doc2["visitors"] = 187654;
+
+    nlohmann::json doc3;
+    doc3["id"] = "2";
+    doc3["place"] = "Mahabalipuram";
+    doc3["state"] = "TamilNadu";
+    doc3["visitors"] = 174684;
+
+    nlohmann::json doc4;
+    doc4["id"] = "3";
+    doc4["place"] = "Meenakshi Amman Temple";
+    doc4["state"] = "TamilNadu";
+    doc4["visitors"] = 246676;
+
+    nlohmann::json doc5;
+    doc5["id"] = "4";
+    doc5["place"] = "Statue of Unity";
+    doc5["state"] = "Gujarat";
+    doc5["visitors"] = 345878;
+
+
+    ASSERT_TRUE(coll1->add(doc1.dump()).ok());
+    ASSERT_TRUE(coll1->add(doc2.dump()).ok());
+    ASSERT_TRUE(coll1->add(doc3.dump()).ok());
+    ASSERT_TRUE(coll1->add(doc4.dump()).ok());
+    ASSERT_TRUE(coll1->add(doc5.dump()).ok());
+
+    auto results = coll1->search("Karnataka", {"state"},
+                                 "", {"visitors(Busy:[0, 200000], VeryBusy:[200000, 500000])"},
+                                 {}, {2}, 10,
+                                 1, FREQUENCY, {true},
+                                 10, spp::sparse_hash_set<std::string>(),
+                                 spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 10, {}, {}, {}, 0,
+                                 "", "", {}, 1000,
+                                 true, false, true, "", true).get();
+    ASSERT_EQ(2, results["facet_counts"][0]["counts"].size());
+    ASSERT_EQ(1, (int) results["facet_counts"][0]["counts"][0]["count"]);
+    ASSERT_STREQ("Busy", results["facet_counts"][0]["counts"][0]["value"].get<std::string>().c_str());
+
+    auto results2 = coll1->search("Gujarat", {"state"},
+                                  "", {"visitors(Busy:[0, 200000], VeryBusy:[200000, 500000])"},
+                                  {}, {2}, 10,
+                                  1, FREQUENCY, {true},
+                                  10, spp::sparse_hash_set<std::string>(),
+                                  spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 10, {}, {}, {}, 0,
+                                  "", "", {}, 1000,
+                                  true, false, true, "", true).get();
+    ASSERT_EQ(1, results2["facet_counts"][0]["counts"].size());
+    ASSERT_EQ(1, results2["facet_counts"][0]["counts"][0]["count"].get<size_t>());
+    ASSERT_STREQ("VeryBusy", results2["facet_counts"][0]["counts"][0]["value"].get<std::string>().c_str());
+    ASSERT_TRUE(results2["facet_counts"][0]["counts"][1]["value"] == nullptr);
+
+    collectionManager.drop_collection("coll1");
+}
+
+TEST_F(CollectionFacetingTest, RangeFacetContinuity) {
+    std::vector<field> fields = {field("place", field_types::STRING, false),
+                                 field("state", field_types::STRING, false),
+                                 field("visitors", field_types::INT32, true),};
+    Collection* coll1 = collectionManager.create_collection(
+            "coll1", 1, fields, "", 0, "", {}, {}
+    ).get();
+    nlohmann::json doc1;
+    doc1["id"] = "0";
+    doc1["place"] = "Mysore Palace";
+    doc1["state"] = "Karnataka";
+    doc1["visitors"] = 235486;
+
+    nlohmann::json doc2;
+    doc2["id"] = "1";
+    doc2["place"] = "Hampi";
+    doc2["state"] = "Karnataka";
+    doc2["visitors"] = 187654;
+
+    nlohmann::json doc3;
+    doc3["id"] = "2";
+    doc3["place"] = "Mahabalipuram";
+    doc3["state"] = "TamilNadu";
+    doc3["visitors"] = 174684;
+
+    nlohmann::json doc4;
+    doc4["id"] = "3";
+    doc4["place"] = "Meenakshi Amman Temple";
+    doc4["state"] = "TamilNadu";
+    doc4["visitors"] = 246676;
+
+    nlohmann::json doc5;
+    doc5["id"] = "4";
+    doc5["place"] = "Statue of Unity";
+    doc5["state"] = "Gujarat";
+    doc5["visitors"] = 345878;
+
+
+    ASSERT_TRUE(coll1->add(doc1.dump()).ok());
+    ASSERT_TRUE(coll1->add(doc2.dump()).ok());
+    ASSERT_TRUE(coll1->add(doc3.dump()).ok());
+    ASSERT_TRUE(coll1->add(doc4.dump()).ok());
+    ASSERT_TRUE(coll1->add(doc5.dump()).ok());
+
+    auto results = coll1->search("TamilNadu", {"state"},
+                                 "", {"visitors(Busy:[0, 200000], VeryBusy:[200001, 500000])"},
+                                 {}, {2}, 10,
+                                 1, FREQUENCY, {true},
+                                 10, spp::sparse_hash_set<std::string>(),
+                                 spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 10, {}, {}, {}, 0,
+                                 "", "", {}, 1000,
+                                 true, false, true, "", true);
+    ASSERT_STREQ("Ranges in range facet syntax should be continous.", results.error().c_str());
+
+    auto results2 = coll1->search("TamilNadu", {"state"},
+                                  "", {"visitors(Busy:[0, 200000], VeryBusy:[199999, 500000])"},
+                                  {}, {2}, 10,
+                                  1, FREQUENCY, {true},
+                                  10, spp::sparse_hash_set<std::string>(),
+                                  spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 10, {}, {}, {}, 0,
+                                  "", "", {}, 1000,
+                                  true, false, true, "", true);
+    ASSERT_STREQ("Ranges in range facet syntax should be continous.", results2.error().c_str());
+
+    collectionManager.drop_collection("coll1");
+}
+
+TEST_F(CollectionFacetingTest, RangeFacetTypo) {
+    std::vector<field> fields = {field("place", field_types::STRING, false),
+                                 field("state", field_types::STRING, false),
+                                 field("visitors", field_types::INT32, true),};
+    Collection* coll1 = collectionManager.create_collection(
+            "coll1", 1, fields, "", 0, "", {}, {}
+    ).get();
+    nlohmann::json doc1;
+    doc1["id"] = "0";
+    doc1["place"] = "Mysore Palace";
+    doc1["state"] = "Karnataka";
+    doc1["visitors"] = 235486;
+
+    nlohmann::json doc2;
+    doc2["id"] = "1";
+    doc2["place"] = "Hampi";
+    doc2["state"] = "Karnataka";
+    doc2["visitors"] = 187654;
+
+    nlohmann::json doc3;
+    doc3["id"] = "2";
+    doc3["place"] = "Mahabalipuram";
+    doc3["state"] = "TamilNadu";
+    doc3["visitors"] = 174684;
+
+    nlohmann::json doc4;
+    doc4["id"] = "3";
+    doc4["place"] = "Meenakshi Amman Temple";
+    doc4["state"] = "TamilNadu";
+    doc4["visitors"] = 246676;
+
+    nlohmann::json doc5;
+    doc5["id"] = "4";
+    doc5["place"] = "Statue of Unity";
+    doc5["state"] = "Gujarat";
+    doc5["visitors"] = 345878;
+
+
+    ASSERT_TRUE(coll1->add(doc1.dump()).ok());
+    ASSERT_TRUE(coll1->add(doc2.dump()).ok());
+    ASSERT_TRUE(coll1->add(doc3.dump()).ok());
+    ASSERT_TRUE(coll1->add(doc4.dump()).ok());
+    ASSERT_TRUE(coll1->add(doc5.dump()).ok());
+
+    auto results = coll1->search("TamilNadu", {"state"},
+                                 "", {"visitors(Busy:[0, 200000], VeryBusy:[200000, 500000)"}, //missing ']' at end
+                                 {}, {2}, 10,
+                                 1, FREQUENCY, {true},
+                                 10, spp::sparse_hash_set<std::string>(),
+                                 spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 10, {}, {}, {}, 0,
+                                 "", "", {}, 1000,
+                                 true, false, true, "", true);
+    ASSERT_STREQ("Error splitting the range string.", results.error().c_str());
+
+    auto results2 = coll1->search("TamilNadu", {"state"},
+                                  "", {"visitors(Busy:[0, 200000], VeryBusy:200000, 500000])"}, //missing '[' in second range
+                                  {}, {2}, 10,
+                                  1, FREQUENCY, {true},
+                                  10, spp::sparse_hash_set<std::string>(),
+                                  spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 10, {}, {}, {}, 0,
+                                  "", "", {}, 1000,
+                                  true, false, true, "", true);
+    ASSERT_STREQ("Error splitting the range string.", results2.error().c_str());
+
+    auto results3 = coll1->search("TamilNadu", {"state"},
+                                  "", {"visitors(Busy:[0, 200000] VeryBusy:[200000, 500000])"}, //missing ',' between ranges
+                                  {}, {2}, 10,
+                                  1, FREQUENCY, {true},
+                                  10, spp::sparse_hash_set<std::string>(),
+                                  spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 10, {}, {}, {}, 0,
+                                  "", "", {}, 1000,
+                                  true, false, true, "", true);
+    ASSERT_STREQ("Error splitting the range string.", results3.error().c_str());
+
+    auto results4 = coll1->search("TamilNadu", {"state"},
+                                  "", {"visitors(Busy:[0 200000], VeryBusy:[200000, 500000])"}, //missing ',' between first range's values
+                                  {}, {2}, 10,
+                                  1, FREQUENCY, {true},
+                                  10, spp::sparse_hash_set<std::string>(),
+                                  spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 10, {}, {}, {}, 0,
+                                  "", "", {}, 1000,
+                                  true, false, true, "", true);
+    ASSERT_STREQ("Range String range pattern not matched.", results4.error().c_str());
+
+    auto results5 = coll1->search("TamilNadu", {"state"},
+                                  "", {"visitors(Busy:[0, 200000 VeryBusy:200000, 500000])"}, //missing '],' and '['
+                                  {}, {2}, 10,
+                                  1, FREQUENCY, {true},
+                                  10, spp::sparse_hash_set<std::string>(),
+                                  spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 10, {}, {}, {}, 0,
+                                  "", "", {}, 1000,
+                                  true, false, true, "", true);
+    ASSERT_STREQ("Range String range pattern not matched.", results5.error().c_str());
+
+    collectionManager.drop_collection("coll1");
+}
+
+TEST_F(CollectionFacetingTest, SampleFacetCounts) {
+    nlohmann::json schema = R"({
+        "name": "coll1",
+        "fields": [
+            {"name": "color", "type": "string", "facet": true}
+        ]
+    })"_json;
+
+    Collection* coll1 = collectionManager.create_collection(schema).get();
+
+    for(size_t i = 0; i < 1000; i++) {
+        nlohmann::json doc;
+        if(i % 2 == 0) {
+            doc["color"] = "blue";
+        } else {
+            doc["color"] = "red";
+        }
+
+        ASSERT_TRUE(coll1->add(doc.dump()).ok());
+    }
+
+    auto res = coll1->search("*", {}, "", {"color"}, {}, {0}, 3, 1, FREQUENCY, {true}, 5,
+                             spp::sparse_hash_set<std::string>(),
+                             spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 20, {}, {}, {}, 0,
+                             "", "", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7, fallback,
+                             4, {off}, 3, 3, 2, 2, false, "", 10, 0).get();
+
+    ASSERT_EQ(1000, res["found"].get<size_t>());
+    ASSERT_EQ(1, res["facet_counts"].size());
+    ASSERT_EQ(2, res["facet_counts"][0]["counts"].size());
+
+    // verify approximate counts
+    ASSERT_GE(res["facet_counts"][0]["counts"][0]["count"].get<size_t>(), 250);
+    ASSERT_GE(res["facet_counts"][0]["counts"][1]["count"].get<size_t>(), 250);
+    ASSERT_TRUE(res["facet_counts"][0]["sampled"].get<bool>());
+
+    // when sample threshold is high, don't estimate
+    res = coll1->search("*", {}, "", {"color"}, {}, {0}, 3, 1, FREQUENCY, {true}, 5,
+                        spp::sparse_hash_set<std::string>(),
+                        spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 20, {}, {}, {}, 0,
+                        "", "", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7, fallback,
+                        4, {off}, 3, 3, 2, 2, false, "", 10, 10000).get();
+
+    ASSERT_EQ(1000, res["found"].get<size_t>());
+    ASSERT_EQ(1, res["facet_counts"].size());
+    ASSERT_EQ(2, res["facet_counts"][0]["counts"].size());
+
+    // verify exact counts
+    ASSERT_EQ(500, res["facet_counts"][0]["counts"][0]["count"].get<size_t>());
+    ASSERT_EQ(500, res["facet_counts"][0]["counts"][1]["count"].get<size_t>());
+    ASSERT_FALSE(res["facet_counts"][0]["sampled"].get<bool>());
+
+    // test for sample percent > 100
+
+    auto res_op = coll1->search("*", {}, "", {"color"}, {}, {0}, 3, 1, FREQUENCY, {true}, 5,
+                                spp::sparse_hash_set<std::string>(),
+                                spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 20, {}, {}, {}, 0,
+                                "", "", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7, fallback,
+                                4, {off}, 3, 3, 2, 2, false, "", 200, 0);
+
+    ASSERT_FALSE(res_op.ok());
+    ASSERT_EQ("Value of `facet_sample_percent` must be less than or equal to 100.", res_op.error());
+}
diff --git a/test/collection_schema_change_test.cpp b/test/collection_schema_change_test.cpp
index 4555766a..c32ba142 100644
--- a/test/collection_schema_change_test.cpp
+++ b/test/collection_schema_change_test.cpp
@@ -583,7 +583,7 @@ TEST_F(CollectionSchemaChangeTest, AbilityToDropAndReAddIndexAtTheSameTime) {
     ASSERT_EQ(1, res["found"].get<size_t>());
     ASSERT_EQ("0", res["hits"][0]["document"]["id"].get<std::string>());
     ASSERT_EQ(1, res["facet_counts"].size());
-    ASSERT_EQ(3, res["facet_counts"][0].size());
+    ASSERT_EQ(4, res["facet_counts"][0].size());
     ASSERT_EQ("title", res["facet_counts"][0]["field_name"]);
     ASSERT_EQ(1, res["facet_counts"][0]["counts"].size());
     ASSERT_EQ("123", res["facet_counts"][0]["counts"][0]["value"].get<std::string>());
diff --git a/test/collection_specific_test.cpp b/test/collection_specific_test.cpp
index 95ae5039..e051783e 100644
--- a/test/collection_specific_test.cpp
+++ b/test/collection_specific_test.cpp
@@ -2918,239 +2918,3 @@ TEST_F(CollectionSpecificTest, DontHighlightPunctuation) {
 
     collectionManager.drop_collection("coll1");
 }
-
-TEST_F(CollectionSpecificTest, RangeFacetTest) {
-    std::vector<field> fields = {field("place", field_types::STRING, false),
-                                 field("state", field_types::STRING, false),
-                                 field("visitors", field_types::INT32, true),};
-    Collection* coll1 = collectionManager.create_collection(
-            "coll1", 1, fields, "", 0, "", {}, {}
-    ).get();
-    nlohmann::json doc1;
-    doc1["id"] = "0";
-    doc1["place"] = "Mysore Palace";
-    doc1["state"] = "Karnataka";
-    doc1["visitors"] = 235486;
-
-    nlohmann::json doc2;
-    doc2["id"] = "1";
-    doc2["place"] = "Hampi";
-    doc2["state"] = "Karnataka";
-    doc2["visitors"] = 187654;
-
-    nlohmann::json doc3;
-    doc3["id"] = "2";
-    doc3["place"] = "Mahabalipuram";
-    doc3["state"] = "TamilNadu";
-    doc3["visitors"] = 174684;
-
-    nlohmann::json doc4;
-    doc4["id"] = "3";
-    doc4["place"] = "Meenakshi Amman Temple";
-    doc4["state"] = "TamilNadu";
-    doc4["visitors"] = 246676;
-
-    nlohmann::json doc5;
-    doc5["id"] = "4";
-    doc5["place"] = "Staue of Unity";
-    doc5["state"] = "Gujarat";
-    doc5["visitors"] = 345878;
-
-
-    ASSERT_TRUE(coll1->add(doc1.dump()).ok());
-    ASSERT_TRUE(coll1->add(doc2.dump()).ok());
-    ASSERT_TRUE(coll1->add(doc3.dump()).ok());
-    ASSERT_TRUE(coll1->add(doc4.dump()).ok());
-    ASSERT_TRUE(coll1->add(doc5.dump()).ok());
-
-    auto results = coll1->search("Karnataka", {"state"},
-                                 "", {"visitors(Busy:[0, 200000], VeryBusy:[200000, 500000])"},
-                                 {}, {2}, 10,
-                                 1, FREQUENCY, {true},
-                                 10, spp::sparse_hash_set<std::string>(),
-                                 spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 10, {}, {}, {}, 0,
-                                 "", "", {}, 1000,
-                                 true, false, true, "", true).get();
-    ASSERT_EQ(2, results["facet_counts"][0]["counts"].size());
-    ASSERT_EQ(1, (int) results["facet_counts"][0]["counts"][0]["count"]);
-    ASSERT_STREQ("Busy", results["facet_counts"][0]["counts"][0]["value"].get<std::string>().c_str());
-
-    auto results2 = coll1->search("Gujarat", {"state"},
-                                  "", {"visitors(Busy:[0, 200000], VeryBusy:[200000, 500000])"},
-                                  {}, {2}, 10,
-                                  1, FREQUENCY, {true},
-                                  10, spp::sparse_hash_set<std::string>(),
-                                  spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 10, {}, {}, {}, 0,
-                                  "", "", {}, 1000,
-                                  true, false, true, "", true).get();
-    ASSERT_EQ(1, results2["facet_counts"][0]["counts"].size());
-    ASSERT_EQ(1, results2["facet_counts"][0]["counts"][0]["count"].get<size_t>());
-    ASSERT_STREQ("VeryBusy", results2["facet_counts"][0]["counts"][0]["value"].get<std::string>().c_str());
-    ASSERT_TRUE(results2["facet_counts"][0]["counts"][1]["value"] == nullptr);
-
-    collectionManager.drop_collection("coll1");
-}
-
-TEST_F(CollectionSpecificTest, RangeFacetContinuity) {
-    std::vector<field> fields = {field("place", field_types::STRING, false),
-                                 field("state", field_types::STRING, false),
-                                 field("visitors", field_types::INT32, true),};
-    Collection* coll1 = collectionManager.create_collection(
-            "coll1", 1, fields, "", 0, "", {}, {}
-    ).get();
-    nlohmann::json doc1;
-    doc1["id"] = "0";
-    doc1["place"] = "Mysore Palace";
-    doc1["state"] = "Karnataka";
-    doc1["visitors"] = 235486;
-
-    nlohmann::json doc2;
-    doc2["id"] = "1";
-    doc2["place"] = "Hampi";
-    doc2["state"] = "Karnataka";
-    doc2["visitors"] = 187654;
-
-    nlohmann::json doc3;
-    doc3["id"] = "2";
-    doc3["place"] = "Mahabalipuram";
-    doc3["state"] = "TamilNadu";
-    doc3["visitors"] = 174684;
-
-    nlohmann::json doc4;
-    doc4["id"] = "3";
-    doc4["place"] = "Meenakshi Amman Temple";
-    doc4["state"] = "TamilNadu";
-    doc4["visitors"] = 246676;
-
-    nlohmann::json doc5;
-    doc5["id"] = "4";
-    doc5["place"] = "Staue of Unity";
-    doc5["state"] = "Gujarat";
-    doc5["visitors"] = 345878;
-
-
-    ASSERT_TRUE(coll1->add(doc1.dump()).ok());
-    ASSERT_TRUE(coll1->add(doc2.dump()).ok());
-    ASSERT_TRUE(coll1->add(doc3.dump()).ok());
-    ASSERT_TRUE(coll1->add(doc4.dump()).ok());
-    ASSERT_TRUE(coll1->add(doc5.dump()).ok());
-
-    auto results = coll1->search("TamilNadu", {"state"},
-                                 "", {"visitors(Busy:[0, 200000], VeryBusy:[200001, 500000])"},
-                                 {}, {2}, 10,
-                                 1, FREQUENCY, {true},
-                                 10, spp::sparse_hash_set<std::string>(),
-                                 spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 10, {}, {}, {}, 0,
-                                 "", "", {}, 1000,
-                                 true, false, true, "", true);
-    ASSERT_STREQ("Ranges in range facet syntax should be continous.", results.error().c_str());
-
-    auto results2 = coll1->search("TamilNadu", {"state"},
-                                  "", {"visitors(Busy:[0, 200000], VeryBusy:[199999, 500000])"},
-                                  {}, {2}, 10,
-                                  1, FREQUENCY, {true},
-                                  10, spp::sparse_hash_set<std::string>(),
-                                  spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 10, {}, {}, {}, 0,
-                                  "", "", {}, 1000,
-                                  true, false, true, "", true);
-    ASSERT_STREQ("Ranges in range facet syntax should be continous.", results2.error().c_str());
-
-    collectionManager.drop_collection("coll1");
-}
-
-TEST_F(CollectionSpecificTest, RangeFacetTypo) {
-    std::vector<field> fields = {field("place", field_types::STRING, false),
-                                 field("state", field_types::STRING, false),
-                                 field("visitors", field_types::INT32, true),};
-    Collection* coll1 = collectionManager.create_collection(
-            "coll1", 1, fields, "", 0, "", {}, {}
-    ).get();
-    nlohmann::json doc1;
-    doc1["id"] = "0";
-    doc1["place"] = "Mysore Palace";
-    doc1["state"] = "Karnataka";
-    doc1["visitors"] = 235486;
-
-    nlohmann::json doc2;
-    doc2["id"] = "1";
-    doc2["place"] = "Hampi";
-    doc2["state"] = "Karnataka";
-    doc2["visitors"] = 187654;
-
-    nlohmann::json doc3;
-    doc3["id"] = "2";
-    doc3["place"] = "Mahabalipuram";
-    doc3["state"] = "TamilNadu";
-    doc3["visitors"] = 174684;
-
-    nlohmann::json doc4;
-    doc4["id"] = "3";
-    doc4["place"] = "Meenakshi Amman Temple";
-    doc4["state"] = "TamilNadu";
-    doc4["visitors"] = 246676;
-
-    nlohmann::json doc5;
-    doc5["id"] = "4";
-    doc5["place"] = "Staue of Unity";
-    doc5["state"] = "Gujarat";
-    doc5["visitors"] = 345878;
-
-
-    ASSERT_TRUE(coll1->add(doc1.dump()).ok());
-    ASSERT_TRUE(coll1->add(doc2.dump()).ok());
-    ASSERT_TRUE(coll1->add(doc3.dump()).ok());
-    ASSERT_TRUE(coll1->add(doc4.dump()).ok());
-    ASSERT_TRUE(coll1->add(doc5.dump()).ok());
-
-    auto results = coll1->search("TamilNadu", {"state"},
-                                 "", {"visitors(Busy:[0, 200000], VeryBusy:[200000, 500000)"}, //missing ']' at end
-                                 {}, {2}, 10,
-                                 1, FREQUENCY, {true},
-                                 10, spp::sparse_hash_set<std::string>(),
-                                 spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 10, {}, {}, {}, 0,
-                                 "", "", {}, 1000,
-                                 true, false, true, "", true);
-    ASSERT_STREQ("Error splitting the range string.", results.error().c_str());
-
-    auto results2 = coll1->search("TamilNadu", {"state"},
-                                  "", {"visitors(Busy:[0, 200000], VeryBusy:200000, 500000])"}, //missing '[' in second range
-                                  {}, {2}, 10,
-                                  1, FREQUENCY, {true},
-                                  10, spp::sparse_hash_set<std::string>(),
-                                  spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 10, {}, {}, {}, 0,
-                                  "", "", {}, 1000,
-                                  true, false, true, "", true);
-    ASSERT_STREQ("Error splitting the range string.", results2.error().c_str());
-
-    auto results3 = coll1->search("TamilNadu", {"state"},
-                                  "", {"visitors(Busy:[0, 200000] VeryBusy:[200000, 500000])"}, //missing ',' between ranges
-                                  {}, {2}, 10,
-                                  1, FREQUENCY, {true},
-                                  10, spp::sparse_hash_set<std::string>(),
-                                  spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 10, {}, {}, {}, 0,
-                                  "", "", {}, 1000,
-                                  true, false, true, "", true);
-    ASSERT_STREQ("Error splitting the range string.", results3.error().c_str());
-
-    auto results4 = coll1->search("TamilNadu", {"state"},
-                                  "", {"visitors(Busy:[0 200000], VeryBusy:[200000, 500000])"}, //missing ',' between first range's values
-                                  {}, {2}, 10,
-                                  1, FREQUENCY, {true},
-                                  10, spp::sparse_hash_set<std::string>(),
-                                  spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 10, {}, {}, {}, 0,
-                                  "", "", {}, 1000,
-                                  true, false, true, "", true);
-    ASSERT_STREQ("Range String range pattern not matched.", results4.error().c_str());
-
-    auto results5 = coll1->search("TamilNadu", {"state"},
-                                  "", {"visitors(Busy:[0, 200000 VeryBusy:200000, 500000])"}, //missing '],' and '['
-                                  {}, {2}, 10,
-                                  1, FREQUENCY, {true},
-                                  10, spp::sparse_hash_set<std::string>(),
-                                  spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 10, {}, {}, {}, 0,
-                                  "", "", {}, 1000,
-                                  true, false, true, "", true);
-    ASSERT_STREQ("Range String range pattern not matched.", results5.error().c_str());
-
-    collectionManager.drop_collection("coll1");
-}
\ No newline at end of file
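
The sampling scheme in this patch is easy to study in isolation. The following standalone sketch (illustrative only, not Typesense code; the alternating document set and the SAMPLE_PERCENT / SAMPLE_THRESHOLD constants are made-up stand-ins for the `facet_sample_percent` and `facet_sample_threshold` request parameters) mirrors what the patch does in Index::search and Index::do_facets: decide whether to estimate, skip unsampled result ids using a constant-seed RNG, then scale the surviving counts back up by 100 / sample-percent:

    #include <iostream>
    #include <map>
    #include <random>
    #include <string>
    #include <vector>

    int main() {
        // Hypothetical result set: 1000 docs alternating between two facet
        // values, mirroring the SampleFacetCounts test above.
        std::vector<std::string> facet_values(1000);
        for(size_t i = 0; i < facet_values.size(); i++) {
            facet_values[i] = (i % 2 == 0) ? "blue" : "red";
        }

        const size_t SAMPLE_PERCENT = 10;   // stand-in for facet_sample_percent=10
        const size_t SAMPLE_THRESHOLD = 0;  // stand-in for facet_sample_threshold=0

        // Sampling kicks in only when the result set exceeds the threshold,
        // exactly like the estimate_facets flag in Index::search.
        bool estimate = (SAMPLE_PERCENT < 100 && facet_values.size() > SAMPLE_THRESHOLD);

        // Constant seed, as in the patch, so repeated identical searches
        // sample the same documents and report stable counts.
        std::mt19937 gen(137723);
        std::uniform_int_distribution<> distr(1, 100);  // 1 to 100 inclusive

        std::map<std::string, size_t> counts;
        for(const auto& value : facet_values) {
            if(estimate && size_t(distr(gen)) > SAMPLE_PERCENT) {
                continue;  // skip ~90% of docs when sampling at 10%
            }
            counts[value]++;
        }

        for(auto& [value, count] : counts) {
            if(estimate) {
                // Scale the sampled count back up to approximate the true count.
                count = size_t(double(count) * (100.0 / SAMPLE_PERCENT));
            }
            std::cout << value << ": ~" << count << "\n";  // roughly 500 each
        }
        return 0;
    }

With a 10 percent sample over the 1000 alternating documents, each value keeps roughly 50 sampled hits and is scaled back to roughly 500, which is why the SampleFacetCounts test only asserts ASSERT_GE(..., 250) rather than an exact count. The constant seed (137723) is the key design choice: identical requests sample the same documents, so estimated counts stay stable between requests instead of jumping around, at the cost of a consistent rather than averaged-out sampling error. At the HTTP layer, the patch wires both knobs through CollectionManager::do_search, so a request would pass them as query parameters, e.g. `facet_sample_percent=10&facet_sample_threshold=1000`.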