Facet estimation for large result sets.

Kishore Nallan 2022-12-15 14:50:09 +05:30
parent e836af159f
commit df8f6849fe
9 changed files with 385 additions and 274 deletions
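The change works as follows: when `facet_sample_percent` is below 100 and the number of hits exceeds `facet_sample_threshold`, facet counting visits only a random subset of the result ids (using a fixed-seed RNG so that counts stay stable across identical searches) and then scales the observed counts back up by `100 / facet_sample_percent`. Each facet in the response carries a `sampled` flag. Below is a minimal standalone sketch of this idea (hypothetical demo code, not part of the commit; `estimate_facet_counts` and its inputs are invented for illustration, though the seed, the 1–100 distribution, and the scaling mirror the diff):

#include <cstdint>
#include <cstdio>
#include <random>
#include <unordered_map>
#include <vector>

// Hypothetical sketch of the sampling approach in this commit: when the
// result set is large, count facet values for only ~sample_percent percent
// of the hits, then scale the observed counts back up.
std::unordered_map<uint32_t, size_t> estimate_facet_counts(
        const std::vector<uint32_t>& result_ids,
        const std::vector<uint32_t>& facet_value_of_doc, // doc id -> facet value
        size_t sample_percent, size_t sample_threshold) {
    const bool estimate = (sample_percent < 100 &&
                           result_ids.size() > sample_threshold);
    // constant seed so that repeated searches return stable counts
    std::mt19937 gen(137723);
    std::uniform_int_distribution<> distr(1, 100); // 1 to 100 inclusive

    std::unordered_map<uint32_t, size_t> counts;
    for(uint32_t doc_id : result_ids) {
        // if sampling is enabled, skip a portion of the hits
        if(estimate && static_cast<size_t>(distr(gen)) > sample_percent) {
            continue;
        }
        counts[facet_value_of_doc[doc_id]]++;
    }
    if(estimate) {
        // extrapolate observed counts to the full result set
        for(auto& kv : counts) {
            kv.second = size_t(double(kv.second) * (100.0 / sample_percent));
        }
    }
    return counts;
}

int main() {
    // 1000 docs, alternating between two facet values (0 and 1)
    std::vector<uint32_t> ids(1000), values(1000);
    for(uint32_t i = 0; i < 1000; i++) { ids[i] = i; values[i] = i % 2; }
    auto counts = estimate_facet_counts(ids, values, /*percent=*/10,
                                        /*threshold=*/0);
    for(auto& kv : counts) {
        printf("facet %u ~= %zu\n", kv.first, kv.second); // roughly 500 each
    }
}

Using a constant seed trades statistical independence for repeatability: the same query over the same collection always samples the same documents, so facet counts don't jump around between requests.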

View File

@@ -408,7 +408,9 @@ public:
const size_t facet_query_num_typos = 2,
const size_t filter_curated_hits_option = 2,
const bool prioritize_token_position = false,
const std::string& vector_query_str = "") const;
const std::string& vector_query_str = "",
const size_t facet_sample_percent = 100,
const size_t facet_sample_threshold = 0) const;
Option<bool> get_filter_ids(const std::string & simple_filter_query,
std::vector<std::pair<size_t, uint32_t*>>& index_ids);

View File

@@ -682,6 +682,8 @@ struct facet {
bool is_range_query;
bool sampled = false;
bool get_range(int64_t key, std::pair<int64_t, std::string>& range_pair)
{
if(facet_range_map.empty())

View File

@@ -132,6 +132,8 @@ struct search_args {
std::vector<std::vector<KV*>> override_result_kvs;
vector_query_t& vector_query;
size_t facet_sample_percent;
size_t facet_sample_threshold;
search_args(std::vector<query_tokens_t> field_query_tokens, std::vector<search_field_t> search_fields,
filter_node_t* filter_tree_root, std::vector<facet>& facets,
@@ -145,7 +147,8 @@ struct search_args {
size_t concurrency, size_t search_cutoff_ms,
size_t min_len_1typo, size_t min_len_2typo, size_t max_candidates, const std::vector<enable_t>& infixes,
const size_t max_extra_prefix, const size_t max_extra_suffix, const size_t facet_query_num_typos,
const bool filter_curated_hits, const enable_t split_join_tokens, vector_query_t& vector_query) :
const bool filter_curated_hits, const enable_t split_join_tokens, vector_query_t& vector_query,
size_t facet_sample_percent, size_t facet_sample_threshold) :
field_query_tokens(field_query_tokens),
search_fields(search_fields), filter_tree_root(filter_tree_root), facets(facets),
included_ids(included_ids), excluded_ids(excluded_ids), sort_fields_std(sort_fields_std),
@@ -159,7 +162,8 @@
min_len_1typo(min_len_1typo), min_len_2typo(min_len_2typo), max_candidates(max_candidates),
infixes(infixes), max_extra_prefix(max_extra_prefix), max_extra_suffix(max_extra_suffix),
facet_query_num_typos(facet_query_num_typos), filter_curated_hits(filter_curated_hits),
split_join_tokens(split_join_tokens), vector_query(vector_query) {
split_join_tokens(split_join_tokens), vector_query(vector_query),
facet_sample_percent(facet_sample_percent), facet_sample_threshold(facet_sample_threshold) {
const size_t topster_size = std::max((size_t)1, max_hits); // needs to be at least 1 since scoring is mandatory
topster = new Topster(topster_size, group_limit);
@@ -357,6 +361,7 @@ private:
void log_leaves(int cost, const std::string &token, const std::vector<art_leaf *> &leaves) const;
void do_facets(std::vector<facet> & facets, facet_query_t & facet_query,
bool estimate_facets, size_t facet_sample_percent,
const std::vector<facet_info_t>& facet_infos,
size_t group_limit, const std::vector<std::string>& group_by_fields,
const uint32_t* result_ids, size_t results_size) const;
@@ -645,7 +650,7 @@ public:
size_t max_candidates, const std::vector<enable_t>& infixes, const size_t max_extra_prefix,
const size_t max_extra_suffix, const size_t facet_query_num_typos,
const bool filter_curated_hits, enable_t split_join_tokens,
const vector_query_t& vector_query) const;
const vector_query_t& vector_query, size_t facet_sample_percent, size_t facet_sample_threshold) const;
void remove_field(uint32_t seq_id, const nlohmann::json& document, const std::string& field_name);

View File

@@ -866,7 +866,9 @@ Option<nlohmann::json> Collection::search(const std::string & raw_query,
const size_t facet_query_num_typos,
const size_t filter_curated_hits_option,
const bool prioritize_token_position,
const std::string& vector_query_str) const {
const std::string& vector_query_str,
const size_t facet_sample_percent,
const size_t facet_sample_threshold) const {
std::shared_lock lock(mutex);
@@ -911,6 +913,10 @@ Option<nlohmann::json> Collection::search(const std::string & raw_query,
}
}
if(facet_sample_percent > 100) {
return Option<nlohmann::json>(400, "Value of `facet_sample_percent` must be less than or equal to 100.");
}
if(raw_group_by_fields.empty()) {
group_limit = 0;
}
@@ -1302,7 +1308,8 @@ Option<nlohmann::json> Collection::search(const std::string & raw_query,
search_stop_millis,
min_len_1typo, min_len_2typo, max_candidates, infixes,
max_extra_prefix, max_extra_suffix, facet_query_num_typos,
filter_curated_hits, split_join_tokens, vector_query);
filter_curated_hits, split_join_tokens, vector_query,
facet_sample_percent, facet_sample_threshold);
index->run_search(search_params);
@@ -1319,12 +1326,6 @@ Option<nlohmann::json> Collection::search(const std::string & raw_query,
// for grouping we have to aggregate group set sizes to a count value
if(group_limit) {
for(auto& acc_facet: facets) {
for(auto& facet_kv: acc_facet.result_map) {
facet_kv.second.count = acc_facet.hash_groups[facet_kv.first].size();
}
}
total_found = search_params->groups_processed.size() + override_result_kvs.size();
} else {
total_found = search_params->all_result_ids_len;
@@ -1430,8 +1431,6 @@ Option<nlohmann::json> Collection::search(const std::string & raw_query,
// handle which fields have to be highlighted
std::vector<highlight_field_t> highlight_items;
bool has_atleast_one_fully_highlighted_field = false;
std::vector<std::string> highlight_field_names;
StringUtils::split(highlight_fields, highlight_field_names, ",");
@@ -1442,12 +1441,6 @@ Option<nlohmann::json> Collection::search(const std::string & raw_query,
process_highlight_fields(weighted_search_fields, raw_search_fields, include_fields_full, exclude_fields_full,
highlight_field_names, highlight_full_field_names, infixes, q_tokens,
search_params->qtoken_set, highlight_items);
for(auto& highlight_item: highlight_items) {
if(highlight_item.fully_highlighted) {
has_atleast_one_fully_highlighted_field = true;
}
}
}
nlohmann::json result = nlohmann::json::object();
@@ -1657,6 +1650,7 @@ Option<nlohmann::json> Collection::search(const std::string & raw_query,
for(facet & a_facet: facets) {
nlohmann::json facet_result = nlohmann::json::object();
facet_result["field_name"] = a_facet.field_name;
facet_result["sampled"] = a_facet.sampled;
facet_result["counts"] = nlohmann::json::array();
std::vector<facet_value_t> facet_values;

View File

@@ -695,6 +695,9 @@ Option<bool> CollectionManager::do_search(std::map<std::string, std::string>& re
const char *EXHAUSTIVE_SEARCH = "exhaustive_search";
const char *SPLIT_JOIN_TOKENS = "split_join_tokens";
const char *FACET_SAMPLE_PERCENT = "facet_sample_percent";
const char *FACET_SAMPLE_THRESHOLD = "facet_sample_threshold";
// enrich params with values from embedded params
for(auto& item: embedded_params.items()) {
if(item.key() == "expires_at") {
@@ -720,7 +723,6 @@ Option<bool> CollectionManager::do_search(std::map<std::string, std::string>& re
// end check for mandatory params
const std::string& raw_query = req_params[QUERY];
std::vector<uint32_t> num_typos = {2};
size_t min_len_1typo = 4;
@@ -772,6 +774,9 @@ Option<bool> CollectionManager::do_search(std::map<std::string, std::string>& re
size_t max_extra_prefix = INT16_MAX;
size_t max_extra_suffix = INT16_MAX;
size_t facet_sample_percent = 100;
size_t facet_sample_threshold = 0;
std::unordered_map<std::string, size_t*> unsigned_int_values = {
{MIN_LEN_1TYPO, &min_len_1typo},
{MIN_LEN_2TYPO, &min_len_2typo},
@@ -790,6 +795,8 @@ Option<bool> CollectionManager::do_search(std::map<std::string, std::string>& re
{MAX_CANDIDATES, &max_candidates},
{FACET_QUERY_NUM_TYPOS, &facet_query_num_typos},
{FILTER_CURATED_HITS, &filter_curated_hits_option},
{FACET_SAMPLE_PERCENT, &facet_sample_percent},
{FACET_SAMPLE_THRESHOLD, &facet_sample_threshold},
};
std::unordered_map<std::string, std::string*> str_values = {
@@ -982,7 +989,9 @@ Option<bool> CollectionManager::do_search(std::map<std::string, std::string>& re
facet_query_num_typos,
filter_curated_hits_option,
prioritize_token_position,
vector_query
vector_query,
facet_sample_percent,
facet_sample_threshold
);
uint64_t timeMillis = std::chrono::duration_cast<std::chrono::milliseconds>(

View File

@@ -4,6 +4,7 @@
#include <chrono>
#include <set>
#include <unordered_map>
#include <random>
#include <array_utils.h>
#include <match_score.h>
#include <string_utils.h>
@@ -1228,6 +1229,7 @@ void Index::compute_facet_stats(facet &a_facet, uint64_t raw_value, const std::s
}
void Index::do_facets(std::vector<facet> & facets, facet_query_t & facet_query,
bool estimate_facets, size_t facet_sample_percent,
const std::vector<facet_info_t>& facet_infos,
const size_t group_limit, const std::vector<std::string>& group_by_fields,
const uint32_t* result_ids, size_t results_size) const {
@@ -1247,8 +1249,21 @@ void Index::do_facets(std::vector<facet> & facets, facet_query_t & facet_query,
const auto& field_facet_mapping = field_facet_mapping_it->second;
// used for sampling facets (if enabled)
std::mt19937 gen(137723); // use constant seed to make sure that counts don't jump around
std::uniform_int_distribution<> distr(1, 100); // 1 to 100 inclusive
for(size_t i = 0; i < results_size; i++) {
uint32_t doc_seq_id = result_ids[i];
// if sampling is enabled, we will skip a portion of the results to speed things up
if(estimate_facets) {
size_t num = distr(gen);
if(num > facet_sample_percent) {
continue;
}
}
const auto& facet_hashes_it = field_facet_mapping[doc_seq_id % ARRAY_FACET_DIM]->find(doc_seq_id);
if(facet_hashes_it == field_facet_mapping[doc_seq_id % ARRAY_FACET_DIM]->end()) {
@@ -1265,7 +1280,7 @@ void Index::do_facets(std::vector<facet> & facets, facet_query_t & facet_query,
compute_facet_stats(a_facet, fhash, facet_field.type);
}
if(a_facet.is_range_query){
if(a_facet.is_range_query) {
auto sort_index_it = sort_index.find(a_facet.field_name);
if(sort_index_it != sort_index.end()){
@@ -1285,8 +1300,7 @@ void Index::do_facets(std::vector<facet> & facets, facet_query_t & facet_query,
}
}
}
}
else if(!use_facet_query || fquery_hashes.find(fhash) != fquery_hashes.end()) {
} else if(!use_facet_query || fquery_hashes.find(fhash) != fquery_hashes.end()) {
facet_count_t& facet_count = a_facet.result_map[fhash];
//LOG(INFO) << "field: " << a_facet.field_name << ", doc id: " << doc_seq_id << ", hash: " << fhash;
@@ -1980,7 +1994,9 @@ void Index::run_search(search_args* search_params) {
search_params->facet_query_num_typos,
search_params->filter_curated_hits,
search_params->split_join_tokens,
search_params->vector_query);
search_params->vector_query,
search_params->facet_sample_percent,
search_params->facet_sample_threshold);
}
void Index::collate_included_ids(const std::vector<token_t>& q_included_tokens,
@@ -2430,7 +2446,8 @@ void Index::search(std::vector<query_tokens_t>& field_query_tokens, const std::v
size_t max_candidates, const std::vector<enable_t>& infixes, const size_t max_extra_prefix,
const size_t max_extra_suffix, const size_t facet_query_num_typos,
const bool filter_curated_hits, const enable_t split_join_tokens,
const vector_query_t& vector_query) const {
const vector_query_t& vector_query,
size_t facet_sample_percent, size_t facet_sample_threshold) const {
// process the filters
@@ -2784,6 +2801,8 @@ void Index::search(std::vector<query_tokens_t>& field_query_tokens, const std::v
delete [] exclude_token_ids;
delete [] excluded_result_ids;
bool estimate_facets = (facet_sample_percent < 100 && all_result_ids_len > facet_sample_threshold);
if(!facets.empty()) {
const size_t num_threads = std::min(concurrency, all_result_ids_len);
const size_t window_size = (num_threads == 0) ? 0 :
@@ -2820,9 +2839,11 @@ void Index::search(std::vector<query_tokens_t>& field_query_tokens, const std::v
thread_pool->enqueue([this, thread_id, &facet_batches, &facet_query, group_limit, group_by_fields,
batch_result_ids, batch_res_len, &facet_infos,
estimate_facets, facet_sample_percent,
&num_processed, &m_process, &cv_process]() {
auto fq = facet_query;
do_facets(facet_batches[thread_id], fq, facet_infos, group_limit, group_by_fields,
do_facets(facet_batches[thread_id], fq, estimate_facets, facet_sample_percent,
facet_infos, group_limit, group_by_fields,
batch_result_ids, batch_res_len);
std::unique_lock<std::mutex> lock(m_process);
num_processed++;
@@ -2844,8 +2865,8 @@ void Index::search(std::vector<query_tokens_t>& field_query_tokens, const std::v
if(group_limit) {
// we have to add all group sets
acc_facet.hash_groups[facet_kv.first].insert(
this_facet.hash_groups[facet_kv.first].begin(),
this_facet.hash_groups[facet_kv.first].end()
this_facet.hash_groups[facet_kv.first].begin(),
this_facet.hash_groups[facet_kv.first].end()
);
} else {
size_t count = 0;
@@ -2872,6 +2893,22 @@ void Index::search(std::vector<query_tokens_t>& field_query_tokens, const std::v
}
}
for(auto & acc_facet: facets) {
for(auto& facet_kv: acc_facet.result_map) {
if(group_limit) {
facet_kv.second.count = acc_facet.hash_groups[facet_kv.first].size();
}
if(estimate_facets) {
facet_kv.second.count = size_t(double(facet_kv.second.count) * (100.0f / facet_sample_percent));
}
}
if(estimate_facets) {
acc_facet.sampled = true;
}
}
/*long long int timeMillisF = std::chrono::duration_cast<std::chrono::milliseconds>(
std::chrono::high_resolution_clock::now() - beginF).count();
LOG(INFO) << "Time for faceting: " << timeMillisF;*/
@@ -2880,7 +2917,8 @@ void Index::search(std::vector<query_tokens_t>& field_query_tokens, const std::v
std::vector<facet_info_t> facet_infos(facets.size());
compute_facet_infos(facets, facet_query, facet_query_num_typos,
&included_ids_vec[0], included_ids_vec.size(), group_by_fields, max_candidates, facet_infos);
do_facets(facets, facet_query, facet_infos, group_limit, group_by_fields, &included_ids_vec[0], included_ids_vec.size());
do_facets(facets, facet_query, estimate_facets, facet_sample_percent,
facet_infos, group_limit, group_by_fields, &included_ids_vec[0], included_ids_vec.size());
all_result_ids_len += curated_topster->size;
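In the extrapolation loop above, each sampled count is multiplied by `100 / facet_sample_percent` before being returned; because the product is truncated back to `size_t`, estimates can land slightly below the true scaled value. A tiny illustrative check of the arithmetic (assumed numbers, not from the commit):

#include <cstddef>
#include <cstdio>

int main() {
    // Assumed example: a facet value observed 47 times in a 10% sample
    // extrapolates to an estimated count of 470 over the full result set.
    size_t observed = 47, percent = 10;
    size_t estimated = size_t(double(observed) * (100.0 / percent));
    printf("estimated count: %zu\n", estimated); // prints 470
}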

View File

@@ -73,8 +73,9 @@ TEST_F(CollectionFacetingTest, FacetCounts) {
ASSERT_EQ(5, results["hits"].size());
ASSERT_EQ(1, results["facet_counts"].size());
ASSERT_EQ(3, results["facet_counts"][0].size());
ASSERT_EQ(4, results["facet_counts"][0].size());
ASSERT_EQ("tags", results["facet_counts"][0]["field_name"]);
ASSERT_EQ(false, results["facet_counts"][0]["sampled"].get<bool>());
ASSERT_EQ(4, results["facet_counts"][0]["counts"].size());
ASSERT_EQ(1, results["facet_counts"][0]["stats"].size());
ASSERT_EQ(4, results["facet_counts"][0]["stats"]["total_values"].get<size_t>());
@@ -981,7 +982,6 @@ TEST_F(CollectionFacetingTest, FacetByNestedIntField) {
}
TEST_F(CollectionFacetingTest, FacetParseTest){
std::vector<field> fields = {
field("score", field_types::INT32, true),
field("grade", field_types::INT32, true),
@@ -1008,8 +1008,6 @@ TEST_F(CollectionFacetingTest, FacetParseTest){
ASSERT_TRUE(range_facets[1].is_range_query);
ASSERT_GT(range_facets[1].facet_range_map.size(), 0);
std::vector<std::string> normal_facet_fields {
"score",
"grade"
@@ -1022,9 +1020,7 @@ TEST_F(CollectionFacetingTest, FacetParseTest){
ASSERT_STREQ("score", normal_facets[0].field_name.c_str());
ASSERT_STREQ("grade", normal_facets[1].field_name.c_str());
std::vector<std::string> mixed_facet_fields {
"score",
"grade(A:[80, 100], B:[60, 80], C:[40, 60])",
@@ -1044,3 +1040,304 @@ TEST_F(CollectionFacetingTest, FacetParseTest){
ASSERT_STREQ("rank", mixed_facets[2].field_name.c_str());
}
TEST_F(CollectionFacetingTest, RangeFacetTest) {
std::vector<field> fields = {field("place", field_types::STRING, false),
field("state", field_types::STRING, false),
field("visitors", field_types::INT32, true),};
Collection* coll1 = collectionManager.create_collection(
"coll1", 1, fields, "", 0, "", {}, {}
).get();
nlohmann::json doc1;
doc1["id"] = "0";
doc1["place"] = "Mysore Palace";
doc1["state"] = "Karnataka";
doc1["visitors"] = 235486;
nlohmann::json doc2;
doc2["id"] = "1";
doc2["place"] = "Hampi";
doc2["state"] = "Karnataka";
doc2["visitors"] = 187654;
nlohmann::json doc3;
doc3["id"] = "2";
doc3["place"] = "Mahabalipuram";
doc3["state"] = "TamilNadu";
doc3["visitors"] = 174684;
nlohmann::json doc4;
doc4["id"] = "3";
doc4["place"] = "Meenakshi Amman Temple";
doc4["state"] = "TamilNadu";
doc4["visitors"] = 246676;
nlohmann::json doc5;
doc5["id"] = "4";
doc5["place"] = "Staue of Unity";
doc5["state"] = "Gujarat";
doc5["visitors"] = 345878;
ASSERT_TRUE(coll1->add(doc1.dump()).ok());
ASSERT_TRUE(coll1->add(doc2.dump()).ok());
ASSERT_TRUE(coll1->add(doc3.dump()).ok());
ASSERT_TRUE(coll1->add(doc4.dump()).ok());
ASSERT_TRUE(coll1->add(doc5.dump()).ok());
auto results = coll1->search("Karnataka", {"state"},
"", {"visitors(Busy:[0, 200000], VeryBusy:[200000, 500000])"},
{}, {2}, 10,
1, FREQUENCY, {true},
10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 10, {}, {}, {}, 0,
"<mark>", "</mark>", {}, 1000,
true, false, true, "", true).get();
ASSERT_EQ(2, results["facet_counts"][0]["counts"].size());
ASSERT_EQ(1, (int) results["facet_counts"][0]["counts"][0]["count"]);
ASSERT_STREQ("Busy", results["facet_counts"][0]["counts"][0]["value"].get<std::string>().c_str());
auto results2 = coll1->search("Gujarat", {"state"},
"", {"visitors(Busy:[0, 200000], VeryBusy:[200000, 500000])"},
{}, {2}, 10,
1, FREQUENCY, {true},
10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 10, {}, {}, {}, 0,
"<mark>", "</mark>", {}, 1000,
true, false, true, "", true).get();
ASSERT_EQ(1, results2["facet_counts"][0]["counts"].size());
ASSERT_EQ(1, results2["facet_counts"][0]["counts"][0]["count"].get<std::size_t>());
ASSERT_STREQ("VeryBusy", results2["facet_counts"][0]["counts"][0]["value"].get<std::string>().c_str());
ASSERT_TRUE(results2["facet_counts"][0]["counts"][1]["value"] == nullptr);
collectionManager.drop_collection("coll1");
}
TEST_F(CollectionFacetingTest, RangeFacetContinuity) {
std::vector<field> fields = {field("place", field_types::STRING, false),
field("state", field_types::STRING, false),
field("visitors", field_types::INT32, true),};
Collection* coll1 = collectionManager.create_collection(
"coll1", 1, fields, "", 0, "", {}, {}
).get();
nlohmann::json doc1;
doc1["id"] = "0";
doc1["place"] = "Mysore Palace";
doc1["state"] = "Karnataka";
doc1["visitors"] = 235486;
nlohmann::json doc2;
doc2["id"] = "1";
doc2["place"] = "Hampi";
doc2["state"] = "Karnataka";
doc2["visitors"] = 187654;
nlohmann::json doc3;
doc3["id"] = "2";
doc3["place"] = "Mahabalipuram";
doc3["state"] = "TamilNadu";
doc3["visitors"] = 174684;
nlohmann::json doc4;
doc4["id"] = "3";
doc4["place"] = "Meenakshi Amman Temple";
doc4["state"] = "TamilNadu";
doc4["visitors"] = 246676;
nlohmann::json doc5;
doc5["id"] = "4";
doc5["place"] = "Staue of Unity";
doc5["state"] = "Gujarat";
doc5["visitors"] = 345878;
ASSERT_TRUE(coll1->add(doc1.dump()).ok());
ASSERT_TRUE(coll1->add(doc2.dump()).ok());
ASSERT_TRUE(coll1->add(doc3.dump()).ok());
ASSERT_TRUE(coll1->add(doc4.dump()).ok());
ASSERT_TRUE(coll1->add(doc5.dump()).ok());
auto results = coll1->search("TamilNadu", {"state"},
"", {"visitors(Busy:[0, 200000], VeryBusy:[200001, 500000])"},
{}, {2}, 10,
1, FREQUENCY, {true},
10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 10, {}, {}, {}, 0,
"<mark>", "</mark>", {}, 1000,
true, false, true, "", true);
ASSERT_STREQ("Ranges in range facet syntax should be continous.", results.error().c_str());
auto results2 = coll1->search("TamilNadu", {"state"},
"", {"visitors(Busy:[0, 200000], VeryBusy:[199999, 500000])"},
{}, {2}, 10,
1, FREQUENCY, {true},
10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 10, {}, {}, {}, 0,
"<mark>", "</mark>", {}, 1000,
true, false, true, "", true);
ASSERT_STREQ("Ranges in range facet syntax should be continous.", results2.error().c_str());
collectionManager.drop_collection("coll1");
}
TEST_F(CollectionFacetingTest, RangeFacetTypo) {
std::vector<field> fields = {field("place", field_types::STRING, false),
field("state", field_types::STRING, false),
field("visitors", field_types::INT32, true),};
Collection* coll1 = collectionManager.create_collection(
"coll1", 1, fields, "", 0, "", {}, {}
).get();
nlohmann::json doc1;
doc1["id"] = "0";
doc1["place"] = "Mysore Palace";
doc1["state"] = "Karnataka";
doc1["visitors"] = 235486;
nlohmann::json doc2;
doc2["id"] = "1";
doc2["place"] = "Hampi";
doc2["state"] = "Karnataka";
doc2["visitors"] = 187654;
nlohmann::json doc3;
doc3["id"] = "2";
doc3["place"] = "Mahabalipuram";
doc3["state"] = "TamilNadu";
doc3["visitors"] = 174684;
nlohmann::json doc4;
doc4["id"] = "3";
doc4["place"] = "Meenakshi Amman Temple";
doc4["state"] = "TamilNadu";
doc4["visitors"] = 246676;
nlohmann::json doc5;
doc5["id"] = "4";
doc5["place"] = "Staue of Unity";
doc5["state"] = "Gujarat";
doc5["visitors"] = 345878;
ASSERT_TRUE(coll1->add(doc1.dump()).ok());
ASSERT_TRUE(coll1->add(doc2.dump()).ok());
ASSERT_TRUE(coll1->add(doc3.dump()).ok());
ASSERT_TRUE(coll1->add(doc4.dump()).ok());
ASSERT_TRUE(coll1->add(doc5.dump()).ok());
auto results = coll1->search("TamilNadu", {"state"},
"", {"visitors(Busy:[0, 200000], VeryBusy:[200000, 500000)"}, //missing ']' at end
{}, {2}, 10,
1, FREQUENCY, {true},
10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 10, {}, {}, {}, 0,
"<mark>", "</mark>", {}, 1000,
true, false, true, "", true);
ASSERT_STREQ("Error splitting the range string.", results.error().c_str());
auto results2 = coll1->search("TamilNadu", {"state"},
"", {"visitors(Busy:[0, 200000], VeryBusy:200000, 500000])"}, //missing '[' in second range
{}, {2}, 10,
1, FREQUENCY, {true},
10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 10, {}, {}, {}, 0,
"<mark>", "</mark>", {}, 1000,
true, false, true, "", true);
ASSERT_STREQ("Error splitting the range string.", results2.error().c_str());
auto results3 = coll1->search("TamilNadu", {"state"},
"", {"visitors(Busy:[0, 200000] VeryBusy:[200000, 500000])"}, //missing ',' between ranges
{}, {2}, 10,
1, FREQUENCY, {true},
10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 10, {}, {}, {}, 0,
"<mark>", "</mark>", {}, 1000,
true, false, true, "", true);
ASSERT_STREQ("Error splitting the range string.", results3.error().c_str());
auto results4 = coll1->search("TamilNadu", {"state"},
"", {"visitors(Busy:[0 200000], VeryBusy:[200000, 500000])"}, //missing ',' between first ranges values
{}, {2}, 10,
1, FREQUENCY, {true},
10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 10, {}, {}, {}, 0,
"<mark>", "</mark>", {}, 1000,
true, false, true, "", true);
ASSERT_STREQ("Range String range pattern not matched.", results4.error().c_str());
auto results5 = coll1->search("TamilNadu", {"state"},
"", {"visitors(Busy:[0, 200000 VeryBusy:200000, 500000])"}, //missing '],' and '['
{}, {2}, 10,
1, FREQUENCY, {true},
10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 10, {}, {}, {}, 0,
"<mark>", "</mark>", {}, 1000,
true, false, true, "", true);
ASSERT_STREQ("Range String range pattern not matched.", results5.error().c_str());
collectionManager.drop_collection("coll1");
}
TEST_F(CollectionFacetingTest, SampleFacetCounts) {
nlohmann::json schema = R"({
"name": "coll1",
"fields": [
{"name": "color", "type": "string", "facet": true}
]
})"_json;
Collection* coll1 = collectionManager.create_collection(schema).get();
for(size_t i = 0; i < 1000; i++) {
nlohmann::json doc;
if(i % 2 == 0) {
doc["color"] = "blue";
} else {
doc["color"] = "red";
}
ASSERT_TRUE(coll1->add(doc.dump()).ok());
}
auto res = coll1->search("*", {}, "", {"color"}, {}, {0}, 3, 1, FREQUENCY, {true}, 5,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 20, {}, {}, {}, 0,
"<mark>", "</mark>", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7, fallback,
4, {off}, 3, 3, 2, 2, false, "", 10, 0).get();
ASSERT_EQ(1000, res["found"].get<size_t>());
ASSERT_EQ(1, res["facet_counts"].size());
ASSERT_EQ(2, res["facet_counts"][0]["counts"].size());
// verify approximate counts
ASSERT_GE(res["facet_counts"][0]["counts"][0]["count"].get<size_t>(), 250);
ASSERT_GE(res["facet_counts"][0]["counts"][1]["count"].get<size_t>(), 250);
ASSERT_TRUE(res["facet_counts"][0]["sampled"].get<bool>());
// when sample threshold is high, don't estimate
res = coll1->search("*", {}, "", {"color"}, {}, {0}, 3, 1, FREQUENCY, {true}, 5,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 20, {}, {}, {}, 0,
"<mark>", "</mark>", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7, fallback,
4, {off}, 3, 3, 2, 2, false, "", 10, 10000).get();
ASSERT_EQ(1000, res["found"].get<size_t>());
ASSERT_EQ(1, res["facet_counts"].size());
ASSERT_EQ(2, res["facet_counts"][0]["counts"].size());
// verify exact counts (no sampling)
ASSERT_EQ(500, res["facet_counts"][0]["counts"][0]["count"].get<size_t>());
ASSERT_EQ(500, res["facet_counts"][0]["counts"][1]["count"].get<size_t>());
ASSERT_FALSE(res["facet_counts"][0]["sampled"].get<bool>());
// test for sample percent > 100
auto res_op = coll1->search("*", {}, "", {"color"}, {}, {0}, 3, 1, FREQUENCY, {true}, 5,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 20, {}, {}, {}, 0,
"<mark>", "</mark>", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7, fallback,
4, {off}, 3, 3, 2, 2, false, "", 200, 0);
ASSERT_FALSE(res_op.ok());
ASSERT_EQ("Value of `facet_sample_percent` must be less than 100.", res_op.error());
}

View File

@@ -583,7 +583,7 @@ TEST_F(CollectionSchemaChangeTest, AbilityToDropAndReAddIndexAtTheSameTime) {
ASSERT_EQ(1, res["found"].get<size_t>());
ASSERT_EQ("0", res["hits"][0]["document"]["id"].get<std::string>());
ASSERT_EQ(1, res["facet_counts"].size());
ASSERT_EQ(3, res["facet_counts"][0].size());
ASSERT_EQ(4, res["facet_counts"][0].size());
ASSERT_EQ("title", res["facet_counts"][0]["field_name"]);
ASSERT_EQ(1, res["facet_counts"][0]["counts"].size());
ASSERT_EQ("123", res["facet_counts"][0]["counts"][0]["value"].get<std::string>());

View File

@@ -2918,239 +2918,3 @@ TEST_F(CollectionSpecificTest, DontHighlightPunctuation) {
collectionManager.drop_collection("coll1");
}
TEST_F(CollectionSpecificTest, RangeFacetTest) {
std::vector<field> fields = {field("place", field_types::STRING, false),
field("state", field_types::STRING, false),
field("visitors", field_types::INT32, true),};
Collection* coll1 = collectionManager.create_collection(
"coll1", 1, fields, "", 0, "", {}, {}
).get();
nlohmann::json doc1;
doc1["id"] = "0";
doc1["place"] = "Mysore Palace";
doc1["state"] = "Karnataka";
doc1["visitors"] = 235486;
nlohmann::json doc2;
doc2["id"] = "1";
doc2["place"] = "Hampi";
doc2["state"] = "Karnataka";
doc2["visitors"] = 187654;
nlohmann::json doc3;
doc3["id"] = "2";
doc3["place"] = "Mahabalipuram";
doc3["state"] = "TamilNadu";
doc3["visitors"] = 174684;
nlohmann::json doc4;
doc4["id"] = "3";
doc4["place"] = "Meenakshi Amman Temple";
doc4["state"] = "TamilNadu";
doc4["visitors"] = 246676;
nlohmann::json doc5;
doc5["id"] = "4";
doc5["place"] = "Staue of Unity";
doc5["state"] = "Gujarat";
doc5["visitors"] = 345878;
ASSERT_TRUE(coll1->add(doc1.dump()).ok());
ASSERT_TRUE(coll1->add(doc2.dump()).ok());
ASSERT_TRUE(coll1->add(doc3.dump()).ok());
ASSERT_TRUE(coll1->add(doc4.dump()).ok());
ASSERT_TRUE(coll1->add(doc5.dump()).ok());
auto results = coll1->search("Karnataka", {"state"},
"", {"visitors(Busy:[0, 200000], VeryBusy:[200000, 500000])"},
{}, {2}, 10,
1, FREQUENCY, {true},
10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 10, {}, {}, {}, 0,
"<mark>", "</mark>", {}, 1000,
true, false, true, "", true).get();
ASSERT_EQ(2, results["facet_counts"][0]["counts"].size());
ASSERT_EQ(1, (int) results["facet_counts"][0]["counts"][0]["count"]);
ASSERT_STREQ("Busy", results["facet_counts"][0]["counts"][0]["value"].get<std::string>().c_str());
auto results2 = coll1->search("Gujarat", {"state"},
"", {"visitors(Busy:[0, 200000], VeryBusy:[200000, 500000])"},
{}, {2}, 10,
1, FREQUENCY, {true},
10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 10, {}, {}, {}, 0,
"<mark>", "</mark>", {}, 1000,
true, false, true, "", true).get();
ASSERT_EQ(1, results2["facet_counts"][0]["counts"].size());
ASSERT_EQ(1, results2["facet_counts"][0]["counts"][0]["count"].get<std::size_t>());
ASSERT_STREQ("VeryBusy", results2["facet_counts"][0]["counts"][0]["value"].get<std::string>().c_str());
ASSERT_TRUE(results2["facet_counts"][0]["counts"][1]["value"] == nullptr);
collectionManager.drop_collection("coll1");
}
TEST_F(CollectionSpecificTest, RangeFacetContinuity) {
std::vector<field> fields = {field("place", field_types::STRING, false),
field("state", field_types::STRING, false),
field("visitors", field_types::INT32, true),};
Collection* coll1 = collectionManager.create_collection(
"coll1", 1, fields, "", 0, "", {}, {}
).get();
nlohmann::json doc1;
doc1["id"] = "0";
doc1["place"] = "Mysore Palace";
doc1["state"] = "Karnataka";
doc1["visitors"] = 235486;
nlohmann::json doc2;
doc2["id"] = "1";
doc2["place"] = "Hampi";
doc2["state"] = "Karnataka";
doc2["visitors"] = 187654;
nlohmann::json doc3;
doc3["id"] = "2";
doc3["place"] = "Mahabalipuram";
doc3["state"] = "TamilNadu";
doc3["visitors"] = 174684;
nlohmann::json doc4;
doc4["id"] = "3";
doc4["place"] = "Meenakshi Amman Temple";
doc4["state"] = "TamilNadu";
doc4["visitors"] = 246676;
nlohmann::json doc5;
doc5["id"] = "4";
doc5["place"] = "Staue of Unity";
doc5["state"] = "Gujarat";
doc5["visitors"] = 345878;
ASSERT_TRUE(coll1->add(doc1.dump()).ok());
ASSERT_TRUE(coll1->add(doc2.dump()).ok());
ASSERT_TRUE(coll1->add(doc3.dump()).ok());
ASSERT_TRUE(coll1->add(doc4.dump()).ok());
ASSERT_TRUE(coll1->add(doc5.dump()).ok());
auto results = coll1->search("TamilNadu", {"state"},
"", {"visitors(Busy:[0, 200000], VeryBusy:[200001, 500000])"},
{}, {2}, 10,
1, FREQUENCY, {true},
10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 10, {}, {}, {}, 0,
"<mark>", "</mark>", {}, 1000,
true, false, true, "", true);
ASSERT_STREQ("Ranges in range facet syntax should be continous.", results.error().c_str());
auto results2 = coll1->search("TamilNadu", {"state"},
"", {"visitors(Busy:[0, 200000], VeryBusy:[199999, 500000])"},
{}, {2}, 10,
1, FREQUENCY, {true},
10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 10, {}, {}, {}, 0,
"<mark>", "</mark>", {}, 1000,
true, false, true, "", true);
ASSERT_STREQ("Ranges in range facet syntax should be continous.", results2.error().c_str());
collectionManager.drop_collection("coll1");
}
TEST_F(CollectionSpecificTest, RangeFacetTypo) {
std::vector<field> fields = {field("place", field_types::STRING, false),
field("state", field_types::STRING, false),
field("visitors", field_types::INT32, true),};
Collection* coll1 = collectionManager.create_collection(
"coll1", 1, fields, "", 0, "", {}, {}
).get();
nlohmann::json doc1;
doc1["id"] = "0";
doc1["place"] = "Mysore Palace";
doc1["state"] = "Karnataka";
doc1["visitors"] = 235486;
nlohmann::json doc2;
doc2["id"] = "1";
doc2["place"] = "Hampi";
doc2["state"] = "Karnataka";
doc2["visitors"] = 187654;
nlohmann::json doc3;
doc3["id"] = "2";
doc3["place"] = "Mahabalipuram";
doc3["state"] = "TamilNadu";
doc3["visitors"] = 174684;
nlohmann::json doc4;
doc4["id"] = "3";
doc4["place"] = "Meenakshi Amman Temple";
doc4["state"] = "TamilNadu";
doc4["visitors"] = 246676;
nlohmann::json doc5;
doc5["id"] = "4";
doc5["place"] = "Staue of Unity";
doc5["state"] = "Gujarat";
doc5["visitors"] = 345878;
ASSERT_TRUE(coll1->add(doc1.dump()).ok());
ASSERT_TRUE(coll1->add(doc2.dump()).ok());
ASSERT_TRUE(coll1->add(doc3.dump()).ok());
ASSERT_TRUE(coll1->add(doc4.dump()).ok());
ASSERT_TRUE(coll1->add(doc5.dump()).ok());
auto results = coll1->search("TamilNadu", {"state"},
"", {"visitors(Busy:[0, 200000], VeryBusy:[200000, 500000)"}, //missing ']' at end
{}, {2}, 10,
1, FREQUENCY, {true},
10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 10, {}, {}, {}, 0,
"<mark>", "</mark>", {}, 1000,
true, false, true, "", true);
ASSERT_STREQ("Error splitting the range string.", results.error().c_str());
auto results2 = coll1->search("TamilNadu", {"state"},
"", {"visitors(Busy:[0, 200000], VeryBusy:200000, 500000])"}, //missing '[' in second range
{}, {2}, 10,
1, FREQUENCY, {true},
10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 10, {}, {}, {}, 0,
"<mark>", "</mark>", {}, 1000,
true, false, true, "", true);
ASSERT_STREQ("Error splitting the range string.", results2.error().c_str());
auto results3 = coll1->search("TamilNadu", {"state"},
"", {"visitors(Busy:[0, 200000] VeryBusy:[200000, 500000])"}, //missing ',' between ranges
{}, {2}, 10,
1, FREQUENCY, {true},
10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 10, {}, {}, {}, 0,
"<mark>", "</mark>", {}, 1000,
true, false, true, "", true);
ASSERT_STREQ("Error splitting the range string.", results3.error().c_str());
auto results4 = coll1->search("TamilNadu", {"state"},
"", {"visitors(Busy:[0 200000], VeryBusy:[200000, 500000])"}, //missing ',' between first ranges values
{}, {2}, 10,
1, FREQUENCY, {true},
10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 10, {}, {}, {}, 0,
"<mark>", "</mark>", {}, 1000,
true, false, true, "", true);
ASSERT_STREQ("Range String range pattern not matched.", results4.error().c_str());
auto results5 = coll1->search("TamilNadu", {"state"},
"", {"visitors(Busy:[0, 200000 VeryBusy:200000, 500000])"}, //missing '],' and '['
{}, {2}, 10,
1, FREQUENCY, {true},
10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 10, {}, {}, {}, 0,
"<mark>", "</mark>", {}, 1000,
true, false, true, "", true);
ASSERT_STREQ("Range String range pattern not matched.", results5.error().c_str());
collectionManager.drop_collection("coll1");
}