facet index refactor updated changes

2025-05-20 05:32:30 +08:00 · 2023-03-31 17:38:33 +05:30 · 2023-03-31 17:38:33 +05:30 · cd69111a5b
commit cd69111a5b
parent 8b4e95e421
8 changed files with 331 additions and 294 deletions
--- a/include/facet_index.h
+++ b/include/facet_index.h
@ -3,8 +3,6 @@
 #include "ids_t.h"
 #include "tsl/htrie_map.h"
 #include <list>
-#include <set>
-#include <mutex>

 class facet_index_t {
 private:
@ -21,17 +19,27 @@ private:
    struct facet_index_counter {
        tsl::htrie_map<char, void*> facet_index_map;
        std::list<count_list> counter_list;
+        // Passes every value stored in facet_index_map to ids_t::destroy_list when the counter is destroyed.
+        ~facet_index_counter() {
+            for(auto it = facet_index_map.begin(); it != facet_index_map.end(); ++it) {
+                ids_t::destroy_list(it.value());
+            }
+            // NOTE(review): copy operations are still compiler-generated, so copying a populated counter would hand the same pointers to destroy_list twice -- confirm no caller copies one.
+            facet_index_map.clear();
+            // The explicit clear() calls are not needed for correctness: both members are destroyed right after this body.
+            counter_list.clear();
+        }
    };

    std::map<std::string, facet_index_counter> facet_field_map;
 public:

+    facet_index_t() = default;
+
    ~facet_index_t();

    void insert(const std::string& field, const std::string& value, uint32_t id);

-    size_t get(const std::string& field, std::map<std::string,std::vector<uint32_t>>& result_ids);
-
    void erase(const std::string& field);

    bool contains(const std::string& field);
@ -40,4 +48,7 @@ public:

    int intersect(const std::string& val, const uint32_t* result_ids, int result_id_len, 
        int max_facet_count, std::map<std::string, uint32_t>& found);
+    
+    int get_facet(const std::string& field, const std::vector<std::string>& searched_tokens,
+        std::vector<std::string>& facets);
 };
--- a/include/field.h
+++ b/include/field.h
@ -635,26 +635,24 @@ struct facet_stats_t {

 struct facet {
    const std::string field_name;
-    spp::sparse_hash_map<uint64_t, facet_count_t> result_map;
-
+    spp::sparse_hash_map<std::string, facet_count_t> result_map;
    // used for facet value query
-    spp::sparse_hash_map<uint64_t, std::vector<std::string>> hash_tokens;
+    //spp::sparse_hash_map<uint64_t, std::vector<std::string>> hash_tokens;
+    spp::sparse_hash_map<std::string, std::vector<std::string>> facet_tokens;

    // used for faceting grouped results
-    spp::sparse_hash_map<uint64_t, spp::sparse_hash_set<uint64_t>> hash_groups;
+    //spp::sparse_hash_map<uint64_t, spp::sparse_hash_set<uint64_t>> hash_groups;

    facet_stats_t stats;

    //dictionary of key=>pair(range_id, range_val)
-    std::map<int64_t, std::string> facet_range_map;
+    std::map<std::string, std::string> facet_range_map;

    bool is_range_query;

    bool sampled = false;

-    bool is_wildcard_match = false;
-
-    bool get_range(int64_t key, std::pair<int64_t, std::string>& range_pair)
+    bool get_range(std::string key, std::pair<int64_t, std::string>& range_pair)
    {
        if(facet_range_map.empty())
        {
@ -673,7 +671,7 @@ struct facet {
    }

    explicit facet(const std::string& field_name, 
-        std::map<int64_t, std::string> facet_range = {}, bool is_range_q = false)
+        std::map<std::string, std::string> facet_range = {}, bool is_range_q = false)
        :field_name(field_name){
            facet_range_map = facet_range;
            is_range_query = is_range_q;
@ -684,7 +682,7 @@ struct facet_info_t {
    // facet hash => resolved tokens
    //std::unordered_map<uint64_t, std::vector<std::string>> hashes;
    //facet name => resolved tokens
-    std::unordered_map<uint32_t, std::vector<std::string>> doc_id_tokens;
+    std::unordered_map<std::string, std::vector<std::string>> facet_tokens;
    bool use_facet_query = false;
    bool should_compute_stats = false;
    field facet_field{"", "", false};
--- a/include/index.h
+++ b/include/index.h
@ -281,8 +281,6 @@ struct hnsw_index_t {
    }
 };

-extern std::map<std::string, std::map<std::string, uint32_t>> facet_results;
-
 class Index {
 private:
    mutable std::shared_mutex mutex;
@ -509,7 +507,7 @@ private:

    static uint64_t facet_token_hash(const field & a_field, const std::string &token);

-    static void compute_facet_stats(facet &a_facet, uint64_t raw_value, const std::string & field_type);
+    static void compute_facet_stats(facet &a_facet, std::string raw_value, const std::string & field_type);

    static void handle_doc_ops(const tsl::htrie_map<char, field>& search_schema,
                               nlohmann::json& update_doc, const nlohmann::json& old_doc, nlohmann::json& new_doc);
--- a/src/collection.cpp
+++ b/src/collection.cpp
@ -22,8 +22,6 @@
 const std::string override_t::MATCH_EXACT = "exact";
 const std::string override_t::MATCH_CONTAINS = "contains";

-std::map<std::string, std::map<std::string, uint32_t>> facet_results;
-
 struct sort_fields_guard_t {
    std::vector<sort_by> sort_fields_std;

@ -1922,15 +1920,22 @@ Option<nlohmann::json> Collection::search(std::string  raw_query,
        facet_result["counts"] = nlohmann::json::array();

        std::vector<facet_value_t> facet_values;
-        std::vector<std::pair<uint64_t, facet_count_t>> facet_hash_counts;
+        // std::vector<std::pair<uint64_t, facet_count_t>> facet_hash_counts;

-        for (const auto & kv : a_facet.result_map) {
-            facet_hash_counts.emplace_back(kv);
-        }
-
-        if(a_facet.is_range_query){
-            for(auto kv : a_facet.result_map){
+        // for (const auto & kv : a_facet.result_map) {
+        //     facet_hash_counts.emplace_back(kv);
+        // }
+        
+        auto the_field = search_schema.at(a_facet.field_name);
+        // keep only top K facets
+        //auto max_facets = std::min(max_facet_values, facet_hash_counts.size());
+        auto max_facets = std::min(max_facet_values, a_facet.result_map.size());
+        // std::nth_element(facet_hash_counts.begin(), facet_hash_counts.begin() + max_facets,
+        //                  facet_hash_counts.end(), Collection::facet_count_compare);
+        //LOG (INFO) << "found facet size " << a_facet.result_map.size();
+        for(auto& kv : a_facet.result_map) {

+            if(a_facet.is_range_query){
                auto facet_range_iter = a_facet.facet_range_map.find(kv.first);
                if(facet_range_iter != a_facet.facet_range_map.end()){
                    auto & facet_count = kv.second;
@ -1940,109 +1945,99 @@ Option<nlohmann::json> Collection::search(std::string  raw_query,
                else{
                    LOG (ERROR) << "range_id not found in result map.";
                }
-            }
-        }
-        
-        auto the_field = search_schema.at(a_facet.field_name);
-        // keep only top K facets
-        //auto max_facets = std::min(max_facet_values, facet_hash_counts.size());
-        auto max_facets = std::min(max_facet_values, facet_results[a_facet.field_name].size());
-        // std::nth_element(facet_hash_counts.begin(), facet_hash_counts.begin() + max_facets,
-        //                  facet_hash_counts.end(), Collection::facet_count_compare);
-        LOG (INFO) << "found_doc_seq_ids size " << facet_results[a_facet.field_name].size();
-        //for(size_t fi = 0; fi < max_facets; fi++) {
-        for(auto& it : facet_results[a_facet.field_name]) {
+            } else {
+                //facet_value_t facet_value = { kv.first, std::string(), kv.second.count};
+                //facet_values.emplace_back(facet_value);
+            

-            if(a_facet.is_range_query){
-                break;
+                // remap facet value hash with actual string
+                // auto & kv = facet_hash_counts[fi];
+                // auto & facet_count = kv.second;
+                // // fetch actual facet value from representative doc id
+                // const std::string& seq_id_key = get_seq_id_key((uint32_t) facet_count.doc_id);
+                // nlohmann::json document;
+                // const Option<bool> & document_op = get_document_from_store(seq_id_key, document);
+                // if(!document_op.ok()) {
+                //     LOG(ERROR) << "Facet fetch error. " << document_op.error();
+                //     continue;
+                // }
+                //std::string value;
+                // bool facet_found = facet_value_to_string(a_facet, facet_count, document, value);
+                // if(!facet_found) {
+                //     continue;
+                // }
+                std::string value = kv.first;
+                std::unordered_map<std::string, size_t> ftoken_pos;
+                //std::vector<string>& ftokens = a_facet.hash_tokens[kv.first];
+                std::vector<string>& ftokens = a_facet.facet_tokens[kv.first];
+                for(size_t ti = 0; ti < ftokens.size(); ti++) {
+                    // if(the_field.is_bool()) {
+                    //     if(ftokens[ti] == "1") {
+                    //         ftokens[ti] = "true";
+                    //     } else {
+                    //         ftokens[ti] = "false";
+                    //     }
+                    // }
+                    const std::string& resolved_token = ftokens[ti];
+                    ftoken_pos[resolved_token] = ti;
+                }
+                const std::string& last_full_q_token = ftokens.empty() ? "" : ftokens.back();
+                // 2 passes: first identify tokens that need to be highlighted and then construct highlighted text
+                bool is_cyrillic = Tokenizer::is_cyrillic(the_field.locale);
+                bool normalise = is_cyrillic ? false : true;
+                Tokenizer tokenizer(value, normalise, !the_field.is_string(), the_field.locale, symbols_to_index, token_separators);
+                // secondary tokenizer used for specific languages that requires transliteration
+                // we use 2 tokenizers so that the original text offsets are available for highlighting
+                Tokenizer word_tokenizer("", true, false, the_field.locale, symbols_to_index, token_separators);
+                std::string raw_token;
+                size_t raw_token_index = 0, tok_start = 0, tok_end = 0;
+                // need an ordered map here to ensure that it is ordered by the key (start offset)
+                std::map<size_t, size_t> token_offsets;
+                size_t prefix_token_start_index = 0;
+                while(tokenizer.next(raw_token, raw_token_index, tok_start, tok_end)) {
+                    if(is_cyrillic) {
+                        word_tokenizer.tokenize(raw_token);
+                    }
+                    auto token_pos_it = ftoken_pos.find(raw_token);
+                    if(token_pos_it != ftoken_pos.end()) {
+                        token_offsets[tok_start] = tok_end;
+                        if(raw_token == last_full_q_token) {
+                            prefix_token_start_index = tok_start;
+                        }
+                    }
+                }
+                auto offset_it = token_offsets.begin();
+                size_t i = 0;
+                std::stringstream highlightedss;
+                // loop until end index, accumulate token and complete highlighting
+                while(i < value.size()) {
+                    if(offset_it != token_offsets.end()) {
+                        if (i == offset_it->first) {
+                            highlightedss << highlight_start_tag;
+                            // do prefix highlighting for non-dropped last token
+                            size_t token_len = (i == prefix_token_start_index && token_offsets.size() == facet_query_num_tokens) ?
+                                               facet_query_last_token.size() :
+                                               (offset_it->second - i + 1);
+                            if(i == prefix_token_start_index && token_offsets.size() == facet_query_num_tokens) {
+                                token_len = std::min((offset_it->second - i + 1), facet_query_last_token.size());
+                            } else {
+                                token_len = (offset_it->second - i + 1);
+                            }
+                            for(size_t j = 0; j < token_len; j++) {
+                                highlightedss << value[i + j];
+                            }
+                            highlightedss << highlight_end_tag;
+                            offset_it++;
+                            i += token_len;
+                            continue;
+                        }
+                    }
+                    highlightedss << value[i];
+                    i++;
+                }
+                facet_value_t facet_value = {value, highlightedss.str(), kv.second.count};
+                facet_values.emplace_back(facet_value);
            }
-
-            // remap facet value hash with actual string
-            // auto & kv = facet_hash_counts[fi];
-            // auto & facet_count = kv.second;
-            // // fetch actual facet value from representative doc id
-            // const std::string& seq_id_key = get_seq_id_key((uint32_t) facet_count.doc_id);
-            // nlohmann::json document;
-            // const Option<bool> & document_op = get_document_from_store(seq_id_key, document);
-            // if(!document_op.ok()) {
-            //     LOG(ERROR) << "Facet fetch error. " << document_op.error();
-            //     continue;
-            // }
-            //std::string value;
-            // bool facet_found = facet_value_to_string(a_facet, facet_count, document, value);
-            // if(!facet_found) {
-            //     continue;
-            // }
-            // std::unordered_map<std::string, size_t> ftoken_pos;
-            // std::vector<string>& ftokens = a_facet.hash_tokens[kv.first];
-            // for(size_t ti = 0; ti < ftokens.size(); ti++) {
-            //     if(the_field.is_bool()) {
-            //         if(ftokens[ti] == "1") {
-            //             ftokens[ti] = "true";
-            //         } else {
-            //             ftokens[ti] = "false";
-            //         }
-            //     }
-            //     const std::string& resolved_token = ftokens[ti];
-            //     ftoken_pos[resolved_token] = ti;
-            // }
-            // const std::string& last_full_q_token = ftokens.empty() ? "" : ftokens.back();
-            // // 2 passes: first identify tokens that need to be highlighted and then construct highlighted text
-            // bool is_cyrillic = Tokenizer::is_cyrillic(the_field.locale);
-            // bool normalise = is_cyrillic ? false : true;
-            // Tokenizer tokenizer(value, normalise, !the_field.is_string(), the_field.locale, symbols_to_index, token_separators);
-            // // secondary tokenizer used for specific languages that requires transliteration
-            // // we use 2 tokenizers so that the original text offsets are available for highlighting
-            // Tokenizer word_tokenizer("", true, false, the_field.locale, symbols_to_index, token_separators);
-            // std::string raw_token;
-            // size_t raw_token_index = 0, tok_start = 0, tok_end = 0;
-            // // need an ordered map here to ensure that it is ordered by the key (start offset)
-            // std::map<size_t, size_t> token_offsets;
-            // size_t prefix_token_start_index = 0;
-            // while(tokenizer.next(raw_token, raw_token_index, tok_start, tok_end)) {
-            //     if(is_cyrillic) {
-            //         word_tokenizer.tokenize(raw_token);
-            //     }
-            //     auto token_pos_it = ftoken_pos.find(raw_token);
-            //     if(token_pos_it != ftoken_pos.end()) {
-            //         token_offsets[tok_start] = tok_end;
-            //         if(raw_token == last_full_q_token) {
-            //             prefix_token_start_index = tok_start;
-            //         }
-            //     }
-            // }
-            // auto offset_it = token_offsets.begin();
-            // size_t i = 0;
-            // std::stringstream highlightedss;
-            // // loop until end index, accumulate token and complete highlighting
-            // while(i < value.size()) {
-            //     if(offset_it != token_offsets.end()) {
-            //         if (i == offset_it->first) {
-            //             highlightedss << highlight_start_tag;
-            //             // do prefix highlighting for non-dropped last token
-            //             size_t token_len = (i == prefix_token_start_index && token_offsets.size() == facet_query_num_tokens) ?
-            //                                facet_query_last_token.size() :
-            //                                (offset_it->second - i + 1);
-            //             if(i == prefix_token_start_index && token_offsets.size() == facet_query_num_tokens) {
-            //                 token_len = std::min((offset_it->second - i + 1), facet_query_last_token.size());
-            //             } else {
-            //                 token_len = (offset_it->second - i + 1);
-            //             }
-            //             for(size_t j = 0; j < token_len; j++) {
-            //                 highlightedss << value[i + j];
-            //             }
-            //             highlightedss << highlight_end_tag;
-            //             offset_it++;
-            //             i += token_len;
-            //             continue;
-            //         }
-            //     }
-            //     highlightedss << value[i];
-            //     i++;
-            // }
-            //facet_value_t facet_value = {value, highlightedss.str(), facet_count.count};
-            facet_value_t facet_value = { it.first, std::string(), it.second};
-            facet_values.emplace_back(facet_value);
        }
        
        std::stable_sort(facet_values.begin(), facet_values.end(), Collection::facet_count_str_compare);
@ -2066,7 +2061,7 @@ Option<nlohmann::json> Collection::search(std::string  raw_query,
            facet_result["stats"]["avg"] = (a_facet.stats.fvsum / a_facet.stats.fvcount);
        }

-        facet_result["stats"]["total_values"] = facet_hash_counts.size();
+        facet_result["stats"]["total_values"] = facet_values.size();
        result["facet_counts"].push_back(facet_result);
    }

@ -4659,9 +4654,9 @@ Option<bool> Collection::parse_facet(const std::string& facet_field, std::vector

        for(const auto& tup : tupVec){

-            int64_t lower_range = std::get<0>(tup);
-            int64_t upper_range = std::get<1>(tup);
-            std::string range_val = std::get<2>(tup);
+            const std::string& lower_range = std::to_string(std::get<0>(tup));
+            const std::string& upper_range = std::to_string(std::get<1>(tup));
+            const std::string& range_val = std::get<2>(tup);
            //check if ranges are continous or not
            if((!range_map.empty()) && (range_map.find(lower_range)== range_map.end())){
                std::string error = "Ranges in range facet syntax should be continous.";
--- a/src/facet_index.cpp
+++ b/src/facet_index.cpp
@ -51,25 +51,6 @@ void facet_index_t::insert(const std::string& field, const std::string& value, u
    }
 }

-size_t facet_index_t::get(const std::string& field, 
-                    std::map<std::string,std::vector<uint32_t>>& result_ids) {
-
-    const auto& facet_field_it = facet_field_map.find(field);
-    if(facet_field_it == facet_field_map.end()) {
-        return 0;
-    }
-    auto& facet_index_map = facet_field_it->second.facet_index_map;
-
-    for(auto it = facet_index_map.begin(); it != facet_index_map.end(); ++it) {
-        auto ids = ids_t::uncompress(it.value());
-        for(auto i = 0; i < ids_t::num_ids(ids); ++i) {
-           result_ids[it.key()].emplace_back(ids[i]);
-        }
-    }
-
-    return result_ids.size();
-}
-
 bool facet_index_t::contains(const std::string& field) {

    const auto& facet_field_it = facet_field_map.find(field);
@ -77,38 +58,14 @@ bool facet_index_t::contains(const std::string& field) {
        return false;
    }

-    // auto& facet_index_map = facet_field_it->second.facet_index_map;
-    // LOG(INFO) << "Size of facet_field " << field << " " << facet_index_map.size();
-    
-    // for(auto it = facet_index_map.begin(); it != facet_index_map.end(); ++it) {
-    //     LOG (INFO) << "facet_value " << it.key() << " with ids as follow";
-
-    //     auto ids = ids_t::uncompress(it.value());
-    //     for(auto i = 0; i < ids_t::num_ids(ids); ++i) {
-    //         LOG(INFO) << ids[i];
-    //     }
-    // }
    return true;
 }

 void facet_index_t::erase(const std::string& field) {
-
-    const auto& facet_field_it = facet_field_map.find(field);
-    if(facet_field_it == facet_field_map.end()) {
-        return;
+    // Removes the facet index kept for `field`; does nothing when the field has no entry.
+    const auto it = facet_field_map.find(field);
+    if(it != facet_field_map.end()) {
+        // Erase through the iterator already held instead of searching for the key a second time.
+        facet_field_map.erase(it);
    }
-
-    auto& facet_index_map = facet_field_it->second.facet_index_map;
-
-    for(auto it = facet_index_map.begin(); it != facet_index_map.end(); ++it) {
-        ids_t::destroy_list(it.value());
-    }
-    
-    facet_index_map.clear();
-
-    facet_field_it->second.counter_list.clear();
-
-    facet_field_map.erase(field);
 }

 size_t facet_index_t::size() {
@ -132,39 +89,68 @@ int facet_index_t::intersect(const std::string& field, const uint32_t* result_id
    // LOG (INFO) << "facet_index_map size " << facet_index_map.size() 
    //     << " , counter_list size " << counter_list.size();
    
-    auto counter_list_it = counter_list.begin();
-    int facet_count = 0;
-
+    std::vector<uint32_t> id_list;
    const auto max_facets = std::min((int)counter_list.size(), max_facet_count);
-    while(facet_count < max_facets) {
+    for(const auto& counter_list_it : counter_list) {
        //LOG (INFO) << "checking ids in facet_value " << counter_list_it->facet_value 
        // << " having total count " << counter_list_it->count;

-        auto ids = facet_index_map.at(counter_list_it->facet_value);
-        auto id_list = ids_t::uncompress(ids);
+        auto ids = facet_index_map.at(counter_list_it.facet_value);
+        ids_t::uncompress(ids, id_list);
+        const auto ids_len = id_list.size();
        int count = 0;
        
        for(int i = 0; i < result_ids_len; ++i) {
-            if(std::binary_search(id_list, id_list + ids_t::num_ids(id_list), result_ids[i])) {
+            if(std::binary_search(id_list.begin(), id_list.end(), result_ids[i])) {
                ++count;
            }
        }
+        
        if(count) {
-            //LOG (INFO) << "fount count " << count << " for facet " << counter_list_it->facet_value;
-            found[counter_list_it->facet_value] += count;
+            found[counter_list_it.facet_value] = count;
+            
+            if(found.size() == max_facets) {
+                break;
+            }
        }

-        ++facet_count;
-        ++counter_list_it;
+        id_list.clear();
    }
-
+    
    return found.size();
 }

-facet_index_t::~facet_index_t() {
-    for(auto it = facet_field_map.begin(); it != facet_field_map.end(); ++it) {
-        erase(it->first);
+// Collects the facet values of `field` that contain any of the searched tokens.
+//
+// Matching is a case-insensitive substring test: the token and the facet value are both
+// lower-cased byte by byte before comparison, while the value appended to `facets` keeps
+// its original casing. A value is appended once per matching token, so it can appear more
+// than once when several tokens match it. Entries already present in `facets` are kept.
+//
+// Returns the size of `facets` after the scan, or 0 when `field` has no facet index.
+int facet_index_t::get_facet(const std::string& field,  const std::vector<std::string>& searched_tokens, 
+        std::vector<std::string>& facets) {
+
+    const auto& facet_field_it = facet_field_map.find(field);
+
+    if(facet_field_it == facet_field_map.end()) {
+        return 0;
    }
+
+    // Bind by reference: a by-value `auto` here copies the whole trie on every call.
+    auto& facet_index_map = facet_field_it->second.facet_index_map;
+
+    // ::tolower is undefined for negative char values, so every byte goes through unsigned char.
+    const auto to_lower = [](unsigned char c) { return static_cast<char>(::tolower(c)); };
+
+    for(const auto& token : searched_tokens) {
+        auto token_string = token;
+        std::transform(token_string.begin(), token_string.end(), token_string.begin(), to_lower);
+
+        for(auto facet_index_map_it = facet_index_map.begin(); 
+            facet_index_map_it != facet_index_map.end(); ++facet_index_map_it) {
+
+            // Build the key once; the untouched copy is what gets reported to the caller.
+            const std::string facet_value = facet_index_map_it.key();
+            std::string facet_string = facet_value;
+            std::transform(facet_string.begin(), facet_string.end(), facet_string.begin(), to_lower);
+
+            if(facet_string.find(token_string) != std::string::npos) {
+                facets.emplace_back(facet_value);
+            }
+        }
+    }
+
+    return static_cast<int>(facets.size());
+}
+
+facet_index_t::~facet_index_t() {  // clearing the map destroys every stored facet_index_counter
    facet_field_map.clear();    
 }

--- a/src/index.cpp
+++ b/src/index.cpp
@ -62,6 +62,8 @@ Index::Index(const std::string& name, const uint32_t collection_id, const Store*
        search_schema(search_schema),
        seq_ids(new id_list_t(256)), symbols_to_index(symbols_to_index), token_separators(token_separators) {

+    facet_index_v4 = new facet_index_t();
+
    for(const auto& a_field: search_schema) {
        if(!a_field.index) {
            continue;
@ -102,9 +104,6 @@ Index::Index(const std::string& name, const uint32_t collection_id, const Store*

        if(a_field.facet) {
            //initialize_facet_indexes(a_field);
-            if(facet_index_v4 == nullptr) {
-                facet_index_v4 = new facet_index_t();
-            }
        }

        // initialize for non-string facet fields
@ -1173,9 +1172,9 @@ void Index::initialize_facet_indexes(const field& facet_field) {
    // }
 }

-void Index::compute_facet_stats(facet &a_facet, uint64_t raw_value, const std::string & field_type) {
+void Index::compute_facet_stats(facet &a_facet, std::string raw_value, const std::string & field_type) {
    if(field_type == field_types::INT32 || field_type == field_types::INT32_ARRAY) {
-        int32_t val = raw_value;
+        int32_t val = std::stoi(raw_value);
        if (val < a_facet.stats.fvmin) {
            a_facet.stats.fvmin = val;
        }
@ -1185,7 +1184,7 @@ void Index::compute_facet_stats(facet &a_facet, uint64_t raw_value, const std::s
        a_facet.stats.fvsum += val;
        a_facet.stats.fvcount++;
    } else if(field_type == field_types::INT64 || field_type == field_types::INT64_ARRAY) {
-        int64_t val = raw_value;
+        int64_t val = std::stol(raw_value);
        if(val < a_facet.stats.fvmin) {
            a_facet.stats.fvmin = val;
        }
@ -1195,7 +1194,7 @@ void Index::compute_facet_stats(facet &a_facet, uint64_t raw_value, const std::s
        a_facet.stats.fvsum += val;
        a_facet.stats.fvcount++;
    } else if(field_type == field_types::FLOAT || field_type == field_types::FLOAT_ARRAY) {
-        float val = reinterpret_cast<float&>(raw_value);
+        float val = std::stof(raw_value);
        if(val < a_facet.stats.fvmin) {
            a_facet.stats.fvmin = val;
        }
@ -1219,7 +1218,7 @@ void Index::do_facets(std::vector<facet> & facets, facet_query_t & facet_query,
        const auto& facet_field = facet_infos[findex].facet_field;
        const bool use_facet_query = facet_infos[findex].use_facet_query;
        //const auto& fquery_hashes = facet_infos[findex].hashes;
-        const auto& fquery_doc_id_tokens = facet_infos[findex].doc_id_tokens;
+        const auto& fquery_facet_tokens = facet_infos[findex].facet_tokens;
        const bool should_compute_stats = facet_infos[findex].should_compute_stats;

        auto sort_index_it = sort_index.find(a_facet.field_name);
@ -1232,9 +1231,41 @@ void Index::do_facets(std::vector<facet> & facets, facet_query_t & facet_query,
        // size_t facet_hash_count = 1;
        // const auto& field_facet_mapping_it = facet_index_v3.find(a_facet.field_name);
        // const auto& field_single_val_facet_mapping_it = single_val_facet_index_v3.find(a_facet.field_name);
+        std::map<std::string, uint32_t> facet_results;
        facet_index_v4->intersect(a_facet.field_name, result_ids, 
-            results_size, max_facet_count, facet_results[a_facet.field_name]);
+            results_size, max_facet_count, facet_results);
+        //LOG(INFO) << "facet_results size " << facet_results.size();
        
+        for(const auto& kv : facet_results) {
+            //range facet processing
+            if(a_facet.is_range_query) {
+                const auto doc_val = kv.first;
+                std::pair<std::string, std::string> range_pair {};
+                if(a_facet.get_range(doc_val, range_pair)) {
+                    const auto& range_id = range_pair.first;
+                    facet_count_t& facet_count = a_facet.result_map[range_id];
+                    facet_count.count = kv.second;
+                }
+            } else if(use_facet_query) { 
+                    if (fquery_facet_tokens.find(kv.first) != fquery_facet_tokens.end()) {
+                        a_facet.facet_tokens[kv.first] = fquery_facet_tokens.at(kv.first);
+                    
+                        facet_count_t& facet_count = a_facet.result_map[kv.first];
+                        facet_count.count = kv.second;
+                    }
+            } else { 
+                facet_count_t& facet_count = a_facet.result_map[kv.first];
+                facet_count.count = kv.second;
+            }
+
+            if(should_compute_stats) {
+                //LOG(INFO) << "Computing facet stas for facet " << a_facet.field_name;
+                for(int i = 0; i < kv.second; ++i) {
+                    compute_facet_stats(a_facet, kv.first, facet_field.type);
+                }
+            } 
+        }                 
+    }

        // for(size_t i = 0; i < results_size; i++) {
        //     // if sampling is enabled, we will skip a portion of the results to speed up things
@ -1266,7 +1297,6 @@ void Index::do_facets(std::vector<facet> & facets, facet_query_t & facet_query,

            // const uint64_t distinct_id = group_limit ? get_distinct_id(group_by_fields, doc_seq_id) : 0;
            // //for(size_t j = 0; j < facet_hash_count; j++) {
-            // for(size_t j = 0; j < found_doc_seq_ids.size(); j++) {
            //     // if(facet_field.is_array()) {
            //     //     fhash = facet_map_it->second.hashes[j];
            //     // }
@ -1306,7 +1336,7 @@ void Index::do_facets(std::vector<facet> & facets, facet_query_t & facet_query,
            //     }
            // }
        //}
-    }
+    //}
 }

 void Index::aggregate_topster(Topster* agg_topster, Topster* index_topster) {
@ -2399,18 +2429,22 @@ Option<bool> Index::search(std::vector<query_tokens_t>& field_query_tokens, cons
            while (it.valid()) {
                uint32_t seq_id = it.id();
                uint64_t distinct_id = seq_id;
-                if (group_limit != 0) {
-                    distinct_id = get_distinct_id(group_by_fields, seq_id);
-                    if(excluded_group_ids.count(distinct_id) != 0) {
-                        continue;
-                    }
-                }
+                // if (group_limit != 0) {
+                //     distinct_id = get_distinct_id(group_by_fields, seq_id);
+                //     if(excluded_group_ids.count(distinct_id) != 0) {
+                //        continue;
+                //    }
+                // }

                int64_t scores[3] = {0};
                scores[0] = seq_id;
                int64_t match_score_index = -1;

                result_ids.push_back(seq_id);
+                if(group_limit == 0) {
+                    KV kv(searched_queries.size(), seq_id, distinct_id, match_score_index, scores);
+                    topster->add(&kv);
+                }

                KV kv(searched_queries.size(), seq_id, distinct_id, match_score_index, scores, nullptr);
                int ret = topster->add(&kv);
@ -2507,12 +2541,12 @@ Option<bool> Index::search(std::vector<query_tokens_t>& field_query_tokens, cons
                }

                uint64_t distinct_id = seq_id;
-                if (group_limit != 0) {
-                    distinct_id = get_distinct_id(group_by_fields, seq_id);
-                    if(excluded_group_ids.count(distinct_id) != 0) {
-                        continue;
-                    }
-                }
+                // if (group_limit != 0) {
+                //     distinct_id = get_distinct_id(group_by_fields, seq_id);
+                //     if(excluded_group_ids.count(distinct_id) != 0) {
+                //        continue;
+                //    }
+                // }

                auto vec_dist_score = (field_vector_index->distance_type == cosine) ? std::abs(dist_label.first) :
                                      dist_label.first;
@ -2530,9 +2564,9 @@ Option<bool> Index::search(std::vector<query_tokens_t>& field_query_tokens, cons
                KV kv(searched_queries.size(), seq_id, distinct_id, match_score_index, scores, nullptr);
                int ret = topster->add(&kv);

-                if(group_limit != 0 && ret < 2) {
-                    groups_processed[distinct_id]++;
-                }
+                // if(group_limit != 0 && ret < 2) {
+                //     groups_processed[distinct_id]++;
+                // }
                nearest_ids.push_back(seq_id);
            }

@ -2826,7 +2860,8 @@ Option<bool> Index::search(std::vector<query_tokens_t>& field_query_tokens, cons
    bool estimate_facets = (facet_sample_percent < 100 && all_result_ids_len > facet_sample_threshold);

    if(!facets.empty()) {
-        const size_t num_threads = std::min(concurrency, all_result_ids_len);
+        //const size_t num_threads = std::min(concurrency, all_result_ids_len);
+        const size_t num_threads = 1;
        const size_t window_size = (num_threads == 0) ? 0 :
                                   (all_result_ids_len + num_threads - 1) / num_threads;  // rounds up
        size_t num_processed = 0;
@ -2897,10 +2932,10 @@ Option<bool> Index::search(std::vector<query_tokens_t>& field_query_tokens, cons
                for(auto & facet_kv: this_facet.result_map) {
                    if(group_limit) {
                        // we have to add all group sets
-                        acc_facet.hash_groups[facet_kv.first].insert(
-                            this_facet.hash_groups[facet_kv.first].begin(),
-                            this_facet.hash_groups[facet_kv.first].end()
-                        );
+                        // acc_facet.hash_groups[facet_kv.first].insert(
+                        //     this_facet.hash_groups[facet_kv.first].begin(),
+                        //     this_facet.hash_groups[facet_kv.first].end()
+                        // );
                    } else {
                        size_t count = 0;
                        if(acc_facet.result_map.count(facet_kv.first) == 0) {
@ -2912,9 +2947,10 @@ Option<bool> Index::search(std::vector<query_tokens_t>& field_query_tokens, cons
                        acc_facet.result_map[facet_kv.first].count = count;
                    }

-                    acc_facet.result_map[facet_kv.first].doc_id = facet_kv.second.doc_id;
-                    acc_facet.result_map[facet_kv.first].array_pos = facet_kv.second.array_pos;
-                    acc_facet.hash_tokens[facet_kv.first] = this_facet.hash_tokens[facet_kv.first];
+                    //acc_facet.result_map[facet_kv.first].doc_id = facet_kv.second.doc_id;
+                    //acc_facet.result_map[facet_kv.first].array_pos = facet_kv.second.array_pos;
+                    //acc_facet.hash_tokens[facet_kv.first] = this_facet.hash_tokens[facet_kv.first];
+                    acc_facet.facet_tokens[facet_kv.first] = this_facet.facet_tokens[facet_kv.first];
                }

                if(this_facet.stats.fvcount != 0) {
@ -2928,9 +2964,9 @@ Option<bool> Index::search(std::vector<query_tokens_t>& field_query_tokens, cons

        for(auto & acc_facet: facets) {
            for(auto& facet_kv: acc_facet.result_map) {
-                if(group_limit) {
-                    facet_kv.second.count = acc_facet.hash_groups[facet_kv.first].size();
-                }
+                // if(group_limit) {
+                //     facet_kv.second.count = acc_facet.hash_groups[facet_kv.first].size();
+                // }

                if(estimate_facets) {
                    facet_kv.second.count = size_t(double(facet_kv.second.count) * (100.0f / facet_sample_percent));
@ -2955,8 +2991,6 @@ Option<bool> Index::search(std::vector<query_tokens_t>& field_query_tokens, cons
              facet_infos, group_limit, group_by_fields, &included_ids_vec[0], 
              included_ids_vec.size(), max_facet_values);

-    facet_index_v4->contains("tags");
-    
    all_result_ids_len += curated_topster->size;

    delete [] all_result_ids;
@ -3665,12 +3699,12 @@ void Index::search_across_fields(const std::vector<token_t>& query_tokens,
        }

        uint64_t distinct_id = seq_id;
-        if(group_limit != 0) {
-            distinct_id = get_distinct_id(group_by_fields, seq_id);
-            if(excluded_group_ids.count(distinct_id) != 0) {
-                return;
-            }
-        }
+        // if(group_limit != 0) {
+        //     distinct_id = get_distinct_id(group_by_fields, seq_id);
+        //     if(excluded_group_ids.count(distinct_id) != 0) {
+        //        return;
+        //    }
+        // }

        int64_t scores[3] = {0};
        int64_t match_score_index = -1;
@ -4267,18 +4301,21 @@ void Index::do_infix_search(const size_t num_search_fields, const std::vector<se
                                        100, scores, match_score_index);

                    uint64_t distinct_id = seq_id;
-                    if(group_limit != 0) {
-                        distinct_id = get_distinct_id(group_by_fields, seq_id);
-                        if(excluded_group_ids.count(distinct_id) != 0) {
-                            continue;
-                        }
-                    }
-
-                    KV kv(searched_queries.size(), seq_id, distinct_id, match_score_index, scores);
-                    int ret = actual_topster->add(&kv);
-                    if(group_limit != 0 && ret < 2) {
-                        groups_processed[distinct_id]++;
+                    // if(group_limit != 0) {
+                    //     distinct_id = get_distinct_id(group_by_fields, seq_id);
+                    //     if(excluded_group_ids.count(distinct_id) != 0) {
+                    //        continue;
+                    //    }
+                    // }
+                    if(group_limit == 0) {
+                        KV kv(searched_queries.size(), seq_id, distinct_id, match_score_index, scores);
+                        int ret = actual_topster->add(&kv);
                    }
+                    
+                    // if(group_limit != 0 && ret < 2) {
+                    //     groups_processed[distinct_id]++;
+                    // }
+                    

                    if(((i + 1) % (1 << 12)) == 0) {
                        BREAK_CIRCUIT_BREAKER
@ -4374,10 +4411,6 @@ void Index::compute_facet_infos(const std::vector<facet>& facets, facet_query_t&
        //     && (field_single_val_facet_mapping_it == single_val_facet_index_v3.end())) {
        //     continue;
        // }
-        std::map<std::string, std::vector<uint32_t>> found_doc_ids;
-        if(facet_index_v4->get(a_facet.field_name, found_doc_ids) == 0) {
-            continue;
-        }

        facet_infos[findex].use_facet_query = false;

@ -4442,6 +4475,16 @@ void Index::compute_facet_infos(const std::vector<facet>& facets, facet_query_t&
                for(auto leaf: searched_query) {
                    posting_lists.push_back(leaf->values);
                    std::string tok(reinterpret_cast<char*>(leaf->key), leaf->key_len - 1);
+                    
+                    //convert again to boolean string to help search in facet_index map
+                    if (facet_field.is_bool()) {
+                        if (tok == "1") {
+                            tok = "true";
+                        } else if (tok == "0") {
+                            tok = "false";
+                        }
+                    }
+
                    searched_tokens.push_back(tok);
                    //LOG(INFO) << "tok: " << tok;
                }
@ -4468,6 +4511,7 @@ void Index::compute_facet_infos(const std::vector<facet>& facets, facet_query_t&
                    if(!id_matched) {
                        continue;
                    }
+                    //LOG(INFO) << "seq_id matched : " << seq_id;

                    // if(facet_field.is_array()) {
                    //     const auto doc_fvalues_it = field_facet_mapping_it->second[seq_id % ARRAY_FACET_DIM]->find(seq_id);
@ -4501,19 +4545,18 @@ void Index::compute_facet_infos(const std::vector<facet>& facets, facet_query_t&
                    //         facet_infos[findex].hashes.emplace(hash, searched_tokens);
                    //     }
                    // }
-                    for(const auto& found_doc_it : found_doc_ids) {
-                        const auto& ids = found_doc_it.second;
-                        if(std::binary_search(ids.begin(), ids.end(), seq_id)){
-                            for(const auto& doc_id : ids) {
-                                if(facet_infos[findex].doc_id_tokens.count(doc_id) == 0) {
-                                    facet_infos[findex].doc_id_tokens.emplace(doc_id, searched_tokens);
-                                }
-                            }
+                }
+                std::vector<std::string> matched_facets;
+                if(facet_index_v4->get_facet(a_facet.field_name, searched_tokens, matched_facets)) {
+                    for(const auto& facet : matched_facets) {
+                        if(facet_infos[findex].facet_tokens.count(facet) == 0) {
+                            LOG(INFO) << "adding facet " << facet << " in facet_info";
+                            facet_infos[findex].facet_tokens.emplace(facet, searched_tokens);
                        }
                    }
                }
            }
-
+            
            delete [] field_result_ids;
        }
    }
@ -4621,20 +4664,20 @@ void Index::search_wildcard(filter_node_t const* const& filter_tree_root,
                                    100, scores, match_score_index);

                uint64_t distinct_id = seq_id;
-                if(group_limit != 0) {
-                    distinct_id = get_distinct_id(group_by_fields, seq_id);
-                    if(excluded_group_ids.count(distinct_id) != 0) {
-                        continue;
-                    }
+                // if(group_limit != 0) {
+                //     distinct_id = get_distinct_id(group_by_fields, seq_id);
+                //     if(excluded_group_ids.count(distinct_id) != 0) {
+                //        continue;
+                //    }
+                // }
+                if(group_limit == 0) {
+                    KV kv(searched_queries.size(), seq_id, distinct_id, match_score_index, scores);
+                    int ret = topsters[thread_id]->add(&kv);
                }
-
-                KV kv(searched_queries.size(), seq_id, distinct_id, match_score_index, scores);
-                int ret = topsters[thread_id]->add(&kv);
-
-                if(group_limit != 0 && ret < 2) {
-                    tgroups_processed[thread_id][distinct_id]++;
-                }
-
+                
+                // if(group_limit != 0 && ret < 2) {
+                //     tgroups_processed[thread_id][distinct_id]++;
+                // }
                if(check_for_circuit_break && ((i + 1) % (1 << 15)) == 0) {
                    // check only once every 2^15 docs to reduce overhead
                    BREAK_CIRCUIT_BREAKER
@ -5220,11 +5263,16 @@ void Index::score_results(const std::vector<sort_by> & sort_fields, const uint16

    uint64_t distinct_id = seq_id;

-    if(group_limit != 0) {
-        distinct_id = get_distinct_id(group_by_fields, seq_id);
-    }
+    // if(group_limit != 0) {
+    //     distinct_id = get_distinct_id(group_by_fields, seq_id);
+    //     groups_processed.emplace(distinct_id);
+    // }

    //LOG(INFO) << "Seq id: " << seq_id << ", match_score: " << match_score;
+    if(group_limit == 0) {
+        KV kv(query_index, seq_id, distinct_id, match_score_index, scores);
+        topster->add(&kv);
+    }
    KV kv(query_index, seq_id, distinct_id, match_score_index, scores);
    int ret = topster->add(&kv);
    if(group_limit != 0 && ret < 2) {
@ -5239,7 +5287,6 @@ void Index::score_results(const std::vector<sort_by> & sort_fields, const uint16
 uint64_t Index::get_distinct_id(const std::vector<std::string>& group_by_fields,
                                const uint32_t seq_id) const {
    uint64_t distinct_id = 1; // some constant initial value
-    std::hash<std::string> hasher;
    // calculate hash from group_by_fields
    for(const auto& field: group_by_fields) {
        // const auto& field_facet_mapping_it = facet_index_v3.find(field);
@ -5277,9 +5324,6 @@ uint64_t Index::get_distinct_id(const std::vector<std::string>& group_by_fields,

        //     distinct_id = StringUtils::hash_combine(distinct_id, facet_hash);
        // }
-
-        const auto& hash = hasher(field);
-        distinct_id = StringUtils::hash_combine(distinct_id, hash);
    }

    return distinct_id;
--- a/test/collection_faceting_test.cpp
+++ b/test/collection_faceting_test.cpp
@ -103,11 +103,11 @@ TEST_F(CollectionFacetingTest, FacetCounts) {
    ASSERT_STREQ("tags", results["facet_counts"][0]["field_name"].get<std::string>().c_str());
    ASSERT_EQ(2, results["facet_counts"][0]["counts"].size());

-    ASSERT_STREQ("gold", results["facet_counts"][0]["counts"][0]["value"].get<std::string>().c_str());
-    ASSERT_EQ(3, (int) results["facet_counts"][0]["counts"][0]["count"]);
+    ASSERT_STREQ("bronze", results["facet_counts"][0]["counts"][0]["value"].get<std::string>().c_str());
+    ASSERT_EQ(2, (int) results["facet_counts"][0]["counts"][0]["count"]);

-    ASSERT_STREQ("silver", results["facet_counts"][0]["counts"][1]["value"].get<std::string>().c_str());
-    ASSERT_EQ(3, (int) results["facet_counts"][0]["counts"][1]["count"]);
+    ASSERT_STREQ("FINE PLATINUM", results["facet_counts"][0]["counts"][1]["value"].get<std::string>().c_str());
+    ASSERT_EQ(1, (int) results["facet_counts"][0]["counts"][1]["count"]);

    // 2 facets, 1 text query with no filters
    facets.clear();
@ -230,12 +230,12 @@ TEST_F(CollectionFacetingTest, FacetCounts) {
    ASSERT_STREQ("age", results["facet_counts"][0]["field_name"].get<std::string>().c_str());

    ASSERT_EQ(1, (int) results["facet_counts"][0]["counts"][0]["count"]);
-    ASSERT_STREQ("21", results["facet_counts"][0]["counts"][0]["value"].get<std::string>().c_str());
-    ASSERT_STREQ("<mark>2</mark>1", results["facet_counts"][0]["counts"][0]["highlighted"].get<std::string>().c_str());
+    ASSERT_STREQ("24", results["facet_counts"][0]["counts"][0]["value"].get<std::string>().c_str());
+    ASSERT_STREQ("<mark>2</mark>4", results["facet_counts"][0]["counts"][0]["highlighted"].get<std::string>().c_str());

    ASSERT_EQ(1, (int) results["facet_counts"][0]["counts"][1]["count"]);
-    ASSERT_STREQ("24", results["facet_counts"][0]["counts"][1]["value"].get<std::string>().c_str());
-    ASSERT_STREQ("<mark>2</mark>4", results["facet_counts"][0]["counts"][1]["highlighted"].get<std::string>().c_str());
+    ASSERT_STREQ("21", results["facet_counts"][0]["counts"][1]["value"].get<std::string>().c_str());
+    ASSERT_STREQ("<mark>2</mark>1", results["facet_counts"][0]["counts"][1]["highlighted"].get<std::string>().c_str());

    // facet on a float field without query to check on stats
    results = coll_array_fields->search("*", query_fields, "", {"rating"}, sort_fields, {0}, 10, 1, FREQUENCY,
@ -258,7 +258,7 @@ TEST_F(CollectionFacetingTest, FacetCounts) {
                                        {false}, Index::DROP_TOKENS_THRESHOLD,
                                        spp::sparse_hash_set<std::string>(),
                                        spp::sparse_hash_set<std::string>(), 10, "rating: 7").get();
-
+    
    ASSERT_EQ(5, results["hits"].size());
    ASSERT_EQ(1, results["facet_counts"].size());
    ASSERT_STREQ("rating", results["facet_counts"][0]["field_name"].get<std::string>().c_str());
@ -278,7 +278,7 @@ TEST_F(CollectionFacetingTest, FacetCounts) {
    results = coll_array_fields->search("*", query_fields, "", {"timestamps"}, sort_fields, {0}, 10, 1, FREQUENCY,
                                        {false}, Index::DROP_TOKENS_THRESHOLD,
                                        spp::sparse_hash_set<std::string>(),
-                                        spp::sparse_hash_set<std::string>(), 10, "timestamps: 142189002").get();
+                                        spp::sparse_hash_set<std::string>(), 10, "timestamps: 142189002").get();    
    ASSERT_EQ(5, results["hits"].size());
    ASSERT_EQ(1, results["facet_counts"].size());
    ASSERT_EQ(1, results["facet_counts"][0]["counts"].size());
@ -607,7 +607,6 @@ TEST_F(CollectionFacetingTest, FacetCountsHighlighting) {
    ASSERT_STREQ("Cell Phone <mark>Acces</mark>sories", results["facet_counts"][0]["counts"][0]["highlighted"].get<std::string>().c_str());

    // ensure that only the last token is treated as prefix search
-
    coll1->remove("100");
    doc["categories"] = {"Cell Phones", "Cell Phone Accessories", "Cellophanes"};
    coll1->add(doc.dump());
@ -616,6 +615,8 @@ TEST_F(CollectionFacetingTest, FacetCountsHighlighting) {
                            token_ordering::FREQUENCY, {true}, 10, spp::sparse_hash_set<std::string>(),
                            spp::sparse_hash_set<std::string>(), 10, "categories:cell ph").get();

+    LOG(INFO) << results.dump();
+
    ASSERT_EQ(1, results["facet_counts"].size());
    ASSERT_EQ(2, results["facet_counts"][0]["counts"].size());

@ -673,6 +674,7 @@ TEST_F(CollectionFacetingTest, FacetStatOnFloatFields) {
                                            1, FREQUENCY, {false});

    auto results = res_op.get();
+    LOG(INFO) << results.dump();

    ASSERT_EQ(7, results["hits"].size());

@ -746,8 +748,8 @@ TEST_F(CollectionFacetingTest, FacetCountOnSimilarStrings) {
    ASSERT_EQ(2, results["hits"].size());
    ASSERT_EQ(2, results["facet_counts"][0]["counts"].size());

-    ASSERT_STREQ("India in England", results["facet_counts"][0]["counts"][0]["value"].get<std::string>().c_str());
-    ASSERT_STREQ("England in India", results["facet_counts"][0]["counts"][1]["value"].get<std::string>().c_str());
+    ASSERT_STREQ("England in India", results["facet_counts"][0]["counts"][0]["value"].get<std::string>().c_str());
+    ASSERT_STREQ("India in England", results["facet_counts"][0]["counts"][1]["value"].get<std::string>().c_str());

    collectionManager.drop_collection("coll1");
 }
@ -1645,6 +1647,7 @@ TEST_F(CollectionFacetingTest, FacetIndexRefactor) {
    ASSERT_EQ(5, results["hits"].size());
    ASSERT_EQ(1, results["facet_counts"].size());
    ASSERT_EQ(4, results["facet_counts"][0].size());
+    ASSERT_EQ(4, results["facet_counts"][0]["counts"].size());
    ASSERT_EQ("tags", results["facet_counts"][0]["field_name"]);

    ASSERT_STREQ("gold", results["facet_counts"][0]["counts"][0]["value"].get<std::string>().c_str());
--- a/test/collection_grouping_test.cpp
+++ b/test/collection_grouping_test.cpp
@ -69,6 +69,8 @@ TEST_F(CollectionGroupingTest, GroupingBasics) {
                                   "", 10,
                                   {}, {}, {"size"}, 2).get();

+    LOG(INFO) << res.dump();
+
    ASSERT_EQ(3, res["found"].get<size_t>());
    ASSERT_EQ(3, res["grouped_hits"].size());
    ASSERT_EQ(11, res["grouped_hits"][0]["group_key"][0].get<size_t>());