diff --git a/include/facet_index.h b/include/facet_index.h index 38aa5a54..6076f5af 100644 --- a/include/facet_index.h +++ b/include/facet_index.h @@ -132,8 +132,8 @@ public: void initialize(const std::string& field); - void handle_index_change(const std::string& field_name, size_t total_num_docs, size_t facet_index_threshold, - size_t facet_count, spp::sparse_hash_map& numerical_index); + void handle_index_change(const std::string& field_name, size_t total_num_docs, + size_t facet_index_threshold, size_t facet_count); bool has_hash_index(const std::string& field_name); diff --git a/include/num_tree.h b/include/num_tree.h index 24a169a3..26d5bdf4 100644 --- a/include/num_tree.h +++ b/include/num_tree.h @@ -77,12 +77,5 @@ public: uint32_t* const& context_ids, size_t& result_ids_len, uint32_t*& result_ids) const; - - size_t intersect(const uint32_t* result_ids, size_t result_id_len, - size_t max_facet_count, std::map& found, - bool is_wildcard_no_filter_query); - - size_t counter_list_size() const; - size_t get_facet_indexes(std::map>& seqid_countIndexes); }; \ No newline at end of file diff --git a/src/facet_index.cpp b/src/facet_index.cpp index 2a796221..c7981fb1 100644 --- a/src/facet_index.cpp +++ b/src/facet_index.cpp @@ -55,30 +55,26 @@ void facet_index_t::insert(const std::string& field_name, bool is_string, if(facet_index.has_value_index) { count_list.emplace_back(fvalue.facet_value, seq_ids.size(), facet_id); fis.facet_count_it = std::prev(count_list.end()); - if(is_string) { - fis.seq_ids = SET_COMPACT_IDS(compact_id_list_t::create(seq_ids.size(), seq_ids)); - } + fis.seq_ids = SET_COMPACT_IDS(compact_id_list_t::create(seq_ids.size(), seq_ids)); } fvalue_index.emplace(fvalue.facet_value, fis); } else if(facet_index.has_value_index) { - if(is_string) { - for(const auto id : seq_ids) { - ids_t::upsert(fvalue_index_it->seq_ids, id); - } + for(const auto id : seq_ids) { + ids_t::upsert(fvalue_index_it->seq_ids, id); + } - auto facet_count_it = fvalue_index_it->facet_count_it; + auto facet_count_it = fvalue_index_it->facet_count_it; - if(facet_count_it->facet_id == facet_id) { - facet_count_it->count = ids_t::num_ids(fvalue_index_it->seq_ids); - auto curr = facet_count_it; - while (curr != count_list.begin() && std::prev(curr)->count < curr->count) { - count_list.splice(curr, count_list, std::prev(curr)); // swaps list nodes - curr--; - } - } else { - LOG(ERROR) << "Wrong reference stored for facet " << fvalue.facet_value << " with facet_id " << facet_id; + if(facet_count_it->facet_id == facet_id) { + facet_count_it->count = ids_t::num_ids(fvalue_index_it->seq_ids); + auto curr = facet_count_it; + while (curr != count_list.begin() && std::prev(curr)->count < curr->count) { + count_list.splice(curr, count_list, std::prev(curr)); // swaps list nodes + curr--; } + } else { + LOG(ERROR) << "Wrong reference stored for facet " << fvalue.facet_value << " with facet_id " << facet_id; } } @@ -162,27 +158,28 @@ size_t facet_index_t::intersect(const std::string& field, const uint32_t* result const auto& facet_index_map = facet_field_it->second.fvalue_seq_ids; const auto& counter_list = facet_field_it->second.counts; - // LOG (INFO) << "fvalue_seq_ids size " << fvalue_seq_ids.size() - // << " , counts size " << counts.size(); - + //LOG(INFO) << "fvalue_seq_ids size " << facet_index_map.size() << " , counts size " << counter_list.size(); + size_t max_facets = std::min((size_t)2 * max_facet_count, counter_list.size()); std::vector id_list; for(const auto& facet_count : counter_list) { - // LOG (INFO) << "checking ids in facet_value " << counter_list_it.facet_value - // << " having total count " << counter_list_it.count; + //LOG(INFO) << "checking ids in facet_value " << facet_count.facet_value << " having total count " + // << facet_count.count << ", is_wildcard_no_filter_query: " << is_wildcard_no_filter_query; uint32_t count = 0; if(is_wildcard_no_filter_query) { count = facet_count.count; } else { auto ids = facet_index_map.at(facet_count.facet_value).seq_ids; - ids_t::uncompress(ids, id_list); - for(size_t i = 0; i < result_ids_len; ++i) { - uint32_t* out = nullptr; - count = ArrayUtils::and_scalar(id_list.data(), id_list.size(), - result_ids, result_ids_len, &out); - delete[] out; + if(!ids) { + continue; } + + ids_t::uncompress(ids, id_list); + uint32_t* out = nullptr; + count = ArrayUtils::and_scalar(id_list.data(), id_list.size(), result_ids, result_ids_len, &out); + delete[] out; + id_list.clear(); } @@ -230,8 +227,7 @@ size_t facet_index_t::get_facet_indexes(const std::string& field_name, } void facet_index_t::handle_index_change(const std::string& field_name, size_t total_num_docs, - size_t facet_index_threshold, size_t facet_count, - spp::sparse_hash_map& numerical_index) { + size_t facet_index_threshold, size_t facet_count) { // Low cardinality fields will have only value based facet index. Once a field becomes a high cardinality // field (exceeding FACET_INDEX_THRESHOLD), we will create a hash based index and populate it. @@ -252,16 +248,6 @@ void facet_index_t::handle_index_change(const std::string& field_name, size_t to seq_id_index_map.clear(); - auto numerical_index_it = numerical_index.find(field_name); - if(numerical_index_it != numerical_index.end()) { - auto num_tree = numerical_index_it->second; - if(num_tree->get_facet_indexes(seq_id_index_map)) { - for(const auto& kv : seq_id_index_map) { - fhash_index->upsert(kv.first, kv.second); - } - } - } - facet_index.has_hash_index = true; auto cardinality_ratio = total_num_docs / facet_count; diff --git a/src/index.cpp b/src/index.cpp index d268a173..d90407b5 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -636,24 +636,16 @@ void Index::index_field_in_memory(const field& afield, std::vector // b) `afield` value could be empty // non-geo faceted field should be indexed as faceted string field as well - bool non_string_facet_field = (afield.facet && !afield.is_geopoint()); + bool is_facet_field = (afield.facet && !afield.is_geopoint()); - if(afield.is_string() || non_string_facet_field) { + if(afield.is_string() || is_facet_field) { std::unordered_map> token_to_doc_offsets; int64_t max_score = INT64_MIN; std::unordered_map, facet_value_id_t::Hash> fvalue_to_seq_ids; std::unordered_map> seq_id_to_fvalues; - auto facet_count = 0; - if(afield.is_string()) { - facet_count = facet_index_v4->get_facet_count(afield.name); - } else { - auto numerical_index_it = numerical_index.find(afield.name); - if(numerical_index_it != numerical_index.end()) { - facet_count = numerical_index_it->second->counter_list_size(); - } - } + auto facet_count = facet_index_v4->get_facet_count(afield.name); #ifdef TEST_BUILD facet_count = FACET_INDEX_THRESHOLD + 1; @@ -661,8 +653,7 @@ void Index::index_field_in_memory(const field& afield, std::vector if(afield.facet) { size_t total_num_docs = seq_ids->num_ids(); - facet_index_v4->handle_index_change(afield.name, total_num_docs, FACET_INDEX_THRESHOLD, facet_count, - numerical_index); + facet_index_v4->handle_index_change(afield.name, total_num_docs, FACET_INDEX_THRESHOLD, facet_count); } for(const auto& record: iter_batch) { @@ -706,13 +697,14 @@ void Index::index_field_in_memory(const field& afield, std::vector } else if(afield.type == field_types::FLOAT_ARRAY) { float raw_val = field_values[i].get(); auto fhash = reinterpret_cast(raw_val); - facet_value_id_t facet_value_id(std::to_string(raw_val), fhash); + facet_value_id_t facet_value_id(StringUtils::float_to_str(raw_val), fhash); fvalue_to_seq_ids[facet_value_id].push_back(seq_id); seq_id_to_fvalues[seq_id].push_back(facet_value_id); } else if(afield.type == field_types::BOOL_ARRAY) { bool raw_val = field_values[i].get(); auto fhash = (uint32_t)raw_val; - facet_value_id_t facet_value_id(std::to_string(raw_val), fhash); + auto str_val = (raw_val == 1) ? "true" : "false"; + facet_value_id_t facet_value_id(str_val, fhash); fvalue_to_seq_ids[facet_value_id].push_back(seq_id); seq_id_to_fvalues[seq_id].push_back(facet_value_id); } @@ -740,14 +732,15 @@ void Index::index_field_in_memory(const field& afield, std::vector else if(afield.type == field_types::FLOAT) { float raw_val = document[afield.name].get(); auto fhash = reinterpret_cast(raw_val); - facet_value_id_t facet_value_id(std::to_string(raw_val), fhash); + facet_value_id_t facet_value_id(StringUtils::float_to_str(raw_val), fhash); fvalue_to_seq_ids[facet_value_id].push_back(seq_id); seq_id_to_fvalues[seq_id].push_back(facet_value_id); } else if(afield.type == field_types::BOOL) { bool raw_val = document[afield.name].get(); auto fhash = (uint32_t)raw_val; - facet_value_id_t facet_value_id(std::to_string(raw_val), fhash); + auto str_val = (raw_val == 1) ? "true" : "false"; + facet_value_id_t facet_value_id(str_val, fhash); fvalue_to_seq_ids[facet_value_id].push_back(seq_id); seq_id_to_fvalues[seq_id].push_back(facet_value_id); } @@ -1228,18 +1221,7 @@ void Index::do_facets(std::vector & facets, facet_query_t & facet_query, size_t mod_value = 100 / facet_sample_percent; - auto num_facet_values = 0; - - if(facet_field.is_string()) { - num_facet_values = facet_index_v4->get_facet_count(a_facet.field_name); - } else { - auto numerical_index_it = numerical_index.find(a_facet.field_name); - if(numerical_index_it != numerical_index.end()) { - num_facet_values = numerical_index_it->second->counter_list_size(); - } else { - LOG(ERROR) << "facet " << a_facet.field_name << " not found in numerical index"; - } - } + auto num_facet_values = facet_index_v4->get_facet_count(facet_field.name); bool use_hashes = false; @@ -1253,36 +1235,17 @@ void Index::do_facets(std::vector & facets, facet_query_t & facet_query, use_hashes = false; #endif bool is_wildcard_no_filter_query = is_wildcard_query && no_filters_provided; - bool facet_hash_index_exists = facet_index_v4->has_hash_index(a_facet.field_name); + bool facet_hash_index_exists = facet_index_v4->has_hash_index(facet_field.name); if((num_facet_values && ((!facet_hash_index_exists || is_wildcard_no_filter_query || num_facet_values > 50000) && group_limit == 0) && !use_hashes) || use_facet_intersection) { - //LOG(INFO) << "Using intersection to find facets"; + // LOG(INFO) << "Using intersection to find facets"; a_facet.is_intersected = true; std::map facet_results; - if(facet_field.is_string()) { - facet_index_v4->intersect(a_facet.field_name, result_ids, - results_size, max_facet_count, facet_results, is_wildcard_no_filter_query); - } else { - std::map facet_counts; - numerical_index.at(a_facet.field_name)->intersect(result_ids, results_size, max_facet_count, - facet_counts, is_wildcard_no_filter_query); - - for(const auto& kv : facet_counts) { - std::string val; - if(facet_field.is_float()) { - val = StringUtils::float_to_str(int64_t_to_float(kv.first)); - } else if(facet_field.is_bool()) { - val = kv.first == 1 ? "true" : "false"; - } else { - val = std::to_string(kv.first); - } - - facet_results[val] = kv.second; - } - } + facet_index_v4->intersect(facet_field.name, result_ids, + results_size, max_facet_count, facet_results, is_wildcard_no_filter_query); for(const auto& kv : facet_results) { //range facet processing diff --git a/src/num_tree.cpp b/src/num_tree.cpp index 30c2c754..59a5220e 100644 --- a/src/num_tree.cpp +++ b/src/num_tree.cpp @@ -382,76 +382,8 @@ size_t num_tree_t::size() { return int64map.size(); } -size_t num_tree_t::counter_list_size() const { - return counter_list.size(); -} - num_tree_t::~num_tree_t() { for(auto& kv: int64map) { ids_t::destroy_list(kv.second); } } - -size_t num_tree_t::intersect(const uint32_t* result_ids, size_t result_ids_len, size_t max_facet_count, - std::map& found, bool is_wildcard_no_filter_query) { - //LOG (INFO) << "intersecting field " << field; - - // LOG (INFO) << "int64map size " << int64map.size() - // << " , counts size " << counts.size(); - - std::vector id_list; - for(const auto& counter_list_it : counter_list) { - // LOG (INFO) << "checking ids in facet_value " << counter_list_it.facet_value - // << " having total count " << counter_list_it.count; - uint32_t count = 0; - - if(is_wildcard_no_filter_query) { - count = counter_list_it.count; - } else { - auto ids = int64map.at(counter_list_it.facet_value); - ids_t::uncompress(ids, id_list); - const auto ids_len = id_list.size(); - for(size_t i = 0; i < result_ids_len; ++i) { - uint32_t* out = nullptr; - count = ArrayUtils::and_scalar(id_list.data(), id_list.size(), - result_ids, result_ids_len, &out); - delete[] out; - } - id_list.clear(); - } - - if(count) { - found[counter_list_it.facet_value] = count; - if(found.size() == max_facet_count) { - break; - } - } - } - - return found.size(); -} - -size_t num_tree_t::get_facet_indexes(std::map>& seqid_countIndexes) { - - //check if facet field - if(counter_list.empty()) { - return 0; - } - - std::vector id_list; - - for(auto int64map_it = int64map.begin(); int64map_it != int64map.end(); ++int64map_it) { - - auto ids = int64map_it->second; - ids_t::uncompress(ids, id_list); - - // emplacing seq_id => facet_id - for(const auto& id : id_list) { - seqid_countIndexes[id].emplace_back(int64map_it->first); - } - - id_list.clear(); - } - - return seqid_countIndexes.size(); -}