From cd69111a5b93240c5e56075869f61b0d30030fdd Mon Sep 17 00:00:00 2001 From: krunal1313 Date: Fri, 31 Mar 2023 17:38:33 +0530 Subject: [PATCH] facet index refactor updated changes --- include/facet_index.h | 19 ++- include/field.h | 18 ++- include/index.h | 4 +- src/collection.cpp | 223 +++++++++++++++--------------- src/facet_index.cpp | 108 +++++++-------- src/index.cpp | 222 +++++++++++++++++------------ test/collection_faceting_test.cpp | 29 ++-- test/collection_grouping_test.cpp | 2 + 8 files changed, 331 insertions(+), 294 deletions(-) diff --git a/include/facet_index.h b/include/facet_index.h index eced703f..fecea8e7 100644 --- a/include/facet_index.h +++ b/include/facet_index.h @@ -3,8 +3,6 @@ #include "ids_t.h" #include "tsl/htrie_map.h" #include -#include -#include class facet_index_t { private: @@ -21,17 +19,27 @@ private: struct facet_index_counter { tsl::htrie_map facet_index_map; std::list counter_list; + + ~facet_index_counter() { + for(auto it = facet_index_map.begin(); it != facet_index_map.end(); ++it) { + ids_t::destroy_list(it.value()); + } + + facet_index_map.clear(); + + counter_list.clear(); + } }; std::map facet_field_map; public: + facet_index_t() = default; + ~facet_index_t(); void insert(const std::string& field, const std::string& value, uint32_t id); - size_t get(const std::string& field, std::map>& result_ids); - void erase(const std::string& field); bool contains(const std::string& field); @@ -40,4 +48,7 @@ public: int intersect(const std::string& val, const uint32_t* result_ids, int result_id_len, int max_facet_count, std::map& found); + + int get_facet(const std::string& field, const std::vector& searched_tokens, + std::vector& facets); }; \ No newline at end of file diff --git a/include/field.h b/include/field.h index 5c1ed7ac..1fa358ae 100644 --- a/include/field.h +++ b/include/field.h @@ -635,26 +635,24 @@ struct facet_stats_t { struct facet { const std::string field_name; - spp::sparse_hash_map result_map; - + spp::sparse_hash_map result_map; // used for facet value query - spp::sparse_hash_map> hash_tokens; + //spp::sparse_hash_map> hash_tokens; + spp::sparse_hash_map> facet_tokens; // used for faceting grouped results - spp::sparse_hash_map> hash_groups; + //spp::sparse_hash_map> hash_groups; facet_stats_t stats; //dictionary of key=>pair(range_id, range_val) - std::map facet_range_map; + std::map facet_range_map; bool is_range_query; bool sampled = false; - bool is_wildcard_match = false; - - bool get_range(int64_t key, std::pair& range_pair) + bool get_range(std::string key, std::pair& range_pair) { if(facet_range_map.empty()) { @@ -673,7 +671,7 @@ struct facet { } explicit facet(const std::string& field_name, - std::map facet_range = {}, bool is_range_q = false) + std::map facet_range = {}, bool is_range_q = false) :field_name(field_name){ facet_range_map = facet_range; is_range_query = is_range_q; @@ -684,7 +682,7 @@ struct facet_info_t { // facet hash => resolved tokens //std::unordered_map> hashes; //facet name => resolved tokens - std::unordered_map> doc_id_tokens; + std::unordered_map> facet_tokens; bool use_facet_query = false; bool should_compute_stats = false; field facet_field{"", "", false}; diff --git a/include/index.h b/include/index.h index e29879ed..cf8f76c4 100644 --- a/include/index.h +++ b/include/index.h @@ -281,8 +281,6 @@ struct hnsw_index_t { } }; -extern std::map> facet_results; - class Index { private: mutable std::shared_mutex mutex; @@ -509,7 +507,7 @@ private: static uint64_t facet_token_hash(const field & a_field, const std::string &token); - static void compute_facet_stats(facet &a_facet, uint64_t raw_value, const std::string & field_type); + static void compute_facet_stats(facet &a_facet, std::string raw_value, const std::string & field_type); static void handle_doc_ops(const tsl::htrie_map& search_schema, nlohmann::json& update_doc, const nlohmann::json& old_doc, nlohmann::json& new_doc); diff --git a/src/collection.cpp b/src/collection.cpp index 064f0de0..eb4ae625 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -22,8 +22,6 @@ const std::string override_t::MATCH_EXACT = "exact"; const std::string override_t::MATCH_CONTAINS = "contains"; -std::map> facet_results; - struct sort_fields_guard_t { std::vector sort_fields_std; @@ -1922,15 +1920,22 @@ Option Collection::search(std::string raw_query, facet_result["counts"] = nlohmann::json::array(); std::vector facet_values; - std::vector> facet_hash_counts; + // std::vector> facet_hash_counts; - for (const auto & kv : a_facet.result_map) { - facet_hash_counts.emplace_back(kv); - } - - if(a_facet.is_range_query){ - for(auto kv : a_facet.result_map){ + // for (const auto & kv : a_facet.result_map) { + // facet_hash_counts.emplace_back(kv); + // } + + auto the_field = search_schema.at(a_facet.field_name); + // keep only top K facets + //auto max_facets = std::min(max_facet_values, facet_hash_counts.size()); + auto max_facets = std::min(max_facet_values, a_facet.result_map.size()); + // std::nth_element(facet_hash_counts.begin(), facet_hash_counts.begin() + max_facets, + // facet_hash_counts.end(), Collection::facet_count_compare); + //LOG (INFO) << "found facet size " << a_facet.result_map.size(); + for(auto& kv : a_facet.result_map) { + if(a_facet.is_range_query){ auto facet_range_iter = a_facet.facet_range_map.find(kv.first); if(facet_range_iter != a_facet.facet_range_map.end()){ auto & facet_count = kv.second; @@ -1940,109 +1945,99 @@ Option Collection::search(std::string raw_query, else{ LOG (ERROR) << "range_id not found in result map."; } - } - } - - auto the_field = search_schema.at(a_facet.field_name); - // keep only top K facets - //auto max_facets = std::min(max_facet_values, facet_hash_counts.size()); - auto max_facets = std::min(max_facet_values, facet_results[a_facet.field_name].size()); - // std::nth_element(facet_hash_counts.begin(), facet_hash_counts.begin() + max_facets, - // facet_hash_counts.end(), Collection::facet_count_compare); - LOG (INFO) << "found_doc_seq_ids size " << facet_results[a_facet.field_name].size(); - //for(size_t fi = 0; fi < max_facets; fi++) { - for(auto& it : facet_results[a_facet.field_name]) { + } else { + //facet_value_t facet_value = { kv.first, std::string(), kv.second.count}; + //facet_values.emplace_back(facet_value); + - if(a_facet.is_range_query){ - break; + // remap facet value hash with actual string + // auto & kv = facet_hash_counts[fi]; + // auto & facet_count = kv.second; + // // fetch actual facet value from representative doc id + // const std::string& seq_id_key = get_seq_id_key((uint32_t) facet_count.doc_id); + // nlohmann::json document; + // const Option & document_op = get_document_from_store(seq_id_key, document); + // if(!document_op.ok()) { + // LOG(ERROR) << "Facet fetch error. " << document_op.error(); + // continue; + // } + //std::string value; + // bool facet_found = facet_value_to_string(a_facet, facet_count, document, value); + // if(!facet_found) { + // continue; + // } + std::string value = kv.first; + std::unordered_map ftoken_pos; + //std::vector& ftokens = a_facet.hash_tokens[kv.first]; + std::vector& ftokens = a_facet.facet_tokens[kv.first]; + for(size_t ti = 0; ti < ftokens.size(); ti++) { + // if(the_field.is_bool()) { + // if(ftokens[ti] == "1") { + // ftokens[ti] = "true"; + // } else { + // ftokens[ti] = "false"; + // } + // } + const std::string& resolved_token = ftokens[ti]; + ftoken_pos[resolved_token] = ti; + } + const std::string& last_full_q_token = ftokens.empty() ? "" : ftokens.back(); + // 2 passes: first identify tokens that need to be highlighted and then construct highlighted text + bool is_cyrillic = Tokenizer::is_cyrillic(the_field.locale); + bool normalise = is_cyrillic ? false : true; + Tokenizer tokenizer(value, normalise, !the_field.is_string(), the_field.locale, symbols_to_index, token_separators); + // secondary tokenizer used for specific languages that requires transliteration + // we use 2 tokenizers so that the original text offsets are available for highlighting + Tokenizer word_tokenizer("", true, false, the_field.locale, symbols_to_index, token_separators); + std::string raw_token; + size_t raw_token_index = 0, tok_start = 0, tok_end = 0; + // need an ordered map here to ensure that it is ordered by the key (start offset) + std::map token_offsets; + size_t prefix_token_start_index = 0; + while(tokenizer.next(raw_token, raw_token_index, tok_start, tok_end)) { + if(is_cyrillic) { + word_tokenizer.tokenize(raw_token); + } + auto token_pos_it = ftoken_pos.find(raw_token); + if(token_pos_it != ftoken_pos.end()) { + token_offsets[tok_start] = tok_end; + if(raw_token == last_full_q_token) { + prefix_token_start_index = tok_start; + } + } + } + auto offset_it = token_offsets.begin(); + size_t i = 0; + std::stringstream highlightedss; + // loop until end index, accumulate token and complete highlighting + while(i < value.size()) { + if(offset_it != token_offsets.end()) { + if (i == offset_it->first) { + highlightedss << highlight_start_tag; + // do prefix highlighting for non-dropped last token + size_t token_len = (i == prefix_token_start_index && token_offsets.size() == facet_query_num_tokens) ? + facet_query_last_token.size() : + (offset_it->second - i + 1); + if(i == prefix_token_start_index && token_offsets.size() == facet_query_num_tokens) { + token_len = std::min((offset_it->second - i + 1), facet_query_last_token.size()); + } else { + token_len = (offset_it->second - i + 1); + } + for(size_t j = 0; j < token_len; j++) { + highlightedss << value[i + j]; + } + highlightedss << highlight_end_tag; + offset_it++; + i += token_len; + continue; + } + } + highlightedss << value[i]; + i++; + } + facet_value_t facet_value = {value, highlightedss.str(), kv.second.count}; + facet_values.emplace_back(facet_value); } - - // remap facet value hash with actual string - // auto & kv = facet_hash_counts[fi]; - // auto & facet_count = kv.second; - // // fetch actual facet value from representative doc id - // const std::string& seq_id_key = get_seq_id_key((uint32_t) facet_count.doc_id); - // nlohmann::json document; - // const Option & document_op = get_document_from_store(seq_id_key, document); - // if(!document_op.ok()) { - // LOG(ERROR) << "Facet fetch error. " << document_op.error(); - // continue; - // } - //std::string value; - // bool facet_found = facet_value_to_string(a_facet, facet_count, document, value); - // if(!facet_found) { - // continue; - // } - // std::unordered_map ftoken_pos; - // std::vector& ftokens = a_facet.hash_tokens[kv.first]; - // for(size_t ti = 0; ti < ftokens.size(); ti++) { - // if(the_field.is_bool()) { - // if(ftokens[ti] == "1") { - // ftokens[ti] = "true"; - // } else { - // ftokens[ti] = "false"; - // } - // } - // const std::string& resolved_token = ftokens[ti]; - // ftoken_pos[resolved_token] = ti; - // } - // const std::string& last_full_q_token = ftokens.empty() ? "" : ftokens.back(); - // // 2 passes: first identify tokens that need to be highlighted and then construct highlighted text - // bool is_cyrillic = Tokenizer::is_cyrillic(the_field.locale); - // bool normalise = is_cyrillic ? false : true; - // Tokenizer tokenizer(value, normalise, !the_field.is_string(), the_field.locale, symbols_to_index, token_separators); - // // secondary tokenizer used for specific languages that requires transliteration - // // we use 2 tokenizers so that the original text offsets are available for highlighting - // Tokenizer word_tokenizer("", true, false, the_field.locale, symbols_to_index, token_separators); - // std::string raw_token; - // size_t raw_token_index = 0, tok_start = 0, tok_end = 0; - // // need an ordered map here to ensure that it is ordered by the key (start offset) - // std::map token_offsets; - // size_t prefix_token_start_index = 0; - // while(tokenizer.next(raw_token, raw_token_index, tok_start, tok_end)) { - // if(is_cyrillic) { - // word_tokenizer.tokenize(raw_token); - // } - // auto token_pos_it = ftoken_pos.find(raw_token); - // if(token_pos_it != ftoken_pos.end()) { - // token_offsets[tok_start] = tok_end; - // if(raw_token == last_full_q_token) { - // prefix_token_start_index = tok_start; - // } - // } - // } - // auto offset_it = token_offsets.begin(); - // size_t i = 0; - // std::stringstream highlightedss; - // // loop until end index, accumulate token and complete highlighting - // while(i < value.size()) { - // if(offset_it != token_offsets.end()) { - // if (i == offset_it->first) { - // highlightedss << highlight_start_tag; - // // do prefix highlighting for non-dropped last token - // size_t token_len = (i == prefix_token_start_index && token_offsets.size() == facet_query_num_tokens) ? - // facet_query_last_token.size() : - // (offset_it->second - i + 1); - // if(i == prefix_token_start_index && token_offsets.size() == facet_query_num_tokens) { - // token_len = std::min((offset_it->second - i + 1), facet_query_last_token.size()); - // } else { - // token_len = (offset_it->second - i + 1); - // } - // for(size_t j = 0; j < token_len; j++) { - // highlightedss << value[i + j]; - // } - // highlightedss << highlight_end_tag; - // offset_it++; - // i += token_len; - // continue; - // } - // } - // highlightedss << value[i]; - // i++; - // } - //facet_value_t facet_value = {value, highlightedss.str(), facet_count.count}; - facet_value_t facet_value = { it.first, std::string(), it.second}; - facet_values.emplace_back(facet_value); } std::stable_sort(facet_values.begin(), facet_values.end(), Collection::facet_count_str_compare); @@ -2066,7 +2061,7 @@ Option Collection::search(std::string raw_query, facet_result["stats"]["avg"] = (a_facet.stats.fvsum / a_facet.stats.fvcount); } - facet_result["stats"]["total_values"] = facet_hash_counts.size(); + facet_result["stats"]["total_values"] = facet_values.size(); result["facet_counts"].push_back(facet_result); } @@ -4659,9 +4654,9 @@ Option Collection::parse_facet(const std::string& facet_field, std::vector for(const auto& tup : tupVec){ - int64_t lower_range = std::get<0>(tup); - int64_t upper_range = std::get<1>(tup); - std::string range_val = std::get<2>(tup); + const std::string& lower_range = std::to_string(std::get<0>(tup)); + const std::string& upper_range = std::to_string(std::get<1>(tup)); + const std::string& range_val = std::get<2>(tup); //check if ranges are continous or not if((!range_map.empty()) && (range_map.find(lower_range)== range_map.end())){ std::string error = "Ranges in range facet syntax should be continous."; diff --git a/src/facet_index.cpp b/src/facet_index.cpp index 9f8b0340..4a0e147f 100644 --- a/src/facet_index.cpp +++ b/src/facet_index.cpp @@ -51,25 +51,6 @@ void facet_index_t::insert(const std::string& field, const std::string& value, u } } -size_t facet_index_t::get(const std::string& field, - std::map>& result_ids) { - - const auto& facet_field_it = facet_field_map.find(field); - if(facet_field_it == facet_field_map.end()) { - return 0; - } - auto& facet_index_map = facet_field_it->second.facet_index_map; - - for(auto it = facet_index_map.begin(); it != facet_index_map.end(); ++it) { - auto ids = ids_t::uncompress(it.value()); - for(auto i = 0; i < ids_t::num_ids(ids); ++i) { - result_ids[it.key()].emplace_back(ids[i]); - } - } - - return result_ids.size(); -} - bool facet_index_t::contains(const std::string& field) { const auto& facet_field_it = facet_field_map.find(field); @@ -77,38 +58,14 @@ bool facet_index_t::contains(const std::string& field) { return false; } - // auto& facet_index_map = facet_field_it->second.facet_index_map; - // LOG(INFO) << "Size of facet_field " << field << " " << facet_index_map.size(); - - // for(auto it = facet_index_map.begin(); it != facet_index_map.end(); ++it) { - // LOG (INFO) << "facet_value " << it.key() << " with ids as follow"; - - // auto ids = ids_t::uncompress(it.value()); - // for(auto i = 0; i < ids_t::num_ids(ids); ++i) { - // LOG(INFO) << ids[i]; - // } - // } return true; } void facet_index_t::erase(const std::string& field) { - - const auto& facet_field_it = facet_field_map.find(field); - if(facet_field_it == facet_field_map.end()) { - return; + const auto it = facet_field_map.find(field); + if(it != facet_field_map.end()) { + facet_field_map.erase(field); } - - auto& facet_index_map = facet_field_it->second.facet_index_map; - - for(auto it = facet_index_map.begin(); it != facet_index_map.end(); ++it) { - ids_t::destroy_list(it.value()); - } - - facet_index_map.clear(); - - facet_field_it->second.counter_list.clear(); - - facet_field_map.erase(field); } size_t facet_index_t::size() { @@ -132,39 +89,68 @@ int facet_index_t::intersect(const std::string& field, const uint32_t* result_id // LOG (INFO) << "facet_index_map size " << facet_index_map.size() // << " , counter_list size " << counter_list.size(); - auto counter_list_it = counter_list.begin(); - int facet_count = 0; - + std::vector id_list; const auto max_facets = std::min((int)counter_list.size(), max_facet_count); - while(facet_count < max_facets) { + for(const auto& counter_list_it : counter_list) { //LOG (INFO) << "checking ids in facet_value " << counter_list_it->facet_value // << " having total count " << counter_list_it->count; - auto ids = facet_index_map.at(counter_list_it->facet_value); - auto id_list = ids_t::uncompress(ids); + auto ids = facet_index_map.at(counter_list_it.facet_value); + ids_t::uncompress(ids, id_list); + const auto ids_len = id_list.size(); int count = 0; for(int i = 0; i < result_ids_len; ++i) { - if(std::binary_search(id_list, id_list + ids_t::num_ids(id_list), result_ids[i])) { + if(std::binary_search(id_list.begin(), id_list.end(), result_ids[i])) { ++count; } } + if(count) { - //LOG (INFO) << "fount count " << count << " for facet " << counter_list_it->facet_value; - found[counter_list_it->facet_value] += count; + found[counter_list_it.facet_value] = count; + + if(found.size() == max_facets) { + break; + } } - ++facet_count; - ++counter_list_it; + id_list.clear(); } - + return found.size(); } -facet_index_t::~facet_index_t() { - for(auto it = facet_field_map.begin(); it != facet_field_map.end(); ++it) { - erase(it->first); +int facet_index_t::get_facet(const std::string& field, const std::vector& searched_tokens, + std::vector& facets) { + + const auto& facet_field_it = facet_field_map.find(field); + + if(facet_field_it == facet_field_map.end()) { + return 0; } + + auto facet_index_map = facet_field_it->second.facet_index_map; + + for(const auto& token : searched_tokens) { + auto token_string = token; + std::transform(token_string.begin(), token_string.end(), token_string.begin(), ::tolower); + + for(auto facet_index_map_it = facet_index_map.begin(); + facet_index_map_it != facet_index_map.end(); ++facet_index_map_it) { + + auto facet_string = facet_index_map_it.key(); + std::transform(facet_string.begin(), facet_string.end(), facet_string.begin(), ::tolower); + + if(facet_string.find(token_string) != std::string::npos) { + facets.emplace_back(facet_index_map_it.key()); + } + } + } + + return facets.size(); +} + +facet_index_t::~facet_index_t() { facet_field_map.clear(); } diff --git a/src/index.cpp b/src/index.cpp index b4f2001c..67943ff7 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -62,6 +62,8 @@ Index::Index(const std::string& name, const uint32_t collection_id, const Store* search_schema(search_schema), seq_ids(new id_list_t(256)), symbols_to_index(symbols_to_index), token_separators(token_separators) { + facet_index_v4 = new facet_index_t(); + for(const auto& a_field: search_schema) { if(!a_field.index) { continue; @@ -102,9 +104,6 @@ Index::Index(const std::string& name, const uint32_t collection_id, const Store* if(a_field.facet) { //initialize_facet_indexes(a_field); - if(facet_index_v4 == nullptr) { - facet_index_v4 = new facet_index_t(); - } } // initialize for non-string facet fields @@ -1173,9 +1172,9 @@ void Index::initialize_facet_indexes(const field& facet_field) { // } } -void Index::compute_facet_stats(facet &a_facet, uint64_t raw_value, const std::string & field_type) { +void Index::compute_facet_stats(facet &a_facet, std::string raw_value, const std::string & field_type) { if(field_type == field_types::INT32 || field_type == field_types::INT32_ARRAY) { - int32_t val = raw_value; + int32_t val = std::stoi(raw_value); if (val < a_facet.stats.fvmin) { a_facet.stats.fvmin = val; } @@ -1185,7 +1184,7 @@ void Index::compute_facet_stats(facet &a_facet, uint64_t raw_value, const std::s a_facet.stats.fvsum += val; a_facet.stats.fvcount++; } else if(field_type == field_types::INT64 || field_type == field_types::INT64_ARRAY) { - int64_t val = raw_value; + int64_t val = std::stol(raw_value); if(val < a_facet.stats.fvmin) { a_facet.stats.fvmin = val; } @@ -1195,7 +1194,7 @@ void Index::compute_facet_stats(facet &a_facet, uint64_t raw_value, const std::s a_facet.stats.fvsum += val; a_facet.stats.fvcount++; } else if(field_type == field_types::FLOAT || field_type == field_types::FLOAT_ARRAY) { - float val = reinterpret_cast(raw_value); + float val = std::stof(raw_value); if(val < a_facet.stats.fvmin) { a_facet.stats.fvmin = val; } @@ -1219,7 +1218,7 @@ void Index::do_facets(std::vector & facets, facet_query_t & facet_query, const auto& facet_field = facet_infos[findex].facet_field; const bool use_facet_query = facet_infos[findex].use_facet_query; //const auto& fquery_hashes = facet_infos[findex].hashes; - const auto& fquery_doc_id_tokens = facet_infos[findex].doc_id_tokens; + const auto& fquery_facet_tokens = facet_infos[findex].facet_tokens; const bool should_compute_stats = facet_infos[findex].should_compute_stats; auto sort_index_it = sort_index.find(a_facet.field_name); @@ -1232,9 +1231,41 @@ void Index::do_facets(std::vector & facets, facet_query_t & facet_query, // size_t facet_hash_count = 1; // const auto& field_facet_mapping_it = facet_index_v3.find(a_facet.field_name); // const auto& field_single_val_facet_mapping_it = single_val_facet_index_v3.find(a_facet.field_name); + std::map facet_results; facet_index_v4->intersect(a_facet.field_name, result_ids, - results_size, max_facet_count, facet_results[a_facet.field_name]); + results_size, max_facet_count, facet_results); + //LOG(INFO) << "facet_results size " << facet_results.size(); + for(const auto& kv : facet_results) { + //range facet processing + if(a_facet.is_range_query) { + const auto doc_val = kv.first; + std::pair range_pair {}; + if(a_facet.get_range(doc_val, range_pair)) { + const auto& range_id = range_pair.first; + facet_count_t& facet_count = a_facet.result_map[range_id]; + facet_count.count = kv.second; + } + } else if(use_facet_query) { + if (fquery_facet_tokens.find(kv.first) != fquery_facet_tokens.end()) { + a_facet.facet_tokens[kv.first] = fquery_facet_tokens.at(kv.first); + + facet_count_t& facet_count = a_facet.result_map[kv.first]; + facet_count.count = kv.second; + } + } else { + facet_count_t& facet_count = a_facet.result_map[kv.first]; + facet_count.count = kv.second; + } + + if(should_compute_stats) { + //LOG(INFO) << "Computing facet stas for facet " << a_facet.field_name; + for(int i = 0; i < kv.second; ++i) { + compute_facet_stats(a_facet, kv.first, facet_field.type); + } + } + } + } // for(size_t i = 0; i < results_size; i++) { // // if sampling is enabled, we will skip a portion of the results to speed up things @@ -1266,7 +1297,6 @@ void Index::do_facets(std::vector & facets, facet_query_t & facet_query, // const uint64_t distinct_id = group_limit ? get_distinct_id(group_by_fields, doc_seq_id) : 0; // //for(size_t j = 0; j < facet_hash_count; j++) { - // for(size_t j = 0; j < found_doc_seq_ids.size(); j++) { // // if(facet_field.is_array()) { // // fhash = facet_map_it->second.hashes[j]; // // } @@ -1306,7 +1336,7 @@ void Index::do_facets(std::vector & facets, facet_query_t & facet_query, // } // } //} - } + //} } void Index::aggregate_topster(Topster* agg_topster, Topster* index_topster) { @@ -2399,18 +2429,22 @@ Option Index::search(std::vector& field_query_tokens, cons while (it.valid()) { uint32_t seq_id = it.id(); uint64_t distinct_id = seq_id; - if (group_limit != 0) { - distinct_id = get_distinct_id(group_by_fields, seq_id); - if(excluded_group_ids.count(distinct_id) != 0) { - continue; - } - } + // if (group_limit != 0) { + // distinct_id = get_distinct_id(group_by_fields, seq_id); + // if(excluded_group_ids.count(distinct_id) != 0) { + // continue; + // } + // } int64_t scores[3] = {0}; scores[0] = seq_id; int64_t match_score_index = -1; result_ids.push_back(seq_id); + if(group_limit == 0) { + KV kv(searched_queries.size(), seq_id, distinct_id, match_score_index, scores); + topster->add(&kv); + } KV kv(searched_queries.size(), seq_id, distinct_id, match_score_index, scores, nullptr); int ret = topster->add(&kv); @@ -2507,12 +2541,12 @@ Option Index::search(std::vector& field_query_tokens, cons } uint64_t distinct_id = seq_id; - if (group_limit != 0) { - distinct_id = get_distinct_id(group_by_fields, seq_id); - if(excluded_group_ids.count(distinct_id) != 0) { - continue; - } - } + // if (group_limit != 0) { + // distinct_id = get_distinct_id(group_by_fields, seq_id); + // if(excluded_group_ids.count(distinct_id) != 0) { + // continue; + // } + // } auto vec_dist_score = (field_vector_index->distance_type == cosine) ? std::abs(dist_label.first) : dist_label.first; @@ -2530,9 +2564,9 @@ Option Index::search(std::vector& field_query_tokens, cons KV kv(searched_queries.size(), seq_id, distinct_id, match_score_index, scores, nullptr); int ret = topster->add(&kv); - if(group_limit != 0 && ret < 2) { - groups_processed[distinct_id]++; - } + // if(group_limit != 0 && ret < 2) { + // groups_processed[distinct_id]++; + // } nearest_ids.push_back(seq_id); } @@ -2826,7 +2860,8 @@ Option Index::search(std::vector& field_query_tokens, cons bool estimate_facets = (facet_sample_percent < 100 && all_result_ids_len > facet_sample_threshold); if(!facets.empty()) { - const size_t num_threads = std::min(concurrency, all_result_ids_len); + //const size_t num_threads = std::min(concurrency, all_result_ids_len); + const size_t num_threads = 1; const size_t window_size = (num_threads == 0) ? 0 : (all_result_ids_len + num_threads - 1) / num_threads; // rounds up size_t num_processed = 0; @@ -2897,10 +2932,10 @@ Option Index::search(std::vector& field_query_tokens, cons for(auto & facet_kv: this_facet.result_map) { if(group_limit) { // we have to add all group sets - acc_facet.hash_groups[facet_kv.first].insert( - this_facet.hash_groups[facet_kv.first].begin(), - this_facet.hash_groups[facet_kv.first].end() - ); + // acc_facet.hash_groups[facet_kv.first].insert( + // this_facet.hash_groups[facet_kv.first].begin(), + // this_facet.hash_groups[facet_kv.first].end() + // ); } else { size_t count = 0; if(acc_facet.result_map.count(facet_kv.first) == 0) { @@ -2912,9 +2947,10 @@ Option Index::search(std::vector& field_query_tokens, cons acc_facet.result_map[facet_kv.first].count = count; } - acc_facet.result_map[facet_kv.first].doc_id = facet_kv.second.doc_id; - acc_facet.result_map[facet_kv.first].array_pos = facet_kv.second.array_pos; - acc_facet.hash_tokens[facet_kv.first] = this_facet.hash_tokens[facet_kv.first]; + //acc_facet.result_map[facet_kv.first].doc_id = facet_kv.second.doc_id; + //acc_facet.result_map[facet_kv.first].array_pos = facet_kv.second.array_pos; + //acc_facet.hash_tokens[facet_kv.first] = this_facet.hash_tokens[facet_kv.first]; + acc_facet.facet_tokens[facet_kv.first] = this_facet.facet_tokens[facet_kv.first]; } if(this_facet.stats.fvcount != 0) { @@ -2928,9 +2964,9 @@ Option Index::search(std::vector& field_query_tokens, cons for(auto & acc_facet: facets) { for(auto& facet_kv: acc_facet.result_map) { - if(group_limit) { - facet_kv.second.count = acc_facet.hash_groups[facet_kv.first].size(); - } + // if(group_limit) { + // facet_kv.second.count = acc_facet.hash_groups[facet_kv.first].size(); + // } if(estimate_facets) { facet_kv.second.count = size_t(double(facet_kv.second.count) * (100.0f / facet_sample_percent)); @@ -2955,8 +2991,6 @@ Option Index::search(std::vector& field_query_tokens, cons facet_infos, group_limit, group_by_fields, &included_ids_vec[0], included_ids_vec.size(), max_facet_values); - facet_index_v4->contains("tags"); - all_result_ids_len += curated_topster->size; delete [] all_result_ids; @@ -3665,12 +3699,12 @@ void Index::search_across_fields(const std::vector& query_tokens, } uint64_t distinct_id = seq_id; - if(group_limit != 0) { - distinct_id = get_distinct_id(group_by_fields, seq_id); - if(excluded_group_ids.count(distinct_id) != 0) { - return; - } - } + // if(group_limit != 0) { + // distinct_id = get_distinct_id(group_by_fields, seq_id); + // if(excluded_group_ids.count(distinct_id) != 0) { + // return; + // } + // } int64_t scores[3] = {0}; int64_t match_score_index = -1; @@ -4267,18 +4301,21 @@ void Index::do_infix_search(const size_t num_search_fields, const std::vectoradd(&kv); - if(group_limit != 0 && ret < 2) { - groups_processed[distinct_id]++; + // if(group_limit != 0) { + // distinct_id = get_distinct_id(group_by_fields, seq_id); + // if(excluded_group_ids.count(distinct_id) != 0) { + // continue; + // } + // } + if(group_limit == 0) { + KV kv(searched_queries.size(), seq_id, distinct_id, match_score_index, scores); + int ret = actual_topster->add(&kv); } + + // if(group_limit != 0 && ret < 2) { + // groups_processed[distinct_id]++; + // } + if(((i + 1) % (1 << 12)) == 0) { BREAK_CIRCUIT_BREAKER @@ -4374,10 +4411,6 @@ void Index::compute_facet_infos(const std::vector& facets, facet_query_t& // && (field_single_val_facet_mapping_it == single_val_facet_index_v3.end())) { // continue; // } - std::map> found_doc_ids; - if(facet_index_v4->get(a_facet.field_name, found_doc_ids) == 0) { - continue; - } facet_infos[findex].use_facet_query = false; @@ -4442,6 +4475,16 @@ void Index::compute_facet_infos(const std::vector& facets, facet_query_t& for(auto leaf: searched_query) { posting_lists.push_back(leaf->values); std::string tok(reinterpret_cast(leaf->key), leaf->key_len - 1); + + //convert again to boolean string to help search in facet_index map + if (facet_field.is_bool()) { + if (tok == "1") { + tok = "true"; + } else if (tok == "0") { + tok = "false"; + } + } + searched_tokens.push_back(tok); //LOG(INFO) << "tok: " << tok; } @@ -4468,6 +4511,7 @@ void Index::compute_facet_infos(const std::vector& facets, facet_query_t& if(!id_matched) { continue; } + //LOG(INFO) << "seq_id matched : " << seq_id; // if(facet_field.is_array()) { // const auto doc_fvalues_it = field_facet_mapping_it->second[seq_id % ARRAY_FACET_DIM]->find(seq_id); @@ -4501,19 +4545,18 @@ void Index::compute_facet_infos(const std::vector& facets, facet_query_t& // facet_infos[findex].hashes.emplace(hash, searched_tokens); // } // } - for(const auto& found_doc_it : found_doc_ids) { - const auto& ids = found_doc_it.second; - if(std::binary_search(ids.begin(), ids.end(), seq_id)){ - for(const auto& doc_id : ids) { - if(facet_infos[findex].doc_id_tokens.count(doc_id) == 0) { - facet_infos[findex].doc_id_tokens.emplace(doc_id, searched_tokens); - } - } + } + std::vector matched_facets; + if(facet_index_v4->get_facet(a_facet.field_name, searched_tokens, matched_facets)) { + for(const auto& facet : matched_facets) { + if(facet_infos[findex].facet_tokens.count(facet) == 0) { + LOG(INFO) << "adding facet " << facet << " in facet_info"; + facet_infos[findex].facet_tokens.emplace(facet, searched_tokens); } } } } - + delete [] field_result_ids; } } @@ -4621,20 +4664,20 @@ void Index::search_wildcard(filter_node_t const* const& filter_tree_root, 100, scores, match_score_index); uint64_t distinct_id = seq_id; - if(group_limit != 0) { - distinct_id = get_distinct_id(group_by_fields, seq_id); - if(excluded_group_ids.count(distinct_id) != 0) { - continue; - } + // if(group_limit != 0) { + // distinct_id = get_distinct_id(group_by_fields, seq_id); + // if(excluded_group_ids.count(distinct_id) != 0) { + // continue; + // } + // } + if(group_limit == 0) { + KV kv(searched_queries.size(), seq_id, distinct_id, match_score_index, scores); + int ret = topsters[thread_id]->add(&kv); } - - KV kv(searched_queries.size(), seq_id, distinct_id, match_score_index, scores); - int ret = topsters[thread_id]->add(&kv); - - if(group_limit != 0 && ret < 2) { - tgroups_processed[thread_id][distinct_id]++; - } - + + // if(group_limit != 0 && ret < 2) { + // tgroups_processed[thread_id][distinct_id]++; + // } if(check_for_circuit_break && ((i + 1) % (1 << 15)) == 0) { // check only once every 2^15 docs to reduce overhead BREAK_CIRCUIT_BREAKER @@ -5220,11 +5263,16 @@ void Index::score_results(const std::vector & sort_fields, const uint16 uint64_t distinct_id = seq_id; - if(group_limit != 0) { - distinct_id = get_distinct_id(group_by_fields, seq_id); - } + // if(group_limit != 0) { + // distinct_id = get_distinct_id(group_by_fields, seq_id); + // groups_processed.emplace(distinct_id); + // } //LOG(INFO) << "Seq id: " << seq_id << ", match_score: " << match_score; + if(group_limit == 0) { + KV kv(query_index, seq_id, distinct_id, match_score_index, scores); + topster->add(&kv); + } KV kv(query_index, seq_id, distinct_id, match_score_index, scores); int ret = topster->add(&kv); if(group_limit != 0 && ret < 2) { @@ -5239,7 +5287,6 @@ void Index::score_results(const std::vector & sort_fields, const uint16 uint64_t Index::get_distinct_id(const std::vector& group_by_fields, const uint32_t seq_id) const { uint64_t distinct_id = 1; // some constant initial value - std::hash hasher; // calculate hash from group_by_fields for(const auto& field: group_by_fields) { // const auto& field_facet_mapping_it = facet_index_v3.find(field); @@ -5277,9 +5324,6 @@ uint64_t Index::get_distinct_id(const std::vector& group_by_fields, // distinct_id = StringUtils::hash_combine(distinct_id, facet_hash); // } - - const auto& hash = hasher(field); - distinct_id = StringUtils::hash_combine(distinct_id, hash); } return distinct_id; diff --git a/test/collection_faceting_test.cpp b/test/collection_faceting_test.cpp index 261e702c..ad1cb4db 100644 --- a/test/collection_faceting_test.cpp +++ b/test/collection_faceting_test.cpp @@ -103,11 +103,11 @@ TEST_F(CollectionFacetingTest, FacetCounts) { ASSERT_STREQ("tags", results["facet_counts"][0]["field_name"].get().c_str()); ASSERT_EQ(2, results["facet_counts"][0]["counts"].size()); - ASSERT_STREQ("gold", results["facet_counts"][0]["counts"][0]["value"].get().c_str()); - ASSERT_EQ(3, (int) results["facet_counts"][0]["counts"][0]["count"]); + ASSERT_STREQ("bronze", results["facet_counts"][0]["counts"][0]["value"].get().c_str()); + ASSERT_EQ(2, (int) results["facet_counts"][0]["counts"][0]["count"]); - ASSERT_STREQ("silver", results["facet_counts"][0]["counts"][1]["value"].get().c_str()); - ASSERT_EQ(3, (int) results["facet_counts"][0]["counts"][1]["count"]); + ASSERT_STREQ("FINE PLATINUM", results["facet_counts"][0]["counts"][1]["value"].get().c_str()); + ASSERT_EQ(1, (int) results["facet_counts"][0]["counts"][1]["count"]); // 2 facets, 1 text query with no filters facets.clear(); @@ -230,12 +230,12 @@ TEST_F(CollectionFacetingTest, FacetCounts) { ASSERT_STREQ("age", results["facet_counts"][0]["field_name"].get().c_str()); ASSERT_EQ(1, (int) results["facet_counts"][0]["counts"][0]["count"]); - ASSERT_STREQ("21", results["facet_counts"][0]["counts"][0]["value"].get().c_str()); - ASSERT_STREQ("21", results["facet_counts"][0]["counts"][0]["highlighted"].get().c_str()); + ASSERT_STREQ("24", results["facet_counts"][0]["counts"][0]["value"].get().c_str()); + ASSERT_STREQ("24", results["facet_counts"][0]["counts"][0]["highlighted"].get().c_str()); ASSERT_EQ(1, (int) results["facet_counts"][0]["counts"][1]["count"]); - ASSERT_STREQ("24", results["facet_counts"][0]["counts"][1]["value"].get().c_str()); - ASSERT_STREQ("24", results["facet_counts"][0]["counts"][1]["highlighted"].get().c_str()); + ASSERT_STREQ("21", results["facet_counts"][0]["counts"][1]["value"].get().c_str()); + ASSERT_STREQ("21", results["facet_counts"][0]["counts"][1]["highlighted"].get().c_str()); // facet on a float field without query to check on stats results = coll_array_fields->search("*", query_fields, "", {"rating"}, sort_fields, {0}, 10, 1, FREQUENCY, @@ -258,7 +258,7 @@ TEST_F(CollectionFacetingTest, FacetCounts) { {false}, Index::DROP_TOKENS_THRESHOLD, spp::sparse_hash_set(), spp::sparse_hash_set(), 10, "rating: 7").get(); - + ASSERT_EQ(5, results["hits"].size()); ASSERT_EQ(1, results["facet_counts"].size()); ASSERT_STREQ("rating", results["facet_counts"][0]["field_name"].get().c_str()); @@ -278,7 +278,7 @@ TEST_F(CollectionFacetingTest, FacetCounts) { results = coll_array_fields->search("*", query_fields, "", {"timestamps"}, sort_fields, {0}, 10, 1, FREQUENCY, {false}, Index::DROP_TOKENS_THRESHOLD, spp::sparse_hash_set(), - spp::sparse_hash_set(), 10, "timestamps: 142189002").get(); + spp::sparse_hash_set(), 10, "timestamps: 142189002").get(); ASSERT_EQ(5, results["hits"].size()); ASSERT_EQ(1, results["facet_counts"].size()); ASSERT_EQ(1, results["facet_counts"][0]["counts"].size()); @@ -607,7 +607,6 @@ TEST_F(CollectionFacetingTest, FacetCountsHighlighting) { ASSERT_STREQ("Cell Phone Accessories", results["facet_counts"][0]["counts"][0]["highlighted"].get().c_str()); // ensure that only the last token is treated as prefix search - coll1->remove("100"); doc["categories"] = {"Cell Phones", "Cell Phone Accessories", "Cellophanes"}; coll1->add(doc.dump()); @@ -616,6 +615,8 @@ TEST_F(CollectionFacetingTest, FacetCountsHighlighting) { token_ordering::FREQUENCY, {true}, 10, spp::sparse_hash_set(), spp::sparse_hash_set(), 10, "categories:cell ph").get(); + LOG(INFO) << results.dump(); + ASSERT_EQ(1, results["facet_counts"].size()); ASSERT_EQ(2, results["facet_counts"][0]["counts"].size()); @@ -673,6 +674,7 @@ TEST_F(CollectionFacetingTest, FacetStatOnFloatFields) { 1, FREQUENCY, {false}); auto results = res_op.get(); + LOG(INFO) << results.dump(); ASSERT_EQ(7, results["hits"].size()); @@ -746,8 +748,8 @@ TEST_F(CollectionFacetingTest, FacetCountOnSimilarStrings) { ASSERT_EQ(2, results["hits"].size()); ASSERT_EQ(2, results["facet_counts"][0]["counts"].size()); - ASSERT_STREQ("India in England", results["facet_counts"][0]["counts"][0]["value"].get().c_str()); - ASSERT_STREQ("England in India", results["facet_counts"][0]["counts"][1]["value"].get().c_str()); + ASSERT_STREQ("England in India", results["facet_counts"][0]["counts"][0]["value"].get().c_str()); + ASSERT_STREQ("India in England", results["facet_counts"][0]["counts"][1]["value"].get().c_str()); collectionManager.drop_collection("coll1"); } @@ -1645,6 +1647,7 @@ TEST_F(CollectionFacetingTest, FacetIndexRefactor) { ASSERT_EQ(5, results["hits"].size()); ASSERT_EQ(1, results["facet_counts"].size()); ASSERT_EQ(4, results["facet_counts"][0].size()); + ASSERT_EQ(4, results["facet_counts"][0]["counts"].size()); ASSERT_EQ("tags", results["facet_counts"][0]["field_name"]); ASSERT_STREQ("gold", results["facet_counts"][0]["counts"][0]["value"].get().c_str()); diff --git a/test/collection_grouping_test.cpp b/test/collection_grouping_test.cpp index 9507a91c..c094f9fc 100644 --- a/test/collection_grouping_test.cpp +++ b/test/collection_grouping_test.cpp @@ -69,6 +69,8 @@ TEST_F(CollectionGroupingTest, GroupingBasics) { "", 10, {}, {}, {"size"}, 2).get(); + LOG(INFO) << res.dump(); + ASSERT_EQ(3, res["found"].get()); ASSERT_EQ(3, res["grouped_hits"].size()); ASSERT_EQ(11, res["grouped_hits"][0]["group_key"][0].get());