diff --git a/include/collection.h b/include/collection.h index 692e4142..1f948175 100644 --- a/include/collection.h +++ b/include/collection.h @@ -187,9 +187,8 @@ private: std::vector& new_fields, bool enable_nested_fields); - static bool facet_count_compare(const std::pair& a, - const std::pair& b) { - return std::tie(a.second.count, a.first) > std::tie(b.second.count, b.first); + static bool facet_count_compare(const facet_count_t& a, const facet_count_t& b) { + return std::tie(a.count, a.fhash) > std::tie(b.count, b.fhash); } static bool facet_count_str_compare(const facet_value_t& a, diff --git a/include/field.h b/include/field.h index c1cfe1aa..a051c7ee 100644 --- a/include/field.h +++ b/include/field.h @@ -570,6 +570,10 @@ public: struct facet_count_t { uint32_t count = 0; + // for value based faceting, actual value is stored here + std::string fvalue; + // for hash based faceting, hash value is stored here + int64_t fhash; // used to fetch the actual document and value for representation uint32_t doc_id = 0; uint32_t array_pos = 0; @@ -584,9 +588,12 @@ struct facet_stats_t { struct facet { const std::string field_name; - spp::sparse_hash_map result_map; + spp::sparse_hash_map result_map; + spp::sparse_hash_map value_result_map; + // used for facet value query - spp::sparse_hash_map> hash_tokens; + spp::sparse_hash_map> fvalue_tokens; + spp::sparse_hash_map> hash_tokens; // used for faceting grouped results spp::sparse_hash_map> hash_groups; @@ -594,7 +601,7 @@ struct facet { facet_stats_t stats; //dictionary of key=>pair(range_id, range_val) - std::map facet_range_map; + std::map facet_range_map; bool is_range_query; @@ -604,16 +611,14 @@ struct facet { bool is_intersected = false; - bool get_range(std::string key, std::pair& range_pair) - { - if(facet_range_map.empty()) - { + bool get_range(int64_t key, std::pair& range_pair) { + if(facet_range_map.empty()) { LOG (ERROR) << "Facet range is not defined!!!"; } + auto it = facet_range_map.lower_bound(key); - if(it != facet_range_map.end()) - { + if(it != facet_range_map.end()) { range_pair.first = it->first; range_pair.second = it->second; return true; @@ -622,17 +627,16 @@ struct facet { return false; } - explicit facet(const std::string& field_name, - std::map facet_range = {}, bool is_range_q = false) - :field_name(field_name){ - facet_range_map = facet_range; - is_range_query = is_range_q; + explicit facet(const std::string& field_name, std::map facet_range = {}, + bool is_range_q = false) :field_name(field_name), facet_range_map(facet_range), + is_range_query(is_range_q) { } }; struct facet_info_t { // facet hash => resolved tokens - std::unordered_map> hashes; + std::unordered_map> hashes; + std::vector fvalue_searched_tokens; bool use_facet_query = false; bool should_compute_stats = false; bool use_value_index = false; diff --git a/src/collection.cpp b/src/collection.cpp index 99bfacf0..11eff55e 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -2001,9 +2001,11 @@ Option Collection::search(std::string raw_query, result["facet_counts"] = nlohmann::json::array(); // populate facets - for(facet & a_facet: facets) { + for(facet& a_facet: facets) { // Don't return zero counts for a wildcard facet. - if (a_facet.is_wildcard_match && a_facet.result_map.size() == 0) { + if (a_facet.is_wildcard_match && + (((a_facet.is_intersected && a_facet.value_result_map.empty())) || + (!a_facet.is_intersected && a_facet.result_map.empty()))) { continue; } @@ -2020,28 +2022,28 @@ Option Collection::search(std::string raw_query, facet_result["counts"] = nlohmann::json::array(); std::vector facet_values; - std::vector> facet_counts; + std::vector facet_counts; for (const auto & kv : a_facet.result_map) { - facet_counts.emplace_back(std::make_pair(kv.first, kv.second)); + facet_count_t v = kv.second; + v.fhash = kv.first; + facet_counts.emplace_back(v); + } + + for (const auto& kv : a_facet.value_result_map) { + facet_count_t v = kv.second; + v.fvalue = kv.first; + v.fhash = StringUtils::hash_wy(kv.first.c_str(), kv.first.size()); + facet_counts.emplace_back(v); } auto max_facets = std::min(max_facet_values, facet_counts.size()); auto nthElement = max_facets == facet_counts.size() ? max_facets - 1 : max_facets; - std::nth_element(facet_counts.begin(), facet_counts.begin() + nthElement, - facet_counts.end(), [&](const auto& kv1, const auto& kv2) { - size_t a_count = kv1.second.count; - size_t b_count = kv2.second.count; - - size_t a_value_size = UINT64_MAX - kv1.first.size(); - size_t b_value_size = UINT64_MAX - kv2.first.size(); - - return std::tie(a_count, a_value_size) > std::tie(b_count, b_value_size); - }); + std::nth_element(facet_counts.begin(), facet_counts.begin() + nthElement, facet_counts.end(), + Collection::facet_count_compare); if(a_facet.is_range_query){ - for(auto kv : a_facet.result_map){ - + for(const auto& kv : a_facet.result_map){ auto facet_range_iter = a_facet.facet_range_map.find(kv.first); if(facet_range_iter != a_facet.facet_range_map.end()){ auto & facet_count = kv.second; @@ -2059,13 +2061,11 @@ Option Collection::search(std::string raw_query, for(size_t fi = 0; fi < max_facets; fi++) { // remap facet value hash with actual string - auto & kv = facet_counts[fi]; - auto & facet_count = kv.second; - + auto & facet_count = facet_counts[fi]; std::string value; if(a_facet.is_intersected) { - value = kv.first; + value = facet_count.fvalue; //LOG(INFO) << "used intersection"; } else { // fetch actual facet value from representative doc id @@ -2089,7 +2089,8 @@ Option Collection::search(std::string raw_query, } std::unordered_map ftoken_pos; - std::vector& ftokens = a_facet.hash_tokens[kv.first]; + std::vector& ftokens = a_facet.is_intersected ? a_facet.fvalue_tokens[facet_count.fvalue] : + a_facet.hash_tokens[facet_count.fhash]; //LOG(INFO) << "working on hash_tokens for hash " << kv.first << " with size " << ftokens.size(); for(size_t ti = 0; ti < ftokens.size(); ti++) { if(the_field.is_bool()) { @@ -4874,7 +4875,7 @@ Option Collection::parse_facet(const std::string& facet_field, std::vector return Option(400, error); } - std::vector> tupVec; + std::vector> tupVec; auto& range_map = a_facet.facet_range_map; for(const auto& range : result){ @@ -4889,26 +4890,28 @@ Option Collection::parse_facet(const std::string& facet_field, std::vector auto pos2 = range.find(","); auto pos3 = range.find("]"); - std::string lower_range, upper_range; + int64_t lower_range, upper_range; auto lower_range_start = pos1 + 2; auto lower_range_len = pos2 - lower_range_start; auto upper_range_start = pos2 + 1; auto upper_range_len = pos3 - upper_range_start; if(a_field.is_integer()) { - lower_range = range.substr(lower_range_start, lower_range_len); - StringUtils::trim(lower_range); - upper_range = range.substr(upper_range_start, upper_range_len); - StringUtils::trim(upper_range); + std::string lower_range_str = range.substr(lower_range_start, lower_range_len); + StringUtils::trim(lower_range_str); + lower_range = std::stoll(lower_range_str); + std::string upper_range_str = range.substr(upper_range_start, upper_range_len); + StringUtils::trim(upper_range_str); + upper_range = std::stoll(upper_range_str); } else { float val = std::stof(range.substr(pos1 + 2, pos2)); - lower_range = std::to_string(Index::float_to_int64_t(val)); + lower_range = Index::float_to_int64_t(val); val = std::stof(range.substr(pos2 + 1, pos3)); - upper_range = std::to_string(Index::float_to_int64_t(val)); + upper_range = Index::float_to_int64_t(val); } - tupVec.emplace_back(std::make_tuple(lower_range, upper_range, range_val)); + tupVec.emplace_back(lower_range, upper_range, range_val); } //sort the range values so that we can check continuity diff --git a/src/index.cpp b/src/index.cpp index 49f38451..091c5dd2 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -1295,32 +1295,29 @@ void Index::do_facets(std::vector & facets, facet_query_t & facet_query, //range facet processing if(a_facet.is_range_query) { const auto doc_val = kv.first; - std::pair range_pair {}; - if(a_facet.get_range(doc_val, range_pair)) { + std::pair range_pair {}; + if(a_facet.get_range(std::stoll(doc_val), range_pair)) { const auto& range_id = range_pair.first; facet_count_t& facet_count = a_facet.result_map[range_id]; facet_count.count = kv.second; } } else { if(use_facet_query) { - const auto fquery_hashes_it = fquery_hashes.find(facet_field.name); - if(fquery_hashes_it != fquery_hashes.end()) { - const auto& searched_tokens = fquery_hashes_it->second; - auto facet_str = kv.first; - transform(facet_str.begin(), facet_str.end(), facet_str.begin(), ::tolower); + const auto& searched_tokens = facet_infos[findex].fvalue_searched_tokens; + auto facet_str = kv.first; + transform(facet_str.begin(), facet_str.end(), facet_str.begin(), ::tolower); - for(const auto& val : searched_tokens) { - if(facet_str.find(val) != std::string::npos) { - facet_count_t& facet_count = a_facet.result_map[kv.first]; - facet_count.count = kv.second; + for(const auto& val : searched_tokens) { + if(facet_str.find(val) != std::string::npos) { + facet_count_t& facet_count = a_facet.value_result_map[kv.first]; + facet_count.count = kv.second; - a_facet.hash_tokens[kv.first] = searched_tokens; - } + a_facet.fvalue_tokens[kv.first] = searched_tokens; } } } else { - facet_count_t& facet_count = a_facet.result_map[kv.first]; + facet_count_t& facet_count = a_facet.value_result_map[kv.first]; facet_count.count = kv.second; } } @@ -1389,18 +1386,17 @@ void Index::do_facets(std::vector & facets, facet_query_t & facet_query, compute_facet_stats(a_facet, fhash, facet_field.type); } - std::string fhash_str = std::to_string(fhash); if(a_facet.is_range_query) { int64_t doc_val = get_doc_val_from_sort_index(sort_index_it, doc_seq_id); - std::pair range_pair {}; - if(a_facet.get_range(std::to_string(doc_val), range_pair)) { + std::pair range_pair {}; + if(a_facet.get_range(doc_val, range_pair)) { const auto& range_id = range_pair.first; facet_count_t& facet_count = a_facet.result_map[range_id]; facet_count.count += 1; } - } else if(!use_facet_query || fquery_hashes.find(fhash_str) != fquery_hashes.end()) { - facet_count_t& facet_count = a_facet.result_map[fhash_str]; + } else if(!use_facet_query || fquery_hashes.find(fhash) != fquery_hashes.end()) { + facet_count_t& facet_count = a_facet.result_map[fhash]; //LOG(INFO) << "field: " << a_facet.field_name << ", doc id: " << doc_seq_id << ", hash: " << fhash; facet_count.doc_id = doc_seq_id; facet_count.array_pos = j; @@ -1411,7 +1407,7 @@ void Index::do_facets(std::vector & facets, facet_query_t & facet_query, } if(use_facet_query) { //LOG (INFO) << "adding hash tokens for hash " << fhash; - a_facet.hash_tokens[fhash_str] = fquery_hashes.at(fhash_str); + a_facet.hash_tokens[fhash] = fquery_hashes.at(fhash); } } } @@ -2891,7 +2887,7 @@ Option Index::search(std::vector& field_query_tokens, cons for(auto & facet_kv: this_facet.result_map) { uint32_t fhash = 0; if(group_limit) { - fhash = std::stoul(facet_kv.first); + fhash = facet_kv.first; // we have to add all group sets acc_facet.hash_groups[fhash].insert( this_facet.hash_groups[fhash].begin(), @@ -2915,6 +2911,24 @@ Option Index::search(std::vector& field_query_tokens, cons acc_facet.hash_tokens[facet_kv.first] = this_facet.hash_tokens[facet_kv.first]; } + for(auto& facet_kv: this_facet.value_result_map) { + size_t count = 0; + if(acc_facet.value_result_map.count(facet_kv.first) == 0) { + // not found, so set it + count = facet_kv.second.count; + } else { + count = acc_facet.value_result_map[facet_kv.first].count + facet_kv.second.count; + } + + acc_facet.value_result_map[facet_kv.first].count = count; + + acc_facet.value_result_map[facet_kv.first].doc_id = facet_kv.second.doc_id; + acc_facet.value_result_map[facet_kv.first].array_pos = facet_kv.second.array_pos; + acc_facet.is_intersected = this_facet.is_intersected; + + acc_facet.fvalue_tokens[facet_kv.first] = this_facet.fvalue_tokens[facet_kv.first]; + } + if(this_facet.stats.fvcount != 0) { acc_facet.stats.fvcount += this_facet.stats.fvcount; acc_facet.stats.fvsum += this_facet.stats.fvsum; @@ -2927,7 +2941,7 @@ Option Index::search(std::vector& field_query_tokens, cons for(auto & acc_facet: facets) { for(auto& facet_kv: acc_facet.result_map) { if(group_limit) { - facet_kv.second.count = acc_facet.hash_groups[std::stoul(facet_kv.first)].size(); + facet_kv.second.count = acc_facet.hash_groups[facet_kv.first].size(); } if(estimate_facets) { @@ -2935,6 +2949,12 @@ Option Index::search(std::vector& field_query_tokens, cons } } + for(auto& facet_kv: acc_facet.value_result_map) { + if(estimate_facets) { + facet_kv.second.count = size_t(double(facet_kv.second.count) * (100.0f / facet_sample_percent)); + } + } + if(estimate_facets) { acc_facet.sampled = true; } @@ -4482,7 +4502,7 @@ void Index::compute_facet_infos(const std::vector& facets, facet_query_t& // since `field_result_ids` contains documents matched across all queries // value based index for(const auto& val : searched_tokens) { - facet_infos[findex].hashes[facet_field.name].emplace_back(val); + facet_infos[findex].fvalue_searched_tokens.emplace_back(val); } } } @@ -4519,7 +4539,7 @@ void Index::compute_facet_infos(const std::vector& facets, facet_query_t& for(size_t array_index: array_indices) { if(array_index < facet_hashes.size()) { - std::string hash = std::to_string(facet_hashes[array_index]); + uint32_t hash = facet_hashes[array_index]; /*LOG(INFO) << "seq_id: " << seq_id << ", hash: " << hash << ", array index: " << array_index;*/ @@ -4531,7 +4551,7 @@ void Index::compute_facet_infos(const std::vector& facets, facet_query_t& } } } else { - std::string hash = std::to_string(facet_hashes[0]); + uint32_t hash = facet_hashes[0]; if(facet_infos[findex].hashes.count(hash) == 0) { //LOG(INFO) << "adding searched_tokens for hash " << hash; facet_infos[findex].hashes.emplace(hash, searched_tokens);