Have separate facet aggregation structures for value/hash facets.

This commit is contained in:
Kishore Nallan 2023-08-15 19:25:42 +05:30
parent ced85b446d
commit a3a44acf2f
4 changed files with 99 additions and 73 deletions

View File

@ -187,9 +187,8 @@ private:
std::vector<field>& new_fields,
bool enable_nested_fields);
// Ranking comparator for facet counts. Two forms appear here: one taking
// (key, facet_count_t) pairs and one taking facet_count_t directly.
// Returns true when `a` orders ahead of `b`: the larger `count` wins, and
// equal counts are broken by the larger key (`first` in the pair form,
// `fhash` in the facet_count_t form), since the std::tie tuples are
// compared lexicographically with operator>.
static bool facet_count_compare(const std::pair<uint32_t, facet_count_t>& a,
const std::pair<uint32_t, facet_count_t>& b) {
return std::tie(a.second.count, a.first) > std::tie(b.second.count, b.first);
static bool facet_count_compare(const facet_count_t& a, const facet_count_t& b) {
return std::tie(a.count, a.fhash) > std::tie(b.count, b.fhash);
}
static bool facet_count_str_compare(const facet_value_t& a,

View File

@ -570,6 +570,10 @@ public:
struct facet_count_t {
uint32_t count = 0;
// for value based faceting, actual value is stored here
std::string fvalue;
// for hash based faceting, hash value is stored here
int64_t fhash;
// used to fetch the actual document and value for representation
uint32_t doc_id = 0;
uint32_t array_pos = 0;
@ -584,9 +588,12 @@ struct facet_stats_t {
struct facet {
const std::string field_name;
spp::sparse_hash_map<std::string, facet_count_t> result_map;
spp::sparse_hash_map<uint64_t, facet_count_t> result_map;
spp::sparse_hash_map<std::string, facet_count_t> value_result_map;
// used for facet value query
spp::sparse_hash_map<std::string, std::vector<std::string>> hash_tokens;
spp::sparse_hash_map<std::string, std::vector<std::string>> fvalue_tokens;
spp::sparse_hash_map<uint64_t, std::vector<std::string>> hash_tokens;
// used for faceting grouped results
spp::sparse_hash_map<uint32_t, spp::sparse_hash_set<uint32_t>> hash_groups;
@ -594,7 +601,7 @@ struct facet {
facet_stats_t stats;
//dictionary of key=>pair(range_id, range_val)
std::map<std::string, std::string> facet_range_map;
std::map<int64_t, std::string> facet_range_map;
bool is_range_query;
@ -604,16 +611,14 @@ struct facet {
bool is_intersected = false;
bool get_range(std::string key, std::pair<std::string, std::string>& range_pair)
{
if(facet_range_map.empty())
{
bool get_range(int64_t key, std::pair<int64_t, std::string>& range_pair) {
if(facet_range_map.empty()) {
LOG (ERROR) << "Facet range is not defined!!!";
}
auto it = facet_range_map.lower_bound(key);
if(it != facet_range_map.end())
{
if(it != facet_range_map.end()) {
range_pair.first = it->first;
range_pair.second = it->second;
return true;
@ -622,17 +627,16 @@ struct facet {
return false;
}
// Constructs a facet bound to `field_name`.
// `facet_range` (defaults to an empty map) is stored in `facet_range_map`, and
// `is_range_q` (defaults to false) is stored in `is_range_query`.
// Two forms appear here: one keyed by std::string that assigns the members in
// the constructor body, and one keyed by int64_t that sets the same members
// through the member-initializer list.
explicit facet(const std::string& field_name,
std::map<std::string, std::string> facet_range = {}, bool is_range_q = false)
:field_name(field_name){
facet_range_map = facet_range;
is_range_query = is_range_q;
explicit facet(const std::string& field_name, std::map<int64_t, std::string> facet_range = {},
bool is_range_q = false) :field_name(field_name), facet_range_map(facet_range),
is_range_query(is_range_q) {
}
};
struct facet_info_t {
// facet hash => resolved tokens
std::unordered_map<std::string, std::vector<std::string>> hashes;
std::unordered_map<uint64_t, std::vector<std::string>> hashes;
std::vector<std::string> fvalue_searched_tokens;
bool use_facet_query = false;
bool should_compute_stats = false;
bool use_value_index = false;

View File

@ -2001,9 +2001,11 @@ Option<nlohmann::json> Collection::search(std::string raw_query,
result["facet_counts"] = nlohmann::json::array();
// populate facets
for(facet & a_facet: facets) {
for(facet& a_facet: facets) {
// Don't return zero counts for a wildcard facet.
if (a_facet.is_wildcard_match && a_facet.result_map.size() == 0) {
if (a_facet.is_wildcard_match &&
(((a_facet.is_intersected && a_facet.value_result_map.empty())) ||
(!a_facet.is_intersected && a_facet.result_map.empty()))) {
continue;
}
@ -2020,28 +2022,28 @@ Option<nlohmann::json> Collection::search(std::string raw_query,
facet_result["counts"] = nlohmann::json::array();
std::vector<facet_value_t> facet_values;
std::vector<std::pair<std::string, facet_count_t>> facet_counts;
std::vector<facet_count_t> facet_counts;
for (const auto & kv : a_facet.result_map) {
facet_counts.emplace_back(std::make_pair(kv.first, kv.second));
facet_count_t v = kv.second;
v.fhash = kv.first;
facet_counts.emplace_back(v);
}
for (const auto& kv : a_facet.value_result_map) {
facet_count_t v = kv.second;
v.fvalue = kv.first;
v.fhash = StringUtils::hash_wy(kv.first.c_str(), kv.first.size());
facet_counts.emplace_back(v);
}
auto max_facets = std::min(max_facet_values, facet_counts.size());
auto nthElement = max_facets == facet_counts.size() ? max_facets - 1 : max_facets;
std::nth_element(facet_counts.begin(), facet_counts.begin() + nthElement,
facet_counts.end(), [&](const auto& kv1, const auto& kv2) {
size_t a_count = kv1.second.count;
size_t b_count = kv2.second.count;
size_t a_value_size = UINT64_MAX - kv1.first.size();
size_t b_value_size = UINT64_MAX - kv2.first.size();
return std::tie(a_count, a_value_size) > std::tie(b_count, b_value_size);
});
std::nth_element(facet_counts.begin(), facet_counts.begin() + nthElement, facet_counts.end(),
Collection::facet_count_compare);
if(a_facet.is_range_query){
for(auto kv : a_facet.result_map){
for(const auto& kv : a_facet.result_map){
auto facet_range_iter = a_facet.facet_range_map.find(kv.first);
if(facet_range_iter != a_facet.facet_range_map.end()){
auto & facet_count = kv.second;
@ -2059,13 +2061,11 @@ Option<nlohmann::json> Collection::search(std::string raw_query,
for(size_t fi = 0; fi < max_facets; fi++) {
// remap facet value hash with actual string
auto & kv = facet_counts[fi];
auto & facet_count = kv.second;
auto & facet_count = facet_counts[fi];
std::string value;
if(a_facet.is_intersected) {
value = kv.first;
value = facet_count.fvalue;
//LOG(INFO) << "used intersection";
} else {
// fetch actual facet value from representative doc id
@ -2089,7 +2089,8 @@ Option<nlohmann::json> Collection::search(std::string raw_query,
}
std::unordered_map<std::string, size_t> ftoken_pos;
std::vector<string>& ftokens = a_facet.hash_tokens[kv.first];
std::vector<string>& ftokens = a_facet.is_intersected ? a_facet.fvalue_tokens[facet_count.fvalue] :
a_facet.hash_tokens[facet_count.fhash];
//LOG(INFO) << "working on hash_tokens for hash " << kv.first << " with size " << ftokens.size();
for(size_t ti = 0; ti < ftokens.size(); ti++) {
if(the_field.is_bool()) {
@ -4874,7 +4875,7 @@ Option<bool> Collection::parse_facet(const std::string& facet_field, std::vector
return Option<bool>(400, error);
}
std::vector<std::tuple<std::string, std::string, std::string>> tupVec;
std::vector<std::tuple<int64_t, int64_t, std::string>> tupVec;
auto& range_map = a_facet.facet_range_map;
for(const auto& range : result){
@ -4889,26 +4890,28 @@ Option<bool> Collection::parse_facet(const std::string& facet_field, std::vector
auto pos2 = range.find(",");
auto pos3 = range.find("]");
std::string lower_range, upper_range;
int64_t lower_range, upper_range;
auto lower_range_start = pos1 + 2;
auto lower_range_len = pos2 - lower_range_start;
auto upper_range_start = pos2 + 1;
auto upper_range_len = pos3 - upper_range_start;
if(a_field.is_integer()) {
lower_range = range.substr(lower_range_start, lower_range_len);
StringUtils::trim(lower_range);
upper_range = range.substr(upper_range_start, upper_range_len);
StringUtils::trim(upper_range);
std::string lower_range_str = range.substr(lower_range_start, lower_range_len);
StringUtils::trim(lower_range_str);
lower_range = std::stoll(lower_range_str);
std::string upper_range_str = range.substr(upper_range_start, upper_range_len);
StringUtils::trim(upper_range_str);
upper_range = std::stoll(upper_range_str);
} else {
float val = std::stof(range.substr(pos1 + 2, pos2));
lower_range = std::to_string(Index::float_to_int64_t(val));
lower_range = Index::float_to_int64_t(val);
val = std::stof(range.substr(pos2 + 1, pos3));
upper_range = std::to_string(Index::float_to_int64_t(val));
upper_range = Index::float_to_int64_t(val);
}
tupVec.emplace_back(std::make_tuple(lower_range, upper_range, range_val));
tupVec.emplace_back(lower_range, upper_range, range_val);
}
//sort the range values so that we can check continuity

View File

@ -1295,32 +1295,29 @@ void Index::do_facets(std::vector<facet> & facets, facet_query_t & facet_query,
//range facet processing
if(a_facet.is_range_query) {
const auto doc_val = kv.first;
std::pair<std::string, std::string> range_pair {};
if(a_facet.get_range(doc_val, range_pair)) {
std::pair<int64_t , std::string> range_pair {};
if(a_facet.get_range(std::stoll(doc_val), range_pair)) {
const auto& range_id = range_pair.first;
facet_count_t& facet_count = a_facet.result_map[range_id];
facet_count.count = kv.second;
}
} else {
if(use_facet_query) {
const auto fquery_hashes_it = fquery_hashes.find(facet_field.name);
if(fquery_hashes_it != fquery_hashes.end()) {
const auto& searched_tokens = fquery_hashes_it->second;
auto facet_str = kv.first;
transform(facet_str.begin(), facet_str.end(), facet_str.begin(), ::tolower);
const auto& searched_tokens = facet_infos[findex].fvalue_searched_tokens;
auto facet_str = kv.first;
transform(facet_str.begin(), facet_str.end(), facet_str.begin(), ::tolower);
for(const auto& val : searched_tokens) {
if(facet_str.find(val) != std::string::npos) {
facet_count_t& facet_count = a_facet.result_map[kv.first];
facet_count.count = kv.second;
for(const auto& val : searched_tokens) {
if(facet_str.find(val) != std::string::npos) {
facet_count_t& facet_count = a_facet.value_result_map[kv.first];
facet_count.count = kv.second;
a_facet.hash_tokens[kv.first] = searched_tokens;
}
a_facet.fvalue_tokens[kv.first] = searched_tokens;
}
}
} else {
facet_count_t& facet_count = a_facet.result_map[kv.first];
facet_count_t& facet_count = a_facet.value_result_map[kv.first];
facet_count.count = kv.second;
}
}
@ -1389,18 +1386,17 @@ void Index::do_facets(std::vector<facet> & facets, facet_query_t & facet_query,
compute_facet_stats(a_facet, fhash, facet_field.type);
}
std::string fhash_str = std::to_string(fhash);
if(a_facet.is_range_query) {
int64_t doc_val = get_doc_val_from_sort_index(sort_index_it, doc_seq_id);
std::pair<std::string, std::string> range_pair {};
if(a_facet.get_range(std::to_string(doc_val), range_pair)) {
std::pair<int64_t , std::string> range_pair {};
if(a_facet.get_range(doc_val, range_pair)) {
const auto& range_id = range_pair.first;
facet_count_t& facet_count = a_facet.result_map[range_id];
facet_count.count += 1;
}
} else if(!use_facet_query || fquery_hashes.find(fhash_str) != fquery_hashes.end()) {
facet_count_t& facet_count = a_facet.result_map[fhash_str];
} else if(!use_facet_query || fquery_hashes.find(fhash) != fquery_hashes.end()) {
facet_count_t& facet_count = a_facet.result_map[fhash];
//LOG(INFO) << "field: " << a_facet.field_name << ", doc id: " << doc_seq_id << ", hash: " << fhash;
facet_count.doc_id = doc_seq_id;
facet_count.array_pos = j;
@ -1411,7 +1407,7 @@ void Index::do_facets(std::vector<facet> & facets, facet_query_t & facet_query,
}
if(use_facet_query) {
//LOG (INFO) << "adding hash tokens for hash " << fhash;
a_facet.hash_tokens[fhash_str] = fquery_hashes.at(fhash_str);
a_facet.hash_tokens[fhash] = fquery_hashes.at(fhash);
}
}
}
@ -2891,7 +2887,7 @@ Option<bool> Index::search(std::vector<query_tokens_t>& field_query_tokens, cons
for(auto & facet_kv: this_facet.result_map) {
uint32_t fhash = 0;
if(group_limit) {
fhash = std::stoul(facet_kv.first);
fhash = facet_kv.first;
// we have to add all group sets
acc_facet.hash_groups[fhash].insert(
this_facet.hash_groups[fhash].begin(),
@ -2915,6 +2911,24 @@ Option<bool> Index::search(std::vector<query_tokens_t>& field_query_tokens, cons
acc_facet.hash_tokens[facet_kv.first] = this_facet.hash_tokens[facet_kv.first];
}
for(auto& facet_kv: this_facet.value_result_map) {
size_t count = 0;
if(acc_facet.value_result_map.count(facet_kv.first) == 0) {
// not found, so set it
count = facet_kv.second.count;
} else {
count = acc_facet.value_result_map[facet_kv.first].count + facet_kv.second.count;
}
acc_facet.value_result_map[facet_kv.first].count = count;
acc_facet.value_result_map[facet_kv.first].doc_id = facet_kv.second.doc_id;
acc_facet.value_result_map[facet_kv.first].array_pos = facet_kv.second.array_pos;
acc_facet.is_intersected = this_facet.is_intersected;
acc_facet.fvalue_tokens[facet_kv.first] = this_facet.fvalue_tokens[facet_kv.first];
}
if(this_facet.stats.fvcount != 0) {
acc_facet.stats.fvcount += this_facet.stats.fvcount;
acc_facet.stats.fvsum += this_facet.stats.fvsum;
@ -2927,7 +2941,7 @@ Option<bool> Index::search(std::vector<query_tokens_t>& field_query_tokens, cons
for(auto & acc_facet: facets) {
for(auto& facet_kv: acc_facet.result_map) {
if(group_limit) {
facet_kv.second.count = acc_facet.hash_groups[std::stoul(facet_kv.first)].size();
facet_kv.second.count = acc_facet.hash_groups[facet_kv.first].size();
}
if(estimate_facets) {
@ -2935,6 +2949,12 @@ Option<bool> Index::search(std::vector<query_tokens_t>& field_query_tokens, cons
}
}
for(auto& facet_kv: acc_facet.value_result_map) {
if(estimate_facets) {
facet_kv.second.count = size_t(double(facet_kv.second.count) * (100.0f / facet_sample_percent));
}
}
if(estimate_facets) {
acc_facet.sampled = true;
}
@ -4482,7 +4502,7 @@ void Index::compute_facet_infos(const std::vector<facet>& facets, facet_query_t&
// since `field_result_ids` contains documents matched across all queries
// value based index
for(const auto& val : searched_tokens) {
facet_infos[findex].hashes[facet_field.name].emplace_back(val);
facet_infos[findex].fvalue_searched_tokens.emplace_back(val);
}
}
}
@ -4519,7 +4539,7 @@ void Index::compute_facet_infos(const std::vector<facet>& facets, facet_query_t&
for(size_t array_index: array_indices) {
if(array_index < facet_hashes.size()) {
std::string hash = std::to_string(facet_hashes[array_index]);
uint32_t hash = facet_hashes[array_index];
/*LOG(INFO) << "seq_id: " << seq_id << ", hash: " << hash << ", array index: "
<< array_index;*/
@ -4531,7 +4551,7 @@ void Index::compute_facet_infos(const std::vector<facet>& facets, facet_query_t&
}
}
} else {
std::string hash = std::to_string(facet_hashes[0]);
uint32_t hash = facet_hashes[0];
if(facet_infos[findex].hashes.count(hash) == 0) {
//LOG(INFO) << "adding searched_tokens for hash " << hash;
facet_infos[findex].hashes.emplace(hash, searched_tokens);