mirror of
https://github.com/typesense/typesense.git
synced 2025-05-19 13:12:22 +08:00
Have separate facet aggregation structures for value/hash facets.
This commit is contained in:
parent
ced85b446d
commit
a3a44acf2f
@ -187,9 +187,8 @@ private:
|
||||
std::vector<field>& new_fields,
|
||||
bool enable_nested_fields);
|
||||
|
||||
static bool facet_count_compare(const std::pair<uint32_t, facet_count_t>& a,
|
||||
const std::pair<uint32_t, facet_count_t>& b) {
|
||||
return std::tie(a.second.count, a.first) > std::tie(b.second.count, b.first);
|
||||
static bool facet_count_compare(const facet_count_t& a, const facet_count_t& b) {
|
||||
return std::tie(a.count, a.fhash) > std::tie(b.count, b.fhash);
|
||||
}
|
||||
|
||||
static bool facet_count_str_compare(const facet_value_t& a,
|
||||
|
@ -570,6 +570,10 @@ public:
|
||||
|
||||
struct facet_count_t {
|
||||
uint32_t count = 0;
|
||||
// for value based faceting, actual value is stored here
|
||||
std::string fvalue;
|
||||
// for hash based faceting, hash value is stored here
|
||||
int64_t fhash;
|
||||
// used to fetch the actual document and value for representation
|
||||
uint32_t doc_id = 0;
|
||||
uint32_t array_pos = 0;
|
||||
@ -584,9 +588,12 @@ struct facet_stats_t {
|
||||
|
||||
struct facet {
|
||||
const std::string field_name;
|
||||
spp::sparse_hash_map<std::string, facet_count_t> result_map;
|
||||
spp::sparse_hash_map<uint64_t, facet_count_t> result_map;
|
||||
spp::sparse_hash_map<std::string, facet_count_t> value_result_map;
|
||||
|
||||
// used for facet value query
|
||||
spp::sparse_hash_map<std::string, std::vector<std::string>> hash_tokens;
|
||||
spp::sparse_hash_map<std::string, std::vector<std::string>> fvalue_tokens;
|
||||
spp::sparse_hash_map<uint64_t, std::vector<std::string>> hash_tokens;
|
||||
|
||||
// used for faceting grouped results
|
||||
spp::sparse_hash_map<uint32_t, spp::sparse_hash_set<uint32_t>> hash_groups;
|
||||
@ -594,7 +601,7 @@ struct facet {
|
||||
facet_stats_t stats;
|
||||
|
||||
// ordered map of range key => range value (string); get_range() picks the entry via lower_bound(key)
|
||||
std::map<std::string, std::string> facet_range_map;
|
||||
std::map<int64_t, std::string> facet_range_map;
|
||||
|
||||
bool is_range_query;
|
||||
|
||||
@ -604,16 +611,14 @@ struct facet {
|
||||
|
||||
bool is_intersected = false;
|
||||
|
||||
bool get_range(std::string key, std::pair<std::string, std::string>& range_pair)
|
||||
{
|
||||
if(facet_range_map.empty())
|
||||
{
|
||||
bool get_range(int64_t key, std::pair<int64_t, std::string>& range_pair) {
|
||||
if(facet_range_map.empty()) {
|
||||
LOG (ERROR) << "Facet range is not defined!!!";
|
||||
}
|
||||
|
||||
auto it = facet_range_map.lower_bound(key);
|
||||
|
||||
if(it != facet_range_map.end())
|
||||
{
|
||||
if(it != facet_range_map.end()) {
|
||||
range_pair.first = it->first;
|
||||
range_pair.second = it->second;
|
||||
return true;
|
||||
@ -622,17 +627,16 @@ struct facet {
|
||||
return false;
|
||||
}
|
||||
|
||||
explicit facet(const std::string& field_name,
|
||||
std::map<std::string, std::string> facet_range = {}, bool is_range_q = false)
|
||||
:field_name(field_name){
|
||||
facet_range_map = facet_range;
|
||||
is_range_query = is_range_q;
|
||||
// Constructs a facet for `field_name`.
// `facet_range` initialises `facet_range_map` and `is_range_q` initialises
// `is_range_query`; by default the range map is empty and the facet is not a
// range query.
explicit facet(const std::string& field_name,
               std::map<int64_t, std::string> facet_range = {},
               bool is_range_q = false):
        field_name(field_name),
        facet_range_map(facet_range),
        is_range_query(is_range_q) {}
|
||||
};
|
||||
|
||||
struct facet_info_t {
|
||||
// facet hash => resolved tokens
|
||||
std::unordered_map<std::string, std::vector<std::string>> hashes;
|
||||
std::unordered_map<uint64_t, std::vector<std::string>> hashes;
|
||||
std::vector<std::string> fvalue_searched_tokens;
|
||||
bool use_facet_query = false;
|
||||
bool should_compute_stats = false;
|
||||
bool use_value_index = false;
|
||||
|
@ -2001,9 +2001,11 @@ Option<nlohmann::json> Collection::search(std::string raw_query,
|
||||
result["facet_counts"] = nlohmann::json::array();
|
||||
|
||||
// populate facets
|
||||
for(facet & a_facet: facets) {
|
||||
for(facet& a_facet: facets) {
|
||||
// Don't return zero counts for a wildcard facet.
|
||||
if (a_facet.is_wildcard_match && a_facet.result_map.size() == 0) {
|
||||
if (a_facet.is_wildcard_match &&
|
||||
(((a_facet.is_intersected && a_facet.value_result_map.empty())) ||
|
||||
(!a_facet.is_intersected && a_facet.result_map.empty()))) {
|
||||
continue;
|
||||
}
|
||||
|
||||
@ -2020,28 +2022,28 @@ Option<nlohmann::json> Collection::search(std::string raw_query,
|
||||
facet_result["counts"] = nlohmann::json::array();
|
||||
|
||||
std::vector<facet_value_t> facet_values;
|
||||
std::vector<std::pair<std::string, facet_count_t>> facet_counts;
|
||||
std::vector<facet_count_t> facet_counts;
|
||||
|
||||
for (const auto & kv : a_facet.result_map) {
|
||||
facet_counts.emplace_back(std::make_pair(kv.first, kv.second));
|
||||
facet_count_t v = kv.second;
|
||||
v.fhash = kv.first;
|
||||
facet_counts.emplace_back(v);
|
||||
}
|
||||
|
||||
for (const auto& kv : a_facet.value_result_map) {
|
||||
facet_count_t v = kv.second;
|
||||
v.fvalue = kv.first;
|
||||
v.fhash = StringUtils::hash_wy(kv.first.c_str(), kv.first.size());
|
||||
facet_counts.emplace_back(v);
|
||||
}
|
||||
|
||||
auto max_facets = std::min(max_facet_values, facet_counts.size());
|
||||
auto nthElement = max_facets == facet_counts.size() ? max_facets - 1 : max_facets;
|
||||
std::nth_element(facet_counts.begin(), facet_counts.begin() + nthElement,
|
||||
facet_counts.end(), [&](const auto& kv1, const auto& kv2) {
|
||||
size_t a_count = kv1.second.count;
|
||||
size_t b_count = kv2.second.count;
|
||||
|
||||
size_t a_value_size = UINT64_MAX - kv1.first.size();
|
||||
size_t b_value_size = UINT64_MAX - kv2.first.size();
|
||||
|
||||
return std::tie(a_count, a_value_size) > std::tie(b_count, b_value_size);
|
||||
});
|
||||
std::nth_element(facet_counts.begin(), facet_counts.begin() + nthElement, facet_counts.end(),
|
||||
Collection::facet_count_compare);
|
||||
|
||||
if(a_facet.is_range_query){
|
||||
for(auto kv : a_facet.result_map){
|
||||
|
||||
for(const auto& kv : a_facet.result_map){
|
||||
auto facet_range_iter = a_facet.facet_range_map.find(kv.first);
|
||||
if(facet_range_iter != a_facet.facet_range_map.end()){
|
||||
auto & facet_count = kv.second;
|
||||
@ -2059,13 +2061,11 @@ Option<nlohmann::json> Collection::search(std::string raw_query,
|
||||
|
||||
for(size_t fi = 0; fi < max_facets; fi++) {
|
||||
// map the facet value hash back to the actual string
|
||||
auto & kv = facet_counts[fi];
|
||||
auto & facet_count = kv.second;
|
||||
|
||||
auto & facet_count = facet_counts[fi];
|
||||
std::string value;
|
||||
|
||||
if(a_facet.is_intersected) {
|
||||
value = kv.first;
|
||||
value = facet_count.fvalue;
|
||||
//LOG(INFO) << "used intersection";
|
||||
} else {
|
||||
// fetch actual facet value from representative doc id
|
||||
@ -2089,7 +2089,8 @@ Option<nlohmann::json> Collection::search(std::string raw_query,
|
||||
}
|
||||
|
||||
std::unordered_map<std::string, size_t> ftoken_pos;
|
||||
std::vector<string>& ftokens = a_facet.hash_tokens[kv.first];
|
||||
std::vector<string>& ftokens = a_facet.is_intersected ? a_facet.fvalue_tokens[facet_count.fvalue] :
|
||||
a_facet.hash_tokens[facet_count.fhash];
|
||||
//LOG(INFO) << "working on hash_tokens for hash " << kv.first << " with size " << ftokens.size();
|
||||
for(size_t ti = 0; ti < ftokens.size(); ti++) {
|
||||
if(the_field.is_bool()) {
|
||||
@ -4874,7 +4875,7 @@ Option<bool> Collection::parse_facet(const std::string& facet_field, std::vector
|
||||
return Option<bool>(400, error);
|
||||
}
|
||||
|
||||
std::vector<std::tuple<std::string, std::string, std::string>> tupVec;
|
||||
std::vector<std::tuple<int64_t, int64_t, std::string>> tupVec;
|
||||
|
||||
auto& range_map = a_facet.facet_range_map;
|
||||
for(const auto& range : result){
|
||||
@ -4889,26 +4890,28 @@ Option<bool> Collection::parse_facet(const std::string& facet_field, std::vector
|
||||
auto pos2 = range.find(",");
|
||||
auto pos3 = range.find("]");
|
||||
|
||||
std::string lower_range, upper_range;
|
||||
int64_t lower_range, upper_range;
|
||||
auto lower_range_start = pos1 + 2;
|
||||
auto lower_range_len = pos2 - lower_range_start;
|
||||
auto upper_range_start = pos2 + 1;
|
||||
auto upper_range_len = pos3 - upper_range_start;
|
||||
|
||||
if(a_field.is_integer()) {
|
||||
lower_range = range.substr(lower_range_start, lower_range_len);
|
||||
StringUtils::trim(lower_range);
|
||||
upper_range = range.substr(upper_range_start, upper_range_len);
|
||||
StringUtils::trim(upper_range);
|
||||
std::string lower_range_str = range.substr(lower_range_start, lower_range_len);
|
||||
StringUtils::trim(lower_range_str);
|
||||
lower_range = std::stoll(lower_range_str);
|
||||
std::string upper_range_str = range.substr(upper_range_start, upper_range_len);
|
||||
StringUtils::trim(upper_range_str);
|
||||
upper_range = std::stoll(upper_range_str);
|
||||
} else {
|
||||
float val = std::stof(range.substr(pos1 + 2, pos2));
|
||||
lower_range = std::to_string(Index::float_to_int64_t(val));
|
||||
lower_range = Index::float_to_int64_t(val);
|
||||
|
||||
val = std::stof(range.substr(pos2 + 1, pos3));
|
||||
upper_range = std::to_string(Index::float_to_int64_t(val));
|
||||
upper_range = Index::float_to_int64_t(val);
|
||||
}
|
||||
|
||||
tupVec.emplace_back(std::make_tuple(lower_range, upper_range, range_val));
|
||||
tupVec.emplace_back(lower_range, upper_range, range_val);
|
||||
}
|
||||
|
||||
//sort the range values so that we can check continuity
|
||||
|
@ -1295,32 +1295,29 @@ void Index::do_facets(std::vector<facet> & facets, facet_query_t & facet_query,
|
||||
//range facet processing
|
||||
if(a_facet.is_range_query) {
|
||||
const auto doc_val = kv.first;
|
||||
std::pair<std::string, std::string> range_pair {};
|
||||
if(a_facet.get_range(doc_val, range_pair)) {
|
||||
std::pair<int64_t , std::string> range_pair {};
|
||||
if(a_facet.get_range(std::stoll(doc_val), range_pair)) {
|
||||
const auto& range_id = range_pair.first;
|
||||
facet_count_t& facet_count = a_facet.result_map[range_id];
|
||||
facet_count.count = kv.second;
|
||||
}
|
||||
} else {
|
||||
if(use_facet_query) {
|
||||
const auto fquery_hashes_it = fquery_hashes.find(facet_field.name);
|
||||
if(fquery_hashes_it != fquery_hashes.end()) {
|
||||
const auto& searched_tokens = fquery_hashes_it->second;
|
||||
auto facet_str = kv.first;
|
||||
transform(facet_str.begin(), facet_str.end(), facet_str.begin(), ::tolower);
|
||||
const auto& searched_tokens = facet_infos[findex].fvalue_searched_tokens;
|
||||
auto facet_str = kv.first;
|
||||
transform(facet_str.begin(), facet_str.end(), facet_str.begin(), ::tolower);
|
||||
|
||||
for(const auto& val : searched_tokens) {
|
||||
if(facet_str.find(val) != std::string::npos) {
|
||||
facet_count_t& facet_count = a_facet.result_map[kv.first];
|
||||
facet_count.count = kv.second;
|
||||
for(const auto& val : searched_tokens) {
|
||||
if(facet_str.find(val) != std::string::npos) {
|
||||
facet_count_t& facet_count = a_facet.value_result_map[kv.first];
|
||||
facet_count.count = kv.second;
|
||||
|
||||
a_facet.hash_tokens[kv.first] = searched_tokens;
|
||||
}
|
||||
a_facet.fvalue_tokens[kv.first] = searched_tokens;
|
||||
}
|
||||
}
|
||||
|
||||
} else {
|
||||
facet_count_t& facet_count = a_facet.result_map[kv.first];
|
||||
facet_count_t& facet_count = a_facet.value_result_map[kv.first];
|
||||
facet_count.count = kv.second;
|
||||
}
|
||||
}
|
||||
@ -1389,18 +1386,17 @@ void Index::do_facets(std::vector<facet> & facets, facet_query_t & facet_query,
|
||||
compute_facet_stats(a_facet, fhash, facet_field.type);
|
||||
}
|
||||
|
||||
std::string fhash_str = std::to_string(fhash);
|
||||
if(a_facet.is_range_query) {
|
||||
int64_t doc_val = get_doc_val_from_sort_index(sort_index_it, doc_seq_id);
|
||||
|
||||
std::pair<std::string, std::string> range_pair {};
|
||||
if(a_facet.get_range(std::to_string(doc_val), range_pair)) {
|
||||
std::pair<int64_t , std::string> range_pair {};
|
||||
if(a_facet.get_range(doc_val, range_pair)) {
|
||||
const auto& range_id = range_pair.first;
|
||||
facet_count_t& facet_count = a_facet.result_map[range_id];
|
||||
facet_count.count += 1;
|
||||
}
|
||||
} else if(!use_facet_query || fquery_hashes.find(fhash_str) != fquery_hashes.end()) {
|
||||
facet_count_t& facet_count = a_facet.result_map[fhash_str];
|
||||
} else if(!use_facet_query || fquery_hashes.find(fhash) != fquery_hashes.end()) {
|
||||
facet_count_t& facet_count = a_facet.result_map[fhash];
|
||||
//LOG(INFO) << "field: " << a_facet.field_name << ", doc id: " << doc_seq_id << ", hash: " << fhash;
|
||||
facet_count.doc_id = doc_seq_id;
|
||||
facet_count.array_pos = j;
|
||||
@ -1411,7 +1407,7 @@ void Index::do_facets(std::vector<facet> & facets, facet_query_t & facet_query,
|
||||
}
|
||||
if(use_facet_query) {
|
||||
//LOG (INFO) << "adding hash tokens for hash " << fhash;
|
||||
a_facet.hash_tokens[fhash_str] = fquery_hashes.at(fhash_str);
|
||||
a_facet.hash_tokens[fhash] = fquery_hashes.at(fhash);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -2891,7 +2887,7 @@ Option<bool> Index::search(std::vector<query_tokens_t>& field_query_tokens, cons
|
||||
for(auto & facet_kv: this_facet.result_map) {
|
||||
uint32_t fhash = 0;
|
||||
if(group_limit) {
|
||||
fhash = std::stoul(facet_kv.first);
|
||||
fhash = facet_kv.first;
|
||||
// we have to add all group sets
|
||||
acc_facet.hash_groups[fhash].insert(
|
||||
this_facet.hash_groups[fhash].begin(),
|
||||
@ -2915,6 +2911,24 @@ Option<bool> Index::search(std::vector<query_tokens_t>& field_query_tokens, cons
|
||||
acc_facet.hash_tokens[facet_kv.first] = this_facet.hash_tokens[facet_kv.first];
|
||||
}
|
||||
|
||||
for(auto& facet_kv: this_facet.value_result_map) {
|
||||
size_t count = 0;
|
||||
if(acc_facet.value_result_map.count(facet_kv.first) == 0) {
|
||||
// not found, so set it
|
||||
count = facet_kv.second.count;
|
||||
} else {
|
||||
count = acc_facet.value_result_map[facet_kv.first].count + facet_kv.second.count;
|
||||
}
|
||||
|
||||
acc_facet.value_result_map[facet_kv.first].count = count;
|
||||
|
||||
acc_facet.value_result_map[facet_kv.first].doc_id = facet_kv.second.doc_id;
|
||||
acc_facet.value_result_map[facet_kv.first].array_pos = facet_kv.second.array_pos;
|
||||
acc_facet.is_intersected = this_facet.is_intersected;
|
||||
|
||||
acc_facet.fvalue_tokens[facet_kv.first] = this_facet.fvalue_tokens[facet_kv.first];
|
||||
}
|
||||
|
||||
if(this_facet.stats.fvcount != 0) {
|
||||
acc_facet.stats.fvcount += this_facet.stats.fvcount;
|
||||
acc_facet.stats.fvsum += this_facet.stats.fvsum;
|
||||
@ -2927,7 +2941,7 @@ Option<bool> Index::search(std::vector<query_tokens_t>& field_query_tokens, cons
|
||||
for(auto & acc_facet: facets) {
|
||||
for(auto& facet_kv: acc_facet.result_map) {
|
||||
if(group_limit) {
|
||||
facet_kv.second.count = acc_facet.hash_groups[std::stoul(facet_kv.first)].size();
|
||||
facet_kv.second.count = acc_facet.hash_groups[facet_kv.first].size();
|
||||
}
|
||||
|
||||
if(estimate_facets) {
|
||||
@ -2935,6 +2949,12 @@ Option<bool> Index::search(std::vector<query_tokens_t>& field_query_tokens, cons
|
||||
}
|
||||
}
|
||||
|
||||
for(auto& facet_kv: acc_facet.value_result_map) {
|
||||
if(estimate_facets) {
|
||||
facet_kv.second.count = size_t(double(facet_kv.second.count) * (100.0f / facet_sample_percent));
|
||||
}
|
||||
}
|
||||
|
||||
if(estimate_facets) {
|
||||
acc_facet.sampled = true;
|
||||
}
|
||||
@ -4482,7 +4502,7 @@ void Index::compute_facet_infos(const std::vector<facet>& facets, facet_query_t&
|
||||
// since `field_result_ids` contains documents matched across all queries
|
||||
// value based index
|
||||
for(const auto& val : searched_tokens) {
|
||||
facet_infos[findex].hashes[facet_field.name].emplace_back(val);
|
||||
facet_infos[findex].fvalue_searched_tokens.emplace_back(val);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -4519,7 +4539,7 @@ void Index::compute_facet_infos(const std::vector<facet>& facets, facet_query_t&
|
||||
|
||||
for(size_t array_index: array_indices) {
|
||||
if(array_index < facet_hashes.size()) {
|
||||
std::string hash = std::to_string(facet_hashes[array_index]);
|
||||
uint32_t hash = facet_hashes[array_index];
|
||||
|
||||
/*LOG(INFO) << "seq_id: " << seq_id << ", hash: " << hash << ", array index: "
|
||||
<< array_index;*/
|
||||
@ -4531,7 +4551,7 @@ void Index::compute_facet_infos(const std::vector<facet>& facets, facet_query_t&
|
||||
}
|
||||
}
|
||||
} else {
|
||||
std::string hash = std::to_string(facet_hashes[0]);
|
||||
uint32_t hash = facet_hashes[0];
|
||||
if(facet_infos[findex].hashes.count(hash) == 0) {
|
||||
//LOG(INFO) << "adding searched_tokens for hash " << hash;
|
||||
facet_infos[findex].hashes.emplace(hash, searched_tokens);
|
||||
|
Loading…
x
Reference in New Issue
Block a user