facet index refactor updated changes

This commit is contained in:
krunal1313 2023-03-31 17:38:33 +05:30
parent 8b4e95e421
commit cd69111a5b
8 changed files with 331 additions and 294 deletions

View File

@ -3,8 +3,6 @@
#include "ids_t.h"
#include "tsl/htrie_map.h"
#include <list>
#include <set>
#include <mutex>
class facet_index_t {
private:
@ -21,17 +19,27 @@ private:
// Per-field facet storage: a trie keyed by facet value whose mapped value is an opaque
// id-list handle (released with ids_t::destroy_list), plus a list of count_list entries.
//
// NOTE(review): a destructor is user-defined but the copy operations are left implicit.
// Copying this struct would leave two objects whose destructors both call
// ids_t::destroy_list on the same handles — confirm it is never copied, or
// delete/define the copy and move operations.
struct facet_index_counter {
// facet value => id-list handle; the handles are released in the destructor below
tsl::htrie_map<char, void*> facet_index_map;
std::list<count_list> counter_list;
// Releases every id list held in the trie, then empties both containers.
~facet_index_counter() {
for(auto it = facet_index_map.begin(); it != facet_index_map.end(); ++it) {
ids_t::destroy_list(it.value());
}
facet_index_map.clear();
counter_list.clear();
}
};
std::map<std::string, facet_index_counter> facet_field_map;
public:
facet_index_t() = default;
~facet_index_t();
void insert(const std::string& field, const std::string& value, uint32_t id);
size_t get(const std::string& field, std::map<std::string,std::vector<uint32_t>>& result_ids);
void erase(const std::string& field);
bool contains(const std::string& field);
@ -40,4 +48,7 @@ public:
int intersect(const std::string& val, const uint32_t* result_ids, int result_id_len,
int max_facet_count, std::map<std::string, uint32_t>& found);
int get_facet(const std::string& field, const std::vector<std::string>& searched_tokens,
std::vector<std::string>& facets);
};

View File

@ -635,26 +635,24 @@ struct facet_stats_t {
struct facet {
const std::string field_name;
spp::sparse_hash_map<uint64_t, facet_count_t> result_map;
spp::sparse_hash_map<std::string, facet_count_t> result_map;
// used for facet value query
spp::sparse_hash_map<uint64_t, std::vector<std::string>> hash_tokens;
//spp::sparse_hash_map<uint64_t, std::vector<std::string>> hash_tokens;
spp::sparse_hash_map<std::string, std::vector<std::string>> facet_tokens;
// used for faceting grouped results
spp::sparse_hash_map<uint64_t, spp::sparse_hash_set<uint64_t>> hash_groups;
//spp::sparse_hash_map<uint64_t, spp::sparse_hash_set<uint64_t>> hash_groups;
facet_stats_t stats;
//dictionary of key=>pair(range_id, range_val)
std::map<int64_t, std::string> facet_range_map;
std::map<std::string, std::string> facet_range_map;
bool is_range_query;
bool sampled = false;
bool is_wildcard_match = false;
bool get_range(int64_t key, std::pair<int64_t, std::string>& range_pair)
bool get_range(std::string key, std::pair<int64_t, std::string>& range_pair)
{
if(facet_range_map.empty())
{
@ -673,7 +671,7 @@ struct facet {
}
explicit facet(const std::string& field_name,
std::map<int64_t, std::string> facet_range = {}, bool is_range_q = false)
std::map<std::string, std::string> facet_range = {}, bool is_range_q = false)
:field_name(field_name){
facet_range_map = facet_range;
is_range_query = is_range_q;
@ -684,7 +682,7 @@ struct facet_info_t {
// facet hash => resolved tokens
//std::unordered_map<uint64_t, std::vector<std::string>> hashes;
//facet name => resolved tokens
std::unordered_map<uint32_t, std::vector<std::string>> doc_id_tokens;
std::unordered_map<std::string, std::vector<std::string>> facet_tokens;
bool use_facet_query = false;
bool should_compute_stats = false;
field facet_field{"", "", false};

View File

@ -281,8 +281,6 @@ struct hnsw_index_t {
}
};
extern std::map<std::string, std::map<std::string, uint32_t>> facet_results;
class Index {
private:
mutable std::shared_mutex mutex;
@ -509,7 +507,7 @@ private:
static uint64_t facet_token_hash(const field & a_field, const std::string &token);
static void compute_facet_stats(facet &a_facet, uint64_t raw_value, const std::string & field_type);
static void compute_facet_stats(facet &a_facet, std::string raw_value, const std::string & field_type);
static void handle_doc_ops(const tsl::htrie_map<char, field>& search_schema,
nlohmann::json& update_doc, const nlohmann::json& old_doc, nlohmann::json& new_doc);

View File

@ -22,8 +22,6 @@
const std::string override_t::MATCH_EXACT = "exact";
const std::string override_t::MATCH_CONTAINS = "contains";
std::map<std::string, std::map<std::string, uint32_t>> facet_results;
struct sort_fields_guard_t {
std::vector<sort_by> sort_fields_std;
@ -1922,15 +1920,22 @@ Option<nlohmann::json> Collection::search(std::string raw_query,
facet_result["counts"] = nlohmann::json::array();
std::vector<facet_value_t> facet_values;
std::vector<std::pair<uint64_t, facet_count_t>> facet_hash_counts;
// std::vector<std::pair<uint64_t, facet_count_t>> facet_hash_counts;
for (const auto & kv : a_facet.result_map) {
facet_hash_counts.emplace_back(kv);
}
if(a_facet.is_range_query){
for(auto kv : a_facet.result_map){
// for (const auto & kv : a_facet.result_map) {
// facet_hash_counts.emplace_back(kv);
// }
auto the_field = search_schema.at(a_facet.field_name);
// keep only top K facets
//auto max_facets = std::min(max_facet_values, facet_hash_counts.size());
auto max_facets = std::min(max_facet_values, a_facet.result_map.size());
// std::nth_element(facet_hash_counts.begin(), facet_hash_counts.begin() + max_facets,
// facet_hash_counts.end(), Collection::facet_count_compare);
//LOG (INFO) << "found facet size " << a_facet.result_map.size();
for(auto& kv : a_facet.result_map) {
if(a_facet.is_range_query){
auto facet_range_iter = a_facet.facet_range_map.find(kv.first);
if(facet_range_iter != a_facet.facet_range_map.end()){
auto & facet_count = kv.second;
@ -1940,109 +1945,99 @@ Option<nlohmann::json> Collection::search(std::string raw_query,
else{
LOG (ERROR) << "range_id not found in result map.";
}
}
}
auto the_field = search_schema.at(a_facet.field_name);
// keep only top K facets
//auto max_facets = std::min(max_facet_values, facet_hash_counts.size());
auto max_facets = std::min(max_facet_values, facet_results[a_facet.field_name].size());
// std::nth_element(facet_hash_counts.begin(), facet_hash_counts.begin() + max_facets,
// facet_hash_counts.end(), Collection::facet_count_compare);
LOG (INFO) << "found_doc_seq_ids size " << facet_results[a_facet.field_name].size();
//for(size_t fi = 0; fi < max_facets; fi++) {
for(auto& it : facet_results[a_facet.field_name]) {
} else {
//facet_value_t facet_value = { kv.first, std::string(), kv.second.count};
//facet_values.emplace_back(facet_value);
if(a_facet.is_range_query){
break;
// remap facet value hash with actual string
// auto & kv = facet_hash_counts[fi];
// auto & facet_count = kv.second;
// // fetch actual facet value from representative doc id
// const std::string& seq_id_key = get_seq_id_key((uint32_t) facet_count.doc_id);
// nlohmann::json document;
// const Option<bool> & document_op = get_document_from_store(seq_id_key, document);
// if(!document_op.ok()) {
// LOG(ERROR) << "Facet fetch error. " << document_op.error();
// continue;
// }
//std::string value;
// bool facet_found = facet_value_to_string(a_facet, facet_count, document, value);
// if(!facet_found) {
// continue;
// }
std::string value = kv.first;
std::unordered_map<std::string, size_t> ftoken_pos;
//std::vector<string>& ftokens = a_facet.hash_tokens[kv.first];
std::vector<string>& ftokens = a_facet.facet_tokens[kv.first];
for(size_t ti = 0; ti < ftokens.size(); ti++) {
// if(the_field.is_bool()) {
// if(ftokens[ti] == "1") {
// ftokens[ti] = "true";
// } else {
// ftokens[ti] = "false";
// }
// }
const std::string& resolved_token = ftokens[ti];
ftoken_pos[resolved_token] = ti;
}
const std::string& last_full_q_token = ftokens.empty() ? "" : ftokens.back();
// 2 passes: first identify tokens that need to be highlighted and then construct highlighted text
bool is_cyrillic = Tokenizer::is_cyrillic(the_field.locale);
bool normalise = is_cyrillic ? false : true;
Tokenizer tokenizer(value, normalise, !the_field.is_string(), the_field.locale, symbols_to_index, token_separators);
// secondary tokenizer used for specific languages that requires transliteration
// we use 2 tokenizers so that the original text offsets are available for highlighting
Tokenizer word_tokenizer("", true, false, the_field.locale, symbols_to_index, token_separators);
std::string raw_token;
size_t raw_token_index = 0, tok_start = 0, tok_end = 0;
// need an ordered map here to ensure that it is ordered by the key (start offset)
std::map<size_t, size_t> token_offsets;
size_t prefix_token_start_index = 0;
while(tokenizer.next(raw_token, raw_token_index, tok_start, tok_end)) {
if(is_cyrillic) {
word_tokenizer.tokenize(raw_token);
}
auto token_pos_it = ftoken_pos.find(raw_token);
if(token_pos_it != ftoken_pos.end()) {
token_offsets[tok_start] = tok_end;
if(raw_token == last_full_q_token) {
prefix_token_start_index = tok_start;
}
}
}
auto offset_it = token_offsets.begin();
size_t i = 0;
std::stringstream highlightedss;
// loop until end index, accumulate token and complete highlighting
while(i < value.size()) {
if(offset_it != token_offsets.end()) {
if (i == offset_it->first) {
highlightedss << highlight_start_tag;
// do prefix highlighting for non-dropped last token
size_t token_len = (i == prefix_token_start_index && token_offsets.size() == facet_query_num_tokens) ?
facet_query_last_token.size() :
(offset_it->second - i + 1);
if(i == prefix_token_start_index && token_offsets.size() == facet_query_num_tokens) {
token_len = std::min((offset_it->second - i + 1), facet_query_last_token.size());
} else {
token_len = (offset_it->second - i + 1);
}
for(size_t j = 0; j < token_len; j++) {
highlightedss << value[i + j];
}
highlightedss << highlight_end_tag;
offset_it++;
i += token_len;
continue;
}
}
highlightedss << value[i];
i++;
}
facet_value_t facet_value = {value, highlightedss.str(), kv.second.count};
facet_values.emplace_back(facet_value);
}
// remap facet value hash with actual string
// auto & kv = facet_hash_counts[fi];
// auto & facet_count = kv.second;
// // fetch actual facet value from representative doc id
// const std::string& seq_id_key = get_seq_id_key((uint32_t) facet_count.doc_id);
// nlohmann::json document;
// const Option<bool> & document_op = get_document_from_store(seq_id_key, document);
// if(!document_op.ok()) {
// LOG(ERROR) << "Facet fetch error. " << document_op.error();
// continue;
// }
//std::string value;
// bool facet_found = facet_value_to_string(a_facet, facet_count, document, value);
// if(!facet_found) {
// continue;
// }
// std::unordered_map<std::string, size_t> ftoken_pos;
// std::vector<string>& ftokens = a_facet.hash_tokens[kv.first];
// for(size_t ti = 0; ti < ftokens.size(); ti++) {
// if(the_field.is_bool()) {
// if(ftokens[ti] == "1") {
// ftokens[ti] = "true";
// } else {
// ftokens[ti] = "false";
// }
// }
// const std::string& resolved_token = ftokens[ti];
// ftoken_pos[resolved_token] = ti;
// }
// const std::string& last_full_q_token = ftokens.empty() ? "" : ftokens.back();
// // 2 passes: first identify tokens that need to be highlighted and then construct highlighted text
// bool is_cyrillic = Tokenizer::is_cyrillic(the_field.locale);
// bool normalise = is_cyrillic ? false : true;
// Tokenizer tokenizer(value, normalise, !the_field.is_string(), the_field.locale, symbols_to_index, token_separators);
// // secondary tokenizer used for specific languages that requires transliteration
// // we use 2 tokenizers so that the original text offsets are available for highlighting
// Tokenizer word_tokenizer("", true, false, the_field.locale, symbols_to_index, token_separators);
// std::string raw_token;
// size_t raw_token_index = 0, tok_start = 0, tok_end = 0;
// // need an ordered map here to ensure that it is ordered by the key (start offset)
// std::map<size_t, size_t> token_offsets;
// size_t prefix_token_start_index = 0;
// while(tokenizer.next(raw_token, raw_token_index, tok_start, tok_end)) {
// if(is_cyrillic) {
// word_tokenizer.tokenize(raw_token);
// }
// auto token_pos_it = ftoken_pos.find(raw_token);
// if(token_pos_it != ftoken_pos.end()) {
// token_offsets[tok_start] = tok_end;
// if(raw_token == last_full_q_token) {
// prefix_token_start_index = tok_start;
// }
// }
// }
// auto offset_it = token_offsets.begin();
// size_t i = 0;
// std::stringstream highlightedss;
// // loop until end index, accumulate token and complete highlighting
// while(i < value.size()) {
// if(offset_it != token_offsets.end()) {
// if (i == offset_it->first) {
// highlightedss << highlight_start_tag;
// // do prefix highlighting for non-dropped last token
// size_t token_len = (i == prefix_token_start_index && token_offsets.size() == facet_query_num_tokens) ?
// facet_query_last_token.size() :
// (offset_it->second - i + 1);
// if(i == prefix_token_start_index && token_offsets.size() == facet_query_num_tokens) {
// token_len = std::min((offset_it->second - i + 1), facet_query_last_token.size());
// } else {
// token_len = (offset_it->second - i + 1);
// }
// for(size_t j = 0; j < token_len; j++) {
// highlightedss << value[i + j];
// }
// highlightedss << highlight_end_tag;
// offset_it++;
// i += token_len;
// continue;
// }
// }
// highlightedss << value[i];
// i++;
// }
//facet_value_t facet_value = {value, highlightedss.str(), facet_count.count};
facet_value_t facet_value = { it.first, std::string(), it.second};
facet_values.emplace_back(facet_value);
}
std::stable_sort(facet_values.begin(), facet_values.end(), Collection::facet_count_str_compare);
@ -2066,7 +2061,7 @@ Option<nlohmann::json> Collection::search(std::string raw_query,
facet_result["stats"]["avg"] = (a_facet.stats.fvsum / a_facet.stats.fvcount);
}
facet_result["stats"]["total_values"] = facet_hash_counts.size();
facet_result["stats"]["total_values"] = facet_values.size();
result["facet_counts"].push_back(facet_result);
}
@ -4659,9 +4654,9 @@ Option<bool> Collection::parse_facet(const std::string& facet_field, std::vector
for(const auto& tup : tupVec){
int64_t lower_range = std::get<0>(tup);
int64_t upper_range = std::get<1>(tup);
std::string range_val = std::get<2>(tup);
const std::string& lower_range = std::to_string(std::get<0>(tup));
const std::string& upper_range = std::to_string(std::get<1>(tup));
const std::string& range_val = std::get<2>(tup);
//check if ranges are continuous or not
if((!range_map.empty()) && (range_map.find(lower_range)== range_map.end())){
std::string error = "Ranges in range facet syntax should be continous.";

View File

@ -51,25 +51,6 @@ void facet_index_t::insert(const std::string& field, const std::string& value, u
}
}
// Collects, for every facet value indexed under `field`, the ids stored for that value.
// Ids are appended to result_ids[<facet value>]; entries already present in result_ids are kept.
// Returns 0 when `field` has no facet index, otherwise the number of keys in result_ids.
size_t facet_index_t::get(const std::string& field,
std::map<std::string,std::vector<uint32_t>>& result_ids) {
const auto& facet_field_it = facet_field_map.find(field);
if(facet_field_it == facet_field_map.end()) {
return 0;
}
auto& facet_index_map = facet_field_it->second.facet_index_map;
for(auto it = facet_index_map.begin(); it != facet_index_map.end(); ++it) {
// NOTE(review): `ids` is never released in this function; if ids_t::uncompress() returns a
// caller-owned buffer this leaks once per facet value — confirm ownership.
auto ids = ids_t::uncompress(it.value());
// NOTE(review): num_ids() receives the uncompressed buffer rather than it.value();
// verify which handle it expects. It is also re-evaluated on every iteration.
for(auto i = 0; i < ids_t::num_ids(ids); ++i) {
result_ids[it.key()].emplace_back(ids[i]);
}
}
return result_ids.size();
}
bool facet_index_t::contains(const std::string& field) {
const auto& facet_field_it = facet_field_map.find(field);
@ -77,38 +58,14 @@ bool facet_index_t::contains(const std::string& field) {
return false;
}
// auto& facet_index_map = facet_field_it->second.facet_index_map;
// LOG(INFO) << "Size of facet_field " << field << " " << facet_index_map.size();
// for(auto it = facet_index_map.begin(); it != facet_index_map.end(); ++it) {
// LOG (INFO) << "facet_value " << it.key() << " with ids as follow";
// auto ids = ids_t::uncompress(it.value());
// for(auto i = 0; i < ids_t::num_ids(ids); ++i) {
// LOG(INFO) << ids[i];
// }
// }
return true;
}
void facet_index_t::erase(const std::string& field) {
const auto& facet_field_it = facet_field_map.find(field);
if(facet_field_it == facet_field_map.end()) {
return;
const auto it = facet_field_map.find(field);
if(it != facet_field_map.end()) {
facet_field_map.erase(field);
}
auto& facet_index_map = facet_field_it->second.facet_index_map;
for(auto it = facet_index_map.begin(); it != facet_index_map.end(); ++it) {
ids_t::destroy_list(it.value());
}
facet_index_map.clear();
facet_field_it->second.counter_list.clear();
facet_field_map.erase(field);
}
size_t facet_index_t::size() {
@ -132,39 +89,68 @@ int facet_index_t::intersect(const std::string& field, const uint32_t* result_id
// LOG (INFO) << "facet_index_map size " << facet_index_map.size()
// << " , counter_list size " << counter_list.size();
auto counter_list_it = counter_list.begin();
int facet_count = 0;
std::vector<uint32_t> id_list;
const auto max_facets = std::min((int)counter_list.size(), max_facet_count);
while(facet_count < max_facets) {
for(const auto& counter_list_it : counter_list) {
//LOG (INFO) << "checking ids in facet_value " << counter_list_it->facet_value
// << " having total count " << counter_list_it->count;
auto ids = facet_index_map.at(counter_list_it->facet_value);
auto id_list = ids_t::uncompress(ids);
auto ids = facet_index_map.at(counter_list_it.facet_value);
ids_t::uncompress(ids, id_list);
const auto ids_len = id_list.size();
int count = 0;
for(int i = 0; i < result_ids_len; ++i) {
if(std::binary_search(id_list, id_list + ids_t::num_ids(id_list), result_ids[i])) {
if(std::binary_search(id_list.begin(), id_list.end(), result_ids[i])) {
++count;
}
}
if(count) {
//LOG (INFO) << "fount count " << count << " for facet " << counter_list_it->facet_value;
found[counter_list_it->facet_value] += count;
found[counter_list_it.facet_value] = count;
if(found.size() == max_facets) {
break;
}
}
++facet_count;
++counter_list_it;
id_list.clear();
}
return found.size();
}
facet_index_t::~facet_index_t() {
for(auto it = facet_field_map.begin(); it != facet_field_map.end(); ++it) {
erase(it->first);
// Finds the facet values of `field` that contain any of `searched_tokens` as a
// case-insensitive substring and appends them, in their original casing, to `facets`.
//
// Matches are appended token by token, so a facet value that matches several tokens
// appears once per matching token; callers needing unique values must de-duplicate.
//
// Returns 0 when `field` has no facet index, otherwise the size of `facets` after appending.
int facet_index_t::get_facet(const std::string& field, const std::vector<std::string>& searched_tokens,
                             std::vector<std::string>& facets) {
    const auto facet_field_it = facet_field_map.find(field);
    if(facet_field_it == facet_field_map.end()) {
        return 0;
    }

    // Bind by reference: this is a read-only scan, and copying the whole trie on every
    // call is expensive (the function is invoked repeatedly while computing facet infos).
    const auto& facet_index_map = facet_field_it->second.facet_index_map;

    // tolower() is undefined for negative char values, so route each byte through unsigned char.
    const auto to_lower = [](unsigned char ch) { return static_cast<char>(::tolower(ch)); };

    for(const auto& token : searched_tokens) {
        std::string token_string = token;
        std::transform(token_string.begin(), token_string.end(), token_string.begin(), to_lower);

        for(auto facet_index_map_it = facet_index_map.begin();
            facet_index_map_it != facet_index_map.end(); ++facet_index_map_it) {
            // key() rebuilds the string from the trie, so fetch it once and reuse it
            const std::string facet_value = facet_index_map_it.key();

            std::string facet_string = facet_value;
            std::transform(facet_string.begin(), facet_string.end(), facet_string.begin(), to_lower);

            if(facet_string.find(token_string) != std::string::npos) {
                facets.emplace_back(facet_value);
            }
        }
    }

    return facets.size();
}
// Nothing to release by hand: destroying facet_field_map destroys each per-field
// facet_index_counter, whose own destructor frees the id lists it holds.
facet_index_t::~facet_index_t() = default;

View File

@ -62,6 +62,8 @@ Index::Index(const std::string& name, const uint32_t collection_id, const Store*
search_schema(search_schema),
seq_ids(new id_list_t(256)), symbols_to_index(symbols_to_index), token_separators(token_separators) {
facet_index_v4 = new facet_index_t();
for(const auto& a_field: search_schema) {
if(!a_field.index) {
continue;
@ -102,9 +104,6 @@ Index::Index(const std::string& name, const uint32_t collection_id, const Store*
if(a_field.facet) {
//initialize_facet_indexes(a_field);
if(facet_index_v4 == nullptr) {
facet_index_v4 = new facet_index_t();
}
}
// initialize for non-string facet fields
@ -1173,9 +1172,9 @@ void Index::initialize_facet_indexes(const field& facet_field) {
// }
}
void Index::compute_facet_stats(facet &a_facet, uint64_t raw_value, const std::string & field_type) {
void Index::compute_facet_stats(facet &a_facet, std::string raw_value, const std::string & field_type) {
if(field_type == field_types::INT32 || field_type == field_types::INT32_ARRAY) {
int32_t val = raw_value;
int32_t val = std::stoi(raw_value);
if (val < a_facet.stats.fvmin) {
a_facet.stats.fvmin = val;
}
@ -1185,7 +1184,7 @@ void Index::compute_facet_stats(facet &a_facet, uint64_t raw_value, const std::s
a_facet.stats.fvsum += val;
a_facet.stats.fvcount++;
} else if(field_type == field_types::INT64 || field_type == field_types::INT64_ARRAY) {
int64_t val = raw_value;
int64_t val = std::stol(raw_value);
if(val < a_facet.stats.fvmin) {
a_facet.stats.fvmin = val;
}
@ -1195,7 +1194,7 @@ void Index::compute_facet_stats(facet &a_facet, uint64_t raw_value, const std::s
a_facet.stats.fvsum += val;
a_facet.stats.fvcount++;
} else if(field_type == field_types::FLOAT || field_type == field_types::FLOAT_ARRAY) {
float val = reinterpret_cast<float&>(raw_value);
float val = std::stof(raw_value);
if(val < a_facet.stats.fvmin) {
a_facet.stats.fvmin = val;
}
@ -1219,7 +1218,7 @@ void Index::do_facets(std::vector<facet> & facets, facet_query_t & facet_query,
const auto& facet_field = facet_infos[findex].facet_field;
const bool use_facet_query = facet_infos[findex].use_facet_query;
//const auto& fquery_hashes = facet_infos[findex].hashes;
const auto& fquery_doc_id_tokens = facet_infos[findex].doc_id_tokens;
const auto& fquery_facet_tokens = facet_infos[findex].facet_tokens;
const bool should_compute_stats = facet_infos[findex].should_compute_stats;
auto sort_index_it = sort_index.find(a_facet.field_name);
@ -1232,9 +1231,41 @@ void Index::do_facets(std::vector<facet> & facets, facet_query_t & facet_query,
// size_t facet_hash_count = 1;
// const auto& field_facet_mapping_it = facet_index_v3.find(a_facet.field_name);
// const auto& field_single_val_facet_mapping_it = single_val_facet_index_v3.find(a_facet.field_name);
std::map<std::string, uint32_t> facet_results;
facet_index_v4->intersect(a_facet.field_name, result_ids,
results_size, max_facet_count, facet_results[a_facet.field_name]);
results_size, max_facet_count, facet_results);
//LOG(INFO) << "facet_results size " << facet_results.size();
for(const auto& kv : facet_results) {
//range facet processing
if(a_facet.is_range_query) {
const auto doc_val = kv.first;
std::pair<std::string, std::string> range_pair {};
if(a_facet.get_range(doc_val, range_pair)) {
const auto& range_id = range_pair.first;
facet_count_t& facet_count = a_facet.result_map[range_id];
facet_count.count = kv.second;
}
} else if(use_facet_query) {
if (fquery_facet_tokens.find(kv.first) != fquery_facet_tokens.end()) {
a_facet.facet_tokens[kv.first] = fquery_facet_tokens.at(kv.first);
facet_count_t& facet_count = a_facet.result_map[kv.first];
facet_count.count = kv.second;
}
} else {
facet_count_t& facet_count = a_facet.result_map[kv.first];
facet_count.count = kv.second;
}
if(should_compute_stats) {
//LOG(INFO) << "Computing facet stas for facet " << a_facet.field_name;
for(int i = 0; i < kv.second; ++i) {
compute_facet_stats(a_facet, kv.first, facet_field.type);
}
}
}
}
// for(size_t i = 0; i < results_size; i++) {
// // if sampling is enabled, we will skip a portion of the results to speed up things
@ -1266,7 +1297,6 @@ void Index::do_facets(std::vector<facet> & facets, facet_query_t & facet_query,
// const uint64_t distinct_id = group_limit ? get_distinct_id(group_by_fields, doc_seq_id) : 0;
// //for(size_t j = 0; j < facet_hash_count; j++) {
// for(size_t j = 0; j < found_doc_seq_ids.size(); j++) {
// // if(facet_field.is_array()) {
// // fhash = facet_map_it->second.hashes[j];
// // }
@ -1306,7 +1336,7 @@ void Index::do_facets(std::vector<facet> & facets, facet_query_t & facet_query,
// }
// }
//}
}
//}
}
void Index::aggregate_topster(Topster* agg_topster, Topster* index_topster) {
@ -2399,18 +2429,22 @@ Option<bool> Index::search(std::vector<query_tokens_t>& field_query_tokens, cons
while (it.valid()) {
uint32_t seq_id = it.id();
uint64_t distinct_id = seq_id;
if (group_limit != 0) {
distinct_id = get_distinct_id(group_by_fields, seq_id);
if(excluded_group_ids.count(distinct_id) != 0) {
continue;
}
}
// if (group_limit != 0) {
// distinct_id = get_distinct_id(group_by_fields, seq_id);
// if(excluded_group_ids.count(distinct_id) != 0) {
// continue;
// }
// }
int64_t scores[3] = {0};
scores[0] = seq_id;
int64_t match_score_index = -1;
result_ids.push_back(seq_id);
if(group_limit == 0) {
KV kv(searched_queries.size(), seq_id, distinct_id, match_score_index, scores);
topster->add(&kv);
}
KV kv(searched_queries.size(), seq_id, distinct_id, match_score_index, scores, nullptr);
int ret = topster->add(&kv);
@ -2507,12 +2541,12 @@ Option<bool> Index::search(std::vector<query_tokens_t>& field_query_tokens, cons
}
uint64_t distinct_id = seq_id;
if (group_limit != 0) {
distinct_id = get_distinct_id(group_by_fields, seq_id);
if(excluded_group_ids.count(distinct_id) != 0) {
continue;
}
}
// if (group_limit != 0) {
// distinct_id = get_distinct_id(group_by_fields, seq_id);
// if(excluded_group_ids.count(distinct_id) != 0) {
// continue;
// }
// }
auto vec_dist_score = (field_vector_index->distance_type == cosine) ? std::abs(dist_label.first) :
dist_label.first;
@ -2530,9 +2564,9 @@ Option<bool> Index::search(std::vector<query_tokens_t>& field_query_tokens, cons
KV kv(searched_queries.size(), seq_id, distinct_id, match_score_index, scores, nullptr);
int ret = topster->add(&kv);
if(group_limit != 0 && ret < 2) {
groups_processed[distinct_id]++;
}
// if(group_limit != 0 && ret < 2) {
// groups_processed[distinct_id]++;
// }
nearest_ids.push_back(seq_id);
}
@ -2826,7 +2860,8 @@ Option<bool> Index::search(std::vector<query_tokens_t>& field_query_tokens, cons
bool estimate_facets = (facet_sample_percent < 100 && all_result_ids_len > facet_sample_threshold);
if(!facets.empty()) {
const size_t num_threads = std::min(concurrency, all_result_ids_len);
//const size_t num_threads = std::min(concurrency, all_result_ids_len);
const size_t num_threads = 1;
const size_t window_size = (num_threads == 0) ? 0 :
(all_result_ids_len + num_threads - 1) / num_threads; // rounds up
size_t num_processed = 0;
@ -2897,10 +2932,10 @@ Option<bool> Index::search(std::vector<query_tokens_t>& field_query_tokens, cons
for(auto & facet_kv: this_facet.result_map) {
if(group_limit) {
// we have to add all group sets
acc_facet.hash_groups[facet_kv.first].insert(
this_facet.hash_groups[facet_kv.first].begin(),
this_facet.hash_groups[facet_kv.first].end()
);
// acc_facet.hash_groups[facet_kv.first].insert(
// this_facet.hash_groups[facet_kv.first].begin(),
// this_facet.hash_groups[facet_kv.first].end()
// );
} else {
size_t count = 0;
if(acc_facet.result_map.count(facet_kv.first) == 0) {
@ -2912,9 +2947,10 @@ Option<bool> Index::search(std::vector<query_tokens_t>& field_query_tokens, cons
acc_facet.result_map[facet_kv.first].count = count;
}
acc_facet.result_map[facet_kv.first].doc_id = facet_kv.second.doc_id;
acc_facet.result_map[facet_kv.first].array_pos = facet_kv.second.array_pos;
acc_facet.hash_tokens[facet_kv.first] = this_facet.hash_tokens[facet_kv.first];
//acc_facet.result_map[facet_kv.first].doc_id = facet_kv.second.doc_id;
//acc_facet.result_map[facet_kv.first].array_pos = facet_kv.second.array_pos;
//acc_facet.hash_tokens[facet_kv.first] = this_facet.hash_tokens[facet_kv.first];
acc_facet.facet_tokens[facet_kv.first] = this_facet.facet_tokens[facet_kv.first];
}
if(this_facet.stats.fvcount != 0) {
@ -2928,9 +2964,9 @@ Option<bool> Index::search(std::vector<query_tokens_t>& field_query_tokens, cons
for(auto & acc_facet: facets) {
for(auto& facet_kv: acc_facet.result_map) {
if(group_limit) {
facet_kv.second.count = acc_facet.hash_groups[facet_kv.first].size();
}
// if(group_limit) {
// facet_kv.second.count = acc_facet.hash_groups[facet_kv.first].size();
// }
if(estimate_facets) {
facet_kv.second.count = size_t(double(facet_kv.second.count) * (100.0f / facet_sample_percent));
@ -2955,8 +2991,6 @@ Option<bool> Index::search(std::vector<query_tokens_t>& field_query_tokens, cons
facet_infos, group_limit, group_by_fields, &included_ids_vec[0],
included_ids_vec.size(), max_facet_values);
facet_index_v4->contains("tags");
all_result_ids_len += curated_topster->size;
delete [] all_result_ids;
@ -3665,12 +3699,12 @@ void Index::search_across_fields(const std::vector<token_t>& query_tokens,
}
uint64_t distinct_id = seq_id;
if(group_limit != 0) {
distinct_id = get_distinct_id(group_by_fields, seq_id);
if(excluded_group_ids.count(distinct_id) != 0) {
return;
}
}
// if(group_limit != 0) {
// distinct_id = get_distinct_id(group_by_fields, seq_id);
// if(excluded_group_ids.count(distinct_id) != 0) {
// return;
// }
// }
int64_t scores[3] = {0};
int64_t match_score_index = -1;
@ -4267,18 +4301,21 @@ void Index::do_infix_search(const size_t num_search_fields, const std::vector<se
100, scores, match_score_index);
uint64_t distinct_id = seq_id;
if(group_limit != 0) {
distinct_id = get_distinct_id(group_by_fields, seq_id);
if(excluded_group_ids.count(distinct_id) != 0) {
continue;
}
}
KV kv(searched_queries.size(), seq_id, distinct_id, match_score_index, scores);
int ret = actual_topster->add(&kv);
if(group_limit != 0 && ret < 2) {
groups_processed[distinct_id]++;
// if(group_limit != 0) {
// distinct_id = get_distinct_id(group_by_fields, seq_id);
// if(excluded_group_ids.count(distinct_id) != 0) {
// continue;
// }
// }
if(group_limit == 0) {
KV kv(searched_queries.size(), seq_id, distinct_id, match_score_index, scores);
int ret = actual_topster->add(&kv);
}
// if(group_limit != 0 && ret < 2) {
// groups_processed[distinct_id]++;
// }
if(((i + 1) % (1 << 12)) == 0) {
BREAK_CIRCUIT_BREAKER
@ -4374,10 +4411,6 @@ void Index::compute_facet_infos(const std::vector<facet>& facets, facet_query_t&
// && (field_single_val_facet_mapping_it == single_val_facet_index_v3.end())) {
// continue;
// }
std::map<std::string, std::vector<uint32_t>> found_doc_ids;
if(facet_index_v4->get(a_facet.field_name, found_doc_ids) == 0) {
continue;
}
facet_infos[findex].use_facet_query = false;
@ -4442,6 +4475,16 @@ void Index::compute_facet_infos(const std::vector<facet>& facets, facet_query_t&
for(auto leaf: searched_query) {
posting_lists.push_back(leaf->values);
std::string tok(reinterpret_cast<char*>(leaf->key), leaf->key_len - 1);
//convert again to boolean string to help search in facet_index map
if (facet_field.is_bool()) {
if (tok == "1") {
tok = "true";
} else if (tok == "0") {
tok = "false";
}
}
searched_tokens.push_back(tok);
//LOG(INFO) << "tok: " << tok;
}
@ -4468,6 +4511,7 @@ void Index::compute_facet_infos(const std::vector<facet>& facets, facet_query_t&
if(!id_matched) {
continue;
}
//LOG(INFO) << "seq_id matched : " << seq_id;
// if(facet_field.is_array()) {
// const auto doc_fvalues_it = field_facet_mapping_it->second[seq_id % ARRAY_FACET_DIM]->find(seq_id);
@ -4501,19 +4545,18 @@ void Index::compute_facet_infos(const std::vector<facet>& facets, facet_query_t&
// facet_infos[findex].hashes.emplace(hash, searched_tokens);
// }
// }
for(const auto& found_doc_it : found_doc_ids) {
const auto& ids = found_doc_it.second;
if(std::binary_search(ids.begin(), ids.end(), seq_id)){
for(const auto& doc_id : ids) {
if(facet_infos[findex].doc_id_tokens.count(doc_id) == 0) {
facet_infos[findex].doc_id_tokens.emplace(doc_id, searched_tokens);
}
}
}
std::vector<std::string> matched_facets;
if(facet_index_v4->get_facet(a_facet.field_name, searched_tokens, matched_facets)) {
for(const auto& facet : matched_facets) {
if(facet_infos[findex].facet_tokens.count(facet) == 0) {
LOG(INFO) << "adding facet " << facet << " in facet_info";
facet_infos[findex].facet_tokens.emplace(facet, searched_tokens);
}
}
}
}
delete [] field_result_ids;
}
}
@ -4621,20 +4664,20 @@ void Index::search_wildcard(filter_node_t const* const& filter_tree_root,
100, scores, match_score_index);
uint64_t distinct_id = seq_id;
if(group_limit != 0) {
distinct_id = get_distinct_id(group_by_fields, seq_id);
if(excluded_group_ids.count(distinct_id) != 0) {
continue;
}
// if(group_limit != 0) {
// distinct_id = get_distinct_id(group_by_fields, seq_id);
// if(excluded_group_ids.count(distinct_id) != 0) {
// continue;
// }
// }
if(group_limit == 0) {
KV kv(searched_queries.size(), seq_id, distinct_id, match_score_index, scores);
int ret = topsters[thread_id]->add(&kv);
}
KV kv(searched_queries.size(), seq_id, distinct_id, match_score_index, scores);
int ret = topsters[thread_id]->add(&kv);
if(group_limit != 0 && ret < 2) {
tgroups_processed[thread_id][distinct_id]++;
}
// if(group_limit != 0 && ret < 2) {
// tgroups_processed[thread_id][distinct_id]++;
// }
if(check_for_circuit_break && ((i + 1) % (1 << 15)) == 0) {
// check only once every 2^15 docs to reduce overhead
BREAK_CIRCUIT_BREAKER
@ -5220,11 +5263,16 @@ void Index::score_results(const std::vector<sort_by> & sort_fields, const uint16
uint64_t distinct_id = seq_id;
if(group_limit != 0) {
distinct_id = get_distinct_id(group_by_fields, seq_id);
}
// if(group_limit != 0) {
// distinct_id = get_distinct_id(group_by_fields, seq_id);
// groups_processed.emplace(distinct_id);
// }
//LOG(INFO) << "Seq id: " << seq_id << ", match_score: " << match_score;
if(group_limit == 0) {
KV kv(query_index, seq_id, distinct_id, match_score_index, scores);
topster->add(&kv);
}
KV kv(query_index, seq_id, distinct_id, match_score_index, scores);
int ret = topster->add(&kv);
if(group_limit != 0 && ret < 2) {
@ -5239,7 +5287,6 @@ void Index::score_results(const std::vector<sort_by> & sort_fields, const uint16
uint64_t Index::get_distinct_id(const std::vector<std::string>& group_by_fields,
const uint32_t seq_id) const {
uint64_t distinct_id = 1; // some constant initial value
std::hash<std::string> hasher;
// calculate hash from group_by_fields
for(const auto& field: group_by_fields) {
// const auto& field_facet_mapping_it = facet_index_v3.find(field);
@ -5277,9 +5324,6 @@ uint64_t Index::get_distinct_id(const std::vector<std::string>& group_by_fields,
// distinct_id = StringUtils::hash_combine(distinct_id, facet_hash);
// }
const auto& hash = hasher(field);
distinct_id = StringUtils::hash_combine(distinct_id, hash);
}
return distinct_id;

View File

@ -103,11 +103,11 @@ TEST_F(CollectionFacetingTest, FacetCounts) {
ASSERT_STREQ("tags", results["facet_counts"][0]["field_name"].get<std::string>().c_str());
ASSERT_EQ(2, results["facet_counts"][0]["counts"].size());
ASSERT_STREQ("gold", results["facet_counts"][0]["counts"][0]["value"].get<std::string>().c_str());
ASSERT_EQ(3, (int) results["facet_counts"][0]["counts"][0]["count"]);
ASSERT_STREQ("bronze", results["facet_counts"][0]["counts"][0]["value"].get<std::string>().c_str());
ASSERT_EQ(2, (int) results["facet_counts"][0]["counts"][0]["count"]);
ASSERT_STREQ("silver", results["facet_counts"][0]["counts"][1]["value"].get<std::string>().c_str());
ASSERT_EQ(3, (int) results["facet_counts"][0]["counts"][1]["count"]);
ASSERT_STREQ("FINE PLATINUM", results["facet_counts"][0]["counts"][1]["value"].get<std::string>().c_str());
ASSERT_EQ(1, (int) results["facet_counts"][0]["counts"][1]["count"]);
// 2 facets, 1 text query with no filters
facets.clear();
@ -230,12 +230,12 @@ TEST_F(CollectionFacetingTest, FacetCounts) {
ASSERT_STREQ("age", results["facet_counts"][0]["field_name"].get<std::string>().c_str());
ASSERT_EQ(1, (int) results["facet_counts"][0]["counts"][0]["count"]);
ASSERT_STREQ("21", results["facet_counts"][0]["counts"][0]["value"].get<std::string>().c_str());
ASSERT_STREQ("<mark>2</mark>1", results["facet_counts"][0]["counts"][0]["highlighted"].get<std::string>().c_str());
ASSERT_STREQ("24", results["facet_counts"][0]["counts"][0]["value"].get<std::string>().c_str());
ASSERT_STREQ("<mark>2</mark>4", results["facet_counts"][0]["counts"][0]["highlighted"].get<std::string>().c_str());
ASSERT_EQ(1, (int) results["facet_counts"][0]["counts"][1]["count"]);
ASSERT_STREQ("24", results["facet_counts"][0]["counts"][1]["value"].get<std::string>().c_str());
ASSERT_STREQ("<mark>2</mark>4", results["facet_counts"][0]["counts"][1]["highlighted"].get<std::string>().c_str());
ASSERT_STREQ("21", results["facet_counts"][0]["counts"][1]["value"].get<std::string>().c_str());
ASSERT_STREQ("<mark>2</mark>1", results["facet_counts"][0]["counts"][1]["highlighted"].get<std::string>().c_str());
// facet on a float field without query to check on stats
results = coll_array_fields->search("*", query_fields, "", {"rating"}, sort_fields, {0}, 10, 1, FREQUENCY,
@ -258,7 +258,7 @@ TEST_F(CollectionFacetingTest, FacetCounts) {
{false}, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "rating: 7").get();
ASSERT_EQ(5, results["hits"].size());
ASSERT_EQ(1, results["facet_counts"].size());
ASSERT_STREQ("rating", results["facet_counts"][0]["field_name"].get<std::string>().c_str());
@ -278,7 +278,7 @@ TEST_F(CollectionFacetingTest, FacetCounts) {
results = coll_array_fields->search("*", query_fields, "", {"timestamps"}, sort_fields, {0}, 10, 1, FREQUENCY,
{false}, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "timestamps: 142189002").get();
spp::sparse_hash_set<std::string>(), 10, "timestamps: 142189002").get();
ASSERT_EQ(5, results["hits"].size());
ASSERT_EQ(1, results["facet_counts"].size());
ASSERT_EQ(1, results["facet_counts"][0]["counts"].size());
@ -607,7 +607,6 @@ TEST_F(CollectionFacetingTest, FacetCountsHighlighting) {
ASSERT_STREQ("Cell Phone <mark>Acces</mark>sories", results["facet_counts"][0]["counts"][0]["highlighted"].get<std::string>().c_str());
// ensure that only the last token is treated as prefix search
coll1->remove("100");
doc["categories"] = {"Cell Phones", "Cell Phone Accessories", "Cellophanes"};
coll1->add(doc.dump());
@ -616,6 +615,8 @@ TEST_F(CollectionFacetingTest, FacetCountsHighlighting) {
token_ordering::FREQUENCY, {true}, 10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "categories:cell ph").get();
LOG(INFO) << results.dump();
ASSERT_EQ(1, results["facet_counts"].size());
ASSERT_EQ(2, results["facet_counts"][0]["counts"].size());
@ -673,6 +674,7 @@ TEST_F(CollectionFacetingTest, FacetStatOnFloatFields) {
1, FREQUENCY, {false});
auto results = res_op.get();
LOG(INFO) << results.dump();
ASSERT_EQ(7, results["hits"].size());
@ -746,8 +748,8 @@ TEST_F(CollectionFacetingTest, FacetCountOnSimilarStrings) {
ASSERT_EQ(2, results["hits"].size());
ASSERT_EQ(2, results["facet_counts"][0]["counts"].size());
ASSERT_STREQ("India in England", results["facet_counts"][0]["counts"][0]["value"].get<std::string>().c_str());
ASSERT_STREQ("England in India", results["facet_counts"][0]["counts"][1]["value"].get<std::string>().c_str());
ASSERT_STREQ("England in India", results["facet_counts"][0]["counts"][0]["value"].get<std::string>().c_str());
ASSERT_STREQ("India in England", results["facet_counts"][0]["counts"][1]["value"].get<std::string>().c_str());
collectionManager.drop_collection("coll1");
}
@ -1645,6 +1647,7 @@ TEST_F(CollectionFacetingTest, FacetIndexRefactor) {
ASSERT_EQ(5, results["hits"].size());
ASSERT_EQ(1, results["facet_counts"].size());
ASSERT_EQ(4, results["facet_counts"][0].size());
ASSERT_EQ(4, results["facet_counts"][0]["counts"].size());
ASSERT_EQ("tags", results["facet_counts"][0]["field_name"]);
ASSERT_STREQ("gold", results["facet_counts"][0]["counts"][0]["value"].get<std::string>().c_str());

View File

@ -69,6 +69,8 @@ TEST_F(CollectionGroupingTest, GroupingBasics) {
"", 10,
{}, {}, {"size"}, 2).get();
LOG(INFO) << res.dump();
ASSERT_EQ(3, res["found"].get<size_t>());
ASSERT_EQ(3, res["grouped_hits"].size());
ASSERT_EQ(11, res["grouped_hits"][0]["group_key"][0].get<size_t>());