Use facet index entirely for numbers.

This commit is contained in:
Kishore Nallan 2023-06-14 11:07:57 +05:30
parent 938b3f3b45
commit a2238afe30
5 changed files with 43 additions and 169 deletions

View File

@ -132,8 +132,8 @@ public:
// NOTE(review): presumably prepares the facet index state for `field`; the definition is not
// visible in this view -- confirm against the implementation.
void initialize(const std::string& field);
// Re-evaluates which facet index structures `field_name` needs. Low cardinality fields keep only
// the value based facet index; once a field becomes high cardinality (exceeding the facet index
// threshold) a hash based index is created and populated for it.
// This signature additionally receives the numerical index, whose per-field tree is used to
// populate the hash based index.
void handle_index_change(const std::string& field_name, size_t total_num_docs, size_t facet_index_threshold,
size_t facet_count, spp::sparse_hash_map<std::string, num_tree_t*>& numerical_index);
// Same operation as above, declared without the numerical index parameter.
void handle_index_change(const std::string& field_name, size_t total_num_docs,
size_t facet_index_threshold, size_t facet_count);
// Reports whether a hash based facet index exists for `field_name`
// (NOTE(review): inferred from the name and from how callers use the result -- confirm).
bool has_hash_index(const std::string& field_name);

View File

@ -77,12 +77,5 @@ public:
uint32_t* const& context_ids,
size_t& result_ids_len,
uint32_t*& result_ids) const;
// Counts, for every facet value in the counter list, how many of the given result ids carry that
// value. Non-zero counts are written into `found` (keyed by facet value) and the scan stops once
// `found` holds `max_facet_count` entries. When `is_wildcard_no_filter_query` is set, the count
// stored in the counter list is used as-is instead of intersecting id lists.
// Returns the number of entries in `found`.
size_t intersect(const uint32_t* result_ids, size_t result_id_len,
size_t max_facet_count, std::map<int64_t, uint32_t>& found,
bool is_wildcard_no_filter_query);
// Returns the number of entries currently held in the counter list.
size_t counter_list_size() const;
// Fills `seqid_countIndexes` with one entry per id found in the stored id lists, mapping that id
// to the values under which it is stored. Returns the size of the map afterwards, or 0 without
// touching the map when the counter list is empty.
size_t get_facet_indexes(std::map<uint32_t, std::vector<uint32_t>>& seqid_countIndexes);
};

View File

@ -55,30 +55,26 @@ void facet_index_t::insert(const std::string& field_name, bool is_string,
if(facet_index.has_value_index) {
count_list.emplace_back(fvalue.facet_value, seq_ids.size(), facet_id);
fis.facet_count_it = std::prev(count_list.end());
if(is_string) {
fis.seq_ids = SET_COMPACT_IDS(compact_id_list_t::create(seq_ids.size(), seq_ids));
}
fis.seq_ids = SET_COMPACT_IDS(compact_id_list_t::create(seq_ids.size(), seq_ids));
}
fvalue_index.emplace(fvalue.facet_value, fis);
} else if(facet_index.has_value_index) {
if(is_string) {
for(const auto id : seq_ids) {
ids_t::upsert(fvalue_index_it->seq_ids, id);
}
for(const auto id : seq_ids) {
ids_t::upsert(fvalue_index_it->seq_ids, id);
}
auto facet_count_it = fvalue_index_it->facet_count_it;
auto facet_count_it = fvalue_index_it->facet_count_it;
if(facet_count_it->facet_id == facet_id) {
facet_count_it->count = ids_t::num_ids(fvalue_index_it->seq_ids);
auto curr = facet_count_it;
while (curr != count_list.begin() && std::prev(curr)->count < curr->count) {
count_list.splice(curr, count_list, std::prev(curr)); // swaps list nodes
curr--;
}
} else {
LOG(ERROR) << "Wrong reference stored for facet " << fvalue.facet_value << " with facet_id " << facet_id;
if(facet_count_it->facet_id == facet_id) {
facet_count_it->count = ids_t::num_ids(fvalue_index_it->seq_ids);
auto curr = facet_count_it;
while (curr != count_list.begin() && std::prev(curr)->count < curr->count) {
count_list.splice(curr, count_list, std::prev(curr)); // swaps list nodes
curr--;
}
} else {
LOG(ERROR) << "Wrong reference stored for facet " << fvalue.facet_value << " with facet_id " << facet_id;
}
}
@ -162,27 +158,28 @@ size_t facet_index_t::intersect(const std::string& field, const uint32_t* result
const auto& facet_index_map = facet_field_it->second.fvalue_seq_ids;
const auto& counter_list = facet_field_it->second.counts;
// LOG (INFO) << "fvalue_seq_ids size " << fvalue_seq_ids.size()
// << " , counts size " << counts.size();
//LOG(INFO) << "fvalue_seq_ids size " << facet_index_map.size() << " , counts size " << counter_list.size();
size_t max_facets = std::min((size_t)2 * max_facet_count, counter_list.size());
std::vector<uint32_t> id_list;
for(const auto& facet_count : counter_list) {
// LOG (INFO) << "checking ids in facet_value " << counter_list_it.facet_value
// << " having total count " << counter_list_it.count;
//LOG(INFO) << "checking ids in facet_value " << facet_count.facet_value << " having total count "
// << facet_count.count << ", is_wildcard_no_filter_query: " << is_wildcard_no_filter_query;
uint32_t count = 0;
if(is_wildcard_no_filter_query) {
count = facet_count.count;
} else {
auto ids = facet_index_map.at(facet_count.facet_value).seq_ids;
ids_t::uncompress(ids, id_list);
for(size_t i = 0; i < result_ids_len; ++i) {
uint32_t* out = nullptr;
count = ArrayUtils::and_scalar(id_list.data(), id_list.size(),
result_ids, result_ids_len, &out);
delete[] out;
if(!ids) {
continue;
}
ids_t::uncompress(ids, id_list);
uint32_t* out = nullptr;
count = ArrayUtils::and_scalar(id_list.data(), id_list.size(), result_ids, result_ids_len, &out);
delete[] out;
id_list.clear();
}
@ -230,8 +227,7 @@ size_t facet_index_t::get_facet_indexes(const std::string& field_name,
}
void facet_index_t::handle_index_change(const std::string& field_name, size_t total_num_docs,
size_t facet_index_threshold, size_t facet_count,
spp::sparse_hash_map<std::string, num_tree_t*>& numerical_index) {
size_t facet_index_threshold, size_t facet_count) {
// Low cardinality fields will have only value based facet index. Once a field becomes a high cardinality
// field (exceeding FACET_INDEX_THRESHOLD), we will create a hash based index and populate it.
@ -252,16 +248,6 @@ void facet_index_t::handle_index_change(const std::string& field_name, size_t to
seq_id_index_map.clear();
auto numerical_index_it = numerical_index.find(field_name);
if(numerical_index_it != numerical_index.end()) {
auto num_tree = numerical_index_it->second;
if(num_tree->get_facet_indexes(seq_id_index_map)) {
for(const auto& kv : seq_id_index_map) {
fhash_index->upsert(kv.first, kv.second);
}
}
}
facet_index.has_hash_index = true;
auto cardinality_ratio = total_num_docs / facet_count;

View File

@ -636,24 +636,16 @@ void Index::index_field_in_memory(const field& afield, std::vector<index_record>
// b) `afield` value could be empty
// non-geo faceted field should be indexed as faceted string field as well
bool non_string_facet_field = (afield.facet && !afield.is_geopoint());
bool is_facet_field = (afield.facet && !afield.is_geopoint());
if(afield.is_string() || non_string_facet_field) {
if(afield.is_string() || is_facet_field) {
std::unordered_map<std::string, std::vector<art_document>> token_to_doc_offsets;
int64_t max_score = INT64_MIN;
std::unordered_map<facet_value_id_t, std::vector<uint32_t>, facet_value_id_t::Hash> fvalue_to_seq_ids;
std::unordered_map<uint32_t, std::vector<facet_value_id_t>> seq_id_to_fvalues;
auto facet_count = 0;
if(afield.is_string()) {
facet_count = facet_index_v4->get_facet_count(afield.name);
} else {
auto numerical_index_it = numerical_index.find(afield.name);
if(numerical_index_it != numerical_index.end()) {
facet_count = numerical_index_it->second->counter_list_size();
}
}
auto facet_count = facet_index_v4->get_facet_count(afield.name);
#ifdef TEST_BUILD
facet_count = FACET_INDEX_THRESHOLD + 1;
@ -661,8 +653,7 @@ void Index::index_field_in_memory(const field& afield, std::vector<index_record>
if(afield.facet) {
size_t total_num_docs = seq_ids->num_ids();
facet_index_v4->handle_index_change(afield.name, total_num_docs, FACET_INDEX_THRESHOLD, facet_count,
numerical_index);
facet_index_v4->handle_index_change(afield.name, total_num_docs, FACET_INDEX_THRESHOLD, facet_count);
}
for(const auto& record: iter_batch) {
@ -706,13 +697,14 @@ void Index::index_field_in_memory(const field& afield, std::vector<index_record>
} else if(afield.type == field_types::FLOAT_ARRAY) {
float raw_val = field_values[i].get<float>();
auto fhash = reinterpret_cast<uint32_t&>(raw_val);
facet_value_id_t facet_value_id(std::to_string(raw_val), fhash);
facet_value_id_t facet_value_id(StringUtils::float_to_str(raw_val), fhash);
fvalue_to_seq_ids[facet_value_id].push_back(seq_id);
seq_id_to_fvalues[seq_id].push_back(facet_value_id);
} else if(afield.type == field_types::BOOL_ARRAY) {
bool raw_val = field_values[i].get<bool>();
auto fhash = (uint32_t)raw_val;
facet_value_id_t facet_value_id(std::to_string(raw_val), fhash);
auto str_val = (raw_val == 1) ? "true" : "false";
facet_value_id_t facet_value_id(str_val, fhash);
fvalue_to_seq_ids[facet_value_id].push_back(seq_id);
seq_id_to_fvalues[seq_id].push_back(facet_value_id);
}
@ -740,14 +732,15 @@ void Index::index_field_in_memory(const field& afield, std::vector<index_record>
else if(afield.type == field_types::FLOAT) {
float raw_val = document[afield.name].get<float>();
auto fhash = reinterpret_cast<uint32_t&>(raw_val);
facet_value_id_t facet_value_id(std::to_string(raw_val), fhash);
facet_value_id_t facet_value_id(StringUtils::float_to_str(raw_val), fhash);
fvalue_to_seq_ids[facet_value_id].push_back(seq_id);
seq_id_to_fvalues[seq_id].push_back(facet_value_id);
}
else if(afield.type == field_types::BOOL) {
bool raw_val = document[afield.name].get<bool>();
auto fhash = (uint32_t)raw_val;
facet_value_id_t facet_value_id(std::to_string(raw_val), fhash);
auto str_val = (raw_val == 1) ? "true" : "false";
facet_value_id_t facet_value_id(str_val, fhash);
fvalue_to_seq_ids[facet_value_id].push_back(seq_id);
seq_id_to_fvalues[seq_id].push_back(facet_value_id);
}
@ -1228,18 +1221,7 @@ void Index::do_facets(std::vector<facet> & facets, facet_query_t & facet_query,
size_t mod_value = 100 / facet_sample_percent;
auto num_facet_values = 0;
if(facet_field.is_string()) {
num_facet_values = facet_index_v4->get_facet_count(a_facet.field_name);
} else {
auto numerical_index_it = numerical_index.find(a_facet.field_name);
if(numerical_index_it != numerical_index.end()) {
num_facet_values = numerical_index_it->second->counter_list_size();
} else {
LOG(ERROR) << "facet " << a_facet.field_name << " not found in numerical index";
}
}
auto num_facet_values = facet_index_v4->get_facet_count(facet_field.name);
bool use_hashes = false;
@ -1253,36 +1235,17 @@ void Index::do_facets(std::vector<facet> & facets, facet_query_t & facet_query,
use_hashes = false;
#endif
bool is_wildcard_no_filter_query = is_wildcard_query && no_filters_provided;
bool facet_hash_index_exists = facet_index_v4->has_hash_index(a_facet.field_name);
bool facet_hash_index_exists = facet_index_v4->has_hash_index(facet_field.name);
if((num_facet_values && ((!facet_hash_index_exists || is_wildcard_no_filter_query
|| num_facet_values > 50000) && group_limit == 0) && !use_hashes) || use_facet_intersection) {
//LOG(INFO) << "Using intersection to find facets";
// LOG(INFO) << "Using intersection to find facets";
a_facet.is_intersected = true;
std::map<std::string, uint32_t> facet_results;
if(facet_field.is_string()) {
facet_index_v4->intersect(a_facet.field_name, result_ids,
results_size, max_facet_count, facet_results, is_wildcard_no_filter_query);
} else {
std::map<int64_t, uint32_t> facet_counts;
numerical_index.at(a_facet.field_name)->intersect(result_ids, results_size, max_facet_count,
facet_counts, is_wildcard_no_filter_query);
for(const auto& kv : facet_counts) {
std::string val;
if(facet_field.is_float()) {
val = StringUtils::float_to_str(int64_t_to_float(kv.first));
} else if(facet_field.is_bool()) {
val = kv.first == 1 ? "true" : "false";
} else {
val = std::to_string(kv.first);
}
facet_results[val] = kv.second;
}
}
facet_index_v4->intersect(facet_field.name, result_ids,
results_size, max_facet_count, facet_results, is_wildcard_no_filter_query);
for(const auto& kv : facet_results) {
//range facet processing

View File

@ -382,76 +382,8 @@ size_t num_tree_t::size() {
return int64map.size();
}
// Number of entries currently tracked in the counter list.
size_t num_tree_t::counter_list_size() const {
    const size_t num_entries = counter_list.size();
    return num_entries;
}
// Destroys every id list stored in int64map.
num_tree_t::~num_tree_t() {
    for(auto entry_it = int64map.begin(); entry_it != int64map.end(); ++entry_it) {
        ids_t::destroy_list(entry_it->second);
    }
}
size_t num_tree_t::intersect(const uint32_t* result_ids, size_t result_ids_len, size_t max_facet_count,
std::map<int64_t, uint32_t>& found, bool is_wildcard_no_filter_query) {
//LOG (INFO) << "intersecting field " << field;
// LOG (INFO) << "int64map size " << int64map.size()
// << " , counts size " << counts.size();
std::vector<uint32_t> id_list;
for(const auto& counter_list_it : counter_list) {
// LOG (INFO) << "checking ids in facet_value " << counter_list_it.facet_value
// << " having total count " << counter_list_it.count;
uint32_t count = 0;
if(is_wildcard_no_filter_query) {
count = counter_list_it.count;
} else {
auto ids = int64map.at(counter_list_it.facet_value);
ids_t::uncompress(ids, id_list);
const auto ids_len = id_list.size();
for(size_t i = 0; i < result_ids_len; ++i) {
uint32_t* out = nullptr;
count = ArrayUtils::and_scalar(id_list.data(), id_list.size(),
result_ids, result_ids_len, &out);
delete[] out;
}
id_list.clear();
}
if(count) {
found[counter_list_it.facet_value] = count;
if(found.size() == max_facet_count) {
break;
}
}
}
return found.size();
}
size_t num_tree_t::get_facet_indexes(std::map<uint32_t, std::vector<uint32_t>>& seqid_countIndexes) {
//check if facet field
if(counter_list.empty()) {
return 0;
}
std::vector<uint32_t> id_list;
for(auto int64map_it = int64map.begin(); int64map_it != int64map.end(); ++int64map_it) {
auto ids = int64map_it->second;
ids_t::uncompress(ids, id_list);
// emplacing seq_id => facet_id
for(const auto& id : id_list) {
seqid_countIndexes[id].emplace_back(int64map_it->first);
}
id_list.clear();
}
return seqid_countIndexes.size();
}