typesense/src/facet_index.cpp
2023-10-13 15:56:01 +05:30

363 lines
14 KiB
C++

#include "facet_index.h"
#include <tokenizer.h>
#include "string_utils.h"
#include "array_utils.h"
void facet_index_t::initialize(const std::string& field) {
const auto facet_field_map_it = facet_field_map.find(field);
if(facet_field_map_it == facet_field_map.end()) {
// NOTE: try_emplace is needed to construct the value object in-place without calling the destructor
facet_field_map.try_emplace(field);
}
}
void facet_index_t::insert(const std::string& field_name,std::unordered_map<facet_value_id_t,
std::vector<uint32_t>, facet_value_id_t::Hash>& fvalue_to_seq_ids,
std::unordered_map<uint32_t, std::vector<facet_value_id_t>>& seq_id_to_fvalues,
bool is_string_field) {
const auto facet_field_map_it = facet_field_map.find(field_name);
if(facet_field_map_it == facet_field_map.end()) {
return; // field is not initialized or dropped
}
auto& facet_index = facet_field_map_it->second;
auto& fvalue_index = facet_index.fvalue_seq_ids;
auto fhash_index = facet_index.seq_id_hashes;
for(const auto& seq_id_fvalues: seq_id_to_fvalues) {
auto seq_id = seq_id_fvalues.first;
std::vector<uint32_t> real_facet_ids;
real_facet_ids.reserve(seq_id_fvalues.second.size());
for(const auto& fvalue: seq_id_fvalues.second) {
uint32_t facet_id = fvalue.facet_id;
const auto& fvalue_index_it = fvalue_index.find(fvalue.facet_value);
if(fvalue.facet_id == UINT32_MAX) {
// float, int32 & bool will provide facet_id as their own numerical values
facet_id = (fvalue_index_it == fvalue_index.end()) ? ++next_facet_id : fvalue_index_it->second.facet_id;
if(!is_string_field) {
int64_t val = std::stoll(fvalue.facet_value);
facet_index.fhash_to_int64_map[facet_id] = val;
}
}
real_facet_ids.push_back(facet_id);
auto seq_ids_it = fvalue_to_seq_ids.find(fvalue);
if(seq_ids_it == fvalue_to_seq_ids.end()) {
continue;
}
auto& seq_ids = seq_ids_it->second;
std::list<facet_count_t>& count_list = facet_index.counts;
if(fvalue_index_it == fvalue_index.end()) {
facet_id_seq_ids_t fis;
fis.facet_id = facet_id;
if(facet_index.has_value_index) {
count_list.emplace_back(fvalue.facet_value, seq_ids.size(), facet_id);
fis.facet_count_it = std::prev(count_list.end());
fis.seq_ids = ids_t::create(seq_ids);
}
fvalue_index.emplace(fvalue.facet_value, fis);
} else if(facet_index.has_value_index) {
for(const auto id : seq_ids) {
ids_t::upsert(fvalue_index_it->second.seq_ids, id);
}
auto facet_count_it = fvalue_index_it->second.facet_count_it;
if(facet_count_it->facet_id == facet_id) {
facet_count_it->count = ids_t::num_ids(fvalue_index_it->second.seq_ids);
auto curr = facet_count_it;
while (curr != count_list.begin() && std::prev(curr)->count < curr->count) {
count_list.splice(curr, count_list, std::prev(curr)); // swaps list nodes
curr--;
}
} else {
LOG(ERROR) << "Wrong reference stored for facet " << fvalue.facet_value << " with facet_id " << facet_id;
}
}
fvalue_to_seq_ids.erase(fvalue);
}
if(facet_index.has_hash_index && fhash_index != nullptr) {
fhash_index->upsert(seq_id, real_facet_ids);
}
}
}
bool facet_index_t::contains(const std::string& field_name) {
const auto& facet_field_it = facet_field_map.find(field_name);
if(facet_field_it == facet_field_map.end()) {
return false;
}
return true;
}
void facet_index_t::erase(const std::string& field_name) {
facet_field_map.erase(field_name);
}
void facet_index_t::remove(const std::string& field_name, const uint32_t seq_id) {
const auto facet_field_it = facet_field_map.find(field_name);
if(facet_field_it != facet_field_map.end()) {
auto& facet_index_map = facet_field_it->second.fvalue_seq_ids;
std::vector<std::string> dead_fvalues;
for(auto facet_ids_seq_ids = facet_index_map.begin(); facet_ids_seq_ids != facet_index_map.end(); facet_ids_seq_ids++) {
void*& ids = facet_ids_seq_ids->second.seq_ids;
if(ids && ids_t::contains(ids, seq_id)) {
ids_t::erase(ids, seq_id);
auto& count_list = facet_field_it->second.counts;
facet_ids_seq_ids->second.facet_count_it->count--;
if(ids_t::num_ids(ids) == 0) {
ids_t::destroy_list(ids);
std::string dead_fvalue;
dead_fvalues.push_back(dead_fvalue);
//remove from int64 lookup map first
auto& fhash_int64_map = facet_field_it->second.fhash_to_int64_map;
uint32_t fhash = facet_ids_seq_ids->second.facet_id;
fhash_int64_map.erase(fhash);
count_list.erase(facet_ids_seq_ids->second.facet_count_it);
auto node = facet_index_map.extract(facet_ids_seq_ids->first);
node.key() = dead_fvalue;
facet_index_map.insert(std::move(node));
}
}
}
for(auto& dead_fvalue: dead_fvalues) {
facet_index_map.erase(dead_fvalue);
}
auto& seq_id_hashes = facet_field_it->second.seq_id_hashes;
seq_id_hashes->erase(seq_id);
}
}
size_t facet_index_t::get_facet_count(const std::string& field_name) {
const auto it = facet_field_map.find(field_name);
if(it == facet_field_map.end()) {
return 0;
}
return has_hash_index(field_name) ? it->second.seq_id_hashes->num_ids() : it->second.counts.size();
}
//returns the count of matching seq_ids from result array
size_t facet_index_t::intersect(facet& a_facet,
bool has_facet_query, const std::vector<std::vector<std::string>>& fvalue_searched_tokens,
const uint32_t* result_ids, size_t result_ids_len,
size_t max_facet_count, std::map<std::string, docid_count_t>& found,
bool is_wildcard_no_filter_query, const std::string& sort_order) {
//LOG (INFO) << "intersecting field " << field;
const auto& facet_field_it = facet_field_map.find(a_facet.field_name);
if(facet_field_it == facet_field_map.end()) {
return 0;
}
const auto& facet_index_map = facet_field_it->second.fvalue_seq_ids;
const auto& counter_list = facet_field_it->second.counts;
//LOG(INFO) << "fvalue_seq_ids size " << facet_index_map.size() << " , counts size " << counter_list.size();
// We look 2 * max_facet_count when keyword search / filtering is involved to ensure that we
// try and pick the actual top facets by count.
size_t max_facets = is_wildcard_no_filter_query ? std::min((size_t)max_facet_count, counter_list.size()) :
std::min((size_t)2 * max_facet_count, counter_list.size());
auto intersect_fn = [&] (std::list<facet_count_t>::const_iterator facet_count_it) {
uint32_t count = 0;
uint32_t doc_id = 0;
if(has_facet_query) {
bool found_search_token = false;
auto facet_str = facet_count_it->facet_value;
transform(facet_str.begin(), facet_str.end(), facet_str.begin(), ::tolower);
for(const auto& searched_tokens : fvalue_searched_tokens) {
bool found_all_tokens = true;
for (const auto &searched_token: searched_tokens) {
if (facet_str.find(searched_token) == std::string::npos) {
found_all_tokens = false;
break;
}
}
if (found_all_tokens) {
a_facet.fvalue_tokens[facet_count_it->facet_value] = searched_tokens;
found_search_token = true;
break;
}
}
if(!found_search_token) {
return;
}
}
auto ids = facet_index_map.at(facet_count_it->facet_value).seq_ids;
if (!ids) {
return;
}
if (is_wildcard_no_filter_query) {
count = facet_count_it->count;
} else {
count = ids_t::intersect_count(ids, result_ids, result_ids_len);
}
if (count) {
doc_id = ids_t::first_id(ids);
found[facet_count_it->facet_value] = {doc_id, count};
}
};
if(sort_order.empty()) {
for (auto facet_count_it = counter_list.begin(); facet_count_it != counter_list.end();
++facet_count_it) {
//LOG(INFO) << "checking ids in facet_value " << facet_count.facet_value << " having total count "
// << facet_count.count << ", is_wildcard_no_filter_query: " << is_wildcard_no_filter_query;
intersect_fn(facet_count_it);
if (found.size() == max_facets) {
break;
}
}
} else {
if(sort_order == "asc") {
for(auto facet_index_map_it = facet_index_map.begin();
facet_index_map_it != facet_index_map.end(); ++facet_index_map_it) {
intersect_fn(facet_index_map_it->second.facet_count_it);
if (found.size() == max_facets) {
break;
}
}
} else if(sort_order == "desc") {
for(auto facet_index_map_it = facet_index_map.rbegin();
facet_index_map_it != facet_index_map.rend(); ++facet_index_map_it) {
intersect_fn(facet_index_map_it->second.facet_count_it);
if (found.size() == max_facets) {
break;
}
}
}
}
return found.size();
}
facet_index_t::~facet_index_t() {
facet_field_map.clear();
}
// used for migrating string and int64 facets
size_t facet_index_t::get_facet_indexes(const std::string& field_name,
std::map<uint32_t, std::vector<uint32_t>>& seqid_countIndexes) {
const auto& facet_field_it = facet_field_map.find(field_name);
if(facet_field_it == facet_field_map.end()) {
return 0;
}
auto& facet_index_map = facet_field_it->second.fvalue_seq_ids;
std::vector<uint32_t> id_list;
for(auto facet_index_map_it = facet_index_map.begin(); facet_index_map_it != facet_index_map.end(); ++facet_index_map_it) {
//auto ids = facet_index_map_it->seq_ids;
auto ids = facet_index_map_it->second.seq_ids;
ids_t::uncompress(ids, id_list);
// emplacing seq_id => next_facet_id
for(const auto& id : id_list) {
//seqid_countIndexes[id].emplace_back(facet_index_map_it->facet_id);
seqid_countIndexes[id].emplace_back(facet_index_map_it->second.facet_id);
}
id_list.clear();
}
return seqid_countIndexes.size();
}
void facet_index_t::handle_index_change(const std::string& field_name, size_t total_num_docs,
size_t facet_index_threshold, size_t facet_count) {
// Low cardinality fields will have only value based facet index. Once a field becomes a high cardinality
// field (exceeding FACET_INDEX_THRESHOLD), we will create a hash based index and populate it.
// If a field is an id-like field (cardinality_ratio < 5) we will then remove value based index.
auto& facet_index = facet_field_map.at(field_name);
posting_list_t*& fhash_index = facet_index.seq_id_hashes;
if(fhash_index == nullptr && (facet_count > facet_index_threshold) && total_num_docs < 1000000) {
fhash_index = new posting_list_t(256);
std::map<uint32_t, std::vector<uint32_t>> seq_id_index_map;
if(get_facet_indexes(field_name, seq_id_index_map)) {
for(const auto& kv : seq_id_index_map) {
fhash_index->upsert(kv.first, kv.second);
}
}
seq_id_index_map.clear();
facet_index.has_hash_index = true;
auto cardinality_ratio = total_num_docs / facet_count;
if(cardinality_ratio != 0 && cardinality_ratio < 5) {
// drop the value index for this field
auto& fvalue_seq_ids = facet_index.fvalue_seq_ids;
for(auto it = fvalue_seq_ids.begin(); it != fvalue_seq_ids.end(); ++it) {
ids_t::destroy_list(it->second.seq_ids);
}
fvalue_seq_ids.clear();
facet_index.counts.clear();
facet_index.has_value_index = false;
}
}
}
bool facet_index_t::has_hash_index(const std::string &field_name) {
auto facet_index_it = facet_field_map.find(field_name);
return facet_index_it != facet_field_map.end() && facet_index_it->second.has_hash_index;
}
bool facet_index_t::has_value_index(const std::string &field_name) {
auto facet_index_it = facet_field_map.find(field_name);
return facet_index_it != facet_field_map.end() && facet_index_it->second.has_value_index;
}
posting_list_t* facet_index_t::get_facet_hash_index(const std::string &field_name) {
auto facet_index_it = facet_field_map.find(field_name);
if(facet_index_it != facet_field_map.end()) {
return facet_index_it->second.seq_id_hashes;
}
return nullptr;
}
const spp::sparse_hash_map<uint32_t , int64_t >& facet_index_t::get_fhash_int64_map(const std::string& field_name) {
const auto facet_field_map_it = facet_field_map.find(field_name);
if(facet_field_map_it == facet_field_map.end()) {
return spp::sparse_hash_map<uint32_t , int64_t >{}; // field is not initialized or dropped
}
const auto& facet_index = facet_field_map_it->second;
return facet_index.fhash_to_int64_map;
}