Optimize facet index for sparse/high-cardinality fields.

This commit is contained in:
Kishore Nallan 2023-12-23 20:12:53 +05:30
parent 962e6b758c
commit ac1e85599f
3 changed files with 47 additions and 13 deletions

View File

@ -154,6 +154,8 @@ public:
void handle_index_change(const std::string& field_name, size_t total_num_docs,
size_t facet_index_threshold, size_t facet_count);
// Drops the value facet index of `field_name` when the field looks high-cardinality or sparse.
// No-op when the field is unknown or its value index has already been dropped.
// The index is dropped when the number of distinct facet values exceeds 80% of `total_num_docs`,
// or when more than 100 facet fields exist, `total_num_docs` is above 10,000 and the field's
// `seq_id_hashes` reports between 1 and 99 ids. Dropping releases every per-value id list,
// clears the count structures and unsets the field's `has_value_index` flag.
void check_for_high_cardinality(const std::string& field_name, size_t total_num_docs);
bool has_hash_index(const std::string& field_name);
bool has_value_index(const std::string& field_name);
@ -173,4 +175,5 @@ public:
size_t facet_val_num_ids(const std::string& field_name, const std::string& fvalue);
size_t facet_node_count(const std::string& field_name, const std::string& fvalue);
};
};

View File

@ -11,8 +11,8 @@ void facet_index_t::initialize(const std::string& field) {
}
}
void facet_index_t::insert(const std::string& field_name,std::unordered_map<facet_value_id_t,
std::vector<uint32_t>, facet_value_id_t::Hash>& fvalue_to_seq_ids,
void facet_index_t::insert(const std::string& field_name,
std::unordered_map<facet_value_id_t, std::vector<uint32_t>, facet_value_id_t::Hash>& fvalue_to_seq_ids,
std::unordered_map<uint32_t, std::vector<facet_value_id_t>>& seq_id_to_fvalues,
bool is_string_field) {
@ -571,3 +571,40 @@ size_t facet_index_t::facet_node_count(const string &field_name, const string &f
return facet_field_map_it->second.fvalue_seq_ids[fvalue].facet_count_it->count;
}
/**
 * Drops the value facet index of a field that is either high-cardinality or sparse.
 *
 * Nothing happens when `field_name` is not present in `facet_field_map`, or when the
 * field no longer has a value index.
 *
 * @param field_name      name of the facet field to inspect.
 * @param total_num_docs  document count used to derive the cardinality threshold
 *                        (80% of this value) and to gate the sparse-field check.
 */
void facet_index_t::check_for_high_cardinality(const string& field_name, size_t total_num_docs) {
    const auto facet_field_map_it = facet_field_map.find(field_name);

    // Unknown field, or value index already dropped: there is nothing left to remove.
    if(facet_field_map_it == facet_field_map.end() || !facet_field_map_it->second.has_value_index) {
        return ;
    }

    auto& facet_field = facet_field_map_it->second;

    // A field counts as high-cardinality once its distinct values exceed 80% of all docs.
    const size_t max_unique_values = static_cast<size_t>(0.8 * total_num_docs);
    const auto unique_value_count = facet_field.fvalue_seq_ids.size();

    // Sparseness is only evaluated when there are many facet fields and many docs.
    // NOTE(review): num_ids() is treated here as the number of docs carrying this facet.
    bool sparse = false;
    if(facet_field_map.size() > 100 && total_num_docs > 10*1000) {
        const size_t docs_with_facet = facet_field.seq_id_hashes->num_ids();
        sparse = (docs_with_facet > 0 && docs_with_facet < 100);
    }

    const bool high_cardinality = unique_value_count > max_unique_values;
    if(!high_cardinality && !sparse) {
        return ;
    }

    // Release every per-value id list before discarding the entries that own them.
    for(auto& fvalue_entry : facet_field.fvalue_seq_ids) {
        ids_t::destroy_list(fvalue_entry.second.seq_ids);
    }

    facet_field.fvalue_seq_ids.clear();
    facet_field.counts.clear();
    facet_field.count_map.clear();
    facet_field.has_value_index = false;
}

View File

@ -700,16 +700,10 @@ void Index::index_field_in_memory(const field& afield, std::vector<index_record>
std::unordered_map<facet_value_id_t, std::vector<uint32_t>, facet_value_id_t::Hash> fvalue_to_seq_ids;
std::unordered_map<uint32_t, std::vector<facet_value_id_t>> seq_id_to_fvalues;
auto facet_count = facet_index_v4->get_facet_count(afield.name);
#ifdef TEST_BUILD
facet_count = FACET_INDEX_THRESHOLD + 1;
#endif
/*if(afield.facet) {
size_t total_num_docs = seq_ids->num_ids();
facet_index_v4->handle_index_change(afield.name, total_num_docs, FACET_INDEX_THRESHOLD, facet_count);
}*/
size_t total_num_docs = seq_ids->num_ids();
if(afield.facet && total_num_docs > 10*1000 && search_schema.size() > 100) {
facet_index_v4->check_for_high_cardinality(afield.name, total_num_docs);
}
for(const auto& record: iter_batch) {
if(!record.indexed.ok()) {