mirror of
https://github.com/typesense/typesense.git
synced 2025-05-18 04:32:38 +08:00
Optimize facet index for sparse/high-cardinality fields.
This commit is contained in:
parent
962e6b758c
commit
ac1e85599f
@ -154,6 +154,8 @@ public:
|
||||
void handle_index_change(const std::string& field_name, size_t total_num_docs,
|
||||
size_t facet_index_threshold, size_t facet_count);
|
||||
|
||||
// Drops the value facet index of `field_name` when the field has more unique
// facet values than 80% of `total_num_docs`, or when it is considered sparse
// (see the definition for the exact thresholds). Does nothing if the field is
// not present in the facet field map or has no value index.
void check_for_high_cardinality(const std::string& field_name, size_t total_num_docs);
|
||||
|
||||
bool has_hash_index(const std::string& field_name);
|
||||
|
||||
bool has_value_index(const std::string& field_name);
|
||||
@ -173,4 +175,5 @@ public:
|
||||
size_t facet_val_num_ids(const std::string& field_name, const std::string& fvalue);
|
||||
|
||||
size_t facet_node_count(const std::string& field_name, const std::string& fvalue);
|
||||
};
|
||||
|
||||
};
|
||||
|
@ -11,8 +11,8 @@ void facet_index_t::initialize(const std::string& field) {
|
||||
}
|
||||
}
|
||||
|
||||
void facet_index_t::insert(const std::string& field_name,std::unordered_map<facet_value_id_t,
|
||||
std::vector<uint32_t>, facet_value_id_t::Hash>& fvalue_to_seq_ids,
|
||||
void facet_index_t::insert(const std::string& field_name,
|
||||
std::unordered_map<facet_value_id_t, std::vector<uint32_t>, facet_value_id_t::Hash>& fvalue_to_seq_ids,
|
||||
std::unordered_map<uint32_t, std::vector<facet_value_id_t>>& seq_id_to_fvalues,
|
||||
bool is_string_field) {
|
||||
|
||||
@ -571,3 +571,40 @@ size_t facet_index_t::facet_node_count(const string &field_name, const string &f
|
||||
return facet_field_map_it->second.fvalue_seq_ids[fvalue].facet_count_it->count;
|
||||
}
|
||||
|
||||
void facet_index_t::check_for_high_cardinality(const string& field_name, size_t total_num_docs) {
|
||||
// high cardinality or sparse facet fields must be dropped from value facet index
|
||||
const auto facet_field_map_it = facet_field_map.find(field_name);
|
||||
if(facet_field_map_it == facet_field_map.end()) {
|
||||
return ;
|
||||
}
|
||||
|
||||
if(!facet_field_map_it->second.has_value_index) {
|
||||
return ;
|
||||
}
|
||||
|
||||
size_t value_facet_threshold = 0.8 * total_num_docs;
|
||||
|
||||
auto num_facet_values = facet_field_map_it->second.fvalue_seq_ids.size();
|
||||
bool is_sparse_field = false;
|
||||
|
||||
if(facet_field_map.size() > 100 && total_num_docs > 10*1000) {
|
||||
size_t num_docs_with_facet = facet_field_map_it->second.seq_id_hashes->num_ids();
|
||||
if(num_docs_with_facet > 0 && num_docs_with_facet < 100) {
|
||||
is_sparse_field = true;
|
||||
}
|
||||
}
|
||||
|
||||
if(num_facet_values > value_facet_threshold || is_sparse_field) {
|
||||
// if there are too many unique values, we will drop the value index
|
||||
auto& fvalue_seq_ids = facet_field_map_it->second.fvalue_seq_ids;
|
||||
for(auto it = fvalue_seq_ids.begin(); it != fvalue_seq_ids.end(); ++it) {
|
||||
ids_t::destroy_list(it->second.seq_ids);
|
||||
}
|
||||
fvalue_seq_ids.clear();
|
||||
facet_field_map_it->second.counts.clear();
|
||||
facet_field_map_it->second.count_map.clear();
|
||||
facet_field_map_it->second.has_value_index = false;
|
||||
//LOG(INFO) << "Dropped value index for field " << field_name;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -700,16 +700,10 @@ void Index::index_field_in_memory(const field& afield, std::vector<index_record>
|
||||
std::unordered_map<facet_value_id_t, std::vector<uint32_t>, facet_value_id_t::Hash> fvalue_to_seq_ids;
|
||||
std::unordered_map<uint32_t, std::vector<facet_value_id_t>> seq_id_to_fvalues;
|
||||
|
||||
auto facet_count = facet_index_v4->get_facet_count(afield.name);
|
||||
|
||||
#ifdef TEST_BUILD
|
||||
facet_count = FACET_INDEX_THRESHOLD + 1;
|
||||
#endif
|
||||
|
||||
/*if(afield.facet) {
|
||||
size_t total_num_docs = seq_ids->num_ids();
|
||||
facet_index_v4->handle_index_change(afield.name, total_num_docs, FACET_INDEX_THRESHOLD, facet_count);
|
||||
}*/
|
||||
size_t total_num_docs = seq_ids->num_ids();
|
||||
if(afield.facet && total_num_docs > 10*1000 && search_schema.size() > 100) {
|
||||
facet_index_v4->check_for_high_cardinality(afield.name, total_num_docs);
|
||||
}
|
||||
|
||||
for(const auto& record: iter_batch) {
|
||||
if(!record.indexed.ok()) {
|
||||
|
Loading…
x
Reference in New Issue
Block a user