diff --git a/include/facet_index.h b/include/facet_index.h index 910ead73..08a3532f 100644 --- a/include/facet_index.h +++ b/include/facet_index.h @@ -154,6 +154,8 @@ public: void handle_index_change(const std::string& field_name, size_t total_num_docs, size_t facet_index_threshold, size_t facet_count); + void check_for_high_cardinality(const std::string& field_name, size_t total_num_docs); + bool has_hash_index(const std::string& field_name); bool has_value_index(const std::string& field_name); @@ -173,4 +175,5 @@ public: size_t facet_val_num_ids(const std::string& field_name, const std::string& fvalue); size_t facet_node_count(const std::string& field_name, const std::string& fvalue); -}; \ No newline at end of file + +}; diff --git a/src/facet_index.cpp b/src/facet_index.cpp index 87a7312e..9504cb5d 100644 --- a/src/facet_index.cpp +++ b/src/facet_index.cpp @@ -11,8 +11,8 @@ void facet_index_t::initialize(const std::string& field) { } } -void facet_index_t::insert(const std::string& field_name,std::unordered_map, facet_value_id_t::Hash>& fvalue_to_seq_ids, +void facet_index_t::insert(const std::string& field_name, + std::unordered_map, facet_value_id_t::Hash>& fvalue_to_seq_ids, std::unordered_map>& seq_id_to_fvalues, bool is_string_field) { @@ -571,3 +571,40 @@ size_t facet_index_t::facet_node_count(const string &field_name, const string &f return facet_field_map_it->second.fvalue_seq_ids[fvalue].facet_count_it->count; } +void facet_index_t::check_for_high_cardinality(const string& field_name, size_t total_num_docs) { + // high cardinality or sparse facet fields must be dropped from value facet index + const auto facet_field_map_it = facet_field_map.find(field_name); + if(facet_field_map_it == facet_field_map.end()) { + return ; + } + + if(!facet_field_map_it->second.has_value_index) { + return ; + } + + size_t value_facet_threshold = 0.8 * total_num_docs; + + auto num_facet_values = facet_field_map_it->second.fvalue_seq_ids.size(); + bool is_sparse_field = false; + + if(facet_field_map.size() > 100 && total_num_docs > 10*1000) { + size_t num_docs_with_facet = facet_field_map_it->second.seq_id_hashes->num_ids(); + if(num_docs_with_facet > 0 && num_docs_with_facet < 100) { + is_sparse_field = true; + } + } + + if(num_facet_values > value_facet_threshold || is_sparse_field) { + // if there are too many unique values, we will drop the value index + auto& fvalue_seq_ids = facet_field_map_it->second.fvalue_seq_ids; + for(auto it = fvalue_seq_ids.begin(); it != fvalue_seq_ids.end(); ++it) { + ids_t::destroy_list(it->second.seq_ids); + } + fvalue_seq_ids.clear(); + facet_field_map_it->second.counts.clear(); + facet_field_map_it->second.count_map.clear(); + facet_field_map_it->second.has_value_index = false; + //LOG(INFO) << "Dropped value index for field " << field_name; + } +} + diff --git a/src/index.cpp b/src/index.cpp index 0287f443..f61a365c 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -700,16 +700,10 @@ void Index::index_field_in_memory(const field& afield, std::vector std::unordered_map, facet_value_id_t::Hash> fvalue_to_seq_ids; std::unordered_map> seq_id_to_fvalues; - auto facet_count = facet_index_v4->get_facet_count(afield.name); - -#ifdef TEST_BUILD - facet_count = FACET_INDEX_THRESHOLD + 1; -#endif - - /*if(afield.facet) { - size_t total_num_docs = seq_ids->num_ids(); - facet_index_v4->handle_index_change(afield.name, total_num_docs, FACET_INDEX_THRESHOLD, facet_count); - }*/ + size_t total_num_docs = seq_ids->num_ids(); + if(afield.facet && total_num_docs > 10*1000 && search_schema.size() > 100) { + facet_index_v4->check_for_high_cardinality(afield.name, total_num_docs); + } for(const auto& record: iter_batch) { if(!record.indexed.ok()) {