diff --git a/include/index.h b/include/index.h index fa9eeef1..742ee1b2 100644 --- a/include/index.h +++ b/include/index.h @@ -620,9 +620,6 @@ public: static float int64_t_to_float(int64_t n); - uint64_t get_distinct_id(const std::vector& group_by_fields, - const uint32_t seq_id, const bool group_missing_values) const; - void get_distinct_id(const std::string& field_name, posting_list_t::iterator_t& facet_index_it, const uint32_t seq_id, const bool group_missing_values, uint64_t& distinct_id) const; diff --git a/src/index.cpp b/src/index.cpp index 333dac17..70c08750 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -1274,6 +1274,16 @@ void Index::do_facets(std::vector & facets, facet_query_t & facet_query, size_t total_docs = seq_ids->num_ids(); + std::vector> group_by_it_field_vec; + for(const auto& field_name: group_by_fields) { + if (!facet_index_v4->has_hash_index(field_name)) { + continue; + } + auto facet_index = facet_index_v4->get_facet_hash_index(field_name); + auto facet_index_it = facet_index->new_iterator(); + group_by_it_field_vec.emplace_back(std::make_pair(std::move(facet_index_it), field_name)); + } + // assumed that facet fields have already been validated upstream for(size_t findex=0; findex < facets.size(); findex++) { auto& a_facet = facets[findex]; @@ -1371,7 +1381,13 @@ void Index::do_facets(std::vector & facets, facet_query_t & facet_query, facet_hashes.clear(); posting_list_t::get_offsets(facet_index_it, facet_hashes); - const uint64_t distinct_id = group_limit ? get_distinct_id(group_by_fields, doc_seq_id, group_missing_values) : 0; + uint64_t distinct_id = 0; + if(group_limit) { + distinct_id = 1; + for(auto& kv : group_by_it_field_vec) { + get_distinct_id(kv.second, kv.first, doc_seq_id, group_missing_values, distinct_id); + } + } //LOG(INFO) << "facet_hash_count " << facet_hash_count; if(((i + 1) % 16384) == 0) { RETURN_CIRCUIT_BREAKER @@ -2370,7 +2386,15 @@ Option Index::search(std::vector& field_query_tokens, cons uint32_t seq_id = it.id(); uint64_t distinct_id = seq_id; if (group_limit != 0) { - distinct_id = get_distinct_id(group_by_fields, seq_id, group_missing_values); + distinct_id = 1; + for(const auto& field_name: group_by_fields) { + if (!facet_index_v4->has_hash_index(field_name)) { + continue; + } + auto facet_index = facet_index_v4->get_facet_hash_index(field_name); + auto facet_index_it = facet_index->new_iterator(); + get_distinct_id(field_name, facet_index_it, seq_id, group_missing_values, distinct_id); + } if(excluded_group_ids.count(distinct_id) != 0) { continue; } @@ -2490,7 +2514,15 @@ Option Index::search(std::vector& field_query_tokens, cons uint64_t distinct_id = seq_id; if (group_limit != 0) { - distinct_id = get_distinct_id(group_by_fields, seq_id, group_missing_values); + distinct_id = 1; + for(const auto& field_name: group_by_fields) { + if (!facet_index_v4->has_hash_index(field_name)) { + continue; + } + auto facet_index = facet_index_v4->get_facet_hash_index(field_name); + auto facet_index_it = facet_index->new_iterator(); + get_distinct_id(field_name, facet_index_it, seq_id, group_missing_values, distinct_id); + } if(excluded_group_ids.count(distinct_id) != 0) { continue; } @@ -2918,7 +2950,15 @@ Option Index::search(std::vector& field_query_tokens, cons uint64_t distinct_id = seq_id; if (group_limit != 0) { - distinct_id = get_distinct_id(group_by_fields, seq_id, group_missing_values); + distinct_id = 1; + for(const auto& field_name: group_by_fields) { + if (!facet_index_v4->has_hash_index(field_name)) { + continue; + } + auto facet_index = facet_index_v4->get_facet_hash_index(field_name); + auto facet_index_it = facet_index->new_iterator(); + get_distinct_id(field_name, facet_index_it, seq_id, group_missing_values, distinct_id); + } if(excluded_group_ids.count(distinct_id) != 0) { continue; } @@ -3177,7 +3217,15 @@ void Index::process_curated_ids(const std::vector> if(group_limit != 0) { // if one `id` of a group is present in curated hits, we have to exclude that entire group from results for(auto seq_id: included_ids_vec) { - uint64_t distinct_id = get_distinct_id(group_by_fields, seq_id, group_missing_values); + uint64_t distinct_id = 1; + for(const auto& field_name: group_by_fields) { + if (!facet_index_v4->has_hash_index(field_name)) { + continue; + } + auto facet_index = facet_index_v4->get_facet_hash_index(field_name); + auto facet_index_it = facet_index->new_iterator(); + get_distinct_id(field_name, facet_index_it, seq_id, group_missing_values, distinct_id); + } excluded_group_ids.emplace(distinct_id); } } @@ -3895,7 +3943,15 @@ Option Index::search_across_fields(const std::vector& query_token uint64_t distinct_id = seq_id; if(group_limit != 0) { - distinct_id = get_distinct_id(group_by_fields, seq_id, group_missing_values); + distinct_id = 1; + for(const auto& field_name: group_by_fields) { + if (!facet_index_v4->has_hash_index(field_name)) { + continue; + } + auto facet_index = facet_index_v4->get_facet_hash_index(field_name); + auto facet_index_it = facet_index->new_iterator(); + get_distinct_id(field_name, facet_index_it, seq_id, group_missing_values, distinct_id); + } if(excluded_group_ids.count(distinct_id) != 0) { return; } @@ -4674,7 +4730,15 @@ Option Index::do_phrase_search(const size_t num_search_fields, const std:: uint64_t distinct_id = seq_id; if(group_limit != 0) { - distinct_id = get_distinct_id(group_by_fields, seq_id, group_missing_values); + distinct_id = 1; + for(const auto& field_name: group_by_fields) { + if (!facet_index_v4->has_hash_index(field_name)) { + continue; + } + auto facet_index = facet_index_v4->get_facet_hash_index(field_name); + auto facet_index_it = facet_index->new_iterator(); + get_distinct_id(field_name, facet_index_it, seq_id, group_missing_values, distinct_id); + } if(excluded_group_ids.count(distinct_id) != 0) { continue; } @@ -4836,7 +4900,15 @@ Option Index::do_infix_search(const size_t num_search_fields, const std::v uint64_t distinct_id = seq_id; if(group_limit != 0) { - distinct_id = get_distinct_id(group_by_fields, seq_id, group_missing_values); + distinct_id = 1; + for(const auto& field_name: group_by_fields) { + if (!facet_index_v4->has_hash_index(field_name)) { + continue; + } + auto facet_index = facet_index_v4->get_facet_hash_index(field_name); + auto facet_index_it = facet_index->new_iterator(); + get_distinct_id(field_name, facet_index_it, seq_id, group_missing_values, distinct_id); + } if(excluded_group_ids.count(distinct_id) != 0) { continue; } @@ -5197,6 +5269,15 @@ Option Index::search_wildcard(filter_node_t const* const& filter_tree_root search_cutoff = parent_search_cutoff; size_t filter_index = 0; + std::vector> group_by_it_field_vec; + for(const auto& field_name: group_by_fields) { + if (!facet_index_v4->has_hash_index(field_name)) { + continue; + } + auto facet_index = facet_index_v4->get_facet_hash_index(field_name); + auto facet_index_it = facet_index->new_iterator(); + group_by_it_field_vec.emplace_back(std::make_pair(std::move(facet_index_it), field_name)); + } for(size_t i = 0; i < batch_result->count; i++) { const uint32_t seq_id = batch_result->docs[i]; @@ -5224,14 +5305,8 @@ Option Index::search_wildcard(filter_node_t const* const& filter_tree_root uint64_t distinct_id = seq_id; if(group_limit != 0) { distinct_id = 1; - for(const auto& field_name: group_by_fields) { - if (!facet_index_v4->has_hash_index(field_name)) { - continue; - } - auto facet_index = facet_index_v4->get_facet_hash_index(field_name); - auto facet_index_it = facet_index->new_iterator(); - - get_distinct_id(field_name, facet_index_it, seq_id, group_missing_values, distinct_id); + for(auto& kv: group_by_it_field_vec) { + get_distinct_id(kv.second, kv.first, seq_id, group_missing_values, distinct_id); } if(excluded_group_ids.count(distinct_id) != 0) { @@ -5879,7 +5954,15 @@ void Index::score_results(const std::vector & sort_fields, const uint16 uint64_t distinct_id = seq_id; if(group_limit != 0) { - distinct_id = get_distinct_id(group_by_fields, seq_id, group_missing_values); + distinct_id = 1; + for(const auto& field_name: group_by_fields) { + if (!facet_index_v4->has_hash_index(field_name)) { + continue; + } + auto facet_index = facet_index_v4->get_facet_hash_index(field_name); + auto facet_index_it = facet_index->new_iterator(); + get_distinct_id(field_name, facet_index_it, seq_id, group_missing_values, distinct_id); + } } //LOG(INFO) << "Seq id: " << seq_id << ", match_score: " << match_score; @@ -5893,45 +5976,8 @@ void Index::score_results(const std::vector & sort_fields, const uint16 //LOG(INFO) << "Time taken for results iteration: " << timeNanos << "ms"; } -// pre-filter group_by_fields such that we can avoid the find() check -uint64_t Index::get_distinct_id(const std::vector& group_by_fields, - const uint32_t seq_id, const bool group_missing_values) const { - uint32_t distinct_id = 1; // some constant initial value - - // calculate hash from group_by_fields - for(const auto& field_name: group_by_fields) { - if (!facet_index_v4->has_hash_index(field_name)) { - continue; - } - - std::vector facet_hashes; - auto facet_index = facet_index_v4->get_facet_hash_index(field_name); - auto facet_index_it = facet_index->new_iterator(); - facet_index_it.skip_to(seq_id); - - if (facet_index_it.valid() && facet_index_it.id() == seq_id) { - posting_list_t::get_offsets(facet_index_it, facet_hashes); - - if (search_schema.at(field_name).is_array()) { - //LOG(INFO) << "combining hashes for facet array "; - for (size_t i = 0; i < facet_hashes.size(); i++) { - distinct_id = StringUtils::hash_combine(distinct_id, facet_hashes[i]); - } - } else { - const auto &facet_hash = facet_hashes[0]; - //LOG(INFO) << "combining hashes for facet "; - distinct_id = StringUtils::hash_combine(distinct_id, facet_hash); - } - } - } - - //LOG(INFO) << "seq_id: " << seq_id << ", distinct_id: " << distinct_id; - return (distinct_id == 1 && !group_missing_values) ? seq_id : distinct_id; -} - void Index::get_distinct_id(const std::string& field_name, posting_list_t::iterator_t& facet_index_it, const uint32_t seq_id, const bool group_missing_values, uint64_t& distinct_id) const { - // calculate hash from group_by_fields std::vector facet_hashes; facet_index_it.skip_to(seq_id);