mirror of https://github.com/typesense/typesense.git (synced 2025-05-16 03:12:32 +08:00)

Revert "Do grouping in two pass (#1677)"

This reverts commit dccf6eb1864870cffc5ca71e3307e59b6ee5d9b2.

# Conflicts:
#	src/index.cpp

parent 89bcd383c3
commit b9d70433b4
include/hll/dense_array.h
@@ -1,123 +0,0 @@
-// Source : https://github.com/iwiwi/hyperloglog-hip
-#ifndef HYPERLOGLOG_HIP_DENSE_ARRAY_H_
-#define HYPERLOGLOG_HIP_DENSE_ARRAY_H_
-
-#include <algorithm>
-#include <climits>
-#include <cstdint>
-#include <memory>
-#include <type_traits>
-
-namespace hyperloglog_hip {
-template<size_t NumRegisterBits, typename Value = uint8_t>
-class dense_array {
- public:
-  typedef Value value_type;
-
-  dense_array(size_t num_registers)
-    : data_(new value_type[data_length(num_registers)]()) {}
-
-  value_type get(size_t pos) {
-    const size_t b = pos * num_register_bits();
-    const size_t i1 = b / num_value_bits();
-    const size_t o1 = b - i1 * num_value_bits();
-    const size_t n1 = num_value_bits() - o1;
-    value_type v = data_[i1] >> o1;
-
-    if (n1 > num_register_bits()) {
-      v &= (value_type(1) << num_register_bits()) - 1;
-    }
-    else if (n1 < num_register_bits()) {
-      const size_t i2 = i1 + 1;
-      const size_t n2 = num_register_bits() - n1;
-      v |= (data_[i2] & ((value_type(1) << n2) - 1)) << n1;
-    }
-    return v;
-  }
-
-  void set(size_t pos, value_type val) {
-    const size_t b = pos * num_register_bits();
-
-    const size_t i1 = b / num_value_bits();
-    const size_t o1 = b - i1 * num_value_bits();
-    const size_t n1 = std::min(num_value_bits() - o1, num_register_bits());
-    data_[i1] &= value_type(-1) ^ (((value_type(1) << n1) - 1) << o1);
-    data_[i1] |= val << o1;
-
-    if (n1 < num_register_bits()) {
-      const size_t i2 = i1 + 1;
-      const size_t n2 = num_register_bits() - n1;
-      data_[i2] &= value_type(-1) ^ ((value_type(1) << n2) - 1);
-      data_[i2] |= val >> n1;
-    }
-  }
-
- private:
-  std::unique_ptr<value_type[]> data_;
-
-  static constexpr size_t num_register_bits() {
-    return NumRegisterBits;
-  }
-
-  static constexpr size_t num_value_bits() {
-    return sizeof(Value) * CHAR_BIT;
-  }
-
-  static constexpr size_t data_length(size_t num_registers) {
-    return (num_registers * num_register_bits() + num_value_bits() - 1) / num_value_bits();
-  }
-
-  static_assert(std::is_unsigned<value_type>::value,
-                "Value should be an unsigned integral type.");
-
-  static_assert(sizeof(value_type) * CHAR_BIT >= NumRegisterBits,
-                "Value should have at least NumRegisterBits bits.");
-};
-
-template<typename Value>
-class dense_array_primitive {
- public:
-  typedef Value value_type;
-
-  dense_array_primitive(size_t size) : data_(new value_type[size]()) {}
-  virtual ~dense_array_primitive() {}
-
-  value_type get(size_t pos) const {
-    return data_[pos];
-  }
-
-  void set(size_t pos, value_type val) {
-    data_[pos] = val;
-  }
-
- private:
-  std::unique_ptr<value_type[]> data_;
-};
-
-template<>
-class dense_array<8, uint8_t> : public dense_array_primitive<uint8_t> {
- public:
-  dense_array(size_t size) : dense_array_primitive(size) {}
-};
-
-template<>
-class dense_array<16, uint16_t> : public dense_array_primitive<uint16_t> {
- public:
-  dense_array(size_t size) : dense_array_primitive(size) {}
-};
-
-template<>
-class dense_array<32, uint32_t> : public dense_array_primitive<uint32_t> {
- public:
-  dense_array(size_t size) : dense_array_primitive(size) {}
-};
-
-template<>
-class dense_array<64, uint64_t> : public dense_array_primitive<uint64_t> {
- public:
-  dense_array(size_t size) : dense_array_primitive(size) {}
-};
-}  // namespace hyperloglog_hip
-
-#endif  // HYPERLOGLOG_HIP_DENSE_ARRAY_H_
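For context, the deleted dense_array packs one NumRegisterBits-wide register per HyperLogLog bucket into a flat array of unsigned words, and its get/set handle registers that straddle a word boundary. Below is a standalone sketch of that packing arithmetic, hand-inlined for a 5-bit register stored at position 1 in a byte array; the concrete values and indices are my own illustration, not code from the deleted file.

```cpp
#include <cassert>
#include <climits>
#include <cstddef>
#include <cstdint>
#include <vector>

// Sketch of the packing used by dense_array<5, uint8_t>: register i occupies
// bits [5*i, 5*i + 5) of a contiguous uint8_t buffer, so a register may
// straddle two adjacent bytes.
int main() {
    const std::size_t kRegisterBits = 5;                       // NumRegisterBits
    const std::size_t kValueBits = sizeof(uint8_t) * CHAR_BIT; // 8 bits per word
    const std::size_t num_registers = 16;
    // same rounding-up formula as dense_array::data_length()
    std::vector<uint8_t> data((num_registers * kRegisterBits + kValueBits - 1) / kValueBits, 0);

    // set register 1 (bits 5..9) to 0b10110 = 22: the low 3 bits land in
    // data[0], the high 2 bits spill into data[1]
    const uint8_t val = 22;
    data[0] |= static_cast<uint8_t>(val << 5);  // bits 5..7 of byte 0
    data[1] |= static_cast<uint8_t>(val >> 3);  // bits 0..1 of byte 1

    // read it back the way dense_array::get() does for a straddling register
    uint8_t v = static_cast<uint8_t>(data[0] >> 5);    // n1 = 3 bits from byte 0
    v |= static_cast<uint8_t>((data[1] & 0b11) << 3);  // n2 = 2 bits from byte 1
    assert(v == 22);
    return 0;
}
```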
include/hll/distinct_counter.h
@@ -1,64 +0,0 @@
-// Source : https://github.com/iwiwi/hyperloglog-hip
-#ifndef HYPERLOGLOG_HIP_DISTINCT_COUNTER_H_
-#define HYPERLOGLOG_HIP_DISTINCT_COUNTER_H_
-
-#include <algorithm>
-#include <cstdint>
-#include <cmath>
-#include "dense_array.h"
-
-namespace hyperloglog_hip {
-template<typename Key, typename Hash = std::hash<Key>, int NumRegisterBits = 5>
-class distinct_counter {
- public:
-  typedef Key key_type;
-  typedef Hash hash_type;
-
-  distinct_counter(size_t num_bucket_bits = 12)
-    : num_bucket_bits_(num_bucket_bits), M_(1 << num_bucket_bits),
-      c_(0), s_(1 << num_bucket_bits) {}
-
-  void insert(const key_type &v) {
-    static constexpr uint64_t num_register_bits = NumRegisterBits;
-    static constexpr uint64_t register_limit = (uint64_t(1) << num_register_bits) - 1;
-
-    const uint64_t h = hash_(v) * magic1() + magic2();
-    const uint64_t h0 = h & ((uint64_t(1) << num_bucket_bits_) - 1);
-    const uint64_t h1 = h >> num_bucket_bits_;
-
-    const uint64_t b_old = M_.get(h0);
-    const uint64_t b_new = h1 == 0 ? register_limit :
-        std::min(register_limit, uint64_t(1 + __builtin_ctzl(h1)));
-
-    if (b_new > b_old) {
-      M_.set(h0, b_new);
-      c_ += 1.0 / (s_ / (uint64_t(1) << num_bucket_bits_));
-      s_ -= 1.0 / (uint64_t(1) << b_old);
-      if (b_new < register_limit) {
-        s_ += 1.0 / (uint64_t(1) << b_new);
-      }
-    }
-  }
-
-  size_t count() const {
-    return round(c_);
-  }
-
- private:
-  const size_t num_bucket_bits_;
-  dense_array<NumRegisterBits> M_;
-  double c_, s_;
-  hash_type hash_;
-
-  static constexpr uint64_t magic1() {
-    return 9223372036854775837ULL;
-  }
-
-  static constexpr uint64_t magic2() {
-    return 1234567890123456789ULL;
-  }
-};
-}  // namespace hyperloglog_hip
-
-#endif  // HYPERLOGLOG_HIP_DISTINCT_COUNTER_H_
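The deleted counter is a HyperLogLog variant with historic inverse probability (HIP) estimation: insert() hashes a key into one of 2^num_bucket_bits registers and count() returns the running HIP estimate, so duplicates barely move the count. A minimal usage sketch, assuming the header above were still present (the 100k/10k workload is my own illustration):

```cpp
#include <cstdint>
#include <iostream>

#include "hll/distinct_counter.h"

int main() {
    // defaults: 2^12 = 4096 buckets, 5-bit registers, std::hash<uint64_t>
    hyperloglog_hip::distinct_counter<uint64_t> counter;

    // insert 100k values with many duplicates (only 10k distinct keys)
    for (uint64_t i = 0; i < 100000; i++) {
        counter.insert(i % 10000);
    }

    // count() returns an estimate close to, but not exactly, 10000
    std::cout << "estimated distinct: " << counter.count() << "\n";
    return 0;
}
```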
include/topster.h
@@ -7,9 +7,6 @@
 #include <unordered_map>
 #include <field.h>
 #include "filter_result_iterator.h"
-#include "hll/distinct_counter.h"
-
-#define HYPERLOGLOG_THRESHOLD 2048
 
 struct KV {
     int8_t match_score_index{};
@@ -139,17 +136,10 @@ struct Topster {
     spp::sparse_hash_map<uint64_t, Topster*> group_kv_map;
     size_t distinct;
 
-    //hyperloglog lib counter for counting total unique group values more than 512
-    hyperloglog_hip::distinct_counter<uint64_t> hyperloglog_counter;
-    std::set<uint64_t> groups_found; //to keep count of total unique group less than 512
-    uint32_t groups_found_count = 0; //to keep track of groups found in current pass
-    bool is_first_pass_completed = false;
-
     explicit Topster(size_t capacity): Topster(capacity, 0) {
     }
 
-    explicit Topster(size_t capacity, size_t distinct, bool is_first_pass_completed = false): MAX_SIZE(capacity),
-                     size(0), distinct(distinct), is_first_pass_completed(is_first_pass_completed) {
+    explicit Topster(size_t capacity, size_t distinct): MAX_SIZE(capacity), size(0), distinct(distinct) {
         // we allocate data first to get a memory block whose indices are then assigned to `kvs`
         // we use separate **kvs for easier pointer swaps
         data = new KV[capacity];
@@ -194,32 +184,24 @@ struct Topster {
             LOG(INFO) << "kv key: " << mkv.first << " => " << mkv.second->scores[mkv.second->match_score_index];
         }*/
 
-    /* returns either 0 or 1
-     * 1 -> distinct_id was added to group_kv_map in second pass, which will aggregate found counts to groups_processed
-     * 0 -> distinct_id was added to kv_map in first pass
-     * -1 -> distinct_id was not added
-     */
+        int ret = 1;
 
         bool less_than_min_heap = (size >= MAX_SIZE) && is_smaller(kv, kvs[0]);
         size_t heap_op_index = 0;
 
         if(!distinct && less_than_min_heap) {
             // for non-distinct, if incoming value is smaller than min-heap ignore
-            return -1;
+            return 0;
         }
 
         bool SIFT_DOWN = true;
 
-        if(distinct && is_first_pass_completed) {
-            if(kv_map.count(kv->distinct_key) == 0) {
-                return -1;
-            }
-
-            const auto& doc_seq_id_exists =
+        if(distinct) {
+            const auto& doc_seq_id_exists =
                 (group_doc_seq_ids.find(kv->key) != group_doc_seq_ids.end());
 
             if(doc_seq_id_exists) {
-                return -1;
+                ret = 2;
             }
             group_doc_seq_ids.emplace(kv->key);
 
@@ -231,16 +213,14 @@ struct Topster {
                 Topster* g_topster = new Topster(distinct, 0);
                 g_topster->add(kv);
                 group_kv_map.insert({kv->distinct_key, g_topster});
-                groups_found_count++;
             }
 
-            return 1;
+            return ret;
 
         } else { // not distinct
             //LOG(INFO) << "Searching for key: " << kv->key;
-            auto key = distinct ? kv->distinct_key : kv->key;
-
-            const auto& found_it = kv_map.find(key);
+            const auto& found_it = kv_map.find(kv->key);
             bool is_duplicate_key = (found_it != kv_map.end());
 
             /*
@@ -257,15 +237,14 @@ struct Topster {
 
                 bool smaller_than_existing = is_smaller(kv, existing_kv);
                 if(smaller_than_existing) {
-                    return -1;
+                    return 0;
                 }
 
                 SIFT_DOWN = true;
 
                 // replace existing kv and sift down
                 heap_op_index = existing_kv->array_index;
-                auto heap_op_index_key = distinct ? kvs[heap_op_index]->distinct_key : kvs[heap_op_index]->key;
-                kv_map.erase(heap_op_index_key);
+                kv_map.erase(kvs[heap_op_index]->key);
             } else { // not duplicate
 
                 if(size < MAX_SIZE) {
@@ -278,24 +257,12 @@ struct Topster {
                     // we have to replace min heap element since array is full
                     SIFT_DOWN = true;
                     heap_op_index = 0;
-                    auto heap_op_index_key = distinct ? kvs[heap_op_index]->distinct_key : kvs[heap_op_index]->key;
-                    kv_map.erase(heap_op_index_key);
+                    kv_map.erase(kvs[heap_op_index]->key);
                 }
             }
 
             // kv will be copied into the pointer at heap_op_index
-            kv_map.emplace(key, kvs[heap_op_index]);
-
-            if(distinct) {
-                hyperloglog_counter.insert(kv->distinct_key);
-
-                if(groups_found.size() < HYPERLOGLOG_THRESHOLD) {
-                    groups_found.insert(kv->distinct_key);
-                    groups_found_count = groups_found.size();
-                } else {
-                    groups_found_count = hyperloglog_counter.count();
-                }
-            }
+            kv_map.emplace(kv->key, kvs[heap_op_index]);
         }
 
         // we have to replace the existing element in the heap and sift down
@@ -333,7 +300,7 @@ struct Topster {
             }
         }
 
-        return 0;
+        return ret;
     }
 
     static bool is_greater(const struct KV* i, const struct KV* j) {
@@ -373,18 +340,4 @@ struct Topster {
     KV* getKV(uint32_t index) {
         return kvs[index];
     }
-
-    const size_t get_total_unique_groups() const {
-        auto groups_count = groups_found.size();
-        return groups_count < 512 ? groups_count : hyperloglog_counter.count();
-    }
-
-    void set_first_pass_complete() {
-        is_first_pass_completed = true;
-        groups_found_count = 0;
-    }
-
-    const size_t get_current_groups_count() const {
-        return groups_found_count;
-    }
 };
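After the revert, Topster::add() is back to its single-pass contract: in grouping (distinct) mode it returns 1 when a document is newly recorded under its group and 2 when that exact document was already seen for the group, while 0 is only returned on the non-distinct rejection paths. The `ret < 2` hunks in src/index.cpp below rely on this to avoid double-counting. A simplified, self-contained sketch of that caller pattern; MiniTopster and the sample hits are illustrative stand-ins, not the real Topster/KV types:

```cpp
#include <cstddef>
#include <cstdint>
#include <map>
#include <set>
#include <utility>

// Stand-in for Topster: only the return-code convention mirrors the
// reverted code in include/topster.h.
struct MiniTopster {
    std::set<std::pair<uint64_t, uint64_t>> seen; // (distinct_id, seq_id)

    // 1 -> doc newly recorded under its group, 2 -> same doc seen again
    int add(uint64_t seq_id, uint64_t distinct_id) {
        return seen.insert({distinct_id, seq_id}).second ? 1 : 2;
    }
};

int main() {
    MiniTopster topster;
    std::map<uint64_t, uint32_t> groups_processed;
    const std::size_t group_limit = 3;

    // two distinct docs for group 100, with one duplicate hit
    const std::pair<uint64_t, uint64_t> hits[] = {{7, 100}, {8, 100}, {7, 100}};
    for (const auto& [seq_id, distinct_id] : hits) {
        int ret = topster.add(seq_id, distinct_id);
        // matches the restored `if(group_limit != 0 && ret < 2)` call sites:
        // ret == 2 (duplicate doc in group) must not inflate the group count
        if (group_limit != 0 && ret < 2) {
            groups_processed[distinct_id]++;
        }
    }
    // groups_processed[100] == 2: seq_id 7 counted once, seq_id 8 once
    return 0;
}
```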
src/collection.cpp
@@ -2441,7 +2441,7 @@ Option<nlohmann::json> Collection::search(std::string raw_query,
 
     // for grouping we have to aggregate group set sizes to a count value
     if(group_limit) {
-        total = search_params->topster->get_total_unique_groups() + override_result_kvs.size();
+        total = search_params->groups_processed.size() + override_result_kvs.size();
     } else {
         total = search_params->all_result_ids_len;
     }
@@ -3630,7 +3630,7 @@ void Collection::populate_result_kvs(Topster *topster, std::vector<std::vector<K
             if(group_count_index >= 0) {
                 const auto& itr = groups_processed.find(kv_head->distinct_key);
                 if(itr != groups_processed.end()) {
-                    kv_head->scores[group_count_index] = (int)itr->second * group_sort_order;
+                    kv_head->scores[group_count_index] = itr->second * group_sort_order;
                 }
             }
             gtopster.add(kv_head);
src/index.cpp
@@ -1565,7 +1565,7 @@ void Index::do_facets(std::vector<facet> & facets, facet_query_t & facet_query,
 }
 
 void Index::aggregate_topster(Topster* agg_topster, Topster* index_topster) {
-    if(index_topster->distinct && index_topster->is_first_pass_completed) {
+    if(index_topster->distinct) {
         for(auto &group_topster_entry: index_topster->group_kv_map) {
             Topster* group_topster = group_topster_entry.second;
             for(const auto& map_kv: group_topster->kv_map) {
@@ -2215,58 +2215,6 @@ Option<bool> Index::run_search(search_args* search_params, const std::string& co
                                const std::vector<facet_index_type_t>& facet_index_types, bool enable_typos_for_numerical_tokens,
                                bool enable_synonyms, bool synonym_prefix, uint32_t synonym_num_typos,
                                bool enable_typos_for_alpha_numerical_tokens) {
-    if(search_params->group_limit != 0) {
-        search(search_params->field_query_tokens,
-               search_params->search_fields,
-               search_params->match_type,
-               search_params->filter_tree_root, search_params->facets, search_params->facet_query,
-               search_params->max_facet_values,
-               search_params->included_ids, search_params->excluded_ids,
-               search_params->sort_fields_std, search_params->num_typos,
-               search_params->topster, search_params->curated_topster,
-               search_params->per_page, search_params->offset, search_params->token_order,
-               search_params->prefixes, search_params->drop_tokens_threshold,
-               search_params->all_result_ids_len, search_params->groups_processed,
-               search_params->searched_queries,
-               search_params->qtoken_set,
-               search_params->raw_result_kvs, search_params->override_result_kvs,
-               search_params->typo_tokens_threshold,
-               search_params->group_limit,
-               search_params->group_by_fields,
-               search_params->group_missing_values,
-               search_params->default_sorting_field,
-               search_params->prioritize_exact_match,
-               search_params->prioritize_token_position,
-               search_params->prioritize_num_matching_fields,
-               search_params->exhaustive_search,
-               search_params->concurrency,
-               search_params->search_cutoff_ms,
-               search_params->min_len_1typo,
-               search_params->min_len_2typo,
-               search_params->max_candidates,
-               search_params->infixes,
-               search_params->max_extra_prefix,
-               search_params->max_extra_suffix,
-               search_params->facet_query_num_typos,
-               search_params->filter_curated_hits,
-               search_params->split_join_tokens,
-               search_params->vector_query,
-               search_params->facet_sample_percent,
-               search_params->facet_sample_threshold,
-               collection_name,
-               search_params->drop_tokens_mode,
-               facet_index_types,
-               enable_typos_for_numerical_tokens,
-               enable_synonyms,
-               synonym_prefix,
-               synonym_num_typos,
-               search_params->enable_lazy_filter,
-               enable_typos_for_alpha_numerical_tokens
-        );
-        search_params->topster->set_first_pass_complete();
-        search_params->curated_topster->set_first_pass_complete();
-
-    }
     auto res = search(search_params->field_query_tokens,
                       search_params->search_fields,
                       search_params->match_type,
@@ -2314,6 +2262,7 @@ Option<bool> Index::run_search(search_args* search_params, const std::string& co
                       search_params->enable_lazy_filter,
                       enable_typos_for_alpha_numerical_tokens
     );
+
     return res;
 }
 
@@ -2947,7 +2896,7 @@ Option<bool> Index::search(std::vector<query_tokens_t>& field_query_tokens, cons
                     KV kv(searched_queries.size(), seq_id, distinct_id, match_score_index, scores);
                     int ret = topster->add(&kv);
 
-                    if(group_limit != 0 && ret > 0) {
+                    if(group_limit != 0 && ret < 2) {
                         groups_processed[distinct_id]++;
                     }
 
@@ -3110,7 +3059,7 @@ Option<bool> Index::search(std::vector<query_tokens_t>& field_query_tokens, cons
                 kv.vector_distance = vec_dist_score;
                 int ret = topster->add(&kv);
 
-                if(group_limit != 0 && ret > 0) {
+                if(group_limit != 0 && ret < 2) {
                     groups_processed[distinct_id]++;
                 }
 
@@ -3581,7 +3530,7 @@ Option<bool> Index::search(std::vector<query_tokens_t>& field_query_tokens, cons
                 auto ret = topster->add(&kv);
                 vec_search_ids.push_back(seq_id);
 
-                if(group_limit != 0 && ret > 0) {
+                if(group_limit != 0 && ret < 2) {
                     groups_processed[distinct_id]++;
                 }
             }
@@ -4263,8 +4212,7 @@ Option<bool> Index::fuzzy_search_fields(const std::vector<search_field_t>& the_f
 
     resume_typo_loop:
 
-    auto current_groups_count = topster ? topster->get_current_groups_count() : 0;
-    auto results_count = group_limit != 0 ? current_groups_count : all_result_ids_len;
+    auto results_count = group_limit != 0 ? groups_processed.size() : all_result_ids_len;
     if(!exhaustive_search && results_count >= typo_tokens_threshold) {
         // if typo threshold is breached, we are done
         return Option<bool>(true);
@@ -4695,7 +4643,7 @@ Option<bool> Index::search_across_fields(const std::vector<token_t>& query_token
             }
 
             int ret = topster->add(&kv);
-            if(group_limit != 0 && ret > 0) {
+            if(group_limit != 0 && ret < 2) {
                 groups_processed[distinct_id]++;
             }
             result_ids.push_back(seq_id);
@@ -5496,7 +5444,7 @@ Option<bool> Index::do_phrase_search(const size_t num_search_fields, const std::
             KV kv(searched_queries.size(), seq_id, distinct_id, match_score_index, scores, std::move(references));
 
             int ret = actual_topster->add(&kv);
-            if(group_limit != 0 && ret > 0) {
+            if(group_limit != 0 && ret < 2) {
                 groups_processed[distinct_id]++;
             }
 
@@ -5671,7 +5619,7 @@ Option<bool> Index::do_infix_search(const size_t num_search_fields, const std::v
                     KV kv(searched_queries.size(), seq_id, distinct_id, match_score_index, scores, std::move(references));
                     int ret = actual_topster->add(&kv);
 
-                    if(group_limit != 0 && ret > 0) {
+                    if(group_limit != 0 && ret < 2) {
                         groups_processed[distinct_id]++;
                     }
 
@@ -6033,8 +5981,7 @@ Option<bool> Index::search_wildcard(filter_node_t const* const& filter_tree_root
 
             searched_queries.push_back({});
 
-            topsters[thread_id] = new Topster(topster->MAX_SIZE, topster->distinct, topster->is_first_pass_completed);
-            topsters[thread_id]->kv_map = topster->kv_map;
+            topsters[thread_id] = new Topster(topster->MAX_SIZE, topster->distinct);
             auto& compute_sort_score_status = compute_sort_score_statuses[thread_id] = nullptr;
 
             thread_pool->enqueue([this, &parent_search_begin, &parent_search_stop_ms, &parent_search_cutoff,
@@ -6096,7 +6043,7 @@ Option<bool> Index::search_wildcard(filter_node_t const* const& filter_tree_root
                 KV kv(searched_queries.size(), seq_id, distinct_id, match_score_index, scores, std::move(references));
 
                 int ret = topsters[thread_id]->add(&kv);
-                if(group_limit != 0 && ret > 0) {
+                if(group_limit != 0 && ret < 2) {
                     tgroups_processed[thread_id][distinct_id]++;
                 }
                 if(check_for_circuit_break && ((i + 1) % (1 << 15)) == 0) {
@@ -6611,7 +6558,7 @@ void Index::score_results(const std::vector<sort_by> & sort_fields, const uint16
                 //LOG(INFO) << "Seq id: " << seq_id << ", match_score: " << match_score;
                 KV kv(query_index, seq_id, distinct_id, match_score_index, scores);
                 int ret = topster->add(&kv);
-                if(group_limit != 0 && ret > 0) {
+                if(group_limit != 0 && ret < 2) {
                     groups_processed[distinct_id]++;
                 }
 
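The net effect in Index::run_search is that grouped queries go back to a single search() invocation; the removed block above was an extra first pass that primed the topsters before the scoring pass. A condensed, self-contained sketch of the before/after control flow; the types and do_search() are illustrative stand-ins for search_args and the real Index::search(...) call with its ~45 arguments:

```cpp
#include <cstddef>

// Stand-ins for the real types; only the pass structure is the point here.
struct MiniTopster {
    bool first_pass_done = false;
    void set_first_pass_complete() { first_pass_done = true; }
};

struct MiniSearchArgs {
    std::size_t group_limit = 0;
    MiniTopster* topster = nullptr;
    MiniTopster* curated_topster = nullptr;
};

bool do_search(MiniSearchArgs*) { return true; } // abbreviates Index::search(...)

// removed by this revert: grouped queries ran a discovery pass first
bool run_search_two_pass(MiniSearchArgs* p) {
    if (p->group_limit != 0) {
        do_search(p);                          // first pass: discover groups
        p->topster->set_first_pass_complete();
        p->curated_topster->set_first_pass_complete();
    }
    return do_search(p);                       // second pass: rank within known groups
}

// restored behaviour: one pass regardless of group_limit
bool run_search_single_pass(MiniSearchArgs* p) {
    return do_search(p);
}
```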
test/collection_grouping_test.cpp
@@ -461,7 +461,7 @@ TEST_F(CollectionGroupingTest, GroupingWithArrayFieldAndOverride) {
                                  "", 10,
                                  {}, {}, {"colors"}, 2).get();
 
-    ASSERT_EQ(10, res["found_docs"].get<size_t>());
+    ASSERT_EQ(9, res["found_docs"].get<size_t>());
     ASSERT_EQ(4, res["found"].get<size_t>());
     ASSERT_EQ(4, res["grouped_hits"].size());
 
@@ -755,54 +755,147 @@ TEST_F(CollectionGroupingTest, SortingOnGroupCount) {
     ASSERT_EQ(7, res2["grouped_hits"][2]["found"].get<int32_t>());
 }
 
-TEST_F(CollectionGroupingTest, SortingByGroupCount) {
+TEST_F(CollectionGroupingTest, SortingMoreThanMaxTopsterSize) {
+
+    std::vector<field> fields = {
+        field("title", field_types::STRING, false),
+        field("brand", field_types::STRING, true, true),
+        field("size", field_types::INT32, true, false),
+        field("colors", field_types::STRING, true, false),
+        field("rating", field_types::FLOAT, true, false)
+    };
+
+    Collection* coll3 = collectionManager.get_collection("coll3").get();
+    if(coll3 == nullptr) {
+        coll3 = collectionManager.create_collection("coll3", 4, fields, "rating").get();
+    }
+
+    for(auto i = 0; i < 150; i++) {
+        auto group_id = i;
+        for(auto j = 0; j < 4; j++) {
+            nlohmann::json doc;
+            doc["title"] = "Omega Casual Poplin Shirt";
+            doc["brand"] = "Omega";
+            doc["size"] = group_id;
+            doc["colors"] = "blue";
+            doc["rating"] = 4.5;
+
+            ASSERT_TRUE(coll3->add(doc.dump()).ok());
+        }
+    }
+
+    for(auto i = 150; i < 250; i++) {
+        auto group_id = i;
+        for(auto j = 0; j < 3; j++) {
+            nlohmann::json doc;
+            doc["title"] = "Beta Casual Poplin Shirt";
+            doc["brand"] = "Beta";
+            doc["size"] = group_id;
+            doc["colors"] = "white";
+            doc["rating"] = 4.3;
+
+            ASSERT_TRUE(coll3->add(doc.dump()).ok());
+        }
+    }
+
+    for(auto i = 250; i < 300; i++) {
+        auto group_id = i;
+        for(auto j = 0; j < 2; j++) {
+            nlohmann::json doc;
+            doc["title"] = "Zeta Casual Poplin Shirt";
+            doc["brand"] = "Zeta";
+            doc["size"] = group_id;
+            doc["colors"] = "red";
+            doc["rating"] = 4.6;
+
+            ASSERT_TRUE(coll3->add(doc.dump()).ok());
+        }
+    }
+
     //first search in desc order
     std::vector<sort_by> sort_fields = {sort_by("_group_found", "DESC")};
 
-    auto res = coll_group->search("shirt", {"title"}, "", {"brand"}, sort_fields, {0}, 10, 1, FREQUENCY,
+    auto res = coll3->search("*", {}, "", {"brand"}, sort_fields, {0}, 100, 2, FREQUENCY,
                                {false}, Index::DROP_TOKENS_THRESHOLD,
                                spp::sparse_hash_set<std::string>(),
                                spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
                                "", 10,
-                               {}, {}, {"brand"}, 10).get();
-    ASSERT_EQ(12, res["found_docs"].get<size_t>());
-    ASSERT_EQ(5, res["found"].get<size_t>());
-    ASSERT_EQ(5, res["grouped_hits"].size());
-
-    ASSERT_EQ(4, res["grouped_hits"][0]["found"].get<int32_t>());
-    ASSERT_EQ(3, res["grouped_hits"][1]["found"].get<int32_t>());
-    ASSERT_EQ(2, res["grouped_hits"][2]["found"].get<int32_t>());
-    ASSERT_EQ(2, res["grouped_hits"][3]["found"].get<int32_t>());
-    ASSERT_EQ(1, res["grouped_hits"][4]["found"].get<int32_t>());
+                               {}, {}, {"size"}, 2).get();
+
+    ASSERT_EQ(1000, res["found_docs"].get<size_t>());
+    ASSERT_EQ(300, res["found"].get<size_t>());
+    ASSERT_EQ(100, res["grouped_hits"].size());
+
+    ASSERT_EQ(4, res["grouped_hits"][4]["found"].get<int32_t>());
+
+    ASSERT_EQ(4, res["grouped_hits"][4]["found"].get<int32_t>());
+
+    ASSERT_EQ(3, res["grouped_hits"][50]["found"].get<int32_t>());
+
+    ASSERT_EQ(3, res["grouped_hits"][99]["found"].get<int32_t>());
+
+
+    res = coll3->search("*", {}, "", {"brand"}, sort_fields, {0}, 100, 3, FREQUENCY,
+                               {false}, Index::DROP_TOKENS_THRESHOLD,
+                               spp::sparse_hash_set<std::string>(),
+                               spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
+                               "", 10,
+                               {}, {}, {"size"}, 2).get();
+
+    ASSERT_EQ(1000, res["found_docs"].get<size_t>());
+    ASSERT_EQ(300, res["found"].get<size_t>());
+    ASSERT_EQ(100, res["grouped_hits"].size());
+
+    ASSERT_EQ(3, res["grouped_hits"][4]["found"].get<int32_t>());
+
+    ASSERT_EQ(3, res["grouped_hits"][4]["found"].get<int32_t>());
+
+    ASSERT_EQ(2, res["grouped_hits"][50]["found"].get<int32_t>());
+
+    ASSERT_EQ(2, res["grouped_hits"][99]["found"].get<int32_t>());
 
     //search in asc order
 
     std::vector<sort_by> sort_fields2 = {sort_by("_group_found", "ASC")};
 
-    auto res2 = coll_group->search("shirt", {"title"}, "", {"brand"}, sort_fields2, {0}, 10, 1, FREQUENCY,
+    auto res2 = coll3->search("*", {}, "", {"brand"}, sort_fields2, {0}, 100, 1, FREQUENCY,
                                {false}, Index::DROP_TOKENS_THRESHOLD,
                                spp::sparse_hash_set<std::string>(),
                                spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
                                "", 10,
-                               {}, {}, {"brand"}, 10).get();
-
-    ASSERT_EQ(12, res2["found_docs"].get<size_t>());
-    ASSERT_EQ(5, res2["found"].get<size_t>());
-    ASSERT_EQ(5, res2["grouped_hits"].size());
-
-    ASSERT_EQ(1, res2["grouped_hits"][0]["found"].get<int32_t>());
+                               {}, {}, {"size"}, 2).get();
+
+    ASSERT_EQ(1000, res2["found_docs"].get<size_t>());
+    ASSERT_EQ(300, res2["found"].get<size_t>());
+    ASSERT_EQ(100, res2["grouped_hits"].size());
+
+    ASSERT_EQ(2, res2["grouped_hits"][0]["found"].get<int32_t>());
 
     ASSERT_EQ(2, res2["grouped_hits"][1]["found"].get<int32_t>());
 
-    ASSERT_EQ(2, res2["grouped_hits"][2]["found"].get<int32_t>());
-    ASSERT_EQ(3, res2["grouped_hits"][3]["found"].get<int32_t>());
-    ASSERT_EQ(4, res2["grouped_hits"][4]["found"].get<int32_t>());
+    ASSERT_EQ(3, res2["grouped_hits"][50]["found"].get<int32_t>());
+
+    ASSERT_EQ(3, res2["grouped_hits"][99]["found"].get<int32_t>());
+
+    res2 = coll3->search("*", {}, "", {"brand"}, sort_fields2, {0}, 100, 2, FREQUENCY,
+                               {false}, Index::DROP_TOKENS_THRESHOLD,
+                               spp::sparse_hash_set<std::string>(),
+                               spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
+                               "", 10,
+                               {}, {}, {"size"}, 2).get();
+
+    ASSERT_EQ(1000, res2["found_docs"].get<size_t>());
+    ASSERT_EQ(300, res2["found"].get<size_t>());
+    ASSERT_EQ(100, res2["grouped_hits"].size());
+
+    ASSERT_EQ(3, res2["grouped_hits"][0]["found"].get<int32_t>());
+
+    ASSERT_EQ(3, res2["grouped_hits"][1]["found"].get<int32_t>());
+
+    ASSERT_EQ(4, res2["grouped_hits"][50]["found"].get<int32_t>());
+
+    ASSERT_EQ(4, res2["grouped_hits"][99]["found"].get<int32_t>());
 }
 
 TEST_F(CollectionGroupingTest, GroupSortingWithoutGroupingFields) {
@@ -1039,7 +1132,7 @@ TEST_F(CollectionGroupingTest, GroupByMultipleFacetFieldsWithPinning) {
                                  "", 10,
                                  {"3:1,4:2"}, {}, {"size"}, 2).get();
 
-    ASSERT_EQ(7, res["found_docs"].get<size_t>());
+    ASSERT_EQ(5, res["found_docs"].get<size_t>());
     ASSERT_EQ(4, res["found"].get<size_t>());
     ASSERT_EQ(4, res["grouped_hits"].size());
 

test/topster_test.cpp
@@ -179,7 +179,7 @@ TEST(TopsterTest, MaxFloatValues) {
 }
 
 TEST(TopsterTest, DistinctIntValues) {
-    Topster dist_topster(5, 2);
+    Topster dist_topster(5, 2);
 
     struct {
         uint16_t query_index;
@@ -214,20 +214,9 @@ TEST(TopsterTest, DistinctIntValues) {
         dist_topster.add(&kv);
     }
 
-    dist_topster.set_first_pass_complete();
-    for(int i = 0; i < 14; i++) {
-        int64_t scores[3];
-        scores[0] = int64_t(data[i].match_score);
-        scores[1] = data[i].primary_attr;
-        scores[2] = data[i].secondary_attr;
-
-        KV kv(data[i].query_index, i+100, data[i].distinct_key, 0, scores);
-        dist_topster.add(&kv);
-    }
-
     dist_topster.sort();
 
-    std::vector<uint64_t> distinct_ids = {10, 5, 8, 4, 1};
+    std::vector<uint64_t> distinct_ids = {4, 1, 8, 5, 9};
 
     for(uint32_t i = 0; i < dist_topster.size; i++) {
         EXPECT_EQ(distinct_ids[i], dist_topster.getDistinctKeyAt(i));
@@ -235,15 +224,15 @@ TEST(TopsterTest, DistinctIntValues) {
         if(distinct_ids[i] == 1) {
             EXPECT_EQ(12, (int) dist_topster.getKV(i)->scores[dist_topster.getKV(i)->match_score_index]);
             EXPECT_EQ(2, dist_topster.group_kv_map[dist_topster.getDistinctKeyAt(i)]->size);
-            EXPECT_EQ(11, dist_topster.group_kv_map[dist_topster.getDistinctKeyAt(i)]->getKV(0)->scores[0]);
-            EXPECT_EQ(12, dist_topster.group_kv_map[dist_topster.getDistinctKeyAt(i)]->getKV(1)->scores[0]);
+            EXPECT_EQ(12, dist_topster.group_kv_map[dist_topster.getDistinctKeyAt(i)]->getKV(0)->scores[0]);
+            EXPECT_EQ(11, dist_topster.group_kv_map[dist_topster.getDistinctKeyAt(i)]->getKV(1)->scores[0]);
         }
 
         if(distinct_ids[i] == 5) {
-            EXPECT_EQ(10, (int) dist_topster.getKV(i)->scores[dist_topster.getKV(i)->match_score_index]);
+            EXPECT_EQ(9, (int) dist_topster.getKV(i)->scores[dist_topster.getKV(i)->match_score_index]);
             EXPECT_EQ(2, dist_topster.group_kv_map[dist_topster.getDistinctKeyAt(i)]->size);
-            EXPECT_EQ(9, dist_topster.group_kv_map[dist_topster.getDistinctKeyAt(i)]->getKV(0)->scores[0]);
-            EXPECT_EQ(10, dist_topster.group_kv_map[dist_topster.getDistinctKeyAt(i)]->getKV(1)->scores[0]);
+            EXPECT_EQ(10, dist_topster.group_kv_map[dist_topster.getDistinctKeyAt(i)]->getKV(0)->scores[0]);
+            EXPECT_EQ(9, dist_topster.group_kv_map[dist_topster.getDistinctKeyAt(i)]->getKV(1)->scores[0]);
         }
     }
 }