mirror of
https://github.com/typesense/typesense.git
synced 2025-05-18 04:32:38 +08:00
Results count should match group size for group query.
This commit is contained in:
parent
1c398fac7e
commit
c5010a6a5f
5
TODO.md
5
TODO.md
@ -97,9 +97,10 @@
|
||||
- ~~Have a LOG(ERROR) level~~
|
||||
- ~~Handle SIGTERM which is sent when process is killed~~
|
||||
- ~~Use snappy compression for storage~~
|
||||
- ~~Fix exclude_scalar early returns~~
|
||||
- ~~Fix result ids length during grouped overrides~~
|
||||
- Fix override grouping (collate_included_ids)
|
||||
- Test for overriding result on second page
|
||||
- Fix exclude_scalar early returns
|
||||
- Fix result ids length during grouped overrides
|
||||
- at least 1 token match for proceeding with drop tokens
|
||||
- support wildcard query with filters
|
||||
- API for optimizing on disk storage
|
||||
|
@ -146,7 +146,7 @@ struct token_pos_cost_t {
|
||||
|
||||
struct facet_count_t {
|
||||
uint32_t count;
|
||||
spp::sparse_hash_map<uint64_t, uint32_t> groups; // used for faceting grouped results
|
||||
spp::sparse_hash_set<uint64_t> groups; // used for faceting grouped results
|
||||
|
||||
// used to fetch the actual document and value for representation
|
||||
uint32_t doc_id;
|
||||
|
@ -42,6 +42,7 @@ struct search_args {
|
||||
std::vector<std::string> group_by_fields;
|
||||
size_t group_limit;
|
||||
size_t all_result_ids_len;
|
||||
spp::sparse_hash_set<uint64_t> groups_processed;
|
||||
std::vector<std::vector<art_leaf*>> searched_queries;
|
||||
Topster* topster;
|
||||
Topster* curated_topster;
|
||||
@ -168,9 +169,9 @@ private:
|
||||
const std::vector<uint32_t>& curated_ids,
|
||||
std::vector<facet> & facets, const std::vector<sort_by> & sort_fields,
|
||||
const int num_typos, std::vector<std::vector<art_leaf*>> & searched_queries,
|
||||
Topster* topster, uint32_t** all_result_ids,
|
||||
size_t & all_result_ids_len, const token_ordering token_order = FREQUENCY,
|
||||
const bool prefix = false,
|
||||
Topster* topster, spp::sparse_hash_set<uint64_t>& groups_processed,
|
||||
uint32_t** all_result_ids, size_t & all_result_ids_len,
|
||||
const token_ordering token_order = FREQUENCY, const bool prefix = false,
|
||||
const size_t drop_tokens_threshold = Index::DROP_TOKENS_THRESHOLD,
|
||||
const size_t typo_tokens_threshold = Index::TYPO_TOKENS_THRESHOLD);
|
||||
|
||||
@ -178,7 +179,8 @@ private:
|
||||
const std::vector<uint32_t>& curated_ids,
|
||||
const std::vector<sort_by> & sort_fields, std::vector<token_candidates> & token_to_candidates,
|
||||
std::vector<std::vector<art_leaf*>> & searched_queries,
|
||||
Topster* topster, uint32_t** all_result_ids,
|
||||
Topster* topster, spp::sparse_hash_set<uint64_t>& groups_processed,
|
||||
uint32_t** all_result_ids,
|
||||
size_t & all_result_ids_len,
|
||||
const size_t typo_tokens_threshold);
|
||||
|
||||
@ -210,9 +212,9 @@ private:
|
||||
void remove_and_shift_offset_index(sorted_array &offset_index, const uint32_t *indices_sorted,
|
||||
const uint32_t indices_length);
|
||||
|
||||
void collate_curated_ids(const std::string & query, const std::string & field, const uint8_t field_id,
|
||||
const std::vector<uint32_t> & included_ids,
|
||||
Topster* curated_topster, std::vector<std::vector<art_leaf*>> & searched_queries);
|
||||
void collate_included_ids(const std::string & query, const std::string & field, const uint8_t field_id,
|
||||
const std::vector<uint32_t> & included_ids,
|
||||
Topster* curated_topster, std::vector<std::vector<art_leaf*>> & searched_queries);
|
||||
|
||||
uint64_t facet_token_hash(const field & a_field, const std::string &token);
|
||||
|
||||
@ -242,7 +244,9 @@ public:
|
||||
Topster* topster, Topster* curated_topster,
|
||||
const size_t per_page, const size_t page, const token_ordering token_order,
|
||||
const bool prefix, const size_t drop_tokens_threshold,
|
||||
size_t & all_result_ids_len, std::vector<std::vector<art_leaf*>> & searched_queries,
|
||||
size_t & all_result_ids_len,
|
||||
spp::sparse_hash_set<uint64_t>& groups_processed,
|
||||
std::vector<std::vector<art_leaf*>> & searched_queries,
|
||||
std::vector<std::vector<KV*>> & raw_result_kvs, std::vector<KV*> & override_result_kvs,
|
||||
const size_t typo_tokens_threshold);
|
||||
|
||||
@ -257,6 +261,7 @@ public:
|
||||
|
||||
void score_results(const std::vector<sort_by> & sort_fields, const uint16_t & query_index, const uint8_t & field_id,
|
||||
const uint32_t total_cost, Topster* topster, const std::vector<art_leaf *> & query_suggestion,
|
||||
spp::sparse_hash_set<uint64_t>& groups_processed,
|
||||
const uint32_t *result_ids, const size_t result_size) const;
|
||||
|
||||
static int32_t get_points_from_doc(const nlohmann::json &document, const std::string & default_sorting_field);
|
||||
|
@ -107,10 +107,12 @@ size_t ArrayUtils::exclude_scalar(const uint32_t *A, const size_t lenA,
|
||||
size_t indexA = 0, indexB = 0, res_index = 0;
|
||||
|
||||
if(A == nullptr && B == nullptr) {
|
||||
return 0;
|
||||
*out = nullptr;
|
||||
return 0;
|
||||
}
|
||||
|
||||
if(A == nullptr) {
|
||||
*out = nullptr;
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -1384,8 +1384,6 @@ int art_fuzzy_search(art_tree *t, const unsigned char *term, const int term_len,
|
||||
art_fuzzy_recurse(0, 0, t->root, -1, term, term_len, irow, jrow, min_cost, max_cost, prefix, nodes);
|
||||
}
|
||||
|
||||
PROCESS_NODES:
|
||||
|
||||
if(token_order == FREQUENCY) {
|
||||
std::sort(nodes.begin(), nodes.end(), compare_art_node_frequency);
|
||||
} else {
|
||||
|
@ -653,6 +653,7 @@ Option<nlohmann::json> Collection::search(const std::string & query, const std::
|
||||
std::vector<KV*> override_result_kvs;
|
||||
|
||||
size_t total_found = 0;
|
||||
spp::sparse_hash_set<uint64_t> groups_processed; // used to calculate total_found for grouped query
|
||||
|
||||
// send data to individual index threads
|
||||
size_t index_id = 0;
|
||||
@ -709,28 +710,22 @@ Option<nlohmann::json> Collection::search(const std::string & query, const std::
|
||||
auto & acc_facet = facets[fi];
|
||||
|
||||
for(auto & facet_kv: this_facet.result_map) {
|
||||
size_t count = 0;
|
||||
|
||||
|
||||
// for grouping we have to aggregate group counts to a count value
|
||||
/*if(search_params->group_limit) {
|
||||
// for every facet
|
||||
for(auto& a_facet: facets) {
|
||||
// for every facet value
|
||||
for(auto& fvalue: a_facet.result_map) {
|
||||
fvalue.second.count = fvalue.second.groups.size();
|
||||
}
|
||||
}
|
||||
}*/
|
||||
|
||||
if(acc_facet.result_map.count(facet_kv.first) == 0) {
|
||||
// not found, so set it
|
||||
count = facet_kv.second.count;
|
||||
if(index->search_params->group_limit) {
|
||||
// we have to add all group sets
|
||||
acc_facet.result_map[facet_kv.first].groups.insert(
|
||||
facet_kv.second.groups.begin(), facet_kv.second.groups.end()
|
||||
);
|
||||
} else {
|
||||
count = acc_facet.result_map[facet_kv.first].count + facet_kv.second.count;
|
||||
size_t count = 0;
|
||||
if(acc_facet.result_map.count(facet_kv.first) == 0) {
|
||||
// not found, so set it
|
||||
count = facet_kv.second.count;
|
||||
} else {
|
||||
count = acc_facet.result_map[facet_kv.first].count + facet_kv.second.count;
|
||||
}
|
||||
acc_facet.result_map[facet_kv.first].count = count;
|
||||
}
|
||||
|
||||
acc_facet.result_map[facet_kv.first].count = count;
|
||||
acc_facet.result_map[facet_kv.first].doc_id = facet_kv.second.doc_id;
|
||||
acc_facet.result_map[facet_kv.first].array_pos = facet_kv.second.array_pos;
|
||||
acc_facet.result_map[facet_kv.first].query_token_pos = facet_kv.second.query_token_pos;
|
||||
@ -744,7 +739,25 @@ Option<nlohmann::json> Collection::search(const std::string & query, const std::
|
||||
}
|
||||
}
|
||||
|
||||
total_found += index->search_params->all_result_ids_len;
|
||||
if(group_limit) {
|
||||
groups_processed.insert(
|
||||
index->search_params->groups_processed.begin(),
|
||||
index->search_params->groups_processed.end()
|
||||
);
|
||||
} else {
|
||||
total_found += index->search_params->all_result_ids_len;
|
||||
}
|
||||
}
|
||||
|
||||
// for grouping we have to aggregate group set sizes to a count value
|
||||
if(group_limit) {
|
||||
for(auto& acc_facet: facets) {
|
||||
for(auto& facet_kv: acc_facet.result_map) {
|
||||
facet_kv.second.count = facet_kv.second.groups.size();
|
||||
}
|
||||
}
|
||||
|
||||
total_found = groups_processed.size();
|
||||
}
|
||||
|
||||
if(!index_search_op.ok()) {
|
||||
@ -753,7 +766,7 @@ Option<nlohmann::json> Collection::search(const std::string & query, const std::
|
||||
|
||||
Topster* aggr_topster = nullptr;
|
||||
|
||||
if(group_limit > 0) {
|
||||
if(group_limit) {
|
||||
// group by query requires another round of topster-ing
|
||||
|
||||
// needs to be at least 1 since scoring is mandatory
|
||||
|
@ -744,7 +744,7 @@ void Index::do_facets(std::vector<facet> & facets, facet_query_t & facet_query,
|
||||
uint64_t fhash = facet_token_hash(facet_field, fvalue_str);
|
||||
|
||||
if(a_facet.result_map.count(fhash) == 0) {
|
||||
a_facet.result_map[fhash] = facet_count_t{0, spp::sparse_hash_map<uint64_t, uint32_t>(),
|
||||
a_facet.result_map[fhash] = facet_count_t{0, spp::sparse_hash_set<uint64_t>(),
|
||||
doc_seq_id, 0,
|
||||
spp::sparse_hash_map<uint32_t, token_pos_cost_t>()};
|
||||
}
|
||||
@ -754,10 +754,7 @@ void Index::do_facets(std::vector<facet> & facets, facet_query_t & facet_query,
|
||||
|
||||
if(search_params->group_limit) {
|
||||
uint64_t distinct_id = get_distinct_id(facet_to_index, doc_seq_id);
|
||||
if(a_facet.result_map[fhash].groups.count(distinct_id) == 0) {
|
||||
a_facet.result_map[fhash].groups.emplace(distinct_id, 0);
|
||||
}
|
||||
a_facet.result_map[fhash].groups[distinct_id] += 1;
|
||||
a_facet.result_map[fhash].groups.emplace(distinct_id);
|
||||
} else {
|
||||
a_facet.result_map[fhash].count += 1;
|
||||
}
|
||||
@ -784,6 +781,7 @@ void Index::search_candidates(const uint8_t & field_id, uint32_t* filter_ids, si
|
||||
const std::vector<sort_by> & sort_fields,
|
||||
std::vector<token_candidates> & token_candidates_vec,
|
||||
std::vector<std::vector<art_leaf*>> & searched_queries, Topster* topster,
|
||||
spp::sparse_hash_set<uint64_t>& groups_processed,
|
||||
uint32_t** all_result_ids, size_t & all_result_ids_len,
|
||||
const size_t typo_tokens_threshold) {
|
||||
const long long combination_limit = 10;
|
||||
@ -850,7 +848,7 @@ void Index::search_candidates(const uint8_t & field_id, uint32_t* filter_ids, si
|
||||
|
||||
// go through each matching document id and calculate match score
|
||||
score_results(sort_fields, (uint16_t) searched_queries.size(), field_id, total_cost, topster, query_suggestion,
|
||||
filtered_result_ids, filtered_results_size);
|
||||
groups_processed, filtered_result_ids, filtered_results_size);
|
||||
|
||||
delete[] filtered_result_ids;
|
||||
delete[] result_ids;
|
||||
@ -862,7 +860,7 @@ void Index::search_candidates(const uint8_t & field_id, uint32_t* filter_ids, si
|
||||
*all_result_ids = new_all_result_ids;
|
||||
|
||||
score_results(sort_fields, (uint16_t) searched_queries.size(), field_id, total_cost, topster, query_suggestion,
|
||||
result_ids, result_size);
|
||||
groups_processed, result_ids, result_size);
|
||||
delete[] result_ids;
|
||||
}
|
||||
|
||||
@ -1024,7 +1022,8 @@ void Index::run_search() {
|
||||
search_params->topster, search_params->curated_topster,
|
||||
search_params->per_page, search_params->page, search_params->token_order,
|
||||
search_params->prefix, search_params->drop_tokens_threshold,
|
||||
search_params->all_result_ids_len, search_params->searched_queries,
|
||||
search_params->all_result_ids_len, search_params->groups_processed,
|
||||
search_params->searched_queries,
|
||||
search_params->raw_result_kvs, search_params->override_result_kvs,
|
||||
search_params->typo_tokens_threshold);
|
||||
|
||||
@ -1038,12 +1037,12 @@ void Index::run_search() {
|
||||
}
|
||||
}
|
||||
|
||||
void Index::collate_curated_ids(const std::string & query, const std::string & field, const uint8_t field_id,
|
||||
const std::vector<uint32_t> & included_ids,
|
||||
Topster* curated_topster,
|
||||
std::vector<std::vector<art_leaf*>> & searched_queries) {
|
||||
void Index::collate_included_ids(const std::string & query, const std::string & field, const uint8_t field_id,
|
||||
const std::vector<uint32_t> & included_ids,
|
||||
Topster* curated_topster,
|
||||
std::vector<std::vector<art_leaf*>> & searched_queries) {
|
||||
|
||||
if(included_ids.size() == 0) {
|
||||
if(included_ids.empty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
@ -1106,9 +1105,9 @@ void Index::collate_curated_ids(const std::string & query, const std::string & f
|
||||
|
||||
KV kv(field_id, searched_queries.size(), seq_id, seq_id, match_score, scores);
|
||||
curated_topster->add(&kv);
|
||||
|
||||
searched_queries.push_back(override_query);
|
||||
}
|
||||
|
||||
searched_queries.push_back(override_query);
|
||||
}
|
||||
|
||||
void Index::search(Option<uint32_t> & outcome,
|
||||
@ -1124,7 +1123,8 @@ void Index::search(Option<uint32_t> & outcome,
|
||||
const size_t per_page, const size_t page, const token_ordering token_order,
|
||||
const bool prefix, const size_t drop_tokens_threshold,
|
||||
size_t & all_result_ids_len,
|
||||
std::vector<std::vector<art_leaf*>> & searched_queries,
|
||||
spp::sparse_hash_set<uint64_t>& groups_processed,
|
||||
std::vector<std::vector<art_leaf*>>& searched_queries,
|
||||
std::vector<std::vector<KV*>> & raw_result_kvs,
|
||||
std::vector<KV*> & override_result_kvs,
|
||||
const size_t typo_tokens_threshold) {
|
||||
@ -1140,7 +1140,7 @@ void Index::search(Option<uint32_t> & outcome,
|
||||
|
||||
uint32_t filter_ids_length = op_filter_ids_length.get();
|
||||
|
||||
// we will be removing all curated IDs from organic results before running topster
|
||||
// we will be removing all curated IDs from organic result ids before running topster
|
||||
std::set<uint32_t> curated_ids(included_ids.begin(), included_ids.end());
|
||||
curated_ids.insert(excluded_ids.begin(), excluded_ids.end());
|
||||
|
||||
@ -1165,8 +1165,8 @@ void Index::search(Option<uint32_t> & outcome,
|
||||
}
|
||||
|
||||
score_results(sort_fields_std, (uint16_t) searched_queries.size(), field_id, 0, topster, {},
|
||||
filter_ids, filter_ids_length);
|
||||
collate_curated_ids(query, field, field_id, included_ids, curated_topster, searched_queries);
|
||||
groups_processed, filter_ids, filter_ids_length);
|
||||
collate_included_ids(query, field, field_id, included_ids, curated_topster, searched_queries);
|
||||
|
||||
all_result_ids_len = filter_ids_length;
|
||||
all_result_ids = filter_ids;
|
||||
@ -1180,9 +1180,9 @@ void Index::search(Option<uint32_t> & outcome,
|
||||
const std::string & field = search_fields[i];
|
||||
|
||||
search_field(field_id, query, field, filter_ids, filter_ids_length, curated_ids_sorted, facets, sort_fields_std,
|
||||
num_typos, searched_queries, topster, &all_result_ids, all_result_ids_len,
|
||||
num_typos, searched_queries, topster, groups_processed, &all_result_ids, all_result_ids_len,
|
||||
token_order, prefix, drop_tokens_threshold, typo_tokens_threshold);
|
||||
collate_curated_ids(query, field, field_id, included_ids, curated_topster, searched_queries);
|
||||
collate_included_ids(query, field, field_id, included_ids, curated_topster, searched_queries);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -1214,7 +1214,7 @@ void Index::search(Option<uint32_t> & outcome,
|
||||
override_result_kvs.push_back(kv);
|
||||
}
|
||||
|
||||
// for the ids that are dropped, remove their corresponding facet components from facet results
|
||||
// add curated IDs to result count
|
||||
all_result_ids_len += curated_topster->size;
|
||||
|
||||
delete [] filter_ids;
|
||||
@ -1240,7 +1240,8 @@ void Index::search_field(const uint8_t & field_id, const std::string & query, co
|
||||
const std::vector<uint32_t>& curated_ids,
|
||||
std::vector<facet> & facets, const std::vector<sort_by> & sort_fields, const int num_typos,
|
||||
std::vector<std::vector<art_leaf*>> & searched_queries,
|
||||
Topster* topster, uint32_t** all_result_ids, size_t & all_result_ids_len,
|
||||
Topster* topster, spp::sparse_hash_set<uint64_t>& groups_processed,
|
||||
uint32_t** all_result_ids, size_t & all_result_ids_len,
|
||||
const token_ordering token_order, const bool prefix,
|
||||
const size_t drop_tokens_threshold, const size_t typo_tokens_threshold) {
|
||||
std::vector<std::string> tokens;
|
||||
@ -1354,7 +1355,7 @@ void Index::search_field(const uint8_t & field_id, const std::string & query, co
|
||||
if(!token_candidates_vec.empty() && token_candidates_vec.size() == tokens.size()) {
|
||||
// If all tokens were found, go ahead and search for candidates with what we have so far
|
||||
search_candidates(field_id, filter_ids, filter_ids_length, curated_ids, sort_fields, token_candidates_vec,
|
||||
searched_queries, topster, all_result_ids, all_result_ids_len,
|
||||
searched_queries, topster, groups_processed, all_result_ids, all_result_ids_len,
|
||||
typo_tokens_threshold);
|
||||
}
|
||||
|
||||
@ -1389,7 +1390,7 @@ void Index::search_field(const uint8_t & field_id, const std::string & query, co
|
||||
|
||||
return search_field(field_id, truncated_query, field, filter_ids, filter_ids_length, curated_ids,
|
||||
facets, sort_fields, num_typos,
|
||||
searched_queries, topster, all_result_ids, all_result_ids_len,
|
||||
searched_queries, topster, groups_processed, all_result_ids, all_result_ids_len,
|
||||
token_order, prefix);
|
||||
}
|
||||
}
|
||||
@ -1417,6 +1418,7 @@ void Index::log_leaves(const int cost, const std::string &token, const std::vect
|
||||
void Index::score_results(const std::vector<sort_by> & sort_fields, const uint16_t & query_index,
|
||||
const uint8_t & field_id, const uint32_t total_cost, Topster* topster,
|
||||
const std::vector<art_leaf *> &query_suggestion,
|
||||
spp::sparse_hash_set<uint64_t>& groups_processed,
|
||||
const uint32_t *result_ids, const size_t result_size) const {
|
||||
|
||||
spp::sparse_hash_map<const art_leaf*, uint32_t*> leaf_to_indices;
|
||||
@ -1536,6 +1538,7 @@ void Index::score_results(const std::vector<sort_by> & sort_fields, const uint16
|
||||
|
||||
if(search_params->group_limit != 0) {
|
||||
distinct_id = get_distinct_id(facet_to_id, seq_id);
|
||||
groups_processed.emplace(distinct_id);
|
||||
}
|
||||
|
||||
KV kv(field_id, query_index, seq_id, distinct_id, match_score, scores);
|
||||
|
Loading…
x
Reference in New Issue
Block a user