Results count should match group size for group query.

This commit is contained in:
kishorenc 2020-06-14 17:16:01 +05:30
parent 1c398fac7e
commit c5010a6a5f
7 changed files with 82 additions and 60 deletions

View File

@ -97,9 +97,10 @@
- ~~Have a LOG(ERROR) level~~
- ~~Handle SIGTERM which is sent when process is killed~~
- ~~Use snappy compression for storage~~
- ~~Fix exclude_scalar early returns~~
- ~~Fix result ids length during grouped overrides~~
- Fix override grouping (collate_included_ids)
- Test for overriding result on second page
- Fix exclude_scalar early returns
- Fix result ids length during grouped overrides
- at least 1 token match for proceeding with drop tokens
- support wildcard query with filters
- API for optimizing on disk storage

View File

@ -146,7 +146,7 @@ struct token_pos_cost_t {
struct facet_count_t {
uint32_t count;
spp::sparse_hash_map<uint64_t, uint32_t> groups; // used for faceting grouped results
spp::sparse_hash_set<uint64_t> groups; // used for faceting grouped results
// used to fetch the actual document and value for representation
uint32_t doc_id;

View File

@ -42,6 +42,7 @@ struct search_args {
std::vector<std::string> group_by_fields;
size_t group_limit;
size_t all_result_ids_len;
spp::sparse_hash_set<uint64_t> groups_processed;
std::vector<std::vector<art_leaf*>> searched_queries;
Topster* topster;
Topster* curated_topster;
@ -168,9 +169,9 @@ private:
const std::vector<uint32_t>& curated_ids,
std::vector<facet> & facets, const std::vector<sort_by> & sort_fields,
const int num_typos, std::vector<std::vector<art_leaf*>> & searched_queries,
Topster* topster, uint32_t** all_result_ids,
size_t & all_result_ids_len, const token_ordering token_order = FREQUENCY,
const bool prefix = false,
Topster* topster, spp::sparse_hash_set<uint64_t>& groups_processed,
uint32_t** all_result_ids, size_t & all_result_ids_len,
const token_ordering token_order = FREQUENCY, const bool prefix = false,
const size_t drop_tokens_threshold = Index::DROP_TOKENS_THRESHOLD,
const size_t typo_tokens_threshold = Index::TYPO_TOKENS_THRESHOLD);
@ -178,7 +179,8 @@ private:
const std::vector<uint32_t>& curated_ids,
const std::vector<sort_by> & sort_fields, std::vector<token_candidates> & token_to_candidates,
std::vector<std::vector<art_leaf*>> & searched_queries,
Topster* topster, uint32_t** all_result_ids,
Topster* topster, spp::sparse_hash_set<uint64_t>& groups_processed,
uint32_t** all_result_ids,
size_t & all_result_ids_len,
const size_t typo_tokens_threshold);
@ -210,9 +212,9 @@ private:
void remove_and_shift_offset_index(sorted_array &offset_index, const uint32_t *indices_sorted,
const uint32_t indices_length);
void collate_curated_ids(const std::string & query, const std::string & field, const uint8_t field_id,
const std::vector<uint32_t> & included_ids,
Topster* curated_topster, std::vector<std::vector<art_leaf*>> & searched_queries);
void collate_included_ids(const std::string & query, const std::string & field, const uint8_t field_id,
const std::vector<uint32_t> & included_ids,
Topster* curated_topster, std::vector<std::vector<art_leaf*>> & searched_queries);
uint64_t facet_token_hash(const field & a_field, const std::string &token);
@ -242,7 +244,9 @@ public:
Topster* topster, Topster* curated_topster,
const size_t per_page, const size_t page, const token_ordering token_order,
const bool prefix, const size_t drop_tokens_threshold,
size_t & all_result_ids_len, std::vector<std::vector<art_leaf*>> & searched_queries,
size_t & all_result_ids_len,
spp::sparse_hash_set<uint64_t>& groups_processed,
std::vector<std::vector<art_leaf*>> & searched_queries,
std::vector<std::vector<KV*>> & raw_result_kvs, std::vector<KV*> & override_result_kvs,
const size_t typo_tokens_threshold);
@ -257,6 +261,7 @@ public:
void score_results(const std::vector<sort_by> & sort_fields, const uint16_t & query_index, const uint8_t & field_id,
const uint32_t total_cost, Topster* topster, const std::vector<art_leaf *> & query_suggestion,
spp::sparse_hash_set<uint64_t>& groups_processed,
const uint32_t *result_ids, const size_t result_size) const;
static int32_t get_points_from_doc(const nlohmann::json &document, const std::string & default_sorting_field);

View File

@ -107,10 +107,12 @@ size_t ArrayUtils::exclude_scalar(const uint32_t *A, const size_t lenA,
size_t indexA = 0, indexB = 0, res_index = 0;
if(A == nullptr && B == nullptr) {
return 0;
*out = nullptr;
return 0;
}
if(A == nullptr) {
*out = nullptr;
return 0;
}

View File

@ -1384,8 +1384,6 @@ int art_fuzzy_search(art_tree *t, const unsigned char *term, const int term_len,
art_fuzzy_recurse(0, 0, t->root, -1, term, term_len, irow, jrow, min_cost, max_cost, prefix, nodes);
}
PROCESS_NODES:
if(token_order == FREQUENCY) {
std::sort(nodes.begin(), nodes.end(), compare_art_node_frequency);
} else {

View File

@ -653,6 +653,7 @@ Option<nlohmann::json> Collection::search(const std::string & query, const std::
std::vector<KV*> override_result_kvs;
size_t total_found = 0;
spp::sparse_hash_set<uint64_t> groups_processed; // used to calculate total_found for grouped query
// send data to individual index threads
size_t index_id = 0;
@ -709,28 +710,22 @@ Option<nlohmann::json> Collection::search(const std::string & query, const std::
auto & acc_facet = facets[fi];
for(auto & facet_kv: this_facet.result_map) {
size_t count = 0;
// for grouping we have to aggregate group counts to a count value
/*if(search_params->group_limit) {
// for every facet
for(auto& a_facet: facets) {
// for every facet value
for(auto& fvalue: a_facet.result_map) {
fvalue.second.count = fvalue.second.groups.size();
}
}
}*/
if(acc_facet.result_map.count(facet_kv.first) == 0) {
// not found, so set it
count = facet_kv.second.count;
if(index->search_params->group_limit) {
// we have to add all group sets
acc_facet.result_map[facet_kv.first].groups.insert(
facet_kv.second.groups.begin(), facet_kv.second.groups.end()
);
} else {
count = acc_facet.result_map[facet_kv.first].count + facet_kv.second.count;
size_t count = 0;
if(acc_facet.result_map.count(facet_kv.first) == 0) {
// not found, so set it
count = facet_kv.second.count;
} else {
count = acc_facet.result_map[facet_kv.first].count + facet_kv.second.count;
}
acc_facet.result_map[facet_kv.first].count = count;
}
acc_facet.result_map[facet_kv.first].count = count;
acc_facet.result_map[facet_kv.first].doc_id = facet_kv.second.doc_id;
acc_facet.result_map[facet_kv.first].array_pos = facet_kv.second.array_pos;
acc_facet.result_map[facet_kv.first].query_token_pos = facet_kv.second.query_token_pos;
@ -744,7 +739,25 @@ Option<nlohmann::json> Collection::search(const std::string & query, const std::
}
}
total_found += index->search_params->all_result_ids_len;
if(group_limit) {
groups_processed.insert(
index->search_params->groups_processed.begin(),
index->search_params->groups_processed.end()
);
} else {
total_found += index->search_params->all_result_ids_len;
}
}
// for grouping we have to aggregate group set sizes to a count value
if(group_limit) {
for(auto& acc_facet: facets) {
for(auto& facet_kv: acc_facet.result_map) {
facet_kv.second.count = facet_kv.second.groups.size();
}
}
total_found = groups_processed.size();
}
if(!index_search_op.ok()) {
@ -753,7 +766,7 @@ Option<nlohmann::json> Collection::search(const std::string & query, const std::
Topster* aggr_topster = nullptr;
if(group_limit > 0) {
if(group_limit) {
// group by query requires another round of topster-ing
// needs to be at least 1 since scoring is mandatory

View File

@ -744,7 +744,7 @@ void Index::do_facets(std::vector<facet> & facets, facet_query_t & facet_query,
uint64_t fhash = facet_token_hash(facet_field, fvalue_str);
if(a_facet.result_map.count(fhash) == 0) {
a_facet.result_map[fhash] = facet_count_t{0, spp::sparse_hash_map<uint64_t, uint32_t>(),
a_facet.result_map[fhash] = facet_count_t{0, spp::sparse_hash_set<uint64_t>(),
doc_seq_id, 0,
spp::sparse_hash_map<uint32_t, token_pos_cost_t>()};
}
@ -754,10 +754,7 @@ void Index::do_facets(std::vector<facet> & facets, facet_query_t & facet_query,
if(search_params->group_limit) {
uint64_t distinct_id = get_distinct_id(facet_to_index, doc_seq_id);
if(a_facet.result_map[fhash].groups.count(distinct_id) == 0) {
a_facet.result_map[fhash].groups.emplace(distinct_id, 0);
}
a_facet.result_map[fhash].groups[distinct_id] += 1;
a_facet.result_map[fhash].groups.emplace(distinct_id);
} else {
a_facet.result_map[fhash].count += 1;
}
@ -784,6 +781,7 @@ void Index::search_candidates(const uint8_t & field_id, uint32_t* filter_ids, si
const std::vector<sort_by> & sort_fields,
std::vector<token_candidates> & token_candidates_vec,
std::vector<std::vector<art_leaf*>> & searched_queries, Topster* topster,
spp::sparse_hash_set<uint64_t>& groups_processed,
uint32_t** all_result_ids, size_t & all_result_ids_len,
const size_t typo_tokens_threshold) {
const long long combination_limit = 10;
@ -850,7 +848,7 @@ void Index::search_candidates(const uint8_t & field_id, uint32_t* filter_ids, si
// go through each matching document id and calculate match score
score_results(sort_fields, (uint16_t) searched_queries.size(), field_id, total_cost, topster, query_suggestion,
filtered_result_ids, filtered_results_size);
groups_processed, filtered_result_ids, filtered_results_size);
delete[] filtered_result_ids;
delete[] result_ids;
@ -862,7 +860,7 @@ void Index::search_candidates(const uint8_t & field_id, uint32_t* filter_ids, si
*all_result_ids = new_all_result_ids;
score_results(sort_fields, (uint16_t) searched_queries.size(), field_id, total_cost, topster, query_suggestion,
result_ids, result_size);
groups_processed, result_ids, result_size);
delete[] result_ids;
}
@ -1024,7 +1022,8 @@ void Index::run_search() {
search_params->topster, search_params->curated_topster,
search_params->per_page, search_params->page, search_params->token_order,
search_params->prefix, search_params->drop_tokens_threshold,
search_params->all_result_ids_len, search_params->searched_queries,
search_params->all_result_ids_len, search_params->groups_processed,
search_params->searched_queries,
search_params->raw_result_kvs, search_params->override_result_kvs,
search_params->typo_tokens_threshold);
@ -1038,12 +1037,12 @@ void Index::run_search() {
}
}
void Index::collate_curated_ids(const std::string & query, const std::string & field, const uint8_t field_id,
const std::vector<uint32_t> & included_ids,
Topster* curated_topster,
std::vector<std::vector<art_leaf*>> & searched_queries) {
void Index::collate_included_ids(const std::string & query, const std::string & field, const uint8_t field_id,
const std::vector<uint32_t> & included_ids,
Topster* curated_topster,
std::vector<std::vector<art_leaf*>> & searched_queries) {
if(included_ids.size() == 0) {
if(included_ids.empty()) {
return;
}
@ -1106,9 +1105,9 @@ void Index::collate_curated_ids(const std::string & query, const std::string & f
KV kv(field_id, searched_queries.size(), seq_id, seq_id, match_score, scores);
curated_topster->add(&kv);
searched_queries.push_back(override_query);
}
searched_queries.push_back(override_query);
}
void Index::search(Option<uint32_t> & outcome,
@ -1124,7 +1123,8 @@ void Index::search(Option<uint32_t> & outcome,
const size_t per_page, const size_t page, const token_ordering token_order,
const bool prefix, const size_t drop_tokens_threshold,
size_t & all_result_ids_len,
std::vector<std::vector<art_leaf*>> & searched_queries,
spp::sparse_hash_set<uint64_t>& groups_processed,
std::vector<std::vector<art_leaf*>>& searched_queries,
std::vector<std::vector<KV*>> & raw_result_kvs,
std::vector<KV*> & override_result_kvs,
const size_t typo_tokens_threshold) {
@ -1140,7 +1140,7 @@ void Index::search(Option<uint32_t> & outcome,
uint32_t filter_ids_length = op_filter_ids_length.get();
// we will be removing all curated IDs from organic results before running topster
// we will be removing all curated IDs from organic result ids before running topster
std::set<uint32_t> curated_ids(included_ids.begin(), included_ids.end());
curated_ids.insert(excluded_ids.begin(), excluded_ids.end());
@ -1165,8 +1165,8 @@ void Index::search(Option<uint32_t> & outcome,
}
score_results(sort_fields_std, (uint16_t) searched_queries.size(), field_id, 0, topster, {},
filter_ids, filter_ids_length);
collate_curated_ids(query, field, field_id, included_ids, curated_topster, searched_queries);
groups_processed, filter_ids, filter_ids_length);
collate_included_ids(query, field, field_id, included_ids, curated_topster, searched_queries);
all_result_ids_len = filter_ids_length;
all_result_ids = filter_ids;
@ -1180,9 +1180,9 @@ void Index::search(Option<uint32_t> & outcome,
const std::string & field = search_fields[i];
search_field(field_id, query, field, filter_ids, filter_ids_length, curated_ids_sorted, facets, sort_fields_std,
num_typos, searched_queries, topster, &all_result_ids, all_result_ids_len,
num_typos, searched_queries, topster, groups_processed, &all_result_ids, all_result_ids_len,
token_order, prefix, drop_tokens_threshold, typo_tokens_threshold);
collate_curated_ids(query, field, field_id, included_ids, curated_topster, searched_queries);
collate_included_ids(query, field, field_id, included_ids, curated_topster, searched_queries);
}
}
}
@ -1214,7 +1214,7 @@ void Index::search(Option<uint32_t> & outcome,
override_result_kvs.push_back(kv);
}
// for the ids that are dropped, remove their corresponding facet components from facet results
// add curated IDs to result count
all_result_ids_len += curated_topster->size;
delete [] filter_ids;
@ -1240,7 +1240,8 @@ void Index::search_field(const uint8_t & field_id, const std::string & query, co
const std::vector<uint32_t>& curated_ids,
std::vector<facet> & facets, const std::vector<sort_by> & sort_fields, const int num_typos,
std::vector<std::vector<art_leaf*>> & searched_queries,
Topster* topster, uint32_t** all_result_ids, size_t & all_result_ids_len,
Topster* topster, spp::sparse_hash_set<uint64_t>& groups_processed,
uint32_t** all_result_ids, size_t & all_result_ids_len,
const token_ordering token_order, const bool prefix,
const size_t drop_tokens_threshold, const size_t typo_tokens_threshold) {
std::vector<std::string> tokens;
@ -1354,7 +1355,7 @@ void Index::search_field(const uint8_t & field_id, const std::string & query, co
if(!token_candidates_vec.empty() && token_candidates_vec.size() == tokens.size()) {
// If all tokens were found, go ahead and search for candidates with what we have so far
search_candidates(field_id, filter_ids, filter_ids_length, curated_ids, sort_fields, token_candidates_vec,
searched_queries, topster, all_result_ids, all_result_ids_len,
searched_queries, topster, groups_processed, all_result_ids, all_result_ids_len,
typo_tokens_threshold);
}
@ -1389,7 +1390,7 @@ void Index::search_field(const uint8_t & field_id, const std::string & query, co
return search_field(field_id, truncated_query, field, filter_ids, filter_ids_length, curated_ids,
facets, sort_fields, num_typos,
searched_queries, topster, all_result_ids, all_result_ids_len,
searched_queries, topster, groups_processed, all_result_ids, all_result_ids_len,
token_order, prefix);
}
}
@ -1417,6 +1418,7 @@ void Index::log_leaves(const int cost, const std::string &token, const std::vect
void Index::score_results(const std::vector<sort_by> & sort_fields, const uint16_t & query_index,
const uint8_t & field_id, const uint32_t total_cost, Topster* topster,
const std::vector<art_leaf *> &query_suggestion,
spp::sparse_hash_set<uint64_t>& groups_processed,
const uint32_t *result_ids, const size_t result_size) const {
spp::sparse_hash_map<const art_leaf*, uint32_t*> leaf_to_indices;
@ -1536,6 +1538,7 @@ void Index::score_results(const std::vector<sort_by> & sort_fields, const uint16
if(search_params->group_limit != 0) {
distinct_id = get_distinct_id(facet_to_id, seq_id);
groups_processed.emplace(distinct_id);
}
KV kv(field_id, query_index, seq_id, distinct_id, match_score, scores);