Short circuit to speed up single token searches.

- Refactor token position population into its own method
- Store only the query index in the topster instead of the full offsets
- Compute the offsets at the end, only for the results that are actually returned (sketched below)
Kishore Nallan 2017-08-08 17:08:19 -04:00
parent ce69dbb371
commit 6a6785ef74
6 changed files with 135 additions and 80 deletions
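
The heart of the change is in `score_results`: when a query suggestion consists of a single token, there is nothing to align positionally, so the expensive offset walk can be skipped and a dummy score substituted. A minimal sketch of that idea, using simplified stand-in types rather than the real `MatchScore`/`art_leaf` structures:

```cpp
#include <cstdint>
#include <vector>

// Simplified stand-in for the MatchScore struct in match_score.h.
struct MatchScore {
    uint16_t words_present = 0;
    uint16_t distance = 0;
    uint16_t start_offset = 0;
    char offset_diffs[16] = {0};
};

// With one query token, the token is trivially "present" and the
// inter-token distance is zero, so the positional match is skipped.
MatchScore score_suggestion(const std::vector<std::vector<uint16_t>>& token_positions) {
    MatchScore mscore;
    if(token_positions.size() == 1) {
        // short circuit: full presence, zero distance, dummy offsets
        mscore.words_present = 1;
        return mscore;
    }
    // ... otherwise fall through to the full positional match
    // (MatchScore::match_score in the real code)
    return mscore;
}
```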

View File

@@ -48,6 +48,7 @@
 - ~~Fetch an individual document~~
 - ~~ID field should be a string: must validate~~
 - ~~Number of records in collection~~
+- Add docs/explanation around ranking calc
 - Use rocksdb batch put for atomic insertion
 - When prefix=true, use token_ranking_field for token ordering only for last word
 - Query token ids should match query token ordering

View File

@@ -79,14 +79,21 @@ private:
     void do_facets(std::vector<facet> & facets, uint32_t* result_ids, size_t results_size);

+    void populate_token_positions(const std::vector<art_leaf *> &query_suggestion,
+                                  spp::sparse_hash_map<const art_leaf *, uint32_t *> &leaf_to_indices,
+                                  size_t result_index, std::vector<std::vector<uint16_t>> &token_positions) const;
+
     void search_field(std::string & query, const std::string & field, uint32_t *filter_ids, size_t filter_ids_length,
-                      std::vector<facet> & facets, const std::vector<sort_field> & sort_fields, const int num_typos,
-                      const size_t num_results, Topster<100> &topster, uint32_t** all_result_ids,
+                      std::vector<facet> & facets, const std::vector<sort_field> & sort_fields,
+                      const int num_typos, const size_t num_results,
+                      std::vector<std::vector<art_leaf*>> & searched_queries, int & searched_queries_index,
+                      Topster<100> & topster, uint32_t** all_result_ids,
                       size_t & all_result_ids_len, const token_ordering token_order = FREQUENCY, const bool prefix = false);

     void search_candidates(uint32_t* filter_ids, size_t filter_ids_length, std::vector<facet> & facets,
-                           const std::vector<sort_field> & sort_fields, int & token_rank,
-                           std::vector<std::vector<art_leaf*>> & token_leaves, Topster<100> & topster,
+                           const std::vector<sort_field> & sort_fields, int & candidate_rank,
+                           std::vector<std::vector<art_leaf*>> & token_to_candidates,
+                           std::vector<std::vector<art_leaf*>> & searched_queries, Topster<100> & topster,
                            size_t & total_results, uint32_t** all_result_ids, size_t & all_result_ids_len,
                            const size_t & max_results);
@@ -152,8 +159,8 @@
     Option<std::string> remove(const std::string & id);

-    void score_results(const std::vector<sort_field> & sort_fields, const int & token_rank, Topster<100> &topster,
-                       const std::vector<art_leaf *> & query_suggestion, const uint32_t *result_ids,
+    void score_results(const std::vector<sort_field> & sort_fields, const int & query_index, const int & candidate_rank,
+                       Topster<100> &topster, const std::vector<art_leaf *> & query_suggestion, const uint32_t *result_ids,
                        const size_t result_size) const;

     Option<uint32_t> index_in_memory(const nlohmann::json & document, uint32_t seq_id);

View File

@@ -30,6 +30,10 @@ struct MatchScore {
     uint16_t start_offset;
     char offset_diffs[16];

+    MatchScore() {
+
+    }
+
     MatchScore(uint16_t words_present, uint16_t distance, uint16_t start_offset, char *offset_diffs_stacked):
             words_present(words_present), distance(distance), start_offset(start_offset) {
         memcpy(offset_diffs, offset_diffs_stacked, 16);

View File

@@ -13,8 +13,7 @@
 template <size_t MAX_SIZE=100>
 struct Topster {
     struct KV {
-        uint16_t start_offset;
-        char offset_diffs[16];  // [len, offset1-start_offset, offset2-start_offset, ...]
+        uint16_t query_index;
         uint64_t key;
         uint64_t match_score;
         int64_t primary_attr;
@ -35,8 +34,8 @@ struct Topster {
b = c;
}
void add(const uint64_t &key, const uint64_t &match_score, const int64_t &primary_attr,
const int64_t &secondary_attr, const uint16_t &start_offset, char *offset_diffs_stacked){
void add(const uint64_t &key, const uint16_t &query_index, const uint64_t &match_score, const int64_t &primary_attr,
const int64_t &secondary_attr) {
if (size >= MAX_SIZE) {
if(!is_greater(data[0], match_score, primary_attr, secondary_attr)) {
// when incoming value is less than the smallest in the heap, ignore
@@ -52,11 +51,10 @@ struct Topster {
             dedup_keys.insert(key);

             data[0].key = key;
+            data[0].query_index = query_index;
             data[0].match_score = match_score;
             data[0].primary_attr = primary_attr;
             data[0].secondary_attr = secondary_attr;
-            data[0].start_offset = start_offset;
-            memcpy(data[0].offset_diffs, offset_diffs_stacked, 16);

             uint32_t i = 0;
             // sift to maintain heap property
@@ -83,11 +81,10 @@ struct Topster {
             dedup_keys.insert(key);

             data[size].key = key;
+            data[size].query_index = query_index;
             data[size].match_score = match_score;
             data[size].primary_attr = primary_attr;
             data[size].secondary_attr = secondary_attr;
-            data[size].start_offset = start_offset;
-            memcpy(data[size].offset_diffs, offset_diffs_stacked, 16);

             size++;

             for (uint32_t i = size - 1; i > 0;) {
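
For context on the KV change above: each heap entry now carries a two-byte `query_index` instead of roughly 18 bytes of offset data, while the bounded top-K behaviour of `add` stays the same. A rough, self-contained sketch of that behaviour (the real `Topster` uses a hand-rolled array heap with key de-duplication, omitted here):

```cpp
#include <cstdint>
#include <queue>
#include <vector>

// Hypothetical, simplified hit record mirroring the new KV layout:
// only a query_index is kept per hit, not the offsets themselves.
struct Hit {
    uint64_t key;
    uint16_t query_index;  // which searched query produced this hit
    uint64_t match_score;
};

// Orders the priority_queue as a min-heap on match_score.
struct CmpByScore {
    bool operator()(const Hit& a, const Hit& b) const {
        return a.match_score > b.match_score;
    }
};

using HitHeap = std::priority_queue<Hit, std::vector<Hit>, CmpByScore>;

// Keep the max_size best hits; evict the current minimum when full.
void add_hit(HitHeap& heap, const Hit& hit, size_t max_size) {
    if(heap.size() < max_size) {
        heap.push(hit);
    } else if(hit.match_score > heap.top().match_score) {
        heap.pop();
        heap.push(hit);
    }
}
```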

View File

@@ -349,17 +349,17 @@ void Collection::do_facets(std::vector<facet> & facets, uint32_t* result_ids, si
 void Collection::search_candidates(uint32_t* filter_ids, size_t filter_ids_length, std::vector<facet> & facets,
                                    const std::vector<sort_field> & sort_fields, int & candidate_rank,
-                                   std::vector<std::vector<art_leaf*>> & token_leaves, Topster<100> & topster,
+                                   std::vector<std::vector<art_leaf*>> & token_to_candidates,
+                                   std::vector<std::vector<art_leaf*>> & searched_queries, Topster<100> & topster,
                                    size_t & total_results, uint32_t** all_result_ids, size_t & all_result_ids_len,
                                    const size_t & max_results) {
     const size_t combination_limit = 10;
     auto product = []( long long a, std::vector<art_leaf*>& b ) { return a*b.size(); };
-    long long int N = std::accumulate(token_leaves.begin(), token_leaves.end(), 1LL, product);
+    long long int N = std::accumulate(token_to_candidates.begin(), token_to_candidates.end(), 1LL, product);

     for(long long n=0; n<N && n<combination_limit; ++n) {
         // every element in `query_suggestion` contains a token and its associated hits
-        std::vector<art_leaf *> query_suggestion = next_suggestion(token_leaves, n);
-        candidate_rank++;
+        std::vector<art_leaf *> query_suggestion = next_suggestion(token_to_candidates, n);

         /*for(auto i=0; i < query_suggestion.size(); i++) {
             std::cout << "i: " << i << " - " << query_suggestion[i]->key << std::endl;
@@ -371,6 +371,8 @@ void Collection::search_candidates(uint32_t* filter_ids, size_t filter_ids_lengt
         if(result_size == 0) continue;

+        candidate_rank += 1;
+
         // intersect the document ids for each token to find docs that contain all the tokens (stored in `result_ids`)
         for(auto i=1; i < query_suggestion.size(); i++) {
             uint32_t* out = nullptr;
@@ -394,7 +396,8 @@ void Collection::search_candidates(uint32_t* filter_ids, size_t filter_ids_lengt
             do_facets(facets, filtered_result_ids, filtered_results_size);

             // go through each matching document id and calculate match score
-            score_results(sort_fields, candidate_rank, topster, query_suggestion, filtered_result_ids, filtered_results_size);
+            score_results(sort_fields, searched_queries.size(), candidate_rank, topster, query_suggestion,
+                          filtered_result_ids, filtered_results_size);

             delete[] filtered_result_ids;
             delete[] result_ids;
@@ -407,11 +410,12 @@ void Collection::search_candidates(uint32_t* filter_ids, size_t filter_ids_lengt
             delete [] *all_result_ids;
             *all_result_ids = new_all_result_ids;

-            score_results(sort_fields, candidate_rank, topster, query_suggestion, result_ids, result_size);
+            score_results(sort_fields, searched_queries.size(), candidate_rank, topster, query_suggestion, result_ids, result_size);

             delete[] result_ids;
         }

         total_results += topster.size;
+        searched_queries.push_back(query_suggestion);

         if(total_results >= max_results) {
             break;
@@ -638,13 +642,17 @@ Option<nlohmann::json> Collection::search(std::string query, const std::vector<s
     uint32_t* all_result_ids = nullptr;
     size_t all_result_ids_len = 0;

+    // all search queries that were used for generating the results
+    std::vector<std::vector<art_leaf*>> searched_queries;
+    int searched_queries_index = 0;
+
     for(int i = 0; i < search_fields.size(); i++) {
         Topster<100> topster;
         const std::string & field = search_fields[i];

         // proceed to query search only when no filters are provided or when filtering produces results
         if(simple_filter_query.size() == 0 || filter_ids_length > 0) {
             search_field(query, field, filter_ids, filter_ids_length, facets, sort_fields_std, num_typos, num_results,
-                         topster, &all_result_ids, all_result_ids_len, token_order, prefix);
+                         searched_queries, searched_queries_index, topster, &all_result_ids, all_result_ids_len, token_order, prefix);
             topster.sort();
         }
@@ -694,10 +702,34 @@ Option<nlohmann::json> Collection::search(std::string query, const std::vector<s
             std::vector<std::string> tokens;
             StringUtils::split(document[field_name], tokens, " ");

+            std::vector<std::vector<uint16_t>> token_positions;
+
+            for (const art_leaf *token_leaf : searched_queries[field_order_kv.second.query_index]) {
+                std::vector<uint16_t> positions;
+                int doc_index = token_leaf->values->ids.indexOf(field_order_kv.second.key);
+                if(doc_index == token_leaf->values->ids.getLength()) {
+                    continue;
+                }
+
+                uint32_t start_offset = token_leaf->values->offset_index.at(doc_index);
+                uint32_t end_offset = (doc_index == token_leaf->values->ids.getLength() - 1) ?
+                                      token_leaf->values->offsets.getLength() :
+                                      token_leaf->values->offset_index.at(doc_index+1);
+
+                while(start_offset < end_offset) {
+                    positions.push_back((uint16_t) token_leaf->values->offsets.at(start_offset));
+                    start_offset++;
+                }
+
+                token_positions.push_back(positions);
+            }
+
+            MatchScore mscore = MatchScore::match_score(field_order_kv.second.key, token_positions);
+
             std::vector<size_t> token_indices;
-            char num_tokens_found = field_order_kv.second.offset_diffs[0];
+            char num_tokens_found = mscore.offset_diffs[0];
             for(size_t i = 1; i <= num_tokens_found; i++) {
-                size_t token_index = (size_t)(field_order_kv.second.start_offset + field_order_kv.second.offset_diffs[i]);
+                size_t token_index = (size_t)(mscore.start_offset + mscore.offset_diffs[i]);
                 token_indices.push_back(token_index);
             }
@@ -780,8 +812,9 @@ Option<nlohmann::json> Collection::search(std::string query, const std::vector<s
 */
 void Collection::search_field(std::string & query, const std::string & field, uint32_t *filter_ids, size_t filter_ids_length,
                               std::vector<facet> & facets, const std::vector<sort_field> & sort_fields, const int num_typos,
-                              const size_t num_results, Topster<100> &topster, uint32_t** all_result_ids,
-                              size_t & all_result_ids_len, const token_ordering token_order, const bool prefix) {
+                              const size_t num_results, std::vector<std::vector<art_leaf*>> & searched_queries,
+                              int & searched_queries_index, Topster<100> &topster, uint32_t** all_result_ids, size_t & all_result_ids_len,
+                              const token_ordering token_order, const bool prefix) {
     std::vector<std::string> tokens;
     StringUtils::split(query, tokens, " ");
@@ -808,7 +841,8 @@ void Collection::search_field(std::string & query, const std::string & field, ui
         std::transform(tokens[token_index].begin(), tokens[token_index].end(), tokens[token_index].begin(), ::tolower);
     }

-    std::vector<std::vector<art_leaf*>> token_leaves;
+    // stores candidates for each token, i.e. i-th index would have all possible tokens with a cost of "c"
+    std::vector<std::vector<art_leaf*>> token_to_candidates;

     const size_t combination_limit = 10;
     auto product = []( long long a, std::vector<int>& b ) { return a*b.size(); };
@@ -826,7 +860,7 @@ void Collection::search_field(std::string & query, const std::string & field, ui
             costs[i] = token_to_costs[i][q.rem];
         }

-        token_leaves.clear();
+        token_to_candidates.clear();
         int token_index = 0;

         while(token_index < tokens.size()) {
@@ -855,7 +889,7 @@ void Collection::search_field(std::string & query, const std::string & field, ui
             if(!leaves.empty()) {
                 //!log_leaves(costs[token_index], token, leaves);
-                token_leaves.push_back(leaves);
+                token_to_candidates.push_back(leaves);
                 token_to_count[token] = std::max(token_to_count[token], leaves.at(0)->values->ids.getLength());
             } else {
                 // No result at `cost = costs[token_index]`. Remove costs until `cost` for token and re-do combinations
@@ -880,10 +914,10 @@ void Collection::search_field(std::string & query, const std::string & field, ui
             token_index++;
         }

-        if(token_leaves.size() != 0 && token_leaves.size() == tokens.size()) {
+        if(token_to_candidates.size() != 0 && token_to_candidates.size() == tokens.size()) {
             // If all tokens were found, go ahead and search for candidates with what we have so far
-            search_candidates(filter_ids, filter_ids_length, facets, sort_fields, candidate_rank, token_leaves, topster,
-                              total_results, all_result_ids, all_result_ids_len, max_results);
+            search_candidates(filter_ids, filter_ids_length, facets, sort_fields, candidate_rank, token_to_candidates,
+                              searched_queries, topster, total_results, all_result_ids, all_result_ids_len, max_results);

             if (total_results >= max_results) {
                 // If we don't find enough results, we continue outerloop (looking at tokens with greater cost)
@@ -917,7 +951,8 @@ void Collection::search_field(std::string & query, const std::string & field, ui
         }

         return search_field(truncated_query, field, filter_ids, filter_ids_length, facets, sort_fields, num_typos,
-                            num_results, topster, all_result_ids, all_result_ids_len, token_order, prefix);
+                            num_results, searched_queries, candidate_rank, topster, all_result_ids, all_result_ids_len,
+                            token_order, prefix);
     }
 }
@@ -932,12 +967,12 @@ void Collection::log_leaves(const int cost, const std::string &token, const std:
     }
 }

-void Collection::score_results(const std::vector<sort_field> & sort_fields, const int & candidate_rank,
+void Collection::score_results(const std::vector<sort_field> & sort_fields, const int & query_index, const int & candidate_rank,
                                Topster<100> & topster, const std::vector<art_leaf *> &query_suggestion,
                                const uint32_t *result_ids, const size_t result_size) const {
     const int max_candidate_rank = 250;

-    spp::sparse_hash_map<art_leaf*, uint32_t*> leaf_to_indices;
+    spp::sparse_hash_map<const art_leaf*, uint32_t*> leaf_to_indices;
     for (art_leaf *token_leaf : query_suggestion) {
         uint32_t *indices = new uint32_t[result_size];
@@ -969,12 +1004,52 @@ void Collection::score_results(const std::vector<sort_field> & sort_fields, cons
     for(auto i=0; i<result_size; i++) {
         uint32_t seq_id = result_ids[i];
-        std::vector<std::vector<uint16_t>> token_positions;

-        // for each token in the query, find the positions that it appears in this document
-        for (art_leaf *token_leaf : query_suggestion) {
+        MatchScore mscore;
+
+        if(query_suggestion.size() == 1) {
+            // short circuit to speed up single token searches (use dummy offsets for now)
+            char offset_diffs[16];
+            std::fill_n(offset_diffs, 16, 0);
+            mscore = MatchScore(1, 0, 0, offset_diffs);
+        } else {
+            std::vector<std::vector<uint16_t>> token_positions;
+            populate_token_positions(query_suggestion, leaf_to_indices, i, token_positions);
+            mscore = MatchScore::match_score(seq_id, token_positions);
+        }
+
+        int candidate_rank_score = max_candidate_rank - candidate_rank;
+
+        // Construct a single match_score from individual components (for multi-field sort)
+        const uint64_t match_score = ((uint64_t)(mscore.words_present) << 16) +
+                                     (candidate_rank_score << 8) +
+                                     (MAX_SEARCH_TOKENS - mscore.distance);
+
+        int64_t primary_rank_score = (primary_rank_scores && primary_rank_scores->count(seq_id) > 0) ?
+                                     primary_rank_scores->at(seq_id) : 0;
+        int64_t secondary_rank_score = (secondary_rank_scores && secondary_rank_scores->count(seq_id) > 0) ?
+                                       secondary_rank_scores->at(seq_id) : 0;
+
+        topster.add(seq_id, query_index, match_score, primary_rank_factor * primary_rank_score,
+                    secondary_rank_factor * secondary_rank_score);
+
+        /*std::cout << "candidate_rank_score: " << candidate_rank_score << ", words_present: " << mscore.words_present
+                  << ", match_score: " << match_score << ", primary_rank_score: " << primary_rank_score
+                  << ", seq_id: " << seq_id << std::endl;*/
+    }
+
+    for (auto it = leaf_to_indices.begin(); it != leaf_to_indices.end(); it++) {
+        delete [] it->second;
+        it->second = nullptr;
+    }
+}
+
+void Collection::populate_token_positions(const std::vector<art_leaf *> &query_suggestion,
+                                          spp::sparse_hash_map<const art_leaf *, uint32_t *> &leaf_to_indices,
+                                          size_t result_index, std::vector<std::vector<uint16_t>> &token_positions) const {
+    // for each token in the query, find the positions that it appears in this document
+    for (const art_leaf *token_leaf : query_suggestion) {
         std::vector<uint16_t> positions;
-        int doc_index = leaf_to_indices.at(token_leaf)[i];
+        int doc_index = leaf_to_indices.at(token_leaf)[result_index];
         if(doc_index == token_leaf->values->ids.getLength()) {
             continue;
         }
@@ -991,34 +1066,6 @@ void Collection::score_results(const std::vector<sort_field> & sort_fields, cons
         token_positions.push_back(positions);
     }
-
-        MatchScore mscore = MatchScore::match_score(seq_id, token_positions);
-
-        int candidate_rank_score = max_candidate_rank - candidate_rank;
-
-        // Construct a single match_score from individual components (for multi-field sort)
-        const uint64_t match_score = ((uint64_t)(mscore.words_present) << 16) +
-                                     (candidate_rank_score << 8) +
-                                     (MAX_SEARCH_TOKENS - mscore.distance);
-
-        int64_t primary_rank_score = (primary_rank_scores && primary_rank_scores->count(seq_id) > 0) ?
-                                     primary_rank_scores->at(seq_id) : 0;
-        int64_t secondary_rank_score = (secondary_rank_scores && secondary_rank_scores->count(seq_id) > 0) ?
-                                       secondary_rank_scores->at(seq_id) : 0;
-
-        topster.add(seq_id, match_score,
-                    primary_rank_factor * primary_rank_score, secondary_rank_factor * secondary_rank_score,
-                    mscore.start_offset, mscore.offset_diffs);
-
-        /*std::cout << "candidate_rank_score: " << candidate_rank_score << ", words_present: " << mscore.words_present
-                  << ", match_score: " << match_score << ", primary_rank_score: " << primary_rank_score
-                  << ", seq_id: " << seq_id << std::endl;*/
-    }
-
-    for (auto it = leaf_to_indices.begin(); it != leaf_to_indices.end(); it++) {
-        delete [] it->second;
-        it->second = nullptr;
-    }
 }

 inline std::vector<art_leaf *> Collection::next_suggestion(const std::vector<std::vector<art_leaf *>> &token_leaves,
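
The deferred offset computation (both in `Collection::search` above and in `populate_token_positions`) slices one document's token positions out of two parallel arrays: `offset_index` maps a document's slot to where its positions begin in `offsets`, and the next slot (or the array end, for the last document) bounds the slice. A simplified sketch, with plain vectors standing in for the compressed arrays of the real `art_leaf`:

```cpp
#include <cstdint>
#include <vector>

// Plain-vector stand-ins for the ids/offset_index/offsets arrays
// hanging off an art_leaf in the real code.
struct LeafValues {
    std::vector<uint32_t> ids;          // sorted document ids
    std::vector<uint32_t> offset_index; // doc slot -> start within `offsets`
    std::vector<uint16_t> offsets;      // concatenated token positions
};

// Recover the positions of this token within the doc at `doc_index`.
std::vector<uint16_t> token_positions_for(const LeafValues& v, size_t doc_index) {
    std::vector<uint16_t> positions;
    size_t start = v.offset_index[doc_index];
    size_t end = (doc_index == v.ids.size() - 1) ? v.offsets.size()
                                                 : v.offset_index[doc_index + 1];
    for(size_t i = start; i < end; i++) {
        positions.push_back(v.offsets[i]);
    }
    return positions;
}
```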

View File

@@ -6,28 +6,27 @@ TEST(TopsterTest, StoreMaxValuesWithoutRepetition) {
     Topster<5> topster;

     struct {
-        uint16_t start_offset;
-        char offset_diffs[16];
+        uint16_t query_index;
         uint64_t key;
         uint64_t match_score;
         int64_t primary_attr;
         int64_t secondary_attr;
     } data[10] = {
-        {10, { 10 }, 1, 10, 20, 30},
-        {0, { 10 }, 2, 4, 20, 30},
-        {2, { 10 }, 3, 7, 20, 30},
-        {11, { 20 }, 4, 11, 20, 30},
-        {78, { 30 }, 5, 9, 20, 30},
-        {246, { 10 }, 6, 6, 20, 30},
-        {0, { 10, 11 }, 7, 6, 22, 30},
-        {20, { 10 }, 8, 9, 20, 30},
-        {22, { 15, 17, 18 }, 9, 8, 20, 30},
-        {77, { 10 }, 10, 5, 20, 30},
+        {0, 1, 10, 20, 30},
+        {0, 2, 4, 20, 30},
+        {2, 3, 7, 20, 30},
+        {0, 4, 11, 20, 30},
+        {1, 5, 9, 20, 30},
+        {0, 6, 6, 20, 30},
+        {2, 7, 6, 22, 30},
+        {1, 8, 9, 20, 30},
+        {0, 9, 8, 20, 30},
+        {3, 10, 5, 20, 30},
     };

     for(int i = 0; i < 10; i++) {
-        topster.add(data[i].key, data[i].match_score, data[i].primary_attr, data[i].secondary_attr,
-                    data[i].start_offset, data[i].offset_diffs);
+        topster.add(data[i].key, data[i].query_index, data[i].match_score, data[i].primary_attr,
+                    data[i].secondary_attr);
     }

     topster.sort();