Refactor collection's search method to be more judicious in using higher costs.

Earlier, even if a single token produced no results, ALL tokens were searched again with a higher cost. This change ensures that we first retry, with a larger cost, only the token that produced no results, before doing the same for the other tokens.
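A minimal sketch of the new behaviour (not the actual implementation; `fuzzy_lookup` below is a hypothetical stand-in for `art_fuzzy_search`, and the cost handling is simplified): each token escalates its own cost independently instead of the whole query being re-searched at a higher cost.

```cpp
#include <iostream>
#include <string>
#include <vector>

// Hypothetical stand-in for art_fuzzy_search: returns true if `token`
// has matches in the index within `cost` typos.
static bool fuzzy_lookup(const std::string& token, int cost) {
    (void) cost;
    return token != "DoesNotExist";  // pretend this token is unindexed
}

int main() {
    std::vector<std::string> tokens = {"from", "DoesNotExist", "rocket"};
    const int max_cost = 2;

    // Old behaviour: one global cost, so a single failing token forced EVERY
    // token to be searched again at the next higher cost.
    // New behaviour: only the failing token is retried with a larger cost.
    for (const std::string& token : tokens) {
        int cost = 0;
        while (cost <= max_cost && !fuzzy_lookup(token, cost)) {
            cost++;  // retry just this token with a larger cost
        }
        if (cost <= max_cost) {
            std::cout << token << " resolved at cost " << cost << "\n";
        } else {
            std::cout << token << " not found within max_cost; token is skipped\n";
        }
    }
    return 0;
}
```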
Kishore Nallan 2016-11-24 21:39:20 +05:30
parent 44d55cb13d
commit 396e10be5d
6 changed files with 197 additions and 95 deletions


@ -15,6 +15,7 @@
- ~~Speed up UUID generation~~
- Prefix-search strings should not be null terminated
- Make the search score computation customizable
- string_utils::tokenize should not have a max length
**API**


@ -26,8 +26,12 @@ private:
std::string get_seq_id_key(uint32_t seq_id);
std::string get_id_key(std::string id);
static inline std::vector<art_leaf *> _next_suggestion(const std::vector<std::vector<art_leaf *>> &token_leaves,
long long int n);
static inline std::vector<art_leaf *> next_suggestion(const std::vector<std::vector<art_leaf *>> &token_leaves,
long long int n);
void log_leaves(const int max_cost, const std::string &token, const std::vector<art_leaf *> &leaves) const;
void search_candidates(std::vector<std::vector<art_leaf*>> & token_leaves, Topster<100> & topster,
size_t & total_results, const size_t & max_results);
public:
Collection() = delete;
@ -39,5 +43,8 @@ public:
void score_results(Topster<100> &topster, const std::vector<art_leaf *> &query_suggestion,
const uint32_t *result_ids,
size_t result_size) const;
enum {MAX_SEARCH_TOKENS = 20};
enum {MAX_RESULTS = 100};
};


@ -60,9 +60,8 @@ struct MatchScore {
* compute the max_match and min_displacement of target tokens across the windows.
*/
static MatchScore match_score(uint32_t doc_id, std::vector<std::vector<uint16_t>> &token_offsets) {
const size_t WINDOW_SIZE = 20;
const size_t MAX_TOKENS_IN_A_QUERY = 20;
const uint16_t MAX_DISPLACEMENT = 20;
const size_t WINDOW_SIZE = Collection::MAX_SEARCH_TOKENS;
const uint16_t MAX_DISPLACEMENT = Collection::MAX_SEARCH_TOKENS;
std::priority_queue<TokenOffset, std::vector<TokenOffset>, TokenOffset> heap;
@ -76,8 +75,8 @@ struct MatchScore {
uint16_t min_displacement = MAX_DISPLACEMENT;
std::queue<TokenOffset> window;
uint16_t token_offset[MAX_TOKENS_IN_A_QUERY] = { };
std::fill_n(token_offset, MAX_TOKENS_IN_A_QUERY, MAX_DISPLACEMENT);
uint16_t token_offset[Collection::MAX_SEARCH_TOKENS] = { };
std::fill_n(token_offset, Collection::MAX_SEARCH_TOKENS, MAX_DISPLACEMENT);
do {
if(window.empty()) {


@ -80,12 +80,49 @@ std::string Collection::add(std::string json_str) {
return document["id"];
}
void Collection::search_candidates(std::vector<std::vector<art_leaf*>> & token_leaves, Topster<100> & topster,
size_t & total_results, const size_t & max_results) {
const size_t combination_limit = 10;
auto product = []( long long a, std::vector<art_leaf*>& b ) { return a*b.size(); };
long long int N = std::accumulate(token_leaves.begin(), token_leaves.end(), 1LL, product);
for(long long n=0; n<N && n<combination_limit; ++n) {
// every element in `query_suggestion` represents a token and its associated hits
std::vector<art_leaf *> query_suggestion = next_suggestion(token_leaves, n);
// initialize results with the starting element (for further intersection)
uint32_t* result_ids = query_suggestion[0]->values->ids.uncompress();
size_t result_size = query_suggestion[0]->values->ids.getLength();
if(result_size == 0) continue;
// intersect the document ids for each token to find docs that contain all the tokens (stored in `result_ids`)
for(auto i=1; i < query_suggestion.size(); i++) {
uint32_t* out = new uint32_t[result_size];
uint32_t* curr = query_suggestion[i]->values->ids.uncompress();
result_size = Intersection::scalar(result_ids, result_size, curr, query_suggestion[i]->values->ids.getLength(), out);
delete[] result_ids;
delete[] curr;
result_ids = out;
}
// go through each matching document id and calculate match score
score_results(topster, query_suggestion, result_ids, result_size);
total_results += result_size;
delete[] result_ids;
if(total_results >= max_results) break;
}
}
/*
1. Split the query into tokens
2. For each token, look up ids using exact lookup
a. If a token has no result, try again with edit distance of 1, and then 2
3. Do a limited cartesian product of the word suggestions for each token to form possible corrected search phrases
(adapted from: http://stackoverflow.com/a/31169617/131050)
2. Outer loop will generate bounded cartesian product with costs for each token
3. Inner loop will iterate on each token with associated cost
4. Cartesian product of the results of the token searches will be used to form search phrases
(cartesian product adapted from: http://stackoverflow.com/a/31169617/131050)
4. Intersect the lists to find docs that match each phrase
5. Sort the docs based on some ranking criteria
*/
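To make step 2 above concrete: the bounded cartesian product maps a single counter `n` onto one cost per token, like mixed-radix digits, and caps the number of combinations tried. A self-contained sketch of that decomposition (the three-token cost lists are hypothetical; the real loop in `Collection::search` uses `ldiv` over `token_to_costs`):

```cpp
#include <iostream>
#include <numeric>
#include <vector>

int main() {
    // One candidate-cost list per token; here a hypothetical 3-token query
    // where each token may be searched with 0, 1 or 2 typos.
    std::vector<std::vector<int>> token_to_costs = {{0, 1, 2}, {0, 1, 2}, {0, 1, 2}};

    const long long combination_limit = 10;
    auto product = [](long long a, const std::vector<int>& b) { return a * (long long) b.size(); };
    long long N = std::accumulate(token_to_costs.begin(), token_to_costs.end(), 1LL, product);

    for (long long n = 0; n < N && n < combination_limit; n++) {
        // Decompose the single counter n into one cost per token
        // (mixed-radix digits), mirroring the ldiv-based loop in Collection::search.
        std::vector<int> costs(token_to_costs.size());
        long long rem = n;
        for (long long i = (long long) token_to_costs.size() - 1; i >= 0; i--) {
            long long base = (long long) token_to_costs[i].size();
            costs[i] = token_to_costs[i][rem % base];
            rem /= base;
        }
        // Prints [0, 0, 0], [0, 0, 1], [0, 0, 2], [0, 1, 0], ...
        std::cout << "[" << costs[0] << ", " << costs[1] << ", " << costs[2] << "]\n";
    }
    return 0;
}
```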
@ -94,109 +131,127 @@ std::vector<nlohmann::json> Collection::search(std::string query, const int num_
StringUtils::tokenize(query, tokens, " ", true);
const int max_cost = (num_typos < 0 || num_typos > 2) ? 2 : num_typos;
const size_t max_results = std::min(num_results, (size_t) 100);
const size_t max_results = std::min(num_results, (size_t) Collection::MAX_RESULTS);
int cost = 0;
size_t total_results = 0;
std::vector<nlohmann::json> results;
Topster<100> topster;
auto begin = std::chrono::high_resolution_clock::now();
while(cost <= max_cost) {
std::cout << "Searching with cost=" << cost << std::endl;
std::vector<std::vector<int>> token_to_costs;
std::vector<int> all_costs;
for(int cost = 0; cost <= max_cost; cost++) {
all_costs.push_back(cost);
}
std::vector<std::vector<art_leaf*>> token_leaves;
for(std::string token: tokens) {
std::transform(token.begin(), token.end(), token.begin(), ::tolower);
for(size_t token_index = 0; token_index < tokens.size(); token_index++) {
token_to_costs.push_back(all_costs);
std::transform(tokens[token_index].begin(), tokens[token_index].end(), tokens[token_index].begin(), ::tolower);
}
std::vector<std::vector<art_leaf*>> token_leaves;
const size_t combination_limit = 10;
auto product = []( long long a, std::vector<int>& b ) { return a*b.size(); };
long long n = 0;
long long int N = std::accumulate(token_to_costs.begin(), token_to_costs.end(), 1LL, product);
while(n < N && n < combination_limit) {
// Outer loop generates combinations of [cost to max_cost] for each token
// e.g. for a 3-token query: [0, 0, 0], [0, 0, 1], [0, 1, 1] etc.
std::vector<uint32_t> costs(token_to_costs.size());
ldiv_t q { n, 0 };
for(long long i = (token_to_costs.size() - 1); 0 <= i ; --i ) {
q = ldiv(q.quot, token_to_costs[i].size());
costs[i] = token_to_costs[i][q.rem];
}
token_leaves.clear();
size_t token_index = 0;
bool retry_with_larger_cost = false;
while(token_index < tokens.size()) {
// For each token, look up the generated cost for this iteration and search using that cost
std::string token = tokens[token_index];
std::vector<art_leaf*> leaves;
art_fuzzy_search(&t, (const unsigned char *) token.c_str(), (int) token.length() + 1, cost, 3, leaves);
art_fuzzy_search(&t, (const unsigned char *) token.c_str(), (int) token.length() + 1, costs[token_index], 3, leaves);
if(!leaves.empty()) {
for(auto i=0; i<leaves.size(); i++) {
printf("%s - ", token.c_str());
printf("%.*s", leaves[i]->key_len, leaves[i]->key);
printf(" - max_cost: %d, - num_ids: %d\n", max_cost, leaves[i]->values->ids.getLength());
/*for(auto j=0; j<leaves[i]->values->ids.getLength(); j++) {
printf("id: %d\n", leaves[i]->values->ids.at(j));
}*/
}
//log_leaves(max_cost, token, leaves);
token_leaves.push_back(leaves);
} else {
// no result when `cost = costs[token_index]` => remove cost for token and re-do combinations
auto it = std::find(token_to_costs[token_index].begin(), token_to_costs[token_index].end(), costs[token_index]);
if(it != token_to_costs[token_index].end()) {
token_to_costs[token_index].erase(it);
// no more costs left for this token, clean up
if(token_to_costs[token_index].empty()) {
token_to_costs.erase(token_to_costs.begin()+token_index);
tokens.erase(tokens.begin()+token_index);
token_index--;
}
}
n = -1;
N = std::accumulate(token_to_costs.begin(), token_to_costs.end(), 1LL, product);
if(costs[token_index] != max_cost) {
// Unless we're already at max_cost for this token, don't look at remaining tokens since we would
// see them again in a future iteration when we retry with a larger cost
retry_with_larger_cost = true;
break;
}
}
token_index++;
}
if(token_leaves.size() != 0 && !retry_with_larger_cost) {
// If a) all tokens were found, or b) Some were skipped because they don't exist within max_cost,
// go ahead and search for candidates with what we have so far
search_candidates(token_leaves, topster, total_results, max_results);
topster.sort();
for (uint32_t i = 0; i < topster.size; i++) {
uint64_t seq_id = topster.getKeyAt(i);
std::string value;
store->get(get_seq_id_key((uint32_t) seq_id), value);
nlohmann::json document = nlohmann::json::parse(value);
results.push_back(document);
}
if (total_results > 0) {
// Unless there are results, we continue the outer loop (looking at tokens with a greater cost)
break;
}
}
if(token_leaves.size() != tokens.size() && cost != max_cost) {
// There could have been a typo in one of the tokens, so let's try again with greater cost
// Or this could be a token that does not exist at all (rare)
//std::cout << "token_leaves.size() != tokens.size(), continuing..." << std::endl << std::endl;
cost++;
continue;
}
const size_t combination_limit = 10;
auto product = []( long long a, std::vector<art_leaf*>& b ) { return a*b.size(); };
long long int N = std::accumulate(token_leaves.begin(), token_leaves.end(), 1LL, product );
for(long long n=0; n<N && n<combination_limit; ++n) {
// every element in `query_suggestion` represents a token and its associated hits
std::vector<art_leaf *> query_suggestion = _next_suggestion(token_leaves, n);
// initialize results with the starting element (for further intersection)
uint32_t* result_ids = query_suggestion[0]->values->ids.uncompress();
size_t result_size = query_suggestion[0]->values->ids.getLength();
if(result_size == 0) continue;
// intersect the document ids for each token to find docs that contain all the tokens (stored in `result_ids`)
for(auto i=1; i < query_suggestion.size(); i++) {
uint32_t* out = new uint32_t[result_size];
uint32_t* curr = query_suggestion[i]->values->ids.uncompress();
result_size = Intersection::scalar(result_ids, result_size, curr, query_suggestion[i]->values->ids.getLength(), out);
delete[] result_ids;
delete[] curr;
result_ids = out;
}
// go through each matching document id and calculate match score
score_results(topster, query_suggestion, result_ids, result_size);
total_results += result_size;
delete[] result_ids;
if(total_results >= max_results) break;
}
topster.sort();
for(uint32_t i=0; i<topster.size; i++) {
uint64_t seq_id = topster.getKeyAt(i);
//std::cout << "ID: " << seq_id << std::endl;
std::string value;
store->get(get_seq_id_key((uint32_t) seq_id), value);
nlohmann::json document = nlohmann::json::parse(value);
results.push_back(document);
}
if(total_results > 0) {
break;
}
cost++;
n++;
}
if(results.size() == 0) {
// We could drop certain tokens and try
// FIXME: We could drop certain tokens and try searching again
}
long long int timeMillis = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::high_resolution_clock::now() - begin).count();
std::cout << "Time taken for result calc: " << timeMillis << "us" << std::endl;
store->print_memory_usage();
return results;
}
void Collection::log_leaves(const int max_cost, const std::string &token, const std::vector<art_leaf *> &leaves) const {
for(auto i=0; i < leaves.size(); i++) {
printf("%s - ", token.c_str());
printf("%.*s", leaves[i]->key_len, leaves[i]->key);
printf(" - max_cost: %d, - num_ids: %d\n", max_cost, leaves[i]->values->ids.getLength());
for(auto j=0; j<leaves[i]->values->ids.getLength(); j++) {
printf("id: %d\n", leaves[i]->values->ids.at(j));
}
}
}
void Collection::score_results(Topster<100> &topster, const std::vector<art_leaf *> &query_suggestion,
const uint32_t *result_ids, size_t result_size) const {
for(auto i=0; i<result_size; i++) {
@ -228,13 +283,12 @@ void Collection::score_results(Topster<100> &topster, const std::vector<art_leaf
mscore = MatchScore::match_score(doc_id, token_positions);
}
uint32_t doc_score = doc_scores.at(doc_id);
const uint64_t final_score = ((uint64_t)(mscore.words_present * 32 + (20 - mscore.distance)) * UINT32_MAX) +
const uint64_t final_score = ((uint64_t)(mscore.words_present * 32 + (MAX_SEARCH_TOKENS - mscore.distance)) * UINT32_MAX) +
doc_scores.at(doc_id);
//std::cout << "final_score: " << final_score << ", doc_id: " << doc_id << std::endl;
/*
std::cout << "final_score: " << final_score << ", doc_id: " << doc_id << std::endl;
uint32_t doc_score = doc_scores.at(doc_id);
std::cout << "result_ids[i]: " << result_ids[i] << " - mscore.distance: "
<< (int) mscore.distance << " - mscore.words_present: " << (int) mscore.words_present
<< " - doc_scores[doc_id]: " << (int) doc_scores.at(doc_id) << " - final_score: "
@ -245,7 +299,7 @@ void Collection::score_results(Topster<100> &topster, const std::vector<art_leaf
}
}
inline std::vector<art_leaf *> Collection::_next_suggestion(
inline std::vector<art_leaf *> Collection::next_suggestion(
const std::vector<std::vector<art_leaf *>> &token_leaves,
long long int n) {
std::vector<art_leaf*> query_suggestion(token_leaves.size());


@ -65,7 +65,7 @@ TEST_F(CollectionTest, ExactPhraseSearch) {
TEST_F(CollectionTest, SkipUnindexedTokensDuringPhraseSearch) {
// Tokens that are not found in the index should be skipped
std::vector<nlohmann::json> results = collection->search("from DoesNotExist", 0, 10);
std::vector<nlohmann::json> results = collection->search("DoesNotExist from", 0, 10);
ASSERT_EQ(2, results.size());
std::vector<std::string> ids = {"2", "17"};
@ -76,4 +76,43 @@ TEST_F(CollectionTest, SkipUnindexedTokensDuringPhraseSearch) {
std::string result_id = result["id"];
ASSERT_STREQ(id.c_str(), result_id.c_str());
}
// with non-zero cost
results = collection->search("DoesNotExist from", 2, 10);
ASSERT_EQ(2, results.size());
for(size_t i = 0; i < results.size(); i++) {
nlohmann::json result = results.at(i);
std::string id = ids.at(i);
std::string result_id = result["id"];
ASSERT_STREQ(id.c_str(), result_id.c_str());
}
// with 2 indexed words
results = collection->search("from DoesNotExist insTruments", 2, 10);
ASSERT_EQ(1, results.size());
nlohmann::json result = results.at(0);
std::string result_id = result["id"];
ASSERT_STREQ("2", result_id.c_str());
results.clear();
results = collection->search("DoesNotExist1 DoesNotExist2", 0, 10);
ASSERT_EQ(0, results.size());
results.clear();
results = collection->search("DoesNotExist1 DoesNotExist2", 2, 10);
ASSERT_EQ(0, results.size());
}
TEST_F(CollectionTest, PartialPhraseSearch) {
std::vector<nlohmann::json> results = collection->search("rocket research", 0, 10);
//ASSERT_EQ(1, results.size());
}
TEST_F(CollectionTest, RegressionTest1) {
std::vector<nlohmann::json> results = collection->search("kind biologcal", 2, 10);
ASSERT_EQ(1, results.size());
std::string result_id = results.at(0)["id"];
ASSERT_STREQ("19", result_id.c_str());
}


@ -16,4 +16,6 @@
{"points":10,"title":"How late do the launch propellants ionize in a chemical rocket mission?"}
{"points":8,"title":"How much does it cost to launch (right from start) a rocket today?"}
{"points":16,"title":"Difference between Space Dynamics & Astrodynamics in engineering perspective?"}
{"points":18,"title":"What kind of biological research does ISS do?"}
{"points":18,"title":"What kind of biological research does ISS do?"}
{"points":10,"title":"What kinds of radiation hit ISS?"}
{"points":7,"title":"What kinds of things have been tossed out of ISS?"}