Refactor collection's search method to be more judicious in using higher costs.

Earlier, even if a single token produced no results, ALL tokens were re-searched with a higher cost. This change ensures that we first retry only the token that produced no results with a larger cost, before doing the same for the other tokens.
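In other words, each token now carries its own list of candidate costs, and a failed lookup prunes only that token's cost list rather than raising the cost for the whole query. A minimal standalone sketch of the idea, where the token data and the lookup() function are hypothetical stand-ins for the ART fuzzy search:

    #include <iostream>
    #include <string>
    #include <vector>

    // Hypothetical stand-in for art_fuzzy_search: pretend "biologcal" only
    // matches once we allow an edit distance (cost) of at least 1.
    static bool lookup(const std::string& token, int cost) {
        if(token == "biologcal") return cost >= 1;
        return true; // well-spelled tokens match even at cost 0
    }

    int main() {
        std::vector<std::string> tokens = {"kind", "biologcal"};

        // Every token starts with its own full list of candidate costs [0..max_cost].
        std::vector<std::vector<int>> token_to_costs(tokens.size(),
                                                     std::vector<int>{0, 1, 2});

        // On failure, drop the failing cost for THAT token only and retry it
        // with its next larger cost; other tokens are left untouched.
        for(size_t i = 0; i < tokens.size(); i++) {
            std::vector<int>& costs = token_to_costs[i];
            while(!costs.empty() && !lookup(tokens[i], costs.front())) {
                std::cout << tokens[i] << " failed at cost " << costs.front() << "\n";
                costs.erase(costs.begin());
            }
            if(!costs.empty()) {
                std::cout << tokens[i] << " matched at cost " << costs.front() << "\n";
            }
        }
        // Prints: kind matched at cost 0, biologcal failed at cost 0,
        // biologcal matched at cost 1 -- "kind" was never re-searched.
    }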
This commit is contained in:
parent 44d55cb13d
commit 396e10be5d

TODO.md
@@ -15,6 +15,7 @@
 - ~~Speed up UUID generation~~
 - Prefix-search strings should not be null terminated
 - Make the search score computation customizable
+- string_utils::tokenize should not have max length
 
 **API**
@@ -26,8 +26,12 @@ private:
     std::string get_seq_id_key(uint32_t seq_id);
     std::string get_id_key(std::string id);
 
-    static inline std::vector<art_leaf *> _next_suggestion(const std::vector<std::vector<art_leaf *>> &token_leaves,
-                                                            long long int n);
+    static inline std::vector<art_leaf *> next_suggestion(const std::vector<std::vector<art_leaf *>> &token_leaves,
+                                                           long long int n);
+    void log_leaves(const int max_cost, const std::string &token, const std::vector<art_leaf *> &leaves) const;
+
+    void search_candidates(std::vector<std::vector<art_leaf*>> & token_leaves, Topster<100> & topster,
+                           size_t & total_results, const size_t & max_results);
 
 public:
     Collection() = delete;
@@ -39,5 +43,8 @@ public:
     void score_results(Topster<100> &topster, const std::vector<art_leaf *> &query_suggestion,
                        const uint32_t *result_ids,
                        size_t result_size) const;
 
+    enum {MAX_SEARCH_TOKENS = 20};
+    enum {MAX_RESULTS = 100};
+
 };
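The two enums are the classic pre-C++11 idiom for class-scoped integral constants: they need no out-of-line definition and are usable in constant expressions from other headers, which is what lets the MatchScore hunks below size stack arrays with Collection::MAX_SEARCH_TOKENS. A tiny illustration with a hypothetical class:

    #include <cstdint>

    struct Limits {
        enum {MAX_SEARCH_TOKENS = 20};  // compile-time constant, no storage needed
    };

    // Usable wherever a constant expression is required, e.g. array bounds:
    uint16_t token_offset[Limits::MAX_SEARCH_TOKENS] = {};

    int main() { return sizeof(token_offset) == 40 ? 0 : 1; }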
@@ -60,9 +60,8 @@ struct MatchScore {
      * compute the max_match and min_displacement of target tokens across the windows.
      */
     static MatchScore match_score(uint32_t doc_id, std::vector<std::vector<uint16_t>> &token_offsets) {
-        const size_t WINDOW_SIZE = 20;
-        const size_t MAX_TOKENS_IN_A_QUERY = 20;
-        const uint16_t MAX_DISPLACEMENT = 20;
+        const size_t WINDOW_SIZE = Collection::MAX_SEARCH_TOKENS;
+        const uint16_t MAX_DISPLACEMENT = Collection::MAX_SEARCH_TOKENS;
 
         std::priority_queue<TokenOffset, std::vector<TokenOffset>, TokenOffset> heap;
@@ -76,8 +75,8 @@ struct MatchScore {
         uint16_t min_displacement = MAX_DISPLACEMENT;
 
         std::queue<TokenOffset> window;
-        uint16_t token_offset[MAX_TOKENS_IN_A_QUERY] = { };
-        std::fill_n(token_offset, MAX_TOKENS_IN_A_QUERY, MAX_DISPLACEMENT);
+        uint16_t token_offset[Collection::MAX_SEARCH_TOKENS] = { };
+        std::fill_n(token_offset, Collection::MAX_SEARCH_TOKENS, MAX_DISPLACEMENT);
 
         do {
             if(window.empty()) {
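The comment above summarizes the scoring idea: slide a fixed-size window across the token positions of a document and keep the window that covers the most query tokens with the least displacement. A simplified, self-contained sketch of that windowing (the real implementation merges the per-token offset lists through a priority queue instead of scanning naively):

    #include <cstdint>
    #include <iostream>
    #include <utility>
    #include <vector>

    // For every candidate window start, count how many distinct query tokens
    // occur within WINDOW_SIZE positions, and how tightly they cluster.
    static std::pair<size_t, uint16_t> best_window(
            const std::vector<std::vector<uint16_t>>& token_offsets) {
        const size_t WINDOW_SIZE = 20;
        size_t best_present = 0;
        uint16_t best_displacement = UINT16_MAX;

        for(const auto& anchor_offsets : token_offsets) {
            for(uint16_t start : anchor_offsets) {
                size_t present = 0;
                uint16_t displacement = 0;
                for(const auto& offsets : token_offsets) {
                    for(uint16_t off : offsets) {
                        if(off >= start && off < start + WINDOW_SIZE) {
                            present++;
                            displacement += off - start;
                            break;  // count each token at most once per window
                        }
                    }
                }
                if(present > best_present ||
                   (present == best_present && displacement < best_displacement)) {
                    best_present = present;
                    best_displacement = displacement;
                }
            }
        }
        return {best_present, best_displacement};
    }

    int main() {
        // Token 0 occurs at positions {1, 50}, token 1 at {3}: both fit in one window.
        std::vector<std::vector<uint16_t>> offsets = {{1, 50}, {3}};
        auto [present, displacement] = best_window(offsets);
        std::cout << present << " tokens, displacement " << displacement << "\n";  // 2 tokens, displacement 2
    }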
@@ -80,12 +80,49 @@ std::string Collection::add(std::string json_str) {
     return document["id"];
 }
 
+void Collection::search_candidates(std::vector<std::vector<art_leaf*>> & token_leaves, Topster<100> & topster,
+                                   size_t & total_results, const size_t & max_results) {
+    const size_t combination_limit = 10;
+    auto product = []( long long a, std::vector<art_leaf*>& b ) { return a*b.size(); };
+    long long int N = std::accumulate(token_leaves.begin(), token_leaves.end(), 1LL, product);
+
+    for(long long n=0; n<N && n<combination_limit; ++n) {
+        // every element in `query_suggestion` represents a token and its associated hits
+        std::vector<art_leaf *> query_suggestion = next_suggestion(token_leaves, n);
+
+        // initialize results with the starting element (for further intersection)
+        uint32_t* result_ids = query_suggestion[0]->values->ids.uncompress();
+        size_t result_size = query_suggestion[0]->values->ids.getLength();
+
+        if(result_size == 0) continue;
+
+        // intersect the document ids for each token to find docs that contain all the tokens (stored in `result_ids`)
+        for(auto i=1; i < query_suggestion.size(); i++) {
+            uint32_t* out = new uint32_t[result_size];
+            uint32_t* curr = query_suggestion[i]->values->ids.uncompress();
+            result_size = Intersection::scalar(result_ids, result_size, curr, query_suggestion[i]->values->ids.getLength(), out);
+            delete[] result_ids;
+            delete[] curr;
+            result_ids = out;
+        }
+
+        // go through each matching document id and calculate match score
+        score_results(topster, query_suggestion, result_ids, result_size);
+
+        total_results += result_size;
+        delete[] result_ids;
+
+        if(total_results >= max_results) break;
+    }
+}
+
 /*
  1. Split the query into tokens
-2. For each token, look up ids using exact lookup
-   a. If a token has no result, try again with edit distance of 1, and then 2
-3. Do a limited cartesian product of the word suggestions for each token to form possible corrected search phrases
-   (adapted from: http://stackoverflow.com/a/31169617/131050)
+2. Outer loop will generate bounded cartesian product with costs for each token
+3. Inner loop will iterate on each token with associated cost
+4. Cartesian product of the results of the token searches will be used to form search phrases
+   (cartesian product adapted from: http://stackoverflow.com/a/31169617/131050)
 4. Intersect the lists to find docs that match each phrase
 5. Sort the docs based on some ranking criteria
 */
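search_candidates relies on the fact that each art_leaf stores a sorted list of document ids, so documents containing all tokens can be found by pairwise linear merges. A minimal sketch of such a scalar intersection over plain sorted arrays (simplified; Intersection::scalar plays this role in the codebase):

    #include <cstdint>
    #include <cstdio>

    // Linear merge of two sorted id lists; writes common ids to `out`,
    // returns how many were written (out must have room for min(a_len, b_len)).
    static size_t intersect_sorted(const uint32_t* a, size_t a_len,
                                   const uint32_t* b, size_t b_len, uint32_t* out) {
        size_t i = 0, j = 0, k = 0;
        while(i < a_len && j < b_len) {
            if(a[i] < b[j]) i++;
            else if(a[i] > b[j]) j++;
            else { out[k++] = a[i]; i++; j++; }
        }
        return k;
    }

    int main() {
        uint32_t docs_token1[] = {2, 5, 9, 17};
        uint32_t docs_token2[] = {5, 6, 17, 30};
        uint32_t out[4];
        size_t n = intersect_sorted(docs_token1, 4, docs_token2, 4, out);
        for(size_t i = 0; i < n; i++) printf("%u ", out[i]);  // prints: 5 17
    }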
@@ -94,109 +131,127 @@ std::vector<nlohmann::json> Collection::search(std::string query, const int num_typos, const size_t num_results) {
     StringUtils::tokenize(query, tokens, " ", true);
 
     const int max_cost = (num_typos < 0 || num_typos > 2) ? 2 : num_typos;
-    const size_t max_results = std::min(num_results, (size_t) 100);
+    const size_t max_results = std::min(num_results, (size_t) Collection::MAX_RESULTS);
 
-    int cost = 0;
     size_t total_results = 0;
     std::vector<nlohmann::json> results;
     Topster<100> topster;
 
     auto begin = std::chrono::high_resolution_clock::now();
 
-    while(cost <= max_cost) {
-        std::cout << "Searching with cost=" << cost << std::endl;
+    std::vector<std::vector<int>> token_to_costs;
+    std::vector<int> all_costs;
+    for(int cost = 0; cost <= max_cost; cost++) {
+        all_costs.push_back(cost);
+    }
 
-        std::vector<std::vector<art_leaf*>> token_leaves;
-        for(std::string token: tokens) {
-            std::transform(token.begin(), token.end(), token.begin(), ::tolower);
+    for(size_t token_index = 0; token_index < tokens.size(); token_index++) {
+        token_to_costs.push_back(all_costs);
+        std::transform(tokens[token_index].begin(), tokens[token_index].end(), tokens[token_index].begin(), ::tolower);
+    }
+
+    std::vector<std::vector<art_leaf*>> token_leaves;
+    const size_t combination_limit = 10;
+    auto product = []( long long a, std::vector<int>& b ) { return a*b.size(); };
+    long long n = 0;
+    long long int N = std::accumulate(token_to_costs.begin(), token_to_costs.end(), 1LL, product);
+
+    while(n < N && n < combination_limit) {
+        // Outerloop generates combinations of [cost to max_cost] for each token
+        // For e.g. for a 3-token query: [0, 0, 0], [0, 0, 1], [0, 1, 1] etc.
+        std::vector<uint32_t> costs(token_to_costs.size());
+        ldiv_t q { n, 0 };
+        for(long long i = (token_to_costs.size() - 1); 0 <= i ; --i ) {
+            q = ldiv(q.quot, token_to_costs[i].size());
+            costs[i] = token_to_costs[i][q.rem];
+        }
+
+        token_leaves.clear();
+        size_t token_index = 0;
+        bool retry_with_larger_cost = false;
+
+        while(token_index < tokens.size()) {
+            // For each token, look up the generated cost for this iteration and search using that cost
+            std::string token = tokens[token_index];
 
             std::vector<art_leaf*> leaves;
-            art_fuzzy_search(&t, (const unsigned char *) token.c_str(), (int) token.length() + 1, cost, 3, leaves);
+            art_fuzzy_search(&t, (const unsigned char *) token.c_str(), (int) token.length() + 1, costs[token_index], 3, leaves);
 
             if(!leaves.empty()) {
-                for(auto i=0; i<leaves.size(); i++) {
-                    printf("%s - ", token.c_str());
-                    printf("%.*s", leaves[i]->key_len, leaves[i]->key);
-                    printf(" - max_cost: %d, - num_ids: %d\n", max_cost, leaves[i]->values->ids.getLength());
-                    /*for(auto j=0; j<leaves[i]->values->ids.getLength(); j++) {
-                        printf("id: %d\n", leaves[i]->values->ids.at(j));
-                    }*/
-                }
+                //log_leaves(max_cost, token, leaves);
                 token_leaves.push_back(leaves);
+            } else {
+                // no result when `cost = costs[token_index]` => remove cost for token and re-do combinations
+                auto it = std::find(token_to_costs[token_index].begin(), token_to_costs[token_index].end(), costs[token_index]);
+                if(it != token_to_costs[token_index].end()) {
+                    token_to_costs[token_index].erase(it);
+
+                    // no more costs left for this token, clean up
+                    if(token_to_costs[token_index].empty()) {
+                        token_to_costs.erase(token_to_costs.begin()+token_index);
+                        tokens.erase(tokens.begin()+token_index);
+                        token_index--;
+                    }
+                }
+
+                n = -1;
+                N = std::accumulate(token_to_costs.begin(), token_to_costs.end(), 1LL, product);
+
+                if(costs[token_index] != max_cost) {
+                    // Unless we're already at max_cost for this token, don't look at remaining tokens since we would
+                    // see them again in a future iteration when we retry with a larger cost
+                    retry_with_larger_cost = true;
+                    break;
+                }
             }
+
+            token_index++;
         }
 
+        if(token_leaves.size() != 0 && !retry_with_larger_cost) {
+            // If a) all tokens were found, or b) Some were skipped because they don't exist within max_cost,
+            // go ahead and search for candidates with what we have so far
+            search_candidates(token_leaves, topster, total_results, max_results);
+            topster.sort();
+
+            for (uint32_t i = 0; i < topster.size; i++) {
+                uint64_t seq_id = topster.getKeyAt(i);
+                std::string value;
+                store->get(get_seq_id_key((uint32_t) seq_id), value);
+                nlohmann::json document = nlohmann::json::parse(value);
+                results.push_back(document);
+            }
+
+            if (total_results > 0) {
+                // Unless there are results, we continue outerloop (looking at tokens with greater cost)
+                break;
+            }
+        }
+
-        if(token_leaves.size() != tokens.size() && cost != max_cost) {
-            // There could have been a typo in one of the tokens, so let's try again with greater cost
-            // Or this could be a token that does not exist at all (rare)
-            //std::cout << "token_leaves.size() != tokens.size(), continuing..." << std::endl << std::endl;
-            cost++;
-            continue;
-        }
-
-        const size_t combination_limit = 10;
-        auto product = []( long long a, std::vector<art_leaf*>& b ) { return a*b.size(); };
-        long long int N = std::accumulate(token_leaves.begin(), token_leaves.end(), 1LL, product );
-
-        for(long long n=0; n<N && n<combination_limit; ++n) {
-            // every element in `query_suggestion` represents a token and its associated hits
-            std::vector<art_leaf *> query_suggestion = _next_suggestion(token_leaves, n);
-
-            // initialize results with the starting element (for further intersection)
-            uint32_t* result_ids = query_suggestion[0]->values->ids.uncompress();
-            size_t result_size = query_suggestion[0]->values->ids.getLength();
-
-            if(result_size == 0) continue;
-
-            // intersect the document ids for each token to find docs that contain all the tokens (stored in `result_ids`)
-            for(auto i=1; i < query_suggestion.size(); i++) {
-                uint32_t* out = new uint32_t[result_size];
-                uint32_t* curr = query_suggestion[i]->values->ids.uncompress();
-                result_size = Intersection::scalar(result_ids, result_size, curr, query_suggestion[i]->values->ids.getLength(), out);
-                delete[] result_ids;
-                delete[] curr;
-                result_ids = out;
-            }
-
-            // go through each matching document id and calculate match score
-            score_results(topster, query_suggestion, result_ids, result_size);
-
-            total_results += result_size;
-            delete[] result_ids;
-
-            if(total_results >= max_results) break;
-        }
-
-        topster.sort();
-
-        for(uint32_t i=0; i<topster.size; i++) {
-            uint64_t seq_id = topster.getKeyAt(i);
-            //std::cout << "ID: " << seq_id << std::endl;
-
-            std::string value;
-            store->get(get_seq_id_key((uint32_t) seq_id), value);
-            nlohmann::json document = nlohmann::json::parse(value);
-            results.push_back(document);
-        }
-
-        if(total_results > 0) {
-            break;
-        }
-
-        cost++;
+        n++;
     }
 
     if(results.size() == 0) {
-        // We could drop certain tokens and try
+        // FIXME: We could drop certain tokens and try searching again
     }
 
     long long int timeMillis = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::high_resolution_clock::now() - begin).count();
     std::cout << "Time taken for result calc: " << timeMillis << "us" << std::endl;
 
     store->print_memory_usage();
 
     return results;
 }
 
+void Collection::log_leaves(const int max_cost, const std::string &token, const std::vector<art_leaf *> &leaves) const {
+    for(auto i=0; i < leaves.size(); i++) {
+        printf("%s - ", token.c_str());
+        printf("%.*s", leaves[i]->key_len, leaves[i]->key);
+        printf(" - max_cost: %d, - num_ids: %d\n", max_cost, leaves[i]->values->ids.getLength());
+        for(auto j=0; j<leaves[i]->values->ids.getLength(); j++) {
+            printf("id: %d\n", leaves[i]->values->ids.at(j));
+        }
+    }
+}
+
 void Collection::score_results(Topster<100> &topster, const std::vector<art_leaf *> &query_suggestion,
                                const uint32_t *result_ids, size_t result_size) const {
     for(auto i=0; i<result_size; i++) {
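The ldiv() loop above decodes the combination counter n as a mixed-radix number in which the i-th digit's base is the size of that token's remaining cost list (the Stack Overflow trick cited in the comment). A standalone sketch that prints the first few decoded cost combinations:

    #include <cstdlib>
    #include <iostream>
    #include <vector>

    int main() {
        // Three tokens, each with candidate costs {0, 1, 2}: 27 combinations total.
        std::vector<std::vector<int>> token_to_costs = {{0, 1, 2}, {0, 1, 2}, {0, 1, 2}};

        // Decode combination #n by repeated division; the i-th "digit" has base
        // token_to_costs[i].size(), with the last token as the least significant.
        for(long n = 0; n < 5; n++) {
            std::vector<int> costs(token_to_costs.size());
            ldiv_t q { n, 0 };
            for(int i = (int) token_to_costs.size() - 1; i >= 0; --i) {
                q = ldiv(q.quot, (long) token_to_costs[i].size());
                costs[i] = token_to_costs[i][q.rem];
            }
            std::cout << costs[0] << costs[1] << costs[2] << "\n";
        }
        // Prints 000, 001, 002, 010, 011: lower-cost combinations come first,
        // so cheaper interpretations of the query are tried before costlier ones.
    }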
@@ -228,13 +283,12 @@ void Collection::score_results(Topster<100> &topster, const std::vector<art_leaf *> &query_suggestion,
             mscore = MatchScore::match_score(doc_id, token_positions);
         }
 
-        uint32_t doc_score = doc_scores.at(doc_id);
-        const uint64_t final_score = ((uint64_t)(mscore.words_present * 32 + (20 - mscore.distance)) * UINT32_MAX) +
+        const uint64_t final_score = ((uint64_t)(mscore.words_present * 32 + (MAX_SEARCH_TOKENS - mscore.distance)) * UINT32_MAX) +
+                                     doc_scores.at(doc_id);
 
-        //std::cout << "final_score: " << final_score << ", doc_id: " << doc_id << std::endl;
+        /*
+        std::cout << "final_score: " << final_score << ", doc_id: " << doc_id << std::endl;
+        uint32_t doc_score = doc_scores.at(doc_id);
         std::cout << "result_ids[i]: " << result_ids[i] << " - mscore.distance: "
                   << (int) mscore.distance << " - mscore.words_present: " << (int) mscore.words_present
                   << " - doc_scores[doc_id]: " << (int) doc_scores.at(doc_id) << " - final_score: "
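final_score packs three signals into one 64-bit integer: the number of query words present dominates (weighted by 32, which exceeds the maximum proximity term of MAX_SEARCH_TOKENS = 20), a smaller token distance ranks next, and the document's static score breaks ties in the low 32 bits. A worked example with assumed values:

    #include <cstdint>
    #include <iostream>

    int main() {
        const uint64_t MAX_SEARCH_TOKENS = 20;

        // Assumed values: both query words present, distance 3, static doc score 100.
        uint64_t words_present = 2, distance = 3, doc_score = 100;

        const uint64_t final_score =
            (uint64_t)(words_present * 32 + (MAX_SEARCH_TOKENS - distance)) * UINT32_MAX + doc_score;

        // (2*32 + 17) * (2^32 - 1) + 100 = 81 * 4294967295 + 100 = 347892350995
        std::cout << final_score << "\n";
    }

Because the proximity term can never exceed 20, a document matching one more query word always outranks any proximity advantage.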
@@ -245,7 +299,7 @@ void Collection::score_results(Topster<100> &topster, const std::vector<art_leaf *> &query_suggestion,
     }
 }
 
-inline std::vector<art_leaf *> Collection::_next_suggestion(
+inline std::vector<art_leaf *> Collection::next_suggestion(
         const std::vector<std::vector<art_leaf *>> &token_leaves,
         long long int n) {
     std::vector<art_leaf*> query_suggestion(token_leaves.size());
@@ -65,7 +65,7 @@ TEST_F(CollectionTest, ExactPhraseSearch) {
 
 TEST_F(CollectionTest, SkipUnindexedTokensDuringPhraseSearch) {
     // Tokens that are not found in the index should be skipped
-    std::vector<nlohmann::json> results = collection->search("from DoesNotExist", 0, 10);
+    std::vector<nlohmann::json> results = collection->search("DoesNotExist from", 0, 10);
     ASSERT_EQ(2, results.size());
 
     std::vector<std::string> ids = {"2", "17"};
@@ -76,4 +76,43 @@ TEST_F(CollectionTest, SkipUnindexedTokensDuringPhraseSearch) {
         std::string result_id = result["id"];
         ASSERT_STREQ(id.c_str(), result_id.c_str());
     }
+
+    // with non-zero cost
+    results = collection->search("DoesNotExist from", 2, 10);
+    ASSERT_EQ(2, results.size());
+
+    for(size_t i = 0; i < results.size(); i++) {
+        nlohmann::json result = results.at(i);
+        std::string id = ids.at(i);
+        std::string result_id = result["id"];
+        ASSERT_STREQ(id.c_str(), result_id.c_str());
+    }
+
+    // with 2 indexed words
+    results = collection->search("from DoesNotExist insTruments", 2, 10);
+    ASSERT_EQ(1, results.size());
+    nlohmann::json result = results.at(0);
+    std::string result_id = result["id"];
+    ASSERT_STREQ("2", result_id.c_str());
+
+    results.clear();
+    results = collection->search("DoesNotExist1 DoesNotExist2", 0, 10);
+    ASSERT_EQ(0, results.size());
+
+    results.clear();
+    results = collection->search("DoesNotExist1 DoesNotExist2", 2, 10);
+    ASSERT_EQ(0, results.size());
 }
+
+TEST_F(CollectionTest, PartialPhraseSearch) {
+    std::vector<nlohmann::json> results = collection->search("rocket research", 0, 10);
+    //ASSERT_EQ(1, results.size());
+}
+
+TEST_F(CollectionTest, RegressionTest1) {
+    std::vector<nlohmann::json> results = collection->search("kind biologcal", 2, 10);
+    ASSERT_EQ(1, results.size());
+
+    std::string result_id = results.at(0)["id"];
+    ASSERT_STREQ("19", result_id.c_str());
+}
@@ -16,4 +16,6 @@
 {"points":10,"title":"How late do the launch propellants ionize in a chemical rocket mission?"}
 {"points":8,"title":"How much does it cost to launch (right from start) a rocket today?"}
 {"points":16,"title":"Difference between Space Dynamics & Astrodynamics in engineering perspective?"}
 {"points":18,"title":"What kind of biological research does ISS do?"}
+{"points":10,"title":"What kinds of radiation hit ISS?"}
+{"points":7,"title":"What kinds of things have been tossed out of ISS?"}