Set a ceiling on num_typos so that 1- and 2-char prefix searches make sense.

Kishore Nallan 2017-09-22 20:59:26 +05:30
parent e24e0fae5d
commit b0cb3ceb41
3 changed files with 35 additions and 18 deletions
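
The idea, as a minimal standalone sketch (the helper name below is made up for illustration; the real change lives inside Collection::search_field in the diff further down): very short tokens get their allowed edit distance capped, so a 1-char prefix permits 0 typos and a 2-char prefix permits at most 1, regardless of the requested num_typos.

#include <cstddef>

// Hypothetical helper mirroring the bounded_cost logic added in this commit.
static int bounded_typo_cost(size_t token_len, int max_cost) {
    if(token_len > 0 && max_cost >= (int) token_len && (token_len == 1 || token_len == 2)) {
        return (int) token_len - 1;
    }
    return max_cost;
}

// e.g. bounded_typo_cost(1, 2) == 0, bounded_typo_cost(2, 2) == 1, bounded_typo_cost(5, 2) == 2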


@@ -61,7 +61,7 @@
- ~~Collection Manager collections map should store plain collection name~~
- ~~init_collection of Collection manager should probably take seq_id as param~~
- ~~node score should be int32, no longer uint16 like in document struct~~
- Typo in prefix search
- ~~Typo in prefix search~~
- Proper logging
- https support
- Validate before string to int conversion in the http api layer


@@ -453,11 +453,6 @@ void Collection::search_candidates(uint32_t* filter_ids, size_t filter_ids_lengt
candidate_rank += 1;
int actual_candidate_rank = candidate_rank;
if(prefix) {
actual_candidate_rank = 0;
}
// intersect the document ids for each token to find docs that contain all the tokens (stored in `result_ids`)
for(auto i=1; i < query_suggestion.size(); i++) {
uint32_t* out = nullptr;
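
The comment above about intersecting per-token document ids refers to code this hunk only partially shows. Purely for illustration, and not using the project's own array helpers, the same narrowing step over sorted id lists can be sketched as:

#include <algorithm>
#include <cstdint>
#include <iterator>
#include <vector>

// Illustrative only: keep the doc ids present in both sorted lists, so the
// surviving documents contain every token considered so far.
std::vector<uint32_t> intersect_sorted_ids(const std::vector<uint32_t> & a,
                                           const std::vector<uint32_t> & b) {
    std::vector<uint32_t> out;
    std::set_intersection(a.begin(), a.end(), b.begin(), b.end(), std::back_inserter(out));
    return out;
}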
@@ -481,7 +476,7 @@ void Collection::search_candidates(uint32_t* filter_ids, size_t filter_ids_lengt
do_facets(facets, filtered_result_ids, filtered_results_size);
// go through each matching document id and calculate match score
score_results(sort_fields, searched_queries.size(), actual_candidate_rank, topster, query_suggestion,
score_results(sort_fields, searched_queries.size(), candidate_rank, topster, query_suggestion,
filtered_result_ids, filtered_results_size);
delete[] filtered_result_ids;
@@ -495,7 +490,7 @@ void Collection::search_candidates(uint32_t* filter_ids, size_t filter_ids_lengt
delete [] *all_result_ids;
*all_result_ids = new_all_result_ids;
score_results(sort_fields, searched_queries.size(), actual_candidate_rank, topster, query_suggestion,
score_results(sort_fields, searched_queries.size(), candidate_rank, topster, query_suggestion,
result_ids, result_size);
delete[] result_ids;
}
@@ -938,13 +933,21 @@ void Collection::search_field(std::string & query, const std::string & field, ui
spp::sparse_hash_map<std::string, uint32_t> token_to_count;
std::vector<std::vector<int>> token_to_costs;
std::vector<int> all_costs;
for(int cost = 0; cost <= max_cost; cost++) {
all_costs.push_back(cost);
}
for(size_t token_index = 0; token_index < tokens.size(); token_index++) {
std::vector<int> all_costs;
const size_t token_len = tokens[token_index].length();
// This ensures that we don't end up doing a cost of 1 for a single char etc.
int bounded_cost = max_cost;
if(token_len > 0 && max_cost >= token_len && (token_len == 1 || token_len == 2)) {
bounded_cost = token_len - 1;
}
for(int cost = 0; cost <= bounded_cost; cost++) {
all_costs.push_back(cost);
}
token_to_costs.push_back(all_costs);
std::transform(tokens[token_index].begin(), tokens[token_index].end(), tokens[token_index].begin(), ::tolower);
}
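
Pulled out of the surrounding method for clarity, the rewritten loop above amounts to the following self-contained sketch (the free function is invented for the example; in the codebase this logic runs inside Collection::search_field and also lowercases each token, which is omitted here):

#include <string>
#include <vector>

// Build the list of allowed edit costs for every query token, capping the
// ceiling for 1- and 2-char tokens as the commit message describes.
std::vector<std::vector<int>> build_token_costs(const std::vector<std::string> & tokens,
                                                const int max_cost) {
    std::vector<std::vector<int>> token_to_costs;
    for(const std::string & token : tokens) {
        const size_t token_len = token.length();
        int bounded_cost = max_cost;
        if(token_len > 0 && max_cost >= (int) token_len && (token_len == 1 || token_len == 2)) {
            bounded_cost = (int) token_len - 1;
        }
        std::vector<int> all_costs;
        for(int cost = 0; cost <= bounded_cost; cost++) {
            all_costs.push_back(cost);
        }
        token_to_costs.push_back(all_costs);
    }
    return token_to_costs;
}

// e.g. with num_typos = 2: build_token_costs({"x", "xq", "propx"}, 2)
// yields {{0}, {0, 1}, {0, 1, 2}}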
@@ -986,9 +989,10 @@ void Collection::search_field(std::string & query, const std::string & field, ui
// prefix should apply only for last token
const bool prefix_search = prefix && ((token_index == tokens.size()-1) ? true : false);
const int token_len = prefix_search ? (int) token.length() : (int) token.length() + 1;
const int max_candidates = prefix_search ? 5 : 3;
art_fuzzy_search(search_index.at(field), (const unsigned char *) token.c_str(), token_len,
costs[token_index], costs[token_index], 3, token_order, prefix_search, leaves);
costs[token_index], costs[token_index], max_candidates, token_order, prefix_search, leaves);
if(!leaves.empty()) {
token_cost_cache.emplace(token_cost_hash, leaves);
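
A hedged illustration of the two rules visible in this hunk: prefix matching applies only to the last query token, and prefix tokens ask the fuzzy search for 5 candidates instead of 3. The struct and function below are invented for the example and do not exist in the codebase.

#include <string>
#include <vector>

struct TokenPlan {
    std::string token;
    bool prefix_search;   // true only for the final token when prefix search is on
    int max_candidates;   // 5 for prefix tokens, 3 otherwise, as in the diff
};

std::vector<TokenPlan> plan_tokens(const std::vector<std::string> & tokens, bool prefix) {
    std::vector<TokenPlan> plans;
    for(size_t i = 0; i < tokens.size(); i++) {
        const bool prefix_search = prefix && (i == tokens.size() - 1);
        plans.push_back({tokens[i], prefix_search, prefix_search ? 5 : 3});
    }
    return plans;
}

// For the query "late propx" with prefix = true, only "propx" is treated as a
// prefix; "late" must match a whole word.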
@@ -1033,6 +1037,7 @@ void Collection::search_field(std::string & query, const std::string & field, ui
break;
}
// only allow up to 10 prefix candidate tokens
if(prefix && candidate_rank > 10) {
break;
}
@@ -1164,9 +1169,9 @@ void Collection::score_results(const std::vector<sort_by> & sort_fields, const i
const number_t & secondary_rank_value = secondary_rank_score * secondary_rank_factor;
topster.add(seq_id, query_index, match_score, primary_rank_value, secondary_rank_value);
/*std::cout << "candidate_rank_score: " << candidate_rank_score << ", words_present: " << mscore.words_present
<< ", match_score: " << match_score << ", primary_rank_score: " << primary_rank_score
<< ", seq_id: " << seq_id << std::endl;*/
/*std::cout << "candidate_rank: " << candidate_rank << ", candidate_rank_score: " << candidate_rank_score
<< ", words_present: " << mscore.words_present << ", match_score: " << match_score
<< ", primary_rank_score: " << primary_rank_score.intval << ", seq_id: " << seq_id << std::endl;*/
}
for (auto it = leaf_to_indices.begin(); it != leaf_to_indices.end(); it++) {
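
The topster.add call above feeds a bounded top-K structure; its implementation is not part of this diff, so what follows is only a generic sketch of that kind of collector (a min-heap that evicts the lowest score once K entries are held), not the project's actual Topster.

#include <cstdint>
#include <queue>
#include <vector>

struct ScoredDoc {
    uint32_t seq_id;
    uint64_t score;
};

struct MinScoreOnTop {
    bool operator()(const ScoredDoc & a, const ScoredDoc & b) const {
        return a.score > b.score;   // keep the smallest score at the top for eviction
    }
};

class TopK {
public:
    explicit TopK(size_t k): k(k) {}

    void add(uint32_t seq_id, uint64_t score) {
        if(heap.size() < k) {
            heap.push({seq_id, score});
        } else if(score > heap.top().score) {
            heap.pop();                 // drop the current weakest entry
            heap.push({seq_id, score});
        }
    }

private:
    size_t k;
    std::priority_queue<ScoredDoc, std::vector<ScoredDoc>, MinScoreOnTop> heap;
};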


@@ -351,7 +351,7 @@ TEST_F(CollectionTest, PrefixSearching) {
std::vector<std::string> facets;
nlohmann::json results = collection->search("ex", query_fields, "", facets, sort_fields, 0, 10, 1, FREQUENCY, true).get();
ASSERT_EQ(2, results["hits"].size());
std::vector<std::string> ids = {"6", "12"};
std::vector<std::string> ids = {"12", "6"};
for(size_t i = 0; i < results["hits"].size(); i++) {
nlohmann::json result = results["hits"].at(i);
@@ -408,6 +408,18 @@ TEST_F(CollectionTest, PrefixSearching) {
// only the last token in the query should be used for prefix search - so, "math" should not match "mathematics"
results = collection->search("math fx", query_fields, "", facets, sort_fields, 0, 1, 1, FREQUENCY, true).get();
ASSERT_EQ(0, results["hits"].size());
// single and double char prefixes should set a ceiling on the num_typos possible
results = collection->search("x", query_fields, "", facets, sort_fields, 2, 2, 1, FREQUENCY, true).get();
ASSERT_EQ(0, results["hits"].size());
results = collection->search("xq", query_fields, "", facets, sort_fields, 2, 2, 1, FREQUENCY, true).get();
ASSERT_EQ(0, results["hits"].size());
// prefix with a typo
results = collection->search("late propx", query_fields, "", facets, sort_fields, 2, 1, 1, FREQUENCY, true).get();
ASSERT_EQ(1, results["hits"].size());
ASSERT_EQ("16", results["hits"].at(0)["id"]);
}
TEST_F(CollectionTest, MultipleFields) {