For prefix search, only the last term in the query should be considered as prefix.

This commit is contained in:
Kishore Nallan 2017-08-19 10:42:49 +05:30
parent f5848be750
commit ea550f167c
4 changed files with 27 additions and 12 deletions

View File

@ -53,11 +53,11 @@
- ~~Test for collection creation validation~~
- ~~Test for delete document~~
- ~~art float search~~
- When prefix=true, use token_ranking_field for token ordering only for last word
- only last token should be prefix searched
- ~~When prefix=true, use token_ranking_field for token ordering only for last word~~
- ~~only last token should be prefix searched~~
- ~~Prefix-search strings should not be null terminated~~
- test for token ranking on float field
- test for float int field deletion during doc deletion
- Prefix-search strings should not be null terminated
- > INT32_MAX validation for float field
- art bool support
- Proper logging

View File

@ -1169,7 +1169,9 @@ static inline int levenshtein_dist(const int depth, const char p, const char c,
krow[column] = std::min(krow[column], irow[column-2] + cost);
}
if(krow[column] < row_min) row_min = krow[column];
if(krow[column] < row_min) {
row_min = krow[column];
}
}
return row_min;
@ -1267,7 +1269,16 @@ static void art_fuzzy_recurse(char p, char c, const art_node *n, int depth, cons
art_leaf *l = (art_leaf *) LEAF_RAW(n);
printf("\nIS_LEAF\nLEAF KEY: %s, depth: %d\n", l->key, depth);
const int end_index = min(l->key_len, term_len+max_cost);
/*
For prefix search, when key is longer than term, we could potentially iterate till `term_len+max_cost` for:
term = `th`, leaf = `mathematics` - if we compared only first 2 chars, exceeds max_cost
However, we refrain from doing so for performance reasons, or atleast until we hear strong objections.
Also, for prefix searches we don't compare with full leaf key.
*/
const int end_index = prefix ? min(l->key_len, term_len) : l->key_len;
// If at any point, `cost > 2*max_cost` we can terminate immediately as we can never recover from that
while(depth < end_index && cost <= 2*max_cost) {
c = l->key[depth];
cost = levenshtein_dist(depth, p, c, term, term_len, rows[i], rows[j], rows[k]);

View File

@ -955,11 +955,12 @@ void Collection::search_field(std::string & query, const std::string & field, ui
if(token_cost_cache.count(token_cost_hash) != 0) {
leaves = token_cost_cache[token_cost_hash];
} else {
int token_len = prefix ? (int) token.length() : (int) token.length() + 1;
int count = search_index.count(field);
// prefix should apply only for last token
const bool prefix_search = prefix && ((token_index == tokens.size()-1) ? true : false);
const int token_len = prefix_search ? (int) token.length() : (int) token.length() + 1;
art_fuzzy_search(search_index.at(field), (const unsigned char *) token.c_str(), token_len,
costs[token_index], costs[token_index], 3, token_order, prefix, leaves);
costs[token_index], costs[token_index], 3, token_order, prefix_search, leaves);
if(!leaves.empty()) {
token_cost_cache.emplace(token_cost_hash, leaves);
@ -1014,7 +1015,7 @@ void Collection::search_field(std::string & query, const std::string & field, ui
// When there are not enough overall results and atleast one token has results
if(topster.size < max_results && token_to_count.size() > 1) {
// Drop certain token with least hits and try searching again
// Drop token with least hits and try searching again
std::string truncated_query;
std::vector<std::pair<std::string, uint32_t>> token_count_pairs;
@ -1029,9 +1030,8 @@ void Collection::search_field(std::string & query, const std::string & field, ui
);
for(uint32_t i = 0; i < token_count_pairs.size()-1; i++) {
if(token_to_count.count(token_count_pairs[i].first) != 0) {
truncated_query += " " + token_count_pairs.at(i).first;
}
// iterate till last but one
truncated_query += " " + token_count_pairs.at(i).first;
}
return search_field(truncated_query, field, filter_ids, filter_ids_length, facets, sort_fields, num_typos,

View File

@ -402,6 +402,10 @@ TEST_F(CollectionTest, PrefixSearching) {
std::string id = ids.at(i);
ASSERT_STREQ(id.c_str(), result_id.c_str());
}
// only the last token in the query should be used for prefix search - so, "math" should not match "mathematics"
results = collection->search("math fx", query_fields, "", facets, sort_fields, 0, 1, 1, FREQUENCY, true).get();
ASSERT_EQ(0, results["hits"].size());
}
TEST_F(CollectionTest, MultipleFields) {