mirror of
https://github.com/typesense/typesense.git
synced 2025-05-18 04:32:38 +08:00
For prefix search, only the last term in the query should be treated as a prefix.
This commit is contained in:
parent
f5848be750
commit
ea550f167c
6
TODO.md
6
TODO.md
@ -53,11 +53,11 @@
|
||||
- ~~Test for collection creation validation~~
|
||||
- ~~Test for delete document~~
|
||||
- ~~art float search~~
|
||||
- When prefix=true, use token_ranking_field for token ordering only for last word
|
||||
- only last token should be prefix searched
|
||||
- ~~When prefix=true, use token_ranking_field for token ordering only for last word~~
|
||||
- ~~only last token should be prefix searched~~
|
||||
- ~~Prefix-search strings should not be null terminated~~
|
||||
- test for token ranking on float field
|
||||
- test for float int field deletion during doc deletion
|
||||
- Prefix-search strings should not be null terminated
|
||||
- > INT32_MAX validation for float field
|
||||
- art bool support
|
||||
- Proper logging
|
||||
|
15
src/art.cpp
15
src/art.cpp
@ -1169,7 +1169,9 @@ static inline int levenshtein_dist(const int depth, const char p, const char c,
|
||||
krow[column] = std::min(krow[column], irow[column-2] + cost);
|
||||
}
|
||||
|
||||
if(krow[column] < row_min) row_min = krow[column];
|
||||
if(krow[column] < row_min) {
|
||||
row_min = krow[column];
|
||||
}
|
||||
}
|
||||
|
||||
return row_min;
|
||||
@ -1267,7 +1269,16 @@ static void art_fuzzy_recurse(char p, char c, const art_node *n, int depth, cons
|
||||
art_leaf *l = (art_leaf *) LEAF_RAW(n);
|
||||
printf("\nIS_LEAF\nLEAF KEY: %s, depth: %d\n", l->key, depth);
|
||||
|
||||
const int end_index = min(l->key_len, term_len+max_cost);
|
||||
/*
|
||||
For prefix search, when key is longer than term, we could potentially iterate till `term_len+max_cost` for:
|
||||
term = `th`, leaf = `mathematics` - if we compared only first 2 chars, exceeds max_cost
|
||||
However, we refrain from doing so for performance reasons, or at least until we hear strong objections.
|
||||
|
||||
Also, for prefix searches we don't compare with full leaf key.
|
||||
*/
|
||||
const int end_index = prefix ? min(l->key_len, term_len) : l->key_len;
|
||||
|
||||
// If at any point, `cost > 2*max_cost` we can terminate immediately as we can never recover from that
|
||||
while(depth < end_index && cost <= 2*max_cost) {
|
||||
c = l->key[depth];
|
||||
cost = levenshtein_dist(depth, p, c, term, term_len, rows[i], rows[j], rows[k]);
|
||||
|
@ -955,11 +955,12 @@ void Collection::search_field(std::string & query, const std::string & field, ui
|
||||
if(token_cost_cache.count(token_cost_hash) != 0) {
|
||||
leaves = token_cost_cache[token_cost_hash];
|
||||
} else {
|
||||
int token_len = prefix ? (int) token.length() : (int) token.length() + 1;
|
||||
int count = search_index.count(field);
|
||||
// prefix should apply only for last token
|
||||
const bool prefix_search = prefix && ((token_index == tokens.size()-1) ? true : false);
|
||||
const int token_len = prefix_search ? (int) token.length() : (int) token.length() + 1;
|
||||
|
||||
art_fuzzy_search(search_index.at(field), (const unsigned char *) token.c_str(), token_len,
|
||||
costs[token_index], costs[token_index], 3, token_order, prefix, leaves);
|
||||
costs[token_index], costs[token_index], 3, token_order, prefix_search, leaves);
|
||||
|
||||
if(!leaves.empty()) {
|
||||
token_cost_cache.emplace(token_cost_hash, leaves);
|
||||
@ -1014,7 +1015,7 @@ void Collection::search_field(std::string & query, const std::string & field, ui
|
||||
|
||||
// When there are not enough overall results and at least one token has results
|
||||
if(topster.size < max_results && token_to_count.size() > 1) {
|
||||
// Drop certain token with least hits and try searching again
|
||||
// Drop token with least hits and try searching again
|
||||
std::string truncated_query;
|
||||
|
||||
std::vector<std::pair<std::string, uint32_t>> token_count_pairs;
|
||||
@ -1029,9 +1030,8 @@ void Collection::search_field(std::string & query, const std::string & field, ui
|
||||
);
|
||||
|
||||
for(uint32_t i = 0; i < token_count_pairs.size()-1; i++) {
|
||||
if(token_to_count.count(token_count_pairs[i].first) != 0) {
|
||||
truncated_query += " " + token_count_pairs.at(i).first;
|
||||
}
|
||||
// iterate till last but one
|
||||
truncated_query += " " + token_count_pairs.at(i).first;
|
||||
}
|
||||
|
||||
return search_field(truncated_query, field, filter_ids, filter_ids_length, facets, sort_fields, num_typos,
|
||||
|
@ -402,6 +402,10 @@ TEST_F(CollectionTest, PrefixSearching) {
|
||||
std::string id = ids.at(i);
|
||||
ASSERT_STREQ(id.c_str(), result_id.c_str());
|
||||
}
|
||||
|
||||
// only the last token in the query should be used for prefix search - so, "math" should not match "mathematics"
|
||||
results = collection->search("math fx", query_fields, "", facets, sort_fields, 0, 1, 1, FREQUENCY, true).get();
|
||||
ASSERT_EQ(0, results["hits"].size());
|
||||
}
|
||||
|
||||
TEST_F(CollectionTest, MultipleFields) {
|
||||
|
Loading…
x
Reference in New Issue
Block a user