From ea550f167c35b1ca6ceedb9007274c5bf19e7dcf Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Sat, 19 Aug 2017 10:42:49 +0530 Subject: [PATCH] For prefix search, only the last term in the query should be considered as prefix. --- TODO.md | 6 +++--- src/art.cpp | 15 +++++++++++++-- src/collection.cpp | 14 +++++++------- test/collection_test.cpp | 4 ++++ 4 files changed, 27 insertions(+), 12 deletions(-) diff --git a/TODO.md b/TODO.md index 2143defa..61e7f95b 100644 --- a/TODO.md +++ b/TODO.md @@ -53,11 +53,11 @@ - ~~Test for collection creation validation~~ - ~~Test for delete document~~ - ~~art float search~~ -- When prefix=true, use token_ranking_field for token ordering only for last word -- only last token should be prefix searched +- ~~When prefix=true, use token_ranking_field for token ordering only for last word~~ +- ~~only last token should be prefix searched~~ +- ~~Prefix-search strings should not be null terminated~~ - test for token ranking on float field - test for float int field deletion during doc deletion -- Prefix-search strings should not be null terminated - > INT32_MAX validation for float field - art bool support - Proper logging diff --git a/src/art.cpp b/src/art.cpp index 97155f2c..c675ecd9 100644 --- a/src/art.cpp +++ b/src/art.cpp @@ -1169,7 +1169,9 @@ static inline int levenshtein_dist(const int depth, const char p, const char c, krow[column] = std::min(krow[column], irow[column-2] + cost); } - if(krow[column] < row_min) row_min = krow[column]; + if(krow[column] < row_min) { + row_min = krow[column]; + } } return row_min; @@ -1267,7 +1269,16 @@ static void art_fuzzy_recurse(char p, char c, const art_node *n, int depth, cons art_leaf *l = (art_leaf *) LEAF_RAW(n); printf("\nIS_LEAF\nLEAF KEY: %s, depth: %d\n", l->key, depth); - const int end_index = min(l->key_len, term_len+max_cost); + /* + For prefix search, when key is longer than term, we could potentially iterate till `term_len+max_cost` for: + term = `th`, leaf = `mathematics` - if we compared only first 2 chars, exceeds max_cost + However, we refrain from doing so for performance reasons, or atleast until we hear strong objections. + + Also, for prefix searches we don't compare with full leaf key. + */ + const int end_index = prefix ? min(l->key_len, term_len) : l->key_len; + + // If at any point, `cost > 2*max_cost` we can terminate immediately as we can never recover from that while(depth < end_index && cost <= 2*max_cost) { c = l->key[depth]; cost = levenshtein_dist(depth, p, c, term, term_len, rows[i], rows[j], rows[k]); diff --git a/src/collection.cpp b/src/collection.cpp index 2f6161b3..0a3c2498 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -955,11 +955,12 @@ void Collection::search_field(std::string & query, const std::string & field, ui if(token_cost_cache.count(token_cost_hash) != 0) { leaves = token_cost_cache[token_cost_hash]; } else { - int token_len = prefix ? (int) token.length() : (int) token.length() + 1; - int count = search_index.count(field); + // prefix should apply only for last token + const bool prefix_search = prefix && ((token_index == tokens.size()-1) ? true : false); + const int token_len = prefix_search ? (int) token.length() : (int) token.length() + 1; art_fuzzy_search(search_index.at(field), (const unsigned char *) token.c_str(), token_len, - costs[token_index], costs[token_index], 3, token_order, prefix, leaves); + costs[token_index], costs[token_index], 3, token_order, prefix_search, leaves); if(!leaves.empty()) { token_cost_cache.emplace(token_cost_hash, leaves); @@ -1014,7 +1015,7 @@ void Collection::search_field(std::string & query, const std::string & field, ui // When there are not enough overall results and atleast one token has results if(topster.size < max_results && token_to_count.size() > 1) { - // Drop certain token with least hits and try searching again + // Drop token with least hits and try searching again std::string truncated_query; std::vector> token_count_pairs; @@ -1029,9 +1030,8 @@ void Collection::search_field(std::string & query, const std::string & field, ui ); for(uint32_t i = 0; i < token_count_pairs.size()-1; i++) { - if(token_to_count.count(token_count_pairs[i].first) != 0) { - truncated_query += " " + token_count_pairs.at(i).first; - } + // iterate till last but one + truncated_query += " " + token_count_pairs.at(i).first; } return search_field(truncated_query, field, filter_ids, filter_ids_length, facets, sort_fields, num_typos, diff --git a/test/collection_test.cpp b/test/collection_test.cpp index 9aa3feba..7bfd82ba 100644 --- a/test/collection_test.cpp +++ b/test/collection_test.cpp @@ -402,6 +402,10 @@ TEST_F(CollectionTest, PrefixSearching) { std::string id = ids.at(i); ASSERT_STREQ(id.c_str(), result_id.c_str()); } + + // only the last token in the query should be used for prefix search - so, "math" should not match "mathematics" + results = collection->search("math fx", query_fields, "", facets, sort_fields, 0, 1, 1, FREQUENCY, true).get(); + ASSERT_EQ(0, results["hits"].size()); } TEST_F(CollectionTest, MultipleFields) {