From ea550f167c35b1ca6ceedb9007274c5bf19e7dcf Mon Sep 17 00:00:00 2001
From: Kishore Nallan <kishore@kishorelive.com>
Date: Sat, 19 Aug 2017 10:42:49 +0530
Subject: [PATCH] For prefix search, only the last term in the query should be
 considered as prefix.

---
 TODO.md                  |  6 +++---
 src/art.cpp              | 15 +++++++++++++--
 src/collection.cpp       | 14 +++++++-------
 test/collection_test.cpp |  4 ++++
 4 files changed, 27 insertions(+), 12 deletions(-)

diff --git a/TODO.md b/TODO.md
index 2143defa..61e7f95b 100644
--- a/TODO.md
+++ b/TODO.md
@@ -53,11 +53,11 @@
 - ~~Test for collection creation validation~~
 - ~~Test for delete document~~
 - ~~art float search~~
-- When prefix=true, use token_ranking_field for token ordering only for last word
-- only last token should be prefix searched
+- ~~When prefix=true, use token_ranking_field for token ordering only for last word~~
+- ~~only last token should be prefix searched~~
+- ~~Prefix-search strings should not be null terminated~~
 - test for token ranking on float field
 - test for float int field deletion during doc deletion
-- Prefix-search strings should not be null terminated
 - > INT32_MAX validation for float field
 - art bool support
 - Proper logging
diff --git a/src/art.cpp b/src/art.cpp
index 97155f2c..c675ecd9 100644
--- a/src/art.cpp
+++ b/src/art.cpp
@@ -1169,7 +1169,9 @@ static inline int levenshtein_dist(const int depth, const char p, const char c,
             krow[column] = std::min(krow[column], irow[column-2] + cost);
         }
 
-        if(krow[column] < row_min) row_min = krow[column];
+        if(krow[column] < row_min) {
+            row_min = krow[column];
+        }
     }
 
     return row_min;
@@ -1267,7 +1269,16 @@ static void art_fuzzy_recurse(char p, char c, const art_node *n, int depth, cons
         art_leaf *l = (art_leaf *) LEAF_RAW(n);
         printf("\nIS_LEAF\nLEAF KEY: %s, depth: %d\n", l->key, depth);
 
-        const int end_index = min(l->key_len, term_len+max_cost);
+        /*
+           For prefix search, when key is longer than term, we could potentially iterate till `term_len+max_cost` for:
+           term = `th`, leaf = `mathematics` - if we compared only first 2 chars, exceeds max_cost
+           However, we refrain from doing so for performance reasons, or atleast until we hear strong objections.
+
+           Also, for prefix searches we don't compare with full leaf key.
+         */
+        const int end_index = prefix ? min(l->key_len, term_len) : l->key_len;
+
+        // If at any point, `cost > 2*max_cost` we can terminate immediately as we can never recover from that
         while(depth < end_index && cost <= 2*max_cost) {
             c = l->key[depth];
             cost = levenshtein_dist(depth, p, c, term, term_len, rows[i], rows[j], rows[k]);
diff --git a/src/collection.cpp b/src/collection.cpp
index 2f6161b3..0a3c2498 100644
--- a/src/collection.cpp
+++ b/src/collection.cpp
@@ -955,11 +955,12 @@ void Collection::search_field(std::string & query, const std::string & field, ui
             if(token_cost_cache.count(token_cost_hash) != 0) {
                 leaves = token_cost_cache[token_cost_hash];
             } else {
-                int token_len = prefix ? (int) token.length() : (int) token.length() + 1;
-                int count = search_index.count(field);
+                // prefix should apply only for last token
+                const bool prefix_search = prefix && ((token_index == tokens.size()-1) ? true : false);
+                const int token_len = prefix_search ? (int) token.length() : (int) token.length() + 1;
 
                 art_fuzzy_search(search_index.at(field), (const unsigned char *) token.c_str(), token_len,
-                                 costs[token_index], costs[token_index], 3, token_order, prefix, leaves);
+                                 costs[token_index], costs[token_index], 3, token_order, prefix_search, leaves);
 
                 if(!leaves.empty()) {
                     token_cost_cache.emplace(token_cost_hash, leaves);
@@ -1014,7 +1015,7 @@ void Collection::search_field(std::string & query, const std::string & field, ui
 
     // When there are not enough overall results and atleast one token has results
     if(topster.size < max_results && token_to_count.size() > 1) {
-        // Drop certain token with least hits and try searching again
+        // Drop token with least hits and try searching again
         std::string truncated_query;
 
         std::vector<std::pair<std::string, uint32_t>> token_count_pairs;
@@ -1029,9 +1030,8 @@ void Collection::search_field(std::string & query, const std::string & field, ui
         );
 
         for(uint32_t i = 0; i < token_count_pairs.size()-1; i++) {
-            if(token_to_count.count(token_count_pairs[i].first) != 0) {
-                truncated_query += " " + token_count_pairs.at(i).first;
-            }
+            // iterate till last but one
+            truncated_query += " " + token_count_pairs.at(i).first;
         }
 
         return search_field(truncated_query, field, filter_ids, filter_ids_length, facets, sort_fields, num_typos,
diff --git a/test/collection_test.cpp b/test/collection_test.cpp
index 9aa3feba..7bfd82ba 100644
--- a/test/collection_test.cpp
+++ b/test/collection_test.cpp
@@ -402,6 +402,10 @@ TEST_F(CollectionTest, PrefixSearching) {
         std::string id = ids.at(i);
         ASSERT_STREQ(id.c_str(), result_id.c_str());
     }
+
+    // only the last token in the query should be used for prefix search - so, "math" should not match "mathematics"
+    results = collection->search("math fx", query_fields, "", facets, sort_fields, 0, 1, 1, FREQUENCY, true).get();
+    ASSERT_EQ(0, results["hits"].size());
 }
 
 TEST_F(CollectionTest, MultipleFields) {