For prefix search, only the last term in the query should be considered as prefix.

2025-05-18 04:32:38 +08:00 · 2017-08-19 10:42:49 +05:30 · 2017-08-19 10:42:49 +05:30 · ea550f167c
commit ea550f167c
parent f5848be750
4 changed files with 27 additions and 12 deletions
--- a/TODO.md
+++ b/TODO.md
@ -53,11 +53,11 @@
 - ~~Test for collection creation validation~~
 - ~~Test for delete document~~
 - ~~art float search~~
- When prefix=true, use token_ranking_field for token ordering only for last word
- only last token should be prefix searched
+- ~~When prefix=true, use token_ranking_field for token ordering only for last word~~
+- ~~only last token should be prefix searched~~
+- ~~Prefix-search strings should not be null terminated~~
 - test for token ranking on float field
 - test for float int field deletion during doc deletion
- Prefix-search strings should not be null terminated
 - > INT32_MAX validation for float field
 - art bool support
 - Proper logging
--- a/src/art.cpp
+++ b/src/art.cpp
@ -1169,7 +1169,9 @@ static inline int levenshtein_dist(const int depth, const char p, const char c,
            krow[column] = std::min(krow[column], irow[column-2] + cost);
        }

-        if(krow[column] < row_min) row_min = krow[column];
+        if(krow[column] < row_min) {
+            row_min = krow[column];
+        }
    }

    return row_min;
@ -1267,7 +1269,16 @@ static void art_fuzzy_recurse(char p, char c, const art_node *n, int depth, cons
        art_leaf *l = (art_leaf *) LEAF_RAW(n);
        printf("\nIS_LEAF\nLEAF KEY: %s, depth: %d\n", l->key, depth);

-        const int end_index = min(l->key_len, term_len+max_cost);
+        /*
+           For prefix search, when key is longer than term, we could potentially iterate till `term_len+max_cost` for:
+           term = `th`, leaf = `mathematics` - if we compared only first 2 chars, exceeds max_cost
+           However, we refrain from doing so for performance reasons, or atleast until we hear strong objections.
+
+           Also, for prefix searches we don't compare with full leaf key.
+         */
+        const int end_index = prefix ? min(l->key_len, term_len) : l->key_len;
+
+        // If at any point, `cost > 2*max_cost` we can terminate immediately as we can never recover from that
        while(depth < end_index && cost <= 2*max_cost) {
            c = l->key[depth];
            cost = levenshtein_dist(depth, p, c, term, term_len, rows[i], rows[j], rows[k]);
--- a/src/collection.cpp
+++ b/src/collection.cpp
@ -955,11 +955,12 @@ void Collection::search_field(std::string & query, const std::string & field, ui
            if(token_cost_cache.count(token_cost_hash) != 0) {
                leaves = token_cost_cache[token_cost_hash];
            } else {
-                int token_len = prefix ? (int) token.length() : (int) token.length() + 1;
-                int count = search_index.count(field);
+                // prefix should apply only for last token
+                const bool prefix_search = prefix && ((token_index == tokens.size()-1) ? true : false);
+                const int token_len = prefix_search ? (int) token.length() : (int) token.length() + 1;

                art_fuzzy_search(search_index.at(field), (const unsigned char *) token.c_str(), token_len,
-                                 costs[token_index], costs[token_index], 3, token_order, prefix, leaves);
+                                 costs[token_index], costs[token_index], 3, token_order, prefix_search, leaves);

                if(!leaves.empty()) {
                    token_cost_cache.emplace(token_cost_hash, leaves);
@ -1014,7 +1015,7 @@ void Collection::search_field(std::string & query, const std::string & field, ui

    // When there are not enough overall results and atleast one token has results
    if(topster.size < max_results && token_to_count.size() > 1) {
-        // Drop certain token with least hits and try searching again
+        // Drop token with least hits and try searching again
        std::string truncated_query;

        std::vector<std::pair<std::string, uint32_t>> token_count_pairs;
@ -1029,9 +1030,8 @@ void Collection::search_field(std::string & query, const std::string & field, ui
        );

        for(uint32_t i = 0; i < token_count_pairs.size()-1; i++) {
-            if(token_to_count.count(token_count_pairs[i].first) != 0) {
-                truncated_query += " " + token_count_pairs.at(i).first;
-            }
+            // iterate till last but one
+            truncated_query += " " + token_count_pairs.at(i).first;
        }

        return search_field(truncated_query, field, filter_ids, filter_ids_length, facets, sort_fields, num_typos,
--- a/test/collection_test.cpp
+++ b/test/collection_test.cpp
@ -402,6 +402,10 @@ TEST_F(CollectionTest, PrefixSearching) {
        std::string id = ids.at(i);
        ASSERT_STREQ(id.c_str(), result_id.c_str());
    }
+
+    // only the last token in the query should be used for prefix search - so, "math" should not match "mathematics"
+    results = collection->search("math fx", query_fields, "", facets, sort_fields, 0, 1, 1, FREQUENCY, true).get();
+    ASSERT_EQ(0, results["hits"].size());
 }

 TEST_F(CollectionTest, MultipleFields) {