The typo-tokens threshold and the drop-tokens threshold must be applied independently.

2025-05-18 20:52:50 +08:00 · 2021-07-16 13:39:52 +05:30 · 2021-07-16 13:39:52 +05:30 · 672c895805
commit 672c895805
parent 0ae718d067
3 changed files with 39 additions and 3 deletions
--- a/src/index.cpp
+++ b/src/index.cpp
@ -1984,6 +1984,12 @@ void Index::search_field(const uint8_t & field_id,
                    // when no more costs are left for this token
                    if(token_to_costs[token_index].empty()) {
                        // we can try to drop the token and search with remaining tokens
+
+                        if(field_num_results >= drop_tokens_threshold) {
+                            // but if drop_tokens_threshold is breached, we are done
+                            return ;
+                        }
+
                        token_to_costs.erase(token_to_costs.begin()+token_index);
                        search_tokens.erase(search_tokens.begin()+token_index);
                        query_tokens.erase(query_tokens.begin()+token_index);
@ -2010,8 +2016,8 @@ void Index::search_field(const uint8_t & field_id,

        resume_typo_loop:

-        if(field_num_results >= drop_tokens_threshold || field_num_results >= typo_tokens_threshold) {
-            // if either threshold is breached, we are done
+        if(field_num_results >= typo_tokens_threshold) {
+            // if typo threshold is breached, we are done
            return ;
        }

@ -2022,6 +2028,11 @@ void Index::search_field(const uint8_t & field_id,
    if(!query_tokens.empty() && num_tokens_dropped < query_tokens.size()) {
        // Drop tokens from right until (len/2 + 1), and then from left until (len/2 + 1)

+        if(field_num_results >= drop_tokens_threshold) {
+            // if drop_tokens_threshold is breached, we are done
+            return ;
+        }
+
        std::vector<token_t> truncated_tokens;
        num_tokens_dropped++;

--- a/test/collection_specific_test.cpp
+++ b/test/collection_specific_test.cpp
@ -309,6 +309,31 @@ TEST_F(CollectionSpecificTest, ExactMatchOnPrefix) {
    collectionManager.drop_collection("coll1");
 }

+TEST_F(CollectionSpecificTest, TypoPrefixSearchWithoutPrefixEnabled) {
+    std::vector<field> fields = {field("title", field_types::STRING, false),
+                                 field("points", field_types::INT32, false),};
+
+    Collection* coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get();
+
+    nlohmann::json doc1;
+    doc1["id"] = "0";
+    doc1["title"] = "Cisco SG25026HP Gigabit Smart Switch";
+    doc1["points"] = 100;
+
+    ASSERT_TRUE(coll1->add(doc1.dump()).ok());
+
+    auto results = coll1->search("SG25026H", {"title"}, "", {}, {}, {2}, 10,
+                                 1, FREQUENCY, {false}, 0,
+                                 spp::sparse_hash_set<std::string>(),
+                                 spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
+                                 "", 1).get();
+
+    ASSERT_EQ(1, results["hits"].size());
+    ASSERT_EQ("0", results["hits"][0]["document"]["id"].get<std::string>());
+
+    collectionManager.drop_collection("coll1");
+}
+
 TEST_F(CollectionSpecificTest, PrefixWithTypos) {
    std::vector<field> fields = {field("title", field_types::STRING, false),
                                 field("points", field_types::INT32, false),};
--- a/test/collection_test.cpp
+++ b/test/collection_test.cpp
@ -292,7 +292,7 @@ TEST_F(CollectionTest, SkipUnindexedTokensDuringPhraseSearch) {
    }

    results.clear();
-    results = collection->search("the a DoesNotExist", query_fields, "", facets, sort_fields, {0}, 10, 1, FREQUENCY, {false}, 0).get();
+    results = collection->search("the a insurance", query_fields, "", facets, sort_fields, {0}, 10, 1, FREQUENCY, {false}, 0).get();
    ASSERT_EQ(0, results["hits"].size());

    // with no indexed word