Change default values of typo/drop tokens thresholds to 1.

Author: Kishore Nallan
Date:   2021-08-11 14:20:28 +05:30
parent  8c46fed1b4
commit  26351a6984
7 changed files with 92 additions and 24 deletions

diff --git a/include/index.h b/include/index.h

@@ -292,7 +292,7 @@ private:
 public:
     // for limiting number of results on multiple candidates / query rewrites
-    enum {TYPO_TOKENS_THRESHOLD = 100};
+    enum {TYPO_TOKENS_THRESHOLD = 1};
     // for limiting number of fields that can be searched on
     enum {FIELD_LIMIT_NUM = 100};
@@ -301,7 +301,7 @@ public:
     // If the number of results found is less than this threshold, Typesense will attempt to drop the tokens
     // in the query that have the least individual hits one by one until enough results are found.
-    static const int DROP_TOKENS_THRESHOLD = 10;
+    static const int DROP_TOKENS_THRESHOLD = 1;
     Index() = delete;
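
Taken together, the two new values change when Typesense rewrites a query: with both defaults at 1, typo expansion and token dropping now kick in only when a search would otherwise return zero results, whereas the old defaults triggered them whenever fewer than 100 (typo) or 10 (drop) hits were found. Below is a minimal sketch of the gating these constants imply, with simplified names and none of the real control flow:

#include <cstddef>

// Sketch only: the real checks live inside Index::search and
// Index::search_field; these helpers just show when each expansion fires.
enum { TYPO_TOKENS_THRESHOLD = 1 };             // was 100
static const size_t DROP_TOKENS_THRESHOLD = 1;  // was 10

bool should_expand_typos(size_t field_num_results,
                         size_t typo_tokens_threshold = TYPO_TOKENS_THRESHOLD) {
    // With a threshold of 1, typo candidates are tried only when the
    // unmodified query produced no hits at all.
    return field_num_results < typo_tokens_threshold;
}

bool should_drop_tokens(size_t all_result_ids_len,
                        size_t drop_tokens_threshold = DROP_TOKENS_THRESHOLD) {
    // Likewise, the token with the fewest individual hits is dropped and the
    // search retried only when nothing was found.
    return all_result_ids_len < drop_tokens_threshold;
}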

diff --git a/src/index.cpp b/src/index.cpp

@@ -2025,7 +2025,7 @@ void Index::search_field(const uint8_t & field_id,
                      num_tokens_dropped, field, filter_ids, filter_ids_length, curated_ids,facets,
                      sort_fields, num_typos,searched_queries, topster, groups_processed, all_result_ids,
                      all_result_ids_len, field_num_results, group_limit, group_by_fields, prioritize_exact_match,
-                     token_order, prefix, combination_limit);
+                     token_order, prefix, drop_tokens_threshold, typo_tokens_threshold, combination_limit);
     }
 }
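
This hunk is the functional half of the commit: the recursive retry inside Index::search_field previously left the two thresholds out of its argument list, so the retry silently fell back to the defaults declared in the signature rather than the values the caller had passed in. A minimal sketch of that bug class, using a hypothetical and heavily simplified signature:

#include <cstddef>
#include <string>
#include <vector>

// Hypothetical stand-in for a search_field-style routine: tunable thresholds
// with defaults, plus a recursive retry that drops the weakest token.
// typo_tokens_threshold is carried along only to show the forwarding.
size_t search_tokens(std::vector<std::string> tokens,
                     size_t drop_tokens_threshold = 1,
                     size_t typo_tokens_threshold = 1) {
    size_t num_results = 0;  // imagine the real index lookup here
    if (num_results < drop_tokens_threshold && tokens.size() > 1) {
        tokens.pop_back();  // drop one token and retry with the rest
        // Pre-fix bug: calling search_tokens(tokens) here resets both
        // thresholds to their defaults. The fix forwards them explicitly,
        // exactly as the added line above does:
        return search_tokens(tokens, drop_tokens_threshold, typo_tokens_threshold);
    }
    return num_results;
}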

diff --git a/test/collection_grouping_test.cpp b/test/collection_grouping_test.cpp

@@ -287,7 +287,7 @@ TEST_F(CollectionGroupingTest, GroupingWithMultiFieldRelevance) {
     auto results = coll1->search("Dustin Kensrue Down There by the Train",
                                  {"title", "artist"}, "", {}, {}, {0}, 10, 1, FREQUENCY,
-                                 {false}, Index::DROP_TOKENS_THRESHOLD,
+                                 {false}, 10,
                                  spp::sparse_hash_set<std::string>(),
                                  spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
                                  "", 10,

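This and the remaining test files change for a single reason: with the defaults now 1, any test that depended on the old query-expansion behaviour must pass the thresholds explicitly. That is why Index::DROP_TOKENS_THRESHOLD above becomes a literal 10 (the constant itself now equals 1), and why the calls below spell out so many arguments: judging by the header defaults, the 10 after the prefix flags is drop_tokens_threshold and the trailing 10 is typo_tokens_threshold, and since C++ has no named arguments, overriding a late parameter means supplying every default before it. A self-contained toy model of that mechanic (hypothetical names, not the real Collection::search signature):

#include <cstddef>
#include <iostream>
#include <string>

// Toy model: a late defaulted parameter can only be overridden positionally,
// so callers must also spell out every default that precedes it.
void search(const std::string& query,
            size_t drop_tokens_threshold = 1,    // new default (was 10)
            size_t typo_tokens_threshold = 1) {  // new default (was 100)
    std::cout << query << ": drop=" << drop_tokens_threshold
              << ", typo=" << typo_tokens_threshold << "\n";
}

int main() {
    search("rocket launch");          // new behaviour: expand only on zero hits
    search("rocket launch", 10, 10);  // tests pin the old, more eager expansion
    return 0;
}
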
diff --git a/test/collection_locale_test.cpp b/test/collection_locale_test.cpp

@@ -349,7 +349,10 @@ TEST_F(CollectionLocaleTest, KoreanTextPrefixConsonant) {
     // To ensure that NFKD works, we will test for both ᄀ (Hangul Choseong Kiyeok)
     auto results = coll1->search("서울특별시 ᄀ",
-                                 {"title"}, "", {}, sort_fields, {0}, 10, 1, FREQUENCY, {true}).get();
+                                 {"title"}, "", {}, sort_fields, {0}, 10, 1, FREQUENCY, {true}, 10,
+                                 spp::sparse_hash_set<std::string>(),
+                                 spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
+                                 "", 10).get();
     ASSERT_EQ(6, results["found"].get<size_t>());
     ASSERT_EQ(6, results["hits"].size());
@@ -357,7 +360,10 @@ TEST_F(CollectionLocaleTest, KoreanTextPrefixConsonant) {
     // and ㄱ (Hangul Letter Kiyeok)
     results = coll1->search("서울특별시 ㄱ",
-                            {"title"}, "", {}, sort_fields, {0}, 10, 1, FREQUENCY, {true}).get();
+                            {"title"}, "", {}, sort_fields, {0}, 10, 1, FREQUENCY, {true}, 10,
+                            spp::sparse_hash_set<std::string>(),
+                            spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
+                            "", 10).get();
     ASSERT_EQ(6, results["found"].get<size_t>());
     ASSERT_EQ(6, results["hits"].size());
@@ -365,7 +371,10 @@ TEST_F(CollectionLocaleTest, KoreanTextPrefixConsonant) {
     // search for full word
     results = coll1->search("서울특별시 관",
-                            {"title"}, "", {}, sort_fields, {0}, 10, 1, FREQUENCY, {true}).get();
+                            {"title"}, "", {}, sort_fields, {0}, 10, 1, FREQUENCY, {true}, 10,
+                            spp::sparse_hash_set<std::string>(),
+                            spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
+                            "", 10).get();
     ASSERT_EQ(6, results["found"].get<size_t>());
     ASSERT_EQ(6, results["hits"].size());
@@ -407,7 +416,10 @@ TEST_F(CollectionLocaleTest, KoreanTextPrefixVowel) {
     std::vector<sort_by> sort_fields = { sort_by(sort_field_const::text_match, "DESC"), sort_by("points", "DESC") };
     auto results = coll1->search("서울특별시 고",
-                                 {"title"}, "", {}, sort_fields, {0}, 10, 1, FREQUENCY, {true}).get();
+                                 {"title"}, "", {}, sort_fields, {0}, 10, 1, FREQUENCY, {true}, 10,
+                                 spp::sparse_hash_set<std::string>(),
+                                 spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
+                                 "", 10).get();
     ASSERT_EQ(6, results["found"].get<size_t>());
     ASSERT_EQ(6, results["hits"].size());

diff --git a/test/collection_sorting_test.cpp b/test/collection_sorting_test.cpp

@@ -438,7 +438,10 @@ TEST_F(CollectionSortingTest, ThreeSortFieldsTextMatchLast) {
     std::vector<sort_by> sort_fields = { sort_by("popularity", "DESC"), sort_by("points", "DESC"), sort_by(sort_field_const::text_match, "DESC") };
     auto res = coll1->search("grant",
-                             {"title","artist"}, "", {}, sort_fields, {1}, 10, 1, FREQUENCY).get();
+                             {"title","artist"}, "", {}, sort_fields, {1}, 10, 1, FREQUENCY, {false}, 10,
+                             spp::sparse_hash_set<std::string>(),
+                             spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
+                             "", 10).get();
     ASSERT_EQ(2, res["found"].get<size_t>());
     ASSERT_STREQ("1", res["hits"][0]["document"]["id"].get<std::string>().c_str());
@@ -479,7 +482,10 @@ TEST_F(CollectionSortingTest, SingleFieldTextMatchScoreDefault) {
     auto results = coll1->search("alpha",
                                  {"title"}, "", {}, sort_fields, {2}, 10, 1, FREQUENCY,
-                                 {false}, 10).get();
+                                 {false}, 10,
+                                 spp::sparse_hash_set<std::string>(),
+                                 spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
+                                 "", 10).get();
     ASSERT_EQ(3, results["found"].get<size_t>());
     ASSERT_EQ(3, results["hits"].size());

diff --git a/test/collection_specific_test.cpp b/test/collection_specific_test.cpp

@@ -410,7 +410,10 @@ TEST_F(CollectionSpecificTest, PrefixVsExactMatch) {
     }
     auto results = coll1->search("ration",
-                                 {"title"}, "", {}, {}, {1}, 10, 1, FREQUENCY, {true}).get();
+                                 {"title"}, "", {}, {}, {1}, 10, 1, FREQUENCY, {true}, 10,
+                                 spp::sparse_hash_set<std::string>(),
+                                 spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
+                                 "", 10).get();
     ASSERT_EQ(4, results["found"].get<size_t>());
     ASSERT_EQ(4, results["hits"].size());
@@ -818,7 +821,7 @@ TEST_F(CollectionSpecificTest, PrefixSearchOnlyOnLastToken) {
                                  "", {}, {}, {1}, 10,
                                  1, FREQUENCY, {true},
                                  0, spp::sparse_hash_set<std::string>(),
-                                 spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "description", 20, {}, {}, {}, 0,
+                                 spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "concat", 20, {}, {}, {}, 0,
                                  "<mark>", "</mark>").get();
     ASSERT_EQ(0, results["hits"][0]["highlights"].size());

diff --git a/test/collection_test.cpp b/test/collection_test.cpp

@@ -151,7 +151,12 @@ TEST_F(CollectionTest, ExactSearchShouldBeStable) {
 TEST_F(CollectionTest, PhraseSearch) {
     std::vector<std::string> facets;
-    nlohmann::json results = collection->search("rocket launch", query_fields, "", facets, sort_fields, {0}, 10).get();
+    nlohmann::json results = collection->search("rocket launch", query_fields, "", facets, sort_fields, {0}, 10,
+                                                1, FREQUENCY,
+                                                {false}, 10,
+                                                spp::sparse_hash_set<std::string>(),
+                                                spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
+                                                "", 10).get();
     ASSERT_EQ(5, results["hits"].size());
     ASSERT_EQ(5, results["found"].get<uint32_t>());
@@ -180,7 +185,12 @@ TEST_F(CollectionTest, PhraseSearch) {
     // Check ASC sort order
     std::vector<sort_by> sort_fields_asc = { sort_by(sort_field_const::text_match, "DESC"), sort_by("points", "ASC") };
-    results = collection->search("rocket launch", query_fields, "", facets, sort_fields_asc, {0}, 10).get();
+    results = collection->search("rocket launch", query_fields, "", facets, sort_fields_asc, {0}, 10,
+                                 1, FREQUENCY,
+                                 {false}, 10,
+                                 spp::sparse_hash_set<std::string>(),
+                                 spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
+                                 "", 10).get();
     ASSERT_EQ(5, results["hits"].size());
     ASSERT_EQ(5, results["found"].get<uint32_t>());
@@ -194,7 +204,12 @@ TEST_F(CollectionTest, PhraseSearch) {
     }
     // Check pagination
-    results = collection->search("rocket launch", query_fields, "", facets, sort_fields, {0}, 3).get();
+    results = collection->search("rocket launch", query_fields, "", facets, sort_fields, {0}, 3,
+                                 1, FREQUENCY,
+                                 {false}, 10,
+                                 spp::sparse_hash_set<std::string>(),
+                                 spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
+                                 "", 10).get();
     ASSERT_EQ(3, results["hits"].size());
     ASSERT_EQ(5, results["found"].get<uint32_t>());
@@ -212,7 +227,12 @@ TEST_F(CollectionTest, PhraseSearch) {
 TEST_F(CollectionTest, SearchWithExcludedTokens) {
     std::vector<std::string> facets;
-    nlohmann::json results = collection->search("how -propellants -are", query_fields, "", facets, sort_fields, {0}, 10).get();
+    nlohmann::json results = collection->search("how -propellants -are", query_fields, "", facets, sort_fields, {0}, 10,
+                                                1, FREQUENCY,
+                                                {false}, 10,
+                                                spp::sparse_hash_set<std::string>(),
+                                                spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
+                                                "", 10).get();
     ASSERT_EQ(2, results["hits"].size());
     ASSERT_EQ(2, results["found"].get<uint32_t>());
@@ -276,7 +296,10 @@ TEST_F(CollectionTest, SkipUnindexedTokensDuringPhraseSearch) {
     // should not try to drop tokens to expand query
     results.clear();
-    results = collection->search("the a", query_fields, "", facets, sort_fields, {0}, 10, 1, FREQUENCY, {false}, 10).get();
+    results = collection->search("the a", query_fields, "", facets, sort_fields, {0}, 10, 1, FREQUENCY, {false}, 10,
+                                 spp::sparse_hash_set<std::string>(),
+                                 spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
+                                 "", 10).get();
     ASSERT_EQ(9, results["hits"].size());
     results.clear();
@@ -322,7 +345,12 @@ TEST_F(CollectionTest, PartialPhraseSearch) {
 TEST_F(CollectionTest, QueryWithTypo) {
     std::vector<std::string> facets;
-    nlohmann::json results = collection->search("kind biologcal", query_fields, "", facets, sort_fields, {2}, 3).get();
+    nlohmann::json results = collection->search("kind biologcal", query_fields, "", facets, sort_fields, {2}, 3,
+                                                1, FREQUENCY,
+                                                {false}, 10,
+                                                spp::sparse_hash_set<std::string>(),
+                                                spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
+                                                "", 10).get();
     ASSERT_EQ(3, results["hits"].size());
     std::vector<std::string> ids = {"19", "3", "20"};
@@ -335,7 +363,12 @@ TEST_F(CollectionTest, QueryWithTypo) {
     }
     results.clear();
-    results = collection->search("fer thx", query_fields, "", facets, sort_fields, {1}, 3).get();
+    results = collection->search("fer thx", query_fields, "", facets, sort_fields, {1}, 3,
+                                 1, FREQUENCY,
+                                 {false}, 10,
+                                 spp::sparse_hash_set<std::string>(),
+                                 spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
+                                 "", 10).get();
     ids = {"1", "10", "13"};
     ASSERT_EQ(3, results["hits"].size());
@@ -411,7 +444,9 @@ TEST_F(CollectionTest, TypoTokenRankedByScoreAndFrequency) {
 TEST_F(CollectionTest, TextContainingAnActualTypo) {
     // A line contains "ISX" but not "what" - need to ensure that correction to "ISS what" happens
     std::vector<std::string> facets;
-    nlohmann::json results = collection->search("ISX what", query_fields, "", facets, sort_fields, {1}, 4, 1, FREQUENCY, {false}).get();
+    nlohmann::json results = collection->search("ISX what", query_fields, "", facets, sort_fields, {1}, 4, 1, FREQUENCY, {false},
+                                                10, spp::sparse_hash_set<std::string>(), spp::sparse_hash_set<std::string>(),
+                                                10, "", 30, 5, "", 10).get();
     ASSERT_EQ(4, results["hits"].size());
     ASSERT_EQ(13, results["found"].get<uint32_t>());
@@ -425,7 +460,10 @@ TEST_F(CollectionTest, TextContainingAnActualTypo) {
     }
     // Record containing exact token match should appear first
-    results = collection->search("ISX", query_fields, "", facets, sort_fields, {1}, 10, 1, FREQUENCY, {false}).get();
+    results = collection->search("ISX", query_fields, "", facets, sort_fields, {1}, 10, 1, FREQUENCY, {false}, 10,
+                                 spp::sparse_hash_set<std::string>(),
+                                 spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
+                                 "", 10).get();
     ASSERT_EQ(8, results["hits"].size());
     ASSERT_EQ(8, results["found"].get<uint32_t>());
@@ -547,7 +585,10 @@ TEST_F(CollectionTest, PrefixSearching) {
         ASSERT_STREQ(id.c_str(), result_id.c_str());
     }
-    results = collection->search("what ex", query_fields, "", facets, sort_fields, {0}, 10, 1, MAX_SCORE, {true}).get();
+    results = collection->search("what ex", query_fields, "", facets, sort_fields, {0}, 10, 1, MAX_SCORE, {true}, 10,
+                                 spp::sparse_hash_set<std::string>(),
+                                 spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
+                                 "", 10).get();
     ASSERT_EQ(9, results["hits"].size());
     ids = {"6", "12", "19", "22", "13", "8", "15", "24", "21"};
@@ -559,7 +600,10 @@ TEST_F(CollectionTest, PrefixSearching) {
     }
     // restrict to only 2 results and differentiate between MAX_SCORE and FREQUENCY
-    results = collection->search("t", query_fields, "", facets, sort_fields, {0}, 2, 1, MAX_SCORE, {true}).get();
+    results = collection->search("t", query_fields, "", facets, sort_fields, {0}, 2, 1, MAX_SCORE, {true}, 10,
+                                 spp::sparse_hash_set<std::string>(),
+                                 spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
+                                 "", 10).get();
     ASSERT_EQ(2, results["hits"].size());
     ids = {"19", "22"};
@@ -570,7 +614,10 @@ TEST_F(CollectionTest, PrefixSearching) {
         ASSERT_STREQ(id.c_str(), result_id.c_str());
     }
-    results = collection->search("t", query_fields, "", facets, sort_fields, {0}, 2, 1, FREQUENCY, {true}).get();
+    results = collection->search("t", query_fields, "", facets, sort_fields, {0}, 2, 1, FREQUENCY, {true}, 10,
+                                 spp::sparse_hash_set<std::string>(),
+                                 spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
+                                 "", 10).get();
     ASSERT_EQ(2, results["hits"].size());
     ids = {"19", "22"};