Change default values of typo/drop tokens thresholds to 1.

Author: Kishore Nallan
Date:   2021-08-11 14:20:28 +05:30
parent  8c46fed1b4
commit  26351a6984
7 changed files with 92 additions and 24 deletions

diff --git a/include/index.h b/include/index.h

@@ -292,7 +292,7 @@ private:
 public:
     // for limiting number of results on multiple candidates / query rewrites
-    enum {TYPO_TOKENS_THRESHOLD = 100};
+    enum {TYPO_TOKENS_THRESHOLD = 1};
     // for limiting number of fields that can be searched on
     enum {FIELD_LIMIT_NUM = 100};
@@ -301,7 +301,7 @@ public:
     // If the number of results found is less than this threshold, Typesense will attempt to drop the tokens
     // in the query that have the least individual hits one by one until enough results are found.
-    static const int DROP_TOKENS_THRESHOLD = 10;
+    static const int DROP_TOKENS_THRESHOLD = 1;
     Index() = delete;
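
Taken together, the two new values change when Typesense rewrites a query: with both defaults at 1, typo expansion and token dropping now kick in only when a search would otherwise return zero results, whereas the old defaults triggered them whenever fewer than 100 (typo) or 10 (drop) hits were found. Below is a minimal sketch of the gating these constants imply, with simplified names and none of the real control flow:

#include <cstddef>

// Sketch only: the real checks live inside Index::search and
// Index::search_field; these helpers just show when each expansion fires.
enum { TYPO_TOKENS_THRESHOLD = 1 };             // was 100
static const size_t DROP_TOKENS_THRESHOLD = 1;  // was 10

bool should_expand_typos(size_t field_num_results,
                         size_t typo_tokens_threshold = TYPO_TOKENS_THRESHOLD) {
    // With a threshold of 1, typo candidates are tried only when the
    // unmodified query produced no hits at all.
    return field_num_results < typo_tokens_threshold;
}

bool should_drop_tokens(size_t all_result_ids_len,
                        size_t drop_tokens_threshold = DROP_TOKENS_THRESHOLD) {
    // Likewise, the token with the fewest individual hits is dropped and the
    // search retried only when nothing was found.
    return all_result_ids_len < drop_tokens_threshold;
}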

diff --git a/src/index.cpp b/src/index.cpp

@@ -2025,7 +2025,7 @@ void Index::search_field(const uint8_t & field_id,
                      num_tokens_dropped, field, filter_ids, filter_ids_length, curated_ids,facets,
                      sort_fields, num_typos,searched_queries, topster, groups_processed, all_result_ids,
                      all_result_ids_len, field_num_results, group_limit, group_by_fields, prioritize_exact_match,
-                     token_order, prefix, combination_limit);
+                     token_order, prefix, drop_tokens_threshold, typo_tokens_threshold, combination_limit);
     }
 }
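
This hunk is the functional half of the commit: the recursive retry inside Index::search_field previously left the two thresholds out of its argument list, so the retry silently fell back to the defaults declared in the signature rather than the values the caller had passed in. A minimal sketch of that bug class, using a hypothetical and heavily simplified signature:

#include <cstddef>
#include <string>
#include <vector>

// Hypothetical stand-in for a search_field-style routine: tunable thresholds
// with defaults, plus a recursive retry that drops the weakest token.
// typo_tokens_threshold is carried along only to show the forwarding.
size_t search_tokens(std::vector<std::string> tokens,
                     size_t drop_tokens_threshold = 1,
                     size_t typo_tokens_threshold = 1) {
    size_t num_results = 0;  // imagine the real index lookup here
    if (num_results < drop_tokens_threshold && tokens.size() > 1) {
        tokens.pop_back();  // drop one token and retry with the rest
        // Pre-fix bug: calling search_tokens(tokens) here resets both
        // thresholds to their defaults. The fix forwards them explicitly,
        // exactly as the added line above does:
        return search_tokens(tokens, drop_tokens_threshold, typo_tokens_threshold);
    }
    return num_results;
}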

diff --git a/test/collection_grouping_test.cpp b/test/collection_grouping_test.cpp

@@ -287,7 +287,7 @@ TEST_F(CollectionGroupingTest, GroupingWithMultiFieldRelevance) {
     auto results = coll1->search("Dustin Kensrue Down There by the Train",
                                  {"title", "artist"}, "", {}, {}, {0}, 10, 1, FREQUENCY,
-                                 {false}, Index::DROP_TOKENS_THRESHOLD,
+                                 {false}, 10,
                                  spp::sparse_hash_set<std::string>(),
                                  spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
                                  "", 10,

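This and the remaining test files change for a single reason: with the defaults now 1, any test that depended on the old query-expansion behaviour must pass the thresholds explicitly. That is why Index::DROP_TOKENS_THRESHOLD above becomes a literal 10 (the constant itself now equals 1), and why the calls below spell out so many arguments: judging by the header defaults, the 10 after the prefix flags is drop_tokens_threshold and the trailing 10 is typo_tokens_threshold, and since C++ has no named arguments, overriding a late parameter means supplying every default before it. A self-contained toy model of that mechanic (hypothetical names, not the real Collection::search signature):

#include <cstddef>
#include <iostream>
#include <string>

// Toy model: a late defaulted parameter can only be overridden positionally,
// so callers must also spell out every default that precedes it.
void search(const std::string& query,
            size_t drop_tokens_threshold = 1,    // new default (was 10)
            size_t typo_tokens_threshold = 1) {  // new default (was 100)
    std::cout << query << ": drop=" << drop_tokens_threshold
              << ", typo=" << typo_tokens_threshold << "\n";
}

int main() {
    search("rocket launch");          // new behaviour: expand only on zero hits
    search("rocket launch", 10, 10);  // tests pin the old, more eager expansion
    return 0;
}
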
diff --git a/test/collection_locale_test.cpp b/test/collection_locale_test.cpp

@@ -349,7 +349,10 @@ TEST_F(CollectionLocaleTest, KoreanTextPrefixConsonant) {
     // To ensure that NFKD works, we will test for both ᄀ (Hangul Choseong Kiyeok)
     auto results = coll1->search("서울특별시 ᄀ",
-                                 {"title"}, "", {}, sort_fields, {0}, 10, 1, FREQUENCY, {true}).get();
+                                 {"title"}, "", {}, sort_fields, {0}, 10, 1, FREQUENCY, {true}, 10,
+                                 spp::sparse_hash_set<std::string>(),
+                                 spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
+                                 "", 10).get();
     ASSERT_EQ(6, results["found"].get<size_t>());
     ASSERT_EQ(6, results["hits"].size());
@@ -357,7 +360,10 @@ TEST_F(CollectionLocaleTest, KoreanTextPrefixConsonant) {
     // and ㄱ (Hangul Letter Kiyeok)
     results = coll1->search("서울특별시 ㄱ",
-                            {"title"}, "", {}, sort_fields, {0}, 10, 1, FREQUENCY, {true}).get();
+                            {"title"}, "", {}, sort_fields, {0}, 10, 1, FREQUENCY, {true}, 10,
+                            spp::sparse_hash_set<std::string>(),
+                            spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
+                            "", 10).get();
     ASSERT_EQ(6, results["found"].get<size_t>());
     ASSERT_EQ(6, results["hits"].size());
@@ -365,7 +371,10 @@ TEST_F(CollectionLocaleTest, KoreanTextPrefixConsonant) {
     // search for full word
     results = coll1->search("서울특별시 관",
-                            {"title"}, "", {}, sort_fields, {0}, 10, 1, FREQUENCY, {true}).get();
+                            {"title"}, "", {}, sort_fields, {0}, 10, 1, FREQUENCY, {true}, 10,
+                            spp::sparse_hash_set<std::string>(),
+                            spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
+                            "", 10).get();
     ASSERT_EQ(6, results["found"].get<size_t>());
     ASSERT_EQ(6, results["hits"].size());
@@ -407,7 +416,10 @@ TEST_F(CollectionLocaleTest, KoreanTextPrefixVowel) {
     std::vector<sort_by> sort_fields = { sort_by(sort_field_const::text_match, "DESC"), sort_by("points", "DESC") };
     auto results = coll1->search("서울특별시 고",
-                                 {"title"}, "", {}, sort_fields, {0}, 10, 1, FREQUENCY, {true}).get();
+                                 {"title"}, "", {}, sort_fields, {0}, 10, 1, FREQUENCY, {true}, 10,
+                                 spp::sparse_hash_set<std::string>(),
+                                 spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
+                                 "", 10).get();
     ASSERT_EQ(6, results["found"].get<size_t>());
     ASSERT_EQ(6, results["hits"].size());

diff --git a/test/collection_sorting_test.cpp b/test/collection_sorting_test.cpp

@@ -438,7 +438,10 @@ TEST_F(CollectionSortingTest, ThreeSortFieldsTextMatchLast) {
     std::vector<sort_by> sort_fields = { sort_by("popularity", "DESC"), sort_by("points", "DESC"), sort_by(sort_field_const::text_match, "DESC") };
     auto res = coll1->search("grant",
-                             {"title","artist"}, "", {}, sort_fields, {1}, 10, 1, FREQUENCY).get();
+                             {"title","artist"}, "", {}, sort_fields, {1}, 10, 1, FREQUENCY, {false}, 10,
+                             spp::sparse_hash_set<std::string>(),
+                             spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
+                             "", 10).get();
     ASSERT_EQ(2, res["found"].get<size_t>());
     ASSERT_STREQ("1", res["hits"][0]["document"]["id"].get<std::string>().c_str());
@@ -479,7 +482,10 @@ TEST_F(CollectionSortingTest, SingleFieldTextMatchScoreDefault) {
     auto results = coll1->search("alpha",
                                  {"title"}, "", {}, sort_fields, {2}, 10, 1, FREQUENCY,
-                                 {false}, 10).get();
+                                 {false}, 10,
+                                 spp::sparse_hash_set<std::string>(),
+                                 spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
+                                 "", 10).get();
     ASSERT_EQ(3, results["found"].get<size_t>());
     ASSERT_EQ(3, results["hits"].size());

diff --git a/test/collection_specific_test.cpp b/test/collection_specific_test.cpp

@@ -410,7 +410,10 @@ TEST_F(CollectionSpecificTest, PrefixVsExactMatch) {
     }
     auto results = coll1->search("ration",
-                                 {"title"}, "", {}, {}, {1}, 10, 1, FREQUENCY, {true}).get();
+                                 {"title"}, "", {}, {}, {1}, 10, 1, FREQUENCY, {true}, 10,
+                                 spp::sparse_hash_set<std::string>(),
+                                 spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
+                                 "", 10).get();
     ASSERT_EQ(4, results["found"].get<size_t>());
     ASSERT_EQ(4, results["hits"].size());
@@ -818,7 +821,7 @@ TEST_F(CollectionSpecificTest, PrefixSearchOnlyOnLastToken) {
                                  "", {}, {}, {1}, 10,
                                  1, FREQUENCY, {true},
                                  0, spp::sparse_hash_set<std::string>(),
-                                 spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "description", 20, {}, {}, {}, 0,
+                                 spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "concat", 20, {}, {}, {}, 0,
                                  "<mark>", "</mark>").get();
     ASSERT_EQ(0, results["hits"][0]["highlights"].size());

diff --git a/test/collection_test.cpp b/test/collection_test.cpp

@@ -151,7 +151,12 @@ TEST_F(CollectionTest, ExactSearchShouldBeStable) {
 TEST_F(CollectionTest, PhraseSearch) {
     std::vector<std::string> facets;
-    nlohmann::json results = collection->search("rocket launch", query_fields, "", facets, sort_fields, {0}, 10).get();
+    nlohmann::json results = collection->search("rocket launch", query_fields, "", facets, sort_fields, {0}, 10,
+                                                1, FREQUENCY,
+                                                {false}, 10,
+                                                spp::sparse_hash_set<std::string>(),
+                                                spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
+                                                "", 10).get();
     ASSERT_EQ(5, results["hits"].size());
     ASSERT_EQ(5, results["found"].get<uint32_t>());
@@ -180,7 +185,12 @@ TEST_F(CollectionTest, PhraseSearch) {
     // Check ASC sort order
     std::vector<sort_by> sort_fields_asc = { sort_by(sort_field_const::text_match, "DESC"), sort_by("points", "ASC") };
-    results = collection->search("rocket launch", query_fields, "", facets, sort_fields_asc, {0}, 10).get();
+    results = collection->search("rocket launch", query_fields, "", facets, sort_fields_asc, {0}, 10,
+                                 1, FREQUENCY,
+                                 {false}, 10,
+                                 spp::sparse_hash_set<std::string>(),
+                                 spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
+                                 "", 10).get();
     ASSERT_EQ(5, results["hits"].size());
     ASSERT_EQ(5, results["found"].get<uint32_t>());
@@ -194,7 +204,12 @@ TEST_F(CollectionTest, PhraseSearch) {
     }
     // Check pagination
-    results = collection->search("rocket launch", query_fields, "", facets, sort_fields, {0}, 3).get();
+    results = collection->search("rocket launch", query_fields, "", facets, sort_fields, {0}, 3,
+                                 1, FREQUENCY,
+                                 {false}, 10,
+                                 spp::sparse_hash_set<std::string>(),
+                                 spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
+                                 "", 10).get();
     ASSERT_EQ(3, results["hits"].size());
     ASSERT_EQ(5, results["found"].get<uint32_t>());
@@ -212,7 +227,12 @@ TEST_F(CollectionTest, PhraseSearch) {
 TEST_F(CollectionTest, SearchWithExcludedTokens) {
     std::vector<std::string> facets;
-    nlohmann::json results = collection->search("how -propellants -are", query_fields, "", facets, sort_fields, {0}, 10).get();
+    nlohmann::json results = collection->search("how -propellants -are", query_fields, "", facets, sort_fields, {0}, 10,
+                                                1, FREQUENCY,
+                                                {false}, 10,
+                                                spp::sparse_hash_set<std::string>(),
+                                                spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
+                                                "", 10).get();
     ASSERT_EQ(2, results["hits"].size());
     ASSERT_EQ(2, results["found"].get<uint32_t>());
@@ -276,7 +296,10 @@ TEST_F(CollectionTest, SkipUnindexedTokensDuringPhraseSearch) {
     // should not try to drop tokens to expand query
     results.clear();
-    results = collection->search("the a", query_fields, "", facets, sort_fields, {0}, 10, 1, FREQUENCY, {false}, 10).get();
+    results = collection->search("the a", query_fields, "", facets, sort_fields, {0}, 10, 1, FREQUENCY, {false}, 10,
+                                 spp::sparse_hash_set<std::string>(),
+                                 spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
+                                 "", 10).get();
     ASSERT_EQ(9, results["hits"].size());
     results.clear();
@@ -322,7 +345,12 @@ TEST_F(CollectionTest, PartialPhraseSearch) {
 TEST_F(CollectionTest, QueryWithTypo) {
     std::vector<std::string> facets;
-    nlohmann::json results = collection->search("kind biologcal", query_fields, "", facets, sort_fields, {2}, 3).get();
+    nlohmann::json results = collection->search("kind biologcal", query_fields, "", facets, sort_fields, {2}, 3,
+                                                1, FREQUENCY,
+                                                {false}, 10,
+                                                spp::sparse_hash_set<std::string>(),
+                                                spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
+                                                "", 10).get();
     ASSERT_EQ(3, results["hits"].size());
     std::vector<std::string> ids = {"19", "3", "20"};
@@ -335,7 +363,12 @@ TEST_F(CollectionTest, QueryWithTypo) {
     }
     results.clear();
-    results = collection->search("fer thx", query_fields, "", facets, sort_fields, {1}, 3).get();
+    results = collection->search("fer thx", query_fields, "", facets, sort_fields, {1}, 3,
+                                 1, FREQUENCY,
+                                 {false}, 10,
+                                 spp::sparse_hash_set<std::string>(),
+                                 spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
+                                 "", 10).get();
     ids = {"1", "10", "13"};
     ASSERT_EQ(3, results["hits"].size());
@@ -411,7 +444,9 @@ TEST_F(CollectionTest, TypoTokenRankedByScoreAndFrequency) {
 TEST_F(CollectionTest, TextContainingAnActualTypo) {
     // A line contains "ISX" but not "what" - need to ensure that correction to "ISS what" happens
     std::vector<std::string> facets;
-    nlohmann::json results = collection->search("ISX what", query_fields, "", facets, sort_fields, {1}, 4, 1, FREQUENCY, {false}).get();
+    nlohmann::json results = collection->search("ISX what", query_fields, "", facets, sort_fields, {1}, 4, 1, FREQUENCY, {false},
+                                                10, spp::sparse_hash_set<std::string>(), spp::sparse_hash_set<std::string>(),
+                                                10, "", 30, 5, "", 10).get();
     ASSERT_EQ(4, results["hits"].size());
     ASSERT_EQ(13, results["found"].get<uint32_t>());
@@ -425,7 +460,10 @@ TEST_F(CollectionTest, TextContainingAnActualTypo) {
     }
     // Record containing exact token match should appear first
-    results = collection->search("ISX", query_fields, "", facets, sort_fields, {1}, 10, 1, FREQUENCY, {false}).get();
+    results = collection->search("ISX", query_fields, "", facets, sort_fields, {1}, 10, 1, FREQUENCY, {false}, 10,
+                                 spp::sparse_hash_set<std::string>(),
+                                 spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
+                                 "", 10).get();
     ASSERT_EQ(8, results["hits"].size());
     ASSERT_EQ(8, results["found"].get<uint32_t>());
@@ -547,7 +585,10 @@ TEST_F(CollectionTest, PrefixSearching) {
         ASSERT_STREQ(id.c_str(), result_id.c_str());
     }
-    results = collection->search("what ex", query_fields, "", facets, sort_fields, {0}, 10, 1, MAX_SCORE, {true}).get();
+    results = collection->search("what ex", query_fields, "", facets, sort_fields, {0}, 10, 1, MAX_SCORE, {true}, 10,
+                                 spp::sparse_hash_set<std::string>(),
+                                 spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
+                                 "", 10).get();
     ASSERT_EQ(9, results["hits"].size());
     ids = {"6", "12", "19", "22", "13", "8", "15", "24", "21"};
@@ -559,7 +600,10 @@ TEST_F(CollectionTest, PrefixSearching) {
     }
     // restrict to only 2 results and differentiate between MAX_SCORE and FREQUENCY
-    results = collection->search("t", query_fields, "", facets, sort_fields, {0}, 2, 1, MAX_SCORE, {true}).get();
+    results = collection->search("t", query_fields, "", facets, sort_fields, {0}, 2, 1, MAX_SCORE, {true}, 10,
+                                 spp::sparse_hash_set<std::string>(),
+                                 spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
+                                 "", 10).get();
     ASSERT_EQ(2, results["hits"].size());
     ids = {"19", "22"};
@@ -570,7 +614,10 @@ TEST_F(CollectionTest, PrefixSearching) {
         ASSERT_STREQ(id.c_str(), result_id.c_str());
     }
-    results = collection->search("t", query_fields, "", facets, sort_fields, {0}, 2, 1, FREQUENCY, {true}).get();
+    results = collection->search("t", query_fields, "", facets, sort_fields, {0}, 2, 1, FREQUENCY, {true}, 10,
+                                 spp::sparse_hash_set<std::string>(),
+                                 spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
+                                 "", 10).get();
     ASSERT_EQ(2, results["hits"].size());
     ids = {"19", "22"};