Mirror of https://github.com/typesense/typesense.git
Change default value of typo/drop tokens threshold to 1.
parent 8c46fed1b4
commit 26351a6984
@@ -292,7 +292,7 @@ private:
 
 public:
     // for limiting number of results on multiple candidates / query rewrites
-    enum {TYPO_TOKENS_THRESHOLD = 100};
+    enum {TYPO_TOKENS_THRESHOLD = 1};
 
     // for limiting number of fields that can be searched on
     enum {FIELD_LIMIT_NUM = 100};
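This constant is what the comment above alludes to: candidate token combinations (typo corrections and other query rewrites) keep being expanded only while fewer than TYPO_TOKENS_THRESHOLD results have been collected, so the new default of 1 stops rewriting as soon as anything matches. A minimal sketch of that gating, using hypothetical Candidate/Result types and a match callback (the diff shows only the constant, not the loop):

#include <cstddef>
#include <functional>
#include <vector>

// Hypothetical stand-ins for illustration; not Typesense's real types.
struct Candidate {};   // one typo-corrected / rewritten token combination
struct Result {};

// Try candidate combinations only while we still have too few results.
// With typo_tokens_threshold == 1, the loop stops after the first
// combination that produces any hit at all.
void expand_candidates(const std::vector<Candidate>& candidates,
                       std::size_t typo_tokens_threshold,
                       const std::function<void(const Candidate&, std::vector<Result>&)>& match,
                       std::vector<Result>& results) {
    for (const Candidate& c : candidates) {
        if (results.size() >= typo_tokens_threshold) {
            break;  // enough results; skip the costlier rewrites
        }
        match(c, results);
    }
}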
@@ -301,7 +301,7 @@ public:
 
     // If the number of results found is less than this threshold, Typesense will attempt to drop the tokens
     // in the query that have the least individual hits one by one until enough results are found.
-    static const int DROP_TOKENS_THRESHOLD = 10;
+    static const int DROP_TOKENS_THRESHOLD = 1;
 
     Index() = delete;
 
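The two comment lines above describe the drop-tokens loop this constant controls. A compact sketch of that loop, assuming per-token hit counts are available; Token and the run_search callback are illustrative stand-ins, not Typesense's real internals:

#include <algorithm>
#include <cstddef>
#include <functional>
#include <string>
#include <vector>

// Illustrative sketch of the drop-tokens retry described in the comment;
// the commit itself only changes the threshold's default from 10 to 1.
struct Token { std::string text; std::size_t num_hits; };

std::size_t search_with_drop(std::vector<Token> query,
                             std::size_t drop_tokens_threshold,
                             const std::function<std::size_t(const std::vector<Token>&)>& run_search) {
    std::size_t found = run_search(query);
    // Drop the token with the fewest individual hits, one per iteration,
    // re-running the search until enough results are found.
    while (found < drop_tokens_threshold && query.size() > 1) {
        auto least = std::min_element(
            query.begin(), query.end(),
            [](const Token& a, const Token& b) { return a.num_hits < b.num_hits; });
        query.erase(least);
        found = run_search(query);
    }
    return found;
}

With the default lowered from 10 to 1, this loop effectively fires only when the full query matches nothing at all, rather than padding small result sets with looser matches.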
@@ -2025,7 +2025,7 @@ void Index::search_field(const uint8_t & field_id,
                      num_tokens_dropped, field, filter_ids, filter_ids_length, curated_ids,facets,
                      sort_fields, num_typos,searched_queries, topster, groups_processed, all_result_ids,
                      all_result_ids_len, field_num_results, group_limit, group_by_fields, prioritize_exact_match,
-                     token_order, prefix, combination_limit);
+                     token_order, prefix, drop_tokens_threshold, typo_tokens_threshold, combination_limit);
        }
    }
 
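The hunk above is the plumbing for the same change: the token-drop retry inside Index::search_field previously omitted the thresholds, so the recursion silently reverted to the class defaults; it now forwards the caller-supplied values. A heavily abbreviated sketch of the forwarding pattern (the real signature has many more parameters, and this shape is inferred from the hunk, not copied from the source):

#include <cstddef>

// Abbreviated sketch of the forwarding fix. Before this commit the
// recursive call left the threshold arguments off, re-applying the class
// defaults on every retry; now the per-request values survive the recursion.
void search_field(int num_tokens_dropped,
                  std::size_t drop_tokens_threshold,
                  std::size_t typo_tokens_threshold,
                  std::size_t combination_limit) {
    bool more_tokens_to_drop = false;  // stand-in for the real retry condition
    if (more_tokens_to_drop) {
        search_field(num_tokens_dropped + 1,
                     drop_tokens_threshold, typo_tokens_threshold,  // forwarded
                     combination_limit);
    }
}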
@@ -287,7 +287,7 @@ TEST_F(CollectionGroupingTest, GroupingWithMultiFieldRelevance) {
 
     auto results = coll1->search("Dustin Kensrue Down There by the Train",
                                  {"title", "artist"}, "", {}, {}, {0}, 10, 1, FREQUENCY,
-                                 {false}, Index::DROP_TOKENS_THRESHOLD,
+                                 {false}, 10,
                                  spp::sparse_hash_set<std::string>(),
                                  spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
                                  "", 10,
@@ -349,7 +349,10 @@ TEST_F(CollectionLocaleTest, KoreanTextPrefixConsonant) {
 
     // To ensure that NFKD works, we will test for both ᄀ (Hangul Choseong Kiyeok)
     auto results = coll1->search("서울특별시 ᄀ",
-                                 {"title"}, "", {}, sort_fields, {0}, 10, 1, FREQUENCY, {true}).get();
+                                 {"title"}, "", {}, sort_fields, {0}, 10, 1, FREQUENCY, {true}, 10,
+                                 spp::sparse_hash_set<std::string>(),
+                                 spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
+                                 "", 10).get();
 
     ASSERT_EQ(6, results["found"].get<size_t>());
     ASSERT_EQ(6, results["hits"].size());
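Every test hunk from here on follows the same mechanical pattern: a search() call that previously leaned on defaults now spells them out positionally, pinning both thresholds back to 10 so the tests keep their old behaviour. An annotated copy of the call above makes the positional literals legible; the parameter names are inferred from context, not shown in the diff:

// Annotation only; the parameter-name comments are inferred, not taken
// from the diff itself.
auto results = coll1->search("서울특별시 ᄀ",
                             {"title"},                            // fields to search
                             "", {}, sort_fields,                  // filter query, facet fields, sort fields
                             {0}, 10, 1,                           // num_typos, per_page, page
                             FREQUENCY, {true},                    // token ordering, prefix flags
                             10,                                   // drop_tokens_threshold, pinned to the old default
                             spp::sparse_hash_set<std::string>(),  // include_fields
                             spp::sparse_hash_set<std::string>(),  // exclude_fields
                             10, "", 30, 5,                        // max facet values, facet query, snippet threshold, highlight affix tokens
                             "",                                   // highlight full fields
                             10).get();                            // typo_tokens_threshold, pinned to the old default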
@@ -357,7 +360,10 @@ TEST_F(CollectionLocaleTest, KoreanTextPrefixConsonant) {
 
     // and ㄱ (Hangul Letter Kiyeok)
     results = coll1->search("서울특별시 ㄱ",
-                            {"title"}, "", {}, sort_fields, {0}, 10, 1, FREQUENCY, {true}).get();
+                            {"title"}, "", {}, sort_fields, {0}, 10, 1, FREQUENCY, {true}, 10,
+                            spp::sparse_hash_set<std::string>(),
+                            spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
+                            "", 10).get();
 
     ASSERT_EQ(6, results["found"].get<size_t>());
     ASSERT_EQ(6, results["hits"].size());
@@ -365,7 +371,10 @@ TEST_F(CollectionLocaleTest, KoreanTextPrefixConsonant) {
 
     // search for full word
     results = coll1->search("서울특별시 관",
-                            {"title"}, "", {}, sort_fields, {0}, 10, 1, FREQUENCY, {true}).get();
+                            {"title"}, "", {}, sort_fields, {0}, 10, 1, FREQUENCY, {true}, 10,
+                            spp::sparse_hash_set<std::string>(),
+                            spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
+                            "", 10).get();
 
     ASSERT_EQ(6, results["found"].get<size_t>());
     ASSERT_EQ(6, results["hits"].size());
@@ -407,7 +416,10 @@ TEST_F(CollectionLocaleTest, KoreanTextPrefixVowel) {
     std::vector<sort_by> sort_fields = { sort_by(sort_field_const::text_match, "DESC"), sort_by("points", "DESC") };
 
     auto results = coll1->search("서울특별시 고",
-                                 {"title"}, "", {}, sort_fields, {0}, 10, 1, FREQUENCY, {true}).get();
+                                 {"title"}, "", {}, sort_fields, {0}, 10, 1, FREQUENCY, {true}, 10,
+                                 spp::sparse_hash_set<std::string>(),
+                                 spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
+                                 "", 10).get();
 
     ASSERT_EQ(6, results["found"].get<size_t>());
     ASSERT_EQ(6, results["hits"].size());
@@ -438,7 +438,10 @@ TEST_F(CollectionSortingTest, ThreeSortFieldsTextMatchLast) {
     std::vector<sort_by> sort_fields = { sort_by("popularity", "DESC"), sort_by("points", "DESC"), sort_by(sort_field_const::text_match, "DESC") };
 
     auto res = coll1->search("grant",
-                             {"title","artist"}, "", {}, sort_fields, {1}, 10, 1, FREQUENCY).get();
+                             {"title","artist"}, "", {}, sort_fields, {1}, 10, 1, FREQUENCY, {false}, 10,
+                             spp::sparse_hash_set<std::string>(),
+                             spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
+                             "", 10).get();
 
     ASSERT_EQ(2, res["found"].get<size_t>());
     ASSERT_STREQ("1", res["hits"][0]["document"]["id"].get<std::string>().c_str());
@@ -479,7 +482,10 @@ TEST_F(CollectionSortingTest, SingleFieldTextMatchScoreDefault) {
 
     auto results = coll1->search("alpha",
                                  {"title"}, "", {}, sort_fields, {2}, 10, 1, FREQUENCY,
-                                 {false}, 10).get();
+                                 {false}, 10,
+                                 spp::sparse_hash_set<std::string>(),
+                                 spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
+                                 "", 10).get();
 
     ASSERT_EQ(3, results["found"].get<size_t>());
     ASSERT_EQ(3, results["hits"].size());
@@ -410,7 +410,10 @@ TEST_F(CollectionSpecificTest, PrefixVsExactMatch) {
     }
 
     auto results = coll1->search("ration",
-                                 {"title"}, "", {}, {}, {1}, 10, 1, FREQUENCY, {true}).get();
+                                 {"title"}, "", {}, {}, {1}, 10, 1, FREQUENCY, {true}, 10,
+                                 spp::sparse_hash_set<std::string>(),
+                                 spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
+                                 "", 10).get();
 
     ASSERT_EQ(4, results["found"].get<size_t>());
     ASSERT_EQ(4, results["hits"].size());
@@ -818,7 +821,7 @@ TEST_F(CollectionSpecificTest, PrefixSearchOnlyOnLastToken) {
                               "", {}, {}, {1}, 10,
                               1, FREQUENCY, {true},
                               0, spp::sparse_hash_set<std::string>(),
-                              spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "description", 20, {}, {}, {}, 0,
+                              spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "concat", 20, {}, {}, {}, 0,
                               "<mark>", "</mark>").get();
 
     ASSERT_EQ(0, results["hits"][0]["highlights"].size());
@@ -151,7 +151,12 @@ TEST_F(CollectionTest, ExactSearchShouldBeStable) {
 
 TEST_F(CollectionTest, PhraseSearch) {
     std::vector<std::string> facets;
-    nlohmann::json results = collection->search("rocket launch", query_fields, "", facets, sort_fields, {0}, 10).get();
+    nlohmann::json results = collection->search("rocket launch", query_fields, "", facets, sort_fields, {0}, 10,
+                                                1, FREQUENCY,
+                                                {false}, 10,
+                                                spp::sparse_hash_set<std::string>(),
+                                                spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
+                                                "", 10).get();
     ASSERT_EQ(5, results["hits"].size());
     ASSERT_EQ(5, results["found"].get<uint32_t>());
 
@@ -180,7 +185,12 @@ TEST_F(CollectionTest, PhraseSearch) {
 
     // Check ASC sort order
     std::vector<sort_by> sort_fields_asc = { sort_by(sort_field_const::text_match, "DESC"), sort_by("points", "ASC") };
-    results = collection->search("rocket launch", query_fields, "", facets, sort_fields_asc, {0}, 10).get();
+    results = collection->search("rocket launch", query_fields, "", facets, sort_fields_asc, {0}, 10,
+                                 1, FREQUENCY,
+                                 {false}, 10,
+                                 spp::sparse_hash_set<std::string>(),
+                                 spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
+                                 "", 10).get();
     ASSERT_EQ(5, results["hits"].size());
     ASSERT_EQ(5, results["found"].get<uint32_t>());
 
@@ -194,7 +204,12 @@ TEST_F(CollectionTest, PhraseSearch) {
     }
 
     // Check pagination
-    results = collection->search("rocket launch", query_fields, "", facets, sort_fields, {0}, 3).get();
+    results = collection->search("rocket launch", query_fields, "", facets, sort_fields, {0}, 3,
+                                 1, FREQUENCY,
+                                 {false}, 10,
+                                 spp::sparse_hash_set<std::string>(),
+                                 spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
+                                 "", 10).get();
     ASSERT_EQ(3, results["hits"].size());
     ASSERT_EQ(5, results["found"].get<uint32_t>());
 
@@ -212,7 +227,12 @@ TEST_F(CollectionTest, PhraseSearch) {
 
 TEST_F(CollectionTest, SearchWithExcludedTokens) {
     std::vector<std::string> facets;
-    nlohmann::json results = collection->search("how -propellants -are", query_fields, "", facets, sort_fields, {0}, 10).get();
+    nlohmann::json results = collection->search("how -propellants -are", query_fields, "", facets, sort_fields, {0}, 10,
+                                                1, FREQUENCY,
+                                                {false}, 10,
+                                                spp::sparse_hash_set<std::string>(),
+                                                spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
+                                                "", 10).get();
 
     ASSERT_EQ(2, results["hits"].size());
     ASSERT_EQ(2, results["found"].get<uint32_t>());
@@ -276,7 +296,10 @@ TEST_F(CollectionTest, SkipUnindexedTokensDuringPhraseSearch) {
 
     // should not try to drop tokens to expand query
     results.clear();
-    results = collection->search("the a", query_fields, "", facets, sort_fields, {0}, 10, 1, FREQUENCY, {false}, 10).get();
+    results = collection->search("the a", query_fields, "", facets, sort_fields, {0}, 10, 1, FREQUENCY, {false}, 10,
+                                 spp::sparse_hash_set<std::string>(),
+                                 spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
+                                 "", 10).get();
     ASSERT_EQ(9, results["hits"].size());
 
     results.clear();
@@ -322,7 +345,12 @@ TEST_F(CollectionTest, PartialPhraseSearch) {
 
 TEST_F(CollectionTest, QueryWithTypo) {
     std::vector<std::string> facets;
-    nlohmann::json results = collection->search("kind biologcal", query_fields, "", facets, sort_fields, {2}, 3).get();
+    nlohmann::json results = collection->search("kind biologcal", query_fields, "", facets, sort_fields, {2}, 3,
+                                                1, FREQUENCY,
+                                                {false}, 10,
+                                                spp::sparse_hash_set<std::string>(),
+                                                spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
+                                                "", 10).get();
     ASSERT_EQ(3, results["hits"].size());
 
     std::vector<std::string> ids = {"19", "3", "20"};
@@ -335,7 +363,12 @@ TEST_F(CollectionTest, QueryWithTypo) {
     }
 
     results.clear();
-    results = collection->search("fer thx", query_fields, "", facets, sort_fields, {1}, 3).get();
+    results = collection->search("fer thx", query_fields, "", facets, sort_fields, {1}, 3,
+                                 1, FREQUENCY,
+                                 {false}, 10,
+                                 spp::sparse_hash_set<std::string>(),
+                                 spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
+                                 "", 10).get();
     ids = {"1", "10", "13"};
 
     ASSERT_EQ(3, results["hits"].size());
@@ -411,7 +444,9 @@ TEST_F(CollectionTest, TypoTokenRankedByScoreAndFrequency) {
 TEST_F(CollectionTest, TextContainingAnActualTypo) {
     // A line contains "ISX" but not "what" - need to ensure that correction to "ISS what" happens
     std::vector<std::string> facets;
-    nlohmann::json results = collection->search("ISX what", query_fields, "", facets, sort_fields, {1}, 4, 1, FREQUENCY, {false}).get();
+    nlohmann::json results = collection->search("ISX what", query_fields, "", facets, sort_fields, {1}, 4, 1, FREQUENCY, {false},
+                                                10, spp::sparse_hash_set<std::string>(), spp::sparse_hash_set<std::string>(),
+                                                10, "", 30, 5, "", 10).get();
     ASSERT_EQ(4, results["hits"].size());
     ASSERT_EQ(13, results["found"].get<uint32_t>());
 
@@ -425,7 +460,10 @@ TEST_F(CollectionTest, TextContainingAnActualTypo) {
     }
 
     // Record containing exact token match should appear first
-    results = collection->search("ISX", query_fields, "", facets, sort_fields, {1}, 10, 1, FREQUENCY, {false}).get();
+    results = collection->search("ISX", query_fields, "", facets, sort_fields, {1}, 10, 1, FREQUENCY, {false}, 10,
+                                 spp::sparse_hash_set<std::string>(),
+                                 spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
+                                 "", 10).get();
     ASSERT_EQ(8, results["hits"].size());
     ASSERT_EQ(8, results["found"].get<uint32_t>());
 
@@ -547,7 +585,10 @@ TEST_F(CollectionTest, PrefixSearching) {
         ASSERT_STREQ(id.c_str(), result_id.c_str());
     }
 
-    results = collection->search("what ex", query_fields, "", facets, sort_fields, {0}, 10, 1, MAX_SCORE, {true}).get();
+    results = collection->search("what ex", query_fields, "", facets, sort_fields, {0}, 10, 1, MAX_SCORE, {true}, 10,
+                                 spp::sparse_hash_set<std::string>(),
+                                 spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
+                                 "", 10).get();
     ASSERT_EQ(9, results["hits"].size());
     ids = {"6", "12", "19", "22", "13", "8", "15", "24", "21"};
 
@@ -559,7 +600,10 @@ TEST_F(CollectionTest, PrefixSearching) {
     }
 
     // restrict to only 2 results and differentiate between MAX_SCORE and FREQUENCY
-    results = collection->search("t", query_fields, "", facets, sort_fields, {0}, 2, 1, MAX_SCORE, {true}).get();
+    results = collection->search("t", query_fields, "", facets, sort_fields, {0}, 2, 1, MAX_SCORE, {true}, 10,
+                                 spp::sparse_hash_set<std::string>(),
+                                 spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
+                                 "", 10).get();
     ASSERT_EQ(2, results["hits"].size());
     ids = {"19", "22"};
 
@@ -570,7 +614,10 @@ TEST_F(CollectionTest, PrefixSearching) {
         ASSERT_STREQ(id.c_str(), result_id.c_str());
     }
 
-    results = collection->search("t", query_fields, "", facets, sort_fields, {0}, 2, 1, FREQUENCY, {true}).get();
+    results = collection->search("t", query_fields, "", facets, sort_fields, {0}, 2, 1, FREQUENCY, {true}, 10,
+                                 spp::sparse_hash_set<std::string>(),
+                                 spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
+                                 "", 10).get();
     ASSERT_EQ(2, results["hits"].size());
     ids = {"19", "22"};