mirror of
https://github.com/typesense/typesense.git
synced 2025-05-18 04:32:38 +08:00
Stricter bounding of typo correction threshold.
This commit is contained in:
parent
ce7b6e12e9
commit
b6f1885aec
@ -2052,10 +2052,18 @@ void Index::search_field(const uint8_t & field_id,
|
||||
|
||||
/**
 * Bounds the number of typo corrections permitted for a token by its length.
 *
 * @param max_cost  Upper limit on the number of typos requested by the caller.
 * @param token_len Length of the token being matched.
 * @return 0 when token_len < 4, at most 1 when token_len is 4..6, and at most 2
 *         when token_len >= 7; never more than max_cost.
 */
int Index::get_bounded_typo_cost(const size_t max_cost, const size_t token_len) {
    if(token_len < 4) {
        // typo correction is disabled for small tokens
        return 0;
    }

    // 2 typos are allowed only for tokens of 7 or more characters
    const size_t length_cap = (token_len < 7) ? 1 : 2;

    // Clamp while still unsigned: converting max_cost to int *before* the comparison
    // could turn a very large max_cost into a negative value. The clamped result is
    // always in [0, 2], so the final narrowing cast is safe.
    return static_cast<int>(std::min<size_t>(max_cost, length_cap));
}
|
||||
|
||||
void Index::log_leaves(const int cost, const std::string &token, const std::vector<art_leaf *> &leaves) const {
|
||||
|
@ -463,9 +463,9 @@ TEST_F(CollectionSortingTest, SingleFieldTextMatchScoreDefault) {
|
||||
}
|
||||
|
||||
std::vector<std::vector<std::string>> records = {
|
||||
{"Ayxha Beta"},
|
||||
{"Alpha Beta"},
|
||||
{"Alppha Beta"},
|
||||
{"Alpha Beta"},
|
||||
{"Alphas Beta"},
|
||||
};
|
||||
|
||||
for(size_t i=0; i<records.size(); i++) {
|
||||
|
@ -363,13 +363,14 @@ TEST_F(CollectionTest, QueryWithTypo) {
|
||||
}
|
||||
|
||||
results.clear();
|
||||
results = collection->search("fer thx", query_fields, "", facets, sort_fields, {1}, 3,
|
||||
results = collection->search("lauxnch rcket", query_fields, "", facets, sort_fields, {1}, 3,
|
||||
1, FREQUENCY,
|
||||
{false}, 10,
|
||||
spp::sparse_hash_set<std::string>(),
|
||||
spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
|
||||
"", 10).get();
|
||||
ids = {"1", "10", "13"};
|
||||
|
||||
ids = {"8", "1", "17"};
|
||||
|
||||
ASSERT_EQ(3, results["hits"].size());
|
||||
|
||||
@ -442,15 +443,15 @@ TEST_F(CollectionTest, TypoTokenRankedByScoreAndFrequency) {
|
||||
}
|
||||
|
||||
TEST_F(CollectionTest, TextContainingAnActualTypo) {
|
||||
// A line contains "ISX" but not "what" - need to ensure that correction to "ISS what" happens
|
||||
// A line contains "ISSX" but not "what" - need to ensure that correction to "ISSS what" happens
|
||||
std::vector<std::string> facets;
|
||||
nlohmann::json results = collection->search("ISX what", query_fields, "", facets, sort_fields, {1}, 4, 1, FREQUENCY, {false},
|
||||
nlohmann::json results = collection->search("ISSX what", query_fields, "", facets, sort_fields, {1}, 4, 1, FREQUENCY, {false},
|
||||
10, spp::sparse_hash_set<std::string>(), spp::sparse_hash_set<std::string>(),
|
||||
10, "", 30, 5, "", 10).get();
|
||||
ASSERT_EQ(4, results["hits"].size());
|
||||
ASSERT_EQ(13, results["found"].get<uint32_t>());
|
||||
ASSERT_EQ(11, results["found"].get<uint32_t>());
|
||||
|
||||
std::vector<std::string> ids = {"8", "19", "6", "21"};
|
||||
std::vector<std::string> ids = {"19", "6", "21", "22"};
|
||||
|
||||
for(size_t i = 0; i < results["hits"].size(); i++) {
|
||||
nlohmann::json result = results["hits"].at(i);
|
||||
@ -460,14 +461,15 @@ TEST_F(CollectionTest, TextContainingAnActualTypo) {
|
||||
}
|
||||
|
||||
// Record containing exact token match should appear first
|
||||
results = collection->search("ISX", query_fields, "", facets, sort_fields, {1}, 10, 1, FREQUENCY, {false}, 10,
|
||||
results = collection->search("ISSX", query_fields, "", facets, sort_fields, {1}, 10, 1, FREQUENCY, {false}, 10,
|
||||
spp::sparse_hash_set<std::string>(),
|
||||
spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
|
||||
"", 10).get();
|
||||
ASSERT_EQ(8, results["hits"].size());
|
||||
ASSERT_EQ(8, results["found"].get<uint32_t>());
|
||||
|
||||
ids = {"20", "19", "6", "4", "3", "10", "8", "21"};
|
||||
ASSERT_EQ(5, results["hits"].size());
|
||||
ASSERT_EQ(5, results["found"].get<uint32_t>());
|
||||
|
||||
ids = {"20", "19", "6", "3", "21"};
|
||||
|
||||
for(size_t i = 0; i < results["hits"].size(); i++) {
|
||||
nlohmann::json result = results["hits"].at(i);
|
||||
@ -3164,8 +3166,8 @@ TEST_F(CollectionTest, MultiFieldRelevance4) {
|
||||
}
|
||||
|
||||
std::vector<std::vector<std::string>> records = {
|
||||
{"Madras Dreams", "Chennai King"},
|
||||
{"Madurai Express", "Madura Maddy"},
|
||||
{"Maddras Dreams", "Chennai King"},
|
||||
{"Maddurai Express", "Maddura Maddy"},
|
||||
};
|
||||
|
||||
for(size_t i=0; i<records.size(); i++) {
|
||||
@ -3179,7 +3181,7 @@ TEST_F(CollectionTest, MultiFieldRelevance4) {
|
||||
ASSERT_TRUE(coll1->add(doc.dump()).ok());
|
||||
}
|
||||
|
||||
auto results = coll1->search("madras",
|
||||
auto results = coll1->search("maddras",
|
||||
{"title", "artist"}, "", {}, {}, {2}, 10, 1, FREQUENCY,
|
||||
{true}, 10, spp::sparse_hash_set<std::string>(),
|
||||
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 40, {}, {}, {}, 0,
|
||||
|
@ -1,9 +1,9 @@
|
||||
{"points":15,"title":"How are cryogenic rocket propellants delivered for the launch pad?"}
|
||||
{"points":14,"title":"Are there any (free) online data archives for data from instruments on Soviet / Russian missions?"}
|
||||
{"points":13,"title":"Where should I look in ISS to find mouldy food?"}
|
||||
{"points":13,"title":"Where should I look in ISSS to find mouldy food?"}
|
||||
{"points":13,"title":"Is solar system active cryovolcanism a potential viable power source for future colonies?"}
|
||||
{"id": "foo", "points":13,"title":"The heaviest martian spacecraft"}
|
||||
{"points":13,"title":"To what extent are the US modules of ISS based on the Spacelab design?"}
|
||||
{"points":13,"title":"To what extent are the US modules of ISSS based on the Spacelab design?"}
|
||||
{"points":12,"title":"Could future astronauts eat during EVAs?"}
|
||||
{"points":12,"title":"What is the power, requirement of a rocket launch these days?"}
|
||||
{"points":12,"title":"How does plant growing medium not scatter around?"}
|
||||
@ -16,9 +16,9 @@
|
||||
{"points":10,"title":"How late do the launch propellants ionize in a chemical rocket mission?"}
|
||||
{"points":8,"title":"How much does it cost to launch (right from start) a rocket today?"}
|
||||
{"points":16,"title":"Difference between Space Dynamics & Astrodynamics in engineering perspective?"}
|
||||
{"points":18,"title":"What kind of biological research does ISS do then?"}
|
||||
{"points":10,"title":"Which kinds of radiation hit ISX ?"}
|
||||
{"points":7,"title":"What kinds of things have been tossed out of ISS in space?"}
|
||||
{"points":18,"title":"What kind of biological research does ISSS do then?"}
|
||||
{"points":10,"title":"Which kinds of radiation hit ISSX ?"}
|
||||
{"points":7,"title":"What kinds of things have been tossed out of ISSS in space?"}
|
||||
{"points":17,"title":"What does triple redundant closed loop digital avionics system mean?"}
|
||||
{"points":11,"title":"How are rockets guided to follow specific loop trajectory?"}
|
||||
{"points":8,"title":"What do remotely controlled bolts look like?"}
|
Loading…
x
Reference in New Issue
Block a user