Stricter bounding of typo correction threshold.

This commit is contained in:
Kishore Nallan 2021-08-28 16:38:07 +05:30
parent ce7b6e12e9
commit b6f1885aec
4 changed files with 33 additions and 23 deletions

View File

@ -2052,10 +2052,18 @@ void Index::search_field(const uint8_t & field_id,
/**
 * Caps the number of typos tolerated for a token, based on the token's length.
 *
 *  - token_len < 4       : typo correction is disabled (returns 0)
 *  - 4 <= token_len < 7  : at most 1 typo
 *  - token_len >= 7      : at most 2 typos
 *
 * @param max_cost  typo budget requested by the caller
 * @param token_len length of the token being matched
 * @return the smaller of `max_cost` and the length-based cap described above
 */
int Index::get_bounded_typo_cost(const size_t max_cost, const size_t token_len) {
    if(token_len < 4) {
        // typo correction is disabled for small tokens
        return 0;
    }

    // 2-typos are enabled only from a token length of 7 chars; shorter tokens get at most 1
    const size_t length_cap = (token_len < 7) ? 1 : 2;

    // clamp while the value is still unsigned, so that an oversized max_cost
    // cannot wrap to a negative number when narrowed to int
    return static_cast<int>(std::min<size_t>(max_cost, length_cap));
}
void Index::log_leaves(const int cost, const std::string &token, const std::vector<art_leaf *> &leaves) const {

View File

@ -463,9 +463,9 @@ TEST_F(CollectionSortingTest, SingleFieldTextMatchScoreDefault) {
}
std::vector<std::vector<std::string>> records = {
{"Ayxha Beta"},
{"Alpha Beta"},
{"Alppha Beta"},
{"Alpha Beta"},
{"Alphas Beta"},
};
for(size_t i=0; i<records.size(); i++) {

View File

@ -363,13 +363,14 @@ TEST_F(CollectionTest, QueryWithTypo) {
}
results.clear();
results = collection->search("fer thx", query_fields, "", facets, sort_fields, {1}, 3,
results = collection->search("lauxnch rcket", query_fields, "", facets, sort_fields, {1}, 3,
1, FREQUENCY,
{false}, 10,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
"", 10).get();
ids = {"1", "10", "13"};
ids = {"8", "1", "17"};
ASSERT_EQ(3, results["hits"].size());
@ -442,15 +443,15 @@ TEST_F(CollectionTest, TypoTokenRankedByScoreAndFrequency) {
}
TEST_F(CollectionTest, TextContainingAnActualTypo) {
// A line contains "ISX" but not "what" - need to ensure that correction to "ISS what" happens
// A line contains "ISSX" but not "what" - need to ensure that correction to "ISSS what" happens
std::vector<std::string> facets;
nlohmann::json results = collection->search("ISX what", query_fields, "", facets, sort_fields, {1}, 4, 1, FREQUENCY, {false},
nlohmann::json results = collection->search("ISSX what", query_fields, "", facets, sort_fields, {1}, 4, 1, FREQUENCY, {false},
10, spp::sparse_hash_set<std::string>(), spp::sparse_hash_set<std::string>(),
10, "", 30, 5, "", 10).get();
ASSERT_EQ(4, results["hits"].size());
ASSERT_EQ(13, results["found"].get<uint32_t>());
ASSERT_EQ(11, results["found"].get<uint32_t>());
std::vector<std::string> ids = {"8", "19", "6", "21"};
std::vector<std::string> ids = {"19", "6", "21", "22"};
for(size_t i = 0; i < results["hits"].size(); i++) {
nlohmann::json result = results["hits"].at(i);
@ -460,14 +461,15 @@ TEST_F(CollectionTest, TextContainingAnActualTypo) {
}
// Record containing exact token match should appear first
results = collection->search("ISX", query_fields, "", facets, sort_fields, {1}, 10, 1, FREQUENCY, {false}, 10,
results = collection->search("ISSX", query_fields, "", facets, sort_fields, {1}, 10, 1, FREQUENCY, {false}, 10,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
"", 10).get();
ASSERT_EQ(8, results["hits"].size());
ASSERT_EQ(8, results["found"].get<uint32_t>());
ids = {"20", "19", "6", "4", "3", "10", "8", "21"};
ASSERT_EQ(5, results["hits"].size());
ASSERT_EQ(5, results["found"].get<uint32_t>());
ids = {"20", "19", "6", "3", "21"};
for(size_t i = 0; i < results["hits"].size(); i++) {
nlohmann::json result = results["hits"].at(i);
@ -3164,8 +3166,8 @@ TEST_F(CollectionTest, MultiFieldRelevance4) {
}
std::vector<std::vector<std::string>> records = {
{"Madras Dreams", "Chennai King"},
{"Madurai Express", "Madura Maddy"},
{"Maddras Dreams", "Chennai King"},
{"Maddurai Express", "Maddura Maddy"},
};
for(size_t i=0; i<records.size(); i++) {
@ -3179,7 +3181,7 @@ TEST_F(CollectionTest, MultiFieldRelevance4) {
ASSERT_TRUE(coll1->add(doc.dump()).ok());
}
auto results = coll1->search("madras",
auto results = coll1->search("maddras",
{"title", "artist"}, "", {}, {}, {2}, 10, 1, FREQUENCY,
{true}, 10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 40, {}, {}, {}, 0,

View File

@ -1,9 +1,9 @@
{"points":15,"title":"How are cryogenic rocket propellants delivered for the launch pad?"}
{"points":14,"title":"Are there any (free) online data archives for data from instruments on Soviet / Russian missions?"}
{"points":13,"title":"Where should I look in ISS to find mouldy food?"}
{"points":13,"title":"Where should I look in ISSS to find mouldy food?"}
{"points":13,"title":"Is solar system active cryovolcanism a potential viable power source for future colonies?"}
{"id": "foo", "points":13,"title":"The heaviest martian spacecraft"}
{"points":13,"title":"To what extent are the US modules of ISS based on the Spacelab design?"}
{"points":13,"title":"To what extent are the US modules of ISSS based on the Spacelab design?"}
{"points":12,"title":"Could future astronauts eat during EVAs?"}
{"points":12,"title":"What is the power, requirement of a rocket launch these days?"}
{"points":12,"title":"How does plant growing medium not scatter around?"}
@ -16,9 +16,9 @@
{"points":10,"title":"How late do the launch propellants ionize in a chemical rocket mission?"}
{"points":8,"title":"How much does it cost to launch (right from start) a rocket today?"}
{"points":16,"title":"Difference between Space Dynamics & Astrodynamics in engineering perspective?"}
{"points":18,"title":"What kind of biological research does ISS do then?"}
{"points":10,"title":"Which kinds of radiation hit ISX ?"}
{"points":7,"title":"What kinds of things have been tossed out of ISS in space?"}
{"points":18,"title":"What kind of biological research does ISSS do then?"}
{"points":10,"title":"Which kinds of radiation hit ISSX ?"}
{"points":7,"title":"What kinds of things have been tossed out of ISSS in space?"}
{"points":17,"title":"What does triple redundant closed loop digital avionics system mean?"}
{"points":11,"title":"How are rockets guided to follow specific loop trajectory?"}
{"points":8,"title":"What do remotely controlled bolts look like?"}