diff --git a/src/index.cpp b/src/index.cpp index ac0f953f..f483cfdf 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -2052,10 +2052,18 @@ void Index::search_field(const uint8_t & field_id, int Index::get_bounded_typo_cost(const size_t max_cost, const size_t token_len) { int bounded_cost = max_cost; - if(token_len > 0 && max_cost >= token_len && (token_len == 1 || token_len == 2)) { - bounded_cost = token_len - 1; + + if(token_len < 4) { + // typo correction is disabled for small tokens + return 0; } - return bounded_cost; + + if(token_len < 7) { + // 2-typos are enabled only at token length of 7 chars + return std::min(max_cost, 1); + } + + return std::min(max_cost, 2); } void Index::log_leaves(const int cost, const std::string &token, const std::vector &leaves) const { diff --git a/test/collection_sorting_test.cpp b/test/collection_sorting_test.cpp index 8e088361..245aa490 100644 --- a/test/collection_sorting_test.cpp +++ b/test/collection_sorting_test.cpp @@ -463,9 +463,9 @@ TEST_F(CollectionSortingTest, SingleFieldTextMatchScoreDefault) { } std::vector> records = { - {"Ayxha Beta"}, - {"Alpha Beta"}, {"Alppha Beta"}, + {"Alpha Beta"}, + {"Alphas Beta"}, }; for(size_t i=0; isearch("fer thx", query_fields, "", facets, sort_fields, {1}, 3, + results = collection->search("lauxnch rcket", query_fields, "", facets, sort_fields, {1}, 3, 1, FREQUENCY, {false}, 10, spp::sparse_hash_set(), spp::sparse_hash_set(), 10, "", 30, 5, "", 10).get(); - ids = {"1", "10", "13"}; + + ids = {"8", "1", "17"}; ASSERT_EQ(3, results["hits"].size()); @@ -442,15 +443,15 @@ TEST_F(CollectionTest, TypoTokenRankedByScoreAndFrequency) { } TEST_F(CollectionTest, TextContainingAnActualTypo) { - // A line contains "ISX" but not "what" - need to ensure that correction to "ISS what" happens + // A line contains "ISSX" but not "what" - need to ensure that correction to "ISSS what" happens std::vector facets; - nlohmann::json results = collection->search("ISX what", query_fields, "", facets, sort_fields, {1}, 4, 1, FREQUENCY, {false}, + nlohmann::json results = collection->search("ISSX what", query_fields, "", facets, sort_fields, {1}, 4, 1, FREQUENCY, {false}, 10, spp::sparse_hash_set(), spp::sparse_hash_set(), 10, "", 30, 5, "", 10).get(); ASSERT_EQ(4, results["hits"].size()); - ASSERT_EQ(13, results["found"].get()); + ASSERT_EQ(11, results["found"].get()); - std::vector ids = {"8", "19", "6", "21"}; + std::vector ids = {"19", "6", "21", "22"}; for(size_t i = 0; i < results["hits"].size(); i++) { nlohmann::json result = results["hits"].at(i); @@ -460,14 +461,15 @@ TEST_F(CollectionTest, TextContainingAnActualTypo) { } // Record containing exact token match should appear first - results = collection->search("ISX", query_fields, "", facets, sort_fields, {1}, 10, 1, FREQUENCY, {false}, 10, + results = collection->search("ISSX", query_fields, "", facets, sort_fields, {1}, 10, 1, FREQUENCY, {false}, 10, spp::sparse_hash_set(), spp::sparse_hash_set(), 10, "", 30, 5, "", 10).get(); - ASSERT_EQ(8, results["hits"].size()); - ASSERT_EQ(8, results["found"].get()); - ids = {"20", "19", "6", "4", "3", "10", "8", "21"}; + ASSERT_EQ(5, results["hits"].size()); + ASSERT_EQ(5, results["found"].get()); + + ids = {"20", "19", "6", "3", "21"}; for(size_t i = 0; i < results["hits"].size(); i++) { nlohmann::json result = results["hits"].at(i); @@ -3164,8 +3166,8 @@ TEST_F(CollectionTest, MultiFieldRelevance4) { } std::vector> records = { - {"Madras Dreams", "Chennai King"}, - {"Madurai Express", "Madura Maddy"}, + {"Maddras Dreams", "Chennai King"}, + {"Maddurai Express", "Maddura Maddy"}, }; for(size_t i=0; iadd(doc.dump()).ok()); } - auto results = coll1->search("madras", + auto results = coll1->search("maddras", {"title", "artist"}, "", {}, {}, {2}, 10, 1, FREQUENCY, {true}, 10, spp::sparse_hash_set(), spp::sparse_hash_set(), 10, "", 30, 4, "", 40, {}, {}, {}, 0, diff --git a/test/documents.jsonl b/test/documents.jsonl index ae97bbd6..8caaa289 100644 --- a/test/documents.jsonl +++ b/test/documents.jsonl @@ -1,9 +1,9 @@ {"points":15,"title":"How are cryogenic rocket propellants delivered for the launch pad?"} {"points":14,"title":"Are there any (free) online data archives for data from instruments on Soviet / Russian missions?"} -{"points":13,"title":"Where should I look in ISS to find mouldy food?"} +{"points":13,"title":"Where should I look in ISSS to find mouldy food?"} {"points":13,"title":"Is solar system active cryovolcanism a potential viable power source for future colonies?"} {"id": "foo", "points":13,"title":"The heaviest martian spacecraft"} -{"points":13,"title":"To what extent are the US modules of ISS based on the Spacelab design?"} +{"points":13,"title":"To what extent are the US modules of ISSS based on the Spacelab design?"} {"points":12,"title":"Could future astronauts eat during EVAs?"} {"points":12,"title":"What is the power, requirement of a rocket launch these days?"} {"points":12,"title":"How does plant growing medium not scatter around?"} @@ -16,9 +16,9 @@ {"points":10,"title":"How late do the launch propellants ionize in a chemical rocket mission?"} {"points":8,"title":"How much does it cost to launch (right from start) a rocket today?"} {"points":16,"title":"Difference between Space Dynamics & Astrodynamics in engineering perspective?"} -{"points":18,"title":"What kind of biological research does ISS do then?"} -{"points":10,"title":"Which kinds of radiation hit ISX ?"} -{"points":7,"title":"What kinds of things have been tossed out of ISS in space?"} +{"points":18,"title":"What kind of biological research does ISSS do then?"} +{"points":10,"title":"Which kinds of radiation hit ISSX ?"} +{"points":7,"title":"What kinds of things have been tossed out of ISSS in space?"} {"points":17,"title":"What does triple redundant closed loop digital avionics system mean?"} {"points":11,"title":"How are rockets guided to follow specific loop trajectory?"} {"points":8,"title":"What do remotely controlled bolts look like?"} \ No newline at end of file