Typo and drop tokens thresholds must be applied independently.

This commit is contained in:
Kishore Nallan 2021-07-16 13:39:52 +05:30
parent 0ae718d067
commit 672c895805
3 changed files with 39 additions and 3 deletions

View File

@ -1984,6 +1984,12 @@ void Index::search_field(const uint8_t & field_id,
// when no more costs are left for this token
if(token_to_costs[token_index].empty()) {
// we can try to drop the token and search with remaining tokens
if(field_num_results >= drop_tokens_threshold) {
// but if drop_tokens_threshold is breached, we are done
return ;
}
token_to_costs.erase(token_to_costs.begin()+token_index);
search_tokens.erase(search_tokens.begin()+token_index);
query_tokens.erase(query_tokens.begin()+token_index);
@ -2010,8 +2016,8 @@ void Index::search_field(const uint8_t & field_id,
resume_typo_loop:
if(field_num_results >= drop_tokens_threshold || field_num_results >= typo_tokens_threshold) {
// if either threshold is breached, we are done
if(field_num_results >= typo_tokens_threshold) {
// if typo threshold is breached, we are done
return ;
}
@ -2022,6 +2028,11 @@ void Index::search_field(const uint8_t & field_id,
if(!query_tokens.empty() && num_tokens_dropped < query_tokens.size()) {
// Drop tokens from right until (len/2 + 1), and then from left until (len/2 + 1)
if(field_num_results >= drop_tokens_threshold) {
// if drop_tokens_threshold is breached, we are done
return ;
}
std::vector<token_t> truncated_tokens;
num_tokens_dropped++;

View File

@ -309,6 +309,31 @@ TEST_F(CollectionSpecificTest, ExactMatchOnPrefix) {
collectionManager.drop_collection("coll1");
}
// Indexes one document whose title contains the token "SG25026HP" and checks
// that a query for "SG25026H" (the same token minus its last character) still
// returns that document as the only hit.
// NOTE(review): going by the test name, the `{false}` argument is presumably the
// per-field prefix flag and `{2}` the allowed number of typos — confirm against
// the Collection::search signature.
TEST_F(CollectionSpecificTest, TypoPrefixSearchWithoutPrefixEnabled) {
    std::vector<field> schema_fields = {
        field("title", field_types::STRING, false),
        field("points", field_types::INT32, false),
    };

    Collection* coll1 = collectionManager.create_collection("coll1", 1, schema_fields, "points").get();

    // Single document to be matched by the query below.
    nlohmann::json doc = {
        {"id", "0"},
        {"title", "Cisco SG25026HP Gigabit Smart Switch"},
        {"points", 100},
    };

    ASSERT_TRUE(coll1->add(doc.dump()).ok());

    auto search_results = coll1->search("SG25026H", {"title"}, "", {}, {}, {2}, 10,
                                        1, FREQUENCY, {false}, 0,
                                        spp::sparse_hash_set<std::string>(),
                                        spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
                                        "", 1).get();

    // Exactly one hit is expected, and it must be the document added above.
    auto& hits = search_results["hits"];
    ASSERT_EQ(1, hits.size());
    ASSERT_EQ("0", hits[0]["document"]["id"].get<std::string>());

    collectionManager.drop_collection("coll1");
}
TEST_F(CollectionSpecificTest, PrefixWithTypos) {
std::vector<field> fields = {field("title", field_types::STRING, false),
field("points", field_types::INT32, false),};

View File

@ -292,7 +292,7 @@ TEST_F(CollectionTest, SkipUnindexedTokensDuringPhraseSearch) {
}
results.clear();
results = collection->search("the a DoesNotExist", query_fields, "", facets, sort_fields, {0}, 10, 1, FREQUENCY, {false}, 0).get();
results = collection->search("the a insurance", query_fields, "", facets, sort_fields, {0}, 10, 1, FREQUENCY, {false}, 0).get();
ASSERT_EQ(0, results["hits"].size());
// with no indexed word