Fixed an issue with prefix searching.

This commit is contained in:
Kishore Nallan 2021-04-23 17:41:33 +05:30
parent aa1cd0acd1
commit 51f57d3dd7
4 changed files with 58 additions and 13 deletions

View File

@ -1318,10 +1318,10 @@ static void art_fuzzy_recurse(unsigned char p, unsigned char c, const art_node *
Also, for prefix searches we don't compare with full leaf key.
*/
const int end_index = prefix ? min(l->key_len, term_len) : l->key_len;
const int iter_len = prefix ? min(l->key_len - 1, term_len) : l->key_len;
// If at any point, `temp_cost > 2*max_cost` we can terminate immediately as we can never recover from that
while(depth < end_index && temp_cost <= 2*max_cost) {
while(depth < iter_len && temp_cost <= 2 * max_cost) {
c = l->key[depth];
temp_cost = levenshtein_dist(depth, p, c, term, term_len, rows[i], rows[j], rows[k]);
printf("leaf char: %c\n", l->key[depth]);
@ -1338,12 +1338,12 @@ static void art_fuzzy_recurse(unsigned char p, unsigned char c, const art_node *
int final_cost = rows[j][columns-1];
if(prefix && term_len < (int) l->key_len && temp_cost >= min_cost && temp_cost <= max_cost) {
if(prefix && term_len < (int) l->key_len - 1 && temp_cost >= min_cost && temp_cost <= max_cost) {
results.push_back(n);
return;
}
if(prefix && term_len >= (int) l->key_len && final_cost >= min_cost && final_cost <= max_cost) {
if(prefix && term_len >= (int) l->key_len - 1 && final_cost >= min_cost && final_cost <= max_cost) {
results.push_back(n);
return;
}

View File

@ -718,8 +718,9 @@ void Index::do_facets(std::vector<facet> & facets, facet_query_t & facet_query,
std::vector<art_leaf *> leaves;
const size_t q_len = prefix_search ? q.length() : q.length() + 1;
art_fuzzy_search(t, (const unsigned char *) q.c_str(),
q.size(), 0, bounded_cost, 10000,
q_len, 0, bounded_cost, 10000,
token_ordering::MAX_SCORE, prefix_search, nullptr, 0, leaves);
for (size_t leaf_index = 0; leaf_index < leaves.size(); leaf_index++) {
@ -1416,7 +1417,7 @@ void Index::collate_included_ids(const std::vector<std::string>& q_included_toke
std::vector<art_leaf *> override_query;
for(const std::string& token: q_included_tokens) {
const size_t token_len = token.length();
const size_t token_len = token.size() + 1;
std::vector<art_leaf*> leaves;
art_fuzzy_search(search_index.at(field), (const unsigned char *) token.c_str(), token_len,

View File

@ -623,7 +623,7 @@ TEST(ArtTest, test_art_fuzzy_search_single_leaf_prefix) {
EXPECT_EQ(1, l->values->ids.at(0));
std::vector<art_leaf*> leaves;
art_fuzzy_search(&t, (const unsigned char *) "aplication", strlen(key), 0, 1, 10, FREQUENCY, true, nullptr, 0, leaves);
art_fuzzy_search(&t, (const unsigned char *) "aplication", strlen(key)-1, 0, 1, 10, FREQUENCY, true, nullptr, 0, leaves);
ASSERT_EQ(1, leaves.size());
res = art_tree_destroy(&t);
@ -783,7 +783,7 @@ TEST(ArtTest, test_art_search_sku_like_tokens) {
for (const auto &key : keys) {
std::vector<art_leaf *> leaves;
art_fuzzy_search(&t, (const unsigned char*)key.c_str(), key.size()+1, 0, 0, 10,
art_fuzzy_search(&t, (const unsigned char*)key.c_str(), key.size(), 0, 0, 10,
FREQUENCY, true, nullptr, 0, leaves);
ASSERT_EQ(1, leaves.size());
ASSERT_STREQ(key.c_str(), (const char *) leaves.at(0)->key);
@ -822,6 +822,19 @@ TEST(ArtTest, test_art_search_ill_like_tokens) {
line++;
}
std::map<std::string, size_t> key_to_count {
std::make_pair("input", 2),
std::make_pair("image", 7),
std::make_pair("instrument", 2),
std::make_pair("in", 10),
std::make_pair("info", 2),
std::make_pair("inventor", 2),
std::make_pair("imageresize", 2),
std::make_pair("id", 5),
std::make_pair("insect", 2),
std::make_pair("ice", 2),
};
for (const auto &key : keys) {
//LOG(INFO) << "Searching for " << key;
art_leaf* l = (art_leaf *) art_search(&t, (const unsigned char *)key.c_str(), key.size()+1);
@ -829,11 +842,15 @@ TEST(ArtTest, test_art_search_ill_like_tokens) {
EXPECT_EQ(1, l->values->ids.getLength());
std::vector<art_leaf *> leaves;
art_fuzzy_search(&t, (const unsigned char*)key.c_str(), key.size()+1, 0, 0, 10,
art_fuzzy_search(&t, (const unsigned char*)key.c_str(), key.size(), 0, 0, 10,
FREQUENCY, true, nullptr, 0, leaves);
ASSERT_EQ(1, leaves.size());
ASSERT_STREQ(key.c_str(), (const char *) leaves.at(0)->key);
if(key_to_count.count(key) != 0) {
ASSERT_EQ(key_to_count[key], leaves.size());
} else {
ASSERT_EQ(1, leaves.size());
ASSERT_STREQ(key.c_str(), (const char *) leaves.at(0)->key);
}
leaves.clear();
@ -872,7 +889,7 @@ TEST(ArtTest, test_art_search_ill_like_tokens2) {
EXPECT_EQ(1, l->values->ids.getLength());
std::vector<art_leaf *> leaves;
art_fuzzy_search(&t, (const unsigned char*)key.c_str(), key.size()+1, 0, 0, 10,
art_fuzzy_search(&t, (const unsigned char*)key.c_str(), key.size(), 0, 0, 10,
FREQUENCY, true, nullptr, 0, leaves);
ASSERT_EQ(1, leaves.size());
@ -891,6 +908,33 @@ TEST(ArtTest, test_art_search_ill_like_tokens2) {
ASSERT_TRUE(res == 0);
}
TEST(ArtTest, test_art_search_roche_chews) {
    // Regression check: with a single indexed key "roche", a fuzzy prefix search
    // for "chews" must return nothing, while an exact non-prefix search for the
    // indexed key itself must still return exactly one leaf.
    art_tree tree;
    int status = art_tree_init(&tree);
    ASSERT_TRUE(status == 0);

    // The key is stored with length size()+1, i.e. including the trailing '\0'
    // that c_str() guarantees.
    const std::string indexed_key = "roche";
    art_document document = get_document((uint32_t) 1);
    ASSERT_TRUE(NULL == art_insert(&tree, (unsigned char *) indexed_key.c_str(),
                                   indexed_key.size()+1, &document, 1));

    std::vector<art_leaf *> matches;

    // Prefix search (flag = true): the term length excludes the terminator.
    // The `0, 2` arguments are presumably the min/max edit cost -- confirm
    // against the art_fuzzy_search declaration.
    const std::string query = "chews";
    art_fuzzy_search(&tree, (const unsigned char*)query.c_str(), query.size(), 0, 2, 10,
                     FREQUENCY, true, nullptr, 0, matches);
    ASSERT_EQ(0, matches.size());

    // Non-prefix search (flag = false): the term length includes the terminator,
    // matching the length used at insertion. `matches` is still empty here
    // because of the assertion above, so the count below reflects this call only.
    art_fuzzy_search(&tree, (const unsigned char*)indexed_key.c_str(), indexed_key.size() + 1, 0, 0, 10,
                     FREQUENCY, false, nullptr, 0, matches);
    ASSERT_EQ(1, matches.size());

    status = art_tree_destroy(&tree);
    ASSERT_TRUE(status == 0);
}
TEST(ArtTest, test_encode_int32) {
unsigned char chars[8];

View File

@ -175,7 +175,7 @@ TEST_F(CollectionFacetingTest, FacetCounts) {
results = coll_array_fields->search("*", query_fields, "", facets, sort_fields, 0, 10, 1, FREQUENCY,
false, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "tags: fine pltinum").get();
spp::sparse_hash_set<std::string>(), 10, "tags: fxne aluminium").get();
ASSERT_EQ(5, results["hits"].size());
ASSERT_EQ(1, results["facet_counts"].size());