From 51f57d3dd7115f3c94f8245df35008bc2bce3307 Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Fri, 23 Apr 2021 17:41:33 +0530 Subject: [PATCH] Fixed an issue with prefix searching. --- src/art.cpp | 8 ++--- src/index.cpp | 5 +-- test/art_test.cpp | 56 +++++++++++++++++++++++++++---- test/collection_faceting_test.cpp | 2 +- 4 files changed, 58 insertions(+), 13 deletions(-) diff --git a/src/art.cpp b/src/art.cpp index c668094f..8b4d6122 100644 --- a/src/art.cpp +++ b/src/art.cpp @@ -1318,10 +1318,10 @@ static void art_fuzzy_recurse(unsigned char p, unsigned char c, const art_node * Also, for prefix searches we don't compare with full leaf key. */ - const int end_index = prefix ? min(l->key_len, term_len) : l->key_len; + const int iter_len = prefix ? min(l->key_len - 1, term_len) : l->key_len; // If at any point, `temp_cost > 2*max_cost` we can terminate immediately as we can never recover from that - while(depth < end_index && temp_cost <= 2*max_cost) { + while(depth < iter_len && temp_cost <= 2 * max_cost) { c = l->key[depth]; temp_cost = levenshtein_dist(depth, p, c, term, term_len, rows[i], rows[j], rows[k]); printf("leaf char: %c\n", l->key[depth]); @@ -1338,12 +1338,12 @@ static void art_fuzzy_recurse(unsigned char p, unsigned char c, const art_node * int final_cost = rows[j][columns-1]; - if(prefix && term_len < (int) l->key_len && temp_cost >= min_cost && temp_cost <= max_cost) { + if(prefix && term_len < (int) l->key_len - 1 && temp_cost >= min_cost && temp_cost <= max_cost) { results.push_back(n); return; } - if(prefix && term_len >= (int) l->key_len && final_cost >= min_cost && final_cost <= max_cost) { + if(prefix && term_len >= (int) l->key_len - 1 && final_cost >= min_cost && final_cost <= max_cost) { results.push_back(n); return; } diff --git a/src/index.cpp b/src/index.cpp index 821878fa..f7861763 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -718,8 +718,9 @@ void Index::do_facets(std::vector & facets, facet_query_t & facet_query, std::vector leaves; + const size_t q_len = prefix_search ? q.length() : q.length() + 1; art_fuzzy_search(t, (const unsigned char *) q.c_str(), - q.size(), 0, bounded_cost, 10000, + q_len, 0, bounded_cost, 10000, token_ordering::MAX_SCORE, prefix_search, nullptr, 0, leaves); for (size_t leaf_index = 0; leaf_index < leaves.size(); leaf_index++) { @@ -1416,7 +1417,7 @@ void Index::collate_included_ids(const std::vector& q_included_toke std::vector override_query; for(const std::string& token: q_included_tokens) { - const size_t token_len = token.length(); + const size_t token_len = token.size() + 1; std::vector leaves; art_fuzzy_search(search_index.at(field), (const unsigned char *) token.c_str(), token_len, diff --git a/test/art_test.cpp b/test/art_test.cpp index 61e7b057..d59eac48 100644 --- a/test/art_test.cpp +++ b/test/art_test.cpp @@ -623,7 +623,7 @@ TEST(ArtTest, test_art_fuzzy_search_single_leaf_prefix) { EXPECT_EQ(1, l->values->ids.at(0)); std::vector leaves; - art_fuzzy_search(&t, (const unsigned char *) "aplication", strlen(key), 0, 1, 10, FREQUENCY, true, nullptr, 0, leaves); + art_fuzzy_search(&t, (const unsigned char *) "aplication", strlen(key)-1, 0, 1, 10, FREQUENCY, true, nullptr, 0, leaves); ASSERT_EQ(1, leaves.size()); res = art_tree_destroy(&t); @@ -783,7 +783,7 @@ TEST(ArtTest, test_art_search_sku_like_tokens) { for (const auto &key : keys) { std::vector leaves; - art_fuzzy_search(&t, (const unsigned char*)key.c_str(), key.size()+1, 0, 0, 10, + art_fuzzy_search(&t, (const unsigned char*)key.c_str(), key.size(), 0, 0, 10, FREQUENCY, true, nullptr, 0, leaves); ASSERT_EQ(1, leaves.size()); ASSERT_STREQ(key.c_str(), (const char *) leaves.at(0)->key); @@ -822,6 +822,19 @@ TEST(ArtTest, test_art_search_ill_like_tokens) { line++; } + std::map key_to_count { + std::make_pair("input", 2), + std::make_pair("image", 7), + std::make_pair("instrument", 2), + std::make_pair("in", 10), + std::make_pair("info", 2), + std::make_pair("inventor", 2), + std::make_pair("imageresize", 2), + std::make_pair("id", 5), + std::make_pair("insect", 2), + std::make_pair("ice", 2), + }; + for (const auto &key : keys) { //LOG(INFO) << "Searching for " << key; art_leaf* l = (art_leaf *) art_search(&t, (const unsigned char *)key.c_str(), key.size()+1); @@ -829,11 +842,15 @@ TEST(ArtTest, test_art_search_ill_like_tokens) { EXPECT_EQ(1, l->values->ids.getLength()); std::vector leaves; - art_fuzzy_search(&t, (const unsigned char*)key.c_str(), key.size()+1, 0, 0, 10, + art_fuzzy_search(&t, (const unsigned char*)key.c_str(), key.size(), 0, 0, 10, FREQUENCY, true, nullptr, 0, leaves); - ASSERT_EQ(1, leaves.size()); - ASSERT_STREQ(key.c_str(), (const char *) leaves.at(0)->key); + if(key_to_count.count(key) != 0) { + ASSERT_EQ(key_to_count[key], leaves.size()); + } else { + ASSERT_EQ(1, leaves.size()); + ASSERT_STREQ(key.c_str(), (const char *) leaves.at(0)->key); + } leaves.clear(); @@ -872,7 +889,7 @@ TEST(ArtTest, test_art_search_ill_like_tokens2) { EXPECT_EQ(1, l->values->ids.getLength()); std::vector leaves; - art_fuzzy_search(&t, (const unsigned char*)key.c_str(), key.size()+1, 0, 0, 10, + art_fuzzy_search(&t, (const unsigned char*)key.c_str(), key.size(), 0, 0, 10, FREQUENCY, true, nullptr, 0, leaves); ASSERT_EQ(1, leaves.size()); @@ -891,6 +908,33 @@ TEST(ArtTest, test_art_search_ill_like_tokens2) { ASSERT_TRUE(res == 0); } +TEST(ArtTest, test_art_search_roche_chews) { + art_tree t; + int res = art_tree_init(&t); + ASSERT_TRUE(res == 0); + + std::vector keys; + keys = {"roche"}; + + art_document doc = get_document((uint32_t) 1); + ASSERT_TRUE(NULL == art_insert(&t, (unsigned char *) keys[0].c_str(), keys[0].size()+1, &doc, 1)); + + std::string term = "chews"; + std::vector leaves; + art_fuzzy_search(&t, (const unsigned char*)term.c_str(), term.size(), 0, 2, 10, + FREQUENCY, true, nullptr, 0, leaves); + + ASSERT_EQ(0, leaves.size()); + + art_fuzzy_search(&t, (const unsigned char*)keys[0].c_str(), keys[0].size() + 1, 0, 0, 10, + FREQUENCY, false, nullptr, 0, leaves); + + ASSERT_EQ(1, leaves.size()); + + res = art_tree_destroy(&t); + ASSERT_TRUE(res == 0); +} + TEST(ArtTest, test_encode_int32) { unsigned char chars[8]; diff --git a/test/collection_faceting_test.cpp b/test/collection_faceting_test.cpp index c4a4b4f4..79865ce0 100644 --- a/test/collection_faceting_test.cpp +++ b/test/collection_faceting_test.cpp @@ -175,7 +175,7 @@ TEST_F(CollectionFacetingTest, FacetCounts) { results = coll_array_fields->search("*", query_fields, "", facets, sort_fields, 0, 10, 1, FREQUENCY, false, Index::DROP_TOKENS_THRESHOLD, spp::sparse_hash_set(), - spp::sparse_hash_set(), 10, "tags: fine pltinum").get(); + spp::sparse_hash_set(), 10, "tags: fxne aluminium").get(); ASSERT_EQ(5, results["hits"].size()); ASSERT_EQ(1, results["facet_counts"].size());