diff --git a/docker/deployment.Dockerfile b/docker/deployment.Dockerfile index 06e52872..55402b2e 100644 --- a/docker/deployment.Dockerfile +++ b/docker/deployment.Dockerfile @@ -1,4 +1,4 @@ -FROM ubuntu:20.04 +FROM ubuntu:22.04 RUN apt-get -y update && apt-get -y install ca-certificates diff --git a/include/art.h b/include/art.h index 0502641c..a9715fac 100644 --- a/include/art.h +++ b/include/art.h @@ -276,9 +276,10 @@ int art_iter_prefix(art_tree *t, const unsigned char *prefix, int prefix_len, ar * Returns leaves that match a given string within a fuzzy distance of max_cost. */ int art_fuzzy_search(art_tree *t, const unsigned char *term, const int term_len, const int min_cost, const int max_cost, - const int max_words, const token_ordering token_order, const bool prefix, - const uint32_t *filter_ids, size_t filter_ids_length, - std::vector &results, const std::set& exclude_leaves = {}); + const size_t max_words, const token_ordering token_order, + const bool prefix, bool last_token, const std::string& prev_token, + const uint32_t *filter_ids, const size_t filter_ids_length, + std::vector &results, std::set& exclude_leaves); void encode_int32(int32_t n, unsigned char *chars); diff --git a/include/field.h b/include/field.h index 6894dcee..bc4ae03c 100644 --- a/include/field.h +++ b/include/field.h @@ -317,7 +317,6 @@ struct field { if (!field.reference.empty()) { field_val[fields::reference] = field.reference; } - if(field.create_from.size() > 0) { field_val[fields::create_from] = field.create_from; if(field.model_path.size() > 0) { diff --git a/src/art.cpp b/src/art.cpp index f778eace..40b028a3 100644 --- a/src/art.cpp +++ b/src/art.cpp @@ -21,6 +21,7 @@ #include #include "art.h" #include "logger.h" +#include "array_utils.h" /** * Macros to manipulate pointer tags @@ -940,10 +941,69 @@ void* art_delete(art_tree *t, const unsigned char *key, int key_len) { return child->max_token_count; }*/ +const uint32_t* get_allowed_doc_ids(art_tree *t, const std::string& prev_token, + const uint32_t* filter_ids, const size_t filter_ids_length, + size_t& prev_token_doc_ids_len) { + + art_leaf* prev_leaf = static_cast( + art_search(t, reinterpret_cast(prev_token.c_str()), prev_token.size() + 1) + ); + + if(prev_token.empty() || !prev_leaf) { + prev_token_doc_ids_len = filter_ids_length; + return filter_ids; + } + + std::vector prev_leaf_ids; + posting_t::merge({prev_leaf->values}, prev_leaf_ids); + + uint32_t* prev_token_doc_ids = nullptr; + + if(filter_ids_length != 0) { + prev_token_doc_ids_len = ArrayUtils::and_scalar(prev_leaf_ids.data(), prev_leaf_ids.size(), + filter_ids, filter_ids_length, + &prev_token_doc_ids); + } else { + prev_token_doc_ids_len = prev_leaf_ids.size(); + prev_token_doc_ids = new uint32_t[prev_token_doc_ids_len]; + std::copy(prev_leaf_ids.begin(), prev_leaf_ids.end(), prev_token_doc_ids); + } + + return prev_token_doc_ids; +} + +bool validate_and_add_leaf(art_leaf* leaf, const bool last_token, const std::string& prev_token, + const uint32_t* allowed_doc_ids, const size_t allowed_doc_ids_len, + std::set& exclude_leaves, const art_leaf* exact_leaf, + std::vector& results) { + + if(leaf == exact_leaf) { + return false; + } + + std::string tok(reinterpret_cast(leaf->key), leaf->key_len - 1); + if(exclude_leaves.count(tok) != 0) { + return false; + } + + if(allowed_doc_ids_len != 0) { + if(!posting_t::contains_atleast_one(leaf->values, allowed_doc_ids, + allowed_doc_ids_len)) { + return false; + } + } + + exclude_leaves.emplace(tok); + results.push_back(leaf); + + return true; +} + int art_topk_iter(const art_node *root, token_ordering token_order, size_t max_results, - const uint32_t* filter_ids, size_t filter_ids_length, - const std::set& exclude_leaves, const art_leaf* exact_leaf, - std::vector& results) { + const art_leaf* exact_leaf, + const bool last_token, const std::string& prev_token, + const uint32_t* allowed_doc_ids, size_t allowed_doc_ids_len, + const art_tree* t, std::set& exclude_leaves, std::vector& results) { printf("INSIDE art_topk_iter: root->type: %d\n", root->type); @@ -957,6 +1017,8 @@ int art_topk_iter(const art_node *root, token_ordering token_order, size_t max_r q.push(root); + size_t num_processed = 0; + while(!q.empty() && results.size() < max_results*4) { art_node *n = (art_node *) q.top(); q.pop(); @@ -972,23 +1034,13 @@ int art_topk_iter(const art_node *root, token_ordering token_order, size_t max_r if (IS_LEAF(n)) { art_leaf *l = (art_leaf *) LEAF_RAW(n); //LOG(INFO) << "END LEAF SCORE: " << l->max_score; + validate_and_add_leaf(l, last_token, prev_token, allowed_doc_ids, allowed_doc_ids_len, + exclude_leaves, exact_leaf, results); - if(filter_ids_length == 0) { - std::string tok(reinterpret_cast(l->key), l->key_len - 1); - if(exclude_leaves.count(tok) != 0 || l == exact_leaf) { - continue; - } - results.push_back(l); - } else { - // we will push leaf only if filter matches with leaf IDs - bool found_atleast_one = posting_t::contains_atleast_one(l->values, filter_ids, filter_ids_length); - if(found_atleast_one) { - std::string tok(reinterpret_cast(l->key), l->key_len - 1); - if(exclude_leaves.count(tok) != 0 || l == exact_leaf) { - continue; - } - results.push_back(l); - } + if (++num_processed % 1024 == 0 && (microseconds( + std::chrono::system_clock::now().time_since_epoch()).count() - search_begin_us) > search_stop_us) { + search_cutoff = true; + break; } continue; @@ -1491,9 +1543,10 @@ static void art_fuzzy_recurse(unsigned char p, unsigned char c, const art_node * * Returns leaves that match a given string within a fuzzy distance of max_cost. */ int art_fuzzy_search(art_tree *t, const unsigned char *term, const int term_len, const int min_cost, const int max_cost, - const int max_words, const token_ordering token_order, const bool prefix, - const uint32_t *filter_ids, size_t filter_ids_length, - std::vector &results, const std::set& exclude_leaves) { + const size_t max_words, const token_ordering token_order, const bool prefix, + bool last_token, const std::string& prev_token, + const uint32_t *filter_ids, const size_t filter_ids_length, + std::vector &results, std::set& exclude_leaves) { std::vector nodes; int irow[term_len + 1]; @@ -1525,8 +1578,15 @@ int art_fuzzy_search(art_tree *t, const unsigned char *term, const int term_len, art_leaf* exact_leaf = (art_leaf *) art_search(t, term, key_len); //LOG(INFO) << "exact_leaf: " << exact_leaf << ", term: " << term << ", term_len: " << term_len; + // documents that contain the previous token and/or filter ids + size_t allowed_doc_ids_len = 0; + const uint32_t* allowed_doc_ids = get_allowed_doc_ids(t, prev_token, filter_ids, filter_ids_length, + allowed_doc_ids_len); + for(auto node: nodes) { - art_topk_iter(node, token_order, max_words, filter_ids, filter_ids_length, exclude_leaves, exact_leaf, results); + art_topk_iter(node, token_order, max_words, exact_leaf, + last_token, prev_token, allowed_doc_ids, allowed_doc_ids_len, + t, exclude_leaves, results); } if(token_order == FREQUENCY) { @@ -1536,7 +1596,11 @@ int art_fuzzy_search(art_tree *t, const unsigned char *term, const int term_len, } if(exact_leaf && min_cost == 0) { - results.insert(results.begin(), exact_leaf); + std::string tok(reinterpret_cast(exact_leaf->key), exact_leaf->key_len - 1); + if(exclude_leaves.count(tok) == 0) { + results.insert(results.begin(), exact_leaf); + exclude_leaves.emplace(tok); + } } if(results.size() > max_words) { @@ -1551,6 +1615,10 @@ int art_fuzzy_search(art_tree *t, const unsigned char *term, const int term_len, << ", filter_ids_length: " << filter_ids_length; }*/ + if(allowed_doc_ids != filter_ids) { + delete [] allowed_doc_ids; + } + return 0; } diff --git a/src/collection.cpp b/src/collection.cpp index 4c70bb39..f111eb34 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -246,7 +246,6 @@ nlohmann::json Collection::get_summary_json() const { field_json[fields::reference] = coll_field.reference; } - fields_arr.push_back(field_json); } diff --git a/src/index.cpp b/src/index.cpp index e734b8b3..9b1b844d 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -3422,12 +3422,12 @@ void Index::fuzzy_search_fields(const std::vector& the_fields, } //LOG(INFO) << "Searching for field: " << the_field.name << ", found token:" << token; + const auto& prev_token = last_token ? token_candidates_vec.back().candidates[0] : ""; std::vector field_leaves; - int max_words = 100000; art_fuzzy_search(search_index.at(the_field.name), (const unsigned char *) token.c_str(), token_len, - costs[token_index], costs[token_index], max_words, token_order, prefix_search, - filter_ids, filter_ids_length, field_leaves, unique_tokens); + costs[token_index], costs[token_index], max_candidates, token_order, prefix_search, + last_token, prev_token, filter_ids, filter_ids_length, field_leaves, unique_tokens); /*auto timeMillis = std::chrono::duration_cast( std::chrono::high_resolution_clock::now() - begin).count(); @@ -3438,60 +3438,17 @@ void Index::fuzzy_search_fields(const std::vector& the_fields, continue; } - uint32_t* prev_token_doc_ids = nullptr; // documents that contain the previous token - size_t prev_token_doc_ids_len = 0; - - if(last_token) { - auto& prev_token = token_candidates_vec.back().candidates[0]; - art_leaf* prev_leaf = static_cast( - art_search(search_index.at(the_field.name), - reinterpret_cast(prev_token.c_str()), - prev_token.size() + 1)); - - if(!prev_leaf) { - continue; - } - - std::vector prev_leaf_ids; - posting_t::merge({prev_leaf->values}, prev_leaf_ids); - - if(filter_ids_length != 0) { - prev_token_doc_ids_len = ArrayUtils::and_scalar(prev_leaf_ids.data(), prev_leaf_ids.size(), - filter_ids, filter_ids_length, - &prev_token_doc_ids); - } else { - prev_token_doc_ids_len = prev_leaf_ids.size(); - prev_token_doc_ids = new uint32_t[prev_token_doc_ids_len]; - std::copy(prev_leaf_ids.begin(), prev_leaf_ids.end(), prev_token_doc_ids); - } - } - for(size_t i = 0; i < field_leaves.size(); i++) { auto leaf = field_leaves[i]; std::string tok(reinterpret_cast(leaf->key), leaf->key_len - 1); - if(unique_tokens.count(tok) == 0) { - if(last_token) { - if(!posting_t::contains_atleast_one(leaf->values, prev_token_doc_ids, - prev_token_doc_ids_len)) { - continue; - } - } - - unique_tokens.emplace(tok); - leaf_tokens.push_back(tok); - } - - if(leaf_tokens.size() >= max_candidates) { - token_cost_cache.emplace(token_cost_hash, leaf_tokens); - delete [] prev_token_doc_ids; - prev_token_doc_ids = nullptr; - goto token_done; - } + leaf_tokens.push_back(tok); } token_cost_cache.emplace(token_cost_hash, leaf_tokens); - delete [] prev_token_doc_ids; - prev_token_doc_ids = nullptr; + + if(leaf_tokens.size() >= max_candidates) { + goto token_done; + } } if(last_token && leaf_tokens.size() < max_candidates) { @@ -3520,10 +3477,9 @@ void Index::fuzzy_search_fields(const std::vector& the_fields, } std::vector field_leaves; - int max_words = 100000; art_fuzzy_search(search_index.at(the_field.name), (const unsigned char *) token.c_str(), token_len, - costs[token_index], costs[token_index], max_words, token_order, prefix_search, - filter_ids, filter_ids_length, field_leaves, unique_tokens); + costs[token_index], costs[token_index], max_candidates, token_order, prefix_search, + false, "", filter_ids, filter_ids_length, field_leaves, unique_tokens); if(field_leaves.empty()) { // look at the next field @@ -3533,23 +3489,14 @@ void Index::fuzzy_search_fields(const std::vector& the_fields, for(size_t i = 0; i < field_leaves.size(); i++) { auto leaf = field_leaves[i]; std::string tok(reinterpret_cast(leaf->key), leaf->key_len - 1); - if(unique_tokens.count(tok) == 0) { - if(!posting_t::contains_atleast_one(leaf->values, &prev_token_doc_ids[0], - prev_token_doc_ids.size())) { - continue; - } - - unique_tokens.emplace(tok); - leaf_tokens.push_back(tok); - } - - if(leaf_tokens.size() >= max_candidates) { - token_cost_cache.emplace(token_cost_hash, leaf_tokens); - goto token_done; - } + leaf_tokens.push_back(tok); } token_cost_cache.emplace(token_cost_hash, leaf_tokens); + + if(leaf_tokens.size() >= max_candidates) { + goto token_done; + } } } } @@ -4935,7 +4882,7 @@ void Index::search_field(const uint8_t & field_id, // need less candidates for filtered searches since we already only pick tokens with results art_fuzzy_search(search_index.at(field_name), (const unsigned char *) token.c_str(), token_len, costs[token_index], costs[token_index], max_candidates, token_order, prefix_search, - filter_ids, filter_ids_length, leaves, unique_tokens); + false, "", filter_ids, filter_ids_length, leaves, unique_tokens); /*auto timeMillis = std::chrono::duration_cast( std::chrono::high_resolution_clock::now() - begin).count(); diff --git a/test/art_test.cpp b/test/art_test.cpp index 0236b5e1..f8414653 100644 --- a/test/art_test.cpp +++ b/test/art_test.cpp @@ -18,6 +18,8 @@ art_document get_document(uint32_t id) { return document; } +std::set exclude_leaves; + TEST(ArtTest, test_art_init_and_destroy) { art_tree t; int res = art_tree_init(&t); @@ -587,22 +589,25 @@ TEST(ArtTest, test_art_fuzzy_search_single_leaf) { EXPECT_EQ(1, posting_t::first_id(l->values)); std::vector leaves; - art_fuzzy_search(&t, (const unsigned char *) implement_key, strlen(implement_key) + 1, 0, 0, 10, FREQUENCY, false, nullptr, 0, leaves); + art_fuzzy_search(&t, (const unsigned char *) implement_key, strlen(implement_key) + 1, 0, 0, 10, FREQUENCY, false, false, "", nullptr, 0, leaves, exclude_leaves); ASSERT_EQ(1, leaves.size()); const char* implement_key_typo1 = "implment"; const char* implement_key_typo2 = "implwnent"; leaves.clear(); - art_fuzzy_search(&t, (const unsigned char *) implement_key_typo1, strlen(implement_key_typo1) + 1, 0, 0, 10, FREQUENCY, false, nullptr, 0, leaves); + exclude_leaves.clear(); + art_fuzzy_search(&t, (const unsigned char *) implement_key_typo1, strlen(implement_key_typo1) + 1, 0, 0, 10, FREQUENCY, false, false, "", nullptr, 0, leaves, exclude_leaves); ASSERT_EQ(0, leaves.size()); leaves.clear(); - art_fuzzy_search(&t, (const unsigned char *) implement_key_typo1, strlen(implement_key_typo1) + 1, 0, 1, 10, FREQUENCY, false, nullptr, 0, leaves); + exclude_leaves.clear(); + art_fuzzy_search(&t, (const unsigned char *) implement_key_typo1, strlen(implement_key_typo1) + 1, 0, 1, 10, FREQUENCY, false, false, "", nullptr, 0, leaves, exclude_leaves); ASSERT_EQ(1, leaves.size()); leaves.clear(); - art_fuzzy_search(&t, (const unsigned char *) implement_key_typo2, strlen(implement_key_typo2) + 1, 0, 2, 10, FREQUENCY, false, nullptr, 0, leaves); + exclude_leaves.clear(); + art_fuzzy_search(&t, (const unsigned char *) implement_key_typo2, strlen(implement_key_typo2) + 1, 0, 2, 10, FREQUENCY, false, false, "", nullptr, 0, leaves, exclude_leaves); ASSERT_EQ(1, leaves.size()); res = art_tree_destroy(&t); @@ -623,11 +628,12 @@ TEST(ArtTest, test_art_fuzzy_search_single_leaf_prefix) { std::vector leaves; std::string term = "aplication"; - art_fuzzy_search(&t, (const unsigned char *)(term.c_str()), term.size(), 0, 1, 10, FREQUENCY, true, nullptr, 0, leaves); + art_fuzzy_search(&t, (const unsigned char *)(term.c_str()), term.size(), 0, 1, 10, FREQUENCY, true, false, "", nullptr, 0, leaves, exclude_leaves); ASSERT_EQ(1, leaves.size()); leaves.clear(); - art_fuzzy_search(&t, (const unsigned char *)(term.c_str()), term.size(), 0, 2, 10, FREQUENCY, true, nullptr, 0, leaves); + exclude_leaves.clear(); + art_fuzzy_search(&t, (const unsigned char *)(term.c_str()), term.size(), 0, 2, 10, FREQUENCY, true, false, "", nullptr, 0, leaves, exclude_leaves); ASSERT_EQ(1, leaves.size()); res = art_tree_destroy(&t); @@ -645,7 +651,7 @@ TEST(ArtTest, test_art_fuzzy_search_single_leaf_qlen_greater_than_key) { std::string term = "starkbin"; std::vector leaves; - art_fuzzy_search(&t, (const unsigned char *)(term.c_str()), term.size(), 0, 2, 10, FREQUENCY, true, nullptr, 0, leaves); + art_fuzzy_search(&t, (const unsigned char *)(term.c_str()), term.size(), 0, 2, 10, FREQUENCY, true, false, "", nullptr, 0, leaves, exclude_leaves); ASSERT_EQ(0, leaves.size()); } @@ -660,11 +666,12 @@ TEST(ArtTest, test_art_fuzzy_search_single_leaf_non_prefix) { std::string term = "spz"; std::vector leaves; - art_fuzzy_search(&t, (const unsigned char *)(term.c_str()), term.size()+1, 0, 1, 10, FREQUENCY, false, nullptr, 0, leaves); + art_fuzzy_search(&t, (const unsigned char *)(term.c_str()), term.size()+1, 0, 1, 10, FREQUENCY, false, false, "", nullptr, 0, leaves, exclude_leaves); ASSERT_EQ(0, leaves.size()); leaves.clear(); - art_fuzzy_search(&t, (const unsigned char *)(term.c_str()), term.size(), 0, 1, 10, FREQUENCY, true, nullptr, 0, leaves); + exclude_leaves.clear(); + art_fuzzy_search(&t, (const unsigned char *)(term.c_str()), term.size(), 0, 1, 10, FREQUENCY, true, false, "", nullptr, 0, leaves, exclude_leaves); ASSERT_EQ(1, leaves.size()); res = art_tree_destroy(&t); @@ -682,7 +689,7 @@ TEST(ArtTest, test_art_prefix_larger_than_key) { std::string term = "earrings"; std::vector leaves; - art_fuzzy_search(&t, (const unsigned char *)(term.c_str()), term.size()+1, 0, 2, 10, FREQUENCY, false, nullptr, 0, leaves); + art_fuzzy_search(&t, (const unsigned char *)(term.c_str()), term.size()+1, 0, 2, 10, FREQUENCY, false, false, "", nullptr, 0, leaves, exclude_leaves); ASSERT_EQ(0, leaves.size()); res = art_tree_destroy(&t); @@ -706,7 +713,7 @@ TEST(ArtTest, test_art_fuzzy_search_prefix_token_ordering) { } std::vector leaves; - art_fuzzy_search(&t, (const unsigned char *) "e", 1, 0, 0, 3, MAX_SCORE, true, nullptr, 0, leaves); + art_fuzzy_search(&t, (const unsigned char *) "e", 1, 0, 0, 3, MAX_SCORE, true, false, "", nullptr, 0, leaves, exclude_leaves); std::string first_key(reinterpret_cast(leaves[0]->key), leaves[0]->key_len - 1); ASSERT_EQ("e", first_key); @@ -718,7 +725,8 @@ TEST(ArtTest, test_art_fuzzy_search_prefix_token_ordering) { ASSERT_EQ("elephant", third_key); leaves.clear(); - art_fuzzy_search(&t, (const unsigned char *) "enter", 5, 1, 1, 3, MAX_SCORE, true, nullptr, 0, leaves); + exclude_leaves.clear(); + art_fuzzy_search(&t, (const unsigned char *) "enter", 5, 1, 1, 3, MAX_SCORE, true, false, "", nullptr, 0, leaves, exclude_leaves); ASSERT_TRUE(leaves.empty()); res = art_tree_destroy(&t); @@ -747,56 +755,65 @@ TEST(ArtTest, test_art_fuzzy_search) { auto begin = std::chrono::high_resolution_clock::now(); leaves.clear(); - art_fuzzy_search(&t, (const unsigned char *) "pltinum", strlen("pltinum"), 0, 1, 10, FREQUENCY, true, nullptr, 0, leaves); + exclude_leaves.clear(); + art_fuzzy_search(&t, (const unsigned char *) "pltinum", strlen("pltinum"), 0, 1, 10, FREQUENCY, true, false, "", nullptr, 0, leaves, exclude_leaves); ASSERT_EQ(2, leaves.size()); ASSERT_STREQ("platinumsmith", (const char *)leaves.at(0)->key); ASSERT_STREQ("platinum", (const char *)leaves.at(1)->key); leaves.clear(); + exclude_leaves.clear(); // extra char - art_fuzzy_search(&t, (const unsigned char *) "higghliving", strlen("higghliving") + 1, 0, 1, 10, FREQUENCY, false, nullptr, 0, leaves); + art_fuzzy_search(&t, (const unsigned char *) "higghliving", strlen("higghliving") + 1, 0, 1, 10, FREQUENCY, false, false, "", nullptr, 0, leaves, exclude_leaves); ASSERT_EQ(1, leaves.size()); ASSERT_STREQ("highliving", (const char *)leaves.at(0)->key); // transpose leaves.clear(); - art_fuzzy_search(&t, (const unsigned char *) "zymosthneic", strlen("zymosthneic") + 1, 0, 1, 10, FREQUENCY, false, nullptr, 0, leaves); + exclude_leaves.clear(); + art_fuzzy_search(&t, (const unsigned char *) "zymosthneic", strlen("zymosthneic") + 1, 0, 1, 10, FREQUENCY, false, false, "", nullptr, 0, leaves, exclude_leaves); ASSERT_EQ(1, leaves.size()); ASSERT_STREQ("zymosthenic", (const char *)leaves.at(0)->key); // transpose + missing leaves.clear(); - art_fuzzy_search(&t, (const unsigned char *) "dacrcyystlgia", strlen("dacrcyystlgia") + 1, 0, 2, 10, FREQUENCY, false, nullptr, 0, leaves); + exclude_leaves.clear(); + art_fuzzy_search(&t, (const unsigned char *) "dacrcyystlgia", strlen("dacrcyystlgia") + 1, 0, 2, 10, FREQUENCY, false, false, "", nullptr, 0, leaves, exclude_leaves); ASSERT_EQ(1, leaves.size()); ASSERT_STREQ("dacrycystalgia", (const char *)leaves.at(0)->key); leaves.clear(); - art_fuzzy_search(&t, (const unsigned char *) "dacrcyystlgia", strlen("dacrcyystlgia") + 1, 1, 2, 10, FREQUENCY, false, nullptr, 0, leaves); + exclude_leaves.clear(); + art_fuzzy_search(&t, (const unsigned char *) "dacrcyystlgia", strlen("dacrcyystlgia") + 1, 1, 2, 10, FREQUENCY, false, false, "", nullptr, 0, leaves, exclude_leaves); ASSERT_EQ(1, leaves.size()); ASSERT_STREQ("dacrycystalgia", (const char *)leaves.at(0)->key); // missing char leaves.clear(); - art_fuzzy_search(&t, (const unsigned char *) "gaberlunze", strlen("gaberlunze") + 1, 0, 1, 10, FREQUENCY, false, nullptr, 0, leaves); + exclude_leaves.clear(); + art_fuzzy_search(&t, (const unsigned char *) "gaberlunze", strlen("gaberlunze") + 1, 0, 1, 10, FREQUENCY, false, false, "", nullptr, 0, leaves, exclude_leaves); ASSERT_EQ(1, leaves.size()); ASSERT_STREQ("gaberlunzie", (const char *)leaves.at(0)->key); // substituted char leaves.clear(); - art_fuzzy_search(&t, (const unsigned char *) "eacemiferous", strlen("eacemiferous") + 1, 0, 1, 10, FREQUENCY, false, nullptr, 0, leaves); + exclude_leaves.clear(); + art_fuzzy_search(&t, (const unsigned char *) "eacemiferous", strlen("eacemiferous") + 1, 0, 1, 10, FREQUENCY, false, false, "", nullptr, 0, leaves, exclude_leaves); ASSERT_EQ(1, leaves.size()); ASSERT_STREQ("racemiferous", (const char *)leaves.at(0)->key); // missing char + extra char leaves.clear(); - art_fuzzy_search(&t, (const unsigned char *) "Sarbruckken", strlen("Sarbruckken") + 1, 0, 2, 10, FREQUENCY, false, nullptr, 0, leaves); + exclude_leaves.clear(); + art_fuzzy_search(&t, (const unsigned char *) "Sarbruckken", strlen("Sarbruckken") + 1, 0, 2, 10, FREQUENCY, false, false, "", nullptr, 0, leaves, exclude_leaves); ASSERT_EQ(1, leaves.size()); ASSERT_STREQ("Saarbrucken", (const char *)leaves.at(0)->key); // multiple matching results leaves.clear(); - art_fuzzy_search(&t, (const unsigned char *) "hown", strlen("hown") + 1, 0, 1, 10, FREQUENCY, false, nullptr, 0, leaves); + exclude_leaves.clear(); + art_fuzzy_search(&t, (const unsigned char *) "hown", strlen("hown") + 1, 0, 1, 10, FREQUENCY, false, false, "", nullptr, 0, leaves, exclude_leaves); ASSERT_EQ(10, leaves.size()); std::set expected_words = {"town", "sown", "mown", "lown", "howl", "howk", "howe", "how", "horn", "hoon"}; @@ -809,23 +826,28 @@ TEST(ArtTest, test_art_fuzzy_search) { // fuzzy prefix search leaves.clear(); - art_fuzzy_search(&t, (const unsigned char *) "lionhear", strlen("lionhear"), 0, 0, 10, FREQUENCY, true, nullptr, 0, leaves); + exclude_leaves.clear(); + art_fuzzy_search(&t, (const unsigned char *) "lionhear", strlen("lionhear"), 0, 0, 10, FREQUENCY, true, false, "", nullptr, 0, leaves, exclude_leaves); ASSERT_EQ(3, leaves.size()); leaves.clear(); - art_fuzzy_search(&t, (const unsigned char *) "lineage", strlen("lineage"), 0, 0, 10, FREQUENCY, true, nullptr, 0, leaves); + exclude_leaves.clear(); + art_fuzzy_search(&t, (const unsigned char *) "lineage", strlen("lineage"), 0, 0, 10, FREQUENCY, true, false, "", nullptr, 0, leaves, exclude_leaves); ASSERT_EQ(2, leaves.size()); leaves.clear(); - art_fuzzy_search(&t, (const unsigned char *) "liq", strlen("liq"), 0, 0, 50, FREQUENCY, true, nullptr, 0, leaves); + exclude_leaves.clear(); + art_fuzzy_search(&t, (const unsigned char *) "liq", strlen("liq"), 0, 0, 50, FREQUENCY, true, false, "", nullptr, 0, leaves, exclude_leaves); ASSERT_EQ(39, leaves.size()); leaves.clear(); - art_fuzzy_search(&t, (const unsigned char *) "antitraditiana", strlen("antitraditiana"), 0, 1, 10, FREQUENCY, true, nullptr, 0, leaves); + exclude_leaves.clear(); + art_fuzzy_search(&t, (const unsigned char *) "antitraditiana", strlen("antitraditiana"), 0, 1, 10, FREQUENCY, true, false, "", nullptr, 0, leaves, exclude_leaves); ASSERT_EQ(1, leaves.size()); leaves.clear(); - art_fuzzy_search(&t, (const unsigned char *) "antisocao", strlen("antisocao"), 0, 2, 10, FREQUENCY, true, nullptr, 0, leaves); + exclude_leaves.clear(); + art_fuzzy_search(&t, (const unsigned char *) "antisocao", strlen("antisocao"), 0, 2, 10, FREQUENCY, true, false, "", nullptr, 0, leaves, exclude_leaves); ASSERT_EQ(6, leaves.size()); long long int timeMillis = std::chrono::duration_cast( @@ -855,7 +877,7 @@ TEST(ArtTest, test_art_fuzzy_search_unicode_chars) { EXPECT_EQ(1, posting_t::first_id(l->values)); std::vector leaves; - art_fuzzy_search(&t, (unsigned char *)key, strlen(key), 0, 0, 10, FREQUENCY, true, nullptr, 0, leaves); + art_fuzzy_search(&t, (unsigned char *)key, strlen(key), 0, 0, 10, FREQUENCY, true, false, "", nullptr, 0, leaves, exclude_leaves); ASSERT_EQ(1, leaves.size()); } @@ -879,7 +901,7 @@ TEST(ArtTest, test_art_fuzzy_search_extra_chars) { const char* query = "abbreviation"; std::vector leaves; - art_fuzzy_search(&t, (unsigned char *)query, strlen(query), 0, 2, 10, FREQUENCY, true, nullptr, 0, leaves); + art_fuzzy_search(&t, (unsigned char *)query, strlen(query), 0, 2, 10, FREQUENCY, true, false, "", nullptr, 0, leaves, exclude_leaves); ASSERT_EQ(1, leaves.size()); res = art_tree_destroy(&t); @@ -918,15 +940,16 @@ TEST(ArtTest, test_art_search_sku_like_tokens) { for (const auto &key : keys) { std::vector leaves; art_fuzzy_search(&t, (const unsigned char*)key.c_str(), key.size(), 0, 0, 10, - FREQUENCY, true, nullptr, 0, leaves); + FREQUENCY, true, false, "", nullptr, 0, leaves, exclude_leaves); ASSERT_EQ(1, leaves.size()); ASSERT_STREQ(key.c_str(), (const char *) leaves.at(0)->key); leaves.clear(); + exclude_leaves.clear(); // non prefix art_fuzzy_search(&t, (const unsigned char*)key.c_str(), key.size()+1, 0, 0, 10, - FREQUENCY, false, nullptr, 0, leaves); + FREQUENCY, false, false, "", nullptr, 0, leaves, exclude_leaves); ASSERT_EQ(1, leaves.size()); ASSERT_STREQ(key.c_str(), (const char *) leaves.at(0)->key); } @@ -970,14 +993,17 @@ TEST(ArtTest, test_art_search_ill_like_tokens) { std::make_pair("ice", 2), }; + std::string key = "input"; + for (const auto &key : keys) { art_leaf* l = (art_leaf *) art_search(&t, (const unsigned char *)key.c_str(), key.size()+1); ASSERT_FALSE(l == nullptr); EXPECT_EQ(1, posting_t::num_ids(l->values)); std::vector leaves; + exclude_leaves.clear(); art_fuzzy_search(&t, (const unsigned char*)key.c_str(), key.size(), 0, 0, 10, - FREQUENCY, true, nullptr, 0, leaves); + FREQUENCY, true, false, "", nullptr, 0, leaves, exclude_leaves); if(key_to_count.count(key) != 0) { ASSERT_EQ(key_to_count[key], leaves.size()); @@ -987,10 +1013,14 @@ TEST(ArtTest, test_art_search_ill_like_tokens) { } leaves.clear(); + exclude_leaves.clear(); // non prefix art_fuzzy_search(&t, (const unsigned char*)key.c_str(), key.size()+1, 0, 0, 10, - FREQUENCY, false, nullptr, 0, leaves); + FREQUENCY, false, false, "", nullptr, 0, leaves, exclude_leaves); + if(leaves.size() != 1) { + LOG(INFO) << key; + } ASSERT_EQ(1, leaves.size()); ASSERT_STREQ(key.c_str(), (const char *) leaves.at(0)->key); } @@ -1022,8 +1052,9 @@ TEST(ArtTest, test_art_search_ill_like_tokens2) { EXPECT_EQ(1, posting_t::num_ids(l->values)); std::vector leaves; + exclude_leaves.clear(); art_fuzzy_search(&t, (const unsigned char*)key.c_str(), key.size(), 0, 0, 10, - FREQUENCY, true, nullptr, 0, leaves); + FREQUENCY, true, false, "", nullptr, 0, leaves, exclude_leaves); if(key == "illustration") { ASSERT_EQ(2, leaves.size()); @@ -1033,10 +1064,11 @@ TEST(ArtTest, test_art_search_ill_like_tokens2) { } leaves.clear(); + exclude_leaves.clear(); // non prefix art_fuzzy_search(&t, (const unsigned char*)key.c_str(), key.size() + 1, 0, 0, 10, - FREQUENCY, false, nullptr, 0, leaves); + FREQUENCY, false, false, "", nullptr, 0, leaves, exclude_leaves); ASSERT_EQ(1, leaves.size()); ASSERT_STREQ(key.c_str(), (const char *) leaves.at(0)->key); } @@ -1059,12 +1091,12 @@ TEST(ArtTest, test_art_search_roche_chews) { std::string term = "chews"; std::vector leaves; art_fuzzy_search(&t, (const unsigned char*)term.c_str(), term.size(), 0, 2, 10, - FREQUENCY, true, nullptr, 0, leaves); + FREQUENCY, true, false, "", nullptr, 0, leaves, exclude_leaves); ASSERT_EQ(0, leaves.size()); art_fuzzy_search(&t, (const unsigned char*)keys[0].c_str(), keys[0].size() + 1, 0, 0, 10, - FREQUENCY, false, nullptr, 0, leaves); + FREQUENCY, false, false, "", nullptr, 0, leaves, exclude_leaves); ASSERT_EQ(1, leaves.size()); @@ -1091,14 +1123,15 @@ TEST(ArtTest, test_art_search_raspberry) { std::string q_raspberries = "raspberries"; art_fuzzy_search(&t, (const unsigned char*)q_raspberries.c_str(), q_raspberries.size(), 0, 2, 10, - FREQUENCY, true, nullptr, 0, leaves); + FREQUENCY, true, false, "", nullptr, 0, leaves, exclude_leaves); ASSERT_EQ(2, leaves.size()); leaves.clear(); + exclude_leaves.clear(); std::string q_raspberry = "raspberry"; art_fuzzy_search(&t, (const unsigned char*)q_raspberry.c_str(), q_raspberry.size(), 0, 2, 10, - FREQUENCY, true, nullptr, 0, leaves); + FREQUENCY, true, false, "", nullptr, 0, leaves, exclude_leaves); ASSERT_EQ(2, leaves.size()); res = art_tree_destroy(&t); @@ -1124,13 +1157,16 @@ TEST(ArtTest, test_art_search_highliving) { std::string query = "higghliving"; art_fuzzy_search(&t, (const unsigned char*)query.c_str(), query.size() + 1, 0, 1, 10, - FREQUENCY, false, nullptr, 0, leaves); + FREQUENCY, false, false, "", nullptr, 0, leaves, exclude_leaves); ASSERT_EQ(1, leaves.size()); leaves.clear(); + exclude_leaves.clear(); + exclude_leaves.clear(); + exclude_leaves.clear(); art_fuzzy_search(&t, (const unsigned char*)query.c_str(), query.size(), 0, 2, 10, - FREQUENCY, true, nullptr, 0, leaves); + FREQUENCY, true, false, "", nullptr, 0, leaves, exclude_leaves); ASSERT_EQ(1, leaves.size()); res = art_tree_destroy(&t); diff --git a/test/collection_specific_test.cpp b/test/collection_specific_test.cpp index de658b37..befa767b 100644 --- a/test/collection_specific_test.cpp +++ b/test/collection_specific_test.cpp @@ -203,6 +203,8 @@ TEST_F(CollectionSpecificTest, ExactSingleFieldMatch) { spp::sparse_hash_set(), spp::sparse_hash_set(), 10, "", 30, 4, "title", 10).get(); + LOG(INFO) << results; + ASSERT_EQ(2, results["hits"].size()); ASSERT_EQ("0", results["hits"][0]["document"]["id"].get()); ASSERT_EQ("1", results["hits"][1]["document"]["id"].get()); diff --git a/test/collection_test.cpp b/test/collection_test.cpp index 58ba40bf..9ca811cf 100644 --- a/test/collection_test.cpp +++ b/test/collection_test.cpp @@ -476,7 +476,7 @@ TEST_F(CollectionTest, TextContainingAnActualTypo) { ASSERT_EQ(4, results["hits"].size()); ASSERT_EQ(11, results["found"].get()); - std::vector ids = {"19", "22", "6", "13"}; + std::vector ids = {"19", "6", "21", "22"}; for(size_t i = 0; i < results["hits"].size(); i++) { nlohmann::json result = results["hits"].at(i);