Merge branch 'v0.25-join' into v0.25

This commit is contained in:
Ozan Armağan 2023-02-16 16:15:20 +03:00 committed by GitHub
commit 807a95b383
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 192 additions and 140 deletions

View File

@ -1,4 +1,4 @@
FROM ubuntu:20.04
FROM ubuntu:22.04
RUN apt-get -y update && apt-get -y install ca-certificates

View File

@ -276,9 +276,10 @@ int art_iter_prefix(art_tree *t, const unsigned char *prefix, int prefix_len, ar
* Returns leaves that match a given string within a fuzzy distance of max_cost.
*/
int art_fuzzy_search(art_tree *t, const unsigned char *term, const int term_len, const int min_cost, const int max_cost,
const int max_words, const token_ordering token_order, const bool prefix,
const uint32_t *filter_ids, size_t filter_ids_length,
std::vector<art_leaf *> &results, const std::set<std::string>& exclude_leaves = {});
const size_t max_words, const token_ordering token_order,
const bool prefix, bool last_token, const std::string& prev_token,
const uint32_t *filter_ids, const size_t filter_ids_length,
std::vector<art_leaf *> &results, std::set<std::string>& exclude_leaves);
void encode_int32(int32_t n, unsigned char *chars);

View File

@ -317,7 +317,6 @@ struct field {
if (!field.reference.empty()) {
field_val[fields::reference] = field.reference;
}
if(field.create_from.size() > 0) {
field_val[fields::create_from] = field.create_from;
if(field.model_path.size() > 0) {

View File

@ -21,6 +21,7 @@
#include <posting.h>
#include "art.h"
#include "logger.h"
#include "array_utils.h"
/**
* Macros to manipulate pointer tags
@ -940,10 +941,69 @@ void* art_delete(art_tree *t, const unsigned char *key, int key_len) {
return child->max_token_count;
}*/
/**
 * Builds the list of document IDs that a candidate leaf is allowed to match against.
 *
 * - If `prev_token` is empty, or no leaf for it exists in `t`, the caller's `filter_ids`
 *   pointer is returned unchanged (borrowed, not copied) and `prev_token_doc_ids_len`
 *   is set to `filter_ids_length`.
 * - Otherwise the IDs stored on the previous token's leaf are merged and either
 *   intersected with `filter_ids` (when `filter_ids_length != 0`) or copied into a
 *   freshly allocated array (when there is no filter).
 *
 * Ownership: the result must be released by the caller only when it is a different
 * pointer from `filter_ids`; `art_fuzzy_search` does exactly that with `delete []`.
 *
 * NOTE(review): when the intersection is empty, `prev_token_doc_ids_len` becomes 0, and
 * `validate_and_add_leaf` treats a zero length as "no restriction" rather than
 * "nothing allowed" -- confirm that this is the intended behaviour.
 *
 * @param t                       tree that is searched for the previous token
 * @param prev_token              previous query token; may be empty
 * @param filter_ids              optional filter IDs (may be null when the length is 0)
 * @param filter_ids_length       number of entries in `filter_ids`
 * @param prev_token_doc_ids_len  out: number of entries in the returned array
 * @return pointer to the allowed document IDs (see ownership note above)
 */
const uint32_t* get_allowed_doc_ids(art_tree *t, const std::string& prev_token,
                                    const uint32_t* filter_ids, const size_t filter_ids_length,
                                    size_t& prev_token_doc_ids_len) {
    // Only hit the tree when there is a previous token to look up: an empty token always
    // falls back to the plain filter, so searching for it would be wasted work.
    art_leaf* prev_leaf = prev_token.empty() ? nullptr : static_cast<art_leaf*>(
        art_search(t, reinterpret_cast<const unsigned char*>(prev_token.c_str()), prev_token.size() + 1)
    );

    if(prev_leaf == nullptr) {
        // Nothing to narrow by: hand back the caller's own filter array (not a copy).
        prev_token_doc_ids_len = filter_ids_length;
        return filter_ids;
    }

    std::vector<uint32_t> prev_leaf_ids;
    posting_t::merge({prev_leaf->values}, prev_leaf_ids);

    uint32_t* prev_token_doc_ids = nullptr;

    if(filter_ids_length != 0) {
        // Keep only the documents that contain the previous token AND pass the filter.
        prev_token_doc_ids_len = ArrayUtils::and_scalar(prev_leaf_ids.data(), prev_leaf_ids.size(),
                                                        filter_ids, filter_ids_length,
                                                        &prev_token_doc_ids);
    } else {
        // No filter: every document containing the previous token is allowed.
        prev_token_doc_ids_len = prev_leaf_ids.size();
        prev_token_doc_ids = new uint32_t[prev_token_doc_ids_len];
        std::copy(prev_leaf_ids.begin(), prev_leaf_ids.end(), prev_token_doc_ids);
    }

    return prev_token_doc_ids;
}
/**
 * Appends `leaf` to `results` if it passes every acceptance check, and records its token
 * in `exclude_leaves` so that the same token is not accepted again.
 *
 * A leaf is rejected when:
 *   - it is the same leaf as `exact_leaf`,
 *   - its token is already present in `exclude_leaves`, or
 *   - `allowed_doc_ids_len` is non-zero and none of the leaf's postings appear in
 *     `allowed_doc_ids`.
 *
 * `last_token` and `prev_token` are accepted but not read by this function.
 *
 * @return true when the leaf was added to `results`, false when it was rejected.
 */
bool validate_and_add_leaf(art_leaf* leaf, const bool last_token, const std::string& prev_token,
                           const uint32_t* allowed_doc_ids, const size_t allowed_doc_ids_len,
                           std::set<std::string>& exclude_leaves, const art_leaf* exact_leaf,
                           std::vector<art_leaf *>& results) {
    if(leaf == exact_leaf) {
        return false;
    }

    // The token text is built from key_len - 1 bytes of the key (the final key byte is left out).
    const std::string token(reinterpret_cast<char*>(leaf->key), leaf->key_len - 1);

    const bool already_seen = exclude_leaves.find(token) != exclude_leaves.end();
    if(already_seen) {
        return false;
    }

    // A zero length means "no restriction"; otherwise at least one posting must be allowed.
    const bool restricted = (allowed_doc_ids_len != 0);
    if(restricted && !posting_t::contains_atleast_one(leaf->values, allowed_doc_ids, allowed_doc_ids_len)) {
        return false;
    }

    // Record the token only after the leaf has been accepted.
    exclude_leaves.insert(token);
    results.push_back(leaf);
    return true;
}
int art_topk_iter(const art_node *root, token_ordering token_order, size_t max_results,
const uint32_t* filter_ids, size_t filter_ids_length,
const std::set<std::string>& exclude_leaves, const art_leaf* exact_leaf,
std::vector<art_leaf *>& results) {
const art_leaf* exact_leaf,
const bool last_token, const std::string& prev_token,
const uint32_t* allowed_doc_ids, size_t allowed_doc_ids_len,
const art_tree* t, std::set<std::string>& exclude_leaves, std::vector<art_leaf *>& results) {
printf("INSIDE art_topk_iter: root->type: %d\n", root->type);
@ -957,6 +1017,8 @@ int art_topk_iter(const art_node *root, token_ordering token_order, size_t max_r
q.push(root);
size_t num_processed = 0;
while(!q.empty() && results.size() < max_results*4) {
art_node *n = (art_node *) q.top();
q.pop();
@ -972,23 +1034,13 @@ int art_topk_iter(const art_node *root, token_ordering token_order, size_t max_r
if (IS_LEAF(n)) {
art_leaf *l = (art_leaf *) LEAF_RAW(n);
//LOG(INFO) << "END LEAF SCORE: " << l->max_score;
validate_and_add_leaf(l, last_token, prev_token, allowed_doc_ids, allowed_doc_ids_len,
exclude_leaves, exact_leaf, results);
if(filter_ids_length == 0) {
std::string tok(reinterpret_cast<char*>(l->key), l->key_len - 1);
if(exclude_leaves.count(tok) != 0 || l == exact_leaf) {
continue;
}
results.push_back(l);
} else {
// we will push leaf only if filter matches with leaf IDs
bool found_atleast_one = posting_t::contains_atleast_one(l->values, filter_ids, filter_ids_length);
if(found_atleast_one) {
std::string tok(reinterpret_cast<char*>(l->key), l->key_len - 1);
if(exclude_leaves.count(tok) != 0 || l == exact_leaf) {
continue;
}
results.push_back(l);
}
if (++num_processed % 1024 == 0 && (microseconds(
std::chrono::system_clock::now().time_since_epoch()).count() - search_begin_us) > search_stop_us) {
search_cutoff = true;
break;
}
continue;
@ -1491,9 +1543,10 @@ static void art_fuzzy_recurse(unsigned char p, unsigned char c, const art_node *
* Returns leaves that match a given string within a fuzzy distance of max_cost.
*/
int art_fuzzy_search(art_tree *t, const unsigned char *term, const int term_len, const int min_cost, const int max_cost,
const int max_words, const token_ordering token_order, const bool prefix,
const uint32_t *filter_ids, size_t filter_ids_length,
std::vector<art_leaf *> &results, const std::set<std::string>& exclude_leaves) {
const size_t max_words, const token_ordering token_order, const bool prefix,
bool last_token, const std::string& prev_token,
const uint32_t *filter_ids, const size_t filter_ids_length,
std::vector<art_leaf *> &results, std::set<std::string>& exclude_leaves) {
std::vector<const art_node*> nodes;
int irow[term_len + 1];
@ -1525,8 +1578,15 @@ int art_fuzzy_search(art_tree *t, const unsigned char *term, const int term_len,
art_leaf* exact_leaf = (art_leaf *) art_search(t, term, key_len);
//LOG(INFO) << "exact_leaf: " << exact_leaf << ", term: " << term << ", term_len: " << term_len;
// documents that contain the previous token and/or filter ids
size_t allowed_doc_ids_len = 0;
const uint32_t* allowed_doc_ids = get_allowed_doc_ids(t, prev_token, filter_ids, filter_ids_length,
allowed_doc_ids_len);
for(auto node: nodes) {
art_topk_iter(node, token_order, max_words, filter_ids, filter_ids_length, exclude_leaves, exact_leaf, results);
art_topk_iter(node, token_order, max_words, exact_leaf,
last_token, prev_token, allowed_doc_ids, allowed_doc_ids_len,
t, exclude_leaves, results);
}
if(token_order == FREQUENCY) {
@ -1536,7 +1596,11 @@ int art_fuzzy_search(art_tree *t, const unsigned char *term, const int term_len,
}
if(exact_leaf && min_cost == 0) {
results.insert(results.begin(), exact_leaf);
std::string tok(reinterpret_cast<char*>(exact_leaf->key), exact_leaf->key_len - 1);
if(exclude_leaves.count(tok) == 0) {
results.insert(results.begin(), exact_leaf);
exclude_leaves.emplace(tok);
}
}
if(results.size() > max_words) {
@ -1551,6 +1615,10 @@ int art_fuzzy_search(art_tree *t, const unsigned char *term, const int term_len,
<< ", filter_ids_length: " << filter_ids_length;
}*/
if(allowed_doc_ids != filter_ids) {
delete [] allowed_doc_ids;
}
return 0;
}

View File

@ -246,7 +246,6 @@ nlohmann::json Collection::get_summary_json() const {
field_json[fields::reference] = coll_field.reference;
}
fields_arr.push_back(field_json);
}

View File

@ -3422,12 +3422,12 @@ void Index::fuzzy_search_fields(const std::vector<search_field_t>& the_fields,
}
//LOG(INFO) << "Searching for field: " << the_field.name << ", found token:" << token;
const auto& prev_token = last_token ? token_candidates_vec.back().candidates[0] : "";
std::vector<art_leaf*> field_leaves;
int max_words = 100000;
art_fuzzy_search(search_index.at(the_field.name), (const unsigned char *) token.c_str(), token_len,
costs[token_index], costs[token_index], max_words, token_order, prefix_search,
filter_ids, filter_ids_length, field_leaves, unique_tokens);
costs[token_index], costs[token_index], max_candidates, token_order, prefix_search,
last_token, prev_token, filter_ids, filter_ids_length, field_leaves, unique_tokens);
/*auto timeMillis = std::chrono::duration_cast<std::chrono::milliseconds>(
std::chrono::high_resolution_clock::now() - begin).count();
@ -3438,60 +3438,17 @@ void Index::fuzzy_search_fields(const std::vector<search_field_t>& the_fields,
continue;
}
uint32_t* prev_token_doc_ids = nullptr; // documents that contain the previous token
size_t prev_token_doc_ids_len = 0;
if(last_token) {
auto& prev_token = token_candidates_vec.back().candidates[0];
art_leaf* prev_leaf = static_cast<art_leaf*>(
art_search(search_index.at(the_field.name),
reinterpret_cast<const unsigned char*>(prev_token.c_str()),
prev_token.size() + 1));
if(!prev_leaf) {
continue;
}
std::vector<uint32_t> prev_leaf_ids;
posting_t::merge({prev_leaf->values}, prev_leaf_ids);
if(filter_ids_length != 0) {
prev_token_doc_ids_len = ArrayUtils::and_scalar(prev_leaf_ids.data(), prev_leaf_ids.size(),
filter_ids, filter_ids_length,
&prev_token_doc_ids);
} else {
prev_token_doc_ids_len = prev_leaf_ids.size();
prev_token_doc_ids = new uint32_t[prev_token_doc_ids_len];
std::copy(prev_leaf_ids.begin(), prev_leaf_ids.end(), prev_token_doc_ids);
}
}
for(size_t i = 0; i < field_leaves.size(); i++) {
auto leaf = field_leaves[i];
std::string tok(reinterpret_cast<char*>(leaf->key), leaf->key_len - 1);
if(unique_tokens.count(tok) == 0) {
if(last_token) {
if(!posting_t::contains_atleast_one(leaf->values, prev_token_doc_ids,
prev_token_doc_ids_len)) {
continue;
}
}
unique_tokens.emplace(tok);
leaf_tokens.push_back(tok);
}
if(leaf_tokens.size() >= max_candidates) {
token_cost_cache.emplace(token_cost_hash, leaf_tokens);
delete [] prev_token_doc_ids;
prev_token_doc_ids = nullptr;
goto token_done;
}
leaf_tokens.push_back(tok);
}
token_cost_cache.emplace(token_cost_hash, leaf_tokens);
delete [] prev_token_doc_ids;
prev_token_doc_ids = nullptr;
if(leaf_tokens.size() >= max_candidates) {
goto token_done;
}
}
if(last_token && leaf_tokens.size() < max_candidates) {
@ -3520,10 +3477,9 @@ void Index::fuzzy_search_fields(const std::vector<search_field_t>& the_fields,
}
std::vector<art_leaf*> field_leaves;
int max_words = 100000;
art_fuzzy_search(search_index.at(the_field.name), (const unsigned char *) token.c_str(), token_len,
costs[token_index], costs[token_index], max_words, token_order, prefix_search,
filter_ids, filter_ids_length, field_leaves, unique_tokens);
costs[token_index], costs[token_index], max_candidates, token_order, prefix_search,
false, "", filter_ids, filter_ids_length, field_leaves, unique_tokens);
if(field_leaves.empty()) {
// look at the next field
@ -3533,23 +3489,14 @@ void Index::fuzzy_search_fields(const std::vector<search_field_t>& the_fields,
for(size_t i = 0; i < field_leaves.size(); i++) {
auto leaf = field_leaves[i];
std::string tok(reinterpret_cast<char*>(leaf->key), leaf->key_len - 1);
if(unique_tokens.count(tok) == 0) {
if(!posting_t::contains_atleast_one(leaf->values, &prev_token_doc_ids[0],
prev_token_doc_ids.size())) {
continue;
}
unique_tokens.emplace(tok);
leaf_tokens.push_back(tok);
}
if(leaf_tokens.size() >= max_candidates) {
token_cost_cache.emplace(token_cost_hash, leaf_tokens);
goto token_done;
}
leaf_tokens.push_back(tok);
}
token_cost_cache.emplace(token_cost_hash, leaf_tokens);
if(leaf_tokens.size() >= max_candidates) {
goto token_done;
}
}
}
}
@ -4935,7 +4882,7 @@ void Index::search_field(const uint8_t & field_id,
// need less candidates for filtered searches since we already only pick tokens with results
art_fuzzy_search(search_index.at(field_name), (const unsigned char *) token.c_str(), token_len,
costs[token_index], costs[token_index], max_candidates, token_order, prefix_search,
filter_ids, filter_ids_length, leaves, unique_tokens);
false, "", filter_ids, filter_ids_length, leaves, unique_tokens);
/*auto timeMillis = std::chrono::duration_cast<std::chrono::milliseconds>(
std::chrono::high_resolution_clock::now() - begin).count();

View File

@ -18,6 +18,8 @@ art_document get_document(uint32_t id) {
return document;
}
std::set<std::string> exclude_leaves;
TEST(ArtTest, test_art_init_and_destroy) {
art_tree t;
int res = art_tree_init(&t);
@ -587,22 +589,25 @@ TEST(ArtTest, test_art_fuzzy_search_single_leaf) {
EXPECT_EQ(1, posting_t::first_id(l->values));
std::vector<art_leaf*> leaves;
art_fuzzy_search(&t, (const unsigned char *) implement_key, strlen(implement_key) + 1, 0, 0, 10, FREQUENCY, false, nullptr, 0, leaves);
art_fuzzy_search(&t, (const unsigned char *) implement_key, strlen(implement_key) + 1, 0, 0, 10, FREQUENCY, false, false, "", nullptr, 0, leaves, exclude_leaves);
ASSERT_EQ(1, leaves.size());
const char* implement_key_typo1 = "implment";
const char* implement_key_typo2 = "implwnent";
leaves.clear();
art_fuzzy_search(&t, (const unsigned char *) implement_key_typo1, strlen(implement_key_typo1) + 1, 0, 0, 10, FREQUENCY, false, nullptr, 0, leaves);
exclude_leaves.clear();
art_fuzzy_search(&t, (const unsigned char *) implement_key_typo1, strlen(implement_key_typo1) + 1, 0, 0, 10, FREQUENCY, false, false, "", nullptr, 0, leaves, exclude_leaves);
ASSERT_EQ(0, leaves.size());
leaves.clear();
art_fuzzy_search(&t, (const unsigned char *) implement_key_typo1, strlen(implement_key_typo1) + 1, 0, 1, 10, FREQUENCY, false, nullptr, 0, leaves);
exclude_leaves.clear();
art_fuzzy_search(&t, (const unsigned char *) implement_key_typo1, strlen(implement_key_typo1) + 1, 0, 1, 10, FREQUENCY, false, false, "", nullptr, 0, leaves, exclude_leaves);
ASSERT_EQ(1, leaves.size());
leaves.clear();
art_fuzzy_search(&t, (const unsigned char *) implement_key_typo2, strlen(implement_key_typo2) + 1, 0, 2, 10, FREQUENCY, false, nullptr, 0, leaves);
exclude_leaves.clear();
art_fuzzy_search(&t, (const unsigned char *) implement_key_typo2, strlen(implement_key_typo2) + 1, 0, 2, 10, FREQUENCY, false, false, "", nullptr, 0, leaves, exclude_leaves);
ASSERT_EQ(1, leaves.size());
res = art_tree_destroy(&t);
@ -623,11 +628,12 @@ TEST(ArtTest, test_art_fuzzy_search_single_leaf_prefix) {
std::vector<art_leaf*> leaves;
std::string term = "aplication";
art_fuzzy_search(&t, (const unsigned char *)(term.c_str()), term.size(), 0, 1, 10, FREQUENCY, true, nullptr, 0, leaves);
art_fuzzy_search(&t, (const unsigned char *)(term.c_str()), term.size(), 0, 1, 10, FREQUENCY, true, false, "", nullptr, 0, leaves, exclude_leaves);
ASSERT_EQ(1, leaves.size());
leaves.clear();
art_fuzzy_search(&t, (const unsigned char *)(term.c_str()), term.size(), 0, 2, 10, FREQUENCY, true, nullptr, 0, leaves);
exclude_leaves.clear();
art_fuzzy_search(&t, (const unsigned char *)(term.c_str()), term.size(), 0, 2, 10, FREQUENCY, true, false, "", nullptr, 0, leaves, exclude_leaves);
ASSERT_EQ(1, leaves.size());
res = art_tree_destroy(&t);
@ -645,7 +651,7 @@ TEST(ArtTest, test_art_fuzzy_search_single_leaf_qlen_greater_than_key) {
std::string term = "starkbin";
std::vector<art_leaf*> leaves;
art_fuzzy_search(&t, (const unsigned char *)(term.c_str()), term.size(), 0, 2, 10, FREQUENCY, true, nullptr, 0, leaves);
art_fuzzy_search(&t, (const unsigned char *)(term.c_str()), term.size(), 0, 2, 10, FREQUENCY, true, false, "", nullptr, 0, leaves, exclude_leaves);
ASSERT_EQ(0, leaves.size());
}
@ -660,11 +666,12 @@ TEST(ArtTest, test_art_fuzzy_search_single_leaf_non_prefix) {
std::string term = "spz";
std::vector<art_leaf*> leaves;
art_fuzzy_search(&t, (const unsigned char *)(term.c_str()), term.size()+1, 0, 1, 10, FREQUENCY, false, nullptr, 0, leaves);
art_fuzzy_search(&t, (const unsigned char *)(term.c_str()), term.size()+1, 0, 1, 10, FREQUENCY, false, false, "", nullptr, 0, leaves, exclude_leaves);
ASSERT_EQ(0, leaves.size());
leaves.clear();
art_fuzzy_search(&t, (const unsigned char *)(term.c_str()), term.size(), 0, 1, 10, FREQUENCY, true, nullptr, 0, leaves);
exclude_leaves.clear();
art_fuzzy_search(&t, (const unsigned char *)(term.c_str()), term.size(), 0, 1, 10, FREQUENCY, true, false, "", nullptr, 0, leaves, exclude_leaves);
ASSERT_EQ(1, leaves.size());
res = art_tree_destroy(&t);
@ -682,7 +689,7 @@ TEST(ArtTest, test_art_prefix_larger_than_key) {
std::string term = "earrings";
std::vector<art_leaf*> leaves;
art_fuzzy_search(&t, (const unsigned char *)(term.c_str()), term.size()+1, 0, 2, 10, FREQUENCY, false, nullptr, 0, leaves);
art_fuzzy_search(&t, (const unsigned char *)(term.c_str()), term.size()+1, 0, 2, 10, FREQUENCY, false, false, "", nullptr, 0, leaves, exclude_leaves);
ASSERT_EQ(0, leaves.size());
res = art_tree_destroy(&t);
@ -706,7 +713,7 @@ TEST(ArtTest, test_art_fuzzy_search_prefix_token_ordering) {
}
std::vector<art_leaf*> leaves;
art_fuzzy_search(&t, (const unsigned char *) "e", 1, 0, 0, 3, MAX_SCORE, true, nullptr, 0, leaves);
art_fuzzy_search(&t, (const unsigned char *) "e", 1, 0, 0, 3, MAX_SCORE, true, false, "", nullptr, 0, leaves, exclude_leaves);
std::string first_key(reinterpret_cast<char*>(leaves[0]->key), leaves[0]->key_len - 1);
ASSERT_EQ("e", first_key);
@ -718,7 +725,8 @@ TEST(ArtTest, test_art_fuzzy_search_prefix_token_ordering) {
ASSERT_EQ("elephant", third_key);
leaves.clear();
art_fuzzy_search(&t, (const unsigned char *) "enter", 5, 1, 1, 3, MAX_SCORE, true, nullptr, 0, leaves);
exclude_leaves.clear();
art_fuzzy_search(&t, (const unsigned char *) "enter", 5, 1, 1, 3, MAX_SCORE, true, false, "", nullptr, 0, leaves, exclude_leaves);
ASSERT_TRUE(leaves.empty());
res = art_tree_destroy(&t);
@ -747,56 +755,65 @@ TEST(ArtTest, test_art_fuzzy_search) {
auto begin = std::chrono::high_resolution_clock::now();
leaves.clear();
art_fuzzy_search(&t, (const unsigned char *) "pltinum", strlen("pltinum"), 0, 1, 10, FREQUENCY, true, nullptr, 0, leaves);
exclude_leaves.clear();
art_fuzzy_search(&t, (const unsigned char *) "pltinum", strlen("pltinum"), 0, 1, 10, FREQUENCY, true, false, "", nullptr, 0, leaves, exclude_leaves);
ASSERT_EQ(2, leaves.size());
ASSERT_STREQ("platinumsmith", (const char *)leaves.at(0)->key);
ASSERT_STREQ("platinum", (const char *)leaves.at(1)->key);
leaves.clear();
exclude_leaves.clear();
// extra char
art_fuzzy_search(&t, (const unsigned char *) "higghliving", strlen("higghliving") + 1, 0, 1, 10, FREQUENCY, false, nullptr, 0, leaves);
art_fuzzy_search(&t, (const unsigned char *) "higghliving", strlen("higghliving") + 1, 0, 1, 10, FREQUENCY, false, false, "", nullptr, 0, leaves, exclude_leaves);
ASSERT_EQ(1, leaves.size());
ASSERT_STREQ("highliving", (const char *)leaves.at(0)->key);
// transpose
leaves.clear();
art_fuzzy_search(&t, (const unsigned char *) "zymosthneic", strlen("zymosthneic") + 1, 0, 1, 10, FREQUENCY, false, nullptr, 0, leaves);
exclude_leaves.clear();
art_fuzzy_search(&t, (const unsigned char *) "zymosthneic", strlen("zymosthneic") + 1, 0, 1, 10, FREQUENCY, false, false, "", nullptr, 0, leaves, exclude_leaves);
ASSERT_EQ(1, leaves.size());
ASSERT_STREQ("zymosthenic", (const char *)leaves.at(0)->key);
// transpose + missing
leaves.clear();
art_fuzzy_search(&t, (const unsigned char *) "dacrcyystlgia", strlen("dacrcyystlgia") + 1, 0, 2, 10, FREQUENCY, false, nullptr, 0, leaves);
exclude_leaves.clear();
art_fuzzy_search(&t, (const unsigned char *) "dacrcyystlgia", strlen("dacrcyystlgia") + 1, 0, 2, 10, FREQUENCY, false, false, "", nullptr, 0, leaves, exclude_leaves);
ASSERT_EQ(1, leaves.size());
ASSERT_STREQ("dacrycystalgia", (const char *)leaves.at(0)->key);
leaves.clear();
art_fuzzy_search(&t, (const unsigned char *) "dacrcyystlgia", strlen("dacrcyystlgia") + 1, 1, 2, 10, FREQUENCY, false, nullptr, 0, leaves);
exclude_leaves.clear();
art_fuzzy_search(&t, (const unsigned char *) "dacrcyystlgia", strlen("dacrcyystlgia") + 1, 1, 2, 10, FREQUENCY, false, false, "", nullptr, 0, leaves, exclude_leaves);
ASSERT_EQ(1, leaves.size());
ASSERT_STREQ("dacrycystalgia", (const char *)leaves.at(0)->key);
// missing char
leaves.clear();
art_fuzzy_search(&t, (const unsigned char *) "gaberlunze", strlen("gaberlunze") + 1, 0, 1, 10, FREQUENCY, false, nullptr, 0, leaves);
exclude_leaves.clear();
art_fuzzy_search(&t, (const unsigned char *) "gaberlunze", strlen("gaberlunze") + 1, 0, 1, 10, FREQUENCY, false, false, "", nullptr, 0, leaves, exclude_leaves);
ASSERT_EQ(1, leaves.size());
ASSERT_STREQ("gaberlunzie", (const char *)leaves.at(0)->key);
// substituted char
leaves.clear();
art_fuzzy_search(&t, (const unsigned char *) "eacemiferous", strlen("eacemiferous") + 1, 0, 1, 10, FREQUENCY, false, nullptr, 0, leaves);
exclude_leaves.clear();
art_fuzzy_search(&t, (const unsigned char *) "eacemiferous", strlen("eacemiferous") + 1, 0, 1, 10, FREQUENCY, false, false, "", nullptr, 0, leaves, exclude_leaves);
ASSERT_EQ(1, leaves.size());
ASSERT_STREQ("racemiferous", (const char *)leaves.at(0)->key);
// missing char + extra char
leaves.clear();
art_fuzzy_search(&t, (const unsigned char *) "Sarbruckken", strlen("Sarbruckken") + 1, 0, 2, 10, FREQUENCY, false, nullptr, 0, leaves);
exclude_leaves.clear();
art_fuzzy_search(&t, (const unsigned char *) "Sarbruckken", strlen("Sarbruckken") + 1, 0, 2, 10, FREQUENCY, false, false, "", nullptr, 0, leaves, exclude_leaves);
ASSERT_EQ(1, leaves.size());
ASSERT_STREQ("Saarbrucken", (const char *)leaves.at(0)->key);
// multiple matching results
leaves.clear();
art_fuzzy_search(&t, (const unsigned char *) "hown", strlen("hown") + 1, 0, 1, 10, FREQUENCY, false, nullptr, 0, leaves);
exclude_leaves.clear();
art_fuzzy_search(&t, (const unsigned char *) "hown", strlen("hown") + 1, 0, 1, 10, FREQUENCY, false, false, "", nullptr, 0, leaves, exclude_leaves);
ASSERT_EQ(10, leaves.size());
std::set<std::string> expected_words = {"town", "sown", "mown", "lown", "howl", "howk", "howe", "how", "horn", "hoon"};
@ -809,23 +826,28 @@ TEST(ArtTest, test_art_fuzzy_search) {
// fuzzy prefix search
leaves.clear();
art_fuzzy_search(&t, (const unsigned char *) "lionhear", strlen("lionhear"), 0, 0, 10, FREQUENCY, true, nullptr, 0, leaves);
exclude_leaves.clear();
art_fuzzy_search(&t, (const unsigned char *) "lionhear", strlen("lionhear"), 0, 0, 10, FREQUENCY, true, false, "", nullptr, 0, leaves, exclude_leaves);
ASSERT_EQ(3, leaves.size());
leaves.clear();
art_fuzzy_search(&t, (const unsigned char *) "lineage", strlen("lineage"), 0, 0, 10, FREQUENCY, true, nullptr, 0, leaves);
exclude_leaves.clear();
art_fuzzy_search(&t, (const unsigned char *) "lineage", strlen("lineage"), 0, 0, 10, FREQUENCY, true, false, "", nullptr, 0, leaves, exclude_leaves);
ASSERT_EQ(2, leaves.size());
leaves.clear();
art_fuzzy_search(&t, (const unsigned char *) "liq", strlen("liq"), 0, 0, 50, FREQUENCY, true, nullptr, 0, leaves);
exclude_leaves.clear();
art_fuzzy_search(&t, (const unsigned char *) "liq", strlen("liq"), 0, 0, 50, FREQUENCY, true, false, "", nullptr, 0, leaves, exclude_leaves);
ASSERT_EQ(39, leaves.size());
leaves.clear();
art_fuzzy_search(&t, (const unsigned char *) "antitraditiana", strlen("antitraditiana"), 0, 1, 10, FREQUENCY, true, nullptr, 0, leaves);
exclude_leaves.clear();
art_fuzzy_search(&t, (const unsigned char *) "antitraditiana", strlen("antitraditiana"), 0, 1, 10, FREQUENCY, true, false, "", nullptr, 0, leaves, exclude_leaves);
ASSERT_EQ(1, leaves.size());
leaves.clear();
art_fuzzy_search(&t, (const unsigned char *) "antisocao", strlen("antisocao"), 0, 2, 10, FREQUENCY, true, nullptr, 0, leaves);
exclude_leaves.clear();
art_fuzzy_search(&t, (const unsigned char *) "antisocao", strlen("antisocao"), 0, 2, 10, FREQUENCY, true, false, "", nullptr, 0, leaves, exclude_leaves);
ASSERT_EQ(6, leaves.size());
long long int timeMillis = std::chrono::duration_cast<std::chrono::milliseconds>(
@ -855,7 +877,7 @@ TEST(ArtTest, test_art_fuzzy_search_unicode_chars) {
EXPECT_EQ(1, posting_t::first_id(l->values));
std::vector<art_leaf*> leaves;
art_fuzzy_search(&t, (unsigned char *)key, strlen(key), 0, 0, 10, FREQUENCY, true, nullptr, 0, leaves);
art_fuzzy_search(&t, (unsigned char *)key, strlen(key), 0, 0, 10, FREQUENCY, true, false, "", nullptr, 0, leaves, exclude_leaves);
ASSERT_EQ(1, leaves.size());
}
@ -879,7 +901,7 @@ TEST(ArtTest, test_art_fuzzy_search_extra_chars) {
const char* query = "abbreviation";
std::vector<art_leaf*> leaves;
art_fuzzy_search(&t, (unsigned char *)query, strlen(query), 0, 2, 10, FREQUENCY, true, nullptr, 0, leaves);
art_fuzzy_search(&t, (unsigned char *)query, strlen(query), 0, 2, 10, FREQUENCY, true, false, "", nullptr, 0, leaves, exclude_leaves);
ASSERT_EQ(1, leaves.size());
res = art_tree_destroy(&t);
@ -918,15 +940,16 @@ TEST(ArtTest, test_art_search_sku_like_tokens) {
for (const auto &key : keys) {
std::vector<art_leaf *> leaves;
art_fuzzy_search(&t, (const unsigned char*)key.c_str(), key.size(), 0, 0, 10,
FREQUENCY, true, nullptr, 0, leaves);
FREQUENCY, true, false, "", nullptr, 0, leaves, exclude_leaves);
ASSERT_EQ(1, leaves.size());
ASSERT_STREQ(key.c_str(), (const char *) leaves.at(0)->key);
leaves.clear();
exclude_leaves.clear();
// non prefix
art_fuzzy_search(&t, (const unsigned char*)key.c_str(), key.size()+1, 0, 0, 10,
FREQUENCY, false, nullptr, 0, leaves);
FREQUENCY, false, false, "", nullptr, 0, leaves, exclude_leaves);
ASSERT_EQ(1, leaves.size());
ASSERT_STREQ(key.c_str(), (const char *) leaves.at(0)->key);
}
@ -970,14 +993,17 @@ TEST(ArtTest, test_art_search_ill_like_tokens) {
std::make_pair("ice", 2),
};
std::string key = "input";
for (const auto &key : keys) {
art_leaf* l = (art_leaf *) art_search(&t, (const unsigned char *)key.c_str(), key.size()+1);
ASSERT_FALSE(l == nullptr);
EXPECT_EQ(1, posting_t::num_ids(l->values));
std::vector<art_leaf *> leaves;
exclude_leaves.clear();
art_fuzzy_search(&t, (const unsigned char*)key.c_str(), key.size(), 0, 0, 10,
FREQUENCY, true, nullptr, 0, leaves);
FREQUENCY, true, false, "", nullptr, 0, leaves, exclude_leaves);
if(key_to_count.count(key) != 0) {
ASSERT_EQ(key_to_count[key], leaves.size());
@ -987,10 +1013,14 @@ TEST(ArtTest, test_art_search_ill_like_tokens) {
}
leaves.clear();
exclude_leaves.clear();
// non prefix
art_fuzzy_search(&t, (const unsigned char*)key.c_str(), key.size()+1, 0, 0, 10,
FREQUENCY, false, nullptr, 0, leaves);
FREQUENCY, false, false, "", nullptr, 0, leaves, exclude_leaves);
if(leaves.size() != 1) {
LOG(INFO) << key;
}
ASSERT_EQ(1, leaves.size());
ASSERT_STREQ(key.c_str(), (const char *) leaves.at(0)->key);
}
@ -1022,8 +1052,9 @@ TEST(ArtTest, test_art_search_ill_like_tokens2) {
EXPECT_EQ(1, posting_t::num_ids(l->values));
std::vector<art_leaf *> leaves;
exclude_leaves.clear();
art_fuzzy_search(&t, (const unsigned char*)key.c_str(), key.size(), 0, 0, 10,
FREQUENCY, true, nullptr, 0, leaves);
FREQUENCY, true, false, "", nullptr, 0, leaves, exclude_leaves);
if(key == "illustration") {
ASSERT_EQ(2, leaves.size());
@ -1033,10 +1064,11 @@ TEST(ArtTest, test_art_search_ill_like_tokens2) {
}
leaves.clear();
exclude_leaves.clear();
// non prefix
art_fuzzy_search(&t, (const unsigned char*)key.c_str(), key.size() + 1, 0, 0, 10,
FREQUENCY, false, nullptr, 0, leaves);
FREQUENCY, false, false, "", nullptr, 0, leaves, exclude_leaves);
ASSERT_EQ(1, leaves.size());
ASSERT_STREQ(key.c_str(), (const char *) leaves.at(0)->key);
}
@ -1059,12 +1091,12 @@ TEST(ArtTest, test_art_search_roche_chews) {
std::string term = "chews";
std::vector<art_leaf *> leaves;
art_fuzzy_search(&t, (const unsigned char*)term.c_str(), term.size(), 0, 2, 10,
FREQUENCY, true, nullptr, 0, leaves);
FREQUENCY, true, false, "", nullptr, 0, leaves, exclude_leaves);
ASSERT_EQ(0, leaves.size());
art_fuzzy_search(&t, (const unsigned char*)keys[0].c_str(), keys[0].size() + 1, 0, 0, 10,
FREQUENCY, false, nullptr, 0, leaves);
FREQUENCY, false, false, "", nullptr, 0, leaves, exclude_leaves);
ASSERT_EQ(1, leaves.size());
@ -1091,14 +1123,15 @@ TEST(ArtTest, test_art_search_raspberry) {
std::string q_raspberries = "raspberries";
art_fuzzy_search(&t, (const unsigned char*)q_raspberries.c_str(), q_raspberries.size(), 0, 2, 10,
FREQUENCY, true, nullptr, 0, leaves);
FREQUENCY, true, false, "", nullptr, 0, leaves, exclude_leaves);
ASSERT_EQ(2, leaves.size());
leaves.clear();
exclude_leaves.clear();
std::string q_raspberry = "raspberry";
art_fuzzy_search(&t, (const unsigned char*)q_raspberry.c_str(), q_raspberry.size(), 0, 2, 10,
FREQUENCY, true, nullptr, 0, leaves);
FREQUENCY, true, false, "", nullptr, 0, leaves, exclude_leaves);
ASSERT_EQ(2, leaves.size());
res = art_tree_destroy(&t);
@ -1124,13 +1157,16 @@ TEST(ArtTest, test_art_search_highliving) {
std::string query = "higghliving";
art_fuzzy_search(&t, (const unsigned char*)query.c_str(), query.size() + 1, 0, 1, 10,
FREQUENCY, false, nullptr, 0, leaves);
FREQUENCY, false, false, "", nullptr, 0, leaves, exclude_leaves);
ASSERT_EQ(1, leaves.size());
leaves.clear();
exclude_leaves.clear();
exclude_leaves.clear();
exclude_leaves.clear();
art_fuzzy_search(&t, (const unsigned char*)query.c_str(), query.size(), 0, 2, 10,
FREQUENCY, true, nullptr, 0, leaves);
FREQUENCY, true, false, "", nullptr, 0, leaves, exclude_leaves);
ASSERT_EQ(1, leaves.size());
res = art_tree_destroy(&t);

View File

@ -203,6 +203,8 @@ TEST_F(CollectionSpecificTest, ExactSingleFieldMatch) {
spp::sparse_hash_set<std::string>(), spp::sparse_hash_set<std::string>(), 10, "", 30,
4, "title", 10).get();
LOG(INFO) << results;
ASSERT_EQ(2, results["hits"].size());
ASSERT_EQ("0", results["hits"][0]["document"]["id"].get<std::string>());
ASSERT_EQ("1", results["hits"][1]["document"]["id"].get<std::string>());

View File

@ -476,7 +476,7 @@ TEST_F(CollectionTest, TextContainingAnActualTypo) {
ASSERT_EQ(4, results["hits"].size());
ASSERT_EQ(11, results["found"].get<uint32_t>());
std::vector<std::string> ids = {"19", "22", "6", "13"};
std::vector<std::string> ids = {"19", "6", "21", "22"};
for(size_t i = 0; i < results["hits"].size(); i++) {
nlohmann::json result = results["hits"].at(i);