Mirror of https://github.com/typesense/typesense.git, synced 2025-05-21 22:33:27 +08:00

Merge branch 'v0.25-join' into v0.25

This commit is contained in: commit 807a95b383

@@ -1,4 +1,4 @@
FROM ubuntu:20.04
FROM ubuntu:22.04

RUN apt-get -y update && apt-get -y install ca-certificates
@@ -276,9 +276,10 @@ int art_iter_prefix(art_tree *t, const unsigned char *prefix, int prefix_len, ar
 * Returns leaves that match a given string within a fuzzy distance of max_cost.
 */
int art_fuzzy_search(art_tree *t, const unsigned char *term, const int term_len, const int min_cost, const int max_cost,
                     const int max_words, const token_ordering token_order, const bool prefix,
                     const uint32_t *filter_ids, size_t filter_ids_length,
                     std::vector<art_leaf *> &results, const std::set<std::string>& exclude_leaves = {});
                     const size_t max_words, const token_ordering token_order,
                     const bool prefix, bool last_token, const std::string& prev_token,
                     const uint32_t *filter_ids, const size_t filter_ids_length,
                     std::vector<art_leaf *> &results, std::set<std::string>& exclude_leaves);

void encode_int32(int32_t n, unsigned char *chars);
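For illustration, a minimal sketch of a call against the updated declaration. The tree, the query literals, and the filter buffer below are assumptions for the example, not part of this commit; only the parameter order follows the new signature above.

    #include "art.h"
    #include <set>
    #include <vector>

    // Sketch only: assumes an initialized art_tree `t` and an optional
    // pre-computed `filter_ids` buffer (pass nullptr/0 when unfiltered).
    std::vector<art_leaf*> leaves;
    std::set<std::string> exclude_leaves;

    // Fuzzy-prefix search for a possibly misspelled second token "jackt",
    // restricted to documents that also contain the previous token "blue".
    art_fuzzy_search(&t, reinterpret_cast<const unsigned char*>("jackt"), 5,
                     0, 2,              // min_cost, max_cost (typo budget)
                     10, FREQUENCY,     // max_words, token_order
                     true,              // prefix
                     true, "blue",      // last_token, prev_token
                     filter_ids, filter_ids_length,
                     leaves, exclude_leaves);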
@@ -317,7 +317,6 @@ struct field {
        if (!field.reference.empty()) {
            field_val[fields::reference] = field.reference;
        }

        if(field.create_from.size() > 0) {
            field_val[fields::create_from] = field.create_from;
            if(field.model_path.size() > 0) {
src/art.cpp
@@ -21,6 +21,7 @@
#include <posting.h>
#include "art.h"
#include "logger.h"
#include "array_utils.h"

/**
 * Macros to manipulate pointer tags
@@ -940,10 +941,69 @@ void* art_delete(art_tree *t, const unsigned char *key, int key_len) {
    return child->max_token_count;
}*/

const uint32_t* get_allowed_doc_ids(art_tree *t, const std::string& prev_token,
                                    const uint32_t* filter_ids, const size_t filter_ids_length,
                                    size_t& prev_token_doc_ids_len) {

    art_leaf* prev_leaf = static_cast<art_leaf*>(
        art_search(t, reinterpret_cast<const unsigned char*>(prev_token.c_str()), prev_token.size() + 1)
    );

    if(prev_token.empty() || !prev_leaf) {
        prev_token_doc_ids_len = filter_ids_length;
        return filter_ids;
    }

    std::vector<uint32_t> prev_leaf_ids;
    posting_t::merge({prev_leaf->values}, prev_leaf_ids);

    uint32_t* prev_token_doc_ids = nullptr;

    if(filter_ids_length != 0) {
        prev_token_doc_ids_len = ArrayUtils::and_scalar(prev_leaf_ids.data(), prev_leaf_ids.size(),
                                                        filter_ids, filter_ids_length,
                                                        &prev_token_doc_ids);
    } else {
        prev_token_doc_ids_len = prev_leaf_ids.size();
        prev_token_doc_ids = new uint32_t[prev_token_doc_ids_len];
        std::copy(prev_leaf_ids.begin(), prev_leaf_ids.end(), prev_token_doc_ids);
    }

    return prev_token_doc_ids;
}

bool validate_and_add_leaf(art_leaf* leaf, const bool last_token, const std::string& prev_token,
                           const uint32_t* allowed_doc_ids, const size_t allowed_doc_ids_len,
                           std::set<std::string>& exclude_leaves, const art_leaf* exact_leaf,
                           std::vector<art_leaf *>& results) {

    if(leaf == exact_leaf) {
        return false;
    }

    std::string tok(reinterpret_cast<char*>(leaf->key), leaf->key_len - 1);
    if(exclude_leaves.count(tok) != 0) {
        return false;
    }

    if(allowed_doc_ids_len != 0) {
        if(!posting_t::contains_atleast_one(leaf->values, allowed_doc_ids,
                                            allowed_doc_ids_len)) {
            return false;
        }
    }

    exclude_leaves.emplace(tok);
    results.push_back(leaf);

    return true;
}

int art_topk_iter(const art_node *root, token_ordering token_order, size_t max_results,
                  const uint32_t* filter_ids, size_t filter_ids_length,
                  const std::set<std::string>& exclude_leaves, const art_leaf* exact_leaf,
                  std::vector<art_leaf *>& results) {
                  const art_leaf* exact_leaf,
                  const bool last_token, const std::string& prev_token,
                  const uint32_t* allowed_doc_ids, size_t allowed_doc_ids_len,
                  const art_tree* t, std::set<std::string>& exclude_leaves, std::vector<art_leaf *>& results) {

    printf("INSIDE art_topk_iter: root->type: %d\n", root->type);
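The helper above establishes an ownership convention: when there is no usable previous token, get_allowed_doc_ids hands back the caller's filter_ids pointer unchanged; otherwise it heap-allocates either the intersection or a copy of the previous token's postings. A minimal sketch of the resulting caller pattern, mirroring the code added further down in this file, with the surrounding scaffolding assumed to be in scope:

    // Sketch only: `t`, `prev_token`, `filter_ids` and `filter_ids_length`
    // are assumed to already exist in the caller.
    size_t allowed_doc_ids_len = 0;
    const uint32_t* allowed_doc_ids = get_allowed_doc_ids(t, prev_token,
                                                          filter_ids, filter_ids_length,
                                                          allowed_doc_ids_len);

    // ... pass allowed_doc_ids / allowed_doc_ids_len to art_topk_iter ...

    if(allowed_doc_ids != filter_ids) {
        delete [] allowed_doc_ids;   // only the heap-allocated case is owned here
    }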
@@ -957,6 +1017,8 @@ int art_topk_iter(const art_node *root, token_ordering token_order, size_t max_r
    q.push(root);

    size_t num_processed = 0;

    while(!q.empty() && results.size() < max_results*4) {
        art_node *n = (art_node *) q.top();
        q.pop();
@@ -972,23 +1034,13 @@ int art_topk_iter(const art_node *root, token_ordering token_order, size_t max_r
        if (IS_LEAF(n)) {
            art_leaf *l = (art_leaf *) LEAF_RAW(n);
            //LOG(INFO) << "END LEAF SCORE: " << l->max_score;
            validate_and_add_leaf(l, last_token, prev_token, allowed_doc_ids, allowed_doc_ids_len,
                                  exclude_leaves, exact_leaf, results);

            if(filter_ids_length == 0) {
                std::string tok(reinterpret_cast<char*>(l->key), l->key_len - 1);
                if(exclude_leaves.count(tok) != 0 || l == exact_leaf) {
                    continue;
                }
                results.push_back(l);
            } else {
                // we will push leaf only if filter matches with leaf IDs
                bool found_atleast_one = posting_t::contains_atleast_one(l->values, filter_ids, filter_ids_length);
                if(found_atleast_one) {
                    std::string tok(reinterpret_cast<char*>(l->key), l->key_len - 1);
                    if(exclude_leaves.count(tok) != 0 || l == exact_leaf) {
                        continue;
                    }
                    results.push_back(l);
                }

            if (++num_processed % 1024 == 0 && (microseconds(
                    std::chrono::system_clock::now().time_since_epoch()).count() - search_begin_us) > search_stop_us) {
                search_cutoff = true;
                break;
            }

            continue;
@@ -1491,9 +1543,10 @@ static void art_fuzzy_recurse(unsigned char p, unsigned char c, const art_node *
 * Returns leaves that match a given string within a fuzzy distance of max_cost.
 */
int art_fuzzy_search(art_tree *t, const unsigned char *term, const int term_len, const int min_cost, const int max_cost,
                     const int max_words, const token_ordering token_order, const bool prefix,
                     const uint32_t *filter_ids, size_t filter_ids_length,
                     std::vector<art_leaf *> &results, const std::set<std::string>& exclude_leaves) {
                     const size_t max_words, const token_ordering token_order, const bool prefix,
                     bool last_token, const std::string& prev_token,
                     const uint32_t *filter_ids, const size_t filter_ids_length,
                     std::vector<art_leaf *> &results, std::set<std::string>& exclude_leaves) {

    std::vector<const art_node*> nodes;
    int irow[term_len + 1];
@@ -1525,8 +1578,15 @@ int art_fuzzy_search(art_tree *t, const unsigned char *term, const int term_len,
    art_leaf* exact_leaf = (art_leaf *) art_search(t, term, key_len);
    //LOG(INFO) << "exact_leaf: " << exact_leaf << ", term: " << term << ", term_len: " << term_len;

    // documents that contain the previous token and/or filter ids
    size_t allowed_doc_ids_len = 0;
    const uint32_t* allowed_doc_ids = get_allowed_doc_ids(t, prev_token, filter_ids, filter_ids_length,
                                                          allowed_doc_ids_len);

    for(auto node: nodes) {
        art_topk_iter(node, token_order, max_words, filter_ids, filter_ids_length, exclude_leaves, exact_leaf, results);
        art_topk_iter(node, token_order, max_words, exact_leaf,
                      last_token, prev_token, allowed_doc_ids, allowed_doc_ids_len,
                      t, exclude_leaves, results);
    }

    if(token_order == FREQUENCY) {
@@ -1536,7 +1596,11 @@ int art_fuzzy_search(art_tree *t, const unsigned char *term, const int term_len,
    }

    if(exact_leaf && min_cost == 0) {
        results.insert(results.begin(), exact_leaf);
        std::string tok(reinterpret_cast<char*>(exact_leaf->key), exact_leaf->key_len - 1);
        if(exclude_leaves.count(tok) == 0) {
            results.insert(results.begin(), exact_leaf);
            exclude_leaves.emplace(tok);
        }
    }

    if(results.size() > max_words) {
@@ -1551,6 +1615,10 @@ int art_fuzzy_search(art_tree *t, const unsigned char *term, const int term_len,
            << ", filter_ids_length: " << filter_ids_length;
    }*/

    if(allowed_doc_ids != filter_ids) {
        delete [] allowed_doc_ids;
    }

    return 0;
}
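One behavioural consequence of the new signature: exclude_leaves is no longer const. Both validate_and_add_leaf and the exact-match branch above record every accepted token in it, so a set reused across unrelated queries must be cleared in between, which is exactly what the updated tests below do. A small sketch of that pattern; the tree and the query strings here are assumed for the example:

    std::set<std::string> exclude_leaves;
    std::vector<art_leaf*> leaves;

    art_fuzzy_search(&t, (const unsigned char*)"platinum", 8, 0, 1, 10, FREQUENCY,
                     true, false, "", nullptr, 0, leaves, exclude_leaves);

    leaves.clear();
    exclude_leaves.clear();   // otherwise tokens accepted above are skipped next time

    art_fuzzy_search(&t, (const unsigned char*)"gold", 4, 0, 1, 10, FREQUENCY,
                     true, false, "", nullptr, 0, leaves, exclude_leaves);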
@@ -246,7 +246,6 @@ nlohmann::json Collection::get_summary_json() const {
            field_json[fields::reference] = coll_field.reference;
        }

        fields_arr.push_back(field_json);
    }
@@ -3422,12 +3422,12 @@ void Index::fuzzy_search_fields(const std::vector<search_field_t>& the_fields,
    }

    //LOG(INFO) << "Searching for field: " << the_field.name << ", found token:" << token;
    const auto& prev_token = last_token ? token_candidates_vec.back().candidates[0] : "";

    std::vector<art_leaf*> field_leaves;
    int max_words = 100000;
    art_fuzzy_search(search_index.at(the_field.name), (const unsigned char *) token.c_str(), token_len,
                     costs[token_index], costs[token_index], max_words, token_order, prefix_search,
                     filter_ids, filter_ids_length, field_leaves, unique_tokens);
                     costs[token_index], costs[token_index], max_candidates, token_order, prefix_search,
                     last_token, prev_token, filter_ids, filter_ids_length, field_leaves, unique_tokens);

    /*auto timeMillis = std::chrono::duration_cast<std::chrono::milliseconds>(
            std::chrono::high_resolution_clock::now() - begin).count();
@@ -3438,60 +3438,17 @@ void Index::fuzzy_search_fields(const std::vector<search_field_t>& the_fields,
        continue;
    }

    uint32_t* prev_token_doc_ids = nullptr; // documents that contain the previous token
    size_t prev_token_doc_ids_len = 0;

    if(last_token) {
        auto& prev_token = token_candidates_vec.back().candidates[0];
        art_leaf* prev_leaf = static_cast<art_leaf*>(
            art_search(search_index.at(the_field.name),
                       reinterpret_cast<const unsigned char*>(prev_token.c_str()),
                       prev_token.size() + 1));

        if(!prev_leaf) {
            continue;
        }

        std::vector<uint32_t> prev_leaf_ids;
        posting_t::merge({prev_leaf->values}, prev_leaf_ids);

        if(filter_ids_length != 0) {
            prev_token_doc_ids_len = ArrayUtils::and_scalar(prev_leaf_ids.data(), prev_leaf_ids.size(),
                                                            filter_ids, filter_ids_length,
                                                            &prev_token_doc_ids);
        } else {
            prev_token_doc_ids_len = prev_leaf_ids.size();
            prev_token_doc_ids = new uint32_t[prev_token_doc_ids_len];
            std::copy(prev_leaf_ids.begin(), prev_leaf_ids.end(), prev_token_doc_ids);
        }
    }

    for(size_t i = 0; i < field_leaves.size(); i++) {
        auto leaf = field_leaves[i];
        std::string tok(reinterpret_cast<char*>(leaf->key), leaf->key_len - 1);
        if(unique_tokens.count(tok) == 0) {
            if(last_token) {
                if(!posting_t::contains_atleast_one(leaf->values, prev_token_doc_ids,
                                                    prev_token_doc_ids_len)) {
                    continue;
                }
            }

            unique_tokens.emplace(tok);
            leaf_tokens.push_back(tok);
        }

        if(leaf_tokens.size() >= max_candidates) {
            token_cost_cache.emplace(token_cost_hash, leaf_tokens);
            delete [] prev_token_doc_ids;
            prev_token_doc_ids = nullptr;
            goto token_done;
        }
        leaf_tokens.push_back(tok);
    }

    token_cost_cache.emplace(token_cost_hash, leaf_tokens);
    delete [] prev_token_doc_ids;
    prev_token_doc_ids = nullptr;

    if(leaf_tokens.size() >= max_candidates) {
        goto token_done;
    }
}

if(last_token && leaf_tokens.size() < max_candidates) {
@@ -3520,10 +3477,9 @@ void Index::fuzzy_search_fields(const std::vector<search_field_t>& the_fields,
    }

    std::vector<art_leaf*> field_leaves;
    int max_words = 100000;
    art_fuzzy_search(search_index.at(the_field.name), (const unsigned char *) token.c_str(), token_len,
                     costs[token_index], costs[token_index], max_words, token_order, prefix_search,
                     filter_ids, filter_ids_length, field_leaves, unique_tokens);
                     costs[token_index], costs[token_index], max_candidates, token_order, prefix_search,
                     false, "", filter_ids, filter_ids_length, field_leaves, unique_tokens);

    if(field_leaves.empty()) {
        // look at the next field
@@ -3533,23 +3489,14 @@ void Index::fuzzy_search_fields(const std::vector<search_field_t>& the_fields,
    for(size_t i = 0; i < field_leaves.size(); i++) {
        auto leaf = field_leaves[i];
        std::string tok(reinterpret_cast<char*>(leaf->key), leaf->key_len - 1);
        if(unique_tokens.count(tok) == 0) {
            if(!posting_t::contains_atleast_one(leaf->values, &prev_token_doc_ids[0],
                                                prev_token_doc_ids.size())) {
                continue;
            }

            unique_tokens.emplace(tok);
            leaf_tokens.push_back(tok);
        }

        if(leaf_tokens.size() >= max_candidates) {
            token_cost_cache.emplace(token_cost_hash, leaf_tokens);
            goto token_done;
        }
        leaf_tokens.push_back(tok);
    }

    token_cost_cache.emplace(token_cost_hash, leaf_tokens);

    if(leaf_tokens.size() >= max_candidates) {
        goto token_done;
    }
}
}
}
@@ -4935,7 +4882,7 @@ void Index::search_field(const uint8_t & field_id,
    // need less candidates for filtered searches since we already only pick tokens with results
    art_fuzzy_search(search_index.at(field_name), (const unsigned char *) token.c_str(), token_len,
                     costs[token_index], costs[token_index], max_candidates, token_order, prefix_search,
                     filter_ids, filter_ids_length, leaves, unique_tokens);
                     false, "", filter_ids, filter_ids_length, leaves, unique_tokens);

    /*auto timeMillis = std::chrono::duration_cast<std::chrono::milliseconds>(
            std::chrono::high_resolution_clock::now() - begin).count();
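With the previous-token intersection moved into the ART layer, the Index call sites reduce to two shapes: the last token of a query passes last_token together with prev_token, while every other call passes false and an empty prev_token, as in the hunk above. A condensed sketch of the two shapes, with the surrounding variables assumed from the code shown:

    // Last token of a multi-token query: constrain candidates by prev_token.
    art_fuzzy_search(search_index.at(the_field.name), (const unsigned char *) token.c_str(), token_len,
                     costs[token_index], costs[token_index], max_candidates, token_order, prefix_search,
                     last_token, prev_token, filter_ids, filter_ids_length, field_leaves, unique_tokens);

    // Any other token (or single-token query): no previous-token constraint.
    art_fuzzy_search(search_index.at(field_name), (const unsigned char *) token.c_str(), token_len,
                     costs[token_index], costs[token_index], max_candidates, token_order, prefix_search,
                     false, "", filter_ids, filter_ids_length, leaves, unique_tokens);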
@@ -18,6 +18,8 @@ art_document get_document(uint32_t id) {
    return document;
}

std::set<std::string> exclude_leaves;

TEST(ArtTest, test_art_init_and_destroy) {
    art_tree t;
    int res = art_tree_init(&t);
@@ -587,22 +589,25 @@ TEST(ArtTest, test_art_fuzzy_search_single_leaf) {
    EXPECT_EQ(1, posting_t::first_id(l->values));

    std::vector<art_leaf*> leaves;
    art_fuzzy_search(&t, (const unsigned char *) implement_key, strlen(implement_key) + 1, 0, 0, 10, FREQUENCY, false, nullptr, 0, leaves);
    art_fuzzy_search(&t, (const unsigned char *) implement_key, strlen(implement_key) + 1, 0, 0, 10, FREQUENCY, false, false, "", nullptr, 0, leaves, exclude_leaves);
    ASSERT_EQ(1, leaves.size());

    const char* implement_key_typo1 = "implment";
    const char* implement_key_typo2 = "implwnent";

    leaves.clear();
    art_fuzzy_search(&t, (const unsigned char *) implement_key_typo1, strlen(implement_key_typo1) + 1, 0, 0, 10, FREQUENCY, false, nullptr, 0, leaves);
    exclude_leaves.clear();
    art_fuzzy_search(&t, (const unsigned char *) implement_key_typo1, strlen(implement_key_typo1) + 1, 0, 0, 10, FREQUENCY, false, false, "", nullptr, 0, leaves, exclude_leaves);
    ASSERT_EQ(0, leaves.size());

    leaves.clear();
    art_fuzzy_search(&t, (const unsigned char *) implement_key_typo1, strlen(implement_key_typo1) + 1, 0, 1, 10, FREQUENCY, false, nullptr, 0, leaves);
    exclude_leaves.clear();
    art_fuzzy_search(&t, (const unsigned char *) implement_key_typo1, strlen(implement_key_typo1) + 1, 0, 1, 10, FREQUENCY, false, false, "", nullptr, 0, leaves, exclude_leaves);
    ASSERT_EQ(1, leaves.size());

    leaves.clear();
    art_fuzzy_search(&t, (const unsigned char *) implement_key_typo2, strlen(implement_key_typo2) + 1, 0, 2, 10, FREQUENCY, false, nullptr, 0, leaves);
    exclude_leaves.clear();
    art_fuzzy_search(&t, (const unsigned char *) implement_key_typo2, strlen(implement_key_typo2) + 1, 0, 2, 10, FREQUENCY, false, false, "", nullptr, 0, leaves, exclude_leaves);
    ASSERT_EQ(1, leaves.size());

    res = art_tree_destroy(&t);
@@ -623,11 +628,12 @@ TEST(ArtTest, test_art_fuzzy_search_single_leaf_prefix) {

    std::vector<art_leaf*> leaves;
    std::string term = "aplication";
    art_fuzzy_search(&t, (const unsigned char *)(term.c_str()), term.size(), 0, 1, 10, FREQUENCY, true, nullptr, 0, leaves);
    art_fuzzy_search(&t, (const unsigned char *)(term.c_str()), term.size(), 0, 1, 10, FREQUENCY, true, false, "", nullptr, 0, leaves, exclude_leaves);
    ASSERT_EQ(1, leaves.size());

    leaves.clear();
    art_fuzzy_search(&t, (const unsigned char *)(term.c_str()), term.size(), 0, 2, 10, FREQUENCY, true, nullptr, 0, leaves);
    exclude_leaves.clear();
    art_fuzzy_search(&t, (const unsigned char *)(term.c_str()), term.size(), 0, 2, 10, FREQUENCY, true, false, "", nullptr, 0, leaves, exclude_leaves);
    ASSERT_EQ(1, leaves.size());

    res = art_tree_destroy(&t);
@@ -645,7 +651,7 @@ TEST(ArtTest, test_art_fuzzy_search_single_leaf_qlen_greater_than_key) {

    std::string term = "starkbin";
    std::vector<art_leaf*> leaves;
    art_fuzzy_search(&t, (const unsigned char *)(term.c_str()), term.size(), 0, 2, 10, FREQUENCY, true, nullptr, 0, leaves);
    art_fuzzy_search(&t, (const unsigned char *)(term.c_str()), term.size(), 0, 2, 10, FREQUENCY, true, false, "", nullptr, 0, leaves, exclude_leaves);
    ASSERT_EQ(0, leaves.size());
}
@@ -660,11 +666,12 @@ TEST(ArtTest, test_art_fuzzy_search_single_leaf_non_prefix) {

    std::string term = "spz";
    std::vector<art_leaf*> leaves;
    art_fuzzy_search(&t, (const unsigned char *)(term.c_str()), term.size()+1, 0, 1, 10, FREQUENCY, false, nullptr, 0, leaves);
    art_fuzzy_search(&t, (const unsigned char *)(term.c_str()), term.size()+1, 0, 1, 10, FREQUENCY, false, false, "", nullptr, 0, leaves, exclude_leaves);
    ASSERT_EQ(0, leaves.size());

    leaves.clear();
    art_fuzzy_search(&t, (const unsigned char *)(term.c_str()), term.size(), 0, 1, 10, FREQUENCY, true, nullptr, 0, leaves);
    exclude_leaves.clear();
    art_fuzzy_search(&t, (const unsigned char *)(term.c_str()), term.size(), 0, 1, 10, FREQUENCY, true, false, "", nullptr, 0, leaves, exclude_leaves);
    ASSERT_EQ(1, leaves.size());

    res = art_tree_destroy(&t);
@@ -682,7 +689,7 @@ TEST(ArtTest, test_art_prefix_larger_than_key) {

    std::string term = "earrings";
    std::vector<art_leaf*> leaves;
    art_fuzzy_search(&t, (const unsigned char *)(term.c_str()), term.size()+1, 0, 2, 10, FREQUENCY, false, nullptr, 0, leaves);
    art_fuzzy_search(&t, (const unsigned char *)(term.c_str()), term.size()+1, 0, 2, 10, FREQUENCY, false, false, "", nullptr, 0, leaves, exclude_leaves);
    ASSERT_EQ(0, leaves.size());

    res = art_tree_destroy(&t);
@@ -706,7 +713,7 @@ TEST(ArtTest, test_art_fuzzy_search_prefix_token_ordering) {
    }

    std::vector<art_leaf*> leaves;
    art_fuzzy_search(&t, (const unsigned char *) "e", 1, 0, 0, 3, MAX_SCORE, true, nullptr, 0, leaves);
    art_fuzzy_search(&t, (const unsigned char *) "e", 1, 0, 0, 3, MAX_SCORE, true, false, "", nullptr, 0, leaves, exclude_leaves);

    std::string first_key(reinterpret_cast<char*>(leaves[0]->key), leaves[0]->key_len - 1);
    ASSERT_EQ("e", first_key);
@@ -718,7 +725,8 @@ TEST(ArtTest, test_art_fuzzy_search_prefix_token_ordering) {
    ASSERT_EQ("elephant", third_key);

    leaves.clear();
    art_fuzzy_search(&t, (const unsigned char *) "enter", 5, 1, 1, 3, MAX_SCORE, true, nullptr, 0, leaves);
    exclude_leaves.clear();
    art_fuzzy_search(&t, (const unsigned char *) "enter", 5, 1, 1, 3, MAX_SCORE, true, false, "", nullptr, 0, leaves, exclude_leaves);
    ASSERT_TRUE(leaves.empty());

    res = art_tree_destroy(&t);
@@ -747,56 +755,65 @@ TEST(ArtTest, test_art_fuzzy_search) {
    auto begin = std::chrono::high_resolution_clock::now();

    leaves.clear();
    art_fuzzy_search(&t, (const unsigned char *) "pltinum", strlen("pltinum"), 0, 1, 10, FREQUENCY, true, nullptr, 0, leaves);
    exclude_leaves.clear();
    art_fuzzy_search(&t, (const unsigned char *) "pltinum", strlen("pltinum"), 0, 1, 10, FREQUENCY, true, false, "", nullptr, 0, leaves, exclude_leaves);
    ASSERT_EQ(2, leaves.size());
    ASSERT_STREQ("platinumsmith", (const char *)leaves.at(0)->key);
    ASSERT_STREQ("platinum", (const char *)leaves.at(1)->key);

    leaves.clear();
    exclude_leaves.clear();

    // extra char
    art_fuzzy_search(&t, (const unsigned char *) "higghliving", strlen("higghliving") + 1, 0, 1, 10, FREQUENCY, false, nullptr, 0, leaves);
    art_fuzzy_search(&t, (const unsigned char *) "higghliving", strlen("higghliving") + 1, 0, 1, 10, FREQUENCY, false, false, "", nullptr, 0, leaves, exclude_leaves);
    ASSERT_EQ(1, leaves.size());
    ASSERT_STREQ("highliving", (const char *)leaves.at(0)->key);

    // transpose
    leaves.clear();
    art_fuzzy_search(&t, (const unsigned char *) "zymosthneic", strlen("zymosthneic") + 1, 0, 1, 10, FREQUENCY, false, nullptr, 0, leaves);
    exclude_leaves.clear();
    art_fuzzy_search(&t, (const unsigned char *) "zymosthneic", strlen("zymosthneic") + 1, 0, 1, 10, FREQUENCY, false, false, "", nullptr, 0, leaves, exclude_leaves);
    ASSERT_EQ(1, leaves.size());
    ASSERT_STREQ("zymosthenic", (const char *)leaves.at(0)->key);

    // transpose + missing
    leaves.clear();
    art_fuzzy_search(&t, (const unsigned char *) "dacrcyystlgia", strlen("dacrcyystlgia") + 1, 0, 2, 10, FREQUENCY, false, nullptr, 0, leaves);
    exclude_leaves.clear();
    art_fuzzy_search(&t, (const unsigned char *) "dacrcyystlgia", strlen("dacrcyystlgia") + 1, 0, 2, 10, FREQUENCY, false, false, "", nullptr, 0, leaves, exclude_leaves);
    ASSERT_EQ(1, leaves.size());
    ASSERT_STREQ("dacrycystalgia", (const char *)leaves.at(0)->key);

    leaves.clear();
    art_fuzzy_search(&t, (const unsigned char *) "dacrcyystlgia", strlen("dacrcyystlgia") + 1, 1, 2, 10, FREQUENCY, false, nullptr, 0, leaves);
    exclude_leaves.clear();
    art_fuzzy_search(&t, (const unsigned char *) "dacrcyystlgia", strlen("dacrcyystlgia") + 1, 1, 2, 10, FREQUENCY, false, false, "", nullptr, 0, leaves, exclude_leaves);
    ASSERT_EQ(1, leaves.size());
    ASSERT_STREQ("dacrycystalgia", (const char *)leaves.at(0)->key);

    // missing char
    leaves.clear();
    art_fuzzy_search(&t, (const unsigned char *) "gaberlunze", strlen("gaberlunze") + 1, 0, 1, 10, FREQUENCY, false, nullptr, 0, leaves);
    exclude_leaves.clear();
    art_fuzzy_search(&t, (const unsigned char *) "gaberlunze", strlen("gaberlunze") + 1, 0, 1, 10, FREQUENCY, false, false, "", nullptr, 0, leaves, exclude_leaves);
    ASSERT_EQ(1, leaves.size());
    ASSERT_STREQ("gaberlunzie", (const char *)leaves.at(0)->key);

    // substituted char
    leaves.clear();
    art_fuzzy_search(&t, (const unsigned char *) "eacemiferous", strlen("eacemiferous") + 1, 0, 1, 10, FREQUENCY, false, nullptr, 0, leaves);
    exclude_leaves.clear();
    art_fuzzy_search(&t, (const unsigned char *) "eacemiferous", strlen("eacemiferous") + 1, 0, 1, 10, FREQUENCY, false, false, "", nullptr, 0, leaves, exclude_leaves);
    ASSERT_EQ(1, leaves.size());
    ASSERT_STREQ("racemiferous", (const char *)leaves.at(0)->key);

    // missing char + extra char
    leaves.clear();
    art_fuzzy_search(&t, (const unsigned char *) "Sarbruckken", strlen("Sarbruckken") + 1, 0, 2, 10, FREQUENCY, false, nullptr, 0, leaves);
    exclude_leaves.clear();
    art_fuzzy_search(&t, (const unsigned char *) "Sarbruckken", strlen("Sarbruckken") + 1, 0, 2, 10, FREQUENCY, false, false, "", nullptr, 0, leaves, exclude_leaves);
    ASSERT_EQ(1, leaves.size());
    ASSERT_STREQ("Saarbrucken", (const char *)leaves.at(0)->key);

    // multiple matching results
    leaves.clear();
    art_fuzzy_search(&t, (const unsigned char *) "hown", strlen("hown") + 1, 0, 1, 10, FREQUENCY, false, nullptr, 0, leaves);
    exclude_leaves.clear();
    art_fuzzy_search(&t, (const unsigned char *) "hown", strlen("hown") + 1, 0, 1, 10, FREQUENCY, false, false, "", nullptr, 0, leaves, exclude_leaves);
    ASSERT_EQ(10, leaves.size());

    std::set<std::string> expected_words = {"town", "sown", "mown", "lown", "howl", "howk", "howe", "how", "horn", "hoon"};
@@ -809,23 +826,28 @@ TEST(ArtTest, test_art_fuzzy_search) {

    // fuzzy prefix search
    leaves.clear();
    art_fuzzy_search(&t, (const unsigned char *) "lionhear", strlen("lionhear"), 0, 0, 10, FREQUENCY, true, nullptr, 0, leaves);
    exclude_leaves.clear();
    art_fuzzy_search(&t, (const unsigned char *) "lionhear", strlen("lionhear"), 0, 0, 10, FREQUENCY, true, false, "", nullptr, 0, leaves, exclude_leaves);
    ASSERT_EQ(3, leaves.size());

    leaves.clear();
    art_fuzzy_search(&t, (const unsigned char *) "lineage", strlen("lineage"), 0, 0, 10, FREQUENCY, true, nullptr, 0, leaves);
    exclude_leaves.clear();
    art_fuzzy_search(&t, (const unsigned char *) "lineage", strlen("lineage"), 0, 0, 10, FREQUENCY, true, false, "", nullptr, 0, leaves, exclude_leaves);
    ASSERT_EQ(2, leaves.size());

    leaves.clear();
    art_fuzzy_search(&t, (const unsigned char *) "liq", strlen("liq"), 0, 0, 50, FREQUENCY, true, nullptr, 0, leaves);
    exclude_leaves.clear();
    art_fuzzy_search(&t, (const unsigned char *) "liq", strlen("liq"), 0, 0, 50, FREQUENCY, true, false, "", nullptr, 0, leaves, exclude_leaves);
    ASSERT_EQ(39, leaves.size());

    leaves.clear();
    art_fuzzy_search(&t, (const unsigned char *) "antitraditiana", strlen("antitraditiana"), 0, 1, 10, FREQUENCY, true, nullptr, 0, leaves);
    exclude_leaves.clear();
    art_fuzzy_search(&t, (const unsigned char *) "antitraditiana", strlen("antitraditiana"), 0, 1, 10, FREQUENCY, true, false, "", nullptr, 0, leaves, exclude_leaves);
    ASSERT_EQ(1, leaves.size());

    leaves.clear();
    art_fuzzy_search(&t, (const unsigned char *) "antisocao", strlen("antisocao"), 0, 2, 10, FREQUENCY, true, nullptr, 0, leaves);
    exclude_leaves.clear();
    art_fuzzy_search(&t, (const unsigned char *) "antisocao", strlen("antisocao"), 0, 2, 10, FREQUENCY, true, false, "", nullptr, 0, leaves, exclude_leaves);
    ASSERT_EQ(6, leaves.size());

    long long int timeMillis = std::chrono::duration_cast<std::chrono::milliseconds>(
@@ -855,7 +877,7 @@ TEST(ArtTest, test_art_fuzzy_search_unicode_chars) {
    EXPECT_EQ(1, posting_t::first_id(l->values));

    std::vector<art_leaf*> leaves;
    art_fuzzy_search(&t, (unsigned char *)key, strlen(key), 0, 0, 10, FREQUENCY, true, nullptr, 0, leaves);
    art_fuzzy_search(&t, (unsigned char *)key, strlen(key), 0, 0, 10, FREQUENCY, true, false, "", nullptr, 0, leaves, exclude_leaves);
    ASSERT_EQ(1, leaves.size());
}
@@ -879,7 +901,7 @@ TEST(ArtTest, test_art_fuzzy_search_extra_chars) {

    const char* query = "abbreviation";
    std::vector<art_leaf*> leaves;
    art_fuzzy_search(&t, (unsigned char *)query, strlen(query), 0, 2, 10, FREQUENCY, true, nullptr, 0, leaves);
    art_fuzzy_search(&t, (unsigned char *)query, strlen(query), 0, 2, 10, FREQUENCY, true, false, "", nullptr, 0, leaves, exclude_leaves);
    ASSERT_EQ(1, leaves.size());

    res = art_tree_destroy(&t);
@@ -918,15 +940,16 @@ TEST(ArtTest, test_art_search_sku_like_tokens) {
    for (const auto &key : keys) {
        std::vector<art_leaf *> leaves;
        art_fuzzy_search(&t, (const unsigned char*)key.c_str(), key.size(), 0, 0, 10,
                         FREQUENCY, true, nullptr, 0, leaves);
                         FREQUENCY, true, false, "", nullptr, 0, leaves, exclude_leaves);
        ASSERT_EQ(1, leaves.size());
        ASSERT_STREQ(key.c_str(), (const char *) leaves.at(0)->key);

        leaves.clear();
        exclude_leaves.clear();

        // non prefix
        art_fuzzy_search(&t, (const unsigned char*)key.c_str(), key.size()+1, 0, 0, 10,
                         FREQUENCY, false, nullptr, 0, leaves);
                         FREQUENCY, false, false, "", nullptr, 0, leaves, exclude_leaves);
        ASSERT_EQ(1, leaves.size());
        ASSERT_STREQ(key.c_str(), (const char *) leaves.at(0)->key);
    }
@@ -970,14 +993,17 @@ TEST(ArtTest, test_art_search_ill_like_tokens) {
        std::make_pair("ice", 2),
    };

    std::string key = "input";

    for (const auto &key : keys) {
        art_leaf* l = (art_leaf *) art_search(&t, (const unsigned char *)key.c_str(), key.size()+1);
        ASSERT_FALSE(l == nullptr);
        EXPECT_EQ(1, posting_t::num_ids(l->values));

        std::vector<art_leaf *> leaves;
        exclude_leaves.clear();
        art_fuzzy_search(&t, (const unsigned char*)key.c_str(), key.size(), 0, 0, 10,
                         FREQUENCY, true, nullptr, 0, leaves);
                         FREQUENCY, true, false, "", nullptr, 0, leaves, exclude_leaves);

        if(key_to_count.count(key) != 0) {
            ASSERT_EQ(key_to_count[key], leaves.size());
@@ -987,10 +1013,14 @@ TEST(ArtTest, test_art_search_ill_like_tokens) {
        }

        leaves.clear();
        exclude_leaves.clear();

        // non prefix
        art_fuzzy_search(&t, (const unsigned char*)key.c_str(), key.size()+1, 0, 0, 10,
                         FREQUENCY, false, nullptr, 0, leaves);
                         FREQUENCY, false, false, "", nullptr, 0, leaves, exclude_leaves);
        if(leaves.size() != 1) {
            LOG(INFO) << key;
        }
        ASSERT_EQ(1, leaves.size());
        ASSERT_STREQ(key.c_str(), (const char *) leaves.at(0)->key);
    }
@@ -1022,8 +1052,9 @@ TEST(ArtTest, test_art_search_ill_like_tokens2) {
        EXPECT_EQ(1, posting_t::num_ids(l->values));

        std::vector<art_leaf *> leaves;
        exclude_leaves.clear();
        art_fuzzy_search(&t, (const unsigned char*)key.c_str(), key.size(), 0, 0, 10,
                         FREQUENCY, true, nullptr, 0, leaves);
                         FREQUENCY, true, false, "", nullptr, 0, leaves, exclude_leaves);

        if(key == "illustration") {
            ASSERT_EQ(2, leaves.size());
@@ -1033,10 +1064,11 @@ TEST(ArtTest, test_art_search_ill_like_tokens2) {
        }

        leaves.clear();
        exclude_leaves.clear();

        // non prefix
        art_fuzzy_search(&t, (const unsigned char*)key.c_str(), key.size() + 1, 0, 0, 10,
                         FREQUENCY, false, nullptr, 0, leaves);
                         FREQUENCY, false, false, "", nullptr, 0, leaves, exclude_leaves);
        ASSERT_EQ(1, leaves.size());
        ASSERT_STREQ(key.c_str(), (const char *) leaves.at(0)->key);
    }
@@ -1059,12 +1091,12 @@ TEST(ArtTest, test_art_search_roche_chews) {
    std::string term = "chews";
    std::vector<art_leaf *> leaves;
    art_fuzzy_search(&t, (const unsigned char*)term.c_str(), term.size(), 0, 2, 10,
                     FREQUENCY, true, nullptr, 0, leaves);
                     FREQUENCY, true, false, "", nullptr, 0, leaves, exclude_leaves);

    ASSERT_EQ(0, leaves.size());

    art_fuzzy_search(&t, (const unsigned char*)keys[0].c_str(), keys[0].size() + 1, 0, 0, 10,
                     FREQUENCY, false, nullptr, 0, leaves);
                     FREQUENCY, false, false, "", nullptr, 0, leaves, exclude_leaves);

    ASSERT_EQ(1, leaves.size());
@@ -1091,14 +1123,15 @@ TEST(ArtTest, test_art_search_raspberry) {

    std::string q_raspberries = "raspberries";
    art_fuzzy_search(&t, (const unsigned char*)q_raspberries.c_str(), q_raspberries.size(), 0, 2, 10,
                     FREQUENCY, true, nullptr, 0, leaves);
                     FREQUENCY, true, false, "", nullptr, 0, leaves, exclude_leaves);
    ASSERT_EQ(2, leaves.size());

    leaves.clear();
    exclude_leaves.clear();

    std::string q_raspberry = "raspberry";
    art_fuzzy_search(&t, (const unsigned char*)q_raspberry.c_str(), q_raspberry.size(), 0, 2, 10,
                     FREQUENCY, true, nullptr, 0, leaves);
                     FREQUENCY, true, false, "", nullptr, 0, leaves, exclude_leaves);
    ASSERT_EQ(2, leaves.size());

    res = art_tree_destroy(&t);
@@ -1124,13 +1157,16 @@ TEST(ArtTest, test_art_search_highliving) {

    std::string query = "higghliving";
    art_fuzzy_search(&t, (const unsigned char*)query.c_str(), query.size() + 1, 0, 1, 10,
                     FREQUENCY, false, nullptr, 0, leaves);
                     FREQUENCY, false, false, "", nullptr, 0, leaves, exclude_leaves);
    ASSERT_EQ(1, leaves.size());

    leaves.clear();
    exclude_leaves.clear();
    exclude_leaves.clear();
    exclude_leaves.clear();

    art_fuzzy_search(&t, (const unsigned char*)query.c_str(), query.size(), 0, 2, 10,
                     FREQUENCY, true, nullptr, 0, leaves);
                     FREQUENCY, true, false, "", nullptr, 0, leaves, exclude_leaves);
    ASSERT_EQ(1, leaves.size());

    res = art_tree_destroy(&t);
@@ -203,6 +203,8 @@ TEST_F(CollectionSpecificTest, ExactSingleFieldMatch) {
                                 spp::sparse_hash_set<std::string>(), spp::sparse_hash_set<std::string>(), 10, "", 30,
                                 4, "title", 10).get();

    LOG(INFO) << results;

    ASSERT_EQ(2, results["hits"].size());
    ASSERT_EQ("0", results["hits"][0]["document"]["id"].get<std::string>());
    ASSERT_EQ("1", results["hits"][1]["document"]["id"].get<std::string>());
@@ -476,7 +476,7 @@ TEST_F(CollectionTest, TextContainingAnActualTypo) {
    ASSERT_EQ(4, results["hits"].size());
    ASSERT_EQ(11, results["found"].get<uint32_t>());

    std::vector<std::string> ids = {"19", "22", "6", "13"};
    std::vector<std::string> ids = {"19", "6", "21", "22"};

    for(size_t i = 0; i < results["hits"].size(); i++) {
        nlohmann::json result = results["hits"].at(i);