Exhaustive token searching with filter_ids

This commit is contained in:
Jason Bosco 2021-01-22 20:06:18 -08:00
parent bcea70ebfd
commit a08fb7738f
9 changed files with 173 additions and 38 deletions

View File

@ -65,6 +65,7 @@ FILE(GLOB SRC_FILES src/*.cpp)
FILE(GLOB TEST_FILES test/*.cpp)
include_directories(include)
include_directories(/usr/local/include)
include_directories(${OPENSSL_INCLUDE_DIR})
include_directories(${CURL_INCLUDE_DIR})
include_directories(${ICU_INCLUDE_DIRS})
@ -77,6 +78,7 @@ include_directories(${DEP_ROOT_DIR}/${BRPC_NAME}/include)
include_directories(${DEP_ROOT_DIR}/${BRAFT_NAME}/include)
include_directories(${DEP_ROOT_DIR}/${JEMALLOC_NAME}/include/jemalloc)
link_directories(/usr/local/lib)
link_directories(${DEP_ROOT_DIR}/${GTEST_NAME}/googletest/build)
link_directories(${DEP_ROOT_DIR}/${FOR_NAME})
link_directories(${DEP_ROOT_DIR}/${H2O_NAME}/build)

View File

@ -245,7 +245,9 @@ int art_iter_prefix(art_tree *t, const unsigned char *prefix, int prefix_len, ar
* Returns leaves that match a given string within a fuzzy distance of max_cost.
*/
int art_fuzzy_search(art_tree *t, const unsigned char *term, const int term_len, const int min_cost, const int max_cost,
const int max_words, const token_ordering token_order, const bool prefix, std::vector<art_leaf *> &results);
const int max_words, const token_ordering token_order, const bool prefix,
const uint32_t *filter_ids, size_t filter_ids_length,
std::vector<art_leaf *> &results);
int art_topk_iter(const art_node *root, token_ordering token_order, size_t max_results,
std::vector<art_leaf *> &results);

View File

@ -35,6 +35,10 @@ private:
int low_index, int high_index, uint32_t base, uint32_t bits,
uint32_t *indices);
void binary_count_indices(const uint32_t *values, int low_vindex, int high_vindex,
int low_index, int high_index, uint32_t base, uint32_t bits,
size_t& num_found);
public:
void load(const uint32_t *sorted_array, const uint32_t array_length);
@ -45,7 +49,9 @@ public:
uint32_t indexOf(uint32_t value);
void indexOf(const uint32_t *values, const size_t values_len, uint32_t* indices);
void indexOf(const uint32_t *values, size_t values_len, uint32_t* indices);
size_t numFoundOf(const uint32_t *values, const size_t values_len);
// returns false if malloc fails
size_t append(uint32_t value);

View File

@ -904,7 +904,8 @@ void* art_delete(art_tree *t, const unsigned char *key, int key_len) {
}*/
int art_topk_iter(const art_node *root, token_ordering token_order, size_t max_results,
std::vector<art_leaf *> &results) {
const uint32_t* filter_ids, size_t filter_ids_length,
std::vector<art_leaf *> &results) {
printf("INSIDE art_topk_iter: root->type: %d\n", root->type);
std::priority_queue<const art_node *, std::vector<const art_node *>,
@ -924,7 +925,17 @@ int art_topk_iter(const art_node *root, token_ordering token_order, size_t max_r
if (!n) continue;
if (IS_LEAF(n)) {
art_leaf *l = (art_leaf *) LEAF_RAW(n);
results.push_back(l);
if(filter_ids_length == 0) {
results.push_back(l);
} else {
// we will push leaf only if filter matches with leaf IDs
size_t found_len = l->values->ids.numFoundOf(filter_ids, filter_ids_length);
if(found_len != 0) {
results.push_back(l);
}
}
continue;
}
@ -1383,6 +1394,7 @@ static void art_fuzzy_recurse(unsigned char p, unsigned char c, const art_node *
*/
int art_fuzzy_search(art_tree *t, const unsigned char *term, const int term_len, const int min_cost, const int max_cost,
const int max_words, const token_ordering token_order, const bool prefix,
const uint32_t *filter_ids, size_t filter_ids_length,
std::vector<art_leaf *> &results) {
std::vector<const art_node*> nodes;
@ -1412,7 +1424,7 @@ int art_fuzzy_search(art_tree *t, const unsigned char *term, const int term_len,
//begin = std::chrono::high_resolution_clock::now();
for(auto node: nodes) {
art_topk_iter(node, token_order, max_words, results);
art_topk_iter(node, token_order, max_words, filter_ids, filter_ids_length, results);
}
if(token_order == FREQUENCY) {

View File

@ -697,7 +697,7 @@ void Index::do_facets(std::vector<facet> & facets, facet_query_t & facet_query,
art_fuzzy_search(t, (const unsigned char *) q.c_str(),
q.size(), 0, bounded_cost, 10000,
token_ordering::MAX_SCORE, prefix_search, leaves);
token_ordering::MAX_SCORE, prefix_search, nullptr, 0, leaves);
for (size_t leaf_index = 0; leaf_index < leaves.size(); leaf_index++) {
const auto &leaf = leaves[leaf_index];
@ -1248,7 +1248,7 @@ void Index::collate_included_ids(const std::vector<std::string>& q_included_toke
std::vector<art_leaf*> leaves;
art_fuzzy_search(search_index.at(field), (const unsigned char *) token.c_str(), token_len,
0, 0, 1, token_ordering::MAX_SCORE, false, leaves);
0, 0, 1, token_ordering::MAX_SCORE, false, nullptr, 0, leaves);
if(!leaves.empty()) {
override_query.push_back(leaves[0]);
@ -1550,7 +1550,7 @@ void Index::search(Option<uint32_t> & outcome,
const bool prefix_search = prefix && (token_index == q_include_tokens.size()-1);
const size_t token_len = prefix_search ? (int) token.length() : (int) token.length() + 1;
art_fuzzy_search(search_index.at(field), (const unsigned char *) token.c_str(), token_len,
0, 0, 1, token_order, prefix_search, leaves);
0, 0, 1, token_order, prefix_search, nullptr, 0, leaves);
if(leaves.empty()) {
continue;
@ -1701,7 +1701,8 @@ void Index::search_field(const uint8_t & field_id,
// If this is a prefix search, look for more candidates and do a union of those document IDs
const int max_candidates = prefix_search ? 10 : 3;
art_fuzzy_search(search_index.at(field), (const unsigned char *) token.c_str(), token_len,
costs[token_index], costs[token_index], max_candidates, token_order, prefix_search, leaves);
costs[token_index], costs[token_index], max_candidates, token_order, prefix_search,
filter_ids, filter_ids_length, leaves);
if(!leaves.empty()) {
token_cost_cache.emplace(token_cost_hash, leaves);

View File

@ -174,13 +174,13 @@ void sorted_array::indexOf(const uint32_t *values, const size_t values_len, uint
do {
head++;
low_index = lower_bound_search_bits(in+METADATA_OVERHEAD, 0, length-1, base, bits, values[head], &actual_value);
} while(actual_value != values[head]);
} while(head < int(values_len - 1) && actual_value != values[head]);
int tail = values_len;
do {
tail--;
high_index = lower_bound_search_bits(in+METADATA_OVERHEAD, 0, length-1, base, bits, values[tail], &actual_value);
} while(actual_value != values[tail]);
} while(tail > 0 && actual_value != values[tail]);
for(int i = 0; i < head; i++) {
indices[i] = length;
@ -237,4 +237,55 @@ void sorted_array::remove_values(uint32_t *sorted_values, uint32_t sorted_values
load(new_array, new_index);
delete[] curr_array;
delete[] new_array;
}
}
/**
 * Counts how many entries of `values` are located in this array.
 *
 * @param values      probe values to look up.
 *                    NOTE(review): presumably sorted ascending, since both the bound trimming below and
 *                    the recursive halving in binary_count_indices depend on ordering -- confirm with callers.
 * @param values_len  number of entries in `values`.
 * @return number of probe values that were matched; 0 when either this array or `values` is empty.
 */
size_t sorted_array::numFoundOf(const uint32_t *values, const size_t values_len) {
    if(values_len == 0 || length == 0) {
        return 0;
    }

    // `base` is read from the first four bytes of the buffer and `bits` from the byte at offset 4
    const uint32_t base = *(uint32_t *)(in + 0);
    const uint32_t bits = *(in + 4);

    const int last_vindex = int(values_len - 1);
    uint32_t matched_value = 0;
    uint32_t first_index = 0;
    uint32_t last_index = 0;

    // walk forward until a probe value is matched (or the probes run out): lower end of the search space
    int head = 0;
    for(;; head++) {
        first_index = lower_bound_search_bits(in+METADATA_OVERHEAD, 0, length-1, base, bits,
                                              values[head], &matched_value);
        if(matched_value == values[head] || head >= last_vindex) {
            break;
        }
    }

    // walk backward until a probe value is matched (or the probes run out): upper end of the search space
    int tail = last_vindex;
    for(;; tail--) {
        last_index = lower_bound_search_bits(in+METADATA_OVERHEAD, 0, length-1, base, bits,
                                             values[tail], &matched_value);
        if(matched_value == values[tail] || tail <= 0) {
            break;
        }
    }

    // count every probe value inside the trimmed bounds; when nothing matched, head ends up past tail
    // (or on the same unmatched probe) and the count stays at zero
    size_t num_found = 0;
    binary_count_indices(values, head, tail, first_index, last_index, base, bits, num_found);
    return num_found;
}
/**
 * Recursively counts the entries of values[low_vindex..high_vindex] that occur among the array positions
 * [low_index..high_index], incrementing `num_found` once per matched entry.
 *
 * The middle probe value is looked up first; the probes on either side of it are then searched only in the
 * part of the array that can still contain them.
 *
 * NOTE(review): this relies on lower_bound_search_bits returning the position of the first element that is
 * >= the probe within the given range (falling back to the end of the range) and writing the element found
 * there to its last argument. That helper is defined outside this view -- confirm.
 *
 * @param values       probe values (presumably sorted ascending and free of duplicates -- confirm)
 * @param low_vindex   first probe position to consider (inclusive)
 * @param high_vindex  last probe position to consider (inclusive)
 * @param low_index    first array position to search (inclusive)
 * @param high_index   last array position to search (inclusive)
 * @param base         base value forwarded to lower_bound_search_bits
 * @param bits         bit width forwarded to lower_bound_search_bits
 * @param num_found    running count of matches, updated in place
 */
void sorted_array::binary_count_indices(const uint32_t *values, int low_vindex, int high_vindex, int low_index,
                                        int high_index, uint32_t base, uint32_t bits, size_t& num_found) {
    if(high_vindex < low_vindex || high_index < low_index) {
        return ;
    }

    // signed arithmetic throughout, so that "pivot - 1" and "position - 1" become -1 instead of wrapping
    const int pivot_vindex = low_vindex + (high_vindex - low_vindex) / 2;

    uint32_t actual_value = 0;
    const int in_index = int(lower_bound_search_bits(in+METADATA_OVERHEAD, low_index, high_index, base, bits,
                                                     values[pivot_vindex], &actual_value));

    int left_high_index = in_index - 1;
    int right_low_index = in_index + 1;

    if(actual_value == values[pivot_vindex]) {
        num_found++;
    } else {
        // On a miss the element at in_index belongs to some other value, which may well be one of the
        // neighbouring probes. Excluding that position from both halves (as was done before) dropped such
        // matches, e.g. array {1, 5, 10} probed with {1, 3, 5, 10} counted only 1. Keep it searchable.
        left_high_index = in_index;
        right_low_index = in_index;
    }

    binary_count_indices(values, low_vindex, pivot_vindex-1, low_index, left_high_index, base, bits, num_found);
    binary_count_indices(values, pivot_vindex+1, high_vindex, right_low_index, high_index, base, bits, num_found);
}

View File

@ -588,22 +588,22 @@ TEST(ArtTest, test_art_fuzzy_search_single_leaf) {
EXPECT_EQ(1, l->values->ids.at(0));
std::vector<art_leaf*> leaves;
art_fuzzy_search(&t, (const unsigned char *) implement_key, strlen(implement_key) + 1, 0, 0, 10, FREQUENCY, false, leaves);
art_fuzzy_search(&t, (const unsigned char *) implement_key, strlen(implement_key) + 1, 0, 0, 10, FREQUENCY, false, nullptr, 0, leaves);
ASSERT_EQ(1, leaves.size());
const char* implement_key_typo1 = "implment";
const char* implement_key_typo2 = "implwnent";
leaves.clear();
art_fuzzy_search(&t, (const unsigned char *) implement_key_typo1, strlen(implement_key_typo1) + 1, 0, 0, 10, FREQUENCY, false, leaves);
art_fuzzy_search(&t, (const unsigned char *) implement_key_typo1, strlen(implement_key_typo1) + 1, 0, 0, 10, FREQUENCY, false, nullptr, 0, leaves);
ASSERT_EQ(0, leaves.size());
leaves.clear();
art_fuzzy_search(&t, (const unsigned char *) implement_key_typo1, strlen(implement_key_typo1) + 1, 0, 1, 10, FREQUENCY, false, leaves);
art_fuzzy_search(&t, (const unsigned char *) implement_key_typo1, strlen(implement_key_typo1) + 1, 0, 1, 10, FREQUENCY, false, nullptr, 0, leaves);
ASSERT_EQ(1, leaves.size());
leaves.clear();
art_fuzzy_search(&t, (const unsigned char *) implement_key_typo2, strlen(implement_key_typo2) + 1, 0, 2, 10, FREQUENCY, false, leaves);
art_fuzzy_search(&t, (const unsigned char *) implement_key_typo2, strlen(implement_key_typo2) + 1, 0, 2, 10, FREQUENCY, false, nullptr, 0, leaves);
ASSERT_EQ(1, leaves.size());
res = art_tree_destroy(&t);
@ -623,7 +623,7 @@ TEST(ArtTest, test_art_fuzzy_search_single_leaf_prefix) {
EXPECT_EQ(1, l->values->ids.at(0));
std::vector<art_leaf*> leaves;
art_fuzzy_search(&t, (const unsigned char *) "aplication", strlen(key), 0, 1, 10, FREQUENCY, true, leaves);
art_fuzzy_search(&t, (const unsigned char *) "aplication", strlen(key), 0, 1, 10, FREQUENCY, true, nullptr, 0, leaves);
ASSERT_EQ(1, leaves.size());
res = art_tree_destroy(&t);
@ -651,48 +651,48 @@ TEST(ArtTest, test_art_fuzzy_search) {
std::vector<art_leaf*> leaves;
// transpose
art_fuzzy_search(&t, (const unsigned char *) "zymosthneic", strlen("zymosthneic") + 1, 0, 1, 10, FREQUENCY, false, leaves);
art_fuzzy_search(&t, (const unsigned char *) "zymosthneic", strlen("zymosthneic") + 1, 0, 1, 10, FREQUENCY, false, nullptr, 0, leaves);
ASSERT_EQ(1, leaves.size());
ASSERT_STREQ("zymosthenic", (const char *)leaves.at(0)->key);
// transpose + missing
leaves.clear();
art_fuzzy_search(&t, (const unsigned char *) "dacrcyystlgia", strlen("dacrcyystlgia") + 1, 0, 2, 10, FREQUENCY, false, leaves);
art_fuzzy_search(&t, (const unsigned char *) "dacrcyystlgia", strlen("dacrcyystlgia") + 1, 0, 2, 10, FREQUENCY, false, nullptr, 0, leaves);
ASSERT_EQ(1, leaves.size());
ASSERT_STREQ("dacrycystalgia", (const char *)leaves.at(0)->key);
leaves.clear();
art_fuzzy_search(&t, (const unsigned char *) "dacrcyystlgia", strlen("dacrcyystlgia") + 1, 1, 2, 10, FREQUENCY, false, leaves);
art_fuzzy_search(&t, (const unsigned char *) "dacrcyystlgia", strlen("dacrcyystlgia") + 1, 1, 2, 10, FREQUENCY, false, nullptr, 0, leaves);
ASSERT_EQ(1, leaves.size());
ASSERT_STREQ("dacrycystalgia", (const char *)leaves.at(0)->key);
// missing char
leaves.clear();
art_fuzzy_search(&t, (const unsigned char *) "gaberlunze", strlen("gaberlunze") + 1, 0, 1, 10, FREQUENCY, false, leaves);
art_fuzzy_search(&t, (const unsigned char *) "gaberlunze", strlen("gaberlunze") + 1, 0, 1, 10, FREQUENCY, false, nullptr, 0, leaves);
ASSERT_EQ(1, leaves.size());
ASSERT_STREQ("gaberlunzie", (const char *)leaves.at(0)->key);
// extra char
leaves.clear();
art_fuzzy_search(&t, (const unsigned char *) "higghliving", strlen("higghliving") + 1, 0, 1, 10, FREQUENCY, false, leaves);
art_fuzzy_search(&t, (const unsigned char *) "higghliving", strlen("higghliving") + 1, 0, 1, 10, FREQUENCY, false, nullptr, 0, leaves);
ASSERT_EQ(1, leaves.size());
ASSERT_STREQ("highliving", (const char *)leaves.at(0)->key);
// substituted char
leaves.clear();
art_fuzzy_search(&t, (const unsigned char *) "eacemiferous", strlen("eacemiferous") + 1, 0, 1, 10, FREQUENCY, false, leaves);
art_fuzzy_search(&t, (const unsigned char *) "eacemiferous", strlen("eacemiferous") + 1, 0, 1, 10, FREQUENCY, false, nullptr, 0, leaves);
ASSERT_EQ(1, leaves.size());
ASSERT_STREQ("racemiferous", (const char *)leaves.at(0)->key);
// missing char + extra char
leaves.clear();
art_fuzzy_search(&t, (const unsigned char *) "Sarbruckken", strlen("Sarbruckken") + 1, 0, 2, 10, FREQUENCY, false, leaves);
art_fuzzy_search(&t, (const unsigned char *) "Sarbruckken", strlen("Sarbruckken") + 1, 0, 2, 10, FREQUENCY, false, nullptr, 0, leaves);
ASSERT_EQ(1, leaves.size());
ASSERT_STREQ("Saarbrucken", (const char *)leaves.at(0)->key);
// multiple matching results
leaves.clear();
art_fuzzy_search(&t, (const unsigned char *) "hown", strlen("hown") + 1, 0, 1, 10, FREQUENCY, false, leaves);
art_fuzzy_search(&t, (const unsigned char *) "hown", strlen("hown") + 1, 0, 1, 10, FREQUENCY, false, nullptr, 0, leaves);
ASSERT_EQ(10, leaves.size());
std::vector<const char*> words = {"town", "sown", "shown", "own", "mown", "lown", "howl", "howk", "howe", "how"};
@ -702,23 +702,23 @@ TEST(ArtTest, test_art_fuzzy_search) {
// fuzzy prefix search
leaves.clear();
art_fuzzy_search(&t, (const unsigned char *) "lionhear", strlen("lionhear"), 0, 0, 10, FREQUENCY, true, leaves);
art_fuzzy_search(&t, (const unsigned char *) "lionhear", strlen("lionhear"), 0, 0, 10, FREQUENCY, true, nullptr, 0, leaves);
ASSERT_EQ(3, leaves.size());
leaves.clear();
art_fuzzy_search(&t, (const unsigned char *) "lineage", strlen("lineage"), 0, 0, 10, FREQUENCY, true, leaves);
art_fuzzy_search(&t, (const unsigned char *) "lineage", strlen("lineage"), 0, 0, 10, FREQUENCY, true, nullptr, 0, leaves);
ASSERT_EQ(2, leaves.size());
leaves.clear();
art_fuzzy_search(&t, (const unsigned char *) "liq", strlen("liq"), 0, 0, 50, FREQUENCY, true, leaves);
art_fuzzy_search(&t, (const unsigned char *) "liq", strlen("liq"), 0, 0, 50, FREQUENCY, true, nullptr, 0, leaves);
ASSERT_EQ(39, leaves.size());
leaves.clear();
art_fuzzy_search(&t, (const unsigned char *) "antitraditian", strlen("antitraditian"), 0, 1, 10, FREQUENCY, true, leaves);
art_fuzzy_search(&t, (const unsigned char *) "antitraditian", strlen("antitraditian"), 0, 1, 10, FREQUENCY, true, nullptr, 0, leaves);
ASSERT_EQ(1, leaves.size());
leaves.clear();
art_fuzzy_search(&t, (const unsigned char *) "antisocao", strlen("antisocao"), 0, 2, 10, FREQUENCY, true, leaves);
art_fuzzy_search(&t, (const unsigned char *) "antisocao", strlen("antisocao"), 0, 2, 10, FREQUENCY, true, nullptr, 0, leaves);
ASSERT_EQ(10, leaves.size());
res = art_tree_destroy(&t);
@ -744,7 +744,7 @@ TEST(ArtTest, test_art_fuzzy_search_unicode_chars) {
EXPECT_EQ(1, l->values->ids.at(0));
std::vector<art_leaf*> leaves;
art_fuzzy_search(&t, (unsigned char *)key, strlen(key), 0, 0, 10, FREQUENCY, true, leaves);
art_fuzzy_search(&t, (unsigned char *)key, strlen(key), 0, 0, 10, FREQUENCY, true, nullptr, 0, leaves);
ASSERT_EQ(1, leaves.size());
}
@ -784,7 +784,7 @@ TEST(ArtTest, test_art_search_sku_like_tokens) {
for (const auto &key : keys) {
std::vector<art_leaf *> leaves;
art_fuzzy_search(&t, (const unsigned char*)key.c_str(), key.size()+1, 0, 0, 10,
FREQUENCY, true, leaves);
FREQUENCY, true, nullptr, 0, leaves);
ASSERT_EQ(1, leaves.size());
ASSERT_STREQ(key.c_str(), (const char *) leaves.at(0)->key);
@ -792,7 +792,7 @@ TEST(ArtTest, test_art_search_sku_like_tokens) {
// non prefix
art_fuzzy_search(&t, (const unsigned char*)key.c_str(), key.size()+1, 0, 0, 10,
FREQUENCY, false, leaves);
FREQUENCY, false, nullptr, 0, leaves);
ASSERT_EQ(1, leaves.size());
ASSERT_STREQ(key.c_str(), (const char *) leaves.at(0)->key);
}
@ -830,7 +830,7 @@ TEST(ArtTest, test_art_search_ill_like_tokens) {
std::vector<art_leaf *> leaves;
art_fuzzy_search(&t, (const unsigned char*)key.c_str(), key.size()+1, 0, 0, 10,
FREQUENCY, true, leaves);
FREQUENCY, true, nullptr, 0, leaves);
ASSERT_EQ(1, leaves.size());
ASSERT_STREQ(key.c_str(), (const char *) leaves.at(0)->key);
@ -839,7 +839,7 @@ TEST(ArtTest, test_art_search_ill_like_tokens) {
// non prefix
art_fuzzy_search(&t, (const unsigned char*)key.c_str(), key.size()+1, 0, 0, 10,
FREQUENCY, false, leaves);
FREQUENCY, false, nullptr, 0, leaves);
ASSERT_EQ(1, leaves.size());
ASSERT_STREQ(key.c_str(), (const char *) leaves.at(0)->key);
}
@ -873,7 +873,7 @@ TEST(ArtTest, test_art_search_ill_like_tokens2) {
std::vector<art_leaf *> leaves;
art_fuzzy_search(&t, (const unsigned char*)key.c_str(), key.size()+1, 0, 0, 10,
FREQUENCY, true, leaves);
FREQUENCY, true, nullptr, 0, leaves);
ASSERT_EQ(1, leaves.size());
ASSERT_STREQ(key.c_str(), (const char *) leaves.at(0)->key);
@ -882,7 +882,7 @@ TEST(ArtTest, test_art_search_ill_like_tokens2) {
// non prefix
art_fuzzy_search(&t, (const unsigned char*)key.c_str(), key.size()+1, 0, 0, 10,
FREQUENCY, false, leaves);
FREQUENCY, false, nullptr, 0, leaves);
ASSERT_EQ(1, leaves.size());
ASSERT_STREQ(key.c_str(), (const char *) leaves.at(0)->key);
}

View File

@ -675,4 +675,49 @@ TEST_F(CollectionFilteringTest, ComparatorsOnMultiValuedNumericalField) {
}
collectionManager.drop_collection("coll_array_fields");
}
// Searches for the single-letter query "e" across 25 titles that all begin with 'e', combined with the
// filter "points: 23". Exactly one hit is expected: the document whose id is "23".
TEST_F(CollectionFilteringTest, FilteringWithPrefixSearch) {
    std::vector<field> fields = {field("title", field_types::STRING, false),
                                 field("points", field_types::INT32, false),};

    // reuse the collection when it already exists, otherwise create it
    Collection *coll1 = collectionManager.get_collection("coll1");
    if(coll1 == nullptr) {
        coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get();
    }

    // one title per document ("end" is listed twice)
    const std::vector<std::string> titles = {
        "elephant", "emerald", "effective", "esther", "eagle",
        "empty", "elite", "example", "elated", "end",
        "ear", "eager", "earmark", "envelop", "excess",
        "ember", "earth", "envoy", "emerge", "emigrant",
        "envision", "envy", "envisage", "executive", "end",
    };

    // the position in the list serves both as the document id and as its `points` value
    for(size_t doc_index = 0; doc_index < titles.size(); doc_index++) {
        nlohmann::json doc;
        doc["id"] = std::to_string(doc_index);
        doc["title"] = titles[doc_index];
        doc["points"] = doc_index;

        ASSERT_TRUE(coll1->add(doc.dump()).ok());
    }

    // every title matches the query on its own, so the filter alone must narrow the result to one document
    auto search_op = coll1->search("e",
                                   {"title"}, "points: 23",
                                   {}, {}, 0, 10, 1, FREQUENCY, true);

    auto response = search_op.get();
    LOG(INFO) << response;

    ASSERT_EQ(1, response["found"].get<size_t>());
    ASSERT_EQ(1, response["hits"].size());
    ASSERT_STREQ("23", response["hits"][0]["document"]["id"].get<std::string>().c_str());

    collectionManager.drop_collection("coll1");
}

View File

@ -2,7 +2,6 @@
#include "sorted_array.h"
#include <vector>
#include <fstream>
#include "string_utils.h"
TEST(SortedArrayTest, Append) {
sorted_array arr;
@ -239,4 +238,21 @@ TEST(SortedArrayTest, BulkIndexOf) {
auto search_id = search_ids.at(i);
ASSERT_EQ(ids.indexOf(search_id), results[i]);
}
// search with IDs that don't exist
search_ids = {100};
delete [] results;
results = new uint32_t[search_ids.size()];
ids.indexOf(&search_ids[0], search_ids.size(), results);
ASSERT_EQ(562, results[0]);
search_ids = {100, 105};
delete [] results;
results = new uint32_t[search_ids.size()];
ids.indexOf(&search_ids[0], search_ids.size(), results);
ASSERT_EQ(562, results[0]);
ASSERT_EQ(562, results[1]);
}