ART - integer range search.

This commit is contained in:
Kishore Nallan 2016-12-10 13:53:09 +05:30
parent 9cc3e7e5ea
commit 9b0c347334
3 changed files with 413 additions and 10 deletions

View File

@ -239,7 +239,12 @@ int art_iter_prefix(art_tree *t, const unsigned char *prefix, int prefix_len, ar
int art_fuzzy_search(art_tree *t, const unsigned char *term, const int term_len, const int max_cost,
const int max_words, const token_ordering token_order, const bool prefix, std::vector<art_leaf *> &results);
static int topk_iter(const art_node *root, token_ordering token_order, const int max_results, std::vector<art_leaf*> & results);
static int art_topk_iter(const art_node *root, token_ordering token_order, const int max_results,
std::vector<art_leaf *> &results);
void encode_int(uint32_t n, unsigned char* chars);
int art_int_search(art_tree *t, uint32_t value, int compare, std::vector<const art_leaf*> & results);
#ifdef __cplusplus
}

View File

@ -28,10 +28,15 @@
#define microseconds std::chrono::duration_cast<std::chrono::microseconds>
// Outcome of comparing one key byte against the corresponding search byte (see matches()):
//   CONTINUE - the bytes are equal, so the comparison must carry on with the next byte.
//   ABORT    - the byte rules this key/subtree out; stop exploring it.
//   ITERATE  - the byte already satisfies the range bound; callers collect the whole
//              subtree (or the leaf) without comparing any further bytes.
enum recurse_progress { CONTINUE, ABORT, ITERATE };
static void art_fuzzy_recurse(char p, char c, const art_node *n, int depth, const unsigned char *term,
const int term_len, const int* irow, const int* jrow, const int max_cost,
const bool prefix, std::vector<const art_node *> &results);
void art_int_fuzzy_recurse(art_node *n, int depth, unsigned char* int_str, int int_str_len,
uint32_t compare, std::vector<const art_leaf *> &results);
// Ordering predicate for leaves: returns true when `a` holds strictly more ids than `b`,
// i.e. sorting with it places the leaf with the longer id list first.
bool compare_art_leaf_frequency(const art_leaf *a, const art_leaf *b) {
    const auto a_id_count = a->values->ids.getLength();
    const auto b_id_count = b->values->ids.getLength();
    return b_id_count < a_id_count;
}
@ -883,8 +888,9 @@ static uint32_t get_score(art_node* child) {
return child->max_token_count;
}
static int topk_iter(const art_node *root, token_ordering token_order, const int max_results, std::vector<art_leaf*> & results) {
printf("INSIDE topk_iter: root->type: %d\n", root->type);
static int art_topk_iter(const art_node *root, token_ordering token_order, const int max_results,
std::vector<art_leaf *> &results) {
printf("INSIDE art_topk_iter: root->type: %d\n", root->type);
std::priority_queue<art_node *, std::vector<const art_node *>,
std::function<bool(const art_node*, const art_node*)>> q;
@ -952,7 +958,7 @@ static int topk_iter(const art_node *root, token_ordering token_order, const int
}
}
printf("OUTSIDE topk_iter: results size: %d\n", results.size());
printf("OUTSIDE art_topk_iter: results size: %d\n", results.size());
return 0;
}
@ -1329,7 +1335,7 @@ int art_fuzzy_search(art_tree *t, const unsigned char *term, const int term_len,
begin = std::chrono::high_resolution_clock::now();
for(auto node: nodes) {
topk_iter(node, token_order, max_words, results);
art_topk_iter(node, token_order, max_words, results);
}
if(token_order == FREQUENCY) {
@ -1339,6 +1345,215 @@ int art_fuzzy_search(art_tree *t, const unsigned char *term, const int term_len,
}
time_micro = microseconds(std::chrono::high_resolution_clock::now() - begin).count();
std::cout << "Time taken for topk_iter: " << time_micro << "us" << std::endl;
std::cout << "Time taken for art_topk_iter: " << time_micro << "us" << std::endl;
return 0;
}
}
/**
 * Encodes a 32-bit unsigned integer as a fixed-width, 9-byte key.
 *
 * The value is split into eight 4-bit nibbles, most significant nibble first, and each
 * nibble (0..15) is stored as one byte. Because the most significant digit comes first,
 * byte-wise comparison of two encoded keys agrees with numeric comparison of the values.
 *
 * @param n      value to encode
 * @param chars  output buffer; must have room for at least 9 bytes
 */
void encode_int(uint32_t n, unsigned char* chars) {
    const int nibble_count = 8;

    for (int pos = 0; pos < nibble_count; ++pos) {
        // Position 0 takes bits 31..28, position 7 takes bits 3..0.
        const int shift = 4 * (nibble_count - 1 - pos);
        chars[pos] = static_cast<unsigned char>((n >> shift) & 0x0F);
    }

    // Terminate the key with a byte that can never appear among the nibbles: an inserted key
    // must not be a prefix of another key in this ART implementation. 46 ('.') is used rather
    // than '\0', because zero is itself a valid nibble value in the encoded key.
    chars[nibble_count] = 46;
}
/**
 * Compares one byte of a stored key (`a`) against the corresponding byte of the search key (`b`)
 * and tells the caller how to proceed.
 *
 * @param a        byte taken from the tree (child key byte, node prefix byte or leaf key byte)
 * @param b        byte of the encoded search value at the same position
 * @param compare  -1 for "less than or equal", 0 for "equal", 1 for "greater than or equal"
 * @return CONTINUE when the bytes are equal (the decision depends on later bytes),
 *         ITERATE when `a` already lies strictly on the requested side of `b`,
 *         ABORT otherwise.
 *
 * Calls abort() for any other value of `compare`.
 */
recurse_progress matches(char a, char b, int compare) {
    // Keys are stored as unsigned bytes; compare them as such so that bytes >= 0x80 are ordered
    // correctly even on platforms where plain `char` is signed.
    const unsigned char key_byte = static_cast<unsigned char>(a);
    const unsigned char search_byte = static_cast<unsigned char>(b);

    // Every case returns explicitly. Previously the `a > b` path of `case -1` fell through into
    // `case 0` and produced ABORT only as a side effect of that case's logic.
    switch(compare) {
        case -1:
            if(key_byte == search_byte) return CONTINUE;
            return (key_byte < search_byte) ? ITERATE : ABORT;
        case 0:
            return (key_byte == search_byte) ? CONTINUE : ABORT;
        case 1:
            if(key_byte == search_byte) return CONTINUE;
            return (key_byte > search_byte) ? ITERATE : ABORT;
        default:
            abort();
    }
}
static void art_iter(const art_node *n, std::vector<const art_leaf *> &results) {
// Handle base cases
if (!n) return ;
if (IS_LEAF(n)) {
art_leaf *l = (art_leaf *) LEAF_RAW(n);
results.push_back(l);
return ;
}
int idx, res;
switch (n->type) {
case NODE4:
for (int i=0; i < n->num_children; i++) {
art_iter(((art_node4 *) n)->children[i], results);
}
break;
case NODE16:
for (int i=0; i < n->num_children; i++) {
art_iter(((art_node16 *) n)->children[i], results);
}
break;
case NODE48:
for (int i=0; i < 256; i++) {
idx = ((art_node48*)n)->keys[i];
if (!idx) continue;
art_iter(((art_node48 *) n)->children[idx - 1], results);
}
break;
case NODE256:
for (int i=0; i < 256; i++) {
if (!((art_node256*)n)->children[i]) continue;
art_iter(((art_node256 *) n)->children[i], results);
}
break;
default:
abort();
}
return ;
}
static inline void art_int_fuzzy_children(const art_node *n, int depth, unsigned char* int_str, int int_str_len,
uint32_t compare, std::vector<const art_leaf *> &results) {
char child_char;
art_node* child;
switch (n->type) {
case NODE4:
printf("\nNODE4\n");
for (int i=n->num_children-1; i >= 0; i--) {
child_char = ((art_node4*)n)->keys[i];
printf("\n4!child_char: %c, %d, depth: %d", child_char, child_char, depth);
child = ((art_node4*)n)->children[i];
recurse_progress progress = matches(child_char, int_str[depth], compare);
if(progress == CONTINUE) {
art_int_fuzzy_recurse(child, depth+1, int_str, int_str_len, compare, results);
} else if(progress == ITERATE) {
art_iter(child, results);
}
}
break;
case NODE16:
printf("\nNODE16\n");
for (int i=n->num_children-1; i >= 0; i--) {
child_char = ((art_node16*)n)->keys[i];
printf("\n16!child_char: %c, depth: %d", child_char, depth);
child = ((art_node16*)n)->children[i];
recurse_progress progress = matches(child_char, int_str[depth], compare);
if(progress == CONTINUE) {
art_int_fuzzy_recurse(child, depth+1, int_str, int_str_len, compare, results);
} else if(progress == ITERATE) {
art_iter(child, results);
}
}
break;
case NODE48:
printf("\nNODE48\n");
for (int i=255; i >= 0; i--) {
int ix = ((art_node48*)n)->keys[i];
if (!ix) continue;
child = ((art_node48*)n)->children[ix - 1];
child_char = (char)i;
printf("\n48!child_char: %c, depth: %d, ix: %d", child_char, depth, ix);
recurse_progress progress = matches(child_char, int_str[depth], compare);
if(progress == CONTINUE) {
art_int_fuzzy_recurse(child, depth+1, int_str, int_str_len, compare, results);
} else if(progress == ITERATE) {
art_iter(child, results);
}
}
break;
case NODE256:
printf("\nNODE256\n");
for (int i=255; i >= 0; i--) {
if (!((art_node256*)n)->children[i]) continue;
child_char = (char) i;
printf("\n256!child_char: %c, depth: %d", child_char, depth);
child = ((art_node256*)n)->children[i];
recurse_progress progress = matches(child_char, int_str[depth], compare);
if(progress == CONTINUE) {
art_int_fuzzy_recurse(child, depth+1, int_str, int_str_len, compare, results);
} else if(progress == ITERATE) {
art_iter(child, results);
}
}
break;
default:
abort();
}
}
/**
 * Recursive step of the integer range search.
 *
 * Walks the subtree rooted at `n`, comparing stored key bytes against the encoded search key
 * `int_str` starting at position `depth`, and appends every leaf that satisfies the comparison
 * to `results`.
 *
 * @param n            node to examine; a null node contributes nothing
 * @param depth        index of the next search-key byte to compare
 * @param int_str      encoded search key
 * @param int_str_len  number of valid bytes in `int_str`
 * @param compare      comparison mode forwarded to matches() (-1, 0 or 1)
 * @param results      output vector that matching leaves are appended to
 *
 * Emits a debug trace of the node prefix length on stdout.
 */
void art_int_fuzzy_recurse(art_node *n, int depth, unsigned char* int_str, int int_str_len,
                           uint32_t compare, std::vector<const art_leaf*> &results) {
    if (!n) return ;

    if(IS_LEAF(n)) {
        art_leaf *l = (art_leaf *) LEAF_RAW(n);
        const int end_index = min(l->key_len, int_str_len);

        // Check the remaining bytes of the leaf key. ITERATE means the bound is already
        // satisfied, so the remaining bytes are irrelevant and the leaf is accepted.
        for(; depth < end_index; depth++) {
            const recurse_progress progress = matches(l->key[depth], int_str[depth], compare);
            if(progress == ABORT) {
                return;
            }
            if(progress == ITERATE) {
                break;
            }
        }

        results.push_back(l);
        return ;
    }

    const int partial_len = min(MAX_PREFIX_LEN, n->partial_len);
    // Only compare prefix bytes that still have a counterpart in the search key. The bound has
    // to account for `depth`; otherwise int_str[depth+idx] can be read past int_str_len.
    const int end_index = min(partial_len, int_str_len - depth);
    printf("\npartial_len: %d", partial_len);

    for(int idx = 0; idx < end_index; idx++) {
        const recurse_progress progress = matches(n->partial[idx], int_str[depth+idx], compare);
        if(progress == ABORT) {
            return;
        }
        if(progress == ITERATE) {
            // The whole subtree lies on the requested side of the bound.
            art_iter(n, results);
            return;
        }
    }

    // NOTE(review): only the first MAX_PREFIX_LEN prefix bytes are stored in the node, but the
    // full partial_len is skipped here; any prefix bytes beyond that are not compared.
    depth += n->partial_len;

    if(depth >= int_str_len) {
        // The search key is exhausted and every compared byte was equal. The leaf branch above
        // accepts a leaf in that situation, so accept every leaf below this node as well
        // instead of reading int_str[depth] out of bounds.
        art_iter(n, results);
        return;
    }

    art_int_fuzzy_children(n, depth, int_str, int_str_len, compare, results);
}
/**
 * Searches the tree for integer keys (as produced by encode_int) relative to `value`.
 *
 * @param t        tree to search
 * @param value    integer the stored keys are compared against
 * @param compare  -1 for keys <= value, 0 for keys == value, 1 for keys >= value
 * @param results  output vector that matching leaves are appended to (it is not cleared first)
 * @return always 0
 */
int art_int_search(art_tree *t, uint32_t value, int compare, std::vector<const art_leaf*> & results) {
    // 8 nibble bytes plus the terminator byte written by encode_int.
    const int encoded_len = 9;
    unsigned char encoded[encoded_len];

    encode_int(value, encoded);
    art_int_fuzzy_recurse(t->root, 0, encoded, encoded_len, compare, results);

    return 0;
}

View File

@ -2,12 +2,10 @@
#include <inttypes.h>
#include <stdio.h>
#include <string.h>
#include <cmath>
#include <gtest/gtest.h>
#include <art.h>
#include "art.h"
art_document get_document(uint32_t id) {
art_document document;
document.score = (uint16_t) id;
@ -603,6 +601,9 @@ TEST(ArtTest, test_art_fuzzy_search_single_leaf) {
leaves.clear();
art_fuzzy_search(&t, (const unsigned char *) implement_key_typo2, strlen(implement_key_typo2) + 1, 2, 10, FREQUENCY, false, leaves);
ASSERT_EQ(1, leaves.size());
res = art_tree_destroy(&t);
ASSERT_TRUE(res == 0);
}
TEST(ArtTest, test_art_fuzzy_search) {
@ -674,4 +675,186 @@ TEST(ArtTest, test_art_fuzzy_search) {
for(auto leaf_index = 0; leaf_index < leaves.size(); leaf_index++) {
ASSERT_STREQ(words.at(leaf_index), (const char *)leaves.at(leaf_index)->key);
}
res = art_tree_destroy(&t);
ASSERT_TRUE(res == 0);
}
// Verifies encode_int against hand-computed encodings: eight nibbles, most significant first,
// followed by the terminator byte 46 ('.').
TEST(ArtTest, test_encode_int) {
    struct encode_case {
        uint32_t value;
        unsigned char expected[9];
    };

    const encode_case cases[] = {
        // 175 => 0000,0000,0000,0000,0000,0000,1010,1111 + terminator
        {175,     {0, 0, 0, 0, 0, 0, 10, 15, 46}},
        // 0 => 0000,0000,0000,0000,0000,0000,0000,0000 + terminator
        {0,       {0, 0, 0, 0, 0, 0, 0, 0, 46}},
        // 255 => 0000,0000,0000,0000,0000,0000,1111,1111 + terminator
        {255,     {0, 0, 0, 0, 0, 0, 15, 15, 46}},
        // 4531 => 0000,0000,0000,0000,0001,0001,1011,0011 + terminator
        {4531,    {0, 0, 0, 0, 1, 1, 11, 3, 46}},
        // 1200000 => 0000,0000,0001,0010,0100,1111,1000,0000 + terminator
        {1200000, {0, 0, 1, 2, 4, 15, 8, 0, 46}},
    };

    unsigned char chars[9];

    for(const encode_case & expectation: cases) {
        encode_int(expectation.value, chars);
        for(uint32_t i = 0; i < 9; i++) {
            ASSERT_EQ(expectation.expected[i], chars[i]);
        }
    }
}
// Inserts the integers 100..109 and checks ==, >= and <= searches around 106.
TEST(ArtTest, test_int_range_hundreds) {
    art_tree t;
    art_tree_init(&t);

    art_document doc = get_document(1);
    const int CHAR_LEN = 9;
    unsigned char chars[CHAR_LEN];

    for(uint32_t value = 100; value < 110; value++) {
        encode_int(value, chars);
        ASSERT_TRUE(NULL == art_insert(&t, (unsigned char*)chars, CHAR_LEN, &doc, 1));
    }

    encode_int(106, chars);

    struct range_case {
        int compare;
        int expected_count;
    };

    const range_case cases[] = {
        {0, 1},   // == 106 -> {106}
        {1, 4},   // >= 106 -> {106, 107, 108, 109}
        {-1, 7},  // <= 106 -> {100, ..., 106}
    };

    std::vector<const art_leaf*> results;
    int res;

    for(const range_case & expectation: cases) {
        results.clear();
        res = art_int_search(&t, 106, expectation.compare, results);
        ASSERT_TRUE(res == 0);
        ASSERT_EQ(expectation.expected_count, results.size());
    }

    res = art_tree_destroy(&t);
    ASSERT_TRUE(res == 0);
}
// Inserts the integers 0..999999 and checks ==, >= and <= searches at small values, at powers
// of ten, and at the upper edge of the inserted range.
TEST(ArtTest, test_int_range_millions) {
    art_tree t;
    art_tree_init(&t);

    art_document doc = get_document(1);
    const int CHAR_LEN = 9;
    unsigned char chars[CHAR_LEN];

    for(uint32_t i = 0; i < 1000000; i++) {
        encode_int(i, chars);
        ASSERT_TRUE(NULL == art_insert(&t, (unsigned char*)chars, CHAR_LEN, &doc, 1));
    }

    std::vector<const art_leaf*> results;

    // ==
    // Powers of ten are built with integer arithmetic (1, 10, ..., 100000) so the probed values
    // do not depend on how a floating point pow() result truncates.
    uint32_t power_of_ten = 1;
    for(uint32_t i = 0; i < 6; i++) {
        results.clear();
        art_int_search(&t, power_of_ten, 0, results);
        ASSERT_EQ(1, results.size());

        results.clear();
        art_int_search(&t, power_of_ten + 7, 0, results);
        ASSERT_EQ(1, results.size());

        power_of_ten *= 10;
    }

    results.clear();
    art_int_search(&t, 1000000-1, 0, results);
    ASSERT_EQ(1, results.size());

    // >=
    results.clear();
    art_int_search(&t, 1000000-5, 1, results);
    ASSERT_EQ(5, results.size());

    results.clear();
    art_int_search(&t, 1000000-1, 1, results);
    ASSERT_EQ(1, results.size());

    results.clear();
    art_int_search(&t, 1000000, 1, results);
    ASSERT_EQ(0, results.size());

    results.clear();
    art_int_search(&t, 5, 1, results);
    ASSERT_EQ(1000000-5, results.size());

    // <=
    results.clear();
    art_int_search(&t, 1000000-5, -1, results);
    ASSERT_EQ(1000000-5+1, results.size());

    results.clear();
    art_int_search(&t, 1000000-1, -1, results);
    ASSERT_EQ(1000000, results.size());

    results.clear();
    art_int_search(&t, 1000000, -1, results);
    ASSERT_EQ(1000000, results.size());

    results.clear();
    art_int_search(&t, 5, -1, results);
    ASSERT_EQ(5+1, results.size());

    // Release the tree, as the other tests in this file do.
    int res = art_tree_destroy(&t);
    ASSERT_TRUE(res == 0);
}
// Inserts the integers 200..299, which straddle the 255/256 byte boundary, and checks that a
// >= search starting at 255 returns every value from 255 to 299.
TEST(ArtTest, test_int_range_byte_boundary) {
    art_tree t;
    art_tree_init(&t);

    art_document doc = get_document(1);
    const int CHAR_LEN = 9;
    unsigned char chars[CHAR_LEN];

    for(uint32_t i = 200; i < 300; i++) {
        encode_int(i, chars);
        ASSERT_TRUE(NULL == art_insert(&t, (unsigned char*)chars, CHAR_LEN, &doc, 1));
    }

    std::vector<const art_leaf*> results;
    art_int_search(&t, 255, 1, results);
    ASSERT_EQ(45, results.size());  // 255..299 inclusive

    // Release the tree, as the other tests in this file do.
    int res = art_tree_destroy(&t);
    ASSERT_TRUE(res == 0);
}