NOTE(review): this patch was re-flowed from a whitespace-collapsed copy in which all text between
'<' and '>' had been stripped. Added lines were restored from what the code itself establishes.
Context and removed lines (line breaks, indentation, the priority_queue and art_fuzzy_recurse
template arguments, and the #include targets in test/art_test.cpp) are best-effort
reconstructions -- verify them against the target files before applying. The index hashes are
those of the original patch and no longer describe the changed added lines.

diff --git a/include/art.h b/include/art.h
index 9a53b5a4..65ccb995 100644
--- a/include/art.h
+++ b/include/art.h
@@ -239,7 +239,12 @@ int art_iter_prefix(art_tree *t, const unsigned char *prefix, int prefix_len, ar
 int art_fuzzy_search(art_tree *t, const unsigned char *term, const int term_len, const int max_cost, const int max_words,
                      const token_ordering token_order, const bool prefix, std::vector<art_leaf *> &results);
 
-static int topk_iter(const art_node *root, token_ordering token_order, const int max_results, std::vector<art_leaf *> & results);
+static int art_topk_iter(const art_node *root, token_ordering token_order, const int max_results,
+                         std::vector<art_leaf *> &results);
+
+void encode_int(uint32_t n, unsigned char* chars);
+
+int art_int_search(art_tree *t, uint32_t value, int compare, std::vector<art_leaf *> & results);
 
 #ifdef __cplusplus
 }
diff --git a/src/art.cpp b/src/art.cpp
index a9850310..2db74985 100644
--- a/src/art.cpp
+++ b/src/art.cpp
@@ -28,10 +28,15 @@
 
 #define microseconds std::chrono::duration_cast<std::chrono::microseconds>
 
+enum recurse_progress { CONTINUE, ABORT, ITERATE };
+
 static void art_fuzzy_recurse(char p, char c, const art_node *n, int depth, const unsigned char *term,
                               const int term_len, const int* irow, const int* jrow, const int max_cost,
                               const bool prefix, std::vector<const art_node *> &results);
 
+void art_int_fuzzy_recurse(art_node *n, int depth, unsigned char* int_str, int int_str_len,
+                           int compare, std::vector<art_leaf *> &results);
+
 bool compare_art_leaf_frequency(const art_leaf *a, const art_leaf *b) {
     return a->values->ids.getLength() > b->values->ids.getLength();
 }
@@ -883,8 +888,9 @@ static uint32_t get_score(art_node* child) {
     return child->max_token_count;
 }
 
-static int topk_iter(const art_node *root, token_ordering token_order, const int max_results, std::vector<art_leaf *> & results) {
-    printf("INSIDE topk_iter: root->type: %d\n", root->type);
+static int art_topk_iter(const art_node *root, token_ordering token_order, const int max_results,
+                         std::vector<art_leaf *> &results) {
+    printf("INSIDE art_topk_iter: root->type: %d\n", root->type);
 
     std::priority_queue<const art_node *, std::vector<const art_node *>,
             std::function<bool(const art_node*, const art_node*)>> q;
@@ -952,7 +958,7 @@ static int topk_iter(const art_node *root, token_ordering token_order, const int
         }
     }
 
-    printf("OUTSIDE topk_iter: results size: %d\n", results.size());
+    printf("OUTSIDE art_topk_iter: results size: %zu\n", results.size());
     return 0;
 }
 
@@ -1329,7 +1335,7 @@ int art_fuzzy_search(art_tree *t, const unsigned char *term, const int term_len,
 
     begin = std::chrono::high_resolution_clock::now();
     for(auto node: nodes) {
-        topk_iter(node, token_order, max_words, results);
+        art_topk_iter(node, token_order, max_words, results);
     }
 
     if(token_order == FREQUENCY) {
@@ -1339,6 +1345,215 @@ int art_fuzzy_search(art_tree *t, const unsigned char *term, const int term_len,
     }
 
     time_micro = microseconds(std::chrono::high_resolution_clock::now() - begin).count();
-    std::cout << "Time taken for topk_iter: " << time_micro << "us" << std::endl;
+    std::cout << "Time taken for art_topk_iter: " << time_micro << "us" << std::endl;
     return 0;
-}
\ No newline at end of file
+}
+
+/**
+ * Encodes a 32-bit integer as an ART key: 8 nibbles, most significant first, plus a terminator byte.
+ * Most-significant-first order makes byte-wise key order agree with numeric order, which
+ * art_int_search relies on for range queries.
+ *
+ * @param n     value to encode
+ * @param chars output buffer; must have room for at least 9 bytes
+ */
+void encode_int(uint32_t n, unsigned char* chars) {
+    for(uint32_t i = 0; i < 8; i++) {
+        // nibble 0 holds bits 31..28, nibble 7 holds bits 3..0
+        chars[i] = (unsigned char) ((n >> (28 - 4*i)) & 0x0F);
+    }
+
+    // Terminate the string with a "character" that never appears among the encoded nibbles (0..15), since an
+    // inserted string should not be a prefix of another string in this ART implementation. We choose 46 (.)
+    // instead of '\0', which is ZERO and is a valid nibble value.
+    chars[8] = 46;
+}
+
+/**
+ * Compares one key byte against the query byte at the same position.
+ *
+ * @param a       byte taken from the tree (edge label, compressed prefix or leaf key)
+ * @param b       byte of the encoded query value
+ * @param compare -1 selects keys <= query, 0 selects keys == query, 1 selects keys >= query
+ * @return CONTINUE when the bytes are equal and the next byte decides, ITERATE when everything below
+ *         this point satisfies the comparison, ABORT when nothing below can satisfy it
+ */
+recurse_progress matches(unsigned char a, unsigned char b, int compare) {
+    switch(compare) {
+        case -1:
+            if(a == b) return CONTINUE;
+            return (a < b) ? ITERATE : ABORT;
+        case 0:
+            return (a == b) ? CONTINUE : ABORT;
+        case 1:
+            if(a == b) return CONTINUE;
+            return (a > b) ? ITERATE : ABORT;
+        default:
+            abort();
+    }
+}
+
+// Appends every leaf stored below n (or n itself when it is a leaf) to results.
+static void art_iter(const art_node *n, std::vector<art_leaf *> &results) {
+    // Handle base cases
+    if (!n) return ;
+    if (IS_LEAF(n)) {
+        art_leaf *l = (art_leaf *) LEAF_RAW(n);
+        results.push_back(l);
+        return ;
+    }
+
+    switch (n->type) {
+        case NODE4:
+            for (int i=0; i < n->num_children; i++) {
+                art_iter(((art_node4 *) n)->children[i], results);
+            }
+            break;
+
+        case NODE16:
+            for (int i=0; i < n->num_children; i++) {
+                art_iter(((art_node16 *) n)->children[i], results);
+            }
+            break;
+
+        case NODE48:
+            for (int i=0; i < 256; i++) {
+                const int idx = ((art_node48*)n)->keys[i];
+                if (!idx) continue;
+                art_iter(((art_node48 *) n)->children[idx - 1], results);
+            }
+            break;
+
+        case NODE256:
+            for (int i=0; i < 256; i++) {
+                if (!((art_node256*)n)->children[i]) continue;
+                art_iter(((art_node256 *) n)->children[i], results);
+            }
+            break;
+
+        default:
+            abort();
+    }
+}
+
+// Applies the comparison to a single child edge: descends on CONTINUE, collects the whole
+// subtree on ITERATE and skips the child on ABORT.
+static inline void art_int_fuzzy_child(art_node *child, unsigned char child_char, int depth,
+                                       unsigned char* int_str, int int_str_len, int compare,
+                                       std::vector<art_leaf *> &results) {
+    const recurse_progress progress = matches(child_char, int_str[depth], compare);
+    if(progress == CONTINUE) {
+        art_int_fuzzy_recurse(child, depth+1, int_str, int_str_len, compare, results);
+    } else if(progress == ITERATE) {
+        art_iter(child, results);
+    }
+}
+
+// Visits the children of inner node n in reverse slot order, comparing each child's edge byte
+// with int_str[depth]. The caller must guarantee depth < int_str_len.
+static inline void art_int_fuzzy_children(const art_node *n, int depth, unsigned char* int_str, int int_str_len,
+                                          int compare, std::vector<art_leaf *> &results) {
+    switch (n->type) {
+        case NODE4:
+            for (int i=n->num_children-1; i >= 0; i--) {
+                art_int_fuzzy_child(((art_node4*)n)->children[i], ((art_node4*)n)->keys[i], depth,
+                                    int_str, int_str_len, compare, results);
+            }
+            break;
+        case NODE16:
+            for (int i=n->num_children-1; i >= 0; i--) {
+                art_int_fuzzy_child(((art_node16*)n)->children[i], ((art_node16*)n)->keys[i], depth,
+                                    int_str, int_str_len, compare, results);
+            }
+            break;
+        case NODE48:
+            for (int i=255; i >= 0; i--) {
+                const int ix = ((art_node48*)n)->keys[i];
+                if (!ix) continue;
+                art_int_fuzzy_child(((art_node48*)n)->children[ix - 1], (unsigned char) i, depth,
+                                    int_str, int_str_len, compare, results);
+            }
+            break;
+        case NODE256:
+            for (int i=255; i >= 0; i--) {
+                if (!((art_node256*)n)->children[i]) continue;
+                art_int_fuzzy_child(((art_node256*)n)->children[i], (unsigned char) i, depth,
+                                    int_str, int_str_len, compare, results);
+            }
+            break;
+        default:
+            abort();
+    }
+}
+
+/**
+ * Walks the subtree rooted at n and appends every leaf whose key satisfies the comparison against
+ * the encoded query int_str.
+ *
+ * @param depth   number of key bytes already matched on the path to n
+ * @param compare -1 for <=, 0 for ==, 1 for >= (see matches)
+ */
+void art_int_fuzzy_recurse(art_node *n, int depth, unsigned char* int_str, int int_str_len,
+                           int compare, std::vector<art_leaf *> &results) {
+    if (!n) return ;
+
+    if(IS_LEAF(n)) {
+        art_leaf *l = (art_leaf *) LEAF_RAW(n);
+        const int end_index = min(l->key_len, int_str_len);
+        while(depth < end_index) {
+            const recurse_progress progress = matches(l->key[depth], int_str[depth], compare);
+            if(progress == ABORT) {
+                return;
+            }
+
+            if(progress == ITERATE) {
+                break;
+            }
+
+            depth++;
+        }
+
+        results.push_back(l);
+        return ;
+    }
+
+    // Only the bytes stored inline in the node (at most MAX_PREFIX_LEN) can be compared here, and we
+    // must not read past the end of the query.
+    const int partial_len = min(MAX_PREFIX_LEN, n->partial_len);
+    const int end_index = min(partial_len, int_str_len - depth);
+
+    for(int idx=0; idx<end_index; idx++) {
+        const recurse_progress progress = matches(n->partial[idx], int_str[depth+idx], compare);
+        if(progress == ABORT) {
+            return;
+        }
+
+        if(progress == ITERATE) {
+            return art_iter(n, results);
+        }
+    }
+
+    depth += n->partial_len;
+
+    if(depth >= int_str_len) {
+        // Query exhausted with every compared byte equal: treat the subtree like the leaf case above does.
+        // Keys produced by encode_int all have the query's length, so this is purely defensive.
+        return art_iter(n, results);
+    }
+
+    art_int_fuzzy_children(n, depth, int_str, int_str_len, compare, results);
+}
+
+/**
+ * Finds the leaves whose encode_int key compares to value as requested.
+ *
+ * @param compare -1 for keys <= value, 0 for keys == value, 1 for keys >= value
+ * @param results matching leaves are appended; the vector is not cleared first
+ * @return always 0
+ */
+int art_int_search(art_tree *t, uint32_t value, int compare, std::vector<art_leaf *> & results) {
+    unsigned char chars[9];
+    encode_int(value, chars);
+    art_int_fuzzy_recurse(t->root, 0, chars, 9, compare, results);
+    return 0;
+}
diff --git a/test/art_test.cpp b/test/art_test.cpp
index b2a052ac..c8e3df3d 100644
--- a/test/art_test.cpp
+++ b/test/art_test.cpp
@@ -2,12 +2,10 @@
 #include <stdio.h>
 #include <string.h>
 #include <inttypes.h>
-
+#include <art.h>
 #include <chrono>
 #include <gtest/gtest.h>
 
-#include "art.h"
-
 art_document get_document(uint32_t id) {
     art_document document;
     document.score = (uint16_t) id;
@@ -603,6 +601,9 @@ TEST(ArtTest, test_art_fuzzy_search_single_leaf) {
     leaves.clear();
     art_fuzzy_search(&t, (const unsigned char *) implement_key_typo2, strlen(implement_key_typo2) + 1, 2, 10, FREQUENCY, false, leaves);
     ASSERT_EQ(1, leaves.size());
+
+    res = art_tree_destroy(&t);
+    ASSERT_TRUE(res == 0);
 }
 
 TEST(ArtTest, test_art_fuzzy_search) {
@@ -674,4 +675,181 @@ TEST(ArtTest, test_art_fuzzy_search) {
     for(auto leaf_index = 0; leaf_index < leaves.size(); leaf_index++) {
         ASSERT_STREQ(words.at(leaf_index), (const char *)leaves.at(leaf_index)->key);
     }
+
+    res = art_tree_destroy(&t);
+    ASSERT_TRUE(res == 0);
+}
+
+TEST(ArtTest, test_encode_int) {
+    unsigned char chars[9];
+
+    // every encoding ends with the terminator byte 46 ('.')
+
+    // 175 => 0000,0000,0000,0000,0000,0000,1010,1111
+    unsigned char chars_175[9] = {0, 0, 0, 0, 0, 0, 10, 15, 46};
+    encode_int(175, chars);
+    for(uint32_t i = 0; i < 9; i++) {
+        ASSERT_EQ(chars_175[i], chars[i]);
+    }
+
+    // 0 => 0000,0000,0000,0000,0000,0000,0000,0000
+    unsigned char chars_0[9] = {0, 0, 0, 0, 0, 0, 0, 0, 46};
+    encode_int(0, chars);
+    for(uint32_t i = 0; i < 9; i++) {
+        ASSERT_EQ(chars_0[i], chars[i]);
+    }
+
+    // 255 => 0000,0000,0000,0000,0000,0000,1111,1111
+    unsigned char chars_255[9] = {0, 0, 0, 0, 0, 0, 15, 15, 46};
+    encode_int(255, chars);
+    for(uint32_t i = 0; i < 9; i++) {
+        ASSERT_EQ(chars_255[i], chars[i]);
+    }
+
+    // 4531 => 0000,0000,0000,0000,0001,0001,1011,0011
+    unsigned char chars_4531[9] = {0, 0, 0, 0, 1, 1, 11, 3, 46};
+    encode_int(4531, chars);
+    for(uint32_t i = 0; i < 9; i++) {
+        ASSERT_EQ(chars_4531[i], chars[i]);
+    }
+
+    // 1200000 => 0000,0000,0001,0010,0100,1111,1000,0000
+    unsigned char chars_1M[9] = {0, 0, 1, 2, 4, 15, 8, 0, 46};
+    encode_int(1200000, chars);
+    for(uint32_t i = 0; i < 9; i++) {
+        ASSERT_EQ(chars_1M[i], chars[i]);
+    }
+}
+
+TEST(ArtTest, test_int_range_hundreds) {
+    art_tree t;
+    art_tree_init(&t);
+
+    art_document doc = get_document(1);
+    const int CHAR_LEN = 9;
+    unsigned char chars[CHAR_LEN];
+
+    for(uint32_t i = 100; i < 110; i++) {
+        encode_int(i, chars);
+        ASSERT_TRUE(NULL == art_insert(&t, (unsigned char*)chars, CHAR_LEN, &doc, 1));
+    }
+
+    std::vector<art_leaf*> results;
+
+    // ==
+    int res = art_int_search(&t, 106, 0, results);
+    ASSERT_TRUE(res == 0);
+    ASSERT_EQ(1, results.size());
+    results.clear();
+
+    // >= : 106..109
+    res = art_int_search(&t, 106, 1, results);
+    ASSERT_TRUE(res == 0);
+    ASSERT_EQ(4, results.size());
+    results.clear();
+
+    // <= : 100..106
+    res = art_int_search(&t, 106, -1, results);
+    ASSERT_TRUE(res == 0);
+    ASSERT_EQ(7, results.size());
+
+    res = art_tree_destroy(&t);
+    ASSERT_TRUE(res == 0);
+}
+
+TEST(ArtTest, test_int_range_millions) {
+    art_tree t;
+    art_tree_init(&t);
+
+    art_document doc = get_document(1);
+
+    const int CHAR_LEN = 9;
+    unsigned char chars[CHAR_LEN];
+
+    for(uint32_t i = 0; i < 1000000; i++) {
+        encode_int(i, chars);
+        ASSERT_TRUE(NULL == art_insert(&t, (unsigned char*)chars, CHAR_LEN, &doc, 1));
+    }
+
+    std::vector<art_leaf*> results;
+
+    // ==
+    // integer powers of ten: pow() returns a double and the cast could truncate an inexact result
+    uint32_t power_of_ten = 1;
+    for(uint32_t i = 0; i < 6; i++) {
+        results.clear();
+        art_int_search(&t, power_of_ten, 0, results);
+        ASSERT_EQ(1, results.size());
+
+        results.clear();
+        art_int_search(&t, power_of_ten + 7, 0, results);
+        ASSERT_EQ(1, results.size());
+
+        power_of_ten *= 10;
+    }
+
+    results.clear();
+    art_int_search(&t, 1000000-1, 0, results);
+    ASSERT_EQ(1, results.size());
+
+    // >=
+    results.clear();
+    art_int_search(&t, 1000000-5, 1, results);
+    ASSERT_EQ(5, results.size());
+
+    results.clear();
+    art_int_search(&t, 1000000-1, 1, results);
+    ASSERT_EQ(1, results.size());
+
+    results.clear();
+    art_int_search(&t, 1000000, 1, results);
+    ASSERT_EQ(0, results.size());
+
+    results.clear();
+    art_int_search(&t, 5, 1, results);
+    ASSERT_EQ(1000000-5, results.size());
+
+    // <=
+    results.clear();
+    art_int_search(&t, 1000000-5, -1, results);
+    ASSERT_EQ(1000000-5+1, results.size());
+
+    results.clear();
+    art_int_search(&t, 1000000-1, -1, results);
+    ASSERT_EQ(1000000, results.size());
+
+    results.clear();
+    art_int_search(&t, 1000000, -1, results);
+    ASSERT_EQ(1000000, results.size());
+
+    results.clear();
+    art_int_search(&t, 5, -1, results);
+    ASSERT_EQ(5+1, results.size());
+
+    int res = art_tree_destroy(&t);
+    ASSERT_TRUE(res == 0);
+}
+
+TEST(ArtTest, test_int_range_byte_boundary) {
+    art_tree t;
+    art_tree_init(&t);
+
+    art_document doc = get_document(1);
+
+    const int CHAR_LEN = 9;
+    unsigned char chars[CHAR_LEN];
+
+    for(uint32_t i = 200; i < 300; i++) {
+        encode_int(i, chars);
+        ASSERT_TRUE(NULL == art_insert(&t, (unsigned char*)chars, CHAR_LEN, &doc, 1));
+    }
+
+    std::vector<art_leaf*> results;
+
+    // >= : 255..299 crosses the 0x0FF -> 0x100 boundary
+    art_int_search(&t, 255, 1, results);
+    ASSERT_EQ(45, results.size());
+
+    int res = art_tree_destroy(&t);
+    ASSERT_TRUE(res == 0);
 }
\ No newline at end of file