/*
 * art_impl.h
 *
 * This source file is part of the FoundationDB open source project
 *
 * Copyright 2013-2020 Apple Inc. and the FoundationDB project authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef ART_IMPL_H
#define ART_IMPL_H

using art_tree = VersionedBTree::art_tree;
using art_leaf = art_tree::art_leaf;
#define art_node art_tree::art_node

int art_tree::fat_leaf_offset[] = { 0, 0, 0, 0, 0,
                                    sizeof(art_node4), sizeof(art_node16), sizeof(art_node48), sizeof(art_node256) };

int art_tree::node_sizes[] = { 0,
                               sizeof(art_node4),    sizeof(art_node16),    sizeof(art_node48),    sizeof(art_node256),
                               sizeof(art_node4_kv), sizeof(art_node16_kv), sizeof(art_node48_kv), sizeof(art_node256_kv) };

VersionedBTree::art_iterator art_tree::insert(KeyRef &k, void *value) {
#define INIT_DEPTH 0
#define REPLACE 1
    int old_val = 0;
    art_leaf *l = iterative_insert(this->root, &this->root, k, value, INIT_DEPTH, &old_val, REPLACE);
    if (!old_val)
        this->size++;
    return VersionedBTree::art_iterator(l);
}

VersionedBTree::art_iterator art_tree::insert_if_absent(KeyRef &k, void *value, int *existing) {
#define INIT_DEPTH 0
#define DONTREPLACE 0
    art_leaf *l = iterative_insert(this->root, &this->root, k, value, INIT_DEPTH, existing, DONTREPLACE);
    // Only grow the size if the key was not already present
    if (!*existing)
        this->size++;
    return VersionedBTree::art_iterator(l);
}

VersionedBTree::art_iterator art_tree::lower_bound(const KeyRef &key) {
    if (!size)
        return art_iterator(nullptr);
    art_node *n = root;
    art_leaf *res = nullptr;
    int depth = 0;
    art_bound_iterative(n, key, depth, &res, false);
    return art_iterator(res);
}

VersionedBTree::art_iterator art_tree::upper_bound(const KeyRef &key) {
    if (!size)
        return art_iterator(nullptr);
    art_node *n = root;
    art_leaf *res = nullptr;
    int depth = 0;
    art_bound_iterative(n, key, depth, &res, true);
    return art_iterator(res);
}

struct stack_entry {
    art_node *node;
    unsigned char key;
    stack_entry *prev;
    stack_entry(art_node *n, unsigned char k, stack_entry *s) : node(n), key(k), prev(s) {}
    stack_entry() {}
};

art_leaf *art_tree::minimum(art_node *n) {
    // Handle base cases
    if (!n) {
        return nullptr;
    }
    if (ART_IS_LEAF(n)) {
        return ART_LEAF_RAW(n);
    }

    int idx;
    switch (n->type) {
    case ART_NODE4:
        return minimum(((art_node4 *) n)->children[0]);
    case ART_NODE16:
        return minimum(((art_node16 *) n)->children[0]);
    case ART_NODE48:
        idx = 0;
        while (!((art_node48 *) n)->keys[idx])
            idx++;
        idx = ((art_node48 *) n)->keys[idx] - 1;
        return minimum(((art_node48 *) n)->children[idx]);
    case ART_NODE256:
        idx = 0;
        while (!((art_node256 *) n)->children[idx])
            idx++;
        return minimum(((art_node256 *) n)->children[idx]);
    case ART_NODE256_KV:
    case ART_NODE48_KV:
    case ART_NODE16_KV:
    case ART_NODE4_KV:
        return ART_FAT_NODE_LEAF(n);
    default:
        printf("%d\n", n->type);
        UNSTOPPABLE_ASSERT(false);
    }
}

art_leaf *art_tree::maximum(art_node *n) {
    // Handle base cases
    if (!n)
        return nullptr;
    if (ART_IS_LEAF(n))
        return ART_LEAF_RAW(n);

    int idx;
    switch (n->type) {
    case ART_NODE4:
    case ART_NODE4_KV:
        return maximum(((art_node4 *) n)->children[n->num_children - 1]);
    case ART_NODE16:
    case ART_NODE16_KV:
        return maximum(((art_node16 *) n)->children[n->num_children - 1]);
    case ART_NODE48:
    case ART_NODE48_KV:
        idx = 255;
        while (!((art_node48 *) n)->keys[idx])
            idx--;
        idx = ((art_node48 *) n)->keys[idx] - 1;
        return maximum(((art_node48 *) n)->children[idx]);
    case ART_NODE256:
    case ART_NODE256_KV:
        idx = 255;
        while (!((art_node256 *) n)->children[idx])
            idx--;
        return maximum(((art_node256 *) n)->children[idx]);
    default:
        UNSTOPPABLE_ASSERT(false);
    }
}

void art_tree::art_bound_iterative(art_node *n, const KeyRef &k, int depth, art_leaf **result, bool strict) {
    static stack_entry arena[ART_MAX_KEY_LEN]; // Single threaded implementation.
    stack_entry *head = nullptr, *tmp, *curr_arena = arena;
    int ret;
    art_node **child;
    unsigned char *key = (unsigned char *) k.begin();

    while (n) {
        ret = check_bound_node(n, k, &depth, result, strict);
        if (ret == ART_I_FOUND) {
            return;
        }
        if (ret == ART_I_BACKTRACK)
            break;
        if (ret == ART_I_DEPTH) {
            head = new (curr_arena++) stack_entry(n, key[depth], head);
            // if the child is nullptr, then we have to look right; i.e., we start backtracking
            child = find_child(n, key[depth]);
            if (!child)
                break;
            n = *child;
            depth = depth + 1;
            // else go on
        } else {
            UNSTOPPABLE_ASSERT(false);
        }
    }

    art_node *next;
    while (head) {
        n = head->node;
        find_next(n, head->key, &next);
        if (next) {
            *result = minimum(next);
            break;
        } else {
            tmp = head;
            head = head->prev;
        }
    }
}

int art_tree::check_bound_node(art_node *n, const KeyRef &k, int *depth_p, art_leaf **result, bool strict) {
    if (ART_IS_LEAF(n)) {
        int ret = art_bound_leaf(n, k, *depth_p, result);
        *result = ART_LEAF_RAW(n);
        if (ret == ART_LEAF_SMALLER_KEY || (ret == ART_LEAF_MATCH_KEY && strict)) {
            *result = (*result)->next;
        }
        return ART_I_FOUND;
    }
    int depth = *depth_p;
    const int key_len = k.size();
    if (n->type < ART_NODE4_KV) {
        const uint32_t n_partial_len = n->partial_len;
        // Case 1: the search ends on this node
        if (key_len <= (depth + n_partial_len)) {
            // Easy case w/o checking prefix:
            if (0 == n_partial_len) {
                *result = minimum(n);
                return ART_I_FOUND;
            } else {
                // The whole prefix is the set of bytes after the first depth bytes and up to partial_len
                art_leaf *min_leaf;
                int prefix_differs = signed_prefix_mismatch(n, k, depth, &min_leaf, true);
                if (prefix_differs < 0) {
                    // our key is greater than the partial prefix, i.e., this subtree
                    // only stores smaller keys.
                    return ART_I_BACKTRACK;
                } else {
                    // our key is less than or equal to the partial prefix, so this subtree stores our bound (modulo null leaf)
                    if (min_leaf == (art_leaf *) ART_MIN_LEAF_UNSET) {
                        min_leaf = minimum(n);
                    }
                    *result = min_leaf;
                    return ART_I_FOUND;
                }
            }
        }
        // Case 2: the search has to go deeper
        // First, look if you have any prefix. If you do, you have to check this prefix before checking your children
        if (n_partial_len) {
            art_leaf *min_leaf;
            int prefix_differs = signed_prefix_mismatch(n, k, depth, &min_leaf, true);
            if (prefix_differs == 0) {
                // The partial prefix matches (0 ==> match)
                // The mismatch happens after the prefix covered by the node, so it is fine to check its children
                *depth_p += n->partial_len;
                return ART_I_DEPTH;
            } else {
                // The mismatch happens in the prefix of this node. Now there are two options
                // 1. The subtree stores smaller keys.
                //    Have to backtrack
                if (prefix_differs < 0) {
                    return ART_I_BACKTRACK;
                } else {
                    if (min_leaf == (art_leaf *) ART_MIN_LEAF_UNSET) {
                        min_leaf = minimum(n);
                    }
                    *result = min_leaf;
                    return ART_I_FOUND;
                }
            }
        }
        return ART_I_DEPTH;
    } else {
        if (key_len <= depth + n->partial_len) {
            // Case 1: the key ends here
            art_leaf *l = ART_FAT_NODE_LEAF(n);
            // Check if the expanded path matches
            int m = leaf_matches_signed(l, k, depth);
            if (!m) {
                // We have a match. If the bound is not strict, we save the result
                if (!strict) {
                    *result = l;
                    return ART_I_FOUND;
                } else {
                    // Assuming --as we do-- that we have a child, the smallest child in the subtree is the bound
                    *result = minimum_kv(n);
                    return ART_I_FOUND; // We have found the bound. We use this return value so that the value
                                        // we set as result is propagated directly
                }
            }
            if (m < 0) {
                // The key in the fat node is smaller than the desired key
                // This means that the target key does not exist.
                // Also, it means that this subtree as a whole is smaller than the target key
                // So the bound has to be searched by my daddy
                return ART_I_BACKTRACK;
            } else {
                *result = l;
                return ART_I_FOUND;
            }
        }
        if (n->partial_len) {
            art_leaf *l = ART_FAT_NODE_LEAF(n);
            // Note we know target key *does not* end here, so we just need to check the prefix
            // To avoid distinguishing whether the prefix is larger or smaller than min, we compare with the leaf
            // that is readily available w/o issuing a "min"
            // The fat leaf key includes the prefix for sure
            int cmp = memcmp(((unsigned char *) k.begin()) + depth,
                             ((unsigned char *) l->key.begin()) + depth,
                             n->partial_len);
            if (cmp < 0) {
                // Target key is smaller, fat key is then the smallest key larger than current key
                *result = l;
                return ART_I_FOUND;
            } else if (cmp > 0) {
                // Target key is larger than key and hence than subtree. Ask daddy to go right
                return ART_I_BACKTRACK;
            }
            // If prefix is the same, then go in depth with the updated depth value
            *depth_p += n->partial_len;
            return ART_I_DEPTH;
        }
        return ART_I_DEPTH;
    }
}

int art_tree::art_bound_leaf(art_node *n, const KeyRef &k, int depth, art_leaf **result) {
    n = (art_node *) ART_LEAF_RAW(n);
    // Check if the expanded path matches
    int m = leaf_matches_signed((art_leaf *) n, k, depth);
    if (0 == m)
        return ART_LEAF_MATCH_KEY;
    if (m < 0)
        return ART_LEAF_SMALLER_KEY;
    return ART_LEAF_LARGER_KEY;
}

int art_tree::leaf_matches_signed(art_leaf *n, const KeyRef &k, int depth) {
    const int key_len = k.size();
    const unsigned char *key = k.begin();
    int common_length = min(n->key.size() - depth, key_len - depth);
    // Compare the keys starting at the depth
    int cmp = memcmp((n->key.begin()) + depth, key + depth, common_length);
    if (cmp)
        return cmp;
    return n->key.size() - key_len;
}

int art_tree::leaf_matches(const art_leaf *n, const KeyRef &k, int depth) {
    const int key_len = k.size();
    // Fail if the key lengths are different
    if (n->key.size() != (uint32_t) key_len)
        return 1;
    const unsigned char *key = k.begin();
    // Compare the keys starting at the depth
    return memcmp(((unsigned char *) (n->key).begin()) + depth, key + depth, key_len - depth);
}

int art_tree::signed_prefix_mismatch(art_node *n, const KeyRef &k, int depth, art_leaf **min_leaf, bool find_min) {
    int all_to_check = min(n->partial_len, k.size() - depth);
    int max_cmp = min(ART_MAX_PREFIX_LEN, all_to_check);
    // Mark the leaf as unset right away, so that if you don't return in the first loop AND you don't enter the
    // second loop, the leaf is marked correctly as unset
    *min_leaf = (art_leaf *) ART_MIN_LEAF_UNSET;
    int idx;
    for (idx = 0; idx < max_cmp; idx++) {
        if (n->partial[idx] != k.begin()[depth + idx]) {
            return ((int) n->partial[idx]) - ((int) (k.begin()[depth + idx]));
        }
    }
    // So far they are the same.
    // There are two cases now. They are the same BUT we have to check after ART_MAX_PREFIX_LEN, or they are the same
    // and there's nothing more to check.
    if (all_to_check <= ART_MAX_PREFIX_LEN)
        return 0;
    // Else, we need to go deeper and check the bytes after ART_MAX_PREFIX_LEN
    int remaining = all_to_check - ART_MAX_PREFIX_LEN;
    // If the prefix is short we can avoid finding a leaf
    if (remaining > 0) {
        // Prefix is longer than what we've checked, find a leaf
        art_leaf *l = find_min ? minimum(n) : maximum(n);
        *min_leaf = l;
        // We have to compare the last partial_len - ART_MAX_PREFIX_LEN bytes
        // If the minimum is below me, then it has at least my same prefix, so I can safely check
        // all its bytes from depth to depth+remaining
        for (; idx < ART_MAX_PREFIX_LEN + remaining; idx++) {
            if (l->key.begin()[idx + depth] != k.begin()[depth + idx]) {
                return ((int) l->key.begin()[idx + depth]) - ((int) k.begin()[depth + idx]);
            }
        }
    }
    return 0;
}

art_leaf *art_tree::minimum_kv(art_node *n) {
    // Handle base cases
    if (!n) {
        return nullptr;
    }
    if (ART_IS_LEAF(n)) {
        return ART_LEAF_RAW(n);
    }

    int idx;
    switch (n->type) {
    case ART_NODE4_KV:
        return minimum(((art_node4 *) n)->children[0]);
    case ART_NODE16_KV:
        return minimum(((art_node16 *) n)->children[0]);
    case ART_NODE48_KV:
        idx = 0;
        while (!((art_node48 *) n)->keys[idx])
            idx++;
        idx = ((art_node48 *) n)->keys[idx] - 1;
        return minimum(((art_node48 *) n)->children[idx]);
    case ART_NODE256_KV:
        idx = 0;
        while (!((art_node256 *) n)->children[idx])
            idx++;
        return minimum(((art_node256 *) n)->children[idx]);
    default:
        UNSTOPPABLE_ASSERT(false);
    }
}

art_node **art_tree::find_child(art_node *n, unsigned char c) {
    int i, mask, bitfield;
    union {
        art_node4 *p1;
        art_node16 *p2;
        art_node48 *p3;
        art_node256 *p4;
    } p;
    switch (n->type) {
    case ART_NODE4:
    case ART_NODE4_KV:
        p.p1 = (art_node4 *) n;
        switch (n->num_children) {
        case 4:
            if (p.p1->keys[3] == c) {
                return &p.p1->children[3];
            }
        case 3:
            if (p.p1->keys[2] == c) {
                return &p.p1->children[2];
            }
        case 2:
            if (p.p1->keys[1] == c) {
                return &p.p1->children[1];
            }
            /*
             * We need a case 1. Otherwise this could happen: we could be looking for char 0
             * on a node with 0 children (the root node). Since the node has been zeroed on allocation,
             * keys[0] is exactly 0, and we would think we have found a child. Duh.
             */
        case 1:
            if (p.p1->keys[0] == c) {
                return &p.p1->children[0];
            }
        default:
            break;
        }
        break;
    {
    case ART_NODE16:
    case ART_NODE16_KV:
        __m128i cmp;
        p.p2 = (art_node16 *) n;
        // Compare the key to all 16 stored keys
        cmp = _mm_cmpeq_epi8(_mm_set1_epi8(c), _mm_loadu_si128((__m128i *) p.p2->keys));
        // Use a mask to ignore children that don't exist
        mask = (1 << n->num_children) - 1;
        bitfield = _mm_movemask_epi8(cmp) & mask;
        /*
         * If we have a match (any bit set) then we can
         * return the pointer match using ctz to get
         * the index.
         */
        if (bitfield)
            return &p.p2->children[ctz(bitfield)];
        break;
    }
    case ART_NODE48:
    case ART_NODE48_KV:
        p.p3 = (art_node48 *) n;
        i = p.p3->keys[c];
        if (i)
            return &p.p3->children[i - 1];
        break;
    case ART_NODE256:
    case ART_NODE256_KV:
        p.p4 = (art_node256 *) n;
        if (p.p4->children[c])
            return &p.p4->children[c];
        break;
    default:
        UNSTOPPABLE_ASSERT(false);
    }
    return nullptr;
}

void art_tree::find_next(art_node *n, unsigned char c, art_node **out) {
    int i, mask, bitfield;
    union {
        art_node4 *p1;
        art_node16 *p2;
        art_node48 *p3;
        art_node256 *p4;
    } p;
    *out = nullptr;
    switch (n->type) {
    case ART_NODE4:
    case ART_NODE4_KV:
        p.p1 = (art_node4 *) n;
        switch (n->num_children) { // unrolling loop
        case 4:
            if (p.p1->keys[0] > c) {
                *out = p.p1->children[0];
                break;
            }
            if (p.p1->keys[1] > c) {
                *out = p.p1->children[1];
                break;
            }
            if (p.p1->keys[2] > c) {
                *out = p.p1->children[2];
                break;
            }
            if (p.p1->keys[3] > c) {
                *out = p.p1->children[3];
                break;
            }
            break;
        case 3:
            if (p.p1->keys[0] > c) {
                *out = p.p1->children[0];
                break;
            }
            if (p.p1->keys[1] > c) {
                *out = p.p1->children[1];
                break;
            }
            if (p.p1->keys[2] > c) {
                *out = p.p1->children[2];
                break;
            }
            break;
        case 2:
            if (p.p1->keys[0] > c) {
                *out = p.p1->children[0];
                break;
            }
            if (p.p1->keys[1] > c) {
                *out = p.p1->children[1];
                break;
            }
            break;
            // We add a case 1 just in case... (see previous similar comment)
        case 1:
            if (p.p1->keys[0] > c) {
                *out = p.p1->children[0];
                break;
            }
        default:
            break;
        }
        break;
    {
    case ART_NODE16:
    case ART_NODE16_KV:
        __m128i cmp;
        p.p2 = (art_node16 *) n;
        // We did not find the child corresponding to the key. Let's see if we have a next at least
        // Compare the key to all 16 stored keys for Greater than
        cmp = _mm_cmplt_epu8(_mm_set1_epi8(c), _mm_loadu_si128((__m128i *) p.p2->keys));
        // Use a mask to ignore children that don't exist
        mask = (1 << n->num_children) - 1;
        bitfield = _mm_movemask_epi8(cmp) & mask;
        if (bitfield) {
            // ALL children greater than char have their bit set
            // We need the smallest one
            // let's get the least significant bit that is set in the bitfield
            int one = ctz(bitfield);
            *out = p.p2->children[one];
        }
        break;
    }
    case ART_NODE48:
    case ART_NODE48_KV: {
        p.p3 = (art_node48 *) n;
        if (c == 255)
            break;
        unsigned char cc = c + 1;
        do {
            i = p.p3->keys[cc];
            if (i) {
                *out = p.p3->children[i - 1];
                break;
            }
            ++cc;
        } while (cc > 0); // cc wraps around at 256
        break;
    }
    case ART_NODE256:
    case ART_NODE256_KV: {
        p.p4 = (art_node256 *) n;
        unsigned char cc = c + 1;
        if (c == 255)
            break;
        do {
            if (p.p4->children[cc]) {
                *out = p.p4->children[cc];
                break;
            }
            ++cc;
        } while (cc > 0); // cc wraps around at 256
        break;
    }
    default:
        UNSTOPPABLE_ASSERT(false);
    }
}

void art_tree::recursive_delete_binary(art_node *n, art_node **ref, const KeyRef &k, int depth) {
    // Bail if the prefix does not match
    unsigned char *key = (unsigned char *) k.begin();
    unsigned int key_len = (unsigned int) k.size();
    if (n->partial_len) {
        int prefix_len = check_prefix(n, k, depth);
        if (prefix_len != min(ART_MAX_PREFIX_LEN, n->partial_len)) {
            return;
        }
        depth = depth + n->partial_len;
    }

    // Find child node
    art_node **child = find_child(n, key[depth]);
    if (!child)
        return;

    // The child contains the key to be deleted in two cases
    // 1. The child is a leaf (and the key matches)
    // 2.
    //    The child is a fat node (and the key ends in the node and the key matches)

    // If the child is a leaf, delete from this node
    if (ART_IS_LEAF(*child)) {
        art_leaf *l = ART_LEAF_RAW(*child);
        if (!leaf_matches(l, k, depth)) {
            remove_child(n, ref, key[depth], child, depth);
            if (l->prev) {
                l->prev->next = l->next;
            }
            if (l->next) {
                l->next->prev = l->prev;
            }
            return;
        }
        return;
    } else {
        // Fat node and key ends within the child
        // Note that the predicate is on the child node, so at depth+1
        if ((*child)->type >= ART_NODE4_KV && (key_len == depth + 1 + (*child)->partial_len)) {
            // Check if key matches
            art_leaf *l = ART_FAT_NODE_LEAF((*child));
            // Return if key does not match
            if (leaf_matches(l, k, depth + 1)) {
                return;
            }
            // Set prev and next, since the leaf is going away
            if (l->prev) {
                l->prev->next = l->next;
            }
            if (l->next) {
                l->next->prev = l->prev;
            }
            // Remove the fat child. This also triggers compaction
            // NOTE: we are removing the fat leaf on our child, which is at depth+1!!!
            remove_fat_child(*child, child, depth + 1);
            return;
        }
        // Recurse if key is not supposed to be in the child. The child is going to be an internal node
        recursive_delete_binary(*child, child, k, depth + 1);
    }
}

int art_tree::check_prefix(const art_node *n, const KeyRef &k, int depth) {
    int max_cmp = min(min(n->partial_len, ART_MAX_PREFIX_LEN), k.size() - depth);
    int idx;
    for (idx = 0; idx < max_cmp; idx++) {
        if (n->partial[idx] != ((unsigned char *) k.begin())[depth + idx])
            return idx;
    }
    return idx;
}

// Remove a LEAF from a node256/node256kv
void art_tree::remove_child256(art_node256 *n, art_node **ref, unsigned char c) {
    n->children[c] = nullptr;
    n->n.num_children--;

    // Resize to a node48 on underflow, not immediately to prevent
    // thrashing if we sit on the 48/49 boundary
    if (n->n.num_children == 37) {
        art_node48 *new_node;
        if (n->n.type == ART_NODE256_KV) {
            new_node = (art_node48 *) alloc_node(ART_NODE48_KV);
            ART_FAT_NODE_LEAF(&new_node->n) = ART_FAT_NODE_LEAF(&n->n);
        } else {
            new_node = (art_node48 *) alloc_node(ART_NODE48);
        }
        *ref = (art_node *) new_node;
        copy_header((art_node *) new_node, (art_node *) n);

        int pos = 0;
        for (int i = 0; i < 256; i++) {
            if (n->children[i]) {
                new_node->children[pos] = n->children[i];
                new_node->keys[i] = pos + 1;
                pos++;
            }
        }
    }
}

void art_tree::remove_fat_child256(art_node256_kv *n, art_node **ref) {
    // Delete the child by turning the fat node to a normal internal node
    n->n.n.type = ART_NODE256;
    *ref = (art_node *) n;
}

// Remove a LEAF from a node48/node48kv
void art_tree::remove_child48(art_node48 *n, art_node **ref, unsigned char c) {
    int pos = n->keys[c];
    n->keys[c] = 0;
    n->children[pos - 1] = nullptr;
    n->n.num_children--;

    if (n->n.num_children == 12) {
        art_node16 *new_node;
        if (n->n.type == ART_NODE48_KV) {
            new_node = (art_node16 *) alloc_node(ART_NODE16_KV);
            ART_FAT_NODE_LEAF(&new_node->n) = ART_FAT_NODE_LEAF(&n->n);
        } else {
            new_node = (art_node16 *) alloc_node(ART_NODE16);
        }
        *ref = (art_node *) new_node;
        copy_header((art_node *) new_node, (art_node *) n);

        int child = 0;
        for (int i = 0; i < 256; i++) {
            pos = n->keys[i];
            if (pos) {
                new_node->keys[child] = i;
                new_node->children[child] = n->children[pos - 1];
                child++;
            }
        }
    }
}

void art_tree::remove_fat_child48(art_node48_kv *n, art_node **ref) {
    // Delete the child by turning the fat node to a normal internal node
    n->n.n.type = ART_NODE48;
    *ref = (art_node *) n;
}

// Remove a LEAF from a node16/node16kv
void art_tree::remove_child16(art_node16 *n, art_node **ref, art_node **l) {
    int pos = l - n->children;
    memmove(n->keys + pos, n->keys + pos + 1, n->n.num_children - 1 - pos);
    memmove(n->children + pos, n->children + pos + 1, (n->n.num_children - 1 - pos) * sizeof(void *));
    n->n.num_children--;

    if (n->n.num_children == 3) {
        art_node4 *new_node;
        if (n->n.type == ART_NODE16_KV) {
            new_node = (art_node4 *) alloc_node(ART_NODE4_KV);
            ART_FAT_NODE_LEAF(&new_node->n) = ART_FAT_NODE_LEAF(&n->n);
        } else {
            new_node = (art_node4 *) alloc_node(ART_NODE4);
        }
        *ref = (art_node *) new_node;
        copy_header((art_node *) new_node, (art_node *) n);
        memcpy(new_node->keys, n->keys, 4);
        memcpy(new_node->children, n->children, 4 * sizeof(void *));
    }
}

void art_tree::remove_fat_child16(art_node16_kv *n, art_node **ref) {
    // Delete the child by turning the fat node to a normal internal node
    n->n.n.type = ART_NODE16;
    *ref = (art_node *) n;
}

// Remove a LEAF from a node4/node4kv
void art_tree::remove_child4(art_node4 *n, art_node **ref, art_node **l, int depth) {
    int pos = l - n->children;
    // We should do this only if we do not then remove the node altogether
    memmove(n->keys + pos, n->keys + pos + 1, n->n.num_children - 1 - pos);
    memmove(n->children + pos, n->children + pos + 1, (n->n.num_children - 1 - pos) * sizeof(void *));
    n->n.num_children--;

    // Remove nodes with only a single child
    // This can only be done if the node is not a fat node.
    // In that case, the key in the fat node is a prefix to the key in the only child
    // and hence has to be preserved in the fat node
    if (n->n.num_children == 1 && (n->n.type < ART_NODE4_KV) && depth) {
        art_node *child = n->children[0];
        if (!ART_IS_LEAF(child)) {
            // Concatenate the prefixes
            int prefix = n->n.partial_len;
            if (prefix < ART_MAX_PREFIX_LEN) {
                n->n.partial[prefix] = n->keys[0];
                prefix++;
            }
            if (prefix < ART_MAX_PREFIX_LEN) {
                int sub_prefix = min(child->partial_len, ART_MAX_PREFIX_LEN - prefix);
                memcpy(n->n.partial + prefix, child->partial, sub_prefix);
                prefix += sub_prefix;
            }

            // Store the prefix in the child
            memcpy(child->partial, n->n.partial, min(prefix, ART_MAX_PREFIX_LEN));
            child->partial_len += n->n.partial_len + 1;
        }
        // This is done also for the leaf. If the only child is a leaf, just delete the internal node
        // and have the father of the deleted internal node point directly to the leaf.
        *ref = child;
    }
    // What can happen now is that the node has ZERO children.
    // If you have a fat node, you cannot compress when you get to one child
    // So it can happen that the one child gets removed
    // At that point, you transform the fat node into a leaf: the fat key is not a prefix of any subtree!
    // This should only happen to a node4kv: if the node is normal, then already when there is 1 child
    // the node gets compressed.
    // We still check for depth because we want to avoid that the root becomes empty and turns into a leaf
    else if (n->n.num_children == 0 && depth) {
        *ref = (art_node *) SET_LEAF((art_node *) ART_FAT_NODE_LEAF(&n->n));
    } else if (!depth && n->n.num_children == 0) {
        n->children[0] = nullptr;
    }
}

void art_tree::remove_fat_child4(art_node4_kv *n, art_node **ref, int depth) {
    // Delete the child by turning the fat node to a normal internal node
    n->n.n.type = ART_NODE4;
    *ref = (art_node *) n; // Tell daddy that my address has changed
    art_node4 *node = (art_node4 *) n;

    // Remove nodes with only a single child
    if (node->n.num_children == 1 && depth) {
        art_node *child = node->children[0];
        if (!ART_IS_LEAF(child)) {
            // Concatenate the prefixes
            int prefix = node->n.partial_len;
            if (prefix < ART_MAX_PREFIX_LEN) {
                node->n.partial[prefix] = node->keys[0];
                prefix++;
            }
            if (prefix < ART_MAX_PREFIX_LEN) {
                int sub_prefix = min(child->partial_len, ART_MAX_PREFIX_LEN - prefix);
                memcpy(node->n.partial + prefix, child->partial, sub_prefix);
                prefix += sub_prefix;
            }

            // Store the prefix in the child
            memcpy(child->partial, node->n.partial, min(prefix, ART_MAX_PREFIX_LEN));
            child->partial_len += node->n.partial_len + 1;
        }
        *ref = child;
    } else if (!depth && node->n.num_children == 0) {
        // This happens if we have a fat root with the empty key.
        // If only the empty key remains, then the fat root has zero children
        node->children[0] = nullptr;
    }
}

void art_tree::remove_child(art_node *n, art_node **ref, unsigned char c, art_node **l, int depth) {
    switch (n->type) {
    case ART_NODE4_KV:
    case ART_NODE4:
        return remove_child4((art_node4 *) n, ref, l, depth);
    case ART_NODE16_KV:
    case ART_NODE16:
        return remove_child16((art_node16 *) n, ref, l);
    case ART_NODE48_KV:
    case ART_NODE48:
        return remove_child48((art_node48 *) n, ref, c);
    case ART_NODE256_KV:
    case ART_NODE256:
        return remove_child256((art_node256 *) n, ref, c);
    default:
        UNSTOPPABLE_ASSERT(false);
    }
}

// NB: Here, n is the fat node from which we remove the kv pair
void art_tree::remove_fat_child(art_node *n, art_node **ref, int depth) {
    switch (n->type) {
    case ART_NODE4_KV:
        return remove_fat_child4((art_node4_kv *) n, ref, depth);
    case ART_NODE16_KV:
        return remove_fat_child16((art_node16_kv *) n, ref);
    case ART_NODE48_KV:
        return remove_fat_child48((art_node48_kv *) n, ref);
    case ART_NODE256_KV:
        return remove_fat_child256((art_node256_kv *) n, ref);
    default:
        UNSTOPPABLE_ASSERT(false);
    }
}

void art_tree::copy_header(art_node *dest, art_node *src) {
    dest->num_children = src->num_children;
    dest->partial_len = src->partial_len;
    memcpy(dest->partial, src->partial, min(ART_MAX_PREFIX_LEN, src->partial_len));
}

void art_tree::erase(const art_iterator &it) {
    recursive_delete_binary(this->root, &this->root, it.key(), 0);
}

art_leaf *art_tree::iterative_insert(art_node *root, art_node **root_ptr, KeyRef &k, void *value, int depth,
                                     int *old, int replace_existing) {
    art_node *n = root;
    art_node **ref = root_ptr;
    // Ref is the memory location of the father that stores the pointer to the child in which we add the new item.
    // Of course, the root is not the child of anybody. We need the ptr to root because it is possible that the root
    // itself grows by adding nodes, and hence gets relocated.
    // Then, we need to update the ptr to the root.
    while (n) {
        if (ART_IS_LEAF(n)) {
            return insert_leaf(n, ref, k, value, depth, old, replace_existing);
        }
        art_leaf *min_of_n = nullptr;
        if (n->partial_len) {
            // Determine if the prefixes differ, since we need to split
            int prefix_diff = prefix_mismatch(n, k, depth, &min_of_n);
            if ((uint32_t) prefix_diff >= n->partial_len) {
                depth += n->partial_len;
                // handle fat key or go in depth
            } else {
                return insert_internal_node(n, ref, k, value, depth, old, min_of_n, prefix_diff, replace_existing);
            }
        }
        if (k.size() == (depth)) {
            return insert_fat_node(n, ref, k, value, depth, old, min_of_n, replace_existing);
        }
        // Find a child to recurse to
        // Child is the pointer to the memory location within the node that contains the ptr to the child
        art_node **child = find_child(n, ((unsigned char *) k.begin())[depth]);
        if (!child) {
            break;
        }
        // Else go on. Update depth, the current node and its reference in the list of children in the parent
        depth++;
        ref = child;
        n = *ref;
    }
    return insert_child(n, ref, k, value, depth, old, replace_existing);
}

art_leaf *art_tree::insert_child(art_node *n, art_node **ref, const KeyRef &k, void *value, int depth, int *old,
                                 int replace_existing) {
    // No child, node goes within us
    art_leaf *l = make_leaf(k, value);
    const unsigned char *key = (const unsigned char *) k.begin();
    add_child(n, ref, key[depth], SET_LEAF(l));
    // After adding a child, the node might have grown (and the old pointer n becomes invalid)
    art_node *new_node = *ref;
    art_node *pn = nullptr;
    art_leaf *lm = nullptr;
    int prev_next = art_next_prev(new_node, key[depth], &pn);
    if (prev_next == ART_NEXT) {
        // If the char of this key is NOT the maximum in the node, take the key's next node from curr_node
        // If the next node exists, then the MIN of the next node is the new_key's next
        // NB: The fat leaf cannot be the next, so we search for the min in the subtree
        lm = minimum(pn);
        // new key is LOWER than min
        insert_before(l, lm);
    } else if (prev_next == ART_PREV) {
        // If the next does not exist, it means there is a prev (the curr_node has at least one child).
        // The curr key is the next of the MAX of the prev node
        // Check if the prev is the fat_leaf
        if (pn == new_node) {
            lm = ART_FAT_NODE_LEAF(new_node);
        } else {
            lm = maximum(pn);
        }
        // new key is BIGGER than lm
        insert_after(l, lm);
    }
    // if it is ART_NEITHER, then there's no prev nor next. They have been set to nullptr already
    return l;
}

int art_tree::art_next_prev(art_node *n, unsigned char c, art_node **out) {
    find_next(n, c, out);
    if (*out) {
        return ART_NEXT;
    }
    find_prev(n, c, out);
    if (*out) {
        return ART_PREV;
    }
    return ART_NEITHER;
}

art_leaf *art_tree::insert_internal_node(art_node *n, art_node **ref, const KeyRef &k, void *value, int depth,
                                         int *old, art_leaf *min_of_n, int prefix_diff, int replace_existing) {
    art_node4 *new_node;
    bool kv_creat = (prefix_diff == (k.size() - depth));
    if (!kv_creat) {
        new_node = (art_node4 *) alloc_node(ART_NODE4);
        *ref = (art_node *) new_node;
        new_node->n.partial_len = prefix_diff;
        memcpy(new_node->n.partial, n->partial, min(ART_MAX_PREFIX_LEN, prefix_diff));

        // Adjust the prefix of the old node
        if (n->partial_len <= ART_MAX_PREFIX_LEN) {
            add_child4(new_node, ref, n->partial[prefix_diff], n);
            n->partial_len -= (prefix_diff + 1);
            memmove(n->partial, n->partial + prefix_diff + 1, min(ART_MAX_PREFIX_LEN, n->partial_len));
        } else {
            n->partial_len -= (prefix_diff + 1);
            art_leaf *l = min_of_n == nullptr ? minimum(n) : min_of_n;
            min_of_n = l;
            unsigned char *lkey = (unsigned char *) l->key.begin();
            add_child4(new_node, ref, lkey[depth + prefix_diff], n);
            memcpy(n->partial, lkey + depth + prefix_diff + 1, min(ART_MAX_PREFIX_LEN, n->partial_len));
        }

        // Insert the new leaf
        art_leaf *l = make_leaf(k, value);
        add_child4(new_node, ref, ((unsigned char *) k.begin())[depth + prefix_diff], SET_LEAF(l));

        art_node *pn;
        art_leaf *lm = nullptr;
        int prev_next = art_next_prev((art_node *) new_node, l->key[depth + prefix_diff], &pn);
        if (prev_next == ART_NEXT) {
            // If the char of this key is NOT the maximum in the node, take the key's next node from curr_node
            // If the next node exists, then the MIN of the next node is the new_key's next
            lm = minimum(pn);
            insert_before(l, lm);
        } else if (prev_next == ART_PREV) {
            // If the next does not exist, it means there is a prev (the curr_node has at least one child).
            // The curr key is the next of the MAX of the prev node
            // We do not check for fat node
            lm = maximum(pn);
            // new key is BIGGER than lm
            insert_after(l, lm);
        }
        return l;
    } else {
        // The new key becomes a fat node that has as child the current internal node.
        // The current internal node has to be indexed by the first char in the prefix, whose size has to be reduced
        // by one accordingly
        new_node = (art_node4 *) alloc_kv_node(ART_NODE4_KV);
        ART_FAT_NODE_LEAF(&new_node->n) = make_leaf(k, value);

        // This is legacy code. I think it is valid, but probably it can be simplified in our case
        *ref = (art_node *) new_node;
        new_node->n.partial_len = prefix_diff;
        memcpy(new_node->n.partial, n->partial, min(ART_MAX_PREFIX_LEN, prefix_diff));

        // Adjust the prefix of the old node
        if (n->partial_len <= ART_MAX_PREFIX_LEN) {
            add_child4(new_node, ref, n->partial[prefix_diff], n);
            n->partial_len -= (prefix_diff + 1);
            memmove(n->partial, n->partial + prefix_diff + 1, min(ART_MAX_PREFIX_LEN, n->partial_len));
        } else {
            n->partial_len -= (prefix_diff + 1);
            art_leaf *l = min_of_n == nullptr ? minimum(n) : min_of_n;
            min_of_n = l;
            add_child4(new_node, ref, ((unsigned char *) l->key.begin())[depth + prefix_diff], n);
            memcpy(n->partial, l->key.begin() + depth + prefix_diff + 1, min(ART_MAX_PREFIX_LEN, n->partial_len));
        }

        // Fat leaf is the smallest in this subtree. So the minimum in the subtree is its next
        art_leaf *lm = min_of_n == nullptr ? minimum(n) : min_of_n; // FIXME: reuse from before if already taken
        min_of_n = lm;
        // new key is LOWER than min
        art_leaf *l_new = ART_FAT_NODE_LEAF(&new_node->n);
        insert_before(l_new, lm);
        return l_new;
    }
}

art_leaf *art_tree::insert_fat_node(art_node *n, art_node **ref, const KeyRef &k, void *value, int depth, int *old,
                                    art_leaf *min_of_n, int replace_existing) {
    if (n->type >= ART_NODE4_KV) {
        // If the node is already fat, it means you already have the key
        *old = 1;
        art_leaf *l = ART_FAT_NODE_LEAF(n);
        if (replace_existing) {
            l->value = value;
        }
        return l;
    } else {
        // We have to transform a node into a fat node
        // The minimum function considers the leaf to be the minimum.
        // Let's grab the minimum BEFORE we create the fat node and insert the new leaf :)
        art_leaf *lm = min_of_n == nullptr ? minimum(n) : min_of_n;
        art_node *nkv = n;
        // change the type before dereferencing the leaf
        nkv->type = static_cast<ART_NODE_TYPE>(n->type + 4);
        art_leaf *l_new = make_leaf(k, value);
        ART_FAT_NODE_LEAF(nkv) = l_new;
        *ref = (art_node *) nkv;
        // Fat leaf is the smallest in this subtree.
        // So the minimum in the subtree is its next
        // new key is LOWER than min
        if (lm)
            insert_before(l_new, lm);
        return l_new;
    }
}

art_leaf *art_tree::insert_leaf(art_node *n, art_node **ref, const KeyRef &k, void *value, int depth, int *old,
                                int replace_existing) {
    art_leaf *l = ART_LEAF_RAW(n);
    // Check if we are updating an existing value
    // Original case
    if (!leaf_matches(l, k, depth)) {
        *old = 1;
        if (replace_existing) {
            l->value = value;
        }
        return l;
    }

    const unsigned char *key = (const unsigned char *) k.begin();
    const int key_len = k.size();
    int longest_prefix = longest_common_prefix(k, l->key, depth);
    int kv_creat = (longest_prefix == (min(key_len, l->key.size()) - depth));

    if (!kv_creat) {
        // Common case, old code
        // Recompute longest prefix from depth. Probably can be optimized
        // New value, we must split the leaf into a node4
        art_node4 *new_node = (art_node4 *) alloc_node(ART_NODE4);

        // Create a new leaf
        art_leaf *l2 = make_leaf(k, value);

        // Determine longest prefix
        new_node->n.partial_len = longest_prefix;
        memcpy(new_node->n.partial, key + depth, min(ART_MAX_PREFIX_LEN, longest_prefix));

        // Add the leaves to the new node4
        *ref = (art_node *) new_node;
        add_child4(new_node, ref, l->key[depth + longest_prefix], SET_LEAF(l));
        add_child4(new_node, ref, l2->key[depth + longest_prefix], SET_LEAF(l2));

        // Set next and prev. Check if new leaf goes before or after existing leaf
        if (l2->key[depth + longest_prefix] < l->key[depth + longest_prefix]) {
            // New key comes BEFORE existing key
            insert_before(l2, l);
        } else {
            // New key comes AFTER existing key
            insert_after(l2, l);
        }
        return l2;
    }

    // The new key is a superset of the key in the leaf.
    // So the current leaf becomes the value of the kv node
    if (key_len > l->key.size()) {
        art_node4_kv *new_node = (art_node4_kv *) alloc_kv_node(ART_NODE4_KV);

        // Create a new leaf with the new key
        art_leaf *l2 = make_leaf(k, value);

        // Determine longest prefix
        new_node->n.n.partial_len = longest_prefix;
        memcpy(new_node->n.n.partial, key + depth, min(ART_MAX_PREFIX_LEN, longest_prefix));

        // Add the leaf to the new node4
        *ref = (art_node *) new_node;
        add_child4((art_node4 *) &new_node->n, ref, l2->key[depth + longest_prefix], SET_LEAF(l2));
        ART_FAT_NODE_LEAF(&new_node->n.n) = l;
        art_leaf *fat_leaf = ART_FAT_NODE_LEAF(&new_node->n.n);

        // The new key goes in the leaf and its predecessor is the key in the new fat node
        insert_after(l2, fat_leaf);
        return l2;
    } else {
        // The key in the leaf is a superset of the new key
        // So the leaf stays a leaf and the new key goes in the kv_node
        art_node4_kv *new_node = (art_node4_kv *) alloc_kv_node(ART_NODE4_KV);
        art_leaf *fat_leaf = make_leaf(k, value);
        ART_FAT_NODE_LEAF(&new_node->n.n) = fat_leaf;
        new_node->n.n.partial_len = longest_prefix;
        memcpy(new_node->n.n.partial, key + depth, min(ART_MAX_PREFIX_LEN, longest_prefix));
        *ref = (art_node *) new_node;
        add_child4((art_node4 *) &new_node->n, ref, l->key[depth + longest_prefix], SET_LEAF(l));

        // The fat node is the prev of the current leaf
        insert_before(fat_leaf, l);
        return fat_leaf;
    }
}

int art_tree::longest_common_prefix(const KeyRef &k1, const KeyRef &k2, int depth) {
    int max_cmp = min(k1.size(), k2.size()) - depth;
    int idx;
    for (idx = 0; idx < max_cmp; idx++) {
        if (((unsigned char *) k1.begin())[depth + idx] != ((unsigned char *) k2.begin())[depth + idx])
            return idx;
    }
    return idx;
}

void art_tree::insert_before(art_leaf *l_new, art_leaf *l_existing) {
    l_new->next = l_existing;
    if (l_existing) {
        l_new->prev = l_existing->prev;
        if (l_existing->prev) {
            l_existing->prev->next = l_new;
        }
        l_existing->prev = l_new;
    } else {
        l_new->prev = nullptr;
    }
}

void art_tree::insert_after(art_leaf *l_new, art_leaf *l_existing) {
    l_new->prev = l_existing;
    if (l_existing) {
        l_new->next = l_existing->next;
        if (l_existing->next) {
            l_existing->next->prev = l_new;
        }
        l_existing->next = l_new;
    } else {
        l_new->next = nullptr;
    }
}

void art_tree::add_child(art_node *n, art_node **ref, unsigned char c, void *child) {
    switch (n->type) {
    case ART_NODE4_KV:
    case ART_NODE4:
        return add_child4((art_node4 *) n, ref, c, child);
    case ART_NODE16_KV:
    case ART_NODE16:
        return add_child16((art_node16 *) n, ref, c, child);
    case ART_NODE48_KV:
    case ART_NODE48:
        return add_child48((art_node48 *) n, ref, c, child);
    case ART_NODE256_KV:
    case ART_NODE256:
        return add_child256((art_node256 *) n, ref, c, child);
    default:
        UNSTOPPABLE_ASSERT(false);
    }
}

void art_tree::add_child256(art_node256 *n, art_node **ref, unsigned char c, void *child) {
    (void) ref;
    n->n.num_children++;
    n->children[c] = (art_node *) child;
}

void art_tree::add_child48(art_node48 *n, art_node **ref, unsigned char c, void *child) {
    if (n->n.num_children < 48) {
        int pos = 0;
        while (n->children[pos])
            pos++;
        n->children[pos] = (art_node *) child;
        n->keys[c] = pos + 1;
        n->n.num_children++;
    } else {
        art_node256 *new_node;
        // Copy the fat leaf pointer if needed
        if (n->n.type == ART_NODE48_KV) {
            new_node = (art_node256 *) alloc_node(ART_NODE256_KV);
            ART_FAT_NODE_LEAF(&new_node->n) = ART_FAT_NODE_LEAF(&n->n);
        } else {
            new_node = (art_node256 *) alloc_node(ART_NODE256);
        }
        for (int i = 0; i < 256; i++) {
            if (n->keys[i]) {
                new_node->children[i] = n->children[n->keys[i] - 1];
            }
        }
        copy_header((art_node *) new_node, (art_node *) n);
        *ref = (art_node *) new_node;
        add_child256(new_node, ref, c, child);
    }
}

void art_tree::add_child16(art_node16 *n, art_node **ref, unsigned char c, void *child) {
    if (n->n.num_children < 16) {
        __m128i cmp;
        // Compare the key to all 16 stored keys
        cmp = _mm_cmplt_epu8(_mm_set1_epi8(c), _mm_loadu_si128((__m128i *) n->keys));

        // Use a mask to ignore children that don't exist
        unsigned mask = (1 << n->n.num_children) - 1;
        unsigned bitfield = _mm_movemask_epi8(cmp) & mask;

        // Check if less than any
        unsigned idx;
        if (bitfield) {
            idx = ctz(bitfield);
            memmove(n->keys + idx + 1, n->keys + idx, n->n.num_children - idx);
            memmove(n->children + idx + 1, n->children + idx, (n->n.num_children - idx) * sizeof(void *));
        } else
            idx = n->n.num_children;

        // Set the child
        n->keys[idx] = c;
        n->children[idx] = (art_node *) child;
        n->n.num_children++;
    } else {
        art_node48 *new_node;
        // Check whether this is a fat node
        if (n->n.type == ART_NODE16_KV) {
            new_node = (art_node48 *) alloc_node(ART_NODE48_KV);
            ART_FAT_NODE_LEAF(&new_node->n) = ART_FAT_NODE_LEAF(&n->n);
        } else {
            new_node = (art_node48 *) alloc_node(ART_NODE48);
        }

        // Copy the child pointers and populate the key map
        memcpy(new_node->children, n->children, sizeof(void *) * n->n.num_children);
        for (int i = 0; i < n->n.num_children; i++) {
            new_node->keys[n->keys[i]] = i + 1;
        }
        copy_header((art_node *) new_node, (art_node *) n);
        *ref = (art_node *) new_node;
        add_child48(new_node, ref, c, child);
    }
}

void art_tree::add_child4(art_node4 *n, art_node **ref, unsigned char c, void *child) {
    if (n->n.num_children < 4) {
        int idx;
        for (idx = 0; idx < n->n.num_children; idx++) {
            if (c < n->keys[idx])
                break;
        }

        // Shift to make room
        memmove(n->keys + idx + 1, n->keys + idx, n->n.num_children - idx);
        memmove(n->children + idx + 1, n->children + idx, (n->n.num_children - idx) * sizeof(void *));

        // Insert element
        n->keys[idx] = c;
        n->children[idx] = (art_node *) child;
        n->n.num_children++;
    } else {
        art_node16 *new_node;
        // Check whether this is a fat node
        if (n->n.type == ART_NODE4_KV) {
            new_node = (art_node16 *) alloc_node(ART_NODE16_KV);
            ART_FAT_NODE_LEAF(&new_node->n) = ART_FAT_NODE_LEAF(&n->n);
        } else {
            new_node = (art_node16 *) alloc_node(ART_NODE16);
        }

        // Copy the child pointers and the key map
        memcpy(new_node->children, n->children, sizeof(void *) * n->n.num_children);
        memcpy(new_node->keys, n->keys, sizeof(unsigned char) * n->n.num_children);
        copy_header((art_node *) new_node, (art_node *) n);
        *ref = (art_node *) new_node;
        add_child16(new_node, ref, c, child);
    }
}

// Every node is actually allocated as a kv node, but the type stays <= NODE256
art_node *art_tree::alloc_node(ART_NODE_TYPE type) {
    const int offset = type > ART_NODE256 ? 0 : ART_NODE256;
    art_node *n = (art_node *) new ((Arena &) *this->arena) uint8_t[node_sizes[offset + type]]();
    n->type = type;
    return n;
}

art_node *art_tree::alloc_kv_node(ART_NODE_TYPE type) {
    art_node *n = (art_node *) new ((Arena &) *this->arena) uint8_t[node_sizes[type]]();
    n->type = type;
    return n;
}

art_leaf *art_tree::make_leaf(const KeyRef &k, void *value) {
    const int key_len = k.size();
    // Allocate a contiguous buffer to hold the leaf and the key pointed to by the KeyRef
    art_leaf *v = (art_leaf *) new ((Arena &) *this->arena) uint8_t[sizeof(art_leaf) + key_len];
    // copy the key to the proper offset in the buffer
    memcpy(v + 1, k.begin(), key_len);
    KeyRef nkr = KeyRef((const uint8_t *) (v + 1), key_len);
    // create the art_leaf, allocating it on the buffer
    // The KeyRef& passed as argument is the new one, allocated in the buffer
    art_leaf *l = new (v) art_leaf(nkr, value);
    l->prev = nullptr;
    l->next = nullptr;
    l->type = ART_LEAF;
    return l;
}

int art_tree::prefix_mismatch(art_node *n, KeyRef &k, int depth, art_leaf **minout) {
    const int key_len = k.size();
    const unsigned char *key = (const unsigned char *) k.begin();
    int max_cmp = min(min(ART_MAX_PREFIX_LEN, n->partial_len), key_len - depth);
    int idx;
    for (idx = 0; idx < max_cmp; idx++) {
        if (n->partial[idx] != key[depth + idx])
            return idx;
    }
    // If the prefix is short we can avoid finding a leaf
    if (n->partial_len > ART_MAX_PREFIX_LEN) {
        // Prefix is longer than what we've checked, find a leaf
        art_leaf *l = minimum(n);
        *minout = l;
        max_cmp = min(l->key.size(), key_len) - depth;
        for (; idx < max_cmp; idx++) {
            if (l->key.begin()[idx + depth] != k.begin()[depth + idx])
                return idx;
        }
    }
    return idx;
}

// Find the largest child smaller than c; for fat nodes, fall back to the node itself (its fat leaf) if no such child exists
void art_tree::find_prev(art_node *n, unsigned char c, art_node **out) {
    int i, mask, bitfield;
    *out = nullptr;
    union {
        art_node4 *p1;
        art_node16 *p2;
        art_node48 *p3;
        art_node256 *p4;
    } p;
    switch (n->type) {
    case ART_NODE4_KV:
    case ART_NODE4:
        p.p1 = (art_node4 *) n;
        for (i = 0; i < n->num_children; i++) {
            if (p.p1->keys[i] < c) {
                *out = p.p1->children[i];
            } else {
                break;
            }
        }
        if (i == 0 && n->type == ART_NODE4_KV) {
            *out = n;
        }
        break;
    {
        __m128i cmp;
    case ART_NODE16_KV:
    case ART_NODE16:
        p.p2 = (art_node16 *) n;
        // Compare the key to all 16 stored keys for less than
        cmp = _mm_cmpgt_epu8(_mm_set1_epi8(c), _mm_loadu_si128((__m128i *) p.p2->keys));
        // Use a mask to ignore children that don't exist
        mask = (1 << n->num_children) - 1;
        bitfield = _mm_movemask_epi8(cmp) & mask;
        if (bitfield) {
            // We have at least one bit set to one, so the largest smaller exists
            int one = clz(bitfield);
            one = (sizeof(bitfield) * 8) - one;
            *out = p.p2->children[one - 1];
        } else {
            if (n->type == ART_NODE16_KV) {
                *out = n;
            }
        }
        break;
    }
    case ART_NODE48_KV:
    case ART_NODE48: {
        p.p3 = (art_node48 *) n;
        unsigned char cc = c - 1;
        while (cc < 255) {
            i = p.p3->keys[cc];
            if (i) {
                *out = p.p3->children[i - 1];
                break;
            }
            --cc;
        }
        if (cc == 255 && n->type == ART_NODE48_KV) {
            *out = n;
        }
        break;
    }
    case ART_NODE256_KV:
    case ART_NODE256: {
        p.p4 = (art_node256 *) n;
        unsigned char cc = c - 1;
        if (c == 0 && n->type == ART_NODE256_KV) {
            *out = n;
        }
        while (cc < 255) {
            if (p.p4->children[cc]) {
                *out = p.p4->children[cc];
                break;
            }
            --cc;
        }
        break;
    }
    default:
        UNSTOPPABLE_ASSERT(false);
    }
}

#endif // ART_IMPL_H