Basics of a block-based posting list container.

This commit is contained in:
Kishore Nallan 2021-05-22 15:05:46 +05:30
parent b817e615cb
commit 8b24fe82b8
6 changed files with 835 additions and 2 deletions

View File

@ -242,8 +242,8 @@ private:
void index_string_array_field(const std::vector<std::string> & strings, const int64_t score, art_tree *t,
uint32_t seq_id, bool is_facet, const field & a_field);
void remove_and_shift_offset_index(sorted_array& offset_index, const uint32_t* indices_sorted,
const uint32_t indices_length);
static void remove_and_shift_offset_index(sorted_array& offset_index, const uint32_t* indices_sorted,
const uint32_t indices_length);
void collate_included_ids(const std::vector<std::string>& q_included_tokens,
const std::string & field, const uint8_t field_id,

78
include/posting_list.h Normal file
View File

@ -0,0 +1,78 @@
#pragma once
#include <map>
#include "sorted_array.h"
#include "array.h"
typedef uint32_t last_id_t;
#define FOR_ELE_SIZE sizeof(uint32_t)
#define METADATA_OVERHEAD 5
/*
Compressed chain of blocks that store the document IDs and offsets of a given token.
Offsets of singular and multi-valued fields are encoded differently.
*/
class posting_list_t {
public:
    // A block stores a list of Document IDs, Token Offsets and a Mapping of ID => Offset indices efficiently
    // Layout of *data: [ids...mappings..offsets]
    // IDs and Mappings are sorted integers, while offsets are not sorted
    struct block_t {
        sorted_array ids;           // sorted document IDs of this block
        sorted_array offset_index;  // per-ID start position into `offsets`
        array offsets;              // flattened token offsets of all IDs

        // link to next block
        block_t* next = nullptr;

        // Shifts `offset_index` entries after `index` up by `num_offsets`,
        // making room for a newly inserted ID's offsets.
        void insert_and_shift_offset_index(uint32_t index, uint32_t num_offsets);

        // Removes the given (sorted) positions from `offset_index` and
        // re-bases the remaining entries so they stay consistent with `offsets`.
        void remove_and_shift_offset_index(const uint32_t* indices_sorted, uint32_t num_indices);

        // Inserts `id` along with its token offsets into this block.
        void upsert(uint32_t id, const std::vector<uint32_t>& offsets);

        // Removes `id` and its offsets from this block; no-op when absent.
        void erase(uint32_t id);

        // Number of IDs currently stored in this block.
        uint32_t size() {
            return ids.getLength();
        }
    };

private:
    // when a block reaches pre-allocated storage, it is expanded by this factor
    static constexpr float BLOCK_GROWTH_FACTOR = 1.3;

    // maximum number of IDs (and associated offsets) to store in each block before another block is created
    const uint16_t BLOCK_MAX_ELEMENTS;

    // first block of the chain; a value member, so an empty list needs no heap allocation
    block_t root_block;

    // keeps track of the *last* ID in each block and is used for partial random access
    // e.g. 0..[9], 10..[19], 20..[29]
    // MUST be ordered
    std::map<last_id_t, block_t*> id_block_map;

    // Moves the second half of `src_block`'s contents into (empty) `dst_block`.
    static void split_block(block_t* src_block, block_t* dst_block);

    // Moves the first `num_block2_ids` IDs (and their offsets) of `block2` into `block1`.
    static void merge_adjacent_blocks(block_t* block1, block_t* block2, size_t num_block2_ids);

public:
    posting_list_t() = delete;

    explicit posting_list_t(uint16_t max_block_elements);

    ~posting_list_t();

    // Inserts `id` with its token offsets, splitting/chaining blocks as needed.
    void upsert(uint32_t id, const std::vector<uint32_t>& offsets);

    // Removes `id`, merging under-filled blocks where possible.
    void erase(uint32_t id);

    block_t* get_root();

    // Number of blocks (NOT number of IDs) in the list.
    size_t size();

    // Returns the block whose *last* ID is exactly `id`, or nullptr.
    block_t* block_of(last_id_t id);
};

View File

@ -54,6 +54,8 @@ public:
uint32_t at(uint32_t index);
uint32_t last();
bool contains(uint32_t value);
uint32_t indexOf(uint32_t value);

414
src/posting_list.cpp Normal file
View File

@ -0,0 +1,414 @@
#include "posting_list.h"
#include "for.h"
#include "array_utils.h"
/* block_t operations */
// Makes room in `offset_index` for a new entry at `index`: the value currently
// at `index` is kept for the new slot, and every subsequent entry is shifted up
// by `num_offsets` (the number of offsets the newly inserted ID occupies).
void posting_list_t::block_t::insert_and_shift_offset_index(const uint32_t index, const uint32_t num_offsets) {
    uint32_t existing_offset_index = offset_index.at(index);
    uint32_t length = offset_index.getLength();
    uint32_t new_length = length + 1;
    // uncompress with capacity for one extra element
    uint32_t* curr_array = offset_index.uncompress(new_length);

    // shift the tail one slot to the right to open up position `index`
    memmove(&curr_array[index+1], &curr_array[index], sizeof(uint32_t)*(length - index));
    curr_array[index] = existing_offset_index;

    // entries after the insertion point move up by the new ID's offset count
    uint32_t curr_index = index + 1;
    while(curr_index < new_length) {
        curr_array[curr_index] += num_offsets;
        curr_index++;
    }

    offset_index.load(curr_array, new_length);

    delete [] curr_array;
}
// Inserts `id` with its token positions into this block.
// NOTE(review): assumes ids.append() returns the index at which `id` landed;
// if `id` can already exist in the block, confirm that sorted_array::append
// does not create a duplicate entry.
void posting_list_t::block_t::upsert(const uint32_t id, const std::vector<uint32_t>& positions) {
    size_t inserted_index = ids.append(id);

    if(inserted_index == ids.getLength()-1) {
        // treat as appends
        uint32_t curr_index = offsets.getLength();
        offset_index.append(curr_index);
        for(uint32_t position : positions) {
            offsets.append(position);
        }
    } else {
        // landed in the middle: shift offset_index entries after the insertion
        // point, then splice the positions into `offsets` at the vacated slot
        uint32_t existing_offset_index = offset_index.at(inserted_index);
        insert_and_shift_offset_index(inserted_index, positions.size());
        offsets.insert(existing_offset_index, &positions[0], positions.size());
    }
}
// Removes `id` and all of its token offsets from this block.
// Silently does nothing when `id` is not present.
void posting_list_t::block_t::erase(const uint32_t id) {
    // locate the document inside this block; indexOf returns length when absent
    const uint32_t found_index = ids.indexOf(id);
    if(found_index == ids.getLength()) {
        return;
    }

    // the document's offsets span [begin_offset, past_end_offset)
    const uint32_t begin_offset = offset_index.at(found_index);
    const bool is_last_doc = (found_index == ids.getLength() - 1);
    const uint32_t past_end_offset = is_last_doc ? offsets.getLength()
                                                 : offset_index.at(found_index + 1);

    // drop the index entry first, then the offsets, then the ID itself
    const uint32_t removal_indices[1] = {found_index};
    remove_and_shift_offset_index(removal_indices, 1);
    offsets.remove_index(begin_offset, past_end_offset);
    ids.remove_value(id);
}
// Removes the entries at the given sorted positions from `offset_index` and
// shifts every surviving entry down by the total offset count of the removed
// IDs that precede it, keeping the index consistent once the matching ranges
// are also removed from `offsets`.
void posting_list_t::block_t::remove_and_shift_offset_index(const uint32_t* indices_sorted,
                                                            const uint32_t num_indices) {
    uint32_t *curr_array = offset_index.uncompress();
    uint32_t *new_array = new uint32_t[offset_index.getLength()];
    new_array[0] = 0;

    uint32_t new_index = 0;
    uint32_t curr_index = 0;
    uint32_t indices_counter = 0;
    uint32_t shift_value = 0;   // running total of offsets removed so far

    while(curr_index < offset_index.getLength()) {
        if(indices_counter < num_indices && curr_index >= indices_sorted[indices_counter]) {
            // skip copying
            if(curr_index == indices_sorted[indices_counter]) {
                curr_index++;
                // offset count owned by the skipped ID (zero when it is the
                // last entry, as there is no successor to diff against)
                const uint32_t diff = curr_index == offset_index.getLength() ?
                                      0 : (offset_index.at(curr_index) - offset_index.at(curr_index-1));

                shift_value += diff;
            }
            indices_counter++;
        } else {
            new_array[new_index++] = curr_array[curr_index++] - shift_value;
        }
    }

    offset_index.load(new_array, new_index);

    delete[] curr_array;
    delete[] new_array;
}
/* posting_list_t operations */
// Constructs an empty posting list; `max_block_elements` caps the number of
// IDs stored per block before a new block is chained/split off.
posting_list_t::posting_list_t(uint16_t max_block_elements): BLOCK_MAX_ELEMENTS(max_block_elements) {

}
// Frees every heap-allocated block in the chain. `root_block` itself is a
// value member, so deletion starts from the block after it.
posting_list_t::~posting_list_t() {
    for(block_t* curr = root_block.next; curr != nullptr; ) {
        block_t* victim = curr;
        curr = curr->next;
        delete victim;
    }
}
// Moves the first `num_block2_ids` IDs of `block2` (with their offsets) into
// `block1`, re-basing whatever remains in `block2`. When `num_block2_ids`
// equals block2's size, block2 is fully drained (caller then deletes it).
//
// BUG FIX: the original re-base loop did
//     raw_offset_index2[num_block2_ids + i] -= raw_offset_index2[num_block2_ids];
// which zeroes its own base at i == 0, so when block2 retained two or more IDs
// the remaining entries were never re-based; worse, `num_block2_offset_elements`
// was computed from that clobbered slot afterwards and came out as 0, so the
// moved IDs' offsets were never copied into block1. We now capture the base
// BEFORE any mutation — mirroring how split_block() captures `base_index_diff`.
//
// NOTE(review): reads raw_offsets1[0] unconditionally, so block1 must be
// non-empty — true for the current erase() call sites; confirm for new callers.
void posting_list_t::merge_adjacent_blocks(posting_list_t::block_t* block1, posting_list_t::block_t* block2,
                                           size_t num_block2_ids) {
    // merge ids
    uint32_t* raw_ids1 = block1->ids.uncompress();
    uint32_t* raw_ids2 = block2->ids.uncompress();

    size_t block1_orig_size = block1->size();
    size_t block2_orig_size = block2->size();

    uint32_t* raw_ids = new uint32_t[block1->size() + num_block2_ids];
    std::memmove(raw_ids, raw_ids1, sizeof(uint32_t) * block1->size());
    std::memmove(raw_ids + block1->size(), raw_ids2, sizeof(uint32_t) * num_block2_ids);
    block1->ids.load(raw_ids, block1->size() + num_block2_ids);
    block2->ids.load(raw_ids2 + num_block2_ids, block2->size() - num_block2_ids);

    delete [] raw_ids1;
    delete [] raw_ids2;
    delete [] raw_ids;

    // merge offset indices
    uint32_t* raw_offset_index1 = block1->offset_index.uncompress();
    uint32_t* raw_offset_index2 = block2->offset_index.uncompress();
    uint32_t* raw_offset_index = new uint32_t[block1_orig_size + block2_orig_size];
    std::memmove(raw_offset_index, raw_offset_index1, sizeof(uint32_t) * block1->offset_index.getLength());

    // block2's entries are re-based on top of block1's existing offsets
    size_t start_index = block1->offset_index.getLength();
    size_t base_offset_len = block1->offsets.getLength();

    for(size_t i = 0; i < num_block2_ids; i++) {
        raw_offset_index[start_index + i] = raw_offset_index2[i] + base_offset_len;
    }

    block1->offset_index.load(raw_offset_index, block1->offset_index.getLength() + num_block2_ids);

    // number of offsets that belong to the IDs being moved out of block2:
    // the (pre-rebase) offset_index entry of the first ID left behind, or all
    // of block2's offsets when every ID moves. MUST be read before the re-base
    // loop below mutates raw_offset_index2.
    size_t num_block2_offset_elements = (num_block2_ids == block2_orig_size) ? block2->offsets.getLength() :
                                        raw_offset_index2[num_block2_ids];

    // re-base block2's remaining offset_index entries to be zero-based
    const size_t block2_remaining_ids = block2_orig_size - num_block2_ids;
    for(size_t i = 0; i < block2_remaining_ids; i++) {
        raw_offset_index2[num_block2_ids + i] -= num_block2_offset_elements;
    }

    block2->offset_index.load(raw_offset_index2 + num_block2_ids, block2_remaining_ids);

    // merge offsets
    uint32_t* raw_offsets1 = block1->offsets.uncompress();
    uint32_t* raw_offsets2 = block2->offsets.uncompress();

    uint32_t* raw_offsets = new uint32_t[block1->offsets.getLength() + num_block2_offset_elements];
    uint32_t min = raw_offsets1[0], max = raw_offsets1[0];

    // we have to manually copy over so we can find the new min and max
    for(size_t i = 0; i < block1->offsets.getLength(); i++) {
        raw_offsets[i] = raw_offsets1[i];
        if(raw_offsets[i] < min) {
            min = raw_offsets[i];
        }
        if(raw_offsets[i] > max) {
            max = raw_offsets[i];
        }
    }

    size_t block2_base_index = block1->offsets.getLength();

    for(size_t i = 0; i < num_block2_offset_elements; i++) {
        size_t j = block2_base_index + i;
        raw_offsets[j] = raw_offsets2[i];
        if(raw_offsets[j] < min) {
            min = raw_offsets[j];
        }
        if(raw_offsets[j] > max) {
            max = raw_offsets[j];
        }
    }

    block1->offsets.load(raw_offsets, block1->offsets.getLength() + num_block2_offset_elements, min, max);

    // reset block2 offsets with remaining elements
    if(block2->offsets.getLength() != num_block2_offset_elements) {
        const size_t block2_new_offsets_length = (block2->offsets.getLength() - num_block2_offset_elements);
        uint32_t* block2_new_raw_offsets = new uint32_t[block2_new_offsets_length];
        min = max = raw_offsets2[num_block2_offset_elements];

        for(size_t i = 0; i < block2_new_offsets_length; i++) {
            block2_new_raw_offsets[i] = raw_offsets2[num_block2_offset_elements + i];
            if(block2_new_raw_offsets[i] < min) {
                min = block2_new_raw_offsets[i];
            }
            if(block2_new_raw_offsets[i] > max) {
                max = block2_new_raw_offsets[i];
            }
        }
        block2->offsets.load(block2_new_raw_offsets, block2_new_offsets_length, min, max);
        delete [] block2_new_raw_offsets;
    } else {
        block2->offsets.load(nullptr, 0, 0, 0);
    }

    delete [] raw_offset_index1;
    delete [] raw_offset_index2;
    delete [] raw_offset_index;

    delete [] raw_offsets1;
    delete [] raw_offsets2;
    delete [] raw_offsets;
}
// Splits `src_block` roughly in half, moving the upper half of its IDs,
// offset_index entries and offsets into `dst_block` (assumed empty).
// No-op for blocks with a single element.
void posting_list_t::split_block(posting_list_t::block_t* src_block, posting_list_t::block_t* dst_block) {
    if(src_block->size() <= 1) {
        return;
    }

    // split IDs down the middle
    uint32_t* raw_ids = src_block->ids.uncompress();
    size_t ids_first_half_length = (src_block->size() / 2);
    size_t ids_second_half_length = (src_block->size() - ids_first_half_length);
    src_block->ids.load(raw_ids, ids_first_half_length);
    dst_block->ids.load(raw_ids + ids_first_half_length, ids_second_half_length);

    // offset_index has one entry per ID, so the same midpoint applies
    uint32_t* raw_offset_indices = src_block->offset_index.uncompress();
    size_t offset_indices_first_half_length = (src_block->offset_index.getLength() / 2);
    size_t offset_indices_second_half_length = (src_block->offset_index.getLength() - offset_indices_first_half_length);
    src_block->offset_index.load(raw_offset_indices, offset_indices_first_half_length);

    // update second half to use zero based index
    // (base is captured BEFORE the loop mutates the array)
    uint32_t base_index_diff = raw_offset_indices[offset_indices_first_half_length];
    for(size_t i = 0; i < offset_indices_second_half_length; i++) {
        raw_offset_indices[offset_indices_first_half_length + i] -= base_index_diff;
    }

    dst_block->offset_index.load(raw_offset_indices + offset_indices_first_half_length, offset_indices_second_half_length);

    uint32_t* raw_offsets = src_block->offsets.uncompress();
    size_t src_offsets_length = src_block->offsets.getLength();

    // load first half of offsets
    // `base_index_diff` is also the count of offsets owned by the first half
    size_t offset_first_half_length = base_index_diff;

    // we need to find new min and max
    uint32_t min = raw_offsets[0], max = raw_offsets[0];

    for(size_t i = 0; i < offset_first_half_length; i++) {
        if(raw_offsets[i] < min) {
            min = raw_offsets[i];
        }
        if(raw_offsets[i] > max) {
            max = raw_offsets[i];
        }
    }

    src_block->offsets.load(raw_offsets, offset_first_half_length, min, max);

    // load second half
    min = max = raw_offsets[offset_first_half_length];
    for(size_t i = offset_first_half_length; i < src_offsets_length; i++) {
        if(raw_offsets[i] < min) {
            min = raw_offsets[i];
        }
        if(raw_offsets[i] > max) {
            max = raw_offsets[i];
        }
    }

    size_t offsets_second_half_length = src_offsets_length - offset_first_half_length;
    dst_block->offsets.load(raw_offsets + offset_first_half_length, offsets_second_half_length, min, max);

    delete [] raw_ids;
    delete [] raw_offset_indices;
    delete [] raw_offsets;
}
// Inserts `id` (with its token offsets) into the appropriate block, splitting
// or chaining a new block when the target block is already full, and keeping
// `id_block_map` (last-ID => block) in sync throughout.
void posting_list_t::upsert(const uint32_t id, const std::vector<uint32_t>& offsets) {
    // first we will locate the block where `id` should reside
    block_t* upsert_block;
    last_id_t before_upsert_last_id;

    if(id_block_map.empty()) {
        //id_block_map.emplace(id, &root_block);
        upsert_block = &root_block;
        before_upsert_last_id = UINT32_MAX;
    } else {
        // first block whose last ID is >= `id`; falls back to the final block
        // when `id` is larger than every last ID
        const auto it = id_block_map.lower_bound(id);
        upsert_block = (it == id_block_map.end()) ? id_block_map.rbegin()->second : it->second;
        before_upsert_last_id = upsert_block->ids.at(upsert_block->size() - 1);
    }

    // happy path: upsert_block is not full
    if(upsert_block->size() < BLOCK_MAX_ELEMENTS) {
        upsert_block->upsert(id, offsets);

        // refresh the map entry if the block's last ID changed
        last_id_t after_upsert_last_id = upsert_block->ids.at(upsert_block->size() - 1);
        if(before_upsert_last_id != after_upsert_last_id) {
            id_block_map.erase(before_upsert_last_id);
            id_block_map.emplace(after_upsert_last_id, upsert_block);
        }
    } else {
        // NOTE(review): if `id` can already exist in a full block, this path
        // would still allocate/split — confirm whether pure updates reach here.
        block_t* new_block = new block_t;

        if(upsert_block->next == nullptr && upsert_block->ids.last() < id) {
            // appending to the end of the last block where the id will reside on a newly block
            new_block->upsert(id, offsets);
        } else {
            // upsert and then split block
            upsert_block->upsert(id, offsets);

            // evenly divide elements between both blocks
            split_block(upsert_block, new_block);

            last_id_t after_upsert_last_id = upsert_block->ids.at(upsert_block->size() - 1);
            id_block_map.erase(before_upsert_last_id);
            id_block_map.emplace(after_upsert_last_id, upsert_block);
        }

        last_id_t after_new_block_id = new_block->ids.at(new_block->size() - 1);
        id_block_map.emplace(after_new_block_id, new_block);

        // link the new block into the chain right after `upsert_block`
        new_block->next = upsert_block->next;
        upsert_block->next = new_block;
    }
}
// Removes `id` from the list. After removal, a block that falls below 50%
// occupancy is refilled from (or merged with) its next block, and
// `id_block_map` is updated wherever a block's last ID changes.
void posting_list_t::erase(const uint32_t id) {
    // the owning block is the first one whose last ID is >= `id`
    const auto it = id_block_map.lower_bound(id);

    if(it == id_block_map.end()) {
        return ;
    }

    block_t* erase_block = it->second;
    last_id_t before_last_id = it->first;
    erase_block->erase(id);

    size_t new_ids_length = erase_block->size();

    if(new_ids_length == 0) {
        // happens when the last element of last block is deleted
        if(erase_block != &root_block) {
            // since we will be deleting the empty node, set the previous node's next pointer to null
            // NOTE(review): assumes a non-root block always has a predecessor
            // entry in the map (so std::prev(it) is valid) — holds as long as
            // the merge logic below prevents earlier blocks from emptying first.
            std::prev(it)->second->next = nullptr;
            delete erase_block;
        }

        id_block_map.erase(before_last_id);

        return;
    }

    if(new_ids_length >= BLOCK_MAX_ELEMENTS/2 || erase_block->next == nullptr) {
        // block is still at least half full (or is the tail): no merging needed
        last_id_t after_last_id = erase_block->ids.at(new_ids_length-1);
        if(before_last_id != after_last_id) {
            id_block_map.erase(before_last_id);
            id_block_map.emplace(after_last_id, erase_block);
        }

        return ;
    }

    // block is less than 50% of max capacity and contains a next node which we can refill from
    auto next_block = erase_block->next;
    last_id_t next_block_last_id = next_block->ids.at(next_block->ids.getLength()-1);

    if(erase_block->size() + next_block->size() <= BLOCK_MAX_ELEMENTS) {
        // we can merge the contents of next block with `erase_block` and delete the next block
        merge_adjacent_blocks(erase_block, next_block, next_block->size());

        // unlink and drop the now-empty next block
        erase_block->next = next_block->next;
        delete next_block;

        id_block_map.erase(next_block_last_id);
    } else {
        // only part of the next block can be moved over
        size_t num_block2_ids = BLOCK_MAX_ELEMENTS - erase_block->size();
        merge_adjacent_blocks(erase_block, next_block, num_block2_ids);
        // NOTE: we don't have to update `id_block_map` for `next_block` as last element doesn't change
    }

    // `erase_block`'s last ID may have changed due to the merge/refill
    last_id_t after_last_id = erase_block->ids.at(erase_block->ids.getLength()-1);
    if(before_last_id != after_last_id) {
        id_block_map.erase(before_last_id);
        id_block_map.emplace(after_last_id, erase_block);
    }
}
// Returns the first block of the chain (always valid, may be empty).
posting_list_t::block_t* posting_list_t::get_root() {
    return &root_block;
}
// Returns the number of blocks in the list (one map entry per block),
// NOT the number of IDs stored.
size_t posting_list_t::size() {
    return id_block_map.size();
}
// Returns the block whose *last* ID is exactly `id`, or nullptr when no block
// ends with that ID (this is an exact lookup, not a range lookup).
posting_list_t::block_t* posting_list_t::block_of(last_id_t id) {
    const auto found = id_block_map.find(id);
    return (found == id_block_map.end()) ? nullptr : found->second;
}

View File

@ -397,3 +397,11 @@ void sorted_array::binary_count_indices(const uint32_t *values, int low_vindex,
binary_count_indices(values, pivot_vindex+1, high_vindex, src, in_index, high_index, num_found);
}
}
// Returns the largest (last) element of the sorted array, or UINT32_MAX as a
// sentinel when the array is empty.
uint32_t sorted_array::last() {
    const uint32_t len = getLength();
    return (len == 0) ? UINT32_MAX : at(len - 1);
}

331
test/posting_list_test.cpp Normal file
View File

@ -0,0 +1,331 @@
#include <gtest/gtest.h>
#include "posting_list.h"
#include <vector>
// Verifies block layout after sequential, alternating and mid-block inserts,
// with a max of 5 IDs per block throughout.
TEST(PostingListTest, Insert) {
    std::vector<uint32_t> offsets = {0, 1, 3};

    posting_list_t pl(5);

    // insert elements sequentially
    for(size_t i = 0; i < 15; i++) {
        pl.upsert(i, offsets);
    }

    // expect three fully packed blocks of 5
    posting_list_t::block_t* root = pl.get_root();
    ASSERT_EQ(5, root->ids.getLength());
    ASSERT_EQ(5, root->next->ids.getLength());
    ASSERT_EQ(5, root->next->next->ids.getLength());
    ASSERT_EQ(root->next->next->next, nullptr);

    ASSERT_EQ(3, pl.size());
    ASSERT_EQ(root, pl.block_of(4));
    ASSERT_EQ(root->next, pl.block_of(9));
    ASSERT_EQ(root->next->next, pl.block_of(14));

    // insert alternate values
    posting_list_t pl2(5);

    for(size_t i = 0; i < 15; i+=2) {
        // [0, 2, 4, 6, 8], [10, 12, 14]
        pl2.upsert(i, offsets);
    }

    root = pl2.get_root();
    ASSERT_EQ(5, root->ids.getLength());
    ASSERT_EQ(3, root->next->ids.getLength());
    ASSERT_EQ(root->next->next, nullptr);

    ASSERT_EQ(2, pl2.size());
    ASSERT_EQ(root, pl2.block_of(8));
    ASSERT_EQ(root->next, pl2.block_of(14));

    // insert in the middle
    // case 1: insert lands in the first (full) block, forcing a split
    posting_list_t pl3(5);

    for(size_t i = 0; i < 5; i++) {
        pl3.upsert(i, offsets);
    }

    pl3.upsert(6, offsets);
    pl3.upsert(8, offsets);
    pl3.upsert(9, offsets);
    pl3.upsert(10, offsets);
    pl3.upsert(12, offsets);

    // [0,1,2,3,4], [6,8,9,10,12]
    pl3.upsert(5, offsets);
    ASSERT_EQ(3, pl3.size());
    ASSERT_EQ(5, pl3.get_root()->ids.getLength());
    ASSERT_EQ(3, pl3.get_root()->next->ids.getLength());
    ASSERT_EQ(8, pl3.get_root()->next->ids.last());
    ASSERT_EQ(3, pl3.get_root()->next->next->ids.getLength());
    ASSERT_EQ(12, pl3.get_root()->next->next->ids.last());

    // every ID carries 3 offsets, so offset_index should step by 3
    for(size_t i = 0; i < pl3.get_root()->next->offset_index.getLength(); i++) {
        ASSERT_EQ(i * 3, pl3.get_root()->next->offset_index.at(i));
    }

    for(size_t i = 0; i < pl3.get_root()->next->offsets.getLength(); i++) {
        ASSERT_EQ(offsets[i % 3], pl3.get_root()->next->offsets.at(i));
    }

    // case 2: insert lands in the second (full) block, forcing a split
    posting_list_t pl4(5);

    for(size_t i = 0; i < 5; i++) {
        pl4.upsert(i, offsets);
    }

    pl4.upsert(6, offsets);
    pl4.upsert(8, offsets);
    pl4.upsert(9, offsets);
    pl4.upsert(10, offsets);
    pl4.upsert(12, offsets);

    // [0,1,2,3,4], [6,8,9,10,12]
    pl4.upsert(11, offsets);
    ASSERT_EQ(3, pl4.size());
    ASSERT_EQ(5, pl4.get_root()->ids.getLength());
    ASSERT_EQ(3, pl4.get_root()->next->ids.getLength());
    ASSERT_EQ(9, pl4.get_root()->next->ids.last());
    ASSERT_EQ(3, pl4.get_root()->next->next->ids.getLength());
    ASSERT_EQ(12, pl4.get_root()->next->next->ids.last());

    for(size_t i = 0; i < pl4.get_root()->next->offset_index.getLength(); i++) {
        ASSERT_EQ(i * 3, pl4.get_root()->next->offset_index.at(i));
    }

    for(size_t i = 0; i < pl4.get_root()->next->offsets.getLength(); i++) {
        ASSERT_EQ(offsets[i % 3], pl4.get_root()->next->offsets.at(i));
    }
}
// Exercises erase() on the root block: empty-list erase, single-element erase,
// deferred merging until below 50% occupancy, and the eventual merge.
TEST(PostingListTest, RemovalsOnFirstBlock) {
    std::vector<uint32_t> offsets = {0, 1, 3};

    posting_list_t pl(5);
    ASSERT_EQ(0, pl.size());

    // try to erase when posting list is empty
    pl.erase(0);
    ASSERT_EQ(0, pl.size());

    // insert a single element and erase it
    pl.upsert(0, offsets);
    ASSERT_EQ(1, pl.size());
    pl.erase(0);
    ASSERT_EQ(0, pl.size());
    ASSERT_EQ(0, pl.get_root()->ids.getLength());
    ASSERT_EQ(0, pl.get_root()->offset_index.getLength());
    ASSERT_EQ(0, pl.get_root()->offsets.getLength());

    // insert until one past max block size
    for(size_t i = 0; i < 6; i++) {
        pl.upsert(i, offsets);
    }

    ASSERT_EQ(2, pl.size());

    // delete non-existing element
    pl.erase(1000);

    // delete elements from first block: blocks should not be merged until it falls below 50% occupancy
    pl.erase(1);
    ASSERT_EQ(2, pl.size());

    // [0, 2, 3, 4], [5]
    for(size_t i = 0; i < pl.get_root()->offset_index.getLength(); i++) {
        ASSERT_EQ(i * 3, pl.get_root()->offset_index.at(i));
    }
    for(size_t i = 0; i < pl.get_root()->offsets.getLength(); i++) {
        ASSERT_EQ(offsets[i % 3], pl.get_root()->offsets.at(i));
    }

    pl.erase(2);
    ASSERT_EQ(2, pl.size());
    pl.erase(3);

    // [0, 4], [5]
    ASSERT_EQ(2, pl.size());
    ASSERT_EQ(2, pl.get_root()->size());
    ASSERT_EQ(1, pl.get_root()->next->size());
    ASSERT_EQ(pl.get_root(), pl.block_of(4));
    ASSERT_EQ(pl.get_root()->next, pl.block_of(5));

    for(size_t i = 0; i < pl.get_root()->offset_index.getLength(); i++) {
        ASSERT_EQ(i * 3, pl.get_root()->offset_index.at(i));
    }
    for(size_t i = 0; i < pl.get_root()->offsets.getLength(); i++) {
        ASSERT_EQ(offsets[i % 3], pl.get_root()->offsets.at(i));
    }

    pl.erase(4); // this will trigger the merge

    // [0, 5]
    // ensure that merge has happened
    ASSERT_EQ(1, pl.size());
    ASSERT_EQ(pl.get_root(), pl.block_of(5));
    ASSERT_EQ(nullptr, pl.get_root()->next);
    ASSERT_EQ(2, pl.get_root()->size());

    for(size_t i = 0; i < pl.get_root()->offset_index.getLength(); i++) {
        ASSERT_EQ(i * 3, pl.get_root()->offset_index.at(i));
    }
    for(size_t i = 0; i < pl.get_root()->offsets.getLength(); i++) {
        ASSERT_EQ(offsets[i % 3], pl.get_root()->offsets.at(i));
    }
}
// Exercises erase() on non-root blocks: dropping an entire tail block,
// full merges, and partial refills from the next block.
TEST(PostingListTest, RemovalsOnLaterBlocks) {
    std::vector<uint32_t> offsets = {0, 1, 3};

    posting_list_t pl(5);

    // insert until one past max block size
    for(size_t i = 0; i < 6; i++) {
        pl.upsert(i, offsets);
    }

    // erase last element of last, non-first block
    pl.erase(5);
    ASSERT_EQ(1, pl.size());
    ASSERT_EQ(5, pl.get_root()->size());
    ASSERT_EQ(4, pl.get_root()->ids.last());
    ASSERT_EQ(nullptr, pl.get_root()->next);

    for(size_t i = 0; i < pl.get_root()->offset_index.getLength(); i++) {
        ASSERT_EQ(i * 3, pl.get_root()->offset_index.at(i));
    }
    for(size_t i = 0; i < pl.get_root()->offsets.getLength(); i++) {
        ASSERT_EQ(offsets[i % 3], pl.get_root()->offsets.at(i));
    }

    // erase last element of the only block when block is at least half full
    pl.erase(4);
    ASSERT_EQ(1, pl.size());
    ASSERT_EQ(4, pl.get_root()->size());
    ASSERT_EQ(3, pl.get_root()->ids.last());
    ASSERT_EQ(pl.get_root(), pl.block_of(3));

    for(size_t i = 4; i < 15; i++) {
        pl.upsert(i, offsets);
    }

    // [0..4], [5..9], [10..14]
    pl.erase(5);
    pl.erase(6);
    pl.erase(7);

    // offset structures of untouched blocks must remain consistent
    for(size_t i = 0; i < pl.get_root()->next->offset_index.getLength(); i++) {
        ASSERT_EQ(i * 3, pl.get_root()->next->offset_index.at(i));
    }
    for(size_t i = 0; i < pl.get_root()->next->offsets.getLength(); i++) {
        ASSERT_EQ(offsets[i % 3], pl.get_root()->next->offsets.at(i));
    }
    for(size_t i = 0; i < pl.get_root()->next->next->offset_index.getLength(); i++) {
        ASSERT_EQ(i * 3, pl.get_root()->next->next->offset_index.at(i));
    }
    for(size_t i = 0; i < pl.get_root()->next->next->offsets.getLength(); i++) {
        ASSERT_EQ(offsets[i % 3], pl.get_root()->next->next->offsets.at(i));
    }

    // only part of the next node contents can be moved over when we delete 8 since (1 + 5) > 5
    pl.erase(8);

    // [0..4], [9], [10..14] => [0..4], [9,10,11,12,13], [14]
    ASSERT_EQ(3, pl.size());
    ASSERT_EQ(5, pl.get_root()->next->size());
    ASSERT_EQ(1, pl.get_root()->next->next->size());
    ASSERT_EQ(13, pl.get_root()->next->ids.last());
    ASSERT_EQ(14, pl.get_root()->next->next->ids.last());

    for(size_t i = 0; i < pl.get_root()->next->offset_index.getLength(); i++) {
        ASSERT_EQ(i * 3, pl.get_root()->next->offset_index.at(i));
    }
    for(size_t i = 0; i < pl.get_root()->next->offsets.getLength(); i++) {
        ASSERT_EQ(offsets[i % 3], pl.get_root()->next->offsets.at(i));
    }
    for(size_t i = 0; i < pl.get_root()->next->next->offset_index.getLength(); i++) {
        ASSERT_EQ(i * 3, pl.get_root()->next->next->offset_index.at(i));
    }
    for(size_t i = 0; i < pl.get_root()->next->next->offsets.getLength(); i++) {
        ASSERT_EQ(offsets[i % 3], pl.get_root()->next->next->offsets.at(i));
    }
}
// Verifies that inserting IDs in descending order, then below and far above
// the existing range, still produces consistent blocks (split at 3/4 here
// because the full block is split after the out-of-order insert).
TEST(PostingListTest, OutOfOrderUpserts) {
    std::vector<uint32_t> offsets = {0, 1, 3};
    posting_list_t pl(5);

    for(int i = 5; i > 0; i--) {
        pl.upsert(i, offsets);
    }

    pl.upsert(0, offsets);
    pl.upsert(200000, offsets);

    ASSERT_EQ(2, pl.size());

    ASSERT_EQ(3, pl.get_root()->size());
    ASSERT_EQ(4, pl.get_root()->next->size());

    for(size_t i = 0; i < pl.get_root()->offset_index.getLength(); i++) {
        ASSERT_EQ(i * 3, pl.get_root()->offset_index.at(i));
    }
    for(size_t i = 0; i < pl.get_root()->offsets.getLength(); i++) {
        ASSERT_EQ(offsets[i % 3], pl.get_root()->offsets.at(i));
    }

    for(size_t i = 0; i < pl.get_root()->next->offset_index.getLength(); i++) {
        ASSERT_EQ(i * 3, pl.get_root()->next->offset_index.at(i));
    }
    for(size_t i = 0; i < pl.get_root()->next->offsets.getLength(); i++) {
        ASSERT_EQ(offsets[i % 3], pl.get_root()->next->offsets.at(i));
    }
}
// Stress test: 100K random upserts followed by 10K random erases, then a
// coarse sanity check on the resulting block count.
// Fix: the erase loop declared an `offsets` local that was never used.
TEST(PostingListTest, RandomInsertAndDeletes) {
    // seed so each run exercises a different random sequence (by design)
    time_t t;
    srand((unsigned) time(&t));

    posting_list_t pl(100);
    std::vector<uint32_t> offsets1 = {0, 1, 3};
    std::vector<uint32_t> offsets2 = {10, 12};

    // insert random IDs in [0, 100000), alternating between two offset shapes
    for(size_t i = 0; i < 100000; i++) {
        const std::vector<uint32_t>& offsets = (i % 2 == 0) ? offsets1 : offsets2;
        pl.upsert(rand() % 100000, offsets);
    }

    // erase a random subset; erase() takes no offsets, so no local is needed
    for(size_t i = 0; i < 10000; i++) {
        pl.erase(rand() % 100000);
    }

    // with ~63K distinct IDs and 100 IDs per block, block count should land here
    bool size_within_range = (pl.size() < 1500) && (pl.size() > 1000);
    ASSERT_TRUE(size_within_range);
}