From 8b24fe82b8e5d931f1b4c9d201a7c4dae127545a Mon Sep 17 00:00:00 2001
From: Kishore Nallan
Date: Sat, 22 May 2021 15:05:46 +0530
Subject: [PATCH] Basics of a block based posting list container.

---
 include/index.h            |   4 +-
 include/posting_list.h     |  78 +++++++
 include/sorted_array.h     |   2 +
 src/posting_list.cpp       | 414 +++++++++++++++++++++++++++++++++++++
 src/sorted_array.cpp       |   8 +
 test/posting_list_test.cpp | 331 +++++++++++++++++++++++++++++
 6 files changed, 835 insertions(+), 2 deletions(-)
 create mode 100644 include/posting_list.h
 create mode 100644 src/posting_list.cpp
 create mode 100644 test/posting_list_test.cpp

diff --git a/include/index.h b/include/index.h
index e338f181..cc3e0b72 100644
--- a/include/index.h
+++ b/include/index.h
@@ -242,8 +242,8 @@ private:
     void index_string_array_field(const std::vector<std::string> & strings, const int64_t score, art_tree *t,
                                   uint32_t seq_id, bool is_facet, const field & a_field);
 
-    void remove_and_shift_offset_index(sorted_array& offset_index, const uint32_t* indices_sorted,
-                                       const uint32_t indices_length);
+    static void remove_and_shift_offset_index(sorted_array& offset_index, const uint32_t* indices_sorted,
+                                              const uint32_t indices_length);
 
     void collate_included_ids(const std::vector<std::string>& q_included_tokens,
                               const std::string & field, const uint8_t field_id,
diff --git a/include/posting_list.h b/include/posting_list.h
new file mode 100644
index 00000000..a05acdec
--- /dev/null
+++ b/include/posting_list.h
@@ -0,0 +1,78 @@
+#pragma once
+#include <map>
+#include "sorted_array.h"
+#include "array.h"
+
+typedef uint32_t last_id_t;
+
+#define FOR_ELE_SIZE sizeof(uint32_t)
+#define METADATA_OVERHEAD 5
+
+/*
+    Compressed chain of blocks that store the document IDs and offsets of a given token.
+    Offsets of singular and multi-valued fields are encoded differently.
+*/
+class posting_list_t {
+public:
+
+    // A block stores a list of Document IDs, Token Offsets and a Mapping of ID => Offset indices efficiently
+    // Layout of *data: [ids...mappings..offsets]
+    // IDs and Mappings are sorted integers, while offsets are not sorted
+    struct block_t {
+        sorted_array ids;
+        sorted_array offset_index;
+        array offsets;
+
+        // link to next block
+        block_t* next = nullptr;
+
+        void insert_and_shift_offset_index(uint32_t index, uint32_t num_offsets);
+
+        void remove_and_shift_offset_index(const uint32_t* indices_sorted, uint32_t num_indices);
+
+        void upsert(uint32_t id, const std::vector<uint32_t>& offsets);
+
+        void erase(uint32_t id);
+
+        uint32_t size() {
+            return ids.getLength();
+        }
+    };
+
+private:
+
+    // when a block reaches pre-allocated storage, it is expanded by this factor
+    static constexpr float BLOCK_GROWTH_FACTOR = 1.3;
+
+    // maximum number of IDs (and associated offsets) to store in each block before another block is created
+    const uint16_t BLOCK_MAX_ELEMENTS;
+
+    block_t root_block;
+
+    // keeps track of the *last* ID in each block and is used for partial random access
+    // e.g. 0..[9], 10..[19], 20..[29]
+    // MUST be ordered
+    std::map<last_id_t, block_t*> id_block_map;
+
+    static void split_block(block_t* src_block, block_t* dst_block);
+
+    static void merge_adjacent_blocks(block_t* block1, block_t* block2, size_t num_block2_ids);
+
+public:
+
+    posting_list_t() = delete;
+
+    explicit posting_list_t(uint16_t max_block_elements);
+
+    ~posting_list_t();
+
+    void upsert(uint32_t id, const std::vector<uint32_t>& offsets);
+
+    void erase(uint32_t id);
+
+    block_t* get_root();
+
+    size_t size();
+
+    block_t* block_of(last_id_t id);
+};
\ No newline at end of file
diff --git a/include/sorted_array.h b/include/sorted_array.h
index 9e9ada90..59a19f9b 100644
--- a/include/sorted_array.h
+++ b/include/sorted_array.h
@@ -54,6 +54,8 @@ public:
 
     uint32_t at(uint32_t index);
 
+    uint32_t last();
+
     bool contains(uint32_t value);
 
     uint32_t indexOf(uint32_t value);
diff --git a/src/posting_list.cpp b/src/posting_list.cpp
new file mode 100644
index 00000000..740386ac
--- /dev/null
+++ b/src/posting_list.cpp
@@ -0,0 +1,414 @@
+#include "posting_list.h"
+#include "for.h"
+#include "array_utils.h"
+
+/* block_t operations */
+
+void posting_list_t::block_t::insert_and_shift_offset_index(const uint32_t index, const uint32_t num_offsets) {
+    uint32_t existing_offset_index = offset_index.at(index);
+    uint32_t length = offset_index.getLength();
+    uint32_t new_length = length + 1;
+    uint32_t* curr_array = offset_index.uncompress(new_length);
+
+    memmove(&curr_array[index+1], &curr_array[index], sizeof(uint32_t)*(length - index));
+    curr_array[index] = existing_offset_index;
+
+    uint32_t curr_index = index + 1;
+    while(curr_index < new_length) {
+        curr_array[curr_index] += num_offsets;
+        curr_index++;
+    }
+
+    offset_index.load(curr_array, new_length);
+
+    delete [] curr_array;
+}
+
+void posting_list_t::block_t::upsert(const uint32_t id, const std::vector<uint32_t>& positions) {
+    size_t inserted_index = ids.append(id);
+
+    if(inserted_index == ids.getLength()-1) {
+        // treat as appends
+        uint32_t curr_index = offsets.getLength();
+        offset_index.append(curr_index);
+        for(uint32_t position : positions) {
+            offsets.append(position);
+        }
+    } else {
+        uint32_t existing_offset_index = offset_index.at(inserted_index);
+        insert_and_shift_offset_index(inserted_index, positions.size());
+        offsets.insert(existing_offset_index, &positions[0], positions.size());
+    }
+}
+
+void posting_list_t::block_t::erase(const uint32_t id) {
+    uint32_t doc_index = ids.indexOf(id);
+
+    if (doc_index == ids.getLength()) {
+        return;
+    }
+
+    uint32_t start_offset = offset_index.at(doc_index);
+    uint32_t end_offset = (doc_index == ids.getLength() - 1) ?
+                          offsets.getLength() :
+                          offset_index.at(doc_index + 1);
+
+    uint32_t doc_indices[1] = {doc_index};
+    remove_and_shift_offset_index(doc_indices, 1);
+
+    offsets.remove_index(start_offset, end_offset);
+    ids.remove_value(id);
+}
+
+void posting_list_t::block_t::remove_and_shift_offset_index(const uint32_t* indices_sorted,
+                                                            const uint32_t num_indices) {
+    uint32_t *curr_array = offset_index.uncompress();
+    uint32_t *new_array = new uint32_t[offset_index.getLength()];
+
+    new_array[0] = 0;
+    uint32_t new_index = 0;
+    uint32_t curr_index = 0;
+    uint32_t indices_counter = 0;
+    uint32_t shift_value = 0;
+
+    while(curr_index < offset_index.getLength()) {
+        if(indices_counter < num_indices && curr_index >= indices_sorted[indices_counter]) {
+            // skip copying
+            if(curr_index == indices_sorted[indices_counter]) {
+                curr_index++;
+                const uint32_t diff = curr_index == offset_index.getLength() ?
+                                      0 : (offset_index.at(curr_index) - offset_index.at(curr_index-1));
+
+                shift_value += diff;
+            }
+            indices_counter++;
+        } else {
+            new_array[new_index++] = curr_array[curr_index++] - shift_value;
+        }
+    }
+
+    offset_index.load(new_array, new_index);
+
+    delete[] curr_array;
+    delete[] new_array;
+}
+
+/* posting_list_t operations */
+
+posting_list_t::posting_list_t(uint16_t max_block_elements): BLOCK_MAX_ELEMENTS(max_block_elements) {
+
+}
+
+posting_list_t::~posting_list_t() {
+    block_t* block = root_block.next;
+    while(block != nullptr) {
+        block_t* next_block = block->next;
+        delete block;
+        block = next_block;
+    }
+}
+
+void posting_list_t::merge_adjacent_blocks(posting_list_t::block_t* block1, posting_list_t::block_t* block2,
+                                           size_t num_block2_ids) {
+    // merge ids
+    uint32_t* raw_ids1 = block1->ids.uncompress();
+    uint32_t* raw_ids2 = block2->ids.uncompress();
+
+    size_t block1_orig_size = block1->size();
+    size_t block2_orig_size = block2->size();
+
+    uint32_t* raw_ids = new uint32_t[block1->size() + num_block2_ids];
+    std::memmove(raw_ids, raw_ids1, sizeof(uint32_t) * block1->size());
+    std::memmove(raw_ids + block1->size(), raw_ids2, sizeof(uint32_t) * num_block2_ids);
+
+    block1->ids.load(raw_ids, block1->size() + num_block2_ids);
+    block2->ids.load(raw_ids2 + num_block2_ids, block2->size() - num_block2_ids);
+
+    delete [] raw_ids1;
+    delete [] raw_ids2;
+    delete [] raw_ids;
+
+    // merge offset indices
+    uint32_t* raw_offset_index1 = block1->offset_index.uncompress();
+    uint32_t* raw_offset_index2 = block2->offset_index.uncompress();
+    uint32_t* raw_offset_index = new uint32_t[block1_orig_size + block2_orig_size];
+
+    std::memmove(raw_offset_index, raw_offset_index1, sizeof(uint32_t) * block1->offset_index.getLength());
+    size_t start_index = block1->offset_index.getLength();
+    size_t base_offset_len = block1->offsets.getLength();
+
+    for(size_t i = 0; i < num_block2_ids; i++) {
+        raw_offset_index[start_index + i] = raw_offset_index2[i] + base_offset_len;
+    }
+
+    block1->offset_index.load(raw_offset_index, block1->offset_index.getLength() + num_block2_ids);
+
+    for(size_t i = 0; i < (block2_orig_size - num_block2_ids); i++) {
+        raw_offset_index2[num_block2_ids + i] -= raw_offset_index2[num_block2_ids];
+    }
+
+    block2->offset_index.load(raw_offset_index2 + num_block2_ids, block2_orig_size - num_block2_ids);
+
+    // merge offsets
+    uint32_t* raw_offsets1 = block1->offsets.uncompress();
+    uint32_t* raw_offsets2 = block2->offsets.uncompress();
+    size_t num_block2_offset_elements = (num_block2_ids == block2_orig_size) ? block2->offsets.getLength() :
+                                        raw_offset_index2[num_block2_ids];
+
+    uint32_t* raw_offsets = new uint32_t[block1->offsets.getLength() + num_block2_offset_elements];
+
+    uint32_t min = raw_offsets1[0], max = raw_offsets1[0];
+
+    // we have to manually copy over so we can find the new min and max
+    for(size_t i = 0; i < block1->offsets.getLength(); i++) {
+        raw_offsets[i] = raw_offsets1[i];
+        if(raw_offsets[i] < min) {
+            min = raw_offsets[i];
+        }
+
+        if(raw_offsets[i] > max) {
+            max = raw_offsets[i];
+        }
+    }
+
+    size_t block2_base_index = block1->offsets.getLength();
+
+    for(size_t i = 0; i < num_block2_offset_elements; i++) {
+        size_t j = block2_base_index + i;
+        raw_offsets[j] = raw_offsets2[i];
+
+        if(raw_offsets[j] < min) {
+            min = raw_offsets[j];
+        }
+
+        if(raw_offsets[j] > max) {
+            max = raw_offsets[j];
+        }
+    }
+
+    block1->offsets.load(raw_offsets, block1->offsets.getLength() + num_block2_offset_elements, min, max);
+
+    // reset block2 offsets with remaining elements
+    if(block2->offsets.getLength() != num_block2_offset_elements) {
+        const size_t block2_new_offsets_length = (block2->offsets.getLength() - num_block2_offset_elements);
+        uint32_t* block2_new_raw_offsets = new uint32_t[block2_new_offsets_length];
+        min = max = raw_offsets2[num_block2_offset_elements];
+        for(size_t i = 0; i < block2_new_offsets_length; i++) {
+            block2_new_raw_offsets[i] = raw_offsets2[num_block2_offset_elements + i];
+            if(block2_new_raw_offsets[i] < min) {
+                min = block2_new_raw_offsets[i];
+            }
+
+            if(block2_new_raw_offsets[i] > max) {
+                max = block2_new_raw_offsets[i];
+            }
+        }
+        block2->offsets.load(block2_new_raw_offsets, block2_new_offsets_length, min, max);
+        delete [] block2_new_raw_offsets;
+    } else {
+        block2->offsets.load(nullptr, 0, 0, 0);
+    }
+
+    delete [] raw_offset_index1;
+    delete [] raw_offset_index2;
+    delete [] raw_offset_index;
+
+    delete [] raw_offsets1;
+    delete [] raw_offsets2;
+    delete [] raw_offsets;
+}
+
+void posting_list_t::split_block(posting_list_t::block_t* src_block, posting_list_t::block_t* dst_block) {
+    if(src_block->size() <= 1) {
+        return;
+    }
+
+    uint32_t* raw_ids = src_block->ids.uncompress();
+    size_t ids_first_half_length = (src_block->size() / 2);
+    size_t ids_second_half_length = (src_block->size() - ids_first_half_length);
+    src_block->ids.load(raw_ids, ids_first_half_length);
+    dst_block->ids.load(raw_ids + ids_first_half_length, ids_second_half_length);
+
+    uint32_t* raw_offset_indices = src_block->offset_index.uncompress();
+    size_t offset_indices_first_half_length = (src_block->offset_index.getLength() / 2);
+    size_t offset_indices_second_half_length = (src_block->offset_index.getLength() - offset_indices_first_half_length);
+    src_block->offset_index.load(raw_offset_indices, offset_indices_first_half_length);
+
+    // update second half to use zero based index
+    uint32_t base_index_diff = raw_offset_indices[offset_indices_first_half_length];
+    for(size_t i = 0; i < offset_indices_second_half_length; i++) {
+        raw_offset_indices[offset_indices_first_half_length + i] -= base_index_diff;
+    }
+
+    dst_block->offset_index.load(raw_offset_indices + offset_indices_first_half_length, offset_indices_second_half_length);
+
+    uint32_t* raw_offsets = src_block->offsets.uncompress();
+    size_t src_offsets_length = src_block->offsets.getLength();
+
+    // load first half of offsets
+
+    size_t offset_first_half_length = base_index_diff;
+
+    // we need to find new min and max
+    uint32_t min = raw_offsets[0], max = raw_offsets[0];
+
+    for(size_t i = 0; i < offset_first_half_length; i++) {
+        if(raw_offsets[i] < min) {
+            min = raw_offsets[i];
+        }
+
+        if(raw_offsets[i] > max) {
+            max = raw_offsets[i];
+        }
+    }
+
+    src_block->offsets.load(raw_offsets, offset_first_half_length, min, max);
+
+    // load second half
+
+    min = max = raw_offsets[offset_first_half_length];
+    for(size_t i = offset_first_half_length; i < src_offsets_length; i++) {
+        if(raw_offsets[i] < min) {
+            min = raw_offsets[i];
+        }
+
+        if(raw_offsets[i] > max) {
+            max = raw_offsets[i];
+        }
+    }
+
+    size_t offsets_second_half_length = src_offsets_length - offset_first_half_length;
+    dst_block->offsets.load(raw_offsets + offset_first_half_length, offsets_second_half_length, min, max);
+
+    delete [] raw_ids;
+    delete [] raw_offset_indices;
+    delete [] raw_offsets;
+}
+
+void posting_list_t::upsert(const uint32_t id, const std::vector<uint32_t>& offsets) {
+    // first we will locate the block where `id` should reside
+    block_t* upsert_block;
+    last_id_t before_upsert_last_id;
+
+    if(id_block_map.empty()) {
+        //id_block_map.emplace(id, &root_block);
+        upsert_block = &root_block;
+        before_upsert_last_id = UINT32_MAX;
+    } else {
+        const auto it = id_block_map.lower_bound(id);
+        upsert_block = (it == id_block_map.end()) ? id_block_map.rbegin()->second : it->second;
+        before_upsert_last_id = upsert_block->ids.at(upsert_block->size() - 1);
+    }
+
+    // happy path: upsert_block is not full
+    if(upsert_block->size() < BLOCK_MAX_ELEMENTS) {
+        upsert_block->upsert(id, offsets);
+        last_id_t after_upsert_last_id = upsert_block->ids.at(upsert_block->size() - 1);
+        if(before_upsert_last_id != after_upsert_last_id) {
+            id_block_map.erase(before_upsert_last_id);
+            id_block_map.emplace(after_upsert_last_id, upsert_block);
+        }
+    } else {
+        block_t* new_block = new block_t;
+
+        if(upsert_block->next == nullptr && upsert_block->ids.last() < id) {
+            // appending to the end of the last block where the id will reside on a new block
+            new_block->upsert(id, offsets);
+        } else {
+            // upsert and then split block
+            upsert_block->upsert(id, offsets);
+
+            // evenly divide elements between both blocks
+            split_block(upsert_block, new_block);
+
+            last_id_t after_upsert_last_id = upsert_block->ids.at(upsert_block->size() - 1);
+            id_block_map.erase(before_upsert_last_id);
+            id_block_map.emplace(after_upsert_last_id, upsert_block);
+        }
+
+        last_id_t after_new_block_id = new_block->ids.at(new_block->size() - 1);
+        id_block_map.emplace(after_new_block_id, new_block);
+
+        new_block->next = upsert_block->next;
+        upsert_block->next = new_block;
+    }
+}
+
+void posting_list_t::erase(const uint32_t id) {
+    const auto it = id_block_map.lower_bound(id);
+
+    if(it == id_block_map.end()) {
+        return ;
+    }
+
+    block_t* erase_block = it->second;
+    last_id_t before_last_id = it->first;
+    erase_block->erase(id);
+
+    size_t new_ids_length = erase_block->size();
+
+    if(new_ids_length == 0) {
+        // happens when the only remaining element of a block is deleted (the block need not be the last one)
+
+        if(erase_block != &root_block) {
+            // unlink the empty node: the previous node must point at the node that followed it (may be null)
+            std::prev(it)->second->next = erase_block->next;
+            delete erase_block;
+        }
+
+        id_block_map.erase(before_last_id);
+
+        return;
+    }
+
+    if(new_ids_length >= BLOCK_MAX_ELEMENTS/2 || erase_block->next == nullptr) {
+        last_id_t after_last_id = erase_block->ids.at(new_ids_length-1);
+        if(before_last_id != after_last_id) {
+            id_block_map.erase(before_last_id);
+            id_block_map.emplace(after_last_id, erase_block);
+        }
+
+        return ;
+    }
+
+    // block is less than 50% of max capacity and contains a next node which we can refill from
+
+    auto next_block = erase_block->next;
+    last_id_t next_block_last_id = next_block->ids.at(next_block->ids.getLength()-1);
+
+    if(erase_block->size() + next_block->size() <= BLOCK_MAX_ELEMENTS) {
+        // we can merge the contents of next block with `erase_block` and delete the next block
+        merge_adjacent_blocks(erase_block, next_block, next_block->size());
+        erase_block->next = next_block->next;
+        delete next_block;
+
+        id_block_map.erase(next_block_last_id);
+    } else {
+        // only part of the next block can be moved over
+        size_t num_block2_ids = BLOCK_MAX_ELEMENTS - erase_block->size();
+        merge_adjacent_blocks(erase_block, next_block, num_block2_ids);
+        // NOTE: we don't have to update `id_block_map` for `next_block` as last element doesn't change
+    }
+
+    last_id_t after_last_id = erase_block->ids.at(erase_block->ids.getLength()-1);
+    if(before_last_id != after_last_id) {
+        id_block_map.erase(before_last_id);
+        id_block_map.emplace(after_last_id, erase_block);
+    }
+}
+
+posting_list_t::block_t* posting_list_t::get_root() {
+    return &root_block;
+}
+
+size_t posting_list_t::size() {
+    return id_block_map.size();
+}
+
+posting_list_t::block_t* posting_list_t::block_of(last_id_t id) {
+    auto it = id_block_map.find(id);
+    if(it != id_block_map.end()) {
+        return it->second;
+    }
+    return nullptr;
+}
diff --git a/src/sorted_array.cpp b/src/sorted_array.cpp
index 3666bfb0..80b757a6 100644
--- a/src/sorted_array.cpp
+++ b/src/sorted_array.cpp
@@ -397,3 +397,11 @@ void sorted_array::binary_count_indices(const uint32_t *values, int low_vindex,
         binary_count_indices(values, pivot_vindex+1, high_vindex, src, in_index, high_index, num_found);
     }
 }
+
+uint32_t sorted_array::last() {
+    if(getLength() == 0) {
+        return UINT32_MAX;
+    }
+
+    return at(getLength()-1);
+}
diff --git a/test/posting_list_test.cpp b/test/posting_list_test.cpp
new file mode 100644
index 00000000..0eea3a8f
--- /dev/null
+++ b/test/posting_list_test.cpp
@@ -0,0 +1,331 @@
+#include <gtest/gtest.h>
+#include "posting_list.h"
+#include <vector>
+
+TEST(PostingListTest, Insert) {
+    std::vector<uint32_t> offsets = {0, 1, 3};
+
+    posting_list_t pl(5);
+
+    // insert elements sequentially
+
+    for(size_t i = 0; i < 15; i++) {
+        pl.upsert(i, offsets);
+    }
+
+    posting_list_t::block_t* root = pl.get_root();
+    ASSERT_EQ(5, root->ids.getLength());
+    ASSERT_EQ(5, root->next->ids.getLength());
+    ASSERT_EQ(5, root->next->next->ids.getLength());
+
+    ASSERT_EQ(root->next->next->next, nullptr);
+
+    ASSERT_EQ(3, pl.size());
+    ASSERT_EQ(root, pl.block_of(4));
+    ASSERT_EQ(root->next, pl.block_of(9));
+    ASSERT_EQ(root->next->next, pl.block_of(14));
+
+    // insert alternate values
+
+    posting_list_t pl2(5);
+
+    for(size_t i = 0; i < 15; i+=2) {
+        // [0, 2, 4, 6, 8], [10, 12, 14]
+        pl2.upsert(i, offsets);
+    }
+
+    root = pl2.get_root();
+    ASSERT_EQ(5, root->ids.getLength());
+    ASSERT_EQ(3, root->next->ids.getLength());
+
+    ASSERT_EQ(root->next->next, nullptr);
+    ASSERT_EQ(2, pl2.size());
+
+    ASSERT_EQ(root, pl2.block_of(8));
+    ASSERT_EQ(root->next, pl2.block_of(14));
+
+    // insert in the middle
+    // case 1
+
+    posting_list_t pl3(5);
+
+    for(size_t i = 0; i < 5; i++) {
+        pl3.upsert(i, offsets);
+    }
+
+    pl3.upsert(6, offsets);
+    pl3.upsert(8, offsets);
+    pl3.upsert(9, offsets);
+    pl3.upsert(10, offsets);
+    pl3.upsert(12, offsets);
+
+    // [0,1,2,3,4], [6,8,9,10,12]
+    pl3.upsert(5, offsets);
+    ASSERT_EQ(3, pl3.size());
+    ASSERT_EQ(5, pl3.get_root()->ids.getLength());
+    ASSERT_EQ(3, pl3.get_root()->next->ids.getLength());
+    ASSERT_EQ(8, pl3.get_root()->next->ids.last());
+    ASSERT_EQ(3, pl3.get_root()->next->next->ids.getLength());
+    ASSERT_EQ(12, pl3.get_root()->next->next->ids.last());
+
+    for(size_t i = 0; i < pl3.get_root()->next->offset_index.getLength(); i++) {
+        ASSERT_EQ(i * 3, pl3.get_root()->next->offset_index.at(i));
+    }
+
+    for(size_t i = 0; i < pl3.get_root()->next->offsets.getLength(); i++) {
+        ASSERT_EQ(offsets[i % 3], pl3.get_root()->next->offsets.at(i));
+    }
+
+    // case 2
+    posting_list_t pl4(5);
+
+    for(size_t i = 0; i < 5; i++) {
+        pl4.upsert(i, offsets);
+    }
+
+    pl4.upsert(6, offsets);
+    pl4.upsert(8, offsets);
+    pl4.upsert(9, offsets);
+    pl4.upsert(10, offsets);
+    pl4.upsert(12, offsets);
+
+    // [0,1,2,3,4], [6,8,9,10,12]
+    pl4.upsert(11, offsets);
+    ASSERT_EQ(3, pl4.size());
+
+    ASSERT_EQ(5, pl4.get_root()->ids.getLength());
+    ASSERT_EQ(3, pl4.get_root()->next->ids.getLength());
+    ASSERT_EQ(9, pl4.get_root()->next->ids.last());
+    ASSERT_EQ(3, pl4.get_root()->next->next->ids.getLength());
+    ASSERT_EQ(12, pl4.get_root()->next->next->ids.last());
+
+    for(size_t i = 0; i < pl4.get_root()->next->offset_index.getLength(); i++) {
+        ASSERT_EQ(i * 3, pl4.get_root()->next->offset_index.at(i));
+    }
+
+    for(size_t i = 0; i < pl4.get_root()->next->offsets.getLength(); i++) {
+        ASSERT_EQ(offsets[i % 3], pl4.get_root()->next->offsets.at(i));
+    }
+}
+
+TEST(PostingListTest, RemovalsOnFirstBlock) {
+    std::vector<uint32_t> offsets = {0, 1, 3};
+    posting_list_t pl(5);
+
+    ASSERT_EQ(0, pl.size());
+
+    // try to erase when posting list is empty
+    pl.erase(0);
+
+    ASSERT_EQ(0, pl.size());
+
+    // insert a single element and erase it
+    pl.upsert(0, offsets);
+    ASSERT_EQ(1, pl.size());
+    pl.erase(0);
+    ASSERT_EQ(0, pl.size());
+
+    ASSERT_EQ(0, pl.get_root()->ids.getLength());
+    ASSERT_EQ(0, pl.get_root()->offset_index.getLength());
+    ASSERT_EQ(0, pl.get_root()->offsets.getLength());
+
+    // insert until one past max block size
+    for(size_t i = 0; i < 6; i++) {
+        pl.upsert(i, offsets);
+    }
+
+    ASSERT_EQ(2, pl.size());
+
+    // delete non-existing element
+    pl.erase(1000);
+
+    // delete elements from first block: blocks should not be merged until it falls below 50% occupancy
+    pl.erase(1);
+    ASSERT_EQ(2, pl.size());
+
+    // [0, 2, 3, 4], [5]
+
+    for(size_t i = 0; i < pl.get_root()->offset_index.getLength(); i++) {
+        ASSERT_EQ(i * 3, pl.get_root()->offset_index.at(i));
+    }
+
+    for(size_t i = 0; i < pl.get_root()->offsets.getLength(); i++) {
+        ASSERT_EQ(offsets[i % 3], pl.get_root()->offsets.at(i));
+    }
+
+    pl.erase(2);
+    ASSERT_EQ(2, pl.size());
+    pl.erase(3);
+
+    // [0, 4], [5]
+    ASSERT_EQ(2, pl.size());
+    ASSERT_EQ(2, pl.get_root()->size());
+    ASSERT_EQ(1, pl.get_root()->next->size());
+    ASSERT_EQ(pl.get_root(), pl.block_of(4));
+    ASSERT_EQ(pl.get_root()->next, pl.block_of(5));
+
+    for(size_t i = 0; i < pl.get_root()->offset_index.getLength(); i++) {
+        ASSERT_EQ(i * 3, pl.get_root()->offset_index.at(i));
+    }
+
+    for(size_t i = 0; i < pl.get_root()->offsets.getLength(); i++) {
+        ASSERT_EQ(offsets[i % 3], pl.get_root()->offsets.at(i));
+    }
+
+    pl.erase(4); // this will trigger the merge
+
+    // [0, 5]
+    // ensure that merge has happened
+    ASSERT_EQ(1, pl.size());
+    ASSERT_EQ(pl.get_root(), pl.block_of(5));
+    ASSERT_EQ(nullptr, pl.get_root()->next);
+    ASSERT_EQ(2, pl.get_root()->size());
+
+    for(size_t i = 0; i < pl.get_root()->offset_index.getLength(); i++) {
+        ASSERT_EQ(i * 3, pl.get_root()->offset_index.at(i));
+    }
+
+    for(size_t i = 0; i < pl.get_root()->offsets.getLength(); i++) {
+        ASSERT_EQ(offsets[i % 3], pl.get_root()->offsets.at(i));
+    }
+}
+
+TEST(PostingListTest, RemovalsOnLaterBlocks) {
+    std::vector<uint32_t> offsets = {0, 1, 3};
+    posting_list_t pl(5);
+
+    // insert until one past max block size
+    for(size_t i = 0; i < 6; i++) {
+        pl.upsert(i, offsets);
+    }
+
+    // erase last element of last, non-first block
+
+    pl.erase(5);
+    ASSERT_EQ(1, pl.size());
+    ASSERT_EQ(5, pl.get_root()->size());
+    ASSERT_EQ(4, pl.get_root()->ids.last());
+    ASSERT_EQ(nullptr, pl.get_root()->next);
+
+    for(size_t i = 0; i < pl.get_root()->offset_index.getLength(); i++) {
+        ASSERT_EQ(i * 3, pl.get_root()->offset_index.at(i));
+    }
+
+    for(size_t i = 0; i < pl.get_root()->offsets.getLength(); i++) {
+        ASSERT_EQ(offsets[i % 3], pl.get_root()->offsets.at(i));
+    }
+
+    // erase last element of the only block when block is at least half full
+    pl.erase(4);
+    ASSERT_EQ(1, pl.size());
+    ASSERT_EQ(4, pl.get_root()->size());
+    ASSERT_EQ(3, pl.get_root()->ids.last());
+    ASSERT_EQ(pl.get_root(), pl.block_of(3));
+
+    for(size_t i = 4; i < 15; i++) {
+        pl.upsert(i, offsets);
+    }
+
+    // [0..4], [5..9], [10..14]
+    pl.erase(5);
+    pl.erase(6);
+    pl.erase(7);
+
+    for(size_t i = 0; i < pl.get_root()->next->offset_index.getLength(); i++) {
+        ASSERT_EQ(i * 3, pl.get_root()->next->offset_index.at(i));
+    }
+
+    for(size_t i = 0; i < pl.get_root()->next->offsets.getLength(); i++) {
+        ASSERT_EQ(offsets[i % 3], pl.get_root()->next->offsets.at(i));
+    }
+
+    for(size_t i = 0; i < pl.get_root()->next->next->offset_index.getLength(); i++) {
+        ASSERT_EQ(i * 3, pl.get_root()->next->next->offset_index.at(i));
+    }
+
+    for(size_t i = 0; i < pl.get_root()->next->next->offsets.getLength(); i++) {
+        ASSERT_EQ(offsets[i % 3], pl.get_root()->next->next->offsets.at(i));
+    }
+
+    // only part of the next node contents can be moved over when we delete 8 since (1 + 5) > 5
+    pl.erase(8);
+
+    // [0..4], [9], [10..14] => [0..4], [9,10,11,12,13], [14]
+
+    ASSERT_EQ(3, pl.size());
+    ASSERT_EQ(5, pl.get_root()->next->size());
+    ASSERT_EQ(1, pl.get_root()->next->next->size());
+    ASSERT_EQ(13, pl.get_root()->next->ids.last());
+    ASSERT_EQ(14, pl.get_root()->next->next->ids.last());
+
+    for(size_t i = 0; i < pl.get_root()->next->offset_index.getLength(); i++) {
+        ASSERT_EQ(i * 3, pl.get_root()->next->offset_index.at(i));
+    }
+
+    for(size_t i = 0; i < pl.get_root()->next->offsets.getLength(); i++) {
+        ASSERT_EQ(offsets[i % 3], pl.get_root()->next->offsets.at(i));
+    }
+
+    for(size_t i = 0; i < pl.get_root()->next->next->offset_index.getLength(); i++) {
+        ASSERT_EQ(i * 3, pl.get_root()->next->next->offset_index.at(i));
+    }
+
+    for(size_t i = 0; i < pl.get_root()->next->next->offsets.getLength(); i++) {
+        ASSERT_EQ(offsets[i % 3], pl.get_root()->next->next->offsets.at(i));
+    }
+}
+
+TEST(PostingListTest, OutOfOrderUpserts) {
+    std::vector<uint32_t> offsets = {0, 1, 3};
+    posting_list_t pl(5);
+
+    for(int i = 5; i > 0; i--) {
+        pl.upsert(i, offsets);
+    }
+
+    pl.upsert(0, offsets);
+    pl.upsert(200000, offsets);
+
+    ASSERT_EQ(2, pl.size());
+
+    ASSERT_EQ(3, pl.get_root()->size());
+    ASSERT_EQ(4, pl.get_root()->next->size());
+
+    for(size_t i = 0; i < pl.get_root()->offset_index.getLength(); i++) {
+        ASSERT_EQ(i * 3, pl.get_root()->offset_index.at(i));
+    }
+
+    for(size_t i = 0; i < pl.get_root()->offsets.getLength(); i++) {
+        ASSERT_EQ(offsets[i % 3], pl.get_root()->offsets.at(i));
+    }
+
+    for(size_t i = 0; i < pl.get_root()->next->offset_index.getLength(); i++) {
+        ASSERT_EQ(i * 3, pl.get_root()->next->offset_index.at(i));
+    }
+
+    for(size_t i = 0; i < pl.get_root()->next->offsets.getLength(); i++) {
+        ASSERT_EQ(offsets[i % 3], pl.get_root()->next->offsets.at(i));
+    }
+}
+
+TEST(PostingListTest, RandomInsertAndDeletes) {
+    time_t t;
+    srand((unsigned) time(&t));
+
+    posting_list_t pl(100);
+    std::vector<uint32_t> offsets1 = {0, 1, 3};
+    std::vector<uint32_t> offsets2 = {10, 12};
+
+    for(size_t i = 0; i < 100000; i++) {
+        const std::vector<uint32_t>& offsets = (i % 2 == 0) ? offsets1 : offsets2;
+        pl.upsert(rand() % 100000, offsets);
+    }
+
+    for(size_t i = 0; i < 10000; i++) {
+        const std::vector<uint32_t>& offsets = (i % 2 == 0) ? offsets1 : offsets2;
+        pl.erase(rand() % 100000);
+    }
+
+    bool size_within_range = (pl.size() < 1500) && (pl.size() > 1000);
+    ASSERT_TRUE(size_within_range);
+}