Basics of a block-based posting list container.

This commit is contained in:
Kishore Nallan 2021-05-22 15:05:46 +05:30
parent b817e615cb
commit 8b24fe82b8
6 changed files with 835 additions and 2 deletions

View File

@ -242,8 +242,8 @@ private:
void index_string_array_field(const std::vector<std::string> & strings, const int64_t score, art_tree *t,
uint32_t seq_id, bool is_facet, const field & a_field);
void remove_and_shift_offset_index(sorted_array& offset_index, const uint32_t* indices_sorted,
const uint32_t indices_length);
static void remove_and_shift_offset_index(sorted_array& offset_index, const uint32_t* indices_sorted,
const uint32_t indices_length);
void collate_included_ids(const std::vector<std::string>& q_included_tokens,
const std::string & field, const uint8_t field_id,

78
include/posting_list.h Normal file
View File

@ -0,0 +1,78 @@
#pragma once
#include <map>
#include "sorted_array.h"
#include "array.h"
typedef uint32_t last_id_t;
#define FOR_ELE_SIZE sizeof(uint32_t)
#define METADATA_OVERHEAD 5
/*
Compressed chain of blocks that store the document IDs and offsets of a given token.
Offsets of singular and multi-valued fields are encoded differently.
*/
class posting_list_t {
public:
    // A block stores a list of Document IDs, Token Offsets and a Mapping of ID => Offset indices efficiently
    // Layout of *data: [ids...mappings..offsets]
    // IDs and Mappings are sorted integers, while offsets are not sorted
    struct block_t {
        sorted_array ids;           // sorted document IDs of this block
        sorted_array offset_index;  // per-ID start position into `offsets`
        array offsets;              // flattened token offsets of all IDs

        // link to next block
        block_t* next = nullptr;

        // Shifts `offset_index` entries after `index` up by `num_offsets`,
        // making room for a newly inserted ID's offsets.
        void insert_and_shift_offset_index(uint32_t index, uint32_t num_offsets);

        // Removes the given (sorted) positions from `offset_index` and
        // re-bases the remaining entries so they stay consistent with `offsets`.
        void remove_and_shift_offset_index(const uint32_t* indices_sorted, uint32_t num_indices);

        // Inserts `id` along with its token offsets into this block.
        void upsert(uint32_t id, const std::vector<uint32_t>& offsets);

        // Removes `id` and its offsets from this block; no-op when absent.
        void erase(uint32_t id);

        // Number of IDs currently stored in this block.
        uint32_t size() {
            return ids.getLength();
        }
    };

private:
    // when a block reaches pre-allocated storage, it is expanded by this factor
    static constexpr float BLOCK_GROWTH_FACTOR = 1.3;

    // maximum number of IDs (and associated offsets) to store in each block before another block is created
    const uint16_t BLOCK_MAX_ELEMENTS;

    // first block of the chain; a value member, so an empty list needs no heap allocation
    block_t root_block;

    // keeps track of the *last* ID in each block and is used for partial random access
    // e.g. 0..[9], 10..[19], 20..[29]
    // MUST be ordered
    std::map<last_id_t, block_t*> id_block_map;

    // Moves the second half of `src_block`'s contents into (empty) `dst_block`.
    static void split_block(block_t* src_block, block_t* dst_block);

    // Moves the first `num_block2_ids` IDs (and their offsets) of `block2` into `block1`.
    static void merge_adjacent_blocks(block_t* block1, block_t* block2, size_t num_block2_ids);

public:
    posting_list_t() = delete;

    explicit posting_list_t(uint16_t max_block_elements);

    ~posting_list_t();

    // Inserts `id` with its token offsets, splitting/chaining blocks as needed.
    void upsert(uint32_t id, const std::vector<uint32_t>& offsets);

    // Removes `id`, merging under-filled blocks where possible.
    void erase(uint32_t id);

    block_t* get_root();

    // Number of blocks (NOT number of IDs) in the list.
    size_t size();

    // Returns the block whose *last* ID is exactly `id`, or nullptr.
    block_t* block_of(last_id_t id);
};

View File

@ -54,6 +54,8 @@ public:
uint32_t at(uint32_t index);
uint32_t last();
bool contains(uint32_t value);
uint32_t indexOf(uint32_t value);

414
src/posting_list.cpp Normal file
View File

@ -0,0 +1,414 @@
#include "posting_list.h"
#include "for.h"
#include "array_utils.h"
/* block_t operations */
// Makes room in `offset_index` for a new entry at `index`: the value currently
// at `index` is kept for the new slot, and every subsequent entry is shifted up
// by `num_offsets` (the number of offsets the newly inserted ID occupies).
void posting_list_t::block_t::insert_and_shift_offset_index(const uint32_t index, const uint32_t num_offsets) {
    uint32_t existing_offset_index = offset_index.at(index);
    uint32_t length = offset_index.getLength();
    uint32_t new_length = length + 1;
    // uncompress with capacity for one extra element
    uint32_t* curr_array = offset_index.uncompress(new_length);

    // shift the tail one slot to the right to open up position `index`
    memmove(&curr_array[index+1], &curr_array[index], sizeof(uint32_t)*(length - index));
    curr_array[index] = existing_offset_index;

    // entries after the insertion point move up by the new ID's offset count
    uint32_t curr_index = index + 1;
    while(curr_index < new_length) {
        curr_array[curr_index] += num_offsets;
        curr_index++;
    }

    offset_index.load(curr_array, new_length);

    delete [] curr_array;
}
// Inserts `id` with its token positions into this block.
// NOTE(review): assumes ids.append() returns the index at which `id` landed;
// if `id` can already exist in the block, confirm that sorted_array::append
// does not create a duplicate entry.
void posting_list_t::block_t::upsert(const uint32_t id, const std::vector<uint32_t>& positions) {
    size_t inserted_index = ids.append(id);

    if(inserted_index == ids.getLength()-1) {
        // treat as appends
        uint32_t curr_index = offsets.getLength();
        offset_index.append(curr_index);
        for(uint32_t position : positions) {
            offsets.append(position);
        }
    } else {
        // landed in the middle: shift offset_index entries after the insertion
        // point, then splice the positions into `offsets` at the vacated slot
        uint32_t existing_offset_index = offset_index.at(inserted_index);
        insert_and_shift_offset_index(inserted_index, positions.size());
        offsets.insert(existing_offset_index, &positions[0], positions.size());
    }
}
// Removes `id` and all of its token offsets from this block.
// Silently does nothing when `id` is not present.
void posting_list_t::block_t::erase(const uint32_t id) {
    // locate the document inside this block; indexOf returns length when absent
    const uint32_t found_index = ids.indexOf(id);
    if(found_index == ids.getLength()) {
        return;
    }

    // the document's offsets span [begin_offset, past_end_offset)
    const uint32_t begin_offset = offset_index.at(found_index);
    const bool is_last_doc = (found_index == ids.getLength() - 1);
    const uint32_t past_end_offset = is_last_doc ? offsets.getLength()
                                                 : offset_index.at(found_index + 1);

    // drop the index entry first, then the offsets, then the ID itself
    const uint32_t removal_indices[1] = {found_index};
    remove_and_shift_offset_index(removal_indices, 1);
    offsets.remove_index(begin_offset, past_end_offset);
    ids.remove_value(id);
}
// Removes the entries at the given sorted positions from `offset_index` and
// shifts every surviving entry down by the total offset count of the removed
// IDs that precede it, keeping the index consistent once the matching ranges
// are also removed from `offsets`.
void posting_list_t::block_t::remove_and_shift_offset_index(const uint32_t* indices_sorted,
                                                            const uint32_t num_indices) {
    uint32_t *curr_array = offset_index.uncompress();
    uint32_t *new_array = new uint32_t[offset_index.getLength()];
    new_array[0] = 0;

    uint32_t new_index = 0;
    uint32_t curr_index = 0;
    uint32_t indices_counter = 0;
    uint32_t shift_value = 0;   // running total of offsets removed so far

    while(curr_index < offset_index.getLength()) {
        if(indices_counter < num_indices && curr_index >= indices_sorted[indices_counter]) {
            // skip copying
            if(curr_index == indices_sorted[indices_counter]) {
                curr_index++;
                // offset count owned by the skipped ID (zero when it is the
                // last entry, as there is no successor to diff against)
                const uint32_t diff = curr_index == offset_index.getLength() ?
                                      0 : (offset_index.at(curr_index) - offset_index.at(curr_index-1));

                shift_value += diff;
            }
            indices_counter++;
        } else {
            new_array[new_index++] = curr_array[curr_index++] - shift_value;
        }
    }

    offset_index.load(new_array, new_index);

    delete[] curr_array;
    delete[] new_array;
}
/* posting_list_t operations */
// Constructs an empty posting list; `max_block_elements` caps the number of
// IDs stored per block before a new block is chained/split off.
posting_list_t::posting_list_t(uint16_t max_block_elements): BLOCK_MAX_ELEMENTS(max_block_elements) {

}
// Frees every heap-allocated block in the chain. `root_block` itself is a
// value member, so deletion starts from the block after it.
posting_list_t::~posting_list_t() {
    for(block_t* curr = root_block.next; curr != nullptr; ) {
        block_t* victim = curr;
        curr = curr->next;
        delete victim;
    }
}
// Moves the first `num_block2_ids` IDs of `block2` (with their offsets) into
// `block1`, re-basing whatever remains in `block2`. When `num_block2_ids`
// equals block2's size, block2 is fully drained (caller then deletes it).
//
// BUG FIX: the original re-base loop did
//     raw_offset_index2[num_block2_ids + i] -= raw_offset_index2[num_block2_ids];
// which zeroes its own base at i == 0, so when block2 retained two or more IDs
// the remaining entries were never re-based; worse, `num_block2_offset_elements`
// was computed from that clobbered slot afterwards and came out as 0, so the
// moved IDs' offsets were never copied into block1. We now capture the base
// BEFORE any mutation — mirroring how split_block() captures `base_index_diff`.
//
// NOTE(review): reads raw_offsets1[0] unconditionally, so block1 must be
// non-empty — true for the current erase() call sites; confirm for new callers.
void posting_list_t::merge_adjacent_blocks(posting_list_t::block_t* block1, posting_list_t::block_t* block2,
                                           size_t num_block2_ids) {
    // merge ids
    uint32_t* raw_ids1 = block1->ids.uncompress();
    uint32_t* raw_ids2 = block2->ids.uncompress();

    size_t block1_orig_size = block1->size();
    size_t block2_orig_size = block2->size();

    uint32_t* raw_ids = new uint32_t[block1->size() + num_block2_ids];
    std::memmove(raw_ids, raw_ids1, sizeof(uint32_t) * block1->size());
    std::memmove(raw_ids + block1->size(), raw_ids2, sizeof(uint32_t) * num_block2_ids);
    block1->ids.load(raw_ids, block1->size() + num_block2_ids);
    block2->ids.load(raw_ids2 + num_block2_ids, block2->size() - num_block2_ids);

    delete [] raw_ids1;
    delete [] raw_ids2;
    delete [] raw_ids;

    // merge offset indices
    uint32_t* raw_offset_index1 = block1->offset_index.uncompress();
    uint32_t* raw_offset_index2 = block2->offset_index.uncompress();
    uint32_t* raw_offset_index = new uint32_t[block1_orig_size + block2_orig_size];
    std::memmove(raw_offset_index, raw_offset_index1, sizeof(uint32_t) * block1->offset_index.getLength());

    // block2's entries are re-based on top of block1's existing offsets
    size_t start_index = block1->offset_index.getLength();
    size_t base_offset_len = block1->offsets.getLength();

    for(size_t i = 0; i < num_block2_ids; i++) {
        raw_offset_index[start_index + i] = raw_offset_index2[i] + base_offset_len;
    }

    block1->offset_index.load(raw_offset_index, block1->offset_index.getLength() + num_block2_ids);

    // number of offsets that belong to the IDs being moved out of block2:
    // the (pre-rebase) offset_index entry of the first ID left behind, or all
    // of block2's offsets when every ID moves. MUST be read before the re-base
    // loop below mutates raw_offset_index2.
    size_t num_block2_offset_elements = (num_block2_ids == block2_orig_size) ? block2->offsets.getLength() :
                                        raw_offset_index2[num_block2_ids];

    // re-base block2's remaining offset_index entries to be zero-based
    const size_t block2_remaining_ids = block2_orig_size - num_block2_ids;
    for(size_t i = 0; i < block2_remaining_ids; i++) {
        raw_offset_index2[num_block2_ids + i] -= num_block2_offset_elements;
    }

    block2->offset_index.load(raw_offset_index2 + num_block2_ids, block2_remaining_ids);

    // merge offsets
    uint32_t* raw_offsets1 = block1->offsets.uncompress();
    uint32_t* raw_offsets2 = block2->offsets.uncompress();

    uint32_t* raw_offsets = new uint32_t[block1->offsets.getLength() + num_block2_offset_elements];
    uint32_t min = raw_offsets1[0], max = raw_offsets1[0];

    // we have to manually copy over so we can find the new min and max
    for(size_t i = 0; i < block1->offsets.getLength(); i++) {
        raw_offsets[i] = raw_offsets1[i];
        if(raw_offsets[i] < min) {
            min = raw_offsets[i];
        }
        if(raw_offsets[i] > max) {
            max = raw_offsets[i];
        }
    }

    size_t block2_base_index = block1->offsets.getLength();

    for(size_t i = 0; i < num_block2_offset_elements; i++) {
        size_t j = block2_base_index + i;
        raw_offsets[j] = raw_offsets2[i];
        if(raw_offsets[j] < min) {
            min = raw_offsets[j];
        }
        if(raw_offsets[j] > max) {
            max = raw_offsets[j];
        }
    }

    block1->offsets.load(raw_offsets, block1->offsets.getLength() + num_block2_offset_elements, min, max);

    // reset block2 offsets with remaining elements
    if(block2->offsets.getLength() != num_block2_offset_elements) {
        const size_t block2_new_offsets_length = (block2->offsets.getLength() - num_block2_offset_elements);
        uint32_t* block2_new_raw_offsets = new uint32_t[block2_new_offsets_length];
        min = max = raw_offsets2[num_block2_offset_elements];

        for(size_t i = 0; i < block2_new_offsets_length; i++) {
            block2_new_raw_offsets[i] = raw_offsets2[num_block2_offset_elements + i];
            if(block2_new_raw_offsets[i] < min) {
                min = block2_new_raw_offsets[i];
            }
            if(block2_new_raw_offsets[i] > max) {
                max = block2_new_raw_offsets[i];
            }
        }
        block2->offsets.load(block2_new_raw_offsets, block2_new_offsets_length, min, max);
        delete [] block2_new_raw_offsets;
    } else {
        block2->offsets.load(nullptr, 0, 0, 0);
    }

    delete [] raw_offset_index1;
    delete [] raw_offset_index2;
    delete [] raw_offset_index;

    delete [] raw_offsets1;
    delete [] raw_offsets2;
    delete [] raw_offsets;
}
// Splits `src_block` roughly in half, moving the upper half of its IDs,
// offset_index entries and offsets into `dst_block` (assumed empty).
// No-op for blocks with a single element.
void posting_list_t::split_block(posting_list_t::block_t* src_block, posting_list_t::block_t* dst_block) {
    if(src_block->size() <= 1) {
        return;
    }

    // split IDs down the middle
    uint32_t* raw_ids = src_block->ids.uncompress();
    size_t ids_first_half_length = (src_block->size() / 2);
    size_t ids_second_half_length = (src_block->size() - ids_first_half_length);
    src_block->ids.load(raw_ids, ids_first_half_length);
    dst_block->ids.load(raw_ids + ids_first_half_length, ids_second_half_length);

    // offset_index has one entry per ID, so the same midpoint applies
    uint32_t* raw_offset_indices = src_block->offset_index.uncompress();
    size_t offset_indices_first_half_length = (src_block->offset_index.getLength() / 2);
    size_t offset_indices_second_half_length = (src_block->offset_index.getLength() - offset_indices_first_half_length);
    src_block->offset_index.load(raw_offset_indices, offset_indices_first_half_length);

    // update second half to use zero based index
    // (base is captured BEFORE the loop mutates the array)
    uint32_t base_index_diff = raw_offset_indices[offset_indices_first_half_length];
    for(size_t i = 0; i < offset_indices_second_half_length; i++) {
        raw_offset_indices[offset_indices_first_half_length + i] -= base_index_diff;
    }

    dst_block->offset_index.load(raw_offset_indices + offset_indices_first_half_length, offset_indices_second_half_length);

    uint32_t* raw_offsets = src_block->offsets.uncompress();
    size_t src_offsets_length = src_block->offsets.getLength();

    // load first half of offsets
    // `base_index_diff` is also the count of offsets owned by the first half
    size_t offset_first_half_length = base_index_diff;

    // we need to find new min and max
    uint32_t min = raw_offsets[0], max = raw_offsets[0];

    for(size_t i = 0; i < offset_first_half_length; i++) {
        if(raw_offsets[i] < min) {
            min = raw_offsets[i];
        }
        if(raw_offsets[i] > max) {
            max = raw_offsets[i];
        }
    }

    src_block->offsets.load(raw_offsets, offset_first_half_length, min, max);

    // load second half
    min = max = raw_offsets[offset_first_half_length];
    for(size_t i = offset_first_half_length; i < src_offsets_length; i++) {
        if(raw_offsets[i] < min) {
            min = raw_offsets[i];
        }
        if(raw_offsets[i] > max) {
            max = raw_offsets[i];
        }
    }

    size_t offsets_second_half_length = src_offsets_length - offset_first_half_length;
    dst_block->offsets.load(raw_offsets + offset_first_half_length, offsets_second_half_length, min, max);

    delete [] raw_ids;
    delete [] raw_offset_indices;
    delete [] raw_offsets;
}
// Inserts `id` (with its token offsets) into the appropriate block, splitting
// or chaining a new block when the target block is already full, and keeping
// `id_block_map` (last-ID => block) in sync throughout.
void posting_list_t::upsert(const uint32_t id, const std::vector<uint32_t>& offsets) {
    // first we will locate the block where `id` should reside
    block_t* upsert_block;
    last_id_t before_upsert_last_id;

    if(id_block_map.empty()) {
        //id_block_map.emplace(id, &root_block);
        upsert_block = &root_block;
        before_upsert_last_id = UINT32_MAX;
    } else {
        // first block whose last ID is >= `id`; falls back to the final block
        // when `id` is larger than every last ID
        const auto it = id_block_map.lower_bound(id);
        upsert_block = (it == id_block_map.end()) ? id_block_map.rbegin()->second : it->second;
        before_upsert_last_id = upsert_block->ids.at(upsert_block->size() - 1);
    }

    // happy path: upsert_block is not full
    if(upsert_block->size() < BLOCK_MAX_ELEMENTS) {
        upsert_block->upsert(id, offsets);

        // refresh the map entry if the block's last ID changed
        last_id_t after_upsert_last_id = upsert_block->ids.at(upsert_block->size() - 1);
        if(before_upsert_last_id != after_upsert_last_id) {
            id_block_map.erase(before_upsert_last_id);
            id_block_map.emplace(after_upsert_last_id, upsert_block);
        }
    } else {
        // NOTE(review): if `id` can already exist in a full block, this path
        // would still allocate/split — confirm whether pure updates reach here.
        block_t* new_block = new block_t;

        if(upsert_block->next == nullptr && upsert_block->ids.last() < id) {
            // appending to the end of the last block where the id will reside on a newly block
            new_block->upsert(id, offsets);
        } else {
            // upsert and then split block
            upsert_block->upsert(id, offsets);

            // evenly divide elements between both blocks
            split_block(upsert_block, new_block);

            last_id_t after_upsert_last_id = upsert_block->ids.at(upsert_block->size() - 1);
            id_block_map.erase(before_upsert_last_id);
            id_block_map.emplace(after_upsert_last_id, upsert_block);
        }

        last_id_t after_new_block_id = new_block->ids.at(new_block->size() - 1);
        id_block_map.emplace(after_new_block_id, new_block);

        // link the new block into the chain right after `upsert_block`
        new_block->next = upsert_block->next;
        upsert_block->next = new_block;
    }
}
// Removes `id` from the list. After removal, a block that falls below 50%
// occupancy is refilled from (or merged with) its next block, and
// `id_block_map` is updated wherever a block's last ID changes.
void posting_list_t::erase(const uint32_t id) {
    // the owning block is the first one whose last ID is >= `id`
    const auto it = id_block_map.lower_bound(id);

    if(it == id_block_map.end()) {
        return ;
    }

    block_t* erase_block = it->second;
    last_id_t before_last_id = it->first;
    erase_block->erase(id);

    size_t new_ids_length = erase_block->size();

    if(new_ids_length == 0) {
        // happens when the last element of last block is deleted
        if(erase_block != &root_block) {
            // since we will be deleting the empty node, set the previous node's next pointer to null
            // NOTE(review): assumes a non-root block always has a predecessor
            // entry in the map (so std::prev(it) is valid) — holds as long as
            // the merge logic below prevents earlier blocks from emptying first.
            std::prev(it)->second->next = nullptr;
            delete erase_block;
        }

        id_block_map.erase(before_last_id);

        return;
    }

    if(new_ids_length >= BLOCK_MAX_ELEMENTS/2 || erase_block->next == nullptr) {
        // block is still at least half full (or is the tail): no merging needed
        last_id_t after_last_id = erase_block->ids.at(new_ids_length-1);
        if(before_last_id != after_last_id) {
            id_block_map.erase(before_last_id);
            id_block_map.emplace(after_last_id, erase_block);
        }

        return ;
    }

    // block is less than 50% of max capacity and contains a next node which we can refill from
    auto next_block = erase_block->next;
    last_id_t next_block_last_id = next_block->ids.at(next_block->ids.getLength()-1);

    if(erase_block->size() + next_block->size() <= BLOCK_MAX_ELEMENTS) {
        // we can merge the contents of next block with `erase_block` and delete the next block
        merge_adjacent_blocks(erase_block, next_block, next_block->size());

        // unlink and drop the now-empty next block
        erase_block->next = next_block->next;
        delete next_block;

        id_block_map.erase(next_block_last_id);
    } else {
        // only part of the next block can be moved over
        size_t num_block2_ids = BLOCK_MAX_ELEMENTS - erase_block->size();
        merge_adjacent_blocks(erase_block, next_block, num_block2_ids);
        // NOTE: we don't have to update `id_block_map` for `next_block` as last element doesn't change
    }

    // `erase_block`'s last ID may have changed due to the merge/refill
    last_id_t after_last_id = erase_block->ids.at(erase_block->ids.getLength()-1);
    if(before_last_id != after_last_id) {
        id_block_map.erase(before_last_id);
        id_block_map.emplace(after_last_id, erase_block);
    }
}
// Returns the first block of the chain (always valid, may be empty).
posting_list_t::block_t* posting_list_t::get_root() {
    return &root_block;
}
// Returns the number of blocks in the list (one map entry per block),
// NOT the number of IDs stored.
size_t posting_list_t::size() {
    return id_block_map.size();
}
// Returns the block whose *last* ID is exactly `id`, or nullptr when no block
// ends with that ID (this is an exact lookup, not a range lookup).
posting_list_t::block_t* posting_list_t::block_of(last_id_t id) {
    const auto found = id_block_map.find(id);
    return (found == id_block_map.end()) ? nullptr : found->second;
}

View File

@ -397,3 +397,11 @@ void sorted_array::binary_count_indices(const uint32_t *values, int low_vindex,
binary_count_indices(values, pivot_vindex+1, high_vindex, src, in_index, high_index, num_found);
}
}
// Returns the largest (last) element of the sorted array, or UINT32_MAX as a
// sentinel when the array is empty.
uint32_t sorted_array::last() {
    const uint32_t len = getLength();
    return (len == 0) ? UINT32_MAX : at(len - 1);
}

331
test/posting_list_test.cpp Normal file
View File

@ -0,0 +1,331 @@
#include <gtest/gtest.h>
#include "posting_list.h"
#include <vector>
// Verifies block layout after sequential, alternating and mid-block inserts,
// with a max of 5 IDs per block throughout.
TEST(PostingListTest, Insert) {
    std::vector<uint32_t> offsets = {0, 1, 3};

    posting_list_t pl(5);

    // insert elements sequentially
    for(size_t i = 0; i < 15; i++) {
        pl.upsert(i, offsets);
    }

    // expect three fully packed blocks of 5
    posting_list_t::block_t* root = pl.get_root();
    ASSERT_EQ(5, root->ids.getLength());
    ASSERT_EQ(5, root->next->ids.getLength());
    ASSERT_EQ(5, root->next->next->ids.getLength());
    ASSERT_EQ(root->next->next->next, nullptr);

    ASSERT_EQ(3, pl.size());
    ASSERT_EQ(root, pl.block_of(4));
    ASSERT_EQ(root->next, pl.block_of(9));
    ASSERT_EQ(root->next->next, pl.block_of(14));

    // insert alternate values
    posting_list_t pl2(5);

    for(size_t i = 0; i < 15; i+=2) {
        // [0, 2, 4, 6, 8], [10, 12, 14]
        pl2.upsert(i, offsets);
    }

    root = pl2.get_root();
    ASSERT_EQ(5, root->ids.getLength());
    ASSERT_EQ(3, root->next->ids.getLength());
    ASSERT_EQ(root->next->next, nullptr);

    ASSERT_EQ(2, pl2.size());
    ASSERT_EQ(root, pl2.block_of(8));
    ASSERT_EQ(root->next, pl2.block_of(14));

    // insert in the middle
    // case 1: insert lands in the first (full) block, forcing a split
    posting_list_t pl3(5);

    for(size_t i = 0; i < 5; i++) {
        pl3.upsert(i, offsets);
    }

    pl3.upsert(6, offsets);
    pl3.upsert(8, offsets);
    pl3.upsert(9, offsets);
    pl3.upsert(10, offsets);
    pl3.upsert(12, offsets);

    // [0,1,2,3,4], [6,8,9,10,12]
    pl3.upsert(5, offsets);
    ASSERT_EQ(3, pl3.size());
    ASSERT_EQ(5, pl3.get_root()->ids.getLength());
    ASSERT_EQ(3, pl3.get_root()->next->ids.getLength());
    ASSERT_EQ(8, pl3.get_root()->next->ids.last());
    ASSERT_EQ(3, pl3.get_root()->next->next->ids.getLength());
    ASSERT_EQ(12, pl3.get_root()->next->next->ids.last());

    // every ID carries 3 offsets, so offset_index should step by 3
    for(size_t i = 0; i < pl3.get_root()->next->offset_index.getLength(); i++) {
        ASSERT_EQ(i * 3, pl3.get_root()->next->offset_index.at(i));
    }

    for(size_t i = 0; i < pl3.get_root()->next->offsets.getLength(); i++) {
        ASSERT_EQ(offsets[i % 3], pl3.get_root()->next->offsets.at(i));
    }

    // case 2: insert lands in the second (full) block, forcing a split
    posting_list_t pl4(5);

    for(size_t i = 0; i < 5; i++) {
        pl4.upsert(i, offsets);
    }

    pl4.upsert(6, offsets);
    pl4.upsert(8, offsets);
    pl4.upsert(9, offsets);
    pl4.upsert(10, offsets);
    pl4.upsert(12, offsets);

    // [0,1,2,3,4], [6,8,9,10,12]
    pl4.upsert(11, offsets);
    ASSERT_EQ(3, pl4.size());
    ASSERT_EQ(5, pl4.get_root()->ids.getLength());
    ASSERT_EQ(3, pl4.get_root()->next->ids.getLength());
    ASSERT_EQ(9, pl4.get_root()->next->ids.last());
    ASSERT_EQ(3, pl4.get_root()->next->next->ids.getLength());
    ASSERT_EQ(12, pl4.get_root()->next->next->ids.last());

    for(size_t i = 0; i < pl4.get_root()->next->offset_index.getLength(); i++) {
        ASSERT_EQ(i * 3, pl4.get_root()->next->offset_index.at(i));
    }

    for(size_t i = 0; i < pl4.get_root()->next->offsets.getLength(); i++) {
        ASSERT_EQ(offsets[i % 3], pl4.get_root()->next->offsets.at(i));
    }
}
// Exercises erase() on the root block: empty-list erase, single-element erase,
// deferred merging until below 50% occupancy, and the eventual merge.
TEST(PostingListTest, RemovalsOnFirstBlock) {
    std::vector<uint32_t> offsets = {0, 1, 3};

    posting_list_t pl(5);
    ASSERT_EQ(0, pl.size());

    // try to erase when posting list is empty
    pl.erase(0);
    ASSERT_EQ(0, pl.size());

    // insert a single element and erase it
    pl.upsert(0, offsets);
    ASSERT_EQ(1, pl.size());
    pl.erase(0);
    ASSERT_EQ(0, pl.size());
    ASSERT_EQ(0, pl.get_root()->ids.getLength());
    ASSERT_EQ(0, pl.get_root()->offset_index.getLength());
    ASSERT_EQ(0, pl.get_root()->offsets.getLength());

    // insert until one past max block size
    for(size_t i = 0; i < 6; i++) {
        pl.upsert(i, offsets);
    }

    ASSERT_EQ(2, pl.size());

    // delete non-existing element
    pl.erase(1000);

    // delete elements from first block: blocks should not be merged until it falls below 50% occupancy
    pl.erase(1);
    ASSERT_EQ(2, pl.size());

    // [0, 2, 3, 4], [5]
    for(size_t i = 0; i < pl.get_root()->offset_index.getLength(); i++) {
        ASSERT_EQ(i * 3, pl.get_root()->offset_index.at(i));
    }
    for(size_t i = 0; i < pl.get_root()->offsets.getLength(); i++) {
        ASSERT_EQ(offsets[i % 3], pl.get_root()->offsets.at(i));
    }

    pl.erase(2);
    ASSERT_EQ(2, pl.size());
    pl.erase(3);

    // [0, 4], [5]
    ASSERT_EQ(2, pl.size());
    ASSERT_EQ(2, pl.get_root()->size());
    ASSERT_EQ(1, pl.get_root()->next->size());
    ASSERT_EQ(pl.get_root(), pl.block_of(4));
    ASSERT_EQ(pl.get_root()->next, pl.block_of(5));

    for(size_t i = 0; i < pl.get_root()->offset_index.getLength(); i++) {
        ASSERT_EQ(i * 3, pl.get_root()->offset_index.at(i));
    }
    for(size_t i = 0; i < pl.get_root()->offsets.getLength(); i++) {
        ASSERT_EQ(offsets[i % 3], pl.get_root()->offsets.at(i));
    }

    pl.erase(4); // this will trigger the merge

    // [0, 5]
    // ensure that merge has happened
    ASSERT_EQ(1, pl.size());
    ASSERT_EQ(pl.get_root(), pl.block_of(5));
    ASSERT_EQ(nullptr, pl.get_root()->next);
    ASSERT_EQ(2, pl.get_root()->size());

    for(size_t i = 0; i < pl.get_root()->offset_index.getLength(); i++) {
        ASSERT_EQ(i * 3, pl.get_root()->offset_index.at(i));
    }
    for(size_t i = 0; i < pl.get_root()->offsets.getLength(); i++) {
        ASSERT_EQ(offsets[i % 3], pl.get_root()->offsets.at(i));
    }
}
// Exercises erase() on non-root blocks: dropping an entire tail block,
// full merges, and partial refills from the next block.
TEST(PostingListTest, RemovalsOnLaterBlocks) {
    std::vector<uint32_t> offsets = {0, 1, 3};

    posting_list_t pl(5);

    // insert until one past max block size
    for(size_t i = 0; i < 6; i++) {
        pl.upsert(i, offsets);
    }

    // erase last element of last, non-first block
    pl.erase(5);
    ASSERT_EQ(1, pl.size());
    ASSERT_EQ(5, pl.get_root()->size());
    ASSERT_EQ(4, pl.get_root()->ids.last());
    ASSERT_EQ(nullptr, pl.get_root()->next);

    for(size_t i = 0; i < pl.get_root()->offset_index.getLength(); i++) {
        ASSERT_EQ(i * 3, pl.get_root()->offset_index.at(i));
    }
    for(size_t i = 0; i < pl.get_root()->offsets.getLength(); i++) {
        ASSERT_EQ(offsets[i % 3], pl.get_root()->offsets.at(i));
    }

    // erase last element of the only block when block is at least half full
    pl.erase(4);
    ASSERT_EQ(1, pl.size());
    ASSERT_EQ(4, pl.get_root()->size());
    ASSERT_EQ(3, pl.get_root()->ids.last());
    ASSERT_EQ(pl.get_root(), pl.block_of(3));

    for(size_t i = 4; i < 15; i++) {
        pl.upsert(i, offsets);
    }

    // [0..4], [5..9], [10..14]
    pl.erase(5);
    pl.erase(6);
    pl.erase(7);

    // offset structures of untouched blocks must remain consistent
    for(size_t i = 0; i < pl.get_root()->next->offset_index.getLength(); i++) {
        ASSERT_EQ(i * 3, pl.get_root()->next->offset_index.at(i));
    }
    for(size_t i = 0; i < pl.get_root()->next->offsets.getLength(); i++) {
        ASSERT_EQ(offsets[i % 3], pl.get_root()->next->offsets.at(i));
    }
    for(size_t i = 0; i < pl.get_root()->next->next->offset_index.getLength(); i++) {
        ASSERT_EQ(i * 3, pl.get_root()->next->next->offset_index.at(i));
    }
    for(size_t i = 0; i < pl.get_root()->next->next->offsets.getLength(); i++) {
        ASSERT_EQ(offsets[i % 3], pl.get_root()->next->next->offsets.at(i));
    }

    // only part of the next node contents can be moved over when we delete 8 since (1 + 5) > 5
    pl.erase(8);

    // [0..4], [9], [10..14] => [0..4], [9,10,11,12,13], [14]
    ASSERT_EQ(3, pl.size());
    ASSERT_EQ(5, pl.get_root()->next->size());
    ASSERT_EQ(1, pl.get_root()->next->next->size());
    ASSERT_EQ(13, pl.get_root()->next->ids.last());
    ASSERT_EQ(14, pl.get_root()->next->next->ids.last());

    for(size_t i = 0; i < pl.get_root()->next->offset_index.getLength(); i++) {
        ASSERT_EQ(i * 3, pl.get_root()->next->offset_index.at(i));
    }
    for(size_t i = 0; i < pl.get_root()->next->offsets.getLength(); i++) {
        ASSERT_EQ(offsets[i % 3], pl.get_root()->next->offsets.at(i));
    }
    for(size_t i = 0; i < pl.get_root()->next->next->offset_index.getLength(); i++) {
        ASSERT_EQ(i * 3, pl.get_root()->next->next->offset_index.at(i));
    }
    for(size_t i = 0; i < pl.get_root()->next->next->offsets.getLength(); i++) {
        ASSERT_EQ(offsets[i % 3], pl.get_root()->next->next->offsets.at(i));
    }
}
// Verifies that inserting IDs in descending order, then below and far above
// the existing range, still produces consistent blocks (split at 3/4 here
// because the full block is split after the out-of-order insert).
TEST(PostingListTest, OutOfOrderUpserts) {
    std::vector<uint32_t> offsets = {0, 1, 3};
    posting_list_t pl(5);

    for(int i = 5; i > 0; i--) {
        pl.upsert(i, offsets);
    }

    pl.upsert(0, offsets);
    pl.upsert(200000, offsets);

    ASSERT_EQ(2, pl.size());

    ASSERT_EQ(3, pl.get_root()->size());
    ASSERT_EQ(4, pl.get_root()->next->size());

    for(size_t i = 0; i < pl.get_root()->offset_index.getLength(); i++) {
        ASSERT_EQ(i * 3, pl.get_root()->offset_index.at(i));
    }
    for(size_t i = 0; i < pl.get_root()->offsets.getLength(); i++) {
        ASSERT_EQ(offsets[i % 3], pl.get_root()->offsets.at(i));
    }

    for(size_t i = 0; i < pl.get_root()->next->offset_index.getLength(); i++) {
        ASSERT_EQ(i * 3, pl.get_root()->next->offset_index.at(i));
    }
    for(size_t i = 0; i < pl.get_root()->next->offsets.getLength(); i++) {
        ASSERT_EQ(offsets[i % 3], pl.get_root()->next->offsets.at(i));
    }
}
// Stress test: 100K random upserts followed by 10K random erases, then a
// coarse sanity check on the resulting block count.
// Fix: the erase loop declared an `offsets` local that was never used.
TEST(PostingListTest, RandomInsertAndDeletes) {
    // seed so each run exercises a different random sequence (by design)
    time_t t;
    srand((unsigned) time(&t));

    posting_list_t pl(100);
    std::vector<uint32_t> offsets1 = {0, 1, 3};
    std::vector<uint32_t> offsets2 = {10, 12};

    // insert random IDs in [0, 100000), alternating between two offset shapes
    for(size_t i = 0; i < 100000; i++) {
        const std::vector<uint32_t>& offsets = (i % 2 == 0) ? offsets1 : offsets2;
        pl.upsert(rand() % 100000, offsets);
    }

    // erase a random subset; erase() takes no offsets, so no local is needed
    for(size_t i = 0; i < 10000; i++) {
        pl.erase(rand() % 100000);
    }

    // with ~63K distinct IDs and 100 IDs per block, block count should land here
    bool size_within_range = (pl.size() < 1500) && (pl.size() > 1000);
    ASSERT_TRUE(size_within_range);
}