mirror of
https://github.com/typesense/typesense.git
synced 2025-05-22 06:40:30 +08:00
Basics of a block based posting list container.
This commit is contained in:
parent
b817e615cb
commit
8b24fe82b8
@ -242,8 +242,8 @@ private:
|
||||
void index_string_array_field(const std::vector<std::string> & strings, const int64_t score, art_tree *t,
|
||||
uint32_t seq_id, bool is_facet, const field & a_field);
|
||||
|
||||
void remove_and_shift_offset_index(sorted_array& offset_index, const uint32_t* indices_sorted,
|
||||
const uint32_t indices_length);
|
||||
static void remove_and_shift_offset_index(sorted_array& offset_index, const uint32_t* indices_sorted,
|
||||
const uint32_t indices_length);
|
||||
|
||||
void collate_included_ids(const std::vector<std::string>& q_included_tokens,
|
||||
const std::string & field, const uint8_t field_id,
|
||||
|
78
include/posting_list.h
Normal file
78
include/posting_list.h
Normal file
@ -0,0 +1,78 @@
|
||||
#pragma once
|
||||
#include <map>
|
||||
#include "sorted_array.h"
|
||||
#include "array.h"
|
||||
|
||||
typedef uint32_t last_id_t;
|
||||
|
||||
#define FOR_ELE_SIZE sizeof(uint32_t)
|
||||
#define METADATA_OVERHEAD 5
|
||||
|
||||
/*
|
||||
Compressed chain of blocks that store the document IDs and offsets of a given token.
|
||||
Offsets of singular and multi-valued fields are encoded differently.
|
||||
*/
|
||||
class posting_list_t {
public:

    // A block stores a list of Document IDs, Token Offsets and a Mapping of ID => Offset indices efficiently
    // Layout of *data: [ids...mappings..offsets]
    // IDs and Mappings are sorted integers, while offsets are not sorted
    struct block_t {
        sorted_array ids;
        sorted_array offset_index;
        array offsets;

        // link to next block
        block_t* next = nullptr;

        // Makes room for a new entry at `index` in offset_index and bumps every
        // subsequent entry by `num_offsets` (the count of offsets being inserted).
        void insert_and_shift_offset_index(uint32_t index, uint32_t num_offsets);

        // Drops the offset_index entries at `indices_sorted` (must be ascending)
        // and shifts the remaining entries down to close the gaps.
        void remove_and_shift_offset_index(const uint32_t* indices_sorted, uint32_t num_indices);

        // Adds `id` along with its token offsets to this block.
        void upsert(uint32_t id, const std::vector<uint32_t>& offsets);

        // Removes `id` and its offsets from this block; no-op when `id` is absent.
        void erase(uint32_t id);

        // Number of document IDs stored in this block.
        uint32_t size() {
            return ids.getLength();
        }
    };

private:

    // when a block reaches pre-allocated storage, it is expanded by this factor
    static constexpr float BLOCK_GROWTH_FACTOR = 1.3;

    // maximum number of IDs (and associated offsets) to store in each block before another block is created
    const uint16_t BLOCK_MAX_ELEMENTS;

    // first block of the chain, held by value; subsequent blocks are heap allocated and hang off `next`
    block_t root_block;

    // keeps track of the *last* ID in each block and is used for partial random access
    // e.g. 0..[9], 10..[19], 20..[29]
    // MUST be ordered
    std::map<last_id_t, block_t*> id_block_map;

    // Moves the upper half of `src_block`'s contents into (empty) `dst_block`.
    static void split_block(block_t* src_block, block_t* dst_block);

    // Moves the first `num_block2_ids` entries of `block2` into `block1`.
    static void merge_adjacent_blocks(block_t* block1, block_t* block2, size_t num_block2_ids);

public:

    posting_list_t() = delete;

    explicit posting_list_t(uint16_t max_block_elements);

    ~posting_list_t();

    // Inserts `id` with its offsets, creating/splitting blocks when one fills up.
    void upsert(uint32_t id, const std::vector<uint32_t>& offsets);

    // Removes `id`, refilling/merging a block that falls below half occupancy.
    void erase(uint32_t id);

    block_t* get_root();

    // Returns the number of blocks (i.e. id_block_map entries), NOT the number of IDs.
    size_t size();

    // Returns the block whose *last* stored ID equals `id`, or nullptr.
    block_t* block_of(last_id_t id);
};
|
@ -54,6 +54,8 @@ public:
|
||||
|
||||
uint32_t at(uint32_t index);
|
||||
|
||||
uint32_t last();
|
||||
|
||||
bool contains(uint32_t value);
|
||||
|
||||
uint32_t indexOf(uint32_t value);
|
||||
|
414
src/posting_list.cpp
Normal file
414
src/posting_list.cpp
Normal file
@ -0,0 +1,414 @@
|
||||
#include "posting_list.h"
|
||||
#include "for.h"
|
||||
#include "array_utils.h"
|
||||
|
||||
/* block_t operations */
|
||||
|
||||
// Opens a slot at `index` in offset_index: entries at [index, end) move up by one
// position and are incremented by `num_offsets`, since the offsets that will be
// inserted push every later document's offsets further into the `offsets` array.
void posting_list_t::block_t::insert_and_shift_offset_index(const uint32_t index, const uint32_t num_offsets) {
    uint32_t existing_offset_index = offset_index.at(index);
    uint32_t length = offset_index.getLength();
    uint32_t new_length = length + 1;
    // uncompress with room for one extra element
    uint32_t* curr_array = offset_index.uncompress(new_length);

    // shift [index, length) one slot to the right to make room
    memmove(&curr_array[index+1], &curr_array[index], sizeof(uint32_t)*(length - index));
    // the new entry starts where the displaced entry used to start
    curr_array[index] = existing_offset_index;

    // every entry after the new one now points `num_offsets` further into `offsets`
    uint32_t curr_index = index + 1;
    while(curr_index < new_length) {
        curr_array[curr_index] += num_offsets;
        curr_index++;
    }

    offset_index.load(curr_array, new_length);

    delete [] curr_array;
}
|
||||
|
||||
// Adds `id` with its token `positions` to this block, keeping ids / offset_index /
// offsets mutually consistent.
// NOTE(review): an `id` already present appears to be appended again — presumably
// sorted_array::append de-duplicates or callers guarantee uniqueness; confirm.
void posting_list_t::block_t::upsert(const uint32_t id, const std::vector<uint32_t>& positions) {
    size_t inserted_index = ids.append(id);

    if(inserted_index == ids.getLength()-1) {
        // treat as appends
        uint32_t curr_index = offsets.getLength();
        offset_index.append(curr_index);
        for(uint32_t position : positions) {
            offsets.append(position);
        }
    } else {
        // id landed in the middle: make room in offset_index, then splice the
        // positions into `offsets` at the displaced document's old start
        uint32_t existing_offset_index = offset_index.at(inserted_index);
        insert_and_shift_offset_index(inserted_index, positions.size());
        offsets.insert(existing_offset_index, &positions[0], positions.size());
    }
}
|
||||
|
||||
// Removes `id` together with its offsets; silently returns if `id` is not present.
void posting_list_t::block_t::erase(const uint32_t id) {
    uint32_t doc_index = ids.indexOf(id);

    // indexOf returning the length signals "not found"
    if (doc_index == ids.getLength()) {
        return;
    }

    // this document's offsets live in [start_offset, end_offset)
    uint32_t start_offset = offset_index.at(doc_index);
    uint32_t end_offset = (doc_index == ids.getLength() - 1) ?
                          offsets.getLength() :
                          offset_index.at(doc_index + 1);

    uint32_t doc_indices[1] = {doc_index};
    remove_and_shift_offset_index(doc_indices, 1);

    offsets.remove_index(start_offset, end_offset);
    ids.remove_value(id);
}
|
||||
|
||||
// Drops the offset_index entries at `indices_sorted` (ascending) and subtracts,
// from every surviving entry, the total number of offsets owned by the dropped
// documents that preceded it (`shift_value`), so the entries keep pointing at the
// right positions once the corresponding offsets are removed.
void posting_list_t::block_t::remove_and_shift_offset_index(const uint32_t* indices_sorted,
                                                            const uint32_t num_indices) {
    uint32_t *curr_array = offset_index.uncompress();
    uint32_t *new_array = new uint32_t[offset_index.getLength()];

    new_array[0] = 0;
    uint32_t new_index = 0;
    uint32_t curr_index = 0;
    uint32_t indices_counter = 0;
    uint32_t shift_value = 0;

    while(curr_index < offset_index.getLength()) {
        if(indices_counter < num_indices && curr_index >= indices_sorted[curr_index >= indices_sorted[indices_counter] ? indices_counter : indices_counter]) {
            // skip copying
            if(curr_index == indices_sorted[indices_counter]) {
                curr_index++;
                // how many offsets the skipped document owned (0 if it was the last entry)
                const uint32_t diff = curr_index == offset_index.getLength() ?
                                      0 : (offset_index.at(curr_index) - offset_index.at(curr_index-1));

                shift_value += diff;
            }
            indices_counter++;
        } else {
            new_array[new_index++] = curr_array[curr_index++] - shift_value;
        }
    }

    offset_index.load(new_array, new_index);

    delete[] curr_array;
    delete[] new_array;
}
|
||||
|
||||
/* posting_list_t operations */
|
||||
|
||||
// `max_block_elements` fixes the per-block ID capacity for the lifetime of this list.
posting_list_t::posting_list_t(uint16_t max_block_elements): BLOCK_MAX_ELEMENTS(max_block_elements) {

}
|
||||
|
||||
// Frees every heap-allocated block in the chain. `root_block` itself is a
// by-value member and must not be deleted.
posting_list_t::~posting_list_t() {
    for(block_t* curr = root_block.next; curr != nullptr; ) {
        block_t* successor = curr->next;
        delete curr;
        curr = successor;
    }
}
|
||||
|
||||
/*
 * Moves the first `num_block2_ids` IDs (with their offsets) of `block2` into `block1`.
 * When num_block2_ids == block2->size(), block2 is drained completely (the caller is
 * expected to unlink and delete it); otherwise block2 keeps its remaining, rebased tail.
 */
void posting_list_t::merge_adjacent_blocks(posting_list_t::block_t* block1, posting_list_t::block_t* block2,
                                           size_t num_block2_ids) {
    // merge ids
    uint32_t* raw_ids1 = block1->ids.uncompress();
    uint32_t* raw_ids2 = block2->ids.uncompress();

    size_t block1_orig_size = block1->size();
    size_t block2_orig_size = block2->size();

    uint32_t* raw_ids = new uint32_t[block1->size() + num_block2_ids];
    std::memmove(raw_ids, raw_ids1, sizeof(uint32_t) * block1->size());
    std::memmove(raw_ids + block1->size(), raw_ids2, sizeof(uint32_t) * num_block2_ids);

    block1->ids.load(raw_ids, block1->size() + num_block2_ids);
    block2->ids.load(raw_ids2 + num_block2_ids, block2->size() - num_block2_ids);

    delete [] raw_ids1;
    delete [] raw_ids2;
    delete [] raw_ids;

    // merge offset indices
    uint32_t* raw_offset_index1 = block1->offset_index.uncompress();
    uint32_t* raw_offset_index2 = block2->offset_index.uncompress();
    uint32_t* raw_offset_index = new uint32_t[block1_orig_size + block2_orig_size];

    std::memmove(raw_offset_index, raw_offset_index1, sizeof(uint32_t) * block1->offset_index.getLength());
    size_t start_index = block1->offset_index.getLength();
    size_t base_offset_len = block1->offsets.getLength();

    // entries moved from block2 must point past block1's existing offsets
    for(size_t i = 0; i < num_block2_ids; i++) {
        raw_offset_index[start_index + i] = raw_offset_index2[i] + base_offset_len;
    }

    block1->offset_index.load(raw_offset_index, block1->offset_index.getLength() + num_block2_ids);

    // Rebase the offset_index entries remaining in block2 so they start at zero.
    // FIX: capture the base *before* the loop. The previous in-place
    // `-= raw_offset_index2[num_block2_ids]` zeroed the base element on the first
    // iteration, so when more than one entry remained only the first got rebased.
    if(num_block2_ids < block2_orig_size) {
        const uint32_t remaining_base = raw_offset_index2[num_block2_ids];
        for(size_t i = 0; i < (block2_orig_size - num_block2_ids); i++) {
            raw_offset_index2[num_block2_ids + i] -= remaining_base;
        }
    }

    block2->offset_index.load(raw_offset_index2 + num_block2_ids, block2_orig_size - num_block2_ids);

    // merge offsets
    uint32_t* raw_offsets1 = block1->offsets.uncompress();
    uint32_t* raw_offsets2 = block2->offsets.uncompress();
    // number of offset values owned by the moved ids
    size_t num_block2_offset_elements = (num_block2_ids == block2_orig_size) ? block2->offsets.getLength() :
                                        raw_offset_index2[num_block2_ids];

    uint32_t* raw_offsets = new uint32_t[block1->offsets.getLength() + num_block2_offset_elements];

    uint32_t min = raw_offsets1[0], max = raw_offsets1[0];

    // we have to manually copy over so we can find the new min and max
    for(size_t i = 0; i < block1->offsets.getLength(); i++) {
        raw_offsets[i] = raw_offsets1[i];
        if(raw_offsets[i] < min) {
            min = raw_offsets[i];
        }

        if(raw_offsets[i] > max) {
            max = raw_offsets[i];
        }
    }

    size_t block2_base_index = block1->offsets.getLength();

    for(size_t i = 0; i < num_block2_offset_elements; i++) {
        size_t j = block2_base_index + i;
        raw_offsets[j] = raw_offsets2[i];

        if(raw_offsets[j] < min) {
            min = raw_offsets[j];
        }

        if(raw_offsets[j] > max) {
            max = raw_offsets[j];
        }
    }

    block1->offsets.load(raw_offsets, block1->offsets.getLength() + num_block2_offset_elements, min, max);

    // reset block2 offsets with remaining elements
    if(block2->offsets.getLength() != num_block2_offset_elements) {
        const size_t block2_new_offsets_length = (block2->offsets.getLength() - num_block2_offset_elements);
        uint32_t* block2_new_raw_offsets = new uint32_t[block2_new_offsets_length];
        min = max = raw_offsets2[num_block2_offset_elements];
        for(size_t i = 0; i < block2_new_offsets_length; i++) {
            block2_new_raw_offsets[i] = raw_offsets2[num_block2_offset_elements + i];
            if(block2_new_raw_offsets[i] < min) {
                min = block2_new_raw_offsets[i];
            }

            if(block2_new_raw_offsets[i] > max) {
                max = block2_new_raw_offsets[i];
            }
        }
        block2->offsets.load(block2_new_raw_offsets, block2_new_offsets_length, min, max);
        delete [] block2_new_raw_offsets;
    } else {
        block2->offsets.load(nullptr, 0, 0, 0);
    }

    delete [] raw_offset_index1;
    delete [] raw_offset_index2;
    delete [] raw_offset_index;

    delete [] raw_offsets1;
    delete [] raw_offsets2;
    delete [] raw_offsets;
}
|
||||
|
||||
// Splits `src_block` roughly in half, moving the upper half of its ids, offset
// indices and offsets into `dst_block` (assumed empty). No-op for size <= 1.
void posting_list_t::split_block(posting_list_t::block_t* src_block, posting_list_t::block_t* dst_block) {
    if(src_block->size() <= 1) {
        return;
    }

    uint32_t* raw_ids = src_block->ids.uncompress();
    size_t ids_first_half_length = (src_block->size() / 2);
    size_t ids_second_half_length = (src_block->size() - ids_first_half_length);
    src_block->ids.load(raw_ids, ids_first_half_length);
    dst_block->ids.load(raw_ids + ids_first_half_length, ids_second_half_length);

    uint32_t* raw_offset_indices = src_block->offset_index.uncompress();
    size_t offset_indices_first_half_length = (src_block->offset_index.getLength() / 2);
    size_t offset_indices_second_half_length = (src_block->offset_index.getLength() - offset_indices_first_half_length);
    src_block->offset_index.load(raw_offset_indices, offset_indices_first_half_length);

    // update second half to use zero based index
    // (base is captured *before* the loop mutates the array — destructive-loop safe)
    uint32_t base_index_diff = raw_offset_indices[offset_indices_first_half_length];
    for(size_t i = 0; i < offset_indices_second_half_length; i++) {
        raw_offset_indices[offset_indices_first_half_length + i] -= base_index_diff;
    }

    dst_block->offset_index.load(raw_offset_indices + offset_indices_first_half_length, offset_indices_second_half_length);

    uint32_t* raw_offsets = src_block->offsets.uncompress();
    size_t src_offsets_length = src_block->offsets.getLength();

    // load first half of offsets

    // the first second-half offset_index entry is exactly the number of offsets
    // owned by the first half
    size_t offset_first_half_length = base_index_diff;

    // we need to find new min and max
    uint32_t min = raw_offsets[0], max = raw_offsets[0];

    for(size_t i = 0; i < offset_first_half_length; i++) {
        if(raw_offsets[i] < min) {
            min = raw_offsets[i];
        }

        if(raw_offsets[i] > max) {
            max = raw_offsets[i];
        }
    }

    src_block->offsets.load(raw_offsets, offset_first_half_length, min, max);

    // load second half

    min = max = raw_offsets[offset_first_half_length];
    for(size_t i = offset_first_half_length; i < src_offsets_length; i++) {
        if(raw_offsets[i] < min) {
            min = raw_offsets[i];
        }

        if(raw_offsets[i] > max) {
            max = raw_offsets[i];
        }
    }

    size_t offsets_second_half_length = src_offsets_length - offset_first_half_length;
    dst_block->offsets.load(raw_offsets + offset_first_half_length, offsets_second_half_length, min, max);

    delete [] raw_ids;
    delete [] raw_offset_indices;
    delete [] raw_offsets;
}
|
||||
|
||||
// Inserts `id` with its offsets into the appropriate block, splitting a full
// block (or appending a fresh one) and keeping `id_block_map` in sync.
void posting_list_t::upsert(const uint32_t id, const std::vector<uint32_t>& offsets) {
    // first we will locate the block where `id` should reside
    block_t* upsert_block;
    last_id_t before_upsert_last_id;

    if(id_block_map.empty()) {
        upsert_block = &root_block;
        // sentinel: no map entry exists yet for the root block
        before_upsert_last_id = UINT32_MAX;
    } else {
        // first block whose last id is >= `id`; fall back to the very last block
        // when `id` is greater than every stored id
        const auto it = id_block_map.lower_bound(id);
        upsert_block = (it == id_block_map.end()) ? id_block_map.rbegin()->second : it->second;
        before_upsert_last_id = upsert_block->ids.at(upsert_block->size() - 1);
    }

    // happy path: upsert_block is not full
    if(upsert_block->size() < BLOCK_MAX_ELEMENTS) {
        upsert_block->upsert(id, offsets);
        last_id_t after_upsert_last_id = upsert_block->ids.at(upsert_block->size() - 1);
        if(before_upsert_last_id != after_upsert_last_id) {
            id_block_map.erase(before_upsert_last_id);
            id_block_map.emplace(after_upsert_last_id, upsert_block);
        }
    } else {
        block_t* new_block = new block_t;

        if(upsert_block->next == nullptr && upsert_block->ids.last() < id) {
            // appending to the end of the last block where the id will reside on a newly block
            new_block->upsert(id, offsets);
        } else {
            // upsert and then split block
            upsert_block->upsert(id, offsets);

            // evenly divide elements between both blocks
            split_block(upsert_block, new_block);

            last_id_t after_upsert_last_id = upsert_block->ids.at(upsert_block->size() - 1);
            id_block_map.erase(before_upsert_last_id);
            id_block_map.emplace(after_upsert_last_id, upsert_block);
        }

        last_id_t after_new_block_id = new_block->ids.at(new_block->size() - 1);
        id_block_map.emplace(after_new_block_id, new_block);

        // link the new block right after upsert_block
        new_block->next = upsert_block->next;
        upsert_block->next = new_block;
    }
}
|
||||
|
||||
// Removes `id` from its block. An empty block is unlinked and freed; a block
// that drops below 50% occupancy is refilled from (or merged with) its successor.
void posting_list_t::erase(const uint32_t id) {
    // only the first block whose last id is >= `id` can contain it
    const auto it = id_block_map.lower_bound(id);

    if(it == id_block_map.end()) {
        return;
    }

    block_t* erase_block = it->second;
    last_id_t before_last_id = it->first;
    erase_block->erase(id);

    size_t new_ids_length = erase_block->size();

    if(new_ids_length == 0) {
        // happens when the last element of last block is deleted

        if(erase_block != &root_block) {
            // since we will be deleting the empty node, set the previous node's next pointer to null
            std::prev(it)->second->next = nullptr;
            delete erase_block;
        }

        id_block_map.erase(before_last_id);

        return;
    }

    if(new_ids_length >= BLOCK_MAX_ELEMENTS/2 || erase_block->next == nullptr) {
        // block still adequately filled (or has no successor to refill from):
        // just repair the map entry if the block's last id changed
        last_id_t after_last_id = erase_block->ids.at(new_ids_length-1);
        if(before_last_id != after_last_id) {
            id_block_map.erase(before_last_id);
            id_block_map.emplace(after_last_id, erase_block);
        }

        return;
    }

    // block is less than 50% of max capacity and contains a next node which we can refill from

    auto next_block = erase_block->next;
    last_id_t next_block_last_id = next_block->ids.at(next_block->ids.getLength()-1);

    if(erase_block->size() + next_block->size() <= BLOCK_MAX_ELEMENTS) {
        // we can merge the contents of next block with `erase_block` and delete the next block
        merge_adjacent_blocks(erase_block, next_block, next_block->size());
        erase_block->next = next_block->next;
        delete next_block;

        id_block_map.erase(next_block_last_id);
    } else {
        // only part of the next block can be moved over
        size_t num_block2_ids = BLOCK_MAX_ELEMENTS - erase_block->size();
        merge_adjacent_blocks(erase_block, next_block, num_block2_ids);
        // NOTE: we don't have to update `id_block_map` for `next_block` as last element doesn't change
    }

    last_id_t after_last_id = erase_block->ids.at(erase_block->ids.getLength()-1);
    if(before_last_id != after_last_id) {
        id_block_map.erase(before_last_id);
        id_block_map.emplace(after_last_id, erase_block);
    }
}
|
||||
|
||||
// Exposes the head of the block chain (used by tests and iteration).
posting_list_t::block_t* posting_list_t::get_root() {
    return &root_block;
}
|
||||
|
||||
// Returns the number of blocks in the list (one map entry per block),
// NOT the total number of document IDs stored.
size_t posting_list_t::size() {
    return id_block_map.size();
}
|
||||
|
||||
// Exact-match lookup: `id` must be the *last* id stored in some block;
// returns nullptr when no block ends with `id`.
posting_list_t::block_t* posting_list_t::block_of(last_id_t id) {
    const auto found = id_block_map.find(id);
    return (found == id_block_map.end()) ? nullptr : found->second;
}
|
@ -397,3 +397,11 @@ void sorted_array::binary_count_indices(const uint32_t *values, int low_vindex,
|
||||
binary_count_indices(values, pivot_vindex+1, high_vindex, src, in_index, high_index, num_found);
|
||||
}
|
||||
}
|
||||
|
||||
uint32_t sorted_array::last() {
|
||||
if(getLength() == 0) {
|
||||
return UINT32_MAX;
|
||||
}
|
||||
|
||||
return at(getLength()-1);
|
||||
}
|
||||
|
331
test/posting_list_test.cpp
Normal file
331
test/posting_list_test.cpp
Normal file
@ -0,0 +1,331 @@
|
||||
#include <gtest/gtest.h>
|
||||
#include "posting_list.h"
|
||||
#include <vector>
|
||||
|
||||
// Verifies block layout after sequential, alternating and mid-list inserts,
// including the two split cases (new id lands in first vs second half).
TEST(PostingListTest, Insert) {
    std::vector<uint32_t> offsets = {0, 1, 3};

    posting_list_t pl(5);

    // insert elements sequentially

    for(size_t i = 0; i < 15; i++) {
        pl.upsert(i, offsets);
    }

    posting_list_t::block_t* root = pl.get_root();
    ASSERT_EQ(5, root->ids.getLength());
    ASSERT_EQ(5, root->next->ids.getLength());
    ASSERT_EQ(5, root->next->next->ids.getLength());

    ASSERT_EQ(root->next->next->next, nullptr);

    ASSERT_EQ(3, pl.size());
    ASSERT_EQ(root, pl.block_of(4));
    ASSERT_EQ(root->next, pl.block_of(9));
    ASSERT_EQ(root->next->next, pl.block_of(14));

    // insert alternate values

    posting_list_t pl2(5);

    for(size_t i = 0; i < 15; i+=2) {
        // [0, 2, 4, 6, 8], [10, 12, 14]
        pl2.upsert(i, offsets);
    }

    root = pl2.get_root();
    ASSERT_EQ(5, root->ids.getLength());
    ASSERT_EQ(3, root->next->ids.getLength());

    ASSERT_EQ(root->next->next, nullptr);
    ASSERT_EQ(2, pl2.size());

    ASSERT_EQ(root, pl2.block_of(8));
    ASSERT_EQ(root->next, pl2.block_of(14));

    // insert in the middle
    // case 1

    posting_list_t pl3(5);

    for(size_t i = 0; i < 5; i++) {
        pl3.upsert(i, offsets);
    }

    pl3.upsert(6, offsets);
    pl3.upsert(8, offsets);
    pl3.upsert(9, offsets);
    pl3.upsert(10, offsets);
    pl3.upsert(12, offsets);

    // [0,1,2,3,4], [6,8,9,10,12]
    pl3.upsert(5, offsets);
    ASSERT_EQ(3, pl3.size());
    ASSERT_EQ(5, pl3.get_root()->ids.getLength());
    ASSERT_EQ(3, pl3.get_root()->next->ids.getLength());
    ASSERT_EQ(8, pl3.get_root()->next->ids.last());
    ASSERT_EQ(3, pl3.get_root()->next->next->ids.getLength());
    ASSERT_EQ(12, pl3.get_root()->next->next->ids.last());

    // every doc carries 3 offsets, so offset_index should step by 3
    for(size_t i = 0; i < pl3.get_root()->next->offset_index.getLength(); i++) {
        ASSERT_EQ(i * 3, pl3.get_root()->next->offset_index.at(i));
    }

    for(size_t i = 0; i < pl3.get_root()->next->offsets.getLength(); i++) {
        ASSERT_EQ(offsets[i % 3], pl3.get_root()->next->offsets.at(i));
    }

    // case 2
    posting_list_t pl4(5);

    for(size_t i = 0; i < 5; i++) {
        pl4.upsert(i, offsets);
    }

    pl4.upsert(6, offsets);
    pl4.upsert(8, offsets);
    pl4.upsert(9, offsets);
    pl4.upsert(10, offsets);
    pl4.upsert(12, offsets);

    // [0,1,2,3,4], [6,8,9,10,12]
    pl4.upsert(11, offsets);
    ASSERT_EQ(3, pl4.size());

    ASSERT_EQ(5, pl4.get_root()->ids.getLength());
    ASSERT_EQ(3, pl4.get_root()->next->ids.getLength());
    ASSERT_EQ(9, pl4.get_root()->next->ids.last());
    ASSERT_EQ(3, pl4.get_root()->next->next->ids.getLength());
    ASSERT_EQ(12, pl4.get_root()->next->next->ids.last());

    for(size_t i = 0; i < pl4.get_root()->next->offset_index.getLength(); i++) {
        ASSERT_EQ(i * 3, pl4.get_root()->next->offset_index.at(i));
    }

    for(size_t i = 0; i < pl4.get_root()->next->offsets.getLength(); i++) {
        ASSERT_EQ(offsets[i % 3], pl4.get_root()->next->offsets.at(i));
    }
}
|
||||
|
||||
// Erasures targeting the root block: empty-list erase, single-element erase,
// non-existent id, and the under-50%-occupancy merge with the next block.
TEST(PostingListTest, RemovalsOnFirstBlock) {
    std::vector<uint32_t> offsets = {0, 1, 3};
    posting_list_t pl(5);

    ASSERT_EQ(0, pl.size());

    // try to erase when posting list is empty
    pl.erase(0);

    ASSERT_EQ(0, pl.size());

    // insert a single element and erase it
    pl.upsert(0, offsets);
    ASSERT_EQ(1, pl.size());
    pl.erase(0);
    ASSERT_EQ(0, pl.size());

    ASSERT_EQ(0, pl.get_root()->ids.getLength());
    ASSERT_EQ(0, pl.get_root()->offset_index.getLength());
    ASSERT_EQ(0, pl.get_root()->offsets.getLength());

    // insert until one past max block size
    for(size_t i = 0; i < 6; i++) {
        pl.upsert(i, offsets);
    }

    ASSERT_EQ(2, pl.size());

    // delete non-existing element
    pl.erase(1000);

    // delete elements from first block: blocks should not be merged until it falls below 50% occupancy
    pl.erase(1);
    ASSERT_EQ(2, pl.size());

    // [0, 2, 3, 4], [5]

    for(size_t i = 0; i < pl.get_root()->offset_index.getLength(); i++) {
        ASSERT_EQ(i * 3, pl.get_root()->offset_index.at(i));
    }

    for(size_t i = 0; i < pl.get_root()->offsets.getLength(); i++) {
        ASSERT_EQ(offsets[i % 3], pl.get_root()->offsets.at(i));
    }

    pl.erase(2);
    ASSERT_EQ(2, pl.size());
    pl.erase(3);

    // [0, 4], [5]
    ASSERT_EQ(2, pl.size());
    ASSERT_EQ(2, pl.get_root()->size());
    ASSERT_EQ(1, pl.get_root()->next->size());
    ASSERT_EQ(pl.get_root(), pl.block_of(4));
    ASSERT_EQ(pl.get_root()->next, pl.block_of(5));

    for(size_t i = 0; i < pl.get_root()->offset_index.getLength(); i++) {
        ASSERT_EQ(i * 3, pl.get_root()->offset_index.at(i));
    }

    for(size_t i = 0; i < pl.get_root()->offsets.getLength(); i++) {
        ASSERT_EQ(offsets[i % 3], pl.get_root()->offsets.at(i));
    }

    pl.erase(4); // this will trigger the merge

    // [0, 5]
    // ensure that merge has happened
    ASSERT_EQ(1, pl.size());
    ASSERT_EQ(pl.get_root(), pl.block_of(5));
    ASSERT_EQ(nullptr, pl.get_root()->next);
    ASSERT_EQ(2, pl.get_root()->size());

    for(size_t i = 0; i < pl.get_root()->offset_index.getLength(); i++) {
        ASSERT_EQ(i * 3, pl.get_root()->offset_index.at(i));
    }

    for(size_t i = 0; i < pl.get_root()->offsets.getLength(); i++) {
        ASSERT_EQ(offsets[i % 3], pl.get_root()->offsets.at(i));
    }
}
|
||||
|
||||
// Erasures on non-root blocks: deleting a whole trailing block, and the partial
// refill path where only some of the next block's ids can be moved over.
TEST(PostingListTest, RemovalsOnLaterBlocks) {
    std::vector<uint32_t> offsets = {0, 1, 3};
    posting_list_t pl(5);

    // insert until one past max block size
    for(size_t i = 0; i < 6; i++) {
        pl.upsert(i, offsets);
    }

    // erase last element of last, non-first block

    pl.erase(5);
    ASSERT_EQ(1, pl.size());
    ASSERT_EQ(5, pl.get_root()->size());
    ASSERT_EQ(4, pl.get_root()->ids.last());
    ASSERT_EQ(nullptr, pl.get_root()->next);

    for(size_t i = 0; i < pl.get_root()->offset_index.getLength(); i++) {
        ASSERT_EQ(i * 3, pl.get_root()->offset_index.at(i));
    }

    for(size_t i = 0; i < pl.get_root()->offsets.getLength(); i++) {
        ASSERT_EQ(offsets[i % 3], pl.get_root()->offsets.at(i));
    }

    // erase last element of the only block when block is atleast half full
    pl.erase(4);
    ASSERT_EQ(1, pl.size());
    ASSERT_EQ(4, pl.get_root()->size());
    ASSERT_EQ(3, pl.get_root()->ids.last());
    ASSERT_EQ(pl.get_root(), pl.block_of(3));

    for(size_t i = 4; i < 15; i++) {
        pl.upsert(i, offsets);
    }

    // [0..4], [5..9], [10..14]
    pl.erase(5);
    pl.erase(6);
    pl.erase(7);

    for(size_t i = 0; i < pl.get_root()->next->offset_index.getLength(); i++) {
        ASSERT_EQ(i * 3, pl.get_root()->next->offset_index.at(i));
    }

    for(size_t i = 0; i < pl.get_root()->next->offsets.getLength(); i++) {
        ASSERT_EQ(offsets[i % 3], pl.get_root()->next->offsets.at(i));
    }

    for(size_t i = 0; i < pl.get_root()->next->next->offset_index.getLength(); i++) {
        ASSERT_EQ(i * 3, pl.get_root()->next->next->offset_index.at(i));
    }

    for(size_t i = 0; i < pl.get_root()->next->next->offsets.getLength(); i++) {
        ASSERT_EQ(offsets[i % 3], pl.get_root()->next->next->offsets.at(i));
    }

    // only part of the next node contents can be moved over when we delete 8 since (1 + 5) > 5
    pl.erase(8);

    // [0..4], [9], [10..14] => [0..4], [9,10,11,12,13], [14]

    ASSERT_EQ(3, pl.size());
    ASSERT_EQ(5, pl.get_root()->next->size());
    ASSERT_EQ(1, pl.get_root()->next->next->size());
    ASSERT_EQ(13, pl.get_root()->next->ids.last());
    ASSERT_EQ(14, pl.get_root()->next->next->ids.last());

    for(size_t i = 0; i < pl.get_root()->next->offset_index.getLength(); i++) {
        ASSERT_EQ(i * 3, pl.get_root()->next->offset_index.at(i));
    }

    for(size_t i = 0; i < pl.get_root()->next->offsets.getLength(); i++) {
        ASSERT_EQ(offsets[i % 3], pl.get_root()->next->offsets.at(i));
    }

    for(size_t i = 0; i < pl.get_root()->next->next->offset_index.getLength(); i++) {
        ASSERT_EQ(i * 3, pl.get_root()->next->next->offset_index.at(i));
    }

    for(size_t i = 0; i < pl.get_root()->next->next->offsets.getLength(); i++) {
        ASSERT_EQ(offsets[i % 3], pl.get_root()->next->next->offsets.at(i));
    }
}
|
||||
|
||||
// Descending inserts followed by a small and a very large id, verifying
// block split and offset bookkeeping for out-of-order arrivals.
TEST(PostingListTest, OutOfOrderUpserts) {
    std::vector<uint32_t> offsets = {0, 1, 3};
    posting_list_t pl(5);

    for(int i = 5; i > 0; i--) {
        pl.upsert(i, offsets);
    }

    pl.upsert(0, offsets);
    pl.upsert(200000, offsets);

    ASSERT_EQ(2, pl.size());

    ASSERT_EQ(3, pl.get_root()->size());
    ASSERT_EQ(4, pl.get_root()->next->size());

    for(size_t i = 0; i < pl.get_root()->offset_index.getLength(); i++) {
        ASSERT_EQ(i * 3, pl.get_root()->offset_index.at(i));
    }

    for(size_t i = 0; i < pl.get_root()->offsets.getLength(); i++) {
        ASSERT_EQ(offsets[i % 3], pl.get_root()->offsets.at(i));
    }

    for(size_t i = 0; i < pl.get_root()->next->offset_index.getLength(); i++) {
        ASSERT_EQ(i * 3, pl.get_root()->next->offset_index.at(i));
    }

    for(size_t i = 0; i < pl.get_root()->next->offsets.getLength(); i++) {
        ASSERT_EQ(offsets[i % 3], pl.get_root()->next->offsets.at(i));
    }
}
|
||||
|
||||
// Stress test: 100K random upserts (ids in [0, 100000), alternating offset
// payloads) followed by 10K random erases; sanity-checks the resulting block
// count. Fix: removed the unused `offsets` binding in the erase loop.
TEST(PostingListTest, RandomInsertAndDeletes) {
    time_t t;
    srand((unsigned) time(&t));

    posting_list_t pl(100);
    std::vector<uint32_t> offsets1 = {0, 1, 3};
    std::vector<uint32_t> offsets2 = {10, 12};

    for(size_t i = 0; i < 100000; i++) {
        const std::vector<uint32_t>& offsets = (i % 2 == 0) ? offsets1 : offsets2;
        pl.upsert(rand() % 100000, offsets);
    }

    for(size_t i = 0; i < 10000; i++) {
        pl.erase(rand() % 100000);
    }

    // ~100K random ids over a 100K range at 100 ids/block should land in this band
    bool size_within_range = (pl.size() < 1500) && (pl.size() > 1000);
    ASSERT_TRUE(size_within_range);
}
|
Loading…
x
Reference in New Issue
Block a user