Fix posting list deletion edge case.

This commit is contained in:
Kishore Nallan 2021-11-30 15:56:25 +05:30
parent d9b969aab5
commit 6ded5c7556
2 changed files with 117 additions and 35 deletions

View File

@ -186,7 +186,9 @@ bool posting_list_t::block_t::contains(uint32_t id) {
/* posting_list_t operations */
posting_list_t::posting_list_t(uint16_t max_block_elements): BLOCK_MAX_ELEMENTS(max_block_elements) {
if(max_block_elements <= 1) {
throw std::invalid_argument("max_block_elements must be > 1");
}
}
posting_list_t::~posting_list_t() {
@ -423,7 +425,7 @@ void posting_list_t::upsert(const uint32_t id, const std::vector<uint32_t>& offs
} else {
const auto it = id_block_map.lower_bound(id);
upsert_block = (it == id_block_map.end()) ? id_block_map.rbegin()->second : it->second;
before_upsert_last_id = upsert_block->ids.at(upsert_block->size() - 1);
before_upsert_last_id = upsert_block->ids.last();
}
// happy path: upsert_block is not full
@ -431,7 +433,7 @@ void posting_list_t::upsert(const uint32_t id, const std::vector<uint32_t>& offs
uint32_t num_inserted = upsert_block->upsert(id, offsets);
ids_length += num_inserted;
last_id_t after_upsert_last_id = upsert_block->ids.at(upsert_block->size() - 1);
last_id_t after_upsert_last_id = upsert_block->ids.last();
if(before_upsert_last_id != after_upsert_last_id) {
id_block_map.erase(before_upsert_last_id);
id_block_map.emplace(after_upsert_last_id, upsert_block);
@ -451,12 +453,12 @@ void posting_list_t::upsert(const uint32_t id, const std::vector<uint32_t>& offs
// evenly divide elements between both blocks
split_block(upsert_block, new_block);
last_id_t after_upsert_last_id = upsert_block->ids.at(upsert_block->size() - 1);
last_id_t after_upsert_last_id = upsert_block->ids.last();
id_block_map.erase(before_upsert_last_id);
id_block_map.emplace(after_upsert_last_id, upsert_block);
}
last_id_t after_new_block_id = new_block->ids.at(new_block->size() - 1);
last_id_t after_new_block_id = new_block->ids.last();
id_block_map.emplace(after_new_block_id, new_block);
new_block->next = upsert_block->next;
@ -485,6 +487,17 @@ void posting_list_t::erase(const uint32_t id) {
// since we will be deleting the empty node, set the previous node's next pointer to null
std::prev(it)->second->next = nullptr;
delete erase_block;
} else {
// The root block cannot be empty if there are other blocks so we will pull some contents from next block
// This is only an issue for blocks with max size of 2
if(root_block.next != nullptr) {
auto next_block_last_id = erase_block->next->ids.last();
merge_adjacent_blocks(erase_block, erase_block->next, erase_block->next->size()/2);
id_block_map.erase(next_block_last_id);
id_block_map.emplace(erase_block->next->ids.last(), erase_block->next);
id_block_map.emplace(erase_block->ids.last(), erase_block);
}
}
id_block_map.erase(before_last_id);
@ -493,7 +506,7 @@ void posting_list_t::erase(const uint32_t id) {
}
if(new_ids_length >= BLOCK_MAX_ELEMENTS/2 || erase_block->next == nullptr) {
last_id_t after_last_id = erase_block->ids.at(new_ids_length-1);
last_id_t after_last_id = erase_block->ids.last();
if(before_last_id != after_last_id) {
id_block_map.erase(before_last_id);
id_block_map.emplace(after_last_id, erase_block);
@ -505,7 +518,7 @@ void posting_list_t::erase(const uint32_t id) {
// block is less than 50% of max capacity and contains a next node which we can refill from
auto next_block = erase_block->next;
last_id_t next_block_last_id = next_block->ids.at(next_block->ids.getLength()-1);
last_id_t next_block_last_id = next_block->ids.last();
if(erase_block->size() + next_block->size() <= BLOCK_MAX_ELEMENTS) {
// we can merge the contents of next block with `erase_block` and delete the next block
@ -515,13 +528,15 @@ void posting_list_t::erase(const uint32_t id) {
id_block_map.erase(next_block_last_id);
} else {
// only part of the next block can be moved over
size_t num_block2_ids = BLOCK_MAX_ELEMENTS - erase_block->size();
// Only part of the next block can be moved over.
// We will move only 50% of max elements to ensure that we don't end up "flipping" adjacent blocks:
// 1, 5 -> 5, 1
size_t num_block2_ids = BLOCK_MAX_ELEMENTS/2;
merge_adjacent_blocks(erase_block, next_block, num_block2_ids);
// NOTE: we don't have to update `id_block_map` for `next_block` as last element doesn't change
}
last_id_t after_last_id = erase_block->ids.at(erase_block->ids.getLength()-1);
last_id_t after_last_id = erase_block->ids.last();
if(before_last_id != after_last_id) {
id_block_map.erase(before_last_id);
id_block_map.emplace(after_last_id, erase_block);

View File

@ -447,13 +447,13 @@ TEST_F(PostingListTest, RemovalsOnLaterBlocks) {
// only part of the next node contents can be moved over when we delete 8 since (1 + 5) > 5
pl.erase(8);
// [0..4], [9], [10..14] => [0..4], [9,10,11,12,13], [14]
// [0..4], [9], [10..14] => [0..4], [9,10,11], [12,13,14]
ASSERT_EQ(3, pl.num_blocks());
ASSERT_EQ(11, pl.num_ids());
ASSERT_EQ(5, pl.get_root()->next->size());
ASSERT_EQ(1, pl.get_root()->next->next->size());
ASSERT_EQ(13, pl.get_root()->next->ids.last());
ASSERT_EQ(3, pl.get_root()->next->size());
ASSERT_EQ(3, pl.get_root()->next->next->size());
ASSERT_EQ(11, pl.get_root()->next->ids.last());
ASSERT_EQ(14, pl.get_root()->next->next->ids.last());
for(size_t i = 0; i < pl.get_root()->next->offset_index.getLength(); i++) {
@ -616,27 +616,6 @@ TEST_F(PostingListTest, SplittingOfListsSimple) {
std::vector<std::vector<posting_list_t::iterator_t>> partial_its_vec(4);
intersector.split_lists(4, partial_its_vec);
/*for(size_t i = 0; i < partial_its_vec.size(); i++) {
auto& partial_its = partial_its_vec[i];
if (partial_its.empty()) {
continue;
}
LOG(INFO) << "Vec " << i;
for (auto& it: partial_its) {
while (it.valid()) {
LOG(INFO) << it.id();
it.next();
}
LOG(INFO) << "---";
}
}
return ;*/
std::vector<std::vector<std::vector<uint32_t>>> split_ids = {
{{0, 2}, {1, 3}, {2, 3}},
{{3, 20}, {1, 3, 5, 10, 20}, {2, 3, 5, 7, 20}}
@ -1384,6 +1363,94 @@ TEST_F(PostingListTest, BlockIntersectionOnMixedLists) {
free(list1);
}
TEST_F(PostingListTest, InsertAndEraseSequence) {
std::vector<uint32_t> offsets = {0, 1, 3};
posting_list_t pl(5);
pl.upsert(0, offsets);
pl.upsert(2, offsets);
pl.upsert(4, offsets);
pl.upsert(6, offsets);
pl.upsert(8, offsets);
// this will cause a split of the root block
pl.upsert(3, offsets); // 0,2,3 | 4,6,8
pl.erase(0); // 2,3 | 4,6,8
pl.upsert(5, offsets); // 2,3 | 4,5,6,8
pl.upsert(7, offsets); // 2,3 | 4,5,6,7,8
pl.upsert(10, offsets); // 2,3 | 4,5,6,7,8 | 10
// this will cause adjacent block refill
pl.erase(2); // 3,4,5,6,7 | 8 | 10
// deletes second block
pl.erase(8);
// remove all elements
pl.erase(3);
pl.erase(4);
pl.erase(5);
pl.erase(6);
pl.erase(7);
pl.erase(10);
ASSERT_EQ(0, pl.num_ids());
}
TEST_F(PostingListTest, InsertAndEraseSequenceWithBlockSizeTwo) {
std::vector<uint32_t> offsets = {0, 1, 3};
posting_list_t pl(2);
pl.upsert(2, offsets);
pl.upsert(3, offsets);
pl.upsert(1, offsets); // inserting 2 again here? // inserting 4 here?
// 1 | 2,3
pl.erase(1);
ASSERT_EQ(1, pl.get_root()->size());
ASSERT_EQ(2, pl.num_blocks());
pl.erase(3);
pl.erase(2);
ASSERT_EQ(0, pl.get_root()->size());
}
TEST_F(PostingListTest, PostingListMustHaveAtleast1Element) {
try {
std::vector<uint32_t> offsets = {0, 1, 3};
posting_list_t pl(1);
FAIL() << "Expected std::invalid_argument";
}
catch(std::invalid_argument const & err) {
EXPECT_EQ(err.what(),std::string("max_block_elements must be > 1"));
} catch(...) {
FAIL() << "Expected std::invalid_argument";
}
}
TEST_F(PostingListTest, DISABLED_RandInsertAndErase) {
std::vector<uint32_t> offsets = {0, 1, 3};
posting_list_t pl(5);
time_t t;
srand((unsigned) time(&t));
for(size_t i = 0; i < 10000; i++) {
LOG(INFO) << "i: " << i;
uint32_t add_id = rand() % 15;
pl.upsert(add_id, offsets);
uint32_t del_id = rand() % 15;
LOG(INFO) << "add: " << add_id << ", erase: " << del_id;
pl.erase(del_id);
}
LOG(INFO) << "Num ids: " << pl.num_ids() << ", num bocks: " << pl.num_blocks();
}
TEST_F(PostingListTest, DISABLED_Benchmark) {
std::vector<uint32_t> offsets = {0, 1, 3};
posting_list_t pl(4096);