From 6ded5c7556f75ad2edfaa80e38d48429a8c916be Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Tue, 30 Nov 2021 15:56:25 +0530 Subject: [PATCH] Fix posting list deletion edge case. --- src/posting_list.cpp | 35 +++++++---- test/posting_list_test.cpp | 117 +++++++++++++++++++++++++++++-------- 2 files changed, 117 insertions(+), 35 deletions(-) diff --git a/src/posting_list.cpp b/src/posting_list.cpp index 823dd028..c888225e 100644 --- a/src/posting_list.cpp +++ b/src/posting_list.cpp @@ -186,7 +186,9 @@ bool posting_list_t::block_t::contains(uint32_t id) { /* posting_list_t operations */ posting_list_t::posting_list_t(uint16_t max_block_elements): BLOCK_MAX_ELEMENTS(max_block_elements) { - + if(max_block_elements <= 1) { + throw std::invalid_argument("max_block_elements must be > 1"); + } } posting_list_t::~posting_list_t() { @@ -423,7 +425,7 @@ void posting_list_t::upsert(const uint32_t id, const std::vector& offs } else { const auto it = id_block_map.lower_bound(id); upsert_block = (it == id_block_map.end()) ? id_block_map.rbegin()->second : it->second; - before_upsert_last_id = upsert_block->ids.at(upsert_block->size() - 1); + before_upsert_last_id = upsert_block->ids.last(); } // happy path: upsert_block is not full @@ -431,7 +433,7 @@ void posting_list_t::upsert(const uint32_t id, const std::vector& offs uint32_t num_inserted = upsert_block->upsert(id, offsets); ids_length += num_inserted; - last_id_t after_upsert_last_id = upsert_block->ids.at(upsert_block->size() - 1); + last_id_t after_upsert_last_id = upsert_block->ids.last(); if(before_upsert_last_id != after_upsert_last_id) { id_block_map.erase(before_upsert_last_id); id_block_map.emplace(after_upsert_last_id, upsert_block); @@ -451,12 +453,12 @@ void posting_list_t::upsert(const uint32_t id, const std::vector& offs // evenly divide elements between both blocks split_block(upsert_block, new_block); - last_id_t after_upsert_last_id = upsert_block->ids.at(upsert_block->size() - 1); + last_id_t after_upsert_last_id = upsert_block->ids.last(); id_block_map.erase(before_upsert_last_id); id_block_map.emplace(after_upsert_last_id, upsert_block); } - last_id_t after_new_block_id = new_block->ids.at(new_block->size() - 1); + last_id_t after_new_block_id = new_block->ids.last(); id_block_map.emplace(after_new_block_id, new_block); new_block->next = upsert_block->next; @@ -485,6 +487,17 @@ void posting_list_t::erase(const uint32_t id) { // since we will be deleting the empty node, set the previous node's next pointer to null std::prev(it)->second->next = nullptr; delete erase_block; + } else { + // The root block cannot be empty if there are other blocks so we will pull some contents from next block + // This is only an issue for blocks with max size of 2 + if(root_block.next != nullptr) { + auto next_block_last_id = erase_block->next->ids.last(); + merge_adjacent_blocks(erase_block, erase_block->next, erase_block->next->size()/2); + id_block_map.erase(next_block_last_id); + + id_block_map.emplace(erase_block->next->ids.last(), erase_block->next); + id_block_map.emplace(erase_block->ids.last(), erase_block); + } } id_block_map.erase(before_last_id); @@ -493,7 +506,7 @@ void posting_list_t::erase(const uint32_t id) { } if(new_ids_length >= BLOCK_MAX_ELEMENTS/2 || erase_block->next == nullptr) { - last_id_t after_last_id = erase_block->ids.at(new_ids_length-1); + last_id_t after_last_id = erase_block->ids.last(); if(before_last_id != after_last_id) { id_block_map.erase(before_last_id); id_block_map.emplace(after_last_id, erase_block); @@ -505,7 +518,7 @@ void posting_list_t::erase(const uint32_t id) { // block is less than 50% of max capacity and contains a next node which we can refill from auto next_block = erase_block->next; - last_id_t next_block_last_id = next_block->ids.at(next_block->ids.getLength()-1); + last_id_t next_block_last_id = next_block->ids.last(); if(erase_block->size() + next_block->size() <= BLOCK_MAX_ELEMENTS) { // we can merge the contents of next block with `erase_block` and delete the next block @@ -515,13 +528,15 @@ void posting_list_t::erase(const uint32_t id) { id_block_map.erase(next_block_last_id); } else { - // only part of the next block can be moved over - size_t num_block2_ids = BLOCK_MAX_ELEMENTS - erase_block->size(); + // Only part of the next block can be moved over. + // We will move only 50% of max elements to ensure that we don't end up "flipping" adjacent blocks: + // 1, 5 -> 5, 1 + size_t num_block2_ids = BLOCK_MAX_ELEMENTS/2; merge_adjacent_blocks(erase_block, next_block, num_block2_ids); // NOTE: we don't have to update `id_block_map` for `next_block` as last element doesn't change } - last_id_t after_last_id = erase_block->ids.at(erase_block->ids.getLength()-1); + last_id_t after_last_id = erase_block->ids.last(); if(before_last_id != after_last_id) { id_block_map.erase(before_last_id); id_block_map.emplace(after_last_id, erase_block); diff --git a/test/posting_list_test.cpp b/test/posting_list_test.cpp index 09c567ef..b9ecec3d 100644 --- a/test/posting_list_test.cpp +++ b/test/posting_list_test.cpp @@ -447,13 +447,13 @@ TEST_F(PostingListTest, RemovalsOnLaterBlocks) { // only part of the next node contents can be moved over when we delete 8 since (1 + 5) > 5 pl.erase(8); - // [0..4], [9], [10..14] => [0..4], [9,10,11,12,13], [14] + // [0..4], [9], [10..14] => [0..4], [9,10,11], [12,13,14] ASSERT_EQ(3, pl.num_blocks()); ASSERT_EQ(11, pl.num_ids()); - ASSERT_EQ(5, pl.get_root()->next->size()); - ASSERT_EQ(1, pl.get_root()->next->next->size()); - ASSERT_EQ(13, pl.get_root()->next->ids.last()); + ASSERT_EQ(3, pl.get_root()->next->size()); + ASSERT_EQ(3, pl.get_root()->next->next->size()); + ASSERT_EQ(11, pl.get_root()->next->ids.last()); ASSERT_EQ(14, pl.get_root()->next->next->ids.last()); for(size_t i = 0; i < pl.get_root()->next->offset_index.getLength(); i++) { @@ -616,27 +616,6 @@ TEST_F(PostingListTest, SplittingOfListsSimple) { std::vector> partial_its_vec(4); intersector.split_lists(4, partial_its_vec); - /*for(size_t i = 0; i < partial_its_vec.size(); i++) { - auto& partial_its = partial_its_vec[i]; - - if (partial_its.empty()) { - continue; - } - - LOG(INFO) << "Vec " << i; - - for (auto& it: partial_its) { - while (it.valid()) { - LOG(INFO) << it.id(); - it.next(); - } - - LOG(INFO) << "---"; - } - } - - return ;*/ - std::vector>> split_ids = { {{0, 2}, {1, 3}, {2, 3}}, {{3, 20}, {1, 3, 5, 10, 20}, {2, 3, 5, 7, 20}} @@ -1384,6 +1363,94 @@ TEST_F(PostingListTest, BlockIntersectionOnMixedLists) { free(list1); } +TEST_F(PostingListTest, InsertAndEraseSequence) { + std::vector offsets = {0, 1, 3}; + posting_list_t pl(5); + + pl.upsert(0, offsets); + pl.upsert(2, offsets); + pl.upsert(4, offsets); + pl.upsert(6, offsets); + pl.upsert(8, offsets); + + // this will cause a split of the root block + pl.upsert(3, offsets); // 0,2,3 | 4,6,8 + pl.erase(0); // 2,3 | 4,6,8 + pl.upsert(5, offsets); // 2,3 | 4,5,6,8 + pl.upsert(7, offsets); // 2,3 | 4,5,6,7,8 + pl.upsert(10, offsets); // 2,3 | 4,5,6,7,8 | 10 + + // this will cause adjacent block refill + pl.erase(2); // 3,4,5,6,7 | 8 | 10 + + // deletes second block + pl.erase(8); + + // remove all elements + pl.erase(3); + pl.erase(4); + pl.erase(5); + pl.erase(6); + pl.erase(7); + pl.erase(10); + + ASSERT_EQ(0, pl.num_ids()); +} + +TEST_F(PostingListTest, InsertAndEraseSequenceWithBlockSizeTwo) { + std::vector offsets = {0, 1, 3}; + posting_list_t pl(2); + + pl.upsert(2, offsets); + pl.upsert(3, offsets); + pl.upsert(1, offsets); // inserting 2 again here? // inserting 4 here? + + // 1 | 2,3 + + pl.erase(1); + + ASSERT_EQ(1, pl.get_root()->size()); + ASSERT_EQ(2, pl.num_blocks()); + + pl.erase(3); + pl.erase(2); + + ASSERT_EQ(0, pl.get_root()->size()); +} + +TEST_F(PostingListTest, PostingListMustHaveAtleast1Element) { + try { + std::vector offsets = {0, 1, 3}; + posting_list_t pl(1); + FAIL() << "Expected std::invalid_argument"; + } + catch(std::invalid_argument const & err) { + EXPECT_EQ(err.what(),std::string("max_block_elements must be > 1")); + } catch(...) { + FAIL() << "Expected std::invalid_argument"; + } +} + +TEST_F(PostingListTest, DISABLED_RandInsertAndErase) { + std::vector offsets = {0, 1, 3}; + posting_list_t pl(5); + + time_t t; + srand((unsigned) time(&t)); + + for(size_t i = 0; i < 10000; i++) { + LOG(INFO) << "i: " << i; + uint32_t add_id = rand() % 15; + pl.upsert(add_id, offsets); + + uint32_t del_id = rand() % 15; + LOG(INFO) << "add: " << add_id << ", erase: " << del_id; + pl.erase(del_id); + } + + LOG(INFO) << "Num ids: " << pl.num_ids() << ", num bocks: " << pl.num_blocks(); +} + TEST_F(PostingListTest, DISABLED_Benchmark) { std::vector offsets = {0, 1, 3}; posting_list_t pl(4096);