From 3428a740b693e767d815b19a1e7d16d6c9214e0d Mon Sep 17 00:00:00 2001
From: Kishore Nallan
Date: Sat, 19 Jun 2021 15:46:25 +0530
Subject: [PATCH] Fixed an edge case with posting list block merging.

---
 include/posting_list.h     |   8 +--
 src/posting_list.cpp       | 137 +++++++++++++++++++++++------------
 test/posting_list_test.cpp | 139 +++++++++++++++++++++++++++++++++++++
 3 files changed, 230 insertions(+), 54 deletions(-)

diff --git a/include/posting_list.h b/include/posting_list.h
index 64345302..938a5135 100644
--- a/include/posting_list.h
+++ b/include/posting_list.h
@@ -82,10 +82,6 @@ private:
     // MUST be ordered
     std::map id_block_map;
 
-    static void split_block(block_t* src_block, block_t* dst_block);
-
-    static void merge_adjacent_blocks(block_t* block1, block_t* block2, size_t num_block2_ids);
-
     static bool at_end(const std::vector& its);
     static bool at_end2(const std::vector& its);
@@ -109,6 +105,10 @@ public:
 
     ~posting_list_t();
 
+    static void split_block(block_t* src_block, block_t* dst_block);
+
+    static void merge_adjacent_blocks(block_t* block1, block_t* block2, size_t num_block2_ids_to_move);
+
     void upsert(uint32_t id, const std::vector<uint32_t>& offsets);
     void erase(uint32_t id);
diff --git a/src/posting_list.cpp b/src/posting_list.cpp
index 2f1dbe71..83b02eda 100644
--- a/src/posting_list.cpp
+++ b/src/posting_list.cpp
@@ -198,92 +198,112 @@ posting_list_t::~posting_list_t() {
 }
 
 void posting_list_t::merge_adjacent_blocks(posting_list_t::block_t* block1, posting_list_t::block_t* block2,
-                                           size_t num_block2_ids) {
+                                           size_t num_block2_ids_to_move) {
     // merge ids
-    uint32_t* raw_ids1 = block1->ids.uncompress();
-    uint32_t* raw_ids2 = block2->ids.uncompress();
+    uint32_t* ids1 = block1->ids.uncompress();
+    uint32_t* ids2 = block2->ids.uncompress();
 
     size_t block1_orig_size = block1->size();
     size_t block2_orig_size = block2->size();
 
-    uint32_t* raw_ids = new uint32_t[block1->size() + num_block2_ids];
-    std::memmove(raw_ids, raw_ids1, sizeof(uint32_t) * block1->size());
-    std::memmove(raw_ids + block1->size(), raw_ids2, sizeof(uint32_t) * num_block2_ids);
+    size_t block1_orig_offset_size = block1->offsets.getLength();
+    size_t block2_orig_offset_size = block2->offsets.getLength();
 
-    block1->ids.load(raw_ids, block1->size() + num_block2_ids);
-    block2->ids.load(raw_ids2 + num_block2_ids, block2->size() - num_block2_ids);
+    size_t block1_orig_offset_index_size = block1->offset_index.getLength();
+    size_t block2_orig_offset_index_size = block2->offset_index.getLength();
 
-    delete [] raw_ids1;
-    delete [] raw_ids2;
-    delete [] raw_ids;
+    uint32_t* new_ids = new uint32_t[block1->size() + num_block2_ids_to_move];
+    std::memmove(new_ids, ids1, sizeof(uint32_t) * block1->size());
+    std::memmove(new_ids + block1->size(), ids2, sizeof(uint32_t) * num_block2_ids_to_move);
+
+    block1->ids.load(new_ids, block1->size() + num_block2_ids_to_move);
+    if(block2->size() != num_block2_ids_to_move) {
+        block2->ids.load(ids2 + num_block2_ids_to_move, block2->size() - num_block2_ids_to_move);
+    } else {
+        block2->ids.load(nullptr, 0);
+    }
+
+    delete [] ids1;
+    delete [] ids2;
+    delete [] new_ids;
 
     // merge offset indices
-    uint32_t* raw_offset_index1 = block1->offset_index.uncompress();
-    uint32_t* raw_offset_index2 = block2->offset_index.uncompress();
-    uint32_t* raw_offset_index = new uint32_t[block1_orig_size + block2_orig_size];
+    uint32_t* offset_index1 = block1->offset_index.uncompress();
+    uint32_t* offset_index2 = block2->offset_index.uncompress();
+    uint32_t* new_offset_index = new uint32_t[block1_orig_size + block2_orig_size];
 
-    std::memmove(raw_offset_index, raw_offset_index1, sizeof(uint32_t) * block1->offset_index.getLength());
+    size_t num_block2_offsets_to_move = (num_block2_ids_to_move == block2_orig_size) ? block2->offsets.getLength() :
+                                        offset_index2[num_block2_ids_to_move];
+
+    std::memmove(new_offset_index, offset_index1, sizeof(uint32_t) * block1->offset_index.getLength());
 
     size_t start_index = block1->offset_index.getLength();
     size_t base_offset_len = block1->offsets.getLength();
 
-    for(size_t i = 0; i < num_block2_ids; i++) {
-        raw_offset_index[start_index + i] = raw_offset_index2[i] + base_offset_len;
+    for(size_t i = 0; i < num_block2_ids_to_move; i++) {
+        new_offset_index[start_index + i] = offset_index2[i] + base_offset_len;
     }
 
-    block1->offset_index.load(raw_offset_index, block1->offset_index.getLength() + num_block2_ids);
+    block1->offset_index.load(new_offset_index, block1->offset_index.getLength() + num_block2_ids_to_move);
 
-    for(size_t i = 0; i < (block2_orig_size - num_block2_ids); i++) {
-        raw_offset_index2[num_block2_ids + i] -= raw_offset_index2[num_block2_ids];
+    if(block2->offset_index.getLength() != num_block2_ids_to_move) {
+        const uint32_t offset_index2_base_index = offset_index2[num_block2_ids_to_move];
+
+        for(size_t i = 0; i < (block2_orig_size - num_block2_ids_to_move); i++) {
+            offset_index2[num_block2_ids_to_move + i] -= offset_index2_base_index;
+        }
+
+        block2->offset_index.load(offset_index2 + num_block2_ids_to_move, block2_orig_size - num_block2_ids_to_move);
+    } else {
+        block2->offset_index.load(nullptr, 0);
     }
 
-    block2->offset_index.load(raw_offset_index2 + num_block2_ids, block2_orig_size - num_block2_ids);
-
     // merge offsets
-    uint32_t* raw_offsets1 = block1->offsets.uncompress();
-    uint32_t* raw_offsets2 = block2->offsets.uncompress();
-    size_t num_block2_offset_elements = (num_block2_ids == block2_orig_size) ? block2->offsets.getLength() :
-                                        raw_offset_index2[num_block2_ids];
+    uint32_t* offsets1 = block1->offsets.uncompress();
+    uint32_t* offsets2 = block2->offsets.uncompress();
 
-    uint32_t* raw_offsets = new uint32_t[block1->offsets.getLength() + num_block2_offset_elements];
+    // we will have to compute new min and max for new block1 and block2 offsets
 
-    uint32_t min = raw_offsets1[0], max = raw_offsets1[0];
+    size_t new_block1_offsets_size = block1->offsets.getLength() + num_block2_offsets_to_move;
+    uint32_t* new_block1_offsets = new uint32_t[new_block1_offsets_size];
+
+    uint32_t min = offsets1[0], max = offsets1[0];
 
     // we have to manually copy over so we can find the new min and max
     for(size_t i = 0; i < block1->offsets.getLength(); i++) {
-        raw_offsets[i] = raw_offsets1[i];
-        if(raw_offsets[i] < min) {
-            min = raw_offsets[i];
+        new_block1_offsets[i] = offsets1[i];
+        if(new_block1_offsets[i] < min) {
+            min = new_block1_offsets[i];
         }
 
-        if(raw_offsets[i] > max) {
-            max = raw_offsets[i];
+        if(new_block1_offsets[i] > max) {
+            max = new_block1_offsets[i];
         }
     }
 
     size_t block2_base_index = block1->offsets.getLength();
 
-    for(size_t i = 0; i < num_block2_offset_elements; i++) {
+    for(size_t i = 0; i < num_block2_offsets_to_move; i++) {
         size_t j = block2_base_index + i;
-        raw_offsets[j] = raw_offsets2[i];
+        new_block1_offsets[j] = offsets2[i];
 
-        if(raw_offsets[j] < min) {
-            min = raw_offsets[j];
+        if(new_block1_offsets[j] < min) {
+            min = new_block1_offsets[j];
         }
 
-        if(raw_offsets[j] > max) {
-            max = raw_offsets[j];
+        if(new_block1_offsets[j] > max) {
+            max = new_block1_offsets[j];
         }
     }
 
-    block1->offsets.load(raw_offsets, block1->offsets.getLength() + num_block2_offset_elements, min, max);
+    block1->offsets.load(new_block1_offsets, new_block1_offsets_size, min, max);
 
     // reset block2 offsets with remaining elements
-    if(block2->offsets.getLength() != num_block2_offset_elements) {
-        const size_t block2_new_offsets_length = (block2->offsets.getLength() - num_block2_offset_elements);
+    if(block2->offsets.getLength() != num_block2_offsets_to_move) {
+        const size_t block2_new_offsets_length = (block2->offsets.getLength() - num_block2_offsets_to_move);
         uint32_t* block2_new_raw_offsets = new uint32_t[block2_new_offsets_length];
-        min = max = raw_offsets2[num_block2_offset_elements];
+        min = max = offsets2[num_block2_offsets_to_move];
         for(size_t i = 0; i < block2_new_offsets_length; i++) {
-            block2_new_raw_offsets[i] = raw_offsets2[num_block2_offset_elements + i];
+            block2_new_raw_offsets[i] = offsets2[num_block2_offsets_to_move + i];
             if(block2_new_raw_offsets[i] < min) {
                 min = block2_new_raw_offsets[i];
             }
@@ -298,15 +318,27 @@ void posting_list_t::merge_adjacent_blocks(posting_list_t::block_t* block1, post
         block2->offsets.load(nullptr, 0, 0, 0);
     }
 
-    delete [] raw_offset_index1;
-    delete [] raw_offset_index2;
-    delete [] raw_offset_index;
+    if(block1->offsets.getLength() < block1->offset_index.getLength()) {
+        LOG(ERROR) << "Block offset length is smaller than offset index length after merging.";
+    }
 
-    delete [] raw_offsets1;
-    delete [] raw_offsets2;
-    delete [] raw_offsets;
+    delete [] offset_index1;
+    delete [] offset_index2;
+    delete [] new_offset_index;
+
+    delete [] offsets1;
+    delete [] offsets2;
+    delete [] new_block1_offsets;
 }
 
+/*void print_vec(const std::vector& vec) {
+    LOG(INFO) << "---";
+    for(auto x: vec) {
+        LOG(INFO) << x;
+    }
+    LOG(INFO) << "---";
+}*/
+
 void posting_list_t::split_block(posting_list_t::block_t* src_block, posting_list_t::block_t* dst_block) {
     if(src_block->size() <= 1) {
         return;
@@ -369,6 +401,11 @@ void posting_list_t::split_block(posting_list_t::block_t* src_block, posting_lis
     size_t offsets_second_half_length = src_offsets_length - offset_first_half_length;
     dst_block->offsets.load(raw_offsets + offset_first_half_length, offsets_second_half_length, min, max);
 
+    if(dst_block->offsets.getLength() < dst_block->offset_index.getLength() ||
+       src_block->offsets.getLength() < src_block->offset_index.getLength()) {
+        LOG(ERROR) << "Block offset length is smaller than offset index length after splitting.";
+    }
+
     delete [] raw_ids;
     delete [] raw_offset_indices;
     delete [] raw_offsets;
diff --git a/test/posting_list_test.cpp b/test/posting_list_test.cpp
index 8ad7c8a8..0644c0a2 100644
--- a/test/posting_list_test.cpp
+++ b/test/posting_list_test.cpp
@@ -792,6 +792,145 @@ TEST(PostingListTest, PostingListContainsAtleastOne) {
     ASSERT_FALSE(p2.contains_atleast_one(&target_ids2[0], target_ids2.size()));
 }
 
+TEST(PostingListTest, PostingListMergeAdjancentBlocks) {
+    // partial merge: move 2 of the next block's 6 ids into the root block
+    posting_list_t p1(6);
+
+    for(size_t i = 0; i < 18; i++) {
+        p1.upsert(i, {2, 3});
+    }
+
+    p1.erase(0);
+    p1.erase(1);
+
+    // Block sizes (num ids), before merge: [4] [6] [6]
+    //                         after merge: [6] [4] [6]
+    // Block offsets lengths, before merge: [8] [12] [12]
+    //                         after merge: [12] [8] [12]
+
+    posting_list_t::block_t* next_block = p1.get_root()->next;
+    posting_list_t::merge_adjacent_blocks(p1.get_root(), next_block, 2);
+
+    ASSERT_EQ(6, p1.get_root()->ids.getLength());
+    ASSERT_EQ(6, p1.get_root()->offset_index.getLength());
+    ASSERT_EQ(12, p1.get_root()->offsets.getLength());
+
+    std::vector<uint32_t> ids = {2, 3, 4, 5, 6, 7};
+    for(size_t i = 0; i < ids.size(); i++) {
+        auto id = ids[i];
+        ASSERT_EQ(id, p1.get_root()->ids.at(i));
+    }
+
+    for(size_t i = 0; i < p1.get_root()->offset_index.getLength(); i++) {
+        ASSERT_EQ(i*2, p1.get_root()->offset_index.at(i));
+    }
+
+    for(size_t i = 0; i < p1.get_root()->offsets.getLength(); i++) {
+        auto expected_offset = (i % 2 == 0) ? 2 : 3;
+        ASSERT_EQ(expected_offset, p1.get_root()->offsets.at(i));
+    }
+
+    ASSERT_EQ(4, next_block->ids.getLength());
+    ASSERT_EQ(4, next_block->offset_index.getLength());
+    ASSERT_EQ(8, next_block->offsets.getLength());
+
+    ids = {8, 9, 10, 11};
+    for(size_t i = 0; i < ids.size(); i++) {
+        auto id = ids[i];
+        ASSERT_EQ(id, next_block->ids.at(i));
+    }
+
+    for(size_t i = 0; i < next_block->offset_index.getLength(); i++) {
+        ASSERT_EQ(i*2, next_block->offset_index.at(i));
+    }
+
+    for(size_t i = 0; i < next_block->offsets.getLength(); i++) {
+        auto expected_offset = (i % 2 == 0) ? 2 : 3;
+        ASSERT_EQ(expected_offset, next_block->offsets.at(i));
+    }
+
+    // full merge
+
+    posting_list_t::block_t* block1 = next_block;
+    posting_list_t::block_t* block2 = next_block->next;
+
+    posting_list_t::merge_adjacent_blocks(block1, block2, 6);
+    ASSERT_EQ(10, block1->ids.getLength());
+    ASSERT_EQ(10, block1->offset_index.getLength());
+    ASSERT_EQ(20, block1->offsets.getLength());
+
+    ids = {8, 9, 10, 11, 12, 13, 14, 15, 16, 17};
+
+    for(size_t i = 0; i < ids.size(); i++) {
+        auto id = ids[i];
+        ASSERT_EQ(id, block1->ids.at(i));
+    }
+
+    for(size_t i = 0; i < block1->offset_index.getLength(); i++) {
+        ASSERT_EQ(i*2, block1->offset_index.at(i));
+    }
+
+    for(size_t i = 0; i < block1->offsets.getLength(); i++) {
+        auto expected_offset = (i % 2 == 0) ? 2 : 3;
+        ASSERT_EQ(expected_offset, block1->offsets.at(i));
+    }
+
+    ASSERT_EQ(0, block2->ids.getLength());
+    ASSERT_EQ(0, block2->offset_index.getLength());
+    ASSERT_EQ(0, block2->offsets.getLength());
+}
+
+TEST(PostingListTest, PostingListSplitBlock) {
+    posting_list_t p1(6);
+
+    for(size_t i = 0; i < 6; i++) {
+        p1.upsert(i, {2, 3});
+    }
+
+    posting_list_t::block_t* block1 = p1.get_root();
+    posting_list_t::block_t block2;
+    posting_list_t::split_block(block1, &block2);
+
+    ASSERT_EQ(3, block1->ids.getLength());
+    ASSERT_EQ(3, block1->offset_index.getLength());
+    ASSERT_EQ(6, block1->offsets.getLength());
+
+    std::vector<uint32_t> ids = {0, 1, 2};
+
+    for(size_t i = 0; i < ids.size(); i++) {
+        ASSERT_EQ(ids[i], block1->ids.at(i));
+    }
+
+    for(size_t i = 0; i < block1->offset_index.getLength(); i++) {
+        ASSERT_EQ(i*2, block1->offset_index.at(i));
+    }
+
+    for(size_t i = 0; i < block1->offsets.getLength(); i++) {
+        auto expected_offset = (i % 2 == 0) ? 2 : 3;
+        ASSERT_EQ(expected_offset, block1->offsets.at(i));
+    }
+
+    ASSERT_EQ(3, block2.ids.getLength());
+    ASSERT_EQ(3, block2.offset_index.getLength());
+    ASSERT_EQ(6, block2.offsets.getLength());
+
+    ids = {3, 4, 5};
+
+    for(size_t i = 0; i < ids.size(); i++) {
+        ASSERT_EQ(ids[i], block2.ids.at(i));
+    }
+
+    for(size_t i = 0; i < block2.offset_index.getLength(); i++) {
+        ASSERT_EQ(i*2, block2.offset_index.at(i));
+    }
+
+    for(size_t i = 0; i < block2.offsets.getLength(); i++) {
+        auto expected_offset = (i % 2 == 0) ? 2 : 3;
+        ASSERT_EQ(expected_offset, block2.offsets.at(i));
+    }
+}
+
 TEST(PostingListTest, CompactPostingListUpsertAppends) {
     uint32_t ids[] = {0, 1000, 1002};
     uint32_t offset_index[] = {0, 3, 6};
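
For context on the edge case fixed above: a minimal sketch, not part of the patch and not the posting_list_t API, of the full-merge scenario that the new block2->ids.load(nullptr, 0) / block2->offset_index.load(nullptr, 0) branches guard against. It models blocks with plain std::vector instead of the compressed arrays, and simple_block_t / merge_adjacent are hypothetical names. When every id of block2 is moved, reading offset_index2[num_block2_ids_to_move] would index one past the end, so the offset count to move must fall back to block2's full offsets length and block2 must end up empty.

// Hypothetical, simplified model of the block-merge logic for illustration only.
#include <cstdint>
#include <cstdio>
#include <vector>

struct simple_block_t {
    std::vector<uint32_t> ids;           // document ids in this block
    std::vector<uint32_t> offset_index;  // start index into `offsets` for each id
    std::vector<uint32_t> offsets;       // flattened token offsets for all ids
};

// Move the first `num_ids_to_move` ids of block2 (with their offsets) into block1.
void merge_adjacent(simple_block_t& block1, simple_block_t& block2, size_t num_ids_to_move) {
    // If every id moves, take all of block2's offsets instead of reading
    // offset_index[num_ids_to_move], which would be out of bounds (the edge case).
    size_t num_offsets_to_move = (num_ids_to_move == block2.ids.size())
                                 ? block2.offsets.size()
                                 : block2.offset_index[num_ids_to_move];

    size_t base_offset_len = block1.offsets.size();

    // Append moved ids and re-based offset indices to block1.
    for(size_t i = 0; i < num_ids_to_move; i++) {
        block1.ids.push_back(block2.ids[i]);
        block1.offset_index.push_back(block2.offset_index[i] + base_offset_len);
    }
    block1.offsets.insert(block1.offsets.end(),
                          block2.offsets.begin(), block2.offsets.begin() + num_offsets_to_move);

    if(num_ids_to_move == block2.ids.size()) {
        // Full merge: block2 must simply become empty rather than being rebuilt
        // from an index one past its end.
        block2.ids.clear();
        block2.offset_index.clear();
        block2.offsets.clear();
        return;
    }

    // Partial merge: keep the remaining ids and re-base their offset indices.
    uint32_t offset_base = block2.offset_index[num_ids_to_move];
    block2.ids.erase(block2.ids.begin(), block2.ids.begin() + num_ids_to_move);
    block2.offset_index.erase(block2.offset_index.begin(), block2.offset_index.begin() + num_ids_to_move);
    for(auto& oi : block2.offset_index) {
        oi -= offset_base;  // indices now point into the trimmed offsets array
    }
    block2.offsets.erase(block2.offsets.begin(), block2.offsets.begin() + num_offsets_to_move);
}

int main() {
    simple_block_t b1{{8, 9}, {0, 2}, {2, 3, 2, 3}};
    simple_block_t b2{{10, 11}, {0, 2}, {2, 3, 2, 3}};
    merge_adjacent(b1, b2, 2);  // full merge: every id of b2 moves
    std::printf("b1 ids=%zu offsets=%zu, b2 ids=%zu offsets=%zu\n",
                b1.ids.size(), b1.offsets.size(), b2.ids.size(), b2.offsets.size());
    // expected: b1 ids=4 offsets=8, b2 ids=0 offsets=0
}

This mirrors the structure of the new test's "full merge" step, where merging all six remaining ids of the last block leaves that block with zero ids, offset indices and offsets.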