From 6e4ecd409cafe857ff730ac03a5f482225a07aae Mon Sep 17 00:00:00 2001
From: Kishore Nallan
Date: Sun, 30 May 2021 19:54:22 +0530
Subject: [PATCH] In place updates for posting list.

---
 include/array_base.h       |   4 +
 src/array_base.cpp         |   8 ++
 src/posting_list.cpp       | 123 ++++++++++++++++++++++++++---
 test/posting_list_test.cpp | 158 +++++++++++++++++++++++++++++++++++--
 4 files changed, 277 insertions(+), 16 deletions(-)

diff --git a/include/array_base.h b/include/array_base.h
index 3ad75207..f04bbc9d 100644
--- a/include/array_base.h
+++ b/include/array_base.h
@@ -42,4 +42,8 @@ public:
     uint32_t getSizeInBytes();
 
     uint32_t getLength() const;
+
+    uint32_t getMin() const;
+
+    uint32_t getMax() const;
 };
\ No newline at end of file
diff --git a/src/array_base.cpp b/src/array_base.cpp
index 860a1a06..f61a123c 100644
--- a/src/array_base.cpp
+++ b/src/array_base.cpp
@@ -14,3 +14,11 @@ uint32_t array_base::getSizeInBytes() {
 uint32_t array_base::getLength() const {
     return length;
 }
+
+uint32_t array_base::getMin() const {
+    return min;
+}
+
+uint32_t array_base::getMax() const {
+    return max;
+}
diff --git a/src/posting_list.cpp b/src/posting_list.cpp
index c6425b95..a5229150 100644
--- a/src/posting_list.cpp
+++ b/src/posting_list.cpp
@@ -25,19 +25,120 @@ void posting_list_t::block_t::insert_and_shift_offset_index(const uint32_t index
 }
 
 void posting_list_t::block_t::upsert(const uint32_t id, const std::vector<uint32_t>& positions) {
-    size_t inserted_index = ids.append(id);
+    if(id <= ids.last()) {
+        // we have to check if `id` already exists, for an opportunity to do in-place updates
+        uint32_t id_index = ids.indexOf(id);
 
-    if(inserted_index == ids.getLength()-1) {
-        // treat as appends
-        uint32_t curr_index = offsets.getLength();
-        offset_index.append(curr_index);
-        for(uint32_t position : positions) {
-            offsets.append(position);
+        if(id_index != ids.getLength()) {
+            // id is already present, so we will only update offset index and offsets
+            uint32_t start_offset_index = offset_index.at(id_index);
+            uint32_t end_offset_index = (id == ids.last()) ? offsets.getLength()-1 : offset_index.at(id_index + 1)-1;
+            uint32_t num_offsets = (end_offset_index - start_offset_index) + 1;
+            uint32_t* curr_offsets = offsets.uncompress();
+            uint32_t m = offsets.getMin(), M = offsets.getMax();
+
+            if(num_offsets == positions.size()) {
+                // no need to touch the offset index and need to just do inplace updates of offsets
+                bool find_new_min_max = false;
+                for(size_t i = 0; i < positions.size(); i++) {
+                    if((curr_offsets[start_offset_index + i] == m || curr_offsets[start_offset_index + i] == M) &&
+                       curr_offsets[start_offset_index + i] != positions[i]) {
+                        // when an existing min/max is affected we will have to find the new min/max
+                        find_new_min_max = true;
+                    }
+
+                    if(positions[i] < m) {
+                        m = positions[i];
+                    }
+
+                    if(positions[i] > M) {
+                        M = positions[i];
+                    }
+
+                    curr_offsets[start_offset_index + i] = positions[i];
+                }
+
+                if(find_new_min_max) {
+                    for(size_t i = 0; i < offsets.getLength(); i++) {
+                        if(curr_offsets[i] < m) {
+                            m = curr_offsets[i];
+                        }
+
+                        if(curr_offsets[i] > M) {
+                            M = curr_offsets[i];
+                        }
+                    }
+                }
+
+                offsets.load(curr_offsets, offsets.getLength(), m, M);
+            } else {
+                // need to resize offsets array
+                int64_t size_diff = int64_t(positions.size()) - num_offsets;  // size_diff can be negative
+                size_t new_offsets_length = offsets.getLength() + size_diff;
+                uint32_t* new_offsets = new uint32_t[new_offsets_length];
+                std::memmove(new_offsets, curr_offsets, sizeof(uint32_t) * start_offset_index);
+
+                bool find_new_min_max = false;
+                for(size_t i = 0; i < num_offsets; i++) {
+                    if(curr_offsets[start_offset_index + i] == m || curr_offsets[start_offset_index + i] == M) {
+                        // when an existing min/max is affected we will have to find the new min/max
+                        find_new_min_max = true;
+                    }
+                }
+
+                for(size_t i = 0; i < positions.size(); i++) {
+                    if(positions[i] < m) {
+                        m = positions[i];
+                    }
+
+                    if(positions[i] > M) {
+                        M = positions[i];
+                    }
+
+                    new_offsets[start_offset_index + i] = positions[i];
+                }
+
+                std::memmove(new_offsets + start_offset_index + positions.size(),
+                             curr_offsets + end_offset_index + 1,
+                             sizeof(uint32_t) * (offsets.getLength() - (end_offset_index + 1)));
+
+                if(find_new_min_max) {
+                    for(size_t i = 0; i < offsets.getLength(); i++) {
+                        if(curr_offsets[i] < m) {
+                            m = curr_offsets[i];
+                        }
+
+                        if(curr_offsets[i] > M) {
+                            M = curr_offsets[i];
+                        }
+                    }
+                }
+
+                offsets.load(new_offsets, new_offsets_length, m, M);
+                delete [] new_offsets;
+
+                // shift offset index
+                uint32_t* current_offset_index = offset_index.uncompress();
+                for(size_t i = id_index+1; i < ids.getLength(); i++) {
+                    current_offset_index[i] += size_diff;
+                }
+
+                offset_index.load(current_offset_index, offset_index.getLength());
+                delete [] current_offset_index;
+            }
+
+            delete [] curr_offsets;
+            return;
         }
-    } else {
-        uint32_t existing_offset_index = offset_index.at(inserted_index);
-        insert_and_shift_offset_index(inserted_index, positions.size());
-        offsets.insert(existing_offset_index, &positions[0], positions.size());
+    }
+
+    // treat as regular append (either id not found or exceeds max id)
+
+    ids.append(id);
+    uint32_t curr_index = offsets.getLength();
+    offset_index.append(curr_index);
+    for(uint32_t position : positions) {
+        offsets.append(position);
     }
 }
 
diff --git a/test/posting_list_test.cpp b/test/posting_list_test.cpp
index 08f2657c..638e8007 100644
--- a/test/posting_list_test.cpp
+++ b/test/posting_list_test.cpp
@@ -110,6 +110,155 @@ TEST(PostingListTest, Insert) {
     }
 }
 
+TEST(PostingListTest, InplaceUpserts) {
+    std::vector<uint32_t> offsets = {1, 2, 3};
+    posting_list_t pl(5);
+
+    pl.upsert(2, offsets);
+    pl.upsert(5, offsets);
+    pl.upsert(7, offsets);
+
+    ASSERT_EQ(1, pl.size());
+    ASSERT_EQ(3, pl.get_root()->ids.getLength());
+    ASSERT_EQ(9, pl.get_root()->offsets.getLength());
+
+    // update starting ID with same length of offsets
+    pl.upsert(2, {1, 2, 4});
+    ASSERT_EQ(1, pl.size());
+    ASSERT_EQ(3, pl.get_root()->ids.getLength());
+    ASSERT_EQ(9, pl.get_root()->offsets.getLength());
+
+    ASSERT_EQ(1, pl.get_root()->offsets.at(0));
+    ASSERT_EQ(2, pl.get_root()->offsets.at(1));
+    ASSERT_EQ(4, pl.get_root()->offsets.at(2));
+    ASSERT_EQ(4, pl.get_root()->offsets.getMax());
+    ASSERT_EQ(1, pl.get_root()->offsets.getMin());
+
+    ASSERT_EQ(0, pl.get_root()->offset_index.at(0));
+    ASSERT_EQ(3, pl.get_root()->offset_index.at(1));
+    ASSERT_EQ(6, pl.get_root()->offset_index.at(2));
+
+    // update starting ID with smaller number of offsets
+    pl.upsert(2, {5, 7});
+    ASSERT_EQ(1, pl.size());
+    ASSERT_EQ(3, pl.get_root()->ids.getLength());
+    ASSERT_EQ(8, pl.get_root()->offsets.getLength());
+
+    ASSERT_EQ(5, pl.get_root()->offsets.at(0));
+    ASSERT_EQ(7, pl.get_root()->offsets.at(1));
+    ASSERT_EQ(1, pl.get_root()->offsets.at(2));
+    ASSERT_EQ(7, pl.get_root()->offsets.getMax());
+    ASSERT_EQ(1, pl.get_root()->offsets.getMin());
+
+    ASSERT_EQ(0, pl.get_root()->offset_index.at(0));
+    ASSERT_EQ(2, pl.get_root()->offset_index.at(1));
+    ASSERT_EQ(5, pl.get_root()->offset_index.at(2));
+
+    // update starting ID with larger number of offsets
+    pl.upsert(2, {0, 2, 8});
+    ASSERT_EQ(1, pl.size());
+    ASSERT_EQ(3, pl.get_root()->ids.getLength());
+    ASSERT_EQ(9, pl.get_root()->offsets.getLength());
+
+    ASSERT_EQ(0, pl.get_root()->offsets.at(0));
+    ASSERT_EQ(2, pl.get_root()->offsets.at(1));
+    ASSERT_EQ(8, pl.get_root()->offsets.at(2));
+    ASSERT_EQ(1, pl.get_root()->offsets.at(3));
+    ASSERT_EQ(8, pl.get_root()->offsets.getMax());
+    ASSERT_EQ(0, pl.get_root()->offsets.getMin());
+
+    ASSERT_EQ(0, pl.get_root()->offset_index.at(0));
+    ASSERT_EQ(3, pl.get_root()->offset_index.at(1));
+    ASSERT_EQ(6, pl.get_root()->offset_index.at(2));
+
+    // update middle ID with smaller number of offsets
+    pl.upsert(5, {1, 10});
+    ASSERT_EQ(1, pl.size());
+    ASSERT_EQ(3, pl.get_root()->ids.getLength());
+    ASSERT_EQ(8, pl.get_root()->offsets.getLength());
+
+    ASSERT_EQ(0, pl.get_root()->offsets.at(0));
+    ASSERT_EQ(2, pl.get_root()->offsets.at(1));
+    ASSERT_EQ(8, pl.get_root()->offsets.at(2));
+    ASSERT_EQ(1, pl.get_root()->offsets.at(3));
+    ASSERT_EQ(10, pl.get_root()->offsets.at(4));
+
+    ASSERT_EQ(10, pl.get_root()->offsets.getMax());
+    ASSERT_EQ(0, pl.get_root()->offsets.getMin());
+
+    ASSERT_EQ(0, pl.get_root()->offset_index.at(0));
+    ASSERT_EQ(3, pl.get_root()->offset_index.at(1));
+    ASSERT_EQ(5, pl.get_root()->offset_index.at(2));
+
+    // update middle ID with larger number of offsets
+    pl.upsert(5, {2, 4, 12});
+    ASSERT_EQ(1, pl.size());
+    ASSERT_EQ(3, pl.get_root()->ids.getLength());
+    ASSERT_EQ(9, pl.get_root()->offsets.getLength());
+
+    ASSERT_EQ(0, pl.get_root()->offsets.at(0));
+    ASSERT_EQ(2, pl.get_root()->offsets.at(1));
+    ASSERT_EQ(8, pl.get_root()->offsets.at(2));
+    ASSERT_EQ(2, pl.get_root()->offsets.at(3));
+    ASSERT_EQ(4, pl.get_root()->offsets.at(4));
+    ASSERT_EQ(12, pl.get_root()->offsets.at(5));
+    ASSERT_EQ(1, pl.get_root()->offsets.at(6));
+    ASSERT_EQ(2, pl.get_root()->offsets.at(7));
+    ASSERT_EQ(3, pl.get_root()->offsets.at(8));
+
+    ASSERT_EQ(12, pl.get_root()->offsets.getMax());
+    ASSERT_EQ(0, pl.get_root()->offsets.getMin());
+
+    ASSERT_EQ(0, pl.get_root()->offset_index.at(0));
+    ASSERT_EQ(3, pl.get_root()->offset_index.at(1));
+    ASSERT_EQ(6, pl.get_root()->offset_index.at(2));
+
+    // update last ID with smaller number of offsets
+
+    pl.upsert(7, {3});
+    ASSERT_EQ(1, pl.size());
+    ASSERT_EQ(3, pl.get_root()->ids.getLength());
+    ASSERT_EQ(7, pl.get_root()->offsets.getLength());
+
+    ASSERT_EQ(0, pl.get_root()->offsets.at(0));
+    ASSERT_EQ(2, pl.get_root()->offsets.at(1));
+    ASSERT_EQ(8, pl.get_root()->offsets.at(2));
+    ASSERT_EQ(2, pl.get_root()->offsets.at(3));
+    ASSERT_EQ(4, pl.get_root()->offsets.at(4));
+    ASSERT_EQ(12, pl.get_root()->offsets.at(5));
+    ASSERT_EQ(3, pl.get_root()->offsets.at(6));
+
+    ASSERT_EQ(12, pl.get_root()->offsets.getMax());
+    ASSERT_EQ(0, pl.get_root()->offsets.getMin());
+
+    ASSERT_EQ(0, pl.get_root()->offset_index.at(0));
+    ASSERT_EQ(3, pl.get_root()->offset_index.at(1));
+    ASSERT_EQ(6, pl.get_root()->offset_index.at(2));
+
+    // update last ID with larger number of offsets
+
+    pl.upsert(7, {5, 20});
+    ASSERT_EQ(1, pl.size());
+    ASSERT_EQ(3, pl.get_root()->ids.getLength());
+    ASSERT_EQ(8, pl.get_root()->offsets.getLength());
+
+    ASSERT_EQ(0, pl.get_root()->offsets.at(0));
+    ASSERT_EQ(2, pl.get_root()->offsets.at(1));
+    ASSERT_EQ(8, pl.get_root()->offsets.at(2));
+    ASSERT_EQ(2, pl.get_root()->offsets.at(3));
+    ASSERT_EQ(4, pl.get_root()->offsets.at(4));
+    ASSERT_EQ(12, pl.get_root()->offsets.at(5));
+    ASSERT_EQ(5, pl.get_root()->offsets.at(6));
+    ASSERT_EQ(20, pl.get_root()->offsets.at(7));
+
+    ASSERT_EQ(20, pl.get_root()->offsets.getMax());
+    ASSERT_EQ(0, pl.get_root()->offsets.getMin());
+
+    ASSERT_EQ(0, pl.get_root()->offset_index.at(0));
+    ASSERT_EQ(3, pl.get_root()->offset_index.at(1));
+    ASSERT_EQ(6, pl.get_root()->offset_index.at(2));
+}
+
 TEST(PostingListTest, RemovalsOnFirstBlock) {
     std::vector<uint32_t> offsets = {0, 1, 3};
     posting_list_t pl(5);
@@ -318,11 +467,10 @@ TEST(PostingListTest, RandomInsertAndDeletes) {
     std::vector<uint32_t> offsets1 = {0, 1, 3};
     std::vector<uint32_t> offsets2 = {10, 12};
 
-    // generate unique random IDs
-    std::set<uint32_t> ids;
+    std::vector<uint32_t> ids;
 
     for(size_t i = 0; i < 100000; i++) {
-        ids.insert(rand() % 100000);
+        ids.push_back(rand() % 100000);
     }
 
     size_t index = 0;
@@ -337,8 +485,8 @@
         pl.erase(rand() % 100000);
     }
 
-    ASSERT_LT(pl.size(), 750);
-    ASSERT_GT(pl.size(), 500);
+    ASSERT_GT(pl.size(), 750);
+    ASSERT_LT(pl.size(), 1000);
 }
 
 TEST(PostingListTest, IntersectionBasics) {
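
Note: a quick way to picture the splice that the new block_t::upsert() performs on the parallel ids / offset_index / offsets arrays is the standalone sketch below. It is illustrative only: the block_sketch type and its main() driver are invented for this note, plain std::vector stands in for the compressed arrays, and the min/max bookkeeping needed by the real offsets.load() calls is omitted.

// Sketch of the in-place upsert splice using plain std::vector.
// offset_index[i] points at the first offset belonging to ids[i].
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <iostream>
#include <vector>

struct block_sketch {
    std::vector<uint32_t> ids;           // sorted document ids
    std::vector<uint32_t> offset_index;  // start of each id's offsets within `offsets`
    std::vector<uint32_t> offsets;       // flattened token positions of all ids

    void upsert(uint32_t id, const std::vector<uint32_t>& positions) {
        auto it = std::lower_bound(ids.begin(), ids.end(), id);

        if(it != ids.end() && *it == id) {
            // id already present: splice the new positions over its old ones
            size_t idx = it - ids.begin();
            size_t start = offset_index[idx];
            size_t end = (idx + 1 == ids.size()) ? offsets.size() : offset_index[idx + 1];
            int64_t size_diff = int64_t(positions.size()) - int64_t(end - start);

            offsets.erase(offsets.begin() + start, offsets.begin() + end);
            offsets.insert(offsets.begin() + start, positions.begin(), positions.end());

            // shift the start index of every id that follows (no-op when sizes match)
            for(size_t i = idx + 1; i < offset_index.size(); i++) {
                offset_index[i] = uint32_t(int64_t(offset_index[i]) + size_diff);
            }

            return;
        }

        // otherwise treat as an append; the sketch assumes ids only ever grow
        assert(it == ids.end());
        offset_index.push_back(offsets.size());
        ids.push_back(id);
        offsets.insert(offsets.end(), positions.begin(), positions.end());
    }
};

int main() {
    block_sketch b;
    b.upsert(2, {1, 2, 3});
    b.upsert(5, {1, 2, 3});
    b.upsert(2, {5, 7});                     // shrink: id 5's offsets shift left by one
    std::cout << b.offset_index[1] << "\n";  // prints 2, matching the test's expectation
}

Keeping offsets flat and merely shifting the start indices of the ids that follow is what lets the real implementation update a document's positions without rebuilding the whole block.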