From 13622ff0386f57366b20e6dfd8e6aadb8918f7d0 Mon Sep 17 00:00:00 2001
From: Kishore Nallan
Date: Sat, 29 Jan 2022 18:25:57 +0530
Subject: [PATCH] Fix an edge case in string update.

---
Reviewer notes (text between "---" and the diffstat is not part of the commit
message):

* block_t::upsert: test ids.getLength() == 0 before calling ids.last(), so
  last() is never evaluated on an empty block; the branch taken is unchanged.
* block_t::upsert: pass positions.data() instead of &positions[0]; indexing
  element 0 of an empty vector is undefined behaviour.
* posting_list_t::dump: the offsets loop was bounded by the last offset_index
  entry, which is only the START of the last id's offsets, so every offset of
  that id but the first was left out. It now runs over the block's full
  offsets array.
* The copy under review had lost its line breaks and all <...> template
  argument lists. Line structure, indentation and blank context lines are
  reconstructed and may not match the tree byte-for-byte. <uint32_t> is
  restored on the upsert signatures only (inferred from
  `for(uint32_t position : positions)`); the `std::vector offsets` context
  line in the test is left as found.
* Still damaged: everything between the second src/posting_list.cpp hunk
  header and the dump() additions is missing (by the diffstat, about 20 added
  and 9 removed lines, presumably including the definition of
  insert_and_shift_offset_index and the hunk header that belongs to dump()).
  Recover that part from the original commit before applying; the blob ids on
  the "index" lines also predate the changes above.
* Added-line counts per hunk are unchanged, so the hunk headers and the
  diffstat below still agree with the hunks that are present.

 include/posting_list.h     |  5 +++
 src/posting_list.cpp       | 92 +++++++++++++++++++++++++++++++++-----
 test/posting_list_test.cpp | 20 +++++++++
 3 files changed, 105 insertions(+), 12 deletions(-)

diff --git a/include/posting_list.h b/include/posting_list.h
index a477ee42..d8ebcc4b 100644
--- a/include/posting_list.h
+++ b/include/posting_list.h
@@ -33,6 +33,8 @@ public:
 
         void remove_and_shift_offset_index(const uint32_t* indices_sorted, uint32_t num_indices);
 
+        void insert_and_shift_offset_index(const uint32_t index, const uint32_t num_offsets);
+
         uint32_t upsert(uint32_t id, const std::vector<uint32_t>& offsets);
 
         uint32_t erase(uint32_t id);
@@ -61,6 +63,7 @@ public:
         [[nodiscard]] bool valid() const;
         void next();
         void skip_to(uint32_t id);
+        void set_index(uint32_t index);
         [[nodiscard]] uint32_t id() const;
         [[nodiscard]] inline uint32_t index() const;
         [[nodiscard]] inline block_t* block() const;
@@ -129,6 +132,8 @@ public:
 
     void erase(uint32_t id);
 
+    void dump();
+
     block_t* get_root();
 
     size_t num_blocks() const;
diff --git a/src/posting_list.cpp b/src/posting_list.cpp
index a1fbdc98..bd2a324c 100644
--- a/src/posting_list.cpp
+++ b/src/posting_list.cpp
@@ -6,11 +6,29 @@
 /* block_t operations */
 
 uint32_t posting_list_t::block_t::upsert(const uint32_t id, const std::vector<uint32_t>& positions) {
-    if(id <= ids.last() && ids.getLength() != 0) {
-        // we have to check if `id` already exists, for an opportunity to do in-place updates
+    if(ids.getLength() == 0 || id > ids.last()) {
+        // empty block, or id sorts after everything stored: append to the end
+        ids.append(id);
+        uint32_t curr_index = offsets.getLength();
+        offset_index.append(curr_index);
+        for(uint32_t position : positions) {
+            offsets.append(position);
+        }
+    }
+
+    else {
+        // we have to check if `id` already exists, and do in-place update/insert
         uint32_t id_index = ids.indexOf(id);
 
-        if(id_index != ids.getLength()) {
+        if(id_index == ids.getLength()) {
+            // id not found: insert it, then open a gap for its offsets where the displaced id's began
+            size_t inserted_index = ids.append(id);
+            uint32_t existing_offset_index = offset_index.at(inserted_index);
+            insert_and_shift_offset_index(inserted_index, positions.size());
+            offsets.insert(existing_offset_index, positions.data(), positions.size());
+        }
+
+        else {
             // id is already present, so we will only update offset index and offsets
             uint32_t start_offset_index = offset_index.at(id_index);
             uint32_t end_offset_index = (id == ids.last()) ? offsets.getLength()-1 : offset_index.at(id_index + 1)-1;
@@ -113,15 +131,6 @@ uint32_t posting_list_t::block_t::upsert(const uint32_t id, const std::vector& offs
     }
 }
 
+void posting_list_t::dump() {
+    // debugging aid: logs the ids, offset_index and offsets of every block at INFO level
+    auto it = new_iterator();
+    std::string ids_str;
+    std::string offset_index_str;
+    std::string offsets_str;
+
+    while(it.valid()) {
+        auto index = it.index();
+        while(index < it.block()->size()) {
+            ids_str += std::to_string(it.ids[index]) + ", ";
+            offset_index_str += std::to_string(it.offset_index[index]) + ", ";
+            index++;
+        }
+
+        // offset_index holds only where each id's offsets begin, so take the count from the block
+        const uint32_t num_offsets = it.block()->offsets.getLength();
+        for(uint32_t j = 0; j < num_offsets; j++) {
+            offsets_str += std::to_string(it.block()->offsets.at(j)) + ", ";
+        }
+        // park on the block's last entry; next() is then expected to advance to the following block
+        it.set_index(it.block()->size()-1);
+        it.next();
+    }
+
+    LOG(INFO) << "ids_str:";
+    LOG(INFO) << ids_str;
+
+    LOG(INFO) << "offset_index_str:";
+    LOG(INFO) << offset_index_str;
+
+    LOG(INFO) << "offsets_str:";
+    LOG(INFO) << offsets_str;
+}
+
 void posting_list_t::erase(const uint32_t id) {
     const auto it = id_block_map.lower_bound(id);
 
@@ -1354,3 +1418,7 @@ posting_list_t::iterator_t::iterator_t(iterator_t&& rhs) noexcept {
     rhs.offset_index = nullptr;
     rhs.offsets = nullptr;
 }
+
+void posting_list_t::iterator_t::set_index(uint32_t index) {
+    curr_index = index;
+}
diff --git a/test/posting_list_test.cpp b/test/posting_list_test.cpp
index b9ecec3d..b78186e6 100644
--- a/test/posting_list_test.cpp
+++ b/test/posting_list_test.cpp
@@ -129,6 +129,26 @@ TEST_F(PostingListTest, Insert) {
     }
 }
 
+TEST_F(PostingListTest, InsertInMiddle) {
+    posting_list_t pl(3);
+    // upsert out of order: id 2 arrives last and has to land between ids 1 and 3
+    pl.upsert(1, {1});
+    pl.upsert(3, {3});
+    pl.upsert(2, {2});
+
+    ASSERT_EQ(1, pl.get_root()->ids.at(0));
+    ASSERT_EQ(2, pl.get_root()->ids.at(1));
+    ASSERT_EQ(3, pl.get_root()->ids.at(2));
+
+    ASSERT_EQ(0, pl.get_root()->offset_index.at(0));
+    ASSERT_EQ(1, pl.get_root()->offset_index.at(1));
+    ASSERT_EQ(2, pl.get_root()->offset_index.at(2));
+
+    ASSERT_EQ(1, pl.get_root()->offsets.at(0));
+    ASSERT_EQ(2, pl.get_root()->offsets.at(1));
+    ASSERT_EQ(3, pl.get_root()->offsets.at(2));
+}
+
 TEST_F(PostingListTest, InplaceUpserts) {
     std::vector offsets = {1, 2, 3};
     posting_list_t pl(5);