diff --git a/include/posting.h b/include/posting.h index 5f8cbb98..63db6c1a 100644 --- a/include/posting.h +++ b/include/posting.h @@ -47,6 +47,37 @@ private: public: + struct block_intersector_t { + size_t batch_size; + std::vector its; + std::vector plists; + std::vector expanded_plist_indices; + + posting_list_t::result_iter_state_t& iter_state; + + block_intersector_t(const std::vector& raw_posting_lists, + size_t batch_size, + posting_list_t::result_iter_state_t& iter_state): + batch_size(batch_size), iter_state(iter_state) { + to_expanded_plists(raw_posting_lists, plists, expanded_plist_indices); + + its.reserve(plists.size()); + for(const auto& posting_list: plists) { + its.push_back(posting_list->new_iterator()); + } + } + + ~block_intersector_t() { + for(uint32_t expanded_plist_index: expanded_plist_indices) { + delete plists[expanded_plist_index]; + } + } + + bool intersect() { + return posting_list_t::block_intersect(plists, batch_size, its, iter_state);; + } + }; + static void upsert(void*& obj, uint32_t id, const std::vector& offsets); static void erase(void*& obj, uint32_t id); @@ -64,11 +95,4 @@ public: static void merge(const std::vector& posting_lists, std::vector& result_ids); static void intersect(const std::vector& posting_lists, std::vector& result_ids); - - static bool block_intersect( - const std::vector& posting_lists, - size_t batch_size, - std::vector& its, - posting_list_t::result_iter_state_t& iter_state - ); }; \ No newline at end of file diff --git a/src/posting.cpp b/src/posting.cpp index 645ba009..f9bec6d6 100644 --- a/src/posting.cpp +++ b/src/posting.cpp @@ -8,13 +8,13 @@ int64_t compact_posting_list_t::upsert(const uint32_t id, const std::vector last_id) { - new_storage_needed = sizeof(uint32_t) * (num_offsets + 2); - if(length + new_storage_needed > capacity) { + extra_length_needed = (num_offsets + 2); + if(length + extra_length_needed > capacity) { // enough storage should have been provided upstream - return (length + new_storage_needed) - capacity; + return (length + extra_length_needed) - capacity; } // can just append to the end @@ -34,21 +34,21 @@ int64_t compact_posting_list_t::upsert(const uint32_t id, const uint32_t* offset size_t existing_id = id_offsets[i + num_existing_offsets + 1]; if(existing_id == id) { - new_storage_needed = sizeof(uint32_t) * (num_offsets - num_existing_offsets); - if(new_storage_needed > 0) { - if(length + new_storage_needed > capacity) { + extra_length_needed = (num_offsets - num_existing_offsets); + if(extra_length_needed > 0) { + if(length + extra_length_needed > capacity) { // enough storage should have been provided upstream - return (length + new_storage_needed) - capacity; + return (length + extra_length_needed) - capacity; } // shift offsets to the right to make space - int64_t shift_index = int64_t(length)+new_storage_needed-1; - while(shift_index >= i && (shift_index - new_storage_needed) >= 0) { - id_offsets[shift_index] = id_offsets[shift_index - new_storage_needed]; + int64_t shift_index = int64_t(length) + extra_length_needed - 1; + while(shift_index >= i && (shift_index - extra_length_needed) >= 0) { + id_offsets[shift_index] = id_offsets[shift_index - extra_length_needed]; shift_index--; } - } else if(new_storage_needed < 0) { + } else if(extra_length_needed < 0) { // shift offsets to the left to reduce space // [num_offsets][0][2][4][id] // [num_offsets][0][id] @@ -71,18 +71,18 @@ int64_t compact_posting_list_t::upsert(const uint32_t id, const uint32_t* offset } else if(existing_id > id) { - new_storage_needed = sizeof(uint32_t) * (num_offsets + 2); - if(length + new_storage_needed > capacity) { + extra_length_needed = (num_offsets + 2); + if(length + extra_length_needed > capacity) { // enough storage should have been provided upstream - return (length + new_storage_needed) - capacity; + return (length + extra_length_needed) - capacity; } - // shift index [i..length-1] by `new_storage_needed` positions - int64_t shift_index = length+new_storage_needed-1; - while((shift_index - new_storage_needed) >= 0 && shift_index >= i) { + // shift index [i..length-1] by `extra_length_needed` positions + int64_t shift_index = length + extra_length_needed - 1; + while((shift_index - extra_length_needed) >= 0 && shift_index >= i) { // [*1 1 4] [1 1 7] // [1 1 3] - id_offsets[shift_index] = id_offsets[shift_index - new_storage_needed]; + id_offsets[shift_index] = id_offsets[shift_index - extra_length_needed]; shift_index--; } // now store the new offsets in the shifted space @@ -100,7 +100,7 @@ int64_t compact_posting_list_t::upsert(const uint32_t id, const uint32_t* offset i += num_existing_offsets + 2; } - length += new_storage_needed; // new_storage_needed can be negative here but that's okay + length += extra_length_needed; // extra_length_needed can be negative here but that's okay } return 0; @@ -247,7 +247,7 @@ void posting_t::upsert(void*& obj, uint32_t id, const std::vector& off if(extra_capacity_required != 0) { // grow the container by 30% size_t new_capacity = (list->capacity + extra_capacity_required) * 1.3; - size_t new_capacity_bytes = new_capacity * sizeof(uint32_t); + size_t new_capacity_bytes = sizeof(compact_posting_list_t) + (new_capacity * sizeof(uint32_t)); auto new_list = (compact_posting_list_t *) realloc(list, new_capacity_bytes); if(new_list == nullptr) { abort(); @@ -283,7 +283,7 @@ void posting_t::erase(void*& obj, uint32_t id) { if(list->length < list->capacity/2) { // resize container size_t new_capacity = list->capacity/2; - size_t new_capacity_bytes = new_capacity * sizeof(uint32_t); + size_t new_capacity_bytes = sizeof(compact_posting_list_t) + (new_capacity * sizeof(uint32_t)); auto new_list = (compact_posting_list_t *) realloc(list, new_capacity_bytes); if(new_list == nullptr) { abort(); @@ -400,25 +400,6 @@ void posting_t::to_expanded_plists(const std::vector& raw_posting_lists, } } -bool posting_t::block_intersect(const std::vector& raw_posting_lists, size_t batch_size, - std::vector& its, - posting_list_t::result_iter_state_t& iter_state) { - // we will have to convert the compact posting list (if any) to full form - std::vector plists; - std::vector expanded_plist_indices; - to_expanded_plists(raw_posting_lists, plists, expanded_plist_indices); - - bool has_more = posting_list_t::block_intersect(plists, batch_size, its, iter_state); - - if(!has_more) { - for(uint32_t expanded_plist_index: expanded_plist_indices) { - delete plists[expanded_plist_index]; - } - } - - return has_more; -} - void posting_t::destroy_list(void*& obj) { if(obj == nullptr) { return; diff --git a/test/posting_list_test.cpp b/test/posting_list_test.cpp index 04234bf2..8ad7c8a8 100644 --- a/test/posting_list_test.cpp +++ b/test/posting_list_test.cpp @@ -1059,6 +1059,48 @@ TEST(PostingListTest, CompactPostingListContainsAtleastOne) { posting_t::destroy_list(obj); } +TEST(PostingListTest, CompactToFullPostingListConversion) { + uint32_t ids[] = {5, 6, 7, 8}; + uint32_t offset_index[] = {0, 3, 6, 9}; + uint32_t offsets[] = {0, 3, 4, 0, 3, 4, 0, 3, 4, 0, 3, 4}; + + compact_posting_list_t* c1 = compact_posting_list_t::create(4, ids, offset_index, 12, offsets); + posting_list_t* p1 = c1->to_full_posting_list(); + + ASSERT_EQ(4, c1->num_ids()); + ASSERT_EQ(4, p1->num_ids()); +} + +TEST(PostingListTest, BlockIntersectionOnMixedLists) { + uint32_t ids[] = {5, 6, 7, 8}; + uint32_t offset_index[] = {0, 3, 6, 9}; + uint32_t offsets[] = {0, 3, 4, 0, 3, 4, 0, 3, 4, 0, 3, 4}; + + compact_posting_list_t* list1 = compact_posting_list_t::create(4, ids, offset_index, 12, offsets); + + posting_list_t p1(2); + std::vector offsets1 = {2, 4}; + + p1.upsert(0, offsets1); + p1.upsert(5, offsets1); + p1.upsert(8, offsets1); + p1.upsert(20, offsets1); + + std::vector raw_posting_lists = {SET_COMPACT_POSTING(list1), &p1}; + posting_list_t::result_iter_state_t iter_state; + posting_t::block_intersector_t intersector(raw_posting_lists, 1, iter_state); + + ASSERT_TRUE(intersector.intersect()); + ASSERT_EQ(1, iter_state.ids.size()); + ASSERT_EQ(5, iter_state.ids[0]); + + ASSERT_FALSE(intersector.intersect()); + ASSERT_EQ(1, iter_state.ids.size()); + ASSERT_EQ(8, iter_state.ids[0]); + + free(list1); +} + TEST(PostingListTest, DISABLED_Benchmark) { std::vector offsets = {0, 1, 3}; posting_list_t pl(4096);