From 70f970b80c26d3d7bce4abb6d5da6ee3af777dbd Mon Sep 17 00:00:00 2001
From: Kishore Nallan
Date: Mon, 14 Jun 2021 22:05:24 +0530
Subject: [PATCH] Posting list merge.

---
 include/posting.h          |   5 ++
 include/posting_list.h     |   9 +++-
 src/posting.cpp            |  29 ++++++++---
 src/posting_list.cpp       | 110 ++++++++++++++++++++++++++++++++++---
 test/posting_list_test.cpp |  88 ++++++++++++++++++++++++++++-
 5 files changed, 226 insertions(+), 15 deletions(-)

diff --git a/include/posting.h b/include/posting.h
index 61a703b8..3596c963 100644
--- a/include/posting.h
+++ b/include/posting.h
@@ -38,6 +38,9 @@ class posting_t {
 private:
     static constexpr size_t COMPACT_LIST_THRESHOLD_LENGTH = 64;
 
+    static void to_expanded_plists(const std::vector<void*>& raw_posting_lists, std::vector<posting_list_t*>& plists,
+                                   std::vector<uint32_t>& expanded_plist_indices);
+
 public:
 
     static void upsert(void*& obj, uint32_t id, const std::vector<uint32_t>& offsets);
@@ -48,5 +51,7 @@ public:
 
     static uint32_t first_id(const void* obj);
 
+    static void merge(const std::vector<void*>& posting_lists, std::vector<uint32_t>& result_ids);
+
     static void intersect(const std::vector<void*>& posting_lists, std::vector<uint32_t>& result_ids);
 };
\ No newline at end of file
diff --git a/include/posting_list.h b/include/posting_list.h
index 681a382f..9ba6ff7d 100644
--- a/include/posting_list.h
+++ b/include/posting_list.h
@@ -95,8 +95,11 @@ private:
     static void advance_all(std::vector<posting_list_t::iterator_t>& its);
     static void advance_all2(std::vector<posting_list_t::iterator_t>& its);
 
-    static void advance_least(std::vector<posting_list_t::iterator_t>& its);
-    static void advance_least2(std::vector<posting_list_t::iterator_t>& its);
+    static void advance_non_largest(std::vector<posting_list_t::iterator_t>& its);
+    static void advance_non_largest2(std::vector<posting_list_t::iterator_t>& its);
+
+    static uint32_t advance_smallest(std::vector<posting_list_t::iterator_t>& its);
+    static uint32_t advance_smallest2(std::vector<posting_list_t::iterator_t>& its);
 
 public:
 
@@ -122,6 +125,8 @@ public:
 
     iterator_t new_iterator();
 
+    static void merge(const std::vector<posting_list_t*>& posting_lists, std::vector<uint32_t>& result_ids);
+
     static void intersect(const std::vector<posting_list_t*>& posting_lists, std::vector<uint32_t>& result_ids);
 
     static bool block_intersect(
diff --git a/src/posting.cpp b/src/posting.cpp
index 06b8058e..7b04441c 100644
--- a/src/posting.cpp
+++ b/src/posting.cpp
@@ -293,11 +293,34 @@ uint32_t posting_t::first_id(const void* obj) {
     }
 }
 
+void posting_t::merge(const std::vector<void*>& raw_posting_lists, std::vector<uint32_t>& result_ids) {
+    // we will have to convert the compact posting list (if any) to full form
+    std::vector<posting_list_t*> plists;
+    std::vector<uint32_t> expanded_plist_indices;
+    to_expanded_plists(raw_posting_lists, plists, expanded_plist_indices);
+
+    posting_list_t::merge(plists, result_ids);
+
+    for(uint32_t expanded_plist_index: expanded_plist_indices) {
+        delete plists[expanded_plist_index];
+    }
+}
+
 void posting_t::intersect(const std::vector<void*>& raw_posting_lists, std::vector<uint32_t>& result_ids) {
     // we will have to convert the compact posting list (if any) to full form
     std::vector<posting_list_t*> plists;
     std::vector<uint32_t> expanded_plist_indices;
+    to_expanded_plists(raw_posting_lists, plists, expanded_plist_indices);
 
+    posting_list_t::intersect(plists, result_ids);
+
+    for(uint32_t expanded_plist_index: expanded_plist_indices) {
+        delete plists[expanded_plist_index];
+    }
+}
+
+void posting_t::to_expanded_plists(const std::vector<void*>& raw_posting_lists, std::vector<posting_list_t*>& plists,
+                                   std::vector<uint32_t>& expanded_plist_indices) {
     for(size_t i = 0; i < raw_posting_lists.size(); i++) {
         auto raw_posting_list = raw_posting_lists[i];
 
@@ -310,10 +333,4 @@ void posting_t::intersect(const std::vector<void*>& raw_posting_lists, std::vect
             plists.emplace_back(full_posting_list);
         }
     }
-
-    posting_list_t::intersect(plists, result_ids);
-
-    for(uint32_t expanded_plist_index: expanded_plist_indices) {
-        delete plists[expanded_plist_index];
-    }
 }
diff --git a/src/posting_list.cpp b/src/posting_list.cpp
index b245845e..3755867b 100644
--- a/src/posting_list.cpp
+++ b/src/posting_list.cpp
@@ -530,6 +530,72 @@ posting_list_t::block_t* posting_list_t::block_of(last_id_t id) {
     return nullptr;
 }
 
+
+// Computes the sorted union of the ids held by `posting_lists` and appends it to `result_ids`.
+// An id that is present in several lists is added only once.
+void posting_list_t::merge(const std::vector<posting_list_t*>& posting_lists, std::vector<uint32_t>& result_ids) {
+    auto its = std::vector<posting_list_t::iterator_t>();
+    its.reserve(posting_lists.size());
+
+    size_t sum_sizes = 0;
+
+    for(const auto& posting_list: posting_lists) {
+        its.push_back(posting_list->new_iterator());
+        sum_sizes += posting_list->num_ids();
+    }
+
+    // upper bound: the union cannot add more ids than all the lists hold together
+    result_ids.reserve(result_ids.size() + sum_sizes);
+    size_t num_lists = its.size();
+
+    switch (num_lists) {
+        case 2:
+            while(!at_end2(its)) {
+                if(equals2(its)) {
+                    result_ids.push_back(its[0].id());
+                    advance_all2(its);
+                } else {
+                    uint32_t smallest_value = advance_smallest2(its);
+                    result_ids.push_back(smallest_value);
+                }
+            }
+
+            // whichever list still has ids left contributes its tail as-is
+            while(its[0].valid()) {
+                result_ids.push_back(its[0].id());
+                its[0].next();
+            }
+
+            while(its[1].valid()) {
+                result_ids.push_back(its[1].id());
+                its[1].next();
+            }
+
+            break;
+        default:
+            // A union is complete only once *every* list is exhausted, so keep going while any iterator is
+            // valid. Bailing out when the first list ends and appending the remaining lists one after another
+            // would emit unsorted ids with duplicates whenever 2 or more lists still hold ids.
+            while(true) {
+                bool has_pending_ids = false;
+
+                for(auto& it: its) {
+                    if(it.valid()) {
+                        has_pending_ids = true;
+                        break;
+                    }
+                }
+
+                if(!has_pending_ids) {
+                    break;
+                }
+
+                // advance_smallest() also steps past every duplicate of the id it returns
+                result_ids.push_back(advance_smallest(its));
+            }
+    }
+}
+
 // Inspired by: https://stackoverflow.com/a/25509185/131050
 void posting_list_t::intersect(const std::vector<posting_list_t*>& posting_lists, std::vector<uint32_t>& result_ids) {
     auto its = std::vector<posting_list_t::iterator_t>();
@@ -549,7 +615,7 @@ void posting_list_t::intersect(const std::vector<posting_list_t*>& posting_lists
                     result_ids.push_back(its[0].id());
                     advance_all2(its);
                 } else {
-                    advance_least2(its);
+                    advance_non_largest2(its);
                 }
             }
             break;
@@ -560,7 +626,7 @@ void posting_list_t::intersect(const std::vector<posting_list_t*>& posting_lists
                     result_ids.push_back(its[0].id());
                     advance_all(its);
                 } else {
-                    advance_least(its);
+                    advance_non_largest(its);
                 }
             }
     }
@@ -604,7 +670,7 @@ bool posting_list_t::block_intersect(const std::vector<posting_list_t*>& posting
 
                     advance_all2(its);
                 } else {
-                    advance_least2(its);
+                    advance_non_largest2(its);
                 }
 
                 if(iter_state.ids.size() == batch_size) {
@@ -631,7 +697,7 @@ bool posting_list_t::block_intersect(const std::vector<posting_list_t*>& posting
 
                     advance_all(its);
                 } else {
-                    advance_least(its);
+                    advance_non_largest(its);
                 }
 
                 if(iter_state.ids.size() == batch_size) {
@@ -770,7 +836,7 @@ void posting_list_t::advance_all2(std::vector<posting_list_t::iterator_t>& its)
     its[1].next();
 }
 
-void posting_list_t::advance_least(std::vector<posting_list_t::iterator_t>& its) {
+void posting_list_t::advance_non_largest(std::vector<posting_list_t::iterator_t>& its) {
     // we will find the iter with greatest value and then advance the rest until their value catches up
     uint32_t greatest_value = 0;
 
@@ -787,7 +853,7 @@ void posting_list_t::advance_least(std::vector<posting_list_t::iterator_t>& its)
     }
 }
 
-void posting_list_t::advance_least2(std::vector<posting_list_t::iterator_t>& its) {
+void posting_list_t::advance_non_largest2(std::vector<posting_list_t::iterator_t>& its) {
     if(its[0].id() > its[1].id()) {
         its[1].skip_to(its[0].id());
     } else {
@@ -795,6 +861,38 @@ void posting_list_t::advance_least2(std::vector<posting_list_t::iterator_t>& its
     }
 }
 
+// Finds the smallest id that any still-valid iterator is positioned on, advances every iterator
+// positioned on that id and returns the id. Exhausted iterators are ignored, but at least one
+// iterator must be valid: otherwise nothing is advanced and UINT32_MAX is returned.
+uint32_t posting_list_t::advance_smallest(std::vector<posting_list_t::iterator_t>& its) {
+    uint32_t smallest_value = UINT32_MAX;
+
+    for(auto& it: its) {
+        if(it.valid() && it.id() < smallest_value) {
+            smallest_value = it.id();
+        }
+    }
+
+    // step past the id in every list that contains it, so that the caller sees it only once
+    for(auto& it: its) {
+        if(it.valid() && it.id() == smallest_value) {
+            it.next();
+        }
+    }
+
+    return smallest_value;
+}
+
+// Two-list variant of advance_smallest(): expects both iterators to be valid. On a tie only
+// its[1] is advanced, so equal ids should be handled by the caller (see equals2 / advance_all2).
+uint32_t posting_list_t::advance_smallest2(std::vector<posting_list_t::iterator_t>& its) {
+    auto& smaller = (its[0].id() < its[1].id()) ? its[0] : its[1];
+    const uint32_t smallest_value = smaller.id();
+    smaller.next();
+
+    return smallest_value;
+}
+
 size_t posting_list_t::num_ids() {
     return ids_length;
 }
diff --git a/test/posting_list_test.cpp b/test/posting_list_test.cpp
index 85be7e4c..4f0fca8f 100644
--- a/test/posting_list_test.cpp
+++ b/test/posting_list_test.cpp
@@ -516,6 +516,92 @@ TEST(PostingListTest, RandomInsertAndDeletes) {
     ASSERT_LT(pl.num_blocks(), 1000);
 }
 
+TEST(PostingListTest, MergeBasics) {
+    std::vector<uint32_t> offsets = {0, 1, 3};
+    std::vector<posting_list_t*> lists;
+
+    // [0, 2] [3, 20]
+    // [1, 3], [5, 10], [20]
+    // [2, 3], [5, 7], [20]
+
+    posting_list_t p1(2);
+    p1.upsert(0, offsets);
+    p1.upsert(2, offsets);
+    p1.upsert(3, offsets);
+    p1.upsert(20, offsets);
+
+    posting_list_t p2(2);
+    p2.upsert(1, offsets);
+    p2.upsert(3, offsets);
+    p2.upsert(5, offsets);
+    p2.upsert(10, offsets);
+    p2.upsert(20, offsets);
+
+    posting_list_t p3(2);
+    p3.upsert(2, offsets);
+    p3.upsert(3, offsets);
+    p3.upsert(5, offsets);
+    p3.upsert(7, offsets);
+    p3.upsert(20, offsets);
+
+    lists.push_back(&p1);
+    lists.push_back(&p2);
+    lists.push_back(&p3);
+
+    std::vector<uint32_t> result_ids;
+
+    posting_list_t::merge(lists, result_ids);
+
+    std::vector<uint32_t> expected_ids = {0, 1, 2, 3, 5, 7, 10, 20};
+    ASSERT_EQ(expected_ids.size(), result_ids.size());
+
+    for(size_t i = 0; i < expected_ids.size(); i++) {
+        ASSERT_EQ(expected_ids[i], result_ids[i]);
+    }
+}
+
+TEST(PostingListTest, MergeListsOfUnequalLength) {
+    std::vector<uint32_t> offsets = {0, 1, 3};
+    std::vector<posting_list_t*> lists;
+
+    // the first list runs out while the other two still overlap
+    // [0, 2]
+    // [1, 3], [5, 10]
+    // [2, 3], [5, 7], [20]
+
+    posting_list_t p1(2);
+    p1.upsert(0, offsets);
+    p1.upsert(2, offsets);
+
+    posting_list_t p2(2);
+    p2.upsert(1, offsets);
+    p2.upsert(3, offsets);
+    p2.upsert(5, offsets);
+    p2.upsert(10, offsets);
+
+    posting_list_t p3(2);
+    p3.upsert(2, offsets);
+    p3.upsert(3, offsets);
+    p3.upsert(5, offsets);
+    p3.upsert(7, offsets);
+    p3.upsert(20, offsets);
+
+    lists.push_back(&p1);
+    lists.push_back(&p2);
+    lists.push_back(&p3);
+
+    std::vector<uint32_t> result_ids;
+
+    posting_list_t::merge(lists, result_ids);
+
+    std::vector<uint32_t> expected_ids = {0, 1, 2, 3, 5, 7, 10, 20};
+    ASSERT_EQ(expected_ids.size(), result_ids.size());
+
+    for(size_t i = 0; i < expected_ids.size(); i++) {
+        ASSERT_EQ(expected_ids[i], result_ids[i]);
+    }
+}
+
 TEST(PostingListTest, IntersectionBasics) {
     std::vector<uint32_t> offsets = {0, 1, 3};
     std::vector<posting_list_t*> lists;
@@ -956,7 +1042,7 @@ TEST(PostingListTest, DISABLED_Benchmark) {
     LOG(INFO) << "Time taken for 5 sorted array updates: " << timeMicros;
 }
 
-TEST(PostingListTest, BenchmarkIntersection) {
+TEST(PostingListTest, DISABLED_BenchmarkIntersection) {
     std::vector<uint32_t> offsets = {0, 1, 3};
 
     time_t t;