Posting list merge.

This commit is contained in:
Kishore Nallan 2021-06-14 22:05:24 +05:30
parent 34ad1747e1
commit 267aabb701
5 changed files with 178 additions and 15 deletions

View File

@ -38,6 +38,9 @@ class posting_t {
private:
static constexpr size_t COMPACT_LIST_THRESHOLD_LENGTH = 64;
static void to_expanded_plists(const std::vector<void*>& raw_posting_lists, std::vector<posting_list_t*>& plists,
std::vector<uint32_t>& expanded_plist_indices);
public:
static void upsert(void*& obj, uint32_t id, const std::vector<uint32_t>& offsets);
@ -48,5 +51,7 @@ public:
static uint32_t first_id(const void* obj);
static void merge(const std::vector<void*>& posting_lists, std::vector<uint32_t>& result_ids);
static void intersect(const std::vector<void*>& posting_lists, std::vector<uint32_t>& result_ids);
};

View File

@ -95,8 +95,11 @@ private:
static void advance_all(std::vector<posting_list_t::iterator_t>& its);
static void advance_all2(std::vector<posting_list_t::iterator_t>& its);
static void advance_least(std::vector<posting_list_t::iterator_t>& its);
static void advance_least2(std::vector<posting_list_t::iterator_t>& its);
static void advance_non_largest(std::vector<posting_list_t::iterator_t>& its);
static void advance_non_largest2(std::vector<posting_list_t::iterator_t>& its);
static uint32_t advance_smallest(std::vector<posting_list_t::iterator_t>& its);
static uint32_t advance_smallest2(std::vector<posting_list_t::iterator_t>& its);
public:
@ -122,6 +125,8 @@ public:
iterator_t new_iterator();
static void merge(const std::vector<posting_list_t*>& posting_lists, std::vector<uint32_t>& result_ids);
static void intersect(const std::vector<posting_list_t*>& posting_lists, std::vector<uint32_t>& result_ids);
static bool block_intersect(

View File

@ -293,11 +293,34 @@ uint32_t posting_t::first_id(const void* obj) {
}
}
void posting_t::merge(const std::vector<void*>& raw_posting_lists, std::vector<uint32_t>& result_ids) {
// we will have to convert the compact posting list (if any) to full form
std::vector<posting_list_t*> plists;
std::vector<uint32_t> expanded_plist_indices;
to_expanded_plists(raw_posting_lists, plists, expanded_plist_indices);
posting_list_t::merge(plists, result_ids);
for(uint32_t expanded_plist_index: expanded_plist_indices) {
delete plists[expanded_plist_index];
}
}
void posting_t::intersect(const std::vector<void*>& raw_posting_lists, std::vector<uint32_t>& result_ids) {
// we will have to convert the compact posting list (if any) to full form
std::vector<posting_list_t*> plists;
std::vector<uint32_t> expanded_plist_indices;
to_expanded_plists(raw_posting_lists, plists, expanded_plist_indices);
posting_list_t::intersect(plists, result_ids);
for(uint32_t expanded_plist_index: expanded_plist_indices) {
delete plists[expanded_plist_index];
}
}
void posting_t::to_expanded_plists(const std::vector<void*>& raw_posting_lists, std::vector<posting_list_t*>& plists,
std::vector<uint32_t>& expanded_plist_indices) {
for(size_t i = 0; i < raw_posting_lists.size(); i++) {
auto raw_posting_list = raw_posting_lists[i];
@ -310,10 +333,4 @@ void posting_t::intersect(const std::vector<void*>& raw_posting_lists, std::vect
plists.emplace_back(full_posting_list);
}
}
posting_list_t::intersect(plists, result_ids);
for(uint32_t expanded_plist_index: expanded_plist_indices) {
delete plists[expanded_plist_index];
}
}

View File

@ -530,6 +530,65 @@ posting_list_t::block_t* posting_list_t::block_of(last_id_t id) {
return nullptr;
}
/**
 * Merges (unions) the ids of the given posting lists and appends them to `result_ids`.
 *
 * Every distinct id is emitted once, smallest first. This assumes that each list's iterator
 * yields its ids in ascending order.
 *
 * @param posting_lists  Lists to merge. May be empty, in which case nothing is appended.
 * @param result_ids     Output vector; merged ids are appended to whatever it already holds.
 */
void posting_list_t::merge(const std::vector<posting_list_t*>& posting_lists, std::vector<uint32_t>& result_ids) {
    auto its = std::vector<posting_list_t::iterator_t>();
    its.reserve(posting_lists.size());

    size_t sum_sizes = 0;

    for(const auto& posting_list: posting_lists) {
        its.push_back(posting_list->new_iterator());
        sum_sizes += posting_list->num_ids();
    }

    if(its.empty()) {
        // nothing to merge; also keeps the code below from touching its[0]
        return;
    }

    // upper bound on the number of ids we can append (reached when the lists are disjoint)
    result_ids.reserve(result_ids.size() + sum_sizes);

    size_t num_lists = its.size();

    switch (num_lists) {
        case 2:
            while(!at_end2(its)) {
                if(equals2(its)) {
                    result_ids.push_back(its[0].id());
                    advance_all2(its);
                } else {
                    uint32_t smallest_value = advance_smallest2(its);
                    result_ids.push_back(smallest_value);
                }
            }

            // with two lists at most one of them can still hold ids, so draining in sequence is safe
            while(its[0].valid()) {
                result_ids.push_back(its[0].id());
                its[0].next();
            }

            while(its[1].valid()) {
                result_ids.push_back(its[1].id());
                its[1].next();
            }

            break;
        default:
            // fast path: runs for as long as `at_end` allows
            while(!at_end(its)) {
                if(equals(its)) {
                    result_ids.push_back(its[0].id());
                    advance_all(its);
                } else {
                    uint32_t smallest_value = advance_smallest(its);
                    result_ids.push_back(smallest_value);
                }
            }

            // Several iterators may still hold ids at this point. Draining them one after the other
            // would produce out-of-order and duplicated ids, so we keep merging across whichever
            // iterators are still valid.
            while(true) {
                bool found = false;
                uint32_t smallest_value = 0;

                for(auto& it: its) {
                    if(it.valid() && (!found || it.id() < smallest_value)) {
                        smallest_value = it.id();
                        found = true;
                    }
                }

                if(!found) {
                    // every iterator is exhausted
                    break;
                }

                result_ids.push_back(smallest_value);

                // advance every iterator sitting on the emitted id so that it is emitted only once
                for(auto& it: its) {
                    if(it.valid() && it.id() == smallest_value) {
                        it.next();
                    }
                }
            }
    }
}
// Inspired by: https://stackoverflow.com/a/25509185/131050
void posting_list_t::intersect(const std::vector<posting_list_t*>& posting_lists, std::vector<uint32_t>& result_ids) {
auto its = std::vector<posting_list_t::iterator_t>();
@ -549,7 +608,7 @@ void posting_list_t::intersect(const std::vector<posting_list_t*>& posting_lists
result_ids.push_back(its[0].id());
advance_all2(its);
} else {
advance_least2(its);
advance_non_largest2(its);
}
}
break;
@ -560,7 +619,7 @@ void posting_list_t::intersect(const std::vector<posting_list_t*>& posting_lists
result_ids.push_back(its[0].id());
advance_all(its);
} else {
advance_least(its);
advance_non_largest(its);
}
}
}
@ -604,7 +663,7 @@ bool posting_list_t::block_intersect(const std::vector<posting_list_t*>& posting
advance_all2(its);
} else {
advance_least2(its);
advance_non_largest2(its);
}
if(iter_state.ids.size() == batch_size) {
@ -631,7 +690,7 @@ bool posting_list_t::block_intersect(const std::vector<posting_list_t*>& posting
advance_all(its);
} else {
advance_least(its);
advance_non_largest(its);
}
if(iter_state.ids.size() == batch_size) {
@ -770,7 +829,7 @@ void posting_list_t::advance_all2(std::vector<posting_list_t::iterator_t>& its)
its[1].next();
}
void posting_list_t::advance_least(std::vector<posting_list_t::iterator_t>& its) {
void posting_list_t::advance_non_largest(std::vector<posting_list_t::iterator_t>& its) {
// we will find the iter with greatest value and then advance the rest until their value catches up
uint32_t greatest_value = 0;
@ -787,7 +846,7 @@ void posting_list_t::advance_least(std::vector<posting_list_t::iterator_t>& its)
}
}
void posting_list_t::advance_least2(std::vector<posting_list_t::iterator_t>& its) {
void posting_list_t::advance_non_largest2(std::vector<posting_list_t::iterator_t>& its) {
if(its[0].id() > its[1].id()) {
its[1].skip_to(its[0].id());
} else {
@ -795,6 +854,39 @@ void posting_list_t::advance_least2(std::vector<posting_list_t::iterator_t>& its
}
}
/**
 * Advances every iterator that currently points at the smallest id and returns that id.
 *
 * `id()` is called on every iterator without a validity check. For an empty `its` the
 * return value is UINT32_MAX.
 *
 * @param its  Iterators to inspect; those sitting on the smallest id are moved forward by one.
 * @return     The smallest id found before advancing.
 */
uint32_t posting_list_t::advance_smallest(std::vector<posting_list_t::iterator_t>& its) {
    // first pass: locate the smallest current id
    uint32_t min_id = UINT32_MAX;

    for(auto& it: its) {
        if(it.id() < min_id) {
            min_id = it.id();
        }
    }

    // second pass: step past that id on every iterator that holds it
    for(auto& it: its) {
        if(it.id() == min_id) {
            it.next();
        }
    }

    return min_id;
}
/**
 * Two-iterator variant of `advance_smallest`: advances the iterator holding the smaller id
 * and returns that id.
 *
 * When both ids are equal only `its[1]` is advanced.
 *
 * @param its  Vector whose first two iterators are compared.
 * @return     The id of the iterator that was advanced, read before advancing.
 */
uint32_t posting_list_t::advance_smallest2(std::vector<posting_list_t::iterator_t>& its) {
    // pick its[0] only when it is strictly smaller; otherwise its[1]
    auto& chosen = (its[0].id() < its[1].id()) ? its[0] : its[1];

    const uint32_t smallest_value = chosen.id();
    chosen.next();

    return smallest_value;
}
size_t posting_list_t::num_ids() {
return ids_length;
}

View File

@ -516,6 +516,50 @@ TEST(PostingListTest, RandomInsertAndDeletes) {
ASSERT_LT(pl.num_blocks(), 1000);
}
// Merging three overlapping posting lists must yield every distinct id exactly once, in ascending order.
TEST(PostingListTest, MergeBasics) {
    std::vector<uint32_t> offsets = {0, 1, 3};

    // expected grouping of the ids in each list (each list is constructed with 2):
    // p1: [0, 2] [3, 20]
    // p2: [1, 3], [5, 10], [20]
    // p3: [2, 3], [5, 7], [20]

    posting_list_t p1(2);
    for(uint32_t id: {0, 2, 3, 20}) {
        p1.upsert(id, offsets);
    }

    posting_list_t p2(2);
    for(uint32_t id: {1, 3, 5, 10, 20}) {
        p2.upsert(id, offsets);
    }

    posting_list_t p3(2);
    for(uint32_t id: {2, 3, 5, 7, 20}) {
        p3.upsert(id, offsets);
    }

    std::vector<posting_list_t*> lists = {&p1, &p2, &p3};

    std::vector<uint32_t> result_ids;
    posting_list_t::merge(lists, result_ids);

    std::vector<uint32_t> expected_ids = {0, 1, 2, 3, 5, 7, 10, 20};
    ASSERT_EQ(expected_ids.size(), result_ids.size());

    size_t pos = 0;
    for(uint32_t expected_id: expected_ids) {
        ASSERT_EQ(expected_id, result_ids[pos]);
        pos++;
    }
}
TEST(PostingListTest, IntersectionBasics) {
std::vector<uint32_t> offsets = {0, 1, 3};
std::vector<posting_list_t*> lists;
@ -956,7 +1000,7 @@ TEST(PostingListTest, DISABLED_Benchmark) {
LOG(INFO) << "Time taken for 5 sorted array updates: " << timeMicros;
}
TEST(PostingListTest, BenchmarkIntersection) {
TEST(PostingListTest, DISABLED_BenchmarkIntersection) {
std::vector<uint32_t> offsets = {0, 1, 3};
time_t t;