mirror of
https://github.com/typesense/typesense.git
synced 2025-05-20 05:32:30 +08:00
Posting list merge.
This commit is contained in:
parent
36580dfb62
commit
70f970b80c
@ -38,6 +38,9 @@ class posting_t {
|
||||
private:
|
||||
static constexpr size_t COMPACT_LIST_THRESHOLD_LENGTH = 64;
|
||||
|
||||
static void to_expanded_plists(const std::vector<void*>& raw_posting_lists, std::vector<posting_list_t*>& plists,
|
||||
std::vector<uint32_t>& expanded_plist_indices);
|
||||
|
||||
public:
|
||||
|
||||
static void upsert(void*& obj, uint32_t id, const std::vector<uint32_t>& offsets);
|
||||
@ -48,5 +51,7 @@ public:
|
||||
|
||||
static uint32_t first_id(const void* obj);
|
||||
|
||||
static void merge(const std::vector<void*>& posting_lists, std::vector<uint32_t>& result_ids);
|
||||
|
||||
static void intersect(const std::vector<void*>& posting_lists, std::vector<uint32_t>& result_ids);
|
||||
};
|
@ -95,8 +95,11 @@ private:
|
||||
static void advance_all(std::vector<posting_list_t::iterator_t>& its);
|
||||
static void advance_all2(std::vector<posting_list_t::iterator_t>& its);
|
||||
|
||||
static void advance_least(std::vector<posting_list_t::iterator_t>& its);
|
||||
static void advance_least2(std::vector<posting_list_t::iterator_t>& its);
|
||||
static void advance_non_largest(std::vector<posting_list_t::iterator_t>& its);
|
||||
static void advance_non_largest2(std::vector<posting_list_t::iterator_t>& its);
|
||||
|
||||
static uint32_t advance_smallest(std::vector<posting_list_t::iterator_t>& its);
|
||||
static uint32_t advance_smallest2(std::vector<posting_list_t::iterator_t>& its);
|
||||
|
||||
public:
|
||||
|
||||
@ -122,6 +125,8 @@ public:
|
||||
|
||||
iterator_t new_iterator();
|
||||
|
||||
static void merge(const std::vector<posting_list_t*>& posting_lists, std::vector<uint32_t>& result_ids);
|
||||
|
||||
static void intersect(const std::vector<posting_list_t*>& posting_lists, std::vector<uint32_t>& result_ids);
|
||||
|
||||
static bool block_intersect(
|
||||
|
@ -293,11 +293,34 @@ uint32_t posting_t::first_id(const void* obj) {
|
||||
}
|
||||
}
|
||||
|
||||
void posting_t::merge(const std::vector<void*>& raw_posting_lists, std::vector<uint32_t>& result_ids) {
|
||||
// we will have to convert the compact posting list (if any) to full form
|
||||
std::vector<posting_list_t*> plists;
|
||||
std::vector<uint32_t> expanded_plist_indices;
|
||||
to_expanded_plists(raw_posting_lists, plists, expanded_plist_indices);
|
||||
|
||||
posting_list_t::merge(plists, result_ids);
|
||||
|
||||
for(uint32_t expanded_plist_index: expanded_plist_indices) {
|
||||
delete plists[expanded_plist_index];
|
||||
}
|
||||
}
|
||||
|
||||
void posting_t::intersect(const std::vector<void*>& raw_posting_lists, std::vector<uint32_t>& result_ids) {
|
||||
// we will have to convert the compact posting list (if any) to full form
|
||||
std::vector<posting_list_t*> plists;
|
||||
std::vector<uint32_t> expanded_plist_indices;
|
||||
to_expanded_plists(raw_posting_lists, plists, expanded_plist_indices);
|
||||
|
||||
posting_list_t::intersect(plists, result_ids);
|
||||
|
||||
for(uint32_t expanded_plist_index: expanded_plist_indices) {
|
||||
delete plists[expanded_plist_index];
|
||||
}
|
||||
}
|
||||
|
||||
void posting_t::to_expanded_plists(const std::vector<void*>& raw_posting_lists, std::vector<posting_list_t*>& plists,
|
||||
std::vector<uint32_t>& expanded_plist_indices) {
|
||||
for(size_t i = 0; i < raw_posting_lists.size(); i++) {
|
||||
auto raw_posting_list = raw_posting_lists[i];
|
||||
|
||||
@ -310,10 +333,4 @@ void posting_t::intersect(const std::vector<void*>& raw_posting_lists, std::vect
|
||||
plists.emplace_back(full_posting_list);
|
||||
}
|
||||
}
|
||||
|
||||
posting_list_t::intersect(plists, result_ids);
|
||||
|
||||
for(uint32_t expanded_plist_index: expanded_plist_indices) {
|
||||
delete plists[expanded_plist_index];
|
||||
}
|
||||
}
|
||||
|
@ -530,6 +530,65 @@ posting_list_t::block_t* posting_list_t::block_of(last_id_t id) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
|
||||
/**
 * Computes the union of the ids held by `posting_lists` and appends it to `result_ids`.
 * An id that is present in several lists is emitted only once.
 *
 * NOTE(review): the output is only sorted if every iterator yields its ids in ascending
 * order - this is assumed, confirm against the iterator implementation.
 *
 * @param posting_lists lists to merge; each one is walked through its own iterator
 * @param result_ids    output vector receiving the merged ids
 */
void posting_list_t::merge(const std::vector<posting_list_t*>& posting_lists, std::vector<uint32_t>& result_ids) {
    auto its = std::vector<posting_list_t::iterator_t>();
    its.reserve(posting_lists.size());

    size_t sum_sizes = 0;

    for(const auto& posting_list: posting_lists) {
        its.push_back(posting_list->new_iterator());
        sum_sizes += posting_list->num_ids();
    }

    // upper bound: the union can never hold more ids than all lists put together
    result_ids.reserve(sum_sizes);
    size_t num_lists = its.size();

    switch (num_lists) {
        case 2:
            while(!at_end2(its)) {
                if(equals2(its)) {
                    result_ids.push_back(its[0].id());
                    advance_all2(its);
                } else {
                    uint32_t smallest_value = advance_smallest2(its);
                    result_ids.push_back(smallest_value);
                }
            }

            // with two lists, whatever is left over can be appended as-is
            while(its[0].valid()) {
                result_ids.push_back(its[0].id());
                its[0].next();
            }

            while(its[1].valid()) {
                result_ids.push_back(its[1].id());
                its[1].next();
            }

            break;
        default:
            // NOTE(review): at_end() is presumed to report true as soon as any iterator is
            // exhausted (mirroring the two-list branch) - confirm against its definition.
            while(!at_end(its)) {
                if(equals(its)) {
                    result_ids.push_back(its[0].id());
                    advance_all(its);
                } else {
                    uint32_t smallest_value = advance_smallest(its);
                    result_ids.push_back(smallest_value);
                }
            }

            // Several iterators can still hold ids at this point and those ids may interleave or
            // repeat across lists, so appending the leftovers list by list would produce unsorted
            // and duplicated output. Keep picking the smallest id among the iterators that are
            // still valid until every one of them is exhausted.
            while(true) {
                bool has_candidate = false;
                uint32_t smallest_value = 0;

                for(auto& it: its) {
                    if(it.valid() && (!has_candidate || it.id() < smallest_value)) {
                        smallest_value = it.id();
                        has_candidate = true;
                    }
                }

                if(!has_candidate) {
                    break;
                }

                result_ids.push_back(smallest_value);

                // advance every iterator sitting on the emitted id so it is not emitted again
                for(auto& it: its) {
                    if(it.valid() && it.id() == smallest_value) {
                        it.next();
                    }
                }
            }
    }
}
|
||||
|
||||
// Inspired by: https://stackoverflow.com/a/25509185/131050
|
||||
void posting_list_t::intersect(const std::vector<posting_list_t*>& posting_lists, std::vector<uint32_t>& result_ids) {
|
||||
auto its = std::vector<posting_list_t::iterator_t>();
|
||||
@ -549,7 +608,7 @@ void posting_list_t::intersect(const std::vector<posting_list_t*>& posting_lists
|
||||
result_ids.push_back(its[0].id());
|
||||
advance_all2(its);
|
||||
} else {
|
||||
advance_least2(its);
|
||||
advance_non_largest2(its);
|
||||
}
|
||||
}
|
||||
break;
|
||||
@ -560,7 +619,7 @@ void posting_list_t::intersect(const std::vector<posting_list_t*>& posting_lists
|
||||
result_ids.push_back(its[0].id());
|
||||
advance_all(its);
|
||||
} else {
|
||||
advance_least(its);
|
||||
advance_non_largest(its);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -604,7 +663,7 @@ bool posting_list_t::block_intersect(const std::vector<posting_list_t*>& posting
|
||||
|
||||
advance_all2(its);
|
||||
} else {
|
||||
advance_least2(its);
|
||||
advance_non_largest2(its);
|
||||
}
|
||||
|
||||
if(iter_state.ids.size() == batch_size) {
|
||||
@ -631,7 +690,7 @@ bool posting_list_t::block_intersect(const std::vector<posting_list_t*>& posting
|
||||
|
||||
advance_all(its);
|
||||
} else {
|
||||
advance_least(its);
|
||||
advance_non_largest(its);
|
||||
}
|
||||
|
||||
if(iter_state.ids.size() == batch_size) {
|
||||
@ -770,7 +829,7 @@ void posting_list_t::advance_all2(std::vector<posting_list_t::iterator_t>& its)
|
||||
its[1].next();
|
||||
}
|
||||
|
||||
void posting_list_t::advance_least(std::vector<posting_list_t::iterator_t>& its) {
|
||||
void posting_list_t::advance_non_largest(std::vector<posting_list_t::iterator_t>& its) {
|
||||
// we will find the iter with greatest value and then advance the rest until their value catches up
|
||||
uint32_t greatest_value = 0;
|
||||
|
||||
@ -787,7 +846,7 @@ void posting_list_t::advance_least(std::vector<posting_list_t::iterator_t>& its)
|
||||
}
|
||||
}
|
||||
|
||||
void posting_list_t::advance_least2(std::vector<posting_list_t::iterator_t>& its) {
|
||||
void posting_list_t::advance_non_largest2(std::vector<posting_list_t::iterator_t>& its) {
|
||||
if(its[0].id() > its[1].id()) {
|
||||
its[1].skip_to(its[0].id());
|
||||
} else {
|
||||
@ -795,6 +854,39 @@ void posting_list_t::advance_least2(std::vector<posting_list_t::iterator_t>& its
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Finds the smallest id the given iterators currently point at, moves every iterator
 * positioned on that id one step forward and returns the id.
 *
 * Returns UINT32_MAX when `its` is empty, since that is the initial value of the search.
 */
uint32_t posting_list_t::advance_smallest(std::vector<posting_list_t::iterator_t>& its) {
    uint32_t min_id = UINT32_MAX;

    // first pass: locate the minimum
    for(auto& it: its) {
        if(it.id() < min_id) {
            min_id = it.id();
        }
    }

    // second pass: step past the minimum wherever it occurs
    for(auto& it: its) {
        if(it.id() == min_id) {
            it.next();
        }
    }

    return min_id;
}
|
||||
|
||||
/**
 * Two-iterator variant: advances whichever of `its[0]` / `its[1]` points at the smaller id
 * and returns that id. When both ids are equal, the second iterator is the one advanced.
 */
uint32_t posting_list_t::advance_smallest2(std::vector<posting_list_t::iterator_t>& its) {
    // strict "<" keeps the tie-break on the second iterator
    auto& lagging = (its[0].id() < its[1].id()) ? its[0] : its[1];

    uint32_t smallest_value = lagging.id();
    lagging.next();

    return smallest_value;
}
|
||||
|
||||
size_t posting_list_t::num_ids() {
|
||||
return ids_length;
|
||||
}
|
||||
|
@ -516,6 +516,50 @@ TEST(PostingListTest, RandomInsertAndDeletes) {
|
||||
ASSERT_LT(pl.num_blocks(), 1000);
|
||||
}
|
||||
|
||||
// Merging three overlapping posting lists must yield every distinct id exactly once, in order.
TEST(PostingListTest, MergeBasics) {
    std::vector<uint32_t> offsets = {0, 1, 3};

    // inserts the ids one by one, in the order given, always with the same offsets
    auto fill = [&offsets](posting_list_t& plist, const std::vector<uint32_t>& ids) {
        for(uint32_t id: ids) {
            plist.upsert(id, offsets);
        }
    };

    // intended layout of each list (as noted by the original author):
    // [0, 2] [3, 20]
    // [1, 3], [5, 10], [20]
    // [2, 3], [5, 7], [20]

    posting_list_t p1(2);
    fill(p1, {0, 2, 3, 20});

    posting_list_t p2(2);
    fill(p2, {1, 3, 5, 10, 20});

    posting_list_t p3(2);
    fill(p3, {2, 3, 5, 7, 20});

    std::vector<posting_list_t*> lists = {&p1, &p2, &p3};
    std::vector<uint32_t> result_ids;

    posting_list_t::merge(lists, result_ids);

    std::vector<uint32_t> expected_ids = {0, 1, 2, 3, 5, 7, 10, 20};
    ASSERT_EQ(expected_ids.size(), result_ids.size());

    size_t pos = 0;
    while(pos < expected_ids.size()) {
        ASSERT_EQ(expected_ids[pos], result_ids[pos]);
        ++pos;
    }
}
|
||||
|
||||
TEST(PostingListTest, IntersectionBasics) {
|
||||
std::vector<uint32_t> offsets = {0, 1, 3};
|
||||
std::vector<posting_list_t*> lists;
|
||||
@ -956,7 +1000,7 @@ TEST(PostingListTest, DISABLED_Benchmark) {
|
||||
LOG(INFO) << "Time taken for 5 sorted array updates: " << timeMicros;
|
||||
}
|
||||
|
||||
TEST(PostingListTest, BenchmarkIntersection) {
|
||||
TEST(PostingListTest, DISABLED_BenchmarkIntersection) {
|
||||
std::vector<uint32_t> offsets = {0, 1, 3};
|
||||
|
||||
time_t t;
|
||||
|
Loading…
x
Reference in New Issue
Block a user