Add contains for posting list.

This commit is contained in:
Kishore Nallan 2021-06-15 12:58:45 +05:30
parent 70f970b80c
commit ef1badb077
5 changed files with 91 additions and 21 deletions

View File

@ -23,6 +23,8 @@ struct compact_posting_list_t {
posting_list_t* to_full_posting_list();
// Returns true when `id` is stored in this compact posting list, false otherwise.
bool contains(uint32_t id);
int64_t upsert(uint32_t id, const std::vector<uint32_t>& offsets);
int64_t upsert(uint32_t id, const uint32_t* offsets, uint32_t num_offsets);
@ -51,7 +53,16 @@ public:
static uint32_t first_id(const void* obj);
// Returns true when `id` is present in the posting list referred to by `obj`.
// `obj` may refer to either a compact or a full posting list; the check is
// forwarded to the matching list type.
static bool contains(const void* obj, uint32_t id);
static void merge(const std::vector<void*>& posting_lists, std::vector<uint32_t>& result_ids);
static void intersect(const std::vector<void*>& posting_lists, std::vector<uint32_t>& result_ids);
// Expands any compact lists in `posting_lists` to full posting lists and then
// delegates to posting_list_t::block_intersect with the same `batch_size`,
// `its` and `iter_state`. Returns the value reported by that call; the
// temporary expanded lists are released only when it returns true.
static bool block_intersect(
const std::vector<void*>& posting_lists,
size_t batch_size,
std::vector<posting_list_t::iterator_t>& its,
posting_list_t::result_iter_state_t& iter_state
);
};

View File

@ -29,7 +29,7 @@ public:
// link to next block
block_t* next = nullptr;
// Grows `offset_index` by one slot at `index` (the new slot keeps the value
// previously stored at `index`) and adds `num_offsets` to every entry after it.
void insert_and_shift_offset_index(uint32_t index, uint32_t num_offsets);
// Returns whether `id` is held in this block's `ids` container.
bool contains(uint32_t id);
void remove_and_shift_offset_index(const uint32_t* indices_sorted, uint32_t num_indices);
@ -123,6 +123,8 @@ public:
block_t* block_of(last_id_t id);
// Returns true when `id` is present in this posting list. Only the single
// block selected through `id_block_map` is inspected.
bool contains(uint32_t id);
iterator_t new_iterator();
static void merge(const std::vector<posting_list_t*>& posting_lists, std::vector<uint32_t>& result_ids);

View File

@ -192,6 +192,27 @@ uint32_t compact_posting_list_t::first_id() {
return id_offsets[id_offsets[0] + 1];
}
// Reports whether `id` is stored in this compact posting list.
//
// Entries are read back-to-back from `id_offsets`: the first slot of an entry
// holds its offset count, and the entry's id sits `count + 1` slots further on.
bool compact_posting_list_t::contains(uint32_t id) {
    for(size_t pos = 0; pos < length; ) {
        const size_t offsets_count = id_offsets[pos];
        const size_t current_id = id_offsets[pos + offsets_count + 1];

        if(current_id == id) {
            return true;
        }

        if(current_id > id) {
            // already past the place where `id` would be, so give up early
            // (this early exit relies on ids being stored in ascending order)
            return false;
        }

        // skip the count slot, the offsets and the id slot of this entry
        pos += offsets_count + 2;
    }

    return false;
}
/* posting operations */
void posting_t::upsert(void*& obj, uint32_t id, const std::vector<uint32_t>& offsets) {
@ -293,6 +314,16 @@ uint32_t posting_t::first_id(const void* obj) {
}
}
bool posting_t::contains(const void* obj, uint32_t id) {
if(IS_COMPACT_POSTING(obj)) {
compact_posting_list_t* list = COMPACT_POSTING_PTR(obj);
return list->contains(id);
} else {
posting_list_t* list = (posting_list_t*) RAW_POSTING_PTR(obj);
return list->contains(id);
}
}
void posting_t::merge(const std::vector<void*>& raw_posting_lists, std::vector<uint32_t>& result_ids) {
// we will have to convert the compact posting list (if any) to full form
std::vector<posting_list_t*> plists;
@ -334,3 +365,22 @@ void posting_t::to_expanded_plists(const std::vector<void*>& raw_posting_lists,
}
}
}
// Runs posting_list_t::block_intersect over `raw_posting_lists` after turning
// every compact list among them into a full posting list.
//
// Returns whatever the underlying block_intersect call reports. The lists that
// were expanded for this call are deleted only when that result is true.
// NOTE(review): when the result is false the expanded lists are not freed
// here -- confirm who owns them between calls.
bool posting_t::block_intersect(const std::vector<void*>& raw_posting_lists, size_t batch_size,
                                std::vector<posting_list_t::iterator_t>& its,
                                posting_list_t::result_iter_state_t& iter_state) {
    std::vector<posting_list_t*> full_lists;
    std::vector<uint32_t> expanded_indices;   // positions in `full_lists` created by the expansion
    to_expanded_plists(raw_posting_lists, full_lists, expanded_indices);

    const bool finished = posting_list_t::block_intersect(full_lists, batch_size, its, iter_state);

    if(!finished) {
        return false;
    }

    // release only the lists that were created by the expansion above
    for(size_t i = 0; i < expanded_indices.size(); i++) {
        delete full_lists[expanded_indices[i]];
    }

    return true;
}

View File

@ -4,26 +4,6 @@
/* block_t operations */
// Inserts one extra slot into `offset_index` at position `index` and bumps
// every entry after that slot by `num_offsets`.
//
// The new slot keeps the value that was stored at `index` before the call;
// the old entry moves to `index + 1` and is increased like all later entries.
void posting_list_t::block_t::insert_and_shift_offset_index(const uint32_t index, const uint32_t num_offsets) {
    const uint32_t value_at_index = offset_index.at(index);
    const uint32_t old_length = offset_index.getLength();
    const uint32_t grown_length = old_length + 1;

    // working copy with room for one more element
    uint32_t* values = offset_index.uncompress(grown_length);

    // move the tail [index, old_length) one slot to the right
    memmove(values + index + 1, values + index, sizeof(uint32_t) * (old_length - index));
    values[index] = value_at_index;

    // everything after the inserted slot now starts `num_offsets` later
    for(uint32_t i = index + 1; i < grown_length; i++) {
        values[i] += num_offsets;
    }

    offset_index.load(values, grown_length);

    delete [] values;
}
uint32_t posting_list_t::block_t::upsert(const uint32_t id, const std::vector<uint32_t>& positions) {
if(id <= ids.last()) {
// we have to check if `id` already exists, for an opportunity to do in-place updates
@ -198,6 +178,10 @@ void posting_list_t::block_t::remove_and_shift_offset_index(const uint32_t* indi
delete[] new_array;
}
// Tells whether this block holds `id`, by asking the block's `ids` container.
bool posting_list_t::block_t::contains(uint32_t id) {
    const bool present = ids.contains(id);
    return present;
}
/* posting_list_t operations */
posting_list_t::posting_list_t(uint16_t max_block_elements): BLOCK_MAX_ELEMENTS(max_block_elements) {
@ -891,6 +875,17 @@ size_t posting_list_t::num_ids() {
return ids_length;
}
bool posting_list_t::contains(uint32_t id) {
const auto it = id_block_map.lower_bound(id);
if(it == id_block_map.end()) {
return false;
}
block_t* potential_block = it->second;
return potential_block->contains(id);
}
/* iterator_t operations */
posting_list_t::iterator_t::iterator_t(posting_list_t::block_t* root):

View File

@ -281,6 +281,7 @@ TEST(PostingListTest, RemovalsOnFirstBlock) {
// try to erase when posting list is empty
pl.erase(0);
ASSERT_FALSE(pl.contains(0));
ASSERT_EQ(0, pl.num_ids());
ASSERT_EQ(0, pl.num_blocks());
@ -305,6 +306,11 @@ TEST(PostingListTest, RemovalsOnFirstBlock) {
ASSERT_EQ(2, pl.num_blocks());
ASSERT_EQ(6, pl.num_ids());
ASSERT_TRUE(pl.contains(2));
ASSERT_TRUE(pl.contains(5));
ASSERT_FALSE(pl.contains(6));
ASSERT_FALSE(pl.contains(1000));
// delete non-existing element
pl.erase(1000);
ASSERT_EQ(6, pl.num_ids());
@ -743,6 +749,12 @@ TEST(PostingListTest, CompactPostingListUpsertAppends) {
ASSERT_EQ(1002, list->last_id());
ASSERT_EQ(3, list->num_ids());
ASSERT_TRUE(list->contains(0));
ASSERT_TRUE(list->contains(1000));
ASSERT_TRUE(list->contains(1002));
ASSERT_FALSE(list->contains(500));
ASSERT_FALSE(list->contains(2));
// no-op since the container expects resizing to be done outside
list->upsert(1003, {1, 2});
ASSERT_EQ(15, list->length);