Add a contains-at-least-one helper for posting lists.

This commit is contained in:
Kishore Nallan 2021-06-15 19:44:43 +05:30
parent 93261178a9
commit c9fd7bc506
5 changed files with 138 additions and 0 deletions

View File

@ -34,6 +34,8 @@ struct compact_posting_list_t {
uint32_t last_id();
uint32_t num_ids() const;
// Returns true if at least one of the `target_ids_size` IDs in `target_ids` is stored in this list.
// NOTE(review): the implementation only ever moves forward through `target_ids`, so the
// array is presumably expected in ascending order -- confirm against callers.
bool contains_atleast_one(const uint32_t* target_ids, size_t target_ids_size);
};
class posting_t {
@ -49,12 +51,16 @@ public:
static void erase(void*& obj, uint32_t id);
static void destroy_list(void*& obj);
static uint32_t num_ids(const void* obj);
static uint32_t first_id(const void* obj);
static bool contains(const void* obj, uint32_t id);
// Returns true if the posting list behind `obj` (compact or full form) holds at least one
// of the `target_ids_size` IDs in `target_ids`; forwards to the matching list implementation.
static bool contains_atleast_one(const void* obj, const uint32_t* target_ids, size_t target_ids_size);
static void merge(const std::vector<void*>& posting_lists, std::vector<uint32_t>& result_ids);
static void intersect(const std::vector<void*>& posting_lists, std::vector<uint32_t>& result_ids);

View File

@ -125,6 +125,8 @@ public:
bool contains(uint32_t id);
// Returns true if at least one of the `target_ids_size` IDs in `target_ids` is stored in this list.
// NOTE(review): the implementation only ever moves forward through `target_ids`, so the
// array is presumably expected in ascending order -- confirm against callers.
bool contains_atleast_one(const uint32_t* target_ids, size_t target_ids_size);
iterator_t new_iterator();
static void merge(const std::vector<posting_list_t*>& posting_lists, std::vector<uint32_t>& result_ids);

View File

@ -213,6 +213,30 @@ bool compact_posting_list_t::contains(uint32_t id) {
return false;
}
/**
 * Reports whether any of the given target IDs is present in this compact posting list.
 *
 * Both the stored records and `target_ids` are walked forward only, always advancing
 * the side that currently holds the smaller ID.
 * NOTE(review): this presumes both sequences are in ascending order -- confirm with callers.
 *
 * @param target_ids       IDs to look for.
 * @param target_ids_size  Number of entries in `target_ids`.
 * @return true as soon as a stored ID equals a target ID; false once either side is exhausted.
 */
bool compact_posting_list_t::contains_atleast_one(const uint32_t* target_ids, size_t target_ids_size) {
    size_t record_pos = 0;   // index of the current record's offset count within id_offsets
    size_t target_pos = 0;

    while(record_pos < length && target_pos < target_ids_size) {
        // Each record is laid out as: [num_offsets][offset_1 .. offset_n][id]
        const size_t offsets_in_record = id_offsets[record_pos];
        const size_t record_id = id_offsets[record_pos + offsets_in_record + 1];

        if(target_ids[target_pos] == record_id) {
            return true;
        }

        if(target_ids[target_pos] > record_id) {
            // stored ID is behind: hop over the offset count, the offsets and the ID itself
            record_pos += offsets_in_record + 2;
            continue;
        }

        // target is behind: move forward until it catches up with (or passes) the stored ID
        do {
            target_pos++;
        } while(target_pos < target_ids_size && target_ids[target_pos] < record_id);
    }

    return false;
}
/* posting operations */
void posting_t::upsert(void*& obj, uint32_t id, const std::vector<uint32_t>& offsets) {
@ -324,6 +348,16 @@ bool posting_t::contains(const void* obj, uint32_t id) {
}
}
bool posting_t::contains_atleast_one(const void* obj, const uint32_t* target_ids, size_t target_ids_size) {
if(IS_COMPACT_POSTING(obj)) {
compact_posting_list_t* list = COMPACT_POSTING_PTR(obj);
return list->contains_atleast_one(target_ids, target_ids_size);
} else {
posting_list_t* list = (posting_list_t*) RAW_POSTING_PTR(obj);
return list->contains_atleast_one(target_ids, target_ids_size);
}
}
void posting_t::merge(const std::vector<void*>& raw_posting_lists, std::vector<uint32_t>& result_ids) {
// we will have to convert the compact posting list (if any) to full form
std::vector<posting_list_t*> plists;
@ -384,3 +418,15 @@ bool posting_t::block_intersect(const std::vector<void*>& raw_posting_lists, siz
return done;
}
void posting_t::destroy_list(void*& obj) {
if(IS_COMPACT_POSTING(obj)) {
compact_posting_list_t* list = COMPACT_POSTING_PTR(obj);
free(list); // assigned via malloc, so must be free()d
} else {
posting_list_t* list = (posting_list_t*) RAW_POSTING_PTR(obj);
delete list;
}
obj = nullptr;
}

View File

@ -886,6 +886,30 @@ bool posting_list_t::contains(uint32_t id) {
return potential_block->contains(id);
}
bool posting_list_t::contains_atleast_one(const uint32_t* target_ids, size_t target_ids_size) {
posting_list_t::iterator_t it = new_iterator();
size_t target_ids_index = 0;
while(target_ids_index < target_ids_size && it.valid()) {
uint32_t id = it.id();
if(id == target_ids[target_ids_index]) {
return true;
} else {
// advance smallest value
if(id > target_ids[target_ids_index]) {
while(target_ids_index < target_ids_size && target_ids[target_ids_index] < id) {
target_ids_index++;
}
} else {
it.skip_to(target_ids[target_ids_index]);
}
}
}
return false;
}
/* iterator_t operations */
posting_list_t::iterator_t::iterator_t(posting_list_t::block_t* root):
@ -956,5 +980,6 @@ posting_list_t::iterator_t::iterator_t(iterator_t&& rhs) noexcept {
ids = rhs.ids;
rhs.curr_block = nullptr;
rhs.uncompressed_block = nullptr;
rhs.ids = nullptr;
}

View File

@ -738,6 +738,42 @@ TEST(PostingListTest, IntersectionSkipBlocks) {
delete [] final_results;
}
// Exercises posting_list_t::contains_atleast_one() for two shapes of input:
// a list much longer than the target array, and a list much shorter than it.
TEST(PostingListTest, PostingListContainsAtleastOne) {
    // Case 1: posting list (IDs 20..999) is larger than the target arrays.
    {
        posting_list_t big_list(100);
        for(size_t id = 20; id < 1000; id++) {
            big_list.upsert(id, {1, 2, 3});
        }

        std::vector<uint32_t> both_present = {200, 300};
        std::vector<uint32_t> one_present = {200, 3000};
        std::vector<uint32_t> none_present = {2000, 3000};

        ASSERT_TRUE(big_list.contains_atleast_one(both_present.data(), both_present.size()));
        ASSERT_TRUE(big_list.contains_atleast_one(one_present.data(), one_present.size()));
        ASSERT_FALSE(big_list.contains_atleast_one(none_present.data(), none_present.size()));
    }

    // Case 2: posting list (IDs 10..19) is smaller than the target arrays.
    {
        posting_list_t small_list(2);
        for(size_t id = 10; id < 20; id++) {
            small_list.upsert(id, {1, 2, 3});
        }

        // targets 5..999 overlap the list
        std::vector<uint32_t> overlapping;
        for(size_t id = 5; id < 1000; id++) {
            overlapping.push_back(id);
        }

        // targets 25..999 start after the list's last ID
        std::vector<uint32_t> disjoint;
        for(size_t id = 25; id < 1000; id++) {
            disjoint.push_back(id);
        }

        ASSERT_TRUE(small_list.contains_atleast_one(overlapping.data(), overlapping.size()));
        ASSERT_FALSE(small_list.contains_atleast_one(disjoint.data(), disjoint.size()));
    }
}
TEST(PostingListTest, CompactPostingListUpsertAppends) {
uint32_t ids[] = {0, 1000, 1002};
uint32_t offset_index[] = {0, 3, 6};
@ -978,6 +1014,29 @@ TEST(PostingListTest, CompactPostingListErase) {
free(list);
}
// Exercises compact_posting_list_t::contains_atleast_one() on a list built via create()
// and on a list that starts empty and is populated through posting_t::upsert().
TEST(PostingListTest, CompactPostingListContainsAtleastOne) {
    uint32_t ids[] = {5, 6, 7, 8};
    uint32_t offset_index[] = {0, 3, 6, 9};
    uint32_t offsets[] = {0, 3, 4, 0, 3, 4, 0, 3, 4, 0, 3, 4};

    std::vector<uint32_t> target_ids1 = {4, 7, 11};     // 7 is in the list
    std::vector<uint32_t> target_ids2 = {2, 3, 4, 20};  // nothing in common with 5..8

    compact_posting_list_t* list1 = compact_posting_list_t::create(4, ids, offset_index, 12, offsets);
    ASSERT_TRUE(list1->contains_atleast_one(&target_ids1[0], target_ids1.size()));
    ASSERT_FALSE(list1->contains_atleast_one(&target_ids2[0], target_ids2.size()));

    // compact lists are malloc-backed, so release with free() to avoid leaking the fixture
    free(list1);

    // calloc (rather than malloc) so that every header field of the empty list starts at
    // zero; upsert() would otherwise read indeterminate values from the fresh allocation
    compact_posting_list_t* list2 = static_cast<compact_posting_list_t*>(calloc(1, sizeof(compact_posting_list_t)));
    void* obj = SET_COMPACT_POSTING(list2);

    // upsert takes `obj` by reference, so always go through `obj` (not `list2`) afterwards
    posting_t::upsert(obj, 3, {1, 5});

    std::vector<uint32_t> target_ids3 = {1, 2, 3, 4, 100};  // 3 is in the list
    std::vector<uint32_t> target_ids4 = {4, 5, 6, 100};     // 3 is absent

    ASSERT_TRUE(COMPACT_POSTING_PTR(obj)->contains_atleast_one(&target_ids3[0], target_ids3.size()));
    ASSERT_FALSE(COMPACT_POSTING_PTR(obj)->contains_atleast_one(&target_ids4[0], target_ids4.size()));

    // releases whichever representation `obj` currently points at and nulls it
    posting_t::destroy_list(obj);
}
TEST(PostingListTest, DISABLED_Benchmark) {
std::vector<uint32_t> offsets = {0, 1, 3};
posting_list_t pl(4096);