diff --git a/include/posting.h b/include/posting.h
index 1b5392d9..094c3853 100644
--- a/include/posting.h
+++ b/include/posting.h
@@ -34,6 +34,8 @@ struct compact_posting_list_t {
     uint32_t last_id();
 
     uint32_t num_ids() const;
+
+    bool contains_atleast_one(const uint32_t* target_ids, size_t target_ids_size);
 };
 
 class posting_t {
@@ -49,12 +51,16 @@ public:
 
     static void erase(void*& obj, uint32_t id);
 
+    static void destroy_list(void*& obj);
+
     static uint32_t num_ids(const void* obj);
 
     static uint32_t first_id(const void* obj);
 
     static bool contains(const void* obj, uint32_t id);
 
+    static bool contains_atleast_one(const void* obj, const uint32_t* target_ids, size_t target_ids_size);
+
     static void merge(const std::vector& posting_lists, std::vector& result_ids);
 
     static void intersect(const std::vector& posting_lists, std::vector& result_ids);
diff --git a/include/posting_list.h b/include/posting_list.h
index eb78ec6b..0f389472 100644
--- a/include/posting_list.h
+++ b/include/posting_list.h
@@ -125,6 +125,8 @@ public:
 
     bool contains(uint32_t id);
 
+    bool contains_atleast_one(const uint32_t* target_ids, size_t target_ids_size);
+
     iterator_t new_iterator();
 
     static void merge(const std::vector& posting_lists, std::vector& result_ids);
diff --git a/src/posting.cpp b/src/posting.cpp
index 2f8c7849..293b9b53 100644
--- a/src/posting.cpp
+++ b/src/posting.cpp
@@ -213,6 +213,36 @@ bool compact_posting_list_t::contains(uint32_t id) {
     return false;
 }
 
+// Reports whether this compact list stores at least one of the `target_ids_size` IDs in `target_ids`.
+// Both cursors only ever move forward, so the scan assumes `target_ids` and the stored IDs are
+// in ascending order -- NOTE(review): confirm with callers.
+bool compact_posting_list_t::contains_atleast_one(const uint32_t* target_ids, size_t target_ids_size) {
+    size_t i = 0;
+    size_t target_ids_index = 0;
+
+    while(i < length && target_ids_index < target_ids_size) {
+        // the entry starting at i is read as [num_offsets, offsets..., id]
+        size_t num_existing_offsets = id_offsets[i];
+        size_t existing_id = id_offsets[i + num_existing_offsets + 1];
+
+        if(existing_id == target_ids[target_ids_index]) {
+            return true;
+        }
+
+        if(target_ids[target_ids_index] < existing_id) {
+            // skip the targets that are smaller than the current stored ID
+            while(target_ids_index < target_ids_size && target_ids[target_ids_index] < existing_id) {
+                target_ids_index++;
+            }
+        } else {
+            // stored ID is the smaller one: step over its count, offsets and ID slots
+            i += num_existing_offsets + 2;
+        }
+    }
+
+    return false;
+}
+
 /* posting operations */
 
 void posting_t::upsert(void*& obj, uint32_t id, const std::vector& offsets) {
@@ -324,6 +354,18 @@ bool posting_t::contains(const void* obj, uint32_t id) {
     }
 }
 
+// Forwards to the compact or the full posting list implementation, as selected by
+// IS_COMPACT_POSTING(obj), and returns whether that list holds any of the given target IDs.
+bool posting_t::contains_atleast_one(const void* obj, const uint32_t* target_ids, size_t target_ids_size) {
+    if(IS_COMPACT_POSTING(obj)) {
+        compact_posting_list_t* list = COMPACT_POSTING_PTR(obj);
+        return list->contains_atleast_one(target_ids, target_ids_size);
+    } else {
+        posting_list_t* list = (posting_list_t*) RAW_POSTING_PTR(obj);
+        return list->contains_atleast_one(target_ids, target_ids_size);
+    }
+}
+
 void posting_t::merge(const std::vector& raw_posting_lists, std::vector& result_ids) {
     // we will have to convert the compact posting list (if any) to full form
     std::vector plists;
@@ -384,3 +426,17 @@ bool posting_t::block_intersect(const std::vector& raw_posting_lists, siz
 
     return done;
 }
+
+// Frees the posting list referenced by `obj` -- free() for a compact list, delete for a full
+// posting_list_t -- and then resets `obj` to nullptr.
+void posting_t::destroy_list(void*& obj) {
+    if(IS_COMPACT_POSTING(obj)) {
+        compact_posting_list_t* list = COMPACT_POSTING_PTR(obj);
+        free(list); // assigned via malloc, so must be free()d
+    } else {
+        posting_list_t* list = (posting_list_t*) RAW_POSTING_PTR(obj);
+        delete list;
+    }
+
+    obj = nullptr;
+}
diff --git a/src/posting_list.cpp b/src/posting_list.cpp
index 394265dc..bc63c6ed 100644
--- a/src/posting_list.cpp
+++ b/src/posting_list.cpp
@@ -886,6 +886,34 @@ bool posting_list_t::contains(uint32_t id) {
     return potential_block->contains(id);
 }
 
+// Returns true if at least one of the `target_ids_size` IDs in `target_ids` is present in this list.
+// A fresh iterator over the list and an index into `target_ids` are both advanced forward only,
+// so `target_ids` is assumed to be in ascending order -- NOTE(review): confirm with callers.
+bool posting_list_t::contains_atleast_one(const uint32_t* target_ids, size_t target_ids_size) {
+    posting_list_t::iterator_t it = new_iterator();
+    size_t target_ids_index = 0;
+
+    while(target_ids_index < target_ids_size && it.valid()) {
+        uint32_t id = it.id();
+
+        if(id == target_ids[target_ids_index]) {
+            return true;
+        } else {
+            // advance smallest value
+            if(id > target_ids[target_ids_index]) {
+                while(target_ids_index < target_ids_size && target_ids[target_ids_index] < id) {
+                    target_ids_index++;
+                }
+            } else {
+                // iterator is behind the pending target: let skip_to() move it forward
+                it.skip_to(target_ids[target_ids_index]);
+            }
+        }
+    }
+
+    return false;
+}
+
 /* iterator_t operations */
 
 posting_list_t::iterator_t::iterator_t(posting_list_t::block_t* root):
@@ -956,5 +984,6 @@ posting_list_t::iterator_t::iterator_t(iterator_t&& rhs) noexcept {
     ids = rhs.ids;
 
     rhs.curr_block = nullptr;
+    rhs.uncompressed_block = nullptr;
     rhs.ids = nullptr;
 }
diff --git a/test/posting_list_test.cpp b/test/posting_list_test.cpp
index 0987fc30..71a66670 100644
--- a/test/posting_list_test.cpp
+++ b/test/posting_list_test.cpp
@@ -738,6 +738,42 @@ TEST(PostingListTest, IntersectionSkipBlocks) {
     delete [] final_results;
 }
 
+TEST(PostingListTest, PostingListContainsAtleastOne) {
+    // when posting list is larger than target IDs
+    posting_list_t p1(100);
+
+    for(size_t i = 20; i < 1000; i++) {
+        p1.upsert(i, {1, 2, 3});
+    }
+
+    std::vector<uint32_t> target_ids1 = {200, 300};
+    std::vector<uint32_t> target_ids2 = {200, 3000};
+    std::vector<uint32_t> target_ids3 = {2000, 3000};
+
+    ASSERT_TRUE(p1.contains_atleast_one(&target_ids1[0], target_ids1.size()));
+    ASSERT_TRUE(p1.contains_atleast_one(&target_ids2[0], target_ids2.size()));
+    ASSERT_FALSE(p1.contains_atleast_one(&target_ids3[0], target_ids3.size()));
+
+    // when posting list is smaller than target IDs
+    posting_list_t p2(2);
+    for(size_t i = 10; i < 20; i++) {
+        p2.upsert(i, {1, 2, 3});
+    }
+
+    target_ids1.clear();
+    for(size_t i = 5; i < 1000; i++) {
+        target_ids1.push_back(i);
+    }
+
+    target_ids2.clear();
+    for(size_t i = 25; i < 1000; i++) {
+        target_ids2.push_back(i);
+    }
+
+    ASSERT_TRUE(p2.contains_atleast_one(&target_ids1[0], target_ids1.size()));
+    ASSERT_FALSE(p2.contains_atleast_one(&target_ids2[0], target_ids2.size()));
+}
+
 TEST(PostingListTest, CompactPostingListUpsertAppends) {
     uint32_t ids[] = {0, 1000, 1002};
     uint32_t offset_index[] = {0, 3, 6};
@@ -978,6 +1014,35 @@ TEST(PostingListTest, CompactPostingListErase) {
     free(list);
 }
 
+TEST(PostingListTest, CompactPostingListContainsAtleastOne) {
+    uint32_t ids[] = {5, 6, 7, 8};
+    uint32_t offset_index[] = {0, 3, 6, 9};
+    uint32_t offsets[] = {0, 3, 4, 0, 3, 4, 0, 3, 4, 0, 3, 4};
+
+    std::vector<uint32_t> target_ids1 = {4, 7, 11};
+    std::vector<uint32_t> target_ids2 = {2, 3, 4, 20};
+
+    compact_posting_list_t* list1 = compact_posting_list_t::create(4, ids, offset_index, 12, offsets);
+    ASSERT_TRUE(list1->contains_atleast_one(&target_ids1[0], target_ids1.size()));
+    ASSERT_FALSE(list1->contains_atleast_one(&target_ids2[0], target_ids2.size()));
+    free(list1);
+
+    // calloc() rather than malloc(): nothing else initialises this struct before upsert() and
+    // contains_atleast_one() use it, so its fields must start zeroed instead of indeterminate
+    compact_posting_list_t* list2 = static_cast<compact_posting_list_t*>(calloc(1, sizeof(compact_posting_list_t)));
+    void* obj = SET_COMPACT_POSTING(list2);
+    posting_t::upsert(obj, 3, {1, 5});
+
+    std::vector<uint32_t> target_ids3 = {1, 2, 3, 4, 100};
+    std::vector<uint32_t> target_ids4 = {4, 5, 6, 100};
+
+    ASSERT_TRUE(COMPACT_POSTING_PTR(obj)->contains_atleast_one(&target_ids3[0], target_ids3.size()));
+    ASSERT_FALSE(COMPACT_POSTING_PTR(obj)->contains_atleast_one(&target_ids4[0], target_ids4.size()));
+
+    // upsert() receives obj by reference, so release through obj rather than through list2
+    posting_t::destroy_list(obj);
+}
+
 TEST(PostingListTest, DISABLED_Benchmark) {
     std::vector offsets = {0, 1, 3};
     posting_list_t pl(4096);