diff --git a/include/posting.h b/include/posting.h index 3596c963..1b5392d9 100644 --- a/include/posting.h +++ b/include/posting.h @@ -23,6 +23,8 @@ struct compact_posting_list_t { posting_list_t* to_full_posting_list(); + bool contains(uint32_t id); + int64_t upsert(uint32_t id, const std::vector& offsets); int64_t upsert(uint32_t id, const uint32_t* offsets, uint32_t num_offsets); @@ -51,7 +53,16 @@ public: static uint32_t first_id(const void* obj); + static bool contains(const void* obj, uint32_t id); + static void merge(const std::vector& posting_lists, std::vector& result_ids); static void intersect(const std::vector& posting_lists, std::vector& result_ids); + + static bool block_intersect( + const std::vector& posting_lists, + size_t batch_size, + std::vector& its, + posting_list_t::result_iter_state_t& iter_state + ); }; \ No newline at end of file diff --git a/include/posting_list.h b/include/posting_list.h index 9ba6ff7d..eb78ec6b 100644 --- a/include/posting_list.h +++ b/include/posting_list.h @@ -29,7 +29,7 @@ public: // link to next block block_t* next = nullptr; - void insert_and_shift_offset_index(uint32_t index, uint32_t num_offsets); + bool contains(uint32_t id); void remove_and_shift_offset_index(const uint32_t* indices_sorted, uint32_t num_indices); @@ -123,6 +123,8 @@ public: block_t* block_of(last_id_t id); + bool contains(uint32_t id); + iterator_t new_iterator(); static void merge(const std::vector& posting_lists, std::vector& result_ids); diff --git a/src/posting.cpp b/src/posting.cpp index 7b04441c..2f8c7849 100644 --- a/src/posting.cpp +++ b/src/posting.cpp @@ -192,6 +192,27 @@ uint32_t compact_posting_list_t::first_id() { return id_offsets[id_offsets[0] + 1]; } +/* Membership test for the compact list. Walks id_offsets entry by entry, reading each entry as [offset count, offsets..., id] and advancing by count + 2. Returns true on an exact match; returns false at the first stored id greater than `id` or once `length` is reached. NOTE(review): the early exit assumes ids are stored in ascending order - confirm. */ bool compact_posting_list_t::contains(uint32_t id) { + size_t i = 0; + while(i < length) { + size_t num_existing_offsets = id_offsets[i]; + size_t existing_id = id_offsets[i + num_existing_offsets + 1]; + + if(existing_id > id) { + // not found! 
+ return false; + } + + if(existing_id == id) { + return true; + } + + i += num_existing_offsets + 2; + } + + return false; +} + /* posting operations */ void posting_t::upsert(void*& obj, uint32_t id, const std::vector& offsets) { @@ -293,6 +314,16 @@ uint32_t posting_t::first_id(const void* obj) { } } +/* Membership test on a raw posting pointer: forwards to compact_posting_list_t::contains when IS_COMPACT_POSTING(obj) holds, otherwise to posting_list_t::contains. */ bool posting_t::contains(const void* obj, uint32_t id) { + if(IS_COMPACT_POSTING(obj)) { + compact_posting_list_t* list = COMPACT_POSTING_PTR(obj); + return list->contains(id); + } else { + posting_list_t* list = (posting_list_t*) RAW_POSTING_PTR(obj); + return list->contains(id); + } +} + void posting_t::merge(const std::vector& raw_posting_lists, std::vector& result_ids) { // we will have to convert the compact posting list (if any) to full form std::vector plists; @@ -334,3 +365,22 @@ void posting_t::to_expanded_plists(const std::vector& raw_posting_lists, } } } + +/* Expands the raw posting lists through to_expanded_plists(), forwards them together with batch_size, its and iter_state to posting_list_t::block_intersect(), and returns that result unchanged. */ bool posting_t::block_intersect(const std::vector& raw_posting_lists, size_t batch_size, + std::vector& its, + posting_list_t::result_iter_state_t& iter_state) { + // we will have to convert the compact posting list (if any) to full form + std::vector plists; + std::vector expanded_plist_indices; + to_expanded_plists(raw_posting_lists, plists, expanded_plist_indices); + + bool done = posting_list_t::block_intersect(plists, batch_size, its, iter_state); /* NOTE(review): the lists recorded in expanded_plist_indices are deleted only when `done` is true. On a false return nothing in this function frees them, and the next call runs to_expanded_plists() again. This looks like a leak per unfinished batch, and `its` may still refer to the earlier expansion, unless ownership is handled elsewhere - confirm against posting_list_t::block_intersect and the callers. */ + + if(done) { + for(uint32_t expanded_plist_index: expanded_plist_indices) { + delete plists[expanded_plist_index]; + } + } + + return done; +} diff --git a/src/posting_list.cpp b/src/posting_list.cpp index 3755867b..394265dc 100644 --- a/src/posting_list.cpp +++ b/src/posting_list.cpp @@ -4,26 +4,6 @@ /* block_t operations */ -void posting_list_t::block_t::insert_and_shift_offset_index(const uint32_t index, const uint32_t num_offsets) { - uint32_t existing_offset_index = offset_index.at(index); - uint32_t length = offset_index.getLength(); - uint32_t new_length = length + 1; - uint32_t* curr_array = offset_index.uncompress(new_length); - - 
memmove(&curr_array[index+1], &curr_array[index], sizeof(uint32_t)*(length - index)); - curr_array[index] = existing_offset_index; - - uint32_t curr_index = index + 1; - while(curr_index < new_length) { - curr_array[curr_index] += num_offsets; - curr_index++; - } - - offset_index.load(curr_array, new_length); - - delete [] curr_array; -} - uint32_t posting_list_t::block_t::upsert(const uint32_t id, const std::vector& positions) { if(id <= ids.last()) { // we have to check if `id` already exists, for an opportunity to do in-place updates @@ -198,6 +178,10 @@ void posting_list_t::block_t::remove_and_shift_offset_index(const uint32_t* indi delete[] new_array; } +/* Membership test for a single block; delegates to ids.contains(id). */ bool posting_list_t::block_t::contains(uint32_t id) { + return ids.contains(id); +} + /* posting_list_t operations */ posting_list_t::posting_list_t(uint16_t max_block_elements): BLOCK_MAX_ELEMENTS(max_block_elements) { @@ -891,6 +875,17 @@ size_t posting_list_t::num_ids() { return ids_length; } +/* Membership test for the full list. Picks a candidate block with id_block_map.lower_bound(id) and asks that block; returns false when lower_bound yields end(). NOTE(review): presumably id_block_map is keyed by the last id of each block, which is what makes one candidate block sufficient - confirm. */ bool posting_list_t::contains(uint32_t id) { + const auto it = id_block_map.lower_bound(id); + + if(it == id_block_map.end()) { + return false; + } + + block_t* potential_block = it->second; + return potential_block->contains(id); +} + /* iterator_t operations */ posting_list_t::iterator_t::iterator_t(posting_list_t::block_t* root): diff --git a/test/posting_list_test.cpp b/test/posting_list_test.cpp index 4f0fca8f..0987fc30 100644 --- a/test/posting_list_test.cpp +++ b/test/posting_list_test.cpp @@ -281,6 +281,7 @@ TEST(PostingListTest, RemovalsOnFirstBlock) { // try to erase when posting list is empty pl.erase(0); + ASSERT_FALSE(pl.contains(0)); ASSERT_EQ(0, pl.num_ids()); ASSERT_EQ(0, pl.num_blocks()); @@ -305,6 +306,11 @@ TEST(PostingListTest, RemovalsOnFirstBlock) { ASSERT_EQ(2, pl.num_blocks()); ASSERT_EQ(6, pl.num_ids()); + ASSERT_TRUE(pl.contains(2)); + ASSERT_TRUE(pl.contains(5)); + ASSERT_FALSE(pl.contains(6)); + ASSERT_FALSE(pl.contains(1000)); + // delete non-existing element pl.erase(1000); ASSERT_EQ(6, 
pl.num_ids()); @@ -743,6 +749,12 @@ TEST(PostingListTest, CompactPostingListUpsertAppends) { ASSERT_EQ(1002, list->last_id()); ASSERT_EQ(3, list->num_ids()); + ASSERT_TRUE(list->contains(0)); + ASSERT_TRUE(list->contains(1000)); + ASSERT_TRUE(list->contains(1002)); + ASSERT_FALSE(list->contains(500)); + ASSERT_FALSE(list->contains(2)); + // no-op since the container expects resizing to be done outside list->upsert(1003, {1, 2}); ASSERT_EQ(15, list->length);