diff --git a/include/posting.h b/include/posting.h
new file mode 100644
index 00000000..3a1d2076
--- /dev/null
+++ b/include/posting.h
@@ -0,0 +1,60 @@
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+#include <vector>
+#include "posting_list.h"
+
+// A posting is referenced through a `void*` whose lowest bit tags the representation it points to:
+// bit set => `compact_posting_list_t`, bit clear => `posting_list_t`.
+// The macro argument is parenthesized so that any pointer expression can be passed safely.
+#define IS_COMPACT_POSTING(x) (((uintptr_t)(x)) & 1)
+#define SET_COMPACT_POSTING(x) ((void*)(((uintptr_t)(x)) | 1))
+#define RAW_POSTING_PTR(x) ((void*)(((uintptr_t)(x)) & ~uintptr_t(1)))
+#define COMPACT_POSTING_PTR(x) ((compact_posting_list_t*)(((uintptr_t)(x)) & ~uintptr_t(1)))
+
+// Posting list stored in a single `malloc`-ed allocation: header fields followed by `id_offsets`.
+// Release it with `free`.
+struct compact_posting_list_t {
+    // use uint16_t to get 4 byte alignment for `id_offsets`
+    uint16_t length = 0;    // number of `id_offsets` elements in use
+    uint16_t capacity = 0;  // number of `id_offsets` elements allocated
+
+    // format: num_offsets, offset1,..,offsetn, id1 | num_offsets, offset1,..,offsetn, id2
+    // entries are kept in ascending ID order
+    uint32_t id_offsets[];
+
+    // Builds a list holding `num_ids` IDs. `offset_index[i]` is the index within `offsets` at which the
+    // offsets of `ids[i]` start. `num_offsets + 2 * num_ids` must fit into uint16_t.
+    static compact_posting_list_t* create(uint32_t num_ids, uint32_t* ids, const uint32_t* offset_index,
+                                          uint32_t num_offsets, uint32_t* offsets);
+
+    // Copies all entries into a newly allocated `posting_list_t` which the caller must `delete`.
+    posting_list_t* to_full_posting_list();
+
+    // Inserts `id` or replaces its offsets. Returns 0 on success. When the capacity is insufficient, the list
+    // is left untouched and the number of additional elements required is returned.
+    int64_t upsert(uint32_t id, const std::vector<uint32_t>& offsets);
+    int64_t upsert(uint32_t id, const uint32_t* offsets, uint32_t num_offsets);
+
+    // Removes `id` and its offsets; does nothing when `id` is not present.
+    void erase(uint32_t id);
+
+    // Largest ID stored, or UINT32_MAX when the list is empty.
+    uint32_t last_id();
+};
+
+// Operations on a tagged posting pointer which switch between the two representations as the list
+// grows or shrinks.
+class posting_t {
+private:
+    // a compact list longer than this many elements is converted into a full posting list
+    static constexpr size_t COMPACT_LIST_THRESHOLD_LENGTH = 64;
+
+public:
+
+    static void upsert(void*& obj, uint32_t id, const std::vector<uint32_t>& offsets);
+
+    static void erase(void*& obj, uint32_t id);
+
+};
\ No newline at end of file
diff --git a/include/posting_list.h b/include/posting_list.h
index b7c321d1..be2e7eb9 100644
--- a/include/posting_list.h
+++ b/include/posting_list.h
@@ -57,9 +57,6 @@ public:
 
 private:
 
-    // when a block reaches pre-allocated storage, it is expanded by this factor
-    static constexpr float BLOCK_GROWTH_FACTOR = 1.3;
-
     // maximum number of IDs (and associated offsets) to store in each block before another block is created
     const uint16_t BLOCK_MAX_ELEMENTS;
 
diff --git a/src/posting.cpp b/src/posting.cpp
new file mode 100644
index 00000000..50cbd10c
--- /dev/null
+++ b/src/posting.cpp
@@ -0,0 +1,257 @@
+#include "posting.h"
+#include "posting_list.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <vector>
+
+namespace {
+    // when a compact list runs out of pre-allocated storage, it is expanded by this factor
+    constexpr double COMPACT_LIST_GROWTH_FACTOR = 1.3;
+
+    // bytes needed for a compact list that can hold `capacity` elements, *including* the struct header
+    size_t compact_list_bytes(const size_t capacity) {
+        return sizeof(compact_posting_list_t) + (capacity * sizeof(uint32_t));
+    }
+}
+
+int64_t compact_posting_list_t::upsert(const uint32_t id, const std::vector<uint32_t>& offsets) {
+    // `data()` is well defined for an empty vector, unlike `&offsets[0]`
+    return upsert(id, offsets.data(), offsets.size());
+}
+
+int64_t compact_posting_list_t::upsert(const uint32_t id, const uint32_t* offsets, uint32_t num_offsets) {
+    // format: num_offsets, offset1,..,offsetn, id1 | num_offsets, offset1,..,offsetn, id2
+
+    // `pos` is where the entry will be written: the end of the list when appending, otherwise the start
+    // of the first entry whose ID is not smaller than `id`
+    size_t pos = length;
+    size_t old_entry_size = 0;  // stays 0 unless an entry for `id` already exists
+
+    const bool can_append = (length == 0 || id > id_offsets[length - 1]);
+
+    if(!can_append) {
+        size_t i = 0;
+        while(i < length) {
+            const size_t num_existing_offsets = id_offsets[i];
+            const uint32_t existing_id = id_offsets[i + num_existing_offsets + 1];
+
+            if(existing_id >= id) {
+                pos = i;
+                if(existing_id == id) {
+                    old_entry_size = num_existing_offsets + 2;
+                }
+                break;
+            }
+
+            i += num_existing_offsets + 2;
+        }
+    }
+
+    const size_t new_entry_size = size_t(num_offsets) + 2;
+    const int64_t new_storage_needed = int64_t(new_entry_size) - int64_t(old_entry_size);
+
+    if(int64_t(length) + new_storage_needed > int64_t(capacity)) {
+        // enough storage should have been provided upstream
+        return (int64_t(length) + new_storage_needed) - int64_t(capacity);
+    }
+
+    // move everything behind the (possibly absent) old entry so that exactly `new_entry_size` slots are
+    // free at `pos`; `memmove` copes with the overlap for growing as well as shrinking entries
+    const size_t tail_start = pos + old_entry_size;
+    const size_t tail_length = length - tail_start;
+    if(tail_length != 0 && new_storage_needed != 0) {
+        std::memmove(id_offsets + pos + new_entry_size, id_offsets + tail_start, tail_length * sizeof(uint32_t));
+    }
+
+    id_offsets[pos] = num_offsets;
+    if(num_offsets != 0) {
+        std::memcpy(id_offsets + pos + 1, offsets, num_offsets * sizeof(uint32_t));
+    }
+    id_offsets[pos + 1 + num_offsets] = id;
+
+    length = static_cast<uint16_t>(int64_t(length) + new_storage_needed);  // can shrink as well
+
+    return 0;
+}
+
+void compact_posting_list_t::erase(const uint32_t id) {
+    // locate position and shift contents to collapse space vacated
+    size_t i = 0;
+    while(i < length) {
+        const size_t num_existing_offsets = id_offsets[i];
+        const size_t entry_size = num_existing_offsets + 2;
+        const uint32_t existing_id = id_offsets[i + num_existing_offsets + 1];
+
+        if(existing_id > id) {
+            // not found: entries are sorted by ID
+            return ;
+        }
+
+        if(existing_id == id) {
+            const size_t tail_start = i + entry_size;
+            std::memmove(id_offsets + i, id_offsets + tail_start, (length - tail_start) * sizeof(uint32_t));
+            length -= entry_size;
+            return ;
+        }
+
+        i += entry_size;
+    }
+}
+
+compact_posting_list_t* compact_posting_list_t::create(uint32_t num_ids, uint32_t* ids, const uint32_t* offset_index,
+                                                       uint32_t num_offsets, uint32_t* offsets) {
+    // format: num_offsets, offset1,..,offsetn, id1 | num_offsets, offset1,..,offsetn, id2
+
+    // besides its offsets, every ID needs 2 slots: one for the offset count and one for the ID itself
+    const size_t length_required = size_t(num_offsets) + (2 * size_t(num_ids));
+    compact_posting_list_t* pl = (compact_posting_list_t*) malloc(compact_list_bytes(length_required));
+    if(pl == nullptr) {
+        abort();
+    }
+
+    pl->length = 0;
+    pl->capacity = length_required;
+
+    for(size_t i = 0; i < num_ids; i++) {
+        const uint32_t start_offset = offset_index[i];
+        const uint32_t next_start_offset = (i == num_ids-1) ? num_offsets : offset_index[i+1];
+        pl->upsert(ids[i], offsets+start_offset, (next_start_offset - start_offset));
+    }
+
+    return pl;
+}
+
+posting_list_t* compact_posting_list_t::to_full_posting_list() {
+    posting_list_t* pl = new posting_list_t(1024);
+
+    size_t i = 0;
+    while(i < length) {
+        const size_t num_existing_offsets = id_offsets[i];
+        i++;
+
+        // the offsets of an entry are stored right after its offset count
+        const std::vector<uint32_t> offsets(id_offsets + i, id_offsets + i + num_existing_offsets);
+
+        const uint32_t existing_id = id_offsets[i + num_existing_offsets];
+        pl->upsert(existing_id, offsets);
+        i += num_existing_offsets + 1;
+    }
+
+    return pl;
+}
+
+uint32_t compact_posting_list_t::last_id() {
+    // an entry ends with its ID, so the last element of a non-empty list is the largest ID
+    return (length == 0) ? UINT32_MAX : id_offsets[length - 1];
+}
+
+/* posting operations */
+
+void posting_t::upsert(void*& obj, uint32_t id, const std::vector<uint32_t>& offsets) {
+    if(IS_COMPACT_POSTING(obj)) {
+        compact_posting_list_t* list = (compact_posting_list_t*) RAW_POSTING_PTR(obj);
+        int64_t extra_capacity_required = list->upsert(id, offsets);
+
+        if(extra_capacity_required != 0) {
+            const size_t required_capacity = size_t(list->capacity) + size_t(extra_capacity_required);
+
+            if(required_capacity > UINT16_MAX) {
+                // `capacity` is a 16-bit field, so such a list can only be stored as a full posting list
+                posting_list_t* full_list = list->to_full_posting_list();
+                free(list);
+                full_list->upsert(id, offsets);
+                obj = full_list;
+                return;
+            }
+
+            // grow the container by 30%, bounded by what the 16-bit `capacity` field can hold
+            const size_t grown_capacity = static_cast<size_t>(required_capacity * COMPACT_LIST_GROWTH_FACTOR);
+            const size_t new_capacity = std::min<size_t>(grown_capacity, UINT16_MAX);
+
+            // the allocation has to cover the struct header in addition to the elements
+            auto new_list = (compact_posting_list_t *) realloc(list, compact_list_bytes(new_capacity));
+            if(new_list == nullptr) {
+                abort();
+            }
+
+            list = new_list;
+            list->capacity = new_capacity;
+            obj = SET_COMPACT_POSTING(list);
+
+            list->upsert(id, offsets);
+        }
+
+        if(list->length > COMPACT_LIST_THRESHOLD_LENGTH) {
+            // we will store anything over this threshold as a full posting list
+            posting_list_t* full_list = list->to_full_posting_list();
+            free(list);
+            obj = full_list;
+            return;
+        }
+
+    } else {
+        posting_list_t* list = (posting_list_t*) RAW_POSTING_PTR(obj);
+        list->upsert(id, offsets);
+    }
+}
+
+void posting_t::erase(void*& obj, uint32_t id) {
+    if(IS_COMPACT_POSTING(obj)) {
+        compact_posting_list_t* list = (compact_posting_list_t*) RAW_POSTING_PTR(obj);
+        list->erase(id);
+
+        // if the list becomes too small, we resize it to save memory
+        if(list->length < list->capacity/2) {
+            // resize container
+            const size_t new_capacity = list->capacity/2;
+
+            // the allocation has to cover the struct header in addition to the elements
+            auto new_list = (compact_posting_list_t *) realloc(list, compact_list_bytes(new_capacity));
+            if(new_list == nullptr) {
+                abort();
+            }
+
+            list = new_list;
+            list->capacity = new_capacity;
+            obj = SET_COMPACT_POSTING(list);
+        }
+
+    } else {
+        posting_list_t* list = (posting_list_t*) RAW_POSTING_PTR(obj);
+        list->erase(id);
+
+        if(list->size() == 1 && list->get_root()->size() <= 10) {
+            auto root_block = list->get_root();
+            const size_t num_ids = root_block->size();
+            const size_t num_offsets = root_block->offsets.getLength();
+
+            if(num_offsets + (2 * num_ids) > UINT16_MAX) {
+                // does not fit into the 16-bit length/capacity fields of the compact format
+                return;
+            }
+
+            // convert to compact posting format
+            auto ids = root_block->ids.uncompress();
+            auto offset_index = root_block->offset_index.uncompress();
+            auto offsets = root_block->offsets.uncompress();
+
+            compact_posting_list_t* compact_list = compact_posting_list_t::create(
+                num_ids, ids, offset_index, num_offsets, offsets
+            );
+
+            delete [] ids;
+            delete [] offset_index;
+            delete [] offsets;
+
+            // full posting lists are created with `new` (see `to_full_posting_list`), not `malloc`
+            delete list;
+
+            // set the tag bit, otherwise the next operation would treat this as a `posting_list_t`
+            obj = SET_COMPACT_POSTING(compact_list);
+        }
+    }
+}
+
diff --git a/test/posting_list_test.cpp b/test/posting_list_test.cpp
index 00e20b42..08f2657c 100644
--- a/test/posting_list_test.cpp
+++ b/test/posting_list_test.cpp
@@ -1,3 +1,3 @@
 #include <gtest/gtest.h>
-#include "posting_list.h"
+#include "posting.h"
 #include "array_utils.h"
@@ -433,6 +433,249 @@ TEST(PostingListTest, IntersectionSkipBlocks) {
     delete [] final_results;
 }
 
+TEST(PostingListTest, CompactPostingListUpsertAppends) {
+    uint32_t ids[] = {0, 1000, 1002};
+    uint32_t offset_index[] = {0, 3, 6};
+    uint32_t offsets[] = {0, 3, 4, 0, 3, 4, 0, 3, 4};
+
+    compact_posting_list_t* list = compact_posting_list_t::create(3, ids, offset_index, 9, offsets);
+    ASSERT_EQ(15, list->length);
+    ASSERT_EQ(15, list->capacity);
+    ASSERT_EQ(1002, list->last_id());
+
+    // no-op since the container expects resizing to be done outside
+    list->upsert(1003, {1, 2});
+    ASSERT_EQ(15, list->length);
+    ASSERT_EQ(15, list->capacity);
+    ASSERT_EQ(1002, list->last_id());
+
+    // now resize
+    void* obj = SET_COMPACT_POSTING(list);
+    posting_t::upsert(obj, 1003, {1, 2});
+    ASSERT_EQ(1003, COMPACT_POSTING_PTR(obj)->last_id());
+
+    ASSERT_EQ(19, (COMPACT_POSTING_PTR(obj))->length);
+    ASSERT_EQ(24, (COMPACT_POSTING_PTR(obj))->capacity);
+
+    // insert enough docs to NOT exceed compact posting list threshold
+    posting_t::upsert(obj, 1004, {1, 2, 3, 4, 5, 6, 7, 8});
+    ASSERT_EQ(1004, COMPACT_POSTING_PTR(obj)->last_id());
+    posting_t::upsert(obj, 1005, {1, 2, 3, 4, 5, 6, 7, 8});
+    ASSERT_EQ(1005, COMPACT_POSTING_PTR(obj)->last_id());
+    posting_t::upsert(obj, 1006, {1, 2, 3, 4, 5, 6, 7, 8});
+    ASSERT_EQ(1006, COMPACT_POSTING_PTR(obj)->last_id());
+    posting_t::upsert(obj, 1007, {1, 2, 3, 4, 5, 6, 7, 8});
+    ASSERT_EQ(1007, COMPACT_POSTING_PTR(obj)->last_id());
+    ASSERT_TRUE(IS_COMPACT_POSTING(obj));
+    ASSERT_EQ(1007, COMPACT_POSTING_PTR(obj)->last_id());
+
+    // next upsert will exceed threshold
+    posting_t::upsert(obj, 1008, {1, 2, 3, 4, 5, 6, 7, 8});
+    ASSERT_FALSE(IS_COMPACT_POSTING(obj));
+
+    ASSERT_EQ(1, ((posting_list_t*)(obj))->size());
+    ASSERT_EQ(9, ((posting_list_t*)(obj))->get_root()->size());
+    ASSERT_EQ(1008, ((posting_list_t*)(obj))->get_root()->ids.last());
+
+    delete ((posting_list_t*)(obj));
+}
+
+TEST(PostingListTest, CompactPostingListUpserts) {
+    uint32_t ids[] = {3, 1000, 1002};
+    uint32_t offset_index[] = {0, 3, 6};
+    uint32_t offsets[] = {0, 3, 4, 0, 3, 4, 0, 3, 4};
+
+    compact_posting_list_t* list = compact_posting_list_t::create(3, ids, offset_index, 9, offsets);
+    ASSERT_EQ(15, list->length);
+    ASSERT_EQ(15, list->capacity);
+    ASSERT_EQ(1002, list->last_id());
+
+    // insert before first ID
+
+    void* obj = SET_COMPACT_POSTING(list);
+    posting_t::upsert(obj, 2, {1, 2});
+    ASSERT_EQ(1002, COMPACT_POSTING_PTR(obj)->last_id());
+    ASSERT_EQ(19, COMPACT_POSTING_PTR(obj)->length);
+    ASSERT_EQ(24, COMPACT_POSTING_PTR(obj)->capacity);
+
+    // insert in the middle
+    posting_t::upsert(obj, 999, {1, 2});
+    ASSERT_EQ(1002, COMPACT_POSTING_PTR(obj)->last_id());
+    ASSERT_EQ(23, COMPACT_POSTING_PTR(obj)->length);
+    ASSERT_EQ(24, COMPACT_POSTING_PTR(obj)->capacity);
+
+    uint32_t expected_id_offsets[] = {
+        2, 1, 2, 2,
+        3, 0, 3, 4, 3,
+        2, 1, 2, 999,
+        3, 0, 3, 4, 1000,
+        3, 0, 3, 4, 1002
+    };
+
+    ASSERT_EQ(23, COMPACT_POSTING_PTR(obj)->length);
+
+    for(size_t i = 0; i < COMPACT_POSTING_PTR(obj)->length; i++) {
+        ASSERT_EQ(expected_id_offsets[i], COMPACT_POSTING_PTR(obj)->id_offsets[i]);
+    }
+
+    free(COMPACT_POSTING_PTR(obj));
+}
+
+TEST(PostingListTest, CompactPostingListUpdateWithLessOffsets) {
+    uint32_t ids[] = {0, 1000, 1002};
+    uint32_t offset_index[] = {0, 3, 6};
+    uint32_t offsets[] = {0, 3, 4, 0, 3, 4, 0, 3, 4};
+
+    compact_posting_list_t* list = compact_posting_list_t::create(3, ids, offset_index, 9, offsets);
+    ASSERT_EQ(15, list->length);
+    ASSERT_EQ(15, list->capacity);
+    ASSERT_EQ(1002, list->last_id());
+
+    // update middle
+
+    list->upsert(1000, {1, 2});
+    ASSERT_EQ(14, list->length);
+    ASSERT_EQ(15, list->capacity);
+    ASSERT_EQ(1002, list->last_id());
+    uint32_t expected_id_offsets[] = {3, 0, 3, 4, 0, 2, 1, 2, 1000, 3, 0, 3, 4, 1002};
+    for(size_t i = 0; i < list->length; i++) {
+        ASSERT_EQ(expected_id_offsets[i], list->id_offsets[i]);
+    }
+
+    // update start
+    list->upsert(0, {2, 4});
+    ASSERT_EQ(13, list->length);
+    ASSERT_EQ(15, list->capacity);
+    ASSERT_EQ(1002, list->last_id());
+    uint32_t expected_id_offsets2[] = {2, 2, 4, 0, 2, 1, 2, 1000, 3, 0, 3, 4, 1002};
+    for(size_t i = 0; i < list->length; i++) {
+        ASSERT_EQ(expected_id_offsets2[i], list->id_offsets[i]);
+    }
+
+    // update end
+    list->upsert(1002, {2, 4});
+    ASSERT_EQ(12, list->length);
+    ASSERT_EQ(15, list->capacity);
+    ASSERT_EQ(1002, list->last_id());
+    uint32_t expected_id_offsets3[] = {2, 2, 4, 0, 2, 1, 2, 1000, 2, 2, 4, 1002};
+    for(size_t i = 0; i < list->length; i++) {
+        ASSERT_EQ(expected_id_offsets3[i], list->id_offsets[i]);
+    }
+
+    free(list);
+}
+
+TEST(PostingListTest, CompactPostingListUpdateWithMoreOffsets) {
+    uint32_t ids[] = {0, 1000, 1002};
+    uint32_t offset_index[] = {0, 3, 6};
+    uint32_t offsets[] = {0, 3, 4, 0, 3, 4, 0, 3, 4};
+
+    compact_posting_list_t* list = compact_posting_list_t::create(3, ids, offset_index, 9, offsets);
+    ASSERT_EQ(15, list->length);
+    ASSERT_EQ(15, list->capacity);
+    ASSERT_EQ(1002, list->last_id());
+
+    // update middle
+    void* obj = SET_COMPACT_POSTING(list);
+    posting_t::upsert(obj, 1000, {1, 2, 3, 4});
+    list = COMPACT_POSTING_PTR(obj);
+    ASSERT_EQ(16, list->length);
+    ASSERT_EQ(20, list->capacity);
+    ASSERT_EQ(1002, list->last_id());
+    uint32_t expected_id_offsets[] = {3, 0, 3, 4, 0, 4, 1, 2, 3, 4, 1000, 3, 0, 3, 4, 1002};
+    for(size_t i = 0; i < list->length; i++) {
+        ASSERT_EQ(expected_id_offsets[i], list->id_offsets[i]);
+    }
+
+    // update start
+    list->upsert(0, {1, 2, 3, 4});
+    ASSERT_EQ(17, list->length);
+    ASSERT_EQ(20, list->capacity);
+    ASSERT_EQ(1002, list->last_id());
+    uint32_t expected_id_offsets2[] = {4, 1, 2, 3, 4, 0, 4, 1, 2, 3, 4, 1000, 3, 0, 3, 4, 1002};
+    for(size_t i = 0; i < list->length; i++) {
+        ASSERT_EQ(expected_id_offsets2[i], list->id_offsets[i]);
+    }
+
+    // update end
+    list->upsert(1002, {1, 2, 3, 4});
+    ASSERT_EQ(18, list->length);
+    ASSERT_EQ(20, list->capacity);
+    ASSERT_EQ(1002, list->last_id());
+    uint32_t expected_id_offsets3[] = {4, 1, 2, 3, 4, 0, 4, 1, 2, 3, 4, 1000, 4, 1, 2, 3, 4, 1002};
+    for(size_t i = 0; i < list->length; i++) {
+        ASSERT_EQ(expected_id_offsets3[i], list->id_offsets[i]);
+    }
+
+    free(list);
+}
+
+TEST(PostingListTest, CompactPostingListErase) {
+    uint32_t ids[] = {0, 1000, 1002};
+    uint32_t offset_index[] = {0, 3, 6};
+    uint32_t offsets[] = {0, 3, 4, 0, 3, 4, 0, 3, 4};
+
+    compact_posting_list_t* list = compact_posting_list_t::create(3, ids, offset_index, 9, offsets);
+
+    list->erase(3); // erase non-existing ID
+
+    ASSERT_EQ(15, list->length);
+    ASSERT_EQ(15, list->capacity);
+    ASSERT_EQ(1002, list->last_id());
+
+    list->erase(1000);
+    ASSERT_EQ(10, list->length);
+    ASSERT_EQ(15, list->capacity);
+    ASSERT_EQ(1002, list->last_id());
+
+    // deleting using posting wrapper
+    void* obj = SET_COMPACT_POSTING(list);
+    posting_t::erase(obj, 1002);
+    ASSERT_TRUE(IS_COMPACT_POSTING(obj));
+    ASSERT_EQ(5, (COMPACT_POSTING_PTR(obj))->length);
+    ASSERT_EQ(7, (COMPACT_POSTING_PTR(obj))->capacity);
+    ASSERT_EQ(0, (COMPACT_POSTING_PTR(obj))->last_id());
+
+    // upsert again
+    posting_t::upsert(obj, 1002, {0, 3, 4});
+    list = COMPACT_POSTING_PTR(obj);
+    ASSERT_EQ(10, list->length);
+    ASSERT_EQ(13, list->capacity);
+    ASSERT_EQ(1002, list->last_id());
+
+    free(list);
+}
+
+TEST(PostingListTest, CompactPostingListUpdateWithFarFewerOffsets) {
+    uint32_t ids[] = {0, 1000};
+    uint32_t offset_index[] = {0, 5};
+    uint32_t offsets[] = {0, 1, 2, 3, 4, 7};
+
+    compact_posting_list_t* list = compact_posting_list_t::create(2, ids, offset_index, 6, offsets);
+    ASSERT_EQ(10, list->length);
+    ASSERT_EQ(10, list->capacity);
+
+    // shrinking an entry by more than its new size must still move the whole tail
+    list->upsert(0, {9});
+    ASSERT_EQ(6, list->length);
+    ASSERT_EQ(10, list->capacity);
+    ASSERT_EQ(1000, list->last_id());
+    uint32_t expected_id_offsets[] = {1, 9, 0, 1, 7, 1000};
+    for(size_t i = 0; i < list->length; i++) {
+        ASSERT_EQ(expected_id_offsets[i], list->id_offsets[i]);
+    }
+
+    // an entry without any offsets is valid too
+    list->upsert(1000, std::vector<uint32_t>());
+    ASSERT_EQ(5, list->length);
+    uint32_t expected_id_offsets2[] = {1, 9, 0, 0, 1000};
+    for(size_t i = 0; i < list->length; i++) {
+        ASSERT_EQ(expected_id_offsets2[i], list->id_offsets[i]);
+    }
+
+    free(list);
+}
+
 TEST(PostingListTest, DISABLED_Benchmark) {
     std::vector<uint32_t> offsets = {0, 1, 3};
     posting_list_t pl(4096);