mirror of https://github.com/typesense/typesense.git (synced 2025-05-19 21:22:25 +08:00)
In place updates for posting list.

parent d6d83ec3ed
commit 6e4ecd409c
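Context for the diff below: a posting-list block keeps three parallel compressed arrays — `ids`, `offset_index` and `offsets`. The following is a deliberately simplified, hypothetical model of that layout (plain `std::vector`s, a made-up `toy_block` type, no compression), meant only to illustrate what an in-place upsert has to do when an existing id is given a different number of offsets; it is not the Typesense implementation.

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

// Simplified model of a posting-list block: three parallel arrays, uncompressed.
struct toy_block {
    std::vector<uint32_t> ids;           // document ids, kept in ascending order
    std::vector<uint32_t> offset_index;  // where each id's offsets start inside `offsets`
    std::vector<uint32_t> offsets;       // token positions of all ids, concatenated

    void upsert(uint32_t id, const std::vector<uint32_t>& positions) {
        auto it = std::lower_bound(ids.begin(), ids.end(), id);
        if(it != ids.end() && *it == id) {
            // in-place update: replace this id's slice of `offsets`, then shift the
            // start positions of every later id by the change in slice length
            size_t idx = size_t(it - ids.begin());
            size_t start = offset_index[idx];
            size_t end = (idx + 1 < ids.size()) ? offset_index[idx + 1] : offsets.size();
            int64_t size_diff = int64_t(positions.size()) - int64_t(end - start);
            offsets.erase(offsets.begin() + start, offsets.begin() + end);
            offsets.insert(offsets.begin() + start, positions.begin(), positions.end());
            for(size_t i = idx + 1; i < offset_index.size(); i++) {
                offset_index[i] = uint32_t(int64_t(offset_index[i]) + size_diff);
            }
            return;
        }
        // append path; assumes `id` is larger than everything stored so far
        ids.push_back(id);
        offset_index.push_back(uint32_t(offsets.size()));
        offsets.insert(offsets.end(), positions.begin(), positions.end());
    }
};

int main() {
    toy_block b;
    b.upsert(2, {1, 2, 3});
    b.upsert(5, {1, 2, 3});
    b.upsert(7, {1, 2, 3});
    b.upsert(2, {5, 7});  // fewer offsets for id 2: ids 5 and 7 now start one slot earlier
    std::printf("offset_index: %u %u %u\n", b.offset_index[0], b.offset_index[1], b.offset_index[2]);
    // prints "offset_index: 0 2 5", matching the expectations in the new test further down
    return 0;
}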
@@ -42,4 +42,8 @@ public:
     uint32_t getSizeInBytes();
 
+    uint32_t getLength() const;
+
+    uint32_t getMin() const;
+
+    uint32_t getMax() const;
 };
@@ -14,3 +14,11 @@ uint32_t array_base::getSizeInBytes() {
 
+uint32_t array_base::getLength() const {
+    return length;
+}
+
+uint32_t array_base::getMin() const {
+    return min;
+}
+
+uint32_t array_base::getMax() const {
+    return max;
+}
 
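The new accessors exist because the upsert path in the next hunk needs the array's element count and current min/max before it decompresses, rewrites and reloads the offsets via `uncompress()` and `load(ptr, len, min, max)`. A minimal, hypothetical stand-in for that interface (no compression, only the calls that actually appear in this diff) might look like the following; the real `array_base` keeps `length`, `min` and `max` alongside a compressed buffer.

#include <cstdint>
#include <cstring>
#include <vector>

// Hypothetical uncompressed stand-in for the array interface used by the upsert code.
class toy_array {
public:
    uint32_t getLength() const { return length; }
    uint32_t getMin() const { return min; }
    uint32_t getMax() const { return max; }

    // Hand back a heap copy that the caller must delete[], like uncompress() does.
    uint32_t* uncompress() const {
        uint32_t* out = new uint32_t[length];
        if(length != 0) {
            std::memcpy(out, data.data(), sizeof(uint32_t) * length);
        }
        return out;
    }

    // Replace the contents and record the new bounds, like load(ptr, len, min, max).
    void load(const uint32_t* in, uint32_t len, uint32_t new_min, uint32_t new_max) {
        data.assign(in, in + len);
        length = len;
        min = new_min;
        max = new_max;
    }

    void append(uint32_t value) {
        if(length == 0 || value < min) min = value;
        if(length == 0 || value > max) max = value;
        data.push_back(value);
        length++;
    }

private:
    std::vector<uint32_t> data;
    uint32_t length = 0;
    uint32_t min = 0;
    uint32_t max = 0;
};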
@@ -25,19 +25,120 @@ void posting_list_t::block_t::insert_and_shift_offset_index(const uint32_t index
 }
 
 void posting_list_t::block_t::upsert(const uint32_t id, const std::vector<uint32_t>& positions) {
-    size_t inserted_index = ids.append(id);
-
-    if(inserted_index == ids.getLength()-1) {
-        // treat as appends
-        uint32_t curr_index = offsets.getLength();
-        offset_index.append(curr_index);
-        for(uint32_t position : positions) {
-            offsets.append(position);
-        }
-    } else {
-        uint32_t existing_offset_index = offset_index.at(inserted_index);
-        insert_and_shift_offset_index(inserted_index, positions.size());
-        offsets.insert(existing_offset_index, &positions[0], positions.size());
-    }
+    if(id <= ids.last()) {
+        // we have to check if `id` already exists, for an opportunity to do in-place updates
+        uint32_t id_index = ids.indexOf(id);
+
+        if(id_index != ids.getLength()) {
+            // id is already present, so we will only update offset index and offsets
+            uint32_t start_offset_index = offset_index.at(id_index);
+            uint32_t end_offset_index = (id == ids.last()) ? offsets.getLength()-1 : offset_index.at(id_index + 1)-1;
+            uint32_t num_offsets = (end_offset_index - start_offset_index) + 1;
+            uint32_t* curr_offsets = offsets.uncompress();
+            uint32_t m = offsets.getMin(), M = offsets.getMax();
+
+            if(num_offsets == positions.size()) {
+                // no need to touch the offset index and need to just do inplace updates of offsets
+                bool find_new_min_max = false;
+                for(size_t i = 0; i < positions.size(); i++) {
+                    if((curr_offsets[start_offset_index + i] == m || curr_offsets[start_offset_index + i] == M) &&
+                        curr_offsets[start_offset_index + i] != positions[i]) {
+                        // when an existing min/max is affected we will have to find the new min/max
+                        find_new_min_max = true;
+                    }
+
+                    if(positions[i] < m) {
+                        m = positions[i];
+                    }
+
+                    if(positions[i] > M) {
+                        M = positions[i];
+                    }
+
+                    curr_offsets[start_offset_index + i] = positions[i];
+                }
+
+                if(find_new_min_max) {
+                    for(size_t i = 0; i < offsets.getLength(); i++) {
+                        if(curr_offsets[i] < m) {
+                            m = curr_offsets[i];
+                        }
+
+                        if(curr_offsets[i] > M) {
+                            M = curr_offsets[i];
+                        }
+                    }
+                }
+
+                offsets.load(curr_offsets, offsets.getLength(), m, M);
+            } else {
+                // need to resize offsets array
+                int64_t size_diff = int64_t(positions.size()) - num_offsets; // size_diff can be negative
+                size_t new_offsets_length = offsets.getLength() + size_diff;
+                uint32_t* new_offsets = new uint32_t[new_offsets_length];
+                std::memmove(new_offsets, curr_offsets, sizeof(uint32_t) * start_offset_index);
+
+                bool find_new_min_max = false;
+                for(size_t i = 0; i < num_offsets; i++) {
+                    if(curr_offsets[start_offset_index + i] == m || curr_offsets[start_offset_index + i] == M) {
+                        // when an existing min/max is affected we will have to find the new min/max
+                        find_new_min_max = true;
+                    }
+                }
+
+                for(size_t i = 0; i < positions.size(); i++) {
+                    if(positions[i] < m) {
+                        m = positions[i];
+                    }
+
+                    if(positions[i] > M) {
+                        M = positions[i];
+                    }
+
+                    new_offsets[start_offset_index + i] = positions[i];
+                }
+
+                std::memmove(new_offsets + start_offset_index + positions.size(),
+                             curr_offsets + end_offset_index + 1,
+                             sizeof(uint32_t) * (offsets.getLength() - (end_offset_index + 1)));
+
+                if(find_new_min_max) {
+                    for(size_t i = 0; i < offsets.getLength(); i++) {
+                        if(curr_offsets[i] < m) {
+                            m = curr_offsets[i];
+                        }
+
+                        if(curr_offsets[i] > M) {
+                            M = curr_offsets[i];
+                        }
+                    }
+                }
+
+                offsets.load(new_offsets, new_offsets_length, m, M);
+                delete [] new_offsets;
+
+                // shift offset index
+                uint32_t* current_offset_index = offset_index.uncompress();
+                for(size_t i = id_index+1; i < ids.getLength(); i++) {
+                    current_offset_index[i] += size_diff;
+                }
+
+                offset_index.load(current_offset_index, offset_index.getLength());
+                delete [] current_offset_index;
+            }
+
+            delete [] curr_offsets;
+            return;
+        }
+    }
+
+    // treat as regular append (either id not found or exceeds max id)
+
+    ids.append(id);
+    uint32_t curr_index = offsets.getLength();
+    offset_index.append(curr_index);
+    for(uint32_t position : positions) {
+        offsets.append(position);
+    }
 }
 
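For reference, the slice that the in-place path rewrites is derived purely from `offset_index`: an id's offsets start at `offset_index[id_index]` and run up to the element just before the next id's start, or to the end of `offsets` when it is the last id. A small hypothetical helper with the same boundary arithmetic:

#include <cassert>
#include <cstdint>
#include <vector>

// Same boundary arithmetic as the upsert diff above: computes the inclusive
// [start, end] range inside `offsets` that belongs to the id at `id_index`.
// `offsets_length` is the total number of stored offsets.
static void offset_range(const std::vector<uint32_t>& offset_index, size_t offsets_length,
                         size_t id_index, uint32_t& start, uint32_t& end) {
    start = offset_index[id_index];
    bool is_last_id = (id_index + 1 == offset_index.size());
    end = is_last_id ? uint32_t(offsets_length - 1) : offset_index[id_index + 1] - 1;
}

int main() {
    // three ids with three offsets each, as in the InplaceUpserts test below
    std::vector<uint32_t> offset_index = {0, 3, 6};
    uint32_t start = 0, end = 0;

    offset_range(offset_index, 9, 0, start, end);
    assert(start == 0 && end == 2);   // num_offsets = (end - start) + 1 = 3

    offset_range(offset_index, 9, 2, start, end);
    assert(start == 6 && end == 8);   // last id: its slice runs to the end of `offsets`

    return 0;
}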
@@ -110,6 +110,155 @@ TEST(PostingListTest, Insert) {
     }
 }
 
+TEST(PostingListTest, InplaceUpserts) {
+    std::vector<uint32_t> offsets = {1, 2, 3};
+    posting_list_t pl(5);
+
+    pl.upsert(2, offsets);
+    pl.upsert(5, offsets);
+    pl.upsert(7, offsets);
+
+    ASSERT_EQ(1, pl.size());
+    ASSERT_EQ(3, pl.get_root()->ids.getLength());
+    ASSERT_EQ(9, pl.get_root()->offsets.getLength());
+
+    // update starting ID with same length of offsets
+    pl.upsert(2, {1, 2, 4});
+    ASSERT_EQ(1, pl.size());
+    ASSERT_EQ(3, pl.get_root()->ids.getLength());
+    ASSERT_EQ(9, pl.get_root()->offsets.getLength());
+
+    ASSERT_EQ(1, pl.get_root()->offsets.at(0));
+    ASSERT_EQ(2, pl.get_root()->offsets.at(1));
+    ASSERT_EQ(4, pl.get_root()->offsets.at(2));
+    ASSERT_EQ(4, pl.get_root()->offsets.getMax());
+    ASSERT_EQ(1, pl.get_root()->offsets.getMin());
+
+    ASSERT_EQ(0, pl.get_root()->offset_index.at(0));
+    ASSERT_EQ(3, pl.get_root()->offset_index.at(1));
+    ASSERT_EQ(6, pl.get_root()->offset_index.at(2));
+
+    // update starting ID with smaller number of offsets
+    pl.upsert(2, {5, 7});
+    ASSERT_EQ(1, pl.size());
+    ASSERT_EQ(3, pl.get_root()->ids.getLength());
+    ASSERT_EQ(8, pl.get_root()->offsets.getLength());
+
+    ASSERT_EQ(5, pl.get_root()->offsets.at(0));
+    ASSERT_EQ(7, pl.get_root()->offsets.at(1));
+    ASSERT_EQ(1, pl.get_root()->offsets.at(2));
+    ASSERT_EQ(7, pl.get_root()->offsets.getMax());
+    ASSERT_EQ(1, pl.get_root()->offsets.getMin());
+
+    ASSERT_EQ(0, pl.get_root()->offset_index.at(0));
+    ASSERT_EQ(2, pl.get_root()->offset_index.at(1));
+    ASSERT_EQ(5, pl.get_root()->offset_index.at(2));
+
+    // update starting ID with larger number of offsets
+    pl.upsert(2, {0, 2, 8});
+    ASSERT_EQ(1, pl.size());
+    ASSERT_EQ(3, pl.get_root()->ids.getLength());
+    ASSERT_EQ(9, pl.get_root()->offsets.getLength());
+
+    ASSERT_EQ(0, pl.get_root()->offsets.at(0));
+    ASSERT_EQ(2, pl.get_root()->offsets.at(1));
+    ASSERT_EQ(8, pl.get_root()->offsets.at(2));
+    ASSERT_EQ(1, pl.get_root()->offsets.at(3));
+    ASSERT_EQ(8, pl.get_root()->offsets.getMax());
+    ASSERT_EQ(0, pl.get_root()->offsets.getMin());
+
+    ASSERT_EQ(0, pl.get_root()->offset_index.at(0));
+    ASSERT_EQ(3, pl.get_root()->offset_index.at(1));
+    ASSERT_EQ(6, pl.get_root()->offset_index.at(2));
+
+    // update middle ID with smaller number of offsets
+    pl.upsert(5, {1, 10});
+    ASSERT_EQ(1, pl.size());
+    ASSERT_EQ(3, pl.get_root()->ids.getLength());
+    ASSERT_EQ(8, pl.get_root()->offsets.getLength());
+
+    ASSERT_EQ(0, pl.get_root()->offsets.at(0));
+    ASSERT_EQ(2, pl.get_root()->offsets.at(1));
+    ASSERT_EQ(8, pl.get_root()->offsets.at(2));
+    ASSERT_EQ(1, pl.get_root()->offsets.at(3));
+    ASSERT_EQ(10, pl.get_root()->offsets.at(4));
+
+    ASSERT_EQ(10, pl.get_root()->offsets.getMax());
+    ASSERT_EQ(0, pl.get_root()->offsets.getMin());
+
+    ASSERT_EQ(0, pl.get_root()->offset_index.at(0));
+    ASSERT_EQ(3, pl.get_root()->offset_index.at(1));
+    ASSERT_EQ(5, pl.get_root()->offset_index.at(2));
+
+    // update middle ID with larger number of offsets
+    pl.upsert(5, {2, 4, 12});
+    ASSERT_EQ(1, pl.size());
+    ASSERT_EQ(3, pl.get_root()->ids.getLength());
+    ASSERT_EQ(9, pl.get_root()->offsets.getLength());
+
+    ASSERT_EQ(0, pl.get_root()->offsets.at(0));
+    ASSERT_EQ(2, pl.get_root()->offsets.at(1));
+    ASSERT_EQ(8, pl.get_root()->offsets.at(2));
+    ASSERT_EQ(2, pl.get_root()->offsets.at(3));
+    ASSERT_EQ(4, pl.get_root()->offsets.at(4));
+    ASSERT_EQ(12, pl.get_root()->offsets.at(5));
+    ASSERT_EQ(1, pl.get_root()->offsets.at(6));
+    ASSERT_EQ(2, pl.get_root()->offsets.at(7));
+    ASSERT_EQ(3, pl.get_root()->offsets.at(8));
+
+    ASSERT_EQ(12, pl.get_root()->offsets.getMax());
+    ASSERT_EQ(0, pl.get_root()->offsets.getMin());
+
+    ASSERT_EQ(0, pl.get_root()->offset_index.at(0));
+    ASSERT_EQ(3, pl.get_root()->offset_index.at(1));
+    ASSERT_EQ(6, pl.get_root()->offset_index.at(2));
+
+    // update last ID with smaller number of offsets
+    pl.upsert(7, {3});
+    ASSERT_EQ(1, pl.size());
+    ASSERT_EQ(3, pl.get_root()->ids.getLength());
+    ASSERT_EQ(7, pl.get_root()->offsets.getLength());
+
+    ASSERT_EQ(0, pl.get_root()->offsets.at(0));
+    ASSERT_EQ(2, pl.get_root()->offsets.at(1));
+    ASSERT_EQ(8, pl.get_root()->offsets.at(2));
+    ASSERT_EQ(2, pl.get_root()->offsets.at(3));
+    ASSERT_EQ(4, pl.get_root()->offsets.at(4));
+    ASSERT_EQ(12, pl.get_root()->offsets.at(5));
+    ASSERT_EQ(3, pl.get_root()->offsets.at(6));
+
+    ASSERT_EQ(12, pl.get_root()->offsets.getMax());
+    ASSERT_EQ(0, pl.get_root()->offsets.getMin());
+
+    ASSERT_EQ(0, pl.get_root()->offset_index.at(0));
+    ASSERT_EQ(3, pl.get_root()->offset_index.at(1));
+    ASSERT_EQ(6, pl.get_root()->offset_index.at(2));
+
+    // update last ID with larger number of offsets
+    pl.upsert(7, {5, 20});
+    ASSERT_EQ(1, pl.size());
+    ASSERT_EQ(3, pl.get_root()->ids.getLength());
+    ASSERT_EQ(8, pl.get_root()->offsets.getLength());
+
+    ASSERT_EQ(0, pl.get_root()->offsets.at(0));
+    ASSERT_EQ(2, pl.get_root()->offsets.at(1));
+    ASSERT_EQ(8, pl.get_root()->offsets.at(2));
+    ASSERT_EQ(2, pl.get_root()->offsets.at(3));
+    ASSERT_EQ(4, pl.get_root()->offsets.at(4));
+    ASSERT_EQ(12, pl.get_root()->offsets.at(5));
+    ASSERT_EQ(5, pl.get_root()->offsets.at(6));
+    ASSERT_EQ(20, pl.get_root()->offsets.at(7));
+
+    ASSERT_EQ(20, pl.get_root()->offsets.getMax());
+    ASSERT_EQ(0, pl.get_root()->offsets.getMin());
+
+    ASSERT_EQ(0, pl.get_root()->offset_index.at(0));
+    ASSERT_EQ(3, pl.get_root()->offset_index.at(1));
+    ASSERT_EQ(6, pl.get_root()->offset_index.at(2));
+}
+
 TEST(PostingListTest, RemovalsOnFirstBlock) {
     std::vector<uint32_t> offsets = {0, 1, 3};
     posting_list_t pl(5);
@@ -318,11 +467,10 @@ TEST(PostingListTest, RandomInsertAndDeletes) {
     std::vector<uint32_t> offsets1 = {0, 1, 3};
     std::vector<uint32_t> offsets2 = {10, 12};
 
-    // generate unique random IDs
-    std::set<uint32_t> ids;
+    std::vector<uint32_t> ids;
 
     for(size_t i = 0; i < 100000; i++) {
-        ids.insert(rand() % 100000);
+        ids.push_back(rand() % 100000);
     }
 
     size_t index = 0;
@@ -337,8 +485,8 @@ TEST(PostingListTest, RandomInsertAndDeletes) {
         pl.erase(rand() % 100000);
     }
 
-    ASSERT_LT(pl.size(), 750);
-    ASSERT_GT(pl.size(), 500);
+    ASSERT_GT(pl.size(), 750);
+    ASSERT_LT(pl.size(), 1000);
 }
 
 TEST(PostingListTest, IntersectionBasics) {