Fix an edge case in string update.

This commit is contained in:
Kishore Nallan 2022-01-29 18:25:57 +05:30
parent 2534b1f798
commit 13622ff038
3 changed files with 105 additions and 12 deletions

View File

@ -33,6 +33,8 @@ public:
void remove_and_shift_offset_index(const uint32_t* indices_sorted, uint32_t num_indices);
void insert_and_shift_offset_index(const uint32_t index, const uint32_t num_offsets);
uint32_t upsert(uint32_t id, const std::vector<uint32_t>& offsets);
uint32_t erase(uint32_t id);
@ -61,6 +63,7 @@ public:
[[nodiscard]] bool valid() const;
void next();
void skip_to(uint32_t id);
void set_index(uint32_t index);
[[nodiscard]] uint32_t id() const;
[[nodiscard]] inline uint32_t index() const;
[[nodiscard]] inline block_t* block() const;
@ -129,6 +132,8 @@ public:
void erase(uint32_t id);
void dump();
block_t* get_root();
size_t num_blocks() const;

View File

@ -6,11 +6,29 @@
/* block_t operations */
uint32_t posting_list_t::block_t::upsert(const uint32_t id, const std::vector<uint32_t>& positions) {
if(id <= ids.last() && ids.getLength() != 0) {
// we have to check if `id` already exists, for an opportunity to do in-place updates
if(id > ids.last() || ids.getLength() == 0) {
// append to the end
ids.append(id);
uint32_t curr_index = offsets.getLength();
offset_index.append(curr_index);
for(uint32_t position : positions) {
offsets.append(position);
}
}
else {
// we have to check if `id` already exists, and do in-place update/insert
uint32_t id_index = ids.indexOf(id);
if(id_index != ids.getLength()) {
if(id_index == ids.getLength()) {
// id not found, we have to insert it
size_t inserted_index = ids.append(id);
uint32_t existing_offset_index = offset_index.at(inserted_index);
insert_and_shift_offset_index(inserted_index, positions.size());
offsets.insert(existing_offset_index, &positions[0], positions.size());
}
else {
// id is already present, so we will only update offset index and offsets
uint32_t start_offset_index = offset_index.at(id_index);
uint32_t end_offset_index = (id == ids.last()) ? offsets.getLength()-1 : offset_index.at(id_index + 1)-1;
@ -113,15 +131,6 @@ uint32_t posting_list_t::block_t::upsert(const uint32_t id, const std::vector<ui
}
}
// treat as regular append (either id not found or exceeds max id)
ids.append(id);
uint32_t curr_index = offsets.getLength();
offset_index.append(curr_index);
for(uint32_t position : positions) {
offsets.append(position);
}
return 1;
}
@ -179,6 +188,26 @@ void posting_list_t::block_t::remove_and_shift_offset_index(const uint32_t* indi
delete[] new_array;
}
// Opens a new slot at `index` in the offset index. The new slot takes over the start offset of the
// entry that currently sits at `index`, and every entry after the new slot is pushed forward by
// `num_offsets`.
void posting_list_t::block_t::insert_and_shift_offset_index(const uint32_t index, const uint32_t num_offsets) {
    // the inserted entry begins exactly where the entry it displaces used to begin
    const uint32_t slot_start = offset_index.at(index);
    const uint32_t old_len = offset_index.getLength();
    const uint32_t grown_len = old_len + 1;

    // request a buffer with room for one extra element
    uint32_t* buffer = offset_index.uncompress(grown_len);

    // move [index, old_len) one position to the right to free up `index`
    memmove(buffer + index + 1, buffer + index, (old_len - index) * sizeof(uint32_t));
    buffer[index] = slot_start;

    // everything behind the new slot now starts `num_offsets` later
    for(uint32_t i = index + 1; i < grown_len; i++) {
        buffer[i] += num_offsets;
    }

    offset_index.load(buffer, grown_len);
    delete [] buffer;
}
// Reports whether `id` is present in this block's `ids` container.
bool posting_list_t::block_t::contains(uint32_t id) {
    const bool is_present = ids.contains(id);
    return is_present;
}
@ -467,6 +496,41 @@ void posting_list_t::upsert(const uint32_t id, const std::vector<uint32_t>& offs
}
}
// Debugging aid: walks the posting list block by block and logs three comma separated lists:
// the ids, the offset index entries and the offsets.
void posting_list_t::dump() {
    auto it = new_iterator();
    std::string ids_str;
    std::string offset_index_str;
    std::string offsets_str;

    while(it.valid()) {
        const auto block_size = it.block()->size();

        for(auto index = it.index(); index < block_size; index++) {
            ids_str += std::to_string(it.ids[index]);
            ids_str += ", ";
            offset_index_str += std::to_string(it.offset_index[index]);
            offset_index_str += ", ";
        }

        // The offset index entry of the last id only marks where that id's offsets *begin*; its
        // offsets extend to the end of the block's offsets array. Stopping at that entry would drop
        // every offset of the last id except the first one, so we walk the full length instead.
        // NOTE(review): assumes `it.offsets` mirrors the current block's `offsets` array -- confirm.
        const auto num_offsets = it.block()->offsets.getLength();
        for(size_t j = 0; j < num_offsets; j++) {
            offsets_str += std::to_string(it.offsets[j]);
            offsets_str += ", ";
        }

        // jump to the last element of this block so that `next()` moves past the block
        it.set_index(block_size - 1);
        it.next();
    }

    LOG(INFO) << "ids_str:";
    LOG(INFO) << ids_str;
    LOG(INFO) << "offset_index_str:";
    LOG(INFO) << offset_index_str;
    LOG(INFO) << "offsets_str:";
    LOG(INFO) << offsets_str;
}
void posting_list_t::erase(const uint32_t id) {
const auto it = id_block_map.lower_bound(id);
@ -1354,3 +1418,7 @@ posting_list_t::iterator_t::iterator_t(iterator_t&& rhs) noexcept {
rhs.offset_index = nullptr;
rhs.offsets = nullptr;
}
void posting_list_t::iterator_t::set_index(uint32_t index) {
curr_index = index;
}

View File

@ -129,6 +129,26 @@ TEST_F(PostingListTest, Insert) {
}
}
TEST_F(PostingListTest, InsertInMiddle) {
    // ids 1 and 3 are added first, so id 2 has to be placed between two existing ids
    posting_list_t pl(3);
    pl.upsert(1, {1});
    pl.upsert(3, {3});
    pl.upsert(2, {2});

    auto* root = pl.get_root();

    // ids must end up in sorted order
    ASSERT_EQ(1, root->ids.at(0));
    ASSERT_EQ(2, root->ids.at(1));
    ASSERT_EQ(3, root->ids.at(2));

    // each id owns exactly one offset, so the offset index is 0, 1, 2
    ASSERT_EQ(0, root->offset_index.at(0));
    ASSERT_EQ(1, root->offset_index.at(1));
    ASSERT_EQ(2, root->offset_index.at(2));

    // the offsets must line up with their ids
    ASSERT_EQ(1, root->offsets.at(0));
    ASSERT_EQ(2, root->offsets.at(1));
    ASSERT_EQ(3, root->offsets.at(2));
}
TEST_F(PostingListTest, InplaceUpserts) {
std::vector<uint32_t> offsets = {1, 2, 3};
posting_list_t pl(5);