Posting list: split up intersection and offset computation.

Kishore Nallan 2021-06-14 20:29:50 +05:30
parent bf49b351a1
commit 36580dfb62
4 changed files with 293 additions and 28 deletions
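The change splits result iteration into two phases: block_intersect() gathers a batch of matching ids (along with the block and in-block index of each id in every list), and get_offsets() then resolves the token offsets for that batch. A minimal usage sketch, mirroring the new ResultsAndOffsetsBasics test below and assuming p1, p2 and p3 are posting_list_t instances already populated via upsert():

std::vector<posting_list_t*> lists = {&p1, &p2, &p3};   // assumed: already built via upsert()
std::vector<posting_list_t::iterator_t> its;            // left empty; block_intersect() initializes it
posting_list_t::result_iter_state_t iter_state;

bool batch_full = true;
while(batch_full) {
    // fill iter_state with the next (up to) 2 matching ids plus their block locations
    batch_full = posting_list_t::block_intersect(lists, 2, its, iter_state);

    // resolve token offsets for this batch (the final batch may be partial)
    std::vector<std::unordered_map<size_t, std::vector<token_positions_t>>> array_token_positions_vec;
    posting_list_t::get_offsets(iter_state, array_token_positions_vec);

    // iter_state.ids[i] and array_token_positions_vec[i] describe the i-th result of the batch
}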

View File

@@ -1,9 +1,10 @@
#pragma once
#include <stdint.h>
#include <cstdint>
#include <vector>
#include <queue>
#include <stdlib.h>
#include <algorithm>
#include <cstdlib>
#include <limits>
#include "logger.h"

View File

@@ -1,7 +1,10 @@
#pragma once
#include <map>
#include <unordered_map>
#include "sorted_array.h"
#include "array.h"
#include "match_score.h"
typedef uint32_t last_id_t;
@@ -41,8 +44,8 @@ public:
class iterator_t {
private:
block_t* block;
uint32_t index;
block_t* curr_block;
uint32_t curr_index;
// uncompressed data structures for performance
block_t* uncompressed_block;
@@ -56,7 +59,14 @@ public:
void next();
void skip_to(uint32_t id);
[[nodiscard]] inline uint32_t id();
void offsets(std::vector<uint32_t>& offsets);
[[nodiscard]] inline uint32_t index() const;
[[nodiscard]] inline block_t* block() const;
};
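// Holds one batch of intersection results: for each ids[i], blocks[i][j] and
// indices[i][j] locate that id within the j-th posting list.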
struct result_iter_state_t {
std::vector<std::vector<block_t*>> blocks;
std::vector<std::vector<uint32_t>> indices;
std::vector<uint32_t> ids;
};
private:
@@ -113,4 +123,16 @@ public:
iterator_t new_iterator();
static void intersect(const std::vector<posting_list_t*>& posting_lists, std::vector<uint32_t>& result_ids);
static bool block_intersect(
const std::vector<posting_list_t*>& posting_lists,
size_t batch_size,
std::vector<posting_list_t::iterator_t>& its,
result_iter_state_t& iter_state
);
static bool get_offsets(
result_iter_state_t& iter_state,
std::vector<std::unordered_map<size_t, std::vector<token_positions_t>>>& array_token_positions
);
};

View File

@@ -566,6 +566,165 @@ void posting_list_t::intersect(const std::vector<posting_list_t*>& posting_lists
}
}
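// Gathers the next batch of ids common to all posting lists (up to batch_size), recording the
// block and in-block index of each id in every list. Returns true if a full batch was collected
// and iteration can continue, false once any list is exhausted.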
bool posting_list_t::block_intersect(const std::vector<posting_list_t*>& posting_lists, const size_t batch_size,
std::vector<posting_list_t::iterator_t>& its,
result_iter_state_t& iter_state) {
if(its.empty()) {
its.reserve(posting_lists.size());
for(const auto& posting_list: posting_lists) {
its.push_back(posting_list->new_iterator());
}
} else {
// already in the middle of iteration: prepare for next batch
iter_state.ids.clear();
iter_state.indices.clear();
iter_state.blocks.clear();
}
size_t num_lists = its.size();
switch (num_lists) {
case 2:
while(!at_end2(its)) {
if(equals2(its)) {
// still need to ensure that the ID exists in inclusion list but NOT in exclusion list
iter_state.ids.push_back(its[0].id());
std::vector<block_t*> block_vec(2);
std::vector<uint32_t> index_vec(its.size());
block_vec[0] = its[0].block();
block_vec[1] = its[1].block();
index_vec[0] = its[0].index();
index_vec[1] = its[1].index();
iter_state.blocks.emplace_back(block_vec);
iter_state.indices.emplace_back(index_vec);
advance_all2(its);
} else {
advance_least2(its);
}
if(iter_state.ids.size() == batch_size) {
return true;
}
}
break;
default:
while(!at_end(its)) {
if(equals(its)) {
//LOG(INFO) << its[0].id();
iter_state.ids.push_back(its[0].id());
std::vector<block_t*> block_vec(its.size());
std::vector<uint32_t> index_vec(its.size());
for(size_t i = 0; i < its.size(); i++) {
block_vec[i] = its[i].block();
index_vec[i] = its[i].index();
}
iter_state.blocks.emplace_back(block_vec);
iter_state.indices.emplace_back(index_vec);
advance_all(its);
} else {
advance_least(its);
}
if(iter_state.ids.size() == batch_size) {
return true;
}
}
}
return false;
}
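// Decodes token offsets for a batch produced by block_intersect(): for each result id, builds a
// map of array_index -> token_positions_t entries, appending (in posting list order) one entry
// per array index in which the token occurs.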
bool posting_list_t::get_offsets(posting_list_t::result_iter_state_t& iter_state,
std::vector<std::unordered_map<size_t, std::vector<token_positions_t>>>& array_token_positions_vec) {
// Plain string format:
// offset1, offset2, ... , 0 (the trailing 0 is present only if this is the last token of the document)
// Array string format:
// offset1, ... , offsetn, offsetn, array_index, 0 (the trailing 0 is present only if this is the last token of the document)
// (the last offset is repeated to indicate the end of offsets for a given array index)
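// Example (illustrative): the stored stream 1, 2, 2, 1, 5, 5, 2, 0 decodes as array index 1 ->
// offsets {1, 2} and array index 2 -> offset {5}, with the trailing 0 marking the token as the
// last token of the document (the positions recorded below are these offsets minus 1).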
// For each result ID and for each block it is contained in, calculate offsets
for(size_t i = 0; i < iter_state.ids.size(); i++) {
uint32_t id = iter_state.ids[i];
array_token_positions_vec.emplace_back();
std::unordered_map<size_t, std::vector<token_positions_t>>& array_tok_pos = array_token_positions_vec.back();
for(size_t j = 0; j < iter_state.blocks[i].size(); j++) {
block_t* curr_block = iter_state.blocks[i][j];
uint32_t curr_index = iter_state.indices[i][j];
uint32_t* offsets = curr_block->offsets.uncompress();
uint32_t start_offset = curr_block->offset_index.at(curr_index);
uint32_t end_offset = (curr_index == curr_block->size() - 1) ?
curr_block->offsets.getLength() :
curr_block->offset_index.at(curr_index + 1);
std::vector<uint16_t> positions;
int prev_pos = -1;
bool is_last_token = false;
while(start_offset < end_offset) {
int pos = offsets[start_offset];
start_offset++;
if(pos == 0) {
// indicates that token is the last token on the doc
is_last_token = true;
start_offset++;
continue;
}
if(pos == prev_pos) { // indicates end of array index
if(!positions.empty()) {
size_t array_index = (size_t) offsets[start_offset];
is_last_token = false;
if(start_offset+1 < end_offset) {
size_t next_offset = (size_t) offsets[start_offset + 1];
if(next_offset == 0) {
// indicates that token is the last token on the doc
is_last_token = true;
start_offset++;
}
}
array_tok_pos[array_index].push_back(token_positions_t{is_last_token, positions});
positions.clear();
}
start_offset++; // skip the current value, which is either the array index or the last-token (0) flag
prev_pos = -1;
continue;
}
prev_pos = pos;
positions.push_back((uint16_t)pos - 1);
}
if(!positions.empty()) {
// for plain string fields
array_tok_pos[0].push_back(token_positions_t{is_last_token, positions});
}
delete [] offsets;
}
}
return false;
}
bool posting_list_t::at_end(const std::vector<posting_list_t::iterator_t>& its) {
// if any one iterator is at end, we can stop
for(const auto& it : its) {
@@ -643,55 +802,58 @@ size_t posting_list_t::num_ids() {
/* iterator_t operations */
posting_list_t::iterator_t::iterator_t(posting_list_t::block_t* root):
block(root), index(0), uncompressed_block(nullptr), ids(nullptr) {
curr_block(root), curr_index(0), uncompressed_block(nullptr), ids(nullptr) {
}
bool posting_list_t::iterator_t::valid() const {
return (block != nullptr) && (index < block->size());
return (curr_block != nullptr) && (curr_index < curr_block->size());
}
void posting_list_t::iterator_t::next() {
index++;
if(index == block->size()) {
index = 0;
block = block->next;
curr_index++;
if(curr_index == curr_block->size()) {
curr_index = 0;
curr_block = curr_block->next;
}
}
uint32_t posting_list_t::iterator_t::id() {
//return block->ids.at(index);
if(uncompressed_block != curr_block) {
uncompressed_block = curr_block;
if(uncompressed_block != block) {
delete [] ids;
ids = nullptr;
uncompressed_block = block;
if(block != nullptr) {
ids = block->ids.uncompress();
if(curr_block != nullptr) {
ids = curr_block->ids.uncompress();
}
}
return ids[index];
return ids[curr_index];
}
void posting_list_t::iterator_t::offsets(std::vector<uint32_t>& offsets) {
// TODO
uint32_t posting_list_t::iterator_t::index() const {
return curr_index;
}
posting_list_t::block_t* posting_list_t::iterator_t::block() const {
return curr_block;
}
void posting_list_t::iterator_t::skip_to(uint32_t id) {
bool skipped_block = false;
while(block != nullptr && block->ids.last() < id) {
block = block->next;
while(curr_block != nullptr && curr_block->ids.last() < id) {
curr_block = curr_block->next;
skipped_block = true;
}
if(skipped_block) {
index = 0;
curr_index = 0;
}
while(block != nullptr && index < block->size() && this->id() < id) {
index++;
while(curr_block != nullptr && curr_index < curr_block->size() && this->id() < id) {
curr_index++;
}
}
@@ -701,11 +863,11 @@ posting_list_t::iterator_t::~iterator_t() {
}
posting_list_t::iterator_t::iterator_t(iterator_t&& rhs) noexcept {
block = rhs.block;
index = rhs.index;
curr_block = rhs.curr_block;
curr_index = rhs.curr_index;
uncompressed_block = rhs.uncompressed_block;
ids = rhs.ids;
rhs.block = nullptr;
rhs.curr_block = nullptr;
rhs.ids = nullptr;
}

View File

@@ -557,6 +557,86 @@ TEST(PostingListTest, IntersectionBasics) {
ASSERT_EQ(20, result_ids[1]);
}
TEST(PostingListTest, ResultsAndOffsetsBasics) {
// NOTE: due to the way the offsets are parsed, the actual positions are 1 less than the offset values stored
// (to account for the special offset `0`, which indicates the last offset of the document)
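// e.g. the stored offsets {1, 2, 4} below correspond to token positions {0, 1, 3}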
std::vector<uint32_t> offsets1 = {1, 2, 4};
std::vector<uint32_t> offsets2 = {5, 6};
std::vector<uint32_t> offsets3 = {7};
std::vector<posting_list_t*> lists;
// T1: [0, 2] [3, 20]
// T2: [1, 3], [5, 10], [20]
// T3: [2, 3], [5, 7], [20]
// 3: {0, 1, 3} {4, 5} {6}
// 20: {6} {4, 5} {0, 1, 3}
std::vector<token_positions_t> actual_offsets_3 = {
token_positions_t{false, {0, 1, 3}},
token_positions_t{false, {4, 5}},
token_positions_t{false, {6}},
};
std::vector<token_positions_t> actual_offsets_20 = {
token_positions_t{false, {6}},
token_positions_t{false, {4, 5}},
token_positions_t{false, {0, 1, 3}},
};
posting_list_t p1(2);
p1.upsert(0, offsets1);
p1.upsert(2, offsets1);
p1.upsert(3, offsets1);
p1.upsert(20, offsets3);
posting_list_t p2(2);
p2.upsert(1, offsets1);
p2.upsert(3, offsets2);
p2.upsert(5, offsets1);
p2.upsert(10, offsets1);
p2.upsert(20, offsets2);
posting_list_t p3(2);
p3.upsert(2, offsets1);
p3.upsert(3, offsets3);
p3.upsert(5, offsets1);
p3.upsert(7, offsets1);
p3.upsert(20, offsets1);
lists.push_back(&p1);
lists.push_back(&p2);
lists.push_back(&p3);
std::vector<posting_list_t::iterator_t> its;
posting_list_t::result_iter_state_t iter_state;
posting_list_t::block_intersect(lists, 2, its, iter_state);
ASSERT_EQ(2, iter_state.ids.size());
ASSERT_EQ(3, iter_state.ids[0]);
ASSERT_EQ(20, iter_state.ids[1]);
ASSERT_EQ(2, iter_state.blocks.size());
ASSERT_EQ(3, iter_state.blocks[0].size());
ASSERT_EQ(3, iter_state.blocks[1].size());
ASSERT_EQ(2, iter_state.indices.size());
std::vector<std::unordered_map<size_t, std::vector<token_positions_t>>> array_token_positions_vec;
posting_list_t::get_offsets(iter_state, array_token_positions_vec);
ASSERT_EQ(2, array_token_positions_vec.size());
ASSERT_EQ(actual_offsets_3[0].positions, array_token_positions_vec[0].at(0)[0].positions);
ASSERT_EQ(actual_offsets_3[1].positions, array_token_positions_vec[0].at(0)[1].positions);
ASSERT_EQ(actual_offsets_3[2].positions, array_token_positions_vec[0].at(0)[2].positions);
ASSERT_EQ(actual_offsets_20[0].positions, array_token_positions_vec[1].at(0)[0].positions);
ASSERT_EQ(actual_offsets_20[1].positions, array_token_positions_vec[1].at(0)[1].positions);
ASSERT_EQ(actual_offsets_20[2].positions, array_token_positions_vec[1].at(0)[2].positions);
}
TEST(PostingListTest, IntersectionSkipBlocks) {
std::vector<uint32_t> offsets = {0, 1, 3};
std::vector<posting_list_t*> lists;
@@ -876,7 +956,7 @@ TEST(PostingListTest, DISABLED_Benchmark) {
LOG(INFO) << "Time taken for 5 sorted array updates: " << timeMicros;
}
TEST(PostingListTest, DISABLED_BenchmarkIntersection) {
TEST(PostingListTest, BenchmarkIntersection) {
std::vector<uint32_t> offsets = {0, 1, 3};
time_t t;