mirror of https://github.com/typesense/typesense.git
Posting list: split up intersection and offset compute.
This commit is contained in:
parent
bf49b351a1
commit
36580dfb62
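The change splits posting list intersection into two phases: block_intersect() collects a batch of intersecting IDs together with the block and within-block index at which each posting list holds that ID, and get_offsets() later expands those saved positions into per-document token positions. Below is a minimal usage sketch modelled on the new ResultsAndOffsetsBasics test further down; the driving loop and the batch size of 100 are assumptions for illustration, not part of this commit.

    // Sketch only: drive the two-phase API introduced by this commit.
    // `lists` is assumed to be a std::vector<posting_list_t*> to intersect.
    std::vector<posting_list_t::iterator_t> its;
    posting_list_t::result_iter_state_t iter_state;

    bool has_more = true;
    while(has_more) {
        // fills iter_state with up to 100 intersecting IDs and, for each ID,
        // the block pointer and in-block index from every posting list
        has_more = posting_list_t::block_intersect(lists, 100, its, iter_state);

        // expands the saved block/index pairs into token positions:
        // one map per result ID, keyed by array index (0 for plain string fields)
        std::vector<std::unordered_map<size_t, std::vector<token_positions_t>>> array_token_positions_vec;
        posting_list_t::get_offsets(iter_state, array_token_positions_vec);
    }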
@@ -1,9 +1,10 @@
#pragma once

#include <stdint.h>
#include <cstdint>
#include <vector>
#include <queue>
#include <stdlib.h>
#include <algorithm>
#include <cstdlib>
#include <limits>
#include "logger.h"
@@ -1,7 +1,10 @@
#pragma once

#include <map>
#include <unordered_map>
#include "sorted_array.h"
#include "array.h"
#include "match_score.h"

typedef uint32_t last_id_t;
@@ -41,8 +44,8 @@ public:

    class iterator_t {
    private:
        block_t* block;
        uint32_t index;
        block_t* curr_block;
        uint32_t curr_index;

        // uncompressed data structures for performance
        block_t* uncompressed_block;
@@ -56,7 +59,14 @@ public:
        void next();
        void skip_to(uint32_t id);
        [[nodiscard]] inline uint32_t id();
        void offsets(std::vector<uint32_t>& offsets);
        [[nodiscard]] inline uint32_t index() const;
        [[nodiscard]] inline block_t* block() const;
    };

    struct result_iter_state_t {
        std::vector<std::vector<block_t*>> blocks;
        std::vector<std::vector<uint32_t>> indices;
        std::vector<uint32_t> ids;
    };

private:
@@ -113,4 +123,16 @@ public:
    iterator_t new_iterator();

    static void intersect(const std::vector<posting_list_t*>& posting_lists, std::vector<uint32_t>& result_ids);

    static bool block_intersect(
        const std::vector<posting_list_t*>& posting_lists,
        size_t batch_size,
        std::vector<posting_list_t::iterator_t>& its,
        result_iter_state_t& iter_state
    );

    static bool get_offsets(
        result_iter_state_t& iter_state,
        std::vector<std::unordered_map<size_t, std::vector<token_positions_t>>>& array_token_positions
    );
};
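The new result_iter_state_t above records, for every intersecting ID found in a batch, where each posting list holds that ID, so that offset decoding can be deferred to get_offsets(). A hypothetical illustration of its contents after one batch over three posting lists (values invented):

    // iter_state.ids     = {3, 20}                        // matched document IDs
    // iter_state.blocks  = {{b1, b2, b3}, {b4, b5, b6}}   // per ID: one block_t* per posting list
    // iter_state.indices = {{1, 0, 1},    {3, 2, 2}}      // per ID: the ID's index inside that block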
@@ -566,6 +566,165 @@ void posting_list_t::intersect(const std::vector<posting_list_t*>& posting_lists
    }
}

bool posting_list_t::block_intersect(const std::vector<posting_list_t*>& posting_lists, const size_t batch_size,
                                     std::vector<posting_list_t::iterator_t>& its,
                                     result_iter_state_t& iter_state) {
    if(its.empty()) {
        its.reserve(posting_lists.size());

        for(const auto& posting_list: posting_lists) {
            its.push_back(posting_list->new_iterator());
        }
    } else {
        // already in the middle of iteration: prepare for next batch
        iter_state.ids.clear();
        iter_state.indices.clear();
        iter_state.blocks.clear();
    }

    size_t num_lists = its.size();

    switch (num_lists) {
        case 2:
            while(!at_end2(its)) {
                if(equals2(its)) {
                    // still need to ensure that the ID exists in inclusion list but NOT in exclusion list
                    iter_state.ids.push_back(its[0].id());

                    std::vector<block_t*> block_vec(2);
                    std::vector<uint32_t> index_vec(its.size());
                    block_vec[0] = its[0].block();
                    block_vec[1] = its[1].block();

                    index_vec[0] = its[0].index();
                    index_vec[1] = its[1].index();

                    iter_state.blocks.emplace_back(block_vec);
                    iter_state.indices.emplace_back(index_vec);

                    advance_all2(its);
                } else {
                    advance_least2(its);
                }

                if(iter_state.ids.size() == batch_size) {
                    return true;
                }
            }
            break;
        default:
            while(!at_end(its)) {
                if(equals(its)) {
                    //LOG(INFO) << its[0].id();
                    iter_state.ids.push_back(its[0].id());

                    std::vector<block_t*> block_vec(its.size());
                    std::vector<uint32_t> index_vec(its.size());

                    for(size_t i = 0; i < its.size(); i++) {
                        block_vec[i] = its[i].block();
                        index_vec[i] = its[i].index();
                    }

                    iter_state.blocks.emplace_back(block_vec);
                    iter_state.indices.emplace_back(index_vec);

                    advance_all(its);
                } else {
                    advance_least(its);
                }

                if(iter_state.ids.size() == batch_size) {
                    return true;
                }
            }
    }

    return false;
}

bool posting_list_t::get_offsets(posting_list_t::result_iter_state_t& iter_state,
                                 std::vector<std::unordered_map<size_t, std::vector<token_positions_t>>>& array_token_positions_vec) {

    // Plain string format:
    // offset1, offset2, ... , 0 (if token is the last offset for the document)

    // Array string format:
    // offset1, ... , offsetn, offsetn, array_index, 0 (if token is the last offset for the document)
    // (last offset is repeated to indicate end of offsets for a given array index)

    // For each result ID and for each block it is contained in, calculate offsets

    for(size_t i = 0; i < iter_state.ids.size(); i++) {
        uint32_t id = iter_state.ids[i];
        array_token_positions_vec.emplace_back();
        std::unordered_map<size_t, std::vector<token_positions_t>>& array_tok_pos = array_token_positions_vec.back();

        for(size_t j = 0; j < iter_state.blocks[i].size(); j++) {
            block_t* curr_block = iter_state.blocks[i][j];
            uint32_t curr_index = iter_state.indices[i][j];

            uint32_t* offsets = curr_block->offsets.uncompress();

            uint32_t start_offset = curr_block->offset_index.at(curr_index);
            uint32_t end_offset = (curr_index == curr_block->size() - 1) ?
                                  curr_block->offsets.getLength() :
                                  curr_block->offset_index.at(curr_index + 1);

            std::vector<uint16_t> positions;
            int prev_pos = -1;
            bool is_last_token = false;

            while(start_offset < end_offset) {
                int pos = offsets[start_offset];
                start_offset++;

                if(pos == 0) {
                    // indicates that token is the last token on the doc
                    is_last_token = true;
                    start_offset++;
                    continue;
                }

                if(pos == prev_pos) {  // indicates end of array index
                    if(!positions.empty()) {
                        size_t array_index = (size_t) offsets[start_offset];
                        is_last_token = false;

                        if(start_offset+1 < end_offset) {
                            size_t next_offset = (size_t) offsets[start_offset + 1];
                            if(next_offset == 0) {
                                // indicates that token is the last token on the doc
                                is_last_token = true;
                                start_offset++;
                            }
                        }

                        array_tok_pos[array_index].push_back(token_positions_t{is_last_token, positions});
                        positions.clear();
                    }

                    start_offset++;  // skip current value which is the array index or flag for last index
                    prev_pos = -1;
                    continue;
                }

                prev_pos = pos;
                positions.push_back((uint16_t)pos - 1);
            }

            if(!positions.empty()) {
                // for plain string fields
                array_tok_pos[0].push_back(token_positions_t{is_last_token, positions});
            }

            delete [] offsets;
        }
    }

    return false;
}

bool posting_list_t::at_end(const std::vector<posting_list_t::iterator_t>& its) {
    // if any one iterator is at end, we can stop
    for(const auto& it : its) {
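For the offset layout that get_offsets() parses above, a small illustrative example; the literal values are invented for illustration and are not taken from the commit:

    // Plain string field: token at offsets 1, 2 and 4, and it is the last token
    // of the document (the trailing 0 is the last-token marker):
    //     stored {1, 2, 4, 0}  ->  positions {0, 1, 3}, is_last_token = true
    //
    // Array field: token at offsets {1, 2} in array index 0 and offset {5} in
    // array index 2 (each group ends with its last offset repeated, followed by
    // the array index):
    //     stored {1, 2, 2, 0, 5, 5, 2}
    //         ->  array index 0: positions {0, 1}
    //         ->  array index 2: positions {4}
    //
    // Recovered positions are one less than the stored offsets, since offset 0
    // is reserved as the last-token marker.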
@@ -643,55 +802,58 @@ size_t posting_list_t::num_ids() {
/* iterator_t operations */

posting_list_t::iterator_t::iterator_t(posting_list_t::block_t* root):
        block(root), index(0), uncompressed_block(nullptr), ids(nullptr) {
        curr_block(root), curr_index(0), uncompressed_block(nullptr), ids(nullptr) {

}

bool posting_list_t::iterator_t::valid() const {
    return (block != nullptr) && (index < block->size());
    return (curr_block != nullptr) && (curr_index < curr_block->size());
}

void posting_list_t::iterator_t::next() {
    index++;
    if(index == block->size()) {
        index = 0;
        block = block->next;
    curr_index++;
    if(curr_index == curr_block->size()) {
        curr_index = 0;
        curr_block = curr_block->next;
    }
}

uint32_t posting_list_t::iterator_t::id() {
    //return block->ids.at(index);
    if(uncompressed_block != curr_block) {
        uncompressed_block = curr_block;

    if(uncompressed_block != block) {
        delete [] ids;
        ids = nullptr;
        uncompressed_block = block;

        if(block != nullptr) {
            ids = block->ids.uncompress();
        if(curr_block != nullptr) {
            ids = curr_block->ids.uncompress();
        }
    }

    return ids[index];
    return ids[curr_index];
}

void posting_list_t::iterator_t::offsets(std::vector<uint32_t>& offsets) {
    // TODO
uint32_t posting_list_t::iterator_t::index() const {
    return curr_index;
}

posting_list_t::block_t* posting_list_t::iterator_t::block() const {
    return curr_block;
}

void posting_list_t::iterator_t::skip_to(uint32_t id) {
    bool skipped_block = false;
    while(block != nullptr && block->ids.last() < id) {
        block = block->next;
    while(curr_block != nullptr && curr_block->ids.last() < id) {
        curr_block = curr_block->next;
        skipped_block = true;
    }

    if(skipped_block) {
        index = 0;
        curr_index = 0;
    }

    while(block != nullptr && index < block->size() && this->id() < id) {
        index++;
    while(curr_block != nullptr && curr_index < curr_block->size() && this->id() < id) {
        curr_index++;
    }
}
@@ -701,11 +863,11 @@ posting_list_t::iterator_t::~iterator_t() {
}

posting_list_t::iterator_t::iterator_t(iterator_t&& rhs) noexcept {
    block = rhs.block;
    index = rhs.index;
    curr_block = rhs.curr_block;
    curr_index = rhs.curr_index;
    uncompressed_block = rhs.uncompressed_block;
    ids = rhs.ids;

    rhs.block = nullptr;
    rhs.curr_block = nullptr;
    rhs.ids = nullptr;
}
@@ -557,6 +557,86 @@ TEST(PostingListTest, IntersectionBasics) {
    ASSERT_EQ(20, result_ids[1]);
}

TEST(PostingListTest, ResultsAndOffsetsBasics) {
    // NOTE: due to the way offsets1 are parsed, the actual positions are 1 less than the offset values stored
    // (to account for the special offset `0` which indicates last offset
    std::vector<uint32_t> offsets1 = {1, 2, 4};
    std::vector<uint32_t> offsets2 = {5, 6};
    std::vector<uint32_t> offsets3 = {7};

    std::vector<posting_list_t*> lists;

    // T1: [0, 2] [3, 20]
    // T2: [1, 3], [5, 10], [20]
    // T3: [2, 3], [5, 7], [20]

    // 3: (0, 1, 3} {4, 5} {6}
    // 2: {6} {4, 5} {0, 1, 3}

    std::vector<token_positions_t> actual_offsets_3 = {
        token_positions_t{false, {0, 1, 3}},
        token_positions_t{false, {4, 5}},
        token_positions_t{false, {6}},
    };

    std::vector<token_positions_t> actual_offsets_20 = {
        token_positions_t{false, {6}},
        token_positions_t{false, {4, 5}},
        token_positions_t{false, {0, 1, 3}},
    };

    posting_list_t p1(2);
    p1.upsert(0, offsets1);
    p1.upsert(2, offsets1);
    p1.upsert(3, offsets1);
    p1.upsert(20, offsets3);

    posting_list_t p2(2);
    p2.upsert(1, offsets1);
    p2.upsert(3, offsets2);
    p2.upsert(5, offsets1);
    p2.upsert(10, offsets1);
    p2.upsert(20, offsets2);

    posting_list_t p3(2);
    p3.upsert(2, offsets1);
    p3.upsert(3, offsets3);
    p3.upsert(5, offsets1);
    p3.upsert(7, offsets1);
    p3.upsert(20, offsets1);

    lists.push_back(&p1);
    lists.push_back(&p2);
    lists.push_back(&p3);

    std::vector<posting_list_t::iterator_t> its;

    posting_list_t::result_iter_state_t iter_state;
    posting_list_t::block_intersect(lists, 2, its, iter_state);

    ASSERT_EQ(2, iter_state.ids.size());
    ASSERT_EQ(3, iter_state.ids[0]);
    ASSERT_EQ(20, iter_state.ids[1]);

    ASSERT_EQ(2, iter_state.blocks.size());
    ASSERT_EQ(3, iter_state.blocks[0].size());
    ASSERT_EQ(3, iter_state.blocks[1].size());

    ASSERT_EQ(2, iter_state.indices.size());

    std::vector<std::unordered_map<size_t, std::vector<token_positions_t>>> array_token_positions_vec;
    posting_list_t::get_offsets(iter_state, array_token_positions_vec);
    ASSERT_EQ(2, array_token_positions_vec.size());

    ASSERT_EQ(actual_offsets_3[0].positions, array_token_positions_vec[0].at(0)[0].positions);
    ASSERT_EQ(actual_offsets_3[1].positions, array_token_positions_vec[0].at(0)[1].positions);
    ASSERT_EQ(actual_offsets_3[2].positions, array_token_positions_vec[0].at(0)[2].positions);

    ASSERT_EQ(actual_offsets_20[0].positions, array_token_positions_vec[1].at(0)[0].positions);
    ASSERT_EQ(actual_offsets_20[1].positions, array_token_positions_vec[1].at(0)[1].positions);
    ASSERT_EQ(actual_offsets_20[2].positions, array_token_positions_vec[1].at(0)[2].positions);
}

TEST(PostingListTest, IntersectionSkipBlocks) {
    std::vector<uint32_t> offsets = {0, 1, 3};
    std::vector<posting_list_t*> lists;
@@ -876,7 +956,7 @@ TEST(PostingListTest, DISABLED_Benchmark) {
    LOG(INFO) << "Time taken for 5 sorted array updates: " << timeMicros;
}

TEST(PostingListTest, DISABLED_BenchmarkIntersection) {
TEST(PostingListTest, BenchmarkIntersection) {
    std::vector<uint32_t> offsets = {0, 1, 3};

    time_t t;