Posting list: split up intersection and offset computation.

Kishore Nallan 2021-06-14 20:29:50 +05:30
parent bf49b351a1
commit 36580dfb62
4 changed files with 293 additions and 28 deletions
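The change splits result iteration into two phases: block_intersect() gathers a batch of matching ids (along with the block and in-block index of each id in every list), and get_offsets() then resolves the token offsets for that batch. A minimal usage sketch, mirroring the new ResultsAndOffsetsBasics test below and assuming p1, p2 and p3 are posting_list_t instances already populated via upsert():

std::vector<posting_list_t*> lists = {&p1, &p2, &p3};   // assumed: already built via upsert()
std::vector<posting_list_t::iterator_t> its;            // left empty; block_intersect() initializes it
posting_list_t::result_iter_state_t iter_state;

bool batch_full = true;
while(batch_full) {
    // fill iter_state with the next (up to) 2 matching ids plus their block locations
    batch_full = posting_list_t::block_intersect(lists, 2, its, iter_state);

    // resolve token offsets for this batch (the final batch may be partial)
    std::vector<std::unordered_map<size_t, std::vector<token_positions_t>>> array_token_positions_vec;
    posting_list_t::get_offsets(iter_state, array_token_positions_vec);

    // iter_state.ids[i] and array_token_positions_vec[i] describe the i-th result of the batch
}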

View File

@@ -1,9 +1,10 @@
#pragma once
#include <stdint.h>
#include <cstdint>
#include <vector>
#include <queue>
#include <stdlib.h>
#include <algorithm>
#include <cstdlib>
#include <limits>
#include "logger.h"

View File

@@ -1,7 +1,10 @@
#pragma once
#include <map>
#include <unordered_map>
#include "sorted_array.h"
#include "array.h"
#include "match_score.h"
typedef uint32_t last_id_t;
@@ -41,8 +44,8 @@ public:
class iterator_t {
private:
block_t* block;
uint32_t index;
block_t* curr_block;
uint32_t curr_index;
// uncompressed data structures for performance
block_t* uncompressed_block;
@@ -56,7 +59,14 @@ public:
void next();
void skip_to(uint32_t id);
[[nodiscard]] inline uint32_t id();
void offsets(std::vector<uint32_t>& offsets);
[[nodiscard]] inline uint32_t index() const;
[[nodiscard]] inline block_t* block() const;
};
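// Holds one batch of intersection results: for each ids[i], blocks[i][j] and
// indices[i][j] locate that id within the j-th posting list.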
struct result_iter_state_t {
std::vector<std::vector<block_t*>> blocks;
std::vector<std::vector<uint32_t>> indices;
std::vector<uint32_t> ids;
};
private:
@@ -113,4 +123,16 @@ public:
iterator_t new_iterator();
static void intersect(const std::vector<posting_list_t*>& posting_lists, std::vector<uint32_t>& result_ids);
static bool block_intersect(
const std::vector<posting_list_t*>& posting_lists,
size_t batch_size,
std::vector<posting_list_t::iterator_t>& its,
result_iter_state_t& iter_state
);
static bool get_offsets(
result_iter_state_t& iter_state,
std::vector<std::unordered_map<size_t, std::vector<token_positions_t>>>& array_token_positions
);
};

View File

@@ -566,6 +566,165 @@ void posting_list_t::intersect(const std::vector<posting_list_t*>& posting_lists
}
}
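// Gathers the next batch of ids common to all posting lists (up to batch_size), recording the
// block and in-block index of each id in every list. Returns true if a full batch was collected
// and iteration can continue, false once any list is exhausted.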
bool posting_list_t::block_intersect(const std::vector<posting_list_t*>& posting_lists, const size_t batch_size,
std::vector<posting_list_t::iterator_t>& its,
result_iter_state_t& iter_state) {
if(its.empty()) {
its.reserve(posting_lists.size());
for(const auto& posting_list: posting_lists) {
its.push_back(posting_list->new_iterator());
}
} else {
// already in the middle of iteration: prepare for next batch
iter_state.ids.clear();
iter_state.indices.clear();
iter_state.blocks.clear();
}
size_t num_lists = its.size();
switch (num_lists) {
case 2:
while(!at_end2(its)) {
if(equals2(its)) {
// still need to ensure that the ID exists in inclusion list but NOT in exclusion list
iter_state.ids.push_back(its[0].id());
std::vector<block_t*> block_vec(2);
std::vector<uint32_t> index_vec(its.size());
block_vec[0] = its[0].block();
block_vec[1] = its[1].block();
index_vec[0] = its[0].index();
index_vec[1] = its[1].index();
iter_state.blocks.emplace_back(block_vec);
iter_state.indices.emplace_back(index_vec);
advance_all2(its);
} else {
advance_least2(its);
}
if(iter_state.ids.size() == batch_size) {
return true;
}
}
break;
default:
while(!at_end(its)) {
if(equals(its)) {
//LOG(INFO) << its[0].id();
iter_state.ids.push_back(its[0].id());
std::vector<block_t*> block_vec(its.size());
std::vector<uint32_t> index_vec(its.size());
for(size_t i = 0; i < its.size(); i++) {
block_vec[i] = its[i].block();
index_vec[i] = its[i].index();
}
iter_state.blocks.emplace_back(block_vec);
iter_state.indices.emplace_back(index_vec);
advance_all(its);
} else {
advance_least(its);
}
if(iter_state.ids.size() == batch_size) {
return true;
}
}
}
return false;
}
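// Decodes token offsets for a batch produced by block_intersect(): for each result id, builds a
// map of array_index -> token_positions_t entries, appending (in posting list order) one entry
// per array index in which the token occurs.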
bool posting_list_t::get_offsets(posting_list_t::result_iter_state_t& iter_state,
std::vector<std::unordered_map<size_t, std::vector<token_positions_t>>>& array_token_positions_vec) {
// Plain string format:
// offset1, offset2, ... , 0 (the trailing 0 is present only if this is the last token of the document)
// Array string format:
// offset1, ... , offsetn, offsetn, array_index, 0 (the trailing 0 is present only if this is the last token of the document)
// (the last offset is repeated to indicate the end of offsets for a given array index)
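// Example (illustrative): the stored stream 1, 2, 2, 1, 5, 5, 2, 0 decodes as array index 1 ->
// offsets {1, 2} and array index 2 -> offset {5}, with the trailing 0 marking the token as the
// last token of the document (the positions recorded below are these offsets minus 1).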
// For each result ID and for each block it is contained in, calculate offsets
for(size_t i = 0; i < iter_state.ids.size(); i++) {
uint32_t id = iter_state.ids[i];
array_token_positions_vec.emplace_back();
std::unordered_map<size_t, std::vector<token_positions_t>>& array_tok_pos = array_token_positions_vec.back();
for(size_t j = 0; j < iter_state.blocks[i].size(); j++) {
block_t* curr_block = iter_state.blocks[i][j];
uint32_t curr_index = iter_state.indices[i][j];
uint32_t* offsets = curr_block->offsets.uncompress();
uint32_t start_offset = curr_block->offset_index.at(curr_index);
uint32_t end_offset = (curr_index == curr_block->size() - 1) ?
curr_block->offsets.getLength() :
curr_block->offset_index.at(curr_index + 1);
std::vector<uint16_t> positions;
int prev_pos = -1;
bool is_last_token = false;
while(start_offset < end_offset) {
int pos = offsets[start_offset];
start_offset++;
if(pos == 0) {
// indicates that token is the last token on the doc
is_last_token = true;
start_offset++;
continue;
}
if(pos == prev_pos) { // indicates end of array index
if(!positions.empty()) {
size_t array_index = (size_t) offsets[start_offset];
is_last_token = false;
if(start_offset+1 < end_offset) {
size_t next_offset = (size_t) offsets[start_offset + 1];
if(next_offset == 0) {
// indicates that token is the last token on the doc
is_last_token = true;
start_offset++;
}
}
array_tok_pos[array_index].push_back(token_positions_t{is_last_token, positions});
positions.clear();
}
start_offset++; // skip the current value, which is either the array index or the last-token (0) flag
prev_pos = -1;
continue;
}
prev_pos = pos;
positions.push_back((uint16_t)pos - 1);
}
if(!positions.empty()) {
// for plain string fields
array_tok_pos[0].push_back(token_positions_t{is_last_token, positions});
}
delete [] offsets;
}
}
return false;
}
bool posting_list_t::at_end(const std::vector<posting_list_t::iterator_t>& its) {
// if any one iterator is at end, we can stop
for(const auto& it : its) {
@@ -643,55 +802,58 @@ size_t posting_list_t::num_ids() {
/* iterator_t operations */
posting_list_t::iterator_t::iterator_t(posting_list_t::block_t* root):
block(root), index(0), uncompressed_block(nullptr), ids(nullptr) {
curr_block(root), curr_index(0), uncompressed_block(nullptr), ids(nullptr) {
}
bool posting_list_t::iterator_t::valid() const {
return (block != nullptr) && (index < block->size());
return (curr_block != nullptr) && (curr_index < curr_block->size());
}
void posting_list_t::iterator_t::next() {
index++;
if(index == block->size()) {
index = 0;
block = block->next;
curr_index++;
if(curr_index == curr_block->size()) {
curr_index = 0;
curr_block = curr_block->next;
}
}
uint32_t posting_list_t::iterator_t::id() {
//return block->ids.at(index);
if(uncompressed_block != curr_block) {
uncompressed_block = curr_block;
if(uncompressed_block != block) {
delete [] ids;
ids = nullptr;
uncompressed_block = block;
if(block != nullptr) {
ids = block->ids.uncompress();
if(curr_block != nullptr) {
ids = curr_block->ids.uncompress();
}
}
return ids[index];
return ids[curr_index];
}
void posting_list_t::iterator_t::offsets(std::vector<uint32_t>& offsets) {
// TODO
uint32_t posting_list_t::iterator_t::index() const {
return curr_index;
}
posting_list_t::block_t* posting_list_t::iterator_t::block() const {
return curr_block;
}
void posting_list_t::iterator_t::skip_to(uint32_t id) {
bool skipped_block = false;
while(block != nullptr && block->ids.last() < id) {
block = block->next;
while(curr_block != nullptr && curr_block->ids.last() < id) {
curr_block = curr_block->next;
skipped_block = true;
}
if(skipped_block) {
index = 0;
curr_index = 0;
}
while(block != nullptr && index < block->size() && this->id() < id) {
index++;
while(curr_block != nullptr && curr_index < curr_block->size() && this->id() < id) {
curr_index++;
}
}
@@ -701,11 +863,11 @@ posting_list_t::iterator_t::~iterator_t() {
}
posting_list_t::iterator_t::iterator_t(iterator_t&& rhs) noexcept {
block = rhs.block;
index = rhs.index;
curr_block = rhs.curr_block;
curr_index = rhs.curr_index;
uncompressed_block = rhs.uncompressed_block;
ids = rhs.ids;
rhs.block = nullptr;
rhs.curr_block = nullptr;
rhs.ids = nullptr;
}

View File

@@ -557,6 +557,86 @@ TEST(PostingListTest, IntersectionBasics) {
ASSERT_EQ(20, result_ids[1]);
}
TEST(PostingListTest, ResultsAndOffsetsBasics) {
// NOTE: due to the way the offsets are parsed, the actual positions are 1 less than the offset values stored
// (to account for the special offset `0`, which indicates the last offset of the document)
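// e.g. the stored offsets {1, 2, 4} below correspond to token positions {0, 1, 3}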
std::vector<uint32_t> offsets1 = {1, 2, 4};
std::vector<uint32_t> offsets2 = {5, 6};
std::vector<uint32_t> offsets3 = {7};
std::vector<posting_list_t*> lists;
// T1: [0, 2] [3, 20]
// T2: [1, 3], [5, 10], [20]
// T3: [2, 3], [5, 7], [20]
// 3: {0, 1, 3} {4, 5} {6}
// 20: {6} {4, 5} {0, 1, 3}
std::vector<token_positions_t> actual_offsets_3 = {
token_positions_t{false, {0, 1, 3}},
token_positions_t{false, {4, 5}},
token_positions_t{false, {6}},
};
std::vector<token_positions_t> actual_offsets_20 = {
token_positions_t{false, {6}},
token_positions_t{false, {4, 5}},
token_positions_t{false, {0, 1, 3}},
};
posting_list_t p1(2);
p1.upsert(0, offsets1);
p1.upsert(2, offsets1);
p1.upsert(3, offsets1);
p1.upsert(20, offsets3);
posting_list_t p2(2);
p2.upsert(1, offsets1);
p2.upsert(3, offsets2);
p2.upsert(5, offsets1);
p2.upsert(10, offsets1);
p2.upsert(20, offsets2);
posting_list_t p3(2);
p3.upsert(2, offsets1);
p3.upsert(3, offsets3);
p3.upsert(5, offsets1);
p3.upsert(7, offsets1);
p3.upsert(20, offsets1);
lists.push_back(&p1);
lists.push_back(&p2);
lists.push_back(&p3);
std::vector<posting_list_t::iterator_t> its;
posting_list_t::result_iter_state_t iter_state;
posting_list_t::block_intersect(lists, 2, its, iter_state);
ASSERT_EQ(2, iter_state.ids.size());
ASSERT_EQ(3, iter_state.ids[0]);
ASSERT_EQ(20, iter_state.ids[1]);
ASSERT_EQ(2, iter_state.blocks.size());
ASSERT_EQ(3, iter_state.blocks[0].size());
ASSERT_EQ(3, iter_state.blocks[1].size());
ASSERT_EQ(2, iter_state.indices.size());
std::vector<std::unordered_map<size_t, std::vector<token_positions_t>>> array_token_positions_vec;
posting_list_t::get_offsets(iter_state, array_token_positions_vec);
ASSERT_EQ(2, array_token_positions_vec.size());
ASSERT_EQ(actual_offsets_3[0].positions, array_token_positions_vec[0].at(0)[0].positions);
ASSERT_EQ(actual_offsets_3[1].positions, array_token_positions_vec[0].at(0)[1].positions);
ASSERT_EQ(actual_offsets_3[2].positions, array_token_positions_vec[0].at(0)[2].positions);
ASSERT_EQ(actual_offsets_20[0].positions, array_token_positions_vec[1].at(0)[0].positions);
ASSERT_EQ(actual_offsets_20[1].positions, array_token_positions_vec[1].at(0)[1].positions);
ASSERT_EQ(actual_offsets_20[2].positions, array_token_positions_vec[1].at(0)[2].positions);
}
TEST(PostingListTest, IntersectionSkipBlocks) {
std::vector<uint32_t> offsets = {0, 1, 3};
std::vector<posting_list_t*> lists;
@@ -876,7 +956,7 @@ TEST(PostingListTest, DISABLED_Benchmark) {
LOG(INFO) << "Time taken for 5 sorted array updates: " << timeMicros;
}
TEST(PostingListTest, DISABLED_BenchmarkIntersection) {
TEST(PostingListTest, BenchmarkIntersection) {
std::vector<uint32_t> offsets = {0, 1, 3};
time_t t;