Mirror of https://github.com/typesense/typesense.git
Track best-matched token offsets needed for highlighting.
- We store the best matched token offset positions in Topster KV
- Using run-length encoding (via unions) to pack the offset diffs intelligently
parent 24711d3c5c
commit 1d5146f7ff
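Before the per-file hunks, here is a minimal standalone sketch of the packing scheme this commit introduces (identifiers mirror the diff below; the sample values are illustrative, not from the commit). Only the first matched offset travels as start_offset, and each later token is reduced to a one-byte delta from it, which the union exposes back as a single int16_t for compact storage in the Topster KV. Two caveats: strictly speaking this is delta encoding rather than run-length encoding, and reading packed after writing bytes relies on union type-punning, which compilers accept in practice but C++ technically leaves undefined.

#include <cstdint>
#include <cstdio>

// Mirrors the union added in match_score.h below.
union TokenOffsetDiffs {
    int16_t packed;   // the two bytes that actually travel inside Topster's KV
    char bytes[16];   // per-token deltas from the first matched offset
};

int main() {
    // offsets of 3 query tokens in the best-matched window (sample values)
    uint16_t min_token_offset[3] = {567, 568, 570};

    TokenOffsetDiffs diffs;
    for (int i = 1; i < 3; i++) {
        // each later token becomes a byte-sized delta from token 0
        diffs.bytes[i - 1] = (char)(min_token_offset[i] - min_token_offset[0]);
    }

    // highlighting later only needs min_token_offset[0] plus these deltas
    printf("deltas: %d %d\n", diffs.bytes[0], diffs.bytes[1]);  // prints: deltas: 1 3
    return 0;
}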
CMakeLists.txt
@@ -69,7 +69,8 @@ add_executable(typesense-server ${SRC_FILES} src/main/typesense_server.cpp)
 add_executable(search ${SRC_FILES} src/main/main.cpp)
 add_executable(benchmark ${SRC_FILES} src/main/benchmark.cpp)
 add_executable(typesense_test ${SRC_FILES} test/array_test.cpp test/sorted_array_test.cpp test/art_test.cpp
-        test/collection_test.cpp test/collection_manager_test.cpp test/topster_test.cpp)
+        test/collection_test.cpp test/collection_manager_test.cpp
+        test/topster_test.cpp test/match_score_test.cpp)
 
 target_compile_definitions(typesense-server PRIVATE ROOT_DIR="${CMAKE_SOURCE_DIR}/")
 target_compile_definitions(search PRIVATE ROOT_DIR="${CMAKE_SOURCE_DIR}/")
include/collection.h
@@ -149,7 +149,7 @@ public:
 
     Option<uint32_t> index_in_memory(const nlohmann::json & document, uint32_t seq_id);
 
-    enum {MAX_SEARCH_TOKENS = 20};
+    enum {MAX_SEARCH_TOKENS = 10};
     enum {MAX_RESULTS = 100};
 
     // Using a $ prefix so that these meta keys stay above record entries in a lexicographically ordered KV store
include/match_score.h
@@ -14,8 +14,12 @@
 
 #define TokenOffsetHeap std::priority_queue<TokenOffset, std::vector<TokenOffset>, TokenOffset>
 
-struct MatchScore {
-  struct TokenOffset {
+union TokenOffsetDiffs {
+    int16_t packed;
+    char bytes[16];
+};
+
+struct TokenOffset {
     uint8_t token_id;         // token identifier
     uint16_t offset;          // token's offset in the text
     uint16_t offset_index;    // index of the offset in the vector
@@ -23,7 +27,13 @@ struct MatchScore {
     bool operator() (const TokenOffset& a, const TokenOffset& b) {
         return a.offset > b.offset;
     }
-  };
+};
+
+struct MatchScore {
+    uint16_t words_present;
+    uint16_t distance;
+    uint16_t start_offset;
+    int16_t offset_diffs_packed;
 
     static void print_token_offsets(std::vector<std::vector<uint16_t>> &token_offsets) {
         for(auto offsets: token_offsets) {
@@ -48,8 +58,12 @@ struct MatchScore {
         }
     }
 
-    uint16_t words_present;
-    uint16_t distance;
+    static void pack_token_offsets(const uint16_t* min_token_offset, const size_t num_tokens,
+                                   TokenOffsetDiffs & offset_diffs) {
+        for(size_t i = 1; i < num_tokens; i++) {
+            offset_diffs.bytes[i-1] = (char)(min_token_offset[i] - min_token_offset[0]);
+        }
+    }
 
     /*
     * Given *sorted offsets* of each target token in a *single* document, generates a score that indicates:
@@ -60,8 +74,8 @@ struct MatchScore {
     * compute the max_match and min_displacement of target tokens across the windows.
     */
    static MatchScore match_score(uint32_t doc_id, std::vector<std::vector<uint16_t>> &token_offsets) {
-        const size_t WINDOW_SIZE = Collection::MAX_SEARCH_TOKENS;
-        const uint16_t MAX_DISPLACEMENT = Collection::MAX_SEARCH_TOKENS;
+        const size_t WINDOW_SIZE = 10;
+        const uint16_t MAX_DISPLACEMENT = std::numeric_limits<uint16_t>::max();
 
        std::priority_queue<TokenOffset, std::vector<TokenOffset>, TokenOffset> heap;
 
@@ -75,8 +89,11 @@ struct MatchScore {
        uint16_t min_displacement = MAX_DISPLACEMENT;
 
        std::queue<TokenOffset> window;
-        uint16_t token_offset[Collection::MAX_SEARCH_TOKENS] = { };
-        std::fill_n(token_offset, Collection::MAX_SEARCH_TOKENS, MAX_DISPLACEMENT);
+        uint16_t token_offset[WINDOW_SIZE] = { };
+        std::fill_n(token_offset, WINDOW_SIZE, MAX_DISPLACEMENT);
+
+        // used to store token offsets of the best-matched window
+        uint16_t min_token_offset[WINDOW_SIZE];
 
        do {
            if(window.empty()) {
@@ -121,6 +138,8 @@ struct MatchScore {
                max_match = num_match;
                if(displacement != 0 && displacement < min_displacement) {
                    min_displacement = displacement;
+                    // record the token positions (for highlighting)
+                    memcpy(min_token_offset, token_offset, token_offsets.size()*sizeof(uint16_t));
                }
            }
 
@@ -129,6 +148,11 @@ struct MatchScore {
            window.pop();
        } while(!heap.empty());
 
-        return MatchScore{max_match, min_displacement};
+        // do run-length encoding of the min token positions/offsets
+        TokenOffsetDiffs offset_diffs;
+        uint16_t start_offset = min_token_offset[0];
+        pack_token_offsets(min_token_offset, token_offsets.size(), offset_diffs);
+
+        return MatchScore{max_match, min_displacement, start_offset, offset_diffs.packed};
    }
};
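Two things are worth noting in the hunks above: WINDOW_SIZE and MAX_DISPLACEMENT no longer reference Collection::MAX_SEARCH_TOKENS, which decouples this header from collection.h (whose constant drops from 20 to 10 in the earlier hunk), and match_score() now returns the best window's start_offset plus packed deltas. A hypothetical caller, not part of the commit, could recover absolute positions for highlighting as sketched below. Note that only the first two delta bytes survive the round trip through the int16_t packed member, so this fully reconstructs at most three token positions.

#include <cstdint>
#include <vector>
#include <match_score.h>  // the header as patched above

int main() {
    // sorted positions of each query token within one document (sample data)
    std::vector<std::vector<uint16_t>> token_offsets = {
        {10, 41, 80},   // positions of token 0
        {11, 97}        // positions of token 1
    };

    MatchScore ms = MatchScore::match_score(0 /*doc_id*/, token_offsets);

    // re-derive absolute positions of the best-matched window from the start
    // offset plus packed deltas; for this input the adjacent pair 10/11 should win
    TokenOffsetDiffs diffs;
    diffs.packed = ms.offset_diffs_packed;
    uint16_t token0_pos = ms.start_offset;                   // expected: 10
    uint16_t token1_pos = ms.start_offset + diffs.bytes[0];  // expected: 10 + 1 == 11
    (void) token0_pos; (void) token1_pos;
    return 0;
}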
include/topster.h
@@ -5,6 +5,7 @@
 #include <cstdio>
 #include <algorithm>
 #include <sparsepp.h>
+#include <match_score.h>
 
 /*
 * Remembers the max-K elements seen so far using a min-heap
@@ -12,6 +13,8 @@
 template <size_t MAX_SIZE=100>
 struct Topster {
     struct KV {
+        uint16_t start_offset;
+        TokenOffsetDiffs offset_diffs;
         uint64_t key;
         uint64_t match_score;
         int64_t primary_attr;
@@ -32,7 +35,8 @@ struct Topster {
         b = c;
     }
 
-    void add(const uint64_t &key, const uint64_t &match_score, const int64_t &primary_attr, const int64_t &secondary_attr){
+    void add(const uint64_t &key, const uint64_t &match_score, const int64_t &primary_attr,
+             const int64_t &secondary_attr, const uint16_t &start_offset, const int16_t &offset_diffs_packed){
         if (size >= MAX_SIZE) {
             if(!is_greater(data[0], match_score, primary_attr, secondary_attr)) {
                 // when incoming value is less than the smallest in the heap, ignore
@@ -51,6 +55,8 @@ struct Topster {
             data[0].match_score = match_score;
             data[0].primary_attr = primary_attr;
             data[0].secondary_attr = secondary_attr;
+            data[0].start_offset = start_offset;
+            data[0].offset_diffs.packed = offset_diffs_packed;
             uint32_t i = 0;
 
             // sift to maintain heap property
@@ -80,6 +86,8 @@ struct Topster {
         data[size].match_score = match_score;
         data[size].primary_attr = primary_attr;
         data[size].secondary_attr = secondary_attr;
+        data[size].start_offset = start_offset;
+        data[size].offset_diffs.packed = offset_diffs_packed;
         size++;
 
         for (uint32_t i = size - 1; i > 0;) {
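A short sketch of the widened add() call, matching the updated test at the bottom of this page. The values are illustrative only, and whether 0x0201 maps to byte deltas {1, 2} depends on the machine being little-endian.

#include "topster.h"   // as patched above; pulls in match_score.h for TokenOffsetDiffs

int main() {
    Topster<100> topster;

    // key, match_score, primary/secondary sort attrs, plus the two new
    // highlighting fields that now ride along in each KV entry
    uint16_t start_offset = 42;
    int16_t offset_diffs_packed = 0x0201;   // bytes {1, 2} on a little-endian machine

    topster.add(7 /*seq_id*/, 10 /*match_score*/, 20, 30,
                start_offset, offset_diffs_packed);
    topster.sort();
    return 0;
}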
src/collection.cpp
@@ -862,13 +862,10 @@ void Collection::score_results(const std::vector<sort_field> & sort_fields, cons
     const int max_candidate_rank = 250;
     spp::sparse_hash_map<art_leaf*, uint32_t*> leaf_to_indices;
 
-    if(query_suggestion.size() != 1) {
-        // won't be needing positional ranking when there is only 1 token in the query
-        for (art_leaf *token_leaf : query_suggestion) {
-            uint32_t *indices = new uint32_t[result_size];
-            token_leaf->values->ids.indexOf(result_ids, result_size, indices);
-            leaf_to_indices.emplace(token_leaf, indices);
-        }
+    for (art_leaf *token_leaf : query_suggestion) {
+        uint32_t *indices = new uint32_t[result_size];
+        token_leaf->values->ids.indexOf(result_ids, result_size, indices);
+        leaf_to_indices.emplace(token_leaf, indices);
     }
 
     spp::sparse_hash_map<uint32_t, int64_t> * primary_rank_scores = nullptr;
@@ -897,35 +894,29 @@ void Collection::score_results(const std::vector<sort_field> & sort_fields, cons
         uint32_t seq_id = result_ids[i];
         std::vector<std::vector<uint16_t>> token_positions;
 
-        MatchScore mscore;
-
-        if(query_suggestion.size() == 1) {
-            mscore = MatchScore{1, 1};
-        } else {
-            // for each token in the query, find the positions that it appears in this document
-            for (art_leaf *token_leaf : query_suggestion) {
-                std::vector<uint16_t> positions;
-                int doc_index = leaf_to_indices.at(token_leaf)[i];
-                if(doc_index == token_leaf->values->ids.getLength()) {
-                    continue;
-                }
-
-                uint32_t start_offset = token_leaf->values->offset_index.at(doc_index);
-                uint32_t end_offset = (doc_index == token_leaf->values->ids.getLength() - 1) ?
-                                      token_leaf->values->offsets.getLength() :
-                                      token_leaf->values->offset_index.at(doc_index+1);
-
-                while(start_offset < end_offset) {
-                    positions.push_back((uint16_t) token_leaf->values->offsets.at(start_offset));
-                    start_offset++;
-                }
-
-                token_positions.push_back(positions);
-            }
-
-            mscore = MatchScore::match_score(seq_id, token_positions);
-        }
+        // for each token in the query, find the positions that it appears in this document
+        for (art_leaf *token_leaf : query_suggestion) {
+            std::vector<uint16_t> positions;
+            int doc_index = leaf_to_indices.at(token_leaf)[i];
+            if(doc_index == token_leaf->values->ids.getLength()) {
+                continue;
+            }
+
+            uint32_t start_offset = token_leaf->values->offset_index.at(doc_index);
+            uint32_t end_offset = (doc_index == token_leaf->values->ids.getLength() - 1) ?
+                                  token_leaf->values->offsets.getLength() :
+                                  token_leaf->values->offset_index.at(doc_index+1);
+
+            while(start_offset < end_offset) {
+                positions.push_back((uint16_t) token_leaf->values->offsets.at(start_offset));
+                start_offset++;
+            }
+
+            token_positions.push_back(positions);
+        }
+
+        MatchScore mscore = MatchScore::match_score(seq_id, token_positions);
 
         int candidate_rank_score = max_candidate_rank - candidate_rank;
 
         // Construct a single match_score from individual components (for multi-field sort)
@@ -938,11 +929,12 @@ void Collection::score_results(const std::vector<sort_field> & sort_fields, cons
         int64_t secondary_rank_score = (secondary_rank_scores && secondary_rank_scores->count(seq_id) > 0) ?
                                        secondary_rank_scores->at(seq_id) : 0;
         topster.add(seq_id, match_score,
-                    primary_rank_factor * primary_rank_score,
-                    secondary_rank_factor * secondary_rank_score);
+                    primary_rank_factor * primary_rank_score, secondary_rank_factor * secondary_rank_score,
+                    mscore.start_offset, mscore.offset_diffs_packed);
 
-        /*std::cout << "candidate_rank_score: " << candidate_rank_score << ", match_score: "
-                     << match_score << ", primary_rank_score: " << primary_rank_score << ", seq_id: " << seq_id << std::endl;*/
+        /*std::cout << "candidate_rank_score: " << candidate_rank_score << ", words_present: " << mscore.words_present
+                     << ", match_score: " << match_score << ", primary_rank_score: " << primary_rank_score
+                     << ", seq_id: " << seq_id << std::endl;*/
     }
 
     for (auto it = leaf_to_indices.begin(); it != leaf_to_indices.end(); it++) {
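The consuming side, reading these fields back out of the Topster KV to highlight matched tokens, is not part of this commit. A hypothetical helper would reverse pack_token_offsets, keeping in mind that only bytes[0] and bytes[1] are recoverable from the int16_t:

#include <cstddef>
#include <cstdint>
#include <vector>
#include <match_score.h>  // for TokenOffsetDiffs, as patched above

// Hypothetical helper (not in this commit): rebuild absolute token offsets
// from a KV entry's start_offset and packed deltas, for highlighting.
std::vector<uint16_t> unpack_offsets(uint16_t start_offset, int16_t offset_diffs_packed,
                                     size_t num_tokens) {
    TokenOffsetDiffs diffs;
    diffs.packed = offset_diffs_packed;   // restores bytes[0] and bytes[1] only

    std::vector<uint16_t> offsets = {start_offset};
    for (size_t i = 1; i < num_tokens && i <= 2; i++) {
        offsets.push_back((uint16_t)(start_offset + diffs.bytes[i - 1]));
    }
    return offsets;
}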
test/match_score_test.cpp (new file)
@@ -0,0 +1,17 @@
+#include <gtest/gtest.h>
+#include <match_score.h>
+
+TEST(MatchScoreTest, ShouldPackTokenOffsets) {
+    uint16_t min_token_offset1[3] = {567, 568, 570};
+    TokenOffsetDiffs offset_diffs;
+    MatchScore::pack_token_offsets(min_token_offset1, 3, offset_diffs);
+
+    ASSERT_EQ(1, offset_diffs.bytes[0]);
+    ASSERT_EQ(3, offset_diffs.bytes[1]);
+
+    uint16_t min_token_offset2[3] = {0, 1, 2};
+    MatchScore::pack_token_offsets(min_token_offset2, 3, offset_diffs);
+
+    ASSERT_EQ(1, offset_diffs.bytes[0]);
+    ASSERT_EQ(2, offset_diffs.bytes[1]);
+}
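(Checking the arithmetic: the first case packs deltas 568 - 567 = 1 and 570 - 567 = 3; the second packs 1 - 0 = 1 and 2 - 0 = 2, matching the assertions.)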
test/topster_test.cpp
@@ -1,29 +1,33 @@
 #include <gtest/gtest.h>
 #include "topster.h"
+#include "match_score.h"
 
 TEST(TopsterTest, StoreMaxValuesWithoutRepetition) {
     Topster<5> topster;
 
     struct {
+        uint16_t start_offset;
+        TokenOffsetDiffs offset_diffs;
         uint64_t key;
         uint64_t match_score;
         int64_t primary_attr;
         int64_t secondary_attr;
     } data[10] = {
-        {1, 10, 20, 30},
-        {2, 4, 20, 30},
-        {3, 7, 20, 30},
-        {4, 11, 20, 30},
-        {5, 9, 20, 30},
-        {6, 6, 20, 30},
-        {7, 6, 22, 30},
-        {8, 9, 20, 30},
-        {9, 8, 20, 30},
-        {10, 5, 20, 30},
+        {10, {.packed = 10 }, 1, 10, 20, 30},
+        {0, {.packed = 10 }, 2, 4, 20, 30},
+        {2, {.packed = 10 }, 3, 7, 20, 30},
+        {11, {.packed = 10 }, 4, 11, 20, 30},
+        {78, {.packed = 10 }, 5, 9, 20, 30},
+        {246, {.packed = 10 }, 6, 6, 20, 30},
+        {0, {.packed = 10 }, 7, 6, 22, 30},
+        {20, {.packed = 10 }, 8, 9, 20, 30},
+        {22, {.packed = 10 }, 9, 8, 20, 30},
+        {77, {.packed = 10 }, 10, 5, 20, 30},
     };
 
     for(int i = 0; i < 10; i++) {
-        topster.add(data[i].key, data[i].match_score, data[i].primary_attr, data[i].secondary_attr);
+        topster.add(data[i].key, data[i].match_score, data[i].primary_attr, data[i].secondary_attr,
+                    data[i].start_offset, data[i].offset_diffs.packed);
     }
 
     topster.sort();