From 1d5146f7ff6cf8a61fcf9f24cd7977738fc7059d Mon Sep 17 00:00:00 2001
From: Kishore Nallan <kishore@kishorelive.com>
Date: Fri, 9 Jun 2017 13:22:24 -0500
Subject: [PATCH] Track best-matched token offsets needed for highlighting.

- We store the best matched token offset positions in Topster KV
- Using a union to pack the offset diffs (each token's offset minus the first token's offset) into a single integer
---
 CMakeLists.txt            |  3 +-
 include/collection.h      |  2 +-
 include/match_score.h     | 44 +++++++++++++++++++++------
 include/topster.h         | 10 +++++-
 src/collection.cpp        | 64 +++++++++++++++++----------------------
 test/match_score_test.cpp | 17 +++++++++++
 test/topster_test.cpp     | 26 +++++++++-------
 7 files changed, 106 insertions(+), 60 deletions(-)
 create mode 100644 test/match_score_test.cpp
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1eca6791..07376ac9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -69,7 +69,8 @@ add_executable(typesense-server ${SRC_FILES} src/main/typesense_server.cpp)
 add_executable(search ${SRC_FILES} src/main/main.cpp)
 add_executable(benchmark ${SRC_FILES} src/main/benchmark.cpp)
 add_executable(typesense_test ${SRC_FILES} test/array_test.cpp test/sorted_array_test.cpp test/art_test.cpp
-               test/collection_test.cpp test/collection_manager_test.cpp test/topster_test.cpp)
+               test/collection_test.cpp test/collection_manager_test.cpp
+               test/topster_test.cpp test/match_score_test.cpp)
 
 target_compile_definitions(typesense-server PRIVATE ROOT_DIR="${CMAKE_SOURCE_DIR}/")
 target_compile_definitions(search PRIVATE ROOT_DIR="${CMAKE_SOURCE_DIR}/")
diff --git a/include/collection.h b/include/collection.h
index d3ef4a9c..ea13b0a1 100644
--- a/include/collection.h
+++ b/include/collection.h
@@ -149,7 +149,7 @@ public:
 
     Option<uint32_t> index_in_memory(const nlohmann::json & document, uint32_t seq_id);
 
-    enum {MAX_SEARCH_TOKENS = 20};
+    enum {MAX_SEARCH_TOKENS = 10};
     enum {MAX_RESULTS = 100};
 
     // Using a $ prefix so that these meta keys stay above record entries in a lexicographically ordered KV store
diff --git a/include/match_score.h b/include/match_score.h
index ccbb7eea..d81dbe86 100644
--- a/include/match_score.h
+++ b/include/match_score.h
@@ -14,8 +14,12 @@
 
 #define TokenOffsetHeap std::priority_queue<TokenOffset, std::vector<TokenOffset>, TokenOffset>
 
-struct MatchScore {
-  struct TokenOffset {
+union TokenOffsetDiffs {   // per-token offset diffs, overlaid with an integer view that callers copy around
+    int16_t packed;        // integer view; NOTE(review): 2 bytes wide, so it overlays only part of bytes[] - confirm intended
+    char bytes[16];        // bytes[i-1] = offset of token i minus offset of token 0 (written by pack_token_offsets)
+};
+
+struct TokenOffset {
     uint8_t token_id;         // token identifier
     uint16_t offset;          // token's offset in the text
     uint16_t offset_index;    // index of the offset in the vector
@@ -23,7 +27,13 @@ struct MatchScore {
     bool operator() (const TokenOffset& a, const TokenOffset& b) {
         return a.offset > b.offset;
     }
-  };
+};
+
+struct MatchScore {
+  uint16_t words_present;
+  uint16_t distance;
+  uint16_t start_offset;
+  int16_t offset_diffs_packed;
 
   static void print_token_offsets(std::vector<std::vector<uint16_t>> &token_offsets) {
     for(auto offsets: token_offsets) {
@@ -48,8 +58,12 @@ struct MatchScore {
     }
   }
 
-  uint16_t words_present;
-  uint16_t distance;
+  static void pack_token_offsets(const uint16_t* min_token_offset, const size_t num_tokens,
+                                 TokenOffsetDiffs & offset_diffs) {  // bytes[i-1] = offset[i] - offset[0]
+      std::fill_n(offset_diffs.bytes, sizeof(offset_diffs.bytes), '\0');  // slots not written below must not be garbage
+      for(size_t i = 1; i < num_tokens && i <= sizeof(offset_diffs.bytes); i++)  // clamp: never write past bytes[]
+          offset_diffs.bytes[i-1] = (char)(min_token_offset[i] - min_token_offset[0]);  // diff is narrowed to one char
+  }
 
   /*
   *  Given *sorted offsets* of each target token in a *single* document, generates a score that indicates:
@@ -60,8 +74,8 @@ struct MatchScore {
   *  compute the max_match and min_displacement of target tokens across the windows.
   */
   static MatchScore match_score(uint32_t doc_id, std::vector<std::vector<uint16_t>> &token_offsets) {
-    const size_t WINDOW_SIZE = Collection::MAX_SEARCH_TOKENS;
-    const uint16_t MAX_DISPLACEMENT = Collection::MAX_SEARCH_TOKENS;
+    const size_t WINDOW_SIZE = 10;
+    const uint16_t MAX_DISPLACEMENT = std::numeric_limits<uint16_t>::max();
 
     std::priority_queue<TokenOffset, std::vector<TokenOffset>, TokenOffset> heap;
 
@@ -75,8 +89,11 @@ struct MatchScore {
     uint16_t min_displacement = MAX_DISPLACEMENT;
 
     std::queue<TokenOffset> window;
-    uint16_t token_offset[Collection::MAX_SEARCH_TOKENS] = { };
-    std::fill_n(token_offset, Collection::MAX_SEARCH_TOKENS, MAX_DISPLACEMENT);
+    uint16_t token_offset[WINDOW_SIZE] = { };
+    std::fill_n(token_offset, WINDOW_SIZE, MAX_DISPLACEMENT);
+
+    // used to store token offsets of the best-matched window
+    uint16_t min_token_offset[WINDOW_SIZE];
 
     do {
       if(window.empty()) {
@@ -121,6 +138,8 @@ struct MatchScore {
         max_match = num_match;
         if(displacement != 0 && displacement < min_displacement) {
           min_displacement = displacement;
+          // record the token positions (for highlighting)
+          memcpy(min_token_offset, token_offset, token_offsets.size()*sizeof(uint16_t));
         }
       }
 
@@ -129,6 +148,11 @@ struct MatchScore {
       window.pop();
     } while(!heap.empty());
 
-    return MatchScore{max_match, min_displacement};
+    // do run-length encoding of the min token positions/offsets
+    TokenOffsetDiffs offset_diffs;
+    uint16_t start_offset = min_token_offset[0];
+    pack_token_offsets(min_token_offset, token_offsets.size(), offset_diffs);
+
+    return MatchScore{max_match, min_displacement, start_offset, offset_diffs.packed};
   }
 };
diff --git a/include/topster.h b/include/topster.h
index c6d61eda..a1ec92eb 100644
--- a/include/topster.h
+++ b/include/topster.h
@@ -5,6 +5,7 @@
 #include <cstdio>
 #include <algorithm>
 #include <sparsepp.h>
+#include <match_score.h>
 
 /*
 * Remembers the max-K elements seen so far using a min-heap
@@ -12,6 +13,8 @@
 template <size_t MAX_SIZE=100>
 struct Topster {
     struct KV {
+        uint16_t start_offset;
+        TokenOffsetDiffs offset_diffs;
         uint64_t key;
         uint64_t match_score;
         int64_t primary_attr;
@@ -32,7 +35,8 @@ struct Topster {
         b = c;
     }
 
-    void add(const uint64_t &key, const uint64_t &match_score, const int64_t &primary_attr, const int64_t &secondary_attr){
+    void add(const uint64_t &key, const uint64_t &match_score, const int64_t &primary_attr,
+             const int64_t &secondary_attr, const uint16_t &start_offset, const int16_t &offset_diffs_packed){
         if (size >= MAX_SIZE) {
             if(!is_greater(data[0], match_score, primary_attr, secondary_attr)) {
                 // when incoming value is less than the smallest in the heap, ignore
@@ -51,6 +55,8 @@ struct Topster {
             data[0].match_score = match_score;
             data[0].primary_attr = primary_attr;
             data[0].secondary_attr = secondary_attr;
+            data[0].start_offset = start_offset;
+            data[0].offset_diffs.packed = offset_diffs_packed;
             uint32_t i = 0;
 
             // sift to maintain heap property
@@ -80,6 +86,8 @@ struct Topster {
             data[size].match_score = match_score;
             data[size].primary_attr = primary_attr;
             data[size].secondary_attr = secondary_attr;
+            data[size].start_offset = start_offset;
+            data[size].offset_diffs.packed = offset_diffs_packed;
             size++;
 
             for (uint32_t i = size - 1; i > 0;) {
diff --git a/src/collection.cpp b/src/collection.cpp
index c06fca87..f8814ca1 100644
--- a/src/collection.cpp
+++ b/src/collection.cpp
@@ -862,13 +862,10 @@ void Collection::score_results(const std::vector<sort_field> & sort_fields, cons
     const int max_candidate_rank = 250;
     spp::sparse_hash_map<art_leaf*, uint32_t*> leaf_to_indices;
 
-    if(query_suggestion.size() != 1) {
-        // won't be needing positional ranking when there is only 1 token in the query
-        for (art_leaf *token_leaf : query_suggestion) {
-            uint32_t *indices = new uint32_t[result_size];
-            token_leaf->values->ids.indexOf(result_ids, result_size, indices);
-            leaf_to_indices.emplace(token_leaf, indices);
-        }
+    for (art_leaf *token_leaf : query_suggestion) {
+        uint32_t *indices = new uint32_t[result_size];
+        token_leaf->values->ids.indexOf(result_ids, result_size, indices);
+        leaf_to_indices.emplace(token_leaf, indices);
     }
 
     spp::sparse_hash_map<uint32_t, int64_t> * primary_rank_scores = nullptr;
@@ -897,35 +894,29 @@ void Collection::score_results(const std::vector<sort_field> & sort_fields, cons
         uint32_t seq_id = result_ids[i];
         std::vector<std::vector<uint16_t>> token_positions;
 
-        MatchScore mscore;
-
-        if(query_suggestion.size() == 1) {
-            mscore = MatchScore{1, 1};
-        } else {
-            // for each token in the query, find the positions that it appears in this document
-            for (art_leaf *token_leaf : query_suggestion) {
-                std::vector<uint16_t> positions;
-                int doc_index = leaf_to_indices.at(token_leaf)[i];
-                if(doc_index == token_leaf->values->ids.getLength()) {
-                    continue;
-                }
-
-                uint32_t start_offset = token_leaf->values->offset_index.at(doc_index);
-                uint32_t end_offset = (doc_index == token_leaf->values->ids.getLength() - 1) ?
-                                      token_leaf->values->offsets.getLength() :
-                                      token_leaf->values->offset_index.at(doc_index+1);
-
-                while(start_offset < end_offset) {
-                    positions.push_back((uint16_t) token_leaf->values->offsets.at(start_offset));
-                    start_offset++;
-                }
-
-                token_positions.push_back(positions);
+        // for each token in the query, find the positions that it appears in this document
+        for (art_leaf *token_leaf : query_suggestion) {
+            std::vector<uint16_t> positions;
+            int doc_index = leaf_to_indices.at(token_leaf)[i];
+            if(doc_index == token_leaf->values->ids.getLength()) {
+                continue;
             }
 
-            mscore = MatchScore::match_score(seq_id, token_positions);
+            uint32_t start_offset = token_leaf->values->offset_index.at(doc_index);
+            uint32_t end_offset = (doc_index == token_leaf->values->ids.getLength() - 1) ?
+                                  token_leaf->values->offsets.getLength() :
+                                  token_leaf->values->offset_index.at(doc_index+1);
+
+            while(start_offset < end_offset) {
+                positions.push_back((uint16_t) token_leaf->values->offsets.at(start_offset));
+                start_offset++;
+            }
+
+            token_positions.push_back(positions);
         }
 
+        MatchScore mscore = MatchScore::match_score(seq_id, token_positions);
+
         int candidate_rank_score = max_candidate_rank - candidate_rank;
 
         // Construct a single match_score from individual components (for multi-field sort)
@@ -938,11 +929,12 @@ void Collection::score_results(const std::vector<sort_field> & sort_fields, cons
         int64_t secondary_rank_score = (secondary_rank_scores && secondary_rank_scores->count(seq_id) > 0) ?
                                        secondary_rank_scores->at(seq_id) : 0;
         topster.add(seq_id, match_score,
-                    primary_rank_factor * primary_rank_score,
-                    secondary_rank_factor * secondary_rank_score);
+                    primary_rank_factor * primary_rank_score, secondary_rank_factor * secondary_rank_score,
+                    mscore.start_offset, mscore.offset_diffs_packed);
 
-        /*std::cout << "candidate_rank_score: " << candidate_rank_score << ", match_score: "
-                  << match_score << ", primary_rank_score: " << primary_rank_score << ", seq_id: " << seq_id << std::endl;*/
+        /*std::cout << "candidate_rank_score: " << candidate_rank_score << ", words_present: " << mscore.words_present
+                  << ", match_score: " << match_score << ", primary_rank_score: " << primary_rank_score
+                  << ", seq_id: " << seq_id << std::endl;*/
     }
 
     for (auto it = leaf_to_indices.begin(); it != leaf_to_indices.end(); it++) {
diff --git a/test/match_score_test.cpp b/test/match_score_test.cpp
new file mode 100644
index 00000000..57c1db12
--- /dev/null
+++ b/test/match_score_test.cpp
@@ -0,0 +1,17 @@
+#include <gtest/gtest.h>
+#include <match_score.h>
+
+TEST(MatchScoreTest, ShouldPackTokenOffsets) {
+    // Diffs are taken against the first offset, so a non-zero base must not show up in the packed bytes.
+    const uint16_t offsets_from_567[3] = {567, 568, 570};
+    TokenOffsetDiffs packed_diffs;
+    MatchScore::pack_token_offsets(offsets_from_567, 3, packed_diffs);
+    ASSERT_EQ(1, packed_diffs.bytes[0]);
+    ASSERT_EQ(3, packed_diffs.bytes[1]);
+
+    // Packing a second time into the same union overwrites the earlier diffs.
+    const uint16_t offsets_from_zero[3] = {0, 1, 2};
+    MatchScore::pack_token_offsets(offsets_from_zero, 3, packed_diffs);
+    ASSERT_EQ(1, packed_diffs.bytes[0]);
+    ASSERT_EQ(2, packed_diffs.bytes[1]);
+}
\ No newline at end of file
diff --git a/test/topster_test.cpp b/test/topster_test.cpp
index 582a159e..608c65a5 100644
--- a/test/topster_test.cpp
+++ b/test/topster_test.cpp
@@ -1,29 +1,33 @@
 #include <gtest/gtest.h>
 #include "topster.h"
+#include "match_score.h"
 
 TEST(TopsterTest, StoreMaxValuesWithoutRepetition) {
     Topster<5> topster;
 
     struct {
+        uint16_t start_offset;
+        TokenOffsetDiffs offset_diffs;
         uint64_t key;
         uint64_t match_score;
         int64_t primary_attr;
         int64_t secondary_attr;
     } data[10] = {
-        {1, 10, 20, 30},
-        {2, 4, 20, 30},
-        {3, 7, 20, 30},
-        {4, 11, 20, 30},
-        {5, 9, 20, 30},
-        {6, 6, 20, 30},
-        {7, 6, 22, 30},
-        {8, 9, 20, 30},
-        {9, 8, 20, 30},
-        {10, 5, 20, 30},
+        {10, {.packed = 10 }, 1, 10, 20, 30},
+        {0, {.packed = 10 }, 2, 4, 20, 30},
+        {2, {.packed = 10 }, 3, 7, 20, 30},
+        {11, {.packed = 10 }, 4, 11, 20, 30},
+        {78, {.packed = 10 }, 5, 9, 20, 30},
+        {246, {.packed = 10 }, 6, 6, 20, 30},
+        {0, {.packed = 10 }, 7, 6, 22, 30},
+        {20, {.packed = 10 }, 8, 9, 20, 30},
+        {22, {.packed = 10 }, 9, 8, 20, 30},
+        {77, {.packed = 10 }, 10, 5, 20, 30},
     };
 
     for(int i = 0; i < 10; i++) {
-        topster.add(data[i].key, data[i].match_score, data[i].primary_attr, data[i].secondary_attr);
+        topster.add(data[i].key, data[i].match_score, data[i].primary_attr, data[i].secondary_attr,
+                    data[i].start_offset, data[i].offset_diffs.packed);
     }
 
     topster.sort();