From 3907c2d3f9af23b704b12e5f652fbddbb6b83e87 Mon Sep 17 00:00:00 2001
From: Kishore Nallan <kishore@kishorelive.com>
Date: Fri, 3 Nov 2017 21:07:56 +0530
Subject: [PATCH] Fixed an out-of-bounds bug with highlighting.

---
 include/match_score.h     | 31 +++++++++++++++++++------------
 src/collection.cpp        | 12 ++++++++++--
 test/match_score_test.cpp | 21 +++++++++++++++++++--
 3 files changed, 48 insertions(+), 16 deletions(-)
diff --git a/include/match_score.h b/include/match_score.h
index 3fc1674e..4ecc0594 100644
--- a/include/match_score.h
+++ b/include/match_score.h
@@ -14,6 +14,10 @@
 
 #define TokenOffsetHeap std::priority_queue<TokenOffset, std::vector<TokenOffset>, TokenOffset>
 
+const size_t WINDOW_SIZE = 10;
+const uint16_t MAX_DISPLACEMENT = std::numeric_limits<uint16_t>::max();
+
+
 struct TokenOffset {
     uint8_t token_id;         // token identifier
     uint16_t offset;          // token's offset in the text
@@ -63,10 +67,17 @@ struct MatchScore {
   }
 
   static void pack_token_offsets(const uint16_t* min_token_offset, const size_t num_tokens,
-                                 const size_t start_token_index, char *offset_diffs) {
+                                 const uint16_t token_start_offset, char *offset_diffs) {
       offset_diffs[0] = (char) num_tokens;
-      for(size_t i = start_token_index; i < num_tokens; i++) {
-        offset_diffs[1+i] = (int8_t)(min_token_offset[i] - min_token_offset[start_token_index]);
+      size_t j = 1;
+
+      for(size_t i = 0; i < num_tokens; i++) {
+        if(min_token_offset[i] != MAX_DISPLACEMENT) {
+          offset_diffs[j] = (int8_t)(min_token_offset[i] - token_start_offset);
+        } else {
+          offset_diffs[j] = std::numeric_limits<int8_t>::max();
+        }
+        j++;
       }
   }
 
@@ -79,9 +90,6 @@ struct MatchScore {
   *  compute the max_match and min_displacement of target tokens across the windows.
   */
   static MatchScore match_score(uint32_t doc_id, std::vector<std::vector<uint16_t>> &token_offsets) {
-    const size_t WINDOW_SIZE = 10;
-    const uint16_t MAX_DISPLACEMENT = std::numeric_limits<uint16_t>::max();
-
     std::priority_queue<TokenOffset, std::vector<TokenOffset>, TokenOffset> heap;
 
     for(uint8_t token_id=0; token_id < token_offsets.size(); token_id++) {
@@ -157,12 +165,11 @@ struct MatchScore {
 
     // do run-length encoding of the min token positions/offsets
     uint16_t token_start_offset = 0;
-    char offset_diffs[16];
-    std::fill_n(offset_diffs, 16, 0);
-
-    int token_index = 0;
+    char packed_offset_diffs[16];
+    std::fill_n(packed_offset_diffs, 16, 0);
 
     // identify the first token which is actually present and use that as the base for run-length encoding
+    int token_index = 0;
     while(token_index < token_offsets.size()) {
       if(min_token_offset[token_index] != MAX_DISPLACEMENT) {
         token_start_offset = min_token_offset[token_index];
@@ -171,7 +178,7 @@ struct MatchScore {
       token_index++;
     }
 
-    pack_token_offsets(min_token_offset, token_offsets.size(), token_index, offset_diffs);
-    return MatchScore(max_match, min_displacement, token_start_offset, offset_diffs);
+    pack_token_offsets(min_token_offset, token_offsets.size(), token_start_offset, packed_offset_diffs);
+    return MatchScore(max_match, min_displacement, token_start_offset, packed_offset_diffs);
   }
 };
diff --git a/src/collection.cpp b/src/collection.cpp
index a77e2806..aca6bc9f 100644
--- a/src/collection.cpp
+++ b/src/collection.cpp
@@ -811,6 +811,11 @@ Option<nlohmann::json> Collection::search(std::string query, const std::vector<s
     const uint32_t filter_ids_length = op_filter_ids_length.get();
 
     // check for valid pagination
+    if(page < 1) {
+        std::string message = "Page must be an integer of value greater than 0.";
+        return Option<nlohmann::json>(422, message);
+    }
+
     if((page * per_page) > MAX_RESULTS) {
         std::string message = "Only the first " + std::to_string(MAX_RESULTS) + " results are available.";
         return Option<nlohmann::json>(422, message);
@@ -889,6 +894,7 @@ Option<nlohmann::json> Collection::search(std::string query, const std::vector<s
             std::vector<std::string> tokens;
             StringUtils::split(document[field_name], tokens, " ");
 
+            // positions in the document of each token in the query
             std::vector<std::vector<uint16_t>> token_positions;
 
             for (const art_leaf *token_leaf : searched_queries[field_order_kv.second.query_index]) {
@@ -917,8 +923,10 @@ Option<nlohmann::json> Collection::search(std::string query, const std::vector<s
             std::vector<size_t> token_indices;
             char num_tokens_found = mscore.offset_diffs[0];
             for(size_t i = 1; i <= num_tokens_found; i++) {
-                size_t token_index = (size_t)(mscore.start_offset + mscore.offset_diffs[i]);
-                token_indices.push_back(token_index);
+                if(mscore.offset_diffs[i] != std::numeric_limits<int8_t>::max()) {
+                    size_t token_index = (size_t)(mscore.start_offset + mscore.offset_diffs[i]);
+                    token_indices.push_back(token_index);
+                }
             }
 
             auto minmax = std::minmax_element(token_indices.begin(), token_indices.end());
diff --git a/test/match_score_test.cpp b/test/match_score_test.cpp
index bc527559..7a84f305 100644
--- a/test/match_score_test.cpp
+++ b/test/match_score_test.cpp
@@ -5,7 +5,7 @@ TEST(MatchScoreTest, ShouldPackTokenOffsets) {
     uint16_t min_token_offset1[3] = {567, 568, 570};
     char offset_diffs[16];
 
-    MatchScore::pack_token_offsets(min_token_offset1, 3, 0, offset_diffs);
+    MatchScore::pack_token_offsets(min_token_offset1, 3, 567, offset_diffs);
 
     ASSERT_EQ(3, offset_diffs[0]);
     ASSERT_EQ(0, offset_diffs[1]);
@@ -21,8 +21,25 @@ TEST(MatchScoreTest, ShouldPackTokenOffsets) {
     ASSERT_EQ(2, offset_diffs[3]);
 
     uint16_t min_token_offset3[1] = {123};
-    MatchScore::pack_token_offsets(min_token_offset3, 1, 0, offset_diffs);
+    MatchScore::pack_token_offsets(min_token_offset3, 1, 123, offset_diffs);
 
     ASSERT_EQ(1, offset_diffs[0]);
     ASSERT_EQ(0, offset_diffs[1]);
+
+    // a token might not have an offset because it might not be in the best matching window
+    uint16_t min_token_offset4[3] = {0, MAX_DISPLACEMENT, 2};
+    MatchScore::pack_token_offsets(min_token_offset4, 3, 0, offset_diffs);
+
+    ASSERT_EQ(3, offset_diffs[0]);
+    ASSERT_EQ(0, offset_diffs[1]);
+    ASSERT_EQ(std::numeric_limits<int8_t>::max(), offset_diffs[2]);
+    ASSERT_EQ(2, offset_diffs[3]);
+
+    uint16_t min_token_offset5[3] = {MAX_DISPLACEMENT, 2, 4};
+    MatchScore::pack_token_offsets(min_token_offset5, 3, 2, offset_diffs);
+
+    ASSERT_EQ(3, offset_diffs[0]);
+    ASSERT_EQ(std::numeric_limits<int8_t>::max(), offset_diffs[1]);
+    ASSERT_EQ(0, offset_diffs[2]);
+    ASSERT_EQ(2, offset_diffs[3]);
 }
\ No newline at end of file