Fixed an out-of-bounds bug with highlighting.

Kishore Nallan 2017-11-03 21:07:56 +05:30
parent a7479171b1
commit 3907c2d3f9
3 changed files with 48 additions and 16 deletions

View File

@@ -14,6 +14,10 @@
 #define TokenOffsetHeap std::priority_queue<TokenOffset, std::vector<TokenOffset>, TokenOffset>
 
+const size_t WINDOW_SIZE = 10;
+const uint16_t MAX_DISPLACEMENT = std::numeric_limits<uint16_t>::max();
+
 struct TokenOffset {
     uint8_t token_id;         // token identifier
     uint16_t offset;          // token's offset in the text
@@ -63,10 +67,17 @@ struct MatchScore {
     }
 
     static void pack_token_offsets(const uint16_t* min_token_offset, const size_t num_tokens,
-                                   const size_t start_token_index, char *offset_diffs) {
+                                   const uint16_t token_start_offset, char *offset_diffs) {
         offset_diffs[0] = (char) num_tokens;
-        for(size_t i = start_token_index; i < num_tokens; i++) {
-            offset_diffs[1+i] = (int8_t)(min_token_offset[i] - min_token_offset[start_token_index]);
+        size_t j = 1;
+
+        for(size_t i = 0; i < num_tokens; i++) {
+            if(min_token_offset[i] != MAX_DISPLACEMENT) {
+                offset_diffs[j] = (int8_t)(min_token_offset[i] - token_start_offset);
+            } else {
+                offset_diffs[j] = std::numeric_limits<int8_t>::max();
+            }
+            j++;
         }
     }
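In short, pack_token_offsets no longer starts packing at the first present token's array index; it walks every token, stores each offset as a delta from token_start_offset, and writes INT8_MAX as a sentinel for a token whose offset is MAX_DISPLACEMENT (i.e. a token that never made it into the best matching window). A minimal standalone sketch of this scheme, assuming the same semantics as the hunk above (the free-standing pack function and the demo in main are illustrative, not the project's code):

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <limits>

const uint16_t MAX_DISPLACEMENT = std::numeric_limits<uint16_t>::max();

// Pack token offsets as deltas from token_start_offset; a missing token
// (marked MAX_DISPLACEMENT) becomes the INT8_MAX sentinel instead of a
// garbage delta that could later decode to an out-of-range index.
void pack(const uint16_t* min_token_offset, size_t num_tokens,
          uint16_t token_start_offset, char* offset_diffs) {
    offset_diffs[0] = (char) num_tokens;              // first byte holds the token count
    for(size_t i = 0; i < num_tokens; i++) {
        offset_diffs[1 + i] = (min_token_offset[i] == MAX_DISPLACEMENT)
            ? std::numeric_limits<int8_t>::max()      // sentinel: token not present
            : (int8_t)(min_token_offset[i] - token_start_offset);
    }
}

int main() {
    uint16_t offsets[3] = {567, MAX_DISPLACEMENT, 570};
    char offset_diffs[16];
    pack(offsets, 3, 567, offset_diffs);
    printf("%d %d %d %d\n", offset_diffs[0], offset_diffs[1], offset_diffs[2], offset_diffs[3]);
    return 0;
}

Compiled and run, this prints 3 0 127 3: a count byte, a real delta, the sentinel, and a final delta.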
@@ -79,9 +90,6 @@ struct MatchScore {
      * compute the max_match and min_displacement of target tokens across the windows.
      */
     static MatchScore match_score(uint32_t doc_id, std::vector<std::vector<uint16_t>> &token_offsets) {
-        const size_t WINDOW_SIZE = 10;
-        const uint16_t MAX_DISPLACEMENT = std::numeric_limits<uint16_t>::max();
-
         std::priority_queue<TokenOffset, std::vector<TokenOffset>, TokenOffset> heap;
 
         for(uint8_t token_id=0; token_id < token_offsets.size(); token_id++) {
@@ -157,12 +165,11 @@ struct MatchScore {
         // do run-length encoding of the min token positions/offsets
         uint16_t token_start_offset = 0;
-        char offset_diffs[16];
-        std::fill_n(offset_diffs, 16, 0);
-        int token_index = 0;
+        char packed_offset_diffs[16];
+        std::fill_n(packed_offset_diffs, 16, 0);
 
+        // identify the first token which is actually present and use that as the base for run-length encoding
+        int token_index = 0;
         while(token_index < token_offsets.size()) {
             if(min_token_offset[token_index] != MAX_DISPLACEMENT) {
                 token_start_offset = min_token_offset[token_index];
@@ -171,7 +178,7 @@ struct MatchScore {
             token_index++;
         }
 
-        pack_token_offsets(min_token_offset, token_offsets.size(), token_index, offset_diffs);
-        return MatchScore(max_match, min_displacement, token_start_offset, offset_diffs);
+        pack_token_offsets(min_token_offset, token_offsets.size(), token_start_offset, packed_offset_diffs);
+        return MatchScore(max_match, min_displacement, token_start_offset, packed_offset_diffs);
     }
 };

View File

@@ -811,6 +811,11 @@ Option<nlohmann::json> Collection::search(std::string query, const std::vector<s
     const uint32_t filter_ids_length = op_filter_ids_length.get();
 
     // check for valid pagination
+    if(page < 1) {
+        std::string message = "Page must be an integer of value greater than 0.";
+        return Option<nlohmann::json>(422, message);
+    }
+
     if((page * per_page) > MAX_RESULTS) {
         std::string message = "Only the first " + std::to_string(MAX_RESULTS) + " results are available.";
         return Option<nlohmann::json>(422, message);
@@ -889,6 +894,7 @@ Option<nlohmann::json> Collection::search(std::string query, const std::vector<s
         std::vector<std::string> tokens;
         StringUtils::split(document[field_name], tokens, " ");
 
+        // positions in the document of each token in the query
         std::vector<std::vector<uint16_t>> token_positions;
 
         for (const art_leaf *token_leaf : searched_queries[field_order_kv.second.query_index]) {
@@ -917,8 +923,10 @@ Option<nlohmann::json> Collection::search(std::string query, const std::vector<s
         std::vector<size_t> token_indices;
         char num_tokens_found = mscore.offset_diffs[0];
         for(size_t i = 1; i <= num_tokens_found; i++) {
-            size_t token_index = (size_t)(mscore.start_offset + mscore.offset_diffs[i]);
-            token_indices.push_back(token_index);
+            if(mscore.offset_diffs[i] != std::numeric_limits<int8_t>::max()) {
+                size_t token_index = (size_t)(mscore.start_offset + mscore.offset_diffs[i]);
+                token_indices.push_back(token_index);
+            }
         }
 
         auto minmax = std::minmax_element(token_indices.begin(), token_indices.end());
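The decode side in Collection::search now mirrors the sentinel: an offset diff equal to INT8_MAX means the token was not present in the best matching window, so it is skipped instead of being turned into a token index. A hedged sketch of just that decode step, factored into a free function with a hypothetical name (unpack_token_indices does not exist in the codebase):

#include <cstddef>
#include <cstdint>
#include <limits>
#include <vector>

// Rebuild absolute token indices from the packed diffs, skipping the
// INT8_MAX sentinel so a missing token can no longer produce an
// out-of-range index into the document's token list.
std::vector<size_t> unpack_token_indices(uint16_t start_offset, const char* offset_diffs) {
    std::vector<size_t> token_indices;
    size_t num_tokens_found = (size_t) offset_diffs[0];
    for(size_t i = 1; i <= num_tokens_found; i++) {
        if(offset_diffs[i] != std::numeric_limits<int8_t>::max()) {
            token_indices.push_back((size_t)(start_offset + offset_diffs[i]));
        }
    }
    return token_indices;
}

Under the old encoding this case decoded to an arbitrary int8_t, and start_offset plus that garbage delta could index past the end of the token vector, which is the out-of-bounds highlight this commit fixes.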

View File

@@ -5,7 +5,7 @@ TEST(MatchScoreTest, ShouldPackTokenOffsets) {
     uint16_t min_token_offset1[3] = {567, 568, 570};
     char offset_diffs[16];
 
-    MatchScore::pack_token_offsets(min_token_offset1, 3, 0, offset_diffs);
+    MatchScore::pack_token_offsets(min_token_offset1, 3, 567, offset_diffs);
     ASSERT_EQ(3, offset_diffs[0]);
     ASSERT_EQ(0, offset_diffs[1]);
@@ -21,8 +21,25 @@ TEST(MatchScoreTest, ShouldPackTokenOffsets) {
     ASSERT_EQ(2, offset_diffs[3]);
 
     uint16_t min_token_offset3[1] = {123};
-    MatchScore::pack_token_offsets(min_token_offset3, 1, 0, offset_diffs);
+    MatchScore::pack_token_offsets(min_token_offset3, 1, 123, offset_diffs);
 
     ASSERT_EQ(1, offset_diffs[0]);
     ASSERT_EQ(0, offset_diffs[1]);
+
+    // a token might not have an offset because it might not be in the best matching window
+    uint16_t min_token_offset4[3] = {0, MAX_DISPLACEMENT, 2};
+    MatchScore::pack_token_offsets(min_token_offset4, 3, 0, offset_diffs);
+
+    ASSERT_EQ(3, offset_diffs[0]);
+    ASSERT_EQ(0, offset_diffs[1]);
+    ASSERT_EQ(std::numeric_limits<int8_t>::max(), offset_diffs[2]);
+    ASSERT_EQ(2, offset_diffs[3]);
+
+    uint16_t min_token_offset5[3] = {MAX_DISPLACEMENT, 2, 4};
+    MatchScore::pack_token_offsets(min_token_offset5, 3, 2, offset_diffs);
+
+    ASSERT_EQ(3, offset_diffs[0]);
+    ASSERT_EQ(std::numeric_limits<int8_t>::max(), offset_diffs[1]);
+    ASSERT_EQ(0, offset_diffs[2]);
+    ASSERT_EQ(2, offset_diffs[3]);
 }
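The two new test cases cover the sentinel in a middle and a leading position. For intuition on the failure mode being tested, here is a small illustrative program (an assumption about the old behaviour inferred from the diff, not the original code path): casting a MAX_DISPLACEMENT-derived delta to int8_t yields an implementation-defined value, which the highlighter then added to start_offset and used as a token index.

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <limits>

int main() {
    const uint16_t MAX_DISPLACEMENT = std::numeric_limits<uint16_t>::max();
    uint16_t token_start_offset = 567;   // base offset of the first matched token
    // old behaviour: a missing token's MAX_DISPLACEMENT marker was packed as a raw delta
    int8_t bogus_delta = (int8_t)(MAX_DISPLACEMENT - token_start_offset);
    // the delta then decoded to an index far from any real token position
    size_t token_index = (size_t)(token_start_offset + bogus_delta);
    printf("bogus delta = %d, decoded index = %zu\n", (int)bogus_delta, token_index);
    return 0;
}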