From 3907c2d3f9af23b704b12e5f652fbddbb6b83e87 Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Fri, 3 Nov 2017 21:07:56 +0530 Subject: [PATCH] Fixed an out-of-bounds bug with highlighting. --- include/match_score.h | 31 +++++++++++++++++++------------ src/collection.cpp | 12 ++++++++++-- test/match_score_test.cpp | 21 +++++++++++++++++++-- 3 files changed, 48 insertions(+), 16 deletions(-) diff --git a/include/match_score.h b/include/match_score.h index 3fc1674e..4ecc0594 100644 --- a/include/match_score.h +++ b/include/match_score.h @@ -14,6 +14,10 @@ #define TokenOffsetHeap std::priority_queue, TokenOffset> +const size_t WINDOW_SIZE = 10; +const uint16_t MAX_DISPLACEMENT = std::numeric_limits::max(); + + struct TokenOffset { uint8_t token_id; // token identifier uint16_t offset; // token's offset in the text @@ -63,10 +67,17 @@ struct MatchScore { } static void pack_token_offsets(const uint16_t* min_token_offset, const size_t num_tokens, - const size_t start_token_index, char *offset_diffs) { + const uint16_t token_start_offset, char *offset_diffs) { offset_diffs[0] = (char) num_tokens; - for(size_t i = start_token_index; i < num_tokens; i++) { - offset_diffs[1+i] = (int8_t)(min_token_offset[i] - min_token_offset[start_token_index]); + size_t j = 1; + + for(size_t i = 0; i < num_tokens; i++) { + if(min_token_offset[i] != MAX_DISPLACEMENT) { + offset_diffs[j] = (int8_t)(min_token_offset[i] - token_start_offset); + } else { + offset_diffs[j] = std::numeric_limits::max(); + } + j++; } } @@ -79,9 +90,6 @@ struct MatchScore { * compute the max_match and min_displacement of target tokens across the windows. */ static MatchScore match_score(uint32_t doc_id, std::vector> &token_offsets) { - const size_t WINDOW_SIZE = 10; - const uint16_t MAX_DISPLACEMENT = std::numeric_limits::max(); - std::priority_queue, TokenOffset> heap; for(uint8_t token_id=0; token_id < token_offsets.size(); token_id++) { @@ -157,12 +165,11 @@ struct MatchScore { // do run-length encoding of the min token positions/offsets uint16_t token_start_offset = 0; - char offset_diffs[16]; - std::fill_n(offset_diffs, 16, 0); - - int token_index = 0; + char packed_offset_diffs[16]; + std::fill_n(packed_offset_diffs, 16, 0); // identify the first token which is actually present and use that as the base for run-length encoding + int token_index = 0; while(token_index < token_offsets.size()) { if(min_token_offset[token_index] != MAX_DISPLACEMENT) { token_start_offset = min_token_offset[token_index]; @@ -171,7 +178,7 @@ struct MatchScore { token_index++; } - pack_token_offsets(min_token_offset, token_offsets.size(), token_index, offset_diffs); - return MatchScore(max_match, min_displacement, token_start_offset, offset_diffs); + pack_token_offsets(min_token_offset, token_offsets.size(), token_start_offset, packed_offset_diffs); + return MatchScore(max_match, min_displacement, token_start_offset, packed_offset_diffs); } }; diff --git a/src/collection.cpp b/src/collection.cpp index a77e2806..aca6bc9f 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -811,6 +811,11 @@ Option Collection::search(std::string query, const std::vector(422, message); + } + if((page * per_page) > MAX_RESULTS) { std::string message = "Only the first " + std::to_string(MAX_RESULTS) + " results are available."; return Option(422, message); @@ -889,6 +894,7 @@ Option Collection::search(std::string query, const std::vector tokens; StringUtils::split(document[field_name], tokens, " "); + // positions in the document of each token in the query std::vector> token_positions; for (const art_leaf *token_leaf : searched_queries[field_order_kv.second.query_index]) { @@ -917,8 +923,10 @@ Option Collection::search(std::string query, const std::vector token_indices; char num_tokens_found = mscore.offset_diffs[0]; for(size_t i = 1; i <= num_tokens_found; i++) { - size_t token_index = (size_t)(mscore.start_offset + mscore.offset_diffs[i]); - token_indices.push_back(token_index); + if(mscore.offset_diffs[i] != std::numeric_limits::max()) { + size_t token_index = (size_t)(mscore.start_offset + mscore.offset_diffs[i]); + token_indices.push_back(token_index); + } } auto minmax = std::minmax_element(token_indices.begin(), token_indices.end()); diff --git a/test/match_score_test.cpp b/test/match_score_test.cpp index bc527559..7a84f305 100644 --- a/test/match_score_test.cpp +++ b/test/match_score_test.cpp @@ -5,7 +5,7 @@ TEST(MatchScoreTest, ShouldPackTokenOffsets) { uint16_t min_token_offset1[3] = {567, 568, 570}; char offset_diffs[16]; - MatchScore::pack_token_offsets(min_token_offset1, 3, 0, offset_diffs); + MatchScore::pack_token_offsets(min_token_offset1, 3, 567, offset_diffs); ASSERT_EQ(3, offset_diffs[0]); ASSERT_EQ(0, offset_diffs[1]); @@ -21,8 +21,25 @@ TEST(MatchScoreTest, ShouldPackTokenOffsets) { ASSERT_EQ(2, offset_diffs[3]); uint16_t min_token_offset3[1] = {123}; - MatchScore::pack_token_offsets(min_token_offset3, 1, 0, offset_diffs); + MatchScore::pack_token_offsets(min_token_offset3, 1, 123, offset_diffs); ASSERT_EQ(1, offset_diffs[0]); ASSERT_EQ(0, offset_diffs[1]); + + // a token might not have an offset because it might not be in the best matching window + uint16_t min_token_offset4[3] = {0, MAX_DISPLACEMENT, 2}; + MatchScore::pack_token_offsets(min_token_offset4, 3, 0, offset_diffs); + + ASSERT_EQ(3, offset_diffs[0]); + ASSERT_EQ(0, offset_diffs[1]); + ASSERT_EQ(std::numeric_limits::max(), offset_diffs[2]); + ASSERT_EQ(2, offset_diffs[3]); + + uint16_t min_token_offset5[3] = {MAX_DISPLACEMENT, 2, 4}; + MatchScore::pack_token_offsets(min_token_offset5, 3, 2, offset_diffs); + + ASSERT_EQ(3, offset_diffs[0]); + ASSERT_EQ(std::numeric_limits::max(), offset_diffs[1]); + ASSERT_EQ(0, offset_diffs[2]); + ASSERT_EQ(2, offset_diffs[3]); } \ No newline at end of file