Fixed an out-of-bounds bug with highlighting.

Kishore Nallan 2017-11-03 21:07:56 +05:30
parent a7479171b1
commit 3907c2d3f9
3 changed files with 48 additions and 16 deletions

View File

@@ -14,6 +14,10 @@
 #define TokenOffsetHeap std::priority_queue<TokenOffset, std::vector<TokenOffset>, TokenOffset>
 
+const size_t WINDOW_SIZE = 10;
+const uint16_t MAX_DISPLACEMENT = std::numeric_limits<uint16_t>::max();
+
 struct TokenOffset {
     uint8_t token_id;         // token identifier
     uint16_t offset;          // token's offset in the text
@@ -63,10 +67,17 @@ struct MatchScore {
     }
 
     static void pack_token_offsets(const uint16_t* min_token_offset, const size_t num_tokens,
-                                   const size_t start_token_index, char *offset_diffs) {
+                                   const uint16_t token_start_offset, char *offset_diffs) {
         offset_diffs[0] = (char) num_tokens;
-        for(size_t i = start_token_index; i < num_tokens; i++) {
-            offset_diffs[1+i] = (int8_t)(min_token_offset[i] - min_token_offset[start_token_index]);
+        size_t j = 1;
+
+        for(size_t i = 0; i < num_tokens; i++) {
+            if(min_token_offset[i] != MAX_DISPLACEMENT) {
+                offset_diffs[j] = (int8_t)(min_token_offset[i] - token_start_offset);
+            } else {
+                offset_diffs[j] = std::numeric_limits<int8_t>::max();
+            }
+            j++;
         }
     }
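In short, pack_token_offsets no longer starts packing at the first present token's array index; it walks every token, stores each offset as a delta from token_start_offset, and writes INT8_MAX as a sentinel for a token whose offset is MAX_DISPLACEMENT (i.e. a token that never made it into the best matching window). A minimal standalone sketch of this scheme, assuming the same semantics as the hunk above (the free-standing pack function and the demo in main are illustrative, not the project's code):

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <limits>

const uint16_t MAX_DISPLACEMENT = std::numeric_limits<uint16_t>::max();

// Pack token offsets as deltas from token_start_offset; a missing token
// (marked MAX_DISPLACEMENT) becomes the INT8_MAX sentinel instead of a
// garbage delta that could later decode to an out-of-range index.
void pack(const uint16_t* min_token_offset, size_t num_tokens,
          uint16_t token_start_offset, char* offset_diffs) {
    offset_diffs[0] = (char) num_tokens;              // first byte holds the token count
    for(size_t i = 0; i < num_tokens; i++) {
        offset_diffs[1 + i] = (min_token_offset[i] == MAX_DISPLACEMENT)
            ? std::numeric_limits<int8_t>::max()      // sentinel: token not present
            : (int8_t)(min_token_offset[i] - token_start_offset);
    }
}

int main() {
    uint16_t offsets[3] = {567, MAX_DISPLACEMENT, 570};
    char offset_diffs[16];
    pack(offsets, 3, 567, offset_diffs);
    printf("%d %d %d %d\n", offset_diffs[0], offset_diffs[1], offset_diffs[2], offset_diffs[3]);
    return 0;
}

Compiled and run, this prints 3 0 127 3: a count byte, a real delta, the sentinel, and a final delta.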
@@ -79,9 +90,6 @@ struct MatchScore {
      * compute the max_match and min_displacement of target tokens across the windows.
      */
     static MatchScore match_score(uint32_t doc_id, std::vector<std::vector<uint16_t>> &token_offsets) {
-        const size_t WINDOW_SIZE = 10;
-        const uint16_t MAX_DISPLACEMENT = std::numeric_limits<uint16_t>::max();
-
         std::priority_queue<TokenOffset, std::vector<TokenOffset>, TokenOffset> heap;
 
         for(uint8_t token_id=0; token_id < token_offsets.size(); token_id++) {
@@ -157,12 +165,11 @@ struct MatchScore {
         // do run-length encoding of the min token positions/offsets
         uint16_t token_start_offset = 0;
-        char offset_diffs[16];
-        std::fill_n(offset_diffs, 16, 0);
-        int token_index = 0;
+        char packed_offset_diffs[16];
+        std::fill_n(packed_offset_diffs, 16, 0);
 
+        // identify the first token which is actually present and use that as the base for run-length encoding
+        int token_index = 0;
         while(token_index < token_offsets.size()) {
             if(min_token_offset[token_index] != MAX_DISPLACEMENT) {
                 token_start_offset = min_token_offset[token_index];
@@ -171,7 +178,7 @@ struct MatchScore {
             token_index++;
         }
 
-        pack_token_offsets(min_token_offset, token_offsets.size(), token_index, offset_diffs);
-        return MatchScore(max_match, min_displacement, token_start_offset, offset_diffs);
+        pack_token_offsets(min_token_offset, token_offsets.size(), token_start_offset, packed_offset_diffs);
+        return MatchScore(max_match, min_displacement, token_start_offset, packed_offset_diffs);
     }
 };

View File

@@ -811,6 +811,11 @@ Option<nlohmann::json> Collection::search(std::string query, const std::vector<s
     const uint32_t filter_ids_length = op_filter_ids_length.get();
 
     // check for valid pagination
+    if(page < 1) {
+        std::string message = "Page must be an integer of value greater than 0.";
+        return Option<nlohmann::json>(422, message);
+    }
+
     if((page * per_page) > MAX_RESULTS) {
         std::string message = "Only the first " + std::to_string(MAX_RESULTS) + " results are available.";
         return Option<nlohmann::json>(422, message);
@@ -889,6 +894,7 @@ Option<nlohmann::json> Collection::search(std::string query, const std::vector<s
         std::vector<std::string> tokens;
         StringUtils::split(document[field_name], tokens, " ");
 
+        // positions in the document of each token in the query
         std::vector<std::vector<uint16_t>> token_positions;
 
         for (const art_leaf *token_leaf : searched_queries[field_order_kv.second.query_index]) {
@@ -917,8 +923,10 @@ Option<nlohmann::json> Collection::search(std::string query, const std::vector<s
         std::vector<size_t> token_indices;
         char num_tokens_found = mscore.offset_diffs[0];
         for(size_t i = 1; i <= num_tokens_found; i++) {
-            size_t token_index = (size_t)(mscore.start_offset + mscore.offset_diffs[i]);
-            token_indices.push_back(token_index);
+            if(mscore.offset_diffs[i] != std::numeric_limits<int8_t>::max()) {
+                size_t token_index = (size_t)(mscore.start_offset + mscore.offset_diffs[i]);
+                token_indices.push_back(token_index);
+            }
         }
 
         auto minmax = std::minmax_element(token_indices.begin(), token_indices.end());
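The decode side in Collection::search now mirrors the sentinel: an offset diff equal to INT8_MAX means the token was not present in the best matching window, so it is skipped instead of being turned into a token index. A hedged sketch of just that decode step, factored into a free function with a hypothetical name (unpack_token_indices does not exist in the codebase):

#include <cstddef>
#include <cstdint>
#include <limits>
#include <vector>

// Rebuild absolute token indices from the packed diffs, skipping the
// INT8_MAX sentinel so a missing token can no longer produce an
// out-of-range index into the document's token list.
std::vector<size_t> unpack_token_indices(uint16_t start_offset, const char* offset_diffs) {
    std::vector<size_t> token_indices;
    size_t num_tokens_found = (size_t) offset_diffs[0];
    for(size_t i = 1; i <= num_tokens_found; i++) {
        if(offset_diffs[i] != std::numeric_limits<int8_t>::max()) {
            token_indices.push_back((size_t)(start_offset + offset_diffs[i]));
        }
    }
    return token_indices;
}

Under the old encoding this case decoded to an arbitrary int8_t, and start_offset plus that garbage delta could index past the end of the token vector, which is the out-of-bounds highlight this commit fixes.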

View File

@@ -5,7 +5,7 @@ TEST(MatchScoreTest, ShouldPackTokenOffsets) {
     uint16_t min_token_offset1[3] = {567, 568, 570};
     char offset_diffs[16];
 
-    MatchScore::pack_token_offsets(min_token_offset1, 3, 0, offset_diffs);
+    MatchScore::pack_token_offsets(min_token_offset1, 3, 567, offset_diffs);
     ASSERT_EQ(3, offset_diffs[0]);
     ASSERT_EQ(0, offset_diffs[1]);
@@ -21,8 +21,25 @@ TEST(MatchScoreTest, ShouldPackTokenOffsets) {
     ASSERT_EQ(2, offset_diffs[3]);
 
     uint16_t min_token_offset3[1] = {123};
-    MatchScore::pack_token_offsets(min_token_offset3, 1, 0, offset_diffs);
+    MatchScore::pack_token_offsets(min_token_offset3, 1, 123, offset_diffs);
 
     ASSERT_EQ(1, offset_diffs[0]);
     ASSERT_EQ(0, offset_diffs[1]);
+
+    // a token might not have an offset because it might not be in the best matching window
+    uint16_t min_token_offset4[3] = {0, MAX_DISPLACEMENT, 2};
+    MatchScore::pack_token_offsets(min_token_offset4, 3, 0, offset_diffs);
+
+    ASSERT_EQ(3, offset_diffs[0]);
+    ASSERT_EQ(0, offset_diffs[1]);
+    ASSERT_EQ(std::numeric_limits<int8_t>::max(), offset_diffs[2]);
+    ASSERT_EQ(2, offset_diffs[3]);
+
+    uint16_t min_token_offset5[3] = {MAX_DISPLACEMENT, 2, 4};
+    MatchScore::pack_token_offsets(min_token_offset5, 3, 2, offset_diffs);
+
+    ASSERT_EQ(3, offset_diffs[0]);
+    ASSERT_EQ(std::numeric_limits<int8_t>::max(), offset_diffs[1]);
+    ASSERT_EQ(0, offset_diffs[2]);
+    ASSERT_EQ(2, offset_diffs[3]);
 }
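The two new test cases cover the sentinel in a middle and a leading position. For intuition on the failure mode being tested, here is a small illustrative program (an assumption about the old behaviour inferred from the diff, not the original code path): casting a MAX_DISPLACEMENT-derived delta to int8_t yields an implementation-defined value, which the highlighter then added to start_offset and used as a token index.

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <limits>

int main() {
    const uint16_t MAX_DISPLACEMENT = std::numeric_limits<uint16_t>::max();
    uint16_t token_start_offset = 567;   // base offset of the first matched token
    // old behaviour: a missing token's MAX_DISPLACEMENT marker was packed as a raw delta
    int8_t bogus_delta = (int8_t)(MAX_DISPLACEMENT - token_start_offset);
    // the delta then decoded to an index far from any real token position
    size_t token_index = (size_t)(token_start_offset + bogus_delta);
    printf("bogus delta = %d, decoded index = %zu\n", (int)bogus_delta, token_index);
    return 0;
}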