Mirror of https://github.com/typesense/typesense.git

Fixed an out-of-bounds bug with highlighting.

commit 3907c2d3f9
parent a7479171b1
@@ -14,6 +14,10 @@
 
 #define TokenOffsetHeap std::priority_queue<TokenOffset, std::vector<TokenOffset>, TokenOffset>
 
+const size_t WINDOW_SIZE = 10;
+const uint16_t MAX_DISPLACEMENT = std::numeric_limits<uint16_t>::max();
+
+
 struct TokenOffset {
     uint8_t token_id;         // token identifier
     uint16_t offset;          // token's offset in the text
@@ -63,10 +67,17 @@ struct MatchScore {
     }
 
     static void pack_token_offsets(const uint16_t* min_token_offset, const size_t num_tokens,
-                                   const size_t start_token_index, char *offset_diffs) {
+                                   const uint16_t token_start_offset, char *offset_diffs) {
         offset_diffs[0] = (char) num_tokens;
-        for(size_t i = start_token_index; i < num_tokens; i++) {
-            offset_diffs[1+i] = (int8_t)(min_token_offset[i] - min_token_offset[start_token_index]);
+        size_t j = 1;
+
+        for(size_t i = 0; i < num_tokens; i++) {
+            if(min_token_offset[i] != MAX_DISPLACEMENT) {
+                offset_diffs[j] = (int8_t)(min_token_offset[i] - token_start_offset);
+            } else {
+                offset_diffs[j] = std::numeric_limits<int8_t>::max();
+            }
+            j++;
         }
     }
 
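Aside: the layout that pack_token_offsets writes is one count byte followed by one signed delta per query token, with std::numeric_limits<int8_t>::max() standing in for tokens that never made it into the best matching window. A minimal standalone sketch of the format (the main() harness and the sample values are illustrative, not from this commit):

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>
    #include <limits>

    const uint16_t MAX_DISPLACEMENT = std::numeric_limits<uint16_t>::max();

    // Same packing scheme as the diff: byte 0 = token count, bytes 1..n =
    // per-token deltas from token_start_offset, int8_t max = "token absent".
    static void pack_token_offsets(const uint16_t* min_token_offset, const size_t num_tokens,
                                   const uint16_t token_start_offset, char *offset_diffs) {
        offset_diffs[0] = (char) num_tokens;
        size_t j = 1;

        for(size_t i = 0; i < num_tokens; i++) {
            if(min_token_offset[i] != MAX_DISPLACEMENT) {
                offset_diffs[j] = (int8_t)(min_token_offset[i] - token_start_offset);
            } else {
                offset_diffs[j] = std::numeric_limits<int8_t>::max();
            }
            j++;
        }
    }

    int main() {
        uint16_t offsets[3] = {567, MAX_DISPLACEMENT, 570};
        char packed[16] = {};
        pack_token_offsets(offsets, 3, 567, packed);

        // prints: 3 0 127 3  (count, delta, sentinel, delta)
        for(int i = 0; i <= packed[0]; i++) {
            printf("%d ", (int) packed[i]);
        }
        printf("\n");
        return 0;
    }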
@@ -79,9 +90,6 @@ struct MatchScore {
      * compute the max_match and min_displacement of target tokens across the windows.
      */
     static MatchScore match_score(uint32_t doc_id, std::vector<std::vector<uint16_t>> &token_offsets) {
-        const size_t WINDOW_SIZE = 10;
-        const uint16_t MAX_DISPLACEMENT = std::numeric_limits<uint16_t>::max();
-
         std::priority_queue<TokenOffset, std::vector<TokenOffset>, TokenOffset> heap;
 
         for(uint8_t token_id=0; token_id < token_offsets.size(); token_id++) {
@@ -157,12 +165,11 @@ struct MatchScore {
 
         // do run-length encoding of the min token positions/offsets
         uint16_t token_start_offset = 0;
-        char offset_diffs[16];
-        std::fill_n(offset_diffs, 16, 0);
-
-        int token_index = 0;
+        char packed_offset_diffs[16];
+        std::fill_n(packed_offset_diffs, 16, 0);
 
+        // identify the first token which is actually present and use that as the base for run-length encoding
+        int token_index = 0;
         while(token_index < token_offsets.size()) {
             if(min_token_offset[token_index] != MAX_DISPLACEMENT) {
                 token_start_offset = min_token_offset[token_index];
@@ -171,7 +178,7 @@ struct MatchScore {
             token_index++;
         }
 
-        pack_token_offsets(min_token_offset, token_offsets.size(), token_index, offset_diffs);
-        return MatchScore(max_match, min_displacement, token_start_offset, offset_diffs);
+        pack_token_offsets(min_token_offset, token_offsets.size(), token_start_offset, packed_offset_diffs);
+        return MatchScore(max_match, min_displacement, token_start_offset, packed_offset_diffs);
     }
 };
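Worked example of why this call site had to change: the old code passed token_index (an array index) where pack_token_offsets expects a text offset, so any match whose first present query token sat past position zero produced garbage deltas. With min_token_offset = {MAX_DISPLACEMENT, 2, 4}, the base is 2 and packing yields {3, 127, 0, 2} — exactly the fifth assertion block in the test changes below. A sketch of the base-selection step (the helper name is mine; the logic is the diff's):

    #include <cstddef>
    #include <cstdint>
    #include <limits>

    const uint16_t MAX_DISPLACEMENT = std::numeric_limits<uint16_t>::max();

    // Scan for the first token that actually has an offset in the best window
    // and use that offset (not its index) as the run-length encoding base.
    static uint16_t first_present_offset(const uint16_t* min_token_offset, const size_t num_tokens) {
        for(size_t i = 0; i < num_tokens; i++) {
            if(min_token_offset[i] != MAX_DISPLACEMENT) {
                return min_token_offset[i];
            }
        }
        return 0;  // no token present; matches the diff's zero initialization
    }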
@@ -811,6 +811,11 @@ Option<nlohmann::json> Collection::search(std::string query, const std::vector<s
     const uint32_t filter_ids_length = op_filter_ids_length.get();
 
+    // check for valid pagination
+    if(page < 1) {
+        std::string message = "Page must be an integer of value greater than 0.";
+        return Option<nlohmann::json>(422, message);
+    }
+
     if((page * per_page) > MAX_RESULTS) {
         std::string message = "Only the first " + std::to_string(MAX_RESULTS) + " results are available.";
         return Option<nlohmann::json>(422, message);
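Both guards return HTTP 422 and together bound the deepest record a query can reach: page must be at least 1, and page * per_page must stay within MAX_RESULTS. A standalone sketch of the same checks (the function name and the MAX_RESULTS value of 500 are assumptions for illustration, not taken from this commit):

    #include <cstddef>
    #include <string>

    const size_t MAX_RESULTS = 500;  // illustrative value only

    // Returns an empty string when pagination is valid, otherwise the error
    // message that would accompany the 422 response.
    std::string validate_pagination(const size_t page, const size_t per_page) {
        if(page < 1) {
            return "Page must be an integer of value greater than 0.";
        }
        if((page * per_page) > MAX_RESULTS) {
            return "Only the first " + std::to_string(MAX_RESULTS) + " results are available.";
        }
        return "";
    }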
@@ -889,6 +894,7 @@ Option<nlohmann::json> Collection::search(std::string query, const std::vector<s
             std::vector<std::string> tokens;
             StringUtils::split(document[field_name], tokens, " ");
 
+            // positions in the document of each token in the query
             std::vector<std::vector<uint16_t>> token_positions;
 
             for (const art_leaf *token_leaf : searched_queries[field_order_kv.second.query_index]) {
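The new comment pins down the shape of token_positions: one inner vector per query token, listing every word position where that token appears in the document's field. For instance (hypothetical data, not from the commit):

    #include <cstdint>
    #include <vector>

    // Query tokens {"red", "shoes"} against the field value "red canvas shoes red":
    std::vector<std::vector<uint16_t>> token_positions = {
        {0, 3},  // "red" occurs at word positions 0 and 3
        {2}      // "shoes" occurs at word position 2
    };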
@@ -917,8 +923,10 @@ Option<nlohmann::json> Collection::search(std::string query, const std::vector<s
             std::vector<size_t> token_indices;
             char num_tokens_found = mscore.offset_diffs[0];
             for(size_t i = 1; i <= num_tokens_found; i++) {
-                size_t token_index = (size_t)(mscore.start_offset + mscore.offset_diffs[i]);
-                token_indices.push_back(token_index);
+                if(mscore.offset_diffs[i] != std::numeric_limits<int8_t>::max()) {
+                    size_t token_index = (size_t)(mscore.start_offset + mscore.offset_diffs[i]);
+                    token_indices.push_back(token_index);
+                }
             }
 
             auto minmax = std::minmax_element(token_indices.begin(), token_indices.end());
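This hunk is the actual out-of-bounds fix: the sentinel byte int8_t max (127) is not a real delta, so the old code could compute start_offset + 127 and index far past the end of the tokens vector whenever a query token was missing from the best window. A self-contained sketch of the corrected decode step (the function name is mine; the skip logic is the diff's):

    #include <cstddef>
    #include <cstdint>
    #include <limits>
    #include <vector>

    // Rebuild absolute token indices from the packed buffer, skipping the
    // int8_t-max sentinel that marks tokens absent from the best window.
    static std::vector<size_t> decode_token_indices(const uint16_t start_offset, const char* offset_diffs) {
        std::vector<size_t> token_indices;
        const char num_tokens_found = offset_diffs[0];

        for(size_t i = 1; i <= (size_t) num_tokens_found; i++) {
            if(offset_diffs[i] != std::numeric_limits<int8_t>::max()) {
                token_indices.push_back((size_t)(start_offset + offset_diffs[i]));
            }
        }
        return token_indices;
    }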
@@ -5,7 +5,7 @@ TEST(MatchScoreTest, ShouldPackTokenOffsets) {
     uint16_t min_token_offset1[3] = {567, 568, 570};
     char offset_diffs[16];
 
-    MatchScore::pack_token_offsets(min_token_offset1, 3, 0, offset_diffs);
+    MatchScore::pack_token_offsets(min_token_offset1, 3, 567, offset_diffs);
 
     ASSERT_EQ(3, offset_diffs[0]);
     ASSERT_EQ(0, offset_diffs[1]);
@@ -21,8 +21,25 @@ TEST(MatchScoreTest, ShouldPackTokenOffsets) {
     ASSERT_EQ(2, offset_diffs[3]);
 
     uint16_t min_token_offset3[1] = {123};
-    MatchScore::pack_token_offsets(min_token_offset3, 1, 0, offset_diffs);
+    MatchScore::pack_token_offsets(min_token_offset3, 1, 123, offset_diffs);
 
     ASSERT_EQ(1, offset_diffs[0]);
     ASSERT_EQ(0, offset_diffs[1]);
+
+    // a token might not have an offset because it might not be in the best matching window
+    uint16_t min_token_offset4[3] = {0, MAX_DISPLACEMENT, 2};
+    MatchScore::pack_token_offsets(min_token_offset4, 3, 0, offset_diffs);
+
+    ASSERT_EQ(3, offset_diffs[0]);
+    ASSERT_EQ(0, offset_diffs[1]);
+    ASSERT_EQ(std::numeric_limits<int8_t>::max(), offset_diffs[2]);
+    ASSERT_EQ(2, offset_diffs[3]);
+
+    uint16_t min_token_offset5[3] = {MAX_DISPLACEMENT, 2, 4};
+    MatchScore::pack_token_offsets(min_token_offset5, 3, 2, offset_diffs);
+
+    ASSERT_EQ(3, offset_diffs[0]);
+    ASSERT_EQ(std::numeric_limits<int8_t>::max(), offset_diffs[1]);
+    ASSERT_EQ(0, offset_diffs[2]);
+    ASSERT_EQ(2, offset_diffs[3]);
 }