String field tokens that match query tokens are highlighted in the results.
commit 50e08726da
parent 1d5146f7ff

@@ -60,8 +60,9 @@ struct MatchScore {
     static void pack_token_offsets(const uint16_t* min_token_offset, const size_t num_tokens,
                                    TokenOffsetDiffs & offset_diffs) {
+        offset_diffs.bytes[0] = num_tokens;
         for(size_t i = 1; i < num_tokens; i++) {
-            offset_diffs.bytes[i-1] = (char)(min_token_offset[i] - min_token_offset[0]);
+            offset_diffs.bytes[i] = (char)(min_token_offset[i] - min_token_offset[0]);
         }
     }
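
For reference, a minimal standalone sketch of the packing scheme above (TokenOffsetDiffs is assumed here to be a union of a byte array and a packed integer, as the .bytes / .packed accesses elsewhere in this commit suggest): byte 0 now carries the matched-token count, and each following byte carries a token's offset relative to the first matched token.

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>

    // Assumed shape of TokenOffsetDiffs, inferred from the .bytes / .packed usage in this commit.
    union TokenOffsetDiffs {
        char bytes[8];
        uint64_t packed;
    };

    static void pack_token_offsets(const uint16_t* min_token_offset, const size_t num_tokens,
                                   TokenOffsetDiffs & offset_diffs) {
        offset_diffs.bytes[0] = num_tokens;  // byte 0: number of matched tokens
        for(size_t i = 1; i < num_tokens; i++) {
            // bytes 1..num_tokens-1: offset of each token relative to the first one
            offset_diffs.bytes[i] = (char)(min_token_offset[i] - min_token_offset[0]);
        }
    }

    int main() {
        uint16_t offsets[3] = {10, 11, 13};
        TokenOffsetDiffs diffs{};
        pack_token_offsets(offsets, 3, diffs);
        printf("%d %d %d\n", diffs.bytes[0], diffs.bytes[1], diffs.bytes[2]); // 3 1 3
        return 0;
    }

For offsets one and three positions apart this prints 3 1 3, matching the updated test expectations at the bottom of this commit.
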
@@ -120,8 +121,9 @@ struct MatchScore {
             // If a token appeared within the window, we would have recorded its offset
             if(token_offset[token_id] != MAX_DISPLACEMENT) {
                 num_match++;
-                if(prev_pos == MAX_DISPLACEMENT) prev_pos = token_offset[token_id];
-                else {
+                if(prev_pos == MAX_DISPLACEMENT) { // for the first word
+                    prev_pos = token_offset[token_id];
+                } else {
                     // Calculate the distance between the tokens within the window
                     // Ideally, this should be (NUM_TOKENS - 1) when all the tokens are adjacent to each other
                     D(std::cout << "prev_pos: " << prev_pos << " , curr_pos: " << token_offset[token_id] << std::endl);
@@ -136,11 +138,15 @@ struct MatchScore {
                 // Track the best `displacement` and `num_match` seen so far across all the windows
                 if(num_match >= max_match) {
                     max_match = num_match;
-                    if(displacement != 0 && displacement < min_displacement) {
-                        min_displacement = displacement;
+                    if(displacement == 0 || displacement < min_displacement) {
                         // record the token positions (for highlighting)
                         memcpy(min_token_offset, token_offset, token_offsets.size()*sizeof(uint16_t));
                     }
+
+                    if(displacement != 0 && displacement < min_displacement) {
+                        min_displacement = displacement;
+
+                    }
                 }
 
                 // As we slide the window, drop the first token of the window from the computation
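
The behavioral fix in this hunk is easy to miss: under the old condition, a window whose tokens are all adjacent (displacement == 0) never executed the memcpy, so the offsets needed for highlighting were never captured for exact matches. A tiny sketch of just the predicate change (the initial sentinel for min_displacement is an assumption here):

    #include <cstdint>
    #include <cstdio>

    int main() {
        uint16_t min_displacement = UINT16_MAX; // assumed initial sentinel
        uint16_t displacement = 0;              // all query tokens adjacent in this window

        // old predicate: false for a zero-displacement window, so token
        // positions were never recorded for the best possible match
        bool old_records = (displacement != 0 && displacement < min_displacement);

        // new predicate: a zero-displacement window always records positions
        bool new_records = (displacement == 0 || displacement < min_displacement);

        printf("old: %d, new: %d\n", old_records, new_records); // old: 0, new: 1
        return 0;
    }
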
@@ -150,9 +156,9 @@ struct MatchScore {
 
         // do run-length encoding of the min token positions/offsets
         TokenOffsetDiffs offset_diffs;
-        uint16_t start_offset = min_token_offset[0];
+        uint16_t token_start_offset = min_token_offset[0];
         pack_token_offsets(min_token_offset, token_offsets.size(), offset_diffs);
 
-        return MatchScore{max_match, min_displacement, start_offset, offset_diffs.packed};
+        return MatchScore{max_match, min_displacement, token_start_offset, offset_diffs.packed};
     }
 };
@@ -14,7 +14,7 @@ template <size_t MAX_SIZE=100>
 struct Topster {
     struct KV {
         uint16_t start_offset;
-        TokenOffsetDiffs offset_diffs;
+        TokenOffsetDiffs offset_diffs; // [len, offset1-start_offset, offset2-start_offset, ...]
         uint64_t key;
         uint64_t match_score;
         int64_t primary_attr;
@@ -651,6 +651,35 @@ nlohmann::json Collection::search(std::string query, const std::vector<std::stri
             const std::string &seq_id_key = get_seq_id_key((uint32_t) field_order_kv.second.key);
             store->get(seq_id_key, value);
             nlohmann::json document = nlohmann::json::parse(value);
 
+            // highlight query words in the result
+            const std::string & field_name = search_fields[search_fields.size() - field_order_kv.first];
+            field search_field = search_schema.at(field_name);
+
+            if(search_field.type == field_types::STRING) {
+                std::vector<std::string> tokens;
+                StringUtils::split(document[field_name], tokens, " ");
+
+                tokens[field_order_kv.second.start_offset] =
+                        "<mark>" + tokens[field_order_kv.second.start_offset] + "</mark>";
+
+                for(size_t i = 1; i < field_order_kv.second.offset_diffs.bytes[0]; i++) {
+                    size_t token_index = (size_t)(field_order_kv.second.start_offset + field_order_kv.second.offset_diffs.bytes[i]);
+                    tokens[token_index] = "<mark>" + tokens[token_index] + "</mark>";
+                }
+
+                std::stringstream ss;
+
+                for(size_t token_index = 0; token_index < tokens.size(); ++token_index) {
+                    if(token_index != 0) {
+                        ss << " ";
+                    }
+                    ss << tokens[token_index];
+                }
+
+                document[field_name] = ss.str();
+            }
+
             result["hits"].push_back(document);
         }
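
This block is the decode side of pack_token_offsets: offset_diffs.bytes[0] holds the matched-token count, and bytes[i] holds each subsequent token's distance from start_offset. A self-contained sketch of the same marking logic, with hypothetical token and offset values:

    #include <cstdint>
    #include <iostream>
    #include <sstream>
    #include <string>
    #include <vector>

    int main() {
        // field value tokenized on spaces, as the code above does via StringUtils::split
        std::vector<std::string> tokens = {"the", "quick", "brown", "fox"};

        // hypothetical packed match: 2 matched tokens, the first at index 1 ("quick"),
        // the second 2 positions later ("fox") -- layout: [len, diff1, diff2, ...]
        uint16_t start_offset = 1;
        char offset_bytes[8] = {2, 2};

        // mark the first matched token, then every token at start_offset + diff
        tokens[start_offset] = "<mark>" + tokens[start_offset] + "</mark>";
        for(size_t i = 1; i < (size_t) offset_bytes[0]; i++) {
            size_t token_index = (size_t)(start_offset + offset_bytes[i]);
            tokens[token_index] = "<mark>" + tokens[token_index] + "</mark>";
        }

        // stitch the tokens back together as the new field value
        std::stringstream ss;
        for(size_t i = 0; i < tokens.size(); i++) {
            if(i != 0) {
                ss << " ";
            }
            ss << tokens[i];
        }

        std::cout << ss.str() << std::endl; // the <mark>quick</mark> brown <mark>fox</mark>
        return 0;
    }
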
@@ -353,8 +353,6 @@ TEST_F(CollectionTest, PrefixSearching) {
         ASSERT_STREQ(id.c_str(), result_id.c_str());
     }
 
-    std::cout << "WHAT EX..." << std::endl;
-
     results = collection->search("what ex", query_fields, "", facets, sort_fields, 0, 10, MAX_SCORE, true);
     ASSERT_EQ(9, results["hits"].size());
     ids = {"6", "12", "19", "22", "13", "8", "15", "24", "21"};
@@ -6,12 +6,14 @@ TEST(MatchScoreTest, ShouldPackTokenOffsets) {
     TokenOffsetDiffs offset_diffs;
     MatchScore::pack_token_offsets(min_token_offset1, 3, offset_diffs);
 
-    ASSERT_EQ(1, offset_diffs.bytes[0]);
-    ASSERT_EQ(3, offset_diffs.bytes[1]);
+    ASSERT_EQ(3, offset_diffs.bytes[0]);
+    ASSERT_EQ(1, offset_diffs.bytes[1]);
+    ASSERT_EQ(3, offset_diffs.bytes[2]);
 
     uint16_t min_token_offset2[3] = {0, 1, 2};
     MatchScore::pack_token_offsets(min_token_offset2, 3, offset_diffs);
 
-    ASSERT_EQ(1, offset_diffs.bytes[0]);
-    ASSERT_EQ(2, offset_diffs.bytes[1]);
+    ASSERT_EQ(3, offset_diffs.bytes[0]);
+    ASSERT_EQ(1, offset_diffs.bytes[1]);
+    ASSERT_EQ(2, offset_diffs.bytes[2]);
 }
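
The updated assertions follow directly from the new layout: previously the diffs began at bytes[0], whereas now bytes[0] holds the token count and the diffs shift to bytes[1] onward. A quick sketch contrasting the two layouts for the test's {0, 1, 2} input:

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>

    int main() {
        uint16_t offsets[3] = {0, 1, 2};

        // old layout: diffs only, starting at bytes[0] -> {1, 2}
        char old_bytes[8] = {0};
        for(size_t i = 1; i < 3; i++) {
            old_bytes[i-1] = (char)(offsets[i] - offsets[0]);
        }

        // new layout: token count first, then the diffs -> {3, 1, 2}
        char new_bytes[8] = {0};
        new_bytes[0] = 3;
        for(size_t i = 1; i < 3; i++) {
            new_bytes[i] = (char)(offsets[i] - offsets[0]);
        }

        printf("old: %d %d | new: %d %d %d\n",
               old_bytes[0], old_bytes[1], new_bytes[0], new_bytes[1], new_bytes[2]);
        return 0;
    }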