String field tokens that match query tokens are highlighted in the results.

This commit is contained in:
Kishore Nallan 2017-06-09 14:59:06 -05:00
parent 1d5146f7ff
commit 50e08726da
5 changed files with 49 additions and 14 deletions

View File

@ -60,8 +60,9 @@ struct MatchScore {
static void pack_token_offsets(const uint16_t* min_token_offset, const size_t num_tokens,
TokenOffsetDiffs & offset_diffs) {
offset_diffs.bytes[0] = num_tokens;
for(size_t i = 1; i < num_tokens; i++) {
offset_diffs.bytes[i-1] = (char)(min_token_offset[i] - min_token_offset[0]);
offset_diffs.bytes[i] = (char)(min_token_offset[i] - min_token_offset[0]);
}
}
@ -120,8 +121,9 @@ struct MatchScore {
// If a token appeared within the window, we would have recorded its offset
if(token_offset[token_id] != MAX_DISPLACEMENT) {
num_match++;
if(prev_pos == MAX_DISPLACEMENT) prev_pos = token_offset[token_id];
else {
if(prev_pos == MAX_DISPLACEMENT) { // for the first word
prev_pos = token_offset[token_id];
} else {
// Calculate the distance between the tokens within the window
// Ideally, this should be (NUM_TOKENS - 1) when all the tokens are adjacent to each other
D(std::cout << "prev_pos: " << prev_pos << " , curr_pos: " << token_offset[token_id] << std::endl);
@ -136,11 +138,15 @@ struct MatchScore {
// Track the best `displacement` and `num_match` seen so far across all the windows
if(num_match >= max_match) {
max_match = num_match;
if(displacement != 0 && displacement < min_displacement) {
min_displacement = displacement;
if(displacement == 0 || displacement < min_displacement) {
// record the token positions (for highlighting)
memcpy(min_token_offset, token_offset, token_offsets.size()*sizeof(uint16_t));
}
if(displacement != 0 && displacement < min_displacement) {
min_displacement = displacement;
}
}
// As we slide the window, drop the first token of the window from the computation
@ -150,9 +156,9 @@ struct MatchScore {
// do run-length encoding of the min token positions/offsets
TokenOffsetDiffs offset_diffs;
uint16_t start_offset = min_token_offset[0];
uint16_t token_start_offset = min_token_offset[0];
pack_token_offsets(min_token_offset, token_offsets.size(), offset_diffs);
return MatchScore{max_match, min_displacement, start_offset, offset_diffs.packed};
return MatchScore{max_match, min_displacement, token_start_offset, offset_diffs.packed};
}
};

View File

@ -14,7 +14,7 @@ template <size_t MAX_SIZE=100>
struct Topster {
struct KV {
uint16_t start_offset;
TokenOffsetDiffs offset_diffs;
TokenOffsetDiffs offset_diffs; // [len, offset1-start_offset, offset2-start_offset, ...]
uint64_t key;
uint64_t match_score;
int64_t primary_attr;

View File

@ -651,6 +651,35 @@ nlohmann::json Collection::search(std::string query, const std::vector<std::stri
const std::string &seq_id_key = get_seq_id_key((uint32_t) field_order_kv.second.key);
store->get(seq_id_key, value);
nlohmann::json document = nlohmann::json::parse(value);
// highlight query words in the result
const std::string & field_name = search_fields[search_fields.size() - field_order_kv.first];
field search_field = search_schema.at(field_name);
if(search_field.type == field_types::STRING) {
std::vector<std::string> tokens;
StringUtils::split(document[field_name], tokens, " ");
tokens[field_order_kv.second.start_offset] =
"<mark>" + tokens[field_order_kv.second.start_offset] + "</mark>";
for(size_t i = 1; i < field_order_kv.second.offset_diffs.bytes[0]; i++) {
size_t token_index = (size_t)(field_order_kv.second.start_offset + field_order_kv.second.offset_diffs.bytes[i]);
tokens[token_index] = "<mark>" + tokens[token_index] + "</mark>";
}
std::stringstream ss;
for(size_t token_index = 0; token_index < tokens.size(); ++token_index) {
if(token_index != 0) {
ss << " ";
}
ss << tokens[token_index];
}
document[field_name] = ss.str();
}
result["hits"].push_back(document);
}

View File

@ -353,8 +353,6 @@ TEST_F(CollectionTest, PrefixSearching) {
ASSERT_STREQ(id.c_str(), result_id.c_str());
}
std::cout << "WHAT EX..." << std::endl;
results = collection->search("what ex", query_fields, "", facets, sort_fields, 0, 10, MAX_SCORE, true);
ASSERT_EQ(9, results["hits"].size());
ids = {"6", "12", "19", "22", "13", "8", "15", "24", "21"};

View File

@ -6,12 +6,14 @@ TEST(MatchScoreTest, ShouldPackTokenOffsets) {
TokenOffsetDiffs offset_diffs;
MatchScore::pack_token_offsets(min_token_offset1, 3, offset_diffs);
ASSERT_EQ(1, offset_diffs.bytes[0]);
ASSERT_EQ(3, offset_diffs.bytes[1]);
ASSERT_EQ(3, offset_diffs.bytes[0]);
ASSERT_EQ(1, offset_diffs.bytes[1]);
ASSERT_EQ(3, offset_diffs.bytes[2]);
uint16_t min_token_offset2[3] = {0, 1, 2};
MatchScore::pack_token_offsets(min_token_offset2, 3, offset_diffs);
ASSERT_EQ(1, offset_diffs.bytes[0]);
ASSERT_EQ(2, offset_diffs.bytes[1]);
ASSERT_EQ(3, offset_diffs.bytes[0]);
ASSERT_EQ(1, offset_diffs.bytes[1]);
ASSERT_EQ(2, offset_diffs.bytes[2]);
}