From 50e08726da5bee4a16051b7193df7e47f6dca86c Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Fri, 9 Jun 2017 14:59:06 -0500 Subject: [PATCH] String field tokens which match with query tokens are highlighted in the results. --- include/match_score.h | 20 +++++++++++++------- include/topster.h | 2 +- src/collection.cpp | 29 +++++++++++++++++++++++++++++ test/collection_test.cpp | 2 -- test/match_score_test.cpp | 10 ++++++---- 5 files changed, 49 insertions(+), 14 deletions(-) diff --git a/include/match_score.h b/include/match_score.h index d81dbe86..4c989420 100644 --- a/include/match_score.h +++ b/include/match_score.h @@ -60,8 +60,9 @@ struct MatchScore { static void pack_token_offsets(const uint16_t* min_token_offset, const size_t num_tokens, TokenOffsetDiffs & offset_diffs) { + offset_diffs.bytes[0] = num_tokens; for(size_t i = 1; i < num_tokens; i++) { - offset_diffs.bytes[i-1] = (char)(min_token_offset[i] - min_token_offset[0]); + offset_diffs.bytes[i] = (char)(min_token_offset[i] - min_token_offset[0]); } } @@ -120,8 +121,9 @@ struct MatchScore { // If a token appeared within the window, we would have recorded its offset if(token_offset[token_id] != MAX_DISPLACEMENT) { num_match++; - if(prev_pos == MAX_DISPLACEMENT) prev_pos = token_offset[token_id]; - else { + if(prev_pos == MAX_DISPLACEMENT) { // for the first word + prev_pos = token_offset[token_id]; + } else { // Calculate the distance between the tokens within the window // Ideally, this should be (NUM_TOKENS - 1) when all the tokens are adjacent to each other D(std::cout << "prev_pos: " << prev_pos << " , curr_pos: " << token_offset[token_id] << std::endl); @@ -136,11 +138,15 @@ struct MatchScore { // Track the best `displacement` and `num_match` seen so far across all the windows if(num_match >= max_match) { max_match = num_match; - if(displacement != 0 && displacement < min_displacement) { - min_displacement = displacement; + if(displacement == 0 || displacement < min_displacement) { // record the token positions (for highlighting) memcpy(min_token_offset, token_offset, token_offsets.size()*sizeof(uint16_t)); } + + if(displacement != 0 && displacement < min_displacement) { + min_displacement = displacement; + + } } // As we slide the window, drop the first token of the window from the computation @@ -150,9 +156,9 @@ struct MatchScore { // do run-length encoding of the min token positions/offsets TokenOffsetDiffs offset_diffs; - uint16_t start_offset = min_token_offset[0]; + uint16_t token_start_offset = min_token_offset[0]; pack_token_offsets(min_token_offset, token_offsets.size(), offset_diffs); - return MatchScore{max_match, min_displacement, start_offset, offset_diffs.packed}; + return MatchScore{max_match, min_displacement, token_start_offset, offset_diffs.packed}; } }; diff --git a/include/topster.h b/include/topster.h index a1ec92eb..cedcc114 100644 --- a/include/topster.h +++ b/include/topster.h @@ -14,7 +14,7 @@ template struct Topster { struct KV { uint16_t start_offset; - TokenOffsetDiffs offset_diffs; + TokenOffsetDiffs offset_diffs; // [len, offset1-start_offset, offset2-start_offset, ...] uint64_t key; uint64_t match_score; int64_t primary_attr; diff --git a/src/collection.cpp b/src/collection.cpp index f8814ca1..d79706ba 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -651,6 +651,35 @@ nlohmann::json Collection::search(std::string query, const std::vectorget(seq_id_key, value); nlohmann::json document = nlohmann::json::parse(value); + + // highlight query words in the result + const std::string & field_name = search_fields[search_fields.size() - field_order_kv.first]; + field search_field = search_schema.at(field_name); + + if(search_field.type == field_types::STRING) { + std::vector tokens; + StringUtils::split(document[field_name], tokens, " "); + + tokens[field_order_kv.second.start_offset] = + "" + tokens[field_order_kv.second.start_offset] + ""; + + for(size_t i = 1; i < field_order_kv.second.offset_diffs.bytes[0]; i++) { + size_t token_index = (size_t)(field_order_kv.second.start_offset + field_order_kv.second.offset_diffs.bytes[i]); + tokens[token_index] = "" + tokens[token_index] + ""; + } + + std::stringstream ss; + + for(size_t token_index = 0; token_index < tokens.size(); ++token_index) { + if(token_index != 0) { + ss << " "; + } + ss << tokens[token_index]; + } + + document[field_name] = ss.str(); + } + result["hits"].push_back(document); } diff --git a/test/collection_test.cpp b/test/collection_test.cpp index 74e107d0..25a070bf 100644 --- a/test/collection_test.cpp +++ b/test/collection_test.cpp @@ -353,8 +353,6 @@ TEST_F(CollectionTest, PrefixSearching) { ASSERT_STREQ(id.c_str(), result_id.c_str()); } - std::cout << "WHAT EX..." << std::endl; - results = collection->search("what ex", query_fields, "", facets, sort_fields, 0, 10, MAX_SCORE, true); ASSERT_EQ(9, results["hits"].size()); ids = {"6", "12", "19", "22", "13", "8", "15", "24", "21"}; diff --git a/test/match_score_test.cpp b/test/match_score_test.cpp index 57c1db12..c8145c5a 100644 --- a/test/match_score_test.cpp +++ b/test/match_score_test.cpp @@ -6,12 +6,14 @@ TEST(MatchScoreTest, ShouldPackTokenOffsets) { TokenOffsetDiffs offset_diffs; MatchScore::pack_token_offsets(min_token_offset1, 3, offset_diffs); - ASSERT_EQ(1, offset_diffs.bytes[0]); - ASSERT_EQ(3, offset_diffs.bytes[1]); + ASSERT_EQ(3, offset_diffs.bytes[0]); + ASSERT_EQ(1, offset_diffs.bytes[1]); + ASSERT_EQ(3, offset_diffs.bytes[2]); uint16_t min_token_offset2[3] = {0, 1, 2}; MatchScore::pack_token_offsets(min_token_offset2, 3, offset_diffs); - ASSERT_EQ(1, offset_diffs.bytes[0]); - ASSERT_EQ(2, offset_diffs.bytes[1]); + ASSERT_EQ(3, offset_diffs.bytes[0]); + ASSERT_EQ(1, offset_diffs.bytes[1]); + ASSERT_EQ(2, offset_diffs.bytes[2]); } \ No newline at end of file