From 50e08726da5bee4a16051b7193df7e47f6dca86c Mon Sep 17 00:00:00 2001
From: Kishore Nallan <kishore@kishorelive.com>
Date: Fri, 9 Jun 2017 14:59:06 -0500
Subject: [PATCH] String field tokens which match with query tokens are
 highlighted in the results.

---
 include/match_score.h     | 20 +++++++++++++-------
 include/topster.h         |  2 +-
 src/collection.cpp        | 29 +++++++++++++++++++++++++++++
 test/collection_test.cpp  |  2 --
 test/match_score_test.cpp | 10 ++++++----
 5 files changed, 49 insertions(+), 14 deletions(-)

diff --git a/include/match_score.h b/include/match_score.h
index d81dbe86..4c989420 100644
--- a/include/match_score.h
+++ b/include/match_score.h
@@ -60,8 +60,9 @@ struct MatchScore {
 
   static void pack_token_offsets(const uint16_t* min_token_offset, const size_t num_tokens,
                                  TokenOffsetDiffs & offset_diffs) {
+      offset_diffs.bytes[0] = num_tokens;  // slot 0 stores the token count; NOTE(review): unchecked narrowing, assumes num_tokens fits in bytes - confirm
       for(size_t i = 1; i < num_tokens; i++) {
-          offset_diffs.bytes[i-1] = (char)(min_token_offset[i] - min_token_offset[0]);
+          offset_diffs.bytes[i] = (char)(min_token_offset[i] - min_token_offset[0]);  // slot i: offset of token i relative to token 0, truncated to a char
       }
   }
 
@@ -120,8 +121,9 @@ struct MatchScore {
         // If a token appeared within the window, we would have recorded its offset
         if(token_offset[token_id] != MAX_DISPLACEMENT) {
           num_match++;
-          if(prev_pos == MAX_DISPLACEMENT) prev_pos = token_offset[token_id];
-          else {
+          if(prev_pos == MAX_DISPLACEMENT) { // for the first word
+            prev_pos = token_offset[token_id];
+          } else {
             // Calculate the distance between the tokens within the window
             // Ideally, this should be (NUM_TOKENS - 1) when all the tokens are adjacent to each other
             D(std::cout << "prev_pos: " << prev_pos << " , curr_pos: " << token_offset[token_id] << std::endl);
@@ -136,11 +138,15 @@ struct MatchScore {
       // Track the best `displacement` and `num_match` seen so far across all the windows
       if(num_match >= max_match) {
         max_match = num_match;
-        if(displacement != 0 && displacement < min_displacement) {
-          min_displacement = displacement;
+        if(displacement == 0 || displacement < min_displacement) {
           // record the token positions (for highlighting)
           memcpy(min_token_offset, token_offset, token_offsets.size()*sizeof(uint16_t));
         }
+
+        if(displacement != 0 && displacement < min_displacement) {
+          min_displacement = displacement;
+
+        }
       }
 
       // As we slide the window, drop the first token of the window from the computation
@@ -150,9 +156,9 @@ struct MatchScore {
 
     // do run-length encoding of the min token positions/offsets
     TokenOffsetDiffs offset_diffs;
-    uint16_t start_offset = min_token_offset[0];
+    uint16_t token_start_offset = min_token_offset[0];
     pack_token_offsets(min_token_offset, token_offsets.size(), offset_diffs);
 
-    return MatchScore{max_match, min_displacement, start_offset, offset_diffs.packed};
+    return MatchScore{max_match, min_displacement, token_start_offset, offset_diffs.packed};
   }
 };
diff --git a/include/topster.h b/include/topster.h
index a1ec92eb..cedcc114 100644
--- a/include/topster.h
+++ b/include/topster.h
@@ -14,7 +14,7 @@ template <size_t MAX_SIZE=100>
 struct Topster {
     struct KV {
         uint16_t start_offset;
-        TokenOffsetDiffs offset_diffs;
+        TokenOffsetDiffs offset_diffs;  // [len, offset1-start_offset, offset2-start_offset, ...]
         uint64_t key;
         uint64_t match_score;
         int64_t primary_attr;
diff --git a/src/collection.cpp b/src/collection.cpp
index f8814ca1..d79706ba 100644
--- a/src/collection.cpp
+++ b/src/collection.cpp
@@ -651,6 +651,35 @@ nlohmann::json Collection::search(std::string query, const std::vector<std::stri
         const std::string &seq_id_key = get_seq_id_key((uint32_t) field_order_kv.second.key);
         store->get(seq_id_key, value);
         nlohmann::json document = nlohmann::json::parse(value);
+
+        // highlight query words in the result by wrapping the matched tokens in <mark> tags
+        const std::string & field_name = search_fields[search_fields.size() - field_order_kv.first];
+        const field & search_field = search_schema.at(field_name);
+        if(search_field.type == field_types::STRING) {
+            std::vector<std::string> tokens;
+            StringUtils::split(document[field_name], tokens, " ");
+
+            // offsets that fall outside `tokens` are skipped instead of being indexed out of bounds
+            const auto mark_token = [&tokens](const size_t token_index) {
+                if(token_index < tokens.size()) {
+                    tokens[token_index] = "<mark>" + tokens[token_index] + "</mark>";
+                }
+            };
+
+            // offset_diffs.bytes: [len, offset1-start_offset, offset2-start_offset, ...]
+            const auto & kv = field_order_kv.second;
+            mark_token(kv.start_offset);
+            // signed loop bound: a negative length byte must not turn into a huge unsigned limit
+            for(int i = 1; i < (int) kv.offset_diffs.bytes[0]; i++) {
+                mark_token((size_t)(kv.start_offset + kv.offset_diffs.bytes[i]));
+            }
+
+            std::stringstream ss;
+            for(size_t token_index = 0; token_index < tokens.size(); ++token_index) {
+                ss << (token_index != 0 ? " " : "") << tokens[token_index];
+            }
+            document[field_name] = ss.str();
+        }
         result["hits"].push_back(document);
     }
 
diff --git a/test/collection_test.cpp b/test/collection_test.cpp
index 74e107d0..25a070bf 100644
--- a/test/collection_test.cpp
+++ b/test/collection_test.cpp
@@ -353,8 +353,6 @@ TEST_F(CollectionTest, PrefixSearching) {
         ASSERT_STREQ(id.c_str(), result_id.c_str());
     }
 
-    std::cout << "WHAT EX..." << std::endl;
-
     results = collection->search("what ex", query_fields, "", facets, sort_fields, 0, 10, MAX_SCORE, true);
     ASSERT_EQ(9, results["hits"].size());
     ids = {"6", "12", "19", "22", "13", "8", "15", "24", "21"};
diff --git a/test/match_score_test.cpp b/test/match_score_test.cpp
index 57c1db12..c8145c5a 100644
--- a/test/match_score_test.cpp
+++ b/test/match_score_test.cpp
@@ -6,12 +6,14 @@ TEST(MatchScoreTest, ShouldPackTokenOffsets) {
     TokenOffsetDiffs offset_diffs;
     MatchScore::pack_token_offsets(min_token_offset1, 3, offset_diffs);
 
-    ASSERT_EQ(1, offset_diffs.bytes[0]);
-    ASSERT_EQ(3, offset_diffs.bytes[1]);
+    ASSERT_EQ(3, offset_diffs.bytes[0]);
+    ASSERT_EQ(1, offset_diffs.bytes[1]);
+    ASSERT_EQ(3, offset_diffs.bytes[2]);
 
     uint16_t  min_token_offset2[3] = {0, 1, 2};
     MatchScore::pack_token_offsets(min_token_offset2, 3, offset_diffs);
 
-    ASSERT_EQ(1, offset_diffs.bytes[0]);
-    ASSERT_EQ(2, offset_diffs.bytes[1]);
+    ASSERT_EQ(3, offset_diffs.bytes[0]);
+    ASSERT_EQ(1, offset_diffs.bytes[1]);
+    ASSERT_EQ(2, offset_diffs.bytes[2]);
 }
\ No newline at end of file