String field tokens that match query tokens are highlighted in the results.
commit 50e08726da
parent 1d5146f7ff

@@ -60,8 +60,9 @@ struct MatchScore {
     static void pack_token_offsets(const uint16_t* min_token_offset, const size_t num_tokens,
                                    TokenOffsetDiffs & offset_diffs) {
+        offset_diffs.bytes[0] = num_tokens;
         for(size_t i = 1; i < num_tokens; i++) {
-            offset_diffs.bytes[i-1] = (char)(min_token_offset[i] - min_token_offset[0]);
+            offset_diffs.bytes[i] = (char)(min_token_offset[i] - min_token_offset[0]);
         }
     }
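
For reference, a minimal standalone sketch of the packing scheme above (TokenOffsetDiffs is assumed here to be a union of a byte array and a packed integer, as the .bytes / .packed accesses elsewhere in this commit suggest): byte 0 now carries the matched-token count, and each following byte carries a token's offset relative to the first matched token.

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>

    // Assumed shape of TokenOffsetDiffs, inferred from the .bytes / .packed usage in this commit.
    union TokenOffsetDiffs {
        char bytes[8];
        uint64_t packed;
    };

    static void pack_token_offsets(const uint16_t* min_token_offset, const size_t num_tokens,
                                   TokenOffsetDiffs & offset_diffs) {
        offset_diffs.bytes[0] = num_tokens;  // byte 0: number of matched tokens
        for(size_t i = 1; i < num_tokens; i++) {
            // bytes 1..num_tokens-1: offset of each token relative to the first one
            offset_diffs.bytes[i] = (char)(min_token_offset[i] - min_token_offset[0]);
        }
    }

    int main() {
        uint16_t offsets[3] = {10, 11, 13};
        TokenOffsetDiffs diffs{};
        pack_token_offsets(offsets, 3, diffs);
        printf("%d %d %d\n", diffs.bytes[0], diffs.bytes[1], diffs.bytes[2]); // 3 1 3
        return 0;
    }

For offsets one and three positions apart this prints 3 1 3, matching the updated test expectations at the bottom of this commit.
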
@@ -120,8 +121,9 @@ struct MatchScore {
             // If a token appeared within the window, we would have recorded its offset
             if(token_offset[token_id] != MAX_DISPLACEMENT) {
                 num_match++;
-                if(prev_pos == MAX_DISPLACEMENT) prev_pos = token_offset[token_id];
-                else {
+                if(prev_pos == MAX_DISPLACEMENT) { // for the first word
+                    prev_pos = token_offset[token_id];
+                } else {
                     // Calculate the distance between the tokens within the window
                     // Ideally, this should be (NUM_TOKENS - 1) when all the tokens are adjacent to each other
                     D(std::cout << "prev_pos: " << prev_pos << " , curr_pos: " << token_offset[token_id] << std::endl);
@@ -136,11 +138,15 @@ struct MatchScore {
                 // Track the best `displacement` and `num_match` seen so far across all the windows
                 if(num_match >= max_match) {
                     max_match = num_match;
-                    if(displacement != 0 && displacement < min_displacement) {
-                        min_displacement = displacement;
+                    if(displacement == 0 || displacement < min_displacement) {
                         // record the token positions (for highlighting)
                         memcpy(min_token_offset, token_offset, token_offsets.size()*sizeof(uint16_t));
                     }
+
+                    if(displacement != 0 && displacement < min_displacement) {
+                        min_displacement = displacement;
+
+                    }
                 }
 
                 // As we slide the window, drop the first token of the window from the computation
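
The behavioral fix in this hunk is easy to miss: under the old condition, a window whose tokens are all adjacent (displacement == 0) never executed the memcpy, so the offsets needed for highlighting were never captured for exact matches. A tiny sketch of just the predicate change (the initial sentinel for min_displacement is an assumption here):

    #include <cstdint>
    #include <cstdio>

    int main() {
        uint16_t min_displacement = UINT16_MAX; // assumed initial sentinel
        uint16_t displacement = 0;              // all query tokens adjacent in this window

        // old predicate: false for a zero-displacement window, so token
        // positions were never recorded for the best possible match
        bool old_records = (displacement != 0 && displacement < min_displacement);

        // new predicate: a zero-displacement window always records positions
        bool new_records = (displacement == 0 || displacement < min_displacement);

        printf("old: %d, new: %d\n", old_records, new_records); // old: 0, new: 1
        return 0;
    }
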
@@ -150,9 +156,9 @@ struct MatchScore {
 
         // do run-length encoding of the min token positions/offsets
         TokenOffsetDiffs offset_diffs;
-        uint16_t start_offset = min_token_offset[0];
+        uint16_t token_start_offset = min_token_offset[0];
         pack_token_offsets(min_token_offset, token_offsets.size(), offset_diffs);
 
-        return MatchScore{max_match, min_displacement, start_offset, offset_diffs.packed};
+        return MatchScore{max_match, min_displacement, token_start_offset, offset_diffs.packed};
     }
 };
@@ -14,7 +14,7 @@ template <size_t MAX_SIZE=100>
 struct Topster {
     struct KV {
         uint16_t start_offset;
-        TokenOffsetDiffs offset_diffs;
+        TokenOffsetDiffs offset_diffs; // [len, offset1-start_offset, offset2-start_offset, ...]
         uint64_t key;
         uint64_t match_score;
         int64_t primary_attr;
@@ -651,6 +651,35 @@ nlohmann::json Collection::search(std::string query, const std::vector<std::stri
             const std::string &seq_id_key = get_seq_id_key((uint32_t) field_order_kv.second.key);
             store->get(seq_id_key, value);
             nlohmann::json document = nlohmann::json::parse(value);
 
+            // highlight query words in the result
+            const std::string & field_name = search_fields[search_fields.size() - field_order_kv.first];
+            field search_field = search_schema.at(field_name);
+
+            if(search_field.type == field_types::STRING) {
+                std::vector<std::string> tokens;
+                StringUtils::split(document[field_name], tokens, " ");
+
+                tokens[field_order_kv.second.start_offset] =
+                        "<mark>" + tokens[field_order_kv.second.start_offset] + "</mark>";
+
+                for(size_t i = 1; i < field_order_kv.second.offset_diffs.bytes[0]; i++) {
+                    size_t token_index = (size_t)(field_order_kv.second.start_offset + field_order_kv.second.offset_diffs.bytes[i]);
+                    tokens[token_index] = "<mark>" + tokens[token_index] + "</mark>";
+                }
+
+                std::stringstream ss;
+
+                for(size_t token_index = 0; token_index < tokens.size(); ++token_index) {
+                    if(token_index != 0) {
+                        ss << " ";
+                    }
+                    ss << tokens[token_index];
+                }
+
+                document[field_name] = ss.str();
+            }
+
             result["hits"].push_back(document);
         }
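
This block is the decode side of pack_token_offsets: offset_diffs.bytes[0] holds the matched-token count, and bytes[i] holds each subsequent token's distance from start_offset. A self-contained sketch of the same marking logic, with hypothetical token and offset values:

    #include <cstdint>
    #include <iostream>
    #include <sstream>
    #include <string>
    #include <vector>

    int main() {
        // field value tokenized on spaces, as the code above does via StringUtils::split
        std::vector<std::string> tokens = {"the", "quick", "brown", "fox"};

        // hypothetical packed match: 2 matched tokens, the first at index 1 ("quick"),
        // the second 2 positions later ("fox") -- layout: [len, diff1, diff2, ...]
        uint16_t start_offset = 1;
        char offset_bytes[8] = {2, 2};

        // mark the first matched token, then every token at start_offset + diff
        tokens[start_offset] = "<mark>" + tokens[start_offset] + "</mark>";
        for(size_t i = 1; i < (size_t) offset_bytes[0]; i++) {
            size_t token_index = (size_t)(start_offset + offset_bytes[i]);
            tokens[token_index] = "<mark>" + tokens[token_index] + "</mark>";
        }

        // stitch the tokens back together as the new field value
        std::stringstream ss;
        for(size_t i = 0; i < tokens.size(); i++) {
            if(i != 0) {
                ss << " ";
            }
            ss << tokens[i];
        }

        std::cout << ss.str() << std::endl; // the <mark>quick</mark> brown <mark>fox</mark>
        return 0;
    }
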
@@ -353,8 +353,6 @@ TEST_F(CollectionTest, PrefixSearching) {
         ASSERT_STREQ(id.c_str(), result_id.c_str());
     }
 
-    std::cout << "WHAT EX..." << std::endl;
-
     results = collection->search("what ex", query_fields, "", facets, sort_fields, 0, 10, MAX_SCORE, true);
     ASSERT_EQ(9, results["hits"].size());
     ids = {"6", "12", "19", "22", "13", "8", "15", "24", "21"};
@@ -6,12 +6,14 @@ TEST(MatchScoreTest, ShouldPackTokenOffsets) {
     TokenOffsetDiffs offset_diffs;
     MatchScore::pack_token_offsets(min_token_offset1, 3, offset_diffs);
 
-    ASSERT_EQ(1, offset_diffs.bytes[0]);
-    ASSERT_EQ(3, offset_diffs.bytes[1]);
+    ASSERT_EQ(3, offset_diffs.bytes[0]);
+    ASSERT_EQ(1, offset_diffs.bytes[1]);
+    ASSERT_EQ(3, offset_diffs.bytes[2]);
 
     uint16_t min_token_offset2[3] = {0, 1, 2};
     MatchScore::pack_token_offsets(min_token_offset2, 3, offset_diffs);
 
-    ASSERT_EQ(1, offset_diffs.bytes[0]);
-    ASSERT_EQ(2, offset_diffs.bytes[1]);
+    ASSERT_EQ(3, offset_diffs.bytes[0]);
+    ASSERT_EQ(1, offset_diffs.bytes[1]);
+    ASSERT_EQ(2, offset_diffs.bytes[2]);
 }
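
The updated assertions follow directly from the new layout: previously the diffs began at bytes[0], whereas now bytes[0] holds the token count and the diffs shift to bytes[1] onward. A quick sketch contrasting the two layouts for the test's {0, 1, 2} input:

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>

    int main() {
        uint16_t offsets[3] = {0, 1, 2};

        // old layout: diffs only, starting at bytes[0] -> {1, 2}
        char old_bytes[8] = {0};
        for(size_t i = 1; i < 3; i++) {
            old_bytes[i-1] = (char)(offsets[i] - offsets[0]);
        }

        // new layout: token count first, then the diffs -> {3, 1, 2}
        char new_bytes[8] = {0};
        new_bytes[0] = 3;
        for(size_t i = 1; i < 3; i++) {
            new_bytes[i] = (char)(offsets[i] - offsets[0]);
        }

        printf("old: %d %d | new: %d %d %d\n",
               old_bytes[0], old_bytes[1], new_bytes[0], new_bytes[1], new_bytes[2]);
        return 0;
    }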