From 99e11064a69e066ab4b38aa67590fabd1bc0a04a Mon Sep 17 00:00:00 2001
From: Kishore Nallan
Date: Sun, 14 Jul 2024 17:24:45 +0530
Subject: [PATCH] Fix highlighting for nested array of strings.

---
 include/collection.h                   |  2 +-
 src/collection.cpp                     | 25 ++++++++++++-------
 test/collection_nested_fields_test.cpp | 33 ++++++++++++++++++++++++++
 3 files changed, 51 insertions(+), 9 deletions(-)

diff --git a/include/collection.h b/include/collection.h
index 8ebd0e88..fb0b6fd4 100644
--- a/include/collection.h
+++ b/include/collection.h
@@ -288,7 +288,7 @@ private:
     void populate_text_match_info(nlohmann::json& info, uint64_t match_score, const text_match_type_t match_type,
                                   const size_t total_tokens) const;
 
-    bool handle_highlight_text(std::string& text, bool normalise, const field &search_field,
+    bool handle_highlight_text(std::string& text, bool normalise, const field &search_field, const bool is_arr_obj_ele,
                                const std::vector<char>& symbols_to_index, const std::vector<char>& token_separators,
                                highlight_t& highlight, StringUtils & string_utils, bool use_word_tokenizer,
                                const size_t highlight_affix_num_tokens,
diff --git a/src/collection.cpp b/src/collection.cpp
index ba521d9f..dfac82f7 100644
--- a/src/collection.cpp
+++ b/src/collection.cpp
@@ -3072,7 +3072,7 @@ Option<nlohmann::json> Collection::search(std::string raw_query,
             index_symbols[uint8_t(c)] = 1;
         }
 
-        handle_highlight_text(value, normalise, the_field, symbols_to_index, token_separators,
+        handle_highlight_text(value, normalise, the_field, false, symbols_to_index, token_separators,
                               highlight, string_utils, use_word_tokenizer, highlight_affix_num_tokens,
                               qtoken_leaves, last_valid_offset_index, prefix_token_num_chars,
                               false, snippet_threshold, false, ftokens,
@@ -4071,7 +4071,7 @@ void Collection::highlight_result(const std::string& raw_query, const field &sea
                 std::string text = h_obj.get<std::string>();
                 h_obj = nlohmann::json::object();
 
-                handle_highlight_text(text, normalise, search_field, symbols_to_index,
+                handle_highlight_text(text, normalise, search_field, is_arr_obj_ele, symbols_to_index,
                                       token_separators, array_highlight, string_utils, use_word_tokenizer,
                                       highlight_affix_num_tokens, qtoken_leaves, last_valid_offset_index,
@@ -4165,7 +4165,7 @@ void Collection::highlight_result(const std::string& raw_query, const field &sea
             text = document[search_field.name][match_index.index];
         }
 
-        handle_highlight_text(text, normalise, search_field, symbols_to_index, token_separators,
+        handle_highlight_text(text, normalise, search_field, false, symbols_to_index, token_separators,
                               highlight, string_utils, use_word_tokenizer, highlight_affix_num_tokens,
                               qtoken_leaves, last_valid_offset_index, prefix_token_num_chars, highlight_fully,
                               snippet_threshold, is_infix_search, raw_query_tokens,
@@ -4195,7 +4195,8 @@ void Collection::highlight_result(const std::string& raw_query, const field &sea
 }
 
 bool Collection::handle_highlight_text(std::string& text, bool normalise, const field &search_field,
-                                       const std::vector<char>& symbols_to_index, const std::vector<char>& token_separators,
+                                       const bool is_arr_obj_ele, const std::vector<char>& symbols_to_index,
+                                       const std::vector<char>& token_separators,
                                        highlight_t& highlight, StringUtils & string_utils, bool use_word_tokenizer,
                                        const size_t highlight_affix_num_tokens,
                                        const tsl::htrie_map<char, token_leaf>& qtoken_leaves, int last_valid_offset_index,
@@ -4267,8 +4268,10 @@ bool Collection::handle_highlight_text(std::string& text, bool normalise, const
 
         // Token might not appear in the best matched window, which is limited to a size of 10.
         // If field is marked to be highlighted fully, or field length exceeds snippet_threshold, we will
-        // locate all tokens that appear in the query / query candidates
-        bool raw_token_found = !match_offset_found && (highlight_fully || text_len < snippet_threshold * 6) &&
+        // locate all tokens that appear in the query / query candidates. Likewise, text within a nested array
+        // of objects has to be exhaustively scanned for highlight tokens.
+        bool raw_token_found = !match_offset_found &&
+                               (highlight_fully || is_arr_obj_ele || text_len < snippet_threshold * 6) &&
                                qtoken_leaves.find(raw_token) != qtoken_leaves.end();
 
         if (match_offset_found || raw_token_found) {
@@ -4325,6 +4328,12 @@ bool Collection::handle_highlight_text(std::string& text, bool normalise, const
                     snippet_start_offset = snippet_start_window.front();
                 }
 
+                found_first_match = true;
+            } else if(raw_token_found && is_arr_obj_ele) {
+                if(!found_first_match) {
+                    snippet_start_offset = snippet_start_window.front();
+                }
+
                 found_first_match = true;
             }
         } else if(is_infix_search && text.size() < 100 &&
@@ -4333,7 +4342,7 @@ bool Collection::handle_highlight_text(std::string& text, bool normalise, const
             token_hits.insert(raw_token);
         }
 
-        if(raw_token_index >= last_valid_offset + highlight_affix_num_tokens) {
+        if(last_valid_offset_index != -1 && raw_token_index >= last_valid_offset + highlight_affix_num_tokens) {
             // register end of highlight snippet
             if(snippet_end_offset == text.size() - 1) {
                 snippet_end_offset = tok_end;
@@ -4349,7 +4358,7 @@ bool Collection::handle_highlight_text(std::string& text, bool normalise, const
         if(raw_token_index >= snippet_threshold &&
            match_offset_index > last_valid_offset_index &&
            raw_token_index >= last_valid_offset + highlight_affix_num_tokens &&
-           !highlight_fully) {
+           !is_arr_obj_ele && !highlight_fully) {
             break;
         }
     }
diff --git a/test/collection_nested_fields_test.cpp b/test/collection_nested_fields_test.cpp
index 503f9034..dd8f37c2 100644
--- a/test/collection_nested_fields_test.cpp
+++ b/test/collection_nested_fields_test.cpp
@@ -1487,6 +1487,39 @@ TEST_F(CollectionNestedFieldsTest, ExplicitSchemaForNestedArrayTypeValidation) {
               "Hint: field inside an array of objects must be an array type as well.", add_op.error());
 }
 
+TEST_F(CollectionNestedFieldsTest, NestedStringArrayHighlight) {
+    nlohmann::json schema = R"({
+        "name": "coll1",
+        "enable_nested_fields": true,
+        "fields": [
+          {"name": "passages", "type": "object[]"},
+          {"name": "passages.text", "type": "string[]"}
+        ]
+    })"_json;
+
+    auto op = collectionManager.create_collection(schema);
+    ASSERT_TRUE(op.ok());
+    Collection* coll1 = op.get();
+
+    auto doc_str = std::string (R"({"passages": [{"text": "In January 1880, two of Tesla's uncles put together enough money to help him )") +
+                   "leave Gospić for Prague, where he was to study. He arrived too late to enroll at Charles-Ferdinand " +
+                   "University; he had never studied Greek, a required subject; and he was illiterate in Czech, another " +
+                   "required subject. Tesla did, however, attend lectures in philosophy at the university as an auditor " +
+                   "but he did not receive grades for the courses."
+ R"("}]})"; + + auto doc1 = nlohmann::json::parse(doc_str); + coll1->add(doc1.dump(), CREATE); + + auto results = coll1->search("grades", {"passages.text"}, + "", {}, {}, {0}, 10, 1, + token_ordering::FREQUENCY, {true}, 10, spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30, 4).get(); + + ASSERT_EQ(1, results["found"].get()); + ASSERT_EQ("he did not receive grades for the courses.", + results["hits"][0]["highlight"]["passages"][0]["text"]["snippet"].get()); +} + TEST_F(CollectionNestedFieldsTest, OptionalNestedOptionalOjectArrStringField) { nlohmann::json schema = R"({ "name": "coll1",