Fix highlighting for nested array of string.

2025-05-20 13:42:26 +08:00 · 2024-07-14 17:24:45 +05:30 · 2024-07-14 17:24:45 +05:30 · 99e11064a6
commit 99e11064a6
parent eb36846f17
3 changed files with 51 additions and 9 deletions
--- a/include/collection.h
+++ b/include/collection.h
@ -288,7 +288,7 @@ private:
    void populate_text_match_info(nlohmann::json& info, uint64_t match_score, const text_match_type_t match_type,
                                  const size_t total_tokens) const;

-    bool handle_highlight_text(std::string& text, bool normalise, const field &search_field,
+    bool handle_highlight_text(std::string& text, bool normalise, const field &search_field, const bool is_arr_obj_ele,
                               const std::vector<char>& symbols_to_index, const std::vector<char>& token_separators,
                               highlight_t& highlight, StringUtils & string_utils, bool use_word_tokenizer,
                               const size_t highlight_affix_num_tokens,
--- a/src/collection.cpp
+++ b/src/collection.cpp
@ -3072,7 +3072,7 @@ Option<nlohmann::json> Collection::search(std::string raw_query,
                        index_symbols[uint8_t(c)] = 1;
                    }

-                    handle_highlight_text(value, normalise, the_field, symbols_to_index, token_separators,
+                    handle_highlight_text(value, normalise, the_field, false, symbols_to_index, token_separators,
                                          highlight, string_utils, use_word_tokenizer,
                                          highlight_affix_num_tokens, qtoken_leaves, last_valid_offset_index,
                                          prefix_token_num_chars, false, snippet_threshold, false, ftokens,
@ -4071,7 +4071,7 @@ void Collection::highlight_result(const std::string& raw_query, const field &sea
        std::string text = h_obj.get<std::string>();
        h_obj = nlohmann::json::object();

-        handle_highlight_text(text, normalise, search_field, symbols_to_index,
+        handle_highlight_text(text, normalise, search_field, is_arr_obj_ele, symbols_to_index,
                              token_separators, array_highlight, string_utils, use_word_tokenizer,
                              highlight_affix_num_tokens,
                              qtoken_leaves, last_valid_offset_index,
@ -4165,7 +4165,7 @@ void Collection::highlight_result(const std::string& raw_query, const field &sea
            text = document[search_field.name][match_index.index];
        }

-        handle_highlight_text(text, normalise, search_field, symbols_to_index, token_separators,
+        handle_highlight_text(text, normalise, search_field, false, symbols_to_index, token_separators,
                              highlight, string_utils, use_word_tokenizer, highlight_affix_num_tokens,
                              qtoken_leaves, last_valid_offset_index, prefix_token_num_chars,
                              highlight_fully, snippet_threshold, is_infix_search, raw_query_tokens,
@ -4195,7 +4195,8 @@ void Collection::highlight_result(const std::string& raw_query, const field &sea
 }

 bool Collection::handle_highlight_text(std::string& text, bool normalise, const field &search_field,
-                           const std::vector<char>& symbols_to_index, const std::vector<char>& token_separators,
+                           const bool is_arr_obj_ele, const std::vector<char>& symbols_to_index,
+                           const std::vector<char>& token_separators,
                           highlight_t& highlight, StringUtils & string_utils, bool use_word_tokenizer,
                           const size_t highlight_affix_num_tokens,
                           const tsl::htrie_map<char, token_leaf>& qtoken_leaves, int last_valid_offset_index,
@ -4267,8 +4268,10 @@ bool Collection::handle_highlight_text(std::string& text, bool normalise, const

        // Token might not appear in the best matched window, which is limited to a size of 10.
        // If field is marked to be highlighted fully, or field length exceeds snippet_threshold, we will
-        // locate all tokens that appear in the query / query candidates
-        bool raw_token_found = !match_offset_found && (highlight_fully || text_len < snippet_threshold * 6) &&
+        // locate all tokens that appear in the query / query candidates. Likewise, text within a nested array of
+        // objects has to be searched exhaustively for highlight tokens.
+        bool raw_token_found = !match_offset_found &&
+                                (highlight_fully || is_arr_obj_ele || text_len < snippet_threshold * 6) &&
                                qtoken_leaves.find(raw_token) != qtoken_leaves.end();

        if (match_offset_found || raw_token_found) {
@ -4325,6 +4328,12 @@ bool Collection::handle_highlight_text(std::string& text, bool normalise, const
                    snippet_start_offset = snippet_start_window.front();
                }

+                found_first_match = true;
+            } else if(raw_token_found && is_arr_obj_ele) {
+                if(!found_first_match) {
+                    snippet_start_offset = snippet_start_window.front();
+                }
+
                found_first_match = true;
            }
        } else if(is_infix_search && text.size() < 100 &&
@ -4333,7 +4342,7 @@ bool Collection::handle_highlight_text(std::string& text, bool normalise, const
            token_hits.insert(raw_token);
        }

-        if(raw_token_index >= last_valid_offset + highlight_affix_num_tokens) {
+        if(last_valid_offset_index != -1 && raw_token_index >= last_valid_offset + highlight_affix_num_tokens) {
            // register end of highlight snippet
            if(snippet_end_offset == text.size() - 1) {
                snippet_end_offset = tok_end;
@ -4349,7 +4358,7 @@ bool Collection::handle_highlight_text(std::string& text, bool normalise, const
        if(raw_token_index >= snippet_threshold &&
           match_offset_index > last_valid_offset_index &&
           raw_token_index >= last_valid_offset + highlight_affix_num_tokens &&
-           !highlight_fully) {
+           !is_arr_obj_ele && !highlight_fully) {
            break;
        }
    }
--- a/test/collection_nested_fields_test.cpp
+++ b/test/collection_nested_fields_test.cpp
@ -1487,6 +1487,39 @@ TEST_F(CollectionNestedFieldsTest, ExplicitSchemaForNestedArrayTypeValidation) {
              "Hint: field inside an array of objects must be an array type as well.", add_op.error());
 }

+TEST_F(CollectionNestedFieldsTest, NestedStringArrayHighlight) {
+    nlohmann::json schema = R"({
+        "name": "coll1",
+        "enable_nested_fields": true,
+        "fields": [
+            {"name": "passages", "type": "object[]"},
+            {"name": "passages.text", "type": "string[]"}
+        ]
+    })"_json;
+
+    auto op = collectionManager.create_collection(schema);
+    ASSERT_TRUE(op.ok());
+    Collection* coll1 = op.get();
+
+    auto doc_str = std::string (R"({"passages": [{"text": "In January 1880, two of Tesla's uncles put together enough money to help him )") +
+            "leave Gospić for Prague, where he was to study. He arrived too late to enroll at Charles-Ferdinand " +
+            "University; he had never studied Greek, a required subject; and he was illiterate in Czech, another " +
+            "required subject. Tesla did, however, attend lectures in philosophy at the university as an auditor " +
+            "but he did not receive grades for the courses." + R"("}]})";
+
+    auto doc1 = nlohmann::json::parse(doc_str);
+    coll1->add(doc1.dump(), CREATE);
+
+    auto results = coll1->search("grades", {"passages.text"},
+                                 "", {}, {}, {0}, 10, 1,
+                                 token_ordering::FREQUENCY, {true}, 10, spp::sparse_hash_set<std::string>(),
+                                 spp::sparse_hash_set<std::string>(), 10, "", 30, 4).get();
+
+    ASSERT_EQ(1, results["found"].get<size_t>());
+    ASSERT_EQ("he did not receive <mark>grades</mark> for the courses.",
+              results["hits"][0]["highlight"]["passages"][0]["text"]["snippet"].get<std::string>());
+}
+
 TEST_F(CollectionNestedFieldsTest, OptionalNestedOptionalOjectArrStringField) {
    nlohmann::json schema = R"({
            "name": "coll1",