From 99e11064a69e066ab4b38aa67590fabd1bc0a04a Mon Sep 17 00:00:00 2001
From: Kishore Nallan
Date: Sun, 14 Jul 2024 17:24:45 +0530
Subject: [PATCH] Fix highlighting for nested array of strings.

---
 include/collection.h                   |  2 +-
 src/collection.cpp                     | 25 ++++++++++++-------
 test/collection_nested_fields_test.cpp | 33 ++++++++++++++++++++++++++
 3 files changed, 51 insertions(+), 9 deletions(-)

diff --git a/include/collection.h b/include/collection.h
index 8ebd0e88..fb0b6fd4 100644
--- a/include/collection.h
+++ b/include/collection.h
@@ -288,7 +288,7 @@ private:
     void populate_text_match_info(nlohmann::json& info, uint64_t match_score, const text_match_type_t match_type,
                                   const size_t total_tokens) const;
 
-    bool handle_highlight_text(std::string& text, bool normalise, const field &search_field,
+    bool handle_highlight_text(std::string& text, bool normalise, const field &search_field, const bool is_arr_obj_ele,
                                const std::vector<char>& symbols_to_index, const std::vector<char>& token_separators,
                                highlight_t& highlight, StringUtils & string_utils, bool use_word_tokenizer,
                                const size_t highlight_affix_num_tokens,
diff --git a/src/collection.cpp b/src/collection.cpp
index ba521d9f..dfac82f7 100644
--- a/src/collection.cpp
+++ b/src/collection.cpp
@@ -3072,7 +3072,7 @@ Option<nlohmann::json> Collection::search(std::string raw_query,
             index_symbols[uint8_t(c)] = 1;
         }
 
-        handle_highlight_text(value, normalise, the_field, symbols_to_index, token_separators,
+        handle_highlight_text(value, normalise, the_field, false, symbols_to_index, token_separators,
                               highlight, string_utils, use_word_tokenizer, highlight_affix_num_tokens,
                               qtoken_leaves, last_valid_offset_index, prefix_token_num_chars,
                               false, snippet_threshold, false, ftokens,
@@ -4071,7 +4071,7 @@ void Collection::highlight_result(const std::string& raw_query, const field &sea
                 std::string text = h_obj.get<std::string>();
                 h_obj = nlohmann::json::object();
 
-                handle_highlight_text(text, normalise, search_field, symbols_to_index,
+                handle_highlight_text(text, normalise, search_field, is_arr_obj_ele, symbols_to_index,
                                       token_separators, array_highlight, string_utils, use_word_tokenizer,
                                       highlight_affix_num_tokens, qtoken_leaves, last_valid_offset_index,
@@ -4165,7 +4165,7 @@ void Collection::highlight_result(const std::string& raw_query, const field &sea
             text = document[search_field.name][match_index.index];
         }
 
-        handle_highlight_text(text, normalise, search_field, symbols_to_index, token_separators,
+        handle_highlight_text(text, normalise, search_field, false, symbols_to_index, token_separators,
                               highlight, string_utils, use_word_tokenizer, highlight_affix_num_tokens,
                               qtoken_leaves, last_valid_offset_index, prefix_token_num_chars, highlight_fully,
                               snippet_threshold, is_infix_search, raw_query_tokens,
@@ -4195,7 +4195,8 @@ void Collection::highlight_result(const std::string& raw_query, const field &sea
 }
 
 bool Collection::handle_highlight_text(std::string& text, bool normalise, const field &search_field,
-                                       const std::vector<char>& symbols_to_index, const std::vector<char>& token_separators,
+                                       const bool is_arr_obj_ele, const std::vector<char>& symbols_to_index,
+                                       const std::vector<char>& token_separators,
                                        highlight_t& highlight, StringUtils & string_utils, bool use_word_tokenizer,
                                        const size_t highlight_affix_num_tokens,
                                        const tsl::htrie_map<char, token_leaf>& qtoken_leaves, int last_valid_offset_index,
@@ -4267,8 +4268,10 @@ bool Collection::handle_highlight_text(std::string& text, bool normalise, const
 
         // Token might not appear in the best matched window, which is limited to a size of 10.
         // If field is marked to be highlighted fully, or field length exceeds snippet_threshold, we will
-        // locate all tokens that appear in the query / query candidates
-        bool raw_token_found = !match_offset_found && (highlight_fully || text_len < snippet_threshold * 6) &&
+        // locate all tokens that appear in the query / query candidates. Likewise, text within a nested array
+        // of objects has to be exhaustively scanned for highlight tokens.
+        bool raw_token_found = !match_offset_found &&
+                               (highlight_fully || is_arr_obj_ele || text_len < snippet_threshold * 6) &&
                                qtoken_leaves.find(raw_token) != qtoken_leaves.end();
 
         if (match_offset_found || raw_token_found) {
@@ -4325,6 +4328,12 @@ bool Collection::handle_highlight_text(std::string& text, bool normalise, const
                     snippet_start_offset = snippet_start_window.front();
                 }
 
+                found_first_match = true;
+            } else if(raw_token_found && is_arr_obj_ele) {
+                if(!found_first_match) {
+                    snippet_start_offset = snippet_start_window.front();
+                }
+
                 found_first_match = true;
             }
         } else if(is_infix_search && text.size() < 100 &&
@@ -4333,7 +4342,7 @@ bool Collection::handle_highlight_text(std::string& text, bool normalise, const
             token_hits.insert(raw_token);
         }
 
-        if(raw_token_index >= last_valid_offset + highlight_affix_num_tokens) {
+        if(last_valid_offset_index != -1 && raw_token_index >= last_valid_offset + highlight_affix_num_tokens) {
             // register end of highlight snippet
             if(snippet_end_offset == text.size() - 1) {
                 snippet_end_offset = tok_end;
@@ -4349,7 +4358,7 @@ bool Collection::handle_highlight_text(std::string& text, bool normalise, const
         if(raw_token_index >= snippet_threshold &&
            match_offset_index > last_valid_offset_index &&
            raw_token_index >= last_valid_offset + highlight_affix_num_tokens &&
-           !highlight_fully) {
+           !is_arr_obj_ele && !highlight_fully) {
             break;
         }
     }
diff --git a/test/collection_nested_fields_test.cpp b/test/collection_nested_fields_test.cpp
index 503f9034..dd8f37c2 100644
--- a/test/collection_nested_fields_test.cpp
+++ b/test/collection_nested_fields_test.cpp
@@ -1487,6 +1487,39 @@ TEST_F(CollectionNestedFieldsTest, ExplicitSchemaForNestedArrayTypeValidation) {
               "Hint: field inside an array of objects must be an array type as well.", add_op.error());
 }
 
+TEST_F(CollectionNestedFieldsTest, NestedStringArrayHighlight) {
+    nlohmann::json schema = R"({
+        "name": "coll1",
+        "enable_nested_fields": true,
+        "fields": [
+          {"name": "passages", "type": "object[]"},
+          {"name": "passages.text", "type": "string[]"}
+        ]
+    })"_json;
+
+    auto op = collectionManager.create_collection(schema);
+    ASSERT_TRUE(op.ok());
+    Collection* coll1 = op.get();
+
+    auto doc_str = std::string (R"({"passages": [{"text": "In January 1880, two of Tesla's uncles put together enough money to help him )") +
+                   "leave Gospić for Prague, where he was to study. He arrived too late to enroll at Charles-Ferdinand " +
+                   "University; he had never studied Greek, a required subject; and he was illiterate in Czech, another " +
+                   "required subject. Tesla did, however, attend lectures in philosophy at the university as an auditor " +
+                   "but he did not receive grades for the courses."
+ R"("}]})"; + + auto doc1 = nlohmann::json::parse(doc_str); + coll1->add(doc1.dump(), CREATE); + + auto results = coll1->search("grades", {"passages.text"}, + "", {}, {}, {0}, 10, 1, + token_ordering::FREQUENCY, {true}, 10, spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30, 4).get(); + + ASSERT_EQ(1, results["found"].get()); + ASSERT_EQ("he did not receive grades for the courses.", + results["hits"][0]["highlight"]["passages"][0]["text"]["snippet"].get()); +} + TEST_F(CollectionNestedFieldsTest, OptionalNestedOptionalOjectArrStringField) { nlohmann::json schema = R"({ "name": "coll1",