Fix highlighting for nested array of strings.

This commit is contained in:
Kishore Nallan 2024-07-14 17:24:45 +05:30
parent eb36846f17
commit 99e11064a6
3 changed files with 51 additions and 9 deletions

View File

@ -288,7 +288,7 @@ private:
void populate_text_match_info(nlohmann::json& info, uint64_t match_score, const text_match_type_t match_type,
const size_t total_tokens) const;
bool handle_highlight_text(std::string& text, bool normalise, const field &search_field,
bool handle_highlight_text(std::string& text, bool normalise, const field &search_field, const bool is_arr_obj_ele,
const std::vector<char>& symbols_to_index, const std::vector<char>& token_separators,
highlight_t& highlight, StringUtils & string_utils, bool use_word_tokenizer,
const size_t highlight_affix_num_tokens,

View File

@ -3072,7 +3072,7 @@ Option<nlohmann::json> Collection::search(std::string raw_query,
index_symbols[uint8_t(c)] = 1;
}
handle_highlight_text(value, normalise, the_field, symbols_to_index, token_separators,
handle_highlight_text(value, normalise, the_field, false, symbols_to_index, token_separators,
highlight, string_utils, use_word_tokenizer,
highlight_affix_num_tokens, qtoken_leaves, last_valid_offset_index,
prefix_token_num_chars, false, snippet_threshold, false, ftokens,
@ -4071,7 +4071,7 @@ void Collection::highlight_result(const std::string& raw_query, const field &sea
std::string text = h_obj.get<std::string>();
h_obj = nlohmann::json::object();
handle_highlight_text(text, normalise, search_field, symbols_to_index,
handle_highlight_text(text, normalise, search_field, is_arr_obj_ele, symbols_to_index,
token_separators, array_highlight, string_utils, use_word_tokenizer,
highlight_affix_num_tokens,
qtoken_leaves, last_valid_offset_index,
@ -4165,7 +4165,7 @@ void Collection::highlight_result(const std::string& raw_query, const field &sea
text = document[search_field.name][match_index.index];
}
handle_highlight_text(text, normalise, search_field, symbols_to_index, token_separators,
handle_highlight_text(text, normalise, search_field, false, symbols_to_index, token_separators,
highlight, string_utils, use_word_tokenizer, highlight_affix_num_tokens,
qtoken_leaves, last_valid_offset_index, prefix_token_num_chars,
highlight_fully, snippet_threshold, is_infix_search, raw_query_tokens,
@ -4195,7 +4195,8 @@ void Collection::highlight_result(const std::string& raw_query, const field &sea
}
bool Collection::handle_highlight_text(std::string& text, bool normalise, const field &search_field,
const std::vector<char>& symbols_to_index, const std::vector<char>& token_separators,
const bool is_arr_obj_ele, const std::vector<char>& symbols_to_index,
const std::vector<char>& token_separators,
highlight_t& highlight, StringUtils & string_utils, bool use_word_tokenizer,
const size_t highlight_affix_num_tokens,
const tsl::htrie_map<char, token_leaf>& qtoken_leaves, int last_valid_offset_index,
@ -4267,8 +4268,10 @@ bool Collection::handle_highlight_text(std::string& text, bool normalise, const
// Token might not appear in the best matched window, which is limited to a size of 10.
// If field is marked to be highlighted fully, or field length exceeds snippet_threshold, we will
// locate all tokens that appear in the query / query candidates
bool raw_token_found = !match_offset_found && (highlight_fully || text_len < snippet_threshold * 6) &&
// locate all tokens that appear in the query / query candidates. Likewise, text within a nested array of
// objects has to be searched exhaustively for highlight tokens.
bool raw_token_found = !match_offset_found &&
(highlight_fully || is_arr_obj_ele || text_len < snippet_threshold * 6) &&
qtoken_leaves.find(raw_token) != qtoken_leaves.end();
if (match_offset_found || raw_token_found) {
@ -4325,6 +4328,12 @@ bool Collection::handle_highlight_text(std::string& text, bool normalise, const
snippet_start_offset = snippet_start_window.front();
}
found_first_match = true;
} else if(raw_token_found && is_arr_obj_ele) {
if(!found_first_match) {
snippet_start_offset = snippet_start_window.front();
}
found_first_match = true;
}
} else if(is_infix_search && text.size() < 100 &&
@ -4333,7 +4342,7 @@ bool Collection::handle_highlight_text(std::string& text, bool normalise, const
token_hits.insert(raw_token);
}
if(raw_token_index >= last_valid_offset + highlight_affix_num_tokens) {
if(last_valid_offset_index != -1 && raw_token_index >= last_valid_offset + highlight_affix_num_tokens) {
// register end of highlight snippet
if(snippet_end_offset == text.size() - 1) {
snippet_end_offset = tok_end;
@ -4349,7 +4358,7 @@ bool Collection::handle_highlight_text(std::string& text, bool normalise, const
if(raw_token_index >= snippet_threshold &&
match_offset_index > last_valid_offset_index &&
raw_token_index >= last_valid_offset + highlight_affix_num_tokens &&
!highlight_fully) {
!is_arr_obj_ele && !highlight_fully) {
break;
}
}

View File

@ -1487,6 +1487,39 @@ TEST_F(CollectionNestedFieldsTest, ExplicitSchemaForNestedArrayTypeValidation) {
"Hint: field inside an array of objects must be an array type as well.", add_op.error());
}
// Regression test for highlighting a string[] field nested inside an object[] field.
// The query token ("grades") occurs only near the very end of a long passage, and the test
// expects the snippet to be built around that late match.
TEST_F(CollectionNestedFieldsTest, NestedStringArrayHighlight) {
    nlohmann::json schema = R"({
        "name": "coll1",
        "enable_nested_fields": true,
        "fields": [
            {"name": "passages", "type": "object[]"},
            {"name": "passages.text", "type": "string[]"}
        ]
    })"_json;

    auto op = collectionManager.create_collection(schema);
    ASSERT_TRUE(op.ok());
    Collection* coll1 = op.get();

    // Single document whose only passage is long; the word being searched for is in its last sentence.
    auto doc_str = std::string (R"({"passages": [{"text": "In January 1880, two of Tesla's uncles put together enough money to help him )") +
                   "leave Gospić for Prague, where he was to study. He arrived too late to enroll at Charles-Ferdinand " +
                   "University; he had never studied Greek, a required subject; and he was illiterate in Czech, another " +
                   "required subject. Tesla did, however, attend lectures in philosophy at the university as an auditor " +
                   "but he did not receive grades for the courses." + R"("}]})";

    auto doc1 = nlohmann::json::parse(doc_str);

    // Fail right here if indexing is rejected, instead of at a later, less obvious assertion.
    auto add_op = coll1->add(doc1.dump(), CREATE);
    ASSERT_TRUE(add_op.ok());

    // NOTE(review): the trailing `30, 4` arguments are presumably snippet_threshold and
    // highlight_affix_num_tokens -- confirm against the Collection::search signature.
    auto search_op = coll1->search("grades", {"passages.text"},
                                   "", {}, {}, {0}, 10, 1,
                                   token_ordering::FREQUENCY, {true}, 10, spp::sparse_hash_set<std::string>(),
                                   spp::sparse_hash_set<std::string>(), 10, "", 30, 4);

    // Verify the search itself succeeded before unwrapping its result.
    ASSERT_TRUE(search_op.ok());
    auto results = search_op.get();

    ASSERT_EQ(1, results["found"].get<size_t>());
    ASSERT_EQ("he did not receive <mark>grades</mark> for the courses.",
              results["hits"][0]["highlight"]["passages"][0]["text"]["snippet"].get<std::string>());
}
TEST_F(CollectionNestedFieldsTest, OptionalNestedOptionalOjectArrStringField) {
nlohmann::json schema = R"({
"name": "coll1",