mirror of https://github.com/typesense/typesense.git (synced 2025-05-20 13:42:26 +08:00)
Fix highlighting for nested arrays of strings.
parent eb36846f17
commit 99e11064a6
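
In brief, as read from the diff below: handle_highlight_text() gains a const bool is_arr_obj_ele parameter marking the text being highlighted as an element of a string array nested inside an array of objects. For such elements the recorded match offsets (the best matched window) cannot be relied on, so the function now exhaustively checks each token of the text against the query tokens, anchors the snippet at the first such hit, and skips the offset-based snippet-end and early-exit shortcuts. A regression test (NestedStringArrayHighlight) exercises the fix; a standalone sketch of the new matching condition follows the @@ -4267 hunk.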
@@ -288,7 +288,7 @@ private:
     void populate_text_match_info(nlohmann::json& info, uint64_t match_score, const text_match_type_t match_type,
                                   const size_t total_tokens) const;
 
-    bool handle_highlight_text(std::string& text, bool normalise, const field &search_field,
+    bool handle_highlight_text(std::string& text, bool normalise, const field &search_field, const bool is_arr_obj_ele,
                                const std::vector<char>& symbols_to_index, const std::vector<char>& token_separators,
                                highlight_t& highlight, StringUtils & string_utils, bool use_word_tokenizer,
                                const size_t highlight_affix_num_tokens,
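
The flag is then threaded through to every call site: paths that handle plain string fields pass false (the @@ -3072 and @@ -4165 hunks below), while the nested object-array element path inside highlight_result() forwards its is_arr_obj_ele flag (the @@ -4071 hunk).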

@@ -3072,7 +3072,7 @@ Option<nlohmann::json> Collection::search(std::string raw_query,
                 index_symbols[uint8_t(c)] = 1;
             }
 
-            handle_highlight_text(value, normalise, the_field, symbols_to_index, token_separators,
+            handle_highlight_text(value, normalise, the_field, false, symbols_to_index, token_separators,
                                   highlight, string_utils, use_word_tokenizer,
                                   highlight_affix_num_tokens, qtoken_leaves, last_valid_offset_index,
                                   prefix_token_num_chars, false, snippet_threshold, false, ftokens,

@@ -4071,7 +4071,7 @@ void Collection::highlight_result(const std::string& raw_query, const field &sea
                 std::string text = h_obj.get<std::string>();
                 h_obj = nlohmann::json::object();
 
-                handle_highlight_text(text, normalise, search_field, symbols_to_index,
+                handle_highlight_text(text, normalise, search_field, is_arr_obj_ele, symbols_to_index,
                                       token_separators, array_highlight, string_utils, use_word_tokenizer,
                                       highlight_affix_num_tokens,
                                       qtoken_leaves, last_valid_offset_index,

@@ -4165,7 +4165,7 @@ void Collection::highlight_result(const std::string& raw_query, const field &sea
             text = document[search_field.name][match_index.index];
         }
 
-        handle_highlight_text(text, normalise, search_field, symbols_to_index, token_separators,
+        handle_highlight_text(text, normalise, search_field, false, symbols_to_index, token_separators,
                               highlight, string_utils, use_word_tokenizer, highlight_affix_num_tokens,
                               qtoken_leaves, last_valid_offset_index, prefix_token_num_chars,
                               highlight_fully, snippet_threshold, is_infix_search, raw_query_tokens,

@@ -4195,7 +4195,8 @@ void Collection::highlight_result(const std::string& raw_query, const field &sea
 }
 
 bool Collection::handle_highlight_text(std::string& text, bool normalise, const field &search_field,
-                                       const std::vector<char>& symbols_to_index, const std::vector<char>& token_separators,
+                                       const bool is_arr_obj_ele, const std::vector<char>& symbols_to_index,
+                                       const std::vector<char>& token_separators,
                                        highlight_t& highlight, StringUtils & string_utils, bool use_word_tokenizer,
                                        const size_t highlight_affix_num_tokens,
                                        const tsl::htrie_map<char, token_leaf>& qtoken_leaves, int last_valid_offset_index,

@@ -4267,8 +4268,10 @@ bool Collection::handle_highlight_text(std::string& text, bool normalise, const
 
         // Token might not appear in the best matched window, which is limited to a size of 10.
         // If field is marked to be highlighted fully, or field length exceeds snippet_threshold, we will
-        // locate all tokens that appear in the query / query candidates
-        bool raw_token_found = !match_offset_found && (highlight_fully || text_len < snippet_threshold * 6) &&
+        // locate all tokens that appear in the query / query candidates. Likewise, text within a nested array
+        // of objects has to be exhaustively searched for highlight tokens.
+        bool raw_token_found = !match_offset_found &&
+                               (highlight_fully || is_arr_obj_ele || text_len < snippet_threshold * 6) &&
                                qtoken_leaves.find(raw_token) != qtoken_leaves.end();
 
         if (match_offset_found || raw_token_found) {
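
To make the new predicate concrete, here is a minimal, self-contained sketch — not Typesense code; the function name and the std::set standing in for qtoken_leaves are illustrative — of when a token now qualifies for highlighting:

#include <iostream>
#include <set>
#include <string>

// A token is highlighted either because it falls inside the best matched
// window (match_offset_found), or because it matches a query token outright;
// the latter path now also applies to elements of a nested array of objects.
bool should_highlight(bool match_offset_found, bool highlight_fully, bool is_arr_obj_ele,
                      size_t text_len, size_t snippet_threshold,
                      const std::set<std::string>& query_tokens, const std::string& raw_token) {
    bool raw_token_found = !match_offset_found &&
                           (highlight_fully || is_arr_obj_ele || text_len < snippet_threshold * 6) &&
                           query_tokens.count(raw_token) != 0;
    return match_offset_found || raw_token_found;
}

int main() {
    std::set<std::string> query_tokens{"grades"};
    // A long nested-array element: no recorded match offset, and the text is
    // longer than snippet_threshold * 6. Without the flag the token is skipped;
    // with it, the token is found by the exhaustive check.
    std::cout << should_highlight(false, false, false, 500, 30, query_tokens, "grades") << "\n"; // 0
    std::cout << should_highlight(false, false, true,  500, 30, query_tokens, "grades") << "\n"; // 1
}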

@@ -4325,6 +4328,12 @@ bool Collection::handle_highlight_text(std::string& text, bool normalise, const
                     snippet_start_offset = snippet_start_window.front();
                 }
 
                 found_first_match = true;
+            } else if(raw_token_found && is_arr_obj_ele) {
+                if(!found_first_match) {
+                    snippet_start_offset = snippet_start_window.front();
+                }
+
+                found_first_match = true;
             }
         } else if(is_infix_search && text.size() < 100 &&
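
The new else-if mirrors the offset branch above it: the first raw-token hit inside a nested-array element anchors the snippet start at the front of the sliding start-offset window, so the snippet keeps a little left context. A rough, hypothetical model of that anchoring (the deque and window size are simplified stand-ins for snippet_start_window):

#include <deque>
#include <iostream>
#include <sstream>
#include <string>

int main() {
    std::string text = "he did not receive grades for the courses.";
    std::istringstream in(text);
    std::deque<size_t> start_window;   // recent token start offsets
    const size_t window_size = 3;
    std::string tok;
    size_t pos = 0, snippet_start = 0;
    bool found_first_match = false;
    while (in >> tok) {
        size_t tok_start = text.find(tok, pos);
        start_window.push_back(tok_start);
        if (start_window.size() > window_size) {
            start_window.pop_front();
        }
        if (!found_first_match && tok == "grades") {
            // the first match anchors the snippet at the oldest offset in the window
            snippet_start = start_window.front();
            found_first_match = true;
        }
        pos = tok_start + tok.size();
    }
    std::cout << text.substr(snippet_start) << "\n"; // "not receive grades for the courses."
}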

@@ -4333,7 +4342,7 @@ bool Collection::handle_highlight_text(std::string& text, bool normalise, const
             token_hits.insert(raw_token);
         }
 
-        if(raw_token_index >= last_valid_offset + highlight_affix_num_tokens) {
+        if(last_valid_offset_index != -1 && raw_token_index >= last_valid_offset + highlight_affix_num_tokens) {
            // register end of highlight snippet
            if(snippet_end_offset == text.size() - 1) {
                snippet_end_offset = tok_end;
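
The added last_valid_offset_index != -1 guard appears to protect the snippet-end bookkeeping when no match offsets were recorded at all — precisely the raw-token-only situation of a nested-array element — so that last_valid_offset is never consulted while it is meaningless.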

@@ -4349,7 +4358,7 @@ bool Collection::handle_highlight_text(std::string& text, bool normalise, const
         if(raw_token_index >= snippet_threshold &&
            match_offset_index > last_valid_offset_index &&
            raw_token_index >= last_valid_offset + highlight_affix_num_tokens &&
-           !highlight_fully) {
+           !is_arr_obj_ele && !highlight_fully) {
             break;
         }
     }
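
Relatedly, the early break that stops scanning once enough tokens past the last valid offset have been seen is now also suppressed for nested-array elements, for the same reason it is suppressed for highlight_fully: a qualifying raw token can occur anywhere in the element, so the whole text must be scanned.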

@@ -1487,6 +1487,39 @@ TEST_F(CollectionNestedFieldsTest, ExplicitSchemaForNestedArrayTypeValidation) {
               "Hint: field inside an array of objects must be an array type as well.", add_op.error());
 }
 
+TEST_F(CollectionNestedFieldsTest, NestedStringArrayHighlight) {
+    nlohmann::json schema = R"({
+        "name": "coll1",
+        "enable_nested_fields": true,
+        "fields": [
+            {"name": "passages", "type": "object[]"},
+            {"name": "passages.text", "type": "string[]"}
+        ]
+    })"_json;
+
+    auto op = collectionManager.create_collection(schema);
+    ASSERT_TRUE(op.ok());
+    Collection* coll1 = op.get();
+
+    auto doc_str = std::string (R"({"passages": [{"text": "In January 1880, two of Tesla's uncles put together enough money to help him )") +
+                   "leave Gospić for Prague, where he was to study. He arrived too late to enroll at Charles-Ferdinand " +
+                   "University; he had never studied Greek, a required subject; and he was illiterate in Czech, another " +
+                   "required subject. Tesla did, however, attend lectures in philosophy at the university as an auditor " +
+                   "but he did not receive grades for the courses." + R"("}]})";
+
+    auto doc1 = nlohmann::json::parse(doc_str);
+    coll1->add(doc1.dump(), CREATE);
+
+    auto results = coll1->search("grades", {"passages.text"},
+                                 "", {}, {}, {0}, 10, 1,
+                                 token_ordering::FREQUENCY, {true}, 10, spp::sparse_hash_set<std::string>(),
+                                 spp::sparse_hash_set<std::string>(), 10, "", 30, 4).get();
+
+    ASSERT_EQ(1, results["found"].get<size_t>());
+    ASSERT_EQ("he did not receive <mark>grades</mark> for the courses.",
+              results["hits"][0]["highlight"]["passages"][0]["text"]["snippet"].get<std::string>());
+}
+
 TEST_F(CollectionNestedFieldsTest, OptionalNestedOptionalOjectArrStringField) {
     nlohmann::json schema = R"({
         "name": "coll1",
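
The regression test indexes a single document whose passages.text element is a long paragraph, searches for "grades" — a token near the end of the element — and asserts both the hit count and the exact snippet "he did not receive <mark>grades</mark> for the courses.", a case where the token presumably fell outside the best matched window and went unhighlighted before this fix.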