Don't highlight very large docs (more than 64K words).

This commit is contained in:
Kishore Nallan 2024-01-28 15:36:07 +05:30
parent ec4311635e
commit f9242dd4a5
2 changed files with 33 additions and 1 deletion

View File

@ -4067,7 +4067,8 @@ bool Collection::handle_highlight_text(std::string& text, bool normalise, const
// ensures that the `snippet_start_offset` is always from a matched token, and not from query suggestion
bool match_offset_found = (found_first_match && token_already_found) ||
(match_offset_index <= last_valid_offset_index &&
match.offsets[match_offset_index].offset == raw_token_index);
match.offsets[match_offset_index].offset == raw_token_index &&
text_len/4 < 64000);
// Token might not appear in the best matched window, which is limited to a size of 10.
// If field is marked to be highlighted fully, or field length exceeds snippet_threshold, we will

View File

@ -2761,6 +2761,37 @@ TEST_F(CollectionSpecificMoreTest, DisableTyposForNumericalTokens) {
ASSERT_EQ(2, res_op.get()["hits"].size());
}
TEST_F(CollectionSpecificMoreTest, DisableHighlightForLongFields) {
    // Highlighting should be skipped entirely for fields whose length
    // crosses the internal threshold (~64K words), even when the query
    // token is present in the document.
    nlohmann::json schema = R"({
        "name": "coll1",
        "fields": [
            {"name": "description", "type": "string"}
        ]
    })"_json;

    Collection* coll = collectionManager.create_collection(schema).get();

    // Build a very long field value: 100k random 4-char words followed by
    // the one token we will actually search for.
    std::string long_text;
    const size_t num_words = 100 * 1000;
    for(size_t word_i = 0; word_i < num_words; word_i++) {
        long_text += StringUtils::randstring(4) + " ";
    }
    long_text += "foobar";

    nlohmann::json doc;
    doc["description"] = long_text;
    ASSERT_TRUE(coll->add(doc.dump()).ok());

    // The document matches, but no highlight should be produced because
    // the field is too large to highlight.
    auto result = coll->search("foobar", {"description"}, "", {},
                               {}, {2}, 10, 1,FREQUENCY, {true},
                               Index::DROP_TOKENS_THRESHOLD, spp::sparse_hash_set<std::string>(),
                               spp::sparse_hash_set<std::string>(), 10, "");
    ASSERT_TRUE(result.ok());
    ASSERT_EQ(1, result.get()["hits"].size());
    ASSERT_EQ(0, result.get()["hits"][0]["highlight"].size());
}
TEST_F(CollectionSpecificMoreTest, TestStemming) {
nlohmann::json schema = R"({
"name": "test",