Don't highlight very large docs (more than 64K words).

This commit is contained in:
Kishore Nallan 2024-01-28 15:36:07 +05:30
parent ec4311635e
commit f9242dd4a5
2 changed files with 33 additions and 1 deletion

View File

@ -4067,7 +4067,8 @@ bool Collection::handle_highlight_text(std::string& text, bool normalise, const
// ensures that the `snippet_start_offset` is always from a matched token, and not from query suggestion
bool match_offset_found = (found_first_match && token_already_found) ||
(match_offset_index <= last_valid_offset_index &&
match.offsets[match_offset_index].offset == raw_token_index);
match.offsets[match_offset_index].offset == raw_token_index &&
text_len/4 < 64000);
// Token might not appear in the best matched window, which is limited to a size of 10.
// If field is marked to be highlighted fully, or field length exceeds snippet_threshold, we will

View File

@ -2761,6 +2761,37 @@ TEST_F(CollectionSpecificMoreTest, DisableTyposForNumericalTokens) {
ASSERT_EQ(2, res_op.get()["hits"].size());
}
TEST_F(CollectionSpecificMoreTest, DisableHighlightForLongFields) {
    // Highlighting should be skipped entirely for fields whose length
    // crosses the internal threshold (~64K words), even when the query
    // token is present in the document.
    nlohmann::json schema = R"({
        "name": "coll1",
        "fields": [
            {"name": "description", "type": "string"}
        ]
    })"_json;

    Collection* coll = collectionManager.create_collection(schema).get();

    // Build a very long field value: 100k random 4-char words followed by
    // the one token we will actually search for.
    std::string long_text;
    const size_t num_words = 100 * 1000;
    for(size_t word_i = 0; word_i < num_words; word_i++) {
        long_text += StringUtils::randstring(4) + " ";
    }
    long_text += "foobar";

    nlohmann::json doc;
    doc["description"] = long_text;
    ASSERT_TRUE(coll->add(doc.dump()).ok());

    // The document matches, but no highlight should be produced because
    // the field is too large to highlight.
    auto result = coll->search("foobar", {"description"}, "", {},
                               {}, {2}, 10, 1,FREQUENCY, {true},
                               Index::DROP_TOKENS_THRESHOLD, spp::sparse_hash_set<std::string>(),
                               spp::sparse_hash_set<std::string>(), 10, "");
    ASSERT_TRUE(result.ok());
    ASSERT_EQ(1, result.get()["hits"].size());
    ASSERT_EQ(0, result.get()["hits"][0]["highlight"].size());
}
TEST_F(CollectionSpecificMoreTest, TestStemming) {
nlohmann::json schema = R"({
"name": "test",