From cfbcbfc6fb1e9b28ce1a8c19e8ee1062e0943afc Mon Sep 17 00:00:00 2001
From: Kishore Nallan
Date: Mon, 6 Mar 2023 13:31:38 +0530
Subject: [PATCH] Handle special characters in prefix highlighting.

---
Reviewer notes (this region is ignored when the patch is applied):

* Tokenizer::should_skip_char(c) returns true when is_ascii_char(c) holds and
  get_stream_mode(c) is anything other than INDEX.
* In Collection::handle_highlight_text, the new check runs right after `k++`;
  when text[k] is such a character the loop `continue`s, bypassing the three
  UTF-8 continuation-byte checks and the rest of the loop body (not visible
  in this hunk).
* This copy of the patch had angle-bracketed text stripped. The template
  arguments on spp::sparse_hash_set and on the final .get() call in the new
  test are restored as <std::string> so the test compiles.
* NOTE(review): the two empty-string arguments after `0` in the search() call
  and the expected snippet (which contains no highlight markers) may have
  lost <mark>-style tags the same way -- confirm against the upstream commit.

 include/tokenizer.h                    |  2 ++
 src/collection.cpp                     |  6 ++++++
 src/tokenizer.cpp                      |  4 ++++
 test/collection_specific_more_test.cpp | 27 ++++++++++++++++++++++++++
 4 files changed, 39 insertions(+)

diff --git a/include/tokenizer.h b/include/tokenizer.h
index 53a2ab13..3700a7d3 100644
--- a/include/tokenizer.h
+++ b/include/tokenizer.h
@@ -85,4 +85,6 @@ public:
     }
 
     void decr_token_counter();
+
+    bool should_skip_char(char c);
 };
\ No newline at end of file
diff --git a/src/collection.cpp b/src/collection.cpp
index ff2a7f78..39d54ef6 100644
--- a/src/collection.cpp
+++ b/src/collection.cpp
@@ -2832,6 +2832,12 @@ bool Collection::handle_highlight_text(std::string& text, bool normalise, const
         // group unicode code points and calculate number of actual characters
         while(k <= tok_end) {
             k++;
+
+            if(tokenizer.should_skip_char(text[k])) {
+                // used to handle special characters inside a tokenized word, e.g. `foo-bar`
+                continue;
+            }
+
             if ((text[k] & 0xC0) == 0x80) k++;
             if ((text[k] & 0xC0) == 0x80) k++;
             if ((text[k] & 0xC0) == 0x80) k++;
diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp
index 082db1be..8488b24a 100644
--- a/src/tokenizer.cpp
+++ b/src/tokenizer.cpp
@@ -334,3 +334,7 @@ void Tokenizer::decr_token_counter() {
         token_counter--;
     }
 }
+
+bool Tokenizer::should_skip_char(char c) {
+    return is_ascii_char(c) && get_stream_mode(c) != INDEX;
+}
diff --git a/test/collection_specific_more_test.cpp b/test/collection_specific_more_test.cpp
index 553f902b..9593d33a 100644
--- a/test/collection_specific_more_test.cpp
+++ b/test/collection_specific_more_test.cpp
@@ -1338,6 +1338,33 @@ TEST_F(CollectionSpecificMoreTest, CopyDocHelper) {
     ASSERT_EQ(1, dst.count("foo.bar"));
 }
 
+TEST_F(CollectionSpecificMoreTest, HighlightWordWithSymbols) {
+    nlohmann::json schema = R"({
+        "name": "coll1",
+        "fields": [
+            {"name": "title", "type": "string"}
+        ]
+    })"_json;
+
+    Collection *coll1 = collectionManager.create_collection(schema).get();
+
+    nlohmann::json doc;
+    doc["title"] = "var(--icon-secondary-neutral); For components with";
+
+    ASSERT_TRUE(coll1->add(doc.dump()).ok());
+
+    auto res = coll1->search("favicon", {"title"}, "", {}, {}, {2}, 10, 1,
+                             FREQUENCY, {true},
+                             10, spp::sparse_hash_set<std::string>(),
+                             spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "locations.address",
+                             20, {}, {}, {}, 0, "", "", {}, 1000, true, false, true,
+                             "title").get();
+
+    ASSERT_EQ(1, res["hits"].size());
+    ASSERT_EQ("var(--icon-secondary-neutral); For components with",
+              res["hits"][0]["highlight"]["title"]["snippet"].get<std::string>());
+}
+
 TEST_F(CollectionSpecificMoreTest, HighlightObjectShouldBeEmptyWhenNoHighlightFieldFound) {
     nlohmann::json schema = R"({
         "name": "coll1",