From ff0d2596cc9340296e76dfcb340fa8e0fbd66a24 Mon Sep 17 00:00:00 2001
From: Kishore Nallan
Date: Wed, 11 Oct 2023 15:02:22 +0530
Subject: [PATCH] Handle zero width non-joiner character for Persian.

It must split the tokens into individual words.
---
 src/tokenizer.cpp       | 7 +++++++
 test/tokenizer_test.cpp | 5 +++++
 2 files changed, 12 insertions(+)

diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp
index 5688e27d..eb80b09f 100644
--- a/src/tokenizer.cpp
+++ b/src/tokenizer.cpp
@@ -87,6 +87,13 @@ void Tokenizer::init(const std::string& input) {
     }
 
     unicode_text = icu::UnicodeString::fromUTF8(text);
+
+    if(locale == "fa") {
+        icu::UnicodeString target_str;
+        target_str.setTo(0x200C); // U+200C (ZERO WIDTH NON-JOINER)
+        unicode_text.findAndReplace(target_str, " ");
+    }
+
     bi->setText(unicode_text);
 
     start_pos = bi->first();
diff --git a/test/tokenizer_test.cpp b/test/tokenizer_test.cpp
index b141e97c..054df18b 100644
--- a/test/tokenizer_test.cpp
+++ b/test/tokenizer_test.cpp
@@ -323,6 +323,11 @@ TEST(TokenizerTest, ShouldTokenizeLocaleText) {
     tokens.clear();
     // 配管
     Tokenizer("配管", true, false, "ja").tokenize(tokens);
+
+    // persian containing zwnj
+    tokens.clear();
+    Tokenizer("روان\u200Cشناسی", false, false, "fa").tokenize(tokens);
+    ASSERT_EQ(2, tokens.size());
 }
 
 TEST(TokenizerTest, ShouldTokenizeLocaleTextWithEnglishText) {