diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp
index 5688e27d..eb80b09f 100644
--- a/src/tokenizer.cpp
+++ b/src/tokenizer.cpp
@@ -87,6 +87,13 @@ void Tokenizer::init(const std::string& input) {
     }
 
     unicode_text = icu::UnicodeString::fromUTF8(text);
+
+    if(locale == "fa") {
+        icu::UnicodeString target_str;
+        target_str.setTo(0x200C);  // U+200C (ZERO WIDTH NON-JOINER)
+        unicode_text.findAndReplace(target_str, " ");
+    }
+
     bi->setText(unicode_text);
 
     start_pos = bi->first();
diff --git a/test/tokenizer_test.cpp b/test/tokenizer_test.cpp
index b141e97c..054df18b 100644
--- a/test/tokenizer_test.cpp
+++ b/test/tokenizer_test.cpp
@@ -323,6 +323,11 @@ TEST(TokenizerTest, ShouldTokenizeLocaleText) {
     tokens.clear();
     // 配管
     Tokenizer("配管", true, false, "ja").tokenize(tokens);
+
+    // persian containing zwnj
+    tokens.clear();
+    Tokenizer("روان\u200Cشناسی", false, false, "fa").tokenize(tokens);
+    ASSERT_EQ(2, tokens.size());
 }
 
 TEST(TokenizerTest, ShouldTokenizeLocaleTextWithEnglishText) {