Handle the zero-width non-joiner (ZWNJ, U+200C) character for Persian.

For the "fa" locale, the ZWNJ must act as a word separator, so that text containing it is split into individual word tokens.
2025-05-19 05:08:43 +08:00 · 2023-10-11 15:02:22 +05:30 · 2023-10-11 15:02:22 +05:30 · ff0d2596cc
commit ff0d2596cc
parent 40a684619b
2 changed files with 12 additions and 0 deletions
--- a/src/tokenizer.cpp
+++ b/src/tokenizer.cpp
@@ -87,6 +87,13 @@ void Tokenizer::init(const std::string& input) {
        }

        unicode_text = icu::UnicodeString::fromUTF8(text);
+
+        if(locale == "fa") {
+            icu::UnicodeString target_str;
+            target_str.setTo(0x200C);  // U+200C (ZERO WIDTH NON-JOINER)
+            unicode_text.findAndReplace(target_str, " ");
+        }
+
        bi->setText(unicode_text);

        start_pos = bi->first();
--- a/test/tokenizer_test.cpp
+++ b/test/tokenizer_test.cpp
@@ -323,6 +323,11 @@ TEST(TokenizerTest, ShouldTokenizeLocaleText) {

    tokens.clear();  // 配管
    Tokenizer("配管", true, false, "ja").tokenize(tokens);
+
+    // persian containing zwnj
+    tokens.clear();
+    Tokenizer("روان\u200Cشناسی", false, false, "fa").tokenize(tokens);
+    ASSERT_EQ(2, tokens.size());
 }

 TEST(TokenizerTest, ShouldTokenizeLocaleTextWithEnglishText) {