From ff0d2596cc9340296e76dfcb340fa8e0fbd66a24 Mon Sep 17 00:00:00 2001
From: Kishore Nallan
Date: Wed, 11 Oct 2023 15:02:22 +0530
Subject: [PATCH] Handle zero width non-joiner character for Persian.

It must split the tokens into individual words.
---
 src/tokenizer.cpp       | 7 +++++++
 test/tokenizer_test.cpp | 5 +++++
 2 files changed, 12 insertions(+)

diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp
index 5688e27d..eb80b09f 100644
--- a/src/tokenizer.cpp
+++ b/src/tokenizer.cpp
@@ -87,6 +87,13 @@ void Tokenizer::init(const std::string& input) {
     }
 
     unicode_text = icu::UnicodeString::fromUTF8(text);
+
+    if(locale == "fa") {
+        icu::UnicodeString target_str;
+        target_str.setTo(0x200C); // U+200C (ZERO WIDTH NON-JOINER)
+        unicode_text.findAndReplace(target_str, " ");
+    }
+
     bi->setText(unicode_text);
 
     start_pos = bi->first();
diff --git a/test/tokenizer_test.cpp b/test/tokenizer_test.cpp
index b141e97c..054df18b 100644
--- a/test/tokenizer_test.cpp
+++ b/test/tokenizer_test.cpp
@@ -323,6 +323,11 @@ TEST(TokenizerTest, ShouldTokenizeLocaleText) {
     tokens.clear();
     // 配管
     Tokenizer("配管", true, false, "ja").tokenize(tokens);
+
+    // persian containing zwnj
+    tokens.clear();
+    Tokenizer("روان\u200Cشناسی", false, false, "fa").tokenize(tokens);
+    ASSERT_EQ(2, tokens.size());
 }
 
 TEST(TokenizerTest, ShouldTokenizeLocaleTextWithEnglishText) {