Handle the zero-width non-joiner (ZWNJ) character for Persian.

Text joined by a ZWNJ must be split into individual word tokens.
This commit is contained in:
Kishore Nallan 2023-10-11 15:02:22 +05:30
parent 40a684619b
commit ff0d2596cc
2 changed files with 12 additions and 0 deletions

View File

@ -87,6 +87,13 @@ void Tokenizer::init(const std::string& input) {
}
unicode_text = icu::UnicodeString::fromUTF8(text);
if(locale == "fa") {
icu::UnicodeString target_str;
target_str.setTo(0x200C); // U+200C (ZERO WIDTH NON-JOINER)
unicode_text.findAndReplace(target_str, " ");
}
bi->setText(unicode_text);
start_pos = bi->first();

View File

@ -323,6 +323,11 @@ TEST(TokenizerTest, ShouldTokenizeLocaleText) {
tokens.clear(); // 配管
Tokenizer("配管", true, false, "ja").tokenize(tokens);
// persian containing zwnj
tokens.clear();
Tokenizer("روان\u200Cشناسی", false, false, "fa").tokenize(tokens);
ASSERT_EQ(2, tokens.size());
}
TEST(TokenizerTest, ShouldTokenizeLocaleTextWithEnglishText) {