mirror of
https://github.com/typesense/typesense.git
synced 2025-05-19 05:08:43 +08:00
Handle the zero-width non-joiner character (U+200C) for Persian.
The tokenizer must split tokens joined by this character into individual words.
This commit is contained in:
parent
40a684619b
commit
ff0d2596cc
@ -87,6 +87,13 @@ void Tokenizer::init(const std::string& input) {
|
||||
}
|
||||
|
||||
unicode_text = icu::UnicodeString::fromUTF8(text);
|
||||
|
||||
if(locale == "fa") {
|
||||
icu::UnicodeString target_str;
|
||||
target_str.setTo(0x200C); // U+200C (ZERO WIDTH NON-JOINER)
|
||||
unicode_text.findAndReplace(target_str, " ");
|
||||
}
|
||||
|
||||
bi->setText(unicode_text);
|
||||
|
||||
start_pos = bi->first();
|
||||
|
@ -323,6 +323,11 @@ TEST(TokenizerTest, ShouldTokenizeLocaleText) {
|
||||
|
||||
tokens.clear(); // 配管
|
||||
Tokenizer("配管", true, false, "ja").tokenize(tokens);
|
||||
|
||||
// persian containing zwnj
|
||||
tokens.clear();
|
||||
Tokenizer("روان\u200Cشناسی", false, false, "fa").tokenize(tokens);
|
||||
ASSERT_EQ(2, tokens.size());
|
||||
}
|
||||
|
||||
TEST(TokenizerTest, ShouldTokenizeLocaleTextWithEnglishText) {
|
||||
|
Loading…
x
Reference in New Issue
Block a user