mirror of https://github.com/typesense/typesense.git
Fix JA locale tests.
parent 5c60e06690
commit e8d19082bd
@@ -140,7 +140,9 @@ bool Tokenizer::next(std::string &token, size_t& token_index, size_t& start_index
     } else if(normalize && locale == "ja") {
         auto raw_text = unicode_text.tempSubStringBetween(start_pos, end_pos);
         raw_text.toUTF8String(word);
-        JapaneseLocalizer::get_instance().normalize(word);
+        char* normalized_word = JapaneseLocalizer::get_instance().normalize(word);
+        word.assign(normalized_word, strlen(normalized_word));
+        free(normalized_word);
     } else {
         unicode_text.tempSubStringBetween(start_pos, end_pos).toUTF8String(word);
     }
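Note on the change above: the old branch dropped the value returned by JapaneseLocalizer::normalize(), so the normalized text never replaced `word` (and, if that call already returned a heap buffer, it leaked). The fixed branch captures the returned C string, copies it into `word`, and frees it. Below is a minimal sketch of that caller-frees pattern, assuming (as the free() in the diff implies) that normalize() returns a malloc'd, NUL-terminated buffer owned by the caller; `normalize_ja()` is a hypothetical stand-in, not the real JapaneseLocalizer API.

#include <cstdlib>
#include <cstring>
#include <string>

// Hypothetical stand-in for JapaneseLocalizer::normalize(): returns a malloc'd,
// NUL-terminated copy that the caller must free(), i.e. the contract the diff implies.
// A real normalizer would return the converted (kana) reading instead of a copy.
static char* normalize_ja(const std::string& in) {
    char* out = static_cast<char*>(std::malloc(in.size() + 1));
    std::memcpy(out, in.c_str(), in.size() + 1);
    return out;
}

// Same ownership pattern as the fixed branch in Tokenizer::next().
static void apply_ja_normalization(std::string& word) {
    char* normalized_word = normalize_ja(word);
    if(normalized_word != nullptr) {               // defensive; the diff assigns directly
        word.assign(normalized_word, std::strlen(normalized_word));  // copy into the std::string
        std::free(normalized_word);                                  // release the C buffer
    }
}

int main() {
    std::string word = "退屈";
    apply_ja_normalization(word);
    // With the stand-in above `word` is unchanged; the real normalizer yields the kana reading.
    return 0;
}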
@@ -223,26 +223,6 @@ TEST(TokenizerTest, ShouldTokenizeChineseText) {
 }
 
 TEST(TokenizerTest, ShouldTokenizeLocaleText) {
-    std::string text = R"("王獣を倒すと入手した折れた角。追放された後、この世に存在すべきではないもの。\n獣域ウルブズの中で帝王と呼ばれていても、魔獣たちの系譜では、その兄たちの万分の一にも満たないだろう。\n「黄金」が無数の獣域ウルブズを捨て紙のように圧縮して偶然にできた異形の魔獣。その角には、黒いウルブズを命じて自分のために空間を溶かす権威が秘めている。")";
-
-    Tokenizer tokenizer(text, true, false, "ja", {}, {});
-
-    LOG(INFO) << "text.size: " << text.size();
-
-    std::string raw_token;
-    size_t raw_token_index = 0, tok_start = 0, tok_end = 0;
-
-    // based on `highlight_affix_num_tokens`
-    size_t snippet_start_offset = 0, snippet_end_offset = (text.empty() ? 0 : text.size() - 1);
-
-    // window used to locate the starting offset for snippet on the text
-    while(tokenizer.next(raw_token, raw_token_index, tok_start, tok_end)) {
-        //LOG(INFO) << "tok_start: " << tok_start;
-    }
-
-    return ;
-
     std::vector<std::string> tokens;
 
     tokens.clear();
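The block removed above was debug scaffolding: it streamed tokens off a long Japanese passage with Tokenizer::next() and then hit `return;`, so the assertions further down never ran. For reference, a sketch of that streaming interface follows; the constructor call and the next() signature are taken from the removed lines and the hunk header, while the include path and helper name are assumptions.

#include <cstddef>
#include <string>

#include "tokenizer.h"   // assumed include path for the Tokenizer declaration

// Sketch of the streaming API the deleted debug block was driving.
static void walk_ja_tokens(const std::string& text) {
    Tokenizer tokenizer(text, true, false, "ja", {}, {});
    std::string token;
    size_t token_index = 0, start_index = 0, end_index = 0;
    while(tokenizer.next(token, token_index, start_index, end_index)) {
        // token holds the (normalized) token; start_index/end_index are its
        // offsets in the input, which the removed comments tied to snippet
        // highlighting (`highlight_affix_num_tokens`).
    }
}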
@@ -298,7 +278,7 @@ TEST(TokenizerTest, ShouldTokenizeLocaleText) {
 
     // japanese
     tokens.clear();
-    Tokenizer("退屈", false, false, "ja").tokenize(tokens);
+    Tokenizer("退屈", true, false, "ja").tokenize(tokens);
     ASSERT_EQ(2, tokens.size());
     ASSERT_EQ("た", tokens[0]);
     ASSERT_EQ("いくつ", tokens[1]);
@@ -312,7 +292,7 @@ TEST(TokenizerTest, ShouldTokenizeLocaleText) {
     ASSERT_EQ(6, tokens.size());
 
     tokens.clear();
-    Tokenizer("ア退屈であ", false, false, "ja").tokenize(tokens);
+    Tokenizer("ア退屈であ", true, false, "ja").tokenize(tokens);
     ASSERT_EQ(5, tokens.size());
     ASSERT_EQ("あ", tokens[0]);
     ASSERT_EQ("た", tokens[1]);
@@ -321,7 +301,7 @@ TEST(TokenizerTest, ShouldTokenizeLocaleText) {
     ASSERT_EQ("あ", tokens[4]);
 
     tokens.clear();
-    Tokenizer("怠惰な犬", false, false, "ja").tokenize(tokens);
+    Tokenizer("怠惰な犬", true, false, "ja").tokenize(tokens);
     ASSERT_EQ(4, tokens.size());
     ASSERT_EQ("たい", tokens[0]);
     ASSERT_EQ("だ", tokens[1]);
@@ -329,7 +309,7 @@ TEST(TokenizerTest, ShouldTokenizeLocaleText) {
     ASSERT_EQ("いぬ", tokens[3]);
 
     tokens.clear();
-    Tokenizer("今ぶり拍治ルツ", false, false, "ja").tokenize(tokens);
+    Tokenizer("今ぶり拍治ルツ", true, false, "ja").tokenize(tokens);
     ASSERT_EQ(9, tokens.size());
     ASSERT_EQ("いま", tokens[0]);
     ASSERT_EQ("ぶり", tokens[1]);
@@ -342,7 +322,7 @@ TEST(TokenizerTest, ShouldTokenizeLocaleText) {
     ASSERT_EQ("つ", tokens[8]);
 
     tokens.clear(); // 配管
-    Tokenizer("配管", false, false, "ja").tokenize(tokens);
+    Tokenizer("配管", true, false, "ja").tokenize(tokens);
 }
 
 TEST(TokenizerTest, ShouldTokenizeLocaleTextWithEnglishText) {
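All of the test hunks above make the same change: the second constructor argument flips from false to true. Given the `normalize && locale == "ja"` branch in the first hunk, the Japanese inputs only reach JapaneseLocalizer when that flag is set, which is why the expected kana tokens (e.g. "た" and "いくつ" for "退屈") only come out in normalized mode. A small helper sketch, with the flag's meaning inferred from the diff rather than from the Tokenizer header:

#include <string>
#include <vector>

#include "tokenizer.h"   // assumed include path

// The second constructor argument is inferred to be the `normalize` flag from the
// `normalize && locale == "ja"` branch in the first hunk; the third argument's
// meaning is not shown in this diff.
static std::vector<std::string> ja_tokens(const std::string& text) {
    std::vector<std::string> tokens;
    Tokenizer(text, true /* normalize */, false, "ja").tokenize(tokens);
    return tokens;
}

// Per the updated assertions above, ja_tokens("退屈") should produce {"た", "いくつ"}.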