Fix JA locale tests.

Kishore Nallan 2023-07-09 21:36:19 +05:30
parent 5c60e06690
commit e8d19082bd
2 changed files with 8 additions and 26 deletions


@@ -140,7 +140,9 @@ bool Tokenizer::next(std::string &token, size_t& token_index, size_t& start_index
     } else if(normalize && locale == "ja") {
         auto raw_text = unicode_text.tempSubStringBetween(start_pos, end_pos);
         raw_text.toUTF8String(word);
-        JapaneseLocalizer::get_instance().normalize(word);
+        char* normalized_word = JapaneseLocalizer::get_instance().normalize(word);
+        word.assign(normalized_word, strlen(normalized_word));
+        free(normalized_word);
     } else {
         unicode_text.tempSubStringBetween(start_pos, end_pos).toUTF8String(word);
     }
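The hunk above fixes a subtle bug: as the new lines show, JapaneseLocalizer::normalize() returns a newly allocated C string rather than modifying its argument in place, so the old call evidently discarded the normalized text and leaked the returned buffer. A minimal standalone sketch of the caller-owns-the-buffer pattern the fix adopts; the normalize() stub here is a hypothetical stand-in for the real localizer:

    #include <cstdlib>
    #include <cstring>
    #include <iostream>
    #include <string>

    // Hypothetical stand-in for JapaneseLocalizer::normalize(): like the
    // real call, it returns a heap-allocated, NUL-terminated buffer that
    // the caller owns and must free().
    static char* normalize(const std::string& input) {
        char* out = static_cast<char*>(std::malloc(input.size() + 1));
        std::memcpy(out, input.c_str(), input.size() + 1);
        return out;
    }

    int main() {
        std::string word = "退屈";
        // The pattern the fix adopts: capture the returned buffer, copy
        // its contents into the std::string, then release it exactly once.
        char* normalized_word = normalize(word);
        word.assign(normalized_word, std::strlen(normalized_word));
        std::free(normalized_word);
        std::cout << word << "\n";
        return 0;
    }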


@@ -223,26 +223,6 @@ TEST(TokenizerTest, ShouldTokenizeChineseText) {
 }
 
 TEST(TokenizerTest, ShouldTokenizeLocaleText) {
-    std::string text = R"("\n獣域ウルブズの中で帝王と呼ばれていても\n
-")";
-    Tokenizer tokenizer(text, true, false, "ja", {}, {});
-    LOG(INFO) << "text.size: " << text.size();
-    std::string raw_token;
-    size_t raw_token_index = 0, tok_start = 0, tok_end = 0;
-    // based on `highlight_affix_num_tokens`
-    size_t snippet_start_offset = 0, snippet_end_offset = (text.empty() ? 0 : text.size() - 1);
-    // window used to locate the starting offset for snippet on the text
-    while(tokenizer.next(raw_token, raw_token_index, tok_start, tok_end)) {
-        //LOG(INFO) << "tok_start: " << tok_start;
-    }
-    return ;
 
     std::vector<std::string> tokens;
     tokens.clear();
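The deleted block was debugging scaffolding: it streamed tokens with Tokenizer::next() but asserted nothing, and the stray `return ;` short-circuited the rest of the test. For reference, a sketch of that streaming interface as the removed lines used it (not standalone: it assumes the project's tokenizer header; the call signature is taken from the deleted code, while the meaning of the last two constructor arguments is an assumption):

    #include <cstddef>
    #include <string>
    #include "tokenizer.h"  // project header; sketch only

    void walk_tokens(const std::string& text) {
        // Constructor arguments as in the deleted code; the two trailing
        // {} values are presumed to be symbol/separator sets.
        Tokenizer tokenizer(text, true, false, "ja", {}, {});
        std::string raw_token;
        size_t raw_token_index = 0, tok_start = 0, tok_end = 0;
        // next() yields one token per call, reporting its index and the
        // start/end offsets of the token within `text`.
        while(tokenizer.next(raw_token, raw_token_index, tok_start, tok_end)) {
            // inspect raw_token / tok_start / tok_end here
        }
    }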
@@ -298,7 +278,7 @@ TEST(TokenizerTest, ShouldTokenizeLocaleText) {
 
     // japanese
     tokens.clear();
-    Tokenizer("退屈", false, false, "ja").tokenize(tokens);
+    Tokenizer("退屈", true, false, "ja").tokenize(tokens);
     ASSERT_EQ(2, tokens.size());
     ASSERT_EQ("", tokens[0]);
     ASSERT_EQ("いくつ", tokens[1]);
@@ -312,7 +292,7 @@ TEST(TokenizerTest, ShouldTokenizeLocaleText) {
     ASSERT_EQ(6, tokens.size());
 
     tokens.clear();
-    Tokenizer("ア退屈であ", false, false, "ja").tokenize(tokens);
+    Tokenizer("ア退屈であ", true, false, "ja").tokenize(tokens);
     ASSERT_EQ(5, tokens.size());
     ASSERT_EQ("", tokens[0]);
     ASSERT_EQ("", tokens[1]);
@@ -321,7 +301,7 @@ TEST(TokenizerTest, ShouldTokenizeLocaleText) {
     ASSERT_EQ("", tokens[4]);
 
     tokens.clear();
-    Tokenizer("怠惰な犬", false, false, "ja").tokenize(tokens);
+    Tokenizer("怠惰な犬", true, false, "ja").tokenize(tokens);
     ASSERT_EQ(4, tokens.size());
     ASSERT_EQ("たい", tokens[0]);
     ASSERT_EQ("", tokens[1]);
@@ -329,7 +309,7 @@ TEST(TokenizerTest, ShouldTokenizeLocaleText) {
     ASSERT_EQ("いぬ", tokens[3]);
 
     tokens.clear();
-    Tokenizer("今ぶり拍治ルツ", false, false, "ja").tokenize(tokens);
+    Tokenizer("今ぶり拍治ルツ", true, false, "ja").tokenize(tokens);
     ASSERT_EQ(9, tokens.size());
     ASSERT_EQ("いま", tokens[0]);
     ASSERT_EQ("ぶり", tokens[1]);
@@ -342,7 +322,7 @@ TEST(TokenizerTest, ShouldTokenizeLocaleText) {
     ASSERT_EQ("", tokens[8]);
 
     tokens.clear(); // 配管
-    Tokenizer("配管", false, false, "ja").tokenize(tokens);
+    Tokenizer("配管", true, false, "ja").tokenize(tokens);
 }
 
 TEST(TokenizerTest, ShouldTokenizeLocaleTextWithEnglishText) {