diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp
index 17697bf4..464349ed 100644
--- a/src/tokenizer.cpp
+++ b/src/tokenizer.cpp
@@ -140,7 +140,9 @@ bool Tokenizer::next(std::string &token, size_t& token_index, size_t& start_inde
         } else if(normalize && locale == "ja") {
             auto raw_text = unicode_text.tempSubStringBetween(start_pos, end_pos);
             raw_text.toUTF8String(word);
-            JapaneseLocalizer::get_instance().normalize(word);
+            char* normalized_word = JapaneseLocalizer::get_instance().normalize(word);
+            word.assign(normalized_word, strlen(normalized_word));
+            free(normalized_word);
         } else {
             unicode_text.tempSubStringBetween(start_pos, end_pos).toUTF8String(word);
         }
diff --git a/test/tokenizer_test.cpp b/test/tokenizer_test.cpp
index 64a34066..b141e97c 100644
--- a/test/tokenizer_test.cpp
+++ b/test/tokenizer_test.cpp
@@ -223,26 +223,6 @@ TEST(TokenizerTest, ShouldTokenizeChineseText) {
 }
 
 TEST(TokenizerTest, ShouldTokenizeLocaleText) {
-    std::string text = R"("王獣を倒すと入手した折れた角。追放された後、この世に存在すべきではないもの。\n獣域ウルブズの中で帝王と呼ばれていても、魔獣たちの系譜では、その兄たちの万分の一にも満たないだろう。\n「黄
-金」が無数の獣域ウルブズを捨て紙のように圧縮して偶然にできた異形の魔獣。その角には、黒いウルブズを命じて自分のために空間を溶かす権威が秘めている。")";
-
-    Tokenizer tokenizer(text, true, false, "ja", {}, {});
-
-    LOG(INFO) << "text.size: " << text.size();
-
-    std::string raw_token;
-    size_t raw_token_index = 0, tok_start = 0, tok_end = 0;
-
-    // based on `highlight_affix_num_tokens`
-    size_t snippet_start_offset = 0, snippet_end_offset = (text.empty() ? 0 : text.size() - 1);
-
-    // window used to locate the starting offset for snippet on the text
-    while(tokenizer.next(raw_token, raw_token_index, tok_start, tok_end)) {
-        //LOG(INFO) << "tok_start: " << tok_start;
-    }
-
-    return ;
-
     std::vector<std::string> tokens;
     tokens.clear();
 
@@ -298,7 +278,7 @@ TEST(TokenizerTest, ShouldTokenizeLocaleText) {
 
     // japanese
     tokens.clear();
-    Tokenizer("退屈", false, false, "ja").tokenize(tokens);
+    Tokenizer("退屈", true, false, "ja").tokenize(tokens);
     ASSERT_EQ(2, tokens.size());
     ASSERT_EQ("た", tokens[0]);
     ASSERT_EQ("いくつ", tokens[1]);
@@ -312,7 +292,7 @@ TEST(TokenizerTest, ShouldTokenizeLocaleText) {
     ASSERT_EQ(6, tokens.size());
 
     tokens.clear();
-    Tokenizer("ア退屈であ", false, false, "ja").tokenize(tokens);
+    Tokenizer("ア退屈であ", true, false, "ja").tokenize(tokens);
     ASSERT_EQ(5, tokens.size());
     ASSERT_EQ("あ", tokens[0]);
     ASSERT_EQ("た", tokens[1]);
@@ -321,7 +301,7 @@ TEST(TokenizerTest, ShouldTokenizeLocaleText) {
     ASSERT_EQ("あ", tokens[4]);
 
     tokens.clear();
-    Tokenizer("怠惰な犬", false, false, "ja").tokenize(tokens);
+    Tokenizer("怠惰な犬", true, false, "ja").tokenize(tokens);
     ASSERT_EQ(4, tokens.size());
     ASSERT_EQ("たい", tokens[0]);
     ASSERT_EQ("だ", tokens[1]);
@@ -329,7 +309,7 @@ TEST(TokenizerTest, ShouldTokenizeLocaleText) {
     ASSERT_EQ("いぬ", tokens[3]);
 
     tokens.clear(); // 配管
-    Tokenizer("配管", false, false, "ja").tokenize(tokens);
+    Tokenizer("配管", true, false, "ja").tokenize(tokens);
 }
 
 TEST(TokenizerTest, ShouldTokenizeLocaleTextWithEnglishText) {
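
Note on the src/tokenizer.cpp hunk above: the added lines copy the buffer returned by JapaneseLocalizer::get_instance().normalize(word) into `word` and then free it. Below is a minimal sketch, not part of the patch, of the same copy-then-free step behind an RAII guard. It assumes the returned string is malloc-allocated and NUL-terminated (which the free() call implies) and that normalize() may return nullptr on failure; `assign_normalized` and `FreeDeleter` are hypothetical names introduced only for illustration.

#include <cstdlib>
#include <cstring>
#include <memory>
#include <string>

// Hypothetical deleter so unique_ptr releases a malloc'd buffer with std::free.
struct FreeDeleter {
    void operator()(void* p) const { std::free(p); }
};

// Hypothetical helper (not part of the patch) mirroring the ownership rules
// implied above: take a malloc'd, NUL-terminated buffer from the normalizer,
// copy it into the std::string, and release the buffer exactly once.
inline void assign_normalized(std::string& word, char* normalized_word) {
    if(normalized_word == nullptr) {
        return; // assumption: keep `word` unchanged if normalization failed
    }
    // RAII guard: std::free runs even if assign() throws (e.g. std::bad_alloc)
    std::unique_ptr<char, FreeDeleter> guard(normalized_word);
    word.assign(normalized_word, std::strlen(normalized_word));
}

With such a helper the hunk body would reduce to assign_normalized(word, JapaneseLocalizer::get_instance().normalize(word)). On the happy path this behaves the same as the explicit free() in the patch; it only differs if normalize() returns nullptr or the assignment throws.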