Fix edge cases in Japanese tokenization.

Address bad unicode + highlighting.
This commit is contained in:
Kishore Nallan 2023-06-01 16:54:14 +05:30
parent b2c95d84c6
commit 89d2b247fc
5 changed files with 62 additions and 4 deletions

View File

@ -168,7 +168,7 @@ new_git_repository(
new_git_repository(
name = "kakasi",
build_file = "//bazel:kakasi.BUILD",
commit = "9e0825a02c7ea5605e968f6208f769f7c49d6860",
commit = "77f2d1ce0146d15199ae0db1e61e0b699b0b55f6",
remote = "https://github.com/typesense/kakasi.git",
)

View File

@ -2735,7 +2735,8 @@ void Collection::highlight_result(const std::string& raw_query, const field &sea
tsl::htrie_set<char> matched_tokens;
bool use_word_tokenizer = search_field.locale == "th" || Tokenizer::is_cyrillic(search_field.locale);
bool use_word_tokenizer = search_field.locale == "th" || search_field.locale == "ja" ||
Tokenizer::is_cyrillic(search_field.locale);
bool normalise = !use_word_tokenizer;
std::vector<std::string> raw_query_tokens;

View File

@ -60,8 +60,12 @@ void Tokenizer::init(const std::string& input) {
}
else if(locale == "ja") {
normalized_text = JapaneseLocalizer::get_instance().normalize(input);
text = normalized_text;
if(normalize) {
normalized_text = JapaneseLocalizer::get_instance().normalize(input);
text = normalized_text;
} else {
text = input;
}
} else if(is_cyrillic(locale)) {
// init transliterator but will only transliterate during tokenization
UErrorCode translit_status = U_ZERO_ERROR;
@ -133,6 +137,10 @@ bool Tokenizer::next(std::string &token, size_t& token_index, size_t& start_inde
} else {
LOG(ERROR) << "Unicode error during parsing: " << errcode;
}
} else if(normalize && locale == "ja") {
auto raw_text = unicode_text.tempSubStringBetween(start_pos, end_pos);
raw_text.toUTF8String(word);
JapaneseLocalizer::get_instance().normalize(word);
} else {
unicode_text.tempSubStringBetween(start_pos, end_pos).toUTF8String(word);
}

View File

@ -774,6 +774,27 @@ TEST_F(CollectionLocaleTest, SearchOnCyrillicLargeText) {
results["hits"][0]["highlights"][0]["snippet"].get<std::string>().c_str());
}
TEST_F(CollectionLocaleTest, SearchOnJapaneseLargeText) {
std::vector<field> fields = {field("title", field_types::STRING, true, false, true, "ja"),};
Collection* coll1 = collectionManager.create_collection("coll1", 1, fields).get();
nlohmann::json doc;
doc["title"] = "王獣を倒すと入手した折れた角。追放された後、この世に存在すべきではないもの。\n獣域ウルブズの中で帝王と呼ばれていても、"
"魔獣たちの系譜では、その兄たちの万分の一にも満たないだろう。\n「黄"
"金」が無数の獣域ウルブズを捨て紙のように圧縮して偶然にできた異形の魔獣。その角には、黒いウルブズを命じて自分のため"
"に空間を溶かす権威が秘めている。";
ASSERT_TRUE(coll1->add(doc.dump()).ok());
auto results = coll1->search("王獣を", {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {false}).get();
ASSERT_STREQ("<mark>王</mark><mark>獣</mark><mark>を</mark><mark>倒す</mark>と入手した折",
results["hits"][0]["highlights"][0]["snippet"].get<std::string>().c_str());
results = coll1->search("業果材", {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {false}).get();
ASSERT_STREQ("に空間を溶かす<mark>権威</mark><mark>が</mark><mark>秘</mark>めている。",
results["hits"][0]["highlights"][0]["snippet"].get<std::string>().c_str());
}
TEST_F(CollectionLocaleTest, SearchOnArabicText) {
std::vector<field> fields = {field("title", field_types::STRING, true, false, true, ""),};
Collection* coll1 = collectionManager.create_collection("coll1", 1, fields).get();

View File

@ -223,6 +223,26 @@ TEST(TokenizerTest, ShouldTokenizeChineseText) {
}
TEST(TokenizerTest, ShouldTokenizeLocaleText) {
std::string text = R"("\n獣域ウルブズの中で帝王と呼ばれていても\n
")";
Tokenizer tokenizer(text, true, false, "ja", {}, {});
LOG(INFO) << "text.size: " << text.size();
std::string raw_token;
size_t raw_token_index = 0, tok_start = 0, tok_end = 0;
// based on `highlight_affix_num_tokens`
size_t snippet_start_offset = 0, snippet_end_offset = (text.empty() ? 0 : text.size() - 1);
// window used to locate the starting offset for snippet on the text
while(tokenizer.next(raw_token, raw_token_index, tok_start, tok_end)) {
LOG(INFO) << "tok_start: " << tok_start;
}
return ;
std::vector<std::string> tokens;
tokens.clear();
@ -283,6 +303,14 @@ TEST(TokenizerTest, ShouldTokenizeLocaleText) {
ASSERT_EQ("", tokens[0]);
ASSERT_EQ("いくつ", tokens[1]);
tokens.clear();
Tokenizer("", true, false, "ja").tokenize(tokens);
ASSERT_EQ(0, tokens.size());
tokens.clear();
Tokenizer("「業果材", true, false, "ja").tokenize(tokens);
ASSERT_EQ(6, tokens.size());
tokens.clear();
Tokenizer("ア退屈であ", false, false, "ja").tokenize(tokens);
ASSERT_EQ(5, tokens.size());