Support Chinese locale.

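Chinese (`zh`) text is now normalized by transliterating Traditional to Simplified Chinese with ICU's "Traditional-Simplified" transform before word-break tokenization, so queries in either script match the same documents. Common Chinese punctuation tokens are discarded during tokenization. Includes tokenizer-level and collection-level search tests.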
Kishore Nallan 2021-06-06 22:03:02 +05:30
parent 1189b7d124
commit da3de68129
4 changed files with 155 additions and 1 deletion

include/tokenizer.h

@@ -5,6 +5,7 @@
#include <iconv.h>
#include <unicode/brkiter.h>
#include <unicode/normalizer2.h>
#include <unicode/translit.h>
#include "japanese_localizer.h"
#include "logger.h"
@@ -37,6 +38,8 @@ private:
    // non-deletable singleton
    const icu::Normalizer2* nfkd;
    icu::Transliterator* transliterator = nullptr;
    inline size_t get_stream_mode(char c) {
        return (std::isalnum(c) || index_symbols[uint8_t(c)] == 1) ? INDEX : (
                (c == ' ' || c == '\n') ? SEPARATE : SKIP
@@ -58,6 +61,7 @@ public:
        iconv_close(cd);
        free(normalized_text);
        delete bi;
        delete transliterator;
    }
    bool next(std::string& token, size_t& token_index, size_t& start_index, size_t& end_index);

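The header above pulls in <unicode/translit.h> and adds a transliterator member; the constructor in src/tokenizer.cpp (next file) creates and applies it. For reference, here is a minimal self-contained sketch of the same ICU call sequence. It is not part of the commit: the file name is hypothetical, the sample string is borrowed from the tests below, and it assumes ICU4C is available (link with -licuuc -licui18n).

// translit_demo.cpp: illustrative sketch only, not part of this commit.
#include <iostream>
#include <string>
#include <unicode/translit.h>
#include <unicode/unistr.h>

int main() {
    UErrorCode status = U_ZERO_ERROR;
    // Same transform ID the Tokenizer constructor uses.
    icu::Transliterator* t = icu::Transliterator::createInstance(
            "Traditional-Simplified", UTRANS_FORWARD, status);
    if(U_FAILURE(status)) {
        return 1;  // the tokenizer falls back to indexing the raw input in this case
    }
    icu::UnicodeString s = icu::UnicodeString::fromUTF8("愛並不會因時間而");
    t->transliterate(s);  // converts in place
    std::string out;
    s.toUTF8String(out);
    std::cout << out << std::endl;  // prints: 爱并不会因时间而
    delete t;
    return 0;
}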
src/tokenizer.cpp

@@ -6,7 +6,26 @@ Tokenizer::Tokenizer(const std::string& input, bool normalize, bool no_op, const
                     const std::vector<char>& symbols_to_index):
        i(0), normalize(normalize), no_op(no_op), locale(locale) {
    if(locale == "zh") {
        // Normalize Traditional Chinese to Simplified so both scripts index identically.
        UErrorCode translit_status = U_ZERO_ERROR;
        transliterator = icu::Transliterator::createInstance("Traditional-Simplified",
                                                             UTRANS_FORWARD, translit_status);
        if(U_FAILURE(translit_status)) {
            //LOG(ERROR) << "Unable to create transliteration instance for `zh` locale.";
            transliterator = nullptr;
            text = input;
        } else {
            icu::UnicodeString unicode_input = icu::UnicodeString::fromUTF8(input);
            transliterator->transliterate(unicode_input);
            std::string output;
            unicode_input.toUTF8String(output);
            normalized_text = (char *)malloc(output.size()+1);
            strcpy(normalized_text, output.c_str());
            text = normalized_text;
        }
    }
    else if(locale == "ja") {
        normalized_text = JapaneseLocalizer::get_instance().normalize(input);
        text = normalized_text;
    } else {
@@ -81,6 +100,8 @@ bool Tokenizer::next(std::string &token, size_t& token_index, size_t& start_inde
            found_token = false;
        } else if(locale == "ko" && token == "·") {
            found_token = false;
        } else if(locale == "zh" && (token == "," || token == "─" || token == "。")) {
            found_token = false;
        } else {
            if(std::isalnum(token[0]) && is_ascii_char(token[0])) {

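Transliteration only normalizes the script; the word segmentation in next() still comes from ICU's word BreakIterator (the bi member deleted in the tokenizer's destructor shown earlier). Below is a minimal standalone sketch of that segmentation under the zh locale; again illustrative rather than this project's implementation, with the expected tokens matching the assertions in the tests that follow.

// segment_demo.cpp: illustrative sketch only, not this project's next() logic.
#include <iostream>
#include <string>
#include <unicode/brkiter.h>
#include <unicode/unistr.h>

int main() {
    UErrorCode status = U_ZERO_ERROR;
    icu::BreakIterator* bi = icu::BreakIterator::createWordInstance(
            icu::Locale("zh"), status);
    if(U_FAILURE(status)) {
        return 1;
    }
    icu::UnicodeString text = icu::UnicodeString::fromUTF8("爱并不会因时间而");
    bi->setText(text);
    // Each [start, end) span between successive break positions is one token.
    int32_t start = bi->first();
    for(int32_t end = bi->next(); end != icu::BreakIterator::DONE;
            start = end, end = bi->next()) {
        icu::UnicodeString token(text, start, end - start);
        std::string utf8;
        token.toUTF8String(utf8);
        std::cout << utf8 << "\n";  // 爱, 并不, 会, 因, 时间, 而
    }
    delete bi;
    return 0;
}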
test/collection_locale_test.cpp

@@ -30,6 +30,70 @@ protected:
    }
};
TEST_F(CollectionLocaleTest, SearchAgainstChineseText) {
    Collection *coll1;
    std::vector<field> fields = {field("title", field_types::STRING, false, false, true, DEFAULT_GEO_RESOLUTION, "zh"),
                                 field("artist", field_types::STRING, false),
                                 field("points", field_types::INT32, false),};
    coll1 = collectionManager.get_collection("coll1").get();
    if(coll1 == nullptr) {
        coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get();
    }
    std::vector<std::vector<std::string>> records = {
        {"爱并不会因时间而", "Dustin Kensrue"},
        {"很久以前,傳說在臺中北屯的一個地方", "Gord Downie"},
        {"獻給我思念的每一朵雲──海", "Dustin Kensrue"},
        {"看誰先跑到小山丘上。媽媽總是第", "Jamie Phua"},
    };
    for(size_t i=0; i<records.size(); i++) {
        nlohmann::json doc;
        doc["id"] = std::to_string(i);
        doc["title"] = records[i][0];
        doc["artist"] = records[i][1];
        doc["points"] = i;
        ASSERT_TRUE(coll1->add(doc.dump()).ok());
    }
    auto results = coll1->search("并",
                                 {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY, true).get();
    ASSERT_EQ(1, results["found"].get<size_t>());
    ASSERT_EQ(1, results["hits"].size());
    ASSERT_EQ("0", results["hits"][0]["document"]["id"].get<std::string>());
    ASSERT_EQ("爱<mark>并不</mark>会因时间而", results["hits"][0]["highlights"][0]["snippet"].get<std::string>());
    // partial token should not match as prefix when prefix is set to false
    results = coll1->search("并",
                            {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY).get();
    ASSERT_EQ(0, results["found"].get<size_t>());
    results = coll1->search("上媽",
                            {"title", "artist"}, "", {}, {}, {0}, 10, 1, FREQUENCY, true).get();
    ASSERT_EQ(1, results["found"].get<size_t>());
    ASSERT_EQ(1, results["hits"].size());
    ASSERT_EQ("3", results["hits"][0]["document"]["id"].get<std::string>());
    ASSERT_EQ("看誰先跑到小山丘<mark>上</mark>。<mark>媽媽</mark>總是第", results["hits"][0]["highlights"][0]["snippet"].get<std::string>());
    // search using simplified chinese
    results = coll1->search("妈妈",
                            {"title", "artist"}, "", {}, {}, {0}, 10, 1, FREQUENCY, true).get();
    ASSERT_EQ(1, results["found"].get<size_t>());
    ASSERT_EQ(1, results["hits"].size());
    ASSERT_EQ("3", results["hits"][0]["document"]["id"].get<std::string>());
    ASSERT_EQ("看誰先跑到小山丘上。<mark>媽媽</mark>總是第", results["hits"][0]["highlights"][0]["snippet"].get<std::string>());
}
TEST_F(CollectionLocaleTest, SearchAgainstThaiText) {
    Collection *coll1;

test/tokenizer_test.cpp

@@ -157,6 +157,71 @@ TEST(TokenizerTest, ShouldTokenizeTextWithCustomSpecialChars) {
    ASSERT_EQ("-more", tokens[2]);
}
TEST(TokenizerTest, ShouldTokenizeChineseText) {
    std::vector<std::string> tokens;
    // traditional -> simplified
    Tokenizer("", false, false, "zh").tokenize(tokens);
    ASSERT_EQ(1, tokens.size());
    ASSERT_EQ("", tokens[0]);
    tokens.clear();
    Tokenizer("", false, false, "zh").tokenize(tokens);
    ASSERT_EQ(1, tokens.size());
    ASSERT_EQ("", tokens[0]);
    // tokenize traditional
    tokens.clear();
    Tokenizer("愛並不會因時間而", false, false, "zh").tokenize(tokens);
    ASSERT_EQ(6, tokens.size());
    ASSERT_EQ("爱", tokens[0]);
    ASSERT_EQ("并不", tokens[1]);
    ASSERT_EQ("会", tokens[2]);
    ASSERT_EQ("因", tokens[3]);
    ASSERT_EQ("时间", tokens[4]);
    ASSERT_EQ("而", tokens[5]);
    // tokenize simplified
    tokens.clear();
    Tokenizer("爱并不会因时间而", false, false, "zh").tokenize(tokens);
    ASSERT_EQ(6, tokens.size());
    ASSERT_EQ("爱", tokens[0]);
    ASSERT_EQ("并不", tokens[1]);
    ASSERT_EQ("会", tokens[2]);
    ASSERT_EQ("因", tokens[3]);
    ASSERT_EQ("时间", tokens[4]);
    ASSERT_EQ("而", tokens[5]);
    // with separators
    tokens.clear();
    Tokenizer("很久以前,傳說在臺中北屯的一個地方", false, false, "zh").tokenize(tokens);
    ASSERT_EQ(10, tokens.size());
    ASSERT_EQ("很久", tokens[0]);
    ASSERT_EQ("以前", tokens[1]);
    ASSERT_EQ("传说", tokens[2]);
    ASSERT_EQ("在", tokens[3]);
    ASSERT_EQ("台中", tokens[4]);
    ASSERT_EQ("北", tokens[5]);
    ASSERT_EQ("屯", tokens[6]);
    ASSERT_EQ("的", tokens[7]);
    ASSERT_EQ("一个", tokens[8]);
    ASSERT_EQ("地方", tokens[9]);
    tokens.clear();
    Tokenizer("朵雲──海", false, false, "zh").tokenize(tokens);
    ASSERT_EQ(3, tokens.size());
    ASSERT_EQ("朵", tokens[0]);
    ASSERT_EQ("云", tokens[1]);
    ASSERT_EQ("海", tokens[2]);
    tokens.clear();
    Tokenizer("山丘上。媽媽", false, false, "zh").tokenize(tokens);
    ASSERT_EQ(3, tokens.size());
    ASSERT_EQ("山丘", tokens[0]);
    ASSERT_EQ("上", tokens[1]);
    ASSERT_EQ("妈妈", tokens[2]);
}
TEST(TokenizerTest, ShouldTokenizeLocaleText) {
    std::string tstr = "ลงรถไฟ";
    std::vector<std::string> ttokens;