Support Chinese locale.
commit da3de68129 (parent 1189b7d124)
@@ -5,6 +5,7 @@
 #include <iconv.h>
 #include <unicode/brkiter.h>
 #include <unicode/normalizer2.h>
+#include <unicode/translit.h>
 #include "japanese_localizer.h"
 #include "logger.h"
@@ -37,6 +38,8 @@ private:
     // non-deletable singleton
     const icu::Normalizer2* nfkd;
 
+    icu::Transliterator* transliterator = nullptr;
+
     inline size_t get_stream_mode(char c) {
         return (std::isalnum(c) || index_symbols[uint8_t(c)] == 1) ? INDEX : (
                (c == ' ' || c == '\n') ? SEPARATE : SKIP
@@ -58,6 +61,7 @@ public:
         iconv_close(cd);
         free(normalized_text);
         delete bi;
+        delete transliterator;
     }
 
     bool next(std::string& token, size_t& token_index, size_t& start_index, size_t& end_index);
@@ -6,7 +6,26 @@ Tokenizer::Tokenizer(const std::string& input, bool normalize, bool no_op, const
                      const std::vector<char>& symbols_to_index):
         i(0), normalize(normalize), no_op(no_op), locale(locale) {
 
-    if(locale == "ja") {
+    if(locale == "zh") {
+        UErrorCode translit_status = U_ZERO_ERROR;
+        transliterator = icu::Transliterator::createInstance("Traditional-Simplified",
+                                                             UTRANS_FORWARD, translit_status);
+        if(U_FAILURE(translit_status)) {
+            //LOG(ERROR) << "Unable to create transliteration instance for `zh` locale.";
+            transliterator = nullptr;
+            text = input;
+        } else {
+            icu::UnicodeString unicode_input = icu::UnicodeString::fromUTF8(input);
+            transliterator->transliterate(unicode_input);
+            std::string output;
+            unicode_input.toUTF8String(output);
+            normalized_text = (char *)malloc(output.size()+1);
+            strcpy(normalized_text, output.c_str());
+            text = normalized_text;
+        }
+    }
+
+    else if(locale == "ja") {
         normalized_text = JapaneseLocalizer::get_instance().normalize(input);
         text = normalized_text;
     } else {
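The heart of the new zh branch is ICU's built-in "Traditional-Simplified" transliterator, which rewrites traditional Chinese characters into their simplified forms before the text reaches the word-break iterator. createInstance() returns a heap-allocated object owned by the caller, which is why the header change above also adds delete transliterator; to the destructor. Below is a minimal standalone sketch of the same conversion, assuming only that ICU (icuuc/icui18n) is installed and linked; the to_simplified helper and the demo program are illustrative and not part of the commit.

#include <iostream>
#include <memory>
#include <string>

#include <unicode/translit.h>
#include <unicode/unistr.h>
#include <unicode/utrans.h>

// Illustrative helper (not part of the codebase): convert traditional Chinese
// to simplified Chinese with ICU's rule-based transliterator. Returns the
// input unchanged if the transliterator cannot be created, mirroring the
// error branch in the Tokenizer constructor.
static std::string to_simplified(const std::string& input) {
    UErrorCode status = U_ZERO_ERROR;
    std::unique_ptr<icu::Transliterator> translit(
        icu::Transliterator::createInstance("Traditional-Simplified",
                                            UTRANS_FORWARD, status));
    if(U_FAILURE(status) || translit == nullptr) {
        return input;
    }

    icu::UnicodeString ustr = icu::UnicodeString::fromUTF8(input);
    translit->transliterate(ustr);  // in-place conversion

    std::string output;
    ustr.toUTF8String(output);
    return output;
}

int main() {
    std::cout << to_simplified("傳說在臺中") << "\n";  // prints: 传说在台中
    return 0;
}

The sketch uses std::unique_ptr where the class keeps a raw pointer and an explicit delete; the behaviour is the same.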
@@ -81,6 +100,8 @@ bool Tokenizer::next(std::string &token, size_t& token_index, size_t& start_inde
             found_token = false;
         } else if(locale == "ko" && token == "·") {
             found_token = false;
+        } else if(locale == "zh" && (token == "," || token == "─" || token == "。")) {
+            found_token = false;
         } else {
 
             if(std::isalnum(token[0]) && is_ascii_char(token[0])) {
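The extra zh case in next() exists because the word BreakIterator that drives segmentation reports punctuation and dash characters as boundary pieces of their own, so they must be dropped the same way the ja and ko branches drop their separators. Below is a rough standalone sketch of that segmentation behaviour, again assuming an ICU installation; the demo program is not part of the commit.

#include <iostream>
#include <memory>
#include <string>

#include <unicode/brkiter.h>
#include <unicode/locid.h>
#include <unicode/unistr.h>

int main() {
    UErrorCode status = U_ZERO_ERROR;
    std::unique_ptr<icu::BreakIterator> bi(
        icu::BreakIterator::createWordInstance(icu::Locale("zh"), status));
    if(U_FAILURE(status)) {
        return 1;
    }

    // Same string as the "山丘上。媽媽" test case below: the ideographic full
    // stop comes back as its own piece and would become a token unless filtered.
    icu::UnicodeString text = icu::UnicodeString::fromUTF8("山丘上。媽媽");
    bi->setText(text);

    int32_t start = bi->first();
    for(int32_t end = bi->next(); end != icu::BreakIterator::DONE;
        start = end, end = bi->next()) {
        std::string piece;
        text.tempSubStringBetween(start, end).toUTF8String(piece);
        std::cout << "[" << piece << "]" << std::endl;
    }
    return 0;
}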
@@ -30,6 +30,70 @@ protected:
     }
 };
 
+TEST_F(CollectionLocaleTest, SearchAgainstChineseText) {
+    Collection *coll1;
+
+    std::vector<field> fields = {field("title", field_types::STRING, false, false, true, DEFAULT_GEO_RESOLUTION, "zh"),
+                                 field("artist", field_types::STRING, false),
+                                 field("points", field_types::INT32, false),};
+
+    coll1 = collectionManager.get_collection("coll1").get();
+    if(coll1 == nullptr) {
+        coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get();
+    }
+
+    std::vector<std::vector<std::string>> records = {
+        {"爱并不会因时间而", "Dustin Kensrue"},
+        {"很久以前,傳說在臺中北屯的一個地方", "Gord Downie"},
+        {"獻給我思念的每一朵雲──海", "Dustin Kensrue"},
+        {"看誰先跑到小山丘上。媽媽總是第", "Jamie Phua"},
+    };
+
+    for(size_t i=0; i<records.size(); i++) {
+        nlohmann::json doc;
+
+        doc["id"] = std::to_string(i);
+        doc["title"] = records[i][0];
+        doc["artist"] = records[i][1];
+        doc["points"] = i;
+
+        ASSERT_TRUE(coll1->add(doc.dump()).ok());
+    }
+
+    auto results = coll1->search("并",
+                                 {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY, true).get();
+
+    ASSERT_EQ(1, results["found"].get<size_t>());
+    ASSERT_EQ(1, results["hits"].size());
+    ASSERT_EQ("0", results["hits"][0]["document"]["id"].get<std::string>());
+    ASSERT_EQ("爱<mark>并不</mark>会因时间而", results["hits"][0]["highlights"][0]["snippet"].get<std::string>());
+
+    // partial token should not match as prefix when prefix is set to false
+
+    results = coll1->search("并",
+                            {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY).get();
+
+    ASSERT_EQ(0, results["found"].get<size_t>());
+
+    results = coll1->search("上媽",
+                            {"title", "artist"}, "", {}, {}, {0}, 10, 1, FREQUENCY, true).get();
+
+    ASSERT_EQ(1, results["found"].get<size_t>());
+    ASSERT_EQ(1, results["hits"].size());
+    ASSERT_EQ("3", results["hits"][0]["document"]["id"].get<std::string>());
+    ASSERT_EQ("看誰先跑到小山丘<mark>上</mark>。<mark>媽媽</mark>總是第", results["hits"][0]["highlights"][0]["snippet"].get<std::string>());
+
+    // search using simplified chinese
+
+    results = coll1->search("妈",
+                            {"title", "artist"}, "", {}, {}, {0}, 10, 1, FREQUENCY, true).get();
+
+    ASSERT_EQ(1, results["found"].get<size_t>());
+    ASSERT_EQ(1, results["hits"].size());
+    ASSERT_EQ("3", results["hits"][0]["document"]["id"].get<std::string>());
+    ASSERT_EQ("看誰先跑到小山丘上。<mark>媽媽</mark>總是第", results["hits"][0]["highlights"][0]["snippet"].get<std::string>());
+}
+
 TEST_F(CollectionLocaleTest, SearchAgainstThaiText) {
     Collection *coll1;
 
@@ -157,6 +157,71 @@ TEST(TokenizerTest, ShouldTokenizeTextWithCustomSpecialChars) {
     ASSERT_EQ("-more", tokens[2]);
 }
 
+TEST(TokenizerTest, ShouldTokenizeChineseText) {
+    std::vector<std::string> tokens;
+
+    // traditional -> simplified
+    Tokenizer("語", false, false, "zh").tokenize(tokens);
+    ASSERT_EQ(1, tokens.size());
+    ASSERT_EQ("语", tokens[0]);
+
+    tokens.clear();
+    Tokenizer("說", false, false, "zh").tokenize(tokens);
+    ASSERT_EQ(1, tokens.size());
+    ASSERT_EQ("说", tokens[0]);
+
+    // tokenize traditional
+    tokens.clear();
+    Tokenizer("愛並不會因時間而", false, false, "zh").tokenize(tokens);
+    ASSERT_EQ(6, tokens.size());
+    ASSERT_EQ("爱", tokens[0]);
+    ASSERT_EQ("并不", tokens[1]);
+    ASSERT_EQ("会", tokens[2]);
+    ASSERT_EQ("因", tokens[3]);
+    ASSERT_EQ("时间", tokens[4]);
+    ASSERT_EQ("而", tokens[5]);
+
+    // tokenize simplified
+    tokens.clear();
+    Tokenizer("爱并不会因时间而", false, false, "zh").tokenize(tokens);
+    ASSERT_EQ(6, tokens.size());
+    ASSERT_EQ("爱", tokens[0]);
+    ASSERT_EQ("并不", tokens[1]);
+    ASSERT_EQ("会", tokens[2]);
+    ASSERT_EQ("因", tokens[3]);
+    ASSERT_EQ("时间", tokens[4]);
+    ASSERT_EQ("而", tokens[5]);
+
+    // with separators
+    tokens.clear();
+    Tokenizer("很久以前,傳說在臺中北屯的一個地方", false, false, "zh").tokenize(tokens);
+    ASSERT_EQ(10, tokens.size());
+    ASSERT_EQ("很久", tokens[0]);
+    ASSERT_EQ("以前", tokens[1]);
+    ASSERT_EQ("传说", tokens[2]);
+    ASSERT_EQ("在", tokens[3]);
+    ASSERT_EQ("台中", tokens[4]);
+    ASSERT_EQ("北", tokens[5]);
+    ASSERT_EQ("屯", tokens[6]);
+    ASSERT_EQ("的", tokens[7]);
+    ASSERT_EQ("一个", tokens[8]);
+    ASSERT_EQ("地方", tokens[9]);
+
+    tokens.clear();
+    Tokenizer("朵雲──海", false, false, "zh").tokenize(tokens);
+    ASSERT_EQ(3, tokens.size());
+    ASSERT_EQ("朵", tokens[0]);
+    ASSERT_EQ("云", tokens[1]);
+    ASSERT_EQ("海", tokens[2]);
+
+    tokens.clear();
+    Tokenizer("山丘上。媽媽", false, false, "zh").tokenize(tokens);
+    ASSERT_EQ(3, tokens.size());
+    ASSERT_EQ("山丘", tokens[0]);
+    ASSERT_EQ("上", tokens[1]);
+    ASSERT_EQ("妈妈", tokens[2]);
+}
+
 TEST(TokenizerTest, ShouldTokenizeLocaleText) {
     std::string tstr = "ลงรถไฟ";
     std::vector<std::string> ttokens;