Mirror of https://github.com/typesense/typesense.git
commit e695ba65c8
parent 8a43fa8b88

    Add a few locale tokenization tests.
@@ -30,6 +30,43 @@ protected:
    }
};

TEST_F(CollectionLocaleTest, SearchAgainstJapaneseText) {
    Collection *coll1;

    // the trailing "ja" marks the title field for Japanese locale-aware tokenization
    std::vector<field> fields = {field("title", field_types::STRING, false, false, true, DEFAULT_GEO_RESOLUTION, "ja"),
                                 field("artist", field_types::STRING, false),
                                 field("points", field_types::INT32, false),};

    coll1 = collectionManager.get_collection("coll1").get();
    if(coll1 == nullptr) {
        coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get();
    }

    std::vector<std::vector<std::string>> records = {
        {"今ぶり拍治ルツ", "Dustin Kensrue"},
    };

    for(size_t i = 0; i < records.size(); i++) {
        nlohmann::json doc;

        doc["id"] = std::to_string(i);
        doc["title"] = records[i][0];
        doc["artist"] = records[i][1];
        doc["points"] = i;

        ASSERT_TRUE(coll1->add(doc.dump()).ok());
    }

    // a fragment from the middle of the title should still match
    auto results = coll1->search("拍治",
                                 {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY, true).get();

    //LOG(INFO) << results;
    ASSERT_EQ(1, results["found"].get<size_t>());
    ASSERT_EQ(1, results["hits"].size());
    ASSERT_EQ("0", results["hits"][0]["document"]["id"].get<std::string>());
    //ASSERT_EQ("今ぶり<mark>拍</mark><mark>治</mark>ルツ", results["hits"][0]["highlights"][0]["snippet"].get<std::string>());
}

TEST_F(CollectionLocaleTest, SearchAgainstChineseText) {
    Collection *coll1;
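An aside on what the "ja" locale plumbing above presumably rests on: ICU's dictionary-based word BreakIterator. The sketch below is plain ICU4C, not Typesense code; the segment() helper is a made-up name, and a production tokenizer would also drop the whitespace and punctuation pieces the iterator reports.

// seg_sketch.cpp -- plain ICU4C illustration, not a Typesense API.
// Build: g++ seg_sketch.cpp -licuuc
#include <unicode/brkiter.h>
#include <unicode/locid.h>
#include <unicode/unistr.h>
#include <iostream>
#include <memory>
#include <string>
#include <vector>

// Split `text` at the word boundaries ICU finds for `locale`.
std::vector<std::string> segment(const std::string& text, const char* locale) {
    UErrorCode status = U_ZERO_ERROR;
    std::unique_ptr<icu::BreakIterator> bi(
        icu::BreakIterator::createWordInstance(icu::Locale(locale), status));
    std::vector<std::string> tokens;
    if (U_FAILURE(status)) return tokens;

    icu::UnicodeString utext = icu::UnicodeString::fromUTF8(text);
    bi->setText(utext);

    // Walk consecutive boundaries and copy out each piece.
    int32_t start = bi->first();
    for (int32_t end = bi->next(); end != icu::BreakIterator::DONE;
         start = end, end = bi->next()) {
        icu::UnicodeString piece;
        utext.extractBetween(start, end, piece);
        std::string token;
        piece.toUTF8String(token);
        tokens.push_back(token);
    }
    return tokens;
}

int main() {
    // Thai: the Tokenizer tests further down expect 4 words for this
    // phrase, beginning with จิ้งจอก and สี.
    for (const auto& t : segment("จิ้งจอกสีน้ำตาลด่วน", "th")) std::cout << t << "\n";

    // Japanese: boundaries come from ICU's dictionary data, so the exact
    // pieces vary by ICU version; Typesense's own "ja" output differs (see below).
    for (const auto& t : segment("今ぶり拍治ルツ", "ja")) std::cout << t << "\n";
}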
@@ -223,14 +223,21 @@ TEST(TokenizerTest, ShouldTokenizeChineseText) {
}

TEST(TokenizerTest, ShouldTokenizeLocaleText) {
    std::string tstr = "ลงรถไฟ";
    std::vector<std::string> ttokens;
    Tokenizer(tstr, true, false, "th").tokenize(ttokens);

    std::string str = "จิ้งจอกสีน้ำตาลด่วน";  // Thai: "the quick brown fox"
    std::vector<std::string> tokens;
    Tokenizer(str, false, false, "th").tokenize(tokens);

    tokens.clear();
    str = "ความเหลื่อมล้ำ";  // Thai: "inequality"; reassigned, since `str` is already declared above
    Tokenizer(str, true, false, "th").tokenize(tokens);
    //ASSERT_EQ(2, tokens.size());

    tokens.clear();
    str = "เหลื่";
    Tokenizer(str, false, false, "th").tokenize(tokens);
    //ASSERT_EQ(1, tokens.size());

    tokens.clear();
    str = "จิ้งจอกสีน้ำตาลด่วน";
    Tokenizer(str, false, false, "th").tokenize(tokens);
    ASSERT_EQ(4, tokens.size());
    ASSERT_EQ("จิ้งจอก", tokens[0]);
    ASSERT_EQ("สี", tokens[1]);
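The commented-out expectation for "เหลื่" (a fragment that stops mid-word on combining marks) touches a real subtlety: Thai vowel and tone signs combine with the preceding consonant, so code-point counts and user-perceived character counts differ. A small plain-ICU4C sketch, nothing Typesense-specific, shows this with a character (grapheme-cluster) BreakIterator:

// grapheme_sketch.cpp -- plain ICU4C illustration.
// Build: g++ grapheme_sketch.cpp -licuuc
#include <unicode/brkiter.h>
#include <unicode/locid.h>
#include <unicode/unistr.h>
#include <iostream>
#include <memory>

int main() {
    UErrorCode status = U_ZERO_ERROR;
    std::unique_ptr<icu::BreakIterator> bi(
        icu::BreakIterator::createCharacterInstance(icu::Locale("th"), status));
    if (U_FAILURE(status)) return 1;

    // "เหลื่" is five code points (เ ห ล ื ่), but ื and ่ attach to ล,
    // so there are fewer grapheme clusters than code points.
    icu::UnicodeString text = icu::UnicodeString::fromUTF8("เหลื่");
    bi->setText(text);

    int32_t clusters = 0;
    bi->first();
    while (bi->next() != icu::BreakIterator::DONE) {
        ++clusters;
    }
    std::cout << text.countChar32() << " code points, "
              << clusters << " grapheme clusters\n";
}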
@@ -292,6 +299,22 @@ TEST(TokenizerTest, ShouldTokenizeLocaleText) {
    ASSERT_EQ("だ", tokens[1]);
    ASSERT_EQ("な", tokens[2]);
    ASSERT_EQ("いぬ", tokens[3]);

    tokens.clear();
    // expected tokens are hiragana readings: 今→いま, 拍→はく, 治→おさむ, ルツ→るつ
    Tokenizer("今ぶり拍治ルツ", false, false, "ja").tokenize(tokens);
    ASSERT_EQ(9, tokens.size());
    ASSERT_EQ("いま", tokens[0]);
    ASSERT_EQ("ぶり", tokens[1]);
    ASSERT_EQ("は", tokens[2]);
    ASSERT_EQ("く", tokens[3]);
    ASSERT_EQ("お", tokens[4]);
    ASSERT_EQ("さ", tokens[5]);
    ASSERT_EQ("む", tokens[6]);
    ASSERT_EQ("る", tokens[7]);
    ASSERT_EQ("つ", tokens[8]);

    tokens.clear();
    // segment "配管" ("piping"); no assertions yet, the call just needs to succeed
    Tokenizer("配管", false, false, "ja").tokenize(tokens);
}
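The asserted tokens above are hiragana readings (今→いま, 拍→はく, 治→おさむ, ルツ→るつ), so the "ja" path evidently normalizes text to hiragana before splitting. Converting kanji to readings requires a reading dictionary (a morphological analyzer), which plain ICU does not provide, but the katakana half of such a normalization can be sketched with an ICU Transliterator:

// hiragana_sketch.cpp -- plain ICU4C illustration; kanji-to-reading
// conversion (拍 -> はく) is out of scope for ICU transliteration rules.
// Build: g++ hiragana_sketch.cpp -licui18n -licuuc
#include <unicode/translit.h>
#include <unicode/unistr.h>
#include <iostream>
#include <memory>
#include <string>

int main() {
    UErrorCode status = U_ZERO_ERROR;
    std::unique_ptr<icu::Transliterator> trans(
        icu::Transliterator::createInstance(
            icu::UnicodeString("Katakana-Hiragana"), UTRANS_FORWARD, status));
    if (U_FAILURE(status)) return 1;

    icu::UnicodeString text = icu::UnicodeString::fromUTF8("ルツ");
    trans->transliterate(text);   // in-place: ルツ -> るつ

    std::string out;
    text.toUTF8String(out);
    std::cout << out << "\n";     // prints るつ
}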

TEST(TokenizerTest, ShouldTokenizeLocaleTextWithEnglishText) {