Add a few locale tokenization tests.

Kishore Nallan 2021-06-09 11:46:05 +05:30
parent 8a43fa8b88
commit e695ba65c8
2 changed files with 66 additions and 6 deletions
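
For context, a minimal standalone sketch of the locale-aware tokenization these tests exercise. The argument order Tokenizer(text, normalize, no_op, locale), the argument roles, and the tokenizer.h include path are assumptions inferred from the calls in the diff below, not taken from documentation.

// Sketch only: mirrors the Tokenizer(text, normalize, no_op, locale) calls used
// in the tests below; header path and argument roles are assumptions.
#include <string>
#include <vector>
#include "tokenizer.h"

int main() {
    std::vector<std::string> tokens;

    // Thai ("th"): the phrase is split into word-level tokens; the test below
    // expects "จิ้งจอกสีน้ำตาลด่วน" -> จิ้งจอก / สี / น้ำตาล / ด่วน.
    Tokenizer("จิ้งจอกสีน้ำตาลด่วน", false, false, "th").tokenize(tokens);

    tokens.clear();

    // Japanese ("ja"): the test below expects segmentation plus hiragana
    // readings for some tokens (e.g. the first token of "今ぶり拍治ルツ" is "いま").
    Tokenizer("今ぶり拍治ルツ", false, false, "ja").tokenize(tokens);

    return 0;
}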


@@ -30,6 +30,43 @@ protected:
    }
};

TEST_F(CollectionLocaleTest, SearchAgainstJapaneseText) {
    Collection *coll1;

    std::vector<field> fields = {field("title", field_types::STRING, false, false, true, DEFAULT_GEO_RESOLUTION, "ja"),
                                 field("artist", field_types::STRING, false),
                                 field("points", field_types::INT32, false),};

    coll1 = collectionManager.get_collection("coll1").get();
    if(coll1 == nullptr) {
        coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get();
    }

    std::vector<std::vector<std::string>> records = {
        {"今ぶり拍治ルツ", "Dustin Kensrue"},
    };

    for(size_t i=0; i<records.size(); i++) {
        nlohmann::json doc;

        doc["id"] = std::to_string(i);
        doc["title"] = records[i][0];
        doc["artist"] = records[i][1];
        doc["points"] = i;

        ASSERT_TRUE(coll1->add(doc.dump()).ok());
    }

    auto results = coll1->search("拍治",
                                 {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY, true).get();

    //LOG(INFO) << results;

    ASSERT_EQ(1, results["found"].get<size_t>());
    ASSERT_EQ(1, results["hits"].size());
    ASSERT_EQ("0", results["hits"][0]["document"]["id"].get<std::string>());

    //ASSERT_EQ("今ぶり<mark>拍</mark><mark>治</mark>ルツ", results["hits"][0]["highlights"][0]["snippet"].get<std::string>());
}

TEST_F(CollectionLocaleTest, SearchAgainstChineseText) {
    Collection *coll1;


@@ -223,14 +223,21 @@ TEST(TokenizerTest, ShouldTokenizeChineseText) {
}

TEST(TokenizerTest, ShouldTokenizeLocaleText) {
    std::string tstr = "ลงรถไฟ";
    std::vector<std::string> ttokens;
    Tokenizer(tstr, true, false, "th").tokenize(ttokens);

    std::string str = "จิ้งจอกสีน้ำตาลด่วน";
    std::vector<std::string> tokens;
    Tokenizer(str, false, false, "th").tokenize(tokens);

    tokens.clear();
    std::string str = "ความเหลื่อมล้ำ";
    Tokenizer(str, true, false, "th").tokenize(tokens);
    //ASSERT_EQ(2, tokens.size());

    tokens.clear();
    str = "เหลื่";
    Tokenizer(str, false, false, "th").tokenize(tokens);
    //ASSERT_EQ(1, tokens.size());

    tokens.clear();
    str = "จิ้งจอกสีน้ำตาลด่วน";
    Tokenizer(str, false, false, "th").tokenize(tokens);
    ASSERT_EQ(4, tokens.size());
    ASSERT_EQ("จิ้งจอก", tokens[0]);
    ASSERT_EQ("สี", tokens[1]);
@@ -292,6 +299,22 @@ TEST(TokenizerTest, ShouldTokenizeLocaleText) {
    ASSERT_EQ("", tokens[1]);
    ASSERT_EQ("", tokens[2]);
    ASSERT_EQ("いぬ", tokens[3]);

    tokens.clear();
    Tokenizer("今ぶり拍治ルツ", false, false, "ja").tokenize(tokens);
    ASSERT_EQ(9, tokens.size());
    ASSERT_EQ("いま", tokens[0]);
    ASSERT_EQ("ぶり", tokens[1]);
    ASSERT_EQ("", tokens[2]);
    ASSERT_EQ("", tokens[3]);
    ASSERT_EQ("", tokens[4]);
    ASSERT_EQ("", tokens[5]);
    ASSERT_EQ("", tokens[6]);
    ASSERT_EQ("", tokens[7]);
    ASSERT_EQ("", tokens[8]);

    tokens.clear(); // 配管 ("plumbing/piping")
    Tokenizer("配管", false, false, "ja").tokenize(tokens);
}
TEST(TokenizerTest, ShouldTokenizeLocaleTextWithEnglishText) {