diff --git a/test/collection_locale_test.cpp b/test/collection_locale_test.cpp
index 47ff4dce..a2738cfc 100644
--- a/test/collection_locale_test.cpp
+++ b/test/collection_locale_test.cpp
@@ -30,6 +30,43 @@ protected:
     }
 };
 
+TEST_F(CollectionLocaleTest, SearchAgainstJapaneseText) {
+    Collection *coll1;
+
+    std::vector<field> fields = {field("title", field_types::STRING, false, false, true, DEFAULT_GEO_RESOLUTION, "ja"),
+                                 field("artist", field_types::STRING, false),
+                                 field("points", field_types::INT32, false),};
+
+    coll1 = collectionManager.get_collection("coll1").get();
+    if(coll1 == nullptr) {
+        coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get();
+    }
+
+    std::vector<std::vector<std::string>> records = {
+        {"今ぶり拍治ルツ", "Dustin Kensrue"},
+    };
+
+    for(size_t i=0; i<records.size(); i++) {
+        nlohmann::json doc;
+
+        doc["id"] = std::to_string(i);
+        doc["title"] = records[i][0];
+        doc["artist"] = records[i][1];
+        doc["points"] = i;
+
+        ASSERT_TRUE(coll1->add(doc.dump()).ok());
+    }
+
+    auto results = coll1->search("拍治",
+                                 {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY, true).get();
+
+    //LOG(INFO) << results;
+    ASSERT_EQ(1, results["found"].get<size_t>());
+    ASSERT_EQ(1, results["hits"].size());
+    ASSERT_EQ("0", results["hits"][0]["document"]["id"].get<std::string>());
+    //ASSERT_EQ("今ぶりルツ", results["hits"][0]["highlights"][0]["snippet"].get<std::string>());
+}
+
 TEST_F(CollectionLocaleTest, SearchAgainstChineseText) {
     Collection *coll1;
 
diff --git a/test/tokenizer_test.cpp b/test/tokenizer_test.cpp
index 36b846d7..b1af9d99 100644
--- a/test/tokenizer_test.cpp
+++ b/test/tokenizer_test.cpp
@@ -223,14 +223,21 @@ TEST(TokenizerTest, ShouldTokenizeChineseText) {
 }
 
 TEST(TokenizerTest, ShouldTokenizeLocaleText) {
-    std::string tstr = "ลงรถไฟ";
-    std::vector<std::string> ttokens;
-    Tokenizer(tstr, true, false, "th").tokenize(ttokens);
-
-    std::string str = "จิ้งจอกสีน้ำตาลด่วน";
     std::vector<std::string> tokens;
-    Tokenizer(str, false, false, "th").tokenize(tokens);
+    tokens.clear();
+    std::string str = "ความเหลื่อมล้ำ";
+    Tokenizer(str, true, false, "th").tokenize(tokens);
+    //ASSERT_EQ(2, tokens.size());
+
+    tokens.clear();
+    str = "เหลื่";
+    Tokenizer(str, false, false, "th").tokenize(tokens);
+    //ASSERT_EQ(1, tokens.size());
+
+    tokens.clear();
+    str = "จิ้งจอกสีน้ำตาลด่วน";
+    Tokenizer(str, false, false, "th").tokenize(tokens);
 
     ASSERT_EQ(4, tokens.size());
     ASSERT_EQ("จิ้งจอก", tokens[0]);
     ASSERT_EQ("สี", tokens[1]);
@@ -292,6 +299,22 @@ TEST(TokenizerTest, ShouldTokenizeLocaleText) {
     ASSERT_EQ("だ", tokens[1]);
     ASSERT_EQ("な", tokens[2]);
     ASSERT_EQ("いぬ", tokens[3]);
+
+    tokens.clear();
+    Tokenizer("今ぶり拍治ルツ", false, false, "ja").tokenize(tokens);
+    ASSERT_EQ(9, tokens.size());
+    ASSERT_EQ("いま", tokens[0]);
+    ASSERT_EQ("ぶり", tokens[1]);
+    ASSERT_EQ("は", tokens[2]);
+    ASSERT_EQ("く", tokens[3]);
+    ASSERT_EQ("お", tokens[4]);
+    ASSERT_EQ("さ", tokens[5]);
+    ASSERT_EQ("む", tokens[6]);
+    ASSERT_EQ("る", tokens[7]);
+    ASSERT_EQ("つ", tokens[8]);
+
+    tokens.clear(); // 配管
+    Tokenizer("配管", false, false, "ja").tokenize(tokens);
 }
 
 TEST(TokenizerTest, ShouldTokenizeLocaleTextWithEnglishText) {