diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp
index 221fbb94..cec5ffcd 100644
--- a/src/tokenizer.cpp
+++ b/src/tokenizer.cpp
@@ -160,13 +160,25 @@ bool Tokenizer::next(std::string &token, size_t& token_index, size_t& start_inde
         // `word` can be either a multi-byte unicode sequence or an ASCII character
         // ASCII character can be either a special character or English alphabet
 
+        size_t orig_word_size = word.size();
         if(is_ascii_char(word[0])) {
-            if(std::isalnum(word[0])) {
             // normalize an ascii string and emit word as token
-            std::transform(word.begin(), word.end(), word.begin(),
-                           [](unsigned char c){ return std::tolower(c); });
+            size_t read_index = 0, write_index = 0;
+
+            while (read_index < word.size()) {
+                size_t this_stream_mode = get_stream_mode(word[read_index]);
+                if(this_stream_mode != SKIP) {
+                    word[write_index++] = std::tolower(word[read_index]);
+                }
+
+                read_index++;
+            }
+
+            // resize to fit new length
+            word.resize(write_index);
+
             out += word;
             emit_token = true;
         }
@@ -181,8 +193,6 @@ bool Tokenizer::next(std::string &token, size_t& token_index, size_t& start_inde
                 emit_token = true;
             }
         }
-
-
         } else {
             if(locale == "zh" && (word == "," || word == "─" || word == "。")) {
                 emit_token = false;
@@ -201,7 +211,7 @@
         }
 
         start_index = utf8_start_index;
-        end_index = utf8_start_index + word.size() - 1;
+        end_index = utf8_start_index + orig_word_size - 1;
         utf8_start_index = end_index + 1;
 
         start_pos = end_pos;
diff --git a/test/collection_locale_test.cpp b/test/collection_locale_test.cpp
index fbb19a75..93c6b6bc 100644
--- a/test/collection_locale_test.cpp
+++ b/test/collection_locale_test.cpp
@@ -905,6 +905,29 @@ TEST_F(CollectionLocaleTest, SearchInGermanLocaleShouldBeTypoTolerant) {
     ASSERT_EQ(1, results["found"].get<size_t>());
 }
 
+TEST_F(CollectionLocaleTest, HandleSpecialCharsInThai) {
+    nlohmann::json coll_json = R"({
+        "name": "coll1",
+        "fields": [
+            {"name": "title_th", "type": "string", "locale": "th"},
+            {"name": "sku", "type": "string"}
+        ]
+    })"_json;
+
+    auto coll1 = collectionManager.create_collection(coll_json).get();
+
+    nlohmann::json doc;
+    doc["title_th"] = "สวัสดี";
+    doc["sku"] = "12345_";
+    ASSERT_TRUE(coll1->add(doc.dump()).ok());
+
+    // query string is parsed using the locale of the first field in the query_by list
+    auto results = coll1->search("12345_", {"title_th", "sku"}, "", {}, {},
+                                 {2, 0}, 10, 1, FREQUENCY, {true, false}, 1).get();
+
+    ASSERT_EQ(1, results["found"].get<size_t>());
+}
+
 /*
 TEST_F(CollectionLocaleTest, TranslitPad) {
     UErrorCode translit_status = U_ZERO_ERROR;
diff --git a/test/tokenizer_test.cpp b/test/tokenizer_test.cpp
index 054df18b..3d584b3d 100644
--- a/test/tokenizer_test.cpp
+++ b/test/tokenizer_test.cpp
@@ -254,6 +254,12 @@ TEST(TokenizerTest, ShouldTokenizeLocaleText) {
     ASSERT_EQ("31", tokens[3]);
     ASSERT_EQ("มี.ค", tokens[4]);
 
+    tokens.clear();
+    str = "12345_678";
+    Tokenizer(str, false, false, "th").tokenize(tokens);
+    ASSERT_EQ(1, tokens.size());
+    ASSERT_EQ("12345678", tokens[0]);
+
     tokens.clear();
     Tokenizer("Odd Thomas", false, false, "en").tokenize(tokens);
     ASSERT_EQ(2, tokens.size());
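
For reference, the tokenizer change above normalizes an ASCII word with an in-place read/write-index pass: characters whose stream mode is SKIP (such as `_`) are dropped, the remaining characters are lowercased, and the string is shrunk to the number of characters kept, while `orig_word_size` preserves the original length for the UTF-8 offset bookkeeping. The following is a minimal standalone sketch of that technique; `is_skippable` and `normalize_ascii_word` are hypothetical stand-ins for illustration and are not part of the Typesense code.

    #include <cctype>
    #include <iostream>
    #include <string>

    // Hypothetical stand-in for the tokenizer's get_stream_mode(c) == SKIP check:
    // treat any non-alphanumeric ASCII character (e.g. '_') as skippable.
    static bool is_skippable(unsigned char c) {
        return std::isalnum(c) == 0;
    }

    // Drop skippable characters and lowercase the rest, in place,
    // mirroring the read_index/write_index loop added in tokenizer.cpp.
    static void normalize_ascii_word(std::string& word) {
        size_t read_index = 0, write_index = 0;

        while (read_index < word.size()) {
            unsigned char c = static_cast<unsigned char>(word[read_index]);
            if (!is_skippable(c)) {
                word[write_index++] = static_cast<char>(std::tolower(c));
            }
            read_index++;
        }

        // resize to fit the number of characters actually kept
        word.resize(write_index);
    }

    int main() {
        std::string word = "12345_678";
        size_t orig_word_size = word.size();   // 9: original length, kept for offset bookkeeping
        normalize_ascii_word(word);
        std::cout << word << " (was " << orig_word_size << " chars)\n";  // prints "12345678 (was 9 chars)"
        return 0;
    }

This is also why `end_index` in the patch is computed from `orig_word_size` rather than the filtered `word.size()`: the recorded token offsets still span the original text, including any characters that were dropped.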