diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp
index 221fbb94..cec5ffcd 100644
--- a/src/tokenizer.cpp
+++ b/src/tokenizer.cpp
@@ -160,13 +160,25 @@ bool Tokenizer::next(std::string &token, size_t& token_index, size_t& start_inde
         // `word` can be either a multi-byte unicode sequence or an ASCII character
         // ASCII character can be either a special character or English alphabet
 
+        size_t orig_word_size = word.size();
         if(is_ascii_char(word[0])) {
-            if(std::isalnum(word[0])) {
             // normalize an ascii string and emit word as token
-            std::transform(word.begin(), word.end(), word.begin(),
-                           [](unsigned char c){ return std::tolower(c); });
+            size_t read_index = 0, write_index = 0;
+
+            while (read_index < word.size()) {
+                size_t this_stream_mode = get_stream_mode(word[read_index]);
+                if(this_stream_mode != SKIP) {
+                    word[write_index++] = std::tolower(word[read_index]);
+                }
+
+                read_index++;
+            }
+
+            // resize to fit new length
+            word.resize(write_index);
+
             out += word;
             emit_token = true;
         }
@@ -181,8 +193,6 @@ bool Tokenizer::next(std::string &token, size_t& token_index, size_t& start_inde
                 emit_token = true;
             }
         }
-
-
         } else {
             if(locale == "zh" && (word == "," || word == "─" || word == "。")) {
                 emit_token = false;
@@ -201,7 +211,7 @@
         }
 
         start_index = utf8_start_index;
-        end_index = utf8_start_index + word.size() - 1;
+        end_index = utf8_start_index + orig_word_size - 1;
         utf8_start_index = end_index + 1;
 
         start_pos = end_pos;
diff --git a/test/collection_locale_test.cpp b/test/collection_locale_test.cpp
index fbb19a75..93c6b6bc 100644
--- a/test/collection_locale_test.cpp
+++ b/test/collection_locale_test.cpp
@@ -905,6 +905,29 @@ TEST_F(CollectionLocaleTest, SearchInGermanLocaleShouldBeTypoTolerant) {
     ASSERT_EQ(1, results["found"].get<size_t>());
 }
 
+TEST_F(CollectionLocaleTest, HandleSpecialCharsInThai) {
+    nlohmann::json coll_json = R"({
+        "name": "coll1",
+        "fields": [
+            {"name": "title_th", "type": "string", "locale": "th"},
+            {"name": "sku", "type": "string"}
+        ]
+    })"_json;
+
+    auto coll1 = collectionManager.create_collection(coll_json).get();
+
+    nlohmann::json doc;
+    doc["title_th"] = "สวัสดี";
+    doc["sku"] = "12345_";
+    ASSERT_TRUE(coll1->add(doc.dump()).ok());
+
+    // query string is parsed using the locale of the first field in the query_by list
+    auto results = coll1->search("12345_", {"title_th", "sku"}, "", {}, {},
+                                 {2, 0}, 10, 1, FREQUENCY, {true, false}, 1).get();
+
+    ASSERT_EQ(1, results["found"].get<size_t>());
+}
+
 /*
 TEST_F(CollectionLocaleTest, TranslitPad) {
     UErrorCode translit_status = U_ZERO_ERROR;
diff --git a/test/tokenizer_test.cpp b/test/tokenizer_test.cpp
index 054df18b..3d584b3d 100644
--- a/test/tokenizer_test.cpp
+++ b/test/tokenizer_test.cpp
@@ -254,6 +254,12 @@ TEST(TokenizerTest, ShouldTokenizeLocaleText) {
     ASSERT_EQ("31", tokens[3]);
     ASSERT_EQ("มี.ค", tokens[4]);
 
+    tokens.clear();
+    str = "12345_678";
+    Tokenizer(str, false, false, "th").tokenize(tokens);
+    ASSERT_EQ(1, tokens.size());
+    ASSERT_EQ("12345678", tokens[0]);
+
     tokens.clear();
     Tokenizer("Odd Thomas", false, false, "en").tokenize(tokens);
     ASSERT_EQ(2, tokens.size());
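
For reference, the tokenizer change above normalizes an ASCII word with an in-place read/write-index pass: characters whose stream mode is SKIP (such as `_`) are dropped, the remaining characters are lowercased, and the string is shrunk to the number of characters kept, while `orig_word_size` preserves the original length for the UTF-8 offset bookkeeping. The following is a minimal standalone sketch of that technique; `is_skippable` and `normalize_ascii_word` are hypothetical stand-ins for illustration and are not part of the Typesense code.

    #include <cctype>
    #include <iostream>
    #include <string>

    // Hypothetical stand-in for the tokenizer's get_stream_mode(c) == SKIP check:
    // treat any non-alphanumeric ASCII character (e.g. '_') as skippable.
    static bool is_skippable(unsigned char c) {
        return std::isalnum(c) == 0;
    }

    // Drop skippable characters and lowercase the rest, in place,
    // mirroring the read_index/write_index loop added in tokenizer.cpp.
    static void normalize_ascii_word(std::string& word) {
        size_t read_index = 0, write_index = 0;

        while (read_index < word.size()) {
            unsigned char c = static_cast<unsigned char>(word[read_index]);
            if (!is_skippable(c)) {
                word[write_index++] = static_cast<char>(std::tolower(c));
            }
            read_index++;
        }

        // resize to fit the number of characters actually kept
        word.resize(write_index);
    }

    int main() {
        std::string word = "12345_678";
        size_t orig_word_size = word.size();   // 9: original length, kept for offset bookkeeping
        normalize_ascii_word(word);
        std::cout << word << " (was " << orig_word_size << " chars)\n";  // prints "12345678 (was 9 chars)"
        return 0;
    }

This is also why `end_index` in the patch is computed from `orig_word_size` rather than the filtered `word.size()`: the recorded token offsets still span the original text, including any characters that were dropped.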