diff --git a/include/tokenizer.h b/include/tokenizer.h
index 36f3af90..53a2ab13 100644
--- a/include/tokenizer.h
+++ b/include/tokenizer.h
@@ -31,9 +31,14 @@ private:
     std::string locale;
     icu::BreakIterator* bi = nullptr;
     icu::UnicodeString unicode_text;
+
+    // tracks the start of a text segment that can span multiple unicode tokens due to the use of custom symbols
+    int32_t utf8_start_index = 0;
+
+    // tracks the current unicode segment for text extraction
     int32_t start_pos = 0;
     int32_t end_pos = 0;
-    int32_t utf8_start_index = 0;
+
     char* normalized_text = nullptr;
 
     // non-deletable singletons
@@ -78,4 +83,6 @@ public:
     static inline bool is_ascii_char(char c) {
         return (c & ~0x7f) == 0;
     }
+
+    void decr_token_counter();
 };
\ No newline at end of file
diff --git a/src/collection.cpp b/src/collection.cpp
index b8dbbac2..e6f34b81 100644
--- a/src/collection.cpp
+++ b/src/collection.cpp
@@ -2521,6 +2521,7 @@ bool Collection::handle_highlight_text(std::string& text, bool normalise, const
             if(is_cyrillic) {
                 bool found_token = word_tokenizer.tokenize(raw_token);
                 if(!found_token) {
+                    tokenizer.decr_token_counter();
                     continue;
                 }
             }
diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp
index 41aab66e..4eb357a5 100644
--- a/src/tokenizer.cpp
+++ b/src/tokenizer.cpp
@@ -106,10 +106,7 @@ bool Tokenizer::next(std::string &token, size_t& token_index, size_t& start_inde
     if(!locale.empty() && locale != "en") {
         while (end_pos != icu::BreakIterator::DONE) {
             //LOG(INFO) << "Position: " << start_pos;
-            bool found_token = false;
-
             std::string word;
-            //LOG(INFO) << "token: " << token;
 
             if(locale == "ko") {
                 UErrorCode errcode = U_ZERO_ERROR;
@@ -118,67 +115,95 @@ bool Tokenizer::next(std::string &token, size_t& token_index, size_t& start_inde
                 nfkd->normalize(src, dst, errcode);
 
                 if(!U_FAILURE(errcode)) {
-                    token = dst.toUTF8String(word);
+                    dst.toUTF8String(word);
                 } else {
                     LOG(ERROR) << "Unicode error during parsing: " << errcode;
                 }
             } else if(normalize && is_cyrillic(locale)) {
                 auto raw_text = unicode_text.tempSubStringBetween(start_pos, end_pos);
                 transliterator->transliterate(raw_text);
-                token = raw_text.toUTF8String(word);
+                raw_text.toUTF8String(word);
             } else if(locale == "th") {
                 UErrorCode errcode = U_ZERO_ERROR;
                 icu::UnicodeString src = unicode_text.tempSubStringBetween(start_pos, end_pos);
                 icu::UnicodeString dst;
                 nfkc->normalize(src, dst, errcode);
                 if(!U_FAILURE(errcode)) {
-                    token = dst.toUTF8String(word);
+                    dst.toUTF8String(word);
                 } else {
                     LOG(ERROR) << "Unicode error during parsing: " << errcode;
                 }
             } else {
-                token = unicode_text.tempSubStringBetween(start_pos, end_pos).toUTF8String(word);
+                unicode_text.tempSubStringBetween(start_pos, end_pos).toUTF8String(word);
             }
 
-            if(!token.empty()) {
-                if(token == " " || token == "," || token == "." || token == "!" || token == "?") {
|| token == "?") { - found_token = false; - } else if (!std::isalnum(token[0]) && is_ascii_char(token[0])) { - // ignore ascii symbols - found_token = false; - token_counter++; - } else if(locale == "ko" && token == "·") { - found_token = false; - token_counter++; - } else if(locale == "zh" && (token == "," || token == "─" || token == "。")) { - found_token = false; - token_counter++; - } else { + bool emit_token = false; - if(std::isalnum(token[0]) && is_ascii_char(token[0])) { - // normalize an ascii string - std::transform(token.begin(), token.end(), token.begin(), - [](unsigned char c){ return std::tolower(c); }); - } + // `word` can be either a multi-byte unicode sequence or an ASCII character + // ASCII character can be either a special character or English alphabet - found_token = true; - token_index = token_counter++; + if(is_ascii_char(word[0])) { + + if(std::isalnum(word[0])) { + // normalize an ascii string and emit word as token + std::transform(word.begin(), word.end(), word.begin(), + [](unsigned char c){ return std::tolower(c); }); + out += word; + emit_token = true; } - start_index = utf8_start_index; - end_index = utf8_start_index + token.size() - 1; - utf8_start_index = end_index + 1; + else { + // special character: + // a) present in `index_symbols` -> append word to out and continue iteration + // b) present in `separator_symbols` -> skip word + // c) not present in either -> skip word + if(index_symbols[uint8_t(word[0])] == 1) { + out += word; + emit_token = true; + } + } + + + } else { + if(locale == "zh" && (word == "," || word == "─" || word == "。")) { + emit_token = false; + } else if(locale == "ko" && word == "·") { + emit_token = false; + } else { + emit_token = true; + out += word; + } } + if(emit_token) { + token = out; + token_index = token_counter++; + out.clear(); + } + + start_index = utf8_start_index; + end_index = utf8_start_index + word.size() - 1; + utf8_start_index = end_index + 1; + start_pos = end_pos; end_pos = bi->next(); - if(found_token) { + if(emit_token) { return true; } } - return false; + token = out; + out.clear(); + start_index = utf8_start_index; + end_index = text.size() - 1; + + if(token.empty()) { + return false; + } + + token_index = token_counter++; + return true; } while(i < text.size()) { @@ -303,3 +328,9 @@ bool Tokenizer::is_cyrillic(const std::string& locale) { return locale == "el" || locale == "ru" || locale == "sr" || locale == "uk" || locale == "be"; } + +void Tokenizer::decr_token_counter() { + if(token_counter > 0) { + token_counter--; + } +} diff --git a/test/collection_locale_test.cpp b/test/collection_locale_test.cpp index 4940f2b9..6095a97d 100644 --- a/test/collection_locale_test.cpp +++ b/test/collection_locale_test.cpp @@ -220,6 +220,48 @@ TEST_F(CollectionLocaleTest, ThaiTextShouldBeNormalizedToNFKC) { ASSERT_EQ(1, results["found"].get()); } +TEST_F(CollectionLocaleTest, ThaiTextShouldRespectSeparators) { + nlohmann::json coll_json = R"({ + "name": "coll1", + "fields": [ + {"name": "title", "type": "string", "locale": "th"} + ] + })"_json; + + auto coll1 = collectionManager.create_collection(coll_json).get(); + + nlohmann::json doc; + doc["title"] = "alpha-beta-gamma"; + ASSERT_TRUE(coll1->add(doc.dump()).ok()); + + auto results = coll1->search("*",{}, "title:=alpha-beta-gamma", {}, {}, + {0}, 10, 1, FREQUENCY).get(); + + ASSERT_EQ(1, results["found"].get()); + + // now with `symbols_to_index` + coll_json = R"({ + "name": "coll2", + "symbols_to_index": ["-"], + "fields": [ + {"name": "title", "type": "string", 
"locale": "th"} + ] + })"_json; + + auto coll2 = collectionManager.create_collection(coll_json).get(); + ASSERT_TRUE(coll2->add(doc.dump()).ok()); + + results = coll2->search("*",{}, "title:=alpha-beta-gamma", {}, {}, + {0}, 10, 1, FREQUENCY).get(); + + ASSERT_EQ(1, results["found"].get()); + + results = coll2->search("*",{}, "title:=alphabetagamma", {}, {}, + {0}, 10, 1, FREQUENCY).get(); + + ASSERT_EQ(0, results["found"].get()); +} + TEST_F(CollectionLocaleTest, SearchThaiTextPreSegmentedQuery) { Collection *coll1; diff --git a/test/tokenizer_test.cpp b/test/tokenizer_test.cpp index 258a7a17..12dd6984 100644 --- a/test/tokenizer_test.cpp +++ b/test/tokenizer_test.cpp @@ -325,3 +325,35 @@ TEST(TokenizerTest, ShouldTokenizeLocaleTextWithEnglishText) { ASSERT_EQ("discrete", ttokens[7]); ASSERT_EQ("math", ttokens[8]); } + +TEST(TokenizerTest, ShouldTokenizeWithDifferentSymbolConfigs) { + std::string str1 = "ความ-เหลื่อมล้ำ"; + + // '-' in symbols_to_index: "ความ", "-", "เหลื่อม", "ล้ำ" + // '-' in separators: "ความ", "เหลื่อม", "ล้ำ" + // 'none: "ความ", "เหลื่อม", "ล้ำ" + + std::vector tokens; + Tokenizer(str1, true, false, "th", {'-'}, {}).tokenize(tokens); + ASSERT_EQ(4, tokens.size()); + ASSERT_EQ("ความ", tokens[0]); + ASSERT_EQ("-", tokens[1]); + ASSERT_EQ("เหลื่อม", tokens[2]); + ASSERT_EQ("ล้ํา", tokens[3]); + + tokens.clear(); + Tokenizer(str1, true, false, "th", {}, {'-'}).tokenize(tokens); + ASSERT_EQ(3, tokens.size()); + ASSERT_EQ("ความ", tokens[0]); + ASSERT_EQ("เหลื่อม", tokens[1]); + ASSERT_EQ("ล้ํา", tokens[2]); + + tokens.clear(); + Tokenizer(str1, true, false, "th", {}, {}).tokenize(tokens); + ASSERT_EQ(3, tokens.size()); + ASSERT_EQ("ความ", tokens[0]); + ASSERT_EQ("เหลื่อม", tokens[1]); + ASSERT_EQ("ล้ํา", tokens[2]); + + LOG(INFO) << "here"; +}