diff --git a/include/index.h b/include/index.h
index 854605d1..d3bff529 100644
--- a/include/index.h
+++ b/include/index.h
@@ -312,7 +312,6 @@ public:
     void scrub_reindex_doc(nlohmann::json& update_doc, nlohmann::json& del_doc, nlohmann::json& old_doc);
 
-    void tokenize_doc_field(const nlohmann::json& document, const std::string& field_name, const field& search_field,
-                            std::vector<std::string>& tokens);
+    void tokenize_doc_field(const nlohmann::json& document, const field& search_field, std::vector<std::string>& tokens);
 };
diff --git a/include/string_utils.h b/include/string_utils.h
index b7bfb207..2f2276be 100644
--- a/include/string_utils.h
+++ b/include/string_utils.h
@@ -291,5 +291,5 @@ struct StringUtils {
     static std::string hmac(const std::string& key, const std::string& msg);
 
-    static size_t unicode_length(const std::string& bytes);
+    //static size_t unicode_length(const std::string& bytes);
 };
\ No newline at end of file
diff --git a/include/tokenizer.h b/include/tokenizer.h
new file mode 100644
index 00000000..4649d9b0
--- /dev/null
+++ b/include/tokenizer.h
@@ -0,0 +1,27 @@
+#pragma once
+
+#include <string>
+#include <vector>
+#include <iconv.h>
+
+class Tokenizer {
+private:
+    const std::string& text;
+    size_t i;
+    const bool keep_empty;
+    const bool normalize;
+
+    size_t token_counter = 0;
+    iconv_t cd;
+
+public:
+
+    Tokenizer(const std::string& input, bool keep_empty=true, bool normalize=true):
+            text(input), i(0), keep_empty(keep_empty), normalize(normalize) {
+        cd = iconv_open("ASCII//TRANSLIT", "UTF-8");
+    }
+
+    bool next(std::string& token, size_t& token_index);
+
+    void tokenize(std::vector<std::string>& tokens);
+};
\ No newline at end of file
diff --git a/src/index.cpp b/src/index.cpp
index adb81c6b..11eff26e 100644
--- a/src/index.cpp
+++ b/src/index.cpp
@@ -8,6 +8,7 @@
 #include
 #include
 #include
+#include <tokenizer.h>
 #include "logger.h"
 
 Index::Index(const std::string name, const std::unordered_map<std::string, field> & search_schema,
@@ -337,8 +338,8 @@ void Index::scrub_reindex_doc(nlohmann::json& update_doc, nlohmann::json& del_do
         // Go through all the field names and find the keys+values so that they can be removed from in-memory index
         std::vector<std::string> reindex_tokens;
         std::vector<std::string> old_tokens;
-        tokenize_doc_field(update_doc, field_name, search_field, reindex_tokens);
-        tokenize_doc_field(old_doc, field_name, search_field, old_tokens);
+        tokenize_doc_field(update_doc, search_field, reindex_tokens);
+        tokenize_doc_field(old_doc, search_field, old_tokens);
 
         if(old_tokens.size() != reindex_tokens.size()) {
             ++it;
@@ -556,28 +557,23 @@ uint64_t Index::facet_token_hash(const field & a_field, const std::string &token
 void Index::index_string_field(const std::string & text, const int64_t score, art_tree *t,
                                uint32_t seq_id, int facet_id, const field & a_field) {
-    std::vector<std::string> tokens;
-    StringUtils::split(text, tokens, " ", a_field.is_string());
-
     std::unordered_map<std::string, std::vector<uint32_t>> token_to_offsets;
 
-    for(size_t i=0; i<tokens.size(); i++) {
-        auto & token = tokens[i];
-        string_utils.unicode_normalize(token);
+    Tokenizer tokenizer(text, true, a_field.is_string());
+    std::string token;
+    size_t token_index = 0;
 
+    while(tokenizer.next(token, token_index)) {
         if(facet_id >= 0) {
             uint64_t hash = facet_token_hash(a_field, token);
             facet_index_v2[seq_id][facet_id].push_back(hash);
         }
 
-        token_to_offsets[token].push_back(i);
+        token_to_offsets[token].push_back(token_index);
     }
 
     /*if(seq_id == 0) {
@@ -596,32 +592,26 @@ void Index::index_string_array_field(const std::vector<std::string> & strings, c
     std::unordered_map<std::string, std::vector<uint32_t>> token_positions;
 
     for(size_t array_index = 0; array_index < strings.size(); array_index++) {
-        const std::string & str = strings[array_index];
-        std::vector<std::string> tokens;
-        std::string delim = " ";
-        StringUtils::split(str, tokens, delim, a_field.is_string());
-
+        const std::string& str = strings[array_index];
         std::set<std::string> token_set;  // required to deal with repeating tokens
-        // iterate and append offset positions
-        for(size_t i=0; i<tokens.size(); i++) {
-            auto & token = tokens[i];
-            string_utils.unicode_normalize(token);
+        Tokenizer tokenizer(str, true, a_field.is_string());
+        std::string token;
+        size_t token_index = 0;
 
+        while(tokenizer.next(token, token_index)) {
             if(facet_id >= 0) {
                 uint64_t hash = facet_token_hash(a_field, token);
                 facet_index_v2[seq_id][facet_id].push_back(hash);
                 //printf("indexing %.*s - %llu\n", token.size(), token.c_str(), hash);
             }
 
-            token_positions[token].push_back(i);
+            token_positions[token].push_back(token_index);
             token_set.insert(token);
         }
 
@@ -630,13 +620,13 @@ void Index::index_string_array_field(const std::vector<std::string> & strings, c
         }
 
         // repeat last element to indicate end of offsets for this array index
-        for(auto & token: token_set) {
-            token_positions[token].push_back(token_positions[token].back());
+        for(auto & the_token: token_set) {
+            token_positions[the_token].push_back(token_positions[the_token].back());
         }
 
         // iterate and append this array index to all tokens
-        for(auto & token: token_set) {
-            token_positions[token].push_back(array_index);
+        for(auto & the_token: token_set) {
+            token_positions[the_token].push_back(array_index);
         }
     }
 
@@ -750,17 +740,14 @@ void Index::do_facets(std::vector<facet> & facets, facet_query_t & facet_query,
             }
         }
 
-        std::vector<std::string> query_tokens;
-        StringUtils::split(facet_query.query, query_tokens, " ");
-
         // for non-string fields, `faceted_name` returns their aliased stringified field name
         art_tree *t = search_index.at(facet_field.faceted_name());
 
+        std::vector<std::string> query_tokens;
+        Tokenizer(facet_query.query, false, facet_field.is_string()).tokenize(query_tokens);
+
         for (size_t qtoken_index = 0; qtoken_index < query_tokens.size(); qtoken_index++) {
             auto &q = query_tokens[qtoken_index];
-            if (facet_field.is_string()) {
-                string_utils.unicode_normalize(q);
-            }
 
             int bounded_cost = (q.size() < 3) ? 0 : 1;
             bool prefix_search = (qtoken_index ==
@@ -1062,9 +1049,6 @@ Option<uint32_t> Index::do_filtering(uint32_t** filter_ids_out, const std::vecto
             size_t ids_size = 0;
 
             for(const std::string & filter_value: a_filter.values) {
-                std::vector<std::string> str_tokens;
-                StringUtils::split(filter_value, str_tokens, " ");
-
                 uint32_t* strt_ids = nullptr;
                 size_t strt_ids_size = 0;
 
@@ -1072,8 +1056,14 @@ Option<uint32_t> Index::do_filtering(uint32_t** filter_ids_out, const std::vecto
 
                 // there could be multiple tokens in a filter value, which we have to treat as ANDs
                 // e.g. country: South Africa
-                for(auto& str_token: str_tokens) {
-                    string_utils.unicode_normalize(str_token);
+
+                Tokenizer tokenizer(filter_value, false, true);
+                std::string str_token;
+                size_t token_index = 0;
+                std::vector<std::string> str_tokens;
+
+                while(tokenizer.next(str_token, token_index)) {
+                    str_tokens.push_back(str_token);
 
                     art_leaf* leaf = (art_leaf *) art_search(t, (const unsigned char*) str_token.c_str(),
                                                              str_token.length()+1);
@@ -1296,15 +1286,15 @@ void Index::collate_included_ids(const std::string & query, const std::string &
     }
 
     // calculate match_score and add to topster independently
-    std::vector<std::string> tokens;
-    StringUtils::split(query, tokens, " ");
 
    std::vector<art_leaf*> override_query;
 
-    for(size_t token_index = 0; token_index < tokens.size(); token_index++) {
-        const auto token = tokens[token_index];
+    Tokenizer tokenizer(query, false, true);
+    std::string token;
+    size_t token_index = 0;
+
+    while(tokenizer.next(token, token_index)) {
        const size_t token_len = token.length();
 
-        string_utils.unicode_normalize(tokens[token_index]);
 
        std::vector<art_leaf*> leaves;
        art_fuzzy_search(search_index.at(field), (const unsigned char *) token.c_str(), token_len,
@@ -1483,8 +1473,6 @@ void Index::search_field(const uint8_t & field_id, const std::string & query, co
                          uint32_t** all_result_ids, size_t & all_result_ids_len, const token_ordering token_order,
                          const bool prefix, const size_t drop_tokens_threshold, const size_t typo_tokens_threshold) {
-    std::vector<std::string> tokens;
-    StringUtils::split(query, tokens, " ");
 
     const size_t max_cost = (num_typos < 0 || num_typos > 2) ? 2 : num_typos;
 
@@ -1496,19 +1484,22 @@ void Index::search_field(const uint8_t & field_id, const std::string & query, co
 
     std::vector<std::vector<int>> token_to_costs;
 
-    for(size_t token_index = 0; token_index < tokens.size(); token_index++) {
-        std::vector<int> all_costs;
-        const size_t token_len = tokens[token_index].length();
+    Tokenizer tokenizer(query, false, true);
+    std::string token;
+    size_t token_index = 0;
+    std::vector<std::string> tokens;
 
+    while(tokenizer.next(token, token_index)) {
+        std::vector<int> all_costs;
         // This ensures that we don't end up doing a cost of 1 for a single char etc.
-        int bounded_cost = get_bounded_typo_cost(max_cost, token_len);
+        int bounded_cost = get_bounded_typo_cost(max_cost, token.length());
 
         for(int cost = 0; cost <= bounded_cost; cost++) {
             all_costs.push_back(cost);
         }
 
         token_to_costs.push_back(all_costs);
-        string_utils.unicode_normalize(tokens[token_index]);
+        tokens.push_back(token);
     }
 
     // stores candidates for each token, i.e. i-th index would have all possible tokens with a cost of "c"
@@ -1934,14 +1925,13 @@ Option<uint32_t> Index::remove(const uint32_t seq_id, const nlohmann::json & doc
 
         // Go through all the field names and find the keys+values so that they can be removed from in-memory index
         std::vector<std::string> tokens;
-        tokenize_doc_field(document, field_name, search_field, tokens);
+        tokenize_doc_field(document, search_field, tokens);
 
         for(auto & token: tokens) {
             const unsigned char *key;
             int key_len;
 
             if(search_field.type == field_types::STRING_ARRAY || search_field.type == field_types::STRING) {
-                string_utils.unicode_normalize(token);
                 key = (const unsigned char *) token.c_str();
                 key_len = (int) (token.length() + 1);
             } else {
@@ -1998,14 +1988,17 @@ Option<uint32_t> Index::remove(const uint32_t seq_id, const nlohmann::json & doc
     return Option<uint32_t>(seq_id);
 }
 
-void Index::tokenize_doc_field(const nlohmann::json& document, const std::string& field_name, const field& search_field,
+void Index::tokenize_doc_field(const nlohmann::json& document, const field& search_field,
                                std::vector<std::string>& tokens) {
+
+    const std::string& field_name = search_field.name;
+
     if(search_field.type == field_types::STRING) {
-        StringUtils::split(document[field_name], tokens, " ", true);
+        Tokenizer(document[field_name], true, search_field.is_string()).tokenize(tokens);
     } else if(search_field.type == field_types::STRING_ARRAY) {
         const std::vector<std::string>& values = document[field_name].get<std::vector<std::string>>();
         for(const std::string & value: values) {
-            StringUtils::split(value, tokens, " ", true);
+            Tokenizer(value, true, search_field.is_string()).tokenize(tokens);
         }
     } else if(search_field.type == field_types::INT32) {
         const int KEY_LEN = 8;
diff --git a/src/string_utils.cpp b/src/string_utils.cpp
index 39246d0f..da0a0582 100644
--- a/src/string_utils.cpp
+++ b/src/string_utils.cpp
@@ -3,7 +3,6 @@
 #include
 #include
 #include
-#include <codecvt>
 
 std::string lower_and_no_special_chars(const std::string & str) {
     std::stringstream ss;
@@ -31,6 +30,12 @@ void StringUtils::unicode_normalize(std::string & str) const {
         char inbuf[5];
         char *p = inbuf;
 
+        if((*s & ~0x7f) == 0 ) {
+            // ascii character
+            out << *s++;
+            continue;
+        }
+
         // group bytes to form a unicode representation
         *p++ = *s++;
         if ((*s & 0xC0) == 0x80) *p++ = *s++;
@@ -101,7 +106,7 @@ std::string StringUtils::str2hex(const std::string &str, bool capital) {
     return hexstr;
 }
 
-size_t StringUtils::unicode_length(const std::string& bytes) {
+/*size_t StringUtils::unicode_length(const std::string& bytes) {
     std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> utf8conv;
     return utf8conv.from_bytes(bytes).size();
-}
+}*/
diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp
new file mode 100644
index 00000000..713209d4
--- /dev/null
+++ b/src/tokenizer.cpp
@@ -0,0 +1,103 @@
+#include <sstream>
+#include "tokenizer.h"
+
+bool Tokenizer::next(std::string &token, size_t& token_index) {
+    std::stringstream out;
+
+    if(i >= text.size()) {
+        return false;
+    }
+
+    if(!normalize) {
+        token = text;
+        i = text.size();
+        return true;
+    }
+
+    while(i < text.size()) {
+        if((text[i] & ~0x7f) == 0 ) {
+            // ASCII character: split on space/newline or lowercase otherwise
+            bool is_space = text[i] == 32;
+            bool is_new_line = text[i] == 10;
+            bool space_or_newline = (is_space || is_new_line);
+
+            if(space_or_newline) {
+                i++;
+                token = out.str();
+                out.clear();
+
+                if(!keep_empty && token.empty()) {
+                    continue;
+                }
+
+                token_index = token_counter++;
+                return true;
+            }
+
+            if(std::isalnum(text[i])) {
+                out << char(std::tolower(text[i]));
+            }
+
+            i++;
+            continue;
+        }
+
+        char inbuf[5];
+        char *p = inbuf;
+
+        // group bytes to form a unicode representation
+        *p++ = text[i++];
+        if ((text[i] & 0xC0) == 0x80) *p++ = text[i++];
+        if ((text[i] & 0xC0) == 0x80) *p++ = text[i++];
+        if ((text[i] & 0xC0) == 0x80) *p++ = text[i++];
+        *p = 0;
+        size_t insize = (p - &inbuf[0]);
+
+        char outbuf[5] = {};
+        size_t outsize = sizeof(outbuf);
+        char *outptr = outbuf;
+        char *inptr = inbuf;
+
+        //printf("[%s]\n", inbuf);
+
+        errno = 0;
+        iconv(cd, &inptr, &insize, &outptr, &outsize);
+
+        if(errno == EILSEQ) {
+            // symbol cannot be represented as ASCII, so write the original symbol
+            out << inbuf;
+        } else {
+            // NOTE: outsize indicates bytes available AFTER current position so have to do <=
+            for(size_t out_index=0; out_index<5; out_index++) {
+                bool is_ascii = ((outbuf[out_index] & ~0x7f) == 0);
+                bool keep_char = !is_ascii || std::isalnum(outbuf[out_index]);
+
+                if(keep_char) {
+                    if(is_ascii && std::isalnum(outbuf[out_index])) {
+                        outbuf[out_index] = std::tolower(outbuf[out_index]);
+                    }
+                    out << outbuf[out_index];
+                }
+            }
+        }
+    }
+
+    token = out.str();
+    out.clear();
+
+    if(!keep_empty && token.empty()) {
+        return false;
+    }
+
+    token_index = token_counter++;
+    return true;
+}
+
+void Tokenizer::tokenize(std::vector<std::string> &tokens) {
+    std::string token;
+    size_t token_index;
+
+    while(next(token, token_index)) {
+        tokens.push_back(token);
+    }
+}
diff --git a/test/collection_faceting_test.cpp b/test/collection_faceting_test.cpp
index f1a9dd88..ba2e39db 100644
--- a/test/collection_faceting_test.cpp
+++ b/test/collection_faceting_test.cpp
@@ -558,12 +558,12 @@ TEST_F(CollectionFacetingTest, FacetCountsHighlighting) {
     ASSERT_STREQ("categories", results["facet_counts"][0]["field_name"].get<std::string>().c_str());
 
     ASSERT_EQ(1, results["facet_counts"][0]["counts"][0]["count"].get<size_t>());
-    ASSERT_STREQ("Cell Phone Cases & Clips", results["facet_counts"][0]["counts"][0]["value"].get<std::string>().c_str());
-    ASSERT_STREQ("Cell Phone Cases & Clips", results["facet_counts"][0]["counts"][0]["highlighted"].get<std::string>().c_str());
+    ASSERT_STREQ("Cell Phones", results["facet_counts"][0]["counts"][0]["value"].get<std::string>().c_str());
+    ASSERT_STREQ("Cell Phones", results["facet_counts"][0]["counts"][0]["highlighted"].get<std::string>().c_str());
 
     ASSERT_EQ(1, results["facet_counts"][0]["counts"][1]["count"].get<size_t>());
-    ASSERT_STREQ("Cell Phones", results["facet_counts"][0]["counts"][1]["value"].get<std::string>().c_str());
-    ASSERT_STREQ("Cell Phones", results["facet_counts"][0]["counts"][1]["highlighted"].get<std::string>().c_str());
+    ASSERT_STREQ("Cell Phone Cases & Clips", results["facet_counts"][0]["counts"][1]["value"].get<std::string>().c_str());
+    ASSERT_STREQ("Cell Phone Cases & Clips", results["facet_counts"][0]["counts"][1]["highlighted"].get<std::string>().c_str());
 
     ASSERT_EQ(1, results["facet_counts"][0]["counts"][2]["count"].get<size_t>());
     ASSERT_STREQ("Cell Phone Accessories", results["facet_counts"][0]["counts"][2]["value"].get<std::string>().c_str());
diff --git a/test/index_test.cpp b/test/index_test.cpp
index c069565e..0cef0f21 100644
--- a/test/index_test.cpp
+++ b/test/index_test.cpp
@@ -5,7 +5,7 @@
 TEST(IndexTest, ScrubReindexDoc) {
     std::unordered_map<std::string, field> search_schema;
     search_schema.emplace("title", field("title", field_types::STRING, false));
-    search_schema.emplace("points", field("title", field_types::INT32, false));
+    search_schema.emplace("points", field("points", field_types::INT32, false));
     search_schema.emplace("cast", field("cast", field_types::STRING_ARRAY, false));
     search_schema.emplace("movie", field("movie", field_types::BOOL, false));
diff --git a/test/tokenizer_test.cpp b/test/tokenizer_test.cpp
new file mode 100644
index 00000000..c53e5639
--- /dev/null
+++ b/test/tokenizer_test.cpp
@@ -0,0 +1,82 @@
+#include <gtest/gtest.h>
+#include "tokenizer.h"
+
+TEST(TokenizerTest, ShouldTokenizeNormalizeDifferentStrings) {
+    const std::string withnewline = "Michael Jordan:\nWelcome, everybody. Welcome!";
+    std::vector<std::string> tokens;
+    Tokenizer(withnewline, true, true).tokenize(tokens);
+    ASSERT_EQ(5, tokens.size());
+    ASSERT_STREQ("michael", tokens[0].c_str());
+    ASSERT_STREQ("jordan", tokens[1].c_str());
+    ASSERT_STREQ("welcome", tokens[2].c_str());
+    ASSERT_STREQ("everybody", tokens[3].c_str());
+    ASSERT_STREQ("welcome", tokens[4].c_str());
+
+    const std::string withspaces = " Michael  Jordan  ";
+    tokens.clear();
+    Tokenizer(withspaces, true, true).tokenize(tokens);
+    ASSERT_EQ(5, tokens.size());
+    ASSERT_STREQ("", tokens[0].c_str());
+    ASSERT_STREQ("michael", tokens[1].c_str());
+    ASSERT_STREQ("", tokens[2].c_str());
+    ASSERT_STREQ("jordan", tokens[3].c_str());
+    ASSERT_STREQ("", tokens[4].c_str());
+
+    tokens.clear();
+    Tokenizer(withspaces, false, true).tokenize(tokens);
+    ASSERT_EQ(2, tokens.size());
+    ASSERT_STREQ("michael", tokens[0].c_str());
+    ASSERT_STREQ("jordan", tokens[1].c_str());
+
+    const std::string withspecialchars = "Special ½¥ and தமிழ் 你好吗 abcÅà123ß12 here.";
+    tokens.clear();
+    Tokenizer(withspecialchars, false, true).tokenize(tokens);
+    ASSERT_EQ(7, tokens.size());
+    ASSERT_STREQ("special", tokens[0].c_str());
+    ASSERT_STREQ("12yen", tokens[1].c_str());
+    ASSERT_STREQ("and", tokens[2].c_str());
+    ASSERT_STREQ("தமிழ்", tokens[3].c_str());
+    ASSERT_STREQ("你好吗", tokens[4].c_str());
+    ASSERT_STREQ("abcaa123ss12", tokens[5].c_str());
+    ASSERT_STREQ("here", tokens[6].c_str());
+
+    // when normalize is false, should be verbatim
+
+    tokens.clear();
+    Tokenizer(withspecialchars, false, false).tokenize(tokens);
+    ASSERT_EQ(1, tokens.size());
+    ASSERT_STREQ(withspecialchars.c_str(), tokens[0].c_str());
+}
+
+TEST(TokenizerTest, ShouldTokenizeIteratively) {
+    const std::string withnewline = "Michael Jordan:\n\nWelcome, everybody. Welcome!";
+    std::vector<std::string> tokens;
+    Tokenizer tokenizer1(withnewline, true, true);
+
+    std::string token;
+    size_t token_index;
+
+    while(tokenizer1.next(token, token_index)) {
+        tokens.push_back(token);
+    }
+
+    ASSERT_EQ(6, tokens.size());
+    ASSERT_STREQ("michael", tokens[0].c_str());
+    ASSERT_STREQ("jordan", tokens[1].c_str());
+    ASSERT_STREQ("", tokens[2].c_str());
+    ASSERT_STREQ("welcome", tokens[3].c_str());
+    ASSERT_STREQ("everybody", tokens[4].c_str());
+    ASSERT_STREQ("welcome", tokens[5].c_str());
+
+    // verbatim (normalize=false)
+
+    tokens.clear();
+    Tokenizer tokenizer2(withnewline, true, false);
+
+    while(tokenizer2.next(token, token_index)) {
+        tokens.push_back(token);
+    }
+
+    ASSERT_EQ(1, tokens.size());
+    ASSERT_STREQ("Michael Jordan:\n\nWelcome, everybody. Welcome!", tokens[0].c_str());
+}
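
Usage sketch (illustrative only, not part of the committed change): the Tokenizer introduced in include/tokenizer.h above can be driven either in one shot via tokenize() or iteratively via next(), exactly as test/tokenizer_test.cpp exercises it. The main() wrapper, the sample sentence, and the printed output below are assumptions added for illustration.

    #include <iostream>
    #include <string>
    #include <vector>
    #include "tokenizer.h"

    int main() {
        const std::string text = "Michael Jordan:\nWelcome, everybody. Welcome!";

        // One-shot tokenization: keep_empty=false drops empty tokens; normalize=true
        // lowercases ASCII, drops non-alphanumeric ASCII, and transliterates other
        // symbols to ASCII via iconv ASCII//TRANSLIT where possible.
        std::vector<std::string> tokens;
        Tokenizer(text, false, true).tokenize(tokens);
        // tokens is now {"michael", "jordan", "welcome", "everybody", "welcome"}

        // Iterative tokenization: next() also reports a running token index.
        Tokenizer tokenizer(text, false, true);
        std::string token;
        size_t token_index = 0;
        while(tokenizer.next(token, token_index)) {
            std::cout << token_index << ": " << token << "\n";
        }
        return 0;
    }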