Fix highlighting of strings with special characters.

This commit is contained in:
kishorenc 2021-03-20 12:58:30 +05:30
parent fcdd8ec9c9
commit c2eec85277
9 changed files with 278 additions and 176 deletions

View File

@@ -389,10 +389,10 @@ public:
static void transform_for_180th_meridian(GeoCoord& point, double offset);
// the following methods are not synchronized because their parent calls are synchronized
art_leaf* get_token_leaf(const std::string & field_name, const unsigned char* token, uint32_t token_len);
// the following methods are not synchronized because their parent calls are synchronized
uint32_t do_filtering(uint32_t** filter_ids_out, const std::vector<filter> & filters) const;
static Option<uint32_t> validate_index_in_memory(nlohmann::json &document, uint32_t seq_id,

View File

@@ -30,6 +30,10 @@ struct TokenOffset {
bool operator>(const TokenOffset &a) const {
return offset > a.offset;
}
bool operator<(const TokenOffset &a) const {
return offset < a.offset;
}
};
struct Match {

View File

@@ -8,19 +8,32 @@ class Tokenizer {
private:
const std::string& text;
size_t i;
const bool keep_empty;
const bool keep_separators;
const bool normalize;
const bool no_op;
size_t token_counter = 0;
iconv_t cd;
static const size_t CHARS = 0;
static const size_t SEPARATORS = 1;
size_t stream_mode;
std::stringstream out;
public:
explicit Tokenizer(const std::string& input,
bool keep_empty=true, bool normalize=true, bool no_op=false):
text(input), i(0), keep_empty(keep_empty), normalize(normalize), no_op(no_op) {
bool keep_separators=true, bool normalize=true, bool no_op=false):
text(input), i(0), keep_separators(keep_separators), normalize(normalize), no_op(no_op) {
cd = iconv_open("ASCII//TRANSLIT", "UTF-8");
if(!input.empty() && (std::isalnum(text[0]) || (text[i] & ~0x7f) != 0)) {
// alphanum or non-ascii
stream_mode = CHARS;
} else {
stream_mode = SEPARATORS;
}
}
~Tokenizer() {
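
The constructor's second argument is now keep_separators (previously keep_empty), and stream_mode starts out as CHARS or SEPARATORS depending on the first character of the input. A minimal usage sketch, assuming the tokenizer.h shown above is on the include path; the expected token lists are taken from the tokenizer tests later in this commit:

#include <iostream>
#include <string>
#include <vector>
#include "tokenizer.h"

int main() {
    const std::string title = " Michael Jordan ";
    std::vector<std::string> tokens;

    // keep_separators=true: separator runs come back as their own tokens
    Tokenizer(title, true, true, false).tokenize(tokens);
    // expected: " ", "michael", " ", "jordan", " "
    for(const auto& t: tokens) std::cout << "[" << t << "]";
    std::cout << std::endl;

    // keep_separators=false: only the alphanumeric / non-ASCII runs remain
    tokens.clear();
    Tokenizer(title, false, true, false).tokenize(tokens);
    // expected: "michael", "jordan"
    for(const auto& t: tokens) std::cout << "[" << t << "]";
    std::cout << std::endl;
    return 0;
}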

View File

@@ -1223,20 +1223,27 @@ void Collection::parse_search_query(const std::string &query, std::vector<std::s
q_include_tokens = {query};
} else {
std::vector<std::string> tokens;
StringUtils::split(query, tokens, " ");
Tokenizer(query, true, true).tokenize(tokens);
bool exclude_operator_prior = false;
for(std::string& token: tokens) {
if(token[0] == '-') {
std::string&& just_token = token.substr(1);
Tokenizer(just_token, false, true).tokenize(just_token);
if(!just_token.empty()) {
q_exclude_tokens.push_back(just_token);
}
for(const auto& token: tokens) {
if(token.empty()) {
continue;
}
if(token == "-" || token == " -") {
exclude_operator_prior = true;
}
if(!std::isalnum(token[0])) {
continue;
}
if(exclude_operator_prior) {
q_exclude_tokens.push_back(token);
exclude_operator_prior = false;
} else {
Tokenizer(token, false, true).tokenize(token);
if(!token.empty()) {
q_include_tokens.push_back(token);
}
q_include_tokens.push_back(token);
}
}
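
In the new parse loop above, the query is tokenized with separators kept, a bare "-" (or " -") separator token arms exclude_operator_prior, separator tokens themselves are skipped, and the next word token is routed to q_exclude_tokens. A standalone sketch of that routing (not the actual Collection::parse_search_query; the helper name and sample tokens are illustrative):

#include <cctype>
#include <iostream>
#include <string>
#include <vector>

struct ParsedQuery {
    std::vector<std::string> include_tokens;
    std::vector<std::string> exclude_tokens;
};

// Mirrors the include/exclude routing above, given tokens produced with separators kept.
ParsedQuery split_include_exclude(const std::vector<std::string>& tokens) {
    ParsedQuery parsed;
    bool exclude_operator_prior = false;

    for(const auto& token: tokens) {
        if(token.empty()) {
            continue;
        }
        if(token == "-" || token == " -") {
            exclude_operator_prior = true;      // the next word token is negated
        }
        if(!std::isalnum(static_cast<unsigned char>(token[0]))) {
            continue;                           // skip separator tokens such as " " or " -"
        }
        if(exclude_operator_prior) {
            parsed.exclude_tokens.push_back(token);
            exclude_operator_prior = false;
        } else {
            parsed.include_tokens.push_back(token);
        }
    }
    return parsed;
}

int main() {
    // e.g. the query `space ship -rocket`, tokenized with separators kept
    ParsedQuery parsed = split_include_exclude({"space", " ", "ship", " -", "rocket"});
    for(const auto& t: parsed.include_tokens) std::cout << "include: " << t << std::endl;  // space, ship
    for(const auto& t: parsed.exclude_tokens) std::cout << "exclude: " << t << std::endl;  // rocket
    return 0;
}
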
@@ -1383,7 +1390,9 @@ void Collection::highlight_result(const field &search_field,
// is from the best matched field and need not be present in other fields of a document.
Index* index = indices[field_order_kv->key % num_memory_shards];
art_leaf *actual_leaf = index->get_token_leaf(search_field.name, &token_leaf->key[0], token_leaf->key_len);
if(actual_leaf != nullptr) {
//LOG(INFO) << "field: " << search_field.name << ", key: " << actual_leaf->key;
query_suggestion.push_back(actual_leaf);
std::vector<uint16_t> positions;
uint32_t doc_index = actual_leaf->values->ids.indexOf(field_order_kv->key);
@@ -1432,66 +1441,84 @@ void Collection::highlight_result(const field &search_field,
std::partial_sort(match_indices.begin(), match_indices.begin()+max_array_matches, match_indices.end());
for(size_t index = 0; index < max_array_matches; index++) {
const match_index_t & match_index = match_indices[index];
const Match & match = match_index.match;
std::sort(match_indices[index].match.offsets.begin(), match_indices[index].match.offsets.end());
const auto& match_index = match_indices[index];
const Match& match = match_index.match;
std::vector<std::string> tokens;
const std::string& text = (search_field.type == field_types::STRING) ? document[search_field.name] : document[search_field.name][match_index.index];
Tokenizer tokenizer(text, true, false);
if(search_field.type == field_types::STRING) {
Tokenizer(document[search_field.name], true, false).tokenize(tokens);
} else {
Tokenizer(document[search_field.name][match_index.index], true, false).tokenize(tokens);
}
std::string raw_token;
size_t raw_token_index = 0;
int indexed_token_index = -1;
size_t match_offset_index = 0;
std::vector<size_t> token_indices;
std::set<size_t> token_indices;
spp::sparse_hash_set<std::string> token_hits;
std::vector<std::string> raw_tokens;
std::unordered_map<size_t, size_t> indexed_to_raw;
for(size_t i = 0; i < match.offsets.size(); i++) {
if(match.offsets[i].offset != MAX_DISPLACEMENT) {
size_t token_index = (size_t)(match.offsets[i].offset);
token_indices.push_back(token_index);
if(token_index >= tokens.size()) {
LOG(ERROR) << "Highlight token index " << token_index << " is greater than length of store field.";
continue;
}
std::string token = tokens[token_index];
Tokenizer(token, true, true).tokenize(token);
token_hits.insert(token);
while(tokenizer.next(raw_token, raw_token_index)) {
if(!raw_token.empty() && (std::isalnum(raw_token[0]) || (raw_token[0] & ~0x7f) != 0)) {
// check for actual token (first char is NOT alphanum or ascii)
indexed_token_index++;
indexed_to_raw[indexed_token_index] = raw_token_index;
/*LOG(INFO) << "raw_token: " << raw_token << ", indexed_token_index: " << indexed_token_index
<< ", raw_token_index: " << raw_token_index;*/
}
if (match_offset_index < match.offsets.size() &&
match.offsets[match_offset_index].offset == indexed_token_index) {
std::string indexed_token;
Tokenizer(raw_token, true, true).tokenize(indexed_token);
if(token_indices.count(indexed_token_index) == 0) {
// repetition could occur, for e.g. in the case of synonym constructed queries
token_indices.insert(indexed_token_index);
token_hits.insert(indexed_token);
}
match_offset_index++;
}
raw_tokens.push_back(raw_token);
}
size_t num_indexed_tokens = indexed_token_index + 1;
auto minmax = std::minmax_element(token_indices.begin(), token_indices.end());
size_t prefix_length = highlight_affix_num_tokens;
size_t suffix_length = highlight_affix_num_tokens + 1;
size_t suffix_length = highlight_affix_num_tokens;
// For longer strings, pick surrounding tokens within `prefix_length` of min_index and max_index for snippet
const size_t start_index = (tokens.size() <= snippet_threshold) ? 0 :
std::max(0, (int)(*(minmax.first) - prefix_length));
if(num_indexed_tokens == 0) {
continue;
}
const size_t end_index = (tokens.size() <= snippet_threshold) ? tokens.size() :
std::min((int)tokens.size(), (int)(*(minmax.second) + suffix_length));
// For longer strings, pick surrounding raw_tokens within `prefix_length` of min_index and max_index for snippet
const size_t start_index = (num_indexed_tokens <= snippet_threshold) ? 0 :
indexed_to_raw[std::max(0, (int)(*(minmax.first) - prefix_length))];
const size_t end_index = (num_indexed_tokens <= snippet_threshold) ? raw_tokens.size() - 1 :
indexed_to_raw[std::min((int)num_indexed_tokens - 1, (int)(*(minmax.second) + suffix_length))];
std::stringstream snippet_stream;
highlight.matched_tokens.emplace_back();
std::vector<std::string>& matched_tokens = highlight.matched_tokens.back();
size_t snippet_index = start_index;
for(size_t snippet_index = start_index; snippet_index < end_index; snippet_index++) {
if(snippet_index != start_index) {
snippet_stream << " ";
}
while(snippet_index <= end_index) {
std::string normalized_token;
Tokenizer(raw_tokens[snippet_index], true, true).tokenize(normalized_token);
std::string token = tokens[snippet_index];
Tokenizer(token, true, true).tokenize(token);
if(token_hits.count(token) != 0) {
snippet_stream << highlight_start_tag << tokens[snippet_index] << highlight_end_tag;
matched_tokens.push_back(tokens[snippet_index]);
if(token_hits.count(normalized_token) != 0) {
snippet_stream << highlight_start_tag << raw_tokens[snippet_index] << highlight_end_tag;
matched_tokens.push_back(raw_tokens[snippet_index]);
} else {
snippet_stream << tokens[snippet_index];
snippet_stream << raw_tokens[snippet_index];
}
snippet_index++;
}
highlight.snippets.push_back(snippet_stream.str());
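
The highlighter above now tokenizes the stored field with separators kept (raw_tokens), counts only word tokens as indexed positions, and keeps an indexed_to_raw map so that the snippet window computed over indexed offsets can be rendered from the original raw tokens. A simplified standalone sketch of that bookkeeping (the sample tokens are illustrative):

#include <cctype>
#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>

int main() {
    // Raw tokens as produced with keep_separators=true, e.g. for "Stark Industries, Inc."
    std::vector<std::string> raw_tokens = {"Stark", " ", "Industries", ", ", "Inc", "."};

    std::unordered_map<size_t, size_t> indexed_to_raw;
    int indexed_token_index = -1;

    for(size_t raw_index = 0; raw_index < raw_tokens.size(); raw_index++) {
        unsigned char first = raw_tokens[raw_index][0];
        // a word token starts with an alphanumeric or non-ASCII character
        bool is_word_token = std::isalnum(first) || (first & ~0x7f) != 0;
        if(is_word_token) {
            indexed_to_raw[++indexed_token_index] = raw_index;
        }
    }

    // An indexed offset from the posting list (say token #1, "Industries") maps back
    // to raw position 2, so the snippet can emit the original, un-normalized text.
    std::cout << "indexed 1 -> raw " << indexed_to_raw[1] << ": "
              << raw_tokens[indexed_to_raw[1]] << std::endl;
    return 0;
}
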
@@ -1501,18 +1528,14 @@ void Collection::highlight_result(const field &search_field,
if(highlighted_fully) {
std::stringstream value_stream;
for(size_t value_index = 0; value_index < tokens.size(); value_index++) {
if(value_index != 0) {
value_stream << " ";
}
for(size_t value_index = 0; value_index < raw_tokens.size(); value_index++) {
std::string normalized_token;
Tokenizer(raw_tokens[value_index], true, true).tokenize(normalized_token);
std::string token = tokens[value_index];
Tokenizer(token, true, true).tokenize(token);
if(token_hits.count(token) != 0) {
value_stream << highlight_start_tag << tokens[value_index] << highlight_end_tag;
if(token_hits.count(normalized_token) != 0) {
value_stream << highlight_start_tag << raw_tokens[value_index] << highlight_end_tag;
} else {
value_stream << tokens[value_index];
value_stream << raw_tokens[value_index];
}
}

View File

@@ -575,7 +575,7 @@ void Index::index_string_field(const std::string & text, const int64_t score, ar
uint32_t seq_id, bool is_facet, const field & a_field) {
std::unordered_map<std::string, std::vector<uint32_t>> token_to_offsets;
Tokenizer tokenizer(text, true, true, !a_field.is_string());
Tokenizer tokenizer(text, false, true, !a_field.is_string());
std::string token;
size_t token_index = 0;
@@ -588,7 +588,6 @@ void Index::index_string_field(const std::string & text, const int64_t score, ar
if(is_facet) {
uint64_t hash = facet_token_hash(a_field, token);
//facet_index_v2[seq_id][facet_id].push_back(hash);
facet_hashes.push_back(hash);
}
@@ -623,7 +622,7 @@ void Index::index_string_array_field(const std::vector<std::string> & strings, c
const std::string& str = strings[array_index];
std::set<std::string> token_set; // required to deal with repeating tokens
Tokenizer tokenizer(str, true, true, !a_field.is_string());
Tokenizer tokenizer(str, false, true, !a_field.is_string());
std::string token;
size_t token_index = 0;
@@ -2216,6 +2215,8 @@ void Index::populate_token_positions(const std::vector<art_leaf *>& query_sugges
for(size_t i = 0; i < query_suggestion.size(); i++) {
const art_leaf* token_leaf = query_suggestion[i];
uint32_t doc_index = leaf_to_indices[i][result_index];
/*LOG(INFO) << "doc_id: " << token_leaf->values->ids.at(doc_index) << ", token_leaf->values->ids.getLength(): "
<< token_leaf->values->ids.getLength();*/
// it's possible for a query token to not appear in a resulting document
if(doc_index == token_leaf->values->ids.getLength()) {
@@ -2229,7 +2230,14 @@ void Index::populate_token_positions(const std::vector<art_leaf *>& query_sugges
/*uint32_t* offsets = token_leaf->values->offsets.uncompress();
for(size_t ii=0; ii < token_leaf->values->offsets.getLength(); ii++) {
LOG(INFO) << "offset: " << offsets[ii];
}*/
}
uint32_t* offset_indices = token_leaf->values->offset_index.uncompress();
for(size_t ii=0; ii < token_leaf->values->offset_index.getLength(); ii++) {
LOG(INFO) << "offset index: " << offset_indices[ii];
}
LOG(INFO) << "token_leaf->values->offsets.getLength(): " << token_leaf->values->offsets.getLength();*/
uint32_t start_offset = token_leaf->values->offset_index.at(doc_index);
uint32_t end_offset = (doc_index == token_leaf->values->ids.getLength() - 1) ?
@@ -2464,6 +2472,7 @@ void Index::tokenize_doc_field(const nlohmann::json& document, const field& sear
}
art_leaf* Index::get_token_leaf(const std::string & field_name, const unsigned char* token, uint32_t token_len) {
std::shared_lock lock(mutex);
const art_tree *t = search_index.at(field_name);
return (art_leaf*) art_search(t, token, (int) token_len);
}
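
With the indexing paths above switched to keep_separators=false, posting-list offsets count only word tokens, which is what lets the highlighter's indexed_token_index line up with them. A minimal sketch of that index-side convention, assuming the Typesense tokenizer.h header is available; the sample text mirrors the special-character tests in this commit:

#include <cstdint>
#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>
#include "tokenizer.h"

int main() {
    std::unordered_map<std::string, std::vector<uint32_t>> token_to_offsets;
    const std::string text = "Google Home, https://google.com///";

    std::string token;
    size_t token_index = 0;

    // keep_separators=false, normalize=true: offsets count positions among word tokens only
    Tokenizer tokenizer(text, false, true, false);
    while(tokenizer.next(token, token_index)) {
        token_to_offsets[token].push_back(static_cast<uint32_t>(token_index));
    }

    for(const auto& kv: token_to_offsets) {
        std::cout << kv.first << " -> " << kv.second.size() << " offset(s)" << std::endl;
    }
    return 0;
}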

View File

@@ -2,62 +2,63 @@
#include "tokenizer.h"
bool Tokenizer::next(std::string &token, size_t& token_index) {
std::stringstream out;
if(i >= text.size()) {
if(i == text.size() && !text.empty() && text.back() == ' ') {
token = "";
i++;
return true;
if(no_op) {
if(i == text.size()) {
return false;
}
return false;
}
if(no_op) {
token = text;
i = text.size();
return true;
}
while(i < text.size()) {
if((text[i] & ~0x7f) == 0 ) {
// ASCII character: split on space/newline or lowercase otherwise
if(std::isalnum(text[i])) {
bool is_ascii = (text[i] & ~0x7f) == 0;
if(is_ascii) {
const size_t next_stream_mode = std::isalnum(text[i]) ? CHARS : SEPARATORS;
if(next_stream_mode != stream_mode) {
// We tokenize when `stream_mode` changes
token = out.str();
out.str(std::string());
if(normalize) {
out << char(std::tolower(text[i]));
} else {
out << text[i];
}
i++;
if(stream_mode == SEPARATORS && !keep_separators) {
stream_mode = next_stream_mode;
continue;
}
token_index = token_counter++;
stream_mode = next_stream_mode;
return true;
} else {
bool is_space = text[i] == 32;
bool is_new_line = text[i] == 10;
bool is_whitespace = is_space || is_new_line;
bool next_char_alphanum = (i != text.length() - 1) && std::isalnum(text[i + 1]);
if(!normalize && !is_whitespace && (i == text.length() - 1 || !next_char_alphanum)) {
// checking for next char ensures that `foo-bar` does not get split to `foo-`
if(normalize) {
out << char(std::tolower(text[i]));
} else {
out << text[i];
}
if(is_whitespace || next_char_alphanum) {
// we split on space or on a special character whose next char is alphanumeric
token = out.str();
out.clear();
i++;
if(!keep_empty && token.empty()) {
continue;
}
token_index = token_counter++;
return true;
}
i++;
continue;
}
}
i++;
continue;
if(stream_mode == SEPARATORS) { // to detect first non-ascii character
// we will tokenize now and treat the following non-ascii chars as a different token
stream_mode = CHARS;
token = out.str();
out.str(std::string());
if(keep_separators) {
token_index = token_counter++;
return true;
}
}
char inbuf[5];
@@ -90,18 +91,17 @@ bool Tokenizer::next(std::string &token, size_t& token_index) {
// symbol cannot be represented as ASCII, so write the original symbol
out << inbuf;
} else {
// NOTE: outsize indicates bytes available AFTER current position so have to do <=
for(size_t out_index=0; out_index<5; out_index++) {
if(!normalize) {
out << outbuf[out_index];
continue;
}
bool is_ascii = ((outbuf[out_index] & ~0x7f) == 0);
bool keep_char = !is_ascii || std::isalnum(outbuf[out_index]);
bool unicode_is_ascii = ((outbuf[out_index] & ~0x7f) == 0);
bool keep_char = !unicode_is_ascii || std::isalnum(outbuf[out_index]);
if(keep_char) {
if(is_ascii && std::isalnum(outbuf[out_index])) {
if(unicode_is_ascii && std::isalnum(outbuf[out_index])) {
outbuf[out_index] = char(std::tolower(outbuf[out_index]));
}
out << outbuf[out_index];
@@ -111,9 +111,13 @@ bool Tokenizer::next(std::string &token, size_t& token_index) {
}
token = out.str();
out.clear();
out.str(std::string());
if(!keep_empty && token.empty()) {
if(token.empty()) {
return false;
}
if(!std::isalnum(token[0]) && !keep_separators) {
return false;
}
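
The rewritten next() above is essentially a two-state machine over ASCII input: characters accumulate while their class (alphanumeric vs separator) stays the same, a token is emitted on every class change, and separator tokens are dropped when keep_separators is false; non-ASCII runs go through the iconv transliteration path. A standalone, ASCII-only sketch of that state machine (the iconv/Unicode handling is omitted; the function name is illustrative):

#include <cctype>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

std::vector<std::string> split_runs(const std::string& text, bool keep_separators) {
    enum { CHARS, SEPARATORS };
    std::vector<std::string> tokens;
    if(text.empty()) {
        return tokens;
    }

    std::stringstream out;
    int stream_mode = std::isalnum(static_cast<unsigned char>(text[0])) ? CHARS : SEPARATORS;

    for(char c: text) {
        int next_stream_mode = std::isalnum(static_cast<unsigned char>(c)) ? CHARS : SEPARATORS;
        if(next_stream_mode != stream_mode) {
            // class change: flush the buffered run as one token
            if(stream_mode == CHARS || keep_separators) {
                tokens.push_back(out.str());
            }
            out.str(std::string());
            stream_mode = next_stream_mode;
        }
        out << static_cast<char>(std::tolower(static_cast<unsigned char>(c)));
    }
    if(stream_mode == CHARS || keep_separators) {
        tokens.push_back(out.str());  // trailing run
    }
    return tokens;
}

int main() {
    for(const auto& t: split_runs("https://www.amazon.com/s?k=phone", true)) {
        std::cout << "[" << t << "]";
    }
    // prints: [https][://][www][.][amazon][.][com][/][s][?][k][=][phone]
    std::cout << std::endl;
    return 0;
}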

View File

@@ -359,8 +359,8 @@ TEST_F(CollectionSynonymsTest, MultiWaySynonym) {
ASSERT_EQ(2, res["hits"].size());
ASSERT_EQ(2, res["found"].get<uint32_t>());
ASSERT_STREQ("<mark>Samuel</mark> <mark>L.</mark> <mark>Jackson</mark>", res["hits"][0]["highlights"][0]["snippet"].get<std::string>().c_str());
ASSERT_STREQ("<mark>Samuel</mark> <mark>L.</mark> <mark>Jackson</mark>", res["hits"][1]["highlights"][0]["snippet"].get<std::string>().c_str());
ASSERT_STREQ("<mark>Samuel</mark> <mark>L</mark>. <mark>Jackson</mark>", res["hits"][0]["highlights"][0]["snippet"].get<std::string>().c_str());
ASSERT_STREQ("<mark>Samuel</mark> <mark>L</mark>. <mark>Jackson</mark>", res["hits"][1]["highlights"][0]["snippet"].get<std::string>().c_str());
// for now we don't support synonyms on ANY prefix

View File

@@ -164,7 +164,7 @@ TEST_F(CollectionTest, PhraseSearch) {
13: score: 12, (single word match)
*/
std::vector<std::string> ids = {"8", "1", "16", "17", "13"};
std::vector<std::string> ids = {"8", "1", "17", "16", "13"};
for(size_t i = 0; i < results["hits"].size(); i++) {
nlohmann::json result = results["hits"].at(i);
@@ -184,7 +184,7 @@ TEST_F(CollectionTest, PhraseSearch) {
ASSERT_EQ(5, results["hits"].size());
ASSERT_EQ(5, results["found"].get<uint32_t>());
ids = {"8", "1", "17", "16", "13"};
ids = {"8", "17", "1", "16", "13"};
for(size_t i = 0; i < results["hits"].size(); i++) {
nlohmann::json result = results["hits"].at(i);
@@ -200,7 +200,7 @@ TEST_F(CollectionTest, PhraseSearch) {
ASSERT_EQ(3, results["request_params"]["per_page"].get<size_t>());
ids = {"8", "1", "16"};
ids = {"8", "1", "17"};
for(size_t i = 0; i < 3; i++) {
nlohmann::json result = results["hits"].at(i);
@@ -1958,7 +1958,7 @@ TEST_F(CollectionTest, SearchLargeTextField) {
ASSERT_EQ(1, results["hits"].size());
ASSERT_STREQ("non arcu id lectus <mark>accumsan</mark> venenatis at at justo.",
ASSERT_STREQ("non arcu id lectus <mark>accumsan</mark> venenatis at at justo",
results["hits"][0]["highlights"][0]["snippet"].get<std::string>().c_str());
collectionManager.drop_collection("coll_large_text");
@@ -2141,7 +2141,7 @@ TEST_F(CollectionTest, SearchHighlightWithNewLine) {
token_ordering::FREQUENCY, true, 10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 40, {}, {}, {}, 0).get();
ASSERT_STREQ("Blah, blah <mark>Stark</mark> Industries",
ASSERT_STREQ("Blah, blah\n<mark>Stark</mark> Industries",
res["hits"][0]["highlights"][0]["snippet"].get<std::string>().c_str());
ASSERT_STREQ("Stark", res["hits"][0]["highlights"][0]["matched_tokens"][0].get<std::string>().c_str());
@@ -3184,7 +3184,7 @@ TEST_F(CollectionTest, SearchingForRecordsWithSpecialChars) {
std::vector<std::vector<std::string>> records = {
{"Amazon Home", "https://amazon.com/"},
{"Google Home", "https://google.com/"},
{"Google Home", "https://google.com///"},
{"Github Issue", "https://github.com/typesense/typesense/issues/241"},
{"Amazon Search", "https://www.amazon.com/s?k=phone&ref=nb_sb_noss_2"},
};
@@ -3206,12 +3206,17 @@ TEST_F(CollectionTest, SearchingForRecordsWithSpecialChars) {
ASSERT_EQ(1, results["found"].get<size_t>());
ASSERT_STREQ("1", results["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_EQ(2, results["hits"][0]["highlights"].size());
ASSERT_EQ("<mark>Google</mark> Home", results["hits"][0]["highlights"][0]["snippet"].get<std::string>());
ASSERT_EQ("https://<mark>google</mark>.com///", results["hits"][0]["highlights"][1]["snippet"].get<std::string>());
results = coll1->search("amazon.com",
{"title", "url"}, "", {}, {}, 2, 10, 1, FREQUENCY).get();
ASSERT_EQ(2, results["found"].get<size_t>());
ASSERT_EQ(3, results["found"].get<size_t>());
ASSERT_STREQ("3", results["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("0", results["hits"][1]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("1", results["hits"][2]["document"]["id"].get<std::string>().c_str());
results = coll1->search("typesense",
{"title", "url"}, "", {}, {}, 2, 10, 1, FREQUENCY).get();
@@ -3225,5 +3230,9 @@ TEST_F(CollectionTest, SearchingForRecordsWithSpecialChars) {
ASSERT_EQ(1, results["found"].get<size_t>());
ASSERT_STREQ("3", results["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_EQ(1, results["hits"][0]["highlights"].size());
ASSERT_EQ("https://www.amazon.com/s?k=phone&ref=<mark>nb</mark>_<mark>sb</mark>_<mark>noss</mark>_<mark>2</mark>",
results["hits"][0]["highlights"][0]["snippet"].get<std::string>());
collectionManager.drop_collection("coll1");
}
}

View File

@@ -4,25 +4,23 @@
TEST(TokenizerTest, ShouldTokenizeNormalizeDifferentStrings) {
const std::string withnewline = "Michael Jordan:\nWelcome, everybody. Welcome! ";
std::vector<std::string> tokens;
Tokenizer(withnewline, true, true, false).tokenize(tokens);
ASSERT_EQ(6, tokens.size());
Tokenizer(withnewline, false, true, false).tokenize(tokens);
ASSERT_EQ(5, tokens.size());
ASSERT_STREQ("michael", tokens[0].c_str());
ASSERT_STREQ("jordan", tokens[1].c_str());
ASSERT_STREQ("welcome", tokens[2].c_str());
ASSERT_STREQ("everybody", tokens[3].c_str());
ASSERT_STREQ("welcome", tokens[4].c_str());
ASSERT_STREQ("", tokens[5].c_str());
const std::string withspaces = " Michael Jordan ";
tokens.clear();
Tokenizer(withspaces, true, true, false).tokenize(tokens);
ASSERT_EQ(6, tokens.size());
ASSERT_STREQ("", tokens[0].c_str());
ASSERT_EQ(5, tokens.size());
ASSERT_STREQ(" ", tokens[0].c_str());
ASSERT_STREQ("michael", tokens[1].c_str());
ASSERT_STREQ("", tokens[2].c_str());
ASSERT_STREQ(" ", tokens[2].c_str());
ASSERT_STREQ("jordan", tokens[3].c_str());
ASSERT_STREQ("", tokens[4].c_str());
ASSERT_STREQ("", tokens[5].c_str());
ASSERT_STREQ(" ", tokens[4].c_str());
tokens.clear();
Tokenizer(withspaces, false, true, false).tokenize(tokens);
@@ -30,38 +28,6 @@ TEST(TokenizerTest, ShouldTokenizeNormalizeDifferentStrings) {
ASSERT_STREQ("michael", tokens[0].c_str());
ASSERT_STREQ("jordan", tokens[1].c_str());
const std::string withspecialchars = "Special ½¥ and தமிழ் 你好吗 abcÅà123ß12 here.";
tokens.clear();
Tokenizer(withspecialchars, false, true, false).tokenize(tokens);
ASSERT_EQ(7, tokens.size());
ASSERT_STREQ("special", tokens[0].c_str());
ASSERT_STREQ("12yen", tokens[1].c_str());
ASSERT_STREQ("and", tokens[2].c_str());
ASSERT_STREQ("தமிழ்", tokens[3].c_str());
ASSERT_STREQ("你好吗", tokens[4].c_str());
ASSERT_STREQ("abcaa123ss12", tokens[5].c_str());
ASSERT_STREQ("here", tokens[6].c_str());
// when normalization is disabled and keep empty is enabled
const std::string withoutnormalize = "Mise à, jour.";
tokens.clear();
Tokenizer(withoutnormalize, true, false, false).tokenize(tokens);
ASSERT_EQ(5, tokens.size());
ASSERT_STREQ("Mise", tokens[0].c_str());
ASSERT_STREQ("", tokens[1].c_str());
ASSERT_STREQ("à,", tokens[2].c_str());
ASSERT_STREQ("", tokens[3].c_str());
ASSERT_STREQ("jour.", tokens[4].c_str());
// when normalization and keep empty are disabled
const std::string withoutnormalizeandkeepempty = "Mise à jour.";
tokens.clear();
Tokenizer(withoutnormalizeandkeepempty, false, false, false).tokenize(tokens);
ASSERT_EQ(3, tokens.size());
ASSERT_STREQ("Mise", tokens[0].c_str());
ASSERT_STREQ("à", tokens[1].c_str());
ASSERT_STREQ("jour.", tokens[2].c_str());
// single token
const std::string single_token = "foobar";
tokens.clear();
@@ -89,22 +55,82 @@ TEST(TokenizerTest, ShouldTokenizeNormalizeDifferentStrings) {
const std::string multispace_tokens = "foo bar";
tokens.clear();
Tokenizer(multispace_tokens, true, false, false).tokenize(tokens);
ASSERT_EQ(6, tokens.size());
ASSERT_EQ(3, tokens.size());
ASSERT_STREQ("foo", tokens[0].c_str());
ASSERT_STREQ("", tokens[1].c_str());
ASSERT_STREQ("", tokens[2].c_str());
ASSERT_STREQ("", tokens[3].c_str());
ASSERT_STREQ("", tokens[4].c_str());
ASSERT_STREQ("bar", tokens[5].c_str());
ASSERT_STREQ(" ", tokens[1].c_str());
ASSERT_STREQ("bar", tokens[2].c_str());
// special chars
const std::string specialchar_tokens = "https://www.amazon.com/s?k=phone&ref=nb_sb_noss_2";;
tokens.clear();
Tokenizer(specialchar_tokens, true, false, false).tokenize(tokens);
ASSERT_EQ(23, tokens.size());
ASSERT_STREQ("https", tokens[0].c_str());
ASSERT_STREQ("://", tokens[1].c_str());
ASSERT_STREQ("www", tokens[2].c_str());
ASSERT_STREQ(".", tokens[3].c_str());
ASSERT_STREQ("noss", tokens[20].c_str());
ASSERT_STREQ("_", tokens[21].c_str());
ASSERT_STREQ("2", tokens[22].c_str());
// noop
tokens.clear();
const std::string withspecialchars = "Special ½¥ and தமிழ் 你好吗 abcÅà123ß12 here.";
Tokenizer(withspecialchars, false, true, true).tokenize(tokens);
ASSERT_EQ(1, tokens.size());
ASSERT_STREQ(withspecialchars.c_str(), tokens[0].c_str());
}
TEST(TokenizerTest, ShouldTokenizeNormalizeUnicodeStrings) {
std::vector<std::string> tokens;
const std::string withspecialchars = "Special ½¥ and -தமிழ் 你2好吗 abcÅà123ß12 here.";
tokens.clear();
Tokenizer(withspecialchars, false, true, false).tokenize(tokens);
ASSERT_EQ(7, tokens.size());
ASSERT_STREQ("special", tokens[0].c_str());
ASSERT_STREQ("12yen", tokens[1].c_str());
ASSERT_STREQ("and", tokens[2].c_str());
ASSERT_STREQ("தமிழ்", tokens[3].c_str());
ASSERT_STREQ("你2好吗", tokens[4].c_str());
ASSERT_STREQ("abcaa123ss12", tokens[5].c_str());
ASSERT_STREQ("here", tokens[6].c_str());
// when normalization is disabled and keep empty is enabled
const std::string withoutnormalize = "Mise à, jour.";
tokens.clear();
Tokenizer(withoutnormalize, true, false, false).tokenize(tokens);
ASSERT_EQ(6, tokens.size());
ASSERT_STREQ("Mise", tokens[0].c_str());
ASSERT_STREQ(" ", tokens[1].c_str());
ASSERT_STREQ("à", tokens[2].c_str());
ASSERT_STREQ(", ", tokens[3].c_str());
ASSERT_STREQ("jour", tokens[4].c_str());
ASSERT_STREQ(".", tokens[5].c_str());
// when normalization and keep empty are disabled
const std::string withoutnormalizeandkeepempty = "Mise à jour.";
tokens.clear();
Tokenizer(withoutnormalizeandkeepempty, false, false, false).tokenize(tokens);
ASSERT_EQ(3, tokens.size());
ASSERT_STREQ("Mise", tokens[0].c_str());
ASSERT_STREQ("à", tokens[1].c_str());
ASSERT_STREQ("jour", tokens[2].c_str());
// single accented word tokenization
std::string singleword = "à";
tokens.clear();
Tokenizer(singleword, false, true, false).tokenize(tokens);
ASSERT_EQ(1, tokens.size());
ASSERT_STREQ("a", tokens[0].c_str());
tokens.clear();
Tokenizer(singleword, true, true, false).tokenize(tokens);
ASSERT_EQ(1, tokens.size());
ASSERT_STREQ("a", tokens[0].c_str());
}
TEST(TokenizerTest, ShouldTokenizeIteratively) {
const std::string withnewline = "Michael Jordan:\n\nWelcome, everybody. Welcome!";
std::vector<std::string> tokens;
@@ -117,20 +143,34 @@ TEST(TokenizerTest, ShouldTokenizeIteratively) {
tokens.push_back(token);
}
ASSERT_EQ(6, tokens.size());
ASSERT_EQ(10, tokens.size());
ASSERT_STREQ("michael", tokens[0].c_str());
ASSERT_STREQ("jordan", tokens[1].c_str());
ASSERT_STREQ("", tokens[2].c_str());
ASSERT_STREQ("welcome", tokens[3].c_str());
ASSERT_STREQ("everybody", tokens[4].c_str());
ASSERT_STREQ("welcome", tokens[5].c_str());
ASSERT_STREQ(" ", tokens[1].c_str());
ASSERT_STREQ("jordan", tokens[2].c_str());
ASSERT_STREQ(":\n\n", tokens[3].c_str());
ASSERT_STREQ("welcome", tokens[4].c_str());
ASSERT_STREQ(", ", tokens[5].c_str());
ASSERT_STREQ("everybody", tokens[6].c_str());
ASSERT_STREQ(". ", tokens[7].c_str());
ASSERT_STREQ("welcome", tokens[8].c_str());
ASSERT_STREQ("!", tokens[9].c_str());
// check for index when separators are not kept
Tokenizer tokenizer2(withnewline, false, true, false);
size_t expected_token_index = 0;
std::vector<std::string> expected_tokens = {"michael", "jordan", "welcome", "everybody", "welcome"};
while(tokenizer2.next(token, token_index)) {
ASSERT_EQ(expected_token_index, token_index);
ASSERT_EQ(expected_tokens[expected_token_index], token);
expected_token_index++;
}
// verbatim (no_op=true)
tokens.clear();
Tokenizer tokenizer2(withnewline, true, false, true);
Tokenizer tokenizer3(withnewline, true, false, true);
while(tokenizer2.next(token, token_index)) {
while(tokenizer3.next(token, token_index)) {
tokens.push_back(token);
}