From c2eec85277ac12b531ead0abc2eeb34f74d5ab6b Mon Sep 17 00:00:00 2001
From: kishorenc <kishorenc@gmail.com>
Date: Sat, 20 Mar 2021 12:58:30 +0530
Subject: [PATCH] Fix highlighting of strings with special characters.

---
 include/index.h                   |   4 +-
 include/match_score.h             |   4 +
 include/tokenizer.h               |  19 +++-
 src/collection.cpp                | 143 +++++++++++++++++------------
 src/index.cpp                     |  17 +++-
 src/tokenizer.cpp                 |  90 +++++++++---------
 test/collection_synonyms_test.cpp |   4 +-
 test/collection_test.cpp          |  25 +++--
 test/tokenizer_test.cpp           | 148 +++++++++++++++++++-----------
 9 files changed, 278 insertions(+), 176 deletions(-)
diff --git a/include/index.h b/include/index.h
index 73e45f9e..fcacae77 100644
--- a/include/index.h
+++ b/include/index.h
@@ -389,10 +389,10 @@ public:
 
     static void transform_for_180th_meridian(GeoCoord& point, double offset);
 
-    // the following methods are not synchronized because their parent calls are synchronized
-
     art_leaf* get_token_leaf(const std::string & field_name, const unsigned char* token, uint32_t token_len);
 
+    // the following methods are not synchronized because their parent calls are synchronized
+
     uint32_t do_filtering(uint32_t** filter_ids_out, const std::vector<filter> & filters) const;
 
     static Option<uint32_t> validate_index_in_memory(nlohmann::json &document, uint32_t seq_id,
diff --git a/include/match_score.h b/include/match_score.h
index 5224c9dd..c5d69733 100644
--- a/include/match_score.h
+++ b/include/match_score.h
@@ -30,6 +30,10 @@ struct TokenOffset {
     bool operator>(const TokenOffset &a) const {
         return offset > a.offset;
     }
+
+    bool operator<(const TokenOffset &a) const {
+        return offset < a.offset;
+    }
 };
 
 struct Match {
diff --git a/include/tokenizer.h b/include/tokenizer.h
index 39c0b511..985d59e8 100644
--- a/include/tokenizer.h
+++ b/include/tokenizer.h
@@ -8,19 +8,32 @@ class Tokenizer {
 private:
     const std::string& text;
     size_t i;
-    const bool keep_empty;
+    const bool keep_separators;
     const bool normalize;
     const bool no_op;
 
     size_t token_counter = 0;
     iconv_t cd;
 
+    static const size_t CHARS = 0;
+    static const size_t SEPARATORS = 1;
+    size_t stream_mode;
+
+    std::stringstream out;
+
 public:
 
     explicit Tokenizer(const std::string& input,
-                       bool keep_empty=true, bool normalize=true, bool no_op=false):
-            text(input), i(0), keep_empty(keep_empty), normalize(normalize), no_op(no_op) {
+                       bool keep_separators=true, bool normalize=true, bool no_op=false):
+            text(input), i(0), keep_separators(keep_separators), normalize(normalize), no_op(no_op) {
         cd = iconv_open("ASCII//TRANSLIT", "UTF-8");
+
+        if(!input.empty() && ((text[0] & ~0x7f) != 0 || std::isalnum(text[0]))) {
+            // non-ascii byte (tested first, so isalnum only ever sees a 7-bit value) or ascii alphanumeric
+            stream_mode = CHARS;
+        } else {
+            stream_mode = SEPARATORS;
+        }
     }
 
     ~Tokenizer() {
diff --git a/src/collection.cpp b/src/collection.cpp
index a98e0e1a..96ae52c9 100644
--- a/src/collection.cpp
+++ b/src/collection.cpp
@@ -1223,20 +1223,27 @@ void Collection::parse_search_query(const std::string &query, std::vector<std::s
         q_include_tokens = {query};
     } else {
         std::vector<std::string> tokens;
-        StringUtils::split(query, tokens, " ");
+        Tokenizer(query, true, true).tokenize(tokens);
+        bool exclude_operator_prior = false;
 
-        for(std::string& token: tokens) {
-            if(token[0] == '-') {
-                std::string&& just_token = token.substr(1);
-                Tokenizer(just_token, false, true).tokenize(just_token);
-                if(!just_token.empty()) {
-                    q_exclude_tokens.push_back(just_token);
-                }
+        for(const auto& token: tokens) {
+            if(token.empty()) {
+                continue;
+            }
+
+            if(token == "-" || token == " -") {
+                exclude_operator_prior = true;
+            }
+
+            if((token[0] & ~0x7f) == 0 && !std::isalnum(token[0])) {
+                continue; // ascii separator run; tokens that begin with a non-ascii byte are real words
+            }
+
+            if(exclude_operator_prior) {
+                q_exclude_tokens.push_back(token);
+                exclude_operator_prior = false;
             } else {
-                Tokenizer(token, false, true).tokenize(token);
-                if(!token.empty()) {
-                    q_include_tokens.push_back(token);
-                }
+                q_include_tokens.push_back(token);
             }
         }
 
@@ -1383,7 +1390,9 @@ void Collection::highlight_result(const field &search_field,
         // is from the best matched field and need not be present in other fields of a document.
         Index* index = indices[field_order_kv->key % num_memory_shards];
         art_leaf *actual_leaf = index->get_token_leaf(search_field.name, &token_leaf->key[0], token_leaf->key_len);
+
         if(actual_leaf != nullptr) {
+            //LOG(INFO) << "field: " << search_field.name << ", key: " << actual_leaf->key;
             query_suggestion.push_back(actual_leaf);
             std::vector<uint16_t> positions;
             uint32_t doc_index = actual_leaf->values->ids.indexOf(field_order_kv->key);
@@ -1432,66 +1441,84 @@ void Collection::highlight_result(const field &search_field,
     std::partial_sort(match_indices.begin(), match_indices.begin()+max_array_matches, match_indices.end());
 
     for(size_t index = 0; index < max_array_matches; index++) {
-        const match_index_t & match_index = match_indices[index];
-        const Match & match = match_index.match;
+        std::sort(match_indices[index].match.offsets.begin(), match_indices[index].match.offsets.end());
+        const auto& match_index = match_indices[index];
+        const Match& match = match_index.match;
 
-        std::vector<std::string> tokens;
+        const std::string& text = (search_field.type == field_types::STRING) ? document[search_field.name] : document[search_field.name][match_index.index];
+        Tokenizer tokenizer(text, true, false);
 
-        if(search_field.type == field_types::STRING) {
-            Tokenizer(document[search_field.name], true, false).tokenize(tokens);
-        } else {
-            Tokenizer(document[search_field.name][match_index.index], true, false).tokenize(tokens);
-        }
+        std::string raw_token;
+        size_t raw_token_index = 0;
+        int indexed_token_index = -1;
+        size_t match_offset_index = 0;
 
-        std::vector<size_t> token_indices;
+        std::set<size_t> token_indices;
         spp::sparse_hash_set<std::string> token_hits;
+        std::vector<std::string> raw_tokens;
+        std::unordered_map<size_t, size_t> indexed_to_raw;
 
-        for(size_t i = 0; i < match.offsets.size(); i++) {
-            if(match.offsets[i].offset != MAX_DISPLACEMENT) {
-                size_t token_index = (size_t)(match.offsets[i].offset);
-                token_indices.push_back(token_index);
-                if(token_index >= tokens.size()) {
-                    LOG(ERROR) << "Highlight token index " << token_index << " is greater than length of store field.";
-                    continue;
-                }
-                std::string token = tokens[token_index];
-                Tokenizer(token, true, true).tokenize(token);
-
-                token_hits.insert(token);
+        while(tokenizer.next(raw_token, raw_token_index)) {
+            if(!raw_token.empty() && ((raw_token[0] & ~0x7f) != 0 || std::isalnum(raw_token[0]))) {
+                // an indexed token starts with a non-ascii byte or an ascii alphanumeric; anything else is a separator run
+                indexed_token_index++;
+                indexed_to_raw[indexed_token_index] = raw_token_index;
+                /*LOG(INFO) << "raw_token: " << raw_token << ", indexed_token_index: " << indexed_token_index
+                          << ", raw_token_index: " << raw_token_index;*/
             }
+
+            if (match_offset_index < match.offsets.size() &&
+                match.offsets[match_offset_index].offset == indexed_token_index) {
+                std::string indexed_token;
+                Tokenizer(raw_token, true, true).tokenize(indexed_token);
+
+                if(token_indices.count(indexed_token_index) == 0) {
+                    // repetition could occur, for e.g. in the case of synonym constructed queries
+                    token_indices.insert(indexed_token_index);
+                    token_hits.insert(indexed_token);
+                }
+
+                match_offset_index++;
+            }
+
+            raw_tokens.push_back(raw_token);
         }
 
+        size_t num_indexed_tokens = indexed_token_index + 1;
         auto minmax = std::minmax_element(token_indices.begin(), token_indices.end());
 
         size_t prefix_length = highlight_affix_num_tokens;
-        size_t suffix_length = highlight_affix_num_tokens + 1;
+        size_t suffix_length = highlight_affix_num_tokens;
 
-        // For longer strings, pick surrounding tokens within `prefix_length` of min_index and max_index for snippet
-        const size_t start_index = (tokens.size() <= snippet_threshold) ? 0 :
-                                   std::max(0, (int)(*(minmax.first) - prefix_length));
+        if(num_indexed_tokens == 0) {
+            continue;
+        }
 
-        const size_t end_index = (tokens.size() <= snippet_threshold) ? tokens.size() :
-                                 std::min((int)tokens.size(), (int)(*(minmax.second) + suffix_length));
+        // For longer strings, pick surrounding raw_tokens within `prefix_length` of min_index and max_index for snippet
+        const size_t start_index = (num_indexed_tokens <= snippet_threshold) ? 0 :
+                                   indexed_to_raw[std::max(0, (int)(*(minmax.first) - prefix_length))];
+
+        const size_t end_index = (num_indexed_tokens <= snippet_threshold) ? raw_tokens.size() - 1 :
+                                 indexed_to_raw[std::min((int)num_indexed_tokens - 1, (int)(*(minmax.second) + suffix_length))];
 
         std::stringstream snippet_stream;
 
         highlight.matched_tokens.emplace_back();
         std::vector<std::string>& matched_tokens = highlight.matched_tokens.back();
+        size_t snippet_index = start_index;
 
-        for(size_t snippet_index = start_index; snippet_index < end_index; snippet_index++) {
-            if(snippet_index != start_index) {
-                snippet_stream << " ";
-            }
+        while(snippet_index <= end_index) {
+            std::string normalized_token;
+            Tokenizer(raw_tokens[snippet_index], true, true).tokenize(normalized_token);
 
-            std::string token = tokens[snippet_index];
-            Tokenizer(token, true, true).tokenize(token);
-
-            if(token_hits.count(token) != 0) {
-                snippet_stream << highlight_start_tag << tokens[snippet_index] << highlight_end_tag;
-                matched_tokens.push_back(tokens[snippet_index]);
+            if(token_hits.count(normalized_token) != 0) {
+                snippet_stream << highlight_start_tag << raw_tokens[snippet_index] << highlight_end_tag;
+                matched_tokens.push_back(raw_tokens[snippet_index]);
             } else {
-                snippet_stream << tokens[snippet_index];
+                snippet_stream << raw_tokens[snippet_index];
             }
+
+            snippet_index++;
         }
 
         highlight.snippets.push_back(snippet_stream.str());
@@ -1501,18 +1528,14 @@ void Collection::highlight_result(const field &search_field,
 
         if(highlighted_fully) {
             std::stringstream value_stream;
-            for(size_t value_index = 0; value_index < tokens.size(); value_index++) {
-                if(value_index != 0) {
-                    value_stream << " ";
-                }
+            for(size_t value_index = 0; value_index < raw_tokens.size(); value_index++) {
+                std::string normalized_token;
+                Tokenizer(raw_tokens[value_index], true, true).tokenize(normalized_token);
 
-                std::string token = tokens[value_index];
-                Tokenizer(token, true, true).tokenize(token);
-
-                if(token_hits.count(token) != 0) {
-                    value_stream << highlight_start_tag << tokens[value_index] << highlight_end_tag;
+                if(token_hits.count(normalized_token) != 0) {
+                    value_stream << highlight_start_tag << raw_tokens[value_index] << highlight_end_tag;
                 } else {
-                    value_stream << tokens[value_index];
+                    value_stream << raw_tokens[value_index];
                 }
             }
 
diff --git a/src/index.cpp b/src/index.cpp
index c272eeec..8500dee5 100644
--- a/src/index.cpp
+++ b/src/index.cpp
@@ -575,7 +575,7 @@ void Index::index_string_field(const std::string & text, const int64_t score, ar
                                     uint32_t seq_id, bool is_facet, const field & a_field) {
     std::unordered_map<std::string, std::vector<uint32_t>> token_to_offsets;
 
-    Tokenizer tokenizer(text, true, true, !a_field.is_string());
+    Tokenizer tokenizer(text, false, true, !a_field.is_string());
     std::string token;
     size_t token_index = 0;
 
@@ -588,7 +588,6 @@ void Index::index_string_field(const std::string & text, const int64_t score, ar
 
         if(is_facet) {
             uint64_t hash = facet_token_hash(a_field, token);
-            //facet_index_v2[seq_id][facet_id].push_back(hash);
             facet_hashes.push_back(hash);
         }
 
@@ -623,7 +622,7 @@ void Index::index_string_array_field(const std::vector<std::string> & strings, c
         const std::string& str = strings[array_index];
         std::set<std::string> token_set;  // required to deal with repeating tokens
 
-        Tokenizer tokenizer(str, true, true, !a_field.is_string());
+        Tokenizer tokenizer(str, false, true, !a_field.is_string());
         std::string token;
         size_t token_index = 0;
 
@@ -2216,6 +2215,8 @@ void Index::populate_token_positions(const std::vector<art_leaf *>& query_sugges
     for(size_t i = 0; i < query_suggestion.size(); i++) {
         const art_leaf* token_leaf = query_suggestion[i];
         uint32_t doc_index = leaf_to_indices[i][result_index];
+        /*LOG(INFO) << "doc_id: " << token_leaf->values->ids.at(doc_index) << ", token_leaf->values->ids.getLength(): "
+                  << token_leaf->values->ids.getLength();*/
 
         // it's possible for a query token to not appear in a resulting document
         if(doc_index == token_leaf->values->ids.getLength()) {
@@ -2229,7 +2230,14 @@ void Index::populate_token_positions(const std::vector<art_leaf *>& query_sugges
         /*uint32_t* offsets = token_leaf->values->offsets.uncompress();
         for(size_t ii=0; ii < token_leaf->values->offsets.getLength(); ii++) {
             LOG(INFO) << "offset: " << offsets[ii];
-        }*/
+        }
+
+        uint32_t* offset_indices = token_leaf->values->offset_index.uncompress();
+        for(size_t ii=0; ii < token_leaf->values->offset_index.getLength(); ii++) {
+            LOG(INFO) << "offset index: " << offset_indices[ii];
+        }
+
+        LOG(INFO) << "token_leaf->values->offsets.getLength(): " << token_leaf->values->offsets.getLength();*/
 
         uint32_t start_offset = token_leaf->values->offset_index.at(doc_index);
         uint32_t end_offset = (doc_index == token_leaf->values->ids.getLength() - 1) ?
@@ -2464,6 +2472,7 @@ void Index::tokenize_doc_field(const nlohmann::json& document, const field& sear
 }
 
 art_leaf* Index::get_token_leaf(const std::string & field_name, const unsigned char* token, uint32_t token_len) {
+    std::shared_lock lock(mutex);
     const art_tree *t = search_index.at(field_name);
     return (art_leaf*) art_search(t, token, (int) token_len);
 }
diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp
index 9dc86cd8..ac99a2bb 100644
--- a/src/tokenizer.cpp
+++ b/src/tokenizer.cpp
@@ -2,62 +2,63 @@
 #include "tokenizer.h"
 
 bool Tokenizer::next(std::string &token, size_t& token_index) {
-    std::stringstream out;
-
-    if(i >= text.size()) {
-        if(i == text.size() && !text.empty() && text.back() == ' ') {
-            token = "";
-            i++;
-            return true;
+    if(no_op) {
+        if(i == text.size()) {
+            return false;
         }
 
-        return false;
-    }
-
-    if(no_op) {
         token = text;
         i = text.size();
         return true;
     }
 
     while(i < text.size()) {
-        if((text[i] & ~0x7f) == 0 ) {
-            // ASCII character: split on space/newline or lowercase otherwise
-            if(std::isalnum(text[i])) {
+        bool is_ascii = (text[i] & ~0x7f) == 0;
+        if(is_ascii) {
+            const size_t next_stream_mode = std::isalnum(text[i]) ? CHARS : SEPARATORS;
+
+            if(next_stream_mode != stream_mode) {
+                // We tokenize when `stream_mode` changes
+                token = out.str();
+
+                out.str(std::string());
                 if(normalize) {
                     out << char(std::tolower(text[i]));
                 } else {
                     out << text[i];
                 }
+                i++;
+
+                if(stream_mode == SEPARATORS && !keep_separators) {
+                    stream_mode = next_stream_mode;
+                    continue;
+                }
+
+                token_index = token_counter++;
+                stream_mode = next_stream_mode;
+                return true;
             } else {
-                bool is_space = text[i] == 32;
-                bool is_new_line = text[i] == 10;
-                bool is_whitespace = is_space || is_new_line;
-
-                bool next_char_alphanum = (i != text.length() - 1) && std::isalnum(text[i + 1]);
-
-                if(!normalize && !is_whitespace && (i == text.length() - 1 || !next_char_alphanum)) {
-                    // checking for next char ensures that `foo-bar` does not get split to `foo-`
+                if(normalize) {
+                    out << char(std::tolower(text[i]));
+                } else {
                     out << text[i];
                 }
 
-                if(is_whitespace || next_char_alphanum) {
-                    // we split on space or on a special character whose next char is alphanumeric
-                    token = out.str();
-                    out.clear();
-                    i++;
-
-                    if(!keep_empty && token.empty()) {
-                        continue;
-                    }
-
-                    token_index = token_counter++;
-                    return true;
-                }
+                i++;
+                continue;
             }
+        }
 
-            i++;
-            continue;
+        if(stream_mode == SEPARATORS) { // to detect first non-ascii character
+            // we will tokenize now and treat the following non-ascii chars as a different token
+            stream_mode = CHARS;
+            token = out.str();
+            out.str(std::string());
+
+            if(keep_separators) {
+                token_index = token_counter++;
+                return true;
+            }
         }
 
         char inbuf[5];
@@ -90,18 +91,17 @@ bool Tokenizer::next(std::string &token, size_t& token_index) {
             // symbol cannot be represented as ASCII, so write the original symbol
             out << inbuf;
         } else {
-            // NOTE: outsize indicates bytes available AFTER current position so have to do <=
             for(size_t out_index=0; out_index<5; out_index++) {
                 if(!normalize) {
                     out << outbuf[out_index];
                     continue;
                 }
 
-                bool is_ascii = ((outbuf[out_index] & ~0x7f) == 0);
-                bool keep_char = !is_ascii || std::isalnum(outbuf[out_index]);
+                bool unicode_is_ascii = ((outbuf[out_index] & ~0x7f) == 0);
+                bool keep_char = !unicode_is_ascii || std::isalnum(outbuf[out_index]);
 
                 if(keep_char) {
-                    if(is_ascii && std::isalnum(outbuf[out_index])) {
+                    if(unicode_is_ascii && std::isalnum(outbuf[out_index])) {
                         outbuf[out_index] = char(std::tolower(outbuf[out_index]));
                     }
                     out << outbuf[out_index];
@@ -111,9 +111,13 @@ bool Tokenizer::next(std::string &token, size_t& token_index) {
     }
 
     token = out.str();
-    out.clear();
+    out.str(std::string());
 
-    if(!keep_empty && token.empty()) {
+    if(token.empty()) {
+        return false;
+    }
+
+    if(!keep_separators && (token[0] & ~0x7f) == 0 && !std::isalnum(token[0])) { // trailing ascii separators only
         return false;
     }
 
diff --git a/test/collection_synonyms_test.cpp b/test/collection_synonyms_test.cpp
index 322b0f9a..61b1c742 100644
--- a/test/collection_synonyms_test.cpp
+++ b/test/collection_synonyms_test.cpp
@@ -359,8 +359,8 @@ TEST_F(CollectionSynonymsTest, MultiWaySynonym) {
 
     ASSERT_EQ(2, res["hits"].size());
     ASSERT_EQ(2, res["found"].get<uint32_t>());
-    ASSERT_STREQ("<mark>Samuel</mark> <mark>L.</mark> <mark>Jackson</mark>", res["hits"][0]["highlights"][0]["snippet"].get<std::string>().c_str());
-    ASSERT_STREQ("<mark>Samuel</mark> <mark>L.</mark> <mark>Jackson</mark>", res["hits"][1]["highlights"][0]["snippet"].get<std::string>().c_str());
+    ASSERT_STREQ("<mark>Samuel</mark> <mark>L</mark>. <mark>Jackson</mark>", res["hits"][0]["highlights"][0]["snippet"].get<std::string>().c_str());
+    ASSERT_STREQ("<mark>Samuel</mark> <mark>L</mark>. <mark>Jackson</mark>", res["hits"][1]["highlights"][0]["snippet"].get<std::string>().c_str());
 
     // for now we don't support synonyms on ANY prefix
 
diff --git a/test/collection_test.cpp b/test/collection_test.cpp
index 871fc156..65879b1d 100644
--- a/test/collection_test.cpp
+++ b/test/collection_test.cpp
@@ -164,7 +164,7 @@ TEST_F(CollectionTest, PhraseSearch) {
        13:  score: 12, (single word match)
     */
 
-    std::vector<std::string> ids = {"8", "1", "16", "17", "13"};
+    std::vector<std::string> ids = {"8", "1", "17", "16", "13"};
 
     for(size_t i = 0; i < results["hits"].size(); i++) {
         nlohmann::json result = results["hits"].at(i);
@@ -184,7 +184,7 @@ TEST_F(CollectionTest, PhraseSearch) {
     ASSERT_EQ(5, results["hits"].size());
     ASSERT_EQ(5, results["found"].get<uint32_t>());
 
-    ids = {"8", "1", "17", "16", "13"};
+    ids = {"8", "17", "1", "16", "13"};
 
     for(size_t i = 0; i < results["hits"].size(); i++) {
         nlohmann::json result = results["hits"].at(i);
@@ -200,7 +200,7 @@ TEST_F(CollectionTest, PhraseSearch) {
 
     ASSERT_EQ(3, results["request_params"]["per_page"].get<size_t>());
 
-    ids = {"8", "1", "16"};
+    ids = {"8", "1", "17"};
 
     for(size_t i = 0; i < 3; i++) {
         nlohmann::json result = results["hits"].at(i);
@@ -1958,7 +1958,7 @@ TEST_F(CollectionTest, SearchLargeTextField) {
 
     ASSERT_EQ(1, results["hits"].size());
 
-    ASSERT_STREQ("non arcu id lectus <mark>accumsan</mark> venenatis at at justo.",
+    ASSERT_STREQ("non arcu id lectus <mark>accumsan</mark> venenatis at at justo",
     results["hits"][0]["highlights"][0]["snippet"].get<std::string>().c_str());
 
     collectionManager.drop_collection("coll_large_text");
@@ -2141,7 +2141,7 @@ TEST_F(CollectionTest, SearchHighlightWithNewLine) {
                              token_ordering::FREQUENCY, true, 10, spp::sparse_hash_set<std::string>(),
                              spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 40, {}, {}, {}, 0).get();
 
-    ASSERT_STREQ("Blah, blah <mark>Stark</mark> Industries",
+    ASSERT_STREQ("Blah, blah\n<mark>Stark</mark> Industries",
                  res["hits"][0]["highlights"][0]["snippet"].get<std::string>().c_str());
 
     ASSERT_STREQ("Stark", res["hits"][0]["highlights"][0]["matched_tokens"][0].get<std::string>().c_str());
@@ -3184,7 +3184,7 @@ TEST_F(CollectionTest, SearchingForRecordsWithSpecialChars) {
 
     std::vector<std::vector<std::string>> records = {
         {"Amazon Home", "https://amazon.com/"},
-        {"Google Home", "https://google.com/"},
+        {"Google Home", "https://google.com///"},
         {"Github Issue", "https://github.com/typesense/typesense/issues/241"},
         {"Amazon Search", "https://www.amazon.com/s?k=phone&ref=nb_sb_noss_2"},
     };
@@ -3206,12 +3206,17 @@ TEST_F(CollectionTest, SearchingForRecordsWithSpecialChars) {
     ASSERT_EQ(1, results["found"].get<size_t>());
     ASSERT_STREQ("1", results["hits"][0]["document"]["id"].get<std::string>().c_str());
 
+    ASSERT_EQ(2, results["hits"][0]["highlights"].size());
+    ASSERT_EQ("<mark>Google</mark> Home", results["hits"][0]["highlights"][0]["snippet"].get<std::string>());
+    ASSERT_EQ("https://<mark>google</mark>.com///", results["hits"][0]["highlights"][1]["snippet"].get<std::string>());
+
     results = coll1->search("amazon.com",
                             {"title", "url"}, "", {}, {}, 2, 10, 1, FREQUENCY).get();
 
-    ASSERT_EQ(2, results["found"].get<size_t>());
+    ASSERT_EQ(3, results["found"].get<size_t>());
     ASSERT_STREQ("3", results["hits"][0]["document"]["id"].get<std::string>().c_str());
     ASSERT_STREQ("0", results["hits"][1]["document"]["id"].get<std::string>().c_str());
+    ASSERT_STREQ("1", results["hits"][2]["document"]["id"].get<std::string>().c_str());
 
     results = coll1->search("typesense",
                             {"title", "url"}, "", {}, {}, 2, 10, 1, FREQUENCY).get();
@@ -3225,5 +3230,9 @@ TEST_F(CollectionTest, SearchingForRecordsWithSpecialChars) {
     ASSERT_EQ(1, results["found"].get<size_t>());
     ASSERT_STREQ("3", results["hits"][0]["document"]["id"].get<std::string>().c_str());
 
+    ASSERT_EQ(1, results["hits"][0]["highlights"].size());
+    ASSERT_EQ("https://www.amazon.com/s?k=phone&ref=<mark>nb</mark>_<mark>sb</mark>_<mark>noss</mark>_<mark>2</mark>",
+              results["hits"][0]["highlights"][0]["snippet"].get<std::string>());
+
     collectionManager.drop_collection("coll1");
-}
\ No newline at end of file
+}
diff --git a/test/tokenizer_test.cpp b/test/tokenizer_test.cpp
index 3a4b1264..58123d4c 100644
--- a/test/tokenizer_test.cpp
+++ b/test/tokenizer_test.cpp
@@ -4,25 +4,23 @@
 TEST(TokenizerTest, ShouldTokenizeNormalizeDifferentStrings) {
     const std::string withnewline = "Michael Jordan:\nWelcome, everybody. Welcome! ";
     std::vector<std::string> tokens;
-    Tokenizer(withnewline, true, true, false).tokenize(tokens);
-    ASSERT_EQ(6, tokens.size());
+    Tokenizer(withnewline, false, true, false).tokenize(tokens);
+    ASSERT_EQ(5, tokens.size());
     ASSERT_STREQ("michael", tokens[0].c_str());
     ASSERT_STREQ("jordan", tokens[1].c_str());
     ASSERT_STREQ("welcome", tokens[2].c_str());
     ASSERT_STREQ("everybody", tokens[3].c_str());
     ASSERT_STREQ("welcome", tokens[4].c_str());
-    ASSERT_STREQ("", tokens[5].c_str());
 
     const std::string withspaces = " Michael  Jordan  ";
     tokens.clear();
     Tokenizer(withspaces, true, true, false).tokenize(tokens);
-    ASSERT_EQ(6, tokens.size());
-    ASSERT_STREQ("", tokens[0].c_str());
+    ASSERT_EQ(5, tokens.size());
+    ASSERT_STREQ(" ", tokens[0].c_str());
     ASSERT_STREQ("michael", tokens[1].c_str());
-    ASSERT_STREQ("", tokens[2].c_str());
+    ASSERT_STREQ("  ", tokens[2].c_str());
     ASSERT_STREQ("jordan", tokens[3].c_str());
-    ASSERT_STREQ("", tokens[4].c_str());
-    ASSERT_STREQ("", tokens[5].c_str());
+    ASSERT_STREQ("  ", tokens[4].c_str());
 
     tokens.clear();
     Tokenizer(withspaces, false, true, false).tokenize(tokens);
@@ -30,38 +28,6 @@ TEST(TokenizerTest, ShouldTokenizeNormalizeDifferentStrings) {
     ASSERT_STREQ("michael", tokens[0].c_str());
     ASSERT_STREQ("jordan", tokens[1].c_str());
 
-    const std::string withspecialchars = "Special ½¥ and தமிழ் 你好吗 abcÅà123ß12 here.";
-    tokens.clear();
-    Tokenizer(withspecialchars, false, true, false).tokenize(tokens);
-    ASSERT_EQ(7, tokens.size());
-    ASSERT_STREQ("special", tokens[0].c_str());
-    ASSERT_STREQ("12yen", tokens[1].c_str());
-    ASSERT_STREQ("and", tokens[2].c_str());
-    ASSERT_STREQ("தமிழ்", tokens[3].c_str());
-    ASSERT_STREQ("你好吗", tokens[4].c_str());
-    ASSERT_STREQ("abcaa123ss12", tokens[5].c_str());
-    ASSERT_STREQ("here", tokens[6].c_str());
-
-    // when normalization is disabled and keep empty is enabled
-    const std::string withoutnormalize = "Mise  à,  jour.";
-    tokens.clear();
-    Tokenizer(withoutnormalize, true, false, false).tokenize(tokens);
-    ASSERT_EQ(5, tokens.size());
-    ASSERT_STREQ("Mise", tokens[0].c_str());
-    ASSERT_STREQ("", tokens[1].c_str());
-    ASSERT_STREQ("à,", tokens[2].c_str());
-    ASSERT_STREQ("", tokens[3].c_str());
-    ASSERT_STREQ("jour.", tokens[4].c_str());
-
-    // when normalization and keep empty are disabled
-    const std::string withoutnormalizeandkeepempty = "Mise  à  jour.";
-    tokens.clear();
-    Tokenizer(withoutnormalizeandkeepempty, false, false, false).tokenize(tokens);
-    ASSERT_EQ(3, tokens.size());
-    ASSERT_STREQ("Mise", tokens[0].c_str());
-    ASSERT_STREQ("à", tokens[1].c_str());
-    ASSERT_STREQ("jour.", tokens[2].c_str());
-
     // single token
     const std::string single_token = "foobar";
     tokens.clear();
@@ -89,22 +55,82 @@ TEST(TokenizerTest, ShouldTokenizeNormalizeDifferentStrings) {
     const std::string multispace_tokens = "foo     bar";
     tokens.clear();
     Tokenizer(multispace_tokens, true, false, false).tokenize(tokens);
-    ASSERT_EQ(6, tokens.size());
+    ASSERT_EQ(3, tokens.size());
     ASSERT_STREQ("foo", tokens[0].c_str());
-    ASSERT_STREQ("", tokens[1].c_str());
-    ASSERT_STREQ("", tokens[2].c_str());
-    ASSERT_STREQ("", tokens[3].c_str());
-    ASSERT_STREQ("", tokens[4].c_str());
-    ASSERT_STREQ("bar", tokens[5].c_str());
+    ASSERT_STREQ("     ", tokens[1].c_str());
+    ASSERT_STREQ("bar", tokens[2].c_str());
+
+    // special chars
+    const std::string specialchar_tokens = "https://www.amazon.com/s?k=phone&ref=nb_sb_noss_2";;
+    tokens.clear();
+    Tokenizer(specialchar_tokens, true, false, false).tokenize(tokens);
+    ASSERT_EQ(23, tokens.size());
+    ASSERT_STREQ("https", tokens[0].c_str());
+    ASSERT_STREQ("://", tokens[1].c_str());
+    ASSERT_STREQ("www", tokens[2].c_str());
+    ASSERT_STREQ(".", tokens[3].c_str());
+    ASSERT_STREQ("noss", tokens[20].c_str());
+    ASSERT_STREQ("_", tokens[21].c_str());
+    ASSERT_STREQ("2", tokens[22].c_str());
 
     // noop
 
     tokens.clear();
+    const std::string withspecialchars = "Special ½¥ and தமிழ் 你好吗 abcÅà123ß12 here.";
     Tokenizer(withspecialchars, false, true, true).tokenize(tokens);
     ASSERT_EQ(1, tokens.size());
     ASSERT_STREQ(withspecialchars.c_str(), tokens[0].c_str());
 }
 
+TEST(TokenizerTest, ShouldTokenizeNormalizeUnicodeStrings) {
+    std::vector<std::string> tokens;
+
+    const std::string withspecialchars = "Special ½¥ and -தமிழ் 你2好吗 abcÅà123ß12 here.";
+    tokens.clear();
+    Tokenizer(withspecialchars, false, true, false).tokenize(tokens);
+    ASSERT_EQ(7, tokens.size());
+    ASSERT_STREQ("special", tokens[0].c_str());
+    ASSERT_STREQ("12yen", tokens[1].c_str());
+    ASSERT_STREQ("and", tokens[2].c_str());
+    ASSERT_STREQ("தமிழ்", tokens[3].c_str());
+    ASSERT_STREQ("你2好吗", tokens[4].c_str());
+    ASSERT_STREQ("abcaa123ss12", tokens[5].c_str());
+    ASSERT_STREQ("here", tokens[6].c_str());
+
+    // when normalization is disabled and separators are kept
+    const std::string withoutnormalize = "Mise  à,  jour.";
+    tokens.clear();
+    Tokenizer(withoutnormalize, true, false, false).tokenize(tokens);
+    ASSERT_EQ(6, tokens.size());
+    ASSERT_STREQ("Mise", tokens[0].c_str());
+    ASSERT_STREQ("  ", tokens[1].c_str());
+    ASSERT_STREQ("à", tokens[2].c_str());
+    ASSERT_STREQ(",  ", tokens[3].c_str());
+    ASSERT_STREQ("jour", tokens[4].c_str());
+    ASSERT_STREQ(".", tokens[5].c_str());
+
+    // when normalization is disabled and separators are not kept
+    const std::string withoutnormalizeandkeepempty = "Mise  à  jour.";
+    tokens.clear();
+    Tokenizer(withoutnormalizeandkeepempty, false, false, false).tokenize(tokens);
+    ASSERT_EQ(3, tokens.size());
+    ASSERT_STREQ("Mise", tokens[0].c_str());
+    ASSERT_STREQ("à", tokens[1].c_str());
+    ASSERT_STREQ("jour", tokens[2].c_str());
+
+    // single accented word tokenization
+    std::string singleword = "à";
+    tokens.clear();
+    Tokenizer(singleword, false, true, false).tokenize(tokens);
+    ASSERT_EQ(1, tokens.size());
+    ASSERT_STREQ("a", tokens[0].c_str());
+
+    tokens.clear();
+    Tokenizer(singleword, true, true, false).tokenize(tokens);
+    ASSERT_EQ(1, tokens.size());
+    ASSERT_STREQ("a", tokens[0].c_str());
+}
+
 TEST(TokenizerTest, ShouldTokenizeIteratively) {
     const std::string withnewline = "Michael Jordan:\n\nWelcome, everybody. Welcome!";
     std::vector<std::string> tokens;
@@ -117,20 +143,34 @@ TEST(TokenizerTest, ShouldTokenizeIteratively) {
         tokens.push_back(token);
     }
 
-    ASSERT_EQ(6, tokens.size());
+    ASSERT_EQ(10, tokens.size());
     ASSERT_STREQ("michael", tokens[0].c_str());
-    ASSERT_STREQ("jordan", tokens[1].c_str());
-    ASSERT_STREQ("", tokens[2].c_str());
-    ASSERT_STREQ("welcome", tokens[3].c_str());
-    ASSERT_STREQ("everybody", tokens[4].c_str());
-    ASSERT_STREQ("welcome", tokens[5].c_str());
+    ASSERT_STREQ(" ", tokens[1].c_str());
+    ASSERT_STREQ("jordan", tokens[2].c_str());
+    ASSERT_STREQ(":\n\n", tokens[3].c_str());
+    ASSERT_STREQ("welcome", tokens[4].c_str());
+    ASSERT_STREQ(", ", tokens[5].c_str());
+    ASSERT_STREQ("everybody", tokens[6].c_str());
+    ASSERT_STREQ(". ", tokens[7].c_str());
+    ASSERT_STREQ("welcome", tokens[8].c_str());
+    ASSERT_STREQ("!", tokens[9].c_str());
+
+    // check for index when separators are not kept
+    Tokenizer tokenizer2(withnewline, false, true, false);
+    size_t expected_token_index = 0;
+    std::vector<std::string> expected_tokens = {"michael", "jordan", "welcome", "everybody", "welcome"};
+    while(tokenizer2.next(token, token_index)) {
+        ASSERT_EQ(expected_token_index, token_index);
+        ASSERT_EQ(expected_tokens[expected_token_index], token);
+        expected_token_index++;
+    }
 
     // verbatim (no_op=true)
 
     tokens.clear();
-    Tokenizer tokenizer2(withnewline, true, false, true);
+    Tokenizer tokenizer3(withnewline, true, false, true);
 
-    while(tokenizer2.next(token, token_index)) {
+    while(tokenizer3.next(token, token_index)) {
         tokens.push_back(token);
     }