From 70e6a89ea8d76c59e00af16ce4559c8932ec08ec Mon Sep 17 00:00:00 2001
From: Kishore Nallan
Date: Wed, 31 Aug 2022 07:35:05 +0530
Subject: [PATCH] Do word level tokenization for th locale.

This ensures that we handle changes in text length due to NFKC
normalization.
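Background for reviewers: NFKC can change the UTF-8 byte length of Thai
text. For example, SARA AM (U+0E33) decomposes into NIKHAHIT + SARA AA
(U+0E4D U+0E32) and is not recomposed, so "น้ำ" grows from 9 to 12 bytes.
Offsets computed against the normalized token can therefore overshoot the
raw document text during highlighting. The snippet below is a minimal
standalone sketch of the length change, not part of this patch; it assumes
only ICU4C, which the tokenizer already links against:

    // nfkc_length_demo.cpp: standalone illustration of the NFKC expansion.
    #include <iostream>
    #include <string>
    #include <unicode/normalizer2.h>
    #include <unicode/unistr.h>

    int main() {
        UErrorCode errcode = U_ZERO_ERROR;
        const icu::Normalizer2* nfkc = icu::Normalizer2::getNFKCInstance(errcode);
        if(U_FAILURE(errcode)) {
            return 1;
        }

        // "น้ำ" = U+0E19 U+0E49 U+0E33 (ends with SARA AM): 9 bytes in UTF-8
        icu::UnicodeString src = icu::UnicodeString::fromUTF8("\u0E19\u0E49\u0E33");
        icu::UnicodeString dst = nfkc->normalize(src, errcode);

        std::string raw, normalized;
        src.toUTF8String(raw);
        dst.toUTF8String(normalized);

        // SARA AM expands to NIKHAHIT + SARA AA, so the normalized form
        // is 12 bytes: byte offsets into the raw text no longer line up.
        std::cout << raw.size() << " -> " << normalized.size() << std::endl;  // 9 -> 12
        return 0;
    }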
---
 include/collection.h            |  2 +-
 src/collection.cpp              | 33 +++++++++++++++++++--------------
 src/tokenizer.cpp               |  2 +-
 test/collection_locale_test.cpp | 10 ++++++++++
 test/tokenizer_test.cpp         |  2 +-
 5 files changed, 32 insertions(+), 17 deletions(-)

diff --git a/include/collection.h b/include/collection.h
index 7d8c76c3..ff3c991b 100644
--- a/include/collection.h
+++ b/include/collection.h
@@ -222,7 +222,7 @@ private:
     bool handle_highlight_text(std::string& text, bool normalise, const field &search_field,
                                const std::vector<char>& symbols_to_index,
                                const std::vector<char>& token_separators,
-                               highlight_t& highlight, StringUtils & string_utils, bool is_cyrillic,
+                               highlight_t& highlight, StringUtils & string_utils, bool use_word_tokenizer,
                                const size_t highlight_affix_num_tokens,
                                const tsl::htrie_map<char, token_leaf>& qtoken_leaves, int last_valid_offset_index,
                                const Match& match,
diff --git a/src/collection.cpp b/src/collection.cpp
index 26ef40ff..d5392014 100644
--- a/src/collection.cpp
+++ b/src/collection.cpp
@@ -2262,8 +2262,8 @@ void Collection::highlight_result(const std::string& raw_query, const field &sea
 
     tsl::htrie_set<char> matched_tokens;
 
-    bool is_cyrillic = Tokenizer::is_cyrillic(search_field.locale);
-    bool normalise = is_cyrillic ? false : true;
+    bool use_word_tokenizer = search_field.locale == "th" || Tokenizer::is_cyrillic(search_field.locale);
+    bool normalise = !use_word_tokenizer;
 
     std::vector<std::string> raw_query_tokens;
     Tokenizer(raw_query, normalise, false, search_field.locale, symbols_to_index, token_separators).tokenize(raw_query_tokens);
@@ -2348,14 +2348,14 @@ void Collection::highlight_result(const std::string& raw_query, const field &sea
 
             std::string text = h_obj.get<std::string>();
             handle_highlight_text(text, normalise, search_field, symbols_to_index,
-                                  token_separators, array_highlight, string_utils, is_cyrillic,
-                                  highlight_affix_num_tokens,
-                                  qtoken_leaves, last_valid_offset_index, match,
-                                  prefix_token_num_chars,
-                                  highlight_fully, snippet_threshold, is_infix_search,
-                                  raw_query_tokens,
-                                  last_valid_offset, highlight_start_tag, highlight_end_tag,
-                                  index_symbols, match_index);
+                                  token_separators, array_highlight, string_utils, use_word_tokenizer,
+                                  highlight_affix_num_tokens,
+                                  qtoken_leaves, last_valid_offset_index, match,
+                                  prefix_token_num_chars,
+                                  highlight_fully, snippet_threshold, is_infix_search,
+                                  raw_query_tokens,
+                                  last_valid_offset, highlight_start_tag, highlight_end_tag,
+                                  index_symbols, match_index);
 
             if(!array_highlight.snippets.empty()) {
                 h_obj = array_highlight.snippets[0];
@@ -2458,7 +2458,7 @@ void Collection::highlight_result(const std::string& raw_query, const field &sea
         }
 
         handle_highlight_text(text, normalise, search_field, symbols_to_index, token_separators,
-                              highlight, string_utils, is_cyrillic, highlight_affix_num_tokens,
+                              highlight, string_utils, use_word_tokenizer, highlight_affix_num_tokens,
                               qtoken_leaves, last_valid_offset_index, match, prefix_token_num_chars,
                               highlight_fully, snippet_threshold, is_infix_search, raw_query_tokens,
                               last_valid_offset, highlight_start_tag, highlight_end_tag,
@@ -2543,7 +2543,7 @@ void Collection::highlight_result(const std::string& raw_query, const field &sea
 bool Collection::handle_highlight_text(std::string& text, bool normalise, const field &search_field,
                                        const std::vector<char>& symbols_to_index,
                                        const std::vector<char>& token_separators,
-                                       highlight_t& highlight, StringUtils & string_utils, bool is_cyrillic,
+                                       highlight_t& highlight, StringUtils & string_utils, bool use_word_tokenizer,
                                        const size_t highlight_affix_num_tokens,
                                        const tsl::htrie_map<char, token_leaf>& qtoken_leaves, int last_valid_offset_index,
                                        const Match& match,
@@ -2580,7 +2580,7 @@ bool Collection::handle_highlight_text(std::string& text, bool normalise, const
     bool found_first_match = false;
 
     while(tokenizer.next(raw_token, raw_token_index, tok_start, tok_end)) {
-        if(is_cyrillic) {
+        if(use_word_tokenizer) {
             bool found_token = word_tokenizer.tokenize(raw_token);
             if(!found_token) {
                 tokenizer.decr_token_counter();
@@ -2744,7 +2744,12 @@ void Collection::highlight_text(const string& highlight_start_tag, const string&
         auto end_offset = offset_it->second;
 
         // if a token ends with one or more punctuation chars, we should not highlight them
-        for(int j = end_offset; j > 0; j--) {
+        for(int j = end_offset; j >= 0; j--) {
+            if(end_offset >= text.size()) {
+                // this should not happen unless we mess up unicode normalization
+                break;
+            }
+
             if(!std::isalnum(text[j]) && Tokenizer::is_ascii_char(text[j]) &&
                index_symbols[uint8_t(text[j])] != 1) {
                 end_offset--;
diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp
index 4eb357a5..082db1be 100644
--- a/src/tokenizer.cpp
+++ b/src/tokenizer.cpp
@@ -123,7 +123,7 @@ bool Tokenizer::next(std::string &token, size_t& token_index, size_t& start_inde
             auto raw_text = unicode_text.tempSubStringBetween(start_pos, end_pos);
             transliterator->transliterate(raw_text);
             raw_text.toUTF8String(word);
-        } else if(locale == "th") {
+        } else if(normalize && locale == "th") {
             UErrorCode errcode = U_ZERO_ERROR;
             icu::UnicodeString src = unicode_text.tempSubStringBetween(start_pos, end_pos);
             icu::UnicodeString dst;
diff --git a/test/collection_locale_test.cpp b/test/collection_locale_test.cpp
index ab40b163..11ce2ed8 100644
--- a/test/collection_locale_test.cpp
+++ b/test/collection_locale_test.cpp
@@ -332,9 +332,13 @@ TEST_F(CollectionLocaleTest, SearchAgainstThaiTextExactMatch) {
         coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get();
     }
 
+    std::string word_9bytes = "น้ำ";
+    std::string word_12bytes = "น้ํา";
+
     std::vector<std::vector<std::string>> records = {
        {"ติดกับดักรายได้ปานกลาง", "Expected Result"},
        {"ข้อมูลรายคนหรือรายบริษัทในการเชื่อมโยงส่วนได้ส่วนเสีย", "Another Result"},
+       {word_9bytes, "Another Result"},  // NFKC normalization
     };
 
     for (size_t i = 0; i < records.size(); i++) {
@@ -361,6 +365,12 @@ TEST_F(CollectionLocaleTest, SearchAgainstThaiTextExactMatch) {
     ASSERT_EQ("ข้อมูลรายคนหรือรายบริษัทในการเชื่อมโยงส่วนได้ส่วนเสีย",
               results["hits"][1]["highlights"][0]["snippet"].get<std::string>());
 
+    // check for text index overflow regression with NFKC normalization + highlighting
+    results = coll1->search(word_12bytes, {"title"}, "", {}, sort_fields, {2}, 10, 1, FREQUENCY).get();
+    ASSERT_EQ(1, results["found"].get<size_t>());
+    ASSERT_EQ(1, results["hits"].size());
+    ASSERT_EQ("น้ำ", results["hits"][0]["highlights"][0]["snippet"].get<std::string>());
+
 }
 
 TEST_F(CollectionLocaleTest, SearchAgainstKoreanText) {
diff --git a/test/tokenizer_test.cpp b/test/tokenizer_test.cpp
index 12dd6984..caaafbf1 100644
--- a/test/tokenizer_test.cpp
+++ b/test/tokenizer_test.cpp
@@ -237,7 +237,7 @@ TEST(TokenizerTest, ShouldTokenizeLocaleText) {
 
     tokens.clear();
     str = "จิ้งจอกสีน้ำตาลด่วน";
-    Tokenizer(str, false, false, "th").tokenize(tokens);
+    Tokenizer(str, true, false, "th").tokenize(tokens);
     ASSERT_EQ(4, tokens.size());
     ASSERT_EQ("จิ้งจอก", tokens[0]);
     ASSERT_EQ("สี", tokens[1]);
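Note on the approach (illustrative, not part of the patch): with this change
the highlighter re-segments each raw token through the word_tokenizer for th,
as it already did for Cyrillic locales, rather than trusting byte offsets that
were computed against NFKC-normalized text. The standalone sketch below shows
the kind of dictionary-based word segmentation ICU performs for Thai; it
assumes only ICU4C and mirrors the four tokens asserted in tokenizer_test.cpp:

    // thai_word_break_demo.cpp: standalone illustration; assumes ICU4C.
    #include <iostream>
    #include <memory>
    #include <string>
    #include <unicode/brkiter.h>
    #include <unicode/locid.h>
    #include <unicode/unistr.h>

    int main() {
        UErrorCode errcode = U_ZERO_ERROR;
        std::unique_ptr<icu::BreakIterator> bi(
            icu::BreakIterator::createWordInstance(icu::Locale("th"), errcode));
        if(U_FAILURE(errcode)) {
            return 1;
        }

        // Same phrase as the tokenizer test ("the quick brown fox" in Thai);
        // note the text contains no spaces to split on.
        icu::UnicodeString text = icu::UnicodeString::fromUTF8("จิ้งจอกสีน้ำตาลด่วน");
        bi->setText(text);

        // Walk the word boundaries; ICU's Thai dictionary splits the run
        // into จิ้งจอก / สี / น้ำตาล / ด่วน (4 tokens, per the test's asserts).
        int32_t start = bi->first();
        for(int32_t end = bi->next(); end != icu::BreakIterator::DONE;
            start = end, end = bi->next()) {
            std::string word;
            text.tempSubStringBetween(start, end).toUTF8String(word);
            std::cout << word << std::endl;
        }
        return 0;
    }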