Do word-level tokenization for the th locale.

This ensures that we handle changes in text length due to NFKC normalization.
Kishore Nallan 2022-08-31 07:35:05 +05:30
parent 729c73bb6d
commit 70e6a89ea8
5 changed files with 32 additions and 17 deletions
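
For context on the commit message: NFKC normalization can change the UTF-8 byte length of Thai text, because THAI CHARACTER SARA AM (U+0E33) has a compatibility decomposition into NIKHAHIT (U+0E4D) + SARA AA (U+0E32). Below is a minimal ICU4C sketch of that length change (illustrative scaffolding, not code from this commit):

    #include <unicode/normalizer2.h>
    #include <unicode/unistr.h>
    #include <iostream>
    #include <string>

    int main() {
        UErrorCode errcode = U_ZERO_ERROR;
        const icu::Normalizer2* nfkc = icu::Normalizer2::getNFKCInstance(errcode);

        // "น้ำ" (water) is U+0E19 U+0E49 U+0E33: 3 code points, 9 bytes in UTF-8.
        std::string raw = "น้ำ";

        icu::UnicodeString src = icu::UnicodeString::fromUTF8(raw);
        icu::UnicodeString dst = nfkc->normalize(src, errcode);
        std::string normalized;
        dst.toUTF8String(normalized);

        // NFKC expands U+0E33 (SARA AM) into U+0E4D + U+0E32, growing the text.
        std::cout << raw.size() << " -> " << normalized.size() << "\n";  // 9 -> 12
    }

This 9-byte vs 12-byte pair is exactly what the CollectionLocaleTest below exercises as word_9bytes and word_12bytes.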

include/collection.h

@@ -222,7 +222,7 @@ private:
bool handle_highlight_text(std::string& text, bool normalise, const field &search_field,
const std::vector<char>& symbols_to_index, const std::vector<char>& token_separators,
- highlight_t& highlight, StringUtils & string_utils, bool is_cyrillic,
+ highlight_t& highlight, StringUtils & string_utils, bool use_word_tokenizer,
const size_t highlight_affix_num_tokens,
const tsl::htrie_map<char, token_leaf>& qtoken_leaves,
int last_valid_offset_index, const Match& match,

src/collection.cpp

@@ -2262,8 +2262,8 @@ void Collection::highlight_result(const std::string& raw_query, const field &sea
tsl::htrie_set<char> matched_tokens;
- bool is_cyrillic = Tokenizer::is_cyrillic(search_field.locale);
- bool normalise = is_cyrillic ? false : true;
+ bool use_word_tokenizer = search_field.locale == "th" || Tokenizer::is_cyrillic(search_field.locale);
+ bool normalise = !use_word_tokenizer;
std::vector<std::string> raw_query_tokens;
Tokenizer(raw_query, normalise, false, search_field.locale, symbols_to_index, token_separators).tokenize(raw_query_tokens);
@@ -2348,14 +2348,14 @@ void Collection::highlight_result(const std::string& raw_query, const field &sea
std::string text = h_obj.get<std::string>();
handle_highlight_text(text, normalise, search_field, symbols_to_index,
- token_separators, array_highlight, string_utils, is_cyrillic,
- highlight_affix_num_tokens,
- qtoken_leaves, last_valid_offset_index, match,
- prefix_token_num_chars,
- highlight_fully, snippet_threshold, is_infix_search,
- raw_query_tokens,
- last_valid_offset, highlight_start_tag, highlight_end_tag,
- index_symbols, match_index);
+ token_separators, array_highlight, string_utils, use_word_tokenizer,
+ highlight_affix_num_tokens,
+ qtoken_leaves, last_valid_offset_index, match,
+ prefix_token_num_chars,
+ highlight_fully, snippet_threshold, is_infix_search,
+ raw_query_tokens,
+ last_valid_offset, highlight_start_tag, highlight_end_tag,
+ index_symbols, match_index);
if(!array_highlight.snippets.empty()) {
h_obj = array_highlight.snippets[0];
@@ -2458,7 +2458,7 @@ void Collection::highlight_result(const std::string& raw_query, const field &sea
}
handle_highlight_text(text, normalise, search_field, symbols_to_index, token_separators,
- highlight, string_utils, is_cyrillic, highlight_affix_num_tokens,
+ highlight, string_utils, use_word_tokenizer, highlight_affix_num_tokens,
qtoken_leaves, last_valid_offset_index, match, prefix_token_num_chars,
highlight_fully, snippet_threshold, is_infix_search, raw_query_tokens,
last_valid_offset, highlight_start_tag, highlight_end_tag,
@@ -2543,7 +2543,7 @@ void Collection::highlight_result(const std::string& raw_query, const field &sea
bool Collection::handle_highlight_text(std::string& text, bool normalise, const field &search_field,
const std::vector<char>& symbols_to_index, const std::vector<char>& token_separators,
- highlight_t& highlight, StringUtils & string_utils, bool is_cyrillic,
+ highlight_t& highlight, StringUtils & string_utils, bool use_word_tokenizer,
const size_t highlight_affix_num_tokens,
const tsl::htrie_map<char, token_leaf>& qtoken_leaves,
int last_valid_offset_index, const Match& match,
@@ -2580,7 +2580,7 @@ bool Collection::handle_highlight_text(std::string& text, bool normalise, const
bool found_first_match = false;
while(tokenizer.next(raw_token, raw_token_index, tok_start, tok_end)) {
- if(is_cyrillic) {
+ if(use_word_tokenizer) {
bool found_token = word_tokenizer.tokenize(raw_token);
if(!found_token) {
tokenizer.decr_token_counter();
@@ -2744,7 +2744,12 @@ void Collection::highlight_text(const string& highlight_start_tag, const string&
auto end_offset = offset_it->second;
// if a token ends with one or more punctuation chars, we should not highlight them
- for(int j = end_offset; j > 0; j--) {
+ for(int j = end_offset; j >= 0; j--) {
+     if(end_offset >= text.size()) {
+         // this should not happen unless we mess up unicode normalization
+         break;
+     }
if(!std::isalnum(text[j]) && Tokenizer::is_ascii_char(text[j]) &&
index_symbols[uint8_t(text[j])] != 1) {
end_offset--;
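
The bounds check added above guards Collection::highlight_text against offsets that were computed on the NFKC-normalized token but applied to the raw field text, which for Thai can be shorter. A contrived sketch of the failure mode the guard prevents (the values are illustrative, not taken from the commit):

    #include <iostream>
    #include <string>

    int main() {
        std::string text = "น้ำ";     // raw field value: 9 bytes
        std::size_t end_offset = 11;  // token end computed on the 12-byte NFKC form

        // Mirrors the new guard: bail out rather than index past text.size().
        if (end_offset >= text.size()) {
            std::cout << "offset beyond raw text; skip punctuation trimming\n";
        }
    }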

src/tokenizer.cpp

@@ -123,7 +123,7 @@ bool Tokenizer::next(std::string &token, size_t& token_index, size_t& start_inde
auto raw_text = unicode_text.tempSubStringBetween(start_pos, end_pos);
transliterator->transliterate(raw_text);
raw_text.toUTF8String(word);
- } else if(locale == "th") {
+ } else if(normalize && locale == "th") {
UErrorCode errcode = U_ZERO_ERROR;
icu::UnicodeString src = unicode_text.tempSubStringBetween(start_pos, end_pos);
icu::UnicodeString dst;
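
The change above makes the th normalization branch conditional on normalize; in the highlighting path (see collection.cpp above), th now sets normalise to false and relies on the word-level tokenizer instead. Thai is written without spaces between words, so word boundaries come from dictionary-based segmentation; the sketch below uses ICU's BreakIterator to show the idea (an approximation, not the project's actual word tokenizer code):

    #include <unicode/brkiter.h>
    #include <unicode/unistr.h>
    #include <iostream>
    #include <memory>
    #include <string>

    int main() {
        UErrorCode status = U_ZERO_ERROR;
        std::unique_ptr<icu::BreakIterator> bi(
            icu::BreakIterator::createWordInstance(icu::Locale("th"), status));

        icu::UnicodeString text = icu::UnicodeString::fromUTF8("จิ้งจอกสีน้ำตาลด่วน");
        bi->setText(text);

        // Walk consecutive boundaries; each [start, end) span is one segment.
        int32_t start = bi->first();
        for (int32_t end = bi->next(); end != icu::BreakIterator::DONE;
             start = end, end = bi->next()) {
            std::string word;
            text.tempSubStringBetween(start, end).toUTF8String(word);
            std::cout << word << "\n";
        }
    }

The expected segments line up with the TokenizerTest expectations at the end of this commit: จิ้งจอก, สี, น้ำตาล, ด่วน.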

test/collection_locale_test.cpp

@@ -332,9 +332,13 @@ TEST_F(CollectionLocaleTest, SearchAgainstThaiTextExactMatch) {
coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get();
}
+ std::string word_9bytes = "น้ำ";
+ std::string word_12bytes = "น้ํา";
std::vector<std::vector<std::string>> records = {
{"ติดกับดักรายได้ปานกลาง", "Expected Result"},
{"ข้อมูลรายคนหรือรายบริษัทในการเชื่อมโยงส่วนได้ส่วนเสีย", "Another Result"},
+ {word_9bytes, "Another Result"}, // NFKC normalization
};
for (size_t i = 0; i < records.size(); i++) {
@@ -361,6 +365,12 @@ TEST_F(CollectionLocaleTest, SearchAgainstThaiTextExactMatch) {
ASSERT_EQ("ข้อมูล<mark>ราย</mark>คนหรือ<mark>ราย</mark>บริษัทในการเชื่อมโยงส่วน<mark>ได้</mark>ส่วนเสีย",
results["hits"][1]["highlights"][0]["snippet"].get<std::string>());
+ // check text index overflow regression with NFKC normalization + highlighting
+ results = coll1->search(word_12bytes, {"title"}, "", {}, sort_fields, {2}, 10, 1, FREQUENCY).get();
+ ASSERT_EQ(1, results["found"].get<size_t>());
+ ASSERT_EQ(1, results["hits"].size());
+ ASSERT_EQ("<mark>น้ำ</mark>", results["hits"][0]["highlights"][0]["snippet"].get<std::string>());
}
TEST_F(CollectionLocaleTest, SearchAgainstKoreanText) {

test/tokenizer_test.cpp

@@ -237,7 +237,7 @@ TEST(TokenizerTest, ShouldTokenizeLocaleText) {
tokens.clear();
str = "จิ้งจอกสีน้ำตาลด่วน";
- Tokenizer(str, false, false, "th").tokenize(tokens);
+ Tokenizer(str, true, false, "th").tokenize(tokens);
ASSERT_EQ(4, tokens.size());
ASSERT_EQ("จิ้งจอก", tokens[0]);
ASSERT_EQ("สี", tokens[1]);