Handle special characters within non-English locales.

Unless a special character is present in the symbols-to-index or separators list, it will be skipped during tokenization.
This commit is contained in:
Kishore Nallan 2024-02-08 16:52:25 +05:30
parent 48df1e70e8
commit fc80cc3a72
3 changed files with 45 additions and 6 deletions

View File

@ -160,13 +160,25 @@ bool Tokenizer::next(std::string &token, size_t& token_index, size_t& start_inde
// `word` can be either a multi-byte unicode sequence or an ASCII character
// ASCII character can be either a special character or English alphabet
size_t orig_word_size = word.size();
if(is_ascii_char(word[0])) {
if(std::isalnum(word[0])) {
// normalize an ascii string and emit word as token
std::transform(word.begin(), word.end(), word.begin(),
[](unsigned char c){ return std::tolower(c); });
size_t read_index = 0, write_index = 0;
while (read_index < word.size()) {
size_t this_stream_mode = get_stream_mode(word[read_index]);
if(this_stream_mode != SKIP) {
word[write_index++] = std::tolower(word[read_index]);
}
read_index++;
}
// resize to fit new length
word.resize(write_index);
out += word;
emit_token = true;
}
@ -181,8 +193,6 @@ bool Tokenizer::next(std::string &token, size_t& token_index, size_t& start_inde
emit_token = true;
}
}
} else {
if(locale == "zh" && (word == "" || word == "" || word == "")) {
emit_token = false;
@ -201,7 +211,7 @@ bool Tokenizer::next(std::string &token, size_t& token_index, size_t& start_inde
}
start_index = utf8_start_index;
end_index = utf8_start_index + word.size() - 1;
end_index = utf8_start_index + orig_word_size - 1;
utf8_start_index = end_index + 1;
start_pos = end_pos;

View File

@ -905,6 +905,29 @@ TEST_F(CollectionLocaleTest, SearchInGermanLocaleShouldBeTypoTolerant) {
ASSERT_EQ(1, results["found"].get<size_t>());
}
// Regression test: a value containing a special character ("_") must remain
// searchable when the query string is parsed with a non-English (Thai) locale.
// Per this commit, a special character not present in symbols-to-index /
// separators is skipped, so "12345_" should match the indexed "12345_" sku.
TEST_F(CollectionLocaleTest, HandleSpecialCharsInThai) {
nlohmann::json coll_json = R"({
"name": "coll1",
"fields": [
{"name": "title_th", "type": "string", "locale": "th"},
{"name": "sku", "type": "string"}
]
})"_json;
auto coll1 = collectionManager.create_collection(coll_json).get();
nlohmann::json doc;
doc["title_th"] = "สวัสดี";
// sku deliberately ends with "_", a character outside symbols-to-index/separators
doc["sku"] = "12345_";
ASSERT_TRUE(coll1->add(doc.dump()).ok());
// query string is parsed using the locale of the first field in the query_by list
auto results = coll1->search("12345_", {"title_th", "sku"}, "", {}, {},
{2, 0}, 10, 1, FREQUENCY, {true, false}, 1).get();
ASSERT_EQ(1, results["found"].get<size_t>());
}
/*
TEST_F(CollectionLocaleTest, TranslitPad) {
UErrorCode translit_status = U_ZERO_ERROR;

View File

@ -254,6 +254,12 @@ TEST(TokenizerTest, ShouldTokenizeLocaleText) {
ASSERT_EQ("31", tokens[3]);
ASSERT_EQ("มี.ค", tokens[4]);
tokens.clear();
str = "12345_678";
Tokenizer(str, false, false, "th").tokenize(tokens);
ASSERT_EQ(1, tokens.size());
ASSERT_EQ("12345678", tokens[0]);
tokens.clear();
Tokenizer("Odd Thomas", false, false, "en").tokenize(tokens);
ASSERT_EQ(2, tokens.size());