Handle special characters in locale tokenization.

Kishore Nallan 2022-08-18 10:47:30 +05:30
parent fdb5f64d0e
commit 57ac561743
5 changed files with 147 additions and 34 deletions

View File

@@ -31,9 +31,14 @@ private:
std::string locale;
icu::BreakIterator* bi = nullptr;
icu::UnicodeString unicode_text;
// tracks start of a text segment that can span multiple unicode tokens due to use of custom symbols
int32_t utf8_start_index = 0;
// tracks current unicode segment for text extraction
int32_t start_pos = 0;
int32_t end_pos = 0;
int32_t utf8_start_index = 0;
char* normalized_text = nullptr;
// non-deletable singletons
@@ -78,4 +83,6 @@ public:
static inline bool is_ascii_char(char c) {
return (c & ~0x7f) == 0;
}
void decr_token_counter();
};
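
For reference, `is_ascii_char` works because ASCII code points use only the low seven bits, so masking those bits away leaves zero exactly when the byte is ASCII. A minimal standalone sketch of the same check (the free function and test values are illustrative, not part of the commit):

#include <cassert>

// Same test as Tokenizer::is_ascii_char above: a byte is ASCII iff its
// high bit is clear, so masking off the low 7 bits must leave nothing.
static bool is_ascii_char(char c) {
    return (c & ~0x7f) == 0;
}

int main() {
    assert(is_ascii_char('a'));      // 0x61, plain ASCII letter
    assert(is_ascii_char('-'));      // 0x2D, ASCII symbol
    assert(!is_ascii_char('\xE0'));  // lead byte of a 3-byte UTF-8 sequence (e.g. Thai)
    return 0;
}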

View File

@@ -2521,6 +2521,7 @@ bool Collection::handle_highlight_text(std::string& text, bool normalise, const
if(is_cyrillic) {
bool found_token = word_tokenizer.tokenize(raw_token);
if(!found_token) {
tokenizer.decr_token_counter();
continue;
}
}
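
For context, `decr_token_counter()` (added in this commit, shown in the tokenizer diff below) rolls the counter back when a Cyrillic raw token yields no usable token during highlighting: the tokenizer has already handed out an index for it, and returning that index keeps the indices of the following tokens contiguous with the positions recorded at index time. A hypothetical walk-through, assuming the counter semantics shown in the tokenizer diff:

#include <cstddef>
#include <iostream>
#include <string>
#include <utility>
#include <vector>

// Hypothetical walk-through of the rollback: each candidate token consumes an
// index; a rejected token hands its index back so the next accepted token
// stays aligned with the positions recorded at index time.
int main() {
    std::vector<std::pair<std::string, bool>> raw_tokens = {
        {"привет", true}, {"##", false}, {"мир", true}};
    size_t token_counter = 0;
    for (const auto& [tok, found_token] : raw_tokens) {
        size_t token_index = token_counter++;  // assigned by Tokenizer::next()
        if (!found_token) {
            if (token_counter > 0) token_counter--;  // decr_token_counter()
            continue;
        }
        std::cout << tok << " -> " << token_index << "\n";  // привет -> 0, мир -> 1
    }
    return 0;
}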

View File

@@ -106,10 +106,7 @@ bool Tokenizer::next(std::string &token, size_t& token_index, size_t& start_inde
if(!locale.empty() && locale != "en") {
while (end_pos != icu::BreakIterator::DONE) {
//LOG(INFO) << "Position: " << start_pos;
bool found_token = false;
std::string word;
//LOG(INFO) << "token: " << token;
if(locale == "ko") {
UErrorCode errcode = U_ZERO_ERROR;
@@ -118,67 +115,95 @@ bool Tokenizer::next(std::string &token, size_t& token_index, size_t& start_inde
nfkd->normalize(src, dst, errcode);
if(!U_FAILURE(errcode)) {
token = dst.toUTF8String(word);
dst.toUTF8String(word);
} else {
LOG(ERROR) << "Unicode error during parsing: " << errcode;
}
} else if(normalize && is_cyrillic(locale)) {
auto raw_text = unicode_text.tempSubStringBetween(start_pos, end_pos);
transliterator->transliterate(raw_text);
token = raw_text.toUTF8String(word);
raw_text.toUTF8String(word);
} else if(locale == "th") {
UErrorCode errcode = U_ZERO_ERROR;
icu::UnicodeString src = unicode_text.tempSubStringBetween(start_pos, end_pos);
icu::UnicodeString dst;
nfkc->normalize(src, dst, errcode);
if(!U_FAILURE(errcode)) {
token = dst.toUTF8String(word);
dst.toUTF8String(word);
} else {
LOG(ERROR) << "Unicode error during parsing: " << errcode;
}
} else {
token = unicode_text.tempSubStringBetween(start_pos, end_pos).toUTF8String(word);
unicode_text.tempSubStringBetween(start_pos, end_pos).toUTF8String(word);
}
if(!token.empty()) {
if(token == " " || token == "," || token == "." || token == "!" || token == "?") {
found_token = false;
} else if (!std::isalnum(token[0]) && is_ascii_char(token[0])) {
// ignore ascii symbols
found_token = false;
token_counter++;
} else if(locale == "ko" && token == "·") {
found_token = false;
token_counter++;
} else if(locale == "zh" && (token == "" || token == "" || token == "")) {
found_token = false;
token_counter++;
} else {
bool emit_token = false;
if(std::isalnum(token[0]) && is_ascii_char(token[0])) {
// normalize an ascii string
std::transform(token.begin(), token.end(), token.begin(),
[](unsigned char c){ return std::tolower(c); });
}
// `word` can be either a multi-byte unicode sequence or an ASCII character
// ASCII character can be either a special character or English alphabet
found_token = true;
token_index = token_counter++;
if(is_ascii_char(word[0])) {
if(std::isalnum(word[0])) {
// normalize an ascii string and emit word as token
std::transform(word.begin(), word.end(), word.begin(),
[](unsigned char c){ return std::tolower(c); });
out += word;
emit_token = true;
}
start_index = utf8_start_index;
end_index = utf8_start_index + token.size() - 1;
utf8_start_index = end_index + 1;
else {
// special character:
// a) present in `index_symbols` -> append word to out and continue iteration
// b) present in `separator_symbols` -> skip word
// c) not present in either -> skip word
if(index_symbols[uint8_t(word[0])] == 1) {
out += word;
emit_token = true;
}
}
} else {
if(locale == "zh" && (word == "" || word == "" || word == "")) {
emit_token = false;
} else if(locale == "ko" && word == "·") {
emit_token = false;
} else {
emit_token = true;
out += word;
}
}
if(emit_token) {
token = out;
token_index = token_counter++;
out.clear();
}
start_index = utf8_start_index;
end_index = utf8_start_index + word.size() - 1;
utf8_start_index = end_index + 1;
start_pos = end_pos;
end_pos = bi->next();
if(found_token) {
if(emit_token) {
return true;
}
}
return false;
token = out;
out.clear();
start_index = utf8_start_index;
end_index = text.size() - 1;
if(token.empty()) {
return false;
}
token_index = token_counter++;
return true;
}
while(i < text.size()) {
@@ -303,3 +328,9 @@ bool Tokenizer::is_cyrillic(const std::string& locale) {
return locale == "el" ||
locale == "ru" || locale == "sr" || locale == "uk" || locale == "be";
}
void Tokenizer::decr_token_counter() {
if(token_counter > 0) {
token_counter--;
}
}
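
Putting the new control flow together: every ICU break segment is copied into `word`; ASCII alphanumerics are lowercased and emitted, ASCII symbols are emitted only when flagged in `index_symbols` (case a), and all other ASCII symbols are skipped (cases b and c), while multi-byte segments are emitted unless they are the locale-specific punctuation checked above. A simplified, self-contained sketch of that decision (the `emit_tokens` helper is hypothetical; the zh/ko punctuation checks and the `out`/`utf8_start_index` bookkeeping are omitted):

#include <algorithm>
#include <array>
#include <cctype>
#include <cstdint>
#include <string>
#include <vector>

// Hypothetical condensation of the per-segment decision in Tokenizer::next().
// `segments` stands in for the pieces produced by the ICU BreakIterator and
// `index_symbols` for the 256-entry table built from `symbols_to_index`.
std::vector<std::string> emit_tokens(const std::vector<std::string>& segments,
                                     const std::array<std::uint8_t, 256>& index_symbols) {
    std::vector<std::string> tokens;
    for (std::string word : segments) {
        if (word.empty()) {
            continue;
        }
        bool emit_token = false;
        if ((word[0] & ~0x7f) == 0) {  // ASCII segment
            if (std::isalnum(static_cast<unsigned char>(word[0]))) {
                // alphanumeric: lowercase and emit
                std::transform(word.begin(), word.end(), word.begin(),
                               [](unsigned char c) { return std::tolower(c); });
                emit_token = true;
            } else if (index_symbols[static_cast<std::uint8_t>(word[0])] == 1) {
                emit_token = true;  // case (a): indexed symbol becomes a token
            }                       // cases (b)/(c): separator or unknown symbol, skip
        } else {
            emit_token = true;      // multi-byte unicode segment (zh/ko checks omitted)
        }
        if (emit_token) {
            tokens.push_back(word);
        }
    }
    return tokens;
}

With `index_symbols['-'] == 1`, the segments {"alpha", "-", "beta"} come back unchanged; without it the hyphen disappears, which matches the expectations in the new tests below.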

View File

@@ -220,6 +220,48 @@ TEST_F(CollectionLocaleTest, ThaiTextShouldBeNormalizedToNFKC) {
ASSERT_EQ(1, results["found"].get<size_t>());
}
TEST_F(CollectionLocaleTest, ThaiTextShouldRespectSeparators) {
nlohmann::json coll_json = R"({
"name": "coll1",
"fields": [
{"name": "title", "type": "string", "locale": "th"}
]
})"_json;
auto coll1 = collectionManager.create_collection(coll_json).get();
nlohmann::json doc;
doc["title"] = "alpha-beta-gamma";
ASSERT_TRUE(coll1->add(doc.dump()).ok());
auto results = coll1->search("*",{}, "title:=alpha-beta-gamma", {}, {},
{0}, 10, 1, FREQUENCY).get();
ASSERT_EQ(1, results["found"].get<size_t>());
// now with `symbols_to_index`
coll_json = R"({
"name": "coll2",
"symbols_to_index": ["-"],
"fields": [
{"name": "title", "type": "string", "locale": "th"}
]
})"_json;
auto coll2 = collectionManager.create_collection(coll_json).get();
ASSERT_TRUE(coll2->add(doc.dump()).ok());
results = coll2->search("*",{}, "title:=alpha-beta-gamma", {}, {},
{0}, 10, 1, FREQUENCY).get();
ASSERT_EQ(1, results["found"].get<size_t>());
results = coll2->search("*",{}, "title:=alphabetagamma", {}, {},
{0}, 10, 1, FREQUENCY).get();
ASSERT_EQ(0, results["found"].get<size_t>());
}
TEST_F(CollectionLocaleTest, SearchThaiTextPreSegmentedQuery) {
Collection *coll1;

View File

@@ -325,3 +325,35 @@ TEST(TokenizerTest, ShouldTokenizeLocaleTextWithEnglishText) {
ASSERT_EQ("discrete", ttokens[7]);
ASSERT_EQ("math", ttokens[8]);
}
TEST(TokenizerTest, ShouldTokenizeWithDifferentSymbolConfigs) {
std::string str1 = "ความ-เหลื่อมล้ำ";
// '-' in symbols_to_index: "ความ", "-", "เหลื่อม", "ล้ำ"
// '-' in separators: "ความ", "เหลื่อม", "ล้ำ"
// none: "ความ", "เหลื่อม", "ล้ำ"
std::vector<std::string> tokens;
Tokenizer(str1, true, false, "th", {'-'}, {}).tokenize(tokens);
ASSERT_EQ(4, tokens.size());
ASSERT_EQ("ความ", tokens[0]);
ASSERT_EQ("-", tokens[1]);
ASSERT_EQ("เหลื่อม", tokens[2]);
ASSERT_EQ("ล้ํา", tokens[3]);
tokens.clear();
Tokenizer(str1, true, false, "th", {}, {'-'}).tokenize(tokens);
ASSERT_EQ(3, tokens.size());
ASSERT_EQ("ความ", tokens[0]);
ASSERT_EQ("เหลื่อม", tokens[1]);
ASSERT_EQ("ล้ํา", tokens[2]);
tokens.clear();
Tokenizer(str1, true, false, "th", {}, {}).tokenize(tokens);
ASSERT_EQ(3, tokens.size());
ASSERT_EQ("ความ", tokens[0]);
ASSERT_EQ("เหลื่อม", tokens[1]);
ASSERT_EQ("ล้ํา", tokens[2]);
LOG(INFO) << "here";
}
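
A side note on the expected strings: the assertions use "ล้ํา" (four code points) rather than the input's "ล้ำ" because NFKC decomposes THAI SARA AM (U+0E33) into NIKHAHIT (U+0E4D) + SARA AA (U+0E32). A minimal ICU sketch of that normalization, mirroring the `nfkc->normalize` call in the tokenizer diff above:

#include <unicode/normalizer2.h>
#include <unicode/unistr.h>
#include <iostream>
#include <string>

int main() {
    UErrorCode errcode = U_ZERO_ERROR;
    const icu::Normalizer2* nfkc = icu::Normalizer2::getNFKCInstance(errcode);
    icu::UnicodeString src = icu::UnicodeString::fromUTF8("ล้ำ");  // ends in U+0E33
    icu::UnicodeString dst;
    nfkc->normalize(src, dst, errcode);
    if (U_FAILURE(errcode)) {
        std::cerr << "Unicode error during normalization: " << errcode << "\n";
        return 1;
    }
    std::string word;
    dst.toUTF8String(word);
    std::cout << word << "\n";  // prints the decomposed form "ล้ํา" (U+0E4D U+0E32)
    return 0;
}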