diff --git a/include/tokenizer.h b/include/tokenizer.h
index 36f3af90..53a2ab13 100644
--- a/include/tokenizer.h
+++ b/include/tokenizer.h
@@ -31,9 +31,14 @@ private:
     std::string locale;
     icu::BreakIterator* bi = nullptr;
     icu::UnicodeString unicode_text;
+
+    // tracks the start of a text segment that can span multiple unicode tokens due to the use of custom symbols
+    int32_t utf8_start_index = 0;
+
+    // tracks the current unicode segment for text extraction
     int32_t start_pos = 0;
     int32_t end_pos = 0;
-    int32_t utf8_start_index = 0;
+
     char* normalized_text = nullptr;
 
     // non-deletable singletons
@@ -78,4 +83,6 @@ public:
     static inline bool is_ascii_char(char c) {
         return (c & ~0x7f) == 0;
     }
+
+    void decr_token_counter();
 };
\ No newline at end of file
diff --git a/src/collection.cpp b/src/collection.cpp
index b8dbbac2..e6f34b81 100644
--- a/src/collection.cpp
+++ b/src/collection.cpp
@@ -2521,6 +2521,7 @@ bool Collection::handle_highlight_text(std::string& text, bool normalise, const
             if(is_cyrillic) {
                 bool found_token = word_tokenizer.tokenize(raw_token);
                 if(!found_token) {
+                    tokenizer.decr_token_counter();
                     continue;
                 }
             }
diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp
index 41aab66e..4eb357a5 100644
--- a/src/tokenizer.cpp
+++ b/src/tokenizer.cpp
@@ -106,10 +106,7 @@ bool Tokenizer::next(std::string &token, size_t& token_index, size_t& start_inde
     if(!locale.empty() && locale != "en") {
         while (end_pos != icu::BreakIterator::DONE) {
             //LOG(INFO) << "Position: " << start_pos;
-            bool found_token = false;
-
             std::string word;
-            //LOG(INFO) << "token: " << token;
 
             if(locale == "ko") {
                 UErrorCode errcode = U_ZERO_ERROR;
@@ -118,67 +115,95 @@ bool Tokenizer::next(std::string &token, size_t& token_index, size_t& start_inde
                 nfkd->normalize(src, dst, errcode);
 
                 if(!U_FAILURE(errcode)) {
-                    token = dst.toUTF8String(word);
+                    dst.toUTF8String(word);
                 } else {
                     LOG(ERROR) << "Unicode error during parsing: " << errcode;
                 }
             } else if(normalize && is_cyrillic(locale)) {
                 auto raw_text = unicode_text.tempSubStringBetween(start_pos, end_pos);
                 transliterator->transliterate(raw_text);
-                token = raw_text.toUTF8String(word);
+                raw_text.toUTF8String(word);
             } else if(locale == "th") {
                 UErrorCode errcode = U_ZERO_ERROR;
                 icu::UnicodeString src = unicode_text.tempSubStringBetween(start_pos, end_pos);
                 icu::UnicodeString dst;
                 nfkc->normalize(src, dst, errcode);
                 if(!U_FAILURE(errcode)) {
-                    token = dst.toUTF8String(word);
+                    dst.toUTF8String(word);
                 } else {
                     LOG(ERROR) << "Unicode error during parsing: " << errcode;
                 }
             } else {
-                token = unicode_text.tempSubStringBetween(start_pos, end_pos).toUTF8String(word);
+                unicode_text.tempSubStringBetween(start_pos, end_pos).toUTF8String(word);
             }
 
-            if(!token.empty()) {
-                if(token == " " || token == "," || token == "." || token == "!" || token == "?") {
|| token == "?") { - found_token = false; - } else if (!std::isalnum(token[0]) && is_ascii_char(token[0])) { - // ignore ascii symbols - found_token = false; - token_counter++; - } else if(locale == "ko" && token == "·") { - found_token = false; - token_counter++; - } else if(locale == "zh" && (token == "," || token == "─" || token == "。")) { - found_token = false; - token_counter++; - } else { + bool emit_token = false; - if(std::isalnum(token[0]) && is_ascii_char(token[0])) { - // normalize an ascii string - std::transform(token.begin(), token.end(), token.begin(), - [](unsigned char c){ return std::tolower(c); }); - } + // `word` can be either a multi-byte unicode sequence or an ASCII character + // ASCII character can be either a special character or English alphabet - found_token = true; - token_index = token_counter++; + if(is_ascii_char(word[0])) { + + if(std::isalnum(word[0])) { + // normalize an ascii string and emit word as token + std::transform(word.begin(), word.end(), word.begin(), + [](unsigned char c){ return std::tolower(c); }); + out += word; + emit_token = true; } - start_index = utf8_start_index; - end_index = utf8_start_index + token.size() - 1; - utf8_start_index = end_index + 1; + else { + // special character: + // a) present in `index_symbols` -> append word to out and continue iteration + // b) present in `separator_symbols` -> skip word + // c) not present in either -> skip word + if(index_symbols[uint8_t(word[0])] == 1) { + out += word; + emit_token = true; + } + } + + + } else { + if(locale == "zh" && (word == "," || word == "─" || word == "。")) { + emit_token = false; + } else if(locale == "ko" && word == "·") { + emit_token = false; + } else { + emit_token = true; + out += word; + } } + if(emit_token) { + token = out; + token_index = token_counter++; + out.clear(); + } + + start_index = utf8_start_index; + end_index = utf8_start_index + word.size() - 1; + utf8_start_index = end_index + 1; + start_pos = end_pos; end_pos = bi->next(); - if(found_token) { + if(emit_token) { return true; } } - return false; + token = out; + out.clear(); + start_index = utf8_start_index; + end_index = text.size() - 1; + + if(token.empty()) { + return false; + } + + token_index = token_counter++; + return true; } while(i < text.size()) { @@ -303,3 +328,9 @@ bool Tokenizer::is_cyrillic(const std::string& locale) { return locale == "el" || locale == "ru" || locale == "sr" || locale == "uk" || locale == "be"; } + +void Tokenizer::decr_token_counter() { + if(token_counter > 0) { + token_counter--; + } +} diff --git a/test/collection_locale_test.cpp b/test/collection_locale_test.cpp index 4940f2b9..6095a97d 100644 --- a/test/collection_locale_test.cpp +++ b/test/collection_locale_test.cpp @@ -220,6 +220,48 @@ TEST_F(CollectionLocaleTest, ThaiTextShouldBeNormalizedToNFKC) { ASSERT_EQ(1, results["found"].get()); } +TEST_F(CollectionLocaleTest, ThaiTextShouldRespectSeparators) { + nlohmann::json coll_json = R"({ + "name": "coll1", + "fields": [ + {"name": "title", "type": "string", "locale": "th"} + ] + })"_json; + + auto coll1 = collectionManager.create_collection(coll_json).get(); + + nlohmann::json doc; + doc["title"] = "alpha-beta-gamma"; + ASSERT_TRUE(coll1->add(doc.dump()).ok()); + + auto results = coll1->search("*",{}, "title:=alpha-beta-gamma", {}, {}, + {0}, 10, 1, FREQUENCY).get(); + + ASSERT_EQ(1, results["found"].get()); + + // now with `symbols_to_index` + coll_json = R"({ + "name": "coll2", + "symbols_to_index": ["-"], + "fields": [ + {"name": "title", "type": "string", 
"locale": "th"} + ] + })"_json; + + auto coll2 = collectionManager.create_collection(coll_json).get(); + ASSERT_TRUE(coll2->add(doc.dump()).ok()); + + results = coll2->search("*",{}, "title:=alpha-beta-gamma", {}, {}, + {0}, 10, 1, FREQUENCY).get(); + + ASSERT_EQ(1, results["found"].get()); + + results = coll2->search("*",{}, "title:=alphabetagamma", {}, {}, + {0}, 10, 1, FREQUENCY).get(); + + ASSERT_EQ(0, results["found"].get()); +} + TEST_F(CollectionLocaleTest, SearchThaiTextPreSegmentedQuery) { Collection *coll1; diff --git a/test/tokenizer_test.cpp b/test/tokenizer_test.cpp index 258a7a17..12dd6984 100644 --- a/test/tokenizer_test.cpp +++ b/test/tokenizer_test.cpp @@ -325,3 +325,35 @@ TEST(TokenizerTest, ShouldTokenizeLocaleTextWithEnglishText) { ASSERT_EQ("discrete", ttokens[7]); ASSERT_EQ("math", ttokens[8]); } + +TEST(TokenizerTest, ShouldTokenizeWithDifferentSymbolConfigs) { + std::string str1 = "ความ-เหลื่อมล้ำ"; + + // '-' in symbols_to_index: "ความ", "-", "เหลื่อม", "ล้ำ" + // '-' in separators: "ความ", "เหลื่อม", "ล้ำ" + // 'none: "ความ", "เหลื่อม", "ล้ำ" + + std::vector tokens; + Tokenizer(str1, true, false, "th", {'-'}, {}).tokenize(tokens); + ASSERT_EQ(4, tokens.size()); + ASSERT_EQ("ความ", tokens[0]); + ASSERT_EQ("-", tokens[1]); + ASSERT_EQ("เหลื่อม", tokens[2]); + ASSERT_EQ("ล้ํา", tokens[3]); + + tokens.clear(); + Tokenizer(str1, true, false, "th", {}, {'-'}).tokenize(tokens); + ASSERT_EQ(3, tokens.size()); + ASSERT_EQ("ความ", tokens[0]); + ASSERT_EQ("เหลื่อม", tokens[1]); + ASSERT_EQ("ล้ํา", tokens[2]); + + tokens.clear(); + Tokenizer(str1, true, false, "th", {}, {}).tokenize(tokens); + ASSERT_EQ(3, tokens.size()); + ASSERT_EQ("ความ", tokens[0]); + ASSERT_EQ("เหลื่อม", tokens[1]); + ASSERT_EQ("ล้ํา", tokens[2]); + + LOG(INFO) << "here"; +}