Handle special characters in locale tokenization.

Kishore Nallan 2022-08-18 10:47:30 +05:30
parent fdb5f64d0e
commit 57ac561743
5 changed files with 147 additions and 34 deletions

View File

@@ -31,9 +31,14 @@ private:
std::string locale;
icu::BreakIterator* bi = nullptr;
icu::UnicodeString unicode_text;
// tracks start of a text segment that can span multiple unicode tokens due to use of custom symbols
int32_t utf8_start_index = 0;
// tracks current unicode segment for text extraction
int32_t start_pos = 0;
int32_t end_pos = 0;
int32_t utf8_start_index = 0;
char* normalized_text = nullptr;
// non-deletable singletons
@@ -78,4 +83,6 @@ public:
static inline bool is_ascii_char(char c) {
return (c & ~0x7f) == 0;
}
void decr_token_counter();
};
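
For reference, `is_ascii_char` works because ASCII code points use only the low seven bits, so masking those bits away leaves zero exactly when the byte is ASCII. A minimal standalone sketch of the same check (the free function and test values are illustrative, not part of the commit):

#include <cassert>

// Same test as Tokenizer::is_ascii_char above: a byte is ASCII iff its
// high bit is clear, so masking off the low 7 bits must leave nothing.
static bool is_ascii_char(char c) {
    return (c & ~0x7f) == 0;
}

int main() {
    assert(is_ascii_char('a'));      // 0x61, plain ASCII letter
    assert(is_ascii_char('-'));      // 0x2D, ASCII symbol
    assert(!is_ascii_char('\xE0'));  // lead byte of a 3-byte UTF-8 sequence (e.g. Thai)
    return 0;
}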

View File

@@ -2521,6 +2521,7 @@ bool Collection::handle_highlight_text(std::string& text, bool normalise, const
if(is_cyrillic) {
bool found_token = word_tokenizer.tokenize(raw_token);
if(!found_token) {
tokenizer.decr_token_counter();
continue;
}
}
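
For context, `decr_token_counter()` (added in this commit, shown in the tokenizer diff below) rolls the counter back when a Cyrillic raw token yields no usable token during highlighting: the tokenizer has already handed out an index for it, and returning that index keeps the indices of the following tokens contiguous with the positions recorded at index time. A hypothetical walk-through, assuming the counter semantics shown in the tokenizer diff:

#include <cstddef>
#include <iostream>
#include <string>
#include <utility>
#include <vector>

// Hypothetical walk-through of the rollback: each candidate token consumes an
// index; a rejected token hands its index back so the next accepted token
// stays aligned with the positions recorded at index time.
int main() {
    std::vector<std::pair<std::string, bool>> raw_tokens = {
        {"привет", true}, {"##", false}, {"мир", true}};
    size_t token_counter = 0;
    for (const auto& [tok, found_token] : raw_tokens) {
        size_t token_index = token_counter++;  // assigned by Tokenizer::next()
        if (!found_token) {
            if (token_counter > 0) token_counter--;  // decr_token_counter()
            continue;
        }
        std::cout << tok << " -> " << token_index << "\n";  // привет -> 0, мир -> 1
    }
    return 0;
}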

View File

@@ -106,10 +106,7 @@ bool Tokenizer::next(std::string &token, size_t& token_index, size_t& start_inde
if(!locale.empty() && locale != "en") {
while (end_pos != icu::BreakIterator::DONE) {
//LOG(INFO) << "Position: " << start_pos;
bool found_token = false;
std::string word;
//LOG(INFO) << "token: " << token;
if(locale == "ko") {
UErrorCode errcode = U_ZERO_ERROR;
@@ -118,67 +115,95 @@ bool Tokenizer::next(std::string &token, size_t& token_index, size_t& start_inde
nfkd->normalize(src, dst, errcode);
if(!U_FAILURE(errcode)) {
token = dst.toUTF8String(word);
dst.toUTF8String(word);
} else {
LOG(ERROR) << "Unicode error during parsing: " << errcode;
}
} else if(normalize && is_cyrillic(locale)) {
auto raw_text = unicode_text.tempSubStringBetween(start_pos, end_pos);
transliterator->transliterate(raw_text);
token = raw_text.toUTF8String(word);
raw_text.toUTF8String(word);
} else if(locale == "th") {
UErrorCode errcode = U_ZERO_ERROR;
icu::UnicodeString src = unicode_text.tempSubStringBetween(start_pos, end_pos);
icu::UnicodeString dst;
nfkc->normalize(src, dst, errcode);
if(!U_FAILURE(errcode)) {
token = dst.toUTF8String(word);
dst.toUTF8String(word);
} else {
LOG(ERROR) << "Unicode error during parsing: " << errcode;
}
} else {
token = unicode_text.tempSubStringBetween(start_pos, end_pos).toUTF8String(word);
unicode_text.tempSubStringBetween(start_pos, end_pos).toUTF8String(word);
}
if(!token.empty()) {
if(token == " " || token == "," || token == "." || token == "!" || token == "?") {
found_token = false;
} else if (!std::isalnum(token[0]) && is_ascii_char(token[0])) {
// ignore ascii symbols
found_token = false;
token_counter++;
} else if(locale == "ko" && token == "·") {
found_token = false;
token_counter++;
} else if(locale == "zh" && (token == "" || token == "" || token == "")) {
found_token = false;
token_counter++;
} else {
bool emit_token = false;
if(std::isalnum(token[0]) && is_ascii_char(token[0])) {
// normalize an ascii string
std::transform(token.begin(), token.end(), token.begin(),
[](unsigned char c){ return std::tolower(c); });
}
// `word` can be either a multi-byte unicode sequence or an ASCII character
// ASCII character can be either a special character or English alphabet
found_token = true;
token_index = token_counter++;
if(is_ascii_char(word[0])) {
if(std::isalnum(word[0])) {
// normalize an ascii string and emit word as token
std::transform(word.begin(), word.end(), word.begin(),
[](unsigned char c){ return std::tolower(c); });
out += word;
emit_token = true;
}
start_index = utf8_start_index;
end_index = utf8_start_index + token.size() - 1;
utf8_start_index = end_index + 1;
else {
// special character:
// a) present in `index_symbols` -> append word to out and continue iteration
// b) present in `separator_symbols` -> skip word
// c) not present in either -> skip word
if(index_symbols[uint8_t(word[0])] == 1) {
out += word;
emit_token = true;
}
}
} else {
if(locale == "zh" && (word == "" || word == "" || word == "")) {
emit_token = false;
} else if(locale == "ko" && word == "·") {
emit_token = false;
} else {
emit_token = true;
out += word;
}
}
if(emit_token) {
token = out;
token_index = token_counter++;
out.clear();
}
start_index = utf8_start_index;
end_index = utf8_start_index + word.size() - 1;
utf8_start_index = end_index + 1;
start_pos = end_pos;
end_pos = bi->next();
if(found_token) {
if(emit_token) {
return true;
}
}
return false;
token = out;
out.clear();
start_index = utf8_start_index;
end_index = text.size() - 1;
if(token.empty()) {
return false;
}
token_index = token_counter++;
return true;
}
while(i < text.size()) {
@@ -303,3 +328,9 @@ bool Tokenizer::is_cyrillic(const std::string& locale) {
return locale == "el" ||
locale == "ru" || locale == "sr" || locale == "uk" || locale == "be";
}
void Tokenizer::decr_token_counter() {
if(token_counter > 0) {
token_counter--;
}
}
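
Putting the new control flow together: every ICU break segment is copied into `word`; ASCII alphanumerics are lowercased and emitted, ASCII symbols are emitted only when flagged in `index_symbols` (case a), and all other ASCII symbols are skipped (cases b and c), while multi-byte segments are emitted unless they are the locale-specific punctuation checked above. A simplified, self-contained sketch of that decision (the `emit_tokens` helper is hypothetical; the zh/ko punctuation checks and the `out`/`utf8_start_index` bookkeeping are omitted):

#include <algorithm>
#include <array>
#include <cctype>
#include <cstdint>
#include <string>
#include <vector>

// Hypothetical condensation of the per-segment decision in Tokenizer::next().
// `segments` stands in for the pieces produced by the ICU BreakIterator and
// `index_symbols` for the 256-entry table built from `symbols_to_index`.
std::vector<std::string> emit_tokens(const std::vector<std::string>& segments,
                                     const std::array<std::uint8_t, 256>& index_symbols) {
    std::vector<std::string> tokens;
    for (std::string word : segments) {
        if (word.empty()) {
            continue;
        }
        bool emit_token = false;
        if ((word[0] & ~0x7f) == 0) {  // ASCII segment
            if (std::isalnum(static_cast<unsigned char>(word[0]))) {
                // alphanumeric: lowercase and emit
                std::transform(word.begin(), word.end(), word.begin(),
                               [](unsigned char c) { return std::tolower(c); });
                emit_token = true;
            } else if (index_symbols[static_cast<std::uint8_t>(word[0])] == 1) {
                emit_token = true;  // case (a): indexed symbol becomes a token
            }                       // cases (b)/(c): separator or unknown symbol, skip
        } else {
            emit_token = true;      // multi-byte unicode segment (zh/ko checks omitted)
        }
        if (emit_token) {
            tokens.push_back(word);
        }
    }
    return tokens;
}

With `index_symbols['-'] == 1`, the segments {"alpha", "-", "beta"} come back unchanged; without it the hyphen disappears, which matches the expectations in the new tests below.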

View File

@@ -220,6 +220,48 @@ TEST_F(CollectionLocaleTest, ThaiTextShouldBeNormalizedToNFKC) {
ASSERT_EQ(1, results["found"].get<size_t>());
}
TEST_F(CollectionLocaleTest, ThaiTextShouldRespectSeparators) {
nlohmann::json coll_json = R"({
"name": "coll1",
"fields": [
{"name": "title", "type": "string", "locale": "th"}
]
})"_json;
auto coll1 = collectionManager.create_collection(coll_json).get();
nlohmann::json doc;
doc["title"] = "alpha-beta-gamma";
ASSERT_TRUE(coll1->add(doc.dump()).ok());
auto results = coll1->search("*",{}, "title:=alpha-beta-gamma", {}, {},
{0}, 10, 1, FREQUENCY).get();
ASSERT_EQ(1, results["found"].get<size_t>());
// now with `symbols_to_index`
coll_json = R"({
"name": "coll2",
"symbols_to_index": ["-"],
"fields": [
{"name": "title", "type": "string", "locale": "th"}
]
})"_json;
auto coll2 = collectionManager.create_collection(coll_json).get();
ASSERT_TRUE(coll2->add(doc.dump()).ok());
results = coll2->search("*",{}, "title:=alpha-beta-gamma", {}, {},
{0}, 10, 1, FREQUENCY).get();
ASSERT_EQ(1, results["found"].get<size_t>());
results = coll2->search("*",{}, "title:=alphabetagamma", {}, {},
{0}, 10, 1, FREQUENCY).get();
ASSERT_EQ(0, results["found"].get<size_t>());
}
TEST_F(CollectionLocaleTest, SearchThaiTextPreSegmentedQuery) {
Collection *coll1;

View File

@@ -325,3 +325,35 @@ TEST(TokenizerTest, ShouldTokenizeLocaleTextWithEnglishText) {
ASSERT_EQ("discrete", ttokens[7]);
ASSERT_EQ("math", ttokens[8]);
}
TEST(TokenizerTest, ShouldTokenizeWithDifferentSymbolConfigs) {
std::string str1 = "ความ-เหลื่อมล้ำ";
// '-' in symbols_to_index: "ความ", "-", "เหลื่อม", "ล้ำ"
// '-' in separators: "ความ", "เหลื่อม", "ล้ำ"
// none: "ความ", "เหลื่อม", "ล้ำ"
std::vector<std::string> tokens;
Tokenizer(str1, true, false, "th", {'-'}, {}).tokenize(tokens);
ASSERT_EQ(4, tokens.size());
ASSERT_EQ("ความ", tokens[0]);
ASSERT_EQ("-", tokens[1]);
ASSERT_EQ("เหลื่อม", tokens[2]);
ASSERT_EQ("ล้ํา", tokens[3]);
tokens.clear();
Tokenizer(str1, true, false, "th", {}, {'-'}).tokenize(tokens);
ASSERT_EQ(3, tokens.size());
ASSERT_EQ("ความ", tokens[0]);
ASSERT_EQ("เหลื่อม", tokens[1]);
ASSERT_EQ("ล้ํา", tokens[2]);
tokens.clear();
Tokenizer(str1, true, false, "th", {}, {}).tokenize(tokens);
ASSERT_EQ(3, tokens.size());
ASSERT_EQ("ความ", tokens[0]);
ASSERT_EQ("เหลื่อม", tokens[1]);
ASSERT_EQ("ล้ํา", tokens[2]);
LOG(INFO) << "here";
}
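
A side note on the expected strings: the assertions use "ล้ํา" (four code points) rather than the input's "ล้ำ" because NFKC decomposes THAI SARA AM (U+0E33) into NIKHAHIT (U+0E4D) + SARA AA (U+0E32). A minimal ICU sketch of that normalization, mirroring the `nfkc->normalize` call in the tokenizer diff above:

#include <unicode/normalizer2.h>
#include <unicode/unistr.h>
#include <iostream>
#include <string>

int main() {
    UErrorCode errcode = U_ZERO_ERROR;
    const icu::Normalizer2* nfkc = icu::Normalizer2::getNFKCInstance(errcode);
    icu::UnicodeString src = icu::UnicodeString::fromUTF8("ล้ำ");  // ends in U+0E33
    icu::UnicodeString dst;
    nfkc->normalize(src, dst, errcode);
    if (U_FAILURE(errcode)) {
        std::cerr << "Unicode error during normalization: " << errcode << "\n";
        return 1;
    }
    std::string word;
    dst.toUTF8String(word);
    std::cout << word << "\n";  // prints the decomposed form "ล้ํา" (U+0E4D U+0E32)
    return 0;
}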