Handle special characters in locale tokenization.
This commit is contained in:
parent fdb5f64d0e
commit 57ac561743
@@ -31,9 +31,14 @@ private:
std::string locale;
icu::BreakIterator* bi = nullptr;
icu::UnicodeString unicode_text;

// tracks start of a text segment that can span multiple unicode tokens due to use of custom symbols
int32_t utf8_start_index = 0;

// tracks current unicode segment for text extraction
int32_t start_pos = 0;
int32_t end_pos = 0;
int32_t utf8_start_index = 0;

char* normalized_text = nullptr;

// non-deletable singletons
@@ -78,4 +83,6 @@ public:
static inline bool is_ascii_char(char c) {
return (c & ~0x7f) == 0;
}

void decr_token_counter();
};
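The `is_ascii_char` helper above relies on ASCII occupying the byte range 0x00-0x7F, while every byte of a multi-byte UTF-8 sequence has its high bit set. A minimal standalone sketch (hypothetical `main`, not part of this commit) exercising the same check:

#include <cassert>

static inline bool is_ascii_char(char c) {
    return (c & ~0x7f) == 0;          // zero once the low 7 bits are cleared -> plain ASCII byte
}

int main() {
    assert(is_ascii_char('a'));       // ASCII letter
    assert(is_ascii_char('-'));       // ASCII symbol such as a hyphen
    assert(!is_ascii_char('\xe0'));   // lead byte of a multi-byte UTF-8 sequence (e.g. Thai text)
    return 0;
}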
@@ -2521,6 +2521,7 @@ bool Collection::handle_highlight_text(std::string& text, bool normalise, const
if(is_cyrillic) {
bool found_token = word_tokenizer.tokenize(raw_token);
if(!found_token) {
tokenizer.decr_token_counter();
continue;
}
}
@@ -106,10 +106,7 @@ bool Tokenizer::next(std::string &token, size_t& token_index, size_t& start_inde
if(!locale.empty() && locale != "en") {
while (end_pos != icu::BreakIterator::DONE) {
//LOG(INFO) << "Position: " << start_pos;
bool found_token = false;

std::string word;
//LOG(INFO) << "token: " << token;

if(locale == "ko") {
UErrorCode errcode = U_ZERO_ERROR;
@@ -118,67 +115,95 @@ bool Tokenizer::next(std::string &token, size_t& token_index, size_t& start_inde
nfkd->normalize(src, dst, errcode);

if(!U_FAILURE(errcode)) {
token = dst.toUTF8String(word);
dst.toUTF8String(word);
} else {
LOG(ERROR) << "Unicode error during parsing: " << errcode;
}
} else if(normalize && is_cyrillic(locale)) {
auto raw_text = unicode_text.tempSubStringBetween(start_pos, end_pos);
transliterator->transliterate(raw_text);
token = raw_text.toUTF8String(word);
raw_text.toUTF8String(word);
} else if(locale == "th") {
UErrorCode errcode = U_ZERO_ERROR;
icu::UnicodeString src = unicode_text.tempSubStringBetween(start_pos, end_pos);
icu::UnicodeString dst;
nfkc->normalize(src, dst, errcode);
if(!U_FAILURE(errcode)) {
token = dst.toUTF8String(word);
dst.toUTF8String(word);
} else {
LOG(ERROR) << "Unicode error during parsing: " << errcode;
}
} else {
token = unicode_text.tempSubStringBetween(start_pos, end_pos).toUTF8String(word);
unicode_text.tempSubStringBetween(start_pos, end_pos).toUTF8String(word);
}

if(!token.empty()) {
if(token == " " || token == "," || token == "." || token == "!" || token == "?") {
found_token = false;
} else if (!std::isalnum(token[0]) && is_ascii_char(token[0])) {
// ignore ascii symbols
found_token = false;
token_counter++;
} else if(locale == "ko" && token == "·") {
found_token = false;
token_counter++;
} else if(locale == "zh" && (token == "," || token == "─" || token == "。")) {
found_token = false;
token_counter++;
} else {
bool emit_token = false;

if(std::isalnum(token[0]) && is_ascii_char(token[0])) {
// normalize an ascii string
std::transform(token.begin(), token.end(), token.begin(),
[](unsigned char c){ return std::tolower(c); });
}
// `word` can be either a multi-byte unicode sequence or an ASCII character
// ASCII character can be either a special character or English alphabet

found_token = true;
token_index = token_counter++;
if(is_ascii_char(word[0])) {

if(std::isalnum(word[0])) {
// normalize an ascii string and emit word as token
std::transform(word.begin(), word.end(), word.begin(),
[](unsigned char c){ return std::tolower(c); });
out += word;
emit_token = true;
}

start_index = utf8_start_index;
end_index = utf8_start_index + token.size() - 1;
utf8_start_index = end_index + 1;
else {
// special character:
// a) present in `index_symbols` -> append word to out and continue iteration
// b) present in `separator_symbols` -> skip word
// c) not present in either -> skip word
if(index_symbols[uint8_t(word[0])] == 1) {
out += word;
emit_token = true;
}
}

} else {
if(locale == "zh" && (word == "," || word == "─" || word == "。")) {
emit_token = false;
} else if(locale == "ko" && word == "·") {
emit_token = false;
} else {
emit_token = true;
out += word;
}
}

if(emit_token) {
token = out;
token_index = token_counter++;
out.clear();
}

start_index = utf8_start_index;
end_index = utf8_start_index + word.size() - 1;
utf8_start_index = end_index + 1;

start_pos = end_pos;
end_pos = bi->next();

if(found_token) {
if(emit_token) {
return true;
}
}

return false;
token = out;
out.clear();
start_index = utf8_start_index;
end_index = text.size() - 1;

if(token.empty()) {
return false;
}

token_index = token_counter++;
return true;
}

while(i < text.size()) {
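The comments in this hunk describe a three-way decision for an ASCII special character: indexed, treated as a separator, or dropped. A minimal sketch of that decision in isolation, assuming 256-entry byte tables named after the `index_symbols` lookup used above (the names, types, and enum here are illustrative, not the actual Tokenizer members):

#include <array>
#include <cstdint>

enum class SymbolAction { Index, Separate, Drop };

// Illustrative classification for a single ASCII special character, following
// the a) / b) / c) cases in the comments above: only bytes flagged in
// index_symbols are kept as part of the emitted token; everything else is skipped.
SymbolAction classify_symbol(char c,
                             const std::array<uint8_t, 256>& index_symbols,
                             const std::array<uint8_t, 256>& separator_symbols) {
    if(index_symbols[uint8_t(c)] == 1) {
        return SymbolAction::Index;     // a) append to the token being built
    }
    if(separator_symbols[uint8_t(c)] == 1) {
        return SymbolAction::Separate;  // b) token boundary, the character itself is skipped
    }
    return SymbolAction::Drop;          // c) unknown symbol, also skipped
}

In the committed code, cases b) and c) collapse into the same behavior (the word is skipped), which is why only the `index_symbols` table is consulted there.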
@@ -303,3 +328,9 @@ bool Tokenizer::is_cyrillic(const std::string& locale) {
return locale == "el" ||
locale == "ru" || locale == "sr" || locale == "uk" || locale == "be";
}

void Tokenizer::decr_token_counter() {
if(token_counter > 0) {
token_counter--;
}
}
@@ -220,6 +220,48 @@ TEST_F(CollectionLocaleTest, ThaiTextShouldBeNormalizedToNFKC) {
ASSERT_EQ(1, results["found"].get<size_t>());
}

TEST_F(CollectionLocaleTest, ThaiTextShouldRespectSeparators) {
nlohmann::json coll_json = R"({
"name": "coll1",
"fields": [
{"name": "title", "type": "string", "locale": "th"}
]
})"_json;

auto coll1 = collectionManager.create_collection(coll_json).get();

nlohmann::json doc;
doc["title"] = "alpha-beta-gamma";
ASSERT_TRUE(coll1->add(doc.dump()).ok());

auto results = coll1->search("*",{}, "title:=alpha-beta-gamma", {}, {},
{0}, 10, 1, FREQUENCY).get();

ASSERT_EQ(1, results["found"].get<size_t>());

// now with `symbols_to_index`
coll_json = R"({
"name": "coll2",
"symbols_to_index": ["-"],
"fields": [
{"name": "title", "type": "string", "locale": "th"}
]
})"_json;

auto coll2 = collectionManager.create_collection(coll_json).get();
ASSERT_TRUE(coll2->add(doc.dump()).ok());

results = coll2->search("*",{}, "title:=alpha-beta-gamma", {}, {},
{0}, 10, 1, FREQUENCY).get();

ASSERT_EQ(1, results["found"].get<size_t>());

results = coll2->search("*",{}, "title:=alphabetagamma", {}, {},
{0}, 10, 1, FREQUENCY).get();

ASSERT_EQ(0, results["found"].get<size_t>());
}

TEST_F(CollectionLocaleTest, SearchThaiTextPreSegmentedQuery) {
Collection *coll1;
@@ -325,3 +325,35 @@ TEST(TokenizerTest, ShouldTokenizeLocaleTextWithEnglishText) {
ASSERT_EQ("discrete", ttokens[7]);
ASSERT_EQ("math", ttokens[8]);
}

TEST(TokenizerTest, ShouldTokenizeWithDifferentSymbolConfigs) {
std::string str1 = "ความ-เหลื่อมล้ำ";

// '-' in symbols_to_index: "ความ", "-", "เหลื่อม", "ล้ำ"
// '-' in separators: "ความ", "เหลื่อม", "ล้ำ"
// none: "ความ", "เหลื่อม", "ล้ำ"

std::vector<std::string> tokens;
Tokenizer(str1, true, false, "th", {'-'}, {}).tokenize(tokens);
ASSERT_EQ(4, tokens.size());
ASSERT_EQ("ความ", tokens[0]);
ASSERT_EQ("-", tokens[1]);
ASSERT_EQ("เหลื่อม", tokens[2]);
ASSERT_EQ("ล้ํา", tokens[3]);

tokens.clear();
Tokenizer(str1, true, false, "th", {}, {'-'}).tokenize(tokens);
ASSERT_EQ(3, tokens.size());
ASSERT_EQ("ความ", tokens[0]);
ASSERT_EQ("เหลื่อม", tokens[1]);
ASSERT_EQ("ล้ํา", tokens[2]);

tokens.clear();
Tokenizer(str1, true, false, "th", {}, {}).tokenize(tokens);
ASSERT_EQ(3, tokens.size());
ASSERT_EQ("ความ", tokens[0]);
ASSERT_EQ("เหลื่อม", tokens[1]);
ASSERT_EQ("ล้ํา", tokens[2]);

LOG(INFO) << "here";
}