mirror of
https://github.com/typesense/typesense.git
synced 2025-05-21 22:33:27 +08:00
Handle special characters within non-English locales.
Unless a special character is present in the symbols to index or the token separators, it will be skipped.
This commit is contained in:
parent
48df1e70e8
commit
fc80cc3a72
@ -160,13 +160,25 @@ bool Tokenizer::next(std::string &token, size_t& token_index, size_t& start_inde
|
||||
|
||||
// `word` can be either a multi-byte unicode sequence or an ASCII character
|
||||
// ASCII character can be either a special character or English alphabet
|
||||
size_t orig_word_size = word.size();
|
||||
|
||||
if(is_ascii_char(word[0])) {
|
||||
|
||||
if(std::isalnum(word[0])) {
|
||||
// normalize an ascii string and emit word as token
|
||||
std::transform(word.begin(), word.end(), word.begin(),
|
||||
[](unsigned char c){ return std::tolower(c); });
|
||||
size_t read_index = 0, write_index = 0;
|
||||
|
||||
while (read_index < word.size()) {
|
||||
size_t this_stream_mode = get_stream_mode(word[read_index]);
|
||||
if(this_stream_mode != SKIP) {
|
||||
word[write_index++] = std::tolower(word[read_index]);
|
||||
}
|
||||
|
||||
read_index++;
|
||||
}
|
||||
|
||||
// resize to fit new length
|
||||
word.resize(write_index);
|
||||
|
||||
out += word;
|
||||
emit_token = true;
|
||||
}
|
||||
@ -181,8 +193,6 @@ bool Tokenizer::next(std::string &token, size_t& token_index, size_t& start_inde
|
||||
emit_token = true;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
} else {
|
||||
if(locale == "zh" && (word == "," || word == "─" || word == "。")) {
|
||||
emit_token = false;
|
||||
@ -201,7 +211,7 @@ bool Tokenizer::next(std::string &token, size_t& token_index, size_t& start_inde
|
||||
}
|
||||
|
||||
start_index = utf8_start_index;
|
||||
end_index = utf8_start_index + word.size() - 1;
|
||||
end_index = utf8_start_index + orig_word_size - 1;
|
||||
utf8_start_index = end_index + 1;
|
||||
|
||||
start_pos = end_pos;
|
||||
|
@ -905,6 +905,29 @@ TEST_F(CollectionLocaleTest, SearchInGermanLocaleShouldBeTypoTolerant) {
|
||||
ASSERT_EQ(1, results["found"].get<size_t>());
|
||||
}
|
||||
|
||||
// Checks that a query containing a special character ("_") still finds the document
// when the first field in the query_by list is declared with the Thai ("th") locale.
TEST_F(CollectionLocaleTest, HandleSpecialCharsInThai) {
|
||||
nlohmann::json coll_json = R"({
|
||||
"name": "coll1",
|
||||
"fields": [
|
||||
{"name": "title_th", "type": "string", "locale": "th"},
|
||||
{"name": "sku", "type": "string"}
|
||||
]
|
||||
})"_json;
|
||||
|
||||
auto coll1 = collectionManager.create_collection(coll_json).get();
|
||||
|
||||
// Single document: Thai text in the "th"-locale field, and a value ending in an
// underscore in the field that declares no locale.
nlohmann::json doc;
|
||||
doc["title_th"] = "สวัสดี";
|
||||
doc["sku"] = "12345_";
|
||||
ASSERT_TRUE(coll1->add(doc.dump()).ok());
|
||||
|
||||
// query string is parsed using the locale of the first field in the query_by list
// (here `title_th`, so the "th" locale applies to "12345_"; the match itself is
// presumably expected on `sku`, the only field holding that value).
|
||||
auto results = coll1->search("12345_", {"title_th", "sku"}, "", {}, {},
|
||||
{2, 0}, 10, 1, FREQUENCY, {true, false}, 1).get();
|
||||
|
||||
// The one indexed document must be reported as found.
ASSERT_EQ(1, results["found"].get<size_t>());
|
||||
}
|
||||
|
||||
/*
|
||||
TEST_F(CollectionLocaleTest, TranslitPad) {
|
||||
UErrorCode translit_status = U_ZERO_ERROR;
|
||||
|
@ -254,6 +254,12 @@ TEST(TokenizerTest, ShouldTokenizeLocaleText) {
|
||||
ASSERT_EQ("31", tokens[3]);
|
||||
ASSERT_EQ("มี.ค", tokens[4]);
|
||||
|
||||
tokens.clear();
|
||||
str = "12345_678";
|
||||
Tokenizer(str, false, false, "th").tokenize(tokens);
|
||||
ASSERT_EQ(1, tokens.size());
|
||||
ASSERT_EQ("12345678", tokens[0]);
|
||||
|
||||
tokens.clear();
|
||||
Tokenizer("Odd Thomas", false, false, "en").tokenize(tokens);
|
||||
ASSERT_EQ(2, tokens.size());
|
||||
|
Loading…
x
Reference in New Issue
Block a user