Handle special characters within non-English locales.

Unless a special character is present in the symbols-to-index or separators list, it will be skipped during tokenization.
This commit is contained in:
Kishore Nallan 2024-02-08 16:52:25 +05:30
parent 48df1e70e8
commit fc80cc3a72
3 changed files with 45 additions and 6 deletions

View File

@ -160,13 +160,25 @@ bool Tokenizer::next(std::string &token, size_t& token_index, size_t& start_inde
// `word` can be either a multi-byte unicode sequence or an ASCII character
// ASCII character can be either a special character or English alphabet
size_t orig_word_size = word.size();
if(is_ascii_char(word[0])) {
if(std::isalnum(word[0])) {
// normalize an ascii string and emit word as token
std::transform(word.begin(), word.end(), word.begin(),
[](unsigned char c){ return std::tolower(c); });
size_t read_index = 0, write_index = 0;
while (read_index < word.size()) {
size_t this_stream_mode = get_stream_mode(word[read_index]);
if(this_stream_mode != SKIP) {
word[write_index++] = std::tolower(word[read_index]);
}
read_index++;
}
// resize to fit new length
word.resize(write_index);
out += word;
emit_token = true;
}
@ -181,8 +193,6 @@ bool Tokenizer::next(std::string &token, size_t& token_index, size_t& start_inde
emit_token = true;
}
}
} else {
if(locale == "zh" && (word == "" || word == "" || word == "")) {
emit_token = false;
@ -201,7 +211,7 @@ bool Tokenizer::next(std::string &token, size_t& token_index, size_t& start_inde
}
start_index = utf8_start_index;
end_index = utf8_start_index + word.size() - 1;
end_index = utf8_start_index + orig_word_size - 1;
utf8_start_index = end_index + 1;
start_pos = end_pos;

View File

@ -905,6 +905,29 @@ TEST_F(CollectionLocaleTest, SearchInGermanLocaleShouldBeTypoTolerant) {
ASSERT_EQ(1, results["found"].get<size_t>());
}
// Regression test: a value containing a special character ("_") must remain
// searchable when the query string is parsed with a non-English (Thai) locale.
// Per this commit, a special character not present in symbols-to-index /
// separators is skipped, so "12345_" should match the indexed "12345_" sku.
TEST_F(CollectionLocaleTest, HandleSpecialCharsInThai) {
nlohmann::json coll_json = R"({
"name": "coll1",
"fields": [
{"name": "title_th", "type": "string", "locale": "th"},
{"name": "sku", "type": "string"}
]
})"_json;
auto coll1 = collectionManager.create_collection(coll_json).get();
nlohmann::json doc;
doc["title_th"] = "สวัสดี";
// sku deliberately ends with "_", a character outside symbols-to-index/separators
doc["sku"] = "12345_";
ASSERT_TRUE(coll1->add(doc.dump()).ok());
// query string is parsed using the locale of the first field in the query_by list
auto results = coll1->search("12345_", {"title_th", "sku"}, "", {}, {},
{2, 0}, 10, 1, FREQUENCY, {true, false}, 1).get();
ASSERT_EQ(1, results["found"].get<size_t>());
}
/*
TEST_F(CollectionLocaleTest, TranslitPad) {
UErrorCode translit_status = U_ZERO_ERROR;

View File

@ -254,6 +254,12 @@ TEST(TokenizerTest, ShouldTokenizeLocaleText) {
ASSERT_EQ("31", tokens[3]);
ASSERT_EQ("มี.ค", tokens[4]);
tokens.clear();
str = "12345_678";
Tokenizer(str, false, false, "th").tokenize(tokens);
ASSERT_EQ(1, tokens.size());
ASSERT_EQ("12345678", tokens[0]);
tokens.clear();
Tokenizer("Odd Thomas", false, false, "en").tokenize(tokens);
ASSERT_EQ(2, tokens.size());