Mirror of https://github.com/typesense/typesense.git (synced 2025-05-18 12:42:50 +08:00)
Do word-level tokenization for the th locale.

This ensures that we handle changes in text length due to NFKC normalization.
parent 729c73bb6d
commit 70e6a89ea8
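For context, the length change referenced in the commit message can be reproduced with ICU directly. Below is a minimal standalone sketch, not part of this commit, that only assumes ICU4C's Normalizer2 API (the same library the tokenizer uses): NFKC decomposes THAI CHARACTER SARA AM (U+0E33) into NIKHAHIT + SARA AA, so the 9-byte UTF-8 string "น้ำ" becomes 12 bytes after normalization, and byte offsets computed against one form no longer line up with the other.

#include <unicode/normalizer2.h>
#include <unicode/unistr.h>
#include <iostream>
#include <string>

int main() {
    UErrorCode status = U_ZERO_ERROR;
    const icu::Normalizer2* nfkc = icu::Normalizer2::getNFKCInstance(status);

    std::string original = "น้ำ";   // 9 bytes of UTF-8 (3 code points)
    icu::UnicodeString src = icu::UnicodeString::fromUTF8(original);
    icu::UnicodeString dst = nfkc->normalize(src, status);

    std::string normalized;
    dst.toUTF8String(normalized);

    // Prints "9 -> 12": SARA AM (U+0E33) decomposes into NIKHAHIT + SARA AA,
    // so the normalized text is longer than the original.
    std::cout << original.size() << " -> " << normalized.size() << std::endl;
    return 0;
}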
@@ -222,7 +222,7 @@ private:
     bool handle_highlight_text(std::string& text, bool normalise, const field &search_field,
                                const std::vector<char>& symbols_to_index, const std::vector<char>& token_separators,
-                               highlight_t& highlight, StringUtils & string_utils, bool is_cyrillic,
+                               highlight_t& highlight, StringUtils & string_utils, bool use_word_tokenizer,
                                const size_t highlight_affix_num_tokens,
                                const tsl::htrie_map<char, token_leaf>& qtoken_leaves,
                                int last_valid_offset_index, const Match& match,
@@ -2262,8 +2262,8 @@ void Collection::highlight_result(const std::string& raw_query, const field &search_field,
     tsl::htrie_set<char> matched_tokens;

-    bool is_cyrillic = Tokenizer::is_cyrillic(search_field.locale);
-    bool normalise = is_cyrillic ? false : true;
+    bool use_word_tokenizer = search_field.locale == "th" || Tokenizer::is_cyrillic(search_field.locale);
+    bool normalise = !use_word_tokenizer;

     std::vector<std::string> raw_query_tokens;
     Tokenizer(raw_query, normalise, false, search_field.locale, symbols_to_index, token_separators).tokenize(raw_query_tokens);
@@ -2348,14 +2348,14 @@ void Collection::highlight_result(const std::string& raw_query, const field &search_field,
             std::string text = h_obj.get<std::string>();
             handle_highlight_text(text, normalise, search_field, symbols_to_index,
-                                  token_separators, array_highlight, string_utils, is_cyrillic,
-                                  highlight_affix_num_tokens,
-                                  qtoken_leaves, last_valid_offset_index, match,
-                                  prefix_token_num_chars,
-                                  highlight_fully, snippet_threshold, is_infix_search,
-                                  raw_query_tokens,
-                                  last_valid_offset, highlight_start_tag, highlight_end_tag,
-                                  index_symbols, match_index);
+                                  token_separators, array_highlight, string_utils, use_word_tokenizer,
+                                  highlight_affix_num_tokens,
+                                  qtoken_leaves, last_valid_offset_index, match,
+                                  prefix_token_num_chars,
+                                  highlight_fully, snippet_threshold, is_infix_search,
+                                  raw_query_tokens,
+                                  last_valid_offset, highlight_start_tag, highlight_end_tag,
+                                  index_symbols, match_index);

             if(!array_highlight.snippets.empty()) {
                 h_obj = array_highlight.snippets[0];
@@ -2458,7 +2458,7 @@ void Collection::highlight_result(const std::string& raw_query, const field &search_field,
     }

     handle_highlight_text(text, normalise, search_field, symbols_to_index, token_separators,
-                          highlight, string_utils, is_cyrillic, highlight_affix_num_tokens,
+                          highlight, string_utils, use_word_tokenizer, highlight_affix_num_tokens,
                           qtoken_leaves, last_valid_offset_index, match, prefix_token_num_chars,
                           highlight_fully, snippet_threshold, is_infix_search, raw_query_tokens,
                           last_valid_offset, highlight_start_tag, highlight_end_tag,
@@ -2543,7 +2543,7 @@ void Collection::highlight_result(const std::string& raw_query, const field &search_field,
 bool Collection::handle_highlight_text(std::string& text, bool normalise, const field &search_field,
                                        const std::vector<char>& symbols_to_index, const std::vector<char>& token_separators,
-                                       highlight_t& highlight, StringUtils & string_utils, bool is_cyrillic,
+                                       highlight_t& highlight, StringUtils & string_utils, bool use_word_tokenizer,
                                        const size_t highlight_affix_num_tokens,
                                        const tsl::htrie_map<char, token_leaf>& qtoken_leaves,
                                        int last_valid_offset_index, const Match& match,
@@ -2580,7 +2580,7 @@ bool Collection::handle_highlight_text(std::string& text, bool normalise, const field &search_field,
     bool found_first_match = false;

     while(tokenizer.next(raw_token, raw_token_index, tok_start, tok_end)) {
-        if(is_cyrillic) {
+        if(use_word_tokenizer) {
             bool found_token = word_tokenizer.tokenize(raw_token);
             if(!found_token) {
                 tokenizer.decr_token_counter();
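The hunk above switches the word-tokenizer branch from a Cyrillic-only check to the new use_word_tokenizer flag, so Thai highlighting now goes through locale-aware word segmentation as well. As a rough illustration of what word-level segmentation means for spaceless Thai text, here is a minimal sketch using ICU's BreakIterator; this is an assumption for illustration only and not the project's own word tokenizer class:

#include <unicode/brkiter.h>
#include <unicode/locid.h>
#include <unicode/unistr.h>
#include <iostream>
#include <memory>
#include <string>

int main() {
    UErrorCode status = U_ZERO_ERROR;
    // Thai has no spaces between words, so word boundaries come from a
    // locale-aware, dictionary-based break iterator.
    std::unique_ptr<icu::BreakIterator> it(
        icu::BreakIterator::createWordInstance(icu::Locale("th"), status));

    icu::UnicodeString text = icu::UnicodeString::fromUTF8("จิ้งจอกสีน้ำตาลด่วน");
    it->setText(text);

    int32_t start = it->first();
    for(int32_t end = it->next(); end != icu::BreakIterator::DONE; start = end, end = it->next()) {
        std::string word;
        text.tempSubStringBetween(start, end).toUTF8String(word);
        std::cout << word << std::endl;   // e.g. จิ้งจอก, สี, น้ำตาล, ด่วน
    }
    return 0;
}

The TokenizerTest change at the end of this diff asserts the same kind of segmentation: four tokens, beginning with จิ้งจอก and สี.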
@@ -2744,7 +2744,12 @@ void Collection::highlight_text(const string& highlight_start_tag, const string& highlight_end_tag,
         auto end_offset = offset_it->second;

         // if a token ends with one or more punctuation chars, we should not highlight them
-        for(int j = end_offset; j > 0; j--) {
+        for(int j = end_offset; j >= 0; j--) {
+            if(end_offset >= text.size()) {
+                // this should not happen unless we mess up unicode normalization
+                break;
+            }
+
             if(!std::isalnum(text[j]) && Tokenizer::is_ascii_char(text[j]) &&
                index_symbols[uint8_t(text[j])] != 1) {
                 end_offset--;
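The added guard above bails out of the punctuation-trimming loop when a token's recorded end offset no longer fits inside the text being highlighted, which can happen once normalization has changed the text length. A hypothetical standalone version of the same pattern, for illustration only (trim_trailing_punctuation is not a function in the codebase):

#include <cctype>
#include <cstddef>
#include <iostream>
#include <string>

// Hypothetical helper (illustration only): trim trailing ASCII punctuation
// from a highlight span. Returns the adjusted end offset, or the original
// offset unchanged if it already falls outside the text, which can happen
// when normalization changed the text length after offsets were computed.
size_t trim_trailing_punctuation(const std::string& text, size_t end_offset) {
    if(end_offset >= text.size()) {
        return end_offset;  // offsets no longer line up with this text; bail out
    }

    while(end_offset > 0) {
        unsigned char c = static_cast<unsigned char>(text[end_offset]);
        if(c < 128 && !std::isalnum(c)) {
            end_offset--;   // drop a trailing punctuation character
        } else {
            break;
        }
    }
    return end_offset;
}

int main() {
    std::string text = "brown!!";
    // trailing '!' characters are dropped: prints 4 (the offset of 'n')
    std::cout << trim_trailing_punctuation(text, text.size() - 1) << std::endl;
    return 0;
}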
@@ -123,7 +123,7 @@ bool Tokenizer::next(std::string &token, size_t& token_index, size_t& start_index,
         auto raw_text = unicode_text.tempSubStringBetween(start_pos, end_pos);
         transliterator->transliterate(raw_text);
         raw_text.toUTF8String(word);
-    } else if(locale == "th") {
+    } else if(normalize && locale == "th") {
         UErrorCode errcode = U_ZERO_ERROR;
         icu::UnicodeString src = unicode_text.tempSubStringBetween(start_pos, end_pos);
         icu::UnicodeString dst;
@@ -332,9 +332,13 @@ TEST_F(CollectionLocaleTest, SearchAgainstThaiTextExactMatch) {
         coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get();
     }

+    std::string word_9bytes = "น้ำ";
+    std::string word_12bytes = "น้ํา";
+
     std::vector<std::vector<std::string>> records = {
         {"ติดกับดักรายได้ปานกลาง", "Expected Result"},
         {"ข้อมูลรายคนหรือรายบริษัทในการเชื่อมโยงส่วนได้ส่วนเสีย", "Another Result"},
+        {word_9bytes, "Another Result"}, // NFKC normalization
     };

     for (size_t i = 0; i < records.size(); i++) {
@@ -361,6 +365,12 @@ TEST_F(CollectionLocaleTest, SearchAgainstThaiTextExactMatch) {
     ASSERT_EQ("ข้อมูล<mark>ราย</mark>คนหรือ<mark>ราย</mark>บริษัทในการเชื่อมโยงส่วน<mark>ได้</mark>ส่วนเสีย",
               results["hits"][1]["highlights"][0]["snippet"].get<std::string>());

+    // check text index overflow regression with NFKC normalization + highlighting
+
+    results = coll1->search(word_12bytes, {"title"}, "", {}, sort_fields, {2}, 10, 1, FREQUENCY).get();
+    ASSERT_EQ(1, results["found"].get<size_t>());
+    ASSERT_EQ(1, results["hits"].size());
+    ASSERT_EQ("<mark>น้ำ</mark>", results["hits"][0]["highlights"][0]["snippet"].get<std::string>());
 }

 TEST_F(CollectionLocaleTest, SearchAgainstKoreanText) {
@@ -237,7 +237,7 @@ TEST(TokenizerTest, ShouldTokenizeLocaleText) {

     tokens.clear();
     str = "จิ้งจอกสีน้ำตาลด่วน";
-    Tokenizer(str, false, false, "th").tokenize(tokens);
+    Tokenizer(str, true, false, "th").tokenize(tokens);
     ASSERT_EQ(4, tokens.size());
     ASSERT_EQ("จิ้งจอก", tokens[0]);
     ASSERT_EQ("สี", tokens[1]);