Do word-level tokenization for the th locale.

This ensures that we handle changes in text length due to NFKC normalization.
Kishore Nallan 2022-08-31 07:35:05 +05:30
parent 729c73bb6d
commit 70e6a89ea8
5 changed files with 32 additions and 17 deletions
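
For context on the commit message: NFKC normalization can change the UTF-8 byte length of Thai text, because THAI CHARACTER SARA AM (U+0E33) has a compatibility decomposition into NIKHAHIT (U+0E4D) + SARA AA (U+0E32). Below is a minimal ICU4C sketch of that length change (illustrative scaffolding, not code from this commit):

    #include <unicode/normalizer2.h>
    #include <unicode/unistr.h>
    #include <iostream>
    #include <string>

    int main() {
        UErrorCode errcode = U_ZERO_ERROR;
        const icu::Normalizer2* nfkc = icu::Normalizer2::getNFKCInstance(errcode);

        // "น้ำ" (water) is U+0E19 U+0E49 U+0E33: 3 code points, 9 bytes in UTF-8.
        std::string raw = "น้ำ";

        icu::UnicodeString src = icu::UnicodeString::fromUTF8(raw);
        icu::UnicodeString dst = nfkc->normalize(src, errcode);
        std::string normalized;
        dst.toUTF8String(normalized);

        // NFKC expands U+0E33 (SARA AM) into U+0E4D + U+0E32, growing the text.
        std::cout << raw.size() << " -> " << normalized.size() << "\n";  // 9 -> 12
    }

This 9-byte vs 12-byte pair is exactly what the CollectionLocaleTest below exercises as word_9bytes and word_12bytes.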

include/collection.h

@@ -222,7 +222,7 @@ private:
bool handle_highlight_text(std::string& text, bool normalise, const field &search_field,
const std::vector<char>& symbols_to_index, const std::vector<char>& token_separators,
- highlight_t& highlight, StringUtils & string_utils, bool is_cyrillic,
+ highlight_t& highlight, StringUtils & string_utils, bool use_word_tokenizer,
const size_t highlight_affix_num_tokens,
const tsl::htrie_map<char, token_leaf>& qtoken_leaves,
int last_valid_offset_index, const Match& match,

src/collection.cpp

@@ -2262,8 +2262,8 @@ void Collection::highlight_result(const std::string& raw_query, const field &sea
tsl::htrie_set<char> matched_tokens;
- bool is_cyrillic = Tokenizer::is_cyrillic(search_field.locale);
- bool normalise = is_cyrillic ? false : true;
+ bool use_word_tokenizer = search_field.locale == "th" || Tokenizer::is_cyrillic(search_field.locale);
+ bool normalise = !use_word_tokenizer;
std::vector<std::string> raw_query_tokens;
Tokenizer(raw_query, normalise, false, search_field.locale, symbols_to_index, token_separators).tokenize(raw_query_tokens);
@@ -2348,14 +2348,14 @@ void Collection::highlight_result(const std::string& raw_query, const field &sea
std::string text = h_obj.get<std::string>();
handle_highlight_text(text, normalise, search_field, symbols_to_index,
- token_separators, array_highlight, string_utils, is_cyrillic,
- highlight_affix_num_tokens,
- qtoken_leaves, last_valid_offset_index, match,
- prefix_token_num_chars,
- highlight_fully, snippet_threshold, is_infix_search,
- raw_query_tokens,
- last_valid_offset, highlight_start_tag, highlight_end_tag,
- index_symbols, match_index);
+ token_separators, array_highlight, string_utils, use_word_tokenizer,
+ highlight_affix_num_tokens,
+ qtoken_leaves, last_valid_offset_index, match,
+ prefix_token_num_chars,
+ highlight_fully, snippet_threshold, is_infix_search,
+ raw_query_tokens,
+ last_valid_offset, highlight_start_tag, highlight_end_tag,
+ index_symbols, match_index);
if(!array_highlight.snippets.empty()) {
h_obj = array_highlight.snippets[0];
@@ -2458,7 +2458,7 @@ void Collection::highlight_result(const std::string& raw_query, const field &sea
}
handle_highlight_text(text, normalise, search_field, symbols_to_index, token_separators,
- highlight, string_utils, is_cyrillic, highlight_affix_num_tokens,
+ highlight, string_utils, use_word_tokenizer, highlight_affix_num_tokens,
qtoken_leaves, last_valid_offset_index, match, prefix_token_num_chars,
highlight_fully, snippet_threshold, is_infix_search, raw_query_tokens,
last_valid_offset, highlight_start_tag, highlight_end_tag,
@@ -2543,7 +2543,7 @@ void Collection::highlight_result(const std::string& raw_query, const field &sea
bool Collection::handle_highlight_text(std::string& text, bool normalise, const field &search_field,
const std::vector<char>& symbols_to_index, const std::vector<char>& token_separators,
- highlight_t& highlight, StringUtils & string_utils, bool is_cyrillic,
+ highlight_t& highlight, StringUtils & string_utils, bool use_word_tokenizer,
const size_t highlight_affix_num_tokens,
const tsl::htrie_map<char, token_leaf>& qtoken_leaves,
int last_valid_offset_index, const Match& match,
@@ -2580,7 +2580,7 @@ bool Collection::handle_highlight_text(std::string& text, bool normalise, const
bool found_first_match = false;
while(tokenizer.next(raw_token, raw_token_index, tok_start, tok_end)) {
- if(is_cyrillic) {
+ if(use_word_tokenizer) {
bool found_token = word_tokenizer.tokenize(raw_token);
if(!found_token) {
tokenizer.decr_token_counter();
@@ -2744,7 +2744,12 @@ void Collection::highlight_text(const string& highlight_start_tag, const string&
auto end_offset = offset_it->second;
// if a token ends with one or more punctuation chars, we should not highlight them
- for(int j = end_offset; j > 0; j--) {
+ for(int j = end_offset; j >= 0; j--) {
+     if(end_offset >= text.size()) {
+         // this should not happen unless we mess up unicode normalization
+         break;
+     }
if(!std::isalnum(text[j]) && Tokenizer::is_ascii_char(text[j]) &&
index_symbols[uint8_t(text[j])] != 1) {
end_offset--;
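
The bounds check added above guards Collection::highlight_text against offsets that were computed on the NFKC-normalized token but applied to the raw field text, which for Thai can be shorter. A contrived sketch of the failure mode the guard prevents (the values are illustrative, not taken from the commit):

    #include <iostream>
    #include <string>

    int main() {
        std::string text = "น้ำ";     // raw field value: 9 bytes
        std::size_t end_offset = 11;  // token end computed on the 12-byte NFKC form

        // Mirrors the new guard: bail out rather than index past text.size().
        if (end_offset >= text.size()) {
            std::cout << "offset beyond raw text; skip punctuation trimming\n";
        }
    }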

src/tokenizer.cpp

@@ -123,7 +123,7 @@ bool Tokenizer::next(std::string &token, size_t& token_index, size_t& start_inde
auto raw_text = unicode_text.tempSubStringBetween(start_pos, end_pos);
transliterator->transliterate(raw_text);
raw_text.toUTF8String(word);
- } else if(locale == "th") {
+ } else if(normalize && locale == "th") {
UErrorCode errcode = U_ZERO_ERROR;
icu::UnicodeString src = unicode_text.tempSubStringBetween(start_pos, end_pos);
icu::UnicodeString dst;
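
The change above makes the th normalization branch conditional on normalize; in the highlighting path (see collection.cpp above), th now sets normalise to false and relies on the word-level tokenizer instead. Thai is written without spaces between words, so word boundaries come from dictionary-based segmentation; the sketch below uses ICU's BreakIterator to show the idea (an approximation, not the project's actual word tokenizer code):

    #include <unicode/brkiter.h>
    #include <unicode/unistr.h>
    #include <iostream>
    #include <memory>
    #include <string>

    int main() {
        UErrorCode status = U_ZERO_ERROR;
        std::unique_ptr<icu::BreakIterator> bi(
            icu::BreakIterator::createWordInstance(icu::Locale("th"), status));

        icu::UnicodeString text = icu::UnicodeString::fromUTF8("จิ้งจอกสีน้ำตาลด่วน");
        bi->setText(text);

        // Walk consecutive boundaries; each [start, end) span is one segment.
        int32_t start = bi->first();
        for (int32_t end = bi->next(); end != icu::BreakIterator::DONE;
             start = end, end = bi->next()) {
            std::string word;
            text.tempSubStringBetween(start, end).toUTF8String(word);
            std::cout << word << "\n";
        }
    }

The expected segments line up with the TokenizerTest expectations at the end of this commit: จิ้งจอก, สี, น้ำตาล, ด่วน.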

test/collection_locale_test.cpp

@@ -332,9 +332,13 @@ TEST_F(CollectionLocaleTest, SearchAgainstThaiTextExactMatch) {
coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get();
}
+ std::string word_9bytes = "น้ำ";
+ std::string word_12bytes = "น้ํา";
std::vector<std::vector<std::string>> records = {
{"ติดกับดักรายได้ปานกลาง", "Expected Result"},
{"ข้อมูลรายคนหรือรายบริษัทในการเชื่อมโยงส่วนได้ส่วนเสีย", "Another Result"},
+ {word_9bytes, "Another Result"}, // NFKC normalization
};
for (size_t i = 0; i < records.size(); i++) {
@@ -361,6 +365,12 @@ TEST_F(CollectionLocaleTest, SearchAgainstThaiTextExactMatch) {
ASSERT_EQ("ข้อมูล<mark>ราย</mark>คนหรือ<mark>ราย</mark>บริษัทในการเชื่อมโยงส่วน<mark>ได้</mark>ส่วนเสีย",
results["hits"][1]["highlights"][0]["snippet"].get<std::string>());
+ // check text index overflow regression with NFKC normalization + highlighting
+ results = coll1->search(word_12bytes, {"title"}, "", {}, sort_fields, {2}, 10, 1, FREQUENCY).get();
+ ASSERT_EQ(1, results["found"].get<size_t>());
+ ASSERT_EQ(1, results["hits"].size());
+ ASSERT_EQ("<mark>น้ำ</mark>", results["hits"][0]["highlights"][0]["snippet"].get<std::string>());
}
TEST_F(CollectionLocaleTest, SearchAgainstKoreanText) {

test/tokenizer_test.cpp

@@ -237,7 +237,7 @@ TEST(TokenizerTest, ShouldTokenizeLocaleText) {
tokens.clear();
str = "จิ้งจอกสีน้ำตาลด่วน";
- Tokenizer(str, false, false, "th").tokenize(tokens);
+ Tokenizer(str, true, false, "th").tokenize(tokens);
ASSERT_EQ(4, tokens.size());
ASSERT_EQ("จิ้งจอก", tokens[0]);
ASSERT_EQ("สี", tokens[1]);