From 70e6a89ea8d76c59e00af16ce4559c8932ec08ec Mon Sep 17 00:00:00 2001
From: Kishore Nallan
Date: Wed, 31 Aug 2022 07:35:05 +0530
Subject: [PATCH] Do word level tokenization for th locale.

This ensures that we handle changes in text length due to NFKC
normalization.
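Background for reviewers: NFKC can change the UTF-8 byte length of Thai
text. For example, SARA AM (U+0E33) decomposes into NIKHAHIT + SARA AA
(U+0E4D U+0E32) and is not recomposed, so "น้ำ" grows from 9 to 12 bytes.
Offsets computed against the normalized token can therefore overshoot the
raw document text during highlighting. The snippet below is a minimal
standalone sketch of the length change, not part of this patch; it assumes
only ICU4C, which the tokenizer already links against:

    // nfkc_length_demo.cpp: standalone illustration of the NFKC expansion.
    #include <iostream>
    #include <string>
    #include <unicode/normalizer2.h>
    #include <unicode/unistr.h>

    int main() {
        UErrorCode errcode = U_ZERO_ERROR;
        const icu::Normalizer2* nfkc = icu::Normalizer2::getNFKCInstance(errcode);
        if(U_FAILURE(errcode)) {
            return 1;
        }

        // "น้ำ" = U+0E19 U+0E49 U+0E33 (ends with SARA AM): 9 bytes in UTF-8
        icu::UnicodeString src = icu::UnicodeString::fromUTF8("\u0E19\u0E49\u0E33");
        icu::UnicodeString dst = nfkc->normalize(src, errcode);

        std::string raw, normalized;
        src.toUTF8String(raw);
        dst.toUTF8String(normalized);

        // SARA AM expands to NIKHAHIT + SARA AA, so the normalized form
        // is 12 bytes: byte offsets into the raw text no longer line up.
        std::cout << raw.size() << " -> " << normalized.size() << std::endl;  // 9 -> 12
        return 0;
    }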
---
 include/collection.h            |  2 +-
 src/collection.cpp              | 33 +++++++++++++++++++--------------
 src/tokenizer.cpp               |  2 +-
 test/collection_locale_test.cpp | 10 ++++++++++
 test/tokenizer_test.cpp         |  2 +-
 5 files changed, 32 insertions(+), 17 deletions(-)

diff --git a/include/collection.h b/include/collection.h
index 7d8c76c3..ff3c991b 100644
--- a/include/collection.h
+++ b/include/collection.h
@@ -222,7 +222,7 @@ private:
     bool handle_highlight_text(std::string& text, bool normalise, const field &search_field,
                                const std::vector<char>& symbols_to_index,
                                const std::vector<char>& token_separators,
-                               highlight_t& highlight, StringUtils & string_utils, bool is_cyrillic,
+                               highlight_t& highlight, StringUtils & string_utils, bool use_word_tokenizer,
                                const size_t highlight_affix_num_tokens,
                                const tsl::htrie_map<char, token_leaf>& qtoken_leaves, int last_valid_offset_index,
                                const Match& match,
diff --git a/src/collection.cpp b/src/collection.cpp
index 26ef40ff..d5392014 100644
--- a/src/collection.cpp
+++ b/src/collection.cpp
@@ -2262,8 +2262,8 @@ void Collection::highlight_result(const std::string& raw_query, const field &sea
 
     tsl::htrie_set<char> matched_tokens;
 
-    bool is_cyrillic = Tokenizer::is_cyrillic(search_field.locale);
-    bool normalise = is_cyrillic ? false : true;
+    bool use_word_tokenizer = search_field.locale == "th" || Tokenizer::is_cyrillic(search_field.locale);
+    bool normalise = !use_word_tokenizer;
 
     std::vector<std::string> raw_query_tokens;
     Tokenizer(raw_query, normalise, false, search_field.locale, symbols_to_index, token_separators).tokenize(raw_query_tokens);
@@ -2348,14 +2348,14 @@ void Collection::highlight_result(const std::string& raw_query, const field &sea
 
             std::string text = h_obj.get<std::string>();
             handle_highlight_text(text, normalise, search_field, symbols_to_index,
-                                  token_separators, array_highlight, string_utils, is_cyrillic,
-                                  highlight_affix_num_tokens,
-                                  qtoken_leaves, last_valid_offset_index, match,
-                                  prefix_token_num_chars,
-                                  highlight_fully, snippet_threshold, is_infix_search,
-                                  raw_query_tokens,
-                                  last_valid_offset, highlight_start_tag, highlight_end_tag,
-                                  index_symbols, match_index);
+                                  token_separators, array_highlight, string_utils, use_word_tokenizer,
+                                  highlight_affix_num_tokens,
+                                  qtoken_leaves, last_valid_offset_index, match,
+                                  prefix_token_num_chars,
+                                  highlight_fully, snippet_threshold, is_infix_search,
+                                  raw_query_tokens,
+                                  last_valid_offset, highlight_start_tag, highlight_end_tag,
+                                  index_symbols, match_index);
 
             if(!array_highlight.snippets.empty()) {
                 h_obj = array_highlight.snippets[0];
@@ -2458,7 +2458,7 @@ void Collection::highlight_result(const std::string& raw_query, const field &sea
         }
 
         handle_highlight_text(text, normalise, search_field, symbols_to_index, token_separators,
-                              highlight, string_utils, is_cyrillic, highlight_affix_num_tokens,
+                              highlight, string_utils, use_word_tokenizer, highlight_affix_num_tokens,
                               qtoken_leaves, last_valid_offset_index, match, prefix_token_num_chars,
                               highlight_fully, snippet_threshold, is_infix_search, raw_query_tokens,
                               last_valid_offset, highlight_start_tag, highlight_end_tag,
@@ -2543,7 +2543,7 @@ void Collection::highlight_result(const std::string& raw_query, const field &sea
 bool Collection::handle_highlight_text(std::string& text, bool normalise, const field &search_field,
                                        const std::vector<char>& symbols_to_index,
                                        const std::vector<char>& token_separators,
-                                       highlight_t& highlight, StringUtils & string_utils, bool is_cyrillic,
+                                       highlight_t& highlight, StringUtils & string_utils, bool use_word_tokenizer,
                                        const size_t highlight_affix_num_tokens,
                                        const tsl::htrie_map<char, token_leaf>& qtoken_leaves, int last_valid_offset_index,
                                        const Match& match,
@@ -2580,7 +2580,7 @@ bool Collection::handle_highlight_text(std::string& text, bool normalise, const
     bool found_first_match = false;
 
     while(tokenizer.next(raw_token, raw_token_index, tok_start, tok_end)) {
-        if(is_cyrillic) {
+        if(use_word_tokenizer) {
             bool found_token = word_tokenizer.tokenize(raw_token);
             if(!found_token) {
                 tokenizer.decr_token_counter();
@@ -2744,7 +2744,12 @@ void Collection::highlight_text(const string& highlight_start_tag, const string&
         auto end_offset = offset_it->second;
 
         // if a token ends with one or more punctuation chars, we should not highlight them
-        for(int j = end_offset; j > 0; j--) {
+        for(int j = end_offset; j >= 0; j--) {
+            if(end_offset >= text.size()) {
+                // this should not happen unless we mess up unicode normalization
+                break;
+            }
+
             if(!std::isalnum(text[j]) && Tokenizer::is_ascii_char(text[j]) &&
                index_symbols[uint8_t(text[j])] != 1) {
                 end_offset--;
diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp
index 4eb357a5..082db1be 100644
--- a/src/tokenizer.cpp
+++ b/src/tokenizer.cpp
@@ -123,7 +123,7 @@ bool Tokenizer::next(std::string &token, size_t& token_index, size_t& start_inde
             auto raw_text = unicode_text.tempSubStringBetween(start_pos, end_pos);
             transliterator->transliterate(raw_text);
             raw_text.toUTF8String(word);
-        } else if(locale == "th") {
+        } else if(normalize && locale == "th") {
             UErrorCode errcode = U_ZERO_ERROR;
             icu::UnicodeString src = unicode_text.tempSubStringBetween(start_pos, end_pos);
             icu::UnicodeString dst;
diff --git a/test/collection_locale_test.cpp b/test/collection_locale_test.cpp
index ab40b163..11ce2ed8 100644
--- a/test/collection_locale_test.cpp
+++ b/test/collection_locale_test.cpp
@@ -332,9 +332,13 @@ TEST_F(CollectionLocaleTest, SearchAgainstThaiTextExactMatch) {
         coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get();
     }
 
+    std::string word_9bytes = "น้ำ";
+    std::string word_12bytes = "น้ํา";
+
     std::vector<std::vector<std::string>> records = {
        {"ติดกับดักรายได้ปานกลาง", "Expected Result"},
        {"ข้อมูลรายคนหรือรายบริษัทในการเชื่อมโยงส่วนได้ส่วนเสีย", "Another Result"},
+       {word_9bytes, "Another Result"},  // NFKC normalization
     };
 
     for (size_t i = 0; i < records.size(); i++) {
@@ -361,6 +365,12 @@ TEST_F(CollectionLocaleTest, SearchAgainstThaiTextExactMatch) {
     ASSERT_EQ("ข้อมูลรายคนหรือรายบริษัทในการเชื่อมโยงส่วนได้ส่วนเสีย",
               results["hits"][1]["highlights"][0]["snippet"].get<std::string>());
 
+    // check for text index overflow regression with NFKC normalization + highlighting
+    results = coll1->search(word_12bytes, {"title"}, "", {}, sort_fields, {2}, 10, 1, FREQUENCY).get();
+    ASSERT_EQ(1, results["found"].get<size_t>());
+    ASSERT_EQ(1, results["hits"].size());
+    ASSERT_EQ("น้ำ", results["hits"][0]["highlights"][0]["snippet"].get<std::string>());
+
 }
 
 TEST_F(CollectionLocaleTest, SearchAgainstKoreanText) {
diff --git a/test/tokenizer_test.cpp b/test/tokenizer_test.cpp
index 12dd6984..caaafbf1 100644
--- a/test/tokenizer_test.cpp
+++ b/test/tokenizer_test.cpp
@@ -237,7 +237,7 @@ TEST(TokenizerTest, ShouldTokenizeLocaleText) {
 
     tokens.clear();
     str = "จิ้งจอกสีน้ำตาลด่วน";
-    Tokenizer(str, false, false, "th").tokenize(tokens);
+    Tokenizer(str, true, false, "th").tokenize(tokens);
     ASSERT_EQ(4, tokens.size());
     ASSERT_EQ("จิ้งจอก", tokens[0]);
     ASSERT_EQ("สี", tokens[1]);
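Note on the approach (illustrative, not part of the patch): with this change
the highlighter re-segments each raw token through the word_tokenizer for th,
as it already did for Cyrillic locales, rather than trusting byte offsets that
were computed against NFKC-normalized text. The standalone sketch below shows
the kind of dictionary-based word segmentation ICU performs for Thai; it
assumes only ICU4C and mirrors the four tokens asserted in tokenizer_test.cpp:

    // thai_word_break_demo.cpp: standalone illustration; assumes ICU4C.
    #include <iostream>
    #include <memory>
    #include <string>
    #include <unicode/brkiter.h>
    #include <unicode/locid.h>
    #include <unicode/unistr.h>

    int main() {
        UErrorCode errcode = U_ZERO_ERROR;
        std::unique_ptr<icu::BreakIterator> bi(
            icu::BreakIterator::createWordInstance(icu::Locale("th"), errcode));
        if(U_FAILURE(errcode)) {
            return 1;
        }

        // Same phrase as the tokenizer test ("the quick brown fox" in Thai);
        // note the text contains no spaces to split on.
        icu::UnicodeString text = icu::UnicodeString::fromUTF8("จิ้งจอกสีน้ำตาลด่วน");
        bi->setText(text);

        // Walk the word boundaries; ICU's Thai dictionary splits the run
        // into จิ้งจอก / สี / น้ำตาล / ด่วน (4 tokens, per the test's asserts).
        int32_t start = bi->first();
        for(int32_t end = bi->next(); end != icu::BreakIterator::DONE;
            start = end, end = bi->next()) {
            std::string word;
            text.tempSubStringBetween(start, end).toUTF8String(word);
            std::cout << word << std::endl;
        }
        return 0;
    }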