From c2eec85277ac12b531ead0abc2eeb34f74d5ab6b Mon Sep 17 00:00:00 2001 From: kishorenc Date: Sat, 20 Mar 2021 12:58:30 +0530 Subject: [PATCH] Fix highlighting of strings with special characters. --- include/index.h | 4 +- include/match_score.h | 4 + include/tokenizer.h | 19 +++- src/collection.cpp | 143 +++++++++++++++++------------ src/index.cpp | 17 +++- src/tokenizer.cpp | 90 +++++++++--------- test/collection_synonyms_test.cpp | 4 +- test/collection_test.cpp | 25 +++-- test/tokenizer_test.cpp | 148 +++++++++++++++++++----------- 9 files changed, 278 insertions(+), 176 deletions(-) diff --git a/include/index.h b/include/index.h index 73e45f9e..fcacae77 100644 --- a/include/index.h +++ b/include/index.h @@ -389,10 +389,10 @@ public: static void transform_for_180th_meridian(GeoCoord& point, double offset); - // the following methods are not synchronized because their parent calls are synchronized - art_leaf* get_token_leaf(const std::string & field_name, const unsigned char* token, uint32_t token_len); + // the following methods are not synchronized because their parent calls are synchronized + uint32_t do_filtering(uint32_t** filter_ids_out, const std::vector & filters) const; static Option validate_index_in_memory(nlohmann::json &document, uint32_t seq_id, diff --git a/include/match_score.h b/include/match_score.h index 5224c9dd..c5d69733 100644 --- a/include/match_score.h +++ b/include/match_score.h @@ -30,6 +30,10 @@ struct TokenOffset { bool operator>(const TokenOffset &a) const { return offset > a.offset; } + + bool operator<(const TokenOffset &a) const { + return offset < a.offset; + } }; struct Match { diff --git a/include/tokenizer.h b/include/tokenizer.h index 39c0b511..985d59e8 100644 --- a/include/tokenizer.h +++ b/include/tokenizer.h @@ -8,19 +8,32 @@ class Tokenizer { private: const std::string& text; size_t i; - const bool keep_empty; + const bool keep_separators; const bool normalize; const bool no_op; size_t token_counter = 0; iconv_t cd; + static 
const size_t CHARS = 0; + static const size_t SEPARATORS = 1; + size_t stream_mode; + + std::stringstream out; + public: explicit Tokenizer(const std::string& input, - bool keep_empty=true, bool normalize=true, bool no_op=false): - text(input), i(0), keep_empty(keep_empty), normalize(normalize), no_op(no_op) { + bool keep_separators=true, bool normalize=true, bool no_op=false): + text(input), i(0), keep_separators(keep_separators), normalize(normalize), no_op(no_op) { cd = iconv_open("ASCII//TRANSLIT", "UTF-8"); + + if(!input.empty() && (std::isalnum(text[0]) || (text[i] & ~0x7f) != 0)) { + // alphanum or non-ascii + stream_mode = CHARS; + } else { + stream_mode = SEPARATORS; + } } ~Tokenizer() { diff --git a/src/collection.cpp b/src/collection.cpp index a98e0e1a..96ae52c9 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -1223,20 +1223,27 @@ void Collection::parse_search_query(const std::string &query, std::vector tokens; - StringUtils::split(query, tokens, " "); + Tokenizer(query, true, true).tokenize(tokens); + bool exclude_operator_prior = false; - for(std::string& token: tokens) { - if(token[0] == '-') { - std::string&& just_token = token.substr(1); - Tokenizer(just_token, false, true).tokenize(just_token); - if(!just_token.empty()) { - q_exclude_tokens.push_back(just_token); - } + for(const auto& token: tokens) { + if(token.empty()) { + continue; + } + + if(token == "-" || token == " -") { + exclude_operator_prior = true; + } + + if(!std::isalnum(token[0])) { + continue; + } + + if(exclude_operator_prior) { + q_exclude_tokens.push_back(token); + exclude_operator_prior = false; } else { - Tokenizer(token, false, true).tokenize(token); - if(!token.empty()) { - q_include_tokens.push_back(token); - } + q_include_tokens.push_back(token); } } @@ -1383,7 +1390,9 @@ void Collection::highlight_result(const field &search_field, // is from the best matched field and need not be present in other fields of a document. 
Index* index = indices[field_order_kv->key % num_memory_shards]; art_leaf *actual_leaf = index->get_token_leaf(search_field.name, &token_leaf->key[0], token_leaf->key_len); + if(actual_leaf != nullptr) { + //LOG(INFO) << "field: " << search_field.name << ", key: " << actual_leaf->key; query_suggestion.push_back(actual_leaf); std::vector positions; uint32_t doc_index = actual_leaf->values->ids.indexOf(field_order_kv->key); @@ -1432,66 +1441,84 @@ void Collection::highlight_result(const field &search_field, std::partial_sort(match_indices.begin(), match_indices.begin()+max_array_matches, match_indices.end()); for(size_t index = 0; index < max_array_matches; index++) { - const match_index_t & match_index = match_indices[index]; - const Match & match = match_index.match; + std::sort(match_indices[index].match.offsets.begin(), match_indices[index].match.offsets.end()); + const auto& match_index = match_indices[index]; + const Match& match = match_index.match; - std::vector tokens; + const std::string& text = (search_field.type == field_types::STRING) ? 
document[search_field.name] : document[search_field.name][match_index.index]; + Tokenizer tokenizer(text, true, false); - if(search_field.type == field_types::STRING) { - Tokenizer(document[search_field.name], true, false).tokenize(tokens); - } else { - Tokenizer(document[search_field.name][match_index.index], true, false).tokenize(tokens); - } + std::string raw_token; + size_t raw_token_index = 0; + int indexed_token_index = -1; + size_t match_offset_index = 0; - std::vector token_indices; + std::set token_indices; spp::sparse_hash_set token_hits; + std::vector raw_tokens; + std::unordered_map indexed_to_raw; - for(size_t i = 0; i < match.offsets.size(); i++) { - if(match.offsets[i].offset != MAX_DISPLACEMENT) { - size_t token_index = (size_t)(match.offsets[i].offset); - token_indices.push_back(token_index); - if(token_index >= tokens.size()) { - LOG(ERROR) << "Highlight token index " << token_index << " is greater than length of store field."; - continue; - } - std::string token = tokens[token_index]; - Tokenizer(token, true, true).tokenize(token); - - token_hits.insert(token); + while(tokenizer.next(raw_token, raw_token_index)) { + if(!raw_token.empty() && (std::isalnum(raw_token[0]) || (raw_token[0] & ~0x7f) != 0)) { + // an actual (indexed) token: its first char is alphanumeric or non-ASCII, i.e. not a separator + indexed_token_index++; + indexed_to_raw[indexed_token_index] = raw_token_index; + /*LOG(INFO) << "raw_token: " << raw_token << ", indexed_token_index: " << indexed_token_index + << ", raw_token_index: " << raw_token_index;*/ } + + if (match_offset_index < match.offsets.size() && + match.offsets[match_offset_index].offset == indexed_token_index) { + std::string indexed_token; + Tokenizer(raw_token, true, true).tokenize(indexed_token); + + if(token_indices.count(indexed_token_index) == 0) { + // repetition could occur, for e.g. 
in the case of synonym constructed queries + token_indices.insert(indexed_token_index); + token_hits.insert(indexed_token); + } + + match_offset_index++; + } + + raw_tokens.push_back(raw_token); } + size_t num_indexed_tokens = indexed_token_index + 1; auto minmax = std::minmax_element(token_indices.begin(), token_indices.end()); size_t prefix_length = highlight_affix_num_tokens; - size_t suffix_length = highlight_affix_num_tokens + 1; + size_t suffix_length = highlight_affix_num_tokens; - // For longer strings, pick surrounding tokens within `prefix_length` of min_index and max_index for snippet - const size_t start_index = (tokens.size() <= snippet_threshold) ? 0 : - std::max(0, (int)(*(minmax.first) - prefix_length)); + if(num_indexed_tokens == 0) { + continue; + } - const size_t end_index = (tokens.size() <= snippet_threshold) ? tokens.size() : - std::min((int)tokens.size(), (int)(*(minmax.second) + suffix_length)); + // For longer strings, pick surrounding raw_tokens within `prefix_length` of min_index and max_index for snippet + const size_t start_index = (num_indexed_tokens <= snippet_threshold) ? 0 : + indexed_to_raw[std::max(0, (int)(*(minmax.first) - prefix_length))]; + + const size_t end_index = (num_indexed_tokens <= snippet_threshold) ? 
raw_tokens.size() - 1 : + indexed_to_raw[std::min((int)num_indexed_tokens - 1, (int)(*(minmax.second) + suffix_length))]; std::stringstream snippet_stream; highlight.matched_tokens.emplace_back(); std::vector& matched_tokens = highlight.matched_tokens.back(); + size_t snippet_index = start_index; - for(size_t snippet_index = start_index; snippet_index < end_index; snippet_index++) { - if(snippet_index != start_index) { - snippet_stream << " "; - } + while(snippet_index <= end_index) { + std::string normalized_token; + Tokenizer(raw_tokens[snippet_index], true, true).tokenize(normalized_token); - std::string token = tokens[snippet_index]; - Tokenizer(token, true, true).tokenize(token); - - if(token_hits.count(token) != 0) { - snippet_stream << highlight_start_tag << tokens[snippet_index] << highlight_end_tag; - matched_tokens.push_back(tokens[snippet_index]); + if(token_hits.count(normalized_token) != 0) { + snippet_stream << highlight_start_tag << raw_tokens[snippet_index] << highlight_end_tag; + matched_tokens.push_back(raw_tokens[snippet_index]); } else { - snippet_stream << tokens[snippet_index]; + snippet_stream << raw_tokens[snippet_index]; } + + snippet_index++; } highlight.snippets.push_back(snippet_stream.str()); @@ -1501,18 +1528,14 @@ void Collection::highlight_result(const field &search_field, if(highlighted_fully) { std::stringstream value_stream; - for(size_t value_index = 0; value_index < tokens.size(); value_index++) { - if(value_index != 0) { - value_stream << " "; - } + for(size_t value_index = 0; value_index < raw_tokens.size(); value_index++) { + std::string normalized_token; + Tokenizer(raw_tokens[value_index], true, true).tokenize(normalized_token); - std::string token = tokens[value_index]; - Tokenizer(token, true, true).tokenize(token); - - if(token_hits.count(token) != 0) { - value_stream << highlight_start_tag << tokens[value_index] << highlight_end_tag; + if(token_hits.count(normalized_token) != 0) { + value_stream << highlight_start_tag 
<< raw_tokens[value_index] << highlight_end_tag; } else { - value_stream << tokens[value_index]; + value_stream << raw_tokens[value_index]; } } diff --git a/src/index.cpp b/src/index.cpp index c272eeec..8500dee5 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -575,7 +575,7 @@ void Index::index_string_field(const std::string & text, const int64_t score, ar uint32_t seq_id, bool is_facet, const field & a_field) { std::unordered_map> token_to_offsets; - Tokenizer tokenizer(text, true, true, !a_field.is_string()); + Tokenizer tokenizer(text, false, true, !a_field.is_string()); std::string token; size_t token_index = 0; @@ -588,7 +588,6 @@ void Index::index_string_field(const std::string & text, const int64_t score, ar if(is_facet) { uint64_t hash = facet_token_hash(a_field, token); - //facet_index_v2[seq_id][facet_id].push_back(hash); facet_hashes.push_back(hash); } @@ -623,7 +622,7 @@ void Index::index_string_array_field(const std::vector & strings, c const std::string& str = strings[array_index]; std::set token_set; // required to deal with repeating tokens - Tokenizer tokenizer(str, true, true, !a_field.is_string()); + Tokenizer tokenizer(str, false, true, !a_field.is_string()); std::string token; size_t token_index = 0; @@ -2216,6 +2215,8 @@ void Index::populate_token_positions(const std::vector& query_sugges for(size_t i = 0; i < query_suggestion.size(); i++) { const art_leaf* token_leaf = query_suggestion[i]; uint32_t doc_index = leaf_to_indices[i][result_index]; + /*LOG(INFO) << "doc_id: " << token_leaf->values->ids.at(doc_index) << ", token_leaf->values->ids.getLength(): " + << token_leaf->values->ids.getLength();*/ // it's possible for a query token to not appear in a resulting document if(doc_index == token_leaf->values->ids.getLength()) { @@ -2229,7 +2230,14 @@ void Index::populate_token_positions(const std::vector& query_sugges /*uint32_t* offsets = token_leaf->values->offsets.uncompress(); for(size_t ii=0; ii < token_leaf->values->offsets.getLength(); 
ii++) { LOG(INFO) << "offset: " << offsets[ii]; - }*/ + } + + uint32_t* offset_indices = token_leaf->values->offset_index.uncompress(); + for(size_t ii=0; ii < token_leaf->values->offset_index.getLength(); ii++) { + LOG(INFO) << "offset index: " << offset_indices[ii]; + } + + LOG(INFO) << "token_leaf->values->offsets.getLength(): " << token_leaf->values->offsets.getLength();*/ uint32_t start_offset = token_leaf->values->offset_index.at(doc_index); uint32_t end_offset = (doc_index == token_leaf->values->ids.getLength() - 1) ? @@ -2464,6 +2472,7 @@ void Index::tokenize_doc_field(const nlohmann::json& document, const field& sear } art_leaf* Index::get_token_leaf(const std::string & field_name, const unsigned char* token, uint32_t token_len) { + std::shared_lock lock(mutex); const art_tree *t = search_index.at(field_name); return (art_leaf*) art_search(t, token, (int) token_len); } diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp index 9dc86cd8..ac99a2bb 100644 --- a/src/tokenizer.cpp +++ b/src/tokenizer.cpp @@ -2,62 +2,63 @@ #include "tokenizer.h" bool Tokenizer::next(std::string &token, size_t& token_index) { - std::stringstream out; - - if(i >= text.size()) { - if(i == text.size() && !text.empty() && text.back() == ' ') { - token = ""; - i++; - return true; + if(no_op) { + if(i == text.size()) { + return false; } - return false; - } - - if(no_op) { token = text; i = text.size(); return true; } while(i < text.size()) { - if((text[i] & ~0x7f) == 0 ) { - // ASCII character: split on space/newline or lowercase otherwise - if(std::isalnum(text[i])) { + bool is_ascii = (text[i] & ~0x7f) == 0; + if(is_ascii) { + const size_t next_stream_mode = std::isalnum(text[i]) ? 
CHARS : SEPARATORS; + + if(next_stream_mode != stream_mode) { + // We tokenize when `stream_mode` changes + token = out.str(); + + out.str(std::string()); if(normalize) { out << char(std::tolower(text[i])); } else { out << text[i]; } + i++; + + if(stream_mode == SEPARATORS && !keep_separators) { + stream_mode = next_stream_mode; + continue; + } + + token_index = token_counter++; + stream_mode = next_stream_mode; + return true; } else { - bool is_space = text[i] == 32; - bool is_new_line = text[i] == 10; - bool is_whitespace = is_space || is_new_line; - - bool next_char_alphanum = (i != text.length() - 1) && std::isalnum(text[i + 1]); - - if(!normalize && !is_whitespace && (i == text.length() - 1 || !next_char_alphanum)) { - // checking for next char ensures that `foo-bar` does not get split to `foo-` + if(normalize) { + out << char(std::tolower(text[i])); + } else { out << text[i]; } - if(is_whitespace || next_char_alphanum) { - // we split on space or on a special character whose next char is alphanumeric - token = out.str(); - out.clear(); - i++; - - if(!keep_empty && token.empty()) { - continue; - } - - token_index = token_counter++; - return true; - } + i++; + continue; } + } - i++; - continue; + if(stream_mode == SEPARATORS) { // to detect first non-ascii character + // we will tokenize now and treat the following non-ascii chars as a different token + stream_mode = CHARS; + token = out.str(); + out.str(std::string()); + + if(keep_separators) { + token_index = token_counter++; + return true; + } } char inbuf[5]; @@ -90,18 +91,17 @@ bool Tokenizer::next(std::string &token, size_t& token_index) { // symbol cannot be represented as ASCII, so write the original symbol out << inbuf; } else { - // NOTE: outsize indicates bytes available AFTER current position so have to do <= for(size_t out_index=0; out_index<5; out_index++) { if(!normalize) { out << outbuf[out_index]; continue; } - bool is_ascii = ((outbuf[out_index] & ~0x7f) == 0); - bool keep_char = !is_ascii || 
std::isalnum(outbuf[out_index]); + bool unicode_is_ascii = ((outbuf[out_index] & ~0x7f) == 0); + bool keep_char = !unicode_is_ascii || std::isalnum(outbuf[out_index]); if(keep_char) { - if(is_ascii && std::isalnum(outbuf[out_index])) { + if(unicode_is_ascii && std::isalnum(outbuf[out_index])) { outbuf[out_index] = char(std::tolower(outbuf[out_index])); } out << outbuf[out_index]; @@ -111,9 +111,13 @@ bool Tokenizer::next(std::string &token, size_t& token_index) { } token = out.str(); - out.clear(); + out.str(std::string()); - if(!keep_empty && token.empty()) { + if(token.empty()) { + return false; + } + + if(!std::isalnum(token[0]) && !keep_separators) { return false; } diff --git a/test/collection_synonyms_test.cpp b/test/collection_synonyms_test.cpp index 322b0f9a..61b1c742 100644 --- a/test/collection_synonyms_test.cpp +++ b/test/collection_synonyms_test.cpp @@ -359,8 +359,8 @@ TEST_F(CollectionSynonymsTest, MultiWaySynonym) { ASSERT_EQ(2, res["hits"].size()); ASSERT_EQ(2, res["found"].get()); - ASSERT_STREQ("Samuel L. Jackson", res["hits"][0]["highlights"][0]["snippet"].get().c_str()); - ASSERT_STREQ("Samuel L. Jackson", res["hits"][1]["highlights"][0]["snippet"].get().c_str()); + ASSERT_STREQ("Samuel L. Jackson", res["hits"][0]["highlights"][0]["snippet"].get().c_str()); + ASSERT_STREQ("Samuel L. 
Jackson", res["hits"][1]["highlights"][0]["snippet"].get().c_str()); // for now we don't support synonyms on ANY prefix diff --git a/test/collection_test.cpp b/test/collection_test.cpp index 871fc156..65879b1d 100644 --- a/test/collection_test.cpp +++ b/test/collection_test.cpp @@ -164,7 +164,7 @@ TEST_F(CollectionTest, PhraseSearch) { 13: score: 12, (single word match) */ - std::vector ids = {"8", "1", "16", "17", "13"}; + std::vector ids = {"8", "1", "17", "16", "13"}; for(size_t i = 0; i < results["hits"].size(); i++) { nlohmann::json result = results["hits"].at(i); @@ -184,7 +184,7 @@ TEST_F(CollectionTest, PhraseSearch) { ASSERT_EQ(5, results["hits"].size()); ASSERT_EQ(5, results["found"].get()); - ids = {"8", "1", "17", "16", "13"}; + ids = {"8", "17", "1", "16", "13"}; for(size_t i = 0; i < results["hits"].size(); i++) { nlohmann::json result = results["hits"].at(i); @@ -200,7 +200,7 @@ TEST_F(CollectionTest, PhraseSearch) { ASSERT_EQ(3, results["request_params"]["per_page"].get()); - ids = {"8", "1", "16"}; + ids = {"8", "1", "17"}; for(size_t i = 0; i < 3; i++) { nlohmann::json result = results["hits"].at(i); @@ -1958,7 +1958,7 @@ TEST_F(CollectionTest, SearchLargeTextField) { ASSERT_EQ(1, results["hits"].size()); - ASSERT_STREQ("non arcu id lectus accumsan venenatis at at justo.", + ASSERT_STREQ("non arcu id lectus accumsan venenatis at at justo", results["hits"][0]["highlights"][0]["snippet"].get().c_str()); collectionManager.drop_collection("coll_large_text"); @@ -2141,7 +2141,7 @@ TEST_F(CollectionTest, SearchHighlightWithNewLine) { token_ordering::FREQUENCY, true, 10, spp::sparse_hash_set(), spp::sparse_hash_set(), 10, "", 30, 4, "", 40, {}, {}, {}, 0).get(); - ASSERT_STREQ("Blah, blah Stark Industries", + ASSERT_STREQ("Blah, blah\nStark Industries", res["hits"][0]["highlights"][0]["snippet"].get().c_str()); ASSERT_STREQ("Stark", res["hits"][0]["highlights"][0]["matched_tokens"][0].get().c_str()); @@ -3184,7 +3184,7 @@ TEST_F(CollectionTest, 
SearchingForRecordsWithSpecialChars) { std::vector> records = { {"Amazon Home", "https://amazon.com/"}, - {"Google Home", "https://google.com/"}, + {"Google Home", "https://google.com///"}, {"Github Issue", "https://github.com/typesense/typesense/issues/241"}, {"Amazon Search", "https://www.amazon.com/s?k=phone&ref=nb_sb_noss_2"}, }; @@ -3206,12 +3206,17 @@ TEST_F(CollectionTest, SearchingForRecordsWithSpecialChars) { ASSERT_EQ(1, results["found"].get()); ASSERT_STREQ("1", results["hits"][0]["document"]["id"].get().c_str()); + ASSERT_EQ(2, results["hits"][0]["highlights"].size()); + ASSERT_EQ("Google Home", results["hits"][0]["highlights"][0]["snippet"].get()); + ASSERT_EQ("https://google.com///", results["hits"][0]["highlights"][1]["snippet"].get()); + results = coll1->search("amazon.com", {"title", "url"}, "", {}, {}, 2, 10, 1, FREQUENCY).get(); - ASSERT_EQ(2, results["found"].get()); + ASSERT_EQ(3, results["found"].get()); ASSERT_STREQ("3", results["hits"][0]["document"]["id"].get().c_str()); ASSERT_STREQ("0", results["hits"][1]["document"]["id"].get().c_str()); + ASSERT_STREQ("1", results["hits"][2]["document"]["id"].get().c_str()); results = coll1->search("typesense", {"title", "url"}, "", {}, {}, 2, 10, 1, FREQUENCY).get(); @@ -3225,5 +3230,9 @@ TEST_F(CollectionTest, SearchingForRecordsWithSpecialChars) { ASSERT_EQ(1, results["found"].get()); ASSERT_STREQ("3", results["hits"][0]["document"]["id"].get().c_str()); + ASSERT_EQ(1, results["hits"][0]["highlights"].size()); + ASSERT_EQ("https://www.amazon.com/s?k=phone&ref=nb_sb_noss_2", + results["hits"][0]["highlights"][0]["snippet"].get()); + collectionManager.drop_collection("coll1"); -} \ No newline at end of file +} diff --git a/test/tokenizer_test.cpp b/test/tokenizer_test.cpp index 3a4b1264..58123d4c 100644 --- a/test/tokenizer_test.cpp +++ b/test/tokenizer_test.cpp @@ -4,25 +4,23 @@ TEST(TokenizerTest, ShouldTokenizeNormalizeDifferentStrings) { const std::string withnewline = "Michael Jordan:\nWelcome, 
everybody. Welcome! "; std::vector tokens; - Tokenizer(withnewline, true, true, false).tokenize(tokens); - ASSERT_EQ(6, tokens.size()); + Tokenizer(withnewline, false, true, false).tokenize(tokens); + ASSERT_EQ(5, tokens.size()); ASSERT_STREQ("michael", tokens[0].c_str()); ASSERT_STREQ("jordan", tokens[1].c_str()); ASSERT_STREQ("welcome", tokens[2].c_str()); ASSERT_STREQ("everybody", tokens[3].c_str()); ASSERT_STREQ("welcome", tokens[4].c_str()); - ASSERT_STREQ("", tokens[5].c_str()); const std::string withspaces = " Michael Jordan "; tokens.clear(); Tokenizer(withspaces, true, true, false).tokenize(tokens); - ASSERT_EQ(6, tokens.size()); - ASSERT_STREQ("", tokens[0].c_str()); + ASSERT_EQ(5, tokens.size()); + ASSERT_STREQ(" ", tokens[0].c_str()); ASSERT_STREQ("michael", tokens[1].c_str()); - ASSERT_STREQ("", tokens[2].c_str()); + ASSERT_STREQ(" ", tokens[2].c_str()); ASSERT_STREQ("jordan", tokens[3].c_str()); - ASSERT_STREQ("", tokens[4].c_str()); - ASSERT_STREQ("", tokens[5].c_str()); + ASSERT_STREQ(" ", tokens[4].c_str()); tokens.clear(); Tokenizer(withspaces, false, true, false).tokenize(tokens); @@ -30,38 +28,6 @@ TEST(TokenizerTest, ShouldTokenizeNormalizeDifferentStrings) { ASSERT_STREQ("michael", tokens[0].c_str()); ASSERT_STREQ("jordan", tokens[1].c_str()); - const std::string withspecialchars = "Special ½¥ and தமிழ் 你好吗 abcÅà123ß12 here."; - tokens.clear(); - Tokenizer(withspecialchars, false, true, false).tokenize(tokens); - ASSERT_EQ(7, tokens.size()); - ASSERT_STREQ("special", tokens[0].c_str()); - ASSERT_STREQ("12yen", tokens[1].c_str()); - ASSERT_STREQ("and", tokens[2].c_str()); - ASSERT_STREQ("தமிழ்", tokens[3].c_str()); - ASSERT_STREQ("你好吗", tokens[4].c_str()); - ASSERT_STREQ("abcaa123ss12", tokens[5].c_str()); - ASSERT_STREQ("here", tokens[6].c_str()); - - // when normalization is disabled and keep empty is enabled - const std::string withoutnormalize = "Mise à, jour."; - tokens.clear(); - Tokenizer(withoutnormalize, true, false, 
false).tokenize(tokens); - ASSERT_EQ(5, tokens.size()); - ASSERT_STREQ("Mise", tokens[0].c_str()); - ASSERT_STREQ("", tokens[1].c_str()); - ASSERT_STREQ("à,", tokens[2].c_str()); - ASSERT_STREQ("", tokens[3].c_str()); - ASSERT_STREQ("jour.", tokens[4].c_str()); - - // when normalization and keep empty are disabled - const std::string withoutnormalizeandkeepempty = "Mise à jour."; - tokens.clear(); - Tokenizer(withoutnormalizeandkeepempty, false, false, false).tokenize(tokens); - ASSERT_EQ(3, tokens.size()); - ASSERT_STREQ("Mise", tokens[0].c_str()); - ASSERT_STREQ("à", tokens[1].c_str()); - ASSERT_STREQ("jour.", tokens[2].c_str()); - // single token const std::string single_token = "foobar"; tokens.clear(); @@ -89,22 +55,82 @@ TEST(TokenizerTest, ShouldTokenizeNormalizeDifferentStrings) { const std::string multispace_tokens = "foo bar"; tokens.clear(); Tokenizer(multispace_tokens, true, false, false).tokenize(tokens); - ASSERT_EQ(6, tokens.size()); + ASSERT_EQ(3, tokens.size()); ASSERT_STREQ("foo", tokens[0].c_str()); - ASSERT_STREQ("", tokens[1].c_str()); - ASSERT_STREQ("", tokens[2].c_str()); - ASSERT_STREQ("", tokens[3].c_str()); - ASSERT_STREQ("", tokens[4].c_str()); - ASSERT_STREQ("bar", tokens[5].c_str()); + ASSERT_STREQ(" ", tokens[1].c_str()); + ASSERT_STREQ("bar", tokens[2].c_str()); + + // special chars + const std::string specialchar_tokens = "https://www.amazon.com/s?k=phone&ref=nb_sb_noss_2";; + tokens.clear(); + Tokenizer(specialchar_tokens, true, false, false).tokenize(tokens); + ASSERT_EQ(23, tokens.size()); + ASSERT_STREQ("https", tokens[0].c_str()); + ASSERT_STREQ("://", tokens[1].c_str()); + ASSERT_STREQ("www", tokens[2].c_str()); + ASSERT_STREQ(".", tokens[3].c_str()); + ASSERT_STREQ("noss", tokens[20].c_str()); + ASSERT_STREQ("_", tokens[21].c_str()); + ASSERT_STREQ("2", tokens[22].c_str()); // noop tokens.clear(); + const std::string withspecialchars = "Special ½¥ and தமிழ் 你好吗 abcÅà123ß12 here."; Tokenizer(withspecialchars, false, true, 
true).tokenize(tokens); ASSERT_EQ(1, tokens.size()); ASSERT_STREQ(withspecialchars.c_str(), tokens[0].c_str()); } +TEST(TokenizerTest, ShouldTokenizeNormalizeUnicodeStrings) { + std::vector tokens; + + const std::string withspecialchars = "Special ½¥ and -தமிழ் 你2好吗 abcÅà123ß12 here."; + tokens.clear(); + Tokenizer(withspecialchars, false, true, false).tokenize(tokens); + ASSERT_EQ(7, tokens.size()); + ASSERT_STREQ("special", tokens[0].c_str()); + ASSERT_STREQ("12yen", tokens[1].c_str()); + ASSERT_STREQ("and", tokens[2].c_str()); + ASSERT_STREQ("தமிழ்", tokens[3].c_str()); + ASSERT_STREQ("你2好吗", tokens[4].c_str()); + ASSERT_STREQ("abcaa123ss12", tokens[5].c_str()); + ASSERT_STREQ("here", tokens[6].c_str()); + + // when normalization is disabled and keep empty is enabled + const std::string withoutnormalize = "Mise à, jour."; + tokens.clear(); + Tokenizer(withoutnormalize, true, false, false).tokenize(tokens); + ASSERT_EQ(6, tokens.size()); + ASSERT_STREQ("Mise", tokens[0].c_str()); + ASSERT_STREQ(" ", tokens[1].c_str()); + ASSERT_STREQ("à", tokens[2].c_str()); + ASSERT_STREQ(", ", tokens[3].c_str()); + ASSERT_STREQ("jour", tokens[4].c_str()); + ASSERT_STREQ(".", tokens[5].c_str()); + + // when normalization and keep empty are disabled + const std::string withoutnormalizeandkeepempty = "Mise à jour."; + tokens.clear(); + Tokenizer(withoutnormalizeandkeepempty, false, false, false).tokenize(tokens); + ASSERT_EQ(3, tokens.size()); + ASSERT_STREQ("Mise", tokens[0].c_str()); + ASSERT_STREQ("à", tokens[1].c_str()); + ASSERT_STREQ("jour", tokens[2].c_str()); + + // single accented word tokenization + std::string singleword = "à"; + tokens.clear(); + Tokenizer(singleword, false, true, false).tokenize(tokens); + ASSERT_EQ(1, tokens.size()); + ASSERT_STREQ("a", tokens[0].c_str()); + + tokens.clear(); + Tokenizer(singleword, true, true, false).tokenize(tokens); + ASSERT_EQ(1, tokens.size()); + ASSERT_STREQ("a", tokens[0].c_str()); +} + TEST(TokenizerTest, 
ShouldTokenizeIteratively) { const std::string withnewline = "Michael Jordan:\n\nWelcome, everybody. Welcome!"; std::vector tokens; @@ -117,20 +143,34 @@ TEST(TokenizerTest, ShouldTokenizeIteratively) { tokens.push_back(token); } - ASSERT_EQ(6, tokens.size()); + ASSERT_EQ(10, tokens.size()); ASSERT_STREQ("michael", tokens[0].c_str()); - ASSERT_STREQ("jordan", tokens[1].c_str()); - ASSERT_STREQ("", tokens[2].c_str()); - ASSERT_STREQ("welcome", tokens[3].c_str()); - ASSERT_STREQ("everybody", tokens[4].c_str()); - ASSERT_STREQ("welcome", tokens[5].c_str()); + ASSERT_STREQ(" ", tokens[1].c_str()); + ASSERT_STREQ("jordan", tokens[2].c_str()); + ASSERT_STREQ(":\n\n", tokens[3].c_str()); + ASSERT_STREQ("welcome", tokens[4].c_str()); + ASSERT_STREQ(", ", tokens[5].c_str()); + ASSERT_STREQ("everybody", tokens[6].c_str()); + ASSERT_STREQ(". ", tokens[7].c_str()); + ASSERT_STREQ("welcome", tokens[8].c_str()); + ASSERT_STREQ("!", tokens[9].c_str()); + + // check for index when separators are not kept + Tokenizer tokenizer2(withnewline, false, true, false); + size_t expected_token_index = 0; + std::vector expected_tokens = {"michael", "jordan", "welcome", "everybody", "welcome"}; + while(tokenizer2.next(token, token_index)) { + ASSERT_EQ(expected_token_index, token_index); + ASSERT_EQ(expected_tokens[expected_token_index], token); + expected_token_index++; + } // verbatim (no_op=true) tokens.clear(); - Tokenizer tokenizer2(withnewline, true, false, true); + Tokenizer tokenizer3(withnewline, true, false, true); - while(tokenizer2.next(token, token_index)) { + while(tokenizer3.next(token, token_index)) { tokens.push_back(token); }