Fix highlighting of strings with special characters.

This commit is contained in:
kishorenc 2021-03-20 12:58:30 +05:30
parent fcdd8ec9c9
commit c2eec85277
9 changed files with 278 additions and 176 deletions

View File

@@ -389,10 +389,10 @@ public:
static void transform_for_180th_meridian(GeoCoord& point, double offset);
// the following methods are not synchronized because their parent calls are synchronized
art_leaf* get_token_leaf(const std::string & field_name, const unsigned char* token, uint32_t token_len);
// the following methods are not synchronized because their parent calls are synchronized
uint32_t do_filtering(uint32_t** filter_ids_out, const std::vector<filter> & filters) const;
static Option<uint32_t> validate_index_in_memory(nlohmann::json &document, uint32_t seq_id,

View File

@@ -30,6 +30,10 @@ struct TokenOffset {
bool operator>(const TokenOffset &a) const {
return offset > a.offset;
}
bool operator<(const TokenOffset &a) const {
return offset < a.offset;
}
};
struct Match {

View File

@@ -8,19 +8,32 @@ class Tokenizer {
private:
const std::string& text;
size_t i;
const bool keep_empty;
const bool keep_separators;
const bool normalize;
const bool no_op;
size_t token_counter = 0;
iconv_t cd;
static const size_t CHARS = 0;
static const size_t SEPARATORS = 1;
size_t stream_mode;
std::stringstream out;
public:
explicit Tokenizer(const std::string& input,
bool keep_empty=true, bool normalize=true, bool no_op=false):
text(input), i(0), keep_empty(keep_empty), normalize(normalize), no_op(no_op) {
bool keep_separators=true, bool normalize=true, bool no_op=false):
text(input), i(0), keep_separators(keep_separators), normalize(normalize), no_op(no_op) {
cd = iconv_open("ASCII//TRANSLIT", "UTF-8");
if(!input.empty() && (std::isalnum(text[0]) || (text[i] & ~0x7f) != 0)) {
// alphanum or non-ascii
stream_mode = CHARS;
} else {
stream_mode = SEPARATORS;
}
}
~Tokenizer() {
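
The constructor's second argument is now keep_separators (previously keep_empty), and stream_mode starts out as CHARS or SEPARATORS depending on the first character of the input. A minimal usage sketch, assuming the tokenizer.h shown above is on the include path; the expected token lists are taken from the tokenizer tests later in this commit:

#include <iostream>
#include <string>
#include <vector>
#include "tokenizer.h"

int main() {
    const std::string title = " Michael Jordan ";
    std::vector<std::string> tokens;

    // keep_separators=true: separator runs come back as their own tokens
    Tokenizer(title, true, true, false).tokenize(tokens);
    // expected: " ", "michael", " ", "jordan", " "
    for(const auto& t: tokens) std::cout << "[" << t << "]";
    std::cout << std::endl;

    // keep_separators=false: only the alphanumeric / non-ASCII runs remain
    tokens.clear();
    Tokenizer(title, false, true, false).tokenize(tokens);
    // expected: "michael", "jordan"
    for(const auto& t: tokens) std::cout << "[" << t << "]";
    std::cout << std::endl;
    return 0;
}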

View File

@@ -1223,20 +1223,27 @@ void Collection::parse_search_query(const std::string &query, std::vector<std::s
q_include_tokens = {query};
} else {
std::vector<std::string> tokens;
StringUtils::split(query, tokens, " ");
Tokenizer(query, true, true).tokenize(tokens);
bool exclude_operator_prior = false;
for(std::string& token: tokens) {
if(token[0] == '-') {
std::string&& just_token = token.substr(1);
Tokenizer(just_token, false, true).tokenize(just_token);
if(!just_token.empty()) {
q_exclude_tokens.push_back(just_token);
}
for(const auto& token: tokens) {
if(token.empty()) {
continue;
}
if(token == "-" || token == " -") {
exclude_operator_prior = true;
}
if(!std::isalnum(token[0])) {
continue;
}
if(exclude_operator_prior) {
q_exclude_tokens.push_back(token);
exclude_operator_prior = false;
} else {
Tokenizer(token, false, true).tokenize(token);
if(!token.empty()) {
q_include_tokens.push_back(token);
}
q_include_tokens.push_back(token);
}
}
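
In the new parse loop above, the query is tokenized with separators kept, a bare "-" (or " -") separator token arms exclude_operator_prior, separator tokens themselves are skipped, and the next word token is routed to q_exclude_tokens. A standalone sketch of that routing (not the actual Collection::parse_search_query; the helper name and sample tokens are illustrative):

#include <cctype>
#include <iostream>
#include <string>
#include <vector>

struct ParsedQuery {
    std::vector<std::string> include_tokens;
    std::vector<std::string> exclude_tokens;
};

// Mirrors the include/exclude routing above, given tokens produced with separators kept.
ParsedQuery split_include_exclude(const std::vector<std::string>& tokens) {
    ParsedQuery parsed;
    bool exclude_operator_prior = false;

    for(const auto& token: tokens) {
        if(token.empty()) {
            continue;
        }
        if(token == "-" || token == " -") {
            exclude_operator_prior = true;      // the next word token is negated
        }
        if(!std::isalnum(static_cast<unsigned char>(token[0]))) {
            continue;                           // skip separator tokens such as " " or " -"
        }
        if(exclude_operator_prior) {
            parsed.exclude_tokens.push_back(token);
            exclude_operator_prior = false;
        } else {
            parsed.include_tokens.push_back(token);
        }
    }
    return parsed;
}

int main() {
    // e.g. the query `space ship -rocket`, tokenized with separators kept
    ParsedQuery parsed = split_include_exclude({"space", " ", "ship", " -", "rocket"});
    for(const auto& t: parsed.include_tokens) std::cout << "include: " << t << std::endl;  // space, ship
    for(const auto& t: parsed.exclude_tokens) std::cout << "exclude: " << t << std::endl;  // rocket
    return 0;
}
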
@@ -1383,7 +1390,9 @@ void Collection::highlight_result(const field &search_field,
// is from the best matched field and need not be present in other fields of a document.
Index* index = indices[field_order_kv->key % num_memory_shards];
art_leaf *actual_leaf = index->get_token_leaf(search_field.name, &token_leaf->key[0], token_leaf->key_len);
if(actual_leaf != nullptr) {
//LOG(INFO) << "field: " << search_field.name << ", key: " << actual_leaf->key;
query_suggestion.push_back(actual_leaf);
std::vector<uint16_t> positions;
uint32_t doc_index = actual_leaf->values->ids.indexOf(field_order_kv->key);
@@ -1432,66 +1441,84 @@ void Collection::highlight_result(const field &search_field,
std::partial_sort(match_indices.begin(), match_indices.begin()+max_array_matches, match_indices.end());
for(size_t index = 0; index < max_array_matches; index++) {
const match_index_t & match_index = match_indices[index];
const Match & match = match_index.match;
std::sort(match_indices[index].match.offsets.begin(), match_indices[index].match.offsets.end());
const auto& match_index = match_indices[index];
const Match& match = match_index.match;
std::vector<std::string> tokens;
const std::string& text = (search_field.type == field_types::STRING) ? document[search_field.name] : document[search_field.name][match_index.index];
Tokenizer tokenizer(text, true, false);
if(search_field.type == field_types::STRING) {
Tokenizer(document[search_field.name], true, false).tokenize(tokens);
} else {
Tokenizer(document[search_field.name][match_index.index], true, false).tokenize(tokens);
}
std::string raw_token;
size_t raw_token_index = 0;
int indexed_token_index = -1;
size_t match_offset_index = 0;
std::vector<size_t> token_indices;
std::set<size_t> token_indices;
spp::sparse_hash_set<std::string> token_hits;
std::vector<std::string> raw_tokens;
std::unordered_map<size_t, size_t> indexed_to_raw;
for(size_t i = 0; i < match.offsets.size(); i++) {
if(match.offsets[i].offset != MAX_DISPLACEMENT) {
size_t token_index = (size_t)(match.offsets[i].offset);
token_indices.push_back(token_index);
if(token_index >= tokens.size()) {
LOG(ERROR) << "Highlight token index " << token_index << " is greater than length of store field.";
continue;
}
std::string token = tokens[token_index];
Tokenizer(token, true, true).tokenize(token);
token_hits.insert(token);
while(tokenizer.next(raw_token, raw_token_index)) {
if(!raw_token.empty() && (std::isalnum(raw_token[0]) || (raw_token[0] & ~0x7f) != 0)) {
// check for actual token (first char is NOT alphanum or ascii)
indexed_token_index++;
indexed_to_raw[indexed_token_index] = raw_token_index;
/*LOG(INFO) << "raw_token: " << raw_token << ", indexed_token_index: " << indexed_token_index
<< ", raw_token_index: " << raw_token_index;*/
}
if (match_offset_index < match.offsets.size() &&
match.offsets[match_offset_index].offset == indexed_token_index) {
std::string indexed_token;
Tokenizer(raw_token, true, true).tokenize(indexed_token);
if(token_indices.count(indexed_token_index) == 0) {
// repetition could occur, for e.g. in the case of synonym constructed queries
token_indices.insert(indexed_token_index);
token_hits.insert(indexed_token);
}
match_offset_index++;
}
raw_tokens.push_back(raw_token);
}
size_t num_indexed_tokens = indexed_token_index + 1;
auto minmax = std::minmax_element(token_indices.begin(), token_indices.end());
size_t prefix_length = highlight_affix_num_tokens;
size_t suffix_length = highlight_affix_num_tokens + 1;
size_t suffix_length = highlight_affix_num_tokens;
// For longer strings, pick surrounding tokens within `prefix_length` of min_index and max_index for snippet
const size_t start_index = (tokens.size() <= snippet_threshold) ? 0 :
std::max(0, (int)(*(minmax.first) - prefix_length));
if(num_indexed_tokens == 0) {
continue;
}
const size_t end_index = (tokens.size() <= snippet_threshold) ? tokens.size() :
std::min((int)tokens.size(), (int)(*(minmax.second) + suffix_length));
// For longer strings, pick surrounding raw_tokens within `prefix_length` of min_index and max_index for snippet
const size_t start_index = (num_indexed_tokens <= snippet_threshold) ? 0 :
indexed_to_raw[std::max(0, (int)(*(minmax.first) - prefix_length))];
const size_t end_index = (num_indexed_tokens <= snippet_threshold) ? raw_tokens.size() - 1 :
indexed_to_raw[std::min((int)num_indexed_tokens - 1, (int)(*(minmax.second) + suffix_length))];
std::stringstream snippet_stream;
highlight.matched_tokens.emplace_back();
std::vector<std::string>& matched_tokens = highlight.matched_tokens.back();
size_t snippet_index = start_index;
for(size_t snippet_index = start_index; snippet_index < end_index; snippet_index++) {
if(snippet_index != start_index) {
snippet_stream << " ";
}
while(snippet_index <= end_index) {
std::string normalized_token;
Tokenizer(raw_tokens[snippet_index], true, true).tokenize(normalized_token);
std::string token = tokens[snippet_index];
Tokenizer(token, true, true).tokenize(token);
if(token_hits.count(token) != 0) {
snippet_stream << highlight_start_tag << tokens[snippet_index] << highlight_end_tag;
matched_tokens.push_back(tokens[snippet_index]);
if(token_hits.count(normalized_token) != 0) {
snippet_stream << highlight_start_tag << raw_tokens[snippet_index] << highlight_end_tag;
matched_tokens.push_back(raw_tokens[snippet_index]);
} else {
snippet_stream << tokens[snippet_index];
snippet_stream << raw_tokens[snippet_index];
}
snippet_index++;
}
highlight.snippets.push_back(snippet_stream.str());
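
The highlighter above now tokenizes the stored field with separators kept (raw_tokens), counts only word tokens as indexed positions, and keeps an indexed_to_raw map so that the snippet window computed over indexed offsets can be rendered from the original raw tokens. A simplified standalone sketch of that bookkeeping (the sample tokens are illustrative):

#include <cctype>
#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>

int main() {
    // Raw tokens as produced with keep_separators=true, e.g. for "Stark Industries, Inc."
    std::vector<std::string> raw_tokens = {"Stark", " ", "Industries", ", ", "Inc", "."};

    std::unordered_map<size_t, size_t> indexed_to_raw;
    int indexed_token_index = -1;

    for(size_t raw_index = 0; raw_index < raw_tokens.size(); raw_index++) {
        unsigned char first = raw_tokens[raw_index][0];
        // a word token starts with an alphanumeric or non-ASCII character
        bool is_word_token = std::isalnum(first) || (first & ~0x7f) != 0;
        if(is_word_token) {
            indexed_to_raw[++indexed_token_index] = raw_index;
        }
    }

    // An indexed offset from the posting list (say token #1, "Industries") maps back
    // to raw position 2, so the snippet can emit the original, un-normalized text.
    std::cout << "indexed 1 -> raw " << indexed_to_raw[1] << ": "
              << raw_tokens[indexed_to_raw[1]] << std::endl;
    return 0;
}
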
@@ -1501,18 +1528,14 @@ void Collection::highlight_result(const field &search_field,
if(highlighted_fully) {
std::stringstream value_stream;
for(size_t value_index = 0; value_index < tokens.size(); value_index++) {
if(value_index != 0) {
value_stream << " ";
}
for(size_t value_index = 0; value_index < raw_tokens.size(); value_index++) {
std::string normalized_token;
Tokenizer(raw_tokens[value_index], true, true).tokenize(normalized_token);
std::string token = tokens[value_index];
Tokenizer(token, true, true).tokenize(token);
if(token_hits.count(token) != 0) {
value_stream << highlight_start_tag << tokens[value_index] << highlight_end_tag;
if(token_hits.count(normalized_token) != 0) {
value_stream << highlight_start_tag << raw_tokens[value_index] << highlight_end_tag;
} else {
value_stream << tokens[value_index];
value_stream << raw_tokens[value_index];
}
}

View File

@@ -575,7 +575,7 @@ void Index::index_string_field(const std::string & text, const int64_t score, ar
uint32_t seq_id, bool is_facet, const field & a_field) {
std::unordered_map<std::string, std::vector<uint32_t>> token_to_offsets;
Tokenizer tokenizer(text, true, true, !a_field.is_string());
Tokenizer tokenizer(text, false, true, !a_field.is_string());
std::string token;
size_t token_index = 0;
@@ -588,7 +588,6 @@ void Index::index_string_field(const std::string & text, const int64_t score, ar
if(is_facet) {
uint64_t hash = facet_token_hash(a_field, token);
//facet_index_v2[seq_id][facet_id].push_back(hash);
facet_hashes.push_back(hash);
}
@@ -623,7 +622,7 @@ void Index::index_string_array_field(const std::vector<std::string> & strings, c
const std::string& str = strings[array_index];
std::set<std::string> token_set; // required to deal with repeating tokens
Tokenizer tokenizer(str, true, true, !a_field.is_string());
Tokenizer tokenizer(str, false, true, !a_field.is_string());
std::string token;
size_t token_index = 0;
@@ -2216,6 +2215,8 @@ void Index::populate_token_positions(const std::vector<art_leaf *>& query_sugges
for(size_t i = 0; i < query_suggestion.size(); i++) {
const art_leaf* token_leaf = query_suggestion[i];
uint32_t doc_index = leaf_to_indices[i][result_index];
/*LOG(INFO) << "doc_id: " << token_leaf->values->ids.at(doc_index) << ", token_leaf->values->ids.getLength(): "
<< token_leaf->values->ids.getLength();*/
// it's possible for a query token to not appear in a resulting document
if(doc_index == token_leaf->values->ids.getLength()) {
@@ -2229,7 +2230,14 @@ void Index::populate_token_positions(const std::vector<art_leaf *>& query_sugges
/*uint32_t* offsets = token_leaf->values->offsets.uncompress();
for(size_t ii=0; ii < token_leaf->values->offsets.getLength(); ii++) {
LOG(INFO) << "offset: " << offsets[ii];
}*/
}
uint32_t* offset_indices = token_leaf->values->offset_index.uncompress();
for(size_t ii=0; ii < token_leaf->values->offset_index.getLength(); ii++) {
LOG(INFO) << "offset index: " << offset_indices[ii];
}
LOG(INFO) << "token_leaf->values->offsets.getLength(): " << token_leaf->values->offsets.getLength();*/
uint32_t start_offset = token_leaf->values->offset_index.at(doc_index);
uint32_t end_offset = (doc_index == token_leaf->values->ids.getLength() - 1) ?
@@ -2464,6 +2472,7 @@ void Index::tokenize_doc_field(const nlohmann::json& document, const field& sear
}
art_leaf* Index::get_token_leaf(const std::string & field_name, const unsigned char* token, uint32_t token_len) {
std::shared_lock lock(mutex);
const art_tree *t = search_index.at(field_name);
return (art_leaf*) art_search(t, token, (int) token_len);
}
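
With the indexing paths above switched to keep_separators=false, posting-list offsets count only word tokens, which is what lets the highlighter's indexed_token_index line up with them. A minimal sketch of that index-side convention, assuming the Typesense tokenizer.h header is available; the sample text mirrors the special-character tests in this commit:

#include <cstdint>
#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>
#include "tokenizer.h"

int main() {
    std::unordered_map<std::string, std::vector<uint32_t>> token_to_offsets;
    const std::string text = "Google Home, https://google.com///";

    std::string token;
    size_t token_index = 0;

    // keep_separators=false, normalize=true: offsets count positions among word tokens only
    Tokenizer tokenizer(text, false, true, false);
    while(tokenizer.next(token, token_index)) {
        token_to_offsets[token].push_back(static_cast<uint32_t>(token_index));
    }

    for(const auto& kv: token_to_offsets) {
        std::cout << kv.first << " -> " << kv.second.size() << " offset(s)" << std::endl;
    }
    return 0;
}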

View File

@@ -2,62 +2,63 @@
#include "tokenizer.h"
bool Tokenizer::next(std::string &token, size_t& token_index) {
std::stringstream out;
if(i >= text.size()) {
if(i == text.size() && !text.empty() && text.back() == ' ') {
token = "";
i++;
return true;
if(no_op) {
if(i == text.size()) {
return false;
}
return false;
}
if(no_op) {
token = text;
i = text.size();
return true;
}
while(i < text.size()) {
if((text[i] & ~0x7f) == 0 ) {
// ASCII character: split on space/newline or lowercase otherwise
if(std::isalnum(text[i])) {
bool is_ascii = (text[i] & ~0x7f) == 0;
if(is_ascii) {
const size_t next_stream_mode = std::isalnum(text[i]) ? CHARS : SEPARATORS;
if(next_stream_mode != stream_mode) {
// We tokenize when `stream_mode` changes
token = out.str();
out.str(std::string());
if(normalize) {
out << char(std::tolower(text[i]));
} else {
out << text[i];
}
i++;
if(stream_mode == SEPARATORS && !keep_separators) {
stream_mode = next_stream_mode;
continue;
}
token_index = token_counter++;
stream_mode = next_stream_mode;
return true;
} else {
bool is_space = text[i] == 32;
bool is_new_line = text[i] == 10;
bool is_whitespace = is_space || is_new_line;
bool next_char_alphanum = (i != text.length() - 1) && std::isalnum(text[i + 1]);
if(!normalize && !is_whitespace && (i == text.length() - 1 || !next_char_alphanum)) {
// checking for next char ensures that `foo-bar` does not get split to `foo-`
if(normalize) {
out << char(std::tolower(text[i]));
} else {
out << text[i];
}
if(is_whitespace || next_char_alphanum) {
// we split on space or on a special character whose next char is alphanumeric
token = out.str();
out.clear();
i++;
if(!keep_empty && token.empty()) {
continue;
}
token_index = token_counter++;
return true;
}
i++;
continue;
}
}
i++;
continue;
if(stream_mode == SEPARATORS) { // to detect first non-ascii character
// we will tokenize now and treat the following non-ascii chars as a different token
stream_mode = CHARS;
token = out.str();
out.str(std::string());
if(keep_separators) {
token_index = token_counter++;
return true;
}
}
char inbuf[5];
@@ -90,18 +91,17 @@ bool Tokenizer::next(std::string &token, size_t& token_index) {
// symbol cannot be represented as ASCII, so write the original symbol
out << inbuf;
} else {
// NOTE: outsize indicates bytes available AFTER current position so have to do <=
for(size_t out_index=0; out_index<5; out_index++) {
if(!normalize) {
out << outbuf[out_index];
continue;
}
bool is_ascii = ((outbuf[out_index] & ~0x7f) == 0);
bool keep_char = !is_ascii || std::isalnum(outbuf[out_index]);
bool unicode_is_ascii = ((outbuf[out_index] & ~0x7f) == 0);
bool keep_char = !unicode_is_ascii || std::isalnum(outbuf[out_index]);
if(keep_char) {
if(is_ascii && std::isalnum(outbuf[out_index])) {
if(unicode_is_ascii && std::isalnum(outbuf[out_index])) {
outbuf[out_index] = char(std::tolower(outbuf[out_index]));
}
out << outbuf[out_index];
@@ -111,9 +111,13 @@ bool Tokenizer::next(std::string &token, size_t& token_index) {
}
token = out.str();
out.clear();
out.str(std::string());
if(!keep_empty && token.empty()) {
if(token.empty()) {
return false;
}
if(!std::isalnum(token[0]) && !keep_separators) {
return false;
}
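
The rewritten next() above is essentially a two-state machine over ASCII input: characters accumulate while their class (alphanumeric vs separator) stays the same, a token is emitted on every class change, and separator tokens are dropped when keep_separators is false; non-ASCII runs go through the iconv transliteration path. A standalone, ASCII-only sketch of that state machine (the iconv/Unicode handling is omitted; the function name is illustrative):

#include <cctype>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

std::vector<std::string> split_runs(const std::string& text, bool keep_separators) {
    enum { CHARS, SEPARATORS };
    std::vector<std::string> tokens;
    if(text.empty()) {
        return tokens;
    }

    std::stringstream out;
    int stream_mode = std::isalnum(static_cast<unsigned char>(text[0])) ? CHARS : SEPARATORS;

    for(char c: text) {
        int next_stream_mode = std::isalnum(static_cast<unsigned char>(c)) ? CHARS : SEPARATORS;
        if(next_stream_mode != stream_mode) {
            // class change: flush the buffered run as one token
            if(stream_mode == CHARS || keep_separators) {
                tokens.push_back(out.str());
            }
            out.str(std::string());
            stream_mode = next_stream_mode;
        }
        out << static_cast<char>(std::tolower(static_cast<unsigned char>(c)));
    }
    if(stream_mode == CHARS || keep_separators) {
        tokens.push_back(out.str());  // trailing run
    }
    return tokens;
}

int main() {
    for(const auto& t: split_runs("https://www.amazon.com/s?k=phone", true)) {
        std::cout << "[" << t << "]";
    }
    // prints: [https][://][www][.][amazon][.][com][/][s][?][k][=][phone]
    std::cout << std::endl;
    return 0;
}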

View File

@@ -359,8 +359,8 @@ TEST_F(CollectionSynonymsTest, MultiWaySynonym) {
ASSERT_EQ(2, res["hits"].size());
ASSERT_EQ(2, res["found"].get<uint32_t>());
ASSERT_STREQ("<mark>Samuel</mark> <mark>L.</mark> <mark>Jackson</mark>", res["hits"][0]["highlights"][0]["snippet"].get<std::string>().c_str());
ASSERT_STREQ("<mark>Samuel</mark> <mark>L.</mark> <mark>Jackson</mark>", res["hits"][1]["highlights"][0]["snippet"].get<std::string>().c_str());
ASSERT_STREQ("<mark>Samuel</mark> <mark>L</mark>. <mark>Jackson</mark>", res["hits"][0]["highlights"][0]["snippet"].get<std::string>().c_str());
ASSERT_STREQ("<mark>Samuel</mark> <mark>L</mark>. <mark>Jackson</mark>", res["hits"][1]["highlights"][0]["snippet"].get<std::string>().c_str());
// for now we don't support synonyms on ANY prefix

View File

@@ -164,7 +164,7 @@ TEST_F(CollectionTest, PhraseSearch) {
13: score: 12, (single word match)
*/
std::vector<std::string> ids = {"8", "1", "16", "17", "13"};
std::vector<std::string> ids = {"8", "1", "17", "16", "13"};
for(size_t i = 0; i < results["hits"].size(); i++) {
nlohmann::json result = results["hits"].at(i);
@@ -184,7 +184,7 @@ TEST_F(CollectionTest, PhraseSearch) {
ASSERT_EQ(5, results["hits"].size());
ASSERT_EQ(5, results["found"].get<uint32_t>());
ids = {"8", "1", "17", "16", "13"};
ids = {"8", "17", "1", "16", "13"};
for(size_t i = 0; i < results["hits"].size(); i++) {
nlohmann::json result = results["hits"].at(i);
@@ -200,7 +200,7 @@ TEST_F(CollectionTest, PhraseSearch) {
ASSERT_EQ(3, results["request_params"]["per_page"].get<size_t>());
ids = {"8", "1", "16"};
ids = {"8", "1", "17"};
for(size_t i = 0; i < 3; i++) {
nlohmann::json result = results["hits"].at(i);
@@ -1958,7 +1958,7 @@ TEST_F(CollectionTest, SearchLargeTextField) {
ASSERT_EQ(1, results["hits"].size());
ASSERT_STREQ("non arcu id lectus <mark>accumsan</mark> venenatis at at justo.",
ASSERT_STREQ("non arcu id lectus <mark>accumsan</mark> venenatis at at justo",
results["hits"][0]["highlights"][0]["snippet"].get<std::string>().c_str());
collectionManager.drop_collection("coll_large_text");
@@ -2141,7 +2141,7 @@ TEST_F(CollectionTest, SearchHighlightWithNewLine) {
token_ordering::FREQUENCY, true, 10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 40, {}, {}, {}, 0).get();
ASSERT_STREQ("Blah, blah <mark>Stark</mark> Industries",
ASSERT_STREQ("Blah, blah\n<mark>Stark</mark> Industries",
res["hits"][0]["highlights"][0]["snippet"].get<std::string>().c_str());
ASSERT_STREQ("Stark", res["hits"][0]["highlights"][0]["matched_tokens"][0].get<std::string>().c_str());
@@ -3184,7 +3184,7 @@ TEST_F(CollectionTest, SearchingForRecordsWithSpecialChars) {
std::vector<std::vector<std::string>> records = {
{"Amazon Home", "https://amazon.com/"},
{"Google Home", "https://google.com/"},
{"Google Home", "https://google.com///"},
{"Github Issue", "https://github.com/typesense/typesense/issues/241"},
{"Amazon Search", "https://www.amazon.com/s?k=phone&ref=nb_sb_noss_2"},
};
@@ -3206,12 +3206,17 @@ TEST_F(CollectionTest, SearchingForRecordsWithSpecialChars) {
ASSERT_EQ(1, results["found"].get<size_t>());
ASSERT_STREQ("1", results["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_EQ(2, results["hits"][0]["highlights"].size());
ASSERT_EQ("<mark>Google</mark> Home", results["hits"][0]["highlights"][0]["snippet"].get<std::string>());
ASSERT_EQ("https://<mark>google</mark>.com///", results["hits"][0]["highlights"][1]["snippet"].get<std::string>());
results = coll1->search("amazon.com",
{"title", "url"}, "", {}, {}, 2, 10, 1, FREQUENCY).get();
ASSERT_EQ(2, results["found"].get<size_t>());
ASSERT_EQ(3, results["found"].get<size_t>());
ASSERT_STREQ("3", results["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("0", results["hits"][1]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("1", results["hits"][2]["document"]["id"].get<std::string>().c_str());
results = coll1->search("typesense",
{"title", "url"}, "", {}, {}, 2, 10, 1, FREQUENCY).get();
@@ -3225,5 +3230,9 @@ TEST_F(CollectionTest, SearchingForRecordsWithSpecialChars) {
ASSERT_EQ(1, results["found"].get<size_t>());
ASSERT_STREQ("3", results["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_EQ(1, results["hits"][0]["highlights"].size());
ASSERT_EQ("https://www.amazon.com/s?k=phone&ref=<mark>nb</mark>_<mark>sb</mark>_<mark>noss</mark>_<mark>2</mark>",
results["hits"][0]["highlights"][0]["snippet"].get<std::string>());
collectionManager.drop_collection("coll1");
}
}

View File

@@ -4,25 +4,23 @@
TEST(TokenizerTest, ShouldTokenizeNormalizeDifferentStrings) {
const std::string withnewline = "Michael Jordan:\nWelcome, everybody. Welcome! ";
std::vector<std::string> tokens;
Tokenizer(withnewline, true, true, false).tokenize(tokens);
ASSERT_EQ(6, tokens.size());
Tokenizer(withnewline, false, true, false).tokenize(tokens);
ASSERT_EQ(5, tokens.size());
ASSERT_STREQ("michael", tokens[0].c_str());
ASSERT_STREQ("jordan", tokens[1].c_str());
ASSERT_STREQ("welcome", tokens[2].c_str());
ASSERT_STREQ("everybody", tokens[3].c_str());
ASSERT_STREQ("welcome", tokens[4].c_str());
ASSERT_STREQ("", tokens[5].c_str());
const std::string withspaces = " Michael Jordan ";
tokens.clear();
Tokenizer(withspaces, true, true, false).tokenize(tokens);
ASSERT_EQ(6, tokens.size());
ASSERT_STREQ("", tokens[0].c_str());
ASSERT_EQ(5, tokens.size());
ASSERT_STREQ(" ", tokens[0].c_str());
ASSERT_STREQ("michael", tokens[1].c_str());
ASSERT_STREQ("", tokens[2].c_str());
ASSERT_STREQ(" ", tokens[2].c_str());
ASSERT_STREQ("jordan", tokens[3].c_str());
ASSERT_STREQ("", tokens[4].c_str());
ASSERT_STREQ("", tokens[5].c_str());
ASSERT_STREQ(" ", tokens[4].c_str());
tokens.clear();
Tokenizer(withspaces, false, true, false).tokenize(tokens);
@@ -30,38 +28,6 @@ TEST(TokenizerTest, ShouldTokenizeNormalizeDifferentStrings) {
ASSERT_STREQ("michael", tokens[0].c_str());
ASSERT_STREQ("jordan", tokens[1].c_str());
const std::string withspecialchars = "Special ½¥ and தமிழ் 你好吗 abcÅà123ß12 here.";
tokens.clear();
Tokenizer(withspecialchars, false, true, false).tokenize(tokens);
ASSERT_EQ(7, tokens.size());
ASSERT_STREQ("special", tokens[0].c_str());
ASSERT_STREQ("12yen", tokens[1].c_str());
ASSERT_STREQ("and", tokens[2].c_str());
ASSERT_STREQ("தமிழ்", tokens[3].c_str());
ASSERT_STREQ("你好吗", tokens[4].c_str());
ASSERT_STREQ("abcaa123ss12", tokens[5].c_str());
ASSERT_STREQ("here", tokens[6].c_str());
// when normalization is disabled and keep empty is enabled
const std::string withoutnormalize = "Mise à, jour.";
tokens.clear();
Tokenizer(withoutnormalize, true, false, false).tokenize(tokens);
ASSERT_EQ(5, tokens.size());
ASSERT_STREQ("Mise", tokens[0].c_str());
ASSERT_STREQ("", tokens[1].c_str());
ASSERT_STREQ("à,", tokens[2].c_str());
ASSERT_STREQ("", tokens[3].c_str());
ASSERT_STREQ("jour.", tokens[4].c_str());
// when normalization and keep empty are disabled
const std::string withoutnormalizeandkeepempty = "Mise à jour.";
tokens.clear();
Tokenizer(withoutnormalizeandkeepempty, false, false, false).tokenize(tokens);
ASSERT_EQ(3, tokens.size());
ASSERT_STREQ("Mise", tokens[0].c_str());
ASSERT_STREQ("à", tokens[1].c_str());
ASSERT_STREQ("jour.", tokens[2].c_str());
// single token
const std::string single_token = "foobar";
tokens.clear();
@@ -89,22 +55,82 @@ TEST(TokenizerTest, ShouldTokenizeNormalizeDifferentStrings) {
const std::string multispace_tokens = "foo bar";
tokens.clear();
Tokenizer(multispace_tokens, true, false, false).tokenize(tokens);
ASSERT_EQ(6, tokens.size());
ASSERT_EQ(3, tokens.size());
ASSERT_STREQ("foo", tokens[0].c_str());
ASSERT_STREQ("", tokens[1].c_str());
ASSERT_STREQ("", tokens[2].c_str());
ASSERT_STREQ("", tokens[3].c_str());
ASSERT_STREQ("", tokens[4].c_str());
ASSERT_STREQ("bar", tokens[5].c_str());
ASSERT_STREQ(" ", tokens[1].c_str());
ASSERT_STREQ("bar", tokens[2].c_str());
// special chars
const std::string specialchar_tokens = "https://www.amazon.com/s?k=phone&ref=nb_sb_noss_2";;
tokens.clear();
Tokenizer(specialchar_tokens, true, false, false).tokenize(tokens);
ASSERT_EQ(23, tokens.size());
ASSERT_STREQ("https", tokens[0].c_str());
ASSERT_STREQ("://", tokens[1].c_str());
ASSERT_STREQ("www", tokens[2].c_str());
ASSERT_STREQ(".", tokens[3].c_str());
ASSERT_STREQ("noss", tokens[20].c_str());
ASSERT_STREQ("_", tokens[21].c_str());
ASSERT_STREQ("2", tokens[22].c_str());
// noop
tokens.clear();
const std::string withspecialchars = "Special ½¥ and தமிழ் 你好吗 abcÅà123ß12 here.";
Tokenizer(withspecialchars, false, true, true).tokenize(tokens);
ASSERT_EQ(1, tokens.size());
ASSERT_STREQ(withspecialchars.c_str(), tokens[0].c_str());
}
TEST(TokenizerTest, ShouldTokenizeNormalizeUnicodeStrings) {
std::vector<std::string> tokens;
const std::string withspecialchars = "Special ½¥ and -தமிழ் 你2好吗 abcÅà123ß12 here.";
tokens.clear();
Tokenizer(withspecialchars, false, true, false).tokenize(tokens);
ASSERT_EQ(7, tokens.size());
ASSERT_STREQ("special", tokens[0].c_str());
ASSERT_STREQ("12yen", tokens[1].c_str());
ASSERT_STREQ("and", tokens[2].c_str());
ASSERT_STREQ("தமிழ்", tokens[3].c_str());
ASSERT_STREQ("你2好吗", tokens[4].c_str());
ASSERT_STREQ("abcaa123ss12", tokens[5].c_str());
ASSERT_STREQ("here", tokens[6].c_str());
// when normalization is disabled and keep empty is enabled
const std::string withoutnormalize = "Mise à, jour.";
tokens.clear();
Tokenizer(withoutnormalize, true, false, false).tokenize(tokens);
ASSERT_EQ(6, tokens.size());
ASSERT_STREQ("Mise", tokens[0].c_str());
ASSERT_STREQ(" ", tokens[1].c_str());
ASSERT_STREQ("à", tokens[2].c_str());
ASSERT_STREQ(", ", tokens[3].c_str());
ASSERT_STREQ("jour", tokens[4].c_str());
ASSERT_STREQ(".", tokens[5].c_str());
// when normalization and keep empty are disabled
const std::string withoutnormalizeandkeepempty = "Mise à jour.";
tokens.clear();
Tokenizer(withoutnormalizeandkeepempty, false, false, false).tokenize(tokens);
ASSERT_EQ(3, tokens.size());
ASSERT_STREQ("Mise", tokens[0].c_str());
ASSERT_STREQ("à", tokens[1].c_str());
ASSERT_STREQ("jour", tokens[2].c_str());
// single accented word tokenization
std::string singleword = "à";
tokens.clear();
Tokenizer(singleword, false, true, false).tokenize(tokens);
ASSERT_EQ(1, tokens.size());
ASSERT_STREQ("a", tokens[0].c_str());
tokens.clear();
Tokenizer(singleword, true, true, false).tokenize(tokens);
ASSERT_EQ(1, tokens.size());
ASSERT_STREQ("a", tokens[0].c_str());
}
TEST(TokenizerTest, ShouldTokenizeIteratively) {
const std::string withnewline = "Michael Jordan:\n\nWelcome, everybody. Welcome!";
std::vector<std::string> tokens;
@@ -117,20 +143,34 @@ TEST(TokenizerTest, ShouldTokenizeIteratively) {
tokens.push_back(token);
}
ASSERT_EQ(6, tokens.size());
ASSERT_EQ(10, tokens.size());
ASSERT_STREQ("michael", tokens[0].c_str());
ASSERT_STREQ("jordan", tokens[1].c_str());
ASSERT_STREQ("", tokens[2].c_str());
ASSERT_STREQ("welcome", tokens[3].c_str());
ASSERT_STREQ("everybody", tokens[4].c_str());
ASSERT_STREQ("welcome", tokens[5].c_str());
ASSERT_STREQ(" ", tokens[1].c_str());
ASSERT_STREQ("jordan", tokens[2].c_str());
ASSERT_STREQ(":\n\n", tokens[3].c_str());
ASSERT_STREQ("welcome", tokens[4].c_str());
ASSERT_STREQ(", ", tokens[5].c_str());
ASSERT_STREQ("everybody", tokens[6].c_str());
ASSERT_STREQ(". ", tokens[7].c_str());
ASSERT_STREQ("welcome", tokens[8].c_str());
ASSERT_STREQ("!", tokens[9].c_str());
// check for index when separators are not kept
Tokenizer tokenizer2(withnewline, false, true, false);
size_t expected_token_index = 0;
std::vector<std::string> expected_tokens = {"michael", "jordan", "welcome", "everybody", "welcome"};
while(tokenizer2.next(token, token_index)) {
ASSERT_EQ(expected_token_index, token_index);
ASSERT_EQ(expected_tokens[expected_token_index], token);
expected_token_index++;
}
// verbatim (no_op=true)
tokens.clear();
Tokenizer tokenizer2(withnewline, true, false, true);
Tokenizer tokenizer3(withnewline, true, false, true);
while(tokenizer2.next(token, token_index)) {
while(tokenizer3.next(token, token_index)) {
tokens.push_back(token);
}