Combine various token operations in a single flow.
Splitting, normalizing, etc. are now done in a single loop.
This commit is contained in:
parent
6c1455bc2f
commit
6997e35f72
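The net effect at the call sites is that one Tokenizer pass now replaces a StringUtils::split call followed by per-token normalization. A minimal usage sketch (not part of the diff), assuming the tokenizer.h/tokenizer.cpp files added below are compiled into the target:

#include <iostream>
#include <string>
#include <vector>
#include "tokenizer.h"

int main() {
    // Splitting on spaces/newlines, lowercasing and ASCII transliteration happen in one loop.
    const std::string text = "Cell Phone Cases & Clips";
    std::vector<std::string> tokens;
    Tokenizer(text, false, true).tokenize(tokens);   // keep_empty=false, normalize=true

    for(const auto& token: tokens) {
        std::cout << token << std::endl;             // cell, phone, cases, clips
    }
    return 0;
}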
@@ -312,7 +312,6 @@ public:

     void scrub_reindex_doc(nlohmann::json& update_doc, nlohmann::json& del_doc, nlohmann::json& old_doc);

-    void tokenize_doc_field(const nlohmann::json& document, const std::string& field_name, const field& search_field,
-                            std::vector<std::string>& tokens);
+    void tokenize_doc_field(const nlohmann::json& document, const field& search_field, std::vector<std::string>& tokens);

 };
@@ -291,5 +291,5 @@ struct StringUtils {

     static std::string hmac(const std::string& key, const std::string& msg);

-    static size_t unicode_length(const std::string& bytes);
+    //static size_t unicode_length(const std::string& bytes);
 };
include/tokenizer.h (new file, 27 lines)
@@ -0,0 +1,27 @@
+#pragma once
+
+#include <string>
+#include <vector>
+#include <iconv.h>
+
+class Tokenizer {
+private:
+    const std::string& text;
+    size_t i;
+    const bool keep_empty;
+    const bool normalize;
+
+    size_t token_counter = 0;
+    iconv_t cd;
+
+public:
+
+    Tokenizer(const std::string& input, bool keep_empty=true, bool normalize=true):
+            text(input), i(0), keep_empty(keep_empty), normalize(normalize) {
+        cd = iconv_open("ASCII//TRANSLIT", "UTF-8");
+    }
+
+    bool next(std::string& token, size_t& token_index);
+
+    void tokenize(std::vector<std::string>& tokens);
+};
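A short usage sketch of the API above (illustrative only, not part of the diff): keep_empty controls whether empty tokens between consecutive separators are reported, normalize controls lowercasing/transliteration, and next() streams tokens one at a time together with their running index.

#include <iostream>
#include <string>
#include <vector>
#include "tokenizer.h"

int main() {
    const std::string text = "Michael Jordan:\nWelcome, everybody.";

    // Streaming use: token_index is the position of each emitted token.
    Tokenizer tokenizer(text, true, true);   // keep_empty=true, normalize=true
    std::string token;
    size_t token_index = 0;
    while(tokenizer.next(token, token_index)) {
        std::cout << token_index << ": " << token << "\n";   // 0: michael, 1: jordan, ...
    }

    // With normalize=false the whole input comes back verbatim as a single token.
    std::vector<std::string> verbatim;
    Tokenizer(text, true, false).tokenize(verbatim);         // verbatim.size() == 1
    return 0;
}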
src/index.cpp (107 changed lines)
@@ -8,6 +8,7 @@
 #include <match_score.h>
 #include <string_utils.h>
 #include <art.h>
+#include <tokenizer.h>
 #include "logger.h"

 Index::Index(const std::string name, const std::unordered_map<std::string, field> & search_schema,
@@ -337,8 +338,8 @@ void Index::scrub_reindex_doc(nlohmann::json& update_doc, nlohmann::json& del_do
         // Go through all the field names and find the keys+values so that they can be removed from in-memory index
         std::vector<std::string> reindex_tokens;
         std::vector<std::string> old_tokens;
-        tokenize_doc_field(update_doc, field_name, search_field, reindex_tokens);
-        tokenize_doc_field(old_doc, field_name, search_field, old_tokens);
+        tokenize_doc_field(update_doc, search_field, reindex_tokens);
+        tokenize_doc_field(old_doc, search_field, old_tokens);

         if(old_tokens.size() != reindex_tokens.size()) {
             ++it;
@@ -556,28 +557,23 @@ uint64_t Index::facet_token_hash(const field & a_field, const std::string &token

 void Index::index_string_field(const std::string & text, const int64_t score, art_tree *t,
                                uint32_t seq_id, int facet_id, const field & a_field) {
-    std::vector<std::string> tokens;
-    StringUtils::split(text, tokens, " ", a_field.is_string());

     std::unordered_map<std::string, std::vector<uint32_t>> token_to_offsets;

-    for(size_t i=0; i<tokens.size(); i++) {
-        auto & token = tokens[i];
+    Tokenizer tokenizer(text, true, a_field.is_string());
+    std::string token;
+    size_t token_index = 0;
+
+    while(tokenizer.next(token, token_index)) {
         if(token.empty()) {
             continue;
         }

-        if(a_field.is_string()) {
-            string_utils.unicode_normalize(token);
-        }

         if(facet_id >= 0) {
             uint64_t hash = facet_token_hash(a_field, token);
             facet_index_v2[seq_id][facet_id].push_back(hash);
         }

-        token_to_offsets[token].push_back(i);
+        token_to_offsets[token].push_back(token_index);
     }

     /*if(seq_id == 0) {
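For reference (not part of the diff), token_to_offsets simply records every position at which a token occurs in the field; a self-contained sketch of the resulting shape, filled by hand instead of by the Tokenizer loop:

#include <cassert>
#include <cstdint>
#include <string>
#include <unordered_map>
#include <vector>

int main() {
    // Sketch only: for the text "cell phone cell", the token positions are 0, 1, 2.
    std::unordered_map<std::string, std::vector<uint32_t>> token_to_offsets;
    token_to_offsets["cell"].push_back(0);
    token_to_offsets["phone"].push_back(1);
    token_to_offsets["cell"].push_back(2);

    assert(token_to_offsets["cell"].size() == 2 && token_to_offsets["cell"][1] == 2);
    assert(token_to_offsets["phone"].size() == 1);
    return 0;
}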
@@ -596,32 +592,26 @@ void Index::index_string_array_field(const std::vector<std::string> & strings, c
     std::unordered_map<std::string, std::vector<uint32_t>> token_positions;

     for(size_t array_index = 0; array_index < strings.size(); array_index++) {
-        const std::string & str = strings[array_index];
-        std::vector<std::string> tokens;
-        std::string delim = " ";
-        StringUtils::split(str, tokens, delim, a_field.is_string());
+        const std::string& str = strings[array_index];
         std::set<std::string> token_set;  // required to deal with repeating tokens

-        // iterate and append offset positions
-        for(size_t i=0; i<tokens.size(); i++) {
-            auto & token = tokens[i];
+        Tokenizer tokenizer(str, true, a_field.is_string());
+        std::string token;
+        size_t token_index = 0;
+
+        // iterate and append offset positions
+        while(tokenizer.next(token, token_index)) {
             if(token.empty()) {
                 continue;
             }

-            if(a_field.is_string()) {
-                string_utils.unicode_normalize(token);
-            }

             if(facet_id >= 0) {
                 uint64_t hash = facet_token_hash(a_field, token);
                 facet_index_v2[seq_id][facet_id].push_back(hash);
                 //printf("indexing %.*s - %llu\n", token.size(), token.c_str(), hash);
             }

-            token_positions[token].push_back(i);
+            token_positions[token].push_back(token_index);
             token_set.insert(token);
         }
@@ -630,13 +620,13 @@ void Index::index_string_array_field(const std::vector<std::string> & strings, c
         }

         // repeat last element to indicate end of offsets for this array index
-        for(auto & token: token_set) {
-            token_positions[token].push_back(token_positions[token].back());
+        for(auto & the_token: token_set) {
+            token_positions[the_token].push_back(token_positions[the_token].back());
         }

         // iterate and append this array index to all tokens
-        for(auto & token: token_set) {
-            token_positions[token].push_back(array_index);
+        for(auto & the_token: token_set) {
+            token_positions[the_token].push_back(array_index);
         }
     }
@@ -750,17 +740,14 @@ void Index::do_facets(std::vector<facet> & facets, facet_query_t & facet_query,
             }
         }

-        std::vector<std::string> query_tokens;
-        StringUtils::split(facet_query.query, query_tokens, " ");

         // for non-string fields, `faceted_name` returns their aliased stringified field name
        art_tree *t = search_index.at(facet_field.faceted_name());

+        std::vector<std::string> query_tokens;
+        Tokenizer(facet_query.query, false, facet_field.is_string()).tokenize(query_tokens);
+
         for (size_t qtoken_index = 0; qtoken_index < query_tokens.size(); qtoken_index++) {
             auto &q = query_tokens[qtoken_index];
-            if (facet_field.is_string()) {
-                string_utils.unicode_normalize(q);
-            }

             int bounded_cost = (q.size() < 3) ? 0 : 1;
             bool prefix_search = (qtoken_index ==
@@ -1062,9 +1049,6 @@ Option<uint32_t> Index::do_filtering(uint32_t** filter_ids_out, const std::vecto
             size_t ids_size = 0;

             for(const std::string & filter_value: a_filter.values) {
-                std::vector<std::string> str_tokens;
-                StringUtils::split(filter_value, str_tokens, " ");
-
                 uint32_t* strt_ids = nullptr;
                 size_t strt_ids_size = 0;
@@ -1072,8 +1056,14 @@ Option<uint32_t> Index::do_filtering(uint32_t** filter_ids_out, const std::vecto

                 // there could be multiple tokens in a filter value, which we have to treat as ANDs
                 // e.g. country: South Africa
-                for(auto& str_token: str_tokens) {
-                    string_utils.unicode_normalize(str_token);
+                Tokenizer tokenizer(filter_value, false, true);
+                std::string str_token;
+                size_t token_index = 0;
+                std::vector<std::string> str_tokens;
+
+                while(tokenizer.next(str_token, token_index)) {
+                    str_tokens.push_back(str_token);

                     art_leaf* leaf = (art_leaf *) art_search(t, (const unsigned char*) str_token.c_str(),
                                                              str_token.length()+1);
@@ -1296,15 +1286,15 @@ void Index::collate_included_ids(const std::string & query, const std::string &
     }

     // calculate match_score and add to topster independently
-    std::vector<std::string> tokens;
-    StringUtils::split(query, tokens, " ");

     std::vector<art_leaf *> override_query;

-    for(size_t token_index = 0; token_index < tokens.size(); token_index++) {
-        const auto token = tokens[token_index];
+    Tokenizer tokenizer(query, false, true);
+    std::string token;
+    size_t token_index = 0;
+
+    while(tokenizer.next(token, token_index)) {
         const size_t token_len = token.length();
-        string_utils.unicode_normalize(tokens[token_index]);

         std::vector<art_leaf*> leaves;
         art_fuzzy_search(search_index.at(field), (const unsigned char *) token.c_str(), token_len,
@@ -1483,8 +1473,6 @@ void Index::search_field(const uint8_t & field_id, const std::string & query, co
                          uint32_t** all_result_ids, size_t & all_result_ids_len,
                          const token_ordering token_order, const bool prefix,
                          const size_t drop_tokens_threshold, const size_t typo_tokens_threshold) {
-    std::vector<std::string> tokens;
-    StringUtils::split(query, tokens, " ");

     const size_t max_cost = (num_typos < 0 || num_typos > 2) ? 2 : num_typos;
@@ -1496,19 +1484,22 @@ void Index::search_field(const uint8_t & field_id, const std::string & query, co

     std::vector<std::vector<int>> token_to_costs;

-    for(size_t token_index = 0; token_index < tokens.size(); token_index++) {
-        std::vector<int> all_costs;
-        const size_t token_len = tokens[token_index].length();
+    Tokenizer tokenizer(query, false, true);
+    std::string token;
+    size_t token_index = 0;
+    std::vector<std::string> tokens;
+
+    while(tokenizer.next(token, token_index)) {
+        std::vector<int> all_costs;
         // This ensures that we don't end up doing a cost of 1 for a single char etc.
-        int bounded_cost = get_bounded_typo_cost(max_cost, token_len);
+        int bounded_cost = get_bounded_typo_cost(max_cost, token.length());

         for(int cost = 0; cost <= bounded_cost; cost++) {
             all_costs.push_back(cost);
         }

         token_to_costs.push_back(all_costs);
-        string_utils.unicode_normalize(tokens[token_index]);
+        tokens.push_back(token);
     }

     // stores candidates for each token, i.e. i-th index would have all possible tokens with a cost of "c"
@@ -1934,14 +1925,13 @@ Option<uint32_t> Index::remove(const uint32_t seq_id, const nlohmann::json & doc

         // Go through all the field names and find the keys+values so that they can be removed from in-memory index
         std::vector<std::string> tokens;
-        tokenize_doc_field(document, field_name, search_field, tokens);
+        tokenize_doc_field(document, search_field, tokens);

         for(auto & token: tokens) {
             const unsigned char *key;
             int key_len;

             if(search_field.type == field_types::STRING_ARRAY || search_field.type == field_types::STRING) {
-                string_utils.unicode_normalize(token);
                 key = (const unsigned char *) token.c_str();
                 key_len = (int) (token.length() + 1);
             } else {
@@ -1998,14 +1988,17 @@ Option<uint32_t> Index::remove(const uint32_t seq_id, const nlohmann::json & doc
     return Option<uint32_t>(seq_id);
 }

-void Index::tokenize_doc_field(const nlohmann::json& document, const std::string& field_name, const field& search_field,
+void Index::tokenize_doc_field(const nlohmann::json& document, const field& search_field,
                                std::vector<std::string>& tokens) {

+    const std::string& field_name = search_field.name;
+
     if(search_field.type == field_types::STRING) {
-        StringUtils::split(document[field_name], tokens, " ", true);
+        Tokenizer(document[field_name], true, search_field.is_string()).tokenize(tokens);
     } else if(search_field.type == field_types::STRING_ARRAY) {
         const std::vector<std::string>& values = document[field_name].get<std::vector<std::string>>();
         for(const std::string & value: values) {
-            StringUtils::split(value, tokens, " ", true);
+            Tokenizer(value, true, search_field.is_string()).tokenize(tokens);
         }
     } else if(search_field.type == field_types::INT32) {
         const int KEY_LEN = 8;
@@ -3,7 +3,6 @@
 #include <openssl/evp.h>
 #include <openssl/hmac.h>
 #include <random>
-#include <codecvt>

 std::string lower_and_no_special_chars(const std::string & str) {
     std::stringstream ss;

@@ -31,6 +30,12 @@ void StringUtils::unicode_normalize(std::string & str) const {
         char inbuf[5];
         char *p = inbuf;

+        if((*s & ~0x7f) == 0 ) {
+            // ascii character
+            out << *s++;
+            continue;
+        }
+
         // group bytes to form a unicode representation
         *p++ = *s++;
         if ((*s & 0xC0) == 0x80) *p++ = *s++;
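For reference (not part of the diff), the two bit tests used above and in the new tokenizer can be checked in isolation: a byte with the high bit clear is plain ASCII, and a byte matching 10xxxxxx is a UTF-8 continuation byte. A minimal, self-contained sketch:

#include <cassert>

// Sketch only: the same byte classification used by unicode_normalize()/Tokenizer.
static bool is_ascii(char c)        { return (c & ~0x7f) == 0; }   // high bit clear
static bool is_continuation(char c) { return (c & 0xC0) == 0x80; } // 10xxxxxx

int main() {
    const char* s = "aé";           // 'a' (1 byte) followed by 'é' (2 bytes: 0xC3 0xA9)
    assert(is_ascii(s[0]));
    assert(!is_ascii(s[1]));        // 0xC3: lead byte of a 2-byte sequence
    assert(is_continuation(s[2]));  // 0xA9: continuation byte
    return 0;
}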
@@ -101,7 +106,7 @@ std::string StringUtils::str2hex(const std::string &str, bool capital) {
     return hexstr;
 }

-size_t StringUtils::unicode_length(const std::string& bytes) {
+/*size_t StringUtils::unicode_length(const std::string& bytes) {
     std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> utf8conv;
     return utf8conv.from_bytes(bytes).size();
-}
+}*/
src/tokenizer.cpp (new file, 103 lines)
@@ -0,0 +1,103 @@
+#include <sstream>
+#include "tokenizer.h"
+
+bool Tokenizer::next(std::string &token, size_t& token_index) {
+    std::stringstream out;
+
+    if(i >= text.size()) {
+        return false;
+    }
+
+    if(!normalize) {
+        token = text;
+        i = text.size();
+        return true;
+    }
+
+    while(i < text.size()) {
+        if((text[i] & ~0x7f) == 0 ) {
+            // ASCII character: split on space/newline or lowercase otherwise
+            bool is_space = text[i] == 32;
+            bool is_new_line = text[i] == 10;
+            bool space_or_newline = (is_space || is_new_line);
+
+            if(space_or_newline) {
+                i++;
+                token = out.str();
+                out.clear();
+
+                if(!keep_empty && token.empty()) {
+                    continue;
+                }
+
+                token_index = token_counter++;
+                return true;
+            }
+
+            if(std::isalnum(text[i])) {
+                out << char(std::tolower(text[i]));
+            }
+
+            i++;
+            continue;
+        }
+
+        char inbuf[5];
+        char *p = inbuf;
+
+        // group bytes to form a unicode representation
+        *p++ = text[i++];
+        if ((text[i] & 0xC0) == 0x80) *p++ = text[i++];
+        if ((text[i] & 0xC0) == 0x80) *p++ = text[i++];
+        if ((text[i] & 0xC0) == 0x80) *p++ = text[i++];
+        *p = 0;
+        size_t insize = (p - &inbuf[0]);
+
+        char outbuf[5] = {};
+        size_t outsize = sizeof(outbuf);
+        char *outptr = outbuf;
+        char *inptr = inbuf;
+
+        //printf("[%s]\n", inbuf);
+
+        errno = 0;
+        iconv(cd, &inptr, &insize, &outptr, &outsize);
+
+        if(errno == EILSEQ) {
+            // symbol cannot be represented as ASCII, so write the original symbol
+            out << inbuf;
+        } else {
+            // NOTE: outsize indicates bytes available AFTER current position so have to do <=
+            for(size_t out_index=0; out_index<5; out_index++) {
+                bool is_ascii = ((outbuf[out_index] & ~0x7f) == 0);
+                bool keep_char = !is_ascii || std::isalnum(outbuf[out_index]);
+
+                if(keep_char) {
+                    if(is_ascii && std::isalnum(outbuf[out_index])) {
+                        outbuf[out_index] = std::tolower(outbuf[out_index]);
+                    }
+                    out << outbuf[out_index];
+                }
+            }
+        }
+    }
+
+    token = out.str();
+    out.clear();
+
+    if(!keep_empty && token.empty()) {
+        return false;
+    }
+
+    token_index = token_counter++;
+    return true;
+}
+
+void Tokenizer::tokenize(std::vector<std::string> &tokens) {
+    std::string token;
+    size_t token_index;
+
+    while(next(token, token_index)) {
+        tokens.push_back(token);
+    }
+}
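To illustrate the transliteration step above in isolation (this snippet is not part of the commit): the tokenizer's non-ASCII path hands each multi-byte UTF-8 sequence to iconv with the "ASCII//TRANSLIT" target, which substitutes ASCII approximations for characters like ß or ½ where the C library supports it.

#include <iconv.h>
#include <cstdio>
#include <cstring>

// Sketch only: standalone demonstration of the iconv conversion the Tokenizer relies on.
// The exact transliteration output ("ss", "1/2", ...) depends on the platform's iconv.
int main() {
    iconv_t cd = iconv_open("ASCII//TRANSLIT", "UTF-8");
    if(cd == (iconv_t) -1) {
        perror("iconv_open");
        return 1;
    }

    char in[] = "Åß½";                       // UTF-8 input with no direct ASCII equivalents
    char out[32] = {};
    char *inptr = in, *outptr = out;
    size_t insize = strlen(in), outsize = sizeof(out) - 1;

    iconv(cd, &inptr, &insize, &outptr, &outsize);
    printf("%s\n", out);                     // e.g. "Ass1/2" with glibc

    iconv_close(cd);
    return 0;
}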
@@ -558,12 +558,12 @@ TEST_F(CollectionFacetingTest, FacetCountsHighlighting)
     ASSERT_STREQ("categories", results["facet_counts"][0]["field_name"].get<std::string>().c_str());

     ASSERT_EQ(1, results["facet_counts"][0]["counts"][0]["count"].get<size_t>());
-    ASSERT_STREQ("Cell Phone Cases & Clips", results["facet_counts"][0]["counts"][0]["value"].get<std::string>().c_str());
-    ASSERT_STREQ("<mark>Cell</mark> Phone Cases & Clips", results["facet_counts"][0]["counts"][0]["highlighted"].get<std::string>().c_str());
+    ASSERT_STREQ("Cell Phones", results["facet_counts"][0]["counts"][0]["value"].get<std::string>().c_str());
+    ASSERT_STREQ("<mark>Cell</mark> Phones", results["facet_counts"][0]["counts"][0]["highlighted"].get<std::string>().c_str());

     ASSERT_EQ(1, results["facet_counts"][0]["counts"][1]["count"].get<size_t>());
-    ASSERT_STREQ("Cell Phones", results["facet_counts"][0]["counts"][1]["value"].get<std::string>().c_str());
-    ASSERT_STREQ("<mark>Cell</mark> Phones", results["facet_counts"][0]["counts"][1]["highlighted"].get<std::string>().c_str());
+    ASSERT_STREQ("Cell Phone Cases & Clips", results["facet_counts"][0]["counts"][1]["value"].get<std::string>().c_str());
+    ASSERT_STREQ("<mark>Cell</mark> Phone Cases & Clips", results["facet_counts"][0]["counts"][1]["highlighted"].get<std::string>().c_str());

     ASSERT_EQ(1, results["facet_counts"][0]["counts"][2]["count"].get<size_t>());
     ASSERT_STREQ("Cell Phone Accessories", results["facet_counts"][0]["counts"][2]["value"].get<std::string>().c_str());
@@ -5,7 +5,7 @@
 TEST(IndexTest, ScrubReindexDoc) {
     std::unordered_map<std::string, field> search_schema;
     search_schema.emplace("title", field("title", field_types::STRING, false));
-    search_schema.emplace("points", field("title", field_types::INT32, false));
+    search_schema.emplace("points", field("points", field_types::INT32, false));
     search_schema.emplace("cast", field("cast", field_types::STRING_ARRAY, false));
     search_schema.emplace("movie", field("movie", field_types::BOOL, false));
test/tokenizer_test.cpp (new file, 82 lines)
@@ -0,0 +1,82 @@
+#include <gtest/gtest.h>
+#include "tokenizer.h"
+
+TEST(TokenizerTest, ShouldTokenizeNormalizeDifferentStrings) {
+    const std::string withnewline = "Michael Jordan:\nWelcome, everybody. Welcome!";
+    std::vector<std::string> tokens;
+    Tokenizer(withnewline, true, true).tokenize(tokens);
+    ASSERT_EQ(5, tokens.size());
+    ASSERT_STREQ("michael", tokens[0].c_str());
+    ASSERT_STREQ("jordan", tokens[1].c_str());
+    ASSERT_STREQ("welcome", tokens[2].c_str());
+    ASSERT_STREQ("everybody", tokens[3].c_str());
+    ASSERT_STREQ("welcome", tokens[4].c_str());
+
+    const std::string withspaces = " Michael Jordan ";
+    tokens.clear();
+    Tokenizer(withspaces, true, true).tokenize(tokens);
+    ASSERT_EQ(5, tokens.size());
+    ASSERT_STREQ("", tokens[0].c_str());
+    ASSERT_STREQ("michael", tokens[1].c_str());
+    ASSERT_STREQ("", tokens[2].c_str());
+    ASSERT_STREQ("jordan", tokens[3].c_str());
+    ASSERT_STREQ("", tokens[4].c_str());
+
+    tokens.clear();
+    Tokenizer(withspaces, false, true).tokenize(tokens);
+    ASSERT_EQ(2, tokens.size());
+    ASSERT_STREQ("michael", tokens[0].c_str());
+    ASSERT_STREQ("jordan", tokens[1].c_str());
+
+    const std::string withspecialchars = "Special ½¥ and தமிழ் 你好吗 abcÅà123ß12 here.";
+    tokens.clear();
+    Tokenizer(withspecialchars, false, true).tokenize(tokens);
+    ASSERT_EQ(7, tokens.size());
+    ASSERT_STREQ("special", tokens[0].c_str());
+    ASSERT_STREQ("12yen", tokens[1].c_str());
+    ASSERT_STREQ("and", tokens[2].c_str());
+    ASSERT_STREQ("தமிழ்", tokens[3].c_str());
+    ASSERT_STREQ("你好吗", tokens[4].c_str());
+    ASSERT_STREQ("abcaa123ss12", tokens[5].c_str());
+    ASSERT_STREQ("here", tokens[6].c_str());
+
+    // when normalize is false, should be verbatim
+    tokens.clear();
+    Tokenizer(withspecialchars, false, false).tokenize(tokens);
+    ASSERT_EQ(1, tokens.size());
+    ASSERT_STREQ(withspecialchars.c_str(), tokens[0].c_str());
+}
+
+TEST(TokenizerTest, ShouldTokenizeIteratively) {
+    const std::string withnewline = "Michael Jordan:\n\nWelcome, everybody. Welcome!";
+    std::vector<std::string> tokens;
+    Tokenizer tokenizer1(withnewline, true, true);
+
+    std::string token;
+    size_t token_index;
+
+    while(tokenizer1.next(token, token_index)) {
+        tokens.push_back(token);
+    }
+
+    ASSERT_EQ(6, tokens.size());
+    ASSERT_STREQ("michael", tokens[0].c_str());
+    ASSERT_STREQ("jordan", tokens[1].c_str());
+    ASSERT_STREQ("", tokens[2].c_str());
+    ASSERT_STREQ("welcome", tokens[3].c_str());
+    ASSERT_STREQ("everybody", tokens[4].c_str());
+    ASSERT_STREQ("welcome", tokens[5].c_str());
+
+    // verbatim (normalize=false)
+    tokens.clear();
+    Tokenizer tokenizer2(withnewline, true, false);
+
+    while(tokenizer2.next(token, token_index)) {
+        tokens.push_back(token);
+    }
+
+    ASSERT_EQ(1, tokens.size());
+    ASSERT_STREQ("Michael Jordan:\n\nWelcome, everybody. Welcome!", tokens[0].c_str());
+}