Tokenize on special characters.

kishorenc 2021-03-16 11:39:53 +05:30
parent af6d2e94e5
commit f501b137b7
4 changed files with 68 additions and 23 deletions

View File

@@ -23,27 +23,37 @@ bool Tokenizer::next(std::string &token, size_t& token_index) {
     while(i < text.size()) {
         if((text[i] & ~0x7f) == 0 ) {
             // ASCII character: split on space/newline or lowercase otherwise
-            bool is_space = text[i] == 32;
-            bool is_new_line = text[i] == 10;
-            bool space_or_newline = (is_space || is_new_line);
-
-            if(space_or_newline) {
-                i++;
-                token = out.str();
-                out.clear();
-
-                if(!keep_empty && token.empty()) {
-                    continue;
-                }
-
-                token_index = token_counter++;
-                return true;
-            }
-
-            if(!normalize) {
-                out << text[i];
-            } else if(std::isalnum(text[i])) {
-                out << char(std::tolower(text[i]));
+            if(std::isalnum(text[i])) {
+                if(normalize) {
+                    out << char(std::tolower(text[i]));
+                } else {
+                    out << text[i];
+                }
+            } else {
+                bool is_space = text[i] == 32;
+                bool is_new_line = text[i] == 10;
+                bool is_whitespace = is_space || is_new_line;
+
+                bool next_char_alphanum = (i != text.length() - 1) && std::isalnum(text[i + 1]);
+
+                if(!normalize && !is_whitespace && (i == text.length() - 1 || !next_char_alphanum)) {
+                    // checking for next char ensures that `foo-bar` does not get split to `foo-`
+                    out << text[i];
+                }
+
+                if(is_whitespace || next_char_alphanum) {
+                    // we split on space or on a special character whose next char is alphanumeric
+                    token = out.str();
+                    out.clear();
+                    i++;
+
+                    if(!keep_empty && token.empty()) {
+                        continue;
+                    }
+
+                    token_index = token_counter++;
+                    return true;
+                }
             }
 
             i++;
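
The look-ahead comment above is the subtle part of the change: if the special character were appended to `out` before the split, `foo-bar` would come out as a dangling `foo-` followed by `bar`. The new tests below pin this down; for example (the call and expected tokens are copied from the test diff, while the meaning of the individual boolean flags is an assumption from context):

    std::vector<std::string> tokens;
    Tokenizer("foo-bar-baz", false, false, false).tokenize(tokens);
    // tokens == {"foo", "bar", "baz"} -- split on "-", never a dangling "foo-"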

View File

@@ -164,7 +164,7 @@ TEST_F(CollectionTest, PhraseSearch) {
        13: score: 12, (single word match)
     */
 
-    std::vector<std::string> ids = {"8", "1", "17", "16", "13"};
+    std::vector<std::string> ids = {"8", "1", "16", "17", "13"};
 
     for(size_t i = 0; i < results["hits"].size(); i++) {
         nlohmann::json result = results["hits"].at(i);
@@ -176,7 +176,7 @@ TEST_F(CollectionTest, PhraseSearch) {
     ASSERT_EQ(results["hits"][0]["highlights"].size(), (unsigned long) 1);
     ASSERT_STREQ(results["hits"][0]["highlights"][0]["field"].get<std::string>().c_str(), "title");
     ASSERT_STREQ(results["hits"][0]["highlights"][0]["snippet"].get<std::string>().c_str(),
-                 "What is the power requirement of a <mark>rocket</mark> <mark>launch</mark> these days?");
+                 "What is the power, requirement of a <mark>rocket</mark> <mark>launch</mark> these days?");
 
     // Check ASC sort order
     std::vector<sort_by> sort_fields_asc = { sort_by(sort_field_const::text_match, "DESC"), sort_by("points", "ASC") };
@@ -184,7 +184,7 @@ TEST_F(CollectionTest, PhraseSearch) {
     ASSERT_EQ(5, results["hits"].size());
     ASSERT_EQ(5, results["found"].get<uint32_t>());
 
-    ids = {"8", "17", "1", "16", "13"};
+    ids = {"8", "1", "17", "16", "13"};
 
     for(size_t i = 0; i < results["hits"].size(); i++) {
         nlohmann::json result = results["hits"].at(i);
@@ -200,7 +200,7 @@ TEST_F(CollectionTest, PhraseSearch) {
     ASSERT_EQ(3, results["request_params"]["per_page"].get<size_t>());
 
-    ids = {"8", "1", "17"};
+    ids = {"8", "1", "16"};
 
     for(size_t i = 0; i < 3; i++) {
         nlohmann::json result = results["hits"].at(i);

View File

@@ -5,7 +5,7 @@
 {"id": "foo", "points":13,"title":"The heaviest martian spacecraft"}
 {"points":13,"title":"To what extent are the US modules of ISS based on the Spacelab design?"}
 {"points":12,"title":"Could future astronauts eat during EVAs?"}
-{"points":12,"title":"What is the power requirement of a rocket launch these days?"}
+{"points":12,"title":"What is the power, requirement of a rocket launch these days?"}
 {"points":12,"title":"How does plant growing medium not scatter around?"}
 {"points":12,"title":"Is there research for the optimal small crew size for a long space voyage?"}
 {"points":12,"title":"Do long term missions receive insurance coverage?"}

View File

@@ -43,13 +43,13 @@ TEST(TokenizerTest, ShouldTokenizeNormalizeDifferentStrings) {
     ASSERT_STREQ("here", tokens[6].c_str());
 
     // when normalization is disabled and keep empty is enabled
-    const std::string withoutnormalize = "Mise à jour.";
+    const std::string withoutnormalize = "Mise à, jour.";
     tokens.clear();
     Tokenizer(withoutnormalize, true, false, false).tokenize(tokens);
     ASSERT_EQ(5, tokens.size());
     ASSERT_STREQ("Mise", tokens[0].c_str());
     ASSERT_STREQ("", tokens[1].c_str());
-    ASSERT_STREQ("à", tokens[2].c_str());
+    ASSERT_STREQ("à,", tokens[2].c_str());
     ASSERT_STREQ("", tokens[3].c_str());
     ASSERT_STREQ("jour.", tokens[4].c_str());
@@ -62,6 +62,41 @@ TEST(TokenizerTest, ShouldTokenizeNormalizeDifferentStrings) {
     ASSERT_STREQ("à", tokens[1].c_str());
     ASSERT_STREQ("jour.", tokens[2].c_str());
 
+    // single token
+    const std::string single_token = "foobar";
+    tokens.clear();
+    Tokenizer(single_token, false, false, false).tokenize(tokens);
+    ASSERT_EQ(1, tokens.size());
+    ASSERT_STREQ("foobar", tokens[0].c_str());
+
+    // split tokens
+    const std::string split_tokens = "foo-bar-baz";
+    tokens.clear();
+    Tokenizer(split_tokens, false, false, false).tokenize(tokens);
+    ASSERT_EQ(3, tokens.size());
+    ASSERT_STREQ("foo", tokens[0].c_str());
+    ASSERT_STREQ("bar", tokens[1].c_str());
+    ASSERT_STREQ("baz", tokens[2].c_str());
+
+    tokens.clear();
+    Tokenizer(split_tokens, false, true, false).tokenize(tokens);
+    ASSERT_EQ(3, tokens.size());
+    ASSERT_STREQ("foo", tokens[0].c_str());
+    ASSERT_STREQ("bar", tokens[1].c_str());
+    ASSERT_STREQ("baz", tokens[2].c_str());
+
+    // multiple spaces
+    const std::string multispace_tokens = "foo     bar";
+    tokens.clear();
+    Tokenizer(multispace_tokens, true, false, false).tokenize(tokens);
+    ASSERT_EQ(6, tokens.size());
+    ASSERT_STREQ("foo", tokens[0].c_str());
+    ASSERT_STREQ("", tokens[1].c_str());
+    ASSERT_STREQ("", tokens[2].c_str());
+    ASSERT_STREQ("", tokens[3].c_str());
+    ASSERT_STREQ("", tokens[4].c_str());
+    ASSERT_STREQ("bar", tokens[5].c_str());
+
     // noop
     tokens.clear();