mirror of https://github.com/typesense/typesense.git
synced 2025-05-17 20:22:32 +08:00

Tokenize on special characters.

commit f501b137b7
parent af6d2e94e5
@@ -23,27 +23,37 @@ bool Tokenizer::next(std::string &token, size_t& token_index) {
     while(i < text.size()) {
         if((text[i] & ~0x7f) == 0 ) {
             // ASCII character: split on space/newline or lowercase otherwise
-            bool is_space = text[i] == 32;
-            bool is_new_line = text[i] == 10;
-            bool space_or_newline = (is_space || is_new_line);
+            if(std::isalnum(text[i])) {
+                if(normalize) {
+                    out << char(std::tolower(text[i]));
+                } else {
+                    out << text[i];
+                }
+            } else {
+                bool is_space = text[i] == 32;
+                bool is_new_line = text[i] == 10;
+                bool is_whitespace = is_space || is_new_line;
 
-            if(space_or_newline) {
-                i++;
-                token = out.str();
-                out.clear();
+                bool next_char_alphanum = (i != text.length() - 1) && std::isalnum(text[i + 1]);
 
-                if(!keep_empty && token.empty()) {
-                    continue;
+                if(!normalize && !is_whitespace && (i == text.length() - 1 || !next_char_alphanum)) {
+                    // checking for next char ensures that `foo-bar` does not get split to `foo-`
+                    out << text[i];
                 }
 
-                token_index = token_counter++;
-                return true;
-            }
+                if(is_whitespace || next_char_alphanum) {
+                    // we split on space or on a special character whose next char is alphanumeric
+                    token = out.str();
+                    out.clear();
+                    i++;
 
-            if(!normalize) {
-                out << text[i];
-            } else if(std::isalnum(text[i])) {
-                out << char(std::tolower(text[i]));
+                    if(!keep_empty && token.empty()) {
+                        continue;
+                    }
+
+                    token_index = token_counter++;
+                    return true;
+                }
             }
 
             i++;
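Taken on its own, the new ASCII branch implements a compact rule: alphanumeric characters accumulate into the current token, whitespace always ends the token, and any other special character ends it only when the character after it is alphanumeric (otherwise, in verbatim mode, the special character stays in the token). A minimal standalone sketch of that rule, using a hypothetical split_ascii helper and omitting the real Tokenizer's keep_empty filtering and non-ASCII handling:

#include <cctype>
#include <iostream>
#include <string>
#include <vector>

// Hypothetical helper illustrating the split rule above; not the library API.
std::vector<std::string> split_ascii(const std::string& text, bool normalize) {
    std::vector<std::string> tokens;
    std::string out;
    for(size_t i = 0; i < text.size(); i++) {
        unsigned char c = text[i];
        if(std::isalnum(c)) {
            out += normalize ? char(std::tolower(c)) : char(c);
            continue;
        }
        bool is_whitespace = (c == ' ' || c == '\n');
        bool next_char_alphanum = (i + 1 < text.size()) && std::isalnum((unsigned char) text[i + 1]);
        if(!normalize && !is_whitespace && !next_char_alphanum) {
            out += char(c);  // verbatim mode keeps trailing specials, e.g. the `.` in `jour.`
        }
        if(is_whitespace || next_char_alphanum) {
            tokens.push_back(out);  // split point: `foo-bar` yields `foo`, `bar`, never `foo-`
            out.clear();
        }
    }
    tokens.push_back(out);  // flush the final token
    return tokens;
}

int main() {
    for(const std::string& t : split_ascii("foo-bar, baz", false)) {
        std::cout << "[" << t << "]\n";  // prints [foo] [bar,] [baz]
    }
}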
@@ -164,7 +164,7 @@ TEST_F(CollectionTest, PhraseSearch) {
        13: score: 12, (single word match)
     */
 
-    std::vector<std::string> ids = {"8", "1", "17", "16", "13"};
+    std::vector<std::string> ids = {"8", "1", "16", "17", "13"};
 
     for(size_t i = 0; i < results["hits"].size(); i++) {
         nlohmann::json result = results["hits"].at(i);
@@ -176,7 +176,7 @@ TEST_F(CollectionTest, PhraseSearch) {
     ASSERT_EQ(results["hits"][0]["highlights"].size(), (unsigned long) 1);
     ASSERT_STREQ(results["hits"][0]["highlights"][0]["field"].get<std::string>().c_str(), "title");
     ASSERT_STREQ(results["hits"][0]["highlights"][0]["snippet"].get<std::string>().c_str(),
-                 "What is the power requirement of a <mark>rocket</mark> <mark>launch</mark> these days?");
+                 "What is the power, requirement of a <mark>rocket</mark> <mark>launch</mark> these days?");
 
     // Check ASC sort order
     std::vector<sort_by> sort_fields_asc = { sort_by(sort_field_const::text_match, "DESC"), sort_by("points", "ASC") };
@@ -184,7 +184,7 @@ TEST_F(CollectionTest, PhraseSearch) {
     ASSERT_EQ(5, results["hits"].size());
     ASSERT_EQ(5, results["found"].get<uint32_t>());
 
-    ids = {"8", "17", "1", "16", "13"};
+    ids = {"8", "1", "17", "16", "13"};
 
     for(size_t i = 0; i < results["hits"].size(); i++) {
         nlohmann::json result = results["hits"].at(i);
@@ -200,7 +200,7 @@ TEST_F(CollectionTest, PhraseSearch) {
 
     ASSERT_EQ(3, results["request_params"]["per_page"].get<size_t>());
 
-    ids = {"8", "1", "17"};
+    ids = {"8", "1", "16"};
 
     for(size_t i = 0; i < 3; i++) {
         nlohmann::json result = results["hits"].at(i);
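The comma that now appears in the expected snippet comes from the edited test document below; the point of the change is that, with normalization enabled, a special character not followed by an alphanumeric character is dropped during tokenization, so `power,` indexes as the plain token `power` and the added punctuation does not disturb token positions or highlight offsets. A hedged sketch of that expectation, mirroring the Tokenizer(text, keep_empty, normalize, ...) call shape used by the tokenizer tests further down (the include path is an assumption):

#include <gtest/gtest.h>
#include <string>
#include <vector>

#include "tokenizer.h"  // assumed include path for the Tokenizer class

TEST(TokenizerSketch, NormalizationDropsTrailingComma) {
    std::vector<std::string> tokens;
    // keep_empty off, normalize on, mirroring the argument order in the tests below
    Tokenizer("What is the power, requirement", false, true, false).tokenize(tokens);
    ASSERT_EQ(5, tokens.size());               // what / is / the / power / requirement
    ASSERT_STREQ("power", tokens[3].c_str());  // the comma does not survive indexing
}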
@@ -5,7 +5,7 @@
 {"id": "foo", "points":13,"title":"The heaviest martian spacecraft"}
 {"points":13,"title":"To what extent are the US modules of ISS based on the Spacelab design?"}
 {"points":12,"title":"Could future astronauts eat during EVAs?"}
-{"points":12,"title":"What is the power requirement of a rocket launch these days?"}
+{"points":12,"title":"What is the power, requirement of a rocket launch these days?"}
 {"points":12,"title":"How does plant growing medium not scatter around?"}
 {"points":12,"title":"Is there research for the optimal small crew size for a long space voyage?"}
 {"points":12,"title":"Do long term missions receive insurance coverage?"}
@@ -43,13 +43,13 @@ TEST(TokenizerTest, ShouldTokenizeNormalizeDifferentStrings) {
     ASSERT_STREQ("here", tokens[6].c_str());
 
     // when normalization is disabled and keep empty is enabled
-    const std::string withoutnormalize = "Mise à jour.";
+    const std::string withoutnormalize = "Mise à, jour.";
     tokens.clear();
     Tokenizer(withoutnormalize, true, false, false).tokenize(tokens);
     ASSERT_EQ(5, tokens.size());
     ASSERT_STREQ("Mise", tokens[0].c_str());
     ASSERT_STREQ("", tokens[1].c_str());
-    ASSERT_STREQ("à", tokens[2].c_str());
+    ASSERT_STREQ("à,", tokens[2].c_str());
     ASSERT_STREQ("", tokens[3].c_str());
     ASSERT_STREQ("jour.", tokens[4].c_str());
 
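With keep_empty enabled, the assertions above show two effects of the new rule at once: `à,` stays intact because the comma is followed by a space rather than an alphanumeric character, and empty strings occupy the intervening positions. Those placeholders presumably exist so that token indexes stay aligned with word positions; a small self-contained illustration using the exact token list the test asserts:

#include <iostream>
#include <string>
#include <vector>

int main() {
    // Token list matching the keep_empty assertions above; empty entries are
    // placeholders that keep each real token at a stable position index.
    std::vector<std::string> tokens = {"Mise", "", "à,", "", "jour."};
    for(size_t pos = 0; pos < tokens.size(); pos++) {
        if(!tokens[pos].empty()) {
            std::cout << pos << " -> " << tokens[pos] << "\n";  // 0 -> Mise, 2 -> à,, 4 -> jour.
        }
    }
}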
@@ -62,6 +62,41 @@ TEST(TokenizerTest, ShouldTokenizeNormalizeDifferentStrings) {
     ASSERT_STREQ("à", tokens[1].c_str());
     ASSERT_STREQ("jour.", tokens[2].c_str());
 
+    // single token
+    const std::string single_token = "foobar";
+    tokens.clear();
+    Tokenizer(single_token, false, false, false).tokenize(tokens);
+    ASSERT_EQ(1, tokens.size());
+    ASSERT_STREQ("foobar", tokens[0].c_str());
+
+    // split tokens
+    const std::string split_tokens = "foo-bar-baz";
+    tokens.clear();
+    Tokenizer(split_tokens, false, false, false).tokenize(tokens);
+    ASSERT_EQ(3, tokens.size());
+    ASSERT_STREQ("foo", tokens[0].c_str());
+    ASSERT_STREQ("bar", tokens[1].c_str());
+    ASSERT_STREQ("baz", tokens[2].c_str());
+
+    tokens.clear();
+    Tokenizer(split_tokens, false, true, false).tokenize(tokens);
+    ASSERT_EQ(3, tokens.size());
+    ASSERT_STREQ("foo", tokens[0].c_str());
+    ASSERT_STREQ("bar", tokens[1].c_str());
+    ASSERT_STREQ("baz", tokens[2].c_str());
+
+    // multiple spaces
+    const std::string multispace_tokens = "foo     bar";
+    tokens.clear();
+    Tokenizer(multispace_tokens, true, false, false).tokenize(tokens);
+    ASSERT_EQ(6, tokens.size());
+    ASSERT_STREQ("foo", tokens[0].c_str());
+    ASSERT_STREQ("", tokens[1].c_str());
+    ASSERT_STREQ("", tokens[2].c_str());
+    ASSERT_STREQ("", tokens[3].c_str());
+    ASSERT_STREQ("", tokens[4].c_str());
+    ASSERT_STREQ("bar", tokens[5].c_str());
+
     // noop
 
     tokens.clear();