Tokenize on special characters.

kishorenc 2021-03-16 11:39:53 +05:30
parent af6d2e94e5
commit f501b137b7
4 changed files with 68 additions and 23 deletions

View File

@@ -23,27 +23,37 @@ bool Tokenizer::next(std::string &token, size_t& token_index) {
     while(i < text.size()) {
         if((text[i] & ~0x7f) == 0 ) {
             // ASCII character: split on space/newline or lowercase otherwise
-            bool is_space = text[i] == 32;
-            bool is_new_line = text[i] == 10;
-            bool space_or_newline = (is_space || is_new_line);
-
-            if(space_or_newline) {
-                i++;
-                token = out.str();
-                out.clear();
-
-                if(!keep_empty && token.empty()) {
-                    continue;
-                }
-
-                token_index = token_counter++;
-                return true;
-            }
-
-            if(!normalize) {
-                out << text[i];
-            } else if(std::isalnum(text[i])) {
-                out << char(std::tolower(text[i]));
+            if(std::isalnum(text[i])) {
+                if(normalize) {
+                    out << char(std::tolower(text[i]));
+                } else {
+                    out << text[i];
+                }
+            } else {
+                bool is_space = text[i] == 32;
+                bool is_new_line = text[i] == 10;
+                bool is_whitespace = is_space || is_new_line;
+
+                bool next_char_alphanum = (i != text.length() - 1) && std::isalnum(text[i + 1]);
+
+                if(!normalize && !is_whitespace && (i == text.length() - 1 || !next_char_alphanum)) {
+                    // checking for next char ensures that `foo-bar` does not get split to `foo-`
+                    out << text[i];
+                }
+
+                if(is_whitespace || next_char_alphanum) {
+                    // we split on space or on a special character whose next char is alphanumeric
+                    token = out.str();
+                    out.clear();
+                    i++;
+
+                    if(!keep_empty && token.empty()) {
+                        continue;
+                    }
+
+                    token_index = token_counter++;
+                    return true;
+                }
             }
 
             i++;
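
The look-ahead comment above is the subtle part of the change: if the special character were appended to `out` before the split, `foo-bar` would come out as a dangling `foo-` followed by `bar`. The new tests below pin this down; for example (the call and expected tokens are copied from the test diff, while the meaning of the individual boolean flags is an assumption from context):

    std::vector<std::string> tokens;
    Tokenizer("foo-bar-baz", false, false, false).tokenize(tokens);
    // tokens == {"foo", "bar", "baz"} -- split on "-", never a dangling "foo-"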

View File

@@ -164,7 +164,7 @@ TEST_F(CollectionTest, PhraseSearch) {
        13: score: 12, (single word match)
     */
 
-    std::vector<std::string> ids = {"8", "1", "17", "16", "13"};
+    std::vector<std::string> ids = {"8", "1", "16", "17", "13"};
 
     for(size_t i = 0; i < results["hits"].size(); i++) {
         nlohmann::json result = results["hits"].at(i);
@@ -176,7 +176,7 @@ TEST_F(CollectionTest, PhraseSearch) {
     ASSERT_EQ(results["hits"][0]["highlights"].size(), (unsigned long) 1);
     ASSERT_STREQ(results["hits"][0]["highlights"][0]["field"].get<std::string>().c_str(), "title");
     ASSERT_STREQ(results["hits"][0]["highlights"][0]["snippet"].get<std::string>().c_str(),
-                 "What is the power requirement of a <mark>rocket</mark> <mark>launch</mark> these days?");
+                 "What is the power, requirement of a <mark>rocket</mark> <mark>launch</mark> these days?");
 
     // Check ASC sort order
     std::vector<sort_by> sort_fields_asc = { sort_by(sort_field_const::text_match, "DESC"), sort_by("points", "ASC") };
@@ -184,7 +184,7 @@ TEST_F(CollectionTest, PhraseSearch) {
     ASSERT_EQ(5, results["hits"].size());
     ASSERT_EQ(5, results["found"].get<uint32_t>());
 
-    ids = {"8", "17", "1", "16", "13"};
+    ids = {"8", "1", "17", "16", "13"};
 
     for(size_t i = 0; i < results["hits"].size(); i++) {
         nlohmann::json result = results["hits"].at(i);
@@ -200,7 +200,7 @@ TEST_F(CollectionTest, PhraseSearch) {
     ASSERT_EQ(3, results["request_params"]["per_page"].get<size_t>());
 
-    ids = {"8", "1", "17"};
+    ids = {"8", "1", "16"};
 
     for(size_t i = 0; i < 3; i++) {
         nlohmann::json result = results["hits"].at(i);

View File

@@ -5,7 +5,7 @@
 {"id": "foo", "points":13,"title":"The heaviest martian spacecraft"}
 {"points":13,"title":"To what extent are the US modules of ISS based on the Spacelab design?"}
 {"points":12,"title":"Could future astronauts eat during EVAs?"}
-{"points":12,"title":"What is the power requirement of a rocket launch these days?"}
+{"points":12,"title":"What is the power, requirement of a rocket launch these days?"}
 {"points":12,"title":"How does plant growing medium not scatter around?"}
 {"points":12,"title":"Is there research for the optimal small crew size for a long space voyage?"}
 {"points":12,"title":"Do long term missions receive insurance coverage?"}

View File

@@ -43,13 +43,13 @@ TEST(TokenizerTest, ShouldTokenizeNormalizeDifferentStrings) {
     ASSERT_STREQ("here", tokens[6].c_str());
 
     // when normalization is disabled and keep empty is enabled
-    const std::string withoutnormalize = "Mise à jour.";
+    const std::string withoutnormalize = "Mise à, jour.";
     tokens.clear();
     Tokenizer(withoutnormalize, true, false, false).tokenize(tokens);
     ASSERT_EQ(5, tokens.size());
     ASSERT_STREQ("Mise", tokens[0].c_str());
     ASSERT_STREQ("", tokens[1].c_str());
-    ASSERT_STREQ("à", tokens[2].c_str());
+    ASSERT_STREQ("à,", tokens[2].c_str());
     ASSERT_STREQ("", tokens[3].c_str());
     ASSERT_STREQ("jour.", tokens[4].c_str());
@@ -62,6 +62,41 @@ TEST(TokenizerTest, ShouldTokenizeNormalizeDifferentStrings) {
     ASSERT_STREQ("à", tokens[1].c_str());
     ASSERT_STREQ("jour.", tokens[2].c_str());
 
+    // single token
+    const std::string single_token = "foobar";
+    tokens.clear();
+    Tokenizer(single_token, false, false, false).tokenize(tokens);
+    ASSERT_EQ(1, tokens.size());
+    ASSERT_STREQ("foobar", tokens[0].c_str());
+
+    // split tokens
+    const std::string split_tokens = "foo-bar-baz";
+    tokens.clear();
+    Tokenizer(split_tokens, false, false, false).tokenize(tokens);
+    ASSERT_EQ(3, tokens.size());
+    ASSERT_STREQ("foo", tokens[0].c_str());
+    ASSERT_STREQ("bar", tokens[1].c_str());
+    ASSERT_STREQ("baz", tokens[2].c_str());
+
+    tokens.clear();
+    Tokenizer(split_tokens, false, true, false).tokenize(tokens);
+    ASSERT_EQ(3, tokens.size());
+    ASSERT_STREQ("foo", tokens[0].c_str());
+    ASSERT_STREQ("bar", tokens[1].c_str());
+    ASSERT_STREQ("baz", tokens[2].c_str());
+
+    // multiple spaces
+    const std::string multispace_tokens = "foo     bar";
+    tokens.clear();
+    Tokenizer(multispace_tokens, true, false, false).tokenize(tokens);
+    ASSERT_EQ(6, tokens.size());
+    ASSERT_STREQ("foo", tokens[0].c_str());
+    ASSERT_STREQ("", tokens[1].c_str());
+    ASSERT_STREQ("", tokens[2].c_str());
+    ASSERT_STREQ("", tokens[3].c_str());
+    ASSERT_STREQ("", tokens[4].c_str());
+    ASSERT_STREQ("bar", tokens[5].c_str());
+
     // noop
     tokens.clear();