From f501b137b7dda6f9f08a5cf05cfe8accf4760e30 Mon Sep 17 00:00:00 2001
From: kishorenc
Date: Tue, 16 Mar 2021 11:39:53 +0530
Subject: [PATCH] Tokenize on special characters.

---
 src/tokenizer.cpp        | 42 +++++++++++++++++++++++++---------------
 test/collection_test.cpp |  8 ++++----
 test/documents.jsonl     |  2 +-
 test/tokenizer_test.cpp  | 39 +++++++++++++++++++++++++++++++++++--
 4 files changed, 68 insertions(+), 23 deletions(-)

diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp
index bd1543e9..9dc86cd8 100644
--- a/src/tokenizer.cpp
+++ b/src/tokenizer.cpp
@@ -23,27 +23,37 @@ bool Tokenizer::next(std::string &token, size_t& token_index) {
     while(i < text.size()) {
         if((text[i] & ~0x7f) == 0 ) {
             // ASCII character: split on space/newline or lowercase otherwise
-            bool is_space = text[i] == 32;
-            bool is_new_line = text[i] == 10;
-            bool space_or_newline = (is_space || is_new_line);
+            if(std::isalnum(text[i])) {
+                if(normalize) {
+                    out << char(std::tolower(text[i]));
+                } else {
+                    out << text[i];
+                }
+            } else {
+                bool is_space = text[i] == 32;
+                bool is_new_line = text[i] == 10;
+                bool is_whitespace = is_space || is_new_line;
 
-            if(space_or_newline) {
-                i++;
-                token = out.str();
-                out.clear();
+                bool next_char_alphanum = (i != text.length() - 1) && std::isalnum(text[i + 1]);
 
-                if(!keep_empty && token.empty()) {
-                    continue;
+                if(!normalize && !is_whitespace && (i == text.length() - 1 || !next_char_alphanum)) {
+                    // checking for next char ensures that `foo-bar` does not get split to `foo-`
+                    out << text[i];
                 }
 
-                token_index = token_counter++;
-                return true;
-            }
+                if(is_whitespace || next_char_alphanum) {
+                    // we split on space or on a special character whose next char is alphanumeric
+                    token = out.str();
+                    out.clear();
+                    i++;
 
-            if(!normalize) {
-                out << text[i];
-            } else if(std::isalnum(text[i])) {
-                out << char(std::tolower(text[i]));
+                    if(!keep_empty && token.empty()) {
+                        continue;
+                    }
+
+                    token_index = token_counter++;
+                    return true;
+                }
             }
 
             i++;
diff --git a/test/collection_test.cpp b/test/collection_test.cpp
index 64b5f7ea..448a7850 100644
--- a/test/collection_test.cpp
+++ b/test/collection_test.cpp
@@ -164,7 +164,7 @@ TEST_F(CollectionTest, PhraseSearch) {
        13: score: 12, (single word match)
     */
 
-    std::vector<std::string> ids = {"8", "1", "17", "16", "13"};
+    std::vector<std::string> ids = {"8", "1", "16", "17", "13"};
 
     for(size_t i = 0; i < results["hits"].size(); i++) {
         nlohmann::json result = results["hits"].at(i);
@@ -176,7 +176,7 @@ TEST_F(CollectionTest, PhraseSearch) {
     ASSERT_EQ(results["hits"][0]["highlights"].size(), (unsigned long) 1);
     ASSERT_STREQ(results["hits"][0]["highlights"][0]["field"].get<std::string>().c_str(), "title");
     ASSERT_STREQ(results["hits"][0]["highlights"][0]["snippet"].get<std::string>().c_str(),
-                 "What is the power requirement of a rocket launch these days?");
+                 "What is the power, requirement of a rocket launch these days?");
 
     // Check ASC sort order
     std::vector<sort_by> sort_fields_asc = { sort_by(sort_field_const::text_match, "DESC"), sort_by("points", "ASC") };
@@ -184,7 +184,7 @@ TEST_F(CollectionTest, PhraseSearch) {
     ASSERT_EQ(5, results["hits"].size());
     ASSERT_EQ(5, results["found"].get<uint32_t>());
 
-    ids = {"8", "17", "1", "16", "13"};
+    ids = {"8", "1", "17", "16", "13"};
 
     for(size_t i = 0; i < results["hits"].size(); i++) {
         nlohmann::json result = results["hits"].at(i);
@@ -200,7 +200,7 @@ TEST_F(CollectionTest, PhraseSearch) {
 
     ASSERT_EQ(3, results["request_params"]["per_page"].get<size_t>());
 
-    ids = {"8", "1", "17"};
+    ids = {"8", "1", "16"};
 
     for(size_t i = 0; i < 3; i++) {
         nlohmann::json result = results["hits"].at(i);
diff --git a/test/documents.jsonl b/test/documents.jsonl
index 7646a533..ae97bbd6 100644
--- a/test/documents.jsonl
+++ b/test/documents.jsonl
@@ -5,7 +5,7 @@
 {"id": "foo", "points":13,"title":"The heaviest martian spacecraft"}
 {"points":13,"title":"To what extent are the US modules of ISS based on the Spacelab design?"}
 {"points":12,"title":"Could future astronauts eat during EVAs?"}
-{"points":12,"title":"What is the power requirement of a rocket launch these days?"}
+{"points":12,"title":"What is the power, requirement of a rocket launch these days?"}
 {"points":12,"title":"How does plant growing medium not scatter around?"}
 {"points":12,"title":"Is there research for the optimal small crew size for a long space voyage?"}
 {"points":12,"title":"Do long term missions receive insurance coverage?"}
diff --git a/test/tokenizer_test.cpp b/test/tokenizer_test.cpp
index e7f794c7..3a4b1264 100644
--- a/test/tokenizer_test.cpp
+++ b/test/tokenizer_test.cpp
@@ -43,13 +43,13 @@ TEST(TokenizerTest, ShouldTokenizeNormalizeDifferentStrings) {
     ASSERT_STREQ("here", tokens[6].c_str());
 
     // when normalization is disabled and keep empty is enabled
-    const std::string withoutnormalize = "Mise à jour.";
+    const std::string withoutnormalize = "Mise à, jour.";
     tokens.clear();
     Tokenizer(withoutnormalize, true, false, false).tokenize(tokens);
     ASSERT_EQ(5, tokens.size());
     ASSERT_STREQ("Mise", tokens[0].c_str());
     ASSERT_STREQ("", tokens[1].c_str());
-    ASSERT_STREQ("à", tokens[2].c_str());
+    ASSERT_STREQ("à,", tokens[2].c_str());
     ASSERT_STREQ("", tokens[3].c_str());
     ASSERT_STREQ("jour.", tokens[4].c_str());
 
@@ -62,6 +62,41 @@ TEST(TokenizerTest, ShouldTokenizeNormalizeDifferentStrings) {
     ASSERT_STREQ("à", tokens[1].c_str());
     ASSERT_STREQ("jour.", tokens[2].c_str());
 
+    // single token
+    const std::string single_token = "foobar";
+    tokens.clear();
+    Tokenizer(single_token, false, false, false).tokenize(tokens);
+    ASSERT_EQ(1, tokens.size());
+    ASSERT_STREQ("foobar", tokens[0].c_str());
+
+    // split tokens
+    const std::string split_tokens = "foo-bar-baz";
+    tokens.clear();
+    Tokenizer(split_tokens, false, false, false).tokenize(tokens);
+    ASSERT_EQ(3, tokens.size());
+    ASSERT_STREQ("foo", tokens[0].c_str());
+    ASSERT_STREQ("bar", tokens[1].c_str());
+    ASSERT_STREQ("baz", tokens[2].c_str());
+
+    tokens.clear();
+    Tokenizer(split_tokens, false, true, false).tokenize(tokens);
+    ASSERT_EQ(3, tokens.size());
+    ASSERT_STREQ("foo", tokens[0].c_str());
+    ASSERT_STREQ("bar", tokens[1].c_str());
+    ASSERT_STREQ("baz", tokens[2].c_str());
+
+    // multiple spaces
+    const std::string multispace_tokens = "foo     bar";
+    tokens.clear();
+    Tokenizer(multispace_tokens, true, false, false).tokenize(tokens);
+    ASSERT_EQ(6, tokens.size());
+    ASSERT_STREQ("foo", tokens[0].c_str());
+    ASSERT_STREQ("", tokens[1].c_str());
+    ASSERT_STREQ("", tokens[2].c_str());
+    ASSERT_STREQ("", tokens[3].c_str());
+    ASSERT_STREQ("", tokens[4].c_str());
+    ASSERT_STREQ("bar", tokens[5].c_str());
+
     // noop
     tokens.clear();
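
Note: the standalone program below is a minimal sketch (not part of the patch) of the splitting rule the tokenizer.cpp hunk introduces for the ASCII path. The helper name split_ascii and the demo in main are illustrative only; the real Tokenizer additionally handles non-ASCII input, honors the keep_empty flag, and streams tokens via next(), all of which this sketch omits.

#include <cctype>
#include <iostream>
#include <string>
#include <vector>

// Hypothetical helper illustrating the new rule: a special character acts as
// a token boundary only when the character after it is alphanumeric, so
// "foo-bar" splits into "foo" and "bar", while a trailing special char
// (e.g. the "." in "jour.") is kept verbatim when normalization is off.
std::vector<std::string> split_ascii(const std::string& text, bool normalize) {
    std::vector<std::string> tokens;
    std::string current;

    for(size_t i = 0; i < text.size(); i++) {
        unsigned char c = text[i];

        if(std::isalnum(c)) {
            current += normalize ? char(std::tolower(c)) : char(c);
            continue;
        }

        bool is_whitespace = (c == ' ' || c == '\n');
        bool next_char_alphanum = (i + 1 < text.size()) &&
                                  std::isalnum((unsigned char) text[i + 1]);

        // when not normalizing, keep a special char that is not a boundary,
        // e.g. a "-" or "." at the end of the text
        if(!normalize && !is_whitespace && !next_char_alphanum) {
            current += char(c);
        }

        // split on whitespace, or on a special char whose next char is
        // alphanumeric; empty tokens are kept (as with keep_empty == true)
        if(is_whitespace || next_char_alphanum) {
            tokens.push_back(current);
            current.clear();
        }
    }

    tokens.push_back(current);
    return tokens;
}

int main() {
    // mirrors the new "split tokens" test case: prints foo / bar / baz
    for(const auto& token : split_ascii("foo-bar-baz", false)) {
        std::cout << token << "\n";
    }
    return 0;
}

As in the patched code, it is the look-ahead at the next character that keeps "foo-bar" from tokenizing as "foo-" plus "bar".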