From f501b137b7dda6f9f08a5cf05cfe8accf4760e30 Mon Sep 17 00:00:00 2001
From: kishorenc
Date: Tue, 16 Mar 2021 11:39:53 +0530
Subject: [PATCH] Tokenize on special characters.

---
 src/tokenizer.cpp        | 42 +++++++++++++++++++++++++---------------
 test/collection_test.cpp |  8 ++++----
 test/documents.jsonl     |  2 +-
 test/tokenizer_test.cpp  | 39 +++++++++++++++++++++++++++++++++++--
 4 files changed, 68 insertions(+), 23 deletions(-)

diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp
index bd1543e9..9dc86cd8 100644
--- a/src/tokenizer.cpp
+++ b/src/tokenizer.cpp
@@ -23,27 +23,37 @@ bool Tokenizer::next(std::string &token, size_t& token_index) {
     while(i < text.size()) {
         if((text[i] & ~0x7f) == 0 ) {
             // ASCII character: split on space/newline or lowercase otherwise
-            bool is_space = text[i] == 32;
-            bool is_new_line = text[i] == 10;
-            bool space_or_newline = (is_space || is_new_line);
+            if(std::isalnum(text[i])) {
+                if(normalize) {
+                    out << char(std::tolower(text[i]));
+                } else {
+                    out << text[i];
+                }
+            } else {
+                bool is_space = text[i] == 32;
+                bool is_new_line = text[i] == 10;
+                bool is_whitespace = is_space || is_new_line;
 
-            if(space_or_newline) {
-                i++;
-                token = out.str();
-                out.clear();
+                bool next_char_alphanum = (i != text.length() - 1) && std::isalnum(text[i + 1]);
 
-                if(!keep_empty && token.empty()) {
-                    continue;
+                if(!normalize && !is_whitespace && (i == text.length() - 1 || !next_char_alphanum)) {
+                    // checking for next char ensures that `foo-bar` does not get split to `foo-`
+                    out << text[i];
                 }
 
-                token_index = token_counter++;
-                return true;
-            }
+                if(is_whitespace || next_char_alphanum) {
+                    // we split on space or on a special character whose next char is alphanumeric
+                    token = out.str();
+                    out.clear();
+                    i++;
 
-            if(!normalize) {
-                out << text[i];
-            } else if(std::isalnum(text[i])) {
-                out << char(std::tolower(text[i]));
+                    if(!keep_empty && token.empty()) {
+                        continue;
+                    }
+
+                    token_index = token_counter++;
+                    return true;
+                }
             }
 
             i++;
diff --git a/test/collection_test.cpp b/test/collection_test.cpp
index 64b5f7ea..448a7850 100644
--- a/test/collection_test.cpp
+++ b/test/collection_test.cpp
@@ -164,7 +164,7 @@ TEST_F(CollectionTest, PhraseSearch) {
        13: score: 12, (single word match)
     */
 
-    std::vector<std::string> ids = {"8", "1", "17", "16", "13"};
+    std::vector<std::string> ids = {"8", "1", "16", "17", "13"};
 
     for(size_t i = 0; i < results["hits"].size(); i++) {
         nlohmann::json result = results["hits"].at(i);
@@ -176,7 +176,7 @@ TEST_F(CollectionTest, PhraseSearch) {
     ASSERT_EQ(results["hits"][0]["highlights"].size(), (unsigned long) 1);
     ASSERT_STREQ(results["hits"][0]["highlights"][0]["field"].get<std::string>().c_str(), "title");
     ASSERT_STREQ(results["hits"][0]["highlights"][0]["snippet"].get<std::string>().c_str(),
-                 "What is the power requirement of a rocket launch these days?");
+                 "What is the power, requirement of a rocket launch these days?");
 
     // Check ASC sort order
     std::vector<sort_by> sort_fields_asc = { sort_by(sort_field_const::text_match, "DESC"), sort_by("points", "ASC") };
@@ -184,7 +184,7 @@ TEST_F(CollectionTest, PhraseSearch) {
     ASSERT_EQ(5, results["hits"].size());
     ASSERT_EQ(5, results["found"].get<uint32_t>());
 
-    ids = {"8", "17", "1", "16", "13"};
+    ids = {"8", "1", "17", "16", "13"};
 
     for(size_t i = 0; i < results["hits"].size(); i++) {
         nlohmann::json result = results["hits"].at(i);
@@ -200,7 +200,7 @@ TEST_F(CollectionTest, PhraseSearch) {
 
     ASSERT_EQ(3, results["request_params"]["per_page"].get<size_t>());
 
-    ids = {"8", "1", "17"};
+    ids = {"8", "1", "16"};
 
     for(size_t i = 0; i < 3; i++) {
         nlohmann::json result = results["hits"].at(i);
diff --git a/test/documents.jsonl b/test/documents.jsonl
index 7646a533..ae97bbd6 100644
--- a/test/documents.jsonl
+++ b/test/documents.jsonl
@@ -5,7 +5,7 @@
 {"id": "foo", "points":13,"title":"The heaviest martian spacecraft"}
 {"points":13,"title":"To what extent are the US modules of ISS based on the Spacelab design?"}
 {"points":12,"title":"Could future astronauts eat during EVAs?"}
-{"points":12,"title":"What is the power requirement of a rocket launch these days?"}
+{"points":12,"title":"What is the power, requirement of a rocket launch these days?"}
 {"points":12,"title":"How does plant growing medium not scatter around?"}
 {"points":12,"title":"Is there research for the optimal small crew size for a long space voyage?"}
 {"points":12,"title":"Do long term missions receive insurance coverage?"}
diff --git a/test/tokenizer_test.cpp b/test/tokenizer_test.cpp
index e7f794c7..3a4b1264 100644
--- a/test/tokenizer_test.cpp
+++ b/test/tokenizer_test.cpp
@@ -43,13 +43,13 @@ TEST(TokenizerTest, ShouldTokenizeNormalizeDifferentStrings) {
     ASSERT_STREQ("here", tokens[6].c_str());
 
     // when normalization is disabled and keep empty is enabled
-    const std::string withoutnormalize = "Mise à jour.";
+    const std::string withoutnormalize = "Mise à, jour.";
     tokens.clear();
     Tokenizer(withoutnormalize, true, false, false).tokenize(tokens);
     ASSERT_EQ(5, tokens.size());
     ASSERT_STREQ("Mise", tokens[0].c_str());
     ASSERT_STREQ("", tokens[1].c_str());
-    ASSERT_STREQ("à", tokens[2].c_str());
+    ASSERT_STREQ("à,", tokens[2].c_str());
     ASSERT_STREQ("", tokens[3].c_str());
     ASSERT_STREQ("jour.", tokens[4].c_str());
 
@@ -62,6 +62,41 @@ TEST(TokenizerTest, ShouldTokenizeNormalizeDifferentStrings) {
     ASSERT_STREQ("à", tokens[1].c_str());
     ASSERT_STREQ("jour.", tokens[2].c_str());
 
+    // single token
+    const std::string single_token = "foobar";
+    tokens.clear();
+    Tokenizer(single_token, false, false, false).tokenize(tokens);
+    ASSERT_EQ(1, tokens.size());
+    ASSERT_STREQ("foobar", tokens[0].c_str());
+
+    // split tokens
+    const std::string split_tokens = "foo-bar-baz";
+    tokens.clear();
+    Tokenizer(split_tokens, false, false, false).tokenize(tokens);
+    ASSERT_EQ(3, tokens.size());
+    ASSERT_STREQ("foo", tokens[0].c_str());
+    ASSERT_STREQ("bar", tokens[1].c_str());
+    ASSERT_STREQ("baz", tokens[2].c_str());
+
+    tokens.clear();
+    Tokenizer(split_tokens, false, true, false).tokenize(tokens);
+    ASSERT_EQ(3, tokens.size());
+    ASSERT_STREQ("foo", tokens[0].c_str());
+    ASSERT_STREQ("bar", tokens[1].c_str());
+    ASSERT_STREQ("baz", tokens[2].c_str());
+
+    // multiple spaces
+    const std::string multispace_tokens = "foo     bar";
+    tokens.clear();
+    Tokenizer(multispace_tokens, true, false, false).tokenize(tokens);
+    ASSERT_EQ(6, tokens.size());
+    ASSERT_STREQ("foo", tokens[0].c_str());
+    ASSERT_STREQ("", tokens[1].c_str());
+    ASSERT_STREQ("", tokens[2].c_str());
+    ASSERT_STREQ("", tokens[3].c_str());
+    ASSERT_STREQ("", tokens[4].c_str());
+    ASSERT_STREQ("bar", tokens[5].c_str());
+
     // noop
     tokens.clear();
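
Note: the standalone program below is a minimal sketch (not part of the patch) of the splitting rule the tokenizer.cpp hunk introduces for the ASCII path. The helper name split_ascii and the demo in main are illustrative only; the real Tokenizer additionally handles non-ASCII input, honors the keep_empty flag, and streams tokens via next(), all of which this sketch omits.

#include <cctype>
#include <iostream>
#include <string>
#include <vector>

// Hypothetical helper illustrating the new rule: a special character acts as
// a token boundary only when the character after it is alphanumeric, so
// "foo-bar" splits into "foo" and "bar", while a trailing special char
// (e.g. the "." in "jour.") is kept verbatim when normalization is off.
std::vector<std::string> split_ascii(const std::string& text, bool normalize) {
    std::vector<std::string> tokens;
    std::string current;

    for(size_t i = 0; i < text.size(); i++) {
        unsigned char c = text[i];

        if(std::isalnum(c)) {
            current += normalize ? char(std::tolower(c)) : char(c);
            continue;
        }

        bool is_whitespace = (c == ' ' || c == '\n');
        bool next_char_alphanum = (i + 1 < text.size()) &&
                                  std::isalnum((unsigned char) text[i + 1]);

        // when not normalizing, keep a special char that is not a boundary,
        // e.g. a "-" or "." at the end of the text
        if(!normalize && !is_whitespace && !next_char_alphanum) {
            current += char(c);
        }

        // split on whitespace, or on a special char whose next char is
        // alphanumeric; empty tokens are kept (as with keep_empty == true)
        if(is_whitespace || next_char_alphanum) {
            tokens.push_back(current);
            current.clear();
        }
    }

    tokens.push_back(current);
    return tokens;
}

int main() {
    // mirrors the new "split tokens" test case: prints foo / bar / baz
    for(const auto& token : split_ascii("foo-bar-baz", false)) {
        std::cout << token << "\n";
    }
    return 0;
}

As in the patched code, it is the look-ahead at the next character that keeps "foo-bar" from tokenizing as "foo-" plus "bar".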