diff --git a/include/index.h b/include/index.h
index 854605d1..d3bff529 100644
--- a/include/index.h
+++ b/include/index.h
@@ -312,7 +312,6 @@ public:
     void scrub_reindex_doc(nlohmann::json& update_doc, nlohmann::json& del_doc, nlohmann::json& old_doc);
 
-    void tokenize_doc_field(const nlohmann::json& document, const std::string& field_name, const field& search_field,
-                            std::vector<std::string>& tokens);
+    void tokenize_doc_field(const nlohmann::json& document, const field& search_field, std::vector<std::string>& tokens);
 };
diff --git a/include/string_utils.h b/include/string_utils.h
index b7bfb207..2f2276be 100644
--- a/include/string_utils.h
+++ b/include/string_utils.h
@@ -291,5 +291,5 @@ struct StringUtils {
     static std::string hmac(const std::string& key, const std::string& msg);
 
-    static size_t unicode_length(const std::string& bytes);
+    //static size_t unicode_length(const std::string& bytes);
 };
\ No newline at end of file
diff --git a/include/tokenizer.h b/include/tokenizer.h
new file mode 100644
index 00000000..4649d9b0
--- /dev/null
+++ b/include/tokenizer.h
@@ -0,0 +1,27 @@
+#pragma once
+
+#include <string>
+#include <vector>
+#include <iconv.h>
+
+class Tokenizer {
+private:
+    const std::string& text;
+    size_t i;
+    const bool keep_empty;
+    const bool normalize;
+
+    size_t token_counter = 0;
+    iconv_t cd;
+
+public:
+
+    Tokenizer(const std::string& input, bool keep_empty=true, bool normalize=true):
+            text(input), i(0), keep_empty(keep_empty), normalize(normalize) {
+        cd = iconv_open("ASCII//TRANSLIT", "UTF-8");
+    }
+
+    bool next(std::string& token, size_t& token_index);
+
+    void tokenize(std::vector<std::string>& tokens);
+};
\ No newline at end of file
diff --git a/src/index.cpp b/src/index.cpp
index adb81c6b..11eff26e 100644
--- a/src/index.cpp
+++ b/src/index.cpp
@@ -8,6 +8,7 @@
 #include
 #include
 #include
+#include <tokenizer.h>
 #include "logger.h"
 
 Index::Index(const std::string name, const std::unordered_map<std::string, field> & search_schema,
@@ -337,8 +338,8 @@ void Index::scrub_reindex_doc(nlohmann::json& update_doc, nlohmann::json& del_do
         // Go through all the field names and find the keys+values so that they can be removed from in-memory index
         std::vector<std::string> reindex_tokens;
         std::vector<std::string> old_tokens;
-        tokenize_doc_field(update_doc, field_name, search_field, reindex_tokens);
-        tokenize_doc_field(old_doc, field_name, search_field, old_tokens);
+        tokenize_doc_field(update_doc, search_field, reindex_tokens);
+        tokenize_doc_field(old_doc, search_field, old_tokens);
 
         if(old_tokens.size() != reindex_tokens.size()) {
             ++it;
@@ -556,28 +557,23 @@ uint64_t Index::facet_token_hash(const field & a_field, const std::string &token
 void Index::index_string_field(const std::string & text, const int64_t score, art_tree *t,
                                uint32_t seq_id, int facet_id, const field & a_field) {
-    std::vector<std::string> tokens;
-    StringUtils::split(text, tokens, " ", a_field.is_string());
-
     std::unordered_map<std::string, std::vector<uint32_t>> token_to_offsets;
 
-    for(size_t i=0; i<tokens.size(); i++) {
-        auto & token = tokens[i];
-        string_utils.unicode_normalize(token);
+    Tokenizer tokenizer(text, true, a_field.is_string());
+    std::string token;
+    size_t token_index = 0;
 
+    while(tokenizer.next(token, token_index)) {
         if(facet_id >= 0) {
             uint64_t hash = facet_token_hash(a_field, token);
             facet_index_v2[seq_id][facet_id].push_back(hash);
         }
 
-        token_to_offsets[token].push_back(i);
+        token_to_offsets[token].push_back(token_index);
     }
 
     /*if(seq_id == 0) {
@@ -596,32 +592,26 @@ void Index::index_string_array_field(const std::vector<std::string> & strings, c
     std::unordered_map<std::string, std::vector<uint32_t>> token_positions;
 
     for(size_t array_index = 0; array_index < strings.size(); array_index++) {
-        const std::string & str = strings[array_index];
-        std::vector<std::string> tokens;
-        std::string delim = " ";
-        StringUtils::split(str, tokens, delim, a_field.is_string());
-
+        const std::string& str = strings[array_index];
         std::set<std::string> token_set;  // required to deal with repeating tokens
-        // iterate and append offset positions
-        for(size_t i=0; i<tokens.size(); i++) {
-            auto & token = tokens[i];
-            string_utils.unicode_normalize(token);
+        Tokenizer tokenizer(str, true, a_field.is_string());
+        std::string token;
+        size_t token_index = 0;
 
+        while(tokenizer.next(token, token_index)) {
             if(facet_id >= 0) {
                 uint64_t hash = facet_token_hash(a_field, token);
                 facet_index_v2[seq_id][facet_id].push_back(hash);
                 //printf("indexing %.*s - %llu\n", token.size(), token.c_str(), hash);
             }
 
-            token_positions[token].push_back(i);
+            token_positions[token].push_back(token_index);
             token_set.insert(token);
         }
 
@@ -630,13 +620,13 @@ void Index::index_string_array_field(const std::vector<std::string> & strings, c
         }
 
         // repeat last element to indicate end of offsets for this array index
-        for(auto & token: token_set) {
-            token_positions[token].push_back(token_positions[token].back());
+        for(auto & the_token: token_set) {
+            token_positions[the_token].push_back(token_positions[the_token].back());
         }
 
         // iterate and append this array index to all tokens
-        for(auto & token: token_set) {
-            token_positions[token].push_back(array_index);
+        for(auto & the_token: token_set) {
+            token_positions[the_token].push_back(array_index);
         }
     }
 
@@ -750,17 +740,14 @@ void Index::do_facets(std::vector<facet> & facets, facet_query_t & facet_query,
             }
         }
 
-        std::vector<std::string> query_tokens;
-        StringUtils::split(facet_query.query, query_tokens, " ");
-
         // for non-string fields, `faceted_name` returns their aliased stringified field name
         art_tree *t = search_index.at(facet_field.faceted_name());
 
+        std::vector<std::string> query_tokens;
+        Tokenizer(facet_query.query, false, facet_field.is_string()).tokenize(query_tokens);
+
         for (size_t qtoken_index = 0; qtoken_index < query_tokens.size(); qtoken_index++) {
             auto &q = query_tokens[qtoken_index];
-            if (facet_field.is_string()) {
-                string_utils.unicode_normalize(q);
-            }
 
             int bounded_cost = (q.size() < 3) ? 0 : 1;
             bool prefix_search = (qtoken_index ==
@@ -1062,9 +1049,6 @@ Option<uint32_t> Index::do_filtering(uint32_t** filter_ids_out, const std::vecto
             size_t ids_size = 0;
 
             for(const std::string & filter_value: a_filter.values) {
-                std::vector<std::string> str_tokens;
-                StringUtils::split(filter_value, str_tokens, " ");
-
                 uint32_t* strt_ids = nullptr;
                 size_t strt_ids_size = 0;
 
@@ -1072,8 +1056,14 @@ Option<uint32_t> Index::do_filtering(uint32_t** filter_ids_out, const std::vecto
 
                 // there could be multiple tokens in a filter value, which we have to treat as ANDs
                 // e.g. country: South Africa
-                for(auto& str_token: str_tokens) {
-                    string_utils.unicode_normalize(str_token);
+
+                Tokenizer tokenizer(filter_value, false, true);
+                std::string str_token;
+                size_t token_index = 0;
+                std::vector<std::string> str_tokens;
+
+                while(tokenizer.next(str_token, token_index)) {
+                    str_tokens.push_back(str_token);
 
                     art_leaf* leaf = (art_leaf *) art_search(t, (const unsigned char*) str_token.c_str(),
                                                              str_token.length()+1);
@@ -1296,15 +1286,15 @@ void Index::collate_included_ids(const std::string & query, const std::string &
     }
 
     // calculate match_score and add to topster independently
-    std::vector<std::string> tokens;
-    StringUtils::split(query, tokens, " ");
 
    std::vector<art_leaf*> override_query;
 
-    for(size_t token_index = 0; token_index < tokens.size(); token_index++) {
-        const auto token = tokens[token_index];
+    Tokenizer tokenizer(query, false, true);
+    std::string token;
+    size_t token_index = 0;
+
+    while(tokenizer.next(token, token_index)) {
        const size_t token_len = token.length();
 
-        string_utils.unicode_normalize(tokens[token_index]);
 
        std::vector<art_leaf*> leaves;
        art_fuzzy_search(search_index.at(field), (const unsigned char *) token.c_str(), token_len,
@@ -1483,8 +1473,6 @@ void Index::search_field(const uint8_t & field_id, const std::string & query, co
                          uint32_t** all_result_ids, size_t & all_result_ids_len, const token_ordering token_order,
                          const bool prefix, const size_t drop_tokens_threshold, const size_t typo_tokens_threshold) {
-    std::vector<std::string> tokens;
-    StringUtils::split(query, tokens, " ");
 
     const size_t max_cost = (num_typos < 0 || num_typos > 2) ? 2 : num_typos;
 
@@ -1496,19 +1484,22 @@ void Index::search_field(const uint8_t & field_id, const std::string & query, co
 
     std::vector<std::vector<int>> token_to_costs;
 
-    for(size_t token_index = 0; token_index < tokens.size(); token_index++) {
-        std::vector<int> all_costs;
-        const size_t token_len = tokens[token_index].length();
+    Tokenizer tokenizer(query, false, true);
+    std::string token;
+    size_t token_index = 0;
+    std::vector<std::string> tokens;
 
+    while(tokenizer.next(token, token_index)) {
+        std::vector<int> all_costs;
         // This ensures that we don't end up doing a cost of 1 for a single char etc.
-        int bounded_cost = get_bounded_typo_cost(max_cost, token_len);
+        int bounded_cost = get_bounded_typo_cost(max_cost, token.length());
 
         for(int cost = 0; cost <= bounded_cost; cost++) {
             all_costs.push_back(cost);
         }
 
         token_to_costs.push_back(all_costs);
-        string_utils.unicode_normalize(tokens[token_index]);
+        tokens.push_back(token);
     }
 
     // stores candidates for each token, i.e. i-th index would have all possible tokens with a cost of "c"
@@ -1934,14 +1925,13 @@ Option<uint32_t> Index::remove(const uint32_t seq_id, const nlohmann::json & doc
 
         // Go through all the field names and find the keys+values so that they can be removed from in-memory index
         std::vector<std::string> tokens;
-        tokenize_doc_field(document, field_name, search_field, tokens);
+        tokenize_doc_field(document, search_field, tokens);
 
         for(auto & token: tokens) {
             const unsigned char *key;
             int key_len;
 
             if(search_field.type == field_types::STRING_ARRAY || search_field.type == field_types::STRING) {
-                string_utils.unicode_normalize(token);
                 key = (const unsigned char *) token.c_str();
                 key_len = (int) (token.length() + 1);
             } else {
@@ -1998,14 +1988,17 @@ Option<uint32_t> Index::remove(const uint32_t seq_id, const nlohmann::json & doc
     return Option<uint32_t>(seq_id);
 }
 
-void Index::tokenize_doc_field(const nlohmann::json& document, const std::string& field_name, const field& search_field,
+void Index::tokenize_doc_field(const nlohmann::json& document, const field& search_field,
                                std::vector<std::string>& tokens) {
+
+    const std::string& field_name = search_field.name;
+
     if(search_field.type == field_types::STRING) {
-        StringUtils::split(document[field_name], tokens, " ", true);
+        Tokenizer(document[field_name], true, search_field.is_string()).tokenize(tokens);
     } else if(search_field.type == field_types::STRING_ARRAY) {
         const std::vector<std::string>& values = document[field_name].get<std::vector<std::string>>();
         for(const std::string & value: values) {
-            StringUtils::split(value, tokens, " ", true);
+            Tokenizer(value, true, search_field.is_string()).tokenize(tokens);
         }
     } else if(search_field.type == field_types::INT32) {
         const int KEY_LEN = 8;
diff --git a/src/string_utils.cpp b/src/string_utils.cpp
index 39246d0f..da0a0582 100644
--- a/src/string_utils.cpp
+++ b/src/string_utils.cpp
@@ -3,7 +3,6 @@
 #include
 #include
 #include
-#include <codecvt>
 
 std::string lower_and_no_special_chars(const std::string & str) {
     std::stringstream ss;
@@ -31,6 +30,12 @@ void StringUtils::unicode_normalize(std::string & str) const {
         char inbuf[5];
         char *p = inbuf;
 
+        if((*s & ~0x7f) == 0 ) {
+            // ascii character
+            out << *s++;
+            continue;
+        }
+
         // group bytes to form a unicode representation
         *p++ = *s++;
         if ((*s & 0xC0) == 0x80) *p++ = *s++;
@@ -101,7 +106,7 @@ std::string StringUtils::str2hex(const std::string &str, bool capital) {
     return hexstr;
 }
 
-size_t StringUtils::unicode_length(const std::string& bytes) {
+/*size_t StringUtils::unicode_length(const std::string& bytes) {
     std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> utf8conv;
     return utf8conv.from_bytes(bytes).size();
-}
+}*/
diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp
new file mode 100644
index 00000000..713209d4
--- /dev/null
+++ b/src/tokenizer.cpp
@@ -0,0 +1,103 @@
+#include <sstream>
+#include "tokenizer.h"
+
+bool Tokenizer::next(std::string &token, size_t& token_index) {
+    std::stringstream out;
+
+    if(i >= text.size()) {
+        return false;
+    }
+
+    if(!normalize) {
+        token = text;
+        i = text.size();
+        return true;
+    }
+
+    while(i < text.size()) {
+        if((text[i] & ~0x7f) == 0 ) {
+            // ASCII character: split on space/newline or lowercase otherwise
+            bool is_space = text[i] == 32;
+            bool is_new_line = text[i] == 10;
+            bool space_or_newline = (is_space || is_new_line);
+
+            if(space_or_newline) {
+                i++;
+                token = out.str();
+                out.clear();
+
+                if(!keep_empty && token.empty()) {
+                    continue;
+                }
+
+                token_index = token_counter++;
+                return true;
+            }
+
+            if(std::isalnum(text[i])) {
+                out << char(std::tolower(text[i]));
+            }
+
+            i++;
+            continue;
+        }
+
+        char inbuf[5];
+        char *p = inbuf;
+
+        // group bytes to form a unicode representation
+        *p++ = text[i++];
+        if ((text[i] & 0xC0) == 0x80) *p++ = text[i++];
+        if ((text[i] & 0xC0) == 0x80) *p++ = text[i++];
+        if ((text[i] & 0xC0) == 0x80) *p++ = text[i++];
+        *p = 0;
+        size_t insize = (p - &inbuf[0]);
+
+        char outbuf[5] = {};
+        size_t outsize = sizeof(outbuf);
+        char *outptr = outbuf;
+        char *inptr = inbuf;
+
+        //printf("[%s]\n", inbuf);
+
+        errno = 0;
+        iconv(cd, &inptr, &insize, &outptr, &outsize);
+
+        if(errno == EILSEQ) {
+            // symbol cannot be represented as ASCII, so write the original symbol
+            out << inbuf;
+        } else {
+            // NOTE: outsize indicates bytes available AFTER current position so have to do <=
+            for(size_t out_index=0; out_index<5; out_index++) {
+                bool is_ascii = ((outbuf[out_index] & ~0x7f) == 0);
+                bool keep_char = !is_ascii || std::isalnum(outbuf[out_index]);
+
+                if(keep_char) {
+                    if(is_ascii && std::isalnum(outbuf[out_index])) {
+                        outbuf[out_index] = std::tolower(outbuf[out_index]);
+                    }
+                    out << outbuf[out_index];
+                }
+            }
+        }
+    }
+
+    token = out.str();
+    out.clear();
+
+    if(!keep_empty && token.empty()) {
+        return false;
+    }
+
+    token_index = token_counter++;
+    return true;
+}
+
+void Tokenizer::tokenize(std::vector<std::string> &tokens) {
+    std::string token;
+    size_t token_index;
+
+    while(next(token, token_index)) {
+        tokens.push_back(token);
+    }
+}
diff --git a/test/collection_faceting_test.cpp b/test/collection_faceting_test.cpp
index f1a9dd88..ba2e39db 100644
--- a/test/collection_faceting_test.cpp
+++ b/test/collection_faceting_test.cpp
@@ -558,12 +558,12 @@ TEST_F(CollectionFacetingTest, FacetCountsHighlighting) {
     ASSERT_STREQ("categories", results["facet_counts"][0]["field_name"].get<std::string>().c_str());
 
     ASSERT_EQ(1, results["facet_counts"][0]["counts"][0]["count"].get<size_t>());
-    ASSERT_STREQ("Cell Phone Cases & Clips", results["facet_counts"][0]["counts"][0]["value"].get<std::string>().c_str());
-    ASSERT_STREQ("Cell Phone Cases & Clips", results["facet_counts"][0]["counts"][0]["highlighted"].get<std::string>().c_str());
+    ASSERT_STREQ("Cell Phones", results["facet_counts"][0]["counts"][0]["value"].get<std::string>().c_str());
+    ASSERT_STREQ("Cell Phones", results["facet_counts"][0]["counts"][0]["highlighted"].get<std::string>().c_str());
 
     ASSERT_EQ(1, results["facet_counts"][0]["counts"][1]["count"].get<size_t>());
-    ASSERT_STREQ("Cell Phones", results["facet_counts"][0]["counts"][1]["value"].get<std::string>().c_str());
-    ASSERT_STREQ("Cell Phones", results["facet_counts"][0]["counts"][1]["highlighted"].get<std::string>().c_str());
+    ASSERT_STREQ("Cell Phone Cases & Clips", results["facet_counts"][0]["counts"][1]["value"].get<std::string>().c_str());
+    ASSERT_STREQ("Cell Phone Cases & Clips", results["facet_counts"][0]["counts"][1]["highlighted"].get<std::string>().c_str());
 
     ASSERT_EQ(1, results["facet_counts"][0]["counts"][2]["count"].get<size_t>());
     ASSERT_STREQ("Cell Phone Accessories", results["facet_counts"][0]["counts"][2]["value"].get<std::string>().c_str());
diff --git a/test/index_test.cpp b/test/index_test.cpp
index c069565e..0cef0f21 100644
--- a/test/index_test.cpp
+++ b/test/index_test.cpp
@@ -5,7 +5,7 @@
 TEST(IndexTest, ScrubReindexDoc) {
     std::unordered_map<std::string, field> search_schema;
     search_schema.emplace("title", field("title", field_types::STRING, false));
-    search_schema.emplace("points", field("title", field_types::INT32, false));
+    search_schema.emplace("points", field("points", field_types::INT32, false));
     search_schema.emplace("cast", field("cast", field_types::STRING_ARRAY, false));
     search_schema.emplace("movie", field("movie", field_types::BOOL, false));
diff --git a/test/tokenizer_test.cpp b/test/tokenizer_test.cpp
new file mode 100644
index 00000000..c53e5639
--- /dev/null
+++ b/test/tokenizer_test.cpp
@@ -0,0 +1,82 @@
+#include <gtest/gtest.h>
+#include "tokenizer.h"
+
+TEST(TokenizerTest, ShouldTokenizeNormalizeDifferentStrings) {
+    const std::string withnewline = "Michael Jordan:\nWelcome, everybody. Welcome!";
+    std::vector<std::string> tokens;
+    Tokenizer(withnewline, true, true).tokenize(tokens);
+    ASSERT_EQ(5, tokens.size());
+    ASSERT_STREQ("michael", tokens[0].c_str());
+    ASSERT_STREQ("jordan", tokens[1].c_str());
+    ASSERT_STREQ("welcome", tokens[2].c_str());
+    ASSERT_STREQ("everybody", tokens[3].c_str());
+    ASSERT_STREQ("welcome", tokens[4].c_str());
+
+    const std::string withspaces = " Michael  Jordan  ";
+    tokens.clear();
+    Tokenizer(withspaces, true, true).tokenize(tokens);
+    ASSERT_EQ(5, tokens.size());
+    ASSERT_STREQ("", tokens[0].c_str());
+    ASSERT_STREQ("michael", tokens[1].c_str());
+    ASSERT_STREQ("", tokens[2].c_str());
+    ASSERT_STREQ("jordan", tokens[3].c_str());
+    ASSERT_STREQ("", tokens[4].c_str());
+
+    tokens.clear();
+    Tokenizer(withspaces, false, true).tokenize(tokens);
+    ASSERT_EQ(2, tokens.size());
+    ASSERT_STREQ("michael", tokens[0].c_str());
+    ASSERT_STREQ("jordan", tokens[1].c_str());
+
+    const std::string withspecialchars = "Special ½¥ and தமிழ் 你好吗 abcÅà123ß12 here.";
+    tokens.clear();
+    Tokenizer(withspecialchars, false, true).tokenize(tokens);
+    ASSERT_EQ(7, tokens.size());
+    ASSERT_STREQ("special", tokens[0].c_str());
+    ASSERT_STREQ("12yen", tokens[1].c_str());
+    ASSERT_STREQ("and", tokens[2].c_str());
+    ASSERT_STREQ("தமிழ்", tokens[3].c_str());
+    ASSERT_STREQ("你好吗", tokens[4].c_str());
+    ASSERT_STREQ("abcaa123ss12", tokens[5].c_str());
+    ASSERT_STREQ("here", tokens[6].c_str());
+
+    // when normalize is false, should be verbatim
+
+    tokens.clear();
+    Tokenizer(withspecialchars, false, false).tokenize(tokens);
+    ASSERT_EQ(1, tokens.size());
+    ASSERT_STREQ(withspecialchars.c_str(), tokens[0].c_str());
+}
+
+TEST(TokenizerTest, ShouldTokenizeIteratively) {
+    const std::string withnewline = "Michael Jordan:\n\nWelcome, everybody. Welcome!";
+    std::vector<std::string> tokens;
+    Tokenizer tokenizer1(withnewline, true, true);
+
+    std::string token;
+    size_t token_index;
+
+    while(tokenizer1.next(token, token_index)) {
+        tokens.push_back(token);
+    }
+
+    ASSERT_EQ(6, tokens.size());
+    ASSERT_STREQ("michael", tokens[0].c_str());
+    ASSERT_STREQ("jordan", tokens[1].c_str());
+    ASSERT_STREQ("", tokens[2].c_str());
+    ASSERT_STREQ("welcome", tokens[3].c_str());
+    ASSERT_STREQ("everybody", tokens[4].c_str());
+    ASSERT_STREQ("welcome", tokens[5].c_str());
+
+    // verbatim (normalize=false)
+
+    tokens.clear();
+    Tokenizer tokenizer2(withnewline, true, false);
+
+    while(tokenizer2.next(token, token_index)) {
+        tokens.push_back(token);
+    }
+
+    ASSERT_EQ(1, tokens.size());
+    ASSERT_STREQ("Michael Jordan:\n\nWelcome, everybody. Welcome!", tokens[0].c_str());
+}
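
Usage sketch (illustrative only, not part of the committed change): the Tokenizer introduced in include/tokenizer.h above can be driven either in one shot via tokenize() or iteratively via next(), exactly as test/tokenizer_test.cpp exercises it. The main() wrapper, the sample sentence, and the printed output below are assumptions added for illustration.

    #include <iostream>
    #include <string>
    #include <vector>
    #include "tokenizer.h"

    int main() {
        const std::string text = "Michael Jordan:\nWelcome, everybody. Welcome!";

        // One-shot tokenization: keep_empty=false drops empty tokens; normalize=true
        // lowercases ASCII, drops non-alphanumeric ASCII, and transliterates other
        // symbols to ASCII via iconv ASCII//TRANSLIT where possible.
        std::vector<std::string> tokens;
        Tokenizer(text, false, true).tokenize(tokens);
        // tokens is now {"michael", "jordan", "welcome", "everybody", "welcome"}

        // Iterative tokenization: next() also reports a running token index.
        Tokenizer tokenizer(text, false, true);
        std::string token;
        size_t token_index = 0;
        while(tokenizer.next(token, token_index)) {
            std::cout << token_index << ": " << token << "\n";
        }
        return 0;
    }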