Fix highlighting of strings with special characters.
This commit is contained in:
parent fcdd8ec9c9
commit c2eec85277

@@ -389,10 +389,10 @@ public:

static void transform_for_180th_meridian(GeoCoord& point, double offset);

// the following methods are not synchronized because their parent calls are synchronized

art_leaf* get_token_leaf(const std::string & field_name, const unsigned char* token, uint32_t token_len);

// the following methods are not synchronized because their parent calls are synchronized

uint32_t do_filtering(uint32_t** filter_ids_out, const std::vector<filter> & filters) const;

static Option<uint32_t> validate_index_in_memory(nlohmann::json &document, uint32_t seq_id,

@@ -30,6 +30,10 @@ struct TokenOffset {
bool operator>(const TokenOffset &a) const {
return offset > a.offset;
}

bool operator<(const TokenOffset &a) const {
return offset < a.offset;
}
};

struct Match {

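The operator< added above pairs with the existing operator> so that a Match's offsets can be ordered with std::sort before a snippet window is computed (see the sort call added in Collection::highlight_result further down). A minimal sketch of that usage, using a stand-in struct rather than the real TokenOffset:

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    // Stand-in for TokenOffset: assumes a public numeric `offset` member,
    // as the comparators in the hunk above suggest.
    struct OffsetSketch {
        uint16_t offset;
        bool operator<(const OffsetSketch& a) const { return offset < a.offset; }
    };

    void sort_offsets(std::vector<OffsetSketch>& offsets) {
        // mirrors std::sort(match.offsets.begin(), match.offsets.end()) in highlight_result()
        std::sort(offsets.begin(), offsets.end());
    }
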
@@ -8,19 +8,32 @@ class Tokenizer {
private:
const std::string& text;
size_t i;
const bool keep_empty;
const bool keep_separators;
const bool normalize;
const bool no_op;

size_t token_counter = 0;
iconv_t cd;

static const size_t CHARS = 0;
static const size_t SEPARATORS = 1;
size_t stream_mode;

std::stringstream out;

public:

explicit Tokenizer(const std::string& input,
bool keep_empty=true, bool normalize=true, bool no_op=false):
text(input), i(0), keep_empty(keep_empty), normalize(normalize), no_op(no_op) {
bool keep_separators=true, bool normalize=true, bool no_op=false):
text(input), i(0), keep_separators(keep_separators), normalize(normalize), no_op(no_op) {
cd = iconv_open("ASCII//TRANSLIT", "UTF-8");

if(!input.empty() && (std::isalnum(text[0]) || (text[i] & ~0x7f) != 0)) {
// alphanum or non-ascii
stream_mode = CHARS;
} else {
stream_mode = SEPARATORS;
}
}

~Tokenizer() {

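The second constructor flag changes meaning here: instead of keeping empty tokens, the tokenizer now keeps whole separator runs as tokens of their own, tracked via the CHARS/SEPARATORS stream_mode. A hedged usage sketch of the revised interface (argument order taken from the constructor above: input, keep_separators, normalize, no_op; the expected outputs follow the behaviour exercised in the updated tests):

    #include <string>
    #include <vector>
    // #include "tokenizer.h"  // assumed include path

    void tokenizer_usage_sketch() {
        const std::string text = "foo-bar baz";
        std::vector<std::string> tokens;

        // keep_separators = true: separator runs come back as their own tokens,
        // e.g. {"foo", "-", "bar", " ", "baz"}
        Tokenizer(text, true, true, false).tokenize(tokens);

        tokens.clear();

        // keep_separators = false: only word tokens are returned, e.g. {"foo", "bar", "baz"}
        Tokenizer(text, false, true, false).tokenize(tokens);
    }
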
@@ -1223,20 +1223,27 @@ void Collection::parse_search_query(const std::string &query, std::vector<std::s
q_include_tokens = {query};
} else {
std::vector<std::string> tokens;
StringUtils::split(query, tokens, " ");
Tokenizer(query, true, true).tokenize(tokens);
bool exclude_operator_prior = false;

for(std::string& token: tokens) {
if(token[0] == '-') {
std::string&& just_token = token.substr(1);
Tokenizer(just_token, false, true).tokenize(just_token);
if(!just_token.empty()) {
q_exclude_tokens.push_back(just_token);
}
for(const auto& token: tokens) {
if(token.empty()) {
continue;
}

if(token == "-" || token == " -") {
exclude_operator_prior = true;
}

if(!std::isalnum(token[0])) {
continue;
}

if(exclude_operator_prior) {
q_exclude_tokens.push_back(token);
exclude_operator_prior = false;
} else {
Tokenizer(token, false, true).tokenize(token);
if(!token.empty()) {
q_include_tokens.push_back(token);
}
q_include_tokens.push_back(token);
}
}

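With separators preserved, the exclusion operator can be detected even when it arrives glued to whitespace: a "-" (or " -") token merely arms a flag, and the next word token is the one excluded. A self-contained sketch of that two-phase loop, outside the Collection class (function and parameter names here are illustrative, not part of the patch):

    #include <cctype>
    #include <string>
    #include <vector>

    void split_include_exclude(const std::vector<std::string>& tokens,
                               std::vector<std::string>& include_tokens,
                               std::vector<std::string>& exclude_tokens) {
        bool exclude_operator_prior = false;

        for(const auto& token: tokens) {
            if(token.empty()) {
                continue;
            }

            if(token == "-" || token == " -") {
                exclude_operator_prior = true;       // remember that the next word is negated
            }

            if(!std::isalnum(static_cast<unsigned char>(token[0]))) {
                continue;                            // separator tokens themselves are not indexed
            }

            if(exclude_operator_prior) {
                exclude_tokens.push_back(token);
                exclude_operator_prior = false;
            } else {
                include_tokens.push_back(token);
            }
        }
    }
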
@@ -1383,7 +1390,9 @@ void Collection::highlight_result(const field &search_field,
// is from the best matched field and need not be present in other fields of a document.
Index* index = indices[field_order_kv->key % num_memory_shards];
art_leaf *actual_leaf = index->get_token_leaf(search_field.name, &token_leaf->key[0], token_leaf->key_len);

if(actual_leaf != nullptr) {
//LOG(INFO) << "field: " << search_field.name << ", key: " << actual_leaf->key;
query_suggestion.push_back(actual_leaf);
std::vector<uint16_t> positions;
uint32_t doc_index = actual_leaf->values->ids.indexOf(field_order_kv->key);

@@ -1432,66 +1441,84 @@ void Collection::highlight_result(const field &search_field,
std::partial_sort(match_indices.begin(), match_indices.begin()+max_array_matches, match_indices.end());

for(size_t index = 0; index < max_array_matches; index++) {
const match_index_t & match_index = match_indices[index];
const Match & match = match_index.match;
std::sort(match_indices[index].match.offsets.begin(), match_indices[index].match.offsets.end());
const auto& match_index = match_indices[index];
const Match& match = match_index.match;

std::vector<std::string> tokens;
const std::string& text = (search_field.type == field_types::STRING) ? document[search_field.name] : document[search_field.name][match_index.index];
Tokenizer tokenizer(text, true, false);

if(search_field.type == field_types::STRING) {
Tokenizer(document[search_field.name], true, false).tokenize(tokens);
} else {
Tokenizer(document[search_field.name][match_index.index], true, false).tokenize(tokens);
}
std::string raw_token;
size_t raw_token_index = 0;
int indexed_token_index = -1;
size_t match_offset_index = 0;

std::vector<size_t> token_indices;
std::set<size_t> token_indices;
spp::sparse_hash_set<std::string> token_hits;
std::vector<std::string> raw_tokens;
std::unordered_map<size_t, size_t> indexed_to_raw;

for(size_t i = 0; i < match.offsets.size(); i++) {
if(match.offsets[i].offset != MAX_DISPLACEMENT) {
size_t token_index = (size_t)(match.offsets[i].offset);
token_indices.push_back(token_index);
if(token_index >= tokens.size()) {
LOG(ERROR) << "Highlight token index " << token_index << " is greater than length of store field.";
continue;
}
std::string token = tokens[token_index];
Tokenizer(token, true, true).tokenize(token);

token_hits.insert(token);
while(tokenizer.next(raw_token, raw_token_index)) {
if(!raw_token.empty() && (std::isalnum(raw_token[0]) || (raw_token[0] & ~0x7f) != 0)) {
// check for actual token (first char is NOT alphanum or ascii)
indexed_token_index++;
indexed_to_raw[indexed_token_index] = raw_token_index;
/*LOG(INFO) << "raw_token: " << raw_token << ", indexed_token_index: " << indexed_token_index
<< ", raw_token_index: " << raw_token_index;*/
}

if (match_offset_index < match.offsets.size() &&
match.offsets[match_offset_index].offset == indexed_token_index) {
std::string indexed_token;
Tokenizer(raw_token, true, true).tokenize(indexed_token);

if(token_indices.count(indexed_token_index) == 0) {
// repetition could occur, for e.g. in the case of synonym constructed queries
token_indices.insert(indexed_token_index);
token_hits.insert(indexed_token);
}

match_offset_index++;
}

raw_tokens.push_back(raw_token);
}

size_t num_indexed_tokens = indexed_token_index + 1;
auto minmax = std::minmax_element(token_indices.begin(), token_indices.end());

size_t prefix_length = highlight_affix_num_tokens;
size_t suffix_length = highlight_affix_num_tokens + 1;
size_t suffix_length = highlight_affix_num_tokens;

// For longer strings, pick surrounding tokens within `prefix_length` of min_index and max_index for snippet
const size_t start_index = (tokens.size() <= snippet_threshold) ? 0 :
std::max(0, (int)(*(minmax.first) - prefix_length));
if(num_indexed_tokens == 0) {
continue;
}

const size_t end_index = (tokens.size() <= snippet_threshold) ? tokens.size() :
std::min((int)tokens.size(), (int)(*(minmax.second) + suffix_length));
// For longer strings, pick surrounding raw_tokens within `prefix_length` of min_index and max_index for snippet
const size_t start_index = (num_indexed_tokens <= snippet_threshold) ? 0 :
indexed_to_raw[std::max(0, (int)(*(minmax.first) - prefix_length))];

const size_t end_index = (num_indexed_tokens <= snippet_threshold) ? raw_tokens.size() - 1 :
indexed_to_raw[std::min((int)num_indexed_tokens - 1, (int)(*(minmax.second) + suffix_length))];

std::stringstream snippet_stream;

highlight.matched_tokens.emplace_back();
std::vector<std::string>& matched_tokens = highlight.matched_tokens.back();
size_t snippet_index = start_index;

for(size_t snippet_index = start_index; snippet_index < end_index; snippet_index++) {
if(snippet_index != start_index) {
snippet_stream << " ";
}
while(snippet_index <= end_index) {
std::string normalized_token;
Tokenizer(raw_tokens[snippet_index], true, true).tokenize(normalized_token);

std::string token = tokens[snippet_index];
Tokenizer(token, true, true).tokenize(token);

if(token_hits.count(token) != 0) {
snippet_stream << highlight_start_tag << tokens[snippet_index] << highlight_end_tag;
matched_tokens.push_back(tokens[snippet_index]);
if(token_hits.count(normalized_token) != 0) {
snippet_stream << highlight_start_tag << raw_tokens[snippet_index] << highlight_end_tag;
matched_tokens.push_back(raw_tokens[snippet_index]);
} else {
snippet_stream << tokens[snippet_index];
snippet_stream << raw_tokens[snippet_index];
}

snippet_index++;
}

highlight.snippets.push_back(snippet_stream.str());

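The core change above: the stored text is walked once with a separator-keeping Tokenizer, every raw token (word or separator) is recorded, and each indexed word position is mapped to its position in the raw stream. The snippet window is then computed over raw tokens, so punctuation such as "://" or a newline survives in the output. A hedged, self-contained sketch of that mapping step (is_word stands in for the "first char is alphanumeric or non-ASCII" check used above):

    #include <cctype>
    #include <string>
    #include <unordered_map>
    #include <vector>

    static bool is_word(const std::string& raw_token) {
        return !raw_token.empty() &&
               (std::isalnum(static_cast<unsigned char>(raw_token[0])) || (raw_token[0] & ~0x7f) != 0);
    }

    // Given raw tokens (words *and* separators, in order), map the position of each
    // word among the words only (the "indexed" position that match offsets refer to)
    // to its position in the raw token stream.
    std::unordered_map<size_t, size_t> build_indexed_to_raw(const std::vector<std::string>& raw_tokens) {
        std::unordered_map<size_t, size_t> indexed_to_raw;
        size_t indexed_token_index = 0;

        for(size_t raw_index = 0; raw_index < raw_tokens.size(); raw_index++) {
            if(is_word(raw_tokens[raw_index])) {
                indexed_to_raw[indexed_token_index++] = raw_index;
            }
        }

        return indexed_to_raw;
    }

With that map, the window [min_offset - prefix, max_offset + suffix] over indexed positions translates directly to raw-token positions, and the snippet is rebuilt by concatenating those raw tokens, wrapping the matched ones in the highlight tags.
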
@@ -1501,18 +1528,14 @@ void Collection::highlight_result(const field &search_field,

if(highlighted_fully) {
std::stringstream value_stream;
for(size_t value_index = 0; value_index < tokens.size(); value_index++) {
if(value_index != 0) {
value_stream << " ";
}
for(size_t value_index = 0; value_index < raw_tokens.size(); value_index++) {
std::string normalized_token;
Tokenizer(raw_tokens[value_index], true, true).tokenize(normalized_token);

std::string token = tokens[value_index];
Tokenizer(token, true, true).tokenize(token);

if(token_hits.count(token) != 0) {
value_stream << highlight_start_tag << tokens[value_index] << highlight_end_tag;
if(token_hits.count(normalized_token) != 0) {
value_stream << highlight_start_tag << raw_tokens[value_index] << highlight_end_tag;
} else {
value_stream << tokens[value_index];
value_stream << raw_tokens[value_index];
}
}

@@ -575,7 +575,7 @@ void Index::index_string_field(const std::string & text, const int64_t score, ar
uint32_t seq_id, bool is_facet, const field & a_field) {
std::unordered_map<std::string, std::vector<uint32_t>> token_to_offsets;

Tokenizer tokenizer(text, true, true, !a_field.is_string());
Tokenizer tokenizer(text, false, true, !a_field.is_string());
std::string token;
size_t token_index = 0;

@@ -588,7 +588,6 @@ void Index::index_string_field(const std::string & text, const int64_t score, ar

if(is_facet) {
uint64_t hash = facet_token_hash(a_field, token);
//facet_index_v2[seq_id][facet_id].push_back(hash);
facet_hashes.push_back(hash);
}

@@ -623,7 +622,7 @@ void Index::index_string_array_field(const std::vector<std::string> & strings, c
const std::string& str = strings[array_index];
std::set<std::string> token_set; // required to deal with repeating tokens

Tokenizer tokenizer(str, true, true, !a_field.is_string());
Tokenizer tokenizer(str, false, true, !a_field.is_string());
std::string token;
size_t token_index = 0;

@@ -2216,6 +2215,8 @@ void Index::populate_token_positions(const std::vector<art_leaf *>& query_sugges
for(size_t i = 0; i < query_suggestion.size(); i++) {
const art_leaf* token_leaf = query_suggestion[i];
uint32_t doc_index = leaf_to_indices[i][result_index];
/*LOG(INFO) << "doc_id: " << token_leaf->values->ids.at(doc_index) << ", token_leaf->values->ids.getLength(): "
<< token_leaf->values->ids.getLength();*/

// it's possible for a query token to not appear in a resulting document
if(doc_index == token_leaf->values->ids.getLength()) {

@@ -2229,7 +2230,14 @@ void Index::populate_token_positions(const std::vector<art_leaf *>& query_sugges
/*uint32_t* offsets = token_leaf->values->offsets.uncompress();
for(size_t ii=0; ii < token_leaf->values->offsets.getLength(); ii++) {
LOG(INFO) << "offset: " << offsets[ii];
}*/
}

uint32_t* offset_indices = token_leaf->values->offset_index.uncompress();
for(size_t ii=0; ii < token_leaf->values->offset_index.getLength(); ii++) {
LOG(INFO) << "offset index: " << offset_indices[ii];
}

LOG(INFO) << "token_leaf->values->offsets.getLength(): " << token_leaf->values->offsets.getLength();*/

uint32_t start_offset = token_leaf->values->offset_index.at(doc_index);
uint32_t end_offset = (doc_index == token_leaf->values->ids.getLength() - 1) ?

@@ -2464,6 +2472,7 @@ void Index::tokenize_doc_field(const nlohmann::json& document, const field& sear
}

art_leaf* Index::get_token_leaf(const std::string & field_name, const unsigned char* token, uint32_t token_len) {
std::shared_lock lock(mutex);
const art_tree *t = search_index.at(field_name);
return (art_leaf*) art_search(t, token, (int) token_len);
}

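get_token_leaf() now takes a shared (read) lock because highlight_result() calls it directly on a shard's Index, outside the call paths that were already synchronized. A hedged sketch of the same reader/writer locking pattern on a toy container, assuming a std::shared_mutex member named mutex as in the hunk above (the class and methods here are illustrative):

    #include <shared_mutex>
    #include <string>
    #include <unordered_map>

    class ReadMostlyIndex {
    public:
        int lookup(const std::string& key) const {
            std::shared_lock lock(mutex);     // many readers may hold this concurrently
            auto it = data.find(key);
            return it == data.end() ? -1 : it->second;
        }

        void insert(const std::string& key, int value) {
            std::unique_lock lock(mutex);     // writers take exclusive ownership
            data[key] = value;
        }

    private:
        mutable std::shared_mutex mutex;
        std::unordered_map<std::string, int> data;
    };
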
@@ -2,62 +2,63 @@
#include "tokenizer.h"

bool Tokenizer::next(std::string &token, size_t& token_index) {
std::stringstream out;

if(i >= text.size()) {
if(i == text.size() && !text.empty() && text.back() == ' ') {
token = "";
i++;
return true;
if(no_op) {
if(i == text.size()) {
return false;
}

return false;
}

if(no_op) {
token = text;
i = text.size();
return true;
}

while(i < text.size()) {
if((text[i] & ~0x7f) == 0 ) {
// ASCII character: split on space/newline or lowercase otherwise
if(std::isalnum(text[i])) {
bool is_ascii = (text[i] & ~0x7f) == 0;
if(is_ascii) {
const size_t next_stream_mode = std::isalnum(text[i]) ? CHARS : SEPARATORS;

if(next_stream_mode != stream_mode) {
// We tokenize when `stream_mode` changes
token = out.str();

out.str(std::string());
if(normalize) {
out << char(std::tolower(text[i]));
} else {
out << text[i];
}
i++;

if(stream_mode == SEPARATORS && !keep_separators) {
stream_mode = next_stream_mode;
continue;
}

token_index = token_counter++;
stream_mode = next_stream_mode;
return true;
} else {
bool is_space = text[i] == 32;
bool is_new_line = text[i] == 10;
bool is_whitespace = is_space || is_new_line;

bool next_char_alphanum = (i != text.length() - 1) && std::isalnum(text[i + 1]);

if(!normalize && !is_whitespace && (i == text.length() - 1 || !next_char_alphanum)) {
// checking for next char ensures that `foo-bar` does not get split to `foo-`
if(normalize) {
out << char(std::tolower(text[i]));
} else {
out << text[i];
}

if(is_whitespace || next_char_alphanum) {
// we split on space or on a special character whose next char is alphanumeric
token = out.str();
out.clear();
i++;

if(!keep_empty && token.empty()) {
continue;
}

token_index = token_counter++;
return true;
}
i++;
continue;
}
}

i++;
continue;
if(stream_mode == SEPARATORS) { // to detect first non-ascii character
// we will tokenize now and treat the following non-ascii chars as a different token
stream_mode = CHARS;
token = out.str();
out.str(std::string());

if(keep_separators) {
token_index = token_counter++;
return true;
}
}

char inbuf[5];

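The rewritten next() is essentially a two-state machine over the ASCII path: characters accumulate into `out` while the stream mode stays the same, and a token is emitted whenever the mode flips between CHARS and SEPARATORS, with separator runs dropped when keep_separators is false. A simplified, ASCII-only sketch of that run splitting (the real code additionally lowercases when normalize is set and routes non-ASCII bytes through iconv):

    #include <cctype>
    #include <string>
    #include <vector>

    std::vector<std::string> split_runs(const std::string& text, bool keep_separators) {
        std::vector<std::string> tokens;
        std::string run;
        bool in_word = !text.empty() && std::isalnum(static_cast<unsigned char>(text[0]));

        auto flush = [&]() {
            if(run.empty()) return;
            if(in_word || keep_separators) tokens.push_back(run);  // drop separator runs if not kept
            run.clear();
        };

        for(char c : text) {
            bool is_word_char = std::isalnum(static_cast<unsigned char>(c)) != 0;
            if(is_word_char != in_word) {   // mode change: emit the finished run
                flush();
                in_word = is_word_char;
            }
            run += c;
        }
        flush();                            // trailing run

        return tokens;
    }

    // split_runs("https://www.amazon.com", true)  -> {"https", "://", "www", ".", "amazon", ".", "com"}
    // split_runs("https://www.amazon.com", false) -> {"https", "www", "amazon", "com"}
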
@@ -90,18 +91,17 @@ bool Tokenizer::next(std::string &token, size_t& token_index) {
// symbol cannot be represented as ASCII, so write the original symbol
out << inbuf;
} else {
// NOTE: outsize indicates bytes available AFTER current position so have to do <=
for(size_t out_index=0; out_index<5; out_index++) {
if(!normalize) {
out << outbuf[out_index];
continue;
}

bool is_ascii = ((outbuf[out_index] & ~0x7f) == 0);
bool keep_char = !is_ascii || std::isalnum(outbuf[out_index]);
bool unicode_is_ascii = ((outbuf[out_index] & ~0x7f) == 0);
bool keep_char = !unicode_is_ascii || std::isalnum(outbuf[out_index]);

if(keep_char) {
if(is_ascii && std::isalnum(outbuf[out_index])) {
if(unicode_is_ascii && std::isalnum(outbuf[out_index])) {
outbuf[out_index] = char(std::tolower(outbuf[out_index]));
}
out << outbuf[out_index];

@@ -111,9 +111,13 @@ bool Tokenizer::next(std::string &token, size_t& token_index) {
}

token = out.str();
out.clear();
out.str(std::string());

if(!keep_empty && token.empty()) {
if(token.empty()) {
return false;
}

if(!std::isalnum(token[0]) && !keep_separators) {
return false;
}

@@ -359,8 +359,8 @@ TEST_F(CollectionSynonymsTest, MultiWaySynonym) {

ASSERT_EQ(2, res["hits"].size());
ASSERT_EQ(2, res["found"].get<uint32_t>());
ASSERT_STREQ("<mark>Samuel</mark> <mark>L.</mark> <mark>Jackson</mark>", res["hits"][0]["highlights"][0]["snippet"].get<std::string>().c_str());
ASSERT_STREQ("<mark>Samuel</mark> <mark>L.</mark> <mark>Jackson</mark>", res["hits"][1]["highlights"][0]["snippet"].get<std::string>().c_str());
ASSERT_STREQ("<mark>Samuel</mark> <mark>L</mark>. <mark>Jackson</mark>", res["hits"][0]["highlights"][0]["snippet"].get<std::string>().c_str());
ASSERT_STREQ("<mark>Samuel</mark> <mark>L</mark>. <mark>Jackson</mark>", res["hits"][1]["highlights"][0]["snippet"].get<std::string>().c_str());

// for now we don't support synonyms on ANY prefix

@@ -164,7 +164,7 @@ TEST_F(CollectionTest, PhraseSearch) {
13: score: 12, (single word match)
*/

std::vector<std::string> ids = {"8", "1", "16", "17", "13"};
std::vector<std::string> ids = {"8", "1", "17", "16", "13"};

for(size_t i = 0; i < results["hits"].size(); i++) {
nlohmann::json result = results["hits"].at(i);

@@ -184,7 +184,7 @@ TEST_F(CollectionTest, PhraseSearch) {
ASSERT_EQ(5, results["hits"].size());
ASSERT_EQ(5, results["found"].get<uint32_t>());

ids = {"8", "1", "17", "16", "13"};
ids = {"8", "17", "1", "16", "13"};

for(size_t i = 0; i < results["hits"].size(); i++) {
nlohmann::json result = results["hits"].at(i);

@@ -200,7 +200,7 @@ TEST_F(CollectionTest, PhraseSearch) {

ASSERT_EQ(3, results["request_params"]["per_page"].get<size_t>());

ids = {"8", "1", "16"};
ids = {"8", "1", "17"};

for(size_t i = 0; i < 3; i++) {
nlohmann::json result = results["hits"].at(i);

@@ -1958,7 +1958,7 @@ TEST_F(CollectionTest, SearchLargeTextField) {

ASSERT_EQ(1, results["hits"].size());

ASSERT_STREQ("non arcu id lectus <mark>accumsan</mark> venenatis at at justo.",
ASSERT_STREQ("non arcu id lectus <mark>accumsan</mark> venenatis at at justo",
results["hits"][0]["highlights"][0]["snippet"].get<std::string>().c_str());

collectionManager.drop_collection("coll_large_text");

@@ -2141,7 +2141,7 @@ TEST_F(CollectionTest, SearchHighlightWithNewLine) {
token_ordering::FREQUENCY, true, 10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 40, {}, {}, {}, 0).get();

ASSERT_STREQ("Blah, blah <mark>Stark</mark> Industries",
ASSERT_STREQ("Blah, blah\n<mark>Stark</mark> Industries",
res["hits"][0]["highlights"][0]["snippet"].get<std::string>().c_str());

ASSERT_STREQ("Stark", res["hits"][0]["highlights"][0]["matched_tokens"][0].get<std::string>().c_str());

@@ -3184,7 +3184,7 @@ TEST_F(CollectionTest, SearchingForRecordsWithSpecialChars) {

std::vector<std::vector<std::string>> records = {
{"Amazon Home", "https://amazon.com/"},
{"Google Home", "https://google.com/"},
{"Google Home", "https://google.com///"},
{"Github Issue", "https://github.com/typesense/typesense/issues/241"},
{"Amazon Search", "https://www.amazon.com/s?k=phone&ref=nb_sb_noss_2"},
};

@@ -3206,12 +3206,17 @@ TEST_F(CollectionTest, SearchingForRecordsWithSpecialChars) {
ASSERT_EQ(1, results["found"].get<size_t>());
ASSERT_STREQ("1", results["hits"][0]["document"]["id"].get<std::string>().c_str());

ASSERT_EQ(2, results["hits"][0]["highlights"].size());
ASSERT_EQ("<mark>Google</mark> Home", results["hits"][0]["highlights"][0]["snippet"].get<std::string>());
ASSERT_EQ("https://<mark>google</mark>.com///", results["hits"][0]["highlights"][1]["snippet"].get<std::string>());

results = coll1->search("amazon.com",
{"title", "url"}, "", {}, {}, 2, 10, 1, FREQUENCY).get();

ASSERT_EQ(2, results["found"].get<size_t>());
ASSERT_EQ(3, results["found"].get<size_t>());
ASSERT_STREQ("3", results["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("0", results["hits"][1]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("1", results["hits"][2]["document"]["id"].get<std::string>().c_str());

results = coll1->search("typesense",
{"title", "url"}, "", {}, {}, 2, 10, 1, FREQUENCY).get();

@@ -3225,5 +3230,9 @@ TEST_F(CollectionTest, SearchingForRecordsWithSpecialChars) {
ASSERT_EQ(1, results["found"].get<size_t>());
ASSERT_STREQ("3", results["hits"][0]["document"]["id"].get<std::string>().c_str());

ASSERT_EQ(1, results["hits"][0]["highlights"].size());
ASSERT_EQ("https://www.amazon.com/s?k=phone&ref=<mark>nb</mark>_<mark>sb</mark>_<mark>noss</mark>_<mark>2</mark>",
results["hits"][0]["highlights"][0]["snippet"].get<std::string>());

collectionManager.drop_collection("coll1");
}
}

@@ -4,25 +4,23 @@
TEST(TokenizerTest, ShouldTokenizeNormalizeDifferentStrings) {
const std::string withnewline = "Michael Jordan:\nWelcome, everybody. Welcome! ";
std::vector<std::string> tokens;
Tokenizer(withnewline, true, true, false).tokenize(tokens);
ASSERT_EQ(6, tokens.size());
Tokenizer(withnewline, false, true, false).tokenize(tokens);
ASSERT_EQ(5, tokens.size());
ASSERT_STREQ("michael", tokens[0].c_str());
ASSERT_STREQ("jordan", tokens[1].c_str());
ASSERT_STREQ("welcome", tokens[2].c_str());
ASSERT_STREQ("everybody", tokens[3].c_str());
ASSERT_STREQ("welcome", tokens[4].c_str());
ASSERT_STREQ("", tokens[5].c_str());

const std::string withspaces = " Michael Jordan ";
tokens.clear();
Tokenizer(withspaces, true, true, false).tokenize(tokens);
ASSERT_EQ(6, tokens.size());
ASSERT_STREQ("", tokens[0].c_str());
ASSERT_EQ(5, tokens.size());
ASSERT_STREQ(" ", tokens[0].c_str());
ASSERT_STREQ("michael", tokens[1].c_str());
ASSERT_STREQ("", tokens[2].c_str());
ASSERT_STREQ(" ", tokens[2].c_str());
ASSERT_STREQ("jordan", tokens[3].c_str());
ASSERT_STREQ("", tokens[4].c_str());
ASSERT_STREQ("", tokens[5].c_str());
ASSERT_STREQ(" ", tokens[4].c_str());

tokens.clear();
Tokenizer(withspaces, false, true, false).tokenize(tokens);

@@ -30,38 +28,6 @@ TEST(TokenizerTest, ShouldTokenizeNormalizeDifferentStrings) {
ASSERT_STREQ("michael", tokens[0].c_str());
ASSERT_STREQ("jordan", tokens[1].c_str());

const std::string withspecialchars = "Special ½¥ and தமிழ் 你好吗 abcÅà123ß12 here.";
tokens.clear();
Tokenizer(withspecialchars, false, true, false).tokenize(tokens);
ASSERT_EQ(7, tokens.size());
ASSERT_STREQ("special", tokens[0].c_str());
ASSERT_STREQ("12yen", tokens[1].c_str());
ASSERT_STREQ("and", tokens[2].c_str());
ASSERT_STREQ("தமிழ்", tokens[3].c_str());
ASSERT_STREQ("你好吗", tokens[4].c_str());
ASSERT_STREQ("abcaa123ss12", tokens[5].c_str());
ASSERT_STREQ("here", tokens[6].c_str());

// when normalization is disabled and keep empty is enabled
const std::string withoutnormalize = "Mise à, jour.";
tokens.clear();
Tokenizer(withoutnormalize, true, false, false).tokenize(tokens);
ASSERT_EQ(5, tokens.size());
ASSERT_STREQ("Mise", tokens[0].c_str());
ASSERT_STREQ("", tokens[1].c_str());
ASSERT_STREQ("à,", tokens[2].c_str());
ASSERT_STREQ("", tokens[3].c_str());
ASSERT_STREQ("jour.", tokens[4].c_str());

// when normalization and keep empty are disabled
const std::string withoutnormalizeandkeepempty = "Mise à jour.";
tokens.clear();
Tokenizer(withoutnormalizeandkeepempty, false, false, false).tokenize(tokens);
ASSERT_EQ(3, tokens.size());
ASSERT_STREQ("Mise", tokens[0].c_str());
ASSERT_STREQ("à", tokens[1].c_str());
ASSERT_STREQ("jour.", tokens[2].c_str());

// single token
const std::string single_token = "foobar";
tokens.clear();

@@ -89,22 +55,82 @@ TEST(TokenizerTest, ShouldTokenizeNormalizeDifferentStrings) {
const std::string multispace_tokens = "foo bar";
tokens.clear();
Tokenizer(multispace_tokens, true, false, false).tokenize(tokens);
ASSERT_EQ(6, tokens.size());
ASSERT_EQ(3, tokens.size());
ASSERT_STREQ("foo", tokens[0].c_str());
ASSERT_STREQ("", tokens[1].c_str());
ASSERT_STREQ("", tokens[2].c_str());
ASSERT_STREQ("", tokens[3].c_str());
ASSERT_STREQ("", tokens[4].c_str());
ASSERT_STREQ("bar", tokens[5].c_str());
ASSERT_STREQ(" ", tokens[1].c_str());
ASSERT_STREQ("bar", tokens[2].c_str());

// special chars
const std::string specialchar_tokens = "https://www.amazon.com/s?k=phone&ref=nb_sb_noss_2";;
tokens.clear();
Tokenizer(specialchar_tokens, true, false, false).tokenize(tokens);
ASSERT_EQ(23, tokens.size());
ASSERT_STREQ("https", tokens[0].c_str());
ASSERT_STREQ("://", tokens[1].c_str());
ASSERT_STREQ("www", tokens[2].c_str());
ASSERT_STREQ(".", tokens[3].c_str());
ASSERT_STREQ("noss", tokens[20].c_str());
ASSERT_STREQ("_", tokens[21].c_str());
ASSERT_STREQ("2", tokens[22].c_str());

// noop

tokens.clear();
const std::string withspecialchars = "Special ½¥ and தமிழ் 你好吗 abcÅà123ß12 here.";
Tokenizer(withspecialchars, false, true, true).tokenize(tokens);
ASSERT_EQ(1, tokens.size());
ASSERT_STREQ(withspecialchars.c_str(), tokens[0].c_str());
}

TEST(TokenizerTest, ShouldTokenizeNormalizeUnicodeStrings) {
std::vector<std::string> tokens;

const std::string withspecialchars = "Special ½¥ and -தமிழ் 你2好吗 abcÅà123ß12 here.";
tokens.clear();
Tokenizer(withspecialchars, false, true, false).tokenize(tokens);
ASSERT_EQ(7, tokens.size());
ASSERT_STREQ("special", tokens[0].c_str());
ASSERT_STREQ("12yen", tokens[1].c_str());
ASSERT_STREQ("and", tokens[2].c_str());
ASSERT_STREQ("தமிழ்", tokens[3].c_str());
ASSERT_STREQ("你2好吗", tokens[4].c_str());
ASSERT_STREQ("abcaa123ss12", tokens[5].c_str());
ASSERT_STREQ("here", tokens[6].c_str());

// when normalization is disabled and keep empty is enabled
const std::string withoutnormalize = "Mise à, jour.";
tokens.clear();
Tokenizer(withoutnormalize, true, false, false).tokenize(tokens);
ASSERT_EQ(6, tokens.size());
ASSERT_STREQ("Mise", tokens[0].c_str());
ASSERT_STREQ(" ", tokens[1].c_str());
ASSERT_STREQ("à", tokens[2].c_str());
ASSERT_STREQ(", ", tokens[3].c_str());
ASSERT_STREQ("jour", tokens[4].c_str());
ASSERT_STREQ(".", tokens[5].c_str());

// when normalization and keep empty are disabled
const std::string withoutnormalizeandkeepempty = "Mise à jour.";
tokens.clear();
Tokenizer(withoutnormalizeandkeepempty, false, false, false).tokenize(tokens);
ASSERT_EQ(3, tokens.size());
ASSERT_STREQ("Mise", tokens[0].c_str());
ASSERT_STREQ("à", tokens[1].c_str());
ASSERT_STREQ("jour", tokens[2].c_str());

// single accented word tokenization
std::string singleword = "à";
tokens.clear();
Tokenizer(singleword, false, true, false).tokenize(tokens);
ASSERT_EQ(1, tokens.size());
ASSERT_STREQ("a", tokens[0].c_str());

tokens.clear();
Tokenizer(singleword, true, true, false).tokenize(tokens);
ASSERT_EQ(1, tokens.size());
ASSERT_STREQ("a", tokens[0].c_str());
}

TEST(TokenizerTest, ShouldTokenizeIteratively) {
const std::string withnewline = "Michael Jordan:\n\nWelcome, everybody. Welcome!";
std::vector<std::string> tokens;

@@ -117,20 +143,34 @@ TEST(TokenizerTest, ShouldTokenizeIteratively) {
tokens.push_back(token);
}

ASSERT_EQ(6, tokens.size());
ASSERT_EQ(10, tokens.size());
ASSERT_STREQ("michael", tokens[0].c_str());
ASSERT_STREQ("jordan", tokens[1].c_str());
ASSERT_STREQ("", tokens[2].c_str());
ASSERT_STREQ("welcome", tokens[3].c_str());
ASSERT_STREQ("everybody", tokens[4].c_str());
ASSERT_STREQ("welcome", tokens[5].c_str());
ASSERT_STREQ(" ", tokens[1].c_str());
ASSERT_STREQ("jordan", tokens[2].c_str());
ASSERT_STREQ(":\n\n", tokens[3].c_str());
ASSERT_STREQ("welcome", tokens[4].c_str());
ASSERT_STREQ(", ", tokens[5].c_str());
ASSERT_STREQ("everybody", tokens[6].c_str());
ASSERT_STREQ(". ", tokens[7].c_str());
ASSERT_STREQ("welcome", tokens[8].c_str());
ASSERT_STREQ("!", tokens[9].c_str());

// check for index when separators are not kept
Tokenizer tokenizer2(withnewline, false, true, false);
size_t expected_token_index = 0;
std::vector<std::string> expected_tokens = {"michael", "jordan", "welcome", "everybody", "welcome"};
while(tokenizer2.next(token, token_index)) {
ASSERT_EQ(expected_token_index, token_index);
ASSERT_EQ(expected_tokens[expected_token_index], token);
expected_token_index++;
}

// verbatim (no_op=true)

tokens.clear();
Tokenizer tokenizer2(withnewline, true, false, true);
Tokenizer tokenizer3(withnewline, true, false, true);

while(tokenizer2.next(token, token_index)) {
while(tokenizer3.next(token, token_index)) {
tokens.push_back(token);
}

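One property these tests lean on, and that the new highlighter depends on when it concatenates raw tokens back into a snippet: with separators kept and normalization off, the emitted tokens concatenate back to the original text. A hedged round-trip sketch of that idea for plain ASCII input (not part of the patch):

    #include <cassert>
    #include <string>
    // #include "tokenizer.h"  // assumed include path

    void roundtrip_check(const std::string& text) {
        Tokenizer tokenizer(text, true, false, false);   // keep_separators, no normalize, no no_op
        std::string token, rebuilt;
        size_t token_index = 0;

        while(tokenizer.next(token, token_index)) {
            rebuilt += token;
        }

        assert(rebuilt == text);   // expected to hold for ASCII input under the new scheme
    }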