mirror of
https://github.com/typesense/typesense.git
synced 2025-05-19 13:12:22 +08:00
Fix bad unicode characters in highlight snippet.
This commit is contained in:
parent
ec9da2ccf3
commit
a912a250ff
@ -61,6 +61,11 @@ bool Tokenizer::next(std::string &token, size_t& token_index) {
|
||||
*p = 0;
|
||||
size_t insize = (p - &inbuf[0]);
|
||||
|
||||
if(!normalize) {
|
||||
out << inbuf;
|
||||
continue;
|
||||
}
|
||||
|
||||
char outbuf[5] = {};
|
||||
size_t outsize = sizeof(outbuf);
|
||||
char *outptr = outbuf;
|
||||
|
@ -2637,3 +2637,41 @@ TEST_F(CollectionTest, MultiFieldRelevance) {
|
||||
|
||||
collectionManager.drop_collection("coll1");
|
||||
}
|
||||
|
||||
// Checks that the highlight snippet for a title containing an accented
// character ("à") is returned with that character intact and with the matched
// query word wrapped in <mark> tags.
TEST_F(CollectionTest, HighlightWithAccentedCharacters) {
|
||||
Collection *coll1;
|
||||
|
||||
// Schema: a string "title" field and an int32 "points" field.
std::vector<field> fields = {field("title", field_types::STRING, false),
|
||||
field("points", field_types::INT32, false),};
|
||||
|
||||
// Reuse "coll1" when it already exists; otherwise create it from the schema above.
coll1 = collectionManager.get_collection("coll1");
|
||||
if (coll1 == nullptr) {
|
||||
// NOTE(review): "points" is presumably the default sorting field and 4 a sizing
// parameter -- confirm against create_collection's signature.
coll1 = collectionManager.create_collection("coll1", 4, fields, "points").get();
|
||||
}
|
||||
|
||||
// Only the first title contains the query word "jour"; it also contains the
// accented character that the snippet must preserve.
std::vector<std::vector<std::string>> records = {
|
||||
{"Mise à jour Timy depuis PC"},
|
||||
{"Down There by the Train"},
|
||||
{"State Trooper"},
|
||||
};
|
||||
|
||||
// Index every record, using its position as both the string id and the points value.
for (size_t i = 0; i < records.size(); i++) {
|
||||
nlohmann::json doc;
|
||||
|
||||
doc["id"] = std::to_string(i);
|
||||
doc["title"] = records[i][0];
|
||||
doc["points"] = i;
|
||||
|
||||
ASSERT_TRUE(coll1->add(doc.dump()).ok());
|
||||
}
|
||||
|
||||
// Query "jour" against the "title" field.
auto results = coll1->search("jour", {"title"}, "", {}, {}, 0, 10, 1, FREQUENCY).get();
|
||||
|
||||
// Exactly one document is expected to match.
ASSERT_EQ(1, results["found"].get<size_t>());
|
||||
ASSERT_EQ(1, results["hits"].size());
|
||||
|
||||
// The snippet must equal the original title with only "jour" wrapped in <mark>;
// the "à" preceding the match must come through unchanged.
ASSERT_STREQ("Mise à <mark>jour</mark> Timy depuis PC",
|
||||
results["hits"][0]["highlights"][0]["snippet"].get<std::string>().c_str());
|
||||
|
||||
// Drop the collection at the end of the test.
collectionManager.drop_collection("coll1");
|
||||
}
|
||||
|
@ -42,6 +42,26 @@ TEST(TokenizerTest, ShouldTokenizeNormalizeDifferentStrings) {
|
||||
ASSERT_STREQ("abcaa123ss12", tokens[5].c_str());
|
||||
ASSERT_STREQ("here", tokens[6].c_str());
|
||||
|
||||
// when normalization is disabled and keep empty is enabled
|
||||
const std::string withoutnormalize = "Mise à jour.";
|
||||
tokens.clear();
|
||||
Tokenizer(withoutnormalize, true, false, false).tokenize(tokens);
|
||||
ASSERT_EQ(5, tokens.size());
|
||||
ASSERT_STREQ("Mise", tokens[0].c_str());
|
||||
ASSERT_STREQ("", tokens[1].c_str());
|
||||
ASSERT_STREQ("à", tokens[2].c_str());
|
||||
ASSERT_STREQ("", tokens[3].c_str());
|
||||
ASSERT_STREQ("jour.", tokens[4].c_str());
|
||||
|
||||
// when normalization and keep empty are disabled
|
||||
const std::string withoutnormalizeandkeepempty = "Mise à jour.";
|
||||
tokens.clear();
|
||||
Tokenizer(withoutnormalizeandkeepempty, false, false, false).tokenize(tokens);
|
||||
ASSERT_EQ(3, tokens.size());
|
||||
ASSERT_STREQ("Mise", tokens[0].c_str());
|
||||
ASSERT_STREQ("à", tokens[1].c_str());
|
||||
ASSERT_STREQ("jour.", tokens[2].c_str());
|
||||
|
||||
// noop
|
||||
|
||||
tokens.clear();
|
||||
|
Loading…
x
Reference in New Issue
Block a user