Fix bad unicode characters in highlight snippet.

This commit is contained in:
kishorenc 2020-11-28 08:22:58 +05:30
parent ec9da2ccf3
commit a912a250ff
3 changed files with 63 additions and 0 deletions

View File

@ -61,6 +61,11 @@ bool Tokenizer::next(std::string &token, size_t& token_index) {
*p = 0;
size_t insize = (p - &inbuf[0]);
if(!normalize) {
out << inbuf;
continue;
}
char outbuf[5] = {};
size_t outsize = sizeof(outbuf);
char *outptr = outbuf;

View File

@ -2637,3 +2637,41 @@ TEST_F(CollectionTest, MultiFieldRelevance) {
collectionManager.drop_collection("coll1");
}
TEST_F(CollectionTest, HighlightWithAccentedCharacters) {
    // Regression test: a highlight snippet built around a match must keep
    // accented (multi-byte UTF-8) characters intact rather than mangling them.
    std::vector<field> fields = {field("title", field_types::STRING, false),
                                 field("points", field_types::INT32, false),};

    Collection* coll1 = collectionManager.get_collection("coll1");
    if(coll1 == nullptr) {
        coll1 = collectionManager.create_collection("coll1", 4, fields, "points").get();
    }

    // First record contains the accented word "à"; the others are fillers.
    const std::vector<std::vector<std::string>> records = {
        {"Mise à jour Timy depuis PC"},
        {"Down There by the Train"},
        {"State Trooper"},
    };

    size_t seq = 0;
    for(const auto& record : records) {
        nlohmann::json doc;
        doc["id"] = std::to_string(seq);
        doc["title"] = record[0];
        doc["points"] = seq;
        ASSERT_TRUE(coll1->add(doc.dump()).ok());
        seq++;
    }

    auto results = coll1->search("jour", {"title"}, "", {}, {}, 0, 10, 1, FREQUENCY).get();

    // Exactly one document matches, and its snippet must preserve "à" as-is.
    ASSERT_EQ(1, results["found"].get<size_t>());
    ASSERT_EQ(1, results["hits"].size());
    ASSERT_STREQ("Mise à <mark>jour</mark> Timy depuis PC",
                 results["hits"][0]["highlights"][0]["snippet"].get<std::string>().c_str());

    collectionManager.drop_collection("coll1");
}

View File

@ -42,6 +42,26 @@ TEST(TokenizerTest, ShouldTokenizeNormalizeDifferentStrings) {
ASSERT_STREQ("abcaa123ss12", tokens[5].c_str());
ASSERT_STREQ("here", tokens[6].c_str());
// when normalization is disabled and keep empty is enabled
const std::string withoutnormalize = "Mise à jour.";
tokens.clear();
Tokenizer(withoutnormalize, true, false, false).tokenize(tokens);
ASSERT_EQ(5, tokens.size());
ASSERT_STREQ("Mise", tokens[0].c_str());
ASSERT_STREQ("", tokens[1].c_str());
ASSERT_STREQ("à", tokens[2].c_str());
ASSERT_STREQ("", tokens[3].c_str());
ASSERT_STREQ("jour.", tokens[4].c_str());
// when normalization and keep empty are disabled
const std::string withoutnormalizeandkeepempty = "Mise à jour.";
tokens.clear();
Tokenizer(withoutnormalizeandkeepempty, false, false, false).tokenize(tokens);
ASSERT_EQ(3, tokens.size());
ASSERT_STREQ("Mise", tokens[0].c_str());
ASSERT_STREQ("à", tokens[1].c_str());
ASSERT_STREQ("jour.", tokens[2].c_str());
// noop
tokens.clear();