diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp
index ec050be6..bd1543e9 100644
--- a/src/tokenizer.cpp
+++ b/src/tokenizer.cpp
@@ -61,6 +61,11 @@ bool Tokenizer::next(std::string &token, size_t& token_index) {
         *p = 0;
         size_t insize = (p - &inbuf[0]);
 
+        if(!normalize) {
+            out << inbuf;
+            continue;
+        }
+
         char outbuf[5] = {};
         size_t outsize = sizeof(outbuf);
         char *outptr = outbuf;
diff --git a/test/collection_test.cpp b/test/collection_test.cpp
index 76d7ad80..0201ef0b 100644
--- a/test/collection_test.cpp
+++ b/test/collection_test.cpp
@@ -2637,3 +2637,41 @@ TEST_F(CollectionTest, MultiFieldRelevance) {
 
     collectionManager.drop_collection("coll1");
 }
+
+TEST_F(CollectionTest, HighlightWithAccentedCharacters) {
+    Collection *coll1;
+
+    std::vector<field> fields = {field("title", field_types::STRING, false),
+                                 field("points", field_types::INT32, false),};
+
+    coll1 = collectionManager.get_collection("coll1");
+    if (coll1 == nullptr) {
+        coll1 = collectionManager.create_collection("coll1", 4, fields, "points").get();
+    }
+
+    std::vector<std::vector<std::string>> records = {
+        {"Mise à jour Timy depuis PC"},
+        {"Down There by the Train"},
+        {"State Trooper"},
+    };
+
+    for (size_t i = 0; i < records.size(); i++) {
+        nlohmann::json doc;
+
+        doc["id"] = std::to_string(i);
+        doc["title"] = records[i][0];
+        doc["points"] = i;
+
+        ASSERT_TRUE(coll1->add(doc.dump()).ok());
+    }
+
+    auto results = coll1->search("jour", {"title"}, "", {}, {}, 0, 10, 1, FREQUENCY).get();
+
+    ASSERT_EQ(1, results["found"].get<size_t>());
+    ASSERT_EQ(1, results["hits"].size());
+
+    ASSERT_STREQ("Mise à jour Timy depuis PC",
+                 results["hits"][0]["highlights"][0]["snippet"].get<std::string>().c_str());
+
+    collectionManager.drop_collection("coll1");
+}
diff --git a/test/tokenizer_test.cpp b/test/tokenizer_test.cpp
index c9c0ea74..e7f794c7 100644
--- a/test/tokenizer_test.cpp
+++ b/test/tokenizer_test.cpp
@@ -42,6 +42,26 @@ TEST(TokenizerTest, ShouldTokenizeNormalizeDifferentStrings) {
     ASSERT_STREQ("abcaa123ss12", tokens[5].c_str());
     ASSERT_STREQ("here", tokens[6].c_str());
 
+    // when normalization is disabled and keep_empty is enabled
+    const std::string withoutnormalize = "Mise à jour.";
+    tokens.clear();
+    Tokenizer(withoutnormalize, true, false, false).tokenize(tokens);
+    ASSERT_EQ(5, tokens.size());
+    ASSERT_STREQ("Mise", tokens[0].c_str());
+    ASSERT_STREQ("", tokens[1].c_str());
+    ASSERT_STREQ("à", tokens[2].c_str());
+    ASSERT_STREQ("", tokens[3].c_str());
+    ASSERT_STREQ("jour.", tokens[4].c_str());
+
+    // when normalization and keep_empty are both disabled
+    const std::string withoutnormalizeandkeepempty = "Mise à jour.";
+    tokens.clear();
+    Tokenizer(withoutnormalizeandkeepempty, false, false, false).tokenize(tokens);
+    ASSERT_EQ(3, tokens.size());
+    ASSERT_STREQ("Mise", tokens[0].c_str());
+    ASSERT_STREQ("à", tokens[1].c_str());
+    ASSERT_STREQ("jour.", tokens[2].c_str());
+
     // noop
     tokens.clear();
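
Reviewer note: a minimal usage sketch of the new flag, not part of the diff. It assumes the positional constructor order exercised in tokenizer_test.cpp above (input, keep_empty, normalize, no_op) and that the class is declared in a "tokenizer.h" header; both are assumptions drawn from this change, not verified API documentation.

// Sketch only: mirrors the second new tokenizer test case
// (keep_empty=false, normalize=false, no_op=false), where accented
// characters such as "à" should come through verbatim instead of
// being transliterated by the normalization path.
#include <iostream>
#include <string>
#include <vector>
#include "tokenizer.h"   // assumed location of the Tokenizer class

int main() {
    std::vector<std::string> tokens;

    Tokenizer("Mise à jour.", false, false, false).tokenize(tokens);

    for(const auto& t : tokens) {
        std::cout << t << "\n";  // expected: Mise / à / jour.
    }
    return 0;
}

The early return path added to Tokenizer::next() writes the raw input bytes straight to the token stream and skips the conversion step that follows, which is what lets the accented snippet in the HighlightWithAccentedCharacters collection test round-trip unchanged.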