Fix bad unicode characters in highlight snippet.

This commit is contained in:
kishorenc 2020-11-28 08:22:58 +05:30
parent ec9da2ccf3
commit a912a250ff
3 changed files with 63 additions and 0 deletions

View File

@ -61,6 +61,11 @@ bool Tokenizer::next(std::string &token, size_t& token_index) {
*p = 0;
size_t insize = (p - &inbuf[0]);
if(!normalize) {
out << inbuf;
continue;
}
char outbuf[5] = {};
size_t outsize = sizeof(outbuf);
char *outptr = outbuf;

View File

@ -2637,3 +2637,41 @@ TEST_F(CollectionTest, MultiFieldRelevance) {
collectionManager.drop_collection("coll1");
}
TEST_F(CollectionTest, HighlightWithAccentedCharacters) {
    // Regression test: a highlight snippet built around a match must keep
    // accented (multi-byte UTF-8) characters intact rather than mangling them.
    std::vector<field> fields = {field("title", field_types::STRING, false),
                                 field("points", field_types::INT32, false),};

    Collection* coll1 = collectionManager.get_collection("coll1");
    if(coll1 == nullptr) {
        coll1 = collectionManager.create_collection("coll1", 4, fields, "points").get();
    }

    // First record contains the accented word "à"; the others are fillers.
    const std::vector<std::vector<std::string>> records = {
        {"Mise à jour Timy depuis PC"},
        {"Down There by the Train"},
        {"State Trooper"},
    };

    size_t seq = 0;
    for(const auto& record : records) {
        nlohmann::json doc;
        doc["id"] = std::to_string(seq);
        doc["title"] = record[0];
        doc["points"] = seq;
        ASSERT_TRUE(coll1->add(doc.dump()).ok());
        seq++;
    }

    auto results = coll1->search("jour", {"title"}, "", {}, {}, 0, 10, 1, FREQUENCY).get();

    // Exactly one document matches, and its snippet must preserve "à" as-is.
    ASSERT_EQ(1, results["found"].get<size_t>());
    ASSERT_EQ(1, results["hits"].size());
    ASSERT_STREQ("Mise à <mark>jour</mark> Timy depuis PC",
                 results["hits"][0]["highlights"][0]["snippet"].get<std::string>().c_str());

    collectionManager.drop_collection("coll1");
}

View File

@ -42,6 +42,26 @@ TEST(TokenizerTest, ShouldTokenizeNormalizeDifferentStrings) {
ASSERT_STREQ("abcaa123ss12", tokens[5].c_str());
ASSERT_STREQ("here", tokens[6].c_str());
// when normalization is disabled and keep empty is enabled
const std::string withoutnormalize = "Mise à jour.";
tokens.clear();
Tokenizer(withoutnormalize, true, false, false).tokenize(tokens);
ASSERT_EQ(5, tokens.size());
ASSERT_STREQ("Mise", tokens[0].c_str());
ASSERT_STREQ("", tokens[1].c_str());
ASSERT_STREQ("à", tokens[2].c_str());
ASSERT_STREQ("", tokens[3].c_str());
ASSERT_STREQ("jour.", tokens[4].c_str());
// when normalization and keep empty are disabled
const std::string withoutnormalizeandkeepempty = "Mise à jour.";
tokens.clear();
Tokenizer(withoutnormalizeandkeepempty, false, false, false).tokenize(tokens);
ASSERT_EQ(3, tokens.size());
ASSERT_STREQ("Mise", tokens[0].c_str());
ASSERT_STREQ("à", tokens[1].c_str());
ASSERT_STREQ("jour.", tokens[2].c_str());
// noop
tokens.clear();