Remove ASCII special characters during string normalization.

Unicode special characters are retained verbatim for now - this will be addressed in the future.
2025-05-20 05:32:30 +08:00 · 2018-01-16 21:16:24 +05:30 · 2018-01-16 21:16:24 +05:30 · 491de5a325
commit 491de5a325
parent 7e30cb1184
5 changed files with 41 additions and 14 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -61,7 +61,8 @@ add_executable(search ${SRC_FILES} src/main/main.cpp)
 add_executable(benchmark ${SRC_FILES} src/main/benchmark.cpp)
 add_executable(typesense_test ${SRC_FILES} test/array_test.cpp test/sorted_array_test.cpp test/art_test.cpp
               test/collection_test.cpp test/collection_manager_test.cpp
-               test/topster_test.cpp test/match_score_test.cpp test/store_test.cpp test/array_utils_test.cpp)
+               test/topster_test.cpp test/match_score_test.cpp test/store_test.cpp test/array_utils_test.cpp
+               test/string_utils_test.cpp)

 target_compile_definitions(typesense-server PRIVATE ROOT_DIR="${CMAKE_SOURCE_DIR}/")
 target_compile_definitions(search PRIVATE ROOT_DIR="${CMAKE_SOURCE_DIR}/")
--- a/TODO.md
+++ b/TODO.md
@ -82,8 +82,11 @@
 - ~~When the first sequence ID is not zero, bail out~~
 - ~~Proper status code when sequence number to fetch is bad~~
 - ~~Replica should be read-only~~
- handle hyphens (replace them)
- clean special chars before indexing
+- ~~string_utils::tokenize should not have max length~~
+- ~~handle hyphens (replace them)~~
+- ~~clean special chars before indexing~~
+- ~~Add docs/explanation around ranking calc~~
+- UTF-8 normalization
 - NOT operator support
 - > INT32_MAX validation for float field
 - Proper logging
@ -92,7 +95,6 @@
 - test for float int field deletion during doc deletion
 - Test for snippets
 - Test for replication
- Add docs/explanation around ranking calc
 - Use rocksdb batch put for atomic insertion
 - Query token ids should match query token ordering
 - ID should not have "/"
@ -102,12 +104,10 @@
 - Test for string utils
 - Prevent string copy during indexing
 - Minimum results should be a variable instead of blindly going with max_results
- UTF-8 support for fuzzy search
 - Handle searching for non-existing fields gracefully
 - test for same match score but different primary, secondary attr
 - Support nested fields via "."
 - Support search operators like +, - etc.
- string_utils::tokenize should not have max length
 - Space sensitivity
 - Use bitmap index instead of compressed array for doc list?
 - Primary_rank_scores and secondary_rank_scores hashmaps should be combined?
--- a/include/string_utils.h
+++ b/include/string_utils.h
@ -3,6 +3,7 @@
 #include <string>
 #include <algorithm>
 #include <sstream>
+#include <ctype.h>

 struct StringUtils {
    // Adapted from: http://stackoverflow.com/a/236180/131050
@ -121,6 +122,10 @@ struct StringUtils {
    }

    static void normalize(std::string& str) {
+        // Normalizes `str` in place: drops every ASCII byte that is not a letter or digit
+        // (spaces and punctuation included), then lower-cases the ASCII letters that remain.
+        // Bytes >= 0x80 (the bytes of multi-byte UTF-8 sequences) are kept untouched.
+        //
+        // The predicate takes `unsigned char` on purpose: handing a negative `char` to the
+        // <ctype.h> classifiers is undefined behaviour, and testing `(int)c >= 0` only spots
+        // non-ASCII bytes on platforms where plain `char` is signed.
+        str.erase(std::remove_if(str.begin(), str.end(), [](unsigned char c) {
+                    return c < 0x80 && !::isalnum(c);
+                  }), str.end());
+
-        std::transform(str.begin(), str.end(), str.begin(), ::tolower);
+        // Only fold ASCII; non-ASCII bytes pass through so a non-"C" locale cannot rewrite them.
+        std::transform(str.begin(), str.end(), str.begin(), [](unsigned char c) -> char {
+                    return static_cast<char>(c < 0x80 ? ::tolower(c) : c);
+                  });
    }

--- a/test/collection_test.cpp
+++ b/test/collection_test.cpp
@ -1112,15 +1112,15 @@ TEST_F(CollectionTest, FilterOnTextFields) {
    results = coll_array_fields->search("Jeremy", query_fields, "tags: BrONZe", facets, sort_fields, 0, 10, 1, FREQUENCY, false).get();
    ASSERT_EQ(2, results["hits"].size());

-    // when comparators are used, should just treat them as part of search string
+    // when comparators are used, should just treat them as part of search string (special chars will be removed)
    results = coll_array_fields->search("Jeremy", query_fields, "tags:<bronze", facets, sort_fields, 0, 10, 1, FREQUENCY, false).get();
-    ASSERT_EQ(0, results["hits"].size());
+    ASSERT_EQ(2, results["hits"].size());

    results = coll_array_fields->search("Jeremy", query_fields, "tags:<=BRONZE", facets, sort_fields, 0, 10, 1, FREQUENCY, false).get();
-    ASSERT_EQ(0, results["hits"].size());
+    ASSERT_EQ(2, results["hits"].size());

    results = coll_array_fields->search("Jeremy", query_fields, "tags:>BRONZE", facets, sort_fields, 0, 10, 1, FREQUENCY, false).get();
-    ASSERT_EQ(0, results["hits"].size());
+    ASSERT_EQ(2, results["hits"].size());

    collectionManager.drop_collection("coll_array_fields");
 }
@ -1381,7 +1381,7 @@ TEST_F(CollectionTest, SearchingWithMissingFields) {

    Option<nlohmann::json> res_op = coll_array_fields->search("the", query_fields_not_found, "", facets, sort_fields, 0, 10);
    ASSERT_FALSE(res_op.ok());
-    ASSERT_EQ(400, res_op.code());
+    ASSERT_EQ(404, res_op.code());
    ASSERT_STREQ("Could not find a field named `titlez` in the schema.", res_op.error().c_str());

    // when a query field is an integer field
@ -1391,16 +1391,16 @@ TEST_F(CollectionTest, SearchingWithMissingFields) {

    // when a facet field is not defined in the schema
    res_op = coll_array_fields->search("the", {"name"}, "", {"timestamps"}, sort_fields, 0, 10);
-    ASSERT_EQ(400, res_op.code());
+    ASSERT_EQ(404, res_op.code());
    ASSERT_STREQ("Could not find a facet field named `timestamps` in the schema.", res_op.error().c_str());

    // when a rank field is not defined in the schema
    res_op = coll_array_fields->search("the", {"name"}, "", {}, { sort_by("timestamps", "ASC") }, 0, 10);
-    ASSERT_EQ(400, res_op.code());
+    ASSERT_EQ(404, res_op.code());
    ASSERT_STREQ("Could not find a field named `timestamps` in the schema for sorting.", res_op.error().c_str());

    res_op = coll_array_fields->search("the", {"name"}, "", {}, { sort_by("_rank", "ASC") }, 0, 10);
-    ASSERT_EQ(400, res_op.code());
+    ASSERT_EQ(404, res_op.code());
    ASSERT_STREQ("Could not find a field named `_rank` in the schema for sorting.", res_op.error().c_str());

    collectionManager.drop_collection("coll_array_fields");
--- a/test/string_utils_test.cpp
+++ b/test/string_utils_test.cpp
@ -0,0 +1,21 @@
+#include <gtest/gtest.h>
+#include "string_utils.h"
+
+TEST(StringUtilsTest, ShouldNormalizeString) {
+    // Each row pairs a raw input with the text normalize() is expected to leave behind.
+    struct NormalizeCase {
+        const char* raw;
+        const char* normalized;
+    };
+
+    const NormalizeCase cases[] = {
+        // letters and digits are only lower-cased
+        {"Aa12Zz", "aa12zz"},
+        // the space is dropped
+        {"Aa12Zz 12A", "aa12zz12a"},
+        // ASCII punctuation is dropped
+        {"Aa12Zz@W-_?,.R", "aa12zzwr"},
+        // retain non-ascii unicode characters
+        {"abcÅà123", "abc\xC3\x85\xC3\xA0""123"},
+    };
+
+    for (const NormalizeCase& current : cases) {
+        std::string text(current.raw);
+        StringUtils::normalize(text);
+        ASSERT_STREQ(current.normalized, text.c_str());
+    }
+}