mirror of
https://github.com/typesense/typesense.git
synced 2025-05-20 05:32:30 +08:00
Remove ASCII special characters during string normalization.
Unicode special characters are retained verbatim - this will be addressed in the future.
This commit is contained in:
parent
7e30cb1184
commit
491de5a325
@ -61,7 +61,8 @@ add_executable(search ${SRC_FILES} src/main/main.cpp)
|
||||
add_executable(benchmark ${SRC_FILES} src/main/benchmark.cpp)
|
||||
add_executable(typesense_test ${SRC_FILES} test/array_test.cpp test/sorted_array_test.cpp test/art_test.cpp
|
||||
test/collection_test.cpp test/collection_manager_test.cpp
|
||||
test/topster_test.cpp test/match_score_test.cpp test/store_test.cpp test/array_utils_test.cpp)
|
||||
test/topster_test.cpp test/match_score_test.cpp test/store_test.cpp test/array_utils_test.cpp
|
||||
test/string_utils_test.cpp)
|
||||
|
||||
target_compile_definitions(typesense-server PRIVATE ROOT_DIR="${CMAKE_SOURCE_DIR}/")
|
||||
target_compile_definitions(search PRIVATE ROOT_DIR="${CMAKE_SOURCE_DIR}/")
|
||||
|
10
TODO.md
10
TODO.md
@ -82,8 +82,11 @@
|
||||
- ~~When the first sequence ID is not zero, bail out~~
|
||||
- ~~Proper status code when sequence number to fetch is bad~~
|
||||
- ~~Replica should be read-only~~
|
||||
- handle hyphens (replace them)
|
||||
- clean special chars before indexing
|
||||
- ~~string_utils::tokenize should not have max length~~
|
||||
- ~~handle hyphens (replace them)~~
|
||||
- ~~clean special chars before indexing~~
|
||||
- ~~Add docs/explanation around ranking calc~~
|
||||
- UTF-8 normalization
|
||||
- NOT operator support
|
||||
- > INT32_MAX validation for float field
|
||||
- Proper logging
|
||||
@ -92,7 +95,6 @@
|
||||
- test for float int field deletion during doc deletion
|
||||
- Test for snippets
|
||||
- Test for replication
|
||||
- Add docs/explanation around ranking calc
|
||||
- Use rocksdb batch put for atomic insertion
|
||||
- Query token ids should match query token ordering
|
||||
- ID should not have "/"
|
||||
@ -102,12 +104,10 @@
|
||||
- Test for string utils
|
||||
- Prevent string copy during indexing
|
||||
- Minimum results should be a variable instead of blindly going with max_results
|
||||
- UTF-8 support for fuzzy search
|
||||
- Handle searching for non-existing fields gracefully
|
||||
- test for same match score but different primary, secondary attr
|
||||
- Support nested fields via "."
|
||||
- Support search operators like +, - etc.
|
||||
- string_utils::tokenize should not have max length
|
||||
- Space sensitivity
|
||||
- Use bitmap index instead of compressed array for doc list?
|
||||
- Primary_rank_scores and secondary_rank_scores hashmaps should be combined?
|
||||
|
@ -3,6 +3,7 @@
|
||||
#include <string>
|
||||
#include <algorithm>
|
||||
#include <sstream>
|
||||
#include <ctype.h>
|
||||
|
||||
struct StringUtils {
|
||||
// Adapted from: http://stackoverflow.com/a/236180/131050
|
||||
@ -121,6 +122,10 @@ struct StringUtils {
|
||||
}
|
||||
|
||||
// Normalizes `str` in place:
//   1. removes every 7-bit ASCII byte that is not a letter or a digit
//      (spaces, punctuation, control characters);
//   2. lower-cases the remaining ASCII letters.
// Bytes >= 0x80 (e.g. the bytes of multi-byte UTF-8 sequences) are kept verbatim and are
// never case-folded.
static void normalize(std::string& str) {
    // The <ctype.h> classification functions require a value representable as unsigned char;
    // passing a negative plain `char` is undefined behaviour. Going through unsigned char also
    // makes the ASCII test independent of whether `char` is signed on the target platform.
    const auto is_ascii_special = [](char c) {
        const unsigned char uc = static_cast<unsigned char>(c);
        return uc < 0x80 && !std::isalnum(uc);
    };

    str.erase(std::remove_if(str.begin(), str.end(), is_ascii_special), str.end());

    // Only ASCII bytes are lower-cased so that non-ASCII bytes can never be altered.
    std::transform(str.begin(), str.end(), str.begin(), [](char c) {
        const unsigned char uc = static_cast<unsigned char>(c);
        return uc < 0x80 ? static_cast<char>(std::tolower(uc)) : c;
    });
}
|
||||
|
||||
|
@ -1112,15 +1112,15 @@ TEST_F(CollectionTest, FilterOnTextFields) {
|
||||
results = coll_array_fields->search("Jeremy", query_fields, "tags: BrONZe", facets, sort_fields, 0, 10, 1, FREQUENCY, false).get();
|
||||
ASSERT_EQ(2, results["hits"].size());
|
||||
|
||||
// when comparators are used, should just treat them as part of search string
|
||||
// when comparators are used, should just treat them as part of search string (special chars will be removed)
|
||||
results = coll_array_fields->search("Jeremy", query_fields, "tags:<bronze", facets, sort_fields, 0, 10, 1, FREQUENCY, false).get();
|
||||
ASSERT_EQ(0, results["hits"].size());
|
||||
ASSERT_EQ(2, results["hits"].size());
|
||||
|
||||
results = coll_array_fields->search("Jeremy", query_fields, "tags:<=BRONZE", facets, sort_fields, 0, 10, 1, FREQUENCY, false).get();
|
||||
ASSERT_EQ(0, results["hits"].size());
|
||||
ASSERT_EQ(2, results["hits"].size());
|
||||
|
||||
results = coll_array_fields->search("Jeremy", query_fields, "tags:>BRONZE", facets, sort_fields, 0, 10, 1, FREQUENCY, false).get();
|
||||
ASSERT_EQ(0, results["hits"].size());
|
||||
ASSERT_EQ(2, results["hits"].size());
|
||||
|
||||
collectionManager.drop_collection("coll_array_fields");
|
||||
}
|
||||
@ -1381,7 +1381,7 @@ TEST_F(CollectionTest, SearchingWithMissingFields) {
|
||||
|
||||
Option<nlohmann::json> res_op = coll_array_fields->search("the", query_fields_not_found, "", facets, sort_fields, 0, 10);
|
||||
ASSERT_FALSE(res_op.ok());
|
||||
ASSERT_EQ(400, res_op.code());
|
||||
ASSERT_EQ(404, res_op.code());
|
||||
ASSERT_STREQ("Could not find a field named `titlez` in the schema.", res_op.error().c_str());
|
||||
|
||||
// when a query field is an integer field
|
||||
@ -1391,16 +1391,16 @@ TEST_F(CollectionTest, SearchingWithMissingFields) {
|
||||
|
||||
// when a facet field is not defined in the schema
|
||||
res_op = coll_array_fields->search("the", {"name"}, "", {"timestamps"}, sort_fields, 0, 10);
|
||||
ASSERT_EQ(400, res_op.code());
|
||||
ASSERT_EQ(404, res_op.code());
|
||||
ASSERT_STREQ("Could not find a facet field named `timestamps` in the schema.", res_op.error().c_str());
|
||||
|
||||
// when a rank field is not defined in the schema
|
||||
res_op = coll_array_fields->search("the", {"name"}, "", {}, { sort_by("timestamps", "ASC") }, 0, 10);
|
||||
ASSERT_EQ(400, res_op.code());
|
||||
ASSERT_EQ(404, res_op.code());
|
||||
ASSERT_STREQ("Could not find a field named `timestamps` in the schema for sorting.", res_op.error().c_str());
|
||||
|
||||
res_op = coll_array_fields->search("the", {"name"}, "", {}, { sort_by("_rank", "ASC") }, 0, 10);
|
||||
ASSERT_EQ(400, res_op.code());
|
||||
ASSERT_EQ(404, res_op.code());
|
||||
ASSERT_STREQ("Could not find a field named `_rank` in the schema for sorting.", res_op.error().c_str());
|
||||
|
||||
collectionManager.drop_collection("coll_array_fields");
|
||||
|
21
test/string_utils_test.cpp
Normal file
21
test/string_utils_test.cpp
Normal file
@ -0,0 +1,21 @@
|
||||
#include <gtest/gtest.h>
|
||||
#include "string_utils.h"
|
||||
|
||||
// Checks StringUtils::normalize() against four inputs: plain alphanumerics, text with a space,
// text with ASCII punctuation, and text containing non-ASCII characters.
TEST(StringUtilsTest, ShouldNormalizeString) {
    // Normalizes a private copy so that every case reads as "input -> output".
    const auto normalized = [](std::string text) {
        StringUtils::normalize(text);
        return text;
    };

    // letters and digits only: the result is just lower-cased
    const std::string only_alnum = normalized("Aa12Zz");
    ASSERT_STREQ("aa12zz", only_alnum.c_str());

    // an embedded space is dropped
    const std::string with_space = normalized("Aa12Zz 12A");
    ASSERT_STREQ("aa12zz12a", with_space.c_str());

    // ASCII punctuation is dropped
    const std::string with_punctuation = normalized("Aa12Zz@W-_?,.R");
    ASSERT_STREQ("aa12zzwr", with_punctuation.c_str());

    // non-ASCII characters are expected to come through byte-for-byte
    const std::string with_unicode = normalized("abcÅà123");
    ASSERT_STREQ("abc\xC3\x85\xC3\xA0""123", with_unicode.c_str());
}
|
Loading…
x
Reference in New Issue
Block a user