Remove ASCII special characters during string normalization.

Unicode special characters are retained verbatim; this will be addressed in the future.
This commit is contained in:
Kishore Nallan 2018-01-16 21:16:24 +05:30
parent 7e30cb1184
commit 491de5a325
5 changed files with 41 additions and 14 deletions

View File

@ -61,7 +61,8 @@ add_executable(search ${SRC_FILES} src/main/main.cpp)
add_executable(benchmark ${SRC_FILES} src/main/benchmark.cpp)
add_executable(typesense_test ${SRC_FILES} test/array_test.cpp test/sorted_array_test.cpp test/art_test.cpp
test/collection_test.cpp test/collection_manager_test.cpp
test/topster_test.cpp test/match_score_test.cpp test/store_test.cpp test/array_utils_test.cpp)
test/topster_test.cpp test/match_score_test.cpp test/store_test.cpp test/array_utils_test.cpp
test/string_utils_test.cpp)
target_compile_definitions(typesense-server PRIVATE ROOT_DIR="${CMAKE_SOURCE_DIR}/")
target_compile_definitions(search PRIVATE ROOT_DIR="${CMAKE_SOURCE_DIR}/")

10
TODO.md
View File

@ -82,8 +82,11 @@
- ~~When the first sequence ID is not zero, bail out~~
- ~~Proper status code when sequence number to fetch is bad~~
- ~~Replica should be read-only~~
- handle hyphens (replace them)
- clean special chars before indexing
- ~~string_utils::tokenize should not have max length~~
- ~~handle hyphens (replace them)~~
- ~~clean special chars before indexing~~
- ~~Add docs/explanation around ranking calc~~
- UTF-8 normalization
- NOT operator support
- > INT32_MAX validation for float field
- Proper logging
@ -92,7 +95,6 @@
- test for float int field deletion during doc deletion
- Test for snippets
- Test for replication
- Add docs/explanation around ranking calc
- Use rocksdb batch put for atomic insertion
- Query token ids should match query token ordering
- ID should not have "/"
@ -102,12 +104,10 @@
- Test for string utils
- Prevent string copy during indexing
- Minimum results should be a variable instead of blindly going with max_results
- UTF-8 support for fuzzy search
- Handle searching for non-existing fields gracefully
- test for same match score but different primary, secondary attr
- Support nested fields via "."
- Support search operators like +, - etc.
- string_utils::tokenize should not have max length
- Space sensitivity
- Use bitmap index instead of compressed array for doc list?
- Primary_rank_scores and secondary_rank_scores hashmaps should be combined?

View File

@ -3,6 +3,7 @@
#include <string>
#include <algorithm>
#include <sstream>
#include <ctype.h>
struct StringUtils {
// Adapted from: http://stackoverflow.com/a/236180/131050
@ -121,6 +122,10 @@ struct StringUtils {
}
/**
 * Normalizes `str` in place:
 *   1. every ASCII byte that is not alphanumeric (spaces, punctuation,
 *      control characters) is removed;
 *   2. ASCII upper-case letters are converted to lower case.
 *
 * Bytes >= 0x80 (i.e. the bytes of multi-byte UTF-8 sequences) are kept
 * verbatim: they are neither removed nor case-folded.
 *
 * @param str string to normalize; modified in place.
 */
static void normalize(std::string& str) {
    // The <ctype.h> classification functions have undefined behaviour for
    // negative arguments, and whether plain `char` is signed is
    // platform-specific. Every byte is therefore inspected as `unsigned char`,
    // and only values below 0x80 are ever handed to isalnum()/tolower().
    str.erase(std::remove_if(str.begin(), str.end(), [](char c) {
        const unsigned char byte = static_cast<unsigned char>(c);
        return byte < 0x80 && !std::isalnum(byte);
    }), str.end());

    std::transform(str.begin(), str.end(), str.begin(), [](char c) {
        const unsigned char byte = static_cast<unsigned char>(c);
        // Leave non-ASCII bytes untouched so that UTF-8 sequences cannot be
        // corrupted by a locale-specific single-byte case mapping.
        return byte < 0x80 ? static_cast<char>(::tolower(byte)) : c;
    });
}

View File

@ -1112,15 +1112,15 @@ TEST_F(CollectionTest, FilterOnTextFields) {
results = coll_array_fields->search("Jeremy", query_fields, "tags: BrONZe", facets, sort_fields, 0, 10, 1, FREQUENCY, false).get();
ASSERT_EQ(2, results["hits"].size());
// when comparators are used, should just treat them as part of search string
// when comparators are used, should just treat them as part of search string (special chars will be removed)
results = coll_array_fields->search("Jeremy", query_fields, "tags:<bronze", facets, sort_fields, 0, 10, 1, FREQUENCY, false).get();
ASSERT_EQ(0, results["hits"].size());
ASSERT_EQ(2, results["hits"].size());
results = coll_array_fields->search("Jeremy", query_fields, "tags:<=BRONZE", facets, sort_fields, 0, 10, 1, FREQUENCY, false).get();
ASSERT_EQ(0, results["hits"].size());
ASSERT_EQ(2, results["hits"].size());
results = coll_array_fields->search("Jeremy", query_fields, "tags:>BRONZE", facets, sort_fields, 0, 10, 1, FREQUENCY, false).get();
ASSERT_EQ(0, results["hits"].size());
ASSERT_EQ(2, results["hits"].size());
collectionManager.drop_collection("coll_array_fields");
}
@ -1381,7 +1381,7 @@ TEST_F(CollectionTest, SearchingWithMissingFields) {
Option<nlohmann::json> res_op = coll_array_fields->search("the", query_fields_not_found, "", facets, sort_fields, 0, 10);
ASSERT_FALSE(res_op.ok());
ASSERT_EQ(400, res_op.code());
ASSERT_EQ(404, res_op.code());
ASSERT_STREQ("Could not find a field named `titlez` in the schema.", res_op.error().c_str());
// when a query field is an integer field
@ -1391,16 +1391,16 @@ TEST_F(CollectionTest, SearchingWithMissingFields) {
// when a facet field is not defined in the schema
res_op = coll_array_fields->search("the", {"name"}, "", {"timestamps"}, sort_fields, 0, 10);
ASSERT_EQ(400, res_op.code());
ASSERT_EQ(404, res_op.code());
ASSERT_STREQ("Could not find a facet field named `timestamps` in the schema.", res_op.error().c_str());
// when a rank field is not defined in the schema
res_op = coll_array_fields->search("the", {"name"}, "", {}, { sort_by("timestamps", "ASC") }, 0, 10);
ASSERT_EQ(400, res_op.code());
ASSERT_EQ(404, res_op.code());
ASSERT_STREQ("Could not find a field named `timestamps` in the schema for sorting.", res_op.error().c_str());
res_op = coll_array_fields->search("the", {"name"}, "", {}, { sort_by("_rank", "ASC") }, 0, 10);
ASSERT_EQ(400, res_op.code());
ASSERT_EQ(404, res_op.code());
ASSERT_STREQ("Could not find a field named `_rank` in the schema for sorting.", res_op.error().c_str());
collectionManager.drop_collection("coll_array_fields");

View File

@ -0,0 +1,21 @@
#include <gtest/gtest.h>
#include "string_utils.h"
// Verifies StringUtils::normalize(): ASCII letters are lower-cased, ASCII
// non-alphanumerics are dropped, and non-ASCII bytes pass through unchanged.
TEST(StringUtilsTest, ShouldNormalizeString) {
    // Normalizes a private copy of the input and hands the result back.
    const auto normalized = [](std::string text) {
        StringUtils::normalize(text);
        return text;
    };

    // letters and digits only: just the case changes
    const std::string only_alnum = normalized("Aa12Zz");
    ASSERT_STREQ("aa12zz", only_alnum.c_str());

    // embedded whitespace is removed
    const std::string with_space = normalized("Aa12Zz 12A");
    ASSERT_STREQ("aa12zz12a", with_space.c_str());

    // ASCII punctuation is removed
    const std::string with_punctuation = normalized("Aa12Zz@W-_?,.R");
    ASSERT_STREQ("aa12zzwr", with_punctuation.c_str());

    // non-ASCII unicode characters are retained byte-for-byte
    const std::string with_unicode = normalized("abcÅà123");
    ASSERT_STREQ("abc\xC3\x85\xC3\xA0""123", with_unicode.c_str());
}