mirror of
https://github.com/typesense/typesense.git
synced 2025-05-15 19:06:48 +08:00
Improve unicode normalization.
Normalize Latin characters like ß to their nearest equivalent ASCII representations.
This commit is contained in:
parent
ad42c40e6d
commit
3c98931c0e
@ -45,6 +45,7 @@ include(cmake/RocksDB.cmake)
|
||||
include(cmake/GoogleTest.cmake)
|
||||
include(cmake/TestResources.cmake)
|
||||
include(cmake/g3log.cmake)
|
||||
include(cmake/Iconv.cmake)
|
||||
|
||||
FILE(GLOB SRC_FILES src/*.cpp)
|
||||
|
||||
@ -57,6 +58,7 @@ include_directories(${DEP_ROOT_DIR}/${GTEST_NAME}/googletest/include)
|
||||
include_directories(${DEP_ROOT_DIR}/${H2O_NAME}/include)
|
||||
include_directories(${DEP_ROOT_DIR}/${ROCKSDB_NAME}/include)
|
||||
include_directories(${DEP_ROOT_DIR}/${G3LOG_NAME}/src)
|
||||
include_directories(${DEP_ROOT_DIR}/${ICONV_NAME}/include)
|
||||
|
||||
# Write dependency include directories to a file
|
||||
file(WRITE ${DEP_ROOT_DIR}/includes.txt "")
|
||||
@ -70,6 +72,7 @@ link_directories(${DEP_ROOT_DIR}/${FOR_NAME})
|
||||
link_directories(${DEP_ROOT_DIR}/${H2O_NAME}/build)
|
||||
link_directories(${DEP_ROOT_DIR}/${ROCKSDB_NAME})
|
||||
link_directories(${DEP_ROOT_DIR}/${G3LOG_NAME}/build)
|
||||
link_directories(${DEP_ROOT_DIR}/${ICONV_NAME}/lib/.libs)
|
||||
|
||||
# Write dependency libraries to a file
|
||||
file(WRITE ${DEP_ROOT_DIR}/libs.txt "")
|
||||
@ -130,7 +133,7 @@ if(NOT APPLE)
|
||||
endif()
|
||||
|
||||
set(ICU_ALL_LIBRARIES ${ICU_I18N_LIBRARIES} ${ICU_LIBRARIES} ${ICU_DATA_LIBRARIES})
|
||||
set(CORE_LIBS h2o-evloop ${CURL_LIBRARIES} for ${ICU_ALL_LIBRARIES} ${G3LOGGER_LIBRARIES} pthread
|
||||
set(CORE_LIBS h2o-evloop iconv ${CURL_LIBRARIES} for ${ICU_ALL_LIBRARIES} ${G3LOGGER_LIBRARIES} pthread
|
||||
${ROCKSDB_LIBS} ${OPENSSL_LIBRARIES} dl ${STD_LIB})
|
||||
|
||||
target_link_libraries(typesense-core ${CORE_LIBS})
|
||||
|
31
cmake/Iconv.cmake
Normal file
31
cmake/Iconv.cmake
Normal file
@ -0,0 +1,31 @@
|
||||
# Download and build libiconv
#
# Fetches the libiconv release tarball into DEP_ROOT_DIR, unpacks it there and,
# when BUILD_DEPS is "yes", configures and builds the static library in-tree.
# Sets ICONV_VERSION, ICONV_NAME and ICONV_TAR_PATH.

set(ICONV_VERSION 1.15)
set(ICONV_NAME libiconv-${ICONV_VERSION})
set(ICONV_TAR_PATH ${DEP_ROOT_DIR}/${ICONV_NAME}.tar.gz)

if(NOT EXISTS ${ICONV_TAR_PATH})
    message(STATUS "Downloading libiconv...")
    file(DOWNLOAD https://ftp.gnu.org/pub/gnu/libiconv/libiconv-${ICONV_VERSION}.tar.gz ${ICONV_TAR_PATH}
         STATUS ICONV_DOWNLOAD_STATUS)

    # STATUS is a two element list: numeric code (0 == success) and a message.
    list(GET ICONV_DOWNLOAD_STATUS 0 ICONV_DOWNLOAD_CODE)
    if(NOT ICONV_DOWNLOAD_CODE EQUAL 0)
        # Remove the partial file, otherwise the EXISTS guard above would
        # treat a truncated archive as a finished download on the next run.
        file(REMOVE ${ICONV_TAR_PATH})
        message(WARNING "Downloading ${ICONV_NAME} failed: ${ICONV_DOWNLOAD_STATUS}")
    endif()
endif()

# Only unpack when there is an archive to unpack.
if(EXISTS ${ICONV_TAR_PATH} AND NOT EXISTS ${DEP_ROOT_DIR}/${ICONV_NAME})
    message(STATUS "Extracting libiconv...")
    execute_process(COMMAND ${CMAKE_COMMAND} -E tar xzf ${ICONV_TAR_PATH} WORKING_DIRECTORY ${DEP_ROOT_DIR})
endif()

# A generated Makefile marks an already configured source tree.
if(NOT EXISTS ${DEP_ROOT_DIR}/${ICONV_NAME}/Makefile AND BUILD_DEPS STREQUAL "yes")
    message("Configuring libiconv locally...")
    execute_process(COMMAND ./configure "--enable-static" WORKING_DIRECTORY ${DEP_ROOT_DIR}/${ICONV_NAME}/ RESULT_VARIABLE ICONV_CONFIGURE)
    if(NOT ICONV_CONFIGURE EQUAL 0)
        message(FATAL_ERROR "${ICONV_NAME} configure failed!")
    endif()
endif()

# The static archive is the build product; its absence after `make` means the build failed.
if(NOT EXISTS ${DEP_ROOT_DIR}/${ICONV_NAME}/lib/.libs/libiconv.a AND BUILD_DEPS STREQUAL "yes")
    message("Building libiconv locally...")
    execute_process(COMMAND make WORKING_DIRECTORY ${DEP_ROOT_DIR}/${ICONV_NAME})
    if(NOT EXISTS ${DEP_ROOT_DIR}/${ICONV_NAME}/lib/.libs/libiconv.a)
        message(FATAL_ERROR "${ICONV_NAME} build failed!")
    endif()
endif()
|
@ -5,14 +5,17 @@
|
||||
#include <sstream>
|
||||
#include <ctype.h>
|
||||
#include <unicode/translit.h>
|
||||
#include <iconv.h>
|
||||
#include <vector>
|
||||
|
||||
struct StringUtils {
|
||||
UErrorCode status;
|
||||
//icu::Transliterator* transliterator;
|
||||
iconv_t cd;
|
||||
|
||||
StringUtils(): status(U_ZERO_ERROR) {
|
||||
// transliterator(icu::Transliterator::createInstance("Latin-ASCII", UTRANS_FORWARD, status))
|
||||
cd = iconv_open("ascii//TRANSLIT", "UTF-8");
|
||||
}
|
||||
|
||||
~StringUtils() {
|
||||
|
@ -610,7 +610,7 @@ Option<nlohmann::json> Collection::search(std::string query, const std::vector<s
|
||||
prune_document(document, include_fields, exclude_fields);
|
||||
wrapper_doc["document"] = document;
|
||||
//wrapper_doc["match_score"] = field_order_kv.match_score;
|
||||
//wrapper_doc["seq_id"] = (uint32_t) field_order_kv.key;
|
||||
wrapper_doc["seq_id"] = (uint32_t) field_order_kv.key;
|
||||
|
||||
result["hits"].push_back(wrapper_doc);
|
||||
}
|
||||
|
@ -732,7 +732,7 @@ void Index::search_field(const uint8_t & field_id, std::string & query, const st
|
||||
const std::string token_cost_hash = token + std::to_string(costs[token_index]);
|
||||
|
||||
std::vector<art_leaf*> leaves;
|
||||
//LOG(INFO) << "\nSearching for: " << token << " - cost: " << costs[token_index];
|
||||
//LOG(INFO) << "\nSearching for field: " << field << ", token:" << token << " - cost: " << costs[token_index];
|
||||
|
||||
if(token_cost_cache.count(token_cost_hash) != 0) {
|
||||
leaves = token_cost_cache[token_cost_hash];
|
||||
@ -831,7 +831,7 @@ void Index::log_leaves(const int cost, const std::string &token, const std::vect
|
||||
LOG(INFO) << "Token: " << token << ", cost: " << cost;
|
||||
|
||||
for(size_t i=0; i < leaves.size(); i++) {
|
||||
printf("%.*s, ", leaves[i]->key_len, leaves[i]->key);
|
||||
printf("%.*s - %d, ", leaves[i]->key_len, leaves[i]->key, leaves[i]->values->ids.getLength());
|
||||
LOG(INFO) << "frequency: " << leaves[i]->values->ids.getLength() << ", max_score: " << leaves[i]->max_score;
|
||||
/*for(auto j=0; j<leaves[i]->values->ids.getLength(); j++) {
|
||||
LOG(INFO) << "id: " << leaves[i]->values->ids.at(j);
|
||||
|
@ -1,12 +1,36 @@
|
||||
#include "string_utils.h"
|
||||
#include <iostream>
|
||||
|
||||
/**
 * Returns a copy of `str` with ASCII letters lower-cased and every other
 * ASCII character that is not a digit (punctuation, whitespace, control
 * characters) removed.
 *
 * Bytes with the high bit set are copied through unchanged so that multi-byte
 * UTF-8 sequences survive intact.
 *
 * @param str input bytes; may be empty.
 * @return the filtered, lower-cased string.
 */
std::string lower_and_no_special_chars(const std::string & str) {
    std::string result;
    result.reserve(str.size());

    for(const char c: str) {
        // The <cctype> functions are only defined for values representable as
        // unsigned char (or EOF), so classify the byte through unsigned char
        // instead of handing them a possibly negative plain char.
        const unsigned char byte = static_cast<unsigned char>(c);

        if(byte >= 0x80) {
            // non-ASCII byte: keep as-is
            result.push_back(c);
        } else if(std::isalnum(byte)) {
            result.push_back(static_cast<char>(std::tolower(byte)));
        }
    }

    return result;
}
|
||||
|
||||
void StringUtils::unicode_normalize(std::string& str) const {
|
||||
// remove special chars within ASCII range
|
||||
str.erase(std::remove_if(str.begin(), str.end(), [](char c) {
|
||||
return !std::isalnum(c) && (int)(c) >= 0;
|
||||
}), str.end());
|
||||
size_t outbuflen = str.length() * 2;
|
||||
char output[outbuflen];
|
||||
char *outptr = output;
|
||||
|
||||
icu::UnicodeString u_str = icu::UnicodeString::fromUTF8(str);
|
||||
str.clear();
|
||||
u_str.toLower().toUTF8String(str);
|
||||
char *input = (char *) str.c_str();
|
||||
size_t insize = str.length();
|
||||
size_t outsize = outbuflen;
|
||||
|
||||
iconv(cd, &input, &insize, &outptr, &outsize);
|
||||
size_t actual_size = outbuflen - outsize;
|
||||
|
||||
if(actual_size == 0) {
|
||||
str.assign(lower_and_no_special_chars(str));
|
||||
return ;
|
||||
}
|
||||
|
||||
std::string nstr = std::string(output, actual_size);
|
||||
str.assign(lower_and_no_special_chars(nstr));
|
||||
}
|
@ -1,5 +1,7 @@
|
||||
#include <gtest/gtest.h>
|
||||
#include "string_utils.h"
|
||||
#include <iconv.h>
|
||||
#include <unicode/translit.h>
|
||||
|
||||
TEST(StringUtilsTest, ShouldNormalizeString) {
|
||||
StringUtils string_utils;
|
||||
@ -16,13 +18,17 @@ TEST(StringUtilsTest, ShouldNormalizeString) {
|
||||
string_utils.unicode_normalize(alphanum_specialchars);
|
||||
ASSERT_STREQ("aa12zzwr", alphanum_specialchars.c_str());
|
||||
|
||||
std::string alphanum_unicodechars = "abcÅà123";
|
||||
std::string alphanum_unicodechars = "abcÅà123ß";
|
||||
string_utils.unicode_normalize(alphanum_unicodechars);
|
||||
ASSERT_STREQ("abcåà123", alphanum_unicodechars.c_str());
|
||||
ASSERT_STREQ("abcaa123ss", alphanum_unicodechars.c_str());
|
||||
|
||||
std::string tamil_unicodechars = "தமிழ் நாடு";
|
||||
string_utils.unicode_normalize(tamil_unicodechars);
|
||||
ASSERT_STREQ("தமிழ்நாடு", tamil_unicodechars.c_str());
|
||||
|
||||
std::string chinese_unicodechars = "你好吗";
|
||||
string_utils.unicode_normalize(chinese_unicodechars);
|
||||
ASSERT_STREQ("你好吗", chinese_unicodechars.c_str());
|
||||
}
|
||||
|
||||
TEST(StringUtilsTest, ShouldJoinString) {
|
||||
@ -33,4 +39,4 @@ TEST(StringUtilsTest, ShouldJoinString) {
|
||||
|
||||
const std::string & joined_str2 = StringUtils::join(parts, "/", 2);
|
||||
ASSERT_STREQ("baz/bazinga", joined_str2.c_str());
|
||||
}
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user