mirror of
https://github.com/typesense/typesense.git
synced 2025-05-22 23:06:30 +08:00
255 lines
7.8 KiB
C++
255 lines
7.8 KiB
C++
#include <sstream>
|
||
#include <algorithm>
|
||
#include "tokenizer.h"
|
||
|
||
/**
 * Prepares a tokenizer over `input`.
 *
 * @param input             UTF-8 text that will be tokenized.
 * @param normalize         Stored in the `normalize` member; next() consults it on the non-locale path.
 * @param no_op             Stored in the `no_op` member; when set, next() hands out the whole text once.
 * @param locale            Locale code. "zh" applies a Traditional-Simplified transliteration to the input,
 *                          "ja" passes the input through JapaneseLocalizer, and every non-empty value other
 *                          than "en" sets up an ICU word break iterator over the text.
 * @param symbols_to_index  Characters whose byte value gets flagged in `index_symbols`.
 * @param separators        Characters whose byte value gets flagged in `separator_symbols`.
 */
Tokenizer::Tokenizer(const std::string& input, bool normalize, bool no_op, const std::string& locale,
                     const std::vector<char>& symbols_to_index,
                     const std::vector<char>& separators):
        i(0), normalize(normalize), no_op(no_op), locale(locale) {

    if(locale == "zh") {
        UErrorCode translit_status = U_ZERO_ERROR;
        transliterator = icu::Transliterator::createInstance("Traditional-Simplified",
                                                             UTRANS_FORWARD, translit_status);

        if(U_FAILURE(translit_status) || transliterator == nullptr) {
            //LOG(ERROR) << "Unable to create transliteration instance for `zh` locale.";
            // without a transliterator the input is used untouched
            transliterator = nullptr;
            text = input;
        } else {
            icu::UnicodeString unicode_input = icu::UnicodeString::fromUTF8(input);
            transliterator->transliterate(unicode_input);

            std::string output;
            unicode_input.toUTF8String(output);

            // NOTE(review): `normalized_text` is presumably released by the destructor -- not visible here
            normalized_text = static_cast<char *>(malloc(output.size() + 1));

            if(normalized_text == nullptr) {
                // allocation failed: keep the transliterated text without going through the C buffer
                text = output;
            } else {
                strcpy(normalized_text, output.c_str());
                text = normalized_text;
            }
        }
    }

    else if(locale == "ja") {
        normalized_text = JapaneseLocalizer::get_instance().normalize(input);

        // never construct a std::string from a null pointer
        if(normalized_text == nullptr) {
            text = input;
        } else {
            text = normalized_text;
        }
    } else {
        text = input;
    }

    // converter used by next() to transliterate non-ASCII characters to ASCII
    cd = iconv_open("ASCII//TRANSLIT", "UTF-8");

    if(!locale.empty() && locale != "en") {
        UErrorCode status = U_ZERO_ERROR;
        const icu::Locale icu_locale(locale.c_str());
        bi = icu::BreakIterator::createWordInstance(icu_locale, status);

        if(U_FAILURE(status) || bi == nullptr) {
            // No usable break iterator: park `position` on DONE so that next() reports
            // "no more tokens" instead of dereferencing a null iterator.
            delete bi;
            bi = nullptr;
            position = icu::BreakIterator::DONE;
            prev_position = -1;
        } else {
            unicode_text = icu::UnicodeString::fromUTF8(text);
            bi->setText(unicode_text);

            position = bi->first();
            prev_position = -1;
        }
    }

    // both tables are indexed by the unsigned byte value of the character
    for(char c: symbols_to_index) {
        index_symbols[uint8_t(c)] = 1;
    }

    for(char c: separators) {
        separator_symbols[uint8_t(c)] = 1;
    }

    // NFKD normalizer, used by next() for the `ko` locale
    UErrorCode errcode = U_ZERO_ERROR;
    nfkd = icu::Normalizer2::getNFKDInstance(errcode);
}
|
||
|
||
bool Tokenizer::next(std::string &token, size_t& token_index, size_t& start_index, size_t& end_index) {
|
||
if(no_op) {
|
||
if(i == text.size()) {
|
||
return false;
|
||
}
|
||
|
||
token = text;
|
||
i = text.size();
|
||
start_index = 0;
|
||
end_index = text.size() - 1;
|
||
return true;
|
||
}
|
||
|
||
if(!locale.empty() && locale != "en") {
|
||
while (position != icu::BreakIterator::DONE) {
|
||
//LOG(INFO) << "Position: " << position;
|
||
bool found_token = false;
|
||
|
||
if(prev_position != -1) {
|
||
std::string word;
|
||
size_t length = position - prev_position;
|
||
//LOG(INFO) << "token: " << token;
|
||
|
||
if(locale == "ko") {
|
||
UErrorCode errcode = U_ZERO_ERROR;
|
||
icu::UnicodeString src = unicode_text.tempSubString(prev_position, length);
|
||
icu::UnicodeString dst;
|
||
nfkd->normalize(src, dst, errcode);
|
||
|
||
if(!U_FAILURE(errcode)) {
|
||
std::string output;
|
||
dst.toUTF8String(output);
|
||
token = output;
|
||
} else {
|
||
LOG(ERROR) << "Unicode error during parsing: " << errcode;
|
||
}
|
||
} else {
|
||
token = unicode_text.toLower().tempSubString(prev_position, length).toUTF8String(word);
|
||
}
|
||
|
||
if(!token.empty()) {
|
||
if (!std::isalnum(token[0]) && is_ascii_char(token[i])) {
|
||
// ignore ascii symbols
|
||
found_token = false;
|
||
} else if(locale == "ko" && token == "·") {
|
||
found_token = false;
|
||
} else if(locale == "zh" && (token == "," || token == "─" || token == "。")) {
|
||
found_token = false;
|
||
} else {
|
||
|
||
if(std::isalnum(token[0]) && is_ascii_char(token[0])) {
|
||
// normalize an ascii string
|
||
std::transform(token.begin(), token.end(), token.begin(),
|
||
[](unsigned char c){ return std::tolower(c); });
|
||
}
|
||
|
||
found_token = true;
|
||
token_index = token_counter++;
|
||
}
|
||
|
||
start_index = utf8_start_index;
|
||
end_index = utf8_start_index + token.size() - 1;
|
||
utf8_start_index = end_index + 1;
|
||
}
|
||
}
|
||
|
||
prev_position = position;
|
||
position = bi->next();
|
||
|
||
if(found_token) {
|
||
return true;
|
||
}
|
||
}
|
||
|
||
return false;
|
||
}
|
||
|
||
while(i < text.size()) {
|
||
if(is_ascii_char(text[i])) {
|
||
size_t this_stream_mode = get_stream_mode(text[i]);
|
||
|
||
if(this_stream_mode == SKIP) {
|
||
i++;
|
||
continue;
|
||
}
|
||
|
||
if(this_stream_mode == SEPARATE) {
|
||
if(out.empty()) {
|
||
i++;
|
||
continue;
|
||
}
|
||
|
||
token = out;
|
||
out.clear();
|
||
|
||
token_index = token_counter++;
|
||
end_index = i - 1;
|
||
i++;
|
||
return true;
|
||
} else {
|
||
if(out.empty()) {
|
||
start_index = i;
|
||
}
|
||
|
||
out += normalize ? char(std::tolower(text[i])) : text[i];
|
||
i++;
|
||
continue;
|
||
}
|
||
}
|
||
|
||
if(out.empty()) {
|
||
start_index = i;
|
||
}
|
||
|
||
char inbuf[5];
|
||
char *p = inbuf;
|
||
|
||
// group bytes to form a unicode representation
|
||
*p++ = text[i++];
|
||
if ((text[i] & 0xC0) == 0x80) *p++ = text[i++];
|
||
if ((text[i] & 0xC0) == 0x80) *p++ = text[i++];
|
||
if ((text[i] & 0xC0) == 0x80) *p++ = text[i++];
|
||
*p = 0;
|
||
size_t insize = (p - &inbuf[0]);
|
||
|
||
if(!normalize) {
|
||
out += inbuf;
|
||
continue;
|
||
}
|
||
|
||
char outbuf[5] = {};
|
||
size_t outsize = sizeof(outbuf);
|
||
char *outptr = outbuf;
|
||
char *inptr = inbuf;
|
||
|
||
//printf("[%s]\n", inbuf);
|
||
|
||
errno = 0;
|
||
iconv(cd, &inptr, &insize, &outptr, &outsize);
|
||
|
||
if(errno == EILSEQ) {
|
||
// symbol cannot be represented as ASCII, so write the original symbol
|
||
out += inbuf;
|
||
} else {
|
||
for(size_t out_index=0; out_index<5; out_index++) {
|
||
if(!normalize) {
|
||
out += outbuf[out_index];
|
||
continue;
|
||
}
|
||
|
||
bool unicode_is_ascii = is_ascii_char(outbuf[out_index]);
|
||
bool keep_char = !unicode_is_ascii || std::isalnum(outbuf[out_index]);
|
||
|
||
if(keep_char) {
|
||
if(unicode_is_ascii && std::isalnum(outbuf[out_index])) {
|
||
outbuf[out_index] = char(std::tolower(outbuf[out_index]));
|
||
}
|
||
out += outbuf[out_index];
|
||
}
|
||
}
|
||
}
|
||
}
|
||
|
||
token = out;
|
||
out.clear();
|
||
end_index = i - 1;
|
||
|
||
if(token.empty()) {
|
||
return false;
|
||
}
|
||
|
||
token_index = token_counter++;
|
||
return true;
|
||
}
|
||
|
||
void Tokenizer::tokenize(std::vector<std::string> &tokens) {
|
||
std::string token;
|
||
size_t token_index;
|
||
|
||
while(next(token, token_index)) {
|
||
tokens.push_back(token);
|
||
}
|
||
}
|
||
|
||
void Tokenizer::tokenize(std::string& token) {
|
||
size_t token_index;
|
||
next(token, token_index);
|
||
}
|
||
|
||
bool Tokenizer::next(std::string &token, size_t &token_index) {
|
||
size_t start_index, end_index;
|
||
return next(token, token_index, start_index, end_index);
|
||
}
|