typesense/include/tokenizer.h
kishorenc 6997e35f72 Combine various token operations in a single flow.
Splitting, normalizing etc. are now done in a single loop.
2020-11-17 20:10:34 +05:30

27 lines
586 B
C++

#pragma once
#include <string>
#include <vector>
#include <iconv.h>
/**
 * Walks over an input string and hands out its tokens, optionally
 * normalizing them along the way.
 *
 * The object owns an iconv conversion descriptor (UTF-8 to ASCII with
 * transliteration) for its whole lifetime and releases it on destruction.
 * Because of that ownership the type is non-copyable.
 *
 * Lifetime: the input string is held by reference, not copied. The caller
 * must keep it alive (and must not pass a temporary) for as long as the
 * Tokenizer is used.
 */
class Tokenizer {
private:
    // Borrowed input; see the lifetime note on the class.
    const std::string& text;

    // Starts at 0; presumably the current read offset into `text`
    // (maintained by the out-of-line next()) -- confirm in the .cpp.
    size_t i;

    // Flags captured at construction. Their exact effect lives in the
    // out-of-line implementation; by name: whether empty tokens are
    // emitted, and whether tokens are normalized.
    const bool keep_empty;
    const bool normalize;

    // Starts at 0; presumably the running index of emitted tokens
    // -- confirm in the .cpp.
    size_t token_counter = 0;

    // iconv descriptor converting UTF-8 to ASCII//TRANSLIT.
    // Equals (iconv_t)-1 if iconv_open() failed.
    iconv_t cd;

public:
    /**
     * @param input       Text to tokenize; held by reference, must outlive this object.
     * @param keep_empty  Stored flag, defaults to true.
     * @param normalize   Stored flag, defaults to true.
     */
    Tokenizer(const std::string& input, bool keep_empty=true, bool normalize=true):
        text(input), i(0), keep_empty(keep_empty), normalize(normalize),
        cd(iconv_open("ASCII//TRANSLIT", "UTF-8")) {
    }

    /**
     * Releases the iconv descriptor acquired in the constructor.
     * Without this every Tokenizer instance leaked one descriptor.
     */
    ~Tokenizer() {
        // iconv_open() signals failure with (iconv_t)-1, which must not be
        // handed to iconv_close(). The C-style cast is the documented idiom,
        // as iconv_t is an opaque, platform-defined type.
        if (cd != (iconv_t)(-1)) {
            iconv_close(cd);
        }
    }

    // The descriptor has a single owner: a copy would close it twice.
    Tokenizer(const Tokenizer&) = delete;
    Tokenizer& operator=(const Tokenizer&) = delete;

    /**
     * Defined out of line. Presumably stores the next token in `token`,
     * its index in `token_index`, and returns false once the input is
     * exhausted -- confirm against the implementation.
     */
    bool next(std::string& token, size_t& token_index);

    /**
     * Defined out of line. Presumably collects all tokens of the input
     * into `tokens` -- confirm against the implementation.
     */
    void tokenize(std::vector<std::string>& tokens);
};