add boolean for enabling typos for alphanumeric tokens (#1651)
Co-authored-by: Kishore Nallan <kishorenc@gmail.com>
commit 15114a6c87
parent 5f3b3684b2
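In short: this commit threads a new boolean search parameter, enable_typos_for_alpha_numerical_tokens (default: true), from Collection::search and CollectionManager::do_search through Index::run_search, the override-resolution path, and Index::fuzzy_search_fields, down into Index::get_bounded_typo_cost. When the flag is false, any query token that contains an indexed special character (any non-alphanumeric character, e.g. '-' or '/') gets a typo budget of zero and must match exactly. A minimal sketch of that gating predicate, distilled from the diff below (token_allows_typos is a hypothetical helper, not part of the codebase):

#include <cctype>
#include <string>

// Hypothetical helper mirroring the new guard in Index::get_bounded_typo_cost:
// when the flag is off, a token containing any indexed special character
// is matched with zero typo tolerance.
static bool token_allows_typos(const std::string& token,
                               bool enable_typos_for_alpha_numerical_tokens) {
    if(!enable_typos_for_alpha_numerical_tokens) {
        for(unsigned char c : token) {
            if(!std::isalnum(c)) {  // some special char which is indexed
                return false;       // caller bounds the typo cost to 0
            }
        }
    }
    return true;
}

// token_allows_typos("c-136/14", false) -> false (exact match only)
// token_allows_typos("c13614",   false) -> true  (typos still allowed)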
diff --git a/include/collection.h b/include/collection.h
@@ -278,7 +278,8 @@ private:
                                   std::vector<std::pair<uint32_t, uint32_t>>& included_ids,
                                   std::vector<uint32_t>& excluded_ids,
                                   nlohmann::json& override_metadata,
-                                  bool enable_typos_for_numerical_tokens=true) const;
+                                  bool enable_typos_for_numerical_tokens=true,
+                                  bool enable_typos_for_alpha_numerical_tokens=true) const;
 
     void populate_text_match_info(nlohmann::json& info, uint64_t match_score, const text_match_type_t match_type,
                                   const size_t total_tokens) const;
@@ -588,7 +589,8 @@ public:
                                   bool enable_synonyms = true,
                                   bool synonym_prefix = false,
                                   uint32_t synonym_num_typos = 0,
-                                  bool enable_lazy_filter = false) const;
+                                  bool enable_lazy_filter = false,
+                                  bool enable_typos_for_alpha_numerical_tokens = true) const;
 
     Option<bool> get_filter_ids(const std::string & filter_query, filter_result_t& filter_result) const;
 
diff --git a/include/index.h b/include/index.h
@@ -468,13 +468,15 @@ private:
                              const std::vector<std::string>& query_tokens,
                              token_ordering token_order, std::set<std::string>& absorbed_tokens,
                              std::string& filter_by_clause,
-                             bool enable_typos_for_numerical_tokens) const;
+                             bool enable_typos_for_numerical_tokens,
+                             bool enable_typos_for_alpha_numerical_tokens) const;
 
     bool check_for_overrides(const token_ordering& token_order, const string& field_name, bool slide_window,
                              bool exact_rule_match, std::vector<std::string>& tokens,
                              std::set<std::string>& absorbed_tokens,
                              std::vector<std::string>& field_absorbed_tokens,
-                             bool enable_typos_for_numerical_tokens) const;
+                             bool enable_typos_for_numerical_tokens,
+                             bool enable_typos_for_alpha_numerical_tokens) const;
 
     static void aggregate_topster(Topster* agg_topster, Topster* index_topster);
 
@@ -638,7 +640,8 @@ public:
     facet_index_t* _get_facet_index() const;
 
     static int get_bounded_typo_cost(const size_t max_cost, const std::string& token, const size_t token_len,
-                                     size_t min_len_1typo, size_t min_len_2typo, bool enable_typos_for_numerical_tokens=true);
+                                     size_t min_len_1typo, size_t min_len_2typo, bool enable_typos_for_numerical_tokens=true,
+                                     bool enable_typos_for_alpha_numerical_tokens = true);
 
     static int64_t float_to_int64_t(float n);
 
@@ -663,7 +666,8 @@ public:
 
     Option<bool> run_search(search_args* search_params, const std::string& collection_name,
                             facet_index_type_t facet_index_type, bool enable_typos_for_numerical_tokens,
-                            bool enable_synonyms, bool synonym_prefix, uint32_t synonym_num_typos);
+                            bool enable_synonyms, bool synonym_prefix, uint32_t synonym_num_typos,
+                            bool enable_typos_for_alpha_numerical_tokens);
 
     Option<bool> search(std::vector<query_tokens_t>& field_query_tokens, const std::vector<search_field_t>& the_fields,
                         const text_match_type_t match_type,
@@ -697,7 +701,8 @@ public:
                         bool enable_synonyms = true,
                         bool synonym_prefix = false,
                         uint32_t synonym_num_typos = 0,
-                        bool enable_lazy_filter = false
+                        bool enable_lazy_filter = false,
+                        bool enable_typos_for_alpha_numerical_tokens = true
                         ) const;
 
     void remove_field(uint32_t seq_id, const nlohmann::json& document, const std::string& field_name,
@@ -920,7 +925,8 @@ public:
                                     std::array<spp::sparse_hash_map<uint32_t, int64_t, Hasher32>*, 3>& field_values,
                                     const std::vector<size_t>& geopoint_indices,
                                     const std::string& collection_name = "",
-                                    bool enable_typos_for_numerical_tokens = true) const;
+                                    bool enable_typos_for_numerical_tokens = true,
+                                    bool enable_typos_for_alpha_numerical_tokens = true) const;
 
     void find_across_fields(const token_t& previous_token,
                             const std::string& previous_token_str,
@@ -991,7 +997,8 @@ public:
                                           filter_node_t*& filter_tree_root,
                                           std::vector<const override_t*>& matched_dynamic_overrides,
                                           nlohmann::json& override_metadata,
-                                          bool enable_typos_for_numerical_tokens) const;
+                                          bool enable_typos_for_numerical_tokens,
+                                          bool enable_typos_for_alpha_numerical_tokens) const;
 
     Option<bool> compute_sort_scores(const std::vector<sort_by>& sort_fields, const int* sort_order,
                                      std::array<spp::sparse_hash_map<uint32_t, int64_t, Hasher32>*, 3> field_values,
diff --git a/src/collection.cpp b/src/collection.cpp
@@ -1763,7 +1763,8 @@ Option<nlohmann::json> Collection::search(std::string raw_query,
                                           bool enable_synonyms,
                                           bool synonym_prefix,
                                           uint32_t synonyms_num_typos,
-                                          bool enable_lazy_filter) const {
+                                          bool enable_lazy_filter,
+                                          bool enable_typos_for_alpha_numerical_tokens) const {
     std::shared_lock lock(mutex);
 
     // setup thread local vars
@@ -2293,7 +2294,8 @@ Option<nlohmann::json> Collection::search(std::string raw_query,
                                   false, stopwords_set);
 
         process_filter_overrides(filter_overrides, q_include_tokens, token_order, filter_tree_root,
-                                 included_ids, excluded_ids, override_metadata, enable_typos_for_numerical_tokens);
+                                 included_ids, excluded_ids, override_metadata, enable_typos_for_numerical_tokens,
+                                 enable_typos_for_alpha_numerical_tokens);
 
         for(size_t i = 0; i < q_include_tokens.size(); i++) {
             auto& q_include_token = q_include_tokens[i];
@@ -2314,7 +2316,8 @@ Option<nlohmann::json> Collection::search(std::string raw_query,
 
         // included_ids, excluded_ids
         process_filter_overrides(filter_overrides, q_include_tokens, token_order, filter_tree_root,
-                                 included_ids, excluded_ids, override_metadata, enable_typos_for_numerical_tokens);
+                                 included_ids, excluded_ids, override_metadata, enable_typos_for_numerical_tokens,
+                                 enable_typos_for_alpha_numerical_tokens);
 
         for(size_t i = 0; i < q_include_tokens.size(); i++) {
             auto& q_include_token = q_include_tokens[i];
@@ -2360,7 +2363,7 @@ Option<nlohmann::json> Collection::search(std::string raw_query,
 
     auto search_op = index->run_search(search_params, name, facet_index_type,
                                        enable_typos_for_numerical_tokens, enable_synonyms, synonym_prefix,
-                                       synonyms_num_typos);
+                                       synonyms_num_typos, enable_typos_for_alpha_numerical_tokens);
 
     // filter_tree_root might be updated in Index::static_filter_query_eval.
     filter_tree_root_guard.release();
@@ -3395,12 +3398,14 @@ void Collection::process_filter_overrides(std::vector<const override_t*>& filter
                                           std::vector<std::pair<uint32_t, uint32_t>>& included_ids,
                                           std::vector<uint32_t>& excluded_ids,
                                           nlohmann::json& override_metadata,
-                                          bool enable_typos_for_numerical_tokens) const {
+                                          bool enable_typos_for_numerical_tokens,
+                                          bool enable_typos_for_alpha_numerical_tokens) const {
 
     std::vector<const override_t*> matched_dynamic_overrides;
     index->process_filter_overrides(filter_overrides, q_include_tokens, token_order,
                                     filter_tree_root, matched_dynamic_overrides, override_metadata,
-                                    enable_typos_for_numerical_tokens);
+                                    enable_typos_for_numerical_tokens,
+                                    enable_typos_for_alpha_numerical_tokens);
 
     // we will check the dynamic overrides to see if they also have include/exclude
     std::set<uint32_t> excluded_set;
diff --git a/src/collection_manager.cpp b/src/collection_manager.cpp
@@ -1479,6 +1479,7 @@ Option<bool> CollectionManager::do_search(std::map<std::string, std::string>& re
     const char *VOICE_QUERY = "voice_query";
 
     const char *ENABLE_TYPOS_FOR_NUMERICAL_TOKENS = "enable_typos_for_numerical_tokens";
+    const char *ENABLE_TYPOS_FOR_ALPHA_NUMERICAL_TOKENS = "enable_typos_for_alpha_numerical_tokens";
     const char *ENABLE_LAZY_FILTER = "enable_lazy_filter";
 
     const char *SYNONYM_PREFIX = "synonym_prefix";
@@ -1607,6 +1608,7 @@ Option<bool> CollectionManager::do_search(std::map<std::string, std::string>& re
     bool enable_highlight_v1 = true;
     text_match_type_t match_type = max_score;
     bool enable_typos_for_numerical_tokens = true;
+    bool enable_typos_for_alpha_numerical_tokens = true;
     bool enable_lazy_filter = Config::get_instance().get_enable_lazy_filter();
 
     size_t remote_embedding_timeout_ms = 5000;
@@ -1684,6 +1686,7 @@ Option<bool> CollectionManager::do_search(std::map<std::string, std::string>& re
         {ENABLE_SYNONYMS, &enable_synonyms},
         {SYNONYM_PREFIX, &synonym_prefix},
         {ENABLE_LAZY_FILTER, &enable_lazy_filter},
+        {ENABLE_TYPOS_FOR_ALPHA_NUMERICAL_TOKENS, &enable_typos_for_alpha_numerical_tokens},
     };
 
     std::unordered_map<std::string, std::vector<std::string>*> str_list_values = {
@@ -1902,7 +1905,8 @@ Option<bool> CollectionManager::do_search(std::map<std::string, std::string>& re
                                           enable_synonyms,
                                           synonym_prefix,
                                           synonym_num_typos,
-                                          enable_lazy_filter);
+                                          enable_lazy_filter,
+                                          enable_typos_for_alpha_numerical_tokens);
 
     uint64_t timeMillis = std::chrono::duration_cast<std::chrono::milliseconds>(
             std::chrono::high_resolution_clock::now() - begin).count();
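Because do_search registers ENABLE_TYPOS_FOR_ALPHA_NUMERICAL_TOKENS in its bool_values table above, the flag becomes an ordinary per-request boolean. A hedged usage sketch (the q/query_by values are illustrative assumptions, not taken from this commit):

// Request parameters of the shape do_search consumes; booleans arrive
// as strings and are parsed via the bool_values table shown above.
std::map<std::string, std::string> req_params = {
    {"q", "c-136/14"},
    {"query_by", "title"},
    {"enable_typos_for_alpha_numerical_tokens", "false"}  // exact matching for special-char tokens
};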
diff --git a/src/index.cpp b/src/index.cpp
@@ -2212,7 +2212,8 @@ Option<filter_result_t> Index::do_filtering_with_reference_ids(const std::string
 
 Option<bool> Index::run_search(search_args* search_params, const std::string& collection_name,
                                facet_index_type_t facet_index_type, bool enable_typos_for_numerical_tokens,
-                               bool enable_synonyms, bool synonym_prefix, uint32_t synonym_num_typos) {
+                               bool enable_synonyms, bool synonym_prefix, uint32_t synonym_num_typos,
+                               bool enable_typos_for_alpha_numerical_tokens) {
     return search(search_params->field_query_tokens,
                   search_params->search_fields,
                   search_params->match_type,
@@ -2257,7 +2258,8 @@ Option<bool> Index::run_search(search_args* search_params, const std::string& co
                   enable_synonyms,
                   synonym_prefix,
                   synonym_num_typos,
-                  search_params->enable_lazy_filter
+                  search_params->enable_lazy_filter,
+                  enable_typos_for_alpha_numerical_tokens
                   );
 }
 
@@ -2346,7 +2348,8 @@ bool Index::static_filter_query_eval(const override_t* override,
 bool Index::resolve_override(const std::vector<std::string>& rule_tokens, const bool exact_rule_match,
                              const std::vector<std::string>& query_tokens,
                              token_ordering token_order, std::set<std::string>& absorbed_tokens,
-                             std::string& filter_by_clause, bool enable_typos_for_numerical_tokens) const {
+                             std::string& filter_by_clause, bool enable_typos_for_numerical_tokens,
+                             bool enable_typos_for_alpha_numerical_tokens) const {
 
     bool resolved_override = false;
     size_t i = 0, j = 0;
@@ -2389,7 +2392,8 @@ bool Index::resolve_override(const std::vector<std::string>& rule_tokens, const
             std::vector<std::string> field_absorbed_tokens;
             resolved_override &= check_for_overrides(token_order, field_name, slide_window,
                                                      exact_rule_match, matched_tokens, absorbed_tokens,
-                                                     field_absorbed_tokens, enable_typos_for_numerical_tokens);
+                                                     field_absorbed_tokens, enable_typos_for_numerical_tokens,
+                                                     enable_typos_for_alpha_numerical_tokens);
 
             if(!resolved_override) {
                 goto RETURN_EARLY;
@@ -2441,7 +2445,8 @@ void Index::process_filter_overrides(const std::vector<const override_t*>& filte
                                      filter_node_t*& filter_tree_root,
                                      std::vector<const override_t*>& matched_dynamic_overrides,
                                      nlohmann::json& override_metadata,
-                                     bool enable_typos_for_numerical_tokens) const {
+                                     bool enable_typos_for_numerical_tokens,
+                                     bool enable_typos_for_alpha_numerical_tokens) const {
     std::shared_lock lock(mutex);
 
     for (auto& override : filter_overrides) {
@@ -2478,7 +2483,8 @@ void Index::process_filter_overrides(const std::vector<const override_t*>& filte
             std::set<std::string> absorbed_tokens;
             bool resolved_override = resolve_override(rule_parts, exact_rule_match, query_tokens,
                                                       token_order, absorbed_tokens, filter_by_clause,
-                                                      enable_typos_for_numerical_tokens);
+                                                      enable_typos_for_numerical_tokens,
+                                                      enable_typos_for_alpha_numerical_tokens);
 
             if (resolved_override) {
                 if(override_metadata.empty()) {
@@ -2536,7 +2542,8 @@ bool Index::check_for_overrides(const token_ordering& token_order, const string&
                                 bool exact_rule_match, std::vector<std::string>& tokens,
                                 std::set<std::string>& absorbed_tokens,
                                 std::vector<std::string>& field_absorbed_tokens,
-                                bool enable_typos_for_numerical_tokens) const {
+                                bool enable_typos_for_numerical_tokens,
+                                bool enable_typos_for_alpha_numerical_tokens) const {
 
     for(size_t window_len = tokens.size(); window_len > 0; window_len--) {
         for(size_t start_index = 0; start_index+window_len-1 < tokens.size(); start_index++) {
@@ -2748,7 +2755,8 @@ Option<bool> Index::search(std::vector<query_tokens_t>& field_query_tokens, cons
                            bool enable_typos_for_numerical_tokens,
                            bool enable_synonyms, bool synonym_prefix,
                            uint32_t synonym_num_typos,
-                           bool enable_lazy_filter) const {
+                           bool enable_lazy_filter,
+                           bool enable_typos_for_alpha_numerical_tokens) const {
     std::shared_lock lock(mutex);
 
     auto filter_result_iterator = new filter_result_iterator_t(collection_name, this, filter_tree_root,
@@ -3145,7 +3153,8 @@ Option<bool> Index::search(std::vector<query_tokens_t>& field_query_tokens, cons
                                                       typo_tokens_threshold, exhaustive_search,
                                                       max_candidates, min_len_1typo, min_len_2typo,
                                                       syn_orig_num_tokens, sort_order, field_values, geopoint_indices,
-                                                      collection_name, enable_typos_for_numerical_tokens);
+                                                      collection_name, enable_typos_for_numerical_tokens,
+                                                      enable_typos_for_alpha_numerical_tokens);
         if (!fuzzy_search_fields_op.ok()) {
             return fuzzy_search_fields_op;
         }
@@ -3924,7 +3933,8 @@ Option<bool> Index::fuzzy_search_fields(const std::vector<search_field_t>& the_f
                                         std::array<spp::sparse_hash_map<uint32_t, int64_t, Hasher32>*, 3>& field_values,
                                         const std::vector<size_t>& geopoint_indices,
                                         const std::string& collection_name,
-                                        bool enable_typos_for_numerical_tokens) const {
+                                        bool enable_typos_for_numerical_tokens,
+                                        bool enable_typos_for_alpha_numerical_tokens) const {
 
     // NOTE: `query_tokens` preserve original tokens, while `search_tokens` could be a result of dropped tokens
 
@@ -3939,7 +3949,7 @@ Option<bool> Index::fuzzy_search_fields(const std::vector<search_field_t>& the_f
             std::vector<int> all_costs;
             // This ensures that we don't end up doing a cost of 1 for a single char etc.
             int bounded_cost = get_bounded_typo_cost(2, token, token.length(), min_len_1typo, min_len_2typo,
-                                                     enable_typos_for_numerical_tokens);
+                                                     enable_typos_for_numerical_tokens, enable_typos_for_alpha_numerical_tokens);
 
             for(int cost = 0; cost <= bounded_cost; cost++) {
                 all_costs.push_back(cost);
@@ -6166,7 +6176,16 @@ void Index::populate_sort_mapping_with_lock(int* sort_order, std::vector<size_t>
 
 int Index::get_bounded_typo_cost(const size_t max_cost, const std::string& token, const size_t token_len,
                                  const size_t min_len_1typo, const size_t min_len_2typo,
-                                 bool enable_typos_for_numerical_tokens) {
+                                 bool enable_typos_for_numerical_tokens,
+                                 bool enable_typos_for_alpha_numerical_tokens) {
+
+    if(!enable_typos_for_alpha_numerical_tokens) {
+        for(auto c : token) {
+            if(!isalnum(c)) { // some special char which is indexed
+                return 0;
+            }
+        }
+    }
 
     if(!enable_typos_for_numerical_tokens && std::all_of(token.begin(), token.end(), ::isdigit)) {
         return 0;
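Note the guard ordering inside get_bounded_typo_cost: the new alpha-numerical scan runs before the existing numerical check, so a token such as "13/14" is already rejected by the special-character loop when the new flag is off, and the std::all_of/::isdigit branch only governs purely numeric tokens. Illustrative expectations, assuming min_len_1typo = 4 and min_len_2typo = 7 (assumed values for this sketch, not asserted by the commit):

// Token with an indexed special char: typo budget drops to 0 when the flag is off.
assert(Index::get_bounded_typo_cost(2, "c-136/14", 8, 4, 7, true, false) == 0);
// Purely alphanumeric token of qualifying length: bounded cost stays positive.
assert(Index::get_bounded_typo_cost(2, "c13614", 6, 4, 7, true, false) > 0);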
diff --git a/test/collection_specific_more_test.cpp b/test/collection_specific_more_test.cpp
@@ -3012,4 +3012,85 @@ TEST_F(CollectionSpecificMoreTest, TestFieldStore) {
     ASSERT_EQ(1, res.get()["hits"].size());
     ASSERT_EQ("store", res.get()["hits"][0]["document"]["word_to_store"].get<std::string>());
     ASSERT_TRUE(res.get()["hits"][0]["document"].count("word_not_to_store") == 0);
 }
+
+TEST_F(CollectionSpecificMoreTest, EnableTyposForAlphaNumericalTokens) {
+    nlohmann::json schema = R"({
+        "name": "coll1",
+        "fields": [
+            {"name": "title", "type": "string"}
+        ],
+        "symbols_to_index": ["/"]
+    })"_json;
+
+    Collection* coll1 = collectionManager.create_collection(schema).get();
+
+    nlohmann::json doc;
+    doc["title"] = "c-136/14";
+    ASSERT_TRUE(coll1->add(doc.dump()).ok());
+
+    doc["title"] = "13/14";
+    ASSERT_TRUE(coll1->add(doc.dump()).ok());
+
+    doc["title"] = "(136)214";
+    ASSERT_TRUE(coll1->add(doc.dump()).ok());
+
+    doc["title"] = "c136/14";
+    ASSERT_TRUE(coll1->add(doc.dump()).ok());
+
+    doc["title"] = "A-136/14";
+    ASSERT_TRUE(coll1->add(doc.dump()).ok());
+
+    bool enable_typos_for_alpha_numerical_tokens = false;
+
+    auto res = coll1->search("c-136/14", {"title"}, "", {},
+                             {}, {2}, 10, 1, FREQUENCY, {true},
+                             Index::DROP_TOKENS_THRESHOLD, spp::sparse_hash_set<std::string>(),
+                             spp::sparse_hash_set<std::string>(), 10, "",
+                             30, 4, "", 40,
+                             {}, {}, {}, 0, "<mark>",
+                             "</mark>", {}, 1000, true,
+                             false, true, "", false,
+                             6000*1000, 4, 7, fallback, 4,
+                             {off}, INT16_MAX, INT16_MAX, 2,
+                             2, false, "", true,
+                             0, max_score, 100, 0, 0,
+                             HASH, 30000, 2, "",
+                             {}, {}, "right_to_left", true,
+                             true, false, "", "", "",
+                             "", true, true, false, 0, true,
+                             enable_typos_for_alpha_numerical_tokens).get();
+
+    ASSERT_EQ(2, res["hits"].size());
+
+    ASSERT_EQ("c136/14", res["hits"][0]["document"]["title"].get<std::string>());
+    ASSERT_EQ("c-136/14", res["hits"][1]["document"]["title"].get<std::string>());
+
+    enable_typos_for_alpha_numerical_tokens = true;
+
+    res = coll1->search("c-136/14", {"title"}, "", {},
+                        {}, {2}, 10, 1, FREQUENCY, {true},
+                        Index::DROP_TOKENS_THRESHOLD, spp::sparse_hash_set<std::string>(),
+                        spp::sparse_hash_set<std::string>(), 10, "",
+                        30, 4, "", 40,
+                        {}, {}, {}, 0, "<mark>",
+                        "</mark>", {}, 1000, true,
+                        false, true, "", false,
+                        6000*1000, 4, 7, fallback, 4,
+                        {off}, INT16_MAX, INT16_MAX, 2,
+                        2, false, "", true,
+                        0, max_score, 100, 0, 0,
+                        HASH, 30000, 2, "",
+                        {}, {}, "right_to_left", true,
+                        true, false, "", "", "",
+                        "", true, true, false, 0, true,
+                        enable_typos_for_alpha_numerical_tokens).get();
+
+    ASSERT_EQ(5, res["hits"].size());
+
+    ASSERT_EQ("c136/14", res["hits"][0]["document"]["title"].get<std::string>());
+    ASSERT_EQ("c-136/14", res["hits"][1]["document"]["title"].get<std::string>());
+    ASSERT_EQ("A-136/14", res["hits"][2]["document"]["title"].get<std::string>());
+    ASSERT_EQ("(136)214", res["hits"][3]["document"]["title"].get<std::string>());
+    ASSERT_EQ("13/14", res["hits"][4]["document"]["title"].get<std::string>());
+}
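The test captures the intended behaviour: with the flag off, the query "c-136/14" returns only "c136/14" and "c-136/14", because tokens containing indexed special characters are matched with zero typo tolerance; with the flag on (the default), typo matching also admits "A-136/14", "(136)214" and "13/14", yielding all five documents.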