add boolean for enabling typos for alphanumeric tokens (#1651)

Co-authored-by: Kishore Nallan <kishorenc@gmail.com>
This commit is contained in:
Krunal Gandhi 2024-04-05 08:55:28 +00:00 committed by GitHub
parent 5f3b3684b2
commit 15114a6c87
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 146 additions and 28 deletions

View File

@ -278,7 +278,8 @@ private:
std::vector<std::pair<uint32_t, uint32_t>>& included_ids,
std::vector<uint32_t>& excluded_ids,
nlohmann::json& override_metadata,
bool enable_typos_for_numerical_tokens=true) const;
bool enable_typos_for_numerical_tokens=true,
bool enable_typos_for_alpha_numerical_tokens=true) const;
void populate_text_match_info(nlohmann::json& info, uint64_t match_score, const text_match_type_t match_type,
const size_t total_tokens) const;
@ -588,7 +589,8 @@ public:
bool enable_synonyms = true,
bool synonym_prefix = false,
uint32_t synonym_num_typos = 0,
bool enable_lazy_filter = false) const;
bool enable_lazy_filter = false,
bool enable_typos_for_alpha_numerical_tokens = true) const;
Option<bool> get_filter_ids(const std::string & filter_query, filter_result_t& filter_result) const;

View File

@ -468,13 +468,15 @@ private:
const std::vector<std::string>& query_tokens,
token_ordering token_order, std::set<std::string>& absorbed_tokens,
std::string& filter_by_clause,
bool enable_typos_for_numerical_tokens) const;
bool enable_typos_for_numerical_tokens,
bool enable_typos_for_alpha_numerical_tokens) const;
bool check_for_overrides(const token_ordering& token_order, const string& field_name, bool slide_window,
bool exact_rule_match, std::vector<std::string>& tokens,
std::set<std::string>& absorbed_tokens,
std::vector<std::string>& field_absorbed_tokens,
bool enable_typos_for_numerical_tokens) const;
bool enable_typos_for_numerical_tokens,
bool enable_typos_for_alpha_numerical_tokens) const;
static void aggregate_topster(Topster* agg_topster, Topster* index_topster);
@ -638,7 +640,8 @@ public:
facet_index_t* _get_facet_index() const;
static int get_bounded_typo_cost(const size_t max_cost, const std::string& token, const size_t token_len,
size_t min_len_1typo, size_t min_len_2typo, bool enable_typos_for_numerical_tokens=true);
size_t min_len_1typo, size_t min_len_2typo, bool enable_typos_for_numerical_tokens=true,
bool enable_typos_for_alpha_numerical_tokens = true);
static int64_t float_to_int64_t(float n);
@ -663,7 +666,8 @@ public:
Option<bool> run_search(search_args* search_params, const std::string& collection_name,
facet_index_type_t facet_index_type, bool enable_typos_for_numerical_tokens,
bool enable_synonyms, bool synonym_prefix, uint32_t synonym_num_typos);
bool enable_synonyms, bool synonym_prefix, uint32_t synonym_num_typos,
bool enable_typos_for_alpha_numerical_tokens);
Option<bool> search(std::vector<query_tokens_t>& field_query_tokens, const std::vector<search_field_t>& the_fields,
const text_match_type_t match_type,
@ -697,7 +701,8 @@ public:
bool enable_synonyms = true,
bool synonym_prefix = false,
uint32_t synonym_num_typos = 0,
bool enable_lazy_filter = false
bool enable_lazy_filter = false,
bool enable_typos_for_alpha_numerical_tokens = true
) const;
void remove_field(uint32_t seq_id, const nlohmann::json& document, const std::string& field_name,
@ -920,7 +925,8 @@ public:
std::array<spp::sparse_hash_map<uint32_t, int64_t, Hasher32>*, 3>& field_values,
const std::vector<size_t>& geopoint_indices,
const std::string& collection_name = "",
bool enable_typos_for_numerical_tokens = true) const;
bool enable_typos_for_numerical_tokens = true,
bool enable_typos_for_alpha_numerical_tokens = true) const;
void find_across_fields(const token_t& previous_token,
const std::string& previous_token_str,
@ -991,7 +997,8 @@ public:
filter_node_t*& filter_tree_root,
std::vector<const override_t*>& matched_dynamic_overrides,
nlohmann::json& override_metadata,
bool enable_typos_for_numerical_tokens) const;
bool enable_typos_for_numerical_tokens,
bool enable_typos_for_alpha_numerical_tokens) const;
Option<bool> compute_sort_scores(const std::vector<sort_by>& sort_fields, const int* sort_order,
std::array<spp::sparse_hash_map<uint32_t, int64_t, Hasher32>*, 3> field_values,

View File

@ -1763,7 +1763,8 @@ Option<nlohmann::json> Collection::search(std::string raw_query,
bool enable_synonyms,
bool synonym_prefix,
uint32_t synonyms_num_typos,
bool enable_lazy_filter) const {
bool enable_lazy_filter,
bool enable_typos_for_alpha_numerical_tokens) const {
std::shared_lock lock(mutex);
// setup thread local vars
@ -2293,7 +2294,8 @@ Option<nlohmann::json> Collection::search(std::string raw_query,
false, stopwords_set);
process_filter_overrides(filter_overrides, q_include_tokens, token_order, filter_tree_root,
included_ids, excluded_ids, override_metadata, enable_typos_for_numerical_tokens);
included_ids, excluded_ids, override_metadata, enable_typos_for_numerical_tokens,
enable_typos_for_alpha_numerical_tokens);
for(size_t i = 0; i < q_include_tokens.size(); i++) {
auto& q_include_token = q_include_tokens[i];
@ -2314,7 +2316,8 @@ Option<nlohmann::json> Collection::search(std::string raw_query,
// included_ids, excluded_ids
process_filter_overrides(filter_overrides, q_include_tokens, token_order, filter_tree_root,
included_ids, excluded_ids, override_metadata, enable_typos_for_numerical_tokens);
included_ids, excluded_ids, override_metadata, enable_typos_for_numerical_tokens,
enable_typos_for_alpha_numerical_tokens);
for(size_t i = 0; i < q_include_tokens.size(); i++) {
auto& q_include_token = q_include_tokens[i];
@ -2360,7 +2363,7 @@ Option<nlohmann::json> Collection::search(std::string raw_query,
auto search_op = index->run_search(search_params, name, facet_index_type,
enable_typos_for_numerical_tokens, enable_synonyms, synonym_prefix,
synonyms_num_typos);
synonyms_num_typos, enable_typos_for_alpha_numerical_tokens);
// filter_tree_root might be updated in Index::static_filter_query_eval.
filter_tree_root_guard.release();
@ -3395,12 +3398,14 @@ void Collection::process_filter_overrides(std::vector<const override_t*>& filter
std::vector<std::pair<uint32_t, uint32_t>>& included_ids,
std::vector<uint32_t>& excluded_ids,
nlohmann::json& override_metadata,
bool enable_typos_for_numerical_tokens) const {
bool enable_typos_for_numerical_tokens,
bool enable_typos_for_alpha_numerical_tokens) const {
std::vector<const override_t*> matched_dynamic_overrides;
index->process_filter_overrides(filter_overrides, q_include_tokens, token_order,
filter_tree_root, matched_dynamic_overrides, override_metadata,
enable_typos_for_numerical_tokens);
enable_typos_for_numerical_tokens,
enable_typos_for_alpha_numerical_tokens);
// we will check the dynamic overrides to see if they also have include/exclude
std::set<uint32_t> excluded_set;

View File

@ -1479,6 +1479,7 @@ Option<bool> CollectionManager::do_search(std::map<std::string, std::string>& re
const char *VOICE_QUERY = "voice_query";
const char *ENABLE_TYPOS_FOR_NUMERICAL_TOKENS = "enable_typos_for_numerical_tokens";
const char *ENABLE_TYPOS_FOR_ALPHA_NUMERICAL_TOKENS = "enable_typos_for_alpha_numerical_tokens";
const char *ENABLE_LAZY_FILTER = "enable_lazy_filter";
const char *SYNONYM_PREFIX = "synonym_prefix";
@ -1607,6 +1608,7 @@ Option<bool> CollectionManager::do_search(std::map<std::string, std::string>& re
bool enable_highlight_v1 = true;
text_match_type_t match_type = max_score;
bool enable_typos_for_numerical_tokens = true;
bool enable_typos_for_alpha_numerical_tokens = true;
bool enable_lazy_filter = Config::get_instance().get_enable_lazy_filter();
size_t remote_embedding_timeout_ms = 5000;
@ -1684,6 +1686,7 @@ Option<bool> CollectionManager::do_search(std::map<std::string, std::string>& re
{ENABLE_SYNONYMS, &enable_synonyms},
{SYNONYM_PREFIX, &synonym_prefix},
{ENABLE_LAZY_FILTER, &enable_lazy_filter},
{ENABLE_TYPOS_FOR_ALPHA_NUMERICAL_TOKENS, &enable_typos_for_alpha_numerical_tokens},
};
std::unordered_map<std::string, std::vector<std::string>*> str_list_values = {
@ -1902,7 +1905,8 @@ Option<bool> CollectionManager::do_search(std::map<std::string, std::string>& re
enable_synonyms,
synonym_prefix,
synonym_num_typos,
enable_lazy_filter);
enable_lazy_filter,
enable_typos_for_alpha_numerical_tokens);
uint64_t timeMillis = std::chrono::duration_cast<std::chrono::milliseconds>(
std::chrono::high_resolution_clock::now() - begin).count();

View File

@ -2212,7 +2212,8 @@ Option<filter_result_t> Index::do_filtering_with_reference_ids(const std::string
Option<bool> Index::run_search(search_args* search_params, const std::string& collection_name,
facet_index_type_t facet_index_type, bool enable_typos_for_numerical_tokens,
bool enable_synonyms, bool synonym_prefix, uint32_t synonym_num_typos) {
bool enable_synonyms, bool synonym_prefix, uint32_t synonym_num_typos,
bool enable_typos_for_alpha_numerical_tokens) {
return search(search_params->field_query_tokens,
search_params->search_fields,
search_params->match_type,
@ -2257,7 +2258,8 @@ Option<bool> Index::run_search(search_args* search_params, const std::string& co
enable_synonyms,
synonym_prefix,
synonym_num_typos,
search_params->enable_lazy_filter
search_params->enable_lazy_filter,
enable_typos_for_alpha_numerical_tokens
);
}
@ -2346,7 +2348,8 @@ bool Index::static_filter_query_eval(const override_t* override,
bool Index::resolve_override(const std::vector<std::string>& rule_tokens, const bool exact_rule_match,
const std::vector<std::string>& query_tokens,
token_ordering token_order, std::set<std::string>& absorbed_tokens,
std::string& filter_by_clause, bool enable_typos_for_numerical_tokens) const {
std::string& filter_by_clause, bool enable_typos_for_numerical_tokens,
bool enable_typos_for_alpha_numerical_tokens) const {
bool resolved_override = false;
size_t i = 0, j = 0;
@ -2389,7 +2392,8 @@ bool Index::resolve_override(const std::vector<std::string>& rule_tokens, const
std::vector<std::string> field_absorbed_tokens;
resolved_override &= check_for_overrides(token_order, field_name, slide_window,
exact_rule_match, matched_tokens, absorbed_tokens,
field_absorbed_tokens, enable_typos_for_numerical_tokens);
field_absorbed_tokens, enable_typos_for_numerical_tokens,
enable_typos_for_alpha_numerical_tokens);
if(!resolved_override) {
goto RETURN_EARLY;
@ -2441,7 +2445,8 @@ void Index::process_filter_overrides(const std::vector<const override_t*>& filte
filter_node_t*& filter_tree_root,
std::vector<const override_t*>& matched_dynamic_overrides,
nlohmann::json& override_metadata,
bool enable_typos_for_numerical_tokens) const {
bool enable_typos_for_numerical_tokens,
bool enable_typos_for_alpha_numerical_tokens) const {
std::shared_lock lock(mutex);
for (auto& override : filter_overrides) {
@ -2478,7 +2483,8 @@ void Index::process_filter_overrides(const std::vector<const override_t*>& filte
std::set<std::string> absorbed_tokens;
bool resolved_override = resolve_override(rule_parts, exact_rule_match, query_tokens,
token_order, absorbed_tokens, filter_by_clause,
enable_typos_for_numerical_tokens);
enable_typos_for_numerical_tokens,
enable_typos_for_alpha_numerical_tokens);
if (resolved_override) {
if(override_metadata.empty()) {
@ -2536,7 +2542,8 @@ bool Index::check_for_overrides(const token_ordering& token_order, const string&
bool exact_rule_match, std::vector<std::string>& tokens,
std::set<std::string>& absorbed_tokens,
std::vector<std::string>& field_absorbed_tokens,
bool enable_typos_for_numerical_tokens) const {
bool enable_typos_for_numerical_tokens,
bool enable_typos_for_alpha_numerical_tokens) const {
for(size_t window_len = tokens.size(); window_len > 0; window_len--) {
for(size_t start_index = 0; start_index+window_len-1 < tokens.size(); start_index++) {
@ -2748,7 +2755,8 @@ Option<bool> Index::search(std::vector<query_tokens_t>& field_query_tokens, cons
bool enable_typos_for_numerical_tokens,
bool enable_synonyms, bool synonym_prefix,
uint32_t synonym_num_typos,
bool enable_lazy_filter) const {
bool enable_lazy_filter,
bool enable_typos_for_alpha_numerical_tokens) const {
std::shared_lock lock(mutex);
auto filter_result_iterator = new filter_result_iterator_t(collection_name, this, filter_tree_root,
@ -3145,7 +3153,8 @@ Option<bool> Index::search(std::vector<query_tokens_t>& field_query_tokens, cons
typo_tokens_threshold, exhaustive_search,
max_candidates, min_len_1typo, min_len_2typo,
syn_orig_num_tokens, sort_order, field_values, geopoint_indices,
collection_name, enable_typos_for_numerical_tokens);
collection_name, enable_typos_for_numerical_tokens,
enable_typos_for_alpha_numerical_tokens);
if (!fuzzy_search_fields_op.ok()) {
return fuzzy_search_fields_op;
}
@ -3924,7 +3933,8 @@ Option<bool> Index::fuzzy_search_fields(const std::vector<search_field_t>& the_f
std::array<spp::sparse_hash_map<uint32_t, int64_t, Hasher32>*, 3>& field_values,
const std::vector<size_t>& geopoint_indices,
const std::string& collection_name,
bool enable_typos_for_numerical_tokens) const {
bool enable_typos_for_numerical_tokens,
bool enable_typos_for_alpha_numerical_tokens) const {
// NOTE: `query_tokens` preserve original tokens, while `search_tokens` could be a result of dropped tokens
@ -3939,7 +3949,7 @@ Option<bool> Index::fuzzy_search_fields(const std::vector<search_field_t>& the_f
std::vector<int> all_costs;
// This ensures that we don't end up doing a cost of 1 for a single char etc.
int bounded_cost = get_bounded_typo_cost(2, token , token.length(), min_len_1typo, min_len_2typo,
enable_typos_for_numerical_tokens);
enable_typos_for_numerical_tokens, enable_typos_for_alpha_numerical_tokens);
for(int cost = 0; cost <= bounded_cost; cost++) {
all_costs.push_back(cost);
@ -6166,7 +6176,16 @@ void Index::populate_sort_mapping_with_lock(int* sort_order, std::vector<size_t>
int Index::get_bounded_typo_cost(const size_t max_cost, const std::string& token, const size_t token_len,
const size_t min_len_1typo, const size_t min_len_2typo,
bool enable_typos_for_numerical_tokens) {
bool enable_typos_for_numerical_tokens,
bool enable_typos_for_alpha_numerical_tokens) {
if(!enable_typos_for_alpha_numerical_tokens) {
    // When this flag is off, a token gets no typo budget (cost 0) as soon as it contains
    // any character that is not alphanumeric, i.e. some special char which is indexed.
    //
    // The characters are iterated as `unsigned char` on purpose: passing a negative value
    // to isalnum() is undefined behavior, and a plain `char` is negative for every
    // non-ASCII byte on platforms where `char` is signed.
    for(const unsigned char c : token) {
        if(!isalnum(c)) {
            return 0;
        }
    }
}
if(!enable_typos_for_numerical_tokens && std::all_of(token.begin(), token.end(), ::isdigit)) {
return 0;

View File

@ -3012,4 +3012,85 @@ TEST_F(CollectionSpecificMoreTest, TestFieldStore) {
ASSERT_EQ(1, res.get()["hits"].size());
ASSERT_EQ("store", res.get()["hits"][0]["document"]["word_to_store"].get<std::string>());
ASSERT_TRUE(res.get()["hits"][0]["document"].count("word_not_to_store") == 0);
}
// Exercises the `enable_typos_for_alpha_numerical_tokens` search parameter.
//
// Five titles mixing letters, digits and special characters are indexed into a collection
// whose schema lists "/" under `symbols_to_index`. The same query ("c-136/14") is then run
// twice, once with the flag off and once with it on:
//   * flag off -> exactly two hits are expected ("c136/14", then "c-136/14");
//   * flag on  -> all five documents are expected, in the order asserted below.
TEST_F(CollectionSpecificMoreTest, EnableTyposForAlphaNumericalTokens) {
    nlohmann::json schema = R"({
"name": "coll1",
"fields": [
{"name": "title", "type": "string"}
],
"symbols_to_index":["/"]
})"_json;

    Collection* coll = collectionManager.create_collection(schema).get();

    // Index the fixture documents one by one, in this exact order.
    nlohmann::json record;
    for(const char* title : {"c-136/14", "13/14", "(136)214", "c136/14", "A-136/14"}) {
        record["title"] = title;
        ASSERT_TRUE(coll->add(record.dump()).ok());
    }

    // Both searches are identical except for the final argument, so the long positional
    // argument list is written only once.
    auto search_with_flag = [&](bool alpha_numerical_typos_enabled) {
        return coll->search("c-136/14", {"title"}, "", {},
                            {}, {2}, 10, 1, FREQUENCY, {true},
                            Index::DROP_TOKENS_THRESHOLD, spp::sparse_hash_set<std::string>(),
                            spp::sparse_hash_set<std::string>(), 10, "",
                            30, 4, "", 40,
                            {}, {}, {}, 0, "<mark>",
                            "</mark>", {}, 1000, true,
                            false, true, "", false,
                            6000*1000, 4, 7, fallback, 4,
                            {off}, INT16_MAX, INT16_MAX, 2,
                            2, false, "", true,
                            0, max_score, 100, 0, 0,
                            HASH, 30000, 2, "",
                            {}, {}, "right_to_left", true,
                            true, false, "", "", "",
                            "", true, true, false, 0, true,
                            alpha_numerical_typos_enabled).get();
    };

    // Flag disabled: only two of the five documents are expected to match.
    auto res = search_with_flag(false);
    const std::vector<std::string> titles_when_disabled = {"c136/14", "c-136/14"};
    ASSERT_EQ(2, res["hits"].size());
    for(size_t hit_index = 0; hit_index < titles_when_disabled.size(); hit_index++) {
        ASSERT_EQ(titles_when_disabled[hit_index],
                  res["hits"][hit_index]["document"]["title"].get<std::string>());
    }

    // Flag enabled: every indexed document is expected to be returned.
    res = search_with_flag(true);
    const std::vector<std::string> titles_when_enabled = {
        "c136/14", "c-136/14", "A-136/14", "(136)214", "13/14"
    };
    ASSERT_EQ(5, res["hits"].size());
    for(size_t hit_index = 0; hit_index < titles_when_enabled.size(); hit_index++) {
        ASSERT_EQ(titles_when_enabled[hit_index],
                  res["hits"][hit_index]["document"]["title"].get<std::string>());
    }
}