Parameterize facet query num typos.

This commit is contained in:
Kishore Nallan 2022-03-10 19:05:30 +05:30
parent 7cc533ff8d
commit d0a0597fcb
6 changed files with 83 additions and 83 deletions

View File

@ -406,7 +406,8 @@ public:
size_t max_candidates = 4,
const std::vector<infix_t>& infixes = {off},
const size_t max_extra_prefix = INT16_MAX,
const size_t max_extra_suffix = INT16_MAX) const;
const size_t max_extra_suffix = INT16_MAX,
const size_t facet_query_num_typos = 2) const;
Option<bool> get_filter_ids(const std::string & simple_filter_query,
std::vector<std::pair<size_t, uint32_t*>>& index_ids);

View File

@ -301,6 +301,7 @@ struct search_args {
std::vector<infix_t> infixes;
const size_t max_extra_prefix;
const size_t max_extra_suffix;
const size_t facet_query_num_typos;
spp::sparse_hash_set<uint64_t> groups_processed;
std::vector<std::vector<art_leaf*>> searched_queries;
@ -309,27 +310,17 @@ struct search_args {
std::vector<std::vector<KV*>> raw_result_kvs;
std::vector<std::vector<KV*>> override_result_kvs;
search_args(std::vector<query_tokens_t> field_query_tokens,
std::vector<search_field_t> search_fields, std::vector<filter> filters,
std::vector<facet>& facets,
search_args(std::vector<query_tokens_t> field_query_tokens, std::vector<search_field_t> search_fields,
std::vector<filter> filters, std::vector<facet>& facets,
std::map<size_t, std::map<size_t, uint32_t>> included_ids, std::vector<uint32_t> excluded_ids,
std::vector<sort_by> sort_fields_std, facet_query_t facet_query, const std::vector<uint32_t>& num_typos,
size_t max_facet_values, size_t max_hits, size_t per_page, size_t page, token_ordering token_order,
const std::vector<bool>& prefixes,
size_t drop_tokens_threshold, size_t typo_tokens_threshold,
const std::vector<bool>& prefixes, size_t drop_tokens_threshold, size_t typo_tokens_threshold,
const std::vector<std::string>& group_by_fields, size_t group_limit,
const std::string& default_sorting_field,
bool prioritize_exact_match,
bool exhaustive_search,
size_t concurrency,
const std::vector<const override_t*>& dynamic_overrides,
size_t search_cutoff_ms,
size_t min_len_1typo,
size_t min_len_2typo,
size_t max_candidates,
const std::vector<infix_t>& infixes,
const size_t max_extra_prefix,
const size_t max_extra_suffix):
const string& default_sorting_field, bool prioritize_exact_match, bool exhaustive_search,
size_t concurrency, const std::vector<const override_t*>& dynamic_overrides, size_t search_cutoff_ms,
size_t min_len_1typo, size_t min_len_2typo, size_t max_candidates, const std::vector<infix_t>& infixes,
const size_t max_extra_prefix, const size_t max_extra_suffix, const size_t facet_query_num_typos) :
field_query_tokens(field_query_tokens),
search_fields(search_fields), filters(filters), facets(facets),
included_ids(included_ids), excluded_ids(excluded_ids), sort_fields_std(sort_fields_std),
@ -341,7 +332,8 @@ struct search_args {
exhaustive_search(exhaustive_search), concurrency(concurrency),
filter_overrides(dynamic_overrides), search_cutoff_ms(search_cutoff_ms),
min_len_1typo(min_len_1typo), min_len_2typo(min_len_2typo), max_candidates(max_candidates),
infixes(infixes), max_extra_prefix(max_extra_prefix), max_extra_suffix(max_extra_suffix) {
infixes(infixes), max_extra_prefix(max_extra_prefix), max_extra_suffix(max_extra_suffix),
facet_query_num_typos(facet_query_num_typos) {
const size_t topster_size = std::max((size_t)1, max_hits); // needs to be at least 1 since scoring is mandatory
topster = new Topster(topster_size, group_limit);
@ -687,36 +679,22 @@ public:
void run_search(search_args* search_params);
void search(std::vector<query_tokens_t>& field_query_tokens,
const std::vector<search_field_t> & search_fields,
std::vector<filter> & filters, std::vector<facet> & facets,
facet_query_t & facet_query,
const std::map<size_t, std::map<size_t, uint32_t>> & included_ids_map,
const std::vector<uint32_t> & excluded_ids,
const std::vector<sort_by> & sort_fields_std, const std::vector<uint32_t>& num_typos,
Topster* topster, Topster* curated_topster,
const size_t per_page, const size_t page, const token_ordering token_order,
const std::vector<bool>& prefixes, const size_t drop_tokens_threshold,
size_t & all_result_ids_len,
spp::sparse_hash_set<uint64_t>& groups_processed,
std::vector<std::vector<art_leaf*>> & searched_queries,
std::vector<std::vector<KV*>> & raw_result_kvs,
std::vector<std::vector<KV*>> & override_result_kvs,
const size_t typo_tokens_threshold,
const size_t group_limit,
const std::vector<std::string>& group_by_fields,
const std::vector<const override_t*>& filter_overrides,
const std::string& default_sorting_field,
bool prioritize_exact_match,
bool exhaustive_search,
size_t concurrency,
size_t search_cutoff_ms,
size_t min_len_1typo,
size_t min_len_2typo,
size_t max_candidates,
const std::vector<infix_t>& infixes,
const size_t max_extra_prefix,
const size_t max_extra_suffix) const;
// Search entry point on the index (const member; no default arguments, so
// every caller supplies the full parameter list -- run_search() forwards the
// fields of its search_args one by one).
//
// Parameters whose handling is visible in the definition:
//   search_cutoff_ms       - assigned to search_stop_ms at the start of the search.
//   facet_query_num_typos  - forwarded unchanged to compute_facet_infos(), both for
//                            the pass over all_result_ids and for the pass over the
//                            included (curated) ids.
//   all_result_ids_len     - in/out: incremented by curated_topster->size after the
//                            curated facet pass.
//
// NOTE: `default_sorting_field` is spelled `std::string` explicitly so this
// header declaration does not depend on a using-directive being in scope.
void search(std::vector<query_tokens_t>& field_query_tokens, const std::vector<search_field_t>& the_fields,
std::vector<filter>& filters, std::vector<facet>& facets, facet_query_t& facet_query,
const std::map<size_t, std::map<size_t, uint32_t>>& included_ids_map,
const std::vector<uint32_t>& excluded_ids, const std::vector<sort_by>& sort_fields_std,
const std::vector<uint32_t>& num_typos, Topster* topster, Topster* curated_topster,
const size_t per_page,
const size_t page, const token_ordering token_order, const std::vector<bool>& prefixes,
const size_t drop_tokens_threshold, size_t& all_result_ids_len,
spp::sparse_hash_set<uint64_t>& groups_processed, std::vector<std::vector<art_leaf*>>& searched_queries,
std::vector<std::vector<KV*>>& raw_result_kvs, std::vector<std::vector<KV*>>& override_result_kvs,
const size_t typo_tokens_threshold, const size_t group_limit,
const std::vector<std::string>& group_by_fields, const std::vector<const override_t*>& filter_overrides,
const std::string& default_sorting_field, bool prioritize_exact_match, bool exhaustive_search,
size_t concurrency, size_t search_cutoff_ms, size_t min_len_1typo, size_t min_len_2typo,
size_t max_candidates, const std::vector<infix_t>& infixes, const size_t max_extra_prefix,
const size_t max_extra_suffix, const size_t facet_query_num_typos) const;
Option<uint32_t> remove(const uint32_t seq_id, const nlohmann::json & document, const bool is_update);
@ -788,6 +766,7 @@ public:
static void remove_matched_tokens(std::vector<std::string>& tokens, const std::set<std::string>& rule_token_set) ;
// Produces one facet_info_t per requested facet; callers construct `facet_infos`
// with facets.size() elements before calling.
//
//   facet_query_num_typos - handed to search_field() in the slot that used to carry
//                           a hard-coded 2; presumably the maximum number of typos
//                           tolerated when matching the facet query (confirm against
//                           search_field's signature).
//   all_result_ids / all_result_ids_len
//                         - the id array (and its length) the facet query is
//                           evaluated against; callers pass either the full result
//                           ids or the included (curated) ids.
//   group_by_fields       - forwarded to search_field().
//   facet_infos           - non-const reference; appears to be the output -- TODO confirm.
void compute_facet_infos(const std::vector<facet>& facets, facet_query_t& facet_query,
const size_t facet_query_num_typos,
const uint32_t* all_result_ids, const size_t& all_result_ids_len,
const std::vector<std::string>& group_by_fields,
std::vector<facet_info_t>& facet_infos) const;

View File

@ -697,7 +697,8 @@ Option<nlohmann::json> Collection::search(const std::string & raw_query, const s
const size_t max_candidates,
const std::vector<infix_t>& infixes,
const size_t max_extra_prefix,
const size_t max_extra_suffix) const {
const size_t max_extra_suffix,
const size_t facet_query_num_typos) const {
std::shared_lock lock(mutex);
@ -1027,11 +1028,12 @@ Option<nlohmann::json> Collection::search(const std::string & raw_query, const s
sort_fields_std, facet_query, num_typos, max_facet_values, max_hits,
per_page, page, token_order, prefixes,
drop_tokens_threshold, typo_tokens_threshold,
group_by_fields, group_limit, default_sorting_field, prioritize_exact_match,
group_by_fields, group_limit, default_sorting_field,
prioritize_exact_match,
exhaustive_search, 4, filter_overrides,
search_stop_millis,
min_len_1typo, min_len_2typo, max_candidates, infixes,
max_extra_prefix, max_extra_suffix);
max_extra_prefix, max_extra_suffix, facet_query_num_typos);
index->run_search(search_params);

View File

@ -597,6 +597,7 @@ Option<bool> CollectionManager::do_search(std::map<std::string, std::string>& re
const char *FACET_BY = "facet_by";
const char *FACET_QUERY = "facet_query";
const char *FACET_QUERY_NUM_TYPOS = "facet_query_num_typos";
const char *MAX_FACET_VALUES = "max_facet_values";
const char *GROUP_BY = "group_by";
@ -687,6 +688,7 @@ Option<bool> CollectionManager::do_search(std::map<std::string, std::string>& re
size_t max_facet_values = 10;
std::string simple_facet_query;
size_t facet_query_num_typos = 2;
size_t snippet_threshold = 30;
size_t highlight_affix_num_tokens = 4;
std::string highlight_full_fields;
@ -727,6 +729,7 @@ Option<bool> CollectionManager::do_search(std::map<std::string, std::string>& re
{MAX_EXTRA_PREFIX, &max_extra_prefix},
{MAX_EXTRA_SUFFIX, &max_extra_suffix},
{MAX_CANDIDATES, &max_candidates},
{FACET_QUERY_NUM_TYPOS, &facet_query_num_typos},
};
std::unordered_map<std::string, std::string*> str_values = {

View File

@ -1652,7 +1652,8 @@ void Index::run_search(search_args* search_params) {
search_params->max_candidates,
search_params->infixes,
search_params->max_extra_prefix,
search_params->max_extra_suffix);
search_params->max_extra_suffix,
search_params->facet_query_num_typos);
}
void Index::collate_included_ids(const std::vector<std::string>& q_included_tokens,
@ -2081,37 +2082,24 @@ void Index::search_infix(const std::string& query, const std::string& field_name
}
}
void Index::search(std::vector<query_tokens_t>& field_query_tokens,
const std::vector<search_field_t>& the_fields,
std::vector<filter>& filters,
std::vector<facet>& facets, facet_query_t& facet_query,
const std::map<size_t, std::map<size_t, uint32_t>> & included_ids_map,
const std::vector<uint32_t> & excluded_ids,
const std::vector<sort_by> & sort_fields_std, const std::vector<uint32_t>& num_typos,
Topster* topster,
Topster* curated_topster,
const size_t per_page, const size_t page, const token_ordering token_order,
const std::vector<bool>& prefixes, const size_t drop_tokens_threshold,
size_t & all_result_ids_len,
void Index::search(std::vector<query_tokens_t>& field_query_tokens, const std::vector<search_field_t>& the_fields,
std::vector<filter>& filters, std::vector<facet>& facets, facet_query_t& facet_query,
const std::map<size_t, std::map<size_t, uint32_t>>& included_ids_map,
const std::vector<uint32_t>& excluded_ids, const std::vector<sort_by>& sort_fields_std,
const std::vector<uint32_t>& num_typos, Topster* topster, Topster* curated_topster,
const size_t per_page,
const size_t page, const token_ordering token_order, const std::vector<bool>& prefixes,
const size_t drop_tokens_threshold, size_t& all_result_ids_len,
spp::sparse_hash_set<uint64_t>& groups_processed,
std::vector<std::vector<art_leaf*>>& searched_queries,
std::vector<std::vector<KV*>> & raw_result_kvs,
std::vector<std::vector<KV*>> & override_result_kvs,
const size_t typo_tokens_threshold,
const size_t group_limit,
std::vector<std::vector<KV*>>& raw_result_kvs, std::vector<std::vector<KV*>>& override_result_kvs,
const size_t typo_tokens_threshold, const size_t group_limit,
const std::vector<std::string>& group_by_fields,
const std::vector<const override_t*>& filter_overrides,
const std::string& default_sorting_field,
bool prioritize_exact_match,
const bool exhaustive_search,
const size_t concurrency,
const size_t search_cutoff_ms,
size_t min_len_1typo,
size_t min_len_2typo,
const size_t max_candidates,
const std::vector<infix_t>& infixes,
const size_t max_extra_prefix,
const size_t max_extra_suffix) const {
const string& default_sorting_field, bool prioritize_exact_match, bool exhaustive_search,
size_t concurrency, size_t search_cutoff_ms, size_t min_len_1typo, size_t min_len_2typo,
size_t max_candidates, const std::vector<infix_t>& infixes, const size_t max_extra_prefix,
const size_t max_extra_suffix, const size_t facet_query_num_typos) const {
search_begin = std::chrono::high_resolution_clock::now();
search_stop_ms = search_cutoff_ms;
@ -2231,7 +2219,7 @@ void Index::search(std::vector<query_tokens_t>& field_query_tokens,
std::condition_variable cv_process;
std::vector<facet_info_t> facet_infos(facets.size());
compute_facet_infos(facets, facet_query, all_result_ids, all_result_ids_len,
compute_facet_infos(facets, facet_query, facet_query_num_typos, all_result_ids, all_result_ids_len,
group_by_fields, facet_infos);
std::vector<std::vector<facet>> facet_batches(num_threads);
@ -2316,7 +2304,8 @@ void Index::search(std::vector<query_tokens_t>& field_query_tokens,
}
std::vector<facet_info_t> facet_infos(facets.size());
compute_facet_infos(facets, facet_query, &included_ids[0], included_ids.size(), group_by_fields, facet_infos);
compute_facet_infos(facets, facet_query, facet_query_num_typos,
&included_ids[0], included_ids.size(), group_by_fields, facet_infos);
do_facets(facets, facet_query, facet_infos, group_limit, group_by_fields, &included_ids[0], included_ids.size());
all_result_ids_len += curated_topster->size;
@ -2845,6 +2834,7 @@ void Index::handle_exclusion(const size_t num_search_fields, std::vector<query_t
}
void Index::compute_facet_infos(const std::vector<facet>& facets, facet_query_t& facet_query,
const size_t facet_query_num_typos,
const uint32_t* all_result_ids, const size_t& all_result_ids_len,
const std::vector<std::string>& group_by_fields,
std::vector<facet_info_t>& facet_infos) const {
@ -2907,7 +2897,7 @@ void Index::compute_facet_infos(const std::vector<facet>& facets, facet_query_t&
search_field(0, qtokens, search_tokens, nullptr, 0, num_toks_dropped,
facet_field, facet_field.faceted_name(),
all_result_ids, all_result_ids_len, {}, {}, 2, searched_queries, topster, groups_processed,
all_result_ids, all_result_ids_len, {}, {}, facet_query_num_typos, searched_queries, topster, groups_processed,
&field_result_ids, field_result_ids_len, field_num_results, 0, group_by_fields,
false, 4, query_hashes, MAX_SCORE, true, 0, 1, false, 3, 1000, 4);

View File

@ -811,5 +811,30 @@ TEST_F(CollectionFacetingTest, FacetQueryOnStringArray) {
ASSERT_EQ(1, results["facet_counts"][0]["counts"].size());
ASSERT_EQ("<mark>Country</mark> Punk <mark>Roc</mark>k", results["facet_counts"][0]["counts"][0]["highlighted"].get<std::string>());
// with facet query num typo parameter
results = coll1->search("*", {}, "", {"genres"}, sort_fields, {0}, 0, 1, FREQUENCY,
{false}, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "genres: eletronic",
30, 4, "title", 20, {}, {}, {}, 0,
"<mark>", "</mark>", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7, true,
4, {off}, 32767, 32767, 1).get();
ASSERT_EQ(1, results["facet_counts"].size());
ASSERT_EQ(1, results["facet_counts"][0]["counts"].size());
ASSERT_EQ("<mark>Electroni</mark>c", results["facet_counts"][0]["counts"][0]["highlighted"].get<std::string>());
results = coll1->search("*", {}, "", {"genres"}, sort_fields, {0}, 0, 1, FREQUENCY,
{false}, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "genres: eletronic",
30, 4, "title", 20, {}, {}, {}, 0,
"<mark>", "</mark>", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7, true,
4, {off}, 32767, 32767, 0).get();
ASSERT_EQ(1, results["facet_counts"].size());
ASSERT_EQ(0, results["facet_counts"][0]["counts"].size());
collectionManager.drop_collection("coll1");
}