From d0a0597fcbb4a430407ce2f14b51349f2082dcb1 Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Thu, 10 Mar 2022 19:05:30 +0530 Subject: [PATCH] Parameterize facet query num typos. Add a `facet_query_num_typos` search parameter (default: 2) and thread it through Collection::search, search_args and Index::search into Index::compute_facet_infos, where it replaces the hard-coded typo count of 2 that was passed to search_field for facet queries. --- NOTE(review): src/collection_manager.cpp only declares and parses `facet_query_num_typos`; no hunk in this patch changes the do_search call into Collection::search, so please confirm the parsed value is actually forwarded (otherwise the default of 2 always applies). include/collection.h | 3 +- include/index.h | 75 +++++++++++-------------------- src/collection.cpp | 8 ++-- src/collection_manager.cpp | 3 ++ src/index.cpp | 52 +++++++++------------ test/collection_faceting_test.cpp | 25 +++++++++++ 6 files changed, 83 insertions(+), 83 deletions(-) diff --git a/include/collection.h b/include/collection.h index b66b1106..7eaa0d4e 100644 --- a/include/collection.h +++ b/include/collection.h @@ -406,7 +406,8 @@ public: size_t max_candidates = 4, const std::vector& infixes = {off}, const size_t max_extra_prefix = INT16_MAX, - const size_t max_extra_suffix = INT16_MAX) const; + const size_t max_extra_suffix = INT16_MAX, + const size_t facet_query_num_typos = 2) const; Option get_filter_ids(const std::string & simple_filter_query, std::vector>& index_ids); diff --git a/include/index.h b/include/index.h index 302567d8..d3360553 100644 --- a/include/index.h +++ b/include/index.h @@ -301,6 +301,7 @@ struct search_args { std::vector infixes; const size_t max_extra_prefix; const size_t max_extra_suffix; + const size_t facet_query_num_typos; spp::sparse_hash_set groups_processed; std::vector> searched_queries; @@ -309,27 +310,17 @@ struct search_args { std::vector> raw_result_kvs; std::vector> override_result_kvs; - search_args(std::vector field_query_tokens, - std::vector search_fields, std::vector filters, - std::vector& facets, + search_args(std::vector field_query_tokens, std::vector search_fields, + std::vector filters, std::vector& facets, std::map> included_ids, std::vector excluded_ids, std::vector sort_fields_std, facet_query_t facet_query, const std::vector& num_typos, size_t max_facet_values, size_t max_hits, size_t per_page, size_t page, token_ordering token_order, - const std::vector& prefixes, - size_t drop_tokens_threshold, size_t 
typo_tokens_threshold, + const std::vector& prefixes, size_t drop_tokens_threshold, size_t typo_tokens_threshold, const std::vector& group_by_fields, size_t group_limit, - const std::string& default_sorting_field, - bool prioritize_exact_match, - bool exhaustive_search, - size_t concurrency, - const std::vector& dynamic_overrides, - size_t search_cutoff_ms, - size_t min_len_1typo, - size_t min_len_2typo, - size_t max_candidates, - const std::vector& infixes, - const size_t max_extra_prefix, - const size_t max_extra_suffix): + const std::string& default_sorting_field, bool prioritize_exact_match, bool exhaustive_search, + size_t concurrency, const std::vector& dynamic_overrides, size_t search_cutoff_ms, + size_t min_len_1typo, size_t min_len_2typo, size_t max_candidates, const std::vector& infixes, + const size_t max_extra_prefix, const size_t max_extra_suffix, const size_t facet_query_num_typos) : field_query_tokens(field_query_tokens), search_fields(search_fields), filters(filters), facets(facets), included_ids(included_ids), excluded_ids(excluded_ids), sort_fields_std(sort_fields_std), @@ -341,7 +332,8 @@ struct search_args { exhaustive_search(exhaustive_search), concurrency(concurrency), filter_overrides(dynamic_overrides), search_cutoff_ms(search_cutoff_ms), min_len_1typo(min_len_1typo), min_len_2typo(min_len_2typo), max_candidates(max_candidates), - infixes(infixes), max_extra_prefix(max_extra_prefix), max_extra_suffix(max_extra_suffix) { + infixes(infixes), max_extra_prefix(max_extra_prefix), max_extra_suffix(max_extra_suffix), + facet_query_num_typos(facet_query_num_typos) { const size_t topster_size = std::max((size_t)1, max_hits); // needs to be atleast 1 since scoring is mandatory topster = new Topster(topster_size, group_limit); @@ -687,36 +679,22 @@ public: void run_search(search_args* search_params); - void search(std::vector& field_query_tokens, - const std::vector & search_fields, - std::vector & filters, std::vector & facets, - facet_query_t & 
facet_query, - const std::map> & included_ids_map, - const std::vector & excluded_ids, - const std::vector & sort_fields_std, const std::vector& num_typos, - Topster* topster, Topster* curated_topster, - const size_t per_page, const size_t page, const token_ordering token_order, - const std::vector& prefixes, const size_t drop_tokens_threshold, - size_t & all_result_ids_len, - spp::sparse_hash_set& groups_processed, - std::vector> & searched_queries, - std::vector> & raw_result_kvs, - std::vector> & override_result_kvs, - const size_t typo_tokens_threshold, - const size_t group_limit, - const std::vector& group_by_fields, - const std::vector& filter_overrides, - const std::string& default_sorting_field, - bool prioritize_exact_match, - bool exhaustive_search, - size_t concurrency, - size_t search_cutoff_ms, - size_t min_len_1typo, - size_t min_len_2typo, - size_t max_candidates, - const std::vector& infixes, - const size_t max_extra_prefix, - const size_t max_extra_suffix) const; + void search(std::vector& field_query_tokens, const std::vector& the_fields, + std::vector& filters, std::vector& facets, facet_query_t& facet_query, + const std::map>& included_ids_map, + const std::vector& excluded_ids, const std::vector& sort_fields_std, + const std::vector& num_typos, Topster* topster, Topster* curated_topster, + const size_t per_page, + const size_t page, const token_ordering token_order, const std::vector& prefixes, + const size_t drop_tokens_threshold, size_t& all_result_ids_len, + spp::sparse_hash_set& groups_processed, std::vector>& searched_queries, + std::vector>& raw_result_kvs, std::vector>& override_result_kvs, + const size_t typo_tokens_threshold, const size_t group_limit, + const std::vector& group_by_fields, const std::vector& filter_overrides, + const std::string& default_sorting_field, bool prioritize_exact_match, bool exhaustive_search, + size_t concurrency, size_t search_cutoff_ms, size_t min_len_1typo, size_t min_len_2typo, + size_t max_candidates, const 
std::vector& infixes, const size_t max_extra_prefix, + const size_t max_extra_suffix, const size_t facet_query_num_typos) const; Option remove(const uint32_t seq_id, const nlohmann::json & document, const bool is_update); @@ -788,6 +766,7 @@ public: static void remove_matched_tokens(std::vector& tokens, const std::set& rule_token_set) ; void compute_facet_infos(const std::vector& facets, facet_query_t& facet_query, + const size_t facet_query_num_typos, const uint32_t* all_result_ids, const size_t& all_result_ids_len, const std::vector& group_by_fields, std::vector& facet_infos) const; diff --git a/src/collection.cpp b/src/collection.cpp index af8c10bb..23b67eca 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -697,7 +697,8 @@ Option Collection::search(const std::string & raw_query, const s const size_t max_candidates, const std::vector& infixes, const size_t max_extra_prefix, - const size_t max_extra_suffix) const { + const size_t max_extra_suffix, + const size_t facet_query_num_typos) const { std::shared_lock lock(mutex); @@ -1027,11 +1028,12 @@ Option Collection::search(const std::string & raw_query, const s sort_fields_std, facet_query, num_typos, max_facet_values, max_hits, per_page, page, token_order, prefixes, drop_tokens_threshold, typo_tokens_threshold, - group_by_fields, group_limit, default_sorting_field, prioritize_exact_match, + group_by_fields, group_limit, default_sorting_field, + prioritize_exact_match, exhaustive_search, 4, filter_overrides, search_stop_millis, min_len_1typo, min_len_2typo, max_candidates, infixes, - max_extra_prefix, max_extra_suffix); + max_extra_prefix, max_extra_suffix, facet_query_num_typos); index->run_search(search_params); diff --git a/src/collection_manager.cpp b/src/collection_manager.cpp index a68c906b..03601e2e 100644 --- a/src/collection_manager.cpp +++ b/src/collection_manager.cpp @@ -597,6 +597,7 @@ Option CollectionManager::do_search(std::map& re const char *FACET_BY = "facet_by"; const char *FACET_QUERY = 
"facet_query"; + const char *FACET_QUERY_NUM_TYPOS = "facet_query_num_typos"; const char *MAX_FACET_VALUES = "max_facet_values"; const char *GROUP_BY = "group_by"; @@ -687,6 +688,7 @@ Option CollectionManager::do_search(std::map& re size_t max_facet_values = 10; std::string simple_facet_query; + size_t facet_query_num_typos = 2; size_t snippet_threshold = 30; size_t highlight_affix_num_tokens = 4; std::string highlight_full_fields; @@ -727,6 +729,7 @@ Option CollectionManager::do_search(std::map& re {MAX_EXTRA_PREFIX, &max_extra_prefix}, {MAX_EXTRA_SUFFIX, &max_extra_suffix}, {MAX_CANDIDATES, &max_candidates}, + {FACET_QUERY_NUM_TYPOS, &facet_query_num_typos}, }; std::unordered_map str_values = { diff --git a/src/index.cpp b/src/index.cpp index 5d9686ba..1750632f 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -1652,7 +1652,8 @@ void Index::run_search(search_args* search_params) { search_params->max_candidates, search_params->infixes, search_params->max_extra_prefix, - search_params->max_extra_suffix); + search_params->max_extra_suffix, + search_params->facet_query_num_typos); } void Index::collate_included_ids(const std::vector& q_included_tokens, @@ -2081,37 +2082,24 @@ void Index::search_infix(const std::string& query, const std::string& field_name } } -void Index::search(std::vector& field_query_tokens, - const std::vector& the_fields, - std::vector& filters, - std::vector& facets, facet_query_t& facet_query, - const std::map> & included_ids_map, - const std::vector & excluded_ids, - const std::vector & sort_fields_std, const std::vector& num_typos, - Topster* topster, - Topster* curated_topster, - const size_t per_page, const size_t page, const token_ordering token_order, - const std::vector& prefixes, const size_t drop_tokens_threshold, - size_t & all_result_ids_len, +void Index::search(std::vector& field_query_tokens, const std::vector& the_fields, + std::vector& filters, std::vector& facets, facet_query_t& facet_query, + const std::map>& included_ids_map, 
+ const std::vector& excluded_ids, const std::vector& sort_fields_std, + const std::vector& num_typos, Topster* topster, Topster* curated_topster, + const size_t per_page, + const size_t page, const token_ordering token_order, const std::vector& prefixes, + const size_t drop_tokens_threshold, size_t& all_result_ids_len, spp::sparse_hash_set& groups_processed, std::vector>& searched_queries, - std::vector> & raw_result_kvs, - std::vector> & override_result_kvs, - const size_t typo_tokens_threshold, - const size_t group_limit, + std::vector>& raw_result_kvs, std::vector>& override_result_kvs, + const size_t typo_tokens_threshold, const size_t group_limit, const std::vector& group_by_fields, const std::vector& filter_overrides, - const std::string& default_sorting_field, - bool prioritize_exact_match, - const bool exhaustive_search, - const size_t concurrency, - const size_t search_cutoff_ms, - size_t min_len_1typo, - size_t min_len_2typo, - const size_t max_candidates, - const std::vector& infixes, - const size_t max_extra_prefix, - const size_t max_extra_suffix) const { + const std::string& default_sorting_field, bool prioritize_exact_match, bool exhaustive_search, + size_t concurrency, size_t search_cutoff_ms, size_t min_len_1typo, size_t min_len_2typo, + size_t max_candidates, const std::vector& infixes, const size_t max_extra_prefix, + const size_t max_extra_suffix, const size_t facet_query_num_typos) const { search_begin = std::chrono::high_resolution_clock::now(); search_stop_ms = search_cutoff_ms; @@ -2231,7 +2219,7 @@ void Index::search(std::vector& field_query_tokens, std::condition_variable cv_process; std::vector facet_infos(facets.size()); - compute_facet_infos(facets, facet_query, all_result_ids, all_result_ids_len, + compute_facet_infos(facets, facet_query, facet_query_num_typos, all_result_ids, all_result_ids_len, group_by_fields, facet_infos); std::vector> facet_batches(num_threads); @@ -2316,7 +2304,8 @@ void Index::search(std::vector& field_query_tokens, 
} std::vector facet_infos(facets.size()); - compute_facet_infos(facets, facet_query, &included_ids[0], included_ids.size(), group_by_fields, facet_infos); + compute_facet_infos(facets, facet_query, facet_query_num_typos, + &included_ids[0], included_ids.size(), group_by_fields, facet_infos); do_facets(facets, facet_query, facet_infos, group_limit, group_by_fields, &included_ids[0], included_ids.size()); all_result_ids_len += curated_topster->size; @@ -2845,6 +2834,7 @@ void Index::handle_exclusion(const size_t num_search_fields, std::vector& facets, facet_query_t& facet_query, + const size_t facet_query_num_typos, const uint32_t* all_result_ids, const size_t& all_result_ids_len, const std::vector& group_by_fields, std::vector& facet_infos) const { @@ -2907,7 +2897,7 @@ void Index::compute_facet_infos(const std::vector& facets, facet_query_t& search_field(0, qtokens, search_tokens, nullptr, 0, num_toks_dropped, facet_field, facet_field.faceted_name(), - all_result_ids, all_result_ids_len, {}, {}, 2, searched_queries, topster, groups_processed, + all_result_ids, all_result_ids_len, {}, {}, facet_query_num_typos, searched_queries, topster, groups_processed, &field_result_ids, field_result_ids_len, field_num_results, 0, group_by_fields, false, 4, query_hashes, MAX_SCORE, true, 0, 1, false, 3, 1000, 4); diff --git a/test/collection_faceting_test.cpp b/test/collection_faceting_test.cpp index 6e5f6612..680d404b 100644 --- a/test/collection_faceting_test.cpp +++ b/test/collection_faceting_test.cpp @@ -811,5 +811,30 @@ TEST_F(CollectionFacetingTest, FacetQueryOnStringArray) { ASSERT_EQ(1, results["facet_counts"].size()); ASSERT_EQ(1, results["facet_counts"][0]["counts"].size()); ASSERT_EQ("Country Punk Rock", results["facet_counts"][0]["counts"][0]["highlighted"].get()); + // facet query typo tolerance is controlled by facet_query_num_typos (the last search argument) + + results = coll1->search("*", {}, "", {"genres"}, sort_fields, {0}, 0, 1, FREQUENCY, + {false}, Index::DROP_TOKENS_THRESHOLD, + spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "genres: eletronic", + 30, 
4, "title", 20, {}, {}, {}, 0, + "", "", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7, true, + 4, {off}, 32767, 32767, 1).get(); + + ASSERT_EQ(1, results["facet_counts"].size()); + ASSERT_EQ(1, results["facet_counts"][0]["counts"].size()); + ASSERT_EQ("Electronic", results["facet_counts"][0]["counts"][0]["highlighted"].get()); + + results = coll1->search("*", {}, "", {"genres"}, sort_fields, {0}, 0, 1, FREQUENCY, + {false}, Index::DROP_TOKENS_THRESHOLD, + spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "genres: eletronic", + 30, 4, "title", 20, {}, {}, {}, 0, + "", "", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7, true, + 4, {off}, 32767, 32767, 0).get(); + + ASSERT_EQ(1, results["facet_counts"].size()); + ASSERT_EQ(0, results["facet_counts"][0]["counts"].size()); + collectionManager.drop_collection("coll1"); }