From 7703939430c707dba84cd64e8a8123ed0571c3b3 Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Sat, 7 Aug 2021 18:13:29 +0530 Subject: [PATCH] Parameterize combination limit for more exhaustive searching. --- include/collection.h | 3 +- include/index.h | 21 ++++++----- src/collection.cpp | 6 ++-- src/collection_manager.cpp | 12 ++++++- src/index.cpp | 74 ++++++++++++-------------------------- 5 files changed, 53 insertions(+), 63 deletions(-) diff --git a/include/collection.h b/include/collection.h index fe4a72e8..4d499e84 100644 --- a/include/collection.h +++ b/include/collection.h @@ -537,7 +537,8 @@ public: bool prioritize_exact_match=true, bool pre_segmented_query=false, bool enable_overrides=true, - const std::string& highlight_fields="") const; + const std::string& highlight_fields="", + const size_t combination_limit = Index::COMBINATION_LIMIT) const; Option get_filter_ids(const std::string & simple_filter_query, std::vector>& index_ids); diff --git a/include/index.h b/include/index.h index c675fd3f..ef05e6c4 100644 --- a/include/index.h +++ b/include/index.h @@ -68,6 +68,7 @@ struct search_args { std::string default_sorting_field; bool prioritize_exact_match; size_t all_result_ids_len; + size_t combination_limit; spp::sparse_hash_set groups_processed; std::vector> searched_queries; Topster* topster; @@ -88,7 +89,8 @@ struct search_args { size_t drop_tokens_threshold, size_t typo_tokens_threshold, const std::vector& group_by_fields, size_t group_limit, const std::string& default_sorting_field, - bool prioritize_exact_match): + bool prioritize_exact_match, + size_t combination_limit): field_query_tokens(field_query_tokens), search_fields(search_fields), filters(filters), facets(facets), included_ids(included_ids), excluded_ids(excluded_ids), sort_fields_std(sort_fields_std), @@ -96,7 +98,8 @@ struct search_args { page(page), token_order(token_order), prefixes(prefixes), drop_tokens_threshold(drop_tokens_threshold), 
typo_tokens_threshold(typo_tokens_threshold), group_by_fields(group_by_fields), group_limit(group_limit), default_sorting_field(default_sorting_field), - prioritize_exact_match(prioritize_exact_match), all_result_ids_len(0) { + prioritize_exact_match(prioritize_exact_match), all_result_ids_len(0), + combination_limit(combination_limit) { const size_t topster_size = std::max((size_t)1, max_hits); // needs to be atleast 1 since scoring is mandatory topster = new Topster(topster_size, group_limit); @@ -220,7 +223,8 @@ private: bool prioritize_exact_match, const token_ordering token_order = FREQUENCY, const bool prefix = false, const size_t drop_tokens_threshold = Index::DROP_TOKENS_THRESHOLD, - const size_t typo_tokens_threshold = Index::TYPO_TOKENS_THRESHOLD) const; + const size_t typo_tokens_threshold = Index::TYPO_TOKENS_THRESHOLD, + const size_t combination_limit = Index::COMBINATION_LIMIT) const; void search_candidates(const uint8_t & field_id, uint32_t* filter_ids, size_t filter_ids_length, @@ -235,7 +239,8 @@ private: const size_t typo_tokens_threshold, const size_t group_limit, const std::vector& group_by_fields, const std::vector& query_tokens, - bool prioritize_exact_match) const; + bool prioritize_exact_match, + size_t combination_limit) const; void insert_doc(const int64_t score, art_tree *t, uint32_t seq_id, const std::unordered_map> &token_to_offsets) const; @@ -246,9 +251,6 @@ private: void index_string_array_field(const std::vector & strings, const int64_t score, art_tree *t, uint32_t seq_id, bool is_facet, const field & a_field); - static void remove_and_shift_offset_index(sorted_array& offset_index, const uint32_t* indices_sorted, - const uint32_t indices_length); - void collate_included_ids(const std::vector& q_included_tokens, const std::string & field, const uint8_t field_id, const std::map> & included_ids_map, @@ -295,6 +297,8 @@ public: // for limiting number of fields that can be searched on enum {FIELD_LIMIT_NUM = 100}; + enum 
{COMBINATION_LIMIT = 10}; + // If the number of results found is less than this threshold, Typesense will attempt to drop the tokens // in the query that have the least individual hits one by one until enough results are found. static const int DROP_TOKENS_THRESHOLD = 10; @@ -364,7 +368,8 @@ public: const size_t group_limit, const std::vector& group_by_fields, const std::string& default_sorting_field, - bool prioritize_exact_match) const; + bool prioritize_exact_match, + const size_t combination_limit) const; Option remove(const uint32_t seq_id, const nlohmann::json & document, const bool is_update); diff --git a/src/collection.cpp b/src/collection.cpp index bd8fbae3..9f206af2 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -510,7 +510,8 @@ Option Collection::search(const std::string & query, const std:: bool prioritize_exact_match, bool pre_segmented_query, bool enable_overrides, - const std::string& highlight_fields) const { + const std::string& highlight_fields, + const size_t combination_limit) const { std::shared_lock lock(mutex); @@ -936,7 +937,8 @@ Option Collection::search(const std::string & query, const std:: sort_fields_std, facet_query, num_typos, max_facet_values, max_hits, per_page, page, token_order, prefixes, drop_tokens_threshold, typo_tokens_threshold, - group_by_fields, group_limit, default_sorting_field, prioritize_exact_match); + group_by_fields, group_limit, default_sorting_field, prioritize_exact_match, + combination_limit); search_args_vec.push_back(search_params); diff --git a/src/collection_manager.cpp b/src/collection_manager.cpp index dc81f1ad..de6ed66a 100644 --- a/src/collection_manager.cpp +++ b/src/collection_manager.cpp @@ -516,6 +516,8 @@ Option CollectionManager::do_search(std::map& re const char *PRIORITIZE_EXACT_MATCH = "prioritize_exact_match"; const char *PRE_SEGMENTED_QUERY = "pre_segmented_query"; + const char *EXHAUSTIVE_SEARCH = "exhaustive_search"; + if(req_params.count(NUM_TYPOS) == 0) { req_params[NUM_TYPOS] 
= "2"; } @@ -613,6 +615,10 @@ Option CollectionManager::do_search(std::map& re req_params[PRE_SEGMENTED_QUERY] = "false"; } + if(req_params.count(EXHAUSTIVE_SEARCH) == 0) { + req_params[EXHAUSTIVE_SEARCH] = "false"; + } + std::vector query_by_weights_str; std::vector query_by_weights; @@ -684,6 +690,7 @@ Option CollectionManager::do_search(std::map& re bool prioritize_exact_match = (req_params[PRIORITIZE_EXACT_MATCH] == "true"); bool pre_segmented_query = (req_params[PRE_SEGMENTED_QUERY] == "true"); + bool exhaustive_search = (req_params[EXHAUSTIVE_SEARCH] == "true"); std::string filter_str = req_params.count(FILTER) != 0 ? req_params[FILTER] : ""; @@ -764,6 +771,8 @@ Option CollectionManager::do_search(std::map& re } } + const size_t combination_limit = exhaustive_search ? 10000 : 10; + Option result_op = collection->search(req_params[QUERY], search_fields, filter_str, facet_fields, sort_fields, num_typos, static_cast(std::stol(req_params[PER_PAGE])), @@ -787,7 +796,8 @@ Option CollectionManager::do_search(std::map& re prioritize_exact_match, pre_segmented_query, enable_overrides, - req_params[HIGHLIGHT_FIELDS] + req_params[HIGHLIGHT_FIELDS], + combination_limit ); uint64_t timeMillis = std::chrono::duration_cast( diff --git a/src/index.cpp b/src/index.cpp index 2f4d16ae..0ad07385 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -890,9 +890,8 @@ void Index::search_candidates(const uint8_t & field_id, const size_t group_limit, const std::vector& group_by_fields, const std::vector& query_tokens, - bool prioritize_exact_match) const { - - const long long combination_limit = 10; + bool prioritize_exact_match, + const size_t combination_limit) const { auto product = []( long long a, token_candidates & b ) { return a*b.candidates.size(); }; long long int N = std::accumulate(token_candidates_vec.begin(), token_candidates_vec.end(), 1LL, product); @@ -909,11 +908,14 @@ void Index::search_candidates(const uint8_t & field_id, query_suggestion, token_bits); //LOG(INFO) << 
"field_num_results: " << field_num_results << ", typo_tokens_threshold: " << typo_tokens_threshold; - /*LOG(INFO) << "n: " << n; - for(size_t i=0; i < actual_query_suggestion.size(); i++) { - LOG(INFO) << "i: " << i << " - " << actual_query_suggestion[i]->key << ", ids: " - << actual_query_suggestion[i]->values->ids.getLength() << ", total_cost: " << total_cost; - }*/ + //LOG(INFO) << "n: " << n; + + /*std::stringstream fullq; + for(const auto& qleaf : actual_query_suggestion) { + std::string qtok(reinterpret_cast(qleaf->key),qleaf->key_len - 1); + fullq << qtok << " "; + } + LOG(INFO) << fullq.str();*/ // initialize results with the starting element (for further intersection) size_t result_size = posting_t::num_ids(query_suggestion[0]->values); @@ -1372,7 +1374,8 @@ void Index::run_search(search_args* search_params) { search_params->typo_tokens_threshold, search_params->group_limit, search_params->group_by_fields, search_params->default_sorting_field, - search_params->prioritize_exact_match); + search_params->prioritize_exact_match, + search_params->combination_limit); } void Index::collate_included_ids(const std::vector& q_included_tokens, @@ -1464,7 +1467,8 @@ void Index::search(const std::vector& field_query_tokens, const size_t group_limit, const std::vector& group_by_fields, const std::string& default_sorting_field, - bool prioritize_exact_match) const { + bool prioritize_exact_match, + const size_t combination_limit) const { std::shared_lock lock(mutex); @@ -1617,7 +1621,7 @@ void Index::search(const std::vector& field_query_tokens, field_name, filter_ids, filter_ids_length, curated_ids_sorted, facets, sort_fields_std, field_num_typos, searched_queries, actual_topster, groups_processed, &all_result_ids, all_result_ids_len, field_num_results, group_limit, group_by_fields, prioritize_exact_match, token_order, field_prefix, - drop_tokens_threshold, typo_tokens_threshold); + drop_tokens_threshold, typo_tokens_threshold, combination_limit); // do synonym based 
searches for(const auto& syn_tokens: q_pos_synonyms) { @@ -1629,7 +1633,7 @@ void Index::search(const std::vector& field_query_tokens, field_name, filter_ids, filter_ids_length, curated_ids_sorted, facets, sort_fields_std, field_num_typos, searched_queries, actual_topster, groups_processed, &all_result_ids, all_result_ids_len, field_num_results, group_limit, group_by_fields, prioritize_exact_match, token_order, field_prefix, - drop_tokens_threshold, typo_tokens_threshold); + drop_tokens_threshold, typo_tokens_threshold, combination_limit); } // concat is done only for multi-field searches as `ftopster` will be empty for single-field search @@ -1861,7 +1865,9 @@ void Index::search_field(const uint8_t & field_id, const size_t group_limit, const std::vector& group_by_fields, bool prioritize_exact_match, const token_ordering token_order, const bool prefix, - const size_t drop_tokens_threshold, const size_t typo_tokens_threshold) const { + const size_t drop_tokens_threshold, + const size_t typo_tokens_threshold, + const size_t combination_limit) const { size_t max_cost = (num_typos < 0 || num_typos > 2) ? 2 : num_typos; if(search_schema.at(field).locale != "" && search_schema.at(field).locale != "en") { @@ -1891,7 +1897,6 @@ void Index::search_field(const uint8_t & field_id, // stores candidates for each token, i.e. i-th index would have all possible tokens with a cost of "c" std::vector token_candidates_vec; - const long long combination_limit = 10; auto product = []( long long a, std::vector& b ) { return a*b.size(); }; long long n = 0; long long int N = std::accumulate(token_to_costs.begin(), token_to_costs.end(), 1LL, product); @@ -1926,9 +1931,8 @@ void Index::search_field(const uint8_t & field_id, const size_t token_len = prefix_search ? (int) token.length() : (int) token.length() + 1; // need less candidates for filtered searches since we already only pick tokens with results - const int max_candidates = (filter_ids_length == 0) ? 
10 : 3; art_fuzzy_search(search_index.at(field), (const unsigned char *) token.c_str(), token_len, - costs[token_index], costs[token_index], max_candidates, token_order, prefix_search, + costs[token_index], costs[token_index], combination_limit, token_order, prefix_search, filter_ids, filter_ids_length, leaves); if(!leaves.empty()) { @@ -1976,7 +1980,8 @@ void Index::search_field(const uint8_t & field_id, search_candidates(field_id, filter_ids, filter_ids_length, exclude_token_ids, exclude_token_ids_size, curated_ids, sort_fields, token_candidates_vec, searched_queries, topster, groups_processed, all_result_ids, all_result_ids_len, field_num_results, - typo_tokens_threshold, group_limit, group_by_fields, query_tokens, prioritize_exact_match); + typo_tokens_threshold, group_limit, group_by_fields, query_tokens, + prioritize_exact_match, combination_limit); } resume_typo_loop: @@ -2020,7 +2025,7 @@ void Index::search_field(const uint8_t & field_id, num_tokens_dropped, field, filter_ids, filter_ids_length, curated_ids,facets, sort_fields, num_typos,searched_queries, topster, groups_processed, all_result_ids, all_result_ids_len, field_num_results, group_limit, group_by_fields, prioritize_exact_match, - token_order, prefix); + token_order, prefix, drop_tokens_threshold, typo_tokens_threshold, combination_limit); /* thresholds are forwarded explicitly so that combination_limit binds to its own parameter instead of drop_tokens_threshold */ } } @@ -2309,39 +2314,6 @@ inline uint32_t Index::next_suggestion(const std::vector &toke return total_cost; } -void Index::remove_and_shift_offset_index(sorted_array& offset_index, const uint32_t* indices_sorted, - const uint32_t indices_length) { - uint32_t *curr_array = offset_index.uncompress(); - uint32_t *new_array = new uint32_t[offset_index.getLength()]; - - new_array[0] = 0; - uint32_t new_index = 0; - uint32_t curr_index = 0; - uint32_t indices_counter = 0; - uint32_t shift_value = 0; - - while(curr_index < offset_index.getLength()) { - if(indices_counter < indices_length && curr_index >= indices_sorted[indices_counter]) { - // skip copying - if(curr_index == indices_sorted[indices_counter]) { - 
curr_index++; - const uint32_t diff = curr_index == offset_index.getLength() ? - 0 : (offset_index.at(curr_index) - offset_index.at(curr_index-1)); - - shift_value += diff; - } - indices_counter++; - } else { - new_array[new_index++] = curr_array[curr_index++] - shift_value; - } - } - - offset_index.load(new_array, new_index); - - delete[] curr_array; - delete[] new_array; -} - Option Index::remove(const uint32_t seq_id, const nlohmann::json & document, const bool is_update) { std::unique_lock lock(mutex);