Add flag for lazy filtering.

This commit is contained in:
Kishore Nallan 2024-03-30 15:30:10 +05:30
parent 9c0ca02812
commit c1b49ef0b9
5 changed files with 23 additions and 10 deletions

View File

@ -584,7 +584,8 @@ public:
std::string conversation_id = "",
const std::string& override_tags_str = "",
const std::string& voice_query = "",
bool enable_typos_for_numerical_tokens = true) const;
bool enable_typos_for_numerical_tokens = true,
bool enable_lazy_filter = true) const;
Option<bool> get_filter_ids(const std::string & filter_query, filter_result_t& filter_result) const;

View File

@ -174,6 +174,8 @@ struct search_args {
size_t facet_sample_threshold;
drop_tokens_param_t drop_tokens_mode;
bool enable_lazy_filter;
search_args(std::vector<query_tokens_t> field_query_tokens, std::vector<search_field_t> search_fields,
const text_match_type_t match_type,
filter_node_t* filter_tree_root, std::vector<facet>& facets,
@ -189,7 +191,8 @@ struct search_args {
size_t min_len_1typo, size_t min_len_2typo, size_t max_candidates, const std::vector<enable_t>& infixes,
const size_t max_extra_prefix, const size_t max_extra_suffix, const size_t facet_query_num_typos,
const bool filter_curated_hits, const enable_t split_join_tokens, vector_query_t& vector_query,
size_t facet_sample_percent, size_t facet_sample_threshold, drop_tokens_param_t drop_tokens_mode) :
size_t facet_sample_percent, size_t facet_sample_threshold, drop_tokens_param_t drop_tokens_mode,
bool enable_lazy_filter) :
field_query_tokens(field_query_tokens),
search_fields(search_fields), match_type(match_type), filter_tree_root(filter_tree_root), facets(facets),
included_ids(included_ids), excluded_ids(excluded_ids), sort_fields_std(sort_fields_std),
@ -208,7 +211,7 @@ struct search_args {
facet_query_num_typos(facet_query_num_typos), filter_curated_hits(filter_curated_hits),
split_join_tokens(split_join_tokens), vector_query(vector_query),
facet_sample_percent(facet_sample_percent), facet_sample_threshold(facet_sample_threshold),
drop_tokens_mode(drop_tokens_mode) {
drop_tokens_mode(drop_tokens_mode), enable_lazy_filter(enable_lazy_filter) {
const size_t topster_size = std::max((size_t)1, max_hits); // needs to be at least 1 since scoring is mandatory
topster = new Topster(topster_size, group_limit);
@ -689,7 +692,8 @@ public:
const std::string& collection_name,
const drop_tokens_param_t drop_tokens_mode,
facet_index_type_t facet_index_type = DETECT,
bool enable_typos_for_numerical_tokens = true
bool enable_typos_for_numerical_tokens = true,
bool enable_lazy_filter = true
) const;
void remove_field(uint32_t seq_id, const nlohmann::json& document, const std::string& field_name,

View File

@ -1757,7 +1757,8 @@ Option<nlohmann::json> Collection::search(std::string raw_query,
std::string conversation_id,
const std::string& override_tags_str,
const std::string& voice_query,
bool enable_typos_for_numerical_tokens) const {
bool enable_typos_for_numerical_tokens,
bool enable_lazy_filter) const {
std::shared_lock lock(mutex);
// setup thread local vars
@ -2343,7 +2344,8 @@ Option<nlohmann::json> Collection::search(std::string raw_query,
min_len_1typo, min_len_2typo, max_candidates, infixes,
max_extra_prefix, max_extra_suffix, facet_query_num_typos,
filter_curated_hits, split_join_tokens, vector_query,
facet_sample_percent, facet_sample_threshold, drop_tokens_param);
facet_sample_percent, facet_sample_threshold, drop_tokens_param,
enable_lazy_filter);
std::unique_ptr<search_args> search_params_guard(search_params);

View File

@ -1475,6 +1475,7 @@ Option<bool> CollectionManager::do_search(std::map<std::string, std::string>& re
const char *VOICE_QUERY = "voice_query";
const char *ENABLE_TYPOS_FOR_NUMERICAL_TOKENS = "enable_typos_for_numerical_tokens";
const char *ENABLE_LAZY_FILTER = "enable_lazy_filter";
// enrich params with values from embedded params
for(auto& item: embedded_params.items()) {
@ -1595,6 +1596,7 @@ Option<bool> CollectionManager::do_search(std::map<std::string, std::string>& re
bool enable_highlight_v1 = true;
text_match_type_t match_type = max_score;
bool enable_typos_for_numerical_tokens = true;
bool enable_lazy_filter = true;
size_t remote_embedding_timeout_ms = 5000;
size_t remote_embedding_num_tries = 2;
@ -1667,6 +1669,7 @@ Option<bool> CollectionManager::do_search(std::map<std::string, std::string>& re
{PRIORITIZE_NUM_MATCHING_FIELDS, &prioritize_num_matching_fields},
{GROUP_MISSING_VALUES, &group_missing_values},
{ENABLE_TYPOS_FOR_NUMERICAL_TOKENS, &enable_typos_for_numerical_tokens},
{ENABLE_LAZY_FILTER, &enable_lazy_filter},
};
std::unordered_map<std::string, std::vector<std::string>*> str_list_values = {
@ -1881,7 +1884,8 @@ Option<bool> CollectionManager::do_search(std::map<std::string, std::string>& re
conversation_id,
override_tags,
voice_query,
enable_typos_for_numerical_tokens);
enable_typos_for_numerical_tokens,
enable_lazy_filter);
uint64_t timeMillis = std::chrono::duration_cast<std::chrono::milliseconds>(
std::chrono::high_resolution_clock::now() - begin).count();

View File

@ -2252,7 +2252,8 @@ Option<bool> Index::run_search(search_args* search_params, const std::string& co
collection_name,
search_params->drop_tokens_mode,
facet_index_type,
enable_typos_for_numerical_tokens
enable_typos_for_numerical_tokens,
search_params->enable_lazy_filter
);
}
@ -2740,7 +2741,8 @@ Option<bool> Index::search(std::vector<query_tokens_t>& field_query_tokens, cons
const std::string& collection_name,
const drop_tokens_param_t drop_tokens_mode,
facet_index_type_t facet_index_type,
bool enable_typos_for_numerical_tokens) const {
bool enable_typos_for_numerical_tokens,
bool enable_lazy_filter) const {
std::shared_lock lock(mutex);
auto filter_result_iterator = new filter_result_iterator_t(collection_name, this, filter_tree_root,
@ -2763,7 +2765,7 @@ Option<bool> Index::search(std::vector<query_tokens_t>& field_query_tokens, cons
}
#else
if (filter_result_iterator->approx_filter_ids_length < 25'000) {
if (!enable_lazy_filter || filter_result_iterator->approx_filter_ids_length < 25'000) {
filter_result_iterator->compute_iterators();
}
#endif