#pragma once #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "string_utils.h" #include "num_tree.h" #include "magic_enum.hpp" #include "match_score.h" #include "posting_list.h" #include "threadpool.h" #include "adi_tree.h" #include "tsl/htrie_set.h" #include #include #include "id_list.h" #include "synonym_index.h" #include "override.h" #include "vector_query_ops.h" #include "hnswlib/hnswlib.h" #include "filter.h" #include "facet_index.h" #include "numeric_range_trie.h" #include "geopolygon_index.h" static constexpr size_t ARRAY_FACET_DIM = 4; using facet_map_t = spp::sparse_hash_map; using single_val_facet_map_t = spp::sparse_hash_map; using array_mapped_facet_t = std::array; using array_mapped_single_val_facet_t = std::array; static constexpr size_t ARRAY_INFIX_DIM = 4; using array_mapped_infix_t = std::vector*>; struct token_t { size_t position; std::string value; bool is_prefix_searched; uint32_t root_len; // if prefix searched, length of the root token uint32_t num_typos; token_t() {}; token_t(size_t position, const std::string& value, bool is_prefix_searched, uint32_t root_len, uint32_t num_typos): position(position), value(value), is_prefix_searched(is_prefix_searched), root_len(root_len), num_typos(num_typos) { } }; // FIXME: deprecated struct token_candidates { token_t token; size_t cost; bool prefix_search; std::vector candidates; }; struct tok_candidates { token_t token; size_t cost; bool prefix_search; std::vector candidates; }; struct query_tokens_t { std::vector q_include_tokens; std::vector q_unstemmed_tokens; std::vector> q_exclude_tokens; std::vector> q_phrases; std::vector> q_synonyms; }; enum enable_t { always, fallback, off }; struct search_field_t { std::string name; std::string str_name; // for lookup of non-string fields in art index size_t weight; size_t num_typos; bool prefix; enable_t infix; search_field_t(const std::string& name, const std::string& str_name, size_t weight, size_t num_typos, bool prefix, enable_t infix): name(name), str_name(str_name), weight(weight), num_typos(num_typos), prefix(prefix), infix(infix) { } }; enum text_match_type_t { max_score, sum_score, max_weight }; enum drop_tokens_mode_t { left_to_right, right_to_left, both_sides, }; struct drop_tokens_param_t { drop_tokens_mode_t mode = right_to_left; size_t token_limit = 1000; drop_tokens_param_t() { } drop_tokens_param_t(drop_tokens_mode_t mode, size_t token_limit) : mode(mode), token_limit(token_limit) {} }; enum facet_index_type_t { exhaustive, top_values, automatic, }; struct search_args { std::vector field_query_tokens; std::vector search_fields; const text_match_type_t match_type; std::vector& facets; std::vector>& included_ids; std::vector excluded_ids; std::vector sort_fields_std; facet_query_t facet_query; std::vector num_typos; size_t max_facet_values; size_t fetch_size; size_t per_page; size_t offset; token_ordering token_order; std::vector prefixes; size_t drop_tokens_threshold; size_t typo_tokens_threshold; std::vector group_by_fields; size_t group_limit; bool group_missing_values; std::string default_sorting_field; bool prioritize_exact_match; bool prioritize_token_position; bool prioritize_num_matching_fields; size_t all_result_ids_len; bool exhaustive_search; size_t concurrency; size_t search_cutoff_ms; size_t min_len_1typo; size_t min_len_2typo; size_t max_candidates; std::vector infixes; const size_t max_extra_prefix; const size_t max_extra_suffix; const size_t facet_query_num_typos; const bool filter_curated_hits; const enable_t split_join_tokens; tsl::htrie_map qtoken_set; spp::sparse_hash_map groups_processed; std::vector> searched_queries; Topster* topster = nullptr; Topster* curated_topster = nullptr; std::vector> raw_result_kvs; std::vector> override_result_kvs; vector_query_t& vector_query; size_t facet_sample_percent; size_t facet_sample_threshold; drop_tokens_param_t drop_tokens_mode; std::unique_ptr filter_tree_root_guard; bool enable_lazy_filter; size_t max_filter_by_candidates; std::vector facet_index_types; bool enable_typos_for_numerical_tokens; bool enable_synonyms; bool synonym_prefix; uint32_t synonym_num_typos; bool enable_typos_for_alpha_numerical_tokens; bool rerank_hybrid_matches; bool validate_field_names; search_args(std::vector field_query_tokens, std::vector search_fields, const text_match_type_t match_type, std::vector& facets, std::vector>& included_ids, std::vector excluded_ids, std::vector& sort_fields_std, facet_query_t facet_query, const std::vector& num_typos, size_t max_facet_values, size_t fetch_size, size_t per_page, size_t offset, token_ordering token_order, const std::vector& prefixes, size_t drop_tokens_threshold, size_t typo_tokens_threshold, const std::vector& group_by_fields, size_t group_limit, const bool group_missing_values, const string& default_sorting_field, bool prioritize_exact_match, const bool prioritize_token_position, const bool prioritize_num_matching_fields, bool exhaustive_search, size_t concurrency, size_t search_cutoff_ms, size_t min_len_1typo, size_t min_len_2typo, size_t max_candidates, const std::vector& infixes, const size_t max_extra_prefix, const size_t max_extra_suffix, const size_t facet_query_num_typos, const bool filter_curated_hits, const enable_t split_join_tokens, vector_query_t& vector_query, size_t facet_sample_percent, size_t facet_sample_threshold, drop_tokens_param_t drop_tokens_mode, std::unique_ptr&& filter_tree_root_guard, bool enable_lazy_filter, const size_t max_filter_by_candidates, std::vector& facet_index_types, bool enable_typos_for_numerical_tokens, bool enable_synonyms, bool synonym_prefix, uint32_t synonym_num_typos, bool enable_typos_for_alpha_numerical_tokens, bool rerank_hybrid_matches, const bool& validate_field_names) : field_query_tokens(field_query_tokens), search_fields(search_fields), match_type(match_type), facets(facets), included_ids(included_ids), excluded_ids(excluded_ids), sort_fields_std(std::move(sort_fields_std)), facet_query(facet_query), num_typos(num_typos), max_facet_values(max_facet_values), fetch_size(fetch_size), per_page(per_page), offset(offset), token_order(token_order), prefixes(prefixes), drop_tokens_threshold(drop_tokens_threshold), typo_tokens_threshold(typo_tokens_threshold), group_by_fields(group_by_fields), group_limit(group_limit), group_missing_values(group_missing_values), default_sorting_field(default_sorting_field), prioritize_exact_match(prioritize_exact_match), prioritize_token_position(prioritize_token_position), prioritize_num_matching_fields(prioritize_num_matching_fields), all_result_ids_len(0), exhaustive_search(exhaustive_search), concurrency(concurrency), search_cutoff_ms(search_cutoff_ms), min_len_1typo(min_len_1typo), min_len_2typo(min_len_2typo), max_candidates(max_candidates), infixes(infixes), max_extra_prefix(max_extra_prefix), max_extra_suffix(max_extra_suffix), facet_query_num_typos(facet_query_num_typos), filter_curated_hits(filter_curated_hits), split_join_tokens(split_join_tokens), vector_query(vector_query), facet_sample_percent(facet_sample_percent), facet_sample_threshold(facet_sample_threshold), drop_tokens_mode(drop_tokens_mode), filter_tree_root_guard(std::move(filter_tree_root_guard)), enable_lazy_filter(enable_lazy_filter), max_filter_by_candidates(max_filter_by_candidates), facet_index_types(std::move(facet_index_types)), enable_typos_for_numerical_tokens(enable_typos_for_numerical_tokens), enable_synonyms(enable_synonyms), synonym_prefix(synonym_prefix), synonym_num_typos(synonym_num_typos), enable_typos_for_alpha_numerical_tokens(enable_typos_for_alpha_numerical_tokens), rerank_hybrid_matches(rerank_hybrid_matches), validate_field_names(validate_field_names) { } ~search_args() { delete topster; delete curated_topster; for (auto& sort_by_clause: sort_fields_std) { for (auto& eval_ids: sort_by_clause.eval.eval_ids_vec) { delete [] eval_ids; } for (uint32_t i = 0; i < sort_by_clause.eval_expressions.size(); i++) { delete sort_by_clause.eval.filter_trees[i]; } delete [] sort_by_clause.eval.filter_trees; } }; }; struct offsets_facet_hashes_t { // token to offsets std::unordered_map> offsets; }; struct index_record { size_t position; // position of record in the original request uint32_t seq_id; nlohmann::json doc; // actual document sent in request (could be partial) nlohmann::json old_doc; // previously stored *full* document from disk nlohmann::json new_doc; // new *full* document to be stored into disk nlohmann::json del_doc; // document containing the fields that should be deleted nlohmann::json embedding_res; // embedding result int embedding_status_code; // embedding status code index_operation_t operation; bool is_update; // pre-processed data primed for indexing std::unordered_map field_index; int64_t points; Option indexed; // indicates if the indexing operation was a success DIRTY_VALUES dirty_values; index_record(size_t record_pos, uint32_t seq_id, const nlohmann::json& doc, index_operation_t operation, const DIRTY_VALUES& dirty_values): position(record_pos), seq_id(seq_id), doc(doc), operation(operation), is_update(false), indexed(false), dirty_values(dirty_values) { } index_record(index_record&& rhs) = default; index_record& operator=(index_record&& mE) = default; void index_failure(const uint32_t err_code, const std::string & err_msg) { indexed = Option(err_code, err_msg); } void index_success() { indexed = Option(true); } }; class VectorFilterFunctor: public hnswlib::BaseFilterFunctor { filter_result_iterator_t* const filter_result_iterator; const uint32_t* excluded_ids = nullptr; const uint32_t excluded_ids_length = 0; public: explicit VectorFilterFunctor(filter_result_iterator_t* const filter_result_iterator, const uint32_t* excluded_ids = nullptr, const uint32_t excluded_ids_length = 0) : filter_result_iterator(filter_result_iterator), excluded_ids(excluded_ids), excluded_ids_length(excluded_ids_length) {} bool operator()(hnswlib::labeltype id) override { if (filter_result_iterator->approx_filter_ids_length == 0 && excluded_ids_length == 0) { return true; } if(excluded_ids_length > 0 && excluded_ids && std::binary_search(excluded_ids, excluded_ids + excluded_ids_length, id)) { return false; } if(filter_result_iterator->approx_filter_ids_length == 0) { return true; } filter_result_iterator->reset(); return filter_result_iterator->is_valid(id) == 1; } }; struct hnsw_index_t { hnswlib::InnerProductSpace* space; hnswlib::HierarchicalNSW* vecdex; size_t num_dim; vector_distance_type_t distance_type; // ensures that this index is not dropped when it's being repaired std::mutex repair_m; hnsw_index_t(size_t num_dim, size_t init_size, vector_distance_type_t distance_type, size_t M = 16, size_t ef_construction = 200) : space(new hnswlib::InnerProductSpace(num_dim)), vecdex(new hnswlib::HierarchicalNSW(space, init_size, M, ef_construction, 100, true)), num_dim(num_dim), distance_type(distance_type) { } ~hnsw_index_t() { std::lock_guard lk(repair_m); delete vecdex; delete space; } // needed for cosine similarity static void normalize_vector(const std::vector& src, std::vector& norm_dest) { float norm = 0.0f; for (float i : src) { norm += i * i; } norm = 1.0f / (sqrtf(norm) + 1e-30f); for (size_t i = 0; i < src.size(); i++) { norm_dest[i] = src[i] * norm; } } }; struct group_by_field_it_t { std::string field_name; posting_list_t::iterator_t it; bool is_array; }; struct Hasher32 { // Helps to spread the hash key and is used for sort index. // see: https://github.com/greg7mdp/sparsepp/issues/21#issuecomment-270816275 size_t operator()(uint32_t k) const { return (k ^ 2166136261U) * 16777619UL; } }; struct negate_left_join_t { bool is_negate_join = false; size_t excluded_ids_size = 0; std::unique_ptr excluded_ids = nullptr; negate_left_join_t() = default; }; class Index { private: mutable std::shared_mutex mutex; std::string name; const uint32_t collection_id; const Store* store; const SynonymIndex* synonym_index; ThreadPool* thread_pool; size_t num_documents; tsl::htrie_map search_schema; spp::sparse_hash_map search_index; spp::sparse_hash_map numerical_index; // reference_helper_field => (seq_id => ref_seq_ids) // Only used when the reference field is an array type otherwise sort_index is used. spp::sparse_hash_map reference_index; /// field_name => ((doc_id, object_index) => ref_doc_id) /// Used when a field inside an object array has reference. spp::sparse_hash_map, uint32_t, pair_hash>*> object_array_reference_index; spp::sparse_hash_map range_index; spp::sparse_hash_map geo_range_index; spp::sparse_hash_map field_geopolygon_index; // geo_array_field => (seq_id => values) used for exact filtering of geo array records spp::sparse_hash_map*> geo_array_index; facet_index_t* facet_index_v4 = nullptr; // sort_field => (seq_id => value) spp::sparse_hash_map*> sort_index; typedef spp::sparse_hash_map*>::iterator sort_index_iterator; // str_sort_field => adi_tree_t spp::sparse_hash_map str_sort_index; // infix field => value spp::sparse_hash_map infix_index; // vector field => vector index spp::sparse_hash_map vector_index; // this is used for wildcard queries id_list_t* seq_ids; std::vector symbols_to_index; std::vector token_separators; StringUtils string_utils; // used as sentinels static spp::sparse_hash_map text_match_sentinel_value; static spp::sparse_hash_map seq_id_sentinel_value; static spp::sparse_hash_map eval_sentinel_value; static spp::sparse_hash_map geo_sentinel_value; static spp::sparse_hash_map str_sentinel_value; static spp::sparse_hash_map vector_distance_sentinel_value; static spp::sparse_hash_map vector_query_sentinel_value; // Internal utility functions static inline uint32_t next_suggestion2(const std::vector& token_candidates_vec, long long int n, std::vector& query_suggestion, uint64& qhash); static inline uint32_t next_suggestion(const std::vector &token_candidates_vec, long long int n, std::vector& actual_query_suggestion, std::vector& query_suggestion, int syn_orig_num_tokens, uint32_t& token_bits, uint64& qhash); void log_leaves(int cost, const std::string &token, const std::vector &leaves) const; void do_facets(std::vector & facets, facet_query_t & facet_query, bool estimate_facets, size_t facet_sample_percent, const std::vector& facet_infos, size_t group_limit, const std::vector& group_by_fields, const bool group_missing_values, const uint32_t* result_ids, size_t results_size, int max_facet_count, bool is_wildcard_query, const std::vector& facet_index_types) const; bool static_filter_query_eval(const override_t* override, std::vector& tokens, std::unique_ptr& filter_tree_root, const bool& validate_field_names) const; bool resolve_override(const std::vector& rule_tokens, bool exact_rule_match, const std::vector& query_tokens, token_ordering token_order, std::set& absorbed_tokens, std::string& filter_by_clause, bool enable_typos_for_numerical_tokens, bool enable_typos_for_alpha_numerical_tokens) const; bool check_for_overrides(const token_ordering& token_order, const string& field_name, bool slide_window, bool exact_rule_match, std::vector& tokens, std::set& absorbed_tokens, std::vector& field_absorbed_tokens, bool enable_typos_for_numerical_tokens, bool enable_typos_for_alpha_numerical_tokens) const; static void aggregate_topster(Topster* agg_topster, Topster* index_topster); Option search_all_candidates(const size_t num_search_fields, const text_match_type_t match_type, const std::vector& the_fields, filter_result_iterator_t* const filter_result_iterator, const uint32_t* excluded_result_ids, size_t excluded_result_ids_size, const std::vector& sort_fields, std::vector& token_candidates_vec, std::vector>& searched_queries, tsl::htrie_map& qtoken_set, const std::vector& dropped_tokens, Topster*& topster, spp::sparse_hash_map& groups_processed, size_t& num_keyword_matches, uint32_t*& all_result_ids, size_t& all_result_ids_len, const size_t typo_tokens_threshold, const size_t group_limit, const std::vector& group_by_fields, const bool group_missing_values, const std::vector& query_tokens, const std::vector& num_typos, const std::vector& prefixes, bool prioritize_exact_match, const bool prioritize_token_position, const bool exhaustive_search, const bool prioritize_num_matching_fields, const size_t max_candidates, int syn_orig_num_tokens, const int* sort_order, std::array*, 3>& field_values, const std::vector& geopoint_indices, std::set& query_hashes, std::vector& id_buff) const; static void popular_fields_of_token(const spp::sparse_hash_map& search_index, const std::string& previous_token, const std::vector& the_fields, const size_t num_search_fields, std::vector& popular_field_ids); bool field_is_indexed(const std::string& field_name) const; static void tokenize_string(const std::string& text, const field& a_field, const std::vector& symbols_to_index, const std::vector& token_separators, std::unordered_map>& token_to_offsets); static void tokenize_string_array(const std::vector& strings, const field& a_field, const std::vector& symbols_to_index, const std::vector& token_separators, std::unordered_map>& token_to_offsets); void collate_included_ids(const std::vector& q_included_tokens, const std::map> & included_ids_map, Topster*& curated_topster, const size_t group_limit, const std::vector& group_by_fields, const bool group_missing_values, std::vector> & searched_queries) const; static void compute_facet_stats(facet &a_facet, const std::string& raw_value, const std::string & field_type, const size_t count); static void compute_facet_stats(facet &a_facet, const int64_t raw_value, const std::string & field_type); static void handle_doc_ops(const tsl::htrie_map& search_schema, nlohmann::json& update_doc, const nlohmann::json& old_doc); static void get_doc_changes(const index_operation_t op, const tsl::htrie_map& embedding_fields, nlohmann::json &update_doc, const nlohmann::json &old_doc, nlohmann::json &new_doc, nlohmann::json &del_doc); bool common_results_exist(std::vector& leaves, bool must_match_phrase) const; static void remove_facet_token(const field& search_field, spp::sparse_hash_map& search_index, const std::string& token, uint32_t seq_id); void initialize_facet_indexes(const field& facet_field); std::vector get_group_by_field_iterators(const std::vector&, bool is_reverse=false) const; static void batch_embed_fields(std::vector& documents, const tsl::htrie_map& embedding_fields, const tsl::htrie_map & search_schema, const size_t remote_embedding_batch_size = 200, const size_t remote_embedding_timeout_ms = 60000, const size_t remote_embedding_num_tries = 2); static void process_embed_results(const std::vector>& values_to_embed_text, const std::vector& embeddings_text, const std::vector>& values_to_embed_image, const std::vector& embeddings_image, const field& the_field); void update_async_references(const std::string& collection_name, const field& afield, std::vector& iter_batch, const std::set& async_referenced_ins = {}); std::string get_collection_name_with_lock() const { std::shared_lock lock(mutex); return get_collection_name(); } std::string get_collection_name() const { // Index name is: collection_name + std::to_string(0) return name.empty() ? "" : name.substr(0, name.size() - 1); } Option get_sort_index_value(const std::string& field_name, const uint32_t& seq_id) const; Option get_geo_distance(const std::string& geo_field_name, const uint32_t& seq_id, const S2LatLng& reference_lat_lng, const bool& round_distance = false) const; Option get_ref_seq_id_helper(const sort_by& sort_field, const uint32_t& seq_id, std::string& prev_coll_name, std::map const*& references, std::string& ref_coll_name) const; public: // for limiting number of results on multiple candidates / query rewrites enum {TYPO_TOKENS_THRESHOLD = 1}; // for limiting number of fields that can be searched on enum {FIELD_LIMIT_NUM = 100}; // Values 0 to 15 are allowed enum {FIELD_MAX_WEIGHT = 15}; enum {COMBINATION_MAX_LIMIT = 10000}; enum {COMBINATION_MIN_LIMIT = 10}; enum {NUM_CANDIDATES_DEFAULT_MIN = 4}; enum {NUM_CANDIDATES_DEFAULT_MAX = 10}; // If the number of results found is less than this threshold, Typesense will attempt to drop the tokens // in the query that have the least individual hits one by one until enough results are found. static const int DROP_TOKENS_THRESHOLD = 1; enum {DEFAULT_TOPSTER_SIZE = 250}; static const size_t GROUP_LIMIT_MAX = 99; /// Value used when async_reference is true and a reference doc is not found. static constexpr int64_t reference_helper_sentinel_value = UINT32_MAX; Index() = delete; Index(const std::string& name, const uint32_t collection_id, const Store* store, SynonymIndex* synonym_index, ThreadPool* thread_pool, const tsl::htrie_map& search_schema, const std::vector& symbols_to_index, const std::vector& token_separators); ~Index(); static void concat_topster_ids(Topster*& topster, spp::sparse_hash_map>& topster_ids); static int64_t score_results2(const std::vector & sort_fields, const uint16_t & query_index, const size_t field_id, const bool field_is_array, const uint32_t total_cost, int64_t& match_score, const uint32_t seq_id, const int sort_order[3], const bool prioritize_exact_match, const bool single_exact_query_token, const bool prioritize_token_position, size_t num_query_tokens, int syn_orig_num_tokens, const std::vector& posting_lists); static int64_t get_points_from_doc(const nlohmann::json &document, const std::string & default_sorting_field); const spp::sparse_hash_map& _get_search_index() const; const spp::sparse_hash_map& _get_numerical_index() const; const spp::sparse_hash_map& _get_range_index() const; const spp::sparse_hash_map& _get_infix_index() const; const spp::sparse_hash_map& _get_vector_index() const; facet_index_t* _get_facet_index() const; static int get_bounded_typo_cost(const size_t max_cost, const std::string& token, const size_t token_len, size_t min_len_1typo, size_t min_len_2typo, bool enable_typos_for_numerical_tokens=true, bool enable_typos_for_alpha_numerical_tokens = true); static int64_t float_to_int64_t(float n); static float int64_t_to_float(int64_t n); void get_distinct_id(posting_list_t::iterator_t& facet_index_it, const uint32_t seq_id, const bool is_array, const bool group_missing_values, uint64_t& distinct_id, bool is_reverse=false) const; static void compute_token_offsets_facets(index_record& record, const tsl::htrie_map& search_schema, const std::vector& local_token_separators, const std::vector& local_symbols_to_index); static void tokenize_string_field(const nlohmann::json& document, const field& search_field, std::vector& tokens, const std::string& locale, const std::vector& symbols_to_index, const std::vector& token_separators); std::string get_facet_str_val(const std::string& field_name, uint32_t facet_id); // Public operations Option run_search(search_args* search_params); Option search(std::vector& field_query_tokens, const std::vector& the_fields, const text_match_type_t match_type, std::unique_ptr& filter_tree_root, std::vector& facets, facet_query_t facet_query, const int max_facet_values, const std::vector>& included_ids, const std::vector& excluded_ids, std::vector& sort_fields_std, const std::vector& num_typos, Topster*& topster, Topster*& curated_topster, const size_t fetch_size, const size_t per_page, const size_t offset, const token_ordering token_order, const std::vector& prefixes, const size_t drop_tokens_threshold, size_t& all_result_ids_len, spp::sparse_hash_map& groups_processed, std::vector>& searched_queries, tsl::htrie_map& qtoken_set, std::vector>& raw_result_kvs, std::vector>& override_result_kvs, const size_t typo_tokens_threshold, const size_t group_limit, const std::vector& group_by_fields, const bool group_missing_values, const string& default_sorting_field, bool prioritize_exact_match, const bool prioritize_token_position, const bool prioritize_num_matching_fields, bool exhaustive_search, size_t concurrency, size_t search_cutoff_ms, size_t min_len_1typo, size_t min_len_2typo, size_t max_candidates, const std::vector& infixes, const size_t max_extra_prefix, const size_t max_extra_suffix, const size_t facet_query_num_typos, const bool filter_curated_hits, enable_t split_join_tokens, const vector_query_t& vector_query, size_t facet_sample_percent, size_t facet_sample_threshold, const drop_tokens_param_t drop_tokens_mode, const std::vector& facet_index_types, bool enable_typos_for_numerical_tokens = true, bool enable_synonyms = true, bool synonym_prefix = false, uint32_t synonym_num_typos = 0, bool enable_lazy_filter = false, bool enable_typos_for_alpha_numerical_tokens = true, const size_t& max_filter_by_candidates = DEFAULT_FILTER_BY_CANDIDATES, bool rerank_hybrid_matches = false, const bool& validate_field_names = true) const; void remove_field(uint32_t seq_id, nlohmann::json& document, const std::string& field_name, const bool is_update); Option remove(const uint32_t seq_id, nlohmann::json & document, const std::vector& del_fields, const bool is_update); static void validate_and_preprocess(Index *index, std::vector& iter_batch, const size_t batch_start_index, const size_t batch_size, const std::string & default_sorting_field, const tsl::htrie_map & search_schema, const tsl::htrie_map & embedding_fields, const std::string& fallback_field_type, const std::vector& token_separators, const std::vector& symbols_to_index, const bool do_validation, const size_t remote_embedding_batch_size = 200, const size_t remote_embedding_timeout_ms = 60000, const size_t remote_embedding_num_tries = 2, const bool generate_embeddings = true); static size_t batch_memory_index(Index *index, std::vector& iter_batch, const std::string& default_sorting_field, const tsl::htrie_map& search_schema, const tsl::htrie_map & embedding_fields, const std::string& fallback_field_type, const std::vector& token_separators, const std::vector& symbols_to_index, const bool do_validation, const size_t remote_embedding_batch_size = 200, const size_t remote_embedding_timeout_ms = 60000, const size_t remote_embedding_num_tries = 2, const bool generate_embeddings = true, const bool use_addition_fields = false, const tsl::htrie_map& addition_fields = tsl::htrie_map(), const std::string& collection_name = "", const spp::sparse_hash_map>& async_referenced_ins = spp::sparse_hash_map>()); void index_field_in_memory(const std::string& collection_name, const field& afield, std::vector& iter_batch, const std::set& async_referenced_ins = {}); template void iterate_and_index_numerical_field(std::vector& iter_batch, const field& afield, T func); //static bool is_point_in_polygon(const Geofence& poly, const GeoCoord& point); //static double transform_for_180th_meridian(Geofence& poly); //static void transform_for_180th_meridian(GeoCoord& point, double offset); art_leaf* get_token_leaf(const std::string & field_name, const unsigned char* token, uint32_t token_len); Option do_filtering_with_lock(filter_node_t* const filter_tree_root, filter_result_t& filter_result, const std::string& collection_name = "", const bool& should_timeout = true, const bool& validate_field_names = true) const; Option do_reference_filtering_with_lock(filter_node_t* const filter_tree_root, filter_result_t& filter_result, const std::string& ref_collection_name, const std::string& field_name, negate_left_join_t& negate_left_join_info, const bool& validate_field_names = true) const; Option do_filtering_with_reference_ids(const std::string& field_name, const std::string& ref_collection_name, filter_result_t&& ref_filter_result) const; void refresh_schemas(const std::vector& new_fields, const std::vector& del_fields); // the following methods are not synchronized because their parent calls are synchronized or they are const/static Option search_wildcard(filter_node_t const* const& filter_tree_root, const std::vector& sort_fields, Topster*& topster, spp::sparse_hash_map& groups_processed, std::vector>& searched_queries, const size_t group_limit, const std::vector& group_by_fields, const bool group_missing_values, const uint32_t* exclude_token_ids, size_t exclude_token_ids_size, uint32_t*& all_result_ids, size_t& all_result_ids_len, filter_result_iterator_t* const filter_result_iterator, const size_t concurrency, const int* sort_order, std::array*, 3>& field_values, const std::vector& geopoint_indices) const; Option search_infix(const std::string& query, const std::string& field_name, std::vector& ids, size_t max_extra_prefix, size_t max_extra_suffix) const; void curate_filtered_ids(const uint32_t* exclude_token_ids, size_t exclude_token_ids_size, uint32_t*& filter_ids, uint32_t& filter_ids_length, const std::vector& curated_ids_sorted) const; Option populate_sort_mapping(int* sort_order, std::vector& geopoint_indices, std::vector& sort_fields_std, std::array*, 3>& field_values, const bool& validate_field_names) const; Option populate_sort_mapping_with_lock(int* sort_order, std::vector& geopoint_indices, std::vector& sort_fields_std, std::array*, 3>& field_values, const bool& validate_field_names) const; int64_t reference_string_sort_score(const std::string& field_name, const uint32_t& seq_id) const; static void remove_matched_tokens(std::vector& tokens, const std::set& rule_token_set) ; void compute_facet_infos(const std::vector& facets, facet_query_t& facet_query, const uint32_t facet_query_num_typos, uint32_t* all_result_ids, const size_t& all_result_ids_len, const std::vector& group_by_fields, size_t group_limit, bool is_wildcard_no_filter_query, size_t max_candidates, std::vector& facet_infos, const std::vector& facet_index_types ) const; void resolve_space_as_typos(std::vector& qtokens, const std::string& field_name, std::vector>& resolved_queries) const; size_t num_seq_ids() const; void handle_exclusion(const size_t num_search_fields, std::vector& field_query_tokens, const std::vector& search_fields, uint32_t*& exclude_token_ids, size_t& exclude_token_ids_size) const; Option do_infix_search(const size_t num_search_fields, const std::vector& the_fields, const std::vector& infixes, const std::vector& sort_fields, std::vector>& searched_queries, const size_t group_limit, const std::vector& group_by_fields, const bool group_missing_values, const size_t max_extra_prefix, const size_t max_extra_suffix, const std::vector& query_tokens, Topster* actual_topster, filter_result_iterator_t* const filter_result_iterator, const int sort_order[3], std::array*, 3> field_values, const std::vector& geopoint_indices, const std::vector& curated_ids_sorted, uint32_t*& all_result_ids, size_t& all_result_ids_len, spp::sparse_hash_map& groups_processed, const std::string& collection_name = "") const; [[nodiscard]] Option do_synonym_search(const std::vector& the_fields, const text_match_type_t match_type, filter_node_t const* const& filter_tree_root, const std::vector& sort_fields_std, const token_ordering& token_order, const size_t typo_tokens_threshold, const size_t group_limit, const std::vector& group_by_fields, const bool group_missing_values, bool prioritize_exact_match, const bool prioritize_token_position, const bool prioritize_num_matching_fields, const bool exhaustive_search, const size_t concurrency, const std::vector& prefixes, size_t min_len_1typo, size_t min_len_2typo, const size_t max_candidates, const std::set& curated_ids, const std::vector& curated_ids_sorted, const uint32_t* exclude_token_ids, size_t exclude_token_ids_size, Topster* actual_topster, std::vector>& q_pos_synonyms, int syn_orig_num_tokens, spp::sparse_hash_map& groups_processed, std::vector>& searched_queries, uint32_t*& all_result_ids, size_t& all_result_ids_len, filter_result_iterator_t* const filter_result_iterator, std::set& query_hashes, const int* sort_order, std::array*, 3>& field_values, const std::vector& geopoint_indices, tsl::htrie_map& qtoken_set) const; Option do_phrase_search(const size_t num_search_fields, const std::vector& search_fields, std::vector& field_query_tokens, const std::vector& sort_fields, std::vector>& searched_queries, const size_t group_limit, const std::vector& group_by_fields, const bool group_missing_values, Topster* actual_topster, const int sort_order[3], std::array*, 3> field_values, const std::vector& geopoint_indices, filter_result_iterator_t*& filter_result_iterator, uint32_t*& all_result_ids, size_t& all_result_ids_len, spp::sparse_hash_map& groups_processed, const uint32_t* excluded_result_ids, size_t excluded_result_ids_size, bool is_wildcard_query) const; [[nodiscard]] Option fuzzy_search_fields(const std::vector& the_fields, const std::vector& query_tokens, const std::vector& dropped_tokens, const text_match_type_t match_type, const uint32_t* excluded_result_ids, size_t excluded_result_ids_size, filter_result_iterator_t* const filter_result_iterator, const std::vector& curated_ids, const std::vector& sort_fields, const std::vector& num_typos, std::vector>& searched_queries, tsl::htrie_map& qtoken_set, Topster*& topster, spp::sparse_hash_map& groups_processed, uint32_t*& all_result_ids, size_t& all_result_ids_len, const size_t group_limit, const std::vector& group_by_fields, const bool group_missing_values, bool prioritize_exact_match, const bool prioritize_token_position, const bool prioritize_num_matching_fields, std::set& query_hashes, const token_ordering token_order, const std::vector& prefixes, const size_t typo_tokens_threshold, const bool exhaustive_search, const size_t max_candidates, size_t min_len_1typo, size_t min_len_2typo, int syn_orig_num_tokens, const int* sort_order, std::array*, 3>& field_values, const std::vector& geopoint_indices, bool enable_typos_for_numerical_tokens = true, bool enable_typos_for_alpha_numerical_tokens = true) const; void find_across_fields(const token_t& previous_token, const std::string& previous_token_str, const std::vector& the_fields, const size_t num_search_fields, filter_result_iterator_t* const filter_result_iterator, const uint32_t* exclude_token_ids, size_t exclude_token_ids_size, std::vector& prev_token_doc_ids, std::vector& top_prefix_field_ids) const; Option search_across_fields(const std::vector& query_tokens, const std::vector& num_typos, const std::vector& prefixes, const std::vector& the_fields, const size_t num_search_fields, const text_match_type_t match_type, const std::vector& sort_fields, Topster*& topster, spp::sparse_hash_map& groups_processed, std::vector>& searched_queries, tsl::htrie_map& qtoken_set, const std::vector& dropped_tokens, const size_t group_limit, const std::vector& group_by_fields, const bool group_missing_values, bool prioritize_exact_match, const bool search_all_candidates, const bool prioritize_num_matching_fields, filter_result_iterator_t* const filter_result_iterator, const uint32_t total_cost, const int syn_orig_num_tokens, const uint32_t* excluded_result_ids, size_t excluded_result_ids_size, const int* sort_order, std::array*, 3>& field_values, const std::vector& geopoint_indices, std::vector& id_buff, size_t& num_keyword_matches, uint32_t*& all_result_ids, size_t& all_result_ids_len) const; static int64_t compute_aggregated_score(const std::vector& its, std::vector& dropped_token_its, const std::vector& the_fields, const std::vector& query_tokens, const size_t num_search_fields, const text_match_type_t match_type, const bool prioritize_exact_match, const bool psearch_acrossrioritize_token_position, const bool prioritize_num_matching_fields, const uint32_t total_cost, const int syn_orig_num_tokens, const uint32_t seq_id, const std::vector& sort_fields, const tsl::htrie_map& search_schema, const std::vector>& searched_queries, const int* sort_order, int64_t& out_best_field_match_score); void process_filter_overrides(const std::vector& filter_overrides, std::vector& query_tokens, token_ordering token_order, std::unique_ptr& filter_tree_root, std::vector& matched_dynamic_overrides, nlohmann::json& override_metadata, bool enable_typos_for_numerical_tokens, bool enable_typos_for_alpha_numerical_tokens, const bool& validate_field_names = true) const; Option compute_sort_scores(const std::vector& sort_fields, const int* sort_order, std::array*, 3> field_values, const std::vector& geopoint_indices, uint32_t seq_id, const std::map, reference_filter_result_t>& references, std::vector& filter_indexes, int64_t max_field_match_score, int64_t* scores, int64_t& match_score_index, bool& should_skip, float vector_distance = 0) const; void process_curated_ids(const std::vector>& included_ids, const std::vector& excluded_ids, const std::vector& group_by_fields, const size_t group_limit, const bool group_missing_values, const bool filter_curated_hits, filter_result_iterator_t* const filter_result_iterator, std::set& curated_ids, std::map>& included_ids_map, std::vector& included_ids_vec) const; int64_t get_doc_val_from_sort_index(sort_index_iterator it, uint32_t doc_seq_id) const; Option seq_ids_outside_top_k(const std::string& field_name, size_t k, std::vector& outside_seq_ids); Option get_related_ids(const std::string& field_name, const uint32_t& seq_id, std::vector& result) const; Option get_object_array_related_id(const std::string& collection_name, const std::string& reference_helper_field_name, const uint32_t& seq_id, const uint32_t& object_index, uint32_t& result) const; Option get_sort_index_value_with_lock(const std::string& field_name, const uint32_t& seq_id) const; friend class filter_result_iterator_t; void repair_hnsw_index(); void aggregate_facet(const size_t group_limit, facet& this_facet, facet& acc_facet) const; Option get_geo_distance_with_lock(const std::string& geo_field_name, const uint32_t& seq_id, const S2LatLng& reference_lat_lng, const bool& round_distance = false) const; Option get_referenced_geo_distance(const sort_by& sort_field, uint32_t seq_id, const std::map, reference_filter_result_t>& references, const S2LatLng& reference_lat_lng, const bool& round_distance = false) const; Option get_ref_seq_id(const sort_by& sort_field, const uint32_t& seq_id, const std::map& references, std::string& ref_collection_name) const; void get_top_k_result_ids(const std::vector>& raw_result_kvs, std::vector& result_ids) const; void compute_aux_scores(Topster* topster, const std::vector& the_fields, const std::vector& query_tokens, uint16_t search_query_size, const std::vector& sort_fields_std, const int* sort_order, const vector_query_t& vector_query, const text_match_type_t match_type, bool prioritize_exact_match, const bool prioritize_token_position, const bool prioritize_num_matching_fields) const; float compute_decay_function_score(const sort_by& sort_field, uint32_t seq_id) const; void get_field_token_its(const size_t num_search_fields, std::vector& query_suggestion, std::vector& token_its, std::vector& expanded_plists, const std::vector& query_tokens, const std::vector& the_fields) const; static void populate_result_kvs(Topster* topster, std::vector> &result_kvs, const spp::sparse_hash_map& groups_processed, const std::vector& sort_by_fields); GeoPolygonIndex* get_geopolygon_index(const std::string& field_name) const; }; template void Index::iterate_and_index_numerical_field(std::vector& iter_batch, const field& afield, T func) { for(auto& record: iter_batch) { if(!record.indexed.ok()) { continue; } const auto& document = record.doc; const auto seq_id = record.seq_id; if (document.count(afield.name) == 0 || !afield.index) { continue; } try { func(record, seq_id); } catch(const std::exception &e) { LOG(INFO) << "Error while indexing numerical field." << e.what(); record.index_failure(400, e.what()); } } }