#include "index.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "logger.h" #include "validator.h" #include #define RETURN_CIRCUIT_BREAKER if((std::chrono::duration_cast( \ std::chrono::system_clock::now().time_since_epoch()).count() - search_begin_us) > search_stop_us) { \ search_cutoff = true; \ return ;\ } #define RETURN_CIRCUIT_BREAKER_OP if((std::chrono::duration_cast( \ std::chrono::system_clock::now().time_since_epoch()).count() - search_begin_us) > search_stop_us) { \ search_cutoff = true; \ return Option(true);\ } #define BREAK_CIRCUIT_BREAKER if((std::chrono::duration_cast( \ std::chrono::system_clock::now().time_since_epoch()).count() - search_begin_us) > search_stop_us) { \ search_cutoff = true; \ break;\ } #define FACET_INDEX_THRESHOLD 1000000000 spp::sparse_hash_map Index::text_match_sentinel_value; spp::sparse_hash_map Index::seq_id_sentinel_value; spp::sparse_hash_map Index::eval_sentinel_value; spp::sparse_hash_map Index::geo_sentinel_value; spp::sparse_hash_map Index::str_sentinel_value; spp::sparse_hash_map Index::vector_distance_sentinel_value; Index::Index(const std::string& name, const uint32_t collection_id, const Store* store, SynonymIndex* synonym_index, ThreadPool* thread_pool, const tsl::htrie_map & search_schema, const std::vector& symbols_to_index, const std::vector& token_separators): name(name), collection_id(collection_id), store(store), synonym_index(synonym_index), thread_pool(thread_pool), search_schema(search_schema), seq_ids(new id_list_t(256)), symbols_to_index(symbols_to_index), token_separators(token_separators) { facet_index_v4 = new facet_index_t(); for(const auto& a_field: search_schema) { if(!a_field.index) { continue; } if(a_field.num_dim > 0) { auto hnsw_index = new hnsw_index_t(a_field.num_dim, 1024, a_field.vec_dist); vector_index.emplace(a_field.name, hnsw_index); 
continue; } if(a_field.is_string()) { art_tree *t = new art_tree; art_tree_init(t); search_index.emplace(a_field.name, t); } else if(a_field.is_geopoint()) { geo_range_index.emplace(a_field.name, new NumericTrie(32)); if(!a_field.is_single_geopoint()) { spp::sparse_hash_map * doc_to_geos = new spp::sparse_hash_map(); geo_array_index.emplace(a_field.name, doc_to_geos); } } else { if (a_field.range_index) { auto trie = a_field.is_bool() ? new NumericTrie(8) : a_field.is_int32() ? new NumericTrie(32) : new NumericTrie(64); range_index.emplace(a_field.name, trie); } else { num_tree_t* num_tree = new num_tree_t; numerical_index.emplace(a_field.name, num_tree); } } if(a_field.sort) { if(a_field.type == field_types::STRING) { adi_tree_t* tree = new adi_tree_t(); str_sort_index.emplace(a_field.name, tree); } else if(a_field.type != field_types::GEOPOINT_ARRAY) { spp::sparse_hash_map * doc_to_score = new spp::sparse_hash_map(); sort_index.emplace(a_field.name, doc_to_score); } } if(a_field.facet) { initialize_facet_indexes(a_field); } // initialize for non-string facet fields if(a_field.facet && !a_field.is_string()) { art_tree *ft = new art_tree; art_tree_init(ft); search_index.emplace(a_field.faceted_name(), ft); } if(a_field.infix) { array_mapped_infix_t infix_sets(ARRAY_INFIX_DIM); for(auto& infix_set: infix_sets) { infix_set = new tsl::htrie_set(); } infix_index.emplace(a_field.name, infix_sets); } } num_documents = 0; } Index::~Index() { std::unique_lock lock(mutex); for(auto & name_tree: search_index) { art_tree_destroy(name_tree.second); delete name_tree.second; name_tree.second = nullptr; } search_index.clear(); for(auto & name_index: geo_range_index) { delete name_index.second; name_index.second = nullptr; } geo_range_index.clear(); for(auto& name_index: geo_array_index) { for(auto& kv: *name_index.second) { delete [] kv.second; } delete name_index.second; name_index.second = nullptr; } geo_array_index.clear(); for(auto & name_tree: numerical_index) { delete 
name_tree.second; name_tree.second = nullptr; } numerical_index.clear(); for(auto & name_tree: range_index) { delete name_tree.second; name_tree.second = nullptr; } range_index.clear(); for(auto & name_map: sort_index) { delete name_map.second; name_map.second = nullptr; } sort_index.clear(); for(auto& kv: infix_index) { for(auto& infix_set: kv.second) { delete infix_set; infix_set = nullptr; } } infix_index.clear(); for(auto& name_tree: str_sort_index) { delete name_tree.second; name_tree.second = nullptr; } str_sort_index.clear(); delete facet_index_v4; delete seq_ids; for(auto& vec_index_kv: vector_index) { delete vec_index_kv.second; } } int64_t Index::get_points_from_doc(const nlohmann::json &document, const std::string & default_sorting_field) { int64_t points = 0; if(document[default_sorting_field].is_number_float()) { // serialize float to an integer and reverse the inverted range float n = document[default_sorting_field]; memcpy(&points, &n, sizeof(int32_t)); points ^= ((points >> (std::numeric_limits::digits - 1)) | INT32_MIN); points = -1 * (INT32_MAX - points); } else if(document[default_sorting_field].is_string()) { // not much value in supporting default sorting field as string, so we will just dummy it out points = 0; } else { points = document[default_sorting_field].is_boolean() ? 
int64_t(document[default_sorting_field].get()) : document[default_sorting_field].get(); } return points; } int64_t Index::float_to_int64_t(float f) { // https://stackoverflow.com/questions/60530255/convert-float-to-int64-t-while-preserving-ordering int32_t i; memcpy(&i, &f, sizeof i); if (i < 0) { i ^= INT32_MAX; } return i; } float Index::int64_t_to_float(int64_t n) { int32_t i = (int32_t) n; if(i < 0) { i ^= INT32_MAX; } float f; memcpy(&f, &i, sizeof f); return f; } void Index::compute_token_offsets_facets(index_record& record, const tsl::htrie_map& search_schema, const std::vector& local_token_separators, const std::vector& local_symbols_to_index) { const auto& document = record.doc; for(const auto& the_field: search_schema) { const std::string& field_name = the_field.name; if(document.count(field_name) == 0 || !the_field.index) { continue; } offsets_facet_hashes_t offset_facet_hashes; bool is_facet = search_schema.at(field_name).facet; // non-string, non-geo faceted field should be indexed as faceted string field as well if(the_field.facet && !the_field.is_string() && !the_field.is_geopoint()) { if(the_field.is_array()) { std::vector strings; if(the_field.type == field_types::INT32_ARRAY) { for(int32_t value: document[field_name]){ auto str = std::to_string(value); strings.emplace_back(std::move(str)); } } else if(the_field.type == field_types::INT64_ARRAY) { for(int64_t value: document[field_name]){ auto str = std::to_string(value); strings.emplace_back(std::move(str)); } } else if(the_field.type == field_types::FLOAT_ARRAY) { for(float value: document[field_name]){ auto str = StringUtils::float_to_str(value); strings.emplace_back(std::move(str)); } } else if(the_field.type == field_types::BOOL_ARRAY) { for(bool value: document[field_name]){ auto str = std::to_string(value); strings.emplace_back(std::move(str)); } } tokenize_string_array(strings, the_field, local_symbols_to_index, local_token_separators, offset_facet_hashes.offsets); } else { std::string 
text; if(the_field.type == field_types::INT32) { auto val = document[field_name].get(); text = std::to_string(val); } else if(the_field.type == field_types::INT64) { auto val = document[field_name].get(); text = std::to_string(val); } else if(the_field.type == field_types::FLOAT) { auto val = document[field_name].get(); text = StringUtils::float_to_str(val); } else if(the_field.type == field_types::BOOL) { auto val = document[field_name].get(); text = std::to_string(val); } tokenize_string(text, the_field, local_symbols_to_index, local_token_separators, offset_facet_hashes.offsets); } } if(the_field.is_string()) { if(the_field.type == field_types::STRING) { tokenize_string(document[field_name], the_field, local_symbols_to_index, local_token_separators, offset_facet_hashes.offsets); } else { tokenize_string_array(document[field_name], the_field, local_symbols_to_index, local_token_separators, offset_facet_hashes.offsets); } } if(!offset_facet_hashes.offsets.empty()) { record.field_index.emplace(field_name, std::move(offset_facet_hashes)); } } } bool doc_contains_field(const nlohmann::json& doc, const field& a_field, const tsl::htrie_map & search_schema) { if(doc.count(a_field.name)) { return true; } // check for a nested field, e.g. 
`foo.bar.baz` indexed but `foo.bar` present in schema if(a_field.is_object()) { auto prefix_it = search_schema.equal_prefix_range(a_field.name); std::string nested_field_name; for(auto kv = prefix_it.first; kv != prefix_it.second; kv++) { kv.key(nested_field_name); bool is_child_field = (nested_field_name.size() > a_field.name.size() && nested_field_name[a_field.name.size()] == '.'); if(is_child_field && doc.count(nested_field_name) != 0) { return true; } } } return false; } bool validate_object_field(nlohmann::json& doc, const field& a_field) { auto field_it = doc.find(a_field.name); if(field_it != doc.end()) { if(a_field.type == field_types::OBJECT && doc[a_field.name].is_object()) { return true; } else if(a_field.type == field_types::OBJECT_ARRAY && doc[a_field.name].is_array()) { return true; } return false; } std::vector field_parts; StringUtils::split(a_field.name, field_parts, "."); nlohmann::json* obj = &doc; bool has_array = false; for(auto& field_part: field_parts) { if(obj->is_array()) { has_array = true; if(obj->empty()) { return false; } obj = &obj->at(0); if(!obj->is_object()) { return false; } } auto obj_it = obj->find(field_part); if(obj_it == obj->end()) { return false; } obj = &obj_it.value(); } LOG(INFO) << "obj: " << *obj; LOG(INFO) << "doc: " << doc; if(a_field.type == field_types::OBJECT && obj->is_object()) { return true; } else if(a_field.type == field_types::OBJECT_ARRAY && (obj->is_array() || (has_array && obj->is_object()))) { return true; } return false; } void Index::validate_and_preprocess(Index *index, std::vector& iter_batch, const size_t batch_start_index, const size_t batch_size, const std::string& default_sorting_field, const tsl::htrie_map& search_schema, const tsl::htrie_map& embedding_fields, const std::string& fallback_field_type, const std::vector& token_separators, const std::vector& symbols_to_index, const bool do_validation, const size_t remote_embedding_batch_size, const bool generate_embeddings) { // runs in a 
partitioned thread std::vector records_to_embed; for(size_t i = 0; i < batch_size; i++) { index_record& index_rec = iter_batch[batch_start_index + i]; try { if(!index_rec.indexed.ok()) { // some records could have been invalidated upstream continue; } if(index_rec.operation == DELETE) { continue; } handle_doc_ops(search_schema, index_rec.doc, index_rec.old_doc); if(do_validation) { Option validation_op = validator_t::validate_index_in_memory(index_rec.doc, index_rec.seq_id, default_sorting_field, search_schema, embedding_fields, index_rec.operation, index_rec.is_update, fallback_field_type, index_rec.dirty_values, generate_embeddings); if(!validation_op.ok()) { index_rec.index_failure(validation_op.code(), validation_op.error()); continue; } } if(index_rec.is_update) { // scrub string fields to reduce delete ops get_doc_changes(index_rec.operation, embedding_fields, index_rec.doc, index_rec.old_doc, index_rec.new_doc, index_rec.del_doc); if(generate_embeddings) { for(auto& field: index_rec.doc.items()) { for(auto& embedding_field : embedding_fields) { if(!embedding_field.embed[fields::from].is_null()) { auto embed_from_vector = embedding_field.embed[fields::from].get>(); for(auto& embed_from: embed_from_vector) { if(embed_from == field.key()) { records_to_embed.push_back(&index_rec); break; } } } } } } } else { if(generate_embeddings) { records_to_embed.push_back(&index_rec); } } compute_token_offsets_facets(index_rec, search_schema, token_separators, symbols_to_index); int64_t points = 0; if(index_rec.doc.count(default_sorting_field) == 0) { auto default_sorting_field_it = index->sort_index.find(default_sorting_field); if(default_sorting_field_it != index->sort_index.end()) { auto seq_id_it = default_sorting_field_it->second->find(index_rec.seq_id); if(seq_id_it != default_sorting_field_it->second->end()) { points = seq_id_it->second; } else { points = INT64_MIN; } } else { points = INT64_MIN; } } else { points = get_points_from_doc(index_rec.doc, 
default_sorting_field); } index_rec.points = points; index_rec.index_success(); } catch(const std::exception &e) { LOG(INFO) << "Error while validating document: " << e.what(); index_rec.index_failure(400, e.what()); } } if(generate_embeddings) { batch_embed_fields(records_to_embed, embedding_fields, search_schema, remote_embedding_batch_size); } } size_t Index::batch_memory_index(Index *index, std::vector& iter_batch, const std::string & default_sorting_field, const tsl::htrie_map & actual_search_schema, const tsl::htrie_map & embedding_fields, const std::string& fallback_field_type, const std::vector& token_separators, const std::vector& symbols_to_index, const bool do_validation, const size_t remote_embedding_batch_size, const bool generate_embeddings, const bool use_addition_fields, const tsl::htrie_map& addition_fields) { const size_t concurrency = 4; const size_t num_threads = std::min(concurrency, iter_batch.size()); const size_t window_size = (num_threads == 0) ? 0 : (iter_batch.size() + num_threads - 1) / num_threads; // rounds up const auto& indexable_schema = use_addition_fields ? 
addition_fields : actual_search_schema; size_t num_indexed = 0; size_t num_processed = 0; std::mutex m_process; std::condition_variable cv_process; size_t num_queued = 0; size_t batch_index = 0; // local is need to propogate the thread local inside threads launched below auto local_write_log_index = write_log_index; for(size_t thread_id = 0; thread_id < num_threads && batch_index < iter_batch.size(); thread_id++) { size_t batch_len = window_size; if(batch_index + window_size > iter_batch.size()) { batch_len = iter_batch.size() - batch_index; } num_queued++; index->thread_pool->enqueue([&, batch_index, batch_len]() { write_log_index = local_write_log_index; validate_and_preprocess(index, iter_batch, batch_index, batch_len, default_sorting_field, actual_search_schema, embedding_fields, fallback_field_type, token_separators, symbols_to_index, do_validation, remote_embedding_batch_size, generate_embeddings); std::unique_lock lock(m_process); num_processed++; cv_process.notify_one(); }); batch_index += batch_len; } { std::unique_lock lock_process(m_process); cv_process.wait(lock_process, [&](){ return num_processed == num_queued; }); } std::unordered_set found_fields; for(size_t i = 0; i < iter_batch.size(); i++) { auto& index_rec = iter_batch[i]; if(!index_rec.indexed.ok()) { // some records could have been invalidated upstream continue; } if(index_rec.is_update) { index->remove(index_rec.seq_id, index_rec.del_doc, {}, index_rec.is_update); } else if(index_rec.indexed.ok()) { num_indexed++; } for(const auto& kv: index_rec.doc.items()) { found_fields.insert(kv.key()); } } num_queued = num_processed = 0; std::unique_lock ulock(index->mutex); for(const auto& field_name: found_fields) { //LOG(INFO) << "field name: " << field_name; if(field_name != "id" && indexable_schema.count(field_name) == 0) { continue; } num_queued++; index->thread_pool->enqueue([&]() { write_log_index = local_write_log_index; const field& f = (field_name == "id") ? 
field("id", field_types::STRING, false) : indexable_schema.at(field_name); try { index->index_field_in_memory(f, iter_batch); } catch(std::exception& e) { LOG(ERROR) << "Unhandled Typesense error: " << e.what(); for(auto& record: iter_batch) { record.index_failure(500, "Unhandled Typesense error in index batch, check logs for details."); } } std::unique_lock lock(m_process); num_processed++; cv_process.notify_one(); }); } { std::unique_lock lock_process(m_process); cv_process.wait(lock_process, [&](){ return num_processed == num_queued; }); } return num_indexed; } void Index::index_field_in_memory(const field& afield, std::vector& iter_batch) { // indexes a given field of all documents in the batch if(afield.name == "id") { for(const auto& record: iter_batch) { if(!record.indexed.ok()) { // some records could have been invalidated upstream continue; } if(!record.is_update && record.indexed.ok()) { // for updates, the seq_id will already exist seq_ids->upsert(record.seq_id); } } return; } if(!afield.index) { return; } // We have to handle both these edge cases: // a) `afield` might not exist in the document (optional field) // b) `afield` value could be empty // non-geo faceted field should be indexed as faceted string field as well bool is_facet_field = (afield.facet && !afield.is_geopoint()); if(afield.is_string() || is_facet_field) { std::unordered_map> token_to_doc_offsets; int64_t max_score = INT64_MIN; std::unordered_map, facet_value_id_t::Hash> fvalue_to_seq_ids; std::unordered_map> seq_id_to_fvalues; auto facet_count = facet_index_v4->get_facet_count(afield.name); #ifdef TEST_BUILD facet_count = FACET_INDEX_THRESHOLD + 1; #endif /*if(afield.facet) { size_t total_num_docs = seq_ids->num_ids(); facet_index_v4->handle_index_change(afield.name, total_num_docs, FACET_INDEX_THRESHOLD, facet_count); }*/ for(const auto& record: iter_batch) { if(!record.indexed.ok()) { // some records could have been invalidated upstream continue; } const auto& document = record.doc; 
const auto seq_id = record.seq_id; if(document.count(afield.name) == 0 || !record.indexed.ok()) { continue; } auto field_index_it = record.field_index.find(afield.name); if(field_index_it == record.field_index.end()) { continue; } if(afield.facet) { if(afield.is_array()) { const auto& field_values = document[afield.name]; for(size_t i = 0; i < field_values.size(); i++) { if(afield.type == field_types::INT32_ARRAY) { int32_t raw_val = field_values[i].get(); auto fhash = reinterpret_cast(raw_val); facet_value_id_t facet_value_id(std::to_string(raw_val), fhash); fvalue_to_seq_ids[facet_value_id].push_back(seq_id); seq_id_to_fvalues[seq_id].push_back(facet_value_id); } else if(afield.type == field_types::INT64_ARRAY) { int64_t raw_val = field_values[i].get(); facet_value_id_t facet_value_id(std::to_string(raw_val)); fvalue_to_seq_ids[facet_value_id].push_back(seq_id); seq_id_to_fvalues[seq_id].push_back(facet_value_id); } else if(afield.type == field_types::STRING_ARRAY) { const std::string& raw_val = field_values[i].get().substr(0, 100); facet_value_id_t facet_value_id(raw_val); fvalue_to_seq_ids[facet_value_id].push_back(seq_id); seq_id_to_fvalues[seq_id].push_back(facet_value_id); } else if(afield.type == field_types::FLOAT_ARRAY) { float raw_val = field_values[i].get(); auto fhash = reinterpret_cast(raw_val); facet_value_id_t facet_value_id(StringUtils::float_to_str(raw_val), fhash); fvalue_to_seq_ids[facet_value_id].push_back(seq_id); seq_id_to_fvalues[seq_id].push_back(facet_value_id); } else if(afield.type == field_types::BOOL_ARRAY) { bool raw_val = field_values[i].get(); auto fhash = (uint32_t)raw_val; auto str_val = (raw_val == 1) ? 
"true" : "false"; facet_value_id_t facet_value_id(str_val, fhash); fvalue_to_seq_ids[facet_value_id].push_back(seq_id); seq_id_to_fvalues[seq_id].push_back(facet_value_id); } } } else { if(afield.type == field_types::INT32) { int32_t raw_val = document[afield.name].get(); auto fhash = reinterpret_cast(raw_val); facet_value_id_t facet_value_id(std::to_string(raw_val), fhash); fvalue_to_seq_ids[facet_value_id].push_back(seq_id); seq_id_to_fvalues[seq_id].push_back(facet_value_id); } else if(afield.type == field_types::INT64) { int64_t raw_val = document[afield.name].get(); facet_value_id_t facet_value_id(std::to_string(raw_val)); fvalue_to_seq_ids[facet_value_id].push_back(seq_id); seq_id_to_fvalues[seq_id].push_back(facet_value_id); } else if(afield.type == field_types::STRING) { const std::string& raw_val = document[afield.name].get().substr(0, 100); facet_value_id_t facet_value_id(raw_val); fvalue_to_seq_ids[facet_value_id].push_back(seq_id); seq_id_to_fvalues[seq_id].push_back(facet_value_id); } else if(afield.type == field_types::FLOAT) { float raw_val = document[afield.name].get(); auto fhash = reinterpret_cast(raw_val); facet_value_id_t facet_value_id(StringUtils::float_to_str(raw_val), fhash); fvalue_to_seq_ids[facet_value_id].push_back(seq_id); seq_id_to_fvalues[seq_id].push_back(facet_value_id); } else if(afield.type == field_types::BOOL) { bool raw_val = document[afield.name].get(); auto fhash = (uint32_t)raw_val; auto str_val = (raw_val == 1) ? 
"true" : "false"; facet_value_id_t facet_value_id(str_val, fhash); fvalue_to_seq_ids[facet_value_id].push_back(seq_id); seq_id_to_fvalues[seq_id].push_back(facet_value_id); } } } if(record.points > max_score) { max_score = record.points; } for(auto& token_offsets: field_index_it->second.offsets) { token_to_doc_offsets[token_offsets.first].emplace_back(seq_id, record.points, token_offsets.second); if(afield.infix) { auto strhash = StringUtils::hash_wy(token_offsets.first.c_str(), token_offsets.first.size()); const auto& infix_sets = infix_index.at(afield.name); infix_sets[strhash % 4]->insert(token_offsets.first); } } } facet_index_v4->insert(afield.name, fvalue_to_seq_ids, seq_id_to_fvalues, afield.is_string()); auto tree_it = search_index.find(afield.faceted_name()); if(tree_it == search_index.end()) { return; } art_tree *t = tree_it->second; for(auto& token_to_doc: token_to_doc_offsets) { const std::string& token = token_to_doc.first; std::vector& documents = token_to_doc.second; const auto *key = (const unsigned char *) token.c_str(); int key_len = (int) token.length() + 1; // for the terminating \0 char //LOG(INFO) << "key: " << key << ", art_doc.id: " << art_doc.id; art_inserts(t, key, key_len, max_score, documents); } } if(!afield.is_string()) { if (afield.type == field_types::INT32) { auto num_tree = afield.range_index ? nullptr : numerical_index.at(afield.name); auto trie = afield.range_index ? range_index.at(afield.name) : nullptr; iterate_and_index_numerical_field(iter_batch, afield, [&afield, num_tree, trie] (const index_record& record, uint32_t seq_id) { int32_t value = record.doc[afield.name].get(); if (afield.range_index) { trie->insert(value, seq_id); } else { num_tree->insert(value, seq_id); } }); } else if(afield.type == field_types::INT64) { auto num_tree = afield.range_index ? nullptr : numerical_index.at(afield.name); auto trie = afield.range_index ? 
range_index.at(afield.name) : nullptr; iterate_and_index_numerical_field(iter_batch, afield, [&afield, num_tree, trie] (const index_record& record, uint32_t seq_id) { int64_t value = record.doc[afield.name].get(); if (afield.range_index) { trie->insert(value, seq_id); } else { num_tree->insert(value, seq_id); } }); } else if(afield.type == field_types::FLOAT) { auto num_tree = afield.range_index ? nullptr : numerical_index.at(afield.name); auto trie = afield.range_index ? range_index.at(afield.name) : nullptr; iterate_and_index_numerical_field(iter_batch, afield, [&afield, num_tree, trie] (const index_record& record, uint32_t seq_id) { float fvalue = record.doc[afield.name].get(); int64_t value = float_to_int64_t(fvalue); if (afield.range_index) { trie->insert(value, seq_id); } else { num_tree->insert(value, seq_id); } }); } else if(afield.type == field_types::BOOL) { auto num_tree = afield.range_index ? nullptr : numerical_index.at(afield.name); auto trie = afield.range_index ? range_index.at(afield.name) : nullptr; iterate_and_index_numerical_field(iter_batch, afield, [&afield, num_tree, trie] (const index_record& record, uint32_t seq_id) { bool value = record.doc[afield.name].get(); if (afield.range_index) { trie->insert(value, seq_id); } else { num_tree->insert(value, seq_id); } }); } else if(afield.type == field_types::GEOPOINT || afield.type == field_types::GEOPOINT_ARRAY) { auto geopoint_range_index = geo_range_index.at(afield.name); iterate_and_index_numerical_field(iter_batch, afield, [&afield, &geo_array_index=geo_array_index, geopoint_range_index](const index_record& record, uint32_t seq_id) { // nested geopoint value inside an array of object will be a simple array so must be treated as geopoint bool nested_obj_arr_geopoint = (afield.nested && afield.type == field_types::GEOPOINT_ARRAY && !record.doc[afield.name].empty() && record.doc[afield.name][0].is_number()); if(afield.type == field_types::GEOPOINT || nested_obj_arr_geopoint) { // this could be a 
nested gepoint array so can have more than 2 array values const std::vector& latlongs = record.doc[afield.name]; for(size_t li = 0; li < latlongs.size(); li+=2) { S2RegionTermIndexer::Options options; options.set_index_contains_points_only(true); S2RegionTermIndexer indexer(options); S2Point point = S2LatLng::FromDegrees(latlongs[li], latlongs[li+1]).ToPoint(); auto cell = S2CellId(point); geopoint_range_index->insert_geopoint(cell.id(), seq_id); } if(nested_obj_arr_geopoint) { int64_t* packed_latlongs = new int64_t[(latlongs.size()/2) + 1]; packed_latlongs[0] = latlongs.size()/2; size_t j_packed_latlongs = 0; for(size_t li = 0; li < latlongs.size(); li+=2) { int64_t packed_latlong = GeoPoint::pack_lat_lng(latlongs[li], latlongs[li+1]); packed_latlongs[j_packed_latlongs + 1] = packed_latlong; j_packed_latlongs++; } geo_array_index.at(afield.name)->emplace(seq_id, packed_latlongs); } } else { const std::vector>& latlongs = record.doc[afield.name]; S2RegionTermIndexer::Options options; options.set_index_contains_points_only(true); S2RegionTermIndexer indexer(options); int64_t* packed_latlongs = new int64_t[latlongs.size() + 1]; packed_latlongs[0] = latlongs.size(); for(size_t li = 0; li < latlongs.size(); li++) { auto& latlong = latlongs[li]; S2Point point = S2LatLng::FromDegrees(latlong[0], latlong[1]).ToPoint(); auto cell = S2CellId(point); geopoint_range_index->insert_geopoint(cell.id(), seq_id); int64_t packed_latlong = GeoPoint::pack_lat_lng(latlong[0], latlong[1]); packed_latlongs[li + 1] = packed_latlong; } geo_array_index.at(afield.name)->emplace(seq_id, packed_latlongs); } }); } else if(afield.is_array()) { // handle vector index first if(afield.type == field_types::FLOAT_ARRAY && afield.num_dim > 0) { auto vec_index = vector_index[afield.name]->vecdex; size_t curr_ele_count = vec_index->getCurrentElementCount(); if(curr_ele_count + iter_batch.size() > vec_index->getMaxElements()) { vec_index->resizeIndex((curr_ele_count + iter_batch.size()) * 1.3); } const 
size_t num_threads = 1; //std::min(4, iter_batch.size()); const size_t window_size = (num_threads == 0) ? 0 : (iter_batch.size() + num_threads - 1) / num_threads; // rounds up size_t num_processed = 0; std::mutex m_process; std::condition_variable cv_process; size_t num_queued = 0; size_t result_index = 0; for(size_t thread_id = 0; thread_id < num_threads && result_index < iter_batch.size(); thread_id++) { size_t batch_len = window_size; if(result_index + window_size > iter_batch.size()) { batch_len = iter_batch.size() - result_index; } num_queued++; thread_pool->enqueue([thread_id, &afield, &vec_index, &records = iter_batch, result_index, batch_len, &num_processed, &m_process, &cv_process]() { size_t batch_counter = 0; while(batch_counter < batch_len) { auto& record = records[result_index + batch_counter]; if(record.doc.count(afield.name) == 0 || !record.indexed.ok()) { batch_counter++; continue; } try { const std::vector& float_vals = record.doc[afield.name].get>(); if(float_vals.size() != afield.num_dim) { record.index_failure(400, "Vector size mismatch."); } else { if(afield.vec_dist == cosine) { std::vector normalized_vals(afield.num_dim); hnsw_index_t::normalize_vector(float_vals, normalized_vals); vec_index->addPoint(normalized_vals.data(), (size_t)record.seq_id, true); } else { vec_index->addPoint(float_vals.data(), (size_t)record.seq_id, true); } } } catch(const std::exception &e) { record.index_failure(400, e.what()); } batch_counter++; } std::unique_lock lock(m_process); num_processed++; cv_process.notify_one(); }); result_index += batch_len; } std::unique_lock lock_process(m_process); cv_process.wait(lock_process, [&](){ return num_processed == num_queued; }); return; } // all other numerical arrays auto num_tree = afield.range_index ? nullptr : numerical_index.at(afield.name); auto trie = afield.range_index ? 
range_index.at(afield.name) : nullptr; iterate_and_index_numerical_field(iter_batch, afield, [&afield, num_tree, trie] (const index_record& record, uint32_t seq_id) { for(size_t arr_i = 0; arr_i < record.doc[afield.name].size(); arr_i++) { const auto& arr_value = record.doc[afield.name][arr_i]; if(afield.type == field_types::INT32_ARRAY) { const int32_t value = arr_value; if (afield.range_index) { trie->insert(value, seq_id); } else { num_tree->insert(value, seq_id); } } else if(afield.type == field_types::INT64_ARRAY) { const int64_t value = arr_value; if (afield.range_index) { trie->insert(value, seq_id); } else { num_tree->insert(value, seq_id); } } else if(afield.type == field_types::FLOAT_ARRAY) { const float fvalue = arr_value; int64_t value = float_to_int64_t(fvalue); if (afield.range_index) { trie->insert(value, seq_id); } else { num_tree->insert(value, seq_id); } } else if(afield.type == field_types::BOOL_ARRAY) { const bool value = record.doc[afield.name][arr_i]; if (afield.range_index) { trie->insert(value, seq_id); } else { num_tree->insert(value, seq_id); } } } }); } // add numerical values automatically into sort index if sorting is enabled if(afield.is_num_sortable() && afield.type != field_types::GEOPOINT_ARRAY) { spp::sparse_hash_map *doc_to_score = sort_index.at(afield.name); bool is_integer = afield.is_integer(); bool is_float = afield.is_float(); bool is_bool = afield.is_bool(); bool is_geopoint = afield.is_geopoint(); for(const auto& record: iter_batch) { if(!record.indexed.ok()) { continue; } const auto& document = record.doc; const auto seq_id = record.seq_id; if (document.count(afield.name) == 0 || !afield.index) { continue; } if(is_integer) { doc_to_score->emplace(seq_id, document[afield.name].get()); } else if(is_float) { int64_t ifloat = float_to_int64_t(document[afield.name].get()); doc_to_score->emplace(seq_id, ifloat); } else if(is_bool) { doc_to_score->emplace(seq_id, (int64_t) document[afield.name].get()); } else if(is_geopoint) { 
const std::vector& latlong = document[afield.name]; int64_t lat_lng = GeoPoint::pack_lat_lng(latlong[0], latlong[1]); doc_to_score->emplace(seq_id, lat_lng); } } } } else if(afield.is_str_sortable()) { adi_tree_t* str_tree = str_sort_index.at(afield.name); for(const auto& record: iter_batch) { if(!record.indexed.ok()) { continue; } const auto& document = record.doc; const auto seq_id = record.seq_id; if (document.count(afield.name) == 0 || !afield.index) { continue; } std::string raw_str = document[afield.name].get(); Tokenizer str_tokenizer("", true, false, "", {' '}); str_tokenizer.tokenize(raw_str); if(!raw_str.empty()) { str_tree->index(seq_id, raw_str.substr(0, 2000)); } } } } void Index::tokenize_string(const std::string& text, const field& a_field, const std::vector& symbols_to_index, const std::vector& token_separators, std::unordered_map>& token_to_offsets) { Tokenizer tokenizer(text, true, !a_field.is_string(), a_field.locale, symbols_to_index, token_separators); std::string token; std::string last_token; size_t token_index = 0; while(tokenizer.next(token, token_index)) { if(token.empty()) { continue; } if(token.size() > 100) { token.erase(100); } token_to_offsets[token].push_back(token_index + 1); last_token = token; } if(!token_to_offsets.empty()) { // push 0 for the last occurring token (used for exact match ranking) token_to_offsets[last_token].push_back(0); } } void Index::tokenize_string_array(const std::vector& strings, const field& a_field, const std::vector& symbols_to_index, const std::vector& token_separators, std::unordered_map>& token_to_offsets) { for(size_t array_index = 0; array_index < strings.size(); array_index++) { const std::string& str = strings[array_index]; std::set token_set; // required to deal with repeating tokens Tokenizer tokenizer(str, true, !a_field.is_string(), a_field.locale, symbols_to_index, token_separators); std::string token, last_token; size_t token_index = 0; // iterate and append offset positions 
while(tokenizer.next(token, token_index)) { if(token.empty()) { continue; } if(token.size() > 100) { token.erase(100); } token_to_offsets[token].push_back(token_index + 1); token_set.insert(token); last_token = token; } if(token_set.empty()) { continue; } for(auto& the_token: token_set) { // repeat last element to indicate end of offsets for this array index token_to_offsets[the_token].push_back(token_to_offsets[the_token].back()); // iterate and append this array index to all tokens token_to_offsets[the_token].push_back(array_index); } // push 0 for the last occurring token (used for exact match ranking) token_to_offsets[last_token].push_back(0); } } void Index::initialize_facet_indexes(const field& facet_field) { facet_index_v4->initialize(facet_field.name); } void Index::compute_facet_stats(facet &a_facet, const std::string& raw_value, const std::string & field_type) { if(field_type == field_types::INT32 || field_type == field_types::INT32_ARRAY) { int32_t val = std::stoi(raw_value); if (val < a_facet.stats.fvmin) { a_facet.stats.fvmin = val; } if (val > a_facet.stats.fvmax) { a_facet.stats.fvmax = val; } a_facet.stats.fvsum += val; a_facet.stats.fvcount++; } else if(field_type == field_types::INT64 || field_type == field_types::INT64_ARRAY) { int64_t val = std::stoll(raw_value); if(val < a_facet.stats.fvmin) { a_facet.stats.fvmin = val; } if(val > a_facet.stats.fvmax) { a_facet.stats.fvmax = val; } a_facet.stats.fvsum += val; a_facet.stats.fvcount++; } else if(field_type == field_types::FLOAT || field_type == field_types::FLOAT_ARRAY) { float val = std::stof(raw_value); if(val < a_facet.stats.fvmin) { a_facet.stats.fvmin = val; } if(val > a_facet.stats.fvmax) { a_facet.stats.fvmax = val; } a_facet.stats.fvsum += val; a_facet.stats.fvcount++; } } void Index::compute_facet_stats(facet &a_facet, int64_t raw_value, const std::string & field_type) { if(field_type == field_types::INT32 || field_type == field_types::INT32_ARRAY) { int32_t val = raw_value; if (val < 
a_facet.stats.fvmin) { a_facet.stats.fvmin = val; } if (val > a_facet.stats.fvmax) { a_facet.stats.fvmax = val; } a_facet.stats.fvsum += val; a_facet.stats.fvcount++; } else if(field_type == field_types::INT64 || field_type == field_types::INT64_ARRAY) { int64_t val = raw_value; if(val < a_facet.stats.fvmin) { a_facet.stats.fvmin = val; } if(val > a_facet.stats.fvmax) { a_facet.stats.fvmax = val; } a_facet.stats.fvsum += val; a_facet.stats.fvcount++; } else if(field_type == field_types::FLOAT || field_type == field_types::FLOAT_ARRAY) { float val = reinterpret_cast(raw_value); if(val < a_facet.stats.fvmin) { a_facet.stats.fvmin = val; } if(val > a_facet.stats.fvmax) { a_facet.stats.fvmax = val; } a_facet.stats.fvsum += val; a_facet.stats.fvcount++; } } int64_t Index::get_doc_val_from_sort_index(sort_index_iterator sort_index_it, uint32_t doc_seq_id) const { if(sort_index_it != sort_index.end()){ auto doc_id_val_map = sort_index_it->second; auto doc_seq_id_it = doc_id_val_map->find(doc_seq_id); if(doc_seq_id_it != doc_id_val_map->end()){ return doc_seq_id_it->second; } } return INT64_MAX; } std::vector Index::get_group_by_field_iterators(const std::vector& group_by_fields, bool is_reverse) const { std::vector group_by_field_it_vec; for (const auto &field_name: group_by_fields) { if (!facet_index_v4->has_hash_index(field_name)) { continue; } auto facet_index = facet_index_v4->get_facet_hash_index(field_name); auto facet_index_it = is_reverse ? 
facet_index->new_rev_iterator() : facet_index->new_iterator(); group_by_field_it_t group_by_field_it_struct {field_name, std::move(facet_index_it)}; group_by_field_it_vec.emplace_back(std::move(group_by_field_it_struct)); } return group_by_field_it_vec; } void Index::do_facets(std::vector & facets, facet_query_t & facet_query, bool estimate_facets, size_t facet_sample_percent, const std::vector& facet_infos, const size_t group_limit, const std::vector& group_by_fields, const bool group_missing_values, const uint32_t* result_ids, size_t results_size, int max_facet_count, bool is_wildcard_query, bool no_filters_provided, facet_index_type_t facet_index_type) const { if(results_size == 0) { return ; } std::vector group_by_field_it_vec; if (group_limit != 0) { group_by_field_it_vec = get_group_by_field_iterators(group_by_fields); } size_t total_docs = seq_ids->num_ids(); // assumed that facet fields have already been validated upstream for(size_t findex=0; findex < facets.size(); findex++) { auto& a_facet = facets[findex]; const auto& facet_field = facet_infos[findex].facet_field; const bool use_facet_query = facet_infos[findex].use_facet_query; const auto& fquery_hashes = facet_infos[findex].hashes; const bool should_compute_stats = facet_infos[findex].should_compute_stats; const bool use_value_index = facet_infos[findex].use_value_index; auto sort_index_it = sort_index.find(a_facet.field_name); size_t mod_value = 100 / facet_sample_percent; auto num_facet_values = facet_index_v4->get_facet_count(facet_field.name); if(num_facet_values == 0) { continue; } bool is_wildcard_no_filter_query = is_wildcard_query && no_filters_provided; bool facet_value_index_exists = facet_index_v4->has_value_index(facet_field.name); #ifdef TEST_BUILD if(facet_index_type == VALUE) { #else if(facet_value_index_exists && use_value_index) { #endif // LOG(INFO) << "Using intersection to find facets"; a_facet.is_intersected = true; std::map facet_results; std::string sort_order = 
a_facet.is_sort_by_alpha ? a_facet.sort_order : ""; facet_index_v4->intersect(a_facet, use_facet_query, facet_infos[findex].fvalue_searched_tokens, result_ids, results_size, max_facet_count, facet_results, is_wildcard_no_filter_query, sort_order); for(const auto& kv : facet_results) { //range facet processing if(a_facet.is_range_query) { const auto doc_val = kv.first; std::pair range_pair {}; if(a_facet.get_range(std::stoll(doc_val), range_pair)) { const auto& range_id = range_pair.first; facet_count_t& facet_count = a_facet.result_map[range_id]; facet_count.count = kv.second.count; } } else { facet_count_t& facet_count = a_facet.value_result_map[kv.first]; facet_count.count = kv.second.count; facet_count.doc_id = kv.second.doc_id; } if(should_compute_stats) { //LOG(INFO) << "Computing facet stas for facet " << a_facet.field_name; for(size_t i = 0; i < kv.second.count; ++i) { compute_facet_stats(a_facet, kv.first, facet_field.type); } } } } else { //LOG(INFO) << "Using hashing to find facets"; bool facet_hash_index_exists = facet_index_v4->has_hash_index(facet_field.name); if(!facet_hash_index_exists) { continue; } const auto& fhash_int64_map = facet_index_v4->get_fhash_int64_map(a_facet.field_name); const auto facet_field_is_array = facet_field.is_array(); const auto& facet_index = facet_index_v4->get_facet_hash_index(facet_field.name); posting_list_t::iterator_t facet_index_it = facet_index->new_iterator(); std::vector facet_hashes; for(size_t i = 0; i < results_size; i++) { // if sampling is enabled, we will skip a portion of the results to speed up things if(estimate_facets) { if(i % mod_value != 0) { continue; } } uint32_t doc_seq_id = result_ids[i]; facet_index_it.skip_to(doc_seq_id); if(!facet_index_it.valid()) { break; } if(facet_index_it.id() != doc_seq_id) { continue; } facet_hashes.clear(); posting_list_t::get_offsets(facet_index_it, facet_hashes); uint64_t distinct_id = 0; if(group_limit) { distinct_id = 1; for(auto& kv : group_by_field_it_vec) { 
get_distinct_id(kv.field_name, kv.it, doc_seq_id, group_missing_values, distinct_id); } } //LOG(INFO) << "facet_hash_count " << facet_hash_count; if(((i + 1) % 16384) == 0) { RETURN_CIRCUIT_BREAKER } std::set unique_facet_hashes; for(size_t j = 0; j < facet_hashes.size(); j++) { const auto& fhash = facet_hashes[j]; if(unique_facet_hashes.count(fhash) == 0) { unique_facet_hashes.insert(fhash); } else { continue; } if(should_compute_stats) { int64_t val = fhash; if(facet_field.is_int64()) { if(fhash_int64_map.find(fhash) != fhash_int64_map.end()) { val = fhash_int64_map.at(fhash); } else { val = INT64_MAX; } } compute_facet_stats(a_facet, val, facet_field.type); } if(a_facet.is_range_query) { int64_t doc_val = get_doc_val_from_sort_index(sort_index_it, doc_seq_id); std::pair range_pair {}; if(a_facet.get_range(doc_val, range_pair)) { const auto& range_id = range_pair.first; facet_count_t& facet_count = a_facet.result_map[range_id]; facet_count.count += 1; if(group_limit) { a_facet.hash_groups[range_id].emplace(distinct_id); } } } else if(!use_facet_query || fquery_hashes.find(fhash) != fquery_hashes.end()) { facet_count_t& facet_count = a_facet.result_map[fhash]; //LOG(INFO) << "field: " << a_facet.field_name << ", doc id: " << doc_seq_id << ", hash: " << fhash; facet_count.doc_id = doc_seq_id; facet_count.array_pos = j; if(group_limit) { a_facet.hash_groups[fhash].emplace(distinct_id); } else { facet_count.count += 1; } if(use_facet_query) { //LOG (INFO) << "adding hash tokens for hash " << fhash; a_facet.hash_tokens[fhash] = fquery_hashes.at(fhash); } if(!a_facet.sort_field.empty()) { sort_index_it = sort_index.find(a_facet.sort_field); facet_count.sort_field_val = get_doc_val_from_sort_index(sort_index_it, doc_seq_id); //LOG(INFO) << "found sort_field val " << facet_count.sort_field; } } } } } } } void Index::aggregate_topster(Topster* agg_topster, Topster* index_topster) { if(index_topster->distinct) { for(auto &group_topster_entry: index_topster->group_kv_map) { 
Topster* group_topster = group_topster_entry.second;
            for(const auto& map_kv: group_topster->kv_map) {
                agg_topster->add(map_kv.second);
            }
        }
    } else {
        for(const auto& map_kv: index_topster->kv_map) {
            agg_topster->add(map_kv.second);
        }
    }
}

// Enumerates combinations of per-token candidates from `token_candidates_vec` and runs
// `search_across_fields` for each combination not seen before (tracked by hash in
// `query_hashes`).
//
// The number of combinations tried is bounded by the product of the candidate list sizes
// and by a combination limit derived from `max_candidates`. The loop also honours the
// search circuit breaker and returns the first failing `search_across_fields` result.
Option Index::search_all_candidates(const size_t num_search_fields,
                                    const text_match_type_t match_type,
                                    const std::vector& the_fields,
                                    filter_result_iterator_t* const filter_result_iterator,
                                    const uint32_t* exclude_token_ids, size_t exclude_token_ids_size,
                                    const std::unordered_set& excluded_group_ids,
                                    const std::vector& sort_fields,
                                    std::vector& token_candidates_vec,
                                    std::vector>& searched_queries,
                                    tsl::htrie_map& qtoken_set,
                                    const std::vector& dropped_tokens,
                                    Topster* topster,
                                    spp::sparse_hash_map& groups_processed,
                                    uint32_t*& all_result_ids, size_t& all_result_ids_len,
                                    const size_t typo_tokens_threshold,
                                    const size_t group_limit,
                                    const std::vector& group_by_fields,
                                    const bool group_missing_values,
                                    const std::vector& query_tokens,
                                    const std::vector& num_typos,
                                    const std::vector& prefixes,
                                    bool prioritize_exact_match,
                                    const bool prioritize_token_position,
                                    const bool prioritize_num_matching_fields,
                                    const bool exhaustive_search,
                                    const size_t max_candidates,
                                    int syn_orig_num_tokens,
                                    const int* sort_order,
                                    std::array*, 3>& field_values,
                                    const std::vector& geopoint_indices,
                                    std::set& query_hashes,
                                    std::vector& id_buff, const std::string& collection_name) const {

    /*if(!token_candidates_vec.empty()) {
        LOG(INFO) << "Prefix candidates size: " << token_candidates_vec.back().candidates.size();
        LOG(INFO) << "max_candidates: " << max_candidates;
        LOG(INFO) << "token_candidates_vec.size(): " << token_candidates_vec.size();
    }*/

    // N = total number of candidate combinations (product of all candidate list sizes)
    auto product = []( long long a, tok_candidates & b ) { return a*b.candidates.size(); };
    long long int N = std::accumulate(token_candidates_vec.begin(), token_candidates_vec.end(), 1LL, product);

    // escape hatch to prevent too much looping but subject to being overridden explicitly via `max_candidates`
    long long
combination_limit = (num_search_fields == 1 && prefixes[0]) ? max_candidates : std::max(Index::COMBINATION_MIN_LIMIT, max_candidates); for(long long n = 0; n < N && n < combination_limit; ++n) { RETURN_CIRCUIT_BREAKER_OP std::vector query_suggestion(token_candidates_vec.size()); uint64 qhash; uint32_t total_cost = next_suggestion2(token_candidates_vec, n, query_suggestion, qhash); /*LOG(INFO) << "n: " << n; std::stringstream fullq; for(const auto& qtok : query_suggestion) { fullq << qtok.value << " "; } LOG(INFO) << "query: " << fullq.str() << ", total_cost: " << total_cost << ", all_result_ids_len: " << all_result_ids_len << ", bufsiz: " << id_buff.size();*/ if(query_hashes.find(qhash) != query_hashes.end()) { // skip this query since it has already been processed before //LOG(INFO) << "Skipping qhash " << qhash; continue; } //LOG(INFO) << "field_num_results: " << field_num_results << ", typo_tokens_threshold: " << typo_tokens_threshold; auto search_across_fields_op = search_across_fields(query_suggestion, num_typos, prefixes, the_fields, num_search_fields, match_type, sort_fields, topster,groups_processed, searched_queries, qtoken_set, dropped_tokens, group_limit, group_by_fields, group_missing_values, prioritize_exact_match, prioritize_token_position, prioritize_num_matching_fields, filter_result_iterator, total_cost, syn_orig_num_tokens, exclude_token_ids, exclude_token_ids_size, excluded_group_ids, sort_order, field_values, geopoint_indices, id_buff, all_result_ids, all_result_ids_len, collection_name); if (!search_across_fields_op.ok()) { return search_across_fields_op; } query_hashes.insert(qhash); filter_result_iterator->reset(); } return Option(true); } void Index::search_candidates(const uint8_t & field_id, bool field_is_array, const uint32_t* filter_ids, size_t filter_ids_length, const uint32_t* exclude_token_ids, size_t exclude_token_ids_size, const std::vector& curated_ids, std::vector & sort_fields, std::vector & token_candidates_vec, std::vector> & 
searched_queries, Topster* topster, spp::sparse_hash_map& groups_processed, uint32_t** all_result_ids, size_t & all_result_ids_len, size_t& field_num_results, const size_t typo_tokens_threshold, const size_t group_limit, const std::vector& group_by_fields, const bool group_missing_values, const std::vector& query_tokens, bool prioritize_exact_match, const bool exhaustive_search, int syn_orig_num_tokens, const size_t concurrency, std::set& query_hashes, std::vector& id_buff) const { auto product = []( long long a, token_candidates & b ) { return a*b.candidates.size(); }; long long int N = std::accumulate(token_candidates_vec.begin(), token_candidates_vec.end(), 1LL, product); int sort_order[3]; // 1 or -1 based on DESC or ASC respectively std::array*, 3> field_values; std::vector geopoint_indices; populate_sort_mapping(sort_order, geopoint_indices, sort_fields, field_values); // escape hatch to prevent too much looping size_t combination_limit = exhaustive_search ? Index::COMBINATION_MAX_LIMIT : Index::COMBINATION_MIN_LIMIT; for (long long n = 0; n < N && n < combination_limit; ++n) { RETURN_CIRCUIT_BREAKER // every element in `query_suggestion` contains a token and its associated hits std::vector query_suggestion(token_candidates_vec.size()); // actual query suggestion preserves original order of tokens in query std::vector actual_query_suggestion(token_candidates_vec.size()); uint64 qhash; uint32_t token_bits = 0; uint32_t total_cost = next_suggestion(token_candidates_vec, n, actual_query_suggestion, query_suggestion, syn_orig_num_tokens, token_bits, qhash); if(query_hashes.find(qhash) != query_hashes.end()) { // skip this query since it has already been processed before continue; } query_hashes.insert(qhash); //LOG(INFO) << "field_num_results: " << field_num_results << ", typo_tokens_threshold: " << typo_tokens_threshold; //LOG(INFO) << "n: " << n; /*std::stringstream fullq; for(const auto& qleaf : actual_query_suggestion) { std::string 
qtok(reinterpret_cast(qleaf->key),qleaf->key_len - 1); fullq << qtok << " "; } LOG(INFO) << "field: " << size_t(field_id) << ", query: " << fullq.str() << ", total_cost: " << total_cost;*/ // Prepare excluded document IDs that we can later remove from the result set uint32_t* excluded_result_ids = nullptr; size_t excluded_result_ids_size = ArrayUtils::or_scalar(exclude_token_ids, exclude_token_ids_size, &curated_ids[0], curated_ids.size(), &excluded_result_ids); std::vector posting_lists; for(auto& query_leaf : query_suggestion) { posting_lists.push_back(query_leaf->values); } result_iter_state_t iter_state( excluded_result_ids, excluded_result_ids_size, filter_ids, filter_ids_length ); // We fetch offset positions only for multi token query bool single_exact_query_token = false; if(total_cost == 0 && query_suggestion.size() == query_tokens.size() == 1) { // does this candidate suggestion token match query token exactly? single_exact_query_token = true; } if(topster == nullptr) { posting_t::block_intersector_t(posting_lists, iter_state) .intersect([&](uint32_t seq_id, std::vector& its) { id_buff.push_back(seq_id); }); } else { posting_t::block_intersector_t(posting_lists, iter_state) .intersect([&](uint32_t seq_id, std::vector& its) { score_results(sort_fields, searched_queries.size(), field_id, field_is_array, total_cost, topster, query_suggestion, groups_processed, seq_id, sort_order, field_values, geopoint_indices, group_limit, group_by_fields, group_missing_values, token_bits, prioritize_exact_match, single_exact_query_token, syn_orig_num_tokens, its); id_buff.push_back(seq_id); }); } delete [] excluded_result_ids; const size_t num_result_ids = id_buff.size(); if(id_buff.size() > 100000) { // prevents too many ORs during exhaustive searching std::sort(id_buff.begin(), id_buff.end()); id_buff.erase(std::unique( id_buff.begin(), id_buff.end() ), id_buff.end()); uint32_t* new_all_result_ids = nullptr; all_result_ids_len = ArrayUtils::or_scalar(*all_result_ids, 
all_result_ids_len, &id_buff[0], id_buff.size(), &new_all_result_ids); delete[] *all_result_ids; *all_result_ids = new_all_result_ids; id_buff.clear(); } if(num_result_ids == 0) { continue; } field_num_results += num_result_ids; searched_queries.push_back(actual_query_suggestion); } } bool Index::field_is_indexed(const std::string& field_name) const { return search_index.count(field_name) != 0 || numerical_index.count(field_name) != 0 || range_index.count(field_name) != 0 || geo_range_index.count(field_name) != 0; } Option Index::do_filtering_with_lock(filter_node_t* const filter_tree_root, filter_result_t& filter_result, const std::string& collection_name) const { std::shared_lock lock(mutex); auto filter_result_iterator = filter_result_iterator_t(collection_name, this, filter_tree_root); auto filter_init_op = filter_result_iterator.init_status(); if (!filter_init_op.ok()) { return filter_init_op; } filter_result.count = filter_result_iterator.to_filter_id_array(filter_result.docs); return Option(true); } Option Index::do_reference_filtering_with_lock(filter_node_t* const filter_tree_root, filter_result_t& filter_result, const std::string& collection_name, const std::string& reference_helper_field_name) const { std::shared_lock lock(mutex); auto filter_result_iterator = filter_result_iterator_t(collection_name, this, filter_tree_root); auto filter_init_op = filter_result_iterator.init_status(); if (!filter_init_op.ok()) { return filter_init_op; } uint32_t* reference_docs = nullptr; uint32_t count = filter_result_iterator.to_filter_id_array(reference_docs); std::unique_ptr docs_guard(reference_docs); if (count == 0) { return Option(true); } // Collect all the doc ids from the reference ids. 
std::vector> id_pairs;
    std::unordered_set unique_doc_ids;

    // each pair is (referred-to doc id, local doc id that references it)
    auto const& ref_index = *sort_index.at(reference_helper_field_name);
    for (uint32_t i = 0; i < count; i++) {
        auto& reference_doc_id = reference_docs[i];
        // NOTE(review): `at()` throws if a matching doc has no entry in the helper field's sort index
        auto doc_id = ref_index.at(reference_doc_id);

        id_pairs.emplace_back(std::pair(doc_id, reference_doc_id));
        unique_doc_ids.insert(doc_id);
    }

    // order by referred-to doc id so that all references of one doc are adjacent
    std::sort(id_pairs.begin(), id_pairs.end(), [](auto const& left, auto const& right) {
        return left.first < right.first;
    });

    filter_result.count = unique_doc_ids.size();
    filter_result.docs = new uint32_t[unique_doc_ids.size()];
    filter_result.coll_to_references = new std::map[unique_doc_ids.size()] {};

    // references collected for the doc currently being grouped
    std::vector previous_doc_references;
    // `previous_doc` starts at a value different from the first doc id, so the first
    // iteration always takes the "new doc" branch
    for (uint32_t i = 0, previous_doc = id_pairs[0].first + 1, result_index = 0; i < id_pairs.size(); i++) {
        auto const& current_doc = id_pairs[i].first;
        auto const& reference_doc_id = id_pairs[i].second;

        if (current_doc != previous_doc) {
            filter_result.docs[result_index] = current_doc;
            if (result_index > 0) {
                // flush the references gathered for the previous doc into its result slot
                auto& reference_result = filter_result.coll_to_references[result_index - 1];

                auto r = reference_filter_result_t(previous_doc_references.size(),
                                                   new uint32_t[previous_doc_references.size()]);
                std::copy(previous_doc_references.begin(), previous_doc_references.end(), r.docs);
                reference_result[collection_name] = std::move(r);

                previous_doc_references.clear();
            }

            result_index++;
            previous_doc = current_doc;
            previous_doc_references.push_back(reference_doc_id);
        } else {
            previous_doc_references.push_back(reference_doc_id);
        }
    }

    // flush the references of the last doc, which the loop above never writes out
    if (!previous_doc_references.empty()) {
        auto& reference_result = filter_result.coll_to_references[filter_result.count - 1];

        auto r = reference_filter_result_t(previous_doc_references.size(),
                                           new uint32_t[previous_doc_references.size()]);
        std::copy(previous_doc_references.begin(), previous_doc_references.end(), r.docs);
        reference_result[collection_name] = std::move(r);
    }

    return Option(true);
}

// Unpacks `search_params` and forwards every member, together with `collection_name` and
// `facet_index_type`, to `search()`; returns whatever `search()` returns.
Option Index::run_search(search_args* search_params, const std::string&
collection_name, facet_index_type_t facet_index_type) {
    // the argument order below must match the parameter order of `search()`
    return search(search_params->field_query_tokens,
                  search_params->search_fields,
                  search_params->match_type,
                  search_params->filter_tree_root, search_params->facets, search_params->facet_query,
                  search_params->max_facet_values,
                  search_params->included_ids, search_params->excluded_ids,
                  search_params->sort_fields_std, search_params->num_typos,
                  search_params->topster, search_params->curated_topster,
                  search_params->per_page, search_params->offset, search_params->token_order,
                  search_params->prefixes, search_params->drop_tokens_threshold,
                  search_params->all_result_ids_len, search_params->groups_processed,
                  search_params->searched_queries,
                  search_params->qtoken_set,
                  search_params->raw_result_kvs, search_params->override_result_kvs,
                  search_params->typo_tokens_threshold,
                  search_params->group_limit,
                  search_params->group_by_fields,
                  search_params->group_missing_values,
                  search_params->default_sorting_field,
                  search_params->prioritize_exact_match,
                  search_params->prioritize_token_position,
                  search_params->prioritize_num_matching_fields,
                  search_params->exhaustive_search,
                  search_params->concurrency,
                  search_params->search_cutoff_ms,
                  search_params->min_len_1typo,
                  search_params->min_len_2typo,
                  search_params->max_candidates,
                  search_params->infixes,
                  search_params->max_extra_prefix,
                  search_params->max_extra_suffix,
                  search_params->facet_query_num_typos,
                  search_params->filter_curated_hits,
                  search_params->split_join_tokens,
                  search_params->vector_query,
                  search_params->facet_sample_percent,
                  search_params->facet_sample_threshold,
                  collection_name,
                  facet_index_type,
                  search_params->drop_tokens_mode);
}

// Adds every curated (included) document in `included_ids_map` to `curated_topster`.
// The map key is the outer position, which doubles as the group distinct key; each value
// holds (inner position, seq_id) pairs. Does nothing when the map is empty.
void Index::collate_included_ids(const std::vector& q_included_tokens,
                                 const std::map> & included_ids_map,
                                 Topster* curated_topster,
                                 std::vector> & searched_queries) const {

    if(included_ids_map.empty()) {
        return;
    }

    for(const auto& pos_ids: included_ids_map) {
        const size_t outer_pos = pos_ids.first;

        for(const auto& index_seq_id: pos_ids.second) {
uint32_t inner_pos = index_seq_id.first; uint32_t seq_id = index_seq_id.second; uint64_t distinct_id = outer_pos; // outer pos is the group distinct key uint64_t match_score = (64000 - outer_pos - inner_pos); // both outer pos and inner pos inside group // LOG(INFO) << "seq_id: " << seq_id << " - " << match_score; int64_t scores[3]; scores[0] = match_score; scores[1] = int64_t(1); scores[2] = int64_t(1); KV kv(searched_queries.size(), seq_id, distinct_id, 0, scores); curated_topster->add(&kv); } } } void Index::concat_topster_ids(Topster* topster, spp::sparse_hash_map>& topster_ids) { if(topster->distinct) { for(auto &group_topster_entry: topster->group_kv_map) { Topster* group_topster = group_topster_entry.second; for(const auto& map_kv: group_topster->kv_map) { topster_ids[map_kv.first].push_back(map_kv.second); } } } else { for(const auto& map_kv: topster->kv_map) { //LOG(INFO) << "map_kv.second.key: " << map_kv.second->key; //LOG(INFO) << "map_kv.first: " << map_kv.first; topster_ids[map_kv.first].push_back(map_kv.second); } } } bool Index::static_filter_query_eval(const override_t* override, std::vector& tokens, filter_node_t*& filter_tree_root) const { std::string query = StringUtils::join(tokens, " "); if ((override->rule.match == override_t::MATCH_EXACT && override->rule.normalized_query == query) || (override->rule.match == override_t::MATCH_CONTAINS && StringUtils::contains_word(query, override->rule.normalized_query))) { filter_node_t* new_filter_tree_root = nullptr; Option filter_op = filter::parse_filter_query(override->filter_by, search_schema, store, "", new_filter_tree_root); if (filter_op.ok()) { if (filter_tree_root == nullptr) { filter_tree_root = new_filter_tree_root; } else { auto root = new filter_node_t(AND, filter_tree_root, new_filter_tree_root); filter_tree_root = root; } return true; } else { delete new_filter_tree_root; } } return false; } bool Index::resolve_override(const std::vector& rule_tokens, const bool exact_rule_match, const 
std::vector& query_tokens, token_ordering token_order, std::set& absorbed_tokens, std::string& filter_by_clause) const { bool resolved_override = false; size_t i = 0, j = 0; std::unordered_map> field_placeholder_tokens; while(i < rule_tokens.size()) { if(rule_tokens[i].front() == '{' && rule_tokens[i].back() == '}') { // found a field placeholder std::vector field_names; std::string rule_part = rule_tokens[i]; field_names.emplace_back(rule_part.erase(0, 1).erase(rule_part.size() - 1)); // skip until we find a non-placeholder token i++; while(i < rule_tokens.size() && (rule_tokens[i].front() == '{' && rule_tokens[i].back() == '}')) { rule_part = rule_tokens[i]; field_names.emplace_back(rule_part.erase(0, 1).erase(rule_part.size() - 1)); i++; } std::vector matched_tokens; // `i` now points to either end of array or at a non-placeholder rule token // end of array: add remaining query tokens as matched tokens // non-placeholder: skip query tokens until it matches a rule token while(j < query_tokens.size() && (i == rule_tokens.size() || rule_tokens[i] != query_tokens[j])) { matched_tokens.emplace_back(query_tokens[j]); j++; } resolved_override = true; // we try to map `field_names` against `matched_tokens` now for(size_t findex = 0; findex < field_names.size(); findex++) { const auto& field_name = field_names[findex]; bool slide_window = (findex == 0); // fields following another field should match exactly std::vector field_absorbed_tokens; resolved_override &= check_for_overrides(token_order, field_name, slide_window, exact_rule_match, matched_tokens, absorbed_tokens, field_absorbed_tokens); if(!resolved_override) { goto RETURN_EARLY; } field_placeholder_tokens[field_name] = field_absorbed_tokens; } } else { // rule token is not a placeholder, so we have to skip the query tokens until it matches rule token while(j < query_tokens.size() && query_tokens[j] != rule_tokens[i]) { if(exact_rule_match) { // a single mismatch is enough to fail exact match return false; } j++; 
} // either we have exhausted all query tokens if(j == query_tokens.size()) { return false; } // or query token matches rule token, so we can proceed i++; j++; } } RETURN_EARLY: if(!resolved_override || (exact_rule_match && query_tokens.size() != absorbed_tokens.size())) { return false; } // replace placeholder with field_absorbed_tokens in rule_tokens for(const auto& kv: field_placeholder_tokens) { std::string pattern = "{" + kv.first + "}"; std::string replacement = StringUtils::join(kv.second, " "); StringUtils::replace_all(filter_by_clause, pattern, replacement); } return true; } void Index::process_filter_overrides(const std::vector& filter_overrides, std::vector& query_tokens, token_ordering token_order, filter_node_t*& filter_tree_root, std::vector& matched_dynamic_overrides) const { std::shared_lock lock(mutex); for (auto& override : filter_overrides) { if (!override->rule.dynamic_query) { // Simple static filtering: add to filter_by and rewrite query if needed. // Check the original query and then the synonym variants until a rule matches. 
bool resolved_override = static_filter_query_eval(override, query_tokens, filter_tree_root); if (resolved_override) { if (override->remove_matched_tokens) { std::vector rule_tokens; Tokenizer(override->rule.query, true).tokenize(rule_tokens); std::set rule_token_set(rule_tokens.begin(), rule_tokens.end()); remove_matched_tokens(query_tokens, rule_token_set); } if (override->stop_processing) { return; } } } else { // need to extract placeholder field names from the search query, filter on them and rewrite query // we will cover both original query and synonyms std::vector rule_parts; StringUtils::split(override->rule.normalized_query, rule_parts, " "); bool exact_rule_match = override->rule.match == override_t::MATCH_EXACT; std::string filter_by_clause = override->filter_by; std::set absorbed_tokens; bool resolved_override = resolve_override(rule_parts, exact_rule_match, query_tokens, token_order, absorbed_tokens, filter_by_clause); if (resolved_override) { filter_node_t* new_filter_tree_root = nullptr; Option filter_op = filter::parse_filter_query(filter_by_clause, search_schema, store, "", new_filter_tree_root); if (filter_op.ok()) { // have to ensure that dropped hits take precedence over added hits matched_dynamic_overrides.push_back(override); if (override->remove_matched_tokens) { std::vector& tokens = query_tokens; remove_matched_tokens(tokens, absorbed_tokens); } if (filter_tree_root == nullptr) { filter_tree_root = new_filter_tree_root; } else { filter_node_t* root = new filter_node_t(AND, filter_tree_root, new_filter_tree_root); filter_tree_root = root; } } else { delete new_filter_tree_root; } if (override->stop_processing) { return; } } } } } void Index::remove_matched_tokens(std::vector& tokens, const std::set& rule_token_set) { std::vector new_tokens; for(std::string& token: tokens) { if(rule_token_set.count(token) == 0) { new_tokens.push_back(token); } } if(new_tokens.empty()) { tokens = {"*"}; } else { tokens = new_tokens; } } bool 
Index::check_for_overrides(const token_ordering& token_order, const string& field_name, const bool slide_window, bool exact_rule_match, std::vector& tokens, std::set& absorbed_tokens, std::vector& field_absorbed_tokens) const { for(size_t window_len = tokens.size(); window_len > 0; window_len--) { for(size_t start_index = 0; start_index+window_len-1 < tokens.size(); start_index++) { std::vector window_tokens; std::set window_tokens_set; for (size_t i = start_index; i < start_index + window_len; i++) { bool is_prefix = (i == (start_index + window_len - 1)); window_tokens.emplace_back(i, tokens[i], is_prefix, tokens[i].size(), 0); window_tokens_set.emplace(tokens[i]); } std::vector facets; std::vector> searched_queries; Topster* topster = nullptr; spp::sparse_hash_map groups_processed; uint32_t* result_ids = nullptr; size_t result_ids_len = 0; size_t field_num_results = 0; std::vector group_by_fields; std::set query_hashes; size_t num_toks_dropped = 0; auto field_it = search_schema.find(field_name); if(field_it == search_schema.end()) { continue; } std::vector sort_fields; search_field(0, window_tokens, nullptr, 0, num_toks_dropped, field_it.value(), field_name, nullptr, 0, {}, sort_fields, -1, 0, searched_queries, topster, groups_processed, &result_ids, result_ids_len, field_num_results, 0, group_by_fields, true, false, 4, query_hashes, token_order, false, 0, 0, false, -1, 3, 7, 4); if(result_ids_len != 0) { // we need to narraw onto the exact matches std::vector posting_lists; art_tree* t = search_index.at(field_name); for(auto& w_token: window_tokens) { art_leaf* leaf = (art_leaf *) art_search(t, (const unsigned char*) w_token.value.c_str(), w_token.value.length()+1); if(leaf == nullptr) { continue; } posting_lists.push_back(leaf->values); } uint32_t* exact_strt_ids = new uint32_t[result_ids_len]; size_t exact_strt_size = 0; posting_t::get_exact_matches(posting_lists, field_it.value().is_array(), result_ids, result_ids_len, exact_strt_ids, exact_strt_size); delete 
[] result_ids; delete [] exact_strt_ids; if(exact_strt_size != 0) { // remove window_tokens from `tokens` std::vector new_tokens; for(size_t new_i = start_index; new_i < tokens.size(); new_i++) { const auto& token = tokens[new_i]; if(window_tokens_set.count(token) == 0) { new_tokens.emplace_back(token); } else { absorbed_tokens.insert(token); field_absorbed_tokens.emplace_back(token); } } tokens = new_tokens; return true; } } if(!slide_window) { break; } } } return false; } void Index::search_infix(const std::string& query, const std::string& field_name, std::vector& ids, const size_t max_extra_prefix, const size_t max_extra_suffix) const { auto infix_maps_it = infix_index.find(field_name); if(infix_maps_it == infix_index.end()) { return ; } auto infix_sets = infix_maps_it->second; std::vector leaves; size_t num_processed = 0; std::mutex m_process; std::condition_variable cv_process; auto search_tree = search_index.at(field_name); const auto parent_search_begin = search_begin_us; const auto parent_search_stop_ms = search_stop_us; auto parent_search_cutoff = search_cutoff; for(auto infix_set: infix_sets) { thread_pool->enqueue([infix_set, &leaves, search_tree, &query, max_extra_prefix, max_extra_suffix, &num_processed, &m_process, &cv_process, &parent_search_begin, &parent_search_stop_ms, &parent_search_cutoff]() { search_begin_us = parent_search_begin; search_cutoff = parent_search_cutoff; auto op_search_stop_ms = parent_search_stop_ms/2; std::vector this_leaves; std::string key_buffer; size_t num_iterated = 0; for(auto it = infix_set->begin(); it != infix_set->end(); it++) { it.key(key_buffer); num_iterated++; auto start_index = key_buffer.find(query); if(start_index != std::string::npos && start_index <= max_extra_prefix && (key_buffer.size() - (start_index + query.size())) <= max_extra_suffix) { art_leaf* l = (art_leaf *) art_search(search_tree, (const unsigned char *) key_buffer.c_str(), key_buffer.size()+1); if(l != nullptr) { this_leaves.push_back(l); } } // 
check for search cutoff but only once every 2^10 docs to reduce overhead if(((num_iterated + 1) % (1 << 12)) == 0) { if ((std::chrono::duration_cast(std::chrono::system_clock::now(). time_since_epoch()).count() - search_begin_us) > op_search_stop_ms) { search_cutoff = true; break; } } } std::unique_lock lock(m_process); leaves.insert(leaves.end(), this_leaves.begin(), this_leaves.end()); num_processed++; parent_search_cutoff = parent_search_cutoff || search_cutoff; cv_process.notify_one(); }); } std::unique_lock lock_process(m_process); cv_process.wait(lock_process, [&](){ return num_processed == infix_sets.size(); }); search_cutoff = parent_search_cutoff; for(auto leaf: leaves) { posting_t::merge({leaf->values}, ids); } } Option Index::search(std::vector& field_query_tokens, const std::vector& the_fields, const text_match_type_t match_type, filter_node_t*& filter_tree_root, std::vector& facets, facet_query_t& facet_query, const int max_facet_values, const std::vector>& included_ids, const std::vector& excluded_ids, std::vector& sort_fields_std, const std::vector& num_typos, Topster* topster, Topster* curated_topster, const size_t per_page, const size_t offset, const token_ordering token_order, const std::vector& prefixes, const size_t drop_tokens_threshold, size_t& all_result_ids_len, spp::sparse_hash_map& groups_processed, std::vector>& searched_queries, tsl::htrie_map& qtoken_set, std::vector>& raw_result_kvs, std::vector>& override_result_kvs, const size_t typo_tokens_threshold, const size_t group_limit, const std::vector& group_by_fields, const bool group_missing_values, const string& default_sorting_field, bool prioritize_exact_match, const bool prioritize_token_position, const bool prioritize_num_matching_fields, bool exhaustive_search, size_t concurrency, size_t search_cutoff_ms, size_t min_len_1typo, size_t min_len_2typo, size_t max_candidates, const std::vector& infixes, const size_t max_extra_prefix, const size_t max_extra_suffix, const size_t 
facet_query_num_typos, const bool filter_curated_hits, const enable_t split_join_tokens, const vector_query_t& vector_query, size_t facet_sample_percent, size_t facet_sample_threshold, const std::string& collection_name, facet_index_type_t facet_index_type, const drop_tokens_mode_t drop_tokens_mode) const { std::shared_lock lock(mutex); auto filter_result_iterator = new filter_result_iterator_t(collection_name, this, filter_tree_root); std::unique_ptr filter_iterator_guard(filter_result_iterator); auto filter_init_op = filter_result_iterator->init_status(); if (!filter_init_op.ok()) { return filter_init_op; } if (filter_tree_root != nullptr && !filter_result_iterator->is_valid) { return Option(true); } size_t fetch_size = offset + per_page; std::set curated_ids; std::map> included_ids_map; // outer pos => inner pos => list of IDs std::vector included_ids_vec; std::unordered_set excluded_group_ids; process_curated_ids(included_ids, excluded_ids, group_by_fields, group_limit, group_missing_values, filter_curated_hits, filter_result_iterator, curated_ids, included_ids_map, included_ids_vec, excluded_group_ids); filter_result_iterator->reset(); std::vector curated_ids_sorted(curated_ids.begin(), curated_ids.end()); std::sort(curated_ids_sorted.begin(), curated_ids_sorted.end()); // Order of `fields` are used to sort results // auto begin = std::chrono::high_resolution_clock::now(); uint32_t* all_result_ids = nullptr; const size_t num_search_fields = std::min(the_fields.size(), (size_t) FIELD_LIMIT_NUM); // handle exclusion of tokens/phrases uint32_t* exclude_token_ids = nullptr; size_t exclude_token_ids_size = 0; handle_exclusion(num_search_fields, field_query_tokens, the_fields, exclude_token_ids, exclude_token_ids_size); int sort_order[3]; // 1 or -1 based on DESC or ASC respectively std::array*, 3> field_values; std::vector geopoint_indices; populate_sort_mapping(sort_order, geopoint_indices, sort_fields_std, field_values); // Prepare excluded document IDs that we 
can later remove from the result set uint32_t* excluded_result_ids = nullptr; size_t excluded_result_ids_size = ArrayUtils::or_scalar(exclude_token_ids, exclude_token_ids_size, &curated_ids_sorted[0], curated_ids_sorted.size(), &excluded_result_ids); auto is_wildcard_query = !field_query_tokens.empty() && !field_query_tokens[0].q_include_tokens.empty() && field_query_tokens[0].q_include_tokens[0].value == "*"; bool no_filters_provided = (filter_tree_root == nullptr && !filter_result_iterator->is_valid); // handle phrase searches if (!field_query_tokens[0].q_phrases.empty()) { auto do_phrase_search_op = do_phrase_search(num_search_fields, the_fields, field_query_tokens, sort_fields_std, searched_queries, group_limit, group_by_fields, group_missing_values, topster, sort_order, field_values, geopoint_indices, curated_ids_sorted, filter_result_iterator, all_result_ids, all_result_ids_len, groups_processed, curated_ids, excluded_result_ids, excluded_result_ids_size, excluded_group_ids, curated_topster, included_ids_map, is_wildcard_query, collection_name); filter_iterator_guard.release(); filter_iterator_guard.reset(filter_result_iterator); if (!do_phrase_search_op.ok()) { delete [] all_result_ids; return do_phrase_search_op; } if (filter_result_iterator->approx_filter_ids_length == 0) { goto process_search_results; } } // for phrase query, parser will set field_query_tokens to "*", need to handle that if (is_wildcard_query && field_query_tokens[0].q_phrases.empty()) { if(no_filters_provided && facets.empty() && curated_ids.empty() && vector_query.field_name.empty() && sort_fields_std.size() == 1 && sort_fields_std[0].name == sort_field_const::seq_id && sort_fields_std[0].order == sort_field_const::desc) { // optimize for this path specifically std::vector result_ids; auto it = seq_ids->new_rev_iterator(); std::vector group_by_field_it_vec; if (group_limit != 0) { group_by_field_it_vec = get_group_by_field_iterators(group_by_fields, true); } while (it.valid()) { 
uint32_t seq_id = it.id(); uint64_t distinct_id = seq_id; if (group_limit != 0) { distinct_id = 1; for(auto& kv : group_by_field_it_vec) { get_distinct_id(kv.field_name, kv.it, seq_id, group_missing_values, distinct_id); kv.it.previous(); } if(excluded_group_ids.count(distinct_id) != 0) { continue; } } int64_t scores[3] = {0}; scores[0] = seq_id; int64_t match_score_index = -1; result_ids.push_back(seq_id); KV kv(searched_queries.size(), seq_id, distinct_id, match_score_index, scores); int ret = topster->add(&kv); if(group_limit != 0 && ret < 2) { groups_processed[distinct_id]++; } if (result_ids.size() == fetch_size) { break; } it.previous(); } all_result_ids_len = seq_ids->num_ids(); goto process_search_results; } collate_included_ids({}, included_ids_map, curated_topster, searched_queries); if (!vector_query.field_name.empty()) { auto k = vector_query.k == 0 ? std::max(vector_query.k, fetch_size) : vector_query.k; if(vector_query.query_doc_given) { // since we will omit the query doc from results k++; } auto& field_vector_index = vector_index.at(vector_query.field_name); std::vector> dist_results; uint32_t filter_id_count = 0; while (!no_filters_provided && filter_id_count < vector_query.flat_search_cutoff && filter_result_iterator->is_valid) { auto& seq_id = filter_result_iterator->seq_id; auto filter_result = single_filter_result_t(seq_id, std::move(filter_result_iterator->reference)); filter_result_iterator->next(); std::vector values; try { values = field_vector_index->vecdex->getDataByLabel(seq_id); } catch(...) 
{ // likely not found continue; } float dist; if(field_vector_index->distance_type == cosine) { std::vector normalized_q(vector_query.values.size()); hnsw_index_t::normalize_vector(vector_query.values, normalized_q); dist = field_vector_index->space->get_dist_func()(normalized_q.data(), values.data(), &field_vector_index->num_dim); } else { dist = field_vector_index->space->get_dist_func()(vector_query.values.data(), values.data(), &field_vector_index->num_dim); } dist_results.emplace_back(dist, filter_result); filter_id_count++; } filter_result_iterator->reset(); if(no_filters_provided || (filter_id_count >= vector_query.flat_search_cutoff && filter_result_iterator->is_valid)) { dist_results.clear(); VectorFilterFunctor filterFunctor(filter_result_iterator); std::vector> pairs; if(field_vector_index->distance_type == cosine) { std::vector normalized_q(vector_query.values.size()); hnsw_index_t::normalize_vector(vector_query.values, normalized_q); pairs = field_vector_index->vecdex->searchKnnCloserFirst(normalized_q.data(), k, &filterFunctor); } else { pairs = field_vector_index->vecdex->searchKnnCloserFirst(vector_query.values.data(), k, &filterFunctor); } std::sort(pairs.begin(), pairs.end(), [](auto& x, auto& y) { return x.second < y.second; }); filter_result_iterator->reset(); if (filter_result_iterator->is_valid && !filter_result_iterator->reference.empty()) { // We'll have to get the references of each document. for (auto pair: pairs) { // The doc_id must be valid otherwise it would've been filtered out upstream. 
filter_result_iterator->skip_to(pair.second); auto filter_result = single_filter_result_t(pair.second, std::move(filter_result_iterator->reference)); dist_results.emplace_back(pair.first, filter_result); filter_result_iterator->reset(); } } else { for (const auto &pair: pairs) { auto filter_result = single_filter_result_t(pair.second, {}); dist_results.emplace_back(pair.first, filter_result); } } } std::vector nearest_ids; std::vector eval_filter_indexes; std::vector group_by_field_it_vec; if (group_limit != 0) { group_by_field_it_vec = get_group_by_field_iterators(group_by_fields); } for (auto& dist_result : dist_results) { auto& seq_id = dist_result.second.seq_id; auto references = std::move(dist_result.second.reference_filter_results); if(vector_query.query_doc_given && vector_query.seq_id == seq_id) { continue; } uint64_t distinct_id = seq_id; if (group_limit != 0) { distinct_id = 1; for(auto &kv : group_by_field_it_vec) { get_distinct_id(kv.field_name, kv.it, seq_id, group_missing_values, distinct_id); } if(excluded_group_ids.count(distinct_id) != 0) { continue; } } auto vec_dist_score = (field_vector_index->distance_type == cosine) ? 
std::abs(dist_result.first) : dist_result.first; if(vec_dist_score > vector_query.distance_threshold) { continue; } int64_t scores[3] = {0}; int64_t match_score_index = -1; auto compute_sort_scores_op = compute_sort_scores(sort_fields_std, sort_order, field_values, geopoint_indices, seq_id, references, eval_filter_indexes, 0, scores, match_score_index, vec_dist_score, collection_name); if (!compute_sort_scores_op.ok()) { return compute_sort_scores_op; } KV kv(searched_queries.size(), seq_id, distinct_id, match_score_index, scores, std::move(references)); kv.vector_distance = vec_dist_score; int ret = topster->add(&kv); if(group_limit != 0 && ret < 2) { groups_processed[distinct_id]++; } nearest_ids.push_back(seq_id); } if(!nearest_ids.empty()) { std::sort(nearest_ids.begin(), nearest_ids.end()); // seq_ids should be in ascending order all_result_ids = new uint32[nearest_ids.size()]; std::copy(nearest_ids.begin(), nearest_ids.end(), all_result_ids); all_result_ids_len = nearest_ids.size(); } } else { // if filters were not provided, use the seq_ids index to generate the list of all document ids if (no_filters_provided) { filter_result_iterator = new filter_result_iterator_t(seq_ids->uncompress(), seq_ids->num_ids()); filter_iterator_guard.reset(filter_result_iterator); } auto search_wildcard_op = search_wildcard(filter_tree_root, included_ids_map, sort_fields_std, topster, curated_topster, groups_processed, searched_queries, group_limit, group_by_fields, group_missing_values, curated_ids, curated_ids_sorted, excluded_result_ids, excluded_result_ids_size, excluded_group_ids, all_result_ids, all_result_ids_len, filter_result_iterator, concurrency, sort_order, field_values, geopoint_indices, collection_name); if (!search_wildcard_op.ok()) { return search_wildcard_op; } filter_result_iterator->reset(); } uint32_t _all_result_ids_len = all_result_ids_len; curate_filtered_ids(curated_ids, excluded_result_ids, excluded_result_ids_size, all_result_ids, _all_result_ids_len, 
curated_ids_sorted); all_result_ids_len = _all_result_ids_len; } else { // Non-wildcard // In multi-field searches, a record can be matched across different fields, so we use this for aggregation //begin = std::chrono::high_resolution_clock::now(); // FIXME: needed? std::set query_hashes; // resolve synonyms so that we can compute `syn_orig_num_tokens` std::vector> all_queries = {field_query_tokens[0].q_include_tokens}; std::vector> q_pos_synonyms; std::vector q_include_tokens; int syn_orig_num_tokens = -1; for(size_t j = 0; j < field_query_tokens[0].q_include_tokens.size(); j++) { q_include_tokens.push_back(field_query_tokens[0].q_include_tokens[j].value); } synonym_index->synonym_reduction(q_include_tokens, field_query_tokens[0].q_synonyms); if(!field_query_tokens[0].q_synonyms.empty()) { syn_orig_num_tokens = field_query_tokens[0].q_include_tokens.size(); } for(const auto& q_syn_vec: field_query_tokens[0].q_synonyms) { std::vector q_pos_syn; for(size_t j=0; j < q_syn_vec.size(); j++) { bool is_prefix = (j == q_syn_vec.size()-1); q_pos_syn.emplace_back(j, q_syn_vec[j], is_prefix, q_syn_vec[j].size(), 0); } q_pos_synonyms.push_back(q_pos_syn); all_queries.push_back(q_pos_syn); if((int)q_syn_vec.size() > syn_orig_num_tokens) { syn_orig_num_tokens = (int) q_syn_vec.size(); } } auto fuzzy_search_fields_op = fuzzy_search_fields(the_fields, field_query_tokens[0].q_include_tokens, {}, match_type, excluded_result_ids, excluded_result_ids_size, filter_result_iterator, curated_ids_sorted, excluded_group_ids, sort_fields_std, num_typos, searched_queries, qtoken_set, topster, groups_processed, all_result_ids, all_result_ids_len, group_limit, group_by_fields, group_missing_values, prioritize_exact_match, prioritize_token_position, prioritize_num_matching_fields, query_hashes, token_order, prefixes, typo_tokens_threshold, exhaustive_search, max_candidates, min_len_1typo, min_len_2typo, syn_orig_num_tokens, sort_order, field_values, geopoint_indices, collection_name); if 
(!fuzzy_search_fields_op.ok()) { return fuzzy_search_fields_op; } // try split/joining tokens if no results are found if(split_join_tokens == always || (all_result_ids_len == 0 && split_join_tokens == fallback)) { std::vector> space_resolved_queries; for (size_t i = 0; i < num_search_fields; i++) { std::vector orig_q_include_tokens; for(auto& q_include_token: field_query_tokens[i].q_include_tokens) { orig_q_include_tokens.push_back(q_include_token.value); } resolve_space_as_typos(orig_q_include_tokens, the_fields[i].name,space_resolved_queries); if (!space_resolved_queries.empty()) { break; } } // only one query is resolved for now, so just use that if (!space_resolved_queries.empty()) { const auto& resolved_query = space_resolved_queries[0]; std::vector resolved_tokens; for(size_t j=0; j < resolved_query.size(); j++) { bool is_prefix = (j == resolved_query.size()-1); resolved_tokens.emplace_back(j, space_resolved_queries[0][j], is_prefix, space_resolved_queries[0][j].size(), 0); } auto fuzzy_search_fields_op = fuzzy_search_fields(the_fields, resolved_tokens, {}, match_type, excluded_result_ids, excluded_result_ids_size, filter_result_iterator, curated_ids_sorted, excluded_group_ids, sort_fields_std, num_typos, searched_queries, qtoken_set, topster, groups_processed, all_result_ids, all_result_ids_len, group_limit, group_by_fields, group_missing_values, prioritize_exact_match, prioritize_token_position, prioritize_num_matching_fields, query_hashes, token_order, prefixes, typo_tokens_threshold, exhaustive_search, max_candidates, min_len_1typo, min_len_2typo, syn_orig_num_tokens, sort_order, field_values, geopoint_indices, collection_name); if (!fuzzy_search_fields_op.ok()) { return fuzzy_search_fields_op; } } } // do synonym based searches auto do_synonym_search_op = do_synonym_search(the_fields, match_type, filter_tree_root, included_ids_map, sort_fields_std, curated_topster, token_order, 0, group_limit, group_by_fields, group_missing_values, 
prioritize_exact_match, prioritize_token_position, prioritize_num_matching_fields, exhaustive_search, concurrency, prefixes, min_len_1typo, min_len_2typo, max_candidates, curated_ids, curated_ids_sorted, excluded_result_ids, excluded_result_ids_size, excluded_group_ids, topster, q_pos_synonyms, syn_orig_num_tokens, groups_processed, searched_queries, all_result_ids, all_result_ids_len, filter_result_iterator, query_hashes, sort_order, field_values, geopoint_indices, qtoken_set, collection_name); if (!do_synonym_search_op.ok()) { return do_synonym_search_op; } filter_result_iterator->reset(); // gather up both original query and synonym queries and do drop tokens if (exhaustive_search || all_result_ids_len < drop_tokens_threshold) { for (size_t qi = 0; qi < all_queries.size(); qi++) { auto& orig_tokens = all_queries[qi]; size_t num_tokens_dropped = 0; auto curr_direction = drop_tokens_mode; size_t total_dirs_done = 0; while(exhaustive_search || all_result_ids_len < drop_tokens_threshold) { // When atleast two tokens from the query are available we can drop one std::vector truncated_tokens; std::vector dropped_tokens; if(num_tokens_dropped >= orig_tokens.size() - 1) { // swap direction and reset counter curr_direction = (curr_direction == right_to_left) ? 
left_to_right : right_to_left; num_tokens_dropped = 0; total_dirs_done++; } if(orig_tokens.size() > 1 && total_dirs_done < 2) { bool prefix_search = false; if (curr_direction == right_to_left) { // drop from right size_t truncated_len = orig_tokens.size() - num_tokens_dropped - 1; for (size_t i = 0; i < orig_tokens.size(); i++) { if(i < truncated_len) { truncated_tokens.emplace_back(orig_tokens[i]); } else { dropped_tokens.emplace_back(orig_tokens[i]); } } } else { // drop from left prefix_search = true; size_t start_index = (num_tokens_dropped + 1); for(size_t i = 0; i < orig_tokens.size(); i++) { if(i >= start_index) { truncated_tokens.emplace_back(orig_tokens[i]); } else { dropped_tokens.emplace_back(orig_tokens[i]); } } } num_tokens_dropped++; std::vector drop_token_prefixes; for (const auto p : prefixes) { drop_token_prefixes.push_back(p && prefix_search); } auto fuzzy_search_fields_op = fuzzy_search_fields(the_fields, truncated_tokens, dropped_tokens, match_type, excluded_result_ids, excluded_result_ids_size, filter_result_iterator, curated_ids_sorted, excluded_group_ids, sort_fields_std, num_typos, searched_queries, qtoken_set, topster, groups_processed, all_result_ids, all_result_ids_len, group_limit, group_by_fields, group_missing_values, prioritize_exact_match, prioritize_token_position, prioritize_num_matching_fields, query_hashes, token_order, prefixes, typo_tokens_threshold, exhaustive_search, max_candidates, min_len_1typo, min_len_2typo, -1, sort_order, field_values, geopoint_indices, collection_name); if (!fuzzy_search_fields_op.ok()) { return fuzzy_search_fields_op; } } else { break; } } } } auto do_infix_search_op = do_infix_search(num_search_fields, the_fields, infixes, sort_fields_std, searched_queries, group_limit, group_by_fields, group_missing_values, max_extra_prefix, max_extra_suffix, field_query_tokens[0].q_include_tokens, topster, filter_result_iterator, sort_order, field_values, geopoint_indices, curated_ids_sorted, excluded_group_ids, 
all_result_ids, all_result_ids_len, groups_processed, collection_name); if (!do_infix_search_op.ok()) { return do_infix_search_op; } filter_result_iterator->reset(); if(!vector_query.field_name.empty()) { // check at least one of sort fields is text match bool has_text_match = false; for(auto& sort_field : sort_fields_std) { if(sort_field.name == sort_field_const::text_match) { has_text_match = true; break; } } if(has_text_match) { // For hybrid search, we need to give weight to text match and vector search constexpr float TEXT_MATCH_WEIGHT = 0.7; constexpr float VECTOR_SEARCH_WEIGHT = 1.0 - TEXT_MATCH_WEIGHT; VectorFilterFunctor filterFunctor(filter_result_iterator); auto& field_vector_index = vector_index.at(vector_query.field_name); std::vector> dist_labels; // use k as 100 by default for ensuring results stability in pagination size_t default_k = 100; auto k = vector_query.k == 0 ? std::max(fetch_size, default_k) : vector_query.k; if(field_vector_index->distance_type == cosine) { std::vector normalized_q(vector_query.values.size()); hnsw_index_t::normalize_vector(vector_query.values, normalized_q); dist_labels = field_vector_index->vecdex->searchKnnCloserFirst(normalized_q.data(), k, &filterFunctor); } else { dist_labels = field_vector_index->vecdex->searchKnnCloserFirst(vector_query.values.data(), k, &filterFunctor); } filter_result_iterator->reset(); std::vector> vec_results; for (const auto& dist_label : dist_labels) { uint32_t seq_id = dist_label.second; auto vec_dist_score = (field_vector_index->distance_type == cosine) ? 
std::abs(dist_label.first) : dist_label.first; if(vec_dist_score > vector_query.distance_threshold) { continue; } vec_results.emplace_back(seq_id, vec_dist_score); } std::sort(vec_results.begin(), vec_results.end(), [](const auto& a, const auto& b) { return a.second < b.second; }); std::vector kvs; if(group_limit != 0) { for(auto& kv_map : topster->group_kv_map) { for(int i = 0; i < kv_map.second->size; i++) { kvs.push_back(kv_map.second->getKV(i)); } } std::sort(kvs.begin(), kvs.end(), Topster::is_greater); } else { topster->sort(); } // Reciprocal rank fusion // Score is sum of (1 / rank_of_document) * WEIGHT from each list (text match and vector search) auto size = (group_limit != 0) ? kvs.size() : topster->size; for(uint32_t i = 0; i < size; i++) { auto result = (group_limit != 0) ? kvs[i] : topster->getKV(i); if(result->match_score_index < 0 || result->match_score_index > 2) { continue; } // (1 / rank_of_document) * WEIGHT) result->text_match_score = result->scores[result->match_score_index]; result->scores[result->match_score_index] = float_to_int64_t((1.0 / (i + 1)) * TEXT_MATCH_WEIGHT); } std::vector vec_search_ids; // list of IDs found only in vector search std::vector eval_filter_indexes; std::vector group_by_field_it_vec; if (group_limit != 0) { group_by_field_it_vec = get_group_by_field_iterators(group_by_fields); } for(size_t res_index = 0; res_index < vec_results.size(); res_index++) { auto& vec_result = vec_results[res_index]; auto seq_id = vec_result.first; filter_result_iterator->skip_to(seq_id); auto references = std::move(filter_result_iterator->reference); filter_result_iterator->reset(); KV* found_kv = nullptr; if(group_limit != 0) { for(auto& kv : kvs) { if(kv->key == seq_id) { found_kv = kv; break; } } } else { auto result_it = topster->kv_map.find(seq_id); if(result_it != topster->kv_map.end()) { found_kv = result_it->second; } } if(found_kv) { if(found_kv->match_score_index < 0 || found_kv->match_score_index > 2) { continue; } // result 
overlaps with keyword search: we have to combine the scores // old_score + (1 / rank_of_document) * WEIGHT) found_kv->vector_distance = vec_result.second; found_kv->text_match_score = found_kv->scores[found_kv->match_score_index]; int64_t match_score = float_to_int64_t( (int64_t_to_float(found_kv->scores[found_kv->match_score_index])) + ((1.0 / (res_index + 1)) * VECTOR_SEARCH_WEIGHT)); int64_t match_score_index = -1; int64_t scores[3] = {0}; auto compute_sort_scores_op = compute_sort_scores(sort_fields_std, sort_order, field_values, geopoint_indices, seq_id, references, eval_filter_indexes, match_score, scores, match_score_index, vec_result.second, collection_name); if (!compute_sort_scores_op.ok()) { return compute_sort_scores_op; } for(int i = 0; i < 3; i++) { found_kv->scores[i] = scores[i]; } found_kv->match_score_index = match_score_index; } else { // Result has been found only in vector search: we have to add it to both KV and result_ids // (1 / rank_of_document) * WEIGHT) int64_t scores[3] = {0}; int64_t match_score = float_to_int64_t((1.0 / (res_index + 1)) * VECTOR_SEARCH_WEIGHT); int64_t match_score_index = -1; auto compute_sort_scores_op = compute_sort_scores(sort_fields_std, sort_order, field_values, geopoint_indices, seq_id, references, eval_filter_indexes, match_score, scores, match_score_index, vec_result.second, collection_name); if (!compute_sort_scores_op.ok()) { return compute_sort_scores_op; } uint64_t distinct_id = seq_id; if (group_limit != 0) { distinct_id = 1; for(auto& kv : group_by_field_it_vec) { get_distinct_id(kv.field_name, kv.it, seq_id, group_missing_values, distinct_id); } if(excluded_group_ids.count(distinct_id) != 0) { continue; } } KV kv(searched_queries.size(), seq_id, distinct_id, match_score_index, scores, std::move(references)); kv.text_match_score = 0; kv.vector_distance = vec_result.second; if (filter_result_iterator->is_valid && !filter_result_iterator->reference.empty()) { // The doc_id must be valid otherwise it 
would've been filtered out upstream. filter_result_iterator->skip_to(seq_id); kv.reference_filter_results = std::move(filter_result_iterator->reference); filter_result_iterator->reset(); } auto ret = topster->add(&kv); vec_search_ids.push_back(seq_id); if(group_limit != 0 && ret < 2) { groups_processed[distinct_id]++; } } } if(!vec_search_ids.empty()) { uint32_t* new_all_result_ids = nullptr; all_result_ids_len = ArrayUtils::or_scalar(all_result_ids, all_result_ids_len, &vec_search_ids[0], vec_search_ids.size(), &new_all_result_ids); delete[] all_result_ids; all_result_ids = new_all_result_ids; } } } /*auto timeMillis0 = std::chrono::duration_cast( std::chrono::high_resolution_clock::now() - begin0).count(); LOG(INFO) << "Time taken for multi-field aggregation: " << timeMillis0 << "ms";*/ } //LOG(INFO) << "topster size: " << topster->size; process_search_results: delete [] exclude_token_ids; delete [] excluded_result_ids; bool estimate_facets = (facet_sample_percent < 100 && all_result_ids_len > facet_sample_threshold); bool is_wildcard_no_filter_query = is_wildcard_query && no_filters_provided; if(!facets.empty()) { const size_t num_threads = 1; const size_t window_size = (num_threads == 0) ? 0 : (all_result_ids_len + num_threads - 1) / num_threads; // rounds up size_t num_processed = 0; std::mutex m_process; std::condition_variable cv_process; // We have to choose between hash and value index: // 1. Group queries -> requires hash index // 2. Wildcard + no filters -> use value index // 3. Very few unique facet values (< 250) -> use value index // 4. 
Result match > 50% std::vector facet_infos(facets.size()); compute_facet_infos(facets, facet_query, facet_query_num_typos, all_result_ids, all_result_ids_len, group_by_fields, group_limit, is_wildcard_no_filter_query, max_candidates, facet_infos, facet_index_type); std::vector> facet_batches(num_threads); for(size_t i = 0; i < num_threads; i++) { for(const auto& this_facet: facets) { facet_batches[i].emplace_back(facet(this_facet.field_name, this_facet.facet_range_map, this_facet.is_range_query, this_facet.is_sort_by_alpha, this_facet.sort_order, this_facet.sort_field)); } } size_t num_queued = 0; size_t result_index = 0; const auto parent_search_begin = search_begin_us; const auto parent_search_stop_ms = search_stop_us; auto parent_search_cutoff = search_cutoff; //auto beginF = std::chrono::high_resolution_clock::now(); for(size_t thread_id = 0; thread_id < num_threads && result_index < all_result_ids_len; thread_id++) { size_t batch_res_len = window_size; if(result_index + window_size > all_result_ids_len) { batch_res_len = all_result_ids_len - result_index; } uint32_t* batch_result_ids = all_result_ids + result_index; num_queued++; thread_pool->enqueue([this, thread_id, &facet_batches, &facet_query, group_limit, group_by_fields, batch_result_ids, batch_res_len, &facet_infos, max_facet_values, is_wildcard_query, no_filters_provided, estimate_facets, facet_sample_percent, group_missing_values, &parent_search_begin, &parent_search_stop_ms, &parent_search_cutoff, &num_processed, &m_process, &cv_process, facet_index_type]() { search_begin_us = parent_search_begin; search_stop_us = parent_search_stop_ms; search_cutoff = parent_search_cutoff; auto fq = facet_query; std::vector> found_docs; do_facets(facet_batches[thread_id], fq, estimate_facets, facet_sample_percent, facet_infos, group_limit, group_by_fields, group_missing_values, batch_result_ids, batch_res_len, max_facet_values, is_wildcard_query, no_filters_provided, facet_index_type); std::unique_lock 
lock(m_process); num_processed++; parent_search_cutoff = parent_search_cutoff || search_cutoff; cv_process.notify_one(); }); result_index += batch_res_len; } std::unique_lock lock_process(m_process); cv_process.wait(lock_process, [&](){ return num_processed == num_queued; }); search_cutoff = parent_search_cutoff; for(auto& facet_batch: facet_batches) { for(size_t fi = 0; fi < facet_batch.size(); fi++) { auto& this_facet = facet_batch[fi]; auto& acc_facet = facets[fi]; acc_facet.is_intersected = this_facet.is_intersected; acc_facet.is_sort_by_alpha = this_facet.is_sort_by_alpha; acc_facet.sort_order = this_facet.sort_order; acc_facet.sort_field = this_facet.sort_field; for(auto & facet_kv: this_facet.result_map) { uint32_t fhash = 0; if(group_limit) { fhash = facet_kv.first; // we have to add all group sets acc_facet.hash_groups[fhash].insert( this_facet.hash_groups[fhash].begin(), this_facet.hash_groups[fhash].end() ); } else { size_t count = 0; if (acc_facet.result_map.count(facet_kv.first) == 0) { // not found, so set it count = facet_kv.second.count; } else { count = acc_facet.result_map[facet_kv.first].count + facet_kv.second.count; } acc_facet.result_map[facet_kv.first].count = count; } acc_facet.result_map[facet_kv.first].doc_id = facet_kv.second.doc_id; acc_facet.result_map[facet_kv.first].array_pos = facet_kv.second.array_pos; acc_facet.result_map[facet_kv.first].sort_field_val = facet_kv.second.sort_field_val; acc_facet.hash_tokens[facet_kv.first] = this_facet.hash_tokens[facet_kv.first]; } for(auto& facet_kv: this_facet.value_result_map) { size_t count = 0; if(acc_facet.value_result_map.count(facet_kv.first) == 0) { // not found, so set it count = facet_kv.second.count; } else { count = acc_facet.value_result_map[facet_kv.first].count + facet_kv.second.count; } acc_facet.value_result_map[facet_kv.first].count = count; acc_facet.value_result_map[facet_kv.first].doc_id = facet_kv.second.doc_id; acc_facet.value_result_map[facet_kv.first].array_pos = 
facet_kv.second.array_pos; acc_facet.fvalue_tokens[facet_kv.first] = this_facet.fvalue_tokens[facet_kv.first]; } if(this_facet.stats.fvcount != 0) { acc_facet.stats.fvcount += this_facet.stats.fvcount; acc_facet.stats.fvsum += this_facet.stats.fvsum; acc_facet.stats.fvmax = std::max(acc_facet.stats.fvmax, this_facet.stats.fvmax); acc_facet.stats.fvmin = std::min(acc_facet.stats.fvmin, this_facet.stats.fvmin); } } } for(auto & acc_facet: facets) { for(auto& facet_kv: acc_facet.result_map) { if(group_limit) { facet_kv.second.count = acc_facet.hash_groups[facet_kv.first].size(); } if(estimate_facets) { facet_kv.second.count = size_t(double(facet_kv.second.count) * (100.0f / facet_sample_percent)); } } for(auto& facet_kv: acc_facet.value_result_map) { if(estimate_facets) { facet_kv.second.count = size_t(double(facet_kv.second.count) * (100.0f / facet_sample_percent)); } } if(estimate_facets) { acc_facet.sampled = true; } } /*long long int timeMillisF = std::chrono::duration_cast( std::chrono::high_resolution_clock::now() - beginF).count(); LOG(INFO) << "Time for faceting: " << timeMillisF;*/ } std::vector> found_docs; std::vector facet_infos(facets.size()); compute_facet_infos(facets, facet_query, facet_query_num_typos, &included_ids_vec[0], included_ids_vec.size(), group_by_fields, group_limit, is_wildcard_no_filter_query, max_candidates, facet_infos, facet_index_type); do_facets(facets, facet_query, estimate_facets, facet_sample_percent, facet_infos, group_limit, group_by_fields, group_missing_values, &included_ids_vec[0], included_ids_vec.size(), max_facet_values, is_wildcard_query, no_filters_provided, facet_index_type); all_result_ids_len += curated_topster->size; delete [] all_result_ids; //LOG(INFO) << "all_result_ids_len " << all_result_ids_len << " for index " << name; //long long int timeMillis = std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - begin).count(); //LOG(INFO) << "Time taken for result calc: " << timeMillis << "ms"; return 
Option(true); } void Index::process_curated_ids(const std::vector>& included_ids, const std::vector& excluded_ids, const std::vector& group_by_fields, const size_t group_limit, const bool group_missing_values, const bool filter_curated_hits, filter_result_iterator_t* const filter_result_iterator, std::set& curated_ids, std::map>& included_ids_map, std::vector& included_ids_vec, std::unordered_set& excluded_group_ids) const { for(const auto& seq_id_pos: included_ids) { included_ids_vec.push_back(seq_id_pos.first); } if(group_limit != 0) { // if one `id` of a group is present in curated hits, we have to exclude that entire group from results auto group_by_field_it_vec = get_group_by_field_iterators(group_by_fields); for(auto seq_id: included_ids_vec) { uint64_t distinct_id = 1; for(auto& kv : group_by_field_it_vec) { get_distinct_id(kv.field_name, kv.it, seq_id, group_missing_values, distinct_id); } excluded_group_ids.emplace(distinct_id); } } std::sort(included_ids_vec.begin(), included_ids_vec.end()); // if `filter_curated_hits` is enabled, we will remove curated hits that don't match filter condition std::set included_ids_set; if(filter_result_iterator->is_valid && filter_curated_hits) { for (const auto &included_id: included_ids_vec) { auto result = filter_result_iterator->valid(included_id); if (result == -1) { break; } if (result == 1) { included_ids_set.insert(included_id); } } } else { included_ids_set.insert(included_ids_vec.begin(), included_ids_vec.end()); } std::map> included_ids_grouped; // pos -> seq_ids std::vector all_positions; for(const auto& seq_id_pos: included_ids) { all_positions.push_back(seq_id_pos.second); if(included_ids_set.count(seq_id_pos.first) == 0) { continue; } included_ids_grouped[seq_id_pos.second].push_back(seq_id_pos.first); } for(const auto& pos_ids: included_ids_grouped) { size_t outer_pos = pos_ids.first; size_t ids_per_pos = std::max(size_t(1), group_limit); auto num_inner_ids = std::min(ids_per_pos, pos_ids.second.size()); 
for(size_t inner_pos = 0; inner_pos < num_inner_ids; inner_pos++) { auto seq_id = pos_ids.second[inner_pos]; included_ids_map[outer_pos][inner_pos] = seq_id; curated_ids.insert(seq_id); } } curated_ids.insert(excluded_ids.begin(), excluded_ids.end()); if(all_positions.size() > included_ids_map.size()) { // Some curated IDs may have been removed via filtering or simply don't exist. // We have to shift lower placed hits upwards to fill those positions. std::sort(all_positions.begin(), all_positions.end()); all_positions.erase(unique(all_positions.begin(), all_positions.end()), all_positions.end()); std::map> new_included_ids_map; auto included_id_it = included_ids_map.begin(); auto all_pos_it = all_positions.begin(); while(included_id_it != included_ids_map.end()) { new_included_ids_map[*all_pos_it] = included_id_it->second; all_pos_it++; included_id_it++; } included_ids_map = new_included_ids_map; } } Option Index::fuzzy_search_fields(const std::vector& the_fields, const std::vector& query_tokens, const std::vector& dropped_tokens, const text_match_type_t match_type, const uint32_t* exclude_token_ids, size_t exclude_token_ids_size, filter_result_iterator_t* const filter_result_iterator, const std::vector& curated_ids, const std::unordered_set& excluded_group_ids, const std::vector & sort_fields, const std::vector& num_typos, std::vector> & searched_queries, tsl::htrie_map& qtoken_set, Topster* topster, spp::sparse_hash_map& groups_processed, uint32_t*& all_result_ids, size_t & all_result_ids_len, const size_t group_limit, const std::vector& group_by_fields, const bool group_missing_values, bool prioritize_exact_match, const bool prioritize_token_position, const bool prioritize_num_matching_fields, std::set& query_hashes, const token_ordering token_order, const std::vector& prefixes, const size_t typo_tokens_threshold, const bool exhaustive_search, const size_t max_candidates, size_t min_len_1typo, size_t min_len_2typo, int syn_orig_num_tokens, const int* sort_order, 
std::array*, 3>& field_values, const std::vector& geopoint_indices, const std::string& collection_name) const { // NOTE: `query_tokens` preserve original tokens, while `search_tokens` could be a result of dropped tokens // To prevent us from doing ART search repeatedly as we iterate through possible corrections spp::sparse_hash_map> token_cost_cache; std::vector> token_to_costs; for(size_t stoken_index=0; stoken_index < query_tokens.size(); stoken_index++) { const std::string& token = query_tokens[stoken_index].value; std::vector all_costs; // This ensures that we don't end up doing a cost of 1 for a single char etc. int bounded_cost = get_bounded_typo_cost(2, token.length(), min_len_1typo, min_len_2typo); for(int cost = 0; cost <= bounded_cost; cost++) { all_costs.push_back(cost); } token_to_costs.push_back(all_costs); } // stores candidates for each token, i.e. i-th index would have all possible tokens with a cost of "c" std::vector token_candidates_vec; std::set unique_tokens; const size_t num_search_fields = std::min(the_fields.size(), (size_t) FIELD_LIMIT_NUM); auto product = []( long long a, std::vector& b ) { return a*b.size(); }; long long n = 0; long long int N = token_to_costs.size() > 30 ? 1 : std::accumulate(token_to_costs.begin(), token_to_costs.end(), 1LL, product); const long long combination_limit = exhaustive_search ? Index::COMBINATION_MAX_LIMIT : Index::COMBINATION_MIN_LIMIT; while(n < N && n < combination_limit) { RETURN_CIRCUIT_BREAKER_OP //LOG(INFO) << "fuzzy_search_fields, n: " << n; // Outerloop generates combinations of [cost to max_cost] for each token // For e.g. for a 3-token query: [0, 0, 0], [0, 0, 1], [0, 1, 1] etc. 
std::vector costs(token_to_costs.size()); ldiv_t q { n, 0 }; for(long long i = (token_to_costs.size() - 1); 0 <= i ; --i ) { q = ldiv(q.quot, token_to_costs[i].size()); costs[i] = token_to_costs[i][q.rem]; } unique_tokens.clear(); token_candidates_vec.clear(); size_t token_index = 0; while(token_index < query_tokens.size()) { // For each token, look up the generated cost for this iteration and search using that cost const std::string& token = query_tokens[token_index].value; const std::string token_cost_hash = token + std::to_string(costs[token_index]); std::vector leaf_tokens; if(token_cost_cache.count(token_cost_hash) != 0) { leaf_tokens = token_cost_cache[token_cost_hash]; } else { //auto begin = std::chrono::high_resolution_clock::now(); // Prefix query with a preceding token should be handled in such a way that we give preference to // possible phrase continuation. Example: "steve j" for "steve jobs" name field query. To do this, // we will first attempt to match the prefix with the most "popular" fields of the preceding token. // Tokens matched from popular fields will also be searched across other query fields. // Only when we find *no results* for such an expansion, we will attempt cross field matching. bool last_token = query_tokens.size() > 1 && dropped_tokens.empty() && (token_index == (query_tokens.size() - 1)); std::vector query_field_ids(num_search_fields); for(size_t field_id = 0; field_id < num_search_fields; field_id++) { query_field_ids[field_id] = the_fields[field_id].orig_index; } std::vector popular_field_ids; // fields containing the token most across documents if(last_token) { popular_fields_of_token(search_index, token_candidates_vec.back().candidates[0], the_fields, num_search_fields, popular_field_ids); if(popular_field_ids.empty()) { break; } } const std::vector& field_ids = last_token ? 
popular_field_ids : query_field_ids; for(size_t field_id: field_ids) { // NOTE: when accessing other field ordered properties like prefixes or num_typos we have to index // them by `the_field.orig_index` since the original fields could be reordered on their weights. auto& the_field = the_fields[field_id]; const bool field_prefix = (the_field.orig_index < prefixes.size()) ? prefixes[the_field.orig_index] : prefixes[0]; const bool prefix_search = field_prefix && query_tokens[token_index].is_prefix_searched; const size_t token_len = prefix_search ? (int) token.length() : (int) token.length() + 1; /*LOG(INFO) << "Searching for field: " << the_field.name << ", token:" << token << " - cost: " << costs[token_index] << ", prefix_search: " << prefix_search;*/ int64_t field_num_typos = (the_field.orig_index < num_typos.size()) ? num_typos[the_field.orig_index] : num_typos[0]; auto& locale = search_schema.at(the_field.name).locale; if(locale != "" && (locale == "zh" || locale == "ko" || locale == "ja")) { // disable fuzzy trie traversal for CJK locales field_num_typos = 0; } if(costs[token_index] > field_num_typos) { continue; } //LOG(INFO) << "Searching for field: " << the_field.name << ", found token:" << token; const auto& prev_token = last_token ? 
token_candidates_vec.back().candidates[0] : ""; std::vector field_leaves; art_fuzzy_search_i(search_index.at(the_field.name), (const unsigned char *) token.c_str(), token_len, costs[token_index], costs[token_index], max_candidates, token_order, prefix_search, last_token, prev_token, filter_result_iterator, field_leaves, unique_tokens); filter_result_iterator->reset(); /*auto timeMillis = std::chrono::duration_cast( std::chrono::high_resolution_clock::now() - begin).count(); LOG(INFO) << "Time taken for fuzzy search: " << timeMillis << "ms";*/ if(field_leaves.empty()) { // look at the next field continue; } for(size_t i = 0; i < field_leaves.size(); i++) { auto leaf = field_leaves[i]; std::string tok(reinterpret_cast(leaf->key), leaf->key_len - 1); leaf_tokens.push_back(tok); } token_cost_cache.emplace(token_cost_hash, leaf_tokens); if(leaf_tokens.size() >= max_candidates) { goto token_done; } } if(last_token && leaf_tokens.size() < max_candidates) { // field-wise matching with previous token has failed, have to look at cross fields matching docs std::vector prev_token_doc_ids; find_across_fields(token_candidates_vec.back().token, token_candidates_vec.back().candidates[0], the_fields, num_search_fields, filter_result_iterator, exclude_token_ids, exclude_token_ids_size, prev_token_doc_ids, popular_field_ids); filter_result_iterator->reset(); for(size_t field_id: query_field_ids) { auto& the_field = the_fields[field_id]; const bool field_prefix = (the_field.orig_index < prefixes.size()) ? prefixes[the_field.orig_index] : prefixes[0];; const bool prefix_search = field_prefix && query_tokens[token_index].is_prefix_searched; const size_t token_len = prefix_search ? (int) token.length() : (int) token.length() + 1; int64_t field_num_typos = (the_field.orig_index < num_typos.size()) ? 
num_typos[the_field.orig_index] : num_typos[0]; auto& locale = search_schema.at(the_field.name).locale; if(locale != "" && locale != "en" && locale != "th" && !Tokenizer::is_cyrillic(locale)) { // disable fuzzy trie traversal for non-english locales field_num_typos = 0; } if(costs[token_index] > field_num_typos) { continue; } std::vector field_leaves; art_fuzzy_search_i(search_index.at(the_field.name), (const unsigned char *) token.c_str(), token_len, costs[token_index], costs[token_index], max_candidates, token_order, prefix_search, false, "", filter_result_iterator, field_leaves, unique_tokens); filter_result_iterator->reset(); if(field_leaves.empty()) { // look at the next field continue; } for(size_t i = 0; i < field_leaves.size(); i++) { auto leaf = field_leaves[i]; std::string tok(reinterpret_cast(leaf->key), leaf->key_len - 1); leaf_tokens.push_back(tok); } token_cost_cache.emplace(token_cost_hash, leaf_tokens); if(leaf_tokens.size() >= max_candidates) { goto token_done; } } } } token_done: if(!leaf_tokens.empty()) { //log_leaves(costs[token_index], token, leaves); token_candidates_vec.push_back(tok_candidates{query_tokens[token_index], costs[token_index], query_tokens[token_index].is_prefix_searched, leaf_tokens}); } else { // No result at `cost = costs[token_index]`. Remove `cost` for token and re-do combinations auto it = std::find(token_to_costs[token_index].begin(), token_to_costs[token_index].end(), costs[token_index]); if(it != token_to_costs[token_index].end()) { token_to_costs[token_index].erase(it); // when no more costs are left for this token if(token_to_costs[token_index].empty()) { // we cannot proceed further, as this token is not found within cost limits // and, dropping of tokens are done elsewhere. 
return Option(true); } } // Continue outerloop on new cost combination n = -1; N = std::accumulate(token_to_costs.begin(), token_to_costs.end(), 1LL, product); goto resume_typo_loop; } token_index++; } if(token_candidates_vec.size() == query_tokens.size()) { std::vector id_buff; auto search_all_candidates_op = search_all_candidates(num_search_fields, match_type, the_fields, filter_result_iterator, exclude_token_ids, exclude_token_ids_size, excluded_group_ids, sort_fields, token_candidates_vec, searched_queries, qtoken_set, dropped_tokens, topster, groups_processed, all_result_ids, all_result_ids_len, typo_tokens_threshold, group_limit, group_by_fields, group_missing_values, query_tokens, num_typos, prefixes, prioritize_exact_match, prioritize_token_position, prioritize_num_matching_fields, exhaustive_search, max_candidates, syn_orig_num_tokens, sort_order, field_values, geopoint_indices, query_hashes, id_buff, collection_name); if (!search_all_candidates_op.ok()) { return search_all_candidates_op; } if(id_buff.size() > 1) { gfx::timsort(id_buff.begin(), id_buff.end()); id_buff.erase(std::unique( id_buff.begin(), id_buff.end() ), id_buff.end()); } uint32_t* new_all_result_ids = nullptr; all_result_ids_len = ArrayUtils::or_scalar(all_result_ids, all_result_ids_len, &id_buff[0], id_buff.size(), &new_all_result_ids); delete[] all_result_ids; all_result_ids = new_all_result_ids; } resume_typo_loop: if(!exhaustive_search && all_result_ids_len >= typo_tokens_threshold) { // if typo threshold is breached, we are done return Option(true); } n++; } return Option(true); } void Index::popular_fields_of_token(const spp::sparse_hash_map& search_index, const std::string& previous_token, const std::vector& the_fields, const size_t num_search_fields, std::vector& popular_field_ids) { const auto token_c_str = (const unsigned char*) previous_token.c_str(); const int token_len = (int) previous_token.size() + 1; std::vector> field_id_doc_counts; for(size_t i = 0; i < num_search_fields; 
i++) { const std::string& field_name = the_fields[i].name; auto leaf = static_cast(art_search(search_index.at(field_name), token_c_str, token_len)); if(!leaf) { continue; } auto num_docs = posting_t::num_ids(leaf->values); field_id_doc_counts.emplace_back(i, num_docs); } std::sort(field_id_doc_counts.begin(), field_id_doc_counts.end(), [](const auto& p1, const auto& p2) { return p1.second > p2.second; }); for(const auto& field_id_doc_count: field_id_doc_counts) { popular_field_ids.push_back(field_id_doc_count.first); } } void Index::find_across_fields(const token_t& previous_token, const std::string& previous_token_str, const std::vector& the_fields, const size_t num_search_fields, filter_result_iterator_t* const filter_result_iterator, const uint32_t* exclude_token_ids, size_t exclude_token_ids_size, std::vector& prev_token_doc_ids, std::vector& top_prefix_field_ids) const { // one iterator for each token, each underlying iterator contains results of token across multiple fields std::vector token_its; // used to track plists that must be destructed once done std::vector expanded_plists; result_iter_state_t istate(exclude_token_ids, exclude_token_ids_size, filter_result_iterator); const bool prefix_search = previous_token.is_prefix_searched; const uint32_t token_num_typos = previous_token.num_typos; const bool token_prefix = previous_token.is_prefix_searched; auto& token_str = previous_token_str; auto token_c_str = (const unsigned char*) token_str.c_str(); const size_t token_len = token_str.size() + 1; std::vector its; std::vector> field_id_doc_counts; for(size_t i = 0; i < num_search_fields; i++) { const std::string& field_name = the_fields[i].name; art_tree* tree = search_index.at(field_name); art_leaf* leaf = static_cast(art_search(tree, token_c_str, token_len)); if(!leaf) { continue; } /*LOG(INFO) << "Token: " << token_str << ", field_name: " << field_name << ", num_ids: " << posting_t::num_ids(leaf->values);*/ if(IS_COMPACT_POSTING(leaf->values)) { auto 
compact_posting_list = COMPACT_POSTING_PTR(leaf->values); posting_list_t* full_posting_list = compact_posting_list->to_full_posting_list(); expanded_plists.push_back(full_posting_list); its.push_back(full_posting_list->new_iterator(nullptr, nullptr, i)); // moved, not copied } else { posting_list_t* full_posting_list = (posting_list_t*)(leaf->values); its.push_back(full_posting_list->new_iterator(nullptr, nullptr, i)); // moved, not copied } field_id_doc_counts.emplace_back(i, posting_t::num_ids(leaf->values)); } if(its.empty()) { // this token does not have any match across *any* field: probably a typo LOG(INFO) << "No matching field found for token: " << token_str; return; } std::sort(field_id_doc_counts.begin(), field_id_doc_counts.end(), [](const auto& p1, const auto& p2) { return p1.second > p2.second; }); for(auto& field_id_doc_count: field_id_doc_counts) { top_prefix_field_ids.push_back(field_id_doc_count.first); } or_iterator_t token_fields(its); token_its.push_back(std::move(token_fields)); or_iterator_t::intersect(token_its, istate, [&](const single_filter_result_t& filter_result, const std::vector& its) { auto& seq_id = filter_result.seq_id; prev_token_doc_ids.push_back(seq_id); }); for(posting_list_t* plist: expanded_plists) { delete plist; } } Option Index::search_across_fields(const std::vector& query_tokens, const std::vector& num_typos, const std::vector& prefixes, const std::vector& the_fields, const size_t num_search_fields, const text_match_type_t match_type, const std::vector& sort_fields, Topster* topster, spp::sparse_hash_map& groups_processed, std::vector>& searched_queries, tsl::htrie_map& qtoken_set, const std::vector& dropped_tokens, const size_t group_limit, const std::vector& group_by_fields, const bool group_missing_values, const bool prioritize_exact_match, const bool prioritize_token_position, const bool prioritize_num_matching_fields, filter_result_iterator_t* const filter_result_iterator, const uint32_t total_cost, const int 
syn_orig_num_tokens, const uint32_t* exclude_token_ids, size_t exclude_token_ids_size, const std::unordered_set& excluded_group_ids, const int* sort_order, std::array*, 3>& field_values, const std::vector& geopoint_indices, std::vector& id_buff, uint32_t*& all_result_ids, size_t& all_result_ids_len, const std::string& collection_name) const { std::vector query_suggestion; // one or_iterator for each token (across multiple fields) std::vector dropped_token_its; // used to track plists that must be destructed once done std::vector expanded_dropped_plists; for(auto& dropped_token: dropped_tokens) { auto& token = dropped_token.value; auto token_c_str = (const unsigned char*) token.c_str(); // convert token from each field into an or_iterator std::vector its; for(size_t i = 0; i < the_fields.size(); i++) { const std::string& field_name = the_fields[i].name; art_tree* tree = search_index.at(field_name); art_leaf* leaf = static_cast(art_search(tree, token_c_str, token.size()+1)); if(!leaf) { continue; } /*LOG(INFO) << "Token: " << token << ", field_name: " << field_name << ", num_ids: " << posting_t::num_ids(leaf->values);*/ if(IS_COMPACT_POSTING(leaf->values)) { auto compact_posting_list = COMPACT_POSTING_PTR(leaf->values); posting_list_t* full_posting_list = compact_posting_list->to_full_posting_list(); expanded_dropped_plists.push_back(full_posting_list); its.push_back(full_posting_list->new_iterator(nullptr, nullptr, i)); // moved, not copied } else { posting_list_t* full_posting_list = (posting_list_t*)(leaf->values); its.push_back(full_posting_list->new_iterator(nullptr, nullptr, i)); // moved, not copied } } or_iterator_t token_fields(its); dropped_token_its.push_back(std::move(token_fields)); } // one iterator for each token, each underlying iterator contains results of token across multiple fields std::vector token_its; // used to track plists that must be destructed once done std::vector expanded_plists; result_iter_state_t istate(exclude_token_ids, 
exclude_token_ids_size, filter_result_iterator); // for each token, find the posting lists across all query_by fields for(size_t ti = 0; ti < query_tokens.size(); ti++) { const uint32_t token_num_typos = query_tokens[ti].num_typos; const bool token_prefix = query_tokens[ti].is_prefix_searched; auto& token_str = query_tokens[ti].value; auto token_c_str = (const unsigned char*) token_str.c_str(); const size_t token_len = token_str.size() + 1; std::vector its; for(size_t i = 0; i < num_search_fields; i++) { const std::string& field_name = the_fields[i].name; const uint32_t field_num_typos = (the_fields[i].orig_index < num_typos.size()) ? num_typos[the_fields[i].orig_index] : num_typos[0]; const bool field_prefix = (the_fields[i].orig_index < prefixes.size()) ? prefixes[the_fields[i].orig_index] : prefixes[0]; if(token_num_typos > field_num_typos) { // since the token can come from any field, we still have to respect per-field num_typos continue; } if(token_prefix && !field_prefix) { // even though this token is an outcome of prefix search, we can't use it for this field, since // this field has prefix search disabled. 
continue; } art_tree* tree = search_index.at(field_name); art_leaf* leaf = static_cast(art_search(tree, token_c_str, token_len)); if(!leaf) { continue; } query_suggestion.push_back(leaf); /*LOG(INFO) << "Token: " << token_str << ", field_name: " << field_name << ", num_ids: " << posting_t::num_ids(leaf->values);*/ if(IS_COMPACT_POSTING(leaf->values)) { auto compact_posting_list = COMPACT_POSTING_PTR(leaf->values); posting_list_t* full_posting_list = compact_posting_list->to_full_posting_list(); expanded_plists.push_back(full_posting_list); its.push_back(full_posting_list->new_iterator(nullptr, nullptr, i)); // moved, not copied } else { posting_list_t* full_posting_list = (posting_list_t*)(leaf->values); its.push_back(full_posting_list->new_iterator(nullptr, nullptr, i)); // moved, not copied } } if(its.empty()) { // this token does not have any match across *any* field: probably a typo LOG(INFO) << "No matching field found for token: " << token_str; continue; } or_iterator_t token_fields(its); token_its.push_back(std::move(token_fields)); } std::vector result_ids; std::vector eval_filter_indexes; Option status(true); auto group_by_field_it_vec = get_group_by_field_iterators(group_by_fields); or_iterator_t::intersect(token_its, istate, [&](single_filter_result_t& filter_result, const std::vector& its) { auto& seq_id = filter_result.seq_id; auto references = std::move(filter_result.reference_filter_results); //LOG(INFO) << "seq_id: " << seq_id; // Convert [token -> fields] orientation to [field -> tokens] orientation std::vector> field_to_tokens(num_search_fields); for(size_t ti = 0; ti < its.size(); ti++) { const or_iterator_t& token_fields_iters = its[ti]; const std::vector& field_iters = token_fields_iters.get_its(); for(size_t fi = 0; fi < field_iters.size(); fi++) { const posting_list_t::iterator_t& field_iter = field_iters[fi]; if(field_iter.id() == seq_id) { // not all fields might contain a given token 
field_to_tokens[field_iter.get_field_id()].push_back(field_iter.clone()); } } } size_t query_len = query_tokens.size(); // check if seq_id exists in any of the dropped_token iters for(size_t ti = 0; ti < dropped_token_its.size(); ti++) { or_iterator_t& token_fields_iters = dropped_token_its[ti]; if(token_fields_iters.skip_to(seq_id) && token_fields_iters.id() == seq_id) { query_len++; const std::vector& field_iters = token_fields_iters.get_its(); for(size_t fi = 0; fi < field_iters.size(); fi++) { const posting_list_t::iterator_t& field_iter = field_iters[fi]; if(field_iter.id() == seq_id) { // not all fields might contain a given token field_to_tokens[field_iter.get_field_id()].push_back(field_iter.clone()); } } } } if(syn_orig_num_tokens != -1) { query_len = syn_orig_num_tokens; } int64_t best_field_match_score = 0, best_field_weight = 0; uint32_t num_matching_fields = 0; for(size_t fi = 0; fi < field_to_tokens.size(); fi++) { const std::vector& token_postings = field_to_tokens[fi]; if(token_postings.empty()) { continue; } const int64_t field_weight = the_fields[fi].weight; const bool field_is_array = search_schema.at(the_fields[fi].name).is_array(); int64_t field_match_score = 0; bool single_exact_query_token = false; if(total_cost == 0 && query_tokens.size() == 1) { // does this candidate suggestion token match query token exactly? 
single_exact_query_token = true; } score_results2(sort_fields, searched_queries.size(), fi, field_is_array, total_cost, field_match_score, seq_id, sort_order, prioritize_exact_match, single_exact_query_token, prioritize_token_position, query_tokens.size(), syn_orig_num_tokens, token_postings); if(match_type == max_score && field_match_score > best_field_match_score) { best_field_match_score = field_match_score; best_field_weight = field_weight; } if(match_type == max_weight && field_weight > best_field_weight) { best_field_weight = field_weight; best_field_match_score = field_match_score; } num_matching_fields++; } uint64_t distinct_id = seq_id; if(group_limit != 0) { distinct_id = 1; for(auto& kv : group_by_field_it_vec) { get_distinct_id(kv.field_name, kv.it, seq_id, group_missing_values, distinct_id); } if(excluded_group_ids.count(distinct_id) != 0) { return; } } int64_t scores[3] = {0}; int64_t match_score_index = -1; auto compute_sort_scores_op = compute_sort_scores(sort_fields, sort_order, field_values, geopoint_indices, seq_id, references, eval_filter_indexes, best_field_match_score, scores, match_score_index, 0, collection_name); if (!compute_sort_scores_op.ok()) { status = Option(compute_sort_scores_op.code(), compute_sort_scores_op.error()); return; } query_len = std::min(15, query_len); // NOTE: `query_len` is total tokens matched across fields. // Within a field, only a subset can match // MAX_SCORE // [ sign | tokens_matched | max_field_score | max_field_weight | num_matching_fields ] // [ 1 | 4 | 48 | 8 | 3 ] (64 bits) // MAX_WEIGHT // [ sign | tokens_matched | max_field_weight | max_field_score | num_matching_fields ] // [ 1 | 4 | 8 | 48 | 3 ] (64 bits) auto max_field_weight = std::min(FIELD_MAX_WEIGHT, best_field_weight); num_matching_fields = std::min(7, num_matching_fields); if(!prioritize_num_matching_fields) { num_matching_fields = 0; } uint64_t aggregated_score = match_type == max_score ? 
((int64_t(query_len) << 59) | (int64_t(best_field_match_score) << 11) | (int64_t(max_field_weight) << 3) | (int64_t(num_matching_fields) << 0)) : ((int64_t(query_len) << 59) | (int64_t(max_field_weight) << 51) | (int64_t(best_field_match_score) << 3) | (int64_t(num_matching_fields) << 0)) ; /*LOG(INFO) << "seq_id: " << seq_id << ", query_len: " << query_len << ", syn_orig_num_tokens: " << syn_orig_num_tokens << ", best_field_match_score: " << best_field_match_score << ", max_field_weight: " << max_field_weight << ", num_matching_fields: " << num_matching_fields << ", aggregated_score: " << aggregated_score;*/ KV kv(searched_queries.size(), seq_id, distinct_id, match_score_index, scores, std::move(references)); if(match_score_index != -1) { kv.scores[match_score_index] = aggregated_score; kv.text_match_score = aggregated_score; } int ret = topster->add(&kv); if(group_limit != 0 && ret < 2) { groups_processed[distinct_id]++; } result_ids.push_back(seq_id); }); if (!status.ok()) { for(posting_list_t* plist: expanded_plists) { delete plist; } for(posting_list_t* plist: expanded_dropped_plists) { delete plist; } return status; } id_buff.insert(id_buff.end(), result_ids.begin(), result_ids.end()); if(id_buff.size() > 100000) { // prevents too many ORs during exhaustive searching gfx::timsort(id_buff.begin(), id_buff.end()); id_buff.erase(std::unique( id_buff.begin(), id_buff.end() ), id_buff.end()); uint32_t* new_all_result_ids = nullptr; all_result_ids_len = ArrayUtils::or_scalar(all_result_ids, all_result_ids_len, &id_buff[0], id_buff.size(), &new_all_result_ids); delete[] all_result_ids; all_result_ids = new_all_result_ids; id_buff.clear(); } if(!result_ids.empty()) { searched_queries.push_back(query_suggestion); for(const auto& qtoken: query_tokens) { qtoken_set.insert(qtoken.value, token_leaf(nullptr, qtoken.root_len, qtoken.num_typos, qtoken.is_prefix_searched)); } } for(posting_list_t* plist: expanded_plists) { delete plist; } for(posting_list_t* plist: 
expanded_dropped_plists) {
        delete plist;
    }

    return Option(true);
}

/**
 * Computes the sort scores of one document for (at most) the first three `sort_by` fields.
 *
 * @param sort_fields            The `sort_by` clauses; only the first three are scored.
 * @param sort_order             Per-slot multiplier; a value of -1 negates the computed score.
 * @param field_values           Per-slot value lookup (doc id -> value) or the address of one of the static
 *                               sentinel maps that selects a special scoring mode (text match, seq_id,
 *                               geo distance, string rank, eval, vector distance). Taken by value because the
 *                               geo slots are re-pointed at `geo_sentinel_value` locally.
 * @param geopoint_indices       Slots of `sort_fields` that sort by geo distance.
 * @param seq_id                 Document being scored.
 * @param references             Reference (join) results of this document, keyed by collection name.
 * @param filter_indexes         Cursor state for eval sorting, carried across calls by the caller.
 * @param max_field_match_score  Text match score written to the slot that sorts by text match.
 * @param scores                 Output array with (at least) three slots.
 * @param match_score_index      Set to the slot that sorts by text match; left untouched otherwise.
 * @param vector_distance        Distance written to the slot that sorts by vector distance.
 * @param collection_name        Name of the collection being searched (used to resolve references).
 * @return `Option(true)` on success, or a 400 error when a reference `sort_by` cannot be resolved.
 *
 * A reference `sort_by` is scored against the referenced document. That id is resolved per sort field and
 * never written back to `seq_id`, so a reference in one slot cannot change which document the remaining
 * slots are scored against.
 */
Option Index::compute_sort_scores(const std::vector& sort_fields, const int* sort_order,
                                  std::array*, 3> field_values,
                                  const std::vector& geopoint_indices,
                                  uint32_t seq_id, const std::map, reference_filter_result_t>& references,
                                  std::vector& filter_indexes, int64_t max_field_match_score, int64_t* scores,
                                  int64_t& match_score_index, float vector_distance,
                                  const std::string& collection_name) const {

    int64_t geopoint_distances[3] = {0, 0, 0};

    for(auto& i: geopoint_indices) {
        auto* geopoints = field_values[i];
        int64_t dist = INT32_MAX;

        S2LatLng reference_lat_lng;
        GeoPoint::unpack_lat_lng(sort_fields[i].geopoint, reference_lat_lng);

        if(geopoints != nullptr) {
            auto it = geopoints->find(seq_id);

            if(it != geopoints->end()) {
                int64_t packed_latlng = it->second;
                S2LatLng s2_lat_lng;
                GeoPoint::unpack_lat_lng(packed_latlng, s2_lat_lng);
                dist = GeoPoint::distance(s2_lat_lng, reference_lat_lng);
            }
        } else {
            // indicates geo point array
            auto field_it = geo_array_index.at(sort_fields[i].name);
            auto it = field_it->find(seq_id);

            if(it != field_it->end()) {
                int64_t* latlngs = it->second;
                // first element holds the number of packed points that follow it
                const auto num_latlngs = static_cast<size_t>(latlngs[0]);

                for(size_t li = 0; li < num_latlngs; li++) {
                    S2LatLng s2_lat_lng;
                    int64_t packed_latlng = latlngs[li + 1];
                    GeoPoint::unpack_lat_lng(packed_latlng, s2_lat_lng);
                    int64_t this_dist = GeoPoint::distance(s2_lat_lng, reference_lat_lng);
                    if(this_dist < dist) {
                        dist = this_dist;
                    }
                }
            }
        }

        if(dist < sort_fields[i].exclude_radius) {
            dist = 0;
        }

        if(sort_fields[i].geo_precision > 0) {
            // round the distance up to the next multiple of `geo_precision`
            dist = dist + sort_fields[i].geo_precision - 1 -
                   (dist + sort_fields[i].geo_precision - 1) % sort_fields[i].geo_precision;
        }

        geopoint_distances[i] = dist;

        // Swap (id -> latlong) index to (id -> distance) index
        field_values[i] = &geo_sentinel_value;
    }

    const int64_t default_score = INT64_MIN;  // to handle field that doesn't exist in document (e.g. optional)

    // `scores`, `sort_order` and `field_values` only have three slots
    const size_t num_sort_fields = std::min(sort_fields.size(), size_t(3));

    for(size_t i = 0; i < num_sort_fields; i++) {
        const auto& sort_field = sort_fields[i];

        // Doc id that is scored for this slot: the searched doc, unless the sort field is a reference.
        uint32_t sort_seq_id = seq_id;

        // In case of reference sort_by, we need to get the sort score of the reference doc id.
        if (!sort_field.reference_collection_name.empty()) {
            auto const& ref_collection_name = sort_field.reference_collection_name;

            // Built lazily: this function runs once per matching doc and the messages are only needed on error.
            auto const multiple_references_error_message = [&]() {
                return "Multiple references found to sort by on `" + ref_collection_name + "." +
                       sort_field.name + "`.";
            };
            auto const no_references_error_message = [&]() {
                return "No references found to sort by on `" + ref_collection_name + "." +
                       sort_field.name + "`.";
            };

            auto const ref_it = references.find(ref_collection_name);

            // Joined on ref collection
            if (ref_it != references.end()) {
                auto const& count = ref_it->second.count;
                if (count == 1) {
                    sort_seq_id = ref_it->second.docs[0];
                } else {
                    return Option(400, count > 1 ? multiple_references_error_message() :
                                                   no_references_error_message());
                }
            } else {
                auto& cm = CollectionManager::get_instance();
                auto ref_collection = cm.get_collection(ref_collection_name);
                if (ref_collection == nullptr) {
                    return Option(400, "Referenced collection `" + ref_collection_name + "` in `sort_by` not found.");
                }

                // Current collection has a reference.
                if (ref_collection->is_referenced_in(collection_name)) {
                    auto get_reference_field_op = ref_collection->get_reference_field(collection_name);
                    if (!get_reference_field_op.ok()) {
                        return Option(get_reference_field_op.code(), get_reference_field_op.error());
                    }
                    auto const& field_name = get_reference_field_op.get();

                    auto const ref_index_it = sort_index.find(field_name);
                    if (ref_index_it == sort_index.end()) {
                        return Option(400, "Could not find a reference for doc " + std::to_string(seq_id));
                    }

                    auto const ref_doc_it = ref_index_it->second->find(seq_id);
                    if (ref_doc_it == ref_index_it->second->end()) {
                        return Option(400, "Could not find a reference for doc " + std::to_string(seq_id));
                    }

                    sort_seq_id = ref_doc_it->second;
                }
                // Joined collection has a reference
                else {
                    std::string joined_coll_having_reference;
                    for (const auto &reference: references) {
                        if (ref_collection->is_referenced_in(reference.first)) {
                            joined_coll_having_reference = reference.first;
                            break;
                        }
                    }

                    if (joined_coll_having_reference.empty()) {
                        return Option(400, no_references_error_message());
                    }

                    auto joined_collection = cm.get_collection(joined_coll_having_reference);
                    if (joined_collection == nullptr) {
                        return Option(400, "Referenced collection `" + joined_coll_having_reference +
                                           "` in `sort_by` not found.");
                    }

                    auto reference_field_name_op = ref_collection->get_reference_field(joined_coll_having_reference);
                    if (!reference_field_name_op.ok()) {
                        return Option(reference_field_name_op.code(), reference_field_name_op.error());
                    }

                    auto const& reference_field_name = reference_field_name_op.get();
                    auto const& reference = references.at(joined_coll_having_reference);
                    auto const& count = reference.count;

                    if (count == 1) {
                        auto op = joined_collection->get_sort_indexed_field_value(reference_field_name,
                                                                                  reference.docs[0]);
                        if (!op.ok()) {
                            return Option(op.code(), op.error());
                        }

                        sort_seq_id = op.get();
                    } else {
                        return Option(400, count > 1 ? multiple_references_error_message() :
                                                       no_references_error_message());
                    }
                }
            }
        }

        auto* values = field_values[i];
        int64_t& score = scores[i];

        if (values == &text_match_sentinel_value) {
            score = int64_t(max_field_match_score);
            match_score_index = int64_t(i);
        } else if (values == &seq_id_sentinel_value) {
            score = sort_seq_id;
        } else if(values == &geo_sentinel_value) {
            score = geopoint_distances[i];
        } else if(values == &str_sentinel_value) {
            if (sort_field.reference_collection_name.empty()) {
                score = str_sort_index.at(sort_field.name)->rank(sort_seq_id);
            } else {
                auto& cm = CollectionManager::get_instance();
                auto ref_collection = cm.get_collection(sort_field.reference_collection_name);
                if (ref_collection == nullptr) {
                    return Option(400, "Referenced collection `" + sort_field.reference_collection_name +
                                       "` not found.");
                }

                score = ref_collection->reference_string_sort_score(sort_field.name, sort_seq_id);
            }

            if(score == adi_tree_t::NOT_FOUND) {
                // flip the "not found" rank when the requested placement of missing values requires it
                const bool asc_missing_first = sort_field.order == sort_field_const::asc &&
                                               sort_field.missing_values == sort_by::missing_values_t::first;
                const bool desc_missing_last = sort_field.order == sort_field_const::desc &&
                                               sort_field.missing_values == sort_by::missing_values_t::last;

                if(asc_missing_first || desc_missing_last) {
                    score = -score;
                }
            }
        } else if(values == &eval_sentinel_value) {
            auto const& count = sort_field.eval_expressions.size();
            if (filter_indexes.empty()) {
                filter_indexes.assign(count, 0);
            }

            bool found = false;
            uint32_t index = 0;
            auto const& eval = sort_field.eval;

            for (; index < count; index++) {
                auto& filter_index = filter_indexes[index];
                auto const& eval_ids = eval.eval_ids_vec[index];
                auto const& eval_ids_count = eval.eval_ids_count_vec[index];

                if (filter_index == 0 || filter_index < eval_ids_count) {
                    // Returns iterator to the first element that is >= to value or last if no such element is found.
                    filter_index = std::lower_bound(eval_ids + filter_index, eval_ids + eval_ids_count,
                                                    sort_seq_id) - eval_ids;

                    if (filter_index < eval_ids_count && eval_ids[filter_index] == sort_seq_id) {
                        filter_index++;
                        found = true;
                        break;
                    }
                }
            }

            score = found ? eval.scores[index] : 0;
        } else if(values == &vector_distance_sentinel_value) {
            score = float_to_int64_t(vector_distance);
        } else {
            auto it = values->find(sort_seq_id);
            score = (it == values->end()) ? default_score : it->second;

            if(score == INT64_MIN && sort_field.missing_values == sort_by::missing_values_t::first) {
                // By default, missing numerical value are always going to be sorted to be at the end
                // because: -INT64_MIN == INT64_MIN. To account for missing values config, we will have to change
                // the default for missing value based on whether it's asc or desc sort.
                bool is_asc = (sort_order[i] == -1);
                score = is_asc ? (INT64_MIN + 1) : INT64_MAX;
            }
        }

        if (sort_order[i] == -1) {
            score = -score;
        }
    }

    return Option(true);
}

Option Index::do_phrase_search(const size_t num_search_fields, const std::vector& search_fields,
                               std::vector& field_query_tokens,
                               const std::vector& sort_fields,
                               std::vector>& searched_queries, const size_t group_limit,
                               const std::vector& group_by_fields,
                               const bool group_missing_values,
                               Topster* actual_topster,
                               const int sort_order[3],
                               std::array*, 3> field_values,
                               const std::vector& geopoint_indices,
                               const std::vector& curated_ids_sorted,
                               filter_result_iterator_t*& filter_result_iterator,
                               uint32_t*& all_result_ids, size_t& all_result_ids_len,
                               spp::sparse_hash_map& groups_processed,
                               const std::set& curated_ids,
                               const uint32_t* excluded_result_ids, size_t excluded_result_ids_size,
                               const std::unordered_set& excluded_group_ids,
                               Topster* curated_topster,
                               const std::map>& included_ids_map,
                               bool is_wildcard_query,
                               const std::string& collection_name) const {

    uint32_t* phrase_result_ids = nullptr;
    uint32_t phrase_result_count = 0;
    std::map phrase_match_id_scores;

    for(size_t i = 0; i < num_search_fields; i++) {
        const std::string& field_name = search_fields[i].name;
        const size_t field_weight = search_fields[i].weight;
        bool is_array = search_schema.at(field_name).is_array();

        uint32_t* field_phrase_match_ids = nullptr;
        size_t
field_phrase_match_ids_size = 0; for(const auto& phrase: field_query_tokens[i].q_phrases) { std::vector posting_lists; for(const std::string& token: phrase) { art_leaf* leaf = (art_leaf *) art_search(search_index.at(field_name), (const unsigned char *) token.c_str(), token.size() + 1); if(leaf) { posting_lists.push_back(leaf->values); } } if(posting_lists.size() != phrase.size()) { // unmatched length means no matches will be found for this phrase, so skip to next phrase continue; } std::vector contains_ids; posting_t::intersect(posting_lists, contains_ids); uint32_t* this_phrase_ids = new uint32_t[contains_ids.size()]; size_t this_phrase_ids_size = 0; posting_t::get_phrase_matches(posting_lists, is_array, &contains_ids[0], contains_ids.size(), this_phrase_ids, this_phrase_ids_size); if(this_phrase_ids_size == 0) { // no results found for this phrase, but other phrases can find results delete [] this_phrase_ids; continue; } // results of multiple phrases must be ANDed if(field_phrase_match_ids_size == 0) { field_phrase_match_ids_size = this_phrase_ids_size; field_phrase_match_ids = this_phrase_ids; } else { uint32_t* phrase_ids_merged = nullptr; field_phrase_match_ids_size = ArrayUtils::and_scalar(this_phrase_ids, this_phrase_ids_size, field_phrase_match_ids, field_phrase_match_ids_size, &phrase_ids_merged); delete [] field_phrase_match_ids; delete [] this_phrase_ids; field_phrase_match_ids = phrase_ids_merged; } } if(field_phrase_match_ids_size == 0) { continue; } // upto 10K phrase match IDs per field will be weighted so that phrase match against a higher weighted field // is returned earlier in the results const size_t weight_score_base = 100000; // just to make score be a large number for(size_t pi = 0; pi < std::min(10000, field_phrase_match_ids_size); pi++) { auto this_field_score = (weight_score_base + field_weight); auto existing_score = phrase_match_id_scores[field_phrase_match_ids[pi]]; phrase_match_id_scores[field_phrase_match_ids[pi]] = 
std::max(this_field_score, existing_score); } // across fields, we have to OR phrase match ids if(phrase_result_count == 0) { phrase_result_ids = field_phrase_match_ids; phrase_result_count = field_phrase_match_ids_size; } else { uint32_t* phrase_ids_merged = nullptr; phrase_result_count = ArrayUtils::or_scalar(phrase_result_ids, phrase_result_count, field_phrase_match_ids, field_phrase_match_ids_size, &phrase_ids_merged); delete [] phrase_result_ids; delete [] field_phrase_match_ids; phrase_result_ids = phrase_ids_merged; } } curate_filtered_ids(curated_ids, excluded_result_ids, excluded_result_ids_size, phrase_result_ids, phrase_result_count, curated_ids_sorted); collate_included_ids({}, included_ids_map, curated_topster, searched_queries); // AND phrase id matches with filter ids if(filter_result_iterator->is_valid) { filter_result_iterator_t::add_phrase_ids(filter_result_iterator, phrase_result_ids, phrase_result_count); } else { delete filter_result_iterator; filter_result_iterator = new filter_result_iterator_t(phrase_result_ids, phrase_result_count); } if (!is_wildcard_query) { // this means that the there are non-phrase tokens in the query // so we cannot directly copy to the all_result_ids array return Option(true); } all_result_ids_len = filter_result_iterator->to_filter_id_array(all_result_ids); filter_result_iterator->reset(); std::vector eval_filter_indexes; std::vector group_by_field_it_vec; if (group_limit != 0) { group_by_field_it_vec = get_group_by_field_iterators(group_by_fields); } // populate topster for(size_t i = 0; i < std::min(10000, all_result_ids_len); i++) { auto seq_id = filter_result_iterator->seq_id; auto references = std::move(filter_result_iterator->reference); filter_result_iterator->next(); int64_t match_score = phrase_match_id_scores[seq_id]; int64_t scores[3] = {0}; int64_t match_score_index = -1; auto compute_sort_scores_op = compute_sort_scores(sort_fields, sort_order, field_values, geopoint_indices, seq_id, references, 
eval_filter_indexes, match_score, scores, match_score_index, 0, collection_name); if (!compute_sort_scores_op.ok()) { return compute_sort_scores_op; } uint64_t distinct_id = seq_id; if(group_limit != 0) { distinct_id = 1; for(auto& kv : group_by_field_it_vec) { get_distinct_id(kv.field_name, kv.it, seq_id, group_missing_values, distinct_id); } if(excluded_group_ids.count(distinct_id) != 0) { continue; } } KV kv(searched_queries.size(), seq_id, distinct_id, match_score_index, scores, std::move(references)); int ret = actual_topster->add(&kv); if(group_limit != 0 && ret < 2) { groups_processed[distinct_id]++; } if(((i + 1) % (1 << 12)) == 0) { BREAK_CIRCUIT_BREAKER } } filter_result_iterator->reset(); searched_queries.push_back({}); return Option(true); } Option Index::do_synonym_search(const std::vector& the_fields, const text_match_type_t match_type, filter_node_t const* const& filter_tree_root, const std::map>& included_ids_map, const std::vector& sort_fields_std, Topster* curated_topster, const token_ordering& token_order, const size_t typo_tokens_threshold, const size_t group_limit, const std::vector& group_by_fields, const bool group_missing_values, bool prioritize_exact_match, const bool prioritize_token_position, const bool prioritize_num_matching_fields, const bool exhaustive_search, const size_t concurrency, const std::vector& prefixes, size_t min_len_1typo, size_t min_len_2typo, const size_t max_candidates, const std::set& curated_ids, const std::vector& curated_ids_sorted, const uint32_t* exclude_token_ids, size_t exclude_token_ids_size, const std::unordered_set& excluded_group_ids, Topster* actual_topster, std::vector>& q_pos_synonyms, int syn_orig_num_tokens, spp::sparse_hash_map& groups_processed, std::vector>& searched_queries, uint32_t*& all_result_ids, size_t& all_result_ids_len, filter_result_iterator_t* const filter_result_iterator, std::set& query_hashes, const int* sort_order, std::array*, 3>& field_values, const std::vector& geopoint_indices, 
tsl::htrie_map& qtoken_set, const std::string& collection_name) const { for (const auto& syn_tokens : q_pos_synonyms) { query_hashes.clear(); auto fuzzy_search_fields_op = fuzzy_search_fields(the_fields, syn_tokens, {}, match_type, exclude_token_ids, exclude_token_ids_size, filter_result_iterator, curated_ids_sorted, excluded_group_ids, sort_fields_std, {0}, searched_queries, qtoken_set, actual_topster, groups_processed, all_result_ids, all_result_ids_len, group_limit, group_by_fields, group_missing_values, prioritize_exact_match, prioritize_token_position, prioritize_num_matching_fields, query_hashes, token_order, prefixes, typo_tokens_threshold, exhaustive_search, max_candidates, min_len_1typo, min_len_2typo, syn_orig_num_tokens, sort_order, field_values, geopoint_indices, collection_name); if (!fuzzy_search_fields_op.ok()) { return fuzzy_search_fields_op; } } collate_included_ids({}, included_ids_map, curated_topster, searched_queries); return Option(true); } Option Index::do_infix_search(const size_t num_search_fields, const std::vector& the_fields, const std::vector& infixes, const std::vector& sort_fields, std::vector>& searched_queries, const size_t group_limit, const std::vector& group_by_fields, const bool group_missing_values, const size_t max_extra_prefix, const size_t max_extra_suffix, const std::vector& query_tokens, Topster* actual_topster, filter_result_iterator_t* const filter_result_iterator, const int sort_order[3], std::array*, 3> field_values, const std::vector& geopoint_indices, const std::vector& curated_ids_sorted, const std::unordered_set& excluded_group_ids, uint32_t*& all_result_ids, size_t& all_result_ids_len, spp::sparse_hash_map& groups_processed, const std::string& collection_name) const { std::vector group_by_field_it_vec; if (group_limit != 0) { group_by_field_it_vec = get_group_by_field_iterators(group_by_fields); } for(size_t field_id = 0; field_id < num_search_fields; field_id++) { auto& field_name = the_fields[field_id].name; 
enable_t field_infix = (the_fields[field_id].orig_index < infixes.size())
                               ? infixes[the_fields[field_id].orig_index] : infixes[0];

        // Infix search runs when it is always enabled for the field, or as a fallback when nothing matched so far.
        if(field_infix == always || (field_infix == fallback && all_result_ids_len == 0)) {
            std::vector infix_ids;
            filter_result_t filtered_infix_ids;
            search_infix(query_tokens[0].value, field_name, infix_ids, max_extra_prefix, max_extra_suffix);

            if(!infix_ids.empty()) {
                gfx::timsort(infix_ids.begin(), infix_ids.end());
                infix_ids.erase(std::unique( infix_ids.begin(), infix_ids.end() ), infix_ids.end());

                // Aliases into `filtered_infix_ids`. `docs` either owns a heap array or merely borrows the
                // storage of `infix_ids`; a borrowed pointer must be detached before `filtered_infix_ids`
                // goes out of scope (presumably its destructor frees `docs` -- the code after the scoring
                // loop detaches it for exactly that reason).
                auto& raw_infix_ids = filtered_infix_ids.docs;
                auto& raw_infix_ids_length = filtered_infix_ids.count;

                if(!curated_ids_sorted.empty()) {
                    raw_infix_ids_length = ArrayUtils::exclude_scalar(infix_ids.data(), infix_ids.size(),
                                                                      curated_ids_sorted.data(),
                                                                      curated_ids_sorted.size(),
                                                                      &raw_infix_ids);
                    infix_ids.clear();
                } else {
                    raw_infix_ids = infix_ids.data();
                    raw_infix_ids_length = infix_ids.size();
                }

                if(filter_result_iterator->is_valid) {
                    filter_result_t result;
                    filter_result_iterator->and_scalar(raw_infix_ids, raw_infix_ids_length, result);

                    // `data()` rather than `&infix_ids[0]`: the vector may have been cleared above
                    if(raw_infix_ids != infix_ids.data()) {
                        delete [] raw_infix_ids;
                    }

                    filtered_infix_ids = std::move(result);
                }

                bool field_is_array = search_schema.at(the_fields[field_id].name).is_array();
                std::vector eval_filter_indexes;

                for(size_t i = 0; i < raw_infix_ids_length; i++) {
                    auto seq_id = raw_infix_ids[i];
                    std::map references;
                    if (filtered_infix_ids.coll_to_references != nullptr) {
                        references = std::move(filtered_infix_ids.coll_to_references[i]);
                    }

                    int64_t match_score = 0;
                    score_results2(sort_fields, searched_queries.size(), field_id, field_is_array,
                                   0, match_score, seq_id, sort_order, false, false, false, 1, -1, {});

                    int64_t scores[3] = {0};
                    int64_t match_score_index = -1;

                    auto compute_sort_scores_op = compute_sort_scores(sort_fields, sort_order, field_values,
                                                                      geopoint_indices, seq_id, references,
                                                                      eval_filter_indexes, 100, scores,
                                                                      match_score_index, 0, collection_name);
                    if (!compute_sort_scores_op.ok()) {
                        // Detach borrowed storage before bailing out, mirroring the normal exit path, so that
                        // `filtered_infix_ids` never releases memory that belongs to `infix_ids`.
                        if (raw_infix_ids == infix_ids.data()) {
                            raw_infix_ids = nullptr;
                        }

                        return
compute_sort_scores_op; } uint64_t distinct_id = seq_id; if(group_limit != 0) { distinct_id = 1; for(auto& kv : group_by_field_it_vec) { get_distinct_id(kv.field_name, kv.it, seq_id, group_missing_values, distinct_id); } if(excluded_group_ids.count(distinct_id) != 0) { continue; } } KV kv(searched_queries.size(), seq_id, distinct_id, match_score_index, scores, std::move(references)); int ret = actual_topster->add(&kv); if(group_limit != 0 && ret < 2) { groups_processed[distinct_id]++; } if(((i + 1) % (1 << 12)) == 0) { BREAK_CIRCUIT_BREAKER } } uint32_t* new_all_result_ids = nullptr; all_result_ids_len = ArrayUtils::or_scalar(all_result_ids, all_result_ids_len, raw_infix_ids, raw_infix_ids_length, &new_all_result_ids); delete[] all_result_ids; all_result_ids = new_all_result_ids; if (raw_infix_ids == &infix_ids[0]) { raw_infix_ids = nullptr; } searched_queries.push_back({}); } } } return Option(true); } void Index::handle_exclusion(const size_t num_search_fields, std::vector& field_query_tokens, const std::vector& search_fields, uint32_t*& exclude_token_ids, size_t& exclude_token_ids_size) const { for(size_t i = 0; i < num_search_fields; i++) { const std::string & field_name = search_fields[i].name; bool is_array = search_schema.at(field_name).is_array(); for(const auto& q_exclude_phrase: field_query_tokens[i].q_exclude_tokens) { // if phrase has multiple words, then we have to do exclusion of phrase match results std::vector posting_lists; for(const std::string& exclude_token: q_exclude_phrase) { art_leaf* leaf = (art_leaf *) art_search(search_index.at(field_name), (const unsigned char *) exclude_token.c_str(), exclude_token.size() + 1); if(leaf) { posting_lists.push_back(leaf->values); } } if(posting_lists.size() != q_exclude_phrase.size()) { continue; } std::vector contains_ids; posting_t::intersect(posting_lists, contains_ids); if(posting_lists.size() == 1) { uint32_t *exclude_token_ids_merged = nullptr; exclude_token_ids_size = 
ArrayUtils::or_scalar(exclude_token_ids, exclude_token_ids_size, &contains_ids[0], contains_ids.size(), &exclude_token_ids_merged); delete [] exclude_token_ids; exclude_token_ids = exclude_token_ids_merged; } else { uint32_t* phrase_ids = new uint32_t[contains_ids.size()]; size_t phrase_ids_size = 0; posting_t::get_phrase_matches(posting_lists, is_array, &contains_ids[0], contains_ids.size(), phrase_ids, phrase_ids_size); uint32_t *exclude_token_ids_merged = nullptr; exclude_token_ids_size = ArrayUtils::or_scalar(exclude_token_ids, exclude_token_ids_size, phrase_ids, phrase_ids_size, &exclude_token_ids_merged); delete [] phrase_ids; delete [] exclude_token_ids; exclude_token_ids = exclude_token_ids_merged; } } } } void Index::compute_facet_infos(const std::vector& facets, facet_query_t& facet_query, const size_t facet_query_num_typos, const uint32_t* all_result_ids, const size_t& all_result_ids_len, const std::vector& group_by_fields, const size_t group_limit, const bool is_wildcard_no_filter_query, const size_t max_candidates, std::vector& facet_infos, facet_index_type_t facet_index_type) const { if(all_result_ids_len == 0) { return; } size_t total_docs = seq_ids->num_ids(); for(size_t findex=0; findex < facets.size(); findex++) { const auto& a_facet = facets[findex]; const field &facet_field = search_schema.at(a_facet.field_name); facet_infos[findex].facet_field = facet_field; facet_infos[findex].use_facet_query = false; facet_infos[findex].should_compute_stats = (facet_field.type != field_types::STRING && facet_field.type != field_types::BOOL && facet_field.type != field_types::STRING_ARRAY && facet_field.type != field_types::BOOL_ARRAY); size_t num_facet_values = facet_index_v4->get_facet_count(facet_field.name); facet_infos[findex].use_value_index = (group_limit == 0) && (a_facet.sort_field.empty()) && ( is_wildcard_no_filter_query || (all_result_ids_len > 1000 && num_facet_values < 250) || (all_result_ids_len > 1000 && all_result_ids_len * 2 > total_docs) || 
(a_facet.is_sort_by_alpha)); bool facet_value_index_exists = facet_index_v4->has_value_index(facet_field.name); if(a_facet.field_name == facet_query.field_name && !facet_query.query.empty()) { facet_infos[findex].use_facet_query = true; if (facet_field.is_bool()) { if (facet_query.query == "true") { facet_query.query = "1"; } else if (facet_query.query == "false") { facet_query.query = "0"; } } //LOG(INFO) << "facet_query.query: " << facet_query.query; std::vector query_tokens; Tokenizer(facet_query.query, true, !facet_field.is_string(), facet_field.locale, symbols_to_index, token_separators).tokenize(query_tokens); std::vector qtokens; for (size_t qtoken_index = 0; qtoken_index < query_tokens.size(); qtoken_index++) { bool is_prefix = (qtoken_index == query_tokens.size()-1); qtokens.emplace_back(qtoken_index, query_tokens[qtoken_index], is_prefix, query_tokens[qtoken_index].size(), 0); } std::vector> searched_queries; Topster* topster = nullptr; spp::sparse_hash_map groups_processed; uint32_t* field_result_ids = nullptr; size_t field_result_ids_len = 0; size_t field_num_results = 0; std::set query_hashes; size_t num_toks_dropped = 0; std::vector sort_fields; search_field(0, qtokens, nullptr, 0, num_toks_dropped, facet_field, facet_field.faceted_name(), all_result_ids, all_result_ids_len, {}, sort_fields, -1, facet_query_num_typos, searched_queries, topster, groups_processed, &field_result_ids, field_result_ids_len, field_num_results, 0, group_by_fields, true, false, 4, query_hashes, MAX_SCORE, true, 0, 1, false, -1, 3, 1000, max_candidates); //LOG(INFO) << "searched_queries.size: " << searched_queries.size(); // NOTE: `field_result_ids` will consist of IDs across ALL queries in searched_queries for(size_t si = 0; si < searched_queries.size(); si++) { const auto& searched_query = searched_queries[si]; std::vector searched_tokens; std::vector posting_lists; for(auto leaf: searched_query) { posting_lists.push_back(leaf->values); std::string 
tok(reinterpret_cast(leaf->key), leaf->key_len - 1); searched_tokens.push_back(tok); //LOG(INFO) << "tok: " << tok; } //LOG(INFO) << "si: " << si << ", field_result_ids_len: " << field_result_ids_len; #ifdef TEST_BUILD if(facet_index_type == VALUE) { #else if(facet_value_index_exists && facet_infos[findex].use_value_index) { #endif size_t num_tokens_found = 0; for(auto pl: posting_lists) { if(posting_t::contains_atleast_one(pl, field_result_ids, field_result_ids_len)) { num_tokens_found++; } else { break; } } if(num_tokens_found == posting_lists.size()) { // need to ensure that document ID actually contains searched_query tokens // since `field_result_ids` contains documents matched across all queries // value based index for(const auto& val : searched_tokens) { facet_infos[findex].fvalue_searched_tokens.emplace_back(val); } } } else { for(size_t i = 0; i < field_result_ids_len; i++) { uint32_t seq_id = field_result_ids[i]; bool id_matched = true; for(auto pl: posting_lists) { if(!posting_t::contains(pl, seq_id)) { // need to ensure that document ID actually contains searched_query tokens // since `field_result_ids` contains documents matched across all queries id_matched = false; break; } } if(!id_matched) { continue; } std::vector facet_hashes; auto facet_index = facet_index_v4->get_facet_hash_index(a_facet.field_name); posting_list_t::iterator_t facet_index_it = facet_index->new_iterator(); facet_index_it.skip_to(seq_id); if(facet_index_it.valid()) { posting_list_t::get_offsets(facet_index_it, facet_hashes); if(facet_field.is_array()) { std::vector array_indices; posting_t::get_matching_array_indices(posting_lists, seq_id, array_indices); for(size_t array_index: array_indices) { if(array_index < facet_hashes.size()) { uint32_t hash = facet_hashes[array_index]; /*LOG(INFO) << "seq_id: " << seq_id << ", hash: " << hash << ", array index: " << array_index;*/ if(facet_infos[findex].hashes.count(hash) == 0) { //LOG(INFO) << "adding searched_tokens for hash " << hash; 
facet_infos[findex].hashes.emplace(hash, searched_tokens); } } } } else { uint32_t hash = facet_hashes[0]; if(facet_infos[findex].hashes.count(hash) == 0) { //LOG(INFO) << "adding searched_tokens for hash " << hash; facet_infos[findex].hashes.emplace(hash, searched_tokens); } } } } } } delete [] field_result_ids; } } } void Index::curate_filtered_ids(const std::set& curated_ids, const uint32_t* exclude_token_ids, size_t exclude_token_ids_size, uint32_t*& filter_ids, uint32_t& filter_ids_length, const std::vector& curated_ids_sorted) const { if(!curated_ids.empty()) { uint32_t *excluded_result_ids = nullptr; filter_ids_length = ArrayUtils::exclude_scalar(filter_ids, filter_ids_length, &curated_ids_sorted[0], curated_ids_sorted.size(), &excluded_result_ids); delete [] filter_ids; filter_ids = excluded_result_ids; } // Exclude document IDs associated with excluded tokens from the result set if(exclude_token_ids_size != 0) { uint32_t *excluded_result_ids = nullptr; filter_ids_length = ArrayUtils::exclude_scalar(filter_ids, filter_ids_length, exclude_token_ids, exclude_token_ids_size, &excluded_result_ids); delete[] filter_ids; filter_ids = excluded_result_ids; } } Option Index::search_wildcard(filter_node_t const* const& filter_tree_root, const std::map>& included_ids_map, const std::vector& sort_fields, Topster* topster, Topster* curated_topster, spp::sparse_hash_map& groups_processed, std::vector>& searched_queries, const size_t group_limit, const std::vector& group_by_fields, const bool group_missing_values, const std::set& curated_ids, const std::vector& curated_ids_sorted, const uint32_t* exclude_token_ids, size_t exclude_token_ids_size, const std::unordered_set& excluded_group_ids, uint32_t*& all_result_ids, size_t& all_result_ids_len, filter_result_iterator_t* const filter_result_iterator, const size_t concurrency, const int* sort_order, std::array*, 3>& field_values, const std::vector& geopoint_indices, const std::string& collection_name) const { 
filter_result_iterator->compute_result(); auto const& approx_filter_ids_length = filter_result_iterator->approx_filter_ids_length; uint32_t token_bits = 0; const bool check_for_circuit_break = (approx_filter_ids_length > 1000000); //auto beginF = std::chrono::high_resolution_clock::now(); const size_t num_threads = std::min(concurrency, approx_filter_ids_length); const size_t window_size = (num_threads == 0) ? 0 : (approx_filter_ids_length + num_threads - 1) / num_threads; // rounds up spp::sparse_hash_map tgroups_processed[num_threads]; Topster* topsters[num_threads]; std::vector plists; size_t num_processed = 0; std::mutex m_process; std::condition_variable cv_process; size_t num_queued = 0; const auto parent_search_begin = search_begin_us; const auto parent_search_stop_ms = search_stop_us; auto parent_search_cutoff = search_cutoff; uint32_t excluded_result_index = 0; Option* compute_sort_score_statuses[num_threads]; for(size_t thread_id = 0; thread_id < num_threads && filter_result_iterator->is_valid; thread_id++) { auto batch_result = new filter_result_t(); filter_result_iterator->get_n_ids(window_size, excluded_result_index, exclude_token_ids, exclude_token_ids_size, batch_result); num_queued++; searched_queries.push_back({}); topsters[thread_id] = new Topster(topster->MAX_SIZE, topster->distinct); auto& compute_sort_score_status = compute_sort_score_statuses[thread_id] = nullptr; thread_pool->enqueue([this, &parent_search_begin, &parent_search_stop_ms, &parent_search_cutoff, thread_id, &sort_fields, &searched_queries, &group_limit, &group_by_fields, group_missing_values, &topsters, &tgroups_processed, &excluded_group_ids, &sort_order, field_values, &geopoint_indices, &plists, check_for_circuit_break, batch_result, &num_processed, &m_process, &cv_process, &compute_sort_score_status, collection_name]() { std::unique_ptr batch_result_guard(batch_result); search_begin_us = parent_search_begin; search_stop_us = parent_search_stop_ms; search_cutoff = 
parent_search_cutoff; std::vector filter_indexes; std::vector group_by_field_it_vec; if (group_limit != 0) { group_by_field_it_vec = get_group_by_field_iterators(group_by_fields); } for(size_t i = 0; i < batch_result->count; i++) { const uint32_t seq_id = batch_result->docs[i]; std::map, reference_filter_result_t> references; if (batch_result->coll_to_references != nullptr) { references = std::move(batch_result->coll_to_references[i]); } int64_t match_score = 0; score_results2(sort_fields, (uint16_t) searched_queries.size(), 0, false, 0, match_score, seq_id, sort_order, false, false, false, 1, -1, plists); int64_t scores[3] = {0}; int64_t match_score_index = -1; auto compute_sort_scores_op = compute_sort_scores(sort_fields, sort_order, field_values, geopoint_indices, seq_id, references, filter_indexes, 100, scores, match_score_index, 0, collection_name); if (!compute_sort_scores_op.ok()) { compute_sort_score_status = new Option(compute_sort_scores_op.code(), compute_sort_scores_op.error()); break; } uint64_t distinct_id = seq_id; if(group_limit != 0) { distinct_id = 1; for(auto& kv : group_by_field_it_vec) { get_distinct_id(kv.field_name, kv.it, seq_id, group_missing_values, distinct_id); } if(excluded_group_ids.count(distinct_id) != 0) { continue; } } KV kv(searched_queries.size(), seq_id, distinct_id, match_score_index, scores, std::move(references)); int ret = topsters[thread_id]->add(&kv); if(group_limit != 0 && ret < 2) { tgroups_processed[thread_id][distinct_id]++; } if(check_for_circuit_break && ((i + 1) % (1 << 15)) == 0) { // check only once every 2^15 docs to reduce overhead BREAK_CIRCUIT_BREAKER } } std::unique_lock lock(m_process); num_processed++; parent_search_cutoff = parent_search_cutoff || search_cutoff; cv_process.notify_one(); }); } std::unique_lock lock_process(m_process); cv_process.wait(lock_process, [&](){ return num_processed == num_queued; }); search_cutoff = parent_search_cutoff; for(size_t thread_id = 0; thread_id < num_processed; 
thread_id++) { if (compute_sort_score_statuses[thread_id] != nullptr) { auto& status = compute_sort_score_statuses[thread_id]; auto return_value = Option(status->code(), status->error()); // Cleanup the remaining threads. for (size_t i = thread_id; i < num_processed; i++) { delete compute_sort_score_statuses[i]; delete topsters[i]; } return return_value; } //groups_processed.insert(tgroups_processed[thread_id].begin(), tgroups_processed[thread_id].end()); for(const auto& it : tgroups_processed[thread_id]) { groups_processed[it.first]+= it.second; } aggregate_topster(topster, topsters[thread_id]); delete topsters[thread_id]; } /*long long int timeMillisF = std::chrono::duration_cast( std::chrono::high_resolution_clock::now() - beginF).count(); LOG(INFO) << "Time for raw scoring: " << timeMillisF;*/ filter_result_iterator->reset(); all_result_ids_len = filter_result_iterator->to_filter_id_array(all_result_ids); return Option(true); } void Index::populate_sort_mapping(int* sort_order, std::vector& geopoint_indices, std::vector& sort_fields_std, std::array*, 3>& field_values) const { for (size_t i = 0; i < sort_fields_std.size(); i++) { if (!sort_fields_std[i].reference_collection_name.empty()) { auto& cm = CollectionManager::get_instance(); auto ref_collection = cm.get_collection(sort_fields_std[i].reference_collection_name); int ref_sort_order[1]; std::vector ref_geopoint_indices; std::vector ref_sort_fields_std; ref_sort_fields_std.emplace_back(sort_fields_std[i]); ref_sort_fields_std.front().reference_collection_name.clear(); std::array*, 3> ref_field_values; ref_collection->reference_populate_sort_mapping(ref_sort_order, ref_geopoint_indices, ref_sort_fields_std, ref_field_values); sort_order[i] = ref_sort_order[0]; if (!ref_geopoint_indices.empty()) { geopoint_indices.push_back(i); } sort_fields_std[i] = ref_sort_fields_std[0]; sort_fields_std[i].reference_collection_name = ref_collection->get_name(); field_values[i] = ref_field_values[0]; continue; } 
sort_order[i] = 1; if (sort_fields_std[i].order == sort_field_const::asc) { sort_order[i] = -1; } if (sort_fields_std[i].name == sort_field_const::text_match) { field_values[i] = &text_match_sentinel_value; } else if (sort_fields_std[i].name == sort_field_const::seq_id || sort_fields_std[i].name == sort_field_const::group_found) { field_values[i] = &seq_id_sentinel_value; } else if (sort_fields_std[i].name == sort_field_const::eval) { field_values[i] = &eval_sentinel_value; auto& eval_exp = sort_fields_std[i].eval; auto count = sort_fields_std[i].eval_expressions.size(); for (uint32_t j = 0; j < count; j++) { auto filter_result_iterator = filter_result_iterator_t("", this, &eval_exp.filter_trees[j]); auto filter_init_op = filter_result_iterator.init_status(); if (!filter_init_op.ok()) { return; } uint32_t* eval_ids = nullptr; auto eval_ids_count = filter_result_iterator.to_filter_id_array(eval_ids); eval_exp.eval_ids_vec.push_back(eval_ids); eval_exp.eval_ids_count_vec.push_back(eval_ids_count); } } else if(sort_fields_std[i].name == sort_field_const::vector_distance) { field_values[i] = &vector_distance_sentinel_value; } else if (search_schema.count(sort_fields_std[i].name) != 0 && search_schema.at(sort_fields_std[i].name).sort) { if (search_schema.at(sort_fields_std[i].name).type == field_types::GEOPOINT_ARRAY) { geopoint_indices.push_back(i); field_values[i] = nullptr; // GEOPOINT_ARRAY uses a multi-valued index } else if(search_schema.at(sort_fields_std[i].name).type == field_types::STRING) { field_values[i] = &str_sentinel_value; } else { field_values[i] = sort_index.at(sort_fields_std[i].name); if (search_schema.at(sort_fields_std[i].name).is_geopoint()) { geopoint_indices.push_back(i); } } } } } void Index::populate_sort_mapping_with_lock(int* sort_order, std::vector& geopoint_indices, std::vector& sort_fields_std, std::array*, 3>& field_values) const { std::shared_lock lock(mutex); populate_sort_mapping(sort_order, geopoint_indices, sort_fields_std, 
field_values); } void Index::search_field(const uint8_t & field_id, const std::vector& query_tokens, const uint32_t* exclude_token_ids, size_t exclude_token_ids_size, size_t& num_tokens_dropped, const field& the_field, const std::string& field_name, // to handle faceted index const uint32_t *filter_ids, size_t filter_ids_length, const std::vector& curated_ids, std::vector & sort_fields, const int last_typo, const int max_typos, std::vector> & searched_queries, Topster* topster, spp::sparse_hash_map& groups_processed, uint32_t** all_result_ids, size_t & all_result_ids_len, size_t& field_num_results, const size_t group_limit, const std::vector& group_by_fields, const bool group_missing_values, bool prioritize_exact_match, const size_t concurrency, std::set& query_hashes, const token_ordering token_order, const bool prefix, const size_t drop_tokens_threshold, const size_t typo_tokens_threshold, const bool exhaustive_search, int syn_orig_num_tokens, size_t min_len_1typo, size_t min_len_2typo, const size_t max_candidates) const { // NOTE: `query_tokens` preserve original tokens, while `search_tokens` could be a result of dropped tokens size_t max_cost = (max_typos < 0 || max_typos > 2) ? 2 : max_typos; if(the_field.locale != "" && the_field.locale != "en" && !Tokenizer::is_cyrillic(the_field.locale)) { // disable fuzzy trie traversal for certain non-english locales max_cost = 0; } // To prevent us from doing ART search repeatedly as we iterate through possible corrections spp::sparse_hash_map> token_cost_cache; std::vector> token_to_costs; for(size_t stoken_index=0; stoken_index < query_tokens.size(); stoken_index++) { const std::string& token = query_tokens[stoken_index].value; std::vector all_costs; // This ensures that we don't end up doing a cost of 1 for a single char etc. 
int bounded_cost = get_bounded_typo_cost(max_cost, token.length(), min_len_1typo, min_len_2typo); for(int cost = 0; cost <= bounded_cost; cost++) { all_costs.push_back(cost); } token_to_costs.push_back(all_costs); } // stores candidates for each token, i.e. i-th index would have all possible tokens with a cost of "c" std::vector token_candidates_vec; std::set unique_tokens; auto product = []( long long a, std::vector& b ) { return a*b.size(); }; long long n = 0; long long int N = std::accumulate(token_to_costs.begin(), token_to_costs.end(), 1LL, product); const size_t combination_limit = exhaustive_search ? Index::COMBINATION_MAX_LIMIT : Index::COMBINATION_MIN_LIMIT; while(n < N && n < combination_limit) { RETURN_CIRCUIT_BREAKER // Outerloop generates combinations of [cost to max_cost] for each token // For e.g. for a 3-token query: [0, 0, 0], [0, 0, 1], [0, 1, 1] etc. std::vector costs(token_to_costs.size()); ldiv_t q { n, 0 }; bool valid_combo = false; for(long long i = (token_to_costs.size() - 1); 0 <= i ; --i ) { q = ldiv(q.quot, token_to_costs[i].size()); costs[i] = token_to_costs[i][q.rem]; if(costs[i] == uint32_t(last_typo+1)) { // to support progressive typo searching, there must be atleast one typo that's greater than last_typo valid_combo = true; } } if(last_typo != -1 && !valid_combo) { n++; continue; } unique_tokens.clear(); token_candidates_vec.clear(); size_t token_index = 0; while(token_index < query_tokens.size()) { // For each token, look up the generated cost for this iteration and search using that cost const std::string& token = query_tokens[token_index].value; const std::string token_cost_hash = token + std::to_string(costs[token_index]); std::vector leaves; const bool prefix_search = prefix && query_tokens[token_index].is_prefix_searched; /*LOG(INFO) << "Searching for field: " << the_field.name << ", token:" << token << " - cost: " << costs[token_index] << ", prefix_search: " << prefix_search;*/ if(token_cost_cache.count(token_cost_hash) != 0) 
{ leaves = token_cost_cache[token_cost_hash]; } else { const size_t token_len = prefix_search ? (int) token.length() : (int) token.length() + 1; //auto begin = std::chrono::high_resolution_clock::now(); // need less candidates for filtered searches since we already only pick tokens with results art_fuzzy_search(search_index.at(field_name), (const unsigned char *) token.c_str(), token_len, costs[token_index], costs[token_index], max_candidates, token_order, prefix_search, false, "", filter_ids, filter_ids_length, leaves, unique_tokens); /*auto timeMillis = std::chrono::duration_cast( std::chrono::high_resolution_clock::now() - begin).count(); LOG(INFO) << "Time taken for fuzzy search: " << timeMillis << "ms";*/ if(!leaves.empty()) { token_cost_cache.emplace(token_cost_hash, leaves); for(auto leaf: leaves) { std::string tok(reinterpret_cast(leaf->key), leaf->key_len - 1); unique_tokens.emplace(tok); } } } if(!leaves.empty()) { //log_leaves(costs[token_index], token, leaves); token_candidates_vec.push_back( token_candidates{query_tokens[token_index], costs[token_index], prefix_search, leaves}); } token_index++; } if(token_candidates_vec.size() == query_tokens.size()) { std::vector id_buff; // If all tokens are, go ahead and search for candidates search_candidates(field_id, the_field.is_array(), filter_ids, filter_ids_length, exclude_token_ids, exclude_token_ids_size, curated_ids, sort_fields, token_candidates_vec, searched_queries, topster, groups_processed, all_result_ids, all_result_ids_len, field_num_results, typo_tokens_threshold, group_limit, group_by_fields, group_missing_values, query_tokens, prioritize_exact_match, exhaustive_search, syn_orig_num_tokens, concurrency, query_hashes, id_buff); if(id_buff.size() > 1) { std::sort(id_buff.begin(), id_buff.end()); id_buff.erase(std::unique( id_buff.begin(), id_buff.end() ), id_buff.end()); } uint32_t* new_all_result_ids = nullptr; all_result_ids_len = ArrayUtils::or_scalar(*all_result_ids, all_result_ids_len, 
&id_buff[0], id_buff.size(), &new_all_result_ids); delete[] *all_result_ids; *all_result_ids = new_all_result_ids; } if(!exhaustive_search && field_num_results >= typo_tokens_threshold) { // if typo threshold is breached, we are done return ; } n++; } } int Index::get_bounded_typo_cost(const size_t max_cost, const size_t token_len, const size_t min_len_1typo, const size_t min_len_2typo) { if(token_len < min_len_1typo) { // typo correction is disabled for small tokens return 0; } if(token_len < min_len_2typo) { // 2-typos are enabled only at token length of 7 chars return std::min(max_cost, 1); } return std::min(max_cost, 2); } void Index::log_leaves(const int cost, const std::string &token, const std::vector &leaves) const { LOG(INFO) << "Index: " << name << ", token: " << token << ", cost: " << cost; for(size_t i=0; i < leaves.size(); i++) { std::string key((char*)leaves[i]->key, leaves[i]->key_len); LOG(INFO) << key << " - " << posting_t::num_ids(leaves[i]->values); LOG(INFO) << "frequency: " << posting_t::num_ids(leaves[i]->values) << ", max_score: " << leaves[i]->max_score; /*for(auto j=0; jvalues->ids.getLength(); j++) { LOG(INFO) << "id: " << leaves[i]->values->ids.at(j); }*/ } } int64_t Index::score_results2(const std::vector & sort_fields, const uint16_t & query_index, const size_t field_id, const bool field_is_array, const uint32_t total_cost, int64_t& match_score, const uint32_t seq_id, const int sort_order[3], const bool prioritize_exact_match, const bool single_exact_query_token, const bool prioritize_token_position, size_t num_query_tokens, int syn_orig_num_tokens, const std::vector& posting_lists) const { //auto begin = std::chrono::high_resolution_clock::now(); //const std::string first_token((const char*)query_suggestion[0]->key, query_suggestion[0]->key_len-1); if (posting_lists.size() <= 1) { const uint8_t is_verbatim_match = uint8_t( prioritize_exact_match && single_exact_query_token && 
posting_list_t::is_single_token_verbatim_match(posting_lists[0], field_is_array) ); size_t words_present = (num_query_tokens == 1 && syn_orig_num_tokens != -1) ? syn_orig_num_tokens : 1; size_t distance = (num_query_tokens == 1 && syn_orig_num_tokens != -1) ? syn_orig_num_tokens-1 : 0; size_t max_offset = prioritize_token_position ? posting_list_t::get_last_offset(posting_lists[0], field_is_array) : 255; Match single_token_match = Match(words_present, distance, max_offset, is_verbatim_match); match_score = single_token_match.get_match_score(total_cost, words_present); /*auto this_words_present = ((match_score >> 32) & 0xFF); auto unique_words = ((match_score >> 40) & 0xFF); auto typo_score = ((match_score >> 24) & 0xFF); auto proximity = ((match_score >> 16) & 0xFF); auto verbatim = ((match_score >> 8) & 0xFF); auto offset_score = ((match_score >> 0) & 0xFF); LOG(INFO) << "seq_id: " << seq_id << ", words_present: " << this_words_present << ", unique_words: " << unique_words << ", typo_score: " << typo_score << ", proximity: " << proximity << ", verbatim: " << verbatim << ", offset_score: " << offset_score << ", match_score: " << match_score;*/ } else { std::map> array_token_positions; posting_list_t::get_offsets(posting_lists, array_token_positions); for (const auto& kv: array_token_positions) { const std::vector& token_positions = kv.second; if (token_positions.empty()) { continue; } const Match &match = Match(seq_id, token_positions, false, prioritize_exact_match); uint64_t this_match_score = match.get_match_score(total_cost, posting_lists.size()); // Within a field, only a subset of query tokens can match (unique_words), but even a smaller set // might be available within the window used for proximity calculation (this_words_present) auto this_words_present = ((this_match_score >> 32) & 0xFF); auto unique_words = field_is_array ? 
this_words_present : ((this_match_score >> 40) & 0xFF); auto typo_score = ((this_match_score >> 24) & 0xFF); auto proximity = ((this_match_score >> 16) & 0xFF); auto verbatim = ((this_match_score >> 8) & 0xFF); auto offset_score = prioritize_token_position ? ((this_match_score >> 0) & 0xFF) : 0; if(syn_orig_num_tokens != -1 && num_query_tokens == posting_lists.size()) { unique_words = syn_orig_num_tokens; this_words_present = syn_orig_num_tokens; proximity = 100 - (syn_orig_num_tokens - 1); } uint64_t mod_match_score = ( (int64_t(this_words_present) << 40) | (int64_t(unique_words) << 32) | (int64_t(typo_score) << 24) | (int64_t(proximity) << 16) | (int64_t(verbatim) << 8) | (int64_t(offset_score) << 0) ); if(mod_match_score > match_score) { match_score = mod_match_score; } /*std::ostringstream os; os << "seq_id: " << seq_id << ", field_id: " << field_id << ", this_words_present: " << this_words_present << ", unique_words: " << unique_words << ", typo_score: " << typo_score << ", proximity: " << proximity << ", verbatim: " << verbatim << ", offset_score: " << offset_score << ", mod_match_score: " << mod_match_score << ", token_positions: " << token_positions.size() << ", num_query_tokens: " << num_query_tokens << ", posting_lists.size: " << posting_lists.size() << ", array_index: " << kv.first << std::endl; LOG(INFO) << os.str();*/ } } //long long int timeNanos = std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - begin).count(); //LOG(INFO) << "Time taken for results iteration: " << timeNanos << "ms"; return 0; } void Index::score_results(const std::vector & sort_fields, const uint16_t & query_index, const uint8_t & field_id, const bool field_is_array, const uint32_t total_cost, Topster* topster, const std::vector &query_suggestion, spp::sparse_hash_map& groups_processed, const uint32_t seq_id, const int sort_order[3], std::array*, 3> field_values, const std::vector& geopoint_indices, const size_t group_limit, const std::vector& group_by_fields, 
const bool group_missing_values, const uint32_t token_bits, const bool prioritize_exact_match, const bool single_exact_query_token, int syn_orig_num_tokens, const std::vector& posting_lists) const { int64_t geopoint_distances[3]; for(auto& i: geopoint_indices) { spp::sparse_hash_map* geopoints = field_values[i]; int64_t dist = INT32_MAX; S2LatLng reference_lat_lng; GeoPoint::unpack_lat_lng(sort_fields[i].geopoint, reference_lat_lng); if(geopoints != nullptr) { auto it = geopoints->find(seq_id); if(it != geopoints->end()) { int64_t packed_latlng = it->second; S2LatLng s2_lat_lng; GeoPoint::unpack_lat_lng(packed_latlng, s2_lat_lng); dist = GeoPoint::distance(s2_lat_lng, reference_lat_lng); } } else { // indicates geo point array auto field_it = geo_array_index.at(sort_fields[i].name); auto it = field_it->find(seq_id); if(it != field_it->end()) { int64_t* latlngs = it->second; for(size_t li = 0; li < latlngs[0]; li++) { S2LatLng s2_lat_lng; int64_t packed_latlng = latlngs[li + 1]; GeoPoint::unpack_lat_lng(packed_latlng, s2_lat_lng); int64_t this_dist = GeoPoint::distance(s2_lat_lng, reference_lat_lng); if(this_dist < dist) { dist = this_dist; } } } } if(dist < sort_fields[i].exclude_radius) { dist = 0; } if(sort_fields[i].geo_precision > 0) { dist = dist + sort_fields[i].geo_precision - 1 - (dist + sort_fields[i].geo_precision - 1) % sort_fields[i].geo_precision; } geopoint_distances[i] = dist; // Swap (id -> latlong) index to (id -> distance) index field_values[i] = &geo_sentinel_value; } //auto begin = std::chrono::high_resolution_clock::now(); //const std::string first_token((const char*)query_suggestion[0]->key, query_suggestion[0]->key_len-1); uint64_t match_score = 0; if (posting_lists.size() <= 1) { const uint8_t is_verbatim_match = uint8_t( prioritize_exact_match && single_exact_query_token && posting_list_t::is_single_token_verbatim_match(posting_lists[0], field_is_array) ); size_t words_present = (syn_orig_num_tokens == -1) ? 
1 : syn_orig_num_tokens; size_t distance = (syn_orig_num_tokens == -1) ? 0 : syn_orig_num_tokens-1; Match single_token_match = Match(words_present, distance, is_verbatim_match); match_score = single_token_match.get_match_score(total_cost, words_present); } else { std::map> array_token_positions; posting_list_t::get_offsets(posting_lists, array_token_positions); // NOTE: tokens found returned by matcher is only within the best matched window, so we have to still consider // unique tokens found if they are spread across the text. uint32_t unique_tokens_found = __builtin_popcount(token_bits); if(syn_orig_num_tokens != -1) { unique_tokens_found = syn_orig_num_tokens; } for (const auto& kv: array_token_positions) { const std::vector& token_positions = kv.second; if (token_positions.empty()) { continue; } const Match &match = Match(seq_id, token_positions, false, prioritize_exact_match); uint64_t this_match_score = match.get_match_score(total_cost, unique_tokens_found); auto this_words_present = ((this_match_score >> 24) & 0xFF); auto typo_score = ((this_match_score >> 16) & 0xFF); auto proximity = ((this_match_score >> 8) & 0xFF); auto verbatim = (this_match_score & 0xFF); if(syn_orig_num_tokens != -1) { this_words_present = syn_orig_num_tokens; proximity = 100 - (syn_orig_num_tokens - 1); } uint64_t mod_match_score = ( (int64_t(unique_tokens_found) << 32) | (int64_t(this_words_present) << 24) | (int64_t(typo_score) << 16) | (int64_t(proximity) << 8) | (int64_t(verbatim) << 0) ); if(mod_match_score > match_score) { match_score = mod_match_score; } /*std::ostringstream os; os << name << ", total_cost: " << (255 - total_cost) << ", words_present: " << match.words_present << ", match_score: " << match_score << ", match.distance: " << match.distance << ", seq_id: " << seq_id << std::endl; LOG(INFO) << os.str();*/ } } const int64_t default_score = INT64_MIN; // to handle field that doesn't exist in document (e.g. 
optional) int64_t scores[3] = {0}; size_t match_score_index = 0; // avoiding loop if (sort_fields.size() > 0) { if (field_values[0] == &text_match_sentinel_value) { scores[0] = int64_t(match_score); match_score_index = 0; } else if (field_values[0] == &seq_id_sentinel_value) { scores[0] = seq_id; } else if(field_values[0] == &geo_sentinel_value) { scores[0] = geopoint_distances[0]; } else if(field_values[0] == &str_sentinel_value) { scores[0] = str_sort_index.at(sort_fields[0].name)->rank(seq_id); } else { auto it = field_values[0]->find(seq_id); scores[0] = (it == field_values[0]->end()) ? default_score : it->second; } if (sort_order[0] == -1) { scores[0] = -scores[0]; } } if(sort_fields.size() > 1) { if (field_values[1] == &text_match_sentinel_value) { scores[1] = int64_t(match_score); match_score_index = 1; } else if (field_values[1] == &seq_id_sentinel_value) { scores[1] = seq_id; } else if(field_values[1] == &geo_sentinel_value) { scores[1] = geopoint_distances[1]; } else if(field_values[1] == &str_sentinel_value) { scores[1] = str_sort_index.at(sort_fields[1].name)->rank(seq_id); } else { auto it = field_values[1]->find(seq_id); scores[1] = (it == field_values[1]->end()) ? default_score : it->second; } if (sort_order[1] == -1) { scores[1] = -scores[1]; } } if(sort_fields.size() > 2) { if (field_values[2] == &text_match_sentinel_value) { scores[2] = int64_t(match_score); match_score_index = 2; } else if (field_values[2] == &seq_id_sentinel_value) { scores[2] = seq_id; } else if(field_values[2] == &geo_sentinel_value) { scores[2] = geopoint_distances[2]; } else if(field_values[2] == &str_sentinel_value) { scores[2] = str_sort_index.at(sort_fields[2].name)->rank(seq_id); } else { auto it = field_values[2]->find(seq_id); scores[2] = (it == field_values[2]->end()) ? 
default_score : it->second; } if (sort_order[2] == -1) { scores[2] = -scores[2]; } } uint64_t distinct_id = seq_id; if(group_limit != 0) { distinct_id = 1; auto group_by_field_it_vec = get_group_by_field_iterators(group_by_fields); for(auto& kv : group_by_field_it_vec) { get_distinct_id(kv.field_name, kv.it, seq_id, group_missing_values, distinct_id); } } //LOG(INFO) << "Seq id: " << seq_id << ", match_score: " << match_score; KV kv(query_index, seq_id, distinct_id, match_score_index, scores); int ret = topster->add(&kv); if(group_limit != 0 && ret < 2) { groups_processed[distinct_id]++; } //long long int timeNanos = std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - begin).count(); //LOG(INFO) << "Time taken for results iteration: " << timeNanos << "ms"; } void Index::get_distinct_id(const std::string& field_name, posting_list_t::iterator_t& facet_index_it, const uint32_t seq_id, const bool group_missing_values, uint64_t& distinct_id) const { if (!facet_index_it.valid()) { if (!group_missing_values) { distinct_id = seq_id; } return; } // calculate hash from group_by_fields std::vector facet_hashes; facet_index_it.skip_to(seq_id); if (facet_index_it.valid() && facet_index_it.id() == seq_id) { posting_list_t::get_offsets(facet_index_it, facet_hashes); if (search_schema.at(field_name).is_array()) { //LOG(INFO) << "combining hashes for facet array "; for (size_t i = 0; i < facet_hashes.size(); i++) { distinct_id = StringUtils::hash_combine(distinct_id, facet_hashes[i]); } } else { const auto &facet_hash = facet_hashes[0]; //LOG(INFO) << "combining hashes for facet "; distinct_id = StringUtils::hash_combine(distinct_id, facet_hash); } } //LOG(INFO) << "seq_id: " << seq_id << ", distinct_id: " << distinct_id; if (distinct_id == 1 && !group_missing_values) { distinct_id = seq_id; } return; } inline uint32_t Index::next_suggestion2(const std::vector& token_candidates_vec, long long int n, std::vector& query_suggestion, uint64& qhash) { uint32_t 
total_cost = 0; qhash = 1; // generate the next combination from `token_leaves` and store it in `query_suggestion` ldiv_t q { n, 0 }; for(size_t i = 0 ; i < token_candidates_vec.size(); i++) { size_t token_size = token_candidates_vec[i].token.value.size(); q = ldiv(q.quot, token_candidates_vec[i].candidates.size()); const auto& candidate = token_candidates_vec[i].candidates[q.rem]; size_t typo_cost = token_candidates_vec[i].cost; if (candidate.size() > 1 && !Tokenizer::is_ascii_char(candidate[0])) { icu::UnicodeString ustr = icu::UnicodeString::fromUTF8(candidate); auto code_point = ustr.char32At(0); if(code_point >= 0x600 && code_point <= 0x6ff) { // adjust typo cost for Arabic strings, since 1 byte difference makes no sense if(typo_cost == 1) { typo_cost = 2; } } } // we assume that toke was found via prefix search if candidate is longer than token's typo tolerance bool is_prefix_searched = token_candidates_vec[i].prefix_search && (candidate.size() > (token_size + typo_cost)); size_t actual_cost = (2 * typo_cost) + uint32_t(is_prefix_searched); total_cost += actual_cost; query_suggestion[i] = token_t(i, candidate, is_prefix_searched, token_size, typo_cost); uint64_t this_hash = StringUtils::hash_wy(query_suggestion[i].value.c_str(), query_suggestion[i].value.size()); qhash = StringUtils::hash_combine(qhash, this_hash); /*LOG(INFO) << "suggestion key: " << actual_query_suggestion[i]->key << ", token: " << token_candidates_vec[i].token.value << ", actual_cost: " << actual_cost; LOG(INFO) << ".";*/ } return total_cost; } inline uint32_t Index::next_suggestion(const std::vector &token_candidates_vec, long long int n, std::vector& actual_query_suggestion, std::vector& query_suggestion, const int syn_orig_num_tokens, uint32_t& token_bits, uint64& qhash) { uint32_t total_cost = 0; qhash = 1; // generate the next combination from `token_leaves` and store it in `query_suggestion` ldiv_t q { n, 0 }; for(long long i = 0 ; i < (long long) token_candidates_vec.size(); i++) { 
// Body of the per-token loop of Index::next_suggestion (the function header and the loop header
// precede this point).
        size_t token_size = token_candidates_vec[i].token.value.size();
        // Mixed-radix decomposition: the remainder selects this token's candidate, the quotient is
        // carried over to the next token.
        q = ldiv(q.quot, token_candidates_vec[i].candidates.size());
        actual_query_suggestion[i] = token_candidates_vec[i].candidates[q.rem];
        query_suggestion[i] = token_candidates_vec[i].candidates[q.rem];

        // The leaf key length includes one extra trailing byte, hence the `-1`.
        bool exact_match = token_candidates_vec[i].cost == 0 && token_size == actual_query_suggestion[i]->key_len-1;
        bool incr_for_prefix_search = token_candidates_vec[i].prefix_search && !exact_match;

        // A typo weighs twice as much as a prefix match.
        size_t actual_cost = (2 * token_candidates_vec[i].cost) + uint32_t(incr_for_prefix_search);

        total_cost += actual_cost;

        token_bits |= 1UL << token_candidates_vec[i].token.position; // sets n-th bit

        // The leaf's address is what gets folded into the combination hash.
        uintptr_t addr_val = (uintptr_t) query_suggestion[i];
        qhash = StringUtils::hash_combine(qhash, addr_val);

        /*LOG(INFO) << "suggestion key: " << actual_query_suggestion[i]->key << ", token: " << token_candidates_vec[i].token.value << ", actual_cost: " << actual_cost; LOG(INFO) << ".";*/
    }

    // For a synonym expansion, replace the accumulated bits with the lowest
    // `syn_orig_num_tokens` bits set.
    if(syn_orig_num_tokens != -1) {
        token_bits = 0;

        for(size_t i = 0; i < size_t(syn_orig_num_tokens); i++) {
            token_bits |= 1UL << i;
        }
    }

    return total_cost;
}

// Removes `seq_id` from the posting list stored under `token` in the tree registered for
// `search_field.faceted_name()` inside `search_index`. When the posting list becomes empty, the key
// itself is deleted from the tree and the list is destroyed.
// `search_index.at()` is used, so the faceted tree is expected to exist (it throws otherwise).
void Index::remove_facet_token(const field& search_field, spp::sparse_hash_map& search_index,
                               const std::string& token, uint32_t seq_id) {
    const unsigned char *key = (const unsigned char *) token.c_str();
    // +1 so that the terminating NUL of c_str() is part of the key
    int key_len = (int) (token.length() + 1);
    const std::string& field_name = search_field.faceted_name();

    art_leaf* leaf = (art_leaf *) art_search(search_index.at(field_name), key, key_len);
    if(leaf != nullptr) {
        posting_t::erase(leaf->values, seq_id);
        if (posting_t::num_ids(leaf->values) == 0) {
            void* values = art_delete(search_index.at(field_name), key, key_len);
            posting_t::destroy_list(values);
        }
    }
}

// Removes everything that was indexed for `field_name` of `document` under `seq_id`.
//
// The per-type branch removes the field's values from the matching structure (token tree, range trie
// or numeric tree, vector index, geo index). Afterwards the facet index and the sort indices are
// cleaned up regardless of the type.
// Fields that are not part of `search_schema`, or that are not indexed, are ignored.
// `is_update` is consulted only by the vector branch.
void Index::remove_field(uint32_t seq_id, const nlohmann::json& document, const std::string& field_name,
                         const bool is_update) {
    const auto& search_field_it = search_schema.find(field_name);
    if(search_field_it == search_schema.end()) {
        return;
    }

    const auto& search_field = search_field_it.value();

    if(!search_field.index) {
        return;
    }

    // Go through all the field names and find the keys+values so that they can be removed from in-memory index
    if(search_field.type == field_types::STRING_ARRAY || search_field.type == field_types::STRING) {
        std::vector tokens;
        tokenize_string_field(document, search_field, tokens, search_field.locale, symbols_to_index, token_separators);

        for(size_t i = 0; i < tokens.size(); i++) {
            const auto& token = tokens[i];
            const unsigned char *key = (const unsigned char *) token.c_str();
            // +1 so that the terminating NUL of c_str() is part of the key
            int key_len = (int) (token.length() + 1);

            art_leaf* leaf = (art_leaf *) art_search(search_index.at(field_name), key, key_len);
            if(leaf != nullptr) {
                posting_t::erase(leaf->values, seq_id);
                // drop the key once no document references it any more
                if (posting_t::num_ids(leaf->values) == 0) {
                    void* values = art_delete(search_index.at(field_name), key, key_len);
                    posting_t::destroy_list(values);
                }
            }

            if(search_field.infix) {
                auto strhash = StringUtils::hash_wy(key, token.size());
                const auto& infix_sets = infix_index.at(search_field.name);
                // NOTE(review): the bucket count is hard-coded as 4 while the sets are created with
                // ARRAY_INFIX_DIM elsewhere in this file - confirm that both agree (and that the
                // insert path uses the same modulus) before replacing the literal.
                infix_sets[strhash % 4]->erase(token);
            }
        }
    } else if(search_field.is_int32()) {
        // A single value is wrapped into a one-element vector so both shapes share the loop below.
        const std::vector& values = search_field.is_single_integer() ?
                                    std::vector{document[field_name].get()} :
                                    document[field_name].get>();
        for(int32_t value: values) {
            if (search_field.range_index) {
                auto trie = range_index.at(field_name);
                trie->remove(value, seq_id);
            } else {
                num_tree_t* num_tree = numerical_index.at(field_name);
                num_tree->remove(value, seq_id);
            }

            if(search_field.facet) {
                remove_facet_token(search_field, search_index, std::to_string(value), seq_id);
            }
        }
    } else if(search_field.is_int64()) {
        const std::vector& values = search_field.is_single_integer() ?
                                    std::vector{document[field_name].get()} :
                                    document[field_name].get>();
        for(int64_t value: values) {
            if (search_field.range_index) {
                auto trie = range_index.at(field_name);
                trie->remove(value, seq_id);
            } else {
                num_tree_t* num_tree = numerical_index.at(field_name);
                num_tree->remove(value, seq_id);
            }

            if(search_field.facet) {
                remove_facet_token(search_field, search_index, std::to_string(value), seq_id);
            }
        }
    } else if(search_field.num_dim) {
        // This branch is tested before `is_float()`, so fields with `num_dim` set never reach the
        // numeric float handling below.
        if(!is_update) {
            // since vector index supports upsert natively, we should not attempt to delete for update
            // NOTE(review): `vector_index[...]` default-inserts an entry when the key is missing;
            // presumably the entry always exists for a field with `num_dim` set - confirm.
            vector_index[search_field.name]->vecdex->markDelete(seq_id);
        }
    } else if(search_field.is_float()) {
        const std::vector& values = search_field.is_single_float() ?
                                    std::vector{document[field_name].get()} :
                                    document[field_name].get>();

        for(float value: values) {
            // floats are stored in their int64 representation
            int64_t fintval = float_to_int64_t(value);

            if (search_field.range_index) {
                auto trie = range_index.at(field_name);
                trie->remove(fintval, seq_id);
            } else {
                num_tree_t* num_tree = numerical_index.at(field_name);
                num_tree->remove(fintval, seq_id);
            }

            if(search_field.facet) {
                remove_facet_token(search_field, search_index, StringUtils::float_to_str(value), seq_id);
            }
        }
    } else if(search_field.is_bool()) {
        const std::vector& values = search_field.is_single_bool() ?
                                    std::vector{document[field_name].get()} :
                                    document[field_name].get>();
        for(bool value: values) {
            int64_t bool_int64 = value ? 1 : 0;
            if (search_field.range_index) {
                auto trie = range_index.at(field_name);
                trie->remove(bool_int64, seq_id);
            } else {
                num_tree_t* num_tree = numerical_index.at(field_name);
                num_tree->remove(bool_int64, seq_id);
            }

            if(search_field.facet) {
                remove_facet_token(search_field, search_index, std::to_string(value), seq_id);
            }
        }
    } else if(search_field.is_geopoint()) {
        auto geopoint_range_index = geo_range_index[field_name];
        // NOTE(review): `indexer` is constructed but not used anywhere in this branch.
        S2RegionTermIndexer::Options options;
        options.set_index_contains_points_only(true);
        S2RegionTermIndexer indexer(options);

        // A single [lat, lng] pair is wrapped so both shapes share the loop below.
        const std::vector>& latlongs = search_field.is_single_geopoint() ?
                                       std::vector>{document[field_name].get>()} :
                                       document[field_name].get>>();

        for(const std::vector& latlong: latlongs) {
            S2Point point = S2LatLng::FromDegrees(latlong[0], latlong[1]).ToPoint();
            auto cell = S2CellId(point);
            geopoint_range_index->delete_geopoint(cell.id(), seq_id);
        }

        if(!search_field.is_single_geopoint()) {
            spp::sparse_hash_map*& field_geo_array_map = geo_array_index.at(field_name);
            auto geo_array_it = field_geo_array_map->find(seq_id);
            if(geo_array_it != field_geo_array_map->end()) {
                // the packed lat/lng array is heap allocated with new[]
                delete [] geo_array_it->second;
                field_geo_array_map->erase(seq_id);
            }
        }
    }

    // remove facets
    facet_index_v4->remove(field_name, seq_id);

    // remove sort field
    if(sort_index.count(field_name) != 0) {
        sort_index[field_name]->erase(seq_id);
    }

    if(str_sort_index.count(field_name) != 0) {
        str_sort_index[field_name]->remove(seq_id);
    }
}

// Removes the values of `document` from the in-memory index under an exclusive lock.
//
// When `del_fields` is non-empty only those fields (if present in the document) are removed,
// otherwise every key of the document is. Errors raised while removing an individual field are
// logged and skipped. Unless `is_update` is set, `seq_id` is also erased from `seq_ids`.
Option Index::remove(const uint32_t seq_id, const nlohmann::json & document,
                     const std::vector& del_fields, const bool is_update) {
    std::unique_lock lock(mutex);

    // The exception during removal is mostly because of an edge case with auto schema detection:
    // Value indexed as Type T but later if field is dropped and reindexed in another type X,
    // the on-disk data will differ from the newly detected type on schema. We've to log the error,
    // but have to ignore the field and proceed because there's no leak caused here.
// Remainder of Index::remove (the function header and the lock acquisition precede this point).
    if(!del_fields.empty()) {
        // Targeted removal: only the listed fields that the document actually contains.
        for(auto& the_field: del_fields) {
            if(!document.contains(the_field.name)) {
                // could be an optional field
                continue;
            }

            try {
                remove_field(seq_id, document, the_field.name, is_update);
            } catch(const std::exception& e) {
                LOG(WARNING) << "Error while removing field `" << the_field.name << "` from document, message: "
                             << e.what();
            }
        }
    } else {
        // Full removal: every key of the document.
        for(auto it = document.begin(); it != document.end(); ++it) {
            const std::string& field_name = it.key();
            try {
                remove_field(seq_id, document, field_name, is_update);
            } catch(const std::exception& e) {
                LOG(WARNING) << "Error while removing field `" << field_name << "` from document, message: "
                             << e.what();
            }
        }
    }

    // On update the document keeps its id, so the id list is left alone.
    if(!is_update) {
        seq_ids->erase(seq_id);
    }

    return Option(seq_id);
}

// Tokenizes the value(s) of `search_field` in `document` and appends the tokens to `tokens`.
// Only STRING and STRING_ARRAY fields are handled; for any other type `tokens` is left untouched.
void Index::tokenize_string_field(const nlohmann::json& document, const field& search_field,
                                  std::vector& tokens, const std::string& locale,
                                  const std::vector& symbols_to_index,
                                  const std::vector& token_separators) {

    const std::string& field_name = search_field.name;

    if(search_field.type == field_types::STRING) {
        Tokenizer(document[field_name], true, false, locale, symbols_to_index, token_separators).tokenize(tokens);
    } else if(search_field.type == field_types::STRING_ARRAY) {
        const std::vector& values = document[field_name].get>();
        // tokens of all array elements end up in the same output vector
        for(const std::string & value: values) {
            Tokenizer(value, true, false, locale, symbols_to_index, token_separators).tokenize(tokens);
        }
    }
}

// Looks up `token` in the tree of `field_name` under a shared lock and returns the result of
// art_search() cast to a leaf pointer. `search_index.at()` throws if the field has no tree.
// The lock is released on return, so the returned pointer is not protected by it.
art_leaf* Index::get_token_leaf(const std::string & field_name, const unsigned char* token, uint32_t token_len) {
    std::shared_lock lock(mutex);
    const art_tree *t = search_index.at(field_name);
    return (art_leaf*) art_search(t, token, (int) token_len);
}

// Read-only accessors for the internal index maps. None of them takes the index mutex.
const spp::sparse_hash_map &Index::_get_search_index() const {
    return search_index;
}

const spp::sparse_hash_map& Index::_get_numerical_index() const {
    return numerical_index;
}

const spp::sparse_hash_map& Index::_get_range_index() const {
    return range_index;
}

const spp::sparse_hash_map& Index::_get_infix_index() const {
    return infix_index;
};

const spp::sparse_hash_map& Index::_get_vector_index() const {
    return vector_index;
}

// Applies a schema change under an exclusive lock: creates the index structures required by
// `new_fields`, then tears down the structures (and schema entries) of `del_fields`.
//
// NOTE(review): the results of the `emplace` calls below are ignored; if a key is already present
// the freshly allocated structure is presumably not stored anywhere - confirm that callers never
// re-add an existing field.
void Index::refresh_schemas(const std::vector& new_fields, const std::vector& del_fields) {
    std::unique_lock lock(mutex);

    for(const auto & new_field: new_fields) {
        // non-indexed and dynamic fields get no index structures
        if(!new_field.index || new_field.is_dynamic()) {
            continue;
        }

        search_schema.emplace(new_field.name, new_field);

        // A float array with dimensions gets a vector index and nothing else.
        if(new_field.type == field_types::FLOAT_ARRAY && new_field.num_dim > 0) {
            auto hnsw_index = new hnsw_index_t(new_field.num_dim, 1024, new_field.vec_dist);
            vector_index.emplace(new_field.name, hnsw_index);
            continue;
        }

        if(new_field.is_sortable()) {
            if(new_field.is_num_sortable()) {
                spp::sparse_hash_map * doc_to_score = new spp::sparse_hash_map();
                sort_index.emplace(new_field.name, doc_to_score);
            } else if(new_field.is_str_sortable()) {
                str_sort_index.emplace(new_field.name, new adi_tree_t);
            }
        }

        if(search_index.count(new_field.name) == 0) {
            if(new_field.is_string() || field_types::is_string_or_array(new_field.type)) {
                art_tree *t = new art_tree;
                art_tree_init(t);
                search_index.emplace(new_field.name, t);
            } else if(new_field.is_geopoint()) {
                geo_range_index.emplace(new_field.name, new NumericTrie(32));
                // geopoint arrays additionally keep a per-document array of points
                if(!new_field.is_single_geopoint()) {
                    auto geo_array_map = new spp::sparse_hash_map();
                    geo_array_index.emplace(new_field.name, geo_array_map);
                }
            } else {
                if (new_field.range_index) {
                    // the trie is constructed with 8 for bool, 32 for int32 and 64 otherwise
                    auto trie = new_field.is_bool() ? new NumericTrie(8) :
                                new_field.is_int32() ? new NumericTrie(32) : new NumericTrie(64);
                    range_index.emplace(new_field.name, trie);
                } else {
                    num_tree_t* num_tree = new num_tree_t;
                    numerical_index.emplace(new_field.name, num_tree);
                }
            }
        }

        if(new_field.is_facet()) {
            initialize_facet_indexes(new_field);

            // initialize for non-string facet fields
            if(!new_field.is_string()) {
                art_tree *ft = new art_tree;
                art_tree_init(ft);
                search_index.emplace(new_field.faceted_name(), ft);
            }
        }

        if(new_field.infix) {
            array_mapped_infix_t infix_sets(ARRAY_INFIX_DIM);
            for(auto& infix_set: infix_sets) {
                infix_set = new tsl::htrie_set();
            }

            infix_index.emplace(new_field.name, infix_sets);
        }
    }

    // NOTE(review): the teardown below looks structures up with `operator[]`, which default-inserts
    // an entry when the key is missing; presumably every structure exists for an indexed field of
    // the matching type - confirm.
    for(const auto & del_field: del_fields) {
        if(search_schema.count(del_field.name) == 0) {
            // could be a dynamic field
            continue;
        }

        search_schema.erase(del_field.name);

        if(!del_field.index) {
            continue;
        }

        if(del_field.is_string() || field_types::is_string_or_array(del_field.type)) {
            art_tree_destroy(search_index[del_field.name]);
            delete search_index[del_field.name];
            search_index.erase(del_field.name);
        } else if(del_field.is_geopoint()) {
            delete geo_range_index[del_field.name];
            geo_range_index.erase(del_field.name);

            if(!del_field.is_single_geopoint()) {
                spp::sparse_hash_map* geo_array_map = geo_array_index[del_field.name];
                // each entry owns an array allocated with new[]
                for(auto& kv: *geo_array_map) {
                    delete [] kv.second;
                }
                delete geo_array_map;
                geo_array_index.erase(del_field.name);
            }
        } else {
            if (del_field.range_index) {
                delete range_index[del_field.name];
                range_index.erase(del_field.name);
            } else {
                delete numerical_index[del_field.name];
                numerical_index.erase(del_field.name);
            }
        }

        if(del_field.is_sortable()) {
            if(del_field.is_num_sortable()) {
                delete sort_index[del_field.name];
                sort_index.erase(del_field.name);
            } else if(del_field.is_str_sortable()) {
                delete str_sort_index[del_field.name];
                str_sort_index.erase(del_field.name);
            }
        }

        if(del_field.is_facet()) {
            facet_index_v4->erase(del_field.name);

            // non-string facets also own a token tree under the faceted name
            if(!del_field.is_string()) {
                art_tree_destroy(search_index[del_field.faceted_name()]);
                delete search_index[del_field.faceted_name()];
                search_index.erase(del_field.faceted_name());
            }
        }

        if(del_field.infix) {
            auto& infix_set = infix_index[del_field.name];
            for(size_t i = 0; i < infix_set.size(); i++) {
                delete infix_set[i];
            }

            infix_index.erase(del_field.name);
        }

        if(del_field.num_dim) {
            auto hnsw_index = vector_index[del_field.name];
            delete hnsw_index;
            vector_index.erase(del_field.name);
        }
    }
}

// Resolves the `$operations` key of `update_doc` into plain field values and removes the key.
//
// Only `increment` is handled, and only for INT32 schema fields whose increment is an integer: the
// field in `update_doc` is set to the value from `old_doc` (0 when absent) plus the increment.
// Entries for unknown fields or other types are ignored.
void Index::handle_doc_ops(const tsl::htrie_map& search_schema,
                           nlohmann::json& update_doc, const nlohmann::json& old_doc) {

    /*
        {
           "$operations": {
              "increment": {"likes": 1, "views": 20}
           }
        }
    */

    auto ops_it = update_doc.find("$operations");
    if(ops_it != update_doc.end()) {
        const auto& operations = ops_it.value();
        if(operations.contains("increment") && operations["increment"].is_object()) {
            for(const auto& item: operations["increment"].items()) {
                auto field_it = search_schema.find(item.key());
                if(field_it != search_schema.end()) {
                    if(field_it->type == field_types::INT32 && item.value().is_number_integer()) {
                        int32_t existing_value = 0;
                        if(old_doc.contains(item.key())) {
                            existing_value = old_doc[item.key()].get();
                        }

                        // NOTE(review): if this sum is evaluated in 32 bits it can overflow for
                        // large operands - confirm the types of the stripped `get()` calls.
                        auto updated_value = existing_value + item.value().get();
                        update_doc[item.key()] = updated_value;
                    }
                }
            }
        }

        update_doc.erase("$operations");
    }
}

// Derives, from an incoming `update_doc` and the stored `old_doc`:
//   - `new_doc`: the document as it should look after the operation,
//   - `del_doc`: the old values that have to be removed from the index,
// and prunes `update_doc` of null and unchanged values.
// Object values, and arrays whose first element is an object, are skipped by the key loops.
void Index::get_doc_changes(const index_operation_t op, const tsl::htrie_map& embedding_fields,
                            nlohmann::json& update_doc, const nlohmann::json& old_doc, nlohmann::json& new_doc,
                            nlohmann::json& del_doc) {

    if(op == UPSERT) {
        new_doc = update_doc;
        // since UPSERT could replace a doc with lesser fields, we have to add those missing fields to del_doc
        for(auto it = old_doc.begin(); it != old_doc.end(); ++it) {
            // note: unlike the pruning loop further down, an empty array is skipped here as well
            if(it.value().is_object() || (it.value().is_array() && (it.value().empty() || it.value()[0].is_object()))) {
                continue;
            }

            if(!update_doc.contains(it.key())) {
                // embedding field won't be part of upsert doc so populate new doc with the value from old doc
                if(embedding_fields.count(it.key()) != 0) {
                    new_doc[it.key()] =
it.value(); } else { del_doc[it.key()] = it.value(); } } } } else { new_doc = old_doc; new_doc.merge_patch(update_doc); if(old_doc.contains(".flat")) { new_doc[".flat"] = old_doc[".flat"]; for(auto& fl: update_doc[".flat"]) { new_doc[".flat"].push_back(fl); } } } auto it = update_doc.begin(); while(it != update_doc.end()) { if(it.value().is_object() || (it.value().is_array() && !it.value().empty() && it.value()[0].is_object())) { ++it; continue; } if(it.value().is_null()) { // null values should not be indexed new_doc.erase(it.key()); if(old_doc.contains(it.key())) { del_doc[it.key()] = old_doc[it.key()]; } it = update_doc.erase(it); continue; } if(old_doc.contains(it.key())) { if(old_doc[it.key()] == it.value()) { // unchanged so should not be part of update doc it = update_doc.erase(it); continue; } else { // delete this old value from index del_doc[it.key()] = old_doc[it.key()]; } } it++; } } size_t Index::num_seq_ids() const { std::shared_lock lock(mutex); return seq_ids->num_ids(); } Option Index::seq_ids_outside_top_k(const std::string& field_name, size_t k, std::vector& outside_seq_ids) { std::shared_lock lock(mutex); auto field_it = numerical_index.find(field_name); if(field_it != numerical_index.end()) { field_it->second->seq_ids_outside_top_k(k, outside_seq_ids); return Option(true); } auto range_trie_it = range_index.find(field_name); if (range_trie_it != range_index.end()) { range_trie_it->second->seq_ids_outside_top_k(k, outside_seq_ids); return Option(true); } return Option(400, "Field `" + field_name + "` not found in numerical index."); } void Index::resolve_space_as_typos(std::vector& qtokens, const string& field_name, std::vector>& resolved_queries) const { auto tree_it = search_index.find(field_name); if(tree_it == search_index.end()) { return ; } // we will try to find a verbatim match first art_tree* t = tree_it->second; std::vector leaves; for(const std::string& token: qtokens) { art_leaf* leaf = (art_leaf *) art_search(t, (const unsigned 
char*) token.c_str(), token.length()+1); if(leaf == nullptr) { break; } leaves.push_back(leaf); } // When we cannot find verbatim match, we can try concatting and splitting query tokens for alternatives. // Concatenation: size_t qtokens_size = std::min(5, qtokens.size()); // only first 5 tokens will be considered if(qtokens.size() > 1) { // a) join all tokens to form a single string const string& all_tokens_query = StringUtils::join(qtokens, ""); if(art_search(t, (const unsigned char*) all_tokens_query.c_str(), all_tokens_query.length()+1) != nullptr) { resolved_queries.push_back({all_tokens_query}); return; } // b) join 2 adjacent tokens in a sliding window (provided they are atleast 2 tokens in size) for(size_t i = 0; i < qtokens_size-1 && qtokens_size > 2; i++) { std::vector candidate_tokens; for(size_t j = 0; j < i; j++) { candidate_tokens.push_back(qtokens[j]); } std::string joined_tokens = qtokens[i] + qtokens[i+1]; candidate_tokens.push_back(joined_tokens); for(size_t j = i+2; j < qtokens.size(); j++) { candidate_tokens.push_back(qtokens[j]); } leaves.clear(); for(auto& token: candidate_tokens) { art_leaf* leaf = static_cast(art_search(t, (const unsigned char*) token.c_str(), token.length() + 1)); if(leaf == nullptr) { break; } leaves.push_back(leaf); } if(candidate_tokens.size() == leaves.size() && common_results_exist(leaves, false)) { resolved_queries.push_back(candidate_tokens); return; } } } // concats did not work, we will try splitting individual tokens for(size_t i = 0; i < qtokens_size; i++) { std::vector candidate_tokens; for(size_t j = 0; j < i; j++) { candidate_tokens.push_back(qtokens[j]); } const std::string& token = qtokens[i]; bool found_split = false; for(size_t ci = 1; ci < token.size(); ci++) { std::string first_part = token.substr(0, token.size()-ci); art_leaf* first_leaf = static_cast(art_search(t, (const unsigned char*) first_part.c_str(), first_part.length() + 1)); if(first_leaf != nullptr) { // check if rest of the string is also a 
valid token std::string second_part = token.substr(token.size()-ci, ci); art_leaf* second_leaf = static_cast(art_search(t, (const unsigned char*) second_part.c_str(), second_part.length() + 1)); std::vector part_leaves = {first_leaf, second_leaf}; if(second_leaf != nullptr && common_results_exist(part_leaves, true)) { candidate_tokens.push_back(first_part); candidate_tokens.push_back(second_part); found_split = true; break; } } } if(!found_split) { continue; } for(size_t j = i+1; j < qtokens.size(); j++) { candidate_tokens.push_back(qtokens[j]); } leaves.clear(); for(auto& candidate_token: candidate_tokens) { art_leaf* leaf = static_cast(art_search(t, (const unsigned char*) candidate_token.c_str(), candidate_token.length() + 1)); if(leaf == nullptr) { break; } leaves.push_back(leaf); } if(common_results_exist(leaves, false)) { resolved_queries.push_back(candidate_tokens); return; } } }

/**
 * Reports whether posting_t::intersect yields any ids for the given leaves and,
 * optionally, whether posting_t::get_phrase_matches yields any phrase ids for them.
 *
 * The `values` member of every leaf is collected and handed to posting_t::intersect.
 * An empty intersection gives false. When `must_match_phrase` is false, a non-empty
 * intersection gives true. Otherwise posting_t::get_phrase_matches is run over the
 * intersected ids and the result is true only when it reports a non-zero count.
 *
 * @param leaves            Leaves whose `values` are intersected.
 *                          NOTE(review): every pointer is dereferenced without a null
 *                          check, so callers must not pass null entries.
 * @param must_match_phrase When true, additionally require a non-zero phrase-match count.
 * @return true when the checks described above succeed, false otherwise.
 */
bool Index::common_results_exist(std::vector& leaves, bool must_match_phrase) const {
    std::vector result_ids;
    std::vector leaf_vals;

    // Collect the `values` member of each leaf for the posting_t calls below.
    for(auto leaf: leaves) {
        leaf_vals.push_back(leaf->values);
    }

    posting_t::intersect(leaf_vals, result_ids);

    if(result_ids.empty()) {
        return false;
    }

    if(!must_match_phrase) {
        // result_ids is already known to be non-empty here, so this evaluates to true.
        return !result_ids.empty();
    }

    // Scratch buffer with one slot per intersected id; released manually below.
    uint32_t* phrase_ids = new uint32_t[result_ids.size()];
    // NOTE(review): left uninitialised; presumably always assigned by get_phrase_matches - confirm.
    size_t num_phrase_ids;

    // The second argument is hard-coded to false; its meaning is defined by posting_t (not visible here).
    posting_t::get_phrase_matches(leaf_vals, false, &result_ids[0], result_ids.size(),
                                  phrase_ids, num_phrase_ids);
    bool phrase_exists = (num_phrase_ids != 0);
    delete [] phrase_ids;
    return phrase_exists;
}

/**
 * Generates embeddings for a batch of records, one embedding field at a time, and
 * writes each embedding into the record's document under the embedding field's name.
 *
 * For every field in `embedding_fields`:
 *   1. An indexing prefix is fetched from TextEmbedderManager for the field's
 *      `fields::model_config`.
 *   2. For every record: records whose `indexed.ok()` is false are skipped. The working
 *      document is `new_doc` when `is_update` is set, otherwise `doc`. A non-update
 *      record whose document already contains the field's name is skipped (the existing
 *      source comment notes this could be a restore from an export).
 *   3. The text starts as the indexing prefix. For each name listed in
 *      `field.embed[fields::from]` that is present in the document, the value is
 *      appended followed by a space when the schema type is STRING, or every element
 *      followed by a space when the schema type is STRING_ARRAY. Records whose text
 *      still equals the bare prefix are not queued.
 *   4. When nothing was queued, processing moves on to the next embedding field.
 *   5. The embedder is obtained via get_text_embedder. On failure two ERROR lines are
 *      logged and the whole function returns, so remaining embedding fields are not
 *      processed and no failure is recorded on the records.
 *   6. Queued entries are sorted by ascending text size and passed to batch_embed
 *      together with `remote_embedding_batch_size`.
 *   7. For each returned result: on failure the record's `embedding_res` is set to the
 *      reported error and `index_failure` is called with the reported status code and
 *      an empty message; on success the embedding is stored in the document.
 *
 * NOTE(review): the iterator returned by `search_schema.find(field_name)` is used via
 * `.value()` without being compared against `search_schema.end()`; this presumably
 * relies on every `from` name being present in the schema - confirm.
 * NOTE(review): `document` is always assigned the address of a record member, so the
 * `document == nullptr` check cannot be true.
 * NOTE(review): the result loop is bounded by `embeddings.size()` while indexing
 * `texts_to_embed`; this assumes batch_embed returns results in input order and never
 * more results than inputs - confirm.
 *
 * @param records                      Records to embed; each is accessed through `->`.
 * @param embedding_fields             Fields for which embeddings are generated.
 * @param search_schema                Schema used to look up the type of each `from` field.
 * @param remote_embedding_batch_size  Forwarded unchanged to batch_embed.
 */
void Index::batch_embed_fields(std::vector& records, const tsl::htrie_map& embedding_fields, const tsl::htrie_map & search_schema, const size_t remote_embedding_batch_size) { for(const auto& field : embedding_fields) { std::vector> texts_to_embed; auto indexing_prefix = TextEmbedderManager::get_instance().get_indexing_prefix(field.embed[fields::model_config]); for(auto& record : records) { if(!record->indexed.ok()) { continue; } nlohmann::json* document; if(record->is_update) {
document = &record->new_doc; } else { document = &record->doc; } if(document == nullptr) { continue; } if(document->contains(field.name) && !record->is_update) { // embedding already exists (could be a restore from export) continue; } std::string text = indexing_prefix; const auto& embed_from = field.embed[fields::from].get>(); for(const auto& field_name : embed_from) { auto field_it = search_schema.find(field_name); auto doc_field_it = document->find(field_name); if(doc_field_it == document->end()) { continue; } if(field_it.value().type == field_types::STRING) { text += doc_field_it->get() + " "; } else if(field_it.value().type == field_types::STRING_ARRAY) { for(const auto& val : *(doc_field_it)) { text += val.get() + " "; } } } if(text != indexing_prefix) { texts_to_embed.push_back(std::make_pair(record, text)); } } if(texts_to_embed.empty()) { continue; } TextEmbedderManager& embedder_manager = TextEmbedderManager::get_instance(); auto embedder_op = embedder_manager.get_text_embedder(field.embed[fields::model_config]); if(!embedder_op.ok()) { LOG(ERROR) << "Error while getting embedder for model: " << field.embed[fields::model_config]; LOG(ERROR) << "Error: " << embedder_op.error(); return; } // sort texts by length std::sort(texts_to_embed.begin(), texts_to_embed.end(), [](const std::pair& a, const std::pair& b) { return a.second.size() < b.second.size(); }); // get vector of texts std::vector texts; for(const auto& text_to_embed : texts_to_embed) { texts.push_back(text_to_embed.second); } auto embeddings = embedder_op.get()->batch_embed(texts, remote_embedding_batch_size); for(size_t i = 0; i < embeddings.size(); i++) { auto& embedding_res = embeddings[i]; if(!embedding_res.success) { texts_to_embed[i].first->embedding_res = embedding_res.error; texts_to_embed[i].first->index_failure(embedding_res.status_code, ""); continue; } nlohmann::json* document; if(texts_to_embed[i].first->is_update) { document = &texts_to_embed[i].first->new_doc; } else { document =
&texts_to_embed[i].first->doc; } (*document)[field.name] = embedding_res.embedding; } } }

/**
 * Looks up the value stored for `seq_id` in the `sort_index` entry named
 * `reference_helper_field_name`, holding a shared lock on `mutex` for the duration.
 *
 * @param reference_helper_field_name Key into `sort_index`.
 * @param seq_id                      Key into the per-field map.
 * @return An Option built from code 400 and a "Could not find a reference" message when
 *         either the field entry or the `seq_id` entry is absent; otherwise an Option
 *         wrapping the stored value.
 */
Option Index::get_reference_doc_id_with_lock(const string& reference_helper_field_name,
                                             const uint32_t& seq_id) const {
    std::shared_lock lock(mutex);
    // Both lookups must succeed before `at` is used below.
    if (sort_index.count(reference_helper_field_name) == 0 ||
        sort_index.at(reference_helper_field_name)->count(seq_id) == 0) {
        return Option(400, "Could not find a reference for doc " + std::to_string(seq_id));
    }

    return Option(sort_index.at(reference_helper_field_name)->at(seq_id));
}

/**
 * Returns `rank(seq_id)` from the `str_sort_index` entry named `field_name`, holding a
 * shared lock on `mutex` for the duration.
 *
 * NOTE(review): `at(field_name)` is called without a prior existence check; presumably
 * it throws when the field is absent, as standard-library `at` does - confirm for this
 * container type and make sure callers only pass indexed field names.
 *
 * @param field_name Key into `str_sort_index`.
 * @param seq_id     Passed to `rank`.
 * @return The value returned by `rank(seq_id)`.
 */
int64_t Index::reference_string_sort_score(const string &field_name, const uint32_t &seq_id) const {
    std::shared_lock lock(mutex);
    return str_sort_index.at(field_name)->rank(seq_id);
}

/**
 * Looks up the value stored for `seq_id` in the `sort_index` entry named `field_name`,
 * holding a shared lock on `mutex` for the duration.
 *
 * @param field_name Key into `sort_index`.
 * @param seq_id     Key into the per-field map.
 * @return An Option built from code 400 and a message naming the field and the doc when
 *         either the field entry or the `seq_id` entry is absent; otherwise an Option
 *         wrapping the stored value.
 */
Option Index::get_sort_indexed_field_value(const string& field_name, const uint32_t& seq_id) const {
    std::shared_lock lock(mutex);
    // Both lookups must succeed before `at` is used below.
    if (sort_index.count(field_name) == 0 || sort_index.at(field_name)->count(seq_id) == 0) {
        return Option(400, "Could not find `" + field_name + "` value for doc `" + std::to_string(seq_id) + "`.");
    }

    return Option(sort_index.at(field_name)->at(seq_id));
}

/* // https://stackoverflow.com/questions/924171/geo-fencing-point-inside-outside-polygon // NOTE: polygon and point should have been transformed with `transform_for_180th_meridian` bool Index::is_point_in_polygon(const Geofence& poly, const GeoCoord &point) { int i, j; bool c = false; for (i = 0, j = poly.numVerts - 1; i < poly.numVerts; j = i++) { if ((((poly.verts[i].lat <= point.lat) && (point.lat < poly.verts[j].lat)) || ((poly.verts[j].lat <= point.lat) && (point.lat < poly.verts[i].lat))) && (point.lon < (poly.verts[j].lon - poly.verts[i].lon) * (point.lat - poly.verts[i].lat) / (poly.verts[j].lat - poly.verts[i].lat) + poly.verts[i].lon)) { c = !c; } } return c; } double Index::transform_for_180th_meridian(Geofence &poly) { double offset = 0.0; double maxLon = -1000, minLon = 1000; for(int v=0; v < poly.numVerts; v++) { if(poly.verts[v].lon < minLon) { minLon = poly.verts[v].lon; }
if(poly.verts[v].lon > maxLon) { maxLon = poly.verts[v].lon; } if(std::abs(minLon - maxLon) > 180) { offset = 360.0; } } int i, j; for (i = 0, j = poly.numVerts - 1; i < poly.numVerts; j = i++) { if (poly.verts[i].lon < 0.0) { poly.verts[i].lon += offset; } if (poly.verts[j].lon < 0.0) { poly.verts[j].lon += offset; } } return offset; } void Index::transform_for_180th_meridian(GeoCoord &point, double offset) { point.lon = point.lon < 0.0 ? point.lon + offset : point.lon; } */