adding tests and optimizing indexing

This commit is contained in:
krunal1313 2023-05-01 12:01:35 +05:30
parent 58e2de03ce
commit 92c38c3fa5
10 changed files with 1373 additions and 112 deletions

1
BUILD
View File

@ -134,6 +134,7 @@ TEST_COPTS = [
"-Wno-unused-parameter",
"-Werror=return-type",
"-g",
"-DFORCE_INTERSECTION",
]
config_setting(

View File

@ -182,8 +182,8 @@ private:
std::vector<field>& new_fields,
bool enable_nested_fields);
static bool facet_count_compare(const std::pair<uint64_t, facet_count_t>& a,
const std::pair<uint64_t, facet_count_t>& b) {
static bool facet_count_compare(const std::pair<uint32_t, facet_count_t>& a,
const std::pair<uint32_t, facet_count_t>& b) {
return std::tie(a.second.count, a.first) > std::tie(b.second.count, b.first);
}
@ -460,7 +460,11 @@ public:
const text_match_type_t match_type = max_score,
const size_t facet_sample_percent = 100,
const size_t facet_sample_threshold = 0,
const size_t page_offset = UINT32_MAX) const;
const size_t page_offset = UINT32_MAX
#ifdef FORCE_INTERSECTION
, bool force_intersection = false
#endif
) const;
Option<bool> get_filter_ids(const std::string & filter_query, filter_result_t& filter_result) const;

View File

@ -52,7 +52,5 @@ public:
size_t get_facet_count(const std::string& field);
size_t intersect(const std::string& val, const uint32_t* result_ids, int result_id_len,
int max_facet_count, std::map<std::string, uint32_t>& found, bool is_wildcard_no_filter_query);
std::string get_facet_by_count_index(const std::string& field, uint32_t count_index);
int max_facet_count, std::map<std::string, uint32_t>& found, bool is_wildcard_no_filter_query);
};

View File

@ -363,7 +363,11 @@ private:
const std::vector<facet_info_t>& facet_infos,
size_t group_limit, const std::vector<std::string>& group_by_fields,
const uint32_t* result_ids, size_t results_size,
int max_facet_count, bool is_wildcard_query, bool no_filters_provided) const;
int max_facet_count, bool is_wildcard_query, bool no_filters_provided
#ifdef FORCE_INTERSECTION
, bool force_intersection = false
#endif
) const;
bool static_filter_query_eval(const override_t* override, std::vector<std::string>& tokens,
filter_node_t*& filter_tree_root) const;
@ -520,6 +524,8 @@ private:
static void compute_facet_stats(facet &a_facet, const std::string& raw_value, const std::string & field_type);
static void compute_facet_stats(facet &a_facet, const int64_t raw_value, const std::string & field_type);
static void get_doc_changes(const index_operation_t op, nlohmann::json &update_doc,
const nlohmann::json &old_doc, nlohmann::json &new_doc, nlohmann::json &del_doc);
@ -631,7 +637,12 @@ public:
// Public operations
Option<bool> run_search(search_args* search_params, const std::string& collection_name);
Option<bool> run_search(search_args* search_params,
const std::string& collection_name
#ifdef FORCE_INTERSECTION
, bool force_intersection
#endif
);
Option<bool> search(std::vector<query_tokens_t>& field_query_tokens, const std::vector<search_field_t>& the_fields,
const text_match_type_t match_type,
@ -656,7 +667,11 @@ public:
const size_t max_extra_suffix, const size_t facet_query_num_typos,
const bool filter_curated_hits, enable_t split_join_tokens,
const vector_query_t& vector_query, size_t facet_sample_percent, size_t facet_sample_threshold,
const std::string& collection_name) const;
const std::string& collection_name
#ifdef FORCE_INTERSECTION
, bool force_intersection = false
#endif
) const;
void remove_field(uint32_t seq_id, const nlohmann::json& document, const std::string& field_name);
@ -942,6 +957,8 @@ public:
uint32_t filter_ids_length, std::set<uint32_t>& curated_ids,
std::map<size_t, std::map<size_t, uint32_t>>& included_ids_map,
std::vector<uint32_t>& included_ids_vec) const;
int64_t get_doc_val_from_sort_index(const std::string& field_name, uint32_t doc_seq_id) const;
};
template<class T>

View File

@ -1070,7 +1070,11 @@ Option<nlohmann::json> Collection::search(std::string raw_query,
const text_match_type_t match_type,
const size_t facet_sample_percent,
const size_t facet_sample_threshold,
const size_t page_offset) const {
const size_t page_offset
#ifdef FORCE_INTERSECTION
, bool force_intersection
#endif
) const {
std::shared_lock lock(mutex);
@ -1520,7 +1524,12 @@ Option<nlohmann::json> Collection::search(std::string raw_query,
std::unique_ptr<search_args> search_params_guard(search_params);
auto search_op = index->run_search(search_params, name);
auto search_op = index->run_search(search_params, name
#ifdef FORCE_INTERSECTION
, force_intersection
#endif
);
if (!search_op.ok()) {
return Option<nlohmann::json>(search_op.code(), search_op.error());
}
@ -1937,7 +1946,7 @@ Option<nlohmann::json> Collection::search(std::string raw_query,
auto max_facets = std::min(max_facet_values, facet_counts.size());
std::sort(facet_counts.begin(), facet_counts.end(),
[&](const auto& p1, const auto& p2) {
return p1.second > p2.second;
return std::tie(p1.second, p1.first) > std::tie(p2.second, p2.first);
});
for(int i = 0; i < max_facets; ++i) {

View File

@ -39,23 +39,17 @@ uint32_t facet_index_t::insert(const std::string& field, const std::string& valu
counter_list.emplace_back(sv, facet_count);
} else {
auto counter_it = counter_list.begin();
//remove node from list
for(counter_it = counter_list.begin(); counter_it != counter_list.end(); ++counter_it) {
if(counter_it->facet_value == sv) {
//found facet in first node
counter_list.erase(counter_it);
break;
}
}
//find position in list and add node with updated count
count_list node(sv, facet_count);
for(counter_it = counter_list.begin(); counter_it != counter_list.end(); ++counter_it) {
// LOG (INFO) << "inserting in middle or front facet " << node.facet_value
// << " with count " << node.count;
if(counter_it->count <= facet_count) {
counter_list.emplace(counter_it, node);
if(counter_it->facet_value == sv) {
counter_it->count = facet_count;
auto prev_node = std::prev(counter_it);
if(prev_node->count < counter_it->count) {
std::swap(prev_node, counter_it);
}
break;
}
}
@ -127,9 +121,6 @@ size_t facet_index_t::intersect(const std::string& field, const uint32_t* result
ids_t::uncompress(ids, id_list);
const auto ids_len = id_list.size();
for(int i = 0; i < result_ids_len; ++i) {
// if(std::binary_search(id_list.begin(), id_list.end(), result_ids[i])) {
// ++count;
// }
uint32_t* out = nullptr;
count = ArrayUtils::and_scalar(id_list.data(), id_list.size(),
result_ids, result_ids_len, &out);
@ -148,24 +139,6 @@ size_t facet_index_t::intersect(const std::string& field, const uint32_t* result
return found.size();
}
/**
 * Finds the facet value of `field` whose stored `index` equals `count_index`.
 *
 * @param field        Name of the facet field to look up in `facet_field_map`.
 * @param count_index  Index to match against each entry's `index` member.
 * @return The key of the matching entry, or an empty string when `field` is not
 *         present in `facet_field_map` or no entry carries `count_index`. When
 *         several entries share the index, the one visited last wins, because the
 *         scan deliberately has no early exit.
 */
std::string facet_index_t::get_facet_by_count_index(const std::string& field, uint32_t count_index) {
    const auto facet_field_it = facet_field_map.find(field);
    if(facet_field_it == facet_field_map.end()) {
        return "";
    }

    // Bind by const reference: a plain `auto` copied the entire per-field map on
    // every call, only to scan it read-only.
    const auto& facet_index_map = facet_field_it->second.facet_index_map;

    std::string result;
    for(auto it = facet_index_map.begin(); it != facet_index_map.end(); ++it) {
        if(it.value().index == count_index) {
            result = it.key();
        }
    }
    return result;
}
facet_index_t::~facet_index_t() {
facet_field_map.clear();
}

View File

@ -652,7 +652,7 @@ Option<bool> field::json_field_to_field(bool enable_nested_fields, nlohmann::jso
if(field_json["type"] == field_types::INT32 || field_json["type"] == field_types::INT64 ||
field_json["type"] == field_types::FLOAT || field_json["type"] == field_types::BOOL ||
field_json["type"] == field_types::GEOPOINT || field_json["type"] == field_types::GEOPOINT_ARRAY) {
if(field_json.count(fields::num_dim) == 0) {
if((field_json.count(fields::num_dim) == 0) || (field_json[fields::facet])) {
field_json[fields::sort] = true;
} else {
field_json[fields::sort] = false;

View File

@ -1139,12 +1139,66 @@ void Index::compute_facet_stats(facet &a_facet, const std::string& raw_value, co
}
}
/**
 * Folds one raw numeric facet value into the running statistics of `a_facet`
 * (`stats.fvmin`, `stats.fvmax`, `stats.fvsum`, `stats.fvcount`).
 *
 * @param a_facet     Facet whose `stats` are updated in place.
 * @param raw_value   Stored value: used as-is for INT64 types, narrowed to int32_t
 *                    for INT32 types, and decoded with `int64_t_to_float` for
 *                    FLOAT types.
 * @param field_type  Field type name; any type other than the int32/int64/float
 *                    scalar and array types leaves `a_facet` untouched.
 */
void Index::compute_facet_stats(facet &a_facet, const int64_t raw_value, const std::string & field_type) {
    // One accumulation step shared by every numeric type. It is generic so that
    // each branch keeps comparing and adding with its own value type.
    const auto accumulate = [&a_facet](const auto value) {
        if(value < a_facet.stats.fvmin) {
            a_facet.stats.fvmin = value;
        }

        if(value > a_facet.stats.fvmax) {
            a_facet.stats.fvmax = value;
        }

        a_facet.stats.fvsum += value;
        a_facet.stats.fvcount++;
    };

    if(field_type == field_types::INT32 || field_type == field_types::INT32_ARRAY) {
        const int32_t as_int32 = raw_value;
        accumulate(as_int32);
    } else if(field_type == field_types::INT64 || field_type == field_types::INT64_ARRAY) {
        const int64_t as_int64 = raw_value;
        accumulate(as_int64);
    } else if(field_type == field_types::FLOAT || field_type == field_types::FLOAT_ARRAY) {
        const float as_float = int64_t_to_float(raw_value);
        accumulate(as_float);
    }
}
int64_t Index::get_doc_val_from_sort_index(const std::string& field_name, uint32_t doc_seq_id) const {
auto sort_index_it = sort_index.find(field_name);
if(sort_index_it != sort_index.end()){
auto doc_id_val_map = sort_index_it->second;
auto doc_seq_id_it = doc_id_val_map->find(doc_seq_id);
if(doc_seq_id_it != doc_id_val_map->end()){
return doc_seq_id_it->second;
}
}
return 0;
}
void Index::do_facets(std::vector<facet> & facets, facet_query_t & facet_query,
bool estimate_facets, size_t facet_sample_percent,
const std::vector<facet_info_t>& facet_infos,
const size_t group_limit, const std::vector<std::string>& group_by_fields,
const uint32_t* result_ids, size_t results_size,
int max_facet_count, bool is_wildcard_query, bool no_filters_provided) const {
int max_facet_count, bool is_wildcard_query, bool no_filters_provided
#ifdef FORCE_INTERSECTION
, bool force_intersection
#endif
) const {
// assumed that facet fields have already been validated upstream
for(size_t findex=0; findex < facets.size(); findex++) {
auto& a_facet = facets[findex];
@ -1170,32 +1224,45 @@ void Index::do_facets(std::vector<facet> & facets, facet_query_t & facet_query,
}
}
if(results_size && facet_records && (facet_records <= 10 || is_wildcard_query) &&
!use_facet_query && group_limit == 0 && no_filters_provided) {
#ifdef FORCE_INTERSECTION
bool use_hashes = false;
if(!force_intersection) {
use_hashes = true;
}
#endif
if(results_size && facet_records && ((facet_records <= 10 || is_wildcard_query) &&
!use_facet_query && group_limit == 0 && no_filters_provided)
#ifdef FORCE_INTERSECTION
&& !use_hashes || force_intersection
#endif
) {
//LOG(INFO) << "Using intersection to find facets";
a_facet.is_intersected = true;
std::map<std::string, uint32_t> facet_results;
if(facet_field.is_string()) {
facet_index_v4->intersect(a_facet.field_name, result_ids,
results_size, max_facet_count, facet_results, is_wildcard_query & no_filters_provided);
} else {
std::map<int64_t, uint32_t> facet_counts;
numerical_index.at(a_facet.field_name)->intersect(result_ids,
if(!facet_field.name.empty()) {
if(facet_field.is_string()) {
facet_index_v4->intersect(a_facet.field_name, result_ids,
results_size, max_facet_count, facet_results, is_wildcard_query & no_filters_provided);
} else {
std::map<int64_t, uint32_t> facet_counts;
numerical_index.at(a_facet.field_name)->intersect(result_ids,
results_size, max_facet_count, facet_counts, is_wildcard_query & no_filters_provided);
for(const auto& kv : facet_counts) {
std::string val;
if(facet_field.is_float()) {
val = std::to_string(int64_t_to_float(kv.first));
} else if(facet_field.is_bool()) {
val = kv.first == 1 ? "true" : "false";
} else {
val = std::to_string(kv.first);
for(const auto& kv : facet_counts) {
std::string val;
if(facet_field.is_float()) {
val = StringUtils::float_to_str(int64_t_to_float(kv.first));
} else if(facet_field.is_bool()) {
val = kv.first == 1 ? "true" : "false";
} else {
val = std::to_string(kv.first);
}
facet_results[val] = kv.second;
}
facet_results[val] = kv.second;
}
}
@ -1269,32 +1336,24 @@ void Index::do_facets(std::vector<facet> & facets, facet_query_t & facet_query,
RETURN_CIRCUIT_BREAKER
}
int64_t doc_val = 0;
for(size_t j = 0; j < facet_hash_count; j++) {
if(facet_field.is_array()) {
fhash = facet_map_it->second.hashes[j];
}
if(should_compute_stats) {
std::string fvalue =
facet_index_v4->get_facet_by_count_index(a_facet.field_name, fhash);
if(!fvalue.empty()) {
compute_facet_stats(a_facet, fvalue, facet_field.type);
}
doc_val = get_doc_val_from_sort_index(a_facet.field_name, doc_seq_id);
compute_facet_stats(a_facet, doc_val, facet_field.type);
}
if(a_facet.is_range_query) {
auto sort_index_it = sort_index.find(a_facet.field_name);
if(sort_index_it != sort_index.end()){
auto doc_id_val_map = sort_index_it->second;
auto doc_seq_id_it = doc_id_val_map->find(doc_seq_id);
if(doc_seq_id_it != doc_id_val_map->end()){
doc_val = get_doc_val_from_sort_index(a_facet.field_name, doc_seq_id);
std::string doc_val = std::to_string(doc_seq_id_it->second);
std::pair<std::string, std::string> range_pair {};
if(a_facet.get_range(doc_val, range_pair)) {
const auto& range_id = range_pair.first;
facet_count_t& facet_count = a_facet.result_map[range_id];
facet_count.count += 1;
}
}
std::pair<std::string, std::string> range_pair {};
if(a_facet.get_range(std::to_string(doc_val), range_pair)) {
const auto& range_id = range_pair.first;
facet_count_t& facet_count = a_facet.result_map[range_id];
facet_count.count += 1;
}
} else if(!use_facet_query || fquery_hashes.find(fhash) != fquery_hashes.end()) {
std::string fhash_str = std::to_string(fhash);
@ -2288,7 +2347,11 @@ Option<bool> Index::get_approximate_reference_filter_ids_with_lock(filter_node_t
return rearrange_filter_tree(filter_tree_root, filter_ids_length);
}
Option<bool> Index::run_search(search_args* search_params, const std::string& collection_name) {
Option<bool> Index::run_search(search_args* search_params, const std::string& collection_name
#ifdef FORCE_INTERSECTION
, bool force_intersection
#endif
) {
return search(search_params->field_query_tokens,
search_params->search_fields,
search_params->match_type,
@ -2323,7 +2386,11 @@ Option<bool> Index::run_search(search_args* search_params, const std::string& co
search_params->vector_query,
search_params->facet_sample_percent,
search_params->facet_sample_threshold,
collection_name);
collection_name
#ifdef FORCE_INTERSECTION
, force_intersection
#endif
);
}
void Index::collate_included_ids(const std::vector<token_t>& q_included_tokens,
@ -2772,7 +2839,11 @@ Option<bool> Index::search(std::vector<query_tokens_t>& field_query_tokens, cons
const bool filter_curated_hits, const enable_t split_join_tokens,
const vector_query_t& vector_query,
size_t facet_sample_percent, size_t facet_sample_threshold,
const std::string& collection_name) const {
const std::string& collection_name
#ifdef FORCE_INTERSECTION
, bool force_intersection
#endif
) const {
std::shared_lock lock(mutex);
uint32_t filter_ids_length = 0;
@ -3297,7 +3368,11 @@ Option<bool> Index::search(std::vector<query_tokens_t>& field_query_tokens, cons
batch_result_ids, batch_res_len, &facet_infos, max_facet_values,
is_wildcard_query, no_filters_provided, estimate_facets, facet_sample_percent,
&parent_search_begin, &parent_search_stop_ms, &parent_search_cutoff,
&num_processed, &m_process, &cv_process]() {
&num_processed, &m_process, &cv_process
#ifdef FORCE_INTERSECTION
, force_intersection
#endif
]() {
search_begin_us = parent_search_begin;
search_stop_us = parent_search_stop_ms;
search_cutoff = parent_search_cutoff;
@ -3307,7 +3382,11 @@ Option<bool> Index::search(std::vector<query_tokens_t>& field_query_tokens, cons
do_facets(facet_batches[thread_id], fq, estimate_facets, facet_sample_percent,
facet_infos, group_limit, group_by_fields,
batch_result_ids, batch_res_len, max_facet_values,
is_wildcard_query, no_filters_provided);
is_wildcard_query, no_filters_provided
#ifdef FORCE_INTERSECTION
, force_intersection
#endif
);
std::unique_lock<std::mutex> lock(m_process);
num_processed++;
parent_search_cutoff = parent_search_cutoff || search_cutoff;
@ -3392,7 +3471,11 @@ Option<bool> Index::search(std::vector<query_tokens_t>& field_query_tokens, cons
max_candidates, facet_infos);
do_facets(facets, facet_query, estimate_facets, facet_sample_percent,
facet_infos, group_limit, group_by_fields, &included_ids_vec[0],
included_ids_vec.size(), max_facet_values, is_wildcard_query, no_filters_provided);
included_ids_vec.size(), max_facet_values, is_wildcard_query, no_filters_provided
#ifdef FORCE_INTERSECTION
, force_intersection
#endif
);
all_result_ids_len += curated_topster->size;

View File

@ -22,23 +22,17 @@ void num_tree_t::insert(int64_t value, uint32_t id, bool is_facet) {
counter_list.emplace_back(value, facet_count);
} else {
auto counter_it = counter_list.begin();
//remove node from list
for(counter_it = counter_list.begin(); counter_it != counter_list.end(); ++counter_it) {
if(counter_it->facet_value == value) {
//found facet in first node
counter_list.erase(counter_it);
break;
}
}
//find position in list and add node with updated count
count_list node(value, facet_count);
for(counter_it = counter_list.begin(); counter_it != counter_list.end(); ++counter_it) {
// LOG (INFO) << "inserting in middle or front facet " << node.facet_value
// << " with count " << node.count;
if(counter_it->count <= facet_count) {
counter_list.emplace(counter_it, node);
if(counter_it->facet_value == value) {
counter_it->count = facet_count;
auto prev_node = std::prev(counter_it);
if(prev_node->count < counter_it->count) {
std::swap(prev_node, counter_it);
}
break;
}
}
@ -389,9 +383,6 @@ size_t num_tree_t::intersect(const uint32_t* result_ids, int result_ids_len, int
ids_t::uncompress(ids, id_list);
const auto ids_len = id_list.size();
for(int i = 0; i < result_ids_len; ++i) {
// if(std::binary_search(id_list.begin(), id_list.end(), result_ids[i])) {
// ++count;
// }
uint32_t* out = nullptr;
count = ArrayUtils::and_scalar(id_list.data(), id_list.size(),
result_ids, result_ids_len, &out);

File diff suppressed because it is too large Load Diff