Simplify facet value filtering.

commit a3def7dc5b
parent 3eed0d509e
@@ -546,8 +546,7 @@ struct facet_count_t {
    // used to fetch the actual document and value for representation
    uint32_t doc_id;
    uint32_t array_pos;

    std::unordered_map<uint32_t, token_pos_cost_t> query_token_pos;
    std::vector<std::string> tokens;
};

struct facet_stats_t {
@@ -567,6 +566,14 @@ struct facet {
    }
};

struct facet_info_t {
    // facet hash => resolved tokens
    std::unordered_map<uint64_t, std::vector<std::string>> hashes;
    bool use_facet_query = false;
    bool should_compute_stats = false;
    field facet_field{"", "", false};
};

struct facet_query_t {
    std::string field_name;
    std::string query;
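A minimal sketch (not part of this commit) of how the new facet_info_t::hashes map is meant to be consumed: the 64-bit hash of a whole facet value maps to the query tokens it resolved to, which later drives highlighting. The hash value below is hypothetical:

    // Sketch only: facet value hash -> resolved tokens (hash value is made up).
    facet_info_t info;
    info.hashes.emplace(0x9e3779b97f4aULL, std::vector<std::string>{"soft", "roc"});

    auto it = info.hashes.find(0x9e3779b97f4aULL);
    if(it != info.hashes.end()) {
        // the facet value passed the facet query; it->second are the tokens to highlight
    }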
@@ -380,8 +380,6 @@ class Index {
private:
    mutable std::shared_mutex mutex;

    static constexpr const uint64_t FACET_ARRAY_DELIMETER = std::numeric_limits<uint64_t>::max();

    std::string name;

    const uint32_t collection_id;
@@ -440,6 +438,7 @@ private:
    void log_leaves(int cost, const std::string &token, const std::vector<art_leaf *> &leaves) const;

    void do_facets(std::vector<facet> & facets, facet_query_t & facet_query,
                   const std::vector<facet_info_t>& facet_infos,
                   size_t group_limit, const std::vector<std::string>& group_by_fields,
                   const uint32_t* result_ids, size_t results_size) const;
@@ -469,9 +468,10 @@ private:
                      const uint32_t* exclude_token_ids,
                      size_t exclude_token_ids_size,
                      size_t& num_tokens_dropped,
                      const std::string & field, uint32_t *filter_ids, size_t filter_ids_length,
                      const field& the_field, const std::string& field_name,
                      const uint32_t *filter_ids, size_t filter_ids_length,
                      const std::vector<uint32_t>& curated_ids,
                      std::vector<facet> & facets, const std::vector<sort_by> & sort_fields,
                      const std::vector<sort_by> & sort_fields,
                      int num_typos, std::vector<std::vector<art_leaf*>> & searched_queries,
                      Topster* topster, spp::sparse_hash_set<uint64_t>& groups_processed,
                      uint32_t** all_result_ids, size_t & all_result_ids_len,
@@ -490,7 +490,7 @@ private:

    void search_candidates(const uint8_t & field_id,
                           bool field_is_array,
                           uint32_t* filter_ids, size_t filter_ids_length,
                           const uint32_t* filter_ids, size_t filter_ids_length,
                           const uint32_t* exclude_token_ids, size_t exclude_token_ids_size,
                           const std::vector<uint32_t>& curated_ids,
                           const std::vector<sort_by> & sort_fields, std::vector<token_candidates> & token_to_candidates,
@@ -742,5 +742,10 @@ public:
                          std::array<spp::sparse_hash_map<uint32_t, int64_t>*, 3>& field_values) const;

    static void remove_matched_tokens(std::vector<std::string>& tokens, const std::set<std::string>& rule_token_set);

    void compute_facet_infos(const std::vector<facet>& facets, facet_query_t& facet_query,
                             const uint32_t* all_result_ids, const size_t& all_result_ids_len,
                             const std::vector<std::string>& group_by_fields,
                             std::vector<facet_info_t>& facet_infos) const;
};
@@ -105,6 +105,13 @@ public:
                                           const std::vector<void*>& posting_lists,
                                           std::unordered_map<size_t, std::vector<token_positions_t>>& array_token_positions
    );

    static void get_exact_matches(const std::vector<void*>& raw_posting_lists, bool field_is_array,
                                  const uint32_t* ids, uint32_t num_ids,
                                  uint32_t*& exact_ids, size_t& num_exact_ids);

    static void get_matching_array_indices(const std::vector<void*>& raw_posting_lists,
                                           uint32_t id, std::vector<size_t>& indices);
};

template<class T>
@@ -79,8 +79,8 @@ public:

        result_iter_state_t() = default;

        result_iter_state_t(uint32_t* excluded_result_ids, size_t excluded_result_ids_size, uint32_t* filter_ids,
                            size_t filter_ids_length) : excluded_result_ids(excluded_result_ids),
        result_iter_state_t(uint32_t* excluded_result_ids, size_t excluded_result_ids_size,
                            const uint32_t* filter_ids, const size_t filter_ids_length) : excluded_result_ids(excluded_result_ids),
                            excluded_result_ids_size(excluded_result_ids_size),
                            filter_ids(filter_ids), filter_ids_length(filter_ids_length) {}
    };
@@ -164,6 +164,13 @@ public:
    );

    static bool is_single_token_verbatim_match(const posting_list_t::iterator_t& it, bool field_is_array);

    static void get_exact_matches(std::vector<iterator_t>& its, bool field_is_array,
                                  const uint32_t* ids, const uint32_t num_ids,
                                  uint32_t*& exact_ids, size_t& num_exact_ids);

    static void get_matching_array_indices(uint32_t id, std::vector<iterator_t>& its,
                                           std::vector<size_t>& indices);
};

template<class T>
@@ -1174,6 +1174,8 @@ Option<nlohmann::json> Collection::search(const std::string & raw_query, const s
            facet_hash_counts.emplace_back(kv);
        }

        auto the_field = search_schema.at(a_facet.field_name);

        // keep only top K facets
        auto max_facets = std::min(max_facet_values, facet_hash_counts.size());
        std::nth_element(facet_hash_counts.begin(), facet_hash_counts.begin() + max_facets,
@@ -1181,7 +1183,11 @@ Option<nlohmann::json> Collection::search(const std::string & raw_query, const s


        std::vector<std::string> facet_query_tokens;
        StringUtils::split(facet_query.query, facet_query_tokens, " ");
        if(the_field.locale.empty() || the_field.locale == "en") {
            StringUtils::split(facet_query.query, facet_query_tokens, " ");
        } else {
            Tokenizer(facet_query.query, true, !the_field.is_string()).tokenize(facet_query_tokens);
        }

        std::vector<facet_value_t> facet_values;
@@ -1207,32 +1213,71 @@ Option<nlohmann::json> Collection::search(const std::string & raw_query, const s
                continue;
            }

            std::vector<std::string> tokens;
            StringUtils::split(value, tokens, " ");
            std::stringstream highlightedss;
            std::unordered_map<std::string, size_t> ftoken_pos;

            // invert query_pos -> token_pos
            spp::sparse_hash_map<uint32_t, uint32_t> token_query_pos;
            for(auto qtoken_pos: facet_count.query_token_pos) {
                token_query_pos.emplace(qtoken_pos.second.pos, qtoken_pos.first);
            for(size_t ti = 0; ti < facet_count.tokens.size(); ti++) {
                if(the_field.is_bool()) {
                    if(facet_count.tokens[ti] == "1") {
                        facet_count.tokens[ti] = "true";
                    } else {
                        facet_count.tokens[ti] = "false";
                    }
                }

                const std::string& resolved_token = facet_count.tokens[ti];
                ftoken_pos[resolved_token] = ti;
            }

            for(size_t i = 0; i < tokens.size(); i++) {
                if(i != 0) {
                    highlightedss << " ";
            const std::string& last_full_q_token = facet_count.tokens.empty() ? "" : facet_count.tokens.back();
            const std::string& last_q_token = facet_query_tokens.empty() ? "" : facet_query_tokens.back();

            // 2 passes: first identify tokens that need to be highlighted and then construct highlighted text

            Tokenizer tokenizer(value, true, !the_field.is_string());
            std::string raw_token;
            size_t raw_token_index = 0, tok_start = 0, tok_end = 0;

            // need an ordered map here to ensure that it is ordered by the key (start offset)
            std::map<size_t, size_t> token_offsets;
            size_t prefix_token_start_index = 0;

            while(tokenizer.next(raw_token, raw_token_index, tok_start, tok_end)) {
                auto token_pos_it = ftoken_pos.find(raw_token);
                if(token_pos_it != ftoken_pos.end()) {
                    token_offsets[tok_start] = tok_end;
                    if(raw_token == last_full_q_token) {
                        prefix_token_start_index = tok_start;
                    }
                }
            }

            auto offset_it = token_offsets.begin();
            size_t i = 0;
            std::stringstream highlightedss;

            while(i < value.size()) {
                if(offset_it != token_offsets.end()) {
                    if (i == offset_it->first) {
                        highlightedss << highlight_start_tag;

                        // loop until end index, accumulate token and complete highlighting
                        size_t token_len = (i == prefix_token_start_index) ?
                                           std::min(last_full_q_token.size(), last_q_token.size()) :
                                           (offset_it->second - i + 1);

                        for(size_t j = 0; j < token_len; j++) {
                            highlightedss << value[i + j];
                        }

                        highlightedss << highlight_end_tag;
                        offset_it++;
                        i += token_len;
                        continue;
                    }
                }

                if(token_query_pos.count(i) != 0) {
                    size_t query_token_len = facet_query_tokens[token_query_pos[i]].size();
                    // handle query token being larger than actual token (typo correction)
                    query_token_len = std::min(query_token_len, tokens[i].size());
                    const std::string & unmarked = tokens[i].substr(query_token_len, std::string::npos);
                    highlightedss << highlight_start_tag <<
                                  tokens[i].substr(0, query_token_len) <<
                                  highlight_end_tag << unmarked;
                } else {
                    highlightedss << tokens[i];
                }
                highlightedss << value[i];
                i++;
            }

            facet_value_t facet_value = {value, highlightedss.str(), facet_count.count};
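The two passes above work as follows: the first pass records the start and end offsets of every token in the raw value that resolved against the facet query (plus where the prefix token begins), and the second pass walks the value once, wrapping each recorded span in the highlight tags; the final (prefix) query token is only highlighted up to the typed prefix length. Per the tests added later in this commit, the value "Country Punk Rock" with facet query "punk roc" renders as "Country <mark>Punk</mark> <mark>Roc</mark>k".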
@@ -1414,7 +1459,9 @@ bool Collection::facet_value_to_string(const facet &a_facet, const facet_count_t
    } else if(facet_schema.at(a_facet.field_name).type == field_types::FLOAT) {
        float raw_val = document[a_facet.field_name].get<float>();
        value = StringUtils::float_to_str(raw_val);
        value.erase ( value.find_last_not_of('0') + 1, std::string::npos ); // remove trailing zeros
        if(value != "0") {
            value.erase ( value.find_last_not_of('0') + 1, std::string::npos ); // remove trailing zeros
        }
    } else if(facet_schema.at(a_facet.field_name).type == field_types::FLOAT_ARRAY) {
        float raw_val = document[a_facet.field_name][facet_count.array_pos].get<float>();
        value = StringUtils::float_to_str(raw_val);
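The new guard exists because stripping trailing zeros from the literal string "0" would empty it. A worked illustration of the edge case (sketch, not from the diff):

    std::string value = "0";
    // find_last_not_of('0') returns npos when every character is '0',
    // and npos + 1 wraps to 0, so the erase below would wipe the string:
    value.erase(value.find_last_not_of('0') + 1, std::string::npos);
    // value is now "" -- hence the `if(value != "0")` guard above.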
@@ -292,12 +292,6 @@ Option<bool> filter::parse_filter_query(const string& simple_filter_query,
            NUM_COMPARATOR str_comparator = CONTAINS;

            if(raw_value[0] == '=') {
                if(!_field.facet) {
                    // EQUALS filtering on string is possible only on facet fields
                    return Option<bool>(400, "To perform exact filtering, filter field `" +
                                        _field.name + "` must be a facet field.");
                }

                // string filter should be evaluated in strict "equals" mode
                str_comparator = EQUALS;
                while(++filter_value_index < raw_value.size() && raw_value[filter_value_index] == ' ');
src/index.cpp
@@ -294,15 +294,9 @@ Option<uint32_t> Index::index_in_memory(const index_record& record, uint32_t seq

            art_tree *t = search_index.at(field_pair.second.faceted_name());

            if(field_pair.second.is_array()) {
                index_strings_field(points, t, seq_id, is_facet, field_pair.second,
                                    field_index_it->second.offsets,
                                    field_index_it->second.facet_hashes);
            } else {
                index_strings_field(points, t, seq_id, is_facet, field_pair.second,
                                    field_index_it->second.offsets,
                                    field_index_it->second.facet_hashes);
            }
            index_strings_field(points, t, seq_id, is_facet, field_pair.second,
                                field_index_it->second.offsets,
                                field_index_it->second.facet_hashes);
        }

        if(field_pair.second.is_string()) {
@@ -762,11 +756,6 @@ void Index::tokenize_string_with_facets(const std::string& text, bool is_facet,
            continue;
        }

        if(is_facet) {
            uint64_t hash = Index::facet_token_hash(a_field, token);
            facet_hashes.push_back(hash);
        }

        token_to_offsets[token].push_back(token_index + 1);
        last_token = token;
    }
@@ -775,6 +764,11 @@ void Index::tokenize_string_with_facets(const std::string& text, bool is_facet,
        // push 0 for the last occurring token (used for exact match ranking)
        token_to_offsets[last_token].push_back(0);
    }

    if(is_facet) {
        uint64_t hash = Index::facet_token_hash(a_field, text);
        facet_hashes.push_back(hash);
    }
}

void Index::index_strings_field(const int64_t score, art_tree *t,
@@ -824,12 +818,6 @@ void Index::tokenize_string_array_with_facets(const std::vector<std::string>& st
            continue;
        }

        if(is_facet) {
            uint64_t hash = facet_token_hash(a_field, token);
            facet_hashes.push_back(hash);
            //LOG(INFO) << "indexing " << token << ", hash:" << hash;
        }

        token_to_offsets[token].push_back(token_index + 1);
        token_set.insert(token);
        last_token = token;
@@ -842,7 +830,9 @@ void Index::tokenize_string_array_with_facets(const std::vector<std::string>& st
        }

        if(is_facet) {
            facet_hashes.push_back(FACET_ARRAY_DELIMETER); // as a delimiter
            uint64_t hash = facet_token_hash(a_field, str);
            //LOG(INFO) << "indexing " << token << ", hash:" << hash;
            facet_hashes.push_back(hash);
        }

        for(auto& the_token: token_set) {
@@ -893,84 +883,16 @@ void Index::compute_facet_stats(facet &a_facet, uint64_t raw_value, const std::s
}

void Index::do_facets(std::vector<facet> & facets, facet_query_t & facet_query,
                      const std::vector<facet_info_t>& facet_infos,
                      const size_t group_limit, const std::vector<std::string>& group_by_fields,
                      const uint32_t* result_ids, size_t results_size) const {

    struct facet_info_t {
        // facet hash => token position in the query
        std::unordered_map<uint64_t, token_pos_cost_t> fhash_qtoken_pos;

        bool use_facet_query = false;
        bool should_compute_stats = false;
        field facet_field{"", "", false};
    };

    std::vector<facet_info_t> facet_infos(facets.size());

    for(size_t findex=0; findex < facets.size(); findex++) {
        const auto& a_facet = facets[findex];

        facet_infos[findex].use_facet_query = false;

        const field &facet_field = facet_schema.at(a_facet.field_name);
        facet_infos[findex].facet_field = facet_field;

        facet_infos[findex].should_compute_stats = (facet_field.type != field_types::STRING &&
                                                    facet_field.type != field_types::BOOL &&
                                                    facet_field.type != field_types::STRING_ARRAY &&
                                                    facet_field.type != field_types::BOOL_ARRAY);

        if(a_facet.field_name == facet_query.field_name && !facet_query.query.empty()) {
            facet_infos[findex].use_facet_query = true;

            if (facet_field.is_bool()) {
                if (facet_query.query == "true") {
                    facet_query.query = "1";
                } else if (facet_query.query == "false") {
                    facet_query.query = "0";
                }
            }

            // for non-string fields, `faceted_name` returns their aliased stringified field name
            art_tree *t = search_index.at(facet_field.faceted_name());

            std::vector<std::string> query_tokens;
            Tokenizer(facet_query.query, true, !facet_field.is_string()).tokenize(query_tokens);

            for (size_t qtoken_index = 0; qtoken_index < query_tokens.size(); qtoken_index++) {
                auto &q = query_tokens[qtoken_index];

                int bounded_cost = (q.size() < 3) ? 0 : 1;
                bool prefix_search = (qtoken_index ==
                                      (query_tokens.size() - 1)); // only last token must be used as prefix

                std::vector<art_leaf *> leaves;

                const size_t q_len = prefix_search ? q.length() : q.length() + 1;
                art_fuzzy_search(t, (const unsigned char *) q.c_str(),
                                 q_len, 0, bounded_cost, 10000,
                                 token_ordering::MAX_SCORE, prefix_search, nullptr, 0, leaves);

                for (size_t leaf_index = 0; leaf_index < leaves.size(); leaf_index++) {
                    const auto &leaf = leaves[leaf_index];
                    // calculate hash without terminating null char
                    std::string key_str((const char *) leaf->key, leaf->key_len - 1);
                    uint64_t hash = facet_token_hash(facet_field, key_str);

                    token_pos_cost_t token_pos_cost = {qtoken_index, 0};
                    facet_infos[findex].fhash_qtoken_pos.emplace(hash, token_pos_cost);
                    //printf("%.*s - %llu\n", leaf->key_len, leaf->key, hash);
                }
            }
        }
    }


    // assumed that facet fields have already been validated upstream
    for(size_t findex=0; findex < facets.size(); findex++) {
        auto& a_facet = facets[findex];
        const auto& facet_field = facet_infos[findex].facet_field;
        const bool use_facet_query = facet_infos[findex].use_facet_query;
        const auto& fhash_qtoken_pos = facet_infos[findex].fhash_qtoken_pos;
        const auto& fquery_hashes = facet_infos[findex].hashes;
        const bool should_compute_stats = facet_infos[findex].should_compute_stats;

        const auto& field_facet_mapping_it = facet_index_v3.find(a_facet.field_name);
@@ -988,91 +910,38 @@ void Index::do_facets(std::vector<facet> & facets, facet_query_t & facet_query,
                continue;
            }

            // FORMAT OF VALUES
            // String: h1 h2 h3
            // String array: h1 h2 h3 0 h1 0 h1 h2 0
            const auto& facet_hashes = facet_hashes_it->second;

            const uint64_t distinct_id = group_limit ? get_distinct_id(group_by_fields, doc_seq_id) : 0;

            int array_pos = 0;
            bool fvalue_found = false;
            uint64_t combined_hash = 1; // for hashing the entire facet value (multiple tokens)

            std::unordered_map<uint32_t, token_pos_cost_t> query_token_positions;
            size_t field_token_index = -1;
            auto fhashes = facet_hashes.hashes;

            for(size_t j = 0; j < facet_hashes.size(); j++) {
                if(fhashes[j] != FACET_ARRAY_DELIMETER) {
                    uint64_t ftoken_hash = fhashes[j];
                    field_token_index++;
                auto fhash = facet_hashes.hashes[j];

                    // reference: https://stackoverflow.com/a/4182771/131050
                    // we also include token index to maintain orderliness
                    combined_hash *= (1779033703 + 2*ftoken_hash*(field_token_index+1));

                    // ftoken_hash is the raw value for numeric fields
                    if(should_compute_stats) {
                        compute_facet_stats(a_facet, ftoken_hash, facet_field.type);
                    }

                    const auto fhash_qtoken_pos_it = fhash_qtoken_pos.find(ftoken_hash);

                    // not using facet query or this particular facet value is found in facet filter
                    if(!use_facet_query || fhash_qtoken_pos_it != fhash_qtoken_pos.end()) {
                        fvalue_found = true;

                        if(use_facet_query) {
                            // map token index to query index (used for highlighting later on)
                            const token_pos_cost_t& qtoken_pos = fhash_qtoken_pos_it->second;

                            // if the query token has already matched another token in the string
                            // we will replace the position only if the cost is lower
                            if(query_token_positions.find(qtoken_pos.pos) == query_token_positions.end() ||
                               query_token_positions[qtoken_pos.pos].cost >= qtoken_pos.cost ) {
                                token_pos_cost_t ftoken_pos_cost = {field_token_index, qtoken_pos.cost};
                                query_token_positions[qtoken_pos.pos] = ftoken_pos_cost;
                            }
                        }
                    }
                if(should_compute_stats) {
                    compute_facet_stats(a_facet, fhash, facet_field.type);
                }

                // 0 indicates separator, while the second condition checks for non-array string
                if(fhashes[j] == FACET_ARRAY_DELIMETER || (facet_hashes.back() != FACET_ARRAY_DELIMETER && j == facet_hashes.size() - 1)) {
                    if(!use_facet_query || fvalue_found) {
                        uint64_t fhash = combined_hash;

                        if(a_facet.result_map.count(fhash) == 0) {
                            a_facet.result_map.emplace(fhash, facet_count_t{0, spp::sparse_hash_set<uint64_t>(),
                                                                            doc_seq_id, 0,
                                                                            std::unordered_map<uint32_t, token_pos_cost_t>()});
                        }

                        facet_count_t& facet_count = a_facet.result_map[fhash];

                        /*LOG(INFO) << "field: " << a_facet.field_name << ", doc id: " << doc_seq_id
                                    << ", hash: " << fhash;*/

                        facet_count.doc_id = doc_seq_id;
                        facet_count.array_pos = array_pos;

                        if(group_limit) {
                            facet_count.groups.emplace(distinct_id);
                        } else {
                            facet_count.count += 1;
                        }

                        if(use_facet_query) {
                            facet_count.query_token_pos = query_token_positions;
                        }
                if(!use_facet_query || fquery_hashes.find(fhash) != fquery_hashes.end()) {
                    if(a_facet.result_map.count(fhash) == 0) {
                        a_facet.result_map.emplace(fhash, facet_count_t{0, spp::sparse_hash_set<uint64_t>(),
                                                                        doc_seq_id, 0, {}});
                    }

                    array_pos++;
                    fvalue_found = false;
                    combined_hash = 1;
                    std::unordered_map<uint32_t, token_pos_cost_t>().swap(query_token_positions);
                    field_token_index = -1;
                    facet_count_t& facet_count = a_facet.result_map[fhash];

                    //LOG(INFO) << "field: " << a_facet.field_name << ", doc id: " << doc_seq_id << ", hash: " << fhash;

                    facet_count.doc_id = doc_seq_id;
                    facet_count.array_pos = j;

                    if(group_limit) {
                        facet_count.groups.emplace(distinct_id);
                    } else {
                        facet_count.count += 1;
                    }

                    if(use_facet_query) {
                        facet_count.tokens = fquery_hashes.at(fhash);
                    }
                }
            }
        }
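For context on the removed scheme: per the StackOverflow reference in the deleted comments, per-token hashes were folded into a single order-sensitive value hash, so "punk rock" and "rock punk" hashed differently; the commit replaces this with one hash of the whole string value computed at index time. A sketch of the removed combiner (token_hashes is a hypothetical per-token hash list):

    uint64_t combined_hash = 1;
    for(size_t i = 0; i < token_hashes.size(); i++) {
        // (i + 1) ties each hash to its token position, preserving order
        combined_hash *= (1779033703 + 2 * token_hashes[i] * (i + 1));
    }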
@@ -1095,7 +964,7 @@ void Index::aggregate_topster(Topster* agg_topster, Topster* index_topster) {
}

void Index::search_candidates(const uint8_t & field_id, bool field_is_array,
                              uint32_t* filter_ids, size_t filter_ids_length,
                              const uint32_t* filter_ids, size_t filter_ids_length,
                              const uint32_t* exclude_token_ids, size_t exclude_token_ids_size,
                              const std::vector<uint32_t>& curated_ids,
                              const std::vector<sort_by> & sort_fields,
@@ -1531,47 +1400,8 @@ void Index::do_filtering(uint32_t*& filter_ids, uint32_t& filter_ids_length,
                    uint32_t* exact_strt_ids = new uint32_t[strt_ids_size];
                    size_t exact_strt_size = 0;

                    for(size_t strt_ids_index = 0; strt_ids_index < strt_ids_size; strt_ids_index++) {
                        uint32_t seq_id = strt_ids[strt_ids_index];
                        const auto& fvalues = facet_index_v3.at(f.name)->at(seq_id);
                        bool found_filter = false;

                        if(!f.is_array()) {
                            found_filter = (posting_lists.size() == fvalues.length);
                        } else {
                            uint64_t filter_hash = 1;

                            for(size_t sindex=0; sindex < str_tokens.size(); sindex++) {
                                auto& this_str_token = str_tokens[sindex];
                                uint64_t thash = facet_token_hash(f, this_str_token);
                                filter_hash *= (1779033703 + 2*thash*(sindex+1));
                            }

                            uint64_t all_fvalue_hash = 1;
                            size_t ftindex = 0;

                            for(size_t findex=0; findex < fvalues.size(); findex++) {
                                auto fhash = fvalues.hashes[findex];
                                if(fhash == FACET_ARRAY_DELIMETER) {
                                    // end of array, check hash
                                    if(all_fvalue_hash == filter_hash) {
                                        found_filter = true;
                                        break;
                                    }
                                    all_fvalue_hash = 1;
                                    ftindex = 0;
                                } else {
                                    all_fvalue_hash *= (1779033703 + 2*fhash*(ftindex + 1));
                                    ftindex++;
                                }
                            }
                        }

                        if(found_filter) {
                            exact_strt_ids[exact_strt_size] = seq_id;
                            exact_strt_size++;
                        }
                    }
                    posting_t::get_exact_matches(posting_lists, f.is_array(), strt_ids, strt_ids_size,
                                                 exact_strt_ids, exact_strt_size);

                    delete[] strt_ids;
                    strt_ids = exact_strt_ids;
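The hand-rolled hash comparison above is replaced by one call into the posting-list layer, which verifies verbatim matches from token offsets instead of facet hashes, so exact filtering no longer depends on the facet index at all. A minimal usage sketch, assuming posting_lists holds one posting list per filter token:

    uint32_t* exact_strt_ids = new uint32_t[strt_ids_size]; // worst case: every id matches
    size_t exact_strt_size = 0;

    // keeps only the ids whose field value matches the filter tokens verbatim
    // (array-aware when the field is an array)
    posting_t::get_exact_matches(posting_lists, f.is_array(), strt_ids, strt_ids_size,
                                 exact_strt_ids, exact_strt_size);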
@@ -2000,8 +1830,14 @@ bool Index::check_for_overrides(const token_ordering& token_order, const string&
        std::set<uint64> query_hashes;

        size_t num_toks_dropped = 0;
        search_field(0, window_tokens, search_tokens, nullptr, 0, num_toks_dropped, field_name,
                     nullptr, 0, {}, facets, {}, 2, searched_queries, topster, groups_processed,

        auto field_it = search_schema.find(field_name);
        if(field_it == search_schema.end()) {
            continue;
        }

        search_field(0, window_tokens, search_tokens, nullptr, 0, num_toks_dropped, field_it->second, field_name,
                     nullptr, 0, {}, {}, 2, searched_queries, topster, groups_processed,
                     &result_ids, result_ids_len, field_num_results, 0, group_by_fields,
                     false, 4, query_hashes, token_order, false, 0, 1, false, 3, 7);
@@ -2175,6 +2011,11 @@ void Index::search(std::vector<query_tokens_t>& field_query_tokens,
            const uint8_t field_id = (uint8_t)(FIELD_LIMIT_NUM - i);
            const std::string& field_name = search_fields[i].name;

            auto field_it = search_schema.find(field_name);
            if(field_it == search_schema.end()) {
                continue;
            }

            std::vector<token_t> query_tokens = q_include_pos_tokens;
            std::vector<token_t> search_tokens = q_include_pos_tokens;
            size_t num_tokens_dropped = 0;
@@ -2190,8 +2031,9 @@ void Index::search(std::vector<query_tokens_t>& field_query_tokens,
            size_t field_num_results = 0;
            std::set<uint64> query_hashes;

            search_field(field_id, query_tokens, search_tokens, exclude_token_ids, exclude_token_ids_size, num_tokens_dropped,
                         field_name, filter_ids, filter_ids_length, curated_ids_sorted, facets, sort_fields_std,
            search_field(field_id, query_tokens, search_tokens, exclude_token_ids, exclude_token_ids_size,
                         num_tokens_dropped, field_it->second, field_name,
                         filter_ids, filter_ids_length, curated_ids_sorted, sort_fields_std,
                         field_num_typos, searched_queries, actual_topster, groups_processed, &all_result_ids, all_result_ids_len,
                         field_num_results, group_limit, group_by_fields, prioritize_exact_match, concurrency,
                         query_hashes, token_order, field_prefix,
@@ -2224,7 +2066,7 @@ void Index::search(std::vector<query_tokens_t>& field_query_tokens,
                                  all_result_ids, all_result_ids_len, filter_ids, filter_ids_length);
            } else {
                search_field(field_id, query_tokens, search_tokens, exclude_token_ids, exclude_token_ids_size, num_tokens_dropped,
                             field_name, filter_ids, filter_ids_length, curated_ids_sorted, facets, sort_fields_std,
                             field_it->second, field_name, filter_ids, filter_ids_length, curated_ids_sorted, sort_fields_std,
                             field_num_typos, searched_queries, actual_topster, groups_processed, &all_result_ids, all_result_ids_len,
                             field_num_results, group_limit, group_by_fields, prioritize_exact_match, concurrency,
                             query_hashes, token_order, field_prefix,
@@ -2437,6 +2279,10 @@ void Index::search(std::vector<query_tokens_t>& field_query_tokens,
        std::mutex m_process;
        std::condition_variable cv_process;

        std::vector<facet_info_t> facet_infos(facets.size());
        compute_facet_infos(facets, facet_query, all_result_ids, all_result_ids_len,
                            group_by_fields, facet_infos);

        std::vector<std::vector<facet>> facet_batches(num_threads);
        for(size_t i = 0; i < num_threads; i++) {
            for(const auto& this_facet: facets) {
@@ -2447,6 +2293,8 @@ void Index::search(std::vector<query_tokens_t>& field_query_tokens,
        size_t num_queued = 0;
        size_t result_index = 0;

        //auto beginF = std::chrono::high_resolution_clock::now();

        for(size_t thread_id = 0; thread_id < num_threads && result_index < all_result_ids_len; thread_id++) {
            size_t batch_res_len = window_size;
@@ -2458,9 +2306,10 @@ void Index::search(std::vector<query_tokens_t>& field_query_tokens,
            num_queued++;

            thread_pool->enqueue([this, thread_id, &facet_batches, &facet_query, group_limit, group_by_fields,
                                  batch_result_ids, batch_res_len, &num_processed, &m_process, &cv_process]() {
                                  batch_result_ids, batch_res_len, &facet_infos,
                                  &num_processed, &m_process, &cv_process]() {
                auto fq = facet_query;
                do_facets(facet_batches[thread_id], fq, group_limit, group_by_fields,
                do_facets(facet_batches[thread_id], fq, facet_infos, group_limit, group_by_fields,
                          batch_result_ids, batch_res_len);
                std::unique_lock<std::mutex> lock(m_process);
                num_processed++;
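Note the ordering here: facet_infos is computed once over the full result set before the worker threads are spawned, and each thread then only reads it while faceting its own batch, so no synchronization around it is needed. A condensed sketch of the flow (names as in the diff):

    std::vector<facet_info_t> facet_infos(facets.size());
    compute_facet_infos(facets, facet_query, all_result_ids, all_result_ids_len,
                        group_by_fields, facet_infos);   // single up-front pass

    // per worker thread: read-only access to facet_infos
    do_facets(facet_batches[thread_id], fq, facet_infos, group_limit, group_by_fields,
              batch_result_ids, batch_res_len);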
@@ -2497,7 +2346,7 @@ void Index::search(std::vector<query_tokens_t>& field_query_tokens,

                    acc_facet.result_map[facet_kv.first].doc_id = facet_kv.second.doc_id;
                    acc_facet.result_map[facet_kv.first].array_pos = facet_kv.second.array_pos;
                    acc_facet.result_map[facet_kv.first].query_token_pos = facet_kv.second.query_token_pos;
                    acc_facet.result_map[facet_kv.first].tokens = facet_kv.second.tokens;
                }

                if(this_facet.stats.fvcount != 0) {
@@ -2508,9 +2357,15 @@ void Index::search(std::vector<query_tokens_t>& field_query_tokens,
                }
            }
        }

        /*long long int timeMillisF = std::chrono::duration_cast<std::chrono::milliseconds>(
                std::chrono::high_resolution_clock::now() - beginF).count();
        LOG(INFO) << "Time for faceting: " << timeMillisF;*/
    }

    do_facets(facets, facet_query, group_limit, group_by_fields, &included_ids[0], included_ids.size());
    std::vector<facet_info_t> facet_infos(facets.size());
    compute_facet_infos(facets, facet_query, &included_ids[0], included_ids.size(), group_by_fields, facet_infos);
    do_facets(facets, facet_query, facet_infos, group_limit, group_by_fields, &included_ids[0], included_ids.size());

    all_result_ids_len += curated_topster->size;
@@ -2526,6 +2381,141 @@ void Index::search(std::vector<query_tokens_t>& field_query_tokens,
    //LOG(INFO) << "Time taken for result calc: " << timeMillis << "ms";
}

void Index::compute_facet_infos(const std::vector<facet>& facets, facet_query_t& facet_query,
                                const uint32_t* all_result_ids, const size_t& all_result_ids_len,
                                const std::vector<std::string>& group_by_fields,
                                std::vector<facet_info_t>& facet_infos) const {

    if(all_result_ids_len == 0) {
        return;
    }

    for(size_t findex=0; findex < facets.size(); findex++) {
        const auto& a_facet = facets[findex];

        const auto field_facet_mapping_it = facet_index_v3.find(a_facet.field_name);
        if(field_facet_mapping_it == facet_index_v3.end()) {
            continue;
        }

        facet_infos[findex].use_facet_query = false;

        const field &facet_field = facet_schema.at(a_facet.field_name);
        facet_infos[findex].facet_field = facet_field;

        facet_infos[findex].should_compute_stats = (facet_field.type != field_types::STRING &&
                                                    facet_field.type != field_types::BOOL &&
                                                    facet_field.type != field_types::STRING_ARRAY &&
                                                    facet_field.type != field_types::BOOL_ARRAY);

        if(a_facet.field_name == facet_query.field_name && !facet_query.query.empty()) {
            facet_infos[findex].use_facet_query = true;

            if (facet_field.is_bool()) {
                if (facet_query.query == "true") {
                    facet_query.query = "1";
                } else if (facet_query.query == "false") {
                    facet_query.query = "0";
                }
            }

            //LOG(INFO) << "facet_query.query: " << facet_query.query;

            std::vector<std::string> query_tokens;
            Tokenizer(facet_query.query, true, !facet_field.is_string()).tokenize(query_tokens);

            std::vector<token_t> search_tokens, qtokens;

            for (size_t qtoken_index = 0; qtoken_index < query_tokens.size(); qtoken_index++) {
                search_tokens.emplace_back(token_t{qtoken_index, query_tokens[qtoken_index]});
                qtokens.emplace_back(token_t{qtoken_index, query_tokens[qtoken_index]});
            }

            std::vector<std::vector<art_leaf*>> searched_queries;
            Topster* topster = nullptr;
            spp::sparse_hash_set<uint64_t> groups_processed;
            uint32_t* field_result_ids = nullptr;
            size_t field_result_ids_len = 0;
            size_t field_num_results = 0;
            std::set<uint64> query_hashes;
            size_t num_toks_dropped = 0;

            search_field(0, qtokens, search_tokens, nullptr, 0, num_toks_dropped,
                         facet_field, facet_field.faceted_name(),
                         all_result_ids, all_result_ids_len, {}, {}, 2, searched_queries, topster, groups_processed,
                         &field_result_ids, field_result_ids_len, field_num_results, 0, group_by_fields,
                         false, 4, query_hashes, MAX_SCORE, true, 0, 1, false, 3, 1000);

            //LOG(INFO) << "searched_queries.size: " << searched_queries.size();

            // NOTE: `field_result_ids` will consist of IDs across ALL queries in searched_queries

            for(size_t si = 0; si < searched_queries.size(); si++) {
                const auto& searched_query = searched_queries[si];
                std::vector<std::string> searched_tokens;

                std::vector<void*> posting_lists;
                for(auto leaf: searched_query) {
                    posting_lists.push_back(leaf->values);
                    std::string tok(reinterpret_cast<char*>(leaf->key), leaf->key_len - 1);
                    searched_tokens.push_back(tok);
                    //LOG(INFO) << "tok: " << tok;
                }

                //LOG(INFO) << "si: " << si << ", field_result_ids_len: " << field_result_ids_len;

                for(size_t i = 0; i < std::min<size_t>(1000, field_result_ids_len); i++) {
                    uint32_t seq_id = field_result_ids[i];

                    const auto doc_fvalues_it = field_facet_mapping_it->second->find(seq_id);
                    if(doc_fvalues_it == field_facet_mapping_it->second->end()) {
                        continue;
                    }

                    bool id_matched = true;

                    for(auto pl: posting_lists) {
                        if(!posting_t::contains(pl, seq_id)) {
                            // need to ensure that document ID actually contains both searched_query tokens
                            id_matched = false;
                            break;
                        }
                    }

                    if(!id_matched) {
                        continue;
                    }

                    if(facet_field.is_array()) {
                        std::vector<size_t> array_indices;
                        posting_t::get_matching_array_indices(posting_lists, seq_id, array_indices);

                        for(size_t array_index: array_indices) {
                            if(array_index < doc_fvalues_it->second.length) {
                                uint64_t hash = doc_fvalues_it->second.hashes[array_index];

                                /*LOG(INFO) << "seq_id: " << seq_id << ", hash: " << hash << ", array index: "
                                          << array_index;*/

                                if(facet_infos[findex].hashes.count(hash) == 0) {
                                    facet_infos[findex].hashes.emplace(hash, searched_tokens);
                                }
                            }
                        }
                    } else {
                        uint64_t hash = doc_fvalues_it->second.hashes[0];
                        if(facet_infos[findex].hashes.count(hash) == 0) {
                            facet_infos[findex].hashes.emplace(hash, searched_tokens);
                        }
                    }
                }
            }

            delete [] field_result_ids;
        }
    }
}

void Index::curate_filtered_ids(const std::vector<filter>& filters, const std::set<uint32_t>& curated_ids,
                                const uint32_t* exclude_token_ids, size_t exclude_token_ids_size,
                                uint32_t*& filter_ids, uint32_t& filter_ids_length,
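In short, the new helper runs the facet query as an ordinary fuzzy/prefix search scoped to the current results, then records which facet values it matched. A condensed contract sketch (an assumed simplification, not part of the diff):

    // For each facet with an active facet query:
    //   1. tokenize facet_query.query ("true"/"false" normalized to "1"/"0" for bools)
    //   2. search_field(...) over the faceted field, restricted to all_result_ids
    //   3. for each matched doc (capped at 1000), verify it contains every searched
    //      token, then record facet-value hash -> searched tokens
    // do_facets() later keeps only facet values whose hash appears in this map.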
@@ -2642,10 +2632,10 @@ void Index::search_field(const uint8_t & field_id,
                         const uint32_t* exclude_token_ids,
                         size_t exclude_token_ids_size,
                         size_t& num_tokens_dropped,
                         const std::string & field,
                         uint32_t *filter_ids, size_t filter_ids_length,
                         const field& the_field, const std::string& field_name, // to handle faceted index
                         const uint32_t *filter_ids, size_t filter_ids_length,
                         const std::vector<uint32_t>& curated_ids,
                         std::vector<facet> & facets, const std::vector<sort_by> & sort_fields, const int num_typos,
                         const std::vector<sort_by> & sort_fields, const int num_typos,
                         std::vector<std::vector<art_leaf*>> & searched_queries,
                         Topster* topster, spp::sparse_hash_set<uint64_t>& groups_processed,
                         uint32_t** all_result_ids, size_t & all_result_ids_len, size_t& field_num_results,
@@ -2663,13 +2653,6 @@ void Index::search_field(const uint8_t & field_id,
    // NOTE: `query_tokens` preserve original tokens, while `search_tokens` could be a result of dropped tokens

    size_t max_cost = (num_typos < 0 || num_typos > 2) ? 2 : num_typos;
    auto field_it = search_schema.find(field);

    if(field_it == search_schema.end()) {
        return;
    }

    auto& the_field = field_it->second;

    if(the_field.locale != "" && the_field.locale != "en") {
        // disable fuzzy trie traversal for non-english locales
|
||||
const size_t token_len = prefix_search ? (int) token.length() : (int) token.length() + 1;
|
||||
|
||||
// need less candidates for filtered searches since we already only pick tokens with results
|
||||
art_fuzzy_search(search_index.at(field), (const unsigned char *) token.c_str(), token_len,
|
||||
art_fuzzy_search(search_index.at(field_name), (const unsigned char *) token.c_str(), token_len,
|
||||
costs[token_index], costs[token_index], num_fuzzy_candidates, token_order, prefix_search,
|
||||
filter_ids, filter_ids_length, leaves, unique_tokens);
|
||||
|
||||
@@ -2834,7 +2817,7 @@ void Index::search_field(const uint8_t & field_id,
    }

    return search_field(field_id, query_tokens, truncated_tokens, exclude_token_ids, exclude_token_ids_size,
                        num_tokens_dropped, field, filter_ids, filter_ids_length, curated_ids, facets,
                        num_tokens_dropped, the_field, field_name, filter_ids, filter_ids_length, curated_ids,
                        sort_fields, num_typos, searched_queries, topster, groups_processed, all_result_ids,
                        all_result_ids_len, field_num_results, group_limit, group_by_fields,
                        prioritize_exact_match, concurrency, query_hashes,
|
||||
bool single_exact_query_token,
|
||||
const std::vector<posting_list_t::iterator_t>& posting_lists) const {
|
||||
|
||||
spp::sparse_hash_map<uint32_t, int64_t>* TEXT_MATCH_SENTINEL = &text_match_sentinel_value;
|
||||
spp::sparse_hash_map<uint32_t, int64_t>* SEQ_ID_SENTINEL = &seq_id_sentinel_value;
|
||||
spp::sparse_hash_map<uint32_t, int64_t>* GEO_SENTINEL = &geo_sentinel_value;
|
||||
|
||||
int64_t geopoint_distances[3];
|
||||
|
||||
for(auto& i: geopoint_indices) {
|
||||
@ -2937,7 +2916,7 @@ void Index::score_results(const std::vector<sort_by> & sort_fields, const uint16
|
||||
geopoint_distances[i] = dist;
|
||||
|
||||
// Swap (id -> latlong) index to (id -> distance) index
|
||||
field_values[i] = GEO_SENTINEL;
|
||||
field_values[i] = &geo_sentinel_value;
|
||||
}
|
||||
|
||||
//auto begin = std::chrono::high_resolution_clock::now();
|
||||
@@ -3001,12 +2980,12 @@ void Index::score_results(const std::vector<sort_by> & sort_fields, const uint16

        // avoiding loop
        if (sort_fields.size() > 0) {
            if (field_values[0] == TEXT_MATCH_SENTINEL) {
            if (field_values[0] == &text_match_sentinel_value) {
                scores[0] = int64_t(match_score);
                match_score_index = 0;
            } else if (field_values[0] == SEQ_ID_SENTINEL) {
            } else if (field_values[0] == &seq_id_sentinel_value) {
                scores[0] = seq_id;
            } else if(field_values[0] == GEO_SENTINEL) {
            } else if(field_values[0] == &geo_sentinel_value) {
                scores[0] = geopoint_distances[0];
            } else {
                auto it = field_values[0]->find(seq_id);
@@ -3019,12 +2998,12 @@ void Index::score_results(const std::vector<sort_by> & sort_fields, const uint16
        }

        if(sort_fields.size() > 1) {
            if (field_values[1] == TEXT_MATCH_SENTINEL) {
            if (field_values[1] == &text_match_sentinel_value) {
                scores[1] = int64_t(match_score);
                match_score_index = 1;
            } else if (field_values[1] == SEQ_ID_SENTINEL) {
            } else if (field_values[1] == &seq_id_sentinel_value) {
                scores[1] = seq_id;
            } else if(field_values[1] == GEO_SENTINEL) {
            } else if(field_values[1] == &geo_sentinel_value) {
                scores[1] = geopoint_distances[1];
            } else {
                auto it = field_values[1]->find(seq_id);
@@ -3037,12 +3016,12 @@ void Index::score_results(const std::vector<sort_by> & sort_fields, const uint16
        }

        if(sort_fields.size() > 2) {
            if (field_values[2] == TEXT_MATCH_SENTINEL) {
            if (field_values[2] == &text_match_sentinel_value) {
                scores[2] = int64_t(match_score);
                match_score_index = 2;
            } else if (field_values[2] == SEQ_ID_SENTINEL) {
            } else if (field_values[2] == &seq_id_sentinel_value) {
                scores[2] = seq_id;
            } else if(field_values[2] == GEO_SENTINEL) {
            } else if(field_values[2] == &geo_sentinel_value) {
                scores[2] = geopoint_distances[2];
            } else {
                auto it = field_values[2]->find(seq_id);
@@ -447,6 +447,46 @@ void posting_t::get_array_token_positions(uint32_t id, const std::vector<void*>&
    }
}

void posting_t::get_exact_matches(const std::vector<void*>& raw_posting_lists, const bool field_is_array,
                                  const uint32_t* ids, const uint32_t num_ids,
                                  uint32_t*& exact_ids, size_t& num_exact_ids) {

    std::vector<posting_list_t*> plists;
    std::vector<posting_list_t*> expanded_plists;
    to_expanded_plists(raw_posting_lists, plists, expanded_plists);

    std::vector<posting_list_t::iterator_t> its;

    for(posting_list_t* pl: plists) {
        its.push_back(pl->new_iterator());
    }

    posting_list_t::get_exact_matches(its, field_is_array, ids, num_ids, exact_ids, num_exact_ids);

    for(posting_list_t* expanded_plist: expanded_plists) {
        delete expanded_plist;
    }
}

void posting_t::get_matching_array_indices(const std::vector<void*>& raw_posting_lists,
                                           uint32_t id, std::vector<size_t>& indices) {
    std::vector<posting_list_t*> plists;
    std::vector<posting_list_t*> expanded_plists;
    to_expanded_plists(raw_posting_lists, plists, expanded_plists);

    std::vector<posting_list_t::iterator_t> its;

    for(posting_list_t* pl: plists) {
        its.push_back(pl->new_iterator());
    }

    posting_list_t::get_matching_array_indices(id, its, indices);

    for(posting_list_t* expanded_plist: expanded_plists) {
        delete expanded_plist;
    }
}

void posting_t::block_intersector_t::split_lists(size_t concurrency,
                                                 std::vector<std::vector<posting_list_t::iterator_t>>& partial_its_vec) {
    const size_t num_blocks = this->plists[0]->num_blocks();
@@ -1,4 +1,5 @@
#include "posting_list.h"
#include <bitset>
#include "for.h"
#include "array_utils.h"
@@ -977,6 +978,231 @@ bool posting_list_t::contains_atleast_one(const uint32_t* target_ids, size_t tar
    return false;
}

void posting_list_t::get_exact_matches(std::vector<iterator_t>& its, const bool field_is_array,
                                       const uint32_t* ids, const uint32_t num_ids,
                                       uint32_t*& exact_ids, size_t& num_exact_ids) {

    size_t exact_id_index = 0;

    if(its.size() == 1) {
        for(size_t i = 0; i < num_ids; i++) {
            uint32_t id = ids[i];
            its[0].skip_to(id);
            if(is_single_token_verbatim_match(its[0], field_is_array)) {
                exact_ids[exact_id_index++] = id;
            }
        }
    } else {

        if(!field_is_array) {
            for(size_t i = 0; i < num_ids; i++) {
                uint32_t id = ids[i];
                bool is_exact_match = true;

                for(int j = its.size()-1; j >= 0; j--) {
                    posting_list_t::iterator_t& it = its[j];
                    it.skip_to(id);

                    block_t* curr_block = it.block();
                    uint32_t curr_index = it.index();

                    if(curr_block == nullptr || curr_index == UINT32_MAX) {
                        is_exact_match = false;
                        break;
                    }

                    uint32_t* offsets = it.offsets;

                    uint32_t start_offset_index = it.offset_index[curr_index];
                    uint32_t end_offset_index = (curr_index == curr_block->size() - 1) ?
                                                curr_block->offsets.getLength() :
                                                it.offset_index[curr_index + 1];

                    if(j == its.size()-1) {
                        // check if the last query token is the last offset
                        if(offsets[end_offset_index-1] != 0) {
                            // not the last token for the document, so skip
                            is_exact_match = false;
                            break;
                        }
                    }

                    // looping handles duplicate query tokens, e.g. "hip hip hurray hurray"
                    while(start_offset_index < end_offset_index) {
                        uint32_t offset = offsets[start_offset_index];
                        start_offset_index++;

                        if(offset == (j + 1)) {
                            // we have found a matching index, no need to look further
                            is_exact_match = true;
                            break;
                        }

                        if(offset > (j + 1)) {
                            is_exact_match = false;
                            break;
                        }
                    }

                    if(!is_exact_match) {
                        break;
                    }
                }

                if(is_exact_match) {
                    exact_ids[exact_id_index++] = id;
                }
            }
        }

        else {
            // field is an array

            for(size_t i = 0; i < num_ids; i++) {
                uint32_t id = ids[i];

                std::map<size_t, std::bitset<32>> array_index_to_token_index;
                bool premature_exit = false;

                for(int j = its.size()-1; j >= 0; j--) {
                    posting_list_t::iterator_t& it = its[j];

                    it.skip_to(id);

                    block_t* curr_block = it.block();
                    uint32_t curr_index = it.index();

                    if(curr_block == nullptr || curr_index == UINT32_MAX) {
                        premature_exit = true;
                        break;
                    }

                    uint32_t* offsets = it.offsets;
                    uint32_t start_offset_index = it.offset_index[curr_index];
                    uint32_t end_offset_index = (curr_index == curr_block->size() - 1) ?
                                                curr_block->offsets.getLength() :
                                                it.offset_index[curr_index + 1];

                    int prev_pos = -1;
                    bool has_atleast_one_last_token = false;
                    bool found_matching_index = false;

                    while(start_offset_index < end_offset_index) {
                        int pos = offsets[start_offset_index];
                        start_offset_index++;

                        if(pos == prev_pos) { // indicates end of array index
                            size_t array_index = (size_t) offsets[start_offset_index];

                            if(start_offset_index+1 < end_offset_index) {
                                size_t next_offset = (size_t) offsets[start_offset_index + 1];
                                if(next_offset == 0) {
                                    // indicates that token is the last token on the doc
                                    has_atleast_one_last_token = true;
                                    start_offset_index++;
                                }
                            }

                            if(found_matching_index) {
                                array_index_to_token_index[array_index].set(j+1);
                            }

                            start_offset_index++; // skip current value which is the array index or flag for last index
                            prev_pos = -1;
                            continue;
                        }

                        if(pos == (j + 1)) {
                            // we have found a matching index
                            found_matching_index = true;
                        }

                        prev_pos = pos;
                    }

                    // check if the last query token is the last offset of ANY array element
                    if(j == its.size()-1 && !has_atleast_one_last_token) {
                        premature_exit = true;
                        break;
                    }

                    if(!found_matching_index) {
                        // not even a single matching index found: can never be an exact match
                        premature_exit = true;
                        break;
                    }
                }

                if(!premature_exit) {
                    // iterate array index to token index to check if at least 1 array position contains all tokens
                    for(auto& kv: array_index_to_token_index) {
                        if(kv.second.count() == its.size()) {
                            exact_ids[exact_id_index++] = id;
                            break;
                        }
                    }
                }
            }
        }
    }

    num_exact_ids = exact_id_index;
}
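The array branch above leans on the offset-stream convention used throughout this file: plain values are token positions within the current array element, a repeated position marks the end of an element and is followed by that element's array index, and a trailing 0 after the index flags the token as the last one in the document. A standalone decoder sketch of that convention (an interpretation of the loops above, not code from the commit):

    #include <cstdint>
    #include <vector>
    #include <utility>

    // Decodes one document's offset stream into (array_index, positions) pairs,
    // assuming the encoding described above.
    std::vector<std::pair<size_t, std::vector<uint32_t>>>
    decode_array_offsets(const uint32_t* offsets, size_t len) {
        std::vector<std::pair<size_t, std::vector<uint32_t>>> out;
        std::vector<uint32_t> positions;
        int prev_pos = -1;
        for(size_t k = 0; k < len; k++) {
            int pos = offsets[k];
            if(pos == prev_pos) {          // repeated position: end of element
                size_t array_index = offsets[++k];
                if(k + 1 < len && offsets[k + 1] == 0) {
                    k++;                   // skip the "last token in doc" flag
                }
                out.emplace_back(array_index, positions);
                positions.clear();
                prev_pos = -1;
                continue;
            }
            positions.push_back(pos);
            prev_pos = pos;
        }
        return out;
    }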
void posting_list_t::get_matching_array_indices(uint32_t id, std::vector<iterator_t>& its,
                                                std::vector<size_t>& indices) {
    std::map<size_t, std::bitset<32>> array_index_to_token_index;

    for(int j = its.size()-1; j >= 0; j--) {
        posting_list_t::iterator_t& it = its[j];

        it.skip_to(id);

        block_t* curr_block = it.block();
        uint32_t curr_index = it.index();

        if(curr_block == nullptr || curr_index == UINT32_MAX) {
            return;
        }

        uint32_t* offsets = it.offsets;
        uint32_t start_offset_index = it.offset_index[curr_index];
        uint32_t end_offset_index = (curr_index == curr_block->size() - 1) ?
                                    curr_block->offsets.getLength() :
                                    it.offset_index[curr_index + 1];

        int prev_pos = -1;
        while(start_offset_index < end_offset_index) {
            int pos = offsets[start_offset_index];
            start_offset_index++;

            if(pos == prev_pos) { // indicates end of array index
                size_t array_index = (size_t) offsets[start_offset_index];

                if(start_offset_index+1 < end_offset_index) {
                    size_t next_offset = (size_t) offsets[start_offset_index + 1];
                    if(next_offset == 0) {
                        // indicates that token is the last token on the doc
                        start_offset_index++;
                    }
                }

                array_index_to_token_index[array_index].set(j+1);
                start_offset_index++; // skip current value which is the array index or flag for last index
                prev_pos = -1;
                continue;
            }

            prev_pos = pos;
        }
    }

    // iterate array index to token index to check if at least 1 array position contains all tokens
    for(auto& kv: array_index_to_token_index) {
        if(kv.second.count() == its.size()) {
            indices.push_back(kv.first);
        }
    }
}

/* iterator_t operations */

posting_list_t::iterator_t::iterator_t(posting_list_t::block_t* start, posting_list_t::block_t* end):
@@ -177,13 +177,14 @@ TEST_F(CollectionFacetingTest, FacetCounts) {
    results = coll_array_fields->search("*", query_fields, "", facets, sort_fields, {0}, 10, 1, FREQUENCY,
                                        {false}, Index::DROP_TOKENS_THRESHOLD,
                                        spp::sparse_hash_set<std::string>(),
                                        spp::sparse_hash_set<std::string>(), 10, "tags: fxne aluminium").get();
                                        spp::sparse_hash_set<std::string>(), 10, "tags: fxne platim").get();

    ASSERT_EQ(5, results["hits"].size());
    ASSERT_EQ(1, results["facet_counts"].size());
    ASSERT_STREQ("tags", results["facet_counts"][0]["field_name"].get<std::string>().c_str());
    ASSERT_EQ(1, (int) results["facet_counts"][0]["counts"][0]["count"]);
    ASSERT_STREQ("FINE PLATINUM", results["facet_counts"][0]["counts"][0]["value"].get<std::string>().c_str());
    ASSERT_STREQ("<mark>FINE</mark> <mark>PLATIN</mark>UM", results["facet_counts"][0]["counts"][0]["highlighted"].get<std::string>().c_str());

    // facet with facet filter query matching first token of an array
    results = coll_array_fields->search("*", query_fields, "", facets, sort_fields, {0}, 10, 1, FREQUENCY,
@@ -218,6 +219,7 @@ TEST_F(CollectionFacetingTest, FacetCounts) {
    ASSERT_EQ(5, results["hits"].size());
    ASSERT_EQ(1, results["facet_counts"].size());
    ASSERT_STREQ("age", results["facet_counts"][0]["field_name"].get<std::string>().c_str());

    ASSERT_EQ(1, (int) results["facet_counts"][0]["counts"][0]["count"]);
    ASSERT_STREQ("21", results["facet_counts"][0]["counts"][0]["value"].get<std::string>().c_str());
    ASSERT_STREQ("<mark>2</mark>1", results["facet_counts"][0]["counts"][0]["highlighted"].get<std::string>().c_str());
@@ -238,6 +240,10 @@ TEST_F(CollectionFacetingTest, FacetCounts) {
    ASSERT_FLOAT_EQ(24.400999426841736, results["facet_counts"][0]["stats"]["sum"].get<double>());
    ASSERT_FLOAT_EQ(5, results["facet_counts"][0]["stats"]["total_values"].get<size_t>());

    // check for "0" case
    ASSERT_STREQ("0", results["facet_counts"][0]["counts"][0]["value"].get<std::string>().c_str());
    ASSERT_EQ(1, results["facet_counts"][0]["counts"][0]["count"].get<size_t>());

    // facet query on a float field
    results = coll_array_fields->search("*", query_fields, "", {"rating"}, sort_fields, {0}, 10, 1, FREQUENCY,
                                        {false}, Index::DROP_TOKENS_THRESHOLD,
@@ -264,7 +270,6 @@ TEST_F(CollectionFacetingTest, FacetCounts) {
                                        {false}, Index::DROP_TOKENS_THRESHOLD,
                                        spp::sparse_hash_set<std::string>(),
                                        spp::sparse_hash_set<std::string>(), 10, "timestamps: 142189002").get();

    ASSERT_EQ(5, results["hits"].size());
    ASSERT_EQ(1, results["facet_counts"].size());
    ASSERT_EQ(1, results["facet_counts"][0]["counts"].size());
@@ -688,8 +693,6 @@ TEST_F(CollectionFacetingTest, FacetCountOnSimilarStrings) {
}

TEST_F(CollectionFacetingTest, FacetQueryOnStringWithColon) {
    ;

    std::vector<field> fields = {field("title", field_types::STRING, true),
                                 field("points", field_types::INT32, false)};
@@ -731,3 +734,82 @@ TEST_F(CollectionFacetingTest, FacetQueryOnStringWithColon) {

    collectionManager.drop_collection("coll1");
}

TEST_F(CollectionFacetingTest, FacetQueryOnStringArray) {
    Collection* coll1;

    std::vector<field> fields = {field("title", field_types::STRING, false),
                                 field("genres", field_types::STRING_ARRAY, true)};

    coll1 = collectionManager.get_collection("coll1").get();
    if (coll1 == nullptr) {
        coll1 = collectionManager.create_collection("coll1", 2, fields, "").get();
    }

    nlohmann::json doc1;
    doc1["id"] = "0";
    doc1["title"] = "Song 1";
    doc1["genres"] = {"Country Punk Rock", "Country", "Slow"};

    nlohmann::json doc2;
    doc2["id"] = "1";
    doc2["title"] = "Song 2";
    doc2["genres"] = {"Soft Rock", "Rock", "Electronic"};

    nlohmann::json doc3;
    doc3["id"] = "2";
    doc3["title"] = "Song 3";
    doc3["genres"] = {"Rockabilly", "Metal"};

    nlohmann::json doc4;
    doc4["id"] = "3";
    doc4["title"] = "Song 4";
    doc4["genres"] = {"Pop Rock", "Rock", "Fast"};

    nlohmann::json doc5;
    doc5["id"] = "4";
    doc5["title"] = "Song 5";
    doc5["genres"] = {"Pop", "Rockabilly", "Fast"};

    ASSERT_TRUE(coll1->add(doc1.dump()).ok());
    ASSERT_TRUE(coll1->add(doc2.dump()).ok());
    ASSERT_TRUE(coll1->add(doc3.dump()).ok());
    ASSERT_TRUE(coll1->add(doc4.dump()).ok());
    ASSERT_TRUE(coll1->add(doc5.dump()).ok());

    auto results = coll1->search("*", {}, "", {"genres"}, sort_fields, {0}, 0, 1, FREQUENCY,
                                 {false}, Index::DROP_TOKENS_THRESHOLD,
                                 spp::sparse_hash_set<std::string>(),
                                 spp::sparse_hash_set<std::string>(), 10, "genres: roc").get();

    ASSERT_EQ(1, results["facet_counts"].size());
    ASSERT_EQ(5, results["facet_counts"][0]["counts"].size());

    results = coll1->search("*", {}, "", {"genres"}, sort_fields, {0}, 0, 1, FREQUENCY,
                            {false}, Index::DROP_TOKENS_THRESHOLD,
                            spp::sparse_hash_set<std::string>(),
                            spp::sparse_hash_set<std::string>(), 10, "genres: soft roc").get();

    ASSERT_EQ(1, results["facet_counts"].size());
    ASSERT_EQ(1, results["facet_counts"][0]["counts"].size());

    results = coll1->search("*", {}, "", {"genres"}, sort_fields, {0}, 0, 1, FREQUENCY,
                            {false}, Index::DROP_TOKENS_THRESHOLD,
                            spp::sparse_hash_set<std::string>(),
                            spp::sparse_hash_set<std::string>(), 10, "genres: punk roc").get();

    ASSERT_EQ(1, results["facet_counts"].size());
    ASSERT_EQ(1, results["facet_counts"][0]["counts"].size());
    ASSERT_EQ("Country <mark>Punk</mark> <mark>Roc</mark>k", results["facet_counts"][0]["counts"][0]["highlighted"].get<std::string>());

    results = coll1->search("*", {}, "", {"genres"}, sort_fields, {0}, 0, 1, FREQUENCY,
                            {false}, Index::DROP_TOKENS_THRESHOLD,
                            spp::sparse_hash_set<std::string>(),
                            spp::sparse_hash_set<std::string>(), 10, "genres: country roc").get();

    ASSERT_EQ(1, results["facet_counts"].size());
    ASSERT_EQ(1, results["facet_counts"][0]["counts"].size());
    ASSERT_EQ("<mark>Country</mark> Punk <mark>Roc</mark>k", results["facet_counts"][0]["counts"][0]["highlighted"].get<std::string>());

    collectionManager.drop_collection("coll1");
}
@@ -260,10 +260,10 @@ TEST_F(CollectionFilteringTest, FacetFieldStringArrayFiltering) {
    ASSERT_EQ(1, results["hits"].size());
    ASSERT_EQ(1, results["found"].get<size_t>());

    // don't allow exact filter on non-faceted field
    auto res_op = coll_array_fields->search("Jeremy", query_fields, "name:= Jeremy Howard", facets, sort_fields, {0}, 10, 1, FREQUENCY, {false});
    ASSERT_FALSE(res_op.ok());
    ASSERT_STREQ("To perform exact filtering, filter field `name` must be a facet field.", res_op.error().c_str());
    // allow exact filter on non-faceted field
    results = coll_array_fields->search("Jeremy", query_fields, "name:= Jeremy Howard", facets, sort_fields, {0}, 10, 1, FREQUENCY, {false}).get();
    ASSERT_EQ(5, results["hits"].size());
    ASSERT_EQ(5, results["found"].get<size_t>());

    // multi match exact query (OR condition)
    results = coll_array_fields->search("Jeremy", query_fields, "tags:= [Gold, bronze]", facets, sort_fields, {0}, 10, 1, FREQUENCY, {false}).get();