mirror of https://github.com/typesense/typesense.git
synced 2025-05-19 05:08:43 +08:00

Copy-free intersect + score.

This commit is contained in:
parent 22670b1342
commit 0e2adb4242
@@ -371,8 +371,6 @@ private:
        return std::tie(a_count, a_value_size) > std::tie(b_count, b_value_size);
    }

-   void free_leaf_indices(std::vector<uint32_t*>& leaf_to_indices) const;
-
    Option<bool> parse_filter_query(const std::string& simple_filter_query, std::vector<filter>& filters) const;

    static Option<bool> parse_geopoint_filter_value(std::string& raw_value,
@@ -191,6 +191,11 @@ private:

    StringUtils string_utils;

+   // used as sentinels
+
+   static spp::sparse_hash_map<uint32_t, int64_t> text_match_sentinel_value;
+   static spp::sparse_hash_map<uint32_t, int64_t> seq_id_sentinel_value;
+
    // Internal utility functions

    static inline uint32_t next_suggestion(const std::vector<token_candidates> &token_candidates_vec,
@@ -321,8 +326,10 @@ public:
    void score_results(const std::vector<sort_by> &sort_fields, const uint16_t &query_index, const uint8_t &field_id,
                       const uint32_t total_cost, Topster *topster, const std::vector<art_leaf *> &query_suggestion,
                       spp::sparse_hash_set<uint64_t> &groups_processed,
-                      const std::vector<std::unordered_map<size_t, std::vector<token_positions_t>>>& array_token_positions_vec,
-                      const uint32_t* result_ids, size_t result_ids_size,
+                      const std::unordered_map<size_t, std::vector<token_positions_t>>& array_token_positions,
+                      const uint32_t seq_id, const int sort_order[3],
+                      std::array<spp::sparse_hash_map<uint32_t, int64_t>*, 3> field_values,
+                      const std::vector<size_t>& geopoint_indices,
                       const size_t group_limit,
                       const std::vector<std::string> &group_by_fields, uint32_t token_bits,
                       const std::vector<token_t> &query_tokens,
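The two static maps added above are never populated; only their addresses matter. score_results compares each sort field's entry in field_values against these sentinel addresses to decide whether the score comes from the computed text match, from the sequence id, or from a real sort index. A minimal sketch of the pattern, with std::unordered_map standing in for spp::sparse_hash_map and resolve_score as a hypothetical helper (not code from this commit):

    #include <cstdint>
    #include <climits>
    #include <unordered_map>

    using sort_index_t = std::unordered_map<uint32_t, int64_t>;

    static sort_index_t text_match_sentinel;  // never filled; only its address is compared
    static sort_index_t seq_id_sentinel;

    int64_t resolve_score(const sort_index_t* field_value, uint32_t seq_id, int64_t match_score) {
        if(field_value == &text_match_sentinel) {
            return match_score;               // "virtual" field: the computed text-match score
        }
        if(field_value == &seq_id_sentinel) {
            return seq_id;                    // "virtual" field: the document's sequence id
        }
        auto it = field_value->find(seq_id);  // real field: look up the indexed value
        return (it == field_value->end()) ? INT64_MIN : it->second;
    }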
@@ -48,25 +48,21 @@ private:
public:

    struct block_intersector_t {
-       size_t batch_size;
        std::vector<posting_list_t::iterator_t> its;
        std::vector<posting_list_t*> plists;
        std::vector<uint32_t> expanded_plist_indices;

        posting_list_t::result_iter_state_t& iter_state;

        block_intersector_t(const std::vector<void*>& raw_posting_lists,
-                           size_t batch_size,
                            posting_list_t::result_iter_state_t& iter_state):
-               batch_size(batch_size), iter_state(iter_state) {
+               iter_state(iter_state) {

            to_expanded_plists(raw_posting_lists, plists, expanded_plist_indices);

            its.reserve(plists.size());
            for(const auto& posting_list: plists) {
                its.push_back(posting_list->new_iterator());
            }
-
-           iter_state.num_lists = plists.size();
        }

        ~block_intersector_t() {
@@ -75,9 +71,8 @@ public:
            }
        }

-       bool intersect() {
-           return posting_list_t::block_intersect(plists, batch_size, its, iter_state);
-       }
+       template<class T>
+       bool intersect(T func);
    };

    static void upsert(void*& obj, uint32_t id, const std::vector<uint32_t>& offsets);
@@ -101,6 +96,12 @@ public:
    static void get_array_token_positions(
        uint32_t id,
        const std::vector<void*>& posting_lists,
-       std::vector<std::unordered_map<size_t, std::vector<token_positions_t>>>& array_token_positions_vec
+       std::unordered_map<size_t, std::vector<token_positions_t>>& array_token_positions
    );
};

+template<class T>
+bool posting_t::block_intersector_t::intersect(T func) {
+    posting_list_t::block_intersect<T>(plists, its, iter_state, func);
+    return true;
+}
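After this change the intersector no longer fills a batched id/block/index buffer; callers pass a callback that fires once per matching id, mirroring how the updated tests later in this commit drive it. A usage sketch (leaf_a and leaf_b are assumed art leaves, not identifiers from this diff):

    // Intersect two posting lists and collect matching ids, copy-free.
    std::vector<void*> raw_posting_lists = {leaf_a->values, leaf_b->values};
    posting_list_t::result_iter_state_t iter_state;  // no filter / exclusion lists
    posting_t::block_intersector_t intersector(raw_posting_lists, iter_state);

    std::vector<uint32_t> matching_ids;
    intersector.intersect([&](uint32_t seq_id, std::vector<posting_list_t::iterator_t>& its) {
        // `its` is positioned on seq_id in every list, so offsets can be
        // decoded in place here if needed (e.g. via posting_list_t::get_offsets).
        matching_ids.push_back(seq_id);
    });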
@@ -47,27 +47,41 @@ public:
        block_t* curr_block;
        uint32_t curr_index;

-       // uncompressed data structures for performance
-       block_t* uncompressed_block;
-       uint32_t* ids;

    public:
+       // uncompressed data structures for performance
+       uint32_t* ids = nullptr;
+       uint32_t* offset_index = nullptr;
+       uint32_t* offsets = nullptr;

        explicit iterator_t(block_t* root);
        iterator_t(iterator_t&& rhs) noexcept;
        ~iterator_t();
-       [[nodiscard]] inline bool valid() const;
-       void inline next();
+       [[nodiscard]] bool valid() const;
+       void next();
        void skip_to(uint32_t id);
-       [[nodiscard]] inline uint32_t id();
+       [[nodiscard]] uint32_t id() const;
        [[nodiscard]] inline uint32_t index() const;
        [[nodiscard]] inline block_t* block() const;
    };

    struct result_iter_state_t {
-       size_t num_lists;
-       std::vector<block_t*> blocks;
-       std::vector<uint32_t> indices;
-       std::vector<uint32_t> ids;
-
-       std::vector<std::unordered_map<size_t, std::vector<token_positions_t>>> array_token_positions_vec;
+       uint32_t* excluded_result_ids = nullptr;
+       size_t excluded_result_ids_size = 0;
+       uint32_t* filter_ids = nullptr;
+       size_t filter_ids_length = 0;
+
+       size_t excluded_result_ids_index = 0;
+       size_t filter_ids_index = 0;
+
+       result_iter_state_t() = default;
+
+       result_iter_state_t(uint32_t* excluded_result_ids, size_t excluded_result_ids_size, uint32_t* filter_ids,
+                           size_t filter_ids_length) : excluded_result_ids(excluded_result_ids),
+                                                       excluded_result_ids_size(excluded_result_ids_size),
+                                                       filter_ids(filter_ids), filter_ids_length(filter_ids_length) {}
    };

private:
@@ -134,15 +148,83 @@ public:

    static void intersect(const std::vector<posting_list_t*>& posting_lists, std::vector<uint32_t>& result_ids);

+   template<class T>
    static bool block_intersect(
        const std::vector<posting_list_t*>& posting_lists,
-       size_t batch_size,
        std::vector<posting_list_t::iterator_t>& its,
-       result_iter_state_t& iter_state
+       result_iter_state_t& istate,
+       T func
    );

+   static bool take_id(result_iter_state_t& istate, uint32_t id);

    static bool get_offsets(
-       result_iter_state_t& iter_state,
-       std::vector<std::unordered_map<size_t, std::vector<token_positions_t>>>& array_token_positions
+       std::vector<iterator_t>& its,
+       std::unordered_map<size_t, std::vector<token_positions_t>>& array_token_pos
    );
};

+template<class T>
+bool posting_list_t::block_intersect(const std::vector<posting_list_t*>& posting_lists,
+                                     std::vector<posting_list_t::iterator_t>& its,
+                                     result_iter_state_t& istate,
+                                     T func) {
+
+    if(posting_lists.empty()) {
+        return false;
+    }
+
+    if(its.empty()) {
+        its.reserve(posting_lists.size());
+
+        for(const auto& posting_list: posting_lists) {
+            its.push_back(posting_list->new_iterator());
+        }
+
+    } else {
+        // already in the middle of iteration: prepare for next batch
+    }
+
+    size_t num_lists = its.size();
+
+    switch (num_lists) {
+        case 1:
+            while(its[0].valid()) {
+                if(posting_list_t::take_id(istate, its[0].id())) {
+                    func(its[0].id(), its);
+                }
+
+                its[0].next();
+            }
+            break;
+        case 2:
+            while(!at_end2(its)) {
+                if(equals2(its)) {
+                    if(posting_list_t::take_id(istate, its[0].id())) {
+                        func(its[0].id(), its);
+                    }
+
+                    advance_all2(its);
+                } else {
+                    advance_non_largest2(its);
+                }
+            }
+            break;
+        default:
+            while(!at_end(its)) {
+                if(equals(its)) {
+                    //LOG(INFO) << its[0].id();
+                    if(posting_list_t::take_id(istate, its[0].id())) {
+                        func(its[0].id(), its);
+                    }
+
+                    advance_all(its);
+                } else {
+                    advance_non_largest(its);
+                }
+            }
+    }
+
+    return false;
+}
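block_intersect leans on at_end/equals/advance_all/advance_non_largest helpers (and their two-list variants) whose bodies are not part of this diff. The following self-contained sketch shows the k-way merge contract they are assumed to satisfy, using a plain sorted-vector cursor in place of posting_list_t::iterator_t:

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    struct cursor_t {
        const std::vector<uint32_t>* ids;   // ascending document ids
        size_t pos = 0;
        bool valid() const { return pos < ids->size(); }
        uint32_t id() const { return (*ids)[pos]; }
        void next() { pos++; }
    };

    // true when any list is exhausted (the intersection cannot grow further)
    static bool at_end(const std::vector<cursor_t>& its) {
        for(const auto& it: its) { if(!it.valid()) return true; }
        return false;
    }

    // true when all cursors point at the same id (an intersection hit)
    static bool equals(const std::vector<cursor_t>& its) {
        for(size_t i = 1; i < its.size(); i++) {
            if(its[i].id() != its[0].id()) return false;
        }
        return true;
    }

    static void advance_all(std::vector<cursor_t>& its) {
        for(auto& it: its) { it.next(); }
    }

    // move every cursor that lags behind the current maximum id
    static void advance_non_largest(std::vector<cursor_t>& its) {
        uint32_t largest = 0;
        for(const auto& it: its) { largest = std::max(largest, it.id()); }
        for(auto& it: its) { if(it.id() != largest) it.next(); }
    }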
@@ -1150,9 +1150,14 @@ Option<nlohmann::json> Collection::search(const std::string & query, const std::

            bool highlighted_fully = (fields_highlighted_fully.find(field_name) != fields_highlighted_fully.end());
            highlight_t highlight;
+           //LOG(INFO) << "Highlighting: " << document;
+           /*if(document["title"] == "Quantum Quest: A Cassini Space Odyssey") {
+               LOG(INFO) << "here!";
+           }*/
            highlight_result(search_field, searched_queries, q_tokens, field_order_kv, document,
                             string_utils, snippet_threshold, highlight_affix_num_tokens,
                             highlighted_fully, highlight_start_tag, highlight_end_tag, highlight);
+           //LOG(INFO) << "End";

            if(!highlight.snippets.empty()) {
                highlights.push_back(highlight);
@@ -1547,7 +1552,7 @@ void Collection::highlight_result(const field &search_field,
            Index* index = indices[field_order_kv->key % num_memory_shards];
            art_leaf* actual_leaf = index->get_token_leaf(search_field.name, &token_leaf->key[0], token_leaf->key_len);

-           if(actual_leaf != nullptr) {
+           if(actual_leaf != nullptr && posting_t::contains(actual_leaf->values, field_order_kv->key)) {
                query_suggestion.push_back(actual_leaf);
                query_suggestion_tokens.insert(token);
                //LOG(INFO) << "field: " << search_field.name << ", key: " << token;
@@ -1557,7 +1562,9 @@ void Collection::highlight_result(const field &search_field,
        qindex++;
    } while(field_order_kv->query_indices != nullptr && qindex < field_order_kv->query_indices[0]);

-   for(const std::string& q_token: q_tokens) {
+   for(size_t i = 0; i < q_tokens.size(); i++) {
+       const std::string& q_token = q_tokens[i];
+
        if(query_suggestion_tokens.count(q_token) != 0) {
            continue;
        }
@@ -1566,32 +1573,29 @@ void Collection::highlight_result(const field &search_field,
        art_leaf *actual_leaf = index->get_token_leaf(search_field.name,
                                                      reinterpret_cast<const unsigned char *>(q_token.c_str()),
                                                      q_token.size() + 1);
-       if(actual_leaf != nullptr) {
+       if(actual_leaf != nullptr && posting_t::contains(actual_leaf->values, field_order_kv->key)) {
            query_suggestion.push_back(actual_leaf);
            query_suggestion_tokens.insert(q_token);
+       } else if(i == q_tokens.size()-1) {
+           // we will copy the last token for highlighting prefix matches
+           query_suggestion_tokens.insert(q_token);
        }
    }

-   if(query_suggestion.empty()) {
+   if(query_suggestion_tokens.empty()) {
        // none of the tokens from the query were found on this field
        return;
    }

    //LOG(INFO) << "Document ID: " << document["id"];

    std::vector<void*> posting_lists;
    for(art_leaf* leaf: query_suggestion) {
        posting_lists.push_back(leaf->values);
    }

-   std::vector<std::unordered_map<size_t, std::vector<token_positions_t>>> array_token_positions_vec;
-   posting_t::get_array_token_positions(field_order_kv->key, posting_lists, array_token_positions_vec);
+   std::unordered_map<size_t, std::vector<token_positions_t>> array_token_positions;
+   posting_t::get_array_token_positions(field_order_kv->key, posting_lists, array_token_positions);

-   if(array_token_positions_vec.empty()) {
-       return;
-   }
-
-   std::unordered_map<size_t, std::vector<token_positions_t>>& array_token_positions = array_token_positions_vec[0];
    std::vector<match_index_t> match_indices;

    for(const auto& kv: array_token_positions) {
@@ -1639,6 +1643,14 @@ void Collection::highlight_result(const field &search_field,
        }
    }

+   if(!document.contains(search_field.name)) {
+       // could be an optional field
+       continue;
+   }
+
+   /*LOG(INFO) << "field: " << document[search_field.name] << ", id: " << field_order_kv->key
+             << ", index: " << match_index.index;*/
+
    std::string text = (search_field.type == field_types::STRING) ? document[search_field.name] :
                       document[search_field.name][match_index.index];
    Tokenizer tokenizer(text, true, false, search_field.locale);
@@ -1793,12 +1805,6 @@ void Collection::highlight_result(const field &search_field,
    highlight.match_score = match_indices[0].match_score;
}

-void Collection::free_leaf_indices(std::vector<uint32_t*>& leaf_to_indices) const {
-    for(uint32_t* leaf_indices: leaf_to_indices) {
-        delete [] leaf_indices;
-    }
-}
-
Option<nlohmann::json> Collection::get(const std::string & id) const {
    std::string seq_id_str;
    StoreStatus seq_id_status = store->get(get_doc_id_key(id), seq_id_str);
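The highlight path now verifies that a token's leaf actually contains the current document before using it: a leaf can exist for a field while pointing only at other documents, which previously let unrelated tokens into the highlight set. A small illustration of the guard (usable_for_highlight is a hypothetical helper; posting_t::contains is the function introduced by this codebase):

    // Only treat a token leaf as usable for highlighting this document if the
    // document id actually appears in the leaf's posting data.
    bool usable_for_highlight(art_leaf* leaf, uint32_t doc_id) {
        return leaf != nullptr && posting_t::contains(leaf->values, doc_id);
    }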
src/index.cpp · 452 changed lines
@@ -20,6 +20,9 @@
#include <posting.h>
#include "logger.h"

+spp::sparse_hash_map<uint32_t, int64_t> Index::text_match_sentinel_value;
+spp::sparse_hash_map<uint32_t, int64_t> Index::seq_id_sentinel_value;
+
Index::Index(const std::string name, const std::unordered_map<std::string, field> & search_schema,
             std::map<std::string, field> facet_schema, std::unordered_map<std::string, field> sort_schema):
        name(name), search_schema(search_schema), facet_schema(facet_schema), sort_schema(sort_schema) {
@@ -615,6 +618,8 @@ void Index::index_string_array_field(const std::vector<std::string> & strings, c
            last_token = token;
        }

+       //LOG(INFO) << "Str: " << str << ", last_token: " << last_token;
+
        if(token_set.empty()) {
            continue;
        }
@@ -896,6 +901,29 @@ void Index::search_candidates(const uint8_t & field_id,
    auto product = []( long long a, token_candidates & b ) { return a*b.candidates.size(); };
    long long int N = std::accumulate(token_candidates_vec.begin(), token_candidates_vec.end(), 1LL, product);

+   int sort_order[3]; // 1 or -1 based on DESC or ASC respectively
+   std::array<spp::sparse_hash_map<uint32_t, int64_t>*, 3> field_values;
+   std::vector<size_t> geopoint_indices;
+
+   for (size_t i = 0; i < sort_fields.size(); i++) {
+       sort_order[i] = 1;
+       if (sort_fields[i].order == sort_field_const::asc) {
+           sort_order[i] = -1;
+       }
+
+       if (sort_fields[i].name == sort_field_const::text_match) {
+           field_values[i] = &text_match_sentinel_value;
+       } else if (sort_fields[i].name == sort_field_const::seq_id) {
+           field_values[i] = &seq_id_sentinel_value;
+       } else if (sort_index.count(sort_fields[i].name) != 0) {
+           field_values[i] = sort_index.at(sort_fields[i].name);
+
+           if (sort_schema.at(sort_fields[i].name).is_geopoint()) {
+               geopoint_indices.push_back(i);
+           }
+       }
+   }
+
    for(long long n=0; n<N && n<combination_limit; ++n) {
        // every element in `query_suggestion` contains a token and its associated hits
        std::vector<art_leaf*> query_suggestion(token_candidates_vec.size());
@@ -917,12 +945,6 @@ void Index::search_candidates(const uint8_t & field_id,
        }
        LOG(INFO) << fullq.str();*/

-       // initialize results with the starting element (for further intersection)
-       size_t result_size = posting_t::num_ids(query_suggestion[0]->values);
-       if(result_size == 0) {
-           continue;
-       }
-
        // Prepare excluded document IDs that we can later remove from the result set
        uint32_t* excluded_result_ids = nullptr;
        size_t excluded_result_ids_size = ArrayUtils::or_scalar(exclude_token_ids, exclude_token_ids_size,
@@ -934,95 +956,42 @@ void Index::search_candidates(const uint8_t & field_id,
            posting_lists.push_back(query_leaf->values);
        }

-       std::vector<posting_list_t::iterator_t> its;
-       posting_list_t::result_iter_state_t iter_state;
+       posting_list_t::result_iter_state_t iter_state(
+           excluded_result_ids, excluded_result_ids_size, filter_ids, filter_ids_length
+       );

        // We will have to be judicious about computing full match score: only when token does not match exact query
        bool use_single_token_score = (query_suggestion.size() == 1) &&
            (query_suggestion.size() == query_tokens.size()) &&
            ((std::string((const char*)query_suggestion[0]->key, query_suggestion[0]->key_len-1) != query_tokens[0].value));

        std::vector<uint32_t> result_id_vec;
+       posting_t::block_intersector_t intersector(posting_lists, iter_state);

-       size_t excluded_result_ids_index = 0;
-       size_t filter_ids_index = 0;
-
-       posting_t::block_intersector_t intersector(posting_lists, 1000, iter_state);
-       bool has_more = true;
-
-       while(has_more) {
-           has_more = intersector.intersect();
-           posting_list_t::result_iter_state_t updated_iter_state;
-           size_t id_block_index = 0;
-
-           for(size_t i = 0; i < iter_state.ids.size(); i++) {
-               uint32_t id = iter_state.ids[i];
-
-               // decide if this result id should be excluded
-               if(excluded_result_ids_size != 0) {
-                   while(excluded_result_ids_index < excluded_result_ids_size &&
-                         excluded_result_ids[excluded_result_ids_index] < id) {
-                       excluded_result_ids_index++;
-                   }
-
-                   if(excluded_result_ids_index < excluded_result_ids_size &&
-                      id == excluded_result_ids[excluded_result_ids_index]) {
-                       excluded_result_ids_index++;
-                       continue;
-                   }
-               }
-
-               bool id_found_in_filter = true;
-
-               // decide if this result should be matched with filter results
-               if(filter_ids_length != 0) {
-                   id_found_in_filter = false;
-
-                   // e.g. [1, 3] vs [2, 3]
-
-                   while(filter_ids_index < filter_ids_length && filter_ids[filter_ids_index] < id) {
-                       filter_ids_index++;
-                   }
-
-                   if(filter_ids_index < filter_ids_length && filter_ids[filter_ids_index] == id) {
-                       filter_ids_index++;
-                       id_found_in_filter = true;
-                   }
-               }
-
-               if(id_found_in_filter) {
-                   result_id_vec.push_back(id);
-
-                   updated_iter_state.num_lists = iter_state.num_lists;
-                   updated_iter_state.ids.push_back(id);
-
-                   for(size_t k = 0; k < iter_state.num_lists; k++) {
-                       updated_iter_state.blocks.push_back(iter_state.blocks[id_block_index]);
-                       updated_iter_state.indices.push_back(iter_state.indices[id_block_index++]);
-                   }
-               }
-           }
-
-           std::vector<std::unordered_map<size_t, std::vector<token_positions_t>>> array_token_positions_vec;
+       intersector.intersect([&](uint32_t seq_id, std::vector<posting_list_t::iterator_t>& its) {
+           std::unordered_map<size_t, std::vector<token_positions_t>> array_token_positions;

            if(!use_single_token_score) {
-               posting_list_t::get_offsets(updated_iter_state, array_token_positions_vec);
+               posting_list_t::get_offsets(its, array_token_positions);
            }

            score_results(sort_fields, (uint16_t) searched_queries.size(), field_id, total_cost, topster,
-                         query_suggestion, groups_processed, array_token_positions_vec,
-                         &updated_iter_state.ids[0], updated_iter_state.ids.size(),
+                         query_suggestion, groups_processed, array_token_positions,
+                         seq_id, sort_order, field_values, geopoint_indices,
                          group_limit, group_by_fields, token_bits, query_tokens,
                          use_single_token_score, prioritize_exact_match);
-       }

+           result_id_vec.push_back(seq_id);
+       });

        if(result_id_vec.empty()) {
            continue;
        }

        uint32_t* result_ids = &result_id_vec[0];
-       result_size = result_id_vec.size();
+       size_t result_size = result_id_vec.size();

-       field_num_results += result_id_vec.size();
+       field_num_results += result_size;

        uint32_t* new_all_result_ids = nullptr;
        all_result_ids_len = ArrayUtils::or_scalar(*all_result_ids, all_result_ids_len, result_ids,
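With this hunk, filtering, offset decoding and scoring all happen inside the intersection callback, so no intermediate ids/blocks/indices vectors are built — that is the "copy-free" in the commit title. A condensed sketch of the pattern (score_document is a hypothetical stand-in for the score_results call above):

    posting_list_t::result_iter_state_t iter_state(
        excluded_ids, excluded_ids_size, filter_ids, filter_ids_len);
    posting_t::block_intersector_t intersector(posting_lists, iter_state);

    std::vector<uint32_t> result_id_vec;
    intersector.intersect([&](uint32_t seq_id, auto& its) {
        // seq_id has already passed take_id()'s exclusion + filter checks, and
        // `its` points at seq_id in every posting list, so offsets can be read
        // in place without copying blocks.
        score_document(seq_id, its);   // hypothetical scoring hook
        result_id_vec.push_back(seq_id);
    });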
@@ -1571,10 +1540,41 @@ void Index::search(const std::vector<query_tokens_t>& field_query_tokens,
            filter_ids = excluded_result_ids;
        }

+       // FIXME: duplicated
+       int sort_order[3]; // 1 or -1 based on DESC or ASC respectively
+       std::array<spp::sparse_hash_map<uint32_t, int64_t>*, 3> field_values;
+       std::vector<size_t> geopoint_indices;
+
+       for (size_t i = 0; i < sort_fields_std.size(); i++) {
+           sort_order[i] = 1;
+           if (sort_fields_std[i].order == sort_field_const::asc) {
+               sort_order[i] = -1;
+           }
+
+           if (sort_fields_std[i].name == sort_field_const::text_match) {
+               field_values[i] = &text_match_sentinel_value;
+           } else if (sort_fields_std[i].name == sort_field_const::seq_id) {
+               field_values[i] = &seq_id_sentinel_value;
+           } else if (sort_index.count(sort_fields_std[i].name) != 0) {
+               field_values[i] = sort_index.at(sort_fields_std[i].name);
+
+               if (sort_schema.at(sort_fields_std[i].name).is_geopoint()) {
+                   geopoint_indices.push_back(i);
+               }
+           }
+       }
+
        uint32_t token_bits = 255;
-       score_results(sort_fields_std, (uint16_t) searched_queries.size(), field_id, 0, topster, {},
-                     groups_processed, {}, filter_ids, filter_ids_length, group_limit, group_by_fields, token_bits, {},
-                     true, prioritize_exact_match);
+
+       for(size_t i = 0; i < filter_ids_length; i++) {
+           const uint32_t seq_id = filter_ids[i];
+           score_results(sort_fields_std, (uint16_t) searched_queries.size(), field_id, 0, topster, {},
+                         groups_processed, {}, seq_id, sort_order, field_values, geopoint_indices,
+                         group_limit, group_by_fields, token_bits, {},
+                         true, prioritize_exact_match);
+       }

        collate_included_ids(field_query_tokens[0].q_include_tokens, field, field_id, included_ids_map, curated_topster, searched_queries);

        all_result_ids_len = filter_ids_length;
@@ -2063,191 +2063,163 @@ void Index::score_results(const std::vector<sort_by> & sort_fields, const uint16
                          const uint8_t & field_id, const uint32_t total_cost, Topster* topster,
                          const std::vector<art_leaf *> &query_suggestion,
                          spp::sparse_hash_set<uint64_t>& groups_processed,
-                         const std::vector<std::unordered_map<size_t, std::vector<token_positions_t>>>& array_token_positions_vec,
-                         const uint32_t* result_ids, size_t result_ids_size,
+                         const std::unordered_map<size_t, std::vector<token_positions_t>>& array_token_positions,
+                         const uint32_t seq_id, const int sort_order[3],
+                         std::array<spp::sparse_hash_map<uint32_t, int64_t>*, 3> field_values,
+                         const std::vector<size_t>& geopoint_indices,
                          const size_t group_limit, const std::vector<std::string>& group_by_fields,
                          uint32_t token_bits,
                          const std::vector<token_t>& query_tokens,
                          bool use_single_token_score,
                          bool prioritize_exact_match) const {

-   int sort_order[3]; // 1 or -1 based on DESC or ASC respectively
-   spp::sparse_hash_map<uint32_t, int64_t>* field_values[3];
-
-   spp::sparse_hash_map<uint32_t, int64_t> text_match_sentinel_value, seq_id_sentinel_value;
-   spp::sparse_hash_map<uint32_t, int64_t> *TEXT_MATCH_SENTINEL = &text_match_sentinel_value;
-   spp::sparse_hash_map<uint32_t, int64_t> *SEQ_ID_SENTINEL = &seq_id_sentinel_value;
+   spp::sparse_hash_map<uint32_t, int64_t>* TEXT_MATCH_SENTINEL = &text_match_sentinel_value;
+   spp::sparse_hash_map<uint32_t, int64_t>* SEQ_ID_SENTINEL = &seq_id_sentinel_value;
    spp::sparse_hash_map<uint32_t, int64_t> geopoint_distances[3];

-   for (size_t i = 0; i < sort_fields.size(); i++) {
-       sort_order[i] = 1;
-       if (sort_fields[i].order == sort_field_const::asc) {
-           sort_order[i] = -1;
-       }
-
-       if (sort_fields[i].name == sort_field_const::text_match) {
-           field_values[i] = TEXT_MATCH_SENTINEL;
-       } else if (sort_fields[i].name == sort_field_const::seq_id) {
-           field_values[i] = SEQ_ID_SENTINEL;
-       } else if (sort_schema.at(sort_fields[i].name).is_geopoint()) {
-           // we have to populate distances that will be used for match scoring
-           spp::sparse_hash_map<uint32_t, int64_t> *geopoints = sort_index.at(sort_fields[i].name);
-
-           S2LatLng reference_lat_lng;
-           GeoPoint::unpack_lat_lng(sort_fields[i].geopoint, reference_lat_lng);
-
-           for (size_t rindex = 0; rindex < result_ids_size; rindex++) {
-               const uint32_t seq_id = result_ids[rindex];
-               auto it = geopoints->find(seq_id);
-               int64_t dist = INT32_MAX;
-
-               if(it != geopoints->end()) {
-                   int64_t packed_latlng = it->second;
-                   S2LatLng s2_lat_lng;
-                   GeoPoint::unpack_lat_lng(packed_latlng, s2_lat_lng);
-                   dist = GeoPoint::distance(s2_lat_lng, reference_lat_lng);
-               }
-
-               if(dist < sort_fields[i].exclude_radius) {
-                   dist = 0;
-               }
-
-               if(sort_fields[i].geo_precision > 0) {
-                   dist = dist + sort_fields[i].geo_precision - 1 -
-                          (dist + sort_fields[i].geo_precision - 1) % sort_fields[i].geo_precision;
-               }
-
-               geopoint_distances[i].emplace(seq_id, dist);
-           }
-
-           field_values[i] = &geopoint_distances[i];
-       } else {
-           field_values[i] = sort_index.at(sort_fields[i].name);
-       }
-   }
+   for(auto& i: geopoint_indices) {
+       spp::sparse_hash_map<uint32_t, int64_t>* geopoints = field_values[i];
+
+       S2LatLng reference_lat_lng;
+       GeoPoint::unpack_lat_lng(sort_fields[i].geopoint, reference_lat_lng);
+
+       auto it = geopoints->find(seq_id);
+       int64_t dist = INT32_MAX;
+
+       if(it != geopoints->end()) {
+           int64_t packed_latlng = it->second;
+           S2LatLng s2_lat_lng;
+           GeoPoint::unpack_lat_lng(packed_latlng, s2_lat_lng);
+           dist = GeoPoint::distance(s2_lat_lng, reference_lat_lng);
+       }
+
+       if(dist < sort_fields[i].exclude_radius) {
+           dist = 0;
+       }
+
+       if(sort_fields[i].geo_precision > 0) {
+           dist = dist + sort_fields[i].geo_precision - 1 -
+                  (dist + sort_fields[i].geo_precision - 1) % sort_fields[i].geo_precision;
+       }
+
+       geopoint_distances[i].emplace(seq_id, dist);
+
+       // Swap (id -> latlong) index to (id -> distance) index
+       field_values[i] = &geopoint_distances[i];
+   }

-   Match single_token_match = Match(1, 0);
-   const uint64_t single_token_match_score = single_token_match.get_match_score(total_cost);
-
    //auto begin = std::chrono::high_resolution_clock::now();
    //const std::string first_token((const char*)query_suggestion[0]->key, query_suggestion[0]->key_len-1);

-   for (size_t i = 0; i < result_ids_size; i++) {
-       const uint32_t seq_id = result_ids[i];
-
-       uint64_t match_score = 0;
-
-       if (use_single_token_score || array_token_positions_vec.empty()) {
-           match_score = single_token_match_score;
-       } else {
-           const std::unordered_map<size_t, std::vector<token_positions_t>>& array_token_positions =
-                   array_token_positions_vec[i];
-
-           uint64_t total_tokens_found = 0, total_num_typos = 0, total_distance = 0, total_verbatim = 0;
-
-           for (const auto& kv: array_token_positions) {
-               const std::vector<token_positions_t>& token_positions = kv.second;
-               if (token_positions.empty()) {
-                   continue;
-               }
-               const Match &match = Match(seq_id, token_positions, false, prioritize_exact_match);
-               uint64_t this_match_score = match.get_match_score(total_cost);
-
-               total_tokens_found += ((this_match_score >> 24) & 0xFF);
-               total_num_typos += 255 - ((this_match_score >> 16) & 0xFF);
-               total_distance += 100 - ((this_match_score >> 8) & 0xFF);
-               total_verbatim += (this_match_score & 0xFF);
-
-               /*std::ostringstream os;
-               os << name << ", total_cost: " << (255 - total_cost)
-                  << ", words_present: " << match.words_present
-                  << ", match_score: " << match_score
-                  << ", match.distance: " << match.distance
-                  << ", seq_id: " << seq_id << std::endl;
-               LOG(INFO) << os.str();*/
-           }
-
-           match_score = (
-               (uint64_t(total_tokens_found) << 24) |
-               (uint64_t(255 - total_num_typos) << 16) |
-               (uint64_t(100 - total_distance) << 8) |
-               (uint64_t(total_verbatim) << 1)
-           );
-
-           /*LOG(INFO) << "Match score: " << match_score << ", for seq_id: " << seq_id
-                     << " - total_tokens_found: " << total_tokens_found
-                     << " - total_num_typos: " << total_num_typos
-                     << " - total_distance: " << total_distance
-                     << " - total_verbatim: " << total_verbatim
-                     << " - total_cost: " << total_cost;*/
-       }
-
-       const int64_t default_score = INT64_MIN; // to handle field that doesn't exist in document (e.g. optional)
-       int64_t scores[3] = {0};
-       size_t match_score_index = 0;
-
-       // avoiding loop
-       if (sort_fields.size() > 0) {
-           if (field_values[0] == TEXT_MATCH_SENTINEL) {
-               scores[0] = int64_t(match_score);
-               match_score_index = 0;
-           } else if (field_values[0] == SEQ_ID_SENTINEL) {
-               scores[0] = seq_id;
-           } else {
-               auto it = field_values[0]->find(seq_id);
-               scores[0] = (it == field_values[0]->end()) ? default_score : it->second;
-           }
-           if (sort_order[0] == -1) {
-               scores[0] = -scores[0];
-           }
-       }
-
-       if(sort_fields.size() > 1) {
-           if (field_values[1] == TEXT_MATCH_SENTINEL) {
-               scores[1] = int64_t(match_score);
-               match_score_index = 1;
-           } else if (field_values[1] == SEQ_ID_SENTINEL) {
-               scores[1] = seq_id;
-           } else {
-               auto it = field_values[1]->find(seq_id);
-               scores[1] = (it == field_values[1]->end()) ? default_score : it->second;
-           }
-
-           if (sort_order[1] == -1) {
-               scores[1] = -scores[1];
-           }
-       }
-
-       if(sort_fields.size() > 2) {
-           if(field_values[2] == TEXT_MATCH_SENTINEL) {
-               scores[2] = int64_t(match_score);
-               match_score_index = 2;
-           } else if (field_values[2] == SEQ_ID_SENTINEL) {
-               scores[2] = seq_id;
-           } else {
-               auto it = field_values[2]->find(seq_id);
-               scores[2] = (it == field_values[2]->end()) ? default_score : it->second;
-           }
-
-           if(sort_order[2] == -1) {
-               scores[2] = -scores[2];
-           }
-       }
-
-       uint64_t distinct_id = seq_id;
-
-       if(group_limit != 0) {
-           distinct_id = get_distinct_id(group_by_fields, seq_id);
-           groups_processed.emplace(distinct_id);
-       }
-
-       //LOG(INFO) << "Seq id: " << seq_id << ", match_score: " << match_score;
-       KV kv(field_id, query_index, token_bits, seq_id, distinct_id, match_score_index, scores);
-       topster->add(&kv);
-   }
+   uint64_t match_score = 0;
+
+   if (use_single_token_score || array_token_positions.empty()) {
+       Match single_token_match = Match(1, 0);
+       const uint64_t single_token_match_score = single_token_match.get_match_score(total_cost);
+       match_score = single_token_match_score;
+   } else {
+       uint64_t total_tokens_found = 0, total_num_typos = 0, total_distance = 0, total_verbatim = 0;
+
+       for (const auto& kv: array_token_positions) {
+           const std::vector<token_positions_t>& token_positions = kv.second;
+           if (token_positions.empty()) {
+               continue;
+           }
+           const Match &match = Match(seq_id, token_positions, false, prioritize_exact_match);
+           uint64_t this_match_score = match.get_match_score(total_cost);
+
+           total_tokens_found += ((this_match_score >> 24) & 0xFF);
+           total_num_typos += 255 - ((this_match_score >> 16) & 0xFF);
+           total_distance += 100 - ((this_match_score >> 8) & 0xFF);
+           total_verbatim += (this_match_score & 0xFF);
+
+           /*std::ostringstream os;
+           os << name << ", total_cost: " << (255 - total_cost)
+              << ", words_present: " << match.words_present
+              << ", match_score: " << match_score
+              << ", match.distance: " << match.distance
+              << ", seq_id: " << seq_id << std::endl;
+           LOG(INFO) << os.str();*/
+       }
+
+       match_score = (
+           (uint64_t(total_tokens_found) << 24) |
+           (uint64_t(255 - total_num_typos) << 16) |
+           (uint64_t(100 - total_distance) << 8) |
+           (uint64_t(total_verbatim) << 1)
+       );
+
+       /*LOG(INFO) << "Match score: " << match_score << ", for seq_id: " << seq_id
+                 << " - total_tokens_found: " << total_tokens_found
+                 << " - total_num_typos: " << total_num_typos
+                 << " - total_distance: " << total_distance
+                 << " - total_verbatim: " << total_verbatim
+                 << " - total_cost: " << total_cost;*/
+   }
+
+   const int64_t default_score = INT64_MIN; // to handle field that doesn't exist in document (e.g. optional)
+   int64_t scores[3] = {0};
+   size_t match_score_index = 0;
+
+   // avoiding loop
+   if (sort_fields.size() > 0) {
+       if (field_values[0] == TEXT_MATCH_SENTINEL) {
+           scores[0] = int64_t(match_score);
+           match_score_index = 0;
+       } else if (field_values[0] == SEQ_ID_SENTINEL) {
+           scores[0] = seq_id;
+       } else {
+           auto it = field_values[0]->find(seq_id);
+           scores[0] = (it == field_values[0]->end()) ? default_score : it->second;
+       }
+       if (sort_order[0] == -1) {
+           scores[0] = -scores[0];
+       }
+   }
+
+   if(sort_fields.size() > 1) {
+       if (field_values[1] == TEXT_MATCH_SENTINEL) {
+           scores[1] = int64_t(match_score);
+           match_score_index = 1;
+       } else if (field_values[1] == SEQ_ID_SENTINEL) {
+           scores[1] = seq_id;
+       } else {
+           auto it = field_values[1]->find(seq_id);
+           scores[1] = (it == field_values[1]->end()) ? default_score : it->second;
+       }
+
+       if (sort_order[1] == -1) {
+           scores[1] = -scores[1];
+       }
+   }
+
+   if(sort_fields.size() > 2) {
+       if(field_values[2] == TEXT_MATCH_SENTINEL) {
+           scores[2] = int64_t(match_score);
+           match_score_index = 2;
+       } else if (field_values[2] == SEQ_ID_SENTINEL) {
+           scores[2] = seq_id;
+       } else {
+           auto it = field_values[2]->find(seq_id);
+           scores[2] = (it == field_values[2]->end()) ? default_score : it->second;
+       }
+
+       if(sort_order[2] == -1) {
+           scores[2] = -scores[2];
+       }
+   }
+
+   uint64_t distinct_id = seq_id;
+
+   if(group_limit != 0) {
+       distinct_id = get_distinct_id(group_by_fields, seq_id);
+       groups_processed.emplace(distinct_id);
+   }
+
+   //LOG(INFO) << "Seq id: " << seq_id << ", match_score: " << match_score;
+   KV kv(field_id, query_index, token_bits, seq_id, distinct_id, match_score_index, scores);
+   topster->add(&kv);

    //long long int timeNanos = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - begin).count();
    //LOG(INFO) << "Time taken for results iteration: " << timeNanos << "ms";
}
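The aggregate match score packs four components into one uint64_t so the topster can order results with a single integer comparison; typo count and positional distance are stored as complements so "fewer/smaller is better" still maps to a larger packed value. A worked example with illustrative values:

    #include <cassert>
    #include <cstdint>

    int main() {
        uint64_t total_tokens_found = 2;   // both query tokens matched
        uint64_t total_num_typos    = 1;   // one typo across all tokens
        uint64_t total_distance     = 3;   // positional spread between tokens
        uint64_t total_verbatim     = 0;   // no verbatim (exact) match

        uint64_t match_score = (
            (total_tokens_found << 24) |        // bits 24-31
            ((255 - total_num_typos) << 16) |   // bits 16-23, complemented
            ((100 - total_distance) << 8) |     // bits 8-15, complemented
            (total_verbatim << 1)               // bits 1+
        );

        // Unpacking reverses the shifts and complements:
        assert(((match_score >> 24) & 0xFF) == total_tokens_found);
        assert((255 - ((match_score >> 16) & 0xFF)) == total_num_typos);
        assert((100 - ((match_score >> 8) & 0xFF)) == total_distance);
        return 0;
    }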
@@ -418,39 +418,20 @@ void posting_t::destroy_list(void*& obj) {
}

void posting_t::get_array_token_positions(uint32_t id, const std::vector<void*>& raw_posting_lists,
-                                         std::vector<std::unordered_map<size_t, std::vector<token_positions_t>>>& array_token_positions_vec) {
+                                         std::unordered_map<size_t, std::vector<token_positions_t>>& array_token_positions) {

    std::vector<posting_list_t*> plists;
    std::vector<uint32_t> expanded_plist_indices;
    to_expanded_plists(raw_posting_lists, plists, expanded_plist_indices);

-   posting_list_t::result_iter_state_t iter_state;
-   iter_state.ids.push_back(id);
-   iter_state.num_lists = plists.size();
-
-   std::vector<posting_list_t::block_t*>& block_vec = iter_state.blocks;
-   std::vector<uint32_t>& index_vec = iter_state.indices;
+   std::vector<posting_list_t::iterator_t> its;

    for(posting_list_t* pl: plists) {
-       posting_list_t::block_t* block = pl->block_of(id);
-       block_vec.push_back(block);
-
-       bool found_index = false;
-
-       if(block != nullptr) {
-           uint32_t index = block->ids.indexOf(id);
-           if(index != block->ids.getLength()) {
-               index_vec.push_back(index);
-               found_index = true;
-           }
-       }
-
-       if(!found_index) {
-           index_vec.push_back(UINT32_MAX);
-       }
+       its.push_back(pl->new_iterator());
+       its.back().skip_to(id);
    }

-   posting_list_t::get_offsets(iter_state, array_token_positions_vec);
+   posting_list_t::get_offsets(its, array_token_positions);

    for(uint32_t expanded_plist_index: expanded_plist_indices) {
        delete plists[expanded_plist_index];
@@ -661,96 +661,44 @@ void posting_list_t::intersect(const std::vector<posting_list_t*>& posting_lists
    }
}

-bool posting_list_t::block_intersect(const std::vector<posting_list_t*>& posting_lists, const size_t batch_size,
-                                     std::vector<posting_list_t::iterator_t>& its,
-                                     result_iter_state_t& iter_state) {
-
-    if(posting_lists.empty()) {
-        return false;
-    }
-
-    if(its.empty()) {
-        its.reserve(posting_lists.size());
-
-        for(const auto& posting_list: posting_lists) {
-            its.push_back(posting_list->new_iterator());
-        }
-
-        iter_state.num_lists = posting_lists.size();
-        iter_state.blocks.reserve(100);
-        iter_state.indices.reserve(100);
-    } else {
-        // already in the middle of iteration: prepare for next batch
-        iter_state.ids.clear();
-        iter_state.indices.clear();
-        iter_state.blocks.clear();
-    }
-
-    size_t num_lists = its.size();
-
-    switch (num_lists) {
-        case 1:
-            while(its[0].valid()) {
-                iter_state.ids.push_back(its[0].id());
-                iter_state.blocks.push_back(its[0].block());
-                iter_state.indices.push_back(its[0].index());
-
-                its[0].next();
-
-                if(iter_state.ids.size() == batch_size) {
-                    return its[0].valid();
-                }
-            }
-            break;
-        case 2:
-            while(!at_end2(its)) {
-                if(equals2(its)) {
-                    // still need to ensure that the ID exists in inclusion list but NOT in exclusion list
-                    iter_state.ids.push_back(its[0].id());
-
-                    iter_state.blocks.push_back(its[0].block());
-                    iter_state.blocks.push_back(its[1].block());
-
-                    iter_state.indices.push_back(its[0].index());
-                    iter_state.indices.push_back(its[1].index());
-
-                    advance_all2(its);
-                } else {
-                    advance_non_largest2(its);
-                }
-
-                if(iter_state.ids.size() == batch_size) {
-                    return !at_end2(its);
-                }
-            }
-            break;
-        default:
-            while(!at_end(its)) {
-                if(equals(its)) {
-                    //LOG(INFO) << its[0].id();
-                    iter_state.ids.push_back(its[0].id());
-
-                    for(size_t i = 0; i < its.size(); i++) {
-                        iter_state.blocks.push_back(its[i].block());
-                        iter_state.indices.push_back(its[i].index());
-                    }
-
-                    advance_all(its);
-                } else {
-                    advance_non_largest(its);
-                }
-
-                if(iter_state.ids.size() == batch_size) {
-                    return !at_end(its);
-                }
-            }
-    }
-
-    return false;
-}
+bool posting_list_t::take_id(result_iter_state_t& istate, uint32_t id) {
+    // decide if this result id should be excluded
+    if(istate.excluded_result_ids_size != 0) {
+        while(istate.excluded_result_ids_index < istate.excluded_result_ids_size &&
+              istate.excluded_result_ids[istate.excluded_result_ids_index] < id) {
+            istate.excluded_result_ids_index++;
+        }
+
+        if(istate.excluded_result_ids_index < istate.excluded_result_ids_size &&
+           id == istate.excluded_result_ids[istate.excluded_result_ids_index]) {
+            istate.excluded_result_ids_index++;
+            return false;
+        }
+    }
+
+    bool id_found_in_filter = true;
+
+    // decide if this result should be matched with filter results
+    if(istate.filter_ids_length != 0) {
+        id_found_in_filter = false;
+
+        // e.g. [1, 3] vs [2, 3]
+
+        while(istate.filter_ids_index < istate.filter_ids_length && istate.filter_ids[istate.filter_ids_index] < id) {
+            istate.filter_ids_index++;
+        }
+
+        if(istate.filter_ids_index < istate.filter_ids_length && istate.filter_ids[istate.filter_ids_index] == id) {
+            istate.filter_ids_index++;
+            id_found_in_filter = true;
+        }
+    }
+
+    return id_found_in_filter;
+}
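take_id walks the exclusion and filter arrays with persistent cursors stored in result_iter_state_t, so across a stream of ascending ids each array is scanned at most once (O(n + m) overall, instead of a binary search per id). A self-contained illustration of the filter half with concrete arrays (a simplified sketch, not the project's code):

    #include <cstdint>
    #include <cstdio>

    struct state_t {
        const uint32_t* filter_ids;  // ascending
        size_t filter_len;
        size_t filter_idx = 0;       // persistent cursor, only moves forward
    };

    bool take_id(state_t& st, uint32_t id) {
        while(st.filter_idx < st.filter_len && st.filter_ids[st.filter_idx] < id) {
            st.filter_idx++;
        }
        return st.filter_idx < st.filter_len && st.filter_ids[st.filter_idx] == id;
    }

    int main() {
        const uint32_t filters[] = {2, 3, 9};
        state_t st{filters, 3};
        const uint32_t intersected[] = {1, 3, 9, 12};  // ids from the intersection, ascending
        for(uint32_t id: intersected) {
            printf("%u -> %s\n", id, take_id(st, id) ? "kept" : "dropped");
        }
        // prints: 1 -> dropped, 3 -> kept, 9 -> kept, 12 -> dropped
        return 0;
    }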
-bool posting_list_t::get_offsets(posting_list_t::result_iter_state_t& iter_state,
-                                 std::vector<std::unordered_map<size_t, std::vector<token_positions_t>>>& array_token_positions_vec) {
+bool posting_list_t::get_offsets(std::vector<iterator_t>& its,
+                                 std::unordered_map<size_t, std::vector<token_positions_t>>& array_token_pos) {

    // Plain string format:
    // offset1, offset2, ... , 0 (if token is the last offset for the document)
@@ -763,78 +711,84 @@ bool posting_list_t::get_offsets(

-   size_t id_block_index = 0;
-
-   for(size_t i = 0; i < iter_state.ids.size(); i++) {
-       uint32_t id = iter_state.ids[i];
-       array_token_positions_vec.emplace_back();
-       std::unordered_map<size_t, std::vector<token_positions_t>>& array_tok_pos = array_token_positions_vec.back();
-
-       for(size_t j = 0; j < iter_state.num_lists; j++) {
-           block_t* curr_block = iter_state.blocks[id_block_index];
-           uint32_t curr_index = iter_state.indices[id_block_index++];
-
-           if(curr_block == nullptr || curr_index == UINT32_MAX) {
-               continue;
-           }
-
-           uint32_t* offsets = curr_block->offsets.uncompress();
-
-           uint32_t start_offset = curr_block->offset_index.at(curr_index);
-           uint32_t end_offset = (curr_index == curr_block->size() - 1) ?
-                                 curr_block->offsets.getLength() :
-                                 curr_block->offset_index.at(curr_index + 1);
-
-           std::vector<uint16_t> positions;
-           int prev_pos = -1;
-           bool is_last_token = false;
-
-           while(start_offset < end_offset) {
-               int pos = offsets[start_offset];
-               start_offset++;
-
-               if(pos == 0) {
-                   // indicates that token is the last token on the doc
-                   is_last_token = true;
-                   start_offset++;
-                   continue;
-               }
-
-               if(pos == prev_pos) { // indicates end of array index
-                   if(!positions.empty()) {
-                       size_t array_index = (size_t) offsets[start_offset];
-                       is_last_token = false;
-
-                       if(start_offset+1 < end_offset) {
-                           size_t next_offset = (size_t) offsets[start_offset + 1];
-                           if(next_offset == 0) {
-                               // indicates that token is the last token on the doc
-                               is_last_token = true;
-                               start_offset++;
-                           }
-                       }
-
-                       array_tok_pos[array_index].push_back(token_positions_t{is_last_token, positions});
-                       positions.clear();
-                   }
-
-                   start_offset++; // skip current value which is the array index or flag for last index
-                   prev_pos = -1;
-                   continue;
-               }
-
-               prev_pos = pos;
-               positions.push_back((uint16_t)pos - 1);
-           }
-
-           if(!positions.empty()) {
-               // for plain string fields
-               array_tok_pos[0].push_back(token_positions_t{is_last_token, positions});
-           }
-
-           delete [] offsets;
-       }
-   }
-
-   return false;
+   for(size_t j = 0; j < its.size(); j++) {
+       block_t* curr_block = its[j].block();
+       uint32_t curr_index = its[j].index();
+
+       if(curr_block == nullptr || curr_index == UINT32_MAX) {
+           continue;
+       }
+
+       /*uint32_t* offsets = curr_block->offsets.uncompress();
+
+       uint32_t start_offset = curr_block->offset_index.at(curr_index);
+       uint32_t end_offset = (curr_index == curr_block->size() - 1) ?
+                             curr_block->offsets.getLength() :
+                             curr_block->offset_index.at(curr_index + 1);*/
+
+       uint32_t* offsets = its[j].offsets;
+
+       uint32_t start_offset = its[j].offset_index[curr_index];
+       uint32_t end_offset = (curr_index == curr_block->size() - 1) ?
+                             curr_block->offsets.getLength() :
+                             its[j].offset_index[curr_index + 1];
+
+       std::vector<uint16_t> positions;
+       int prev_pos = -1;
+       bool is_last_token = false;
+
+       /*LOG(INFO) << "id: " << its[j].id() << ", start_offset: " << start_offset << ", end_offset: " << end_offset;
+       for(size_t x = 0; x < end_offset; x++) {
+           LOG(INFO) << "x: " << x << ", pos: " << offsets[x];
+       }*/
+
+       while(start_offset < end_offset) {
+           int pos = offsets[start_offset];
+           start_offset++;
+
+           if(pos == 0) {
+               // indicates that token is the last token on the doc
+               is_last_token = true;
+               start_offset++;
+               continue;
+           }
+
+           if(pos == prev_pos) { // indicates end of array index
+               if(!positions.empty()) {
+                   size_t array_index = (size_t) offsets[start_offset];
+                   is_last_token = false;
+
+                   if(start_offset+1 < end_offset) {
+                       size_t next_offset = (size_t) offsets[start_offset + 1];
+                       if(next_offset == 0) {
+                           // indicates that token is the last token on the doc
+                           is_last_token = true;
+                           start_offset++;
+                       }
+                   }
+
+                   array_token_pos[array_index].push_back(token_positions_t{is_last_token, positions});
+                   positions.clear();
+               }
+
+               start_offset++; // skip current value which is the array index or flag for last index
+               prev_pos = -1;
+               continue;
+           }
+
+           prev_pos = pos;
+           positions.push_back((uint16_t)pos - 1);
+       }
+
+       if(!positions.empty()) {
+           // for plain string fields
+           array_token_pos[0].push_back(token_positions_t{is_last_token, positions});
+       }
+
+       //delete [] offsets;
+   }
+
+   return true;
}

bool posting_list_t::at_end(const std::vector<posting_list_t::iterator_t>& its) {
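A worked example of the array-field offset encoding that the parser above decodes, reconstructed from the parsing logic itself (the byte values are illustrative). Positions are stored 1-based; a repeated position marks the end of an array element, the value after it is the array index, and a trailing 0 flags the token as the last one in that element:

    // Token occurs at positions {0, 3} of array element 2, and at position {1}
    // of array element 5 where it is also the last token of that element.
    uint32_t offsets[] = {1, 4, 4, 2, 2, 2, 5, 0};
    // decodes to:
    //   array_token_pos[2] = { token_positions_t{false, {0, 3}} }
    //   array_token_pos[5] = { token_positions_t{true,  {1}} }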
@@ -982,8 +936,13 @@ bool posting_list_t::contains_atleast_one(const uint32_t* target_ids, size_t tar
/* iterator_t operations */

posting_list_t::iterator_t::iterator_t(posting_list_t::block_t* root):
-       curr_block(root), curr_index(0), uncompressed_block(nullptr), ids(nullptr) {
+       curr_block(root), curr_index(0) {
+
+   if(curr_block != nullptr) {
+       ids = curr_block->ids.uncompress();
+       offset_index = curr_block->offset_index.uncompress();
+       offsets = curr_block->offsets.uncompress();
+   }
}

bool posting_list_t::iterator_t::valid() const {
@@ -995,21 +954,22 @@ void posting_list_t::iterator_t::next() {
    if(curr_index == curr_block->size()) {
        curr_index = 0;
        curr_block = curr_block->next;
+
+       delete [] ids;
+       delete [] offset_index;
+       delete [] offsets;
+
+       ids = offset_index = offsets = nullptr;
+
+       if(curr_block != nullptr) {
+           ids = curr_block->ids.uncompress();
+           offset_index = curr_block->offset_index.uncompress();
+           offsets = curr_block->offsets.uncompress();
+       }
    }
}

-uint32_t posting_list_t::iterator_t::id() {
-    if(uncompressed_block != curr_block) {
-        uncompressed_block = curr_block;
-
-        delete [] ids;
-        ids = nullptr;
-
-        if(curr_block != nullptr) {
-            ids = curr_block->ids.uncompress();
-        }
-    }
-
+uint32_t posting_list_t::iterator_t::id() const {
    return ids[curr_index];
}
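The iterator now eagerly uncompresses ids, offset_index and offsets whenever it enters a block, instead of lazily tracking an uncompressed_block inside id(). That turns id() into a const array read on the hottest path of the intersection. A sketch of the effect (not the real loop):

    // With eager block decompression, the comparison loop reduces to plain reads:
    while(it.valid()) {
        uint32_t doc = it.id();   // now just ids[curr_index]; no branch, no allocation
        // ... compare against the other lists / advance ...
        it.next();                // refreshes the cached arrays only on block change
    }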
@@ -1025,6 +985,20 @@ void posting_list_t::iterator_t::skip_to(uint32_t id) {
    bool skipped_block = false;
    while(curr_block != nullptr && curr_block->ids.last() < id) {
        curr_block = curr_block->next;
+
+       // FIXME: remove duplication
+       delete [] ids;
+       delete [] offset_index;
+       delete [] offsets;
+
+       ids = offset_index = offsets = nullptr;
+
+       if(curr_block != nullptr) {
+           ids = curr_block->ids.uncompress();
+           offset_index = curr_block->offset_index.uncompress();
+           offsets = curr_block->offsets.uncompress();
+       }
+
        skipped_block = true;
    }

@@ -1045,10 +1019,12 @@ posting_list_t::iterator_t::~iterator_t() {
posting_list_t::iterator_t::iterator_t(iterator_t&& rhs) noexcept {
    curr_block = rhs.curr_block;
    curr_index = rhs.curr_index;
-   uncompressed_block = rhs.uncompressed_block;
    ids = rhs.ids;
+   offset_index = rhs.offset_index;
+   offsets = rhs.offsets;

    rhs.curr_block = nullptr;
-   rhs.uncompressed_block = nullptr;
    rhs.ids = nullptr;
+   rhs.offset_index = nullptr;
+   rhs.offsets = nullptr;
}
@@ -608,28 +608,13 @@ TEST(PostingListTest, IntersectionBasics) {

    std::vector<posting_list_t::iterator_t> its;
    posting_list_t::result_iter_state_t iter_state;
-   bool has_more = posting_list_t::block_intersect(lists, 2, its, iter_state);
-   ASSERT_FALSE(has_more);
-   ASSERT_EQ(2, iter_state.ids.size());
-   ASSERT_EQ(3, iter_state.ids[0]);
-   ASSERT_EQ(20, iter_state.ids[1]);
-
-   ASSERT_EQ(6, iter_state.blocks.size());
-   ASSERT_EQ(6, iter_state.indices.size());
-
-   // try with smaller batch size
-
-   std::vector<posting_list_t::iterator_t> its2;
-   posting_list_t::result_iter_state_t iter_state2;
-   has_more = posting_list_t::block_intersect(lists, 1, its2, iter_state2);
-   ASSERT_TRUE(has_more);
-   ASSERT_EQ(1, iter_state2.ids.size());
-   ASSERT_EQ(3, iter_state2.ids[0]);
-
-   has_more = posting_list_t::block_intersect(lists, 1, its2, iter_state2);
-   ASSERT_FALSE(has_more);
-   ASSERT_EQ(1, iter_state2.ids.size());
-   ASSERT_EQ(20, iter_state2.ids[0]);
+   result_ids.clear();
+   posting_list_t::block_intersect(lists, its, iter_state, [&](auto id, auto& its){
+       result_ids.push_back(id);
+   });
+   ASSERT_EQ(2, result_ids.size());
+   ASSERT_EQ(3, result_ids[0]);
+   ASSERT_EQ(20, result_ids[1]);

    // single item intersection
    std::vector<posting_list_t*> single_item_list = {&p1};
@@ -644,23 +629,15 @@ TEST(PostingListTest, IntersectionBasics) {

    std::vector<posting_list_t::iterator_t> its3;
    posting_list_t::result_iter_state_t iter_state3;
-   has_more = posting_list_t::block_intersect(single_item_list, 2, its3, iter_state3);
-   ASSERT_TRUE(has_more);
-   ASSERT_EQ(2, iter_state3.ids.size());
-   ASSERT_EQ(0, iter_state3.ids[0]);
-   ASSERT_EQ(2, iter_state3.ids[1]);
-   ASSERT_EQ(2, iter_state3.blocks.size());
-   ASSERT_EQ(2, iter_state3.indices.size());
-   ASSERT_EQ(0, iter_state3.indices[0]);
-   ASSERT_EQ(1, iter_state3.indices[1]);
-
-   has_more = posting_list_t::block_intersect(single_item_list, 2, its3, iter_state3);
-   ASSERT_FALSE(has_more);
-   ASSERT_EQ(2, iter_state3.ids.size());
-   ASSERT_EQ(2, iter_state3.blocks.size());
-   ASSERT_EQ(2, iter_state3.indices.size());
-   ASSERT_EQ(0, iter_state3.indices[0]);
-   ASSERT_EQ(1, iter_state3.indices[1]);
+   result_ids.clear();
+   posting_list_t::block_intersect(single_item_list, its3, iter_state3, [&](auto id, auto& its){
+       result_ids.push_back(id);
+   });
+   ASSERT_EQ(4, result_ids.size());
+   ASSERT_EQ(0, result_ids[0]);
+   ASSERT_EQ(2, result_ids[1]);
+   ASSERT_EQ(3, result_ids[2]);
+   ASSERT_EQ(20, result_ids[3]);

    // empty intersection list
    std::vector<posting_list_t*> empty_list;
@@ -670,11 +647,11 @@ TEST(PostingListTest, IntersectionBasics) {

    std::vector<posting_list_t::iterator_t> its4;
    posting_list_t::result_iter_state_t iter_state4;
-   has_more = posting_list_t::block_intersect(empty_list, 1, its4, iter_state4);
-   ASSERT_FALSE(has_more);
-   ASSERT_EQ(0, iter_state4.ids.size());
-   ASSERT_EQ(0, iter_state4.blocks.size());
-   ASSERT_EQ(0, iter_state4.indices.size());
+   result_ids.clear();
+   posting_list_t::block_intersect(empty_list, its4, iter_state4, [&](auto id, auto& its){
+       result_ids.push_back(id);
+   });
+   ASSERT_EQ(0, result_ids.size());
}

TEST(PostingListTest, ResultsAndOffsetsBasics) {
@@ -729,6 +706,7 @@ TEST(PostingListTest, ResultsAndOffsetsBasics) {
    lists.push_back(&p2);
    lists.push_back(&p3);

+   /*
    std::vector<posting_list_t::iterator_t> its;
    posting_list_t::result_iter_state_t iter_state;
    bool has_more = posting_list_t::block_intersect(lists, 2, its, iter_state);
@@ -745,6 +723,7 @@ TEST(PostingListTest, ResultsAndOffsetsBasics) {
    ASSERT_EQ(actual_offsets_20[0].positions, array_token_positions_vec[1].at(0)[0].positions);
    ASSERT_EQ(actual_offsets_20[1].positions, array_token_positions_vec[1].at(0)[1].positions);
    ASSERT_EQ(actual_offsets_20[2].positions, array_token_positions_vec[1].at(0)[2].positions);
+   */
}

TEST(PostingListTest, IntersectionSkipBlocks) {
@@ -1269,15 +1248,15 @@ TEST(PostingListTest, BlockIntersectionOnMixedLists) {

    std::vector<void*> raw_posting_lists = {SET_COMPACT_POSTING(list1), &p1};
    posting_list_t::result_iter_state_t iter_state;
-   posting_t::block_intersector_t intersector(raw_posting_lists, 1, iter_state);
+   posting_t::block_intersector_t intersector(raw_posting_lists, iter_state);

-   ASSERT_TRUE(intersector.intersect());
-   ASSERT_EQ(1, iter_state.ids.size());
-   ASSERT_EQ(5, iter_state.ids[0]);
-
-   ASSERT_FALSE(intersector.intersect());
-   ASSERT_EQ(1, iter_state.ids.size());
-   ASSERT_EQ(8, iter_state.ids[0]);
+   std::vector<uint32_t> result_ids;
+   intersector.intersect([&](auto seq_id, auto& its) {
+       result_ids.push_back(seq_id);
+   });
+   ASSERT_EQ(2, result_ids.size());
+   ASSERT_EQ(5, result_ids[0]);
+   ASSERT_EQ(8, result_ids[1]);

    free(list1);
}