#include "collection.h"
|
|
|
|
#include <numeric>
|
|
#include <chrono>
|
|
#include <array_utils.h>
|
|
#include <match_score.h>
|
|
#include <string_utils.h>
|
|
#include <art.h>
|
|
|
|

Collection::Collection(const std::string name, const uint32_t collection_id, const uint32_t next_seq_id, Store *store,
                       const std::vector<field> &fields, const std::string & token_ranking_field):
    name(name), collection_id(collection_id), next_seq_id(next_seq_id), store(store),
    token_ranking_field(token_ranking_field) {

    for(const field& field: fields) {
        art_tree *t = new art_tree;
        art_tree_init(t);
        search_index.emplace(field.name, t);
        search_schema.emplace(field.name, field);

        if(field.is_facet()) {
            facet_value fvalue;
            facet_index.emplace(field.name, fvalue);
            facet_schema.emplace(field.name, field);
        }

        if(field.is_single_integer() || field.is_single_float()) {
            spp::sparse_hash_map<uint32_t, number_t> * doc_to_score = new spp::sparse_hash_map<uint32_t, number_t>();
            sort_index.emplace(field.name, doc_to_score);
            sort_schema.emplace(field.name, field);
        }
    }

    num_documents = 0;
}

Collection::~Collection() {
    for(auto & name_tree: search_index) {
        art_tree_destroy(name_tree.second);
        delete name_tree.second;
        name_tree.second = nullptr;
    }

    search_index.clear();

    for(auto & name_map: sort_index) {
        delete name_map.second;
        name_map.second = nullptr;
    }

    sort_index.clear();
}

uint32_t Collection::get_next_seq_id() {
    store->increment(get_next_seq_id_key(name), 1);
    return next_seq_id++;
}

void Collection::set_next_seq_id(uint32_t seq_id) {
    next_seq_id = seq_id;
}

void Collection::increment_next_seq_id_field() {
    next_seq_id++;
}

Option<std::string> Collection::add(const std::string & json_str) {
    nlohmann::json document;
    try {
        document = nlohmann::json::parse(json_str);
    } catch(...) {
        return Option<std::string>(400, "Bad JSON.");
    }

    uint32_t seq_id = get_next_seq_id();
    std::string seq_id_str = std::to_string(seq_id);

    if(document.count("id") == 0) {
        document["id"] = seq_id_str;
    } else if(!document["id"].is_string()) {
        return Option<std::string>(400, "Document's `id` field should be a string.");
    }

    std::string doc_id = document["id"];

    const Option<uint32_t> & index_memory_op = index_in_memory(document, seq_id);

    if(!index_memory_op.ok()) {
        return Option<std::string>(index_memory_op.code(), index_memory_op.error());
    }

    store->insert(get_doc_id_key(document["id"]), seq_id_str);
    store->insert(get_seq_id_key(seq_id), document.dump());

    return Option<std::string>(doc_id);
}
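
// Illustrative usage of add(), with a hypothetical schema (the field names below
// are examples, not definitions from this file):
//
//   collection->add("{\"title\": \"Hello world\", \"points\": 100}");
//
// parses the JSON, assigns a sequence id (or validates a caller-supplied string
// `id`), indexes the document in memory, and persists both id mappings to the store.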

Option<uint32_t> Collection::validate_index_in_memory(const nlohmann::json &document, uint32_t seq_id) {
    if(!token_ranking_field.empty() && document.count(token_ranking_field) == 0) {
        return Option<>(400, "Field `" + token_ranking_field + "` has been declared as a token ranking field, "
                             "but is not found in the document.");
    }

    if(!token_ranking_field.empty() && !document[token_ranking_field].is_number_integer() &&
       !document[token_ranking_field].is_number_float()) {
        return Option<>(400, "Token ranking field `" + token_ranking_field + "` must be a number.");
    }

    if(!token_ranking_field.empty() && document[token_ranking_field].is_number_integer() &&
       document[token_ranking_field].get<int64_t>() > std::numeric_limits<int32_t>::max()) {
        return Option<>(400, "Token ranking field `" + token_ranking_field + "` exceeds maximum value of int32.");
    }

    if(!token_ranking_field.empty() && document[token_ranking_field].is_number_float() &&
       document[token_ranking_field].get<float>() > std::numeric_limits<float>::max()) {
        return Option<>(400, "Token ranking field `" + token_ranking_field + "` exceeds maximum value of a float.");
    }

    for(const std::pair<std::string, field> & field_pair: search_schema) {
        const std::string & field_name = field_pair.first;

        if(document.count(field_name) == 0) {
            return Option<>(400, "Field `" + field_name + "` has been declared in the schema, "
                                 "but is not found in the document.");
        }

        if(field_pair.second.type == field_types::STRING) {
            if(!document[field_name].is_string()) {
                return Option<>(400, "Field `" + field_name + "` must be a string.");
            }
        } else if(field_pair.second.type == field_types::INT32) {
            if(!document[field_name].is_number_integer()) {
                return Option<>(400, "Field `" + field_name + "` must be an int32.");
            }

            if(document[field_name].get<int64_t>() > INT32_MAX) {
                return Option<>(400, "Field `" + field_name + "` exceeds maximum value of int32.");
            }
        } else if(field_pair.second.type == field_types::INT64) {
            if(!document[field_name].is_number_integer()) {
                return Option<>(400, "Field `" + field_name + "` must be an int64.");
            }
        } else if(field_pair.second.type == field_types::FLOAT) {
            if(!document[field_name].is_number_float()) {
                return Option<>(400, "Field `" + field_name + "` must be a float.");
            }
        } else if(field_pair.second.type == field_types::STRING_ARRAY) {
            if(!document[field_name].is_array()) {
                return Option<>(400, "Field `" + field_name + "` must be a string array.");
            }

            if(document[field_name].size() > 0 && !document[field_name][0].is_string()) {
                return Option<>(400, "Field `" + field_name + "` must be a string array.");
            }
        } else if(field_pair.second.type == field_types::INT32_ARRAY) {
            if(!document[field_name].is_array()) {
                return Option<>(400, "Field `" + field_name + "` must be an int32 array.");
            }

            if(document[field_name].size() > 0 && !document[field_name][0].is_number_integer()) {
                return Option<>(400, "Field `" + field_name + "` must be an int32 array.");
            }
        } else if(field_pair.second.type == field_types::INT64_ARRAY) {
            if(!document[field_name].is_array()) {
                return Option<>(400, "Field `" + field_name + "` must be an int64 array.");
            }

            if(document[field_name].size() > 0 && !document[field_name][0].is_number_integer()) {
                return Option<>(400, "Field `" + field_name + "` must be an int64 array.");
            }
        } else if(field_pair.second.type == field_types::FLOAT_ARRAY) {
            if(!document[field_name].is_array()) {
                return Option<>(400, "Field `" + field_name + "` must be a float array.");
            }

            if(document[field_name].size() > 0 && !document[field_name][0].is_number_float()) {
                return Option<>(400, "Field `" + field_name + "` must be a float array.");
            }
        }
    }

    for(const std::pair<std::string, field> & field_pair: facet_schema) {
        const std::string & field_name = field_pair.first;

        if(document.count(field_name) == 0) {
            return Option<>(400, "Field `" + field_name + "` has been declared as a facet field in the schema, "
                                 "but is not found in the document.");
        }

        if(field_pair.second.type == field_types::STRING) {
            if(!document[field_name].is_string()) {
                return Option<>(400, "Facet field `" + field_name + "` must be a string.");
            }
        } else if(field_pair.second.type == field_types::STRING_ARRAY) {
            if(!document[field_name].is_array()) {
                return Option<>(400, "Facet field `" + field_name + "` must be a string array.");
            }

            if(document[field_name].size() > 0 && !document[field_name][0].is_string()) {
                return Option<>(400, "Facet field `" + field_name + "` must be a string array.");
            }
        } else {
            return Option<>(400, "Facet field `" + field_name + "` must be a string or a string[].");
        }
    }

    return Option<>(200);
}

Option<uint32_t> Collection::index_in_memory(const nlohmann::json &document, uint32_t seq_id) {
    Option<uint32_t> validation_op = validate_index_in_memory(document, seq_id);

    if(!validation_op.ok()) {
        return validation_op;
    }

    int32_t points = 0;

    if(!token_ranking_field.empty()) {
        if(document[token_ranking_field].is_number_float()) {
            // serialize float to an integer and reverse the inverted range
            float n = document[token_ranking_field];
            memcpy(&points, &n, sizeof(int32_t));
            points ^= ((points >> (std::numeric_limits<int32_t>::digits - 1)) | INT32_MIN);
            points = -1 * (INT32_MAX - points);
        } else {
            points = document[token_ranking_field];
        }
    }
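
    // Worked example of the float transform above (a sketch assuming IEEE-754
    // floats and two's-complement wrap-around on the final subtraction):
    //   +1.5f -> bits 0x3FC00000 -> XOR flips only the sign bit -> 0xBFC00000
    //            -> points = +1069547521
    //   -1.5f -> bits 0xBFC00000 -> XOR flips all bits -> 0x403FFFFF
    //            -> points = -1069547520
    // so a larger float always yields a larger `points`, letting floats be
    // ranked with plain integer comparisons.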

    for(const std::pair<std::string, field> & field_pair: search_schema) {
        const std::string & field_name = field_pair.first;
        art_tree *t = search_index.at(field_name);

        if(field_pair.second.type == field_types::STRING) {
            const std::string & text = document[field_name];
            index_string_field(text, points, t, seq_id, field_pair.second.is_facet());
        } else if(field_pair.second.type == field_types::INT32) {
            int32_t value = document[field_name];
            index_int32_field(value, points, t, seq_id);
        } else if(field_pair.second.type == field_types::INT64) {
            int64_t value = document[field_name];
            index_int64_field(value, points, t, seq_id);
        } else if(field_pair.second.type == field_types::FLOAT) {
            float value = document[field_name];
            index_float_field(value, points, t, seq_id);
        } else if(field_pair.second.type == field_types::STRING_ARRAY) {
            std::vector<std::string> strings = document[field_name];
            index_string_array_field(strings, points, t, seq_id, field_pair.second.is_facet());
        } else if(field_pair.second.type == field_types::INT32_ARRAY) {
            std::vector<int32_t> values = document[field_name];
            index_int32_array_field(values, points, t, seq_id);
        } else if(field_pair.second.type == field_types::INT64_ARRAY) {
            std::vector<int64_t> values = document[field_name];
            index_int64_array_field(values, points, t, seq_id);
        } else if(field_pair.second.type == field_types::FLOAT_ARRAY) {
            std::vector<float> values = document[field_name];
            index_float_array_field(values, points, t, seq_id);
        }

        // add numerical values automatically into the sort index
        if(field_pair.second.type == field_types::INT32 || field_pair.second.type == field_types::INT64 ||
           field_pair.second.type == field_types::FLOAT) {
            spp::sparse_hash_map<uint32_t, number_t> *doc_to_score = sort_index.at(field_pair.first);

            if(document[field_pair.first].is_number_integer()) {
                doc_to_score->emplace(seq_id, document[field_pair.first].get<int64_t>());
            } else if(document[field_pair.first].is_number_float()) {
                doc_to_score->emplace(seq_id, document[field_pair.first].get<float>());
            }
        }
    }

    for(const std::pair<std::string, field> & field_pair: facet_schema) {
        const std::string & field_name = field_pair.first;
        facet_value & fvalue = facet_index.at(field_name);
        if(field_pair.second.type == field_types::STRING) {
            const std::string & value = document[field_name];
            fvalue.index_values(seq_id, { value });
        } else if(field_pair.second.type == field_types::STRING_ARRAY) {
            const std::vector<std::string> & values = document[field_name];
            fvalue.index_values(seq_id, values);
        }
    }

    num_documents += 1;
    return Option<>(200);
}

void Collection::index_int32_field(const int32_t value, uint32_t score, art_tree *t, uint32_t seq_id) const {
    const int KEY_LEN = 8;
    unsigned char key[KEY_LEN];

    encode_int32(value, key);

    uint32_t num_hits = 0;
    art_leaf* leaf = (art_leaf *) art_search(t, key, KEY_LEN);
    if(leaf != NULL) {
        num_hits = leaf->values->ids.getLength();
    }

    num_hits += 1;

    art_document art_doc;
    art_doc.id = seq_id;
    art_doc.score = score;
    art_doc.offsets_len = 0;
    art_doc.offsets = nullptr;

    art_insert(t, key, KEY_LEN, &art_doc, num_hits);
}

void Collection::index_int64_field(const int64_t value, uint32_t score, art_tree *t, uint32_t seq_id) const {
    const int KEY_LEN = 8;
    unsigned char key[KEY_LEN];

    encode_int64(value, key);

    uint32_t num_hits = 0;
    art_leaf* leaf = (art_leaf *) art_search(t, key, KEY_LEN);
    if(leaf != NULL) {
        num_hits = leaf->values->ids.getLength();
    }

    num_hits += 1;

    art_document art_doc;
    art_doc.id = seq_id;
    art_doc.score = score;
    art_doc.offsets_len = 0;
    art_doc.offsets = nullptr;

    art_insert(t, key, KEY_LEN, &art_doc, num_hits);
}

void Collection::index_float_field(const float value, uint32_t score, art_tree *t, uint32_t seq_id) const {
    const int KEY_LEN = 8;
    unsigned char key[KEY_LEN];

    encode_float(value, key);

    uint32_t num_hits = 0;
    art_leaf* leaf = (art_leaf *) art_search(t, key, KEY_LEN);
    if(leaf != NULL) {
        num_hits = leaf->values->ids.getLength();
    }

    num_hits += 1;

    art_document art_doc;
    art_doc.id = seq_id;
    art_doc.score = score;
    art_doc.offsets_len = 0;
    art_doc.offsets = nullptr;

    art_insert(t, key, KEY_LEN, &art_doc, num_hits);
}
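
// Sketch of the offsets map built below, for a hypothetical input: indexing the
// text "the quick the fox" yields token_to_offsets =
//   { "the" -> [0, 2], "quick" -> [1], "fox" -> [3] }
// i.e. each lowercased token maps to every position at which it occurs, and these
// positions are stored with the posting so proximity can be scored at query time.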

void Collection::index_string_field(const std::string & text, const uint32_t score, art_tree *t,
                                    uint32_t seq_id, const bool verbatim) const {
    std::vector<std::string> tokens;
    std::unordered_map<std::string, std::vector<uint32_t>> token_to_offsets;

    if(verbatim) {
        tokens.push_back(text);
        token_to_offsets[text].push_back(0);
    } else {
        StringUtils::split(text, tokens, " ");
        for(uint32_t i=0; i<tokens.size(); i++) {
            auto & token = tokens[i];
            std::transform(token.begin(), token.end(), token.begin(), ::tolower);
            token_to_offsets[token].push_back(i);
        }
    }

    for(auto & kv: token_to_offsets) {
        art_document art_doc;
        art_doc.id = seq_id;
        art_doc.score = score;
        art_doc.offsets_len = (uint32_t) kv.second.size();
        art_doc.offsets = new uint32_t[kv.second.size()];

        uint32_t num_hits = 0;

        const unsigned char *key = (const unsigned char *) kv.first.c_str();
        int key_len = (int) kv.first.length() + 1; // for the terminating \0 char

        art_leaf* leaf = (art_leaf *) art_search(t, key, key_len);
        if(leaf != NULL) {
            num_hits = leaf->values->ids.getLength();
        }

        num_hits += 1;

        for(size_t i=0; i<kv.second.size(); i++) {
            art_doc.offsets[i] = kv.second[i];
        }

        art_insert(t, key, key_len, &art_doc, num_hits);
        delete [] art_doc.offsets;
        art_doc.offsets = nullptr;
    }
}

void Collection::index_string_array_field(const std::vector<std::string> & strings, const uint32_t score, art_tree *t,
                                          uint32_t seq_id, const bool verbatim) const {
    for(const std::string & str: strings) {
        index_string_field(str, score, t, seq_id, verbatim);
    }
}

void Collection::index_int32_array_field(const std::vector<int32_t> & values, const uint32_t score, art_tree *t,
                                         uint32_t seq_id) const {
    for(const int32_t value: values) {
        index_int32_field(value, score, t, seq_id);
    }
}

void Collection::index_int64_array_field(const std::vector<int64_t> & values, const uint32_t score, art_tree *t,
                                         uint32_t seq_id) const {
    for(const int64_t value: values) {
        index_int64_field(value, score, t, seq_id);
    }
}

void Collection::index_float_array_field(const std::vector<float> & values, const uint32_t score, art_tree *t,
                                         uint32_t seq_id) const {
    for(const float value: values) {
        index_float_field(value, score, t, seq_id);
    }
}

void Collection::do_facets(std::vector<facet> & facets, uint32_t* result_ids, size_t results_size) {
    for(auto & a_facet: facets) {
        // assumed that facet fields have already been validated upstream
        const field & facet_field = facet_schema.at(a_facet.field_name);
        const facet_value & fvalue = facet_index.at(facet_field.name);

        for(size_t i = 0; i < results_size; i++) {
            uint32_t doc_seq_id = result_ids[i];
            if(fvalue.doc_values.count(doc_seq_id) != 0) {
                // for every result document, get the values associated and increment the counter
                const std::vector<uint32_t> & value_indices = fvalue.doc_values.at(doc_seq_id);
                for(size_t j = 0; j < value_indices.size(); j++) {
                    const std::string & facet_value = fvalue.index_value.at(value_indices.at(j));
                    a_facet.result_map[facet_value] += 1;
                }
            }
        }
    }
}

void Collection::search_candidates(uint32_t* filter_ids, size_t filter_ids_length, std::vector<facet> & facets,
                                   const std::vector<sort_by> & sort_fields, int & candidate_rank,
                                   std::vector<std::vector<art_leaf*>> & token_to_candidates,
                                   std::vector<std::vector<art_leaf*>> & searched_queries, Topster<100> & topster,
                                   size_t & total_results, uint32_t** all_result_ids, size_t & all_result_ids_len,
                                   const size_t & max_results, const bool prefix) {
    const size_t combination_limit = 10;
    const size_t prefix_candidates_limit = 5;

    auto product = []( long long a, std::vector<art_leaf*>& b ) { return a*b.size(); };
    long long int N = std::accumulate(token_to_candidates.begin(), token_to_candidates.end(), 1LL, product);

    for(long long n=0; n<N && n<combination_limit; ++n) {
        // every element in `query_suggestion` contains a token and its associated hits
        std::vector<art_leaf *> query_suggestion = next_suggestion(token_to_candidates, n);

        /*for(auto i=0; i < query_suggestion.size(); i++) {
            std::cout << "i: " << i << " - " << query_suggestion[i]->key << std::endl;
        }*/

        // initialize results with the starting element (for further intersection)
        size_t result_size = query_suggestion[0]->values->ids.getLength();
        if(result_size == 0) {
            continue;
        }

        uint32_t* result_ids = query_suggestion[0]->values->ids.uncompress();
        candidate_rank += 1;

        // intersect the document ids for each token to find docs that contain all the tokens (stored in `result_ids`)
        for(size_t i=1; i < query_suggestion.size(); i++) {
            uint32_t* out = nullptr;
            uint32_t* ids = query_suggestion[i]->values->ids.uncompress();
            result_size = ArrayUtils::and_scalar(ids, query_suggestion[i]->values->ids.getLength(), result_ids, result_size, &out);
            delete[] ids;
            delete[] result_ids;
            result_ids = out;
        }

        if(filter_ids != nullptr) {
            // intersect once again with filter ids
            uint32_t* filtered_result_ids = nullptr;
            size_t filtered_results_size = ArrayUtils::and_scalar(filter_ids, filter_ids_length, result_ids,
                                                                  result_size, &filtered_result_ids);

            uint32_t* new_all_result_ids;
            all_result_ids_len = ArrayUtils::or_scalar(*all_result_ids, all_result_ids_len, filtered_result_ids,
                                                       filtered_results_size, &new_all_result_ids);
            delete [] *all_result_ids;
            *all_result_ids = new_all_result_ids;

            do_facets(facets, filtered_result_ids, filtered_results_size);

            // go through each matching document id and calculate the match score
            score_results(sort_fields, searched_queries.size(), candidate_rank, topster, query_suggestion,
                          filtered_result_ids, filtered_results_size);

            delete[] filtered_result_ids;
            delete[] result_ids;
        } else {
            do_facets(facets, result_ids, result_size);

            uint32_t* new_all_result_ids;
            all_result_ids_len = ArrayUtils::or_scalar(*all_result_ids, all_result_ids_len, result_ids,
                                                       result_size, &new_all_result_ids);
            delete [] *all_result_ids;
            *all_result_ids = new_all_result_ids;

            score_results(sort_fields, searched_queries.size(), candidate_rank, topster, query_suggestion,
                          result_ids, result_size);
            delete[] result_ids;
        }

        total_results += topster.size;
        searched_queries.push_back(query_suggestion);

        if(!prefix && total_results >= max_results) {
            break;
        }

        if(prefix && candidate_rank >= prefix_candidates_limit) {
            break;
        }
    }
}

size_t Collection::union_of_ids(std::vector<std::pair<uint32_t*, size_t>> & result_array_pairs,
                                uint32_t **results_out) {
    uint32_t *results = nullptr;
    size_t results_length = 0;

    uint32_t *prev_results = nullptr;
    size_t prev_results_length = 0;

    for(const std::pair<uint32_t*, size_t> & result_array_pair: result_array_pairs) {
        results_length = ArrayUtils::or_scalar(prev_results, prev_results_length, result_array_pair.first,
                                               result_array_pair.second, &results);
        delete [] prev_results;
        prev_results = results;
        prev_results_length = results_length;
    }

    *results_out = results;
    return results_length;
}
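
// The filter string parsed below is a conjunction of `field:value` clauses joined
// by `&&`. A value may be a single scalar, a `[a,b,c]` list (an OR of equalities),
// or carry a numerical comparator prefix (<, >, <=, >=). An illustrative query,
// with hypothetical field names:
//
//   "points:>100 && tags:[red,blue]"
//
// matches documents whose `points` exceeds 100 and whose `tags` has red or blue.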

Option<uint32_t> Collection::do_filtering(uint32_t** filter_ids_out, const std::string & simple_filter_str) {
    // parse the filter string
    std::vector<std::string> filter_blocks;
    StringUtils::split(simple_filter_str, filter_blocks, "&&");

    std::vector<filter> filters;

    for(const std::string & filter_block: filter_blocks) {
        // split into [field_name, value]
        std::vector<std::string> expression_parts;
        StringUtils::split(filter_block, expression_parts, ":");
        if(expression_parts.size() != 2) {
            return Option<>(400, "Could not parse the filter query.");
        }

        const std::string & field_name = expression_parts[0];
        if(search_schema.count(field_name) == 0) {
            return Option<>(400, "Could not find a filter field named `" + field_name + "` in the schema.");
        }

        field _field = search_schema.at(field_name);
        std::string & raw_value = expression_parts[1];
        filter f;

        if(_field.is_integer() || _field.is_float()) {
            // could be a single value or a list
            if(raw_value[0] == '[' && raw_value[raw_value.size() - 1] == ']') {
                std::vector<std::string> filter_values;
                StringUtils::split(raw_value.substr(1, raw_value.size() - 2), filter_values, ",");

                for(const std::string & filter_value: filter_values) {
                    if(_field.is_integer() && !StringUtils::is_integer(filter_value)) {
                        return Option<>(400, "Error with field `" + _field.name + "`: Not an integer.");
                    }

                    if(_field.is_float() && !StringUtils::is_float(filter_value)) {
                        return Option<>(400, "Error with field `" + _field.name + "`: Not a float.");
                    }
                }

                f = {field_name, filter_values, EQUALS};
            } else {
                Option<NUM_COMPARATOR> op_comparator = filter::extract_num_comparator(raw_value);
                if(!op_comparator.ok()) {
                    return Option<>(400, "Error with field `" + _field.name + "`: " + op_comparator.error());
                }

                // extract the numerical value
                std::string filter_value;
                if(op_comparator.get() == LESS_THAN || op_comparator.get() == GREATER_THAN) {
                    filter_value = raw_value.substr(1);
                } else if(op_comparator.get() == LESS_THAN_EQUALS || op_comparator.get() == GREATER_THAN_EQUALS) {
                    filter_value = raw_value.substr(2);
                } else {
                    // EQUALS
                    filter_value = raw_value;
                }

                filter_value = StringUtils::trim(filter_value);

                if(_field.is_integer() && !StringUtils::is_integer(filter_value)) {
                    return Option<>(400, "Error with field `" + _field.name + "`: Not an integer.");
                }

                if(_field.is_float() && !StringUtils::is_float(filter_value)) {
                    return Option<>(400, "Error with field `" + _field.name + "`: Not a float.");
                }

                f = {field_name, {filter_value}, op_comparator.get()};
            }
        } else if(_field.is_string()) {
            if(raw_value[0] == '[' && raw_value[raw_value.size() - 1] == ']') {
                std::vector<std::string> filter_values;
                StringUtils::split(raw_value.substr(1, raw_value.size() - 2), filter_values, ",");
                f = {field_name, filter_values, EQUALS};
            } else {
                f = {field_name, {raw_value}, EQUALS};
            }
        } else {
            return Option<>(400, "Error with field `" + _field.name + "`: Unidentified field type.");
        }

        filters.push_back(f);
    }

    uint32_t* filter_ids = nullptr;
    uint32_t filter_ids_length = 0;

    // process the filters first
    for(const filter & a_filter: filters) {
        if(search_index.count(a_filter.field_name) != 0) {
            art_tree* t = search_index.at(a_filter.field_name);
            field f = search_schema.at(a_filter.field_name);
            std::vector<std::pair<uint32_t*, size_t>> filter_result_array_pairs;

            if(f.is_integer()) {
                std::vector<const art_leaf*> leaves;

                for(const std::string & filter_value: a_filter.values) {
                    if(f.type == field_types::INT32 || f.type == field_types::INT32_ARRAY) {
                        int32_t value = (int32_t) std::stoi(filter_value);
                        art_int32_search(t, value, a_filter.compare_operator, leaves);
                    } else {
                        // use stoll so that the full int64 range is accepted
                        int64_t value = (int64_t) std::stoll(filter_value);
                        art_int64_search(t, value, a_filter.compare_operator, leaves);
                    }

                    for(const art_leaf* leaf: leaves) {
                        filter_result_array_pairs.push_back(std::make_pair(leaf->values->ids.uncompress(),
                                                                           leaf->values->ids.getLength()));
                    }
                }
            } else if(f.is_float()) {
                std::vector<const art_leaf*> leaves;

                for(const std::string & filter_value: a_filter.values) {
                    float value = (float) std::atof(filter_value.c_str());
                    art_float_search(t, value, a_filter.compare_operator, leaves);
                    for(const art_leaf* leaf: leaves) {
                        filter_result_array_pairs.push_back(std::make_pair(leaf->values->ids.uncompress(),
                                                                           leaf->values->ids.getLength()));
                    }
                }
            } else if(f.is_string()) {
                for(const std::string & filter_value: a_filter.values) {
                    // we have to tokenize the string, standardize it and then do an exact match
                    std::vector<std::string> str_tokens;
                    StringUtils::split(filter_value, str_tokens, " ");

                    uint32_t* filtered_ids = nullptr;
                    size_t filtered_size = 0;

                    for(size_t i = 0; i < str_tokens.size(); i++) {
                        std::string & str_token = str_tokens[i];
                        StringUtils::normalize(str_token);
                        art_leaf* leaf = (art_leaf *) art_search(t, (const unsigned char*) str_token.c_str(),
                                                                 str_token.length()+1);
                        if(leaf == nullptr) {
                            continue;
                        }

                        if(i == 0) {
                            filtered_ids = leaf->values->ids.uncompress();
                            filtered_size = leaf->values->ids.getLength();
                        } else {
                            // do AND for an exact match
                            uint32_t* out = nullptr;
                            uint32_t* leaf_ids = leaf->values->ids.uncompress();
                            filtered_size = ArrayUtils::and_scalar(filtered_ids, filtered_size, leaf_ids,
                                                                   leaf->values->ids.getLength(), &out);
                            delete[] leaf_ids;
                            delete[] filtered_ids;
                            filtered_ids = out;
                        }
                    }

                    filter_result_array_pairs.push_back(std::make_pair(filtered_ids, filtered_size));
                }
            }

            uint32_t* result_ids = nullptr;
            size_t result_ids_length = union_of_ids(filter_result_array_pairs, &result_ids);

            if(filter_ids == nullptr) {
                filter_ids = result_ids;
                filter_ids_length = result_ids_length;
            } else {
                uint32_t* filtered_results = nullptr;
                filter_ids_length = ArrayUtils::and_scalar(filter_ids, filter_ids_length, result_ids,
                                                           result_ids_length, &filtered_results);
                delete [] result_ids;
                delete [] filter_ids;
                filter_ids = filtered_results;
            }

            for(std::pair<uint32_t*, size_t> & filter_result_array_pair: filter_result_array_pairs) {
                delete[] filter_result_array_pair.first;
            }
        }
    }

    *filter_ids_out = filter_ids;
    return Option<>(filter_ids_length);
}

Option<nlohmann::json> Collection::search(std::string query, const std::vector<std::string> search_fields,
                                          const std::string & simple_filter_query, const std::vector<std::string> & facet_fields,
                                          const std::vector<sort_by> & sort_fields, const int num_typos,
                                          const size_t per_page, const size_t page,
                                          const token_ordering token_order, const bool prefix) {
    nlohmann::json result = nlohmann::json::object();
    std::vector<facet> facets;

    // validate search fields
    for(const std::string & field_name: search_fields) {
        if(search_schema.count(field_name) == 0) {
            std::string error = "Could not find a field named `" + field_name + "` in the schema.";
            return Option<nlohmann::json>(400, error);
        }

        field search_field = search_schema.at(field_name);
        if(search_field.type != field_types::STRING && search_field.type != field_types::STRING_ARRAY) {
            std::string error = "Field `" + field_name + "` should be a string or a string array.";
            return Option<nlohmann::json>(400, error);
        }

        if(search_field.facet) {
            std::string error = "Field `" + field_name + "` is a faceted field - it cannot be used as a query field.";
            return Option<nlohmann::json>(400, error);
        }
    }

    // validate facet fields
    for(const std::string & field_name: facet_fields) {
        if(facet_schema.count(field_name) == 0) {
            std::string error = "Could not find a facet field named `" + field_name + "` in the schema.";
            return Option<nlohmann::json>(400, error);
        }
        facets.push_back(facet(field_name));
    }

    // validate sort fields and standardize
    std::vector<sort_by> sort_fields_std;

    for(const sort_by & _sort_field: sort_fields) {
        if(sort_index.count(_sort_field.name) == 0) {
            std::string error = "Could not find a field named `" + _sort_field.name + "` in the schema for sorting.";
            return Option<nlohmann::json>(400, error);
        }

        std::string sort_order = _sort_field.order;
        StringUtils::toupper(sort_order);

        if(sort_order != sort_field_const::asc && sort_order != sort_field_const::desc) {
            std::string error = "Order for field `" + _sort_field.name + "` should be either ASC or DESC.";
            return Option<nlohmann::json>(400, error);
        }

        sort_fields_std.push_back({_sort_field.name, sort_order});
    }

    // process the filters
    uint32_t* filter_ids = nullptr;
    Option<uint32_t> op_filter_ids_length = do_filtering(&filter_ids, simple_filter_query);
    if(!op_filter_ids_length.ok()) {
        return Option<nlohmann::json>(op_filter_ids_length.code(), op_filter_ids_length.error());
    }

    const uint32_t filter_ids_length = op_filter_ids_length.get();

    // check for valid pagination
    if(page < 1) {
        std::string message = "Page must be an integer of value greater than 0.";
        return Option<nlohmann::json>(422, message);
    }

    if((page * per_page) > MAX_RESULTS) {
        std::string message = "Only the first " + std::to_string(MAX_RESULTS) + " results are available.";
        return Option<nlohmann::json>(422, message);
    }

    const size_t num_results = (page * per_page);

    // Order of `fields` is used to sort results
    auto begin = std::chrono::high_resolution_clock::now();
    std::vector<std::pair<int, Topster<100>::KV>> field_order_kvs;
    uint32_t* all_result_ids = nullptr;
    size_t all_result_ids_len = 0;

    // all search queries that were used for generating the results
    std::vector<std::vector<art_leaf*>> searched_queries;
    int searched_queries_index = 0;

    for(size_t i = 0; i < search_fields.size(); i++) {
        Topster<100> topster;
        const std::string & field = search_fields[i];
        // proceed to query search only when no filters are provided or when filtering produces results
        if(simple_filter_query.size() == 0 || filter_ids_length > 0) {
            search_field(query, field, filter_ids, filter_ids_length, facets, sort_fields_std, num_typos, num_results,
                         searched_queries, searched_queries_index, topster, &all_result_ids, all_result_ids_len, token_order, prefix);
            topster.sort();
        }

        // the order of the fields specified matters: matching docs from earlier fields are more important
        for(size_t t = 0; t < topster.size && t < num_results; t++) {
            field_order_kvs.push_back(std::make_pair(search_fields.size() - i, topster.getKV(t)));
        }
    }

    delete [] filter_ids;
    delete [] all_result_ids;

    // All fields are sorted descending
    std::sort(field_order_kvs.begin(), field_order_kvs.end(),
              [](const std::pair<int, Topster<100>::KV> & a, const std::pair<int, Topster<100>::KV> & b) {
                  return std::tie(a.second.match_score, a.second.primary_attr, a.second.secondary_attr, a.first, a.second.key) >
                         std::tie(b.second.match_score, b.second.primary_attr, b.second.secondary_attr, b.first, b.second.key);
              });
result["hits"] = nlohmann::json::array();
|
|
result["found"] = all_result_ids_len;
|
|
|
|
const int start_result_index = (page - 1) * per_page;
|
|
const int kvsize = field_order_kvs.size();
|
|
|
|
if(start_result_index > (kvsize - 1)) {
|
|
return Option<nlohmann::json>(result);
|
|
}
|
|
|
|
const int end_result_index = std::min(int(page * per_page), kvsize) - 1;
|
|
|
|
for(size_t field_order_kv_index = start_result_index; field_order_kv_index <= end_result_index; field_order_kv_index++) {
|
|
const auto & field_order_kv = field_order_kvs[field_order_kv_index];
|
|
const std::string& seq_id_key = get_seq_id_key((uint32_t) field_order_kv.second.key);
|
|
|
|
std::string value;
|
|
store->get(seq_id_key, value);
|
|
|
|
nlohmann::json document;
|
|
try {
|
|
document = nlohmann::json::parse(value);
|
|
} catch(...) {
|
|
return Option<nlohmann::json>(500, "Error while parsing stored document.");
|
|
}
|
|
|
|
// highlight query words in the result
|
|
const std::string & field_name = search_fields[search_fields.size() - field_order_kv.first];
|
|
field search_field = search_schema.at(field_name);
|
|
|
|
// only string fields are supported for now
|
|
if(search_field.type == field_types::STRING) {
|
|
std::vector<std::string> tokens;
|
|
StringUtils::split(document[field_name], tokens, " ");
|
|
|
|
// positions in the document of each token in the query
|
|
std::vector<std::vector<uint16_t>> token_positions;
|
|
|
|
for (const art_leaf *token_leaf : searched_queries[field_order_kv.second.query_index]) {
|
|
std::vector<uint16_t> positions;
|
|
int doc_index = token_leaf->values->ids.indexOf(field_order_kv.second.key);
|
|
if(doc_index == token_leaf->values->ids.getLength()) {
|
|
continue;
|
|
}
|
|
|
|
uint32_t start_offset = token_leaf->values->offset_index.at(doc_index);
|
|
uint32_t end_offset = (doc_index == token_leaf->values->ids.getLength() - 1) ?
|
|
token_leaf->values->offsets.getLength() :
|
|
token_leaf->values->offset_index.at(doc_index+1);
|
|
|
|
while(start_offset < end_offset) {
|
|
positions.push_back((uint16_t) token_leaf->values->offsets.at(start_offset));
|
|
start_offset++;
|
|
}
|
|
|
|
token_positions.push_back(positions);
|
|
}
|
|
|
|
MatchScore mscore = MatchScore::match_score(field_order_kv.second.key, token_positions);
|
|
|
|
// unpack `mscore.offset_diffs` into `token_indices`
|
|
std::vector<size_t> token_indices;
|
|
char num_tokens_found = mscore.offset_diffs[0];
|
|
for(size_t i = 1; i <= num_tokens_found; i++) {
|
|
if(mscore.offset_diffs[i] != std::numeric_limits<int8_t>::max()) {
|
|
size_t token_index = (size_t)(mscore.start_offset + mscore.offset_diffs[i]);
|
|
token_indices.push_back(token_index);
|
|
}
|
|
}
|
|
|
|
auto minmax = std::minmax_element(token_indices.begin(), token_indices.end());
|
|
|
|
// For longer strings, pick surrounding tokens within N tokens of min_index and max_index for the snippet
|
|
const size_t start_index = (tokens.size() <= SNIPPET_STR_ABOVE_LEN) ? 0 :
|
|
std::max(0, (int)(*(minmax.first)-5));
|
|
|
|
const size_t end_index = (tokens.size() <= SNIPPET_STR_ABOVE_LEN) ? tokens.size() :
|
|
std::min((int)tokens.size(), (int)(*(minmax.second)+5));
|
|
|
|
for(const size_t token_index: token_indices) {
|
|
tokens[token_index] = "<mark>" + tokens[token_index] + "</mark>";
|
|
}
|
|
|
|
std::stringstream snippet_stream;
|
|
for(size_t snippet_index = start_index; snippet_index < end_index; snippet_index++) {
|
|
if(snippet_index != start_index) {
|
|
snippet_stream << " ";
|
|
}
|
|
|
|
snippet_stream << tokens[snippet_index];
|
|
}
|
|
|
|
document["_highlight"] = nlohmann::json::object();
|
|
document["_highlight"][field_name] = snippet_stream.str();
|
|
}
|
|
|
|
result["hits"].push_back(document);
|
|
}
|
|
|
|
result["facet_counts"] = nlohmann::json::array();
|
|
|
|
// populate facets
|
|
for(const facet & a_facet: facets) {
|
|
nlohmann::json facet_result = nlohmann::json::object();
|
|
facet_result["field_name"] = a_facet.field_name;
|
|
facet_result["counts"] = nlohmann::json::array();
|
|
|
|
// keep only top 10 facets
|
|
std::vector<std::pair<std::string, size_t>> value_to_count;
|
|
for (auto itr = a_facet.result_map.begin(); itr != a_facet.result_map.end(); ++itr) {
|
|
value_to_count.push_back(*itr);
|
|
}
|
|
|
|
std::sort(value_to_count.begin(), value_to_count.end(),
|
|
[=](std::pair<std::string, size_t>& a, std::pair<std::string, size_t>& b) {
|
|
return a.second > b.second;
|
|
});
|
|
|
|
for(auto i = 0; i < std::min((size_t)10, value_to_count.size()); i++) {
|
|
auto & kv = value_to_count[i];
|
|
nlohmann::json facet_value_count = nlohmann::json::object();
|
|
facet_value_count["value"] = kv.first;
|
|
facet_value_count["count"] = kv.second;
|
|
facet_result["counts"].push_back(facet_value_count);
|
|
}
|
|
|
|
result["facet_counts"].push_back(facet_result);
|
|
}
|
|
|
|
long long int timeMillis = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::high_resolution_clock::now() - begin).count();
|
|
//!std::cout << "Time taken for result calc: " << timeMillis << "us" << std::endl;
|
|
//!store->print_memory_usage();
|
|
return result;
|
|
}
|
|
|
|

/*
   1. Split the query into tokens
   2. Outer loop will generate a bounded cartesian product with costs for each token
   3. Inner loop will iterate on each token with its associated cost
   4. Cartesian product of the results of the token searches will be used to form search phrases
      (cartesian product adapted from: http://stackoverflow.com/a/31169617/131050)
   5. Intersect the lists to find docs that match each phrase
   6. Sort the docs based on some ranking criteria
*/
void Collection::search_field(std::string & query, const std::string & field, uint32_t *filter_ids, size_t filter_ids_length,
                              std::vector<facet> & facets, const std::vector<sort_by> & sort_fields, const int num_typos,
                              const size_t num_results, std::vector<std::vector<art_leaf*>> & searched_queries,
                              int & searched_queries_index, Topster<100> &topster, uint32_t** all_result_ids, size_t & all_result_ids_len,
                              const token_ordering token_order, const bool prefix) {
    std::vector<std::string> tokens;
    StringUtils::split(query, tokens, " ");

    const int max_cost = (num_typos < 0 || num_typos > 2) ? 2 : num_typos;
    const size_t max_results = std::min(num_results, (size_t) Collection::MAX_RESULTS);

    size_t total_results = topster.size;

    // To prevent us from doing ART search repeatedly as we iterate through possible corrections
    spp::sparse_hash_map<std::string, std::vector<art_leaf*>> token_cost_cache;

    // Used to drop the least occurring token(s) for partial searches
    spp::sparse_hash_map<std::string, uint32_t> token_to_count;

    std::vector<std::vector<int>> token_to_costs;

    for(size_t token_index = 0; token_index < tokens.size(); token_index++) {
        std::vector<int> all_costs;
        const size_t token_len = tokens[token_index].length();

        // This ensures that we don't end up doing a cost of 1 for a single char etc.
        int bounded_cost = max_cost;
        if(token_len > 0 && max_cost >= token_len && (token_len == 1 || token_len == 2)) {
            bounded_cost = token_len - 1;
        }

        for(int cost = 0; cost <= bounded_cost; cost++) {
            all_costs.push_back(cost);
        }

        token_to_costs.push_back(all_costs);
        StringUtils::normalize(tokens[token_index]);
    }

    // stores candidates for each token, i.e. i-th index would have all possible tokens with a cost of "c"
    std::vector<std::vector<art_leaf*>> token_to_candidates;

    const size_t combination_limit = 10;
    auto product = []( long long a, std::vector<int>& b ) { return a*b.size(); };
    int candidate_rank = 0;
    long long n = 0;
    long long int N = std::accumulate(token_to_costs.begin(), token_to_costs.end(), 1LL, product);

    while(n < N && n < combination_limit) {
        // The outer loop generates combinations of [cost to max_cost] for each token
        // For e.g. for a 3-token query: [0, 0, 0], [0, 0, 1], [0, 1, 1] etc.
        std::vector<uint32_t> costs(token_to_costs.size());
        ldiv_t q { n, 0 };
        for(long long i = (token_to_costs.size() - 1); 0 <= i ; --i ) {
            q = ldiv(q.quot, token_to_costs[i].size());
            costs[i] = token_to_costs[i][q.rem];
        }

        token_to_candidates.clear();
        int token_index = 0;

        while(token_index < tokens.size()) {
            // For each token, look up the generated cost for this iteration and search using that cost
            std::string token = tokens[token_index];
            const std::string token_cost_hash = token + std::to_string(costs[token_index]);

            std::vector<art_leaf*> leaves;
            /*std::cout << "\nSearching for: " << token << " - cost: " << costs[token_index] << ", candidate_rank: "
                      << candidate_rank << std::endl;*/

            if(token_cost_cache.count(token_cost_hash) != 0) {
                leaves = token_cost_cache[token_cost_hash];
            } else {
                // prefix matching should apply only to the last token
                const bool prefix_search = prefix && (token_index == tokens.size()-1);
                const int token_len = prefix_search ? (int) token.length() : (int) token.length() + 1;
                const int max_candidates = prefix_search ? 5 : 3;

                art_fuzzy_search(search_index.at(field), (const unsigned char *) token.c_str(), token_len,
                                 costs[token_index], costs[token_index], max_candidates, token_order, prefix_search, leaves);

                if(!leaves.empty()) {
                    token_cost_cache.emplace(token_cost_hash, leaves);
                }
            }

            if(!leaves.empty()) {
                //log_leaves(costs[token_index], token, leaves);
                token_to_candidates.push_back(leaves);
                token_to_count[token] = std::max(token_to_count[token], leaves.at(0)->values->ids.getLength());
            } else {
                // No result at `cost = costs[token_index]`. Remove costs until `cost` for the token and re-do combinations
                auto it = std::find(token_to_costs[token_index].begin(), token_to_costs[token_index].end(), costs[token_index]);
                if(it != token_to_costs[token_index].end()) {
                    token_to_costs[token_index].erase(it);

                    // no more costs left for this token, clean up
                    if(token_to_costs[token_index].empty()) {
                        token_to_costs.erase(token_to_costs.begin()+token_index);
                        tokens.erase(tokens.begin()+token_index);
                        token_index--;
                    }
                }

                // To continue the outer loop on a new cost combination
                n = -1;
                N = std::accumulate(token_to_costs.begin(), token_to_costs.end(), 1LL, product);
                break;
            }

            token_index++;
        }

        if(token_to_candidates.size() != 0 && token_to_candidates.size() == tokens.size()) {
            // If all tokens were found, go ahead and search for candidates with what we have so far
            search_candidates(filter_ids, filter_ids_length, facets, sort_fields, candidate_rank, token_to_candidates,
                              searched_queries, topster, total_results, all_result_ids, all_result_ids_len,
                              max_results, prefix);

            if (!prefix && total_results >= max_results) {
                // Stop when we have enough results; otherwise the outer loop continues
                // (looking at tokens with a greater cost)
                break;
            }

            // only allow up to 10 prefix candidate tokens
            if(prefix && candidate_rank > 10) {
                break;
            }
        }

        n++;
    }

    // When there are not enough overall results and at least one token has results
    if(topster.size < max_results && token_to_count.size() > 1) {
        // Drop the token with the fewest hits and try searching again
        std::string truncated_query;

        std::vector<std::pair<std::string, uint32_t>> token_count_pairs;
        for (auto itr = token_to_count.begin(); itr != token_to_count.end(); ++itr) {
            token_count_pairs.push_back(*itr);
        }

        std::sort(token_count_pairs.begin(), token_count_pairs.end(), [=]
                  (const std::pair<std::string, uint32_t>& a, const std::pair<std::string, uint32_t>& b) {
                      return a.second > b.second;
                  }
        );

        for(uint32_t i = 0; i < token_count_pairs.size()-1; i++) {
            // iterate till the last but one token
            truncated_query += " " + token_count_pairs.at(i).first;
        }

        return search_field(truncated_query, field, filter_ids, filter_ids_length, facets, sort_fields, num_typos,
                            num_results, searched_queries, candidate_rank, topster, all_result_ids, all_result_ids_len,
                            token_order, prefix);
    }
}

void Collection::log_leaves(const int cost, const std::string &token, const std::vector<art_leaf *> &leaves) const {
    printf("Token: %s, cost: %d, candidates: \n", token.c_str(), cost);
    for(size_t i=0; i < leaves.size(); i++) {
        printf("%.*s, ", leaves[i]->key_len, leaves[i]->key);
        printf("frequency: %d, max_score: %d\n", leaves[i]->values->ids.getLength(), leaves[i]->max_score);
        /*for(auto j=0; j<leaves[i]->values->ids.getLength(); j++) {
            printf("id: %d\n", leaves[i]->values->ids.at(j));
        }*/
    }
}

void Collection::score_results(const std::vector<sort_by> & sort_fields, const int & query_index, const int & candidate_rank,
                               Topster<100> & topster, const std::vector<art_leaf *> &query_suggestion,
                               const uint32_t *result_ids, const size_t result_size) const {

    const int max_candidate_rank = 250;
    spp::sparse_hash_map<const art_leaf*, uint32_t*> leaf_to_indices;

    for (art_leaf *token_leaf : query_suggestion) {
        uint32_t *indices = new uint32_t[result_size];
        token_leaf->values->ids.indexOf(result_ids, result_size, indices);
        leaf_to_indices.emplace(token_leaf, indices);
    }

    spp::sparse_hash_map<uint32_t, number_t> * primary_rank_scores = nullptr;
    spp::sparse_hash_map<uint32_t, number_t> * secondary_rank_scores = nullptr;

    // Used for asc/desc ordering. NOTE: Topster keeps the biggest keys (i.e. it's desc in nature)
    number_t primary_rank_factor;
    number_t secondary_rank_factor;

    if(sort_fields.size() > 0) {
        // assumed that the rank field exists in the index - checked earlier in the chain
        primary_rank_scores = sort_index.at(sort_fields[0].name);

        // initialize primary_rank_factor
        field sort_field = sort_schema.at(sort_fields[0].name);
        if(sort_field.is_single_integer()) {
            primary_rank_factor = ((int64_t) 1);
        } else {
            primary_rank_factor = ((float) 1);
        }

        if(sort_fields[0].order == sort_field_const::asc) {
            primary_rank_factor = -primary_rank_factor;
        }
    }

    if(sort_fields.size() > 1) {
        secondary_rank_scores = sort_index.at(sort_fields[1].name);

        // initialize secondary_rank_factor
        field sort_field = sort_schema.at(sort_fields[1].name);
        if(sort_field.is_single_integer()) {
            secondary_rank_factor = ((int64_t) 1);
        } else {
            secondary_rank_factor = ((float) 1);
        }

        if(sort_fields[1].order == sort_field_const::asc) {
            secondary_rank_factor = -secondary_rank_factor;
        }
    }

    for(size_t i=0; i<result_size; i++) {
        uint32_t seq_id = result_ids[i];
        MatchScore mscore;

        if(query_suggestion.size() == 1) {
            // short circuit to speed up single token searches (use dummy offsets for now)
            char offset_diffs[16];
            std::fill_n(offset_diffs, 16, 0);
            mscore = MatchScore(1, 0, 0, offset_diffs);
        } else {
            std::vector<std::vector<uint16_t>> token_positions;
            populate_token_positions(query_suggestion, leaf_to_indices, i, token_positions);
            mscore = MatchScore::match_score(seq_id, token_positions);
        }

        int candidate_rank_score = max_candidate_rank - candidate_rank;
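
        // Sketch of the packing that follows: words_present occupies the highest
        // bits, then the candidate rank, then token proximity. E.g. with
        // words_present = 2, candidate_rank_score = 249 and distance = 1:
        //   (2 << 16) + (249 << 8) + (MAX_SEARCH_TOKENS - 1)
        // so a result matching more query words always outranks one that merely
        // matches fewer words more closely.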

        // Construct a single match_score from individual components (for multi-field sort)
        const uint64_t match_score = ((uint64_t)(mscore.words_present) << 16) +
                                     (candidate_rank_score << 8) +
                                     (MAX_SEARCH_TOKENS - mscore.distance);

        const int64_t default_score = 0;
        const number_t & primary_rank_score = (primary_rank_scores && primary_rank_scores->count(seq_id) > 0) ?
                                              primary_rank_scores->at(seq_id) : default_score;
        const number_t & secondary_rank_score = (secondary_rank_scores && secondary_rank_scores->count(seq_id) > 0) ?
                                                secondary_rank_scores->at(seq_id) : default_score;

        const number_t & primary_rank_value = primary_rank_score * primary_rank_factor;
        const number_t & secondary_rank_value = secondary_rank_score * secondary_rank_factor;
        topster.add(seq_id, query_index, match_score, primary_rank_value, secondary_rank_value);

        /*std::cout << "candidate_rank: " << candidate_rank << ", candidate_rank_score: " << candidate_rank_score
                  << ", words_present: " << mscore.words_present << ", match_score: " << match_score
                  << ", primary_rank_score: " << primary_rank_score.intval << ", seq_id: " << seq_id << std::endl;*/
    }

    for (auto it = leaf_to_indices.begin(); it != leaf_to_indices.end(); it++) {
        delete [] it->second;
        it->second = nullptr;
    }
}

void Collection::populate_token_positions(const std::vector<art_leaf *> &query_suggestion,
                                          spp::sparse_hash_map<const art_leaf *, uint32_t *> &leaf_to_indices,
                                          size_t result_index, std::vector<std::vector<uint16_t>> &token_positions) const {
    // for each token in the query, find the positions at which it appears in this document
    for (const art_leaf *token_leaf : query_suggestion) {
        std::vector<uint16_t> positions;
        int doc_index = leaf_to_indices.at(token_leaf)[result_index];
        if(doc_index == token_leaf->values->ids.getLength()) {
            continue;
        }

        uint32_t start_offset = token_leaf->values->offset_index.at(doc_index);
        uint32_t end_offset = (doc_index == token_leaf->values->ids.getLength() - 1) ?
                              token_leaf->values->offsets.getLength() :
                              token_leaf->values->offset_index.at(doc_index+1);

        while(start_offset < end_offset) {
            positions.push_back((uint16_t) token_leaf->values->offsets.at(start_offset));
            start_offset++;
        }

        token_positions.push_back(positions);
    }
}
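
// The decode below treats `n` as a mixed-radix number whose i-th digit picks a
// candidate leaf for the i-th token. For example, with candidate list sizes {2, 3},
// n = 4 decodes as rem(4, 2) = 0 for the first token and rem(4 / 2, 3) = 2 for the
// second, i.e. the first candidate of token 0 and the third candidate of token 1.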

inline std::vector<art_leaf *> Collection::next_suggestion(const std::vector<std::vector<art_leaf *>> &token_leaves,
                                                           long long int n) {
    std::vector<art_leaf*> query_suggestion(token_leaves.size());

    // generate the next combination from `token_leaves` and store it in `query_suggestion`
    ldiv_t q { n, 0 };
    for(long long i = 0 ; i < token_leaves.size(); i++) {
        q = ldiv(q.quot, token_leaves[i].size());
        query_suggestion[i] = token_leaves[i][q.rem];
    }

    // sort ascending based on matched documents for each token for faster intersection
    std::sort(query_suggestion.begin(), query_suggestion.end(), [](const art_leaf* left, const art_leaf* right) {
        return left->values->ids.getLength() < right->values->ids.getLength();
    });

    return query_suggestion;
}
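
// Worked example of the compaction below, with hypothetical contents: given
// offset_index = [0, 3, 5, 9] (one entry per document) and indices_sorted = {1}
// (drop the second document, whose offsets span [3, 5)), the rebuilt index is
// [0, 3, 7]: entries after the removed one shift left and are reduced by the
// removed document's offset count (2).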

void Collection::remove_and_shift_offset_index(sorted_array &offset_index, const uint32_t *indices_sorted,
                                               const uint32_t indices_length) {
    uint32_t *curr_array = offset_index.uncompress();
    uint32_t *new_array = new uint32_t[offset_index.getLength()];

    new_array[0] = 0;
    uint32_t new_index = 0;
    uint32_t curr_index = 0;
    uint32_t indices_counter = 0;
    uint32_t shift_value = 0;

    while(curr_index < offset_index.getLength()) {
        if(indices_counter < indices_length && curr_index >= indices_sorted[indices_counter]) {
            // skip copying
            if(curr_index == indices_sorted[indices_counter]) {
                curr_index++;
                const uint32_t diff = curr_index == offset_index.getLength() ?
                                      0 : (offset_index.at(curr_index) - offset_index.at(curr_index-1));

                shift_value += diff;
            }
            indices_counter++;
        } else {
            new_array[new_index++] = curr_array[curr_index++] - shift_value;
        }
    }

    offset_index.load(new_array, new_index);

    delete[] curr_array;
    delete[] new_array;
}

Option<nlohmann::json> Collection::get(const std::string & id) {
    std::string seq_id_str;
    StoreStatus status = store->get(get_doc_id_key(id), seq_id_str);

    if(status == StoreStatus::NOT_FOUND) {
        return Option<nlohmann::json>(404, "Could not find a document with id: " + id);
    }

    uint32_t seq_id = (uint32_t) std::stol(seq_id_str);

    std::string parsed_document;
    store->get(get_seq_id_key(seq_id), parsed_document);

    nlohmann::json document;
    try {
        document = nlohmann::json::parse(parsed_document);
    } catch(...) {
        return Option<nlohmann::json>(500, "Error while parsing stored document.");
    }

    return Option<nlohmann::json>(document);
}

Option<std::string> Collection::remove(const std::string & id, const bool remove_from_store) {
    std::string seq_id_str;
    StoreStatus status = store->get(get_doc_id_key(id), seq_id_str);

    if(status == StoreStatus::NOT_FOUND) {
        return Option<std::string>(404, "Could not find a document with id: " + id);
    }

    uint32_t seq_id = (uint32_t) std::stol(seq_id_str);

    std::string parsed_document;
    store->get(get_seq_id_key(seq_id), parsed_document);

    nlohmann::json document;
    try {
        document = nlohmann::json::parse(parsed_document);
    } catch(...) {
        return Option<std::string>(500, "Error while parsing stored document.");
    }

    for(auto & name_field: search_schema) {
        // Go through all the field names and find the keys+values so that they can be removed from the in-memory index
        std::vector<std::string> tokens;
        if(name_field.second.type == field_types::STRING) {
            StringUtils::split(document[name_field.first], tokens, " ");
        } else if(name_field.second.type == field_types::STRING_ARRAY) {
            tokens = document[name_field.first].get<std::vector<std::string>>();
        } else if(name_field.second.type == field_types::INT32) {
            const int KEY_LEN = 8;
            unsigned char key[KEY_LEN];
            int32_t value = document[name_field.first].get<int32_t>();
            encode_int32(value, key);
            tokens.push_back(std::string((char*)key, KEY_LEN));
        } else if(name_field.second.type == field_types::INT32_ARRAY) {
            std::vector<int32_t> values = document[name_field.first].get<std::vector<int32_t>>();
            for(const int32_t value: values) {
                const int KEY_LEN = 8;
                unsigned char key[KEY_LEN];
                encode_int32(value, key);
                tokens.push_back(std::string((char*)key, KEY_LEN));
            }
        } else if(name_field.second.type == field_types::INT64) {
            const int KEY_LEN = 8;
            unsigned char key[KEY_LEN];
            int64_t value = document[name_field.first].get<int64_t>();
            encode_int64(value, key);
            tokens.push_back(std::string((char*)key, KEY_LEN));
        } else if(name_field.second.type == field_types::INT64_ARRAY) {
            std::vector<int64_t> values = document[name_field.first].get<std::vector<int64_t>>();
            for(const int64_t value: values) {
                const int KEY_LEN = 8;
                unsigned char key[KEY_LEN];
                encode_int64(value, key);
                tokens.push_back(std::string((char*)key, KEY_LEN));
            }
        } else if(name_field.second.type == field_types::FLOAT) {
            const int KEY_LEN = 8;
            unsigned char key[KEY_LEN];
            // read the value as a float (not int64) so that the key matches the one used while indexing
            float value = document[name_field.first].get<float>();
            encode_float(value, key);
            tokens.push_back(std::string((char*)key, KEY_LEN));
        } else if(name_field.second.type == field_types::FLOAT_ARRAY) {
            std::vector<float> values = document[name_field.first].get<std::vector<float>>();
            for(const float value: values) {
                const int KEY_LEN = 8;
                unsigned char key[KEY_LEN];
                encode_float(value, key);
                tokens.push_back(std::string((char*)key, KEY_LEN));
            }
        }

        for(auto & token: tokens) {
            const unsigned char *key;
            int key_len;

            if(name_field.second.type == field_types::STRING_ARRAY || name_field.second.type == field_types::STRING) {
                StringUtils::normalize(token);
                key = (const unsigned char *) token.c_str();
                key_len = (int) (token.length() + 1);
            } else {
                key = (const unsigned char *) token.c_str();
                key_len = (int) (token.length());
            }

            art_leaf* leaf = (art_leaf *) art_search(search_index.at(name_field.first), key, key_len);
            if(leaf != NULL) {
                uint32_t seq_id_values[1] = {seq_id};
                uint32_t doc_index = leaf->values->ids.indexOf(seq_id);

                if(doc_index == leaf->values->ids.getLength()) {
                    // not found - happens when 2 tokens repeat in a field, e.g. "is it or is is not?"
                    continue;
                }

                uint32_t start_offset = leaf->values->offset_index.at(doc_index);
                uint32_t end_offset = (doc_index == leaf->values->ids.getLength() - 1) ?
                                      leaf->values->offsets.getLength() :
                                      leaf->values->offset_index.at(doc_index+1);

                uint32_t doc_indices[1] = {doc_index};
                remove_and_shift_offset_index(leaf->values->offset_index, doc_indices, 1);

                leaf->values->offsets.remove_index(start_offset, end_offset);
                leaf->values->ids.remove_values(seq_id_values, 1);

                /*len = leaf->values->offset_index.getLength();
                for(auto i=0; i<len; i++) {
                    std::cout << "i: " << i << ", val: " << leaf->values->offset_index.at(i) << std::endl;
                }
                std::cout << "----" << std::endl;*/

                if(leaf->values->ids.getLength() == 0) {
                    art_values* values = (art_values*) art_delete(search_index.at(name_field.first), key, key_len);
                    delete values;
                    values = nullptr;
                }
            }
        }
    }

    // remove facets if any
    for(auto & field_facet_value: facet_index) {
        field_facet_value.second.doc_values.erase(seq_id);
    }

    // remove sort index entries if any
    for(auto & field_doc_value_map: sort_index) {
        field_doc_value_map.second->erase(seq_id);
    }

    if(remove_from_store) {
        store->remove(get_doc_id_key(id));
        store->remove(get_seq_id_key(seq_id));
    }

    num_documents -= 1;

    return Option<std::string>(id);
}

std::string Collection::get_next_seq_id_key(const std::string & collection_name) {
    return std::string(COLLECTION_NEXT_SEQ_PREFIX) + "_" + collection_name;
}
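
// For example, seq_id 0x01020304 is serialized below as the bytes {0x01, 0x02, 0x03, 0x04}:
// writing the most significant byte first keeps the lexicographic order of the keys
// identical to the numeric order of the sequence ids.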

std::string Collection::get_seq_id_key(uint32_t seq_id) {
    // We can't simply do std::to_string() because we want to preserve the byte order.
    // & 0xFF masks all but the lowest eight bits.
    unsigned char bytes[4];
    bytes[0] = (unsigned char) ((seq_id >> 24) & 0xFF);
    bytes[1] = (unsigned char) ((seq_id >> 16) & 0xFF);
    bytes[2] = (unsigned char) ((seq_id >> 8) & 0xFF);
    bytes[3] = (unsigned char) ((seq_id & 0xFF));

    return get_seq_id_collection_prefix() + "_" + std::string(bytes, bytes+4);
}

uint32_t Collection::deserialize_seq_id_key(std::string serialized_seq_id) {
    uint32_t seq_id = ((serialized_seq_id[0] & 0xFF) << 24) | ((serialized_seq_id[1] & 0xFF) << 16) |
                      ((serialized_seq_id[2] & 0xFF) << 8) | (serialized_seq_id[3] & 0xFF);
    return seq_id;
}

std::string Collection::get_doc_id_key(const std::string & doc_id) {
    return std::to_string(collection_id) + "_" + DOC_ID_PREFIX + "_" + doc_id;
}

std::string Collection::get_name() {
    return name;
}

size_t Collection::get_num_documents() {
    return num_documents;
}

uint32_t Collection::get_collection_id() {
    return collection_id;
}

uint32_t Collection::doc_id_to_seq_id(std::string doc_id) {
    std::string seq_id_str;
    store->get(get_doc_id_key(doc_id), seq_id_str);
    uint32_t seq_id = (uint32_t) std::stol(seq_id_str);
    return seq_id;
}

std::vector<std::string> Collection::get_facet_fields() {
    std::vector<std::string> facet_fields_copy;
    for(auto it = facet_schema.begin(); it != facet_schema.end(); ++it) {
        facet_fields_copy.push_back(it->first);
    }

    return facet_fields_copy;
}

std::vector<field> Collection::get_sort_fields() {
    std::vector<field> sort_fields_copy;
    for(auto it = sort_schema.begin(); it != sort_schema.end(); ++it) {
        sort_fields_copy.push_back(it->second);
    }

    return sort_fields_copy;
}

spp::sparse_hash_map<std::string, field> Collection::get_schema() {
    return search_schema;
}

std::string Collection::get_meta_key(const std::string & collection_name) {
    return std::string(COLLECTION_META_PREFIX) + "_" + collection_name;
}

std::string Collection::get_seq_id_collection_prefix() {
    return std::to_string(collection_id) + "_" + std::string(SEQ_ID_PREFIX);
}

std::string Collection::get_token_ranking_field() {
    return token_ranking_field;
}