// typesense/src/index.cpp
#include "index.h"
#include <numeric>
#include <chrono>
#include <set>
#include <unordered_map>
#include <array_utils.h>
#include <match_score.h>
#include <string_utils.h>
#include <art.h>
#include <tokenizer.h>
#include <h3api.h>
#include "logger.h"
Index::Index(const std::string name, const std::unordered_map<std::string, field> & search_schema,
std::map<std::string, field> facet_schema, std::unordered_map<std::string, field> sort_schema):
name(name), search_schema(search_schema), facet_schema(facet_schema), sort_schema(sort_schema) {
for(const auto & fname_field: search_schema) {
if(fname_field.second.is_string()) {
art_tree *t = new art_tree;
art_tree_init(t);
search_index.emplace(fname_field.first, t);
} else {
num_tree_t* num_tree = new num_tree_t;
numerical_index.emplace(fname_field.first, num_tree);
}
// initialize for non-string facet fields
if(fname_field.second.facet && !fname_field.second.is_string()) {
art_tree *ft = new art_tree;
art_tree_init(ft);
search_index.emplace(fname_field.second.faceted_name(), ft);
}
}
for(const auto & pair: sort_schema) {
spp::sparse_hash_map<uint32_t, int64_t> * doc_to_score = new spp::sparse_hash_map<uint32_t, int64_t>();
sort_index.emplace(pair.first, doc_to_score);
}
for(const auto& pair: facet_schema) {
spp::sparse_hash_map<uint32_t, facet_hash_values_t> *doc_to_values = new spp::sparse_hash_map<uint32_t, facet_hash_values_t>();
facet_index_v3.emplace(pair.first, doc_to_values);
}
num_documents = 0;
}
Index::~Index() {
for(auto & name_tree: search_index) {
art_tree_destroy(name_tree.second);
delete name_tree.second;
name_tree.second = nullptr;
}
for(auto & name_tree: numerical_index) {
delete name_tree.second;
name_tree.second = nullptr;
}
search_index.clear();
for(auto & name_map: sort_index) {
delete name_map.second;
name_map.second = nullptr;
}
sort_index.clear();
for(auto& kv: facet_index_v3) {
delete kv.second;
kv.second = nullptr;
}
facet_index_v3.clear();
}
int64_t Index::get_points_from_doc(const nlohmann::json &document, const std::string & default_sorting_field) {
int64_t points = 0;
if(document[default_sorting_field].is_number_float()) {
// serialize float to an integer and reverse the inverted range
float n = document[default_sorting_field];
memcpy(&points, &n, sizeof(int32_t));
points ^= ((points >> (std::numeric_limits<int32_t>::digits - 1)) | INT32_MIN);
points = -1 * (INT32_MAX - points);
} else {
points = document[default_sorting_field];
}
return points;
}
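// Order-preserving float => int mapping used by float_to_in64_t below.
// Illustrative values (hand-computed, not from the original source):
// 1.0f => 1065353216 (its raw IEEE-754 bits) and -1.0f => -1065353217,
// so f1 < f2 always implies float_to_in64_t(f1) < float_to_in64_t(f2):
// positive floats keep their bit pattern, while negative floats get their
// non-sign bits flipped so that more-negative values map to smaller integers.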
int64_t Index::float_to_in64_t(float f) {
// https://stackoverflow.com/questions/60530255/convert-float-to-int64-t-while-preserving-ordering
int32_t i;
memcpy(&i, &f, sizeof i);
if (i < 0) {
i ^= INT32_MAX;
}
return i;
}
Option<uint32_t> Index::index_in_memory(const nlohmann::json &document, uint32_t seq_id,
const std::string & default_sorting_field) {
std::unique_lock lock(mutex);
int64_t points = 0;
if(document.count(default_sorting_field) == 0) {
if(sort_index.count(default_sorting_field) != 0 && sort_index[default_sorting_field]->count(seq_id)) {
points = sort_index[default_sorting_field]->at(seq_id);
} else {
points = INT64_MIN;
}
} else {
points = get_points_from_doc(document, default_sorting_field);
}
seq_ids.append(seq_id);
// assumes that validation has already been done
for(const auto& field_pair: search_schema) {
const std::string & field_name = field_pair.first;
if(document.count(field_name) == 0 || !field_pair.second.index) {
continue;
}
bool is_facet = (facet_schema.count(field_name) != 0);
// non-string, non-geo faceted field should be indexed as faceted string field as well
if(field_pair.second.facet && !field_pair.second.is_string() && field_pair.second.type != field_types::GEOPOINT) {
art_tree *t = search_index.at(field_pair.second.faceted_name());
if(field_pair.second.is_array()) {
std::vector<std::string> strings;
if(field_pair.second.type == field_types::INT32_ARRAY) {
for(int32_t value: document[field_name]){
strings.push_back(std::to_string(value));
}
} else if(field_pair.second.type == field_types::INT64_ARRAY) {
for(int64_t value: document[field_name]){
strings.push_back(std::to_string(value));
}
} else if(field_pair.second.type == field_types::FLOAT_ARRAY) {
for(float value: document[field_name]){
strings.push_back(std::to_string(value));
}
} else if(field_pair.second.type == field_types::BOOL_ARRAY) {
for(bool value: document[field_name]){
strings.push_back(std::to_string(value));
}
}
index_string_array_field(strings, points, t, seq_id, is_facet, field_pair.second);
} else {
std::string text;
if(field_pair.second.type == field_types::INT32) {
text = std::to_string(document[field_name].get<int32_t>());
} else if(field_pair.second.type == field_types::INT64) {
text = std::to_string(document[field_name].get<int64_t>());
} else if(field_pair.second.type == field_types::FLOAT) {
text = std::to_string(document[field_name].get<float>());
} else if(field_pair.second.type == field_types::BOOL) {
text = std::to_string(document[field_name].get<bool>());
}
index_string_field(text, points, t, seq_id, is_facet, field_pair.second);
}
}
if(field_pair.second.type == field_types::STRING) {
art_tree *t = search_index.at(field_name);
const std::string & text = document[field_name];
index_string_field(text, points, t, seq_id, is_facet, field_pair.second);
}
else if(field_pair.second.type == field_types::INT32) {
auto num_tree = numerical_index.at(field_name);
int32_t value = document[field_name]; // signed, so negative int32 values sign-extend correctly
num_tree->insert(value, seq_id);
} else if(field_pair.second.type == field_types::INT64) {
auto num_tree = numerical_index.at(field_name);
int64_t value = document[field_name];
num_tree->insert(value, seq_id);
} else if(field_pair.second.type == field_types::FLOAT) {
auto num_tree = numerical_index.at(field_name);
float fvalue = document[field_name];
int64_t value = float_to_in64_t(fvalue);
num_tree->insert(value, seq_id);
} else if(field_pair.second.type == field_types::BOOL) {
auto num_tree = numerical_index.at(field_name);
bool value = document[field_name];
num_tree->insert(value, seq_id);
} else if(field_pair.second.type == field_types::GEOPOINT) {
auto num_tree = numerical_index.at(field_name);
const std::vector<double>& latlong = document[field_name];
GeoCoord x {degsToRads(latlong[0]), degsToRads(latlong[1])};
H3Index geoHash = geoToH3(&x, field_pair.second.geo_resolution);
//LOG(INFO) << "Indexing h3 index " << geoHash << " for seq_id " << seq_id << " at res: " << size_t(field_pair.second.geo_resolution);
num_tree->insert(geoHash, seq_id);
} else if(field_pair.second.type == field_types::STRING_ARRAY) {
art_tree *t = search_index.at(field_name);
index_string_array_field(document[field_name], points, t, seq_id, is_facet, field_pair.second);
}
else if(field_pair.second.is_array()) {
auto num_tree = numerical_index.at(field_name);
for(size_t arr_i = 0; arr_i < document[field_name].size(); arr_i++) {
const auto& arr_value = document[field_name][arr_i];
if(field_pair.second.type == field_types::INT32_ARRAY) {
const int32_t value = arr_value;
num_tree->insert(value, seq_id);
}
else if(field_pair.second.type == field_types::INT64_ARRAY) {
const int64_t value = arr_value;
num_tree->insert(value, seq_id);
}
else if(field_pair.second.type == field_types::FLOAT_ARRAY) {
const float fvalue = arr_value;
int64_t value = float_to_in64_t(fvalue);
num_tree->insert(value, seq_id);
}
else if(field_pair.second.type == field_types::BOOL_ARRAY) {
const bool value = document[field_name][arr_i];
num_tree->insert(int64_t(value), seq_id);
}
}
}
// add numerical values automatically into sort index
if(field_pair.second.type == field_types::INT32 || field_pair.second.type == field_types::INT64 ||
field_pair.second.type == field_types::FLOAT || field_pair.second.type == field_types::BOOL ||
field_pair.second.type == field_types::GEOPOINT) {
spp::sparse_hash_map<uint32_t, int64_t> *doc_to_score = sort_index.at(field_pair.first);
if(field_pair.second.is_integer() ) {
doc_to_score->emplace(seq_id, document[field_pair.first].get<int64_t>());
} else if(field_pair.second.is_float()) {
int64_t ifloat = float_to_in64_t(document[field_pair.first].get<float>());
doc_to_score->emplace(seq_id, ifloat);
} else if(field_pair.second.is_bool()) {
doc_to_score->emplace(seq_id, (int64_t) document[field_pair.first].get<bool>());
} else if(field_pair.second.is_geopoint()) {
const std::vector<double>& latlong = document[field_pair.first];
GeoCoord x {degsToRads(latlong[0]), degsToRads(latlong[1])};
H3Index geoHash = geoToH3(&x, FINEST_GEO_RESOLUTION);
doc_to_score->emplace(seq_id, (int64_t)(geoHash));
}
}
}
num_documents += 1;
return Option<>(201);
}
Option<uint32_t> Index::validate_index_in_memory(nlohmann::json& document, uint32_t seq_id,
const std::string & default_sorting_field,
const std::unordered_map<std::string, field> & search_schema,
const std::map<std::string, field> & facet_schema,
bool is_update,
const std::string& fallback_field_type,
const DIRTY_VALUES& dirty_values) {
bool missing_default_sort_field = (!default_sorting_field.empty() && document.count(default_sorting_field) == 0);
if(!is_update && missing_default_sort_field) {
return Option<>(400, "Field `" + default_sorting_field + "` has been declared as a default sorting field, "
"but is not found in the document.");
}
for(const auto& field_pair: search_schema) {
const std::string& field_name = field_pair.first;
const field& a_field = field_pair.second;
if((a_field.optional || is_update) && document.count(field_name) == 0) {
continue;
}
if(document.count(field_name) == 0) {
return Option<>(400, "Field `" + field_name + "` has been declared in the schema, "
"but is not found in the document.");
}
nlohmann::json::iterator dummy_iter;
bool array_ele_erased = false;
if(a_field.type == field_types::STRING && !document[field_name].is_string()) {
Option<uint32_t> coerce_op = coerce_string(dirty_values, fallback_field_type, a_field, document, field_name, dummy_iter, false, array_ele_erased);
if(!coerce_op.ok()) {
return coerce_op;
}
} else if(a_field.type == field_types::INT32) {
if(!document[field_name].is_number_integer()) {
Option<uint32_t> coerce_op = coerce_int32_t(dirty_values, a_field, document, field_name, dummy_iter, false, array_ele_erased);
if(!coerce_op.ok()) {
return coerce_op;
}
}
if(document[field_name].get<int64_t>() > INT32_MAX || document[field_name].get<int64_t>() < INT32_MIN) {
if(a_field.optional && (dirty_values == DIRTY_VALUES::DROP || dirty_values == DIRTY_VALUES::COERCE_OR_REJECT)) {
document.erase(field_name);
continue;
} else {
return Option<>(400, "Field `" + field_name + "` exceeds the range of an int32.");
}
}
} else if(a_field.type == field_types::INT64 && !document[field_name].is_number_integer()) {
Option<uint32_t> coerce_op = coerce_int64_t(dirty_values, a_field, document, field_name, dummy_iter, false, array_ele_erased);
if(!coerce_op.ok()) {
return coerce_op;
}
} else if(a_field.type == field_types::FLOAT && !document[field_name].is_number()) {
// using `is_number` allows integer to be passed to a float field
Option<uint32_t> coerce_op = coerce_float(dirty_values, a_field, document, field_name, dummy_iter, false, array_ele_erased);
if(!coerce_op.ok()) {
return coerce_op;
}
} else if(a_field.type == field_types::BOOL && !document[field_name].is_boolean()) {
Option<uint32_t> coerce_op = coerce_bool(dirty_values, a_field, document, field_name, dummy_iter, false, array_ele_erased);
if(!coerce_op.ok()) {
return coerce_op;
}
} else if(a_field.is_array()) {
if(!document[field_name].is_array()) {
if(a_field.optional && (dirty_values == DIRTY_VALUES::DROP ||
dirty_values == DIRTY_VALUES::COERCE_OR_DROP)) {
document.erase(field_name);
continue;
} else {
return Option<>(400, "Field `" + field_name + "` must be an array.");
}
}
nlohmann::json::iterator it = document[field_name].begin();
for(; it != document[field_name].end(); ) {
const auto& item = it.value();
array_ele_erased = false;
if (a_field.type == field_types::STRING_ARRAY && !item.is_string()) {
Option<uint32_t> coerce_op = coerce_string(dirty_values, fallback_field_type, a_field, document, field_name, it, true, array_ele_erased);
if (!coerce_op.ok()) {
return coerce_op;
}
} else if (a_field.type == field_types::INT32_ARRAY && !item.is_number_integer()) {
Option<uint32_t> coerce_op = coerce_int32_t(dirty_values, a_field, document, field_name, it, true, array_ele_erased);
if (!coerce_op.ok()) {
return coerce_op;
}
} else if (a_field.type == field_types::INT64_ARRAY && !item.is_number_integer()) {
Option<uint32_t> coerce_op = coerce_int64_t(dirty_values, a_field, document, field_name, it, true, array_ele_erased);
if (!coerce_op.ok()) {
return coerce_op;
}
} else if (a_field.type == field_types::FLOAT_ARRAY && !item.is_number()) {
// we check for `is_number` to allow whole numbers to be passed into float fields
Option<uint32_t> coerce_op = coerce_float(dirty_values, a_field, document, field_name, it, true, array_ele_erased);
if (!coerce_op.ok()) {
return coerce_op;
}
} else if (a_field.type == field_types::BOOL_ARRAY && !item.is_boolean()) {
Option<uint32_t> coerce_op = coerce_bool(dirty_values, a_field, document, field_name, it, true, array_ele_erased);
if (!coerce_op.ok()) {
return coerce_op;
}
}
if(!array_ele_erased) {
// when an element is erased, the iterator has already been advanced
it++;
}
}
}
}
return Option<>(200);
}
void Index::scrub_reindex_doc(nlohmann::json& update_doc, nlohmann::json& del_doc, nlohmann::json& old_doc) {
std::vector<std::string> del_keys;
for(auto it = del_doc.cbegin(); it != del_doc.cend(); it++) {
const std::string& field_name = it.key();
const auto& search_field_it = search_schema.find(field_name);
if(search_field_it == search_schema.end()) {
continue;
}
const auto& search_field = search_field_it->second;
bool arrays_match = false;
// compare values between old and update docs:
// if they match, we will remove them from both del and update docs
if(search_field.is_string()) {
// Go through all the field names and find the keys+values so that they can be removed from in-memory index
std::vector<std::string> reindex_vals;
std::vector<std::string> old_vals;
tokenize_doc_field(update_doc, search_field, reindex_vals);
tokenize_doc_field(old_doc, search_field, old_vals);
arrays_match = _arrays_match<std::string>(reindex_vals, old_vals);
} else if(search_field.is_int32()) {
std::vector<int32_t> reindex_vals = search_field.is_single_integer() ?
std::vector<int32_t>{update_doc[field_name].get<int32_t>()} :
update_doc[field_name].get<std::vector<int32_t>>();
std::vector<int32_t> old_vals = search_field.is_single_integer() ?
std::vector<int32_t>{old_doc[field_name].get<int32_t>()} :
old_doc[field_name].get<std::vector<int32_t>>();
arrays_match = _arrays_match<int32_t>(reindex_vals, old_vals);
} else if(search_field.is_int64()) {
std::vector<int64_t> reindex_vals = search_field.is_single_integer() ?
std::vector<int64_t>{update_doc[field_name].get<int64_t>()} :
update_doc[field_name].get<std::vector<int64_t>>();
std::vector<int64_t> old_vals = search_field.is_single_integer() ?
std::vector<int64_t>{old_doc[field_name].get<int64_t>()} :
old_doc[field_name].get<std::vector<int64_t>>();
arrays_match = _arrays_match<int64_t>(reindex_vals, old_vals);
} else if(search_field.is_float()) {
std::vector<float> reindex_vals = search_field.is_single_float() ?
std::vector<float>{update_doc[field_name].get<float>()} :
update_doc[field_name].get<std::vector<float>>();
std::vector<float> old_vals = search_field.is_single_float() ?
std::vector<float>{old_doc[field_name].get<float>()} :
old_doc[field_name].get<std::vector<float>>();
arrays_match = _arrays_match<float>(reindex_vals, old_vals);
} else if(search_field.is_bool()) {
std::vector<bool> reindex_vals = search_field.is_single_bool() ?
std::vector<bool>{update_doc[field_name].get<bool>()} :
update_doc[field_name].get<std::vector<bool>>();
std::vector<bool> old_vals = search_field.is_single_bool() ?
std::vector<bool>{old_doc[field_name].get<bool>()} :
old_doc[field_name].get<std::vector<bool>>();
arrays_match = _arrays_match<bool>(reindex_vals, old_vals);
}
if(arrays_match) {
del_keys.push_back(field_name);
}
}
for(const auto& del_key: del_keys) {
del_doc.erase(del_key);
update_doc.erase(del_key);
}
}
size_t Index::batch_memory_index(Index *index, std::vector<index_record> & iter_batch,
const std::string & default_sorting_field,
const std::unordered_map<std::string, field> & search_schema,
const std::map<std::string, field> & facet_schema,
const std::string& fallback_field_type) {
size_t num_indexed = 0;
for(auto & index_rec: iter_batch) {
if(!index_rec.indexed.ok()) {
// some records could have been invalidated upstream
continue;
}
if(index_rec.operation != DELETE) {
Option<uint32_t> validation_op = validate_index_in_memory(index_rec.doc, index_rec.seq_id,
default_sorting_field,
search_schema, facet_schema,
index_rec.is_update,
fallback_field_type,
index_rec.dirty_values);
if(!validation_op.ok()) {
index_rec.index_failure(validation_op.code(), validation_op.error());
continue;
}
if(index_rec.is_update) {
// scrub string fields to reduce delete ops
get_doc_changes(index_rec.doc, index_rec.old_doc, index_rec.new_doc, index_rec.del_doc);
index->scrub_reindex_doc(index_rec.doc, index_rec.del_doc, index_rec.old_doc);
index->remove(index_rec.seq_id, index_rec.del_doc);
}
Option<uint32_t> index_mem_op(0);
try {
index_mem_op = index->index_in_memory(index_rec.doc, index_rec.seq_id, default_sorting_field);
} catch(const std::exception& e) {
const std::string& error_msg = std::string("Fatal error during indexing: ") + e.what();
LOG(ERROR) << error_msg << ", document: " << index_rec.doc;
index_mem_op = Option<uint32_t>(500, error_msg);
}
if(!index_mem_op.ok()) {
index->index_in_memory(index_rec.del_doc, index_rec.seq_id, default_sorting_field);
index_rec.index_failure(index_mem_op.code(), index_mem_op.error());
continue;
}
index_rec.index_success();
if(!index_rec.is_update) {
num_indexed++;
}
}
}
return num_indexed;
}
void Index::insert_doc(const int64_t score, art_tree *t, uint32_t seq_id,
const std::unordered_map<std::string, std::vector<uint32_t>> &token_to_offsets) const {
for(auto & kv: token_to_offsets) {
art_document art_doc;
art_doc.id = seq_id;
art_doc.score = score;
art_doc.offsets_len = (uint32_t) kv.second.size();
art_doc.offsets = new uint32_t[kv.second.size()];
uint32_t num_hits = 0;
const unsigned char *key = (const unsigned char *) kv.first.c_str();
int key_len = (int) kv.first.length() + 1; // for the terminating \0 char
art_leaf* leaf = (art_leaf *) art_search(t, key, key_len);
if(leaf != NULL) {
num_hits = leaf->values->ids.getLength();
}
num_hits += 1;
for(size_t i=0; i<kv.second.size(); i++) {
art_doc.offsets[i] = kv.second[i];
}
//LOG(INFO) << "key: " << key << ", art_doc.id: " << art_doc.id;
art_insert(t, key, key_len, &art_doc, num_hits);
delete [] art_doc.offsets;
art_doc.offsets = nullptr;
}
}
uint64_t Index::facet_token_hash(const field & a_field, const std::string &token) {
// for integer/float use their native values
uint64_t hash = 0;
if(a_field.is_float()) {
float f = std::stof(token);
reinterpret_cast<float&>(hash) = f; // store the raw float bits so the value is recoverable without precision loss
} else if(a_field.is_integer() || a_field.is_bool()) {
hash = atoll(token.c_str());
} else {
// string field
hash = StringUtils::hash_wy(token.c_str(), token.size());
}
return hash;
}
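// Note on the hashes above: for float fields the "hash" is the raw IEEE-754
// bit pattern, which lets compute_facet_stats() recover the numeric value
// later via reinterpret_cast; integer/bool tokens are likewise their own
// values, so only string tokens are actually hashed (wyhash).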
void Index::index_string_field(const std::string & text, const int64_t score, art_tree *t,
uint32_t seq_id, bool is_facet, const field & a_field) {
std::unordered_map<std::string, std::vector<uint32_t>> token_to_offsets;
Tokenizer tokenizer(text, false, true, !a_field.is_string());
std::string token;
size_t token_index = 0;
std::vector<uint64_t> facet_hashes;
while(tokenizer.next(token, token_index)) {
if(token.empty()) {
continue;
}
if(is_facet) {
uint64_t hash = facet_token_hash(a_field, token);
facet_hashes.push_back(hash);
}
token_to_offsets[token].push_back(token_index);
}
/*if(seq_id == 0) {
LOG(INFO) << "field name: " << a_field.name;
}*/
insert_doc(score, t, seq_id, token_to_offsets);
if(is_facet) {
facet_hash_values_t fhashvalues;
fhashvalues.length = facet_hashes.size();
fhashvalues.hashes = new uint64_t[facet_hashes.size()];
for(size_t i = 0; i < facet_hashes.size(); i++) {
fhashvalues.hashes[i] = facet_hashes[i];
}
facet_index_v3[a_field.name]->emplace(seq_id, std::move(fhashvalues));
}
}
void Index::index_string_array_field(const std::vector<std::string> & strings, const int64_t score, art_tree *t,
uint32_t seq_id, bool is_facet, const field & a_field) {
std::unordered_map<std::string, std::vector<uint32_t>> token_positions;
std::vector<uint64_t> facet_hashes;
for(size_t array_index = 0; array_index < strings.size(); array_index++) {
const std::string& str = strings[array_index];
std::set<std::string> token_set; // required to deal with repeating tokens
Tokenizer tokenizer(str, false, true, !a_field.is_string());
std::string token;
size_t token_index = 0;
// iterate and append offset positions
while(tokenizer.next(token, token_index)) {
if(token.empty()) {
continue;
}
if(is_facet) {
uint64_t hash = facet_token_hash(a_field, token);
facet_hashes.push_back(hash);
//LOG(INFO) << "indexing " << token << ", hash:" << hash;
}
token_positions[token].push_back(token_index);
token_set.insert(token);
}
if(is_facet) {
facet_hashes.push_back(FACET_ARRAY_DELIMETER); // marks the end of this array element's hashes
}
// repeat last element to indicate end of offsets for this array index
for(auto & the_token: token_set) {
token_positions[the_token].push_back(token_positions[the_token].back());
}
// iterate and append this array index to all tokens
for(auto & the_token: token_set) {
token_positions[the_token].push_back(array_index);
}
}
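// Worked example of the layout built above (illustrative, not from the
// original source): for strings = {"foo bar", "bar"} we end up with
//   token_positions["foo"] = {0, 0, 0}
//   token_positions["bar"] = {1, 1, 0, 0, 0, 1}
// i.e. per array element containing the token: its offsets, then the last
// offset repeated as an end marker, then the array index appended.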
if(is_facet) {
facet_hash_values_t fhashvalues;
fhashvalues.length = facet_hashes.size();
fhashvalues.hashes = new uint64_t[facet_hashes.size()];
for(size_t i = 0; i < facet_hashes.size(); i++) {
fhashvalues.hashes[i] = facet_hashes[i];
}
facet_index_v3[a_field.name]->emplace(seq_id, std::move(fhashvalues));
}
insert_doc(score, t, seq_id, token_positions);
}
void Index::compute_facet_stats(facet &a_facet, uint64_t raw_value, const std::string & field_type) {
if(field_type == field_types::INT32 || field_type == field_types::INT32_ARRAY) {
int32_t val = raw_value;
if (val < a_facet.stats.fvmin) {
a_facet.stats.fvmin = val;
}
if (val > a_facet.stats.fvmax) {
a_facet.stats.fvmax = val;
}
a_facet.stats.fvsum += val;
a_facet.stats.fvcount++;
} else if(field_type == field_types::INT64 || field_type == field_types::INT64_ARRAY) {
int64_t val = raw_value;
if(val < a_facet.stats.fvmin) {
a_facet.stats.fvmin = val;
}
if(val > a_facet.stats.fvmax) {
a_facet.stats.fvmax = val;
}
a_facet.stats.fvsum += val;
a_facet.stats.fvcount++;
} else if(field_type == field_types::FLOAT || field_type == field_types::FLOAT_ARRAY) {
float val = reinterpret_cast<float&>(raw_value);
if(val < a_facet.stats.fvmin) {
a_facet.stats.fvmin = val;
}
if(val > a_facet.stats.fvmax) {
a_facet.stats.fvmax = val;
}
a_facet.stats.fvsum += val;
a_facet.stats.fvcount++;
}
}
void Index::do_facets(std::vector<facet> & facets, facet_query_t & facet_query,
size_t group_limit, const std::vector<std::string>& group_by_fields,
const uint32_t* result_ids, size_t results_size) const {
struct facet_info_t {
// facet hash => token position in the query
std::unordered_map<uint64_t, token_pos_cost_t> fhash_qtoken_pos;
bool use_facet_query = false;
bool should_compute_stats = false;
field facet_field{"", "", false};
};
std::vector<facet_info_t> facet_infos(facets.size());
for(size_t findex=0; findex < facets.size(); findex++) {
const auto& a_facet = facets[findex];
facet_infos[findex].use_facet_query = false;
const field &facet_field = facet_schema.at(a_facet.field_name);
facet_infos[findex].facet_field = facet_field;
facet_infos[findex].should_compute_stats = (facet_field.type != field_types::STRING &&
facet_field.type != field_types::BOOL &&
facet_field.type != field_types::STRING_ARRAY &&
facet_field.type != field_types::BOOL_ARRAY);
if(a_facet.field_name == facet_query.field_name && !facet_query.query.empty()) {
facet_infos[findex].use_facet_query = true;
if (facet_field.is_bool()) {
if (facet_query.query == "true") {
facet_query.query = "1";
} else if (facet_query.query == "false") {
facet_query.query = "0";
}
}
// for non-string fields, `faceted_name` returns their aliased stringified field name
art_tree *t = search_index.at(facet_field.faceted_name());
std::vector<std::string> query_tokens;
Tokenizer(facet_query.query, false, true, !facet_field.is_string()).tokenize(query_tokens);
for (size_t qtoken_index = 0; qtoken_index < query_tokens.size(); qtoken_index++) {
auto &q = query_tokens[qtoken_index];
int bounded_cost = (q.size() < 3) ? 0 : 1;
bool prefix_search = (qtoken_index ==
(query_tokens.size() - 1)); // only last token must be used as prefix
std::vector<art_leaf *> leaves;
art_fuzzy_search(t, (const unsigned char *) q.c_str(),
q.size(), 0, bounded_cost, 10000,
token_ordering::MAX_SCORE, prefix_search, nullptr, 0, leaves);
for (size_t leaf_index = 0; leaf_index < leaves.size(); leaf_index++) {
const auto &leaf = leaves[leaf_index];
// calculate hash without terminating null char
std::string key_str((const char *) leaf->key, leaf->key_len - 1);
uint64_t hash = facet_token_hash(facet_field, key_str);
token_pos_cost_t token_pos_cost = {qtoken_index, 0};
facet_infos[findex].fhash_qtoken_pos.emplace(hash, token_pos_cost);
//printf("%.*s - %llu\n", leaf->key_len, leaf->key, hash);
}
}
}
}
// assumed that facet fields have already been validated upstream
for(size_t findex=0; findex < facets.size(); findex++) {
auto& a_facet = facets[findex];
const auto& facet_field = facet_infos[findex].facet_field;
const bool use_facet_query = facet_infos[findex].use_facet_query;
const auto& fhash_qtoken_pos = facet_infos[findex].fhash_qtoken_pos;
const bool should_compute_stats = facet_infos[findex].should_compute_stats;
const auto& field_facet_mapping_it = facet_index_v3.find(a_facet.field_name);
if(field_facet_mapping_it == facet_index_v3.end()) {
continue;
}
const auto& field_facet_mapping = field_facet_mapping_it->second;
for(size_t i = 0; i < results_size; i++) {
uint32_t doc_seq_id = result_ids[i];
const auto& facet_hashes_it = field_facet_mapping->find(doc_seq_id);
if(facet_hashes_it == field_facet_mapping->end()) {
continue;
}
// FORMAT OF VALUES
// String: h1 h2 h3
// String array: h1 h2 h3 0 h1 0 h1 h2 0
const auto& facet_hashes = facet_hashes_it->second;
const uint64_t distinct_id = group_limit ? get_distinct_id(group_by_fields, doc_seq_id) : 0;
int array_pos = 0;
bool fvalue_found = false;
uint64_t combined_hash = 1; // for hashing the entire facet value (multiple tokens)
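// Sketch of how combined_hash accumulates below (constant per the SO answer
// referenced further down): for token hashes h1, h2 it becomes
// (1779033703 + 2*h1*1) * (1779033703 + 2*h2*2), so the same tokens in a
// different order hash differently; each factor is odd, so the product
// never collapses to zero.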
std::unordered_map<uint32_t, token_pos_cost_t> query_token_positions;
size_t field_token_index = -1;
auto fhashes = facet_hashes.hashes;
for(size_t j = 0; j < facet_hashes.size(); j++) {
if(fhashes[j] != FACET_ARRAY_DELIMETER) {
uint64_t ftoken_hash = fhashes[j];
field_token_index++;
// reference: https://stackoverflow.com/a/4182771/131050
// the token index is mixed in so that the hash is order-sensitive
combined_hash *= (1779033703 + 2*ftoken_hash*(field_token_index+1));
// ftoken_hash is the raw value for numeric fields
if(should_compute_stats) {
compute_facet_stats(a_facet, ftoken_hash, facet_field.type);
}
const auto fhash_qtoken_pos_it = fhash_qtoken_pos.find(ftoken_hash);
// not using facet query or this particular facet value is found in facet filter
if(!use_facet_query || fhash_qtoken_pos_it != fhash_qtoken_pos.end()) {
fvalue_found = true;
if(use_facet_query) {
// map token index to query index (used for highlighting later on)
const token_pos_cost_t& qtoken_pos = fhash_qtoken_pos_it->second;
// if the query token has already matched another token in the string
// we will replace the position only if the cost is lower
if(query_token_positions.find(qtoken_pos.pos) == query_token_positions.end() ||
query_token_positions[qtoken_pos.pos].cost >= qtoken_pos.cost ) {
token_pos_cost_t ftoken_pos_cost = {field_token_index, qtoken_pos.cost};
query_token_positions[qtoken_pos.pos] = ftoken_pos_cost;
}
}
}
}
// 0 indicates separator, while the second condition checks for non-array string
if(fhashes[j] == FACET_ARRAY_DELIMETER || (facet_hashes.back() != FACET_ARRAY_DELIMETER && j == facet_hashes.size() - 1)) {
if(!use_facet_query || fvalue_found) {
uint64_t fhash = combined_hash;
if(a_facet.result_map.count(fhash) == 0) {
a_facet.result_map.emplace(fhash, facet_count_t{0, spp::sparse_hash_set<uint64_t>(),
doc_seq_id, 0,
std::unordered_map<uint32_t, token_pos_cost_t>()});
}
facet_count_t& facet_count = a_facet.result_map[fhash];
/*LOG(INFO) << "field: " << a_facet.field_name << ", doc id: " << doc_seq_id
<< ", hash: " << fhash;*/
facet_count.doc_id = doc_seq_id;
facet_count.array_pos = array_pos;
if(group_limit) {
facet_count.groups.emplace(distinct_id);
} else {
facet_count.count += 1;
}
if(use_facet_query) {
facet_count.query_token_pos = query_token_positions;
}
}
array_pos++;
fvalue_found = false;
combined_hash = 1;
std::unordered_map<uint32_t, token_pos_cost_t>().swap(query_token_positions);
field_token_index = -1;
}
}
}
}
}
void Index::search_candidates(const uint8_t & field_id,
uint32_t* filter_ids, size_t filter_ids_length,
const uint32_t* exclude_token_ids, size_t exclude_token_ids_size,
const std::vector<uint32_t>& curated_ids,
const std::vector<sort_by> & sort_fields,
std::vector<token_candidates> & token_candidates_vec,
std::vector<std::vector<art_leaf*>> & searched_queries, Topster* topster,
spp::sparse_hash_set<uint64_t>& groups_processed,
uint32_t** all_result_ids, size_t & all_result_ids_len,
size_t& field_num_results,
const size_t typo_tokens_threshold,
const size_t group_limit, const std::vector<std::string>& group_by_fields) const {
const long long combination_limit = 10;
auto product = []( long long a, token_candidates & b ) { return a*b.candidates.size(); };
long long int N = std::accumulate(token_candidates_vec.begin(), token_candidates_vec.end(), 1LL, product);
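// N is the total number of token combinations (product of candidate counts
// per query token). Presumably next_suggestion() decodes n like a mixed-radix
// number to pick one candidate per token (its body is not shown here); at
// most `combination_limit` combinations are tried.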
for(long long n=0; n<N && n<combination_limit; ++n) {
// every element in `query_suggestion` contains a token and its associated hits
std::vector<art_leaf*> query_suggestion(token_candidates_vec.size());
// actual query suggestion preserves original order of tokens in query
std::vector<art_leaf*> actual_query_suggestion(token_candidates_vec.size());
uint32_t token_bits = (uint32_t(1) << 31); // top-most bit set to guarantee at least 1 bit set
uint32_t total_cost = next_suggestion(token_candidates_vec, n, actual_query_suggestion,
query_suggestion, token_bits);
/*LOG(INFO) << "n: " << n;
for(size_t i=0; i < query_suggestion.size(); i++) {
LOG(INFO) << "i: " << i << " - " << query_suggestion[i]->key << ", ids: "
<< query_suggestion[i]->values->ids.getLength() << ", total_cost: " << total_cost;
}*/
// initialize results with the starting element (for further intersection)
size_t result_size = query_suggestion[0]->values->ids.getLength();
if(result_size == 0) {
continue;
}
uint32_t* result_ids = query_suggestion[0]->values->ids.uncompress();
// intersect the document ids for each token to find docs that contain all the tokens (stored in `result_ids`)
for(size_t i=1; i < query_suggestion.size(); i++) {
uint32_t* out = nullptr;
uint32_t* ids = query_suggestion[i]->values->ids.uncompress();
result_size = ArrayUtils::and_scalar(ids, query_suggestion[i]->values->ids.getLength(), result_ids, result_size, &out);
delete[] ids;
delete[] result_ids;
result_ids = out;
}
if(result_size == 0) {
delete[] result_ids;
continue;
}
// Exclude document IDs associated with excluded tokens from the result set
if(exclude_token_ids_size != 0) {
uint32_t *excluded_result_ids = nullptr;
result_size = ArrayUtils::exclude_scalar(result_ids, result_size, exclude_token_ids, exclude_token_ids_size,
&excluded_result_ids);
delete[] result_ids;
result_ids = excluded_result_ids;
}
if(!curated_ids.empty()) {
uint32_t *excluded_result_ids = nullptr;
result_size = ArrayUtils::exclude_scalar(result_ids, result_size, &curated_ids[0],
curated_ids.size(), &excluded_result_ids);
delete [] result_ids;
result_ids = excluded_result_ids;
}
//LOG(INFO) << "n: " << n;
/*std::stringstream log_query;
for(size_t i=0; i < query_suggestion.size(); i++) {
log_query << query_suggestion[i]->key << " ";
}*/
if(filter_ids != nullptr) {
// intersect once again with filter ids
uint32_t* filtered_result_ids = nullptr;
size_t filtered_results_size = ArrayUtils::and_scalar(filter_ids, filter_ids_length, result_ids,
result_size, &filtered_result_ids);
uint32_t* new_all_result_ids = nullptr;
all_result_ids_len = ArrayUtils::or_scalar(*all_result_ids, all_result_ids_len, filtered_result_ids,
filtered_results_size, &new_all_result_ids);
delete [] *all_result_ids;
*all_result_ids = new_all_result_ids;
// go through each matching document id and calculate match score
score_results(sort_fields, (uint16_t) searched_queries.size(), field_id, total_cost, topster, query_suggestion,
groups_processed, filtered_result_ids, filtered_results_size,
group_limit, group_by_fields, token_bits);
field_num_results += filtered_results_size;
delete[] filtered_result_ids;
delete[] result_ids;
} else {
uint32_t* new_all_result_ids = nullptr;
all_result_ids_len = ArrayUtils::or_scalar(*all_result_ids, all_result_ids_len, result_ids,
result_size, &new_all_result_ids);
delete [] *all_result_ids;
*all_result_ids = new_all_result_ids;
/*if(result_size != 0) {
LOG(INFO) << size_t(field_id) << " - " << log_query.str() << ", result_size: " << result_size;
}*/
score_results(sort_fields, (uint16_t) searched_queries.size(), field_id, total_cost, topster, query_suggestion,
groups_processed, result_ids, result_size, group_limit, group_by_fields, token_bits);
field_num_results += result_size;
delete[] result_ids;
}
searched_queries.push_back(actual_query_suggestion);
//LOG(INFO) << "field_num_results: " << field_num_results << ", typo_tokens_threshold: " << typo_tokens_threshold;
if(field_num_results >= typo_tokens_threshold) {
break;
}
}
}
uint32_t Index::do_filtering(uint32_t** filter_ids_out, const std::vector<filter> & filters) const {
//auto begin = std::chrono::high_resolution_clock::now();
uint32_t* filter_ids = nullptr;
uint32_t filter_ids_length = 0;
for(size_t i = 0; i < filters.size(); i++) {
const filter & a_filter = filters[i];
bool has_search_index = search_index.count(a_filter.field_name) != 0 ||
numerical_index.count(a_filter.field_name) != 0;
if(!has_search_index) {
continue;
}
field f = search_schema.at(a_filter.field_name);
uint32_t* result_ids = nullptr;
size_t result_ids_len = 0;
if(f.is_integer()) {
auto num_tree = numerical_index.at(a_filter.field_name);
for(size_t fi=0; fi < a_filter.values.size(); fi++) {
const std::string & filter_value = a_filter.values[fi];
int64_t value = (int64_t) std::stol(filter_value);
if(a_filter.comparators[fi] == RANGE_INCLUSIVE && fi+1 < a_filter.values.size()) {
const std::string& next_filter_value = a_filter.values[fi+1];
int64_t range_end_value = (int64_t) std::stol(next_filter_value);
num_tree->range_inclusive_search(value, range_end_value, &result_ids, result_ids_len);
fi++;
} else {
num_tree->search(a_filter.comparators[fi], value, &result_ids, result_ids_len);
}
}
} else if(f.is_float()) {
auto num_tree = numerical_index.at(a_filter.field_name);
for(size_t fi=0; fi < a_filter.values.size(); fi++) {
const std::string & filter_value = a_filter.values[fi];
float value = (float) std::atof(filter_value.c_str());
int64_t float_int64 = float_to_in64_t(value);
if(a_filter.comparators[fi] == RANGE_INCLUSIVE && fi+1 < a_filter.values.size()) {
const std::string& next_filter_value = a_filter.values[fi+1];
int64_t range_end_value = float_to_in64_t((float) std::atof(next_filter_value.c_str()));
num_tree->range_inclusive_search(float_int64, range_end_value, &result_ids, result_ids_len);
fi++;
} else {
num_tree->search(a_filter.comparators[fi], float_int64, &result_ids, result_ids_len);
}
}
} else if(f.is_bool()) {
auto num_tree = numerical_index.at(a_filter.field_name);
size_t value_index = 0;
for(const std::string & filter_value: a_filter.values) {
int64_t bool_int64 = (filter_value == "1") ? 1 : 0;
num_tree->search(a_filter.comparators[value_index], bool_int64, &result_ids, result_ids_len);
value_index++;
}
} else if(f.is_geopoint()) {
auto num_tree = numerical_index.at(a_filter.field_name);
auto record_to_geo = sort_index.at(a_filter.field_name);
double indexed_edge_len = edgeLengthM(f.geo_resolution);
for(const std::string& filter_value: a_filter.values) {
std::vector<std::string> filter_value_parts;
StringUtils::split(filter_value, filter_value_parts, ","); // either "lat, lng, 2 km" or a list of polygon vertices
std::vector<uint32_t> geo_result_ids;
bool is_polygon = StringUtils::is_float(filter_value_parts.back());
if(is_polygon) {
const int num_verts = int(filter_value_parts.size()) / 2;
GeoCoord* verts = new GeoCoord[num_verts];
for(size_t point_index = 0; point_index < size_t(num_verts); point_index++) {
double lat = degsToRads(std::stod(filter_value_parts[point_index * 2]));
double lon = degsToRads(std::stod(filter_value_parts[point_index * 2 + 1]));
verts[point_index] = {lat, lon};
}
Geofence geo_fence = {num_verts, verts};
GeoPolygon geo_polygon = {geo_fence, 0, nullptr};
double lon_offset = transform_for_180th_meridian(geo_fence);
size_t num_hexagons = maxPolyfillSize(&geo_polygon, f.geo_resolution);
H3Index* hexagons = static_cast<H3Index *>(calloc(num_hexagons, sizeof(H3Index)));
polyfill(&geo_polygon, f.geo_resolution, hexagons);
// we will have to expand by kring=1 to ensure that hexagons completely cover the polygon
// see: https://github.com/uber/h3/issues/332
std::set<uint64_t> expanded_hexagons;
for (size_t hex_index = 0; hex_index < num_hexagons; hex_index++) {
// Some indexes may be 0 to indicate fewer than the maximum number of indexes.
if (hexagons[hex_index] != 0) {
expanded_hexagons.emplace(hexagons[hex_index]);
size_t k_rings = 1;
size_t max_neighboring = maxKringSize(k_rings);
H3Index* neighboring_indices = static_cast<H3Index *>(calloc(max_neighboring, sizeof(H3Index)));
kRing(hexagons[hex_index], k_rings, neighboring_indices);
for (size_t neighbour_index = 0; neighbour_index < max_neighboring; neighbour_index++) {
if (neighboring_indices[neighbour_index] != 0) {
expanded_hexagons.emplace(neighboring_indices[neighbour_index]);
}
}
free(neighboring_indices);
}
}
for(auto hex_id: expanded_hexagons) {
num_tree->get(hex_id, geo_result_ids);
}
// we will do an exact filtering again with point-in-poly checks
std::vector<uint32_t> exact_geo_result_ids;
for(auto result_id: geo_result_ids) {
GeoCoord point;
h3ToGeo(record_to_geo->at(result_id), &point);
point.lon = point.lon < 0.0 ? point.lon + lon_offset : point.lon;
if(is_point_in_polygon(geo_fence, point)) {
exact_geo_result_ids.push_back(result_id);
}
}
std::sort(exact_geo_result_ids.begin(), exact_geo_result_ids.end());
uint32_t *out = nullptr;
result_ids_len = ArrayUtils::or_scalar(&exact_geo_result_ids[0], exact_geo_result_ids.size(),
result_ids, result_ids_len, &out);
delete [] result_ids;
result_ids = out;
free(hexagons);
delete [] verts;
} else {
double radius = std::stof(filter_value_parts[2]);
const auto& unit = filter_value_parts[3];
if(unit == "km") {
radius *= 1000;
} else {
// assume "mi" (validated upstream)
radius *= 1609.34;
}
GeoCoord location;
location.lat = degsToRads(std::stod(filter_value_parts[0]));
location.lon = degsToRads(std::stod(filter_value_parts[1]));
H3Index query_index = geoToH3(&location, f.geo_resolution);
//LOG(INFO) << "query latlon: " << std::stod(filter_value_parts[0]) << ", " << std::stod(filter_value_parts[1]);
//LOG(INFO) << "query h3 index: " << query_index << " at res: " << size_t(f.geo_resolution);
size_t k_rings = size_t(std::ceil(radius / indexed_edge_len));
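// Illustrative sizing (assuming the field is indexed at H3 resolution 8,
// whose edge length is ~461 m): a 5 km radius yields
// k_rings = ceil(5000 / 461.354) = 11 rings of neighboring hexagons.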
size_t max_neighboring = maxKringSize(k_rings);
H3Index* neighboring_indices = static_cast<H3Index *>(calloc(max_neighboring, sizeof(H3Index)));
kRing(query_index, k_rings, neighboring_indices);
for (size_t hex_index = 0; hex_index < max_neighboring; hex_index++) {
// Some indexes may be 0 to indicate fewer than the maximum number of indexes.
if (neighboring_indices[hex_index] != 0) {
//LOG(INFO) << "Neighbour index: " << neighboring_indices[hex_index];
num_tree->get(neighboring_indices[hex_index], geo_result_ids);
}
}
free(neighboring_indices);
// `geo_result_ids` will contain all IDs that are within K-ring hexagons
// we still need to do another round of exact filtering on them
std::vector<uint32_t> exact_geo_result_ids;
H3Index query_point_index = geoToH3(&location, FINEST_GEO_RESOLUTION);
for(auto result_id: geo_result_ids) {
// h3Distance returns grid distance in hexagons; at FINEST_GEO_RESOLUTION each
// step is under a meter, so this only approximates the distance in meters
size_t actual_dist_meters = h3Distance(query_point_index, record_to_geo->at(result_id));
if(actual_dist_meters <= radius) {
exact_geo_result_ids.push_back(result_id);
}
}
std::sort(exact_geo_result_ids.begin(), exact_geo_result_ids.end());
uint32_t *out = nullptr;
result_ids_len = ArrayUtils::or_scalar(&exact_geo_result_ids[0], exact_geo_result_ids.size(),
result_ids, result_ids_len, &out);
delete [] result_ids;
result_ids = out;
}
}
} else if(f.is_string()) {
art_tree* t = search_index.at(a_filter.field_name);
uint32_t* ids = nullptr;
size_t ids_size = 0;
for(const std::string & filter_value: a_filter.values) {
uint32_t* strt_ids = nullptr;
size_t strt_ids_size = 0;
std::vector<art_leaf *> query_suggestion;
// there could be multiple tokens in a filter value, which we have to treat as ANDs
// e.g. country: South Africa
Tokenizer tokenizer(filter_value, false, true);
std::string str_token;
size_t token_index = 0;
std::vector<std::string> str_tokens;
while(tokenizer.next(str_token, token_index)) {
str_tokens.push_back(str_token);
art_leaf* leaf = (art_leaf *) art_search(t, (const unsigned char*) str_token.c_str(),
str_token.length()+1);
if(leaf == nullptr) {
continue;
}
query_suggestion.push_back(leaf);
if(strt_ids == nullptr) {
strt_ids = leaf->values->ids.uncompress();
strt_ids_size = leaf->values->ids.getLength();
} else {
// do AND for an exact match
uint32_t* out = nullptr;
uint32_t* leaf_ids = leaf->values->ids.uncompress();
strt_ids_size = ArrayUtils::and_scalar(strt_ids, strt_ids_size, leaf_ids,
leaf->values->ids.getLength(), &out);
delete[] leaf_ids;
delete[] strt_ids;
strt_ids = out;
}
}
if(a_filter.comparators[0] == EQUALS && f.is_facet()) {
// need to do exact match (unlike CONTAINS) by using the facet index
// field being a facet is already enforced upstream
uint32_t* exact_strt_ids = new uint32_t[strt_ids_size];
size_t exact_strt_size = 0;
for(size_t strt_ids_index = 0; strt_ids_index < strt_ids_size; strt_ids_index++) {
uint32_t seq_id = strt_ids[strt_ids_index];
const auto& fvalues = facet_index_v3.at(f.name)->at(seq_id);
bool found_filter = false;
if(!f.is_array()) {
found_filter = (query_suggestion.size() == fvalues.length);
} else {
uint64_t filter_hash = 1;
for(size_t sindex=0; sindex < str_tokens.size(); sindex++) {
auto& str_token = str_tokens[sindex];
uint64_t thash = facet_token_hash(f, str_token);
filter_hash *= (1779033703 + 2*thash*(sindex+1));
}
uint64_t all_fvalue_hash = 1;
size_t ftindex = 0;
for(size_t findex=0; findex < fvalues.size(); findex++) {
auto fhash = fvalues.hashes[findex];
if(fhash == FACET_ARRAY_DELIMETER) {
// end of array, check hash
if(all_fvalue_hash == filter_hash) {
found_filter = true;
break;
}
all_fvalue_hash = 1;
ftindex = 0;
} else {
all_fvalue_hash *= (1779033703 + 2*fhash*(ftindex + 1));
ftindex++;
}
}
}
if(found_filter) {
exact_strt_ids[exact_strt_size] = seq_id;
exact_strt_size++;
}
}
delete[] strt_ids;
strt_ids = exact_strt_ids;
strt_ids_size = exact_strt_size;
}
// Otherwise, we just ensure that given record contains tokens in the filter query
// (NOT implemented) if the query is wrapped by double quotes, ensure phrase match
// bool exact_match = (filter_value.front() == '"' && filter_value.back() == '"');
uint32_t* out = nullptr;
ids_size = ArrayUtils::or_scalar(ids, ids_size, strt_ids, strt_ids_size, &out);
delete[] strt_ids;
delete[] ids;
ids = out;
}
result_ids = ids;
result_ids_len = ids_size;
}
if(i == 0) {
filter_ids = result_ids;
filter_ids_length = result_ids_len;
} else {
uint32_t* filtered_results = nullptr;
filter_ids_length = ArrayUtils::and_scalar(filter_ids, filter_ids_length, result_ids,
result_ids_len, &filtered_results);
delete [] result_ids;
delete [] filter_ids;
filter_ids = filtered_results;
}
}
/*long long int timeMillis =
std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - begin).count();
LOG(INFO) << "Time taken for filtering: " << timeMillis << "ms";*/
*filter_ids_out = filter_ids;
return filter_ids_length;
}
void Index::eq_str_filter_plain(const uint32_t *strt_ids, size_t strt_ids_size,
const std::vector<art_leaf *>& query_suggestion, uint32_t *exact_strt_ids,
size_t& exact_strt_size) const {
std::vector<uint32_t*> leaf_to_indices;
for (art_leaf *token_leaf: query_suggestion) {
if(token_leaf == nullptr) {
leaf_to_indices.push_back(nullptr);
continue;
}
uint32_t *indices = new uint32_t[strt_ids_size];
token_leaf->values->ids.indexOf(strt_ids, strt_ids_size, indices);
leaf_to_indices.push_back(indices);
}
// e.g. First In First Out => hash([1, 2, 1, 3])
spp::sparse_hash_map<art_leaf*, size_t> leaf_to_id;
size_t next_id = 1;
size_t filter_hash = 1;
for(size_t leaf_index=0; leaf_index<query_suggestion.size(); leaf_index++) {
art_leaf* leaf = query_suggestion[leaf_index];
// key on the leaf itself (not the loop index) so repeated tokens share an id
if(leaf_to_id.count(leaf) == 0) {
leaf_to_id.emplace(leaf, next_id++);
}
size_t leaf_id = leaf_to_id[leaf];
filter_hash *= (1779033703 + 2*leaf_id*(leaf_index+1));
}
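// Below, the same style of positional hash is recomputed from each candidate
// document's actual token positions; a document is kept only when its hash
// equals filter_hash, which is taken as an exact match of the filter tokens
// in order.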
for(size_t strt_ids_index = 0; strt_ids_index < strt_ids_size; strt_ids_index++) {
std::unordered_map<size_t, std::vector<std::vector<uint16_t>>> array_token_positions;
populate_token_positions(query_suggestion, leaf_to_indices, strt_ids_index, array_token_positions);
// iterate array_token_positions and compute hash
for(const auto& kv: array_token_positions) {
const std::vector<std::vector<uint16_t>>& token_positions = kv.second;
size_t this_hash = 1;
for(size_t token_index = 0; token_index < token_positions.size(); token_index++) {
auto& positions = token_positions[token_index];
for(auto pos: positions) {
this_hash *= (1779033703 + 2*(token_index+1)*(pos+1));
}
}
if(this_hash == filter_hash) {
exact_strt_ids[exact_strt_size++] = strt_ids[strt_ids_index];
break;
}
}
}
}
void Index::run_search(search_args* search_params) {
search(search_params->q_include_tokens, search_params->q_exclude_tokens,
search_params->q_synonyms,
search_params->search_fields,
search_params->filters, search_params->facets, search_params->facet_query,
search_params->included_ids, search_params->excluded_ids,
search_params->sort_fields_std, search_params->num_typos,
search_params->topster, search_params->curated_topster,
search_params->per_page, search_params->page, search_params->token_order,
search_params->prefix, search_params->drop_tokens_threshold,
search_params->all_result_ids_len, search_params->groups_processed,
search_params->searched_queries,
search_params->raw_result_kvs, search_params->override_result_kvs,
search_params->typo_tokens_threshold,
search_params->group_limit, search_params->group_by_fields,
search_params->default_sorting_field);
}
void Index::collate_included_ids(const std::vector<std::string>& q_included_tokens,
const std::string & field, const uint8_t field_id,
const std::map<size_t, std::map<size_t, uint32_t>> & included_ids_map,
Topster* curated_topster,
std::vector<std::vector<art_leaf*>> & searched_queries) const {
if(included_ids_map.empty()) {
return;
}
// calculate match_score and add to topster independently
std::vector<art_leaf *> override_query;
for(const std::string& token: q_included_tokens) {
const size_t token_len = token.length();
std::vector<art_leaf*> leaves;
art_fuzzy_search(search_index.at(field), (const unsigned char *) token.c_str(), token_len,
0, 0, 1, token_ordering::MAX_SCORE, false, nullptr, 0, leaves);
if(!leaves.empty()) {
override_query.push_back(leaves[0]);
}
}
for(const auto& pos_ids: included_ids_map) {
const size_t outer_pos = pos_ids.first;
for(const auto& index_seq_id: pos_ids.second) {
uint32_t inner_pos = index_seq_id.first;
uint32_t seq_id = index_seq_id.second;
uint64_t distinct_id = outer_pos; // outer pos is the group distinct key
uint64_t match_score = (64000 - outer_pos - inner_pos); // both outer pos and inner pos inside group
// LOG(INFO) << "seq_id: " << seq_id << " - " << match_score;
int64_t scores[3];
scores[0] = match_score;
scores[1] = int64_t(1);
scores[2] = int64_t(1);
uint32_t token_bits = (uint32_t(1) << 31);
KV kv(field_id, searched_queries.size(), token_bits, seq_id, distinct_id, 0, scores);
curated_topster->add(&kv);
}
}
searched_queries.push_back(override_query);
}
void Index::concat_topster_ids(Topster* topster, spp::sparse_hash_map<uint64_t, std::vector<KV*>>& topster_ids) {
if(topster->distinct) {
for(auto &group_topster_entry: topster->group_kv_map) {
Topster* group_topster = group_topster_entry.second;
for(const auto& map_kv: group_topster->kv_map) {
topster_ids[map_kv.first].push_back(map_kv.second);
}
}
} else {
for(const auto& map_kv: topster->kv_map) {
//LOG(INFO) << "map_kv.second.key: " << map_kv.second->key;
//LOG(INFO) << "map_kv.first: " << map_kv.first;
topster_ids[map_kv.first].push_back(map_kv.second);
}
}
}
void Index::search(const std::vector<std::string>& q_include_tokens,
const std::vector<std::string>& q_exclude_tokens,
const std::vector<std::vector<std::string>>& q_synonyms,
const std::vector<search_field_t>& search_fields,
const std::vector<filter>& filters,
std::vector<facet>& facets, facet_query_t& facet_query,
const std::map<size_t, std::map<size_t, uint32_t>> & included_ids_map,
const std::vector<uint32_t> & excluded_ids,
const std::vector<sort_by> & sort_fields_std, const int num_typos,
Topster* topster,
Topster* curated_topster,
const size_t per_page, const size_t page, const token_ordering token_order,
const bool prefix, const size_t drop_tokens_threshold,
size_t & all_result_ids_len,
spp::sparse_hash_set<uint64_t>& groups_processed,
std::vector<std::vector<art_leaf*>>& searched_queries,
std::vector<std::vector<KV*>> & raw_result_kvs,
std::vector<std::vector<KV*>> & override_result_kvs,
const size_t typo_tokens_threshold,
const size_t group_limit,
const std::vector<std::string>& group_by_fields,
const std::string& default_sorting_field) const {
std::shared_lock lock(mutex);
//auto begin = std::chrono::high_resolution_clock::now();
// process the filters
uint32_t* filter_ids = nullptr;
uint32_t filter_ids_length = do_filtering(&filter_ids, filters);
// we will be removing all curated IDs from organic result ids before running topster
std::set<uint32_t> curated_ids;
std::vector<uint32_t> included_ids;
for(const auto& outer_pos_ids: included_ids_map) {
for(const auto& inner_pos_seq_id: outer_pos_ids.second) {
curated_ids.insert(inner_pos_seq_id.second);
included_ids.push_back(inner_pos_seq_id.second);
}
}
curated_ids.insert(excluded_ids.begin(), excluded_ids.end());
std::vector<uint32_t> curated_ids_sorted(curated_ids.begin(), curated_ids.end());
std::sort(curated_ids_sorted.begin(), curated_ids_sorted.end());
// Order of `fields` are used to sort results
//auto begin = std::chrono::high_resolution_clock::now();
uint32_t* all_result_ids = nullptr;
const size_t num_search_fields = std::min(search_fields.size(), (size_t) FIELD_LIMIT_NUM);
uint32_t *exclude_token_ids = nullptr;
size_t exclude_token_ids_size = 0;
// find documents that contain the excluded tokens to exclude them from results later
for(size_t i = 0; i < num_search_fields; i++) {
const std::string & field_name = search_fields[i].name;
for(const std::string& exclude_token: q_exclude_tokens) {
art_leaf* leaf = (art_leaf *) art_search(search_index.at(field_name),
(const unsigned char *) exclude_token.c_str(),
exclude_token.size() + 1);
if(leaf) {
uint32_t *ids = leaf->values->ids.uncompress();
uint32_t *exclude_token_ids_merged = nullptr;
exclude_token_ids_size = ArrayUtils::or_scalar(exclude_token_ids, exclude_token_ids_size, ids,
leaf->values->ids.getLength(),
&exclude_token_ids_merged);
delete[] ids;
delete[] exclude_token_ids;
exclude_token_ids = exclude_token_ids_merged;
}
}
}
std::vector<Topster*> ftopsters;
if(!q_include_tokens.empty() && q_include_tokens[0] == "*") {
const uint8_t field_id = (uint8_t)(FIELD_LIMIT_NUM - 0);
const std::string& field = search_fields[0].name;
// if a filter is not specified, use the sorting index to generate the list of all document ids
if(filters.empty()) {
if(default_sorting_field.empty()) {
filter_ids_length = seq_ids.getLength();
filter_ids = seq_ids.uncompress();
} else {
const spp::sparse_hash_map<uint32_t, int64_t> *kvs = sort_index.at(default_sorting_field);
filter_ids_length = kvs->size();
filter_ids = new uint32_t[filter_ids_length];
size_t i = 0;
for(const auto& kv: *kvs) {
filter_ids[i++] = kv.first;
}
// ids populated from hash map will not be sorted, but sorting is required for intersection & other ops
std::sort(filter_ids, filter_ids+filter_ids_length);
}
}
if(!curated_ids.empty()) {
uint32_t *excluded_result_ids = nullptr;
filter_ids_length = ArrayUtils::exclude_scalar(filter_ids, filter_ids_length, &curated_ids_sorted[0],
curated_ids_sorted.size(), &excluded_result_ids);
delete [] filter_ids;
filter_ids = excluded_result_ids;
}
// Exclude document IDs associated with excluded tokens from the result set
if(exclude_token_ids_size != 0) {
uint32_t *excluded_result_ids = nullptr;
filter_ids_length = ArrayUtils::exclude_scalar(filter_ids, filter_ids_length, exclude_token_ids,
exclude_token_ids_size, &excluded_result_ids);
delete[] filter_ids;
filter_ids = excluded_result_ids;
}
uint32_t token_bits = 255;
score_results(sort_fields_std, (uint16_t) searched_queries.size(), field_id, 0, topster, {},
groups_processed, filter_ids, filter_ids_length, group_limit, group_by_fields, token_bits);
collate_included_ids(q_include_tokens, field, field_id, included_ids_map, curated_topster, searched_queries);
all_result_ids_len = filter_ids_length;
all_result_ids = filter_ids;
filter_ids = nullptr;
} else {
// In multi-field searches, a record can be matched across different fields, so we use this for aggregation
spp::sparse_hash_map<uint64_t, std::vector<KV*>> topster_ids;
std::vector<token_t> q_include_pos_tokens;
for(size_t i=0; i < q_include_tokens.size(); i++) {
q_include_pos_tokens.push_back({i, q_include_tokens[i]});
}
std::vector<std::vector<token_t>> q_pos_synonyms;
for(const auto& q_syn_vec: q_synonyms) {
std::vector<token_t> q_pos_syn;
for(size_t i=0; i < q_syn_vec.size(); i++) {
q_pos_syn.push_back({i, q_syn_vec[i]});
}
q_pos_synonyms.emplace_back(q_pos_syn);
}
//begin = std::chrono::high_resolution_clock::now();
// non-wildcard
for(size_t i = 0; i < num_search_fields; i++) {
// proceed to query search only when no filters are provided or when filtering produces results
if(filters.empty() || filter_ids_length > 0) {
const uint8_t field_id = (uint8_t)(FIELD_LIMIT_NUM - i);
const std::string& field = search_fields[i].name;
std::vector<token_t> query_tokens = q_include_pos_tokens;
std::vector<token_t> search_tokens = q_include_pos_tokens;
size_t num_tokens_dropped = 0;
//LOG(INFO) << "searching field! " << field;
Topster* ftopster = new Topster(topster->MAX_SIZE, topster->distinct);
ftopsters.push_back(ftopster);
// Don't waste additional cycles for single field searches
Topster* actual_topster = (num_search_fields == 1) ? topster : ftopster;
// tracks the number of results found for the current field
size_t field_num_results = 0;
search_field(field_id, query_tokens, search_tokens, exclude_token_ids, exclude_token_ids_size, num_tokens_dropped,
field, filter_ids, filter_ids_length, curated_ids_sorted, facets, sort_fields_std,
num_typos, searched_queries, actual_topster, groups_processed, &all_result_ids, all_result_ids_len,
field_num_results, group_limit, group_by_fields, token_order, prefix,
drop_tokens_threshold, typo_tokens_threshold);
// do synonym based searches
for(const auto& syn_tokens: q_pos_synonyms) {
num_tokens_dropped = 0;
field_num_results = 0;
query_tokens = search_tokens = syn_tokens;
search_field(field_id, query_tokens, search_tokens, exclude_token_ids, exclude_token_ids_size, num_tokens_dropped,
field, filter_ids, filter_ids_length, curated_ids_sorted, facets, sort_fields_std,
num_typos, searched_queries, actual_topster, groups_processed, &all_result_ids, all_result_ids_len,
field_num_results, group_limit, group_by_fields, token_order, prefix,
drop_tokens_threshold, typo_tokens_threshold);
}
concat_topster_ids(ftopster, topster_ids);
collate_included_ids(q_include_tokens, field, field_id, included_ids_map, curated_topster, searched_queries);
//LOG(INFO) << "topster_ids.size: " << topster_ids.size();
}
}
for(auto& seq_id_kvs: topster_ids) {
const uint64_t seq_id = seq_id_kvs.first;
auto& kvs = seq_id_kvs.second; // each `kv` can be from a different field
std::sort(kvs.begin(), kvs.end(), Topster::is_greater);
// LOG(INFO) << "DOC ID: " << seq_id << ", score: " << kvs[0]->scores[kvs[0]->match_score_index];
// to calculate existing aggregate scores across best matching fields
spp::sparse_hash_map<uint8_t, KV*> existing_field_kvs;
for(const auto kv: kvs) {
existing_field_kvs.emplace(kv->field_id, kv);
}
uint32_t token_bits = (uint32_t(1) << 31); // topmost bit set to guarantee at least 1 bit set
uint64_t total_typos = 0, total_distances = 0;
uint64_t num_exact_matches = 0;
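// A sketch of the field-level match_score unpacking done below, assuming the
// packing produced by Match::get_match_score: bits 16..23 hold the number of
// tokens found, bits 8..15 hold (255 - typos) and bits 0..7 hold
// (100 - distance), so a larger raw score always ranks better.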
//LOG(INFO) << "Init pop count: " << __builtin_popcount(token_bits);
for(size_t i = 0; i < num_search_fields; i++) {
const uint8_t field_id = (uint8_t)(FIELD_LIMIT_NUM - i);
size_t weight = search_fields[i].weight;
//LOG(INFO) << "--- field index: " << i << ", weight: " << weight;
if(existing_field_kvs.count(field_id) != 0) {
// for existing field, we will simply sum field-wise weighted scores
token_bits |= existing_field_kvs[field_id]->token_bits;
//LOG(INFO) << "existing_field_kvs.count pop count: " << __builtin_popcount(token_bits);
int64_t match_score = existing_field_kvs[field_id]->scores[existing_field_kvs[field_id]->match_score_index];
total_distances += ((100 - (match_score & 0xFF)) + 1) * weight;
uint64_t tokens_found = ((match_score >> 16) & 0xFF);
int64_t field_typos = 255 - ((match_score >> 8) & 0xFF);
total_typos += (field_typos + 1) * weight;
if(field_typos == 0 && tokens_found == q_include_tokens.size()) {
num_exact_matches++;
}
/*LOG(INFO) << "seq_id: " << seq_id << ", total_typos: " << (255 - ((match_score >> 8) & 0xFF))
<< ", weighted typos: " << std::max<uint64_t>((255 - ((match_score >> 8) & 0xFF)), 1) * weight
<< ", total dist: " << (((match_score & 0xFF)))
<< ", weighted dist: " << std::max<uint64_t>((100 - (match_score & 0xFF)), 1) * weight;*/
continue;
}
const std::string& field = search_fields[i].name;
// compute approximate match score for this field from actual query
size_t words_present = 0;
for(size_t token_index=0; token_index < q_include_tokens.size(); token_index++) {
const auto& token = q_include_tokens[token_index];
std::vector<art_leaf*> leaves;
const bool prefix_search = prefix && (token_index == q_include_tokens.size()-1);
const size_t token_len = prefix_search ? token.length() : token.length() + 1;
art_fuzzy_search(search_index.at(field), (const unsigned char *) token.c_str(), token_len,
0, 0, 1, token_order, prefix_search, nullptr, 0, leaves);
if(leaves.empty()) {
continue;
}
uint32_t doc_index = leaves[0]->values->ids.indexOf(seq_id);
if (doc_index == leaves[0]->values->ids.getLength()) {
continue;
}
token_bits |= 1UL << token_index; // sets nth bit
//LOG(INFO) << "token_index: " << token_index << ", pop count: " << __builtin_popcount(token_bits);
words_present += 1;
/*if(!leaves.empty()) {
LOG(INFO) << "tok: " << leaves[0]->key;
}*/
}
if(words_present != 0) {
uint64_t match_score = Match::get_match_score(words_present, 0, 0);
total_distances += ((100 - (match_score & 0xFF)) + 1) * weight;
uint64_t tokens_found = ((match_score >> 16) & 0xFF);
uint64_t field_typos = 255 - ((match_score >> 8) & 0xFF);
total_typos += (field_typos + 1) * weight;
if(field_typos == 0 && tokens_found == q_include_tokens.size()) {
num_exact_matches++;
}
//LOG(INFO) << "seq_id: " << seq_id << ", total_typos: " << ((match_score >> 8) & 0xFF);
}
}
int64_t tokens_present = int64_t(__builtin_popcount(token_bits)) - 1;
total_typos = std::min<uint64_t>(255, total_typos);
total_distances = std::min<uint64_t>(100, total_distances);
uint64_t aggregated_score = (
(num_exact_matches << 24) |
(tokens_present << 16) |
((255 - total_typos) << 8) |
(100 - total_distances)
);
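// Layout of the aggregated score packed above (a sketch): bits 24..31 carry
// num_exact_matches, bits 16..23 tokens_present, bits 8..15 (255 - total_typos)
// and bits 0..7 (100 - total_distances). For example, num_exact_matches=2,
// tokens_present=3, total_typos=1 and total_distances=5 packs to
// (2 << 24) | (3 << 16) | (254 << 8) | 95.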
/*LOG(INFO) << "seq id: " << seq_id << ", tokens_present: " << tokens_present
<< ", total_distances: " << total_distances << ", total_typos: " << total_typos
<< ", aggregated_score: " << aggregated_score << ", token_bits: " << token_bits;*/
kvs[0]->scores[kvs[0]->match_score_index] = aggregated_score;
topster->add(kvs[0]);
}
}
//LOG(INFO) << "topster size: " << topster->size;
delete [] exclude_token_ids;
do_facets(facets, facet_query, group_limit, group_by_fields, all_result_ids, all_result_ids_len);
do_facets(facets, facet_query, group_limit, group_by_fields, &included_ids[0], included_ids.size());
all_result_ids_len += curated_topster->size;
delete [] filter_ids;
delete [] all_result_ids;
for(Topster* ftopster: ftopsters) {
delete ftopster;
}
//LOG(INFO) << "all_result_ids_len " << all_result_ids_len << " for index " << name;
//long long int timeMillis = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - begin).count();
//LOG(INFO) << "Time taken for result calc: " << timeMillis << "ms";
}
/*
1. Split the query into tokens
2. Outer loop will generate bounded cartesian product with costs for each token
3. Inner loop will iterate on each token with associated cost
4. Cartesian product of the results of the token searches will be used to form search phrases
(cartesian product adapted from: http://stackoverflow.com/a/31169617/131050)
5. Intersect the lists to find docs that match each phrase
6. Sort the docs based on some ranking criteria
*/
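// For example (a sketch): for the query "harry potter" with num_typos=1, the
// outer loop walks the cost vectors [0, 0], [0, 1], [1, 0], [1, 1]; at cost 1
// the fuzzy ART search may admit candidates such as "porter" for "potter",
// and the candidate posting lists are intersected and scored via
// search_candidates().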
void Index::search_field(const uint8_t & field_id,
std::vector<token_t>& query_tokens,
std::vector<token_t>& search_tokens,
const uint32_t* exclude_token_ids,
size_t exclude_token_ids_size,
size_t& num_tokens_dropped,
const std::string & field,
uint32_t *filter_ids, size_t filter_ids_length,
const std::vector<uint32_t>& curated_ids,
std::vector<facet> & facets, const std::vector<sort_by> & sort_fields, const int num_typos,
std::vector<std::vector<art_leaf*>> & searched_queries,
Topster* topster, spp::sparse_hash_set<uint64_t>& groups_processed,
uint32_t** all_result_ids, size_t & all_result_ids_len, size_t& field_num_results,
const size_t group_limit, const std::vector<std::string>& group_by_fields,
const token_ordering token_order, const bool prefix,
const size_t drop_tokens_threshold, const size_t typo_tokens_threshold) const {
const size_t max_cost = (num_typos < 0 || num_typos > 2) ? 2 : num_typos;
// To prevent us from doing ART search repeatedly as we iterate through possible corrections
spp::sparse_hash_map<std::string, std::vector<art_leaf*>> token_cost_cache;
std::vector<std::vector<int>> token_to_costs;
for(size_t stoken_index=0; stoken_index < search_tokens.size(); stoken_index++) {
const std::string& token = search_tokens[stoken_index].value;
std::vector<int> all_costs;
// This ensures that we don't end up doing a cost of 1 for a single char etc.
int bounded_cost = get_bounded_typo_cost(max_cost, token.length());
for(int cost = 0; cost <= bounded_cost; cost++) {
all_costs.push_back(cost);
}
token_to_costs.push_back(all_costs);
}
// stores candidates for each token, i.e. the i-th entry holds all matching tokens found at the cost chosen for token i
std::vector<token_candidates> token_candidates_vec;
const long long combination_limit = 10;
auto product = []( long long a, std::vector<int>& b ) { return a*b.size(); };
long long n = 0;
long long int N = std::accumulate(token_to_costs.begin(), token_to_costs.end(), 1LL, product);
while(n < N && n < combination_limit) {
// Outer loop generates combinations of [cost to max_cost] for each token
// For e.g. for a 3-token query: [0, 0, 0], [0, 0, 1], [0, 1, 0] etc. (the last token's cost varies fastest)
std::vector<uint32_t> costs(token_to_costs.size());
ldiv_t q { n, 0 };
for(long long i = (token_to_costs.size() - 1); 0 <= i ; --i ) {
q = ldiv(q.quot, token_to_costs[i].size());
costs[i] = token_to_costs[i][q.rem];
}
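// The ldiv chain above decodes `n` as a mixed-radix number whose i-th digit
// has base token_to_costs[i].size(). A sketch: with three tokens that each
// allow costs {0, 1}, n = 0..7 maps to [0,0,0], [0,0,1], [0,1,0], [0,1,1],
// [1,0,0], ... with the last token's cost varying fastest.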
token_candidates_vec.clear();
size_t token_index = 0;
while(token_index < search_tokens.size()) {
// For each token, look up the generated cost for this iteration and search using that cost
const std::string& token = search_tokens[token_index].value;
const std::string token_cost_hash = token + std::to_string(costs[token_index]);
std::vector<art_leaf*> leaves;
//LOG(INFO) << "\nSearching for field: " << field << ", token:" << token << " - cost: " << costs[token_index];
if(token_cost_cache.count(token_cost_hash) != 0) {
leaves = token_cost_cache[token_cost_hash];
} else {
// prefix matching should apply only to the last token
const bool prefix_search = prefix && (token_index == search_tokens.size()-1);
const size_t token_len = prefix_search ? token.length() : token.length() + 1;
// fewer candidates are needed for filtered searches, since we already pick only tokens with results
const int max_candidates = (filter_ids_length == 0) ? 10 : 3;
art_fuzzy_search(search_index.at(field), (const unsigned char *) token.c_str(), token_len,
costs[token_index], costs[token_index], max_candidates, token_order, prefix_search,
filter_ids, filter_ids_length, leaves);
if(!leaves.empty()) {
token_cost_cache.emplace(token_cost_hash, leaves);
}
}
if(!leaves.empty()) {
//log_leaves(costs[token_index], token, leaves);
token_candidates_vec.push_back(token_candidates{search_tokens[token_index], costs[token_index], leaves});
} else {
// No result at `cost = costs[token_index]`. Remove `cost` for token and re-do combinations
auto it = std::find(token_to_costs[token_index].begin(), token_to_costs[token_index].end(), costs[token_index]);
if(it != token_to_costs[token_index].end()) {
token_to_costs[token_index].erase(it);
// when no more costs are left for this token
if(token_to_costs[token_index].empty()) {
// we can try to drop the token and search with remaining tokens
token_to_costs.erase(token_to_costs.begin()+token_index);
search_tokens.erase(search_tokens.begin()+token_index);
query_tokens.erase(query_tokens.begin()+token_index);
costs.erase(costs.begin()+token_index);
}
}
// Continue the outer loop with the new cost combination
n = -1;
N = std::accumulate(token_to_costs.begin(), token_to_costs.end(), 1LL, product);
goto resume_typo_loop;
}
token_index++;
}
if(!token_candidates_vec.empty()) {
// If at least one token is found, go ahead and search for candidates
search_candidates(field_id, filter_ids, filter_ids_length, exclude_token_ids, exclude_token_ids_size,
curated_ids, sort_fields, token_candidates_vec, searched_queries, topster,
groups_processed, all_result_ids, all_result_ids_len, field_num_results,
typo_tokens_threshold, group_limit, group_by_fields);
}
resume_typo_loop:
if(field_num_results >= drop_tokens_threshold || field_num_results >= typo_tokens_threshold) {
// if either threshold is breached, we are done
return ;
}
n++;
}
// When at least one token from the query is available
if(!query_tokens.empty() && num_tokens_dropped < query_tokens.size()) {
// Drop tokens from right until (len/2 + 1), and then from left until (len/2 + 1)
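// e.g. for 4 query tokens [a, b, c, d], successive calls search with
// [a, b, c], then [a, b], then switch to dropping from the left:
// [b, c, d], then [c, d].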
std::vector<token_t> truncated_tokens;
num_tokens_dropped++;
size_t mid_index = (query_tokens.size() / 2);
if(num_tokens_dropped <= mid_index) {
// drop from right
size_t end_index = (query_tokens.size() - 1) - num_tokens_dropped;
for(size_t i=0; i <= end_index; i++) {
truncated_tokens.push_back({query_tokens[i].position, query_tokens[i].value});
}
} else {
// drop from left
size_t start_index = (num_tokens_dropped - mid_index);
for(size_t i=start_index; i<query_tokens.size(); i++) {
truncated_tokens.push_back({query_tokens[i].position, query_tokens[i].value});
}
}
return search_field(field_id, query_tokens, truncated_tokens, exclude_token_ids, exclude_token_ids_size,
num_tokens_dropped, field, filter_ids, filter_ids_length, curated_ids, facets,
sort_fields, num_typos, searched_queries, topster, groups_processed, all_result_ids,
all_result_ids_len, field_num_results, group_limit, group_by_fields,
token_order, prefix);
}
}
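// Caps the typo cost for very short tokens: 1-char and 2-char tokens allow at
// most (token_len - 1) edits (i.e. 0 and 1 respectively), so a single
// character is never fuzzy-matched at a higher cost.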
int Index::get_bounded_typo_cost(const size_t max_cost, const size_t token_len) {
int bounded_cost = max_cost;
if(token_len > 0 && max_cost >= token_len && (token_len == 1 || token_len == 2)) {
bounded_cost = token_len - 1;
}
return bounded_cost;
}
void Index::log_leaves(const int cost, const std::string &token, const std::vector<art_leaf *> &leaves) const {
LOG(INFO) << "Index: " << name << ", token: " << token << ", cost: " << cost;
for(size_t i=0; i < leaves.size(); i++) {
std::string key((char*)leaves[i]->key, leaves[i]->key_len);
LOG(INFO) << key << " - " << leaves[i]->values->ids.getLength();
LOG(INFO) << "frequency: " << leaves[i]->values->ids.getLength() << ", max_score: " << leaves[i]->max_score;
/*for(auto j=0; j<leaves[i]->values->ids.getLength(); j++) {
LOG(INFO) << "id: " << leaves[i]->values->ids.at(j);
}*/
}
}
void Index::score_results(const std::vector<sort_by> & sort_fields, const uint16_t & query_index,
const uint8_t & field_id, const uint32_t total_cost, Topster* topster,
const std::vector<art_leaf *> &query_suggestion,
spp::sparse_hash_set<uint64_t>& groups_processed,
const uint32_t *result_ids, const size_t result_size,
const size_t group_limit, const std::vector<std::string>& group_by_fields,
uint32_t token_bits) const {
std::vector<uint32_t *> leaf_to_indices;
for (art_leaf *token_leaf: query_suggestion) {
uint32_t *indices = new uint32_t[result_size];
token_leaf->values->ids.indexOf(result_ids, result_size, indices);
leaf_to_indices.push_back(indices);
}
Match single_token_match = Match(1, 0);
const uint64_t single_token_match_score = single_token_match.get_match_score(total_cost);
int sort_order[3]; // 1 or -1 based on DESC or ASC respectively
spp::sparse_hash_map<uint32_t, int64_t>* field_values[3];
spp::sparse_hash_map<uint32_t, int64_t> geopoint_distances[3];
spp::sparse_hash_map<uint32_t, int64_t> text_match_sentinel_value, seq_id_sentinel_value;
spp::sparse_hash_map<uint32_t, int64_t> *TEXT_MATCH_SENTINEL = &text_match_sentinel_value;
spp::sparse_hash_map<uint32_t, int64_t> *SEQ_ID_SENTINEL = &seq_id_sentinel_value;
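// Sentinel pattern: the text-match and seq-id "virtual" sort fields have no
// backing sort_index map, so the addresses of two empty maps act as markers
// that the per-document scoring below compares against.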
for (size_t i = 0; i < sort_fields.size(); i++) {
sort_order[i] = 1;
if (sort_fields[i].order == sort_field_const::asc) {
sort_order[i] = -1;
}
if (sort_fields[i].name == sort_field_const::text_match) {
field_values[i] = TEXT_MATCH_SENTINEL;
} else if (sort_fields[i].name == sort_field_const::seq_id) {
field_values[i] = SEQ_ID_SENTINEL;
} else if (sort_schema.at(sort_fields[i].name).is_geopoint()) {
// we have to populate distances that will be used for match scoring
spp::sparse_hash_map<uint32_t, int64_t> *geopoints = sort_index.at(sort_fields[i].name);
for (size_t rindex = 0; rindex < result_size; rindex++) {
const uint32_t seq_id = result_ids[rindex];
auto it = geopoints->find(seq_id);
int64_t dist = (it == geopoints->end()) ? INT32_MAX : h3Distance(sort_fields[i].geopoint, it->second);
geopoint_distances[i].emplace(seq_id, dist);
}
field_values[i] = &geopoint_distances[i];
} else {
field_values[i] = sort_index.at(sort_fields[i].name);
}
}
//auto begin = std::chrono::high_resolution_clock::now();
for (size_t i = 0; i < result_size; i++) {
const uint32_t seq_id = result_ids[i];
uint64_t match_score = 0;
if (query_suggestion.size() <= 1) {
match_score = single_token_match_score;
} else {
std::unordered_map<size_t, std::vector<std::vector<uint16_t>>> array_token_positions;
populate_token_positions(query_suggestion, leaf_to_indices, i, array_token_positions);
for (const auto& kv: array_token_positions) {
const std::vector<std::vector<uint16_t>> &token_positions = kv.second;
if (token_positions.empty()) {
continue;
}
const Match &match = Match(seq_id, token_positions, false);
uint64_t this_match_score = match.get_match_score(total_cost);
match_score += this_match_score;
/*std::ostringstream os;
os << name << ", total_cost: " << (255 - total_cost)
<< ", words_present: " << match.words_present
<< ", match_score: " << match_score
<< ", match.distance: " << match.distance
<< ", seq_id: " << seq_id << std::endl;
LOG(INFO) << os.str();*/
}
}
const int64_t default_score = INT64_MIN; // to handle field that doesn't exist in document (e.g. optional)
int64_t scores[3] = {0};
size_t match_score_index = 0;
// unrolled instead of looping over sort_fields
if (sort_fields.size() > 0) {
if (field_values[0] == TEXT_MATCH_SENTINEL) {
scores[0] = int64_t(match_score);
match_score_index = 0;
} else if (field_values[0] == SEQ_ID_SENTINEL) {
scores[0] = seq_id;
} else {
auto it = field_values[0]->find(seq_id);
scores[0] = (it == field_values[0]->end()) ? default_score : it->second;
}
if (sort_order[0] == -1) {
scores[0] = -scores[0];
}
}
if(sort_fields.size() > 1) {
if (field_values[1] == TEXT_MATCH_SENTINEL) {
scores[1] = int64_t(match_score);
match_score_index = 1;
} else if (field_values[1] == SEQ_ID_SENTINEL) {
scores[1] = seq_id;
} else {
auto it = field_values[1]->find(seq_id);
scores[1] = (it == field_values[1]->end()) ? default_score : it->second;
}
if (sort_order[1] == -1) {
scores[1] = -scores[1];
}
}
if(sort_fields.size() > 2) {
if(field_values[2] == TEXT_MATCH_SENTINEL) {
scores[2] = int64_t(match_score);
match_score_index = 2;
} else if (field_values[2] == SEQ_ID_SENTINEL) {
scores[2] = seq_id;
} else {
auto it = field_values[2]->find(seq_id);
scores[2] = (it == field_values[2]->end()) ? default_score : it->second;
}
if(sort_order[2] == -1) {
scores[2] = -scores[2];
}
}
uint64_t distinct_id = seq_id;
if(group_limit != 0) {
distinct_id = get_distinct_id(group_by_fields, seq_id);
groups_processed.emplace(distinct_id);
}
//LOG(INFO) << "Seq id: " << seq_id << ", match_score: " << match_score;
KV kv(field_id, query_index, token_bits, seq_id, distinct_id, match_score_index, scores);
topster->add(&kv);
}
//long long int timeNanos = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - begin).count();
//LOG(INFO) << "Time taken for results iteration: " << timeNanos << "ms";
for(uint32_t* leaf_indices: leaf_to_indices) {
delete [] leaf_indices;
}
}
// pre-filter group_by_fields such that we can avoid the find() check
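// e.g. when grouping on {brand, size}, each document's facet hashes for both
// fields are folded into one 64-bit id via hash_combine, so documents that
// share every group_by value collapse into the same group (hash collisions
// across distinct value sets are theoretically possible, but not handled).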
uint64_t Index::get_distinct_id(const std::vector<std::string>& group_by_fields,
const uint32_t seq_id) const {
uint64_t distinct_id = 1; // some constant initial value
// calculate hash from group_by_fields
for(const auto& field: group_by_fields) {
const auto& field_facet_mapping_it = facet_index_v3.find(field);
if(field_facet_mapping_it == facet_index_v3.end()) {
continue;
}
const auto& field_facet_mapping = field_facet_mapping_it->second;
const auto& facet_hashes_it = field_facet_mapping->find(seq_id);
if(facet_hashes_it == field_facet_mapping->end()) {
continue;
}
const auto& facet_hashes = facet_hashes_it->second;
for(size_t i = 0; i < facet_hashes.size(); i++) {
distinct_id = hash_combine(distinct_id, facet_hashes.hashes[i]);
}
}
return distinct_id;
}
void Index::populate_token_positions(const std::vector<art_leaf *>& query_suggestion,
const std::vector<uint32_t*>& leaf_to_indices,
const size_t result_index,
std::unordered_map<size_t, std::vector<std::vector<uint16_t>>>& array_token_positions) {
if(query_suggestion.empty()) {
return ;
}
// array_token_positions:
// for every element in a potential array, for every token in query suggestion, get the positions
for(size_t i = 0; i < query_suggestion.size(); i++) {
const art_leaf* token_leaf = query_suggestion[i];
uint32_t doc_index = leaf_to_indices[i][result_index];
/*LOG(INFO) << "doc_id: " << token_leaf->values->ids.at(doc_index) << ", token_leaf->values->ids.getLength(): "
<< token_leaf->values->ids.getLength();*/
// it's possible for a query token to not appear in a resulting document
if(doc_index == token_leaf->values->ids.getLength()) {
continue;
}
// Array offset storage format:
// a) the last element is the array_index b) the second and third-last elements hold the largest offset
// (the largest offset is repeated to indicate the end of offsets for a given array index)
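// A sketch of the decoding done below: offsets [1, 4, 4, 0, 2, 2, 1] yield
// positions {1, 4} for array index 0 (the repeated 4 terminates that run and
// is followed by the array index 0) and positions {2} for array index 1.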
/*uint32_t* offsets = token_leaf->values->offsets.uncompress();
for(size_t ii=0; ii < token_leaf->values->offsets.getLength(); ii++) {
LOG(INFO) << "offset: " << offsets[ii];
}
uint32_t* offset_indices = token_leaf->values->offset_index.uncompress();
for(size_t ii=0; ii < token_leaf->values->offset_index.getLength(); ii++) {
LOG(INFO) << "offset index: " << offset_indices[ii];
}
LOG(INFO) << "token_leaf->values->offsets.getLength(): " << token_leaf->values->offsets.getLength();*/
uint32_t start_offset = token_leaf->values->offset_index.at(doc_index);
uint32_t end_offset = (doc_index == token_leaf->values->ids.getLength() - 1) ?
token_leaf->values->offsets.getLength() :
token_leaf->values->offset_index.at(doc_index+1);
std::vector<uint16_t> positions;
int prev_pos = -1;
while(start_offset < end_offset) {
int pos = token_leaf->values->offsets.at(start_offset);
start_offset++;
if(pos == prev_pos) { // indicates end of array index
if(!positions.empty()) {
size_t array_index = (size_t) token_leaf->values->offsets.at(start_offset);
array_token_positions[array_index].push_back(positions);
positions.clear();
}
start_offset++; // skip current value which is the array index
prev_pos = -1;
continue;
}
prev_pos = pos;
positions.push_back((uint16_t)pos);
}
if(!positions.empty()) {
// for plain string fields
array_token_positions[0].push_back(positions);
}
}
}
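// A sketch of the combination walk in next_suggestion below: with candidate
// lists of sizes {2, 3}, n = 0..5 decodes via repeated ldiv (the first list
// varying fastest) to the index pairs (0,0), (1,0), (0,1), (1,1), (0,2), (1,2).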
inline uint32_t Index::next_suggestion(const std::vector<token_candidates> &token_candidates_vec,
long long int n,
std::vector<art_leaf *>& actual_query_suggestion,
std::vector<art_leaf *>& query_suggestion,
uint32_t& token_bits) {
uint32_t total_cost = 0;
// generate the next combination from `token_leaves` and store it in `query_suggestion`
ldiv_t q { n, 0 };
for(long long i = 0 ; i < (long long) token_candidates_vec.size(); i++) {
size_t token_size = token_candidates_vec[i].token.value.size();
q = ldiv(q.quot, token_candidates_vec[i].candidates.size());
actual_query_suggestion[i] = token_candidates_vec[i].candidates[q.rem];
query_suggestion[i] = token_candidates_vec[i].candidates[q.rem];
total_cost += token_candidates_vec[i].cost;
token_bits |= 1UL << token_candidates_vec[i].token.position; // sets n-th bit
if(actual_query_suggestion[i]->key_len != token_size+1) {
total_cost++;
}
}
// Sort ascending based on matched documents for each token for faster intersection.
// However, this causes the token order to deviate from the original query's order.
std::sort(query_suggestion.begin(), query_suggestion.end(), [](const art_leaf* left, const art_leaf* right) {
return left->values->ids.getLength() < right->values->ids.getLength();
});
return total_cost;
}
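// Removes the entries at `indices_sorted` from a compressed offset index and
// shifts later entries left so the offsets stay contiguous. A sketch: for
// offset_index = [0, 3, 7, 10] and indices_sorted = [1], the entry 3 is
// dropped and subsequent entries shrink by the removed gap (7 - 3 = 4),
// yielding [0, 3, 6].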
void Index::remove_and_shift_offset_index(sorted_array& offset_index, const uint32_t* indices_sorted,
const uint32_t indices_length) {
uint32_t *curr_array = offset_index.uncompress();
uint32_t *new_array = new uint32_t[offset_index.getLength()];
new_array[0] = 0;
uint32_t new_index = 0;
uint32_t curr_index = 0;
uint32_t indices_counter = 0;
uint32_t shift_value = 0;
while(curr_index < offset_index.getLength()) {
if(indices_counter < indices_length && curr_index >= indices_sorted[indices_counter]) {
// skip copying
if(curr_index == indices_sorted[indices_counter]) {
curr_index++;
const uint32_t diff = curr_index == offset_index.getLength() ?
0 : (offset_index.at(curr_index) - offset_index.at(curr_index-1));
shift_value += diff;
}
indices_counter++;
} else {
new_array[new_index++] = curr_array[curr_index++] - shift_value;
}
}
offset_index.load(new_array, new_index);
delete[] curr_array;
delete[] new_array;
}
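// Removes a document from every in-memory structure: string fields are
// re-tokenized so that each token's posting list (ids + offsets) can be
// pruned, numeric fields are removed from their num_tree, and facet and sort
// entries are erased by seq_id.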
Option<uint32_t> Index::remove(const uint32_t seq_id, const nlohmann::json & document) {
std::unique_lock lock(mutex);
for(auto it = document.begin(); it != document.end(); ++it) {
const std::string& field_name = it.key();
const auto& search_field_it = search_schema.find(field_name);
if(search_field_it == search_schema.end()) {
continue;
}
const auto& search_field = search_field_it->second;
if(!search_field.index) {
continue;
}
// Go through all the field names and find the keys+values so that they can be removed from in-memory index
if(search_field.type == field_types::STRING_ARRAY || search_field.type == field_types::STRING) {
std::vector<std::string> tokens;
tokenize_doc_field(document, search_field, tokens);
for(auto & token: tokens) {
const unsigned char *key = (const unsigned char *) token.c_str();
int key_len = (int) (token.length() + 1);
art_leaf* leaf = (art_leaf *) art_search(search_index.at(field_name), key, key_len);
if(leaf != nullptr) {
uint32_t doc_index = leaf->values->ids.indexOf(seq_id);
if (doc_index == leaf->values->ids.getLength()) {
// not found - happens when 2 tokens repeat in a field, e.g "is it or is is not?"
continue;
}
uint32_t start_offset = leaf->values->offset_index.at(doc_index);
uint32_t end_offset = (doc_index == leaf->values->ids.getLength() - 1) ?
leaf->values->offsets.getLength() :
leaf->values->offset_index.at(doc_index + 1);
uint32_t doc_indices[1] = {doc_index};
remove_and_shift_offset_index(leaf->values->offset_index, doc_indices, 1);
leaf->values->offsets.remove_index(start_offset, end_offset);
leaf->values->ids.remove_value(seq_id);
/*len = leaf->values->offset_index.getLength();
for(auto i=0; i<len; i++) {
LOG(INFO) << "i: " << i << ", val: " << leaf->values->offset_index.at(i);
}
LOG(INFO) << "----";*/
if (leaf->values->ids.getLength() == 0) {
art_values *values = (art_values *) art_delete(search_index.at(field_name), key, key_len);
delete values;
}
}
}
} else if(search_field.is_int32()) {
const std::vector<int32_t>& values = search_field.is_single_integer() ?
std::vector<int32_t>{document[field_name].get<int32_t>()} :
document[field_name].get<std::vector<int32_t>>();
for(int32_t value: values) {
num_tree_t* num_tree = numerical_index.at(field_name);
num_tree->remove(value, seq_id);
}
} else if(search_field.is_int64()) {
const std::vector<int64_t>& values = search_field.is_single_integer() ?
std::vector<int64_t>{document[field_name].get<int64_t>()} :
document[field_name].get<std::vector<int64_t>>();
for(int64_t value: values) {
num_tree_t* num_tree = numerical_index.at(field_name);
num_tree->remove(value, seq_id);
}
} else if(search_field.is_float()) {
const std::vector<float>& values = search_field.is_single_float() ?
std::vector<float>{document[field_name].get<float>()} :
document[field_name].get<std::vector<float>>();
for(float value: values) {
num_tree_t* num_tree = numerical_index.at(field_name);
int64_t fintval = float_to_in64_t(value);
num_tree->remove(fintval, seq_id);
}
} else if(search_field.is_bool()) {
const std::vector<bool>& values = search_field.is_single_bool() ?
std::vector<bool>{document[field_name].get<bool>()} :
document[field_name].get<std::vector<bool>>();
for(bool value: values) {
num_tree_t* num_tree = numerical_index.at(field_name);
int64_t bool_int64 = value ? 1 : 0;
num_tree->remove(bool_int64, seq_id);
}
}
// remove facets
const auto& field_facets_it = facet_index_v3.find(field_name);
if(field_facets_it != facet_index_v3.end()) {
const auto& fvalues_it = field_facets_it->second->find(seq_id);
if(fvalues_it != field_facets_it->second->end()) {
field_facets_it->second->erase(fvalues_it);
}
}
// remove sort field
if(sort_index.count(field_name) != 0) {
sort_index[field_name]->erase(seq_id);
}
}
if(seq_ids.contains(seq_id)) {
seq_ids.remove_value(seq_id);
}
return Option<uint32_t>(seq_id);
}
void Index::tokenize_doc_field(const nlohmann::json& document, const field& search_field,
std::vector<std::string>& tokens) {
const std::string& field_name = search_field.name;
if(search_field.type == field_types::STRING) {
Tokenizer(document[field_name], true, true, !search_field.is_string()).tokenize(tokens);
} else if(search_field.type == field_types::STRING_ARRAY) {
const std::vector<std::string>& values = document[field_name].get<std::vector<std::string>>();
for(const std::string & value: values) {
Tokenizer(value, true, true, !search_field.is_string()).tokenize(tokens);
}
}
}
art_leaf* Index::get_token_leaf(const std::string & field_name, const unsigned char* token, uint32_t token_len) {
std::shared_lock lock(mutex);
const art_tree *t = search_index.at(field_name);
return (art_leaf*) art_search(t, token, (int) token_len);
}
const spp::sparse_hash_map<std::string, art_tree *> &Index::_get_search_index() const {
return search_index;
}
const spp::sparse_hash_map<std::string, num_tree_t*>& Index::_get_numerical_index() const {
return numerical_index;
}
void Index::refresh_schemas(const std::vector<field>& new_fields) {
std::unique_lock lock(mutex);
for(const auto & new_field: new_fields) {
search_schema.emplace(new_field.name, new_field);
sort_schema.emplace(new_field.name, new_field);
if(search_index.count(new_field.name) == 0) {
if(new_field.is_string()) {
art_tree *t = new art_tree;
art_tree_init(t);
search_index.emplace(new_field.name, t);
} else {
num_tree_t* num_tree = new num_tree_t;
numerical_index.emplace(new_field.name, num_tree);
}
}
if(new_field.is_facet()) {
facet_schema.emplace(new_field.name, new_field);
spp::sparse_hash_map<uint32_t, facet_hash_values_t> *doc_to_values = new spp::sparse_hash_map<uint32_t, facet_hash_values_t>();
facet_index_v3.emplace(new_field.name, doc_to_values);
// initialize for non-string facet fields
if(!new_field.is_string()) {
art_tree *ft = new art_tree;
art_tree_init(ft);
search_index.emplace(new_field.faceted_name(), ft);
}
}
if(sort_index.count(new_field.name) == 0) {
spp::sparse_hash_map<uint32_t, int64_t> * doc_to_score = new spp::sparse_hash_map<uint32_t, int64_t>();
sort_index.emplace(new_field.name, doc_to_score);
}
}
}
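// The coercion helpers below share one contract: REJECT fails outright; DROP
// removes the offending value (failing only when the field is non-optional);
// COERCE_OR_DROP falls back to dropping when coercion is impossible; and
// COERCE_OR_REJECT fails when coercion is impossible.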
Option<uint32_t> Index::coerce_string(const DIRTY_VALUES& dirty_values, const std::string& fallback_field_type,
const field& a_field, nlohmann::json &document,
const std::string &field_name, nlohmann::json::iterator& array_iter,
bool is_array, bool& array_ele_erased) {
std::string suffix = is_array ? "an array of" : "a";
auto& item = is_array ? array_iter.value() : document[field_name];
if(dirty_values == DIRTY_VALUES::REJECT) {
return Option<>(400, "Field `" + field_name + "` must be " + suffix + " string.");
}
if(dirty_values == DIRTY_VALUES::DROP) {
if(!a_field.optional) {
return Option<>(400, "Field `" + field_name + "` must be " + suffix + " string.");
}
if(!is_array) {
document.erase(field_name);
} else {
array_iter = document[field_name].erase(array_iter);
array_ele_erased = true;
}
return Option<uint32_t>(200);
}
// we will try to coerce the value to a string
if (item.is_number_integer()) {
item = std::to_string((int64_t)item);
}
else if(item.is_number_float()) {
item = std::to_string((float)item);
}
else if(item.is_boolean()) {
item = item == true ? "true" : "false";
}
else {
if(dirty_values == DIRTY_VALUES::COERCE_OR_DROP) {
if(!a_field.optional) {
return Option<>(400, "Field `" + field_name + "` must be " + suffix + " string.");
}
if(!is_array) {
document.erase(field_name);
} else {
array_iter = document[field_name].erase(array_iter);
array_ele_erased = true;
}
} else {
// COERCE_OR_REJECT / non-optional + DROP
return Option<>(400, "Field `" + field_name + "` must be " + suffix + " string.");
}
}
return Option<>(200);
}
Option<uint32_t> Index::coerce_int32_t(const DIRTY_VALUES& dirty_values, const field& a_field, nlohmann::json &document,
const std::string &field_name,
nlohmann::json::iterator& array_iter, bool is_array, bool& array_ele_erased) {
std::string suffix = is_array ? "an array of" : "an";
auto& item = is_array ? array_iter.value() : document[field_name];
if(dirty_values == DIRTY_VALUES::REJECT) {
return Option<>(400, "Field `" + field_name + "` must be " + suffix + " int32.");
}
if(dirty_values == DIRTY_VALUES::DROP) {
if(!a_field.optional) {
return Option<>(400, "Field `" + field_name + "` must be " + suffix + " int32.");
}
if(!is_array) {
document.erase(field_name);
} else {
array_iter = document[field_name].erase(array_iter);
array_ele_erased = true;
}
return Option<uint32_t>(200);
}
// try to coerce the value into an integer
if(item.is_number_float()) {
item = static_cast<int32_t>(item.get<float>());
}
else if(item.is_boolean()) {
item = item == true ? 1 : 0;
}
else if(item.is_string() && StringUtils::is_int32_t(item)) {
item = std::atol(item.get<std::string>().c_str());
}
else {
if(dirty_values == DIRTY_VALUES::COERCE_OR_DROP) {
if(!a_field.optional) {
return Option<>(400, "Field `" + field_name + "` must be " + suffix + " int32.");
}
if(!is_array) {
document.erase(field_name);
} else {
array_iter = document[field_name].erase(array_iter);
array_ele_erased = true;
}
} else {
// COERCE_OR_REJECT / non-optional + DROP
return Option<>(400, "Field `" + field_name + "` must be " + suffix + " int32.");
}
}
return Option<uint32_t>(200);
}
Option<uint32_t> Index::coerce_int64_t(const DIRTY_VALUES& dirty_values, const field& a_field, nlohmann::json &document,
const std::string &field_name,
nlohmann::json::iterator& array_iter, bool is_array, bool& array_ele_erased) {
std::string suffix = is_array ? "an array of" : "an";
auto& item = is_array ? array_iter.value() : document[field_name];
if(dirty_values == DIRTY_VALUES::REJECT) {
return Option<>(400, "Field `" + field_name + "` must be " + suffix + " int64.");
}
if(dirty_values == DIRTY_VALUES::DROP) {
if(!a_field.optional) {
return Option<>(400, "Field `" + field_name + "` must be " + suffix + " int64.");
}
if(!is_array) {
document.erase(field_name);
} else {
array_iter = document[field_name].erase(array_iter);
array_ele_erased = true;
}
return Option<uint32_t>(200);
}
// try to coerce the value into an integer
if(item.is_number_float()) {
item = static_cast<int64_t>(item.get<float>());
}
else if(item.is_boolean()) {
item = item == true ? 1 : 0;
}
else if(item.is_string() && StringUtils::is_int64_t(item)) {
item = std::atoll(item.get<std::string>().c_str());
}
else {
if(dirty_values == DIRTY_VALUES::COERCE_OR_DROP) {
if(!a_field.optional) {
return Option<>(400, "Field `" + field_name + "` must be " + suffix + " int64.");
}
if(!is_array) {
document.erase(field_name);
} else {
array_iter = document[field_name].erase(array_iter);
array_ele_erased = true;
}
} else {
// COERCE_OR_REJECT / non-optional + DROP
return Option<>(400, "Field `" + field_name + "` must be " + suffix + " int64.");
}
}
return Option<uint32_t>(200);
}
Option<uint32_t> Index::coerce_bool(const DIRTY_VALUES& dirty_values, const field& a_field, nlohmann::json &document,
const std::string &field_name,
nlohmann::json::iterator& array_iter, bool is_array, bool& array_ele_erased) {
std::string suffix = is_array ? "a array of" : "a";
auto& item = is_array ? array_iter.value() : document[field_name];
if(dirty_values == DIRTY_VALUES::REJECT) {
return Option<>(400, "Field `" + field_name + "` must be " + suffix + " bool.");
}
if(dirty_values == DIRTY_VALUES::DROP) {
if(!a_field.optional) {
return Option<>(400, "Field `" + field_name + "` must be " + suffix + " bool.");
}
if(!is_array) {
document.erase(field_name);
} else {
array_iter = document[field_name].erase(array_iter);
array_ele_erased = true;
}
return Option<uint32_t>(200);
}
// try to coerce the value into a bool
if (item.is_number_integer() &&
(item.get<int64_t>() == 1 || item.get<int64_t>() == 0)) {
item = item.get<int64_t>() == 1;
}
else if(item.is_string()) {
std::string str_val = item.get<std::string>();
StringUtils::tolowercase(str_val);
if(str_val == "true") {
item = true;
return Option<uint32_t>(200);
} else if(str_val == "false") {
item = false;
return Option<uint32_t>(200);
} else {
return Option<>(400, "Field `" + field_name + "` must be " + suffix + " bool.");
}
}
else {
if(dirty_values == DIRTY_VALUES::COERCE_OR_DROP) {
if(!a_field.optional) {
return Option<>(400, "Field `" + field_name + "` must be " + suffix + " bool.");
}
if(!is_array) {
document.erase(field_name);
} else {
array_iter = document[field_name].erase(array_iter);
array_ele_erased = true;
}
} else {
// COERCE_OR_REJECT / non-optional + DROP
return Option<>(400, "Field `" + field_name + "` must be " + suffix + " bool.");
}
}
return Option<uint32_t>(200);
}
Option<uint32_t> Index::coerce_float(const DIRTY_VALUES& dirty_values, const field& a_field, nlohmann::json &document,
const std::string &field_name,
nlohmann::json::iterator& array_iter, bool is_array, bool& array_ele_erased) {
std::string suffix = is_array ? "a array of" : "a";
auto& item = is_array ? array_iter.value() : document[field_name];
if(dirty_values == DIRTY_VALUES::REJECT) {
return Option<>(400, "Field `" + field_name + "` must be " + suffix + " float.");
}
if(dirty_values == DIRTY_VALUES::DROP) {
if(!a_field.optional) {
return Option<>(400, "Field `" + field_name + "` must be " + suffix + " float.");
}
if(!is_array) {
document.erase(field_name);
} else {
array_iter = document[field_name].erase(array_iter);
array_ele_erased = true;
}
return Option<uint32_t>(200);
}
// try to coerce the value into a float
if(item.is_string() && StringUtils::is_float(item)) {
item = std::atof(item.get<std::string>().c_str());
}
else if(item.is_boolean()) {
item = item == true ? 1.0 : 0.0;
}
else {
if(dirty_values == DIRTY_VALUES::COERCE_OR_DROP) {
if(!a_field.optional) {
return Option<>(400, "Field `" + field_name + "` must be " + suffix + " float.");
}
if(!is_array) {
document.erase(field_name);
} else {
array_iter = document[field_name].erase(array_iter);
array_ele_erased = true;
}
} else {
// COERCE_OR_REJECT / non-optional + DROP
return Option<>(400, "Field `" + field_name + "` must be " + suffix + " float.");
}
}
return Option<uint32_t>(200);
}
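// Computes the effective new document and the set of fields that must be
// de-indexed for a partial update. A sketch: old_doc = {a:1, b:2} with an
// update document of {b:3, c:4} produces new_doc = {a:1, b:3, c:4} and
// del_doc = {b:2}, since only keys present in both need delete + reindex.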
void Index::get_doc_changes(const nlohmann::json &document, nlohmann::json &old_doc, nlohmann::json &new_doc,
nlohmann::json &del_doc) {
for(auto it = old_doc.begin(); it != old_doc.end(); ++it) {
new_doc[it.key()] = it.value();
}
for(auto it = document.begin(); it != document.end(); ++it) {
// adds new key or overrides existing key from `old_doc`
new_doc[it.key()] = it.value();
// if the update document contains a field that exists in old, we record that (for delete + reindex)
bool field_exists_in_old_doc = (old_doc.count(it.key()) != 0);
if(field_exists_in_old_doc) {
// key exists in the stored doc, so it must be reindexed
// we need to check for this because a field can be optional
del_doc[it.key()] = old_doc[it.key()];
}
}
}
// https://stackoverflow.com/questions/924171/geo-fencing-point-inside-outside-polygon
// NOTE: polygon and point should have been transformed with `transform_for_180th_meridian`
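// A sketch of the ray-casting rule used below: count how many polygon edges a
// ray cast from the point crosses; an odd count means the point lies inside
// the polygon, an even count means it lies outside.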
bool Index::is_point_in_polygon(const Geofence& poly, const GeoCoord &point) {
int i, j;
bool c = false;
for (i = 0, j = poly.numVerts - 1; i < poly.numVerts; j = i++) {
if ((((poly.verts[i].lat <= point.lat) && (point.lat < poly.verts[j].lat))
|| ((poly.verts[j].lat <= point.lat) && (point.lat < poly.verts[i].lat)))
&& (point.lon < (poly.verts[j].lon - poly.verts[i].lon) * (point.lat - poly.verts[i].lat)
/ (poly.verts[j].lat - poly.verts[i].lat) + poly.verts[i].lon)) {
c = !c;
}
}
return c;
}
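// When a polygon straddles the 180th meridian, its longitudes jump between
// roughly +180 and -180. A sketch: vertices at lon 179 and -179 give
// |minLon - maxLon| > 180, so the offset becomes 360 and -179 shifts to 181,
// making the polygon contiguous for the point-in-polygon test.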
double Index::transform_for_180th_meridian(Geofence &poly) {
double offset = 0.0;
double maxLon = -1000, minLon = 1000;
for(int v=0; v < poly.numVerts; v++) {
if(poly.verts[v].lon < minLon) {
minLon = poly.verts[v].lon;
}
if(poly.verts[v].lon > maxLon) {
maxLon = poly.verts[v].lon;
}
if(std::abs(minLon - maxLon) > 180) {
offset = 360.0;
}
}
int i, j;
for (i = 0, j = poly.numVerts - 1; i < poly.numVerts; j = i++) {
if (poly.verts[i].lon < 0.0) {
poly.verts[i].lon += offset;
}
if (poly.verts[j].lon < 0.0) {
poly.verts[j].lon += offset;
}
}
return offset;
}
void Index::transform_for_180th_meridian(GeoCoord &point, double offset) {
point.lon = point.lon < 0.0 ? point.lon + offset : point.lon;
}