#include "index.h"
|
|
|
|
#include <numeric>
|
|
#include <chrono>
|
|
#include <set>
|
|
#include <unordered_map>
|
|
#include <array_utils.h>
|
|
#include <match_score.h>
|
|
#include <string_utils.h>
|
|
#include <art.h>
|
|
#include <tokenizer.h>
|
|
#include "logger.h"
|
|
|
|
Index::Index(const std::string name, const std::unordered_map<std::string, field> & search_schema,
             std::map<std::string, field> facet_schema, std::unordered_map<std::string, field> sort_schema):
        name(name), search_schema(search_schema), facet_schema(facet_schema), sort_schema(sort_schema) {

    for(const auto & pair: search_schema) {
        art_tree *t = new art_tree;
        art_tree_init(t);
        search_index.emplace(pair.first, t);

        // initialize for non-string facet fields
        if(pair.second.facet && !pair.second.is_string()) {
            art_tree *ft = new art_tree;
            art_tree_init(ft);
            search_index.emplace(pair.second.faceted_name(), ft);
        }
    }

    for(const auto & pair: sort_schema) {
        spp::sparse_hash_map<uint32_t, int64_t> * doc_to_score = new spp::sparse_hash_map<uint32_t, int64_t>();
        sort_index.emplace(pair.first, doc_to_score);
    }

    num_documents = 0;

    ready = false;
    processed = false;
    terminate = false;
}

Index::~Index() {
    for(auto & name_tree: search_index) {
        art_tree_destroy(name_tree.second);
        delete name_tree.second;
        name_tree.second = nullptr;
    }

    search_index.clear();

    for(auto & name_map: sort_index) {
        delete name_map.second;
        name_map.second = nullptr;
    }

    sort_index.clear();
}

int64_t Index::get_points_from_doc(const nlohmann::json &document, const std::string & default_sorting_field) {
    int64_t points = 0;

    if(!default_sorting_field.empty()) {
        if(document[default_sorting_field].is_number_float()) {
            // serialize float to an integer and reverse the inverted range
            float n = document[default_sorting_field];
            memcpy(&points, &n, sizeof(int32_t));
            points ^= ((points >> (std::numeric_limits<int32_t>::digits - 1)) | INT32_MIN);
            points = -1 * (INT32_MAX - points);
        } else {
            points = document[default_sorting_field];
        }
    }

    return points;
}
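
// NOTE (illustrative, assuming IEEE-754 floats): the float branch above and float_to_in64_t() below
// rely on the same trick — reinterpret a float's bits as a signed integer and flip the magnitude bits
// of negative values, so that integer ordering matches float ordering. For example:
//   -2.0f -> bits 0xC0000000 -> int32 -1073741824 -> XOR INT32_MAX -> -1073741825
//   -1.0f -> bits 0xBF800000 -> int32 -1082130432 -> XOR INT32_MAX -> -1065353217
//    1.0f -> bits 0x3F800000 -> int32  1065353216 (non-negative, left unchanged)
// and indeed -1073741825 < -1065353217 < 1065353216, mirroring -2.0 < -1.0 < 1.0.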

int64_t Index::float_to_in64_t(float f) {
    // https://stackoverflow.com/questions/60530255/convert-float-to-int64-t-while-preserving-ordering
    int32_t i;
    memcpy(&i, &f, sizeof i);
    if (i < 0) {
        i ^= INT32_MAX;
    }
    return i;
}

Option<uint32_t> Index::index_in_memory(const nlohmann::json &document, uint32_t seq_id,
                                        const std::string & default_sorting_field, bool is_update) {

    int64_t points = 0;

    if(is_update && document.count(default_sorting_field) == 0) {
        points = sort_index[default_sorting_field]->at(seq_id);
    } else {
        points = get_points_from_doc(document, default_sorting_field);
    }

    std::unordered_map<std::string, size_t> facet_to_id;
    size_t i_facet = 0;
    for(const auto & facet: facet_schema) {
        facet_to_id[facet.first] = i_facet;
        i_facet++;
    }

    // initialize the facet index, since it will also be updated during search indexing
    // even if a field is optional, a facet position will be available in the vector for that field
    // NOTE: use of `emplace()` means that we will not replace existing facet values
    std::vector<std::vector<uint64_t>> values(facet_schema.size());
    facet_index_v2.emplace(seq_id, values);

    // assumes that validation has already been done
    for(const auto& field_pair: search_schema) {
        const std::string & field_name = field_pair.first;

        if((field_pair.second.optional || is_update) && document.count(field_name) == 0) {
            continue;
        }

        int facet_id = -1;
        if(facet_schema.count(field_name) != 0) {
            facet_id = facet_to_id[field_name];
        }

        // a non-string faceted field should also be indexed as a faceted string field
        if(field_pair.second.facet && !field_pair.second.is_string()) {
            art_tree *t = search_index.at(field_pair.second.faceted_name());

            if(field_pair.second.is_array()) {
                std::vector<std::string> strings;

                if(field_pair.second.type == field_types::INT32_ARRAY) {
                    for(int32_t value: document[field_name]) {
                        strings.push_back(std::to_string(value));
                    }
                } else if(field_pair.second.type == field_types::INT64_ARRAY) {
                    for(int64_t value: document[field_name]) {
                        strings.push_back(std::to_string(value));
                    }
                } else if(field_pair.second.type == field_types::FLOAT_ARRAY) {
                    for(float value: document[field_name]) {
                        strings.push_back(std::to_string(value));
                    }
                } else if(field_pair.second.type == field_types::BOOL_ARRAY) {
                    for(bool value: document[field_name]) {
                        strings.push_back(std::to_string(value));
                    }
                }

                index_string_array_field(strings, points, t, seq_id, facet_id, field_pair.second);
            } else {
                std::string text;

                if(field_pair.second.type == field_types::INT32) {
                    text = std::to_string(document[field_name].get<int32_t>());
                } else if(field_pair.second.type == field_types::INT64) {
                    text = std::to_string(document[field_name].get<int64_t>());
                } else if(field_pair.second.type == field_types::FLOAT) {
                    text = std::to_string(document[field_name].get<float>());
                } else if(field_pair.second.type == field_types::BOOL) {
                    text = std::to_string(document[field_name].get<bool>());
                }

                index_string_field(text, points, t, seq_id, facet_id, field_pair.second);
            }
        }

        art_tree *t = search_index.at(field_name);

        if(field_pair.second.type == field_types::STRING) {
            const std::string & text = document[field_name];
            index_string_field(text, points, t, seq_id, facet_id, field_pair.second);
        } else if(field_pair.second.type == field_types::INT32) {
            int32_t value = document[field_name];
            index_int32_field(value, points, t, seq_id);
        } else if(field_pair.second.type == field_types::INT64) {
            int64_t value = document[field_name];
            index_int64_field(value, points, t, seq_id);
        } else if(field_pair.second.type == field_types::FLOAT) {
            float value = document[field_name];
            index_float_field(value, points, t, seq_id);
        } else if(field_pair.second.type == field_types::BOOL) {
            bool value = document[field_name];
            index_bool_field(value, points, t, seq_id);
        } else if(field_pair.second.type == field_types::STRING_ARRAY) {
            std::vector<std::string> strings = document[field_name];
            index_string_array_field(strings, points, t, seq_id, facet_id, field_pair.second);
        } else if(field_pair.second.type == field_types::INT32_ARRAY) {
            std::vector<int32_t> values = document[field_name];
            index_int32_array_field(values, points, t, seq_id);
        } else if(field_pair.second.type == field_types::INT64_ARRAY) {
            std::vector<int64_t> values = document[field_name];
            index_int64_array_field(values, points, t, seq_id);
        } else if(field_pair.second.type == field_types::FLOAT_ARRAY) {
            std::vector<float> values = document[field_name];
            index_float_array_field(values, points, t, seq_id);
        } else if(field_pair.second.type == field_types::BOOL_ARRAY) {
            std::vector<bool> values = document[field_name];
            index_bool_array_field(values, points, t, seq_id);
        }

        // add numerical values automatically into the sort index
        if(field_pair.second.type == field_types::INT32 || field_pair.second.type == field_types::INT64 ||
           field_pair.second.type == field_types::FLOAT || field_pair.second.type == field_types::BOOL) {
            spp::sparse_hash_map<uint32_t, int64_t> *doc_to_score = sort_index.at(field_pair.first);

            if(field_pair.second.is_integer()) {
                doc_to_score->emplace(seq_id, document[field_pair.first].get<int64_t>());
            } else if(field_pair.second.is_float()) {
                int64_t ifloat = float_to_in64_t(document[field_pair.first].get<float>());
                doc_to_score->emplace(seq_id, ifloat);
            } else if(field_pair.second.is_bool()) {
                doc_to_score->emplace(seq_id, (int64_t) document[field_pair.first].get<bool>());
            }
        }
    }

    num_documents += 1;
    return Option<>(201);
}

Option<uint32_t> Index::validate_index_in_memory(const nlohmann::json &document, uint32_t seq_id,
                                                 const std::string & default_sorting_field,
                                                 const std::unordered_map<std::string, field> & search_schema,
                                                 const std::map<std::string, field> & facet_schema,
                                                 bool is_update) {

    bool has_default_sort_field = (document.count(default_sorting_field) != 0);

    if(!has_default_sort_field && !is_update) {
        return Option<>(400, "Field `" + default_sorting_field + "` has been declared as a default sorting field, "
                             "but is not found in the document.");
    }

    if(has_default_sort_field &&
       !document[default_sorting_field].is_number_integer() && !document[default_sorting_field].is_number_float()) {
        return Option<>(400, "Default sorting field `" + default_sorting_field + "` must be a single valued numerical field.");
    }

    if(has_default_sort_field && search_schema.at(default_sorting_field).is_single_float() &&
       document[default_sorting_field].get<float>() > std::numeric_limits<float>::max()) {
        return Option<>(400, "Default sorting field `" + default_sorting_field + "` exceeds maximum value of a float.");
    }

    for(const auto& field_pair: search_schema) {
        const std::string& field_name = field_pair.first;

        if((field_pair.second.optional || is_update) && document.count(field_name) == 0) {
            continue;
        }

        if(document.count(field_name) == 0) {
            return Option<>(400, "Field `" + field_name + "` has been declared in the schema, "
                                 "but is not found in the document.");
        }

        if(field_pair.second.type == field_types::STRING) {
            if(!document[field_name].is_string()) {
                return Option<>(400, "Field `" + field_name + "` must be a string.");
            }
        } else if(field_pair.second.type == field_types::INT32) {
            if(!document[field_name].is_number_integer()) {
                return Option<>(400, "Field `" + field_name + "` must be an int32.");
            }

            if(document[field_name].get<int64_t>() > INT32_MAX) {
                return Option<>(400, "Field `" + field_name + "` exceeds maximum value of int32.");
            }
        } else if(field_pair.second.type == field_types::INT64) {
            if(!document[field_name].is_number_integer()) {
                return Option<>(400, "Field `" + field_name + "` must be an int64.");
            }
        } else if(field_pair.second.type == field_types::FLOAT) {
            if(!document[field_name].is_number()) { // allows an integer to be passed to a float field
                return Option<>(400, "Field `" + field_name + "` must be a float.");
            }
        } else if(field_pair.second.type == field_types::BOOL) {
            if(!document[field_name].is_boolean()) {
                return Option<>(400, "Field `" + field_name + "` must be a bool.");
            }
        } else if(field_pair.second.type == field_types::STRING_ARRAY) {
            if(!document[field_name].is_array()) {
                return Option<>(400, "Field `" + field_name + "` must be a string array.");
            }

            if(document[field_name].size() > 0 && !document[field_name][0].is_string()) {
                return Option<>(400, "Field `" + field_name + "` must be a string array.");
            }
        } else if(field_pair.second.type == field_types::INT32_ARRAY) {
            if(!document[field_name].is_array()) {
                return Option<>(400, "Field `" + field_name + "` must be an int32 array.");
            }

            if(document[field_name].size() > 0 && !document[field_name][0].is_number_integer()) {
                return Option<>(400, "Field `" + field_name + "` must be an int32 array.");
            }
        } else if(field_pair.second.type == field_types::INT64_ARRAY) {
            if(!document[field_name].is_array()) {
                return Option<>(400, "Field `" + field_name + "` must be an int64 array.");
            }

            if(document[field_name].size() > 0 && !document[field_name][0].is_number_integer()) {
                return Option<>(400, "Field `" + field_name + "` must be an int64 array.");
            }
        } else if(field_pair.second.type == field_types::FLOAT_ARRAY) {
            if(!document[field_name].is_array()) {
                return Option<>(400, "Field `" + field_name + "` must be a float array.");
            }

            if(document[field_name].size() > 0 && !document[field_name][0].is_number()) {
                // allows an integer to be passed to a float array field
                return Option<>(400, "Field `" + field_name + "` must be a float array.");
            }
        } else if(field_pair.second.type == field_types::BOOL_ARRAY) {
            if(!document[field_name].is_array()) {
                return Option<>(400, "Field `" + field_name + "` must be a bool array.");
            }

            if(document[field_name].size() > 0 && !document[field_name][0].is_boolean()) {
                return Option<>(400, "Field `" + field_name + "` must be a bool array.");
            }
        }
    }

    return Option<>(200);
}

void Index::scrub_reindex_doc(nlohmann::json& update_doc, nlohmann::json& del_doc, nlohmann::json& old_doc) {
    auto it = del_doc.cbegin();
    while(it != del_doc.cend()) {
        const std::string& field_name = it.key();
        const auto& search_field_it = search_schema.find(field_name);
        if(search_field_it == search_schema.end()) {
            ++it;
            continue;
        }

        const auto& search_field = search_field_it->second;

        // go through all the field names and find the keys + values so that they can be removed from the in-memory index
        std::vector<std::string> reindex_tokens;
        std::vector<std::string> old_tokens;
        tokenize_doc_field(update_doc, search_field, reindex_tokens);
        tokenize_doc_field(old_doc, search_field, old_tokens);

        if(old_tokens.size() != reindex_tokens.size()) {
            ++it;
            continue;
        }

        bool exact_match = true;

        for(size_t i = 0; i < reindex_tokens.size(); i++) {
            const std::string& reindex_val = reindex_tokens[i];
            const std::string& old_val = old_tokens[i];
            if(reindex_val != old_val) {
                exact_match = false;
                break;
            }
        }

        if(exact_match) {
            it = del_doc.erase(it);
            update_doc.erase(field_name);
        } else {
            ++it;
        }
    }
}
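
// NOTE (illustrative): if the old document has {"title": "red shoe"} and an update sets "title"
// to the same "red shoe", the token streams match exactly, so "title" is erased from both del_doc
// and update_doc — the field is neither removed from nor re-added to the in-memory index, and only
// genuinely changed fields pay the delete + reinsert cost.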

size_t Index::batch_memory_index(Index *index, std::vector<index_record> & iter_batch,
                                 const std::string & default_sorting_field,
                                 const std::unordered_map<std::string, field> & search_schema,
                                 const std::map<std::string, field> & facet_schema) {

    size_t num_indexed = 0;

    for(auto & index_rec: iter_batch) {
        if(!index_rec.indexed.ok()) {
            // some records could have been invalidated upstream
            continue;
        }

        if(index_rec.operation != DELETE) {
            Option<uint32_t> validation_op = validate_index_in_memory(index_rec.doc, index_rec.seq_id,
                                                                      default_sorting_field,
                                                                      search_schema, facet_schema, index_rec.is_update);

            if(!validation_op.ok()) {
                index_rec.index_failure(validation_op.code(), validation_op.error());
                continue;
            }

            if(index_rec.is_update) {
                // scrub string fields to reduce delete ops
                index->scrub_reindex_doc(index_rec.doc, index_rec.del_doc, index_rec.old_doc);
                index->remove(index_rec.seq_id, index_rec.del_doc);
            }

            Option<uint32_t> index_mem_op = index->index_in_memory(index_rec.doc, index_rec.seq_id,
                                                                   default_sorting_field, index_rec.is_update);
            if(!index_mem_op.ok()) {
                index->index_in_memory(index_rec.del_doc, index_rec.seq_id, default_sorting_field, true);
                index_rec.index_failure(index_mem_op.code(), index_mem_op.error());
                continue;
            }

            index_rec.index_success();

            if(!index_rec.is_update) {
                num_indexed++;
            }
        }
    }

    return num_indexed;
}

void Index::insert_doc(const int64_t score, art_tree *t, uint32_t seq_id,
                       const std::unordered_map<std::string, std::vector<uint32_t>> &token_to_offsets) const {
    for(auto & kv: token_to_offsets) {
        art_document art_doc;
        art_doc.id = seq_id;
        art_doc.score = score;
        art_doc.offsets_len = (uint32_t) kv.second.size();
        art_doc.offsets = new uint32_t[kv.second.size()];

        uint32_t num_hits = 0;

        const unsigned char *key = (const unsigned char *) kv.first.c_str();
        int key_len = (int) kv.first.length() + 1; // for the terminating \0 char

        art_leaf* leaf = (art_leaf *) art_search(t, key, key_len);
        if(leaf != NULL) {
            num_hits = leaf->values->ids.getLength();
        }

        num_hits += 1;

        for(size_t i = 0; i < kv.second.size(); i++) {
            art_doc.offsets[i] = kv.second[i];
        }

        //LOG(INFO) << "key: " << key << ", art_doc.id: " << art_doc.id;
        art_insert(t, key, key_len, &art_doc, num_hits);
        delete [] art_doc.offsets;
        art_doc.offsets = nullptr;
    }
}
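
// NOTE (illustrative): `token_to_offsets` maps each token to the positions at which it occurs.
// For the text "the quick the" it would hold {"the": [0, 2], "quick": [1]}; each token is inserted
// into the ART with its offsets, and `num_hits` (the token's existing document count + 1, as
// suggested by the lookup above) is passed along with the insert.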

void Index::index_int32_field(const int32_t value, int64_t score, art_tree *t, uint32_t seq_id) const {
    const int KEY_LEN = 8;
    unsigned char key[KEY_LEN];

    encode_int32(value, key);

    uint32_t num_hits = 0;
    art_leaf* leaf = (art_leaf *) art_search(t, key, KEY_LEN);
    if(leaf != NULL) {
        num_hits = leaf->values->ids.getLength();
    }

    num_hits += 1;

    art_document art_doc;
    art_doc.id = seq_id;
    art_doc.score = score;
    art_doc.offsets_len = 0;
    art_doc.offsets = nullptr;

    art_insert(t, key, KEY_LEN, &art_doc, num_hits);
}

void Index::index_int64_field(const int64_t value, int64_t score, art_tree *t, uint32_t seq_id) const {
    const int KEY_LEN = 8;
    unsigned char key[KEY_LEN];

    encode_int64(value, key);

    uint32_t num_hits = 0;
    art_leaf* leaf = (art_leaf *) art_search(t, key, KEY_LEN);
    if(leaf != NULL) {
        num_hits = leaf->values->ids.getLength();
    }

    num_hits += 1;

    art_document art_doc;
    art_doc.id = seq_id;
    art_doc.score = score;
    art_doc.offsets_len = 0;
    art_doc.offsets = nullptr;

    art_insert(t, key, KEY_LEN, &art_doc, num_hits);
}

void Index::index_bool_field(const bool value, const int64_t score, art_tree *t, uint32_t seq_id) const {
    const int KEY_LEN = 1;
    unsigned char key[KEY_LEN];
    key[0] = value ? '1' : '0';

    uint32_t num_hits = 0;
    art_leaf* leaf = (art_leaf *) art_search(t, key, KEY_LEN);
    if(leaf != NULL) {
        num_hits = leaf->values->ids.getLength();
    }

    num_hits += 1;

    art_document art_doc;
    art_doc.id = seq_id;
    art_doc.score = score;
    art_doc.offsets_len = 0;
    art_doc.offsets = nullptr;

    art_insert(t, key, KEY_LEN, &art_doc, num_hits);
}

void Index::index_float_field(const float value, int64_t score, art_tree *t, uint32_t seq_id) const {
    const int KEY_LEN = 8;
    unsigned char key[KEY_LEN];

    encode_float(value, key);

    uint32_t num_hits = 0;
    art_leaf* leaf = (art_leaf *) art_search(t, key, KEY_LEN);
    if(leaf != NULL) {
        num_hits = leaf->values->ids.getLength();
    }

    num_hits += 1;

    art_document art_doc;
    art_doc.id = seq_id;
    art_doc.score = score;
    art_doc.offsets_len = 0;
    art_doc.offsets = nullptr;

    art_insert(t, key, KEY_LEN, &art_doc, num_hits);
}

uint64_t Index::facet_token_hash(const field & a_field, const std::string &token) {
    // for integer/float fields, use their native values
    uint64_t hash = 0;

    if(a_field.is_float()) {
        float f = std::stof(token);
        reinterpret_cast<float&>(hash) = f;  // store the raw bits, without loss of precision
    } else if(a_field.is_integer() || a_field.is_bool()) {
        hash = atoll(token.c_str());
    } else {
        // string field
        hash = StringUtils::hash_wy(token.c_str(), token.size());
    }

    return hash;
}
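
// NOTE (illustrative): for numeric facet fields the "hash" is really the raw value (the bit pattern
// for floats, the value itself for ints/bools), which is what lets compute_facet_stats() further
// down recover min/max/sum directly from these 64-bit entries. Only string facets are actually
// hashed (via wyhash), so string facet counts rest on the usual low-collision assumption.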

void Index::index_string_field(const std::string & text, const int64_t score, art_tree *t,
                               uint32_t seq_id, int facet_id, const field & a_field) {
    std::unordered_map<std::string, std::vector<uint32_t>> token_to_offsets;

    Tokenizer tokenizer(text, true, true, !a_field.is_string());
    std::string token;
    size_t token_index = 0;

    while(tokenizer.next(token, token_index)) {
        if(token.empty()) {
            continue;
        }

        if(facet_id >= 0) {
            uint64_t hash = facet_token_hash(a_field, token);
            facet_index_v2[seq_id][facet_id].push_back(hash);
        }

        token_to_offsets[token].push_back(token_index);
    }

    /*if(seq_id == 0) {
        LOG(INFO) << "field name: " << a_field.name;
    }*/

    insert_doc(score, t, seq_id, token_to_offsets);

    if(facet_id >= 0) {
        facet_index_v2[seq_id][facet_id].shrink_to_fit();
    }
}

void Index::index_string_array_field(const std::vector<std::string> & strings, const int64_t score, art_tree *t,
                                     uint32_t seq_id, int facet_id, const field & a_field) {
    std::unordered_map<std::string, std::vector<uint32_t>> token_positions;

    for(size_t array_index = 0; array_index < strings.size(); array_index++) {
        const std::string& str = strings[array_index];
        std::set<std::string> token_set;  // required to deal with repeating tokens

        Tokenizer tokenizer(str, true, true, !a_field.is_string());
        std::string token;
        size_t token_index = 0;

        // iterate and append offset positions
        while(tokenizer.next(token, token_index)) {
            if(token.empty()) {
                continue;
            }

            if(facet_id >= 0) {
                uint64_t hash = facet_token_hash(a_field, token);
                facet_index_v2[seq_id][facet_id].push_back(hash);
                //printf("indexing %.*s - %llu\n", token.size(), token.c_str(), hash);
            }

            token_positions[token].push_back(token_index);
            token_set.insert(token);
        }

        if(facet_id >= 0) {
            facet_index_v2[seq_id][facet_id].push_back(FACET_ARRAY_DELIMETER); // as a delimiter
        }

        // repeat the last element to indicate the end of offsets for this array index
        for(auto & the_token: token_set) {
            token_positions[the_token].push_back(token_positions[the_token].back());
        }

        // iterate and append this array index to all tokens
        for(auto & the_token: token_set) {
            token_positions[the_token].push_back(array_index);
        }
    }

    if(facet_id >= 0) {
        facet_index_v2[seq_id][facet_id].shrink_to_fit();
    }

    insert_doc(score, t, seq_id, token_positions);
}
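
// NOTE (illustrative) on the offsets layout produced above: for each array element a token appears
// in, its position list carries the token's offsets, then the last offset repeated (end-of-element
// marker), then the array index. For strings = {"foo bar", "bar"}:
//   "foo" -> [0, 0, 0]            (offset 0, repeated 0, array index 0)
//   "bar" -> [1, 1, 0, 0, 0, 1]   (offset 1, repeated 1, array index 0; offset 0, repeated 0, array index 1)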

void Index::index_int32_array_field(const std::vector<int32_t> & values, const int64_t score, art_tree *t,
                                    uint32_t seq_id) const {
    for(const int32_t value: values) {
        index_int32_field(value, score, t, seq_id);
    }
}

void Index::index_int64_array_field(const std::vector<int64_t> & values, const int64_t score, art_tree *t,
                                    uint32_t seq_id) const {
    for(const int64_t value: values) {
        index_int64_field(value, score, t, seq_id);
    }
}

void Index::index_bool_array_field(const std::vector<bool> & values, const int64_t score, art_tree *t,
                                   uint32_t seq_id) const {
    for(const bool value: values) {
        index_bool_field(value, score, t, seq_id);
    }
}

void Index::index_float_array_field(const std::vector<float> & values, const int64_t score, art_tree *t,
                                    uint32_t seq_id) const {
    for(const float value: values) {
        index_float_field(value, score, t, seq_id);
    }
}

void Index::compute_facet_stats(facet &a_facet, uint64_t raw_value, const std::string & field_type) {
    if(field_type == field_types::INT32 || field_type == field_types::INT32_ARRAY) {
        int32_t val = raw_value;
        if(val < a_facet.stats.fvmin) {
            a_facet.stats.fvmin = val;
        }
        if(val > a_facet.stats.fvmax) {
            a_facet.stats.fvmax = val;
        }
        a_facet.stats.fvsum += val;
        a_facet.stats.fvcount++;
    } else if(field_type == field_types::INT64 || field_type == field_types::INT64_ARRAY) {
        int64_t val = raw_value;
        if(val < a_facet.stats.fvmin) {
            a_facet.stats.fvmin = val;
        }
        if(val > a_facet.stats.fvmax) {
            a_facet.stats.fvmax = val;
        }
        a_facet.stats.fvsum += val;
        a_facet.stats.fvcount++;
    } else if(field_type == field_types::FLOAT || field_type == field_types::FLOAT_ARRAY) {
        float val = reinterpret_cast<float&>(raw_value);
        if(val < a_facet.stats.fvmin) {
            a_facet.stats.fvmin = val;
        }
        if(val > a_facet.stats.fvmax) {
            a_facet.stats.fvmax = val;
        }
        a_facet.stats.fvsum += val;
        a_facet.stats.fvcount++;
    }
}

void Index::do_facets(std::vector<facet> & facets, facet_query_t & facet_query,
                      const uint32_t* result_ids, size_t results_size) {

    std::unordered_map<std::string, size_t> facet_to_index;
    get_facet_to_index(facet_to_index);

    struct facet_info_t {
        // facet hash => token position in the query
        spp::sparse_hash_map<uint64_t, token_pos_cost_t> fhash_qtoken_pos;

        bool use_facet_query = false;
        bool should_compute_stats = false;
        field facet_field{"", "", false};
    };

    std::vector<facet_info_t> facet_infos(facets.size());

    for(size_t findex = 0; findex < facets.size(); findex++) {
        const auto& a_facet = facets[findex];

        facet_infos[findex].use_facet_query = false;

        const field &facet_field = facet_schema.at(a_facet.field_name);
        facet_infos[findex].facet_field = facet_field;

        facet_infos[findex].should_compute_stats = (facet_field.type != field_types::STRING &&
                                                    facet_field.type != field_types::BOOL &&
                                                    facet_field.type != field_types::STRING_ARRAY &&
                                                    facet_field.type != field_types::BOOL_ARRAY);

        if(a_facet.field_name == facet_query.field_name && !facet_query.query.empty()) {
            facet_infos[findex].use_facet_query = true;

            if(facet_field.is_bool()) {
                if(facet_query.query == "true") {
                    facet_query.query = "1";
                } else if(facet_query.query == "false") {
                    facet_query.query = "0";
                }
            }

            // for non-string fields, `faceted_name` returns their aliased stringified field name
            art_tree *t = search_index.at(facet_field.faceted_name());

            std::vector<std::string> query_tokens;
            Tokenizer(facet_query.query, false, true, !facet_field.is_string()).tokenize(query_tokens);

            for(size_t qtoken_index = 0; qtoken_index < query_tokens.size(); qtoken_index++) {
                auto &q = query_tokens[qtoken_index];

                int bounded_cost = (q.size() < 3) ? 0 : 1;
                bool prefix_search = (qtoken_index == (query_tokens.size() - 1)); // only the last token is used as a prefix

                std::vector<art_leaf *> leaves;

                art_fuzzy_search(t, (const unsigned char *) q.c_str(),
                                 q.size(), 0, bounded_cost, 10000,
                                 token_ordering::MAX_SCORE, prefix_search, leaves);

                for(size_t leaf_index = 0; leaf_index < leaves.size(); leaf_index++) {
                    const auto &leaf = leaves[leaf_index];
                    // calculate the hash without the terminating null char
                    std::string key_str((const char *) leaf->key, leaf->key_len - 1);
                    uint64_t hash = facet_token_hash(facet_field, key_str);

                    token_pos_cost_t token_pos_cost = {qtoken_index, 0};
                    facet_infos[findex].fhash_qtoken_pos.emplace(hash, token_pos_cost);
                    //printf("%.*s - %llu\n", leaf->key_len, leaf->key, hash);
                }
            }
        }
    }

    for(size_t i = 0; i < results_size; i++) {
        uint32_t doc_seq_id = result_ids[i];

        auto doc_facet_index_it = facet_index_v2.find(doc_seq_id);

        if(doc_facet_index_it == facet_index_v2.end()) {
            continue;
        }

        const std::vector<std::vector<uint64_t>>& doc_facet_index = doc_facet_index_it->second;
        const uint64_t distinct_id = search_params->group_limit ? get_distinct_id(facet_to_index, doc_seq_id) : 0;

        // assumed that facet fields have already been validated upstream
        for(size_t findex = 0; findex < facets.size(); findex++) {
            auto& a_facet = facets[findex];
            size_t facet_id = facet_to_index[a_facet.field_name];
            const auto& facet_field = facet_infos[findex].facet_field;
            const bool use_facet_query = facet_infos[findex].use_facet_query;
            const auto& fhash_qtoken_pos = facet_infos[findex].fhash_qtoken_pos;
            const bool should_compute_stats = facet_infos[findex].should_compute_stats;

            // FORMAT OF VALUES
            // String: h1 h2 h3
            // String array: h1 h2 h3 0 h1 0 h1 h2 0
            const std::vector<uint64_t> & fhashes = doc_facet_index[facet_id];

            int array_pos = 0;
            bool fvalue_found = false;
            uint64_t combined_hash = 1; // for hashing the entire facet value (multiple tokens)

            spp::sparse_hash_map<uint32_t, token_pos_cost_t> query_token_positions;
            size_t field_token_index = -1;

            for(size_t j = 0; j < fhashes.size(); j++) {
                if(fhashes[j] != FACET_ARRAY_DELIMETER) {
                    uint64_t ftoken_hash = fhashes[j];
                    field_token_index++;

                    // reference: https://stackoverflow.com/a/4182771/131050
                    // we also include the token index to maintain ordering
                    combined_hash *= (1779033703 + 2*ftoken_hash*(field_token_index+1));

                    // ftoken_hash is the raw value for numeric fields
                    if(should_compute_stats) {
                        compute_facet_stats(a_facet, ftoken_hash, facet_field.type);
                    }

                    // either no facet query is in use, or this particular facet value was found in the facet filter
                    if(!use_facet_query || fhash_qtoken_pos.find(ftoken_hash) != fhash_qtoken_pos.end()) {
                        fvalue_found = true;

                        if(use_facet_query) {
                            // map the token index to the query index (used for highlighting later on)
                            token_pos_cost_t qtoken_pos = fhash_qtoken_pos.at(ftoken_hash);

                            // if the query token has already matched another token in the string,
                            // replace the position only if the cost is lower
                            if(query_token_positions.find(qtoken_pos.pos) == query_token_positions.end() ||
                               query_token_positions[qtoken_pos.pos].cost >= qtoken_pos.cost) {
                                token_pos_cost_t ftoken_pos_cost = {field_token_index, qtoken_pos.cost};
                                query_token_positions[qtoken_pos.pos] = ftoken_pos_cost;
                            }
                        }
                    }
                }

                // 0 indicates the array delimiter, while the second condition handles a non-array string
                if(fhashes[j] == FACET_ARRAY_DELIMETER || (fhashes.back() != FACET_ARRAY_DELIMETER && j == fhashes.size() - 1)) {
                    if(!use_facet_query || fvalue_found) {
                        uint64_t fhash = combined_hash;

                        if(a_facet.result_map.count(fhash) == 0) {
                            a_facet.result_map[fhash] = facet_count_t{0, spp::sparse_hash_set<uint64_t>(),
                                                                      doc_seq_id, 0,
                                                                      spp::sparse_hash_map<uint32_t, token_pos_cost_t>()};
                        }

                        a_facet.result_map[fhash].doc_id = doc_seq_id;
                        a_facet.result_map[fhash].array_pos = array_pos;

                        if(search_params->group_limit) {
                            a_facet.result_map[fhash].groups.emplace(distinct_id);
                        } else {
                            a_facet.result_map[fhash].count += 1;
                        }

                        if(use_facet_query) {
                            a_facet.result_map[fhash].query_token_pos = query_token_positions;
                        }
                    }

                    array_pos++;
                    fvalue_found = false;
                    combined_hash = 1;
                    spp::sparse_hash_map<uint32_t, token_pos_cost_t>().swap(query_token_positions);
                    field_token_index = -1;
                }
            }
        }
    }
}
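
// NOTE (illustrative) on the combining hash above: each token contributes a factor
// (1779033703 + 2*h*(i+1)). The constant is odd and 2*h*(i+1) is even, so every factor is odd and
// the running product modulo 2^64 never collapses to zero. Because a token's position i is folded
// into its factor, "new york" and "york new" produce different products, making the per-value hash
// order-sensitive even though multiplication itself commutes.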

void Index::get_facet_to_index(std::unordered_map<std::string, size_t>& facet_to_index) {
    size_t i_facet = 0;
    for(const auto & facet_kv: facet_schema) {
        facet_to_index.emplace(facet_kv.first, i_facet);
        i_facet++;
    }
}

void Index::search_candidates(const uint8_t & field_id, uint32_t* filter_ids, size_t filter_ids_length,
                              const std::vector<uint32_t>& curated_ids,
                              const std::vector<sort_by> & sort_fields,
                              std::vector<token_candidates> & token_candidates_vec,
                              std::vector<std::vector<art_leaf*>> & searched_queries, Topster* topster,
                              spp::sparse_hash_set<uint64_t>& groups_processed,
                              uint32_t** all_result_ids, size_t & all_result_ids_len,
                              const size_t typo_tokens_threshold) {
    const long long combination_limit = 10;

    auto product = []( long long a, token_candidates & b ) { return a*b.candidates.size(); };
    long long int N = std::accumulate(token_candidates_vec.begin(), token_candidates_vec.end(), 1LL, product);

    for(long long n = 0; n < N && n < combination_limit; ++n) {
        // every element in `query_suggestion` contains a token and its associated hits
        std::vector<art_leaf*> query_suggestion(token_candidates_vec.size());

        // the actual query suggestion preserves the original order of tokens in the query
        std::vector<art_leaf*> actual_query_suggestion(token_candidates_vec.size());

        next_suggestion(token_candidates_vec, n, actual_query_suggestion, query_suggestion);

        /*LOG(INFO) << "n: " << n;
        for(size_t i=0; i < query_suggestion.size(); i++) {
            LOG(INFO) << "i: " << i << " - " << query_suggestion[i]->key << ", ids: "
                      << query_suggestion[i]->values->ids.getLength();
        }*/

        // initialize results with the starting element (for further intersection)
        size_t result_size = query_suggestion[0]->values->ids.getLength();
        if(result_size == 0) {
            continue;
        }

        uint32_t total_cost = 0;
        uint32_t* result_ids = query_suggestion[0]->values->ids.uncompress();

        for(const auto& tc: token_candidates_vec) {
            total_cost += tc.cost;
        }

        // intersect the document ids for each token to find docs that contain all the tokens (stored in `result_ids`)
        for(size_t i = 1; i < query_suggestion.size(); i++) {
            uint32_t* out = nullptr;
            uint32_t* ids = query_suggestion[i]->values->ids.uncompress();
            result_size = ArrayUtils::and_scalar(ids, query_suggestion[i]->values->ids.getLength(), result_ids, result_size, &out);
            delete[] ids;
            delete[] result_ids;
            result_ids = out;
        }

        if(result_size == 0) {
            delete[] result_ids;
            continue;
        }

        if(!curated_ids.empty()) {
            uint32_t *excluded_result_ids = nullptr;
            result_size = ArrayUtils::exclude_scalar(result_ids, result_size, &curated_ids[0],
                                                     curated_ids.size(), &excluded_result_ids);

            delete [] result_ids;
            result_ids = excluded_result_ids;
        }

        if(filter_ids != nullptr) {
            // intersect once again with the filter ids
            uint32_t* filtered_result_ids = nullptr;
            size_t filtered_results_size = ArrayUtils::and_scalar(filter_ids, filter_ids_length, result_ids,
                                                                  result_size, &filtered_result_ids);

            uint32_t* new_all_result_ids;
            all_result_ids_len = ArrayUtils::or_scalar(*all_result_ids, all_result_ids_len, filtered_result_ids,
                                                       filtered_results_size, &new_all_result_ids);
            delete [] *all_result_ids;
            *all_result_ids = new_all_result_ids;

            // go through each matching document id and calculate the match score
            score_results(sort_fields, (uint16_t) searched_queries.size(), field_id, total_cost, topster, query_suggestion,
                          groups_processed, filtered_result_ids, filtered_results_size);

            delete[] filtered_result_ids;
            delete[] result_ids;
        } else {
            uint32_t* new_all_result_ids;
            all_result_ids_len = ArrayUtils::or_scalar(*all_result_ids, all_result_ids_len, result_ids,
                                                       result_size, &new_all_result_ids);
            delete [] *all_result_ids;
            *all_result_ids = new_all_result_ids;

            score_results(sort_fields, (uint16_t) searched_queries.size(), field_id, total_cost, topster, query_suggestion,
                          groups_processed, result_ids, result_size);
            delete[] result_ids;
        }

        searched_queries.push_back(actual_query_suggestion);

        //LOG(INFO) << "all_result_ids_len: " << all_result_ids_len << ", typo_tokens_threshold: " << typo_tokens_threshold;
        if(all_result_ids_len >= typo_tokens_threshold) {
            break;
        }
    }
}
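
// NOTE (illustrative): the loop above walks a bounded cartesian product over per-token candidate
// lists. With two tokens having 2 and 3 candidates respectively, N = 6 combinations would be
// enumerated (capped at combination_limit = 10), and next_suggestion() decodes each n into one
// candidate per token. The early break keeps typo correction cheap once enough results exist.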

Option<uint32_t> Index::do_filtering(uint32_t** filter_ids_out, const std::vector<filter> & filters) {
    uint32_t* filter_ids = nullptr;
    uint32_t filter_ids_length = 0;

    std::unordered_map<std::string, size_t> facet_to_index;
    get_facet_to_index(facet_to_index);

    for(size_t i = 0; i < filters.size(); i++) {
        const filter & a_filter = filters[i];
        if(search_index.count(a_filter.field_name) == 0) {
            continue;
        }

        art_tree* t = search_index.at(a_filter.field_name);
        field f = search_schema.at(a_filter.field_name);

        uint32_t* result_ids = nullptr;
        size_t result_ids_len = 0;

        if(f.is_integer()) {
            std::vector<const art_leaf*> leaves;
            std::vector<uint32_t> ids;

            for(const std::string & filter_value: a_filter.values) {
                if(f.type == field_types::INT32 || f.type == field_types::INT32_ARRAY) {
                    int32_t value = (int32_t) std::stoi(filter_value);
                    art_int32_search(t, value, a_filter.compare_operator, leaves);
                } else { // int64
                    int64_t value = (int64_t) std::stol(filter_value);
                    art_int64_search(t, value, a_filter.compare_operator, leaves);
                }
            }

            result_ids = collate_leaf_ids(leaves, result_ids_len);

        } else if(f.is_float()) {
            std::vector<const art_leaf*> leaves;
            std::vector<uint32_t> ids;

            for(const std::string & filter_value: a_filter.values) {
                float value = (float) std::atof(filter_value.c_str());
                art_float_search(t, value, a_filter.compare_operator, leaves);
            }

            result_ids = collate_leaf_ids(leaves, result_ids_len);

        } else if(f.is_bool()) {
            std::vector<const art_leaf*> leaves;

            for(const std::string & filter_value: a_filter.values) {
                art_leaf* leaf = (art_leaf *) art_search(t, (const unsigned char*) filter_value.c_str(),
                                                         filter_value.length());
                if(leaf) {
                    leaves.push_back(leaf);
                }
            }
            result_ids = collate_leaf_ids(leaves, result_ids_len);
        } else if(f.is_string()) {
            uint32_t* ids = nullptr;
            size_t ids_size = 0;

            for(const std::string & filter_value: a_filter.values) {
                uint32_t* strt_ids = nullptr;
                size_t strt_ids_size = 0;

                std::vector<art_leaf *> query_suggestion;

                // there could be multiple tokens in a filter value, which we have to treat as ANDs
                // e.g. country: South Africa

                Tokenizer tokenizer(filter_value, false, true);
                std::string str_token;
                size_t token_index = 0;
                std::vector<std::string> str_tokens;

                while(tokenizer.next(str_token, token_index)) {
                    str_tokens.push_back(str_token);

                    art_leaf* leaf = (art_leaf *) art_search(t, (const unsigned char*) str_token.c_str(),
                                                             str_token.length()+1);
                    if(leaf == nullptr) {
                        continue;
                    }

                    query_suggestion.push_back(leaf);

                    if(strt_ids == nullptr) {
                        strt_ids = leaf->values->ids.uncompress();
                        strt_ids_size = leaf->values->ids.getLength();
                    } else {
                        // do an AND for an exact match
                        uint32_t* out = nullptr;
                        uint32_t* leaf_ids = leaf->values->ids.uncompress();
                        strt_ids_size = ArrayUtils::and_scalar(strt_ids, strt_ids_size, leaf_ids,
                                                               leaf->values->ids.getLength(), &out);
                        delete[] leaf_ids;
                        delete[] strt_ids;
                        strt_ids = out;
                    }
                }

                if(a_filter.compare_operator == EQUALS && f.is_facet()) {
                    // need to do an exact match (unlike CONTAINS) by using the facet index
                    // the field being a facet is already enforced upstream
                    uint32_t* exact_strt_ids = new uint32_t[strt_ids_size];
                    size_t exact_strt_size = 0;

                    size_t facet_id = facet_to_index[f.name];
                    for(size_t strt_ids_index = 0; strt_ids_index < strt_ids_size; strt_ids_index++) {
                        uint32_t seq_id = strt_ids[strt_ids_index];
                        const std::vector<uint64_t> &fvalues = facet_index_v2[seq_id][facet_id];
                        bool found_filter = false;

                        if(!f.is_array()) {
                            found_filter = (query_suggestion.size() == fvalues.size());
                        } else {
                            uint64_t filter_hash = 1;

                            for(size_t sindex = 0; sindex < str_tokens.size(); sindex++) {
                                auto& str_token = str_tokens[sindex];
                                uint64_t thash = facet_token_hash(f, str_token);
                                filter_hash *= (1779033703 + 2*thash*(sindex+1));
                            }

                            uint64_t all_fvalue_hash = 1;
                            size_t ftindex = 0;

                            for(size_t findex = 0; findex < fvalues.size(); findex++) {
                                auto fhash = fvalues[findex];
                                if(fhash == FACET_ARRAY_DELIMETER) {
                                    // end of array element, check the hash
                                    if(all_fvalue_hash == filter_hash) {
                                        found_filter = true;
                                        break;
                                    }
                                    all_fvalue_hash = 1;
                                    ftindex = 0;
                                } else {
                                    all_fvalue_hash *= (1779033703 + 2*fhash*(ftindex + 1));
                                    ftindex++;
                                }
                            }
                        }

                        if(found_filter) {
                            exact_strt_ids[exact_strt_size] = seq_id;
                            exact_strt_size++;
                        }
                    }

                    delete[] strt_ids;
                    strt_ids = exact_strt_ids;
                    strt_ids_size = exact_strt_size;
                }

                // otherwise, we just ensure that a given record contains the tokens in the filter query
                // (NOT implemented) if the query is wrapped in double quotes, ensure a phrase match
                // bool exact_match = (filter_value.front() == '"' && filter_value.back() == '"');
                uint32_t* out = nullptr;
                ids_size = ArrayUtils::or_scalar(ids, ids_size, strt_ids, strt_ids_size, &out);
                delete[] strt_ids;
                delete[] ids;
                ids = out;
            }

            result_ids = ids;
            result_ids_len = ids_size;
        }

        if(i == 0) {
            filter_ids = result_ids;
            filter_ids_length = result_ids_len;
        } else {
            uint32_t* filtered_results = nullptr;
            filter_ids_length = ArrayUtils::and_scalar(filter_ids, filter_ids_length, result_ids,
                                                       result_ids_len, &filtered_results);
            delete [] result_ids;
            delete [] filter_ids;
            filter_ids = filtered_results;
        }
    }

    *filter_ids_out = filter_ids;
    return Option<>(filter_ids_length);
}

void Index::eq_str_filter_plain(const uint32_t *strt_ids, size_t strt_ids_size,
                                const std::vector<art_leaf *>& query_suggestion, uint32_t *exact_strt_ids,
                                size_t& exact_strt_size) const {
    std::vector<uint32_t*> leaf_to_indices;
    for(art_leaf *token_leaf: query_suggestion) {
        if(token_leaf == nullptr) {
            leaf_to_indices.push_back(nullptr);
            continue;
        }

        uint32_t *indices = new uint32_t[strt_ids_size];
        token_leaf->values->ids.indexOf(strt_ids, strt_ids_size, indices);
        leaf_to_indices.push_back(indices);
    }

    // e.g. First In First Out => hash([0, 1, 0, 2])
    spp::sparse_hash_map<size_t, uint32_t> leaf_to_id;
    size_t next_id = 1;
    size_t filter_hash = 1;

    for(size_t leaf_index = 0; leaf_index < query_suggestion.size(); leaf_index++) {
        if(leaf_to_id.count(leaf_index) == 0) {
            leaf_to_id.emplace(leaf_index, next_id++);
        }

        uint32_t leaf_id = leaf_to_id[leaf_index];
        filter_hash *= (1779033703 + 2*leaf_id*(leaf_index+1));
    }

    for(size_t strt_ids_index = 0; strt_ids_index < strt_ids_size; strt_ids_index++) {
        std::unordered_map<size_t, std::vector<std::vector<uint16_t>>> array_token_positions;
        populate_token_positions(query_suggestion, leaf_to_indices, strt_ids_index, array_token_positions);

        // iterate array_token_positions and compute the hash
        for(const auto& kv: array_token_positions) {
            const std::vector<std::vector<uint16_t>>& token_positions = kv.second;
            size_t this_hash = 1;

            for(size_t token_index = 0; token_index < token_positions.size(); token_index++) {
                auto& positions = token_positions[token_index];
                for(auto pos: positions) {
                    this_hash *= (1779033703 + 2*(token_index+1)*(pos+1));
                }
            }

            if(this_hash == filter_hash) {
                exact_strt_ids[exact_strt_size++] = strt_ids[strt_ids_index];
                break;
            }
        }
    }
}

uint32_t* Index::collate_leaf_ids(const std::vector<const art_leaf *> &leaves, size_t& result_ids_len) const {
    std::vector<uint32_t> ids;

    for(const art_leaf* leaf: leaves) {
        uint32_t num_ids = leaf->values->ids.getLength();
        uint32_t* leaf_ids = leaf->values->ids.uncompress();
        std::copy(leaf_ids, leaf_ids + num_ids, std::back_inserter(ids));
        delete [] leaf_ids;
    }

    uint32_t* result_ids = new uint32_t[ids.size()];
    std::sort(ids.begin(), ids.end());
    std::copy(ids.begin(), ids.end(), result_ids);
    result_ids_len = ids.size();
    return result_ids;
}

void Index::run_search() {
    while(true) {
        // wait until the main thread sends data
        std::unique_lock<std::mutex> lk(m);
        cv.wait(lk, [this]{return ready;});

        if(terminate) {
            break;
        }

        // after the wait, we own the lock
        search(search_params->outcome, search_params->query, search_params->search_fields,
               search_params->filters, search_params->facets, search_params->facet_query,
               search_params->included_ids, search_params->excluded_ids,
               search_params->sort_fields_std, search_params->num_typos,
               search_params->topster, search_params->curated_topster,
               search_params->per_page, search_params->page, search_params->token_order,
               search_params->prefix, search_params->drop_tokens_threshold,
               search_params->all_result_ids_len, search_params->groups_processed,
               search_params->searched_queries,
               search_params->raw_result_kvs, search_params->override_result_kvs,
               search_params->typo_tokens_threshold);

        // hand control back to the main thread
        processed = true;
        ready = false;

        // manual unlocking is done before notifying, to avoid waking up the waiting thread only to block again
        lk.unlock();
        cv.notify_one();
    }
}
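
// NOTE (illustrative sketch of the assumed caller, not part of this file): the main thread drives
// the handshake above roughly as follows —
//   {
//       std::lock_guard<std::mutex> lk(index->m);
//       index->search_params = params;   // hand over the work
//       index->ready = true;
//       index->processed = false;
//   }
//   index->cv.notify_one();
//   {
//       std::unique_lock<std::mutex> lk(index->m);
//       index->cv.wait(lk, [index]{ return index->processed; });
//   }
// i.e. `ready` wakes the search thread, and `processed` hands control back.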

void Index::collate_included_ids(const std::string & query, const std::string & field, const uint8_t field_id,
                                 const std::map<size_t, std::map<size_t, uint32_t>> & included_ids_map,
                                 Topster* curated_topster,
                                 std::vector<std::vector<art_leaf*>> & searched_queries) {

    if(included_ids_map.empty()) {
        return;
    }

    // calculate match_score and add to the topster independently

    std::vector<art_leaf *> override_query;

    Tokenizer tokenizer(query, false, true);
    std::string token;
    size_t token_index = 0;

    while(tokenizer.next(token, token_index)) {
        const size_t token_len = token.length();

        std::vector<art_leaf*> leaves;
        art_fuzzy_search(search_index.at(field), (const unsigned char *) token.c_str(), token_len,
                         0, 0, 1, token_ordering::MAX_SCORE, false, leaves);

        if(!leaves.empty()) {
            override_query.push_back(leaves[0]);
        }
    }

    for(const auto& pos_ids: included_ids_map) {
        const size_t outer_pos = pos_ids.first;

        for(const auto& index_seq_id: pos_ids.second) {
            uint32_t inner_pos = index_seq_id.first;
            uint32_t seq_id = index_seq_id.second;

            uint64_t distinct_id = outer_pos;           // the outer pos is the group distinct key
            uint64_t match_score = (64000 - inner_pos); // the inner pos within a group is the match score

            // LOG(INFO) << "seq_id: " << seq_id << " - " << match_score;

            int64_t scores[3];
            scores[0] = match_score;
            scores[1] = int64_t(1);
            scores[2] = int64_t(1);

            KV kv(field_id, searched_queries.size(), seq_id, distinct_id, match_score, scores);
            curated_topster->add(&kv);
        }
    }

    searched_queries.push_back(override_query);
}

void Index::search(Option<uint32_t> & outcome,
                   const std::string & query,
                   const std::vector<std::string> & search_fields,
                   const std::vector<filter> & filters,
                   std::vector<facet> & facets, facet_query_t & facet_query,
                   const std::map<size_t, std::map<size_t, uint32_t>> & included_ids_map,
                   const std::vector<uint32_t> & excluded_ids,
                   const std::vector<sort_by> & sort_fields_std, const int num_typos,
                   Topster* topster,
                   Topster* curated_topster,
                   const size_t per_page, const size_t page, const token_ordering token_order,
                   const bool prefix, const size_t drop_tokens_threshold,
                   size_t & all_result_ids_len,
                   spp::sparse_hash_set<uint64_t>& groups_processed,
                   std::vector<std::vector<art_leaf*>>& searched_queries,
                   std::vector<std::vector<KV*>> & raw_result_kvs,
                   std::vector<std::vector<KV*>> & override_result_kvs,
                   const size_t typo_tokens_threshold) {

    // process the filters

    uint32_t* filter_ids = nullptr;
    Option<uint32_t> op_filter_ids_length = do_filtering(&filter_ids, filters);
    if(!op_filter_ids_length.ok()) {
        outcome = Option<uint32_t>(op_filter_ids_length);
        return;
    }

    uint32_t filter_ids_length = op_filter_ids_length.get();

    // we will be removing all curated IDs from the organic result ids before running the topster
    std::set<uint32_t> curated_ids;
    std::vector<uint32_t> included_ids;

    for(const auto& outer_pos_ids: included_ids_map) {
        for(const auto& inner_pos_seq_id: outer_pos_ids.second) {
            curated_ids.insert(inner_pos_seq_id.second);
            included_ids.push_back(inner_pos_seq_id.second);
        }
    }

    curated_ids.insert(excluded_ids.begin(), excluded_ids.end());

    std::vector<uint32_t> curated_ids_sorted(curated_ids.begin(), curated_ids.end());
    std::sort(curated_ids_sorted.begin(), curated_ids_sorted.end());

    // the order of `fields` is used to rank the results
    //auto begin = std::chrono::high_resolution_clock::now();
    uint32_t* all_result_ids = nullptr;

    if(query == "*") {
        const uint8_t field_id = (uint8_t)(FIELD_LIMIT_NUM - 0);
        const std::string & field = search_fields[0];

        // if a filter is not specified, use the sorting index to generate the list of all document ids
        if(filters.empty()) {
            std::string all_records_field;

            // get the first non-optional field
            for(const auto& kv: sort_schema) {
                if(!kv.second.optional && kv.first != sort_field_const::text_match) {
                    all_records_field = kv.first;
                    break;
                }
            }

            const spp::sparse_hash_map<uint32_t, int64_t> *kvs = sort_index[all_records_field];
            filter_ids_length = kvs->size();
            filter_ids = new uint32_t[filter_ids_length];

            size_t i = 0;
            for(const auto& kv: *kvs) {
                filter_ids[i++] = kv.first;
            }
        }

        if(!curated_ids.empty()) {
            uint32_t *excluded_result_ids = nullptr;
            filter_ids_length = ArrayUtils::exclude_scalar(filter_ids, filter_ids_length, &curated_ids_sorted[0],
                                                           curated_ids.size(), &excluded_result_ids);
            delete [] filter_ids;
            filter_ids = excluded_result_ids;
        }

        score_results(sort_fields_std, (uint16_t) searched_queries.size(), field_id, 0, topster, {},
                      groups_processed, filter_ids, filter_ids_length);
        collate_included_ids(query, field, field_id, included_ids_map, curated_topster, searched_queries);

        all_result_ids_len = filter_ids_length;
        all_result_ids = filter_ids;
        filter_ids = nullptr;
    } else {
        const size_t num_search_fields = std::min(search_fields.size(), (size_t) FIELD_LIMIT_NUM);
        for(size_t i = 0; i < num_search_fields; i++) {
            // proceed to query search only when no filters are provided or when filtering produces results
            if(filters.empty() || filter_ids_length > 0) {
                const uint8_t field_id = (uint8_t)(FIELD_LIMIT_NUM - i); // the order of `fields` is used to rank the results
                const std::string & field = search_fields[i];

                search_field(field_id, query, field, filter_ids, filter_ids_length, curated_ids_sorted, facets, sort_fields_std,
                             num_typos, searched_queries, topster, groups_processed, &all_result_ids, all_result_ids_len,
                             token_order, prefix, drop_tokens_threshold, typo_tokens_threshold);
                collate_included_ids(query, field, field_id, included_ids_map, curated_topster, searched_queries);
            }
        }
    }

    do_facets(facets, facet_query, all_result_ids, all_result_ids_len);
    do_facets(facets, facet_query, &included_ids[0], included_ids.size());

    // must be sorted before being iterated upon, to remove "empty" array entries
    topster->sort();
    curated_topster->sort();

    all_result_ids_len += curated_topster->size;

    delete [] filter_ids;
    delete [] all_result_ids;

    //long long int timeMillis = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::high_resolution_clock::now() - begin).count();
    //!LOG(INFO) << "Time taken for result calc: " << timeMillis << "us";

    outcome = Option<uint32_t>(1);
}

|
|
|
|
/*
|
|
1. Split the query into tokens
|
|
2. Outer loop will generate bounded cartesian product with costs for each token
|
|
3. Inner loop will iterate on each token with associated cost
|
|
4. Cartesian product of the results of the token searches will be used to form search phrases
|
|
(cartesian product adapted from: http://stackoverflow.com/a/31169617/131050)
|
|
4. Intersect the lists to find docs that match each phrase
|
|
5. Sort the docs based on some ranking criteria
|
|
*/
|
|
void Index::search_field(const uint8_t & field_id, const std::string & query, const std::string & field,
|
|
uint32_t *filter_ids, size_t filter_ids_length,
|
|
const std::vector<uint32_t>& curated_ids,
|
|
std::vector<facet> & facets, const std::vector<sort_by> & sort_fields, const int num_typos,
|
|
std::vector<std::vector<art_leaf*>> & searched_queries,
|
|
Topster* topster, spp::sparse_hash_set<uint64_t>& groups_processed,
|
|
uint32_t** all_result_ids, size_t & all_result_ids_len,
|
|
const token_ordering token_order, const bool prefix,
|
|
const size_t drop_tokens_threshold, const size_t typo_tokens_threshold) {
|
|
|
|
const size_t max_cost = (num_typos < 0 || num_typos > 2) ? 2 : num_typos;
|
|
|
|
// To prevent us from doing ART search repeatedly as we iterate through possible corrections
|
|
spp::sparse_hash_map<std::string, std::vector<art_leaf*>> token_cost_cache;
|
|
|
|
// Used to drop the least occurring token(s) for partial searches
|
|
std::unordered_map<std::string, uint32_t> token_to_count;
|
|
|
|
std::vector<std::vector<int>> token_to_costs;
|
|
|
|
Tokenizer tokenizer(query, false, true);
|
|
std::string token;
|
|
size_t token_index = 0;
|
|
std::vector<std::string> tokens;
|
|
|
|
while(tokenizer.next(token, token_index)) {
|
|
std::vector<int> all_costs;
|
|
// This ensures that we don't end up doing a cost of 1 for a single char etc.
|
|
int bounded_cost = get_bounded_typo_cost(max_cost, token.length());
|
|
|
|
for(int cost = 0; cost <= bounded_cost; cost++) {
|
|
all_costs.push_back(cost);
|
|
}
|
|
|
|
token_to_costs.push_back(all_costs);
|
|
tokens.push_back(token);
|
|
}
|
|
|
|
// stores candidates for each token, i.e. i-th index would have all possible tokens with a cost of "c"
|
|
std::vector<token_candidates> token_candidates_vec;
|
|
|
|
const long long combination_limit = 10;
|
|
auto product = []( long long a, std::vector<int>& b ) { return a*b.size(); };
|
|
long long n = 0;
|
|
long long int N = std::accumulate(token_to_costs.begin(), token_to_costs.end(), 1LL, product);
|
|
|
|
while(n < N && n < combination_limit) {
|
|
// Outerloop generates combinations of [cost to max_cost] for each token
|
|
// For e.g. for a 3-token query: [0, 0, 0], [0, 0, 1], [0, 1, 1] etc.
|
|
std::vector<uint32_t> costs(token_to_costs.size());
|
|
ldiv_t q { n, 0 };
|
|
for(long long i = (token_to_costs.size() - 1); 0 <= i ; --i ) {
|
|
q = ldiv(q.quot, token_to_costs[i].size());
|
|
costs[i] = token_to_costs[i][q.rem];
|
|
}
|
|
|
|
token_candidates_vec.clear();
|
|
size_t token_index = 0;
|
|
|
|
while(token_index < tokens.size()) {
|
|
// For each token, look up the generated cost for this iteration and search using that cost
|
|
std::string token = tokens[token_index];
|
|
const std::string token_cost_hash = token + std::to_string(costs[token_index]);
|
|
|
|
std::vector<art_leaf*> leaves;
|
|
//LOG(INFO) << "\nSearching for field: " << field << ", token:" << token << " - cost: " << costs[token_index];
|
|
|
|
if(token_cost_cache.count(token_cost_hash) != 0) {
|
|
leaves = token_cost_cache[token_cost_hash];
|
|
} else {
|
|
// prefix should apply only for last token
|
|
const bool prefix_search = prefix && (token_index == tokens.size()-1);
|
|
const size_t token_len = prefix_search ? (int) token.length() : (int) token.length() + 1;
|
|
|
|
// If this is a prefix search, look for more candidates and do a union of those document IDs
|
|
const int max_candidates = prefix_search ? 10 : 3;
|
|
art_fuzzy_search(search_index.at(field), (const unsigned char *) token.c_str(), token_len,
|
|
costs[token_index], costs[token_index], max_candidates, token_order, prefix_search, leaves);
|
|
|
|
if(!leaves.empty()) {
|
|
token_cost_cache.emplace(token_cost_hash, leaves);
|
|
}
|
|
}
|
|
|
|
            if(!leaves.empty()) {
                //log_leaves(costs[token_index], token, leaves);
                token_candidates_vec.push_back(token_candidates{token, costs[token_index], leaves});
                token_to_count[token] = std::max(token_to_count[token], leaves.at(0)->values->ids.getLength());
            } else {
                // No result at `cost = costs[token_index]`. Remove this cost from the token's cost list and re-do combinations
                auto it = std::find(token_to_costs[token_index].begin(), token_to_costs[token_index].end(), costs[token_index]);
                if(it != token_to_costs[token_index].end()) {
                    token_to_costs[token_index].erase(it);

                    // when no more costs are left for this token and `drop_tokens_threshold` is breached
                    if(token_to_costs[token_index].empty() && all_result_ids_len >= drop_tokens_threshold) {
                        n = combination_limit; // to break outer loop
                        break;
                    }

                    // otherwise, we drop the token and search with the remaining tokens
                    if(token_to_costs[token_index].empty()) {
                        token_to_costs.erase(token_to_costs.begin()+token_index);
                        tokens.erase(tokens.begin()+token_index);
                        costs.erase(costs.begin()+token_index);
                        token_index--;
                    }
                }

                // restart the outer loop with a fresh cost combination (n++ below brings this back to 0)
                n = -1;
                N = std::accumulate(token_to_costs.begin(), token_to_costs.end(), 1LL, product);
                break;
            }

            token_index++;
        }

        if(!token_candidates_vec.empty() && token_candidates_vec.size() == tokens.size()) {
            // If all tokens were found, go ahead and search for candidates with what we have so far
            search_candidates(field_id, filter_ids, filter_ids_length, curated_ids, sort_fields, token_candidates_vec,
                              searched_queries, topster, groups_processed, all_result_ids, all_result_ids_len,
                              typo_tokens_threshold);
        }

        if (all_result_ids_len >= typo_tokens_threshold) {
            // We already have enough results, so we can stop; otherwise, the outer loop
            // continues with tokens of a greater typo cost.
            break;
        }

        n++;
    }

    // When there are not enough overall results and at least one token has results
    if(all_result_ids_len < drop_tokens_threshold && token_to_count.size() > 1) {
        // Drop the token with the fewest hits and try searching again
        std::string truncated_query;

        std::vector<std::pair<std::string, uint32_t>> token_count_pairs;
        for (auto itr = token_to_count.begin(); itr != token_to_count.end(); ++itr) {
            token_count_pairs.push_back(*itr);
        }

        std::sort(token_count_pairs.begin(), token_count_pairs.end(),
                  [](const std::pair<std::string, uint32_t>& a, const std::pair<std::string, uint32_t>& b) {
                      return a.second > b.second;
                  });

        for(uint32_t i = 0; i < token_count_pairs.size()-1; i++) {
            // iterate till the last-but-one pair: the final pair is the dropped token
            truncated_query += " " + token_count_pairs.at(i).first;
        }

        return search_field(field_id, truncated_query, field, filter_ids, filter_ids_length, curated_ids,
                            facets, sort_fields, num_typos,
                            searched_queries, topster, groups_processed, all_result_ids, all_result_ids_len,
                            token_order, prefix);
    }
}

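// Caps the typo cost for very short tokens: with max_cost = 2, a 1-char token is bounded
// to cost 0 and a 2-char token to cost 1, while longer tokens keep the full max_cost.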
int Index::get_bounded_typo_cost(const size_t max_cost, const size_t token_len) const {
    int bounded_cost = max_cost;
    if(token_len > 0 && max_cost >= token_len && (token_len == 1 || token_len == 2)) {
        bounded_cost = token_len - 1;
    }
    return bounded_cost;
}

void Index::log_leaves(const int cost, const std::string &token, const std::vector<art_leaf *> &leaves) const {
    LOG(INFO) << "Token: " << token << ", cost: " << cost;

    for(size_t i=0; i < leaves.size(); i++) {
        printf("%.*s - %d, ", leaves[i]->key_len, leaves[i]->key, leaves[i]->values->ids.getLength());
        LOG(INFO) << "frequency: " << leaves[i]->values->ids.getLength() << ", max_score: " << leaves[i]->max_score;
        /*for(auto j=0; j<leaves[i]->values->ids.getLength(); j++) {
            LOG(INFO) << "id: " << leaves[i]->values->ids.at(j);
        }*/
    }
}

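// Scores each result ID for the given query suggestion: picks the best text match score
// across array elements, resolves up to three sort field values, and pushes a KV entry
// into the topster (grouped by a distinct_id when group_limit is set).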
void Index::score_results(const std::vector<sort_by> & sort_fields, const uint16_t & query_index,
                          const uint8_t & field_id, const uint32_t total_cost, Topster* topster,
                          const std::vector<art_leaf *> &query_suggestion,
                          spp::sparse_hash_set<uint64_t>& groups_processed,
                          const uint32_t *result_ids, const size_t result_size) {

    std::vector<uint32_t*> leaf_to_indices;
    for (art_leaf *token_leaf: query_suggestion) {
        uint32_t *indices = new uint32_t[result_size];
        token_leaf->values->ids.indexOf(result_ids, result_size, indices);
        leaf_to_indices.push_back(indices);
    }

    std::unordered_map<std::string, size_t> facet_to_index;
    get_facet_to_index(facet_to_index);

    Match single_token_match = Match(1, 0);
    const uint64_t single_token_match_score = single_token_match.get_match_score(total_cost, field_id);

    int sort_order[3]; // 1 or -1 based on DESC or ASC respectively
    spp::sparse_hash_map<uint32_t, int64_t>* field_values[3];

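    // NOTE: the fixed-size arrays above assume that the caller has already validated the
    // number of sort fields to be at most 3; indexing beyond that would be out of bounds.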
    for(size_t i = 0; i < sort_fields.size(); i++) {
        sort_order[i] = 1;
        if(sort_fields[i].order == sort_field_const::asc) {
            sort_order[i] = -1;
        }

        field_values[i] = (sort_fields[i].name != sort_field_const::text_match) ?
                          sort_index.at(sort_fields[i].name) :
                          nullptr;
    }

    //auto begin = std::chrono::high_resolution_clock::now();

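    // ASC fields were assigned sort_order -1 above, so their values get negated below;
    // this keeps "greater is better" uniform for the ranking comparisons that follow.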
    for(size_t i=0; i<result_size; i++) {
        const uint32_t seq_id = result_ids[i];

        uint64_t match_score = 0;

        if(query_suggestion.size() <= 1) {
            match_score = single_token_match_score;
        } else {
            std::unordered_map<size_t, std::vector<std::vector<uint16_t>>> array_token_positions;
            populate_token_positions(query_suggestion, leaf_to_indices, i, array_token_positions);

            for(const auto& kv: array_token_positions) {
                const std::vector<std::vector<uint16_t>>& token_positions = kv.second;
                if(token_positions.empty()) {
                    continue;
                }
                const Match & match = Match(seq_id, token_positions, false);
                uint64_t this_match_score = match.get_match_score(total_cost, field_id);

                if(this_match_score > match_score) {
                    match_score = this_match_score;
                }

                /*std::ostringstream os;
                os << name << ", total_cost: " << (255 - total_cost)
                   << ", words_present: " << match.words_present
                   << ", match_score: " << match_score
                   << ", match.distance: " << match.distance
                   << ", seq_id: " << seq_id << std::endl;
                LOG(INFO) << os.str();*/
            }
        }

        const int64_t default_score = 0;
        int64_t scores[3] = {0};

        // unrolled instead of looping over sort_fields
        if(sort_fields.size() > 0) {
            if (field_values[0] != nullptr) {
                auto it = field_values[0]->find(seq_id);
                scores[0] = (it == field_values[0]->end()) ? default_score : it->second;
            } else {
                scores[0] = int64_t(match_score);
            }
            if (sort_order[0] == -1) {
                scores[0] = -scores[0];
            }
            scores[1] = 0;
        }

        if(sort_fields.size() > 1) {
            if (field_values[1] != nullptr) {
                auto it = field_values[1]->find(seq_id);
                scores[1] = (it == field_values[1]->end()) ? default_score : it->second;
            } else {
                scores[1] = int64_t(match_score);
            }
            if (sort_order[1] == -1) {
                scores[1] = -scores[1];
            }
            scores[2] = 0;
        }

        if(sort_fields.size() > 2) {
            if(field_values[2] != nullptr) {
                auto it = field_values[2]->find(seq_id);
                scores[2] = (it == field_values[2]->end()) ? default_score : it->second;
            } else {
                scores[2] = int64_t(match_score);
            }
            if(sort_order[2] == -1) {
                scores[2] = -scores[2];
            }
        }

        uint64_t distinct_id = seq_id;

        if(search_params->group_limit != 0) {
            distinct_id = get_distinct_id(facet_to_index, seq_id);
            groups_processed.emplace(distinct_id);
        }

        KV kv(field_id, query_index, seq_id, distinct_id, match_score, scores);
        topster->add(&kv);
    }

    //long long int timeNanos = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - begin).count();
    //LOG(INFO) << "Time taken for results iteration: " << timeNanos << "ms";

    for(uint32_t* leaf_indices: leaf_to_indices) {
        delete [] leaf_indices;
    }
}

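// Computes a stable group identifier for a document by hash-combining the facet hashes of
// all group_by fields: documents with identical group field values share a distinct_id.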
uint64_t Index::get_distinct_id(const std::unordered_map<std::string, size_t> &facet_to_id,
                                const uint32_t seq_id) const {
    uint64_t distinct_id = 1; // some constant initial value

    // calculate hash from group_by_fields
    for(const auto& field: search_params->group_by_fields) {
        if(facet_to_id.count(field) == 0 || facet_index_v2.count(seq_id) == 0) {
            continue;
        }

        size_t facet_id = facet_to_id.at(field);
        const std::vector<uint64_t>& fhashes = facet_index_v2.at(seq_id)[facet_id];

        for(const auto& hash: fhashes) {
            distinct_id = hash_combine(distinct_id, hash);
        }
    }
    return distinct_id;
}

void Index::populate_token_positions(const std::vector<art_leaf *>& query_suggestion,
                                     std::vector<uint32_t*>& leaf_to_indices,
                                     size_t result_index,
                                     std::unordered_map<size_t, std::vector<std::vector<uint16_t>>>& array_token_positions) {
    if(query_suggestion.empty()) {
        return;
    }

    // array_token_positions:
    // for every element of a potential array, for every token in the query suggestion, get the positions

    for(size_t i = 0; i < query_suggestion.size(); i++) {
        const art_leaf* token_leaf = query_suggestion[i];
        uint32_t doc_index = leaf_to_indices[i][result_index];

        // it's possible for a query token to not appear in a resulting document
        if(doc_index == token_leaf->values->ids.getLength()) {
            continue;
        }

        // Array offset storage format:
        // a) last element is the array_index b) second and third last elements will be the largest offset
        // (the last offset is repeated to indicate the end of offsets for a given array index)

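        // e.g. the offset stream [1, 4, 4, 0, 2, 6, 6, 1] decodes as: positions [1, 4]
        // (terminated by the repeated "4") belonging to array index 0, then positions
        // [2, 6] (terminated by the repeated "6") belonging to array index 1. Plain
        // string fields carry no repeated sentinel, so their positions fall through to
        // the array_token_positions[0] case at the end of the loop below.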
        /*uint32_t* offsets = token_leaf->values->offsets.uncompress();
        for(size_t ii=0; ii < token_leaf->values->offsets.getLength(); ii++) {
            LOG(INFO) << "offset: " << offsets[ii];
        }*/

        uint32_t start_offset = token_leaf->values->offset_index.at(doc_index);
        uint32_t end_offset = (doc_index == token_leaf->values->ids.getLength() - 1) ?
                              token_leaf->values->offsets.getLength() :
                              token_leaf->values->offset_index.at(doc_index+1);

        std::vector<uint16_t> positions;
        int prev_pos = -1;

        while(start_offset < end_offset) {
            int pos = token_leaf->values->offsets.at(start_offset);
            start_offset++;

            if(pos == prev_pos) { // indicates end of array index
                if(!positions.empty()) {
                    size_t array_index = (size_t) token_leaf->values->offsets.at(start_offset);
                    array_token_positions[array_index].push_back(positions);
                    positions.clear();
                }

                start_offset++; // skip the current value, which is the array index
                prev_pos = -1;
                continue;
            }

            prev_pos = pos;
            positions.push_back((uint16_t)pos);
        }

        if(!positions.empty()) {
            // for plain string fields
            array_token_positions[0].push_back(positions);
        }
    }
}

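// Decodes combination number `n` into one candidate leaf per token, using the same
// mixed-radix scheme as the typo cost combinations: the i-th digit of `n` is taken in
// base `token_candidates_vec[i].candidates.size()`.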
inline void Index::next_suggestion(const std::vector<token_candidates> &token_candidates_vec,
                                   long long int n,
                                   std::vector<art_leaf *>& actual_query_suggestion,
                                   std::vector<art_leaf *>& query_suggestion) {
    // generate the next combination from `token_leaves` and store it in `query_suggestion`
    ldiv_t q { n, 0 };
    for(long long i = 0; i < (long long) token_candidates_vec.size(); i++) {
        q = ldiv(q.quot, token_candidates_vec[i].candidates.size());
        actual_query_suggestion[i] = token_candidates_vec[i].candidates[q.rem];
        query_suggestion[i] = token_candidates_vec[i].candidates[q.rem];
    }

    // Sort ascending based on matched documents for each token for faster intersection.
    // However, this causes the token order to deviate from the original query's order.
    sort(query_suggestion.begin(), query_suggestion.end(), [](const art_leaf* left, const art_leaf* right) {
        return left->values->ids.getLength() < right->values->ids.getLength();
    });
}

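// Removes the entries at `indices_sorted` from `offset_index`, shifting the remaining
// offsets down by the size of each removed document's offset block so that they still
// point at the right positions within the compacted offsets array.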
void Index::remove_and_shift_offset_index(sorted_array& offset_index, const uint32_t* indices_sorted,
                                          const uint32_t indices_length) {
    uint32_t *curr_array = offset_index.uncompress();
    uint32_t *new_array = new uint32_t[offset_index.getLength()];

    new_array[0] = 0;
    uint32_t new_index = 0;
    uint32_t curr_index = 0;
    uint32_t indices_counter = 0;
    uint32_t shift_value = 0;

    while(curr_index < offset_index.getLength()) {
        if(indices_counter < indices_length && curr_index >= indices_sorted[indices_counter]) {
            // skip copying
            if(curr_index == indices_sorted[indices_counter]) {
                curr_index++;
                const uint32_t diff = curr_index == offset_index.getLength() ?
                                      0 : (offset_index.at(curr_index) - offset_index.at(curr_index-1));

                shift_value += diff;
            }
            indices_counter++;
        } else {
            new_array[new_index++] = curr_array[curr_index++] - shift_value;
        }
    }

    offset_index.load(new_array, new_index);

    delete[] curr_array;
    delete[] new_array;
}

Option<uint32_t> Index::remove(const uint32_t seq_id, const nlohmann::json & document) {
    std::unordered_map<std::string, size_t> facet_to_index;
    get_facet_to_index(facet_to_index);

    for(auto it = document.begin(); it != document.end(); ++it) {
        const std::string& field_name = it.key();
        const auto& search_field_it = search_schema.find(field_name);
        if(search_field_it == search_schema.end()) {
            continue;
        }

        const auto& search_field = search_field_it->second;

        // Go through all the field names and find the keys+values so that they can be removed from the in-memory index
        std::vector<std::string> tokens;
        tokenize_doc_field(document, search_field, tokens);

        for(auto & token: tokens) {
            const unsigned char *key = (const unsigned char *) token.c_str();
            int key_len = (int) token.length();

            if(search_field.type == field_types::STRING_ARRAY || search_field.type == field_types::STRING) {
                key_len += 1; // string keys are stored with the null terminator included
            }

            art_leaf* leaf = (art_leaf *) art_search(search_index.at(field_name), key, key_len);
            if(leaf != nullptr) {
                uint32_t doc_index = leaf->values->ids.indexOf(seq_id);

                if(doc_index == leaf->values->ids.getLength()) {
                    // not found - happens when 2 tokens repeat in a field, e.g. "is it or is is not?"
                    continue;
                }

                uint32_t start_offset = leaf->values->offset_index.at(doc_index);
                uint32_t end_offset = (doc_index == leaf->values->ids.getLength() - 1) ?
                                      leaf->values->offsets.getLength() :
                                      leaf->values->offset_index.at(doc_index+1);

                uint32_t doc_indices[1] = {doc_index};
                remove_and_shift_offset_index(leaf->values->offset_index, doc_indices, 1);

                leaf->values->offsets.remove_index(start_offset, end_offset);
                leaf->values->ids.remove_value(seq_id);

                /*len = leaf->values->offset_index.getLength();
                for(auto i=0; i<len; i++) {
                    LOG(INFO) << "i: " << i << ", val: " << leaf->values->offset_index.at(i);
                }
                LOG(INFO) << "----";*/

                if(leaf->values->ids.getLength() == 0) {
                    art_values* values = (art_values*) art_delete(search_index.at(field_name), key, key_len);
                    delete values;
                }
            }
        }

        // remove facets
        if(facet_to_index.count(field_name) != 0 && facet_index_v2.count(seq_id) != 0) {
            size_t facet_index = facet_to_index[field_name];
            std::vector<std::vector<uint64_t>>& facet_values = facet_index_v2[seq_id];
            facet_values[facet_index].clear();
        }

        // remove sort field
        if(sort_index.count(field_name) != 0) {
            sort_index[field_name]->erase(seq_id);
        }
    }

    return Option<uint32_t>(seq_id);
}

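// Reconstructs the index keys for a document field: string fields are tokenized, while
// numeric and boolean fields are encoded into the same fixed-width binary keys used at
// indexing time, so the exact ART entries can be located for removal.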
void Index::tokenize_doc_field(const nlohmann::json& document, const field& search_field,
                               std::vector<std::string>& tokens) {

    const std::string& field_name = search_field.name;

    if(search_field.type == field_types::STRING) {
        Tokenizer(document[field_name], true, true, !search_field.is_string()).tokenize(tokens);
    } else if(search_field.type == field_types::STRING_ARRAY) {
        const std::vector<std::string>& values = document[field_name].get<std::vector<std::string>>();
        for(const std::string & value: values) {
            Tokenizer(value, true, true, !search_field.is_string()).tokenize(tokens);
        }
    } else if(search_field.type == field_types::INT32) {
        const int KEY_LEN = 8;
        unsigned char key[KEY_LEN];
        const int32_t& value = document[field_name].get<int32_t>();
        encode_int32(value, key);
        tokens.emplace_back((char*)key, KEY_LEN);
    } else if(search_field.type == field_types::INT32_ARRAY) {
        const std::vector<int32_t>& values = document[field_name].get<std::vector<int32_t>>();
        for(const int32_t value: values) {
            const int KEY_LEN = 8;
            unsigned char key[KEY_LEN];
            encode_int32(value, key);
            tokens.emplace_back((char*)key, KEY_LEN);
        }
    } else if(search_field.type == field_types::INT64) {
        const int KEY_LEN = 8;
        unsigned char key[KEY_LEN];
        const int64_t& value = document[field_name].get<int64_t>();
        encode_int64(value, key);
        tokens.emplace_back((char*)key, KEY_LEN);
    } else if(search_field.type == field_types::INT64_ARRAY) {
        const std::vector<int64_t>& values = document[field_name].get<std::vector<int64_t>>();
        for(const int64_t value: values) {
            const int KEY_LEN = 8;
            unsigned char key[KEY_LEN];
            encode_int64(value, key);
            tokens.emplace_back((char*)key, KEY_LEN);
        }
    } else if(search_field.type == field_types::FLOAT) {
        const int KEY_LEN = 8;
        unsigned char key[KEY_LEN];
        // NOTE: the value must be fetched as a float; fetching it as int64_t would
        // truncate it before encoding and produce a key that never matches the indexed one
        const float& value = document[field_name].get<float>();
        encode_float(value, key);
        tokens.emplace_back((char*)key, KEY_LEN);
    } else if(search_field.type == field_types::FLOAT_ARRAY) {
        const std::vector<float>& values = document[field_name].get<std::vector<float>>();
        for(const float value: values) {
            const int KEY_LEN = 8;
            unsigned char key[KEY_LEN];
            encode_float(value, key);
            tokens.emplace_back((char*)key, KEY_LEN);
        }
    } else if(search_field.type == field_types::BOOL) {
        const int KEY_LEN = 1;
        unsigned char key[KEY_LEN];
        const bool& value = document[field_name].get<bool>();
        key[0] = value ? '1' : '0';
        tokens.emplace_back((char*)key, KEY_LEN);
    } else if(search_field.type == field_types::BOOL_ARRAY) {
        const std::vector<bool>& values = document[field_name].get<std::vector<bool>>();
        for(const bool value: values) {
            const int KEY_LEN = 1;
            unsigned char key[KEY_LEN];
            key[0] = value ? '1' : '0';
            tokens.emplace_back((char*)key, KEY_LEN);
        }
    }
}

art_leaf* Index::get_token_leaf(const std::string & field_name, const unsigned char* token, uint32_t token_len) {
    const art_tree *t = search_index.at(field_name);
    return (art_leaf*) art_search(t, token, (int) token_len);
}

const spp::sparse_hash_map<std::string, art_tree *> &Index::_get_search_index() const {
    return search_index;
}