Merge branch 'doc-update'

Kishore Nallan 2020-10-25 20:33:28 +05:30
commit 3d1ea448b6
30 changed files with 1499 additions and 302 deletions

View File

@ -52,7 +52,7 @@ Here's a quick example showcasing how you can create a collection, index a docum
Let's begin by starting the Typesense server via Docker:
```
docker run -p 8108:8108 -v/tmp/data:/data typesense/typesense:0.15.0 --data-dir /data --api-key=Hu52dwsas2AdxdE
docker run -p 8108:8108 -v/tmp/data:/data typesense/typesense:0.16.0 --data-dir /data --api-key=Hu52dwsas2AdxdE
```
We have [API Clients](#api-clients) in a couple of languages, but let's use the Python client for this example.

View File

@ -18,6 +18,8 @@ private:
}
public:
void load(const uint32_t *sorted_array, uint32_t array_length, uint32_t m, uint32_t M);
uint32_t at(uint32_t index);
bool contains(uint32_t value);
@ -26,5 +28,7 @@ public:
bool append(uint32_t value);
bool insert(size_t index, const uint32_t* values, size_t num_values);
void remove_index(uint32_t start_index, uint32_t end_index);
};

View File

@ -36,7 +36,8 @@ public:
in = nullptr;
}
uint32_t* uncompress();
// len determines length of output buffer (default: length of input)
uint32_t* uncompress(uint32_t len=0);
uint32_t getSizeInBytes();
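
A minimal sketch (not part of this commit) of the pattern the new `len` parameter enables: uncompress into a buffer with extra headroom, shift elements to make room, then `load()` the grown buffer back. It assumes `arr` is an `array`, and `index`, `new_value`, `new_min`, `new_max` are hypothetical values; the same pattern appears in `array::insert()` later in this diff.
```
// Sketch only: grow a compressed array in place via uncompress(len).
uint32_t* buf = arr.uncompress(arr.getLength() + 1);          // buffer sized for one extra element
memmove(&buf[index + 1], &buf[index],
        sizeof(uint32_t) * (arr.getLength() - index));        // shift the tail right by one slot
buf[index] = new_value;                                       // write the inserted value
arr.load(buf, arr.getLength() + 1, new_min, new_max);         // re-compress the grown buffer
delete [] buf;
```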

View File

@ -96,9 +96,9 @@ typedef struct {
* of arbitrary size, as they include the key.
*/
typedef struct {
art_values* values;
int32_t max_score;
uint32_t key_len;
int64_t max_score;
art_values* values;
unsigned char key[];
} art_leaf;

View File

@ -92,6 +92,11 @@ struct override_t {
}
};
struct doc_seq_id_t {
uint32_t seq_id;
bool is_new;
};
class Collection {
private:
@ -150,7 +155,9 @@ private:
void highlight_result(const field &search_field, const std::vector<std::vector<art_leaf *>> &searched_queries,
const KV* field_order_kv, const nlohmann::json &document,
StringUtils & string_utils, size_t snippet_threshold,
StringUtils & string_utils,
const size_t snippet_threshold,
const size_t highlight_affix_num_tokens,
bool highlighted_fully,
highlight_t &highlight);
@ -217,13 +224,16 @@ public:
std::string get_default_sorting_field();
Option<uint32_t> to_doc(const std::string & json_str, nlohmann::json & document);
Option<doc_seq_id_t> to_doc(const std::string& json_str, nlohmann::json& document,
const index_operation_t& operation, const std::string& id="");
nlohmann::json get_summary_json();
Option<nlohmann::json> add(const std::string & json_str);
Option<nlohmann::json> add(const std::string & json_str,
const index_operation_t& operation=CREATE, const std::string& id="");
nlohmann::json add_many(std::vector<std::string>& json_lines);
nlohmann::json add_many(std::vector<std::string>& json_lines, nlohmann::json& document,
const index_operation_t& operation=CREATE, const std::string& id="");
Option<nlohmann::json> search(const std::string & query, const std::vector<std::string> & search_fields,
const std::string & simple_filter_query, const std::vector<std::string> & facet_fields,
@ -236,6 +246,7 @@ public:
size_t max_facet_values=10,
const std::string & simple_facet_query = "",
const size_t snippet_threshold = 30,
const size_t highlight_affix_num_tokens = 4,
const std::string & highlight_full_fields = "",
size_t typo_tokens_threshold = Index::TYPO_TOKENS_THRESHOLD,
const std::map<size_t, std::vector<std::string>>& pinned_hits={},
@ -263,7 +274,7 @@ public:
Option<bool> get_document_from_store(const std::string & seq_id_key, nlohmann::json & document);
Option<uint32_t> index_in_memory(const nlohmann::json & document, uint32_t seq_id);
Option<uint32_t> index_in_memory(const nlohmann::json & document, uint32_t seq_id, bool is_update);
size_t par_index_in_memory(std::vector<std::vector<index_record>> & iter_batch, std::vector<size_t>& indexed_counts);
@ -296,5 +307,9 @@ public:
size_t &num_indexed);
bool is_exceeding_memory_threshold() const;
void get_doc_changes(const nlohmann::json &document, nlohmann::json &old_doc,
nlohmann::json &new_doc,
nlohmann::json &del_doc);
};
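
A minimal sketch (not part of this commit) of how the widened `add()` signature might be called for the three operations; the document body is invented and `coll` is assumed to be a valid `Collection*` obtained from `CollectionManager`.
```
// Hypothetical usage of the new add() overload.
void add_examples(Collection* coll) {
    const std::string body = R"({"id": "42", "title": "Foo", "points": 10})";

    coll->add(body);                                   // CREATE (default): indexes a new doc; 409 if id "42" already exists
    coll->add(body, UPSERT);                           // UPSERT: creates the document if missing, otherwise updates it
    coll->add(body, index_operation_t::UPDATE, "42");  // UPDATE: updates an existing document; 404 if id "42" is not found
}
```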

View File

@ -23,6 +23,8 @@ bool get_export_documents(http_req& req, http_res& res);
bool post_add_document(http_req& req, http_res& res);
bool patch_update_document(http_req& req, http_res& res);
bool post_import_documents(http_req& req, http_res& res);
bool get_fetch_document(http_req& req, http_res& res);

View File

@ -127,6 +127,8 @@ public:
void put(const std::string & path, bool (*handler)(http_req & req, http_res & res), bool async_req=false, bool async_res=false);
void patch(const std::string & path, bool (*handler)(http_req & req, http_res & res), bool async_req=false, bool async_res=false);
void del(const std::string & path, bool (*handler)(http_req & req, http_res & res), bool async_req=false, bool async_res=false);
void on(const std::string & message, bool (*handler)(void*));

View File

@ -79,15 +79,29 @@ struct search_args {
};
};
enum index_operation_t {
CREATE,
UPSERT,
UPDATE,
DELETE
};
struct index_record {
size_t position; // position of record in the original request
size_t position; // position of record in the original request
uint32_t seq_id;
nlohmann::json document;
Option<bool> indexed; // indicates if the indexing operation was a success
nlohmann::json doc;
nlohmann::json old_doc;
nlohmann::json new_doc;
nlohmann::json del_doc;
index_record(size_t record_pos, uint32_t seq_id, const nlohmann::json& doc):
position(record_pos), seq_id(seq_id), document(doc), indexed(true) {
index_operation_t operation;
bool is_update;
Option<bool> indexed; // indicates if the indexing operation was a success
index_record(size_t record_pos, uint32_t seq_id, const nlohmann::json& doc, index_operation_t operation):
position(record_pos), seq_id(seq_id), doc(doc), operation(operation), is_update(false), indexed(false) {
}
@ -95,7 +109,7 @@ struct index_record {
indexed = Option<bool>(err_code, err_msg);
}
void index_success(const index_record & record) {
void index_success() {
indexed = Option<bool>(true);
}
};
@ -154,32 +168,32 @@ private:
size_t & all_result_ids_len,
const size_t typo_tokens_threshold);
void insert_doc(const uint32_t score, art_tree *t, uint32_t seq_id,
void insert_doc(const int64_t score, art_tree *t, uint32_t seq_id,
const std::unordered_map<std::string, std::vector<uint32_t>> &token_to_offsets) const;
void index_string_field(const std::string & text, const uint32_t score, art_tree *t, uint32_t seq_id,
void index_string_field(const std::string & text, const int64_t score, art_tree *t, uint32_t seq_id,
int facet_id, const field & a_field);
void index_string_array_field(const std::vector<std::string> & strings, const uint32_t score, art_tree *t,
void index_string_array_field(const std::vector<std::string> & strings, const int64_t score, art_tree *t,
uint32_t seq_id, int facet_id, const field & a_field);
void index_int32_field(const int32_t value, const uint32_t score, art_tree *t, uint32_t seq_id) const;
void index_int32_field(const int32_t value, const int64_t score, art_tree *t, uint32_t seq_id) const;
void index_int64_field(const int64_t value, const uint32_t score, art_tree *t, uint32_t seq_id) const;
void index_int64_field(const int64_t value, const int64_t score, art_tree *t, uint32_t seq_id) const;
void index_float_field(const float value, const uint32_t score, art_tree *t, uint32_t seq_id) const;
void index_float_field(const float value, const int64_t score, art_tree *t, uint32_t seq_id) const;
void index_bool_field(const bool value, const uint32_t score, art_tree *t, uint32_t seq_id) const;
void index_bool_field(const bool value, const int64_t score, art_tree *t, uint32_t seq_id) const;
void index_int32_array_field(const std::vector<int32_t> & values, const uint32_t score, art_tree *t, uint32_t seq_id) const;
void index_int32_array_field(const std::vector<int32_t> & values, const int64_t score, art_tree *t, uint32_t seq_id) const;
void index_int64_array_field(const std::vector<int64_t> & values, const uint32_t score, art_tree *t, uint32_t seq_id) const;
void index_int64_array_field(const std::vector<int64_t> & values, const int64_t score, art_tree *t, uint32_t seq_id) const;
void index_float_array_field(const std::vector<float> & values, const uint32_t score, art_tree *t, uint32_t seq_id) const;
void index_float_array_field(const std::vector<float> & values, const int64_t score, art_tree *t, uint32_t seq_id) const;
void index_bool_array_field(const std::vector<bool> & values, const uint32_t score, art_tree *t, uint32_t seq_id) const;
void index_bool_array_field(const std::vector<bool> & values, const int64_t score, art_tree *t, uint32_t seq_id) const;
void remove_and_shift_offset_index(sorted_array &offset_index, const uint32_t *indices_sorted,
void remove_and_shift_offset_index(sorted_array& offset_index, const uint32_t* indices_sorted,
const uint32_t indices_length);
uint32_t* collate_leaf_ids(const std::vector<const art_leaf *> &leaves, size_t& result_ids_len) const;
@ -238,21 +252,22 @@ public:
spp::sparse_hash_set<uint64_t>& groups_processed,
const uint32_t *result_ids, const size_t result_size);
static int32_t get_points_from_doc(const nlohmann::json &document, const std::string & default_sorting_field);
static int64_t get_points_from_doc(const nlohmann::json &document, const std::string & default_sorting_field);
Option<uint32_t> index_in_memory(const nlohmann::json & document, uint32_t seq_id,
const std::string & default_sorting_field);
const std::string & default_sorting_field, bool is_update);
static Option<uint32_t> validate_index_in_memory(const nlohmann::json &document, uint32_t seq_id,
const std::string & default_sorting_field,
const std::unordered_map<std::string, field> & search_schema,
const std::map<std::string, field> & facet_schema);
const std::map<std::string, field> & facet_schema,
bool is_update);
static size_t batch_memory_index(Index *index,
std::vector<index_record> & iter_batch,
const std::string & default_sorting_field,
const std::unordered_map<std::string, field> & search_schema,
const std::map<std::string, field> & facet_schema);
std::vector<index_record> & iter_batch,
const std::string & default_sorting_field,
const std::unordered_map<std::string, field> & search_schema,
const std::map<std::string, field> & facet_schema);
const spp::sparse_hash_map<std::string, art_tree *> &_get_search_index() const;
@ -291,5 +306,10 @@ public:
void eq_str_filter_plain(const uint32_t *strt_ids, size_t strt_ids_size,
const std::vector<art_leaf *> &query_suggestion,
uint32_t *exact_strt_ids, size_t& exact_strt_size) const;
void scrub_reindex_doc(nlohmann::json& update_doc, nlohmann::json& del_doc, nlohmann::json& old_doc);
void tokenize_doc_field(const nlohmann::json& document, const std::string& field_name, const field& search_field,
std::vector<std::string>& tokens);
};

View File

@ -8,6 +8,7 @@
#include <limits>
#include <iostream>
#include "array_base.h"
#include "logger.h"
class sorted_array: public array_base {
private:
@ -16,7 +17,15 @@ private:
uint32_t m = std::min(min, value);
uint32_t M = std::max(max, value);
uint32_t bnew = required_bits(M - m);
return METADATA_OVERHEAD + 4 + for_compressed_size_bits(new_length, bnew);
uint32_t size_bits = for_compressed_size_bits(new_length, bnew);
/*if(new_length == 15) {
LOG(INFO) << "value: " << value << ", m: " << m << ", M: " << M << ", bnew: "
<< bnew << ", size_bits: " << size_bits;
}*/
return METADATA_OVERHEAD + 4 + size_bits;
}
uint32_t lower_bound_search_bits(const uint8_t *in, uint32_t imin, uint32_t imax, uint32_t base,
@ -39,7 +48,11 @@ public:
void indexOf(const uint32_t *values, const size_t values_len, uint32_t* indices);
// returns false if malloc fails
bool append(uint32_t value);
size_t append(uint32_t value);
void remove_values(uint32_t *sorted_values, uint32_t values_length);
bool insert(size_t index, uint32_t value);
void remove_value(uint32_t value);
void remove_values(uint32_t *sorted_values, uint32_t sorted_values_length);
};

View File

@ -199,6 +199,15 @@ struct StringUtils {
return (*p == 0) && val >= std::numeric_limits<int32_t>::min() && val <= std::numeric_limits<int32_t>::max();
}
static bool is_bool(std::string &s) {
if(s.empty()) {
return false;
}
StringUtils::tolowercase(s);
return s == "true" || s == "false";
}
static void toupper(std::string& str) {
std::transform(str.begin(), str.end(), str.begin(), ::toupper);
}

View File

@ -41,6 +41,47 @@ bool array::append(uint32_t value) {
return true;
}
void array::load(const uint32_t *sorted_array, const uint32_t array_length, const uint32_t m, const uint32_t M) {
min = m;
max = M;
uint32_t size_required = (uint32_t) (unsorted_append_size_required(max, array_length) * FOR_GROWTH_FACTOR);
uint8_t *out = (uint8_t *) malloc(size_required * sizeof *out);
uint32_t actual_size = for_compress_unsorted(sorted_array, out, array_length);
free(in);
in = nullptr;
in = out;
length = array_length;
size_bytes = size_required;
length_bytes = actual_size;
}
bool array::insert(size_t index, const uint32_t* values, size_t num_values) {
if(index >= length) {
return false;
}
uint32_t *curr_array = uncompress(length+num_values);
memmove(&curr_array[index+num_values], &curr_array[index], sizeof(uint32_t)*(length-index));
uint32_t m = min, M = max;
for(size_t i=0; i<num_values; i++) {
uint32_t value = values[i];
if(value < m) m = value;
if(value > M) M = value;
curr_array[index+i] = value;
}
load(curr_array, length+num_values, m, M);
delete [] curr_array;
return true;
}
void array::remove_index(uint32_t start_index, uint32_t end_index) {
uint32_t *curr_array = uncompress();

View File

@ -1,7 +1,8 @@
#include "array_base.h"
uint32_t* array_base::uncompress() {
uint32_t *out = new uint32_t[length];
uint32_t* array_base::uncompress(uint32_t len) {
uint32_t actual_len = std::max(len, length);
uint32_t *out = new uint32_t[actual_len];
for_uncompress(in, out, length);
return out;
}

View File

@ -39,6 +39,8 @@ static void art_fuzzy_recurse(unsigned char p, unsigned char c, const art_node *
void art_int_fuzzy_recurse(art_node *n, int depth, const unsigned char* int_str, int int_str_len,
NUM_COMPARATOR comparator, std::vector<const art_leaf *> &results);
static void insert_and_shift_offset_index(sorted_array& offset_index, const uint32_t index, const uint32_t num_offsets);
bool compare_art_leaf_frequency(const art_leaf *a, const art_leaf *b) {
return a->values->ids.getLength() > b->values->ids.getLength();
}
@ -408,15 +410,42 @@ art_leaf* art_maximum(art_tree *t) {
static void add_document_to_leaf(const art_document *document, art_leaf *leaf) {
leaf->max_score = MAX(leaf->max_score, document->score);
leaf->values->ids.append(document->id);
uint32_t curr_index = leaf->values->offsets.getLength();
leaf->values->offset_index.append(curr_index);
size_t inserted_index = leaf->values->ids.append(document->id);
for(uint32_t i=0; i<document->offsets_len; i++) {
leaf->values->offsets.append(document->offsets[i]);
if(inserted_index == leaf->values->ids.getLength()-1) {
// treat as appends
uint32_t curr_index = leaf->values->offsets.getLength();
leaf->values->offset_index.append(curr_index);
for(uint32_t i=0; i<document->offsets_len; i++) {
leaf->values->offsets.append(document->offsets[i]);
}
} else {
uint32_t existing_offset_index = leaf->values->offset_index.at(inserted_index);
insert_and_shift_offset_index(leaf->values->offset_index, inserted_index, document->offsets_len);
leaf->values->offsets.insert(existing_offset_index, document->offsets, document->offsets_len);
}
}
void insert_and_shift_offset_index(sorted_array& offset_index, const uint32_t index, const uint32_t num_offsets) {
uint32_t existing_offset_index = offset_index.at(index);
uint32_t length = offset_index.getLength();
uint32_t new_length = length + 1;
uint32_t *curr_array = offset_index.uncompress(new_length);
memmove(&curr_array[index+1], &curr_array[index], sizeof(uint32_t)*(length - index));
curr_array[index] = existing_offset_index;
uint32_t curr_index = index + 1;
while(curr_index < new_length) {
curr_array[curr_index] += num_offsets;
curr_index++;
}
offset_index.load(curr_array, new_length);
delete [] curr_array;
}
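
A small worked example (numbers invented) of what insert_and_shift_offset_index does when a document id lands in the middle of a leaf's id list:
```
// Illustration only, with made-up numbers.
// offset_index before: [0, 3, 7]    -> docs at positions 0, 1, 2 start at offsets 0, 3, 7
// A new doc is inserted at position 1 with 2 offsets (num_offsets = 2):
//   insert_and_shift_offset_index(offset_index, 1, 2);
// offset_index after:  [0, 3, 5, 9] -> the old entry 3 is duplicated for the new doc,
//                                      and every later entry is shifted by num_offsets.
```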
static art_leaf* make_leaf(const unsigned char *key, uint32_t key_len, art_document *document) {
art_leaf *l = (art_leaf *) malloc(sizeof(art_leaf) + key_len);
l->values = new art_values;

View File

@ -5,7 +5,7 @@ constexpr const char* AuthManager::DOCUMENTS_SEARCH_ACTION;
Option<bool> AuthManager::init(Store *store) {
// This function must be idempotent, i.e. when called multiple times, must produce the same state without leaks
LOG(INFO) << "AuthManager::init()";
//LOG(INFO) << "AuthManager::init()";
this->store = store;
@ -157,7 +157,7 @@ bool AuthManager::authenticate(const std::string& req_api_key, const std::string
}
// enrich params with values from embedded_params
for (const auto& it: embedded_params.items()){
for(auto it = embedded_params.begin(); it != embedded_params.end(); ++it) {
if(params.count(it.key()) == 0) {
params[it.key()] = it.value();
} else if(it.key() == "filter_by") {

View File

@ -8,7 +8,6 @@
#include <art.h>
#include <thread>
#include <future>
#include <chrono>
#include <rocksdb/write_batch.h>
#include <system_metrics.h>
#include "topster.h"
@ -99,33 +98,75 @@ void Collection::increment_next_seq_id_field() {
next_seq_id++;
}
Option<uint32_t> Collection::to_doc(const std::string & json_str, nlohmann::json & document) {
Option<doc_seq_id_t> Collection::to_doc(const std::string & json_str, nlohmann::json& document,
const index_operation_t& operation, const std::string& id) {
try {
document = nlohmann::json::parse(json_str);
} catch(const std::exception& e) {
LOG(ERROR) << "JSON error: " << e.what();
return Option<uint32_t>(400, std::string("Bad JSON: ") + e.what());
return Option<doc_seq_id_t>(400, std::string("Bad JSON: ") + e.what());
}
if(!document.is_object()) {
return Option<uint32_t>(400, "Bad JSON: not a properly formed document.");
return Option<doc_seq_id_t>(400, "Bad JSON: not a properly formed document.");
}
uint32_t seq_id = get_next_seq_id();
std::string seq_id_str = std::to_string(seq_id);
if(document.count("id") != 0 && id != "" && document["id"] != id) {
return Option<doc_seq_id_t>(400, "The `id` of the resource does not match the `id` in the JSON body.");
}
if(document.count("id") == 0 && !id.empty()) {
// use the explicit ID (usually from a PUT request) if document body does not have it
document["id"] = id;
}
if(document.count("id") != 0 && document["id"] == "") {
return Option<doc_seq_id_t>(400, "The `id` should not be empty.");
}
if(document.count("id") == 0) {
document["id"] = seq_id_str;
} else if(!document["id"].is_string()) {
return Option<uint32_t>(400, "Document's `id` field should be a string.");
}
if(operation == UPDATE) {
return Option<doc_seq_id_t>(400, "For update, the `id` key must be provided.");
}
// for UPSERT or CREATE, if a document does not have an ID, we will treat it as a new doc
uint32_t seq_id = get_next_seq_id();
document["id"] = std::to_string(seq_id);
return Option<doc_seq_id_t>(doc_seq_id_t{seq_id, true});
} else {
if(!document["id"].is_string()) {
return Option<doc_seq_id_t>(400, "Document's `id` field should be a string.");
}
const std::string& doc_id = document["id"];
if(doc_exists(doc_id)) {
return Option<uint32_t>(409, std::string("A document with id ") + doc_id + " already exists.");
}
const std::string& doc_id = document["id"];
return Option<uint32_t>(seq_id);
// try to get the corresponding sequence id from disk if present
std::string seq_id_str;
StoreStatus seq_id_status = store->get(get_doc_id_key(doc_id), seq_id_str);
if(seq_id_status == StoreStatus::ERROR) {
return Option<doc_seq_id_t>(500, "Error fetching the sequence key for document with id: " + doc_id);
}
if(seq_id_status == StoreStatus::FOUND) {
if(operation == CREATE) {
return Option<doc_seq_id_t>(409, std::string("A document with id ") + doc_id + " already exists.");
}
// UPSERT or UPDATE
uint32_t seq_id = (uint32_t) std::stoul(seq_id_str);
return Option<doc_seq_id_t>(doc_seq_id_t{seq_id, false});
} else {
if(operation == UPDATE) {
// for UPDATE, a document with given ID must be found
return Option<doc_seq_id_t>(404, "Could not find a document with id: " + doc_id);
} else {
// for UPSERT or CREATE, if a document with given ID is not found, we will treat it as a new doc
uint32_t seq_id = get_next_seq_id();
return Option<doc_seq_id_t>(doc_seq_id_t{seq_id, true});
}
}
}
}
nlohmann::json Collection::get_summary_json() {
@ -152,45 +193,48 @@ nlohmann::json Collection::get_summary_json() {
return json_response;
}
Option<nlohmann::json> Collection::add(const std::string & json_str) {
Option<nlohmann::json> Collection::add(const std::string & json_str,
const index_operation_t& operation, const std::string& id) {
nlohmann::json document;
Option<uint32_t> doc_seq_id_op = to_doc(json_str, document);
std::vector<std::string> json_lines = {json_str};
const nlohmann::json& res = add_many(json_lines, document, operation, id);
if(!doc_seq_id_op.ok()) {
return Option<nlohmann::json>(doc_seq_id_op.code(), doc_seq_id_op.error());
}
if(!res["success"].get<bool>()) {
nlohmann::json res_doc;
/*if(is_exceeding_memory_threshold()) {
return Option<nlohmann::json>(403, "Max memory ratio exceeded.");
}*/
try {
res_doc = nlohmann::json::parse(json_lines[0]);
} catch(const std::exception& e) {
LOG(ERROR) << "JSON error: " << e.what();
return Option<nlohmann::json>(400, std::string("Bad JSON: ") + e.what());
}
const uint32_t seq_id = doc_seq_id_op.get();
const std::string seq_id_str = std::to_string(seq_id);
const Option<uint32_t> & index_memory_op = index_in_memory(document, seq_id);
if(!index_memory_op.ok()) {
return Option<nlohmann::json>(index_memory_op.code(), index_memory_op.error());
}
const std::string& serialized_json = document.dump(-1, ' ', false, nlohmann::detail::error_handler_t::ignore);
rocksdb::WriteBatch batch;
batch.Put(get_doc_id_key(document["id"]), seq_id_str);
batch.Put(get_seq_id_key(seq_id), serialized_json);
bool write_ok = store->batch_write(batch);
if(!write_ok) {
remove_document(document, seq_id, false); // remove from in-memory store too
return Option<nlohmann::json>(500, "Could not write to on-disk storage.");
return Option<nlohmann::json>(res_doc["code"].get<size_t>(), res_doc["error"].get<std::string>());
}
return Option<nlohmann::json>(document);
}
nlohmann::json Collection::add_many(std::vector<std::string>& json_lines) {
//LOG(INFO) << "Memory ratio. Max = " << max_memory_ratio << ", Used = " << SystemMetrics::used_memory_ratio();
void Collection::get_doc_changes(const nlohmann::json &document, nlohmann::json &old_doc,
nlohmann::json &new_doc, nlohmann::json &del_doc) {
for(auto it = old_doc.begin(); it != old_doc.end(); ++it) {
new_doc[it.key()] = it.value();
}
for(auto it = document.begin(); it != document.end(); ++it) {
new_doc[it.key()] = it.value();
if(old_doc.count(it.key()) != 0) {
// key exists in the stored doc, so it must be reindexed
// we need to check for this because a field can be optional
del_doc[it.key()] = old_doc[it.key()];
}
}
}
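
A worked example (documents invented) of what get_doc_changes produces for a partial update:
```
// Illustration only, with made-up documents.
// old_doc (stored):        {"id": "42", "title": "Foo", "points": 10}
// document (update body):  {"id": "42", "points": 25}
//
// new_doc: {"id": "42", "title": "Foo", "points": 25}   // old_doc overlaid with the update body
// del_doc: {"id": "42", "points": 10}                   // keys being replaced, with their old values,
//                                                       // so they can be removed from the index first
```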
nlohmann::json Collection::add_many(std::vector<std::string>& json_lines, nlohmann::json& document,
const index_operation_t& operation, const std::string& id) {
//LOG(INFO) << "Memory ratio. Max = " << max_memory_ratio << ", Used = " << SystemMetrics::used_memory_ratio();
std::vector<std::vector<index_record>> iter_batch;
for(size_t i = 0; i < num_memory_shards; i++) {
@ -203,16 +247,23 @@ nlohmann::json Collection::add_many(std::vector<std::string>& json_lines) {
for(size_t i=0; i < json_lines.size(); i++) {
const std::string & json_line = json_lines[i];
nlohmann::json document;
Option<uint32_t> doc_seq_id_op = to_doc(json_line, document);
Option<doc_seq_id_t> doc_seq_id_op = to_doc(json_line, document, operation, id);
const uint32_t seq_id = doc_seq_id_op.ok() ? doc_seq_id_op.get() : 0;
index_record record(i, seq_id, document);
const uint32_t seq_id = doc_seq_id_op.ok() ? doc_seq_id_op.get().seq_id : 0;
index_record record(i, seq_id, document, operation);
// NOTE: we overwrite the input json_lines with result to avoid memory pressure
record.is_update = false;
if(!doc_seq_id_op.ok()) {
record.index_failure(doc_seq_id_op.code(), doc_seq_id_op.error());
} else {
record.is_update = !doc_seq_id_op.get().is_new;
if(record.is_update) {
get_document_from_store(get_seq_id_key(seq_id), record.old_doc);
get_doc_changes(document, record.old_doc, record.new_doc, record.del_doc);
}
}
/*
@ -261,45 +312,74 @@ void Collection::batch_index(std::vector<std::vector<index_record>> &index_batch
// store only documents that were indexed in-memory successfully
for(auto& index_batch: index_batches) {
for(auto& index_record: index_batch) {
nlohmann::json res;
if(index_record.indexed.ok()) {
const std::string& seq_id_str = std::to_string(index_record.seq_id);
const std::string& serialized_json = index_record.document.dump(-1, ' ', false,
nlohmann::detail::error_handler_t::ignore);
if(index_record.is_update) {
const std::string& serialized_json = index_record.new_doc.dump(-1, ' ', false, nlohmann::detail::error_handler_t::ignore);
bool write_ok = store->insert(get_seq_id_key(index_record.seq_id), serialized_json);
rocksdb::WriteBatch batch;
batch.Put(get_doc_id_key(index_record.document["id"]), seq_id_str);
batch.Put(get_seq_id_key(index_record.seq_id), serialized_json);
bool write_ok = store->batch_write(batch);
if(!write_ok) {
// we will attempt to reindex the old doc on a best-effort basis
remove_document(index_record.new_doc, index_record.seq_id, false);
index_in_memory(index_record.old_doc, index_record.seq_id, false);
index_record.index_failure(500, "Could not write to on-disk storage.");
} else {
num_indexed++;
index_record.index_success();
}
if(!write_ok) {
index_record.indexed = Option<bool>(500, "Could not write to on-disk storage.");
// remove from in-memory store to keep the state synced
remove_document(index_record.document, index_record.seq_id, false);
} else {
const std::string& seq_id_str = std::to_string(index_record.seq_id);
const std::string& serialized_json = index_record.doc.dump(-1, ' ', false,
nlohmann::detail::error_handler_t::ignore);
rocksdb::WriteBatch batch;
batch.Put(get_doc_id_key(index_record.doc["id"]), seq_id_str);
batch.Put(get_seq_id_key(index_record.seq_id), serialized_json);
bool write_ok = store->batch_write(batch);
if(!write_ok) {
// remove from in-memory store to keep the state synced
remove_document(index_record.doc, index_record.seq_id, false);
index_record.index_failure(500, "Could not write to on-disk storage.");
} else {
num_indexed++;
index_record.index_success();
}
}
json_out[index_record.position] = R"({"success": true})";
num_indexed++;
res["success"] = index_record.indexed.ok();
if(!index_record.indexed.ok()) {
res["document"] = json_out[index_record.position];
res["error"] = index_record.indexed.error();
res["code"] = index_record.indexed.code();
}
} else {
nlohmann::json res;
res["success"] = false;
res["error"] = index_record.indexed.error();
res["document"] = json_out[index_record.position];
json_out[index_record.position] = res.dump();
res["error"] = index_record.indexed.error();
res["code"] = index_record.indexed.code();
}
json_out[index_record.position] = res.dump();
}
}
}
Option<uint32_t> Collection::index_in_memory(const nlohmann::json &document, uint32_t seq_id) {
Option<uint32_t> validation_op = Index::validate_index_in_memory(document, seq_id, default_sorting_field,
search_schema, facet_schema);
Option<uint32_t> Collection::index_in_memory(const nlohmann::json &document, uint32_t seq_id, bool is_update) {
if(!is_update) {
// for update, validation should be done prior
Option<uint32_t> validation_op = Index::validate_index_in_memory(document, seq_id, default_sorting_field,
search_schema, facet_schema, is_update);
if(!validation_op.ok()) {
return validation_op;
if(!validation_op.ok()) {
return validation_op;
}
}
Index* index = indices[seq_id % num_memory_shards];
index->index_in_memory(document, seq_id, default_sorting_field);
index->index_in_memory(document, seq_id, default_sorting_field, is_update);
num_documents += 1;
return Option<>(200);
@ -418,6 +498,7 @@ Option<nlohmann::json> Collection::search(const std::string & query, const std::
const size_t max_facet_values,
const std::string & simple_facet_query,
const size_t snippet_threshold,
const size_t highlight_affix_num_tokens,
const std::string & highlight_full_fields,
size_t typo_tokens_threshold,
const std::map<size_t, std::vector<std::string>>& pinned_hits,
@ -992,7 +1073,8 @@ Option<nlohmann::json> Collection::search(const std::string & query, const std::
bool highlighted_fully = (fields_highlighted_fully.find(field_name) != fields_highlighted_fully.end());
highlight_t highlight;
highlight_result(search_field, searched_queries, field_order_kv, document,
string_utils, snippet_threshold, highlighted_fully, highlight);
string_utils, snippet_threshold, highlight_affix_num_tokens,
highlighted_fully, highlight);
if(!highlight.snippets.empty()) {
highlights.push_back(highlight);
@ -1238,7 +1320,9 @@ void Collection::facet_value_to_string(const facet &a_facet, const facet_count_t
void Collection::highlight_result(const field &search_field,
const std::vector<std::vector<art_leaf *>> &searched_queries,
const KV* field_order_kv, const nlohmann::json & document,
StringUtils & string_utils, size_t snippet_threshold,
StringUtils & string_utils,
const size_t snippet_threshold,
const size_t highlight_affix_num_tokens,
bool highlighted_fully,
highlight_t & highlight) {
@ -1316,6 +1400,10 @@ void Collection::highlight_result(const field &search_field,
if(match.offsets[i].offset != MAX_DISPLACEMENT) {
size_t token_index = (size_t)(match.offsets[i].offset);
token_indices.push_back(token_index);
if(token_index >= tokens.size()) {
LOG(ERROR) << "Highlight token index " << token_index << " is greater than length of store field.";
continue;
}
std::string token = tokens[token_index];
string_utils.unicode_normalize(token);
token_hits.insert(token);
@ -1324,12 +1412,15 @@ void Collection::highlight_result(const field &search_field,
auto minmax = std::minmax_element(token_indices.begin(), token_indices.end());
size_t prefix_length = highlight_affix_num_tokens;
size_t suffix_length = highlight_affix_num_tokens + 1;
// For longer strings, pick surrounding tokens within `highlight_affix_num_tokens` tokens of min_index and max_index for the snippet
const size_t start_index = (tokens.size() <= snippet_threshold) ? 0 :
std::max(0, (int)(*(minmax.first) - 4));
std::max(0, (int)(*(minmax.first) - prefix_length));
const size_t end_index = (tokens.size() <= snippet_threshold) ? tokens.size() :
std::min((int)tokens.size(), (int)(*(minmax.second) + 5));
std::min((int)tokens.size(), (int)(*(minmax.second) + suffix_length));
std::stringstream snippet_stream;
for(size_t snippet_index = start_index; snippet_index < end_index; snippet_index++) {
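
A worked example (numbers invented) of how the new `highlight_affix_num_tokens` parameter shapes the snippet window, given the computation above:
```
// Illustration only, with invented numbers:
// tokens.size() = 100 (> snippet_threshold of 30), highlight_affix_num_tokens = 4,
// matched token indices span 10..12.
// prefix_length = 4, suffix_length = 5
// start_index = max(0, 10 - 4)   = 6
// end_index   = min(100, 12 + 5) = 17
// The snippet therefore covers tokens [6, 17): four tokens of context on each side of the match.
```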
@ -1401,7 +1492,7 @@ Option<nlohmann::json> Collection::get(const std::string & id) {
return Option<nlohmann::json>(500, "Error while fetching the document.");
}
uint32_t seq_id = (uint32_t) std::stol(seq_id_str);
uint32_t seq_id = (uint32_t) std::stoul(seq_id_str);
std::string parsed_document;
StoreStatus doc_status = store->get(get_seq_id_key(seq_id), parsed_document);
@ -1450,7 +1541,7 @@ Option<std::string> Collection::remove(const std::string & id, const bool remove
return Option<std::string>(500, "Error while fetching the document.");
}
uint32_t seq_id = (uint32_t) std::stol(seq_id_str);
uint32_t seq_id = (uint32_t) std::stoul(seq_id_str);
std::string parsed_document;
StoreStatus doc_status = store->get(get_seq_id_key(seq_id), parsed_document);

View File

@ -173,7 +173,7 @@ Option<bool> CollectionManager::load(const size_t init_batch_size) {
}
num_valid_docs++;
iter_batch[seq_id % collection->get_num_memory_shards()].emplace_back(index_record(0, seq_id, document));
iter_batch[seq_id % collection->get_num_memory_shards()].emplace_back(index_record(0, seq_id, document, CREATE));
// Peek and check for last record right here so that we handle batched indexing correctly
// Without doing this, the "last batch" would have to be indexed outside the loop.
@ -195,7 +195,7 @@ Option<bool> CollectionManager::load(const size_t init_batch_size) {
if(num_indexed != num_records) {
const Option<std::string> & index_error_op = get_first_index_error(iter_batch[i]);
if(index_error_op.ok()) {
if(!index_error_op.ok()) {
return Option<bool>(false, index_error_op.get());
}
}

View File

@ -27,6 +27,18 @@ bool handle_authentication(std::map<std::string, std::string>& req_params, const
return collectionManager.auth_key_matches(auth_key, rpath.action, collection, req_params);
}
index_operation_t get_index_operation(const std::string& action) {
if(action == "create") {
return CREATE;
} else if(action == "update") {
return UPDATE;
} else if(action == "upsert") {
return UPSERT;
}
return CREATE;
}
bool get_collections(http_req & req, http_res & res) {
CollectionManager & collectionManager = CollectionManager::get_instance();
std::vector<Collection*> collections = collectionManager.get_collections();
@ -254,6 +266,9 @@ bool get_search(http_req & req, http_res & res) {
// strings under this length will be fully highlighted, instead of showing a snippet of the relevant portion
const char *SNIPPET_THRESHOLD = "snippet_threshold";
// the number of tokens that should surround the highlighted text
const char *HIGHLIGHT_AFFIX_NUM_TOKENS = "highlight_affix_num_tokens";
// list of fields which will be highlighted fully without snippeting
const char *HIGHLIGHT_FULL_FIELDS = "highlight_full_fields";
@ -290,6 +305,10 @@ bool get_search(http_req & req, http_res & res) {
req.params[SNIPPET_THRESHOLD] = "30";
}
if(req.params.count(HIGHLIGHT_AFFIX_NUM_TOKENS) == 0) {
req.params[HIGHLIGHT_AFFIX_NUM_TOKENS] = "4";
}
if(req.params.count(HIGHLIGHT_FULL_FIELDS) == 0) {
req.params[HIGHLIGHT_FULL_FIELDS] = "";
}
@ -362,6 +381,11 @@ bool get_search(http_req & req, http_res & res) {
return false;
}
if(!StringUtils::is_uint32_t(req.params[HIGHLIGHT_AFFIX_NUM_TOKENS])) {
res.set_400("Parameter `" + std::string(HIGHLIGHT_AFFIX_NUM_TOKENS) + "` must be an unsigned integer.");
return false;
}
if(!StringUtils::is_uint32_t(req.params[GROUP_LIMIT])) {
res.set_400("Parameter `" + std::string(GROUP_LIMIT) + "` must be an unsigned integer.");
return false;
@ -474,6 +498,7 @@ bool get_search(http_req & req, http_res & res) {
static_cast<size_t>(std::stol(req.params[MAX_FACET_VALUES])),
req.params[FACET_QUERY],
static_cast<size_t>(std::stol(req.params[SNIPPET_THRESHOLD])),
static_cast<size_t>(std::stol(req.params[HIGHLIGHT_AFFIX_NUM_TOKENS])),
req.params[HIGHLIGHT_FULL_FIELDS],
typo_tokens_threshold,
pinned_hits,
@ -579,11 +604,16 @@ bool post_import_documents(http_req& req, http_res& res) {
//LOG(INFO) << "post_import_documents";
//LOG(INFO) << "req.first_chunk=" << req.first_chunk_aggregate << ", last_chunk=" << req.last_chunk_aggregate;
const char *BATCH_SIZE = "batch_size";
const char *ACTION = "action";
if(req.params.count(BATCH_SIZE) == 0) {
req.params[BATCH_SIZE] = "40";
}
if(req.params.count(ACTION) == 0) {
req.params[ACTION] = "create";
}
if(!StringUtils::is_uint32_t(req.params[BATCH_SIZE])) {
req.last_chunk_aggregate = true;
res.final = true;
@ -592,6 +622,14 @@ bool post_import_documents(http_req& req, http_res& res) {
return false;
}
if(req.params[ACTION] != "create" && req.params[ACTION] != "update" && req.params[ACTION] != "upsert") {
req.last_chunk_aggregate = true;
res.final = true;
res.set_400("Parameter `" + std::string(ACTION) + "` must be a create|update|upsert.");
HttpServer::stream_response(req, res);
return false;
}
const size_t IMPORT_BATCH_SIZE = std::stoi(req.params[BATCH_SIZE]);
if(IMPORT_BATCH_SIZE == 0) {
@ -667,8 +705,11 @@ bool post_import_documents(http_req& req, http_res& res) {
//LOG(INFO) << "single_partial_record_body: " << single_partial_record_body;
const index_operation_t operation = get_index_operation(req.params[ACTION]);
if(!single_partial_record_body) {
nlohmann::json json_res = collection->add_many(json_lines);
nlohmann::json document;
nlohmann::json json_res = collection->add_many(json_lines, document, operation);
//const std::string& import_summary_json = json_res.dump();
//response_stream << import_summary_json << "\n";
@ -698,6 +739,16 @@ bool post_import_documents(http_req& req, http_res& res) {
}
bool post_add_document(http_req & req, http_res & res) {
const char *ACTION = "action";
if(req.params.count(ACTION) == 0) {
req.params[ACTION] = "create";
}
if(req.params[ACTION] != "create" && req.params[ACTION] != "update" && req.params[ACTION] != "upsert") {
res.set_400("Parameter `" + std::string(ACTION) + "` must be a create|update|upsert.");
return false;
}
CollectionManager & collectionManager = CollectionManager::get_instance();
Collection* collection = collectionManager.get_collection(req.params["collection"]);
@ -706,7 +757,8 @@ bool post_add_document(http_req & req, http_res & res) {
return false;
}
Option<nlohmann::json> inserted_doc_op = collection->add(req.body);
const index_operation_t operation = get_index_operation(req.params[ACTION]);
Option<nlohmann::json> inserted_doc_op = collection->add(req.body, operation);
if(!inserted_doc_op.ok()) {
res.set(inserted_doc_op.code(), inserted_doc_op.error());
@ -717,6 +769,28 @@ bool post_add_document(http_req & req, http_res & res) {
return true;
}
bool patch_update_document(http_req & req, http_res & res) {
std::string doc_id = req.params["id"];
CollectionManager & collectionManager = CollectionManager::get_instance();
Collection* collection = collectionManager.get_collection(req.params["collection"]);
if(collection == nullptr) {
res.set_404();
return false;
}
Option<nlohmann::json> upserted_doc_op = collection->add(req.body, index_operation_t::UPDATE, doc_id);
if(!upserted_doc_op.ok()) {
res.set(upserted_doc_op.code(), upserted_doc_op.error());
return false;
}
res.set_201(upserted_doc_op.get().dump());
return true;
}
bool get_fetch_document(http_req & req, http_res & res) {
std::string doc_id = req.params["id"];
@ -1044,7 +1118,7 @@ bool get_key(http_req &req, http_res &res) {
AuthManager &auth_manager = collectionManager.getAuthManager();
const std::string& key_id_str = req.params["id"];
uint32_t key_id = (uint32_t) std::stol(key_id_str);
uint32_t key_id = (uint32_t) std::stoul(key_id_str);
const Option<api_key_t>& key_op = auth_manager.get_key(key_id);
@ -1066,7 +1140,7 @@ bool del_key(http_req &req, http_res &res) {
AuthManager &auth_manager = collectionManager.getAuthManager();
const std::string& key_id_str = req.params["id"];
uint32_t key_id = (uint32_t) std::stol(key_id_str);
uint32_t key_id = (uint32_t) std::stoul(key_id_str);
const Option<api_key_t> &del_op = auth_manager.remove_key(key_id);

View File

@ -129,6 +129,7 @@ int HttpServer::create_listener() {
ctx.globalconf->server_name = h2o_strdup(nullptr, "", SIZE_MAX);
ctx.globalconf->http2.active_stream_window_size = ACTIVE_STREAM_WINDOW_SIZE;
ctx.globalconf->http2.idle_timeout = REQ_TIMEOUT_MS;
ctx.globalconf->max_request_entity_size = (1024 * 1024 * 1024); // 1 GB
ctx.globalconf->http1.req_timeout = REQ_TIMEOUT_MS;
ctx.globalconf->http1.req_io_timeout = REQ_TIMEOUT_MS;
@ -705,6 +706,13 @@ void HttpServer::put(const std::string & path, bool (*handler)(http_req &, http_
routes.emplace_back(rpath.route_hash(), rpath);
}
void HttpServer::patch(const std::string & path, bool (*handler)(http_req &, http_res &), bool async_req, bool async_res) {
std::vector<std::string> path_parts;
StringUtils::split(path, path_parts, "/");
route_path rpath("PATCH", path_parts, handler, async_req, async_res);
routes.emplace_back(rpath.route_hash(), rpath);
}
void HttpServer::del(const std::string & path, bool (*handler)(http_req &, http_res &), bool async_req, bool async_res) {
std::vector<std::string> path_parts;
StringUtils::split(path, path_parts, "/");

View File

@ -56,8 +56,8 @@ Index::~Index() {
sort_index.clear();
}
int32_t Index::get_points_from_doc(const nlohmann::json &document, const std::string & default_sorting_field) {
int32_t points = 0;
int64_t Index::get_points_from_doc(const nlohmann::json &document, const std::string & default_sorting_field) {
int64_t points = 0;
if(!default_sorting_field.empty()) {
if(document[default_sorting_field].is_number_float()) {
@ -85,8 +85,15 @@ int64_t Index::float_to_in64_t(float f) {
}
Option<uint32_t> Index::index_in_memory(const nlohmann::json &document, uint32_t seq_id,
const std::string & default_sorting_field) {
int32_t points = get_points_from_doc(document, default_sorting_field);
const std::string & default_sorting_field, bool is_update) {
int64_t points = 0;
if(is_update && document.count(default_sorting_field) == 0) {
points = sort_index[default_sorting_field]->at(seq_id);
} else {
points = get_points_from_doc(document, default_sorting_field);
}
std::unordered_map<std::string, size_t> facet_to_id;
size_t i_facet = 0;
@ -104,7 +111,7 @@ Option<uint32_t> Index::index_in_memory(const nlohmann::json &document, uint32_t
for(const std::pair<std::string, field> & field_pair: search_schema) {
const std::string & field_name = field_pair.first;
if(field_pair.second.optional && document.count(field_name) == 0) {
if((field_pair.second.optional || is_update) && document.count(field_name) == 0) {
continue;
}
@ -212,17 +219,22 @@ Option<uint32_t> Index::index_in_memory(const nlohmann::json &document, uint32_t
Option<uint32_t> Index::validate_index_in_memory(const nlohmann::json &document, uint32_t seq_id,
const std::string & default_sorting_field,
const std::unordered_map<std::string, field> & search_schema,
const std::map<std::string, field> & facet_schema) {
if(document.count(default_sorting_field) == 0) {
const std::map<std::string, field> & facet_schema,
bool is_update) {
bool has_default_sort_field = (document.count(default_sorting_field) != 0);
if(!has_default_sort_field && !is_update) {
return Option<>(400, "Field `" + default_sorting_field + "` has been declared as a default sorting field, "
"but is not found in the document.");
}
if(!document[default_sorting_field].is_number_integer() && !document[default_sorting_field].is_number_float()) {
if(has_default_sort_field &&
!document[default_sorting_field].is_number_integer() && !document[default_sorting_field].is_number_float()) {
return Option<>(400, "Default sorting field `" + default_sorting_field + "` must be a single valued numerical field.");
}
if(search_schema.at(default_sorting_field).is_single_float() &&
if(has_default_sort_field && search_schema.at(default_sorting_field).is_single_float() &&
document[default_sorting_field].get<float>() > std::numeric_limits<float>::max()) {
return Option<>(400, "Default sorting field `" + default_sorting_field + "` exceeds maximum value of a float.");
}
@ -230,7 +242,7 @@ Option<uint32_t> Index::validate_index_in_memory(const nlohmann::json &document,
for(const std::pair<std::string, field> & field_pair: search_schema) {
const std::string & field_name = field_pair.first;
if(field_pair.second.optional && document.count(field_name) == 0) {
if((field_pair.second.optional || is_update) && document.count(field_name) == 0) {
continue;
}
@ -309,6 +321,48 @@ Option<uint32_t> Index::validate_index_in_memory(const nlohmann::json &document,
return Option<>(200);
}
void Index::scrub_reindex_doc(nlohmann::json& update_doc, nlohmann::json& del_doc, nlohmann::json& old_doc) {
auto it = del_doc.cbegin();
while(it != del_doc.cend()) {
const std::string& field_name = it.key();
const auto& search_field_it = search_schema.find(field_name);
if(search_field_it == search_schema.end()) {
++it;
continue;
}
const auto& search_field = search_field_it->second;
// Go through all the field names and find the keys+values so that they can be removed from in-memory index
std::vector<std::string> reindex_tokens;
std::vector<std::string> old_tokens;
tokenize_doc_field(update_doc, field_name, search_field, reindex_tokens);
tokenize_doc_field(old_doc, field_name, search_field, old_tokens);
if(old_tokens.size() != reindex_tokens.size()) {
++it;
continue;
}
bool exact_match = true;
for(size_t i=0; i<reindex_tokens.size(); i++) {
const std::string& reindex_val = reindex_tokens[i];
const std::string& old_val = old_tokens[i];
if(reindex_val != old_val) {
exact_match = false;
break;
}
}
if(exact_match) {
it = del_doc.erase(it);
} else {
++it;
}
}
}
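
A small illustration (documents invented) of what scrub_reindex_doc keeps in del_doc before the removal pass:
```
// Illustration only, with made-up documents.
// update_doc: {"title": "Foo Bar", "points": 25}
// old_doc:    {"title": "Foo Bar", "points": 10}
// del_doc in: {"title": "Foo Bar", "points": 10}
//
// "title" tokenizes identically in update_doc and old_doc, so it is erased from del_doc.
// "points" differs, so it stays.
// del_doc out: {"points": 10}  -- only fields whose values actually changed are removed and reindexed.
```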
size_t Index::batch_memory_index(Index *index, std::vector<index_record> & iter_batch,
const std::string & default_sorting_field,
const std::unordered_map<std::string, field> & search_schema,
@ -322,29 +376,42 @@ size_t Index::batch_memory_index(Index *index, std::vector<index_record> & iter_
continue;
}
Option<uint32_t> validation_op = validate_index_in_memory(index_rec.document, index_rec.seq_id,
default_sorting_field,
search_schema, facet_schema);
if(index_rec.operation != DELETE) {
Option<uint32_t> validation_op = validate_index_in_memory(index_rec.doc, index_rec.seq_id,
default_sorting_field,
search_schema, facet_schema, index_rec.is_update);
if(!validation_op.ok()) {
index_rec.index_failure(validation_op.code(), validation_op.error());
continue;
if(!validation_op.ok()) {
index_rec.index_failure(validation_op.code(), validation_op.error());
continue;
}
if(index_rec.is_update) {
// scrub string fields to reduce delete ops
index->scrub_reindex_doc(index_rec.doc, index_rec.del_doc, index_rec.old_doc);
index->remove(index_rec.seq_id, index_rec.del_doc);
}
Option<uint32_t> index_mem_op = index->index_in_memory(index_rec.doc, index_rec.seq_id,
default_sorting_field, index_rec.is_update);
if(!index_mem_op.ok()) {
index->index_in_memory(index_rec.del_doc, index_rec.seq_id, default_sorting_field, true);
index_rec.index_failure(index_mem_op.code(), index_mem_op.error());
continue;
}
index_rec.index_success();
if(!index_rec.is_update) {
num_indexed++;
}
}
Option<uint32_t> index_mem_op = index->index_in_memory(index_rec.document, index_rec.seq_id, default_sorting_field);
if(!index_mem_op.ok()) {
index_rec.index_failure(index_mem_op.code(), index_mem_op.error());
continue;
}
index_rec.index_success(index_rec);
num_indexed++;
}
return num_indexed;
}
void Index::insert_doc(const uint32_t score, art_tree *t, uint32_t seq_id,
void Index::insert_doc(const int64_t score, art_tree *t, uint32_t seq_id,
const std::unordered_map<std::string, std::vector<uint32_t>> &token_to_offsets) const {
for(auto & kv: token_to_offsets) {
art_document art_doc;
@ -369,13 +436,14 @@ void Index::insert_doc(const uint32_t score, art_tree *t, uint32_t seq_id,
art_doc.offsets[i] = kv.second[i];
}
//LOG(INFO) << "key: " << key << ", art_doc.id: " << art_doc.id;
art_insert(t, key, key_len, &art_doc, num_hits);
delete [] art_doc.offsets;
art_doc.offsets = nullptr;
}
}
void Index::index_int32_field(const int32_t value, uint32_t score, art_tree *t, uint32_t seq_id) const {
void Index::index_int32_field(const int32_t value, int64_t score, art_tree *t, uint32_t seq_id) const {
const int KEY_LEN = 8;
unsigned char key[KEY_LEN];
@ -398,7 +466,7 @@ void Index::index_int32_field(const int32_t value, uint32_t score, art_tree *t,
art_insert(t, key, KEY_LEN, &art_doc, num_hits);
}
void Index::index_int64_field(const int64_t value, uint32_t score, art_tree *t, uint32_t seq_id) const {
void Index::index_int64_field(const int64_t value, int64_t score, art_tree *t, uint32_t seq_id) const {
const int KEY_LEN = 8;
unsigned char key[KEY_LEN];
@ -421,7 +489,7 @@ void Index::index_int64_field(const int64_t value, uint32_t score, art_tree *t,
art_insert(t, key, KEY_LEN, &art_doc, num_hits);
}
void Index::index_bool_field(const bool value, const uint32_t score, art_tree *t, uint32_t seq_id) const {
void Index::index_bool_field(const bool value, const int64_t score, art_tree *t, uint32_t seq_id) const {
const int KEY_LEN = 1;
unsigned char key[KEY_LEN];
key[0] = value ? '1' : '0';
@ -443,7 +511,7 @@ void Index::index_bool_field(const bool value, const uint32_t score, art_tree *t
art_insert(t, key, KEY_LEN, &art_doc, num_hits);
}
void Index::index_float_field(const float value, uint32_t score, art_tree *t, uint32_t seq_id) const {
void Index::index_float_field(const float value, int64_t score, art_tree *t, uint32_t seq_id) const {
const int KEY_LEN = 8;
unsigned char key[KEY_LEN];
@ -484,7 +552,7 @@ uint64_t Index::facet_token_hash(const field & a_field, const std::string &token
return hash;
}
void Index::index_string_field(const std::string & text, const uint32_t score, art_tree *t,
void Index::index_string_field(const std::string & text, const int64_t score, art_tree *t,
uint32_t seq_id, int facet_id, const field & a_field) {
std::vector<std::string> tokens;
StringUtils::split(text, tokens, " ");
@ -506,6 +574,10 @@ void Index::index_string_field(const std::string & text, const uint32_t score, a
token_to_offsets[token].push_back(i);
}
/*if(seq_id == 0) {
LOG(INFO) << "field name: " << a_field.name;
}*/
insert_doc(score, t, seq_id, token_to_offsets);
if(facet_id >= 0) {
@ -513,7 +585,7 @@ void Index::index_string_field(const std::string & text, const uint32_t score, a
}
}
void Index::index_string_array_field(const std::vector<std::string> & strings, const uint32_t score, art_tree *t,
void Index::index_string_array_field(const std::vector<std::string> & strings, const int64_t score, art_tree *t,
uint32_t seq_id, int facet_id, const field & a_field) {
std::unordered_map<std::string, std::vector<uint32_t>> token_positions;
@ -565,28 +637,28 @@ void Index::index_string_array_field(const std::vector<std::string> & strings, c
insert_doc(score, t, seq_id, token_positions);
}
void Index::index_int32_array_field(const std::vector<int32_t> & values, const uint32_t score, art_tree *t,
void Index::index_int32_array_field(const std::vector<int32_t> & values, const int64_t score, art_tree *t,
uint32_t seq_id) const {
for(const int32_t value: values) {
index_int32_field(value, score, t, seq_id);
}
}
void Index::index_int64_array_field(const std::vector<int64_t> & values, const uint32_t score, art_tree *t,
void Index::index_int64_array_field(const std::vector<int64_t> & values, const int64_t score, art_tree *t,
uint32_t seq_id) const {
for(const int64_t value: values) {
index_int64_field(value, score, t, seq_id);
}
}
void Index::index_bool_array_field(const std::vector<bool> & values, const uint32_t score, art_tree *t,
void Index::index_bool_array_field(const std::vector<bool> & values, const int64_t score, art_tree *t,
uint32_t seq_id) const {
for(const bool value: values) {
index_bool_field(value, score, t, seq_id);
}
}
void Index::index_float_array_field(const std::vector<float> & values, const uint32_t score, art_tree *t,
void Index::index_float_array_field(const std::vector<float> & values, const int64_t score, art_tree *t,
uint32_t seq_id) const {
for(const float value: values) {
index_float_field(value, score, t, seq_id);
@ -996,7 +1068,7 @@ Option<uint32_t> Index::do_filtering(uint32_t** filter_ids_out, const std::vecto
bool found_filter = false;
if(!f.is_array()) {
found_filter = (str_tokens.size() == fvalues.size());
found_filter = (query_suggestion.size() == fvalues.size());
} else {
uint64_t filter_hash = 1;
@ -1712,6 +1784,11 @@ void Index::populate_token_positions(const std::vector<art_leaf *>& query_sugges
// a) last element is array_index b) second and third last elements will be largest offset
// (last element is repeated to indicate end of offsets for a given array index)
/*uint32_t* offsets = token_leaf->values->offsets.uncompress();
for(size_t ii=0; ii < token_leaf->values->offsets.getLength(); ii++) {
LOG(INFO) << "offset: " << offsets[ii];
}*/
uint32_t start_offset = token_leaf->values->offset_index.at(doc_index);
uint32_t end_offset = (doc_index == token_leaf->values->ids.getLength() - 1) ?
token_leaf->values->offsets.getLength() :
@ -1767,8 +1844,8 @@ inline std::vector<art_leaf *> Index::next_suggestion(const std::vector<token_ca
return query_suggestion;
}
void Index::remove_and_shift_offset_index(sorted_array &offset_index, const uint32_t *indices_sorted,
const uint32_t indices_length) {
void Index::remove_and_shift_offset_index(sorted_array& offset_index, const uint32_t* indices_sorted,
const uint32_t indices_length) {
uint32_t *curr_array = offset_index.uncompress();
uint32_t *new_array = new uint32_t[offset_index.getLength()];
@ -1801,83 +1878,27 @@ void Index::remove_and_shift_offset_index(sorted_array &offset_index, const uint
}
Option<uint32_t> Index::remove(const uint32_t seq_id, const nlohmann::json & document) {
for(auto & name_field: search_schema) {
if(name_field.second.optional && document.count(name_field.first) == 0) {
std::unordered_map<std::string, size_t> facet_to_index;
get_facet_to_index(facet_to_index);
for(auto it = document.begin(); it != document.end(); ++it) {
const std::string& field_name = it.key();
const auto& search_field_it = search_schema.find(field_name);
if(search_field_it == search_schema.end()) {
continue;
}
const auto& search_field = search_field_it->second;
// Go through all the field names and find the keys+values so that they can be removed from in-memory index
std::vector<std::string> tokens;
if(name_field.second.type == field_types::STRING) {
StringUtils::split(document[name_field.first], tokens, " ");
} else if(name_field.second.type == field_types::STRING_ARRAY) {
std::vector<std::string> values = document[name_field.first].get<std::vector<std::string>>();
for(const std::string & value: values) {
StringUtils::split(value, tokens, " ");
}
} else if(name_field.second.type == field_types::INT32) {
const int KEY_LEN = 8;
unsigned char key[KEY_LEN];
int32_t value = document[name_field.first].get<int32_t>();
encode_int32(value, key);
tokens.push_back(std::string((char*)key, KEY_LEN));
} else if(name_field.second.type == field_types::INT32_ARRAY) {
std::vector<int32_t> values = document[name_field.first].get<std::vector<int32_t>>();
for(const int32_t value: values) {
const int KEY_LEN = 8;
unsigned char key[KEY_LEN];
encode_int32(value, key);
tokens.push_back(std::string((char*)key, KEY_LEN));
}
} else if(name_field.second.type == field_types::INT64) {
const int KEY_LEN = 8;
unsigned char key[KEY_LEN];
int64_t value = document[name_field.first].get<int64_t>();
encode_int64(value, key);
tokens.push_back(std::string((char*)key, KEY_LEN));
} else if(name_field.second.type == field_types::INT64_ARRAY) {
std::vector<int64_t> values = document[name_field.first].get<std::vector<int64_t>>();
for(const int64_t value: values) {
const int KEY_LEN = 8;
unsigned char key[KEY_LEN];
encode_int64(value, key);
tokens.push_back(std::string((char*)key, KEY_LEN));
}
} else if(name_field.second.type == field_types::FLOAT) {
const int KEY_LEN = 8;
unsigned char key[KEY_LEN];
int64_t value = document[name_field.first].get<int64_t>();
encode_float(value, key);
tokens.push_back(std::string((char*)key, KEY_LEN));
} else if(name_field.second.type == field_types::FLOAT_ARRAY) {
std::vector<float> values = document[name_field.first].get<std::vector<float>>();
for(const float value: values) {
const int KEY_LEN = 8;
unsigned char key[KEY_LEN];
encode_float(value, key);
tokens.push_back(std::string((char*)key, KEY_LEN));
}
} else if(name_field.second.type == field_types::BOOL) {
const int KEY_LEN = 1;
unsigned char key[KEY_LEN];
bool value = document[name_field.first].get<bool>();
key[0] = value ? '1' : '0';
tokens.push_back(std::string((char*)key, KEY_LEN));
} else if(name_field.second.type == field_types::BOOL_ARRAY) {
std::vector<bool> values = document[name_field.first].get<std::vector<bool>>();
for(const bool value: values) {
const int KEY_LEN = 1;
unsigned char key[KEY_LEN];
key[0] = value ? '1' : '0';
tokens.push_back(std::string((char*)key, KEY_LEN));
}
}
tokenize_doc_field(document, field_name, search_field, tokens);
for(auto & token: tokens) {
const unsigned char *key;
int key_len;
if(name_field.second.type == field_types::STRING_ARRAY || name_field.second.type == field_types::STRING) {
if(search_field.type == field_types::STRING_ARRAY || search_field.type == field_types::STRING) {
string_utils.unicode_normalize(token);
key = (const unsigned char *) token.c_str();
key_len = (int) (token.length() + 1);
@ -1886,9 +1907,8 @@ Option<uint32_t> Index::remove(const uint32_t seq_id, const nlohmann::json & doc
key_len = (int) (token.length());
}
art_leaf* leaf = (art_leaf *) art_search(search_index.at(name_field.first), key, key_len);
if(leaf != NULL) {
uint32_t seq_id_values[1] = {seq_id};
art_leaf* leaf = (art_leaf *) art_search(search_index.at(field_name), key, key_len);
if(leaf != nullptr) {
uint32_t doc_index = leaf->values->ids.indexOf(seq_id);
if(doc_index == leaf->values->ids.getLength()) {
@ -1905,7 +1925,7 @@ Option<uint32_t> Index::remove(const uint32_t seq_id, const nlohmann::json & doc
remove_and_shift_offset_index(leaf->values->offset_index, doc_indices, 1);
leaf->values->offsets.remove_index(start_offset, end_offset);
leaf->values->ids.remove_values(seq_id_values, 1);
leaf->values->ids.remove_value(seq_id);
/*len = leaf->values->offset_index.getLength();
for(auto i=0; i<len; i++) {
@ -1914,25 +1934,96 @@ Option<uint32_t> Index::remove(const uint32_t seq_id, const nlohmann::json & doc
LOG(INFO) << "----";*/
if(leaf->values->ids.getLength() == 0) {
art_values* values = (art_values*) art_delete(search_index.at(name_field.first), key, key_len);
art_values* values = (art_values*) art_delete(search_index.at(field_name), key, key_len);
delete values;
values = nullptr;
}
}
}
}
// remove facets if any
facet_index_v2.erase(seq_id);
// remove facets
if(facet_to_index.count(field_name) != 0 && facet_index_v2.count(seq_id) != 0) {
size_t facet_index = facet_to_index[field_name];
std::vector<std::vector<uint64_t>>& facet_values = facet_index_v2[seq_id];
facet_values[facet_index].clear();
}
// remove sort index if any
for(auto & field_doc_value_map: sort_index) {
field_doc_value_map.second->erase(seq_id);
// remove sort field
if(sort_index.count(field_name) != 0) {
sort_index[field_name]->erase(seq_id);
}
}
return Option<uint32_t>(seq_id);
}
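
The numeric branches above rely on helpers such as `encode_int32`, `encode_int64` and `encode_float` to turn each value into a fixed-length byte key, so the ART tree can compare numbers the same way it compares string tokens. As a rough sketch only (assuming an order-preserving big-endian scheme, which may differ from the actual helpers), a signed 32-bit value could be encoded like this:

```
// Illustrative sketch, not the actual encode_int32 implementation:
// flipping the sign bit maps the signed range onto the unsigned range, and
// writing the bytes big-endian makes lexicographic key order match numeric
// order, which is what a trie-based index needs for range comparisons.
#include <cstdint>

void example_encode_int32(int32_t n, unsigned char out[4]) {
    uint32_t u = static_cast<uint32_t>(n) ^ 0x80000000u;  // flip sign bit
    out[0] = static_cast<unsigned char>((u >> 24) & 0xFF);
    out[1] = static_cast<unsigned char>((u >> 16) & 0xFF);
    out[2] = static_cast<unsigned char>((u >> 8) & 0xFF);
    out[3] = static_cast<unsigned char>(u & 0xFF);
}
```
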
void Index::tokenize_doc_field(const nlohmann::json& document, const std::string& field_name, const field& search_field,
std::vector<std::string>& tokens) {
if(search_field.type == field_types::STRING) {
StringUtils::split(document[field_name], tokens, " ");
} else if(search_field.type == field_types::STRING_ARRAY) {
const std::vector<std::string>& values = document[field_name].get<std::vector<std::string>>();
for(const std::string & value: values) {
StringUtils::split(value, tokens, " ");
}
} else if(search_field.type == field_types::INT32) {
const int KEY_LEN = 8;
unsigned char key[KEY_LEN];
const int32_t& value = document[field_name].get<int32_t>();
encode_int32(value, key);
tokens.emplace_back((char*)key, KEY_LEN);
} else if(search_field.type == field_types::INT32_ARRAY) {
const std::vector<int32_t>& values = document[field_name].get<std::vector<int32_t>>();
for(const int32_t value: values) {
const int KEY_LEN = 8;
unsigned char key[KEY_LEN];
encode_int32(value, key);
tokens.emplace_back((char*)key, KEY_LEN);
}
} else if(search_field.type == field_types::INT64) {
const int KEY_LEN = 8;
unsigned char key[KEY_LEN];
const int64_t& value = document[field_name].get<int64_t>();
encode_int64(value, key);
tokens.emplace_back((char*)key, KEY_LEN);
} else if(search_field.type == field_types::INT64_ARRAY) {
const std::vector<int64_t>& values = document[field_name].get<std::vector<int64_t>>();
for(const int64_t value: values) {
const int KEY_LEN = 8;
unsigned char key[KEY_LEN];
encode_int64(value, key);
tokens.emplace_back((char*)key, KEY_LEN);
}
} else if(search_field.type == field_types::FLOAT) {
const int KEY_LEN = 8;
unsigned char key[KEY_LEN];
const float value = document[field_name].get<float>();
encode_float(value, key);
tokens.emplace_back((char*)key, KEY_LEN);
} else if(search_field.type == field_types::FLOAT_ARRAY) {
const std::vector<float>& values = document[field_name].get<std::vector<float>>();
for(const float value: values) {
const int KEY_LEN = 8;
unsigned char key[KEY_LEN];
encode_float(value, key);
tokens.emplace_back((char*)key, KEY_LEN);
}
} else if(search_field.type == field_types::BOOL) {
const int KEY_LEN = 1;
unsigned char key[KEY_LEN];
const bool& value = document[field_name].get<bool>();
key[0] = value ? '1' : '0';
tokens.emplace_back((char*)key, KEY_LEN);
} else if(search_field.type == field_types::BOOL_ARRAY) {
const std::vector<bool>& values = document[field_name].get<std::vector<bool>>();
for(const bool value: values) {
const int KEY_LEN = 1;
unsigned char key[KEY_LEN];
key[0] = value ? '1' : '0';
tokens.emplace_back((char*)key, KEY_LEN);
}
}
}
art_leaf* Index::get_token_leaf(const std::string & field_name, const unsigned char* token, uint32_t token_len) {
const art_tree *t = search_index.at(field_name);
return (art_leaf*) art_search(t, token, (int) token_len);

View File

@ -21,6 +21,7 @@ void master_server_routes() {
// document management - `/documents/:id` end-points must be placed last in the list
server->post("/collections/:collection/documents", post_add_document);
server->patch("/collections/:collection/documents/:id", patch_update_document);
server->get("/collections/:collection/documents/search", get_search);
server->post("/collections/:collection/documents/import", post_import_documents, true, true);

View File

@ -1,5 +1,6 @@
#include "sorted_array.h"
#include "array_utils.h"
#include "logger.h"
void sorted_array::load(const uint32_t *sorted_array, const uint32_t array_length) {
min = array_length != 0 ? sorted_array[0] : 0;
@ -18,28 +19,67 @@ void sorted_array::load(const uint32_t *sorted_array, const uint32_t array_lengt
length_bytes = actual_size;
}
bool sorted_array::append(uint32_t value) {
uint32_t size_required = sorted_append_size_required(value, length+1);
size_t sorted_array::append(uint32_t value) {
if(value < max) {
// we will have to re-encode the whole sequence again
uint32_t* arr = uncompress(length+1);
if(size_required+FOR_ELE_SIZE > size_bytes) {
// grow the array first
size_t new_size = (size_t) (size_required * FOR_GROWTH_FACTOR);
uint8_t *new_location = (uint8_t *) realloc(in, new_size);
if(new_location == NULL) {
abort();
// find the index of the first element that is >= `value`
uint32_t found_val;
uint32_t gte_index = for_lower_bound_search(in, length, value, &found_val);
for(size_t j=length; j>gte_index; j--) {
arr[j] = arr[j-1];
}
in = new_location;
size_bytes = (uint32_t) new_size;
arr[gte_index] = value;
load(arr, length+1);
delete [] arr;
return gte_index;
} else {
uint32_t size_required = sorted_append_size_required(value, length+1);
size_t min_expected_size = size_required + FOR_ELE_SIZE;
if(size_bytes < min_expected_size) {
// grow the array first
size_t new_size = min_expected_size * FOR_GROWTH_FACTOR;
uint8_t *new_location = (uint8_t *) realloc(in, new_size);
if(new_location == NULL) {
abort();
}
in = new_location;
size_bytes = (uint32_t) new_size;
//LOG(INFO) << "new_size: " << new_size;
}
uint32_t new_length_bytes = for_append_sorted(in, length, value);
if(new_length_bytes == 0) return false;
length_bytes = new_length_bytes;
length++;
if(value < min) min = value;
if(value > max) max = value;
return length-1;
}
}
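
When a value arrives out of order, the new append path above uncompresses the sequence, shifts everything after the lower-bound position, and re-encodes the whole array, while in-order appends keep the cheaper FOR append path. A plain std::vector sketch of the same "lower bound, shift, write" contract (illustrative only, not the compressed implementation):

```
#include <algorithm>
#include <cstdint>
#include <vector>

// Returns the index at which `value` ends up, mirroring what the
// FOR-encoded append above reports for out-of-order inserts.
size_t sorted_insert(std::vector<uint32_t>& arr, uint32_t value) {
    auto it = std::lower_bound(arr.begin(), arr.end(), value);
    size_t index = static_cast<size_t>(it - arr.begin());
    arr.insert(it, value);
    return index;
}
```
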
bool sorted_array::insert(size_t index, uint32_t value) {
if(index >= length) {
return false;
}
uint32_t new_length_bytes = for_append_sorted(in, length, value);
if(new_length_bytes == 0) return false;
uint32_t *curr_array = uncompress(length+1);
memmove(&curr_array[index+1], &curr_array[index], sizeof(uint32_t)*(length-index));
curr_array[index] = value;
length_bytes = new_length_bytes;
length++;
load(curr_array, length+1);
if(value < min) min = value;
if(value > max) max = value;
delete [] curr_array;
return true;
}
@ -61,7 +101,11 @@ uint32_t sorted_array::indexOf(uint32_t value) {
uint32_t actual;
uint32_t index = for_lower_bound_search(in, length, value, &actual);
if(actual == value) return index;
if(actual == value) {
return index;
}
return length;
}
@ -150,20 +194,40 @@ void sorted_array::indexOf(const uint32_t *values, const size_t values_len, uint
binary_search_indices(values, head, tail, low_index, high_index, base, bits, indices);
}
void sorted_array::remove_values(uint32_t *sorted_values, uint32_t values_length) {
void sorted_array::remove_value(uint32_t value) {
// A lower bound search returns the first element in the sequence that is >= `value`
// So, `found_val` will be either equal to or greater than `value`
uint32_t found_val;
uint32_t found_index = for_lower_bound_search(in, length, value, &found_val);
if(found_val != value) {
return ;
}
uint32_t *curr_array = uncompress();
if(found_index + 1 < length) {
memmove(&curr_array[found_index], &curr_array[found_index+1], sizeof(uint32_t) * (length - found_index - 1));
}
size_t new_length = (length == 0) ? 0 : (length - 1);
load(curr_array, new_length);
delete [] curr_array;
}
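
remove_value relies on the lower-bound property described in the comment above: if the element found is not exactly `value`, the array is left untouched. For illustration, the equivalent operation on a plain std::vector:

```
#include <algorithm>
#include <cstdint>
#include <vector>

// Remove a single occurrence of `value`, or do nothing if it is absent.
void sorted_remove(std::vector<uint32_t>& arr, uint32_t value) {
    auto it = std::lower_bound(arr.begin(), arr.end(), value);
    if(it != arr.end() && *it == value) {
        arr.erase(it);
    }
}
```
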
void sorted_array::remove_values(uint32_t *sorted_values, uint32_t sorted_values_length) {
uint32_t *curr_array = uncompress();
uint32_t *new_array = new uint32_t[length];
uint32_t new_index = 0;
uint32_t curr_index = 0;
uint32_t sorted_values_index = 0;
uint32_t curr_index = 0;
while(curr_index < length) {
if(sorted_values_index < values_length && curr_array[curr_index] >= sorted_values[sorted_values_index]) {
// skip copying
if(curr_array[curr_index] == sorted_values[sorted_values_index]) {
curr_index++;
}
if(sorted_values_index < sorted_values_length && sorted_values[sorted_values_index] == curr_array[curr_index]) {
curr_index++;
sorted_values_index++;
} else {
new_array[new_index++] = curr_array[curr_index++];

View File

@ -45,6 +45,31 @@ TEST(ArrayTest, Append) {
}
}
TEST(ArrayTest, InsertValues) {
std::vector<uint32_t> eles = {10, 1, 4, 5, 7};
array arr;
for(size_t i=0; i < eles.size(); i++) {
arr.append(eles[i]);
}
uint32_t insert_arr[2] = {2, 3};
arr.insert(2, insert_arr, 2);
eles = {10, 1, 2, 3, 4, 5, 7};
for(size_t i=0; i < eles.size(); i++) {
ASSERT_EQ(eles[i], arr.at(i));
}
uint32_t insert_arr2[2] = {20, 25};
arr.insert(6, insert_arr2, 2);
eles = {10, 1, 2, 3, 4, 5, 20, 25, 7};
for(size_t i=0; i < eles.size(); i++) {
ASSERT_EQ(eles[i], arr.at(i));
}
}
TEST(ArrayTest, Uncompress) {
const size_t SIZE = 10*1000;

View File

@ -71,6 +71,11 @@ TEST_F(CollectionFacetingTest, FacetFieldStringFiltering) {
ASSERT_EQ(0, results["hits"].size());
ASSERT_EQ(0, results["found"].get<size_t>());
// multiple tokens but with a typo on one of them
results = coll_str->search("*", query_fields, "starring:= ssamuel l. Jackson", facets, sort_fields, 0, 10, 1, FREQUENCY, false).get();
ASSERT_EQ(0, results["hits"].size());
ASSERT_EQ(0, results["found"].get<size_t>());
// same should succeed when verbatim filter is made
results = coll_str->search("*", query_fields, "starring:= samuel l. Jackson", facets, sort_fields, 0, 10, 1, FREQUENCY, false).get();
ASSERT_EQ(2, results["hits"].size());
@ -85,6 +90,11 @@ TEST_F(CollectionFacetingTest, FacetFieldStringFiltering) {
ASSERT_EQ(2, results["hits"].size());
ASSERT_EQ(2, results["found"].get<size_t>());
// contains when only 1 token matches
results = coll_str->search("*", query_fields, "starring: samuel johnson", facets, sort_fields, 0, 10, 1, FREQUENCY, false).get();
ASSERT_EQ(2, results["hits"].size());
ASSERT_EQ(2, results["found"].get<size_t>());
collectionManager.drop_collection("coll_str");
}
@ -131,6 +141,9 @@ TEST_F(CollectionFacetingTest, FacetFieldStringArrayFiltering) {
results = coll_array_fields->search("Jeremy", query_fields, "tags:= FINE", facets, sort_fields, 0, 10, 1, FREQUENCY, false).get();
ASSERT_EQ(0, results["hits"].size());
results = coll_array_fields->search("Jeremy", query_fields, "tags:= FFINE PLATINUM", facets, sort_fields, 0, 10, 1, FREQUENCY, false).get();
ASSERT_EQ(0, results["hits"].size());
// partial token filtering should be done without the "=" operator
results = coll_array_fields->search("Jeremy", query_fields, "tags: PLATINUM", facets, sort_fields, 0, 10, 1, FREQUENCY, false).get();
ASSERT_EQ(1, results["hits"].size());

View File

@ -64,7 +64,7 @@ TEST_F(CollectionGroupingTest, GroupingBasics) {
auto res = coll_group->search("*", {}, "", {"brand"}, {}, 0, 50, 1, FREQUENCY,
false, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30,
spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
"", 10,
{}, {}, {"size"}, 2).get();
@ -107,7 +107,7 @@ TEST_F(CollectionGroupingTest, GroupingBasics) {
res = coll_group->search("*", {}, "", {"brand"}, sort_size, 0, 50, 1, FREQUENCY,
false, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "brand: omeg", 30,
spp::sparse_hash_set<std::string>(), 10, "brand: omeg", 30, 5,
"", 10,
{}, {}, {"rating"}, 2).get();
@ -147,7 +147,7 @@ TEST_F(CollectionGroupingTest, GroupingCompoundKey) {
auto res = coll_group->search("*", {}, "", {"brand"}, {}, 0, 50, 1, FREQUENCY,
false, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30,
spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
"", 10,
{}, {}, {"size", "brand"}, 2).get();
@ -194,7 +194,7 @@ TEST_F(CollectionGroupingTest, GroupingCompoundKey) {
res = coll_group->search("*", {}, "", {"brand"}, {}, 0, 2, 2, FREQUENCY,
false, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30,
spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
"", 10,
{}, {}, {"size", "brand"}, 2).get();
@ -230,7 +230,7 @@ TEST_F(CollectionGroupingTest, GroupingCompoundKey) {
auto res_op = coll_group->search("*", {}, "", {"brand"}, {}, 0, 50, 1, FREQUENCY,
false, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "brand: omeg", 30,
spp::sparse_hash_set<std::string>(), 10, "brand: omeg", 30, 5,
"", 10,
{}, {}, {"rating"}, 100);
@ -240,7 +240,7 @@ TEST_F(CollectionGroupingTest, GroupingCompoundKey) {
res_op = coll_group->search("*", {}, "", {"brand"}, {}, 0, 50, 1, FREQUENCY,
false, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "brand: omeg", 30,
spp::sparse_hash_set<std::string>(), 10, "brand: omeg", 30, 5,
"", 10,
{}, {}, {"rating"}, 0);
@ -252,7 +252,7 @@ TEST_F(CollectionGroupingTest, GroupingWithGropLimitOfOne) {
auto res = coll_group->search("*", {}, "", {"brand"}, {}, 0, 50, 1, FREQUENCY,
false, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30,
spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
"", 10,
{}, {}, {"brand"}, 1).get();
@ -322,7 +322,7 @@ TEST_F(CollectionGroupingTest, GroupingWithArrayFieldAndOverride) {
auto res = coll_group->search("shirt", {"title"}, "", {"brand"}, {}, 0, 50, 1, FREQUENCY,
false, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30,
spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
"", 10,
{}, {}, {"colors"}, 2).get();

View File

@ -213,7 +213,13 @@ TEST_F(CollectionManagerTest, RestoreRecordsOnRestart) {
// create a new collection manager to ensure that it restores the records from the disk backed store
CollectionManager & collectionManager2 = CollectionManager::get_instance();
collectionManager2.init(store, 1.0, "auth_key");
collectionManager2.load();
auto load_op = collectionManager2.load();
if(!load_op.ok()) {
LOG(ERROR) << load_op.error();
}
ASSERT_TRUE(load_op.ok());
collection1 = collectionManager2.get_collection("collection1");
ASSERT_NE(nullptr, collection1);

View File

@ -271,7 +271,7 @@ TEST_F(CollectionOverrideTest, IncludeExcludeHitsQuery) {
auto results = coll_mul_fields->search("the", {"title"}, "", {"starring"}, {}, 0, 50, 1, FREQUENCY,
false, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "starring: will", 30,
spp::sparse_hash_set<std::string>(), 10, "starring: will", 30, 5,
"", 10,
pinned_hits, {}).get();
@ -289,7 +289,7 @@ TEST_F(CollectionOverrideTest, IncludeExcludeHitsQuery) {
results = coll_mul_fields->search("the", {"title"}, "", {"starring"}, {}, 0, 50, 1, FREQUENCY,
false, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "starring: will", 30,
spp::sparse_hash_set<std::string>(), 10, "starring: will", 30, 5,
"", 10,
pinned_hits, hidden_hits).get();
@ -305,7 +305,7 @@ TEST_F(CollectionOverrideTest, IncludeExcludeHitsQuery) {
results = coll_mul_fields->search("the", {"title"}, "", {"starring"}, {}, 0, 2, 2, FREQUENCY,
false, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "starring: will", 30,
spp::sparse_hash_set<std::string>(), 10, "starring: will", 30, 5,
"", 10,
pinned_hits, hidden_hits).get();
@ -341,7 +341,7 @@ TEST_F(CollectionOverrideTest, IncludeExcludeHitsQuery) {
results = coll_mul_fields->search("the", {"title"}, "", {"starring"}, {}, 0, 50, 1, FREQUENCY,
false, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "starring: will", 30,
spp::sparse_hash_set<std::string>(), 10, "starring: will", 30, 5,
"", 10,
{}, {hidden_hits}).get();
@ -362,7 +362,7 @@ TEST_F(CollectionOverrideTest, PinnedHitsGrouping) {
auto results = coll_mul_fields->search("the", {"title"}, "", {"starring"}, {}, 0, 50, 1, FREQUENCY,
false, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "starring: will", 30,
spp::sparse_hash_set<std::string>(), 10, "starring: will", 30, 5,
"", 10,
pinned_hits, {}).get();
@ -383,7 +383,7 @@ TEST_F(CollectionOverrideTest, PinnedHitsGrouping) {
results = coll_mul_fields->search("the", {"title"}, "", {"starring"}, {}, 0, 50, 1, FREQUENCY,
false, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "starring: will", 30,
spp::sparse_hash_set<std::string>(), 10, "starring: will", 30, 5,
"", 10,
pinned_hits, {}, {"cast"}, 2).get();

View File

@ -14,6 +14,9 @@ protected:
CollectionManager & collectionManager = CollectionManager::get_instance();
std::vector<sort_by> sort_fields;
// used for generating random text
std::vector<std::string> words;
void setupCollection() {
std::string state_dir_path = "/tmp/typesense_test/collection";
LOG(INFO) << "Truncating and creating: " << state_dir_path;
@ -48,6 +51,12 @@ protected:
}
infile.close();
std::ifstream words_file(std::string(ROOT_DIR)+"test/resources/common100_english.txt");
std::stringstream strstream;
strstream << words_file.rdbuf();
words_file.close();
StringUtils::split(strstream.str(), words, "\n");
}
virtual void SetUp() {
@ -59,6 +68,18 @@ protected:
collectionManager.dispose();
delete store;
}
std::string get_text(size_t num_words) {
time_t t;
srand((unsigned) time(&t));
std::vector<std::string> strs;
for(size_t i = 0 ; i < num_words ; i++ ) {
int word_index = rand() % 100;
strs.push_back(words[word_index]);
}
return StringUtils::join(strs, " ");
}
};
TEST_F(CollectionTest, VerifyCountOfDocuments) {
@ -558,14 +579,14 @@ TEST_F(CollectionTest, TypoTokensThreshold) {
// Query expansion should happen only based on the `typo_tokens_threshold` value
auto results = collection->search("launch", {"title"}, "", {}, sort_fields, 2, 10, 1,
token_ordering::FREQUENCY, true, 10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 5, "", 0).get();
spp::sparse_hash_set<std::string>(), 10, "", 5, 5, "", 0).get();
ASSERT_EQ(5, results["hits"].size());
ASSERT_EQ(5, results["found"].get<size_t>());
results = collection->search("launch", {"title"}, "", {}, sort_fields, 2, 10, 1,
token_ordering::FREQUENCY, true, 10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 5, "", 10).get();
spp::sparse_hash_set<std::string>(), 10, "", 5, 5, "", 10).get();
ASSERT_EQ(7, results["hits"].size());
ASSERT_EQ(7, results["found"].get<size_t>());
@ -1296,6 +1317,243 @@ std::vector<nlohmann::json> import_res_to_json(const std::vector<std::string>& i
return out;
}
TEST_F(CollectionTest, ImportDocumentsUpsert) {
Collection *coll_mul_fields;
std::ifstream infile(std::string(ROOT_DIR)+"test/multi_field_documents.jsonl");
std::stringstream strstream;
strstream << infile.rdbuf();
infile.close();
std::vector<std::string> import_records;
StringUtils::split(strstream.str(), import_records, "\n");
std::vector<field> fields = {
field("title", field_types::STRING, false),
field("starring", field_types::STRING, false),
field("cast", field_types::STRING_ARRAY, false),
field("points", field_types::INT32, false)
};
coll_mul_fields = collectionManager.get_collection("coll_mul_fields");
if(coll_mul_fields == nullptr) {
coll_mul_fields = collectionManager.create_collection("coll_mul_fields", 1, fields, "points").get();
}
// try importing records
nlohmann::json document;
nlohmann::json import_response = coll_mul_fields->add_many(import_records, document);
ASSERT_TRUE(import_response["success"].get<bool>());
ASSERT_EQ(18, import_response["num_imported"].get<int>());
// update + upsert records
std::vector<std::string> more_records = {R"({"id": "0", "title": "The Fifth Harry"})",
R"({"id": "2", "cast": ["Chris Fisher", "Rand Alan"]})",
R"({"id": "18", "title": "Back Again Forest", "points": 45, "starring": "Ronald Wells", "cast": ["Dant Saren"]})",
R"({"id": "6", "points": 77})"};
import_response = coll_mul_fields->add_many(more_records, document, UPSERT);
ASSERT_TRUE(import_response["success"].get<bool>());
ASSERT_EQ(4, import_response["num_imported"].get<int>());
std::vector<nlohmann::json> import_results = import_res_to_json(more_records);
ASSERT_EQ(4, import_results.size());
for(size_t i=0; i<4; i++) {
ASSERT_TRUE(import_results[i]["success"].get<bool>());
ASSERT_EQ(1, import_results[i].size());
}
auto results = coll_mul_fields->search("*", query_fields, "", {}, sort_fields, 0, 30, 1, FREQUENCY, false).get();
ASSERT_EQ(19, results["hits"].size());
ASSERT_EQ(19, coll_mul_fields->get_num_documents());
results = coll_mul_fields->search("back again forest", query_fields, "", {}, sort_fields, 0, 30, 1, FREQUENCY, false).get();
ASSERT_EQ(1, results["hits"].size());
ASSERT_STREQ("Back Again Forest", coll_mul_fields->get("18").get()["title"].get<std::string>().c_str());
results = coll_mul_fields->search("fifth", query_fields, "", {}, sort_fields, 0, 10, 1, FREQUENCY, false).get();
ASSERT_EQ(2, results["hits"].size());
ASSERT_STREQ("The <mark>Fifth</mark> Harry", results["hits"][0]["highlights"][0]["snippet"].get<std::string>().c_str());
ASSERT_STREQ("The Woman in the <mark>Fifth</mark> from Kristin", results["hits"][1]["highlights"][0]["snippet"].get<std::string>().c_str());
results = coll_mul_fields->search("burgundy", query_fields, "", {}, sort_fields, 0, 10, 1, FREQUENCY, false).get();
ASSERT_EQ(0, results["hits"].size());
results = coll_mul_fields->search("harry", query_fields, "", {}, sort_fields, 0, 10, 1, FREQUENCY, false).get();
ASSERT_EQ(1, results["hits"].size());
results = coll_mul_fields->search("captain america", query_fields, "", {}, sort_fields, 0, 10, 1, FREQUENCY, false).get();
ASSERT_EQ(1, results["hits"].size());
ASSERT_EQ(77, results["hits"][0]["document"]["points"].get<size_t>());
// upserting with some bad docs
more_records = {R"({"id": "1", "title": "Wake up, Harry"})",
R"({"id": "90", "cast": ["Kim Werrel", "Random Wake"]})", // missing fields
R"({"id": "5", "points": 60})",
R"({"id": "24", "starring": "John", "cast": ["John Kim"], "points": 11})"}; // missing fields
import_response = coll_mul_fields->add_many(more_records, document, UPSERT);
ASSERT_FALSE(import_response["success"].get<bool>());
ASSERT_EQ(2, import_response["num_imported"].get<int>());
import_results = import_res_to_json(more_records);
ASSERT_FALSE(import_results[1]["success"].get<bool>());
ASSERT_FALSE(import_results[3]["success"].get<bool>());
ASSERT_STREQ("Field `points` has been declared as a default sorting field, but is not found in the document.", import_results[1]["error"].get<std::string>().c_str());
ASSERT_STREQ("Field `title` has been declared in the schema, but is not found in the document.", import_results[3]["error"].get<std::string>().c_str());
// try to duplicate records without upsert option
more_records = {R"({"id": "1", "title": "Wake up, Harry"})",
R"({"id": "5", "points": 60})"};
import_response = coll_mul_fields->add_many(more_records, document, CREATE);
ASSERT_FALSE(import_response["success"].get<bool>());
ASSERT_EQ(0, import_response["num_imported"].get<int>());
import_results = import_res_to_json(more_records);
ASSERT_FALSE(import_results[0]["success"].get<bool>());
ASSERT_FALSE(import_results[1]["success"].get<bool>());
ASSERT_STREQ("A document with id 1 already exists.", import_results[0]["error"].get<std::string>().c_str());
ASSERT_STREQ("A document with id 5 already exists.", import_results[1]["error"].get<std::string>().c_str());
// update document with verbatim fields, except for points
more_records = {R"({"id": "3", "cast":["Matt Damon","Ben Affleck","Minnie Driver"],
"points":70,"starring":"Robin Williams","starring_facet":"Robin Williams",
"title":"Good Will Hunting"})"};
import_response = coll_mul_fields->add_many(more_records, document, UPDATE);
ASSERT_TRUE(import_response["success"].get<bool>());
ASSERT_EQ(1, import_response["num_imported"].get<int>());
results = coll_mul_fields->search("Good Will Hunting", query_fields, "", {}, sort_fields, 0, 10, 1, FREQUENCY, false).get();
ASSERT_EQ(70, results["hits"][0]["document"]["points"].get<uint32_t>());
// updating a document that does not exist should fail, others should succeed
more_records = {R"({"id": "20", "points": 51})",
R"({"id": "1", "points": 64})"};
import_response = coll_mul_fields->add_many(more_records, document, UPDATE);
ASSERT_FALSE(import_response["success"].get<bool>());
ASSERT_EQ(1, import_response["num_imported"].get<int>());
import_results = import_res_to_json(more_records);
ASSERT_FALSE(import_results[0]["success"].get<bool>());
ASSERT_TRUE(import_results[1]["success"].get<bool>());
ASSERT_STREQ("Could not find a document with id: 20", import_results[0]["error"].get<std::string>().c_str());
ASSERT_EQ(404, import_results[0]["code"].get<size_t>());
results = coll_mul_fields->search("wake up harry", query_fields, "", {}, sort_fields, 0, 10, 1, FREQUENCY, false).get();
ASSERT_EQ(64, results["hits"][0]["document"]["points"].get<uint32_t>());
// trying to create documents with existing IDs should fail
more_records = {R"({"id": "2", "points": 51})",
R"({"id": "1", "points": 64})"};
import_response = coll_mul_fields->add_many(more_records, document, CREATE);
ASSERT_FALSE(import_response["success"].get<bool>());
ASSERT_EQ(0, import_response["num_imported"].get<int>());
import_results = import_res_to_json(more_records);
ASSERT_FALSE(import_results[0]["success"].get<bool>());
ASSERT_FALSE(import_results[1]["success"].get<bool>());
ASSERT_STREQ("A document with id 2 already exists.", import_results[0]["error"].get<std::string>().c_str());
ASSERT_STREQ("A document with id 1 already exists.", import_results[1]["error"].get<std::string>().c_str());
ASSERT_EQ(409, import_results[0]["code"].get<size_t>());
ASSERT_EQ(409, import_results[1]["code"].get<size_t>());
}
TEST_F(CollectionTest, ImportDocumentsUpsertOptional) {
Collection *coll1;
std::vector<field> fields = {
field("title", field_types::STRING_ARRAY, false, true),
field("points", field_types::INT32, false)
};
coll1 = collectionManager.get_collection("coll1");
if(coll1 == nullptr) {
coll1 = collectionManager.create_collection("coll1", 4, fields, "points").get();
}
std::vector<std::string> records;
size_t NUM_RECORDS = 1000;
for(size_t i=0; i<NUM_RECORDS; i++) {
nlohmann::json doc;
doc["id"] = std::to_string(i);
doc["points"] = i;
records.push_back(doc.dump());
}
// import records without title
nlohmann::json document;
nlohmann::json import_response = coll1->add_many(records, document, CREATE);
ASSERT_TRUE(import_response["success"].get<bool>());
ASSERT_EQ(1000, import_response["num_imported"].get<int>());
// upsert documents with title
records.clear();
for(size_t i=0; i<NUM_RECORDS; i++) {
nlohmann::json updoc;
updoc["id"] = std::to_string(i);
updoc["title"] = {
get_text(10),
get_text(10),
get_text(10),
get_text(10),
};
records.push_back(updoc.dump());
}
auto begin = std::chrono::high_resolution_clock::now();
import_response = coll1->add_many(records, document, UPSERT);
auto time_micros = std::chrono::duration_cast<std::chrono::microseconds>(
std::chrono::high_resolution_clock::now() - begin).count();
//LOG(INFO) << "Time taken for first upsert: " << time_micros;
ASSERT_TRUE(import_response["success"].get<bool>());
ASSERT_EQ(1000, import_response["num_imported"].get<int>());
// run upsert again with title override
records.clear();
for(size_t i=0; i<NUM_RECORDS; i++) {
nlohmann::json updoc;
updoc["id"] = std::to_string(i);
updoc["title"] = {
get_text(10),
get_text(10),
get_text(10),
get_text(10),
};
records.push_back(updoc.dump());
}
begin = std::chrono::high_resolution_clock::now();
import_response = coll1->add_many(records, document, UPSERT);
time_micros = std::chrono::duration_cast<std::chrono::microseconds>(
std::chrono::high_resolution_clock::now() - begin).count();
//LOG(INFO) << "Time taken for second upsert: " << time_micros;
ASSERT_TRUE(import_response["success"].get<bool>());
ASSERT_EQ(1000, import_response["num_imported"].get<int>());
}
TEST_F(CollectionTest, ImportDocuments) {
Collection *coll_mul_fields;
@ -1320,8 +1578,8 @@ TEST_F(CollectionTest, ImportDocuments) {
}
// try importing records
nlohmann::json import_response = coll_mul_fields->add_many(import_records);
nlohmann::json document;
nlohmann::json import_response = coll_mul_fields->add_many(import_records, document);
ASSERT_TRUE(import_response["success"].get<bool>());
ASSERT_EQ(18, import_response["num_imported"].get<int>());
@ -1346,7 +1604,7 @@ TEST_F(CollectionTest, ImportDocuments) {
// verify that empty import is handled gracefully
std::vector<std::string> empty_records;
import_response = coll_mul_fields->add_many(empty_records);
import_response = coll_mul_fields->add_many(empty_records, document);
ASSERT_TRUE(import_response["success"].get<bool>());
ASSERT_EQ(0, import_response["num_imported"].get<int>());
@ -1360,7 +1618,7 @@ TEST_F(CollectionTest, ImportDocuments) {
"{\"title\": \"Test4\", \"points\": 55, "
"\"cast\": [\"Tom Skerritt\"] }"};
import_response = coll_mul_fields->add_many(more_records);
import_response = coll_mul_fields->add_many(more_records, document);
ASSERT_FALSE(import_response["success"].get<bool>());
ASSERT_EQ(2, import_response["num_imported"].get<int>());
@ -1385,7 +1643,7 @@ TEST_F(CollectionTest, ImportDocuments) {
"{\"id\": \"id1\", \"title\": \"Test1\", \"starring\": \"Rand Fish\", \"points\": 12, "
"\"cast\": [\"Tom Skerritt\"] }"};
import_response = coll_mul_fields->add_many(more_records);
import_response = coll_mul_fields->add_many(more_records, document);
ASSERT_FALSE(import_response["success"].get<bool>());
ASSERT_EQ(1, import_response["num_imported"].get<int>());
@ -1403,7 +1661,7 @@ TEST_F(CollectionTest, ImportDocuments) {
// valid JSON but not a document
more_records = {"[]"};
import_response = coll_mul_fields->add_many(more_records);
import_response = coll_mul_fields->add_many(more_records, document);
ASSERT_FALSE(import_response["success"].get<bool>());
ASSERT_EQ(0, import_response["num_imported"].get<int>());
@ -1417,7 +1675,7 @@ TEST_F(CollectionTest, ImportDocuments) {
// invalid JSON
more_records = {"{"};
import_response = coll_mul_fields->add_many(more_records);
import_response = coll_mul_fields->add_many(more_records, document);
ASSERT_FALSE(import_response["success"].get<bool>());
ASSERT_EQ(0, import_response["num_imported"].get<int>());
@ -1756,7 +2014,7 @@ TEST_F(CollectionTest, IndexingWithBadData) {
sample_collection = collectionManager.create_collection("sample_collection", 4, fields, "age").get();
}
const Option<nlohmann::json> & search_fields_missing_op1 = sample_collection->add("{\"namezz\": \"foo\", \"age\": 29, \"average\": 78}");
const Option<nlohmann::json> & search_fields_missing_op1 = sample_collection->add("{\"name\": \"foo\", \"age\": 29, \"average\": 78}");
ASSERT_FALSE(search_fields_missing_op1.ok());
ASSERT_STREQ("Field `tags` has been declared in the schema, but is not found in the document.",
search_fields_missing_op1.error().c_str());
@ -2210,9 +2468,169 @@ TEST_F(CollectionTest, SearchHighlightShouldFollowThreshold) {
ASSERT_STREQ("fox jumped over the <mark>lazy</mark> dog and ran straight",
res["hits"][0]["highlights"][0]["snippet"].get<std::string>().c_str());
// specify the number of surrounding tokens to return
size_t highlight_affix_num_tokens = 2;
res = coll1->search("lazy", {"title"}, "", {}, sort_fields, 0, 10, 1,
token_ordering::FREQUENCY, true, 10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 5, highlight_affix_num_tokens).get();
ASSERT_STREQ("over the <mark>lazy</mark> dog and",
res["hits"][0]["highlights"][0]["snippet"].get<std::string>().c_str());
highlight_affix_num_tokens = 0;
res = coll1->search("lazy", {"title"}, "", {}, sort_fields, 0, 10, 1,
token_ordering::FREQUENCY, true, 10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 5, highlight_affix_num_tokens).get();
ASSERT_STREQ("<mark>lazy</mark>",
res["hits"][0]["highlights"][0]["snippet"].get<std::string>().c_str());
collectionManager.drop_collection("coll1");
}
TEST_F(CollectionTest, UpdateDocument) {
Collection *coll1;
std::vector<field> fields = {field("title", field_types::STRING, true),
field("tags", field_types::STRING_ARRAY, true),
field("points", field_types::INT32, false)};
std::vector<sort_by> sort_fields = {sort_by("points", "DESC")};
coll1 = collectionManager.get_collection("coll1");
if (coll1 == nullptr) {
coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get();
}
nlohmann::json doc;
doc["id"] = "100";
doc["title"] = "The quick brown fox jumped over the lazy dog and ran straight to the forest to sleep.";
doc["tags"] = {"NEWS", "LAZY"};
doc["points"] = 25;
auto add_op = coll1->add(doc.dump());
ASSERT_TRUE(add_op.ok());
auto res = coll1->search("lazy", {"title"}, "", {}, sort_fields, 0, 10, 1,
token_ordering::FREQUENCY, true, 10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 5, 5, "title").get();
ASSERT_EQ(1, res["hits"].size());
ASSERT_STREQ("The quick brown fox jumped over the lazy dog and ran straight to the forest to sleep.",
res["hits"][0]["document"]["title"].get<std::string>().c_str());
// try changing the title and searching for an older token
doc["title"] = "The quick brown fox.";
add_op = coll1->add(doc.dump(), UPSERT);
ASSERT_TRUE(add_op.ok());
ASSERT_EQ(1, coll1->get_num_documents());
res = coll1->search("lazy", {"title"}, "", {}, sort_fields, 0, 10, 1,
token_ordering::FREQUENCY, true, 10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 5, 5, "title").get();
ASSERT_EQ(0, res["hits"].size());
res = coll1->search("quick", {"title"}, "", {}, sort_fields, 0, 10, 1,
token_ordering::FREQUENCY, true, 10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 5, 5, "title").get();
ASSERT_EQ(1, res["hits"].size());
ASSERT_STREQ("The quick brown fox.", res["hits"][0]["document"]["title"].get<std::string>().c_str());
// try to update document tags without `id`
nlohmann::json doc2;
doc2["tags"] = {"SENTENCE"};
add_op = coll1->add(doc2.dump(), UPDATE);
ASSERT_FALSE(add_op.ok());
ASSERT_STREQ("For update, the `id` key must be provided.", add_op.error().c_str());
// now change tags with id
doc2["id"] = "100";
add_op = coll1->add(doc2.dump(), UPDATE);
ASSERT_TRUE(add_op.ok());
// check for old tag
res = coll1->search("NEWS", {"tags"}, "", {}, sort_fields, 0, 10, 1,
token_ordering::FREQUENCY, true, 10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 5, 5, "title").get();
ASSERT_EQ(0, res["hits"].size());
// now check for new tag and also try faceting on that field
res = coll1->search("SENTENCE", {"tags"}, "", {"tags"}, sort_fields, 0, 10, 1,
token_ordering::FREQUENCY, true, 10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 5, 5, "title").get();
ASSERT_EQ(1, res["hits"].size());
ASSERT_STREQ("SENTENCE", res["facet_counts"][0]["counts"][0]["value"].get<std::string>().c_str());
// try changing points
nlohmann::json doc3;
doc3["points"] = 99;
doc3["id"] = "100";
add_op = coll1->add(doc3.dump(), UPDATE);
ASSERT_TRUE(add_op.ok());
res = coll1->search("*", {"tags"}, "points: > 90", {}, sort_fields, 0, 10, 1,
token_ordering::FREQUENCY, true, 10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 5, 5, "title").get();
ASSERT_EQ(1, res["hits"].size());
ASSERT_EQ(99, res["hits"][0]["document"]["points"].get<size_t>());
// id can be passed by param
nlohmann::json doc4;
doc4["points"] = 105;
add_op = coll1->add(doc4.dump(), UPSERT, "100");
ASSERT_TRUE(add_op.ok());
res = coll1->search("*", {"tags"}, "points: > 101", {}, sort_fields, 0, 10, 1,
token_ordering::FREQUENCY, true, 10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 5, 5, "title").get();
ASSERT_EQ(1, res["hits"].size());
ASSERT_EQ(105, res["hits"][0]["document"]["points"].get<size_t>());
// try to change a field with bad value and verify that old document is put back
doc4["points"] = "abc";
add_op = coll1->add(doc4.dump(), UPSERT, "100");
ASSERT_FALSE(add_op.ok());
res = coll1->search("*", {"tags"}, "points: > 101", {}, sort_fields, 0, 10, 1,
token_ordering::FREQUENCY, true, 10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 5, 5, "title").get();
ASSERT_EQ(1, res["hits"].size());
ASSERT_EQ(105, res["hits"][0]["document"]["points"].get<size_t>());
// when explicit path id does not match doc id, error should be returned
nlohmann::json doc5;
doc5["id"] = "800";
doc5["title"] = "The Secret Seven";
doc5["points"] = 250;
doc5["tags"] = {"BOOK", "ENID BLYTON"};
add_op = coll1->add(doc5.dump(), UPSERT, "799");
ASSERT_FALSE(add_op.ok());
ASSERT_EQ(400, add_op.code());
ASSERT_STREQ("The `id` of the resource does not match the `id` in the JSON body.", add_op.error().c_str());
// passing an empty id should not succeed
nlohmann::json doc6;
doc6["id"] = "";
doc6["title"] = "The Secret Seven";
doc6["points"] = 250;
doc6["tags"] = {"BOOK", "ENID BLYTON"};
add_op = coll1->add(doc6.dump(), UPDATE);
ASSERT_FALSE(add_op.ok());
ASSERT_EQ(400, add_op.code());
ASSERT_STREQ("The `id` should not be empty.", add_op.error().c_str());
}
TEST_F(CollectionTest, SearchHighlightFieldFully) {
Collection *coll1;
@ -2240,7 +2658,7 @@ TEST_F(CollectionTest, SearchHighlightFieldFully) {
auto res = coll1->search("lazy", {"title"}, "", {}, sort_fields, 0, 10, 1,
token_ordering::FREQUENCY, true, 10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 5, "title").get();
spp::sparse_hash_set<std::string>(), 10, "", 5, 5, "title").get();
ASSERT_EQ(1, res["hits"][0]["highlights"].size());
ASSERT_STREQ("The quick brown fox jumped over the <mark>lazy</mark> dog and ran straight to the forest to sleep.",
@ -2249,14 +2667,14 @@ TEST_F(CollectionTest, SearchHighlightFieldFully) {
// should not return value key when highlight_full_fields is not specified
res = coll1->search("lazy", {"title"}, "", {}, sort_fields, 0, 10, 1,
token_ordering::FREQUENCY, true, 10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 5, "").get();
spp::sparse_hash_set<std::string>(), 10, "", 5, 5, "").get();
ASSERT_EQ(2, res["hits"][0]["highlights"][0].size());
// query multiple fields
res = coll1->search("lazy", {"title", "tags"}, "", {}, sort_fields, 0, 10, 1,
token_ordering::FREQUENCY, true, 10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 5, "title, tags").get();
spp::sparse_hash_set<std::string>(), 10, "", 5, 5, "title, tags").get();
ASSERT_EQ(2, res["hits"][0]["highlights"].size());
ASSERT_STREQ("The quick brown fox jumped over the <mark>lazy</mark> dog and ran straight to the forest to sleep.",
@ -2269,7 +2687,7 @@ TEST_F(CollectionTest, SearchHighlightFieldFully) {
spp::sparse_hash_set<std::string> excluded_fields = {"tags"};
res = coll1->search("lazy", {"title", "tags"}, "", {}, sort_fields, 0, 10, 1,
token_ordering::FREQUENCY, true, 10, spp::sparse_hash_set<std::string>(),
excluded_fields, 10, "", 5, "title, tags").get();
excluded_fields, 10, "", 5, 5, "title, tags").get();
ASSERT_EQ(1, res["hits"][0]["highlights"].size());
ASSERT_STREQ("The quick brown fox jumped over the <mark>lazy</mark> dog and ran straight to the forest to sleep.",
@ -2279,7 +2697,7 @@ TEST_F(CollectionTest, SearchHighlightFieldFully) {
excluded_fields = {"tags", "title"};
res = coll1->search("lazy", {"title", "tags"}, "", {}, sort_fields, 0, 10, 1,
token_ordering::FREQUENCY, true, 10, spp::sparse_hash_set<std::string>(),
excluded_fields, 10, "", 5, "title, tags").get();
excluded_fields, 10, "", 5, 5, "title, tags").get();
ASSERT_EQ(0, res["hits"][0]["highlights"].size());
collectionManager.drop_collection("coll1");

test/index_test.cpp Normal file (+59 lines)
View File

@ -0,0 +1,59 @@
#include <gtest/gtest.h>
#include "index.h"
#include <vector>
TEST(IndexTest, ScrubReindexDoc) {
std::unordered_map<std::string, field> search_schema;
search_schema.emplace("title", field("title", field_types::STRING, false));
search_schema.emplace("points", field("title", field_types::INT32, false));
search_schema.emplace("cast", field("cast", field_types::STRING_ARRAY, false));
search_schema.emplace("movie", field("movie", field_types::BOOL, false));
Index index("index", search_schema, {}, {});
nlohmann::json old_doc;
old_doc["id"] = "1";
old_doc["title"] = "One more thing.";
old_doc["points"] = 100;
old_doc["cast"] = {"John Wick", "Jeremy Renner"};
old_doc["movie"] = true;
// all fields remain the same
nlohmann::json update_doc1, del_doc1;
update_doc1 = old_doc;
del_doc1 = old_doc;
index.scrub_reindex_doc(update_doc1, del_doc1, old_doc);
ASSERT_EQ(1, del_doc1.size());
ASSERT_STREQ("1", del_doc1["id"].get<std::string>().c_str());
// when only some fields are updated
nlohmann::json update_doc2, del_doc2;
update_doc2["id"] = "1";
update_doc2["points"] = 100;
update_doc2["cast"] = {"Jack"};
del_doc2 = update_doc2;
index.scrub_reindex_doc(update_doc2, del_doc2, old_doc);
ASSERT_EQ(2, del_doc2.size());
ASSERT_STREQ("1", del_doc2["id"].get<std::string>().c_str());
std::vector<std::string> cast = del_doc2["cast"].get<std::vector<std::string>>();
ASSERT_EQ(1, cast.size());
ASSERT_STREQ("Jack", cast[0].c_str());
// contains fields that are not part of the search schema
nlohmann::json update_doc3, del_doc3;
update_doc3["id"] = "1";
update_doc3["title"] = "The Lawyer";
update_doc3["foo"] = "Bar";
del_doc3 = update_doc3;
index.scrub_reindex_doc(update_doc3, del_doc3, old_doc);
ASSERT_EQ(3, del_doc3.size());
ASSERT_STREQ("1", del_doc3["id"].get<std::string>().c_str());
ASSERT_STREQ("The Lawyer", del_doc3["title"].get<std::string>().c_str());
ASSERT_STREQ("Bar", del_doc3["foo"].get<std::string>().c_str());
}
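
The assertions above pin down the scrubbing behaviour for partial updates: fields that belong to the search schema and are unchanged from the old document are dropped from the delete doc, while `id`, changed fields, and fields outside the schema survive. A minimal sketch of that rule, inferred from the test rather than taken from the actual scrub_reindex_doc implementation:

```
#include <string>
#include <unordered_set>
#include <vector>
#include "json.hpp"  // nlohmann::json, as used by the tests

// Drop schema fields whose values did not change, so only modified fields
// are un-indexed and re-indexed. Inferred behaviour; illustrative only.
void scrub_del_doc(nlohmann::json& del_doc, const nlohmann::json& old_doc,
                   const std::unordered_set<std::string>& schema_fields) {
    std::vector<std::string> unchanged;
    for(auto it = del_doc.begin(); it != del_doc.end(); ++it) {
        if(it.key() == "id" || schema_fields.count(it.key()) == 0) {
            continue;
        }
        if(old_doc.contains(it.key()) && old_doc[it.key()] == it.value()) {
            unchanged.push_back(it.key());
        }
    }
    for(const auto& key: unchanged) {
        del_doc.erase(key);
    }
}
```
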

View File

@ -0,0 +1,100 @@
the
of
to
and
a
in
is
it
you
that
he
was
for
on
are
with
as
I
his
they
be
at
one
have
this
from
or
had
by
not
word
but
what
some
we
can
out
other
were
all
there
when
up
use
your
how
said
an
each
she
which
do
their
time
if
will
way
about
many
then
them
write
would
like
so
these
her
long
make
thing
see
him
two
has
look
more
day
could
go
come
did
number
sound
no
most
people
my
over
know
water
than
call
first
who
may
down
side
been
now
find

View File

@ -12,7 +12,8 @@ TEST(SortedArrayTest, Append) {
EXPECT_EQ(arr.indexOf(100), 0); // when not found must be equal to length (0 in this case)
for(uint32_t i=0; i < SIZE; i++) {
arr.append(i);
size_t appended_index = arr.append(i);
ASSERT_EQ(i, appended_index);
}
EXPECT_EQ(arr.getLength(), SIZE);
@ -28,11 +29,94 @@ TEST(SortedArrayTest, Append) {
EXPECT_EQ(arr.indexOf(SIZE+1), SIZE);
sorted_array arr_small;
arr_small.append(100);
size_t appended_index = arr_small.append(100);
EXPECT_EQ(0, appended_index);
EXPECT_EQ(arr_small.getLength(), 1);
EXPECT_EQ(arr_small.at(0), 100);
}
TEST(SortedArrayTest, AppendOutOfOrder) {
sorted_array arr;
for(size_t i=5; i<=10; i++) {
size_t appended_index = arr.append(i);
ASSERT_EQ(i-5, appended_index);
}
EXPECT_EQ(6, arr.getLength());
int appended_index = -1;
appended_index = arr.append(1);
ASSERT_EQ(0, appended_index);
appended_index = arr.append(3);
ASSERT_EQ(1, appended_index);
appended_index = arr.append(2);
ASSERT_EQ(1, appended_index);
appended_index = arr.append(4);
ASSERT_EQ(3, appended_index);
appended_index = arr.append(11);
ASSERT_EQ(10, appended_index);
appended_index = arr.append(14);
ASSERT_EQ(11, appended_index);
appended_index = arr.append(12);
ASSERT_EQ(11, appended_index);
EXPECT_EQ(13, arr.getLength());
}
TEST(SortedArrayTest, InsertAtIndex) {
std::vector<uint32_t> eles;
sorted_array arr;
for(size_t i=5; i<=9; i++) {
arr.append(i);
}
arr.append(11);
eles = {5, 6, 7, 8, 9, 11};
for(size_t i=0; i < eles.size(); i++) {
ASSERT_EQ(eles[i], arr.at(i));
}
arr.insert(0, 1);
eles = { 1, 5, 6, 7, 8, 9, 11 };
for(size_t i=0; i < eles.size(); i++) {
ASSERT_EQ(eles[i], arr.at(i));
}
ASSERT_EQ(1, arr.at(0));
ASSERT_EQ(5, arr.at(1));
arr.insert(1, 2);
eles = {1, 2, 5, 6, 7, 8, 9, 11};
ASSERT_EQ(1, arr.at(0));
ASSERT_EQ(2, arr.at(1));
ASSERT_EQ(8, arr.getLength());
for(size_t i=0; i < eles.size(); i++) {
ASSERT_EQ(eles[i], arr.at(i));
}
arr.insert(7, 10);
eles = { 1, 2, 5, 6, 7, 8, 9, 10, 11};
ASSERT_EQ(10, arr.at(7));
ASSERT_EQ(11, arr.at(8));
ASSERT_EQ(9, arr.getLength());
for(size_t i=0; i < eles.size(); i++) {
ASSERT_EQ(eles[i], arr.at(i));
}
ASSERT_FALSE(arr.insert(9, 12)); // index out of range
}
TEST(SortedArrayTest, Load) {
sorted_array arr;
@ -70,6 +154,32 @@ TEST(SortedArrayTest, Uncompress) {
delete[] raw_sorted_arr;
}
TEST(SortedArrayTest, RemoveValue) {
sorted_array arr;
const size_t SIZE = 10*1000;
for(size_t i=0; i<SIZE; i++) {
arr.append(i);
}
uint32_t values[5] = {0, 100, 1000, 2000, SIZE-1};
for(size_t i=0; i<5; i++) {
arr.remove_value(values[i]);
}
ASSERT_EQ(arr.getLength(), SIZE-5);
for(size_t i=0; i<SIZE-5; i++) {
uint32_t value = arr.at(i);
ASSERT_FALSE(value == 0);
ASSERT_FALSE(value == 100);
ASSERT_FALSE(value == 1000);
ASSERT_FALSE(value == 2000);
ASSERT_FALSE(value == SIZE-1);
}
}
TEST(SortedArrayTest, RemoveValues) {
sorted_array arr;