#include "collection.h"

#include <numeric>
#include <chrono>
#include <array_utils.h>
#include <match_score.h>
#include <string_utils.h>
#include <art.h>
#include <thread>
#include <rocksdb/write_batch.h>
#include "logger.h"

Collection::Collection(const std::string name, const uint32_t collection_id, const uint32_t next_seq_id, Store *store,
                       const std::vector<field> &fields, const std::string & token_ranking_field):
        name(name), collection_id(collection_id), next_seq_id(next_seq_id), store(store),
        fields(fields), token_ranking_field(token_ranking_field) {

    for(const field& field: fields) {
        search_schema.emplace(field.name, field);

        if(field.is_facet()) {
            facet_schema.emplace(field.name, field);
        }

        if(field.is_single_integer() || field.is_single_float() || field.is_single_bool()) {
            sort_schema.emplace(field.name, field);
        }
    }

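    // The collection is sharded across a fixed number of in-memory indices;
    // a document is assigned to a shard via `seq_id % num_indices`, and each
    // shard is served by its own search worker thread.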
    num_indices = 4;
    for(size_t i = 0; i < num_indices; i++) {
        Index* index = new Index(name+std::to_string(i), search_schema, facet_schema, sort_schema);
        indices.push_back(index);
        std::thread* thread = new std::thread(&Index::run_search, index);
        index_threads.push_back(thread);
    }

    num_documents = 0;
}

Collection::~Collection() {
    for(size_t i = 0; i < indices.size(); i++) {
        std::thread *t = index_threads[i];
        Index* index = indices[i];
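
        // Shutdown handshake: wake the worker with `terminate` set, wait for
        // the thread to exit, then free both the thread and its index.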
        index->ready = true;
        index->terminate = true;
        index->cv.notify_one();
        t->join();

        delete t;
        delete indices[i];
    }
}

uint32_t Collection::get_next_seq_id() {
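    // Increment the persisted counter so that sequence IDs remain monotonic
    // across restarts, then hand out the current in-memory value.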
    store->increment(get_next_seq_id_key(name), 1);
    return next_seq_id++;
}

void Collection::set_next_seq_id(uint32_t seq_id) {
    next_seq_id = seq_id;
}

void Collection::increment_next_seq_id_field() {
    next_seq_id++;
}

Option<nlohmann::json> Collection::add(const std::string & json_str) {
    nlohmann::json document;
    try {
        document = nlohmann::json::parse(json_str);
    } catch(...) {
        return Option<nlohmann::json>(400, "Bad JSON.");
    }

    uint32_t seq_id = get_next_seq_id();
    std::string seq_id_str = std::to_string(seq_id);

    if(document.count("id") == 0) {
        document["id"] = seq_id_str;
    } else if(!document["id"].is_string()) {
        return Option<nlohmann::json>(400, "Document's `id` field should be a string.");
    }

    std::string doc_id = document["id"];
    Option<nlohmann::json> doc_option = get(doc_id);

    // we need to check if the document ID already exists before attempting to index
    if(doc_option.ok()) {
        return Option<nlohmann::json>(409, std::string("A document with id ") + doc_id + " already exists.");
    }

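    // Index in memory first: if validation or in-memory indexing fails,
    // nothing gets written to disk.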
    const Option<uint32_t> & index_memory_op = index_in_memory(document, seq_id);

    if(!index_memory_op.ok()) {
        return Option<nlohmann::json>(index_memory_op.code(), index_memory_op.error());
    }

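    // Persist both mappings in one atomic batch: doc_id -> seq_id, and
    // seq_id -> the document's JSON.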
    rocksdb::WriteBatch batch;
    batch.Put(get_doc_id_key(doc_id), seq_id_str);
    batch.Put(get_seq_id_key(seq_id), document.dump());
    bool write_ok = store->batch_write(batch);

    if(!write_ok) {
        return Option<nlohmann::json>(500, "Could not write to on-disk storage.");
    }

    return Option<nlohmann::json>(document);
}

Option<uint32_t> Collection::validate_index_in_memory(const nlohmann::json &document, uint32_t seq_id) {
    if(!token_ranking_field.empty() && document.count(token_ranking_field) == 0) {
        return Option<>(400, "Field `" + token_ranking_field + "` has been declared as a token ranking field, "
                "but is not found in the document.");
    }

    if(!token_ranking_field.empty() && !document[token_ranking_field].is_number_integer() &&
       !document[token_ranking_field].is_number_float()) {
        return Option<>(400, "Token ranking field `" + token_ranking_field + "` must be a number.");
    }

    if(!token_ranking_field.empty() && document[token_ranking_field].is_number_integer() &&
       document[token_ranking_field].get<int64_t>() > std::numeric_limits<int32_t>::max()) {
        return Option<>(400, "Token ranking field `" + token_ranking_field + "` exceeds maximum value of int32.");
    }

    if(!token_ranking_field.empty() && document[token_ranking_field].is_number_float() &&
       document[token_ranking_field].get<float>() > std::numeric_limits<float>::max()) {
        return Option<>(400, "Token ranking field `" + token_ranking_field + "` exceeds maximum value of a float.");
    }

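    // Every field declared in the schema must be present in the document and
    // carry the declared type.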
    for(const std::pair<std::string, field> & field_pair: search_schema) {
        const std::string & field_name = field_pair.first;

        if(document.count(field_name) == 0) {
            return Option<>(400, "Field `" + field_name + "` has been declared in the schema, "
                    "but is not found in the document.");
        }

        if(field_pair.second.type == field_types::STRING) {
            if(!document[field_name].is_string()) {
                return Option<>(400, "Field `" + field_name + "` must be a string.");
            }
        } else if(field_pair.second.type == field_types::INT32) {
            if(!document[field_name].is_number_integer()) {
                return Option<>(400, "Field `" + field_name + "` must be an int32.");
            }

            if(document[field_name].get<int64_t>() > INT32_MAX) {
                return Option<>(400, "Field `" + field_name + "` exceeds maximum value of int32.");
            }
        } else if(field_pair.second.type == field_types::INT64) {
            if(!document[field_name].is_number_integer()) {
                return Option<>(400, "Field `" + field_name + "` must be an int64.");
            }
        } else if(field_pair.second.type == field_types::FLOAT) {
            if(!document[field_name].is_number()) { // allows integer to be passed to a float field
                return Option<>(400, "Field `" + field_name + "` must be a float.");
            }
        } else if(field_pair.second.type == field_types::BOOL) {
            if(!document[field_name].is_boolean()) {
                return Option<>(400, "Field `" + field_name + "` must be a bool.");
            }
        } else if(field_pair.second.type == field_types::STRING_ARRAY) {
            if(!document[field_name].is_array()) {
                return Option<>(400, "Field `" + field_name + "` must be a string array.");
            }

            if(document[field_name].size() > 0 && !document[field_name][0].is_string()) {
                return Option<>(400, "Field `" + field_name + "` must be a string array.");
            }
        } else if(field_pair.second.type == field_types::INT32_ARRAY) {
            if(!document[field_name].is_array()) {
                return Option<>(400, "Field `" + field_name + "` must be an int32 array.");
            }

            if(document[field_name].size() > 0 && !document[field_name][0].is_number_integer()) {
                return Option<>(400, "Field `" + field_name + "` must be an int32 array.");
            }
        } else if(field_pair.second.type == field_types::INT64_ARRAY) {
            if(!document[field_name].is_array()) {
                return Option<>(400, "Field `" + field_name + "` must be an int64 array.");
            }

            if(document[field_name].size() > 0 && !document[field_name][0].is_number_integer()) {
                return Option<>(400, "Field `" + field_name + "` must be an int64 array.");
            }
        } else if(field_pair.second.type == field_types::FLOAT_ARRAY) {
            if(!document[field_name].is_array()) {
                return Option<>(400, "Field `" + field_name + "` must be a float array.");
            }

            if(document[field_name].size() > 0 && !document[field_name][0].is_number_float()) {
                return Option<>(400, "Field `" + field_name + "` must be a float array.");
            }
        } else if(field_pair.second.type == field_types::BOOL_ARRAY) {
            if(!document[field_name].is_array()) {
                return Option<>(400, "Field `" + field_name + "` must be a bool array.");
            }

            if(document[field_name].size() > 0 && !document[field_name][0].is_boolean()) {
                return Option<>(400, "Field `" + field_name + "` must be a bool array.");
            }
        }
    }

    for(const std::pair<std::string, field> & field_pair: facet_schema) {
        const std::string & field_name = field_pair.first;

        if(document.count(field_name) == 0) {
            return Option<>(400, "Field `" + field_name + "` has been declared as a facet field in the schema, "
                    "but is not found in the document.");
        }

        if(field_pair.second.type == field_types::STRING) {
            if(!document[field_name].is_string()) {
                return Option<>(400, "Facet field `" + field_name + "` must be a string.");
            }
        } else if(field_pair.second.type == field_types::STRING_ARRAY) {
            if(!document[field_name].is_array()) {
                return Option<>(400, "Facet field `" + field_name + "` must be a string array.");
            }

            if(document[field_name].size() > 0 && !document[field_name][0].is_string()) {
                return Option<>(400, "Facet field `" + field_name + "` must be a string array.");
            }
        } else {
            return Option<>(400, "Facet field `" + field_name + "` must be a string or a string[].");
        }
    }

    return Option<>(200);
}

Option<uint32_t> Collection::index_in_memory(const nlohmann::json &document, uint32_t seq_id) {
    Option<uint32_t> validation_op = validate_index_in_memory(document, seq_id);

    if(!validation_op.ok()) {
        return validation_op;
    }

    int32_t points = 0;

    if(!token_ranking_field.empty()) {
        if(document[token_ranking_field].is_number_float()) {
            // serialize float to an integer and reverse the inverted range
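            // The XOR flips the sign bit of a non-negative float's bit pattern
            // (and all bits of a negative one), yielding an integer that sorts
            // in the same order as the original float; the final
            // `-1 * (INT32_MAX - points)` rebases that value into the signed
            // range used for ranking.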
            float n = document[token_ranking_field];
            memcpy(&points, &n, sizeof(int32_t));
            points ^= ((points >> (std::numeric_limits<int32_t>::digits - 1)) | INT32_MIN);
            points = -1 * (INT32_MAX - points);
        } else {
            points = document[token_ranking_field];
        }
    }

    Index* index = indices[seq_id % num_indices];
    index->index_in_memory(document, seq_id, points);

    num_documents += 1;
    return Option<>(200);
}

Option<nlohmann::json> Collection::search(std::string query, const std::vector<std::string> search_fields,
                                          const std::string & simple_filter_query, const std::vector<std::string> & facet_fields,
                                          const std::vector<sort_by> & sort_fields, const int num_typos,
                                          const size_t per_page, const size_t page,
                                          const token_ordering token_order, const bool prefix) {
    std::vector<facet> facets;

    // validate search fields
    for(const std::string & field_name: search_fields) {
        if(search_schema.count(field_name) == 0) {
            std::string error = "Could not find a field named `" + field_name + "` in the schema.";
            return Option<nlohmann::json>(404, error);
        }

        field search_field = search_schema.at(field_name);
        if(search_field.type != field_types::STRING && search_field.type != field_types::STRING_ARRAY) {
            std::string error = "Field `" + field_name + "` should be a string or a string array.";
            return Option<nlohmann::json>(400, error);
        }

        if(search_field.facet) {
            std::string error = "Field `" + field_name + "` is a faceted field - it cannot be used as a query field.";
            return Option<nlohmann::json>(400, error);
        }
    }

    // validate filter fields
    std::vector<std::string> filter_blocks;
    StringUtils::split(simple_filter_query, filter_blocks, "&&");

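    // Each filter block has the form `field: value`, where value is a scalar,
    // a `[..]` list, or (for numeric fields) a value carrying a comparator
    // prefix such as < or >=.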
    std::vector<filter> filters;
    for(const std::string & filter_block: filter_blocks) {
        // split into [field_name, value]
        std::vector<std::string> expression_parts;
        StringUtils::split(filter_block, expression_parts, ":");
        if(expression_parts.size() != 2) {
            return Option<nlohmann::json>(400, "Could not parse the filter query.");
        }

        const std::string & field_name = expression_parts[0];
        if(search_schema.count(field_name) == 0) {
            return Option<nlohmann::json>(404, "Could not find a filter field named `" + field_name + "` in the schema.");
        }

        field _field = search_schema.at(field_name);
        std::string & raw_value = expression_parts[1];
        filter f;

        if(_field.is_integer() || _field.is_float()) {
            // could be a single value or a list
            if(raw_value[0] == '[' && raw_value[raw_value.size() - 1] == ']') {
                std::vector<std::string> filter_values;
                StringUtils::split(raw_value.substr(1, raw_value.size() - 2), filter_values, ",");

                for(const std::string & filter_value: filter_values) {
                    if(_field.is_integer() && !StringUtils::is_integer(filter_value)) {
                        return Option<nlohmann::json>(400, "Error with field `" + _field.name + "`: Not an integer.");
                    }

                    if(_field.is_float() && !StringUtils::is_float(filter_value)) {
                        return Option<nlohmann::json>(400, "Error with field `" + _field.name + "`: Not a float.");
                    }
                }

                f = {field_name, filter_values, EQUALS};
            } else {
                Option<NUM_COMPARATOR> op_comparator = filter::extract_num_comparator(raw_value);
                if(!op_comparator.ok()) {
                    return Option<nlohmann::json>(400, "Error with field `" + _field.name + "`: " + op_comparator.error());
                }

                // extract the numerical value by stripping the comparator prefix
                std::string filter_value;
                if(op_comparator.get() == LESS_THAN || op_comparator.get() == GREATER_THAN) {
                    filter_value = raw_value.substr(1);
                } else if(op_comparator.get() == LESS_THAN_EQUALS || op_comparator.get() == GREATER_THAN_EQUALS) {
                    filter_value = raw_value.substr(2);
                } else {
                    // EQUALS
                    filter_value = raw_value;
                }

                filter_value = StringUtils::trim(filter_value);

                if(_field.is_integer() && !StringUtils::is_integer(filter_value)) {
                    return Option<nlohmann::json>(400, "Error with field `" + _field.name + "`: Not an integer.");
                }

                if(_field.is_float() && !StringUtils::is_float(filter_value)) {
                    return Option<nlohmann::json>(400, "Error with field `" + _field.name + "`: Not a float.");
                }

                f = {field_name, {filter_value}, op_comparator.get()};
            }
        } else if(_field.is_bool()) {
            if(raw_value != "true" && raw_value != "false") {
                return Option<nlohmann::json>(400, "Value of field `" + _field.name + "` must be `true` or `false`.");
            }
            std::string bool_value = (raw_value == "true") ? "1" : "0";
            f = {field_name, {bool_value}, EQUALS};
        } else if(_field.is_string()) {
            if(raw_value[0] == '[' && raw_value[raw_value.size() - 1] == ']') {
                std::vector<std::string> filter_values;
                StringUtils::split(raw_value.substr(1, raw_value.size() - 2), filter_values, ",");
                f = {field_name, filter_values, EQUALS};
            } else {
                f = {field_name, {raw_value}, EQUALS};
            }
        } else {
            return Option<nlohmann::json>(400, "Error with field `" + _field.name + "`: Unidentified field type.");
        }

        filters.push_back(f);
    }

    // validate facet fields
    for(const std::string & field_name: facet_fields) {
        if(facet_schema.count(field_name) == 0) {
            std::string error = "Could not find a facet field named `" + field_name + "` in the schema.";
            return Option<nlohmann::json>(404, error);
        }
        facets.push_back(facet(field_name));
    }

    // validate sort fields and standardize

    std::vector<sort_by> sort_fields_std;

    for(const sort_by & _sort_field: sort_fields) {
        if(sort_schema.count(_sort_field.name) == 0) {
            std::string error = "Could not find a field named `" + _sort_field.name + "` in the schema for sorting.";
            return Option<nlohmann::json>(404, error);
        }

        std::string sort_order = _sort_field.order;
        StringUtils::toupper(sort_order);

        if(sort_order != sort_field_const::asc && sort_order != sort_field_const::desc) {
            std::string error = "Order for field `" + _sort_field.name + "` should be either ASC or DESC.";
            return Option<nlohmann::json>(400, error);
        }

        sort_fields_std.push_back({_sort_field.name, sort_order});
    }

    // check for valid pagination
    if(page < 1) {
        std::string message = "Page must be an integer of value greater than 0.";
        return Option<nlohmann::json>(422, message);
    }

    if((page * per_page) > MAX_RESULTS) {
        std::string message = "Only the first " + std::to_string(MAX_RESULTS) + " results are available.";
        return Option<nlohmann::json>(422, message);
    }

    //auto begin = std::chrono::high_resolution_clock::now();

    // all search queries that were used for generating the results
    std::vector<std::vector<art_leaf*>> searched_queries;
    std::vector<std::pair<int, Topster<512>::KV>> field_order_kvs;
    size_t total_found = 0;

    // send data to individual index threads
    for(Index* index: indices) {
        index->search_params = search_args(query, search_fields, filters, facets, sort_fields_std,
                                           num_typos, per_page, page, token_order, prefix);
        {
            std::lock_guard<std::mutex> lk(index->m);
            index->ready = true;
            index->processed = false;
        }
        index->cv.notify_one();
        //std::this_thread::sleep_for(std::chrono::milliseconds(400));
    }

    Option<nlohmann::json> index_search_op({}); // stores the last error across all index threads

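    // Gather phase: wait for each index thread to signal completion, then merge
    // its partial results (hits, searched queries, facet counts) into the
    // collection-level response.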
    for(Index* index: indices) {
        // wait for the worker
        {
            std::unique_lock<std::mutex> lk(index->m);
            index->cv.wait(lk, [index]{return index->processed;});
        }

        if(!index->search_params.outcome.ok()) {
            index_search_op = Option<nlohmann::json>(index->search_params.outcome.code(),
                                                     index->search_params.outcome.error());
        }

        if(!index_search_op.ok()) {
            // we still need to iterate without breaking to release the locks
            continue;
        }

        // remap each result's query index before appending, since the
        // `searched_queries` of earlier indices have already been merged in
        for(auto & field_order_kv: index->search_params.field_order_kvs) {
            field_order_kv.second.query_index += searched_queries.size();
            field_order_kvs.push_back(field_order_kv);
        }

        searched_queries.insert(searched_queries.end(), index->search_params.searched_queries.begin(),
                                index->search_params.searched_queries.end());

        // accumulate this index's facet counts into the collection-level counts
        for(size_t fi = 0; fi < index->search_params.facets.size(); fi++) {
            auto & this_facet = index->search_params.facets[fi];
            auto & acc_facet = facets[fi];

            for(auto & facet_kv: this_facet.result_map) {
                size_t count = 0;

                if(acc_facet.result_map.count(facet_kv.first) == 0) {
                    // not found, so set it
                    count = facet_kv.second;
                } else {
                    count = acc_facet.result_map[facet_kv.first] + facet_kv.second;
                }

                acc_facet.result_map[facet_kv.first] = count;
            }
        }

        total_found += index->search_params.all_result_ids_len;
    }

    if(!index_search_op.ok()) {
        return index_search_op;
    }

    // results are sorted descending on (match score, primary sort attribute,
    // secondary sort attribute, field position, sequence id)
    std::sort(field_order_kvs.begin(), field_order_kvs.end(),
              [](const std::pair<int, Topster<512>::KV> & a, const std::pair<int, Topster<512>::KV> & b) {
                  return std::tie(a.second.match_score, a.second.primary_attr, a.second.secondary_attr, a.first, a.second.key) >
                         std::tie(b.second.match_score, b.second.primary_attr, b.second.secondary_attr, b.first, b.second.key);
              });

    nlohmann::json result = nlohmann::json::object();

    result["hits"] = nlohmann::json::array();
    result["found"] = total_found;

    const int start_result_index = (page - 1) * per_page;
    const int kvsize = field_order_kvs.size();

    if(start_result_index > (kvsize - 1)) {
        return Option<nlohmann::json>(result);
    }

    const int end_result_index = std::min(int(page * per_page), kvsize) - 1;

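    // For every hit in the requested page window: fetch the stored JSON by
    // sequence ID, wrap it, and attach a highlighted snippet for the matching
    // string field.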
    for(int field_order_kv_index = start_result_index; field_order_kv_index <= end_result_index; field_order_kv_index++) {
        const auto & field_order_kv = field_order_kvs[field_order_kv_index];
        const std::string& seq_id_key = get_seq_id_key((uint32_t) field_order_kv.second.key);

        std::string json_doc_str;
        StoreStatus json_doc_status = store->get(seq_id_key, json_doc_str);

        if(json_doc_status != StoreStatus::FOUND) {
            LOG(WARNING) << "Could not locate the JSON document for sequence ID: " << seq_id_key;
            continue;
        }

        nlohmann::json wrapper_doc;
        nlohmann::json document;

        try {
            document = nlohmann::json::parse(json_doc_str);
        } catch(...) {
            return Option<nlohmann::json>(500, "Error while parsing stored document.");
        }

        wrapper_doc["document"] = document;

        // highlight query words in the result
        const std::string & field_name = search_fields[search_fields.size() - field_order_kv.first];
        field search_field = search_schema.at(field_name);

        // only string fields are supported for now
        if(search_field.type == field_types::STRING) {
            std::vector<std::string> tokens;
            StringUtils::split(document[field_name], tokens, " ");

            // positions in the document of each token in the query
            std::vector<std::vector<uint16_t>> token_positions;

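            // Decode this document's offsets for each token of the matched query:
            // offsets of all documents are packed into `offsets`, and `offset_index`
            // marks where a given document's run begins.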
            for (const art_leaf *token_leaf : searched_queries[field_order_kv.second.query_index]) {
                std::vector<uint16_t> positions;
                uint32_t doc_index = token_leaf->values->ids.indexOf(field_order_kv.second.key);
                if(doc_index == token_leaf->values->ids.getLength()) {
                    continue;
                }

                uint32_t start_offset = token_leaf->values->offset_index.at(doc_index);
                uint32_t end_offset = (doc_index == token_leaf->values->ids.getLength() - 1) ?
                                      token_leaf->values->offsets.getLength() :
                                      token_leaf->values->offset_index.at(doc_index+1);

                while(start_offset < end_offset) {
                    positions.push_back((uint16_t) token_leaf->values->offsets.at(start_offset));
                    start_offset++;
                }

                token_positions.push_back(positions);
            }

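            // Determine the best matching span of the query tokens within this
            // document, so that the snippet can be centered around it.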
            Match match = Match::match(field_order_kv.second.key, token_positions);

            // unpack `match.offset_diffs` into `token_indices`
            std::vector<size_t> token_indices;
            size_t num_tokens_found = (size_t) match.offset_diffs[0];
            for(size_t i = 1; i <= num_tokens_found; i++) {
                if(match.offset_diffs[i] != std::numeric_limits<int8_t>::max()) {
                    size_t token_index = (size_t)(match.start_offset + match.offset_diffs[i]);
                    token_indices.push_back(token_index);
                }
            }

            auto minmax = std::minmax_element(token_indices.begin(), token_indices.end());

            // For longer strings, pick surrounding tokens within N tokens of min_index and max_index for the snippet
            const size_t start_index = (tokens.size() <= SNIPPET_STR_ABOVE_LEN) ? 0 :
                                       std::max(0, (int)(*(minmax.first) - 5));

            const size_t end_index = (tokens.size() <= SNIPPET_STR_ABOVE_LEN) ? tokens.size() :
                                     std::min((int)tokens.size(), (int)(*(minmax.second) + 5));

            for(const size_t token_index: token_indices) {
                tokens[token_index] = "<mark>" + tokens[token_index] + "</mark>";
            }

            std::stringstream snippet_stream;
            for(size_t snippet_index = start_index; snippet_index < end_index; snippet_index++) {
                if(snippet_index != start_index) {
                    snippet_stream << " ";
                }

                snippet_stream << tokens[snippet_index];
            }

            wrapper_doc["_highlight"] = nlohmann::json::object();
            wrapper_doc["_highlight"][field_name] = snippet_stream.str();
        }

        result["hits"].push_back(wrapper_doc);
    }

    result["facet_counts"] = nlohmann::json::array();

    // populate facets
    for(const facet & a_facet: facets) {
        nlohmann::json facet_result = nlohmann::json::object();
        facet_result["field_name"] = a_facet.field_name;
        facet_result["counts"] = nlohmann::json::array();

        // keep only the top 10 facet values
        std::vector<std::pair<std::string, size_t>> value_to_count;
        for (auto itr = a_facet.result_map.begin(); itr != a_facet.result_map.end(); ++itr) {
            value_to_count.push_back(*itr);
        }

        std::sort(value_to_count.begin(), value_to_count.end(),
                  [=](std::pair<std::string, size_t>& a, std::pair<std::string, size_t>& b) {
                      return a.second > b.second;
                  });

        for(size_t i = 0; i < std::min((size_t)10, value_to_count.size()); i++) {
            auto & kv = value_to_count[i];
            nlohmann::json facet_value_count = nlohmann::json::object();
            facet_value_count["value"] = kv.first;
            facet_value_count["count"] = kv.second;
            facet_result["counts"].push_back(facet_value_count);
        }

        result["facet_counts"].push_back(facet_result);
    }

    //long long int timeMillis = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::high_resolution_clock::now() - begin).count();
    //LOG(INFO) << "Time taken for result calc: " << timeMillis << "us";
    //store->print_memory_usage();
    return result;
}

Option<nlohmann::json> Collection::get(const std::string & id) {
    std::string seq_id_str;
    StoreStatus seq_id_status = store->get(get_doc_id_key(id), seq_id_str);

    if(seq_id_status == StoreStatus::NOT_FOUND) {
        return Option<nlohmann::json>(404, "Could not find a document with id: " + id);
    }

    if(seq_id_status == StoreStatus::ERROR) {
        return Option<nlohmann::json>(500, "Error while fetching the document.");
    }

    uint32_t seq_id = (uint32_t) std::stol(seq_id_str);

    std::string parsed_document;
    StoreStatus doc_status = store->get(get_seq_id_key(seq_id), parsed_document);

    if(doc_status == StoreStatus::NOT_FOUND) {
        LOG(WARNING) << "Sequence ID exists, but document is missing for id: " << id;
        return Option<nlohmann::json>(404, "Could not find a document with id: " + id);
    }

    if(doc_status == StoreStatus::ERROR) {
        return Option<nlohmann::json>(500, "Error while fetching the document.");
    }

    nlohmann::json document;
    try {
        document = nlohmann::json::parse(parsed_document);
    } catch(...) {
        return Option<nlohmann::json>(500, "Error while parsing stored document.");
    }

    return Option<nlohmann::json>(document);
}

Option<std::string> Collection::remove(const std::string & id, const bool remove_from_store) {
    std::string seq_id_str;
    StoreStatus seq_id_status = store->get(get_doc_id_key(id), seq_id_str);

    if(seq_id_status == StoreStatus::NOT_FOUND) {
        return Option<std::string>(404, "Could not find a document with id: " + id);
    }

    if(seq_id_status == StoreStatus::ERROR) {
        return Option<std::string>(500, "Error while fetching the document.");
    }

    uint32_t seq_id = (uint32_t) std::stol(seq_id_str);

    std::string parsed_document;
    StoreStatus doc_status = store->get(get_seq_id_key(seq_id), parsed_document);

    if(doc_status == StoreStatus::NOT_FOUND) {
        LOG(WARNING) << "Sequence ID exists, but document is missing for id: " << id;
        return Option<std::string>(404, "Could not find a document with id: " + id);
    }

    if(doc_status == StoreStatus::ERROR) {
        return Option<std::string>(500, "Error while fetching the document.");
    }

    nlohmann::json document;
    try {
        document = nlohmann::json::parse(parsed_document);
    } catch(...) {
        return Option<std::string>(500, "Error while parsing stored document.");
    }

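    // Remove from the same index shard that the document was assigned to at
    // index time.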
    Index* index = indices[seq_id % num_indices];
    index->remove(seq_id, document);

    if(remove_from_store) {
        store->remove(get_doc_id_key(id));
        store->remove(get_seq_id_key(seq_id));
    }

    num_documents -= 1;

    return Option<std::string>(id);
}

std::string Collection::get_next_seq_id_key(const std::string & collection_name) {
    return std::string(COLLECTION_NEXT_SEQ_PREFIX) + "_" + collection_name;
}

std::string Collection::get_seq_id_key(uint32_t seq_id) {
    // We can't simply do std::to_string() because we want to preserve the byte order.
    // & 0xFF masks all but the lowest eight bits.
    const std::string & serialized_id = StringUtils::serialize_uint32_t(seq_id);
    return get_seq_id_collection_prefix() + "_" + serialized_id;
}

std::string Collection::get_doc_id_key(const std::string & doc_id) {
    return std::to_string(collection_id) + "_" + DOC_ID_PREFIX + "_" + doc_id;
}

std::string Collection::get_name() {
    return name;
}

size_t Collection::get_num_documents() {
    return num_documents;
}

uint32_t Collection::get_collection_id() {
    return collection_id;
}

Option<uint32_t> Collection::doc_id_to_seq_id(std::string doc_id) {
    std::string seq_id_str;
    StoreStatus status = store->get(get_doc_id_key(doc_id), seq_id_str);
    if(status == StoreStatus::FOUND) {
        uint32_t seq_id = (uint32_t) std::stoi(seq_id_str);
        return Option<uint32_t>(seq_id);
    }

    if(status == StoreStatus::NOT_FOUND) {
        return Option<uint32_t>(404, "Not found.");
    }

    return Option<uint32_t>(500, "Error while fetching doc_id from store.");
}

std::vector<std::string> Collection::get_facet_fields() {
    std::vector<std::string> facet_fields_copy;
    for(auto it = facet_schema.begin(); it != facet_schema.end(); ++it) {
        facet_fields_copy.push_back(it->first);
    }

    return facet_fields_copy;
}

std::vector<field> Collection::get_sort_fields() {
    std::vector<field> sort_fields_copy;
    for(auto it = sort_schema.begin(); it != sort_schema.end(); ++it) {
        sort_fields_copy.push_back(it->second);
    }

    return sort_fields_copy;
}

std::vector<field> Collection::get_fields() {
    return fields;
}

std::unordered_map<std::string, field> Collection::get_schema() {
    return search_schema;
}

std::string Collection::get_meta_key(const std::string & collection_name) {
    return std::string(COLLECTION_META_PREFIX) + "_" + collection_name;
}

std::string Collection::get_seq_id_collection_prefix() {
    return std::to_string(collection_id) + "_" + std::string(SEQ_ID_PREFIX);
}

std::string Collection::get_token_ranking_field() {
    return token_ranking_field;
}