#include "batched_indexer.h"
|
|
#include "core_api.h"
|
|
#include "thread_local_vars.h"
|
|
|
|
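
// The indexer runs a fixed pool of worker threads; queues[i] is guarded by
// qmutuxes[i] and drained only by worker i.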
BatchedIndexer::BatchedIndexer(HttpServer* server, Store* store, const size_t num_threads):
        server(server), store(store), num_threads(num_threads),
        last_gc_run(std::chrono::high_resolution_clock::now()), exit(false) {
    thread_pool = new ThreadPool(num_threads);
    queues.resize(num_threads);
    qmutuxes = new std::mutex[num_threads];
}
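
// Persists an incoming write chunk to the store; once the last chunk of a
// request has arrived, the request id is queued for the worker thread that
// owns its collection.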
void BatchedIndexer::enqueue(const std::shared_ptr<http_req>& req, const std::shared_ptr<http_res>& res) {
    std::string& coll_name = req->params["collection"];

    if(coll_name.empty()) {
        route_path* rpath;
        server->get_route(req->route_hash, &rpath);

        // ensure that collection creation is sent to the same queue as writes to that collection
        if(rpath->handler == post_create_collection) {
            nlohmann::json obj = nlohmann::json::parse(req->body, nullptr, false);

            if(!obj.is_discarded() && obj.is_object() &&
               obj.count("name") != 0 && obj["name"].is_string()) {
                coll_name = obj["name"];
            }
        }
    }
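
    // writes are sharded by collection name, so every write to a given
    // collection lands on the same queue and is applied by a single thread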
    uint64_t queue_id = StringUtils::hash_wy(coll_name.c_str(), coll_name.size()) % num_threads;

    uint32_t chunk_sequence = 0;
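
    // a large request body arrives in chunks; count the chunks seen so far so
    // that each chunk can be stored under its own sequence-numbered key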
    {
        std::unique_lock lk(mutex);
        chunk_sequence = request_to_chunk[req->start_ts];
        request_to_chunk[req->start_ts] += 1;
    }

    const std::string& req_key_prefix = get_req_prefix_key(req->start_ts);
    const std::string& request_chunk_key = req_key_prefix + StringUtils::serialize_uint32_t(chunk_sequence);

    //LOG(INFO) << "req_id: " << req->start_ts << ", chunk_sequence: " << chunk_sequence;

    store->insert(request_chunk_key, req->serialize());
    req->body = "";
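
    // register the req/res pair only once per request, stamped with the batch
    // start time so that the GC pass can prune requests abandoned mid-stream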
    {
        std::unique_lock lk(mutex);
        auto req_res_map_it = req_res_map.find(req->start_ts);
        if(req_res_map_it == req_res_map.end()) {
            uint64_t batch_begin_ts = std::chrono::duration_cast<std::chrono::seconds>(
                                      std::chrono::system_clock::now().time_since_epoch()).count();

            req_res_t req_res{req, res, batch_begin_ts};
            req_res_map[req->start_ts] = req_res;
        }
    }
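
    // the request id is queued for indexing only when its final chunk has
    // arrived; until then, chunks simply accumulate in the store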
    if(req->last_chunk_aggregate) {
        //LOG(INFO) << "Last chunk for req_id: " << req->start_ts << ", queue_id: " << queue_id;

        queued_writes += (chunk_sequence + 1);

        {
            std::unique_lock lk(qmutuxes[queue_id]);
            queues[queue_id].emplace_back(req->start_ts);
        }

        std::unique_lock lk(mutex);
        request_to_chunk.erase(req->start_ts);
    }
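
    // signal the HTTP layer to continue reading the request stream, allowing
    // the next chunk of a streaming upload to flow in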
    if(req->_req != nullptr && req->_req->proceed_req) {
        deferred_req_res_t* req_res = new deferred_req_res_t(req, res, server, true);
        server->get_message_dispatcher()->send_message(HttpServer::REQUEST_PROCEED_MESSAGE, req_res);
    }
}
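
// Spawns one long-lived worker per queue to replay persisted request chunks
// through their route handlers, then loops on the calling thread running
// periodic GC until stop() is called.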
void BatchedIndexer::run() {
    LOG(INFO) << "Starting batch indexer with " << num_threads << " threads.";

    for(size_t i = 0; i < num_threads; i++) {
        std::deque<uint64_t>& queue = queues[i];
        std::mutex& queue_mutex = qmutuxes[i];
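
        // each worker drains only its own queue; a queue entry is the id of a
        // request whose final chunk has already been persisted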
        thread_pool->enqueue([&queue, &queue_mutex, this, i]() {
            while(!exit) {
                std::unique_lock<std::mutex> qlk(queue_mutex);

                if(queue.empty()) {
                    qlk.unlock();
                } else {
                    uint64_t req_id = queue.front();
                    queue.pop_front();
                    qlk.unlock();

                    std::unique_lock mlk(mutex);
                    req_res_t orig_req_res = req_res_map[req_id];
                    mlk.unlock();

                    // scan db for all logs associated with request
                    const std::string& req_key_prefix = get_req_prefix_key(req_id);

                    rocksdb::Iterator* iter = store->scan(req_key_prefix);
                    std::string prev_body = ""; // used to handle partial JSON documents caused by chunking
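
                    // replay each chunk in order through its route handler; a
                    // handler can leave a trailing partial document in the body,
                    // which is carried over to the next chunk via prev_body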
                    while(iter->Valid() && iter->key().starts_with(req_key_prefix)) {
                        std::shared_ptr<http_req>& orig_req = orig_req_res.req;
                        auto _req = orig_req->_req;
                        orig_req->body = prev_body;
                        orig_req->deserialize(iter->value().ToString());
                        orig_req->_req = _req;

                        // update thread local for reference during a crash
                        write_log_index = orig_req->log_index;

                        //LOG(INFO) << "original request: " << orig_req_res.req << ", _req: " << orig_req_res.req->_req;

                        route_path* found_rpath = nullptr;
                        bool route_found = server->get_route(orig_req->route_hash, &found_rpath);
                        bool async_res = false;

                        if(route_found) {
                            async_res = found_rpath->async_res;
                            found_rpath->handler(orig_req, orig_req_res.res);
                            prev_body = orig_req->body;
                        } else {
                            orig_req_res.res->set_404();
                            prev_body = "";
                        }
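
                        // for synchronous handlers, stream the response back to
                        // the client from here; async handlers respond on their own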
                        if(!async_res && orig_req_res.req->_req != nullptr) {
                            deferred_req_res_t* deferred_req_res = new deferred_req_res_t(orig_req_res.req,
                                                                                          orig_req_res.res,
                                                                                          server, true);
                            server->get_message_dispatcher()->send_message(HttpServer::STREAM_RESPONSE_MESSAGE,
                                                                           deferred_req_res);
                        }

                        queued_writes--;
                        iter->Next();
                    }

                    delete iter;

                    //LOG(INFO) << "Erasing request data from disk and memory for request " << orig_req_res.req->start_ts;

                    // we can delete the buffered request content
                    store->delete_range(req_key_prefix, req_key_prefix + StringUtils::serialize_uint32_t(UINT32_MAX));

                    std::unique_lock lk(mutex);
                    req_res_map.erase(req_id);
                }

                std::this_thread::sleep_for(std::chrono::milliseconds(10));
            }
        });
    }
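
    // the calling thread doubles as the GC thread, pruning requests that were
    // started but never completed (e.g. the client disconnected mid-upload)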
    while(!exit) {
        std::this_thread::sleep_for(std::chrono::milliseconds(1000));

        //LOG(INFO) << "Batch indexer main thread";

        // do gc, if we are due for one
        uint64_t seconds_elapsed = std::chrono::duration_cast<std::chrono::seconds>(
                                   std::chrono::high_resolution_clock::now() - last_gc_run).count();

        if(seconds_elapsed > GC_INTERVAL_SECONDS) {
            std::unique_lock lk(mutex);
            LOG(INFO) << "Running GC for aborted requests, req map size: " << req_res_map.size();

            // iterate through all map entries and delete those older than GC_PRUNE_MAX_SECONDS
            for(auto it = req_res_map.cbegin(); it != req_res_map.cend();) {
                uint64_t seconds_since_batch_start = std::chrono::duration_cast<std::chrono::seconds>(
                        std::chrono::system_clock::now().time_since_epoch()).count() - it->second.batch_begin_ts;

                //LOG(INFO) << "Seconds since batch start: " << seconds_since_batch_start;

                if(seconds_since_batch_start > GC_PRUNE_MAX_SECONDS) {
                    LOG(INFO) << "Deleting partial upload for req id " << it->second.req->start_ts;
                    const std::string& req_key_prefix = get_req_prefix_key(it->second.req->start_ts);
                    store->delete_range(req_key_prefix, req_key_prefix + StringUtils::serialize_uint32_t(UINT32_MAX));
                    request_to_chunk.erase(it->second.req->start_ts);
                    it = req_res_map.erase(it);
                } else {
                    it++;
                }
            }

            last_gc_run = std::chrono::high_resolution_clock::now();
        }
    }

    LOG(INFO) << "Batched indexer threadpool shutdown...";
    thread_pool->shutdown();
}
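
// Key prefix under which all chunks of a request are stored:
// RAFT_REQ_LOG_PREFIX + serialized request id + "_"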
std::string BatchedIndexer::get_req_prefix_key(uint64_t req_id) {
    const std::string& req_key_prefix = RAFT_REQ_LOG_PREFIX +
                                        StringUtils::serialize_uint64_t(req_id) + "_";

    return req_key_prefix;
}

BatchedIndexer::~BatchedIndexer() {
    delete [] qmutuxes;
    delete thread_pool;
}
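
// Signals the worker threads and the GC loop in run() to wind down after
// their current iteration; run() then shuts the thread pool down.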
void BatchedIndexer::stop() {
    exit = true;
}

int64_t BatchedIndexer::get_queued_writes() {
    return queued_writes;
}