#include "store.h" #include "raft_server.h" #include #include #include #include #include #include #include "rocksdb/utilities/checkpoint.h" void ReplicationClosure::Run() { // nothing much to do here since responding to client is handled upstream // Auto delete `this` after Run() std::unique_ptr self_guard(this); } // State machine implementation int ReplicationState::start(const int api_port, int raft_port, int election_timeout_ms, int snapshot_interval_s, const std::string & raft_dir, const std::string & peers) { butil::EndPoint addr(butil::my_ip(), raft_port); braft::NodeOptions node_options; std::string actual_peers = peers; if(actual_peers.empty()) { char str[INET_ADDRSTRLEN]; inet_ntop(AF_INET, &(addr.ip.s_addr), str, INET_ADDRSTRLEN); actual_peers = std::string(str) + ":" + std::to_string(raft_port) + ":" + std::to_string(api_port); } if(node_options.initial_conf.parse_from(actual_peers) != 0) { LOG(ERROR) << "Fail to parse peer configuration `" << peers << "`"; return -1; } node_options.election_timeout_ms = election_timeout_ms; node_options.fsm = this; node_options.node_owns_fsm = false; node_options.snapshot_interval_s = snapshot_interval_s; std::string prefix = "local://" + raft_dir; node_options.log_uri = prefix + "/" + log_dir_name; node_options.raft_meta_uri = prefix + "/" + meta_dir_name; node_options.snapshot_uri = prefix + "/" + snapshot_dir_name; node_options.disable_cli = true; // api_port is used as the node identifier braft::Node* node = new braft::Node("default_group", braft::PeerId(addr, api_port)); std::string snapshot_dir = raft_dir + "/" + snapshot_dir_name; bool snapshot_exists = dir_enum_count(snapshot_dir) > 0; if(snapshot_exists) { // we will be assured of on_snapshot() firing and we will wait for that to init_db() } else if(!create_init_db_snapshot) { // `create_init_db_snapshot` will be handled only after leader starts, otherwise: LOG(INFO) << "Snapshot does not exist. 
void ReplicationState::write(http_req* request, http_res* response) {
    if (!is_leader()) {
        if(node->leader_id().is_empty()) {
            // Handle no leader scenario
            LOG(ERROR) << "Rejecting write: could not find a leader.";
            response->send_500("Could not find a leader.");
            auto replication_arg = new AsyncIndexArg{request, response, nullptr};
            replication_arg->req->route_index = static_cast<int>(ROUTE_CODES::RETURN_EARLY);
            return message_dispatcher->send_message(REPLICATION_MSG, replication_arg);
        }

        const std::string & leader_addr = node->leader_id().to_string();
        //LOG(INFO) << "Redirecting write to leader at: " << leader_addr;

        thread_pool->enqueue([leader_addr, request, response, this]() {
            auto raw_req = request->_req;
            std::string scheme = std::string(raw_req->scheme->name.base, raw_req->scheme->name.len);
            std::vector<std::string> addr_parts;
            StringUtils::split(leader_addr, addr_parts, ":");
            std::string leader_host_port = addr_parts[0] + ":" + addr_parts[2];
            const std::string & path = std::string(raw_req->path.base, raw_req->path.len);
            std::string url = scheme + "://" + leader_host_port + path;

            if(request->http_method == "POST") {
                std::string api_res;
                long status = HttpClient::post_response(url, request->body, api_res);
                response->send(status, api_res);
            } else if(request->http_method == "DELETE") {
                std::string api_res;
                long status = HttpClient::delete_response(url, api_res);
                response->send(status, api_res);
            } else {
                const std::string& err = "Forwarding for http method not implemented: " + request->http_method;
                LOG(ERROR) << err;
                response->send_500(err);
            }

            auto replication_arg = new AsyncIndexArg{request, response, nullptr};
            replication_arg->req->route_index = static_cast<int>(ROUTE_CODES::RETURN_EARLY);
            message_dispatcher->send_message(REPLICATION_MSG, replication_arg);
        });

        return ;
    }

    // Serialize request to replicated WAL so that all the peers in the group receive it as well.
    // NOTE: actual write must be done only in the `on_apply` method to maintain consistency.

    butil::IOBufBuilder bufBuilder;
    bufBuilder << request->serialize();
    butil::IOBuf log = bufBuilder.buf();

    // Apply this log as a braft::Task

    braft::Task task;
    task.data = &log;
    // This callback would be invoked when the task actually executes or fails
    task.done = new ReplicationClosure(request, response);
    // To avoid the ABA problem
    task.expected_term = leader_term.load(butil::memory_order_relaxed);

    // Now the task is applied to the group, waiting for the result.
    return node->apply(task);
}

void ReplicationState::read(http_res* response) {
    // NOT USED:
    // For consistency, reads to followers could be rejected.
    // Currently, we don't implement reads via raft.
}
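// Replication flow, as implemented by write() above and on_apply() below:
// 1. The leader serializes the incoming HTTP request into the replicated log as a braft::Task.
// 2. Once the entry is committed by the group, on_apply() runs on every node: the node that
//    proposed the entry recovers the original request/response objects from its
//    ReplicationClosure, while other nodes re-parse the request from the log data.
// 3. The actual write is then handed to the HTTP server thread via REPLICATION_MSG, and
//    on_apply() blocks on a future until that async flow finishes.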
void ReplicationState::on_apply(braft::Iterator& iter) {
    // A batch of tasks is committed, which must be processed through |iter|
    for (; iter.valid(); iter.next()) {
        http_res* response;
        http_req* request;

        // Guard invokes iter.done()->Run() asynchronously to avoid the callback blocking the main thread
        braft::AsyncClosureGuard closure_guard(iter.done());

        if (iter.done()) {
            // This task was applied by this node: get the values from the closure to avoid additional parsing.
            ReplicationClosure* c = dynamic_cast<ReplicationClosure*>(iter.done());
            response = c->get_response();
            request = c->get_request();
        } else {
            // Parse request from the log
            response = new http_res;
            http_req* remote_request = new http_req;
            remote_request->deserialize(iter.data().to_string());
            remote_request->_req = nullptr;  // indicates a remote request
            request = remote_request;
        }

        if(request->_req == nullptr && request->body == "INIT_SNAPSHOT") {
            // We attempt to trigger a cold snapshot against an existing stand-alone DB for backward compatibility
            InitSnapshotClosure* init_snapshot_closure = new InitSnapshotClosure(this);
            node->snapshot(init_snapshot_closure);
            delete request;
            delete response;
            continue ;
        }

        // Now that the log has been parsed, perform the actual operation:
        // call the http server thread for the write and respond back to the client (if `response` is NOT null).
        // We use a future to block the current thread until the async flow finishes.
        std::promise<bool> promise;
        std::future<bool> future = promise.get_future();
        auto replication_arg = new AsyncIndexArg{request, response, &promise};
        message_dispatcher->send_message(REPLICATION_MSG, replication_arg);
        future.get();
    }
}

void* ReplicationState::save_snapshot(void* arg) {
    SnapshotArg* sa = static_cast<SnapshotArg*>(arg);
    std::unique_ptr<SnapshotArg> arg_guard(sa);
    brpc::ClosureGuard done_guard(sa->done);

    std::string snapshot_path = sa->writer->get_path() + "/" + db_snapshot_name;

    rocksdb::Checkpoint* checkpoint = nullptr;
    rocksdb::Status status = rocksdb::Checkpoint::Create(sa->db, &checkpoint);

    if(!status.ok()) {
        LOG(WARNING) << "Checkpoint Create failed, msg:" << status.ToString();
        return nullptr;
    }

    std::unique_ptr<rocksdb::Checkpoint> checkpoint_guard(checkpoint);

    status = checkpoint->CreateCheckpoint(snapshot_path);

    if(!status.ok()) {
        LOG(WARNING) << "Checkpoint CreateCheckpoint failed, msg:" << status.ToString();
        return nullptr;
    }

    butil::FileEnumerator dir_enum(butil::FilePath(snapshot_path), false, butil::FileEnumerator::FILES);
    for (butil::FilePath name = dir_enum.Next(); !name.empty(); name = dir_enum.Next()) {
        std::string file_name = std::string(db_snapshot_name) + "/" + name.BaseName().value();
        if (sa->writer->add_file(file_name) != 0) {
            sa->done->status().set_error(EIO, "Fail to add file to writer.");
            return nullptr;
        }
    }

    return nullptr;
}

void ReplicationState::on_snapshot_save(braft::SnapshotWriter* writer, braft::Closure* done) {
    // Start a new bthread to avoid blocking the StateMachine, since writing data to disk could be slow
    SnapshotArg* arg = new SnapshotArg;
    arg->db = store->_get_db_unsafe();
    arg->writer = writer;
    arg->done = done;
    bthread_t tid;
    bthread_start_urgent(&tid, NULL, save_snapshot, arg);
}

int ReplicationState::init_db() {
    if (!butil::CreateDirectory(butil::FilePath(store->get_state_dir_path()))) {
        LOG(WARNING) << "CreateDirectory " << store->get_state_dir_path() << " failed";
        return -1;
    }

    const rocksdb::Status& status = store->init_db();
    if (!status.ok()) {
        LOG(WARNING) << "Open DB " << store->get_state_dir_path() << " failed, msg: " << status.ToString();
        return -1;
    }

    LOG(NOTICE) << "DB open success!";
    LOG(INFO) << "Loading collections from disk...";

    Option<bool> init_op = CollectionManager::get_instance().load();

    if(init_op.ok()) {
        LOG(INFO) << "Finished loading collections from disk.";
    } else {
        LOG(ERROR) << "Typesense failed to start. "
                   << "Could not load collections from disk: " << init_op.error();
        return 1;
    }

    init_readiness_count++;
    return 0;
}
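// Snapshot installation on a follower, handled by on_snapshot_load() below: the on-disk
// RocksDB state directory is removed and replaced with the contents of the snapshot's
// `db_snapshot_name` sub-directory (hard-linked where possible, copied otherwise), after
// which the DB and collections are re-initialized via init_db().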
int ReplicationState::on_snapshot_load(braft::SnapshotReader* reader) {
    CHECK(!is_leader()) << "Leader is not supposed to load snapshot";

    LOG(INFO) << "on_snapshot_load";

    // Load snapshot from reader, replacing the running StateMachine
    reset_db();
    if (!butil::DeleteFile(butil::FilePath(store->get_state_dir_path()), true)) {
        LOG(WARNING) << "rm " << store->get_state_dir_path() << " failed";
        return -1;
    }

    LOG(TRACE) << "rm " << store->get_state_dir_path() << " success";

    std::string snapshot_path = reader->get_path();
    snapshot_path.append(std::string("/") + db_snapshot_name);

    // tries to use a link if possible, or else copies
    if (!copy_dir(snapshot_path, store->get_state_dir_path())) {
        LOG(WARNING) << "copy snapshot " << snapshot_path << " to " << store->get_state_dir_path() << " failed";
        return -1;
    }

    LOG(TRACE) << "copy snapshot " << snapshot_path << " to " << store->get_state_dir_path() << " success";

    return init_db();
}

void ReplicationState::refresh_peers(const std::string & peers) {
    if(node && is_leader()) {
        LOG(INFO) << "Refreshing peer config";
        braft::Configuration conf;
        conf.parse_from(peers);
        RefreshPeersClosure* refresh_peers_done = new RefreshPeersClosure;
        node->change_peers(conf, refresh_peers_done);
    }
}

ReplicationState::ReplicationState(Store *store, ThreadPool* thread_pool, http_message_dispatcher *message_dispatcher,
                                   bool create_init_db_snapshot):
        node(nullptr), leader_term(-1), store(store), thread_pool(thread_pool),
        message_dispatcher(message_dispatcher), init_readiness_count(0),
        create_init_db_snapshot(create_init_db_snapshot) {

}

void ReplicationState::reset_db() {
    store->close();
}

size_t ReplicationState::get_init_readiness_count() const {
    return init_readiness_count.load();
}

void InitSnapshotClosure::Run() {
    // Auto delete `this` after Run()
    std::unique_ptr<InitSnapshotClosure> self_guard(this);

    if(status().ok()) {
        LOG(INFO) << "Init snapshot succeeded!";
        replication_state->reset_db();
        replication_state->init_db();
    } else {
        LOG(ERROR) << "Init snapshot failed, error: " << status().error_str();
    }
}
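// Note on the cold-start path: a committed log entry whose body is "INIT_SNAPSHOT" (see
// on_apply above) triggers node->snapshot() with an InitSnapshotClosure, which re-opens the
// DB and reloads collections once the snapshot succeeds. This is used to snapshot an existing
// stand-alone DB for backward compatibility, presumably driven by the `create_init_db_snapshot`
// flag that, per the comment in start(), is handled only after the leader starts.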