/*
 * ClusterRecovery.actor.cpp
 *
 * This source file is part of the FoundationDB open source project
 *
 * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "fdbrpc/sim_validation.h"
#include "fdbserver/ApplyMetadataMutation.h"
#include "fdbserver/BackupProgress.actor.h"
#include "fdbserver/ClusterRecovery.actor.h"
#include "fdbserver/MasterInterface.h"
#include "fdbserver/WaitFailure.h"
#include "flow/ProtocolVersion.h"

#include "flow/actorcompiler.h" // This must be the last #include.

static std::set<int> const& normalClusterRecoveryErrors() {
    static std::set<int> s;
    if (s.empty()) {
        s.insert(error_code_operation_failed);
        s.insert(error_code_tlog_stopped);
        s.insert(error_code_tlog_failed);
        s.insert(error_code_commit_proxy_failed);
        s.insert(error_code_grv_proxy_failed);
        s.insert(error_code_resolver_failed);
        s.insert(error_code_backup_worker_failed);
        s.insert(error_code_recruitment_failed);
        s.insert(error_code_no_more_servers);
        s.insert(error_code_cluster_recovery_failed);
        s.insert(error_code_coordinated_state_conflict);
        s.insert(error_code_master_max_versions_in_flight);
        s.insert(error_code_worker_removed);
        s.insert(error_code_new_coordinators_timed_out);
        s.insert(error_code_broken_promise);
    }
    return s;
}

ACTOR Future<Void> recoveryTerminateOnConflict(UID dbgid,
                                               Promise<Void> fullyRecovered,
                                               Future<Void> onConflict,
                                               Future<Void> switchedState) {
    choose {
        when(wait(onConflict)) {
            if (!fullyRecovered.isSet()) {
                TraceEvent("RecoveryTerminated", dbgid).detail("Reason", "Conflict");
                TEST(true); // Coordinated state conflict, recovery terminating
                throw worker_removed();
            }
            return Void();
        }
        when(wait(switchedState)) { return Void(); }
    }
}

ACTOR Future<Void> recruitNewMaster(ClusterControllerData* cluster,
                                    ClusterControllerData::DBInfo* db,
                                    MasterInterface* newMaster) {
    state Future<ErrorOr<MasterInterface>> fNewMaster;
    state WorkerFitnessInfo masterWorker;

    loop {
        // We must recruit the master in the same data center as the cluster controller.
        // This should always be possible, because we can recruit the master on the same process as the cluster
        // controller.
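        // Note: id_used tracks how many roles are already assigned to each candidate process, so that
        // getWorkerForRoleInDatacenter() can spread recruitment across workers; pre-counting the cluster
        // controller's own process biases selection away from co-locating the master with the cluster
        // controller when better candidates exist.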
        std::map<Optional<Standalone<StringRef>>, int> id_used;
        id_used[cluster->clusterControllerProcessId]++;
        masterWorker = cluster->getWorkerForRoleInDatacenter(
            cluster->clusterControllerDcId, ProcessClass::Master, ProcessClass::NeverAssign, db->config, id_used);
        if ((masterWorker.worker.processClass.machineClassFitness(ProcessClass::Master) >
                 SERVER_KNOBS->EXPECTED_MASTER_FITNESS ||
             masterWorker.worker.interf.locality.processId() == cluster->clusterControllerProcessId) &&
            !cluster->goodRecruitmentTime.isReady()) {
            TraceEvent("RecruitNewMaster", cluster->id)
                .detail("Fitness", masterWorker.worker.processClass.machineClassFitness(ProcessClass::Master));
            wait(delay(SERVER_KNOBS->ATTEMPT_RECRUITMENT_DELAY));
            continue;
        }
        RecruitMasterRequest rmq;
        rmq.lifetime = db->serverInfo->get().masterLifetime;
        rmq.forceRecovery = db->forceRecovery;

        cluster->masterProcessId = masterWorker.worker.interf.locality.processId();
        cluster->db.unfinishedRecoveries++;
        fNewMaster = masterWorker.worker.interf.master.tryGetReply(rmq);
        wait(ready(fNewMaster) || db->forceMasterFailure.onTrigger());
        if (fNewMaster.isReady() && fNewMaster.get().present()) {
            TraceEvent("RecruitNewMaster", cluster->id).detail("Recruited", fNewMaster.get().get().id());

            // for status tool
            TraceEvent("RecruitedMasterWorker", cluster->id)
                .detail("Address", fNewMaster.get().get().address())
                .trackLatest(cluster->recruitedMasterWorkerEventHolder->trackingKey);

            *newMaster = fNewMaster.get().get();
            return Void();
        } else {
            TEST(true); // clusterWatchDatabase() !newMaster.present()
            wait(delay(SERVER_KNOBS->MASTER_SPIN_DELAY));
        }
    }
}

ACTOR Future<Void> clusterRecruitFromConfiguration(ClusterControllerData* self, Reference<RecruitWorkersInfo> req) {
    // At the moment this doesn't really need to be an actor (it always completes immediately)
    TEST(true); // ClusterController RecruitTLogsRequest
    loop {
        try {
            req->rep = self->findWorkersForConfiguration(req->req);
            return Void();
        } catch (Error& e) {
            if (e.code() == error_code_no_more_servers && self->goodRecruitmentTime.isReady()) {
                self->outstandingRecruitmentRequests.push_back(req);
                TraceEvent(SevWarn, "RecruitFromConfigurationNotAvailable", self->id).error(e);
                wait(req->waitForCompletion.onTrigger());
                return Void();
            } else if (e.code() == error_code_operation_failed || e.code() == error_code_no_more_servers) {
                // recruitment not good enough, try again
                TraceEvent("RecruitFromConfigurationRetry", self->id)
                    .error(e)
                    .detail("GoodRecruitmentTimeReady", self->goodRecruitmentTime.isReady());
                while (!self->goodRecruitmentTime.isReady()) {
                    wait(lowPriorityDelay(SERVER_KNOBS->ATTEMPT_RECRUITMENT_DELAY));
                }
            } else {
                TraceEvent(SevError, "RecruitFromConfigurationError", self->id).error(e);
                throw;
            }
        }
        wait(lowPriorityDelay(SERVER_KNOBS->ATTEMPT_RECRUITMENT_DELAY));
    }
}

ACTOR Future<RecruitRemoteFromConfigurationReply> clusterRecruitRemoteFromConfiguration(
    ClusterControllerData* self,
    Reference<RecruitRemoteWorkersInfo> req) {
    // At the moment this doesn't really need to be an actor (it always completes immediately)
    TEST(true); // ClusterController RecruitTLogsRequest Remote
    loop {
        try {
            auto rep = self->findRemoteWorkersForConfiguration(req->req);
            return rep;
        } catch (Error& e) {
            if (e.code() == error_code_no_more_servers && self->goodRemoteRecruitmentTime.isReady()) {
                self->outstandingRemoteRecruitmentRequests.push_back(req);
                TraceEvent(SevWarn, "RecruitRemoteFromConfigurationNotAvailable", self->id).error(e);
                wait(req->waitForCompletion.onTrigger());
                return req->rep;
            } else if (e.code() == error_code_operation_failed || e.code() == error_code_no_more_servers) {
                // recruitment not good enough, try again
TraceEvent("RecruitRemoteFromConfigurationRetry", self->id) .error(e) .detail("GoodRecruitmentTimeReady", self->goodRemoteRecruitmentTime.isReady()); while (!self->goodRemoteRecruitmentTime.isReady()) { wait(lowPriorityDelay(SERVER_KNOBS->ATTEMPT_RECRUITMENT_DELAY)); } } else { TraceEvent(SevError, "RecruitRemoteFromConfigurationError", self->id).error(e); throw; } } wait(lowPriorityDelay(SERVER_KNOBS->ATTEMPT_RECRUITMENT_DELAY)); } } ACTOR Future newCommitProxies(Reference self, RecruitFromConfigurationReply recr) { std::vector> initializationReplies; for (int i = 0; i < recr.commitProxies.size(); i++) { InitializeCommitProxyRequest req; req.master = self->masterInterface; req.masterLifetime = self->masterLifetime; req.recoveryCount = self->cstate.myDBState.recoveryCount + 1; req.recoveryTransactionVersion = self->recoveryTransactionVersion; req.firstProxy = i == 0; TraceEvent("CommitProxyReplies", self->dbgid) .detail("WorkerID", recr.commitProxies[i].id()) .detail("ReocoveryTxnVersion", self->recoveryTransactionVersion) .detail("FirstProxy", req.firstProxy ? "True" : "False"); initializationReplies.push_back( transformErrors(throwErrorOr(recr.commitProxies[i].commitProxy.getReplyUnlessFailedFor( req, SERVER_KNOBS->TLOG_TIMEOUT, SERVER_KNOBS->MASTER_FAILURE_SLOPE_DURING_RECOVERY)), commit_proxy_failed())); } std::vector newRecruits = wait(getAll(initializationReplies)); // It is required for the correctness of COMMIT_ON_FIRST_PROXY that self->commitProxies[0] is the firstCommitProxy. self->commitProxies = newRecruits; return Void(); } ACTOR Future newGrvProxies(Reference self, RecruitFromConfigurationReply recr) { std::vector> initializationReplies; for (int i = 0; i < recr.grvProxies.size(); i++) { InitializeGrvProxyRequest req; req.master = self->masterInterface; req.masterLifetime = self->masterLifetime; req.recoveryCount = self->cstate.myDBState.recoveryCount + 1; TraceEvent("GrvProxyReplies", self->dbgid).detail("WorkerID", recr.grvProxies[i].id()); initializationReplies.push_back( transformErrors(throwErrorOr(recr.grvProxies[i].grvProxy.getReplyUnlessFailedFor( req, SERVER_KNOBS->TLOG_TIMEOUT, SERVER_KNOBS->MASTER_FAILURE_SLOPE_DURING_RECOVERY)), grv_proxy_failed())); } std::vector newRecruits = wait(getAll(initializationReplies)); self->grvProxies = newRecruits; return Void(); } ACTOR Future newResolvers(Reference self, RecruitFromConfigurationReply recr) { std::vector> initializationReplies; for (int i = 0; i < recr.resolvers.size(); i++) { InitializeResolverRequest req; req.masterLifetime = self->masterLifetime; req.recoveryCount = self->cstate.myDBState.recoveryCount + 1; req.commitProxyCount = recr.commitProxies.size(); req.resolverCount = recr.resolvers.size(); TraceEvent("ResolverReplies", self->dbgid).detail("WorkerID", recr.resolvers[i].id()); initializationReplies.push_back( transformErrors(throwErrorOr(recr.resolvers[i].resolver.getReplyUnlessFailedFor( req, SERVER_KNOBS->TLOG_TIMEOUT, SERVER_KNOBS->MASTER_FAILURE_SLOPE_DURING_RECOVERY)), resolver_failed())); } std::vector newRecruits = wait(getAll(initializationReplies)); self->resolvers = newRecruits; return Void(); } ACTOR Future newTLogServers(Reference self, RecruitFromConfigurationReply recr, Reference oldLogSystem, std::vector>* initialConfChanges) { if (self->configuration.usableRegions > 1) { state Optional remoteDcId = self->remoteDcIds.size() ? 
ACTOR Future<Void> newTLogServers(Reference<ClusterRecoveryData> self,
                                  RecruitFromConfigurationReply recr,
                                  Reference<ILogSystem> oldLogSystem,
                                  std::vector<Standalone<CommitTransactionRef>>* initialConfChanges) {
    if (self->configuration.usableRegions > 1) {
        state Optional<Key> remoteDcId = self->remoteDcIds.size() ? self->remoteDcIds[0] : Optional<Key>();
        if (!self->dcId_locality.count(recr.dcId)) {
            int8_t loc = self->getNextLocality();
            Standalone<CommitTransactionRef> tr;
            tr.set(tr.arena(), tagLocalityListKeyFor(recr.dcId), tagLocalityListValue(loc));
            initialConfChanges->push_back(tr);
            self->dcId_locality[recr.dcId] = loc;
            TraceEvent(SevWarn, "UnknownPrimaryDCID", self->dbgid).detail("PrimaryId", recr.dcId).detail("Loc", loc);
        }

        if (!self->dcId_locality.count(remoteDcId)) {
            int8_t loc = self->getNextLocality();
            Standalone<CommitTransactionRef> tr;
            tr.set(tr.arena(), tagLocalityListKeyFor(remoteDcId), tagLocalityListValue(loc));
            initialConfChanges->push_back(tr);
            self->dcId_locality[remoteDcId] = loc;
            TraceEvent(SevWarn, "UnknownRemoteDCID", self->dbgid).detail("RemoteId", remoteDcId).detail("Loc", loc);
        }

        std::vector<UID> exclusionWorkerIds;
        std::transform(recr.tLogs.begin(),
                       recr.tLogs.end(),
                       std::back_inserter(exclusionWorkerIds),
                       [](const WorkerInterface& in) { return in.id(); });
        std::transform(recr.satelliteTLogs.begin(),
                       recr.satelliteTLogs.end(),
                       std::back_inserter(exclusionWorkerIds),
                       [](const WorkerInterface& in) { return in.id(); });

        RecruitRemoteFromConfigurationRequest remoteRecruitReq(
            self->configuration,
            remoteDcId,
            recr.tLogs.size() *
                std::max<int>(1, self->configuration.desiredLogRouterCount / std::max<int>(1, recr.tLogs.size())),
            exclusionWorkerIds);
        remoteRecruitReq.dbgId = self->dbgid;
        state Reference<RecruitRemoteWorkersInfo> recruitWorkersInfo =
            makeReference<RecruitRemoteWorkersInfo>(remoteRecruitReq);
        recruitWorkersInfo->dbgId = self->dbgid;
        Future<RecruitRemoteFromConfigurationReply> fRemoteWorkers =
            clusterRecruitRemoteFromConfiguration(self->controllerData, recruitWorkersInfo);

        self->primaryLocality = self->dcId_locality[recr.dcId];
        self->logSystem = Reference<ILogSystem>(); // Cancels the actors in the previous log system.
        Reference<ILogSystem> newLogSystem = wait(oldLogSystem->newEpoch(recr,
                                                                        fRemoteWorkers,
                                                                        self->clusterId,
                                                                        self->configuration,
                                                                        self->cstate.myDBState.recoveryCount + 1,
                                                                        self->primaryLocality,
                                                                        self->dcId_locality[remoteDcId],
                                                                        self->allTags,
                                                                        self->recruitmentStalled));
        self->logSystem = newLogSystem;
    } else {
        self->primaryLocality = tagLocalitySpecial;
        self->logSystem = Reference<ILogSystem>(); // Cancels the actors in the previous log system.
        Reference<ILogSystem> newLogSystem = wait(oldLogSystem->newEpoch(recr,
                                                                        Never(),
                                                                        self->clusterId,
                                                                        self->configuration,
                                                                        self->cstate.myDBState.recoveryCount + 1,
                                                                        self->primaryLocality,
                                                                        tagLocalitySpecial,
                                                                        self->allTags,
                                                                        self->recruitmentStalled));
        self->logSystem = newLogSystem;
    }
    return Void();
}

ACTOR Future<Void> newSeedServers(Reference<ClusterRecoveryData> self,
                                  RecruitFromConfigurationReply recruits,
                                  std::vector<StorageServerInterface>* servers) {
    // This is only necessary if the database is at version 0
    servers->clear();
    if (self->lastEpochEnd)
        return Void();

    state int idx = 0;
    state std::map<Optional<Value>, Tag> dcId_tags;
    state int8_t nextLocality = 0;
    while (idx < recruits.storageServers.size()) {
        TraceEvent(getRecoveryEventName(ClusterRecoveryEventType::CLUSTER_RECOVERY_SS_RECRUITMENT_EVENT_NAME).c_str(),
                   self->dbgid)
            .detail("CandidateWorker", recruits.storageServers[idx].locality.toString());

        InitializeStorageRequest isr;
        isr.seedTag = dcId_tags.count(recruits.storageServers[idx].locality.dcId())
                          ? dcId_tags[recruits.storageServers[idx].locality.dcId()]
                          : Tag(nextLocality, 0);
        isr.storeType = self->configuration.storageServerStoreType;
        isr.reqId = deterministicRandom()->randomUniqueID();
        isr.interfaceId = deterministicRandom()->randomUniqueID();
        isr.clusterId = self->clusterId;
        isr.initialClusterVersion = self->recoveryTransactionVersion;

        ErrorOr<InitializeStorageReply> newServer = wait(recruits.storageServers[idx].storage.tryGetReply(isr));

        if (newServer.isError()) {
            if (!newServer.isError(error_code_recruitment_failed) &&
                !newServer.isError(error_code_request_maybe_delivered))
                throw newServer.getError();

            TEST(true); // initial storage recruitment loop failed to get new server
            wait(delay(SERVER_KNOBS->STORAGE_RECRUITMENT_DELAY));
        } else {
            if (!dcId_tags.count(recruits.storageServers[idx].locality.dcId())) {
                dcId_tags[recruits.storageServers[idx].locality.dcId()] = Tag(nextLocality, 0);
                nextLocality++;
            }

            Tag& tag = dcId_tags[recruits.storageServers[idx].locality.dcId()];
            tag.id++;
            idx++;

            servers->push_back(newServer.get().interf);
        }
    }

    self->dcId_locality.clear();
    for (auto& it : dcId_tags) {
        self->dcId_locality[it.first] = it.second.locality;
    }

    TraceEvent("ClusterRecoveryRecruitedInitialStorageServers", self->dbgid)
        .detail("TargetCount", self->configuration.storageTeamSize)
        .detail("Servers", describe(*servers));

    return Void();
}

Future<Void> waitCommitProxyFailure(std::vector<CommitProxyInterface> const& commitProxies) {
    std::vector<Future<Void>> failed;
    failed.reserve(commitProxies.size());
    for (auto commitProxy : commitProxies) {
        failed.push_back(waitFailureClient(commitProxy.waitFailure,
                                           SERVER_KNOBS->TLOG_TIMEOUT,
                                           -SERVER_KNOBS->TLOG_TIMEOUT / SERVER_KNOBS->SECONDS_BEFORE_NO_FAILURE_DELAY,
                                           /*trace=*/true));
    }
    ASSERT(failed.size() >= 1);
    return tagError<Void>(quorum(failed, 1), commit_proxy_failed());
}

Future<Void> waitGrvProxyFailure(std::vector<GrvProxyInterface> const& grvProxies) {
    std::vector<Future<Void>> failed;
    failed.reserve(grvProxies.size());
    for (int i = 0; i < grvProxies.size(); i++)
        failed.push_back(waitFailureClient(grvProxies[i].waitFailure,
                                           SERVER_KNOBS->TLOG_TIMEOUT,
                                           -SERVER_KNOBS->TLOG_TIMEOUT / SERVER_KNOBS->SECONDS_BEFORE_NO_FAILURE_DELAY,
                                           /*trace=*/true));
    ASSERT(failed.size() >= 1);
    return tagError<Void>(quorum(failed, 1), grv_proxy_failed());
}

Future<Void> waitResolverFailure(std::vector<ResolverInterface> const& resolvers) {
    std::vector<Future<Void>> failed;
    failed.reserve(resolvers.size());
    for (auto resolver : resolvers) {
        failed.push_back(waitFailureClient(resolver.waitFailure,
                                           SERVER_KNOBS->TLOG_TIMEOUT,
                                           -SERVER_KNOBS->TLOG_TIMEOUT / SERVER_KNOBS->SECONDS_BEFORE_NO_FAILURE_DELAY,
                                           /*trace=*/true));
    }
    ASSERT(failed.size() >= 1);
    return tagError<Void>(quorum(failed, 1), resolver_failed());
}

ACTOR Future<Void> rejoinRequestHandler(Reference<ClusterRecoveryData> self) {
    loop {
        TLogRejoinRequest req = waitNext(self->clusterController.tlogRejoin.getFuture());
        TraceEvent(SevDebug, "TLogRejoinRequestHandler")
            .detail("MasterLifeTime", self->dbInfo->get().masterLifetime.toString());
        req.reply.send(true);
    }
}

// Keeps the coordinated state (cstate) updated as the set of recruited tlogs change through recovery.
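// As the log system converges, this actor also walks recoveryState forward: ALL_LOGS_RECRUITED once the expected
// number of log sets exists, STORAGE_RECOVERED once no old tlog generations remain, and FULLY_RECOVERED once both
// hold, at which point the final coordinated-state write drops the old generations for good.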
ACTOR Future<Void> trackTlogRecovery(Reference<ClusterRecoveryData> self,
                                     Reference<AsyncVar<Reference<ILogSystem>>> oldLogSystems,
                                     Future<Void> minRecoveryDuration) {
    state Future<Void> rejoinRequests = Never();
    state DBRecoveryCount recoverCount = self->cstate.myDBState.recoveryCount + 1;
    state DatabaseConfiguration configuration =
        self->configuration; // self-configuration can be changed by configurationMonitor so we need a copy
    loop {
        state DBCoreState newState;
        self->logSystem->toCoreState(newState);
        newState.recoveryCount = recoverCount;
        state Future<Void> changed = self->logSystem->onCoreStateChanged();

        ASSERT(newState.tLogs[0].tLogWriteAntiQuorum == configuration.tLogWriteAntiQuorum &&
               newState.tLogs[0].tLogReplicationFactor == configuration.tLogReplicationFactor);

        state bool allLogs =
            newState.tLogs.size() ==
            configuration.expectedLogSets(self->primaryDcId.size() ? self->primaryDcId[0] : Optional<Key>());
        state bool finalUpdate = !newState.oldTLogData.size() && allLogs;
        TraceEvent("TrackTlogRecovery")
            .detail("FinalUpdate", finalUpdate)
            .detail("NewState.tlogs", newState.tLogs.size())
            .detail("NewState.OldTLogs", newState.oldTLogData.size())
            .detail("Expected.tlogs",
                    configuration.expectedLogSets(self->primaryDcId.size() ? self->primaryDcId[0] : Optional<Key>()));
        wait(self->cstate.write(newState, finalUpdate));
        if (self->cstateUpdated.canBeSet()) {
            self->cstateUpdated.send(Void());
        }

        wait(minRecoveryDuration);
        self->logSystem->coreStateWritten(newState);

        if (self->recoveryReadyForCommits.canBeSet()) {
            self->recoveryReadyForCommits.send(Void());
        }

        if (finalUpdate) {
            self->recoveryState = RecoveryState::FULLY_RECOVERED;
            TraceEvent(getRecoveryEventName(ClusterRecoveryEventType::CLUSTER_RECOVERY_STATE_EVENT_NAME).c_str(),
                       self->dbgid)
                .detail("StatusCode", RecoveryStatus::fully_recovered)
                .detail("Status", RecoveryStatus::names[RecoveryStatus::fully_recovered])
                .detail("ClusterId", self->clusterId)
                .trackLatest(self->clusterRecoveryStateEventHolder->trackingKey);

            TraceEvent(getRecoveryEventName(ClusterRecoveryEventType::CLUSTER_RECOVERY_GENERATION_EVENT_NAME).c_str(),
                       self->dbgid)
                .detail("ActiveGenerations", 1)
                .trackLatest(self->clusterRecoveryGenerationsEventHolder->trackingKey);
        } else if (!newState.oldTLogData.size() && self->recoveryState < RecoveryState::STORAGE_RECOVERED) {
            self->recoveryState = RecoveryState::STORAGE_RECOVERED;
            TraceEvent(getRecoveryEventName(ClusterRecoveryEventType::CLUSTER_RECOVERY_STATE_EVENT_NAME).c_str(),
                       self->dbgid)
                .detail("StatusCode", RecoveryStatus::storage_recovered)
                .detail("Status", RecoveryStatus::names[RecoveryStatus::storage_recovered])
                .trackLatest(self->clusterRecoveryStateEventHolder->trackingKey);
        } else if (allLogs && self->recoveryState < RecoveryState::ALL_LOGS_RECRUITED) {
            self->recoveryState = RecoveryState::ALL_LOGS_RECRUITED;
            TraceEvent(getRecoveryEventName(ClusterRecoveryEventType::CLUSTER_RECOVERY_STATE_EVENT_NAME).c_str(),
                       self->dbgid)
                .detail("StatusCode", RecoveryStatus::all_logs_recruited)
                .detail("Status", RecoveryStatus::names[RecoveryStatus::all_logs_recruited])
                .trackLatest(self->clusterRecoveryStateEventHolder->trackingKey);
        }

        if (newState.oldTLogData.size() && configuration.repopulateRegionAntiQuorum > 0 &&
            self->logSystem->remoteStorageRecovered()) {
            TraceEvent(SevWarnAlways, "RecruitmentStalled_RemoteStorageRecovered", self->dbgid).log();
            self->recruitmentStalled->set(true);
        }
        self->registrationTrigger.trigger();

        if (finalUpdate) {
            oldLogSystems->get()->stopRejoins();
            rejoinRequests = rejoinRequestHandler(self);
            return Void();
        }

        wait(changed);
    }
}

ACTOR Future<Void> changeCoordinators(Reference<ClusterRecoveryData> self) {
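    // Handles a coordinator-change request: let any in-flight coordinated-state write finish, write the current
    // DBState one last time if recovery never fully completed, then move the coordinated state to the new
    // connection string. shouldCommitSuicide is set up front so the cluster controller restarts and re-registers
    // against the new coordinators.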
    loop {
        ChangeCoordinatorsRequest req = waitNext(self->clusterController.changeCoordinators.getFuture());
        TraceEvent("ChangeCoordinators", self->dbgid).log();
        ++self->changeCoordinatorsRequests;
        state ChangeCoordinatorsRequest changeCoordinatorsRequest = req;

        // Kill cluster controller to facilitate coordinator registration update
        if (self->controllerData->shouldCommitSuicide) {
            throw restart_cluster_controller();
        }
        self->controllerData->shouldCommitSuicide = true;

        while (!self->cstate.previousWrite.isReady()) {
            wait(self->cstate.previousWrite);
            wait(delay(0)); // if a new core state is ready to be written, have that take priority over our finalizing
                            // write;
        }

        if (!self->cstate.fullyRecovered.isSet()) {
            wait(self->cstate.write(self->cstate.myDBState, true));
        }

        try {
            ClusterConnectionString conn(changeCoordinatorsRequest.newConnectionString.toString());
            wait(self->cstate.move(conn));
        } catch (Error& e) {
            if (e.code() != error_code_actor_cancelled)
                changeCoordinatorsRequest.reply.sendError(e);

            throw;
        }

        throw internal_error();
    }
}

ACTOR Future<Void> configurationMonitor(Reference<ClusterRecoveryData> self, Database cx) {
    loop {
        state ReadYourWritesTransaction tr(cx);

        loop {
            try {
                tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
                RangeResult results = wait(tr.getRange(configKeys, CLIENT_KNOBS->TOO_MANY));
                ASSERT(!results.more && results.size() < CLIENT_KNOBS->TOO_MANY);

                DatabaseConfiguration conf;
                conf.fromKeyValues((VectorRef<KeyValueRef>)results);
                TraceEvent("ConfigurationMonitor", self->dbgid)
                    .detail(getRecoveryEventName(ClusterRecoveryEventType::CLUSTER_RECOVERY_STATE_EVENT_NAME).c_str(),
                            self->recoveryState);
                if (conf != self->configuration) {
                    if (self->recoveryState != RecoveryState::ALL_LOGS_RECRUITED &&
                        self->recoveryState != RecoveryState::FULLY_RECOVERED) {
                        self->controllerData->shouldCommitSuicide = true;
                        throw restart_cluster_controller();
                    }

                    self->configuration = conf;
                    self->registrationTrigger.trigger();
                }

                state Future<Void> watchFuture =
                    tr.watch(moveKeysLockOwnerKey) || tr.watch(excludedServersVersionKey) ||
                    tr.watch(failedServersVersionKey) || tr.watch(excludedLocalityVersionKey) ||
                    tr.watch(failedLocalityVersionKey);
                wait(tr.commit());
                wait(watchFuture);
                break;
            } catch (Error& e) {
                wait(tr.onError(e));
            }
        }
    }
}

ACTOR static Future<Optional<Version>> getMinBackupVersion(Reference<ClusterRecoveryData> self, Database cx) {
    loop {
        state ReadYourWritesTransaction tr(cx);
        try {
            tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
            tr.setOption(FDBTransactionOptions::LOCK_AWARE);
            Optional<Value> value = wait(tr.get(backupStartedKey));
            Optional<Version> minVersion;
            if (value.present()) {
                auto uidVersions = decodeBackupStartedValue(value.get());
                TraceEvent e("GotBackupStartKey", self->dbgid);
                int i = 1;
                for (auto [uid, version] : uidVersions) {
                    e.detail(format("BackupID%d", i), uid).detail(format("Version%d", i), version);
                    i++;
                    minVersion = minVersion.present() ? std::min(version, minVersion.get()) : version;
                }
            } else {
                TraceEvent("EmptyBackupStartKey", self->dbgid).log();
            }
            return minVersion;

        } catch (Error& e) {
            wait(tr.onError(e));
        }
    }
}

ACTOR static Future<Void> recruitBackupWorkers(Reference<ClusterRecoveryData> self, Database cx) {
    ASSERT(self->backupWorkers.size() > 0);

    // Avoid race between a backup worker's save progress and the reads below.
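    // (The delay gives existing backup workers a window to persist their progress, so the getBackupProgress() read
    // below is more likely to observe it and we avoid recruiting redundant workers for already-covered ranges.)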
    wait(delay(SERVER_KNOBS->SECONDS_BEFORE_RECRUIT_BACKUP_WORKER));

    state LogEpoch epoch = self->cstate.myDBState.recoveryCount;
    state Reference<BackupProgress> backupProgress(
        new BackupProgress(self->dbgid, self->logSystem->getOldEpochTagsVersionsInfo()));
    state Future<Void> gotProgress = getBackupProgress(cx, self->dbgid, backupProgress, /*logging=*/true);
    state std::vector<Future<InitializeBackupReply>> initializationReplies;

    state std::vector<std::pair<UID, Tag>> idsTags; // worker IDs and tags for current epoch
    state int logRouterTags = self->logSystem->getLogRouterTags();
    idsTags.reserve(logRouterTags);
    for (int i = 0; i < logRouterTags; i++) {
        idsTags.emplace_back(deterministicRandom()->randomUniqueID(), Tag(tagLocalityLogRouter, i));
    }

    const Version startVersion = self->logSystem->getBackupStartVersion();
    state int i = 0;
    for (; i < logRouterTags; i++) {
        const auto& worker = self->backupWorkers[i % self->backupWorkers.size()];
        InitializeBackupRequest req(idsTags[i].first);
        req.recruitedEpoch = epoch;
        req.backupEpoch = epoch;
        req.routerTag = idsTags[i].second;
        req.totalTags = logRouterTags;
        req.startVersion = startVersion;
        TraceEvent("BackupRecruitment", self->dbgid)
            .detail("RequestID", req.reqId)
            .detail("Tag", req.routerTag.toString())
            .detail("Epoch", epoch)
            .detail("BackupEpoch", epoch)
            .detail("StartVersion", req.startVersion);
        initializationReplies.push_back(
            transformErrors(throwErrorOr(worker.backup.getReplyUnlessFailedFor(
                                req, SERVER_KNOBS->BACKUP_TIMEOUT, SERVER_KNOBS->MASTER_FAILURE_SLOPE_DURING_RECOVERY)),
                            backup_worker_failed()));
    }

    state Future<Optional<Version>> fMinVersion = getMinBackupVersion(self, cx);
    wait(gotProgress && success(fMinVersion));
    TraceEvent("MinBackupVersion", self->dbgid).detail("Version", fMinVersion.get().present() ? fMinVersion.get() : -1);

    std::map<std::tuple<LogEpoch, Version, int>, std::map<Tag, Version>> toRecruit =
        backupProgress->getUnfinishedBackup();
    for (const auto& [epochVersionTags, tagVersions] : toRecruit) {
        const Version oldEpochEnd = std::get<1>(epochVersionTags);
        if (!fMinVersion.get().present() || fMinVersion.get().get() + 1 >= oldEpochEnd) {
            TraceEvent("SkipBackupRecruitment", self->dbgid)
                .detail("MinVersion", fMinVersion.get().present() ? fMinVersion.get() : -1)
                .detail("Epoch", epoch)
                .detail("OldEpoch", std::get<0>(epochVersionTags))
                .detail("OldEpochEnd", oldEpochEnd);
            continue;
        }
        for (const auto& [tag, version] : tagVersions) {
            const auto& worker = self->backupWorkers[i % self->backupWorkers.size()];
            i++;
            InitializeBackupRequest req(deterministicRandom()->randomUniqueID());
            req.recruitedEpoch = epoch;
            req.backupEpoch = std::get<0>(epochVersionTags);
            req.routerTag = tag;
            req.totalTags = std::get<2>(epochVersionTags);
            req.startVersion = version; // savedVersion + 1
            req.endVersion = std::get<1>(epochVersionTags) - 1;
            TraceEvent("BackupRecruitment", self->dbgid)
                .detail("RequestID", req.reqId)
                .detail("Tag", req.routerTag.toString())
                .detail("Epoch", epoch)
                .detail("BackupEpoch", req.backupEpoch)
                .detail("StartVersion", req.startVersion)
                .detail("EndVersion", req.endVersion.get());
            initializationReplies.push_back(transformErrors(
                throwErrorOr(worker.backup.getReplyUnlessFailedFor(
                    req, SERVER_KNOBS->BACKUP_TIMEOUT, SERVER_KNOBS->MASTER_FAILURE_SLOPE_DURING_RECOVERY)),
                backup_worker_failed()));
        }
    }

    std::vector<InitializeBackupReply> newRecruits = wait(getAll(initializationReplies));
    self->logSystem->setBackupWorkers(newRecruits);
    TraceEvent("BackupRecruitmentDone", self->dbgid).log();
    self->registrationTrigger.trigger();
    return Void();
}

ACTOR Future<Void> updateLogsValue(Reference<ClusterRecoveryData> self, Database cx) {
    state Transaction tr(cx);
    loop {
        try {
            Optional<Standalone<StringRef>> value = wait(tr.get(logsKey));
            ASSERT(value.present());
            auto logs = decodeLogsValue(value.get());

            std::set<UID> logIds;
            for (auto& log : logs.first) {
                logIds.insert(log.first);
            }

            bool found = false;
            for (auto& logSet : self->logSystem->getLogSystemConfig().tLogs) {
                for (auto& log : logSet.tLogs) {
                    if (logIds.count(log.id())) {
                        found = true;
                        break;
                    }
                }
                if (found) {
                    break;
                }
            }

            if (!found) {
                TEST(true); // old master attempted to change logsKey
                return Void();
            }

            tr.set(logsKey, self->logSystem->getLogsValue());
            wait(tr.commit());
            return Void();
        } catch (Error& e) {
            wait(tr.onError(e));
        }
    }
}

// TODO(ahusain): ClusterController orchestrating recovery, self message can be avoided.
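// Packages this recovery's current view of the transaction subsystem (log system configuration, proxies, resolvers,
// recovery count) into a RegisterMasterRequest, which the cluster controller folds into the ServerDBInfo it
// broadcasts to workers.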
Future<Void> sendMasterRegistration(ClusterRecoveryData* self,
                                    LogSystemConfig const& logSystemConfig,
                                    std::vector<CommitProxyInterface> commitProxies,
                                    std::vector<GrvProxyInterface> grvProxies,
                                    std::vector<ResolverInterface> resolvers,
                                    DBRecoveryCount recoveryCount,
                                    std::vector<UID> priorCommittedLogServers) {
    RegisterMasterRequest masterReq;
    masterReq.id = self->masterInterface.id();
    masterReq.mi = self->masterInterface.locality;
    masterReq.logSystemConfig = logSystemConfig;
    masterReq.commitProxies = commitProxies;
    masterReq.grvProxies = grvProxies;
    masterReq.resolvers = resolvers;
    masterReq.recoveryCount = recoveryCount;
    if (self->hasConfiguration)
        masterReq.configuration = self->configuration;
    masterReq.registrationCount = ++self->registrationCount;
    masterReq.priorCommittedLogServers = priorCommittedLogServers;
    masterReq.recoveryState = self->recoveryState;
    masterReq.recoveryStalled = self->recruitmentStalled->get();
    masterReq.clusterId = self->clusterId;
    return brokenPromiseToNever(self->clusterController.registerMaster.getReply(masterReq));
}

ACTOR Future<Void> updateRegistration(Reference<ClusterRecoveryData> self, Reference<ILogSystem> logSystem) {
    state Database cx = openDBOnServer(self->dbInfo, TaskPriority::DefaultEndpoint, LockAware::True);
    state Future<Void> trigger = self->registrationTrigger.onTrigger();
    state Future<Void> updateLogsKey;

    loop {
        wait(trigger);
        wait(delay(.001)); // Coalesce multiple changes

        trigger = self->registrationTrigger.onTrigger();

        auto logSystemConfig = logSystem->getLogSystemConfig();
        TraceEvent("UpdateRegistration", self->dbgid)
            .detail("RecoveryCount", self->cstate.myDBState.recoveryCount)
            .detail("OldestBackupEpoch", logSystemConfig.oldestBackupEpoch)
            .detail("Logs", describe(logSystemConfig.tLogs))
            .detail("CStateUpdated", self->cstateUpdated.isSet())
            .detail("RecoveryTxnVersion", self->recoveryTransactionVersion)
            .detail("LastEpochEnd", self->lastEpochEnd);

        if (!self->cstateUpdated.isSet()) {
            wait(sendMasterRegistration(self.getPtr(),
                                        logSystemConfig,
                                        self->provisionalCommitProxies,
                                        self->provisionalGrvProxies,
                                        self->resolvers,
                                        self->cstate.myDBState.recoveryCount,
                                        self->cstate.prevDBState.getPriorCommittedLogServers()));
        } else if (self->recoveryState >= RecoveryState::ACCEPTING_COMMITS) {
            updateLogsKey = updateLogsValue(self, cx);
            wait(sendMasterRegistration(self.getPtr(),
                                        logSystemConfig,
                                        self->commitProxies,
                                        self->grvProxies,
                                        self->resolvers,
                                        self->cstate.myDBState.recoveryCount,
                                        std::vector<UID>()));
        } else {
            // The cluster should enter the accepting commits phase soon, and then we will register again
            TEST(true); // cstate is updated but we aren't accepting commits yet
        }
    }
}

ACTOR Future<Standalone<CommitTransactionRef>> provisionalMaster(Reference<ClusterRecoveryData> parent,
                                                                 Future<Void> activate) {
    wait(activate);

    // Register a fake commit proxy (to be provided right here) to make ourselves available to clients
    parent->provisionalCommitProxies = std::vector<CommitProxyInterface>(1);
    parent->provisionalCommitProxies[0].provisional = true;
    parent->provisionalCommitProxies[0].initEndpoints();
    parent->provisionalGrvProxies = std::vector<GrvProxyInterface>(1);
    parent->provisionalGrvProxies[0].provisional = true;
    parent->provisionalGrvProxies[0].initEndpoints();
    state Future<Void> waitCommitProxyFailure =
        waitFailureServer(parent->provisionalCommitProxies[0].waitFailure.getFuture());
    state Future<Void> waitGrvProxyFailure =
        waitFailureServer(parent->provisionalGrvProxies[0].waitFailure.getFuture());
    parent->registrationTrigger.trigger();

    auto lockedKey = parent->txnStateStore->readValue(databaseLockedKey).get();
    state bool locked = lockedKey.present() && lockedKey.get().size();

    state Optional<Value> metadataVersion = parent->txnStateStore->readValue(metadataVersionKey).get();
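    // lockedKey and metadataVersion are read synchronously from the recovered txnStateStore, so the provisional GRV
    // replies below report a lock state and metadata version consistent with lastEpochEnd.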
    // We respond to a minimal subset of the commit proxy protocol. Our sole purpose is to receive a single write-only
    // transaction which might repair our configuration, and return it.
    loop choose {
        when(GetReadVersionRequest req =
                 waitNext(parent->provisionalGrvProxies[0].getConsistentReadVersion.getFuture())) {
            if ((req.flags & GetReadVersionRequest::FLAG_CAUSAL_READ_RISKY) &&
                (req.flags & GetReadVersionRequest::FLAG_USE_PROVISIONAL_PROXIES) && parent->lastEpochEnd) {
                GetReadVersionReply rep;
                rep.version = parent->lastEpochEnd;
                rep.locked = locked;
                rep.metadataVersion = metadataVersion;
                rep.proxyId = parent->provisionalGrvProxies[0].id();
                req.reply.send(rep);
            } else
                req.reply.send(Never()); // We can't perform causally consistent reads without recovering
        }
        when(CommitTransactionRequest req = waitNext(parent->provisionalCommitProxies[0].commit.getFuture())) {
            req.reply.send(Never()); // don't reply (clients always get commit_unknown_result)
            auto t = &req.transaction;
            if (t->read_snapshot == parent->lastEpochEnd && //< So no transactions can fall between the read snapshot
                                                            // and the recovery transaction this (might) be merged with
                // vvv and also the changes we will make in the recovery
                // transaction (most notably to lastEpochEndKey) BEFORE we
                // merge initialConfChanges won't conflict
                !std::any_of(t->read_conflict_ranges.begin(), t->read_conflict_ranges.end(), [](KeyRangeRef const& r) {
                    return r.contains(lastEpochEndKey);
                })) {
                for (auto m = t->mutations.begin(); m != t->mutations.end(); ++m) {
                    TraceEvent("PM_CTM", parent->dbgid)
                        .detail("MType", m->type)
                        .detail("Param1", m->param1)
                        .detail("Param2", m->param2);
                    // An emergency transaction is only meant to make a configuration change
                    if (parent->configuration.involveMutation(*m)) {
                        // We keep the mutations and write conflict ranges from this transaction, but not its read
                        // conflict ranges
                        Standalone<CommitTransactionRef> out;
                        out.read_snapshot = invalidVersion;
                        out.mutations.append_deep(out.arena(), t->mutations.begin(), t->mutations.size());
                        out.write_conflict_ranges.append_deep(
                            out.arena(), t->write_conflict_ranges.begin(), t->write_conflict_ranges.size());
                        return out;
                    }
                }
            }
        }
        when(GetKeyServerLocationsRequest req =
                 waitNext(parent->provisionalCommitProxies[0].getKeyServersLocations.getFuture())) {
            req.reply.send(Never());
        }
        when(wait(waitCommitProxyFailure)) { throw worker_removed(); }
        when(wait(waitGrvProxyFailure)) { throw worker_removed(); }
    }
}

ACTOR Future<std::vector<Standalone<CommitTransactionRef>>> recruitEverything(
    Reference<ClusterRecoveryData> self,
    std::vector<StorageServerInterface>* seedServers,
    Reference<ILogSystem> oldLogSystem) {
    if (!self->configuration.isValid()) {
        RecoveryStatus::RecoveryStatus status;
        if (self->configuration.initialized) {
            TraceEvent(
                SevWarn,
                getRecoveryEventName(ClusterRecoveryEventType::CLUSTER_RECOVERY_INVALID_CONFIG_EVENT_NAME).c_str(),
                self->dbgid)
                .setMaxEventLength(11000)
                .setMaxFieldLength(10000)
                .detail("Conf", self->configuration.toString());
            status = RecoveryStatus::configuration_invalid;
        } else if (!self->cstate.prevDBState.tLogs.size()) {
            status = RecoveryStatus::configuration_never_created;
            self->neverCreated = true;
        } else {
            status = RecoveryStatus::configuration_missing;
        }
        TraceEvent(getRecoveryEventName(ClusterRecoveryEventType::CLUSTER_RECOVERY_STATE_EVENT_NAME).c_str(),
                   self->dbgid)
            .detail("StatusCode", status)
            .detail("Status", RecoveryStatus::names[status])
            .trackLatest(self->clusterRecoveryStateEventHolder->trackingKey);
        return Never();
    } else
        TraceEvent(getRecoveryEventName(ClusterRecoveryEventType::CLUSTER_RECOVERY_STATE_EVENT_NAME).c_str(),
                   self->dbgid)
            .detail("StatusCode", RecoveryStatus::recruiting_transaction_servers)
            .detail("Status", RecoveryStatus::names[RecoveryStatus::recruiting_transaction_servers])
            .detail("Conf", self->configuration.toString())
            .detail("RequiredCommitProxies", 1)
            .detail("RequiredGrvProxies", 1)
            .detail("RequiredResolvers", 1)
            .trackLatest(self->clusterRecoveryStateEventHolder->trackingKey);

    // FIXME: we only need log routers for the same locality as the master
    int maxLogRouters = self->cstate.prevDBState.logRouterTags;
    for (auto& old : self->cstate.prevDBState.oldTLogData) {
        maxLogRouters = std::max(maxLogRouters, old.logRouterTags);
    }

    RecruitFromConfigurationRequest recruitReq(self->configuration, self->lastEpochEnd == 0, maxLogRouters);
    state Reference<RecruitWorkersInfo> recruitWorkersInfo = makeReference<RecruitWorkersInfo>(recruitReq);
    recruitWorkersInfo->dbgId = self->dbgid;
    wait(clusterRecruitFromConfiguration(self->controllerData, recruitWorkersInfo));
    state RecruitFromConfigurationReply recruits = recruitWorkersInfo->rep;

    std::string primaryDcIds, remoteDcIds;

    self->primaryDcId.clear();
    self->remoteDcIds.clear();
    if (recruits.dcId.present()) {
        self->primaryDcId.push_back(recruits.dcId);
        if (!primaryDcIds.empty()) {
            primaryDcIds += ',';
        }
        primaryDcIds += printable(recruits.dcId);
        if (self->configuration.regions.size() > 1) {
            Key remoteDcId = recruits.dcId.get() == self->configuration.regions[0].dcId
                                 ? self->configuration.regions[1].dcId
                                 : self->configuration.regions[0].dcId;
            self->remoteDcIds.push_back(remoteDcId);
            if (!remoteDcIds.empty()) {
                remoteDcIds += ',';
            }
            remoteDcIds += printable(remoteDcId);
        }
    }
    self->backupWorkers.swap(recruits.backupWorkers);

    TraceEvent(getRecoveryEventName(ClusterRecoveryEventType::CLUSTER_RECOVERY_STATE_EVENT_NAME).c_str(), self->dbgid)
        .detail("StatusCode", RecoveryStatus::initializing_transaction_servers)
        .detail("Status", RecoveryStatus::names[RecoveryStatus::initializing_transaction_servers])
        .detail("CommitProxies", recruits.commitProxies.size())
        .detail("GrvProxies", recruits.grvProxies.size())
        .detail("TLogs", recruits.tLogs.size())
        .detail("Resolvers", recruits.resolvers.size())
        .detail("SatelliteTLogs", recruits.satelliteTLogs.size())
        .detail("OldLogRouters", recruits.oldLogRouters.size())
        .detail("StorageServers", recruits.storageServers.size())
        .detail("BackupWorkers", self->backupWorkers.size())
        .detail("PrimaryDcIds", primaryDcIds)
        .detail("RemoteDcIds", remoteDcIds)
        .trackLatest(self->clusterRecoveryStateEventHolder->trackingKey);

    // Actually, newSeedServers does both the recruiting and initialization of the seed servers; so if this is a brand
    // new database we are sort of lying that we are past the recruitment phase. In a perfect world we would split that
    // up so that the recruitment part happens above (in parallel with recruiting the transaction servers?).
    wait(newSeedServers(self, recruits, seedServers));
    state std::vector<Standalone<CommitTransactionRef>> confChanges;
    wait(newCommitProxies(self, recruits) && newGrvProxies(self, recruits) && newResolvers(self, recruits) &&
         newTLogServers(self, recruits, oldLogSystem, &confChanges));

    // Update recovery related information to the newly elected sequencer (master) process.
    wait(brokenPromiseToNever(
        self->masterInterface.updateRecoveryData.getReply(UpdateRecoveryDataRequest(self->recoveryTransactionVersion,
                                                                                    self->lastEpochEnd,
                                                                                    self->commitProxies,
                                                                                    self->resolvers,
                                                                                    self->versionEpoch))));

    return confChanges;
}

ACTOR Future<Void> updateLocalityForDcId(Optional<Key> dcId,
                                         Reference<ILogSystem> oldLogSystem,
                                         Reference<AsyncVar<PeekTxsInfo>> locality) {
    loop {
        std::pair<int8_t, int8_t> loc = oldLogSystem->getLogSystemConfig().getLocalityForDcId(dcId);
        Version ver = locality->get().knownCommittedVersion;
        if (ver == invalidVersion) {
            ver = oldLogSystem->getKnownCommittedVersion();
        }
        if (SERVER_KNOBS->ENABLE_VERSION_VECTOR_TLOG_UNICAST) {
            // Do not try to split peeks between data centers in peekTxns() to recover mem kvstore.
            // This recovery optimization won't work in UNICAST mode.
            loc.first = -1;
        }
        locality->set(PeekTxsInfo(loc.first, loc.second, ver));
        TraceEvent("UpdatedLocalityForDcId")
            .detail("DcId", dcId)
            .detail("Locality0", loc.first)
            .detail("Locality1", loc.second)
            .detail("Version", ver);
        wait(oldLogSystem->onLogSystemConfigChange() || oldLogSystem->onKnownCommittedVersionChange());
    }
}

ACTOR Future<Void> readTransactionSystemState(Reference<ClusterRecoveryData> self,
                                              Reference<ILogSystem> oldLogSystem,
                                              Version txsPoppedVersion) {
    state Reference<AsyncVar<PeekTxsInfo>> myLocality = Reference<AsyncVar<PeekTxsInfo>>(
        new AsyncVar<PeekTxsInfo>(PeekTxsInfo(tagLocalityInvalid, tagLocalityInvalid, invalidVersion)));
    state Future<Void> localityUpdater =
        updateLocalityForDcId(self->masterInterface.locality.dcId(), oldLogSystem, myLocality);

    // Peek the txnStateTag in oldLogSystem and recover self->txnStateStore
    // For now, we also obtain the recovery metadata that the log system obtained during the end_epoch process for
    // comparison

    // Sets self->lastEpochEnd and self->recoveryTransactionVersion
    // Sets self->configuration to the configuration (FF/conf/ keys) at self->lastEpochEnd

    // Recover transaction state store
    if (self->txnStateStore)
        self->txnStateStore->close();
    self->txnStateLogAdapter = openDiskQueueAdapter(oldLogSystem, myLocality, txsPoppedVersion);
    self->txnStateStore =
        keyValueStoreLogSystem(self->txnStateLogAdapter, self->dbgid, self->memoryLimit, false, false, true);

    // Version 0 occurs at the version epoch. The version epoch is the number
    // of microseconds since the Unix epoch. It can be set through fdbcli.
    self->versionEpoch.reset();
    Optional<Standalone<StringRef>> versionEpochValue = wait(self->txnStateStore->readValue(versionEpochKey));
    if (versionEpochValue.present()) {
        self->versionEpoch = BinaryReader::fromStringRef<int64_t>(versionEpochValue.get(), Unversioned());
    }
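    // Next we choose the version at which the recovery transaction will commit: it must clear every version the
    // previous generation could have handed out (lastEpochEnd plus MAX_VERSIONS_IN_FLIGHT) and respect any
    // externally imposed floor, such as the minimum required commit version read below.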
    // Versionstamped operations (particularly those applied from DR) define a minimum commit version
    // that we may recover to, as they embed the version in user-readable data and require that no
    // transactions will be committed at a lower version.
    Optional<Standalone<StringRef>> requiredCommitVersion =
        wait(self->txnStateStore->readValue(minRequiredCommitVersionKey));

    Version minRequiredCommitVersion = -1;
    if (requiredCommitVersion.present()) {
        minRequiredCommitVersion = BinaryReader::fromStringRef<Version>(requiredCommitVersion.get(), Unversioned());
    }
    if (g_network->isSimulated() && self->versionEpoch.present()) {
        minRequiredCommitVersion = std::max(
            minRequiredCommitVersion,
            static_cast<Version>(g_network->timer() * SERVER_KNOBS->VERSIONS_PER_SECOND - self->versionEpoch.get()));
    }

    // Recover version info
    self->lastEpochEnd = oldLogSystem->getEnd() - 1;
    if (self->lastEpochEnd == 0) {
        self->recoveryTransactionVersion = 1;
    } else {
        if (self->forceRecovery) {
            self->recoveryTransactionVersion = self->lastEpochEnd + SERVER_KNOBS->MAX_VERSIONS_IN_FLIGHT_FORCED;
        } else {
            self->recoveryTransactionVersion = self->lastEpochEnd + SERVER_KNOBS->MAX_VERSIONS_IN_FLIGHT;
        }

        if (self->recoveryTransactionVersion < minRequiredCommitVersion)
            self->recoveryTransactionVersion = minRequiredCommitVersion;
    }
    if (BUGGIFY) {
        self->recoveryTransactionVersion += deterministicRandom()->randomInt64(0, 10000000);
    }

    TraceEvent(getRecoveryEventName(ClusterRecoveryEventType::CLUSTER_RECOVERY_RECOVERED_EVENT_NAME).c_str(),
               self->dbgid)
        .detail("LastEpochEnd", self->lastEpochEnd)
        .detail("RecoveryTransactionVersion", self->recoveryTransactionVersion);

    RangeResult rawConf = wait(self->txnStateStore->readRange(configKeys));
    self->configuration.fromKeyValues(rawConf.castTo<VectorRef<KeyValueRef>>());
    self->originalConfiguration = self->configuration;
    self->hasConfiguration = true;

    TraceEvent(getRecoveryEventName(ClusterRecoveryEventType::CLUSTER_RECOVERY_RECOVERED_EVENT_NAME).c_str(),
               self->dbgid)
        .setMaxEventLength(11000)
        .setMaxFieldLength(10000)
        .detail("Conf", self->configuration.toString())
        .trackLatest(self->recoveredConfigEventHolder->trackingKey);

    RangeResult rawLocalities = wait(self->txnStateStore->readRange(tagLocalityListKeys));
    self->dcId_locality.clear();
    for (auto& kv : rawLocalities) {
        self->dcId_locality[decodeTagLocalityListKey(kv.key)] = decodeTagLocalityListValue(kv.value);
    }

    RangeResult rawTags = wait(self->txnStateStore->readRange(serverTagKeys));
    self->allTags.clear();
    if (self->lastEpochEnd > 0) {
        self->allTags.push_back(cacheTag);
    }

    if (self->forceRecovery) {
        self->safeLocality = oldLogSystem->getLogSystemConfig().tLogs[0].locality;
        for (auto& kv : rawTags) {
            Tag tag = decodeServerTagValue(kv.value);
            if (tag.locality == self->safeLocality) {
                self->allTags.push_back(tag);
            }
        }
    } else {
        for (auto& kv : rawTags) {
            self->allTags.push_back(decodeServerTagValue(kv.value));
        }
    }

    RangeResult rawHistoryTags = wait(self->txnStateStore->readRange(serverTagHistoryKeys));
    for (auto& kv : rawHistoryTags) {
        self->allTags.push_back(decodeServerTagValue(kv.value));
    }
    uniquify(self->allTags);

    // auto kvs = self->txnStateStore->readRange( systemKeys );
    // for( auto & kv : kvs.get() )
    //	TraceEvent("ClusterRecoveredTXS", self->dbgid).detail("K", kv.key).detail("V", kv.value);

    self->txnStateLogAdapter->setNextVersion(
        oldLogSystem->getEnd()); //< FIXME: (1) the log adapter should do this automatically after recovery; (2) if we
                                 // make KeyValueStoreMemory guarantee immediate reads, we should be able to get rid of
                                 // the discardCommit() below and not need a writable log adapter

    TraceEvent("RTSSComplete", self->dbgid).log();

    return Void();
}

ACTOR Future<Void> sendInitialCommitToResolvers(Reference<ClusterRecoveryData> self) {
    state KeyRange txnKeys = allKeys;
    state Sequence txnSequence = 0;
    ASSERT(self->recoveryTransactionVersion);
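    // Stream the transaction state store to the commit proxies (and, when resolvers apply private mutations, to the
    // resolvers) in DESIRED_TOTAL_BYTES-sized chunks; MAX_TXS_SEND_MEMORY caps how much data may be in flight before
    // we block on outstanding acknowledgements.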
    state RangeResult data =
        self->txnStateStore
            ->readRange(txnKeys, BUGGIFY ? 3 : SERVER_KNOBS->DESIRED_TOTAL_BYTES, SERVER_KNOBS->DESIRED_TOTAL_BYTES)
            .get();
    state std::vector<Future<Void>> txnReplies;
    state int64_t dataOutstanding = 0;

    state std::vector<Endpoint> endpoints;
    for (auto& it : self->commitProxies) {
        endpoints.push_back(it.txnState.getEndpoint());
    }

    if (SERVER_KNOBS->PROXY_USE_RESOLVER_PRIVATE_MUTATIONS) {
        // Broadcasts transaction state store to resolvers.
        for (auto& it : self->resolvers) {
            endpoints.push_back(it.txnState.getEndpoint());
        }
    }

    loop {
        if (!data.size())
            break;
        ((KeyRangeRef&)txnKeys) = KeyRangeRef(keyAfter(data.back().key, txnKeys.arena()), txnKeys.end);
        RangeResult nextData =
            self->txnStateStore
                ->readRange(txnKeys, BUGGIFY ? 3 : SERVER_KNOBS->DESIRED_TOTAL_BYTES, SERVER_KNOBS->DESIRED_TOTAL_BYTES)
                .get();

        TxnStateRequest req;
        req.arena = data.arena();
        req.data = data;
        req.sequence = txnSequence;
        req.last = !nextData.size();
        req.broadcastInfo = endpoints;
        txnReplies.push_back(broadcastTxnRequest(req, SERVER_KNOBS->TXN_STATE_SEND_AMOUNT, false));
        dataOutstanding += SERVER_KNOBS->TXN_STATE_SEND_AMOUNT * data.arena().getSize();
        data = nextData;
        txnSequence++;

        if (dataOutstanding > SERVER_KNOBS->MAX_TXS_SEND_MEMORY) {
            wait(waitForAll(txnReplies));
            txnReplies = std::vector<Future<Void>>();
            dataOutstanding = 0;
        }

        wait(yield());
    }
    wait(waitForAll(txnReplies));
    TraceEvent("RecoveryInternal", self->dbgid)
        .detail("StatusCode", RecoveryStatus::recovery_transaction)
        .detail("Status", RecoveryStatus::names[RecoveryStatus::recovery_transaction])
        .detail("RecoveryTxnVersion", self->recoveryTransactionVersion)
        .detail("LastEpochEnd", self->lastEpochEnd)
        .detail("Step", "SentTxnStateStoreToCommitProxies");

    std::vector<Future<ResolveTransactionBatchReply>> replies;
    for (auto& r : self->resolvers) {
        ResolveTransactionBatchRequest req;
        req.prevVersion = -1;
        req.version = self->lastEpochEnd;
        req.lastReceivedVersion = -1;

        replies.push_back(brokenPromiseToNever(r.resolve.getReply(req)));
    }

    wait(waitForAll(replies));
    TraceEvent("RecoveryInternal", self->dbgid)
        .detail("StatusCode", RecoveryStatus::recovery_transaction)
        .detail("Status", RecoveryStatus::names[RecoveryStatus::recovery_transaction])
        .detail("RecoveryTxnVersion", self->recoveryTransactionVersion)
        .detail("LastEpochEnd", self->lastEpochEnd)
        .detail("Step", "InitializedAllResolvers");
    return Void();
}

ACTOR Future<Void> triggerUpdates(Reference<ClusterRecoveryData> self, Reference<ILogSystem> oldLogSystem) {
    loop {
        wait(oldLogSystem->onLogSystemConfigChange() || self->cstate.fullyRecovered.getFuture() ||
             self->recruitmentStalled->onChange());
        if (self->cstate.fullyRecovered.isSet())
            return Void();

        self->registrationTrigger.trigger();
    }
}

ACTOR Future<Void> discardCommit(IKeyValueStore* store, LogSystemDiskQueueAdapter* adapter) {
    state Future<LogSystemDiskQueueAdapter::CommitMessage> fcm = adapter->getCommitMessage();
    state Future<Void> committed = store->commit();
    LogSystemDiskQueueAdapter::CommitMessage cm = wait(fcm);
    ASSERT(!committed.isReady());
    cm.acknowledge.send(Void());
    ASSERT(committed.isReady());
    return Void();
}

void updateConfigForForcedRecovery(Reference<ClusterRecoveryData> self,
                                   std::vector<Standalone<CommitTransactionRef>>* initialConfChanges) {
    bool regionsChanged = false;
    for (auto& it : self->configuration.regions) {
        if (it.dcId == self->controllerData->clusterControllerDcId.get() && it.priority < 0) {
            it.priority = 1;
            regionsChanged = true;
        } else if (it.dcId != self->controllerData->clusterControllerDcId.get() && it.priority >= 0) {
            it.priority = -1;
            regionsChanged = true;
        }
    }
    Standalone<CommitTransactionRef> regionCommit;
    regionCommit.mutations.push_back_deep(
        regionCommit.arena(),
        MutationRef(MutationRef::SetValue,
                    configKeysPrefix.toString() + "usable_regions",
LiteralStringRef("1"))); self->configuration.applyMutation(regionCommit.mutations.back()); if (regionsChanged) { std::sort( self->configuration.regions.begin(), self->configuration.regions.end(), RegionInfo::sort_by_priority()); StatusObject regionJSON; regionJSON["regions"] = self->configuration.getRegionJSON(); regionCommit.mutations.push_back_deep( regionCommit.arena(), MutationRef(MutationRef::SetValue, configKeysPrefix.toString() + "regions", BinaryWriter::toValue(regionJSON, IncludeVersion(ProtocolVersion::withRegionConfiguration())) .toString())); self->configuration.applyMutation( regionCommit.mutations.back()); // modifying the configuration directly does not change the configuration // when it is re-serialized unless we call applyMutation TraceEvent("ForcedRecoveryConfigChange", self->dbgid) .setMaxEventLength(11000) .setMaxFieldLength(10000) .detail("Conf", self->configuration.toString()); } initialConfChanges->push_back(regionCommit); } ACTOR Future recoverFrom(Reference self, Reference oldLogSystem, std::vector* seedServers, std::vector>* initialConfChanges, Future poppedTxsVersion, bool* clusterIdExists) { TraceEvent(getRecoveryEventName(ClusterRecoveryEventType::CLUSTER_RECOVERY_STATE_EVENT_NAME).c_str(), self->dbgid) .detail("StatusCode", RecoveryStatus::reading_transaction_system_state) .detail("Status", RecoveryStatus::names[RecoveryStatus::reading_transaction_system_state]) .trackLatest(self->clusterRecoveryStateEventHolder->trackingKey); self->hasConfiguration = false; if (BUGGIFY) wait(delay(10.0)); Version txsPoppedVersion = wait(poppedTxsVersion); wait(readTransactionSystemState(self, oldLogSystem, txsPoppedVersion)); for (auto& itr : *initialConfChanges) { for (auto& m : itr.mutations) { self->configuration.applyMutation(m); } } if (self->forceRecovery) { updateConfigForForcedRecovery(self, initialConfChanges); } debug_checkMaxRestoredVersion(UID(), self->lastEpochEnd, "DBRecovery"); // Generate a cluster ID to uniquely identify the cluster if it doesn't // already exist in the txnStateStore. Optional clusterId = self->txnStateStore->readValue(clusterIdKey).get(); *clusterIdExists = clusterId.present(); if (!clusterId.present()) { self->clusterId = deterministicRandom()->randomUniqueID(); } else { self->clusterId = BinaryReader::fromStringRef(clusterId.get(), Unversioned()); } // Ordinarily we pass through this loop once and recover. We go around the loop if recovery stalls for more than a // second, a provisional master is initialized, and an "emergency transaction" is submitted that might change the // configuration so that we can finish recovery. 
    state std::map<Optional<Value>, int8_t> originalLocalityMap = self->dcId_locality;
    state Future<std::vector<Standalone<CommitTransactionRef>>> recruitments =
        recruitEverything(self, seedServers, oldLogSystem);
    state double provisionalDelay = SERVER_KNOBS->PROVISIONAL_START_DELAY;
    loop {
        state Future<Standalone<CommitTransactionRef>> provisional = provisionalMaster(self, delay(provisionalDelay));
        provisionalDelay =
            std::min(SERVER_KNOBS->PROVISIONAL_MAX_DELAY, provisionalDelay * SERVER_KNOBS->PROVISIONAL_DELAY_GROWTH);
        choose {
            when(std::vector<Standalone<CommitTransactionRef>> confChanges = wait(recruitments)) {
                initialConfChanges->insert(initialConfChanges->end(), confChanges.begin(), confChanges.end());
                provisional.cancel();
                break;
            }
            when(Standalone<CommitTransactionRef> _req = wait(provisional)) {
                state Standalone<CommitTransactionRef> req = _req; // mutable
                TEST(true); // Emergency transaction processing during recovery
                TraceEvent("EmergencyTransaction", self->dbgid).log();
                for (auto m = req.mutations.begin(); m != req.mutations.end(); ++m)
                    TraceEvent("EmergencyTransactionMutation", self->dbgid)
                        .detail("MType", m->type)
                        .detail("P1", m->param1)
                        .detail("P2", m->param2);

                DatabaseConfiguration oldConf = self->configuration;
                self->configuration = self->originalConfiguration;
                for (auto& m : req.mutations)
                    self->configuration.applyMutation(m);

                initialConfChanges->clear();
                if (self->originalConfiguration.isValid() &&
                    self->configuration.usableRegions != self->originalConfiguration.usableRegions) {
                    TraceEvent(SevWarnAlways, "CannotChangeUsableRegions", self->dbgid).log();
                    self->configuration = self->originalConfiguration;
                } else {
                    initialConfChanges->push_back(req);
                }
                if (self->forceRecovery) {
                    updateConfigForForcedRecovery(self, initialConfChanges);
                }

                if (self->configuration != oldConf) { // confChange does not trigger when including servers
                    self->dcId_locality = originalLocalityMap;
                    recruitments = recruitEverything(self, seedServers, oldLogSystem);
                }
            }
        }
        provisional.cancel();
    }

    return Void();
}

ACTOR Future<Void> clusterRecoveryCore(Reference<ClusterRecoveryData> self) {
    state TraceInterval recoveryInterval("ClusterRecovery");
    state double recoverStartTime = now();

    self->addActor.send(waitFailureServer(self->masterInterface.waitFailure.getFuture()));

    TraceEvent(recoveryInterval.begin(), self->dbgid).log();

    self->recoveryState = RecoveryState::READING_CSTATE;
    TraceEvent(getRecoveryEventName(ClusterRecoveryEventType::CLUSTER_RECOVERY_STATE_EVENT_NAME).c_str(), self->dbgid)
        .detail("StatusCode", RecoveryStatus::reading_coordinated_state)
        .detail("Status", RecoveryStatus::names[RecoveryStatus::reading_coordinated_state])
        .trackLatest(self->clusterRecoveryStateEventHolder->trackingKey);

    wait(self->cstate.read());

    if (self->cstate.prevDBState.lowestCompatibleProtocolVersion > currentProtocolVersion) {
        TraceEvent(SevWarnAlways, "IncompatibleProtocolVersion", self->dbgid).log();
        throw internal_error();
    }

    self->recoveryState = RecoveryState::LOCKING_CSTATE;
    TraceEvent(getRecoveryEventName(ClusterRecoveryEventType::CLUSTER_RECOVERY_STATE_EVENT_NAME).c_str(), self->dbgid)
        .detail("StatusCode", RecoveryStatus::locking_coordinated_state)
        .detail("Status", RecoveryStatus::names[RecoveryStatus::locking_coordinated_state])
        .detail("TLogs", self->cstate.prevDBState.tLogs.size())
        .detail("ActiveGenerations", self->cstate.myDBState.oldTLogData.size() + 1)
        .detail("MyRecoveryCount", self->cstate.prevDBState.recoveryCount + 2)
        .detail("ForceRecovery", self->forceRecovery)
        .trackLatest(self->clusterRecoveryStateEventHolder->trackingKey);
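    // Locking here amounts to writing our incremented recoveryCount into the coordinated state: a previous master
    // generation that later tries to write the coordinated state hits a conflict and terminates (see
    // recoveryTerminateOnConflict() above), while recoverAndEndEpoch (below) stops the old generation's tlogs.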
    // for (const auto& old : self->cstate.prevDBState.oldTLogData) {
    //	TraceEvent("BWReadCoreState", self->dbgid).detail("Epoch", old.epoch).detail("Version", old.epochEnd);
    //}

    TraceEvent(getRecoveryEventName(ClusterRecoveryEventType::CLUSTER_RECOVERY_GENERATION_EVENT_NAME).c_str(),
               self->dbgid)
        .detail("ActiveGenerations", self->cstate.myDBState.oldTLogData.size() + 1)
        .trackLatest(self->clusterRecoveryGenerationsEventHolder->trackingKey);

    if (self->cstate.myDBState.oldTLogData.size() > CLIENT_KNOBS->MAX_GENERATIONS_OVERRIDE) {
        if (self->cstate.myDBState.oldTLogData.size() >= CLIENT_KNOBS->MAX_GENERATIONS) {
            TraceEvent(SevError, "RecoveryStoppedTooManyOldGenerations")
                .detail("OldGenerations", self->cstate.myDBState.oldTLogData.size())
                .detail("Reason",
                        "Recovery stopped because too many recoveries have happened since the last time the cluster "
                        "was fully_recovered. Set --knob-max-generations-override on your server processes to a value "
                        "larger than OldGenerations to resume recovery once the underlying problem has been fixed.");
            wait(Future<Void>(Never()));
        } else if (self->cstate.myDBState.oldTLogData.size() > CLIENT_KNOBS->RECOVERY_DELAY_START_GENERATION) {
            TraceEvent(SevError, "RecoveryDelayedTooManyOldGenerations")
                .detail("OldGenerations", self->cstate.myDBState.oldTLogData.size())
                .detail("Reason",
                        "Recovery is delayed because too many recoveries have happened since the last time the cluster "
                        "was fully_recovered. Set --knob-max-generations-override on your server processes to a value "
                        "larger than OldGenerations to resume recovery once the underlying problem has been fixed.");
            wait(delay(CLIENT_KNOBS->RECOVERY_DELAY_SECONDS_PER_GENERATION *
                       (self->cstate.myDBState.oldTLogData.size() - CLIENT_KNOBS->RECOVERY_DELAY_START_GENERATION)));
        }
        if (g_network->isSimulated() && self->cstate.myDBState.oldTLogData.size() > CLIENT_KNOBS->MAX_GENERATIONS_SIM) {
            g_simulator.connectionFailuresDisableDuration = 1e6;
            g_simulator.speedUpSimulation = true;
            TraceEvent(SevWarnAlways, "DisableConnectionFailures_TooManyGenerations").log();
        }
    }

    state Reference<AsyncVar<Reference<ILogSystem>>> oldLogSystems(new AsyncVar<Reference<ILogSystem>>);
    state Future<Void> recoverAndEndEpoch =
        ILogSystem::recoverAndEndEpoch(oldLogSystems,
                                       self->dbgid,
                                       self->cstate.prevDBState,
                                       self->clusterController.tlogRejoin.getFuture(),
                                       self->controllerData->db.serverInfo->get().myLocality,
                                       std::addressof(self->forceRecovery));

    DBCoreState newState = self->cstate.myDBState;
    newState.recoveryCount++;
    if (self->cstate.prevDBState.newestProtocolVersion.isInvalid() ||
        self->cstate.prevDBState.newestProtocolVersion < currentProtocolVersion) {
        ASSERT(self->cstate.myDBState.lowestCompatibleProtocolVersion.isInvalid() ||
               !self->cstate.myDBState.newestProtocolVersion.isInvalid());
        newState.newestProtocolVersion = currentProtocolVersion;
        newState.lowestCompatibleProtocolVersion = minCompatibleProtocolVersion;
    }
    wait(self->cstate.write(newState) || recoverAndEndEpoch);

    TraceEvent("ProtocolVersionCompatibilityChecked", self->dbgid)
        .detail("NewestProtocolVersion", self->cstate.myDBState.newestProtocolVersion)
        .detail("LowestCompatibleProtocolVersion", self->cstate.myDBState.lowestCompatibleProtocolVersion)
        .trackLatest(self->swVersionCheckedEventHolder->trackingKey);

    self->recoveryState = RecoveryState::RECRUITING;

    state std::vector<StorageServerInterface> seedServers;
    state std::vector<Standalone<CommitTransactionRef>> initialConfChanges;
    state Future<Void> logChanges;
    state Future<Void> minRecoveryDuration;
    state Future<Version> poppedTxsVersion;
    state bool clusterIdExists = false;

    loop {
        Reference<ILogSystem> oldLogSystem = oldLogSystems->get();
        if (oldLogSystem) {
            logChanges = triggerUpdates(self, oldLogSystem);
            if (!minRecoveryDuration.isValid()) {
                minRecoveryDuration = delay(SERVER_KNOBS->ENFORCED_MIN_RECOVERY_DURATION);
                poppedTxsVersion = oldLogSystem->getTxsPoppedVersion();
            }
        }
        state Future<Void> reg = oldLogSystem ? updateRegistration(self, oldLogSystem) : Never();
        self->registrationTrigger.trigger();

        choose {
            when(wait(oldLogSystem ? recoverFrom(self,
                                                 oldLogSystem,
                                                 &seedServers,
                                                 &initialConfChanges,
                                                 poppedTxsVersion,
                                                 std::addressof(clusterIdExists))
                                   : Never())) {
                reg.cancel();
                break;
            }
            when(wait(oldLogSystems->onChange())) {}
            when(wait(reg)) { throw internal_error(); }
            when(wait(recoverAndEndEpoch)) { throw internal_error(); }
        }
    }

    if (self->neverCreated) {
        recoverStartTime = now();
    }

    recoverAndEndEpoch.cancel();

    ASSERT_LE(self->commitProxies.size(), self->configuration.getDesiredCommitProxies());
    ASSERT_GE(self->commitProxies.size(), 1);
    ASSERT_LE(self->grvProxies.size(), self->configuration.getDesiredGrvProxies());
    ASSERT_GE(self->grvProxies.size(), 1);
    ASSERT_LE(self->resolvers.size(), self->configuration.getDesiredResolvers());
    ASSERT_GE(self->resolvers.size(), 1);

    self->recoveryState = RecoveryState::RECOVERY_TRANSACTION;
    TraceEvent(getRecoveryEventName(ClusterRecoveryEventType::CLUSTER_RECOVERY_STATE_EVENT_NAME).c_str(), self->dbgid)
        .detail("StatusCode", RecoveryStatus::recovery_transaction)
        .detail("Status", RecoveryStatus::names[RecoveryStatus::recovery_transaction])
        .detail("PrimaryLocality", self->primaryLocality)
        .detail("DcId", self->masterInterface.locality.dcId())
        .detail("ClusterId", self->clusterId)
        .trackLatest(self->clusterRecoveryStateEventHolder->trackingKey);

    // Recovery transaction
    state bool debugResult = debug_checkMinRestoredVersion(UID(), self->lastEpochEnd, "DBRecovery", SevWarn);

    CommitTransactionRequest recoveryCommitRequest;
    recoveryCommitRequest.flags = recoveryCommitRequest.flags | CommitTransactionRequest::FLAG_IS_LOCK_AWARE;
    CommitTransactionRef& tr = recoveryCommitRequest.transaction;
    int mmApplied = 0; // The number of mutations in tr.mutations that have been applied to the txnStateStore so far
    if (self->lastEpochEnd != 0) {
        Optional<Value> snapRecoveryFlag = self->txnStateStore->readValue(writeRecoveryKey).get();
        TraceEvent(getRecoveryEventName(ClusterRecoveryEventType::CLUSTER_RECOVERY_SNAPSHOT_CHECK_EVENT_NAME).c_str())
            .detail("SnapRecoveryFlag", snapRecoveryFlag.present() ? snapRecoveryFlag.get().toString() : "N/A")
            .detail("LastEpochEnd", self->lastEpochEnd);
        if (snapRecoveryFlag.present()) {
            TEST(true); // Recovering from snapshot, writing to snapShotEndVersionKey
            BinaryWriter bw(Unversioned());
            tr.set(recoveryCommitRequest.arena, snapshotEndVersionKey, (bw << self->lastEpochEnd).toValue());
            // Pause the backups that got restored in this snapshot to avoid data corruption
            // Requires further operational work to abort the backup
            TraceEvent(
                getRecoveryEventName(ClusterRecoveryEventType::CLUSTER_RECOVERY_PAUSE_AGENT_BACKUP_EVENT_NAME).c_str())
                .log();
            Key backupPauseKey = FileBackupAgent::getPauseKey();
            tr.set(recoveryCommitRequest.arena, backupPauseKey, StringRef());
            // Clear the key so multiple recoveries will not overwrite the first version recorded
            tr.clear(recoveryCommitRequest.arena, singleKeyRange(writeRecoveryKey));
        }
        if (self->forceRecovery) {
            BinaryWriter bw(Unversioned());
            tr.set(recoveryCommitRequest.arena, killStorageKey, (bw << self->safeLocality).toValue());
        }

        // This transaction sets \xff/lastEpochEnd, which the shard servers can use to roll back speculatively
        // processed semi-committed transactions from the previous epoch.
		// This transaction sets \xff/lastEpochEnd, which the shard servers can use to roll back speculatively
		// processed semi-committed transactions from the previous epoch. It also guarantees the shard servers and
		// tlog servers eventually get versions in the new epoch, which clients might rely on.
		// This transaction is by itself in a batch (has its own version number), which simplifies storage servers
		// slightly (they assume there are no modifications to serverKeys in the same batch). The proxy also expects
		// the lastEpochEndKey mutation to be first in the transaction.
		BinaryWriter bw(Unversioned());
		tr.set(recoveryCommitRequest.arena, lastEpochEndKey, (bw << self->lastEpochEnd).toValue());

		if (self->forceRecovery) {
			tr.set(recoveryCommitRequest.arena, rebootWhenDurableKey, StringRef());
			tr.set(recoveryCommitRequest.arena,
			       moveKeysLockOwnerKey,
			       BinaryWriter::toValue(deterministicRandom()->randomUniqueID(), Unversioned()));
		}
	} else {
		// Recruit and seed initial shard servers
		// This transaction must be the very first one in the database (version 1)
		seedShardServers(recoveryCommitRequest.arena, tr, seedServers);
	}

	// initialConfChanges have not been conflict checked against any earlier writes in the recovery transaction, so do
	// this as early as possible in the recovery transaction but see above comments as to why it can't be absolutely
	// first. Theoretically emergency transactions should conflict check against the lastEpochEndKey.
	for (auto& itr : initialConfChanges) {
		tr.mutations.append_deep(recoveryCommitRequest.arena, itr.mutations.begin(), itr.mutations.size());
		tr.write_conflict_ranges.append_deep(
		    recoveryCommitRequest.arena, itr.write_conflict_ranges.begin(), itr.write_conflict_ranges.size());
	}

	tr.set(
	    recoveryCommitRequest.arena, primaryLocalityKey, BinaryWriter::toValue(self->primaryLocality, Unversioned()));
	tr.set(recoveryCommitRequest.arena, backupVersionKey, backupVersionValue);
	tr.set(recoveryCommitRequest.arena, coordinatorsKey, self->coordinators.ccr->getConnectionString().toString());
	tr.set(recoveryCommitRequest.arena, logsKey, self->logSystem->getLogsValue());
	tr.set(recoveryCommitRequest.arena,
	       primaryDatacenterKey,
	       self->controllerData->clusterControllerDcId.present() ? self->controllerData->clusterControllerDcId.get()
	                                                             : StringRef());

	tr.clear(recoveryCommitRequest.arena, tLogDatacentersKeys);
	for (auto& dc : self->primaryDcId) {
		tr.set(recoveryCommitRequest.arena, tLogDatacentersKeyFor(dc), StringRef());
	}
	if (self->configuration.usableRegions > 1) {
		for (auto& dc : self->remoteDcIds) {
			tr.set(recoveryCommitRequest.arena, tLogDatacentersKeyFor(dc), StringRef());
		}
	}
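	// All of the keys set above live in the \xff system keyspace, so a client or tool that wants to inspect them
	// must opt into system-key access. An illustrative read of logsKey (hypothetical tooling code, not part of
	// recovery; assumes a Database 'cx'):
	//
	//     state Transaction tr(cx);
	//     tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
	//     tr.setOption(FDBTransactionOptions::LOCK_AWARE);
	//     Optional<Value> logs = wait(tr.get(logsKey));
	//
	// LOCK_AWARE matters for the same reason the recovery commit sets FLAG_IS_LOCK_AWARE: these reads and writes
	// must succeed even while the database is locked.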
	// Write the cluster ID into the txnStateStore if it is missing.
	if (!clusterIdExists) {
		tr.set(recoveryCommitRequest.arena, clusterIdKey, BinaryWriter::toValue(self->clusterId, Unversioned()));
	}

	applyMetadataMutations(SpanID(),
	                       self->dbgid,
	                       recoveryCommitRequest.arena,
	                       tr.mutations.slice(mmApplied, tr.mutations.size()),
	                       self->txnStateStore);
	mmApplied = tr.mutations.size();

	tr.read_snapshot = self->recoveryTransactionVersion; // lastEpochEnd would make more sense, but isn't in the
	                                                     // initial window of the resolver(s)

	TraceEvent(getRecoveryEventName(ClusterRecoveryEventType::CLUSTER_RECOVERY_COMMIT_EVENT_NAME).c_str(), self->dbgid)
	    .log();
	state Future<ErrorOr<CommitID>> recoveryCommit = self->commitProxies[0].commit.tryGetReply(recoveryCommitRequest);
	self->addActor.send(self->logSystem->onError());
	self->addActor.send(waitResolverFailure(self->resolvers));
	self->addActor.send(waitCommitProxyFailure(self->commitProxies));
	self->addActor.send(waitGrvProxyFailure(self->grvProxies));
	self->addActor.send(reportErrors(updateRegistration(self, self->logSystem), "UpdateRegistration", self->dbgid));
	self->registrationTrigger.trigger();

	wait(discardCommit(self->txnStateStore, self->txnStateLogAdapter));

	// Wait for the recovery transaction to complete.
	// SOMEDAY: For faster recovery, do this and setDBState asynchronously and don't wait for them
	// unless we want to change TLogs
	wait((success(recoveryCommit) && sendInitialCommitToResolvers(self)));
	if (recoveryCommit.isReady() && recoveryCommit.get().isError()) {
		TEST(true); // Cluster recovery failed because the initial commit failed
		throw cluster_recovery_failed();
	}

	ASSERT(self->recoveryTransactionVersion != 0);

	self->recoveryState = RecoveryState::WRITING_CSTATE;
	TraceEvent(getRecoveryEventName(ClusterRecoveryEventType::CLUSTER_RECOVERY_STATE_EVENT_NAME).c_str(), self->dbgid)
	    .detail("StatusCode", RecoveryStatus::writing_coordinated_state)
	    .detail("Status", RecoveryStatus::names[RecoveryStatus::writing_coordinated_state])
	    .detail("TLogList", self->logSystem->describe())
	    .trackLatest(self->clusterRecoveryStateEventHolder->trackingKey);

	// Multiple masters prevent conflicts between themselves via CoordinatedState (self->cstate)
	//  1. If SetMaster succeeds, then by CS's contract, these "new" Tlogs are the immediate
	//     successors of the "old" ones we are replacing
	//  2. logSystem->recoverAndEndEpoch ensured that a co-quorum of the "old" tLogs were stopped at
	//     versions <= self->lastEpochEnd, so no versions > self->lastEpochEnd could be (fully) committed to them.
	//  3. No other master will attempt to commit anything to our "new" Tlogs
	//     because it didn't recruit them
	//  4. Therefore, no full commit can come between self->lastEpochEnd and the first commit
	//     we made to the new Tlogs (self->recoveryTransactionVersion), and only our own semi-commits can come
	//     between our first commit and the next new TLogs

	self->addActor.send(trackTlogRecovery(self, oldLogSystems, minRecoveryDuration));
	debug_advanceMaxCommittedVersion(UID(), self->recoveryTransactionVersion);
	wait(self->cstateUpdated.getFuture());
	debug_advanceMinCommittedVersion(UID(), self->recoveryTransactionVersion);
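	// To make the invariant above concrete with illustrative (hypothetical) numbers: suppose the previous epoch
	// ended at lastEpochEnd = 100 and the recovery transaction committed at recoveryTransactionVersion = 5000100
	// (recovery skips the version counter well ahead of the old epoch):
	//
	//     Version lastEpochEnd = 100;                // hypothetical
	//     Version recoveryTxnVersion = 5000100;      // hypothetical
	//     ASSERT(recoveryTxnVersion > lastEpochEnd); // always holds for a successful recovery
	//
	// No version in (100, 5000100) can have been fully committed by anyone else, and any semi-committed version in
	// that range belongs to this recovery attempt, so the \xff/lastEpochEnd mutation can safely roll it back.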
	if (debugResult) {
		TraceEvent(self->forceRecovery ? SevWarn : SevError, "DBRecoveryDurabilityError").log();
	}

	TraceEvent(getRecoveryEventName(ClusterRecoveryEventType::CLUSTER_RECOVERY_COMMIT_TLOG_EVENT_NAME).c_str(),
	           self->dbgid)
	    .detail("TLogs", self->logSystem->describe())
	    .detail("RecoveryCount", self->cstate.myDBState.recoveryCount)
	    .detail("RecoveryTransactionVersion", self->recoveryTransactionVersion);

	TraceEvent(recoveryInterval.end(), self->dbgid)
	    .detail("RecoveryTransactionVersion", self->recoveryTransactionVersion);

	self->recoveryState = RecoveryState::ACCEPTING_COMMITS;
	double recoveryDuration = now() - recoverStartTime;

	TraceEvent((recoveryDuration > 4 && !g_network->isSimulated()) ? SevWarnAlways : SevInfo,
	           getRecoveryEventName(ClusterRecoveryEventType::CLUSTER_RECOVERY_DURATION_EVENT_NAME).c_str(),
	           self->dbgid)
	    .detail("RecoveryDuration", recoveryDuration)
	    .trackLatest(self->clusterRecoveryDurationEventHolder->trackingKey);

	TraceEvent(getRecoveryEventName(ClusterRecoveryEventType::CLUSTER_RECOVERY_STATE_EVENT_NAME).c_str(), self->dbgid)
	    .detail("StatusCode", RecoveryStatus::accepting_commits)
	    .detail("Status", RecoveryStatus::names[RecoveryStatus::accepting_commits])
	    .detail("StoreType", self->configuration.storageServerStoreType)
	    .detail("RecoveryDuration", recoveryDuration)
	    .trackLatest(self->clusterRecoveryStateEventHolder->trackingKey);

	self->addActor.send(changeCoordinators(self));
	Database cx = openDBOnServer(self->dbInfo, TaskPriority::DefaultEndpoint, LockAware::True);
	self->addActor.send(configurationMonitor(self, cx));
	if (self->configuration.backupWorkerEnabled) {
		self->addActor.send(recruitBackupWorkers(self, cx));
	} else {
		self->logSystem->setOldestBackupEpoch(self->cstate.myDBState.recoveryCount);
	}

	wait(Future<Void>(Never()));
	throw internal_error();
}

ACTOR Future<Void> cleanupRecoveryActorCollection(Reference<ClusterRecoveryData> self, bool exThrown) {
	if (self.isValid()) {
		wait(delay(0.0));

		while (!self->addActor.isEmpty()) {
			self->addActor.getFuture().pop();
		}
	}

	return Void();
}

bool isNormalClusterRecoveryError(const Error& error) {
	return normalClusterRecoveryErrors().count(error.code());
}
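// An illustrative caller-side sketch of how this predicate is meant to be used (hypothetical; the actual call
// sites live in the cluster controller): errors classified as "normal" recovery errors are absorbed and recovery
// is retried, while anything else propagates.
//
//     try {
//         wait(clusterRecoveryCore(recoveryData));
//     } catch (Error& e) {
//         if (!isNormalClusterRecoveryError(e)) {
//             throw; // unexpected error: propagate to the caller
//         }
//         // otherwise fall through and start a new recovery attempt
//     }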
std::string& getRecoveryEventName(ClusterRecoveryEventType type) {
	ASSERT(type >= ClusterRecoveryEventType::CLUSTER_RECOVERY_STATE_EVENT_NAME &&
	       type < ClusterRecoveryEventType::CLUSTER_RECOVERY_LAST);

	// The cluster recovery state machine used to be driven from the master/sequencer process, and recovery state
	// tracking is used by test environments and tooling scripts. To ease the process of migration, the event name
	// prefix is controlled by a ServerKnob for now.
	//
	// TODO: The ServerKnob should be removed once all tooling scripts and usages are identified and updated
	// accordingly.
	static std::map<ClusterRecoveryEventType, std::string> recoveryEventNameMap;
	if (recoveryEventNameMap.empty()) {
		// Initialize the map
		recoveryEventNameMap.insert({ ClusterRecoveryEventType::CLUSTER_RECOVERY_STATE_EVENT_NAME,
		                              SERVER_KNOBS->CLUSTER_RECOVERY_EVENT_NAME_PREFIX + "RecoveryState" });
		recoveryEventNameMap.insert({ ClusterRecoveryEventType::CLUSTER_RECOVERY_COMMIT_TLOG_EVENT_NAME,
		                              SERVER_KNOBS->CLUSTER_RECOVERY_EVENT_NAME_PREFIX + "RecoveryCommittedTLogs" });
		recoveryEventNameMap.insert({ ClusterRecoveryEventType::CLUSTER_RECOVERY_DURATION_EVENT_NAME,
		                              SERVER_KNOBS->CLUSTER_RECOVERY_EVENT_NAME_PREFIX + "RecoveryDuration" });
		recoveryEventNameMap.insert({ ClusterRecoveryEventType::CLUSTER_RECOVERY_GENERATION_EVENT_NAME,
		                              SERVER_KNOBS->CLUSTER_RECOVERY_EVENT_NAME_PREFIX + "RecoveryGenerations" });
		recoveryEventNameMap.insert(
		    { ClusterRecoveryEventType::CLUSTER_RECOVERY_SS_RECRUITMENT_EVENT_NAME,
		      SERVER_KNOBS->CLUSTER_RECOVERY_EVENT_NAME_PREFIX + "RecoveryRecruitingInitialStorageServer" });
		recoveryEventNameMap.insert(
		    { ClusterRecoveryEventType::CLUSTER_RECOVERY_INVALID_CONFIG_EVENT_NAME,
		      SERVER_KNOBS->CLUSTER_RECOVERY_EVENT_NAME_PREFIX + "RecoveryInvalidConfiguration" });
		recoveryEventNameMap.insert({ ClusterRecoveryEventType::CLUSTER_RECOVERY_RECOVERING_EVENT_NAME,
		                              SERVER_KNOBS->CLUSTER_RECOVERY_EVENT_NAME_PREFIX + "Recovering" });
		recoveryEventNameMap.insert({ ClusterRecoveryEventType::CLUSTER_RECOVERY_RECOVERED_EVENT_NAME,
		                              SERVER_KNOBS->CLUSTER_RECOVERY_EVENT_NAME_PREFIX + "RecoveredConfig" });
		recoveryEventNameMap.insert({ ClusterRecoveryEventType::CLUSTER_RECOVERY_SNAPSHOT_CHECK_EVENT_NAME,
		                              SERVER_KNOBS->CLUSTER_RECOVERY_EVENT_NAME_PREFIX + "RecoverySnapshotCheck" });
		recoveryEventNameMap.insert({ ClusterRecoveryEventType::CLUSTER_RECOVERY_PAUSE_AGENT_BACKUP_EVENT_NAME,
		                              SERVER_KNOBS->CLUSTER_RECOVERY_EVENT_NAME_PREFIX + "RecoveryPauseBackupAgents" });
		recoveryEventNameMap.insert({ ClusterRecoveryEventType::CLUSTER_RECOVERY_COMMIT_EVENT_NAME,
		                              SERVER_KNOBS->CLUSTER_RECOVERY_EVENT_NAME_PREFIX + "RecoveryCommit" });
		recoveryEventNameMap.insert({ ClusterRecoveryEventType::CLUSTER_RECOVERY_AVAILABLE_EVENT_NAME,
		                              SERVER_KNOBS->CLUSTER_RECOVERY_EVENT_NAME_PREFIX + "RecoveryAvailable" });
		recoveryEventNameMap.insert({ ClusterRecoveryEventType::CLUSTER_RECOVERY_METRICS_EVENT_NAME,
		                              SERVER_KNOBS->CLUSTER_RECOVERY_EVENT_NAME_PREFIX + "RecoveryMetrics" });
	}

	auto iter = recoveryEventNameMap.find(type);
	ASSERT(iter != recoveryEventNameMap.end());
	return iter->second;
}
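// Usage sketch (illustrative, mirroring the call sites above): emitting a recovery trace event under the
// knob-controlled name.
//
//     TraceEvent(getRecoveryEventName(ClusterRecoveryEventType::CLUSTER_RECOVERY_STATE_EVENT_NAME).c_str(), dbgid)
//         .detail("StatusCode", RecoveryStatus::recovery_transaction);
//
// If CLUSTER_RECOVERY_EVENT_NAME_PREFIX were set to "Master", for example, the emitted event name would be
// "MasterRecoveryState", preserving the event names that existing tooling scripts expect.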