diff --git a/fdbserver/BackupWorker.actor.cpp b/fdbserver/BackupWorker.actor.cpp
index c8045bb0b5..ca426b7569 100644
--- a/fdbserver/BackupWorker.actor.cpp
+++ b/fdbserver/BackupWorker.actor.cpp
@@ -1086,7 +1086,7 @@ ACTOR Future backupWorker(BackupInterface interf,
 			TraceEvent("BackupWorkerDone", self.myId).detail("BackupEpoch", self.backupEpoch);
 			// Notify master so that this worker can be removed from log system, then this
 			// worker (for an old epoch's unfinished work) can safely exit.
-			wait(brokenPromiseToNever(db->get().clusterInterface.notifyBackupWorkerDone.getReply(
+			wait(brokenPromiseToNever(db->get().master.notifyBackupWorkerDone.getReply(
 			    BackupWorkerDoneRequest(self.myId, self.backupEpoch))));
 			break;
 		}
diff --git a/fdbserver/ClusterController.actor.cpp b/fdbserver/ClusterController.actor.cpp
index 8e746d268c..88a3e90d2f 100644
--- a/fdbserver/ClusterController.actor.cpp
+++ b/fdbserver/ClusterController.actor.cpp
@@ -21,7 +21,6 @@
 #include
 #include
 #include
-#include
 #include
 #include
 
@@ -30,22 +29,15 @@
 #include "flow/SystemMonitor.h"
 #include "fdbclient/ClusterConnectionMemoryRecord.h"
 #include "fdbclient/NativeAPI.actor.h"
-#include "fdbserver/ApplyMetadataMutation.h"
 #include "fdbserver/BackupInterface.h"
-#include "fdbserver/BackupProgress.actor.h"
-#include "fdbserver/ConfigBroadcaster.h"
-#include "fdbserver/CoordinatedState.h"
-#include "fdbserver/CoordinationInterface.h" // copy constructors for ServerCoordinators class
+#include "fdbserver/CoordinationInterface.h"
 #include "fdbserver/DataDistributorInterface.h"
-#include "fdbserver/DBCoreState.h"
 #include "fdbserver/Knobs.h"
 #include "fdbserver/ConfigBroadcaster.h"
 #include "fdbserver/MoveKeys.actor.h"
 #include "fdbserver/WorkerInterface.actor.h"
 #include "fdbserver/LeaderElection.h"
-#include "fdbserver/LogSystem.h"
 #include "fdbserver/LogSystemConfig.h"
-#include "fdbserver/LogSystemDiskQueueAdapter.h"
 #include "fdbserver/WaitFailure.h"
 #include "fdbserver/RatekeeperInterface.h"
 #include "fdbserver/BlobManagerInterface.h"
@@ -58,7 +50,6 @@
 #include "fdbclient/ReadYourWrites.h"
 #include "fdbrpc/Replication.h"
 #include "fdbrpc/ReplicationUtils.h"
-#include "fdbrpc/sim_validation.h"
 #include "fdbclient/KeyBackedTypes.h"
 #include "flow/Util.h"
 #include "flow/actorcompiler.h" // This must be the last #include.
@@ -129,24 +120,6 @@ struct WorkerFitnessInfo {
 	  : worker(worker), fitness(fitness), used(used) {}
 };
 
-struct RecruitWorkersInfo : ReferenceCounted {
-	RecruitFromConfigurationRequest req;
-	RecruitFromConfigurationReply rep;
-	AsyncTrigger waitForCompletion;
-	Optional dbgId;
-
-	RecruitWorkersInfo(RecruitFromConfigurationRequest const& req) : req(req) {}
-};
-
-struct RecruitRemoteWorkersInfo : ReferenceCounted {
-	RecruitRemoteFromConfigurationRequest req;
-	RecruitRemoteFromConfigurationReply rep;
-	AsyncTrigger waitForCompletion;
-	Optional dbgId;
-
-	RecruitRemoteWorkersInfo(RecruitRemoteFromConfigurationRequest const& req) : req(req) {}
-};
-
 class ClusterControllerData {
 public:
 	struct DBInfo {
@@ -1650,13 +1623,6 @@ public:
 
 		updateKnownIds(&id_used);
 
-		if (req.dbgId.present()) {
-			TraceEvent(SevDebug, "FindRemoteWorkersForConf", req.dbgId.get())
-			    .detail("RemoteDcId", req.dcId)
-			    .detail("Configuration", req.configuration.toString())
-			    .detail("Policy", req.configuration.getRemoteTLogPolicy()->name());
-		}
-
 		std::set> remoteDC;
 		remoteDC.insert(req.dcId);
 
@@ -1687,13 +1653,6 @@ public:
 			throw operation_failed();
 		}
 
-		if (req.dbgId.present()) {
-			TraceEvent(SevDebug, "FindRemoteWorkersForConf_ReturnResult", req.dbgId.get())
-			    .detail("RemoteDcId", req.dcId)
-			    .detail("ResultRemoteLogs", result.remoteTLogs.size());
-			result.dbgId = req.dbgId;
-		}
-
 		return result;
 	}
 
@@ -3205,7 +3164,6 @@ public:
 	RangeResult lastProcessClasses;
 	bool gotProcessClasses;
 	bool gotFullyRecoveredConfig;
-	bool shouldCommitSuicide;
 	Optional> masterProcessId;
 	Optional> clusterControllerProcessId;
 	Optional> clusterControllerDcId;
@@ -3215,8 +3173,8 @@ public:
 	AsyncVar>>>>
 	    changedDcIds; // current DC priorities to change second, and whether the cluster controller has been changed
 	UID id;
-	std::vector> outstandingRecruitmentRequests;
-	std::vector> outstandingRemoteRecruitmentRequests;
+	std::vector outstandingRecruitmentRequests;
+	std::vector outstandingRemoteRecruitmentRequests;
 	std::vector> outstandingStorageRequests;
 	std::vector> outstandingBlobWorkerRequests;
 	ActorCollection ac;
@@ -3281,12 +3239,12 @@ public:
 	ClusterControllerData(ClusterControllerFullInterface const& ccInterface,
 	                      LocalityData const& locality,
 	                      ServerCoordinators const& coordinators)
-	  : gotProcessClasses(false), gotFullyRecoveredConfig(false), shouldCommitSuicide(false),
-	    clusterControllerProcessId(locality.processId()), clusterControllerDcId(locality.dcId()), id(ccInterface.id()),
-	    ac(false), outstandingRequestChecker(Void()), outstandingRemoteRequestChecker(Void()), startTime(now()),
-	    goodRecruitmentTime(Never()), goodRemoteRecruitmentTime(Never()), datacenterVersionDifference(0),
-	    versionDifferenceUpdated(false), remoteDCMonitorStarted(false), remoteTransactionSystemDegraded(false),
-	    recruitDistributor(false), recruitRatekeeper(false), recruitBlobManager(false),
+	  : gotProcessClasses(false), gotFullyRecoveredConfig(false), clusterControllerProcessId(locality.processId()),
+	    clusterControllerDcId(locality.dcId()), id(ccInterface.id()), ac(false), outstandingRequestChecker(Void()),
+	    outstandingRemoteRequestChecker(Void()), startTime(now()), goodRecruitmentTime(Never()),
+	    goodRemoteRecruitmentTime(Never()), datacenterVersionDifference(0), versionDifferenceUpdated(false),
+	    remoteDCMonitorStarted(false), remoteTransactionSystemDegraded(false), recruitDistributor(false),
+	    recruitRatekeeper(false), recruitBlobManager(false),
 	    clusterControllerMetrics("ClusterController", id.toString()),
 	    openDatabaseRequests("OpenDatabaseRequests",
clusterControllerMetrics), registerWorkerRequests("RegisterWorkerRequests", clusterControllerMetrics), @@ -3313,2022 +3271,6 @@ public: } }; -// Assist Cluster Recovery state machine -namespace ClusterControllerRecovery { - -static std::set const& normalClusterRecoveryErrors() { - static std::set s; - if (s.empty()) { - s.insert(error_code_operation_failed); - s.insert(error_code_tlog_stopped); - s.insert(error_code_tlog_failed); - s.insert(error_code_commit_proxy_failed); - s.insert(error_code_grv_proxy_failed); - s.insert(error_code_resolver_failed); - s.insert(error_code_backup_worker_failed); - s.insert(error_code_recruitment_failed); - s.insert(error_code_no_more_servers); - s.insert(error_code_cluster_recovery_failed); - s.insert(error_code_coordinated_state_conflict); - s.insert(error_code_master_max_versions_in_flight); - s.insert(error_code_worker_removed); - s.insert(error_code_new_coordinators_timed_out); - s.insert(error_code_broken_promise); - } - return s; -} - -ACTOR Future recoveryTerminateOnConflict(UID dbgid, - Promise fullyRecovered, - Future onConflict, - Future switchedState) { - choose { - when(wait(onConflict)) { - if (!fullyRecovered.isSet()) { - TraceEvent("RecoveryTerminated", dbgid).detail("Reason", "Conflict"); - TEST(true); // Coordinated state conflict, recovery terminating - throw worker_removed(); - } - return Void(); - } - when(wait(switchedState)) { return Void(); } - } -} - -class ReusableCoordinatedState : NonCopyable { -public: - Promise fullyRecovered; - DBCoreState prevDBState; - DBCoreState myDBState; - bool finalWriteStarted; - Future previousWrite; - - ReusableCoordinatedState(ServerCoordinators const& coordinators, - PromiseStream> const& addActor, - UID const& dbgid) - : finalWriteStarted(false), previousWrite(Void()), cstate(coordinators), coordinators(coordinators), - addActor(addActor), dbgid(dbgid) {} - - Future read() { return _read(this); } - - Future write(DBCoreState newState, bool finalWrite = false) { - previousWrite = _write(this, newState, finalWrite); - return previousWrite; - } - - Future move(ClusterConnectionString const& nc) { return cstate.move(nc); } - -private: - MovableCoordinatedState cstate; - ServerCoordinators coordinators; - PromiseStream> addActor; - Promise switchedState; - UID dbgid; - - ACTOR Future _read(ReusableCoordinatedState* self) { - Value prevDBStateRaw = wait(self->cstate.read()); - Future onConflict = recoveryTerminateOnConflict( - self->dbgid, self->fullyRecovered, self->cstate.onConflict(), self->switchedState.getFuture()); - if (onConflict.isReady() && onConflict.isError()) { - throw onConflict.getError(); - } - self->addActor.send(onConflict); - - if (prevDBStateRaw.size()) { - self->prevDBState = BinaryReader::fromStringRef(prevDBStateRaw, IncludeVersion()); - self->myDBState = self->prevDBState; - } - - return Void(); - } - - ACTOR Future _write(ReusableCoordinatedState* self, DBCoreState newState, bool finalWrite) { - if (self->finalWriteStarted) { - wait(Future(Never())); - } - - if (finalWrite) { - self->finalWriteStarted = true; - } - - try { - wait(self->cstate.setExclusive( - BinaryWriter::toValue(newState, IncludeVersion(ProtocolVersion::withDBCoreState())))); - } catch (Error& e) { - TEST(true); // Master displaced during writeMasterState - throw; - } - - self->myDBState = newState; - - if (!finalWrite) { - self->switchedState.send(Void()); - self->cstate = MovableCoordinatedState(self->coordinators); - Value rereadDBStateRaw = wait(self->cstate.read()); - DBCoreState readState; - if 
(rereadDBStateRaw.size()) - readState = BinaryReader::fromStringRef(rereadDBStateRaw, IncludeVersion()); - - if (readState != newState) { - TraceEvent("RecoveryTerminated", self->dbgid).detail("Reason", "CStateChanged"); - TEST(true); // Coordinated state changed between writing and reading, recovery restarting - throw worker_removed(); - } - self->switchedState = Promise(); - self->addActor.send(recoveryTerminateOnConflict( - self->dbgid, self->fullyRecovered, self->cstate.onConflict(), self->switchedState.getFuture())); - } else { - self->fullyRecovered.send(Void()); - } - - return Void(); - } -}; - -struct ClusterRecoveryData : NonCopyable, ReferenceCounted { - ClusterControllerData* controllerData; - - UID dbgid; - - AsyncTrigger registrationTrigger; - Version lastEpochEnd, // The last version in the old epoch not (to be) rolled back in this recovery - recoveryTransactionVersion; // The first version in this epoch - double lastCommitTime; - - Version liveCommittedVersion; // The largest live committed version reported by commit proxies. - bool databaseLocked; - Optional proxyMetadataVersion; - Version minKnownCommittedVersion; - - DatabaseConfiguration originalConfiguration; - DatabaseConfiguration configuration; - std::vector> primaryDcId; - std::vector> remoteDcIds; - bool hasConfiguration; - - ServerCoordinators coordinators; - - Reference logSystem; - Version version; // The last version assigned to a proxy by getVersion() - double lastVersionTime; - LogSystemDiskQueueAdapter* txnStateLogAdapter; - IKeyValueStore* txnStateStore; - int64_t memoryLimit; - std::map, int8_t> dcId_locality; - std::vector allTags; - - int8_t getNextLocality() { - int8_t maxLocality = -1; - for (auto it : dcId_locality) { - maxLocality = std::max(maxLocality, it.second); - } - return maxLocality + 1; - } - - std::vector commitProxies; - std::vector provisionalCommitProxies; - std::vector grvProxies; - std::vector provisionalGrvProxies; - std::vector resolvers; - - std::map lastCommitProxyVersionReplies; - - UID clusterId; - Standalone dbId; - - MasterInterface masterInterface; - LifetimeToken masterLifetime; - const ClusterControllerFullInterface - clusterController; // If the cluster controller changes, this master will die, so this is immutable. - - ReusableCoordinatedState cstate; - Promise recoveryReadyForCommits; - Promise cstateUpdated; - Reference const> dbInfo; - int64_t registrationCount; // Number of different MasterRegistrationRequests sent to clusterController - - RecoveryState recoveryState; - - AsyncVar>> resolverChanges; - Version resolverChangesVersion; - std::set resolverNeedingChanges; - - PromiseStream> addActor; - Reference> recruitmentStalled; - bool forceRecovery; - bool neverCreated; - int8_t safeLocality; - int8_t primaryLocality; - - std::vector backupWorkers; // Recruited backup workers from cluster controller. 
- - CounterCollection cc; - Counter changeCoordinatorsRequests; - Counter getCommitVersionRequests; - Counter backupWorkerDoneRequests; - Counter getLiveCommittedVersionRequests; - Counter reportLiveCommittedVersionRequests; - - Future logger; - - Reference clusterRecoveryStateEventHolder; - Reference clusterRecoveryGenerationsEventHolder; - Reference clusterRecoveryDurationEventHolder; - Reference clusterRecoveryAvailableEventHolder; - Reference recoveredConfigEventHolder; - - ClusterRecoveryData(ClusterControllerData* controllerData, - Reference const> const& dbInfo, - MasterInterface const& masterInterface, - LifetimeToken const& masterLifetimeToken, - ServerCoordinators const& coordinators, - ClusterControllerFullInterface const& clusterController, - Standalone const& dbId, - PromiseStream> const& addActor, - bool forceRecovery) - - : controllerData(controllerData), dbgid(masterInterface.id()), lastEpochEnd(invalidVersion), - recoveryTransactionVersion(invalidVersion), lastCommitTime(0), liveCommittedVersion(invalidVersion), - databaseLocked(false), minKnownCommittedVersion(invalidVersion), hasConfiguration(false), - coordinators(coordinators), version(invalidVersion), lastVersionTime(0), txnStateStore(nullptr), - memoryLimit(2e9), dbId(dbId), masterInterface(masterInterface), masterLifetime(masterLifetimeToken), - clusterController(clusterController), cstate(coordinators, addActor, dbgid), dbInfo(dbInfo), - registrationCount(0), addActor(addActor), recruitmentStalled(makeReference>(false)), - forceRecovery(forceRecovery), neverCreated(false), safeLocality(tagLocalityInvalid), - primaryLocality(tagLocalityInvalid), cc("Master", dbgid.toString()), - changeCoordinatorsRequests("ChangeCoordinatorsRequests", cc), - getCommitVersionRequests("GetCommitVersionRequests", cc), - backupWorkerDoneRequests("BackupWorkerDoneRequests", cc), - getLiveCommittedVersionRequests("GetLiveCommittedVersionRequests", cc), - reportLiveCommittedVersionRequests("ReportLiveCommittedVersionRequests", cc), - clusterRecoveryStateEventHolder(makeReference("ClusterRecoveryState")), - clusterRecoveryGenerationsEventHolder(makeReference("ClusterRecoveryGenerations")), - clusterRecoveryDurationEventHolder(makeReference("ClusterRecoveryDuration")), - clusterRecoveryAvailableEventHolder(makeReference("ClusterRecoveryAvailable")), - recoveredConfigEventHolder(makeReference("RecoveredConfig")) { - logger = traceCounters( - "ClusterRecoveryMetrics", dbgid, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, &cc, "ClusterRecoveryMetrics"); - if (forceRecovery && !controllerData->clusterControllerDcId.present()) { - TraceEvent(SevError, "ForcedRecoveryRequiresDcID").log(); - forceRecovery = false; - } - } - ~ClusterRecoveryData() { - if (txnStateStore) - txnStateStore->close(); - } -}; - -ACTOR Future recruitNewMaster(ClusterControllerData* cluster, - ClusterControllerData::DBInfo* db, - MasterInterface* newMaster) { - state Future> fNewMaster; - state WorkerFitnessInfo masterWorker; - - loop { - // We must recruit the master in the same data center as the cluster controller. - // This should always be possible, because we can recruit the master on the same process as the cluster - // controller. 
- std::map>, int> id_used; - id_used[cluster->clusterControllerProcessId]++; - masterWorker = cluster->getWorkerForRoleInDatacenter( - cluster->clusterControllerDcId, ProcessClass::Master, ProcessClass::NeverAssign, db->config, id_used); - if ((masterWorker.worker.processClass.machineClassFitness(ProcessClass::Master) > - SERVER_KNOBS->EXPECTED_MASTER_FITNESS || - masterWorker.worker.interf.locality.processId() == cluster->clusterControllerProcessId) && - !cluster->goodRecruitmentTime.isReady()) { - TraceEvent("RecruitNewMaster", cluster->id) - .detail("Fitness", masterWorker.worker.processClass.machineClassFitness(ProcessClass::Master)); - wait(delay(SERVER_KNOBS->ATTEMPT_RECRUITMENT_DELAY)); - continue; - } - RecruitMasterRequest rmq; - rmq.lifetime = db->serverInfo->get().masterLifetime; - rmq.forceRecovery = db->forceRecovery; - - cluster->masterProcessId = masterWorker.worker.interf.locality.processId(); - cluster->db.unfinishedRecoveries++; - fNewMaster = masterWorker.worker.interf.master.tryGetReply(rmq); - wait(ready(fNewMaster) || db->forceMasterFailure.onTrigger()); - if (fNewMaster.isReady() && fNewMaster.get().present()) { - TraceEvent("RecruitNewMaster", cluster->id).detail("Recruited", fNewMaster.get().get().id()); - - // for status tool - TraceEvent("RecruitedMasterWorker", cluster->id) - .detail("Address", fNewMaster.get().get().address()) - .trackLatest(cluster->recruitedMasterWorkerEventHolder->trackingKey); - - *newMaster = fNewMaster.get().get(); - - return Void(); - } else { - TEST(true); // clusterWatchDatabase() !newMaster.present() - wait(delay(SERVER_KNOBS->MASTER_SPIN_DELAY)); - } - } -} - -ACTOR Future clusterRecruitFromConfiguration(ClusterControllerData* self, Reference req) { - // At the moment this doesn't really need to be an actor (it always completes immediately) - TEST(true); // ClusterController RecruitTLogsRequest - loop { - try { - req->rep = self->findWorkersForConfiguration(req->req); - return Void(); - } catch (Error& e) { - if (e.code() == error_code_no_more_servers && self->goodRecruitmentTime.isReady()) { - self->outstandingRecruitmentRequests.push_back(req); - TraceEvent(SevWarn, "RecruitFromConfigurationNotAvailable", self->id).error(e); - wait(req->waitForCompletion.onTrigger()); - return Void(); - } else if (e.code() == error_code_operation_failed || e.code() == error_code_no_more_servers) { - // recruitment not good enough, try again - TraceEvent("RecruitFromConfigurationRetry", self->id) - .error(e) - .detail("GoodRecruitmentTimeReady", self->goodRecruitmentTime.isReady()); - while (!self->goodRecruitmentTime.isReady()) { - wait(lowPriorityDelay(SERVER_KNOBS->ATTEMPT_RECRUITMENT_DELAY)); - } - } else { - TraceEvent(SevError, "RecruitFromConfigurationError", self->id).error(e); - throw; - } - } - wait(lowPriorityDelay(SERVER_KNOBS->ATTEMPT_RECRUITMENT_DELAY)); - } -} - -ACTOR Future clusterRecruitRemoteFromConfiguration( - ClusterControllerData* self, - Reference req) { - // At the moment this doesn't really need to be an actor (it always completes immediately) - TEST(true); // ClusterController RecruitTLogsRequest Remote - loop { - try { - auto rep = self->findRemoteWorkersForConfiguration(req->req); - return rep; - } catch (Error& e) { - if (e.code() == error_code_no_more_servers && self->goodRemoteRecruitmentTime.isReady()) { - self->outstandingRemoteRecruitmentRequests.push_back(req); - TraceEvent(SevWarn, "RecruitRemoteFromConfigurationNotAvailable", self->id).error(e); - wait(req->waitForCompletion.onTrigger()); - return req->rep; - } 
else if (e.code() == error_code_operation_failed || e.code() == error_code_no_more_servers) { - // recruitment not good enough, try again - TraceEvent("RecruitRemoteFromConfigurationRetry", self->id) - .error(e) - .detail("GoodRecruitmentTimeReady", self->goodRemoteRecruitmentTime.isReady()); - while (!self->goodRemoteRecruitmentTime.isReady()) { - wait(lowPriorityDelay(SERVER_KNOBS->ATTEMPT_RECRUITMENT_DELAY)); - } - } else { - TraceEvent(SevError, "RecruitRemoteFromConfigurationError", self->id).error(e); - throw; - } - } - wait(lowPriorityDelay(SERVER_KNOBS->ATTEMPT_RECRUITMENT_DELAY)); - } -} - -ACTOR Future newCommitProxies(Reference self, RecruitFromConfigurationReply recr) { - std::vector> initializationReplies; - for (int i = 0; i < recr.commitProxies.size(); i++) { - InitializeCommitProxyRequest req; - req.master = self->masterInterface; - req.masterLifetime = self->masterLifetime; - req.recoveryCount = self->cstate.myDBState.recoveryCount + 1; - req.recoveryTransactionVersion = self->recoveryTransactionVersion; - req.firstProxy = i == 0; - TraceEvent("CommitProxyReplies", self->dbgid) - .detail("WorkerID", recr.commitProxies[i].id()) - .detail("ReocoveryTxnVersion", self->recoveryTransactionVersion) - .detail("FirstProxy", req.firstProxy ? "True" : "False"); - initializationReplies.push_back( - transformErrors(throwErrorOr(recr.commitProxies[i].commitProxy.getReplyUnlessFailedFor( - req, SERVER_KNOBS->TLOG_TIMEOUT, SERVER_KNOBS->MASTER_FAILURE_SLOPE_DURING_RECOVERY)), - commit_proxy_failed())); - } - - std::vector newRecruits = wait(getAll(initializationReplies)); - // It is required for the correctness of COMMIT_ON_FIRST_PROXY that self->commitProxies[0] is the firstCommitProxy. - self->commitProxies = newRecruits; - - return Void(); -} - -ACTOR Future newGrvProxies(Reference self, RecruitFromConfigurationReply recr) { - std::vector> initializationReplies; - for (int i = 0; i < recr.grvProxies.size(); i++) { - InitializeGrvProxyRequest req; - req.master = self->masterInterface; - req.masterLifetime = self->masterLifetime; - req.recoveryCount = self->cstate.myDBState.recoveryCount + 1; - TraceEvent("GrvProxyReplies", self->dbgid).detail("WorkerID", recr.grvProxies[i].id()); - initializationReplies.push_back( - transformErrors(throwErrorOr(recr.grvProxies[i].grvProxy.getReplyUnlessFailedFor( - req, SERVER_KNOBS->TLOG_TIMEOUT, SERVER_KNOBS->MASTER_FAILURE_SLOPE_DURING_RECOVERY)), - grv_proxy_failed())); - } - - std::vector newRecruits = wait(getAll(initializationReplies)); - self->grvProxies = newRecruits; - return Void(); -} - -ACTOR Future newResolvers(Reference self, RecruitFromConfigurationReply recr) { - std::vector> initializationReplies; - for (int i = 0; i < recr.resolvers.size(); i++) { - InitializeResolverRequest req; - req.recoveryCount = self->cstate.myDBState.recoveryCount + 1; - req.commitProxyCount = recr.commitProxies.size(); - req.resolverCount = recr.resolvers.size(); - TraceEvent("ResolverReplies", self->dbgid).detail("WorkerID", recr.resolvers[i].id()); - initializationReplies.push_back( - transformErrors(throwErrorOr(recr.resolvers[i].resolver.getReplyUnlessFailedFor( - req, SERVER_KNOBS->TLOG_TIMEOUT, SERVER_KNOBS->MASTER_FAILURE_SLOPE_DURING_RECOVERY)), - resolver_failed())); - } - - std::vector newRecruits = wait(getAll(initializationReplies)); - self->resolvers = newRecruits; - - return Void(); -} - -ACTOR Future newTLogServers(Reference self, - RecruitFromConfigurationReply recr, - Reference oldLogSystem, - std::vector>* initialConfChanges) { - if 
(self->configuration.usableRegions > 1) { - state Optional remoteDcId = self->remoteDcIds.size() ? self->remoteDcIds[0] : Optional(); - if (!self->dcId_locality.count(recr.dcId)) { - int8_t loc = self->getNextLocality(); - Standalone tr; - tr.set(tr.arena(), tagLocalityListKeyFor(recr.dcId), tagLocalityListValue(loc)); - initialConfChanges->push_back(tr); - self->dcId_locality[recr.dcId] = loc; - TraceEvent(SevWarn, "UnknownPrimaryDCID", self->dbgid).detail("PrimaryId", recr.dcId).detail("Loc", loc); - } - - if (!self->dcId_locality.count(remoteDcId)) { - int8_t loc = self->getNextLocality(); - Standalone tr; - tr.set(tr.arena(), tagLocalityListKeyFor(remoteDcId), tagLocalityListValue(loc)); - initialConfChanges->push_back(tr); - self->dcId_locality[remoteDcId] = loc; - TraceEvent(SevWarn, "UnknownRemoteDCID", self->dbgid).detail("RemoteId", remoteDcId).detail("Loc", loc); - } - - std::vector exclusionWorkerIds; - std::transform(recr.tLogs.begin(), - recr.tLogs.end(), - std::back_inserter(exclusionWorkerIds), - [](const WorkerInterface& in) { return in.id(); }); - std::transform(recr.satelliteTLogs.begin(), - recr.satelliteTLogs.end(), - std::back_inserter(exclusionWorkerIds), - [](const WorkerInterface& in) { return in.id(); }); - - RecruitRemoteFromConfigurationRequest remoteRecruitReq( - self->configuration, - remoteDcId, - recr.tLogs.size() * - std::max(1, self->configuration.desiredLogRouterCount / std::max(1, recr.tLogs.size())), - exclusionWorkerIds); - remoteRecruitReq.dbgId = self->dbgid; - state Reference recruitWorkersInfo = - makeReference(remoteRecruitReq); - recruitWorkersInfo->dbgId = self->dbgid; - Future fRemoteWorkers = - clusterRecruitRemoteFromConfiguration(self->controllerData, recruitWorkersInfo); - - self->primaryLocality = self->dcId_locality[recr.dcId]; - self->logSystem = Reference(); // Cancels the actors in the previous log system. - Reference newLogSystem = wait(oldLogSystem->newEpoch(recr, - fRemoteWorkers, - self->clusterId, - self->configuration, - self->cstate.myDBState.recoveryCount + 1, - self->primaryLocality, - self->dcId_locality[remoteDcId], - self->allTags, - self->recruitmentStalled)); - self->logSystem = newLogSystem; - } else { - self->primaryLocality = tagLocalitySpecial; - self->logSystem = Reference(); // Cancels the actors in the previous log system. - Reference newLogSystem = wait(oldLogSystem->newEpoch(recr, - Never(), - self->clusterId, - self->configuration, - self->cstate.myDBState.recoveryCount + 1, - self->primaryLocality, - tagLocalitySpecial, - self->allTags, - self->recruitmentStalled)); - self->logSystem = newLogSystem; - } - return Void(); -} - -ACTOR Future newSeedServers(Reference self, - RecruitFromConfigurationReply recruits, - std::vector* servers) { - // This is only necessary if the database is at version 0 - servers->clear(); - if (self->lastEpochEnd) - return Void(); - - state int idx = 0; - state std::map, Tag> dcId_tags; - state int8_t nextLocality = 0; - while (idx < recruits.storageServers.size()) { - TraceEvent("ClusterRecoveryRecruitingInitialStorageServer", self->dbgid) - .detail("CandidateWorker", recruits.storageServers[idx].locality.toString()); - - InitializeStorageRequest isr; - isr.seedTag = dcId_tags.count(recruits.storageServers[idx].locality.dcId()) - ? 
dcId_tags[recruits.storageServers[idx].locality.dcId()] - : Tag(nextLocality, 0); - isr.storeType = self->configuration.storageServerStoreType; - isr.reqId = deterministicRandom()->randomUniqueID(); - isr.interfaceId = deterministicRandom()->randomUniqueID(); - isr.clusterId = self->clusterId; - - ErrorOr newServer = wait(recruits.storageServers[idx].storage.tryGetReply(isr)); - - if (newServer.isError()) { - if (!newServer.isError(error_code_recruitment_failed) && - !newServer.isError(error_code_request_maybe_delivered)) - throw newServer.getError(); - - TEST(true); // initial storage recuitment loop failed to get new server - wait(delay(SERVER_KNOBS->STORAGE_RECRUITMENT_DELAY)); - } else { - if (!dcId_tags.count(recruits.storageServers[idx].locality.dcId())) { - dcId_tags[recruits.storageServers[idx].locality.dcId()] = Tag(nextLocality, 0); - nextLocality++; - } - - Tag& tag = dcId_tags[recruits.storageServers[idx].locality.dcId()]; - tag.id++; - idx++; - - servers->push_back(newServer.get().interf); - } - } - - self->dcId_locality.clear(); - for (auto& it : dcId_tags) { - self->dcId_locality[it.first] = it.second.locality; - } - - TraceEvent("ClusterRecoveryRecruitedInitialStorageServers", self->dbgid) - .detail("TargetCount", self->configuration.storageTeamSize) - .detail("Servers", describe(*servers)); - - return Void(); -} - -Future waitCommitProxyFailure(std::vector const& commitProxies) { - std::vector> failed; - failed.reserve(commitProxies.size()); - for (auto commitProxy : commitProxies) { - failed.push_back(waitFailureClient(commitProxy.waitFailure, - SERVER_KNOBS->TLOG_TIMEOUT, - -SERVER_KNOBS->TLOG_TIMEOUT / SERVER_KNOBS->SECONDS_BEFORE_NO_FAILURE_DELAY, - /*trace=*/true)); - } - ASSERT(failed.size() >= 1); - return tagError(quorum(failed, 1), commit_proxy_failed()); -} - -Future waitGrvProxyFailure(std::vector const& grvProxies) { - std::vector> failed; - failed.reserve(grvProxies.size()); - for (int i = 0; i < grvProxies.size(); i++) - failed.push_back(waitFailureClient(grvProxies[i].waitFailure, - SERVER_KNOBS->TLOG_TIMEOUT, - -SERVER_KNOBS->TLOG_TIMEOUT / SERVER_KNOBS->SECONDS_BEFORE_NO_FAILURE_DELAY, - /*trace=*/true)); - ASSERT(failed.size() >= 1); - return tagError(quorum(failed, 1), grv_proxy_failed()); -} - -Future waitResolverFailure(std::vector const& resolvers) { - std::vector> failed; - failed.reserve(resolvers.size()); - for (auto resolver : resolvers) { - failed.push_back(waitFailureClient(resolver.waitFailure, - SERVER_KNOBS->TLOG_TIMEOUT, - -SERVER_KNOBS->TLOG_TIMEOUT / SERVER_KNOBS->SECONDS_BEFORE_NO_FAILURE_DELAY, - /*trace=*/true)); - } - ASSERT(failed.size() >= 1); - return tagError(quorum(failed, 1), resolver_failed()); -} - -ACTOR Future rejoinRequestHandler(Reference self) { - loop { - TLogRejoinRequest req = waitNext(self->clusterController.tlogRejoin.getFuture()); - TraceEvent(SevDebug, "TLogRejoinRequestHandler") - .detail("MasterLifeTime", self->dbInfo->get().masterLifetime.toString()); - req.reply.send(true); - } -} - -// Keeps the coordinated state (cstate) updated as the set of recruited tlogs change through recovery. 
-ACTOR Future trackTlogRecovery(Reference self, - Reference>> oldLogSystems, - Future minRecoveryDuration) { - state Future rejoinRequests = Never(); - state DBRecoveryCount recoverCount = self->cstate.myDBState.recoveryCount + 1; - state DatabaseConfiguration configuration = - self->configuration; // self-configuration can be changed by configurationMonitor so we need a copy - loop { - state DBCoreState newState; - self->logSystem->toCoreState(newState); - newState.recoveryCount = recoverCount; - state Future changed = self->logSystem->onCoreStateChanged(); - - ASSERT(newState.tLogs[0].tLogWriteAntiQuorum == configuration.tLogWriteAntiQuorum && - newState.tLogs[0].tLogReplicationFactor == configuration.tLogReplicationFactor); - - state bool allLogs = - newState.tLogs.size() == - configuration.expectedLogSets(self->primaryDcId.size() ? self->primaryDcId[0] : Optional()); - state bool finalUpdate = !newState.oldTLogData.size() && allLogs; - TraceEvent("TrackTlogRecovery") - .detail("FinalUpdate", finalUpdate) - .detail("NewState.tlogs", newState.tLogs.size()) - .detail("NewState.OldTLogs", newState.oldTLogData.size()) - .detail("Expected.tlogs", - configuration.expectedLogSets(self->primaryDcId.size() ? self->primaryDcId[0] : Optional())); - wait(self->cstate.write(newState, finalUpdate)); - if (self->cstateUpdated.canBeSet()) { - self->cstateUpdated.send(Void()); - } - - wait(minRecoveryDuration); - self->logSystem->coreStateWritten(newState); - - if (self->recoveryReadyForCommits.canBeSet()) { - self->recoveryReadyForCommits.send(Void()); - } - - if (finalUpdate) { - self->recoveryState = RecoveryState::FULLY_RECOVERED; - TraceEvent("ClusterRecoveryState", self->dbgid) - .detail("StatusCode", RecoveryStatus::fully_recovered) - .detail("Status", RecoveryStatus::names[RecoveryStatus::fully_recovered]) - .detail("FullyRecoveredAtVersion", self->version) - .detail("ClusterId", self->clusterId) - .trackLatest(self->clusterRecoveryStateEventHolder->trackingKey); - - TraceEvent("ClusterRecoveryGenerations", self->dbgid) - .detail("ActiveGenerations", 1) - .trackLatest(self->clusterRecoveryGenerationsEventHolder->trackingKey); - } else if (!newState.oldTLogData.size() && self->recoveryState < RecoveryState::STORAGE_RECOVERED) { - self->recoveryState = RecoveryState::STORAGE_RECOVERED; - TraceEvent("ClusterRecoveryState", self->dbgid) - .detail("StatusCode", RecoveryStatus::storage_recovered) - .detail("Status", RecoveryStatus::names[RecoveryStatus::storage_recovered]) - .trackLatest(self->clusterRecoveryStateEventHolder->trackingKey); - } else if (allLogs && self->recoveryState < RecoveryState::ALL_LOGS_RECRUITED) { - self->recoveryState = RecoveryState::ALL_LOGS_RECRUITED; - TraceEvent("ClusterRecoveryState", self->dbgid) - .detail("StatusCode", RecoveryStatus::all_logs_recruited) - .detail("Status", RecoveryStatus::names[RecoveryStatus::all_logs_recruited]) - .trackLatest(self->clusterRecoveryStateEventHolder->trackingKey); - } - - if (newState.oldTLogData.size() && configuration.repopulateRegionAntiQuorum > 0 && - self->logSystem->remoteStorageRecovered()) { - TraceEvent(SevWarnAlways, "RecruitmentStalled_RemoteStorageRecovered", self->dbgid).log(); - self->recruitmentStalled->set(true); - } - self->registrationTrigger.trigger(); - - if (finalUpdate) { - oldLogSystems->get()->stopRejoins(); - rejoinRequests = rejoinRequestHandler(self); - return Void(); - } - - wait(changed); - } -} - -std::pair findRange(CoalescedKeyRangeMap& key_resolver, - Standalone>& movedRanges, - int src, - int dest) { 
- auto ranges = key_resolver.ranges(); - auto prev = ranges.begin(); - auto it = ranges.begin(); - ++it; - if (it == ranges.end()) { - if (ranges.begin().value() != src || - std::find(movedRanges.begin(), movedRanges.end(), ResolverMoveRef(ranges.begin()->range(), dest)) != - movedRanges.end()) - throw operation_failed(); - return std::make_pair(ranges.begin().range(), true); - } - - std::set borders; - // If possible expand an existing boundary between the two resolvers - for (; it != ranges.end(); ++it) { - if (it->value() == src && prev->value() == dest && - std::find(movedRanges.begin(), movedRanges.end(), ResolverMoveRef(it->range(), dest)) == - movedRanges.end()) { - return std::make_pair(it->range(), true); - } - if (it->value() == dest && prev->value() == src && - std::find(movedRanges.begin(), movedRanges.end(), ResolverMoveRef(prev->range(), dest)) == - movedRanges.end()) { - return std::make_pair(prev->range(), false); - } - if (it->value() == dest) - borders.insert(prev->value()); - if (prev->value() == dest) - borders.insert(it->value()); - ++prev; - } - - prev = ranges.begin(); - it = ranges.begin(); - ++it; - // If possible create a new boundry which doesn't exist yet - for (; it != ranges.end(); ++it) { - if (it->value() == src && !borders.count(prev->value()) && - std::find(movedRanges.begin(), movedRanges.end(), ResolverMoveRef(it->range(), dest)) == - movedRanges.end()) { - return std::make_pair(it->range(), true); - } - if (prev->value() == src && !borders.count(it->value()) && - std::find(movedRanges.begin(), movedRanges.end(), ResolverMoveRef(prev->range(), dest)) == - movedRanges.end()) { - return std::make_pair(prev->range(), false); - } - ++prev; - } - - it = ranges.begin(); - for (; it != ranges.end(); ++it) { - if (it->value() == src && - std::find(movedRanges.begin(), movedRanges.end(), ResolverMoveRef(it->range(), dest)) == - movedRanges.end()) { - return std::make_pair(it->range(), true); - } - } - throw operation_failed(); // we are already attempting to move all of the data one resolver is assigned, so do not - // move anything -} - -ACTOR Future resolutionBalancing(Reference self) { - state CoalescedKeyRangeMap key_resolver; - key_resolver.insert(allKeys, 0); - loop { - wait(delay(SERVER_KNOBS->MIN_BALANCE_TIME, TaskPriority::ResolutionMetrics)); - while (self->resolverChanges.get().size()) - wait(self->resolverChanges.onChange()); - state std::vector> futures; - for (auto& p : self->resolvers) - futures.push_back( - brokenPromiseToNever(p.metrics.getReply(ResolutionMetricsRequest(), TaskPriority::ResolutionMetrics))); - wait(waitForAll(futures)); - state IndexedSet, NoMetric> metrics; - - int64_t total = 0; - for (int i = 0; i < futures.size(); i++) { - total += futures[i].get().value; - metrics.insert(std::make_pair(futures[i].get().value, i), NoMetric()); - //TraceEvent("ResolverMetric").detail("I", i).detail("Metric", futures[i].get()); - } - if (metrics.lastItem()->first - metrics.begin()->first > SERVER_KNOBS->MIN_BALANCE_DIFFERENCE) { - try { - state int src = metrics.lastItem()->second; - state int dest = metrics.begin()->second; - state int64_t amount = std::min(metrics.lastItem()->first - total / self->resolvers.size(), - total / self->resolvers.size() - metrics.begin()->first) / - 2; - state Standalone> movedRanges; - - loop { - state std::pair range = findRange(key_resolver, movedRanges, src, dest); - - ResolutionSplitRequest req; - req.front = range.second; - req.offset = amount; - req.range = range.first; - - ResolutionSplitReply split = - 
wait(brokenPromiseToNever(self->resolvers[metrics.lastItem()->second].split.getReply( - req, TaskPriority::ResolutionMetrics))); - KeyRangeRef moveRange = range.second ? KeyRangeRef(range.first.begin, split.key) - : KeyRangeRef(split.key, range.first.end); - movedRanges.push_back_deep(movedRanges.arena(), ResolverMoveRef(moveRange, dest)); - TraceEvent("MovingResolutionRange") - .detail("Src", src) - .detail("Dest", dest) - .detail("Amount", amount) - .detail("StartRange", range.first) - .detail("MoveRange", moveRange) - .detail("Used", split.used) - .detail("KeyResolverRanges", key_resolver.size()); - amount -= split.used; - if (moveRange != range.first || amount <= 0) - break; - } - for (auto& it : movedRanges) - key_resolver.insert(it.range, it.dest); - // for(auto& it : key_resolver.ranges()) - // TraceEvent("KeyResolver").detail("Range", it.range()).detail("Value", it.value()); - - self->resolverChangesVersion = self->version + 1; - for (auto& p : self->commitProxies) - self->resolverNeedingChanges.insert(p.id()); - self->resolverChanges.set(movedRanges); - } catch (Error& e) { - if (e.code() != error_code_operation_failed) - throw; - } - } - } -} - -ACTOR Future changeCoordinators(Reference self) { - loop { - ChangeCoordinatorsRequest req = waitNext(self->clusterController.changeCoordinators.getFuture()); - TraceEvent("ChangeCoordinators", self->dbgid).log(); - ++self->changeCoordinatorsRequests; - state ChangeCoordinatorsRequest changeCoordinatorsRequest = req; - - // Kill cluster controller to facilitate coordinator registration update - if (self->controllerData->shouldCommitSuicide) { - throw restart_cluster_controller(); - } - self->controllerData->shouldCommitSuicide = true; - - while (!self->cstate.previousWrite.isReady()) { - wait(self->cstate.previousWrite); - wait(delay( - 0)); // if a new core state is ready to be written, have that take priority over our finalizing write; - } - - if (!self->cstate.fullyRecovered.isSet()) { - wait(self->cstate.write(self->cstate.myDBState, true)); - } - - try { - wait(self->cstate.move(ClusterConnectionString(changeCoordinatorsRequest.newConnectionString.toString()))); - } catch (Error& e) { - if (e.code() != error_code_actor_cancelled) - changeCoordinatorsRequest.reply.sendError(e); - - throw; - } - - throw internal_error(); - } -} - -ACTOR Future configurationMonitor(Reference self, Database cx) { - loop { - state ReadYourWritesTransaction tr(cx); - - loop { - try { - tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); - RangeResult results = wait(tr.getRange(configKeys, CLIENT_KNOBS->TOO_MANY)); - ASSERT(!results.more && results.size() < CLIENT_KNOBS->TOO_MANY); - - DatabaseConfiguration conf; - conf.fromKeyValues((VectorRef)results); - TraceEvent("ConfigurationMonitor", self->dbgid).detail("ClusterRecoveryState", self->recoveryState); - if (conf != self->configuration) { - if (self->recoveryState != RecoveryState::ALL_LOGS_RECRUITED && - self->recoveryState != RecoveryState::FULLY_RECOVERED) { - self->controllerData->shouldCommitSuicide = true; - throw restart_cluster_controller(); - } - - self->configuration = conf; - self->registrationTrigger.trigger(); - } - - state Future watchFuture = - tr.watch(moveKeysLockOwnerKey) || tr.watch(excludedServersVersionKey) || - tr.watch(failedServersVersionKey) || tr.watch(excludedLocalityVersionKey) || - tr.watch(failedLocalityVersionKey); - wait(tr.commit()); - wait(watchFuture); - break; - } catch (Error& e) { - wait(tr.onError(e)); - } - } - } -} - -ACTOR static Future> 
getMinBackupVersion(Reference self, Database cx) { - loop { - state ReadYourWritesTransaction tr(cx); - - try { - tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); - tr.setOption(FDBTransactionOptions::LOCK_AWARE); - Optional value = wait(tr.get(backupStartedKey)); - Optional minVersion; - if (value.present()) { - auto uidVersions = decodeBackupStartedValue(value.get()); - TraceEvent e("GotBackupStartKey", self->dbgid); - int i = 1; - for (auto [uid, version] : uidVersions) { - e.detail(format("BackupID%d", i), uid).detail(format("Version%d", i), version); - i++; - minVersion = minVersion.present() ? std::min(version, minVersion.get()) : version; - } - } else { - TraceEvent("EmptyBackupStartKey", self->dbgid).log(); - } - return minVersion; - - } catch (Error& e) { - wait(tr.onError(e)); - } - } -} - -ACTOR static Future recruitBackupWorkers(Reference self, Database cx) { - ASSERT(self->backupWorkers.size() > 0); - - // Avoid race between a backup worker's save progress and the reads below. - wait(delay(SERVER_KNOBS->SECONDS_BEFORE_RECRUIT_BACKUP_WORKER)); - - state LogEpoch epoch = self->cstate.myDBState.recoveryCount; - state Reference backupProgress( - new BackupProgress(self->dbgid, self->logSystem->getOldEpochTagsVersionsInfo())); - state Future gotProgress = getBackupProgress(cx, self->dbgid, backupProgress, /*logging=*/true); - state std::vector> initializationReplies; - - state std::vector> idsTags; // worker IDs and tags for current epoch - state int logRouterTags = self->logSystem->getLogRouterTags(); - idsTags.reserve(logRouterTags); - for (int i = 0; i < logRouterTags; i++) { - idsTags.emplace_back(deterministicRandom()->randomUniqueID(), Tag(tagLocalityLogRouter, i)); - } - - const Version startVersion = self->logSystem->getBackupStartVersion(); - state int i = 0; - for (; i < logRouterTags; i++) { - const auto& worker = self->backupWorkers[i % self->backupWorkers.size()]; - InitializeBackupRequest req(idsTags[i].first); - req.recruitedEpoch = epoch; - req.backupEpoch = epoch; - req.routerTag = idsTags[i].second; - req.totalTags = logRouterTags; - req.startVersion = startVersion; - TraceEvent("BackupRecruitment", self->dbgid) - .detail("RequestID", req.reqId) - .detail("Tag", req.routerTag.toString()) - .detail("Epoch", epoch) - .detail("BackupEpoch", epoch) - .detail("StartVersion", req.startVersion); - initializationReplies.push_back( - transformErrors(throwErrorOr(worker.backup.getReplyUnlessFailedFor( - req, SERVER_KNOBS->BACKUP_TIMEOUT, SERVER_KNOBS->MASTER_FAILURE_SLOPE_DURING_RECOVERY)), - backup_worker_failed())); - } - - state Future> fMinVersion = getMinBackupVersion(self, cx); - wait(gotProgress && success(fMinVersion)); - TraceEvent("MinBackupVersion", self->dbgid).detail("Version", fMinVersion.get().present() ? fMinVersion.get() : -1); - - std::map, std::map> toRecruit = - backupProgress->getUnfinishedBackup(); - for (const auto& [epochVersionTags, tagVersions] : toRecruit) { - const Version oldEpochEnd = std::get<1>(epochVersionTags); - if (!fMinVersion.get().present() || fMinVersion.get().get() + 1 >= oldEpochEnd) { - TraceEvent("SkipBackupRecruitment", self->dbgid) - .detail("MinVersion", fMinVersion.get().present() ? 
fMinVersion.get() : -1) - .detail("Epoch", epoch) - .detail("OldEpoch", std::get<0>(epochVersionTags)) - .detail("OldEpochEnd", oldEpochEnd); - continue; - } - for (const auto& [tag, version] : tagVersions) { - const auto& worker = self->backupWorkers[i % self->backupWorkers.size()]; - i++; - InitializeBackupRequest req(deterministicRandom()->randomUniqueID()); - req.recruitedEpoch = epoch; - req.backupEpoch = std::get<0>(epochVersionTags); - req.routerTag = tag; - req.totalTags = std::get<2>(epochVersionTags); - req.startVersion = version; // savedVersion + 1 - req.endVersion = std::get<1>(epochVersionTags) - 1; - TraceEvent("BackupRecruitment", self->dbgid) - .detail("RequestID", req.reqId) - .detail("Tag", req.routerTag.toString()) - .detail("Epoch", epoch) - .detail("BackupEpoch", req.backupEpoch) - .detail("StartVersion", req.startVersion) - .detail("EndVersion", req.endVersion.get()); - initializationReplies.push_back(transformErrors( - throwErrorOr(worker.backup.getReplyUnlessFailedFor( - req, SERVER_KNOBS->BACKUP_TIMEOUT, SERVER_KNOBS->MASTER_FAILURE_SLOPE_DURING_RECOVERY)), - backup_worker_failed())); - } - } - - std::vector newRecruits = wait(getAll(initializationReplies)); - self->logSystem->setBackupWorkers(newRecruits); - TraceEvent("BackupRecruitmentDone", self->dbgid).log(); - self->registrationTrigger.trigger(); - return Void(); -} - -ACTOR Future updateLogsValue(Reference self, Database cx) { - state Transaction tr(cx); - loop { - try { - Optional> value = wait(tr.get(logsKey)); - ASSERT(value.present()); - auto logs = decodeLogsValue(value.get()); - - std::set logIds; - for (auto& log : logs.first) { - logIds.insert(log.first); - } - - bool found = false; - for (auto& logSet : self->logSystem->getLogSystemConfig().tLogs) { - for (auto& log : logSet.tLogs) { - if (logIds.count(log.id())) { - found = true; - break; - } - } - if (found) { - break; - } - } - - if (!found) { - TEST(true); // old master attempted to change logsKey - return Void(); - } - - tr.set(logsKey, self->logSystem->getLogsValue()); - wait(tr.commit()); - return Void(); - } catch (Error& e) { - wait(tr.onError(e)); - } - } -} - -// TODO(ahusain): ClusterController orchestrating recovery, self message can be avoided. 
-Future sendMasterRegistration(ClusterRecoveryData* self, - LogSystemConfig const& logSystemConfig, - std::vector commitProxies, - std::vector grvProxies, - std::vector resolvers, - DBRecoveryCount recoveryCount, - std::vector priorCommittedLogServers) { - RegisterMasterRequest masterReq; - masterReq.id = self->masterInterface.id(); - masterReq.mi = self->masterInterface.locality; - masterReq.logSystemConfig = logSystemConfig; - masterReq.commitProxies = commitProxies; - masterReq.grvProxies = grvProxies; - masterReq.resolvers = resolvers; - masterReq.recoveryCount = recoveryCount; - if (self->hasConfiguration) - masterReq.configuration = self->configuration; - masterReq.registrationCount = ++self->registrationCount; - masterReq.priorCommittedLogServers = priorCommittedLogServers; - masterReq.recoveryState = self->recoveryState; - masterReq.recoveryStalled = self->recruitmentStalled->get(); - masterReq.clusterId = self->clusterId; - return brokenPromiseToNever(self->clusterController.registerMaster.getReply(masterReq)); -} - -ACTOR Future updateRegistration(Reference self, Reference logSystem) { - state Database cx = openDBOnServer(self->dbInfo, TaskPriority::DefaultEndpoint, LockAware::True); - state Future trigger = self->registrationTrigger.onTrigger(); - state Future updateLogsKey; - - loop { - wait(trigger); - wait(delay(.001)); // Coalesce multiple changes - - trigger = self->registrationTrigger.onTrigger(); - - auto logSystemConfig = logSystem->getLogSystemConfig(); - TraceEvent("UpdateRegistration", self->dbgid) - .detail("RecoveryCount", self->cstate.myDBState.recoveryCount) - .detail("OldestBackupEpoch", logSystemConfig.oldestBackupEpoch) - .detail("Logs", describe(logSystemConfig.tLogs)) - .detail("CStateUpdated", self->cstateUpdated.isSet()) - .detail("RecoveryTxnVersion", self->recoveryTransactionVersion) - .detail("LastEpochEnd", self->lastEpochEnd); - - if (!self->cstateUpdated.isSet()) { - wait(sendMasterRegistration(self.getPtr(), - logSystemConfig, - self->provisionalCommitProxies, - self->provisionalGrvProxies, - self->resolvers, - self->cstate.myDBState.recoveryCount, - self->cstate.prevDBState.getPriorCommittedLogServers())); - - } else if (self->recoveryState >= RecoveryState::ACCEPTING_COMMITS) { - updateLogsKey = updateLogsValue(self, cx); - wait(sendMasterRegistration(self.getPtr(), - logSystemConfig, - self->commitProxies, - self->grvProxies, - self->resolvers, - self->cstate.myDBState.recoveryCount, - std::vector())); - } else { - // The cluster should enter the accepting commits phase soon, and then we will register again - TEST(true); // cstate is updated but we aren't accepting commits yet - } - } -} - -ACTOR Future> provisionalMaster(Reference parent, - Future activate) { - wait(activate); - - // Register a fake commit proxy (to be provided right here) to make ourselves available to clients - parent->provisionalCommitProxies = std::vector(1); - parent->provisionalCommitProxies[0].provisional = true; - parent->provisionalCommitProxies[0].initEndpoints(); - parent->provisionalGrvProxies = std::vector(1); - parent->provisionalGrvProxies[0].provisional = true; - parent->provisionalGrvProxies[0].initEndpoints(); - state Future waitCommitProxyFailure = - waitFailureServer(parent->provisionalCommitProxies[0].waitFailure.getFuture()); - state Future waitGrvProxyFailure = - waitFailureServer(parent->provisionalGrvProxies[0].waitFailure.getFuture()); - parent->registrationTrigger.trigger(); - - auto lockedKey = parent->txnStateStore->readValue(databaseLockedKey).get(); 
- state bool locked = lockedKey.present() && lockedKey.get().size(); - - state Optional metadataVersion = parent->txnStateStore->readValue(metadataVersionKey).get(); - - // We respond to a minimal subset of the commit proxy protocol. Our sole purpose is to receive a single write-only - // transaction which might repair our configuration, and return it. - loop choose { - when(GetReadVersionRequest req = - waitNext(parent->provisionalGrvProxies[0].getConsistentReadVersion.getFuture())) { - if ((req.flags & GetReadVersionRequest::FLAG_CAUSAL_READ_RISKY) && - (req.flags & GetReadVersionRequest::FLAG_USE_PROVISIONAL_PROXIES) && parent->lastEpochEnd) { - GetReadVersionReply rep; - rep.version = parent->lastEpochEnd; - rep.locked = locked; - rep.metadataVersion = metadataVersion; - req.reply.send(rep); - } else - req.reply.send(Never()); // We can't perform causally consistent reads without recovering - } - when(CommitTransactionRequest req = waitNext(parent->provisionalCommitProxies[0].commit.getFuture())) { - req.reply.send(Never()); // don't reply (clients always get commit_unknown_result) - auto t = &req.transaction; - if (t->read_snapshot == parent->lastEpochEnd && //< So no transactions can fall between the read snapshot - // and the recovery transaction this (might) be merged with - // vvv and also the changes we will make in the recovery - // transaction (most notably to lastEpochEndKey) BEFORE we - // merge initialConfChanges won't conflict - !std::any_of(t->read_conflict_ranges.begin(), t->read_conflict_ranges.end(), [](KeyRangeRef const& r) { - return r.contains(lastEpochEndKey); - })) { - for (auto m = t->mutations.begin(); m != t->mutations.end(); ++m) { - TraceEvent("PM_CTM", parent->dbgid) - .detail("MType", m->type) - .detail("Param1", m->param1) - .detail("Param2", m->param2); - if (isMetadataMutation(*m)) { - // We keep the mutations and write conflict ranges from this transaction, but not its read - // conflict ranges - Standalone out; - out.read_snapshot = invalidVersion; - out.mutations.append_deep(out.arena(), t->mutations.begin(), t->mutations.size()); - out.write_conflict_ranges.append_deep( - out.arena(), t->write_conflict_ranges.begin(), t->write_conflict_ranges.size()); - return out; - } - } - } - } - when(GetKeyServerLocationsRequest req = - waitNext(parent->provisionalCommitProxies[0].getKeyServersLocations.getFuture())) { - req.reply.send(Never()); - } - when(wait(waitCommitProxyFailure)) { throw worker_removed(); } - when(wait(waitGrvProxyFailure)) { throw worker_removed(); } - } -} - -ACTOR Future>> recruitEverything( - Reference self, - std::vector* seedServers, - Reference oldLogSystem) { - if (!self->configuration.isValid()) { - RecoveryStatus::RecoveryStatus status; - if (self->configuration.initialized) { - TraceEvent(SevWarn, "ClusterRecoveryInvalidConfiguration", self->dbgid) - .setMaxEventLength(11000) - .setMaxFieldLength(10000) - .detail("Conf", self->configuration.toString()); - status = RecoveryStatus::configuration_invalid; - } else if (!self->cstate.prevDBState.tLogs.size()) { - status = RecoveryStatus::configuration_never_created; - self->neverCreated = true; - } else { - status = RecoveryStatus::configuration_missing; - } - TraceEvent("ClusterRecoveryState", self->dbgid) - .detail("StatusCode", status) - .detail("Status", RecoveryStatus::names[status]) - .trackLatest(self->clusterRecoveryStateEventHolder->trackingKey); - return Never(); - } else - TraceEvent("ClusterRecoveryState", self->dbgid) - .detail("StatusCode", 
RecoveryStatus::recruiting_transaction_servers) - .detail("Status", RecoveryStatus::names[RecoveryStatus::recruiting_transaction_servers]) - .detail("Conf", self->configuration.toString()) - .detail("RequiredCommitProxies", 1) - .detail("RequiredGrvProxies", 1) - .detail("RequiredResolvers", 1) - .trackLatest(self->clusterRecoveryStateEventHolder->trackingKey); - - // FIXME: we only need log routers for the same locality as the master - int maxLogRouters = self->cstate.prevDBState.logRouterTags; - for (auto& old : self->cstate.prevDBState.oldTLogData) { - maxLogRouters = std::max(maxLogRouters, old.logRouterTags); - } - - RecruitFromConfigurationRequest recruitReq(self->configuration, self->lastEpochEnd == 0, maxLogRouters); - state Reference recruitWorkersInfo = makeReference(recruitReq); - recruitWorkersInfo->dbgId = self->dbgid; - wait(clusterRecruitFromConfiguration(self->controllerData, recruitWorkersInfo)); - state RecruitFromConfigurationReply recruits = recruitWorkersInfo->rep; - - std::string primaryDcIds, remoteDcIds; - - self->primaryDcId.clear(); - self->remoteDcIds.clear(); - if (recruits.dcId.present()) { - self->primaryDcId.push_back(recruits.dcId); - if (!primaryDcIds.empty()) { - primaryDcIds += ','; - } - primaryDcIds += printable(recruits.dcId); - if (self->configuration.regions.size() > 1) { - Key remoteDcId = recruits.dcId.get() == self->configuration.regions[0].dcId - ? self->configuration.regions[1].dcId - : self->configuration.regions[0].dcId; - self->remoteDcIds.push_back(remoteDcId); - if (!remoteDcIds.empty()) { - remoteDcIds += ','; - } - remoteDcIds += printable(remoteDcId); - } - } - self->backupWorkers.swap(recruits.backupWorkers); - - TraceEvent("ClusterRecoveryState", self->dbgid) - .detail("StatusCode", RecoveryStatus::initializing_transaction_servers) - .detail("Status", RecoveryStatus::names[RecoveryStatus::initializing_transaction_servers]) - .detail("CommitProxies", recruits.commitProxies.size()) - .detail("GrvProxies", recruits.grvProxies.size()) - .detail("TLogs", recruits.tLogs.size()) - .detail("Resolvers", recruits.resolvers.size()) - .detail("SatelliteTLogs", recruits.satelliteTLogs.size()) - .detail("OldLogRouters", recruits.oldLogRouters.size()) - .detail("StorageServers", recruits.storageServers.size()) - .detail("BackupWorkers", self->backupWorkers.size()) - .detail("PrimaryDcIds", primaryDcIds) - .detail("RemoteDcIds", remoteDcIds) - .trackLatest(self->clusterRecoveryStateEventHolder->trackingKey); - - // Actually, newSeedServers does both the recruiting and initialization of the seed servers; so if this is a brand - // new database we are sort of lying that we are past the recruitment phase. In a perfect world we would split that - // up so that the recruitment part happens above (in parallel with recruiting the transaction servers?). - wait(newSeedServers(self, recruits, seedServers)); - state std::vector> confChanges; - wait(newCommitProxies(self, recruits) && newGrvProxies(self, recruits) && newResolvers(self, recruits) && - newTLogServers(self, recruits, oldLogSystem, &confChanges)); - - // Update recovery related information to the newly elected sequencer (master) process. 
- wait(brokenPromiseToNever(self->masterInterface.updateRecoveryData.getReply( - UpdateRecoveryDataRequest(self->recoveryTransactionVersion, self->lastEpochEnd, self->commitProxies)))); - - return confChanges; -} - -ACTOR Future updateLocalityForDcId(Optional dcId, - Reference oldLogSystem, - Reference> locality) { - loop { - std::pair loc = oldLogSystem->getLogSystemConfig().getLocalityForDcId(dcId); - Version ver = locality->get().knownCommittedVersion; - if (ver == invalidVersion) { - ver = oldLogSystem->getKnownCommittedVersion(); - } - locality->set(PeekTxsInfo(loc.first, loc.second, ver)); - TraceEvent("UpdatedLocalityForDcId") - .detail("DcId", dcId) - .detail("Locality0", loc.first) - .detail("Locality1", loc.second) - .detail("Version", ver); - wait(oldLogSystem->onLogSystemConfigChange() || oldLogSystem->onKnownCommittedVersionChange()); - } -} - -ACTOR Future readTransactionSystemState(Reference self, - Reference oldLogSystem, - Version txsPoppedVersion) { - state Reference> myLocality = Reference>( - new AsyncVar(PeekTxsInfo(tagLocalityInvalid, tagLocalityInvalid, invalidVersion))); - state Future localityUpdater = - updateLocalityForDcId(self->masterInterface.locality.dcId(), oldLogSystem, myLocality); - // Peek the txnStateTag in oldLogSystem and recover self->txnStateStore - - // For now, we also obtain the recovery metadata that the log system obtained during the end_epoch process for - // comparison - - // Sets self->lastEpochEnd and self->recoveryTransactionVersion - // Sets self->configuration to the configuration (FF/conf/ keys) at self->lastEpochEnd - - // Recover transaction state store - if (self->txnStateStore) - self->txnStateStore->close(); - self->txnStateLogAdapter = openDiskQueueAdapter(oldLogSystem, myLocality, txsPoppedVersion); - self->txnStateStore = - keyValueStoreLogSystem(self->txnStateLogAdapter, self->dbgid, self->memoryLimit, false, false, true); - - // Versionstamped operations (particularly those applied from DR) define a minimum commit version - // that we may recover to, as they embed the version in user-readable data and require that no - // transactions will be committed at a lower version. 
- Optional> requiredCommitVersion = - wait(self->txnStateStore->readValue(minRequiredCommitVersionKey)); - - Version minRequiredCommitVersion = -1; - if (requiredCommitVersion.present()) { - minRequiredCommitVersion = BinaryReader::fromStringRef(requiredCommitVersion.get(), Unversioned()); - } - - // Recover version info - self->lastEpochEnd = oldLogSystem->getEnd() - 1; - if (self->lastEpochEnd == 0) { - self->recoveryTransactionVersion = 1; - } else { - if (self->forceRecovery) { - self->recoveryTransactionVersion = self->lastEpochEnd + SERVER_KNOBS->MAX_VERSIONS_IN_FLIGHT_FORCED; - } else { - self->recoveryTransactionVersion = self->lastEpochEnd + SERVER_KNOBS->MAX_VERSIONS_IN_FLIGHT; - } - - if (BUGGIFY) { - self->recoveryTransactionVersion += - deterministicRandom()->randomInt64(0, SERVER_KNOBS->MAX_VERSIONS_IN_FLIGHT); - } - if (self->recoveryTransactionVersion < minRequiredCommitVersion) - self->recoveryTransactionVersion = minRequiredCommitVersion; - } - - TraceEvent("ClusterRecovering", self->dbgid) - .detail("LastEpochEnd", self->lastEpochEnd) - .detail("RecoveryTransactionVersion", self->recoveryTransactionVersion); - - RangeResult rawConf = wait(self->txnStateStore->readRange(configKeys)); - self->configuration.fromKeyValues(rawConf.castTo>()); - self->originalConfiguration = self->configuration; - self->hasConfiguration = true; - - TraceEvent("ClusterRecoveredConfig", self->dbgid) - .setMaxEventLength(11000) - .setMaxFieldLength(10000) - .detail("Conf", self->configuration.toString()) - .trackLatest(self->recoveredConfigEventHolder->trackingKey); - - RangeResult rawLocalities = wait(self->txnStateStore->readRange(tagLocalityListKeys)); - self->dcId_locality.clear(); - for (auto& kv : rawLocalities) { - self->dcId_locality[decodeTagLocalityListKey(kv.key)] = decodeTagLocalityListValue(kv.value); - } - - RangeResult rawTags = wait(self->txnStateStore->readRange(serverTagKeys)); - self->allTags.clear(); - if (self->lastEpochEnd > 0) { - self->allTags.push_back(cacheTag); - } - - if (self->forceRecovery) { - self->safeLocality = oldLogSystem->getLogSystemConfig().tLogs[0].locality; - for (auto& kv : rawTags) { - Tag tag = decodeServerTagValue(kv.value); - if (tag.locality == self->safeLocality) { - self->allTags.push_back(tag); - } - } - } else { - for (auto& kv : rawTags) { - self->allTags.push_back(decodeServerTagValue(kv.value)); - } - } - - RangeResult rawHistoryTags = wait(self->txnStateStore->readRange(serverTagHistoryKeys)); - for (auto& kv : rawHistoryTags) { - self->allTags.push_back(decodeServerTagValue(kv.value)); - } - - uniquify(self->allTags); - - // auto kvs = self->txnStateStore->readRange( systemKeys ); - // for( auto & kv : kvs.get() ) - // TraceEvent("ClusterRecoveredTXS", self->dbgid).detail("K", kv.key).detail("V", kv.value); - - self->txnStateLogAdapter->setNextVersion( - oldLogSystem->getEnd()); //< FIXME: (1) the log adapter should do this automatically after recovery; (2) if we - // make KeyValueStoreMemory guarantee immediate reads, we should be able to get rid of - // the discardCommit() below and not need a writable log adapter - - TraceEvent("RTSSComplete", self->dbgid).log(); - - return Void(); -} - -ACTOR Future sendInitialCommitToResolvers(Reference self) { - state KeyRange txnKeys = allKeys; - state Sequence txnSequence = 0; - ASSERT(self->recoveryTransactionVersion); - - state RangeResult data = - self->txnStateStore - ->readRange(txnKeys, BUGGIFY ? 
3 : SERVER_KNOBS->DESIRED_TOTAL_BYTES, SERVER_KNOBS->DESIRED_TOTAL_BYTES) - .get(); - state std::vector> txnReplies; - state int64_t dataOutstanding = 0; - - state std::vector endpoints; - for (auto& it : self->commitProxies) { - endpoints.push_back(it.txnState.getEndpoint()); - } - - loop { - if (!data.size()) - break; - ((KeyRangeRef&)txnKeys) = KeyRangeRef(keyAfter(data.back().key, txnKeys.arena()), txnKeys.end); - RangeResult nextData = - self->txnStateStore - ->readRange(txnKeys, BUGGIFY ? 3 : SERVER_KNOBS->DESIRED_TOTAL_BYTES, SERVER_KNOBS->DESIRED_TOTAL_BYTES) - .get(); - - TxnStateRequest req; - req.arena = data.arena(); - req.data = data; - req.sequence = txnSequence; - req.last = !nextData.size(); - req.broadcastInfo = endpoints; - txnReplies.push_back(broadcastTxnRequest(req, SERVER_KNOBS->TXN_STATE_SEND_AMOUNT, false)); - dataOutstanding += SERVER_KNOBS->TXN_STATE_SEND_AMOUNT * data.arena().getSize(); - data = nextData; - txnSequence++; - - if (dataOutstanding > SERVER_KNOBS->MAX_TXS_SEND_MEMORY) { - wait(waitForAll(txnReplies)); - txnReplies = std::vector>(); - dataOutstanding = 0; - } - - wait(yield()); - } - wait(waitForAll(txnReplies)); - TraceEvent("RecoveryInternal", self->dbgid) - .detail("StatusCode", RecoveryStatus::recovery_transaction) - .detail("Status", RecoveryStatus::names[RecoveryStatus::recovery_transaction]) - .detail("RecoveryTxnVersion", self->recoveryTransactionVersion) - .detail("LastEpochEnd", self->lastEpochEnd) - .detail("Step", "SentTxnStateStoreToCommitProxies"); - - std::vector> replies; - for (auto& r : self->resolvers) { - ResolveTransactionBatchRequest req; - req.prevVersion = -1; - req.version = self->lastEpochEnd; - req.lastReceivedVersion = -1; - - replies.push_back(brokenPromiseToNever(r.resolve.getReply(req))); - } - - wait(waitForAll(replies)); - TraceEvent("RecoveryInternal", self->dbgid) - .detail("StatusCode", RecoveryStatus::recovery_transaction) - .detail("Status", RecoveryStatus::names[RecoveryStatus::recovery_transaction]) - .detail("RecoveryTxnVersion", self->recoveryTransactionVersion) - .detail("LastEpochEnd", self->lastEpochEnd) - .detail("Step", "InitializedAllResolvers"); - return Void(); -} - -ACTOR Future triggerUpdates(Reference self, Reference oldLogSystem) { - loop { - wait(oldLogSystem->onLogSystemConfigChange() || self->cstate.fullyRecovered.getFuture() || - self->recruitmentStalled->onChange()); - if (self->cstate.fullyRecovered.isSet()) - return Void(); - - self->registrationTrigger.trigger(); - } -} - -ACTOR Future discardCommit(IKeyValueStore* store, LogSystemDiskQueueAdapter* adapter) { - state Future fcm = adapter->getCommitMessage(); - state Future committed = store->commit(); - LogSystemDiskQueueAdapter::CommitMessage cm = wait(fcm); - ASSERT(!committed.isReady()); - cm.acknowledge.send(Void()); - ASSERT(committed.isReady()); - return Void(); -} - -void updateConfigForForcedRecovery(Reference self, - std::vector>* initialConfChanges) { - bool regionsChanged = false; - for (auto& it : self->configuration.regions) { - if (it.dcId == self->controllerData->clusterControllerDcId.get() && it.priority < 0) { - it.priority = 1; - regionsChanged = true; - } else if (it.dcId != self->controllerData->clusterControllerDcId.get() && it.priority >= 0) { - it.priority = -1; - regionsChanged = true; - } - } - Standalone regionCommit; - regionCommit.mutations.push_back_deep( - regionCommit.arena(), - MutationRef(MutationRef::SetValue, configKeysPrefix.toString() + "usable_regions", LiteralStringRef("1"))); - 
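The removed sendInitialCommitToResolvers above pages through the transaction state store in bounded chunks and caps how many bytes of unacknowledged broadcasts are outstanding before waiting for replies. A simplified, single-threaded sketch of that flow-control pattern over plain data, with hypothetical chunk and memory limits standing in for the TXN_STATE_SEND_AMOUNT and MAX_TXS_SEND_MEMORY knobs:

    #include <algorithm>
    #include <cstddef>
    #include <iostream>
    #include <vector>

    // Toy stand-ins for the knob values used by the real code.
    constexpr std::size_t kChunkBytes = 4096;
    constexpr std::size_t kMaxOutstandingBytes = 16384;

    // Pretend "broadcast" just reports the chunk size; the real code sends a
    // TxnStateRequest to every commit proxy and returns a future per chunk.
    std::size_t broadcastChunk(std::size_t chunkBytes) { return chunkBytes; }

    void sendInChunks(const std::vector<char>& data) {
        std::size_t outstanding = 0;
        for (std::size_t off = 0; off < data.size(); off += kChunkBytes) {
            std::size_t len = std::min(kChunkBytes, data.size() - off);
            outstanding += broadcastChunk(len);
            if (outstanding > kMaxOutstandingBytes) {
                // The real code waits for all outstanding replies here before sending more.
                std::cout << "draining " << outstanding << " outstanding bytes\n";
                outstanding = 0;
            }
        }
        std::cout << "final drain of " << outstanding << " bytes\n";
    }

    int main() {
        std::vector<char> txnState(50000, 'x'); // hypothetical serialized txn state
        sendInChunks(txnState);
        return 0;
    }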
self->configuration.applyMutation(regionCommit.mutations.back()); - if (regionsChanged) { - std::sort( - self->configuration.regions.begin(), self->configuration.regions.end(), RegionInfo::sort_by_priority()); - StatusObject regionJSON; - regionJSON["regions"] = self->configuration.getRegionJSON(); - regionCommit.mutations.push_back_deep( - regionCommit.arena(), - MutationRef(MutationRef::SetValue, - configKeysPrefix.toString() + "regions", - BinaryWriter::toValue(regionJSON, IncludeVersion(ProtocolVersion::withRegionConfiguration())) - .toString())); - self->configuration.applyMutation( - regionCommit.mutations.back()); // modifying the configuration directly does not change the configuration - // when it is re-serialized unless we call applyMutation - TraceEvent("ForcedRecoveryConfigChange", self->dbgid) - .setMaxEventLength(11000) - .setMaxFieldLength(10000) - .detail("Conf", self->configuration.toString()); - } - initialConfChanges->push_back(regionCommit); -} - -ACTOR Future recoverFrom(Reference self, - Reference oldLogSystem, - std::vector* seedServers, - std::vector>* initialConfChanges, - Future poppedTxsVersion, - bool* clusterIdExists) { - TraceEvent("ClusterRecoveryState", self->dbgid) - .detail("StatusCode", RecoveryStatus::reading_transaction_system_state) - .detail("Status", RecoveryStatus::names[RecoveryStatus::reading_transaction_system_state]) - .trackLatest(self->clusterRecoveryStateEventHolder->trackingKey); - self->hasConfiguration = false; - - if (BUGGIFY) - wait(delay(10.0)); - - Version txsPoppedVersion = wait(poppedTxsVersion); - wait(readTransactionSystemState(self, oldLogSystem, txsPoppedVersion)); - for (auto& itr : *initialConfChanges) { - for (auto& m : itr.mutations) { - self->configuration.applyMutation(m); - } - } - - if (self->forceRecovery) { - updateConfigForForcedRecovery(self, initialConfChanges); - } - - debug_checkMaxRestoredVersion(UID(), self->lastEpochEnd, "DBRecovery"); - - // Generate a cluster ID to uniquely identify the cluster if it doesn't - // already exist in the txnStateStore. - Optional clusterId = self->txnStateStore->readValue(clusterIdKey).get(); - *clusterIdExists = clusterId.present(); - if (!clusterId.present()) { - self->clusterId = deterministicRandom()->randomUniqueID(); - } else { - self->clusterId = BinaryReader::fromStringRef(clusterId.get(), Unversioned()); - } - - // Ordinarily we pass through this loop once and recover. We go around the loop if recovery stalls for more than a - // second, a provisional master is initialized, and an "emergency transaction" is submitted that might change the - // configuration so that we can finish recovery. 
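The comment ending the removed block above describes the retry loop that follows: if recruitment stalls, a provisional master is started after a delay that grows on each pass and is capped at a maximum. A sketch of that capped geometric backoff, with illustrative values in place of the PROVISIONAL_START_DELAY, PROVISIONAL_DELAY_GROWTH, and PROVISIONAL_MAX_DELAY knobs:

    #include <algorithm>
    #include <cstdio>

    // Illustrative values; the real ones are server knobs.
    constexpr double kStartDelay = 1.0;
    constexpr double kGrowth = 1.5;
    constexpr double kMaxDelay = 60.0;

    int main() {
        double delay = kStartDelay;
        for (int attempt = 0; attempt < 12; ++attempt) {
            std::printf("attempt %d: start provisional master after %.2fs\n", attempt, delay);
            delay = std::min(kMaxDelay, delay * kGrowth); // grow, but never past the cap
        }
        return 0;
    }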
- - state std::map, int8_t> originalLocalityMap = self->dcId_locality; - state Future>> recruitments = - recruitEverything(self, seedServers, oldLogSystem); - state double provisionalDelay = SERVER_KNOBS->PROVISIONAL_START_DELAY; - loop { - state Future> provisional = provisionalMaster(self, delay(provisionalDelay)); - provisionalDelay = - std::min(SERVER_KNOBS->PROVISIONAL_MAX_DELAY, provisionalDelay * SERVER_KNOBS->PROVISIONAL_DELAY_GROWTH); - choose { - when(std::vector> confChanges = wait(recruitments)) { - initialConfChanges->insert(initialConfChanges->end(), confChanges.begin(), confChanges.end()); - provisional.cancel(); - break; - } - when(Standalone _req = wait(provisional)) { - state Standalone req = _req; // mutable - TEST(true); // Emergency transaction processing during recovery - TraceEvent("EmergencyTransaction", self->dbgid).log(); - for (auto m = req.mutations.begin(); m != req.mutations.end(); ++m) - TraceEvent("EmergencyTransactionMutation", self->dbgid) - .detail("MType", m->type) - .detail("P1", m->param1) - .detail("P2", m->param2); - - DatabaseConfiguration oldConf = self->configuration; - self->configuration = self->originalConfiguration; - for (auto& m : req.mutations) - self->configuration.applyMutation(m); - - initialConfChanges->clear(); - if (self->originalConfiguration.isValid() && - self->configuration.usableRegions != self->originalConfiguration.usableRegions) { - TraceEvent(SevWarnAlways, "CannotChangeUsableRegions", self->dbgid).log(); - self->configuration = self->originalConfiguration; - } else { - initialConfChanges->push_back(req); - } - if (self->forceRecovery) { - updateConfigForForcedRecovery(self, initialConfChanges); - } - - if (self->configuration != oldConf) { // confChange does not trigger when including servers - self->dcId_locality = originalLocalityMap; - recruitments = recruitEverything(self, seedServers, oldLogSystem); - } - } - } - - provisional.cancel(); - } - - return Void(); -} - -ACTOR Future clusterRecoveryCore(Reference self) { - state TraceInterval recoveryInterval("ClusterRecovery"); - state double recoverStartTime = now(); - - self->addActor.send(waitFailureServer(self->masterInterface.waitFailure.getFuture())); - - TraceEvent(recoveryInterval.begin(), self->dbgid).log(); - - self->recoveryState = RecoveryState::READING_CSTATE; - TraceEvent("ClusterRecoveryState", self->dbgid) - .detail("StatusCode", RecoveryStatus::reading_coordinated_state) - .detail("Status", RecoveryStatus::names[RecoveryStatus::reading_coordinated_state]) - .trackLatest(self->clusterRecoveryStateEventHolder->trackingKey); - - wait(self->cstate.read()); - - self->recoveryState = RecoveryState::LOCKING_CSTATE; - TraceEvent("ClusterRecoveryState", self->dbgid) - .detail("StatusCode", RecoveryStatus::locking_coordinated_state) - .detail("Status", RecoveryStatus::names[RecoveryStatus::locking_coordinated_state]) - .detail("TLogs", self->cstate.prevDBState.tLogs.size()) - .detail("ActiveGenerations", self->cstate.myDBState.oldTLogData.size() + 1) - .detail("MyRecoveryCount", self->cstate.prevDBState.recoveryCount + 2) - .detail("ForceRecovery", self->forceRecovery) - .trackLatest(self->clusterRecoveryStateEventHolder->trackingKey); - // for (const auto& old : self->cstate.prevDBState.oldTLogData) { - // TraceEvent("BWReadCoreState", self->dbgid).detail("Epoch", old.epoch).detail("Version", old.epochEnd); - //} - - TraceEvent("ClusterRecoveryGenerations", self->dbgid) - .detail("ActiveGenerations", self->cstate.myDBState.oldTLogData.size() + 1) - 
.trackLatest(self->clusterRecoveryGenerationsEventHolder->trackingKey); - - if (self->cstate.myDBState.oldTLogData.size() > CLIENT_KNOBS->MAX_GENERATIONS_OVERRIDE) { - if (self->cstate.myDBState.oldTLogData.size() >= CLIENT_KNOBS->MAX_GENERATIONS) { - TraceEvent(SevError, "RecoveryStoppedTooManyOldGenerations") - .detail("OldGenerations", self->cstate.myDBState.oldTLogData.size()) - .detail("Reason", - "Recovery stopped because too many recoveries have happened since the last time the cluster " - "was fully_recovered. Set --knob-max-generations-override on your server processes to a value " - "larger than OldGenerations to resume recovery once the underlying problem has been fixed."); - wait(Future(Never())); - } else if (self->cstate.myDBState.oldTLogData.size() > CLIENT_KNOBS->RECOVERY_DELAY_START_GENERATION) { - TraceEvent(SevError, "RecoveryDelayedTooManyOldGenerations") - .detail("OldGenerations", self->cstate.myDBState.oldTLogData.size()) - .detail("Reason", - "Recovery is delayed because too many recoveries have happened since the last time the cluster " - "was fully_recovered. Set --knob-max-generations-override on your server processes to a value " - "larger than OldGenerations to resume recovery once the underlying problem has been fixed."); - wait(delay(CLIENT_KNOBS->RECOVERY_DELAY_SECONDS_PER_GENERATION * - (self->cstate.myDBState.oldTLogData.size() - CLIENT_KNOBS->RECOVERY_DELAY_START_GENERATION))); - } - if (g_network->isSimulated() && self->cstate.myDBState.oldTLogData.size() > CLIENT_KNOBS->MAX_GENERATIONS_SIM) { - g_simulator.connectionFailuresDisableDuration = 1e6; - g_simulator.speedUpSimulation = true; - TraceEvent(SevWarnAlways, "DisableConnectionFailures_TooManyGenerations").log(); - } - } - - state Reference>> oldLogSystems(new AsyncVar>); - state Future recoverAndEndEpoch = - ILogSystem::recoverAndEndEpoch(oldLogSystems, - self->dbgid, - self->cstate.prevDBState, - self->clusterController.tlogRejoin.getFuture(), - self->controllerData->db.serverInfo->get().myLocality, - std::addressof(self->forceRecovery)); - - DBCoreState newState = self->cstate.myDBState; - newState.recoveryCount++; - wait(self->cstate.write(newState) || recoverAndEndEpoch); - - self->recoveryState = RecoveryState::RECRUITING; - - state std::vector seedServers; - state std::vector> initialConfChanges; - state Future logChanges; - state Future minRecoveryDuration; - state Future poppedTxsVersion; - state bool clusterIdExists = false; - - loop { - Reference oldLogSystem = oldLogSystems->get(); - if (oldLogSystem) { - logChanges = triggerUpdates(self, oldLogSystem); - if (!minRecoveryDuration.isValid()) { - minRecoveryDuration = delay(SERVER_KNOBS->ENFORCED_MIN_RECOVERY_DURATION); - poppedTxsVersion = oldLogSystem->getTxsPoppedVersion(); - } - } - - state Future reg = oldLogSystem ? updateRegistration(self, oldLogSystem) : Never(); - self->registrationTrigger.trigger(); - - choose { - when(wait(oldLogSystem ? 
recoverFrom(self, - oldLogSystem, - &seedServers, - &initialConfChanges, - poppedTxsVersion, - std::addressof(clusterIdExists)) - : Never())) { - reg.cancel(); - break; - } - when(wait(oldLogSystems->onChange())) {} - when(wait(reg)) { throw internal_error(); } - when(wait(recoverAndEndEpoch)) { throw internal_error(); } - } - } - - if (self->neverCreated) { - recoverStartTime = now(); - } - - recoverAndEndEpoch.cancel(); - - ASSERT(self->commitProxies.size() <= self->configuration.getDesiredCommitProxies()); - ASSERT(self->commitProxies.size() >= 1); - ASSERT(self->grvProxies.size() <= self->configuration.getDesiredGrvProxies()); - ASSERT(self->grvProxies.size() >= 1); - ASSERT(self->resolvers.size() <= self->configuration.getDesiredResolvers()); - ASSERT(self->resolvers.size() >= 1); - - self->recoveryState = RecoveryState::RECOVERY_TRANSACTION; - TraceEvent("ClusterRecoveryState", self->dbgid) - .detail("StatusCode", RecoveryStatus::recovery_transaction) - .detail("Status", RecoveryStatus::names[RecoveryStatus::recovery_transaction]) - .detail("PrimaryLocality", self->primaryLocality) - .detail("DcId", self->masterInterface.locality.dcId()) - .detail("ClusterId", self->clusterId) - .trackLatest(self->clusterRecoveryStateEventHolder->trackingKey); - - // Recovery transaction - state bool debugResult = debug_checkMinRestoredVersion(UID(), self->lastEpochEnd, "DBRecovery", SevWarn); - - CommitTransactionRequest recoveryCommitRequest; - recoveryCommitRequest.flags = recoveryCommitRequest.flags | CommitTransactionRequest::FLAG_IS_LOCK_AWARE; - CommitTransactionRef& tr = recoveryCommitRequest.transaction; - int mmApplied = 0; // The number of mutations in tr.mutations that have been applied to the txnStateStore so far - if (self->lastEpochEnd != 0) { - Optional snapRecoveryFlag = self->txnStateStore->readValue(writeRecoveryKey).get(); - TraceEvent("ClusterRecoverySnapshotCheck") - .detail("SnapRecoveryFlag", snapRecoveryFlag.present() ? snapRecoveryFlag.get().toString() : "N/A") - .detail("LastEpochEnd", self->lastEpochEnd); - if (snapRecoveryFlag.present()) { - TEST(true); // Recovering from snapshot, writing to snapShotEndVersionKey - BinaryWriter bw(Unversioned()); - tr.set(recoveryCommitRequest.arena, snapshotEndVersionKey, (bw << self->lastEpochEnd).toValue()); - // Pause the backups that got restored in this snapshot to avoid data corruption - // Requires further operational work to abort the backup - TraceEvent("ClusterRecoveryPauseBackupAgents").log(); - Key backupPauseKey = FileBackupAgent::getPauseKey(); - tr.set(recoveryCommitRequest.arena, backupPauseKey, StringRef()); - // Clear the key so multiple recoveries will not overwrite the first version recorded - tr.clear(recoveryCommitRequest.arena, singleKeyRange(writeRecoveryKey)); - } - if (self->forceRecovery) { - BinaryWriter bw(Unversioned()); - tr.set(recoveryCommitRequest.arena, killStorageKey, (bw << self->safeLocality).toValue()); - } - - // This transaction sets \xff/lastEpochEnd, which the shard servers can use to roll back speculatively - // processed semi-committed transactions from the previous epoch. - // It also guarantees the shard servers and tlog servers eventually get versions in the new epoch, which - // clients might rely on. 
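The removed comment just above explains that the recovery transaction writes \xff/lastEpochEnd so storage servers can roll back semi-committed work from the previous epoch. A toy model of that rollback, assuming a simple (version, mutation) buffer rather than FDB's actual storage server structures:

    #include <algorithm>
    #include <cassert>
    #include <cstdint>
    #include <string>
    #include <vector>

    using Version = int64_t;

    struct PendingMutation {
        Version version;
        std::string description;
    };

    // Drop speculatively processed mutations newer than the recovered epoch end.
    void rollBackTo(std::vector<PendingMutation>& pending, Version lastEpochEnd) {
        pending.erase(std::remove_if(pending.begin(),
                                     pending.end(),
                                     [&](const PendingMutation& m) { return m.version > lastEpochEnd; }),
                      pending.end());
    }

    int main() {
        std::vector<PendingMutation> pending = { { 100, "set a" }, { 105, "set b" }, { 110, "clear c" } };
        rollBackTo(pending, 105); // recovery published lastEpochEnd = 105
        assert(pending.size() == 2 && pending.back().version == 105);
        return 0;
    }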
- // This transaction is by itself in a batch (has its own version number), which simplifies storage servers - // slightly (they assume there are no modifications to serverKeys in the same batch) The proxy also expects the - // lastEpochEndKey mutation to be first in the transaction - BinaryWriter bw(Unversioned()); - tr.set(recoveryCommitRequest.arena, lastEpochEndKey, (bw << self->lastEpochEnd).toValue()); - - if (self->forceRecovery) { - tr.set(recoveryCommitRequest.arena, rebootWhenDurableKey, StringRef()); - tr.set(recoveryCommitRequest.arena, - moveKeysLockOwnerKey, - BinaryWriter::toValue(deterministicRandom()->randomUniqueID(), Unversioned())); - } - } else { - // Recruit and seed initial shard servers - // This transaction must be the very first one in the database (version 1) - seedShardServers(recoveryCommitRequest.arena, tr, seedServers); - } - // initialConfChanges have not been conflict checked against any earlier writes in the recovery transaction, so do - // this as early as possible in the recovery transaction but see above comments as to why it can't be absolutely - // first. Theoretically emergency transactions should conflict check against the lastEpochEndKey. - for (auto& itr : initialConfChanges) { - tr.mutations.append_deep(recoveryCommitRequest.arena, itr.mutations.begin(), itr.mutations.size()); - tr.write_conflict_ranges.append_deep( - recoveryCommitRequest.arena, itr.write_conflict_ranges.begin(), itr.write_conflict_ranges.size()); - } - - tr.set( - recoveryCommitRequest.arena, primaryLocalityKey, BinaryWriter::toValue(self->primaryLocality, Unversioned())); - tr.set(recoveryCommitRequest.arena, backupVersionKey, backupVersionValue); - tr.set(recoveryCommitRequest.arena, coordinatorsKey, self->coordinators.ccr->getConnectionString().toString()); - tr.set(recoveryCommitRequest.arena, logsKey, self->logSystem->getLogsValue()); - tr.set(recoveryCommitRequest.arena, - primaryDatacenterKey, - self->controllerData->clusterControllerDcId.present() ? self->controllerData->clusterControllerDcId.get() - : StringRef()); - - tr.clear(recoveryCommitRequest.arena, tLogDatacentersKeys); - for (auto& dc : self->primaryDcId) { - tr.set(recoveryCommitRequest.arena, tLogDatacentersKeyFor(dc), StringRef()); - } - if (self->configuration.usableRegions > 1) { - for (auto& dc : self->remoteDcIds) { - tr.set(recoveryCommitRequest.arena, tLogDatacentersKeyFor(dc), StringRef()); - } - } - - // Write cluster ID into txnStateStore if it is missing. 
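The removed code just below writes the cluster ID into the recovery commit only when it is missing, reusing the persisted value otherwise. A standalone sketch of that read-or-generate shape, with a toy key-value store and a string id standing in for the txnStateStore and the random UID used by the real code:

    #include <cstdio>
    #include <optional>
    #include <random>
    #include <string>

    // Toy key-value store standing in for the recovered txnStateStore.
    struct ToyStore {
        std::optional<std::string> clusterId;
        std::optional<std::string> read() const { return clusterId; }
        void write(const std::string& id) { clusterId = id; }
    };

    // Reuse the persisted id if present; otherwise mint one and persist it as
    // part of the recovery commit.
    std::string getOrCreateClusterId(ToyStore& store) {
        if (auto existing = store.read())
            return *existing;
        std::mt19937_64 rng{ std::random_device{}() };
        std::string fresh = "cluster-" + std::to_string(rng());
        store.write(fresh);
        return fresh;
    }

    int main() {
        ToyStore store;
        std::string first = getOrCreateClusterId(store);
        std::string second = getOrCreateClusterId(store); // stable across recoveries
        std::printf("%s == %s\n", first.c_str(), second.c_str());
        return first == second ? 0 : 1;
    }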
- if (!clusterIdExists) { - tr.set(recoveryCommitRequest.arena, clusterIdKey, BinaryWriter::toValue(self->clusterId, Unversioned())); - } - - applyMetadataMutations(SpanID(), - self->dbgid, - recoveryCommitRequest.arena, - tr.mutations.slice(mmApplied, tr.mutations.size()), - self->txnStateStore); - mmApplied = tr.mutations.size(); - - tr.read_snapshot = self->recoveryTransactionVersion; // lastEpochEnd would make more sense, but isn't in the initial - // window of the resolver(s) - - TraceEvent("ClusterRecoveryCommit", self->dbgid).log(); - state Future> recoveryCommit = self->commitProxies[0].commit.tryGetReply(recoveryCommitRequest); - self->addActor.send(self->logSystem->onError()); - self->addActor.send(waitResolverFailure(self->resolvers)); - self->addActor.send(waitCommitProxyFailure(self->commitProxies)); - self->addActor.send(waitGrvProxyFailure(self->grvProxies)); - self->addActor.send(reportErrors(updateRegistration(self, self->logSystem), "UpdateRegistration", self->dbgid)); - self->registrationTrigger.trigger(); - - wait(discardCommit(self->txnStateStore, self->txnStateLogAdapter)); - - // Wait for the recovery transaction to complete. - // SOMEDAY: For faster recovery, do this and setDBState asynchronously and don't wait for them - // unless we want to change TLogs - wait((success(recoveryCommit) && sendInitialCommitToResolvers(self))); - if (recoveryCommit.isReady() && recoveryCommit.get().isError()) { - TEST(true); // Cluster recovery failed because of the initial commit failed - throw cluster_recovery_failed(); - } - - ASSERT(self->recoveryTransactionVersion != 0); - - self->recoveryState = RecoveryState::WRITING_CSTATE; - TraceEvent("ClusterRecoveryState", self->dbgid) - .detail("StatusCode", RecoveryStatus::writing_coordinated_state) - .detail("Status", RecoveryStatus::names[RecoveryStatus::writing_coordinated_state]) - .detail("TLogList", self->logSystem->describe()) - .trackLatest(self->clusterRecoveryStateEventHolder->trackingKey); - - // Multiple masters prevent conflicts between themselves via CoordinatedState (self->cstate) - // 1. If SetMaster succeeds, then by CS's contract, these "new" Tlogs are the immediate - // successors of the "old" ones we are replacing - // 2. logSystem->recoverAndEndEpoch ensured that a co-quorum of the "old" tLogs were stopped at - // versions <= self->lastEpochEnd, so no versions > self->lastEpochEnd could be (fully) committed to them. - // 3. No other master will attempt to commit anything to our "new" Tlogs - // because it didn't recruit them - // 4. Therefore, no full commit can come between self->lastEpochEnd and the first commit - // we made to the new Tlogs (self->recoveryTransactionVersion), and only our own semi-commits can come between - // our first commit and the next new TLogs - - self->addActor.send(trackTlogRecovery(self, oldLogSystems, minRecoveryDuration)); - debug_advanceMaxCommittedVersion(UID(), self->recoveryTransactionVersion); - wait(self->cstateUpdated.getFuture()); - debug_advanceMinCommittedVersion(UID(), self->recoveryTransactionVersion); - - if (debugResult) { - TraceEvent(self->forceRecovery ? 
SevWarn : SevError, "DBRecoveryDurabilityError").log(); - } - - TraceEvent("ClusterRecoveryCommittedTLogs", self->dbgid) - .detail("TLogs", self->logSystem->describe()) - .detail("RecoveryCount", self->cstate.myDBState.recoveryCount) - .detail("RecoveryTransactionVersion", self->recoveryTransactionVersion); - - TraceEvent(recoveryInterval.end(), self->dbgid) - .detail("RecoveryTransactionVersion", self->recoveryTransactionVersion); - - self->recoveryState = RecoveryState::ACCEPTING_COMMITS; - double recoveryDuration = now() - recoverStartTime; - - TraceEvent((recoveryDuration > 4 && !g_network->isSimulated()) ? SevWarnAlways : SevInfo, - "ClusterRecoveryDuration", - self->dbgid) - .detail("RecoveryDuration", recoveryDuration) - .trackLatest(self->clusterRecoveryDurationEventHolder->trackingKey); - - TraceEvent("ClusterRecoveryState", self->dbgid) - .detail("StatusCode", RecoveryStatus::accepting_commits) - .detail("Status", RecoveryStatus::names[RecoveryStatus::accepting_commits]) - .detail("StoreType", self->configuration.storageServerStoreType) - .detail("RecoveryDuration", recoveryDuration) - .trackLatest(self->clusterRecoveryStateEventHolder->trackingKey); - - TraceEvent("ClusterRecoveryAvailable", self->dbgid) - .detail("AvailableAtVersion", self->version) - .trackLatest(self->clusterRecoveryAvailableEventHolder->trackingKey); - - if (self->resolvers.size() > 1) - self->addActor.send(resolutionBalancing(self)); - - self->addActor.send(changeCoordinators(self)); - Database cx = openDBOnServer(self->dbInfo, TaskPriority::DefaultEndpoint, LockAware::True); - self->addActor.send(configurationMonitor(self, cx)); - if (self->configuration.backupWorkerEnabled) { - self->addActor.send(recruitBackupWorkers(self, cx)); - } else { - self->logSystem->setOldestBackupEpoch(self->cstate.myDBState.recoveryCount); - } - - wait(Future(Never())); - throw internal_error(); -} - -ACTOR Future cleanupActorCollection(Reference self, bool exThrown) { - if (self.isValid()) { - wait(delay(0.0)); - - while (!self->addActor.isEmpty()) { - self->addActor.getFuture().pop(); - } - } - - return Void(); -} - -} // namespace ClusterControllerRecovery - // Wrapper for singleton interfaces template struct Singleton { @@ -5407,26 +3349,8 @@ struct BlobManagerSingleton : Singleton { void recruit(ClusterControllerData* cc) const { cc->recruitBlobManager.set(true); } }; -ACTOR Future handleLeaderReplacement(Reference self, - Future leaderFail) { - loop choose { - when(wait(leaderFail)) { - TraceEvent("LeaderReplaced", self->controllerData->id).log(); - // We are no longer the leader if this has changed. 
- self->controllerData->shouldCommitSuicide = true; - throw restart_cluster_controller(); - } - } -} - -ACTOR Future clusterWatchDatabase(ClusterControllerData* cluster, - ClusterControllerData::DBInfo* db, - ServerCoordinators coordinators, - Future leaderFail) { +ACTOR Future clusterWatchDatabase(ClusterControllerData* cluster, ClusterControllerData::DBInfo* db) { state MasterInterface iMaster; - state Reference recoveryData; - state PromiseStream> addActor; - state Future recoveryCore; // SOMEDAY: If there is already a non-failed master referenced by zkMasterInfo, use that one until it fails // When this someday is implemented, make sure forced failures still cause the master to be recruited again @@ -5435,130 +3359,102 @@ ACTOR Future clusterWatchDatabase(ClusterControllerData* cluster, TraceEvent("CCWDB", cluster->id).log(); try { state double recoveryStart = now(); - state MasterInterface newMaster; - state Future collection; - TraceEvent("CCWDB", cluster->id).detail("Recruiting", "Master"); - wait(ClusterControllerRecovery::recruitNewMaster(cluster, db, std::addressof(newMaster))); - iMaster = newMaster; + // We must recruit the master in the same data center as the cluster controller. + // This should always be possible, because we can recruit the master on the same process as the cluster + // controller. + std::map>, int> id_used; + id_used[cluster->clusterControllerProcessId]++; + state WorkerFitnessInfo masterWorker = cluster->getWorkerForRoleInDatacenter( + cluster->clusterControllerDcId, ProcessClass::Master, ProcessClass::NeverAssign, db->config, id_used); + if ((masterWorker.worker.processClass.machineClassFitness(ProcessClass::Master) > + SERVER_KNOBS->EXPECTED_MASTER_FITNESS || + masterWorker.worker.interf.locality.processId() == cluster->clusterControllerProcessId) && + !cluster->goodRecruitmentTime.isReady()) { + TraceEvent("CCWDB", cluster->id) + .detail("Fitness", masterWorker.worker.processClass.machineClassFitness(ProcessClass::Master)); + wait(delay(SERVER_KNOBS->ATTEMPT_RECRUITMENT_DELAY)); + continue; + } + RecruitMasterRequest rmq; + rmq.lifetime = db->serverInfo->get().masterLifetime; + rmq.forceRecovery = db->forceRecovery; - db->masterRegistrationCount = 0; - db->recoveryStalled = false; + cluster->masterProcessId = masterWorker.worker.interf.locality.processId(); + cluster->db.unfinishedRecoveries++; + state Future> fNewMaster = masterWorker.worker.interf.master.tryGetReply(rmq); + wait(ready(fNewMaster) || db->forceMasterFailure.onTrigger()); + if (fNewMaster.isReady() && fNewMaster.get().present()) { + TraceEvent("CCWDB", cluster->id).detail("Recruited", fNewMaster.get().get().id()); - auto dbInfo = ServerDBInfo(); - dbInfo.master = iMaster; - dbInfo.id = deterministicRandom()->randomUniqueID(); - dbInfo.infoGeneration = ++db->dbInfoCount; - dbInfo.masterLifetime = db->serverInfo->get().masterLifetime; - ++dbInfo.masterLifetime; - dbInfo.clusterInterface = db->serverInfo->get().clusterInterface; - dbInfo.distributor = db->serverInfo->get().distributor; - dbInfo.ratekeeper = db->serverInfo->get().ratekeeper; - dbInfo.blobManager = db->serverInfo->get().blobManager; - dbInfo.latencyBandConfig = db->serverInfo->get().latencyBandConfig; - dbInfo.myLocality = db->serverInfo->get().myLocality; - dbInfo.client = ClientDBInfo(); + // for status tool + TraceEvent("RecruitedMasterWorker", cluster->id) + .detail("Address", fNewMaster.get().get().address()) + .trackLatest(cluster->recruitedMasterWorkerEventHolder->trackingKey); - TraceEvent("CCWDB", cluster->id) - 
.detail("NewMaster", dbInfo.master.id().toString()) - .detail("Lifetime", dbInfo.masterLifetime.toString()) - .detail("ChangeID", dbInfo.id); - db->serverInfo->set(dbInfo); + iMaster = fNewMaster.get().get(); - state Future spinDelay = delay( - SERVER_KNOBS - ->MASTER_SPIN_DELAY); // Don't retry master recovery more than once per second, but don't delay - // the "first" recovery after more than a second of normal operation + db->masterRegistrationCount = 0; + db->recoveryStalled = false; - TraceEvent("CCWDB", cluster->id).detail("Watching", iMaster.id()); - recoveryData = - makeReference(cluster, - db->serverInfo, - db->serverInfo->get().master, - db->serverInfo->get().masterLifetime, - coordinators, - db->serverInfo->get().clusterInterface, - LiteralStringRef(""), - addActor, - db->forceRecovery); + auto dbInfo = ServerDBInfo(); + dbInfo.master = iMaster; + dbInfo.id = deterministicRandom()->randomUniqueID(); + dbInfo.infoGeneration = ++db->dbInfoCount; + dbInfo.masterLifetime = db->serverInfo->get().masterLifetime; + ++dbInfo.masterLifetime; + dbInfo.clusterInterface = db->serverInfo->get().clusterInterface; + dbInfo.distributor = db->serverInfo->get().distributor; + dbInfo.ratekeeper = db->serverInfo->get().ratekeeper; + dbInfo.blobManager = db->serverInfo->get().blobManager; + dbInfo.latencyBandConfig = db->serverInfo->get().latencyBandConfig; - collection = actorCollection(recoveryData->addActor.getFuture()); - recoveryCore = ClusterControllerRecovery::clusterRecoveryCore(recoveryData); + TraceEvent("CCWDB", cluster->id) + .detail("Lifetime", dbInfo.masterLifetime.toString()) + .detail("ChangeID", dbInfo.id); + db->serverInfo->set(dbInfo); - // Master failure detection is pretty sensitive, but if we are in the middle of a very long recovery we - // really don't want to have to start over - loop choose { - when(wait(recoveryCore)) {} - when(wait(waitFailureClient( - iMaster.waitFailure, - db->masterRegistrationCount - ? SERVER_KNOBS->MASTER_FAILURE_REACTION_TIME - : (now() - recoveryStart) * SERVER_KNOBS->MASTER_FAILURE_SLOPE_DURING_RECOVERY, - db->masterRegistrationCount ? -SERVER_KNOBS->MASTER_FAILURE_REACTION_TIME / - SERVER_KNOBS->SECONDS_BEFORE_NO_FAILURE_DELAY - : SERVER_KNOBS->MASTER_FAILURE_SLOPE_DURING_RECOVERY) || - db->forceMasterFailure.onTrigger())) { - break; - } - when(wait(db->serverInfo->onChange())) {} - when(BackupWorkerDoneRequest req = - waitNext(db->serverInfo->get().clusterInterface.notifyBackupWorkerDone.getFuture())) { - if (recoveryData->logSystem.isValid() && recoveryData->logSystem->removeBackupWorker(req)) { - recoveryData->registrationTrigger.trigger(); + state Future spinDelay = delay( + SERVER_KNOBS + ->MASTER_SPIN_DELAY); // Don't retry master recovery more than once per second, but don't delay + // the "first" recovery after more than a second of normal operation + + TraceEvent("CCWDB", cluster->id).detail("Watching", iMaster.id()); + + // Master failure detection is pretty sensitive, but if we are in the middle of a very long recovery we + // really don't want to have to start over + loop choose { + when(wait(waitFailureClient( + iMaster.waitFailure, + db->masterRegistrationCount + ? SERVER_KNOBS->MASTER_FAILURE_REACTION_TIME + : (now() - recoveryStart) * SERVER_KNOBS->MASTER_FAILURE_SLOPE_DURING_RECOVERY, + db->masterRegistrationCount ? 
-SERVER_KNOBS->MASTER_FAILURE_REACTION_TIME / + SERVER_KNOBS->SECONDS_BEFORE_NO_FAILURE_DELAY + : SERVER_KNOBS->MASTER_FAILURE_SLOPE_DURING_RECOVERY) || + db->forceMasterFailure.onTrigger())) { + break; } - ++recoveryData->backupWorkerDoneRequests; - req.reply.send(Void()); - TraceEvent(SevDebug, "BackupWorkerDoneRequest", cluster->id).log(); + when(wait(db->serverInfo->onChange())) {} } - when(wait(collection)) { throw internal_error(); } - when(wait(handleLeaderReplacement(recoveryData, leaderFail))) { throw internal_error(); } - } - // failed master (better master exists) could happen while change-coordinators request processing is - // in-progress - if (cluster->shouldCommitSuicide) { - throw restart_cluster_controller(); - } + wait(spinDelay); - recoveryCore.cancel(); - wait(ClusterControllerRecovery::cleanupActorCollection(recoveryData, /*exThrown=*/false)); - ASSERT(addActor.isEmpty()); - - wait(spinDelay); - - TEST(true); // clusterWatchDatabase() master failed - TraceEvent(SevWarn, "DetectedFailedRecovery", cluster->id).detail("OldMaster", iMaster.id()); - } catch (Error& e) { - state Error err = e; - TraceEvent("CCWDB", cluster->id).error(e, true).detail("Master", iMaster.id()); - if (e.code() != error_code_actor_cancelled) - wait(delay(0.0)); - - recoveryCore.cancel(); - wait(ClusterControllerRecovery::cleanupActorCollection(recoveryData, true /* exThrown */)); - ASSERT(addActor.isEmpty()); - - TEST(err.code() == error_code_tlog_failed); // Terminated due to tLog failure - TEST(err.code() == error_code_commit_proxy_failed); // Terminated due to commit proxy failure - TEST(err.code() == error_code_grv_proxy_failed); // Terminated due to GRV proxy failure - TEST(err.code() == error_code_resolver_failed); // Terminated due to resolver failure - TEST(err.code() == error_code_backup_worker_failed); // Terminated due to backup worker failure - TEST(err.code() == error_code_operation_failed); // Terminated due to failed operation - TEST(err.code() == error_code_restart_cluster_controller); // Terminated due to cluster-controller restart. - - if (cluster->shouldCommitSuicide || err.code() == error_code_coordinators_changed) { - TraceEvent("ClusterControllerTerminate", cluster->id).error(err, true); - throw restart_cluster_controller(); - } - - if (ClusterControllerRecovery::normalClusterRecoveryErrors().count(err.code())) { - TraceEvent(SevWarn, "ClusterRecoveryRetrying", cluster->id).error(err); + TEST(true); // clusterWatchDatabase() master failed + TraceEvent(SevWarn, "DetectedFailedMaster", cluster->id).detail("OldMaster", iMaster.id()); } else { - bool ok = err.code() == error_code_no_more_servers; - TraceEvent(ok ? SevWarn : SevError, "ClusterWatchDatabaseRetrying", cluster->id).error(err); - if (!ok) - throw err; + TEST(true); // clusterWatchDatabase() !newMaster.present() + wait(delay(SERVER_KNOBS->MASTER_SPIN_DELAY)); } + } catch (Error& e) { + TraceEvent("CCWDB", cluster->id).error(e, true).detail("Master", iMaster.id()); + if (e.code() == error_code_actor_cancelled) + throw; + + bool ok = e.code() == error_code_no_more_servers; + TraceEvent(ok ? 
SevWarn : SevError, "ClusterWatchDatabaseRetrying", cluster->id).error(e); + if (!ok) + throw e; wait(delay(SERVER_KNOBS->ATTEMPT_RECRUITMENT_DELAY)); } } @@ -5598,14 +3494,10 @@ ACTOR Future clusterOpenDatabase(ClusterControllerData::DBInfo* db, OpenDa void checkOutstandingRecruitmentRequests(ClusterControllerData* self) { for (int i = 0; i < self->outstandingRecruitmentRequests.size(); i++) { - Reference info = self->outstandingRecruitmentRequests[i]; + RecruitFromConfigurationRequest& req = self->outstandingRecruitmentRequests[i]; try { - info->rep = self->findWorkersForConfiguration(info->req); - if (info->dbgId.present()) { - TraceEvent("CheckOutstandingRecruitment", info->dbgId.get()) - .detail("Request", info->req.configuration.toString()); - } - info->waitForCompletion.trigger(); + RecruitFromConfigurationReply rep = self->findWorkersForConfiguration(req); + req.reply.send(rep); swapAndPop(&self->outstandingRecruitmentRequests, i--); } catch (Error& e) { if (e.code() == error_code_no_more_servers || e.code() == error_code_operation_failed) { @@ -5620,14 +3512,10 @@ void checkOutstandingRecruitmentRequests(ClusterControllerData* self) { void checkOutstandingRemoteRecruitmentRequests(ClusterControllerData* self) { for (int i = 0; i < self->outstandingRemoteRecruitmentRequests.size(); i++) { - Reference info = self->outstandingRemoteRecruitmentRequests[i]; + RecruitRemoteFromConfigurationRequest& req = self->outstandingRemoteRecruitmentRequests[i]; try { - info->rep = self->findRemoteWorkersForConfiguration(info->req); - if (info->dbgId.present()) { - TraceEvent("CheckOutstandingRemoteRecruitment", info->dbgId.get()) - .detail("Request", info->req.configuration.toString()); - } - info->waitForCompletion.trigger(); + RecruitRemoteFromConfigurationReply rep = self->findRemoteWorkersForConfiguration(req); + req.reply.send(rep); swapAndPop(&self->outstandingRemoteRecruitmentRequests, i--); } catch (Error& e) { if (e.code() == error_code_no_more_servers || e.code() == error_code_operation_failed) { @@ -6104,6 +3992,67 @@ void clusterRecruitBlobWorker(ClusterControllerData* self, RecruitBlobWorkerRequ } } +ACTOR Future clusterRecruitFromConfiguration(ClusterControllerData* self, RecruitFromConfigurationRequest req) { + // At the moment this doesn't really need to be an actor (it always completes immediately) + TEST(true); // ClusterController RecruitTLogsRequest + loop { + try { + auto rep = self->findWorkersForConfiguration(req); + req.reply.send(rep); + return Void(); + } catch (Error& e) { + if (e.code() == error_code_no_more_servers && self->goodRecruitmentTime.isReady()) { + self->outstandingRecruitmentRequests.push_back(req); + TraceEvent(SevWarn, "RecruitFromConfigurationNotAvailable", self->id).error(e); + return Void(); + } else if (e.code() == error_code_operation_failed || e.code() == error_code_no_more_servers) { + // recruitment not good enough, try again + TraceEvent("RecruitFromConfigurationRetry", self->id) + .error(e) + .detail("GoodRecruitmentTimeReady", self->goodRecruitmentTime.isReady()); + while (!self->goodRecruitmentTime.isReady()) { + wait(lowPriorityDelay(SERVER_KNOBS->ATTEMPT_RECRUITMENT_DELAY)); + } + } else { + TraceEvent(SevError, "RecruitFromConfigurationError", self->id).error(e); + throw; // goodbye, cluster controller + } + } + wait(lowPriorityDelay(SERVER_KNOBS->ATTEMPT_RECRUITMENT_DELAY)); + } +} + +ACTOR Future clusterRecruitRemoteFromConfiguration(ClusterControllerData* self, + RecruitRemoteFromConfigurationRequest req) { + // At the moment this doesn't 
really need to be an actor (it always completes immediately) + TEST(true); // ClusterController RecruitTLogsRequest Remote + loop { + try { + RecruitRemoteFromConfigurationReply rep = self->findRemoteWorkersForConfiguration(req); + req.reply.send(rep); + return Void(); + } catch (Error& e) { + if (e.code() == error_code_no_more_servers && self->goodRemoteRecruitmentTime.isReady()) { + self->outstandingRemoteRecruitmentRequests.push_back(req); + TraceEvent(SevWarn, "RecruitRemoteFromConfigurationNotAvailable", self->id).error(e); + return Void(); + } else if (e.code() == error_code_operation_failed || e.code() == error_code_no_more_servers) { + // recruitment not good enough, try again + TraceEvent("RecruitRemoteFromConfigurationRetry", self->id) + .error(e) + .detail("GoodRecruitmentTimeReady", self->goodRemoteRecruitmentTime.isReady()); + while (!self->goodRemoteRecruitmentTime.isReady()) { + wait(lowPriorityDelay(SERVER_KNOBS->ATTEMPT_RECRUITMENT_DELAY)); + } + } else { + TraceEvent(SevError, "RecruitRemoteFromConfigurationError", self->id).error(e); + throw; // goodbye, cluster controller + } + } + wait(lowPriorityDelay(SERVER_KNOBS->ATTEMPT_RECRUITMENT_DELAY)); + } +} + void clusterRegisterMaster(ClusterControllerData* self, RegisterMasterRequest const& req) { req.reply.send(Void()); @@ -6178,12 +4127,6 @@ void clusterRegisterMaster(ClusterControllerData* self, RegisterMasterRequest co // Construct the client information if (db->clientInfo->get().commitProxies != req.commitProxies || db->clientInfo->get().grvProxies != req.grvProxies) { - TraceEvent("PublishNewClientInfo", self->id) - .detail("Master", dbInfo.master.id().toString()) - .detail("GrvProxies", db->clientInfo->get().grvProxies) - .detail("ReqGrvProxies", req.grvProxies) - .detail("CommitProxies", db->clientInfo->get().commitProxies) - .detail("ReqCPs", req.commitProxies); isChanged = true; // TODO why construct a new one and not just copy the old one and change proxies + id? 
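The restored clusterRecruitFromConfiguration / clusterRecruitRemoteFromConfiguration actors above, together with checkOutstandingRecruitmentRequests, implement "answer the request now if possible, otherwise park it and answer when the worker set improves". A single-threaded sketch of that shape with a hypothetical request type; the real code uses flow futures, reply promises, and the no_more_servers error rather than a boolean:

    #include <functional>
    #include <iostream>
    #include <string>
    #include <vector>

    struct RecruitRequest {
        std::string config;
        std::function<void(const std::string&)> reply; // stands in for req.reply.send(rep)
    };

    struct Recruiter {
        bool enoughWorkers = false;
        std::vector<RecruitRequest> outstanding;

        // Try to answer immediately; otherwise park the request.
        void handle(RecruitRequest req) {
            if (enoughWorkers)
                req.reply("recruited for " + req.config);
            else
                outstanding.push_back(std::move(req));
        }

        // Called whenever a new worker registers (cf. checkOutstandingRecruitmentRequests).
        void onWorkerSetChanged() {
            if (!enoughWorkers)
                return;
            for (auto& req : outstanding)
                req.reply("recruited for " + req.config);
            outstanding.clear();
        }
    };

    int main() {
        Recruiter r;
        r.handle({ "triple", [](const std::string& s) { std::cout << s << "\n"; } });
        r.enoughWorkers = true; // a suitable worker registered
        r.onWorkerSetChanged();
        return 0;
    }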
ClientDBInfo clientInfo; @@ -7540,7 +5483,7 @@ ACTOR Future clusterControllerCore(ClusterControllerFullInterface interf, state uint64_t step = 0; state Future> error = errorOr(actorCollection(self.addActor.getFuture())); - self.addActor.send(clusterWatchDatabase(&self, &self.db, coordinators, leaderFail)); // Start the master database + self.addActor.send(clusterWatchDatabase(&self, &self.db)); // Start the master database self.addActor.send(self.updateWorkerList.init(self.db.db)); self.addActor.send(statusServer(interf.clientInterface.databaseStatus.getFuture(), &self, @@ -7577,7 +5520,7 @@ ACTOR Future clusterControllerCore(ClusterControllerFullInterface interf, loop choose { when(ErrorOr err = wait(error)) { - if (err.isError() && err.getError().code() != error_code_restart_cluster_controller) { + if (err.isError()) { endRole(Role::CLUSTER_CONTROLLER, interf.id(), "Stop Received Error", false, err.getError()); } else { endRole(Role::CLUSTER_CONTROLLER, interf.id(), "Stop Received Signal", true); @@ -7591,6 +5534,12 @@ ACTOR Future clusterControllerCore(ClusterControllerFullInterface interf, ++self.openDatabaseRequests; self.addActor.send(clusterOpenDatabase(&self.db, req)); } + when(RecruitFromConfigurationRequest req = waitNext(interf.recruitFromConfiguration.getFuture())) { + self.addActor.send(clusterRecruitFromConfiguration(&self, req)); + } + when(RecruitRemoteFromConfigurationRequest req = waitNext(interf.recruitRemoteFromConfiguration.getFuture())) { + self.addActor.send(clusterRecruitRemoteFromConfiguration(&self, req)); + } when(RecruitStorageRequest req = waitNext(interf.recruitStorage.getFuture())) { clusterRecruitStorage(&self, req); } @@ -7650,6 +5599,12 @@ ACTOR Future clusterControllerCore(ClusterControllerFullInterface interf, when(GetServerDBInfoRequest req = waitNext(interf.getServerDBInfo.getFuture())) { self.addActor.send(clusterGetServerInfo(&self.db, req.knownServerInfoID, req.reply)); } + when(wait(leaderFail)) { + // We are no longer the leader if this has changed. + endRole(Role::CLUSTER_CONTROLLER, interf.id(), "Leader Replaced", true); + TEST(true); // Lost Cluster Controller Role + return Void(); + } when(ReplyPromise ping = waitNext(interf.clientInterface.ping.getFuture())) { ping.send(Void()); } } } @@ -7974,17 +5929,12 @@ TEST_CASE("/fdbserver/clustercontroller/shouldTriggerRecoveryDueToDegradedServer NetworkAddress backup(IPAddress(0x06060606), 1); NetworkAddress proxy(IPAddress(0x07070707), 1); NetworkAddress resolver(IPAddress(0x08080808), 1); - NetworkAddress clusterController(IPAddress(0x09090909), 1); UID testUID(1, 2); // Create a ServerDBInfo using above addresses. ServerDBInfo testDbInfo; - testDbInfo.clusterInterface.changeCoordinators = - RequestStream(Endpoint({ clusterController }, UID(1, 2))); - - MasterInterface mInterface; - mInterface.getCommitVersion = RequestStream(Endpoint({ master }, UID(1, 2))); - testDbInfo.master = mInterface; + testDbInfo.master.changeCoordinators = + RequestStream(Endpoint({ master }, testUID)); TLogInterface localTLogInterf; localTLogInterf.peekMessages = RequestStream(Endpoint({ tlog }, testUID)); @@ -8085,15 +6035,14 @@ TEST_CASE("/fdbserver/clustercontroller/shouldTriggerFailoverDueToDegradedServer NetworkAddress proxy(IPAddress(0x07070707), 1); NetworkAddress proxy2(IPAddress(0x08080808), 1); NetworkAddress resolver(IPAddress(0x09090909), 1); - NetworkAddress clusterController(IPAddress(0x10101010), 1); UID testUID(1, 2); data.db.config.usableRegions = 2; // Create a ServerDBInfo using above addresses. 
ServerDBInfo testDbInfo; - testDbInfo.clusterInterface.changeCoordinators = - RequestStream(Endpoint({ clusterController }, UID(1, 2))); + testDbInfo.master.changeCoordinators = + RequestStream(Endpoint({ master }, testUID)); TLogInterface localTLogInterf; localTLogInterf.peekMessages = RequestStream(Endpoint({ tlog }, testUID)); diff --git a/fdbserver/CommitProxyServer.actor.cpp b/fdbserver/CommitProxyServer.actor.cpp index 9de55eea45..d2c5d7aa28 100644 --- a/fdbserver/CommitProxyServer.actor.cpp +++ b/fdbserver/CommitProxyServer.actor.cpp @@ -642,8 +642,6 @@ ACTOR Future preresolutionProcessing(CommitBatchContext* self) { self->commitVersion = versionReply.version; self->prevVersion = versionReply.prevVersion; - //TraceEvent("CPGetVersion", pProxyCommitData->dbgid).detail("Master", pProxyCommitData->master.id().toString()).detail("CommitVersion", self->commitVersion).detail("PrvVersion", self->prevVersion); - for (auto it : versionReply.resolverChanges) { auto rs = pProxyCommitData->keyResolvers.modify(it.range); for (auto r = rs.begin(); r != rs.end(); ++r) @@ -880,7 +878,7 @@ ACTOR Future applyMetadataToCommittedTransactions(CommitBatchContext* self if (!self->isMyFirstBatch && pProxyCommitData->txnStateStore->readValue(coordinatorsKey).get().get() != self->oldCoordinators.get()) { - wait(brokenPromiseToNever(pProxyCommitData->db->get().clusterInterface.changeCoordinators.getReply( + wait(brokenPromiseToNever(pProxyCommitData->master.changeCoordinators.getReply( ChangeCoordinatorsRequest(pProxyCommitData->txnStateStore->readValue(coordinatorsKey).get().get())))); ASSERT(false); // ChangeCoordinatorsRequest should always throw } @@ -1095,16 +1093,8 @@ ACTOR Future postResolution(CommitBatchContext* self) { applyMetadataEffect(self); - if (debugID.present()) { - g_traceBatch.addEvent("CommitDebug", debugID.get().first(), "CommitProxyServer.commitBatch.ApplyMetadaEffect"); - } - determineCommittedTransactions(self); - if (debugID.present()) { - g_traceBatch.addEvent("CommitDebug", debugID.get().first(), "CommitProxyServer.commitBatch.ApplyMetadaEffect"); - } - if (self->forceRecovery) { wait(Future(Never())); } @@ -1112,18 +1102,9 @@ ACTOR Future postResolution(CommitBatchContext* self) { // First pass wait(applyMetadataToCommittedTransactions(self)); - if (debugID.present()) { - g_traceBatch.addEvent( - "CommitDebug", debugID.get().first(), "CommitProxyServer.commitBatch.ApplyMetadaToCommittedTxn"); - } - // Second pass wait(assignMutationsToStorageServers(self)); - if (debugID.present()) { - g_traceBatch.addEvent("CommitDebug", debugID.get().first(), "CommitProxyServer.commitBatch.AssignMutationToSS"); - } - // Serialize and backup the mutations as a single mutation if ((pProxyCommitData->vecBackupKeys.size() > 1) && self->logRangeMutations.size()) { wait(addBackupMutations(pProxyCommitData, @@ -1260,7 +1241,7 @@ ACTOR Future transactionLogging(CommitBatchContext* self) { } } catch (Error& e) { if (e.code() == error_code_broken_promise) { - throw tlog_failed(); + throw master_tlog_failed(); } throw; } @@ -1292,10 +1273,8 @@ ACTOR Future reply(CommitBatchContext* self) { const Optional& debugID = self->debugID; - if (self->prevVersion && self->commitVersion - self->prevVersion < SERVER_KNOBS->MAX_VERSIONS_IN_FLIGHT / 2) { - //TraceEvent("CPAdvanceMinVersion", self->pProxyCommitData->dbgid).detail("PrvVersion", self->prevVersion).detail("CommitVersion", self->commitVersion).detail("Master", self->pProxyCommitData->master.id().toString()).detail("TxSize", self->trs.size()); + if 
(self->prevVersion && self->commitVersion - self->prevVersion < SERVER_KNOBS->MAX_VERSIONS_IN_FLIGHT / 2) debug_advanceMinCommittedVersion(UID(), self->commitVersion); - } //TraceEvent("ProxyPushed", pProxyCommitData->dbgid).detail("PrevVersion", prevVersion).detail("Version", commitVersion); if (debugID.present()) @@ -2020,7 +1999,6 @@ ACTOR Future processTransactionStateRequestPart(TransactionStateResolveCon ACTOR Future commitProxyServerCore(CommitProxyInterface proxy, MasterInterface master, - LifetimeToken masterLifetime, Reference const> db, LogEpoch epoch, Version recoveryTransactionVersion, @@ -2035,7 +2013,8 @@ ACTOR Future commitProxyServerCore(CommitProxyInterface proxy, state Future lastCommitComplete = Void(); state PromiseStream> addActor; - state Future onError = transformError(actorCollection(addActor.getFuture()), broken_promise(), tlog_failed()); + state Future onError = + transformError(actorCollection(addActor.getFuture()), broken_promise(), master_tlog_failed()); state double lastCommit = 0; state GetHealthMetricsReply healthMetricsReply; @@ -2047,7 +2026,7 @@ ACTOR Future commitProxyServerCore(CommitProxyInterface proxy, //TraceEvent("CommitProxyInit1", proxy.id()); // Wait until we can load the "real" logsystem, since we don't support switching them currently - while (!(masterLifetime.isEqual(commitData.db->get().masterLifetime) && + while (!(commitData.db->get().master.id() == master.id() && commitData.db->get().recoveryState >= RecoveryState::RECOVERY_TRANSACTION)) { //TraceEvent("ProxyInit2", proxy.id()).detail("LSEpoch", db->get().logSystemConfig.epoch).detail("Need", epoch); wait(commitData.db->onChange()); @@ -2109,7 +2088,7 @@ ACTOR Future commitProxyServerCore(CommitProxyInterface proxy, loop choose { when(wait(dbInfoChange)) { dbInfoChange = commitData.db->onChange(); - if (masterLifetime.isEqual(commitData.db->get().masterLifetime) && + if (commitData.db->get().master.id() == master.id() && commitData.db->get().recoveryState >= RecoveryState::RECOVERY_TRANSACTION) { commitData.logSystem = ILogSystem::fromServerDBInfo(proxy.id(), commitData.db->get(), false, addActor); for (auto it : commitData.tag_popped) { @@ -2127,9 +2106,7 @@ ACTOR Future commitProxyServerCore(CommitProxyInterface proxy, const std::vector& trs = batchedRequests.first; int batchBytes = batchedRequests.second; //TraceEvent("CommitProxyCTR", proxy.id()).detail("CommitTransactions", trs.size()).detail("TransactionRate", transactionRate).detail("TransactionQueue", transactionQueue.size()).detail("ReleasedTransactionCount", transactionCount); - //TraceEvent("CommitProxyCore", commitData.dbgid).detail("TxSize", trs.size()).detail("MasterLifetime", masterLifetime.toString()).detail("DbMasterLifetime", commitData.db->get().masterLifetime.toString()).detail("RecoveryState", commitData.db->get().recoveryState).detail("CCInf", commitData.db->get().clusterInterface.id().toString()); if (trs.size() || (commitData.db->get().recoveryState >= RecoveryState::ACCEPTING_COMMITS && - masterLifetime.isEqual(commitData.db->get().masterLifetime) && now() - lastCommit >= SERVER_KNOBS->MAX_COMMIT_BATCH_INTERVAL)) { lastCommit = now(); @@ -2178,7 +2155,6 @@ ACTOR Future commitProxyServer(CommitProxyInterface proxy, try { state Future core = commitProxyServerCore(proxy, req.master, - req.masterLifetime, db, req.recoveryCount, req.recoveryTransactionVersion, @@ -2189,7 +2165,7 @@ ACTOR Future commitProxyServer(CommitProxyInterface proxy, TraceEvent("CommitProxyTerminated", proxy.id()).error(e, true); if (e.code() != 
error_code_worker_removed && e.code() != error_code_tlog_stopped && - e.code() != error_code_tlog_failed && e.code() != error_code_coordinators_changed && + e.code() != error_code_master_tlog_failed && e.code() != error_code_coordinators_changed && e.code() != error_code_coordinated_state_conflict && e.code() != error_code_new_coordinators_timed_out && e.code() != error_code_failed_to_progress) { throw; diff --git a/fdbserver/GrvProxyServer.actor.cpp b/fdbserver/GrvProxyServer.actor.cpp index d35ecae369..d400666a43 100644 --- a/fdbserver/GrvProxyServer.actor.cpp +++ b/fdbserver/GrvProxyServer.actor.cpp @@ -920,12 +920,12 @@ ACTOR static Future transactionStarter(GrvProxyInterface proxy, ACTOR Future grvProxyServerCore(GrvProxyInterface proxy, MasterInterface master, - LifetimeToken masterLifetime, Reference const> db) { state GrvProxyData grvProxyData(proxy.id(), master, proxy.getConsistentReadVersion, db); state PromiseStream> addActor; - state Future onError = transformError(actorCollection(addActor.getFuture()), broken_promise(), tlog_failed()); + state Future onError = + transformError(actorCollection(addActor.getFuture()), broken_promise(), master_tlog_failed()); state GetHealthMetricsReply healthMetricsReply; state GetHealthMetricsReply detailedHealthMetricsReply; @@ -933,14 +933,9 @@ ACTOR Future grvProxyServerCore(GrvProxyInterface proxy, addActor.send(waitFailureServer(proxy.waitFailure.getFuture())); addActor.send(traceRole(Role::GRV_PROXY, proxy.id())); - TraceEvent("GrvProxyServerCore", proxy.id()) - .detail("MasterId", master.id().toString()) - .detail("MasterLifetime", masterLifetime.toString()) - .detail("RecoveryCount", db->get().recoveryCount); - // Wait until we can load the "real" logsystem, since we don't support switching them currently - while (!(masterLifetime.isEqual(grvProxyData.db->get().masterLifetime) && - grvProxyData.db->get().recoveryState >= RecoveryState::ACCEPTING_COMMITS)) { + while (!(grvProxyData.db->get().master.id() == master.id() && + grvProxyData.db->get().recoveryState >= RecoveryState::RECOVERY_TRANSACTION)) { wait(grvProxyData.db->onChange()); } // Do we need to wait for any db info change? Yes. To update latency band. 
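In the hunk above, the GRV proxy now spins on ServerDBInfo changes until the published master matches the one that recruited it and recovery has advanced far enough. A compact sketch of that gate, assuming hypothetical state names that mirror the ordered RecoveryState enum:

    #include <cassert>
    #include <cstdint>

    // Mirrors the idea of RecoveryState: enumerators are ordered, so ">=" means
    // "at least this far through recovery".
    enum class Recovery : uint8_t { ReadingCState, Recruiting, RecoveryTransaction, WritingCState, AcceptingCommits };

    struct PublishedDbInfo {
        uint64_t masterId;
        Recovery recoveryState;
    };

    // The proxy keeps waiting until both conditions hold; each ServerDBInfo
    // change re-evaluates the predicate.
    bool readyToLoadLogSystem(const PublishedDbInfo& db, uint64_t recruitingMasterId) {
        return db.masterId == recruitingMasterId && db.recoveryState >= Recovery::RecoveryTransaction;
    }

    int main() {
        assert(!readyToLoadLogSystem({ 7, Recovery::Recruiting }, 7));        // too early
        assert(!readyToLoadLogSystem({ 9, Recovery::AcceptingCommits }, 7));  // different master
        assert(readyToLoadLogSystem({ 7, Recovery::RecoveryTransaction }, 7));
        return 0;
    }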
@@ -961,7 +956,7 @@ ACTOR Future grvProxyServerCore(GrvProxyInterface proxy, when(wait(dbInfoChange)) { dbInfoChange = grvProxyData.db->onChange(); - if (masterLifetime.isEqual(grvProxyData.db->get().masterLifetime) && + if (grvProxyData.db->get().master.id() == master.id() && grvProxyData.db->get().recoveryState >= RecoveryState::RECOVERY_TRANSACTION) { grvProxyData.logSystem = ILogSystem::fromServerDBInfo(proxy.id(), grvProxyData.db->get(), false, addActor); @@ -988,13 +983,13 @@ ACTOR Future grvProxyServer(GrvProxyInterface proxy, InitializeGrvProxyRequest req, Reference const> db) { try { - state Future core = grvProxyServerCore(proxy, req.master, req.masterLifetime, db); + state Future core = grvProxyServerCore(proxy, req.master, db); wait(core || checkRemoved(db, req.recoveryCount, proxy)); } catch (Error& e) { TraceEvent("GrvProxyTerminated", proxy.id()).error(e, true); if (e.code() != error_code_worker_removed && e.code() != error_code_tlog_stopped && - e.code() != error_code_tlog_failed && e.code() != error_code_coordinators_changed && + e.code() != error_code_master_tlog_failed && e.code() != error_code_coordinators_changed && e.code() != error_code_coordinated_state_conflict && e.code() != error_code_new_coordinators_timed_out) { throw; } diff --git a/fdbserver/MasterInterface.h b/fdbserver/MasterInterface.h index a092226f75..a9fb8122aa 100644 --- a/fdbserver/MasterInterface.h +++ b/fdbserver/MasterInterface.h @@ -22,13 +22,11 @@ #define FDBSERVER_MASTERINTERFACE_H #pragma once -#include "fdbclient/CommitProxyInterface.h" #include "fdbclient/FDBTypes.h" #include "fdbclient/StorageServerInterface.h" #include "fdbclient/CommitTransaction.h" #include "fdbclient/DatabaseConfiguration.h" #include "fdbserver/TLogInterface.h" -#include "fdbclient/Notified.h" typedef uint64_t DBRecoveryCount; @@ -36,17 +34,20 @@ struct MasterInterface { constexpr static FileIdentifier file_identifier = 5979145; LocalityData locality; RequestStream> waitFailure; + RequestStream + tlogRejoin; // sent by tlog (whether or not rebooted) to communicate with a new master + RequestStream changeCoordinators; RequestStream getCommitVersion; + RequestStream notifyBackupWorkerDone; // Get the centralized live committed version reported by commit proxies. RequestStream getLiveCommittedVersion; // Report a proxy's committed version. 
RequestStream reportLiveCommittedVersion; - RequestStream updateRecoveryData; - NetworkAddress address() const { return getCommitVersion.getEndpoint().getPrimaryAddress(); } - NetworkAddressList addresses() const { return getCommitVersion.getEndpoint().addresses; } + NetworkAddress address() const { return changeCoordinators.getEndpoint().getPrimaryAddress(); } + NetworkAddressList addresses() const { return changeCoordinators.getEndpoint().addresses; } - UID id() const { return getCommitVersion.getEndpoint().token; } + UID id() const { return changeCoordinators.getEndpoint().token; } template void serialize(Archive& ar) { if constexpr (!is_fb_function) { @@ -54,28 +55,61 @@ struct MasterInterface { } serializer(ar, locality, waitFailure); if (Archive::isDeserializing) { + tlogRejoin = RequestStream(waitFailure.getEndpoint().getAdjustedEndpoint(1)); + changeCoordinators = + RequestStream(waitFailure.getEndpoint().getAdjustedEndpoint(2)); getCommitVersion = - RequestStream(waitFailure.getEndpoint().getAdjustedEndpoint(1)); + RequestStream(waitFailure.getEndpoint().getAdjustedEndpoint(3)); + notifyBackupWorkerDone = + RequestStream(waitFailure.getEndpoint().getAdjustedEndpoint(4)); getLiveCommittedVersion = - RequestStream(waitFailure.getEndpoint().getAdjustedEndpoint(2)); + RequestStream(waitFailure.getEndpoint().getAdjustedEndpoint(5)); reportLiveCommittedVersion = RequestStream( - waitFailure.getEndpoint().getAdjustedEndpoint(3)); - updateRecoveryData = - RequestStream(waitFailure.getEndpoint().getAdjustedEndpoint(4)); + waitFailure.getEndpoint().getAdjustedEndpoint(6)); } } void initEndpoints() { std::vector> streams; streams.push_back(waitFailure.getReceiver()); + streams.push_back(tlogRejoin.getReceiver(TaskPriority::MasterTLogRejoin)); + streams.push_back(changeCoordinators.getReceiver()); streams.push_back(getCommitVersion.getReceiver(TaskPriority::GetConsistentReadVersion)); + streams.push_back(notifyBackupWorkerDone.getReceiver()); streams.push_back(getLiveCommittedVersion.getReceiver(TaskPriority::GetLiveCommittedVersion)); streams.push_back(reportLiveCommittedVersion.getReceiver(TaskPriority::ReportLiveCommittedVersion)); - streams.push_back(updateRecoveryData.getReceiver(TaskPriority::UpdateRecoveryTransactionVersion)); FlowTransport::transport().addEndpoints(streams); } }; +struct TLogRejoinReply { + constexpr static FileIdentifier file_identifier = 11; + + // false means someone else registered, so we should re-register. true means this master is recovered, so don't + // send again to the same master. 
+ bool masterIsRecovered; + TLogRejoinReply() = default; + explicit TLogRejoinReply(bool masterIsRecovered) : masterIsRecovered(masterIsRecovered) {} + + template + void serialize(Ar& ar) { + serializer(ar, masterIsRecovered); + } +}; + +struct TLogRejoinRequest { + constexpr static FileIdentifier file_identifier = 15692200; + TLogInterface myInterface; + ReplyPromise reply; + + TLogRejoinRequest() {} + explicit TLogRejoinRequest(const TLogInterface& interf) : myInterface(interf) {} + template + void serialize(Ar& ar) { + serializer(ar, myInterface, reply); + } +}; + struct ChangeCoordinatorsRequest { constexpr static FileIdentifier file_identifier = 13605416; Standalone newConnectionString; @@ -150,26 +184,6 @@ struct GetCommitVersionRequest { } }; -struct UpdateRecoveryDataRequest { - constexpr static FileIdentifier file_identifier = 13605417; - Version recoveryTransactionVersion; - Version lastEpochEnd; - std::vector commitProxies; - ReplyPromise reply; - - UpdateRecoveryDataRequest() {} - UpdateRecoveryDataRequest(Version recoveryTransactionVersion, - Version lastEpochEnd, - std::vector commitProxies) - : recoveryTransactionVersion(recoveryTransactionVersion), lastEpochEnd(lastEpochEnd), - commitProxies(commitProxies) {} - - template - void serialize(Ar& ar) { - serializer(ar, recoveryTransactionVersion, lastEpochEnd, commitProxies, reply); - } -}; - struct ReportRawCommittedVersionRequest { constexpr static FileIdentifier file_identifier = 1853148; Version version; @@ -193,6 +207,21 @@ struct ReportRawCommittedVersionRequest { } }; +struct BackupWorkerDoneRequest { + constexpr static FileIdentifier file_identifier = 8736351; + UID workerUID; + LogEpoch backupEpoch; + ReplyPromise reply; + + BackupWorkerDoneRequest() : workerUID(), backupEpoch(-1) {} + BackupWorkerDoneRequest(UID id, LogEpoch epoch) : workerUID(id), backupEpoch(epoch) {} + + template + void serialize(Ar& ar) { + serializer(ar, workerUID, backupEpoch, reply); + } +}; + struct LifetimeToken { UID ccID; int64_t count; @@ -202,9 +231,6 @@ struct LifetimeToken { bool isStillValid(LifetimeToken const& latestToken, bool isLatestID) const { return ccID == latestToken.ccID && (count >= latestToken.count || isLatestID); } - bool isEqual(LifetimeToken const& toCompare) { - return ccID.compare(toCompare.ccID) == 0 && count == toCompare.count; - } std::string toString() const { return ccID.shortString() + format("#%lld", count); } void operator++() { ++count; } @@ -214,18 +240,4 @@ struct LifetimeToken { } }; -struct CommitProxyVersionReplies { - std::map replies; - NotifiedVersion latestRequestNum; - - CommitProxyVersionReplies(CommitProxyVersionReplies&& r) noexcept - : replies(std::move(r.replies)), latestRequestNum(std::move(r.latestRequestNum)) {} - void operator=(CommitProxyVersionReplies&& r) noexcept { - replies = std::move(r.replies); - latestRequestNum = std::move(r.latestRequestNum); - } - - CommitProxyVersionReplies() : latestRequestNum(0) {} -}; - #endif diff --git a/fdbserver/OldTLogServer_4_6.actor.cpp b/fdbserver/OldTLogServer_4_6.actor.cpp index eca9c17390..9c4131e500 100644 --- a/fdbserver/OldTLogServer_4_6.actor.cpp +++ b/fdbserver/OldTLogServer_4_6.actor.cpp @@ -1249,11 +1249,11 @@ ACTOR Future commitQueue(TLogData* self) { } } -ACTOR Future rejoinClusterController(TLogData* self, - TLogInterface tli, - DBRecoveryCount recoveryCount, - Future registerWithCC) { - state LifetimeToken lastMasterLifetime; +ACTOR Future rejoinMasters(TLogData* self, + TLogInterface tli, + DBRecoveryCount recoveryCount, + Future 
registerWithMaster) { + state UID lastMasterID(0, 0); loop { auto const& inf = self->dbInfo->get(); bool isDisplaced = @@ -1274,21 +1274,18 @@ ACTOR Future rejoinClusterController(TLogData* self, throw worker_removed(); } - if (registerWithCC.isReady()) { - if (!lastMasterLifetime.isEqual(self->dbInfo->get().masterLifetime)) { + if (registerWithMaster.isReady()) { + if (self->dbInfo->get().master.id() != lastMasterID) { // The TLogRejoinRequest is needed to establish communications with a new master, which doesn't have our // TLogInterface TLogRejoinRequest req; req.myInterface = tli; - TraceEvent("TLogRejoining", tli.id()) - .detail("ClusterController", self->dbInfo->get().clusterInterface.id()) - .detail("DbInfoMasterLifeTime", self->dbInfo->get().masterLifetime.toString()) - .detail("LastMasterLifeTime", lastMasterLifetime.toString()); + TraceEvent("TLogRejoining", tli.id()).detail("Master", self->dbInfo->get().master.id()); choose { - when(TLogRejoinReply rep = wait( - brokenPromiseToNever(self->dbInfo->get().clusterInterface.tlogRejoin.getReply(req)))) { + when(TLogRejoinReply rep = + wait(brokenPromiseToNever(self->dbInfo->get().master.tlogRejoin.getReply(req)))) { if (rep.masterIsRecovered) - lastMasterLifetime = self->dbInfo->get().masterLifetime; + lastMasterID = self->dbInfo->get().master.id(); } when(wait(self->dbInfo->onChange())) {} } @@ -1296,7 +1293,7 @@ ACTOR Future rejoinClusterController(TLogData* self, wait(self->dbInfo->onChange()); } } else { - wait(registerWithCC || self->dbInfo->onChange()); + wait(registerWithMaster || self->dbInfo->onChange()); } } } @@ -1482,7 +1479,7 @@ ACTOR Future restorePersistentState(TLogData* self, LocalityData locality) ASSERT(fVers.get().size() == fRecoverCounts.get().size()); state int idx = 0; - state Promise registerWithCC; + state Promise registerWithMaster; for (idx = 0; idx < fVers.get().size(); idx++) { state KeyRef rawId = fVers.get()[idx].key.removePrefix(persistCurrentVersionKeys.begin); UID id1 = BinaryReader::fromStringRef(rawId, Unversioned()); @@ -1516,7 +1513,7 @@ ACTOR Future restorePersistentState(TLogData* self, LocalityData locality) logData->version.set(ver); logData->recoveryCount = BinaryReader::fromStringRef(fRecoverCounts.get()[idx].value, Unversioned()); - logData->removed = rejoinClusterController(self, recruited, logData->recoveryCount, registerWithCC.getFuture()); + logData->removed = rejoinMasters(self, recruited, logData->recoveryCount, registerWithMaster.getFuture()); removed.push_back(errorOr(logData->removed)); TraceEvent("TLogRestorePersistentStateVer", id1).detail("Ver", ver); @@ -1620,8 +1617,8 @@ ACTOR Future restorePersistentState(TLogData* self, LocalityData locality) self->sharedActors.send(tLogCore(self, it.second)); } - if (registerWithCC.canBeSet()) - registerWithCC.send(Void()); + if (registerWithMaster.canBeSet()) + registerWithMaster.send(Void()); return Void(); } diff --git a/fdbserver/OldTLogServer_6_0.actor.cpp b/fdbserver/OldTLogServer_6_0.actor.cpp index 98e8d74672..6f3e5da3a9 100644 --- a/fdbserver/OldTLogServer_6_0.actor.cpp +++ b/fdbserver/OldTLogServer_6_0.actor.cpp @@ -1729,12 +1729,12 @@ ACTOR Future initPersistentState(TLogData* self, Reference logDat return Void(); } -ACTOR Future rejoinClusterController(TLogData* self, - TLogInterface tli, - DBRecoveryCount recoveryCount, - Future registerWithCC, - bool isPrimary) { - state LifetimeToken lastMasterLifetime; +ACTOR Future rejoinMasters(TLogData* self, + TLogInterface tli, + DBRecoveryCount recoveryCount, + Future 
registerWithMaster, + bool isPrimary) { + state UID lastMasterID(0, 0); loop { auto const& inf = self->dbInfo->get(); bool isDisplaced = @@ -1762,20 +1762,17 @@ ACTOR Future rejoinClusterController(TLogData* self, throw worker_removed(); } - if (registerWithCC.isReady()) { - if (!lastMasterLifetime.isEqual(self->dbInfo->get().masterLifetime)) { + if (registerWithMaster.isReady()) { + if (self->dbInfo->get().master.id() != lastMasterID) { // The TLogRejoinRequest is needed to establish communications with a new master, which doesn't have our // TLogInterface TLogRejoinRequest req(tli); - TraceEvent("TLogRejoining", tli.id()) - .detail("ClusterController", self->dbInfo->get().clusterInterface.id()) - .detail("DbInfoMasterLifeTime", self->dbInfo->get().masterLifetime.toString()) - .detail("LastMasterLifeTime", lastMasterLifetime.toString()); + TraceEvent("TLogRejoining", tli.id()).detail("Master", self->dbInfo->get().master.id()); choose { - when(TLogRejoinReply rep = wait( - brokenPromiseToNever(self->dbInfo->get().clusterInterface.tlogRejoin.getReply(req)))) { + when(TLogRejoinReply rep = + wait(brokenPromiseToNever(self->dbInfo->get().master.tlogRejoin.getReply(req)))) { if (rep.masterIsRecovered) - lastMasterLifetime = self->dbInfo->get().masterLifetime; + lastMasterID = self->dbInfo->get().master.id(); } when(wait(self->dbInfo->onChange())) {} } @@ -1783,7 +1780,7 @@ ACTOR Future rejoinClusterController(TLogData* self, wait(self->dbInfo->onChange()); } } else { - wait(registerWithCC || self->dbInfo->onChange()); + wait(registerWithMaster || self->dbInfo->onChange()); } } } @@ -2387,7 +2384,7 @@ ACTOR Future restorePersistentState(TLogData* self, } state int idx = 0; - state Promise registerWithCC; + state Promise registerWithMaster; state std::map id_interf; for (idx = 0; idx < fVers.get().size(); idx++) { state KeyRef rawId = fVers.get()[idx].key.removePrefix(persistCurrentVersionKeys.begin); @@ -2435,7 +2432,7 @@ ACTOR Future restorePersistentState(TLogData* self, logData->recoveryCount = BinaryReader::fromStringRef(fRecoverCounts.get()[idx].value, Unversioned()); logData->removed = - rejoinClusterController(self, recruited, logData->recoveryCount, registerWithCC.getFuture(), false); + rejoinMasters(self, recruited, logData->recoveryCount, registerWithMaster.getFuture(), false); removed.push_back(errorOr(logData->removed)); TraceEvent("TLogRestorePersistentStateVer", id1).detail("Ver", ver); @@ -2537,8 +2534,8 @@ ACTOR Future restorePersistentState(TLogData* self, self->sharedActors.send(tLogCore(self, it.second, id_interf[it.first], false)); } - if (registerWithCC.canBeSet()) - registerWithCC.send(Void()); + if (registerWithMaster.canBeSet()) + registerWithMaster.send(Void()); return Void(); } @@ -2656,7 +2653,7 @@ ACTOR Future tLogStart(TLogData* self, InitializeTLogRequest req, Locality self->id_data[recruited.id()] = logData; logData->locality = req.locality; logData->recoveryCount = req.epoch; - logData->removed = rejoinClusterController(self, recruited, req.epoch, Future(Void()), req.isPrimary); + logData->removed = rejoinMasters(self, recruited, req.epoch, Future(Void()), req.isPrimary); self->queueOrder.push_back(recruited.id()); TraceEvent("TLogStart", logData->logId).log(); diff --git a/fdbserver/OldTLogServer_6_2.actor.cpp b/fdbserver/OldTLogServer_6_2.actor.cpp index 53be793c7c..2d2ed59703 100644 --- a/fdbserver/OldTLogServer_6_2.actor.cpp +++ b/fdbserver/OldTLogServer_6_2.actor.cpp @@ -2174,12 +2174,12 @@ ACTOR Future initPersistentState(TLogData* self, Reference logDat 
return Void(); } -ACTOR Future rejoinClusterController(TLogData* self, - TLogInterface tli, - DBRecoveryCount recoveryCount, - Future registerWithCC, - bool isPrimary) { - state LifetimeToken lastMasterLifetime; +ACTOR Future rejoinMasters(TLogData* self, + TLogInterface tli, + DBRecoveryCount recoveryCount, + Future registerWithMaster, + bool isPrimary) { + state UID lastMasterID(0, 0); loop { auto const& inf = self->dbInfo->get(); bool isDisplaced = @@ -2207,20 +2207,17 @@ ACTOR Future rejoinClusterController(TLogData* self, throw worker_removed(); } - if (registerWithCC.isReady()) { - if (!lastMasterLifetime.isEqual(self->dbInfo->get().masterLifetime)) { + if (registerWithMaster.isReady()) { + if (self->dbInfo->get().master.id() != lastMasterID) { // The TLogRejoinRequest is needed to establish communications with a new master, which doesn't have our // TLogInterface TLogRejoinRequest req(tli); - TraceEvent("TLogRejoining", tli.id()) - .detail("ClusterController", self->dbInfo->get().clusterInterface.id()) - .detail("DbInfoMasterLifeTime", self->dbInfo->get().masterLifetime.toString()) - .detail("LastMasterLifeTime", lastMasterLifetime.toString()); + TraceEvent("TLogRejoining", tli.id()).detail("Master", self->dbInfo->get().master.id()); choose { - when(TLogRejoinReply rep = wait( - brokenPromiseToNever(self->dbInfo->get().clusterInterface.tlogRejoin.getReply(req)))) { + when(TLogRejoinReply rep = + wait(brokenPromiseToNever(self->dbInfo->get().master.tlogRejoin.getReply(req)))) { if (rep.masterIsRecovered) - lastMasterLifetime = self->dbInfo->get().masterLifetime; + lastMasterID = self->dbInfo->get().master.id(); } when(wait(self->dbInfo->onChange())) {} } @@ -2228,7 +2225,7 @@ ACTOR Future rejoinClusterController(TLogData* self, wait(self->dbInfo->onChange()); } } else { - wait(registerWithCC || self->dbInfo->onChange()); + wait(registerWithMaster || self->dbInfo->onChange()); } } } @@ -2849,7 +2846,7 @@ ACTOR Future restorePersistentState(TLogData* self, } state int idx = 0; - state Promise registerWithCC; + state Promise registerWithMaster; state std::map id_interf; state std::vector> logsByVersion; for (idx = 0; idx < fVers.get().size(); idx++) { @@ -2897,7 +2894,7 @@ ACTOR Future restorePersistentState(TLogData* self, logData->recoveryCount = BinaryReader::fromStringRef(fRecoverCounts.get()[idx].value, Unversioned()); logData->removed = - rejoinClusterController(self, recruited, logData->recoveryCount, registerWithCC.getFuture(), false); + rejoinMasters(self, recruited, logData->recoveryCount, registerWithMaster.getFuture(), false); removed.push_back(errorOr(logData->removed)); logsByVersion.emplace_back(ver, id1); @@ -3020,8 +3017,8 @@ ACTOR Future restorePersistentState(TLogData* self, self->sharedActors.send(tLogCore(self, it.second, id_interf[it.first], false)); } - if (registerWithCC.canBeSet()) - registerWithCC.send(Void()); + if (registerWithMaster.canBeSet()) + registerWithMaster.send(Void()); return Void(); } @@ -3138,7 +3135,7 @@ ACTOR Future tLogStart(TLogData* self, InitializeTLogRequest req, Locality self->id_data[recruited.id()] = logData; logData->locality = req.locality; logData->recoveryCount = req.epoch; - logData->removed = rejoinClusterController(self, recruited, req.epoch, Future(Void()), req.isPrimary); + logData->removed = rejoinMasters(self, recruited, req.epoch, Future(Void()), req.isPrimary); self->popOrder.push_back(recruited.id()); self->spillOrder.push_back(recruited.id()); diff --git a/fdbserver/TLogServer.actor.cpp b/fdbserver/TLogServer.actor.cpp 
index 9c86d81c2b..681f7fde2d 100644 --- a/fdbserver/TLogServer.actor.cpp +++ b/fdbserver/TLogServer.actor.cpp @@ -319,9 +319,9 @@ struct TLogData : NonCopyable { // data is written from. This value is restored from disk when the tlog // restarts. UID durableClusterId; - // The cluster-controller cluster ID stores the cluster ID read from the txnStateStore. + // The master cluster ID stores the cluster ID read from the txnStateStore. // It is cached in this variable. - UID ccClusterId; + UID masterClusterId; UID dbgid; UID workerID; @@ -783,7 +783,7 @@ void TLogQueue::updateVersionSizes(const TLogQueueEntry& result, ACTOR Future tLogLock(TLogData* self, ReplyPromise reply, Reference logData) { state Version stopVersion = logData->version.get(); - TEST(true); // TLog stopped by recovering cluster-controller + TEST(true); // TLog stopped by recovering master TEST(logData->stopped); // logData already stopped TEST(!logData->stopped); // logData not yet stopped @@ -2020,7 +2020,7 @@ ACTOR Future doQueueCommit(TLogData* self, logData->recoveryComplete.send(Void()); } - TraceEvent("TLogCommitDurable", self->dbgid).detail("Version", ver); + //TraceEvent("TLogCommitDurable", self->dbgid).detail("Version", ver); if (logData->logSystem->get() && (!logData->isPrimary || logData->logRouterPoppedVersion < logData->logRouterPopToVersion)) { logData->logRouterPoppedVersion = ver; @@ -2251,12 +2251,12 @@ ACTOR Future getClusterId(TLogData* self) { } } -ACTOR Future rejoinClusterController(TLogData* self, - TLogInterface tli, - DBRecoveryCount recoveryCount, - Future registerWithCC, - bool isPrimary) { - state LifetimeToken lastMasterLifetime; +ACTOR Future rejoinMasters(TLogData* self, + TLogInterface tli, + DBRecoveryCount recoveryCount, + Future registerWithMaster, + bool isPrimary) { + state UID lastMasterID(0, 0); loop { auto const& inf = self->dbInfo->get(); bool isDisplaced = @@ -2284,27 +2284,24 @@ ACTOR Future rejoinClusterController(TLogData* self, // with a different cluster ID. 
state UID clusterId = wait(getClusterId(self)); ASSERT(clusterId.isValid()); - self->ccClusterId = clusterId; + self->masterClusterId = clusterId; ev.detail("ClusterId", clusterId).detail("SelfClusterId", self->durableClusterId); if (BUGGIFY) wait(delay(SERVER_KNOBS->BUGGIFY_WORKER_REMOVED_MAX_LAG * deterministicRandom()->random01())); throw worker_removed(); } - if (registerWithCC.isReady()) { - if (!lastMasterLifetime.isEqual(self->dbInfo->get().masterLifetime)) { + if (registerWithMaster.isReady()) { + if (self->dbInfo->get().master.id() != lastMasterID) { // The TLogRejoinRequest is needed to establish communications with a new master, which doesn't have our // TLogInterface TLogRejoinRequest req(tli); - TraceEvent("TLogRejoining", tli.id()) - .detail("ClusterController", self->dbInfo->get().clusterInterface.id()) - .detail("DbInfoMasterLifeTime", self->dbInfo->get().masterLifetime.toString()) - .detail("LastMasterLifeTime", lastMasterLifetime.toString()); + TraceEvent("TLogRejoining", tli.id()).detail("Master", self->dbInfo->get().master.id()); choose { - when(TLogRejoinReply rep = wait( - brokenPromiseToNever(self->dbInfo->get().clusterInterface.tlogRejoin.getReply(req)))) { + when(TLogRejoinReply rep = + wait(brokenPromiseToNever(self->dbInfo->get().master.tlogRejoin.getReply(req)))) { if (rep.masterIsRecovered) - lastMasterLifetime = self->dbInfo->get().masterLifetime; + lastMasterID = self->dbInfo->get().master.id(); } when(wait(self->dbInfo->onChange())) {} } @@ -2312,7 +2309,7 @@ ACTOR Future rejoinClusterController(TLogData* self, wait(self->dbInfo->onChange()); } } else { - wait(registerWithCC || self->dbInfo->onChange()); + wait(registerWithMaster || self->dbInfo->onChange()); } } } @@ -2509,13 +2506,13 @@ ACTOR Future serveTLogInterface(TLogData* self, } // Persist cluster ID once cluster has recovered. 
- auto ccClusterId = self->dbInfo->get().clusterId; + auto masterClusterId = self->dbInfo->get().clusterId; if (self->dbInfo->get().recoveryState == RecoveryState::FULLY_RECOVERED && !self->durableClusterId.isValid()) { - ASSERT(ccClusterId.isValid()); - self->durableClusterId = ccClusterId; + ASSERT(masterClusterId.isValid()); + self->durableClusterId = masterClusterId; self->persistentData->set( - KeyValueRef(persistClusterIdKey, BinaryWriter::toValue(ccClusterId, Unversioned()))); + KeyValueRef(persistClusterIdKey, BinaryWriter::toValue(masterClusterId, Unversioned()))); wait(self->persistentData->commit()); } } @@ -2961,7 +2958,7 @@ ACTOR Future restorePersistentState(TLogData* self, } state int idx = 0; - state Promise registerWithCC; + state Promise registerWithMaster; state std::map id_interf; state std::vector> logsByVersion; for (idx = 0; idx < fVers.get().size(); idx++) { @@ -3017,7 +3014,7 @@ ACTOR Future restorePersistentState(TLogData* self, logData->recoveryCount = BinaryReader::fromStringRef(fRecoverCounts.get()[idx].value, Unversioned()); logData->removed = - rejoinClusterController(self, recruited, logData->recoveryCount, registerWithCC.getFuture(), false); + rejoinMasters(self, recruited, logData->recoveryCount, registerWithMaster.getFuture(), false); removed.push_back(errorOr(logData->removed)); logsByVersion.emplace_back(ver, id1); @@ -3140,8 +3137,8 @@ ACTOR Future restorePersistentState(TLogData* self, self->sharedActors.send(tLogCore(self, it.second, id_interf[it.first], false)); } - if (registerWithCC.canBeSet()) - registerWithCC.send(Void()); + if (registerWithMaster.canBeSet()) + registerWithMaster.send(Void()); return Void(); } @@ -3266,7 +3263,7 @@ ACTOR Future tLogStart(TLogData* self, InitializeTLogRequest req, Locality self->id_data[recruited.id()] = logData; logData->locality = req.locality; logData->recoveryCount = req.epoch; - logData->removed = rejoinClusterController(self, recruited, req.epoch, Future(Void()), req.isPrimary); + logData->removed = rejoinMasters(self, recruited, req.epoch, Future(Void()), req.isPrimary); self->popOrder.push_back(recruited.id()); self->spillOrder.push_back(recruited.id()); @@ -3494,13 +3491,13 @@ ACTOR Future tLog(IKeyValueStore* persistentData, // it should automatically exclude itself to avoid being used in // the new cluster. auto recoveryState = self.dbInfo->get().recoveryState; - if (recoveryState == RecoveryState::FULLY_RECOVERED && self.ccClusterId.isValid() && - self.durableClusterId.isValid() && self.ccClusterId != self.durableClusterId) { + if (recoveryState == RecoveryState::FULLY_RECOVERED && self.masterClusterId.isValid() && + self.durableClusterId.isValid() && self.masterClusterId != self.durableClusterId) { state NetworkAddress address = g_network->getLocalAddress(); wait(excludeServers(self.cx, { AddressExclusion{ address.ip, address.port } })); TraceEvent(SevWarnAlways, "TLogBelongsToExistingCluster") .detail("ClusterId", self.durableClusterId) - .detail("NewClusterId", self.ccClusterId); + .detail("NewClusterId", self.masterClusterId); } // If the tlog has a valid durable cluster ID, we don't want it to // wipe its data! 
Throw this error to signal to `tlogTerminated` to diff --git a/fdbserver/TagPartitionedLogSystem.actor.cpp b/fdbserver/TagPartitionedLogSystem.actor.cpp index c6e6d568b7..96cc285272 100644 --- a/fdbserver/TagPartitionedLogSystem.actor.cpp +++ b/fdbserver/TagPartitionedLogSystem.actor.cpp @@ -461,8 +461,8 @@ ACTOR Future TagPartitionedLogSystem::onError_internal(TagPartitionedLogSy changes.push_back(self->backupWorkerChanged.onTrigger()); ASSERT(failed.size() >= 1); - wait(quorum(changes, 1) || tagError(quorum(failed, 1), tlog_failed()) || - tagError(quorum(backupFailed, 1), backup_worker_failed())); + wait(quorum(changes, 1) || tagError(quorum(failed, 1), master_tlog_failed()) || + tagError(quorum(backupFailed, 1), master_backup_worker_failed())); } } @@ -2300,7 +2300,7 @@ ACTOR Future TagPartitionedLogSystem::recruitOldLogRouters(TagPartitionedL auto reply = transformErrors( throwErrorOr(workers[nextRouter].logRouter.getReplyUnlessFailedFor( req, SERVER_KNOBS->TLOG_TIMEOUT, SERVER_KNOBS->MASTER_FAILURE_SLOPE_DURING_RECOVERY)), - cluster_recovery_failed()); + master_recovery_failed()); logRouterInitializationReplies.back().push_back(reply); allReplies.push_back(reply); nextRouter = (nextRouter + 1) % workers.size(); @@ -2349,7 +2349,7 @@ ACTOR Future TagPartitionedLogSystem::recruitOldLogRouters(TagPartitionedL auto reply = transformErrors( throwErrorOr(workers[nextRouter].logRouter.getReplyUnlessFailedFor( req, SERVER_KNOBS->TLOG_TIMEOUT, SERVER_KNOBS->MASTER_FAILURE_SLOPE_DURING_RECOVERY)), - cluster_recovery_failed()); + master_recovery_failed()); logRouterInitializationReplies.back().push_back(reply); allReplies.push_back(reply); nextRouter = (nextRouter + 1) % workers.size(); @@ -2410,7 +2410,7 @@ ACTOR Future TagPartitionedLogSystem::recruitOldLogRouters(TagPartitionedL if (!forRemote) { self->logSystemConfigChanged.trigger(); - wait(failed.size() ? tagError(quorum(failed, 1), tlog_failed()) : Future(Never())); + wait(failed.size() ? 
tagError(quorum(failed, 1), master_tlog_failed()) : Future(Never())); throw internal_error(); } return Void(); @@ -2509,7 +2509,7 @@ ACTOR Future TagPartitionedLogSystem::newRemoteEpoch(TagPartitionedLogSyst throwErrorOr( remoteWorkers.logRouters[i % remoteWorkers.logRouters.size()].logRouter.getReplyUnlessFailedFor( req, SERVER_KNOBS->TLOG_TIMEOUT, SERVER_KNOBS->MASTER_FAILURE_SLOPE_DURING_RECOVERY)), - cluster_recovery_failed())); + master_recovery_failed())); } std::vector localTags = TagPartitionedLogSystem::getLocalTags(remoteLocality, allTags); @@ -2587,7 +2587,7 @@ ACTOR Future TagPartitionedLogSystem::newRemoteEpoch(TagPartitionedLogSyst remoteTLogInitializationReplies.push_back(transformErrors( throwErrorOr(remoteWorkers.remoteTLogs[i].tLog.getReplyUnlessFailedFor( remoteTLogReqs[i], SERVER_KNOBS->TLOG_TIMEOUT, SERVER_KNOBS->MASTER_FAILURE_SLOPE_DURING_RECOVERY)), - cluster_recovery_failed())); + master_recovery_failed())); TraceEvent("RemoteLogRecruitment_InitializingRemoteLogs") .detail("StartVersion", logSet->startVersion) @@ -2616,7 +2616,7 @@ ACTOR Future TagPartitionedLogSystem::newRemoteEpoch(TagPartitionedLogSyst TLogRecoveryFinishedRequest(), SERVER_KNOBS->TLOG_TIMEOUT, SERVER_KNOBS->MASTER_FAILURE_SLOPE_DURING_RECOVERY)), - cluster_recovery_failed())); + master_recovery_failed())); self->remoteRecoveryComplete = waitForAll(recoveryComplete); self->tLogs.push_back(logSet); @@ -2857,7 +2857,7 @@ ACTOR Future> TagPartitionedLogSystem::newEpoch( initializationReplies.push_back(transformErrors( throwErrorOr(recr.tLogs[i].tLog.getReplyUnlessFailedFor( reqs[i], SERVER_KNOBS->TLOG_TIMEOUT, SERVER_KNOBS->MASTER_FAILURE_SLOPE_DURING_RECOVERY)), - cluster_recovery_failed())); + master_recovery_failed())); state std::vector> recoveryComplete; @@ -2924,7 +2924,7 @@ ACTOR Future> TagPartitionedLogSystem::newEpoch( satelliteInitializationReplies.push_back(transformErrors( throwErrorOr(recr.satelliteTLogs[i].tLog.getReplyUnlessFailedFor( sreqs[i], SERVER_KNOBS->TLOG_TIMEOUT, SERVER_KNOBS->MASTER_FAILURE_SLOPE_DURING_RECOVERY)), - cluster_recovery_failed())); + master_recovery_failed())); wait(waitForAll(satelliteInitializationReplies) || oldRouterRecruitment); @@ -2940,7 +2940,7 @@ ACTOR Future> TagPartitionedLogSystem::newEpoch( TLogRecoveryFinishedRequest(), SERVER_KNOBS->TLOG_TIMEOUT, SERVER_KNOBS->MASTER_FAILURE_SLOPE_DURING_RECOVERY)), - cluster_recovery_failed())); + master_recovery_failed())); } wait(waitForAll(initializationReplies) || oldRouterRecruitment); @@ -2955,7 +2955,7 @@ ACTOR Future> TagPartitionedLogSystem::newEpoch( // Don't force failure of recovery if it took us a long time to recover. 
This avoids multiple long running // recoveries causing tests to timeout if (BUGGIFY && now() - startTime < 300 && g_network->isSimulated() && g_simulator.speedUpSimulation) - throw cluster_recovery_failed(); + throw master_recovery_failed(); for (int i = 0; i < logSystem->tLogs[0]->logServers.size(); i++) recoveryComplete.push_back(transformErrors( @@ -2963,7 +2963,7 @@ ACTOR Future> TagPartitionedLogSystem::newEpoch( TLogRecoveryFinishedRequest(), SERVER_KNOBS->TLOG_TIMEOUT, SERVER_KNOBS->MASTER_FAILURE_SLOPE_DURING_RECOVERY)), - cluster_recovery_failed())); + master_recovery_failed())); logSystem->recoveryComplete = waitForAll(recoveryComplete); if (configuration.usableRegions > 1) { @@ -3057,7 +3057,7 @@ ACTOR Future TagPartitionedLogSystem::lockTLog( UID myID, Reference>> tlog) { - TraceEvent("TLogLockStarted", myID).detail("TLog", tlog->get().id()).detail("InfPresent", tlog->get().present()); + TraceEvent("TLogLockStarted", myID).detail("TLog", tlog->get().id()); loop { choose { when(TLogLockResult data = wait( diff --git a/fdbserver/WorkerInterface.actor.h b/fdbserver/WorkerInterface.actor.h index baf4cece98..1da9de249d 100644 --- a/fdbserver/WorkerInterface.actor.h +++ b/fdbserver/WorkerInterface.actor.h @@ -163,25 +163,17 @@ struct ClusterControllerFullInterface { RequestStream getServerDBInfo; // only used by testers; the cluster controller will send the serverDBInfo to workers RequestStream updateWorkerHealth; - RequestStream - tlogRejoin; // sent by tlog (whether or not rebooted) to communicate with a new controller - RequestStream notifyBackupWorkerDone; - RequestStream changeCoordinators; UID id() const { return clientInterface.id(); } bool operator==(ClusterControllerFullInterface const& r) const { return id() == r.id(); } bool operator!=(ClusterControllerFullInterface const& r) const { return id() != r.id(); } - NetworkAddress address() const { return clientInterface.address(); } - bool hasMessage() const { return clientInterface.hasMessage() || recruitFromConfiguration.getFuture().isReady() || recruitRemoteFromConfiguration.getFuture().isReady() || recruitStorage.getFuture().isReady() || recruitBlobWorker.getFuture().isReady() || registerWorker.getFuture().isReady() || getWorkers.getFuture().isReady() || registerMaster.getFuture().isReady() || - getServerDBInfo.getFuture().isReady() || updateWorkerHealth.getFuture().isReady() || - tlogRejoin.getFuture().isReady() || notifyBackupWorkerDone.getFuture().isReady() || - changeCoordinators.getFuture().isReady(); + getServerDBInfo.getFuture().isReady() || updateWorkerHealth.getFuture().isReady(); } void initEndpoints() { @@ -195,9 +187,6 @@ struct ClusterControllerFullInterface { registerMaster.getEndpoint(TaskPriority::ClusterControllerRegister); getServerDBInfo.getEndpoint(TaskPriority::ClusterController); updateWorkerHealth.getEndpoint(TaskPriority::ClusterController); - tlogRejoin.getEndpoint(TaskPriority::MasterTLogRejoin); - notifyBackupWorkerDone.getEndpoint(TaskPriority::ClusterController); - changeCoordinators.getEndpoint(TaskPriority::DefaultEndpoint); } template @@ -215,10 +204,7 @@ struct ClusterControllerFullInterface { getWorkers, registerMaster, getServerDBInfo, - updateWorkerHealth, - tlogRejoin, - notifyBackupWorkerDone, - changeCoordinators); + updateWorkerHealth); } }; @@ -337,11 +323,10 @@ struct RecruitRemoteFromConfigurationReply { constexpr static FileIdentifier file_identifier = 9091392; std::vector remoteTLogs; std::vector logRouters; - Optional dbgId; template void serialize(Ar& ar) { - serializer(ar, 
remoteTLogs, logRouters, dbgId); + serializer(ar, remoteTLogs, logRouters); } }; @@ -351,7 +336,6 @@ struct RecruitRemoteFromConfigurationRequest { Optional dcId; int logRouterCount; std::vector exclusionWorkerIds; - Optional dbgId; ReplyPromise reply; RecruitRemoteFromConfigurationRequest() {} @@ -364,7 +348,7 @@ struct RecruitRemoteFromConfigurationRequest { template void serialize(Ar& ar) { - serializer(ar, configuration, dcId, logRouterCount, exclusionWorkerIds, dbgId, reply); + serializer(ar, configuration, dcId, logRouterCount, exclusionWorkerIds, reply); } }; @@ -502,49 +486,6 @@ struct UpdateWorkerHealthRequest { } }; -struct TLogRejoinReply { - constexpr static FileIdentifier file_identifier = 11; - - // false means someone else registered, so we should re-register. true means this master is recovered, so don't - // send again to the same master. - bool masterIsRecovered; - TLogRejoinReply() = default; - explicit TLogRejoinReply(bool masterIsRecovered) : masterIsRecovered(masterIsRecovered) {} - - template - void serialize(Ar& ar) { - serializer(ar, masterIsRecovered); - } -}; - -struct TLogRejoinRequest { - constexpr static FileIdentifier file_identifier = 15692200; - TLogInterface myInterface; - ReplyPromise reply; - - TLogRejoinRequest() {} - explicit TLogRejoinRequest(const TLogInterface& interf) : myInterface(interf) {} - template - void serialize(Ar& ar) { - serializer(ar, myInterface, reply); - } -}; - -struct BackupWorkerDoneRequest { - constexpr static FileIdentifier file_identifier = 8736351; - UID workerUID; - LogEpoch backupEpoch; - ReplyPromise reply; - - BackupWorkerDoneRequest() : workerUID(), backupEpoch(-1) {} - BackupWorkerDoneRequest(UID id, LogEpoch epoch) : workerUID(id), backupEpoch(epoch) {} - - template - void serialize(Ar& ar) { - serializer(ar, workerUID, backupEpoch, reply); - } -}; - struct InitializeTLogRequest { constexpr static FileIdentifier file_identifier = 15604392; UID recruitmentID; @@ -664,7 +605,6 @@ struct RecruitMasterRequest { struct InitializeCommitProxyRequest { constexpr static FileIdentifier file_identifier = 10344153; MasterInterface master; - LifetimeToken masterLifetime; uint64_t recoveryCount; Version recoveryTransactionVersion; bool firstProxy; @@ -672,20 +612,19 @@ struct InitializeCommitProxyRequest { template void serialize(Ar& ar) { - serializer(ar, master, masterLifetime, recoveryCount, recoveryTransactionVersion, firstProxy, reply); + serializer(ar, master, recoveryCount, recoveryTransactionVersion, firstProxy, reply); } }; struct InitializeGrvProxyRequest { constexpr static FileIdentifier file_identifier = 8265613; MasterInterface master; - LifetimeToken masterLifetime; uint64_t recoveryCount; ReplyPromise reply; template void serialize(Ar& ar) { - serializer(ar, master, masterLifetime, recoveryCount, reply); + serializer(ar, master, recoveryCount, reply); } }; diff --git a/fdbserver/masterserver.actor.cpp b/fdbserver/masterserver.actor.cpp index 4abae4d5e8..426e9ebc49 100644 --- a/fdbserver/masterserver.actor.cpp +++ b/fdbserver/masterserver.actor.cpp @@ -36,6 +36,7 @@ #include "fdbserver/DataDistribution.actor.h" #include "fdbserver/IKeyValueStore.h" #include "fdbserver/Knobs.h" +#include "fdbserver/LogSystem.h" #include "fdbserver/LogSystemDiskQueueAdapter.h" #include "fdbserver/MasterInterface.h" #include "fdbserver/ProxyCommitData.actor.h" @@ -48,41 +49,216 @@ #include "flow/actorcompiler.h" // This must be the last #include. 
+struct CommitProxyVersionReplies { + std::map replies; + NotifiedVersion latestRequestNum; + + CommitProxyVersionReplies(CommitProxyVersionReplies&& r) noexcept + : replies(std::move(r.replies)), latestRequestNum(std::move(r.latestRequestNum)) {} + void operator=(CommitProxyVersionReplies&& r) noexcept { + replies = std::move(r.replies); + latestRequestNum = std::move(r.latestRequestNum); + } + + CommitProxyVersionReplies() : latestRequestNum(0) {} +}; + +ACTOR Future masterTerminateOnConflict(UID dbgid, + Promise fullyRecovered, + Future onConflict, + Future switchedState) { + choose { + when(wait(onConflict)) { + if (!fullyRecovered.isSet()) { + TraceEvent("MasterTerminated", dbgid).detail("Reason", "Conflict"); + TEST(true); // Coordinated state conflict, master dying + throw worker_removed(); + } + return Void(); + } + when(wait(switchedState)) { return Void(); } + } +} + +class ReusableCoordinatedState : NonCopyable { +public: + Promise fullyRecovered; + DBCoreState prevDBState; + DBCoreState myDBState; + bool finalWriteStarted; + Future previousWrite; + + ReusableCoordinatedState(ServerCoordinators const& coordinators, + PromiseStream> const& addActor, + UID const& dbgid) + : finalWriteStarted(false), previousWrite(Void()), cstate(coordinators), coordinators(coordinators), + addActor(addActor), dbgid(dbgid) {} + + Future read() { return _read(this); } + + Future write(DBCoreState newState, bool finalWrite = false) { + previousWrite = _write(this, newState, finalWrite); + return previousWrite; + } + + Future move(ClusterConnectionString const& nc) { return cstate.move(nc); } + +private: + MovableCoordinatedState cstate; + ServerCoordinators coordinators; + PromiseStream> addActor; + Promise switchedState; + UID dbgid; + + ACTOR Future _read(ReusableCoordinatedState* self) { + Value prevDBStateRaw = wait(self->cstate.read()); + Future onConflict = masterTerminateOnConflict( + self->dbgid, self->fullyRecovered, self->cstate.onConflict(), self->switchedState.getFuture()); + if (onConflict.isReady() && onConflict.isError()) { + throw onConflict.getError(); + } + self->addActor.send(onConflict); + + if (prevDBStateRaw.size()) { + self->prevDBState = BinaryReader::fromStringRef(prevDBStateRaw, IncludeVersion()); + self->myDBState = self->prevDBState; + } + + return Void(); + } + + ACTOR Future _write(ReusableCoordinatedState* self, DBCoreState newState, bool finalWrite) { + if (self->finalWriteStarted) { + wait(Future(Never())); + } + + if (finalWrite) { + self->finalWriteStarted = true; + } + + try { + wait(self->cstate.setExclusive( + BinaryWriter::toValue(newState, IncludeVersion(ProtocolVersion::withDBCoreState())))); + } catch (Error& e) { + TEST(true); // Master displaced during writeMasterState + throw; + } + + self->myDBState = newState; + + if (!finalWrite) { + self->switchedState.send(Void()); + self->cstate = MovableCoordinatedState(self->coordinators); + Value rereadDBStateRaw = wait(self->cstate.read()); + DBCoreState readState; + if (rereadDBStateRaw.size()) + readState = BinaryReader::fromStringRef(rereadDBStateRaw, IncludeVersion()); + + if (readState != newState) { + TraceEvent("MasterTerminated", self->dbgid).detail("Reason", "CStateChanged"); + TEST(true); // Coordinated state changed between writing and reading, master dying + throw worker_removed(); + } + self->switchedState = Promise(); + self->addActor.send(masterTerminateOnConflict( + self->dbgid, self->fullyRecovered, self->cstate.onConflict(), self->switchedState.getFuture())); + } else { + 
self->fullyRecovered.send(Void()); + } + + return Void(); + } +}; + struct MasterData : NonCopyable, ReferenceCounted { UID dbgid; + AsyncTrigger registrationTrigger; Version lastEpochEnd, // The last version in the old epoch not (to be) rolled back in this recovery recoveryTransactionVersion; // The first version in this epoch + double lastCommitTime; Version liveCommittedVersion; // The largest live committed version reported by commit proxies. bool databaseLocked; Optional proxyMetadataVersion; Version minKnownCommittedVersion; + DatabaseConfiguration originalConfiguration; + DatabaseConfiguration configuration; + std::vector> primaryDcId; + std::vector> remoteDcIds; + bool hasConfiguration; + ServerCoordinators coordinators; + Reference logSystem; Version version; // The last version assigned to a proxy by getVersion() double lastVersionTime; + LogSystemDiskQueueAdapter* txnStateLogAdapter; IKeyValueStore* txnStateStore; + int64_t memoryLimit; + std::map, int8_t> dcId_locality; + std::vector allTags; + + int8_t getNextLocality() { + int8_t maxLocality = -1; + for (auto it : dcId_locality) { + maxLocality = std::max(maxLocality, it.second); + } + return maxLocality + 1; + } std::vector commitProxies; + std::vector provisionalCommitProxies; + std::vector grvProxies; + std::vector provisionalGrvProxies; + std::vector resolvers; + std::map lastCommitProxyVersionReplies; + UID clusterId; + Standalone dbId; + MasterInterface myInterface; + const ClusterControllerFullInterface + clusterController; // If the cluster controller changes, this master will die, so this is immutable. + + ReusableCoordinatedState cstate; + Promise recoveryReadyForCommits; + Promise cstateUpdated; + Reference const> dbInfo; + int64_t registrationCount; // Number of different MasterRegistrationRequests sent to clusterController + + RecoveryState recoveryState; AsyncVar>> resolverChanges; Version resolverChangesVersion; std::set resolverNeedingChanges; + PromiseStream> addActor; + Reference> recruitmentStalled; bool forceRecovery; + bool neverCreated; + int8_t safeLocality; + int8_t primaryLocality; + + std::vector backupWorkers; // Recruited backup workers from cluster controller. 
CounterCollection cc; + Counter changeCoordinatorsRequests; Counter getCommitVersionRequests; + Counter backupWorkerDoneRequests; Counter getLiveCommittedVersionRequests; Counter reportLiveCommittedVersionRequests; Future logger; + Reference masterRecoveryStateEventHolder; + Reference masterRecoveryGenerationsEventHolder; + Reference masterRecoveryDurationEventHolder; + Reference masterRecoveryAvailableEventHolder; + Reference recoveredConfigEventHolder; + MasterData(Reference const> const& dbInfo, MasterInterface const& myInterface, ServerCoordinators const& coordinators, @@ -92,12 +268,23 @@ struct MasterData : NonCopyable, ReferenceCounted { bool forceRecovery) : dbgid(myInterface.id()), lastEpochEnd(invalidVersion), recoveryTransactionVersion(invalidVersion), - liveCommittedVersion(invalidVersion), databaseLocked(false), minKnownCommittedVersion(invalidVersion), - coordinators(coordinators), version(invalidVersion), lastVersionTime(0), txnStateStore(nullptr), - myInterface(myInterface), forceRecovery(forceRecovery), cc("Master", dbgid.toString()), + lastCommitTime(0), liveCommittedVersion(invalidVersion), databaseLocked(false), + minKnownCommittedVersion(invalidVersion), hasConfiguration(false), coordinators(coordinators), + version(invalidVersion), lastVersionTime(0), txnStateStore(nullptr), memoryLimit(2e9), dbId(dbId), + myInterface(myInterface), clusterController(clusterController), cstate(coordinators, addActor, dbgid), + dbInfo(dbInfo), registrationCount(0), addActor(addActor), + recruitmentStalled(makeReference>(false)), forceRecovery(forceRecovery), neverCreated(false), + safeLocality(tagLocalityInvalid), primaryLocality(tagLocalityInvalid), cc("Master", dbgid.toString()), + changeCoordinatorsRequests("ChangeCoordinatorsRequests", cc), getCommitVersionRequests("GetCommitVersionRequests", cc), + backupWorkerDoneRequests("BackupWorkerDoneRequests", cc), getLiveCommittedVersionRequests("GetLiveCommittedVersionRequests", cc), - reportLiveCommittedVersionRequests("ReportLiveCommittedVersionRequests", cc) { + reportLiveCommittedVersionRequests("ReportLiveCommittedVersionRequests", cc), + masterRecoveryStateEventHolder(makeReference("MasterRecoveryState")), + masterRecoveryGenerationsEventHolder(makeReference("MasterRecoveryGenerations")), + masterRecoveryDurationEventHolder(makeReference("MasterRecoveryDuration")), + masterRecoveryAvailableEventHolder(makeReference("MasterRecoveryAvailable")), + recoveredConfigEventHolder(makeReference("RecoveredConfig")) { logger = traceCounters("MasterMetrics", dbgid, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, &cc, "MasterMetrics"); if (forceRecovery && !myInterface.locality.dcId().present()) { TraceEvent(SevError, "ForcedRecoveryRequiresDcID").log(); @@ -110,6 +297,866 @@ struct MasterData : NonCopyable, ReferenceCounted { } }; +ACTOR Future newCommitProxies(Reference self, RecruitFromConfigurationReply recr) { + std::vector> initializationReplies; + for (int i = 0; i < recr.commitProxies.size(); i++) { + InitializeCommitProxyRequest req; + req.master = self->myInterface; + req.recoveryCount = self->cstate.myDBState.recoveryCount + 1; + req.recoveryTransactionVersion = self->recoveryTransactionVersion; + req.firstProxy = i == 0; + TraceEvent("CommitProxyReplies", self->dbgid) + .detail("WorkerID", recr.commitProxies[i].id()) + .detail("FirstProxy", req.firstProxy ? 
"True" : "False"); + initializationReplies.push_back( + transformErrors(throwErrorOr(recr.commitProxies[i].commitProxy.getReplyUnlessFailedFor( + req, SERVER_KNOBS->TLOG_TIMEOUT, SERVER_KNOBS->MASTER_FAILURE_SLOPE_DURING_RECOVERY)), + master_recovery_failed())); + } + + std::vector newRecruits = wait(getAll(initializationReplies)); + // It is required for the correctness of COMMIT_ON_FIRST_PROXY that self->commitProxies[0] is the firstCommitProxy. + self->commitProxies = newRecruits; + + return Void(); +} + +ACTOR Future newGrvProxies(Reference self, RecruitFromConfigurationReply recr) { + std::vector> initializationReplies; + for (int i = 0; i < recr.grvProxies.size(); i++) { + InitializeGrvProxyRequest req; + req.master = self->myInterface; + req.recoveryCount = self->cstate.myDBState.recoveryCount + 1; + TraceEvent("GrvProxyReplies", self->dbgid).detail("WorkerID", recr.grvProxies[i].id()); + initializationReplies.push_back( + transformErrors(throwErrorOr(recr.grvProxies[i].grvProxy.getReplyUnlessFailedFor( + req, SERVER_KNOBS->TLOG_TIMEOUT, SERVER_KNOBS->MASTER_FAILURE_SLOPE_DURING_RECOVERY)), + master_recovery_failed())); + } + + std::vector newRecruits = wait(getAll(initializationReplies)); + self->grvProxies = newRecruits; + return Void(); +} + +ACTOR Future newResolvers(Reference self, RecruitFromConfigurationReply recr) { + std::vector> initializationReplies; + for (int i = 0; i < recr.resolvers.size(); i++) { + InitializeResolverRequest req; + req.recoveryCount = self->cstate.myDBState.recoveryCount + 1; + req.commitProxyCount = recr.commitProxies.size(); + req.resolverCount = recr.resolvers.size(); + TraceEvent("ResolverReplies", self->dbgid).detail("WorkerID", recr.resolvers[i].id()); + initializationReplies.push_back( + transformErrors(throwErrorOr(recr.resolvers[i].resolver.getReplyUnlessFailedFor( + req, SERVER_KNOBS->TLOG_TIMEOUT, SERVER_KNOBS->MASTER_FAILURE_SLOPE_DURING_RECOVERY)), + master_recovery_failed())); + } + + std::vector newRecruits = wait(getAll(initializationReplies)); + self->resolvers = newRecruits; + + return Void(); +} + +ACTOR Future newTLogServers(Reference self, + RecruitFromConfigurationReply recr, + Reference oldLogSystem, + std::vector>* initialConfChanges) { + if (self->configuration.usableRegions > 1) { + state Optional remoteDcId = self->remoteDcIds.size() ? 
self->remoteDcIds[0] : Optional(); + if (!self->dcId_locality.count(recr.dcId)) { + int8_t loc = self->getNextLocality(); + Standalone tr; + tr.set(tr.arena(), tagLocalityListKeyFor(recr.dcId), tagLocalityListValue(loc)); + initialConfChanges->push_back(tr); + self->dcId_locality[recr.dcId] = loc; + TraceEvent(SevWarn, "UnknownPrimaryDCID", self->dbgid).detail("PrimaryId", recr.dcId).detail("Loc", loc); + } + + if (!self->dcId_locality.count(remoteDcId)) { + int8_t loc = self->getNextLocality(); + Standalone tr; + tr.set(tr.arena(), tagLocalityListKeyFor(remoteDcId), tagLocalityListValue(loc)); + initialConfChanges->push_back(tr); + self->dcId_locality[remoteDcId] = loc; + TraceEvent(SevWarn, "UnknownRemoteDCID", self->dbgid).detail("RemoteId", remoteDcId).detail("Loc", loc); + } + + std::vector exclusionWorkerIds; + std::transform(recr.tLogs.begin(), + recr.tLogs.end(), + std::back_inserter(exclusionWorkerIds), + [](const WorkerInterface& in) { return in.id(); }); + std::transform(recr.satelliteTLogs.begin(), + recr.satelliteTLogs.end(), + std::back_inserter(exclusionWorkerIds), + [](const WorkerInterface& in) { return in.id(); }); + Future fRemoteWorkers = brokenPromiseToNever( + self->clusterController.recruitRemoteFromConfiguration.getReply(RecruitRemoteFromConfigurationRequest( + self->configuration, + remoteDcId, + recr.tLogs.size() * + std::max(1, self->configuration.desiredLogRouterCount / std::max(1, recr.tLogs.size())), + exclusionWorkerIds))); + + self->primaryLocality = self->dcId_locality[recr.dcId]; + self->logSystem = Reference(); // Cancels the actors in the previous log system. + Reference newLogSystem = wait(oldLogSystem->newEpoch(recr, + fRemoteWorkers, + self->clusterId, + self->configuration, + self->cstate.myDBState.recoveryCount + 1, + self->primaryLocality, + self->dcId_locality[remoteDcId], + self->allTags, + self->recruitmentStalled)); + self->logSystem = newLogSystem; + } else { + self->primaryLocality = tagLocalitySpecial; + self->logSystem = Reference(); // Cancels the actors in the previous log system. + Reference newLogSystem = wait(oldLogSystem->newEpoch(recr, + Never(), + self->clusterId, + self->configuration, + self->cstate.myDBState.recoveryCount + 1, + self->primaryLocality, + tagLocalitySpecial, + self->allTags, + self->recruitmentStalled)); + self->logSystem = newLogSystem; + } + return Void(); +} + +ACTOR Future newSeedServers(Reference self, + RecruitFromConfigurationReply recruits, + std::vector* servers) { + // This is only necessary if the database is at version 0 + servers->clear(); + if (self->lastEpochEnd) + return Void(); + + state int idx = 0; + state std::map, Tag> dcId_tags; + state int8_t nextLocality = 0; + while (idx < recruits.storageServers.size()) { + TraceEvent("MasterRecruitingInitialStorageServer", self->dbgid) + .detail("CandidateWorker", recruits.storageServers[idx].locality.toString()); + + InitializeStorageRequest isr; + isr.seedTag = dcId_tags.count(recruits.storageServers[idx].locality.dcId()) + ? 
dcId_tags[recruits.storageServers[idx].locality.dcId()] + : Tag(nextLocality, 0); + isr.storeType = self->configuration.storageServerStoreType; + isr.reqId = deterministicRandom()->randomUniqueID(); + isr.interfaceId = deterministicRandom()->randomUniqueID(); + isr.clusterId = self->clusterId; + + ErrorOr newServer = wait(recruits.storageServers[idx].storage.tryGetReply(isr)); + + if (newServer.isError()) { + if (!newServer.isError(error_code_recruitment_failed) && + !newServer.isError(error_code_request_maybe_delivered)) + throw newServer.getError(); + + TEST(true); // masterserver initial storage recuitment loop failed to get new server + wait(delay(SERVER_KNOBS->STORAGE_RECRUITMENT_DELAY)); + } else { + if (!dcId_tags.count(recruits.storageServers[idx].locality.dcId())) { + dcId_tags[recruits.storageServers[idx].locality.dcId()] = Tag(nextLocality, 0); + nextLocality++; + } + + Tag& tag = dcId_tags[recruits.storageServers[idx].locality.dcId()]; + tag.id++; + idx++; + + servers->push_back(newServer.get().interf); + } + } + + self->dcId_locality.clear(); + for (auto& it : dcId_tags) { + self->dcId_locality[it.first] = it.second.locality; + } + + TraceEvent("MasterRecruitedInitialStorageServers", self->dbgid) + .detail("TargetCount", self->configuration.storageTeamSize) + .detail("Servers", describe(*servers)); + + return Void(); +} + +Future waitCommitProxyFailure(std::vector const& commitProxies) { + std::vector> failed; + failed.reserve(commitProxies.size()); + for (auto commitProxy : commitProxies) { + failed.push_back(waitFailureClient(commitProxy.waitFailure, + SERVER_KNOBS->TLOG_TIMEOUT, + -SERVER_KNOBS->TLOG_TIMEOUT / SERVER_KNOBS->SECONDS_BEFORE_NO_FAILURE_DELAY, + /*trace=*/true)); + } + ASSERT(failed.size() >= 1); + return tagError(quorum(failed, 1), commit_proxy_failed()); +} + +Future waitGrvProxyFailure(std::vector const& grvProxies) { + std::vector> failed; + failed.reserve(grvProxies.size()); + for (int i = 0; i < grvProxies.size(); i++) + failed.push_back(waitFailureClient(grvProxies[i].waitFailure, + SERVER_KNOBS->TLOG_TIMEOUT, + -SERVER_KNOBS->TLOG_TIMEOUT / SERVER_KNOBS->SECONDS_BEFORE_NO_FAILURE_DELAY, + /*trace=*/true)); + ASSERT(failed.size() >= 1); + return tagError(quorum(failed, 1), grv_proxy_failed()); +} + +Future waitResolverFailure(std::vector const& resolvers) { + std::vector> failed; + failed.reserve(resolvers.size()); + for (auto resolver : resolvers) { + failed.push_back(waitFailureClient(resolver.waitFailure, + SERVER_KNOBS->TLOG_TIMEOUT, + -SERVER_KNOBS->TLOG_TIMEOUT / SERVER_KNOBS->SECONDS_BEFORE_NO_FAILURE_DELAY, + /*trace=*/true)); + } + ASSERT(failed.size() >= 1); + return tagError(quorum(failed, 1), master_resolver_failed()); +} + +ACTOR Future updateLogsValue(Reference self, Database cx) { + state Transaction tr(cx); + loop { + try { + Optional> value = wait(tr.get(logsKey)); + ASSERT(value.present()); + auto logs = decodeLogsValue(value.get()); + + std::set logIds; + for (auto& log : logs.first) { + logIds.insert(log.first); + } + + bool found = false; + for (auto& logSet : self->logSystem->getLogSystemConfig().tLogs) { + for (auto& log : logSet.tLogs) { + if (logIds.count(log.id())) { + found = true; + break; + } + } + if (found) { + break; + } + } + + if (!found) { + TEST(true); // old master attempted to change logsKey + return Void(); + } + + tr.set(logsKey, self->logSystem->getLogsValue()); + wait(tr.commit()); + return Void(); + } catch (Error& e) { + wait(tr.onError(e)); + } + } +} + +Future sendMasterRegistration(MasterData* self, + 
LogSystemConfig const& logSystemConfig, + std::vector commitProxies, + std::vector grvProxies, + std::vector resolvers, + DBRecoveryCount recoveryCount, + std::vector priorCommittedLogServers) { + RegisterMasterRequest masterReq; + masterReq.id = self->myInterface.id(); + masterReq.mi = self->myInterface.locality; + masterReq.logSystemConfig = logSystemConfig; + masterReq.commitProxies = commitProxies; + masterReq.grvProxies = grvProxies; + masterReq.resolvers = resolvers; + masterReq.recoveryCount = recoveryCount; + if (self->hasConfiguration) + masterReq.configuration = self->configuration; + masterReq.registrationCount = ++self->registrationCount; + masterReq.priorCommittedLogServers = priorCommittedLogServers; + masterReq.recoveryState = self->recoveryState; + masterReq.recoveryStalled = self->recruitmentStalled->get(); + masterReq.clusterId = self->clusterId; + return brokenPromiseToNever(self->clusterController.registerMaster.getReply(masterReq)); +} + +ACTOR Future updateRegistration(Reference self, Reference logSystem) { + state Database cx = openDBOnServer(self->dbInfo, TaskPriority::DefaultEndpoint, LockAware::True); + state Future trigger = self->registrationTrigger.onTrigger(); + state Future updateLogsKey; + + loop { + wait(trigger); + wait(delay(.001)); // Coalesce multiple changes + + trigger = self->registrationTrigger.onTrigger(); + + auto logSystemConfig = logSystem->getLogSystemConfig(); + TraceEvent("MasterUpdateRegistration", self->dbgid) + .detail("RecoveryCount", self->cstate.myDBState.recoveryCount) + .detail("OldestBackupEpoch", logSystemConfig.oldestBackupEpoch) + .detail("Logs", describe(logSystemConfig.tLogs)) + .detail("CStateUpdated", self->cstateUpdated.isSet()) + .detail("RecoveryState", self->recoveryState); + + if (!self->cstateUpdated.isSet()) { + wait(sendMasterRegistration(self.getPtr(), + logSystemConfig, + self->provisionalCommitProxies, + self->provisionalGrvProxies, + self->resolvers, + self->cstate.myDBState.recoveryCount, + self->cstate.prevDBState.getPriorCommittedLogServers())); + } else if (self->recoveryState >= RecoveryState::ACCEPTING_COMMITS) { + updateLogsKey = updateLogsValue(self, cx); + wait(sendMasterRegistration(self.getPtr(), + logSystemConfig, + self->commitProxies, + self->grvProxies, + self->resolvers, + self->cstate.myDBState.recoveryCount, + std::vector())); + } else { + // The master should enter the accepting commits phase soon, and then we will register again + TEST(true); // cstate is updated but we aren't accepting commits yet + } + } +} + +ACTOR Future> provisionalMaster(Reference parent, Future activate) { + wait(activate); + + // Register a fake commit proxy (to be provided right here) to make ourselves available to clients + parent->provisionalCommitProxies = std::vector(1); + parent->provisionalCommitProxies[0].provisional = true; + parent->provisionalCommitProxies[0].initEndpoints(); + parent->provisionalGrvProxies = std::vector(1); + parent->provisionalGrvProxies[0].provisional = true; + parent->provisionalGrvProxies[0].initEndpoints(); + state Future waitCommitProxyFailure = + waitFailureServer(parent->provisionalCommitProxies[0].waitFailure.getFuture()); + state Future waitGrvProxyFailure = + waitFailureServer(parent->provisionalGrvProxies[0].waitFailure.getFuture()); + parent->registrationTrigger.trigger(); + + auto lockedKey = parent->txnStateStore->readValue(databaseLockedKey).get(); + state bool locked = lockedKey.present() && lockedKey.get().size(); + + state Optional metadataVersion = 
parent->txnStateStore->readValue(metadataVersionKey).get(); + + // We respond to a minimal subset of the commit proxy protocol. Our sole purpose is to receive a single write-only + // transaction which might repair our configuration, and return it. + loop choose { + when(GetReadVersionRequest req = + waitNext(parent->provisionalGrvProxies[0].getConsistentReadVersion.getFuture())) { + if ((req.flags & GetReadVersionRequest::FLAG_CAUSAL_READ_RISKY) && + (req.flags & GetReadVersionRequest::FLAG_USE_PROVISIONAL_PROXIES) && parent->lastEpochEnd) { + GetReadVersionReply rep; + rep.version = parent->lastEpochEnd; + rep.locked = locked; + rep.metadataVersion = metadataVersion; + req.reply.send(rep); + } else + req.reply.send(Never()); // We can't perform causally consistent reads without recovering + } + when(CommitTransactionRequest req = waitNext(parent->provisionalCommitProxies[0].commit.getFuture())) { + req.reply.send(Never()); // don't reply (clients always get commit_unknown_result) + auto t = &req.transaction; + if (t->read_snapshot == parent->lastEpochEnd && //< So no transactions can fall between the read snapshot + // and the recovery transaction this (might) be merged with + // vvv and also the changes we will make in the recovery + // transaction (most notably to lastEpochEndKey) BEFORE we + // merge initialConfChanges won't conflict + !std::any_of(t->read_conflict_ranges.begin(), t->read_conflict_ranges.end(), [](KeyRangeRef const& r) { + return r.contains(lastEpochEndKey); + })) { + for (auto m = t->mutations.begin(); m != t->mutations.end(); ++m) { + TraceEvent("PM_CTM", parent->dbgid) + .detail("MType", m->type) + .detail("Param1", m->param1) + .detail("Param2", m->param2); + if (isMetadataMutation(*m)) { + // We keep the mutations and write conflict ranges from this transaction, but not its read + // conflict ranges + Standalone out; + out.read_snapshot = invalidVersion; + out.mutations.append_deep(out.arena(), t->mutations.begin(), t->mutations.size()); + out.write_conflict_ranges.append_deep( + out.arena(), t->write_conflict_ranges.begin(), t->write_conflict_ranges.size()); + return out; + } + } + } + } + when(GetKeyServerLocationsRequest req = + waitNext(parent->provisionalCommitProxies[0].getKeyServersLocations.getFuture())) { + req.reply.send(Never()); + } + when(wait(waitCommitProxyFailure)) { throw worker_removed(); } + when(wait(waitGrvProxyFailure)) { throw worker_removed(); } + } +} + +ACTOR Future>> recruitEverything( + Reference self, + std::vector* seedServers, + Reference oldLogSystem) { + if (!self->configuration.isValid()) { + RecoveryStatus::RecoveryStatus status; + if (self->configuration.initialized) { + TraceEvent(SevWarn, "MasterRecoveryInvalidConfiguration", self->dbgid) + .setMaxEventLength(11000) + .setMaxFieldLength(10000) + .detail("Conf", self->configuration.toString()); + status = RecoveryStatus::configuration_invalid; + } else if (!self->cstate.prevDBState.tLogs.size()) { + status = RecoveryStatus::configuration_never_created; + self->neverCreated = true; + } else { + status = RecoveryStatus::configuration_missing; + } + TraceEvent("MasterRecoveryState", self->dbgid) + .detail("StatusCode", status) + .detail("Status", RecoveryStatus::names[status]) + .trackLatest(self->masterRecoveryStateEventHolder->trackingKey); + return Never(); + } else + TraceEvent("MasterRecoveryState", self->dbgid) + .detail("StatusCode", RecoveryStatus::recruiting_transaction_servers) + .detail("Status", RecoveryStatus::names[RecoveryStatus::recruiting_transaction_servers]) + 
.detail("Conf", self->configuration.toString()) + .detail("RequiredCommitProxies", 1) + .detail("RequiredGrvProxies", 1) + .detail("RequiredResolvers", 1) + .trackLatest(self->masterRecoveryStateEventHolder->trackingKey); + + // FIXME: we only need log routers for the same locality as the master + int maxLogRouters = self->cstate.prevDBState.logRouterTags; + for (auto& old : self->cstate.prevDBState.oldTLogData) { + maxLogRouters = std::max(maxLogRouters, old.logRouterTags); + } + + state RecruitFromConfigurationReply recruits = + wait(brokenPromiseToNever(self->clusterController.recruitFromConfiguration.getReply( + RecruitFromConfigurationRequest(self->configuration, self->lastEpochEnd == 0, maxLogRouters)))); + + std::string primaryDcIds, remoteDcIds; + + self->primaryDcId.clear(); + self->remoteDcIds.clear(); + if (recruits.dcId.present()) { + self->primaryDcId.push_back(recruits.dcId); + if (!primaryDcIds.empty()) { + primaryDcIds += ','; + } + primaryDcIds += printable(recruits.dcId); + if (self->configuration.regions.size() > 1) { + Key remoteDcId = recruits.dcId.get() == self->configuration.regions[0].dcId + ? self->configuration.regions[1].dcId + : self->configuration.regions[0].dcId; + self->remoteDcIds.push_back(remoteDcId); + if (!remoteDcIds.empty()) { + remoteDcIds += ','; + } + remoteDcIds += printable(remoteDcId); + } + } + self->backupWorkers.swap(recruits.backupWorkers); + + TraceEvent("MasterRecoveryState", self->dbgid) + .detail("StatusCode", RecoveryStatus::initializing_transaction_servers) + .detail("Status", RecoveryStatus::names[RecoveryStatus::initializing_transaction_servers]) + .detail("CommitProxies", recruits.commitProxies.size()) + .detail("GrvProxies", recruits.grvProxies.size()) + .detail("TLogs", recruits.tLogs.size()) + .detail("Resolvers", recruits.resolvers.size()) + .detail("SatelliteTLogs", recruits.satelliteTLogs.size()) + .detail("OldLogRouters", recruits.oldLogRouters.size()) + .detail("StorageServers", recruits.storageServers.size()) + .detail("BackupWorkers", self->backupWorkers.size()) + .detail("PrimaryDcIds", primaryDcIds) + .detail("RemoteDcIds", remoteDcIds) + .trackLatest(self->masterRecoveryStateEventHolder->trackingKey); + + // Actually, newSeedServers does both the recruiting and initialization of the seed servers; so if this is a brand + // new database we are sort of lying that we are past the recruitment phase. In a perfect world we would split that + // up so that the recruitment part happens above (in parallel with recruiting the transaction servers?). 
+ wait(newSeedServers(self, recruits, seedServers)); + state std::vector> confChanges; + wait(newCommitProxies(self, recruits) && newGrvProxies(self, recruits) && newResolvers(self, recruits) && + newTLogServers(self, recruits, oldLogSystem, &confChanges)); + return confChanges; +} + +ACTOR Future updateLocalityForDcId(Optional dcId, + Reference oldLogSystem, + Reference> locality) { + loop { + std::pair loc = oldLogSystem->getLogSystemConfig().getLocalityForDcId(dcId); + Version ver = locality->get().knownCommittedVersion; + if (ver == invalidVersion) { + ver = oldLogSystem->getKnownCommittedVersion(); + } + locality->set(PeekTxsInfo(loc.first, loc.second, ver)); + TraceEvent("UpdatedLocalityForDcId") + .detail("DcId", dcId) + .detail("Locality0", loc.first) + .detail("Locality1", loc.second) + .detail("Version", ver); + wait(oldLogSystem->onLogSystemConfigChange() || oldLogSystem->onKnownCommittedVersionChange()); + } +} + +ACTOR Future readTransactionSystemState(Reference self, + Reference oldLogSystem, + Version txsPoppedVersion) { + state Reference> myLocality = Reference>( + new AsyncVar(PeekTxsInfo(tagLocalityInvalid, tagLocalityInvalid, invalidVersion))); + state Future localityUpdater = + updateLocalityForDcId(self->myInterface.locality.dcId(), oldLogSystem, myLocality); + // Peek the txnStateTag in oldLogSystem and recover self->txnStateStore + + // For now, we also obtain the recovery metadata that the log system obtained during the end_epoch process for + // comparison + + // Sets self->lastEpochEnd and self->recoveryTransactionVersion + // Sets self->configuration to the configuration (FF/conf/ keys) at self->lastEpochEnd + + // Recover transaction state store + if (self->txnStateStore) + self->txnStateStore->close(); + self->txnStateLogAdapter = openDiskQueueAdapter(oldLogSystem, myLocality, txsPoppedVersion); + self->txnStateStore = + keyValueStoreLogSystem(self->txnStateLogAdapter, self->dbgid, self->memoryLimit, false, false, true); + + // Versionstamped operations (particularly those applied from DR) define a minimum commit version + // that we may recover to, as they embed the version in user-readable data and require that no + // transactions will be committed at a lower version. 
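+ // Sketch of the version arithmetic below:
+ //   lastEpochEnd               = oldLogSystem->getEnd() - 1             (0 for a brand new database)
+ //   recoveryTransactionVersion = lastEpochEnd + MAX_VERSIONS_IN_FLIGHT  (or MAX_VERSIONS_IN_FLIGHT_FORCED;
+ //                                simply 1 when the database is new)
+ //   recoveryTransactionVersion = max(recoveryTransactionVersion, minRequiredCommitVersion)
+ // so the first commit of the new generation lands strictly after anything the old generation could have
+ // committed, and never below a version already exposed through versionstamps.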
+ Optional> requiredCommitVersion = + wait(self->txnStateStore->readValue(minRequiredCommitVersionKey)); + Version minRequiredCommitVersion = -1; + if (requiredCommitVersion.present()) { + minRequiredCommitVersion = BinaryReader::fromStringRef(requiredCommitVersion.get(), Unversioned()); + } + + // Recover version info + self->lastEpochEnd = oldLogSystem->getEnd() - 1; + if (self->lastEpochEnd == 0) { + self->recoveryTransactionVersion = 1; + } else { + if (self->forceRecovery) { + self->recoveryTransactionVersion = self->lastEpochEnd + SERVER_KNOBS->MAX_VERSIONS_IN_FLIGHT_FORCED; + } else { + self->recoveryTransactionVersion = self->lastEpochEnd + SERVER_KNOBS->MAX_VERSIONS_IN_FLIGHT; + } + + if (BUGGIFY) { + self->recoveryTransactionVersion += + deterministicRandom()->randomInt64(0, SERVER_KNOBS->MAX_VERSIONS_IN_FLIGHT); + } + if (self->recoveryTransactionVersion < minRequiredCommitVersion) + self->recoveryTransactionVersion = minRequiredCommitVersion; + } + + TraceEvent("MasterRecovering", self->dbgid) + .detail("LastEpochEnd", self->lastEpochEnd) + .detail("RecoveryTransactionVersion", self->recoveryTransactionVersion); + + RangeResult rawConf = wait(self->txnStateStore->readRange(configKeys)); + self->configuration.fromKeyValues(rawConf.castTo>()); + self->originalConfiguration = self->configuration; + self->hasConfiguration = true; + + TraceEvent("MasterRecoveredConfig", self->dbgid) + .setMaxEventLength(11000) + .setMaxFieldLength(10000) + .detail("Conf", self->configuration.toString()) + .trackLatest(self->recoveredConfigEventHolder->trackingKey); + + RangeResult rawLocalities = wait(self->txnStateStore->readRange(tagLocalityListKeys)); + self->dcId_locality.clear(); + for (auto& kv : rawLocalities) { + self->dcId_locality[decodeTagLocalityListKey(kv.key)] = decodeTagLocalityListValue(kv.value); + } + + RangeResult rawTags = wait(self->txnStateStore->readRange(serverTagKeys)); + self->allTags.clear(); + if (self->lastEpochEnd > 0) { + self->allTags.push_back(cacheTag); + } + + if (self->forceRecovery) { + self->safeLocality = oldLogSystem->getLogSystemConfig().tLogs[0].locality; + for (auto& kv : rawTags) { + Tag tag = decodeServerTagValue(kv.value); + if (tag.locality == self->safeLocality) { + self->allTags.push_back(tag); + } + } + } else { + for (auto& kv : rawTags) { + self->allTags.push_back(decodeServerTagValue(kv.value)); + } + } + + RangeResult rawHistoryTags = wait(self->txnStateStore->readRange(serverTagHistoryKeys)); + for (auto& kv : rawHistoryTags) { + self->allTags.push_back(decodeServerTagValue(kv.value)); + } + + uniquify(self->allTags); + + // auto kvs = self->txnStateStore->readRange( systemKeys ); + // for( auto & kv : kvs.get() ) + // TraceEvent("MasterRecoveredTXS", self->dbgid).detail("K", kv.key).detail("V", kv.value); + + self->txnStateLogAdapter->setNextVersion( + oldLogSystem->getEnd()); //< FIXME: (1) the log adapter should do this automatically after recovery; (2) if we + // make KeyValueStoreMemory guarantee immediate reads, we should be able to get rid of + // the discardCommit() below and not need a writable log adapter + + TraceEvent("RTSSComplete", self->dbgid).log(); + + return Void(); +} + +ACTOR Future sendInitialCommitToResolvers(Reference self) { + state KeyRange txnKeys = allKeys; + state Sequence txnSequence = 0; + ASSERT(self->recoveryTransactionVersion); + + state RangeResult data = + self->txnStateStore + ->readRange(txnKeys, BUGGIFY ? 
3 : SERVER_KNOBS->DESIRED_TOTAL_BYTES, SERVER_KNOBS->DESIRED_TOTAL_BYTES) + .get(); + state std::vector> txnReplies; + state int64_t dataOutstanding = 0; + + state std::vector endpoints; + for (auto& it : self->commitProxies) { + endpoints.push_back(it.txnState.getEndpoint()); + } + + loop { + if (!data.size()) + break; + ((KeyRangeRef&)txnKeys) = KeyRangeRef(keyAfter(data.back().key, txnKeys.arena()), txnKeys.end); + RangeResult nextData = + self->txnStateStore + ->readRange(txnKeys, BUGGIFY ? 3 : SERVER_KNOBS->DESIRED_TOTAL_BYTES, SERVER_KNOBS->DESIRED_TOTAL_BYTES) + .get(); + + TxnStateRequest req; + req.arena = data.arena(); + req.data = data; + req.sequence = txnSequence; + req.last = !nextData.size(); + req.broadcastInfo = endpoints; + txnReplies.push_back(broadcastTxnRequest(req, SERVER_KNOBS->TXN_STATE_SEND_AMOUNT, false)); + dataOutstanding += SERVER_KNOBS->TXN_STATE_SEND_AMOUNT * data.arena().getSize(); + data = nextData; + txnSequence++; + + if (dataOutstanding > SERVER_KNOBS->MAX_TXS_SEND_MEMORY) { + wait(waitForAll(txnReplies)); + txnReplies = std::vector>(); + dataOutstanding = 0; + } + + wait(yield()); + } + wait(waitForAll(txnReplies)); + TraceEvent("RecoveryInternal", self->dbgid) + .detail("StatusCode", RecoveryStatus::recovery_transaction) + .detail("Status", RecoveryStatus::names[RecoveryStatus::recovery_transaction]) + .detail("Step", "SentTxnStateStoreToCommitProxies"); + + std::vector> replies; + for (auto& r : self->resolvers) { + ResolveTransactionBatchRequest req; + req.prevVersion = -1; + req.version = self->lastEpochEnd; + req.lastReceivedVersion = -1; + + replies.push_back(brokenPromiseToNever(r.resolve.getReply(req))); + } + + wait(waitForAll(replies)); + TraceEvent("RecoveryInternal", self->dbgid) + .detail("StatusCode", RecoveryStatus::recovery_transaction) + .detail("Status", RecoveryStatus::names[RecoveryStatus::recovery_transaction]) + .detail("Step", "InitializedAllResolvers"); + return Void(); +} + +ACTOR Future triggerUpdates(Reference self, Reference oldLogSystem) { + loop { + wait(oldLogSystem->onLogSystemConfigChange() || self->cstate.fullyRecovered.getFuture() || + self->recruitmentStalled->onChange()); + if (self->cstate.fullyRecovered.isSet()) + return Void(); + + self->registrationTrigger.trigger(); + } +} + +ACTOR Future discardCommit(IKeyValueStore* store, LogSystemDiskQueueAdapter* adapter) { + state Future fcm = adapter->getCommitMessage(); + state Future committed = store->commit(); + LogSystemDiskQueueAdapter::CommitMessage cm = wait(fcm); + ASSERT(!committed.isReady()); + cm.acknowledge.send(Void()); + ASSERT(committed.isReady()); + return Void(); +} + +void updateConfigForForcedRecovery(Reference self, + std::vector>* initialConfChanges) { + bool regionsChanged = false; + for (auto& it : self->configuration.regions) { + if (it.dcId == self->myInterface.locality.dcId().get() && it.priority < 0) { + it.priority = 1; + regionsChanged = true; + } else if (it.dcId != self->myInterface.locality.dcId().get() && it.priority >= 0) { + it.priority = -1; + regionsChanged = true; + } + } + Standalone regionCommit; + regionCommit.mutations.push_back_deep( + regionCommit.arena(), + MutationRef(MutationRef::SetValue, configKeysPrefix.toString() + "usable_regions", LiteralStringRef("1"))); + self->configuration.applyMutation(regionCommit.mutations.back()); + if (regionsChanged) { + std::sort( + self->configuration.regions.begin(), self->configuration.regions.end(), RegionInfo::sort_by_priority()); + StatusObject regionJSON; + regionJSON["regions"] = 
self->configuration.getRegionJSON(); + regionCommit.mutations.push_back_deep( + regionCommit.arena(), + MutationRef(MutationRef::SetValue, + configKeysPrefix.toString() + "regions", + BinaryWriter::toValue(regionJSON, IncludeVersion(ProtocolVersion::withRegionConfiguration())) + .toString())); + self->configuration.applyMutation( + regionCommit.mutations.back()); // modifying the configuration directly does not change the configuration + // when it is re-serialized unless we call applyMutation + TraceEvent("ForcedRecoveryConfigChange", self->dbgid) + .setMaxEventLength(11000) + .setMaxFieldLength(10000) + .detail("Conf", self->configuration.toString()); + } + initialConfChanges->push_back(regionCommit); +} + +ACTOR Future recoverFrom(Reference self, + Reference oldLogSystem, + std::vector* seedServers, + std::vector>* initialConfChanges, + Future poppedTxsVersion, + bool* clusterIdExists) { + TraceEvent("MasterRecoveryState", self->dbgid) + .detail("StatusCode", RecoveryStatus::reading_transaction_system_state) + .detail("Status", RecoveryStatus::names[RecoveryStatus::reading_transaction_system_state]) + .trackLatest(self->masterRecoveryStateEventHolder->trackingKey); + self->hasConfiguration = false; + + if (BUGGIFY) + wait(delay(10.0)); + + Version txsPoppedVersion = wait(poppedTxsVersion); + wait(readTransactionSystemState(self, oldLogSystem, txsPoppedVersion)); + for (auto& itr : *initialConfChanges) { + for (auto& m : itr.mutations) { + self->configuration.applyMutation(m); + } + } + + if (self->forceRecovery) { + updateConfigForForcedRecovery(self, initialConfChanges); + } + + debug_checkMaxRestoredVersion(UID(), self->lastEpochEnd, "DBRecovery"); + + // Generate a cluster ID to uniquely identify the cluster if it doesn't + // already exist in the txnStateStore. + Optional clusterId = self->txnStateStore->readValue(clusterIdKey).get(); + *clusterIdExists = clusterId.present(); + if (!clusterId.present()) { + self->clusterId = deterministicRandom()->randomUniqueID(); + } else { + self->clusterId = BinaryReader::fromStringRef(clusterId.get(), Unversioned()); + } + + // Ordinarily we pass through this loop once and recover. We go around the loop if recovery stalls for more than a + // second, a provisional master is initialized, and an "emergency transaction" is submitted that might change the + // configuration so that we can finish recovery. 
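+ // The shape of the code below is roughly:
+ //   recruitments = recruitEverything(self, seedServers, oldLogSystem);
+ //   loop {
+ //       provisional = provisionalMaster(self, delay(provisionalDelay));   // fallback path
+ //       choose {
+ //           when(confChanges = wait(recruitments)) { break; }             // normal path: recovery proceeds
+ //           when(emergencyTxn = wait(provisional)) { apply the config change (usable_regions changes are
+ //                                                    rejected) and, if the configuration changed, restart
+ //                                                    recruitEverything with the original dcId localities; }
+ //       }
+ //   }
+ // provisionalDelay starts at PROVISIONAL_START_DELAY and grows geometrically up to PROVISIONAL_MAX_DELAY,
+ // so the provisional proxies only become available if recruitment is actually stuck.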
+ + state std::map, int8_t> originalLocalityMap = self->dcId_locality; + state Future>> recruitments = + recruitEverything(self, seedServers, oldLogSystem); + state double provisionalDelay = SERVER_KNOBS->PROVISIONAL_START_DELAY; + loop { + state Future> provisional = provisionalMaster(self, delay(provisionalDelay)); + provisionalDelay = + std::min(SERVER_KNOBS->PROVISIONAL_MAX_DELAY, provisionalDelay * SERVER_KNOBS->PROVISIONAL_DELAY_GROWTH); + choose { + when(std::vector> confChanges = wait(recruitments)) { + initialConfChanges->insert(initialConfChanges->end(), confChanges.begin(), confChanges.end()); + provisional.cancel(); + break; + } + when(Standalone _req = wait(provisional)) { + state Standalone req = _req; // mutable + TEST(true); // Emergency transaction processing during recovery + TraceEvent("EmergencyTransaction", self->dbgid).log(); + for (auto m = req.mutations.begin(); m != req.mutations.end(); ++m) + TraceEvent("EmergencyTransactionMutation", self->dbgid) + .detail("MType", m->type) + .detail("P1", m->param1) + .detail("P2", m->param2); + + DatabaseConfiguration oldConf = self->configuration; + self->configuration = self->originalConfiguration; + for (auto& m : req.mutations) + self->configuration.applyMutation(m); + + initialConfChanges->clear(); + if (self->originalConfiguration.isValid() && + self->configuration.usableRegions != self->originalConfiguration.usableRegions) { + TraceEvent(SevWarnAlways, "CannotChangeUsableRegions", self->dbgid).log(); + self->configuration = self->originalConfiguration; + } else { + initialConfChanges->push_back(req); + } + if (self->forceRecovery) { + updateConfigForForcedRecovery(self, initialConfChanges); + } + + if (self->configuration != oldConf) { // confChange does not trigger when including servers + self->dcId_locality = originalLocalityMap; + recruitments = recruitEverything(self, seedServers, oldLogSystem); + } + } + } + + provisional.cancel(); + } + + return Void(); +} + ACTOR Future getVersion(Reference self, GetCommitVersionRequest req) { state Span span("M:getVersion"_loc, { req.spanContext }); state std::map::iterator proxyItr = @@ -236,31 +1283,139 @@ ACTOR Future serveLiveCommittedVersion(Reference self) { } } -ACTOR Future updateRecoveryData(Reference self) { +std::pair findRange(CoalescedKeyRangeMap& key_resolver, + Standalone>& movedRanges, + int src, + int dest) { + auto ranges = key_resolver.ranges(); + auto prev = ranges.begin(); + auto it = ranges.begin(); + ++it; + if (it == ranges.end()) { + if (ranges.begin().value() != src || + std::find(movedRanges.begin(), movedRanges.end(), ResolverMoveRef(ranges.begin()->range(), dest)) != + movedRanges.end()) + throw operation_failed(); + return std::make_pair(ranges.begin().range(), true); + } + + std::set borders; + // If possible expand an existing boundary between the two resolvers + for (; it != ranges.end(); ++it) { + if (it->value() == src && prev->value() == dest && + std::find(movedRanges.begin(), movedRanges.end(), ResolverMoveRef(it->range(), dest)) == + movedRanges.end()) { + return std::make_pair(it->range(), true); + } + if (it->value() == dest && prev->value() == src && + std::find(movedRanges.begin(), movedRanges.end(), ResolverMoveRef(prev->range(), dest)) == + movedRanges.end()) { + return std::make_pair(prev->range(), false); + } + if (it->value() == dest) + borders.insert(prev->value()); + if (prev->value() == dest) + borders.insert(it->value()); + ++prev; + } + + prev = ranges.begin(); + it = ranges.begin(); + ++it; + // If possible create a new 
boundry which doesn't exist yet + for (; it != ranges.end(); ++it) { + if (it->value() == src && !borders.count(prev->value()) && + std::find(movedRanges.begin(), movedRanges.end(), ResolverMoveRef(it->range(), dest)) == + movedRanges.end()) { + return std::make_pair(it->range(), true); + } + if (prev->value() == src && !borders.count(it->value()) && + std::find(movedRanges.begin(), movedRanges.end(), ResolverMoveRef(prev->range(), dest)) == + movedRanges.end()) { + return std::make_pair(prev->range(), false); + } + ++prev; + } + + it = ranges.begin(); + for (; it != ranges.end(); ++it) { + if (it->value() == src && + std::find(movedRanges.begin(), movedRanges.end(), ResolverMoveRef(it->range(), dest)) == + movedRanges.end()) { + return std::make_pair(it->range(), true); + } + } + throw operation_failed(); // we are already attempting to move all of the data one resolver is assigned, so do not + // move anything +} + +ACTOR Future resolutionBalancing(Reference self) { + state CoalescedKeyRangeMap key_resolver; + key_resolver.insert(allKeys, 0); loop { - choose { - when(UpdateRecoveryDataRequest req = waitNext(self->myInterface.updateRecoveryData.getFuture())) { - TraceEvent("UpdateRecoveryData", self->dbgid) - .detail("RecoveryTxnVersion", req.recoveryTransactionVersion) - .detail("LastEpochEnd", req.lastEpochEnd) - .detail("NumCommitProxies", req.commitProxies.size()); + wait(delay(SERVER_KNOBS->MIN_BALANCE_TIME, TaskPriority::ResolutionMetrics)); + while (self->resolverChanges.get().size()) + wait(self->resolverChanges.onChange()); + state std::vector> futures; + for (auto& p : self->resolvers) + futures.push_back( + brokenPromiseToNever(p.metrics.getReply(ResolutionMetricsRequest(), TaskPriority::ResolutionMetrics))); + wait(waitForAll(futures)); + state IndexedSet, NoMetric> metrics; - if (self->recoveryTransactionVersion == invalidVersion || - req.recoveryTransactionVersion > self->recoveryTransactionVersion) { - self->recoveryTransactionVersion = req.recoveryTransactionVersion; - } - if (self->lastEpochEnd == invalidVersion || req.lastEpochEnd > self->lastEpochEnd) { - self->lastEpochEnd = req.lastEpochEnd; - } - if (req.commitProxies.size() > 0) { - self->commitProxies = req.commitProxies; - self->lastCommitProxyVersionReplies.clear(); + int64_t total = 0; + for (int i = 0; i < futures.size(); i++) { + total += futures[i].get().value; + metrics.insert(std::make_pair(futures[i].get().value, i), NoMetric()); + //TraceEvent("ResolverMetric").detail("I", i).detail("Metric", futures[i].get()); + } + if (metrics.lastItem()->first - metrics.begin()->first > SERVER_KNOBS->MIN_BALANCE_DIFFERENCE) { + try { + state int src = metrics.lastItem()->second; + state int dest = metrics.begin()->second; + state int64_t amount = std::min(metrics.lastItem()->first - total / self->resolvers.size(), + total / self->resolvers.size() - metrics.begin()->first) / + 2; + state Standalone> movedRanges; - for (auto& p : self->commitProxies) { - self->lastCommitProxyVersionReplies[p.id()] = CommitProxyVersionReplies(); - } + loop { + state std::pair range = findRange(key_resolver, movedRanges, src, dest); + + ResolutionSplitRequest req; + req.front = range.second; + req.offset = amount; + req.range = range.first; + + ResolutionSplitReply split = + wait(brokenPromiseToNever(self->resolvers[metrics.lastItem()->second].split.getReply( + req, TaskPriority::ResolutionMetrics))); + KeyRangeRef moveRange = range.second ? 
KeyRangeRef(range.first.begin, split.key) + : KeyRangeRef(split.key, range.first.end); + movedRanges.push_back_deep(movedRanges.arena(), ResolverMoveRef(moveRange, dest)); + TraceEvent("MovingResolutionRange") + .detail("Src", src) + .detail("Dest", dest) + .detail("Amount", amount) + .detail("StartRange", range.first) + .detail("MoveRange", moveRange) + .detail("Used", split.used) + .detail("KeyResolverRanges", key_resolver.size()); + amount -= split.used; + if (moveRange != range.first || amount <= 0) + break; } - req.reply.send(Void()); + for (auto& it : movedRanges) + key_resolver.insert(it.range, it.dest); + // for(auto& it : key_resolver.ranges()) + // TraceEvent("KeyResolver").detail("Range", it.range()).detail("Value", it.value()); + + self->resolverChangesVersion = self->version + 1; + for (auto& p : self->commitProxies) + self->resolverNeedingChanges.insert(p.id()); + self->resolverChanges.set(movedRanges); + } catch (Error& e) { + if (e.code() != error_code_operation_failed) + throw; } } } @@ -270,14 +1425,14 @@ static std::set const& normalMasterErrors() { static std::set s; if (s.empty()) { s.insert(error_code_tlog_stopped); - s.insert(error_code_tlog_failed); + s.insert(error_code_master_tlog_failed); s.insert(error_code_commit_proxy_failed); s.insert(error_code_grv_proxy_failed); - s.insert(error_code_resolver_failed); - s.insert(error_code_backup_worker_failed); + s.insert(error_code_master_resolver_failed); + s.insert(error_code_master_backup_worker_failed); s.insert(error_code_recruitment_failed); s.insert(error_code_no_more_servers); - s.insert(error_code_cluster_recovery_failed); + s.insert(error_code_master_recovery_failed); s.insert(error_code_coordinated_state_conflict); s.insert(error_code_master_max_versions_in_flight); s.insert(error_code_worker_removed); @@ -287,6 +1442,591 @@ static std::set const& normalMasterErrors() { return s; } +ACTOR Future changeCoordinators(Reference self) { + loop { + ChangeCoordinatorsRequest req = waitNext(self->myInterface.changeCoordinators.getFuture()); + ++self->changeCoordinatorsRequests; + state ChangeCoordinatorsRequest changeCoordinatorsRequest = req; + + while (!self->cstate.previousWrite.isReady()) { + wait(self->cstate.previousWrite); + wait(delay( + 0)); // if a new core state is ready to be written, have that take priority over our finalizing write; + } + + if (!self->cstate.fullyRecovered.isSet()) { + wait(self->cstate.write(self->cstate.myDBState, true)); + } + + try { + wait(self->cstate.move(ClusterConnectionString(changeCoordinatorsRequest.newConnectionString.toString()))); + } catch (Error& e) { + if (e.code() != error_code_actor_cancelled) + changeCoordinatorsRequest.reply.sendError(e); + + throw; + } + + throw internal_error(); + } +} + +ACTOR Future rejoinRequestHandler(Reference self) { + loop { + TLogRejoinRequest req = waitNext(self->myInterface.tlogRejoin.getFuture()); + req.reply.send(true); + } +} + +// Keeps the coordinated state (cstate) updated as the set of recruited tlogs change through recovery. 
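+// In outline: each time the log system's core state changes, serialize it into a DBCoreState and write it
+// to the coordinators, advance recoveryState (ALL_LOGS_RECRUITED, then STORAGE_RECOVERED once no old tlog
+// generations remain, then FULLY_RECOVERED on the final write), and re-trigger master registration; after
+// the final write the old log system stops accepting rejoins and this actor returns.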
+ACTOR Future trackTlogRecovery(Reference self, + Reference>> oldLogSystems, + Future minRecoveryDuration) { + state Future rejoinRequests = Never(); + state DBRecoveryCount recoverCount = self->cstate.myDBState.recoveryCount + 1; + state DatabaseConfiguration configuration = + self->configuration; // self-configuration can be changed by configurationMonitor so we need a copy + loop { + state DBCoreState newState; + self->logSystem->toCoreState(newState); + newState.recoveryCount = recoverCount; + state Future changed = self->logSystem->onCoreStateChanged(); + + ASSERT(newState.tLogs[0].tLogWriteAntiQuorum == configuration.tLogWriteAntiQuorum && + newState.tLogs[0].tLogReplicationFactor == configuration.tLogReplicationFactor); + + state bool allLogs = + newState.tLogs.size() == + configuration.expectedLogSets(self->primaryDcId.size() ? self->primaryDcId[0] : Optional()); + state bool finalUpdate = !newState.oldTLogData.size() && allLogs; + wait(self->cstate.write(newState, finalUpdate)); + if (self->cstateUpdated.canBeSet()) { + self->cstateUpdated.send(Void()); + } + + wait(minRecoveryDuration); + self->logSystem->coreStateWritten(newState); + + if (self->recoveryReadyForCommits.canBeSet()) { + self->recoveryReadyForCommits.send(Void()); + } + + if (finalUpdate) { + self->recoveryState = RecoveryState::FULLY_RECOVERED; + TraceEvent("MasterRecoveryState", self->dbgid) + .detail("StatusCode", RecoveryStatus::fully_recovered) + .detail("Status", RecoveryStatus::names[RecoveryStatus::fully_recovered]) + .detail("FullyRecoveredAtVersion", self->version) + .detail("ClusterId", self->clusterId) + .trackLatest(self->masterRecoveryStateEventHolder->trackingKey); + + TraceEvent("MasterRecoveryGenerations", self->dbgid) + .detail("ActiveGenerations", 1) + .trackLatest("MasterRecoveryGenerations"); + } else if (!newState.oldTLogData.size() && self->recoveryState < RecoveryState::STORAGE_RECOVERED) { + self->recoveryState = RecoveryState::STORAGE_RECOVERED; + TraceEvent("MasterRecoveryState", self->dbgid) + .detail("StatusCode", RecoveryStatus::storage_recovered) + .detail("Status", RecoveryStatus::names[RecoveryStatus::storage_recovered]) + .trackLatest(self->masterRecoveryStateEventHolder->trackingKey); + } else if (allLogs && self->recoveryState < RecoveryState::ALL_LOGS_RECRUITED) { + self->recoveryState = RecoveryState::ALL_LOGS_RECRUITED; + TraceEvent("MasterRecoveryState", self->dbgid) + .detail("StatusCode", RecoveryStatus::all_logs_recruited) + .detail("Status", RecoveryStatus::names[RecoveryStatus::all_logs_recruited]) + .trackLatest(self->masterRecoveryStateEventHolder->trackingKey); + } + + if (newState.oldTLogData.size() && configuration.repopulateRegionAntiQuorum > 0 && + self->logSystem->remoteStorageRecovered()) { + TraceEvent(SevWarnAlways, "RecruitmentStalled_RemoteStorageRecovered", self->dbgid).log(); + self->recruitmentStalled->set(true); + } + self->registrationTrigger.trigger(); + + if (finalUpdate) { + oldLogSystems->get()->stopRejoins(); + rejoinRequests = rejoinRequestHandler(self); + return Void(); + } + + wait(changed); + } +} + +ACTOR Future configurationMonitor(Reference self, Database cx) { + loop { + state ReadYourWritesTransaction tr(cx); + + loop { + try { + tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + RangeResult results = wait(tr.getRange(configKeys, CLIENT_KNOBS->TOO_MANY)); + ASSERT(!results.more && results.size() < CLIENT_KNOBS->TOO_MANY); + + DatabaseConfiguration conf; + conf.fromKeyValues((VectorRef)results); + if (conf != self->configuration) { 
+ if (self->recoveryState != RecoveryState::ALL_LOGS_RECRUITED && + self->recoveryState != RecoveryState::FULLY_RECOVERED) { + throw master_recovery_failed(); + } + + self->configuration = conf; + self->registrationTrigger.trigger(); + } + + state Future watchFuture = + tr.watch(moveKeysLockOwnerKey) || tr.watch(excludedServersVersionKey) || + tr.watch(failedServersVersionKey) || tr.watch(excludedLocalityVersionKey) || + tr.watch(failedLocalityVersionKey); + wait(tr.commit()); + wait(watchFuture); + break; + } catch (Error& e) { + wait(tr.onError(e)); + } + } + } +} + +ACTOR static Future> getMinBackupVersion(Reference self, Database cx) { + loop { + state ReadYourWritesTransaction tr(cx); + + try { + tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + tr.setOption(FDBTransactionOptions::LOCK_AWARE); + Optional value = wait(tr.get(backupStartedKey)); + Optional minVersion; + if (value.present()) { + auto uidVersions = decodeBackupStartedValue(value.get()); + TraceEvent e("GotBackupStartKey", self->dbgid); + int i = 1; + for (auto [uid, version] : uidVersions) { + e.detail(format("BackupID%d", i), uid).detail(format("Version%d", i), version); + i++; + minVersion = minVersion.present() ? std::min(version, minVersion.get()) : version; + } + } else { + TraceEvent("EmptyBackupStartKey", self->dbgid).log(); + } + return minVersion; + + } catch (Error& e) { + wait(tr.onError(e)); + } + } +} + +ACTOR static Future recruitBackupWorkers(Reference self, Database cx) { + ASSERT(self->backupWorkers.size() > 0); + + // Avoid race between a backup worker's save progress and the reads below. + wait(delay(SERVER_KNOBS->SECONDS_BEFORE_RECRUIT_BACKUP_WORKER)); + + state LogEpoch epoch = self->cstate.myDBState.recoveryCount; + state Reference backupProgress( + new BackupProgress(self->dbgid, self->logSystem->getOldEpochTagsVersionsInfo())); + state Future gotProgress = getBackupProgress(cx, self->dbgid, backupProgress, /*logging=*/true); + state std::vector> initializationReplies; + + state std::vector> idsTags; // worker IDs and tags for current epoch + state int logRouterTags = self->logSystem->getLogRouterTags(); + idsTags.reserve(logRouterTags); + for (int i = 0; i < logRouterTags; i++) { + idsTags.emplace_back(deterministicRandom()->randomUniqueID(), Tag(tagLocalityLogRouter, i)); + } + + const Version startVersion = self->logSystem->getBackupStartVersion(); + state int i = 0; + for (; i < logRouterTags; i++) { + const auto& worker = self->backupWorkers[i % self->backupWorkers.size()]; + InitializeBackupRequest req(idsTags[i].first); + req.recruitedEpoch = epoch; + req.backupEpoch = epoch; + req.routerTag = idsTags[i].second; + req.totalTags = logRouterTags; + req.startVersion = startVersion; + TraceEvent("BackupRecruitment", self->dbgid) + .detail("RequestID", req.reqId) + .detail("Tag", req.routerTag.toString()) + .detail("Epoch", epoch) + .detail("BackupEpoch", epoch) + .detail("StartVersion", req.startVersion); + initializationReplies.push_back( + transformErrors(throwErrorOr(worker.backup.getReplyUnlessFailedFor( + req, SERVER_KNOBS->BACKUP_TIMEOUT, SERVER_KNOBS->MASTER_FAILURE_SLOPE_DURING_RECOVERY)), + master_backup_worker_failed())); + } + + state Future> fMinVersion = getMinBackupVersion(self, cx); + wait(gotProgress && success(fMinVersion)); + TraceEvent("MinBackupVersion", self->dbgid).detail("Version", fMinVersion.get().present() ? 
fMinVersion.get() : -1); + + std::map, std::map> toRecruit = + backupProgress->getUnfinishedBackup(); + for (const auto& [epochVersionTags, tagVersions] : toRecruit) { + const Version oldEpochEnd = std::get<1>(epochVersionTags); + if (!fMinVersion.get().present() || fMinVersion.get().get() + 1 >= oldEpochEnd) { + TraceEvent("SkipBackupRecruitment", self->dbgid) + .detail("MinVersion", fMinVersion.get().present() ? fMinVersion.get() : -1) + .detail("Epoch", epoch) + .detail("OldEpoch", std::get<0>(epochVersionTags)) + .detail("OldEpochEnd", oldEpochEnd); + continue; + } + for (const auto& [tag, version] : tagVersions) { + const auto& worker = self->backupWorkers[i % self->backupWorkers.size()]; + i++; + InitializeBackupRequest req(deterministicRandom()->randomUniqueID()); + req.recruitedEpoch = epoch; + req.backupEpoch = std::get<0>(epochVersionTags); + req.routerTag = tag; + req.totalTags = std::get<2>(epochVersionTags); + req.startVersion = version; // savedVersion + 1 + req.endVersion = std::get<1>(epochVersionTags) - 1; + TraceEvent("BackupRecruitment", self->dbgid) + .detail("RequestID", req.reqId) + .detail("Tag", req.routerTag.toString()) + .detail("Epoch", epoch) + .detail("BackupEpoch", req.backupEpoch) + .detail("StartVersion", req.startVersion) + .detail("EndVersion", req.endVersion.get()); + initializationReplies.push_back(transformErrors( + throwErrorOr(worker.backup.getReplyUnlessFailedFor( + req, SERVER_KNOBS->BACKUP_TIMEOUT, SERVER_KNOBS->MASTER_FAILURE_SLOPE_DURING_RECOVERY)), + master_backup_worker_failed())); + } + } + + std::vector newRecruits = wait(getAll(initializationReplies)); + self->logSystem->setBackupWorkers(newRecruits); + TraceEvent("BackupRecruitmentDone", self->dbgid).log(); + self->registrationTrigger.trigger(); + return Void(); +} + +ACTOR Future masterCore(Reference self) { + state TraceInterval recoveryInterval("MasterRecovery"); + state double recoverStartTime = now(); + + self->addActor.send(waitFailureServer(self->myInterface.waitFailure.getFuture())); + + TraceEvent(recoveryInterval.begin(), self->dbgid); + + self->recoveryState = RecoveryState::READING_CSTATE; + TraceEvent("MasterRecoveryState", self->dbgid) + .detail("StatusCode", RecoveryStatus::reading_coordinated_state) + .detail("Status", RecoveryStatus::names[RecoveryStatus::reading_coordinated_state]) + .trackLatest(self->masterRecoveryStateEventHolder->trackingKey); + + wait(self->cstate.read()); + + self->recoveryState = RecoveryState::LOCKING_CSTATE; + TraceEvent("MasterRecoveryState", self->dbgid) + .detail("StatusCode", RecoveryStatus::locking_coordinated_state) + .detail("Status", RecoveryStatus::names[RecoveryStatus::locking_coordinated_state]) + .detail("TLogs", self->cstate.prevDBState.tLogs.size()) + .detail("ActiveGenerations", self->cstate.myDBState.oldTLogData.size() + 1) + .detail("MyRecoveryCount", self->cstate.prevDBState.recoveryCount + 2) + .detail("ForceRecovery", self->forceRecovery) + .trackLatest(self->masterRecoveryStateEventHolder->trackingKey); + // for (const auto& old : self->cstate.prevDBState.oldTLogData) { + // TraceEvent("BWReadCoreState", self->dbgid).detail("Epoch", old.epoch).detail("Version", old.epochEnd); + //} + + TraceEvent("MasterRecoveryGenerations", self->dbgid) + .detail("ActiveGenerations", self->cstate.myDBState.oldTLogData.size() + 1) + .trackLatest(self->masterRecoveryGenerationsEventHolder->trackingKey); + + if (self->cstate.myDBState.oldTLogData.size() > CLIENT_KNOBS->MAX_GENERATIONS_OVERRIDE) { + if (self->cstate.myDBState.oldTLogData.size() 
>= CLIENT_KNOBS->MAX_GENERATIONS) { + TraceEvent(SevError, "RecoveryStoppedTooManyOldGenerations") + .detail("OldGenerations", self->cstate.myDBState.oldTLogData.size()) + .detail("Reason", + "Recovery stopped because too many recoveries have happened since the last time the cluster " + "was fully_recovered. Set --knob-max-generations-override on your server processes to a value " + "larger than OldGenerations to resume recovery once the underlying problem has been fixed."); + wait(Future(Never())); + } else if (self->cstate.myDBState.oldTLogData.size() > CLIENT_KNOBS->RECOVERY_DELAY_START_GENERATION) { + TraceEvent(SevError, "RecoveryDelayedTooManyOldGenerations") + .detail("OldGenerations", self->cstate.myDBState.oldTLogData.size()) + .detail("Reason", + "Recovery is delayed because too many recoveries have happened since the last time the cluster " + "was fully_recovered. Set --knob-max-generations-override on your server processes to a value " + "larger than OldGenerations to resume recovery once the underlying problem has been fixed."); + wait(delay(CLIENT_KNOBS->RECOVERY_DELAY_SECONDS_PER_GENERATION * + (self->cstate.myDBState.oldTLogData.size() - CLIENT_KNOBS->RECOVERY_DELAY_START_GENERATION))); + } + if (g_network->isSimulated() && self->cstate.myDBState.oldTLogData.size() > CLIENT_KNOBS->MAX_GENERATIONS_SIM) { + g_simulator.connectionFailuresDisableDuration = 1e6; + g_simulator.speedUpSimulation = true; + TraceEvent(SevWarnAlways, "DisableConnectionFailures_TooManyGenerations").log(); + } + } + + state Reference>> oldLogSystems(new AsyncVar>); + state Future recoverAndEndEpoch = ILogSystem::recoverAndEndEpoch(oldLogSystems, + self->dbgid, + self->cstate.prevDBState, + self->myInterface.tlogRejoin.getFuture(), + self->myInterface.locality, + &self->forceRecovery); + + DBCoreState newState = self->cstate.myDBState; + newState.recoveryCount++; + wait(self->cstate.write(newState) || recoverAndEndEpoch); + + self->recoveryState = RecoveryState::RECRUITING; + + state std::vector seedServers; + state std::vector> initialConfChanges; + state Future logChanges; + state Future minRecoveryDuration; + state Future poppedTxsVersion; + state bool clusterIdExists = false; + + loop { + Reference oldLogSystem = oldLogSystems->get(); + if (oldLogSystem) { + logChanges = triggerUpdates(self, oldLogSystem); + if (!minRecoveryDuration.isValid()) { + minRecoveryDuration = delay(SERVER_KNOBS->ENFORCED_MIN_RECOVERY_DURATION); + poppedTxsVersion = oldLogSystem->getTxsPoppedVersion(); + } + } + + state Future reg = oldLogSystem ? updateRegistration(self, oldLogSystem) : Never(); + self->registrationTrigger.trigger(); + + choose { + when(wait(oldLogSystem ? 
recoverFrom(self, + oldLogSystem, + &seedServers, + &initialConfChanges, + poppedTxsVersion, + std::addressof(clusterIdExists)) + : Never())) { + reg.cancel(); + break; + } + when(wait(oldLogSystems->onChange())) {} + when(wait(reg)) { throw internal_error(); } + when(wait(recoverAndEndEpoch)) { throw internal_error(); } + } + } + + if (self->neverCreated) { + recoverStartTime = now(); + } + + recoverAndEndEpoch.cancel(); + + ASSERT(self->commitProxies.size() <= self->configuration.getDesiredCommitProxies()); + ASSERT(self->commitProxies.size() >= 1); + ASSERT(self->grvProxies.size() <= self->configuration.getDesiredGrvProxies()); + ASSERT(self->grvProxies.size() >= 1); + ASSERT(self->resolvers.size() <= self->configuration.getDesiredResolvers()); + ASSERT(self->resolvers.size() >= 1); + + self->recoveryState = RecoveryState::RECOVERY_TRANSACTION; + TraceEvent("MasterRecoveryState", self->dbgid) + .detail("StatusCode", RecoveryStatus::recovery_transaction) + .detail("Status", RecoveryStatus::names[RecoveryStatus::recovery_transaction]) + .detail("PrimaryLocality", self->primaryLocality) + .detail("DcId", self->myInterface.locality.dcId()) + .detail("ClusterId", self->clusterId) + .trackLatest(self->masterRecoveryStateEventHolder->trackingKey); + + // Recovery transaction + state bool debugResult = debug_checkMinRestoredVersion(UID(), self->lastEpochEnd, "DBRecovery", SevWarn); + + CommitTransactionRequest recoveryCommitRequest; + recoveryCommitRequest.flags = recoveryCommitRequest.flags | CommitTransactionRequest::FLAG_IS_LOCK_AWARE; + CommitTransactionRef& tr = recoveryCommitRequest.transaction; + int mmApplied = 0; // The number of mutations in tr.mutations that have been applied to the txnStateStore so far + if (self->lastEpochEnd != 0) { + Optional snapRecoveryFlag = self->txnStateStore->readValue(writeRecoveryKey).get(); + TraceEvent("MasterRecoverySnapshotCheck") + .detail("SnapRecoveryFlag", snapRecoveryFlag.present() ? snapRecoveryFlag.get().toString() : "N/A") + .detail("LastEpochEnd", self->lastEpochEnd); + if (snapRecoveryFlag.present()) { + TEST(true); // Recovering from snapshot, writing to snapShotEndVersionKey + BinaryWriter bw(Unversioned()); + tr.set(recoveryCommitRequest.arena, snapshotEndVersionKey, (bw << self->lastEpochEnd).toValue()); + // Pause the backups that got restored in this snapshot to avoid data corruption + // Requires further operational work to abort the backup + TraceEvent("MasterRecoveryPauseBackupAgents").log(); + Key backupPauseKey = FileBackupAgent::getPauseKey(); + tr.set(recoveryCommitRequest.arena, backupPauseKey, StringRef()); + // Clear the key so multiple recoveries will not overwrite the first version recorded + tr.clear(recoveryCommitRequest.arena, singleKeyRange(writeRecoveryKey)); + } + if (self->forceRecovery) { + BinaryWriter bw(Unversioned()); + tr.set(recoveryCommitRequest.arena, killStorageKey, (bw << self->safeLocality).toValue()); + } + + // This transaction sets \xff/lastEpochEnd, which the shard servers can use to roll back speculatively + // processed semi-committed transactions from the previous epoch. + // It also guarantees the shard servers and tlog servers eventually get versions in the new epoch, which + // clients might rely on. 
+ // This transaction is by itself in a batch (has its own version number), which simplifies storage servers + // slightly (they assume there are no modifications to serverKeys in the same batch) The proxy also expects the + // lastEpochEndKey mutation to be first in the transaction + BinaryWriter bw(Unversioned()); + tr.set(recoveryCommitRequest.arena, lastEpochEndKey, (bw << self->lastEpochEnd).toValue()); + + if (self->forceRecovery) { + tr.set(recoveryCommitRequest.arena, rebootWhenDurableKey, StringRef()); + tr.set(recoveryCommitRequest.arena, + moveKeysLockOwnerKey, + BinaryWriter::toValue(deterministicRandom()->randomUniqueID(), Unversioned())); + } + } else { + // Recruit and seed initial shard servers + // This transaction must be the very first one in the database (version 1) + seedShardServers(recoveryCommitRequest.arena, tr, seedServers); + } + // initialConfChanges have not been conflict checked against any earlier writes in the recovery transaction, so do + // this as early as possible in the recovery transaction but see above comments as to why it can't be absolutely + // first. Theoretically emergency transactions should conflict check against the lastEpochEndKey. + for (auto& itr : initialConfChanges) { + tr.mutations.append_deep(recoveryCommitRequest.arena, itr.mutations.begin(), itr.mutations.size()); + tr.write_conflict_ranges.append_deep( + recoveryCommitRequest.arena, itr.write_conflict_ranges.begin(), itr.write_conflict_ranges.size()); + } + + tr.set( + recoveryCommitRequest.arena, primaryLocalityKey, BinaryWriter::toValue(self->primaryLocality, Unversioned())); + tr.set(recoveryCommitRequest.arena, backupVersionKey, backupVersionValue); + tr.set(recoveryCommitRequest.arena, coordinatorsKey, self->coordinators.ccr->getConnectionString().toString()); + tr.set(recoveryCommitRequest.arena, logsKey, self->logSystem->getLogsValue()); + tr.set(recoveryCommitRequest.arena, + primaryDatacenterKey, + self->myInterface.locality.dcId().present() ? self->myInterface.locality.dcId().get() : StringRef()); + + tr.clear(recoveryCommitRequest.arena, tLogDatacentersKeys); + for (auto& dc : self->primaryDcId) { + tr.set(recoveryCommitRequest.arena, tLogDatacentersKeyFor(dc), StringRef()); + } + if (self->configuration.usableRegions > 1) { + for (auto& dc : self->remoteDcIds) { + tr.set(recoveryCommitRequest.arena, tLogDatacentersKeyFor(dc), StringRef()); + } + } + + // Write cluster ID into txnStateStore if it is missing. 
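+ // (The value itself was chosen earlier in recoverFrom(): the existing entry under clusterIdKey if one was
+ // present, otherwise a freshly generated random UID. Writing it in the recovery transaction means every
+ // later generation will read the same ID back out of the txnStateStore.)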
+ if (!clusterIdExists) { + tr.set(recoveryCommitRequest.arena, clusterIdKey, BinaryWriter::toValue(self->clusterId, Unversioned())); + } + + applyMetadataMutations(SpanID(), + self->dbgid, + recoveryCommitRequest.arena, + tr.mutations.slice(mmApplied, tr.mutations.size()), + self->txnStateStore); + mmApplied = tr.mutations.size(); + + tr.read_snapshot = self->recoveryTransactionVersion; // lastEpochEnd would make more sense, but isn't in the initial + // window of the resolver(s) + + TraceEvent("MasterRecoveryCommit", self->dbgid).log(); + state Future> recoveryCommit = self->commitProxies[0].commit.tryGetReply(recoveryCommitRequest); + self->addActor.send(self->logSystem->onError()); + self->addActor.send(waitResolverFailure(self->resolvers)); + self->addActor.send(waitCommitProxyFailure(self->commitProxies)); + self->addActor.send(waitGrvProxyFailure(self->grvProxies)); + self->addActor.send(provideVersions(self)); + self->addActor.send(serveLiveCommittedVersion(self)); + self->addActor.send(reportErrors(updateRegistration(self, self->logSystem), "UpdateRegistration", self->dbgid)); + self->registrationTrigger.trigger(); + + wait(discardCommit(self->txnStateStore, self->txnStateLogAdapter)); + + // Wait for the recovery transaction to complete. + // SOMEDAY: For faster recovery, do this and setDBState asynchronously and don't wait for them + // unless we want to change TLogs + wait((success(recoveryCommit) && sendInitialCommitToResolvers(self))); + if (recoveryCommit.isReady() && recoveryCommit.get().isError()) { + TEST(true); // Master recovery failed because of the initial commit failed + throw master_recovery_failed(); + } + + ASSERT(self->recoveryTransactionVersion != 0); + + self->recoveryState = RecoveryState::WRITING_CSTATE; + TraceEvent("MasterRecoveryState", self->dbgid) + .detail("StatusCode", RecoveryStatus::writing_coordinated_state) + .detail("Status", RecoveryStatus::names[RecoveryStatus::writing_coordinated_state]) + .detail("TLogList", self->logSystem->describe()) + .trackLatest(self->masterRecoveryStateEventHolder->trackingKey); + + // Multiple masters prevent conflicts between themselves via CoordinatedState (self->cstate) + // 1. If SetMaster succeeds, then by CS's contract, these "new" Tlogs are the immediate + // successors of the "old" ones we are replacing + // 2. logSystem->recoverAndEndEpoch ensured that a co-quorum of the "old" tLogs were stopped at + // versions <= self->lastEpochEnd, so no versions > self->lastEpochEnd could be (fully) committed to them. + // 3. No other master will attempt to commit anything to our "new" Tlogs + // because it didn't recruit them + // 4. Therefore, no full commit can come between self->lastEpochEnd and the first commit + // we made to the new Tlogs (self->recoveryTransactionVersion), and only our own semi-commits can come between + // our first commit and the next new TLogs + + self->addActor.send(trackTlogRecovery(self, oldLogSystems, minRecoveryDuration)); + debug_advanceMaxCommittedVersion(UID(), self->recoveryTransactionVersion); + wait(self->recoveryReadyForCommits.getFuture()); + debug_advanceMinCommittedVersion(UID(), self->recoveryTransactionVersion); + + if (debugResult) { + TraceEvent(self->forceRecovery ? 
SevWarn : SevError, "DBRecoveryDurabilityError").log(); + } + + TraceEvent("MasterCommittedTLogs", self->dbgid) + .detail("TLogs", self->logSystem->describe()) + .detail("RecoveryCount", self->cstate.myDBState.recoveryCount) + .detail("RecoveryTransactionVersion", self->recoveryTransactionVersion); + + TraceEvent(recoveryInterval.end(), self->dbgid) + .detail("RecoveryTransactionVersion", self->recoveryTransactionVersion); + + self->recoveryState = RecoveryState::ACCEPTING_COMMITS; + double recoveryDuration = now() - recoverStartTime; + + TraceEvent((recoveryDuration > 4 && !g_network->isSimulated()) ? SevWarnAlways : SevInfo, + "MasterRecoveryDuration", + self->dbgid) + .detail("RecoveryDuration", recoveryDuration) + .trackLatest(self->masterRecoveryDurationEventHolder->trackingKey); + + TraceEvent("MasterRecoveryState", self->dbgid) + .detail("StatusCode", RecoveryStatus::accepting_commits) + .detail("Status", RecoveryStatus::names[RecoveryStatus::accepting_commits]) + .detail("StoreType", self->configuration.storageServerStoreType) + .detail("RecoveryDuration", recoveryDuration) + .trackLatest(self->masterRecoveryStateEventHolder->trackingKey); + + TraceEvent("MasterRecoveryAvailable", self->dbgid) + .detail("AvailableAtVersion", self->version) + .trackLatest(self->masterRecoveryAvailableEventHolder->trackingKey); + + if (self->resolvers.size() > 1) + self->addActor.send(resolutionBalancing(self)); + + self->addActor.send(changeCoordinators(self)); + Database cx = openDBOnServer(self->dbInfo, TaskPriority::DefaultEndpoint, LockAware::True); + self->addActor.send(configurationMonitor(self, cx)); + if (self->configuration.backupWorkerEnabled) { + self->addActor.send(recruitBackupWorkers(self, cx)); + } else { + self->logSystem->setOldestBackupEpoch(self->cstate.myDBState.recoveryCount); + } + + wait(Future(Never())); + throw internal_error(); +} + ACTOR Future masterServer(MasterInterface mi, Reference const> db, Reference> const> ccInterface, @@ -309,18 +2049,16 @@ ACTOR Future masterServer(MasterInterface mi, state PromiseStream> addActor; state Reference self(new MasterData( db, mi, coordinators, db->get().clusterInterface, LiteralStringRef(""), addActor, forceRecovery)); - state Future collection = actorCollection(addActor.getFuture()); - - addActor.send(traceRole(Role::MASTER, mi.id())); - addActor.send(provideVersions(self)); - addActor.send(serveLiveCommittedVersion(self)); - addActor.send(updateRecoveryData(self)); + state Future collection = actorCollection(self->addActor.getFuture()); + self->addActor.send(traceRole(Role::MASTER, mi.id())); TEST(!lifetime.isStillValid(db->get().masterLifetime, mi.id() == db->get().master.id())); // Master born doomed TraceEvent("MasterLifetime", self->dbgid).detail("LifetimeToken", lifetime.toString()); try { + state Future core = masterCore(self); loop choose { + when(wait(core)) { break; } when(wait(onDBChange)) { onDBChange = db->onChange(); if (!lifetime.isStillValid(db->get().masterLifetime, mi.id() == db->get().master.id())) { @@ -334,6 +2072,13 @@ ACTOR Future masterServer(MasterInterface mi, throw worker_removed(); } } + when(BackupWorkerDoneRequest req = waitNext(mi.notifyBackupWorkerDone.getFuture())) { + if (self->logSystem.isValid() && self->logSystem->removeBackupWorker(req)) { + self->registrationTrigger.trigger(); + } + ++self->backupWorkerDoneRequests; + req.reply.send(Void()); + } when(wait(collection)) { ASSERT(false); throw internal_error(); @@ -344,15 +2089,16 @@ ACTOR Future masterServer(MasterInterface mi, if (e.code() != 
error_code_actor_cancelled) { wait(delay(0.0)); } - while (!addActor.isEmpty()) { - addActor.getFuture().pop(); + + while (!self->addActor.isEmpty()) { + self->addActor.getFuture().pop(); } - TEST(err.code() == error_code_tlog_failed); // Master: terminated due to tLog failure + TEST(err.code() == error_code_master_tlog_failed); // Master: terminated due to tLog failure TEST(err.code() == error_code_commit_proxy_failed); // Master: terminated due to commit proxy failure TEST(err.code() == error_code_grv_proxy_failed); // Master: terminated due to GRV proxy failure - TEST(err.code() == error_code_resolver_failed); // Master: terminated due to resolver failure - TEST(err.code() == error_code_backup_worker_failed); // Master: terminated due to backup worker failure + TEST(err.code() == error_code_master_resolver_failed); // Master: terminated due to resolver failure + TEST(err.code() == error_code_master_backup_worker_failed); // Master: terminated due to backup worker failure if (normalMasterErrors().count(err.code())) { TraceEvent("MasterTerminated", mi.id()).error(err); @@ -360,4 +2106,5 @@ ACTOR Future masterServer(MasterInterface mi, } throw err; } + return Void(); } diff --git a/fdbserver/worker.actor.cpp b/fdbserver/worker.actor.cpp index f709aa6e92..8ddad4707a 100644 --- a/fdbserver/worker.actor.cpp +++ b/fdbserver/worker.actor.cpp @@ -700,8 +700,8 @@ TEST_CASE("/fdbserver/worker/addressInDbAndPrimaryDc") { // Manually set up a master address. NetworkAddress testAddress(IPAddress(0x13131313), 1); - testDbInfo.master.getCommitVersion = - RequestStream(Endpoint({ testAddress }, UID(1, 2))); + testDbInfo.master.changeCoordinators = + RequestStream(Endpoint({ testAddress }, UID(1, 2))); // First, create an empty TLogInterface, and check that it shouldn't be considered as in primary DC. 
testDbInfo.logSystemConfig.tLogs.push_back(TLogSet()); @@ -1772,10 +1772,12 @@ ACTOR Future workerServer(Reference connRecord, startRole(Role::MASTER, recruited.id(), interf.id()); DUMPTOKEN(recruited.waitFailure); + DUMPTOKEN(recruited.tlogRejoin); + DUMPTOKEN(recruited.changeCoordinators); DUMPTOKEN(recruited.getCommitVersion); DUMPTOKEN(recruited.getLiveCommittedVersion); DUMPTOKEN(recruited.reportLiveCommittedVersion); - DUMPTOKEN(recruited.updateRecoveryData); + DUMPTOKEN(recruited.notifyBackupWorkerDone); // printf("Recruited as masterServer\n"); Future masterProcess = masterServer( diff --git a/flow/error_definitions.h b/flow/error_definitions.h index 0e34cd6c27..1aabe0bf2b 100755 --- a/flow/error_definitions.h +++ b/flow/error_definitions.h @@ -93,22 +93,21 @@ ERROR( connection_leaked, 1103, "Connection object leaked" ) ERROR( recruitment_failed, 1200, "Recruitment of a server failed" ) // Be careful, catching this will delete the data of a storage server or tlog permanently ERROR( move_to_removed_server, 1201, "Attempt to move keys to a storage server that was removed" ) ERROR( worker_removed, 1202, "Normal worker shut down" ) // Be careful, catching this will delete the data of a storage server or tlog permanently -ERROR( cluster_recovery_failed, 1203, "Cluster recovery failed") +ERROR( master_recovery_failed, 1203, "Master recovery failed") ERROR( master_max_versions_in_flight, 1204, "Master hit maximum number of versions in flight" ) -ERROR( tlog_failed, 1205, "Cluster recovery terminating because a TLog failed" ) // similar to tlog_stopped, but the tlog has actually died +ERROR( master_tlog_failed, 1205, "Master terminating because a TLog failed" ) // similar to tlog_stopped, but the tlog has actually died ERROR( worker_recovery_failed, 1206, "Recovery of a worker process failed" ) ERROR( please_reboot, 1207, "Reboot of server process requested" ) ERROR( please_reboot_delete, 1208, "Reboot of server process requested, with deletion of state" ) ERROR( commit_proxy_failed, 1209, "Master terminating because a CommitProxy failed" ) -ERROR( resolver_failed, 1210, "Cluster recovery terminating because a Resolver failed" ) +ERROR( master_resolver_failed, 1210, "Master terminating because a Resolver failed" ) ERROR( server_overloaded, 1211, "Server is under too much load and cannot respond" ) -ERROR( backup_worker_failed, 1212, "Cluster recovery terminating because a backup worker failed") +ERROR( master_backup_worker_failed, 1212, "Master terminating because a backup worker failed") ERROR( tag_throttled, 1213, "Transaction tag is being throttled" ) -ERROR( grv_proxy_failed, 1214, "Cluster recovery terminating because a GRVProxy failed" ) +ERROR( grv_proxy_failed, 1214, "Master terminating because a GRVProxy failed" ) ERROR( dd_tracker_cancelled, 1215, "The data distribution tracker has been cancelled" ) ERROR( failed_to_progress, 1216, "Process has failed to make sufficient progress" ) ERROR( invalid_cluster_id, 1217, "Attempted to join cluster with a different cluster ID" ) -ERROR( restart_cluster_controller, 1218, "Restart cluster controller process" ) // 15xx Platform errors ERROR( platform_error, 1500, "Platform error" ) diff --git a/flow/network.h b/flow/network.h index 5d5b316d0d..b753a23ec6 100644 --- a/flow/network.h +++ b/flow/network.h @@ -84,7 +84,6 @@ enum class TaskPriority { GetConsistentReadVersion = 8500, GetLiveCommittedVersionReply = 8490, GetLiveCommittedVersion = 8480, - UpdateRecoveryTransactionVersion = 8470, DefaultPromiseEndpoint = 8000, DefaultOnMainThread = 
7500, DefaultDelay = 7010,