diff --git a/fdbrpc/include/fdbrpc/simulator.h b/fdbrpc/include/fdbrpc/simulator.h index 5228ff7b92..494de44452 100644 --- a/fdbrpc/include/fdbrpc/simulator.h +++ b/fdbrpc/include/fdbrpc/simulator.h @@ -451,7 +451,13 @@ public: int physicalDatacenters; int processesPerMachine; int listenersPerProcess; + + // We won't kill machines in this set, but we might reboot + // them. This is a conservative mechanism to prevent the + // simulator from killing off important processes and rendering + // the cluster unrecoverable, e.g. a quorum of coordinators. std::set protectedAddresses; + std::map currentlyRebootingProcesses; std::vector extraDatabases; Reference storagePolicy; diff --git a/fdbserver/SimulatedCluster.actor.cpp b/fdbserver/SimulatedCluster.actor.cpp index 513990c581..55050f412c 100644 --- a/fdbserver/SimulatedCluster.actor.cpp +++ b/fdbserver/SimulatedCluster.actor.cpp @@ -2172,6 +2172,10 @@ void setupSimulatedSystem(std::vector>* systemActors, } ASSERT(coordinatorAddresses.size() > 0); + + // Mark a random majority of the coordinators as protected, so + // we won't accidentally kill off a quorum and render the + // cluster unrecoverable. deterministicRandom()->randomShuffle(coordinatorAddresses); for (int i = 0; i < (coordinatorAddresses.size() / 2) + 1; i++) { TraceEvent("ProtectCoordinator")