From c1e5dd662c1c7fe2fe2bc2fee64a63a957450877 Mon Sep 17 00:00:00 2001 From: Lukas Joswiak Date: Wed, 19 Oct 2022 16:47:30 -0700 Subject: [PATCH 1/2] Protect availability of DR coordinators in simulation Otherwise, simulation may reboot and delete a majority of coordinator processes in the DR cluster, causing `configuration_never_created` errors. --- fdbserver/SimulatedCluster.actor.cpp | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/fdbserver/SimulatedCluster.actor.cpp b/fdbserver/SimulatedCluster.actor.cpp index 07bc48132a..e3c77f29cb 100644 --- a/fdbserver/SimulatedCluster.actor.cpp +++ b/fdbserver/SimulatedCluster.actor.cpp @@ -2172,6 +2172,19 @@ void setupSimulatedSystem(std::vector<Future<Void>>* systemActors, } deterministicRandom()->randomShuffle(coordinatorAddresses); + for (const auto& coordinators : extraCoordinatorAddresses) { + for (int i = 0; i < (coordinators.size() / 2) + 1; i++) { + TraceEvent("ProtectCoordinator") + .detail("Address", coordinators[i]) + .detail("Coordinators", describe(coordinators)); + g_simulator->protectedAddresses.insert( + NetworkAddress(coordinators[i].ip, coordinators[i].port, true, coordinators[i].isTLS())); + if (coordinators[i].port == 2) { + g_simulator->protectedAddresses.insert(NetworkAddress(coordinators[i].ip, 1, true, true)); + } + } + } + ASSERT_EQ(coordinatorAddresses.size(), coordinatorCount); ClusterConnectionString conn(coordinatorAddresses, "TestCluster:0"_sr); if (useHostname) { From 165f4266b832cb52ce83a62084af8f52b3cea5ac Mon Sep 17 00:00:00 2001 From: Lukas Joswiak Date: Wed, 19 Oct 2022 16:50:50 -0700 Subject: [PATCH 2/2] Avoid passing processes from multiple clusters to `canKillProcesses` Previously, `canKillProcesses` was being called with processes from both the main and DR clusters. This would cause it to incorrectly allow permanent killing of machines, as it was acting on bad data. 
--- fdbrpc/sim2.actor.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fdbrpc/sim2.actor.cpp b/fdbrpc/sim2.actor.cpp index 12e15ce92a..bcd0241a14 100644 --- a/fdbrpc/sim2.actor.cpp +++ b/fdbrpc/sim2.actor.cpp @@ -1337,7 +1337,8 @@ public: bool isAvailable() const override { std::vector<ProcessInfo*> processesLeft, processesDead; for (auto processInfo : getAllProcesses()) { - if (processInfo->isAvailableClass()) { + if (processInfo->isAvailableClass() && + !processInfo->drProcess) { // Only checks availability of main cluster if (processInfo->isExcluded() || processInfo->isCleared() || !processInfo->isAvailable()) { processesDead.push_back(processInfo); } else { @@ -1853,7 +1854,7 @@ public: int protectedWorker = 0, unavailable = 0, excluded = 0, cleared = 0; for (auto processInfo : getAllProcesses()) { - if (processInfo->isAvailableClass()) { + if (processInfo->isAvailableClass() && processInfo->drProcess != isMainCluster) { if (processInfo->isExcluded()) { processesDead.push_back(processInfo); excluded++; @@ -2070,7 +2071,7 @@ public: (kt == RebootAndDelete) || (kt == RebootProcessAndDelete))) { std::vector<ProcessInfo*> processesLeft, processesDead; for (auto processInfo : getAllProcesses()) { - if (processInfo->isAvailableClass()) { + if (processInfo->isAvailableClass() && !processInfo->drProcess) { // TODO: Reboot DR processes as well if (processInfo->isExcluded() || processInfo->isCleared() || !processInfo->isAvailable()) { processesDead.push_back(processInfo); } else if (protectedAddresses.count(processInfo->address) ||