fix killMachine - make sure we have at least 1 blob worker in a dc

This commit is contained in:
Hui Liu 2022-09-29 17:00:41 -07:00
parent 03f1d13be3
commit 9db48eb10c
3 changed files with 51 additions and 0 deletions

View File

@ -482,6 +482,7 @@ public:
TSSMode tssMode;
std::map<NetworkAddress, bool> corruptWorkerMap;
ConfigDBType configDBType;
bool blobGranulesEnabled;
// Used by workloads that perform reconfigurations
int testerCount;

View File

@ -1367,6 +1367,45 @@ public:
return primaryTLogsDead || primaryProcessesDead.validate(storagePolicy);
}
// Decides whether the machine `machineId` may be taken down with kill type `kt`
// without leaving its datacenter short of blob-worker-class processes.
//
// Returns true when the requested kill may proceed unchanged; *ktFinal is not
// touched in that case. Returns false when the kill is vetoed; *ktFinal is then
// overwritten with RebootAndDelete (reboot and delete data, but keep the machine).
bool canKillMachineWithBlobWorkers(Optional<Standalone<StringRef>> machineId, KillType kt, KillType* ktFinal) {
    // Allow if blob granules are disabled, OR the request is already a reboot that
    // does not remove the machine. Either condition alone is sufficient: the only
    // downgrade this function ever applies is RebootAndDelete, so a request at or
    // beyond that kill type has nothing left to soften.
    // NOTE(review): relies on every KillType enumerator >= RebootAndDelete being a
    // reboot variant - confirm against the enum declaration.
    if (!blobGranulesEnabled || kt >= RebootAndDelete) {
        return true;
    }

    // Allow if the machine hosts no process that was started as a blob worker.
    // NOTE(review): if `machines` is a std::map, operator[] inserts an empty entry
    // for an unknown id - presumably callers only pass known machine ids; verify.
    MachineInfo& currentMachine = machines[machineId];
    bool hasBlobWorker = false;
    for (auto processInfo : currentMachine.processes) {
        if (processInfo->startingClass == ProcessClass::BlobWorkerClass) {
            hasBlobWorker = true;
            break;
        }
    }
    if (!hasBlobWorker)
        return true;

    // Count blob-worker-class processes in the same dc that live on other machines.
    // No liveness check is made here: every matching process is counted.
    auto currentDcId = currentMachine.machineProcess->locality.dcId();
    int nLeft = 0;
    for (auto processInfo : getAllProcesses()) {
        if (currentDcId != processInfo->locality.dcId() || // skip other dc
            processInfo->startingClass != ProcessClass::BlobWorkerClass || // skip non blob workers
            processInfo->locality.machineId() == machineId) { // skip current machine
            continue;
        }
        nLeft++; // candidate blob worker that would remain after killing machineId
    }

    // Veto the kill when one or zero candidates would remain.
    // NOTE(review): this demands two other candidates, although the stated goal is
    // "at least 1"; possibly a deliberate margin because the count above ignores
    // liveness - confirm before tightening to `nLeft < 1`.
    if (nLeft <= 1) {
        *ktFinal = RebootAndDelete; // reboot and delete data, but keep this machine
        return false;
    }
    return true;
}
// The following function will determine if the specified configuration of available and dead processes can allow
// the cluster to survive
bool canKillProcesses(std::vector<ProcessInfo*> const& availableProcesses,
@ -1787,6 +1826,14 @@ public:
// Check if machine can be removed, if requested
if (!forceKill && ((kt == KillInstantly) || (kt == InjectFaults) || (kt == FailDisk) ||
(kt == RebootAndDelete) || (kt == RebootProcessAndDelete))) {
if (!canKillMachineWithBlobWorkers(machineId, kt, &kt)) {
TraceEvent("canKillMachineWithBlobWorkers")
.detail("MachineId", machineId)
.detail("KillType", kt)
.detail("OrigKillType", ktOrig);
}
std::vector<ProcessInfo*> processesLeft, processesDead;
int protectedWorker = 0, unavailable = 0, excluded = 0, cleared = 0;

View File

@ -1421,6 +1421,7 @@ void SimulationConfig::setSpecificConfig(const TestConfig& testConfig) {
if (testConfig.resolverCount.present()) {
db.resolverCount = testConfig.resolverCount.get();
}
db.blobGranulesEnabled = testConfig.blobGranulesEnabled;
}
// Sets generateFearless and number of dataCenters based on testConfig details
@ -1939,6 +1940,8 @@ void setupSimulatedSystem(std::vector<Future<Void>>* systemActors,
simconfig.db.tenantMode = tenantMode;
simconfig.db.encryptionAtRestMode = EncryptionAtRestMode::DISABLED;
g_simulator->blobGranulesEnabled = simconfig.db.blobGranulesEnabled;
StatusObject startingConfigJSON = simconfig.db.toJSON(true);
std::string startingConfigString = "new";
if (testConfig.configureLocked) {