fix: desiredTLogCount was used instead of getDesiredLogs(), which caused problems with recruitment when desiredTLogCount was -1.
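(For context, a minimal sketch of the sentinel problem, assuming a desiredTLogCount field that stays -1 until explicitly configured and a getDesiredLogs() accessor that resolves the sentinel; the default computation shown here is an assumption, not FoundationDB's actual formula.)

	#include <algorithm>
	#include <cstdint>

	// Sketch only: shapes and defaults are assumptions, not FoundationDB's code.
	struct DatabaseConfigurationSketch {
		int32_t tLogReplicationFactor = 3;
		int32_t desiredTLogCount = -1; // -1 means "operator never set a count"

		// Resolves the -1 sentinel to a positive, recruitable target.
		int32_t getDesiredLogs() const {
			return desiredTLogCount == -1 ? std::max<int32_t>(3, tLogReplicationFactor)
			                              : desiredTLogCount;
		}
	};

	// Recruitment that reads desiredTLogCount directly asks the worker-selection
	// code for "-1 tlogs" on unconfigured clusters, a target it can never meet;
	// reading getDesiredLogs() always yields a positive count.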

canKillProcess logic was wrong.
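(A simplified boolean sketch of the corrected survivability predicate in the simulator's canKillProcesses, ignoring satellite tlogs and anti-quorums, which the real code also handles: the old code joined the two regions with &&, so a kill was still treated as survivable when only one region could validate its policies.)

	// Simplified sketch; the real check runs replication policies over locality sets.
	struct RegionHealth {
		bool tLogsValidate;    // remaining processes still satisfy the tlog policy
		bool storageValidates; // remaining processes still satisfy the storage policy
	};

	// After the fix: if EITHER region cannot validate its policies with the
	// processes that would remain, there is "not enough left" and the kill
	// must be downgraded to a reboot-style kill.
	bool notEnoughLeft(const RegionHealth& primary, const RegionHealth& remote) {
		return !primary.tLogsValidate || !primary.storageValidates ||
		       !remote.tLogsValidate  || !remote.storageValidates;
	}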
We still need to configure usable_regions back to 1, because if the datacenterVersionDifference is too large we cannot complete data movement.
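(The reconfigureAfter change below encodes this ordering: wait until the remote region reports REMOTE_RECOVERED, then drop to one region. A condensed sketch with stand-ins for dbInfo and changeConfig; the enum values other than REMOTE_RECOVERED, and their numeric order, are assumptions.)

	#include <functional>

	// Assumed subset of RecoveryState; only REMOTE_RECOVERED being the
	// largest value matters for the comparison below.
	enum class RecoveryState { RECRUITING = 0, ACCEPTING_COMMITS = 1, REMOTE_RECOVERED = 2 };

	// Stand-ins for dbInfo->get().recoveryState, dbInfo->onChange(), and
	// changeConfig(); the real code is a flow ACTOR.
	void dropToOneRegionWhenSafe(std::function<RecoveryState()> currentState,
	                             std::function<void()> waitForChange,
	                             std::function<void(const char*)> changeConfig) {
		// Reconfiguring to usable_regions=1 while the remote region lags leaves
		// the datacenterVersionDifference unbounded and strands data movement,
		// so block until the remote region has recovered first.
		while (currentState() < RecoveryState::REMOTE_RECOVERED)
			waitForChange();
		changeConfig("usable_regions=1");
	}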
Evan Tschannen 2018-07-04 16:22:32 -04:00
parent ea3365dc38
commit e17dfea3b6
6 changed files with 26 additions and 33 deletions


@@ -1106,7 +1106,7 @@ public:
 			}
 			if(!hasSatelliteReplication) {
-				notEnoughLeft = ( !primaryProcessesLeft.validate(tLogPolicy) || !primaryProcessesLeft.validate(storagePolicy) ) && ( !remoteProcessesLeft.validate(tLogPolicy) || !remoteProcessesLeft.validate(storagePolicy) );
+				notEnoughLeft = !primaryProcessesLeft.validate(tLogPolicy) || !primaryProcessesLeft.validate(storagePolicy) || !remoteProcessesLeft.validate(tLogPolicy) || !remoteProcessesLeft.validate(storagePolicy);
 				if(usableRegions > 1) {
 					tooManyDead = primaryTLogsDead || remoteTLogsDead || ( primaryProcessesDead.validate(storagePolicy) && remoteProcessesDead.validate(storagePolicy) );
 				} else {
@@ -1116,11 +1116,11 @@ public:
 				bool primarySatelliteTLogsDead = satelliteTLogWriteAntiQuorum ? !validateAllCombinations(badCombo, primarySatelliteProcessesDead, satelliteTLogPolicy, primarySatelliteLocalitiesLeft, satelliteTLogWriteAntiQuorum, false) : primarySatelliteProcessesDead.validate(satelliteTLogPolicy);
 				bool remoteSatelliteTLogsDead = satelliteTLogWriteAntiQuorum ? !validateAllCombinations(badCombo, remoteSatelliteProcessesDead, satelliteTLogPolicy, remoteSatelliteLocalitiesLeft, satelliteTLogWriteAntiQuorum, false) : remoteSatelliteProcessesDead.validate(satelliteTLogPolicy);
-				notEnoughLeft = ( !primaryProcessesLeft.validate(tLogPolicy) || !primaryProcessesLeft.validate(storagePolicy) || !primarySatelliteProcessesLeft.validate(satelliteTLogPolicy) ) && ( !remoteProcessesLeft.validate(tLogPolicy) || !remoteProcessesLeft.validate(storagePolicy) || !remoteSatelliteProcessesLeft.validate(satelliteTLogPolicy) );
+				notEnoughLeft = !primaryProcessesLeft.validate(tLogPolicy) || !primaryProcessesLeft.validate(storagePolicy) || !primarySatelliteProcessesLeft.validate(satelliteTLogPolicy) || !remoteProcessesLeft.validate(tLogPolicy) || !remoteProcessesLeft.validate(storagePolicy) || !remoteSatelliteProcessesLeft.validate(satelliteTLogPolicy);
 				if(usableRegions > 1) {
-					tooManyDead = ( primaryTLogsDead && primarySatelliteTLogsDead ) || ( remoteTLogsDead && remoteSatelliteTLogsDead ) || ( primaryProcessesDead.validate(storagePolicy) && remoteProcessesDead.validate(storagePolicy) );
+					tooManyDead = ( primaryTLogsDead && primarySatelliteTLogsDead ) || ( remoteTLogsDead && remoteSatelliteTLogsDead ) || ( primaryTLogsDead && remoteTLogsDead ) || ( primaryProcessesDead.validate(storagePolicy) && remoteProcessesDead.validate(storagePolicy) );
 				} else {
-					tooManyDead = ( primaryTLogsDead && primarySatelliteTLogsDead ) || ( remoteTLogsDead && remoteSatelliteTLogsDead ) || primaryProcessesDead.validate(storagePolicy) || remoteProcessesDead.validate(storagePolicy);
+					tooManyDead = primaryTLogsDead || remoteTLogsDead || primaryProcessesDead.validate(storagePolicy) || remoteProcessesDead.validate(storagePolicy);
 				}
 			}
 		}
@@ -1132,13 +1132,13 @@ public:
 			TraceEvent("KillChanged").detail("KillType", kt).detail("NewKillType", newKt).detail("TLogPolicy", tLogPolicy->info()).detail("Reason", "tLogPolicy validates against dead processes.");
 		}
 		// Reboot and Delete if remaining machines do NOT fulfill policies
-		else if ((kt != RebootAndDelete) && (kt != RebootProcessAndDelete) && notEnoughLeft) {
-			newKt = (g_random->random01() < 0.33) ? RebootAndDelete : Reboot;
+		else if ((kt < RebootAndDelete) && notEnoughLeft) {
+			newKt = RebootAndDelete;
 			canSurvive = false;
 			TraceEvent("KillChanged").detail("KillType", kt).detail("NewKillType", newKt).detail("TLogPolicy", tLogPolicy->info()).detail("Reason", "tLogPolicy does not validates against remaining processes.");
 		}
-		else if ((kt != RebootAndDelete) && (kt != RebootProcessAndDelete) && (nQuorum > uniqueMachines.size())) {
-			newKt = (g_random->random01() < 0.33) ? RebootAndDelete : Reboot;
+		else if ((kt < RebootAndDelete) && (nQuorum > uniqueMachines.size())) {
+			newKt = RebootAndDelete;
 			canSurvive = false;
 			TraceEvent("KillChanged").detail("KillType", kt).detail("NewKillType", newKt).detail("StoragePolicy", storagePolicy->info()).detail("Quorum", nQuorum).detail("Machines", uniqueMachines.size()).detail("Reason", "Not enough unique machines to perform auto configuration of coordinators.");
 		}
@@ -1220,9 +1220,8 @@ public:
 			killProcess_internal( processes[i], kt );
 		}
 	}
-	virtual bool killMachine(Optional<Standalone<StringRef>> zoneId, KillType kt, bool killIsSafe, bool forceKill, KillType* ktFinal) {
+	virtual bool killMachine(Optional<Standalone<StringRef>> zoneId, KillType kt, bool forceKill, KillType* ktFinal) {
 		auto ktOrig = kt;
-		if (killIsSafe) ASSERT( kt == ISimulator::RebootAndDelete ); // Only types of "safe" kill supported so far
 		TEST(true); // Trying to killing a machine
 		TEST(kt == KillInstantly); // Trying to kill instantly
@@ -1288,9 +1287,6 @@ public:
 				}
 			}
 			if (!canKillProcesses(processesLeft, processesDead, kt, &kt)) {
-				if ((kt != Reboot) && (!killIsSafe)) {
-					kt = Reboot;
-				}
 				TraceEvent("ChangedKillMachine", zoneId).detailext("ZoneId", zoneId).detail("KillType", kt).detail("OrigKillType", ktOrig).detail("ProcessesLeft", processesLeft.size()).detail("ProcessesDead", processesDead.size()).detail("TotalProcesses", machines.size()).detail("ProcessesPerMachine", processesPerMachine).detail("Protected", protectedWorker).detail("Unavailable", unavailable).detail("Excluded", excluded).detail("Cleared", cleared).detail("ProtectedTotal", protectedAddresses.size()).detail("TLogPolicy", tLogPolicy->info()).detail("StoragePolicy", storagePolicy->info());
 			}
 			else if ((kt == KillInstantly) || (kt == InjectFaults)) {
@@ -1324,15 +1320,15 @@ public:
 		}
 		// Check if any processes on machine are rebooting
-		if ( processesOnMachine != processesPerMachine) {
+		if ( processesOnMachine != processesPerMachine ) {
 			TEST(true); //Attempted reboot, but the target did not have all of its processes running
 			TraceEvent(SevWarn, "AbortedKill", zoneId).detail("KillType", kt).detailext("ZoneId", zoneId).detail("Reason", "Machine processes does not match number of processes per machine").detail("Processes", processesOnMachine).detail("ProcessesPerMachine", processesPerMachine).backtrace();
 			if (ktFinal) *ktFinal = None;
 			return false;
 		}
-		TraceEvent("KillMachine", zoneId).detailext("ZoneId", zoneId).detail("Kt", kt).detail("KtOrig", ktOrig).detail("KillableMachines", processesOnMachine).detail("ProcessPerMachine", processesPerMachine).detail("KillChanged", kt!=ktOrig).detail("KillIsSafe", killIsSafe);
-		if (kt < RebootAndDelete ) {
+		TraceEvent("KillMachine", zoneId).detailext("ZoneId", zoneId).detail("Kt", kt).detail("KtOrig", ktOrig).detail("KillableMachines", processesOnMachine).detail("ProcessPerMachine", processesPerMachine).detail("KillChanged", kt!=ktOrig);
+		if ( kt < RebootAndDelete ) {
 			if(kt == InjectFaults && machines[zoneId].machineProcess != nullptr)
 				killProcess_internal( machines[zoneId].machineProcess, kt );
 			for (auto& process : machines[zoneId].processes) {
@@ -1341,7 +1337,7 @@ public:
 					killProcess_internal( process, kt );
 				}
 			}
-			else if ( kt == Reboot || killIsSafe) {
+			else if ( kt == Reboot || kt == RebootAndDelete ) {
 				for (auto& process : machines[zoneId].processes) {
 					TraceEvent("KillMachineProcess", zoneId).detail("KillType", kt).detail("Process", process->toString()).detail("StartingClass", process->startingClass.toString()).detail("Failed", process->failed).detail("Excluded", process->excluded).detail("Cleared", process->cleared).detail("Rebooting", process->rebooting);
 					if (process->startingClass != ProcessClass::TesterClass)
@@ -1424,7 +1420,7 @@ public:
 		KillType ktResult, ktMin = kt;
 		for (auto& datacenterZone : datacenterZones) {
-			killMachine( datacenterZone.first, kt, (kt == RebootAndDelete), true, &ktResult);
+			killMachine(datacenterZone.first, kt, true, &ktResult);
 			if (ktResult != kt) {
 				TraceEvent(SevWarn, "KillDCFail")
 					.detailext("Zone", datacenterZone.first)


@@ -149,7 +149,7 @@ public:
 	virtual void rebootProcess(Optional<Standalone<StringRef>> zoneId, bool allProcesses ) = 0;
 	virtual void rebootProcess( ProcessInfo* process, KillType kt ) = 0;
 	virtual void killInterface( NetworkAddress address, KillType ) = 0;
-	virtual bool killMachine(Optional<Standalone<StringRef>> zoneId, KillType, bool killIsSafe = false, bool forceKill = false, KillType* ktFinal = NULL) = 0;
+	virtual bool killMachine(Optional<Standalone<StringRef>> zoneId, KillType, bool forceKill = false, KillType* ktFinal = NULL) = 0;
 	virtual bool killDataCenter(Optional<Standalone<StringRef>> dcId, KillType kt, KillType* ktFinal = NULL) = 0;
 	//virtual KillType getMachineKillState( UID zoneID ) = 0;
 	virtual bool canKillProcesses(std::vector<ProcessInfo*> const& availableProcesses, std::vector<ProcessInfo*> const& deadProcesses, KillType kt, KillType* newKillType) const = 0;


@@ -277,7 +277,7 @@ public:
 		if (logServerSet->size() < required) {
 			TraceEvent(SevWarn,"GWFTADTooFew", id).detail("Fitness", fitness).detail("Processes", logServerSet->size()).detail("Required", required).detail("TLogPolicy", policy->info()).detail("DesiredLogs", desired);
 		}
-		else if (logServerSet->size() <= desired) {
+		else if (logServerSet->size() == required || logServerSet->size() <= desired) {
 			if (logServerSet->validate(policy)) {
 				for (auto& object : logServerMap->getObjects()) {
 					results.push_back(*object);
@@ -754,8 +754,7 @@ public:
 			std::set<Optional<Key>> primaryDC;
 			primaryDC.insert(regions[0].dcId);
-			getWorkersForTlogs(db.config, db.config.tLogReplicationFactor, db.config.desiredTLogCount, db.config.tLogPolicy, id_used, true, primaryDC);
+			getWorkersForTlogs(db.config, db.config.tLogReplicationFactor, db.config.getDesiredLogs(), db.config.tLogPolicy, id_used, true, primaryDC);
 			if(regions[0].satelliteTLogReplicationFactor > 0) {
 				bool satelliteFallback = false;
 				getWorkersForSatelliteLogs(db.config, regions[0], id_used, satelliteFallback, true);
@@ -901,7 +900,7 @@ public:
 		// Check tLog fitness
 		RoleFitness oldTLogFit(tlogs, ProcessClass::TLog);
-		RoleFitness newTLogFit(getWorkersForTlogs(db.config, db.config.tLogReplicationFactor, db.config.desiredTLogCount, db.config.tLogPolicy, id_used, true, primaryDC), ProcessClass::TLog);
+		RoleFitness newTLogFit(getWorkersForTlogs(db.config, db.config.tLogReplicationFactor, db.config.getDesiredLogs(), db.config.tLogPolicy, id_used, true, primaryDC), ProcessClass::TLog);
 		if(oldTLogFit < newTLogFit) return false;
@@ -1542,7 +1541,7 @@ void clusterRegisterMaster( ClusterControllerData* self, RegisterMasterRequest c
 	req.reply.send( Void() );
 	TraceEvent("MasterRegistrationReceived", self->id).detail("DbName", printable(req.dbName)).detail("MasterId", req.id).detail("Master", req.mi.toString()).detail("Tlogs", describe(req.logSystemConfig.tLogs)).detail("Resolvers", req.resolvers.size())
-		.detail("RecoveryState", (int)req.recoveryState).detail("RegistrationCount", req.registrationCount).detail("Proxies", req.proxies.size()).detail("RecoveryCount", req.recoveryCount);
+		.detail("RecoveryState", (int)req.recoveryState).detail("RegistrationCount", req.registrationCount).detail("Proxies", req.proxies.size()).detail("RecoveryCount", req.recoveryCount).detail("Stalled", req.recoveryStalled);
 	//make sure the request comes from an active database
 	auto db = &self->db;


@@ -283,19 +283,17 @@ ACTOR Future<bool> getStorageServersRecruiting( Database cx, Reference<AsyncVar<
 	}
 }
 
-ACTOR Future<Void> reconfigureAfter(Database cx, double time) {
+ACTOR Future<Void> reconfigureAfter(Database cx, double time, Reference<AsyncVar<ServerDBInfo>> dbInfo) {
 	Void _ = wait( delay(time) );
-	if(g_network->isSimulated()) {
-		TraceEvent(SevWarnAlways, "DisablingFearlessConfiguration");
-		g_simulator.usableRegions = 1;
-		ConfigurationResult::Type _ = wait( changeConfig( cx, "repopulate_anti_quorum=1" ) );
-	}
 	if (g_network->isSimulated() && g_simulator.extraDB) {
 		Reference<ClusterConnectionFile> extraFile(new ClusterConnectionFile(*g_simulator.extraDB));
 		Reference<Cluster> cluster = Cluster::createCluster(extraFile, -1);
 		Database extraDB = cluster->createDatabase(LiteralStringRef("DB")).get();
 		ConfigurationResult::Type _ = wait(changeConfig(extraDB, "repopulate_anti_quorum=1"));
 	}
+	while( dbInfo->get().recoveryState < RecoveryState::REMOTE_RECOVERED ) {
+		Void _ = wait( dbInfo->onChange() );
+	}
+	ConfigurationResult::Type _ = wait( changeConfig( cx, "usable_regions=1" ) );
 	return Void();
@@ -303,7 +301,7 @@ ACTOR Future<Void> reconfigureAfter(Database cx, double time) {
 ACTOR Future<Void> waitForQuietDatabase( Database cx, Reference<AsyncVar<ServerDBInfo>> dbInfo, std::string phase, int64_t dataInFlightGate = 2e6,
 		int64_t maxTLogQueueGate = 5e6, int64_t maxStorageServerQueueGate = 5e6, int64_t maxDataDistributionQueueSize = 0 ) {
-	state Future<Void> reconfig = reconfigureAfter(cx, 100 + (g_random->random01()*100));
+	state Future<Void> reconfig = reconfigureAfter(cx, 100 + (g_random->random01()*100), dbInfo);
 	TraceEvent(("QuietDatabase" + phase + "Begin").c_str());


@@ -160,7 +160,7 @@ struct MoveKeysWorkload : TestWorkload {
 	ACTOR Future<Void> forceMasterFailure( Database cx, MoveKeysWorkload *self ) {
 		ASSERT( g_network->isSimulated() );
 		loop {
-			if( g_simulator.killMachine( self->dbInfo->get().master.locality.zoneId(), ISimulator::Reboot, false, true ) )
+			if( g_simulator.killMachine( self->dbInfo->get().master.locality.zoneId(), ISimulator::Reboot, true ) )
 				return Void();
 			Void _ = wait( delay(1.0) );
 		}


@@ -373,7 +373,7 @@ struct RemoveServersSafelyWorkload : TestWorkload {
 		}
 		TraceEvent("RemoveAndKill", functionId).detail("Step", removeViaClear ? "ClearMachines" : "KillMachines").detail("Addresses", describe(killAddrs)).detail("Processes", killProcArray.size()).detail("Zones", zoneIds.size()).detail("ClusterAvailable", g_simulator.isAvailable());
 		for (auto& zoneId : zoneIds) {
-			killedMachine = g_simulator.killMachine( zoneId, removeViaClear ? ISimulator::RebootAndDelete : ISimulator::KillInstantly, removeViaClear);
+			killedMachine = g_simulator.killMachine( zoneId, removeViaClear ? ISimulator::RebootAndDelete : ISimulator::KillInstantly );
 			TraceEvent(killedMachine ? SevInfo : SevWarn, "RemoveAndKill").detail("Step", removeViaClear ? "Clear Machine" : "Kill Machine").detailext("ZoneId", zoneId).detail(removeViaClear ? "Cleared" : "Killed", killedMachine).detail("ClusterAvailable", g_simulator.isAvailable());
 		}
 	}