The checkSafeExclusions function only ensures that the exclusion is safe from the storage server perspective; it does not confirm that the exclusion is safe in terms of tlog replication.

This commit is contained in:
Evan Tschannen 2021-03-23 13:31:16 -07:00
parent 43c81e550c
commit 272e649a3c

View File

@ -502,6 +502,8 @@ struct RemoveServersSafelyWorkload : TestWorkload {
return killProcArray;
}
// Attempts to exclude a set of processes, and once the exclusion is successful it kills them.
// If markExcludeAsFailed is true, then it is an error if we cannot complete the exclusion.
ACTOR static Future<Void> removeAndKill(RemoveServersSafelyWorkload* self,
Database cx,
std::set<AddressExclusion> toKill,
@ -556,7 +558,11 @@ struct RemoveServersSafelyWorkload : TestWorkload {
.detail("Step", "SafetyCheck")
.detail("Exclusions", describe(toKillMarkFailedArray));
choose {
when(bool _safe = wait(checkSafeExclusions(cx, toKillMarkFailedArray))) { safe = _safe; }
when(bool _safe = wait(checkSafeExclusions(cx, toKillMarkFailedArray))) {
safe = _safe && self->protectServers(std::set<AddressExclusion>(toKillMarkFailedArray.begin(),
toKillMarkFailedArray.end()))
.size() == toKillMarkFailedArray.size();
}
when(wait(delay(5.0))) {
TraceEvent("RemoveAndKill", functionId)
.detail("Step", "SafetyCheckTimedOut")