The checkSafeExclusions function only ensures the exclusion is safe from the storage server perspective, but does not confirm it is safe in terms of tlog replication.

2025-06-02 03:12:12 +08:00 · 2021-03-23 13:31:16 -07:00 · 2021-03-23 13:31:16 -07:00 · 272e649a3c
commit 272e649a3c
parent 43c81e550c
1 changed files with 7 additions and 1 deletions
--- a/fdbserver/workloads/RemoveServersSafely.actor.cpp
+++ b/fdbserver/workloads/RemoveServersSafely.actor.cpp
@ -502,6 +502,8 @@ struct RemoveServersSafelyWorkload : TestWorkload {
 		return killProcArray;
 	}

+	// Attempts to exclude a set of processes, and once the exclusion is successful it kills them.
+	// If markExcludeAsFailed is true, then it is an error if we cannot complete the exclusion.
 	ACTOR static Future<Void> removeAndKill(RemoveServersSafelyWorkload* self,
 	                                        Database cx,
 	                                        std::set<AddressExclusion> toKill,
@ -556,7 +558,11 @@ struct RemoveServersSafelyWorkload : TestWorkload {
 				    .detail("Step", "SafetyCheck")
 				    .detail("Exclusions", describe(toKillMarkFailedArray));
 				choose {
-					when(bool _safe = wait(checkSafeExclusions(cx, toKillMarkFailedArray))) { safe = _safe; }
+					when(bool _safe = wait(checkSafeExclusions(cx, toKillMarkFailedArray))) {
+						safe = _safe && self->protectServers(std::set<AddressExclusion>(toKillMarkFailedArray.begin(),
+						                                                                toKillMarkFailedArray.end()))
+						                        .size() == toKillMarkFailedArray.size();
+					}
 					when(wait(delay(5.0))) {
 						TraceEvent("RemoveAndKill", functionId)
 						    .detail("Step", "SafetyCheckTimedOut")