Add dynamic knob to disable gray failure recoveries. (#11509)

Co-authored-by: Dan Lambright <hlambright@apple.com>
This commit is contained in:
Dan Lambright 2024-07-20 14:35:21 -04:00 committed by GitHub
parent a733e48048
commit 1e834f84c8
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 5 additions and 3 deletions

View File

@@ -762,6 +762,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( COORDINATOR_REGISTER_INTERVAL, 5.0 );
init( CLIENT_REGISTER_INTERVAL, 600.0 );
init( CC_ENABLE_WORKER_HEALTH_MONITOR, false );
+init( CC_PAUSE_HEALTH_MONITOR, false, Atomic::NO );
init( CC_WORKER_HEALTH_CHECKING_INTERVAL, 60.0 );
init( CC_DEGRADED_LINK_EXPIRATION_INTERVAL, 300.0 );
init( CC_MIN_DEGRADATION_INTERVAL, 120.0 );

View File

@@ -717,6 +717,7 @@ public:
double REPLACE_INTERFACE_CHECK_DELAY;
double COORDINATOR_REGISTER_INTERVAL;
double CLIENT_REGISTER_INTERVAL;
+bool CC_PAUSE_HEALTH_MONITOR;
bool CC_ENABLE_WORKER_HEALTH_MONITOR;
double CC_WORKER_HEALTH_CHECKING_INTERVAL; // The interval of refreshing the degraded server list.
double CC_DEGRADED_LINK_EXPIRATION_INTERVAL; // The time period from the last degradation report after which a

View File

@@ -2954,7 +2954,6 @@ ACTOR Future<Void> workerHealthMonitor(ClusterControllerData* self) {
while (!self->goodRecruitmentTime.isReady()) {
wait(lowPriorityDelay(SERVER_KNOBS->CC_WORKER_HEALTH_CHECKING_INTERVAL));
}
self->degradationInfo = self->getDegradationInfo();
// Compare `self->degradationInfo` with `self->excludedDegradedServers` and remove those that have
@@ -2988,7 +2987,9 @@ ACTOR Future<Void> workerHealthMonitor(ClusterControllerData* self) {
// Check if the cluster controller should trigger a recovery to exclude any degraded servers from
// the transaction system.
-if (self->shouldTriggerRecoveryDueToDegradedServers()) {
+if (SERVER_KNOBS->CC_PAUSE_HEALTH_MONITOR) {
+TraceEvent(SevWarnAlways, "HealthMonitorPaused");
+} else if (self->shouldTriggerRecoveryDueToDegradedServers()) {
if (SERVER_KNOBS->CC_HEALTH_TRIGGER_RECOVERY) {
if (self->recentRecoveryCountDueToHealth() < SERVER_KNOBS->CC_MAX_HEALTH_RECOVERY_COUNT) {
self->recentHealthTriggeredRecoveryTime.push(now());
@@ -3176,7 +3177,6 @@ ACTOR Future<Void> clusterControllerCore(ClusterControllerFullInterface interf,
self.id.toString() + "/ClusterControllerMetrics"));
self.addActor.send(traceRole(Role::CLUSTER_CONTROLLER, interf.id()));
// printf("%s: I am the cluster controller\n", g_network->getLocalAddress().toString().c_str());
if (SERVER_KNOBS->CC_ENABLE_WORKER_HEALTH_MONITOR) {
self.addActor.send(workerHealthMonitor(&self));
self.addActor.send(updateRemoteDCHealth(&self));