mirror of
https://github.com/apple/foundationdb.git
synced 2025-06-02 19:25:52 +08:00
Add dynamic knob to disable gray failure recoveries. (#11509)
Co-authored-by: Dan Lambright <hlambright@apple.com>
This commit is contained in:
parent
a733e48048
commit
1e834f84c8
@@ -762,6 +762,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
|
||||
init( COORDINATOR_REGISTER_INTERVAL, 5.0 );
|
||||
init( CLIENT_REGISTER_INTERVAL, 600.0 );
|
||||
init( CC_ENABLE_WORKER_HEALTH_MONITOR, false );
|
||||
init( CC_PAUSE_HEALTH_MONITOR, false, Atomic::NO );
|
||||
init( CC_WORKER_HEALTH_CHECKING_INTERVAL, 60.0 );
|
||||
init( CC_DEGRADED_LINK_EXPIRATION_INTERVAL, 300.0 );
|
||||
init( CC_MIN_DEGRADATION_INTERVAL, 120.0 );
|
||||
|
@@ -717,6 +717,7 @@ public:
|
||||
double REPLACE_INTERFACE_CHECK_DELAY;
|
||||
double COORDINATOR_REGISTER_INTERVAL;
|
||||
double CLIENT_REGISTER_INTERVAL;
|
||||
bool CC_PAUSE_HEALTH_MONITOR;
|
||||
bool CC_ENABLE_WORKER_HEALTH_MONITOR;
|
||||
double CC_WORKER_HEALTH_CHECKING_INTERVAL; // The interval of refreshing the degraded server list.
|
||||
double CC_DEGRADED_LINK_EXPIRATION_INTERVAL; // The time period from the last degradation report after which a
|
||||
|
@@ -2954,7 +2954,6 @@ ACTOR Future<Void> workerHealthMonitor(ClusterControllerData* self) {
|
||||
while (!self->goodRecruitmentTime.isReady()) {
|
||||
wait(lowPriorityDelay(SERVER_KNOBS->CC_WORKER_HEALTH_CHECKING_INTERVAL));
|
||||
}
|
||||
|
||||
self->degradationInfo = self->getDegradationInfo();
|
||||
|
||||
// Compare `self->degradationInfo` with `self->excludedDegradedServers` and remove those that have
|
||||
@@ -2988,7 +2987,9 @@ ACTOR Future<Void> workerHealthMonitor(ClusterControllerData* self) {
|
||||
|
||||
// Check if the cluster controller should trigger a recovery to exclude any degraded servers from
|
||||
// the transaction system.
|
||||
if (self->shouldTriggerRecoveryDueToDegradedServers()) {
|
||||
if (SERVER_KNOBS->CC_PAUSE_HEALTH_MONITOR) {
|
||||
TraceEvent(SevWarnAlways, "HealthMonitorPaused");
|
||||
} else if (self->shouldTriggerRecoveryDueToDegradedServers()) {
|
||||
if (SERVER_KNOBS->CC_HEALTH_TRIGGER_RECOVERY) {
|
||||
if (self->recentRecoveryCountDueToHealth() < SERVER_KNOBS->CC_MAX_HEALTH_RECOVERY_COUNT) {
|
||||
self->recentHealthTriggeredRecoveryTime.push(now());
|
||||
@@ -3176,7 +3177,6 @@ ACTOR Future<Void> clusterControllerCore(ClusterControllerFullInterface interf,
|
||||
self.id.toString() + "/ClusterControllerMetrics"));
|
||||
self.addActor.send(traceRole(Role::CLUSTER_CONTROLLER, interf.id()));
|
||||
// printf("%s: I am the cluster controller\n", g_network->getLocalAddress().toString().c_str());
|
||||
|
||||
if (SERVER_KNOBS->CC_ENABLE_WORKER_HEALTH_MONITOR) {
|
||||
self.addActor.send(workerHealthMonitor(&self));
|
||||
self.addActor.send(updateRemoteDCHealth(&self));
|
||||
|
Loading…
x
Reference in New Issue
Block a user