From a8e8be5aacce3ef4c2078a36336c611836b97c77 Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Tue, 21 May 2019 11:54:17 -0700 Subject: [PATCH] added a wait failure client which always waits the full failure reaction time, even if it knows the interface is never coming back; use this new wait failure client in data distribution, to give time for a storage server to rejoin the cluster after its interface fails --- fdbserver/DataDistribution.actor.cpp | 11 +---------- fdbserver/WaitFailure.actor.cpp | 10 ++++++++++ fdbserver/WaitFailure.h | 3 +++ 3 files changed, 14 insertions(+), 10 deletions(-) diff --git a/fdbserver/DataDistribution.actor.cpp b/fdbserver/DataDistribution.actor.cpp index 67106318cb..61bf411f94 100644 --- a/fdbserver/DataDistribution.actor.cpp +++ b/fdbserver/DataDistribution.actor.cpp @@ -2825,24 +2825,15 @@ ACTOR Future<Void> storageServerFailureTracker( if( status->isFailed ) self->restartRecruiting.trigger(); - state double startTime = now(); Future<Void> healthChanged = Never(); if(status->isFailed) { ASSERT(!inHealthyZone); healthChanged = IFailureMonitor::failureMonitor().onStateEqual( interf.waitFailure.getEndpoint(), FailureStatus(false)); } else if(!inHealthyZone) { - healthChanged = waitFailureClient(interf.waitFailure, SERVER_KNOBS->DATA_DISTRIBUTION_FAILURE_REACTION_TIME, 0, TaskDataDistribution); + healthChanged = waitFailureClientStrict(interf.waitFailure, SERVER_KNOBS->DATA_DISTRIBUTION_FAILURE_REACTION_TIME, TaskDataDistribution); } choose { when ( wait(healthChanged) ) { - double elapsed = now() - startTime; - if(!status->isFailed && elapsed < SERVER_KNOBS->DATA_DISTRIBUTION_FAILURE_REACTION_TIME) { - wait(delay(SERVER_KNOBS->DATA_DISTRIBUTION_FAILURE_REACTION_TIME - elapsed)); - if(!IFailureMonitor::failureMonitor().getState( interf.waitFailure.getEndpoint() ).isFailed()) { - continue; - } - } - status->isFailed = !status->isFailed; if(!status->isFailed && !server->teams.size()) { self->doBuildTeams = true; diff --git 
a/fdbserver/WaitFailure.actor.cpp b/fdbserver/WaitFailure.actor.cpp index 9fa2c8025c..c4ca435551 100644 --- a/fdbserver/WaitFailure.actor.cpp +++ b/fdbserver/WaitFailure.actor.cpp @@ -56,6 +56,16 @@ ACTOR Future<Void> waitFailureClient(RequestStream<ReplyPromise<Void>> waitFailu } } +ACTOR Future<Void> waitFailureClientStrict(RequestStream<ReplyPromise<Void>> waitFailure, double failureReactionTime, int taskID){ + loop { + wait(waitFailureClient(waitFailure, 0, 0, taskID)); + wait(delay(failureReactionTime, taskID) || IFailureMonitor::failureMonitor().onStateEqual( waitFailure.getEndpoint(), FailureStatus(false))); + if(IFailureMonitor::failureMonitor().getState( waitFailure.getEndpoint() ).isFailed()) { + return Void(); + } + } +} + ACTOR Future<Void> waitFailureTracker(RequestStream<ReplyPromise<Void>> waitFailure, Reference<AsyncVar<bool>> failed, double reactionTime, double reactionSlope, int taskID){ loop { try { diff --git a/fdbserver/WaitFailure.h b/fdbserver/WaitFailure.h index f30c8d35f5..9ef3b4c3a0 100644 --- a/fdbserver/WaitFailure.h +++ b/fdbserver/WaitFailure.h @@ -28,6 +28,9 @@ Future<Void> waitFailureServer(const FutureStream<ReplyPromise<Void>>& waitFailu Future<Void> waitFailureClient(const RequestStream<ReplyPromise<Void>>& waitFailure, double const& failureReactionTime=0, double const& failureReactionSlope=0, int const& taskID=TaskDefaultEndpoint); +// talks to a wait failure server, returns Void on failure, reaction time is always waited +Future<Void> waitFailureClientStrict(const RequestStream<ReplyPromise<Void>>& waitFailure, double const& failureReactionTime=0, int const& taskID=TaskDefaultEndpoint); + // talks to a wait failure server, updates failed to be true or false based on failure status. Future<Void> waitFailureTracker(const RequestStream<ReplyPromise<Void>>& waitFailure, Reference<AsyncVar<bool>> const& failed, double const& failureReactionTime=0, double const& failureReactionSlope=0, int const& taskID=TaskDefaultEndpoint);