Added a wait failure client that always waits the full failure reaction time, even if it knows the interface is never coming back.

Use this new wait failure client in data distribution to give a storage server time to rejoin the cluster after its interface fails.
This commit is contained in:
Evan Tschannen 2019-05-21 11:54:17 -07:00
parent 04e465925a
commit a8e8be5aac
3 changed files with 14 additions and 10 deletions

View File

@ -2825,24 +2825,15 @@ ACTOR Future<Void> storageServerFailureTracker(
if( status->isFailed )
self->restartRecruiting.trigger();
state double startTime = now();
Future<Void> healthChanged = Never();
if(status->isFailed) {
ASSERT(!inHealthyZone);
healthChanged = IFailureMonitor::failureMonitor().onStateEqual( interf.waitFailure.getEndpoint(), FailureStatus(false));
} else if(!inHealthyZone) {
healthChanged = waitFailureClient(interf.waitFailure, SERVER_KNOBS->DATA_DISTRIBUTION_FAILURE_REACTION_TIME, 0, TaskDataDistribution);
healthChanged = waitFailureClientStrict(interf.waitFailure, SERVER_KNOBS->DATA_DISTRIBUTION_FAILURE_REACTION_TIME, TaskDataDistribution);
}
choose {
when ( wait(healthChanged) ) {
double elapsed = now() - startTime;
if(!status->isFailed && elapsed < SERVER_KNOBS->DATA_DISTRIBUTION_FAILURE_REACTION_TIME) {
wait(delay(SERVER_KNOBS->DATA_DISTRIBUTION_FAILURE_REACTION_TIME - elapsed));
if(!IFailureMonitor::failureMonitor().getState( interf.waitFailure.getEndpoint() ).isFailed()) {
continue;
}
}
status->isFailed = !status->isFailed;
if(!status->isFailed && !server->teams.size()) {
self->doBuildTeams = true;

View File

@ -56,6 +56,16 @@ ACTOR Future<Void> waitFailureClient(RequestStream<ReplyPromise<Void>> waitFailu
}
}
// Strict variant of waitFailureClient. Completes only when the endpoint behind `waitFailure`
// has been reported failed AND the failure monitor still marks it failed after the reaction
// window has been waited. If the monitor reports the endpoint as not failed during the
// window, the window ends early and the whole sequence starts over.
//   waitFailure         - request stream whose endpoint is being watched
//   failureReactionTime - duration handed to delay() for the reaction window
//   taskID              - forwarded to both waitFailureClient and delay()
ACTOR Future<Void> waitFailureClientStrict(RequestStream<ReplyPromise<Void>> waitFailure, double failureReactionTime, int taskID){
	loop {
		// Step 1: run the ordinary client with reaction time and slope both set to 0.
		wait(waitFailureClient(waitFailure, 0, 0, taskID));

		// Step 2: wait out the reaction window, or stop early if the failure monitor
		// reports the endpoint as no longer failed.
		Future<Void> reactionWindow = delay(failureReactionTime, taskID);
		Future<Void> endpointRecovered = IFailureMonitor::failureMonitor().onStateEqual(waitFailure.getEndpoint(), FailureStatus(false));
		wait(reactionWindow || endpointRecovered);

		// Step 3: only report the failure if the monitor still considers the endpoint failed;
		// otherwise go around again.
		bool stillFailed = IFailureMonitor::failureMonitor().getState(waitFailure.getEndpoint()).isFailed();
		if(!stillFailed) {
			continue;
		}
		return Void();
	}
}
ACTOR Future<Void> waitFailureTracker(RequestStream<ReplyPromise<Void>> waitFailure, Reference<AsyncVar<bool>> failed, double reactionTime, double reactionSlope, int taskID){
loop {
try {

View File

@ -28,6 +28,9 @@ Future<Void> waitFailureServer(const FutureStream<ReplyPromise<Void>>& waitFailu
// Every argument after waitFailure is optional: failureReactionTime and failureReactionSlope
// default to 0, and taskID defaults to TaskDefaultEndpoint.
Future<Void> waitFailureClient(const RequestStream<ReplyPromise<Void>>& waitFailure,
double const& failureReactionTime=0, double const& failureReactionSlope=0, int const& taskID=TaskDefaultEndpoint);
// Talks to a wait failure server and returns Void on failure. The failure reaction time is
// always waited before the failure is reported.
// failureReactionTime defaults to 0 and taskID defaults to TaskDefaultEndpoint.
Future<Void> waitFailureClientStrict(const RequestStream<ReplyPromise<Void>>& waitFailure, double const& failureReactionTime=0, int const& taskID=TaskDefaultEndpoint);
// Talks to a wait failure server and updates `failed` to true or false based on the failure status.
// failureReactionTime and failureReactionSlope default to 0; taskID defaults to TaskDefaultEndpoint.
Future<Void> waitFailureTracker(const RequestStream<ReplyPromise<Void>>& waitFailure, Reference<AsyncVar<bool>> const& failed,
double const& failureReactionTime=0, double const& failureReactionSlope=0, int const& taskID=TaskDefaultEndpoint);