1
0
mirror of https://github.com/apple/foundationdb.git synced 2025-05-21 05:53:02 +08:00

Added a wait failure client which always waits the full failure reaction time, even if it knows the interface is never coming back.

Use this new wait failure client in data distribution, to give a storage server time to rejoin the cluster after its interface fails.
This commit is contained in:
Evan Tschannen 2019-05-21 11:54:17 -07:00
parent 04e465925a
commit a8e8be5aac
3 changed files with 14 additions and 10 deletions

@ -2825,24 +2825,15 @@ ACTOR Future<Void> storageServerFailureTracker(
if( status->isFailed ) if( status->isFailed )
self->restartRecruiting.trigger(); self->restartRecruiting.trigger();
state double startTime = now();
Future<Void> healthChanged = Never(); Future<Void> healthChanged = Never();
if(status->isFailed) { if(status->isFailed) {
ASSERT(!inHealthyZone); ASSERT(!inHealthyZone);
healthChanged = IFailureMonitor::failureMonitor().onStateEqual( interf.waitFailure.getEndpoint(), FailureStatus(false)); healthChanged = IFailureMonitor::failureMonitor().onStateEqual( interf.waitFailure.getEndpoint(), FailureStatus(false));
} else if(!inHealthyZone) { } else if(!inHealthyZone) {
healthChanged = waitFailureClient(interf.waitFailure, SERVER_KNOBS->DATA_DISTRIBUTION_FAILURE_REACTION_TIME, 0, TaskDataDistribution); healthChanged = waitFailureClientStrict(interf.waitFailure, SERVER_KNOBS->DATA_DISTRIBUTION_FAILURE_REACTION_TIME, TaskDataDistribution);
} }
choose { choose {
when ( wait(healthChanged) ) { when ( wait(healthChanged) ) {
double elapsed = now() - startTime;
if(!status->isFailed && elapsed < SERVER_KNOBS->DATA_DISTRIBUTION_FAILURE_REACTION_TIME) {
wait(delay(SERVER_KNOBS->DATA_DISTRIBUTION_FAILURE_REACTION_TIME - elapsed));
if(!IFailureMonitor::failureMonitor().getState( interf.waitFailure.getEndpoint() ).isFailed()) {
continue;
}
}
status->isFailed = !status->isFailed; status->isFailed = !status->isFailed;
if(!status->isFailed && !server->teams.size()) { if(!status->isFailed && !server->teams.size()) {
self->doBuildTeams = true; self->doBuildTeams = true;

@ -56,6 +56,16 @@ ACTOR Future<Void> waitFailureClient(RequestStream<ReplyPromise<Void>> waitFailu
} }
} }
// Strict variant of waitFailureClient: the only way this actor completes is the isFailed()
// check below, which runs after the reaction-time wait has finished.
//   waitFailure         - request stream of the wait-failure server being watched.
//   failureReactionTime - duration handed to delay() once a failure has been reported.
//   taskID              - passed through to both the inner waitFailureClient call and delay().
// Returns Void once the failure monitor still reports the endpoint as failed after that wait.
ACTOR Future<Void> waitFailureClientStrict(RequestStream<ReplyPromise<Void>> waitFailure, double failureReactionTime, int taskID){
loop {
// Reaction time and slope are both passed as 0 here; the grace period is applied by the
// explicit delay below instead of inside waitFailureClient.
wait(waitFailureClient(waitFailure, 0, 0, taskID));
// Wait until either failureReactionTime has elapsed or the failure monitor reports
// FailureStatus(false) for this endpoint (presumably "no longer failed" - confirm against IFailureMonitor).
wait(delay(failureReactionTime, taskID) || IFailureMonitor::failureMonitor().onStateEqual( waitFailure.getEndpoint(), FailureStatus(false)));
// Re-check the current state: if the endpoint is still failed, report the failure;
// otherwise fall through and start watching again from the top of the loop.
if(IFailureMonitor::failureMonitor().getState( waitFailure.getEndpoint() ).isFailed()) {
return Void();
}
}
}
ACTOR Future<Void> waitFailureTracker(RequestStream<ReplyPromise<Void>> waitFailure, Reference<AsyncVar<bool>> failed, double reactionTime, double reactionSlope, int taskID){ ACTOR Future<Void> waitFailureTracker(RequestStream<ReplyPromise<Void>> waitFailure, Reference<AsyncVar<bool>> failed, double reactionTime, double reactionSlope, int taskID){
loop { loop {
try { try {

@ -28,6 +28,9 @@ Future<Void> waitFailureServer(const FutureStream<ReplyPromise<Void>>& waitFailu
Future<Void> waitFailureClient(const RequestStream<ReplyPromise<Void>>& waitFailure, Future<Void> waitFailureClient(const RequestStream<ReplyPromise<Void>>& waitFailure,
double const& failureReactionTime=0, double const& failureReactionSlope=0, int const& taskID=TaskDefaultEndpoint); double const& failureReactionTime=0, double const& failureReactionSlope=0, int const& taskID=TaskDefaultEndpoint);
// talks to a wait failure server, returns Void on failure, reaction time is always waited
Future<Void> waitFailureClientStrict(const RequestStream<ReplyPromise<Void>>& waitFailure, double const& failureReactionTime=0, int const& taskID=TaskDefaultEndpoint);
// talks to a wait failure server, updates failed to be true or false based on failure status. // talks to a wait failure server, updates failed to be true or false based on failure status.
Future<Void> waitFailureTracker(const RequestStream<ReplyPromise<Void>>& waitFailure, Reference<AsyncVar<bool>> const& failed, Future<Void> waitFailureTracker(const RequestStream<ReplyPromise<Void>>& waitFailure, Reference<AsyncVar<bool>> const& failed,
double const& failureReactionTime=0, double const& failureReactionSlope=0, int const& taskID=TaskDefaultEndpoint); double const& failureReactionTime=0, double const& failureReactionSlope=0, int const& taskID=TaskDefaultEndpoint);