mirror of
https://github.com/apple/foundationdb.git
synced 2025-05-15 02:18:39 +08:00
Added a wait failure client which always waits the full failure reaction time, even if it knows the interface is never coming back.
Use this new wait failure client in data distribution to give a storage server time to rejoin the cluster after its interface fails.
This commit is contained in:
parent
04e465925a
commit
a8e8be5aac
@ -2825,24 +2825,15 @@ ACTOR Future<Void> storageServerFailureTracker(
|
||||
if( status->isFailed )
|
||||
self->restartRecruiting.trigger();
|
||||
|
||||
state double startTime = now();
|
||||
Future<Void> healthChanged = Never();
|
||||
if(status->isFailed) {
|
||||
ASSERT(!inHealthyZone);
|
||||
healthChanged = IFailureMonitor::failureMonitor().onStateEqual( interf.waitFailure.getEndpoint(), FailureStatus(false));
|
||||
} else if(!inHealthyZone) {
|
||||
healthChanged = waitFailureClient(interf.waitFailure, SERVER_KNOBS->DATA_DISTRIBUTION_FAILURE_REACTION_TIME, 0, TaskDataDistribution);
|
||||
healthChanged = waitFailureClientStrict(interf.waitFailure, SERVER_KNOBS->DATA_DISTRIBUTION_FAILURE_REACTION_TIME, TaskDataDistribution);
|
||||
}
|
||||
choose {
|
||||
when ( wait(healthChanged) ) {
|
||||
double elapsed = now() - startTime;
|
||||
if(!status->isFailed && elapsed < SERVER_KNOBS->DATA_DISTRIBUTION_FAILURE_REACTION_TIME) {
|
||||
wait(delay(SERVER_KNOBS->DATA_DISTRIBUTION_FAILURE_REACTION_TIME - elapsed));
|
||||
if(!IFailureMonitor::failureMonitor().getState( interf.waitFailure.getEndpoint() ).isFailed()) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
status->isFailed = !status->isFailed;
|
||||
if(!status->isFailed && !server->teams.size()) {
|
||||
self->doBuildTeams = true;
|
||||
|
@ -56,6 +56,16 @@ ACTOR Future<Void> waitFailureClient(RequestStream<ReplyPromise<Void>> waitFailu
|
||||
}
|
||||
}
|
||||
|
||||
// Strict variant of waitFailureClient: completes with Void only after the endpoint has been
// observed as failed AND it is still reported failed once the reaction window below has ended.
//
// waitFailure         - request stream of the wait failure server being watched.
// failureReactionTime - maximum time to wait, after a failure is first detected, for the
//                       endpoint to come back before the failure is reported to the caller.
// taskID              - task priority passed to waitFailureClient and to delay().
ACTOR Future<Void> waitFailureClientStrict(RequestStream<ReplyPromise<Void>> waitFailure, double failureReactionTime, int taskID){
|
||||
loop {
|
||||
// Detect the failure using zero reaction time and zero reaction slope; the grace period
// is applied separately below. NOTE(review): presumably this completes as soon as the
// underlying client considers the endpoint failed - confirm against waitFailureClient.
wait(waitFailureClient(waitFailure, 0, 0, taskID));
|
||||
// Wait until either failureReactionTime has elapsed or the failure monitor reports the
// endpoint's state equal to FailureStatus(false) (presumably "not failed"), whichever
// happens first.
wait(delay(failureReactionTime, taskID) || IFailureMonitor::failureMonitor().onStateEqual( waitFailure.getEndpoint(), FailureStatus(false)));
|
||||
// Re-check the current state: report the failure only if the endpoint is still failed.
if(IFailureMonitor::failureMonitor().getState( waitFailure.getEndpoint() ).isFailed()) {
|
||||
return Void();
|
||||
}
|
||||
// Otherwise the endpoint is no longer reported failed: loop and start watching again.
}
|
||||
}
|
||||
|
||||
ACTOR Future<Void> waitFailureTracker(RequestStream<ReplyPromise<Void>> waitFailure, Reference<AsyncVar<bool>> failed, double reactionTime, double reactionSlope, int taskID){
|
||||
loop {
|
||||
try {
|
||||
|
@ -28,6 +28,9 @@ Future<Void> waitFailureServer(const FutureStream<ReplyPromise<Void>>& waitFailu
|
||||
Future<Void> waitFailureClient(const RequestStream<ReplyPromise<Void>>& waitFailure,
|
||||
double const& failureReactionTime=0, double const& failureReactionSlope=0, int const& taskID=TaskDefaultEndpoint);
|
||||
|
||||
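// Talks to a wait failure server and returns Void on failure. After a failure is detected, the failure reaction time is always waited (unless the endpoint recovers first) before the failure is reported.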
|
||||
Future<Void> waitFailureClientStrict(const RequestStream<ReplyPromise<Void>>& waitFailure, double const& failureReactionTime=0, int const& taskID=TaskDefaultEndpoint);
|
||||
|
||||
// talks to a wait failure server, updates failed to be true or false based on failure status.
|
||||
Future<Void> waitFailureTracker(const RequestStream<ReplyPromise<Void>>& waitFailure, Reference<AsyncVar<bool>> const& failed,
|
||||
double const& failureReactionTime=0, double const& failureReactionSlope=0, int const& taskID=TaskDefaultEndpoint);
|
||||
|
Loading…
x
Reference in New Issue
Block a user