add a large random delay on failure detection so that not all storage servers need to attempt to become the cluster controller

This commit is contained in:
Evan Tschannen 2020-05-10 17:09:33 -07:00
parent 2ecea80539
commit 07111f0e41
3 changed files with 6 additions and 4 deletions
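
To make the intent of the commit message concrete, here is a minimal standalone sketch (assumed C++17; not FoundationDB's actual flow/actor code, and the helper names are illustrative only) of the arithmetic this commit introduces: each storage process draws a uniformly random candidacy delay between the two new knobs, MIN_DELAY_STORAGE_CANDIDACY_SECONDS and MAX_DELAY_STORAGE_CANDIDACY_SECONDS (defaulting to 10.0 and 30.0 seconds here), so after a failure is detected the candidacy attempts are spread out instead of all firing after the same fixed delay.

// Standalone sketch of the randomized candidacy delay (illustrative, not FDB flow code).
#include <cstdio>
#include <random>

// Stand-in constants mirroring the two new knobs and the defaults set in this commit.
constexpr double MIN_DELAY_STORAGE_CANDIDACY_SECONDS = 10.0;
constexpr double MAX_DELAY_STORAGE_CANDIDACY_SECONDS = 30.0;

// Same formula as the patched timeout in monitorLeaderRemotelyWithDelayedCandidacy:
// min + random01() * (max - min), i.e. a uniform draw from [min, max).
double candidacyDelaySeconds(std::mt19937_64& rng) {
    std::uniform_real_distribution<double> random01(0.0, 1.0);
    return MIN_DELAY_STORAGE_CANDIDACY_SECONDS +
           random01(rng) * (MAX_DELAY_STORAGE_CANDIDACY_SECONDS - MIN_DELAY_STORAGE_CANDIDACY_SECONDS);
}

int main() {
    std::mt19937_64 rng(std::random_device{}());
    // Ten hypothetical storage processes: each draws its own delay, so their
    // attempts to become the cluster controller are staggered rather than simultaneous.
    for (int i = 0; i < 10; ++i)
        std::printf("storage process %d waits %.1f s before candidacy\n", i, candidacyDelaySeconds(rng));
    return 0;
}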

@@ -563,7 +563,8 @@ void ServerKnobs::initialize(bool randomize, ClientKnobs* clientKnobs, bool isSi
init( DEGRADED_WARNING_RESET_DELAY, 7*24*60*60 );
init( TRACE_LOG_FLUSH_FAILURE_CHECK_INTERVAL_SECONDS, 10 );
init( TRACE_LOG_PING_TIMEOUT_SECONDS, 5.0 );
- init( DELAY_STORAGE_CANDIDACY_SECONDS, 10 ); if ( randomize && BUGGIFY ) DELAY_STORAGE_CANDIDACY_SECONDS = 10;
+ init( MIN_DELAY_STORAGE_CANDIDACY_SECONDS, 10.0 );
+ init( MAX_DELAY_STORAGE_CANDIDACY_SECONDS, 30.0 );
init( DBINFO_FAILED_DELAY, 1.0 );
// Test harness

@@ -492,7 +492,8 @@ public:
double DEGRADED_WARNING_RESET_DELAY;
int64_t TRACE_LOG_FLUSH_FAILURE_CHECK_INTERVAL_SECONDS;
double TRACE_LOG_PING_TIMEOUT_SECONDS;
- int DELAY_STORAGE_CANDIDACY_SECONDS; // Listen for a leader for N seconds, and if not heard, then try to become the leader.
+ double MIN_DELAY_STORAGE_CANDIDACY_SECONDS; // Listen for a leader for N seconds, and if not heard, then try to become the leader.
+ double MAX_DELAY_STORAGE_CANDIDACY_SECONDS;
double DBINFO_FAILED_DELAY;
// Test harness

@@ -1654,7 +1654,7 @@ ACTOR Future<Void> monitorLeaderRemotelyWithDelayedCandidacy( Reference<ClusterC
if(currentCC->get().present() && dbInfo->get().clusterInterface == currentCC->get().get() && IFailureMonitor::failureMonitor().getState( currentCC->get().get().registerWorker.getEndpoint() ).isAvailable()) {
timeout = Future<Void>();
} else if(!timeout.isValid()) {
- timeout = delay( SERVER_KNOBS->DELAY_STORAGE_CANDIDACY_SECONDS );
+ timeout = delay( SERVER_KNOBS->MIN_DELAY_STORAGE_CANDIDACY_SECONDS + (deterministicRandom()->random01()*(SERVER_KNOBS->MAX_DELAY_STORAGE_CANDIDACY_SECONDS-SERVER_KNOBS->MIN_DELAY_STORAGE_CANDIDACY_SECONDS)) );
}
choose {
when( wait(currentCC->onChange()) ) {}
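
For readers unfamiliar with flow actors, the hunk above can be read as the following standalone sketch (assumed C++17; illustrative names, not FDB APIs): while the current cluster controller is present and available the timeout stays unarmed; once it looks unavailable, a one-shot deadline is armed with the jittered delay, and candidacy begins only if that deadline expires before the controller recovers.

#include <cstdio>
#include <optional>
#include <random>

struct DelayedCandidacy {
    double minDelay, maxDelay;        // MIN/MAX_DELAY_STORAGE_CANDIDACY_SECONDS
    std::optional<double> deadline;   // armed only while the cluster controller looks down
    std::mt19937_64 rng{std::random_device{}()};

    // Called on each monitoring tick; returns true once this process should
    // start trying to become the leader. The candidacy step itself is elided.
    bool shouldBecomeCandidate(double now, bool controllerAvailable) {
        if (controllerAvailable) {
            deadline.reset();         // healthy controller: drop any pending timeout
            return false;
        }
        if (!deadline) {              // first tick after failure detection: arm the jittered timer
            std::uniform_real_distribution<double> random01(0.0, 1.0);
            deadline = now + minDelay + random01(rng) * (maxDelay - minDelay);
        }
        return now >= *deadline;      // candidate only after the randomized delay expires
    }
};

int main() {
    DelayedCandidacy c{10.0, 30.0};   // the defaults this commit sets for the two knobs
    for (double t = 0.0; t <= 40.0; t += 5.0)   // simulate an outage starting at t = 0
        std::printf("t=%4.1fs candidate=%d\n", t, int(c.shouldBecomeCandidate(t, false)));
    return 0;
}
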
@@ -1713,7 +1713,7 @@ ACTOR Future<Void> fdbd(
actors.push_back(reportErrors(monitorAndWriteCCPriorityInfo(fitnessFilePath, asyncPriorityInfo), "MonitorAndWriteCCPriorityInfo"));
if (processClass == ProcessClass::TesterClass) {
actors.push_back( reportErrors( monitorLeader( connFile, cc ), "ClusterController" ) );
- } else if (processClass == ProcessClass::StorageClass && SERVER_KNOBS->DELAY_STORAGE_CANDIDACY_SECONDS) {
+ } else if (processClass == ProcessClass::StorageClass && SERVER_KNOBS->MAX_DELAY_STORAGE_CANDIDACY_SECONDS > 0) {
actors.push_back( reportErrors( monitorLeaderRemotelyWithDelayedCandidacy( connFile, cc, asyncPriorityInfo, recoveredDiskFiles.getFuture(), localities, dbInfo ), "ClusterController" ) );
} else {
actors.push_back( reportErrors( clusterController( connFile, cc , asyncPriorityInfo, recoveredDiskFiles.getFuture(), localities ), "ClusterController") );