Workers should monitor coordinators in submitCandidacy(). (#6655)

* Workers should monitor coordinators in submitCandidacy().

* Change re-resolve delay to a knob.
This commit is contained in:
Renxuan Wang 2022-03-24 19:20:42 -07:00 committed by GitHub
parent 89addac989
commit 2a59c5fd4e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 102 additions and 54 deletions

View File

@ -50,6 +50,7 @@ void ClientKnobs::initialize(Randomize randomize) {
init( MAX_GENERATIONS_OVERRIDE, 0 );
init( MAX_GENERATIONS_SIM, 50 ); //Disable network connections after this many generations in simulation, should be less than RECOVERY_DELAY_START_GENERATION
init( COORDINATOR_HOSTNAME_RESOLVE_DELAY, 0.05 );
init( COORDINATOR_RECONNECTION_DELAY, 1.0 );
init( CLIENT_EXAMPLE_AMOUNT, 20 );
init( MAX_CLIENT_STATUS_AGE, 1.0 );

View File

@ -49,6 +49,7 @@ public:
double MAX_GENERATIONS_OVERRIDE;
double MAX_GENERATIONS_SIM;
double COORDINATOR_HOSTNAME_RESOLVE_DELAY;
double COORDINATOR_RECONNECTION_DELAY;
int CLIENT_EXAMPLE_AMOUNT;
double MAX_CLIENT_STATUS_AGE;

View File

@ -169,7 +169,7 @@ void ClusterConnectionString::resolveHostnamesBlocking() {
}
void ClusterConnectionString::resetToUnresolved() {
if (hostnames.size() > 0) {
if (status == RESOLVED && hostnames.size() > 0) {
coords.clear();
hostnames.clear();
networkAddressToHostname.clear();
@ -558,8 +558,8 @@ ACTOR Future<Void> monitorNominee(Key key,
.detail("Hostname", hostname.present() ? hostname.get().toString() : "UnknownHostname")
.detail("OldAddr", coord.getLeader.getEndpoint().getPrimaryAddress().toString());
if (rep.getError().code() == error_code_request_maybe_delivered) {
// 50 milliseconds delay to prevent tight resolving loop due to outdated DNS cache
wait(delay(0.05));
// Delay to prevent tight resolving loop due to outdated DNS cache
wait(delay(CLIENT_KNOBS->COORDINATOR_HOSTNAME_RESOLVE_DELAY));
throw coordinators_changed();
} else {
throw rep.getError();
@ -589,7 +589,6 @@ ACTOR Future<Void> monitorNominee(Key key,
if (li.present() && li.get().forward)
wait(Future<Void>(Never()));
wait(Future<Void>(Void()));
}
}
}

View File

@ -2475,11 +2475,12 @@ ACTOR Future<Void> workerHealthMonitor(ClusterControllerData* self) {
}
}
ACTOR Future<Void> clusterControllerCore(ClusterControllerFullInterface interf,
ACTOR Future<Void> clusterControllerCore(Reference<IClusterConnectionRecord> connRecord,
ClusterControllerFullInterface interf,
Future<Void> leaderFail,
ServerCoordinators coordinators,
LocalityData locality,
ConfigDBType configDBType) {
state ServerCoordinators coordinators(connRecord);
state ClusterControllerData self(interf, locality, coordinators);
state ConfigBroadcaster configBroadcaster(coordinators, configDBType);
state Future<Void> coordinationPingDelay = delay(SERVER_KNOBS->WORKER_COORDINATION_PING_DELAY);
@ -2612,7 +2613,7 @@ ACTOR Future<Void> replaceInterface(ClusterControllerFullInterface interf) {
}
}
ACTOR Future<Void> clusterController(ServerCoordinators coordinators,
ACTOR Future<Void> clusterController(Reference<IClusterConnectionRecord> connRecord,
Reference<AsyncVar<Optional<ClusterControllerFullInterface>>> currentCC,
bool hasConnected,
Reference<AsyncVar<ClusterControllerPriorityInfo>> asyncPriorityInfo,
@ -2623,9 +2624,10 @@ ACTOR Future<Void> clusterController(ServerCoordinators coordinators,
state bool inRole = false;
cci.initEndpoints();
try {
wait(connRecord->resolveHostnames());
// Register as a possible leader; wait to be elected
state Future<Void> leaderFail =
tryBecomeLeader(coordinators, cci, currentCC, hasConnected, asyncPriorityInfo);
tryBecomeLeader(connRecord, cci, currentCC, hasConnected, asyncPriorityInfo);
state Future<Void> shouldReplace = replaceInterface(cci);
while (!currentCC->get().present() || currentCC->get().get() != cci) {
@ -2644,7 +2646,7 @@ ACTOR Future<Void> clusterController(ServerCoordinators coordinators,
startRole(Role::CLUSTER_CONTROLLER, cci.id(), UID());
inRole = true;
wait(clusterControllerCore(cci, leaderFail, coordinators, locality, configDBType));
wait(clusterControllerCore(connRecord, cci, leaderFail, locality, configDBType));
}
} catch (Error& e) {
if (inRole)
@ -2673,15 +2675,12 @@ ACTOR Future<Void> clusterController(Reference<IClusterConnectionRecord> connRec
state bool hasConnected = false;
loop {
try {
wait(connRecord->resolveHostnames());
ServerCoordinators coordinators(connRecord);
wait(clusterController(coordinators, currentCC, hasConnected, asyncPriorityInfo, locality, configDBType));
wait(clusterController(connRecord, currentCC, hasConnected, asyncPriorityInfo, locality, configDBType));
hasConnected = true;
} catch (Error& e) {
if (e.code() != error_code_coordinators_changed)
throw; // Expected to terminate fdbserver
}
hasConnected = true;
}
}

View File

@ -25,28 +25,56 @@
#include "fdbclient/MonitorLeader.h"
#include "flow/actorcompiler.h" // This must be the last #include.
// Keep trying to become a leader by submitting itself to all coordinators.
// Monitor the health of all coordinators at the same time.
// Note: for coordinators whose NetworkAddress is parsed out of a hostname, a connection failure will cause this actor
// to throw a `coordinators_changed()` error.
ACTOR Future<Void> submitCandidacy(Key key,
LeaderElectionRegInterface coord,
LeaderInfo myInfo,
UID prevChangeID,
Reference<AsyncVar<std::vector<Optional<LeaderInfo>>>> nominees,
int index) {
AsyncTrigger* nomineeChange,
Optional<LeaderInfo>* nominee,
Optional<Hostname> hostname = Optional<Hostname>()) {
loop {
auto const& nom = nominees->get()[index];
Optional<LeaderInfo> li = wait(
retryBrokenPromise(coord.candidacy,
CandidacyRequest(key, myInfo, nom.present() ? nom.get().changeID : UID(), prevChangeID),
TaskPriority::CoordinationReply));
state Optional<LeaderInfo> li;
if (li != nominees->get()[index]) {
std::vector<Optional<LeaderInfo>> v = nominees->get();
v[index] = li;
nominees->set(v);
if (coord.candidacy.getEndpoint().getPrimaryAddress().fromHostname) {
state ErrorOr<Optional<LeaderInfo>> rep = wait(coord.candidacy.tryGetReply(
CandidacyRequest(key, myInfo, nominee->present() ? nominee->get().changeID : UID(), prevChangeID),
TaskPriority::CoordinationReply));
if (rep.isError()) {
// Connecting to nominee failed, most likely due to a connection failure.
TraceEvent("SubmitCandadicyError")
.error(rep.getError())
.detail("Hostname", hostname.present() ? hostname.get().toString() : "UnknownHostname")
.detail("OldAddr", coord.candidacy.getEndpoint().getPrimaryAddress().toString());
if (rep.getError().code() == error_code_request_maybe_delivered) {
// Delay to prevent tight resolving loop due to outdated DNS cache
wait(delay(CLIENT_KNOBS->COORDINATOR_HOSTNAME_RESOLVE_DELAY));
throw coordinators_changed();
} else {
throw rep.getError();
}
} else if (rep.present()) {
li = rep.get();
}
} else {
Optional<LeaderInfo> tmp = wait(retryBrokenPromise(
coord.candidacy,
CandidacyRequest(key, myInfo, nominee->present() ? nominee->get().changeID : UID(), prevChangeID),
TaskPriority::CoordinationReply));
li = tmp;
}
wait(Future<Void>(Void())); // Make sure we weren't cancelled
if (li != *nominee) {
*nominee = li;
nomineeChange->trigger();
if (li.present() && li.get().forward)
wait(Future<Void>(Never()));
wait(Future<Void>(Void())); // Make sure we weren't cancelled
}
}
}
@ -84,13 +112,14 @@ ACTOR Future<Void> changeLeaderCoordinators(ServerCoordinators coordinators, Val
return Void();
}
ACTOR Future<Void> tryBecomeLeaderInternal(ServerCoordinators coordinators,
ACTOR Future<Void> tryBecomeLeaderInternal(Reference<IClusterConnectionRecord> connRecord,
Value proposedSerializedInterface,
Reference<AsyncVar<Value>> outSerializedLeader,
bool hasConnected,
Reference<AsyncVar<ClusterControllerPriorityInfo>> asyncPriorityInfo) {
state Reference<AsyncVar<std::vector<Optional<LeaderInfo>>>> nominees(
new AsyncVar<std::vector<Optional<LeaderInfo>>>());
state ServerCoordinators coordinators(connRecord);
state AsyncTrigger nomineeChange;
state std::vector<Optional<LeaderInfo>> nominees;
state LeaderInfo myInfo;
state Future<Void> candidacies;
state bool iAmLeader = false;
@ -105,8 +134,6 @@ ACTOR Future<Void> tryBecomeLeaderInternal(ServerCoordinators coordinators,
wait(delay(SERVER_KNOBS->WAIT_FOR_GOOD_RECRUITMENT_DELAY));
}
nominees->set(std::vector<Optional<LeaderInfo>>(coordinators.clientLeaderServers.size()));
myInfo.serializedInfo = proposedSerializedInterface;
outSerializedLeader->set(Value());
@ -114,6 +141,9 @@ ACTOR Future<Void> tryBecomeLeaderInternal(ServerCoordinators coordinators,
(SERVER_KNOBS->BUGGIFY_ALL_COORDINATION || BUGGIFY) ? buggifyDelayedAsyncVar(outSerializedLeader) : Void();
while (!iAmLeader) {
wait(connRecord->resolveHostnames());
coordinators = ServerCoordinators(connRecord);
nominees.resize(coordinators.leaderElectionServers.size());
state Future<Void> badCandidateTimeout;
myInfo.changeID = deterministicRandom()->randomUniqueID();
@ -122,13 +152,25 @@ ACTOR Future<Void> tryBecomeLeaderInternal(ServerCoordinators coordinators,
std::vector<Future<Void>> cand;
cand.reserve(coordinators.leaderElectionServers.size());
for (int i = 0; i < coordinators.leaderElectionServers.size(); i++)
cand.push_back(submitCandidacy(
coordinators.clusterKey, coordinators.leaderElectionServers[i], myInfo, prevChangeID, nominees, i));
for (int i = 0; i < coordinators.leaderElectionServers.size(); i++) {
Optional<Hostname> hostname;
auto r = connRecord->getConnectionString().networkAddressToHostname.find(
coordinators.leaderElectionServers[i].candidacy.getEndpoint().getPrimaryAddress());
if (r != connRecord->getConnectionString().networkAddressToHostname.end()) {
hostname = r->second;
}
cand.push_back(submitCandidacy(coordinators.clusterKey,
coordinators.leaderElectionServers[i],
myInfo,
prevChangeID,
&nomineeChange,
&nominees[i],
hostname));
}
candidacies = waitForAll(cand);
loop {
state Optional<std::pair<LeaderInfo, bool>> leader = getLeader(nominees->get());
state Optional<std::pair<LeaderInfo, bool>> leader = getLeader(nominees);
if (leader.present() && leader.get().first.forward) {
// These coordinators are forwarded to another set. But before we change our own cluster file, we need
// to make sure that a majority of coordinators know that. SOMEDAY: Wait briefly to see if other
@ -172,22 +214,30 @@ ACTOR Future<Void> tryBecomeLeaderInternal(ServerCoordinators coordinators,
// If more than 2*SERVER_KNOBS->POLLING_FREQUENCY elapses while we are nominated by some coordinator but
// there is no leader, we might be breaking the leader election process for someone with better
// communications but lower ID, so change IDs.
if ((!leader.present() || !leader.get().second) &&
std::count(nominees->get().begin(), nominees->get().end(), myInfo)) {
if ((!leader.present() || !leader.get().second) && std::count(nominees.begin(), nominees.end(), myInfo)) {
if (!badCandidateTimeout.isValid())
badCandidateTimeout = delay(SERVER_KNOBS->POLLING_FREQUENCY * 2, TaskPriority::CoordinationReply);
} else
badCandidateTimeout = Future<Void>();
choose {
when(wait(nominees->onChange())) {}
when(wait(badCandidateTimeout.isValid() ? badCandidateTimeout : Never())) {
TEST(true); // Bad candidate timeout
TraceEvent("LeaderBadCandidateTimeout", myInfo.changeID).log();
break;
try {
choose {
when(wait(nomineeChange.onTrigger())) {}
when(wait(badCandidateTimeout.isValid() ? badCandidateTimeout : Never())) {
TEST(true); // Bad candidate timeout
TraceEvent("LeaderBadCandidateTimeout", myInfo.changeID).log();
break;
}
when(wait(candidacies)) { ASSERT(false); }
when(wait(asyncPriorityInfo->onChange())) { break; }
}
} catch (Error& e) {
if (e.code() == error_code_coordinators_changed) {
connRecord->getConnectionString().resetToUnresolved();
break;
} else {
throw e;
}
when(wait(candidacies)) { ASSERT(false); }
when(wait(asyncPriorityInfo->onChange())) { break; }
}
}

View File

@ -37,7 +37,7 @@ class ServerCoordinators;
// eventually be set. If the return value is cancelled, the candidacy or leadership of the proposedInterface
// will eventually end.
template <class LeaderInterface>
Future<Void> tryBecomeLeader(ServerCoordinators const& coordinators,
Future<Void> tryBecomeLeader(Reference<IClusterConnectionRecord> const& connRecord,
LeaderInterface const& proposedInterface,
Reference<AsyncVar<Optional<LeaderInterface>>> const& outKnownLeader,
bool hasConnected,
@ -50,20 +50,20 @@ Future<Void> changeLeaderCoordinators(ServerCoordinators const& coordinators, Va
#pragma region Implementation
#endif // __INTEL_COMPILER
Future<Void> tryBecomeLeaderInternal(ServerCoordinators const& coordinators,
Future<Void> tryBecomeLeaderInternal(Reference<IClusterConnectionRecord> const& connRecord,
Value const& proposedSerializedInterface,
Reference<AsyncVar<Value>> const& outSerializedLeader,
bool const& hasConnected,
Reference<AsyncVar<ClusterControllerPriorityInfo>> const& asyncPriorityInfo);
template <class LeaderInterface>
Future<Void> tryBecomeLeader(ServerCoordinators const& coordinators,
Future<Void> tryBecomeLeader(Reference<IClusterConnectionRecord> const& connRecord,
LeaderInterface const& proposedInterface,
Reference<AsyncVar<Optional<LeaderInterface>>> const& outKnownLeader,
bool hasConnected,
Reference<AsyncVar<ClusterControllerPriorityInfo>> const& asyncPriorityInfo) {
auto serializedInfo = makeReference<AsyncVar<Value>>();
Future<Void> m = tryBecomeLeaderInternal(coordinators,
Future<Void> m = tryBecomeLeaderInternal(connRecord,
ObjectWriter::toValue(proposedInterface, IncludeVersion()),
serializedInfo,
hasConnected,

View File

@ -2726,8 +2726,6 @@ ACTOR Future<Void> fdbd(Reference<IClusterConnectionRecord> connRecord,
actors.push_back(serveProcess());
try {
wait(connRecord->resolveHostnames());
ServerCoordinators coordinators(connRecord);
if (g_network->isSimulated()) {
whitelistBinPaths = ",, random_path, /bin/snap_create.sh,,";
}
@ -2745,8 +2743,8 @@ ACTOR Future<Void> fdbd(Reference<IClusterConnectionRecord> connRecord,
if (coordFolder.size()) {
// SOMEDAY: remove the fileNotFound wrapper and make DiskQueue construction safe from errors setting up
// their files
actors.push_back(fileNotFoundToNever(
coordinationServer(coordFolder, coordinators.ccr, configNode, configBroadcastInterface)));
actors.push_back(
fileNotFoundToNever(coordinationServer(coordFolder, connRecord, configNode, configBroadcastInterface)));
}
state UID processIDUid = wait(createAndLockProcessIdFile(dataFolder));