The consistency check should retry if it couldn't find all the commit proxies when getting key server locations

This commit is contained in:
A.J. Beamon 2023-03-17 12:00:47 -07:00
parent c492f83bf4
commit dc2bd78aa7
3 changed files with 13 additions and 14 deletions

View File

@ -166,6 +166,7 @@ ACTOR Future<bool> getKeyServers(
Promise<std::vector<std::pair<KeyRange, std::vector<StorageServerInterface>>>> keyServersPromise,
KeyRangeRef kr,
bool performQuiescentChecks,
bool failureIsError,
bool* success);
ACTOR Future<bool> getKeyLocations(Database cx,
std::vector<std::pair<KeyRange, std::vector<StorageServerInterface>>> shards,

View File

@ -97,6 +97,7 @@ ACTOR Future<bool> getKeyServers(
Promise<std::vector<std::pair<KeyRange, std::vector<StorageServerInterface>>>> keyServersPromise,
KeyRangeRef kr,
bool performQuiescentChecks,
bool failureIsError,
bool* success) {
state std::vector<std::pair<KeyRange, std::vector<StorageServerInterface>>> keyServers;
@ -134,7 +135,7 @@ ACTOR Future<bool> getKeyServers(
TraceEvent("ConsistencyCheck_CommitProxyUnavailable")
.error(shards.getError())
.detail("CommitProxyID", commitProxyInfo->getId(i));
testFailure("Commit proxy unavailable", performQuiescentChecks, success, true);
testFailure("Commit proxy unavailable", performQuiescentChecks, success, failureIsError);
return false;
}
@ -979,7 +980,8 @@ ACTOR Future<Void> runDataValidationCheck(ConsistencyScanData* self) {
// Get a list of key servers; verify that the TLogs and master all agree about who the key servers are
state Promise<std::vector<std::pair<KeyRange, std::vector<StorageServerInterface>>>> keyServerPromise;
state std::map<UID, StorageServerInterface> tssMapping;
bool keyServerResult = wait(getKeyServers(self->db, keyServerPromise, keyServersKeys, false, &self->success));
bool keyServerResult =
wait(getKeyServers(self->db, keyServerPromise, keyServersKeys, false, false, &self->success));
if (keyServerResult) {
state std::vector<std::pair<KeyRange, std::vector<StorageServerInterface>>> keyServers =
keyServerPromise.getFuture().get();

View File

@ -345,8 +345,12 @@ struct ConsistencyCheckWorkload : TestWorkload {
// Get a list of key servers; verify that the TLogs and master all agree about who the key servers are
state Promise<std::vector<std::pair<KeyRange, std::vector<StorageServerInterface>>>> keyServerPromise;
bool keyServerResult = wait(
getKeyServers(cx, keyServerPromise, keyServersKeys, self->performQuiescentChecks, &self->success));
bool keyServerResult = wait(getKeyServers(cx,
keyServerPromise,
keyServersKeys,
self->performQuiescentChecks,
self->failureIsError,
&self->success));
if (keyServerResult) {
state std::vector<std::pair<KeyRange, std::vector<StorageServerInterface>>> keyServers =
keyServerPromise.getFuture().get();
@ -797,8 +801,8 @@ struct ConsistencyCheckWorkload : TestWorkload {
bool removePrefix) {
// get shards paired with corresponding storage servers
state Promise<std::vector<std::pair<KeyRange, std::vector<StorageServerInterface>>>> keyServerPromise;
bool keyServerResult =
wait(getKeyServers(cx, keyServerPromise, range, self->performQuiescentChecks, &self->success));
bool keyServerResult = wait(getKeyServers(
cx, keyServerPromise, range, self->performQuiescentChecks, self->failureIsError, &self->success));
if (!keyServerResult)
return false;
state std::vector<std::pair<KeyRange, std::vector<StorageServerInterface>>> shards =
@ -1160,14 +1164,6 @@ struct ConsistencyCheckWorkload : TestWorkload {
}
if (foundExtraDataStore) {
// Let the cluster fully recover after rebooting/killing storage servers with extra stores.
//
// This requires an end-to-end committing transaction to ensure recovery has started in case
// any stateless processes, like the commit proxy, were killed.
wait(::success(doEmptyCommit(cx)));
while (self->dbInfo->get().recoveryState != RecoveryState::FULLY_RECOVERED) {
wait(self->dbInfo->onChange());
}
self->testFailure("Extra data stores present on workers");
return false;
}