diff --git a/fdbclient/ManagementAPI.actor.cpp b/fdbclient/ManagementAPI.actor.cpp index 067730ba5e..2a5c9ac910 100644 --- a/fdbclient/ManagementAPI.actor.cpp +++ b/fdbclient/ManagementAPI.actor.cpp @@ -804,6 +804,8 @@ ACTOR Future> getConnectionString(Database cx) } } +static std::vector connectionStrings; + namespace { ACTOR Future> getClusterConnectionStringFromStorageServer(Transaction* tr) { @@ -821,6 +823,19 @@ ACTOR Future> getClusterConnectionStringFromSt Version readVersion = wait(tr->getReadVersion()); state Optional currentKey = wait(tr->get(coordinatorsKey)); + if (g_network->isSimulated() && currentKey.present()) { + // If the change coordinators request succeeded, the coordinators + // should have changed to the connection string of the most + // recently issued request. If instead the connection string is + // equal to one of the previously issued requests, there is a bug + // and we are breaking the promises we make with + // commit_unknown_result (the transaction must no longer be in + // progress when receiving this error). + int n = connectionStrings.size() > 0 ? connectionStrings.size() - 1 : 0; // avoid underflow + for (int i = 0; i < n; ++i) { + ASSERT(currentKey.get() != connectionStrings.at(i)); + } + } if (!currentKey.present()) { // Someone deleted this key entirely? @@ -879,10 +894,12 @@ ACTOR Future> changeQuorumChecker(Transaction* tr, std::sort(old.hostnames.begin(), old.hostnames.end()); std::sort(old.coords.begin(), old.coords.end()); if (conn->hostnames == old.hostnames && conn->coords == old.coords && old.clusterKeyName() == newName) { + connectionStrings.clear(); return CoordinatorsResult::SAME_NETWORK_ADDRESSES; } conn->parseKey(newName + ':' + deterministicRandom()->randomAlphaNumeric(32)); + connectionStrings.push_back(conn->toString()); if (g_network->isSimulated()) { int i = 0; diff --git a/fdbserver/ClusterRecovery.actor.cpp b/fdbserver/ClusterRecovery.actor.cpp index 9df53348f9..604656fccd 100644 --- a/fdbserver/ClusterRecovery.actor.cpp +++ b/fdbserver/ClusterRecovery.actor.cpp @@ -521,6 +521,19 @@ ACTOR Future changeCoordinators(Reference self) { TraceEvent("ChangeCoordinators", self->dbgid).log(); ++self->changeCoordinatorsRequests; state ChangeCoordinatorsRequest changeCoordinatorsRequest = req; + if (self->masterInterface.id() != changeCoordinatorsRequest.masterId) { + // Make sure the request is coming from a proxy from the same + // generation. If not, throw coordinators_changed - this is OK + // because the client will still receive commit_unknown_result, and + // will retry the request. This check is necessary because + // otherwise in rare circumstances where a recovery occurs between + // the change coordinators request from the client and the cstate + // actually being moved, the client may think the change + // coordinators command failed when it is still in progress. So we + // preempt the issue here and force failure if the generations + // don't match. + throw coordinators_changed(); + } // Kill cluster controller to facilitate coordinator registration update if (self->controllerData->shouldCommitSuicide) { diff --git a/fdbserver/CommitProxyServer.actor.cpp b/fdbserver/CommitProxyServer.actor.cpp index 237f888257..9a80c76db1 100644 --- a/fdbserver/CommitProxyServer.actor.cpp +++ b/fdbserver/CommitProxyServer.actor.cpp @@ -1138,7 +1138,8 @@ ACTOR Future applyMetadataToCommittedTransactions(CommitBatchContext* self if (!self->isMyFirstBatch && pProxyCommitData->txnStateStore->readValue(coordinatorsKey).get().get() != self->oldCoordinators.get()) { wait(brokenPromiseToNever(pProxyCommitData->db->get().clusterInterface.changeCoordinators.getReply( - ChangeCoordinatorsRequest(pProxyCommitData->txnStateStore->readValue(coordinatorsKey).get().get())))); + ChangeCoordinatorsRequest(pProxyCommitData->txnStateStore->readValue(coordinatorsKey).get().get(), + self->pProxyCommitData->master.id())))); ASSERT(false); // ChangeCoordinatorsRequest should always throw } diff --git a/fdbserver/include/fdbserver/MasterInterface.h b/fdbserver/include/fdbserver/MasterInterface.h index 4cf4865a42..5c45ae7323 100644 --- a/fdbserver/include/fdbserver/MasterInterface.h +++ b/fdbserver/include/fdbserver/MasterInterface.h @@ -83,13 +83,15 @@ struct ChangeCoordinatorsRequest { constexpr static FileIdentifier file_identifier = 13605416; Standalone newConnectionString; ReplyPromise reply; // normally throws even on success! + UID masterId; ChangeCoordinatorsRequest() {} - ChangeCoordinatorsRequest(Standalone newConnectionString) : newConnectionString(newConnectionString) {} + ChangeCoordinatorsRequest(Standalone newConnectionString, UID masterId) + : newConnectionString(newConnectionString), masterId(masterId) {} template void serialize(Ar& ar) { - serializer(ar, newConnectionString, reply); + serializer(ar, newConnectionString, reply, masterId); } };