mirror of
https://github.com/apple/foundationdb.git
synced 2025-05-14 18:02:31 +08:00
Revert "Refactor: ClusterController driving cluster-recovery state machine"
This reverts commit 1520390bc50614ae7583638c07c033739f40dbfb.
This commit is contained in:
parent
32ebdc6da2
commit
bb17e194d9
@ -8,10 +8,7 @@ set(FDBSERVER_SRCS
|
|||||||
BlobManager.actor.cpp
|
BlobManager.actor.cpp
|
||||||
BlobManagerInterface.h
|
BlobManagerInterface.h
|
||||||
BlobWorker.actor.cpp
|
BlobWorker.actor.cpp
|
||||||
ClusterController.actor.h
|
|
||||||
ClusterController.actor.cpp
|
ClusterController.actor.cpp
|
||||||
ClusterRecovery.actor.h
|
|
||||||
ClusterRecovery.actor.cpp
|
|
||||||
ConfigBroadcaster.actor.cpp
|
ConfigBroadcaster.actor.cpp
|
||||||
ConfigBroadcaster.h
|
ConfigBroadcaster.h
|
||||||
ConfigDatabaseUnitTests.actor.cpp
|
ConfigDatabaseUnitTests.actor.cpp
|
||||||
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -1,289 +0,0 @@
|
|||||||
/*
|
|
||||||
* ClusterRecovery.actor.h
|
|
||||||
*
|
|
||||||
* This source file is part of the FoundationDB open source project
|
|
||||||
*
|
|
||||||
* Copyright 2013-2020 Apple Inc. and the FoundationDB project authors
|
|
||||||
*
|
|
||||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
* you may not use this file except in compliance with the License.
|
|
||||||
* You may obtain a copy of the License at
|
|
||||||
*
|
|
||||||
* http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
*
|
|
||||||
* Unless required by applicable law or agreed to in writing, software
|
|
||||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
* See the License for the specific language governing permissions and
|
|
||||||
* limitations under the License.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#pragma once
|
|
||||||
|
|
||||||
// When actually compiled (NO_INTELLISENSE), include the generated version of this file. In intellisense use the source
|
|
||||||
// version.
|
|
||||||
#include <utility>
|
|
||||||
|
|
||||||
#if defined(NO_INTELLISENSE) && !defined(FDBSERVER_CLUSTERRECOVERY_ACTOR_G_H)
|
|
||||||
#define FDBSERVER_CLUSTERRECOVERY_ACTOR_G_H
|
|
||||||
#include "fdbserver/ClusterRecovery.actor.g.h"
|
|
||||||
#elif !defined(FDBSERVER_CLUSTERRECOVERY_ACTOR_H)
|
|
||||||
#define FDBSERVER_CLUSTERRECOVERY_ACTOR_H
|
|
||||||
|
|
||||||
#include "fdbclient/DatabaseContext.h"
|
|
||||||
#include "fdbrpc/Replication.h"
|
|
||||||
#include "fdbrpc/ReplicationUtils.h"
|
|
||||||
#include "fdbserver/CoordinatedState.h"
|
|
||||||
#include "fdbserver/CoordinationInterface.h" // copy constructors for ServerCoordinators class
|
|
||||||
#include "fdbserver/ClusterController.actor.h"
|
|
||||||
#include "fdbserver/DBCoreState.h"
|
|
||||||
#include "fdbserver/Knobs.h"
|
|
||||||
#include "fdbserver/LogSystem.h"
|
|
||||||
#include "fdbserver/LogSystemConfig.h"
|
|
||||||
#include "fdbserver/LogSystemDiskQueueAdapter.h"
|
|
||||||
#include "fdbserver/MoveKeys.actor.h"
|
|
||||||
#include "fdbserver/WorkerInterface.actor.h"
|
|
||||||
#include "flow/Error.h"
|
|
||||||
#include "flow/SystemMonitor.h"
|
|
||||||
|
|
||||||
#include "flow/actorcompiler.h" // This must be the last #include.
|
|
||||||
|
|
||||||
// Forward declaration only; the definition is not part of this header.
// ReusableCoordinatedState (below) calls this with its dbgid, its fullyRecovered promise, the
// onConflict() future of its current coordinated state, and the future of its switchedState promise,
// and hands the returned future to addActor.
// NOTE(review): presumably the actor fails when onConflict fires before switchedState does -- confirm
// against the definition.
ACTOR Future<Void> recoveryTerminateOnConflict(UID dbgid,
|
|
||||||
Promise<Void> fullyRecovered,
|
|
||||||
Future<Void> onConflict,
|
|
||||||
Future<Void> switchedState);
|
|
||||||
|
|
||||||
class ReusableCoordinatedState : NonCopyable {
|
|
||||||
public:
|
|
||||||
Promise<Void> fullyRecovered;
|
|
||||||
DBCoreState prevDBState;
|
|
||||||
DBCoreState myDBState;
|
|
||||||
bool finalWriteStarted;
|
|
||||||
Future<Void> previousWrite;
|
|
||||||
|
|
||||||
ReusableCoordinatedState(ServerCoordinators const& coordinators,
|
|
||||||
PromiseStream<Future<Void>> const& addActor,
|
|
||||||
UID const& dbgid)
|
|
||||||
: finalWriteStarted(false), previousWrite(Void()), cstate(coordinators), coordinators(coordinators),
|
|
||||||
addActor(addActor), dbgid(dbgid) {}
|
|
||||||
|
|
||||||
Future<Void> read() { return _read(this); }
|
|
||||||
|
|
||||||
Future<Void> write(DBCoreState newState, bool finalWrite = false) {
|
|
||||||
previousWrite = _write(this, newState, finalWrite);
|
|
||||||
return previousWrite;
|
|
||||||
}
|
|
||||||
|
|
||||||
Future<Void> move(ClusterConnectionString const& nc) { return cstate.move(nc); }
|
|
||||||
|
|
||||||
private:
|
|
||||||
MovableCoordinatedState cstate;
|
|
||||||
ServerCoordinators coordinators;
|
|
||||||
PromiseStream<Future<Void>> addActor;
|
|
||||||
Promise<Void> switchedState;
|
|
||||||
UID dbgid;
|
|
||||||
|
|
||||||
ACTOR Future<Void> _read(ReusableCoordinatedState* self) {
|
|
||||||
Value prevDBStateRaw = wait(self->cstate.read());
|
|
||||||
Future<Void> onConflict = recoveryTerminateOnConflict(
|
|
||||||
self->dbgid, self->fullyRecovered, self->cstate.onConflict(), self->switchedState.getFuture());
|
|
||||||
if (onConflict.isReady() && onConflict.isError()) {
|
|
||||||
throw onConflict.getError();
|
|
||||||
}
|
|
||||||
self->addActor.send(onConflict);
|
|
||||||
|
|
||||||
if (prevDBStateRaw.size()) {
|
|
||||||
self->prevDBState = BinaryReader::fromStringRef<DBCoreState>(prevDBStateRaw, IncludeVersion());
|
|
||||||
self->myDBState = self->prevDBState;
|
|
||||||
}
|
|
||||||
|
|
||||||
return Void();
|
|
||||||
}
|
|
||||||
|
|
||||||
ACTOR Future<Void> _write(ReusableCoordinatedState* self, DBCoreState newState, bool finalWrite) {
|
|
||||||
if (self->finalWriteStarted) {
|
|
||||||
wait(Future<Void>(Never()));
|
|
||||||
}
|
|
||||||
|
|
||||||
if (finalWrite) {
|
|
||||||
self->finalWriteStarted = true;
|
|
||||||
}
|
|
||||||
|
|
||||||
try {
|
|
||||||
wait(self->cstate.setExclusive(
|
|
||||||
BinaryWriter::toValue(newState, IncludeVersion(ProtocolVersion::withDBCoreState()))));
|
|
||||||
} catch (Error& e) {
|
|
||||||
TEST(true); // Master displaced during writeMasterState
|
|
||||||
throw;
|
|
||||||
}
|
|
||||||
|
|
||||||
self->myDBState = newState;
|
|
||||||
|
|
||||||
if (!finalWrite) {
|
|
||||||
self->switchedState.send(Void());
|
|
||||||
self->cstate = MovableCoordinatedState(self->coordinators);
|
|
||||||
Value rereadDBStateRaw = wait(self->cstate.read());
|
|
||||||
DBCoreState readState;
|
|
||||||
if (rereadDBStateRaw.size())
|
|
||||||
readState = BinaryReader::fromStringRef<DBCoreState>(rereadDBStateRaw, IncludeVersion());
|
|
||||||
|
|
||||||
if (readState != newState) {
|
|
||||||
TraceEvent("RecoveryTerminated", self->dbgid).detail("Reason", "CStateChanged");
|
|
||||||
TEST(true); // Coordinated state changed between writing and reading, recovery restarting
|
|
||||||
throw worker_removed();
|
|
||||||
}
|
|
||||||
self->switchedState = Promise<Void>();
|
|
||||||
self->addActor.send(recoveryTerminateOnConflict(
|
|
||||||
self->dbgid, self->fullyRecovered, self->cstate.onConflict(), self->switchedState.getFuture()));
|
|
||||||
} else {
|
|
||||||
self->fullyRecovered.send(Void());
|
|
||||||
}
|
|
||||||
|
|
||||||
return Void();
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
// State shared by the cluster-recovery actors declared later in this header
// (clusterRecoveryCore and cleanupRecoveryActorCollection both take a Reference<ClusterRecoveryData>).
// The object is reference counted and non-copyable. Its destructor closes txnStateStore when one is set.
struct ClusterRecoveryData : NonCopyable, ReferenceCounted<ClusterRecoveryData> {
    ClusterControllerData* controllerData; // not owned; dereferenced in the constructor, must be non-null

    UID dbgid; // initialised from masterInterface.id()

    AsyncTrigger registrationTrigger;
    Version lastEpochEnd, // The last version in the old epoch not (to be) rolled back in this recovery
        recoveryTransactionVersion; // The first version in this epoch
    double lastCommitTime;

    Version liveCommittedVersion; // The largest live committed version reported by commit proxies.
    bool databaseLocked;
    Optional<Value> proxyMetadataVersion;
    Version minKnownCommittedVersion;

    DatabaseConfiguration originalConfiguration;
    DatabaseConfiguration configuration;
    std::vector<Optional<Key>> primaryDcId;
    std::vector<Optional<Key>> remoteDcIds;
    bool hasConfiguration;

    ServerCoordinators coordinators;

    Reference<ILogSystem> logSystem;
    Version version; // The last version assigned to a proxy by getVersion()
    double lastVersionTime;
    LogSystemDiskQueueAdapter* txnStateLogAdapter; // starts as nullptr
    IKeyValueStore* txnStateStore; // starts as nullptr; closed by the destructor when set
    int64_t memoryLimit;
    std::map<Optional<Value>, int8_t> dcId_locality;
    std::vector<Tag> allTags;

    // Returns one more than the largest locality currently stored in dcId_locality
    // (0 when the map is empty, since the scan starts from -1).
    int8_t getNextLocality() {
        int8_t maxLocality = -1;
        // Iterate by const reference: each element holds an Optional<Value> that should not be copied.
        for (auto const& it : dcId_locality) {
            maxLocality = std::max(maxLocality, it.second);
        }
        return maxLocality + 1;
    }

    std::vector<CommitProxyInterface> commitProxies;
    std::vector<CommitProxyInterface> provisionalCommitProxies;
    std::vector<GrvProxyInterface> grvProxies;
    std::vector<GrvProxyInterface> provisionalGrvProxies;
    std::vector<ResolverInterface> resolvers;

    std::map<UID, CommitProxyVersionReplies> lastCommitProxyVersionReplies;

    UID clusterId;
    Standalone<StringRef> dbId;

    MasterInterface masterInterface;
    LifetimeToken masterLifetime;
    const ClusterControllerFullInterface
        clusterController; // If the cluster controller changes, this master will die, so this is immutable.

    ReusableCoordinatedState cstate;
    Promise<Void> recoveryReadyForCommits;
    Promise<Void> cstateUpdated;
    Reference<AsyncVar<ServerDBInfo> const> dbInfo;
    int64_t registrationCount; // Number of different MasterRegistrationRequests sent to clusterController

    RecoveryState recoveryState;

    AsyncVar<Standalone<VectorRef<ResolverMoveRef>>> resolverChanges;
    Version resolverChangesVersion; // NOTE(review): not set by the constructor -- confirm it is assigned before use
    std::set<UID> resolverNeedingChanges;

    PromiseStream<Future<Void>> addActor;
    Reference<AsyncVar<bool>> recruitmentStalled;
    bool forceRecovery; // cleared by the constructor when the cluster controller has no DC id
    bool neverCreated;
    int8_t safeLocality;
    int8_t primaryLocality;

    std::vector<WorkerInterface> backupWorkers; // Recruited backup workers from cluster controller.

    // The counters below register themselves with cc, so cc must be declared (and therefore
    // constructed) before them.
    CounterCollection cc;
    Counter changeCoordinatorsRequests;
    Counter getCommitVersionRequests;
    Counter backupWorkerDoneRequests;
    Counter getLiveCommittedVersionRequests;
    Counter reportLiveCommittedVersionRequests;

    Future<Void> logger; // traceCounters() future started in the constructor

    Reference<EventCacheHolder> clusterRecoveryStateEventHolder;
    Reference<EventCacheHolder> clusterRecoveryGenerationsEventHolder;
    Reference<EventCacheHolder> clusterRecoveryDurationEventHolder;
    Reference<EventCacheHolder> clusterRecoveryAvailableEventHolder;
    Reference<EventCacheHolder> recoveredConfigEventHolder;

    // Builds the recovery state for the master identified by masterInterface.
    // Version fields start as invalidVersion, localities as tagLocalityInvalid, and the periodic
    // counter logger is started. A forced recovery is refused (logged at SevError and the
    // forceRecovery member reset to false) when controllerData has no clusterControllerDcId.
    ClusterRecoveryData(ClusterControllerData* controllerData,
                        Reference<AsyncVar<ServerDBInfo> const> const& dbInfo,
                        MasterInterface const& masterInterface,
                        LifetimeToken const& masterLifetimeToken,
                        ServerCoordinators const& coordinators,
                        ClusterControllerFullInterface const& clusterController,
                        Standalone<StringRef> const& dbId,
                        PromiseStream<Future<Void>> const& addActor,
                        bool forceRecovery)

      : controllerData(controllerData), dbgid(masterInterface.id()), lastEpochEnd(invalidVersion),
        recoveryTransactionVersion(invalidVersion), lastCommitTime(0), liveCommittedVersion(invalidVersion),
        databaseLocked(false), minKnownCommittedVersion(invalidVersion), hasConfiguration(false),
        coordinators(coordinators), version(invalidVersion), lastVersionTime(0), txnStateLogAdapter(nullptr),
        txnStateStore(nullptr), memoryLimit(2e9), dbId(dbId), masterInterface(masterInterface),
        masterLifetime(masterLifetimeToken), clusterController(clusterController),
        cstate(coordinators, addActor, dbgid), dbInfo(dbInfo), registrationCount(0), addActor(addActor),
        recruitmentStalled(makeReference<AsyncVar<bool>>(false)), forceRecovery(forceRecovery),
        neverCreated(false), safeLocality(tagLocalityInvalid), primaryLocality(tagLocalityInvalid),
        cc("Master", dbgid.toString()), changeCoordinatorsRequests("ChangeCoordinatorsRequests", cc),
        getCommitVersionRequests("GetCommitVersionRequests", cc),
        backupWorkerDoneRequests("BackupWorkerDoneRequests", cc),
        getLiveCommittedVersionRequests("GetLiveCommittedVersionRequests", cc),
        reportLiveCommittedVersionRequests("ReportLiveCommittedVersionRequests", cc),
        clusterRecoveryStateEventHolder(makeReference<EventCacheHolder>("ClusterRecoveryState")),
        clusterRecoveryGenerationsEventHolder(makeReference<EventCacheHolder>("ClusterRecoveryGenerations")),
        clusterRecoveryDurationEventHolder(makeReference<EventCacheHolder>("ClusterRecoveryDuration")),
        clusterRecoveryAvailableEventHolder(makeReference<EventCacheHolder>("ClusterRecoveryAvailable")),
        recoveredConfigEventHolder(makeReference<EventCacheHolder>("RecoveredConfig")) {
        logger = traceCounters(
            "ClusterRecoveryMetrics", dbgid, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, &cc, "ClusterRecoveryMetrics");
        if (forceRecovery && !controllerData->clusterControllerDcId.present()) {
            TraceEvent(SevError, "ForcedRecoveryRequiresDcID").log();
            // Inside the constructor body the unqualified name is the parameter, which hides the
            // member. Assign through this-> so the member itself is cleared.
            this->forceRecovery = false;
        }
    }

    // Closes the transaction state store if one was attached.
    ~ClusterRecoveryData() {
        if (txnStateStore)
            txnStateStore->close();
    }
};
|
|
||||||
|
|
||||||
// The four declarations below have no definition in this header.

// NOTE(review): presumably recruits a master for the given cluster/db and stores its interface in
// *newMaster -- confirm against the definition.
ACTOR Future<Void> recruitNewMaster(ClusterControllerData* cluster,
|
|
||||||
ClusterControllerData::DBInfo* db,
|
|
||||||
MasterInterface* newMaster);
|
|
||||||
// NOTE(review): exThrown presumably tells the cleanup whether recovery ended with an exception -- confirm.
ACTOR Future<Void> cleanupRecoveryActorCollection(Reference<ClusterRecoveryData> self, bool exThrown);
|
|
||||||
// NOTE(review): judging by the name, this is the main recovery actor operating on the shared
// ClusterRecoveryData -- confirm.
ACTOR Future<Void> clusterRecoveryCore(Reference<ClusterRecoveryData> self);
|
|
||||||
// NOTE(review): predicate over an Error; which error codes count as "normal" is decided in the
// definition, not visible here.
bool isNormalClusterRecoveryError(const Error&);
|
|
||||||
|
|
||||||
#include "flow/unactorcompiler.h"
|
|
||||||
|
|
||||||
#endif
|
|
Loading…
x
Reference in New Issue
Block a user