Mirror of https://github.com/apple/foundationdb.git
Added a list of missing tlog IDs to status
commit 48901a9223 (parent: eba8151a09)
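In summary: while the master is locking the old transaction log servers, the log system now accumulates the IDs of TLogs that have not responded, emits them in the MasterRecoveryState trace event as MissingIDs, and recoveryStateStatusFetcher copies them into status as missing_logs so that fdbcli can print "Need one or more of the following log servers: ...". Along the way, the MasterRecoveryState event is tracked under a fixed key instead of one prefixed with the database name. A minimal standalone sketch of the ID-list accumulation follows; the function name, sample IDs, and main() harness are hypothetical, only the comma-separated accumulation mirrors the diff below:

// Illustrative only: build a comma-separated list of unresponsive log server IDs,
// in the same way the commit accumulates missingServerIds before tracing it.
#include <iostream>
#include <string>
#include <vector>

std::string describeMissingLogs(const std::vector<std::string>& tlogIds,
                                const std::vector<bool>& responded) {
    std::string missingServerIds;
    for (size_t i = 0; i < tlogIds.size(); i++) {
        if (!responded[i]) {
            // Comma-separate the IDs of servers that never replied to the lock request.
            if (missingServerIds.size()) {
                missingServerIds += ", ";
            }
            missingServerIds += tlogIds[i];
        }
    }
    return missingServerIds;
}

int main() {
    std::vector<std::string> tlogIds = { "a1b2c3", "d4e5f6", "0789ab" }; // hypothetical UIDs
    std::vector<bool> responded = { true, false, false };
    // Roughly the message fdbcli prints while recovery is stuck locking old TLogs.
    std::cout << "Need one or more of the following log servers: "
              << describeMissingLogs(tlogIds, responded) << std::endl;
    return 0;
}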
@@ -698,6 +698,8 @@ void printStatus(StatusObjectReader statusObj, StatusClient::StatusLevel level,
description += format("\nNeed at least %d log servers, %d proxies and %d resolvers.", recoveryState["required_logs"].get_int(), recoveryState["required_proxies"].get_int(), recoveryState["required_resolvers"].get_int());
if (statusObjCluster.has("machines") && statusObjCluster.has("processes"))
description += format("\nHave %d processes on %d machines.", statusObjCluster["processes"].get_obj().size(), statusObjCluster["machines"].get_obj().size());
} else if (name == "locking_old_transaction_servers" && recoveryState["missing_logs"].get_str().size()) {
description += format("\nNeed one or more of the following log servers: %s", recoveryState["missing_logs"].get_str().c_str());
}
description = lineWrap(description.c_str(), 80);
if (!printedCoordinators && (
@@ -904,11 +904,11 @@ static StatusObject clientStatusFetcher(ClientVersionMap clientVersionMap) {
return clientStatus;
}

ACTOR static Future<StatusObject> recoveryStateStatusFetcher(std::pair<WorkerInterface, ProcessClass> mWorker, std::string dbName, int workerCount, std::set<std::string> *incomplete_reasons) {
ACTOR static Future<StatusObject> recoveryStateStatusFetcher(std::pair<WorkerInterface, ProcessClass> mWorker, int workerCount, std::set<std::string> *incomplete_reasons) {
state StatusObject message;

try {
Standalone<StringRef> md = wait( timeoutError(mWorker.first.eventLogRequest.getReply( EventLogRequest(StringRef(dbName+"/MasterRecoveryState") ) ), 1.0) );
Standalone<StringRef> md = wait( timeoutError(mWorker.first.eventLogRequest.getReply( EventLogRequest( LiteralStringRef("MasterRecoveryState") ) ), 1.0) );
state int mStatusCode = parseInt( extractAttribute(md, LiteralStringRef("StatusCode")) );
if (mStatusCode < 0 || mStatusCode >= RecoveryStatus::END)
throw attribute_not_found();
@@ -926,6 +926,8 @@ ACTOR static Future<StatusObject> recoveryStateStatusFetcher(std::pair<WorkerInt
message["required_logs"] = requiredLogs;
message["required_proxies"] = requiredProxies;
message["required_resolvers"] = requiredResolvers;
} else if (mStatusCode == RecoveryStatus::locking_old_transaction_servers) {
message["missing_logs"] = extractAttribute(md, LiteralStringRef("MissingIDs")).c_str();
}
// TODO: time_in_recovery: 0.5
// time_in_state: 0.1
@@ -1744,7 +1746,7 @@ ACTOR Future<StatusReply> clusterGetStatus(
}

// construct status information for cluster subsections
state StatusObject recoveryStateStatus = wait(recoveryStateStatusFetcher(mWorker, dbName, workers.size(), &status_incomplete_reasons));
state StatusObject recoveryStateStatus = wait(recoveryStateStatusFetcher(mWorker, workers.size(), &status_incomplete_reasons));

// machine metrics
state WorkerEvents mMetrics = workerEventsVec[0].present() ? workerEventsVec[0].get().first : WorkerEvents();
@@ -28,6 +28,7 @@
#include "fdbrpc/simulator.h"
#include "fdbrpc/Replication.h"
#include "fdbrpc/ReplicationUtils.h"
#include "RecoveryState.h"

template <class Collection>
void uniquify( Collection& c ) {
@@ -648,6 +649,8 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted<TagPartitionedLogS
std::vector<TLogLockResult> results;
std::string sServerState;
LocalityGroup unResponsiveSet;
std::string missingServerIds;

double t = timer();
cycles ++;
@@ -660,6 +663,10 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted<TagPartitionedLogS
else {
unResponsiveSet.add(prevState.tLogLocalities[t]);
sServerState += 'f';
if(missingServerIds.size()) {
missingServerIds += ", ";
}
missingServerIds += logServers[t]->get().toString();
}
}
@@ -773,22 +780,12 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted<TagPartitionedLogS
.detail("LogZones", ::describeZones(prevState.tLogLocalities))
.detail("LogDataHalls", ::describeDataHalls(prevState.tLogLocalities));
}
}
// Too many failures
else {
TraceEvent("LogSystemWaitingForRecovery", dbgid).detail("Cycles", cycles)
.detail("AvailableServers", results.size())
.detail("TotalServers", logServers.size())
.detail("Present", results.size())
.detail("Available", availableItems.size())
.detail("Absent", logServers.size() - results.size())
.detail("ServerState", sServerState)
.detail("ReplicationFactor", prevState.tLogReplicationFactor)
.detail("AntiQuorum", prevState.tLogWriteAntiQuorum)
.detail("Policy", prevState.tLogPolicy->info())
.detail("TooManyFailures", bTooManyFailures)
.detail("LogZones", ::describeZones(prevState.tLogLocalities))
.detail("LogDataHalls", ::describeDataHalls(prevState.tLogLocalities));
} else {
TraceEvent("MasterRecoveryState", dbgid)
.detail("StatusCode", RecoveryStatus::locking_old_transaction_servers)
.detail("Status", RecoveryStatus::names[RecoveryStatus::locking_old_transaction_servers])
.detail("MissingIDs", missingServerIds)
.trackLatest("MasterRecoveryState");
}

// Wait for anything relevant to change
@@ -452,7 +452,7 @@ ACTOR Future<Void> recruitEverything( Reference<MasterData> self, vector<Storage
TraceEvent("MasterRecoveryState", self->dbgid)
.detail("StatusCode", status)
.detail("Status", RecoveryStatus::names[status])
.trackLatest(format("%s/MasterRecoveryState", printable(self->dbName).c_str() ).c_str());
.trackLatest("MasterRecoveryState");
return Never();
} else
TraceEvent("MasterRecoveryState", self->dbgid)
@@ -465,7 +465,7 @@ ACTOR Future<Void> recruitEverything( Reference<MasterData> self, vector<Storage
.detail("RequiredResolvers", 1)
.detail("DesiredResolvers", self->configuration.getDesiredResolvers())
.detail("storeType", self->configuration.storageServerStoreType)
.trackLatest(format("%s/MasterRecoveryState", printable(self->dbName).c_str() ).c_str());
.trackLatest("MasterRecoveryState");

RecruitFromConfigurationReply recruits = wait(
brokenPromiseToNever( self->clusterController.recruitFromConfiguration.getReply(
@@ -477,7 +477,7 @@ ACTOR Future<Void> recruitEverything( Reference<MasterData> self, vector<Storage
.detail("Proxies", recruits.proxies.size())
.detail("TLogs", recruits.tLogs.size())
.detail("Resolvers", recruits.resolvers.size())
.trackLatest(format("%s/MasterRecoveryState", printable(self->dbName).c_str() ).c_str());
.trackLatest("MasterRecoveryState");

// Actually, newSeedServers does both the recruiting and initialization of the seed servers; so if this is a brand new database we are sort of lying that we are
// past the recruitment phase. In a perfect world we would split that up so that the recruitment part happens above (in parallel with recruiting the transaction servers?).
@@ -637,7 +637,7 @@ ACTOR Future<Void> recoverFrom( Reference<MasterData> self, Reference<ILogSystem
TraceEvent("MasterRecoveryState", self->dbgid)
.detail("StatusCode", RecoveryStatus::reading_transaction_system_state)
.detail("Status", RecoveryStatus::names[RecoveryStatus::reading_transaction_system_state])
.trackLatest(format("%s/MasterRecoveryState", printable(self->dbName).c_str() ).c_str());
.trackLatest("MasterRecoveryState");
self->hasConfiguration = false;

if(BUGGIFY)
@@ -966,7 +966,7 @@ ACTOR Future<Void> masterCore( Reference<MasterData> self, PromiseStream<Future<
TraceEvent("MasterRecoveryState", self->dbgid)
.detail("StatusCode", RecoveryStatus::reading_coordinated_state)
.detail("Status", RecoveryStatus::names[RecoveryStatus::reading_coordinated_state])
.trackLatest(format("%s/MasterRecoveryState", printable(self->dbName).c_str() ).c_str());
.trackLatest("MasterRecoveryState");

Value prevDBStateRaw = wait( self->cstate1.read() );
addActor.send( masterTerminateOnConflict( self, self->cstate1.onConflict() ) );
@@ -981,7 +981,7 @@ ACTOR Future<Void> masterCore( Reference<MasterData> self, PromiseStream<Future<
.detail("TLogs", self->prevDBState.tLogs.size())
.detail("MyRecoveryCount", self->prevDBState.recoveryCount+2)
.detail("StateSize", prevDBStateRaw.size())
.trackLatest(format("%s/MasterRecoveryState", printable(self->dbName).c_str() ).c_str());
.trackLatest("MasterRecoveryState");

state Reference<AsyncVar<Reference<ILogSystem>>> oldLogSystems( new AsyncVar<Reference<ILogSystem>> );
state Future<Void> recoverAndEndEpoch = ILogSystem::recoverAndEndEpoch(oldLogSystems, self->dbgid, self->prevDBState, self->myInterface.tlogRejoin.getFuture(), self->myInterface.locality);
@@ -999,7 +999,8 @@ ACTOR Future<Void> masterCore( Reference<MasterData> self, PromiseStream<Future<
TraceEvent("MasterRecoveryState", self->dbgid)
.detail("StatusCode", RecoveryStatus::locking_old_transaction_servers)
.detail("Status", RecoveryStatus::names[RecoveryStatus::locking_old_transaction_servers])
.trackLatest(format("%s/MasterRecoveryState", printable(self->dbName).c_str() ).c_str());
.detail("MissingIDs", "")
.trackLatest("MasterRecoveryState");

loop {
Reference<ILogSystem> oldLogSystem = oldLogSystems->get();
@@ -1025,7 +1026,7 @@ ACTOR Future<Void> masterCore( Reference<MasterData> self, PromiseStream<Future<
TraceEvent("MasterRecoveryState", self->dbgid)
.detail("StatusCode", RecoveryStatus::recovery_transaction)
.detail("Status", RecoveryStatus::names[RecoveryStatus::recovery_transaction])
.trackLatest(format("%s/MasterRecoveryState", printable(self->dbName).c_str() ).c_str());
.trackLatest("MasterRecoveryState");

// Recovery transaction
state bool debugResult = debug_checkMinRestoredVersion( UID(), self->lastEpochEnd, "DBRecovery", SevWarn );
@@ -1101,7 +1102,7 @@ ACTOR Future<Void> masterCore( Reference<MasterData> self, PromiseStream<Future<
.detail("Status", RecoveryStatus::names[RecoveryStatus::writing_coordinated_state])
.detail("TLogs", self->logSystem->getLogServerCount())
.detail("TLogList", self->logSystem->describe())
.trackLatest(format("%s/MasterRecoveryState", printable(self->dbName).c_str() ).c_str());
.trackLatest("MasterRecoveryState");

// Multiple masters prevent conflicts between themselves via CoordinatedState (self->cstate)
// 1. If SetMaster succeeds, then by CS's contract, these "new" Tlogs are the immediate
@@ -1137,7 +1138,7 @@ ACTOR Future<Void> masterCore( Reference<MasterData> self, PromiseStream<Future<
.detail("StatusCode", RecoveryStatus::fully_recovered)
.detail("Status", RecoveryStatus::names[RecoveryStatus::fully_recovered])
.detail("storeType", self->configuration.storageServerStoreType)
.trackLatest(format("%s/MasterRecoveryState", printable(self->dbName).c_str() ).c_str());
.trackLatest("MasterRecoveryState");

// Now that recovery is complete, we register ourselves with the cluster controller, so that the client and server information
// it hands out can be updated
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
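For the consumer side, here is a rough sketch of how the new missing_logs field turns into the fdbcli message, using a plain std::map in place of FoundationDB's StatusObjectReader; the map contents, the placeholder description string, and the buffer size are assumptions for illustration only:

#include <cstdio>
#include <map>
#include <string>

int main() {
    // Hypothetical stand-in for the parsed recovery-state section of status.
    std::map<std::string, std::string> recoveryState = {
        { "name",         "locking_old_transaction_servers" },
        { "missing_logs", "a1b2c3, d4e5f6" }                   // sample IDs
    };

    std::string description = "Recovery in progress"; // placeholder, not the real fdbcli text
    if (recoveryState["name"] == "locking_old_transaction_servers" && recoveryState["missing_logs"].size()) {
        char line[256];
        // Mirrors the message format added to printStatus() in the first hunk above.
        snprintf(line, sizeof(line), "\nNeed one or more of the following log servers: %s",
                 recoveryState["missing_logs"].c_str());
        description += line;
    }
    printf("%s\n", description.c_str());
    return 0;
}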