added a list of missing tlog IDs to status

Evan Tschannen 2017-10-24 16:28:50 -07:00
parent eba8151a09
commit 48901a9223
7 changed files with 34 additions and 32 deletions

View File

@@ -698,6 +698,8 @@ void printStatus(StatusObjectReader statusObj, StatusClient::StatusLevel level,
description += format("\nNeed at least %d log servers, %d proxies and %d resolvers.", recoveryState["required_logs"].get_int(), recoveryState["required_proxies"].get_int(), recoveryState["required_resolvers"].get_int());
if (statusObjCluster.has("machines") && statusObjCluster.has("processes"))
description += format("\nHave %d processes on %d machines.", statusObjCluster["processes"].get_obj().size(), statusObjCluster["machines"].get_obj().size());
+} else if (name == "locking_old_transaction_servers" && recoveryState["missing_logs"].get_str().size()) {
+description += format("\nNeed one or more of the following log servers: %s", recoveryState["missing_logs"].get_str().c_str());
}
description = lineWrap(description.c_str(), 80);
if (!printedCoordinators && (
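
For context on the CLI side: when the recovery_state section of the status JSON reports the name locking_old_transaction_servers and a non-empty missing_logs string, the branch above appends one more line to the description, which fdbcli then wraps at 80 columns via lineWrap. A minimal, self-contained sketch of that formatting step, using plain std::string in place of fdbcli's format() helper; the base description and the sample IDs are made up:

#include <iostream>
#include <string>

int main() {
    // Example inputs; in fdbcli these come from the recovery_state section of
    // the status JSON read through StatusObjectReader above.
    std::string name = "locking_old_transaction_servers";
    std::string missingLogs = "a1b2c3d4e5f60718, 1122334455667788";  // placeholder IDs

    std::string description = "Recovery in progress.";  // placeholder base text
    if (name == "locking_old_transaction_servers" && !missingLogs.empty())
        description += "\nNeed one or more of the following log servers: " + missingLogs;

    // fdbcli then passes the result through lineWrap(description.c_str(), 80).
    std::cout << description << std::endl;
    return 0;
}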

View File

@@ -904,11 +904,11 @@ static StatusObject clientStatusFetcher(ClientVersionMap clientVersionMap) {
return clientStatus;
}
-ACTOR static Future<StatusObject> recoveryStateStatusFetcher(std::pair<WorkerInterface, ProcessClass> mWorker, std::string dbName, int workerCount, std::set<std::string> *incomplete_reasons) {
+ACTOR static Future<StatusObject> recoveryStateStatusFetcher(std::pair<WorkerInterface, ProcessClass> mWorker, int workerCount, std::set<std::string> *incomplete_reasons) {
state StatusObject message;
try {
-Standalone<StringRef> md = wait( timeoutError(mWorker.first.eventLogRequest.getReply( EventLogRequest(StringRef(dbName+"/MasterRecoveryState") ) ), 1.0) );
+Standalone<StringRef> md = wait( timeoutError(mWorker.first.eventLogRequest.getReply( EventLogRequest( LiteralStringRef("MasterRecoveryState") ) ), 1.0) );
state int mStatusCode = parseInt( extractAttribute(md, LiteralStringRef("StatusCode")) );
if (mStatusCode < 0 || mStatusCode >= RecoveryStatus::END)
throw attribute_not_found();
@@ -926,6 +926,8 @@ ACTOR static Future<StatusObject> recoveryStateStatusFetcher(std::pair<WorkerInt
message["required_logs"] = requiredLogs;
message["required_proxies"] = requiredProxies;
message["required_resolvers"] = requiredResolvers;
+} else if (mStatusCode == RecoveryStatus::locking_old_transaction_servers) {
+message["missing_logs"] = extractAttribute(md, LiteralStringRef("MissingIDs")).c_str();
}
// TODO: time_in_recovery: 0.5
// time_in_state: 0.1
@@ -1744,7 +1746,7 @@ ACTOR Future<StatusReply> clusterGetStatus(
}
// construct status information for cluster subsections
-state StatusObject recoveryStateStatus = wait(recoveryStateStatusFetcher(mWorker, dbName, workers.size(), &status_incomplete_reasons));
+state StatusObject recoveryStateStatus = wait(recoveryStateStatusFetcher(mWorker, workers.size(), &status_incomplete_reasons));
// machine metrics
state WorkerEvents mMetrics = workerEventsVec[0].present() ? workerEventsVec[0].get().first : WorkerEvents();
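
Why the dbName parameter could be dropped is visible across this commit: the master now registers its recovery event under the fixed name MasterRecoveryState via trackLatest (see the masterserver.actor.cpp hunks below), so the fetcher can request that latest event with a compile-time literal instead of building a "<dbName>/MasterRecoveryState" key at runtime. A rough sketch of the publish/latest-lookup pattern involved; LatestEventRegistry is hypothetical, the real path goes through FoundationDB's trace log and the worker's eventLogRequest endpoint, and the numeric StatusCode is only an example:

#include <iostream>
#include <map>
#include <string>
#include <utility>

struct LatestEventRegistry {
    // event name -> attributes of the most recently emitted event with that name
    std::map<std::string, std::map<std::string, std::string>> latest;

    // Roughly what trackLatest("MasterRecoveryState") accomplishes: remember the
    // newest event under a stable key.
    void trackLatest(const std::string& name, std::map<std::string, std::string> attrs) {
        latest[name] = std::move(attrs);
    }

    // Roughly what EventLogRequest( LiteralStringRef("MasterRecoveryState") ) asks a worker for.
    const std::map<std::string, std::string>* eventLogRequest(const std::string& name) const {
        auto it = latest.find(name);
        return it == latest.end() ? nullptr : &it->second;
    }
};

int main() {
    LatestEventRegistry registry;

    // Master / log-system side: publish the recovery state, now including MissingIDs.
    registry.trackLatest("MasterRecoveryState",
                         {{"StatusCode", "2"},  // example value only
                          {"Status", "locking_old_transaction_servers"},
                          {"MissingIDs", "a1b2c3d4e5f60718, 1122334455667788"}});

    // Status side: fetch the latest event by its fixed name and read attributes,
    // as recoveryStateStatusFetcher does with extractAttribute().
    if (const auto* ev = registry.eventLogRequest("MasterRecoveryState"))
        std::cout << "missing_logs = " << ev->at("MissingIDs") << std::endl;
    return 0;
}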

View File

@@ -28,6 +28,7 @@
#include "fdbrpc/simulator.h"
#include "fdbrpc/Replication.h"
#include "fdbrpc/ReplicationUtils.h"
#include "RecoveryState.h"
template <class Collection>
void uniquify( Collection& c ) {
@@ -648,6 +649,8 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted<TagPartitionedLogS
std::vector<TLogLockResult> results;
std::string sServerState;
LocalityGroup unResponsiveSet;
+std::string missingServerIds;
double t = timer();
cycles ++;
@@ -660,6 +663,10 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted<TagPartitionedLogS
else {
unResponsiveSet.add(prevState.tLogLocalities[t]);
sServerState += 'f';
+if(missingServerIds.size()) {
+missingServerIds += ", ";
+}
+missingServerIds += logServers[t]->get().toString();
}
}
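
The missingServerIds string above is built with the standard join pattern: a separator is prepended before every entry except the first, producing one comma-separated list that later lands in the MissingIDs trace attribute. A standalone illustration with placeholder IDs; in the log system each entry comes from logServers[t]->get().toString() for a tlog that did not reply to the lock request:

#include <iostream>
#include <string>
#include <vector>

int main() {
    // Placeholder IDs standing in for unresponsive tlog identifiers.
    std::vector<std::string> unresponsive = {
        "a1b2c3d4e5f60718", "1122334455667788", "99aabbccddeeff00"
    };

    std::string missingServerIds;
    for (const std::string& id : unresponsive) {
        if (!missingServerIds.empty())
            missingServerIds += ", ";  // separator before every entry but the first
        missingServerIds += id;
    }

    // Prints: a1b2c3d4e5f60718, 1122334455667788, 99aabbccddeeff00
    std::cout << missingServerIds << std::endl;
    return 0;
}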
@@ -773,22 +780,12 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted<TagPartitionedLogS
.detail("LogZones", ::describeZones(prevState.tLogLocalities))
.detail("LogDataHalls", ::describeDataHalls(prevState.tLogLocalities));
}
-}
-// Too many failures
-else {
-TraceEvent("LogSystemWaitingForRecovery", dbgid).detail("Cycles", cycles)
-.detail("AvailableServers", results.size())
-.detail("TotalServers", logServers.size())
-.detail("Present", results.size())
-.detail("Available", availableItems.size())
-.detail("Absent", logServers.size() - results.size())
-.detail("ServerState", sServerState)
-.detail("ReplicationFactor", prevState.tLogReplicationFactor)
-.detail("AntiQuorum", prevState.tLogWriteAntiQuorum)
-.detail("Policy", prevState.tLogPolicy->info())
-.detail("TooManyFailures", bTooManyFailures)
-.detail("LogZones", ::describeZones(prevState.tLogLocalities))
-.detail("LogDataHalls", ::describeDataHalls(prevState.tLogLocalities));
+} else {
+TraceEvent("MasterRecoveryState", dbgid)
+.detail("StatusCode", RecoveryStatus::locking_old_transaction_servers)
+.detail("Status", RecoveryStatus::names[RecoveryStatus::locking_old_transaction_servers])
+.detail("MissingIDs", missingServerIds)
+.trackLatest("MasterRecoveryState");
}
// Wait for anything relevant to change
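
Both StatusCode and Status in the new trace event come from the RecoveryStatus enum and its parallel names[] table, declared in RecoveryState.h (newly included at the top of this file). A standalone sketch of that code-plus-name-table pattern; the enumerators and their values here are illustrative, not FoundationDB's exact list:

#include <iostream>

namespace MiniRecoveryStatus {
    // Illustrative subset; the real RecoveryStatus enum lives in RecoveryState.h.
    enum Status {
        reading_coordinated_state = 0,
        locking_old_transaction_servers,
        recovery_transaction,
        fully_recovered,
        END
    };
    // Parallel table: names[code] is the human-readable form of the numeric code.
    const char* names[] = {
        "reading_coordinated_state",
        "locking_old_transaction_servers",
        "recovery_transaction",
        "fully_recovered"
    };
}

int main() {
    int status = MiniRecoveryStatus::locking_old_transaction_servers;
    // Mirrors .detail("StatusCode", status) and .detail("Status", RecoveryStatus::names[status]).
    std::cout << "StatusCode=" << status
              << " Status=" << MiniRecoveryStatus::names[status] << std::endl;
    return 0;
}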

View File

@@ -452,7 +452,7 @@ ACTOR Future<Void> recruitEverything( Reference<MasterData> self, vector<Storage
TraceEvent("MasterRecoveryState", self->dbgid)
.detail("StatusCode", status)
.detail("Status", RecoveryStatus::names[status])
-.trackLatest(format("%s/MasterRecoveryState", printable(self->dbName).c_str() ).c_str());
+.trackLatest("MasterRecoveryState");
return Never();
} else
TraceEvent("MasterRecoveryState", self->dbgid)
@@ -465,7 +465,7 @@ ACTOR Future<Void> recruitEverything( Reference<MasterData> self, vector<Storage
.detail("RequiredResolvers", 1)
.detail("DesiredResolvers", self->configuration.getDesiredResolvers())
.detail("storeType", self->configuration.storageServerStoreType)
-.trackLatest(format("%s/MasterRecoveryState", printable(self->dbName).c_str() ).c_str());
+.trackLatest("MasterRecoveryState");
RecruitFromConfigurationReply recruits = wait(
brokenPromiseToNever( self->clusterController.recruitFromConfiguration.getReply(
@@ -477,7 +477,7 @@ ACTOR Future<Void> recruitEverything( Reference<MasterData> self, vector<Storage
.detail("Proxies", recruits.proxies.size())
.detail("TLogs", recruits.tLogs.size())
.detail("Resolvers", recruits.resolvers.size())
-.trackLatest(format("%s/MasterRecoveryState", printable(self->dbName).c_str() ).c_str());
+.trackLatest("MasterRecoveryState");
// Actually, newSeedServers does both the recruiting and initialization of the seed servers; so if this is a brand new database we are sort of lying that we are
// past the recruitment phase. In a perfect world we would split that up so that the recruitment part happens above (in parallel with recruiting the transaction servers?).
@@ -637,7 +637,7 @@ ACTOR Future<Void> recoverFrom( Reference<MasterData> self, Reference<ILogSystem
TraceEvent("MasterRecoveryState", self->dbgid)
.detail("StatusCode", RecoveryStatus::reading_transaction_system_state)
.detail("Status", RecoveryStatus::names[RecoveryStatus::reading_transaction_system_state])
-.trackLatest(format("%s/MasterRecoveryState", printable(self->dbName).c_str() ).c_str());
+.trackLatest("MasterRecoveryState");
self->hasConfiguration = false;
if(BUGGIFY)
@@ -966,7 +966,7 @@ ACTOR Future<Void> masterCore( Reference<MasterData> self, PromiseStream<Future<
TraceEvent("MasterRecoveryState", self->dbgid)
.detail("StatusCode", RecoveryStatus::reading_coordinated_state)
.detail("Status", RecoveryStatus::names[RecoveryStatus::reading_coordinated_state])
-.trackLatest(format("%s/MasterRecoveryState", printable(self->dbName).c_str() ).c_str());
+.trackLatest("MasterRecoveryState");
Value prevDBStateRaw = wait( self->cstate1.read() );
addActor.send( masterTerminateOnConflict( self, self->cstate1.onConflict() ) );
@@ -981,7 +981,7 @@ ACTOR Future<Void> masterCore( Reference<MasterData> self, PromiseStream<Future<
.detail("TLogs", self->prevDBState.tLogs.size())
.detail("MyRecoveryCount", self->prevDBState.recoveryCount+2)
.detail("StateSize", prevDBStateRaw.size())
-.trackLatest(format("%s/MasterRecoveryState", printable(self->dbName).c_str() ).c_str());
+.trackLatest("MasterRecoveryState");
state Reference<AsyncVar<Reference<ILogSystem>>> oldLogSystems( new AsyncVar<Reference<ILogSystem>> );
state Future<Void> recoverAndEndEpoch = ILogSystem::recoverAndEndEpoch(oldLogSystems, self->dbgid, self->prevDBState, self->myInterface.tlogRejoin.getFuture(), self->myInterface.locality);
@@ -999,7 +999,8 @@ ACTOR Future<Void> masterCore( Reference<MasterData> self, PromiseStream<Future<
TraceEvent("MasterRecoveryState", self->dbgid)
.detail("StatusCode", RecoveryStatus::locking_old_transaction_servers)
.detail("Status", RecoveryStatus::names[RecoveryStatus::locking_old_transaction_servers])
-.trackLatest(format("%s/MasterRecoveryState", printable(self->dbName).c_str() ).c_str());
+.detail("MissingIDs", "")
+.trackLatest("MasterRecoveryState");
loop {
Reference<ILogSystem> oldLogSystem = oldLogSystems->get();
@@ -1025,7 +1026,7 @@ ACTOR Future<Void> masterCore( Reference<MasterData> self, PromiseStream<Future<
TraceEvent("MasterRecoveryState", self->dbgid)
.detail("StatusCode", RecoveryStatus::recovery_transaction)
.detail("Status", RecoveryStatus::names[RecoveryStatus::recovery_transaction])
-.trackLatest(format("%s/MasterRecoveryState", printable(self->dbName).c_str() ).c_str());
+.trackLatest("MasterRecoveryState");
// Recovery transaction
state bool debugResult = debug_checkMinRestoredVersion( UID(), self->lastEpochEnd, "DBRecovery", SevWarn );
@@ -1101,7 +1102,7 @@ ACTOR Future<Void> masterCore( Reference<MasterData> self, PromiseStream<Future<
.detail("Status", RecoveryStatus::names[RecoveryStatus::writing_coordinated_state])
.detail("TLogs", self->logSystem->getLogServerCount())
.detail("TLogList", self->logSystem->describe())
-.trackLatest(format("%s/MasterRecoveryState", printable(self->dbName).c_str() ).c_str());
+.trackLatest("MasterRecoveryState");
// Multiple masters prevent conflicts between themselves via CoordinatedState (self->cstate)
// 1. If SetMaster succeeds, then by CS's contract, these "new" Tlogs are the immediate
@@ -1137,7 +1138,7 @@ ACTOR Future<Void> masterCore( Reference<MasterData> self, PromiseStream<Future<
.detail("StatusCode", RecoveryStatus::fully_recovered)
.detail("Status", RecoveryStatus::names[RecoveryStatus::fully_recovered])
.detail("storeType", self->configuration.storageServerStoreType)
-.trackLatest(format("%s/MasterRecoveryState", printable(self->dbName).c_str() ).c_str());
+.trackLatest("MasterRecoveryState");
// Now that recovery is complete, we register ourselves with the cluster controller, so that the client and server information
// it hands out can be updated
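
All of these call sites rely on TraceEvent being a fluent builder: each .detail() returns the event itself so attributes can be chained, .trackLatest() registers the finished event under a stable name, and the event is written out once the temporary is destroyed at the end of the statement. A minimal sketch of that pattern; MiniTraceEvent is hypothetical and ignores everything the real TraceEvent does around severity, throttling, and the trace file format:

#include <iostream>
#include <string>
#include <utility>
#include <vector>

class MiniTraceEvent {
    std::string type_, latestKey_;
    std::vector<std::pair<std::string, std::string>> details_;
public:
    explicit MiniTraceEvent(std::string type) : type_(std::move(type)) {}

    MiniTraceEvent& detail(const std::string& key, const std::string& value) {
        details_.emplace_back(key, value);
        return *this;  // returning *this is what makes the .detail(...).detail(...) chain work
    }

    MiniTraceEvent& trackLatest(const std::string& key) {
        latestKey_ = key;  // the real code registers the event under this fixed name
        return *this;
    }

    ~MiniTraceEvent() {  // "flush" once the chained expression finishes
        std::cout << type_;
        for (const auto& kv : details_) std::cout << " " << kv.first << "=" << kv.second;
        if (!latestKey_.empty()) std::cout << " [latest: " << latestKey_ << "]";
        std::cout << std::endl;
    }
};

int main() {
    // Shape of the masterserver call sites above; the StatusCode value is an example.
    MiniTraceEvent("MasterRecoveryState")
        .detail("StatusCode", "2")
        .detail("Status", "locking_old_transaction_servers")
        .detail("MissingIDs", "")  // empty until tlog lock replies have been counted
        .trackLatest("MasterRecoveryState");
    return 0;
}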

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long