Merge pull request #2013 from etschannen/feature-dd-logging

Warn when different parts of shard relocations take more than 10 minutes
This commit is contained in:
A.J. Beamon 2019-08-27 08:55:53 -07:00 committed by GitHub
commit e0824f4915
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 31 additions and 4 deletions

View File

@ -2909,7 +2909,7 @@ ACTOR Future<Void> teamTracker(DDTeamCollection* self, Reference<TCTeamInfo> tea
}
} catch(Error& e) {
if(logTeamEvents) {
TraceEvent("TeamTrackerStopping", self->distributorId).detail("Team", team->getDesc());
TraceEvent("TeamTrackerStopping", self->distributorId).detail("Team", team->getDesc()).detail("Priority", team->getPriority());
}
self->priority_teams[team->getPriority()]--;
if (team->isHealthy()) {

View File

@ -879,6 +879,8 @@ ACTOR Future<Void> dataDistributionRelocator( DDQueueData *self, RelocateData rd
state bool allHealthy = true;
state bool anyWithSource = false;
state std::vector<std::pair<Reference<IDataDistributionTeam>,bool>> bestTeams;
state double startTime = now();
state std::vector<UID> destIds;
try {
if(now() - self->lastInterval < 1.0) {
@ -955,7 +957,7 @@ ACTOR Future<Void> dataDistributionRelocator( DDQueueData *self, RelocateData rd
wait( delay( SERVER_KNOBS->BEST_TEAM_STUCK_DELAY, TaskPriority::DataDistributionLaunch ) );
}
state std::vector<UID> destIds;
destIds.clear();
state std::vector<UID> healthyIds;
state std::vector<UID> extraIds;
state std::vector<ShardsAffectedByTeamFailure::Team> destinationTeams;
@ -1066,7 +1068,10 @@ ACTOR Future<Void> dataDistributionRelocator( DDQueueData *self, RelocateData rd
// onFinished.send( rs );
if( !error.code() ) {
TraceEvent(relocateShardInterval.end(), distributorId).detail("Result","Success");
TraceEvent(relocateShardInterval.end(), distributorId).detail("Duration", now() - startTime).detail("Result","Success");
if(now() - startTime > 600) {
TraceEvent(SevWarnAlways, "RelocateShardTooLong").detail("Duration", now() - startTime).detail("Dest", describe(destIds)).detail("Src", describe(rd.src));
}
if(rd.keys.begin == keyServersPrefix) {
TraceEvent("MovedKeyServerKeys").detail("Dest", describe(destIds)).trackLatest("MovedKeyServers");
}
@ -1090,7 +1095,10 @@ ACTOR Future<Void> dataDistributionRelocator( DDQueueData *self, RelocateData rd
}
}
} catch (Error& e) {
TraceEvent(relocateShardInterval.end(), distributorId).error(e, true);
TraceEvent(relocateShardInterval.end(), distributorId).error(e, true).detail("Duration", now() - startTime);
if(now() - startTime > 600) {
TraceEvent(SevWarnAlways, "RelocateShardTooLong").error(e, true).detail("Duration", now() - startTime).detail("Dest", describe(destIds)).detail("Src", describe(rd.src));
}
if( !signalledTransferComplete )
dataTransferComplete.send( rd );

View File

@ -258,11 +258,20 @@ ACTOR Future<vector<vector<UID>>> additionalSources(Standalone<RangeResultRef> s
return result;
}
ACTOR Future<Void> logWarningAfter( const char * context, double duration, vector<UID> servers) {
state double startTime = now();
loop {
wait(delay(duration));
TraceEvent(SevWarnAlways, context).detail("Duration", now() - startTime).detail("Servers", describe(servers));
}
}
// Set keyServers[keys].dest = servers
// Set serverKeys[servers][keys] = active for each subrange of keys that the server did not already have, complete for each subrange that it already has
// Set serverKeys[dest][keys] = "" for the dest servers of each existing shard in keys (unless that destination is a member of servers OR if the source list is sufficiently degraded)
ACTOR Future<Void> startMoveKeys( Database occ, KeyRange keys, vector<UID> servers, MoveKeysLock lock, FlowLock *startMoveKeysLock, UID relocationIntervalId ) {
state TraceInterval interval("RelocateShard_StartMoveKeys");
state Future<Void> warningLogger = logWarningAfter("StartMoveKeysTooLong", 600, servers);
//state TraceInterval waitInterval("");
wait( startMoveKeysLock->take( TaskPriority::DataDistributionLaunch ) );
@ -500,6 +509,7 @@ ACTOR Future<Void> finishMoveKeys( Database occ, KeyRange keys, vector<UID> dest
{
state TraceInterval interval("RelocateShard_FinishMoveKeys");
state TraceInterval waitInterval("");
state Future<Void> warningLogger = logWarningAfter("FinishMoveKeysTooLong", 600, destinationTeam);
state Key begin = keys.begin;
state Key endKey;
state int retries = 0;

View File

@ -1936,9 +1936,18 @@ void splitMutation(StorageServer* data, KeyRangeMap<T>& map, MutationRef const&
ASSERT(false); // Unknown mutation type in splitMutations
}
ACTOR Future<Void> logFetchKeysWarning(AddingShard* shard) {
state double startTime = now();
loop {
wait(delay(600));
TraceEvent(SevWarnAlways, "FetchKeysTooLong").detail("Duration", now() - startTime).detail("Phase", shard->phase).detail("Begin", shard->keys.begin.printable()).detail("End", shard->keys.end.printable());
}
}
ACTOR Future<Void> fetchKeys( StorageServer *data, AddingShard* shard ) {
state TraceInterval interval("FetchKeys");
state KeyRange keys = shard->keys;
state Future<Void> warningLogger = logFetchKeysWarning(shard);
state double startt = now();
state int fetchBlockBytes = BUGGIFY ? SERVER_KNOBS->BUGGIFY_BLOCK_BYTES : SERVER_KNOBS->FETCH_BLOCK_BYTES;