Mirror of https://github.com/apple/foundationdb.git, synced 2025-06-01 10:45:56 +08:00.
Merge pull request #2013 from etschannen/feature-dd-logging
Warn when different parts of shard relocations take more than 10 minutes
This commit is contained in:
commit
e0824f4915
@ -2909,7 +2909,7 @@ ACTOR Future<Void> teamTracker(DDTeamCollection* self, Reference<TCTeamInfo> tea
|
||||
}
|
||||
} catch(Error& e) {
|
||||
if(logTeamEvents) {
|
||||
TraceEvent("TeamTrackerStopping", self->distributorId).detail("Team", team->getDesc());
|
||||
TraceEvent("TeamTrackerStopping", self->distributorId).detail("Team", team->getDesc()).detail("Priority", team->getPriority());
|
||||
}
|
||||
self->priority_teams[team->getPriority()]--;
|
||||
if (team->isHealthy()) {
|
||||
|
@ -879,6 +879,8 @@ ACTOR Future<Void> dataDistributionRelocator( DDQueueData *self, RelocateData rd
|
||||
state bool allHealthy = true;
|
||||
state bool anyWithSource = false;
|
||||
state std::vector<std::pair<Reference<IDataDistributionTeam>,bool>> bestTeams;
|
||||
state double startTime = now();
|
||||
state std::vector<UID> destIds;
|
||||
|
||||
try {
|
||||
if(now() - self->lastInterval < 1.0) {
|
||||
@ -955,7 +957,7 @@ ACTOR Future<Void> dataDistributionRelocator( DDQueueData *self, RelocateData rd
|
||||
wait( delay( SERVER_KNOBS->BEST_TEAM_STUCK_DELAY, TaskPriority::DataDistributionLaunch ) );
|
||||
}
|
||||
|
||||
state std::vector<UID> destIds;
|
||||
destIds.clear();
|
||||
state std::vector<UID> healthyIds;
|
||||
state std::vector<UID> extraIds;
|
||||
state std::vector<ShardsAffectedByTeamFailure::Team> destinationTeams;
|
||||
@ -1066,7 +1068,10 @@ ACTOR Future<Void> dataDistributionRelocator( DDQueueData *self, RelocateData rd
|
||||
|
||||
// onFinished.send( rs );
|
||||
if( !error.code() ) {
|
||||
TraceEvent(relocateShardInterval.end(), distributorId).detail("Result","Success");
|
||||
TraceEvent(relocateShardInterval.end(), distributorId).detail("Duration", now() - startTime).detail("Result","Success");
|
||||
if(now() - startTime > 600) {
|
||||
TraceEvent(SevWarnAlways, "RelocateShardTooLong").detail("Duration", now() - startTime).detail("Dest", describe(destIds)).detail("Src", describe(rd.src));
|
||||
}
|
||||
if(rd.keys.begin == keyServersPrefix) {
|
||||
TraceEvent("MovedKeyServerKeys").detail("Dest", describe(destIds)).trackLatest("MovedKeyServers");
|
||||
}
|
||||
@ -1090,7 +1095,10 @@ ACTOR Future<Void> dataDistributionRelocator( DDQueueData *self, RelocateData rd
|
||||
}
|
||||
}
|
||||
} catch (Error& e) {
|
||||
TraceEvent(relocateShardInterval.end(), distributorId).error(e, true);
|
||||
TraceEvent(relocateShardInterval.end(), distributorId).error(e, true).detail("Duration", now() - startTime);
|
||||
if(now() - startTime > 600) {
|
||||
TraceEvent(SevWarnAlways, "RelocateShardTooLong").error(e, true).detail("Duration", now() - startTime).detail("Dest", describe(destIds)).detail("Src", describe(rd.src));
|
||||
}
|
||||
if( !signalledTransferComplete )
|
||||
dataTransferComplete.send( rd );
|
||||
|
||||
|
@ -258,11 +258,20 @@ ACTOR Future<vector<vector<UID>>> additionalSources(Standalone<RangeResultRef> s
|
||||
return result;
|
||||
}
|
||||
|
||||
// Periodically emits a SevWarnAlways trace event while some long-running move-keys
// operation is still outstanding. Intended to be raced against the real work: the
// caller keeps the returned Future alive for the duration of the operation and drops
// it on completion, which cancels this actor and silences further warnings.
//
//   context  - trace event type used for every emitted event
//   duration - seconds to wait between warnings (and before the first one)
//   servers  - storage server IDs included in each event for diagnostics
ACTOR Future<Void> logWarningAfter( const char * context, double duration, vector<UID> servers) {
	state double began = now();
	loop {
		// Sleep one full interval, then report the total time spent so far.
		wait(delay(duration));
		TraceEvent(SevWarnAlways, context).detail("Duration", now() - began).detail("Servers", describe(servers));
	}
}
|
||||
|
||||
// Set keyServers[keys].dest = servers
|
||||
// Set serverKeys[servers][keys] = active for each subrange of keys that the server did not already have, complete for each subrange that it already has
|
||||
// Set serverKeys[dest][keys] = "" for the dest servers of each existing shard in keys (unless that destination is a member of servers OR if the source list is sufficiently degraded)
|
||||
ACTOR Future<Void> startMoveKeys( Database occ, KeyRange keys, vector<UID> servers, MoveKeysLock lock, FlowLock *startMoveKeysLock, UID relocationIntervalId ) {
|
||||
state TraceInterval interval("RelocateShard_StartMoveKeys");
|
||||
state Future<Void> warningLogger = logWarningAfter("StartMoveKeysTooLong", 600, servers);
|
||||
//state TraceInterval waitInterval("");
|
||||
|
||||
wait( startMoveKeysLock->take( TaskPriority::DataDistributionLaunch ) );
|
||||
@ -500,6 +509,7 @@ ACTOR Future<Void> finishMoveKeys( Database occ, KeyRange keys, vector<UID> dest
|
||||
{
|
||||
state TraceInterval interval("RelocateShard_FinishMoveKeys");
|
||||
state TraceInterval waitInterval("");
|
||||
state Future<Void> warningLogger = logWarningAfter("FinishMoveKeysTooLong", 600, destinationTeam);
|
||||
state Key begin = keys.begin;
|
||||
state Key endKey;
|
||||
state int retries = 0;
|
||||
|
@ -1936,9 +1936,18 @@ void splitMutation(StorageServer* data, KeyRangeMap<T>& map, MutationRef const&
|
||||
ASSERT(false); // Unknown mutation type in splitMutations
|
||||
}
|
||||
|
||||
// Watchdog companion to fetchKeys(): every 600 seconds (10 minutes) that the fetch
// for `shard` has not finished, emit a SevWarnAlways event describing the stuck
// shard. The caller cancels this actor (by dropping its Future) once the fetch
// completes, so under normal operation nothing is ever logged.
//
// NOTE(review): `shard` is a raw pointer owned by the caller; this actor assumes it
// outlives the returned Future — confirm fetchKeys() guarantees that.
ACTOR Future<Void> logFetchKeysWarning(AddingShard* shard) {
	state double began = now();
	loop {
		wait(delay(600));
		// Duration is measured from actor start; Phase/Begin/End identify which
		// stage of which shard's fetch is taking too long.
		TraceEvent(SevWarnAlways, "FetchKeysTooLong").detail("Duration", now() - began).detail("Phase", shard->phase).detail("Begin", shard->keys.begin.printable()).detail("End", shard->keys.end.printable());
	}
}
|
||||
|
||||
ACTOR Future<Void> fetchKeys( StorageServer *data, AddingShard* shard ) {
|
||||
state TraceInterval interval("FetchKeys");
|
||||
state KeyRange keys = shard->keys;
|
||||
state Future<Void> warningLogger = logFetchKeysWarning(shard);
|
||||
state double startt = now();
|
||||
state int fetchBlockBytes = BUGGIFY ? SERVER_KNOBS->BUGGIFY_BLOCK_BYTES : SERVER_KNOBS->FETCH_BLOCK_BYTES;
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user