diff --git a/fdbserver/DataDistribution.actor.cpp b/fdbserver/DataDistribution.actor.cpp index 4306cc0708..af34f1da0f 100644 --- a/fdbserver/DataDistribution.actor.cpp +++ b/fdbserver/DataDistribution.actor.cpp @@ -2909,7 +2909,7 @@ ACTOR Future teamTracker(DDTeamCollection* self, Reference tea } } catch(Error& e) { if(logTeamEvents) { - TraceEvent("TeamTrackerStopping", self->distributorId).detail("Team", team->getDesc()); + TraceEvent("TeamTrackerStopping", self->distributorId).detail("Team", team->getDesc()).detail("Priority", team->getPriority()); } self->priority_teams[team->getPriority()]--; if (team->isHealthy()) { diff --git a/fdbserver/DataDistributionQueue.actor.cpp b/fdbserver/DataDistributionQueue.actor.cpp index 7ea6597c24..c8c22c859d 100644 --- a/fdbserver/DataDistributionQueue.actor.cpp +++ b/fdbserver/DataDistributionQueue.actor.cpp @@ -879,6 +879,8 @@ ACTOR Future dataDistributionRelocator( DDQueueData *self, RelocateData rd state bool allHealthy = true; state bool anyWithSource = false; state std::vector,bool>> bestTeams; + state double startTime = now(); + state std::vector destIds; try { if(now() - self->lastInterval < 1.0) { @@ -955,7 +957,7 @@ ACTOR Future dataDistributionRelocator( DDQueueData *self, RelocateData rd wait( delay( SERVER_KNOBS->BEST_TEAM_STUCK_DELAY, TaskPriority::DataDistributionLaunch ) ); } - state std::vector destIds; + destIds.clear(); state std::vector healthyIds; state std::vector extraIds; state std::vector destinationTeams; @@ -1066,7 +1068,10 @@ ACTOR Future dataDistributionRelocator( DDQueueData *self, RelocateData rd // onFinished.send( rs ); if( !error.code() ) { - TraceEvent(relocateShardInterval.end(), distributorId).detail("Result","Success"); + TraceEvent(relocateShardInterval.end(), distributorId).detail("Duration", now() - startTime).detail("Result","Success"); + if(now() - startTime > 600) { + TraceEvent(SevWarnAlways, "RelocateShardTooLong").detail("Duration", now() - startTime).detail("Dest", describe(destIds)).detail("Src", describe(rd.src)); + } if(rd.keys.begin == keyServersPrefix) { TraceEvent("MovedKeyServerKeys").detail("Dest", describe(destIds)).trackLatest("MovedKeyServers"); } @@ -1090,7 +1095,10 @@ ACTOR Future dataDistributionRelocator( DDQueueData *self, RelocateData rd } } } catch (Error& e) { - TraceEvent(relocateShardInterval.end(), distributorId).error(e, true); + TraceEvent(relocateShardInterval.end(), distributorId).error(e, true).detail("Duration", now() - startTime); + if(now() - startTime > 600) { + TraceEvent(SevWarnAlways, "RelocateShardTooLong").error(e, true).detail("Duration", now() - startTime).detail("Dest", describe(destIds)).detail("Src", describe(rd.src)); + } if( !signalledTransferComplete ) dataTransferComplete.send( rd ); diff --git a/fdbserver/MoveKeys.actor.cpp b/fdbserver/MoveKeys.actor.cpp index ff77f7db35..d95f14a19d 100644 --- a/fdbserver/MoveKeys.actor.cpp +++ b/fdbserver/MoveKeys.actor.cpp @@ -258,11 +258,20 @@ ACTOR Future>> additionalSources(Standalone s return result; } +ACTOR Future logWarningAfter( const char * context, double duration, vector servers) { + state double startTime = now(); + loop { + wait(delay(duration)); + TraceEvent(SevWarnAlways, context).detail("Duration", now() - startTime).detail("Servers", describe(servers)); + } +} + // Set keyServers[keys].dest = servers // Set serverKeys[servers][keys] = active for each subrange of keys that the server did not already have, complete for each subrange that it already has // Set serverKeys[dest][keys] = "" for the dest servers of each existing shard in keys (unless that destination is a member of servers OR if the source list is sufficiently degraded) ACTOR Future startMoveKeys( Database occ, KeyRange keys, vector servers, MoveKeysLock lock, FlowLock *startMoveKeysLock, UID relocationIntervalId ) { state TraceInterval interval("RelocateShard_StartMoveKeys"); + state Future warningLogger = logWarningAfter("StartMoveKeysTooLong", 600, servers); //state TraceInterval waitInterval(""); wait( startMoveKeysLock->take( TaskPriority::DataDistributionLaunch ) ); @@ -500,6 +509,7 @@ ACTOR Future finishMoveKeys( Database occ, KeyRange keys, vector dest { state TraceInterval interval("RelocateShard_FinishMoveKeys"); state TraceInterval waitInterval(""); + state Future warningLogger = logWarningAfter("FinishMoveKeysTooLong", 600, destinationTeam); state Key begin = keys.begin; state Key endKey; state int retries = 0; diff --git a/fdbserver/storageserver.actor.cpp b/fdbserver/storageserver.actor.cpp index 5d1f9cbb40..1024916598 100644 --- a/fdbserver/storageserver.actor.cpp +++ b/fdbserver/storageserver.actor.cpp @@ -1936,9 +1936,18 @@ void splitMutation(StorageServer* data, KeyRangeMap& map, MutationRef const& ASSERT(false); // Unknown mutation type in splitMutations } +ACTOR Future logFetchKeysWarning(AddingShard* shard) { + state double startTime = now(); + loop { + wait(delay(600)); + TraceEvent(SevWarnAlways, "FetchKeysTooLong").detail("Duration", now() - startTime).detail("Phase", shard->phase).detail("Begin", shard->keys.begin.printable()).detail("End", shard->keys.end.printable()); + } +} + ACTOR Future fetchKeys( StorageServer *data, AddingShard* shard ) { state TraceInterval interval("FetchKeys"); state KeyRange keys = shard->keys; + state Future warningLogger = logFetchKeysWarning(shard); state double startt = now(); state int fetchBlockBytes = BUGGIFY ? SERVER_KNOBS->BUGGIFY_BLOCK_BYTES : SERVER_KNOBS->FETCH_BLOCK_BYTES;