From 297b65236f746a806ca3ab24e47798eb1fe01e84 Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Fri, 16 Aug 2019 14:56:58 -0700 Subject: [PATCH 1/3] added additional trace events to warn when different parts of shard relocations take more than 10 minutes --- fdbserver/DataDistribution.actor.cpp | 2 +- fdbserver/DataDistributionQueue.actor.cpp | 10 +++++++++- fdbserver/MoveKeys.actor.cpp | 10 ++++++++++ fdbserver/storageserver.actor.cpp | 9 +++++++++ 4 files changed, 29 insertions(+), 2 deletions(-) diff --git a/fdbserver/DataDistribution.actor.cpp b/fdbserver/DataDistribution.actor.cpp index 4306cc0708..af34f1da0f 100644 --- a/fdbserver/DataDistribution.actor.cpp +++ b/fdbserver/DataDistribution.actor.cpp @@ -2909,7 +2909,7 @@ ACTOR Future teamTracker(DDTeamCollection* self, Reference tea } } catch(Error& e) { if(logTeamEvents) { - TraceEvent("TeamTrackerStopping", self->distributorId).detail("Team", team->getDesc()); + TraceEvent("TeamTrackerStopping", self->distributorId).detail("Team", team->getDesc()).detail("Priority", team->getPriority()); } self->priority_teams[team->getPriority()]--; if (team->isHealthy()) { diff --git a/fdbserver/DataDistributionQueue.actor.cpp b/fdbserver/DataDistributionQueue.actor.cpp index 7ea6597c24..6d3accaced 100644 --- a/fdbserver/DataDistributionQueue.actor.cpp +++ b/fdbserver/DataDistributionQueue.actor.cpp @@ -879,6 +879,8 @@ ACTOR Future dataDistributionRelocator( DDQueueData *self, RelocateData rd state bool allHealthy = true; state bool anyWithSource = false; state std::vector,bool>> bestTeams; + state double startTime = now(); + state std::vector destIds; try { if(now() - self->lastInterval < 1.0) { @@ -955,7 +957,7 @@ ACTOR Future dataDistributionRelocator( DDQueueData *self, RelocateData rd wait( delay( SERVER_KNOBS->BEST_TEAM_STUCK_DELAY, TaskPriority::DataDistributionLaunch ) ); } - state std::vector destIds; + destIds.clear(); state std::vector healthyIds; state std::vector extraIds; state std::vector destinationTeams; @@ -1067,6 +1069,9 @@ ACTOR Future dataDistributionRelocator( DDQueueData *self, RelocateData rd // onFinished.send( rs ); if( !error.code() ) { TraceEvent(relocateShardInterval.end(), distributorId).detail("Result","Success"); + if(now() - startTime > 600) { + TraceEvent(SevWarnAlways, "RelocateShardTooLong").detail("Duration", now() - startTime).detail("Dest", describe(destIds)); + } if(rd.keys.begin == keyServersPrefix) { TraceEvent("MovedKeyServerKeys").detail("Dest", describe(destIds)).trackLatest("MovedKeyServers"); } @@ -1091,6 +1096,9 @@ ACTOR Future dataDistributionRelocator( DDQueueData *self, RelocateData rd } } catch (Error& e) { TraceEvent(relocateShardInterval.end(), distributorId).error(e, true); + if(now() - startTime > 600) { + TraceEvent(SevWarnAlways, "RelocateShardTooLong").error(e, true).detail("Duration", now() - startTime).detail("Dest", describe(destIds)); + } if( !signalledTransferComplete ) dataTransferComplete.send( rd ); diff --git a/fdbserver/MoveKeys.actor.cpp b/fdbserver/MoveKeys.actor.cpp index ff77f7db35..d95f14a19d 100644 --- a/fdbserver/MoveKeys.actor.cpp +++ b/fdbserver/MoveKeys.actor.cpp @@ -258,11 +258,20 @@ ACTOR Future>> additionalSources(Standalone s return result; } +ACTOR Future logWarningAfter( const char * context, double duration, vector servers) { + state double startTime = now(); + loop { + wait(delay(duration)); + TraceEvent(SevWarnAlways, context).detail("Duration", now() - startTime).detail("Servers", describe(servers)); + } +} + // Set keyServers[keys].dest = servers // Set serverKeys[servers][keys] = active for each subrange of keys that the server did not already have, complete for each subrange that it already has // Set serverKeys[dest][keys] = "" for the dest servers of each existing shard in keys (unless that destination is a member of servers OR if the source list is sufficiently degraded) ACTOR Future startMoveKeys( Database occ, KeyRange keys, vector servers, MoveKeysLock lock, FlowLock *startMoveKeysLock, UID relocationIntervalId ) { state TraceInterval interval("RelocateShard_StartMoveKeys"); + state Future warningLogger = logWarningAfter("StartMoveKeysTooLong", 600, servers); //state TraceInterval waitInterval(""); wait( startMoveKeysLock->take( TaskPriority::DataDistributionLaunch ) ); @@ -500,6 +509,7 @@ ACTOR Future finishMoveKeys( Database occ, KeyRange keys, vector dest { state TraceInterval interval("RelocateShard_FinishMoveKeys"); state TraceInterval waitInterval(""); + state Future warningLogger = logWarningAfter("FinishMoveKeysTooLong", 600, destinationTeam); state Key begin = keys.begin; state Key endKey; state int retries = 0; diff --git a/fdbserver/storageserver.actor.cpp b/fdbserver/storageserver.actor.cpp index 5d1f9cbb40..1024916598 100644 --- a/fdbserver/storageserver.actor.cpp +++ b/fdbserver/storageserver.actor.cpp @@ -1936,9 +1936,18 @@ void splitMutation(StorageServer* data, KeyRangeMap& map, MutationRef const& ASSERT(false); // Unknown mutation type in splitMutations } +ACTOR Future logFetchKeysWarning(AddingShard* shard) { + state double startTime = now(); + loop { + wait(delay(600)); + TraceEvent(SevWarnAlways, "FetchKeysTooLong").detail("Duration", now() - startTime).detail("Phase", shard->phase).detail("Begin", shard->keys.begin.printable()).detail("End", shard->keys.end.printable()); + } +} + ACTOR Future fetchKeys( StorageServer *data, AddingShard* shard ) { state TraceInterval interval("FetchKeys"); state KeyRange keys = shard->keys; + state Future warningLogger = logFetchKeysWarning(shard); state double startt = now(); state int fetchBlockBytes = BUGGIFY ? SERVER_KNOBS->BUGGIFY_BLOCK_BYTES : SERVER_KNOBS->FETCH_BLOCK_BYTES; From d30d4cb955fc85ac2128696c6ed42bd4c2ff4942 Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Fri, 16 Aug 2019 15:15:36 -0700 Subject: [PATCH 2/3] Added a duration to regular relocateShard trace events --- fdbserver/DataDistributionQueue.actor.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fdbserver/DataDistributionQueue.actor.cpp b/fdbserver/DataDistributionQueue.actor.cpp index 6d3accaced..467afe1219 100644 --- a/fdbserver/DataDistributionQueue.actor.cpp +++ b/fdbserver/DataDistributionQueue.actor.cpp @@ -1068,7 +1068,7 @@ ACTOR Future dataDistributionRelocator( DDQueueData *self, RelocateData rd // onFinished.send( rs ); if( !error.code() ) { - TraceEvent(relocateShardInterval.end(), distributorId).detail("Result","Success"); + TraceEvent(relocateShardInterval.end(), distributorId).detail("Duration", now() - startTime).detail("Result","Success"); if(now() - startTime > 600) { TraceEvent(SevWarnAlways, "RelocateShardTooLong").detail("Duration", now() - startTime).detail("Dest", describe(destIds)); } @@ -1095,7 +1095,7 @@ ACTOR Future dataDistributionRelocator( DDQueueData *self, RelocateData rd } } } catch (Error& e) { - TraceEvent(relocateShardInterval.end(), distributorId).error(e, true); + TraceEvent(relocateShardInterval.end(), distributorId).error(e, true).detail("Duration", now() - startTime); if(now() - startTime > 600) { TraceEvent(SevWarnAlways, "RelocateShardTooLong").error(e, true).detail("Duration", now() - startTime).detail("Dest", describe(destIds)); } From ac68c8e4fd3d2ec86eeadc6a7deb3ac08419098c Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Wed, 21 Aug 2019 11:48:29 -0700 Subject: [PATCH 3/3] added sources servers to the warning message --- fdbserver/DataDistributionQueue.actor.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fdbserver/DataDistributionQueue.actor.cpp b/fdbserver/DataDistributionQueue.actor.cpp index 467afe1219..c8c22c859d 100644 --- a/fdbserver/DataDistributionQueue.actor.cpp +++ b/fdbserver/DataDistributionQueue.actor.cpp @@ -1070,7 +1070,7 @@ ACTOR Future dataDistributionRelocator( DDQueueData *self, RelocateData rd if( !error.code() ) { TraceEvent(relocateShardInterval.end(), distributorId).detail("Duration", now() - startTime).detail("Result","Success"); if(now() - startTime > 600) { - TraceEvent(SevWarnAlways, "RelocateShardTooLong").detail("Duration", now() - startTime).detail("Dest", describe(destIds)); + TraceEvent(SevWarnAlways, "RelocateShardTooLong").detail("Duration", now() - startTime).detail("Dest", describe(destIds)).detail("Src", describe(rd.src)); } if(rd.keys.begin == keyServersPrefix) { TraceEvent("MovedKeyServerKeys").detail("Dest", describe(destIds)).trackLatest("MovedKeyServers"); @@ -1097,7 +1097,7 @@ ACTOR Future dataDistributionRelocator( DDQueueData *self, RelocateData rd } catch (Error& e) { TraceEvent(relocateShardInterval.end(), distributorId).error(e, true).detail("Duration", now() - startTime); if(now() - startTime > 600) { - TraceEvent(SevWarnAlways, "RelocateShardTooLong").error(e, true).detail("Duration", now() - startTime).detail("Dest", describe(destIds)); + TraceEvent(SevWarnAlways, "RelocateShardTooLong").error(e, true).detail("Duration", now() - startTime).detail("Dest", describe(destIds)).detail("Src", describe(rd.src)); } if( !signalledTransferComplete ) dataTransferComplete.send( rd );