Merge pull request #2013 from etschannen/feature-dd-logging

Warn when different parts of shard relocations take more than 10 minutes
2025-06-01 10:45:56 +08:00 · 2019-08-27 08:55:53 -07:00 · 2019-08-27 08:55:53 -07:00 · e0824f4915
commit e0824f4915
parent fff0d37595 ac68c8e4fd
4 changed files with 31 additions and 4 deletions
--- a/fdbserver/DataDistribution.actor.cpp
+++ b/fdbserver/DataDistribution.actor.cpp
@ -2909,7 +2909,7 @@ ACTOR Future<Void> teamTracker(DDTeamCollection* self, Reference<TCTeamInfo> tea
 		}
 	} catch(Error& e) {
 		if(logTeamEvents) {
-			TraceEvent("TeamTrackerStopping", self->distributorId).detail("Team", team->getDesc());
+			TraceEvent("TeamTrackerStopping", self->distributorId).detail("Team", team->getDesc()).detail("Priority", team->getPriority());
 		}
 		self->priority_teams[team->getPriority()]--;
 		if (team->isHealthy()) {
--- a/fdbserver/DataDistributionQueue.actor.cpp
+++ b/fdbserver/DataDistributionQueue.actor.cpp
@ -879,6 +879,8 @@ ACTOR Future<Void> dataDistributionRelocator( DDQueueData *self, RelocateData rd
 	state bool allHealthy = true;
 	state bool anyWithSource = false;
 	state std::vector<std::pair<Reference<IDataDistributionTeam>,bool>> bestTeams;
+	state double startTime = now();
+	state std::vector<UID> destIds;

 	try {
 		if(now() - self->lastInterval < 1.0) {
@ -955,7 +957,7 @@ ACTOR Future<Void> dataDistributionRelocator( DDQueueData *self, RelocateData rd
 				wait( delay( SERVER_KNOBS->BEST_TEAM_STUCK_DELAY, TaskPriority::DataDistributionLaunch ) );
 			}

-			state std::vector<UID> destIds;
+			destIds.clear();
 			state std::vector<UID> healthyIds;
 			state std::vector<UID> extraIds;
 			state std::vector<ShardsAffectedByTeamFailure::Team> destinationTeams;
@ -1066,7 +1068,10 @@ ACTOR Future<Void> dataDistributionRelocator( DDQueueData *self, RelocateData rd

 				// onFinished.send( rs );
 				if( !error.code() ) {
-					TraceEvent(relocateShardInterval.end(), distributorId).detail("Result","Success");
+					TraceEvent(relocateShardInterval.end(), distributorId).detail("Duration", now() - startTime).detail("Result","Success");
+					if(now() - startTime > 600) {
+						TraceEvent(SevWarnAlways, "RelocateShardTooLong").detail("Duration", now() - startTime).detail("Dest", describe(destIds)).detail("Src", describe(rd.src));
+					}
 					if(rd.keys.begin == keyServersPrefix) {
 						TraceEvent("MovedKeyServerKeys").detail("Dest", describe(destIds)).trackLatest("MovedKeyServers");
 					}
@ -1090,7 +1095,10 @@ ACTOR Future<Void> dataDistributionRelocator( DDQueueData *self, RelocateData rd
 			}
 		}
 	} catch (Error& e) {
-		TraceEvent(relocateShardInterval.end(), distributorId).error(e, true);
+		TraceEvent(relocateShardInterval.end(), distributorId).error(e, true).detail("Duration", now() - startTime);
+		if(now() - startTime > 600) {
+			TraceEvent(SevWarnAlways, "RelocateShardTooLong").error(e, true).detail("Duration", now() - startTime).detail("Dest", describe(destIds)).detail("Src", describe(rd.src));
+		}
 		if( !signalledTransferComplete )
 			dataTransferComplete.send( rd );

--- a/fdbserver/MoveKeys.actor.cpp
+++ b/fdbserver/MoveKeys.actor.cpp
@ -258,11 +258,20 @@ ACTOR Future<vector<vector<UID>>> additionalSources(Standalone<RangeResultRef> s
 	return result;
 }

+ACTOR Future<Void> logWarningAfter( const char * context, double duration, vector<UID> servers) {
+	state double startTime = now();
+	loop {
+		wait(delay(duration));
+		TraceEvent(SevWarnAlways, context).detail("Duration", now() - startTime).detail("Servers", describe(servers));
+	}
+}
+
 // Set keyServers[keys].dest = servers
 // Set serverKeys[servers][keys] = active for each subrange of keys that the server did not already have, complete for each subrange that it already has
 // Set serverKeys[dest][keys] = "" for the dest servers of each existing shard in keys (unless that destination is a member of servers OR if the source list is sufficiently degraded)
 ACTOR Future<Void> startMoveKeys( Database occ, KeyRange keys, vector<UID> servers, MoveKeysLock lock, FlowLock *startMoveKeysLock, UID relocationIntervalId ) {
 	state TraceInterval interval("RelocateShard_StartMoveKeys");
+	state Future<Void> warningLogger = logWarningAfter("StartMoveKeysTooLong", 600, servers);
 	//state TraceInterval waitInterval("");

 	wait( startMoveKeysLock->take( TaskPriority::DataDistributionLaunch ) );
@ -500,6 +509,7 @@ ACTOR Future<Void> finishMoveKeys( Database occ, KeyRange keys, vector<UID> dest
 {
 	state TraceInterval interval("RelocateShard_FinishMoveKeys");
 	state TraceInterval waitInterval("");
+	state Future<Void> warningLogger = logWarningAfter("FinishMoveKeysTooLong", 600, destinationTeam);
 	state Key begin = keys.begin;
 	state Key endKey;
 	state int retries = 0;
--- a/fdbserver/storageserver.actor.cpp
+++ b/fdbserver/storageserver.actor.cpp
@ -1936,9 +1936,18 @@ void splitMutation(StorageServer* data, KeyRangeMap<T>& map, MutationRef const&
 		ASSERT(false);  // Unknown mutation type in splitMutations
 }

+ACTOR Future<Void> logFetchKeysWarning(AddingShard* shard) {
+	state double startTime = now();
+	loop {
+		wait(delay(600));
+		TraceEvent(SevWarnAlways, "FetchKeysTooLong").detail("Duration", now() - startTime).detail("Phase", shard->phase).detail("Begin", shard->keys.begin.printable()).detail("End", shard->keys.end.printable());
+	}
+}
+
 ACTOR Future<Void> fetchKeys( StorageServer *data, AddingShard* shard ) {
 	state TraceInterval interval("FetchKeys");
 	state KeyRange keys = shard->keys;
+	state Future<Void> warningLogger = logFetchKeysWarning(shard);
 	state double startt = now();
 	state int fetchBlockBytes = BUGGIFY ? SERVER_KNOBS->BUGGIFY_BLOCK_BYTES : SERVER_KNOBS->FETCH_BLOCK_BYTES;