From 9e137d3b49c8fa1a4f504b4a69a37a37532d6d0c Mon Sep 17 00:00:00 2001
From: Evan Tschannen <ejt@apple.com>
Date: Thu, 19 Dec 2019 18:26:29 -0800
Subject: [PATCH] fix: addPeerReference only marks a connection as healthy if
 it is the first peerReference added additional logging to long LoadBalance
 calls, and when the failure monitor state changes for an address

---
 fdbrpc/FailureMonitor.actor.cpp | 10 ++++++++--
 fdbrpc/FlowTransport.actor.cpp  |  6 ++++--
 fdbrpc/LoadBalance.actor.h      | 16 ++++++++++++++++
 flow/Trace.h                    |  4 ++++
 4 files changed, 32 insertions(+), 4 deletions(-)

diff --git a/fdbrpc/FailureMonitor.actor.cpp b/fdbrpc/FailureMonitor.actor.cpp
index 0e6fa9d135..2c5a449e44 100644
--- a/fdbrpc/FailureMonitor.actor.cpp
+++ b/fdbrpc/FailureMonitor.actor.cpp
@@ -73,9 +73,9 @@ void SimpleFailureMonitor::setStatus( NetworkAddress const& address, FailureStat
 	// for an endpoint that is waited on changes, the waiter sees its failure status change
 	auto it = addressStatus.find(address);
 
-	//TraceEvent("NotifyFailureStatus").detail("Address", address).detail("Status", status.failed ? "Failed" : "OK").detail("Present", it == addressStatus.end());
 	if (it == addressStatus.end()) {
 		if (status != FailureStatus()) {
+			TraceEvent("NotifyAddressHealthy").suppressFor(1.0).detail("Address", address);
 			addressStatus[address]=status;
 			endpointKnownFailed.triggerRange( Endpoint({address}, UID()), Endpoint({address}, UID(-1,-1)) );
 		}
@@ -85,8 +85,14 @@ void SimpleFailureMonitor::setStatus( NetworkAddress const& address, FailureStat
 			it->second = status;
 		else
 			addressStatus.erase(it);
-		if(triggerEndpoint)
+		if(triggerEndpoint) {
+			if(status.failed) {
+				TraceEvent("NotifyAddressFailed").suppressFor(1.0).detail("Address", address);
+			} else {
+				TraceEvent("NotifyAddressHealthyPresent").suppressFor(1.0).detail("Address", address);
+			}
 			endpointKnownFailed.triggerRange( Endpoint({address}, UID()), Endpoint({address}, UID(-1,-1)) );
+		}
 	}
 }
 
diff --git a/fdbrpc/FlowTransport.actor.cpp b/fdbrpc/FlowTransport.actor.cpp
index 260a27718f..ab93d8ae6e 100644
--- a/fdbrpc/FlowTransport.actor.cpp
+++ b/fdbrpc/FlowTransport.actor.cpp
@@ -1094,11 +1094,13 @@ Endpoint FlowTransport::loadedEndpoint( const UID& token ) {
 void FlowTransport::addPeerReference(const Endpoint& endpoint, bool isStream) {
 	if (!isStream || !endpoint.getPrimaryAddress().isValid())
 		return;
-	else if (FlowTransport::transport().isClient())
-		IFailureMonitor::failureMonitor().setStatus(endpoint.getPrimaryAddress(), FailureStatus(false));
 
 	Reference<Peer> peer = self->getOrOpenPeer(endpoint.getPrimaryAddress());
+	
 	if(peer->peerReferences == -1) {
+		if (FlowTransport::transport().isClient()) {
+			IFailureMonitor::failureMonitor().setStatus(endpoint.getPrimaryAddress(), FailureStatus(false));
+		}
 		peer->peerReferences = 1;
 	} else {
 		peer->peerReferences++;
diff --git a/fdbrpc/LoadBalance.actor.h b/fdbrpc/LoadBalance.actor.h
index 0f8a216ddb..d6192209cc 100644
--- a/fdbrpc/LoadBalance.actor.h
+++ b/fdbrpc/LoadBalance.actor.h
@@ -185,6 +185,7 @@ Future< REPLY_TYPE(Request) > loadBalance(
 	state Future<Void> secondDelay = Never();
 
 	state Promise<Void> requestFinished;
+	state double startTime = now();
 	
 	setReplyPriority(request, taskID);
 	if (!alternatives)
@@ -278,6 +279,21 @@ Future< REPLY_TYPE(Request) > loadBalance(
 	state double backoff = 0;
 	state bool triedAllOptions = false;
 	loop {
+		if(now() - startTime > g_network->isSimulated() ? 30.0 : 600.0) {
+			TraceEvent ev(g_network->isSimulated() ? SevWarn : SevWarnAlways, "LoadBalanceTooLong");
+			ev.suppressFor(1.0);
+			ev.detail("Duration", now() - startTime);
+			ev.detail("NumAttempts", numAttempts);
+			ev.detail("Backoff", backoff);
+			ev.detail("TriedAllOptions", triedAllOptions);
+			if(ev.isEnabled()) {
+				for(int alternativeNum=0; alternativeNum<alternatives->size(); alternativeNum++) {
+					RequestStream<Request> const* thisStream = &alternatives->get( alternativeNum, channel );
+					TraceEvent(g_network->isSimulated() ? SevWarn : SevWarnAlways, "LoadBalanceTooLongEndpoint").detail("Addr", thisStream->getEndpoint().getPrimaryAddress()).detail("Token", thisStream->getEndpoint().token).detail("Failed", IFailureMonitor::failureMonitor().getState( thisStream->getEndpoint() ).failed);
+				}
+			}
+		}
+
 		// Find an alternative, if any, that is not failed, starting with nextAlt
 		state RequestStream<Request> const* stream = NULL;
 		for(int alternativeNum=0; alternativeNum<alternatives->size(); alternativeNum++) {
diff --git a/flow/Trace.h b/flow/Trace.h
index da2d39fc55..a49d676df4 100644
--- a/flow/Trace.h
+++ b/flow/Trace.h
@@ -460,6 +460,10 @@ public:
 
 	TraceEvent& GetLastError();
 
+	bool isEnabled() const {
+		return enabled;
+	}
+
 	~TraceEvent();  // Actually logs the event
 
 	// Return the number of invocations of TraceEvent() at the specified logging level.