From 18b852c4e4882ad965e35ca68fad822c1c6330bc Mon Sep 17 00:00:00 2001
From: Vaidas Gasiunas <vaidas.gasiunas@snowflake.com>
Date: Thu, 3 Nov 2022 13:20:21 +0100
Subject: [PATCH] Improving troubleshooting of stopping the FDB client thread
 (#8629)

* Upgrade tests: dump thread call stacks of the tester process if it fails to terminate

* ApiTester: log before and after stopping the network thread

* Catch and print exceptions in closeTraceFile; Close trace file at the end of MVC runNetwork

* Change trace event name for MVC runNetwork termination

Co-authored-by: A.J. Beamon <aj.beamon@snowflake.com>

Co-authored-by: A.J. Beamon <aj.beamon@snowflake.com>
---
 .../c/test/apitester/fdb_c_api_tester.cpp     |  2 ++
 fdbclient/MultiVersionTransaction.actor.cpp   | 11 ++++++-
 fdbclient/ThreadSafeTransaction.cpp           | 10 +++---
 flow/Trace.cpp                                | 33 +++++++++++--------
 tests/TestRunner/upgrade_test.py              |  8 +++--
 5 files changed, 40 insertions(+), 24 deletions(-)

diff --git a/bindings/c/test/apitester/fdb_c_api_tester.cpp b/bindings/c/test/apitester/fdb_c_api_tester.cpp
index 83977fb29e..161338d1eb 100644
--- a/bindings/c/test/apitester/fdb_c_api_tester.cpp
+++ b/bindings/c/test/apitester/fdb_c_api_tester.cpp
@@ -459,8 +459,10 @@ int main(int argc, char** argv) {
 			retCode = 1;
 		}
 
+		fprintf(stderr, "Stopping FDB network thread\n");
 		fdb_check(fdb::network::stop(), "Failed to stop FDB thread");
 		network_thread.join();
+		fprintf(stderr, "FDB network thread successfully stopped\n");
 	} catch (const std::exception& err) {
 		fmt::print(stderr, "ERROR: {}\n", err.what());
 		retCode = 1;
diff --git a/fdbclient/MultiVersionTransaction.actor.cpp b/fdbclient/MultiVersionTransaction.actor.cpp
index 3e0c2e18db..bbf376c805 100644
--- a/fdbclient/MultiVersionTransaction.actor.cpp
+++ b/fdbclient/MultiVersionTransaction.actor.cpp
@@ -18,6 +18,7 @@
  * limitations under the License.
  */
 
+#include "flow/Trace.h"
 #ifdef ADDRESS_SANITIZER
 #include <sanitizer/lsan_interface.h>
 #endif
@@ -2812,11 +2813,19 @@ void MultiVersionApi::runNetwork() {
 		});
 	}
 
-	localClient->api->runNetwork();
+	try {
+		localClient->api->runNetwork();
+	} catch (const Error& e) {
+		closeTraceFile();
+		throw e;
+	}
 
 	for (auto h : handles) {
 		waitThread(h);
 	}
+
+	TraceEvent("MultiVersionRunNetworkTerminating");
+	closeTraceFile();
 }
 
 void MultiVersionApi::stopNetwork() {
diff --git a/fdbclient/ThreadSafeTransaction.cpp b/fdbclient/ThreadSafeTransaction.cpp
index a79578a509..5821b30dee 100644
--- a/fdbclient/ThreadSafeTransaction.cpp
+++ b/fdbclient/ThreadSafeTransaction.cpp
@@ -743,10 +743,10 @@ void ThreadSafeApi::runNetwork() {
 	Optional<Error> runErr;
 	try {
 		::runNetwork();
-	} catch (Error& e) {
+	} catch (const Error& e) {
 		TraceEvent(SevError, "RunNetworkError").error(e);
 		runErr = e;
-	} catch (std::exception& e) {
+	} catch (const std::exception& e) {
 		runErr = unknown_error();
 		TraceEvent(SevError, "RunNetworkError").error(unknown_error()).detail("RootException", e.what());
 	} catch (...) {
@@ -757,9 +757,9 @@ void ThreadSafeApi::runNetwork() {
 	for (auto& hook : threadCompletionHooks) {
 		try {
 			hook.first(hook.second);
-		} catch (Error& e) {
+		} catch (const Error& e) {
 			TraceEvent(SevError, "NetworkShutdownHookError").error(e);
-		} catch (std::exception& e) {
+		} catch (const std::exception& e) {
 			TraceEvent(SevError, "NetworkShutdownHookError").error(unknown_error()).detail("RootException", e.what());
 		} catch (...) {
 			TraceEvent(SevError, "NetworkShutdownHookError").error(unknown_error());
@@ -767,12 +767,10 @@ void ThreadSafeApi::runNetwork() {
 	}
 
 	if (runErr.present()) {
-		closeTraceFile();
 		throw runErr.get();
 	}
 
 	TraceEvent("RunNetworkTerminating");
-	closeTraceFile();
 }
 
 void ThreadSafeApi::stopNetwork() {
diff --git a/flow/Trace.cpp b/flow/Trace.cpp
index 4717247d16..41b23ad491 100644
--- a/flow/Trace.cpp
+++ b/flow/Trace.cpp
@@ -25,6 +25,7 @@
 #include "flow/JsonTraceLogFormatter.h"
 #include "flow/flow.h"
 #include "flow/DeterministicRandom.h"
+#include <exception>
 #include <stdlib.h>
 #include <stdarg.h>
 #include <cctype>
@@ -514,25 +515,29 @@ public:
 
 	void close() {
 		if (opened) {
-			MutexHolder hold(mutex);
+			try {
+				MutexHolder hold(mutex);
 
-			// Write remaining contents
-			auto a = new WriterThread::WriteBuffer(std::move(eventBuffer));
-			loggedLength += bufferLength;
-			eventBuffer = std::vector<TraceEventFields>();
-			bufferLength = 0;
-			writer->post(a);
+				// Write remaining contents
+				auto a = new WriterThread::WriteBuffer(std::move(eventBuffer));
+				loggedLength += bufferLength;
+				eventBuffer = std::vector<TraceEventFields>();
+				bufferLength = 0;
+				writer->post(a);
 
-			auto c = new WriterThread::Close();
-			writer->post(c);
+				auto c = new WriterThread::Close();
+				writer->post(c);
 
-			ThreadFuture<Void> f(new ThreadSingleAssignmentVar<Void>);
-			barriers->push(f);
-			writer->post(new WriterThread::Barrier);
+				ThreadFuture<Void> f(new ThreadSingleAssignmentVar<Void>);
+				barriers->push(f);
+				writer->post(new WriterThread::Barrier);
 
-			f.getBlocking();
+				f.getBlocking();
 
-			opened = false;
+				opened = false;
+			} catch (const std::exception& e) {
+				fprintf(stderr, "Error closing trace file: %s\n", e.what());
+			}
 		}
 	}
 
diff --git a/tests/TestRunner/upgrade_test.py b/tests/TestRunner/upgrade_test.py
index 3948c56533..51700ba8c0 100755
--- a/tests/TestRunner/upgrade_test.py
+++ b/tests/TestRunner/upgrade_test.py
@@ -280,11 +280,13 @@ class UpgradeTest:
             os.close(self.ctrl_pipe)
 
     # Kill the tester process if it is still alive
-    def kill_tester_if_alive(self, workload_thread):
+    def kill_tester_if_alive(self, workload_thread, dump_stacks):
         if not workload_thread.is_alive():
             return
         if self.tester_proc is not None:
             try:
+                if dump_stacks:
+                    os.system("pstack {}".format(self.tester_proc.pid))
                 print("Killing the tester process")
                 self.tester_proc.kill()
                 workload_thread.join(5)
@@ -310,11 +312,11 @@ class UpgradeTest:
         except Exception:
             print("Upgrade test failed")
             print(traceback.format_exc())
-            self.kill_tester_if_alive(workload_thread)
+            self.kill_tester_if_alive(workload_thread, False)
         finally:
             workload_thread.join(5)
             reader_thread.join(5)
-            self.kill_tester_if_alive(workload_thread)
+            self.kill_tester_if_alive(workload_thread, True)
             if test_retcode == 0:
                 test_retcode = self.tester_retcode
         return test_retcode