Improve troubleshooting of FDB client thread shutdown (#8629)

* Upgrade tests: dump thread call stacks of the tester process if it fails to terminate

* ApiTester: log before and after stopping the network thread

* Catch and print exceptions in closeTraceFile; close the trace file at the end of MultiVersionApi::runNetwork

* Change the trace event name for MultiVersionApi::runNetwork termination to MultiVersionRunNetworkTerminating

Co-authored-by: A.J. Beamon <aj.beamon@snowflake.com>

Co-authored-by: A.J. Beamon <aj.beamon@snowflake.com>
This commit is contained in:
Vaidas Gasiunas 2022-11-03 13:20:21 +01:00 committed by GitHub
parent 9de72eb675
commit 18b852c4e4
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 40 additions and 24 deletions

View File

@ -459,8 +459,10 @@ int main(int argc, char** argv) {
retCode = 1; retCode = 1;
} }
fprintf(stderr, "Stopping FDB network thread\n");
fdb_check(fdb::network::stop(), "Failed to stop FDB thread"); fdb_check(fdb::network::stop(), "Failed to stop FDB thread");
network_thread.join(); network_thread.join();
fprintf(stderr, "FDB network thread successfully stopped\n");
} catch (const std::exception& err) { } catch (const std::exception& err) {
fmt::print(stderr, "ERROR: {}\n", err.what()); fmt::print(stderr, "ERROR: {}\n", err.what());
retCode = 1; retCode = 1;

View File

@ -18,6 +18,7 @@
* limitations under the License. * limitations under the License.
*/ */
#include "flow/Trace.h"
#ifdef ADDRESS_SANITIZER #ifdef ADDRESS_SANITIZER
#include <sanitizer/lsan_interface.h> #include <sanitizer/lsan_interface.h>
#endif #endif
@ -2812,11 +2813,19 @@ void MultiVersionApi::runNetwork() {
}); });
} }
localClient->api->runNetwork(); try {
localClient->api->runNetwork();
} catch (const Error& e) {
closeTraceFile();
throw e;
}
for (auto h : handles) { for (auto h : handles) {
waitThread(h); waitThread(h);
} }
TraceEvent("MultiVersionRunNetworkTerminating");
closeTraceFile();
} }
void MultiVersionApi::stopNetwork() { void MultiVersionApi::stopNetwork() {

View File

@ -743,10 +743,10 @@ void ThreadSafeApi::runNetwork() {
Optional<Error> runErr; Optional<Error> runErr;
try { try {
::runNetwork(); ::runNetwork();
} catch (Error& e) { } catch (const Error& e) {
TraceEvent(SevError, "RunNetworkError").error(e); TraceEvent(SevError, "RunNetworkError").error(e);
runErr = e; runErr = e;
} catch (std::exception& e) { } catch (const std::exception& e) {
runErr = unknown_error(); runErr = unknown_error();
TraceEvent(SevError, "RunNetworkError").error(unknown_error()).detail("RootException", e.what()); TraceEvent(SevError, "RunNetworkError").error(unknown_error()).detail("RootException", e.what());
} catch (...) { } catch (...) {
@ -757,9 +757,9 @@ void ThreadSafeApi::runNetwork() {
for (auto& hook : threadCompletionHooks) { for (auto& hook : threadCompletionHooks) {
try { try {
hook.first(hook.second); hook.first(hook.second);
} catch (Error& e) { } catch (const Error& e) {
TraceEvent(SevError, "NetworkShutdownHookError").error(e); TraceEvent(SevError, "NetworkShutdownHookError").error(e);
} catch (std::exception& e) { } catch (const std::exception& e) {
TraceEvent(SevError, "NetworkShutdownHookError").error(unknown_error()).detail("RootException", e.what()); TraceEvent(SevError, "NetworkShutdownHookError").error(unknown_error()).detail("RootException", e.what());
} catch (...) { } catch (...) {
TraceEvent(SevError, "NetworkShutdownHookError").error(unknown_error()); TraceEvent(SevError, "NetworkShutdownHookError").error(unknown_error());
@ -767,12 +767,10 @@ void ThreadSafeApi::runNetwork() {
} }
if (runErr.present()) { if (runErr.present()) {
closeTraceFile();
throw runErr.get(); throw runErr.get();
} }
TraceEvent("RunNetworkTerminating"); TraceEvent("RunNetworkTerminating");
closeTraceFile();
} }
void ThreadSafeApi::stopNetwork() { void ThreadSafeApi::stopNetwork() {

View File

@ -25,6 +25,7 @@
#include "flow/JsonTraceLogFormatter.h" #include "flow/JsonTraceLogFormatter.h"
#include "flow/flow.h" #include "flow/flow.h"
#include "flow/DeterministicRandom.h" #include "flow/DeterministicRandom.h"
#include <exception>
#include <stdlib.h> #include <stdlib.h>
#include <stdarg.h> #include <stdarg.h>
#include <cctype> #include <cctype>
@ -514,25 +515,29 @@ public:
void close() { void close() {
if (opened) { if (opened) {
MutexHolder hold(mutex); try {
MutexHolder hold(mutex);
// Write remaining contents // Write remaining contents
auto a = new WriterThread::WriteBuffer(std::move(eventBuffer)); auto a = new WriterThread::WriteBuffer(std::move(eventBuffer));
loggedLength += bufferLength; loggedLength += bufferLength;
eventBuffer = std::vector<TraceEventFields>(); eventBuffer = std::vector<TraceEventFields>();
bufferLength = 0; bufferLength = 0;
writer->post(a); writer->post(a);
auto c = new WriterThread::Close(); auto c = new WriterThread::Close();
writer->post(c); writer->post(c);
ThreadFuture<Void> f(new ThreadSingleAssignmentVar<Void>); ThreadFuture<Void> f(new ThreadSingleAssignmentVar<Void>);
barriers->push(f); barriers->push(f);
writer->post(new WriterThread::Barrier); writer->post(new WriterThread::Barrier);
f.getBlocking(); f.getBlocking();
opened = false; opened = false;
} catch (const std::exception& e) {
fprintf(stderr, "Error closing trace file: %s\n", e.what());
}
} }
} }

View File

@ -280,11 +280,13 @@ class UpgradeTest:
os.close(self.ctrl_pipe) os.close(self.ctrl_pipe)
# Kill the tester process if it is still alive # Kill the tester process if it is still alive
def kill_tester_if_alive(self, workload_thread): def kill_tester_if_alive(self, workload_thread, dump_stacks):
if not workload_thread.is_alive(): if not workload_thread.is_alive():
return return
if self.tester_proc is not None: if self.tester_proc is not None:
try: try:
if dump_stacks:
os.system("pstack {}".format(self.tester_proc.pid))
print("Killing the tester process") print("Killing the tester process")
self.tester_proc.kill() self.tester_proc.kill()
workload_thread.join(5) workload_thread.join(5)
@ -310,11 +312,11 @@ class UpgradeTest:
except Exception: except Exception:
print("Upgrade test failed") print("Upgrade test failed")
print(traceback.format_exc()) print(traceback.format_exc())
self.kill_tester_if_alive(workload_thread) self.kill_tester_if_alive(workload_thread, False)
finally: finally:
workload_thread.join(5) workload_thread.join(5)
reader_thread.join(5) reader_thread.join(5)
self.kill_tester_if_alive(workload_thread) self.kill_tester_if_alive(workload_thread, True)
if test_retcode == 0: if test_retcode == 0:
test_retcode = self.tester_retcode test_retcode = self.tester_retcode
return test_retcode return test_retcode