Improve troubleshooting of FDB client thread shutdown (#8629)

* Upgrade tests: dump thread call stacks of the tester process if it fails to terminate

* ApiTester: log before and after stopping the network thread

* Catch and print exceptions in closeTraceFile; close the trace file at the end of the multi-version client (MVC) runNetwork

* Change trace event name for MVC runNetwork termination

Co-authored-by: A.J. Beamon <aj.beamon@snowflake.com>

Co-authored-by: A.J. Beamon <aj.beamon@snowflake.com>
This commit is contained in:
Vaidas Gasiunas 2022-11-03 13:20:21 +01:00 committed by GitHub
parent 9de72eb675
commit 18b852c4e4
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 40 additions and 24 deletions

View File

@ -459,8 +459,10 @@ int main(int argc, char** argv) {
retCode = 1;
}
fprintf(stderr, "Stopping FDB network thread\n");
fdb_check(fdb::network::stop(), "Failed to stop FDB thread");
network_thread.join();
fprintf(stderr, "FDB network thread successfully stopped\n");
} catch (const std::exception& err) {
fmt::print(stderr, "ERROR: {}\n", err.what());
retCode = 1;

View File

@ -18,6 +18,7 @@
* limitations under the License.
*/
#include "flow/Trace.h"
#ifdef ADDRESS_SANITIZER
#include <sanitizer/lsan_interface.h>
#endif
@ -2812,11 +2813,19 @@ void MultiVersionApi::runNetwork() {
});
}
try {
localClient->api->runNetwork();
} catch (const Error& e) {
closeTraceFile();
throw e;
}
for (auto h : handles) {
waitThread(h);
}
TraceEvent("MultiVersionRunNetworkTerminating");
closeTraceFile();
}
void MultiVersionApi::stopNetwork() {

View File

@ -743,10 +743,10 @@ void ThreadSafeApi::runNetwork() {
Optional<Error> runErr;
try {
::runNetwork();
} catch (Error& e) {
} catch (const Error& e) {
TraceEvent(SevError, "RunNetworkError").error(e);
runErr = e;
} catch (std::exception& e) {
} catch (const std::exception& e) {
runErr = unknown_error();
TraceEvent(SevError, "RunNetworkError").error(unknown_error()).detail("RootException", e.what());
} catch (...) {
@ -757,9 +757,9 @@ void ThreadSafeApi::runNetwork() {
for (auto& hook : threadCompletionHooks) {
try {
hook.first(hook.second);
} catch (Error& e) {
} catch (const Error& e) {
TraceEvent(SevError, "NetworkShutdownHookError").error(e);
} catch (std::exception& e) {
} catch (const std::exception& e) {
TraceEvent(SevError, "NetworkShutdownHookError").error(unknown_error()).detail("RootException", e.what());
} catch (...) {
TraceEvent(SevError, "NetworkShutdownHookError").error(unknown_error());
@ -767,12 +767,10 @@ void ThreadSafeApi::runNetwork() {
}
if (runErr.present()) {
closeTraceFile();
throw runErr.get();
}
TraceEvent("RunNetworkTerminating");
closeTraceFile();
}
void ThreadSafeApi::stopNetwork() {

View File

@ -25,6 +25,7 @@
#include "flow/JsonTraceLogFormatter.h"
#include "flow/flow.h"
#include "flow/DeterministicRandom.h"
#include <exception>
#include <stdlib.h>
#include <stdarg.h>
#include <cctype>
@ -514,6 +515,7 @@ public:
void close() {
if (opened) {
try {
MutexHolder hold(mutex);
// Write remaining contents
@ -533,6 +535,9 @@ public:
f.getBlocking();
opened = false;
} catch (const std::exception& e) {
fprintf(stderr, "Error closing trace file: %s\n", e.what());
}
}
}

View File

@ -280,11 +280,13 @@ class UpgradeTest:
os.close(self.ctrl_pipe)
# Kill the tester process if it is still alive
def kill_tester_if_alive(self, workload_thread):
def kill_tester_if_alive(self, workload_thread, dump_stacks):
if not workload_thread.is_alive():
return
if self.tester_proc is not None:
try:
if dump_stacks:
os.system("pstack {}".format(self.tester_proc.pid))
print("Killing the tester process")
self.tester_proc.kill()
workload_thread.join(5)
@ -310,11 +312,11 @@ class UpgradeTest:
except Exception:
print("Upgrade test failed")
print(traceback.format_exc())
self.kill_tester_if_alive(workload_thread)
self.kill_tester_if_alive(workload_thread, False)
finally:
workload_thread.join(5)
reader_thread.join(5)
self.kill_tester_if_alive(workload_thread)
self.kill_tester_if_alive(workload_thread, True)
if test_retcode == 0:
test_retcode = self.tester_retcode
return test_retcode