mirror of
https://github.com/apple/foundationdb.git
synced 2025-05-14 18:02:31 +08:00
Improving troubleshooting of stopping the FDB client thread (#8629)
* Upgrade tests: dump thread call stacks of the tester process if it fails to terminate
* ApiTester: log before and after stopping the network thread
* Catch and print exceptions in closeTraceFile; Close trace file at the end of MVC runNetwork
* Change trace event name for MVC runNetwork termination

Co-authored-by: A.J. Beamon <aj.beamon@snowflake.com>
Co-authored-by: A.J. Beamon <aj.beamon@snowflake.com>
This commit is contained in:
parent
9de72eb675
commit
18b852c4e4
@ -459,8 +459,10 @@ int main(int argc, char** argv) {
|
|||||||
retCode = 1;
|
retCode = 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fprintf(stderr, "Stopping FDB network thread\n");
|
||||||
fdb_check(fdb::network::stop(), "Failed to stop FDB thread");
|
fdb_check(fdb::network::stop(), "Failed to stop FDB thread");
|
||||||
network_thread.join();
|
network_thread.join();
|
||||||
|
fprintf(stderr, "FDB network thread successfully stopped\n");
|
||||||
} catch (const std::exception& err) {
|
} catch (const std::exception& err) {
|
||||||
fmt::print(stderr, "ERROR: {}\n", err.what());
|
fmt::print(stderr, "ERROR: {}\n", err.what());
|
||||||
retCode = 1;
|
retCode = 1;
|
||||||
|
@ -18,6 +18,7 @@
|
|||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
#include "flow/Trace.h"
|
||||||
#ifdef ADDRESS_SANITIZER
|
#ifdef ADDRESS_SANITIZER
|
||||||
#include <sanitizer/lsan_interface.h>
|
#include <sanitizer/lsan_interface.h>
|
||||||
#endif
|
#endif
|
||||||
@ -2812,11 +2813,19 @@ void MultiVersionApi::runNetwork() {
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
localClient->api->runNetwork();
|
localClient->api->runNetwork();
|
||||||
|
} catch (const Error& e) {
|
||||||
|
closeTraceFile();
|
||||||
|
throw e;
|
||||||
|
}
|
||||||
|
|
||||||
for (auto h : handles) {
|
for (auto h : handles) {
|
||||||
waitThread(h);
|
waitThread(h);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
TraceEvent("MultiVersionRunNetworkTerminating");
|
||||||
|
closeTraceFile();
|
||||||
}
|
}
|
||||||
|
|
||||||
void MultiVersionApi::stopNetwork() {
|
void MultiVersionApi::stopNetwork() {
|
||||||
|
@ -743,10 +743,10 @@ void ThreadSafeApi::runNetwork() {
|
|||||||
Optional<Error> runErr;
|
Optional<Error> runErr;
|
||||||
try {
|
try {
|
||||||
::runNetwork();
|
::runNetwork();
|
||||||
} catch (Error& e) {
|
} catch (const Error& e) {
|
||||||
TraceEvent(SevError, "RunNetworkError").error(e);
|
TraceEvent(SevError, "RunNetworkError").error(e);
|
||||||
runErr = e;
|
runErr = e;
|
||||||
} catch (std::exception& e) {
|
} catch (const std::exception& e) {
|
||||||
runErr = unknown_error();
|
runErr = unknown_error();
|
||||||
TraceEvent(SevError, "RunNetworkError").error(unknown_error()).detail("RootException", e.what());
|
TraceEvent(SevError, "RunNetworkError").error(unknown_error()).detail("RootException", e.what());
|
||||||
} catch (...) {
|
} catch (...) {
|
||||||
@ -757,9 +757,9 @@ void ThreadSafeApi::runNetwork() {
|
|||||||
for (auto& hook : threadCompletionHooks) {
|
for (auto& hook : threadCompletionHooks) {
|
||||||
try {
|
try {
|
||||||
hook.first(hook.second);
|
hook.first(hook.second);
|
||||||
} catch (Error& e) {
|
} catch (const Error& e) {
|
||||||
TraceEvent(SevError, "NetworkShutdownHookError").error(e);
|
TraceEvent(SevError, "NetworkShutdownHookError").error(e);
|
||||||
} catch (std::exception& e) {
|
} catch (const std::exception& e) {
|
||||||
TraceEvent(SevError, "NetworkShutdownHookError").error(unknown_error()).detail("RootException", e.what());
|
TraceEvent(SevError, "NetworkShutdownHookError").error(unknown_error()).detail("RootException", e.what());
|
||||||
} catch (...) {
|
} catch (...) {
|
||||||
TraceEvent(SevError, "NetworkShutdownHookError").error(unknown_error());
|
TraceEvent(SevError, "NetworkShutdownHookError").error(unknown_error());
|
||||||
@ -767,12 +767,10 @@ void ThreadSafeApi::runNetwork() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (runErr.present()) {
|
if (runErr.present()) {
|
||||||
closeTraceFile();
|
|
||||||
throw runErr.get();
|
throw runErr.get();
|
||||||
}
|
}
|
||||||
|
|
||||||
TraceEvent("RunNetworkTerminating");
|
TraceEvent("RunNetworkTerminating");
|
||||||
closeTraceFile();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void ThreadSafeApi::stopNetwork() {
|
void ThreadSafeApi::stopNetwork() {
|
||||||
|
@ -25,6 +25,7 @@
|
|||||||
#include "flow/JsonTraceLogFormatter.h"
|
#include "flow/JsonTraceLogFormatter.h"
|
||||||
#include "flow/flow.h"
|
#include "flow/flow.h"
|
||||||
#include "flow/DeterministicRandom.h"
|
#include "flow/DeterministicRandom.h"
|
||||||
|
#include <exception>
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#include <stdarg.h>
|
#include <stdarg.h>
|
||||||
#include <cctype>
|
#include <cctype>
|
||||||
@ -514,6 +515,7 @@ public:
|
|||||||
|
|
||||||
void close() {
|
void close() {
|
||||||
if (opened) {
|
if (opened) {
|
||||||
|
try {
|
||||||
MutexHolder hold(mutex);
|
MutexHolder hold(mutex);
|
||||||
|
|
||||||
// Write remaining contents
|
// Write remaining contents
|
||||||
@ -533,6 +535,9 @@ public:
|
|||||||
f.getBlocking();
|
f.getBlocking();
|
||||||
|
|
||||||
opened = false;
|
opened = false;
|
||||||
|
} catch (const std::exception& e) {
|
||||||
|
fprintf(stderr, "Error closing trace file: %s\n", e.what());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -280,11 +280,13 @@ class UpgradeTest:
|
|||||||
os.close(self.ctrl_pipe)
|
os.close(self.ctrl_pipe)
|
||||||
|
|
||||||
# Kill the tester process if it is still alive
|
# Kill the tester process if it is still alive
|
||||||
def kill_tester_if_alive(self, workload_thread):
|
def kill_tester_if_alive(self, workload_thread, dump_stacks):
|
||||||
if not workload_thread.is_alive():
|
if not workload_thread.is_alive():
|
||||||
return
|
return
|
||||||
if self.tester_proc is not None:
|
if self.tester_proc is not None:
|
||||||
try:
|
try:
|
||||||
|
if dump_stacks:
|
||||||
|
os.system("pstack {}".format(self.tester_proc.pid))
|
||||||
print("Killing the tester process")
|
print("Killing the tester process")
|
||||||
self.tester_proc.kill()
|
self.tester_proc.kill()
|
||||||
workload_thread.join(5)
|
workload_thread.join(5)
|
||||||
@ -310,11 +312,11 @@ class UpgradeTest:
|
|||||||
except Exception:
|
except Exception:
|
||||||
print("Upgrade test failed")
|
print("Upgrade test failed")
|
||||||
print(traceback.format_exc())
|
print(traceback.format_exc())
|
||||||
self.kill_tester_if_alive(workload_thread)
|
self.kill_tester_if_alive(workload_thread, False)
|
||||||
finally:
|
finally:
|
||||||
workload_thread.join(5)
|
workload_thread.join(5)
|
||||||
reader_thread.join(5)
|
reader_thread.join(5)
|
||||||
self.kill_tester_if_alive(workload_thread)
|
self.kill_tester_if_alive(workload_thread, True)
|
||||||
if test_retcode == 0:
|
if test_retcode == 0:
|
||||||
test_retcode = self.tester_retcode
|
test_retcode = self.tester_retcode
|
||||||
return test_retcode
|
return test_retcode
|
||||||
|
Loading…
x
Reference in New Issue
Block a user