From 0559bd3c6f0d5628e3162977ccd7cece7f7727be Mon Sep 17 00:00:00 2001 From: Ata E Husain Bohra Date: Tue, 23 Aug 2022 09:08:24 -0700 Subject: [PATCH] RegisterWorker to cancel existing watcher if reusing the worker (#7949) Description Register worker maintains the 'id_worker' map to track worker interface registration; it also sets up a 'WorkerAvailabilityWatcher' actor to monitor the worker for failures. However, if a worker is already registered but its interface gets updated, the existing code doesn't actively cancel the old watcher. One possible race condition is that the old watcher detects a worker failure and removes the worker from the id_worker map even before the new watcher starts monitoring the new interface. If such a scenario is hit, it trips an assert in the rebootAndCheck routine, which expects the worker to be present in the id_worker map. This patch addresses the race condition by actively cancelling the existing watcher actor before registering the new watcher actor. Testing devRunCorrectness - 100K --- fdbserver/ClusterController.actor.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fdbserver/ClusterController.actor.cpp b/fdbserver/ClusterController.actor.cpp index 400259d521..0ab1ef622e 100644 --- a/fdbserver/ClusterController.actor.cpp +++ b/fdbserver/ClusterController.actor.cpp @@ -802,6 +802,7 @@ void checkOutstandingRequests(ClusterControllerData* self) { ACTOR Future rebootAndCheck(ClusterControllerData* cluster, Optional> processID) { { + ASSERT(processID.present()); auto watcher = cluster->id_worker.find(processID); ASSERT(watcher != cluster->id_worker.end()); @@ -1245,6 +1246,10 @@ ACTOR Future registerWorker(RegisterWorkerRequest req, if (info->second.details.interf.id() != w.id()) { self->removedDBInfoEndpoints.insert(info->second.details.interf.updateServerDBInfo.getEndpoint()); info->second.details.interf = w; + // Cancel the existing watcher actor; possible race condition could be, the older registered watcher + // detects failures and removes the worker from id_worker 
even before the new watcher starts monitoring the + // new interface + info->second.watcher.cancel(); info->second.watcher = workerAvailabilityWatch(w, newProcessClass, self); } if (req.requestDbInfo) {