Cleaning up debugging and fixing race in blob manager recruitment

Repository: https://github.com/apple/foundationdb.git
Commit: 0f9e88572a (parent: a85b578d2b)
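
Note on the fix: previously the worker tracked the blob manager's epoch in a standalone `myBMEpoch` variable, separate from the `AsyncVar` holding the manager's interface, and the generic `setWhenDoneOrError` cleared that `AsyncVar` unconditionally when the manager's actor finished. If a newer manager (higher epoch) had meanwhile been recruited on the same worker, the stale manager's cleanup would wipe the newer manager's registration. The patch stores epoch and interface together in one `AsyncVar<Optional<std::pair<int64_t, BlobManagerInterface>>>` and guards the reset on an epoch match. A standalone plain-C++ analogue of that invariant (hypothetical names, no flow runtime, illustration only):

// Minimal sketch: a slot holding (epoch, value) may only be cleared by the
// owner of that epoch, mirroring resetBlobManagerWhenDoneOrError below.
#include <cassert>
#include <cstdint>
#include <optional>
#include <string>
#include <utility>

using Slot = std::optional<std::pair<int64_t, std::string>>;

void resetIfEpochMatches(Slot& slot, int64_t epoch) {
    if (slot && slot->first == epoch) {
        slot.reset(); // only the manager that owns this epoch clears the slot
    }
}

int main() {
    Slot slot = std::pair<int64_t, std::string>(5, "bm-epoch-5");
    slot = std::pair<int64_t, std::string>(6, "bm-epoch-6"); // newer manager replaces it
    resetIfEpochMatches(slot, 5); // stale cleanup from epoch 5: no-op
    assert(slot && slot->first == 6);
    resetIfEpochMatches(slot, 6); // owner's cleanup clears the slot
    assert(!slot);
    return 0;
}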
@@ -2735,17 +2735,18 @@ ACTOR Future<Void> blobManager(BlobManagerInterface bmInterf,
             if (BM_DEBUG) {
                 fmt::print("BM {} exiting because it is replaced\n", self->epoch);
             }
+            TraceEvent("BlobManagerReplaced", bmInterf.id()).detail("Epoch", epoch);
             break;
         }
         when(HaltBlobManagerRequest req = waitNext(bmInterf.haltBlobManager.getFuture())) {
             req.reply.send(Void());
-            TraceEvent("BlobManagerHalted", bmInterf.id()).detail("ReqID", req.requesterID);
+            TraceEvent("BlobManagerHalted", bmInterf.id()).detail("Epoch", epoch).detail("ReqID", req.requesterID);
             break;
         }
         when(state HaltBlobGranulesRequest req = waitNext(bmInterf.haltBlobGranules.getFuture())) {
             wait(haltBlobGranules(self));
             req.reply.send(Void());
-            TraceEvent("BlobGranulesHalted", bmInterf.id()).detail("ReqID", req.requesterID);
+            TraceEvent("BlobGranulesHalted", bmInterf.id()).detail("Epoch", epoch).detail("ReqID", req.requesterID);
             break;
         }
         when(BlobManagerExclusionSafetyCheckRequest exclCheckReq =
@@ -2753,7 +2754,7 @@ ACTOR Future<Void> blobManager(BlobManagerInterface bmInterf,
             blobManagerExclusionSafetyCheck(self, exclCheckReq);
         }
         when(wait(collection)) {
-            TraceEvent("BlobManagerActorCollectionError");
+            TraceEvent(SevError, "BlobManagerActorCollectionError");
             ASSERT(false);
             throw internal_error();
         }
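
Note on the two hunks above: the "cleaning up debugging" half of the commit tags the replace/halt trace events with the manager's epoch, so overlapping managers on one worker can be told apart in the logs, and promotes the actor-collection branch to SevError. That branch should never fire: the collected child actors are expected to run until they throw or are cancelled, so `collection` completing normally indicates a logic error. A hedged sketch of the surrounding shape, assuming the usual `actorCollection`/`addActor` pattern from the rest of the file (only the `when` body is confirmed by this diff):

state Future<Void> collection = actorCollection(self->addActor.getFuture());
loop choose {
    // ... request handlers from the hunks above ...
    when(wait(collection)) {
        TraceEvent(SevError, "BlobManagerActorCollectionError");
        ASSERT(false);          // fails fast in simulation
        throw internal_error(); // still fatal in release builds
    }
}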
@@ -2123,6 +2123,7 @@ ACTOR Future<int64_t> getNextBMEpoch(ClusterControllerData* self) {
             tr->set(blobManagerEpochKey, blobManagerEpochValueFor(newEpoch));

             wait(tr->commit());
+            TraceEvent(SevDebug, "CCNextBlobManagerEpoch", self->id).detail("Epoch", newEpoch);
             return newEpoch;
         } catch (Error& e) {
             wait(tr->onError(e));
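
Note on this hunk: the cluster controller now logs each epoch it hands out, which pairs with the "Epoch" details added to the manager-side events above. The hunk shows only the tail of the transaction; a hedged sketch of the full retry loop it presumably sits in (the read, the `ACCESS_SYSTEM_KEYS` option, and the `decodeBlobManagerEpochValue` helper are assumptions; only the set/commit/trace/return and the `onError` retry are confirmed by this diff):

ACTOR Future<int64_t> getNextBMEpochSketch(ClusterControllerData* self) {
    state Reference<ReadYourWritesTransaction> tr = makeReference<ReadYourWritesTransaction>(self->cx);
    loop {
        try {
            tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); // assumed
            Optional<Value> oldEpoch = wait(tr->get(blobManagerEpochKey)); // assumed read
            state int64_t newEpoch = oldEpoch.present() ? decodeBlobManagerEpochValue(oldEpoch.get()) + 1 : 1;
            tr->set(blobManagerEpochKey, blobManagerEpochValueFor(newEpoch));
            wait(tr->commit());
            TraceEvent(SevDebug, "CCNextBlobManagerEpoch", self->id).detail("Epoch", newEpoch);
            return newEpoch;
        } catch (Error& e) {
            wait(tr->onError(e)); // standard FDB retry: resets the transaction and backs off
        }
    }
}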
@@ -524,20 +524,21 @@ std::vector<DiskStore> getDiskStores(std::string folder) {

 // Register the worker interf to cluster controller (cc) and
 // re-register the worker when key roles interface, e.g., cc, dd, ratekeeper, change.
-ACTOR Future<Void> registrationClient(Reference<AsyncVar<Optional<ClusterControllerFullInterface>> const> ccInterface,
-                                      WorkerInterface interf,
-                                      Reference<AsyncVar<ClusterControllerPriorityInfo>> asyncPriorityInfo,
-                                      ProcessClass initialClass,
-                                      Reference<AsyncVar<Optional<DataDistributorInterface>> const> ddInterf,
-                                      Reference<AsyncVar<Optional<RatekeeperInterface>> const> rkInterf,
-                                      Reference<AsyncVar<Optional<BlobManagerInterface>> const> bmInterf,
-                                      Reference<AsyncVar<Optional<EncryptKeyProxyInterface>> const> ekpInterf,
-                                      Reference<AsyncVar<bool> const> degraded,
-                                      Reference<IClusterConnectionRecord> connRecord,
-                                      Reference<AsyncVar<std::set<std::string>> const> issues,
-                                      Reference<ConfigNode> configNode,
-                                      Reference<LocalConfiguration> localConfig,
-                                      Reference<AsyncVar<ServerDBInfo>> dbInfo) {
+ACTOR Future<Void> registrationClient(
+    Reference<AsyncVar<Optional<ClusterControllerFullInterface>> const> ccInterface,
+    WorkerInterface interf,
+    Reference<AsyncVar<ClusterControllerPriorityInfo>> asyncPriorityInfo,
+    ProcessClass initialClass,
+    Reference<AsyncVar<Optional<DataDistributorInterface>> const> ddInterf,
+    Reference<AsyncVar<Optional<RatekeeperInterface>> const> rkInterf,
+    Reference<AsyncVar<Optional<std::pair<int64_t, BlobManagerInterface>>> const> bmInterf,
+    Reference<AsyncVar<Optional<EncryptKeyProxyInterface>> const> ekpInterf,
+    Reference<AsyncVar<bool> const> degraded,
+    Reference<IClusterConnectionRecord> connRecord,
+    Reference<AsyncVar<std::set<std::string>> const> issues,
+    Reference<ConfigNode> configNode,
+    Reference<LocalConfiguration> localConfig,
+    Reference<AsyncVar<ServerDBInfo>> dbInfo) {
     // Keeps the cluster controller (as it may be re-elected) informed that this worker exists
     // The cluster controller uses waitFailureClient to find out if we die, and returns from registrationReply
     // (requiring us to re-register) The registration request piggybacks optional distributor interface if it exists.
@@ -567,7 +568,8 @@ ACTOR Future<Void> registrationClient(Reference<AsyncVar<Optional<ClusterControl
             requestGeneration++,
             ddInterf->get(),
             rkInterf->get(),
-            bmInterf->get(),
+            bmInterf->get().present() ? bmInterf->get().get().second
+                                      : Optional<BlobManagerInterface>(),
             ekpInterf->get(),
             degraded->get(),
             localConfig->lastSeenVersion(),
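
Note on these hunks: the registration path is the one consumer that still wants a plain `Optional<BlobManagerInterface>`, since the cluster controller does not need the worker-local epoch; the inline ternary strips it before the request is sent. Written out as a hypothetical helper (not in the patch), the unwrap is just:

// Hypothetical helper equivalent to the inline ternary above; the epoch
// half of the pair never leaves the worker.
Optional<BlobManagerInterface> interfaceForRegistration(
    Reference<AsyncVar<Optional<std::pair<int64_t, BlobManagerInterface>>> const> bmInterf) {
    return bmInterf->get().present() ? bmInterf->get().get().second : Optional<BlobManagerInterface>();
}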
@@ -1374,6 +1376,24 @@ ACTOR Future<Void> chaosMetricsLogger() {
     }
 }

+// like genericactors setWhenDoneOrError, but we need to take into account the bm epoch. We don't want to reset it if
+// this manager was replaced by a later manager (with a higher epoch) on this worker
+ACTOR Future<Void> resetBlobManagerWhenDoneOrError(
+    Future<Void> blobManagerProcess,
+    Reference<AsyncVar<Optional<std::pair<int64_t, BlobManagerInterface>>>> var,
+    int64_t epoch) {
+    try {
+        wait(blobManagerProcess);
+    } catch (Error& e) {
+        if (e.code() == error_code_actor_cancelled)
+            throw;
+    }
+    if (var->get().present() && var->get().get().first == epoch) {
+        var->set(Optional<std::pair<int64_t, BlobManagerInterface>>());
+    }
+    return Void();
+}
+
 ACTOR Future<Void> workerServer(Reference<IClusterConnectionRecord> connRecord,
                                 Reference<AsyncVar<Optional<ClusterControllerFullInterface>> const> ccInterface,
                                 LocalityData locality,
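
Note on this hunk: the epoch guard in resetBlobManagerWhenDoneOrError is the actual race fix. With the generic setWhenDoneOrError the reset was unconditional, allowing this interleaving (an illustrative schedule, not a logged trace):

// t0  worker recruited for epoch 5           -> var = (5, bm5)
// t1  epoch-6 manager recruited, same worker -> var = (6, bm6)
// t2  epoch-5 blobManagerProcess finishes; an unconditional reset would
//     set var = {}, un-registering the live epoch-6 manager
// With the guard, t2 compares epoch 5 against the stored epoch 6 and is a
// no-op; only the manager that owns the stored epoch can clear the slot.

Like the generic setWhenDoneOrError it is modeled on, the actor swallows non-cancellation errors from the manager process; its only job is to decide whether to clear the AsyncVar.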
@@ -1395,8 +1415,8 @@ ACTOR Future<Void> workerServer(Reference<IClusterConnectionRecord> connRecord,
     state Reference<AsyncVar<Optional<DataDistributorInterface>>> ddInterf(
         new AsyncVar<Optional<DataDistributorInterface>>());
     state Reference<AsyncVar<Optional<RatekeeperInterface>>> rkInterf(new AsyncVar<Optional<RatekeeperInterface>>());
-    state Reference<AsyncVar<Optional<BlobManagerInterface>>> bmInterf(new AsyncVar<Optional<BlobManagerInterface>>());
-    state int64_t myBMEpoch = -1;
+    state Reference<AsyncVar<Optional<std::pair<int64_t, BlobManagerInterface>>>> bmEpochAndInterf(
+        new AsyncVar<Optional<std::pair<int64_t, BlobManagerInterface>>>());
     state Reference<AsyncVar<Optional<EncryptKeyProxyInterface>>> ekpInterf(
         new AsyncVar<Optional<EncryptKeyProxyInterface>>());
     state Future<Void> handleErrors = workerHandleErrors(errors.getFuture()); // Needs to be stopped last
@@ -1672,7 +1692,7 @@ ACTOR Future<Void> workerServer(Reference<IClusterConnectionRecord> connRecord,
         initialClass,
         ddInterf,
         rkInterf,
-        bmInterf,
+        bmEpochAndInterf,
         ekpInterf,
         degraded,
         connRecord,
@@ -1874,8 +1894,8 @@ ACTOR Future<Void> workerServer(Reference<IClusterConnectionRecord> connRecord,
             BlobManagerInterface recruited(locality, req.reqId);
             recruited.initEndpoints();

-            if (bmInterf->get().present() && myBMEpoch == req.epoch) {
-                recruited = bmInterf->get().get();
+            if (bmEpochAndInterf->get().present() && bmEpochAndInterf->get().get().first == req.epoch) {
+                recruited = bmEpochAndInterf->get().get().second;

                 TEST(true); // Recruited while already a blob manager.
             } else {
@@ -1884,7 +1904,6 @@ ACTOR Future<Void> workerServer(Reference<IClusterConnectionRecord> connRecord,
                 // Also, not halting lets us handle the case here where the last BM had a higher
                 // epoch and somehow the epochs got out of order by a delayed initialize request. The one we start
                 // here will just halt on the lock check.
-                myBMEpoch = req.epoch;
                 startRole(Role::BLOB_MANAGER, recruited.id(), interf.id());
                 DUMPTOKEN(recruited.waitFailure);
                 DUMPTOKEN(recruited.haltBlobManager);
@@ -1892,12 +1911,13 @@ ACTOR Future<Void> workerServer(Reference<IClusterConnectionRecord> connRecord,
                 DUMPTOKEN(recruited.blobManagerExclCheckReq);

                 Future<Void> blobManagerProcess = blobManager(recruited, dbInfo, req.epoch);
-                errorForwarders.add(forwardError(
-                    errors,
-                    Role::BLOB_MANAGER,
-                    recruited.id(),
-                    setWhenDoneOrError(blobManagerProcess, bmInterf, Optional<BlobManagerInterface>())));
-                bmInterf->set(Optional<BlobManagerInterface>(recruited));
+                errorForwarders.add(
+                    forwardError(errors,
+                                 Role::BLOB_MANAGER,
+                                 recruited.id(),
+                                 resetBlobManagerWhenDoneOrError(blobManagerProcess, bmEpochAndInterf, req.epoch)));
+                bmEpochAndInterf->set(
+                    Optional<std::pair<int64_t, BlobManagerInterface>>(std::pair(req.epoch, recruited)));
             }
             TraceEvent("BlobManagerReceived", req.reqId).detail("BlobManagerId", recruited.id());
             req.reply.send(recruited);
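
Note on the last hunks: with the pair stored atomically in one AsyncVar, the stale `myBMEpoch` worker variable disappears and both the duplicate-recruitment check and the eventual reset key off `req.epoch`. A condensed sketch of the worker-side recruitment path after this patch (the `InitializeBlobManagerRequest` stream name is taken from the surrounding FoundationDB worker code, not from this diff; elided details are marked as comments rather than invented):

when(InitializeBlobManagerRequest req = waitNext(interf.blobManager.getFuture())) {
    BlobManagerInterface recruited(locality, req.reqId);
    recruited.initEndpoints();
    if (bmEpochAndInterf->get().present() && bmEpochAndInterf->get().get().first == req.epoch) {
        // Duplicate request for the same epoch: answer with the existing interface.
        recruited = bmEpochAndInterf->get().get().second;
    } else {
        // New epoch: start the role (startRole/DUMPTOKEN calls elided) and
        // publish (epoch, interface) in a single AsyncVar update.
        Future<Void> blobManagerProcess = blobManager(recruited, dbInfo, req.epoch);
        errorForwarders.add(forwardError(
            errors,
            Role::BLOB_MANAGER,
            recruited.id(),
            resetBlobManagerWhenDoneOrError(blobManagerProcess, bmEpochAndInterf, req.epoch)));
        bmEpochAndInterf->set(
            Optional<std::pair<int64_t, BlobManagerInterface>>(std::pair(req.epoch, recruited)));
    }
    req.reply.send(recruited);
}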