Fix orphaned storage server due to force recovery (#6914)
* Fix orphaned storage server due to force recovery

  Force recovery can roll back the transaction that adds a storage server, yet the
  storage server may already be at a version B greater than the recovery version A.
  As a result, its peek to the buddy TLog never returns TLogPeekReply::popped to
  trigger its exit; instead it gets back a higher version C > B. To the storage
  server this looks like an empty message, so it never removes itself and keeps
  peeking. The fix: instead of using the recovery version as the popped version for
  the SS, use the recovery transaction version, i.e., the version of the first
  transaction after the recovery. Force recovery bumps this version far above the
  SS's version, so the TLog sets TLogPeekReply::popped and the storage server exits.

* Fix TLog peek to disallow returning an empty message between recoveredAt and the
  recovery txn version

  Today this contract is not explicitly enforced and can cause a storage server to
  fail the assertion "rollbackVersion >= data->storageVersion()": if such an empty
  version is returned, the SS may advance its storage version beyond the rollback
  version set in the recovery transaction. The fix is to block the peek reply until
  the recovery transaction has been received.

* Move recoveryTxnReceived to be per LogData

  A shared TLog can have a first-generation LogData that has already set the
  promise, so later generations would never wait for the recovery version. For the
  current generation all peeks need to wait; older generations (detected by checking
  whether they are stopped) do not need to wait.

* For the initial commit, poppedVersion needs to be at least 2, to get rid of the
  seed storage servers recruited by a previous unsuccessful recovery.
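To illustrate the core of the first fix, here is a minimal, self-contained sketch of the popped-version rule the diff below changes. The `LogData` struct here is a simplified stand-in and `poppedVersionForFreshTag` is a hypothetical helper, not the actual FDB declarations; `recoveredAt` and `recoveryTxnVersion` mirror the fields added in the diff.

```cpp
#include <algorithm>
#include <cstdint>

using Version = int64_t;

// Simplified stand-in for the per-generation TLog state (see LogData in the diff below).
struct LogData {
	Version recoveredAt;        // last version of the previous TLog generation
	Version recoveryTxnVersion; // version of the first transaction after recovery; force
	                            // recovery bumps this far above an orphaned SS's version
};

// Popped version reported for a non-txs tag that has no per-tag popped data yet.
// Before the fix the TLog reported recoveredAt + 1, which an orphaned storage server
// already at version B > recoveredAt would never see as "popped". Taking the max with
// recoveryTxnVersion guarantees TLogPeekReply::popped exceeds the SS's version, so its
// peek reports popped data and the SS removes itself (worker_removed).
Version poppedVersionForFreshTag(const LogData& self) {
	return std::max(self.recoveredAt + 1, self.recoveryTxnVersion);
}
```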
parent fa2e85f1d3
commit 05e63bc703
@@ -603,7 +603,12 @@ Future<Void> logRouterPeekMessages(PromiseType replyPromise,
 }
 replyPromise.send(reply);
-//TraceEvent("LogRouterPeek4", self->dbgid);
+DisabledTraceEvent("LogRouterPeek4", self->dbgid)
+    .detail("Tag", reqTag.toString())
+    .detail("ReqBegin", reqBegin)
+    .detail("End", reply.end)
+    .detail("MessageSize", reply.messages.size())
+    .detail("PoppedVersion", self->poppedVersion);
 return Void();
 }
@@ -58,6 +58,7 @@ ILogSystem::ServerPeekCursor::ServerPeekCursor(Reference<AsyncVar<OptionalInterf
this->results.minKnownCommittedVersion = 0;
DisabledTraceEvent(SevDebug, "SPC_Starting", randomID)
    .detail("Tag", tag.toString())
    .detail("UsePeekStream", usePeekStream)
    .detail("Begin", begin)
    .detail("End", end);
}
@@ -567,6 +567,8 @@ struct LogData : NonCopyable, public ReferenceCounted<LogData> {
 TLogData* tLogData;
 Promise<Void> recoveryComplete, committingQueue;
 Version unrecoveredBefore, recoveredAt;
+Version recoveryTxnVersion;
+Promise<Void> recoveryTxnReceived;

 struct PeekTrackerData {
 std::map<int, Promise<std::pair<Version, bool>>>
@@ -646,10 +648,11 @@ struct LogData : NonCopyable, public ReferenceCounted<LogData> {
 blockingPeekTimeouts("BlockingPeekTimeouts", cc), emptyPeeks("EmptyPeeks", cc),
 nonEmptyPeeks("NonEmptyPeeks", cc), logId(interf.id()), protocolVersion(protocolVersion),
 newPersistentDataVersion(invalidVersion), tLogData(tLogData), unrecoveredBefore(1), recoveredAt(1),
-logSystem(new AsyncVar<Reference<ILogSystem>>()), remoteTag(remoteTag), isPrimary(isPrimary),
-logRouterTags(logRouterTags), logRouterPoppedVersion(0), logRouterPopToVersion(0), locality(tagLocalityInvalid),
-recruitmentID(recruitmentID), logSpillType(logSpillType), allTags(tags.begin(), tags.end()),
-terminated(tLogData->terminated.getFuture()), execOpCommitInProgress(false), txsTags(txsTags) {
+recoveryTxnVersion(1), logSystem(new AsyncVar<Reference<ILogSystem>>()), remoteTag(remoteTag),
+isPrimary(isPrimary), logRouterTags(logRouterTags), logRouterPoppedVersion(0), logRouterPopToVersion(0),
+locality(tagLocalityInvalid), recruitmentID(recruitmentID), logSpillType(logSpillType),
+allTags(tags.begin(), tags.end()), terminated(tLogData->terminated.getFuture()), execOpCommitInProgress(false),
+txsTags(txsTags) {
 startRole(Role::TRANSACTION_LOG,
 interf.id(),
 tLogData->workerID,
@@ -1565,7 +1568,7 @@ Version poppedVersion(Reference<LogData> self, Tag tag) {
 if (tag == txsTag || tag.locality == tagLocalityTxs) {
 return 0;
 }
-return self->recoveredAt + 1;
+return std::max(self->recoveredAt + 1, self->recoveryTxnVersion);
 }
 return tagData->popped;
 }
@@ -1743,12 +1746,24 @@ Future<Void> tLogPeekMessages(PromiseType replyPromise,
 return Void();
 }

-//TraceEvent("TLogPeekMessages0", self->dbgid).detail("ReqBeginEpoch", reqBegin.epoch).detail("ReqBeginSeq", reqBegin.sequence).detail("Epoch", self->epoch()).detail("PersistentDataSeq", self->persistentDataSequence).detail("Tag1", reqTag1).detail("Tag2", reqTag2);
+DisabledTraceEvent("TLogPeekMessages0", self->dbgid)
+    .detail("LogId", logData->logId)
+    .detail("Tag", reqTag.toString())
+    .detail("ReqBegin", reqBegin)
+    .detail("Version", logData->version.get())
+    .detail("RecoveredAt", logData->recoveredAt);
 // Wait until we have something to return that the caller doesn't already have
 if (logData->version.get() < reqBegin) {
 wait(logData->version.whenAtLeast(reqBegin));
 wait(delay(SERVER_KNOBS->TLOG_PEEK_DELAY, g_network->getCurrentTask()));
 }
+if (!logData->stopped && reqTag.locality != tagLocalityTxs && reqTag != txsTag) {
+    // Make sure the peek reply has the recovery txn for the current TLog.
+    // Older generation TLog has been stopped and doesn't wait here.
+    // Similarly during recovery, reading transaction state store
+    // doesn't wait here.
+    wait(logData->recoveryTxnReceived.getFuture());
+}

 if (logData->locality != tagLocalitySatellite && reqTag.locality == tagLocalityLogRouter) {
 wait(self->concurrentLogRouterReads.take());
@@ -1788,6 +1803,11 @@ Future<Void> tLogPeekMessages(PromiseType replyPromise,
 poppedVer = poppedVersion(logData, reqTag);
 }

+DisabledTraceEvent("TLogPeekMessages1", self->dbgid)
+    .detail("LogId", logData->logId)
+    .detail("Tag", reqTag.toString())
+    .detail("ReqBegin", reqBegin)
+    .detail("PoppedVer", poppedVer);
 if (poppedVer > reqBegin) {
 TLogPeekReply rep;
 rep.maxKnownVersion = logData->version.get();
@@ -1832,7 +1852,9 @@ Future<Void> tLogPeekMessages(PromiseType replyPromise,
 onlySpilled = false;

 // grab messages from disk
-//TraceEvent("TLogPeekMessages", self->dbgid).detail("ReqBeginEpoch", reqBegin.epoch).detail("ReqBeginSeq", reqBegin.sequence).detail("Epoch", self->epoch()).detail("PersistentDataSeq", self->persistentDataSequence).detail("Tag1", reqTag1).detail("Tag2", reqTag2);
+DisabledTraceEvent("TLogPeekMessages2", self->dbgid)
+    .detail("ReqBegin", reqBegin)
+    .detail("Tag", reqTag.toString());
 if (reqBegin <= logData->persistentDataDurableVersion) {
 // Just in case the durable version changes while we are waiting for the read, we grab this data from
 // memory. We may or may not actually send it depending on whether we get enough data from disk. SOMEDAY:
@@ -1993,13 +2015,12 @@ Future<Void> tLogPeekMessages(PromiseType replyPromise,
 reply.end = endVersion;
 reply.onlySpilled = onlySpilled;

-// TraceEvent("TlogPeek", self->dbgid)
-// .detail("LogId", logData->logId)
-// .detail("Tag", req.tag.toString())
-// .detail("BeginVer", req.begin)
-// .detail("EndVer", reply.end)
-// .detail("MsgBytes", reply.messages.expectedSize())
-// .detail("ForAddress", req.reply.getEndpoint().getPrimaryAddress());
+DisabledTraceEvent("TLogPeekMessages4", self->dbgid)
+    .detail("LogId", logData->logId)
+    .detail("Tag", reqTag.toString())
+    .detail("ReqBegin", reqBegin)
+    .detail("EndVer", reply.end)
+    .detail("MsgBytes", reply.messages.expectedSize());

 if (reqSequence.present()) {
 auto& trackerData = logData->peekTracker[peekId];
@@ -2221,6 +2242,9 @@ ACTOR Future<Void> tLogCommit(TLogData* self,
 g_traceBatch.addEvent("CommitDebug", tlogDebugID.get().first(), "TLog.tLogCommit.BeforeWaitForVersion");
 }

+if (req.prevVersion == logData->recoveredAt) {
+    logData->recoveryTxnVersion = req.version;
+}
 logData->minKnownCommittedVersion = std::max(logData->minKnownCommittedVersion, req.minKnownCommittedVersion);

 wait(logData->version.whenAtLeast(req.prevVersion));
@@ -2274,6 +2298,15 @@ ACTOR Future<Void> tLogCommit(TLogData* self,
 }
 // Notifies the commitQueue actor to commit persistentQueue, and also unblocks tLogPeekMessages actors
 logData->version.set(req.version);
+if (logData->recoveryTxnReceived.canBeSet() &&
+    (req.prevVersion == 0 || req.prevVersion == logData->recoveredAt)) {
+    TraceEvent("TLogInfo", self->dbgid)
+        .detail("Log", logData->logId)
+        .detail("Prev", req.prevVersion)
+        .detail("RecoveredAt", logData->recoveredAt)
+        .detail("RecoveryTxnVersion", req.version);
+    logData->recoveryTxnReceived.send(Void());
+}
 if (SERVER_KNOBS->ENABLE_VERSION_VECTOR_TLOG_UNICAST) {
 self->unknownCommittedVersions.push_front(std::make_tuple(req.version, req.tLogCount));
 while (!self->unknownCommittedVersions.empty() &&
@@ -2777,6 +2810,7 @@ ACTOR Future<Void> pullAsyncData(TLogData* self,

 state Version ver = 0;
 state std::vector<TagsAndMessage> messages;
+state bool pullingRecoveryData = endVersion.present() && endVersion.get() == logData->recoveredAt;
 loop {
 state bool foundMessage = r->hasMessage();
 if (!foundMessage || r->version().version != ver) {
@@ -2814,6 +2848,13 @@ ACTOR Future<Void> pullAsyncData(TLogData* self,
 // Notifies the commitQueue actor to commit persistentQueue, and also unblocks tLogPeekMessages
 // actors
 logData->version.set(ver);
+if (logData->recoveryTxnReceived.canBeSet() && !pullingRecoveryData && ver > logData->recoveredAt) {
+    TraceEvent("TLogInfo", self->dbgid)
+        .detail("Log", logData->logId)
+        .detail("RecoveredAt", logData->recoveredAt)
+        .detail("RecoveryTxnVersion", ver);
+    logData->recoveryTxnReceived.send(Void());
+}
 wait(yield(TaskPriority::TLogCommit));
 }
 lastVer = ver;
@@ -6802,7 +6802,9 @@ ACTOR Future<Void> update(StorageServer* data, bool* pReceivedUpdate) {
 }
 data->tlogCursorReadsLatencyHistogram->sampleSeconds(now() - beforeTLogCursorReads);
 if (cursor->popped() > 0) {
-TraceEvent("StorageServerWorkerRemoved", data->thisServerID).detail("Reason", "PeekPoppedTLogData");
+TraceEvent("StorageServerWorkerRemoved", data->thisServerID)
+    .detail("Reason", "PeekPoppedTLogData")
+    .detail("Version", cursor->popped());
 throw worker_removed();
 }