Fix orphaned storage server due to force recovery (#6914)

* Fix orphaned storage server due to force recovery

The force recovery can roll back the transaction that adds a storage server.
However, the storage server may now be at version B > A, where A is the recovery version.
As a result, its peek to the buddy TLog won't return TLogPeekReply::popped to
trigger its exit, and instead gets a higher version C > B back. To the
storage server, this means the message is empty, so it does not remove itself
and keeps peeking.

The fix: instead of using the recovery version as the popped version for the
SS, we use the recovery transaction version, i.e., the version of the first
transaction after the recovery. Force recovery bumps this version to one much
higher than the SS's version, so the TLog sets TLogPeekReply::popped and
triggers the storage server's exit.

* Fix TLog peek to disallow returning empty messages between recoveredAt and the recovery txn version

Today this contract is not explicitly enforced and can cause the storage server
to fail with the assertion "rollbackVersion >= data->storageVersion()". This is
because, if such an empty version is returned, the SS may advance its storage
version to a value larger than the rollback version set in the recovery
transaction.

The fix is to block the peek reply until the recovery transaction has been received.

* Move recoveryTxnReceived to be per LogData

This is because a shared TLog can host a first-generation TLog that has already
set the promise, so later generations would not wait for the recovery version.
For the current generation, all peeks need to wait, while older generations
(identified by checking whether they are stopped) need not wait.

* For initial commit, poppedVersion needs to be at least 2

To get rid of the previous unsuccessful recovery's recruited seed
storage servers.
This commit is contained in:
Jingyu Zhou 2022-05-02 17:17:37 -07:00 committed by GitHub
parent fa2e85f1d3
commit 05e63bc703
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 65 additions and 16 deletions

View File

@ -603,7 +603,12 @@ Future<Void> logRouterPeekMessages(PromiseType replyPromise,
}
replyPromise.send(reply);
//TraceEvent("LogRouterPeek4", self->dbgid);
DisabledTraceEvent("LogRouterPeek4", self->dbgid)
.detail("Tag", reqTag.toString())
.detail("ReqBegin", reqBegin)
.detail("End", reply.end)
.detail("MessageSize", reply.messages.size())
.detail("PoppedVersion", self->poppedVersion);
return Void();
}

View File

@ -58,6 +58,7 @@ ILogSystem::ServerPeekCursor::ServerPeekCursor(Reference<AsyncVar<OptionalInterf
this->results.minKnownCommittedVersion = 0;
DisabledTraceEvent(SevDebug, "SPC_Starting", randomID)
.detail("Tag", tag.toString())
.detail("UsePeekStream", usePeekStream)
.detail("Begin", begin)
.detail("End", end);
}

View File

@ -567,6 +567,8 @@ struct LogData : NonCopyable, public ReferenceCounted<LogData> {
TLogData* tLogData;
Promise<Void> recoveryComplete, committingQueue;
Version unrecoveredBefore, recoveredAt;
Version recoveryTxnVersion;
Promise<Void> recoveryTxnReceived;
struct PeekTrackerData {
std::map<int, Promise<std::pair<Version, bool>>>
@ -646,10 +648,11 @@ struct LogData : NonCopyable, public ReferenceCounted<LogData> {
blockingPeekTimeouts("BlockingPeekTimeouts", cc), emptyPeeks("EmptyPeeks", cc),
nonEmptyPeeks("NonEmptyPeeks", cc), logId(interf.id()), protocolVersion(protocolVersion),
newPersistentDataVersion(invalidVersion), tLogData(tLogData), unrecoveredBefore(1), recoveredAt(1),
logSystem(new AsyncVar<Reference<ILogSystem>>()), remoteTag(remoteTag), isPrimary(isPrimary),
logRouterTags(logRouterTags), logRouterPoppedVersion(0), logRouterPopToVersion(0), locality(tagLocalityInvalid),
recruitmentID(recruitmentID), logSpillType(logSpillType), allTags(tags.begin(), tags.end()),
terminated(tLogData->terminated.getFuture()), execOpCommitInProgress(false), txsTags(txsTags) {
recoveryTxnVersion(1), logSystem(new AsyncVar<Reference<ILogSystem>>()), remoteTag(remoteTag),
isPrimary(isPrimary), logRouterTags(logRouterTags), logRouterPoppedVersion(0), logRouterPopToVersion(0),
locality(tagLocalityInvalid), recruitmentID(recruitmentID), logSpillType(logSpillType),
allTags(tags.begin(), tags.end()), terminated(tLogData->terminated.getFuture()), execOpCommitInProgress(false),
txsTags(txsTags) {
startRole(Role::TRANSACTION_LOG,
interf.id(),
tLogData->workerID,
@ -1565,7 +1568,7 @@ Version poppedVersion(Reference<LogData> self, Tag tag) {
if (tag == txsTag || tag.locality == tagLocalityTxs) {
return 0;
}
return self->recoveredAt + 1;
return std::max(self->recoveredAt + 1, self->recoveryTxnVersion);
}
return tagData->popped;
}
@ -1743,12 +1746,24 @@ Future<Void> tLogPeekMessages(PromiseType replyPromise,
return Void();
}
//TraceEvent("TLogPeekMessages0", self->dbgid).detail("ReqBeginEpoch", reqBegin.epoch).detail("ReqBeginSeq", reqBegin.sequence).detail("Epoch", self->epoch()).detail("PersistentDataSeq", self->persistentDataSequence).detail("Tag1", reqTag1).detail("Tag2", reqTag2);
DisabledTraceEvent("TLogPeekMessages0", self->dbgid)
.detail("LogId", logData->logId)
.detail("Tag", reqTag.toString())
.detail("ReqBegin", reqBegin)
.detail("Version", logData->version.get())
.detail("RecoveredAt", logData->recoveredAt);
// Wait until we have something to return that the caller doesn't already have
if (logData->version.get() < reqBegin) {
wait(logData->version.whenAtLeast(reqBegin));
wait(delay(SERVER_KNOBS->TLOG_PEEK_DELAY, g_network->getCurrentTask()));
}
if (!logData->stopped && reqTag.locality != tagLocalityTxs && reqTag != txsTag) {
// Make sure the peek reply has the recovery txn for the current TLog.
// Older generation TLog has been stopped and doesn't wait here.
// Similarly during recovery, reading transaction state store
// doesn't wait here.
wait(logData->recoveryTxnReceived.getFuture());
}
if (logData->locality != tagLocalitySatellite && reqTag.locality == tagLocalityLogRouter) {
wait(self->concurrentLogRouterReads.take());
@ -1788,6 +1803,11 @@ Future<Void> tLogPeekMessages(PromiseType replyPromise,
poppedVer = poppedVersion(logData, reqTag);
}
DisabledTraceEvent("TLogPeekMessages1", self->dbgid)
.detail("LogId", logData->logId)
.detail("Tag", reqTag.toString())
.detail("ReqBegin", reqBegin)
.detail("PoppedVer", poppedVer);
if (poppedVer > reqBegin) {
TLogPeekReply rep;
rep.maxKnownVersion = logData->version.get();
@ -1832,7 +1852,9 @@ Future<Void> tLogPeekMessages(PromiseType replyPromise,
onlySpilled = false;
// grab messages from disk
//TraceEvent("TLogPeekMessages", self->dbgid).detail("ReqBeginEpoch", reqBegin.epoch).detail("ReqBeginSeq", reqBegin.sequence).detail("Epoch", self->epoch()).detail("PersistentDataSeq", self->persistentDataSequence).detail("Tag1", reqTag1).detail("Tag2", reqTag2);
DisabledTraceEvent("TLogPeekMessages2", self->dbgid)
.detail("ReqBegin", reqBegin)
.detail("Tag", reqTag.toString());
if (reqBegin <= logData->persistentDataDurableVersion) {
// Just in case the durable version changes while we are waiting for the read, we grab this data from
// memory. We may or may not actually send it depending on whether we get enough data from disk. SOMEDAY:
@ -1993,13 +2015,12 @@ Future<Void> tLogPeekMessages(PromiseType replyPromise,
reply.end = endVersion;
reply.onlySpilled = onlySpilled;
// TraceEvent("TlogPeek", self->dbgid)
// .detail("LogId", logData->logId)
// .detail("Tag", req.tag.toString())
// .detail("BeginVer", req.begin)
// .detail("EndVer", reply.end)
// .detail("MsgBytes", reply.messages.expectedSize())
// .detail("ForAddress", req.reply.getEndpoint().getPrimaryAddress());
DisabledTraceEvent("TLogPeekMessages4", self->dbgid)
.detail("LogId", logData->logId)
.detail("Tag", reqTag.toString())
.detail("ReqBegin", reqBegin)
.detail("EndVer", reply.end)
.detail("MsgBytes", reply.messages.expectedSize());
if (reqSequence.present()) {
auto& trackerData = logData->peekTracker[peekId];
@ -2221,6 +2242,9 @@ ACTOR Future<Void> tLogCommit(TLogData* self,
g_traceBatch.addEvent("CommitDebug", tlogDebugID.get().first(), "TLog.tLogCommit.BeforeWaitForVersion");
}
if (req.prevVersion == logData->recoveredAt) {
logData->recoveryTxnVersion = req.version;
}
logData->minKnownCommittedVersion = std::max(logData->minKnownCommittedVersion, req.minKnownCommittedVersion);
wait(logData->version.whenAtLeast(req.prevVersion));
@ -2274,6 +2298,15 @@ ACTOR Future<Void> tLogCommit(TLogData* self,
}
// Notifies the commitQueue actor to commit persistentQueue, and also unblocks tLogPeekMessages actors
logData->version.set(req.version);
if (logData->recoveryTxnReceived.canBeSet() &&
(req.prevVersion == 0 || req.prevVersion == logData->recoveredAt)) {
TraceEvent("TLogInfo", self->dbgid)
.detail("Log", logData->logId)
.detail("Prev", req.prevVersion)
.detail("RecoveredAt", logData->recoveredAt)
.detail("RecoveryTxnVersion", req.version);
logData->recoveryTxnReceived.send(Void());
}
if (SERVER_KNOBS->ENABLE_VERSION_VECTOR_TLOG_UNICAST) {
self->unknownCommittedVersions.push_front(std::make_tuple(req.version, req.tLogCount));
while (!self->unknownCommittedVersions.empty() &&
@ -2777,6 +2810,7 @@ ACTOR Future<Void> pullAsyncData(TLogData* self,
state Version ver = 0;
state std::vector<TagsAndMessage> messages;
state bool pullingRecoveryData = endVersion.present() && endVersion.get() == logData->recoveredAt;
loop {
state bool foundMessage = r->hasMessage();
if (!foundMessage || r->version().version != ver) {
@ -2814,6 +2848,13 @@ ACTOR Future<Void> pullAsyncData(TLogData* self,
// Notifies the commitQueue actor to commit persistentQueue, and also unblocks tLogPeekMessages
// actors
logData->version.set(ver);
if (logData->recoveryTxnReceived.canBeSet() && !pullingRecoveryData && ver > logData->recoveredAt) {
TraceEvent("TLogInfo", self->dbgid)
.detail("Log", logData->logId)
.detail("RecoveredAt", logData->recoveredAt)
.detail("RecoveryTxnVersion", ver);
logData->recoveryTxnReceived.send(Void());
}
wait(yield(TaskPriority::TLogCommit));
}
lastVer = ver;

View File

@ -6802,7 +6802,9 @@ ACTOR Future<Void> update(StorageServer* data, bool* pReceivedUpdate) {
}
data->tlogCursorReadsLatencyHistogram->sampleSeconds(now() - beforeTLogCursorReads);
if (cursor->popped() > 0) {
TraceEvent("StorageServerWorkerRemoved", data->thisServerID).detail("Reason", "PeekPoppedTLogData");
TraceEvent("StorageServerWorkerRemoved", data->thisServerID)
.detail("Reason", "PeekPoppedTLogData")
.detail("Version", cursor->popped());
throw worker_removed();
}