/*
 * TagPartitionedLogSystem.actor.cpp
 *
 * This source file is part of the FoundationDB open source project
 *
 * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "fdbserver/TagPartitionedLogSystem.actor.h"
#include "flow/actorcompiler.h" // This must be the last #include.

ACTOR Future<Version> minVersionWhenReady(Future<Void> f, std::vector<Future<TLogCommitReply>> replies) {
    wait(f);
    Version minVersion = std::numeric_limits<Version>::max();
    for (auto& reply : replies) {
        if (reply.isReady() && !reply.isError()) {
            minVersion = std::min(minVersion, reply.get().version);
        }
    }
    return minVersion;
}

LogSet::LogSet(const TLogSet& tLogSet)
  : tLogWriteAntiQuorum(tLogSet.tLogWriteAntiQuorum), tLogReplicationFactor(tLogSet.tLogReplicationFactor),
    tLogLocalities(tLogSet.tLogLocalities), tLogVersion(tLogSet.tLogVersion), tLogPolicy(tLogSet.tLogPolicy),
    isLocal(tLogSet.isLocal), locality(tLogSet.locality), startVersion(tLogSet.startVersion),
    satelliteTagLocations(tLogSet.satelliteTagLocations) {
    for (const auto& log : tLogSet.tLogs) {
        logServers.push_back(makeReference<AsyncVar<OptionalInterface<TLogInterface>>>(log));
    }
    for (const auto& log : tLogSet.logRouters) {
        logRouters.push_back(makeReference<AsyncVar<OptionalInterface<TLogInterface>>>(log));
    }
    for (const auto& log : tLogSet.backupWorkers) {
        backupWorkers.push_back(makeReference<AsyncVar<OptionalInterface<BackupInterface>>>(log));
    }
    filterLocalityDataForPolicy(tLogPolicy, &tLogLocalities);
    updateLocalitySet(tLogLocalities);
}

LogSet::LogSet(const CoreTLogSet& coreSet)
  : tLogWriteAntiQuorum(coreSet.tLogWriteAntiQuorum), tLogReplicationFactor(coreSet.tLogReplicationFactor),
    tLogLocalities(coreSet.tLogLocalities), tLogVersion(coreSet.tLogVersion), tLogPolicy(coreSet.tLogPolicy),
    isLocal(coreSet.isLocal), locality(coreSet.locality), startVersion(coreSet.startVersion),
    satelliteTagLocations(coreSet.satelliteTagLocations) {
    for (const auto& log : coreSet.tLogs) {
        logServers.push_back(
            makeReference<AsyncVar<OptionalInterface<TLogInterface>>>(OptionalInterface<TLogInterface>(log)));
    }
    // Do NOT recover coreSet.backupWorkers, because master will recruit new ones.
    filterLocalityDataForPolicy(tLogPolicy, &tLogLocalities);
    updateLocalitySet(tLogLocalities);
}

TLogSet::TLogSet(const LogSet& rhs)
  : tLogWriteAntiQuorum(rhs.tLogWriteAntiQuorum), tLogReplicationFactor(rhs.tLogReplicationFactor),
    tLogLocalities(rhs.tLogLocalities), tLogVersion(rhs.tLogVersion), tLogPolicy(rhs.tLogPolicy),
    isLocal(rhs.isLocal), locality(rhs.locality), startVersion(rhs.startVersion),
    satelliteTagLocations(rhs.satelliteTagLocations) {
    for (const auto& tlog : rhs.logServers) {
        tLogs.push_back(tlog->get());
    }
    for (const auto& logRouter : rhs.logRouters) {
        logRouters.push_back(logRouter->get());
    }
    for (const auto& worker : rhs.backupWorkers) {
        backupWorkers.push_back(worker->get());
    }
}

OldTLogConf::OldTLogConf(const OldLogData& oldLogData)
  : epochBegin(oldLogData.epochBegin), epochEnd(oldLogData.epochEnd), logRouterTags(oldLogData.logRouterTags),
    txsTags(oldLogData.txsTags), pseudoLocalities(oldLogData.pseudoLocalities), epoch(oldLogData.epoch) {
    for (const Reference<LogSet>& logSet : oldLogData.tLogs) {
        tLogs.emplace_back(*logSet);
    }
}

CoreTLogSet::CoreTLogSet(const LogSet& logset)
  : tLogWriteAntiQuorum(logset.tLogWriteAntiQuorum), tLogReplicationFactor(logset.tLogReplicationFactor),
    tLogLocalities(logset.tLogLocalities), tLogPolicy(logset.tLogPolicy), isLocal(logset.isLocal),
    locality(logset.locality), startVersion(logset.startVersion),
    satelliteTagLocations(logset.satelliteTagLocations), tLogVersion(logset.tLogVersion) {
    for (const auto& log : logset.logServers) {
        tLogs.push_back(log->get().id());
    }
    // Do NOT store logset.backupWorkers, because master will recruit new ones.
}

OldTLogCoreData::OldTLogCoreData(const OldLogData& oldData)
  : logRouterTags(oldData.logRouterTags), txsTags(oldData.txsTags), epochBegin(oldData.epochBegin),
    epochEnd(oldData.epochEnd), pseudoLocalities(oldData.pseudoLocalities), epoch(oldData.epoch) {
    for (const Reference<LogSet>& logSet : oldData.tLogs) {
        if (logSet->logServers.size()) {
            tLogs.emplace_back(*logSet);
        }
    }
}

Future<Void> ILogSystem::recoverAndEndEpoch(Reference<AsyncVar<Reference<ILogSystem>>> const& outLogSystem,
                                            UID const& dbgid,
                                            DBCoreState const& oldState,
                                            FutureStream<TLogRejoinRequest> const& rejoins,
                                            LocalityData const& locality,
                                            bool* forceRecovery) {
    return TagPartitionedLogSystem::recoverAndEndEpoch(outLogSystem, dbgid, oldState, rejoins, locality, forceRecovery);
}

Reference<ILogSystem> ILogSystem::fromLogSystemConfig(UID const& dbgid,
                                                      struct LocalityData const& locality,
                                                      struct LogSystemConfig const& conf,
                                                      bool excludeRemote,
                                                      bool useRecoveredAt,
                                                      Optional<PromiseStream<Future<Void>>> addActor) {
    if (conf.logSystemType == LogSystemType::empty)
        return Reference<ILogSystem>();
    else if (conf.logSystemType == LogSystemType::tagPartitioned)
        return TagPartitionedLogSystem::fromLogSystemConfig(
            dbgid, locality, conf, excludeRemote, useRecoveredAt, addActor);
    else
        throw internal_error();
}

Reference<ILogSystem> ILogSystem::fromOldLogSystemConfig(UID const& dbgid,
                                                         struct LocalityData const& locality,
                                                         struct LogSystemConfig const& conf) {
    if (conf.logSystemType == LogSystemType::empty)
        return Reference<ILogSystem>();
    else if (conf.logSystemType == LogSystemType::tagPartitioned)
        return TagPartitionedLogSystem::fromOldLogSystemConfig(dbgid, locality, conf);
    else
        throw internal_error();
}

Reference<ILogSystem> ILogSystem::fromServerDBInfo(UID const& dbgid,
                                                   ServerDBInfo const& dbInfo,
                                                   bool useRecoveredAt,
                                                   Optional<PromiseStream<Future<Void>>> addActor) {
    return fromLogSystemConfig(dbgid, dbInfo.myLocality, dbInfo.logSystemConfig, false, useRecoveredAt, addActor);
}

void TagPartitionedLogSystem::stopRejoins() {
    rejoins = Future<Void>();
}

void TagPartitionedLogSystem::addref() {
    ReferenceCounted<TagPartitionedLogSystem>::addref();
}
void TagPartitionedLogSystem::delref() {
    ReferenceCounted<TagPartitionedLogSystem>::delref();
}

std::string TagPartitionedLogSystem::describe() const {
    std::string result;
    for (int i = 0; i < tLogs.size(); i++) {
        result += format("%d: ", i);
        for (int j = 0; j < tLogs[i]->logServers.size(); j++) {
            result += tLogs[i]->logServers[j]->get().id().toString() +
                      ((j == tLogs[i]->logServers.size() - 1) ? " " : ", ");
        }
    }
    return result;
}

UID TagPartitionedLogSystem::getDebugID() const {
    return dbgid;
}

void TagPartitionedLogSystem::addPseudoLocality(int8_t locality) {
    ASSERT(locality < 0);
    pseudoLocalities.insert(locality);
    for (uint16_t i = 0; i < logRouterTags; i++) {
        pseudoLocalityPopVersion[Tag(locality, i)] = 0;
    }
}

Tag TagPartitionedLogSystem::getPseudoPopTag(Tag tag, ProcessClass::ClassType type) const {
    switch (type) {
    case ProcessClass::LogRouterClass:
        if (tag.locality == tagLocalityLogRouter) {
            ASSERT(pseudoLocalities.count(tagLocalityLogRouterMapped) > 0);
            tag.locality = tagLocalityLogRouterMapped;
        }
        break;

    case ProcessClass::BackupClass:
        if (tag.locality == tagLocalityLogRouter) {
            ASSERT(pseudoLocalities.count(tagLocalityBackup) > 0);
            tag.locality = tagLocalityBackup;
        }
        break;

    default: // This should be an error at caller site.
        break;
    }
    return tag;
}

bool TagPartitionedLogSystem::hasPseudoLocality(int8_t locality) const {
    return pseudoLocalities.count(locality) > 0;
}

Version TagPartitionedLogSystem::popPseudoLocalityTag(Tag tag, Version upTo) {
    ASSERT(isPseudoLocality(tag.locality) && hasPseudoLocality(tag.locality));

    Version& localityVersion = pseudoLocalityPopVersion[tag];
    localityVersion = std::max(localityVersion, upTo);
    Version minVersion = localityVersion;
    // Why do we need to use the minimum popped version among all tags? Reason: for example,
    // 2 pseudo tags pop 100 or 150, respectively. It's only safe to pop min(100, 150),
    // because [101,150) is needed by another pseudo tag.
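    // Concretely, with two pseudo localities registered for the same tag.id (say the log-router-mapped
    // tag popped through 150 and the backup tag popped through only 100), the loop below yields 100,
    // so the versions in [101,150) that the slower pseudo tag still needs are never discarded.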
    for (const int8_t locality : pseudoLocalities) {
        minVersion = std::min(minVersion, pseudoLocalityPopVersion[Tag(locality, tag.id)]);
    }
    // TraceEvent("TLogPopPseudoTag", dbgid).detail("Tag", tag.toString()).detail("Version", upTo).detail("PopVersion", minVersion);
    return minVersion;
}

Future<Void> TagPartitionedLogSystem::recoverAndEndEpoch(Reference<AsyncVar<Reference<ILogSystem>>> const& outLogSystem,
                                                         UID const& dbgid,
                                                         DBCoreState const& oldState,
                                                         FutureStream<TLogRejoinRequest> const& rejoins,
                                                         LocalityData const& locality,
                                                         bool* forceRecovery) {
    return epochEnd(outLogSystem, dbgid, oldState, rejoins, locality, forceRecovery);
}

Reference<ILogSystem> TagPartitionedLogSystem::fromLogSystemConfig(UID const& dbgid,
                                                                   LocalityData const& locality,
                                                                   LogSystemConfig const& lsConf,
                                                                   bool excludeRemote,
                                                                   bool useRecoveredAt,
                                                                   Optional<PromiseStream<Future<Void>>> addActor) {
    ASSERT(lsConf.logSystemType == LogSystemType::tagPartitioned ||
           (lsConf.logSystemType == LogSystemType::empty && !lsConf.tLogs.size()));
    // ASSERT(lsConf.epoch == epoch);  //< FIXME
    auto logSystem = makeReference<TagPartitionedLogSystem>(dbgid, locality, lsConf.epoch, addActor);

    logSystem->tLogs.reserve(lsConf.tLogs.size());
    logSystem->expectedLogSets = lsConf.expectedLogSets;
    logSystem->logRouterTags = lsConf.logRouterTags;
    logSystem->txsTags = lsConf.txsTags;
    logSystem->recruitmentID = lsConf.recruitmentID;
    logSystem->stopped = lsConf.stopped;
    if (useRecoveredAt) {
        logSystem->recoveredAt = lsConf.recoveredAt;
    }
    logSystem->pseudoLocalities = lsConf.pseudoLocalities;

    for (const TLogSet& tLogSet : lsConf.tLogs) {
        if (!excludeRemote || tLogSet.isLocal) {
            logSystem->tLogs.push_back(makeReference<LogSet>(tLogSet));
        }
    }

    for (const auto& oldTlogConf : lsConf.oldTLogs) {
        logSystem->oldLogData.emplace_back(oldTlogConf);
        //TraceEvent("BWFromLSConf")
        //    .detail("Epoch", logSystem->oldLogData.back().epoch)
        //    .detail("Version", logSystem->oldLogData.back().epochEnd);
    }

    logSystem->logSystemType = lsConf.logSystemType;
    logSystem->oldestBackupEpoch = lsConf.oldestBackupEpoch;
    return logSystem;
}

Reference<ILogSystem> TagPartitionedLogSystem::fromOldLogSystemConfig(UID const& dbgid,
                                                                      LocalityData const& locality,
                                                                      LogSystemConfig const& lsConf) {
    ASSERT(lsConf.logSystemType == LogSystemType::tagPartitioned ||
           (lsConf.logSystemType == LogSystemType::empty && !lsConf.tLogs.size()));
    // ASSERT(lsConf.epoch == epoch);  //< FIXME
    const LogEpoch e = lsConf.oldTLogs.size() > 0 ?
lsConf.oldTLogs[0].epoch : 0; auto logSystem = makeReference(dbgid, locality, e); if (lsConf.oldTLogs.size()) { for (const TLogSet& tLogSet : lsConf.oldTLogs[0].tLogs) { logSystem->tLogs.push_back(makeReference(tLogSet)); } logSystem->logRouterTags = lsConf.oldTLogs[0].logRouterTags; logSystem->txsTags = lsConf.oldTLogs[0].txsTags; // logSystem->epochEnd = lsConf.oldTLogs[0].epochEnd; for (int i = 1; i < lsConf.oldTLogs.size(); i++) { logSystem->oldLogData.emplace_back(lsConf.oldTLogs[i]); } } logSystem->logSystemType = lsConf.logSystemType; logSystem->stopped = true; logSystem->pseudoLocalities = lsConf.pseudoLocalities; return logSystem; } void TagPartitionedLogSystem::toCoreState(DBCoreState& newState) { if (recoveryComplete.isValid() && recoveryComplete.isError()) throw recoveryComplete.getError(); if (remoteRecoveryComplete.isValid() && remoteRecoveryComplete.isError()) throw remoteRecoveryComplete.getError(); newState.tLogs.clear(); newState.logRouterTags = logRouterTags; newState.txsTags = txsTags; newState.pseudoLocalities = pseudoLocalities; for (const auto& t : tLogs) { if (t->logServers.size()) { newState.tLogs.emplace_back(*t); newState.tLogs.back().tLogLocalities.clear(); for (const auto& log : t->logServers) { newState.tLogs.back().tLogLocalities.push_back(log->get().interf().filteredLocality); } } } newState.oldTLogData.clear(); if (!recoveryComplete.isValid() || !recoveryComplete.isReady() || (repopulateRegionAntiQuorum == 0 && (!remoteRecoveryComplete.isValid() || !remoteRecoveryComplete.isReady())) || epoch != oldestBackupEpoch) { for (const auto& oldData : oldLogData) { newState.oldTLogData.emplace_back(oldData); TraceEvent("BWToCore") .detail("Epoch", newState.oldTLogData.back().epoch) .detail("TotalTags", newState.oldTLogData.back().logRouterTags) .detail("BeginVersion", newState.oldTLogData.back().epochBegin) .detail("EndVersion", newState.oldTLogData.back().epochEnd); } } newState.logSystemType = logSystemType; } bool TagPartitionedLogSystem::remoteStorageRecovered() { return remoteRecoveryComplete.isValid() && remoteRecoveryComplete.isReady(); } Future TagPartitionedLogSystem::onCoreStateChanged() { std::vector> changes; changes.push_back(Never()); if (recoveryComplete.isValid() && !recoveryComplete.isReady()) { changes.push_back(recoveryComplete); } if (remoteRecovery.isValid() && !remoteRecovery.isReady()) { changes.push_back(remoteRecovery); } if (remoteRecoveryComplete.isValid() && !remoteRecoveryComplete.isReady()) { changes.push_back(remoteRecoveryComplete); } changes.push_back(backupWorkerChanged.onTrigger()); // changes to oldestBackupEpoch return waitForAny(changes); } void TagPartitionedLogSystem::coreStateWritten(DBCoreState const& newState) { if (!newState.oldTLogData.size()) { recoveryCompleteWrittenToCoreState.set(true); } for (auto& t : newState.tLogs) { if (!t.isLocal) { TraceEvent("RemoteLogsWritten", dbgid).log(); remoteLogsWrittenToCoreState = true; break; } } } Future TagPartitionedLogSystem::onError() { return onError_internal(this); } ACTOR Future TagPartitionedLogSystem::onError_internal(TagPartitionedLogSystem* self) { // Never returns normally, but throws an error if the subsystem stops working loop { std::vector> failed; std::vector> backupFailed(1, Never()); std::vector> changes; for (auto& it : self->tLogs) { for (auto& t : it->logServers) { if (t->get().present()) { failed.push_back( waitFailureClient(t->get().interf().waitFailure, SERVER_KNOBS->TLOG_TIMEOUT, -SERVER_KNOBS->TLOG_TIMEOUT / SERVER_KNOBS->SECONDS_BEFORE_NO_FAILURE_DELAY, 
/*trace=*/true)); } else { changes.push_back(t->onChange()); } } for (auto& t : it->logRouters) { if (t->get().present()) { failed.push_back( waitFailureClient(t->get().interf().waitFailure, SERVER_KNOBS->TLOG_TIMEOUT, -SERVER_KNOBS->TLOG_TIMEOUT / SERVER_KNOBS->SECONDS_BEFORE_NO_FAILURE_DELAY, /*trace=*/true)); } else { changes.push_back(t->onChange()); } } for (const auto& worker : it->backupWorkers) { if (worker->get().present()) { backupFailed.push_back( waitFailureClient(worker->get().interf().waitFailure, SERVER_KNOBS->BACKUP_TIMEOUT, -SERVER_KNOBS->BACKUP_TIMEOUT / SERVER_KNOBS->SECONDS_BEFORE_NO_FAILURE_DELAY, /*trace=*/true)); } else { changes.push_back(worker->onChange()); } } } if (!self->recoveryCompleteWrittenToCoreState.get()) { for (auto& old : self->oldLogData) { for (auto& it : old.tLogs) { for (auto& t : it->logRouters) { if (t->get().present()) { failed.push_back(waitFailureClient(t->get().interf().waitFailure, SERVER_KNOBS->TLOG_TIMEOUT, -SERVER_KNOBS->TLOG_TIMEOUT / SERVER_KNOBS->SECONDS_BEFORE_NO_FAILURE_DELAY, /*trace=*/true)); } else { changes.push_back(t->onChange()); } } } // Monitor changes of backup workers for old epochs. for (const auto& worker : old.tLogs[0]->backupWorkers) { if (worker->get().present()) { backupFailed.push_back(waitFailureClient(worker->get().interf().waitFailure, SERVER_KNOBS->BACKUP_TIMEOUT, -SERVER_KNOBS->BACKUP_TIMEOUT / SERVER_KNOBS->SECONDS_BEFORE_NO_FAILURE_DELAY, /*trace=*/true)); } else { changes.push_back(worker->onChange()); } } } } if (self->hasRemoteServers && (!self->remoteRecovery.isReady() || self->remoteRecovery.isError())) { changes.push_back(self->remoteRecovery); } changes.push_back(self->recoveryCompleteWrittenToCoreState.onChange()); changes.push_back(self->backupWorkerChanged.onTrigger()); ASSERT(failed.size() >= 1); wait(quorum(changes, 1) || tagError(quorum(failed, 1), tlog_failed()) || tagError(quorum(backupFailed, 1), backup_worker_failed())); } } ACTOR Future TagPartitionedLogSystem::pushResetChecker(Reference self, NetworkAddress addr) { self->slowReplies = 0; self->fastReplies = 0; wait(delay(SERVER_KNOBS->PUSH_STATS_INTERVAL)); TraceEvent("SlowPushStats") .detail("PeerAddress", addr) .detail("SlowReplies", self->slowReplies) .detail("FastReplies", self->fastReplies); if (self->slowReplies >= SERVER_KNOBS->PUSH_STATS_SLOW_AMOUNT && self->slowReplies / double(self->slowReplies + self->fastReplies) >= SERVER_KNOBS->PUSH_STATS_SLOW_RATIO) { FlowTransport::transport().resetConnection(addr); self->lastReset = now(); } return Void(); } ACTOR Future TagPartitionedLogSystem::recordPushMetrics(Reference self, Reference dist, NetworkAddress addr, Future in) { state double startTime = now(); TLogCommitReply t = wait(in); if (now() - self->lastReset > SERVER_KNOBS->PUSH_RESET_INTERVAL) { if (now() - startTime > SERVER_KNOBS->PUSH_MAX_LATENCY) { if (self->resetCheck.isReady()) { self->resetCheck = TagPartitionedLogSystem::pushResetChecker(self, addr); } self->slowReplies++; } else { self->fastReplies++; } } dist->sampleSeconds(now() - startTime); return t; } Future TagPartitionedLogSystem::push(Version prevVersion, Version version, Version knownCommittedVersion, Version minKnownCommittedVersion, LogPushData& data, SpanContext const& spanContext, Optional debugID, Optional> tpcvMap) { // FIXME: Randomize request order as in LegacyLogSystem? 
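    // Overview of the push path below: the commit is sent to every local tLog in each local log set, and a
    // set's contribution counts as durable once all but tLogWriteAntiQuorum of its tLogs have replied (see
    // the quorum(tLogCommitResults, size - antiQuorum) call). minVersionWhenReady() then resolves to the
    // smallest durable version reported by whichever tLog replies have arrived.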
std::vector> quorumResults; std::vector> allReplies; int location = 0; Span span("TPLS:push"_loc, spanContext); std::unordered_map tLogCount; if (SERVER_KNOBS->ENABLE_VERSION_VECTOR_TLOG_UNICAST) { int location = 0; int logGroupLocal = 0; for (auto& it : tLogs) { if (!it->isLocal) { continue; } for (int loc = 0; loc < it->logServers.size(); loc++) { if (tpcvMap.get().find(location) != tpcvMap.get().end()) { tLogCount[logGroupLocal]++; } location++; } logGroupLocal++; } } int logGroupLocal = 0; for (auto& it : tLogs) { if (it->isLocal && it->logServers.size()) { if (it->connectionResetTrackers.size() == 0) { for (int i = 0; i < it->logServers.size(); i++) { it->connectionResetTrackers.push_back(makeReference()); } } if (it->tlogPushDistTrackers.empty()) { for (int i = 0; i < it->logServers.size(); i++) { it->tlogPushDistTrackers.push_back( Histogram::getHistogram("ToTlog_" + it->logServers[i]->get().interf().uniqueID.toString(), it->logServers[i]->get().interf().address().toString(), Histogram::Unit::microseconds)); } } std::vector> tLogCommitResults; for (int loc = 0; loc < it->logServers.size(); loc++) { if (SERVER_KNOBS->ENABLE_VERSION_VECTOR_TLOG_UNICAST) { if (tpcvMap.get().find(location) != tpcvMap.get().end()) { prevVersion = tpcvMap.get()[location]; } else { location++; continue; } } Standalone msg = data.getMessages(location); data.recordEmptyMessage(location, msg); allReplies.push_back(recordPushMetrics( it->connectionResetTrackers[loc], it->tlogPushDistTrackers[loc], it->logServers[loc]->get().interf().address(), it->logServers[loc]->get().interf().commit.getReply(TLogCommitRequest(spanContext, msg.arena(), prevVersion, version, knownCommittedVersion, minKnownCommittedVersion, msg, tLogCount[logGroupLocal], debugID), TaskPriority::ProxyTLogCommitReply))); Future commitSuccess = success(allReplies.back()); addActor.get().send(commitSuccess); tLogCommitResults.push_back(commitSuccess); location++; } quorumResults.push_back(quorum(tLogCommitResults, tLogCommitResults.size() - it->tLogWriteAntiQuorum)); logGroupLocal++; } } return minVersionWhenReady(waitForAll(quorumResults), allReplies); } Reference TagPartitionedLogSystem::peekAll(UID dbgid, Version begin, Version end, Tag tag, bool parallelGetMore) { int bestSet = 0; std::vector> localSets; Version lastBegin = 0; bool foundSpecial = false; for (auto& log : tLogs) { if (log->locality == tagLocalitySpecial || log->locality == tagLocalityUpgraded) { foundSpecial = true; } if (log->isLocal && log->logServers.size() && (log->locality == tagLocalitySpecial || log->locality == tagLocalityUpgraded || log->locality == tag.locality || tag == txsTag || tag.locality == tagLocalityTxs || tag.locality == tagLocalityLogRouter || ((tag.locality == tagLocalityUpgraded || tag == cacheTag) && log->locality != tagLocalitySatellite))) { lastBegin = std::max(lastBegin, log->startVersion); localSets.push_back(log); if (log->locality != tagLocalitySatellite) { bestSet = localSets.size() - 1; } } } if (!localSets.size()) { lastBegin = end; } if (begin >= lastBegin && localSets.size()) { TraceEvent("TLogPeekAllCurrentOnly", dbgid) .detail("Tag", tag.toString()) .detail("Begin", begin) .detail("End", end) .detail("BestLogs", localSets[bestSet]->logServerString()); return makeReference( localSets, bestSet, localSets[bestSet]->bestLocationFor(tag), tag, begin, end, parallelGetMore); } else { std::vector> cursors; std::vector epochEnds; if (lastBegin < end && localSets.size()) { TraceEvent("TLogPeekAllAddingCurrent", dbgid) .detail("Tag", tag.toString()) 
.detail("Begin", begin) .detail("End", end) .detail("BestLogs", localSets[bestSet]->logServerString()); cursors.push_back(makeReference( localSets, bestSet, localSets[bestSet]->bestLocationFor(tag), tag, lastBegin, end, parallelGetMore)); } for (int i = 0; begin < lastBegin; i++) { if (i == oldLogData.size()) { if (tag == txsTag || tag.locality == tagLocalityTxs || tag == cacheTag) { break; } TraceEvent("TLogPeekAllDead", dbgid) .detail("Tag", tag.toString()) .detail("Begin", begin) .detail("End", end) .detail("LastBegin", lastBegin) .detail("OldLogDataSize", oldLogData.size()); return makeReference( Reference>>(), tag, begin, getPeekEnd(), false, false); } int bestOldSet = 0; std::vector> localOldSets; Version thisBegin = begin; bool thisSpecial = false; for (auto& log : oldLogData[i].tLogs) { if (log->locality == tagLocalitySpecial || log->locality == tagLocalityUpgraded) { thisSpecial = true; } if (log->isLocal && log->logServers.size() && (log->locality == tagLocalitySpecial || log->locality == tagLocalityUpgraded || log->locality == tag.locality || tag == txsTag || tag.locality == tagLocalityTxs || tag.locality == tagLocalityLogRouter || ((tag.locality == tagLocalityUpgraded || tag == cacheTag) && log->locality != tagLocalitySatellite))) { thisBegin = std::max(thisBegin, log->startVersion); localOldSets.push_back(log); if (log->locality != tagLocalitySatellite) { bestOldSet = localOldSets.size() - 1; } } } if (!localOldSets.size()) { TraceEvent("TLogPeekAllNoLocalSets", dbgid) .detail("Tag", tag.toString()) .detail("Begin", begin) .detail("End", end) .detail("LastBegin", lastBegin); if (!cursors.size() && !foundSpecial) { continue; } return makeReference( Reference>>(), tag, begin, getPeekEnd(), false, false); } if (thisSpecial) { foundSpecial = true; } if (thisBegin < lastBegin) { if (thisBegin < end) { TraceEvent("TLogPeekAllAddingOld", dbgid) .detail("Tag", tag.toString()) .detail("Begin", begin) .detail("End", end) .detail("BestLogs", localOldSets[bestOldSet]->logServerString()) .detail("LastBegin", lastBegin) .detail("ThisBegin", thisBegin); cursors.push_back( makeReference(localOldSets, bestOldSet, localOldSets[bestOldSet]->bestLocationFor(tag), tag, thisBegin, std::min(lastBegin, end), parallelGetMore)); epochEnds.push_back(LogMessageVersion(std::min(lastBegin, end))); } lastBegin = thisBegin; } } return makeReference(cursors, epochEnds); } } Reference TagPartitionedLogSystem::peekRemote(UID dbgid, Version begin, Optional end, Tag tag, bool parallelGetMore) { int bestSet = -1; Version lastBegin = recoveredAt.present() ? recoveredAt.get() + 1 : 0; for (int t = 0; t < tLogs.size(); t++) { if (tLogs[t]->isLocal) { lastBegin = std::max(lastBegin, tLogs[t]->startVersion); } if (tLogs[t]->logRouters.size()) { ASSERT(bestSet == -1); bestSet = t; } } if (bestSet == -1) { TraceEvent("TLogPeekRemoteNoBestSet", dbgid) .detail("Tag", tag.toString()) .detail("Begin", begin) .detail("End", end.present() ? end.get() : getPeekEnd()); return makeReference( Reference>>(), tag, begin, getPeekEnd(), false, parallelGetMore); } if (begin >= lastBegin) { TraceEvent("TLogPeekRemoteBestOnly", dbgid) .detail("Tag", tag.toString()) .detail("Begin", begin) .detail("End", end.present() ? end.get() : getPeekEnd()) .detail("BestSet", bestSet) .detail("BestSetStart", lastBegin) .detail("LogRouterIds", tLogs[bestSet]->logRouterString()); return makeReference( tLogs[bestSet]->logRouters, tag, begin, end.present() ? 
end.get() + 1 : getPeekEnd(), parallelGetMore); } else { std::vector> cursors; std::vector epochEnds; TraceEvent("TLogPeekRemoteAddingBest", dbgid) .detail("Tag", tag.toString()) .detail("Begin", begin) .detail("End", end.present() ? end.get() : getPeekEnd()) .detail("BestSet", bestSet) .detail("BestSetStart", lastBegin) .detail("LogRouterIds", tLogs[bestSet]->logRouterString()); cursors.push_back(makeReference( tLogs[bestSet]->logRouters, tag, lastBegin, end.present() ? end.get() + 1 : getPeekEnd(), parallelGetMore)); int i = 0; while (begin < lastBegin) { if (i == oldLogData.size()) { TraceEvent("TLogPeekRemoteDead", dbgid) .detail("Tag", tag.toString()) .detail("Begin", begin) .detail("End", end.present() ? end.get() : getPeekEnd()) .detail("LastBegin", lastBegin) .detail("OldLogDataSize", oldLogData.size()); return makeReference( Reference>>(), tag, begin, getPeekEnd(), false, parallelGetMore); } int bestOldSet = -1; Version thisBegin = begin; for (int t = 0; t < oldLogData[i].tLogs.size(); t++) { if (oldLogData[i].tLogs[t]->isLocal) { thisBegin = std::max(thisBegin, oldLogData[i].tLogs[t]->startVersion); } if (oldLogData[i].tLogs[t]->logRouters.size()) { ASSERT(bestOldSet == -1); bestOldSet = t; } } if (bestOldSet == -1) { TraceEvent("TLogPeekRemoteNoOldBestSet", dbgid) .detail("Tag", tag.toString()) .detail("Begin", begin) .detail("End", end.present() ? end.get() : getPeekEnd()); return makeReference( Reference>>(), tag, begin, getPeekEnd(), false, parallelGetMore); } if (thisBegin < lastBegin) { TraceEvent("TLogPeekRemoteAddingOldBest", dbgid) .detail("Tag", tag.toString()) .detail("Begin", begin) .detail("End", end.present() ? end.get() : getPeekEnd()) .detail("BestOldSet", bestOldSet) .detail("LogRouterIds", oldLogData[i].tLogs[bestOldSet]->logRouterString()) .detail("LastBegin", lastBegin) .detail("ThisBegin", thisBegin) .detail("BestStartVer", oldLogData[i].tLogs[bestOldSet]->startVersion); cursors.push_back(makeReference( oldLogData[i].tLogs[bestOldSet]->logRouters, tag, thisBegin, lastBegin, parallelGetMore)); epochEnds.emplace_back(lastBegin); lastBegin = thisBegin; } i++; } return makeReference(cursors, epochEnds); } } Reference TagPartitionedLogSystem::peek(UID dbgid, Version begin, Optional end, Tag tag, bool parallelGetMore) { if (!tLogs.size()) { TraceEvent("TLogPeekNoLogSets", dbgid).detail("Tag", tag.toString()).detail("Begin", begin); return makeReference( Reference>>(), tag, begin, getPeekEnd(), false, false); } if (tag.locality == tagLocalityRemoteLog) { return peekRemote(dbgid, begin, end, tag, parallelGetMore); } else { return peekAll(dbgid, begin, getPeekEnd(), tag, parallelGetMore); } } Reference TagPartitionedLogSystem::peek(UID dbgid, Version begin, Optional end, std::vector tags, bool parallelGetMore) { if (tags.empty()) { TraceEvent("TLogPeekNoTags", dbgid).detail("Begin", begin); return makeReference( Reference>>(), invalidTag, begin, getPeekEnd(), false, false); } if (tags.size() == 1) { return peek(dbgid, begin, end, tags[0], parallelGetMore); } std::vector> cursors; cursors.reserve(tags.size()); for (auto tag : tags) { cursors.push_back(peek(dbgid, begin, end, tag, parallelGetMore)); } return makeReference(cursors, begin, end.present() ? 
end.get() + 1 : getPeekEnd(), true, tLogs[0]->locality == tagLocalityUpgraded, false); } Reference TagPartitionedLogSystem::peekLocal(UID dbgid, Tag tag, Version begin, Version end, bool useMergePeekCursors, int8_t peekLocality) { if (tag.locality >= 0 || tag.locality == tagLocalityUpgraded || tag.locality == tagLocalitySpecial) { peekLocality = tag.locality; } ASSERT(peekLocality >= 0 || peekLocality == tagLocalityUpgraded || tag.locality == tagLocalitySpecial); int bestSet = -1; bool foundSpecial = false; int logCount = 0; for (int t = 0; t < tLogs.size(); t++) { if (tLogs[t]->logServers.size() && tLogs[t]->locality != tagLocalitySatellite) { logCount++; } if (tLogs[t]->logServers.size() && (tLogs[t]->locality == tagLocalitySpecial || tLogs[t]->locality == tagLocalityUpgraded || tLogs[t]->locality == peekLocality || peekLocality == tagLocalityUpgraded || peekLocality == tagLocalitySpecial)) { if (tLogs[t]->locality == tagLocalitySpecial || tLogs[t]->locality == tagLocalityUpgraded) { foundSpecial = true; } bestSet = t; break; } } if (bestSet == -1) { TraceEvent("TLogPeekLocalNoBestSet", dbgid) .detail("Tag", tag.toString()) .detail("Begin", begin) .detail("End", end) .detail("LogCount", logCount); if (useMergePeekCursors || logCount > 1) { throw worker_removed(); } else { return makeReference( Reference>>(), tag, begin, getPeekEnd(), false, false); } } if (begin >= tLogs[bestSet]->startVersion) { TraceEvent("TLogPeekLocalBestOnly", dbgid) .detail("Tag", tag.toString()) .detail("Begin", begin) .detail("End", end) .detail("BestSet", bestSet) .detail("BestSetStart", tLogs[bestSet]->startVersion) .detail("LogId", tLogs[bestSet]->logServers[tLogs[bestSet]->bestLocationFor(tag)]->get().id()); if (useMergePeekCursors) { return makeReference(tLogs[bestSet]->logServers, tLogs[bestSet]->bestLocationFor(tag), tLogs[bestSet]->logServers.size() + 1 - tLogs[bestSet]->tLogReplicationFactor, tag, begin, end, true, tLogs[bestSet]->tLogLocalities, tLogs[bestSet]->tLogPolicy, tLogs[bestSet]->tLogReplicationFactor); } else { return makeReference( tLogs[bestSet]->logServers[tLogs[bestSet]->bestLocationFor(tag)], tag, begin, end, false, false); } } else { std::vector> cursors; std::vector epochEnds; if (tLogs[bestSet]->startVersion < end) { TraceEvent("TLogPeekLocalAddingBest", dbgid) .detail("Tag", tag.toString()) .detail("Begin", begin) .detail("End", end) .detail("BestSet", bestSet) .detail("BestSetStart", tLogs[bestSet]->startVersion) .detail("LogId", tLogs[bestSet]->logServers[tLogs[bestSet]->bestLocationFor(tag)]->get().id()); if (useMergePeekCursors) { cursors.push_back(makeReference(tLogs[bestSet]->logServers, tLogs[bestSet]->bestLocationFor(tag), tLogs[bestSet]->logServers.size() + 1 - tLogs[bestSet]->tLogReplicationFactor, tag, tLogs[bestSet]->startVersion, end, true, tLogs[bestSet]->tLogLocalities, tLogs[bestSet]->tLogPolicy, tLogs[bestSet]->tLogReplicationFactor)); } else { cursors.push_back(makeReference( tLogs[bestSet]->logServers[tLogs[bestSet]->bestLocationFor(tag)], tag, tLogs[bestSet]->startVersion, end, false, false)); } } Version lastBegin = tLogs[bestSet]->startVersion; for (int i = 0; begin < lastBegin; i++) { if (i == oldLogData.size()) { if ((tag == txsTag || tag.locality == tagLocalityTxs) && cursors.size()) { break; } TraceEvent("TLogPeekLocalDead", dbgid) .detail("Tag", tag.toString()) .detail("Begin", begin) .detail("End", end) .detail("LastBegin", lastBegin) .detail("OldLogDataSize", oldLogData.size()); throw worker_removed(); } int bestOldSet = -1; logCount = 0; bool 
nextFoundSpecial = false; for (int t = 0; t < oldLogData[i].tLogs.size(); t++) { if (oldLogData[i].tLogs[t]->logServers.size() && oldLogData[i].tLogs[t]->locality != tagLocalitySatellite) { logCount++; } if (oldLogData[i].tLogs[t]->logServers.size() && (oldLogData[i].tLogs[t]->locality == tagLocalitySpecial || oldLogData[i].tLogs[t]->locality == tagLocalityUpgraded || oldLogData[i].tLogs[t]->locality == peekLocality || peekLocality == tagLocalityUpgraded || peekLocality == tagLocalitySpecial)) { if (oldLogData[i].tLogs[t]->locality == tagLocalitySpecial || oldLogData[i].tLogs[t]->locality == tagLocalityUpgraded) { nextFoundSpecial = true; } if (foundSpecial && !oldLogData[i].tLogs[t]->isLocal) { TraceEvent("TLogPeekLocalRemoteBeforeSpecial", dbgid) .detail("Tag", tag.toString()) .detail("Begin", begin) .detail("End", end) .detail("LastBegin", lastBegin) .detail("OldLogDataSize", oldLogData.size()) .detail("Idx", i); throw worker_removed(); } bestOldSet = t; break; } } if (bestOldSet == -1) { TraceEvent("TLogPeekLocalNoBestSet", dbgid) .detail("Tag", tag.toString()) .detail("Begin", begin) .detail("End", end) .detail("LastBegin", lastBegin) .detail("OldLogDataSize", oldLogData.size()) .detail("Idx", i) .detail("LogRouterTags", oldLogData[i].logRouterTags) .detail("LogCount", logCount) .detail("FoundSpecial", foundSpecial); if (oldLogData[i].logRouterTags == 0 || logCount > 1 || foundSpecial) { throw worker_removed(); } continue; } foundSpecial = nextFoundSpecial; Version thisBegin = std::max(oldLogData[i].tLogs[bestOldSet]->startVersion, begin); if (thisBegin < lastBegin) { if (thisBegin < end) { TraceEvent("TLogPeekLocalAddingOldBest", dbgid) .detail("Tag", tag.toString()) .detail("Begin", begin) .detail("End", end) .detail("BestOldSet", bestOldSet) .detail("LogServers", oldLogData[i].tLogs[bestOldSet]->logServerString()) .detail("ThisBegin", thisBegin) .detail("LastBegin", lastBegin); // detail("LogId", // oldLogData[i].tLogs[bestOldSet]->logServers[tLogs[bestOldSet]->bestLocationFor( tag // )]->get().id()); cursors.push_back(makeReference( oldLogData[i].tLogs[bestOldSet]->logServers, oldLogData[i].tLogs[bestOldSet]->bestLocationFor(tag), oldLogData[i].tLogs[bestOldSet]->logServers.size() + 1 - oldLogData[i].tLogs[bestOldSet]->tLogReplicationFactor, tag, thisBegin, std::min(lastBegin, end), useMergePeekCursors, oldLogData[i].tLogs[bestOldSet]->tLogLocalities, oldLogData[i].tLogs[bestOldSet]->tLogPolicy, oldLogData[i].tLogs[bestOldSet]->tLogReplicationFactor)); epochEnds.emplace_back(std::min(lastBegin, end)); } lastBegin = thisBegin; } } return makeReference(cursors, epochEnds); } } Reference TagPartitionedLogSystem::peekTxs(UID dbgid, Version begin, int8_t peekLocality, Version localEnd, bool canDiscardPopped) { Version end = getEnd(); if (!tLogs.size()) { TraceEvent("TLogPeekTxsNoLogs", dbgid).log(); return makeReference( Reference>>(), txsTag, begin, end, false, false); } TraceEvent("TLogPeekTxs", dbgid) .detail("Begin", begin) .detail("End", end) .detail("LocalEnd", localEnd) .detail("PeekLocality", peekLocality) .detail("CanDiscardPopped", canDiscardPopped); int maxTxsTags = txsTags; bool needsOldTxs = tLogs[0]->tLogVersion < TLogVersion::V4; for (auto& it : oldLogData) { maxTxsTags = std::max(maxTxsTags, it.txsTags); needsOldTxs = needsOldTxs || it.tLogs[0]->tLogVersion < TLogVersion::V4; } if (peekLocality < 0 || localEnd == invalidVersion || localEnd <= begin) { std::vector> cursors; cursors.reserve(maxTxsTags); for (int i = 0; i < maxTxsTags; i++) { 
cursors.push_back(peekAll(dbgid, begin, end, Tag(tagLocalityTxs, i), true)); } // SOMEDAY: remove once upgrades from 6.2 are no longer supported if (needsOldTxs) { cursors.push_back(peekAll(dbgid, begin, end, txsTag, true)); } return makeReference(cursors, begin, end, false, false, canDiscardPopped); } try { if (localEnd >= end) { std::vector> cursors; cursors.reserve(maxTxsTags); for (int i = 0; i < maxTxsTags; i++) { cursors.push_back(peekLocal(dbgid, Tag(tagLocalityTxs, i), begin, end, true, peekLocality)); } // SOMEDAY: remove once upgrades from 6.2 are no longer supported if (needsOldTxs) { cursors.push_back(peekLocal(dbgid, txsTag, begin, end, true, peekLocality)); } return makeReference(cursors, begin, end, false, false, canDiscardPopped); } std::vector> cursors; std::vector epochEnds; cursors.resize(2); std::vector> localCursors; std::vector> allCursors; for (int i = 0; i < maxTxsTags; i++) { localCursors.push_back(peekLocal(dbgid, Tag(tagLocalityTxs, i), begin, localEnd, true, peekLocality)); allCursors.push_back(peekAll(dbgid, localEnd, end, Tag(tagLocalityTxs, i), true)); } // SOMEDAY: remove once upgrades from 6.2 are no longer supported if (needsOldTxs) { localCursors.push_back(peekLocal(dbgid, txsTag, begin, localEnd, true, peekLocality)); allCursors.push_back(peekAll(dbgid, localEnd, end, txsTag, true)); } cursors[1] = makeReference(localCursors, begin, localEnd, false, false, canDiscardPopped); cursors[0] = makeReference(allCursors, localEnd, end, false, false, false); epochEnds.emplace_back(localEnd); return makeReference(cursors, epochEnds); } catch (Error& e) { if (e.code() == error_code_worker_removed) { std::vector> cursors; cursors.reserve(maxTxsTags); for (int i = 0; i < maxTxsTags; i++) { cursors.push_back(peekAll(dbgid, begin, end, Tag(tagLocalityTxs, i), true)); } // SOMEDAY: remove once upgrades from 6.2 are no longer supported if (needsOldTxs) { cursors.push_back(peekAll(dbgid, begin, end, txsTag, true)); } return makeReference(cursors, begin, end, false, false, canDiscardPopped); } throw; } } Reference TagPartitionedLogSystem::peekSingle(UID dbgid, Version begin, Tag tag, std::vector> history) { while (history.size() && begin >= history.back().first) { history.pop_back(); } if (history.size() == 0) { TraceEvent("TLogPeekSingleNoHistory", dbgid).detail("Tag", tag.toString()).detail("Begin", begin); return peekLocal(dbgid, tag, begin, getPeekEnd(), false); } else { std::vector> cursors; std::vector epochEnds; TraceEvent("TLogPeekSingleAddingLocal", dbgid).detail("Tag", tag.toString()).detail("Begin", history[0].first); cursors.push_back(peekLocal(dbgid, tag, history[0].first, getPeekEnd(), false)); for (int i = 0; i < history.size(); i++) { TraceEvent("TLogPeekSingleAddingOld", dbgid) .detail("Tag", tag.toString()) .detail("HistoryTag", history[i].second.toString()) .detail("Begin", i + 1 == history.size() ? begin : std::max(history[i + 1].first, begin)) .detail("End", history[i].first); cursors.push_back(peekLocal(dbgid, history[i].second, i + 1 == history.size() ? 
begin : std::max(history[i + 1].first, begin), history[i].first, false)); epochEnds.emplace_back(history[i].first); } return makeReference(cursors, epochEnds); } } Reference TagPartitionedLogSystem::peekLogRouter(UID dbgid, Version begin, Tag tag) { bool found = false; for (const auto& log : tLogs) { found = log->hasLogRouter(dbgid) || log->hasBackupWorker(dbgid); if (found) { break; } } if (found) { if (stopped) { std::vector> localSets; int bestPrimarySet = 0; int bestSatelliteSet = -1; for (auto& log : tLogs) { if (log->isLocal && log->logServers.size()) { TraceEvent("TLogPeekLogRouterLocalSet", dbgid) .detail("Tag", tag.toString()) .detail("Begin", begin) .detail("LogServers", log->logServerString()); localSets.push_back(log); if (log->locality == tagLocalitySatellite) { bestSatelliteSet = localSets.size() - 1; } else { bestPrimarySet = localSets.size() - 1; } } } int bestSet = bestPrimarySet; if (SERVER_KNOBS->LOG_ROUTER_PEEK_FROM_SATELLITES_PREFERRED && bestSatelliteSet != -1 && tLogs[bestSatelliteSet]->tLogVersion >= TLogVersion::V4) { bestSet = bestSatelliteSet; } TraceEvent("TLogPeekLogRouterSets", dbgid).detail("Tag", tag.toString()).detail("Begin", begin); // FIXME: do this merge on one of the logs in the other data center to avoid sending multiple copies // across the WAN return makeReference( localSets, bestSet, localSets[bestSet]->bestLocationFor(tag), tag, begin, getPeekEnd(), true); } else { int bestPrimarySet = -1; int bestSatelliteSet = -1; for (int i = 0; i < tLogs.size(); i++) { const auto& log = tLogs[i]; if (log->logServers.size() && log->isLocal) { if (log->locality == tagLocalitySatellite) { bestSatelliteSet = i; break; } else { if (bestPrimarySet == -1) bestPrimarySet = i; } } } int bestSet = bestPrimarySet; if (SERVER_KNOBS->LOG_ROUTER_PEEK_FROM_SATELLITES_PREFERRED && bestSatelliteSet != -1 && tLogs[bestSatelliteSet]->tLogVersion >= TLogVersion::V4) { bestSet = bestSatelliteSet; } const auto& log = tLogs[bestSet]; TraceEvent("TLogPeekLogRouterBestOnly", dbgid) .detail("Tag", tag.toString()) .detail("Begin", begin) .detail("LogId", log->logServers[log->bestLocationFor(tag)]->get().id()); return makeReference( log->logServers[log->bestLocationFor(tag)], tag, begin, getPeekEnd(), false, true); } } bool firstOld = true; for (const auto& old : oldLogData) { found = false; for (const auto& log : old.tLogs) { found = log->hasLogRouter(dbgid) || log->hasBackupWorker(dbgid); if (found) { break; } } if (found) { int bestPrimarySet = 0; int bestSatelliteSet = -1; std::vector> localSets; for (auto& log : old.tLogs) { if (log->isLocal && log->logServers.size()) { TraceEvent("TLogPeekLogRouterOldLocalSet", dbgid) .detail("Tag", tag.toString()) .detail("Begin", begin) .detail("LogServers", log->logServerString()); localSets.push_back(log); if (log->locality == tagLocalitySatellite) { bestSatelliteSet = localSets.size() - 1; } else { bestPrimarySet = localSets.size() - 1; } } } int bestSet = bestPrimarySet; if (SERVER_KNOBS->LOG_ROUTER_PEEK_FROM_SATELLITES_PREFERRED && bestSatelliteSet != -1 && old.tLogs[bestSatelliteSet]->tLogVersion >= TLogVersion::V4) { bestSet = bestSatelliteSet; } TraceEvent("TLogPeekLogRouterOldSets", dbgid) .detail("Tag", tag.toString()) .detail("Begin", begin) .detail("OldEpoch", old.epochEnd) .detail("RecoveredAt", recoveredAt.present() ? 
recoveredAt.get() : -1) .detail("FirstOld", firstOld); // FIXME: do this merge on one of the logs in the other data center to avoid sending multiple copies // across the WAN return makeReference(localSets, bestSet, localSets[bestSet]->bestLocationFor(tag), tag, begin, firstOld && recoveredAt.present() ? recoveredAt.get() + 1 : old.epochEnd, true); } firstOld = false; } return makeReference( Reference>>(), tag, begin, getPeekEnd(), false, false); } Version TagPartitionedLogSystem::getKnownCommittedVersion() { Version result = invalidVersion; for (auto& it : lockResults) { auto versions = TagPartitionedLogSystem::getDurableVersion(dbgid, it); if (versions.present()) { result = std::max(result, std::get<0>(versions.get())); } } return result; } Future TagPartitionedLogSystem::onKnownCommittedVersionChange() { std::vector> result; for (auto& it : lockResults) { result.push_back(TagPartitionedLogSystem::getDurableVersionChanged(it)); } if (!result.size()) { return Never(); } return waitForAny(result); } void TagPartitionedLogSystem::popLogRouter(Version upTo, Tag tag, Version durableKnownCommittedVersion, int8_t popLocality) { if (!upTo) return; Version lastGenerationStartVersion = TagPartitionedLogSystem::getMaxLocalStartVersion(tLogs); if (upTo >= lastGenerationStartVersion) { for (auto& t : tLogs) { if (t->locality == popLocality) { for (auto& log : t->logRouters) { Version prev = outstandingPops[std::make_pair(log->get().id(), tag)].first; if (prev < upTo) { outstandingPops[std::make_pair(log->get().id(), tag)] = std::make_pair(upTo, durableKnownCommittedVersion); } if (prev == 0) { popActors.add(popFromLog(this, log, tag, /*delayBeforePop=*/0.0, /*popLogRouter=*/true)); // Fast pop time because log routers can only // hold 5 seconds of data. } } } } } Version nextGenerationStartVersion = lastGenerationStartVersion; for (const auto& old : oldLogData) { Version generationStartVersion = TagPartitionedLogSystem::getMaxLocalStartVersion(old.tLogs); if (generationStartVersion <= upTo) { for (auto& t : old.tLogs) { if (t->locality == popLocality) { for (auto& log : t->logRouters) { auto logRouterIdTagPair = std::make_pair(log->get().id(), tag); // We pop the log router only if the popped version is within this generation's version range. // That is between the current generation's start version and the next generation's start // version. 
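                    // In other words, once logRouterLastPops for this router/tag pair has already reached the
                    // next generation's start version, this generation's routers hold nothing left to pop for
                    // the tag, and the check below skips issuing another pop request to them.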
if (logRouterLastPops.find(logRouterIdTagPair) == logRouterLastPops.end() || logRouterLastPops[logRouterIdTagPair] < nextGenerationStartVersion) { Version prev = outstandingPops[logRouterIdTagPair].first; if (prev < upTo) { outstandingPops[logRouterIdTagPair] = std::make_pair(upTo, durableKnownCommittedVersion); } if (prev == 0) { popActors.add( popFromLog(this, log, tag, /*delayBeforePop=*/0.0, /*popLogRouter=*/true)); } } } } } } nextGenerationStartVersion = generationStartVersion; } } void TagPartitionedLogSystem::popTxs(Version upTo, int8_t popLocality) { if (getTLogVersion() < TLogVersion::V4) { pop(upTo, txsTag, 0, popLocality); } else { for (int i = 0; i < txsTags; i++) { pop(upTo, Tag(tagLocalityTxs, i), 0, popLocality); } } } void TagPartitionedLogSystem::pop(Version upTo, Tag tag, Version durableKnownCommittedVersion, int8_t popLocality) { if (upTo <= 0) return; if (tag.locality == tagLocalityRemoteLog) { popLogRouter(upTo, tag, durableKnownCommittedVersion, popLocality); return; } for (auto& t : tLogs) { if (t->locality == tagLocalitySpecial || t->locality == tag.locality || tag.locality == tagLocalityUpgraded || (tag.locality < 0 && ((popLocality == tagLocalityInvalid) == t->isLocal))) { for (auto& log : t->logServers) { Version prev = outstandingPops[std::make_pair(log->get().id(), tag)].first; if (prev < upTo) { // update pop version for popFromLog actor outstandingPops[std::make_pair(log->get().id(), tag)] = std::make_pair(upTo, durableKnownCommittedVersion); } if (prev == 0) { // pop tag from log upto version defined in outstandingPops[].first popActors.add( popFromLog(this, log, tag, /*delayBeforePop*/ 1.0, /*popLogRouter=*/false)); //< FIXME: knob } } } } } ACTOR Future TagPartitionedLogSystem::popFromLog(TagPartitionedLogSystem* self, Reference>> log, Tag tag, double delayBeforePop, bool popLogRouter) { state Version last = 0; loop { wait(delay(delayBeforePop, TaskPriority::TLogPop)); // to: first is upto version, second is durableKnownComittedVersion state std::pair to = self->outstandingPops[std::make_pair(log->get().id(), tag)]; if (to.first <= last) { self->outstandingPops.erase(std::make_pair(log->get().id(), tag)); return Void(); } try { if (!log->get().present()) return Void(); wait(log->get().interf().popMessages.getReply(TLogPopRequest(to.first, to.second, tag), TaskPriority::TLogPop)); if (popLogRouter) { self->logRouterLastPops[std::make_pair(log->get().id(), tag)] = to.first; } last = to.first; } catch (Error& e) { if (e.code() == error_code_actor_cancelled) throw; TraceEvent((e.code() == error_code_broken_promise) ? SevInfo : SevError, "LogPopError", self->dbgid) .error(e) .detail("Log", log->get().id()); return Void(); // Leaving outstandingPops filled in means no further pop requests to this tlog from this // logSystem } } } ACTOR Future TagPartitionedLogSystem::getPoppedFromTLog( Reference>> log, Tag tag) { loop { choose { when(TLogPeekReply rep = wait(log->get().present() ? 
brokenPromiseToNever(log->get().interf().peekMessages.getReply( TLogPeekRequest(-1, tag, false, false))) : Never())) { ASSERT(rep.popped.present()); return rep.popped.get(); } when(wait(log->onChange())) {} } } } ACTOR Future TagPartitionedLogSystem::getPoppedTxs(TagPartitionedLogSystem* self) { state std::vector>> poppedFutures; state std::vector> poppedReady; if (self->tLogs.size()) { poppedFutures.push_back(std::vector>()); for (auto& it : self->tLogs) { for (auto& log : it->logServers) { poppedFutures.back().push_back(TagPartitionedLogSystem::getPoppedFromTLog( log, self->tLogs[0]->tLogVersion < TLogVersion::V4 ? txsTag : Tag(tagLocalityTxs, 0))); } } poppedReady.push_back(waitForAny(poppedFutures.back())); } for (auto& old : self->oldLogData) { if (old.tLogs.size()) { poppedFutures.push_back(std::vector>()); for (auto& it : old.tLogs) { for (auto& log : it->logServers) { poppedFutures.back().push_back(TagPartitionedLogSystem::getPoppedFromTLog( log, old.tLogs[0]->tLogVersion < TLogVersion::V4 ? txsTag : Tag(tagLocalityTxs, 0))); } } poppedReady.push_back(waitForAny(poppedFutures.back())); } } state UID dbgid = self->dbgid; state Future maxGetPoppedDuration = delay(SERVER_KNOBS->TXS_POPPED_MAX_DELAY); wait(waitForAll(poppedReady) || maxGetPoppedDuration); if (maxGetPoppedDuration.isReady()) { TraceEvent(SevWarnAlways, "PoppedTxsNotReady", dbgid).log(); } Version maxPopped = 1; for (auto& it : poppedFutures) { for (auto& v : it) { if (v.isReady()) { maxPopped = std::max(maxPopped, v.get()); } } } return maxPopped; } Future TagPartitionedLogSystem::getTxsPoppedVersion() { return getPoppedTxs(this); } ACTOR Future TagPartitionedLogSystem::confirmEpochLive_internal(Reference logSet, Optional debugID) { state std::vector> alive; int numPresent = 0; for (auto& t : logSet->logServers) { if (t->get().present()) { alive.push_back(brokenPromiseToNever(t->get().interf().confirmRunning.getReply( TLogConfirmRunningRequest(debugID), TaskPriority::TLogConfirmRunningReply))); numPresent++; } else { alive.push_back(Never()); } } wait(quorum(alive, std::min(logSet->tLogReplicationFactor, numPresent - logSet->tLogWriteAntiQuorum))); state std::vector aliveEntries; state std::vector responded(alive.size(), false); loop { for (int i = 0; i < alive.size(); i++) { if (!responded[i] && alive[i].isReady() && !alive[i].isError()) { aliveEntries.push_back(logSet->logEntryArray[i]); responded[i] = true; } } if (logSet->satisfiesPolicy(aliveEntries)) { return Void(); } // The current set of responders that we have weren't enough to form a quorum, so we must // wait for more responses and try again. std::vector> changes; for (int i = 0; i < alive.size(); i++) { if (!alive[i].isReady()) { changes.push_back(ready(alive[i])); } else if (alive[i].isReady() && alive[i].isError() && alive[i].getError().code() == error_code_tlog_stopped) { // All commits must go to all TLogs. If any TLog is stopped, then our epoch has ended. 
return Never(); } } ASSERT(changes.size() != 0); wait(waitForAny(changes)); } } Future TagPartitionedLogSystem::confirmEpochLive(Optional debugID) { std::vector> quorumResults; for (auto& it : tLogs) { if (it->isLocal && it->logServers.size()) { quorumResults.push_back(confirmEpochLive_internal(it, debugID)); } } return waitForAll(quorumResults); } Future TagPartitionedLogSystem::endEpoch() { std::vector> lockResults; for (auto& logSet : tLogs) { for (auto& log : logSet->logServers) { lockResults.push_back(success(lockTLog(dbgid, log))); } } return waitForAll(lockResults); } Future> TagPartitionedLogSystem::newEpoch( RecruitFromConfigurationReply const& recr, Future const& fRemoteWorkers, UID clusterId, DatabaseConfiguration const& config, LogEpoch recoveryCount, Version recoveryTransactionVersion, int8_t primaryLocality, int8_t remoteLocality, std::vector const& allTags, Reference> const& recruitmentStalled) { return newEpoch(Reference::addRef(this), recr, fRemoteWorkers, clusterId, config, recoveryCount, recoveryTransactionVersion, primaryLocality, remoteLocality, allTags, recruitmentStalled); } LogSystemConfig TagPartitionedLogSystem::getLogSystemConfig() const { LogSystemConfig logSystemConfig(epoch); logSystemConfig.logSystemType = logSystemType; logSystemConfig.expectedLogSets = expectedLogSets; logSystemConfig.logRouterTags = logRouterTags; logSystemConfig.txsTags = txsTags; logSystemConfig.recruitmentID = recruitmentID; logSystemConfig.stopped = stopped; logSystemConfig.recoveredAt = recoveredAt; logSystemConfig.pseudoLocalities = pseudoLocalities; logSystemConfig.oldestBackupEpoch = oldestBackupEpoch; for (const Reference& logSet : tLogs) { if (logSet->isLocal || remoteLogsWrittenToCoreState) { logSystemConfig.tLogs.emplace_back(*logSet); } } if (!recoveryCompleteWrittenToCoreState.get()) { for (const auto& oldData : oldLogData) { logSystemConfig.oldTLogs.emplace_back(oldData); } } return logSystemConfig; } Standalone TagPartitionedLogSystem::getLogsValue() const { std::vector> logs; std::vector> oldLogs; for (auto& t : tLogs) { if (t->isLocal || remoteLogsWrittenToCoreState) { for (int i = 0; i < t->logServers.size(); i++) { logs.emplace_back(t->logServers[i]->get().id(), t->logServers[i]->get().present() ? t->logServers[i]->get().interf().address() : NetworkAddress()); } } } if (!recoveryCompleteWrittenToCoreState.get()) { for (int i = 0; i < oldLogData.size(); i++) { for (auto& t : oldLogData[i].tLogs) { for (int j = 0; j < t->logServers.size(); j++) { oldLogs.emplace_back(t->logServers[j]->get().id(), t->logServers[j]->get().present() ? 
t->logServers[j]->get().interf().address() : NetworkAddress()); } } } } return logsValue(logs, oldLogs); } Future TagPartitionedLogSystem::onLogSystemConfigChange() { std::vector> changes; changes.push_back(logSystemConfigChanged.onTrigger()); for (auto& t : tLogs) { for (int i = 0; i < t->logServers.size(); i++) { changes.push_back(t->logServers[i]->onChange()); } } for (int i = 0; i < oldLogData.size(); i++) { for (auto& t : oldLogData[i].tLogs) { for (int j = 0; j < t->logServers.size(); j++) { changes.push_back(t->logServers[j]->onChange()); } } } if (hasRemoteServers && !remoteRecovery.isReady()) { changes.push_back(remoteRecovery); } return waitForAny(changes); } Version TagPartitionedLogSystem::getEnd() const { ASSERT(recoverAt.present()); return recoverAt.get() + 1; } Version TagPartitionedLogSystem::getPeekEnd() const { if (recoverAt.present()) return getEnd(); else return std::numeric_limits::max(); } void TagPartitionedLogSystem::getPushLocations(VectorRef tags, std::vector& locations, bool allLocations) const { int locationOffset = 0; for (auto& log : tLogs) { if (log->isLocal && log->logServers.size()) { log->getPushLocations(tags, locations, locationOffset, allLocations); locationOffset += log->logServers.size(); } } } bool TagPartitionedLogSystem::hasRemoteLogs() const { return logRouterTags > 0 || pseudoLocalities.size() > 0; } Tag TagPartitionedLogSystem::getRandomRouterTag() const { return Tag(tagLocalityLogRouter, deterministicRandom()->randomInt(0, logRouterTags)); } Tag TagPartitionedLogSystem::getRandomTxsTag() const { return Tag(tagLocalityTxs, deterministicRandom()->randomInt(0, txsTags)); } TLogVersion TagPartitionedLogSystem::getTLogVersion() const { return tLogs[0]->tLogVersion; } int TagPartitionedLogSystem::getLogRouterTags() const { return logRouterTags; } Version TagPartitionedLogSystem::getBackupStartVersion() const { ASSERT(tLogs.size() > 0); return backupStartVersion; } std::map TagPartitionedLogSystem::getOldEpochTagsVersionsInfo() const { std::map epochInfos; for (const auto& old : oldLogData) { epochInfos.insert( { old.epoch, ILogSystem::EpochTagsVersionsInfo(old.logRouterTags, old.epochBegin, old.epochEnd) }); TraceEvent("OldEpochTagsVersions", dbgid) .detail("Epoch", old.epoch) .detail("Tags", old.logRouterTags) .detail("BeginVersion", old.epochBegin) .detail("EndVersion", old.epochEnd); } return epochInfos; } inline Reference TagPartitionedLogSystem::getEpochLogSet(LogEpoch epoch) const { for (const auto& old : oldLogData) { if (epoch == old.epoch) return old.tLogs[0]; } return Reference(nullptr); } void TagPartitionedLogSystem::setBackupWorkers(const std::vector& replies) { ASSERT(tLogs.size() > 0); Reference logset = tLogs[0]; // Master recruits this epoch's worker first. 
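    // Each reply below is attached to the log set of its backupEpoch: workers recruited for an older epoch
    // are appended to that epoch's LogSet (found via getEpochLogSet()), and oldestBackupEpoch is lowered to
    // the smallest epoch that still has backup workers attached.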
LogEpoch logsetEpoch = this->epoch; oldestBackupEpoch = this->epoch; for (const auto& reply : replies) { if (removedBackupWorkers.count(reply.interf.id()) > 0) { removedBackupWorkers.erase(reply.interf.id()); continue; } auto worker = makeReference>>( OptionalInterface(reply.interf)); if (reply.backupEpoch != logsetEpoch) { // find the logset from oldLogData logsetEpoch = reply.backupEpoch; oldestBackupEpoch = std::min(oldestBackupEpoch, logsetEpoch); logset = getEpochLogSet(logsetEpoch); ASSERT(logset.isValid()); } logset->backupWorkers.push_back(worker); TraceEvent("AddBackupWorker", dbgid).detail("Epoch", logsetEpoch).detail("BackupWorkerID", reply.interf.id()); } TraceEvent("SetOldestBackupEpoch", dbgid).detail("Epoch", oldestBackupEpoch); backupWorkerChanged.trigger(); } bool TagPartitionedLogSystem::removeBackupWorker(const BackupWorkerDoneRequest& req) { bool removed = false; Reference logset = getEpochLogSet(req.backupEpoch); if (logset.isValid()) { for (auto it = logset->backupWorkers.begin(); it != logset->backupWorkers.end(); it++) { if (it->getPtr()->get().interf().id() == req.workerUID) { logset->backupWorkers.erase(it); removed = true; break; } } } if (removed) { oldestBackupEpoch = epoch; for (const auto& old : oldLogData) { if (old.epoch < oldestBackupEpoch && old.tLogs[0]->backupWorkers.size() > 0) { oldestBackupEpoch = old.epoch; } } backupWorkerChanged.trigger(); } else { removedBackupWorkers.insert(req.workerUID); } TraceEvent("RemoveBackupWorker", dbgid) .detail("Removed", removed) .detail("BackupEpoch", req.backupEpoch) .detail("WorkerID", req.workerUID) .detail("OldestBackupEpoch", oldestBackupEpoch); return removed; } LogEpoch TagPartitionedLogSystem::getOldestBackupEpoch() const { return oldestBackupEpoch; } void TagPartitionedLogSystem::setOldestBackupEpoch(LogEpoch epoch) { oldestBackupEpoch = epoch; backupWorkerChanged.trigger(); } ACTOR Future TagPartitionedLogSystem::monitorLog(Reference>> logServer, Reference> failed) { state Future waitFailure; loop { if (logServer->get().present()) waitFailure = waitFailureTracker(logServer->get().interf().waitFailure, failed); else failed->set(true); wait(logServer->onChange()); } } Optional>> TagPartitionedLogSystem::getDurableVersion( UID dbgid, LogLockInfo lockInfo, std::vector>> failed, Optional lastEnd) { Reference logSet = lockInfo.logSet; // To ensure consistent recovery, the number of servers NOT in the write quorum plus the number of servers NOT // in the read quorum have to be strictly less than the replication factor. Otherwise there could be a replica // set consistent entirely of servers that are out of date due to not being in the write quorum or unavailable // due to not being in the read quorum. So with N = # of tlogs, W = antiquorum, R = required count, F = // replication factor, W + (N - R) < F, and optimally (N-W)+(N-R)=F-1. Thus R=N+1-F+W. 
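    // Worked example of the formula above: with N = 5 tLogs, replication factor F = 3, and write
    // anti-quorum W = 1, requiredCount is R = N + 1 - F + W = 4; the tLogs outside the write quorum
    // (W = 1) plus those outside the read quorum (N - R = 1) total 2 < F = 3, so no replica set can
    // consist entirely of out-of-date or unresponsive tLogs.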
int requiredCount = (int)logSet->logServers.size() + 1 - logSet->tLogReplicationFactor + logSet->tLogWriteAntiQuorum; ASSERT(requiredCount > 0 && requiredCount <= logSet->logServers.size()); ASSERT(logSet->tLogReplicationFactor >= 1 && logSet->tLogReplicationFactor <= logSet->logServers.size()); ASSERT(logSet->tLogWriteAntiQuorum >= 0 && logSet->tLogWriteAntiQuorum < logSet->logServers.size()); std::vector availableItems, badCombo; std::vector results; std::string sServerState; LocalityGroup unResponsiveSet; for (int t = 0; t < logSet->logServers.size(); t++) { if (lockInfo.replies[t].isReady() && !lockInfo.replies[t].isError() && (!failed.size() || !failed[t]->get())) { results.push_back(lockInfo.replies[t].get()); availableItems.push_back(logSet->tLogLocalities[t]); sServerState += 'a'; } else { unResponsiveSet.add(logSet->tLogLocalities[t]); sServerState += 'f'; } } // Check if the list of results is not larger than the anti quorum bool bTooManyFailures = (results.size() <= logSet->tLogWriteAntiQuorum); // Check if failed logs complete the policy bTooManyFailures = bTooManyFailures || ((unResponsiveSet.size() >= logSet->tLogReplicationFactor) && (unResponsiveSet.validate(logSet->tLogPolicy))); // Check all combinations of the AntiQuorum within the failed if (!bTooManyFailures && (logSet->tLogWriteAntiQuorum) && (!validateAllCombinations( badCombo, unResponsiveSet, logSet->tLogPolicy, availableItems, logSet->tLogWriteAntiQuorum, false))) { TraceEvent("EpochEndBadCombo", dbgid) .detail("Required", requiredCount) .detail("Present", results.size()) .detail("ServerState", sServerState); bTooManyFailures = true; } ASSERT(logSet->logServers.size() == lockInfo.replies.size()); if (!bTooManyFailures) { std::sort(results.begin(), results.end(), [](const TLogLockResult& a, const TLogLockResult& b) -> bool { return a.end < b.end; }); int absent = logSet->logServers.size() - results.size(); int safe_range_begin = logSet->tLogWriteAntiQuorum; int new_safe_range_begin = std::min(logSet->tLogWriteAntiQuorum, (int)(results.size() - 1)); int safe_range_end = std::max(logSet->tLogReplicationFactor - absent, 1); if (!lastEnd.present() || ((safe_range_end > 0) && (safe_range_end - 1 < results.size()) && results[safe_range_end - 1].end < lastEnd.get())) { Version knownCommittedVersion = 0; for (int i = 0; i < results.size(); i++) { knownCommittedVersion = std::max(knownCommittedVersion, results[i].knownCommittedVersion); } if (knownCommittedVersion > results[new_safe_range_begin].end) { knownCommittedVersion = results[new_safe_range_begin].end; } TraceEvent("GetDurableResult", dbgid) .detail("Required", requiredCount) .detail("Present", results.size()) .detail("Anti", logSet->tLogWriteAntiQuorum) .detail("ServerState", sServerState) .detail("RecoveryVersion", ((safe_range_end > 0) && (safe_range_end - 1 < results.size())) ? 
results[safe_range_end - 1].end : -1) .detail("EndVersion", results[new_safe_range_begin].end) .detail("SafeBegin", safe_range_begin) .detail("SafeEnd", safe_range_end) .detail("NewSafeBegin", new_safe_range_begin) .detail("KnownCommittedVersion", knownCommittedVersion) .detail("EpochEnd", lockInfo.epochEnd); return std::make_tuple(knownCommittedVersion, results[new_safe_range_begin].end, results); } } TraceEvent("GetDurableResultWaiting", dbgid) .detail("Required", requiredCount) .detail("Present", results.size()) .detail("ServerState", sServerState); return Optional>>(); } ACTOR Future TagPartitionedLogSystem::getDurableVersionChanged(LogLockInfo lockInfo, std::vector>> failed) { // Wait for anything relevant to change std::vector> changes; for (int j = 0; j < lockInfo.logSet->logServers.size(); j++) { if (!lockInfo.replies[j].isReady()) changes.push_back(ready(lockInfo.replies[j])); else { changes.push_back(lockInfo.logSet->logServers[j]->onChange()); if (failed.size()) { changes.push_back(failed[j]->onChange()); } } } ASSERT(changes.size()); wait(waitForAny(changes)); return Void(); } // If VERSION_VECTOR_UNICAST is enabled, one tLog's DV may advance beyond the min(DV) over all tLogs. // This function finds the highest recoverable version for each tLog group over all log groups. // All prior versions to the chosen RV must also be recoverable. // TODO: unit tests to stress UNICAST Version getRecoverVersionUnicast(std::vector>>& logGroupResults, Version minEnd) { Version minLogGroup = std::numeric_limits::max(); for (auto& logGroupResult : logGroupResults) { std::unordered_map versionRepCount; std::map versionTLogCount; int replicationFactor = std::get<0>(logGroupResult); for (auto& tLogResult : std::get<1>(logGroupResult)) { bool logGroupCandidate = false; for (auto& unknownCommittedVersion : tLogResult.unknownCommittedVersions) { Version k = std::get<0>(unknownCommittedVersion); if (k > minEnd) { versionRepCount[k]++; versionTLogCount[k] = std::get<1>(unknownCommittedVersion); logGroupCandidate = true; } } if (!logGroupCandidate) { return minEnd; } } Version minTLogs = minEnd; for (auto const& [version, tLogCount] : versionTLogCount) { if (versionRepCount[version] >= tLogCount - replicationFactor + 1) { minTLogs = version; } else { break; } } minLogGroup = std::min(minLogGroup, minTLogs); } return minLogGroup; } ACTOR Future TagPartitionedLogSystem::epochEnd(Reference>> outLogSystem, UID dbgid, DBCoreState prevState, FutureStream rejoinRequests, LocalityData locality, bool* forceRecovery) { // Stops a co-quorum of tlogs so that no further versions can be committed until the DBCoreState coordination // state is changed Creates a new logSystem representing the (now frozen) epoch No other important side effects. 
// The writeQuorum in the master info is from the previous configuration if (!prevState.tLogs.size()) { // This is a brand new database auto logSystem = makeReference(dbgid, locality, 0); logSystem->logSystemType = prevState.logSystemType; logSystem->recoverAt = 0; logSystem->knownCommittedVersion = 0; logSystem->stopped = true; outLogSystem->set(logSystem); wait(Future(Never())); throw internal_error(); } if (*forceRecovery) { DBCoreState modifiedState = prevState; int8_t primaryLocality = -1; for (auto& coreSet : modifiedState.tLogs) { if (coreSet.isLocal && coreSet.locality >= 0 && coreSet.tLogLocalities[0].dcId() != locality.dcId()) { primaryLocality = coreSet.locality; break; } } bool foundRemote = false; int8_t remoteLocality = -1; int modifiedLogSets = 0; int removedLogSets = 0; if (primaryLocality >= 0) { auto copiedLogs = modifiedState.tLogs; for (auto& coreSet : copiedLogs) { if (coreSet.locality != primaryLocality && coreSet.locality >= 0) { foundRemote = true; remoteLocality = coreSet.locality; modifiedState.tLogs.clear(); modifiedState.tLogs.push_back(coreSet); modifiedState.tLogs[0].isLocal = true; modifiedState.logRouterTags = 0; modifiedLogSets++; break; } } while (!foundRemote && modifiedState.oldTLogData.size()) { for (auto& coreSet : modifiedState.oldTLogData[0].tLogs) { if (coreSet.locality != primaryLocality && coreSet.locality >= tagLocalitySpecial) { foundRemote = true; remoteLocality = coreSet.locality; modifiedState.tLogs.clear(); modifiedState.tLogs.push_back(coreSet); modifiedState.tLogs[0].isLocal = true; modifiedState.logRouterTags = 0; modifiedState.txsTags = modifiedState.oldTLogData[0].txsTags; modifiedLogSets++; break; } } modifiedState.oldTLogData.erase(modifiedState.oldTLogData.begin()); removedLogSets++; } if (foundRemote) { for (int i = 0; i < modifiedState.oldTLogData.size(); i++) { bool found = false; auto copiedLogs = modifiedState.oldTLogData[i].tLogs; for (auto& coreSet : copiedLogs) { if (coreSet.locality == remoteLocality || coreSet.locality == tagLocalitySpecial) { found = true; if (!coreSet.isLocal || copiedLogs.size() > 1) { modifiedState.oldTLogData[i].tLogs.clear(); modifiedState.oldTLogData[i].tLogs.push_back(coreSet); modifiedState.oldTLogData[i].tLogs[0].isLocal = true; modifiedState.oldTLogData[i].logRouterTags = 0; modifiedState.oldTLogData[i].epochBegin = modifiedState.oldTLogData[i].tLogs[0].startVersion; modifiedState.oldTLogData[i].epochEnd = (i == 0 ? 
modifiedState.tLogs[0].startVersion : modifiedState.oldTLogData[i - 1].tLogs[0].startVersion); modifiedLogSets++; } break; } } if (!found) { modifiedState.oldTLogData.erase(modifiedState.oldTLogData.begin() + i); removedLogSets++; i--; } } prevState = modifiedState; } else { *forceRecovery = false; } } else { *forceRecovery = false; } TraceEvent(SevWarnAlways, "ForcedRecovery", dbgid) .detail("PrimaryLocality", primaryLocality) .detail("RemoteLocality", remoteLocality) .detail("FoundRemote", foundRemote) .detail("Modified", modifiedLogSets) .detail("Removed", removedLogSets); for (int i = 0; i < prevState.tLogs.size(); i++) { TraceEvent("ForcedRecoveryTLogs", dbgid) .detail("I", i) .detail("Log", ::describe(prevState.tLogs[i].tLogs)) .detail("Loc", prevState.tLogs[i].locality) .detail("Txs", prevState.txsTags); } for (int i = 0; i < prevState.oldTLogData.size(); i++) { for (int j = 0; j < prevState.oldTLogData[i].tLogs.size(); j++) { TraceEvent("ForcedRecoveryTLogs", dbgid) .detail("I", i) .detail("J", j) .detail("Log", ::describe(prevState.oldTLogData[i].tLogs[j].tLogs)) .detail("Loc", prevState.oldTLogData[i].tLogs[j].locality) .detail("Txs", prevState.oldTLogData[i].txsTags); } } } CODE_PROBE(true, "Master recovery from pre-existing database"); // trackRejoins listens for rejoin requests from the tLogs that we are recovering from, to learn their // TLogInterfaces state std::vector lockResults; state std::vector>>, Reference>> allLogServers; state std::vector> logServers; state std::vector oldLogData; state std::vector>>> logFailed; state std::vector> failureTrackers; for (const CoreTLogSet& coreSet : prevState.tLogs) { logServers.push_back(makeReference(coreSet)); std::vector>> failed; for (const auto& logVar : logServers.back()->logServers) { allLogServers.emplace_back(logVar, coreSet.tLogPolicy); failed.push_back(makeReference>()); failureTrackers.push_back(TagPartitionedLogSystem::monitorLog(logVar, failed.back())); } logFailed.push_back(failed); } for (const auto& oldTlogData : prevState.oldTLogData) { oldLogData.emplace_back(oldTlogData); for (const auto& logSet : oldLogData.back().tLogs) { for (const auto& logVar : logSet->logServers) { allLogServers.emplace_back(logVar, logSet->tLogPolicy); } } } state Future rejoins = TagPartitionedLogSystem::trackRejoins(dbgid, allLogServers, rejoinRequests); lockResults.resize(logServers.size()); std::set lockedLocalities; bool foundSpecial = false; for (int i = 0; i < logServers.size(); i++) { if (logServers[i]->locality == tagLocalitySpecial || logServers[i]->locality == tagLocalityUpgraded) { foundSpecial = true; } lockedLocalities.insert(logServers[i]->locality); lockResults[i].isCurrent = true; lockResults[i].logSet = logServers[i]; for (int t = 0; t < logServers[i]->logServers.size(); t++) { lockResults[i].replies.push_back(TagPartitionedLogSystem::lockTLog(dbgid, logServers[i]->logServers[t])); } } for (auto& old : oldLogData) { if (foundSpecial) { break; } for (auto& log : old.tLogs) { if (log->locality == tagLocalitySpecial || log->locality == tagLocalityUpgraded) { foundSpecial = true; break; } if (!lockedLocalities.count(log->locality)) { TraceEvent("EpochEndLockExtra").detail("Locality", log->locality); CODE_PROBE(true, "locking old generations for version information"); lockedLocalities.insert(log->locality); LogLockInfo lockResult; lockResult.epochEnd = old.epochEnd; lockResult.logSet = log; for (int t = 0; t < log->logServers.size(); t++) { lockResult.replies.push_back(TagPartitionedLogSystem::lockTLog(dbgid, 
log->logServers[t])); } lockResults.push_back(lockResult); } } } if (*forceRecovery) { state std::vector allLockResults; ASSERT(lockResults.size() == 1); allLockResults.push_back(lockResults[0]); for (auto& old : oldLogData) { ASSERT(old.tLogs.size() == 1); LogLockInfo lockResult; lockResult.epochEnd = old.epochEnd; lockResult.logSet = old.tLogs[0]; for (int t = 0; t < old.tLogs[0]->logServers.size(); t++) { lockResult.replies.push_back(TagPartitionedLogSystem::lockTLog(dbgid, old.tLogs[0]->logServers[t])); } allLockResults.push_back(lockResult); } state int lockNum = 0; state Version maxRecoveryVersion = 0; state int maxRecoveryIndex = 0; while (lockNum < allLockResults.size()) { auto versions = TagPartitionedLogSystem::getDurableVersion(dbgid, allLockResults[lockNum]); if (versions.present()) { if (std::get<1>(versions.get()) > maxRecoveryVersion) { TraceEvent("HigherRecoveryVersion", dbgid) .detail("Idx", lockNum) .detail("Ver", std::get<1>(versions.get())); maxRecoveryVersion = std::get<1>(versions.get()); maxRecoveryIndex = lockNum; } lockNum++; } else { wait(TagPartitionedLogSystem::getDurableVersionChanged(allLockResults[lockNum])); } } if (maxRecoveryIndex > 0) { logServers = oldLogData[maxRecoveryIndex - 1].tLogs; prevState.txsTags = oldLogData[maxRecoveryIndex - 1].txsTags; lockResults[0] = allLockResults[maxRecoveryIndex]; lockResults[0].isCurrent = true; std::vector>> failed; for (auto& log : logServers[0]->logServers) { failed.push_back(makeReference>()); failureTrackers.push_back(TagPartitionedLogSystem::monitorLog(log, failed.back())); } ASSERT(logFailed.size() == 1); logFailed[0] = failed; oldLogData.erase(oldLogData.begin(), oldLogData.begin() + maxRecoveryIndex); } } state Optional lastEnd; state Version knownCommittedVersion = 0; loop { Version minEnd = std::numeric_limits::max(); Version maxEnd = 0; std::vector> changes; std::vector>> logGroupResults; for (int log = 0; log < logServers.size(); log++) { if (!logServers[log]->isLocal) { continue; } auto versions = TagPartitionedLogSystem::getDurableVersion(dbgid, lockResults[log], logFailed[log], lastEnd); if (versions.present()) { knownCommittedVersion = std::max(knownCommittedVersion, std::get<0>(versions.get())); logGroupResults.emplace_back(logServers[log]->tLogReplicationFactor, std::get<2>(versions.get())); maxEnd = std::max(maxEnd, std::get<1>(versions.get())); minEnd = std::min(minEnd, std::get<1>(versions.get())); } changes.push_back(TagPartitionedLogSystem::getDurableVersionChanged(lockResults[log], logFailed[log])); } if (maxEnd > 0 && (!lastEnd.present() || maxEnd < lastEnd.get())) { CODE_PROBE(lastEnd.present(), "Restarting recovery at an earlier point"); auto logSystem = makeReference(dbgid, locality, prevState.recoveryCount); logSystem->recoverAt = minEnd; if (SERVER_KNOBS->ENABLE_VERSION_VECTOR_TLOG_UNICAST) { logSystem->recoverAt = getRecoverVersionUnicast(logGroupResults, minEnd); TraceEvent("RecoveryVersionInfo").detail("RecoverAt", logSystem->recoverAt); } lastEnd = minEnd; logSystem->tLogs = logServers; logSystem->logRouterTags = prevState.logRouterTags; logSystem->txsTags = prevState.txsTags; logSystem->oldLogData = oldLogData; logSystem->logSystemType = prevState.logSystemType; logSystem->rejoins = rejoins; logSystem->lockResults = lockResults; if (knownCommittedVersion > minEnd) { knownCommittedVersion = minEnd; } logSystem->knownCommittedVersion = knownCommittedVersion; TraceEvent(SevDebug, "FinalRecoveryVersionInfo") .detail("KCV", knownCommittedVersion) .detail("MinEnd", minEnd); 
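// The log system constructed here represents the previous, now-locked generation: it is marked stopped and
// published through outLogSystem so the rest of recovery can read the old epoch's data up to recoverAt.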
logSystem->remoteLogsWrittenToCoreState = true; logSystem->stopped = true; logSystem->pseudoLocalities = prevState.pseudoLocalities; outLogSystem->set(logSystem); } wait(waitForAny(changes)); } } ACTOR Future TagPartitionedLogSystem::recruitOldLogRouters(TagPartitionedLogSystem* self, std::vector workers, LogEpoch recoveryCount, int8_t locality, Version startVersion, std::vector tLogLocalities, Reference tLogPolicy, bool forRemote) { state std::vector>> logRouterInitializationReplies; state std::vector> allReplies; int nextRouter = 0; state Version lastStart = std::numeric_limits::max(); if (!forRemote) { Version maxStart = TagPartitionedLogSystem::getMaxLocalStartVersion(self->tLogs); lastStart = std::max(startVersion, maxStart); if (self->logRouterTags == 0) { ASSERT_WE_THINK(false); self->logSystemConfigChanged.trigger(); return Void(); } bool found = false; for (auto& tLogs : self->tLogs) { if (tLogs->locality == locality) { found = true; } tLogs->logRouters.clear(); } if (!found) { TraceEvent("RecruitingOldLogRoutersAddingLocality") .detail("Locality", locality) .detail("LastStart", lastStart); auto newLogSet = makeReference(); newLogSet->locality = locality; newLogSet->startVersion = lastStart; newLogSet->isLocal = false; self->tLogs.push_back(newLogSet); } for (auto& tLogs : self->tLogs) { // Recruit log routers for old generations of the primary locality if (tLogs->locality == locality) { logRouterInitializationReplies.emplace_back(); for (int i = 0; i < self->logRouterTags; i++) { InitializeLogRouterRequest req; req.recoveryCount = recoveryCount; req.routerTag = Tag(tagLocalityLogRouter, i); req.startVersion = lastStart; req.tLogLocalities = tLogLocalities; req.tLogPolicy = tLogPolicy; req.locality = locality; auto reply = transformErrors( throwErrorOr(workers[nextRouter].logRouter.getReplyUnlessFailedFor( req, SERVER_KNOBS->TLOG_TIMEOUT, SERVER_KNOBS->MASTER_FAILURE_SLOPE_DURING_RECOVERY)), cluster_recovery_failed()); logRouterInitializationReplies.back().push_back(reply); allReplies.push_back(reply); nextRouter = (nextRouter + 1) % workers.size(); } } } } for (auto& old : self->oldLogData) { Version maxStart = TagPartitionedLogSystem::getMaxLocalStartVersion(old.tLogs); if (old.logRouterTags == 0 || maxStart >= lastStart) { break; } lastStart = std::max(startVersion, maxStart); bool found = false; for (auto& tLogs : old.tLogs) { if (tLogs->locality == locality) { found = true; } tLogs->logRouters.clear(); } if (!found) { TraceEvent("RecruitingOldLogRoutersAddingLocality") .detail("Locality", locality) .detail("LastStart", lastStart); auto newLogSet = makeReference(); newLogSet->locality = locality; newLogSet->startVersion = lastStart; old.tLogs.push_back(newLogSet); } for (auto& tLogs : old.tLogs) { // Recruit log routers for old generations of the primary locality if (tLogs->locality == locality) { logRouterInitializationReplies.emplace_back(); for (int i = 0; i < old.logRouterTags; i++) { InitializeLogRouterRequest req; req.recoveryCount = recoveryCount; req.routerTag = Tag(tagLocalityLogRouter, i); req.startVersion = lastStart; req.tLogLocalities = tLogLocalities; req.tLogPolicy = tLogPolicy; req.locality = locality; auto reply = transformErrors( throwErrorOr(workers[nextRouter].logRouter.getReplyUnlessFailedFor( req, SERVER_KNOBS->TLOG_TIMEOUT, SERVER_KNOBS->MASTER_FAILURE_SLOPE_DURING_RECOVERY)), cluster_recovery_failed()); logRouterInitializationReplies.back().push_back(reply); allReplies.push_back(reply); nextRouter = (nextRouter + 1) % workers.size(); } } } } 
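// Wait for every recruited log router (current and old generations) to reply before wiring the returned
// interfaces into the corresponding log sets below.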
wait(waitForAll(allReplies)); int nextReplies = 0; lastStart = std::numeric_limits::max(); std::vector> failed; if (!forRemote) { Version maxStart = TagPartitionedLogSystem::getMaxLocalStartVersion(self->tLogs); lastStart = std::max(startVersion, maxStart); for (auto& tLogs : self->tLogs) { if (tLogs->locality == locality) { for (int i = 0; i < logRouterInitializationReplies[nextReplies].size(); i++) { tLogs->logRouters.push_back(makeReference>>( OptionalInterface(logRouterInitializationReplies[nextReplies][i].get()))); failed.push_back( waitFailureClient(logRouterInitializationReplies[nextReplies][i].get().waitFailure, SERVER_KNOBS->TLOG_TIMEOUT, -SERVER_KNOBS->TLOG_TIMEOUT / SERVER_KNOBS->SECONDS_BEFORE_NO_FAILURE_DELAY, /*trace=*/true)); } nextReplies++; } } } for (auto& old : self->oldLogData) { Version maxStart = TagPartitionedLogSystem::getMaxLocalStartVersion(old.tLogs); if (old.logRouterTags == 0 || maxStart >= lastStart) { break; } lastStart = std::max(startVersion, maxStart); for (auto& tLogs : old.tLogs) { if (tLogs->locality == locality) { for (int i = 0; i < logRouterInitializationReplies[nextReplies].size(); i++) { tLogs->logRouters.push_back(makeReference>>( OptionalInterface(logRouterInitializationReplies[nextReplies][i].get()))); if (!forRemote) { failed.push_back(waitFailureClient( logRouterInitializationReplies[nextReplies][i].get().waitFailure, SERVER_KNOBS->TLOG_TIMEOUT, -SERVER_KNOBS->TLOG_TIMEOUT / SERVER_KNOBS->SECONDS_BEFORE_NO_FAILURE_DELAY, /*trace=*/true)); } } nextReplies++; } } } if (!forRemote) { self->logSystemConfigChanged.trigger(); wait(failed.size() ? tagError(quorum(failed, 1), tlog_failed()) : Future(Never())); throw internal_error(); } return Void(); } Version TagPartitionedLogSystem::getMaxLocalStartVersion(const std::vector>& tLogs) { Version maxStart = 0; for (const auto& logSet : tLogs) { if (logSet->isLocal) { maxStart = std::max(maxStart, logSet->startVersion); } } return maxStart; } std::vector TagPartitionedLogSystem::getLocalTags(int8_t locality, const std::vector& allTags) { std::vector localTags; for (const auto& tag : allTags) { if (locality == tagLocalitySpecial || locality == tag.locality || tag.locality < 0) { localTags.push_back(tag); } } return localTags; } ACTOR Future TagPartitionedLogSystem::newRemoteEpoch(TagPartitionedLogSystem* self, Reference oldLogSystem, Future fRemoteWorkers, UID clusterId, DatabaseConfiguration configuration, LogEpoch recoveryCount, Version recoveryTransactionVersion, int8_t remoteLocality, std::vector allTags) { TraceEvent("RemoteLogRecruitment_WaitingForWorkers").log(); state RecruitRemoteFromConfigurationReply remoteWorkers = wait(fRemoteWorkers); state Reference logSet(new LogSet()); logSet->tLogReplicationFactor = configuration.getRemoteTLogReplicationFactor(); logSet->tLogVersion = configuration.tLogVersion; logSet->tLogPolicy = configuration.getRemoteTLogPolicy(); logSet->isLocal = false; logSet->locality = remoteLocality; logSet->startVersion = oldLogSystem->knownCommittedVersion + 1; state int lockNum = 0; while (lockNum < oldLogSystem->lockResults.size()) { if (oldLogSystem->lockResults[lockNum].logSet->locality == remoteLocality) { loop { auto versions = TagPartitionedLogSystem::getDurableVersion(self->dbgid, oldLogSystem->lockResults[lockNum]); if (versions.present()) { logSet->startVersion = std::min(std::min(std::get<0>(versions.get()) + 1, oldLogSystem->lockResults[lockNum].epochEnd), logSet->startVersion); break; } 
wait(TagPartitionedLogSystem::getDurableVersionChanged(oldLogSystem->lockResults[lockNum])); } break; } lockNum++; } std::vector localities; localities.resize(remoteWorkers.remoteTLogs.size()); for (int i = 0; i < remoteWorkers.remoteTLogs.size(); i++) { localities[i] = remoteWorkers.remoteTLogs[i].locality; } state Future oldRouterRecruitment = Void(); if (logSet->startVersion < oldLogSystem->knownCommittedVersion + 1) { ASSERT(oldLogSystem->logRouterTags > 0); oldRouterRecruitment = TagPartitionedLogSystem::recruitOldLogRouters(self, remoteWorkers.logRouters, recoveryCount, remoteLocality, logSet->startVersion, localities, logSet->tLogPolicy, true); } state std::vector> logRouterInitializationReplies; const Version startVersion = oldLogSystem->logRouterTags == 0 ? oldLogSystem->recoverAt.get() + 1 : std::max(self->tLogs[0]->startVersion, logSet->startVersion); for (int i = 0; i < self->logRouterTags; i++) { InitializeLogRouterRequest req; req.recoveryCount = recoveryCount; req.routerTag = Tag(tagLocalityLogRouter, i); req.startVersion = startVersion; req.tLogLocalities = localities; req.tLogPolicy = logSet->tLogPolicy; req.locality = remoteLocality; TraceEvent("RemoteTLogRouterReplies", self->dbgid) .detail("WorkerID", remoteWorkers.logRouters[i % remoteWorkers.logRouters.size()].id()); logRouterInitializationReplies.push_back(transformErrors( throwErrorOr( remoteWorkers.logRouters[i % remoteWorkers.logRouters.size()].logRouter.getReplyUnlessFailedFor( req, SERVER_KNOBS->TLOG_TIMEOUT, SERVER_KNOBS->MASTER_FAILURE_SLOPE_DURING_RECOVERY)), cluster_recovery_failed())); } std::vector localTags = TagPartitionedLogSystem::getLocalTags(remoteLocality, allTags); LogSystemConfig oldLogSystemConfig = oldLogSystem->getLogSystemConfig(); logSet->tLogLocalities.resize(remoteWorkers.remoteTLogs.size()); logSet->logServers.resize( remoteWorkers.remoteTLogs .size()); // Dummy interfaces, so that logSystem->getPushLocations() below uses the correct size logSet->updateLocalitySet(localities); state std::vector> remoteTLogInitializationReplies; std::vector remoteTLogReqs(remoteWorkers.remoteTLogs.size()); bool nonShardedTxs = self->getTLogVersion() < TLogVersion::V4; if (oldLogSystem->logRouterTags == 0) { std::vector locations; for (Tag tag : localTags) { locations.clear(); logSet->getPushLocations(VectorRef(&tag, 1), locations, 0); for (int loc : locations) remoteTLogReqs[loc].recoverTags.push_back(tag); } if (oldLogSystem->tLogs.size()) { int maxTxsTags = oldLogSystem->txsTags; bool needsOldTxs = oldLogSystem->tLogs[0]->tLogVersion < TLogVersion::V4; for (auto& it : oldLogSystem->oldLogData) { maxTxsTags = std::max(maxTxsTags, it.txsTags); needsOldTxs = needsOldTxs || it.tLogs[0]->tLogVersion < TLogVersion::V4; } for (int i = needsOldTxs ? -1 : 0; i < maxTxsTags; i++) { Tag tag = i == -1 ? txsTag : Tag(tagLocalityTxs, i); Tag pushTag = (i == -1 || nonShardedTxs) ? 
txsTag : Tag(tagLocalityTxs, i % self->txsTags); locations.clear(); logSet->getPushLocations(VectorRef(&pushTag, 1), locations, 0); for (int loc : locations) remoteTLogReqs[loc].recoverTags.push_back(tag); } } } if (oldLogSystem->tLogs.size()) { if (nonShardedTxs) { localTags.push_back(txsTag); } else { for (int i = 0; i < self->txsTags; i++) { localTags.push_back(Tag(tagLocalityTxs, i)); } } } for (int i = 0; i < remoteWorkers.remoteTLogs.size(); i++) { InitializeTLogRequest& req = remoteTLogReqs[i]; req.recruitmentID = self->recruitmentID; req.logVersion = configuration.tLogVersion; req.storeType = configuration.tLogDataStoreType; req.spillType = configuration.tLogSpillType; req.recoverFrom = oldLogSystemConfig; req.recoverAt = oldLogSystem->recoverAt.get(); req.knownCommittedVersion = oldLogSystem->knownCommittedVersion; req.epoch = recoveryCount; req.remoteTag = Tag(tagLocalityRemoteLog, i); req.locality = remoteLocality; req.isPrimary = false; req.allTags = localTags; req.startVersion = logSet->startVersion; req.logRouterTags = 0; req.txsTags = self->txsTags; req.clusterId = clusterId; req.recoveryTransactionVersion = recoveryTransactionVersion; } remoteTLogInitializationReplies.reserve(remoteWorkers.remoteTLogs.size()); for (int i = 0; i < remoteWorkers.remoteTLogs.size(); i++) { TraceEvent("RemoteTLogReplies", self->dbgid).detail("WorkerID", remoteWorkers.remoteTLogs[i].id()); remoteTLogInitializationReplies.push_back(transformErrors( throwErrorOr(remoteWorkers.remoteTLogs[i].tLog.getReplyUnlessFailedFor( remoteTLogReqs[i], SERVER_KNOBS->TLOG_TIMEOUT, SERVER_KNOBS->MASTER_FAILURE_SLOPE_DURING_RECOVERY)), cluster_recovery_failed())); } TraceEvent("RemoteLogRecruitment_InitializingRemoteLogs") .detail("StartVersion", logSet->startVersion) .detail("LocalStart", self->tLogs[0]->startVersion) .detail("LogRouterTags", self->logRouterTags); wait(waitForAll(remoteTLogInitializationReplies) && waitForAll(logRouterInitializationReplies) && oldRouterRecruitment); for (int i = 0; i < logRouterInitializationReplies.size(); i++) { logSet->logRouters.push_back(makeReference>>( OptionalInterface(logRouterInitializationReplies[i].get()))); } for (int i = 0; i < remoteTLogInitializationReplies.size(); i++) { logSet->logServers[i] = makeReference>>( OptionalInterface(remoteTLogInitializationReplies[i].get())); logSet->tLogLocalities[i] = remoteWorkers.remoteTLogs[i].locality; } filterLocalityDataForPolicy(logSet->tLogPolicy, &logSet->tLogLocalities); std::vector> recoveryComplete; recoveryComplete.reserve(logSet->logServers.size()); for (int i = 0; i < logSet->logServers.size(); i++) recoveryComplete.push_back( transformErrors(throwErrorOr(logSet->logServers[i]->get().interf().recoveryFinished.getReplyUnlessFailedFor( TLogRecoveryFinishedRequest(), SERVER_KNOBS->TLOG_TIMEOUT, SERVER_KNOBS->MASTER_FAILURE_SLOPE_DURING_RECOVERY)), cluster_recovery_failed())); self->remoteRecoveryComplete = waitForAll(recoveryComplete); self->tLogs.push_back(logSet); TraceEvent("RemoteLogRecruitment_CompletingRecovery").log(); return Void(); } ACTOR Future> TagPartitionedLogSystem::newEpoch( Reference oldLogSystem, RecruitFromConfigurationReply recr, Future fRemoteWorkers, UID clusterId, DatabaseConfiguration configuration, LogEpoch recoveryCount, Version recoveryTransactionVersion, int8_t primaryLocality, int8_t remoteLocality, std::vector allTags, Reference> recruitmentStalled) { state double startTime = now(); state Reference logSystem( new TagPartitionedLogSystem(oldLogSystem->getDebugID(), oldLogSystem->locality, 
recoveryCount)); logSystem->logSystemType = LogSystemType::tagPartitioned; logSystem->expectedLogSets = 1; logSystem->recoveredAt = oldLogSystem->recoverAt; logSystem->repopulateRegionAntiQuorum = configuration.repopulateRegionAntiQuorum; logSystem->recruitmentID = deterministicRandom()->randomUniqueID(); logSystem->txsTags = configuration.tLogVersion >= TLogVersion::V4 ? recr.tLogs.size() : 0; oldLogSystem->recruitmentID = logSystem->recruitmentID; if (configuration.usableRegions > 1) { logSystem->logRouterTags = recr.tLogs.size() * std::max(1, configuration.desiredLogRouterCount / std::max(1, recr.tLogs.size())); logSystem->expectedLogSets++; logSystem->addPseudoLocality(tagLocalityLogRouterMapped); TraceEvent e("AddPseudoLocality", logSystem->getDebugID()); e.detail("Locality1", "LogRouterMapped"); if (configuration.backupWorkerEnabled) { logSystem->addPseudoLocality(tagLocalityBackup); e.detail("Locality2", "Backup"); } } else if (configuration.backupWorkerEnabled) { // Single region uses log router tag for backup workers. logSystem->logRouterTags = recr.tLogs.size() * std::max(1, configuration.desiredLogRouterCount / std::max(1, recr.tLogs.size())); logSystem->addPseudoLocality(tagLocalityBackup); TraceEvent("AddPseudoLocality", logSystem->getDebugID()).detail("Locality", "Backup"); } logSystem->tLogs.push_back(makeReference()); logSystem->tLogs[0]->tLogVersion = configuration.tLogVersion; logSystem->tLogs[0]->tLogWriteAntiQuorum = configuration.tLogWriteAntiQuorum; logSystem->tLogs[0]->tLogReplicationFactor = configuration.tLogReplicationFactor; logSystem->tLogs[0]->tLogPolicy = configuration.tLogPolicy; logSystem->tLogs[0]->isLocal = true; logSystem->tLogs[0]->locality = primaryLocality; state RegionInfo region = configuration.getRegion(recr.dcId); state int maxTxsTags = oldLogSystem->txsTags; state bool needsOldTxs = oldLogSystem->tLogs.size() && oldLogSystem->getTLogVersion() < TLogVersion::V4; for (auto& it : oldLogSystem->oldLogData) { maxTxsTags = std::max(maxTxsTags, it.txsTags); needsOldTxs = needsOldTxs || it.tLogs[0]->tLogVersion < TLogVersion::V4; } if (region.satelliteTLogReplicationFactor > 0 && configuration.usableRegions > 1) { logSystem->tLogs.push_back(makeReference()); if (recr.satelliteFallback) { logSystem->tLogs[1]->tLogWriteAntiQuorum = region.satelliteTLogWriteAntiQuorumFallback; logSystem->tLogs[1]->tLogReplicationFactor = region.satelliteTLogReplicationFactorFallback; logSystem->tLogs[1]->tLogPolicy = region.satelliteTLogPolicyFallback; } else { logSystem->tLogs[1]->tLogWriteAntiQuorum = region.satelliteTLogWriteAntiQuorum; logSystem->tLogs[1]->tLogReplicationFactor = region.satelliteTLogReplicationFactor; logSystem->tLogs[1]->tLogPolicy = region.satelliteTLogPolicy; } logSystem->tLogs[1]->isLocal = true; logSystem->tLogs[1]->locality = tagLocalitySatellite; logSystem->tLogs[1]->tLogVersion = configuration.tLogVersion; logSystem->tLogs[1]->startVersion = oldLogSystem->knownCommittedVersion + 1; logSystem->tLogs[1]->tLogLocalities.resize(recr.satelliteTLogs.size()); for (int i = 0; i < recr.satelliteTLogs.size(); i++) { logSystem->tLogs[1]->tLogLocalities[i] = recr.satelliteTLogs[i].locality; } filterLocalityDataForPolicy(logSystem->tLogs[1]->tLogPolicy, &logSystem->tLogs[1]->tLogLocalities); logSystem->tLogs[1]->logServers.resize( recr.satelliteTLogs .size()); // Dummy interfaces, so that logSystem->getPushLocations() below uses the correct size logSystem->tLogs[1]->updateLocalitySet(logSystem->tLogs[1]->tLogLocalities); 
logSystem->tLogs[1]->populateSatelliteTagLocations( logSystem->logRouterTags, oldLogSystem->logRouterTags, logSystem->txsTags, maxTxsTags); logSystem->expectedLogSets++; } if (oldLogSystem->tLogs.size()) { logSystem->oldLogData.emplace_back(); logSystem->oldLogData[0].tLogs = oldLogSystem->tLogs; logSystem->oldLogData[0].epochBegin = oldLogSystem->tLogs[0]->startVersion; logSystem->oldLogData[0].epochEnd = oldLogSystem->knownCommittedVersion + 1; logSystem->oldLogData[0].logRouterTags = oldLogSystem->logRouterTags; logSystem->oldLogData[0].txsTags = oldLogSystem->txsTags; logSystem->oldLogData[0].pseudoLocalities = oldLogSystem->pseudoLocalities; logSystem->oldLogData[0].epoch = oldLogSystem->epoch; } logSystem->oldLogData.insert( logSystem->oldLogData.end(), oldLogSystem->oldLogData.begin(), oldLogSystem->oldLogData.end()); logSystem->tLogs[0]->startVersion = oldLogSystem->knownCommittedVersion + 1; logSystem->backupStartVersion = oldLogSystem->knownCommittedVersion + 1; state int lockNum = 0; while (lockNum < oldLogSystem->lockResults.size()) { if (oldLogSystem->lockResults[lockNum].logSet->locality == primaryLocality) { if (oldLogSystem->lockResults[lockNum].isCurrent && oldLogSystem->lockResults[lockNum].logSet->isLocal) { break; } state Future stalledAfter = setAfter(recruitmentStalled, SERVER_KNOBS->MAX_RECOVERY_TIME, true); loop { auto versions = TagPartitionedLogSystem::getDurableVersion(logSystem->dbgid, oldLogSystem->lockResults[lockNum]); if (versions.present()) { logSystem->tLogs[0]->startVersion = std::min(std::min(std::get<0>(versions.get()) + 1, oldLogSystem->lockResults[lockNum].epochEnd), logSystem->tLogs[0]->startVersion); break; } wait(TagPartitionedLogSystem::getDurableVersionChanged(oldLogSystem->lockResults[lockNum])); } stalledAfter.cancel(); break; } lockNum++; } std::vector localities; localities.resize(recr.tLogs.size()); for (int i = 0; i < recr.tLogs.size(); i++) { localities[i] = recr.tLogs[i].locality; } state Future oldRouterRecruitment = Never(); TraceEvent("NewEpochStartVersion", oldLogSystem->getDebugID()) .detail("StartVersion", logSystem->tLogs[0]->startVersion) .detail("EpochEnd", oldLogSystem->knownCommittedVersion + 1) .detail("Locality", primaryLocality) .detail("OldLogRouterTags", oldLogSystem->logRouterTags); if (oldLogSystem->logRouterTags > 0 || logSystem->tLogs[0]->startVersion < oldLogSystem->knownCommittedVersion + 1) { // Use log routers to recover [knownCommittedVersion, recoveryVersion] from the old generation. oldRouterRecruitment = TagPartitionedLogSystem::recruitOldLogRouters(oldLogSystem.getPtr(), recr.oldLogRouters, recoveryCount, primaryLocality, logSystem->tLogs[0]->startVersion, localities, logSystem->tLogs[0]->tLogPolicy, false); if (oldLogSystem->knownCommittedVersion - logSystem->tLogs[0]->startVersion > SERVER_KNOBS->MAX_RECOVERY_VERSIONS) { // make sure we can recover in the other DC. 
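// If the remote locality can already supply a durable version, flag recruitment as stalled rather than
// shipping this many versions through log routers; see the MAX_RECOVERY_VERSIONS check above.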
for (auto& lockResult : oldLogSystem->lockResults) { if (lockResult.logSet->locality == remoteLocality) { if (TagPartitionedLogSystem::getDurableVersion(logSystem->dbgid, lockResult).present()) { recruitmentStalled->set(true); } } } } } else { oldLogSystem->logSystemConfigChanged.trigger(); } std::vector localTags = TagPartitionedLogSystem::getLocalTags(primaryLocality, allTags); state LogSystemConfig oldLogSystemConfig = oldLogSystem->getLogSystemConfig(); state std::vector> initializationReplies; std::vector reqs(recr.tLogs.size()); logSystem->tLogs[0]->tLogLocalities.resize(recr.tLogs.size()); logSystem->tLogs[0]->logServers.resize( recr.tLogs.size()); // Dummy interfaces, so that logSystem->getPushLocations() below uses the correct size logSystem->tLogs[0]->updateLocalitySet(localities); std::vector locations; for (Tag tag : localTags) { locations.clear(); logSystem->tLogs[0]->getPushLocations(VectorRef(&tag, 1), locations, 0); for (int loc : locations) reqs[loc].recoverTags.push_back(tag); } for (int i = 0; i < oldLogSystem->logRouterTags; i++) { Tag tag = Tag(tagLocalityLogRouter, i); reqs[logSystem->tLogs[0]->bestLocationFor(tag)].recoverTags.push_back(tag); } bool nonShardedTxs = logSystem->getTLogVersion() < TLogVersion::V4; if (oldLogSystem->tLogs.size()) { for (int i = needsOldTxs ? -1 : 0; i < maxTxsTags; i++) { Tag tag = i == -1 ? txsTag : Tag(tagLocalityTxs, i); Tag pushTag = (i == -1 || nonShardedTxs) ? txsTag : Tag(tagLocalityTxs, i % logSystem->txsTags); locations.clear(); logSystem->tLogs[0]->getPushLocations(VectorRef(&pushTag, 1), locations, 0); for (int loc : locations) reqs[loc].recoverTags.push_back(tag); } if (nonShardedTxs) { localTags.push_back(txsTag); } else { for (int i = 0; i < logSystem->txsTags; i++) { localTags.push_back(Tag(tagLocalityTxs, i)); } } } for (int i = 0; i < recr.tLogs.size(); i++) { InitializeTLogRequest& req = reqs[i]; req.recruitmentID = logSystem->recruitmentID; req.logVersion = configuration.tLogVersion; req.storeType = configuration.tLogDataStoreType; req.spillType = configuration.tLogSpillType; req.recoverFrom = oldLogSystemConfig; req.recoverAt = oldLogSystem->recoverAt.get(); req.knownCommittedVersion = oldLogSystem->knownCommittedVersion; req.epoch = recoveryCount; req.locality = primaryLocality; req.remoteTag = Tag(tagLocalityRemoteLog, i); req.isPrimary = true; req.allTags = localTags; req.startVersion = logSystem->tLogs[0]->startVersion; req.logRouterTags = logSystem->logRouterTags; req.txsTags = logSystem->txsTags; req.clusterId = clusterId; req.recoveryTransactionVersion = recoveryTransactionVersion; } initializationReplies.reserve(recr.tLogs.size()); for (int i = 0; i < recr.tLogs.size(); i++) { TraceEvent("PrimaryTLogReplies", logSystem->getDebugID()).detail("WorkerID", recr.tLogs[i].id()); initializationReplies.push_back(transformErrors( throwErrorOr(recr.tLogs[i].tLog.getReplyUnlessFailedFor( reqs[i], SERVER_KNOBS->TLOG_TIMEOUT, SERVER_KNOBS->MASTER_FAILURE_SLOPE_DURING_RECOVERY)), cluster_recovery_failed())); } state std::vector> recoveryComplete; if (region.satelliteTLogReplicationFactor > 0 && configuration.usableRegions > 1) { state std::vector> satelliteInitializationReplies; std::vector sreqs(recr.satelliteTLogs.size()); std::vector satelliteTags; if (logSystem->logRouterTags) { for (int i = 0; i < oldLogSystem->logRouterTags; i++) { Tag tag = Tag(tagLocalityLogRouter, i); // Satellite logs will index a mutation with tagLocalityLogRouter with an id greater than // the number of log routers as having an id mod the number 
of log routers. We thus need // to make sure that if we're going from more log routers in the previous generation to // less log routers in the newer one, that we map the log router tags onto satellites that // are the preferred location for id%logRouterTags. Tag pushLocation = Tag(tagLocalityLogRouter, i % logSystem->logRouterTags); locations.clear(); logSystem->tLogs[1]->getPushLocations(VectorRef(&pushLocation, 1), locations, 0); for (int loc : locations) sreqs[loc].recoverTags.push_back(tag); } } if (oldLogSystem->tLogs.size()) { for (int i = needsOldTxs ? -1 : 0; i < maxTxsTags; i++) { Tag tag = i == -1 ? txsTag : Tag(tagLocalityTxs, i); Tag pushTag = (i == -1 || nonShardedTxs) ? txsTag : Tag(tagLocalityTxs, i % logSystem->txsTags); locations.clear(); logSystem->tLogs[1]->getPushLocations(VectorRef(&pushTag, 1), locations, 0); for (int loc : locations) sreqs[loc].recoverTags.push_back(tag); } if (nonShardedTxs) { satelliteTags.push_back(txsTag); } else { for (int i = 0; i < logSystem->txsTags; i++) { satelliteTags.push_back(Tag(tagLocalityTxs, i)); } } } for (int i = 0; i < recr.satelliteTLogs.size(); i++) { InitializeTLogRequest& req = sreqs[i]; req.recruitmentID = logSystem->recruitmentID; req.logVersion = configuration.tLogVersion; req.storeType = configuration.tLogDataStoreType; req.spillType = configuration.tLogSpillType; req.recoverFrom = oldLogSystemConfig; req.recoverAt = oldLogSystem->recoverAt.get(); req.knownCommittedVersion = oldLogSystem->knownCommittedVersion; req.epoch = recoveryCount; req.locality = tagLocalitySatellite; req.remoteTag = Tag(); req.isPrimary = true; req.allTags = satelliteTags; req.startVersion = oldLogSystem->knownCommittedVersion + 1; req.logRouterTags = logSystem->logRouterTags; req.txsTags = logSystem->txsTags; req.clusterId = clusterId; req.recoveryTransactionVersion = recoveryTransactionVersion; } satelliteInitializationReplies.reserve(recr.satelliteTLogs.size()); for (int i = 0; i < recr.satelliteTLogs.size(); i++) { TraceEvent("PrimarySatelliteTLogReplies", logSystem->getDebugID()) .detail("WorkerID", recr.satelliteTLogs[i].id()); satelliteInitializationReplies.push_back(transformErrors( throwErrorOr(recr.satelliteTLogs[i].tLog.getReplyUnlessFailedFor( sreqs[i], SERVER_KNOBS->TLOG_TIMEOUT, SERVER_KNOBS->MASTER_FAILURE_SLOPE_DURING_RECOVERY)), cluster_recovery_failed())); } wait(waitForAll(satelliteInitializationReplies) || oldRouterRecruitment); for (int i = 0; i < satelliteInitializationReplies.size(); i++) { logSystem->tLogs[1]->logServers[i] = makeReference>>( OptionalInterface(satelliteInitializationReplies[i].get())); } for (int i = 0; i < logSystem->tLogs[1]->logServers.size(); i++) recoveryComplete.push_back(transformErrors( throwErrorOr( logSystem->tLogs[1]->logServers[i]->get().interf().recoveryFinished.getReplyUnlessFailedFor( TLogRecoveryFinishedRequest(), SERVER_KNOBS->TLOG_TIMEOUT, SERVER_KNOBS->MASTER_FAILURE_SLOPE_DURING_RECOVERY)), cluster_recovery_failed())); } wait(waitForAll(initializationReplies) || oldRouterRecruitment); for (int i = 0; i < initializationReplies.size(); i++) { logSystem->tLogs[0]->logServers[i] = makeReference>>( OptionalInterface(initializationReplies[i].get())); logSystem->tLogs[0]->tLogLocalities[i] = recr.tLogs[i].locality; } filterLocalityDataForPolicy(logSystem->tLogs[0]->tLogPolicy, &logSystem->tLogs[0]->tLogLocalities); // Don't force failure of recovery if it took us a long time to recover. 
// This avoids multiple long running recoveries causing tests to timeout
if (BUGGIFY && now() - startTime < 300 && g_network->isSimulated() && g_simulator->speedUpSimulation) throw cluster_recovery_failed(); for (int i = 0; i < logSystem->tLogs[0]->logServers.size(); i++) recoveryComplete.push_back(transformErrors( throwErrorOr(logSystem->tLogs[0]->logServers[i]->get().interf().recoveryFinished.getReplyUnlessFailedFor( TLogRecoveryFinishedRequest(), SERVER_KNOBS->TLOG_TIMEOUT, SERVER_KNOBS->MASTER_FAILURE_SLOPE_DURING_RECOVERY)), cluster_recovery_failed())); logSystem->recoveryComplete = waitForAll(recoveryComplete); if (configuration.usableRegions > 1) { logSystem->hasRemoteServers = true; logSystem->remoteRecovery = TagPartitionedLogSystem::newRemoteEpoch(logSystem.getPtr(), oldLogSystem, fRemoteWorkers, clusterId, configuration, recoveryCount, recoveryTransactionVersion, remoteLocality, allTags); if (oldLogSystem->tLogs.size() > 0 && oldLogSystem->tLogs[0]->locality == tagLocalitySpecial) { // The wait is required so that we know both primary logs and remote logs have copied the data between // the known committed version and the recovery version. // FIXME: we can remove this wait once we are able to have log routers which can ship data to the remote // logs without using log router tags. wait(logSystem->remoteRecovery); } } else { logSystem->hasRemoteServers = false; logSystem->remoteRecovery = logSystem->recoveryComplete; logSystem->remoteRecoveryComplete = logSystem->recoveryComplete; } return logSystem; } ACTOR Future TagPartitionedLogSystem::trackRejoins( UID dbgid, std::vector>>, Reference>> logServers, FutureStream rejoinRequests) { state std::map> lastReply; state std::set logsWaiting; state double startTime = now(); state Future warnTimeout = delay(SERVER_KNOBS->TLOG_SLOW_REJOIN_WARN_TIMEOUT_SECS); for (const auto& log : logServers) { logsWaiting.insert(log.first->get().id()); } try { loop choose { when(TLogRejoinRequest req = waitNext(rejoinRequests)) { int pos = -1; for (int i = 0; i < logServers.size(); i++) { if (logServers[i].first->get().id() == req.myInterface.id()) { pos = i; logsWaiting.erase(logServers[i].first->get().id()); break; } } if (pos != -1) { TraceEvent("TLogJoinedMe", dbgid) .detail("TLog", req.myInterface.id()) .detail("Address", req.myInterface.commit.getEndpoint().getPrimaryAddress().toString()); if (!logServers[pos].first->get().present() || req.myInterface.commit.getEndpoint() != logServers[pos].first->get().interf().commit.getEndpoint()) { TLogInterface interf = req.myInterface; filterLocalityDataForPolicyDcAndProcess(logServers[pos].second, &interf.filteredLocality); logServers[pos].first->setUnconditional(OptionalInterface(interf)); } lastReply[req.myInterface.id()].send(TLogRejoinReply{ false }); lastReply[req.myInterface.id()] = req.reply; } else { TraceEvent("TLogJoinedMeUnknown", dbgid) .detail("TLog", req.myInterface.id()) .detail("Address", req.myInterface.commit.getEndpoint().getPrimaryAddress().toString()); req.reply.send(true); } } when(wait(warnTimeout)) { for (const auto& logId : logsWaiting) { TraceEvent(SevWarnAlways, "TLogRejoinSlow", dbgid) .detail("Elapsed", now() - startTime) .detail("LogId", logId); } warnTimeout = Never(); } } } catch (...)
{
        for (auto it = lastReply.begin(); it != lastReply.end(); ++it)
            it->second.send(TLogRejoinReply{ true });
        throw;
    }
}

ACTOR Future<TLogLockResult> TagPartitionedLogSystem::lockTLog(
    UID myID,
    Reference<AsyncVar<OptionalInterface<TLogInterface>>> tlog) {
    TraceEvent("TLogLockStarted", myID).detail("TLog", tlog->get().id()).detail("InfPresent", tlog->get().present());
    loop {
        choose {
            when(TLogLockResult data =
                     wait(tlog->get().present() ? brokenPromiseToNever(tlog->get().interf().lock.getReply())
                                                : Never())) {
                TraceEvent("TLogLocked", myID).detail("TLog", tlog->get().id()).detail("End", data.end);
                return data;
            }
            when(wait(tlog->onChange())) {}
        }
    }
}