From ae3542f8ab39b66ffdbc100b2e914c87a90c465d Mon Sep 17 00:00:00 2001
From: Xiaoxi Wang
Date: Tue, 29 Jun 2021 01:26:05 +0000
Subject: [PATCH 01/29] add stream struct in TLog

---
 fdbserver/TLogInterface.h      | 42 ++++++++++++++++++++++++++++++++++
 fdbserver/TLogServer.actor.cpp |  4 ++++
 2 files changed, 46 insertions(+)

diff --git a/fdbserver/TLogInterface.h b/fdbserver/TLogInterface.h
index e9e5b20b0d..202aa38e18 100644
--- a/fdbserver/TLogInterface.h
+++ b/fdbserver/TLogInterface.h
@@ -38,6 +38,7 @@ struct TLogInterface {
 	UID sharedTLogID;
 
 	RequestStream<struct TLogPeekRequest> peekMessages;
+	RequestStream<struct TLogPeekStreamRequest> peekStreamMessages; // requests to establish a peek stream with the TLog server
 	RequestStream<struct TLogPopRequest> popMessages;
 
 	RequestStream<struct TLogCommitRequest> commit;
@@ -70,6 +71,7 @@ struct TLogInterface {
 	void initEndpoints() {
 		std::vector<std::pair<FlowReceiver*, TaskPriority>> streams;
 		streams.push_back(peekMessages.getReceiver(TaskPriority::TLogPeek));
+		streams.push_back(peekStreamMessages.getReceiver(TaskPriority::ReadSocket));
 		streams.push_back(popMessages.getReceiver(TaskPriority::TLogPop));
 		streams.push_back(commit.getReceiver(TaskPriority::TLogCommit));
 		streams.push_back(lock.getReceiver());
@@ -106,6 +108,7 @@ struct TLogInterface {
 			enablePopRequest = RequestStream<ReplyPromise<Void>>(peekMessages.getEndpoint().getAdjustedEndpoint(9));
 			snapRequest = RequestStream<struct TLogSnapRequest>(peekMessages.getEndpoint().getAdjustedEndpoint(10));
+			peekStreamMessages = RequestStream<struct TLogPeekStreamRequest>(peekMessages.getEndpoint().getAdjustedEndpoint(11));
 		}
 	}
 };
@@ -209,6 +212,45 @@ struct TLogPeekRequest {
 	}
 };
 
+struct TLogPeekStreamReply: public ReplyPromiseStreamReply {
+	constexpr static FileIdentifier file_identifier = 10072848;
+	Arena arena;
+	StringRef messages;
+	Version end;
+	Optional<Version> popped;
+	Version maxKnownVersion;
+	Version minKnownCommittedVersion;
+	Optional<Version> begin;
+
+	int expectedSize() const {
+		return messages.expectedSize() + sizeof(TLogPeekStreamReply);
+	}
+
+	template <class Ar>
+	void serialize(Ar& ar) {
+		serializer(ar, ReplyPromiseStreamReply::acknowledgeToken, arena, messages, end, popped, maxKnownVersion,
+		           minKnownCommittedVersion, begin);
+	}
+};
+
+struct TLogPeekStreamRequest {
+	constexpr static FileIdentifier file_identifier = 10072821;
+	Arena arena;
+	Version begin;
+	Tag tag;
+	int limit, limitBytes;
+	ReplyPromiseStream<TLogPeekStreamReply> reply;
+
+	TLogPeekStreamRequest() {}
+	TLogPeekStreamRequest(Version version, Tag tag, int limit, int limitBytes)
+	  : begin(version), tag(tag), limit(limit), limitBytes(limitBytes) {}
+
+	template <class Ar>
+	void serialize(Ar& ar) {
+		serializer(ar, arena, begin, tag, limit, limitBytes, reply);
+	}
+};
+
 struct TLogPopRequest {
 	constexpr static FileIdentifier file_identifier = 5556423;
 	Arena arena;
diff --git a/fdbserver/TLogServer.actor.cpp b/fdbserver/TLogServer.actor.cpp
index 3d2d90c3f7..802bb3c7ec 100644
--- a/fdbserver/TLogServer.actor.cpp
+++ b/fdbserver/TLogServer.actor.cpp
@@ -2398,6 +2398,9 @@ ACTOR Future<Void> serveTLogInterface(TLogData* self,
 			} else {
 				logData->logSystem->set(Reference<ILogSystem>());
 			}
+		}
+		when(TLogPeekStreamRequest req = waitNext(tli.peekStreamMessages.getFuture())) {
+
 		}
 		when(TLogPeekRequest req = waitNext(tli.peekMessages.getFuture())) {
 			logData->addActor.send(tLogPeekMessages(self, req, logData));
@@ -3100,6 +3103,7 @@ ACTOR Future<Void> tLogStart(TLogData* self, InitializeTLogRequest req, Locality
 	recruited.initEndpoints();
 
 	DUMPTOKEN(recruited.peekMessages);
+	DUMPTOKEN(recruited.peekStreamMessages);
 	DUMPTOKEN(recruited.popMessages);
 	DUMPTOKEN(recruited.commit);
 	DUMPTOKEN(recruited.lock);

From b50fda6b4bee524cad6fecc35ddac8117f88f32a Mon Sep 17 00:00:00 2001
From: Xiaoxi Wang
Date: Thu, 1 Jul 2021
04:32:30 +0000 Subject: [PATCH 02/29] add simple streaming peek functions --- fdbclient/ServerKnobs.cpp | 3 +- fdbclient/ServerKnobs.h | 3 +- fdbserver/LogSystem.h | 2 + fdbserver/LogSystemPeekCursor.actor.cpp | 101 ++++++++++++++++++------ fdbserver/TLogInterface.h | 47 ++++++----- fdbserver/TLogServer.actor.cpp | 19 +++-- fdbserver/worker.actor.cpp | 1 + 7 files changed, 120 insertions(+), 56 deletions(-) diff --git a/fdbclient/ServerKnobs.cpp b/fdbclient/ServerKnobs.cpp index 8cbb77defb..e4dcd9d38a 100644 --- a/fdbclient/ServerKnobs.cpp +++ b/fdbclient/ServerKnobs.cpp @@ -66,7 +66,8 @@ void ServerKnobs::initialize(Randomize _randomize, ClientKnobs* clientKnobs, IsS init( TLOG_MESSAGE_BLOCK_BYTES, 10e6 ); init( TLOG_MESSAGE_BLOCK_OVERHEAD_FACTOR, double(TLOG_MESSAGE_BLOCK_BYTES) / (TLOG_MESSAGE_BLOCK_BYTES - MAX_MESSAGE_SIZE) ); //1.0121466709838096006362758832473 init( PEEK_TRACKER_EXPIRATION_TIME, 600 ); if( randomize && BUGGIFY ) PEEK_TRACKER_EXPIRATION_TIME = deterministicRandom()->coinflip() ? 0.1 : 120; - init( PARALLEL_GET_MORE_REQUESTS, 32 ); if( randomize && BUGGIFY ) PARALLEL_GET_MORE_REQUESTS = 2; + init( PEEK_USEING_STREAMING, true ); + init( PARALLEL_GET_MORE_REQUESTS, 32 ); if( randomize && BUGGIFY ) PARALLEL_GET_MORE_REQUESTS = 2; init( MULTI_CURSOR_PRE_FETCH_LIMIT, 10 ); init( MAX_QUEUE_COMMIT_BYTES, 15e6 ); if( randomize && BUGGIFY ) MAX_QUEUE_COMMIT_BYTES = 5000; init( DESIRED_OUTSTANDING_MESSAGES, 5000 ); if( randomize && BUGGIFY ) DESIRED_OUTSTANDING_MESSAGES = deterministicRandom()->randomInt(0,100); diff --git a/fdbclient/ServerKnobs.h b/fdbclient/ServerKnobs.h index 30468c7e84..d7dda18e82 100644 --- a/fdbclient/ServerKnobs.h +++ b/fdbclient/ServerKnobs.h @@ -40,7 +40,8 @@ public: // often, so that versions always advance smoothly // TLogs - double TLOG_TIMEOUT; // tlog OR commit proxy failure - master's reaction time + bool PEEK_USEING_STREAMING; + double TLOG_TIMEOUT; // tlog OR commit proxy failure - master's reaction time double TLOG_SLOW_REJOIN_WARN_TIMEOUT_SECS; // Warns if a tlog takes too long to rejoin double RECOVERY_TLOG_SMART_QUORUM_DELAY; // smaller might be better for bug amplification double TLOG_STORAGE_MIN_UPDATE_INTERVAL; diff --git a/fdbserver/LogSystem.h b/fdbserver/LogSystem.h index da2fbcf5f2..a8b7e10d75 100644 --- a/fdbserver/LogSystem.h +++ b/fdbserver/LogSystem.h @@ -433,9 +433,11 @@ struct ILogSystem { bool onlySpilled; bool parallelGetMore; + bool usePeekStream; int sequence; Deque> futureResults; Future interfaceChanged; + Optional> peekReplyStream; double lastReset; Future resetCheck; diff --git a/fdbserver/LogSystemPeekCursor.actor.cpp b/fdbserver/LogSystemPeekCursor.actor.cpp index 26287919cd..89b7e63c97 100644 --- a/fdbserver/LogSystemPeekCursor.actor.cpp +++ b/fdbserver/LogSystemPeekCursor.actor.cpp @@ -25,6 +25,18 @@ #include "fdbrpc/ReplicationUtils.h" #include "flow/actorcompiler.h" // has to be last include +// create a peek stream for cursor when it's possible +void tryEstablishPeekStream(ILogSystem::ServerPeekCursor* self) { + if (self->peekReplyStream.present()) + return; + else if (!self->interf || !self->interf->get().present()) { + self->peekReplyStream.reset(); + return; + } + self->peekReplyStream = self->interf->get().interf().peekStreamMessages.getReplyStream( + TLogPeekStreamRequest(self->messageVersion.version, self->tag, std::numeric_limits::max())); +} + ILogSystem::ServerPeekCursor::ServerPeekCursor(Reference>> const& interf, Tag tag, Version begin, @@ -35,9 +47,12 @@ 
ILogSystem::ServerPeekCursor::ServerPeekCursor(ReferencerandomUniqueID()), poppedVersion(0), returnIfBlocked(returnIfBlocked), sequence(0), onlySpilled(false), parallelGetMore(parallelGetMore), lastReset(0), slowReplies(0), fastReplies(0), unknownReplies(0), - resetCheck(Void()) { + resetCheck(Void()), usePeekStream(SERVER_KNOBS->PEEK_USEING_STREAMING) { this->results.maxKnownVersion = 0; this->results.minKnownCommittedVersion = 0; + if (usePeekStream) { + tryEstablishPeekStream(this); + } //TraceEvent("SPC_Starting", randomID).detail("Tag", tag.toString()).detail("Begin", begin).detail("End", end).backtrace(); } @@ -51,7 +66,8 @@ ILogSystem::ServerPeekCursor::ServerPeekCursor(TLogPeekReply const& results, : results(results), tag(tag), rd(results.arena, results.messages, Unversioned()), messageVersion(messageVersion), end(end), messageAndTags(message), hasMsg(hasMsg), randomID(deterministicRandom()->randomUniqueID()), poppedVersion(poppedVersion), returnIfBlocked(false), sequence(0), onlySpilled(false), parallelGetMore(false), - lastReset(0), slowReplies(0), fastReplies(0), unknownReplies(0), resetCheck(Void()) { + lastReset(0), slowReplies(0), fastReplies(0), unknownReplies(0), resetCheck(Void()), + usePeekStream(SERVER_KNOBS->PEEK_USEING_STREAMING) { //TraceEvent("SPC_Clone", randomID); this->results.maxKnownVersion = 0; this->results.minKnownCommittedVersion = 0; @@ -154,6 +170,20 @@ void ILogSystem::ServerPeekCursor::advanceTo(LogMessageVersion n) { } } +// This function is called after the cursor received one TLogPeekReply to update its members, which is the common logic +// in getMore helper functions. +void updateCursorWithReply(ILogSystem::ServerPeekCursor* self, const TLogPeekReply& res) { + self->results = res; + self->onlySpilled = res.onlySpilled; + if (res.popped.present()) + self->poppedVersion = std::min(std::max(self->poppedVersion, res.popped.get()), self->end.version); + self->rd = ArenaReader(self->results.arena, self->results.messages, Unversioned()); + LogMessageVersion skipSeq = self->messageVersion; + self->hasMsg = true; + self->nextMessage(); + self->advanceTo(skipSeq); +} + ACTOR Future resetChecker(ILogSystem::ServerPeekCursor* self, NetworkAddress addr) { self->slowReplies = 0; self->unknownReplies = 0; @@ -209,7 +239,7 @@ ACTOR Future recordRequestMetrics(ILogSystem::ServerPeekCursor* s } ACTOR Future serverPeekParallelGetMore(ILogSystem::ServerPeekCursor* self, TaskPriority taskID) { - if (!self->interf || self->messageVersion >= self->end) { + if (!self->interf || self->isExhausted()) { if (self->hasMessage()) return Void(); wait(Future(Never())); @@ -254,16 +284,7 @@ ACTOR Future serverPeekParallelGetMore(ILogSystem::ServerPeekCursor* self, } expectedBegin = res.end; self->futureResults.pop_front(); - self->results = res; - self->onlySpilled = res.onlySpilled; - if (res.popped.present()) - self->poppedVersion = - std::min(std::max(self->poppedVersion, res.popped.get()), self->end.version); - self->rd = ArenaReader(self->results.arena, self->results.messages, Unversioned()); - LogMessageVersion skipSeq = self->messageVersion; - self->hasMsg = true; - self->nextMessage(); - self->advanceTo(skipSeq); + updateCursorWithReply(self, res); //TraceEvent("SPC_GetMoreB", self->randomID).detail("Has", self->hasMessage()).detail("End", res.end).detail("Popped", res.popped.present() ? 
res.popped.get() : 0); return Void(); } @@ -297,8 +318,49 @@ ACTOR Future serverPeekParallelGetMore(ILogSystem::ServerPeekCursor* self, } } +ACTOR Future serverPeekStreamGetMore(ILogSystem::ServerPeekCursor* self, TaskPriority taskID) { + if (self->isExhausted()) { + if (self->hasMessage()) + return Void(); + wait(Future(Never())); + throw internal_error(); + } + + try { + tryEstablishPeekStream(self); + loop { + try { + choose { + when(wait(self->interf->onChange())) { + self->onlySpilled = false; + self->peekReplyStream.reset(); + tryEstablishPeekStream(self); + } + when(TLogPeekStreamReply res = wait(self->peekReplyStream.present() + ? waitAndForward(self->peekReplyStream.get().getFuture()) + : Never())) { + updateCursorWithReply(self, res.rep); + return Void(); + } + } + } catch (Error& e) { + if (e.code() == error_code_connection_failed) { + self->peekReplyStream.reset(); + } + throw; + } + } + } catch (Error& e) { + if (e.code() == error_code_end_of_stream) { + self->end.reset(self->messageVersion.version); + return Void(); + } + throw; + } +} + ACTOR Future serverPeekGetMore(ILogSystem::ServerPeekCursor* self, TaskPriority taskID) { - if (!self->interf || self->messageVersion >= self->end) { + if (!self->interf || self->isExhausted()) { wait(Future(Never())); throw internal_error(); } @@ -314,16 +376,7 @@ ACTOR Future serverPeekGetMore(ILogSystem::ServerPeekCursor* self, TaskPri self->onlySpilled), taskID)) : Never())) { - self->results = res; - self->onlySpilled = res.onlySpilled; - if (res.popped.present()) - self->poppedVersion = - std::min(std::max(self->poppedVersion, res.popped.get()), self->end.version); - self->rd = ArenaReader(self->results.arena, self->results.messages, Unversioned()); - LogMessageVersion skipSeq = self->messageVersion; - self->hasMsg = true; - self->nextMessage(); - self->advanceTo(skipSeq); + updateCursorWithReply(self, res); //TraceEvent("SPC_GetMoreB", self->randomID).detail("Has", self->hasMessage()).detail("End", res.end).detail("Popped", res.popped.present() ? 
res.popped.get() : 0);
 		return Void();
 	}
diff --git a/fdbserver/TLogInterface.h b/fdbserver/TLogInterface.h
index 202aa38e18..ba8bc10d64 100644
--- a/fdbserver/TLogInterface.h
+++ b/fdbserver/TLogInterface.h
@@ -38,7 +38,8 @@ struct TLogInterface {
 	UID sharedTLogID;
 
 	RequestStream<struct TLogPeekRequest> peekMessages;
-	RequestStream<struct TLogPeekStreamRequest> peekStreamMessages; // requests to establish a peek stream with the TLog server
+	RequestStream<struct TLogPeekStreamRequest>
+	    peekStreamMessages; // requests to establish a peek stream with the TLog server
 	RequestStream<struct TLogPopRequest> popMessages;
 
 	RequestStream<struct TLogCommitRequest> commit;
@@ -69,9 +70,9 @@ struct TLogInterface {
 	NetworkAddressList addresses() const { return peekMessages.getEndpoint().addresses; }
 
 	void initEndpoints() {
+		// NOTE: the adding order should be the same as the hardcoded indices in serialize()
 		std::vector<std::pair<FlowReceiver*, TaskPriority>> streams;
 		streams.push_back(peekMessages.getReceiver(TaskPriority::TLogPeek));
-		streams.push_back(peekStreamMessages.getReceiver(TaskPriority::ReadSocket));
 		streams.push_back(popMessages.getReceiver(TaskPriority::TLogPop));
 		streams.push_back(commit.getReceiver(TaskPriority::TLogCommit));
 		streams.push_back(lock.getReceiver());
@@ -82,7 +83,8 @@ struct TLogInterface {
 		streams.push_back(disablePopRequest.getReceiver());
 		streams.push_back(enablePopRequest.getReceiver());
 		streams.push_back(snapRequest.getReceiver());
-		FlowTransport::transport().addEndpoints(streams);
+		streams.push_back(peekStreamMessages.getReceiver(TaskPriority::ReadSocket));
+		FlowTransport::transport().addEndpoints(streams);
 	}
 
@@ -106,7 +110,8 @@ struct TLogInterface {
 			enablePopRequest = RequestStream<ReplyPromise<Void>>(peekMessages.getEndpoint().getAdjustedEndpoint(9));
 			snapRequest = RequestStream<struct TLogSnapRequest>(peekMessages.getEndpoint().getAdjustedEndpoint(10));
-			peekStreamMessages = RequestStream<struct TLogPeekStreamRequest>(peekMessages.getEndpoint().getAdjustedEndpoint(11));
+			peekStreamMessages =
+			    RequestStream<struct TLogPeekStreamRequest>(peekMessages.getEndpoint().getAdjustedEndpoint(11));
 		}
 	}
 };
@@ -212,25 +215,19 @@ struct TLogPeekRequest {
 	}
 };
 
-struct TLogPeekStreamReply: public ReplyPromiseStreamReply {
-	constexpr static FileIdentifier file_identifier = 10072848;
-	Arena arena;
-	StringRef messages;
-	Version end;
-	Optional<Version> popped;
-	Version maxKnownVersion;
-	Version minKnownCommittedVersion;
-	Optional<Version> begin;
+struct TLogPeekStreamReply : public ReplyPromiseStreamReply {
+	constexpr static FileIdentifier file_identifier = 10072848;
+	TLogPeekReply rep;
 
-	int expectedSize() const {
-		return messages.expectedSize() + sizeof(TLogPeekStreamReply);
+	TLogPeekStreamReply() = default;
+	explicit TLogPeekStreamReply(const TLogPeekReply& rep) : rep(rep) {}
+
+	int expectedSize() const { return rep.messages.expectedSize() + sizeof(TLogPeekStreamReply); }
+
+	template <class Ar>
+	void serialize(Ar& ar) {
+		serializer(ar, ReplyPromiseStreamReply::acknowledgeToken, rep);
 	}
-
-	template <class Ar>
-	void serialize(Ar& ar) {
-		serializer(ar, ReplyPromiseStreamReply::acknowledgeToken, arena, messages, end, popped, maxKnownVersion,
-		           minKnownCommittedVersion, begin);
-	}
 };
 
 struct TLogPeekStreamRequest {
@@ -238,16 +235,16 @@ struct TLogPeekStreamRequest {
 	Arena arena;
 	Version begin;
 	Tag tag;
-	int limit, limitBytes;
+	int limitBytes;
 	ReplyPromiseStream<TLogPeekStreamReply> reply;
 
 	TLogPeekStreamRequest() {}
-	TLogPeekStreamRequest(Version version, Tag tag, int limit, int limitBytes)
-	  : begin(version), tag(tag), limit(limit), limitBytes(limitBytes) {}
+	TLogPeekStreamRequest(Version version, Tag tag, int limitBytes)
+	  : begin(version), tag(tag), limitBytes(limitBytes) {}
 
 	template <class Ar>
 	void serialize(Ar& ar) {
-		serializer(ar, arena, begin, tag, limit, limitBytes, reply);
+
serializer(ar, arena, begin, tag, limitBytes, reply); } }; diff --git a/fdbserver/TLogServer.actor.cpp b/fdbserver/TLogServer.actor.cpp index 802bb3c7ec..6ab8e2568c 100644 --- a/fdbserver/TLogServer.actor.cpp +++ b/fdbserver/TLogServer.actor.cpp @@ -1568,6 +1568,14 @@ ACTOR Future> parseMessagesForTag(StringRef commitBlob, T return relevantMessages; } +ACTOR Future tLogPeekStream(TLogData* self, TLogPeekStreamRequest req, Reference logData) { + if (req.tag.locality == tagLocalityTxs && req.tag.id >= logData->txsTags && logData->txsTags > 0) { + req.tag.id = req.tag.id % logData->txsTags; + } + req.reply.setByteLimit(SERVER_KNOBS->MAXIMUM_PEEK_BYTES); + return Void(); +} + ACTOR Future tLogPeekMessages(TLogData* self, TLogPeekRequest req, Reference logData) { state BinaryWriter messages(Unversioned()); state BinaryWriter messages2(Unversioned()); @@ -2400,7 +2408,7 @@ ACTOR Future serveTLogInterface(TLogData* self, } } when(TLogPeekStreamRequest req = waitNext(tli.peekStreamMessages.getFuture())) { - + logData->addActor.send(tLogPeekStream(self, req, logData)); } when(TLogPeekRequest req = waitNext(tli.peekMessages.getFuture())) { logData->addActor.send(tLogPeekMessages(self, req, logData)); @@ -2658,7 +2666,7 @@ ACTOR Future tLogCore(TLogData* self, SERVER_KNOBS->STORAGE_LOGGING_DELAY, &logData->cc, logData->logId.toString() + "/TLogMetrics", - [self=self](TraceEvent& te) { + [self = self](TraceEvent& te) { StorageBytes sbTlog = self->persistentData->getStorageBytes(); te.detail("KvstoreBytesUsed", sbTlog.used); te.detail("KvstoreBytesFree", sbTlog.free); @@ -2842,6 +2850,7 @@ ACTOR Future restorePersistentState(TLogData* self, recruited.initEndpoints(); DUMPTOKEN(recruited.peekMessages); + DUMPTOKEN(recruited.peekStreamMessages); DUMPTOKEN(recruited.popMessages); DUMPTOKEN(recruited.commit); DUMPTOKEN(recruited.lock); @@ -2888,9 +2897,9 @@ ACTOR Future restorePersistentState(TLogData* self, logsByVersion.emplace_back(ver, id1); TraceEvent("TLogPersistentStateRestore", self->dbgid) - .detail("LogId", logData->logId) - .detail("Ver", ver) - .detail("RecoveryCount", logData->recoveryCount); + .detail("LogId", logData->logId) + .detail("Ver", ver) + .detail("RecoveryCount", logData->recoveryCount); // Restore popped keys. Pop operations that took place after the last (committed) updatePersistentDataVersion // might be lost, but that is fine because we will get the corresponding data back, too. 
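The DUMPTOKEN additions and the initEndpoints() reordering in this commit both depend on one FlowTransport convention, summarized here from the hunks above (a restatement of what the diffs already do, not new behavior; the offset 11 is the one hardcoded in TLogInterface::serialize):

// Only peekMessages is serialized on the wire. Every other RequestStream on
// the interface is reconstructed by the receiver at a fixed offset from
// peekMessages' endpoint, so the registration order in initEndpoints() must
// match the adjusted-endpoint indices used during deserialization:
std::vector<std::pair<FlowReceiver*, TaskPriority>> streams;
streams.push_back(peekMessages.getReceiver(TaskPriority::TLogPeek)); // offset 0
// ... popMessages, commit, lock, and the rest: offsets 1 through 10 ...
streams.push_back(peekStreamMessages.getReceiver(TaskPriority::ReadSocket)); // offset 11
FlowTransport::transport().addEndpoints(streams);

// and on the deserializing side:
peekStreamMessages =
    RequestStream<struct TLogPeekStreamRequest>(peekMessages.getEndpoint().getAdjustedEndpoint(11));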
tagKeys = prefixRange(rawId.withPrefix(persistTagPoppedKeys.begin)); diff --git a/fdbserver/worker.actor.cpp b/fdbserver/worker.actor.cpp index 6d107cff7f..8ff6208db6 100644 --- a/fdbserver/worker.actor.cpp +++ b/fdbserver/worker.actor.cpp @@ -1829,6 +1829,7 @@ ACTOR Future workerServer(Reference connFile, startRole(Role::LOG_ROUTER, recruited.id(), interf.id(), details); DUMPTOKEN(recruited.peekMessages); + DUMPTOKEN(recruited.peekStreamMessages); DUMPTOKEN(recruited.popMessages); DUMPTOKEN(recruited.commit); DUMPTOKEN(recruited.lock); From 9948b9d4ef39b39ced9bbec8c57b41eb213a82bd Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Mon, 5 Jul 2021 00:14:27 +0000 Subject: [PATCH 03/29] refactor TLog Peek code --- fdbserver/TLogServer.actor.cpp | 245 ++++++++++++++++++--------------- 1 file changed, 133 insertions(+), 112 deletions(-) diff --git a/fdbserver/TLogServer.actor.cpp b/fdbserver/TLogServer.actor.cpp index 6ab8e2568c..6e92e31c0c 100644 --- a/fdbserver/TLogServer.actor.cpp +++ b/fdbserver/TLogServer.actor.cpp @@ -1507,15 +1507,16 @@ std::deque>& getVersionMessages(Refe }; void peekMessagesFromMemory(Reference self, - TLogPeekRequest const& req, + Tag tag, + Version begin, BinaryWriter& messages, Version& endVersion) { ASSERT(!messages.getLength()); - auto& deque = getVersionMessages(self, req.tag); + auto& deque = getVersionMessages(self, tag); //TraceEvent("TLogPeekMem", self->dbgid).detail("Tag", req.tag1).detail("PDS", self->persistentDataSequence).detail("PDDS", self->persistentDataDurableSequence).detail("Oldest", map1.empty() ? 0 : map1.begin()->key ).detail("OldestMsgCount", map1.empty() ? 0 : map1.begin()->value.size()); - Version begin = std::max(req.begin, self->persistentDataDurableVersion + 1); + begin = std::max(begin, self->persistentDataDurableVersion + 1); auto it = std::lower_bound(deque.begin(), deque.end(), std::make_pair(begin, LengthPrefixedStringRef()), @@ -1542,7 +1543,7 @@ void peekMessagesFromMemory(Reference self, DEBUG_TAGS_AND_MESSAGE( "TLogPeek", currentVersion, StringRef((uint8_t*)data + offset, messages.getLength() - offset)) .detail("LogId", self->logId) - .detail("PeekTag", req.tag); + .detail("PeekTag", tag); } } @@ -1568,105 +1569,97 @@ ACTOR Future> parseMessagesForTag(StringRef commitBlob, T return relevantMessages; } -ACTOR Future tLogPeekStream(TLogData* self, TLogPeekStreamRequest req, Reference logData) { - if (req.tag.locality == tagLocalityTxs && req.tag.id >= logData->txsTags && logData->txsTags > 0) { - req.tag.id = req.tag.id % logData->txsTags; - } - req.reply.setByteLimit(SERVER_KNOBS->MAXIMUM_PEEK_BYTES); - return Void(); -} - -ACTOR Future tLogPeekMessages(TLogData* self, TLogPeekRequest req, Reference logData) { +// Common logics to peek TLog and create TLogPeekReply that serves both streaming peek or normal peek request +ACTOR Future peekTLog( + TLogData* self, + Reference logData, + Version begin, + Tag tag, + bool returnIfBlocked = false, + bool reqOnlySpilled = false, + Optional> sequence = Optional>()) { state BinaryWriter messages(Unversioned()); state BinaryWriter messages2(Unversioned()); - state int sequence = -1; + state int sequenceNum = -1; state UID peekId; state double queueStart = now(); - if (req.tag.locality == tagLocalityTxs && req.tag.id >= logData->txsTags && logData->txsTags > 0) { - req.tag.id = req.tag.id % logData->txsTags; + if (tag.locality == tagLocalityTxs && tag.id >= logData->txsTags && logData->txsTags > 0) { + tag.id = tag.id % logData->txsTags; } - if (req.sequence.present()) { - try { - peekId = 
req.sequence.get().first; - sequence = req.sequence.get().second; - if (sequence >= SERVER_KNOBS->PARALLEL_GET_MORE_REQUESTS && - logData->peekTracker.find(peekId) == logData->peekTracker.end()) { - throw operation_obsolete(); - } - auto& trackerData = logData->peekTracker[peekId]; - if (sequence == 0 && trackerData.sequence_version.find(0) == trackerData.sequence_version.end()) { - trackerData.tag = req.tag; - trackerData.sequence_version[0].send(std::make_pair(req.begin, req.onlySpilled)); - } - auto seqBegin = trackerData.sequence_version.begin(); - // The peek cursor and this comparison need to agree about the maximum number of in-flight requests. - while (trackerData.sequence_version.size() && - seqBegin->first <= sequence - SERVER_KNOBS->PARALLEL_GET_MORE_REQUESTS) { - if (seqBegin->second.canBeSet()) { - seqBegin->second.sendError(operation_obsolete()); - } - trackerData.sequence_version.erase(seqBegin); - seqBegin = trackerData.sequence_version.begin(); - } - - if (trackerData.sequence_version.size() && sequence < seqBegin->first) { - throw operation_obsolete(); - } - - Future> fPrevPeekData = trackerData.sequence_version[sequence].getFuture(); - if (fPrevPeekData.isReady()) { - trackerData.unblockedPeeks++; - double t = now() - trackerData.lastUpdate; - if (t > trackerData.idleMax) - trackerData.idleMax = t; - trackerData.idleTime += t; - } - trackerData.lastUpdate = now(); - std::pair prevPeekData = wait(fPrevPeekData); - req.begin = std::max(prevPeekData.first, req.begin); - req.onlySpilled = prevPeekData.second; - wait(yield()); - } catch (Error& e) { - if (e.code() == error_code_timed_out || e.code() == error_code_operation_obsolete) { - req.reply.sendError(e); - return Void(); - } else { - throw; - } + // TODO: once the fake stream is replaced by ReplyPromiseStream, we can remove the code handling sequence requests + // STEP: a. mark obsolete sequence requests; b. wait previous sequence requests are handled in order + if (sequence.present()) { + peekId = sequence.get().first; + sequenceNum = sequence.get().second; + if (sequenceNum >= SERVER_KNOBS->PARALLEL_GET_MORE_REQUESTS && + logData->peekTracker.find(peekId) == logData->peekTracker.end()) { + throw operation_obsolete(); } + auto& trackerData = logData->peekTracker[peekId]; + if (sequenceNum == 0 && trackerData.sequence_version.find(0) == trackerData.sequence_version.end()) { + trackerData.tag = tag; + trackerData.sequence_version[0].send(std::make_pair(begin, reqOnlySpilled)); + } + auto seqBegin = trackerData.sequence_version.begin(); + // The peek cursor and this comparison need to agree about the maximum number of in-flight requests. 
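// To make the window arithmetic below concrete (a worked example, using the
// PARALLEL_GET_MORE_REQUESTS default of 32 initialized in ServerKnobs.cpp
// earlier in this series; the numbers are illustrative): when the request
// carrying sequenceNum = 40 arrives, no requester can still be waiting on
// sequences <= 40 - 32 = 8, so those promises are failed with
// operation_obsolete() and erased. Any later request whose sequenceNum falls
// below the smallest surviving entry must itself belong to an abandoned
// window and is rejected the same way.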
+ while (trackerData.sequence_version.size() && + seqBegin->first <= sequenceNum - SERVER_KNOBS->PARALLEL_GET_MORE_REQUESTS) { + if (seqBegin->second.canBeSet()) { + seqBegin->second.sendError(operation_obsolete()); + } + trackerData.sequence_version.erase(seqBegin); + seqBegin = trackerData.sequence_version.begin(); + } + + if (trackerData.sequence_version.size() && sequenceNum < seqBegin->first) { + throw operation_obsolete(); + } + + Future> fPrevPeekData = trackerData.sequence_version[sequenceNum].getFuture(); + if (fPrevPeekData.isReady()) { + trackerData.unblockedPeeks++; + double t = now() - trackerData.lastUpdate; + if (t > trackerData.idleMax) + trackerData.idleMax = t; + trackerData.idleTime += t; + } + trackerData.lastUpdate = now(); + std::pair prevPeekData = wait(fPrevPeekData); + begin = std::max(prevPeekData.first, begin); + reqOnlySpilled = prevPeekData.second; + wait(yield()); } state double blockStart = now(); - if (req.returnIfBlocked && logData->version.get() < req.begin) { - req.reply.sendError(end_of_stream()); - if (req.sequence.present()) { + if (returnIfBlocked && logData->version.get() < begin) { + if (sequence.present()) { auto& trackerData = logData->peekTracker[peekId]; - auto& sequenceData = trackerData.sequence_version[sequence + 1]; + auto& sequenceData = trackerData.sequence_version[sequenceNum + 1]; trackerData.lastUpdate = now(); if (!sequenceData.isSet()) { - sequenceData.send(std::make_pair(req.begin, req.onlySpilled)); + sequenceData.send(std::make_pair(begin, reqOnlySpilled)); } } - return Void(); + throw end_of_stream(); } //TraceEvent("TLogPeekMessages0", self->dbgid).detail("ReqBeginEpoch", req.begin.epoch).detail("ReqBeginSeq", req.begin.sequence).detail("Epoch", self->epoch()).detail("PersistentDataSeq", self->persistentDataSequence).detail("Tag1", req.tag1).detail("Tag2", req.tag2); // Wait until we have something to return that the caller doesn't already have - if (logData->version.get() < req.begin) { - wait(logData->version.whenAtLeast(req.begin)); + if (logData->version.get() < begin) { + wait(logData->version.whenAtLeast(begin)); wait(delay(SERVER_KNOBS->TLOG_PEEK_DELAY, g_network->getCurrentTask())); } - if (logData->locality != tagLocalitySatellite && req.tag.locality == tagLocalityLogRouter) { + if (logData->locality != tagLocalitySatellite && tag.locality == tagLocalityLogRouter) { wait(self->concurrentLogRouterReads.take()); state FlowLock::Releaser globalReleaser(self->concurrentLogRouterReads); wait(delay(0.0, TaskPriority::Low)); } - if (req.begin <= logData->persistentDataDurableVersion && req.tag.locality != tagLocalityTxs && req.tag != txsTag) { + if (begin <= logData->persistentDataDurableVersion && tag.locality != tagLocalityTxs && tag != txsTag) { // Reading spilled data will almost always imply that the storage server is >5s behind the rest // of the cluster. We shouldn't prioritize spending CPU on helping this server catch up // slightly faster over keeping the rest of the cluster operating normally. 
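The hunk below annotates the popped-version path that follows. For reference, the contract it implements, sketched with made-up version numbers (the clamping line mirrors updateCursorWithReply from the earlier commit in this series):

// Suppose the tag has been popped through version 700 and a peek arrives
// with begin = 500. Versions 500..699 can no longer be served, so the TLog
// replies at once with an empty message set, rep.end = 700 and
// rep.popped = 700. The cursor side then applies
//     self->poppedVersion =
//         std::min(std::max(self->poppedVersion, res.popped.get()), self->end.version);
// and the next request resumes from 700 instead of re-asking for data that
// is gone.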
@@ -1677,8 +1670,9 @@ ACTOR Future tLogPeekMessages(TLogData* self, TLogPeekRequest req, Referen state double workStart = now(); - Version poppedVer = poppedVersion(logData, req.tag); - if (poppedVer > req.begin) { + Version poppedVer = poppedVersion(logData, tag); + if (poppedVer > begin) { + // reply with an empty message and let the next reply start from poppedVer TLogPeekReply rep; rep.maxKnownVersion = logData->version.get(); rep.minKnownCommittedVersion = logData->minKnownCommittedVersion; @@ -1686,30 +1680,28 @@ ACTOR Future tLogPeekMessages(TLogData* self, TLogPeekRequest req, Referen rep.end = poppedVer; rep.onlySpilled = false; - if (req.sequence.present()) { + // TODO: once the fake stream is replaced by ReplyPromiseStream, we can remove the code handling sequence requests. + if (sequence.present()) { auto& trackerData = logData->peekTracker[peekId]; - auto& sequenceData = trackerData.sequence_version[sequence + 1]; + auto& sequenceData = trackerData.sequence_version[sequenceNum + 1]; trackerData.lastUpdate = now(); - if (trackerData.sequence_version.size() && sequence + 1 < trackerData.sequence_version.begin()->first) { - req.reply.sendError(operation_obsolete()); + if (trackerData.sequence_version.size() && sequenceNum + 1 < trackerData.sequence_version.begin()->first) { if (!sequenceData.isSet()) sequenceData.sendError(operation_obsolete()); - return Void(); + throw operation_obsolete(); } if (sequenceData.isSet()) { if (sequenceData.getFuture().get().first != rep.end) { - TEST(true); // tlog peek second attempt ended at a different version - req.reply.sendError(operation_obsolete()); - return Void(); + TEST(true); // 1 tlog peek second attempt ended at a different version + throw operation_obsolete(); } } else { sequenceData.send(std::make_pair(rep.end, rep.onlySpilled)); } - rep.begin = req.begin; + rep.begin = begin; } - req.reply.send(rep); - return Void(); + return rep; } state Version endVersion = logData->version.get() + 1; @@ -1717,23 +1709,23 @@ ACTOR Future tLogPeekMessages(TLogData* self, TLogPeekRequest req, Referen // grab messages from disk //TraceEvent("TLogPeekMessages", self->dbgid).detail("ReqBeginEpoch", req.begin.epoch).detail("ReqBeginSeq", req.begin.sequence).detail("Epoch", self->epoch()).detail("PersistentDataSeq", self->persistentDataSequence).detail("Tag1", req.tag1).detail("Tag2", req.tag2); - if (req.begin <= logData->persistentDataDurableVersion) { + if (begin <= logData->persistentDataDurableVersion) { // Just in case the durable version changes while we are waiting for the read, we grab this data from memory. We // may or may not actually send it depending on whether we get enough data from disk. SOMEDAY: Only do this if // an initial attempt to read from disk results in insufficient data and the required data is no longer in // memory SOMEDAY: Should we only send part of the messages we collected, to actually limit the size of the // result? 
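For orientation, the branch structure that the renamed parameters (tag, begin, reqOnlySpilled) thread through in the remainder of this function, summarized from the surrounding hunks:

// begin <= persistentDataDurableVersion: part of the requested range has been
//   spilled, so read it back from persistentData (by value or by reference,
//   per shouldSpillByValue), after speculatively copying the in-memory tail
//   into messages2 in case the disk read turns out to cover everything.
// reqOnlySpilled: the previous reply was served entirely from disk, so skip
//   the in-memory copy and resume at persistentDataDurableVersion + 1.
// otherwise: serve directly from memory via peekMessagesFromMemory().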
- if (req.onlySpilled) { + if (reqOnlySpilled) { endVersion = logData->persistentDataDurableVersion + 1; } else { - peekMessagesFromMemory(logData, req, messages2, endVersion); + peekMessagesFromMemory(logData, tag, begin, messages2, endVersion); } - if (logData->shouldSpillByValue(req.tag)) { + if (logData->shouldSpillByValue(tag)) { RangeResult kvs = wait(self->persistentData->readRange( - KeyRangeRef(persistTagMessagesKey(logData->logId, req.tag, req.begin), - persistTagMessagesKey(logData->logId, req.tag, logData->persistentDataDurableVersion + 1)), + KeyRangeRef(persistTagMessagesKey(logData->logId, tag, begin), + persistTagMessagesKey(logData->logId, tag, logData->persistentDataDurableVersion + 1)), SERVER_KNOBS->DESIRED_TOTAL_BYTES, SERVER_KNOBS->DESIRED_TOTAL_BYTES)); @@ -1752,9 +1744,8 @@ ACTOR Future tLogPeekMessages(TLogData* self, TLogPeekRequest req, Referen } else { // FIXME: Limit to approximately DESIRED_TOTATL_BYTES somehow. RangeResult kvrefs = wait(self->persistentData->readRange( - KeyRangeRef( - persistTagMessageRefsKey(logData->logId, req.tag, req.begin), - persistTagMessageRefsKey(logData->logId, req.tag, logData->persistentDataDurableVersion + 1)), + KeyRangeRef(persistTagMessageRefsKey(logData->logId, tag, begin), + persistTagMessageRefsKey(logData->logId, tag, logData->persistentDataDurableVersion + 1)), SERVER_KNOBS->TLOG_SPILL_REFERENCE_MAX_BATCHES_PER_PEEK + 1)); //TraceEvent("TLogPeekResults", self->dbgid).detail("ForAddress", req.reply.getEndpoint().getPrimaryAddress()).detail("Tag1Results", s1).detail("Tag2Results", s2).detail("Tag1ResultsLim", kv1.size()).detail("Tag2ResultsLim", kv2.size()).detail("Tag1ResultsLast", kv1.size() ? kv1[0].key : "").detail("Tag2ResultsLast", kv2.size() ? kv2[0].key : "").detail("Limited", limited).detail("NextEpoch", next_pos.epoch).detail("NextSeq", next_pos.sequence).detail("NowEpoch", self->epoch()).detail("NowSeq", self->sequence.getNextSequence()); @@ -1774,7 +1765,7 @@ ACTOR Future tLogPeekMessages(TLogData* self, TLogPeekRequest req, Referen earlyEnd = true; break; } - if (sd.version >= req.begin) { + if (sd.version >= begin) { firstVersion = std::min(firstVersion, sd.version); const IDiskQueue::location end = sd.start.lo + sd.length; commitLocations.emplace_back(sd.start, end); @@ -1816,13 +1807,13 @@ ACTOR Future tLogPeekMessages(TLogData* self, TLogPeekRequest req, Referen messages << VERSION_HEADER << entry.version; std::vector rawMessages = - wait(parseMessagesForTag(entry.messages, req.tag, logData->logRouterTags)); + wait(parseMessagesForTag(entry.messages, tag, logData->logRouterTags)); for (const StringRef& msg : rawMessages) { messages.serializeBytes(msg); DEBUG_TAGS_AND_MESSAGE("TLogPeekFromDisk", entry.version, msg) .detail("UID", self->dbgid) .detail("LogId", logData->logId) - .detail("PeekTag", req.tag); + .detail("PeekTag", tag); } lastRefMessageVersion = entry.version; @@ -1840,10 +1831,10 @@ ACTOR Future tLogPeekMessages(TLogData* self, TLogPeekRequest req, Referen } } } else { - if (req.onlySpilled) { + if (reqOnlySpilled) { endVersion = logData->persistentDataDurableVersion + 1; } else { - peekMessagesFromMemory(logData, req, messages, endVersion); + peekMessagesFromMemory(logData, tag, begin, messages, endVersion); } //TraceEvent("TLogPeekResults", self->dbgid).detail("ForAddress", req.reply.getEndpoint().getPrimaryAddress()).detail("MessageBytes", messages.getLength()).detail("NextEpoch", next_pos.epoch).detail("NextSeq", next_pos.sequence).detail("NowSeq", self->sequence.getNextSequence()); @@ 
-1852,7 +1843,7 @@ ACTOR Future tLogPeekMessages(TLogData* self, TLogPeekRequest req, Referen TLogPeekReply reply; reply.maxKnownVersion = logData->version.get(); reply.minKnownCommittedVersion = logData->minKnownCommittedVersion; - reply.messages = messages.toValue(); + reply.messages = StringRef(reply.arena, messages.toValue()); reply.end = endVersion; reply.onlySpilled = onlySpilled; @@ -1861,7 +1852,7 @@ ACTOR Future tLogPeekMessages(TLogData* self, TLogPeekRequest req, Referen // detail("MsgBytes", reply.messages.expectedSize()). // detail("ForAddress", req.reply.getEndpoint().getPrimaryAddress()); - if (req.sequence.present()) { + if (sequence.present()) { auto& trackerData = logData->peekTracker[peekId]; trackerData.lastUpdate = now(); @@ -1883,9 +1874,8 @@ ACTOR Future tLogPeekMessages(TLogData* self, TLogPeekRequest req, Referen trackerData.blockTime += blockT; trackerData.workTime += workT; - auto& sequenceData = trackerData.sequence_version[sequence + 1]; - if (trackerData.sequence_version.size() && sequence + 1 < trackerData.sequence_version.begin()->first) { - req.reply.sendError(operation_obsolete()); + auto& sequenceData = trackerData.sequence_version[sequenceNum + 1]; + if (trackerData.sequence_version.size() && sequenceNum + 1 < trackerData.sequence_version.begin()->first) { if (!sequenceData.isSet()) { // It would technically be more correct to .send({req.begin, req.onlySpilled}), as the next // request might still be in the window of active requests, but LogSystemPeekCursor will @@ -1893,22 +1883,53 @@ ACTOR Future tLogPeekMessages(TLogData* self, TLogPeekRequest req, Referen // response will probably be a waste of CPU. sequenceData.sendError(operation_obsolete()); } - return Void(); + throw operation_obsolete(); } if (sequenceData.isSet()) { trackerData.duplicatePeeks++; if (sequenceData.getFuture().get().first != reply.end) { - TEST(true); // tlog peek second attempt ended at a different version (2) - req.reply.sendError(operation_obsolete()); - return Void(); + TEST(true); // 1 tlog peek second attempt ended at a different version (2) + throw operation_obsolete(); } } else { sequenceData.send(std::make_pair(reply.end, reply.onlySpilled)); } - reply.begin = req.begin; + reply.begin = begin; + } + + return reply; +} + +// This actor keep pushing TLogPeekStreamReply until it's removed from the cluster or should recover +ACTOR Future tLogPeekStream(TLogData* self, TLogPeekStreamRequest req, Reference logData) { + if (req.tag.locality == tagLocalityTxs && req.tag.id >= logData->txsTags && logData->txsTags > 0) { + req.tag.id = req.tag.id % logData->txsTags; + } + req.reply.setByteLimit(SERVER_KNOBS->MAXIMUM_PEEK_BYTES); + // loop { wait(req.reply.onReady()); }; + return Void(); +} + +ACTOR Future tLogPeekMessages(TLogData* self, TLogPeekRequest req, Reference logData) { + state BinaryWriter messages(Unversioned()); + state BinaryWriter messages2(Unversioned()); + state int sequence = -1; + state UID peekId; + state double queueStart = now(); + + try { + TLogPeekReply reply = wait(peekTLog(self, logData, req.begin, req.tag, req.returnIfBlocked, req.onlySpilled, req.sequence)); + req.reply.send(reply); + } catch (Error& e) { + if (e.code() == error_code_timed_out || e.code() == error_code_operation_obsolete || + e.code() == error_code_end_of_stream) { + req.reply.sendError(e); + return Void(); + } else { + throw; + } } - req.reply.send(reply); return Void(); } From b6d5c8a091b3fd662611a9ca69128d7c09722b76 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Mon, 5 Jul 2021 
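Patch 03 deliberately leaves tLogPeekStream as a stub around the shared peekTLog helper. The commit below and its follow-ups grow it into a server-side push loop; condensed across patches 04 through 08, the shape it converges to looks like this (error handling elided, and the 50 ms pacing delay is the value the later diff settles on):

ACTOR Future<Void> tLogPeekStream(TLogData* self, TLogPeekStreamRequest req, Reference<LogData> logData) {
	state Version begin = req.begin;
	state bool onlySpilled = false;
	req.reply.setByteLimit(std::min(SERVER_KNOBS->MAXIMUM_PEEK_BYTES, req.limitBytes));
	loop {
		state TLogPeekStreamReply reply;
		// onReady() blocks while too many unacknowledged bytes are in flight,
		// so the TLog cannot run arbitrarily far ahead of its consumer.
		wait(req.reply.onReady() &&
		     store(reply.rep, peekTLog(self, logData, begin, req.tag, req.returnIfBlocked, onlySpilled)));
		req.reply.send(reply);
		begin = reply.rep.end;
		onlySpilled = reply.rep.onlySpilled;
		wait(delay(.05, g_network->getCurrentTask()));
	}
}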
05:57:24 +0000 Subject: [PATCH 04/29] implement tLogPeekStream --- fdbserver/LogSystem.h | 2 +- fdbserver/LogSystemPeekCursor.actor.cpp | 38 +++++++++++++------------ fdbserver/TLogServer.actor.cpp | 23 +++++++++++++-- 3 files changed, 41 insertions(+), 22 deletions(-) diff --git a/fdbserver/LogSystem.h b/fdbserver/LogSystem.h index a8b7e10d75..d6d1190b84 100644 --- a/fdbserver/LogSystem.h +++ b/fdbserver/LogSystem.h @@ -423,7 +423,7 @@ struct ILogSystem { TLogPeekReply results; ArenaReader rd; - LogMessageVersion messageVersion, end; + LogMessageVersion messageVersion, end; // the version of current message; the intended end version of current cursor Version poppedVersion; TagsAndMessage messageAndTags; bool hasMsg; diff --git a/fdbserver/LogSystemPeekCursor.actor.cpp b/fdbserver/LogSystemPeekCursor.actor.cpp index 89b7e63c97..64d1026ea8 100644 --- a/fdbserver/LogSystemPeekCursor.actor.cpp +++ b/fdbserver/LogSystemPeekCursor.actor.cpp @@ -50,9 +50,6 @@ ILogSystem::ServerPeekCursor::ServerPeekCursor(ReferencePEEK_USEING_STREAMING) { this->results.maxKnownVersion = 0; this->results.minKnownCommittedVersion = 0; - if (usePeekStream) { - tryEstablishPeekStream(this); - } //TraceEvent("SPC_Starting", randomID).detail("Tag", tag.toString()).detail("Begin", begin).detail("End", end).backtrace(); } @@ -66,8 +63,7 @@ ILogSystem::ServerPeekCursor::ServerPeekCursor(TLogPeekReply const& results, : results(results), tag(tag), rd(results.arena, results.messages, Unversioned()), messageVersion(messageVersion), end(end), messageAndTags(message), hasMsg(hasMsg), randomID(deterministicRandom()->randomUniqueID()), poppedVersion(poppedVersion), returnIfBlocked(false), sequence(0), onlySpilled(false), parallelGetMore(false), - lastReset(0), slowReplies(0), fastReplies(0), unknownReplies(0), resetCheck(Void()), - usePeekStream(SERVER_KNOBS->PEEK_USEING_STREAMING) { + lastReset(0), slowReplies(0), fastReplies(0), unknownReplies(0), resetCheck(Void()), usePeekStream(false) { //TraceEvent("SPC_Clone", randomID); this->results.maxKnownVersion = 0; this->results.minKnownCommittedVersion = 0; @@ -319,12 +315,12 @@ ACTOR Future serverPeekParallelGetMore(ILogSystem::ServerPeekCursor* self, } ACTOR Future serverPeekStreamGetMore(ILogSystem::ServerPeekCursor* self, TaskPriority taskID) { - if (self->isExhausted()) { - if (self->hasMessage()) - return Void(); - wait(Future(Never())); - throw internal_error(); - } + if (self->isExhausted()) { + if (self->hasMessage()) + return Void(); + wait(Future(Never())); + throw internal_error(); + } try { tryEstablishPeekStream(self); @@ -336,9 +332,10 @@ ACTOR Future serverPeekStreamGetMore(ILogSystem::ServerPeekCursor* self, T self->peekReplyStream.reset(); tryEstablishPeekStream(self); } - when(TLogPeekStreamReply res = wait(self->peekReplyStream.present() - ? waitAndForward(self->peekReplyStream.get().getFuture()) - : Never())) { + when(TLogPeekStreamReply res = + wait(self->peekReplyStream.present() + ? 
brokenPromiseToNever(waitAndForward(self->peekReplyStream.get().getFuture())) + : Never())) { updateCursorWithReply(self, res.rep); return Void(); } @@ -346,15 +343,18 @@ ACTOR Future serverPeekStreamGetMore(ILogSystem::ServerPeekCursor* self, T } catch (Error& e) { if (e.code() == error_code_connection_failed) { self->peekReplyStream.reset(); + } else { + throw; } - throw; } } } catch (Error& e) { - if (e.code() == error_code_end_of_stream) { + if (e.code() == error_code_end_of_stream || e.code() == error_code_operation_obsolete) { self->end.reset(self->messageVersion.version); + self->peekReplyStream.reset(); return Void(); } + self->peekReplyStream.reset(); throw; } } @@ -397,7 +397,9 @@ Future ILogSystem::ServerPeekCursor::getMore(TaskPriority taskID) { if (hasMessage() && !parallelGetMore) return Void(); if (!more.isValid() || more.isReady()) { - if (parallelGetMore || onlySpilled || futureResults.size()) { + if (usePeekStream) { + more = serverPeekStreamGetMore(this, taskID); + } else if (parallelGetMore || onlySpilled || futureResults.size()) { more = serverPeekParallelGetMore(this, taskID); } else { more = serverPeekGetMore(this, taskID); @@ -427,7 +429,7 @@ Future ILogSystem::ServerPeekCursor::onFailed() { bool ILogSystem::ServerPeekCursor::isActive() const { if (!interf->get().present()) return false; - if (messageVersion >= end) + if (isExhausted()) return false; return IFailureMonitor::failureMonitor().getState(interf->get().interf().peekMessages.getEndpoint()).isAvailable(); } diff --git a/fdbserver/TLogServer.actor.cpp b/fdbserver/TLogServer.actor.cpp index 6e92e31c0c..8f512572cc 100644 --- a/fdbserver/TLogServer.actor.cpp +++ b/fdbserver/TLogServer.actor.cpp @@ -1902,12 +1902,29 @@ ACTOR Future peekTLog( // This actor keep pushing TLogPeekStreamReply until it's removed from the cluster or should recover ACTOR Future tLogPeekStream(TLogData* self, TLogPeekStreamRequest req, Reference logData) { + state Version begin = req.begin; + state bool onlySpilled = false; if (req.tag.locality == tagLocalityTxs && req.tag.id >= logData->txsTags && logData->txsTags > 0) { req.tag.id = req.tag.id % logData->txsTags; } - req.reply.setByteLimit(SERVER_KNOBS->MAXIMUM_PEEK_BYTES); - // loop { wait(req.reply.onReady()); }; - return Void(); + req.reply.setByteLimit(std::min(SERVER_KNOBS->MAXIMUM_PEEK_BYTES, req.limitBytes)); + loop { + state TLogPeekStreamReply reply; + try { + wait(req.reply.onReady() && store(reply.rep, peekTLog(self, logData, begin, req.tag, false, onlySpilled))); + req.reply.send(reply); + begin = reply.rep.end; + onlySpilled = reply.rep.onlySpilled; + + wait(delay(0.005, TaskPriority::TLogPeekReply)); + } catch (Error& e) { + if(e.code() == error_code_end_of_stream || e.code() == error_code_operation_obsolete) { + req.reply.sendError(e); + return Void(); + } + throw; + } + } } ACTOR Future tLogPeekMessages(TLogData* self, TLogPeekRequest req, Reference logData) { From 15347773d90ea2e5829196254d56e02376934e57 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Wed, 7 Jul 2021 22:55:49 +0000 Subject: [PATCH 05/29] fix double destruction memory bug --- fdbrpc/genericactors.actor.h | 2 +- fdbserver/LogSystemPeekCursor.actor.cpp | 65 +++++++++++-------------- 2 files changed, 29 insertions(+), 38 deletions(-) diff --git a/fdbrpc/genericactors.actor.h b/fdbrpc/genericactors.actor.h index 23bd3e97c2..46a79d29cf 100644 --- a/fdbrpc/genericactors.actor.h +++ b/fdbrpc/genericactors.actor.h @@ -197,7 +197,7 @@ struct PeerHolder { } }; -// Implements getRepyStream, this a void actor with 
the same lifetime as the input ReplyPromiseStream. +// Implements getReplyStream, this a void actor with the same lifetime as the input ReplyPromiseStream. // Because this actor holds a reference to the stream, normally it would be impossible to know when there are no other // references. To get around this, there is a SAV inside the stream that has one less promise reference than it should // (caused by getErrorFutureAndDelPromiseRef()). When that SAV gets a broken promise because no one besides this void diff --git a/fdbserver/LogSystemPeekCursor.actor.cpp b/fdbserver/LogSystemPeekCursor.actor.cpp index 64d1026ea8..c13f9dfac8 100644 --- a/fdbserver/LogSystemPeekCursor.actor.cpp +++ b/fdbserver/LogSystemPeekCursor.actor.cpp @@ -238,8 +238,7 @@ ACTOR Future serverPeekParallelGetMore(ILogSystem::ServerPeekCursor* self, if (!self->interf || self->isExhausted()) { if (self->hasMessage()) return Void(); - wait(Future(Never())); - throw internal_error(); + return Never(); } if (!self->interfaceChanged.isValid()) { @@ -315,54 +314,46 @@ ACTOR Future serverPeekParallelGetMore(ILogSystem::ServerPeekCursor* self, } ACTOR Future serverPeekStreamGetMore(ILogSystem::ServerPeekCursor* self, TaskPriority taskID) { - if (self->isExhausted()) { + if (!self->interf || self->isExhausted()) { if (self->hasMessage()) return Void(); - wait(Future(Never())); - throw internal_error(); + return Never(); } - try { - tryEstablishPeekStream(self); - loop { - try { - choose { - when(wait(self->interf->onChange())) { - self->onlySpilled = false; - self->peekReplyStream.reset(); - tryEstablishPeekStream(self); - } - when(TLogPeekStreamReply res = - wait(self->peekReplyStream.present() - ? brokenPromiseToNever(waitAndForward(self->peekReplyStream.get().getFuture())) - : Never())) { - updateCursorWithReply(self, res.rep); - return Void(); - } - } - } catch (Error& e) { - if (e.code() == error_code_connection_failed) { + tryEstablishPeekStream(self); + loop { + try { + choose { + when(wait(self->interf->onChange())) { + self->onlySpilled = false; self->peekReplyStream.reset(); - } else { - throw; + tryEstablishPeekStream(self); + } + when(TLogPeekStreamReply res = wait(self->peekReplyStream.present() + ? 
brokenPromiseToNever(waitAndForward(self->peekReplyStream.get().getFuture())) + : Never())) { + updateCursorWithReply(self, res.rep); + return Void(); } } + } catch (Error& e) { + if (e.code() == error_code_connection_failed) { + self->peekReplyStream.reset(); + } + else if(e.code() == error_code_end_of_stream) { + self->end.reset(self->messageVersion.version); + return Void(); + } + else { + throw; + } } - } catch (Error& e) { - if (e.code() == error_code_end_of_stream || e.code() == error_code_operation_obsolete) { - self->end.reset(self->messageVersion.version); - self->peekReplyStream.reset(); - return Void(); - } - self->peekReplyStream.reset(); - throw; } } ACTOR Future serverPeekGetMore(ILogSystem::ServerPeekCursor* self, TaskPriority taskID) { if (!self->interf || self->isExhausted()) { - wait(Future(Never())); - throw internal_error(); + return Never(); } try { loop { From 5a43a8c367836746d5706dcc1c38d7654a901c88 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Thu, 8 Jul 2021 19:32:58 +0000 Subject: [PATCH 06/29] add returnIfBlocked in stream request --- fdbserver/LogSystemPeekCursor.actor.cpp | 10 +++---- fdbserver/TLogInterface.h | 7 ++--- fdbserver/TLogServer.actor.cpp | 36 ++++++++++++------------- 3 files changed, 25 insertions(+), 28 deletions(-) diff --git a/fdbserver/LogSystemPeekCursor.actor.cpp b/fdbserver/LogSystemPeekCursor.actor.cpp index c13f9dfac8..ce7bc5aa1c 100644 --- a/fdbserver/LogSystemPeekCursor.actor.cpp +++ b/fdbserver/LogSystemPeekCursor.actor.cpp @@ -33,8 +33,8 @@ void tryEstablishPeekStream(ILogSystem::ServerPeekCursor* self) { self->peekReplyStream.reset(); return; } - self->peekReplyStream = self->interf->get().interf().peekStreamMessages.getReplyStream( - TLogPeekStreamRequest(self->messageVersion.version, self->tag, std::numeric_limits::max())); + self->peekReplyStream = self->interf->get().interf().peekStreamMessages.getReplyStream(TLogPeekStreamRequest( + self->messageVersion.version, self->tag, self->returnIfBlocked, std::numeric_limits::max())); } ILogSystem::ServerPeekCursor::ServerPeekCursor(Reference>> const& interf, @@ -339,12 +339,10 @@ ACTOR Future serverPeekStreamGetMore(ILogSystem::ServerPeekCursor* self, T } catch (Error& e) { if (e.code() == error_code_connection_failed) { self->peekReplyStream.reset(); - } - else if(e.code() == error_code_end_of_stream) { + } else if (e.code() == error_code_end_of_stream) { self->end.reset(self->messageVersion.version); return Void(); - } - else { + } else { throw; } } diff --git a/fdbserver/TLogInterface.h b/fdbserver/TLogInterface.h index ba8bc10d64..892adb3730 100644 --- a/fdbserver/TLogInterface.h +++ b/fdbserver/TLogInterface.h @@ -235,16 +235,17 @@ struct TLogPeekStreamRequest { Arena arena; Version begin; Tag tag; + bool returnIfBlocked; int limitBytes; ReplyPromiseStream reply; TLogPeekStreamRequest() {} - TLogPeekStreamRequest(Version version, Tag tag, int limitBytes) - : begin(version), tag(tag), limitBytes(limitBytes) {} + TLogPeekStreamRequest(Version version, Tag tag, bool returnIfBlocked, int limitBytes) + : begin(version), tag(tag), returnIfBlocked(returnIfBlocked), limitBytes(limitBytes) {} template void serialize(Ar& ar) { - serializer(ar, arena, begin, tag, limitBytes, reply); + serializer(ar, arena, begin, tag, returnIfBlocked, limitBytes, reply); } }; diff --git a/fdbserver/TLogServer.actor.cpp b/fdbserver/TLogServer.actor.cpp index 8f512572cc..bce1639dc4 100644 --- a/fdbserver/TLogServer.actor.cpp +++ b/fdbserver/TLogServer.actor.cpp @@ -1570,14 +1570,13 @@ ACTOR Future> 
parseMessagesForTag(StringRef commitBlob, T } // Common logics to peek TLog and create TLogPeekReply that serves both streaming peek or normal peek request -ACTOR Future peekTLog( - TLogData* self, - Reference logData, - Version begin, - Tag tag, - bool returnIfBlocked = false, - bool reqOnlySpilled = false, - Optional> sequence = Optional>()) { +ACTOR Future peekTLog(TLogData* self, + Reference logData, + Version begin, + Tag tag, + bool returnIfBlocked = false, + bool reqOnlySpilled = false, + Optional> sequence = Optional>()) { state BinaryWriter messages(Unversioned()); state BinaryWriter messages2(Unversioned()); state int sequenceNum = -1; @@ -1680,7 +1679,8 @@ ACTOR Future peekTLog( rep.end = poppedVer; rep.onlySpilled = false; - // TODO: once the fake stream is replaced by ReplyPromiseStream, we can remove the code handling sequence requests. + // TODO: once the fake stream is replaced by ReplyPromiseStream, we can remove the code handling sequence + // requests. if (sequence.present()) { auto& trackerData = logData->peekTracker[peekId]; auto& sequenceData = trackerData.sequence_version[sequenceNum + 1]; @@ -1911,31 +1911,29 @@ ACTOR Future tLogPeekStream(TLogData* self, TLogPeekStreamRequest req, Ref loop { state TLogPeekStreamReply reply; try { - wait(req.reply.onReady() && store(reply.rep, peekTLog(self, logData, begin, req.tag, false, onlySpilled))); + wait(req.reply.onReady() && + store(reply.rep, peekTLog(self, logData, begin, req.tag, req.returnIfBlocked, onlySpilled))); req.reply.send(reply); begin = reply.rep.end; onlySpilled = reply.rep.onlySpilled; wait(delay(0.005, TaskPriority::TLogPeekReply)); + // return Void(); } catch (Error& e) { - if(e.code() == error_code_end_of_stream || e.code() == error_code_operation_obsolete) { + if (e.code() == error_code_end_of_stream) { req.reply.sendError(e); return Void(); + } else { + throw; } - throw; } } } ACTOR Future tLogPeekMessages(TLogData* self, TLogPeekRequest req, Reference logData) { - state BinaryWriter messages(Unversioned()); - state BinaryWriter messages2(Unversioned()); - state int sequence = -1; - state UID peekId; - state double queueStart = now(); - try { - TLogPeekReply reply = wait(peekTLog(self, logData, req.begin, req.tag, req.returnIfBlocked, req.onlySpilled, req.sequence)); + TLogPeekReply reply = + wait(peekTLog(self, logData, req.begin, req.tag, req.returnIfBlocked, req.onlySpilled, req.sequence)); req.reply.send(reply); } catch (Error& e) { if (e.code() == error_code_timed_out || e.code() == error_code_operation_obsolete || From 6d1c12899dcdb263cb112a5b5ec15e3a00bd9a9c Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Fri, 9 Jul 2021 22:46:16 +0000 Subject: [PATCH 07/29] catch exceptions --- fdbrpc/fdbrpc.h | 4 ++-- fdbserver/LogSystemPeekCursor.actor.cpp | 12 +++++++----- fdbserver/TLogInterface.h | 2 +- fdbserver/TLogServer.actor.cpp | 8 +++++--- 4 files changed, 15 insertions(+), 11 deletions(-) diff --git a/fdbrpc/fdbrpc.h b/fdbrpc/fdbrpc.h index bfc0d93c78..81c3bbd3b5 100644 --- a/fdbrpc/fdbrpc.h +++ b/fdbrpc/fdbrpc.h @@ -406,8 +406,6 @@ struct NetNotifiedQueueWithAcknowledgements final : NotifiedQueue, template class ReplyPromiseStream { public: - // The endpoints of a ReplyPromiseStream must be initialized at Task::ReadSocket, because with lower priorities a - // delay(0) in FlowTransport deliver can cause out of order delivery. 
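The comment block being relocated here documents the producer side of ReplyPromiseStream; the consumer side, as this series drives it from ServerPeekCursor, follows this pattern (condensed from serverPeekStreamGetMore in the earlier diffs; brokenPromiseToNever and waitAndForward are existing flow helpers):

// peekReplyStream = interf.peekStreamMessages.getReplyStream(
//     TLogPeekStreamRequest(version, tag, returnIfBlocked, limitBytes));
// loop {
//     TLogPeekStreamReply res =
//         wait(brokenPromiseToNever(waitAndForward(peekReplyStream.get().getFuture())));
//     // consume res.rep; delivery acknowledges the bytes, which in turn
//     // unblocks the server's req.reply.onReady()
// }
// The server's sendError(end_of_stream()) surfaces here as a thrown error.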
// stream.send( request ) // Unreliable at most once delivery: Delivers request unless there is a connection failure (zero or one times) @@ -477,6 +475,8 @@ public: errors->delPromiseRef(); } + // The endpoints of a ReplyPromiseStream must be initialized at Task::ReadSocket, because with lower priorities a + // delay(0) in FlowTransport deliver can cause out of order delivery. const Endpoint& getEndpoint() const { return queue->getEndpoint(TaskPriority::ReadSocket); } bool operator==(const ReplyPromiseStream& rhs) const { return queue == rhs.queue; } diff --git a/fdbserver/LogSystemPeekCursor.actor.cpp b/fdbserver/LogSystemPeekCursor.actor.cpp index ce7bc5aa1c..97d17c4d62 100644 --- a/fdbserver/LogSystemPeekCursor.actor.cpp +++ b/fdbserver/LogSystemPeekCursor.actor.cpp @@ -386,13 +386,15 @@ Future ILogSystem::ServerPeekCursor::getMore(TaskPriority taskID) { if (hasMessage() && !parallelGetMore) return Void(); if (!more.isValid() || more.isReady()) { - if (usePeekStream) { + if (usePeekStream && taskID == TaskPriority::TLogPeekReply) { more = serverPeekStreamGetMore(this, taskID); - } else if (parallelGetMore || onlySpilled || futureResults.size()) { - more = serverPeekParallelGetMore(this, taskID); - } else { - more = serverPeekGetMore(this, taskID); } +// if (parallelGetMore || onlySpilled || futureResults.size()) { +// more = serverPeekParallelGetMore(this, taskID); +// } + else { + more = serverPeekGetMore(this, taskID); + } } return more; } diff --git a/fdbserver/TLogInterface.h b/fdbserver/TLogInterface.h index 892adb3730..f92638d639 100644 --- a/fdbserver/TLogInterface.h +++ b/fdbserver/TLogInterface.h @@ -83,7 +83,7 @@ struct TLogInterface { streams.push_back(disablePopRequest.getReceiver()); streams.push_back(enablePopRequest.getReceiver()); streams.push_back(snapRequest.getReceiver()); - streams.push_back(peekStreamMessages.getReceiver(TaskPriority::ReadSocket)); + streams.push_back(peekStreamMessages.getReceiver(TaskPriority::TLogPeek)); FlowTransport::transport().addEndpoints(streams); } diff --git a/fdbserver/TLogServer.actor.cpp b/fdbserver/TLogServer.actor.cpp index bce1639dc4..5524612a2c 100644 --- a/fdbserver/TLogServer.actor.cpp +++ b/fdbserver/TLogServer.actor.cpp @@ -1916,15 +1916,17 @@ ACTOR Future tLogPeekStream(TLogData* self, TLogPeekStreamRequest req, Ref req.reply.send(reply); begin = reply.rep.end; onlySpilled = reply.rep.onlySpilled; - - wait(delay(0.005, TaskPriority::TLogPeekReply)); // return Void(); } catch (Error& e) { if (e.code() == error_code_end_of_stream) { req.reply.sendError(e); return Void(); + } + else if (e.code() == error_code_operation_obsolete) { + // reply stream is cancelled on the client + return Void(); } else { - throw; + throw; } } } From 066d5341947a0df60efb8e21c7511510371a8a01 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Wed, 14 Jul 2021 16:19:23 +0000 Subject: [PATCH 08/29] trivial changes --- fdbserver/LogSystemPeekCursor.actor.cpp | 31 +++++++++++++++++++------ fdbserver/TLogServer.actor.cpp | 3 ++- 2 files changed, 26 insertions(+), 8 deletions(-) diff --git a/fdbserver/LogSystemPeekCursor.actor.cpp b/fdbserver/LogSystemPeekCursor.actor.cpp index 97d17c4d62..809b126e70 100644 --- a/fdbserver/LogSystemPeekCursor.actor.cpp +++ b/fdbserver/LogSystemPeekCursor.actor.cpp @@ -35,6 +35,7 @@ void tryEstablishPeekStream(ILogSystem::ServerPeekCursor* self) { } self->peekReplyStream = self->interf->get().interf().peekStreamMessages.getReplyStream(TLogPeekStreamRequest( self->messageVersion.version, self->tag, self->returnIfBlocked, 
std::numeric_limits::max())); + TraceEvent(SevDebug, "StreamCreated"); } ILogSystem::ServerPeekCursor::ServerPeekCursor(Reference>> const& interf, @@ -329,10 +330,15 @@ ACTOR Future serverPeekStreamGetMore(ILogSystem::ServerPeekCursor* self, T self->peekReplyStream.reset(); tryEstablishPeekStream(self); } - when(TLogPeekStreamReply res = wait(self->peekReplyStream.present() + when(TLogPeekStreamReply res = + wait(self->peekReplyStream.present() ? brokenPromiseToNever(waitAndForward(self->peekReplyStream.get().getFuture())) : Never())) { updateCursorWithReply(self, res.rep); + TraceEvent("SPC_GetMoreB", self->randomID) + .detail("Has", self->hasMessage()) + .detail("End", res.rep.end) + .detail("Popped", res.rep.popped.present() ? res.rep.popped.get() : 0); return Void(); } } @@ -389,12 +395,12 @@ Future ILogSystem::ServerPeekCursor::getMore(TaskPriority taskID) { if (usePeekStream && taskID == TaskPriority::TLogPeekReply) { more = serverPeekStreamGetMore(this, taskID); } -// if (parallelGetMore || onlySpilled || futureResults.size()) { -// more = serverPeekParallelGetMore(this, taskID); -// } + // if (parallelGetMore || onlySpilled || futureResults.size()) { + // more = serverPeekParallelGetMore(this, taskID); + // } else { - more = serverPeekGetMore(this, taskID); - } + more = serverPeekGetMore(this, taskID); + } } return more; } @@ -408,6 +414,12 @@ ACTOR Future serverPeekOnFailed(ILogSystem::ServerPeekCursor* self) { : Never())) { return Void(); } + when(wait(self->interf->get().present() + ? IFailureMonitor::failureMonitor().onStateEqual( + self->interf->get().interf().peekStreamMessages.getEndpoint(), FailureStatus()) + : Never())) { + return Void(); + } when(wait(self->interf->onChange())) {} } } @@ -422,7 +434,12 @@ bool ILogSystem::ServerPeekCursor::isActive() const { return false; if (isExhausted()) return false; - return IFailureMonitor::failureMonitor().getState(interf->get().interf().peekMessages.getEndpoint()).isAvailable(); + return IFailureMonitor::failureMonitor() + .getState(interf->get().interf().peekMessages.getEndpoint()) + .isAvailable() && + IFailureMonitor::failureMonitor() + .getState(interf->get().interf().peekStreamMessages.getEndpoint()) + .isAvailable(); } bool ILogSystem::ServerPeekCursor::isExhausted() const { diff --git a/fdbserver/TLogServer.actor.cpp index 5524612a2c..cdf8dacbad 100644 --- a/fdbserver/TLogServer.actor.cpp +++ b/fdbserver/TLogServer.actor.cpp @@ -1916,7 +1916,8 @@ ACTOR Future tLogPeekStream(TLogData* self, TLogPeekStreamRequest req, Ref req.reply.send(reply); begin = reply.rep.end; onlySpilled = reply.rep.onlySpilled; - // return Void(); + + wait(delay(.05, g_network->getCurrentTask())); } catch (Error& e) { if (e.code() == error_code_end_of_stream) { req.reply.sendError(e); From 227570357ab5f040dcea217d079f34f35bf42c5d Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Thu, 15 Jul 2021 21:30:14 +0000 Subject: [PATCH 09/29] trace log and reset changes; byteAcknowledge overflow --- fdbrpc/fdbrpc.h | 44 ++++++++++++++------- fdbserver/LogSystemPeekCursor.actor.cpp | 52 ++++++++++++++++--------- fdbserver/TLogServer.actor.cpp | 36 +++++++++-------- 3 files changed, 82 insertions(+), 50 deletions(-) diff --git a/fdbrpc/fdbrpc.h index 403d8d4dc2..b0443d2a23 100644 --- a/fdbrpc/fdbrpc.h +++ b/fdbrpc/fdbrpc.h @@ -277,9 +277,9 @@ struct AcknowledgementReceiver final : FlowReceiver, FastAllocated::operator new; using FastAllocated::operator delete; - int64_t bytesSent; - int64_t bytesAcknowledged; -
int64_t bytesLimit; + uint64_t bytesSent; + uint64_t bytesAcknowledged; + uint64_t bytesLimit; Promise ready; Future failures; @@ -358,11 +358,19 @@ struct NetNotifiedQueueWithAcknowledgements final : NotifiedQueue, // send an ack immediately if (acknowledgements.getRawEndpoint().isValid()) { acknowledgements.bytesAcknowledged += message.get().asUnderlyingType().expectedSize(); - FlowTransport::transport().sendUnreliable( - SerializeSource>( - AcknowledgementReply(acknowledgements.bytesAcknowledged)), - acknowledgements.getEndpoint(TaskPriority::ReadSocket), - false); + // int64_t overflow: we need to reset this stream + if (acknowledgements.bytesAcknowledged > std::numeric_limits::max()) { + FlowTransport::transport().sendUnreliable( + SerializeSource>(operation_obsolete()), + acknowledgements.getEndpoint(TaskPriority::ReadSocket), + false); + } else { + FlowTransport::transport().sendUnreliable( + SerializeSource>( + AcknowledgementReply(acknowledgements.bytesAcknowledged)), + acknowledgements.getEndpoint(TaskPriority::ReadSocket), + false); + } } } @@ -376,10 +384,17 @@ struct NetNotifiedQueueWithAcknowledgements final : NotifiedQueue, // A reply that has been queued up is being consumed, so send an ack to the server if (acknowledgements.getRawEndpoint().isValid()) { acknowledgements.bytesAcknowledged += res.expectedSize(); - FlowTransport::transport().sendUnreliable(SerializeSource>( - AcknowledgementReply(acknowledgements.bytesAcknowledged)), - acknowledgements.getEndpoint(TaskPriority::ReadSocket), - false); + if (acknowledgements.bytesAcknowledged > std::numeric_limits::max()) { + FlowTransport::transport().sendUnreliable( + SerializeSource>(operation_obsolete()), + acknowledgements.getEndpoint(TaskPriority::ReadSocket), + false); + } else { + FlowTransport::transport().sendUnreliable(SerializeSource>( + AcknowledgementReply(acknowledgements.bytesAcknowledged)), + acknowledgements.getEndpoint(TaskPriority::ReadSocket), + false); + } } return res; } @@ -406,7 +421,6 @@ struct NetNotifiedQueueWithAcknowledgements final : NotifiedQueue, template class ReplyPromiseStream { public: - // stream.send( request ) // Unreliable at most once delivery: Delivers request unless there is a connection failure (zero or one times) @@ -475,8 +489,8 @@ public: errors->delPromiseRef(); } - // The endpoints of a ReplyPromiseStream must be initialized at Task::ReadSocket, because with lower priorities a - // delay(0) in FlowTransport deliver can cause out of order delivery. + // The endpoints of a ReplyPromiseStream must be initialized at Task::ReadSocket, because with lower priorities a + // delay(0) in FlowTransport deliver can cause out of order delivery. 
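[Editor's sketch] The fdbrpc.h hunk above widens the acknowledgement counters to uint64_t and adds an explicit guard: once the acknowledged-byte count exceeds what the signed wire format can represent, the receiver sends operation_obsolete so the stream is torn down and re-established rather than wrapping around. A compilable sketch of just that guard, with illustrative names rather than FDB's types:

#include <cstdint>
#include <limits>

enum class AckAction { SendCount, ResetStream };

inline AckAction onBytesConsumed(uint64_t& bytesAcknowledged, uint64_t consumed) {
    bytesAcknowledged += consumed;
    // int64_t overflow: we need to reset this stream (mirrors the comment above)
    if (bytesAcknowledged > static_cast<uint64_t>(std::numeric_limits<int64_t>::max()))
        return AckAction::ResetStream; // the patch sends operation_obsolete() here
    return AckAction::SendCount;       // the patch sends AcknowledgementReply(count)
}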
const Endpoint& getEndpoint() const { return queue->getEndpoint(TaskPriority::ReadSocket); } bool operator==(const ReplyPromiseStream& rhs) const { return queue == rhs.queue; } diff --git a/fdbserver/LogSystemPeekCursor.actor.cpp b/fdbserver/LogSystemPeekCursor.actor.cpp index 809b126e70..154d870f88 100644 --- a/fdbserver/LogSystemPeekCursor.actor.cpp +++ b/fdbserver/LogSystemPeekCursor.actor.cpp @@ -35,7 +35,7 @@ void tryEstablishPeekStream(ILogSystem::ServerPeekCursor* self) { } self->peekReplyStream = self->interf->get().interf().peekStreamMessages.getReplyStream(TLogPeekStreamRequest( self->messageVersion.version, self->tag, self->returnIfBlocked, std::numeric_limits::max())); - TraceEvent(SevDebug, "StreamCreated"); + TraceEvent(SevDebug, "SPC_StreamCreated", self->randomID); } ILogSystem::ServerPeekCursor::ServerPeekCursor(Reference>> const& interf, @@ -51,7 +51,11 @@ ILogSystem::ServerPeekCursor::ServerPeekCursor(ReferencePEEK_USEING_STREAMING) { this->results.maxKnownVersion = 0; this->results.minKnownCommittedVersion = 0; - //TraceEvent("SPC_Starting", randomID).detail("Tag", tag.toString()).detail("Begin", begin).detail("End", end).backtrace(); + TraceEvent("SPC_Starting", randomID) + .detail("Tag", tag.toString()) + .detail("Begin", begin) + .detail("End", end) + .backtrace(); } ILogSystem::ServerPeekCursor::ServerPeekCursor(TLogPeekReply const& results, @@ -316,29 +320,34 @@ ACTOR Future serverPeekParallelGetMore(ILogSystem::ServerPeekCursor* self, ACTOR Future serverPeekStreamGetMore(ILogSystem::ServerPeekCursor* self, TaskPriority taskID) { if (!self->interf || self->isExhausted()) { + self->peekReplyStream.reset(); if (self->hasMessage()) return Void(); return Never(); } - tryEstablishPeekStream(self); loop { try { + tryEstablishPeekStream(self); + state Future fPeekReply = self->peekReplyStream.present() + ? map(waitAndForward(self->peekReplyStream.get().getFuture()), + [](const TLogPeekStreamReply& r) { return r.rep; }) + : Never(); choose { when(wait(self->interf->onChange())) { self->onlySpilled = false; self->peekReplyStream.reset(); - tryEstablishPeekStream(self); } - when(TLogPeekStreamReply res = + when(TLogPeekReply res = wait(self->peekReplyStream.present() - ? brokenPromiseToNever(waitAndForward(self->peekReplyStream.get().getFuture())) + ? recordRequestMetrics( + self, self->peekReplyStream.get().getEndpoint().getPrimaryAddress(), fPeekReply) : Never())) { - updateCursorWithReply(self, res.rep); + updateCursorWithReply(self, res); TraceEvent("SPC_GetMoreB", self->randomID) .detail("Has", self->hasMessage()) - .detail("End", res.rep.end) - .detail("Popped", res.rep.popped.present() ? res.rep.popped.get() : 0); + .detail("End", res.end) + .detail("Popped", res.popped.present() ? 
res.popped.get() : 0); return Void(); } } @@ -388,19 +397,24 @@ ACTOR Future serverPeekGetMore(ILogSystem::ServerPeekCursor* self, TaskPri } Future ILogSystem::ServerPeekCursor::getMore(TaskPriority taskID) { - //TraceEvent("SPC_GetMore", randomID).detail("HasMessage", hasMessage()).detail("More", !more.isValid() || more.isReady()).detail("MessageVersion", messageVersion.toString()).detail("End", end.toString()); + TraceEvent("SPC_GetMore", randomID) + .detail("HasMessage", hasMessage()) + .detail("More", !more.isValid() || more.isReady()) + .detail("MessageVersion", messageVersion.toString()) + .detail("End", end.toString()); if (hasMessage() && !parallelGetMore) return Void(); if (!more.isValid() || more.isReady()) { - if (usePeekStream && taskID == TaskPriority::TLogPeekReply) { - more = serverPeekStreamGetMore(this, taskID); - } - // if (parallelGetMore || onlySpilled || futureResults.size()) { - // more = serverPeekParallelGetMore(this, taskID); - // } - else { - more = serverPeekGetMore(this, taskID); - } + more = serverPeekStreamGetMore(this, taskID); +// if (usePeekStream && taskID == TaskPriority::TLogPeekReply) { +// more = serverPeekStreamGetMore(this, taskID); +// } +// if (parallelGetMore || onlySpilled || futureResults.size()) { +// more = serverPeekParallelGetMore(this, taskID); +// } +// else { +// more = serverPeekGetMore(this, taskID); +// } } return more; } diff --git a/fdbserver/TLogServer.actor.cpp b/fdbserver/TLogServer.actor.cpp index a46d22d856..9ad416986e 100644 --- a/fdbserver/TLogServer.actor.cpp +++ b/fdbserver/TLogServer.actor.cpp @@ -557,6 +557,7 @@ struct LogData : NonCopyable, public ReferenceCounted { TLogData* tLogData; Promise recoveryComplete, committingQueue; Version unrecoveredBefore, recoveredAt; + int activePeekStreams = 0; struct PeekTrackerData { std::map>> @@ -668,6 +669,7 @@ struct LogData : NonCopyable, public ReferenceCounted { specialCounter(cc, "PeekMemoryReserved", [tLogData]() { return tLogData->peekMemoryLimiter.activePermits(); }); specialCounter(cc, "PeekMemoryRequestsStalled", [tLogData]() { return tLogData->peekMemoryLimiter.waiters(); }); specialCounter(cc, "Generation", [this]() { return this->recoveryCount; }); + specialCounter(cc, "ActivePeekStreams", [this]() { return this->activePeekStreams; }); } ~LogData() { @@ -1167,17 +1169,19 @@ ACTOR Future tLogPopCore(TLogData* self, Tag inputTag, Version to, Referen } uint64_t PoppedVersionLag = logData->persistentDataDurableVersion - logData->queuePoppedVersion; - if ( SERVER_KNOBS->ENABLE_DETAILED_TLOG_POP_TRACE && - (logData->queuePoppedVersion > 0) && //avoid generating massive events at beginning - (tagData->unpoppedRecovered || PoppedVersionLag >= SERVER_KNOBS->TLOG_POPPED_VER_LAG_THRESHOLD_FOR_TLOGPOP_TRACE)) { //when recovery or long lag + if (SERVER_KNOBS->ENABLE_DETAILED_TLOG_POP_TRACE && + (logData->queuePoppedVersion > 0) && // avoid generating massive events at beginning + (tagData->unpoppedRecovered || + PoppedVersionLag >= + SERVER_KNOBS->TLOG_POPPED_VER_LAG_THRESHOLD_FOR_TLOGPOP_TRACE)) { // when recovery or long lag TraceEvent("TLogPopDetails", logData->logId) - .detail("Tag", tagData->tag.toString()) - .detail("UpTo", upTo) - .detail("PoppedVersionLag", PoppedVersionLag) - .detail("MinPoppedTag", logData->minPoppedTag.toString()) - .detail("QueuePoppedVersion", logData->queuePoppedVersion) - .detail("UnpoppedRecovered", tagData->unpoppedRecovered ? "True" : "False") - .detail("NothingPersistent", tagData->nothingPersistent ? 
"True" : "False"); + .detail("Tag", tagData->tag.toString()) + .detail("UpTo", upTo) + .detail("PoppedVersionLag", PoppedVersionLag) + .detail("MinPoppedTag", logData->minPoppedTag.toString()) + .detail("QueuePoppedVersion", logData->queuePoppedVersion) + .detail("UnpoppedRecovered", tagData->unpoppedRecovered ? "True" : "False") + .detail("NothingPersistent", tagData->nothingPersistent ? "True" : "False"); } if (upTo > logData->persistentDataDurableVersion) wait(tagData->eraseMessagesBefore(upTo, self, logData, TaskPriority::TLogPop)); @@ -1915,6 +1919,7 @@ ACTOR Future peekTLog(TLogData* self, // This actor keep pushing TLogPeekStreamReply until it's removed from the cluster or should recover ACTOR Future tLogPeekStream(TLogData* self, TLogPeekStreamRequest req, Reference logData) { + logData->activePeekStreams ++; state Version begin = req.begin; state bool onlySpilled = false; if (req.tag.locality == tagLocalityTxs && req.tag.id >= logData->txsTags && logData->txsTags > 0) { @@ -1929,18 +1934,17 @@ ACTOR Future tLogPeekStream(TLogData* self, TLogPeekStreamRequest req, Ref req.reply.send(reply); begin = reply.rep.end; onlySpilled = reply.rep.onlySpilled; - - wait(delay(.05, g_network->getCurrentTask())); + wait(delay(0, g_network->getCurrentTask())); } catch (Error& e) { + logData->activePeekStreams --; if (e.code() == error_code_end_of_stream) { req.reply.sendError(e); return Void(); - } - else if (e.code() == error_code_operation_obsolete) { + } else if (e.code() == error_code_operation_obsolete) { // reply stream is cancelled on the client - return Void(); + return Void(); } else { - throw; + throw; } } } From f3667ce91adc18dbc49fb3f7d3e609b1f46f78d6 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Mon, 19 Jul 2021 18:43:51 +0000 Subject: [PATCH 10/29] more debug logs; let tryEstablishStream wait until the connection is good --- fdbserver/LogSystemPeekCursor.actor.cpp | 45 ++++++++++++++----------- fdbserver/TLogServer.actor.cpp | 11 +++--- 2 files changed, 33 insertions(+), 23 deletions(-) diff --git a/fdbserver/LogSystemPeekCursor.actor.cpp b/fdbserver/LogSystemPeekCursor.actor.cpp index 154d870f88..7473d7525a 100644 --- a/fdbserver/LogSystemPeekCursor.actor.cpp +++ b/fdbserver/LogSystemPeekCursor.actor.cpp @@ -26,16 +26,20 @@ #include "flow/actorcompiler.h" // has to be last include // create a peek stream for cursor when it's possible -void tryEstablishPeekStream(ILogSystem::ServerPeekCursor* self) { +ACTOR Future tryEstablishPeekStream(ILogSystem::ServerPeekCursor* self) { if (self->peekReplyStream.present()) - return; + return Void(); else if (!self->interf || !self->interf->get().present()) { self->peekReplyStream.reset(); - return; + return Never(); } + wait(IFailureMonitor::failureMonitor().onStateEqual(self->interf->get().interf().peekStreamMessages.getEndpoint(), + FailureStatus(false))); self->peekReplyStream = self->interf->get().interf().peekStreamMessages.getReplyStream(TLogPeekStreamRequest( self->messageVersion.version, self->tag, self->returnIfBlocked, std::numeric_limits::max())); - TraceEvent(SevDebug, "SPC_StreamCreated", self->randomID); + TraceEvent(SevDebug, "SPC_StreamCreated", self->randomID) + .detail("PeerAddress", self->interf->get().interf().peekStreamMessages.getEndpoint().getPrimaryAddress()); + return Void(); } ILogSystem::ServerPeekCursor::ServerPeekCursor(Reference>> const& interf, @@ -328,21 +332,23 @@ ACTOR Future serverPeekStreamGetMore(ILogSystem::ServerPeekCursor* self, T loop { try { - tryEstablishPeekStream(self); state Future fPeekReply = 
self->peekReplyStream.present() ? map(waitAndForward(self->peekReplyStream.get().getFuture()), [](const TLogPeekStreamReply& r) { return r.rep; }) : Never(); choose { + when(wait(self->peekReplyStream.present() ? Never() : tryEstablishPeekStream(self))) {} when(wait(self->interf->onChange())) { self->onlySpilled = false; self->peekReplyStream.reset(); } - when(TLogPeekReply res = - wait(self->peekReplyStream.present() - ? recordRequestMetrics( - self, self->peekReplyStream.get().getEndpoint().getPrimaryAddress(), fPeekReply) - : Never())) { + when(TLogPeekReply res = wait( + self->peekReplyStream.present() + ? recordRequestMetrics( + self, + self->interf->get().interf().peekStreamMessages.getEndpoint().getPrimaryAddress(), + fPeekReply) + : Never())) { updateCursorWithReply(self, res); TraceEvent("SPC_GetMoreB", self->randomID) .detail("Has", self->hasMessage()) @@ -352,6 +358,7 @@ ACTOR Future serverPeekStreamGetMore(ILogSystem::ServerPeekCursor* self, T } } } catch (Error& e) { + TraceEvent(SevDebug, "SPC_GetMoreB_Error", self->randomID).detail("Error", e.what()); if (e.code() == error_code_connection_failed) { self->peekReplyStream.reset(); } else if (e.code() == error_code_end_of_stream) { @@ -406,15 +413,15 @@ Future ILogSystem::ServerPeekCursor::getMore(TaskPriority taskID) { return Void(); if (!more.isValid() || more.isReady()) { more = serverPeekStreamGetMore(this, taskID); -// if (usePeekStream && taskID == TaskPriority::TLogPeekReply) { -// more = serverPeekStreamGetMore(this, taskID); -// } -// if (parallelGetMore || onlySpilled || futureResults.size()) { -// more = serverPeekParallelGetMore(this, taskID); -// } -// else { -// more = serverPeekGetMore(this, taskID); -// } + // if (usePeekStream && taskID == TaskPriority::TLogPeekReply) { + // more = serverPeekStreamGetMore(this, taskID); + // } + // if (parallelGetMore || onlySpilled || futureResults.size()) { + // more = serverPeekParallelGetMore(this, taskID); + // } + // else { + // more = serverPeekGetMore(this, taskID); + // } } return more; } diff --git a/fdbserver/TLogServer.actor.cpp b/fdbserver/TLogServer.actor.cpp index 9ad416986e..f437f24637 100644 --- a/fdbserver/TLogServer.actor.cpp +++ b/fdbserver/TLogServer.actor.cpp @@ -341,6 +341,7 @@ struct TLogData : NonCopyable { int64_t targetVolatileBytes; // The number of bytes of mutations this TLog should hold in memory before spilling. 
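[Editor's sketch] serverPeekStreamGetMore and tryEstablishPeekStream above implement a lazy (re)connect loop on the client side: establish the stream on demand, reset it on connection_failed, and give up when the peer answers end_of_stream. A plain-C++ state-machine sketch of that lifecycle, with std::optional standing in for Optional<ReplyPromiseStream<...>> and callbacks standing in for the flow futures:

#include <functional>
#include <optional>

enum class RecvResult { Reply, ConnectionFailed, EndOfStream };

struct PeekStreamClient {
    std::optional<int> stream;           // stand-in for the optional reply stream
    std::function<int()> establish;      // models peekStreamMessages.getReplyStream(...)
    std::function<RecvResult(int)> recv; // models waiting on the reply stream

    // Returns false when the server ends the stream; the caller should then
    // fall back to the request/response peek path.
    bool getMore() {
        for (;;) {
            if (!stream)
                stream = establish();    // tryEstablishPeekStream
            switch (recv(*stream)) {
            case RecvResult::Reply:
                return true;             // updateCursorWithReply has run
            case RecvResult::ConnectionFailed:
                stream.reset();          // re-establish on the next iteration
                break;
            case RecvResult::EndOfStream:
                return false;
            }
        }
    }
};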
int64_t overheadBytesInput; int64_t overheadBytesDurable; + int activePeekStreams = 0; WorkerCache tlogCache; FlowLock peekMemoryLimiter; @@ -557,7 +558,6 @@ struct LogData : NonCopyable, public ReferenceCounted { TLogData* tLogData; Promise recoveryComplete, committingQueue; Version unrecoveredBefore, recoveredAt; - int activePeekStreams = 0; struct PeekTrackerData { std::map>> @@ -669,7 +669,7 @@ struct LogData : NonCopyable, public ReferenceCounted { specialCounter(cc, "PeekMemoryReserved", [tLogData]() { return tLogData->peekMemoryLimiter.activePermits(); }); specialCounter(cc, "PeekMemoryRequestsStalled", [tLogData]() { return tLogData->peekMemoryLimiter.waiters(); }); specialCounter(cc, "Generation", [this]() { return this->recoveryCount; }); - specialCounter(cc, "ActivePeekStreams", [this]() { return this->activePeekStreams; }); + specialCounter(cc, "ActivePeekStreams", [tLogData]() { return tLogData->activePeekStreams; }); } ~LogData() { @@ -1919,7 +1919,8 @@ ACTOR Future peekTLog(TLogData* self, // This actor keep pushing TLogPeekStreamReply until it's removed from the cluster or should recover ACTOR Future tLogPeekStream(TLogData* self, TLogPeekStreamRequest req, Reference logData) { - logData->activePeekStreams ++; + self->activePeekStreams++; + TraceEvent(SevDebug, "TLogPeekStream", logData->logId); state Version begin = req.begin; state bool onlySpilled = false; if (req.tag.locality == tagLocalityTxs && req.tag.id >= logData->txsTags && logData->txsTags > 0) { @@ -1936,7 +1937,9 @@ ACTOR Future tLogPeekStream(TLogData* self, TLogPeekStreamRequest req, Ref onlySpilled = reply.rep.onlySpilled; wait(delay(0, g_network->getCurrentTask())); } catch (Error& e) { - logData->activePeekStreams --; + self->activePeekStreams--; + TraceEvent(SevDebug, "TLogPeekStreamEnd", logData->logId).detail("Error", e.what()); + if (e.code() == error_code_end_of_stream) { req.reply.sendError(e); return Void(); From 5046ee3b07ec89a81a61403be184d847f45bedf5 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Tue, 20 Jul 2021 17:42:00 +0000 Subject: [PATCH 11/29] add stream peek to logRouter --- fdbrpc/fdbrpc.h | 21 +- fdbserver/LogRouter.actor.cpp | 376 +++++++++++++++++------- fdbserver/LogSystemPeekCursor.actor.cpp | 20 +- fdbserver/QuietDatabase.actor.cpp | 3 +- fdbserver/TLogServer.actor.cpp | 5 +- 5 files changed, 298 insertions(+), 127 deletions(-) diff --git a/fdbrpc/fdbrpc.h b/fdbrpc/fdbrpc.h index 25736055f6..1f40bb47a7 100644 --- a/fdbrpc/fdbrpc.h +++ b/fdbrpc/fdbrpc.h @@ -492,8 +492,8 @@ public: errors->delPromiseRef(); } - // The endpoints of a ReplyPromiseStream must be initialized at Task::NoDeliverDelay, because with lower priorities a - // delay(0) in FlowTransport deliver can cause out of order delivery. + // The endpoints of a ReplyPromiseStream must be initialized at Task::NoDeliverDelay, because with lower priorities + // a delay(0) in FlowTransport deliver can cause out of order delivery. 
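[Editor's sketch] With this patch the activePeekStreams gauge lives on the shared TLogData and is incremented on entry to each stream actor and decremented in its error path. For comparison, a RAII rendering of the same bookkeeping in plain C++, where the decrement runs on every exit path by construction; this is an alternative shape, not what the flow code above does:

#include <atomic>

struct ScopedGauge {
    std::atomic<int>& gauge;
    explicit ScopedGauge(std::atomic<int>& g) : gauge(g) { ++gauge; }
    ~ScopedGauge() { --gauge; } // runs on every exit path, including exceptions
    ScopedGauge(const ScopedGauge&) = delete;
    ScopedGauge& operator=(const ScopedGauge&) = delete;
};

// usage: ScopedGauge guard(activePeekStreams); then serve the stream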
const Endpoint& getEndpoint() const { return queue->getEndpoint(TaskPriority::NoDeliverDelay); } bool operator==(const ReplyPromiseStream& rhs) const { return queue == rhs.queue; } @@ -710,20 +710,17 @@ public: template ReplyPromiseStream getReplyStream(const X& value) const { + Future disc = makeDependent(IFailureMonitor::failureMonitor()).onDisconnectOrFailure(getEndpoint()); + auto& p = getReplyPromiseStream(value); + Reference peer; if (queue->isRemoteEndpoint()) { - Future disc = - makeDependent(IFailureMonitor::failureMonitor()).onDisconnectOrFailure(getEndpoint()); - auto& p = getReplyPromiseStream(value); - Reference peer = - FlowTransport::transport().sendUnreliable(SerializeSource(value), getEndpoint(), true); - // FIXME: defer sending the message until we know the connection is established - endStreamOnDisconnect(disc, p, getEndpoint(), peer); - return p; + peer = FlowTransport::transport().sendUnreliable(SerializeSource(value), getEndpoint(), true); } else { send(value); - auto& p = getReplyPromiseStream(value); - return p; } + // FIXME: defer sending the message until we know the connection is established + endStreamOnDisconnect(disc, p, getEndpoint(), peer); + return p; } // stream.getReplyUnlessFailedFor( request, double sustainedFailureDuration, double sustainedFailureSlope ) diff --git a/fdbserver/LogRouter.actor.cpp b/fdbserver/LogRouter.actor.cpp index ec0ec6a416..8e9ee308a6 100644 --- a/fdbserver/LogRouter.actor.cpp +++ b/fdbserver/LogRouter.actor.cpp @@ -117,6 +117,7 @@ struct LogRouterData { getMoreBlockedCount; // Increase by 1 if data is not available when LR tries to pull data from satellite tLog. Future logger; Reference eventCacheHolder; + int activePeekStreams = 0; std::vector> tag_data; // we only store data for the remote tag locality @@ -193,6 +194,7 @@ struct LogRouterData { return int64_t(1000 * val); }); specialCounter(cc, "Generation", [this]() { return this->generation; }); + specialCounter(cc, "ActivePeekStreams", [this]() { return this->activePeekStreams; }); logger = traceCounters("LogRouterMetrics", dbgid, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, @@ -404,18 +406,15 @@ std::deque>& get_version_messages(Lo return tagData->version_messages; }; -void peekMessagesFromMemory(LogRouterData* self, - TLogPeekRequest const& req, - BinaryWriter& messages, - Version& endVersion) { +void peekMessagesFromMemory(LogRouterData* self, Tag tag, Version begin, BinaryWriter& messages, Version& endVersion) { ASSERT(!messages.getLength()); - auto& deque = get_version_messages(self, req.tag); + auto& deque = get_version_messages(self, tag); //TraceEvent("TLogPeekMem", self->dbgid).detail("Tag", req.tag1).detail("PDS", self->persistentDataSequence).detail("PDDS", self->persistentDataDurableSequence).detail("Oldest", map1.empty() ? 0 : map1.begin()->key ).detail("OldestMsgCount", map1.empty() ? 
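[Editor's sketch] The LogRouter hunk above changes peekMessagesFromMemory to take a bare (tag, begin) pair so the same routine can serve both the RPC and the stream paths. Its core is a lower_bound over a per-tag deque of versioned message blocks; a self-contained model with std:: types (the real code additionally clamps begin to persistentDataDurableVersion + 1 and uses CompareFirst):

#include <algorithm>
#include <deque>
#include <string>
#include <utility>

using VersionedMessage = std::pair<long, std::string>; // (version, serialized batch)

void peekFromMemory(const std::deque<VersionedMessage>& tagDeque, long begin,
                    std::string& out, long& endVersion) {
    auto it = std::lower_bound(tagDeque.begin(), tagDeque.end(),
                               std::make_pair(begin, std::string()),
                               [](const VersionedMessage& a, const VersionedMessage& b) {
                                   return a.first < b.first; // compare version only
                               });
    for (; it != tagDeque.end(); ++it) {
        out += it->second;
        endVersion = it->first + 1; // the next peek resumes here
    }
}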
0 : map1.begin()->value.size()); auto it = std::lower_bound(deque.begin(), deque.end(), - std::make_pair(req.begin, LengthPrefixedStringRef()), + std::make_pair(begin, LengthPrefixedStringRef()), CompareFirst>()); Version currentVersion = -1; @@ -442,126 +441,296 @@ Version poppedVersion(LogRouterData* self, Tag tag) { return tagData->popped; } -ACTOR Future logRouterPeekMessages(LogRouterData* self, TLogPeekRequest req) { +ACTOR Future peekLogRouter(LogRouterData* self, + Version begin, + Tag tag, + bool returnIfBlocked = false, + bool reqOnlySpilled = false, + Optional> sequence = Optional>()) { state BinaryWriter messages(Unversioned()); - state int sequence = -1; + state int sequenceNum = -1; state UID peekId; - if (req.sequence.present()) { - try { - peekId = req.sequence.get().first; - sequence = req.sequence.get().second; - if (sequence >= SERVER_KNOBS->PARALLEL_GET_MORE_REQUESTS && - self->peekTracker.find(peekId) == self->peekTracker.end()) { - throw operation_obsolete(); + if (sequence.present()) { + peekId = sequence.get().first; + sequenceNum = sequence.get().second; + if (sequenceNum >= SERVER_KNOBS->PARALLEL_GET_MORE_REQUESTS && + self->peekTracker.find(peekId) == self->peekTracker.end()) { + throw operation_obsolete(); + } + auto& trackerData = self->peekTracker[peekId]; + if (sequenceNum == 0 && trackerData.sequence_version.find(0) == trackerData.sequence_version.end()) { + trackerData.sequence_version[0].send(std::make_pair(begin, reqOnlySpilled)); + } + auto seqBegin = trackerData.sequence_version.begin(); + // The peek cursor and this comparison need to agree about the maximum number of in-flight requests. + while (trackerData.sequence_version.size() && + seqBegin->first <= sequenceNum - SERVER_KNOBS->PARALLEL_GET_MORE_REQUESTS) { + if (seqBegin->second.canBeSet()) { + seqBegin->second.sendError(operation_obsolete()); } + trackerData.sequence_version.erase(seqBegin); + seqBegin = trackerData.sequence_version.begin(); + } + + if (trackerData.sequence_version.size() && sequenceNum < seqBegin->first) { + throw operation_obsolete(); + } + + trackerData.lastUpdate = now(); + std::pair prevPeekData = wait(trackerData.sequence_version[sequenceNum].getFuture()); + begin = prevPeekData.first; + reqOnlySpilled = prevPeekData.second; + wait(yield()); + } + + //TraceEvent("LogRouterPeek1", self->dbgid).detail("From", req.reply.getEndpoint().getPrimaryAddress()).detail("Ver", self->version.get()).detail("Begin", req.begin); + if (returnIfBlocked && self->version.get() < begin) { + //TraceEvent("LogRouterPeek2", self->dbgid); + if (sequence.present()) { auto& trackerData = self->peekTracker[peekId]; - if (sequence == 0 && trackerData.sequence_version.find(0) == trackerData.sequence_version.end()) { - trackerData.sequence_version[0].send(std::make_pair(req.begin, req.onlySpilled)); - } - auto seqBegin = trackerData.sequence_version.begin(); - // The peek cursor and this comparison need to agree about the maximum number of in-flight requests. 
- while (trackerData.sequence_version.size() && - seqBegin->first <= sequence - SERVER_KNOBS->PARALLEL_GET_MORE_REQUESTS) { - if (seqBegin->second.canBeSet()) { - seqBegin->second.sendError(operation_obsolete()); - } - trackerData.sequence_version.erase(seqBegin); - seqBegin = trackerData.sequence_version.begin(); + auto& sequenceData = trackerData.sequence_version[sequenceNum + 1]; + if (!sequenceData.isSet()) { + sequenceData.send(std::make_pair(begin, reqOnlySpilled)); } + } + throw end_of_stream(); + } - if (trackerData.sequence_version.size() && sequence < seqBegin->first) { + if (self->version.get() < begin) { + wait(self->version.whenAtLeast(begin)); + wait(delay(SERVER_KNOBS->TLOG_PEEK_DELAY, g_network->getCurrentTask())); + } + + Version poppedVer = poppedVersion(self, tag); + + if (poppedVer > begin || begin < self->startVersion) { + // This should only happen if a packet is sent multiple times and the reply is not needed. + // Since we are using popped differently, do not send a reply. + TraceEvent(SevWarnAlways, "LogRouterPeekPopped", self->dbgid) + .detail("Begin", begin) + .detail("Popped", poppedVer) + .detail("Start", self->startVersion); + if (sequence.present()) { + auto& trackerData = self->peekTracker[peekId]; + auto& sequenceData = trackerData.sequence_version[sequenceNum + 1]; + if (!sequenceData.isSet()) { + sequenceData.send(std::make_pair(begin, reqOnlySpilled)); + } + } + throw success(); // we've already replied in the past + } + + Version endVersion = self->version.get() + 1; + peekMessagesFromMemory(self, tag, begin, messages, endVersion); + + TLogPeekReply reply; + reply.maxKnownVersion = self->version.get(); + reply.minKnownCommittedVersion = self->poppedVersion; + reply.messages = StringRef(reply.arena, messages.toValue()); + reply.popped = self->minPopped.get() >= self->startVersion ? 
self->minPopped.get() : 0; + reply.end = endVersion; + reply.onlySpilled = false; + + if (sequence.present()) { + auto& trackerData = self->peekTracker[peekId]; + trackerData.lastUpdate = now(); + auto& sequenceData = trackerData.sequence_version[sequenceNum + 1]; + if (trackerData.sequence_version.size() && sequenceNum + 1 < trackerData.sequence_version.begin()->first) { + if (!sequenceData.isSet()) + sequenceData.sendError(operation_obsolete()); + throw operation_obsolete(); + } + if (sequenceData.isSet()) { + if (sequenceData.getFuture().get().first != reply.end) { + TEST(true); // tlog peek second attempt ended at a different version throw operation_obsolete(); } + } else { + sequenceData.send(std::make_pair(reply.end, reply.onlySpilled)); + } + reply.begin = begin; + } - trackerData.lastUpdate = now(); - std::pair prevPeekData = wait(trackerData.sequence_version[sequence].getFuture()); - req.begin = prevPeekData.first; - req.onlySpilled = prevPeekData.second; - wait(yield()); + //TraceEvent("LogRouterPeek4", self->dbgid); + return reply; +} + +// This actor keep pushing TLogPeekStreamReply until it's removed from the cluster or should recover +ACTOR Future logRouterPeekStream(LogRouterData* self, TLogPeekStreamRequest req) { + self->activePeekStreams++; + TraceEvent(SevDebug, "TLogPeekStream", self->dbgid).detail("Token", req.reply.getEndpoint().token); + + state Version begin = req.begin; + state bool onlySpilled = false; + + req.reply.setByteLimit(std::min(SERVER_KNOBS->MAXIMUM_PEEK_BYTES, req.limitBytes)); + loop { + state TLogPeekStreamReply reply; + try { + wait(req.reply.onReady() && + store(reply.rep, peekLogRouter(self, req.begin, req.tag, req.returnIfBlocked, onlySpilled))); + req.reply.send(reply); + begin = reply.rep.end; + onlySpilled = reply.rep.onlySpilled; + wait(delay(0, g_network->getCurrentTask())); } catch (Error& e) { - if (e.code() == error_code_timed_out || e.code() == error_code_operation_obsolete) { + self->activePeekStreams--; + TraceEvent(SevDebug, "TLogPeekStreamEnd", self->dbgid).error(e, true); + + if (e.code() == error_code_success) { + continue; + } else if (e.code() == error_code_end_of_stream) { req.reply.sendError(e); return Void(); + } else if (e.code() == error_code_operation_obsolete) { + // reply stream is cancelled on the client + return Void(); } else { throw; } } } +} - //TraceEvent("LogRouterPeek1", self->dbgid).detail("From", req.reply.getEndpoint().getPrimaryAddress()).detail("Ver", self->version.get()).detail("Begin", req.begin); - if (req.returnIfBlocked && self->version.get() < req.begin) { - //TraceEvent("LogRouterPeek2", self->dbgid); - req.reply.sendError(end_of_stream()); - if (req.sequence.present()) { - auto& trackerData = self->peekTracker[peekId]; - auto& sequenceData = trackerData.sequence_version[sequence + 1]; - if (!sequenceData.isSet()) { - sequenceData.send(std::make_pair(req.begin, req.onlySpilled)); - } - } - return Void(); - } - - if (self->version.get() < req.begin) { - wait(self->version.whenAtLeast(req.begin)); - wait(delay(SERVER_KNOBS->TLOG_PEEK_DELAY, g_network->getCurrentTask())); - } - - Version poppedVer = poppedVersion(self, req.tag); - - if (poppedVer > req.begin || req.begin < self->startVersion) { - // This should only happen if a packet is sent multiple times and the reply is not needed. - // Since we are using popped differently, do not send a reply. 
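[Editor's sketch] peekLogRouter above ports the peekTracker sequence window into the common helper: sequences that fall more than PARALLEL_GET_MORE_REQUESTS behind the newest are failed with operation_obsolete, and each reply arms the promise that sequence + 1 will consume. A std::map model of that admission logic (32 mirrors the knob's default earlier in this series; the Promise/Future plumbing is elided):

#include <map>

constexpr int kParallelGetMoreRequests = 32; // default of PARALLEL_GET_MORE_REQUESTS

struct SequenceWindow {
    std::map<int, long> sequenceVersion; // sequence -> begin version for that peek

    // Admission check: drop sequences that fell out of the window (the real code
    // fails their promises with operation_obsolete), then reject stale arrivals.
    bool admit(int seq) {
        auto it = sequenceVersion.begin();
        while (it != sequenceVersion.end() && it->first <= seq - kParallelGetMoreRequests)
            it = sequenceVersion.erase(it);
        return sequenceVersion.empty() || seq >= sequenceVersion.begin()->first;
    }

    // After replying to `seq`, arm the entry that sequence seq + 1 will consume.
    void complete(int seq, long endVersion) { sequenceVersion[seq + 1] = endVersion; }
};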
- TraceEvent(SevWarnAlways, "LogRouterPeekPopped", self->dbgid) - .detail("Begin", req.begin) - .detail("Popped", poppedVer) - .detail("Start", self->startVersion); - req.reply.send(Never()); - if (req.sequence.present()) { - auto& trackerData = self->peekTracker[peekId]; - auto& sequenceData = trackerData.sequence_version[sequence + 1]; - if (!sequenceData.isSet()) { - sequenceData.send(std::make_pair(req.begin, req.onlySpilled)); - } - } - return Void(); - } - - Version endVersion = self->version.get() + 1; - peekMessagesFromMemory(self, req, messages, endVersion); - - TLogPeekReply reply; - reply.maxKnownVersion = self->version.get(); - reply.minKnownCommittedVersion = self->poppedVersion; - reply.messages = messages.toValue(); - reply.popped = self->minPopped.get() >= self->startVersion ? self->minPopped.get() : 0; - reply.end = endVersion; - reply.onlySpilled = false; - - if (req.sequence.present()) { - auto& trackerData = self->peekTracker[peekId]; - trackerData.lastUpdate = now(); - auto& sequenceData = trackerData.sequence_version[sequence + 1]; - if (trackerData.sequence_version.size() && sequence + 1 < trackerData.sequence_version.begin()->first) { - req.reply.sendError(operation_obsolete()); - if (!sequenceData.isSet()) - sequenceData.sendError(operation_obsolete()); +ACTOR Future logRouterPeekMessages(LogRouterData* self, TLogPeekRequest req) { + try { + TLogPeekReply reply = + wait(peekLogRouter(self, req.begin, req.tag, req.returnIfBlocked, req.onlySpilled, req.sequence)); + req.reply.send(reply); + } catch (Error& e) { + if (e.code() == error_code_success) { + req.reply.send(Never()); + return Void(); + } else if (e.code() == error_code_timed_out || e.code() == error_code_operation_obsolete || + e.code() == error_code_end_of_stream) { + req.reply.sendError(e); return Void(); - } - if (sequenceData.isSet()) { - if (sequenceData.getFuture().get().first != reply.end) { - TEST(true); // tlog peek second attempt ended at a different version - req.reply.sendError(operation_obsolete()); - return Void(); - } } else { - sequenceData.send(std::make_pair(reply.end, reply.onlySpilled)); + throw; } - reply.begin = req.begin; } - req.reply.send(reply); + // state BinaryWriter messages(Unversioned()); + // state int sequence = -1; + // state UID peekId; + // + // if (req.sequence.present()) { + // try { + // peekId = req.sequence.get().first; + // sequence = req.sequence.get().second; + // if (sequence >= SERVER_KNOBS->PARALLEL_GET_MORE_REQUESTS && + // self->peekTracker.find(peekId) == self->peekTracker.end()) { + // throw operation_obsolete(); + // } + // auto& trackerData = self->peekTracker[peekId]; + // if (sequence == 0 && trackerData.sequence_version.find(0) == trackerData.sequence_version.end()) { + // trackerData.sequence_version[0].send(std::make_pair(req.begin, req.onlySpilled)); + // } + // auto seqBegin = trackerData.sequence_version.begin(); + // // The peek cursor and this comparison need to agree about the maximum number of in-flight requests. 
+ // while (trackerData.sequence_version.size() && + // seqBegin->first <= sequence - SERVER_KNOBS->PARALLEL_GET_MORE_REQUESTS) { + // if (seqBegin->second.canBeSet()) { + // seqBegin->second.sendError(operation_obsolete()); + // } + // trackerData.sequence_version.erase(seqBegin); + // seqBegin = trackerData.sequence_version.begin(); + // } + // + // if (trackerData.sequence_version.size() && sequence < seqBegin->first) { + // throw operation_obsolete(); + // } + // + // trackerData.lastUpdate = now(); + // std::pair prevPeekData = wait(trackerData.sequence_version[sequence].getFuture()); + // req.begin = prevPeekData.first; + // req.onlySpilled = prevPeekData.second; + // wait(yield()); + // } catch (Error& e) { + // if (e.code() == error_code_timed_out || e.code() == error_code_operation_obsolete) { + // req.reply.sendError(e); + // return Void(); + // } else { + // throw; + // } + // } + // } + // + // //TraceEvent("LogRouterPeek1", self->dbgid).detail("From", + // req.reply.getEndpoint().getPrimaryAddress()).detail("Ver", self->version.get()).detail("Begin", req.begin); if + //(req.returnIfBlocked && self->version.get() < req.begin) { + // //TraceEvent("LogRouterPeek2", self->dbgid); + // req.reply.sendError(end_of_stream()); + // if (req.sequence.present()) { + // auto& trackerData = self->peekTracker[peekId]; + // auto& sequenceData = trackerData.sequence_version[sequence + 1]; + // if (!sequenceData.isSet()) { + // sequenceData.send(std::make_pair(req.begin, req.onlySpilled)); + // } + // } + // return Void(); + // } + // + // if (self->version.get() < req.begin) { + // wait(self->version.whenAtLeast(req.begin)); + // wait(delay(SERVER_KNOBS->TLOG_PEEK_DELAY, g_network->getCurrentTask())); + // } + // + // Version poppedVer = poppedVersion(self, req.tag); + // + // if (poppedVer > req.begin || req.begin < self->startVersion) { + // // This should only happen if a packet is sent multiple times and the reply is not needed. + // // Since we are using popped differently, do not send a reply. + // TraceEvent(SevWarnAlways, "LogRouterPeekPopped", self->dbgid) + // .detail("Begin", req.begin) + // .detail("Popped", poppedVer) + // .detail("Start", self->startVersion); + // req.reply.send(Never()); + // if (req.sequence.present()) { + // auto& trackerData = self->peekTracker[peekId]; + // auto& sequenceData = trackerData.sequence_version[sequence + 1]; + // if (!sequenceData.isSet()) { + // sequenceData.send(std::make_pair(req.begin, req.onlySpilled)); + // } + // } + // return Void(); + // } + // + // Version endVersion = self->version.get() + 1; + // peekMessagesFromMemory(self, req.tag, req.begin, messages, endVersion); + // + // TLogPeekReply reply; + // reply.maxKnownVersion = self->version.get(); + // reply.minKnownCommittedVersion = self->poppedVersion; + // reply.messages = messages.toValue(); + // reply.popped = self->minPopped.get() >= self->startVersion ? 
self->minPopped.get() : 0; + // reply.end = endVersion; + // reply.onlySpilled = false; + // + // if (req.sequence.present()) { + // auto& trackerData = self->peekTracker[peekId]; + // trackerData.lastUpdate = now(); + // auto& sequenceData = trackerData.sequence_version[sequence + 1]; + // if (trackerData.sequence_version.size() && sequence + 1 < trackerData.sequence_version.begin()->first) { + // req.reply.sendError(operation_obsolete()); + // if (!sequenceData.isSet()) + // sequenceData.sendError(operation_obsolete()); + // return Void(); + // } + // if (sequenceData.isSet()) { + // if (sequenceData.getFuture().get().first != reply.end) { + // TEST(true); // tlog peek second attempt ended at a different version + // req.reply.sendError(operation_obsolete()); + // return Void(); + // } + // } else { + // sequenceData.send(std::make_pair(reply.end, reply.onlySpilled)); + // } + // reply.begin = req.begin; + // } + // + // req.reply.send(reply); //TraceEvent("LogRouterPeek4", self->dbgid); return Void(); } @@ -645,6 +814,9 @@ ACTOR Future logRouterCore(TLogInterface interf, when(TLogPeekRequest req = waitNext(interf.peekMessages.getFuture())) { addActor.send(logRouterPeekMessages(&logRouterData, req)); } + when(TLogPeekStreamRequest req = waitNext(interf.peekStreamMessages.getFuture())) { + addActor.send(logRouterPeekStream(&logRouterData, req)); + } when(TLogPopRequest req = waitNext(interf.popMessages.getFuture())) { // Request from remote tLog to pop data from LR addActor.send(logRouterPop(&logRouterData, req)); diff --git a/fdbserver/LogSystemPeekCursor.actor.cpp b/fdbserver/LogSystemPeekCursor.actor.cpp index 7473d7525a..b4e76d3eb8 100644 --- a/fdbserver/LogSystemPeekCursor.actor.cpp +++ b/fdbserver/LogSystemPeekCursor.actor.cpp @@ -354,6 +354,10 @@ ACTOR Future serverPeekStreamGetMore(ILogSystem::ServerPeekCursor* self, T .detail("Has", self->hasMessage()) .detail("End", res.end) .detail("Popped", res.popped.present() ? res.popped.get() : 0); + + // NOTE: delay is needed here since TLog need to be scheduled to response if there are TLog and SS + // on the same machine + wait(delay(0)); return Void(); } } @@ -412,16 +416,12 @@ Future ILogSystem::ServerPeekCursor::getMore(TaskPriority taskID) { if (hasMessage() && !parallelGetMore) return Void(); if (!more.isValid() || more.isReady()) { - more = serverPeekStreamGetMore(this, taskID); - // if (usePeekStream && taskID == TaskPriority::TLogPeekReply) { - // more = serverPeekStreamGetMore(this, taskID); - // } - // if (parallelGetMore || onlySpilled || futureResults.size()) { - // more = serverPeekParallelGetMore(this, taskID); - // } - // else { - // more = serverPeekGetMore(this, taskID); - // } + // more = serverPeekStreamGetMore(this, taskID); + if (parallelGetMore || onlySpilled || futureResults.size()) { + more = serverPeekParallelGetMore(this, taskID); + } else { + more = serverPeekGetMore(this, taskID); + } } return more; } diff --git a/fdbserver/QuietDatabase.actor.cpp b/fdbserver/QuietDatabase.actor.cpp index 47b9a9f2f3..0ce415f7f9 100644 --- a/fdbserver/QuietDatabase.actor.cpp +++ b/fdbserver/QuietDatabase.actor.cpp @@ -634,10 +634,11 @@ ACTOR Future waitForQuietDatabase(Database cx, // In a simulated environment, wait 5 seconds so that workers can move to their optimal locations if (g_network->isSimulated()) wait(delay(5.0)); - + printf("------- 1 -------\n"); // The quiet database check (which runs at the end of every test) will always time out due to active data movement. 
// To get around this, quiet Database will disable the perpetual wiggle in the setup phase. wait(setPerpetualStorageWiggle(cx, false, LockAware::TRUE)); + printf("------- 2 -------\n"); // Require 3 consecutive successful quiet database checks spaced 2 second apart state int numSuccesses = 0; diff --git a/fdbserver/TLogServer.actor.cpp b/fdbserver/TLogServer.actor.cpp index f437f24637..58f708160a 100644 --- a/fdbserver/TLogServer.actor.cpp +++ b/fdbserver/TLogServer.actor.cpp @@ -1920,7 +1920,8 @@ ACTOR Future peekTLog(TLogData* self, // This actor keep pushing TLogPeekStreamReply until it's removed from the cluster or should recover ACTOR Future tLogPeekStream(TLogData* self, TLogPeekStreamRequest req, Reference logData) { self->activePeekStreams++; - TraceEvent(SevDebug, "TLogPeekStream", logData->logId); + TraceEvent(SevDebug, "TLogPeekStream", logData->logId).detail("Token", req.reply.getEndpoint().token); + state Version begin = req.begin; state bool onlySpilled = false; if (req.tag.locality == tagLocalityTxs && req.tag.id >= logData->txsTags && logData->txsTags > 0) { @@ -1938,7 +1939,7 @@ ACTOR Future tLogPeekStream(TLogData* self, TLogPeekStreamRequest req, Ref wait(delay(0, g_network->getCurrentTask())); } catch (Error& e) { self->activePeekStreams--; - TraceEvent(SevDebug, "TLogPeekStreamEnd", logData->logId).detail("Error", e.what()); + TraceEvent(SevDebug, "TLogPeekStreamEnd", logData->logId).error(e, true); if (e.code() == error_code_end_of_stream) { req.reply.sendError(e); From 974bb4b3448ad7d31d69bcffb833fa6d2926135a Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Tue, 20 Jul 2021 17:01:37 -0700 Subject: [PATCH 12/29] add stream peek function to oldTLogServer_x_x.actor.cpp and LogRouter --- fdbserver/LogRouter.actor.cpp | 130 +------- fdbserver/OldTLogServer_4_6.actor.cpp | 216 +++++++++++++- fdbserver/OldTLogServer_6_0.actor.cpp | 319 +++++++++++++++++++- fdbserver/OldTLogServer_6_2.actor.cpp | 409 +++++++++++++++++++++++++- flow/error_definitions.h | 1 + 5 files changed, 918 insertions(+), 157 deletions(-) diff --git a/fdbserver/LogRouter.actor.cpp b/fdbserver/LogRouter.actor.cpp index 8e9ee308a6..eb3ea81326 100644 --- a/fdbserver/LogRouter.actor.cpp +++ b/fdbserver/LogRouter.actor.cpp @@ -518,7 +518,7 @@ ACTOR Future peekLogRouter(LogRouterData* self, sequenceData.send(std::make_pair(begin, reqOnlySpilled)); } } - throw success(); // we've already replied in the past + throw no_action_needed(); // we've already replied in the past } Version endVersion = self->version.get() + 1; @@ -578,8 +578,8 @@ ACTOR Future logRouterPeekStream(LogRouterData* self, TLogPeekStreamReques self->activePeekStreams--; TraceEvent(SevDebug, "TLogPeekStreamEnd", self->dbgid).error(e, true); - if (e.code() == error_code_success) { - continue; + if (e.code() == error_code_no_action_needed) { + return Void(); } else if (e.code() == error_code_end_of_stream) { req.reply.sendError(e); return Void(); @@ -599,7 +599,7 @@ ACTOR Future logRouterPeekMessages(LogRouterData* self, TLogPeekRequest re wait(peekLogRouter(self, req.begin, req.tag, req.returnIfBlocked, req.onlySpilled, req.sequence)); req.reply.send(reply); } catch (Error& e) { - if (e.code() == error_code_success) { + if (e.code() == error_code_no_action_needed) { req.reply.send(Never()); return Void(); } else if (e.code() == error_code_timed_out || e.code() == error_code_operation_obsolete || @@ -610,128 +610,6 @@ ACTOR Future logRouterPeekMessages(LogRouterData* self, TLogPeekRequest re throw; } } - - // state BinaryWriter 
messages(Unversioned()); - // state int sequence = -1; - // state UID peekId; - // - // if (req.sequence.present()) { - // try { - // peekId = req.sequence.get().first; - // sequence = req.sequence.get().second; - // if (sequence >= SERVER_KNOBS->PARALLEL_GET_MORE_REQUESTS && - // self->peekTracker.find(peekId) == self->peekTracker.end()) { - // throw operation_obsolete(); - // } - // auto& trackerData = self->peekTracker[peekId]; - // if (sequence == 0 && trackerData.sequence_version.find(0) == trackerData.sequence_version.end()) { - // trackerData.sequence_version[0].send(std::make_pair(req.begin, req.onlySpilled)); - // } - // auto seqBegin = trackerData.sequence_version.begin(); - // // The peek cursor and this comparison need to agree about the maximum number of in-flight requests. - // while (trackerData.sequence_version.size() && - // seqBegin->first <= sequence - SERVER_KNOBS->PARALLEL_GET_MORE_REQUESTS) { - // if (seqBegin->second.canBeSet()) { - // seqBegin->second.sendError(operation_obsolete()); - // } - // trackerData.sequence_version.erase(seqBegin); - // seqBegin = trackerData.sequence_version.begin(); - // } - // - // if (trackerData.sequence_version.size() && sequence < seqBegin->first) { - // throw operation_obsolete(); - // } - // - // trackerData.lastUpdate = now(); - // std::pair prevPeekData = wait(trackerData.sequence_version[sequence].getFuture()); - // req.begin = prevPeekData.first; - // req.onlySpilled = prevPeekData.second; - // wait(yield()); - // } catch (Error& e) { - // if (e.code() == error_code_timed_out || e.code() == error_code_operation_obsolete) { - // req.reply.sendError(e); - // return Void(); - // } else { - // throw; - // } - // } - // } - // - // //TraceEvent("LogRouterPeek1", self->dbgid).detail("From", - // req.reply.getEndpoint().getPrimaryAddress()).detail("Ver", self->version.get()).detail("Begin", req.begin); if - //(req.returnIfBlocked && self->version.get() < req.begin) { - // //TraceEvent("LogRouterPeek2", self->dbgid); - // req.reply.sendError(end_of_stream()); - // if (req.sequence.present()) { - // auto& trackerData = self->peekTracker[peekId]; - // auto& sequenceData = trackerData.sequence_version[sequence + 1]; - // if (!sequenceData.isSet()) { - // sequenceData.send(std::make_pair(req.begin, req.onlySpilled)); - // } - // } - // return Void(); - // } - // - // if (self->version.get() < req.begin) { - // wait(self->version.whenAtLeast(req.begin)); - // wait(delay(SERVER_KNOBS->TLOG_PEEK_DELAY, g_network->getCurrentTask())); - // } - // - // Version poppedVer = poppedVersion(self, req.tag); - // - // if (poppedVer > req.begin || req.begin < self->startVersion) { - // // This should only happen if a packet is sent multiple times and the reply is not needed. - // // Since we are using popped differently, do not send a reply. 
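[Editor's sketch] After this cleanup, the error-to-reply mapping is the whole contract between the shared peek helper and its callers, including the new no_action_needed code for "a reply was already sent in the past". A compilable summary of that mapping with stand-in enums; the real code switches on error_code_* values, and the one-shot path expresses SilentlyDrop as req.reply.send(Never()):

enum class PeekError { NoActionNeeded, EndOfStream, OperationObsolete, Other };

enum class ReplyAction {
    SilentlyDrop, // no_action_needed: a reply for this request was already sent
    ForwardError, // end_of_stream: tell the client the stream is finished
    JustReturn,   // operation_obsolete: the client cancelled the reply stream
    Rethrow       // anything else is a genuine failure
};

inline ReplyAction onPeekError(PeekError e) {
    switch (e) {
    case PeekError::NoActionNeeded:    return ReplyAction::SilentlyDrop;
    case PeekError::EndOfStream:       return ReplyAction::ForwardError;
    case PeekError::OperationObsolete: return ReplyAction::JustReturn;
    default:                           return ReplyAction::Rethrow;
    }
}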
- // TraceEvent(SevWarnAlways, "LogRouterPeekPopped", self->dbgid) - // .detail("Begin", req.begin) - // .detail("Popped", poppedVer) - // .detail("Start", self->startVersion); - // req.reply.send(Never()); - // if (req.sequence.present()) { - // auto& trackerData = self->peekTracker[peekId]; - // auto& sequenceData = trackerData.sequence_version[sequence + 1]; - // if (!sequenceData.isSet()) { - // sequenceData.send(std::make_pair(req.begin, req.onlySpilled)); - // } - // } - // return Void(); - // } - // - // Version endVersion = self->version.get() + 1; - // peekMessagesFromMemory(self, req.tag, req.begin, messages, endVersion); - // - // TLogPeekReply reply; - // reply.maxKnownVersion = self->version.get(); - // reply.minKnownCommittedVersion = self->poppedVersion; - // reply.messages = messages.toValue(); - // reply.popped = self->minPopped.get() >= self->startVersion ? self->minPopped.get() : 0; - // reply.end = endVersion; - // reply.onlySpilled = false; - // - // if (req.sequence.present()) { - // auto& trackerData = self->peekTracker[peekId]; - // trackerData.lastUpdate = now(); - // auto& sequenceData = trackerData.sequence_version[sequence + 1]; - // if (trackerData.sequence_version.size() && sequence + 1 < trackerData.sequence_version.begin()->first) { - // req.reply.sendError(operation_obsolete()); - // if (!sequenceData.isSet()) - // sequenceData.sendError(operation_obsolete()); - // return Void(); - // } - // if (sequenceData.isSet()) { - // if (sequenceData.getFuture().get().first != reply.end) { - // TEST(true); // tlog peek second attempt ended at a different version - // req.reply.sendError(operation_obsolete()); - // return Void(); - // } - // } else { - // sequenceData.send(std::make_pair(reply.end, reply.onlySpilled)); - // } - // reply.begin = req.begin; - // } - // - // req.reply.send(reply); - //TraceEvent("LogRouterPeek4", self->dbgid); return Void(); } diff --git a/fdbserver/OldTLogServer_4_6.actor.cpp b/fdbserver/OldTLogServer_4_6.actor.cpp index d8d6755910..13924707bd 100644 --- a/fdbserver/OldTLogServer_4_6.actor.cpp +++ b/fdbserver/OldTLogServer_4_6.actor.cpp @@ -300,6 +300,7 @@ struct TLogData : NonCopyable { int64_t instanceID; int64_t bytesInput; int64_t bytesDurable; + int activePeekStreams = 0; Version prevVersion; @@ -478,6 +479,7 @@ struct LogData : NonCopyable, public ReferenceCounted { }); specialCounter( cc, "QueueDiskBytesTotal", [tLogData]() { return tLogData->rawPersistentQueue->getStorageBytes().total; }); + specialCounter(cc, "ActivePeekStreams", [tLogData]() {return tLogData->activePeekStreams;}); } ~LogData() { @@ -932,14 +934,15 @@ ACTOR Future tLogPop(TLogData* self, TLogPopRequest req, Reference self, - TLogPeekRequest const& req, + Tag tag, + Version reqBegin, BinaryWriter& messages, Version& endVersion) { - OldTag oldTag = convertTag(req.tag); + OldTag oldTag = convertTag(tag); ASSERT(!messages.getLength()); auto& deque = get_version_messages(self, oldTag); - Version begin = std::max(req.begin, self->persistentDataDurableVersion + 1); + Version begin = std::max(reqBegin, self->persistentDataDurableVersion + 1); auto it = std::lower_bound(deque.begin(), deque.end(), std::make_pair(begin, LengthPrefixedStringRef()), @@ -964,14 +967,212 @@ void peekMessagesFromMemory(Reference self, uint32_t subVersion; rd >> messageLength >> subVersion; messageLength += sizeof(uint16_t) + sizeof(Tag); - messages << messageLength << subVersion << uint16_t(1) << req.tag; + messages << messageLength << subVersion << uint16_t(1) << tag; messageLength -= 
(sizeof(subVersion) + sizeof(uint16_t) + sizeof(Tag)); messages.serializeBytes(rd.readBytes(messageLength), messageLength); } } } +// Common logics to peek TLog and create TLogPeekReply that serves both streaming peek or normal peek request +ACTOR Future peekTLog(TLogData* self, + Reference logData, + Version begin, + Tag tag, + bool returnIfBlocked = false, + bool reqOnlySpilled = false, + Optional> sequence = Optional>()) { + state BinaryWriter messages(Unversioned()); + state BinaryWriter messages2(Unversioned()); + state int sequenceNum = -1; + state UID peekId; + state double queueStart = now(); + state OldTag oldTag = convertTag(tag); + + if (sequence.present()) { + peekId = sequence.get().first; + sequenceNum = sequence.get().second; + if (sequenceNum >= SERVER_KNOBS->PARALLEL_GET_MORE_REQUESTS && + self->peekTracker.find(peekId) == self->peekTracker.end()) { + throw operation_obsolete(); + } + if (sequenceNum > 0) { + auto& trackerData = self->peekTracker[peekId]; + trackerData.lastUpdate = now(); + Version ver = wait(trackerData.sequence_version[sequenceNum].getFuture()); + begin = std::max(ver, begin); + wait(yield()); + } + } + + if (returnIfBlocked && logData->version.get() < begin) { + throw end_of_stream(); + } + + //TraceEvent("TLogPeekMessages0", self->dbgid).detail("ReqBeginEpoch", req.begin.epoch).detail("ReqBeginSeq", req.begin.sequence).detail("Epoch", self->epoch()).detail("PersistentDataSeq", self->persistentDataSequence).detail("Tag1", req.tag1).detail("Tag2", req.tag2); + // Wait until we have something to return that the caller doesn't already have + if (logData->version.get() < begin) { + wait(logData->version.whenAtLeast(begin)); + wait(delay(SERVER_KNOBS->TLOG_PEEK_DELAY, g_network->getCurrentTask())); + } + + state Version endVersion = logData->version.get() + 1; + + Version poppedVer = poppedVersion(logData, oldTag); + if (poppedVer > begin) { + TLogPeekReply rep; + rep.maxKnownVersion = logData->version.get(); + rep.minKnownCommittedVersion = 0; + rep.popped = poppedVer; + rep.end = poppedVer; + rep.onlySpilled = false; + + if (sequence.present()) { + auto& trackerData = self->peekTracker[peekId]; + auto& sequenceData = trackerData.sequence_version[sequenceNum + 1]; + trackerData.lastUpdate = now(); + if (trackerData.sequence_version.size() && sequenceNum + 1 < trackerData.sequence_version.begin()->first) { + if (!sequenceData.isSet()) + sequenceData.sendError(operation_obsolete()); + throw operation_obsolete(); + } + if (sequenceData.isSet()) { + if (sequenceData.getFuture().get() != rep.end) { + TEST(true); // 0 tlog peek second attempt ended at a different version + throw operation_obsolete(); + } + } else { + sequenceData.send(rep.end); + } + rep.begin = begin; + } + + return rep; + } + + // grab messages from disk + //TraceEvent("TLogPeekMessages", self->dbgid).detail("ReqBeginEpoch", req.begin.epoch).detail("ReqBeginSeq", req.begin.sequence).detail("Epoch", self->epoch()).detail("PersistentDataSeq", self->persistentDataSequence).detail("Tag1", req.tag1).detail("Tag2", req.tag2); + if (begin <= logData->persistentDataDurableVersion) { + // Just in case the durable version changes while we are waiting for the read, we grab this data from memory. We + // may or may not actually send it depending on whether we get enough data from disk. 
SOMEDAY: Only do this if + // an initial attempt to read from disk results in insufficient data and the required data is no longer in + // memory SOMEDAY: Should we only send part of the messages we collected, to actually limit the size of the + // result? + + peekMessagesFromMemory(logData, tag, begin, messages2, endVersion); + + RangeResult kvs = wait(self->persistentData->readRange( + KeyRangeRef(persistTagMessagesKey(logData->logId, oldTag, begin), + persistTagMessagesKey(logData->logId, oldTag, logData->persistentDataDurableVersion + 1)), + SERVER_KNOBS->DESIRED_TOTAL_BYTES, + SERVER_KNOBS->DESIRED_TOTAL_BYTES)); + + //TraceEvent("TLogPeekResults", self->dbgid).detail("ForAddress", reply.getEndpoint().getPrimaryAddress()).detail("Tag1Results", s1).detail("Tag2Results", s2).detail("Tag1ResultsLim", kv1.size()).detail("Tag2ResultsLim", kv2.size()).detail("Tag1ResultsLast", kv1.size() ? kv1[0].key : "").detail("Tag2ResultsLast", kv2.size() ? kv2[0].key : "").detail("Limited", limited).detail("NextEpoch", next_pos.epoch).detail("NextSeq", next_pos.sequence).detail("NowEpoch", self->epoch()).detail("NowSeq", self->sequence.getNextSequence()); + + for (auto& kv : kvs) { + auto ver = decodeTagMessagesKey(kv.key); + messages << int32_t(-1) << ver; + + BinaryReader rd(kv.value, Unversioned()); + while (!rd.empty()) { + int32_t messageLength; + uint32_t subVersion; + rd >> messageLength >> subVersion; + messageLength += sizeof(uint16_t) + sizeof(Tag); + messages << messageLength << subVersion << uint16_t(1) << tag; + messageLength -= (sizeof(subVersion) + sizeof(uint16_t) + sizeof(Tag)); + messages.serializeBytes(rd.readBytes(messageLength), messageLength); + } + } + + if (kvs.expectedSize() >= SERVER_KNOBS->DESIRED_TOTAL_BYTES) + endVersion = decodeTagMessagesKey(kvs.end()[-1].key) + 1; + else + messages.serializeBytes(messages2.toValue()); + } else { + peekMessagesFromMemory(logData, tag, begin, messages, endVersion); + //TraceEvent("TLogPeekResults", self->dbgid).detail("ForAddress", reply.getEndpoint().getPrimaryAddress()).detail("MessageBytes", messages.getLength()).detail("NextEpoch", next_pos.epoch).detail("NextSeq", next_pos.sequence).detail("NowSeq", self->sequence.getNextSequence()); + } + + TLogPeekReply reply; + reply.maxKnownVersion = logData->version.get(); + reply.minKnownCommittedVersion = 0; + reply.onlySpilled = false; + reply.messages = StringRef(reply.arena, messages.toValue()); + reply.end = endVersion; + + //TraceEvent("TlogPeek", self->dbgid).detail("LogId", logData->logId).detail("EndVer", reply.end).detail("MsgBytes", reply.messages.expectedSize()).detail("ForAddress", req.reply.getEndpoint().getPrimaryAddress()); + + if (sequence.present()) { + auto& trackerData = self->peekTracker[peekId]; + trackerData.lastUpdate = now(); + auto& sequenceData = trackerData.sequence_version[sequenceNum + 1]; + if (sequenceData.isSet()) { + if (sequenceData.getFuture().get() != reply.end) { + TEST(true); // 0 tlog peek second attempt ended at a different version (2) + throw operation_obsolete(); + } + } else { + sequenceData.send(reply.end); + } + reply.begin = begin; + } + + return reply; +} + +// This actor keep pushing TLogPeekStreamReply until it's removed from the cluster or should recover +ACTOR Future tLogPeekStream(TLogData* self, TLogPeekStreamRequest req, Reference logData) { + self->activePeekStreams++; + TraceEvent(SevDebug, "TLogPeekStream", logData->logId).detail("Token", req.reply.getEndpoint().token); + + state Version begin = req.begin; + state bool onlySpilled = 
false; + + req.reply.setByteLimit(std::min(SERVER_KNOBS->MAXIMUM_PEEK_BYTES, req.limitBytes)); + loop { + state TLogPeekStreamReply reply; + try { + wait(req.reply.onReady() && + store(reply.rep, peekTLog(self, logData, begin, req.tag, req.returnIfBlocked, onlySpilled))); + req.reply.send(reply); + begin = reply.rep.end; + onlySpilled = reply.rep.onlySpilled; + wait(delay(0, g_network->getCurrentTask())); + } catch (Error& e) { + self->activePeekStreams--; + TraceEvent(SevDebug, "TLogPeekStreamEnd", logData->logId).error(e, true); + + if (e.code() == error_code_end_of_stream) { + req.reply.sendError(e); + return Void(); + } else if (e.code() == error_code_operation_obsolete) { + // reply stream is cancelled on the client + return Void(); + } else { + throw; + } + } + } +} + ACTOR Future tLogPeekMessages(TLogData* self, TLogPeekRequest req, Reference logData) { + /*try { + TLogPeekReply reply = + wait(peekTLog(self, logData, req.begin, req.tag, req.returnIfBlocked, req.onlySpilled, req.sequence)); + req.reply.send(reply); + } catch (Error& e) { + if (e.code() == error_code_timed_out || e.code() == error_code_operation_obsolete || + e.code() == error_code_end_of_stream) { + req.reply.sendError(e); + return Void(); + } else { + throw; + } + } + + return Void();*/ + state BinaryWriter messages(Unversioned()); state BinaryWriter messages2(Unversioned()); state int sequence = -1; @@ -1061,7 +1262,7 @@ ACTOR Future tLogPeekMessages(TLogData* self, TLogPeekRequest req, Referen // memory SOMEDAY: Should we only send part of the messages we collected, to actually limit the size of the // result? - peekMessagesFromMemory(logData, req, messages2, endVersion); + peekMessagesFromMemory(logData, req.tag, req.begin, messages2, endVersion); RangeResult kvs = wait(self->persistentData->readRange( KeyRangeRef(persistTagMessagesKey(logData->logId, oldTag, req.begin), @@ -1092,7 +1293,7 @@ ACTOR Future tLogPeekMessages(TLogData* self, TLogPeekRequest req, Referen else messages.serializeBytes(messages2.toValue()); } else { - peekMessagesFromMemory(logData, req, messages, endVersion); + peekMessagesFromMemory(logData, req.tag, req.begin, messages, endVersion); //TraceEvent("TLogPeekResults", self->dbgid).detail("ForAddress", req.reply.getEndpoint().getPrimaryAddress()).detail("MessageBytes", messages.getLength()).detail("NextEpoch", next_pos.epoch).detail("NextSeq", next_pos.sequence).detail("NowSeq", self->sequence.getNextSequence()); } @@ -1436,6 +1637,7 @@ ACTOR Future restorePersistentState(TLogData* self, LocalityData locality) recruited.initEndpoints(); DUMPTOKEN(recruited.peekMessages); + DUMPTOKEN(recruited.peekStreamMessages); DUMPTOKEN(recruited.popMessages); DUMPTOKEN(recruited.commit); DUMPTOKEN(recruited.lock); @@ -1575,7 +1777,7 @@ ACTOR Future tLog(IKeyValueStore* persistentData, state TLogData self(tlogId, workerID, persistentData, persistentQueue, db); state Future error = actorCollection(self.sharedActors.getFuture()); - TraceEvent("SharedTlog", tlogId); + TraceEvent("SharedTlog", tlogId).detail("Version", "4.6"); try { wait(restorePersistentState(&self, locality)); diff --git a/fdbserver/OldTLogServer_6_0.actor.cpp b/fdbserver/OldTLogServer_6_0.actor.cpp index 24c97f741c..417b3d3dfd 100644 --- a/fdbserver/OldTLogServer_6_0.actor.cpp +++ b/fdbserver/OldTLogServer_6_0.actor.cpp @@ -276,6 +276,7 @@ struct TLogData : NonCopyable { int64_t targetVolatileBytes; // The number of bytes of mutations this TLog should hold in memory before spilling. 
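The tLogPeekStream loop above only assembles the next TLogPeekStreamReply after req.reply.onReady() fires, so a slow or disconnected client throttles the TLog instead of letting replies pile up in memory. A minimal standalone model of that backpressure pattern, with a condition variable standing in for ReplyPromiseStream's acknowledgement window (BoundedStream and its byte limit are invented for this sketch, not FoundationDB types):

#include <condition_variable>
#include <cstdio>
#include <mutex>
#include <queue>
#include <string>
#include <thread>

// Stand-in for ReplyPromiseStream: the producer parks in onReady() until the
// consumer has acknowledged enough bytes to reopen the window.
class BoundedStream {
public:
    explicit BoundedStream(size_t byteLimit) : limit(byteLimit) {}

    void onReady() { // producer side: wait for the acknowledgement window
        std::unique_lock<std::mutex> lk(m);
        cv.wait(lk, [&] { return inFlight < limit; });
    }
    void send(std::string reply) {
        std::lock_guard<std::mutex> lk(m);
        inFlight += reply.size();
        q.push(std::move(reply));
        cv.notify_all();
    }
    std::string receive() { // consumer side: take one reply and acknowledge it
        std::unique_lock<std::mutex> lk(m);
        cv.wait(lk, [&] { return !q.empty(); });
        std::string r = std::move(q.front());
        q.pop();
        inFlight -= r.size(); // the ack that lets onReady() fire again
        cv.notify_all();
        return r;
    }

private:
    std::mutex m;
    std::condition_variable cv;
    std::queue<std::string> q;
    size_t inFlight = 0;
    size_t limit;
};

int main() {
    BoundedStream stream(64); // analogous to req.reply.setByteLimit(...)
    std::thread server([&] {
        for (int v = 0; v < 8; ++v) {
            stream.onReady(); // backpressure point, as in the peek stream loop
            stream.send("reply@" + std::to_string(v));
        }
    });
    for (int i = 0; i < 8; ++i)
        std::printf("client got %s\n", stream.receive().c_str());
    server.join();
}

The acknowledgement inside receive() plays the role of the client-side ack token: until it arrives, onReady() keeps the producer parked, which is why the server loop waits on it before computing the next peek.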
int64_t overheadBytesInput; int64_t overheadBytesDurable; + int activePeekStreams = 0; WorkerCache tlogCache; @@ -573,6 +574,7 @@ struct LogData : NonCopyable, public ReferenceCounted { }); specialCounter( cc, "QueueDiskBytesTotal", [tLogData]() { return tLogData->rawPersistentQueue->getStorageBytes().total; }); + specialCounter(cc, "ActivePeekStreams", [tLogData]() { return tLogData->activePeekStreams;}); } ~LogData() { @@ -1173,19 +1175,20 @@ std::deque>& getVersionMessages(Refe }; void peekMessagesFromMemory(Reference self, - TLogPeekRequest const& req, + Tag tag, + Version begin, BinaryWriter& messages, Version& endVersion) { - ASSERT(!messages.getLength()); + ASSERT(!messages.getLength()); - auto& deque = getVersionMessages(self, req.tag); - //TraceEvent("TLogPeekMem", self->dbgid).detail("Tag", req.tag1).detail("PDS", self->persistentDataSequence).detail("PDDS", self->persistentDataDurableSequence).detail("Oldest", map1.empty() ? 0 : map1.begin()->key ).detail("OldestMsgCount", map1.empty() ? 0 : map1.begin()->value.size()); + auto& deque = getVersionMessages(self, tag); + //TraceEvent("TLogPeekMem", self->dbgid).detail("Tag", req.tag1).detail("PDS", self->persistentDataSequence).detail("PDDS", self->persistentDataDurableSequence).detail("Oldest", map1.empty() ? 0 : map1.begin()->key ).detail("OldestMsgCount", map1.empty() ? 0 : map1.begin()->value.size()); - Version begin = std::max(req.begin, self->persistentDataDurableVersion + 1); - auto it = std::lower_bound(deque.begin(), - deque.end(), - std::make_pair(begin, LengthPrefixedStringRef()), - CompareFirst>()); + begin = std::max(begin, self->persistentDataDurableVersion + 1); + auto it = std::lower_bound(deque.begin(), + deque.end(), + std::make_pair(begin, LengthPrefixedStringRef()), + CompareFirst>()); Version currentVersion = -1; for (; it != deque.end(); ++it) { @@ -1204,7 +1207,294 @@ void peekMessagesFromMemory(Reference self, } } +// Common logics to peek TLog and create TLogPeekReply that serves both streaming peek or normal peek request +ACTOR Future peekTLog(TLogData* self, + Reference logData, + Version begin, + Tag tag, + bool returnIfBlocked = false, + bool reqOnlySpilled = false, + Optional> sequence = Optional>()) { + state BinaryWriter messages(Unversioned()); + state BinaryWriter messages2(Unversioned()); + state int sequenceNum = -1; + state UID peekId; + state double queueStart = now(); + + if (tag.locality == tagLocalityTxs && tag.id >= logData->txsTags && logData->txsTags > 0) { + tag.id = tag.id % logData->txsTags; + } + + // TODO: once the fake stream is replaced by ReplyPromiseStream, we can remove the code handling sequence requests + // STEP: a. mark obsolete sequence requests; b. wait previous sequence requests are handled in order + if (sequence.present()) { + peekId = sequence.get().first; + sequenceNum = sequence.get().second; + if (sequenceNum >= SERVER_KNOBS->PARALLEL_GET_MORE_REQUESTS && + logData->peekTracker.find(peekId) == logData->peekTracker.end()) { + throw operation_obsolete(); + } + auto& trackerData = logData->peekTracker[peekId]; + if (sequenceNum == 0 && trackerData.sequence_version.find(0) == trackerData.sequence_version.end()) { + trackerData.tag = tag; + trackerData.sequence_version[0].send(std::make_pair(begin, reqOnlySpilled)); + } + auto seqBegin = trackerData.sequence_version.begin(); + // The peek cursor and this comparison need to agree about the maximum number of in-flight requests. 
+ while (trackerData.sequence_version.size() && + seqBegin->first <= sequenceNum - SERVER_KNOBS->PARALLEL_GET_MORE_REQUESTS) { + if (seqBegin->second.canBeSet()) { + seqBegin->second.sendError(operation_obsolete()); + } + trackerData.sequence_version.erase(seqBegin); + seqBegin = trackerData.sequence_version.begin(); + } + + if (trackerData.sequence_version.size() && sequenceNum < seqBegin->first) { + throw operation_obsolete(); + } + + Future> fPrevPeekData = trackerData.sequence_version[sequenceNum].getFuture(); + if (fPrevPeekData.isReady()) { + trackerData.unblockedPeeks++; + double t = now() - trackerData.lastUpdate; + if (t > trackerData.idleMax) + trackerData.idleMax = t; + trackerData.idleTime += t; + } + trackerData.lastUpdate = now(); + std::pair prevPeekData = wait(fPrevPeekData); + begin = std::max(prevPeekData.first, begin); + reqOnlySpilled = prevPeekData.second; + wait(yield()); + } + + state double blockStart = now(); + + if (returnIfBlocked && logData->version.get() < begin) { + if (sequence.present()) { + auto& trackerData = logData->peekTracker[peekId]; + auto& sequenceData = trackerData.sequence_version[sequenceNum + 1]; + trackerData.lastUpdate = now(); + if (!sequenceData.isSet()) { + sequenceData.send(std::make_pair(begin, reqOnlySpilled)); + } + } + throw end_of_stream(); + } + + //TraceEvent("TLogPeekMessages0", self->dbgid).detail("ReqBeginEpoch", req.begin.epoch).detail("ReqBeginSeq", req.begin.sequence).detail("Epoch", self->epoch()).detail("PersistentDataSeq", self->persistentDataSequence).detail("Tag1", req.tag1).detail("Tag2", req.tag2); + // Wait until we have something to return that the caller doesn't already have + if (logData->version.get() < begin) { + wait(logData->version.whenAtLeast(begin)); + wait(delay(SERVER_KNOBS->TLOG_PEEK_DELAY, g_network->getCurrentTask())); + } + + if (logData->locality != tagLocalitySatellite && tag.locality == tagLocalityLogRouter) { + wait(self->concurrentLogRouterReads.take()); + state FlowLock::Releaser globalReleaser(self->concurrentLogRouterReads); + wait(delay(0.0, TaskPriority::Low)); + } + + if (begin <= logData->persistentDataDurableVersion && tag.locality != tagLocalityTxs && tag != txsTag) { + // Reading spilled data will almost always imply that the storage server is >5s behind the rest + // of the cluster. We shouldn't prioritize spending CPU on helping this server catch up + // slightly faster over keeping the rest of the cluster operating normally. + // txsTag is only ever peeked on recovery, and we would still wish to prioritize requests + // that impact recovery duration. + wait(delay(0, TaskPriority::TLogSpilledPeekReply)); + } + + state double workStart = now(); + + Version poppedVer = poppedVersion(logData, tag); + if (poppedVer > begin) { + // reply with an empty message and let the next reply start from poppedVer + TLogPeekReply rep; + rep.maxKnownVersion = logData->version.get(); + rep.minKnownCommittedVersion = logData->minKnownCommittedVersion; + rep.popped = poppedVer; + rep.end = poppedVer; + rep.onlySpilled = false; + + // TODO: once the fake stream is replaced by ReplyPromiseStream, we can remove the code handling sequence + // requests. 
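The sequence_version map in the tracker effectively chains peeks: the request with sequence number n blocks on a future that request n-1 fulfills with the (begin, onlySpilled) pair it should resume from, so out-of-order arrivals still produce in-order reads. A hedged standalone model of that handoff using std::promise/std::future; PeekHandoff and the version arithmetic below are invented for illustration:

#include <cstdio>
#include <future>
#include <thread>
#include <utility>
#include <vector>

// What one peek hands its successor: the version to resume from and whether
// the previous reply was served purely from spilled (on-disk) data.
using PeekHandoff = std::pair<long /*begin*/, bool /*onlySpilled*/>;

int main() {
    constexpr int kPeeks = 4;
    // chain[n] is fulfilled by peek n-1 and awaited by peek n, mirroring
    // trackerData.sequence_version[n] in the patch.
    std::vector<std::promise<PeekHandoff>> chain(kPeeks + 1);
    chain[0].set_value({100, false}); // sequence 0 may start immediately

    std::vector<std::thread> peeks;
    for (int n = 0; n < kPeeks; ++n) {
        peeks.emplace_back([&chain, n] {
            // Block until peek n-1 decides where peek n should begin.
            PeekHandoff prev = chain[n].get_future().get();
            long end = prev.first + 10; // pretend 10 versions were served
            std::printf("peek %d: begin=%ld end=%ld\n", n, prev.first, end);
            chain[n + 1].set_value({end, false}); // unblock peek n+1
        });
    }
    for (auto& t : peeks)
        t.join(); // output is in sequence order regardless of scheduling
}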
+ if (sequence.present()) { + auto& trackerData = logData->peekTracker[peekId]; + auto& sequenceData = trackerData.sequence_version[sequenceNum + 1]; + trackerData.lastUpdate = now(); + if (trackerData.sequence_version.size() && sequenceNum + 1 < trackerData.sequence_version.begin()->first) { + if (!sequenceData.isSet()) + sequenceData.sendError(operation_obsolete()); + throw operation_obsolete(); + } + if (sequenceData.isSet()) { + if (sequenceData.getFuture().get().first != rep.end) { + TEST(true); // 1 tlog peek second attempt ended at a different version + throw operation_obsolete(); + } + } else { + sequenceData.send(std::make_pair(rep.end, rep.onlySpilled)); + } + rep.begin = begin; + } + + return rep; + } + + state Version endVersion = logData->version.get() + 1; + state bool onlySpilled = false; + + // grab messages from disk + //TraceEvent("TLogPeekMessages", self->dbgid).detail("ReqBeginEpoch", req.begin.epoch).detail("ReqBeginSeq", req.begin.sequence).detail("Epoch", self->epoch()).detail("PersistentDataSeq", self->persistentDataSequence).detail("Tag1", req.tag1).detail("Tag2", req.tag2); + if (begin <= logData->persistentDataDurableVersion) { + // Just in case the durable version changes while we are waiting for the read, we grab this data from memory. We + // may or may not actually send it depending on whether we get enough data from disk. SOMEDAY: Only do this if + // an initial attempt to read from disk results in insufficient data and the required data is no longer in + // memory SOMEDAY: Should we only send part of the messages we collected, to actually limit the size of the + // result? + + if (reqOnlySpilled) { + endVersion = logData->persistentDataDurableVersion + 1; + } else { + peekMessagesFromMemory(logData, tag, begin, messages2, endVersion); + } + + RangeResult kvs = wait(self->persistentData->readRange( + KeyRangeRef(persistTagMessagesKey(logData->logId, tag, begin), + persistTagMessagesKey(logData->logId, tag, logData->persistentDataDurableVersion + 1)), + SERVER_KNOBS->DESIRED_TOTAL_BYTES, + SERVER_KNOBS->DESIRED_TOTAL_BYTES)); + + //TraceEvent("TLogPeekResults", self->dbgid).detail("ForAddress", reply.getEndpoint().address).detail("Tag1Results", s1).detail("Tag2Results", s2).detail("Tag1ResultsLim", kv1.size()).detail("Tag2ResultsLim", kv2.size()).detail("Tag1ResultsLast", kv1.size() ? kv1[0].key : "").detail("Tag2ResultsLast", kv2.size() ? 
kv2[0].key : "").detail("Limited", limited).detail("NextEpoch", next_pos.epoch).detail("NextSeq", next_pos.sequence).detail("NowEpoch", self->epoch()).detail("NowSeq", self->sequence.getNextSequence()); + + for (auto& kv : kvs) { + auto ver = decodeTagMessagesKey(kv.key); + messages << VERSION_HEADER << ver; + messages.serializeBytes(kv.value); + } + + if (kvs.expectedSize() >= SERVER_KNOBS->DESIRED_TOTAL_BYTES) { + endVersion = decodeTagMessagesKey(kvs.end()[-1].key) + 1; + onlySpilled = true; + } else { + messages.serializeBytes(messages2.toValue()); + } + } else { + peekMessagesFromMemory(logData, tag, begin, messages, endVersion); + //TraceEvent("TLogPeekResults", self->dbgid).detail("ForAddress", reply.getEndpoint().address).detail("MessageBytes", messages.getLength()).detail("NextEpoch", next_pos.epoch).detail("NextSeq", next_pos.sequence).detail("NowSeq", self->sequence.getNextSequence()); + } + + TLogPeekReply reply; + reply.maxKnownVersion = logData->version.get(); + reply.minKnownCommittedVersion = logData->minKnownCommittedVersion; + reply.messages = StringRef(reply.arena, messages.toValue()); + reply.end = endVersion; + reply.onlySpilled = onlySpilled; + + //TraceEvent("TlogPeek", self->dbgid).detail("LogId", logData->logId).detail("EndVer", reply.end).detail("MsgBytes", reply.messages.expectedSize()).detail("ForAddress", reply.getEndpoint().address); + + if (sequence.present()) { + auto& trackerData = logData->peekTracker[peekId]; + trackerData.lastUpdate = now(); + + double queueT = blockStart - queueStart; + double blockT = workStart - blockStart; + double workT = now() - workStart; + + trackerData.totalPeeks++; + trackerData.replyBytes += reply.messages.size(); + + if (queueT > trackerData.queueMax) + trackerData.queueMax = queueT; + if (blockT > trackerData.blockMax) + trackerData.blockMax = blockT; + if (workT > trackerData.workMax) + trackerData.workMax = workT; + + trackerData.queueTime += queueT; + trackerData.blockTime += blockT; + trackerData.workTime += workT; + + auto& sequenceData = trackerData.sequence_version[sequenceNum + 1]; + if (trackerData.sequence_version.size() && sequenceNum + 1 < trackerData.sequence_version.begin()->first) { + if (!sequenceData.isSet()) { + // It would technically be more correct to .send({req.begin, req.onlySpilled}), as the next + // request might still be in the window of active requests, but LogSystemPeekCursor will + // throw away all future responses upon getting an operation_obsolete(), so computing a + // response will probably be a waste of CPU. 
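The tracker statistics updated above split each peek's latency into three phases: queue (waiting behind the previous sequence number), block (waiting for the requested version to exist), and work (assembling the reply), keeping both running totals and per-phase maxima. A compact sketch of that bookkeeping under invented names (PeekPhaseStats is not an FDB type):

#include <algorithm>
#include <chrono>
#include <cstdio>

// Illustrative version of the tracker's three-phase accounting.
struct PeekPhaseStats {
    double queueTime = 0, blockTime = 0, workTime = 0;
    double queueMax = 0, blockMax = 0, workMax = 0;

    void record(double queueStart, double blockStart, double workStart, double end) {
        double queueT = blockStart - queueStart; // behind the prior sequence
        double blockT = workStart - blockStart;  // waiting for new versions
        double workT = end - workStart;          // assembling the reply
        queueTime += queueT; blockTime += blockT; workTime += workT;
        queueMax = std::max(queueMax, queueT);
        blockMax = std::max(blockMax, blockT);
        workMax = std::max(workMax, workT);
    }
};

static double now() {
    using namespace std::chrono;
    return duration<double>(steady_clock::now().time_since_epoch()).count();
}

int main() {
    PeekPhaseStats stats;
    double queueStart = now();
    double blockStart = queueStart + 0.002; // timestamps fabricated for the demo;
    double workStart = blockStart + 0.010;  // the TLog captures them via now()
    stats.record(queueStart, blockStart, workStart, workStart + 0.001);
    std::printf("queue=%.3fs block=%.3fs work=%.3fs\n",
                stats.queueTime, stats.blockTime, stats.workTime);
}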
+ sequenceData.sendError(operation_obsolete()); + } + throw operation_obsolete(); + } + if (sequenceData.isSet()) { + trackerData.duplicatePeeks++; + if (sequenceData.getFuture().get().first != reply.end) { + TEST(true); // 1 tlog peek second attempt ended at a different version (2) + throw operation_obsolete(); + } + } else { + sequenceData.send(std::make_pair(reply.end, reply.onlySpilled)); + } + reply.begin = begin; + } + + return reply; +} + +// This actor keep pushing TLogPeekStreamReply until it's removed from the cluster or should recover +ACTOR Future tLogPeekStream(TLogData* self, TLogPeekStreamRequest req, Reference logData) { + self->activePeekStreams++; + TraceEvent(SevDebug, "TLogPeekStream", logData->logId).detail("Token", req.reply.getEndpoint().token); + + state Version begin = req.begin; + state bool onlySpilled = false; + if (req.tag.locality == tagLocalityTxs && req.tag.id >= logData->txsTags && logData->txsTags > 0) { + req.tag.id = req.tag.id % logData->txsTags; + } + req.reply.setByteLimit(std::min(SERVER_KNOBS->MAXIMUM_PEEK_BYTES, req.limitBytes)); + loop { + state TLogPeekStreamReply reply; + try { + wait(req.reply.onReady() && + store(reply.rep, peekTLog(self, logData, begin, req.tag, req.returnIfBlocked, onlySpilled))); + req.reply.send(reply); + begin = reply.rep.end; + onlySpilled = reply.rep.onlySpilled; + wait(delay(0, g_network->getCurrentTask())); + } catch (Error& e) { + self->activePeekStreams--; + TraceEvent(SevDebug, "TLogPeekStreamEnd", logData->logId).error(e, true); + + if (e.code() == error_code_end_of_stream) { + req.reply.sendError(e); + return Void(); + } else if (e.code() == error_code_operation_obsolete) { + // reply stream is cancelled on the client + return Void(); + } else { + throw; + } + } + } +} + ACTOR Future tLogPeekMessages(TLogData* self, TLogPeekRequest req, Reference logData) { + /*try { + TLogPeekReply reply = + wait(peekTLog(self, logData, req.begin, req.tag, req.returnIfBlocked, req.onlySpilled, req.sequence)); + req.reply.send(reply); + } catch (Error& e) { + if (e.code() == error_code_timed_out || e.code() == error_code_operation_obsolete || + e.code() == error_code_end_of_stream) { + req.reply.sendError(e); + return Void(); + } else { + throw; + } + } + + return Void();*/ state BinaryWriter messages(Unversioned()); state BinaryWriter messages2(Unversioned()); state int sequence = -1; @@ -1354,7 +1644,7 @@ ACTOR Future tLogPeekMessages(TLogData* self, TLogPeekRequest req, Referen if (req.onlySpilled) { endVersion = logData->persistentDataDurableVersion + 1; } else { - peekMessagesFromMemory(logData, req, messages2, endVersion); + peekMessagesFromMemory(logData, req.tag, req.begin, messages2, endVersion); } RangeResult kvs = wait(self->persistentData->readRange( @@ -1378,7 +1668,7 @@ ACTOR Future tLogPeekMessages(TLogData* self, TLogPeekRequest req, Referen messages.serializeBytes(messages2.toValue()); } } else { - peekMessagesFromMemory(logData, req, messages, endVersion); + peekMessagesFromMemory(logData, req.tag, req.begin, messages, endVersion); //TraceEvent("TLogPeekResults", self->dbgid).detail("ForAddress", req.reply.getEndpoint().address).detail("MessageBytes", messages.getLength()).detail("NextEpoch", next_pos.epoch).detail("NextSeq", next_pos.sequence).detail("NowSeq", self->sequence.getNextSequence()); } @@ -1933,6 +2223,9 @@ ACTOR Future serveTLogInterface(TLogData* self, when(TLogPeekRequest req = waitNext(tli.peekMessages.getFuture())) { logData->addActor.send(tLogPeekMessages(self, req, logData)); } + 
when(TLogPeekStreamRequest req = waitNext(tli.peekStreamMessages.getFuture())) { + logData->addActor.send(tLogPeekStream(self, req, logData)); + } when(TLogPopRequest req = waitNext(tli.popMessages.getFuture())) { logData->addActor.send(tLogPop(self, req, logData)); } @@ -2328,6 +2621,7 @@ ACTOR Future restorePersistentState(TLogData* self, recruited.initEndpoints(); DUMPTOKEN(recruited.peekMessages); + DUMPTOKEN(recruited.peekStreamMessages); DUMPTOKEN(recruited.popMessages); DUMPTOKEN(recruited.commit); DUMPTOKEN(recruited.lock); @@ -2538,6 +2832,7 @@ ACTOR Future tLogStart(TLogData* self, InitializeTLogRequest req, Locality recruited.initEndpoints(); DUMPTOKEN(recruited.peekMessages); + DUMPTOKEN(recruited.peekStreamMessages); DUMPTOKEN(recruited.popMessages); DUMPTOKEN(recruited.commit); DUMPTOKEN(recruited.lock); @@ -2730,7 +3025,7 @@ ACTOR Future tLog(IKeyValueStore* persistentData, state TLogData self(tlogId, workerID, persistentData, persistentQueue, db, degraded, folder); state Future error = actorCollection(self.sharedActors.getFuture()); - TraceEvent("SharedTlog", tlogId); + TraceEvent("SharedTlog", tlogId).detail("Version", "6.0"); try { if (restoreFromDisk) { wait(restorePersistentState(&self, locality, oldLog, recovered, tlogRequests)); diff --git a/fdbserver/OldTLogServer_6_2.actor.cpp b/fdbserver/OldTLogServer_6_2.actor.cpp index 68c125858f..94c35775a6 100644 --- a/fdbserver/OldTLogServer_6_2.actor.cpp +++ b/fdbserver/OldTLogServer_6_2.actor.cpp @@ -339,6 +339,7 @@ struct TLogData : NonCopyable { int64_t targetVolatileBytes; // The number of bytes of mutations this TLog should hold in memory before spilling. int64_t overheadBytesInput; int64_t overheadBytesDurable; + int activePeekStreams; WorkerCache tlogCache; FlowLock peekMemoryLimiter; @@ -662,6 +663,7 @@ struct LogData : NonCopyable, public ReferenceCounted { cc, "QueueDiskBytesTotal", [tLogData]() { return tLogData->rawPersistentQueue->getStorageBytes().total; }); specialCounter(cc, "PeekMemoryReserved", [tLogData]() { return tLogData->peekMemoryLimiter.activePermits(); }); specialCounter(cc, "PeekMemoryRequestsStalled", [tLogData]() { return tLogData->peekMemoryLimiter.waiters(); }); + specialCounter(cc, "ActivePeekStreams", [tLogData]() {return tLogData->activePeekStreams;}); } ~LogData() { @@ -1488,19 +1490,20 @@ ACTOR Future tLogPop(TLogData* self, TLogPopRequest req, Reference self, - TLogPeekRequest const& req, + Tag tag, + Version begin, BinaryWriter& messages, Version& endVersion) { - ASSERT(!messages.getLength()); + ASSERT(!messages.getLength()); - auto& deque = getVersionMessages(self, req.tag); - //TraceEvent("TLogPeekMem", self->dbgid).detail("Tag", req.tag1).detail("PDS", self->persistentDataSequence).detail("PDDS", self->persistentDataDurableSequence).detail("Oldest", map1.empty() ? 0 : map1.begin()->key ).detail("OldestMsgCount", map1.empty() ? 0 : map1.begin()->value.size()); + auto& deque = getVersionMessages(self, tag); + //TraceEvent("TLogPeekMem", self->dbgid).detail("Tag", req.tag1).detail("PDS", self->persistentDataSequence).detail("PDDS", self->persistentDataDurableSequence).detail("Oldest", map1.empty() ? 0 : map1.begin()->key ).detail("OldestMsgCount", map1.empty() ? 
0 : map1.begin()->value.size()); - Version begin = std::max(req.begin, self->persistentDataDurableVersion + 1); - auto it = std::lower_bound(deque.begin(), - deque.end(), - std::make_pair(begin, LengthPrefixedStringRef()), - CompareFirst>()); + begin = std::max(begin, self->persistentDataDurableVersion + 1); + auto it = std::lower_bound(deque.begin(), + deque.end(), + std::make_pair(begin, LengthPrefixedStringRef()), + CompareFirst>()); Version currentVersion = -1; for (; it != deque.end(); ++it) { @@ -1541,7 +1544,387 @@ ACTOR Future> parseMessagesForTag(StringRef commitBlob, T return relevantMessages; } +// Common logics to peek TLog and create TLogPeekReply that serves both streaming peek or normal peek request +ACTOR Future peekTLog(TLogData* self, + Reference logData, + Version begin, + Tag tag, + bool returnIfBlocked = false, + bool reqOnlySpilled = false, + Optional> sequence = Optional>()) { + state BinaryWriter messages(Unversioned()); + state BinaryWriter messages2(Unversioned()); + state int sequenceNum = -1; + state UID peekId; + state double queueStart = now(); + + if (tag.locality == tagLocalityTxs && tag.id >= logData->txsTags && logData->txsTags > 0) { + tag.id = tag.id % logData->txsTags; + } + + // TODO: once the fake stream is replaced by ReplyPromiseStream, we can remove the code handling sequence requests + // STEP: a. mark obsolete sequence requests; b. wait previous sequence requests are handled in order + if (sequence.present()) { + peekId = sequence.get().first; + sequenceNum = sequence.get().second; + if (sequenceNum >= SERVER_KNOBS->PARALLEL_GET_MORE_REQUESTS && + logData->peekTracker.find(peekId) == logData->peekTracker.end()) { + throw operation_obsolete(); + } + auto& trackerData = logData->peekTracker[peekId]; + if (sequenceNum == 0 && trackerData.sequence_version.find(0) == trackerData.sequence_version.end()) { + trackerData.tag = tag; + trackerData.sequence_version[0].send(std::make_pair(begin, reqOnlySpilled)); + } + auto seqBegin = trackerData.sequence_version.begin(); + // The peek cursor and this comparison need to agree about the maximum number of in-flight requests. 
+ while (trackerData.sequence_version.size() && + seqBegin->first <= sequenceNum - SERVER_KNOBS->PARALLEL_GET_MORE_REQUESTS) { + if (seqBegin->second.canBeSet()) { + seqBegin->second.sendError(operation_obsolete()); + } + trackerData.sequence_version.erase(seqBegin); + seqBegin = trackerData.sequence_version.begin(); + } + + if (trackerData.sequence_version.size() && sequenceNum < seqBegin->first) { + throw operation_obsolete(); + } + + Future> fPrevPeekData = trackerData.sequence_version[sequenceNum].getFuture(); + if (fPrevPeekData.isReady()) { + trackerData.unblockedPeeks++; + double t = now() - trackerData.lastUpdate; + if (t > trackerData.idleMax) + trackerData.idleMax = t; + trackerData.idleTime += t; + } + trackerData.lastUpdate = now(); + std::pair prevPeekData = wait(fPrevPeekData); + begin = std::max(prevPeekData.first, begin); + reqOnlySpilled = prevPeekData.second; + wait(yield()); + } + + state double blockStart = now(); + + if (returnIfBlocked && logData->version.get() < begin) { + if (sequence.present()) { + auto& trackerData = logData->peekTracker[peekId]; + auto& sequenceData = trackerData.sequence_version[sequenceNum + 1]; + trackerData.lastUpdate = now(); + if (!sequenceData.isSet()) { + sequenceData.send(std::make_pair(begin, reqOnlySpilled)); + } + } + throw end_of_stream(); + } + + //TraceEvent("TLogPeekMessages0", self->dbgid).detail("ReqBeginEpoch", req.begin.epoch).detail("ReqBeginSeq", req.begin.sequence).detail("Epoch", self->epoch()).detail("PersistentDataSeq", self->persistentDataSequence).detail("Tag1", req.tag1).detail("Tag2", req.tag2); + // Wait until we have something to return that the caller doesn't already have + if (logData->version.get() < begin) { + wait(logData->version.whenAtLeast(begin)); + wait(delay(SERVER_KNOBS->TLOG_PEEK_DELAY, g_network->getCurrentTask())); + } + + if (logData->locality != tagLocalitySatellite && tag.locality == tagLocalityLogRouter) { + wait(self->concurrentLogRouterReads.take()); + state FlowLock::Releaser globalReleaser(self->concurrentLogRouterReads); + wait(delay(0.0, TaskPriority::Low)); + } + + if (begin <= logData->persistentDataDurableVersion && tag.locality != tagLocalityTxs && tag != txsTag) { + // Reading spilled data will almost always imply that the storage server is >5s behind the rest + // of the cluster. We shouldn't prioritize spending CPU on helping this server catch up + // slightly faster over keeping the rest of the cluster operating normally. + // txsTag is only ever peeked on recovery, and we would still wish to prioritize requests + // that impact recovery duration. + wait(delay(0, TaskPriority::TLogSpilledPeekReply)); + } + + state double workStart = now(); + + Version poppedVer = poppedVersion(logData, tag); + if (poppedVer > begin) { + // reply with an empty message and let the next reply start from poppedVer + TLogPeekReply rep; + rep.maxKnownVersion = logData->version.get(); + rep.minKnownCommittedVersion = logData->minKnownCommittedVersion; + rep.popped = poppedVer; + rep.end = poppedVer; + rep.onlySpilled = false; + + // TODO: once the fake stream is replaced by ReplyPromiseStream, we can remove the code handling sequence + // requests. 
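The while loop at the start of this hunk prunes tracker entries that have fallen out of the PARALLEL_GET_MORE_REQUESTS window, failing each one with operation_obsolete before erasing it so a stalled client cannot pin server state forever. A self-contained sketch of just that pruning arithmetic (PendingPeek and the window constant are stand-ins):

#include <cstdio>
#include <map>

// Illustrative model of trackerData.sequence_version: each pending peek,
// keyed by sequence number, can be fulfilled or failed as obsolete.
struct PendingPeek {
    bool failed = false;
};

constexpr int kParallelGetMoreRequests = 4; // stand-in for the server knob

// Mirrors the pruning loop above: anything at or below
// sequenceNum - window can no longer be served in order.
void pruneWindow(std::map<int, PendingPeek>& sequenceVersion, int sequenceNum) {
    auto it = sequenceVersion.begin();
    while (it != sequenceVersion.end() &&
           it->first <= sequenceNum - kParallelGetMoreRequests) {
        it->second.failed = true; // analogous to sendError(operation_obsolete())
        std::printf("sequence %d -> operation_obsolete\n", it->first);
        it = sequenceVersion.erase(it);
    }
}

int main() {
    std::map<int, PendingPeek> tracker;
    for (int s = 0; s < 8; ++s)
        tracker[s] = PendingPeek{};
    pruneWindow(tracker, /*sequenceNum=*/7); // drops sequences 0..3
    std::printf("%zu sequences still inside the window\n", tracker.size());
}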
+ if (sequence.present()) { + auto& trackerData = logData->peekTracker[peekId]; + auto& sequenceData = trackerData.sequence_version[sequenceNum + 1]; + trackerData.lastUpdate = now(); + if (trackerData.sequence_version.size() && sequenceNum + 1 < trackerData.sequence_version.begin()->first) { + if (!sequenceData.isSet()) + sequenceData.sendError(operation_obsolete()); + throw operation_obsolete(); + } + if (sequenceData.isSet()) { + if (sequenceData.getFuture().get().first != rep.end) { + TEST(true); // 1 tlog peek second attempt ended at a different version + throw operation_obsolete(); + } + } else { + sequenceData.send(std::make_pair(rep.end, rep.onlySpilled)); + } + rep.begin = begin; + } + + return rep; + } + + state Version endVersion = logData->version.get() + 1; + state bool onlySpilled = false; + + // grab messages from disk + //TraceEvent("TLogPeekMessages", self->dbgid).detail("ReqBeginEpoch", req.begin.epoch).detail("ReqBeginSeq", req.begin.sequence).detail("Epoch", self->epoch()).detail("PersistentDataSeq", self->persistentDataSequence).detail("Tag1", req.tag1).detail("Tag2", req.tag2); + if (begin <= logData->persistentDataDurableVersion) { + // Just in case the durable version changes while we are waiting for the read, we grab this data from memory. We + // may or may not actually send it depending on whether we get enough data from disk. SOMEDAY: Only do this if + // an initial attempt to read from disk results in insufficient data and the required data is no longer in + // memory SOMEDAY: Should we only send part of the messages we collected, to actually limit the size of the + // result? + + if (reqOnlySpilled) { + endVersion = logData->persistentDataDurableVersion + 1; + } else { + peekMessagesFromMemory(logData, tag, begin, messages2, endVersion); + } + + if (tag.locality == tagLocalityTxs || tag == txsTag) { + RangeResult kvs = wait(self->persistentData->readRange( + KeyRangeRef(persistTagMessagesKey(logData->logId, tag, begin), + persistTagMessagesKey(logData->logId, tag, logData->persistentDataDurableVersion + 1)), + SERVER_KNOBS->DESIRED_TOTAL_BYTES, + SERVER_KNOBS->DESIRED_TOTAL_BYTES)); + + for (auto& kv : kvs) { + auto ver = decodeTagMessagesKey(kv.key); + messages << VERSION_HEADER << ver; + messages.serializeBytes(kv.value); + } + + if (kvs.expectedSize() >= SERVER_KNOBS->DESIRED_TOTAL_BYTES) { + endVersion = decodeTagMessagesKey(kvs.end()[-1].key) + 1; + onlySpilled = true; + } else { + messages.serializeBytes(messages2.toValue()); + } + } else { + // FIXME: Limit to approximately DESIRED_TOTATL_BYTES somehow. + RangeResult kvrefs = wait(self->persistentData->readRange( + KeyRangeRef( + persistTagMessageRefsKey(logData->logId, tag, begin), + persistTagMessageRefsKey(logData->logId, tag, logData->persistentDataDurableVersion + 1)), + SERVER_KNOBS->TLOG_SPILL_REFERENCE_MAX_BATCHES_PER_PEEK + 1)); + + //TraceEvent("TLogPeekResults", self->dbgid).detail("ForAddress", reply.getEndpoint().getPrimaryAddress()).detail("Tag1Results", s1).detail("Tag2Results", s2).detail("Tag1ResultsLim", kv1.size()).detail("Tag2ResultsLim", kv2.size()).detail("Tag1ResultsLast", kv1.size() ? kv1[0].key : "").detail("Tag2ResultsLast", kv2.size() ? 
kv2[0].key : "").detail("Limited", limited).detail("NextEpoch", next_pos.epoch).detail("NextSeq", next_pos.sequence).detail("NowEpoch", self->epoch()).detail("NowSeq", self->sequence.getNextSequence()); + + state std::vector> commitLocations; + state bool earlyEnd = false; + uint32_t mutationBytes = 0; + state uint64_t commitBytes = 0; + state Version firstVersion = std::numeric_limits::max(); + for (int i = 0; i < kvrefs.size() && i < SERVER_KNOBS->TLOG_SPILL_REFERENCE_MAX_BATCHES_PER_PEEK; i++) { + auto& kv = kvrefs[i]; + VectorRef spilledData; + BinaryReader r(kv.value, AssumeVersion(logData->protocolVersion)); + r >> spilledData; + for (const SpilledData& sd : spilledData) { + if (mutationBytes >= SERVER_KNOBS->DESIRED_TOTAL_BYTES) { + earlyEnd = true; + break; + } + if (sd.version >= begin) { + firstVersion = std::min(firstVersion, sd.version); + const IDiskQueue::location end = sd.start.lo + sd.length; + commitLocations.emplace_back(sd.start, end); + // This isn't perfect, because we aren't accounting for page boundaries, but should be + // close enough. + commitBytes += sd.length; + mutationBytes += sd.mutationBytes; + } + } + if (earlyEnd) + break; + } + earlyEnd = earlyEnd || (kvrefs.size() >= SERVER_KNOBS->TLOG_SPILL_REFERENCE_MAX_BATCHES_PER_PEEK + 1); + wait(self->peekMemoryLimiter.take(TaskPriority::TLogSpilledPeekReply, commitBytes)); + state FlowLock::Releaser memoryReservation(self->peekMemoryLimiter, commitBytes); + state std::vector>> messageReads; + messageReads.reserve(commitLocations.size()); + for (const auto& pair : commitLocations) { + messageReads.push_back(self->rawPersistentQueue->read(pair.first, pair.second, CheckHashes::TRUE)); + } + commitLocations.clear(); + wait(waitForAll(messageReads)); + + state Version lastRefMessageVersion = 0; + state int index = 0; + loop { + if (index >= messageReads.size()) + break; + Standalone queueEntryData = messageReads[index].get(); + uint8_t valid; + const uint32_t length = *(uint32_t*)queueEntryData.begin(); + queueEntryData = queueEntryData.substr(4, queueEntryData.size() - 4); + BinaryReader rd(queueEntryData, IncludeVersion()); + state TLogQueueEntry entry; + rd >> entry >> valid; + ASSERT(valid == 0x01); + ASSERT(length + sizeof(valid) == queueEntryData.size()); + + messages << VERSION_HEADER << entry.version; + + std::vector rawMessages = + wait(parseMessagesForTag(entry.messages, tag, logData->logRouterTags)); + for (const StringRef& msg : rawMessages) { + messages.serializeBytes(msg); + } + + lastRefMessageVersion = entry.version; + index++; + } + + messageReads.clear(); + memoryReservation.release(); + + if (earlyEnd) { + endVersion = lastRefMessageVersion + 1; + onlySpilled = true; + } else { + messages.serializeBytes(messages2.toValue()); + } + } + } else { + if (reqOnlySpilled) { + endVersion = logData->persistentDataDurableVersion + 1; + } else { + peekMessagesFromMemory(logData, tag, begin, messages, endVersion); + } + + //TraceEvent("TLogPeekResults", self->dbgid).detail("ForAddress", req.reply.getEndpoint().getPrimaryAddress()).detail("MessageBytes", messages.getLength()).detail("NextEpoch", next_pos.epoch).detail("NextSeq", next_pos.sequence).detail("NowSeq", self->sequence.getNextSequence()); + } + + TLogPeekReply reply; + reply.maxKnownVersion = logData->version.get(); + reply.minKnownCommittedVersion = logData->minKnownCommittedVersion; + reply.messages = StringRef(reply.arena, messages.toValue()); + reply.end = endVersion; + reply.onlySpilled = onlySpilled; + + //TraceEvent("TlogPeek", 
self->dbgid).detail("LogId", logData->logId).detail("Tag", req.tag.toString()). + // detail("BeginVer", req.begin).detail("EndVer", reply.end). + // detail("MsgBytes", reply.messages.expectedSize()). + // detail("ForAddress", req.reply.getEndpoint().getPrimaryAddress()); + + if (sequence.present()) { + auto& trackerData = logData->peekTracker[peekId]; + trackerData.lastUpdate = now(); + + double queueT = blockStart - queueStart; + double blockT = workStart - blockStart; + double workT = now() - workStart; + + trackerData.totalPeeks++; + trackerData.replyBytes += reply.messages.size(); + + if (queueT > trackerData.queueMax) + trackerData.queueMax = queueT; + if (blockT > trackerData.blockMax) + trackerData.blockMax = blockT; + if (workT > trackerData.workMax) + trackerData.workMax = workT; + + trackerData.queueTime += queueT; + trackerData.blockTime += blockT; + trackerData.workTime += workT; + + auto& sequenceData = trackerData.sequence_version[sequenceNum + 1]; + if (trackerData.sequence_version.size() && sequenceNum + 1 < trackerData.sequence_version.begin()->first) { + if (!sequenceData.isSet()) { + // It would technically be more correct to .send({req.begin, req.onlySpilled}), as the next + // request might still be in the window of active requests, but LogSystemPeekCursor will + // throw away all future responses upon getting an operation_obsolete(), so computing a + // response will probably be a waste of CPU. + sequenceData.sendError(operation_obsolete()); + } + throw operation_obsolete(); + } + if (sequenceData.isSet()) { + trackerData.duplicatePeeks++; + if (sequenceData.getFuture().get().first != reply.end) { + TEST(true); // 1 tlog peek second attempt ended at a different version (2) + throw operation_obsolete(); + } + } else { + sequenceData.send(std::make_pair(reply.end, reply.onlySpilled)); + } + reply.begin = begin; + } + + return reply; +} + +// This actor keep pushing TLogPeekStreamReply until it's removed from the cluster or should recover +ACTOR Future tLogPeekStream(TLogData* self, TLogPeekStreamRequest req, Reference logData) { + self->activePeekStreams++; + TraceEvent(SevDebug, "TLogPeekStream", logData->logId).detail("Token", req.reply.getEndpoint().token); + + state Version begin = req.begin; + state bool onlySpilled = false; + if (req.tag.locality == tagLocalityTxs && req.tag.id >= logData->txsTags && logData->txsTags > 0) { + req.tag.id = req.tag.id % logData->txsTags; + } + req.reply.setByteLimit(std::min(SERVER_KNOBS->MAXIMUM_PEEK_BYTES, req.limitBytes)); + loop { + state TLogPeekStreamReply reply; + try { + wait(req.reply.onReady() && + store(reply.rep, peekTLog(self, logData, begin, req.tag, req.returnIfBlocked, onlySpilled))); + req.reply.send(reply); + begin = reply.rep.end; + onlySpilled = reply.rep.onlySpilled; + wait(delay(0, g_network->getCurrentTask())); + } catch (Error& e) { + self->activePeekStreams--; + TraceEvent(SevDebug, "TLogPeekStreamEnd", logData->logId).error(e, true); + + if (e.code() == error_code_end_of_stream) { + req.reply.sendError(e); + return Void(); + } else if (e.code() == error_code_operation_obsolete) { + // reply stream is cancelled on the client + return Void(); + } else { + throw; + } + } + } +} + ACTOR Future tLogPeekMessages(TLogData* self, TLogPeekRequest req, Reference logData) { + /*try { + TLogPeekReply reply = + wait(peekTLog(self, logData, req.begin, req.tag, req.returnIfBlocked, req.onlySpilled, req.sequence)); + req.reply.send(reply); + } catch (Error& e) { + if (e.code() == error_code_timed_out || e.code() == 
error_code_operation_obsolete || + e.code() == error_code_end_of_stream) { + req.reply.sendError(e); + return Void(); + } else { + throw; + } + } + + return Void();*/ state BinaryWriter messages(Unversioned()); state BinaryWriter messages2(Unversioned()); state int sequence = -1; @@ -1691,7 +2074,7 @@ ACTOR Future tLogPeekMessages(TLogData* self, TLogPeekRequest req, Referen if (req.onlySpilled) { endVersion = logData->persistentDataDurableVersion + 1; } else { - peekMessagesFromMemory(logData, req, messages2, endVersion); + peekMessagesFromMemory(logData, req.tag, req.begin, messages2, endVersion); } if (req.tag.locality == tagLocalityTxs || req.tag == txsTag) { @@ -1803,7 +2186,7 @@ ACTOR Future tLogPeekMessages(TLogData* self, TLogPeekRequest req, Referen if (req.onlySpilled) { endVersion = logData->persistentDataDurableVersion + 1; } else { - peekMessagesFromMemory(logData, req, messages, endVersion); + peekMessagesFromMemory(logData, req.tag, req.begin, messages, endVersion); } //TraceEvent("TLogPeekResults", self->dbgid).detail("ForAddress", req.reply.getEndpoint().getPrimaryAddress()).detail("MessageBytes", messages.getLength()).detail("NextEpoch", next_pos.epoch).detail("NextSeq", next_pos.sequence).detail("NowSeq", self->sequence.getNextSequence()); @@ -2789,6 +3172,7 @@ ACTOR Future restorePersistentState(TLogData* self, recruited.initEndpoints(); DUMPTOKEN(recruited.peekMessages); + DUMPTOKEN(recruited.peekStreamMessages); DUMPTOKEN(recruited.popMessages); DUMPTOKEN(recruited.commit); DUMPTOKEN(recruited.lock); @@ -3020,6 +3404,7 @@ ACTOR Future tLogStart(TLogData* self, InitializeTLogRequest req, Locality recruited.initEndpoints(); DUMPTOKEN(recruited.peekMessages); + DUMPTOKEN(recruited.peekStreamMessages); DUMPTOKEN(recruited.popMessages); DUMPTOKEN(recruited.commit); DUMPTOKEN(recruited.lock); @@ -3219,7 +3604,7 @@ ACTOR Future tLog(IKeyValueStore* persistentData, state TLogData self(tlogId, workerID, persistentData, persistentQueue, db, degraded, folder); state Future error = actorCollection(self.sharedActors.getFuture()); - TraceEvent("SharedTlog", tlogId); + TraceEvent("SharedTlog", tlogId).detail("Version", "6.2"); try { if (restoreFromDisk) { wait(restorePersistentState(&self, locality, oldLog, recovered, tlogRequests)); diff --git a/flow/error_definitions.h b/flow/error_definitions.h index 8ffb54f290..3af43b2d88 100755 --- a/flow/error_definitions.h +++ b/flow/error_definitions.h @@ -33,6 +33,7 @@ // clang-format off ERROR( success, 0, "Success" ) ERROR( end_of_stream, 1, "End of stream" ) +ERROR( no_action_needed, 2, "No action needed" ) ERROR( operation_failed, 1000, "Operation failed") ERROR( wrong_shard_server, 1001, "Shard is not available from this server") ERROR( operation_obsolete, 1002, "Operation result no longer necessary") From 68b08a32247763075524e0be67859961afe9bd35 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Wed, 21 Jul 2021 13:23:16 -0700 Subject: [PATCH 13/29] add TLog streaming peek to OldTLogServer_x_x --- fdbserver/OldTLogServer_4_6.actor.cpp | 344 ++++-------- fdbserver/OldTLogServer_6_0.actor.cpp | 544 ++++++------------ fdbserver/OldTLogServer_6_2.actor.cpp | 769 ++++++++------------------ 3 files changed, 478 insertions(+), 1179 deletions(-) diff --git a/fdbserver/OldTLogServer_4_6.actor.cpp b/fdbserver/OldTLogServer_4_6.actor.cpp index 13924707bd..f84a4a47c8 100644 --- a/fdbserver/OldTLogServer_4_6.actor.cpp +++ b/fdbserver/OldTLogServer_4_6.actor.cpp @@ -479,7 +479,7 @@ struct LogData : NonCopyable, public ReferenceCounted { }); 
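Each TLog version also exports the new activePeekStreams field through specialCounter, i.e., as a gauge backed by a lambda that is sampled whenever metrics are emitted, rather than a counter that must be incremented eagerly. A rough sketch of that registration pattern; MetricsRegistry is invented and far simpler than the real counter machinery:

#include <cstdint>
#include <cstdio>
#include <functional>
#include <map>
#include <string>

// Gauge registry sampled at emission time, like specialCounter(cc, name, f).
struct MetricsRegistry {
    std::map<std::string, std::function<int64_t()>> gauges;
    void specialCounter(const std::string& name, std::function<int64_t()> f) {
        gauges[name] = std::move(f);
    }
    void emit() const {
        for (const auto& [name, f] : gauges)
            std::printf("%s=%lld\n", name.c_str(), static_cast<long long>(f()));
    }
};

int main() {
    int activePeekStreams = 0;
    MetricsRegistry cc;
    // The lambda reads the live field, so the gauge is always current.
    cc.specialCounter("ActivePeekStreams", [&] { return int64_t(activePeekStreams); });
    ++activePeekStreams; // a peek stream was opened
    cc.emit();           // prints ActivePeekStreams=1
}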
specialCounter( cc, "QueueDiskBytesTotal", [tLogData]() { return tLogData->rawPersistentQueue->getStorageBytes().total; }); - specialCounter(cc, "ActivePeekStreams", [tLogData]() {return tLogData->activePeekStreams;}); + specialCounter(cc, "ActivePeekStreams", [tLogData]() { return tLogData->activePeekStreams; }); } ~LogData() { @@ -982,244 +982,44 @@ ACTOR Future peekTLog(TLogData* self, bool returnIfBlocked = false, bool reqOnlySpilled = false, Optional> sequence = Optional>()) { - state BinaryWriter messages(Unversioned()); - state BinaryWriter messages2(Unversioned()); - state int sequenceNum = -1; - state UID peekId; - state double queueStart = now(); - state OldTag oldTag = convertTag(tag); - - if (sequence.present()) { - peekId = sequence.get().first; - sequenceNum = sequence.get().second; - if (sequenceNum >= SERVER_KNOBS->PARALLEL_GET_MORE_REQUESTS && - self->peekTracker.find(peekId) == self->peekTracker.end()) { - throw operation_obsolete(); - } - if (sequenceNum > 0) { - auto& trackerData = self->peekTracker[peekId]; - trackerData.lastUpdate = now(); - Version ver = wait(trackerData.sequence_version[sequenceNum].getFuture()); - begin = std::max(ver, begin); - wait(yield()); - } - } - - if (returnIfBlocked && logData->version.get() < begin) { - throw end_of_stream(); - } - - //TraceEvent("TLogPeekMessages0", self->dbgid).detail("ReqBeginEpoch", req.begin.epoch).detail("ReqBeginSeq", req.begin.sequence).detail("Epoch", self->epoch()).detail("PersistentDataSeq", self->persistentDataSequence).detail("Tag1", req.tag1).detail("Tag2", req.tag2); - // Wait until we have something to return that the caller doesn't already have - if (logData->version.get() < begin) { - wait(logData->version.whenAtLeast(begin)); - wait(delay(SERVER_KNOBS->TLOG_PEEK_DELAY, g_network->getCurrentTask())); - } - - state Version endVersion = logData->version.get() + 1; - - Version poppedVer = poppedVersion(logData, oldTag); - if (poppedVer > begin) { - TLogPeekReply rep; - rep.maxKnownVersion = logData->version.get(); - rep.minKnownCommittedVersion = 0; - rep.popped = poppedVer; - rep.end = poppedVer; - rep.onlySpilled = false; - - if (sequence.present()) { - auto& trackerData = self->peekTracker[peekId]; - auto& sequenceData = trackerData.sequence_version[sequenceNum + 1]; - trackerData.lastUpdate = now(); - if (trackerData.sequence_version.size() && sequenceNum + 1 < trackerData.sequence_version.begin()->first) { - if (!sequenceData.isSet()) - sequenceData.sendError(operation_obsolete()); - throw operation_obsolete(); - } - if (sequenceData.isSet()) { - if (sequenceData.getFuture().get() != rep.end) { - TEST(true); // 0 tlog peek second attempt ended at a different version - throw operation_obsolete(); - } - } else { - sequenceData.send(rep.end); - } - rep.begin = begin; - } - - return rep; - } - - // grab messages from disk - //TraceEvent("TLogPeekMessages", self->dbgid).detail("ReqBeginEpoch", req.begin.epoch).detail("ReqBeginSeq", req.begin.sequence).detail("Epoch", self->epoch()).detail("PersistentDataSeq", self->persistentDataSequence).detail("Tag1", req.tag1).detail("Tag2", req.tag2); - if (begin <= logData->persistentDataDurableVersion) { - // Just in case the durable version changes while we are waiting for the read, we grab this data from memory. We - // may or may not actually send it depending on whether we get enough data from disk. 
SOMEDAY: Only do this if - // an initial attempt to read from disk results in insufficient data and the required data is no longer in - // memory SOMEDAY: Should we only send part of the messages we collected, to actually limit the size of the - // result? - - peekMessagesFromMemory(logData, tag, begin, messages2, endVersion); - - RangeResult kvs = wait(self->persistentData->readRange( - KeyRangeRef(persistTagMessagesKey(logData->logId, oldTag, begin), - persistTagMessagesKey(logData->logId, oldTag, logData->persistentDataDurableVersion + 1)), - SERVER_KNOBS->DESIRED_TOTAL_BYTES, - SERVER_KNOBS->DESIRED_TOTAL_BYTES)); - - //TraceEvent("TLogPeekResults", self->dbgid).detail("ForAddress", reply.getEndpoint().getPrimaryAddress()).detail("Tag1Results", s1).detail("Tag2Results", s2).detail("Tag1ResultsLim", kv1.size()).detail("Tag2ResultsLim", kv2.size()).detail("Tag1ResultsLast", kv1.size() ? kv1[0].key : "").detail("Tag2ResultsLast", kv2.size() ? kv2[0].key : "").detail("Limited", limited).detail("NextEpoch", next_pos.epoch).detail("NextSeq", next_pos.sequence).detail("NowEpoch", self->epoch()).detail("NowSeq", self->sequence.getNextSequence()); - - for (auto& kv : kvs) { - auto ver = decodeTagMessagesKey(kv.key); - messages << int32_t(-1) << ver; - - BinaryReader rd(kv.value, Unversioned()); - while (!rd.empty()) { - int32_t messageLength; - uint32_t subVersion; - rd >> messageLength >> subVersion; - messageLength += sizeof(uint16_t) + sizeof(Tag); - messages << messageLength << subVersion << uint16_t(1) << tag; - messageLength -= (sizeof(subVersion) + sizeof(uint16_t) + sizeof(Tag)); - messages.serializeBytes(rd.readBytes(messageLength), messageLength); - } - } - - if (kvs.expectedSize() >= SERVER_KNOBS->DESIRED_TOTAL_BYTES) - endVersion = decodeTagMessagesKey(kvs.end()[-1].key) + 1; - else - messages.serializeBytes(messages2.toValue()); - } else { - peekMessagesFromMemory(logData, tag, begin, messages, endVersion); - //TraceEvent("TLogPeekResults", self->dbgid).detail("ForAddress", reply.getEndpoint().getPrimaryAddress()).detail("MessageBytes", messages.getLength()).detail("NextEpoch", next_pos.epoch).detail("NextSeq", next_pos.sequence).detail("NowSeq", self->sequence.getNextSequence()); - } - - TLogPeekReply reply; - reply.maxKnownVersion = logData->version.get(); - reply.minKnownCommittedVersion = 0; - reply.onlySpilled = false; - reply.messages = StringRef(reply.arena, messages.toValue()); - reply.end = endVersion; - - //TraceEvent("TlogPeek", self->dbgid).detail("LogId", logData->logId).detail("EndVer", reply.end).detail("MsgBytes", reply.messages.expectedSize()).detail("ForAddress", req.reply.getEndpoint().getPrimaryAddress()); - - if (sequence.present()) { - auto& trackerData = self->peekTracker[peekId]; - trackerData.lastUpdate = now(); - auto& sequenceData = trackerData.sequence_version[sequenceNum + 1]; - if (sequenceData.isSet()) { - if (sequenceData.getFuture().get() != reply.end) { - TEST(true); // 0 tlog peek second attempt ended at a different version (2) - throw operation_obsolete(); - } - } else { - sequenceData.send(reply.end); - } - reply.begin = begin; - } - - return reply; -} - -// This actor keep pushing TLogPeekStreamReply until it's removed from the cluster or should recover -ACTOR Future tLogPeekStream(TLogData* self, TLogPeekStreamRequest req, Reference logData) { - self->activePeekStreams++; - TraceEvent(SevDebug, "TLogPeekStream", logData->logId).detail("Token", req.reply.getEndpoint().token); - - state Version begin = req.begin; - state bool onlySpilled = 
false; - - req.reply.setByteLimit(std::min(SERVER_KNOBS->MAXIMUM_PEEK_BYTES, req.limitBytes)); - loop { - state TLogPeekStreamReply reply; - try { - wait(req.reply.onReady() && - store(reply.rep, peekTLog(self, logData, begin, req.tag, req.returnIfBlocked, onlySpilled))); - req.reply.send(reply); - begin = reply.rep.end; - onlySpilled = reply.rep.onlySpilled; - wait(delay(0, g_network->getCurrentTask())); - } catch (Error& e) { - self->activePeekStreams--; - TraceEvent(SevDebug, "TLogPeekStreamEnd", logData->logId).error(e, true); - - if (e.code() == error_code_end_of_stream) { - req.reply.sendError(e); - return Void(); - } else if (e.code() == error_code_operation_obsolete) { - // reply stream is cancelled on the client - return Void(); - } else { - throw; - } - } - } -} - -ACTOR Future tLogPeekMessages(TLogData* self, TLogPeekRequest req, Reference logData) { - /*try { - TLogPeekReply reply = - wait(peekTLog(self, logData, req.begin, req.tag, req.returnIfBlocked, req.onlySpilled, req.sequence)); - req.reply.send(reply); - } catch (Error& e) { - if (e.code() == error_code_timed_out || e.code() == error_code_operation_obsolete || - e.code() == error_code_end_of_stream) { - req.reply.sendError(e); - return Void(); - } else { - throw; - } - } - - return Void();*/ - state BinaryWriter messages(Unversioned()); state BinaryWriter messages2(Unversioned()); - state int sequence = -1; + state int sequenceNum = -1; state UID peekId; - state OldTag oldTag = convertTag(req.tag); + state double queueStart = now(); + state OldTag oldTag = convertTag(tag); - if (req.sequence.present()) { - try { - peekId = req.sequence.get().first; - sequence = req.sequence.get().second; - if (sequence >= SERVER_KNOBS->PARALLEL_GET_MORE_REQUESTS && - self->peekTracker.find(peekId) == self->peekTracker.end()) { - throw operation_obsolete(); - } - if (sequence > 0) { - auto& trackerData = self->peekTracker[peekId]; - trackerData.lastUpdate = now(); - Version ver = wait(trackerData.sequence_version[sequence].getFuture()); - req.begin = std::max(ver, req.begin); - wait(yield()); - } - } catch (Error& e) { - if (e.code() == error_code_timed_out || e.code() == error_code_operation_obsolete) { - req.reply.sendError(e); - return Void(); - } else { - throw; - } + if (sequence.present()) { + peekId = sequence.get().first; + sequenceNum = sequence.get().second; + if (sequenceNum >= SERVER_KNOBS->PARALLEL_GET_MORE_REQUESTS && + self->peekTracker.find(peekId) == self->peekTracker.end()) { + throw operation_obsolete(); + } + if (sequenceNum > 0) { + auto& trackerData = self->peekTracker[peekId]; + trackerData.lastUpdate = now(); + Version ver = wait(trackerData.sequence_version[sequenceNum].getFuture()); + begin = std::max(ver, begin); + wait(yield()); } } - if (req.returnIfBlocked && logData->version.get() < req.begin) { - req.reply.sendError(end_of_stream()); - return Void(); + if (returnIfBlocked && logData->version.get() < begin) { + throw end_of_stream(); } //TraceEvent("TLogPeekMessages0", self->dbgid).detail("ReqBeginEpoch", req.begin.epoch).detail("ReqBeginSeq", req.begin.sequence).detail("Epoch", self->epoch()).detail("PersistentDataSeq", self->persistentDataSequence).detail("Tag1", req.tag1).detail("Tag2", req.tag2); // Wait until we have something to return that the caller doesn't already have - if (logData->version.get() < req.begin) { - wait(logData->version.whenAtLeast(req.begin)); + if (logData->version.get() < begin) { + wait(logData->version.whenAtLeast(begin)); wait(delay(SERVER_KNOBS->TLOG_PEEK_DELAY, 
g_network->getCurrentTask())); } state Version endVersion = logData->version.get() + 1; Version poppedVer = poppedVersion(logData, oldTag); - if (poppedVer > req.begin) { + if (poppedVer > begin) { TLogPeekReply rep; rep.maxKnownVersion = logData->version.get(); rep.minKnownCommittedVersion = 0; @@ -1227,50 +1027,47 @@ ACTOR Future tLogPeekMessages(TLogData* self, TLogPeekRequest req, Referen rep.end = poppedVer; rep.onlySpilled = false; - if (req.sequence.present()) { + if (sequence.present()) { auto& trackerData = self->peekTracker[peekId]; - auto& sequenceData = trackerData.sequence_version[sequence + 1]; + auto& sequenceData = trackerData.sequence_version[sequenceNum + 1]; trackerData.lastUpdate = now(); - if (trackerData.sequence_version.size() && sequence + 1 < trackerData.sequence_version.begin()->first) { - req.reply.sendError(operation_obsolete()); + if (trackerData.sequence_version.size() && sequenceNum + 1 < trackerData.sequence_version.begin()->first) { if (!sequenceData.isSet()) sequenceData.sendError(operation_obsolete()); - return Void(); + throw operation_obsolete(); } if (sequenceData.isSet()) { if (sequenceData.getFuture().get() != rep.end) { - TEST(true); // tlog peek second attempt ended at a different version - req.reply.sendError(operation_obsolete()); - return Void(); + TEST(true); // 0 tlog peek second attempt ended at a different version + throw operation_obsolete(); } } else { sequenceData.send(rep.end); } - rep.begin = req.begin; + rep.begin = begin; } - req.reply.send(rep); - return Void(); + return rep; } // grab messages from disk //TraceEvent("TLogPeekMessages", self->dbgid).detail("ReqBeginEpoch", req.begin.epoch).detail("ReqBeginSeq", req.begin.sequence).detail("Epoch", self->epoch()).detail("PersistentDataSeq", self->persistentDataSequence).detail("Tag1", req.tag1).detail("Tag2", req.tag2); - if (req.begin <= logData->persistentDataDurableVersion) { + if (begin <= logData->persistentDataDurableVersion) { // Just in case the durable version changes while we are waiting for the read, we grab this data from memory. We // may or may not actually send it depending on whether we get enough data from disk. SOMEDAY: Only do this if // an initial attempt to read from disk results in insufficient data and the required data is no longer in // memory SOMEDAY: Should we only send part of the messages we collected, to actually limit the size of the // result? - peekMessagesFromMemory(logData, req.tag, req.begin, messages2, endVersion); + peekMessagesFromMemory(logData, tag, begin, messages2, endVersion); RangeResult kvs = wait(self->persistentData->readRange( - KeyRangeRef(persistTagMessagesKey(logData->logId, oldTag, req.begin), + KeyRangeRef(persistTagMessagesKey(logData->logId, oldTag, begin), persistTagMessagesKey(logData->logId, oldTag, logData->persistentDataDurableVersion + 1)), SERVER_KNOBS->DESIRED_TOTAL_BYTES, SERVER_KNOBS->DESIRED_TOTAL_BYTES)); - //TraceEvent("TLogPeekResults", self->dbgid).detail("ForAddress", req.reply.getEndpoint().getPrimaryAddress()).detail("Tag1Results", s1).detail("Tag2Results", s2).detail("Tag1ResultsLim", kv1.size()).detail("Tag2ResultsLim", kv2.size()).detail("Tag1ResultsLast", kv1.size() ? kv1[0].key : "").detail("Tag2ResultsLast", kv2.size() ? 
kv2[0].key : "").detail("Limited", limited).detail("NextEpoch", next_pos.epoch).detail("NextSeq", next_pos.sequence).detail("NowEpoch", self->epoch()).detail("NowSeq", self->sequence.getNextSequence()); + //TraceEvent("TLogPeekResults", self->dbgid).detail("ForAddress", reply.getEndpoint().getPrimaryAddress()).detail("Tag1Results", s1).detail("Tag2Results", s2).detail("Tag1ResultsLim", kv1.size()).detail("Tag2ResultsLim", kv2.size()).detail("Tag1ResultsLast", kv1.size() ? kv1[0].key : "").detail("Tag2ResultsLast", kv2.size() ? kv2[0].key : "").detail("Limited", limited).detail("NextEpoch", next_pos.epoch).detail("NextSeq", next_pos.sequence).detail("NowEpoch", self->epoch()).detail("NowSeq", self->sequence.getNextSequence()); for (auto& kv : kvs) { auto ver = decodeTagMessagesKey(kv.key); @@ -1282,7 +1079,7 @@ ACTOR Future tLogPeekMessages(TLogData* self, TLogPeekRequest req, Referen uint32_t subVersion; rd >> messageLength >> subVersion; messageLength += sizeof(uint16_t) + sizeof(Tag); - messages << messageLength << subVersion << uint16_t(1) << req.tag; + messages << messageLength << subVersion << uint16_t(1) << tag; messageLength -= (sizeof(subVersion) + sizeof(uint16_t) + sizeof(Tag)); messages.serializeBytes(rd.readBytes(messageLength), messageLength); } @@ -1293,36 +1090,87 @@ ACTOR Future tLogPeekMessages(TLogData* self, TLogPeekRequest req, Referen else messages.serializeBytes(messages2.toValue()); } else { - peekMessagesFromMemory(logData, req.tag, req.begin, messages, endVersion); - //TraceEvent("TLogPeekResults", self->dbgid).detail("ForAddress", req.reply.getEndpoint().getPrimaryAddress()).detail("MessageBytes", messages.getLength()).detail("NextEpoch", next_pos.epoch).detail("NextSeq", next_pos.sequence).detail("NowSeq", self->sequence.getNextSequence()); + peekMessagesFromMemory(logData, tag, begin, messages, endVersion); + //TraceEvent("TLogPeekResults", self->dbgid).detail("ForAddress", reply.getEndpoint().getPrimaryAddress()).detail("MessageBytes", messages.getLength()).detail("NextEpoch", next_pos.epoch).detail("NextSeq", next_pos.sequence).detail("NowSeq", self->sequence.getNextSequence()); } TLogPeekReply reply; reply.maxKnownVersion = logData->version.get(); reply.minKnownCommittedVersion = 0; reply.onlySpilled = false; - reply.messages = messages.toValue(); + reply.messages = StringRef(reply.arena, messages.toValue()); reply.end = endVersion; //TraceEvent("TlogPeek", self->dbgid).detail("LogId", logData->logId).detail("EndVer", reply.end).detail("MsgBytes", reply.messages.expectedSize()).detail("ForAddress", req.reply.getEndpoint().getPrimaryAddress()); - if (req.sequence.present()) { + if (sequence.present()) { auto& trackerData = self->peekTracker[peekId]; trackerData.lastUpdate = now(); - auto& sequenceData = trackerData.sequence_version[sequence + 1]; + auto& sequenceData = trackerData.sequence_version[sequenceNum + 1]; if (sequenceData.isSet()) { if (sequenceData.getFuture().get() != reply.end) { - TEST(true); // tlog peek second attempt ended at a different version (2) - req.reply.sendError(operation_obsolete()); - return Void(); + TEST(true); // 0 tlog peek second attempt ended at a different version (2) + throw operation_obsolete(); } } else { sequenceData.send(reply.end); } - reply.begin = req.begin; + reply.begin = begin; + } + + return reply; +} + +// This actor keep pushing TLogPeekStreamReply until it's removed from the cluster or should recover +ACTOR Future tLogPeekStream(TLogData* self, TLogPeekStreamRequest req, Reference logData) { + 
self->activePeekStreams++; + TraceEvent(SevDebug, "TLogPeekStream", logData->logId).detail("Token", req.reply.getEndpoint().token); + + state Version begin = req.begin; + state bool onlySpilled = false; + + req.reply.setByteLimit(std::min(SERVER_KNOBS->MAXIMUM_PEEK_BYTES, req.limitBytes)); + loop { + state TLogPeekStreamReply reply; + try { + wait(req.reply.onReady() && + store(reply.rep, peekTLog(self, logData, begin, req.tag, req.returnIfBlocked, onlySpilled))); + req.reply.send(reply); + begin = reply.rep.end; + onlySpilled = reply.rep.onlySpilled; + wait(delay(0, g_network->getCurrentTask())); + } catch (Error& e) { + self->activePeekStreams--; + TraceEvent(SevDebug, "TLogPeekStreamEnd", logData->logId).error(e, true); + + if (e.code() == error_code_end_of_stream) { + req.reply.sendError(e); + return Void(); + } else if (e.code() == error_code_operation_obsolete) { + // reply stream is cancelled on the client + return Void(); + } else { + throw; + } + } + } +} + +ACTOR Future tLogPeekMessages(TLogData* self, TLogPeekRequest req, Reference logData) { + try { + TLogPeekReply reply = + wait(peekTLog(self, logData, req.begin, req.tag, req.returnIfBlocked, req.onlySpilled, req.sequence)); + req.reply.send(reply); + } catch (Error& e) { + if (e.code() == error_code_timed_out || e.code() == error_code_operation_obsolete || + e.code() == error_code_end_of_stream) { + req.reply.sendError(e); + return Void(); + } else { + throw; + } } - req.reply.send(reply); return Void(); } diff --git a/fdbserver/OldTLogServer_6_0.actor.cpp b/fdbserver/OldTLogServer_6_0.actor.cpp index 58e0e5563d..b76a865bac 100644 --- a/fdbserver/OldTLogServer_6_0.actor.cpp +++ b/fdbserver/OldTLogServer_6_0.actor.cpp @@ -574,7 +574,7 @@ struct LogData : NonCopyable, public ReferenceCounted { }); specialCounter( cc, "QueueDiskBytesTotal", [tLogData]() { return tLogData->rawPersistentQueue->getStorageBytes().total; }); - specialCounter(cc, "ActivePeekStreams", [tLogData]() { return tLogData->activePeekStreams;}); + specialCounter(cc, "ActivePeekStreams", [tLogData]() { return tLogData->activePeekStreams; }); } ~LogData() { @@ -1179,16 +1179,16 @@ void peekMessagesFromMemory(Reference self, Version begin, BinaryWriter& messages, Version& endVersion) { - ASSERT(!messages.getLength()); + ASSERT(!messages.getLength()); - auto& deque = getVersionMessages(self, tag); - //TraceEvent("TLogPeekMem", self->dbgid).detail("Tag", req.tag1).detail("PDS", self->persistentDataSequence).detail("PDDS", self->persistentDataDurableSequence).detail("Oldest", map1.empty() ? 0 : map1.begin()->key ).detail("OldestMsgCount", map1.empty() ? 0 : map1.begin()->value.size()); + auto& deque = getVersionMessages(self, tag); + //TraceEvent("TLogPeekMem", self->dbgid).detail("Tag", req.tag1).detail("PDS", self->persistentDataSequence).detail("PDDS", self->persistentDataDurableSequence).detail("Oldest", map1.empty() ? 0 : map1.begin()->key ).detail("OldestMsgCount", map1.empty() ? 
0 : map1.begin()->value.size()); - begin = std::max(begin, self->persistentDataDurableVersion + 1); - auto it = std::lower_bound(deque.begin(), - deque.end(), - std::make_pair(begin, LengthPrefixedStringRef()), - CompareFirst>()); + begin = std::max(begin, self->persistentDataDurableVersion + 1); + auto it = std::lower_bound(deque.begin(), + deque.end(), + std::make_pair(begin, LengthPrefixedStringRef()), + CompareFirst>()); Version currentVersion = -1; for (; it != deque.end(); ++it) { @@ -1215,375 +1215,88 @@ ACTOR Future peekTLog(TLogData* self, bool returnIfBlocked = false, bool reqOnlySpilled = false, Optional> sequence = Optional>()) { - state BinaryWriter messages(Unversioned()); - state BinaryWriter messages2(Unversioned()); - state int sequenceNum = -1; - state UID peekId; - state double queueStart = now(); - - if (tag.locality == tagLocalityTxs && tag.id >= logData->txsTags && logData->txsTags > 0) { - tag.id = tag.id % logData->txsTags; - } - - // TODO: once the fake stream is replaced by ReplyPromiseStream, we can remove the code handling sequence requests - // STEP: a. mark obsolete sequence requests; b. wait previous sequence requests are handled in order - if (sequence.present()) { - peekId = sequence.get().first; - sequenceNum = sequence.get().second; - if (sequenceNum >= SERVER_KNOBS->PARALLEL_GET_MORE_REQUESTS && - logData->peekTracker.find(peekId) == logData->peekTracker.end()) { - throw operation_obsolete(); - } - auto& trackerData = logData->peekTracker[peekId]; - if (sequenceNum == 0 && trackerData.sequence_version.find(0) == trackerData.sequence_version.end()) { - trackerData.tag = tag; - trackerData.sequence_version[0].send(std::make_pair(begin, reqOnlySpilled)); - } - auto seqBegin = trackerData.sequence_version.begin(); - // The peek cursor and this comparison need to agree about the maximum number of in-flight requests. 
- while (trackerData.sequence_version.size() && - seqBegin->first <= sequenceNum - SERVER_KNOBS->PARALLEL_GET_MORE_REQUESTS) { - if (seqBegin->second.canBeSet()) { - seqBegin->second.sendError(operation_obsolete()); - } - trackerData.sequence_version.erase(seqBegin); - seqBegin = trackerData.sequence_version.begin(); - } - - if (trackerData.sequence_version.size() && sequenceNum < seqBegin->first) { - throw operation_obsolete(); - } - - Future> fPrevPeekData = trackerData.sequence_version[sequenceNum].getFuture(); - if (fPrevPeekData.isReady()) { - trackerData.unblockedPeeks++; - double t = now() - trackerData.lastUpdate; - if (t > trackerData.idleMax) - trackerData.idleMax = t; - trackerData.idleTime += t; - } - trackerData.lastUpdate = now(); - std::pair prevPeekData = wait(fPrevPeekData); - begin = std::max(prevPeekData.first, begin); - reqOnlySpilled = prevPeekData.second; - wait(yield()); - } - - state double blockStart = now(); - - if (returnIfBlocked && logData->version.get() < begin) { - if (sequence.present()) { - auto& trackerData = logData->peekTracker[peekId]; - auto& sequenceData = trackerData.sequence_version[sequenceNum + 1]; - trackerData.lastUpdate = now(); - if (!sequenceData.isSet()) { - sequenceData.send(std::make_pair(begin, reqOnlySpilled)); - } - } - throw end_of_stream(); - } - - //TraceEvent("TLogPeekMessages0", self->dbgid).detail("ReqBeginEpoch", req.begin.epoch).detail("ReqBeginSeq", req.begin.sequence).detail("Epoch", self->epoch()).detail("PersistentDataSeq", self->persistentDataSequence).detail("Tag1", req.tag1).detail("Tag2", req.tag2); - // Wait until we have something to return that the caller doesn't already have - if (logData->version.get() < begin) { - wait(logData->version.whenAtLeast(begin)); - wait(delay(SERVER_KNOBS->TLOG_PEEK_DELAY, g_network->getCurrentTask())); - } - - if (logData->locality != tagLocalitySatellite && tag.locality == tagLocalityLogRouter) { - wait(self->concurrentLogRouterReads.take()); - state FlowLock::Releaser globalReleaser(self->concurrentLogRouterReads); - wait(delay(0.0, TaskPriority::Low)); - } - - if (begin <= logData->persistentDataDurableVersion && tag.locality != tagLocalityTxs && tag != txsTag) { - // Reading spilled data will almost always imply that the storage server is >5s behind the rest - // of the cluster. We shouldn't prioritize spending CPU on helping this server catch up - // slightly faster over keeping the rest of the cluster operating normally. - // txsTag is only ever peeked on recovery, and we would still wish to prioritize requests - // that impact recovery duration. - wait(delay(0, TaskPriority::TLogSpilledPeekReply)); - } - - state double workStart = now(); - - Version poppedVer = poppedVersion(logData, tag); - if (poppedVer > begin) { - // reply with an empty message and let the next reply start from poppedVer - TLogPeekReply rep; - rep.maxKnownVersion = logData->version.get(); - rep.minKnownCommittedVersion = logData->minKnownCommittedVersion; - rep.popped = poppedVer; - rep.end = poppedVer; - rep.onlySpilled = false; - - // TODO: once the fake stream is replaced by ReplyPromiseStream, we can remove the code handling sequence - // requests. 
- if (sequence.present()) { - auto& trackerData = logData->peekTracker[peekId]; - auto& sequenceData = trackerData.sequence_version[sequenceNum + 1]; - trackerData.lastUpdate = now(); - if (trackerData.sequence_version.size() && sequenceNum + 1 < trackerData.sequence_version.begin()->first) { - if (!sequenceData.isSet()) - sequenceData.sendError(operation_obsolete()); - throw operation_obsolete(); - } - if (sequenceData.isSet()) { - if (sequenceData.getFuture().get().first != rep.end) { - TEST(true); // 1 tlog peek second attempt ended at a different version - throw operation_obsolete(); - } - } else { - sequenceData.send(std::make_pair(rep.end, rep.onlySpilled)); - } - rep.begin = begin; - } - - return rep; - } - - state Version endVersion = logData->version.get() + 1; - state bool onlySpilled = false; - - // grab messages from disk - //TraceEvent("TLogPeekMessages", self->dbgid).detail("ReqBeginEpoch", req.begin.epoch).detail("ReqBeginSeq", req.begin.sequence).detail("Epoch", self->epoch()).detail("PersistentDataSeq", self->persistentDataSequence).detail("Tag1", req.tag1).detail("Tag2", req.tag2); - if (begin <= logData->persistentDataDurableVersion) { - // Just in case the durable version changes while we are waiting for the read, we grab this data from memory. We - // may or may not actually send it depending on whether we get enough data from disk. SOMEDAY: Only do this if - // an initial attempt to read from disk results in insufficient data and the required data is no longer in - // memory SOMEDAY: Should we only send part of the messages we collected, to actually limit the size of the - // result? - - if (reqOnlySpilled) { - endVersion = logData->persistentDataDurableVersion + 1; - } else { - peekMessagesFromMemory(logData, tag, begin, messages2, endVersion); - } - - RangeResult kvs = wait(self->persistentData->readRange( - KeyRangeRef(persistTagMessagesKey(logData->logId, tag, begin), - persistTagMessagesKey(logData->logId, tag, logData->persistentDataDurableVersion + 1)), - SERVER_KNOBS->DESIRED_TOTAL_BYTES, - SERVER_KNOBS->DESIRED_TOTAL_BYTES)); - - //TraceEvent("TLogPeekResults", self->dbgid).detail("ForAddress", reply.getEndpoint().address).detail("Tag1Results", s1).detail("Tag2Results", s2).detail("Tag1ResultsLim", kv1.size()).detail("Tag2ResultsLim", kv2.size()).detail("Tag1ResultsLast", kv1.size() ? kv1[0].key : "").detail("Tag2ResultsLast", kv2.size() ? 
kv2[0].key : "").detail("Limited", limited).detail("NextEpoch", next_pos.epoch).detail("NextSeq", next_pos.sequence).detail("NowEpoch", self->epoch()).detail("NowSeq", self->sequence.getNextSequence()); - - for (auto& kv : kvs) { - auto ver = decodeTagMessagesKey(kv.key); - messages << VERSION_HEADER << ver; - messages.serializeBytes(kv.value); - } - - if (kvs.expectedSize() >= SERVER_KNOBS->DESIRED_TOTAL_BYTES) { - endVersion = decodeTagMessagesKey(kvs.end()[-1].key) + 1; - onlySpilled = true; - } else { - messages.serializeBytes(messages2.toValue()); - } - } else { - peekMessagesFromMemory(logData, tag, begin, messages, endVersion); - //TraceEvent("TLogPeekResults", self->dbgid).detail("ForAddress", reply.getEndpoint().address).detail("MessageBytes", messages.getLength()).detail("NextEpoch", next_pos.epoch).detail("NextSeq", next_pos.sequence).detail("NowSeq", self->sequence.getNextSequence()); - } - - TLogPeekReply reply; - reply.maxKnownVersion = logData->version.get(); - reply.minKnownCommittedVersion = logData->minKnownCommittedVersion; - reply.messages = StringRef(reply.arena, messages.toValue()); - reply.end = endVersion; - reply.onlySpilled = onlySpilled; - - //TraceEvent("TlogPeek", self->dbgid).detail("LogId", logData->logId).detail("EndVer", reply.end).detail("MsgBytes", reply.messages.expectedSize()).detail("ForAddress", reply.getEndpoint().address); - - if (sequence.present()) { - auto& trackerData = logData->peekTracker[peekId]; - trackerData.lastUpdate = now(); - - double queueT = blockStart - queueStart; - double blockT = workStart - blockStart; - double workT = now() - workStart; - - trackerData.totalPeeks++; - trackerData.replyBytes += reply.messages.size(); - - if (queueT > trackerData.queueMax) - trackerData.queueMax = queueT; - if (blockT > trackerData.blockMax) - trackerData.blockMax = blockT; - if (workT > trackerData.workMax) - trackerData.workMax = workT; - - trackerData.queueTime += queueT; - trackerData.blockTime += blockT; - trackerData.workTime += workT; - - auto& sequenceData = trackerData.sequence_version[sequenceNum + 1]; - if (trackerData.sequence_version.size() && sequenceNum + 1 < trackerData.sequence_version.begin()->first) { - if (!sequenceData.isSet()) { - // It would technically be more correct to .send({req.begin, req.onlySpilled}), as the next - // request might still be in the window of active requests, but LogSystemPeekCursor will - // throw away all future responses upon getting an operation_obsolete(), so computing a - // response will probably be a waste of CPU. 
- sequenceData.sendError(operation_obsolete()); - } - throw operation_obsolete(); - } - if (sequenceData.isSet()) { - trackerData.duplicatePeeks++; - if (sequenceData.getFuture().get().first != reply.end) { - TEST(true); // 1 tlog peek second attempt ended at a different version (2) - throw operation_obsolete(); - } - } else { - sequenceData.send(std::make_pair(reply.end, reply.onlySpilled)); - } - reply.begin = begin; - } - - return reply; -} - -// This actor keep pushing TLogPeekStreamReply until it's removed from the cluster or should recover -ACTOR Future tLogPeekStream(TLogData* self, TLogPeekStreamRequest req, Reference logData) { - self->activePeekStreams++; - TraceEvent(SevDebug, "TLogPeekStream", logData->logId).detail("Token", req.reply.getEndpoint().token); - - state Version begin = req.begin; - state bool onlySpilled = false; - if (req.tag.locality == tagLocalityTxs && req.tag.id >= logData->txsTags && logData->txsTags > 0) { - req.tag.id = req.tag.id % logData->txsTags; - } - req.reply.setByteLimit(std::min(SERVER_KNOBS->MAXIMUM_PEEK_BYTES, req.limitBytes)); - loop { - state TLogPeekStreamReply reply; - try { - wait(req.reply.onReady() && - store(reply.rep, peekTLog(self, logData, begin, req.tag, req.returnIfBlocked, onlySpilled))); - req.reply.send(reply); - begin = reply.rep.end; - onlySpilled = reply.rep.onlySpilled; - wait(delay(0, g_network->getCurrentTask())); - } catch (Error& e) { - self->activePeekStreams--; - TraceEvent(SevDebug, "TLogPeekStreamEnd", logData->logId).error(e, true); - - if (e.code() == error_code_end_of_stream) { - req.reply.sendError(e); - return Void(); - } else if (e.code() == error_code_operation_obsolete) { - // reply stream is cancelled on the client - return Void(); - } else { - throw; - } - } - } -} - -ACTOR Future tLogPeekMessages(TLogData* self, TLogPeekRequest req, Reference logData) { - /*try { - TLogPeekReply reply = - wait(peekTLog(self, logData, req.begin, req.tag, req.returnIfBlocked, req.onlySpilled, req.sequence)); - req.reply.send(reply); - } catch (Error& e) { - if (e.code() == error_code_timed_out || e.code() == error_code_operation_obsolete || - e.code() == error_code_end_of_stream) { - req.reply.sendError(e); - return Void(); - } else { - throw; - } - } - - return Void();*/ state BinaryWriter messages(Unversioned()); state BinaryWriter messages2(Unversioned()); - state int sequence = -1; + state int sequenceNum = -1; state UID peekId; state double queueStart = now(); - if (req.tag.locality == tagLocalityTxs && req.tag.id >= logData->txsTags && logData->txsTags > 0) { - req.tag.id = req.tag.id % logData->txsTags; + if (tag.locality == tagLocalityTxs && tag.id >= logData->txsTags && logData->txsTags > 0) { + tag.id = tag.id % logData->txsTags; } - if (req.sequence.present()) { - try { - peekId = req.sequence.get().first; - sequence = req.sequence.get().second; - if (sequence >= SERVER_KNOBS->PARALLEL_GET_MORE_REQUESTS && - logData->peekTracker.find(peekId) == logData->peekTracker.end()) { - throw operation_obsolete(); - } - auto& trackerData = logData->peekTracker[peekId]; - if (sequence == 0 && trackerData.sequence_version.find(0) == trackerData.sequence_version.end()) { - trackerData.tag = req.tag; - trackerData.sequence_version[0].send(std::make_pair(req.begin, req.onlySpilled)); - } - auto seqBegin = trackerData.sequence_version.begin(); - while (trackerData.sequence_version.size() && - seqBegin->first <= sequence - SERVER_KNOBS->PARALLEL_GET_MORE_REQUESTS) { - if (seqBegin->second.canBeSet()) { - 
seqBegin->second.sendError(operation_obsolete()); - } - trackerData.sequence_version.erase(seqBegin); - seqBegin = trackerData.sequence_version.begin(); - } - - if (trackerData.sequence_version.size() && sequence < seqBegin->first) { - throw operation_obsolete(); - } - - Future> fPrevPeekData = trackerData.sequence_version[sequence].getFuture(); - if (fPrevPeekData.isReady()) { - trackerData.unblockedPeeks++; - double t = now() - trackerData.lastUpdate; - if (t > trackerData.idleMax) - trackerData.idleMax = t; - trackerData.idleTime += t; - } - trackerData.lastUpdate = now(); - std::pair prevPeekData = wait(fPrevPeekData); - - req.begin = std::max(prevPeekData.first, req.begin); - req.onlySpilled = prevPeekData.second; - wait(yield()); - } catch (Error& e) { - if (e.code() == error_code_timed_out || e.code() == error_code_operation_obsolete) { - req.reply.sendError(e); - return Void(); - } else { - throw; - } + // TODO: once the fake stream is replaced by ReplyPromiseStream, we can remove the code handling sequence requests + // STEP: a. mark obsolete sequence requests; b. wait previous sequence requests are handled in order + if (sequence.present()) { + peekId = sequence.get().first; + sequenceNum = sequence.get().second; + if (sequenceNum >= SERVER_KNOBS->PARALLEL_GET_MORE_REQUESTS && + logData->peekTracker.find(peekId) == logData->peekTracker.end()) { + throw operation_obsolete(); } + auto& trackerData = logData->peekTracker[peekId]; + if (sequenceNum == 0 && trackerData.sequence_version.find(0) == trackerData.sequence_version.end()) { + trackerData.tag = tag; + trackerData.sequence_version[0].send(std::make_pair(begin, reqOnlySpilled)); + } + auto seqBegin = trackerData.sequence_version.begin(); + // The peek cursor and this comparison need to agree about the maximum number of in-flight requests. 
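+ // Entries that are at least PARALLEL_GET_MORE_REQUESTS behind the newest sequence number are
+ // treated as abandoned below: any promise still waiting is failed with operation_obsolete() and its
+ // map entry is erased, so a stalled or restarted peek cursor cannot pin tracker state on this TLog.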
+ while (trackerData.sequence_version.size() && + seqBegin->first <= sequenceNum - SERVER_KNOBS->PARALLEL_GET_MORE_REQUESTS) { + if (seqBegin->second.canBeSet()) { + seqBegin->second.sendError(operation_obsolete()); + } + trackerData.sequence_version.erase(seqBegin); + seqBegin = trackerData.sequence_version.begin(); + } + + if (trackerData.sequence_version.size() && sequenceNum < seqBegin->first) { + throw operation_obsolete(); + } + + Future> fPrevPeekData = trackerData.sequence_version[sequenceNum].getFuture(); + if (fPrevPeekData.isReady()) { + trackerData.unblockedPeeks++; + double t = now() - trackerData.lastUpdate; + if (t > trackerData.idleMax) + trackerData.idleMax = t; + trackerData.idleTime += t; + } + trackerData.lastUpdate = now(); + std::pair prevPeekData = wait(fPrevPeekData); + begin = std::max(prevPeekData.first, begin); + reqOnlySpilled = prevPeekData.second; + wait(yield()); } state double blockStart = now(); - if (req.returnIfBlocked && logData->version.get() < req.begin) { - req.reply.sendError(end_of_stream()); - if (req.sequence.present()) { + if (returnIfBlocked && logData->version.get() < begin) { + if (sequence.present()) { auto& trackerData = logData->peekTracker[peekId]; - auto& sequenceData = trackerData.sequence_version[sequence + 1]; + auto& sequenceData = trackerData.sequence_version[sequenceNum + 1]; + trackerData.lastUpdate = now(); if (!sequenceData.isSet()) { - sequenceData.send(std::make_pair(req.begin, req.onlySpilled)); + sequenceData.send(std::make_pair(begin, reqOnlySpilled)); } } - return Void(); + throw end_of_stream(); } //TraceEvent("TLogPeekMessages0", self->dbgid).detail("ReqBeginEpoch", req.begin.epoch).detail("ReqBeginSeq", req.begin.sequence).detail("Epoch", self->epoch()).detail("PersistentDataSeq", self->persistentDataSequence).detail("Tag1", req.tag1).detail("Tag2", req.tag2); // Wait until we have something to return that the caller doesn't already have - if (logData->version.get() < req.begin) { - wait(logData->version.whenAtLeast(req.begin)); + if (logData->version.get() < begin) { + wait(logData->version.whenAtLeast(begin)); wait(delay(SERVER_KNOBS->TLOG_PEEK_DELAY, g_network->getCurrentTask())); } - if (logData->locality != tagLocalitySatellite && req.tag.locality == tagLocalityLogRouter) { + if (logData->locality != tagLocalitySatellite && tag.locality == tagLocalityLogRouter) { wait(self->concurrentLogRouterReads.take()); state FlowLock::Releaser globalReleaser(self->concurrentLogRouterReads); wait(delay(0.0, TaskPriority::Low)); } - if (req.begin <= logData->persistentDataDurableVersion && req.tag.locality != tagLocalityTxs && req.tag != txsTag) { + if (begin <= logData->persistentDataDurableVersion && tag.locality != tagLocalityTxs && tag != txsTag) { // Reading spilled data will almost always imply that the storage server is >5s behind the rest // of the cluster. We shouldn't prioritize spending CPU on helping this server catch up // slightly faster over keeping the rest of the cluster operating normally. 
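For orientation, the consumer side of the new stream looks roughly like this. The following is an illustrative Flow-style sketch, not code from this patch series: the helper name drainPeekStream is hypothetical, and it assumes a ServerPeekCursor whose peekReplyStream (declared in LogSystem.h) has already been established.

// Hypothetical client-side loop; waitNext() throws end_of_stream when the TLog closes the
// stream, mirroring the error handling in tLogPeekStream above.
ACTOR static Future<Void> drainPeekStream(ILogSystem::ServerPeekCursor* self) {
	loop {
		TLogPeekStreamReply res = waitNext(self->peekReplyStream.get().getFuture());
		self->results = res.rep; // adopt this batch of messages
		self->messageVersion.reset(res.rep.end); // the next batch resumes from here
	}
}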
@@ -1594,8 +1307,9 @@ ACTOR Future tLogPeekMessages(TLogData* self, TLogPeekRequest req, Referen state double workStart = now(); - Version poppedVer = poppedVersion(logData, req.tag); - if (poppedVer > req.begin) { + Version poppedVer = poppedVersion(logData, tag); + if (poppedVer > begin) { + // reply with an empty message and let the next reply start from poppedVer TLogPeekReply rep; rep.maxKnownVersion = logData->version.get(); rep.minKnownCommittedVersion = logData->minKnownCommittedVersion; @@ -1603,30 +1317,29 @@ ACTOR Future tLogPeekMessages(TLogData* self, TLogPeekRequest req, Referen rep.end = poppedVer; rep.onlySpilled = false; - if (req.sequence.present()) { + // TODO: once the fake stream is replaced by ReplyPromiseStream, we can remove the code handling sequence + // requests. + if (sequence.present()) { auto& trackerData = logData->peekTracker[peekId]; - auto& sequenceData = trackerData.sequence_version[sequence + 1]; + auto& sequenceData = trackerData.sequence_version[sequenceNum + 1]; trackerData.lastUpdate = now(); - if (trackerData.sequence_version.size() && sequence + 1 < trackerData.sequence_version.begin()->first) { - req.reply.sendError(operation_obsolete()); + if (trackerData.sequence_version.size() && sequenceNum + 1 < trackerData.sequence_version.begin()->first) { if (!sequenceData.isSet()) sequenceData.sendError(operation_obsolete()); - return Void(); + throw operation_obsolete(); } if (sequenceData.isSet()) { if (sequenceData.getFuture().get().first != rep.end) { - TEST(true); // tlog peek second attempt ended at a different version - req.reply.sendError(operation_obsolete()); - return Void(); + TEST(true); // 1 tlog peek second attempt ended at a different version + throw operation_obsolete(); } } else { sequenceData.send(std::make_pair(rep.end, rep.onlySpilled)); } - rep.begin = req.begin; + rep.begin = begin; } - req.reply.send(rep); - return Void(); + return rep; } state Version endVersion = logData->version.get() + 1; @@ -1634,26 +1347,26 @@ ACTOR Future tLogPeekMessages(TLogData* self, TLogPeekRequest req, Referen // grab messages from disk //TraceEvent("TLogPeekMessages", self->dbgid).detail("ReqBeginEpoch", req.begin.epoch).detail("ReqBeginSeq", req.begin.sequence).detail("Epoch", self->epoch()).detail("PersistentDataSeq", self->persistentDataSequence).detail("Tag1", req.tag1).detail("Tag2", req.tag2); - if (req.begin <= logData->persistentDataDurableVersion) { + if (begin <= logData->persistentDataDurableVersion) { // Just in case the durable version changes while we are waiting for the read, we grab this data from memory. We // may or may not actually send it depending on whether we get enough data from disk. SOMEDAY: Only do this if // an initial attempt to read from disk results in insufficient data and the required data is no longer in // memory SOMEDAY: Should we only send part of the messages we collected, to actually limit the size of the // result? 
- if (req.onlySpilled) { + if (reqOnlySpilled) { endVersion = logData->persistentDataDurableVersion + 1; } else { - peekMessagesFromMemory(logData, req.tag, req.begin, messages2, endVersion); + peekMessagesFromMemory(logData, tag, begin, messages2, endVersion); } RangeResult kvs = wait(self->persistentData->readRange( - KeyRangeRef(persistTagMessagesKey(logData->logId, req.tag, req.begin), - persistTagMessagesKey(logData->logId, req.tag, logData->persistentDataDurableVersion + 1)), + KeyRangeRef(persistTagMessagesKey(logData->logId, tag, begin), + persistTagMessagesKey(logData->logId, tag, logData->persistentDataDurableVersion + 1)), SERVER_KNOBS->DESIRED_TOTAL_BYTES, SERVER_KNOBS->DESIRED_TOTAL_BYTES)); - //TraceEvent("TLogPeekResults", self->dbgid).detail("ForAddress", req.reply.getEndpoint().address).detail("Tag1Results", s1).detail("Tag2Results", s2).detail("Tag1ResultsLim", kv1.size()).detail("Tag2ResultsLim", kv2.size()).detail("Tag1ResultsLast", kv1.size() ? kv1[0].key : "").detail("Tag2ResultsLast", kv2.size() ? kv2[0].key : "").detail("Limited", limited).detail("NextEpoch", next_pos.epoch).detail("NextSeq", next_pos.sequence).detail("NowEpoch", self->epoch()).detail("NowSeq", self->sequence.getNextSequence()); + //TraceEvent("TLogPeekResults", self->dbgid).detail("ForAddress", reply.getEndpoint().address).detail("Tag1Results", s1).detail("Tag2Results", s2).detail("Tag1ResultsLim", kv1.size()).detail("Tag2ResultsLim", kv2.size()).detail("Tag1ResultsLast", kv1.size() ? kv1[0].key : "").detail("Tag2ResultsLast", kv2.size() ? kv2[0].key : "").detail("Limited", limited).detail("NextEpoch", next_pos.epoch).detail("NextSeq", next_pos.sequence).detail("NowEpoch", self->epoch()).detail("NowSeq", self->sequence.getNextSequence()); for (auto& kv : kvs) { auto ver = decodeTagMessagesKey(kv.key); @@ -1668,20 +1381,20 @@ ACTOR Future tLogPeekMessages(TLogData* self, TLogPeekRequest req, Referen messages.serializeBytes(messages2.toValue()); } } else { - peekMessagesFromMemory(logData, req.tag, req.begin, messages, endVersion); - //TraceEvent("TLogPeekResults", self->dbgid).detail("ForAddress", req.reply.getEndpoint().address).detail("MessageBytes", messages.getLength()).detail("NextEpoch", next_pos.epoch).detail("NextSeq", next_pos.sequence).detail("NowSeq", self->sequence.getNextSequence()); + peekMessagesFromMemory(logData, tag, begin, messages, endVersion); + //TraceEvent("TLogPeekResults", self->dbgid).detail("ForAddress", reply.getEndpoint().address).detail("MessageBytes", messages.getLength()).detail("NextEpoch", next_pos.epoch).detail("NextSeq", next_pos.sequence).detail("NowSeq", self->sequence.getNextSequence()); } TLogPeekReply reply; reply.maxKnownVersion = logData->version.get(); reply.minKnownCommittedVersion = logData->minKnownCommittedVersion; - reply.messages = messages.toValue(); + reply.messages = StringRef(reply.arena, messages.toValue()); reply.end = endVersion; reply.onlySpilled = onlySpilled; - //TraceEvent("TlogPeek", self->dbgid).detail("LogId", logData->logId).detail("EndVer", reply.end).detail("MsgBytes", reply.messages.expectedSize()).detail("ForAddress", req.reply.getEndpoint().address); + //TraceEvent("TlogPeek", self->dbgid).detail("LogId", logData->logId).detail("EndVer", reply.end).detail("MsgBytes", reply.messages.expectedSize()).detail("ForAddress", reply.getEndpoint().address); - if (req.sequence.present()) { + if (sequence.present()) { auto& trackerData = logData->peekTracker[peekId]; trackerData.lastUpdate = now(); @@ -1703,27 +1416,84 @@ ACTOR Future 
tLogPeekMessages(TLogData* self, TLogPeekRequest req, Referen trackerData.blockTime += blockT; trackerData.workTime += workT; - auto& sequenceData = trackerData.sequence_version[sequence + 1]; - if (trackerData.sequence_version.size() && sequence + 1 < trackerData.sequence_version.begin()->first) { - req.reply.sendError(operation_obsolete()); - if (!sequenceData.isSet()) + auto& sequenceData = trackerData.sequence_version[sequenceNum + 1]; + if (trackerData.sequence_version.size() && sequenceNum + 1 < trackerData.sequence_version.begin()->first) { + if (!sequenceData.isSet()) { + // It would technically be more correct to .send({req.begin, req.onlySpilled}), as the next + // request might still be in the window of active requests, but LogSystemPeekCursor will + // throw away all future responses upon getting an operation_obsolete(), so computing a + // response will probably be a waste of CPU. sequenceData.sendError(operation_obsolete()); - return Void(); + } + throw operation_obsolete(); } if (sequenceData.isSet()) { trackerData.duplicatePeeks++; if (sequenceData.getFuture().get().first != reply.end) { - TEST(true); // tlog peek second attempt ended at a different version (2) - req.reply.sendError(operation_obsolete()); - return Void(); + TEST(true); // 1 tlog peek second attempt ended at a different version (2) + throw operation_obsolete(); } } else { sequenceData.send(std::make_pair(reply.end, reply.onlySpilled)); } - reply.begin = req.begin; + reply.begin = begin; + } + + return reply; +} + +// This actor keeps pushing TLogPeekStreamReply until the TLog is removed from the cluster or has to recover +ACTOR Future tLogPeekStream(TLogData* self, TLogPeekStreamRequest req, Reference logData) { + self->activePeekStreams++; + TraceEvent(SevDebug, "TLogPeekStream", logData->logId).detail("Token", req.reply.getEndpoint().token); + + state Version begin = req.begin; + state bool onlySpilled = false; + if (req.tag.locality == tagLocalityTxs && req.tag.id >= logData->txsTags && logData->txsTags > 0) { + req.tag.id = req.tag.id % logData->txsTags; + } + req.reply.setByteLimit(std::min(SERVER_KNOBS->MAXIMUM_PEEK_BYTES, req.limitBytes)); + loop { + state TLogPeekStreamReply reply; + try { + wait(req.reply.onReady() && + store(reply.rep, peekTLog(self, logData, begin, req.tag, req.returnIfBlocked, onlySpilled))); + req.reply.send(reply); + begin = reply.rep.end; + onlySpilled = reply.rep.onlySpilled; + wait(delay(0, g_network->getCurrentTask())); + } catch (Error& e) { + self->activePeekStreams--; + TraceEvent(SevDebug, "TLogPeekStreamEnd", logData->logId).error(e, true); + + if (e.code() == error_code_end_of_stream) { + req.reply.sendError(e); + return Void(); + } else if (e.code() == error_code_operation_obsolete) { + // reply stream is cancelled on the client + return Void(); + } else { + throw; + } + } + } +} + +ACTOR Future tLogPeekMessages(TLogData* self, TLogPeekRequest req, Reference logData) { + try { + TLogPeekReply reply = + wait(peekTLog(self, logData, req.begin, req.tag, req.returnIfBlocked, req.onlySpilled, req.sequence)); + req.reply.send(reply); + } catch (Error& e) { + if (e.code() == error_code_timed_out || e.code() == error_code_operation_obsolete || + e.code() == error_code_end_of_stream) { + req.reply.sendError(e); + return Void(); + } else { + throw; + } } - req.reply.send(reply); return Void(); } @@ -2223,9 +1993,9 @@ ACTOR Future serveTLogInterface(TLogData* self, when(TLogPeekRequest req = waitNext(tli.peekMessages.getFuture())) { logData->addActor.send(tLogPeekMessages(self, req,
logData)); } - when(TLogPeekStreamRequest req = waitNext(tli.peekStreamMessages.getFuture())) { - logData->addActor.send(tLogPeekStream(self, req, logData)); - } + when(TLogPeekStreamRequest req = waitNext(tli.peekStreamMessages.getFuture())) { + logData->addActor.send(tLogPeekStream(self, req, logData)); + } when(TLogPopRequest req = waitNext(tli.popMessages.getFuture())) { logData->addActor.send(tLogPop(self, req, logData)); } diff --git a/fdbserver/OldTLogServer_6_2.actor.cpp b/fdbserver/OldTLogServer_6_2.actor.cpp index 72a5773a72..7f6bea5f57 100644 --- a/fdbserver/OldTLogServer_6_2.actor.cpp +++ b/fdbserver/OldTLogServer_6_2.actor.cpp @@ -663,7 +663,7 @@ struct LogData : NonCopyable, public ReferenceCounted { cc, "QueueDiskBytesTotal", [tLogData]() { return tLogData->rawPersistentQueue->getStorageBytes().total; }); specialCounter(cc, "PeekMemoryReserved", [tLogData]() { return tLogData->peekMemoryLimiter.activePermits(); }); specialCounter(cc, "PeekMemoryRequestsStalled", [tLogData]() { return tLogData->peekMemoryLimiter.waiters(); }); - specialCounter(cc, "ActivePeekStreams", [tLogData]() {return tLogData->activePeekStreams;}); + specialCounter(cc, "ActivePeekStreams", [tLogData]() { return tLogData->activePeekStreams; }); } ~LogData() { @@ -1443,17 +1443,19 @@ ACTOR Future tLogPopCore(TLogData* self, Tag inputTag, Version to, Referen } uint64_t PoppedVersionLag = logData->persistentDataDurableVersion - logData->queuePoppedVersion; - if ( SERVER_KNOBS->ENABLE_DETAILED_TLOG_POP_TRACE && - (logData->queuePoppedVersion > 0) && //avoid generating massive events at beginning - (tagData->unpoppedRecovered || PoppedVersionLag >= SERVER_KNOBS->TLOG_POPPED_VER_LAG_THRESHOLD_FOR_TLOGPOP_TRACE)) { //when recovery or long lag + if (SERVER_KNOBS->ENABLE_DETAILED_TLOG_POP_TRACE && + (logData->queuePoppedVersion > 0) && // avoid generating massive events at beginning + (tagData->unpoppedRecovered || + PoppedVersionLag >= + SERVER_KNOBS->TLOG_POPPED_VER_LAG_THRESHOLD_FOR_TLOGPOP_TRACE)) { // when recovery or long lag TraceEvent("TLogPopDetails", logData->logId) - .detail("Tag", tagData->tag.toString()) - .detail("UpTo", upTo) - .detail("PoppedVersionLag", PoppedVersionLag) - .detail("MinPoppedTag", logData->minPoppedTag.toString()) - .detail("QueuePoppedVersion", logData->queuePoppedVersion) - .detail("UnpoppedRecovered", tagData->unpoppedRecovered ? "True" : "False") - .detail("NothingPersistent", tagData->nothingPersistent ? "True" : "False"); + .detail("Tag", tagData->tag.toString()) + .detail("UpTo", upTo) + .detail("PoppedVersionLag", PoppedVersionLag) + .detail("MinPoppedTag", logData->minPoppedTag.toString()) + .detail("QueuePoppedVersion", logData->queuePoppedVersion) + .detail("UnpoppedRecovered", tagData->unpoppedRecovered ? "True" : "False") + .detail("NothingPersistent", tagData->nothingPersistent ? "True" : "False"); } if (upTo > logData->persistentDataDurableVersion) wait(tagData->eraseMessagesBefore(upTo, self, logData, TaskPriority::TLogPop)); @@ -1494,16 +1496,16 @@ void peekMessagesFromMemory(Reference self, Version begin, BinaryWriter& messages, Version& endVersion) { - ASSERT(!messages.getLength()); + ASSERT(!messages.getLength()); - auto& deque = getVersionMessages(self, tag); - //TraceEvent("TLogPeekMem", self->dbgid).detail("Tag", req.tag1).detail("PDS", self->persistentDataSequence).detail("PDDS", self->persistentDataDurableSequence).detail("Oldest", map1.empty() ? 0 : map1.begin()->key ).detail("OldestMsgCount", map1.empty() ? 
0 : map1.begin()->value.size()); + auto& deque = getVersionMessages(self, tag); + //TraceEvent("TLogPeekMem", self->dbgid).detail("Tag", req.tag1).detail("PDS", self->persistentDataSequence).detail("PDDS", self->persistentDataDurableSequence).detail("Oldest", map1.empty() ? 0 : map1.begin()->key ).detail("OldestMsgCount", map1.empty() ? 0 : map1.begin()->value.size()); - begin = std::max(begin, self->persistentDataDurableVersion + 1); - auto it = std::lower_bound(deque.begin(), - deque.end(), - std::make_pair(begin, LengthPrefixedStringRef()), - CompareFirst>()); + begin = std::max(begin, self->persistentDataDurableVersion + 1); + auto it = std::lower_bound(deque.begin(), + deque.end(), + std::make_pair(begin, LengthPrefixedStringRef()), + CompareFirst>()); Version currentVersion = -1; for (; it != deque.end(); ++it) { @@ -1552,139 +1554,139 @@ ACTOR Future peekTLog(TLogData* self, bool returnIfBlocked = false, bool reqOnlySpilled = false, Optional> sequence = Optional>()) { - state BinaryWriter messages(Unversioned()); - state BinaryWriter messages2(Unversioned()); - state int sequenceNum = -1; - state UID peekId; - state double queueStart = now(); + state BinaryWriter messages(Unversioned()); + state BinaryWriter messages2(Unversioned()); + state int sequenceNum = -1; + state UID peekId; + state double queueStart = now(); - if (tag.locality == tagLocalityTxs && tag.id >= logData->txsTags && logData->txsTags > 0) { - tag.id = tag.id % logData->txsTags; - } + if (tag.locality == tagLocalityTxs && tag.id >= logData->txsTags && logData->txsTags > 0) { + tag.id = tag.id % logData->txsTags; + } - // TODO: once the fake stream is replaced by ReplyPromiseStream, we can remove the code handling sequence requests - // STEP: a. mark obsolete sequence requests; b. wait previous sequence requests are handled in order - if (sequence.present()) { - peekId = sequence.get().first; - sequenceNum = sequence.get().second; - if (sequenceNum >= SERVER_KNOBS->PARALLEL_GET_MORE_REQUESTS && - logData->peekTracker.find(peekId) == logData->peekTracker.end()) { - throw operation_obsolete(); - } - auto& trackerData = logData->peekTracker[peekId]; - if (sequenceNum == 0 && trackerData.sequence_version.find(0) == trackerData.sequence_version.end()) { - trackerData.tag = tag; - trackerData.sequence_version[0].send(std::make_pair(begin, reqOnlySpilled)); - } - auto seqBegin = trackerData.sequence_version.begin(); - // The peek cursor and this comparison need to agree about the maximum number of in-flight requests. - while (trackerData.sequence_version.size() && - seqBegin->first <= sequenceNum - SERVER_KNOBS->PARALLEL_GET_MORE_REQUESTS) { - if (seqBegin->second.canBeSet()) { - seqBegin->second.sendError(operation_obsolete()); - } - trackerData.sequence_version.erase(seqBegin); - seqBegin = trackerData.sequence_version.begin(); - } + // TODO: once the fake stream is replaced by ReplyPromiseStream, we can remove the code handling sequence requests + // STEP: a. mark obsolete sequence requests; b. 
wait previous sequence requests are handled in order + if (sequence.present()) { + peekId = sequence.get().first; + sequenceNum = sequence.get().second; + if (sequenceNum >= SERVER_KNOBS->PARALLEL_GET_MORE_REQUESTS && + logData->peekTracker.find(peekId) == logData->peekTracker.end()) { + throw operation_obsolete(); + } + auto& trackerData = logData->peekTracker[peekId]; + if (sequenceNum == 0 && trackerData.sequence_version.find(0) == trackerData.sequence_version.end()) { + trackerData.tag = tag; + trackerData.sequence_version[0].send(std::make_pair(begin, reqOnlySpilled)); + } + auto seqBegin = trackerData.sequence_version.begin(); + // The peek cursor and this comparison need to agree about the maximum number of in-flight requests. + while (trackerData.sequence_version.size() && + seqBegin->first <= sequenceNum - SERVER_KNOBS->PARALLEL_GET_MORE_REQUESTS) { + if (seqBegin->second.canBeSet()) { + seqBegin->second.sendError(operation_obsolete()); + } + trackerData.sequence_version.erase(seqBegin); + seqBegin = trackerData.sequence_version.begin(); + } - if (trackerData.sequence_version.size() && sequenceNum < seqBegin->first) { - throw operation_obsolete(); - } + if (trackerData.sequence_version.size() && sequenceNum < seqBegin->first) { + throw operation_obsolete(); + } - Future> fPrevPeekData = trackerData.sequence_version[sequenceNum].getFuture(); - if (fPrevPeekData.isReady()) { - trackerData.unblockedPeeks++; - double t = now() - trackerData.lastUpdate; - if (t > trackerData.idleMax) - trackerData.idleMax = t; - trackerData.idleTime += t; - } - trackerData.lastUpdate = now(); - std::pair prevPeekData = wait(fPrevPeekData); - begin = std::max(prevPeekData.first, begin); - reqOnlySpilled = prevPeekData.second; - wait(yield()); - } + Future> fPrevPeekData = trackerData.sequence_version[sequenceNum].getFuture(); + if (fPrevPeekData.isReady()) { + trackerData.unblockedPeeks++; + double t = now() - trackerData.lastUpdate; + if (t > trackerData.idleMax) + trackerData.idleMax = t; + trackerData.idleTime += t; + } + trackerData.lastUpdate = now(); + std::pair prevPeekData = wait(fPrevPeekData); + begin = std::max(prevPeekData.first, begin); + reqOnlySpilled = prevPeekData.second; + wait(yield()); + } - state double blockStart = now(); + state double blockStart = now(); - if (returnIfBlocked && logData->version.get() < begin) { - if (sequence.present()) { - auto& trackerData = logData->peekTracker[peekId]; - auto& sequenceData = trackerData.sequence_version[sequenceNum + 1]; - trackerData.lastUpdate = now(); - if (!sequenceData.isSet()) { - sequenceData.send(std::make_pair(begin, reqOnlySpilled)); - } - } - throw end_of_stream(); - } + if (returnIfBlocked && logData->version.get() < begin) { + if (sequence.present()) { + auto& trackerData = logData->peekTracker[peekId]; + auto& sequenceData = trackerData.sequence_version[sequenceNum + 1]; + trackerData.lastUpdate = now(); + if (!sequenceData.isSet()) { + sequenceData.send(std::make_pair(begin, reqOnlySpilled)); + } + } + throw end_of_stream(); + } - //TraceEvent("TLogPeekMessages0", self->dbgid).detail("ReqBeginEpoch", req.begin.epoch).detail("ReqBeginSeq", req.begin.sequence).detail("Epoch", self->epoch()).detail("PersistentDataSeq", self->persistentDataSequence).detail("Tag1", req.tag1).detail("Tag2", req.tag2); - // Wait until we have something to return that the caller doesn't already have - if (logData->version.get() < begin) { - wait(logData->version.whenAtLeast(begin)); - wait(delay(SERVER_KNOBS->TLOG_PEEK_DELAY, 
g_network->getCurrentTask())); - } + //TraceEvent("TLogPeekMessages0", self->dbgid).detail("ReqBeginEpoch", req.begin.epoch).detail("ReqBeginSeq", req.begin.sequence).detail("Epoch", self->epoch()).detail("PersistentDataSeq", self->persistentDataSequence).detail("Tag1", req.tag1).detail("Tag2", req.tag2); + // Wait until we have something to return that the caller doesn't already have + if (logData->version.get() < begin) { + wait(logData->version.whenAtLeast(begin)); + wait(delay(SERVER_KNOBS->TLOG_PEEK_DELAY, g_network->getCurrentTask())); + } - if (logData->locality != tagLocalitySatellite && tag.locality == tagLocalityLogRouter) { - wait(self->concurrentLogRouterReads.take()); - state FlowLock::Releaser globalReleaser(self->concurrentLogRouterReads); - wait(delay(0.0, TaskPriority::Low)); - } + if (logData->locality != tagLocalitySatellite && tag.locality == tagLocalityLogRouter) { + wait(self->concurrentLogRouterReads.take()); + state FlowLock::Releaser globalReleaser(self->concurrentLogRouterReads); + wait(delay(0.0, TaskPriority::Low)); + } - if (begin <= logData->persistentDataDurableVersion && tag.locality != tagLocalityTxs && tag != txsTag) { - // Reading spilled data will almost always imply that the storage server is >5s behind the rest - // of the cluster. We shouldn't prioritize spending CPU on helping this server catch up - // slightly faster over keeping the rest of the cluster operating normally. - // txsTag is only ever peeked on recovery, and we would still wish to prioritize requests - // that impact recovery duration. - wait(delay(0, TaskPriority::TLogSpilledPeekReply)); - } + if (begin <= logData->persistentDataDurableVersion && tag.locality != tagLocalityTxs && tag != txsTag) { + // Reading spilled data will almost always imply that the storage server is >5s behind the rest + // of the cluster. We shouldn't prioritize spending CPU on helping this server catch up + // slightly faster over keeping the rest of the cluster operating normally. + // txsTag is only ever peeked on recovery, and we would still wish to prioritize requests + // that impact recovery duration. + wait(delay(0, TaskPriority::TLogSpilledPeekReply)); + } - state double workStart = now(); + state double workStart = now(); - Version poppedVer = poppedVersion(logData, tag); - if (poppedVer > begin) { - // reply with an empty message and let the next reply start from poppedVer - TLogPeekReply rep; - rep.maxKnownVersion = logData->version.get(); - rep.minKnownCommittedVersion = logData->minKnownCommittedVersion; - rep.popped = poppedVer; - rep.end = poppedVer; - rep.onlySpilled = false; + Version poppedVer = poppedVersion(logData, tag); + if (poppedVer > begin) { + // reply with an empty message and let the next reply start from poppedVer + TLogPeekReply rep; + rep.maxKnownVersion = logData->version.get(); + rep.minKnownCommittedVersion = logData->minKnownCommittedVersion; + rep.popped = poppedVer; + rep.end = poppedVer; + rep.onlySpilled = false; - // TODO: once the fake stream is replaced by ReplyPromiseStream, we can remove the code handling sequence - // requests. 
- if (sequence.present()) { - auto& trackerData = logData->peekTracker[peekId]; - auto& sequenceData = trackerData.sequence_version[sequenceNum + 1]; - trackerData.lastUpdate = now(); - if (trackerData.sequence_version.size() && sequenceNum + 1 < trackerData.sequence_version.begin()->first) { - if (!sequenceData.isSet()) - sequenceData.sendError(operation_obsolete()); - throw operation_obsolete(); - } - if (sequenceData.isSet()) { - if (sequenceData.getFuture().get().first != rep.end) { - TEST(true); // 1 tlog peek second attempt ended at a different version - throw operation_obsolete(); - } - } else { - sequenceData.send(std::make_pair(rep.end, rep.onlySpilled)); - } - rep.begin = begin; - } + // TODO: once the fake stream is replaced by ReplyPromiseStream, we can remove the code handling sequence + // requests. + if (sequence.present()) { + auto& trackerData = logData->peekTracker[peekId]; + auto& sequenceData = trackerData.sequence_version[sequenceNum + 1]; + trackerData.lastUpdate = now(); + if (trackerData.sequence_version.size() && sequenceNum + 1 < trackerData.sequence_version.begin()->first) { + if (!sequenceData.isSet()) + sequenceData.sendError(operation_obsolete()); + throw operation_obsolete(); + } + if (sequenceData.isSet()) { + if (sequenceData.getFuture().get().first != rep.end) { + TEST(true); // 1 tlog peek second attempt ended at a different version + throw operation_obsolete(); + } + } else { + sequenceData.send(std::make_pair(rep.end, rep.onlySpilled)); + } + rep.begin = begin; + } - return rep; - } + return rep; + } - state Version endVersion = logData->version.get() + 1; - state bool onlySpilled = false; + state Version endVersion = logData->version.get() + 1; + state bool onlySpilled = false; - // grab messages from disk - //TraceEvent("TLogPeekMessages", self->dbgid).detail("ReqBeginEpoch", req.begin.epoch).detail("ReqBeginSeq", req.begin.sequence).detail("Epoch", self->epoch()).detail("PersistentDataSeq", self->persistentDataSequence).detail("Tag1", req.tag1).detail("Tag2", req.tag2); - if (begin <= logData->persistentDataDurableVersion) { + // grab messages from disk + //TraceEvent("TLogPeekMessages", self->dbgid).detail("ReqBeginEpoch", req.begin.epoch).detail("ReqBeginSeq", req.begin.sequence).detail("Epoch", self->epoch()).detail("PersistentDataSeq", self->persistentDataSequence).detail("Tag1", req.tag1).detail("Tag2", req.tag2); + if (begin <= logData->persistentDataDurableVersion) { // Just in case the durable version changes while we are waiting for the read, we grab this data from memory. We // may or may not actually send it depending on whether we get enough data from disk. 
SOMEDAY: Only do this if // an initial attempt to read from disk results in insufficient data and the required data is no longer in @@ -1697,390 +1699,10 @@ ACTOR Future peekTLog(TLogData* self, peekMessagesFromMemory(logData, tag, begin, messages2, endVersion); } - if (tag.locality == tagLocalityTxs || tag == txsTag) { - RangeResult kvs = wait(self->persistentData->readRange( - KeyRangeRef(persistTagMessagesKey(logData->logId, tag, begin), - persistTagMessagesKey(logData->logId, tag, logData->persistentDataDurableVersion + 1)), - SERVER_KNOBS->DESIRED_TOTAL_BYTES, - SERVER_KNOBS->DESIRED_TOTAL_BYTES)); - - for (auto& kv : kvs) { - auto ver = decodeTagMessagesKey(kv.key); - messages << VERSION_HEADER << ver; - messages.serializeBytes(kv.value); - } - - if (kvs.expectedSize() >= SERVER_KNOBS->DESIRED_TOTAL_BYTES) { - endVersion = decodeTagMessagesKey(kvs.end()[-1].key) + 1; - onlySpilled = true; - } else { - messages.serializeBytes(messages2.toValue()); - } - } else { - // FIXME: Limit to approximately DESIRED_TOTATL_BYTES somehow. - RangeResult kvrefs = wait(self->persistentData->readRange( - KeyRangeRef( - persistTagMessageRefsKey(logData->logId, tag, begin), - persistTagMessageRefsKey(logData->logId, tag, logData->persistentDataDurableVersion + 1)), - SERVER_KNOBS->TLOG_SPILL_REFERENCE_MAX_BATCHES_PER_PEEK + 1)); - - //TraceEvent("TLogPeekResults", self->dbgid).detail("ForAddress", reply.getEndpoint().getPrimaryAddress()).detail("Tag1Results", s1).detail("Tag2Results", s2).detail("Tag1ResultsLim", kv1.size()).detail("Tag2ResultsLim", kv2.size()).detail("Tag1ResultsLast", kv1.size() ? kv1[0].key : "").detail("Tag2ResultsLast", kv2.size() ? kv2[0].key : "").detail("Limited", limited).detail("NextEpoch", next_pos.epoch).detail("NextSeq", next_pos.sequence).detail("NowEpoch", self->epoch()).detail("NowSeq", self->sequence.getNextSequence()); - - state std::vector> commitLocations; - state bool earlyEnd = false; - uint32_t mutationBytes = 0; - state uint64_t commitBytes = 0; - state Version firstVersion = std::numeric_limits::max(); - for (int i = 0; i < kvrefs.size() && i < SERVER_KNOBS->TLOG_SPILL_REFERENCE_MAX_BATCHES_PER_PEEK; i++) { - auto& kv = kvrefs[i]; - VectorRef spilledData; - BinaryReader r(kv.value, AssumeVersion(logData->protocolVersion)); - r >> spilledData; - for (const SpilledData& sd : spilledData) { - if (mutationBytes >= SERVER_KNOBS->DESIRED_TOTAL_BYTES) { - earlyEnd = true; - break; - } - if (sd.version >= begin) { - firstVersion = std::min(firstVersion, sd.version); - const IDiskQueue::location end = sd.start.lo + sd.length; - commitLocations.emplace_back(sd.start, end); - // This isn't perfect, because we aren't accounting for page boundaries, but should be - // close enough. 
- commitBytes += sd.length; - mutationBytes += sd.mutationBytes; - } - } - if (earlyEnd) - break; - } - earlyEnd = earlyEnd || (kvrefs.size() >= SERVER_KNOBS->TLOG_SPILL_REFERENCE_MAX_BATCHES_PER_PEEK + 1); - wait(self->peekMemoryLimiter.take(TaskPriority::TLogSpilledPeekReply, commitBytes)); - state FlowLock::Releaser memoryReservation(self->peekMemoryLimiter, commitBytes); - state std::vector>> messageReads; - messageReads.reserve(commitLocations.size()); - for (const auto& pair : commitLocations) { - messageReads.push_back(self->rawPersistentQueue->read(pair.first, pair.second, CheckHashes::TRUE)); - } - commitLocations.clear(); - wait(waitForAll(messageReads)); - - state Version lastRefMessageVersion = 0; - state int index = 0; - loop { - if (index >= messageReads.size()) - break; - Standalone queueEntryData = messageReads[index].get(); - uint8_t valid; - const uint32_t length = *(uint32_t*)queueEntryData.begin(); - queueEntryData = queueEntryData.substr(4, queueEntryData.size() - 4); - BinaryReader rd(queueEntryData, IncludeVersion()); - state TLogQueueEntry entry; - rd >> entry >> valid; - ASSERT(valid == 0x01); - ASSERT(length + sizeof(valid) == queueEntryData.size()); - - messages << VERSION_HEADER << entry.version; - - std::vector rawMessages = - wait(parseMessagesForTag(entry.messages, tag, logData->logRouterTags)); - for (const StringRef& msg : rawMessages) { - messages.serializeBytes(msg); - } - - lastRefMessageVersion = entry.version; - index++; - } - - messageReads.clear(); - memoryReservation.release(); - - if (earlyEnd) { - endVersion = lastRefMessageVersion + 1; - onlySpilled = true; - } else { - messages.serializeBytes(messages2.toValue()); - } - } - } else { - if (reqOnlySpilled) { - endVersion = logData->persistentDataDurableVersion + 1; - } else { - peekMessagesFromMemory(logData, tag, begin, messages, endVersion); - } - - //TraceEvent("TLogPeekResults", self->dbgid).detail("ForAddress", req.reply.getEndpoint().getPrimaryAddress()).detail("MessageBytes", messages.getLength()).detail("NextEpoch", next_pos.epoch).detail("NextSeq", next_pos.sequence).detail("NowSeq", self->sequence.getNextSequence()); - } - - TLogPeekReply reply; - reply.maxKnownVersion = logData->version.get(); - reply.minKnownCommittedVersion = logData->minKnownCommittedVersion; - reply.messages = StringRef(reply.arena, messages.toValue()); - reply.end = endVersion; - reply.onlySpilled = onlySpilled; - - //TraceEvent("TlogPeek", self->dbgid).detail("LogId", logData->logId).detail("Tag", req.tag.toString()). - // detail("BeginVer", req.begin).detail("EndVer", reply.end). - // detail("MsgBytes", reply.messages.expectedSize()). 
- // detail("ForAddress", req.reply.getEndpoint().getPrimaryAddress()); - - if (sequence.present()) { - auto& trackerData = logData->peekTracker[peekId]; - trackerData.lastUpdate = now(); - - double queueT = blockStart - queueStart; - double blockT = workStart - blockStart; - double workT = now() - workStart; - - trackerData.totalPeeks++; - trackerData.replyBytes += reply.messages.size(); - - if (queueT > trackerData.queueMax) - trackerData.queueMax = queueT; - if (blockT > trackerData.blockMax) - trackerData.blockMax = blockT; - if (workT > trackerData.workMax) - trackerData.workMax = workT; - - trackerData.queueTime += queueT; - trackerData.blockTime += blockT; - trackerData.workTime += workT; - - auto& sequenceData = trackerData.sequence_version[sequenceNum + 1]; - if (trackerData.sequence_version.size() && sequenceNum + 1 < trackerData.sequence_version.begin()->first) { - if (!sequenceData.isSet()) { - // It would technically be more correct to .send({req.begin, req.onlySpilled}), as the next - // request might still be in the window of active requests, but LogSystemPeekCursor will - // throw away all future responses upon getting an operation_obsolete(), so computing a - // response will probably be a waste of CPU. - sequenceData.sendError(operation_obsolete()); - } - throw operation_obsolete(); - } - if (sequenceData.isSet()) { - trackerData.duplicatePeeks++; - if (sequenceData.getFuture().get().first != reply.end) { - TEST(true); // 1 tlog peek second attempt ended at a different version (2) - throw operation_obsolete(); - } - } else { - sequenceData.send(std::make_pair(reply.end, reply.onlySpilled)); - } - reply.begin = begin; - } - - return reply; -} - -// This actor keep pushing TLogPeekStreamReply until it's removed from the cluster or should recover -ACTOR Future tLogPeekStream(TLogData* self, TLogPeekStreamRequest req, Reference logData) { - self->activePeekStreams++; - TraceEvent(SevDebug, "TLogPeekStream", logData->logId).detail("Token", req.reply.getEndpoint().token); - - state Version begin = req.begin; - state bool onlySpilled = false; - if (req.tag.locality == tagLocalityTxs && req.tag.id >= logData->txsTags && logData->txsTags > 0) { - req.tag.id = req.tag.id % logData->txsTags; - } - req.reply.setByteLimit(std::min(SERVER_KNOBS->MAXIMUM_PEEK_BYTES, req.limitBytes)); - loop { - state TLogPeekStreamReply reply; - try { - wait(req.reply.onReady() && - store(reply.rep, peekTLog(self, logData, begin, req.tag, req.returnIfBlocked, onlySpilled))); - req.reply.send(reply); - begin = reply.rep.end; - onlySpilled = reply.rep.onlySpilled; - wait(delay(0, g_network->getCurrentTask())); - } catch (Error& e) { - self->activePeekStreams--; - TraceEvent(SevDebug, "TLogPeekStreamEnd", logData->logId).error(e, true); - - if (e.code() == error_code_end_of_stream) { - req.reply.sendError(e); - return Void(); - } else if (e.code() == error_code_operation_obsolete) { - // reply stream is cancelled on the client - return Void(); - } else { - throw; - } - } - } -} - -ACTOR Future tLogPeekMessages(TLogData* self, TLogPeekRequest req, Reference logData) { - /*try { - TLogPeekReply reply = - wait(peekTLog(self, logData, req.begin, req.tag, req.returnIfBlocked, req.onlySpilled, req.sequence)); - req.reply.send(reply); - } catch (Error& e) { - if (e.code() == error_code_timed_out || e.code() == error_code_operation_obsolete || - e.code() == error_code_end_of_stream) { - req.reply.sendError(e); - return Void(); - } else { - throw; - } - } - - return Void();*/ - state BinaryWriter 
messages(Unversioned()); - state BinaryWriter messages2(Unversioned()); - state int sequence = -1; - state UID peekId; - state double queueStart = now(); - - if (req.tag.locality == tagLocalityTxs && req.tag.id >= logData->txsTags && logData->txsTags > 0) { - req.tag.id = req.tag.id % logData->txsTags; - } - - if (req.sequence.present()) { - try { - peekId = req.sequence.get().first; - sequence = req.sequence.get().second; - if (sequence >= SERVER_KNOBS->PARALLEL_GET_MORE_REQUESTS && - logData->peekTracker.find(peekId) == logData->peekTracker.end()) { - throw operation_obsolete(); - } - auto& trackerData = logData->peekTracker[peekId]; - if (sequence == 0 && trackerData.sequence_version.find(0) == trackerData.sequence_version.end()) { - trackerData.tag = req.tag; - trackerData.sequence_version[0].send(std::make_pair(req.begin, req.onlySpilled)); - } - auto seqBegin = trackerData.sequence_version.begin(); - // The peek cursor and this comparison need to agree about the maximum number of in-flight requests. - while (trackerData.sequence_version.size() && - seqBegin->first <= sequence - SERVER_KNOBS->PARALLEL_GET_MORE_REQUESTS) { - if (seqBegin->second.canBeSet()) { - seqBegin->second.sendError(operation_obsolete()); - } - trackerData.sequence_version.erase(seqBegin); - seqBegin = trackerData.sequence_version.begin(); - } - - if (trackerData.sequence_version.size() && sequence < seqBegin->first) { - throw operation_obsolete(); - } - - Future> fPrevPeekData = trackerData.sequence_version[sequence].getFuture(); - if (fPrevPeekData.isReady()) { - trackerData.unblockedPeeks++; - double t = now() - trackerData.lastUpdate; - if (t > trackerData.idleMax) - trackerData.idleMax = t; - trackerData.idleTime += t; - } - trackerData.lastUpdate = now(); - std::pair prevPeekData = wait(fPrevPeekData); - req.begin = std::max(prevPeekData.first, req.begin); - req.onlySpilled = prevPeekData.second; - wait(yield()); - } catch (Error& e) { - if (e.code() == error_code_timed_out || e.code() == error_code_operation_obsolete) { - req.reply.sendError(e); - return Void(); - } else { - throw; - } - } - } - - state double blockStart = now(); - - if (req.returnIfBlocked && logData->version.get() < req.begin) { - req.reply.sendError(end_of_stream()); - if (req.sequence.present()) { - auto& trackerData = logData->peekTracker[peekId]; - auto& sequenceData = trackerData.sequence_version[sequence + 1]; - if (!sequenceData.isSet()) { - sequenceData.send(std::make_pair(req.begin, req.onlySpilled)); - } - } - return Void(); - } - - //TraceEvent("TLogPeekMessages0", self->dbgid).detail("ReqBeginEpoch", req.begin.epoch).detail("ReqBeginSeq", req.begin.sequence).detail("Epoch", self->epoch()).detail("PersistentDataSeq", self->persistentDataSequence).detail("Tag1", req.tag1).detail("Tag2", req.tag2); - // Wait until we have something to return that the caller doesn't already have - if (logData->version.get() < req.begin) { - wait(logData->version.whenAtLeast(req.begin)); - wait(delay(SERVER_KNOBS->TLOG_PEEK_DELAY, g_network->getCurrentTask())); - } - - if (req.tag.locality == tagLocalityLogRouter) { - wait(self->concurrentLogRouterReads.take()); - state FlowLock::Releaser globalReleaser(self->concurrentLogRouterReads); - wait(delay(0.0, TaskPriority::Low)); - } - - if (req.begin <= logData->persistentDataDurableVersion && req.tag.locality != tagLocalityTxs && req.tag != txsTag) { - // Reading spilled data will almost always imply that the storage server is >5s behind the rest - // of the cluster. 
We shouldn't prioritize spending CPU on helping this server catch up - // slightly faster over keeping the rest of the cluster operating normally. - // txsTag is only ever peeked on recovery, and we would still wish to prioritize requests - // that impact recovery duration. - wait(delay(0, TaskPriority::TLogSpilledPeekReply)); - } - - state double workStart = now(); - - Version poppedVer = poppedVersion(logData, req.tag); - if (poppedVer > req.begin) { - TLogPeekReply rep; - rep.maxKnownVersion = logData->version.get(); - rep.minKnownCommittedVersion = logData->minKnownCommittedVersion; - rep.popped = poppedVer; - rep.end = poppedVer; - rep.onlySpilled = false; - - if (req.sequence.present()) { - auto& trackerData = logData->peekTracker[peekId]; - auto& sequenceData = trackerData.sequence_version[sequence + 1]; - trackerData.lastUpdate = now(); - if (trackerData.sequence_version.size() && sequence + 1 < trackerData.sequence_version.begin()->first) { - req.reply.sendError(operation_obsolete()); - if (!sequenceData.isSet()) - sequenceData.sendError(operation_obsolete()); - return Void(); - } - if (sequenceData.isSet()) { - if (sequenceData.getFuture().get().first != rep.end) { - TEST(true); // tlog peek second attempt ended at a different version - req.reply.sendError(operation_obsolete()); - return Void(); - } - } else { - sequenceData.send(std::make_pair(rep.end, rep.onlySpilled)); - } - rep.begin = req.begin; - } - - req.reply.send(rep); - return Void(); - } - - state Version endVersion = logData->version.get() + 1; - state bool onlySpilled = false; - - // grab messages from disk - //TraceEvent("TLogPeekMessages", self->dbgid).detail("ReqBeginEpoch", req.begin.epoch).detail("ReqBeginSeq", req.begin.sequence).detail("Epoch", self->epoch()).detail("PersistentDataSeq", self->persistentDataSequence).detail("Tag1", req.tag1).detail("Tag2", req.tag2); - if (req.begin <= logData->persistentDataDurableVersion) { - // Just in case the durable version changes while we are waiting for the read, we grab this data from memory. We - // may or may not actually send it depending on whether we get enough data from disk. SOMEDAY: Only do this if - // an initial attempt to read from disk results in insufficient data and the required data is no longer in - // memory SOMEDAY: Should we only send part of the messages we collected, to actually limit the size of the - // result? - - if (req.onlySpilled) { - endVersion = logData->persistentDataDurableVersion + 1; - } else { - peekMessagesFromMemory(logData, req.tag, req.begin, messages2, endVersion); - } - - if (req.tag.locality == tagLocalityTxs || req.tag == txsTag) { + if (tag.locality == tagLocalityTxs || tag == txsTag) { RangeResult kvs = wait(self->persistentData->readRange( - KeyRangeRef(persistTagMessagesKey(logData->logId, req.tag, req.begin), - persistTagMessagesKey(logData->logId, req.tag, logData->persistentDataDurableVersion + 1)), + KeyRangeRef(persistTagMessagesKey(logData->logId, tag, begin), + persistTagMessagesKey(logData->logId, tag, logData->persistentDataDurableVersion + 1)), SERVER_KNOBS->DESIRED_TOTAL_BYTES, SERVER_KNOBS->DESIRED_TOTAL_BYTES)); @@ -2099,12 +1721,11 @@ ACTOR Future tLogPeekMessages(TLogData* self, TLogPeekRequest req, Referen } else { // FIXME: Limit to approximately DESIRED_TOTATL_BYTES somehow. 
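The FIXME above marks a real gap: spilled-reference peeks are capped at TLOG_SPILL_REFERENCE_MAX_BATCHES_PER_PEEK batches but have no byte cap. A minimal standalone sketch of one way a byte budget could be layered onto the same loop shape; SpilledRef and planSpilledReads are illustrative stand-ins, not the real SpilledData/IDiskQueue types:

#include <cstdint>
#include <utility>
#include <vector>

// Stand-in for the real SpilledData record parsed out of kvrefs.
struct SpilledRef {
    int64_t version; // first version covered by this spilled batch
    int64_t start;   // offset of the batch in the disk queue
    int32_t length;  // batch size in bytes
};

// Collect [start, end) disk-queue read locations in version order, stopping
// once either the per-peek batch cap or a byte budget is exhausted. The bool
// is the earlyEnd signal: more spilled data remains, so the caller must not
// advance the reported end version past the last batch actually read.
std::pair<std::vector<std::pair<int64_t, int64_t>>, bool>
planSpilledReads(const std::vector<SpilledRef>& refs,
                 int64_t begin,
                 size_t maxBatches,
                 int64_t byteBudget) {
    std::vector<std::pair<int64_t, int64_t>> locations;
    int64_t bytes = 0;
    for (const SpilledRef& ref : refs) {
        if (locations.size() >= maxBatches || bytes >= byteBudget)
            return { locations, true }; // earlyEnd: stop before endVersion advances
        if (ref.version >= begin) {
            locations.emplace_back(ref.start, ref.start + ref.length);
            bytes += ref.length;
        }
    }
    return { locations, false };
}

The earlyEnd flag matters because the reply's end version must not move past the last batch actually read, or the cursor would silently skip data.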
RangeResult kvrefs = wait(self->persistentData->readRange( - KeyRangeRef( - persistTagMessageRefsKey(logData->logId, req.tag, req.begin), - persistTagMessageRefsKey(logData->logId, req.tag, logData->persistentDataDurableVersion + 1)), + KeyRangeRef(persistTagMessageRefsKey(logData->logId, tag, begin), + persistTagMessageRefsKey(logData->logId, tag, logData->persistentDataDurableVersion + 1)), SERVER_KNOBS->TLOG_SPILL_REFERENCE_MAX_BATCHES_PER_PEEK + 1)); - //TraceEvent("TLogPeekResults", self->dbgid).detail("ForAddress", req.reply.getEndpoint().getPrimaryAddress()).detail("Tag1Results", s1).detail("Tag2Results", s2).detail("Tag1ResultsLim", kv1.size()).detail("Tag2ResultsLim", kv2.size()).detail("Tag1ResultsLast", kv1.size() ? kv1[0].key : "").detail("Tag2ResultsLast", kv2.size() ? kv2[0].key : "").detail("Limited", limited).detail("NextEpoch", next_pos.epoch).detail("NextSeq", next_pos.sequence).detail("NowEpoch", self->epoch()).detail("NowSeq", self->sequence.getNextSequence()); + //TraceEvent("TLogPeekResults", self->dbgid).detail("ForAddress", reply.getEndpoint().getPrimaryAddress()).detail("Tag1Results", s1).detail("Tag2Results", s2).detail("Tag1ResultsLim", kv1.size()).detail("Tag2ResultsLim", kv2.size()).detail("Tag1ResultsLast", kv1.size() ? kv1[0].key : "").detail("Tag2ResultsLast", kv2.size() ? kv2[0].key : "").detail("Limited", limited).detail("NextEpoch", next_pos.epoch).detail("NextSeq", next_pos.sequence).detail("NowEpoch", self->epoch()).detail("NowSeq", self->sequence.getNextSequence()); state std::vector> commitLocations; state bool earlyEnd = false; @@ -2121,7 +1742,7 @@ ACTOR Future tLogPeekMessages(TLogData* self, TLogPeekRequest req, Referen earlyEnd = true; break; } - if (sd.version >= req.begin) { + if (sd.version >= begin) { firstVersion = std::min(firstVersion, sd.version); const IDiskQueue::location end = sd.start.lo + sd.length; commitLocations.emplace_back(sd.start, end); @@ -2163,7 +1784,7 @@ ACTOR Future tLogPeekMessages(TLogData* self, TLogPeekRequest req, Referen messages << VERSION_HEADER << entry.version; std::vector rawMessages = - wait(parseMessagesForTag(entry.messages, req.tag, logData->logRouterTags)); + wait(parseMessagesForTag(entry.messages, tag, logData->logRouterTags)); for (const StringRef& msg : rawMessages) { messages.serializeBytes(msg); } @@ -2183,10 +1804,10 @@ ACTOR Future tLogPeekMessages(TLogData* self, TLogPeekRequest req, Referen } } } else { - if (req.onlySpilled) { + if (reqOnlySpilled) { endVersion = logData->persistentDataDurableVersion + 1; } else { - peekMessagesFromMemory(logData, req.tag, req.begin, messages, endVersion); + peekMessagesFromMemory(logData, tag, begin, messages, endVersion); } //TraceEvent("TLogPeekResults", self->dbgid).detail("ForAddress", req.reply.getEndpoint().getPrimaryAddress()).detail("MessageBytes", messages.getLength()).detail("NextEpoch", next_pos.epoch).detail("NextSeq", next_pos.sequence).detail("NowSeq", self->sequence.getNextSequence()); @@ -2195,13 +1816,16 @@ ACTOR Future tLogPeekMessages(TLogData* self, TLogPeekRequest req, Referen TLogPeekReply reply; reply.maxKnownVersion = logData->version.get(); reply.minKnownCommittedVersion = logData->minKnownCommittedVersion; - reply.messages = messages.toValue(); + reply.messages = StringRef(reply.arena, messages.toValue()); reply.end = endVersion; reply.onlySpilled = onlySpilled; - //TraceEvent("TlogPeek", self->dbgid).detail("LogId", logData->logId).detail("EndVer", reply.end).detail("MsgBytes", reply.messages.expectedSize()).detail("ForAddress", 
req.reply.getEndpoint().getPrimaryAddress()); + //TraceEvent("TlogPeek", self->dbgid).detail("LogId", logData->logId).detail("Tag", req.tag.toString()). + // detail("BeginVer", req.begin).detail("EndVer", reply.end). + // detail("MsgBytes", reply.messages.expectedSize()). + // detail("ForAddress", req.reply.getEndpoint().getPrimaryAddress()); - if (req.sequence.present()) { + if (sequence.present()) { auto& trackerData = logData->peekTracker[peekId]; trackerData.lastUpdate = now(); @@ -2223,27 +1847,84 @@ ACTOR Future tLogPeekMessages(TLogData* self, TLogPeekRequest req, Referen trackerData.blockTime += blockT; trackerData.workTime += workT; - auto& sequenceData = trackerData.sequence_version[sequence + 1]; - if (trackerData.sequence_version.size() && sequence + 1 < trackerData.sequence_version.begin()->first) { - req.reply.sendError(operation_obsolete()); - if (!sequenceData.isSet()) + auto& sequenceData = trackerData.sequence_version[sequenceNum + 1]; + if (trackerData.sequence_version.size() && sequenceNum + 1 < trackerData.sequence_version.begin()->first) { + if (!sequenceData.isSet()) { + // It would technically be more correct to .send({req.begin, req.onlySpilled}), as the next + // request might still be in the window of active requests, but LogSystemPeekCursor will + // throw away all future responses upon getting an operation_obsolete(), so computing a + // response will probably be a waste of CPU. sequenceData.sendError(operation_obsolete()); - return Void(); + } + throw operation_obsolete(); } if (sequenceData.isSet()) { trackerData.duplicatePeeks++; if (sequenceData.getFuture().get().first != reply.end) { - TEST(true); // tlog peek second attempt ended at a different version (2) - req.reply.sendError(operation_obsolete()); - return Void(); + TEST(true); // 1 tlog peek second attempt ended at a different version (2) + throw operation_obsolete(); } } else { sequenceData.send(std::make_pair(reply.end, reply.onlySpilled)); } - reply.begin = req.begin; + reply.begin = begin; + } + + return reply; +} + +// This actor keep pushing TLogPeekStreamReply until it's removed from the cluster or should recover +ACTOR Future tLogPeekStream(TLogData* self, TLogPeekStreamRequest req, Reference logData) { + self->activePeekStreams++; + TraceEvent(SevDebug, "TLogPeekStream", logData->logId).detail("Token", req.reply.getEndpoint().token); + + state Version begin = req.begin; + state bool onlySpilled = false; + if (req.tag.locality == tagLocalityTxs && req.tag.id >= logData->txsTags && logData->txsTags > 0) { + req.tag.id = req.tag.id % logData->txsTags; + } + req.reply.setByteLimit(std::min(SERVER_KNOBS->MAXIMUM_PEEK_BYTES, req.limitBytes)); + loop { + state TLogPeekStreamReply reply; + try { + wait(req.reply.onReady() && + store(reply.rep, peekTLog(self, logData, begin, req.tag, req.returnIfBlocked, onlySpilled))); + req.reply.send(reply); + begin = reply.rep.end; + onlySpilled = reply.rep.onlySpilled; + wait(delay(0, g_network->getCurrentTask())); + } catch (Error& e) { + self->activePeekStreams--; + TraceEvent(SevDebug, "TLogPeekStreamEnd", logData->logId).error(e, true); + + if (e.code() == error_code_end_of_stream) { + req.reply.sendError(e); + return Void(); + } else if (e.code() == error_code_operation_obsolete) { + // reply stream is cancelled on the client + return Void(); + } else { + throw; + } + } + } +} + +ACTOR Future tLogPeekMessages(TLogData* self, TLogPeekRequest req, Reference logData) { + try { + TLogPeekReply reply = + wait(peekTLog(self, logData, req.begin, req.tag, 
req.returnIfBlocked, req.onlySpilled, req.sequence)); + req.reply.send(reply); + } catch (Error& e) { + if (e.code() == error_code_timed_out || e.code() == error_code_operation_obsolete || + e.code() == error_code_end_of_stream) { + req.reply.sendError(e); + return Void(); + } else { + throw; + } } - req.reply.send(reply); return Void(); } @@ -3211,9 +2892,9 @@ ACTOR Future restorePersistentState(TLogData* self, logsByVersion.emplace_back(ver, id1); TraceEvent("TLogPersistentStateRestore", self->dbgid) - .detail("LogId", logData->logId) - .detail("Ver", ver) - .detail("RecoveryCount", logData->recoveryCount); + .detail("LogId", logData->logId) + .detail("Ver", ver) + .detail("RecoveryCount", logData->recoveryCount); // Restore popped keys. Pop operations that took place after the last (committed) updatePersistentDataVersion // might be lost, but that is fine because we will get the corresponding data back, too. tagKeys = prefixRange(rawId.withPrefix(persistTagPoppedKeys.begin)); From cd32478b52a393b5e2d80e7b2222d12d1527cab6 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Thu, 22 Jul 2021 15:45:59 -0700 Subject: [PATCH 14/29] memory error(Simple config) --- fdbrpc/fdbrpc.h | 62 +++++++++++-------------- fdbserver/LogRouter.actor.cpp | 11 ++--- fdbserver/LogSystemPeekCursor.actor.cpp | 33 ++++++------- fdbserver/OldTLogServer_4_6.actor.cpp | 5 +- fdbserver/OldTLogServer_6_0.actor.cpp | 5 +- fdbserver/OldTLogServer_6_2.actor.cpp | 5 +- fdbserver/QuietDatabase.actor.cpp | 2 + fdbserver/SimulatedCluster.actor.cpp | 2 +- fdbserver/TLogServer.actor.cpp | 5 +- 9 files changed, 55 insertions(+), 75 deletions(-) diff --git a/fdbrpc/fdbrpc.h b/fdbrpc/fdbrpc.h index 1f40bb47a7..5a56fc120c 100644 --- a/fdbrpc/fdbrpc.h +++ b/fdbrpc/fdbrpc.h @@ -277,9 +277,9 @@ struct AcknowledgementReceiver final : FlowReceiver, FastAllocated::operator new; using FastAllocated::operator delete; - uint64_t bytesSent; - uint64_t bytesAcknowledged; - uint64_t bytesLimit; + int64_t bytesSent; + int64_t bytesAcknowledged; + int64_t bytesLimit; Promise ready; Future failures; @@ -300,7 +300,7 @@ struct AcknowledgementReceiver final : FlowReceiver, FastAllocated hold = ready; hold.sendError(message.getError()); } else { - ASSERT(message.get().bytes > bytesAcknowledged); + ASSERT(message.get().bytes > bytesAcknowledged || (message.get().bytes < 0 && bytesAcknowledged > 0)); bytesAcknowledged = message.get().bytes; if (ready.isValid() && bytesSent - bytesAcknowledged < bytesLimit) { Promise hold = ready; @@ -336,6 +336,8 @@ struct NetNotifiedQueueWithAcknowledgements final : NotifiedQueue, void destroy() override { delete this; } void receive(ArenaObjectReader& reader) override { this->addPromiseRef(); + TraceEvent(SevDebug, "NetNotifiedQueueWithAcknowledgementsReceive") + .detail("PromiseRef", this->getPromiseReferenceCount()); ErrorOr> message; reader.deserialize(message); @@ -358,25 +360,19 @@ struct NetNotifiedQueueWithAcknowledgements final : NotifiedQueue, // send an ack immediately if (acknowledgements.getRawEndpoint().isValid()) { acknowledgements.bytesAcknowledged += message.get().asUnderlyingType().expectedSize(); - // int64_t overflow: we need to reset this stream - if (acknowledgements.bytesAcknowledged > std::numeric_limits::max()) { - FlowTransport::transport().sendUnreliable( - SerializeSource>(operation_obsolete()), - acknowledgements.getEndpoint(TaskPriority::ReadSocket), - false); - } else { - FlowTransport::transport().sendUnreliable( - SerializeSource>( - 
AcknowledgementReply(acknowledgements.bytesAcknowledged)),
- acknowledgements.getEndpoint(TaskPriority::ReadSocket),
- false);
- }
+ FlowTransport::transport().sendUnreliable(
+ SerializeSource>(
+ AcknowledgementReply(acknowledgements.bytesAcknowledged)),
+ acknowledgements.getEndpoint(TaskPriority::NoDeliverDelay),
+ false);
}
}
this->send(std::move(message.get().asUnderlyingType()));
}
this->delPromiseRef();
+ TraceEvent(SevDebug, "NetNotifiedQueueWithAcknowledgementsReceiveEnd")
+ .detail("PromiseRef", this->getPromiseReferenceCount());
}

T pop() override {
@@ -384,17 +380,10 @@ struct NetNotifiedQueueWithAcknowledgements final : NotifiedQueue,
// A reply that has been queued up is being consumed, so send an ack to the server
if (acknowledgements.getRawEndpoint().isValid()) {
acknowledgements.bytesAcknowledged += res.expectedSize();
- if (acknowledgements.bytesAcknowledged > std::numeric_limits::max()) {
- FlowTransport::transport().sendUnreliable(
- SerializeSource>(operation_obsolete()),
- acknowledgements.getEndpoint(TaskPriority::ReadSocket),
- false);
- } else {
- FlowTransport::transport().sendUnreliable(SerializeSource>(
- AcknowledgementReply(acknowledgements.bytesAcknowledged)),
- acknowledgements.getEndpoint(TaskPriority::ReadSocket),
- false);
- }
+ FlowTransport::transport().sendUnreliable(SerializeSource>(
+ AcknowledgementReply(acknowledgements.bytesAcknowledged)),
+ acknowledgements.getEndpoint(TaskPriority::NoDeliverDelay),
+ false);
}
return res;
}
@@ -408,7 +397,8 @@ struct NetNotifiedQueueWithAcknowledgements final : NotifiedQueue,
false);
}
if (isRemoteEndpoint() && !sentError && !acknowledgements.failures.isReady()) {
- // The ReplyPromiseStream was cancelled before sending an error, so the storage server must have died
+ // Notify the client that the ReplyPromiseStream was cancelled before sending an error, so the storage
+ // server must have died
FlowTransport::transport().sendUnreliable(SerializeSource>>(broken_promise()),
getEndpoint(TaskPriority::NoDeliverDelay),
false);
@@ -431,6 +421,7 @@ public:
void send(U&& value) const {
if (queue->isRemoteEndpoint()) {
if (!queue->acknowledgements.getRawEndpoint().isValid()) {
+ // Register the acknowledgement receiver on the sender and tell the receiver where to send acknowledgements
value.acknowledgeToken = queue->acknowledgements.getEndpoint(TaskPriority::NoDeliverDelay).token;
}
queue->acknowledgements.bytesSent += value.expectedSize();
@@ -710,16 +701,17 @@ public:
template
ReplyPromiseStream getReplyStream(const X& value) const {
- Future disc = makeDependent(IFailureMonitor::failureMonitor()).onDisconnectOrFailure(getEndpoint());
- auto& p = getReplyPromiseStream(value);
- Reference peer;
+ auto p = getReplyPromiseStream(value);
if (queue->isRemoteEndpoint()) {
- peer = FlowTransport::transport().sendUnreliable(SerializeSource(value), getEndpoint(), true);
+ Future disc =
+ makeDependent(IFailureMonitor::failureMonitor()).onDisconnectOrFailure(getEndpoint());
+ Reference peer =
+ FlowTransport::transport().sendUnreliable(SerializeSource(value), getEndpoint(), true);
+ // FIXME: defer sending the message until we know the connection is established
+ endStreamOnDisconnect(disc, p, getEndpoint(), peer);
} else {
send(value);
}
- // FIXME: defer sending the message until we know the connection is established
- endStreamOnDisconnect(disc, p, getEndpoint(), peer);
return p;
}

diff --git a/fdbserver/LogRouter.actor.cpp b/fdbserver/LogRouter.actor.cpp
index eb3ea81326..f91a138b9e 100644
--- a/fdbserver/LogRouter.actor.cpp
+++
b/fdbserver/LogRouter.actor.cpp @@ -559,7 +559,7 @@ ACTOR Future peekLogRouter(LogRouterData* self, // This actor keep pushing TLogPeekStreamReply until it's removed from the cluster or should recover ACTOR Future logRouterPeekStream(LogRouterData* self, TLogPeekStreamRequest req) { self->activePeekStreams++; - TraceEvent(SevDebug, "TLogPeekStream", self->dbgid).detail("Token", req.reply.getEndpoint().token); + TraceEvent(SevDebug, "LogRouterPeekStream", self->dbgid).detail("Token", req.reply.getEndpoint().token); state Version begin = req.begin; state bool onlySpilled = false; @@ -576,16 +576,13 @@ ACTOR Future logRouterPeekStream(LogRouterData* self, TLogPeekStreamReques wait(delay(0, g_network->getCurrentTask())); } catch (Error& e) { self->activePeekStreams--; - TraceEvent(SevDebug, "TLogPeekStreamEnd", self->dbgid).error(e, true); + TraceEvent(SevDebug, "LogRouterPeekStreamEnd", self->dbgid).error(e, true); if (e.code() == error_code_no_action_needed) { - return Void(); - } else if (e.code() == error_code_end_of_stream) { + req.reply.sendError(end_of_stream()); + } else if (e.code() == error_code_end_of_stream || e.code() == error_code_operation_obsolete) { req.reply.sendError(e); return Void(); - } else if (e.code() == error_code_operation_obsolete) { - // reply stream is cancelled on the client - return Void(); } else { throw; } diff --git a/fdbserver/LogSystemPeekCursor.actor.cpp b/fdbserver/LogSystemPeekCursor.actor.cpp index b4e76d3eb8..4f3d7a6d0b 100644 --- a/fdbserver/LogSystemPeekCursor.actor.cpp +++ b/fdbserver/LogSystemPeekCursor.actor.cpp @@ -38,7 +38,8 @@ ACTOR Future tryEstablishPeekStream(ILogSystem::ServerPeekCursor* self) { self->peekReplyStream = self->interf->get().interf().peekStreamMessages.getReplyStream(TLogPeekStreamRequest( self->messageVersion.version, self->tag, self->returnIfBlocked, std::numeric_limits::max())); TraceEvent(SevDebug, "SPC_StreamCreated", self->randomID) - .detail("PeerAddress", self->interf->get().interf().peekStreamMessages.getEndpoint().getPrimaryAddress()); + .detail("PeerAddress", self->interf->get().interf().peekStreamMessages.getEndpoint().getPrimaryAddress()) + .detail("PeerToken", self->interf->get().interf().peekStreamMessages.getEndpoint().token); return Void(); } @@ -350,10 +351,10 @@ ACTOR Future serverPeekStreamGetMore(ILogSystem::ServerPeekCursor* self, T fPeekReply) : Never())) { updateCursorWithReply(self, res); - TraceEvent("SPC_GetMoreB", self->randomID) - .detail("Has", self->hasMessage()) - .detail("End", res.end) - .detail("Popped", res.popped.present() ? res.popped.get() : 0); + // TraceEvent("SPC_GetMoreB", self->randomID) + // .detail("Has", self->hasMessage()) + // .detail("End", res.end) + // .detail("Popped", res.popped.present() ? 
res.popped.get() : 0); // NOTE: delay is needed here since TLog need to be scheduled to response if there are TLog and SS // on the same machine @@ -363,7 +364,7 @@ ACTOR Future serverPeekStreamGetMore(ILogSystem::ServerPeekCursor* self, T } } catch (Error& e) { TraceEvent(SevDebug, "SPC_GetMoreB_Error", self->randomID).detail("Error", e.what()); - if (e.code() == error_code_connection_failed) { + if (e.code() == error_code_connection_failed || e.code() == error_code_operation_obsolete) { self->peekReplyStream.reset(); } else if (e.code() == error_code_end_of_stream) { self->end.reset(self->messageVersion.version); @@ -408,20 +409,20 @@ ACTOR Future serverPeekGetMore(ILogSystem::ServerPeekCursor* self, TaskPri } Future ILogSystem::ServerPeekCursor::getMore(TaskPriority taskID) { - TraceEvent("SPC_GetMore", randomID) - .detail("HasMessage", hasMessage()) - .detail("More", !more.isValid() || more.isReady()) - .detail("MessageVersion", messageVersion.toString()) - .detail("End", end.toString()); + // TraceEvent("SPC_GetMore", randomID) + // .detail("HasMessage", hasMessage()) + // .detail("More", !more.isValid() || more.isReady()) + // .detail("MessageVersion", messageVersion.toString()) + // .detail("End", end.toString()); if (hasMessage() && !parallelGetMore) return Void(); if (!more.isValid() || more.isReady()) { - // more = serverPeekStreamGetMore(this, taskID); - if (parallelGetMore || onlySpilled || futureResults.size()) { - more = serverPeekParallelGetMore(this, taskID); + more = serverPeekStreamGetMore(this, taskID); + /*if (parallelGetMore || onlySpilled || futureResults.size()) { + more = serverPeekParallelGetMore(this, taskID); } else { - more = serverPeekGetMore(this, taskID); - } + more = serverPeekGetMore(this, taskID); + }*/ } return more; } diff --git a/fdbserver/OldTLogServer_4_6.actor.cpp b/fdbserver/OldTLogServer_4_6.actor.cpp index f84a4a47c8..1ea777e550 100644 --- a/fdbserver/OldTLogServer_4_6.actor.cpp +++ b/fdbserver/OldTLogServer_4_6.actor.cpp @@ -1143,12 +1143,9 @@ ACTOR Future tLogPeekStream(TLogData* self, TLogPeekStreamRequest req, Ref self->activePeekStreams--; TraceEvent(SevDebug, "TLogPeekStreamEnd", logData->logId).error(e, true); - if (e.code() == error_code_end_of_stream) { + if (e.code() == error_code_end_of_stream || e.code() == error_code_operation_obsolete) { req.reply.sendError(e); return Void(); - } else if (e.code() == error_code_operation_obsolete) { - // reply stream is cancelled on the client - return Void(); } else { throw; } diff --git a/fdbserver/OldTLogServer_6_0.actor.cpp b/fdbserver/OldTLogServer_6_0.actor.cpp index b76a865bac..abc7c37517 100644 --- a/fdbserver/OldTLogServer_6_0.actor.cpp +++ b/fdbserver/OldTLogServer_6_0.actor.cpp @@ -1466,12 +1466,9 @@ ACTOR Future tLogPeekStream(TLogData* self, TLogPeekStreamRequest req, Ref self->activePeekStreams--; TraceEvent(SevDebug, "TLogPeekStreamEnd", logData->logId).error(e, true); - if (e.code() == error_code_end_of_stream) { + if (e.code() == error_code_end_of_stream || e.code() == error_code_operation_obsolete) { req.reply.sendError(e); return Void(); - } else if (e.code() == error_code_operation_obsolete) { - // reply stream is cancelled on the client - return Void(); } else { throw; } diff --git a/fdbserver/OldTLogServer_6_2.actor.cpp b/fdbserver/OldTLogServer_6_2.actor.cpp index 7f6bea5f57..c536aab0d4 100644 --- a/fdbserver/OldTLogServer_6_2.actor.cpp +++ b/fdbserver/OldTLogServer_6_2.actor.cpp @@ -1897,12 +1897,9 @@ ACTOR Future tLogPeekStream(TLogData* self, TLogPeekStreamRequest req, Ref 
self->activePeekStreams--; TraceEvent(SevDebug, "TLogPeekStreamEnd", logData->logId).error(e, true); - if (e.code() == error_code_end_of_stream) { + if (e.code() == error_code_end_of_stream || e.code() == error_code_operation_obsolete) { req.reply.sendError(e); return Void(); - } else if (e.code() == error_code_operation_obsolete) { - // reply stream is cancelled on the client - return Void(); } else { throw; } diff --git a/fdbserver/QuietDatabase.actor.cpp b/fdbserver/QuietDatabase.actor.cpp index 13ef1bff8c..d633352088 100644 --- a/fdbserver/QuietDatabase.actor.cpp +++ b/fdbserver/QuietDatabase.actor.cpp @@ -636,7 +636,9 @@ ACTOR Future waitForQuietDatabase(Database cx, wait(delay(5.0)); // The quiet database check (which runs at the end of every test) will always time out due to active data movement. // To get around this, quiet Database will disable the perpetual wiggle in the setup phase. + printf("------- 1 -------\n"); wait(setPerpetualStorageWiggle(cx, false, LockAware::True)); + printf("------- 2 -------\n"); // Require 3 consecutive successful quiet database checks spaced 2 second apart state int numSuccesses = 0; diff --git a/fdbserver/SimulatedCluster.actor.cpp b/fdbserver/SimulatedCluster.actor.cpp index 5f656f13f1..bcab98b9fc 100644 --- a/fdbserver/SimulatedCluster.actor.cpp +++ b/fdbserver/SimulatedCluster.actor.cpp @@ -253,7 +253,7 @@ public: // Refer to FDBTypes.h::TLogVersion. Defaults to the maximum supported version. int maxTLogVersion = TLogVersion::MAX_SUPPORTED; // Set true to simplify simulation configs for easier debugging - bool simpleConfig = false; + bool simpleConfig = true; Optional generateFearless, buggify; Optional datacenters, desiredTLogCount, commitProxyCount, grvProxyCount, resolverCount, storageEngineType, stderrSeverity, machineCount, processesPerMachine, coordinators; diff --git a/fdbserver/TLogServer.actor.cpp b/fdbserver/TLogServer.actor.cpp index 814c6ba317..3e50dc9132 100644 --- a/fdbserver/TLogServer.actor.cpp +++ b/fdbserver/TLogServer.actor.cpp @@ -1941,12 +1941,9 @@ ACTOR Future tLogPeekStream(TLogData* self, TLogPeekStreamRequest req, Ref self->activePeekStreams--; TraceEvent(SevDebug, "TLogPeekStreamEnd", logData->logId).error(e, true); - if (e.code() == error_code_end_of_stream) { + if (e.code() == error_code_end_of_stream || e.code() == error_code_operation_obsolete) { req.reply.sendError(e); return Void(); - } else if (e.code() == error_code_operation_obsolete) { - // reply stream is cancelled on the client - return Void(); } else { throw; } From 09214b11603d04eb929ecb78645d72c55dd3c8f4 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Thu, 22 Jul 2021 16:14:55 -0700 Subject: [PATCH 15/29] add stream actor to serveTLogInterf --- fdbserver/OldTLogServer_4_6.actor.cpp | 3 +++ fdbserver/OldTLogServer_6_2.actor.cpp | 3 +++ 2 files changed, 6 insertions(+) diff --git a/fdbserver/OldTLogServer_4_6.actor.cpp b/fdbserver/OldTLogServer_4_6.actor.cpp index 1ea777e550..7c3c067b92 100644 --- a/fdbserver/OldTLogServer_4_6.actor.cpp +++ b/fdbserver/OldTLogServer_4_6.actor.cpp @@ -1337,6 +1337,9 @@ ACTOR Future serveTLogInterface(TLogData* self, when(TLogPeekRequest req = waitNext(tli.peekMessages.getFuture())) { logData->addActor.send(tLogPeekMessages(self, req, logData)); } + when(TLogPeekStreamRequest req = waitNext(tli.peekStreamMessages.getFuture())) { + logData->addActor.send(tLogPeekStream(self, req, logData)); + } when(TLogPopRequest req = waitNext(tli.popMessages.getFuture())) { logData->addActor.send(tLogPop(self, req, logData)); } diff --git 
a/fdbserver/OldTLogServer_6_2.actor.cpp b/fdbserver/OldTLogServer_6_2.actor.cpp
index c536aab0d4..3d9749e0d5 100644
--- a/fdbserver/OldTLogServer_6_2.actor.cpp
+++ b/fdbserver/OldTLogServer_6_2.actor.cpp
@@ -2437,6 +2437,9 @@ ACTOR Future serveTLogInterface(TLogData* self,
when(TLogPeekRequest req = waitNext(tli.peekMessages.getFuture())) {
logData->addActor.send(tLogPeekMessages(self, req, logData));
}
+ when(TLogPeekStreamRequest req = waitNext(tli.peekStreamMessages.getFuture())) {
+ logData->addActor.send(tLogPeekStream(self, req, logData));
+ }
when(TLogPopRequest req = waitNext(tli.popMessages.getFuture())) {
logData->addActor.send(tLogPop(self, req, logData));
}

From 15871923312cf85f16f70c728ecfcf1391b92080 Mon Sep 17 00:00:00 2001
From: Xiaoxi Wang
Date: Mon, 26 Jul 2021 09:36:23 -0700
Subject: [PATCH 16/29] temporary change to fix local out-of-order delivery

---
 fdbrpc/FlowTransport.actor.cpp | 16 ++++++++++------
 flow/Knobs.cpp                 |  2 +-
 2 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/fdbrpc/FlowTransport.actor.cpp b/fdbrpc/FlowTransport.actor.cpp
index d44483da12..069f2b266b 100644
--- a/fdbrpc/FlowTransport.actor.cpp
+++ b/fdbrpc/FlowTransport.actor.cpp
@@ -158,7 +158,10 @@ const Endpoint& EndpointMap::insert(NetworkAddressList localAddresses,
NetworkMessageReceiver* EndpointMap::get(Endpoint::Token const& token) {
uint32_t index = token.second();
if (index < wellKnownEndpointCount && data[index].receiver == nullptr) {
- TraceEvent(SevWarnAlways, "WellKnownEndpointNotAdded").detail("Token", token).detail("Index", index).backtrace();
+ TraceEvent(SevWarnAlways, "WellKnownEndpointNotAdded")
+ .detail("Token", token)
+ .detail("Index", index)
+ .backtrace();
}
if (index < data.size() && data[index].token().first() == token.first() &&
((data[index].token().second() & 0xffffffff00000000LL) | index) == token.second())
@@ -799,8 +802,9 @@ Peer::Peer(TransportData* transport, NetworkAddress const& destination)
reconnectionDelay(FLOW_KNOBS->INITIAL_RECONNECTION_TIME), compatible(true), outstandingReplies(0),
incompatibleProtocolVersionNewer(false), peerReferences(-1), bytesReceived(0), lastDataPacketSentTime(now()),
pingLatencies(destination.isPublic() ? FLOW_KNOBS->PING_SAMPLE_AMOUNT : 1), lastLoggedBytesReceived(0),
- bytesSent(0), lastLoggedBytesSent(0), timeoutCount(0), lastLoggedTime(0.0), connectOutgoingCount(0), connectIncomingCount(0),
- connectFailedCount(0), connectLatencies(destination.isPublic() ? FLOW_KNOBS->NETWORK_CONNECT_SAMPLE_AMOUNT : 1),
+ bytesSent(0), lastLoggedBytesSent(0), timeoutCount(0), lastLoggedTime(0.0), connectOutgoingCount(0),
+ connectIncomingCount(0), connectFailedCount(0),
+ connectLatencies(destination.isPublic() ? FLOW_KNOBS->NETWORK_CONNECT_SAMPLE_AMOUNT : 1),
protocolVersion(Reference>>(new AsyncVar>())) {
IFailureMonitor::failureMonitor().setStatus(destination, FailureStatus(false));
}
@@ -921,9 +925,9 @@ ACTOR static void deliver(TransportData* self,
bool inReadSocket) {
// We want to run the task at the right priority. If the priority is higher than the current priority (which is
// ReadSocket) we can just upgrade. Otherwise we'll context switch so that we don't block other tasks that might run
- // with a higher priority. ReplyPromiseStream needs to guarentee that messages are recieved in the order they were
- // sent, so even in the case of local delivery those messages need to skip this delay.
- if (priority < TaskPriority::ReadSocket || (priority != TaskPriority::NoDeliverDelay && !inReadSocket)) {
+ // with a higher priority.
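The condition introduced below removes the NoDeliverDelay exemption, so local deliveries always go back through the scheduler. A toy model of the resulting rule, with illustrative priority values rather than the real TaskPriority enum:

#include <cstdio>

// Illustrative values only; not the real TaskPriority enum.
enum class Priority : int { Worker = 1, ReadSocket = 2 };

// Shape of the new check: anything below ReadSocket, and any delivery that is
// not happening on the read-socket path (including local delivery), must be
// re-enqueued via delay(0) instead of running inline, so destruction of
// queued replies stays in FIFO order.
bool mustReschedule(Priority priority, bool inReadSocket) {
    return priority < Priority::ReadSocket || !inReadSocket;
}

int main() {
    // Local delivery is rescheduled even at high priority...
    std::printf("%d\n", mustReschedule(Priority::ReadSocket, false)); // 1
    // ...while a delivery already on the read-socket path may run inline.
    std::printf("%d\n", mustReschedule(Priority::ReadSocket, true)); // 0
    return 0;
}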
+ // NOTE: don't skip delay(0) for local delivery, since doing so could cause out-of-order object destruction.
+ if (priority < TaskPriority::ReadSocket || !inReadSocket) {
wait(delay(0, priority));
} else {
g_network->setCurrentTask(priority);

diff --git a/flow/Knobs.cpp b/flow/Knobs.cpp
index 7a8f1e24f7..25c6e32b80 100644
--- a/flow/Knobs.cpp
+++ b/flow/Knobs.cpp
@@ -184,7 +184,7 @@ void FlowKnobs::initialize(Randomize randomize, IsSimulated isSimulated) {
init( FAST_NETWORK_LATENCY, 800e-6 );
init( SLOW_NETWORK_LATENCY, 100e-3 );
init( MAX_CLOGGING_LATENCY, 0 ); if( randomize && BUGGIFY ) MAX_CLOGGING_LATENCY = 0.1 * deterministicRandom()->random01();
- init( MAX_BUGGIFIED_DELAY, 0 ); if( randomize && BUGGIFY ) MAX_BUGGIFIED_DELAY = 0.2 * deterministicRandom()->random01();
+ init( MAX_BUGGIFIED_DELAY, 0 ); // if( randomize && BUGGIFY ) MAX_BUGGIFIED_DELAY = 0.2 * deterministicRandom()->random01();
init( SIM_CONNECT_ERROR_MODE, deterministicRandom()->randomInt(0,3) );

//Tracefiles

From c6b0de1264fb16d2701945856aca760e9ee31ba1 Mon Sep 17 00:00:00 2001
From: Xiaoxi Wang
Date: Mon, 26 Jul 2021 09:36:53 -0700
Subject: [PATCH 17/29] problem: OOM

---
 fdbrpc/fdbrpc.h                         | 14 +++++++-------
 fdbserver/LogRouter.actor.cpp           |  3 ++-
 fdbserver/LogSystemPeekCursor.actor.cpp |  8 ++++----
 fdbserver/OldTLogServer_4_6.actor.cpp   |  2 ++
 fdbserver/OldTLogServer_6_0.actor.cpp   |  3 ++-
 fdbserver/OldTLogServer_6_2.actor.cpp   |  2 ++
 fdbserver/TLogServer.actor.cpp          |  3 ++-
 7 files changed, 21 insertions(+), 14 deletions(-)

diff --git a/fdbrpc/fdbrpc.h b/fdbrpc/fdbrpc.h
index 5a56fc120c..2a5cdf7ee4 100644
--- a/fdbrpc/fdbrpc.h
+++ b/fdbrpc/fdbrpc.h
@@ -328,16 +328,16 @@ struct NetNotifiedQueueWithAcknowledgements final : NotifiedQueue,
NetNotifiedQueueWithAcknowledgements(int futures, int promises, const Endpoint& remoteEndpoint)
: NotifiedQueue(futures, promises), FlowReceiver(remoteEndpoint, true) {
// A ReplyPromiseStream will be terminated on the server side if the network connection with the client breaks
- acknowledgements.failures = tagError(
- makeDependent(IFailureMonitor::failureMonitor()).onDisconnect(remoteEndpoint.getPrimaryAddress()),
- operation_obsolete());
+ acknowledgements.failures =
+ tagError(makeDependent(IFailureMonitor::failureMonitor()).onDisconnectOrFailure(remoteEndpoint),
+ operation_obsolete());
}

void destroy() override { delete this; }
void receive(ArenaObjectReader& reader) override {
this->addPromiseRef();
- TraceEvent(SevDebug, "NetNotifiedQueueWithAcknowledgementsReceive")
- .detail("PromiseRef", this->getPromiseReferenceCount());
+ // TraceEvent(SevDebug, "NetNotifiedQueueWithAcknowledgementsReceive")
+ // .detail("PromiseRef", this->getPromiseReferenceCount());
ErrorOr> message;
reader.deserialize(message);
@@ -371,8 +371,8 @@ struct NetNotifiedQueueWithAcknowledgements final : NotifiedQueue,
this->send(std::move(message.get().asUnderlyingType()));
}
this->delPromiseRef();
- TraceEvent(SevDebug, "NetNotifiedQueueWithAcknowledgementsReceiveEnd")
- .detail("PromiseRef", this->getPromiseReferenceCount());
+ // TraceEvent(SevDebug, "NetNotifiedQueueWithAcknowledgementsReceiveEnd")
+ // .detail("PromiseRef", this->getPromiseReferenceCount());
}

T pop() override {
diff --git a/fdbserver/LogRouter.actor.cpp b/fdbserver/LogRouter.actor.cpp
index 8527a7a01e..f0c634dfae 100644
--- a/fdbserver/LogRouter.actor.cpp
+++ b/fdbserver/LogRouter.actor.cpp
@@ -559,7 +559,6 @@ ACTOR Future peekLogRouter(LogRouterData* self,
// This actor keeps pushing TLogPeekStreamReply until it's removed from the
cluster or should recover ACTOR Future logRouterPeekStream(LogRouterData* self, TLogPeekStreamRequest req) { self->activePeekStreams++; - TraceEvent(SevDebug, "LogRouterPeekStream", self->dbgid).detail("Token", req.reply.getEndpoint().token); state Version begin = req.begin; state bool onlySpilled = false; @@ -690,6 +689,8 @@ ACTOR Future logRouterCore(TLogInterface interf, addActor.send(logRouterPeekMessages(&logRouterData, req)); } when(TLogPeekStreamRequest req = waitNext(interf.peekStreamMessages.getFuture())) { + TraceEvent(SevDebug, "LogRouterPeekStream", logRouterData.dbgid) + .detail("Token", interf.peekStreamMessages.getEndpoint().token); addActor.send(logRouterPeekStream(&logRouterData, req)); } when(TLogPopRequest req = waitNext(interf.popMessages.getFuture())) { diff --git a/fdbserver/LogSystemPeekCursor.actor.cpp b/fdbserver/LogSystemPeekCursor.actor.cpp index 4f3d7a6d0b..3668b52d08 100644 --- a/fdbserver/LogSystemPeekCursor.actor.cpp +++ b/fdbserver/LogSystemPeekCursor.actor.cpp @@ -351,10 +351,10 @@ ACTOR Future serverPeekStreamGetMore(ILogSystem::ServerPeekCursor* self, T fPeekReply) : Never())) { updateCursorWithReply(self, res); - // TraceEvent("SPC_GetMoreB", self->randomID) - // .detail("Has", self->hasMessage()) - // .detail("End", res.end) - // .detail("Popped", res.popped.present() ? res.popped.get() : 0); + TraceEvent("SPC_GetMoreB", self->randomID) + .detail("Has", self->hasMessage()) + .detail("End", res.end) + .detail("Popped", res.popped.present() ? res.popped.get() : 0); // NOTE: delay is needed here since TLog need to be scheduled to response if there are TLog and SS // on the same machine diff --git a/fdbserver/OldTLogServer_4_6.actor.cpp b/fdbserver/OldTLogServer_4_6.actor.cpp index 2b8c8b2cc5..97d9b8efa2 100644 --- a/fdbserver/OldTLogServer_4_6.actor.cpp +++ b/fdbserver/OldTLogServer_4_6.actor.cpp @@ -1338,6 +1338,8 @@ ACTOR Future serveTLogInterface(TLogData* self, logData->addActor.send(tLogPeekMessages(self, req, logData)); } when(TLogPeekStreamRequest req = waitNext(tli.peekStreamMessages.getFuture())) { + TraceEvent(SevDebug, "TLogPeekStream", logData->logId) + .detail("Token", tli.peekStreamMessages.getEndpoint().token); logData->addActor.send(tLogPeekStream(self, req, logData)); } when(TLogPopRequest req = waitNext(tli.popMessages.getFuture())) { diff --git a/fdbserver/OldTLogServer_6_0.actor.cpp b/fdbserver/OldTLogServer_6_0.actor.cpp index 3de98dda8e..c9ec74354d 100644 --- a/fdbserver/OldTLogServer_6_0.actor.cpp +++ b/fdbserver/OldTLogServer_6_0.actor.cpp @@ -1445,7 +1445,6 @@ ACTOR Future peekTLog(TLogData* self, // This actor keep pushing TLogPeekStreamReply until it's removed from the cluster or should recover ACTOR Future tLogPeekStream(TLogData* self, TLogPeekStreamRequest req, Reference logData) { self->activePeekStreams++; - TraceEvent(SevDebug, "TLogPeekStream", logData->logId).detail("Token", req.reply.getEndpoint().token); state Version begin = req.begin; state bool onlySpilled = false; @@ -1991,6 +1990,8 @@ ACTOR Future serveTLogInterface(TLogData* self, logData->addActor.send(tLogPeekMessages(self, req, logData)); } when(TLogPeekStreamRequest req = waitNext(tli.peekStreamMessages.getFuture())) { + TraceEvent(SevDebug, "TLogPeekStream", logData->logId) + .detail("Token", tli.peekStreamMessages.getEndpoint().token); logData->addActor.send(tLogPeekStream(self, req, logData)); } when(TLogPopRequest req = waitNext(tli.popMessages.getFuture())) { diff --git a/fdbserver/OldTLogServer_6_2.actor.cpp b/fdbserver/OldTLogServer_6_2.actor.cpp index 
64b3ce4008..d4fcc595d0 100644 --- a/fdbserver/OldTLogServer_6_2.actor.cpp +++ b/fdbserver/OldTLogServer_6_2.actor.cpp @@ -2438,6 +2438,8 @@ ACTOR Future serveTLogInterface(TLogData* self, logData->addActor.send(tLogPeekMessages(self, req, logData)); } when(TLogPeekStreamRequest req = waitNext(tli.peekStreamMessages.getFuture())) { + TraceEvent(SevDebug, "TLogPeekStream", logData->logId) + .detail("Token", tli.peekStreamMessages.getEndpoint().token); logData->addActor.send(tLogPeekStream(self, req, logData)); } when(TLogPopRequest req = waitNext(tli.popMessages.getFuture())) { diff --git a/fdbserver/TLogServer.actor.cpp b/fdbserver/TLogServer.actor.cpp index 799c8a52da..665a5cfc53 100644 --- a/fdbserver/TLogServer.actor.cpp +++ b/fdbserver/TLogServer.actor.cpp @@ -1920,7 +1920,6 @@ ACTOR Future peekTLog(TLogData* self, // This actor keep pushing TLogPeekStreamReply until it's removed from the cluster or should recover ACTOR Future tLogPeekStream(TLogData* self, TLogPeekStreamRequest req, Reference logData) { self->activePeekStreams++; - TraceEvent(SevDebug, "TLogPeekStream", logData->logId).detail("Token", req.reply.getEndpoint().token); state Version begin = req.begin; state bool onlySpilled = false; @@ -2465,6 +2464,8 @@ ACTOR Future serveTLogInterface(TLogData* self, } } when(TLogPeekStreamRequest req = waitNext(tli.peekStreamMessages.getFuture())) { + TraceEvent(SevDebug, "TLogPeekStream", logData->logId) + .detail("Token", tli.peekStreamMessages.getEndpoint().token); logData->addActor.send(tLogPeekStream(self, req, logData)); } when(TLogPeekRequest req = waitNext(tli.peekMessages.getFuture())) { From 12d4f5c26184b42df960ef3d5ab005c44dbf2341 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Wed, 28 Jul 2021 14:11:25 -0700 Subject: [PATCH 18/29] disable streaming peek for localities < 0 --- fdbclient/FDBTypes.h | 4 ++-- fdbserver/LogRouter.actor.cpp | 30 ++++++++++++++++--------- fdbserver/LogSystemPeekCursor.actor.cpp | 14 +++++++----- fdbserver/OldTLogServer_4_6.actor.cpp | 6 ++++- fdbserver/OldTLogServer_6_0.actor.cpp | 6 ++++- fdbserver/OldTLogServer_6_2.actor.cpp | 6 ++++- fdbserver/QuietDatabase.actor.cpp | 4 ++-- fdbserver/SimulatedCluster.actor.cpp | 2 +- fdbserver/TLogServer.actor.cpp | 6 ++++- 9 files changed, 53 insertions(+), 25 deletions(-) diff --git a/fdbclient/FDBTypes.h b/fdbclient/FDBTypes.h index 25e31d1134..1aa1c31273 100644 --- a/fdbclient/FDBTypes.h +++ b/fdbclient/FDBTypes.h @@ -41,8 +41,8 @@ typedef UID SpanID; enum { tagLocalitySpecial = -1, // tag with this locality means it is invalidTag (id=0), txsTag (id=1), or cacheTag (id=2) tagLocalityLogRouter = -2, - tagLocalityRemoteLog = -3, // tag created by log router for remote tLogs - tagLocalityUpgraded = -4, + tagLocalityRemoteLog = -3, // tag created by log router for remote (aka. 
not in Primary DC) tLogs + tagLocalityUpgraded = -4, // tlogs with old log format tagLocalitySatellite = -5, tagLocalityLogRouterMapped = -6, // The pseudo tag used by log routers to pop the real LogRouter tag (i.e., -2) tagLocalityTxs = -7, diff --git a/fdbserver/LogRouter.actor.cpp b/fdbserver/LogRouter.actor.cpp index f0c634dfae..945fca63be 100644 --- a/fdbserver/LogRouter.actor.cpp +++ b/fdbserver/LogRouter.actor.cpp @@ -446,6 +446,7 @@ ACTOR Future peekLogRouter(LogRouterData* self, Tag tag, bool returnIfBlocked = false, bool reqOnlySpilled = false, + bool streamReply = false, Optional> sequence = Optional>()) { state BinaryWriter messages(Unversioned()); state int sequenceNum = -1; @@ -518,7 +519,12 @@ ACTOR Future peekLogRouter(LogRouterData* self, sequenceData.send(std::make_pair(begin, reqOnlySpilled)); } } - throw no_action_needed(); // we've already replied in the past + if (streamReply) { + // for streaming reply, we skip the popped part + begin = std::min(poppedVer, self->startVersion); + } else { + throw no_action_needed(); // we've already replied in the past + } } Version endVersion = self->version.get() + 1; @@ -568,18 +574,20 @@ ACTOR Future logRouterPeekStream(LogRouterData* self, TLogPeekStreamReques state TLogPeekStreamReply reply; try { wait(req.reply.onReady() && - store(reply.rep, peekLogRouter(self, req.begin, req.tag, req.returnIfBlocked, onlySpilled))); + store(reply.rep, peekLogRouter(self, req.begin, req.tag, req.returnIfBlocked, onlySpilled, true))); req.reply.send(reply); begin = reply.rep.end; onlySpilled = reply.rep.onlySpilled; - wait(delay(0, g_network->getCurrentTask())); + if (reply.rep.end > self->version.get()) { + wait(delay(SERVER_KNOBS->TLOG_PEEK_DELAY, g_network->getCurrentTask())); + } else { + wait(delay(0, g_network->getCurrentTask())); + } } catch (Error& e) { self->activePeekStreams--; - TraceEvent(SevDebug, "LogRouterPeekStreamEnd", self->dbgid).error(e, true); - if (e.code() == error_code_no_action_needed) { - req.reply.sendError(end_of_stream()); - } else if (e.code() == error_code_end_of_stream || e.code() == error_code_operation_obsolete) { + TraceEvent(SevDebug, "LogRouterPeekStreamEnd", self->dbgid).error(e, true); + if (e.code() == error_code_end_of_stream || e.code() == error_code_operation_obsolete) { req.reply.sendError(e); return Void(); } else { @@ -592,7 +600,7 @@ ACTOR Future logRouterPeekStream(LogRouterData* self, TLogPeekStreamReques ACTOR Future logRouterPeekMessages(LogRouterData* self, TLogPeekRequest req) { try { TLogPeekReply reply = - wait(peekLogRouter(self, req.begin, req.tag, req.returnIfBlocked, req.onlySpilled, req.sequence)); + wait(peekLogRouter(self, req.begin, req.tag, req.returnIfBlocked, req.onlySpilled, false, req.sequence)); req.reply.send(reply); } catch (Error& e) { if (e.code() == error_code_no_action_needed) { @@ -689,9 +697,11 @@ ACTOR Future logRouterCore(TLogInterface interf, addActor.send(logRouterPeekMessages(&logRouterData, req)); } when(TLogPeekStreamRequest req = waitNext(interf.peekStreamMessages.getFuture())) { - TraceEvent(SevDebug, "LogRouterPeekStream", logRouterData.dbgid) + // addActor.send(logRouterPeekStream(&logRouterData, req)); + // FIXME: temporarily disable streaming peek from LogRouter + TraceEvent(SevError, "LogRouterPeekStream", logRouterData.dbgid) .detail("Token", interf.peekStreamMessages.getEndpoint().token); - addActor.send(logRouterPeekStream(&logRouterData, req)); + req.reply.sendError(operation_failed()); } when(TLogPopRequest req = 
waitNext(interf.popMessages.getFuture())) { // Request from remote tLog to pop data from LR diff --git a/fdbserver/LogSystemPeekCursor.actor.cpp b/fdbserver/LogSystemPeekCursor.actor.cpp index 3668b52d08..b097d88c8f 100644 --- a/fdbserver/LogSystemPeekCursor.actor.cpp +++ b/fdbserver/LogSystemPeekCursor.actor.cpp @@ -358,7 +358,7 @@ ACTOR Future serverPeekStreamGetMore(ILogSystem::ServerPeekCursor* self, T // NOTE: delay is needed here since TLog need to be scheduled to response if there are TLog and SS // on the same machine - wait(delay(0)); + wait(delay(0, taskID)); return Void(); } } @@ -417,12 +417,14 @@ Future ILogSystem::ServerPeekCursor::getMore(TaskPriority taskID) { if (hasMessage() && !parallelGetMore) return Void(); if (!more.isValid() || more.isReady()) { - more = serverPeekStreamGetMore(this, taskID); - /*if (parallelGetMore || onlySpilled || futureResults.size()) { - more = serverPeekParallelGetMore(this, taskID); + // TODO: remove locality check when log router support streaming peek + if (usePeekStream && tag.locality >= 0) { + more = serverPeekStreamGetMore(this, taskID); + } else if (parallelGetMore || onlySpilled || futureResults.size()) { + more = serverPeekParallelGetMore(this, taskID); } else { - more = serverPeekGetMore(this, taskID); - }*/ + more = serverPeekGetMore(this, taskID); + } } return more; } diff --git a/fdbserver/OldTLogServer_4_6.actor.cpp b/fdbserver/OldTLogServer_4_6.actor.cpp index 97d9b8efa2..e5645223c8 100644 --- a/fdbserver/OldTLogServer_4_6.actor.cpp +++ b/fdbserver/OldTLogServer_4_6.actor.cpp @@ -1138,7 +1138,11 @@ ACTOR Future tLogPeekStream(TLogData* self, TLogPeekStreamRequest req, Ref req.reply.send(reply); begin = reply.rep.end; onlySpilled = reply.rep.onlySpilled; - wait(delay(0, g_network->getCurrentTask())); + if (reply.rep.end > logData->version.get()) { + wait(delay(SERVER_KNOBS->TLOG_PEEK_DELAY, g_network->getCurrentTask())); + } else { + wait(delay(0, g_network->getCurrentTask())); + } } catch (Error& e) { self->activePeekStreams--; TraceEvent(SevDebug, "TLogPeekStreamEnd", logData->logId).error(e, true); diff --git a/fdbserver/OldTLogServer_6_0.actor.cpp b/fdbserver/OldTLogServer_6_0.actor.cpp index c9ec74354d..bdd2a64bc3 100644 --- a/fdbserver/OldTLogServer_6_0.actor.cpp +++ b/fdbserver/OldTLogServer_6_0.actor.cpp @@ -1460,7 +1460,11 @@ ACTOR Future tLogPeekStream(TLogData* self, TLogPeekStreamRequest req, Ref req.reply.send(reply); begin = reply.rep.end; onlySpilled = reply.rep.onlySpilled; - wait(delay(0, g_network->getCurrentTask())); + if (reply.rep.end > logData->version.get()) { + wait(delay(SERVER_KNOBS->TLOG_PEEK_DELAY, g_network->getCurrentTask())); + } else { + wait(delay(0, g_network->getCurrentTask())); + } } catch (Error& e) { self->activePeekStreams--; TraceEvent(SevDebug, "TLogPeekStreamEnd", logData->logId).error(e, true); diff --git a/fdbserver/OldTLogServer_6_2.actor.cpp b/fdbserver/OldTLogServer_6_2.actor.cpp index d4fcc595d0..16077d648f 100644 --- a/fdbserver/OldTLogServer_6_2.actor.cpp +++ b/fdbserver/OldTLogServer_6_2.actor.cpp @@ -1892,7 +1892,11 @@ ACTOR Future tLogPeekStream(TLogData* self, TLogPeekStreamRequest req, Ref req.reply.send(reply); begin = reply.rep.end; onlySpilled = reply.rep.onlySpilled; - wait(delay(0, g_network->getCurrentTask())); + if (reply.rep.end > logData->version.get()) { + wait(delay(SERVER_KNOBS->TLOG_PEEK_DELAY, g_network->getCurrentTask())); + } else { + wait(delay(0, g_network->getCurrentTask())); + } } catch (Error& e) { self->activePeekStreams--; TraceEvent(SevDebug, 
"TLogPeekStreamEnd", logData->logId).error(e, true); diff --git a/fdbserver/QuietDatabase.actor.cpp b/fdbserver/QuietDatabase.actor.cpp index f5e5443ca0..305543f8c9 100644 --- a/fdbserver/QuietDatabase.actor.cpp +++ b/fdbserver/QuietDatabase.actor.cpp @@ -639,9 +639,9 @@ ACTOR Future waitForQuietDatabase(Database cx, wait(delay(5.0)); // The quiet database check (which runs at the end of every test) will always time out due to active data movement. // To get around this, quiet Database will disable the perpetual wiggle in the setup phase. - printf("------- 1 -------\n"); + printf("Set perpetual_storage_wiggle=0 ...\n"); wait(setPerpetualStorageWiggle(cx, false, LockAware::True)); - printf("------- 2 -------\n"); + printf("Set perpetual_storage_wiggle=0 Done.\n"); // Require 3 consecutive successful quiet database checks spaced 2 second apart state int numSuccesses = 0; diff --git a/fdbserver/SimulatedCluster.actor.cpp b/fdbserver/SimulatedCluster.actor.cpp index bcab98b9fc..5f656f13f1 100644 --- a/fdbserver/SimulatedCluster.actor.cpp +++ b/fdbserver/SimulatedCluster.actor.cpp @@ -253,7 +253,7 @@ public: // Refer to FDBTypes.h::TLogVersion. Defaults to the maximum supported version. int maxTLogVersion = TLogVersion::MAX_SUPPORTED; // Set true to simplify simulation configs for easier debugging - bool simpleConfig = true; + bool simpleConfig = false; Optional generateFearless, buggify; Optional datacenters, desiredTLogCount, commitProxyCount, grvProxyCount, resolverCount, storageEngineType, stderrSeverity, machineCount, processesPerMachine, coordinators; diff --git a/fdbserver/TLogServer.actor.cpp b/fdbserver/TLogServer.actor.cpp index 665a5cfc53..87f7770b5c 100644 --- a/fdbserver/TLogServer.actor.cpp +++ b/fdbserver/TLogServer.actor.cpp @@ -1935,7 +1935,11 @@ ACTOR Future tLogPeekStream(TLogData* self, TLogPeekStreamRequest req, Ref req.reply.send(reply); begin = reply.rep.end; onlySpilled = reply.rep.onlySpilled; - wait(delay(0, g_network->getCurrentTask())); + if (reply.rep.end > logData->version.get()) { + wait(delay(SERVER_KNOBS->TLOG_PEEK_DELAY, g_network->getCurrentTask())); + } else { + wait(delay(0, g_network->getCurrentTask())); + } } catch (Error& e) { self->activePeekStreams--; TraceEvent(SevDebug, "TLogPeekStreamEnd", logData->logId).error(e, true); From 1c4bce17aab8f4f57615a25108bbc59175a942aa Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Fri, 30 Jul 2021 19:08:22 -0700 Subject: [PATCH 19/29] revert code refactor --- fdbserver/LogRouter.actor.cpp | 266 ++++++-------- fdbserver/LogSystemPeekCursor.actor.cpp | 2 +- fdbserver/OldTLogServer_4_6.actor.cpp | 159 +++++++- fdbserver/OldTLogServer_6_0.actor.cpp | 239 +++++++++++- fdbserver/OldTLogServer_6_2.actor.cpp | 450 +++++++++++++++++++---- fdbserver/TLogServer.actor.cpp | 467 ++++++++++++++++++++---- 6 files changed, 1279 insertions(+), 304 deletions(-) diff --git a/fdbserver/LogRouter.actor.cpp b/fdbserver/LogRouter.actor.cpp index 945fca63be..b0e39dbb9e 100644 --- a/fdbserver/LogRouter.actor.cpp +++ b/fdbserver/LogRouter.actor.cpp @@ -441,153 +441,50 @@ Version poppedVersion(LogRouterData* self, Tag tag) { return tagData->popped; } -ACTOR Future peekLogRouter(LogRouterData* self, - Version begin, - Tag tag, - bool returnIfBlocked = false, - bool reqOnlySpilled = false, - bool streamReply = false, - Optional> sequence = Optional>()) { - state BinaryWriter messages(Unversioned()); - state int sequenceNum = -1; - state UID peekId; - - if (sequence.present()) { - peekId = sequence.get().first; - sequenceNum = 
sequence.get().second; - if (sequenceNum >= SERVER_KNOBS->PARALLEL_GET_MORE_REQUESTS && - self->peekTracker.find(peekId) == self->peekTracker.end()) { - throw operation_obsolete(); - } - auto& trackerData = self->peekTracker[peekId]; - if (sequenceNum == 0 && trackerData.sequence_version.find(0) == trackerData.sequence_version.end()) { - trackerData.sequence_version[0].send(std::make_pair(begin, reqOnlySpilled)); - } - auto seqBegin = trackerData.sequence_version.begin(); - // The peek cursor and this comparison need to agree about the maximum number of in-flight requests. - while (trackerData.sequence_version.size() && - seqBegin->first <= sequenceNum - SERVER_KNOBS->PARALLEL_GET_MORE_REQUESTS) { - if (seqBegin->second.canBeSet()) { - seqBegin->second.sendError(operation_obsolete()); - } - trackerData.sequence_version.erase(seqBegin); - seqBegin = trackerData.sequence_version.begin(); - } - - if (trackerData.sequence_version.size() && sequenceNum < seqBegin->first) { - throw operation_obsolete(); - } - - trackerData.lastUpdate = now(); - std::pair prevPeekData = wait(trackerData.sequence_version[sequenceNum].getFuture()); - begin = prevPeekData.first; - reqOnlySpilled = prevPeekData.second; - wait(yield()); - } - - //TraceEvent("LogRouterPeek1", self->dbgid).detail("From", req.reply.getEndpoint().getPrimaryAddress()).detail("Ver", self->version.get()).detail("Begin", req.begin); - if (returnIfBlocked && self->version.get() < begin) { - //TraceEvent("LogRouterPeek2", self->dbgid); - if (sequence.present()) { - auto& trackerData = self->peekTracker[peekId]; - auto& sequenceData = trackerData.sequence_version[sequenceNum + 1]; - if (!sequenceData.isSet()) { - sequenceData.send(std::make_pair(begin, reqOnlySpilled)); - } - } - throw end_of_stream(); - } - - if (self->version.get() < begin) { - wait(self->version.whenAtLeast(begin)); - wait(delay(SERVER_KNOBS->TLOG_PEEK_DELAY, g_network->getCurrentTask())); - } - - Version poppedVer = poppedVersion(self, tag); - - if (poppedVer > begin || begin < self->startVersion) { - // This should only happen if a packet is sent multiple times and the reply is not needed. - // Since we are using popped differently, do not send a reply. - TraceEvent(SevWarnAlways, "LogRouterPeekPopped", self->dbgid) - .detail("Begin", begin) - .detail("Popped", poppedVer) - .detail("Start", self->startVersion); - if (sequence.present()) { - auto& trackerData = self->peekTracker[peekId]; - auto& sequenceData = trackerData.sequence_version[sequenceNum + 1]; - if (!sequenceData.isSet()) { - sequenceData.send(std::make_pair(begin, reqOnlySpilled)); - } - } - if (streamReply) { - // for streaming reply, we skip the popped part - begin = std::min(poppedVer, self->startVersion); - } else { - throw no_action_needed(); // we've already replied in the past - } - } - - Version endVersion = self->version.get() + 1; - peekMessagesFromMemory(self, tag, begin, messages, endVersion); - - TLogPeekReply reply; - reply.maxKnownVersion = self->version.get(); - reply.minKnownCommittedVersion = self->poppedVersion; - reply.messages = StringRef(reply.arena, messages.toValue()); - reply.popped = self->minPopped.get() >= self->startVersion ? 
self->minPopped.get() : 0; - reply.end = endVersion; - reply.onlySpilled = false; - - if (sequence.present()) { - auto& trackerData = self->peekTracker[peekId]; - trackerData.lastUpdate = now(); - auto& sequenceData = trackerData.sequence_version[sequenceNum + 1]; - if (trackerData.sequence_version.size() && sequenceNum + 1 < trackerData.sequence_version.begin()->first) { - if (!sequenceData.isSet()) - sequenceData.sendError(operation_obsolete()); - throw operation_obsolete(); - } - if (sequenceData.isSet()) { - if (sequenceData.getFuture().get().first != reply.end) { - TEST(true); // tlog peek second attempt ended at a different version - throw operation_obsolete(); - } - } else { - sequenceData.send(std::make_pair(reply.end, reply.onlySpilled)); - } - reply.begin = begin; - } - - //TraceEvent("LogRouterPeek4", self->dbgid); - return reply; -} - // This actor keep pushing TLogPeekStreamReply until it's removed from the cluster or should recover ACTOR Future logRouterPeekStream(LogRouterData* self, TLogPeekStreamRequest req) { - self->activePeekStreams++; + return Void(); +} - state Version begin = req.begin; - state bool onlySpilled = false; +ACTOR Future logRouterPeekMessages(LogRouterData* self, TLogPeekRequest req) { + state BinaryWriter messages(Unversioned()); + state int sequence = -1; + state UID peekId; - req.reply.setByteLimit(std::min(SERVER_KNOBS->MAXIMUM_PEEK_BYTES, req.limitBytes)); - loop { - state TLogPeekStreamReply reply; + if (req.sequence.present()) { try { - wait(req.reply.onReady() && - store(reply.rep, peekLogRouter(self, req.begin, req.tag, req.returnIfBlocked, onlySpilled, true))); - req.reply.send(reply); - begin = reply.rep.end; - onlySpilled = reply.rep.onlySpilled; - if (reply.rep.end > self->version.get()) { - wait(delay(SERVER_KNOBS->TLOG_PEEK_DELAY, g_network->getCurrentTask())); - } else { - wait(delay(0, g_network->getCurrentTask())); + peekId = req.sequence.get().first; + sequence = req.sequence.get().second; + if (sequence >= SERVER_KNOBS->PARALLEL_GET_MORE_REQUESTS && + self->peekTracker.find(peekId) == self->peekTracker.end()) { + throw operation_obsolete(); + } + auto& trackerData = self->peekTracker[peekId]; + if (sequence == 0 && trackerData.sequence_version.find(0) == trackerData.sequence_version.end()) { + trackerData.sequence_version[0].send(std::make_pair(req.begin, req.onlySpilled)); + } + auto seqBegin = trackerData.sequence_version.begin(); + // The peek cursor and this comparison need to agree about the maximum number of in-flight requests. 
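The window bookkeeping being restored below is easiest to see in isolation. A minimal sketch, assuming the window size matches the cursor's PARALLEL_GET_MORE_REQUESTS; the real sequence_version map stores promises, and expiring an entry sends the waiter operation_obsolete():

#include <cstdint>
#include <map>

// Toy version of trackerData.sequence_version: sequence -> reply recorded.
// Only the admit/expire shape of the real tracker is modeled here.
struct PeekSequenceWindow {
    std::map<int64_t, bool> inflight;
    int64_t window; // assumed equal to the cursor's PARALLEL_GET_MORE_REQUESTS

    // Returns false for a stale retry whose sequence already left the window.
    bool admit(int64_t seq) {
        auto it = inflight.begin();
        // Drop every sequence the cursor can no longer be waiting on.
        while (it != inflight.end() && it->first <= seq - window)
            it = inflight.erase(it); // real code: sendError(operation_obsolete())
        if (!inflight.empty() && seq < inflight.begin()->first)
            return false; // already expired; the peer must restart the peek
        inflight[seq] = false;
        return true;
    }
};

Both sides have to agree on the window, which is why the comment above stresses that the cursor and this comparison use the same PARALLEL_GET_MORE_REQUESTS bound.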
+ while (trackerData.sequence_version.size() && + seqBegin->first <= sequence - SERVER_KNOBS->PARALLEL_GET_MORE_REQUESTS) { + if (seqBegin->second.canBeSet()) { + seqBegin->second.sendError(operation_obsolete()); + } + trackerData.sequence_version.erase(seqBegin); + seqBegin = trackerData.sequence_version.begin(); } - } catch (Error& e) { - self->activePeekStreams--; - TraceEvent(SevDebug, "LogRouterPeekStreamEnd", self->dbgid).error(e, true); - if (e.code() == error_code_end_of_stream || e.code() == error_code_operation_obsolete) { + if (trackerData.sequence_version.size() && sequence < seqBegin->first) { + throw operation_obsolete(); + } + + trackerData.lastUpdate = now(); + std::pair prevPeekData = wait(trackerData.sequence_version[sequence].getFuture()); + req.begin = prevPeekData.first; + req.onlySpilled = prevPeekData.second; + wait(yield()); + } catch (Error& e) { + if (e.code() == error_code_timed_out || e.code() == error_code_operation_obsolete) { req.reply.sendError(e); return Void(); } else { @@ -595,28 +492,85 @@ ACTOR Future logRouterPeekStream(LogRouterData* self, TLogPeekStreamReques } } } -} -ACTOR Future logRouterPeekMessages(LogRouterData* self, TLogPeekRequest req) { - try { - TLogPeekReply reply = - wait(peekLogRouter(self, req.begin, req.tag, req.returnIfBlocked, req.onlySpilled, false, req.sequence)); - req.reply.send(reply); - } catch (Error& e) { - if (e.code() == error_code_no_action_needed) { - req.reply.send(Never()); - return Void(); - } else if (e.code() == error_code_timed_out || e.code() == error_code_operation_obsolete || - e.code() == error_code_end_of_stream) { - req.reply.sendError(e); - return Void(); - } else { - throw; + //TraceEvent("LogRouterPeek1", self->dbgid).detail("From", req.reply.getEndpoint().getPrimaryAddress()).detail("Ver", self->version.get()).detail("Begin", req.begin); + if (req.returnIfBlocked && self->version.get() < req.begin) { + //TraceEvent("LogRouterPeek2", self->dbgid); + req.reply.sendError(end_of_stream()); + if (req.sequence.present()) { + auto& trackerData = self->peekTracker[peekId]; + auto& sequenceData = trackerData.sequence_version[sequence + 1]; + if (!sequenceData.isSet()) { + sequenceData.send(std::make_pair(req.begin, req.onlySpilled)); + } } + return Void(); } + + if (self->version.get() < req.begin) { + wait(self->version.whenAtLeast(req.begin)); + wait(delay(SERVER_KNOBS->TLOG_PEEK_DELAY, g_network->getCurrentTask())); + } + + Version poppedVer = poppedVersion(self, req.tag); + + if (poppedVer > req.begin || req.begin < self->startVersion) { + // This should only happen if a packet is sent multiple times and the reply is not needed. + // Since we are using popped differently, do not send a reply. + TraceEvent(SevWarnAlways, "LogRouterPeekPopped", self->dbgid) + .detail("Begin", req.begin) + .detail("Popped", poppedVer) + .detail("Start", self->startVersion); + req.reply.send(Never()); + if (req.sequence.present()) { + auto& trackerData = self->peekTracker[peekId]; + auto& sequenceData = trackerData.sequence_version[sequence + 1]; + if (!sequenceData.isSet()) { + sequenceData.send(std::make_pair(req.begin, req.onlySpilled)); + } + } + return Void(); + } + + Version endVersion = self->version.get() + 1; + peekMessagesFromMemory(self, req.tag, req.begin, messages, endVersion); + + TLogPeekReply reply; + reply.maxKnownVersion = self->version.get(); + reply.minKnownCommittedVersion = self->poppedVersion; + reply.messages = messages.toValue(); + reply.popped = self->minPopped.get() >= self->startVersion ? 
self->minPopped.get() : 0; + reply.end = endVersion; + reply.onlySpilled = false; + + if (req.sequence.present()) { + auto& trackerData = self->peekTracker[peekId]; + trackerData.lastUpdate = now(); + auto& sequenceData = trackerData.sequence_version[sequence + 1]; + if (trackerData.sequence_version.size() && sequence + 1 < trackerData.sequence_version.begin()->first) { + req.reply.sendError(operation_obsolete()); + if (!sequenceData.isSet()) + sequenceData.sendError(operation_obsolete()); + return Void(); + } + if (sequenceData.isSet()) { + if (sequenceData.getFuture().get().first != reply.end) { + TEST(true); // tlog peek second attempt ended at a different version + req.reply.sendError(operation_obsolete()); + return Void(); + } + } else { + sequenceData.send(std::make_pair(reply.end, reply.onlySpilled)); + } + reply.begin = req.begin; + } + + req.reply.send(reply); + //TraceEvent("LogRouterPeek4", self->dbgid); return Void(); } + ACTOR Future cleanupPeekTrackers(LogRouterData* self) { loop { double minTimeUntilExpiration = SERVER_KNOBS->PEEK_TRACKER_EXPIRATION_TIME; diff --git a/fdbserver/LogSystemPeekCursor.actor.cpp b/fdbserver/LogSystemPeekCursor.actor.cpp index b097d88c8f..c210f89f29 100644 --- a/fdbserver/LogSystemPeekCursor.actor.cpp +++ b/fdbserver/LogSystemPeekCursor.actor.cpp @@ -418,7 +418,7 @@ Future ILogSystem::ServerPeekCursor::getMore(TaskPriority taskID) { return Void(); if (!more.isValid() || more.isReady()) { // TODO: remove locality check when log router support streaming peek - if (usePeekStream && tag.locality >= 0) { + if (false && usePeekStream && tag.locality >= 0) { more = serverPeekStreamGetMore(this, taskID); } else if (parallelGetMore || onlySpilled || futureResults.size()) { more = serverPeekParallelGetMore(this, taskID); diff --git a/fdbserver/OldTLogServer_4_6.actor.cpp b/fdbserver/OldTLogServer_4_6.actor.cpp index 9c4cea2eac..d5fafab6c7 100644 --- a/fdbserver/OldTLogServer_4_6.actor.cpp +++ b/fdbserver/OldTLogServer_4_6.actor.cpp @@ -1157,21 +1157,158 @@ ACTOR Future tLogPeekStream(TLogData* self, TLogPeekStreamRequest req, Ref } } + ACTOR Future tLogPeekMessages(TLogData* self, TLogPeekRequest req, Reference logData) { - try { - TLogPeekReply reply = - wait(peekTLog(self, logData, req.begin, req.tag, req.returnIfBlocked, req.onlySpilled, req.sequence)); - req.reply.send(reply); - } catch (Error& e) { - if (e.code() == error_code_timed_out || e.code() == error_code_operation_obsolete || - e.code() == error_code_end_of_stream) { - req.reply.sendError(e); - return Void(); - } else { - throw; + state BinaryWriter messages(Unversioned()); + state BinaryWriter messages2(Unversioned()); + state int sequence = -1; + state UID peekId; + state OldTag oldTag = convertTag(req.tag); + + if (req.sequence.present()) { + try { + peekId = req.sequence.get().first; + sequence = req.sequence.get().second; + if (sequence >= SERVER_KNOBS->PARALLEL_GET_MORE_REQUESTS && + self->peekTracker.find(peekId) == self->peekTracker.end()) { + throw operation_obsolete(); + } + if (sequence > 0) { + auto& trackerData = self->peekTracker[peekId]; + trackerData.lastUpdate = now(); + Version ver = wait(trackerData.sequence_version[sequence].getFuture()); + req.begin = std::max(ver, req.begin); + wait(yield()); + } + } catch (Error& e) { + if (e.code() == error_code_timed_out || e.code() == error_code_operation_obsolete) { + req.reply.sendError(e); + return Void(); + } else { + throw; + } } } + if (req.returnIfBlocked && logData->version.get() < req.begin) { + 
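+ // A returnIfBlocked peek must not wait for req.begin to become available; answer immediately with end_of_stream.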
req.reply.sendError(end_of_stream()); + return Void(); + } + + //TraceEvent("TLogPeekMessages0", self->dbgid).detail("ReqBeginEpoch", req.begin.epoch).detail("ReqBeginSeq", req.begin.sequence).detail("Epoch", self->epoch()).detail("PersistentDataSeq", self->persistentDataSequence).detail("Tag1", req.tag1).detail("Tag2", req.tag2); + // Wait until we have something to return that the caller doesn't already have + if (logData->version.get() < req.begin) { + wait(logData->version.whenAtLeast(req.begin)); + wait(delay(SERVER_KNOBS->TLOG_PEEK_DELAY, g_network->getCurrentTask())); + } + + state Version endVersion = logData->version.get() + 1; + + Version poppedVer = poppedVersion(logData, oldTag); + if (poppedVer > req.begin) { + TLogPeekReply rep; + rep.maxKnownVersion = logData->version.get(); + rep.minKnownCommittedVersion = 0; + rep.popped = poppedVer; + rep.end = poppedVer; + rep.onlySpilled = false; + + if (req.sequence.present()) { + auto& trackerData = self->peekTracker[peekId]; + auto& sequenceData = trackerData.sequence_version[sequence + 1]; + trackerData.lastUpdate = now(); + if (trackerData.sequence_version.size() && sequence + 1 < trackerData.sequence_version.begin()->first) { + req.reply.sendError(operation_obsolete()); + if (!sequenceData.isSet()) + sequenceData.sendError(operation_obsolete()); + return Void(); + } + if (sequenceData.isSet()) { + if (sequenceData.getFuture().get() != rep.end) { + TEST(true); // tlog peek second attempt ended at a different version + req.reply.sendError(operation_obsolete()); + return Void(); + } + } else { + sequenceData.send(rep.end); + } + rep.begin = req.begin; + } + + req.reply.send(rep); + return Void(); + } + + // grab messages from disk + //TraceEvent("TLogPeekMessages", self->dbgid).detail("ReqBeginEpoch", req.begin.epoch).detail("ReqBeginSeq", req.begin.sequence).detail("Epoch", self->epoch()).detail("PersistentDataSeq", self->persistentDataSequence).detail("Tag1", req.tag1).detail("Tag2", req.tag2); + if (req.begin <= logData->persistentDataDurableVersion) { + // Just in case the durable version changes while we are waiting for the read, we grab this data from memory. We + // may or may not actually send it depending on whether we get enough data from disk. SOMEDAY: Only do this if + // an initial attempt to read from disk results in insufficient data and the required data is no longer in + // memory SOMEDAY: Should we only send part of the messages we collected, to actually limit the size of the + // result? + + peekMessagesFromMemory(logData, req.tag, req.begin, messages2, endVersion); + + RangeResult kvs = wait(self->persistentData->readRange( + KeyRangeRef(persistTagMessagesKey(logData->logId, oldTag, req.begin), + persistTagMessagesKey(logData->logId, oldTag, logData->persistentDataDurableVersion + 1)), + SERVER_KNOBS->DESIRED_TOTAL_BYTES, + SERVER_KNOBS->DESIRED_TOTAL_BYTES)); + + //TraceEvent("TLogPeekResults", self->dbgid).detail("ForAddress", req.reply.getEndpoint().getPrimaryAddress()).detail("Tag1Results", s1).detail("Tag2Results", s2).detail("Tag1ResultsLim", kv1.size()).detail("Tag2ResultsLim", kv2.size()).detail("Tag1ResultsLast", kv1.size() ? kv1[0].key : "").detail("Tag2ResultsLast", kv2.size() ? 
kv2[0].key : "").detail("Limited", limited).detail("NextEpoch", next_pos.epoch).detail("NextSeq", next_pos.sequence).detail("NowEpoch", self->epoch()).detail("NowSeq", self->sequence.getNextSequence()); + + for (auto& kv : kvs) { + auto ver = decodeTagMessagesKey(kv.key); + messages << int32_t(-1) << ver; + + BinaryReader rd(kv.value, Unversioned()); + while (!rd.empty()) { + int32_t messageLength; + uint32_t subVersion; + rd >> messageLength >> subVersion; + messageLength += sizeof(uint16_t) + sizeof(Tag); + messages << messageLength << subVersion << uint16_t(1) << req.tag; + messageLength -= (sizeof(subVersion) + sizeof(uint16_t) + sizeof(Tag)); + messages.serializeBytes(rd.readBytes(messageLength), messageLength); + } + } + + if (kvs.expectedSize() >= SERVER_KNOBS->DESIRED_TOTAL_BYTES) + endVersion = decodeTagMessagesKey(kvs.end()[-1].key) + 1; + else + messages.serializeBytes(messages2.toValue()); + } else { + peekMessagesFromMemory(logData, req.tag, req.begin, messages, endVersion); + //TraceEvent("TLogPeekResults", self->dbgid).detail("ForAddress", req.reply.getEndpoint().getPrimaryAddress()).detail("MessageBytes", messages.getLength()).detail("NextEpoch", next_pos.epoch).detail("NextSeq", next_pos.sequence).detail("NowSeq", self->sequence.getNextSequence()); + } + + TLogPeekReply reply; + reply.maxKnownVersion = logData->version.get(); + reply.minKnownCommittedVersion = 0; + reply.onlySpilled = false; + reply.messages = messages.toValue(); + reply.end = endVersion; + + //TraceEvent("TlogPeek", self->dbgid).detail("LogId", logData->logId).detail("EndVer", reply.end).detail("MsgBytes", reply.messages.expectedSize()).detail("ForAddress", req.reply.getEndpoint().getPrimaryAddress()); + + if (req.sequence.present()) { + auto& trackerData = self->peekTracker[peekId]; + trackerData.lastUpdate = now(); + auto& sequenceData = trackerData.sequence_version[sequence + 1]; + if (sequenceData.isSet()) { + if (sequenceData.getFuture().get() != reply.end) { + TEST(true); // tlog peek second attempt ended at a different version (2) + req.reply.sendError(operation_obsolete()); + return Void(); + } + } else { + sequenceData.send(reply.end); + } + reply.begin = req.begin; + } + + req.reply.send(reply); return Void(); } diff --git a/fdbserver/OldTLogServer_6_0.actor.cpp b/fdbserver/OldTLogServer_6_0.actor.cpp index 04142babf7..b6964bf9c9 100644 --- a/fdbserver/OldTLogServer_6_0.actor.cpp +++ b/fdbserver/OldTLogServer_6_0.actor.cpp @@ -1479,24 +1479,241 @@ ACTOR Future tLogPeekStream(TLogData* self, TLogPeekStreamRequest req, Ref } } + ACTOR Future tLogPeekMessages(TLogData* self, TLogPeekRequest req, Reference logData) { - try { - TLogPeekReply reply = - wait(peekTLog(self, logData, req.begin, req.tag, req.returnIfBlocked, req.onlySpilled, req.sequence)); - req.reply.send(reply); - } catch (Error& e) { - if (e.code() == error_code_timed_out || e.code() == error_code_operation_obsolete || - e.code() == error_code_end_of_stream) { - req.reply.sendError(e); - return Void(); - } else { - throw; + state BinaryWriter messages(Unversioned()); + state BinaryWriter messages2(Unversioned()); + state int sequence = -1; + state UID peekId; + state double queueStart = now(); + + if (req.tag.locality == tagLocalityTxs && req.tag.id >= logData->txsTags && logData->txsTags > 0) { + req.tag.id = req.tag.id % logData->txsTags; + } + + if (req.sequence.present()) { + try { + peekId = req.sequence.get().first; + sequence = req.sequence.get().second; + if (sequence >= SERVER_KNOBS->PARALLEL_GET_MORE_REQUESTS && + 
logData->peekTracker.find(peekId) == logData->peekTracker.end()) { + throw operation_obsolete(); + } + auto& trackerData = logData->peekTracker[peekId]; + if (sequence == 0 && trackerData.sequence_version.find(0) == trackerData.sequence_version.end()) { + trackerData.tag = req.tag; + trackerData.sequence_version[0].send(std::make_pair(req.begin, req.onlySpilled)); + } + auto seqBegin = trackerData.sequence_version.begin(); + while (trackerData.sequence_version.size() && + seqBegin->first <= sequence - SERVER_KNOBS->PARALLEL_GET_MORE_REQUESTS) { + if (seqBegin->second.canBeSet()) { + seqBegin->second.sendError(operation_obsolete()); + } + trackerData.sequence_version.erase(seqBegin); + seqBegin = trackerData.sequence_version.begin(); + } + + if (trackerData.sequence_version.size() && sequence < seqBegin->first) { + throw operation_obsolete(); + } + + Future> fPrevPeekData = trackerData.sequence_version[sequence].getFuture(); + if (fPrevPeekData.isReady()) { + trackerData.unblockedPeeks++; + double t = now() - trackerData.lastUpdate; + if (t > trackerData.idleMax) + trackerData.idleMax = t; + trackerData.idleTime += t; + } + trackerData.lastUpdate = now(); + std::pair prevPeekData = wait(fPrevPeekData); + + req.begin = std::max(prevPeekData.first, req.begin); + req.onlySpilled = prevPeekData.second; + wait(yield()); + } catch (Error& e) { + if (e.code() == error_code_timed_out || e.code() == error_code_operation_obsolete) { + req.reply.sendError(e); + return Void(); + } else { + throw; + } } } + state double blockStart = now(); + + if (req.returnIfBlocked && logData->version.get() < req.begin) { + req.reply.sendError(end_of_stream()); + if (req.sequence.present()) { + auto& trackerData = logData->peekTracker[peekId]; + auto& sequenceData = trackerData.sequence_version[sequence + 1]; + if (!sequenceData.isSet()) { + sequenceData.send(std::make_pair(req.begin, req.onlySpilled)); + } + } + return Void(); + } + + //TraceEvent("TLogPeekMessages0", self->dbgid).detail("ReqBeginEpoch", req.begin.epoch).detail("ReqBeginSeq", req.begin.sequence).detail("Epoch", self->epoch()).detail("PersistentDataSeq", self->persistentDataSequence).detail("Tag1", req.tag1).detail("Tag2", req.tag2); + // Wait until we have something to return that the caller doesn't already have + if (logData->version.get() < req.begin) { + wait(logData->version.whenAtLeast(req.begin)); + wait(delay(SERVER_KNOBS->TLOG_PEEK_DELAY, g_network->getCurrentTask())); + } + + if (logData->locality != tagLocalitySatellite && req.tag.locality == tagLocalityLogRouter) { + wait(self->concurrentLogRouterReads.take()); + state FlowLock::Releaser globalReleaser(self->concurrentLogRouterReads); + wait(delay(0.0, TaskPriority::Low)); + } + + if (req.begin <= logData->persistentDataDurableVersion && req.tag.locality != tagLocalityTxs && req.tag != txsTag) { + // Reading spilled data will almost always imply that the storage server is >5s behind the rest + // of the cluster. We shouldn't prioritize spending CPU on helping this server catch up + // slightly faster over keeping the rest of the cluster operating normally. + // txsTag is only ever peeked on recovery, and we would still wish to prioritize requests + // that impact recovery duration. 
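+ // delay(0, ...) re-enters the run loop at TLogSpilledPeekReply priority, letting hotter TLog work run first.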
+ wait(delay(0, TaskPriority::TLogSpilledPeekReply)); + } + + state double workStart = now(); + + Version poppedVer = poppedVersion(logData, req.tag); + if (poppedVer > req.begin) { + TLogPeekReply rep; + rep.maxKnownVersion = logData->version.get(); + rep.minKnownCommittedVersion = logData->minKnownCommittedVersion; + rep.popped = poppedVer; + rep.end = poppedVer; + rep.onlySpilled = false; + + if (req.sequence.present()) { + auto& trackerData = logData->peekTracker[peekId]; + auto& sequenceData = trackerData.sequence_version[sequence + 1]; + trackerData.lastUpdate = now(); + if (trackerData.sequence_version.size() && sequence + 1 < trackerData.sequence_version.begin()->first) { + req.reply.sendError(operation_obsolete()); + if (!sequenceData.isSet()) + sequenceData.sendError(operation_obsolete()); + return Void(); + } + if (sequenceData.isSet()) { + if (sequenceData.getFuture().get().first != rep.end) { + TEST(true); // tlog peek second attempt ended at a different version + req.reply.sendError(operation_obsolete()); + return Void(); + } + } else { + sequenceData.send(std::make_pair(rep.end, rep.onlySpilled)); + } + rep.begin = req.begin; + } + + req.reply.send(rep); + return Void(); + } + + state Version endVersion = logData->version.get() + 1; + state bool onlySpilled = false; + + // grab messages from disk + //TraceEvent("TLogPeekMessages", self->dbgid).detail("ReqBeginEpoch", req.begin.epoch).detail("ReqBeginSeq", req.begin.sequence).detail("Epoch", self->epoch()).detail("PersistentDataSeq", self->persistentDataSequence).detail("Tag1", req.tag1).detail("Tag2", req.tag2); + if (req.begin <= logData->persistentDataDurableVersion) { + // Just in case the durable version changes while we are waiting for the read, we grab this data from memory. We + // may or may not actually send it depending on whether we get enough data from disk. SOMEDAY: Only do this if + // an initial attempt to read from disk results in insufficient data and the required data is no longer in + // memory SOMEDAY: Should we only send part of the messages we collected, to actually limit the size of the + // result? + + if (req.onlySpilled) { + endVersion = logData->persistentDataDurableVersion + 1; + } else { + peekMessagesFromMemory(logData, req.tag, req.begin, messages2, endVersion); + } + + RangeResult kvs = wait(self->persistentData->readRange( + KeyRangeRef(persistTagMessagesKey(logData->logId, req.tag, req.begin), + persistTagMessagesKey(logData->logId, req.tag, logData->persistentDataDurableVersion + 1)), + SERVER_KNOBS->DESIRED_TOTAL_BYTES, + SERVER_KNOBS->DESIRED_TOTAL_BYTES)); + + //TraceEvent("TLogPeekResults", self->dbgid).detail("ForAddress", req.reply.getEndpoint().address).detail("Tag1Results", s1).detail("Tag2Results", s2).detail("Tag1ResultsLim", kv1.size()).detail("Tag2ResultsLim", kv2.size()).detail("Tag1ResultsLast", kv1.size() ? kv1[0].key : "").detail("Tag2ResultsLast", kv2.size() ? 
kv2[0].key : "").detail("Limited", limited).detail("NextEpoch", next_pos.epoch).detail("NextSeq", next_pos.sequence).detail("NowEpoch", self->epoch()).detail("NowSeq", self->sequence.getNextSequence()); + + for (auto& kv : kvs) { + auto ver = decodeTagMessagesKey(kv.key); + messages << VERSION_HEADER << ver; + messages.serializeBytes(kv.value); + } + + if (kvs.expectedSize() >= SERVER_KNOBS->DESIRED_TOTAL_BYTES) { + endVersion = decodeTagMessagesKey(kvs.end()[-1].key) + 1; + onlySpilled = true; + } else { + messages.serializeBytes(messages2.toValue()); + } + } else { + peekMessagesFromMemory(logData, req.tag, req.begin, messages, endVersion); + //TraceEvent("TLogPeekResults", self->dbgid).detail("ForAddress", req.reply.getEndpoint().address).detail("MessageBytes", messages.getLength()).detail("NextEpoch", next_pos.epoch).detail("NextSeq", next_pos.sequence).detail("NowSeq", self->sequence.getNextSequence()); + } + + TLogPeekReply reply; + reply.maxKnownVersion = logData->version.get(); + reply.minKnownCommittedVersion = logData->minKnownCommittedVersion; + reply.messages = messages.toValue(); + reply.end = endVersion; + reply.onlySpilled = onlySpilled; + + //TraceEvent("TlogPeek", self->dbgid).detail("LogId", logData->logId).detail("EndVer", reply.end).detail("MsgBytes", reply.messages.expectedSize()).detail("ForAddress", req.reply.getEndpoint().address); + + if (req.sequence.present()) { + auto& trackerData = logData->peekTracker[peekId]; + trackerData.lastUpdate = now(); + + double queueT = blockStart - queueStart; + double blockT = workStart - blockStart; + double workT = now() - workStart; + + trackerData.totalPeeks++; + trackerData.replyBytes += reply.messages.size(); + + if (queueT > trackerData.queueMax) + trackerData.queueMax = queueT; + if (blockT > trackerData.blockMax) + trackerData.blockMax = blockT; + if (workT > trackerData.workMax) + trackerData.workMax = workT; + + trackerData.queueTime += queueT; + trackerData.blockTime += blockT; + trackerData.workTime += workT; + + auto& sequenceData = trackerData.sequence_version[sequence + 1]; + if (trackerData.sequence_version.size() && sequence + 1 < trackerData.sequence_version.begin()->first) { + req.reply.sendError(operation_obsolete()); + if (!sequenceData.isSet()) + sequenceData.sendError(operation_obsolete()); + return Void(); + } + if (sequenceData.isSet()) { + trackerData.duplicatePeeks++; + if (sequenceData.getFuture().get().first != reply.end) { + TEST(true); // tlog peek second attempt ended at a different version (2) + req.reply.sendError(operation_obsolete()); + return Void(); + } + } else { + sequenceData.send(std::make_pair(reply.end, reply.onlySpilled)); + } + reply.begin = req.begin; + } + + req.reply.send(reply); return Void(); } + ACTOR Future doQueueCommit(TLogData* self, Reference logData, std::vector> missingFinalCommit) { diff --git a/fdbserver/OldTLogServer_6_2.actor.cpp b/fdbserver/OldTLogServer_6_2.actor.cpp index 917f772503..88029f9eb1 100644 --- a/fdbserver/OldTLogServer_6_2.actor.cpp +++ b/fdbserver/OldTLogServer_6_2.actor.cpp @@ -1547,7 +1547,8 @@ ACTOR Future> parseMessagesForTag(StringRef commitBlob, T } // Common logics to peek TLog and create TLogPeekReply that serves both streaming peek or normal peek request -ACTOR Future peekTLog(TLogData* self, +ACTOR template +Future peekTLogAndSend(PromiseType replyPromise, TLogData* self, Reference logData, Version begin, Tag tag, @@ -1567,50 +1568,60 @@ ACTOR Future peekTLog(TLogData* self, // TODO: once the fake stream is replaced by ReplyPromiseStream, we 
can remove the code handling sequence requests // STEP: a. mark obsolete sequence requests; b. wait previous sequence requests are handled in order if (sequence.present()) { - peekId = sequence.get().first; - sequenceNum = sequence.get().second; - if (sequenceNum >= SERVER_KNOBS->PARALLEL_GET_MORE_REQUESTS && - logData->peekTracker.find(peekId) == logData->peekTracker.end()) { - throw operation_obsolete(); - } - auto& trackerData = logData->peekTracker[peekId]; - if (sequenceNum == 0 && trackerData.sequence_version.find(0) == trackerData.sequence_version.end()) { - trackerData.tag = tag; - trackerData.sequence_version[0].send(std::make_pair(begin, reqOnlySpilled)); - } - auto seqBegin = trackerData.sequence_version.begin(); - // The peek cursor and this comparison need to agree about the maximum number of in-flight requests. - while (trackerData.sequence_version.size() && - seqBegin->first <= sequenceNum - SERVER_KNOBS->PARALLEL_GET_MORE_REQUESTS) { - if (seqBegin->second.canBeSet()) { - seqBegin->second.sendError(operation_obsolete()); + try{ + peekId = sequence.get().first; + sequenceNum = sequence.get().second; + if (sequenceNum >= SERVER_KNOBS->PARALLEL_GET_MORE_REQUESTS && + logData->peekTracker.find(peekId) == logData->peekTracker.end()) { + throw operation_obsolete(); + } + auto& trackerData = logData->peekTracker[peekId]; + if (sequenceNum == 0 && trackerData.sequence_version.find(0) == trackerData.sequence_version.end()) { + trackerData.tag = tag; + trackerData.sequence_version[0].send(std::make_pair(begin, reqOnlySpilled)); + } + auto seqBegin = trackerData.sequence_version.begin(); + // The peek cursor and this comparison need to agree about the maximum number of in-flight requests. + while (trackerData.sequence_version.size() && + seqBegin->first <= sequenceNum - SERVER_KNOBS->PARALLEL_GET_MORE_REQUESTS) { + if (seqBegin->second.canBeSet()) { + seqBegin->second.sendError(operation_obsolete()); + } + trackerData.sequence_version.erase(seqBegin); + seqBegin = trackerData.sequence_version.begin(); + } + + if (trackerData.sequence_version.size() && sequenceNum < seqBegin->first) { + throw operation_obsolete(); + } + + Future> fPrevPeekData = trackerData.sequence_version[sequenceNum].getFuture(); + if (fPrevPeekData.isReady()) { + trackerData.unblockedPeeks++; + double t = now() - trackerData.lastUpdate; + if (t > trackerData.idleMax) + trackerData.idleMax = t; + trackerData.idleTime += t; + } + trackerData.lastUpdate = now(); + std::pair prevPeekData = wait(fPrevPeekData); + begin = std::max(prevPeekData.first, begin); + reqOnlySpilled = prevPeekData.second; + wait(yield()); + } catch (Error& e) { + if (e.code() == error_code_timed_out || e.code() == error_code_operation_obsolete) { + replyPromise.sendError(e); + return Void(); + } else { + throw; } - trackerData.sequence_version.erase(seqBegin); - seqBegin = trackerData.sequence_version.begin(); } - - if (trackerData.sequence_version.size() && sequenceNum < seqBegin->first) { - throw operation_obsolete(); - } - - Future> fPrevPeekData = trackerData.sequence_version[sequenceNum].getFuture(); - if (fPrevPeekData.isReady()) { - trackerData.unblockedPeeks++; - double t = now() - trackerData.lastUpdate; - if (t > trackerData.idleMax) - trackerData.idleMax = t; - trackerData.idleTime += t; - } - trackerData.lastUpdate = now(); - std::pair prevPeekData = wait(fPrevPeekData); - begin = std::max(prevPeekData.first, begin); - reqOnlySpilled = prevPeekData.second; - wait(yield()); } state double blockStart = now(); if (returnIfBlocked && 
logData->version.get() < begin) { + replyPromise.sendError(end_of_stream()); if (sequence.present()) { auto& trackerData = logData->peekTracker[peekId]; auto& sequenceData = trackerData.sequence_version[sequenceNum + 1]; @@ -1619,7 +1630,7 @@ ACTOR Future peekTLog(TLogData* self, sequenceData.send(std::make_pair(begin, reqOnlySpilled)); } } - throw end_of_stream(); + return Void(); } //TraceEvent("TLogPeekMessages0", self->dbgid).detail("ReqBeginEpoch", req.begin.epoch).detail("ReqBeginSeq", req.begin.sequence).detail("Epoch", self->epoch()).detail("PersistentDataSeq", self->persistentDataSequence).detail("Tag1", req.tag1).detail("Tag2", req.tag2); @@ -1663,14 +1674,16 @@ ACTOR Future peekTLog(TLogData* self, auto& sequenceData = trackerData.sequence_version[sequenceNum + 1]; trackerData.lastUpdate = now(); if (trackerData.sequence_version.size() && sequenceNum + 1 < trackerData.sequence_version.begin()->first) { + replyPromise.sendError(operation_obsolete()); if (!sequenceData.isSet()) sequenceData.sendError(operation_obsolete()); - throw operation_obsolete(); + return Void(); } if (sequenceData.isSet()) { if (sequenceData.getFuture().get().first != rep.end) { TEST(true); // 1 tlog peek second attempt ended at a different version - throw operation_obsolete(); + replyPromise.sendError(operation_obsolete()); + return Void(); } } else { sequenceData.send(std::make_pair(rep.end, rep.onlySpilled)); @@ -1678,7 +1691,8 @@ ACTOR Future peekTLog(TLogData* self, rep.begin = begin; } - return rep; + replyPromise.send(rep); + return Void(); } state Version endVersion = logData->version.get() + 1; @@ -1849,6 +1863,7 @@ ACTOR Future peekTLog(TLogData* self, auto& sequenceData = trackerData.sequence_version[sequenceNum + 1]; if (trackerData.sequence_version.size() && sequenceNum + 1 < trackerData.sequence_version.begin()->first) { + replyPromise.sendError(operation_obsolete()); if (!sequenceData.isSet()) { // It would technically be more correct to .send({req.begin, req.onlySpilled}), as the next // request might still be in the window of active requests, but LogSystemPeekCursor will @@ -1856,13 +1871,14 @@ ACTOR Future peekTLog(TLogData* self, // response will probably be a waste of CPU. 
sequenceData.sendError(operation_obsolete()); } - throw operation_obsolete(); + return Void(); } if (sequenceData.isSet()) { trackerData.duplicatePeeks++; if (sequenceData.getFuture().get().first != reply.end) { TEST(true); // 1 tlog peek second attempt ended at a different version (2) - throw operation_obsolete(); + replyPromise.sendError(operation_obsolete()); + return Void(); } } else { sequenceData.send(std::make_pair(reply.end, reply.onlySpilled)); @@ -1870,25 +1886,29 @@ ACTOR Future peekTLog(TLogData* self, reply.begin = begin; } - return reply; + replyPromise.send(reply); + return Void(); } // This actor keep pushing TLogPeekStreamReply until it's removed from the cluster or should recover ACTOR Future tLogPeekStream(TLogData* self, TLogPeekStreamRequest req, Reference logData) { self->activePeekStreams++; - TraceEvent(SevDebug, "TLogPeekStream", logData->logId).detail("Token", req.reply.getEndpoint().token); state Version begin = req.begin; state bool onlySpilled = false; - if (req.tag.locality == tagLocalityTxs && req.tag.id >= logData->txsTags && logData->txsTags > 0) { - req.tag.id = req.tag.id % logData->txsTags; - } req.reply.setByteLimit(std::min(SERVER_KNOBS->MAXIMUM_PEEK_BYTES, req.limitBytes)); loop { state TLogPeekStreamReply reply; + state Promise promise; + state Future future(promise.getFuture()); try { - wait(req.reply.onReady() && - store(reply.rep, peekTLog(self, logData, begin, req.tag, req.returnIfBlocked, onlySpilled))); + wait(req.reply.onReady() && peekTLogAndSend(promise, self, logData, begin, req.tag, req.returnIfBlocked, onlySpilled)); + ASSERT(future.isReady()); + if(future.isError()) { + throw future.getError(); + } + + reply.rep = future.get(); req.reply.send(reply); begin = reply.rep.end; onlySpilled = reply.rep.onlySpilled; @@ -1911,21 +1931,327 @@ ACTOR Future tLogPeekStream(TLogData* self, TLogPeekStreamRequest req, Ref } } + ACTOR Future tLogPeekMessages(TLogData* self, TLogPeekRequest req, Reference logData) { - try { - TLogPeekReply reply = - wait(peekTLog(self, logData, req.begin, req.tag, req.returnIfBlocked, req.onlySpilled, req.sequence)); - req.reply.send(reply); - } catch (Error& e) { - if (e.code() == error_code_timed_out || e.code() == error_code_operation_obsolete || - e.code() == error_code_end_of_stream) { - req.reply.sendError(e); - return Void(); - } else { - throw; + state BinaryWriter messages(Unversioned()); + state BinaryWriter messages2(Unversioned()); + state int sequence = -1; + state UID peekId; + state double queueStart = now(); + + if (req.tag.locality == tagLocalityTxs && req.tag.id >= logData->txsTags && logData->txsTags > 0) { + req.tag.id = req.tag.id % logData->txsTags; + } + + if (req.sequence.present()) { + try { + peekId = req.sequence.get().first; + sequence = req.sequence.get().second; + if (sequence >= SERVER_KNOBS->PARALLEL_GET_MORE_REQUESTS && + logData->peekTracker.find(peekId) == logData->peekTracker.end()) { + throw operation_obsolete(); + } + auto& trackerData = logData->peekTracker[peekId]; + if (sequence == 0 && trackerData.sequence_version.find(0) == trackerData.sequence_version.end()) { + trackerData.tag = req.tag; + trackerData.sequence_version[0].send(std::make_pair(req.begin, req.onlySpilled)); + } + auto seqBegin = trackerData.sequence_version.begin(); + // The peek cursor and this comparison need to agree about the maximum number of in-flight requests. 
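+ // Promises that have fallen out of the in-flight window are stale; fail them with operation_obsolete() before erasing.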
+ while (trackerData.sequence_version.size() && + seqBegin->first <= sequence - SERVER_KNOBS->PARALLEL_GET_MORE_REQUESTS) { + if (seqBegin->second.canBeSet()) { + seqBegin->second.sendError(operation_obsolete()); + } + trackerData.sequence_version.erase(seqBegin); + seqBegin = trackerData.sequence_version.begin(); + } + + if (trackerData.sequence_version.size() && sequence < seqBegin->first) { + throw operation_obsolete(); + } + + Future> fPrevPeekData = trackerData.sequence_version[sequence].getFuture(); + if (fPrevPeekData.isReady()) { + trackerData.unblockedPeeks++; + double t = now() - trackerData.lastUpdate; + if (t > trackerData.idleMax) + trackerData.idleMax = t; + trackerData.idleTime += t; + } + trackerData.lastUpdate = now(); + std::pair prevPeekData = wait(fPrevPeekData); + req.begin = std::max(prevPeekData.first, req.begin); + req.onlySpilled = prevPeekData.second; + wait(yield()); + } catch (Error& e) { + if (e.code() == error_code_timed_out || e.code() == error_code_operation_obsolete) { + req.reply.sendError(e); + return Void(); + } else { + throw; + } } } + state double blockStart = now(); + + if (req.returnIfBlocked && logData->version.get() < req.begin) { + req.reply.sendError(end_of_stream()); + if (req.sequence.present()) { + auto& trackerData = logData->peekTracker[peekId]; + auto& sequenceData = trackerData.sequence_version[sequence + 1]; + if (!sequenceData.isSet()) { + sequenceData.send(std::make_pair(req.begin, req.onlySpilled)); + } + } + return Void(); + } + + //TraceEvent("TLogPeekMessages0", self->dbgid).detail("ReqBeginEpoch", req.begin.epoch).detail("ReqBeginSeq", req.begin.sequence).detail("Epoch", self->epoch()).detail("PersistentDataSeq", self->persistentDataSequence).detail("Tag1", req.tag1).detail("Tag2", req.tag2); + // Wait until we have something to return that the caller doesn't already have + if (logData->version.get() < req.begin) { + wait(logData->version.whenAtLeast(req.begin)); + wait(delay(SERVER_KNOBS->TLOG_PEEK_DELAY, g_network->getCurrentTask())); + } + + if (req.tag.locality == tagLocalityLogRouter) { + wait(self->concurrentLogRouterReads.take()); + state FlowLock::Releaser globalReleaser(self->concurrentLogRouterReads); + wait(delay(0.0, TaskPriority::Low)); + } + + if (req.begin <= logData->persistentDataDurableVersion && req.tag.locality != tagLocalityTxs && req.tag != txsTag) { + // Reading spilled data will almost always imply that the storage server is >5s behind the rest + // of the cluster. We shouldn't prioritize spending CPU on helping this server catch up + // slightly faster over keeping the rest of the cluster operating normally. + // txsTag is only ever peeked on recovery, and we would still wish to prioritize requests + // that impact recovery duration. 
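+ // Reschedule at a lower priority so serving this spilled peek does not slow down the rest of the TLog.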
+ wait(delay(0, TaskPriority::TLogSpilledPeekReply)); + } + + state double workStart = now(); + + Version poppedVer = poppedVersion(logData, req.tag); + if (poppedVer > req.begin) { + TLogPeekReply rep; + rep.maxKnownVersion = logData->version.get(); + rep.minKnownCommittedVersion = logData->minKnownCommittedVersion; + rep.popped = poppedVer; + rep.end = poppedVer; + rep.onlySpilled = false; + + if (req.sequence.present()) { + auto& trackerData = logData->peekTracker[peekId]; + auto& sequenceData = trackerData.sequence_version[sequence + 1]; + trackerData.lastUpdate = now(); + if (trackerData.sequence_version.size() && sequence + 1 < trackerData.sequence_version.begin()->first) { + req.reply.sendError(operation_obsolete()); + if (!sequenceData.isSet()) + sequenceData.sendError(operation_obsolete()); + return Void(); + } + if (sequenceData.isSet()) { + if (sequenceData.getFuture().get().first != rep.end) { + TEST(true); // tlog peek second attempt ended at a different version + req.reply.sendError(operation_obsolete()); + return Void(); + } + } else { + sequenceData.send(std::make_pair(rep.end, rep.onlySpilled)); + } + rep.begin = req.begin; + } + + req.reply.send(rep); + return Void(); + } + + state Version endVersion = logData->version.get() + 1; + state bool onlySpilled = false; + + // grab messages from disk + //TraceEvent("TLogPeekMessages", self->dbgid).detail("ReqBeginEpoch", req.begin.epoch).detail("ReqBeginSeq", req.begin.sequence).detail("Epoch", self->epoch()).detail("PersistentDataSeq", self->persistentDataSequence).detail("Tag1", req.tag1).detail("Tag2", req.tag2); + if (req.begin <= logData->persistentDataDurableVersion) { + // Just in case the durable version changes while we are waiting for the read, we grab this data from memory. We + // may or may not actually send it depending on whether we get enough data from disk. SOMEDAY: Only do this if + // an initial attempt to read from disk results in insufficient data and the required data is no longer in + // memory SOMEDAY: Should we only send part of the messages we collected, to actually limit the size of the + // result? + + if (req.onlySpilled) { + endVersion = logData->persistentDataDurableVersion + 1; + } else { + peekMessagesFromMemory(logData, req.tag, req.begin, messages2, endVersion); + } + + if (req.tag.locality == tagLocalityTxs || req.tag == txsTag) { + RangeResult kvs = wait(self->persistentData->readRange( + KeyRangeRef(persistTagMessagesKey(logData->logId, req.tag, req.begin), + persistTagMessagesKey(logData->logId, req.tag, logData->persistentDataDurableVersion + 1)), + SERVER_KNOBS->DESIRED_TOTAL_BYTES, + SERVER_KNOBS->DESIRED_TOTAL_BYTES)); + + for (auto& kv : kvs) { + auto ver = decodeTagMessagesKey(kv.key); + messages << VERSION_HEADER << ver; + messages.serializeBytes(kv.value); + } + + if (kvs.expectedSize() >= SERVER_KNOBS->DESIRED_TOTAL_BYTES) { + endVersion = decodeTagMessagesKey(kvs.end()[-1].key) + 1; + onlySpilled = true; + } else { + messages.serializeBytes(messages2.toValue()); + } + } else { + // FIXME: Limit to approximately DESIRED_TOTATL_BYTES somehow. 
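+ // Spilled-by-reference path: read the SpilledData index entries here; the message bytes themselves are fetched from the DiskQueue below.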
+ RangeResult kvrefs = wait(self->persistentData->readRange( + KeyRangeRef( + persistTagMessageRefsKey(logData->logId, req.tag, req.begin), + persistTagMessageRefsKey(logData->logId, req.tag, logData->persistentDataDurableVersion + 1)), + SERVER_KNOBS->TLOG_SPILL_REFERENCE_MAX_BATCHES_PER_PEEK + 1)); + + //TraceEvent("TLogPeekResults", self->dbgid).detail("ForAddress", req.reply.getEndpoint().getPrimaryAddress()).detail("Tag1Results", s1).detail("Tag2Results", s2).detail("Tag1ResultsLim", kv1.size()).detail("Tag2ResultsLim", kv2.size()).detail("Tag1ResultsLast", kv1.size() ? kv1[0].key : "").detail("Tag2ResultsLast", kv2.size() ? kv2[0].key : "").detail("Limited", limited).detail("NextEpoch", next_pos.epoch).detail("NextSeq", next_pos.sequence).detail("NowEpoch", self->epoch()).detail("NowSeq", self->sequence.getNextSequence()); + + state std::vector> commitLocations; + state bool earlyEnd = false; + uint32_t mutationBytes = 0; + state uint64_t commitBytes = 0; + state Version firstVersion = std::numeric_limits::max(); + for (int i = 0; i < kvrefs.size() && i < SERVER_KNOBS->TLOG_SPILL_REFERENCE_MAX_BATCHES_PER_PEEK; i++) { + auto& kv = kvrefs[i]; + VectorRef spilledData; + BinaryReader r(kv.value, AssumeVersion(logData->protocolVersion)); + r >> spilledData; + for (const SpilledData& sd : spilledData) { + if (mutationBytes >= SERVER_KNOBS->DESIRED_TOTAL_BYTES) { + earlyEnd = true; + break; + } + if (sd.version >= req.begin) { + firstVersion = std::min(firstVersion, sd.version); + const IDiskQueue::location end = sd.start.lo + sd.length; + commitLocations.emplace_back(sd.start, end); + // This isn't perfect, because we aren't accounting for page boundaries, but should be + // close enough. + commitBytes += sd.length; + mutationBytes += sd.mutationBytes; + } + } + if (earlyEnd) + break; + } + earlyEnd = earlyEnd || (kvrefs.size() >= SERVER_KNOBS->TLOG_SPILL_REFERENCE_MAX_BATCHES_PER_PEEK + 1); + wait(self->peekMemoryLimiter.take(TaskPriority::TLogSpilledPeekReply, commitBytes)); + state FlowLock::Releaser memoryReservation(self->peekMemoryLimiter, commitBytes); + state std::vector>> messageReads; + messageReads.reserve(commitLocations.size()); + for (const auto& pair : commitLocations) { + messageReads.push_back(self->rawPersistentQueue->read(pair.first, pair.second, CheckHashes::True)); + } + commitLocations.clear(); + wait(waitForAll(messageReads)); + + state Version lastRefMessageVersion = 0; + state int index = 0; + loop { + if (index >= messageReads.size()) + break; + Standalone queueEntryData = messageReads[index].get(); + uint8_t valid; + const uint32_t length = *(uint32_t*)queueEntryData.begin(); + queueEntryData = queueEntryData.substr(4, queueEntryData.size() - 4); + BinaryReader rd(queueEntryData, IncludeVersion()); + state TLogQueueEntry entry; + rd >> entry >> valid; + ASSERT(valid == 0x01); + ASSERT(length + sizeof(valid) == queueEntryData.size()); + + messages << VERSION_HEADER << entry.version; + + std::vector rawMessages = + wait(parseMessagesForTag(entry.messages, req.tag, logData->logRouterTags)); + for (const StringRef& msg : rawMessages) { + messages.serializeBytes(msg); + } + + lastRefMessageVersion = entry.version; + index++; + } + + messageReads.clear(); + memoryReservation.release(); + + if (earlyEnd) { + endVersion = lastRefMessageVersion + 1; + onlySpilled = true; + } else { + messages.serializeBytes(messages2.toValue()); + } + } + } else { + if (req.onlySpilled) { + endVersion = logData->persistentDataDurableVersion + 1; + } else { + 
peekMessagesFromMemory(logData, req.tag, req.begin, messages, endVersion); + } + + //TraceEvent("TLogPeekResults", self->dbgid).detail("ForAddress", req.reply.getEndpoint().getPrimaryAddress()).detail("MessageBytes", messages.getLength()).detail("NextEpoch", next_pos.epoch).detail("NextSeq", next_pos.sequence).detail("NowSeq", self->sequence.getNextSequence()); + } + + TLogPeekReply reply; + reply.maxKnownVersion = logData->version.get(); + reply.minKnownCommittedVersion = logData->minKnownCommittedVersion; + reply.messages = messages.toValue(); + reply.end = endVersion; + reply.onlySpilled = onlySpilled; + + //TraceEvent("TlogPeek", self->dbgid).detail("LogId", logData->logId).detail("EndVer", reply.end).detail("MsgBytes", reply.messages.expectedSize()).detail("ForAddress", req.reply.getEndpoint().getPrimaryAddress()); + + if (req.sequence.present()) { + auto& trackerData = logData->peekTracker[peekId]; + trackerData.lastUpdate = now(); + + double queueT = blockStart - queueStart; + double blockT = workStart - blockStart; + double workT = now() - workStart; + + trackerData.totalPeeks++; + trackerData.replyBytes += reply.messages.size(); + + if (queueT > trackerData.queueMax) + trackerData.queueMax = queueT; + if (blockT > trackerData.blockMax) + trackerData.blockMax = blockT; + if (workT > trackerData.workMax) + trackerData.workMax = workT; + + trackerData.queueTime += queueT; + trackerData.blockTime += blockT; + trackerData.workTime += workT; + + auto& sequenceData = trackerData.sequence_version[sequence + 1]; + if (trackerData.sequence_version.size() && sequence + 1 < trackerData.sequence_version.begin()->first) { + req.reply.sendError(operation_obsolete()); + if (!sequenceData.isSet()) + sequenceData.sendError(operation_obsolete()); + return Void(); + } + if (sequenceData.isSet()) { + trackerData.duplicatePeeks++; + if (sequenceData.getFuture().get().first != reply.end) { + TEST(true); // tlog peek second attempt ended at a different version (2) + req.reply.sendError(operation_obsolete()); + return Void(); + } + } else { + sequenceData.send(std::make_pair(reply.end, reply.onlySpilled)); + } + reply.begin = req.begin; + } + + req.reply.send(reply); return Void(); } diff --git a/fdbserver/TLogServer.actor.cpp b/fdbserver/TLogServer.actor.cpp index f172a496da..3ca0926135 100644 --- a/fdbserver/TLogServer.actor.cpp +++ b/fdbserver/TLogServer.actor.cpp @@ -1587,7 +1587,8 @@ ACTOR Future> parseMessagesForTag(StringRef commitBlob, T } // Common logics to peek TLog and create TLogPeekReply that serves both streaming peek or normal peek request -ACTOR Future peekTLog(TLogData* self, +ACTOR template +Future peekTLogAndSend(PromiseType replyPromise, TLogData* self, Reference logData, Version begin, Tag tag, @@ -1607,50 +1608,60 @@ ACTOR Future peekTLog(TLogData* self, // TODO: once the fake stream is replaced by ReplyPromiseStream, we can remove the code handling sequence requests // STEP: a. mark obsolete sequence requests; b. 
wait previous sequence requests are handled in order if (sequence.present()) { - peekId = sequence.get().first; - sequenceNum = sequence.get().second; - if (sequenceNum >= SERVER_KNOBS->PARALLEL_GET_MORE_REQUESTS && - logData->peekTracker.find(peekId) == logData->peekTracker.end()) { - throw operation_obsolete(); - } - auto& trackerData = logData->peekTracker[peekId]; - if (sequenceNum == 0 && trackerData.sequence_version.find(0) == trackerData.sequence_version.end()) { - trackerData.tag = tag; - trackerData.sequence_version[0].send(std::make_pair(begin, reqOnlySpilled)); - } - auto seqBegin = trackerData.sequence_version.begin(); - // The peek cursor and this comparison need to agree about the maximum number of in-flight requests. - while (trackerData.sequence_version.size() && - seqBegin->first <= sequenceNum - SERVER_KNOBS->PARALLEL_GET_MORE_REQUESTS) { - if (seqBegin->second.canBeSet()) { - seqBegin->second.sendError(operation_obsolete()); + try { + peekId = sequence.get().first; + sequenceNum = sequence.get().second; + if (sequenceNum >= SERVER_KNOBS->PARALLEL_GET_MORE_REQUESTS && + logData->peekTracker.find(peekId) == logData->peekTracker.end()) { + throw operation_obsolete(); + } + auto& trackerData = logData->peekTracker[peekId]; + if (sequenceNum == 0 && trackerData.sequence_version.find(0) == trackerData.sequence_version.end()) { + trackerData.tag = tag; + trackerData.sequence_version[0].send(std::make_pair(begin, reqOnlySpilled)); + } + auto seqBegin = trackerData.sequence_version.begin(); + // The peek cursor and this comparison need to agree about the maximum number of in-flight requests. + while (trackerData.sequence_version.size() && + seqBegin->first <= sequenceNum - SERVER_KNOBS->PARALLEL_GET_MORE_REQUESTS) { + if (seqBegin->second.canBeSet()) { + seqBegin->second.sendError(operation_obsolete()); + } + trackerData.sequence_version.erase(seqBegin); + seqBegin = trackerData.sequence_version.begin(); + } + + if (trackerData.sequence_version.size() && sequenceNum < seqBegin->first) { + throw operation_obsolete(); + } + + Future> fPrevPeekData = trackerData.sequence_version[sequenceNum].getFuture(); + if (fPrevPeekData.isReady()) { + trackerData.unblockedPeeks++; + double t = now() - trackerData.lastUpdate; + if (t > trackerData.idleMax) + trackerData.idleMax = t; + trackerData.idleTime += t; + } + trackerData.lastUpdate = now(); + std::pair prevPeekData = wait(fPrevPeekData); + begin = std::max(prevPeekData.first, begin); + reqOnlySpilled = prevPeekData.second; + wait(yield()); + } catch (Error& e) { + if (e.code() == error_code_timed_out || e.code() == error_code_operation_obsolete) { + replyPromise.sendError(e); + return Void(); + } else { + throw; } - trackerData.sequence_version.erase(seqBegin); - seqBegin = trackerData.sequence_version.begin(); } - - if (trackerData.sequence_version.size() && sequenceNum < seqBegin->first) { - throw operation_obsolete(); - } - - Future> fPrevPeekData = trackerData.sequence_version[sequenceNum].getFuture(); - if (fPrevPeekData.isReady()) { - trackerData.unblockedPeeks++; - double t = now() - trackerData.lastUpdate; - if (t > trackerData.idleMax) - trackerData.idleMax = t; - trackerData.idleTime += t; - } - trackerData.lastUpdate = now(); - std::pair prevPeekData = wait(fPrevPeekData); - begin = std::max(prevPeekData.first, begin); - reqOnlySpilled = prevPeekData.second; - wait(yield()); } state double blockStart = now(); if (returnIfBlocked && logData->version.get() < begin) { + replyPromise.sendError(end_of_stream()); if 
(sequence.present()) { auto& trackerData = logData->peekTracker[peekId]; auto& sequenceData = trackerData.sequence_version[sequenceNum + 1]; @@ -1659,7 +1670,7 @@ ACTOR Future peekTLog(TLogData* self, sequenceData.send(std::make_pair(begin, reqOnlySpilled)); } } - throw end_of_stream(); + return Void(); } //TraceEvent("TLogPeekMessages0", self->dbgid).detail("ReqBeginEpoch", req.begin.epoch).detail("ReqBeginSeq", req.begin.sequence).detail("Epoch", self->epoch()).detail("PersistentDataSeq", self->persistentDataSequence).detail("Tag1", req.tag1).detail("Tag2", req.tag2); @@ -1703,14 +1714,16 @@ ACTOR Future peekTLog(TLogData* self, auto& sequenceData = trackerData.sequence_version[sequenceNum + 1]; trackerData.lastUpdate = now(); if (trackerData.sequence_version.size() && sequenceNum + 1 < trackerData.sequence_version.begin()->first) { + replyPromise.sendError(operation_obsolete()); if (!sequenceData.isSet()) sequenceData.sendError(operation_obsolete()); - throw operation_obsolete(); + return Void(); } if (sequenceData.isSet()) { if (sequenceData.getFuture().get().first != rep.end) { - TEST(true); // 1 tlog peek second attempt ended at a different version - throw operation_obsolete(); + TEST(true); // xz tlog peek second attempt ended at a different version + replyPromise.sendError(operation_obsolete()); + return Void(); } } else { sequenceData.send(std::make_pair(rep.end, rep.onlySpilled)); @@ -1718,7 +1731,8 @@ ACTOR Future peekTLog(TLogData* self, rep.begin = begin; } - return rep; + replyPromise.send(rep); + return Void(); } state Version endVersion = logData->version.get() + 1; @@ -1893,6 +1907,7 @@ ACTOR Future peekTLog(TLogData* self, auto& sequenceData = trackerData.sequence_version[sequenceNum + 1]; if (trackerData.sequence_version.size() && sequenceNum + 1 < trackerData.sequence_version.begin()->first) { + replyPromise.sendError(operation_obsolete()); if (!sequenceData.isSet()) { // It would technically be more correct to .send({req.begin, req.onlySpilled}), as the next // request might still be in the window of active requests, but LogSystemPeekCursor will @@ -1900,13 +1915,14 @@ ACTOR Future peekTLog(TLogData* self, // response will probably be a waste of CPU. 
                     sequenceData.sendError(operation_obsolete());
                 }
-                throw operation_obsolete();
+                return Void();
             }
             if (sequenceData.isSet()) {
                 trackerData.duplicatePeeks++;
                 if (sequenceData.getFuture().get().first != reply.end) {
-                    TEST(true); // 1 tlog peek second attempt ended at a different version (2)
-                    throw operation_obsolete();
+                    TEST(true); // xz tlog peek second attempt ended at a different version (2)
+                    replyPromise.sendError(operation_obsolete());
+                    return Void();
                 }
             } else {
                 sequenceData.send(std::make_pair(reply.end, reply.onlySpilled));
@@ -1914,24 +1930,30 @@ ACTOR Future<TLogPeekReply> peekTLog(TLogData* self,
         reply.begin = begin;
     }
 
-    return reply;
+    replyPromise.send(reply);
+    return Void();
 }
 
+// This actor keeps pushing TLogPeekStreamReply until it's removed from the cluster or should recover
 ACTOR Future<Void> tLogPeekStream(TLogData* self, TLogPeekStreamRequest req, Reference<LogData> logData) {
     self->activePeekStreams++;
 
     state Version begin = req.begin;
     state bool onlySpilled = false;
-    if (req.tag.locality == tagLocalityTxs && req.tag.id >= logData->txsTags && logData->txsTags > 0) {
-        req.tag.id = req.tag.id % logData->txsTags;
-    }
     req.reply.setByteLimit(std::min(SERVER_KNOBS->MAXIMUM_PEEK_BYTES, req.limitBytes));
     loop {
         state TLogPeekStreamReply reply;
+        state Promise<TLogPeekReply> promise;
+        state Future<TLogPeekReply> future(promise.getFuture());
         try {
-            wait(req.reply.onReady() &&
-                 store(reply.rep, peekTLog(self, logData, begin, req.tag, req.returnIfBlocked, onlySpilled)));
+            wait(req.reply.onReady() && peekTLogAndSend(promise, self, logData, begin, req.tag, req.returnIfBlocked, onlySpilled));
+            ASSERT(future.isReady());
+            if (future.isError()) {
+                throw future.getError();
+            }
+
+            reply.rep = future.get();
             req.reply.send(reply);
             begin = reply.rep.end;
             onlySpilled = reply.rep.onlySpilled;
@@ -1955,23 +1977,342 @@ ACTOR Future<Void> tLogPeekStream(TLogData* self, TLogPeekStreamRequest req, Ref
     }
 }
 
 ACTOR Future<Void> tLogPeekMessages(TLogData* self, TLogPeekRequest req, Reference<LogData> logData) {
-    try {
-        TLogPeekReply reply =
-            wait(peekTLog(self, logData, req.begin, req.tag, req.returnIfBlocked, req.onlySpilled, req.sequence));
-        req.reply.send(reply);
-    } catch (Error& e) {
-        if (e.code() == error_code_timed_out || e.code() == error_code_operation_obsolete ||
-            e.code() == error_code_end_of_stream) {
-            req.reply.sendError(e);
-            return Void();
-        } else {
-            throw;
+    state BinaryWriter messages(Unversioned());
+    state BinaryWriter messages2(Unversioned());
+    state int sequence = -1;
+    state UID peekId;
+    state double queueStart = now();
+
+    if (req.tag.locality == tagLocalityTxs && req.tag.id >= logData->txsTags && logData->txsTags > 0) {
+        req.tag.id = req.tag.id % logData->txsTags;
+    }
+
+    if (req.sequence.present()) {
+        try {
+            peekId = req.sequence.get().first;
+            sequence = req.sequence.get().second;
+            if (sequence >= SERVER_KNOBS->PARALLEL_GET_MORE_REQUESTS &&
+                logData->peekTracker.find(peekId) == logData->peekTracker.end()) {
+                throw operation_obsolete();
+            }
+            auto& trackerData = logData->peekTracker[peekId];
+            if (sequence == 0 && trackerData.sequence_version.find(0) == trackerData.sequence_version.end()) {
+                trackerData.tag = req.tag;
+                trackerData.sequence_version[0].send(std::make_pair(req.begin, req.onlySpilled));
+            }
+            auto seqBegin = trackerData.sequence_version.begin();
+            // The peek cursor and this comparison need to agree about the maximum number of in-flight requests.
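+            // Entries more than PARALLEL_GET_MORE_REQUESTS behind the newest sequence can no longer be
+            // consumed in order; fail them and evict them from the tracker.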
+            while (trackerData.sequence_version.size() &&
+                   seqBegin->first <= sequence - SERVER_KNOBS->PARALLEL_GET_MORE_REQUESTS) {
+                if (seqBegin->second.canBeSet()) {
+                    seqBegin->second.sendError(operation_obsolete());
+                }
+                trackerData.sequence_version.erase(seqBegin);
+                seqBegin = trackerData.sequence_version.begin();
+            }
+
+            if (trackerData.sequence_version.size() && sequence < seqBegin->first) {
+                throw operation_obsolete();
+            }
+
+            Future<std::pair<Version, bool>> fPrevPeekData = trackerData.sequence_version[sequence].getFuture();
+            if (fPrevPeekData.isReady()) {
+                trackerData.unblockedPeeks++;
+                double t = now() - trackerData.lastUpdate;
+                if (t > trackerData.idleMax)
+                    trackerData.idleMax = t;
+                trackerData.idleTime += t;
+            }
+            trackerData.lastUpdate = now();
+            std::pair<Version, bool> prevPeekData = wait(fPrevPeekData);
+            req.begin = std::max(prevPeekData.first, req.begin);
+            req.onlySpilled = prevPeekData.second;
+            wait(yield());
+        } catch (Error& e) {
+            if (e.code() == error_code_timed_out || e.code() == error_code_operation_obsolete) {
+                req.reply.sendError(e);
+                return Void();
+            } else {
+                throw;
+            }
+        }
+    }
+
+    state double blockStart = now();
+
+    if (req.returnIfBlocked && logData->version.get() < req.begin) {
+        req.reply.sendError(end_of_stream());
+        if (req.sequence.present()) {
+            auto& trackerData = logData->peekTracker[peekId];
+            auto& sequenceData = trackerData.sequence_version[sequence + 1];
+            trackerData.lastUpdate = now();
+            if (!sequenceData.isSet()) {
+                sequenceData.send(std::make_pair(req.begin, req.onlySpilled));
+            }
+        }
+        return Void();
+    }
+
+    //TraceEvent("TLogPeekMessages0", self->dbgid).detail("ReqBeginEpoch", req.begin.epoch).detail("ReqBeginSeq", req.begin.sequence).detail("Epoch", self->epoch()).detail("PersistentDataSeq", self->persistentDataSequence).detail("Tag1", req.tag1).detail("Tag2", req.tag2);
+    // Wait until we have something to return that the caller doesn't already have
+    if (logData->version.get() < req.begin) {
+        wait(logData->version.whenAtLeast(req.begin));
+        wait(delay(SERVER_KNOBS->TLOG_PEEK_DELAY, g_network->getCurrentTask()));
+    }
+
+    if (logData->locality != tagLocalitySatellite && req.tag.locality == tagLocalityLogRouter) {
+        wait(self->concurrentLogRouterReads.take());
+        state FlowLock::Releaser globalReleaser(self->concurrentLogRouterReads);
+        wait(delay(0.0, TaskPriority::Low));
+    }
+
+    if (req.begin <= logData->persistentDataDurableVersion && req.tag.locality != tagLocalityTxs && req.tag != txsTag) {
+        // Reading spilled data will almost always imply that the storage server is >5s behind the rest
+        // of the cluster. We shouldn't prioritize spending CPU on helping this server catch up
+        // slightly faster over keeping the rest of the cluster operating normally.
+        // txsTag is only ever peeked on recovery, and we would still wish to prioritize requests
+        // that impact recovery duration.
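+        // Re-enter the run loop at TLogSpilledPeekReply priority so normal-priority TLog work is served first.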
+		wait(delay(0, TaskPriority::TLogSpilledPeekReply));
+	}
+
+	state double workStart = now();
+
+	Version poppedVer = poppedVersion(logData, req.tag);
+	if (poppedVer > req.begin) {
+		TLogPeekReply rep;
+		rep.maxKnownVersion = logData->version.get();
+		rep.minKnownCommittedVersion = logData->minKnownCommittedVersion;
+		rep.popped = poppedVer;
+		rep.end = poppedVer;
+		rep.onlySpilled = false;
+
+		if (req.sequence.present()) {
+			auto& trackerData = logData->peekTracker[peekId];
+			auto& sequenceData = trackerData.sequence_version[sequence + 1];
+			trackerData.lastUpdate = now();
+			if (trackerData.sequence_version.size() && sequence + 1 < trackerData.sequence_version.begin()->first) {
+				req.reply.sendError(operation_obsolete());
+				if (!sequenceData.isSet())
+					sequenceData.sendError(operation_obsolete());
+				return Void();
+			}
+			if (sequenceData.isSet()) {
+				if (sequenceData.getFuture().get().first != rep.end) {
+					TEST(true); // tlog peek second attempt ended at a different version
+					req.reply.sendError(operation_obsolete());
+					return Void();
+				}
+			} else {
+				sequenceData.send(std::make_pair(rep.end, rep.onlySpilled));
+			}
+			rep.begin = req.begin;
+		}
+
+		req.reply.send(rep);
+		return Void();
+	}
+
+	state Version endVersion = logData->version.get() + 1;
+	state bool onlySpilled = false;
+
+	// grab messages from disk
+	//TraceEvent("TLogPeekMessages", self->dbgid).detail("ReqBeginEpoch", req.begin.epoch).detail("ReqBeginSeq", req.begin.sequence).detail("Epoch", self->epoch()).detail("PersistentDataSeq", self->persistentDataSequence).detail("Tag1", req.tag1).detail("Tag2", req.tag2);
+	if (req.begin <= logData->persistentDataDurableVersion) {
+		// Just in case the durable version changes while we are waiting for the read, we grab this data from memory. We
+		// may or may not actually send it depending on whether we get enough data from disk. SOMEDAY: Only do this if
+		// an initial attempt to read from disk results in insufficient data and the required data is no longer in
+		// memory SOMEDAY: Should we only send part of the messages we collected, to actually limit the size of the
+		// result?
+
+		if (req.onlySpilled) {
+			endVersion = logData->persistentDataDurableVersion + 1;
+		} else {
+			peekMessagesFromMemory(logData, req.tag, req.begin, messages2, endVersion);
+		}
+
+		if (logData->shouldSpillByValue(req.tag)) {
+			RangeResult kvs = wait(self->persistentData->readRange(
+			    KeyRangeRef(persistTagMessagesKey(logData->logId, req.tag, req.begin),
+			                persistTagMessagesKey(logData->logId, req.tag, logData->persistentDataDurableVersion + 1)),
+			    SERVER_KNOBS->DESIRED_TOTAL_BYTES,
+			    SERVER_KNOBS->DESIRED_TOTAL_BYTES));
+
+			for (auto& kv : kvs) {
+				auto ver = decodeTagMessagesKey(kv.key);
+				messages << VERSION_HEADER << ver;
+				messages.serializeBytes(kv.value);
+			}
+
+			if (kvs.expectedSize() >= SERVER_KNOBS->DESIRED_TOTAL_BYTES) {
+				endVersion = decodeTagMessagesKey(kvs.end()[-1].key) + 1;
+				onlySpilled = true;
+			} else {
+				messages.serializeBytes(messages2.toValue());
+			}
+		} else {
+			// FIXME: Limit to approximately DESIRED_TOTAL_BYTES somehow.
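The value-spill branch above stops copying rows once the reply nears SERVER_KNOBS->DESIRED_TOTAL_BYTES, clamps endVersion to just past the last row returned, and sets onlySpilled so the next request resumes from disk instead of re-reading memory. A toy version of that accounting (budget, versions, and payloads all invented; the real cutoff is applied by the byte-limited range read itself):

#include <iostream>
#include <string>
#include <vector>

struct Row { long version; std::string payload; };

int main() {
    const size_t kDesiredTotalBytes = 24;
    std::vector<Row> spilled = { { 10, "aaaaaaaaaa" }, { 11, "bbbbbbbbbb" },
                                 { 12, "cccccccccc" }, { 13, "dddddddddd" } };
    std::string messages;
    long endVersion = 14; // logData->version + 1 when everything fits
    bool onlySpilled = false;
    size_t bytes = 0;
    for (const Row& r : spilled) {
        messages += r.payload;
        bytes += r.payload.size();
        if (bytes >= kDesiredTotalBytes) {
            endVersion = r.version + 1; // resume just past the last row sent
            onlySpilled = true;         // skip the in-memory copy next time
            break;
        }
    }
    std::cout << "sent " << bytes << " bytes, endVersion " << endVersion
              << ", onlySpilled " << onlySpilled << "\n";
}

The reference-spill branch, which still lacks an equivalent limit per the FIXME above, follows.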
+ RangeResult kvrefs = wait(self->persistentData->readRange( + KeyRangeRef( + persistTagMessageRefsKey(logData->logId, req.tag, req.begin), + persistTagMessageRefsKey(logData->logId, req.tag, logData->persistentDataDurableVersion + 1)), + SERVER_KNOBS->TLOG_SPILL_REFERENCE_MAX_BATCHES_PER_PEEK + 1)); + + //TraceEvent("TLogPeekResults", self->dbgid).detail("ForAddress", req.reply.getEndpoint().getPrimaryAddress()).detail("Tag1Results", s1).detail("Tag2Results", s2).detail("Tag1ResultsLim", kv1.size()).detail("Tag2ResultsLim", kv2.size()).detail("Tag1ResultsLast", kv1.size() ? kv1[0].key : "").detail("Tag2ResultsLast", kv2.size() ? kv2[0].key : "").detail("Limited", limited).detail("NextEpoch", next_pos.epoch).detail("NextSeq", next_pos.sequence).detail("NowEpoch", self->epoch()).detail("NowSeq", self->sequence.getNextSequence()); + + state std::vector> commitLocations; + state bool earlyEnd = false; + uint32_t mutationBytes = 0; + state uint64_t commitBytes = 0; + state Version firstVersion = std::numeric_limits::max(); + for (int i = 0; i < kvrefs.size() && i < SERVER_KNOBS->TLOG_SPILL_REFERENCE_MAX_BATCHES_PER_PEEK; i++) { + auto& kv = kvrefs[i]; + VectorRef spilledData; + BinaryReader r(kv.value, AssumeVersion(logData->protocolVersion)); + r >> spilledData; + for (const SpilledData& sd : spilledData) { + if (mutationBytes >= SERVER_KNOBS->DESIRED_TOTAL_BYTES) { + earlyEnd = true; + break; + } + if (sd.version >= req.begin) { + firstVersion = std::min(firstVersion, sd.version); + const IDiskQueue::location end = sd.start.lo + sd.length; + commitLocations.emplace_back(sd.start, end); + // This isn't perfect, because we aren't accounting for page boundaries, but should be + // close enough. + commitBytes += sd.length; + mutationBytes += sd.mutationBytes; + } + } + if (earlyEnd) + break; + } + earlyEnd = earlyEnd || (kvrefs.size() >= SERVER_KNOBS->TLOG_SPILL_REFERENCE_MAX_BATCHES_PER_PEEK + 1); + wait(self->peekMemoryLimiter.take(TaskPriority::TLogSpilledPeekReply, commitBytes)); + state FlowLock::Releaser memoryReservation(self->peekMemoryLimiter, commitBytes); + state std::vector>> messageReads; + messageReads.reserve(commitLocations.size()); + for (const auto& pair : commitLocations) { + messageReads.push_back(self->rawPersistentQueue->read(pair.first, pair.second, CheckHashes::True)); + } + commitLocations.clear(); + wait(waitForAll(messageReads)); + + state Version lastRefMessageVersion = 0; + state int index = 0; + loop { + if (index >= messageReads.size()) + break; + Standalone queueEntryData = messageReads[index].get(); + uint8_t valid; + const uint32_t length = *(uint32_t*)queueEntryData.begin(); + queueEntryData = queueEntryData.substr(4, queueEntryData.size() - 4); + BinaryReader rd(queueEntryData, IncludeVersion()); + state TLogQueueEntry entry; + rd >> entry >> valid; + ASSERT(valid == 0x01); + ASSERT(length + sizeof(valid) == queueEntryData.size()); + + messages << VERSION_HEADER << entry.version; + + std::vector rawMessages = + wait(parseMessagesForTag(entry.messages, req.tag, logData->logRouterTags)); + for (const StringRef& msg : rawMessages) { + messages.serializeBytes(msg); + DEBUG_TAGS_AND_MESSAGE("TLogPeekFromDisk", entry.version, msg) + .detail("UID", self->dbgid) + .detail("LogId", logData->logId) + .detail("PeekTag", req.tag); + } + + lastRefMessageVersion = entry.version; + index++; + } + + messageReads.clear(); + memoryReservation.release(); + + if (earlyEnd) { + endVersion = lastRefMessageVersion + 1; + onlySpilled = true; + } else { + 
messages.serializeBytes(messages2.toValue()); + } + } + } else { + if (req.onlySpilled) { + endVersion = logData->persistentDataDurableVersion + 1; + } else { + peekMessagesFromMemory(logData, req.tag, req.begin, messages, endVersion); + } + + //TraceEvent("TLogPeekResults", self->dbgid).detail("ForAddress", req.reply.getEndpoint().getPrimaryAddress()).detail("MessageBytes", messages.getLength()).detail("NextEpoch", next_pos.epoch).detail("NextSeq", next_pos.sequence).detail("NowSeq", self->sequence.getNextSequence()); + } + + TLogPeekReply reply; + reply.maxKnownVersion = logData->version.get(); + reply.minKnownCommittedVersion = logData->minKnownCommittedVersion; + reply.messages = messages.toValue(); + reply.end = endVersion; + reply.onlySpilled = onlySpilled; + + //TraceEvent("TlogPeek", self->dbgid).detail("LogId", logData->logId).detail("Tag", req.tag.toString()). + // detail("BeginVer", req.begin).detail("EndVer", reply.end). + // detail("MsgBytes", reply.messages.expectedSize()). + // detail("ForAddress", req.reply.getEndpoint().getPrimaryAddress()); + + if (req.sequence.present()) { + auto& trackerData = logData->peekTracker[peekId]; + trackerData.lastUpdate = now(); + + double queueT = blockStart - queueStart; + double blockT = workStart - blockStart; + double workT = now() - workStart; + + trackerData.totalPeeks++; + trackerData.replyBytes += reply.messages.size(); + + if (queueT > trackerData.queueMax) + trackerData.queueMax = queueT; + if (blockT > trackerData.blockMax) + trackerData.blockMax = blockT; + if (workT > trackerData.workMax) + trackerData.workMax = workT; + + trackerData.queueTime += queueT; + trackerData.blockTime += blockT; + trackerData.workTime += workT; + + auto& sequenceData = trackerData.sequence_version[sequence + 1]; + if (trackerData.sequence_version.size() && sequence + 1 < trackerData.sequence_version.begin()->first) { + req.reply.sendError(operation_obsolete()); + if (!sequenceData.isSet()) { + // It would technically be more correct to .send({req.begin, req.onlySpilled}), as the next + // request might still be in the window of active requests, but LogSystemPeekCursor will + // throw away all future responses upon getting an operation_obsolete(), so computing a + // response will probably be a waste of CPU. + sequenceData.sendError(operation_obsolete()); + } + return Void(); + } + if (sequenceData.isSet()) { + trackerData.duplicatePeeks++; + if (sequenceData.getFuture().get().first != reply.end) { + TEST(true); // tlog peek second attempt ended at a different version (2) + req.reply.sendError(operation_obsolete()); + return Void(); + } + } else { + sequenceData.send(std::make_pair(reply.end, reply.onlySpilled)); + } + reply.begin = req.begin; + } + + req.reply.send(reply); return Void(); } + ACTOR Future doQueueCommit(TLogData* self, Reference logData, std::vector> missingFinalCommit) { From 517ff9801d744a3775c8709eefb699a3dcf5fe7b Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Fri, 30 Jul 2021 19:10:13 -0700 Subject: [PATCH 20/29] add information print --- fdbserver/tester.actor.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fdbserver/tester.actor.cpp b/fdbserver/tester.actor.cpp index e819267bb1..e3ecd480e7 100644 --- a/fdbserver/tester.actor.cpp +++ b/fdbserver/tester.actor.cpp @@ -1481,7 +1481,9 @@ ACTOR Future runTests(Reference Date: Sat, 31 Jul 2021 09:07:53 -0700 Subject: [PATCH 21/29] clean 100k simulation test. 
revert changes of fdbrpc.h --- fdbrpc/fdbrpc.h | 16 +- fdbserver/MoveKeys.actor.cpp | 3 +- fdbserver/OldTLogServer_4_6.actor.cpp | 296 ++++---------- fdbserver/OldTLogServer_6_0.actor.cpp | 410 +++++--------------- fdbserver/OldTLogServer_6_2.actor.cpp | 522 +++++-------------------- fdbserver/TLogServer.actor.cpp | 533 +++++--------------------- flow/Knobs.cpp | 2 +- 7 files changed, 376 insertions(+), 1406 deletions(-) diff --git a/fdbrpc/fdbrpc.h b/fdbrpc/fdbrpc.h index c0f39aa0b4..60b4c0168e 100644 --- a/fdbrpc/fdbrpc.h +++ b/fdbrpc/fdbrpc.h @@ -328,16 +328,14 @@ struct NetNotifiedQueueWithAcknowledgements final : NotifiedQueue, NetNotifiedQueueWithAcknowledgements(int futures, int promises, const Endpoint& remoteEndpoint) : NotifiedQueue(futures, promises), FlowReceiver(remoteEndpoint, true) { // A ReplyPromiseStream will be terminated on the server side if the network connection with the client breaks - acknowledgements.failures = - tagError(makeDependent(IFailureMonitor::failureMonitor()).onDisconnectOrFailure(remoteEndpoint), - operation_obsolete()); + acknowledgements.failures = tagError( + makeDependent(IFailureMonitor::failureMonitor()).onDisconnect(remoteEndpoint.getPrimaryAddress()), + operation_obsolete()); } void destroy() override { delete this; } void receive(ArenaObjectReader& reader) override { this->addPromiseRef(); - // TraceEvent(SevDebug, "NetNotifiedQueueWithAcknowledgementsReceive") - // .detail("PromiseRef", this->getPromiseReferenceCount()); ErrorOr> message; reader.deserialize(message); @@ -371,8 +369,6 @@ struct NetNotifiedQueueWithAcknowledgements final : NotifiedQueue, this->send(std::move(message.get().asUnderlyingType())); } this->delPromiseRef(); - // TraceEvent(SevDebug, "NetNotifiedQueueWithAcknowledgementsReceiveEnd") - // .detail("PromiseRef", this->getPromiseReferenceCount()); } T pop() override { @@ -698,18 +694,20 @@ public: template ReplyPromiseStream getReplyStream(const X& value) const { - auto p = getReplyPromiseStream(value); if (queue->isRemoteEndpoint()) { Future disc = makeDependent(IFailureMonitor::failureMonitor()).onDisconnectOrFailure(getEndpoint()); + auto& p = getReplyPromiseStream(value); Reference peer = FlowTransport::transport().sendUnreliable(SerializeSource(value), getEndpoint(), true); // FIXME: defer sending the message until we know the connection is established endStreamOnDisconnect(disc, p, getEndpoint(), peer); + return p; } else { send(value); + auto& p = getReplyPromiseStream(value); + return p; } - return p; } // stream.getReplyUnlessFailedFor( request, double sustainedFailureDuration, double sustainedFailureSlope ) diff --git a/fdbserver/MoveKeys.actor.cpp b/fdbserver/MoveKeys.actor.cpp index 55706e458f..dad17a1a67 100644 --- a/fdbserver/MoveKeys.actor.cpp +++ b/fdbserver/MoveKeys.actor.cpp @@ -815,8 +815,7 @@ ACTOR static Future finishMoveKeys(Database occ, // Wait for a durable quorum of servers in destServers to have keys available (readWrite) // They must also have at least the transaction read version so they can't "forget" the shard - // between - // now and when this transaction commits. + // between now and when this transaction commits. 
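Stepping back to the fdbrpc.h hunk above: a ReplyPromiseStream is flow-controlled by acknowledgements, and NetNotifiedQueueWithAcknowledgements now terminates the stream when the connection actually drops rather than on failure detection. On the sending side, the byte limit installed via setByteLimit is what req.reply.onReady() waits on in the TLog peek loops. A toy credit model of that interaction (byte limit, message sizes, and names invented; no networking):

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <string>

// Credit model of ReplyPromiseStream's acknowledgement flow control.
// ready() plays the role of onReady(): true while unacknowledged bytes
// stay under the configured limit.
struct CreditedStream {
    size_t byteLimit;    // req.reply.setByteLimit(...)
    size_t inFlight = 0; // bytes sent but not yet acknowledged

    bool ready() const { return inFlight < byteLimit; }
    void send(const std::string& msg) { inFlight += msg.size(); }
    void acknowledge(size_t bytes) { inFlight -= std::min(bytes, inFlight); }
};

int main() {
    CreditedStream stream{ /*byteLimit=*/32 };
    std::string reply(20, 'x');
    for (int batch = 0; batch < 3; ++batch) {
        if (!stream.ready()) {
            std::cout << "blocked at batch " << batch << "; waiting for ack\n";
            stream.acknowledge(20); // a client-side ack frees credit
        }
        stream.send(reply);
    }
    std::cout << "in flight at exit: " << stream.inFlight << " bytes\n";
}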
state vector> serverReady; // only for count below state vector> tssReady; // for waiting in parallel with tss state vector tssReadyInterfs; diff --git a/fdbserver/OldTLogServer_4_6.actor.cpp b/fdbserver/OldTLogServer_4_6.actor.cpp index d5fafab6c7..35d142b9f7 100644 --- a/fdbserver/OldTLogServer_4_6.actor.cpp +++ b/fdbserver/OldTLogServer_4_6.actor.cpp @@ -975,51 +975,62 @@ void peekMessagesFromMemory(Reference self, } // Common logics to peek TLog and create TLogPeekReply that serves both streaming peek or normal peek request -ACTOR Future peekTLog(TLogData* self, - Reference logData, - Version begin, - Tag tag, - bool returnIfBlocked = false, - bool reqOnlySpilled = false, - Optional> sequence = Optional>()) { +ACTOR template +Future tLogPeekMessages(PromiseType replyPromise, + TLogData* self, + Reference logData, + Version reqBegin, + Tag reqTag, + bool reqReturnIfBlocked = false, + bool reqOnlySpilled = false, + Optional> reqSequence = Optional>()) { state BinaryWriter messages(Unversioned()); state BinaryWriter messages2(Unversioned()); - state int sequenceNum = -1; + state int sequence = -1; state UID peekId; - state double queueStart = now(); - state OldTag oldTag = convertTag(tag); + state OldTag oldTag = convertTag(reqTag); - if (sequence.present()) { - peekId = sequence.get().first; - sequenceNum = sequence.get().second; - if (sequenceNum >= SERVER_KNOBS->PARALLEL_GET_MORE_REQUESTS && - self->peekTracker.find(peekId) == self->peekTracker.end()) { - throw operation_obsolete(); - } - if (sequenceNum > 0) { - auto& trackerData = self->peekTracker[peekId]; - trackerData.lastUpdate = now(); - Version ver = wait(trackerData.sequence_version[sequenceNum].getFuture()); - begin = std::max(ver, begin); - wait(yield()); + if (reqSequence.present()) { + try { + peekId = reqSequence.get().first; + sequence = reqSequence.get().second; + if (sequence >= SERVER_KNOBS->PARALLEL_GET_MORE_REQUESTS && + self->peekTracker.find(peekId) == self->peekTracker.end()) { + throw operation_obsolete(); + } + if (sequence > 0) { + auto& trackerData = self->peekTracker[peekId]; + trackerData.lastUpdate = now(); + Version ver = wait(trackerData.sequence_version[sequence].getFuture()); + reqBegin = std::max(ver, reqBegin); + wait(yield()); + } + } catch (Error& e) { + if (e.code() == error_code_timed_out || e.code() == error_code_operation_obsolete) { + replyPromise.sendError(e); + return Void(); + } else { + throw; + } } } - if (returnIfBlocked && logData->version.get() < begin) { - throw end_of_stream(); + if (reqReturnIfBlocked && logData->version.get() < reqBegin) { + replyPromise.sendError(end_of_stream()); + return Void(); } - //TraceEvent("TLogPeekMessages0", self->dbgid).detail("ReqBeginEpoch", req.begin.epoch).detail("ReqBeginSeq", req.begin.sequence).detail("Epoch", self->epoch()).detail("PersistentDataSeq", self->persistentDataSequence).detail("Tag1", req.tag1).detail("Tag2", req.tag2); + //TraceEvent("TLogPeekMessages0", self->dbgid).detail("ReqBeginEpoch", reqBegin.epoch).detail("ReqBeginSeq", reqBegin.sequence).detail("Epoch", self->epoch()).detail("PersistentDataSeq", self->persistentDataSequence).detail("Tag1", reqTag1).detail("Tag2", reqTag2); // Wait until we have something to return that the caller doesn't already have - if (logData->version.get() < begin) { - wait(logData->version.whenAtLeast(begin)); + if (logData->version.get() < reqBegin) { + wait(logData->version.whenAtLeast(reqBegin)); wait(delay(SERVER_KNOBS->TLOG_PEEK_DELAY, g_network->getCurrentTask())); } state Version endVersion = 
logData->version.get() + 1; Version poppedVer = poppedVersion(logData, oldTag); - if (poppedVer > begin) { + if (poppedVer > reqBegin) { TLogPeekReply rep; rep.maxKnownVersion = logData->version.get(); rep.minKnownCommittedVersion = 0; @@ -1027,47 +1038,50 @@ ACTOR Future peekTLog(TLogData* self, rep.end = poppedVer; rep.onlySpilled = false; - if (sequence.present()) { + if (reqSequence.present()) { auto& trackerData = self->peekTracker[peekId]; - auto& sequenceData = trackerData.sequence_version[sequenceNum + 1]; + auto& sequenceData = trackerData.sequence_version[sequence + 1]; trackerData.lastUpdate = now(); - if (trackerData.sequence_version.size() && sequenceNum + 1 < trackerData.sequence_version.begin()->first) { + if (trackerData.sequence_version.size() && sequence + 1 < trackerData.sequence_version.begin()->first) { + replyPromise.sendError(operation_obsolete()); if (!sequenceData.isSet()) sequenceData.sendError(operation_obsolete()); - throw operation_obsolete(); + return Void(); } if (sequenceData.isSet()) { if (sequenceData.getFuture().get() != rep.end) { - TEST(true); // 0 tlog peek second attempt ended at a different version - throw operation_obsolete(); + TEST(true); // tlog peek second attempt ended at a different version + replyPromise.sendError(operation_obsolete()); + return Void(); } } else { sequenceData.send(rep.end); } - rep.begin = begin; + rep.begin = reqBegin; } - return rep; + replyPromise.send(rep); + return Void(); } // grab messages from disk - //TraceEvent("TLogPeekMessages", self->dbgid).detail("ReqBeginEpoch", req.begin.epoch).detail("ReqBeginSeq", req.begin.sequence).detail("Epoch", self->epoch()).detail("PersistentDataSeq", self->persistentDataSequence).detail("Tag1", req.tag1).detail("Tag2", req.tag2); - if (begin <= logData->persistentDataDurableVersion) { + //TraceEvent("TLogPeekMessages", self->dbgid).detail("ReqBeginEpoch", reqBegin.epoch).detail("ReqBeginSeq", reqBegin.sequence).detail("Epoch", self->epoch()).detail("PersistentDataSeq", self->persistentDataSequence).detail("Tag1", reqTag1).detail("Tag2", reqTag2); + if (reqBegin <= logData->persistentDataDurableVersion) { // Just in case the durable version changes while we are waiting for the read, we grab this data from memory. We // may or may not actually send it depending on whether we get enough data from disk. SOMEDAY: Only do this if // an initial attempt to read from disk results in insufficient data and the required data is no longer in // memory SOMEDAY: Should we only send part of the messages we collected, to actually limit the size of the // result? - peekMessagesFromMemory(logData, tag, begin, messages2, endVersion); + peekMessagesFromMemory(logData, reqTag, reqBegin, messages2, endVersion); RangeResult kvs = wait(self->persistentData->readRange( - KeyRangeRef(persistTagMessagesKey(logData->logId, oldTag, begin), - persistTagMessagesKey(logData->logId, oldTag, logData->persistentDataDurableVersion + 1)), - SERVER_KNOBS->DESIRED_TOTAL_BYTES, - SERVER_KNOBS->DESIRED_TOTAL_BYTES)); + KeyRangeRef(persistTagMessagesKey(logData->logId, oldTag, reqBegin), + persistTagMessagesKey(logData->logId, oldTag, logData->persistentDataDurableVersion + 1)), + SERVER_KNOBS->DESIRED_TOTAL_BYTES, + SERVER_KNOBS->DESIRED_TOTAL_BYTES)); - //TraceEvent("TLogPeekResults", self->dbgid).detail("ForAddress", reply.getEndpoint().getPrimaryAddress()).detail("Tag1Results", s1).detail("Tag2Results", s2).detail("Tag1ResultsLim", kv1.size()).detail("Tag2ResultsLim", kv2.size()).detail("Tag1ResultsLast", kv1.size() ? 
kv1[0].key : "").detail("Tag2ResultsLast", kv2.size() ? kv2[0].key : "").detail("Limited", limited).detail("NextEpoch", next_pos.epoch).detail("NextSeq", next_pos.sequence).detail("NowEpoch", self->epoch()).detail("NowSeq", self->sequence.getNextSequence()); + //TraceEvent("TLogPeekResults", self->dbgid).detail("ForAddress", replyPromise.getEndpoint().getPrimaryAddress()).detail("Tag1Results", s1).detail("Tag2Results", s2).detail("Tag1ResultsLim", kv1.size()).detail("Tag2ResultsLim", kv2.size()).detail("Tag1ResultsLast", kv1.size() ? kv1[0].key : "").detail("Tag2ResultsLast", kv2.size() ? kv2[0].key : "").detail("Limited", limited).detail("NextEpoch", next_pos.epoch).detail("NextSeq", next_pos.sequence).detail("NowEpoch", self->epoch()).detail("NowSeq", self->sequence.getNextSequence()); for (auto& kv : kvs) { auto ver = decodeTagMessagesKey(kv.key); @@ -1079,7 +1093,7 @@ ACTOR Future peekTLog(TLogData* self, uint32_t subVersion; rd >> messageLength >> subVersion; messageLength += sizeof(uint16_t) + sizeof(Tag); - messages << messageLength << subVersion << uint16_t(1) << tag; + messages << messageLength << subVersion << uint16_t(1) << reqTag; messageLength -= (sizeof(subVersion) + sizeof(uint16_t) + sizeof(Tag)); messages.serializeBytes(rd.readBytes(messageLength), messageLength); } @@ -1090,8 +1104,8 @@ ACTOR Future peekTLog(TLogData* self, else messages.serializeBytes(messages2.toValue()); } else { - peekMessagesFromMemory(logData, tag, begin, messages, endVersion); - //TraceEvent("TLogPeekResults", self->dbgid).detail("ForAddress", reply.getEndpoint().getPrimaryAddress()).detail("MessageBytes", messages.getLength()).detail("NextEpoch", next_pos.epoch).detail("NextSeq", next_pos.sequence).detail("NowSeq", self->sequence.getNextSequence()); + peekMessagesFromMemory(logData, reqTag, reqBegin, messages, endVersion); + //TraceEvent("TLogPeekResults", self->dbgid).detail("ForAddress", replyPromise.getEndpoint().getPrimaryAddress()).detail("MessageBytes", messages.getLength()).detail("NextEpoch", next_pos.epoch).detail("NextSeq", next_pos.sequence).detail("NowSeq", self->sequence.getNextSequence()); } TLogPeekReply reply; @@ -1101,40 +1115,48 @@ ACTOR Future peekTLog(TLogData* self, reply.messages = StringRef(reply.arena, messages.toValue()); reply.end = endVersion; - //TraceEvent("TlogPeek", self->dbgid).detail("LogId", logData->logId).detail("EndVer", reply.end).detail("MsgBytes", reply.messages.expectedSize()).detail("ForAddress", req.reply.getEndpoint().getPrimaryAddress()); + //TraceEvent("TlogPeek", self->dbgid).detail("LogId", logData->logId).detail("EndVer", reply.end).detail("MsgBytes", reply.messages.expectedSize()).detail("ForAddress", replyPromise.getEndpoint().getPrimaryAddress()); - if (sequence.present()) { + if (reqSequence.present()) { auto& trackerData = self->peekTracker[peekId]; trackerData.lastUpdate = now(); - auto& sequenceData = trackerData.sequence_version[sequenceNum + 1]; + auto& sequenceData = trackerData.sequence_version[sequence + 1]; if (sequenceData.isSet()) { if (sequenceData.getFuture().get() != reply.end) { - TEST(true); // 0 tlog peek second attempt ended at a different version (2) - throw operation_obsolete(); + TEST(true); // tlog peek second attempt ended at a different version (2) + replyPromise.sendError(operation_obsolete()); + return Void(); } } else { sequenceData.send(reply.end); } - reply.begin = begin; + reply.begin = reqBegin; } - return reply; + replyPromise.send(reply); + return Void(); } // This actor keep pushing TLogPeekStreamReply until 
it's removed from the cluster or should recover ACTOR Future tLogPeekStream(TLogData* self, TLogPeekStreamRequest req, Reference logData) { self->activePeekStreams++; - TraceEvent(SevDebug, "TLogPeekStream", logData->logId).detail("Token", req.reply.getEndpoint().token); state Version begin = req.begin; state bool onlySpilled = false; - req.reply.setByteLimit(std::min(SERVER_KNOBS->MAXIMUM_PEEK_BYTES, req.limitBytes)); loop { state TLogPeekStreamReply reply; + state Promise promise; + state Future future(promise.getFuture()); try { wait(req.reply.onReady() && - store(reply.rep, peekTLog(self, logData, begin, req.tag, req.returnIfBlocked, onlySpilled))); + tLogPeekMessages(promise, self, logData, begin, req.tag, req.returnIfBlocked, onlySpilled)); + ASSERT(future.isReady()); + if (future.isError()) { + throw future.getError(); + } + + reply.rep = future.get(); req.reply.send(reply); begin = reply.rep.end; onlySpilled = reply.rep.onlySpilled; @@ -1157,161 +1179,6 @@ ACTOR Future tLogPeekStream(TLogData* self, TLogPeekStreamRequest req, Ref } } - -ACTOR Future tLogPeekMessages(TLogData* self, TLogPeekRequest req, Reference logData) { - state BinaryWriter messages(Unversioned()); - state BinaryWriter messages2(Unversioned()); - state int sequence = -1; - state UID peekId; - state OldTag oldTag = convertTag(req.tag); - - if (req.sequence.present()) { - try { - peekId = req.sequence.get().first; - sequence = req.sequence.get().second; - if (sequence >= SERVER_KNOBS->PARALLEL_GET_MORE_REQUESTS && - self->peekTracker.find(peekId) == self->peekTracker.end()) { - throw operation_obsolete(); - } - if (sequence > 0) { - auto& trackerData = self->peekTracker[peekId]; - trackerData.lastUpdate = now(); - Version ver = wait(trackerData.sequence_version[sequence].getFuture()); - req.begin = std::max(ver, req.begin); - wait(yield()); - } - } catch (Error& e) { - if (e.code() == error_code_timed_out || e.code() == error_code_operation_obsolete) { - req.reply.sendError(e); - return Void(); - } else { - throw; - } - } - } - - if (req.returnIfBlocked && logData->version.get() < req.begin) { - req.reply.sendError(end_of_stream()); - return Void(); - } - - //TraceEvent("TLogPeekMessages0", self->dbgid).detail("ReqBeginEpoch", req.begin.epoch).detail("ReqBeginSeq", req.begin.sequence).detail("Epoch", self->epoch()).detail("PersistentDataSeq", self->persistentDataSequence).detail("Tag1", req.tag1).detail("Tag2", req.tag2); - // Wait until we have something to return that the caller doesn't already have - if (logData->version.get() < req.begin) { - wait(logData->version.whenAtLeast(req.begin)); - wait(delay(SERVER_KNOBS->TLOG_PEEK_DELAY, g_network->getCurrentTask())); - } - - state Version endVersion = logData->version.get() + 1; - - Version poppedVer = poppedVersion(logData, oldTag); - if (poppedVer > req.begin) { - TLogPeekReply rep; - rep.maxKnownVersion = logData->version.get(); - rep.minKnownCommittedVersion = 0; - rep.popped = poppedVer; - rep.end = poppedVer; - rep.onlySpilled = false; - - if (req.sequence.present()) { - auto& trackerData = self->peekTracker[peekId]; - auto& sequenceData = trackerData.sequence_version[sequence + 1]; - trackerData.lastUpdate = now(); - if (trackerData.sequence_version.size() && sequence + 1 < trackerData.sequence_version.begin()->first) { - req.reply.sendError(operation_obsolete()); - if (!sequenceData.isSet()) - sequenceData.sendError(operation_obsolete()); - return Void(); - } - if (sequenceData.isSet()) { - if (sequenceData.getFuture().get() != rep.end) { - TEST(true); // 
tlog peek second attempt ended at a different version - req.reply.sendError(operation_obsolete()); - return Void(); - } - } else { - sequenceData.send(rep.end); - } - rep.begin = req.begin; - } - - req.reply.send(rep); - return Void(); - } - - // grab messages from disk - //TraceEvent("TLogPeekMessages", self->dbgid).detail("ReqBeginEpoch", req.begin.epoch).detail("ReqBeginSeq", req.begin.sequence).detail("Epoch", self->epoch()).detail("PersistentDataSeq", self->persistentDataSequence).detail("Tag1", req.tag1).detail("Tag2", req.tag2); - if (req.begin <= logData->persistentDataDurableVersion) { - // Just in case the durable version changes while we are waiting for the read, we grab this data from memory. We - // may or may not actually send it depending on whether we get enough data from disk. SOMEDAY: Only do this if - // an initial attempt to read from disk results in insufficient data and the required data is no longer in - // memory SOMEDAY: Should we only send part of the messages we collected, to actually limit the size of the - // result? - - peekMessagesFromMemory(logData, req.tag, req.begin, messages2, endVersion); - - RangeResult kvs = wait(self->persistentData->readRange( - KeyRangeRef(persistTagMessagesKey(logData->logId, oldTag, req.begin), - persistTagMessagesKey(logData->logId, oldTag, logData->persistentDataDurableVersion + 1)), - SERVER_KNOBS->DESIRED_TOTAL_BYTES, - SERVER_KNOBS->DESIRED_TOTAL_BYTES)); - - //TraceEvent("TLogPeekResults", self->dbgid).detail("ForAddress", req.reply.getEndpoint().getPrimaryAddress()).detail("Tag1Results", s1).detail("Tag2Results", s2).detail("Tag1ResultsLim", kv1.size()).detail("Tag2ResultsLim", kv2.size()).detail("Tag1ResultsLast", kv1.size() ? kv1[0].key : "").detail("Tag2ResultsLast", kv2.size() ? 
kv2[0].key : "").detail("Limited", limited).detail("NextEpoch", next_pos.epoch).detail("NextSeq", next_pos.sequence).detail("NowEpoch", self->epoch()).detail("NowSeq", self->sequence.getNextSequence()); - - for (auto& kv : kvs) { - auto ver = decodeTagMessagesKey(kv.key); - messages << int32_t(-1) << ver; - - BinaryReader rd(kv.value, Unversioned()); - while (!rd.empty()) { - int32_t messageLength; - uint32_t subVersion; - rd >> messageLength >> subVersion; - messageLength += sizeof(uint16_t) + sizeof(Tag); - messages << messageLength << subVersion << uint16_t(1) << req.tag; - messageLength -= (sizeof(subVersion) + sizeof(uint16_t) + sizeof(Tag)); - messages.serializeBytes(rd.readBytes(messageLength), messageLength); - } - } - - if (kvs.expectedSize() >= SERVER_KNOBS->DESIRED_TOTAL_BYTES) - endVersion = decodeTagMessagesKey(kvs.end()[-1].key) + 1; - else - messages.serializeBytes(messages2.toValue()); - } else { - peekMessagesFromMemory(logData, req.tag, req.begin, messages, endVersion); - //TraceEvent("TLogPeekResults", self->dbgid).detail("ForAddress", req.reply.getEndpoint().getPrimaryAddress()).detail("MessageBytes", messages.getLength()).detail("NextEpoch", next_pos.epoch).detail("NextSeq", next_pos.sequence).detail("NowSeq", self->sequence.getNextSequence()); - } - - TLogPeekReply reply; - reply.maxKnownVersion = logData->version.get(); - reply.minKnownCommittedVersion = 0; - reply.onlySpilled = false; - reply.messages = messages.toValue(); - reply.end = endVersion; - - //TraceEvent("TlogPeek", self->dbgid).detail("LogId", logData->logId).detail("EndVer", reply.end).detail("MsgBytes", reply.messages.expectedSize()).detail("ForAddress", req.reply.getEndpoint().getPrimaryAddress()); - - if (req.sequence.present()) { - auto& trackerData = self->peekTracker[peekId]; - trackerData.lastUpdate = now(); - auto& sequenceData = trackerData.sequence_version[sequence + 1]; - if (sequenceData.isSet()) { - if (sequenceData.getFuture().get() != reply.end) { - TEST(true); // tlog peek second attempt ended at a different version (2) - req.reply.sendError(operation_obsolete()); - return Void(); - } - } else { - sequenceData.send(reply.end); - } - reply.begin = req.begin; - } - - req.reply.send(reply); - return Void(); -} - ACTOR Future doQueueCommit(TLogData* self, Reference logData) { state Version ver = logData->version.get(); state Version commitNumber = self->queueCommitBegin + 1; @@ -1476,7 +1343,8 @@ ACTOR Future serveTLogInterface(TLogData* self, PromiseStream warningCollectorInput) { loop choose { when(TLogPeekRequest req = waitNext(tli.peekMessages.getFuture())) { - logData->addActor.send(tLogPeekMessages(self, req, logData)); + logData->addActor.send(tLogPeekMessages( + req.reply, self, logData, req.begin, req.tag, req.returnIfBlocked, req.onlySpilled, req.sequence)); } when(TLogPeekStreamRequest req = waitNext(tli.peekStreamMessages.getFuture())) { TraceEvent(SevDebug, "TLogPeekStream", logData->logId) diff --git a/fdbserver/OldTLogServer_6_0.actor.cpp b/fdbserver/OldTLogServer_6_0.actor.cpp index b6964bf9c9..5c27581b2c 100644 --- a/fdbserver/OldTLogServer_6_0.actor.cpp +++ b/fdbserver/OldTLogServer_6_0.actor.cpp @@ -1208,301 +1208,37 @@ void peekMessagesFromMemory(Reference self, } // Common logics to peek TLog and create TLogPeekReply that serves both streaming peek or normal peek request -ACTOR Future peekTLog(TLogData* self, - Reference logData, - Version begin, - Tag tag, - bool returnIfBlocked = false, - bool reqOnlySpilled = false, - Optional> sequence = Optional>()) { - state 
BinaryWriter messages(Unversioned()); - state BinaryWriter messages2(Unversioned()); - state int sequenceNum = -1; - state UID peekId; - state double queueStart = now(); - - if (tag.locality == tagLocalityTxs && tag.id >= logData->txsTags && logData->txsTags > 0) { - tag.id = tag.id % logData->txsTags; - } - - // TODO: once the fake stream is replaced by ReplyPromiseStream, we can remove the code handling sequence requests - // STEP: a. mark obsolete sequence requests; b. wait previous sequence requests are handled in order - if (sequence.present()) { - peekId = sequence.get().first; - sequenceNum = sequence.get().second; - if (sequenceNum >= SERVER_KNOBS->PARALLEL_GET_MORE_REQUESTS && - logData->peekTracker.find(peekId) == logData->peekTracker.end()) { - throw operation_obsolete(); - } - auto& trackerData = logData->peekTracker[peekId]; - if (sequenceNum == 0 && trackerData.sequence_version.find(0) == trackerData.sequence_version.end()) { - trackerData.tag = tag; - trackerData.sequence_version[0].send(std::make_pair(begin, reqOnlySpilled)); - } - auto seqBegin = trackerData.sequence_version.begin(); - // The peek cursor and this comparison need to agree about the maximum number of in-flight requests. - while (trackerData.sequence_version.size() && - seqBegin->first <= sequenceNum - SERVER_KNOBS->PARALLEL_GET_MORE_REQUESTS) { - if (seqBegin->second.canBeSet()) { - seqBegin->second.sendError(operation_obsolete()); - } - trackerData.sequence_version.erase(seqBegin); - seqBegin = trackerData.sequence_version.begin(); - } - - if (trackerData.sequence_version.size() && sequenceNum < seqBegin->first) { - throw operation_obsolete(); - } - - Future> fPrevPeekData = trackerData.sequence_version[sequenceNum].getFuture(); - if (fPrevPeekData.isReady()) { - trackerData.unblockedPeeks++; - double t = now() - trackerData.lastUpdate; - if (t > trackerData.idleMax) - trackerData.idleMax = t; - trackerData.idleTime += t; - } - trackerData.lastUpdate = now(); - std::pair prevPeekData = wait(fPrevPeekData); - begin = std::max(prevPeekData.first, begin); - reqOnlySpilled = prevPeekData.second; - wait(yield()); - } - - state double blockStart = now(); - - if (returnIfBlocked && logData->version.get() < begin) { - if (sequence.present()) { - auto& trackerData = logData->peekTracker[peekId]; - auto& sequenceData = trackerData.sequence_version[sequenceNum + 1]; - trackerData.lastUpdate = now(); - if (!sequenceData.isSet()) { - sequenceData.send(std::make_pair(begin, reqOnlySpilled)); - } - } - throw end_of_stream(); - } - - //TraceEvent("TLogPeekMessages0", self->dbgid).detail("ReqBeginEpoch", req.begin.epoch).detail("ReqBeginSeq", req.begin.sequence).detail("Epoch", self->epoch()).detail("PersistentDataSeq", self->persistentDataSequence).detail("Tag1", req.tag1).detail("Tag2", req.tag2); - // Wait until we have something to return that the caller doesn't already have - if (logData->version.get() < begin) { - wait(logData->version.whenAtLeast(begin)); - wait(delay(SERVER_KNOBS->TLOG_PEEK_DELAY, g_network->getCurrentTask())); - } - - if (logData->locality != tagLocalitySatellite && tag.locality == tagLocalityLogRouter) { - wait(self->concurrentLogRouterReads.take()); - state FlowLock::Releaser globalReleaser(self->concurrentLogRouterReads); - wait(delay(0.0, TaskPriority::Low)); - } - - if (begin <= logData->persistentDataDurableVersion && tag.locality != tagLocalityTxs && tag != txsTag) { - // Reading spilled data will almost always imply that the storage server is >5s behind the rest - // of the cluster. 
We shouldn't prioritize spending CPU on helping this server catch up - // slightly faster over keeping the rest of the cluster operating normally. - // txsTag is only ever peeked on recovery, and we would still wish to prioritize requests - // that impact recovery duration. - wait(delay(0, TaskPriority::TLogSpilledPeekReply)); - } - - state double workStart = now(); - - Version poppedVer = poppedVersion(logData, tag); - if (poppedVer > begin) { - // reply with an empty message and let the next reply start from poppedVer - TLogPeekReply rep; - rep.maxKnownVersion = logData->version.get(); - rep.minKnownCommittedVersion = logData->minKnownCommittedVersion; - rep.popped = poppedVer; - rep.end = poppedVer; - rep.onlySpilled = false; - - // TODO: once the fake stream is replaced by ReplyPromiseStream, we can remove the code handling sequence - // requests. - if (sequence.present()) { - auto& trackerData = logData->peekTracker[peekId]; - auto& sequenceData = trackerData.sequence_version[sequenceNum + 1]; - trackerData.lastUpdate = now(); - if (trackerData.sequence_version.size() && sequenceNum + 1 < trackerData.sequence_version.begin()->first) { - if (!sequenceData.isSet()) - sequenceData.sendError(operation_obsolete()); - throw operation_obsolete(); - } - if (sequenceData.isSet()) { - if (sequenceData.getFuture().get().first != rep.end) { - TEST(true); // 1 tlog peek second attempt ended at a different version - throw operation_obsolete(); - } - } else { - sequenceData.send(std::make_pair(rep.end, rep.onlySpilled)); - } - rep.begin = begin; - } - - return rep; - } - - state Version endVersion = logData->version.get() + 1; - state bool onlySpilled = false; - - // grab messages from disk - //TraceEvent("TLogPeekMessages", self->dbgid).detail("ReqBeginEpoch", req.begin.epoch).detail("ReqBeginSeq", req.begin.sequence).detail("Epoch", self->epoch()).detail("PersistentDataSeq", self->persistentDataSequence).detail("Tag1", req.tag1).detail("Tag2", req.tag2); - if (begin <= logData->persistentDataDurableVersion) { - // Just in case the durable version changes while we are waiting for the read, we grab this data from memory. We - // may or may not actually send it depending on whether we get enough data from disk. SOMEDAY: Only do this if - // an initial attempt to read from disk results in insufficient data and the required data is no longer in - // memory SOMEDAY: Should we only send part of the messages we collected, to actually limit the size of the - // result? - - if (reqOnlySpilled) { - endVersion = logData->persistentDataDurableVersion + 1; - } else { - peekMessagesFromMemory(logData, tag, begin, messages2, endVersion); - } - - RangeResult kvs = wait(self->persistentData->readRange( - KeyRangeRef(persistTagMessagesKey(logData->logId, tag, begin), - persistTagMessagesKey(logData->logId, tag, logData->persistentDataDurableVersion + 1)), - SERVER_KNOBS->DESIRED_TOTAL_BYTES, - SERVER_KNOBS->DESIRED_TOTAL_BYTES)); - - //TraceEvent("TLogPeekResults", self->dbgid).detail("ForAddress", reply.getEndpoint().address).detail("Tag1Results", s1).detail("Tag2Results", s2).detail("Tag1ResultsLim", kv1.size()).detail("Tag2ResultsLim", kv2.size()).detail("Tag1ResultsLast", kv1.size() ? kv1[0].key : "").detail("Tag2ResultsLast", kv2.size() ? 
kv2[0].key : "").detail("Limited", limited).detail("NextEpoch", next_pos.epoch).detail("NextSeq", next_pos.sequence).detail("NowEpoch", self->epoch()).detail("NowSeq", self->sequence.getNextSequence()); - - for (auto& kv : kvs) { - auto ver = decodeTagMessagesKey(kv.key); - messages << VERSION_HEADER << ver; - messages.serializeBytes(kv.value); - } - - if (kvs.expectedSize() >= SERVER_KNOBS->DESIRED_TOTAL_BYTES) { - endVersion = decodeTagMessagesKey(kvs.end()[-1].key) + 1; - onlySpilled = true; - } else { - messages.serializeBytes(messages2.toValue()); - } - } else { - peekMessagesFromMemory(logData, tag, begin, messages, endVersion); - //TraceEvent("TLogPeekResults", self->dbgid).detail("ForAddress", reply.getEndpoint().address).detail("MessageBytes", messages.getLength()).detail("NextEpoch", next_pos.epoch).detail("NextSeq", next_pos.sequence).detail("NowSeq", self->sequence.getNextSequence()); - } - - TLogPeekReply reply; - reply.maxKnownVersion = logData->version.get(); - reply.minKnownCommittedVersion = logData->minKnownCommittedVersion; - reply.messages = StringRef(reply.arena, messages.toValue()); - reply.end = endVersion; - reply.onlySpilled = onlySpilled; - - //TraceEvent("TlogPeek", self->dbgid).detail("LogId", logData->logId).detail("EndVer", reply.end).detail("MsgBytes", reply.messages.expectedSize()).detail("ForAddress", reply.getEndpoint().address); - - if (sequence.present()) { - auto& trackerData = logData->peekTracker[peekId]; - trackerData.lastUpdate = now(); - - double queueT = blockStart - queueStart; - double blockT = workStart - blockStart; - double workT = now() - workStart; - - trackerData.totalPeeks++; - trackerData.replyBytes += reply.messages.size(); - - if (queueT > trackerData.queueMax) - trackerData.queueMax = queueT; - if (blockT > trackerData.blockMax) - trackerData.blockMax = blockT; - if (workT > trackerData.workMax) - trackerData.workMax = workT; - - trackerData.queueTime += queueT; - trackerData.blockTime += blockT; - trackerData.workTime += workT; - - auto& sequenceData = trackerData.sequence_version[sequenceNum + 1]; - if (trackerData.sequence_version.size() && sequenceNum + 1 < trackerData.sequence_version.begin()->first) { - if (!sequenceData.isSet()) { - // It would technically be more correct to .send({req.begin, req.onlySpilled}), as the next - // request might still be in the window of active requests, but LogSystemPeekCursor will - // throw away all future responses upon getting an operation_obsolete(), so computing a - // response will probably be a waste of CPU. 
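The block being deleted here, like the shared templated body that replaces it, enforces that a retried peek is answered consistently: the first reply for a given sequence slot pins its end version, and a retry that recomputes a different end is refused with operation_obsolete rather than allowed to diverge silently. A minimal model of that bookkeeping (all names invented; the real code keys the slot at sequence + 1):

#include <iostream>
#include <map>
#include <optional>

struct PeekTracker {
    std::map<int, std::optional<long>> firstEnd; // sequence -> reply.end

    // Returns true if the reply may be sent, false for operation_obsolete.
    bool recordReply(int sequence, long end, int& duplicatePeeks) {
        auto& slot = firstEnd[sequence];
        if (slot.has_value()) {
            ++duplicatePeeks;    // trackerData.duplicatePeeks++
            return *slot == end; // a diverging retry is obsolete
        }
        slot = end;              // sequenceData.send(...)
        return true;
    }
};

int main() {
    PeekTracker tracker;
    int duplicates = 0;
    std::cout << tracker.recordReply(3, 170, duplicates) << "\n"; // 1: first attempt
    std::cout << tracker.recordReply(3, 170, duplicates) << "\n"; // 1: retry, same end
    std::cout << tracker.recordReply(3, 185, duplicates) << "\n"; // 0: retry diverged
    std::cout << "duplicate peeks: " << duplicates << "\n";       // 2
}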
- sequenceData.sendError(operation_obsolete()); - } - throw operation_obsolete(); - } - if (sequenceData.isSet()) { - trackerData.duplicatePeeks++; - if (sequenceData.getFuture().get().first != reply.end) { - TEST(true); // 1 tlog peek second attempt ended at a different version (2) - throw operation_obsolete(); - } - } else { - sequenceData.send(std::make_pair(reply.end, reply.onlySpilled)); - } - reply.begin = begin; - } - - return reply; -} - -// This actor keep pushing TLogPeekStreamReply until it's removed from the cluster or should recover -ACTOR Future tLogPeekStream(TLogData* self, TLogPeekStreamRequest req, Reference logData) { - self->activePeekStreams++; - - state Version begin = req.begin; - state bool onlySpilled = false; - if (req.tag.locality == tagLocalityTxs && req.tag.id >= logData->txsTags && logData->txsTags > 0) { - req.tag.id = req.tag.id % logData->txsTags; - } - req.reply.setByteLimit(std::min(SERVER_KNOBS->MAXIMUM_PEEK_BYTES, req.limitBytes)); - loop { - state TLogPeekStreamReply reply; - try { - wait(req.reply.onReady() && - store(reply.rep, peekTLog(self, logData, begin, req.tag, req.returnIfBlocked, onlySpilled))); - req.reply.send(reply); - begin = reply.rep.end; - onlySpilled = reply.rep.onlySpilled; - if (reply.rep.end > logData->version.get()) { - wait(delay(SERVER_KNOBS->TLOG_PEEK_DELAY, g_network->getCurrentTask())); - } else { - wait(delay(0, g_network->getCurrentTask())); - } - } catch (Error& e) { - self->activePeekStreams--; - TraceEvent(SevDebug, "TLogPeekStreamEnd", logData->logId).error(e, true); - - if (e.code() == error_code_end_of_stream || e.code() == error_code_operation_obsolete) { - req.reply.sendError(e); - return Void(); - } else { - throw; - } - } - } -} - - -ACTOR Future tLogPeekMessages(TLogData* self, TLogPeekRequest req, Reference logData) { +ACTOR template +Future tLogPeekMessages(PromiseType replyPromise, + TLogData* self, + Reference logData, + Version reqBegin, + Tag reqTag, + bool reqReturnIfBlocked = false, + bool reqOnlySpilled = false, + Optional> reqSequence = Optional>()) { state BinaryWriter messages(Unversioned()); state BinaryWriter messages2(Unversioned()); state int sequence = -1; state UID peekId; state double queueStart = now(); - if (req.tag.locality == tagLocalityTxs && req.tag.id >= logData->txsTags && logData->txsTags > 0) { - req.tag.id = req.tag.id % logData->txsTags; + if (reqTag.locality == tagLocalityTxs && reqTag.id >= logData->txsTags && logData->txsTags > 0) { + reqTag.id = reqTag.id % logData->txsTags; } - if (req.sequence.present()) { + if (reqSequence.present()) { try { - peekId = req.sequence.get().first; - sequence = req.sequence.get().second; + peekId = reqSequence.get().first; + sequence = reqSequence.get().second; if (sequence >= SERVER_KNOBS->PARALLEL_GET_MORE_REQUESTS && logData->peekTracker.find(peekId) == logData->peekTracker.end()) { throw operation_obsolete(); } auto& trackerData = logData->peekTracker[peekId]; if (sequence == 0 && trackerData.sequence_version.find(0) == trackerData.sequence_version.end()) { - trackerData.tag = req.tag; - trackerData.sequence_version[0].send(std::make_pair(req.begin, req.onlySpilled)); + trackerData.tag = reqTag; + trackerData.sequence_version[0].send(std::make_pair(reqBegin, reqOnlySpilled)); } auto seqBegin = trackerData.sequence_version.begin(); while (trackerData.sequence_version.size() && @@ -1529,12 +1265,12 @@ ACTOR Future tLogPeekMessages(TLogData* self, TLogPeekRequest req, Referen trackerData.lastUpdate = now(); std::pair prevPeekData = 
wait(fPrevPeekData); - req.begin = std::max(prevPeekData.first, req.begin); - req.onlySpilled = prevPeekData.second; + reqBegin = std::max(prevPeekData.first, reqBegin); + reqOnlySpilled = prevPeekData.second; wait(yield()); } catch (Error& e) { if (e.code() == error_code_timed_out || e.code() == error_code_operation_obsolete) { - req.reply.sendError(e); + replyPromise.sendError(e); return Void(); } else { throw; @@ -1544,32 +1280,32 @@ ACTOR Future tLogPeekMessages(TLogData* self, TLogPeekRequest req, Referen state double blockStart = now(); - if (req.returnIfBlocked && logData->version.get() < req.begin) { - req.reply.sendError(end_of_stream()); - if (req.sequence.present()) { + if (reqReturnIfBlocked && logData->version.get() < reqBegin) { + replyPromise.sendError(end_of_stream()); + if (reqSequence.present()) { auto& trackerData = logData->peekTracker[peekId]; auto& sequenceData = trackerData.sequence_version[sequence + 1]; if (!sequenceData.isSet()) { - sequenceData.send(std::make_pair(req.begin, req.onlySpilled)); + sequenceData.send(std::make_pair(reqBegin, reqOnlySpilled)); } } return Void(); } - //TraceEvent("TLogPeekMessages0", self->dbgid).detail("ReqBeginEpoch", req.begin.epoch).detail("ReqBeginSeq", req.begin.sequence).detail("Epoch", self->epoch()).detail("PersistentDataSeq", self->persistentDataSequence).detail("Tag1", req.tag1).detail("Tag2", req.tag2); + //TraceEvent("TLogPeekMessages0", self->dbgid).detail("ReqBeginEpoch", reqBegin.epoch).detail("ReqBeginSeq", reqBegin.sequence).detail("Epoch", self->epoch()).detail("PersistentDataSeq", self->persistentDataSequence).detail("Tag1", reqTag1).detail("Tag2", reqTag2); // Wait until we have something to return that the caller doesn't already have - if (logData->version.get() < req.begin) { - wait(logData->version.whenAtLeast(req.begin)); + if (logData->version.get() < reqBegin) { + wait(logData->version.whenAtLeast(reqBegin)); wait(delay(SERVER_KNOBS->TLOG_PEEK_DELAY, g_network->getCurrentTask())); } - if (logData->locality != tagLocalitySatellite && req.tag.locality == tagLocalityLogRouter) { + if (logData->locality != tagLocalitySatellite && reqTag.locality == tagLocalityLogRouter) { wait(self->concurrentLogRouterReads.take()); state FlowLock::Releaser globalReleaser(self->concurrentLogRouterReads); wait(delay(0.0, TaskPriority::Low)); } - if (req.begin <= logData->persistentDataDurableVersion && req.tag.locality != tagLocalityTxs && req.tag != txsTag) { + if (reqBegin <= logData->persistentDataDurableVersion && reqTag.locality != tagLocalityTxs && reqTag != txsTag) { // Reading spilled data will almost always imply that the storage server is >5s behind the rest // of the cluster. We shouldn't prioritize spending CPU on helping this server catch up // slightly faster over keeping the rest of the cluster operating normally. 
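The renames in this hunk (req.begin to reqBegin, req.reply to replyPromise, and so on) are what let one body compile against any promise-like type: the network-facing ReplyPromise of a TLogPeekRequest or the local Promise that tLogPeekStream owns. A compilable illustration of the PromiseType template trick (both promise types here are invented stand-ins sharing the same send/sendError surface):

#include <iostream>
#include <string>

struct OneShotReply {
    void send(const std::string& r) { std::cout << "network reply: " << r << "\n"; }
    void sendError(const std::string& e) { std::cout << "network error: " << e << "\n"; }
};

struct LocalPromise {
    void send(const std::string& r) { std::cout << "stream batch: " << r << "\n"; }
    void sendError(const std::string& e) { std::cout << "stream error: " << e << "\n"; }
};

// One body fulfills whichever promise type the caller hands in, as the
// templated tLogPeekMessages does for peek requests and peek streams.
template <class PromiseType>
void peekMessages(PromiseType replyPromise, long begin, bool blocked) {
    if (blocked) {
        replyPromise.sendError("end_of_stream"); // the returnIfBlocked path
        return;
    }
    replyPromise.send("messages from version " + std::to_string(begin));
}

int main() {
    peekMessages(OneShotReply{}, 100, false); // serves a TLogPeekRequest
    peekMessages(LocalPromise{}, 100, true);  // serves the peek stream
}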
@@ -1580,8 +1316,8 @@ ACTOR Future tLogPeekMessages(TLogData* self, TLogPeekRequest req, Referen state double workStart = now(); - Version poppedVer = poppedVersion(logData, req.tag); - if (poppedVer > req.begin) { + Version poppedVer = poppedVersion(logData, reqTag); + if (poppedVer > reqBegin) { TLogPeekReply rep; rep.maxKnownVersion = logData->version.get(); rep.minKnownCommittedVersion = logData->minKnownCommittedVersion; @@ -1589,12 +1325,12 @@ ACTOR Future tLogPeekMessages(TLogData* self, TLogPeekRequest req, Referen rep.end = poppedVer; rep.onlySpilled = false; - if (req.sequence.present()) { + if (reqSequence.present()) { auto& trackerData = logData->peekTracker[peekId]; auto& sequenceData = trackerData.sequence_version[sequence + 1]; trackerData.lastUpdate = now(); if (trackerData.sequence_version.size() && sequence + 1 < trackerData.sequence_version.begin()->first) { - req.reply.sendError(operation_obsolete()); + replyPromise.sendError(operation_obsolete()); if (!sequenceData.isSet()) sequenceData.sendError(operation_obsolete()); return Void(); @@ -1602,16 +1338,16 @@ ACTOR Future tLogPeekMessages(TLogData* self, TLogPeekRequest req, Referen if (sequenceData.isSet()) { if (sequenceData.getFuture().get().first != rep.end) { TEST(true); // tlog peek second attempt ended at a different version - req.reply.sendError(operation_obsolete()); + replyPromise.sendError(operation_obsolete()); return Void(); } } else { sequenceData.send(std::make_pair(rep.end, rep.onlySpilled)); } - rep.begin = req.begin; + rep.begin = reqBegin; } - req.reply.send(rep); + replyPromise.send(rep); return Void(); } @@ -1619,27 +1355,27 @@ ACTOR Future tLogPeekMessages(TLogData* self, TLogPeekRequest req, Referen state bool onlySpilled = false; // grab messages from disk - //TraceEvent("TLogPeekMessages", self->dbgid).detail("ReqBeginEpoch", req.begin.epoch).detail("ReqBeginSeq", req.begin.sequence).detail("Epoch", self->epoch()).detail("PersistentDataSeq", self->persistentDataSequence).detail("Tag1", req.tag1).detail("Tag2", req.tag2); - if (req.begin <= logData->persistentDataDurableVersion) { + //TraceEvent("TLogPeekMessages", self->dbgid).detail("ReqBeginEpoch", reqBegin.epoch).detail("ReqBeginSeq", reqBegin.sequence).detail("Epoch", self->epoch()).detail("PersistentDataSeq", self->persistentDataSequence).detail("Tag1", reqTag1).detail("Tag2", reqTag2); + if (reqBegin <= logData->persistentDataDurableVersion) { // Just in case the durable version changes while we are waiting for the read, we grab this data from memory. We // may or may not actually send it depending on whether we get enough data from disk. SOMEDAY: Only do this if // an initial attempt to read from disk results in insufficient data and the required data is no longer in // memory SOMEDAY: Should we only send part of the messages we collected, to actually limit the size of the // result? 
- if (req.onlySpilled) { + if (reqOnlySpilled) { endVersion = logData->persistentDataDurableVersion + 1; } else { - peekMessagesFromMemory(logData, req.tag, req.begin, messages2, endVersion); + peekMessagesFromMemory(logData, reqTag, reqBegin, messages2, endVersion); } RangeResult kvs = wait(self->persistentData->readRange( - KeyRangeRef(persistTagMessagesKey(logData->logId, req.tag, req.begin), - persistTagMessagesKey(logData->logId, req.tag, logData->persistentDataDurableVersion + 1)), + KeyRangeRef(persistTagMessagesKey(logData->logId, reqTag, reqBegin), + persistTagMessagesKey(logData->logId, reqTag, logData->persistentDataDurableVersion + 1)), SERVER_KNOBS->DESIRED_TOTAL_BYTES, SERVER_KNOBS->DESIRED_TOTAL_BYTES)); - //TraceEvent("TLogPeekResults", self->dbgid).detail("ForAddress", req.reply.getEndpoint().address).detail("Tag1Results", s1).detail("Tag2Results", s2).detail("Tag1ResultsLim", kv1.size()).detail("Tag2ResultsLim", kv2.size()).detail("Tag1ResultsLast", kv1.size() ? kv1[0].key : "").detail("Tag2ResultsLast", kv2.size() ? kv2[0].key : "").detail("Limited", limited).detail("NextEpoch", next_pos.epoch).detail("NextSeq", next_pos.sequence).detail("NowEpoch", self->epoch()).detail("NowSeq", self->sequence.getNextSequence()); + //TraceEvent("TLogPeekResults", self->dbgid).detail("ForAddress", replyPromise.getEndpoint().address).detail("Tag1Results", s1).detail("Tag2Results", s2).detail("Tag1ResultsLim", kv1.size()).detail("Tag2ResultsLim", kv2.size()).detail("Tag1ResultsLast", kv1.size() ? kv1[0].key : "").detail("Tag2ResultsLast", kv2.size() ? kv2[0].key : "").detail("Limited", limited).detail("NextEpoch", next_pos.epoch).detail("NextSeq", next_pos.sequence).detail("NowEpoch", self->epoch()).detail("NowSeq", self->sequence.getNextSequence()); for (auto& kv : kvs) { auto ver = decodeTagMessagesKey(kv.key); @@ -1654,20 +1390,20 @@ ACTOR Future tLogPeekMessages(TLogData* self, TLogPeekRequest req, Referen messages.serializeBytes(messages2.toValue()); } } else { - peekMessagesFromMemory(logData, req.tag, req.begin, messages, endVersion); - //TraceEvent("TLogPeekResults", self->dbgid).detail("ForAddress", req.reply.getEndpoint().address).detail("MessageBytes", messages.getLength()).detail("NextEpoch", next_pos.epoch).detail("NextSeq", next_pos.sequence).detail("NowSeq", self->sequence.getNextSequence()); + peekMessagesFromMemory(logData, reqTag, reqBegin, messages, endVersion); + //TraceEvent("TLogPeekResults", self->dbgid).detail("ForAddress", replyPromise.getEndpoint().address).detail("MessageBytes", messages.getLength()).detail("NextEpoch", next_pos.epoch).detail("NextSeq", next_pos.sequence).detail("NowSeq", self->sequence.getNextSequence()); } TLogPeekReply reply; reply.maxKnownVersion = logData->version.get(); reply.minKnownCommittedVersion = logData->minKnownCommittedVersion; - reply.messages = messages.toValue(); + reply.messages = StringRef(reply.arena, messages.toValue()); reply.end = endVersion; reply.onlySpilled = onlySpilled; - //TraceEvent("TlogPeek", self->dbgid).detail("LogId", logData->logId).detail("EndVer", reply.end).detail("MsgBytes", reply.messages.expectedSize()).detail("ForAddress", req.reply.getEndpoint().address); + //TraceEvent("TlogPeek", self->dbgid).detail("LogId", logData->logId).detail("EndVer", reply.end).detail("MsgBytes", reply.messages.expectedSize()).detail("ForAddress", replyPromise.getEndpoint().address); - if (req.sequence.present()) { + if (reqSequence.present()) { auto& trackerData = logData->peekTracker[peekId]; trackerData.lastUpdate = now(); 
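For reference, the rows copied out of the persistent store above are framed as a version header followed by the stored value bytes (messages << VERSION_HEADER << ver, then serializeBytes(kv.value)). A simplified sketch of that style of framing and of parsing it back; the header constant and byte layout are illustrative only, not FDB's actual wire format:

#include <cstdint>
#include <cstring>
#include <iostream>
#include <string>

constexpr int32_t kVersionHeader = -1; // invented sentinel

// Append one frame: [header][version][payload bytes].
void appendFrame(std::string& out, int64_t version, const std::string& payload) {
    out.append(reinterpret_cast<const char*>(&kVersionHeader), sizeof(kVersionHeader));
    out.append(reinterpret_cast<const char*>(&version), sizeof(version));
    out.append(payload);
}

int main() {
    std::string wire;
    appendFrame(wire, 42, "hello");
    // Parse back: header, then version, then the payload bytes.
    int32_t header;
    int64_t version;
    std::memcpy(&header, wire.data(), sizeof(header));
    std::memcpy(&version, wire.data() + sizeof(header), sizeof(version));
    std::cout << "header " << header << " version " << version << " payload "
              << wire.substr(sizeof(header) + sizeof(version)) << "\n";
}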
@@ -1691,7 +1427,7 @@ ACTOR Future tLogPeekMessages(TLogData* self, TLogPeekRequest req, Referen auto& sequenceData = trackerData.sequence_version[sequence + 1]; if (trackerData.sequence_version.size() && sequence + 1 < trackerData.sequence_version.begin()->first) { - req.reply.sendError(operation_obsolete()); + replyPromise.sendError(operation_obsolete()); if (!sequenceData.isSet()) sequenceData.sendError(operation_obsolete()); return Void(); @@ -1700,19 +1436,60 @@ ACTOR Future tLogPeekMessages(TLogData* self, TLogPeekRequest req, Referen trackerData.duplicatePeeks++; if (sequenceData.getFuture().get().first != reply.end) { TEST(true); // tlog peek second attempt ended at a different version (2) - req.reply.sendError(operation_obsolete()); + replyPromise.sendError(operation_obsolete()); return Void(); } } else { sequenceData.send(std::make_pair(reply.end, reply.onlySpilled)); } - reply.begin = req.begin; + reply.begin = reqBegin; } - req.reply.send(reply); + replyPromise.send(reply); return Void(); } +// This actor keep pushing TLogPeekStreamReply until it's removed from the cluster or should recover +ACTOR Future tLogPeekStream(TLogData* self, TLogPeekStreamRequest req, Reference logData) { + self->activePeekStreams++; + + state Version begin = req.begin; + state bool onlySpilled = false; + req.reply.setByteLimit(std::min(SERVER_KNOBS->MAXIMUM_PEEK_BYTES, req.limitBytes)); + loop { + state TLogPeekStreamReply reply; + state Promise promise; + state Future future(promise.getFuture()); + try { + wait(req.reply.onReady() && + tLogPeekMessages(promise, self, logData, begin, req.tag, req.returnIfBlocked, onlySpilled)); + ASSERT(future.isReady()); + if (future.isError()) { + throw future.getError(); + } + + reply.rep = future.get(); + req.reply.send(reply); + begin = reply.rep.end; + onlySpilled = reply.rep.onlySpilled; + if (reply.rep.end > logData->version.get()) { + wait(delay(SERVER_KNOBS->TLOG_PEEK_DELAY, g_network->getCurrentTask())); + } else { + wait(delay(0, g_network->getCurrentTask())); + } + } catch (Error& e) { + self->activePeekStreams--; + TraceEvent(SevDebug, "TLogPeekStreamEnd", logData->logId).error(e, true); + + if (e.code() == error_code_end_of_stream || e.code() == error_code_operation_obsolete) { + req.reply.sendError(e); + return Void(); + } else { + throw; + } + } + } +} ACTOR Future doQueueCommit(TLogData* self, Reference logData, @@ -2208,7 +1985,8 @@ ACTOR Future serveTLogInterface(TLogData* self, } } when(TLogPeekRequest req = waitNext(tli.peekMessages.getFuture())) { - logData->addActor.send(tLogPeekMessages(self, req, logData)); + logData->addActor.send(tLogPeekMessages( + req.reply, self, logData, req.begin, req.tag, req.returnIfBlocked, req.onlySpilled, req.sequence)); } when(TLogPeekStreamRequest req = waitNext(tli.peekStreamMessages.getFuture())) { TraceEvent(SevDebug, "TLogPeekStream", logData->logId) diff --git a/fdbserver/OldTLogServer_6_2.actor.cpp b/fdbserver/OldTLogServer_6_2.actor.cpp index 88029f9eb1..81025d93de 100644 --- a/fdbserver/OldTLogServer_6_2.actor.cpp +++ b/fdbserver/OldTLogServer_6_2.actor.cpp @@ -1547,67 +1547,66 @@ ACTOR Future> parseMessagesForTag(StringRef commitBlob, T } // Common logics to peek TLog and create TLogPeekReply that serves both streaming peek or normal peek request -ACTOR template -Future peekTLogAndSend(PromiseType replyPromise, TLogData* self, - Reference logData, - Version begin, - Tag tag, - bool returnIfBlocked = false, - bool reqOnlySpilled = false, - Optional> sequence = Optional>()) { +ACTOR template +Future 
tLogPeekMessages(PromiseType replyPromise, + TLogData* self, + Reference logData, + Version reqBegin, + Tag reqTag, + bool reqReturnIfBlocked = false, + bool reqOnlySpilled = false, + Optional> reqSequence = Optional>()) { state BinaryWriter messages(Unversioned()); state BinaryWriter messages2(Unversioned()); - state int sequenceNum = -1; + state int sequence = -1; state UID peekId; state double queueStart = now(); - if (tag.locality == tagLocalityTxs && tag.id >= logData->txsTags && logData->txsTags > 0) { - tag.id = tag.id % logData->txsTags; + if (reqTag.locality == tagLocalityTxs && reqTag.id >= logData->txsTags && logData->txsTags > 0) { + reqTag.id = reqTag.id % logData->txsTags; } - // TODO: once the fake stream is replaced by ReplyPromiseStream, we can remove the code handling sequence requests - // STEP: a. mark obsolete sequence requests; b. wait previous sequence requests are handled in order - if (sequence.present()) { - try{ - peekId = sequence.get().first; - sequenceNum = sequence.get().second; - if (sequenceNum >= SERVER_KNOBS->PARALLEL_GET_MORE_REQUESTS && - logData->peekTracker.find(peekId) == logData->peekTracker.end()) { - throw operation_obsolete(); - } - auto& trackerData = logData->peekTracker[peekId]; - if (sequenceNum == 0 && trackerData.sequence_version.find(0) == trackerData.sequence_version.end()) { - trackerData.tag = tag; - trackerData.sequence_version[0].send(std::make_pair(begin, reqOnlySpilled)); - } - auto seqBegin = trackerData.sequence_version.begin(); - // The peek cursor and this comparison need to agree about the maximum number of in-flight requests. - while (trackerData.sequence_version.size() && - seqBegin->first <= sequenceNum - SERVER_KNOBS->PARALLEL_GET_MORE_REQUESTS) { - if (seqBegin->second.canBeSet()) { - seqBegin->second.sendError(operation_obsolete()); - } - trackerData.sequence_version.erase(seqBegin); - seqBegin = trackerData.sequence_version.begin(); - } + if (reqSequence.present()) { + try { + peekId = reqSequence.get().first; + sequence = reqSequence.get().second; + if (sequence >= SERVER_KNOBS->PARALLEL_GET_MORE_REQUESTS && + logData->peekTracker.find(peekId) == logData->peekTracker.end()) { + throw operation_obsolete(); + } + auto& trackerData = logData->peekTracker[peekId]; + if (sequence == 0 && trackerData.sequence_version.find(0) == trackerData.sequence_version.end()) { + trackerData.tag = reqTag; + trackerData.sequence_version[0].send(std::make_pair(reqBegin, reqOnlySpilled)); + } + auto seqBegin = trackerData.sequence_version.begin(); + // The peek cursor and this comparison need to agree about the maximum number of in-flight requests. 
+ while (trackerData.sequence_version.size() && + seqBegin->first <= sequence - SERVER_KNOBS->PARALLEL_GET_MORE_REQUESTS) { + if (seqBegin->second.canBeSet()) { + seqBegin->second.sendError(operation_obsolete()); + } + trackerData.sequence_version.erase(seqBegin); + seqBegin = trackerData.sequence_version.begin(); + } - if (trackerData.sequence_version.size() && sequenceNum < seqBegin->first) { - throw operation_obsolete(); - } + if (trackerData.sequence_version.size() && sequence < seqBegin->first) { + throw operation_obsolete(); + } - Future> fPrevPeekData = trackerData.sequence_version[sequenceNum].getFuture(); - if (fPrevPeekData.isReady()) { - trackerData.unblockedPeeks++; - double t = now() - trackerData.lastUpdate; - if (t > trackerData.idleMax) - trackerData.idleMax = t; - trackerData.idleTime += t; - } - trackerData.lastUpdate = now(); - std::pair prevPeekData = wait(fPrevPeekData); - begin = std::max(prevPeekData.first, begin); - reqOnlySpilled = prevPeekData.second; - wait(yield()); + Future> fPrevPeekData = trackerData.sequence_version[sequence].getFuture(); + if (fPrevPeekData.isReady()) { + trackerData.unblockedPeeks++; + double t = now() - trackerData.lastUpdate; + if (t > trackerData.idleMax) + trackerData.idleMax = t; + trackerData.idleTime += t; + } + trackerData.lastUpdate = now(); + std::pair prevPeekData = wait(fPrevPeekData); + reqBegin = std::max(prevPeekData.first, reqBegin); + reqOnlySpilled = prevPeekData.second; + wait(yield()); } catch (Error& e) { if (e.code() == error_code_timed_out || e.code() == error_code_operation_obsolete) { replyPromise.sendError(e); @@ -1620,33 +1619,32 @@ Future peekTLogAndSend(PromiseType replyPromise, TLogData* self, state double blockStart = now(); - if (returnIfBlocked && logData->version.get() < begin) { - replyPromise.sendError(end_of_stream()); - if (sequence.present()) { + if (reqReturnIfBlocked && logData->version.get() < reqBegin) { + replyPromise.sendError(end_of_stream()); + if (reqSequence.present()) { auto& trackerData = logData->peekTracker[peekId]; - auto& sequenceData = trackerData.sequence_version[sequenceNum + 1]; - trackerData.lastUpdate = now(); + auto& sequenceData = trackerData.sequence_version[sequence + 1]; if (!sequenceData.isSet()) { - sequenceData.send(std::make_pair(begin, reqOnlySpilled)); + sequenceData.send(std::make_pair(reqBegin, reqOnlySpilled)); } } return Void(); } - //TraceEvent("TLogPeekMessages0", self->dbgid).detail("ReqBeginEpoch", req.begin.epoch).detail("ReqBeginSeq", req.begin.sequence).detail("Epoch", self->epoch()).detail("PersistentDataSeq", self->persistentDataSequence).detail("Tag1", req.tag1).detail("Tag2", req.tag2); + //TraceEvent("TLogPeekMessages0", self->dbgid).detail("ReqBeginEpoch", reqBegin.epoch).detail("ReqBeginSeq", reqBegin.sequence).detail("Epoch", self->epoch()).detail("PersistentDataSeq", self->persistentDataSequence).detail("Tag1", reqTag1).detail("Tag2", reqTag2); // Wait until we have something to return that the caller doesn't already have - if (logData->version.get() < begin) { - wait(logData->version.whenAtLeast(begin)); + if (logData->version.get() < reqBegin) { + wait(logData->version.whenAtLeast(reqBegin)); wait(delay(SERVER_KNOBS->TLOG_PEEK_DELAY, g_network->getCurrentTask())); } - if (logData->locality != tagLocalitySatellite && tag.locality == tagLocalityLogRouter) { + if (reqTag.locality == tagLocalityLogRouter) { wait(self->concurrentLogRouterReads.take()); state FlowLock::Releaser globalReleaser(self->concurrentLogRouterReads); wait(delay(0.0, 
TaskPriority::Low)); } - if (begin <= logData->persistentDataDurableVersion && tag.locality != tagLocalityTxs && tag != txsTag) { + if (reqBegin <= logData->persistentDataDurableVersion && reqTag.locality != tagLocalityTxs && reqTag != txsTag) { // Reading spilled data will almost always imply that the storage server is >5s behind the rest // of the cluster. We shouldn't prioritize spending CPU on helping this server catch up // slightly faster over keeping the rest of the cluster operating normally. @@ -1657,9 +1655,8 @@ Future peekTLogAndSend(PromiseType replyPromise, TLogData* self, state double workStart = now(); - Version poppedVer = poppedVersion(logData, tag); - if (poppedVer > begin) { - // reply with an empty message and let the next reply start from poppedVer + Version poppedVer = poppedVersion(logData, reqTag); + if (poppedVer > reqBegin) { TLogPeekReply rep; rep.maxKnownVersion = logData->version.get(); rep.minKnownCommittedVersion = logData->minKnownCommittedVersion; @@ -1667,13 +1664,11 @@ Future peekTLogAndSend(PromiseType replyPromise, TLogData* self, rep.end = poppedVer; rep.onlySpilled = false; - // TODO: once the fake stream is replaced by ReplyPromiseStream, we can remove the code handling sequence - // requests. - if (sequence.present()) { + if (reqSequence.present()) { auto& trackerData = logData->peekTracker[peekId]; - auto& sequenceData = trackerData.sequence_version[sequenceNum + 1]; + auto& sequenceData = trackerData.sequence_version[sequence + 1]; trackerData.lastUpdate = now(); - if (trackerData.sequence_version.size() && sequenceNum + 1 < trackerData.sequence_version.begin()->first) { + if (trackerData.sequence_version.size() && sequence + 1 < trackerData.sequence_version.begin()->first) { replyPromise.sendError(operation_obsolete()); if (!sequenceData.isSet()) sequenceData.sendError(operation_obsolete()); @@ -1681,14 +1676,14 @@ Future peekTLogAndSend(PromiseType replyPromise, TLogData* self, } if (sequenceData.isSet()) { if (sequenceData.getFuture().get().first != rep.end) { - TEST(true); // 1 tlog peek second attempt ended at a different version + TEST(true); // tlog peek second attempt ended at a different version replyPromise.sendError(operation_obsolete()); return Void(); } } else { sequenceData.send(std::make_pair(rep.end, rep.onlySpilled)); } - rep.begin = begin; + rep.begin = reqBegin; } replyPromise.send(rep); @@ -1699,8 +1694,8 @@ Future peekTLogAndSend(PromiseType replyPromise, TLogData* self, state bool onlySpilled = false; // grab messages from disk - //TraceEvent("TLogPeekMessages", self->dbgid).detail("ReqBeginEpoch", req.begin.epoch).detail("ReqBeginSeq", req.begin.sequence).detail("Epoch", self->epoch()).detail("PersistentDataSeq", self->persistentDataSequence).detail("Tag1", req.tag1).detail("Tag2", req.tag2); - if (begin <= logData->persistentDataDurableVersion) { + //TraceEvent("TLogPeekMessages", self->dbgid).detail("ReqBeginEpoch", reqBegin.epoch).detail("ReqBeginSeq", reqBegin.sequence).detail("Epoch", self->epoch()).detail("PersistentDataSeq", self->persistentDataSequence).detail("Tag1", reqTag1).detail("Tag2", reqTag2); + if (reqBegin <= logData->persistentDataDurableVersion) { // Just in case the durable version changes while we are waiting for the read, we grab this data from memory. We // may or may not actually send it depending on whether we get enough data from disk. 
SOMEDAY: Only do this if // an initial attempt to read from disk results in insufficient data and the required data is no longer in @@ -1710,13 +1705,13 @@ Future peekTLogAndSend(PromiseType replyPromise, TLogData* self, if (reqOnlySpilled) { endVersion = logData->persistentDataDurableVersion + 1; } else { - peekMessagesFromMemory(logData, tag, begin, messages2, endVersion); + peekMessagesFromMemory(logData, reqTag, reqBegin, messages2, endVersion); } - if (tag.locality == tagLocalityTxs || tag == txsTag) { + if (reqTag.locality == tagLocalityTxs || reqTag == txsTag) { RangeResult kvs = wait(self->persistentData->readRange( - KeyRangeRef(persistTagMessagesKey(logData->logId, tag, begin), - persistTagMessagesKey(logData->logId, tag, logData->persistentDataDurableVersion + 1)), + KeyRangeRef(persistTagMessagesKey(logData->logId, reqTag, reqBegin), + persistTagMessagesKey(logData->logId, reqTag, logData->persistentDataDurableVersion + 1)), SERVER_KNOBS->DESIRED_TOTAL_BYTES, SERVER_KNOBS->DESIRED_TOTAL_BYTES)); @@ -1735,11 +1730,12 @@ Future peekTLogAndSend(PromiseType replyPromise, TLogData* self, } else { // FIXME: Limit to approximately DESIRED_TOTATL_BYTES somehow. RangeResult kvrefs = wait(self->persistentData->readRange( - KeyRangeRef(persistTagMessageRefsKey(logData->logId, tag, begin), - persistTagMessageRefsKey(logData->logId, tag, logData->persistentDataDurableVersion + 1)), + KeyRangeRef( + persistTagMessageRefsKey(logData->logId, reqTag, reqBegin), + persistTagMessageRefsKey(logData->logId, reqTag, logData->persistentDataDurableVersion + 1)), SERVER_KNOBS->TLOG_SPILL_REFERENCE_MAX_BATCHES_PER_PEEK + 1)); - //TraceEvent("TLogPeekResults", self->dbgid).detail("ForAddress", reply.getEndpoint().getPrimaryAddress()).detail("Tag1Results", s1).detail("Tag2Results", s2).detail("Tag1ResultsLim", kv1.size()).detail("Tag2ResultsLim", kv2.size()).detail("Tag1ResultsLast", kv1.size() ? kv1[0].key : "").detail("Tag2ResultsLast", kv2.size() ? kv2[0].key : "").detail("Limited", limited).detail("NextEpoch", next_pos.epoch).detail("NextSeq", next_pos.sequence).detail("NowEpoch", self->epoch()).detail("NowSeq", self->sequence.getNextSequence()); + //TraceEvent("TLogPeekResults", self->dbgid).detail("ForAddress", replyPromise.getEndpoint().getPrimaryAddress()).detail("Tag1Results", s1).detail("Tag2Results", s2).detail("Tag1ResultsLim", kv1.size()).detail("Tag2ResultsLim", kv2.size()).detail("Tag1ResultsLast", kv1.size() ? kv1[0].key : "").detail("Tag2ResultsLast", kv2.size() ? 
kv2[0].key : "").detail("Limited", limited).detail("NextEpoch", next_pos.epoch).detail("NextSeq", next_pos.sequence).detail("NowEpoch", self->epoch()).detail("NowSeq", self->sequence.getNextSequence()); state std::vector> commitLocations; state bool earlyEnd = false; @@ -1756,7 +1752,7 @@ Future peekTLogAndSend(PromiseType replyPromise, TLogData* self, earlyEnd = true; break; } - if (sd.version >= begin) { + if (sd.version >= reqBegin) { firstVersion = std::min(firstVersion, sd.version); const IDiskQueue::location end = sd.start.lo + sd.length; commitLocations.emplace_back(sd.start, end); @@ -1798,7 +1794,7 @@ Future peekTLogAndSend(PromiseType replyPromise, TLogData* self, messages << VERSION_HEADER << entry.version; std::vector rawMessages = - wait(parseMessagesForTag(entry.messages, tag, logData->logRouterTags)); + wait(parseMessagesForTag(entry.messages, reqTag, logData->logRouterTags)); for (const StringRef& msg : rawMessages) { messages.serializeBytes(msg); } @@ -1821,10 +1817,10 @@ Future peekTLogAndSend(PromiseType replyPromise, TLogData* self, if (reqOnlySpilled) { endVersion = logData->persistentDataDurableVersion + 1; } else { - peekMessagesFromMemory(logData, tag, begin, messages, endVersion); + peekMessagesFromMemory(logData, reqTag, reqBegin, messages, endVersion); } - //TraceEvent("TLogPeekResults", self->dbgid).detail("ForAddress", req.reply.getEndpoint().getPrimaryAddress()).detail("MessageBytes", messages.getLength()).detail("NextEpoch", next_pos.epoch).detail("NextSeq", next_pos.sequence).detail("NowSeq", self->sequence.getNextSequence()); + //TraceEvent("TLogPeekResults", self->dbgid).detail("ForAddress", replyPromise.getEndpoint().getPrimaryAddress()).detail("MessageBytes", messages.getLength()).detail("NextEpoch", next_pos.epoch).detail("NextSeq", next_pos.sequence).detail("NowSeq", self->sequence.getNextSequence()); } TLogPeekReply reply; @@ -1834,12 +1830,9 @@ Future peekTLogAndSend(PromiseType replyPromise, TLogData* self, reply.end = endVersion; reply.onlySpilled = onlySpilled; - //TraceEvent("TlogPeek", self->dbgid).detail("LogId", logData->logId).detail("Tag", req.tag.toString()). - // detail("BeginVer", req.begin).detail("EndVer", reply.end). - // detail("MsgBytes", reply.messages.expectedSize()). 
- // detail("ForAddress", req.reply.getEndpoint().getPrimaryAddress()); + //TraceEvent("TlogPeek", self->dbgid).detail("LogId", logData->logId).detail("EndVer", reply.end).detail("MsgBytes", reply.messages.expectedSize()).detail("ForAddress", replyPromise.getEndpoint().getPrimaryAddress()); - if (sequence.present()) { + if (reqSequence.present()) { auto& trackerData = logData->peekTracker[peekId]; trackerData.lastUpdate = now(); @@ -1861,29 +1854,24 @@ Future peekTLogAndSend(PromiseType replyPromise, TLogData* self, trackerData.blockTime += blockT; trackerData.workTime += workT; - auto& sequenceData = trackerData.sequence_version[sequenceNum + 1]; - if (trackerData.sequence_version.size() && sequenceNum + 1 < trackerData.sequence_version.begin()->first) { + auto& sequenceData = trackerData.sequence_version[sequence + 1]; + if (trackerData.sequence_version.size() && sequence + 1 < trackerData.sequence_version.begin()->first) { replyPromise.sendError(operation_obsolete()); - if (!sequenceData.isSet()) { - // It would technically be more correct to .send({req.begin, req.onlySpilled}), as the next - // request might still be in the window of active requests, but LogSystemPeekCursor will - // throw away all future responses upon getting an operation_obsolete(), so computing a - // response will probably be a waste of CPU. + if (!sequenceData.isSet()) sequenceData.sendError(operation_obsolete()); - } return Void(); } if (sequenceData.isSet()) { trackerData.duplicatePeeks++; if (sequenceData.getFuture().get().first != reply.end) { - TEST(true); // 1 tlog peek second attempt ended at a different version (2) + TEST(true); // tlog peek second attempt ended at a different version (2) replyPromise.sendError(operation_obsolete()); return Void(); } } else { sequenceData.send(std::make_pair(reply.end, reply.onlySpilled)); } - reply.begin = begin; + reply.begin = reqBegin; } replyPromise.send(reply); @@ -1902,9 +1890,10 @@ ACTOR Future tLogPeekStream(TLogData* self, TLogPeekStreamRequest req, Ref state Promise promise; state Future future(promise.getFuture()); try { - wait(req.reply.onReady() && peekTLogAndSend(promise, self, logData, begin, req.tag, req.returnIfBlocked, onlySpilled)); + wait(req.reply.onReady() && + tLogPeekMessages(promise, self, logData, begin, req.tag, req.returnIfBlocked, onlySpilled)); ASSERT(future.isReady()); - if(future.isError()) { + if (future.isError()) { throw future.getError(); } @@ -1931,330 +1920,6 @@ ACTOR Future tLogPeekStream(TLogData* self, TLogPeekStreamRequest req, Ref } } - -ACTOR Future tLogPeekMessages(TLogData* self, TLogPeekRequest req, Reference logData) { - state BinaryWriter messages(Unversioned()); - state BinaryWriter messages2(Unversioned()); - state int sequence = -1; - state UID peekId; - state double queueStart = now(); - - if (req.tag.locality == tagLocalityTxs && req.tag.id >= logData->txsTags && logData->txsTags > 0) { - req.tag.id = req.tag.id % logData->txsTags; - } - - if (req.sequence.present()) { - try { - peekId = req.sequence.get().first; - sequence = req.sequence.get().second; - if (sequence >= SERVER_KNOBS->PARALLEL_GET_MORE_REQUESTS && - logData->peekTracker.find(peekId) == logData->peekTracker.end()) { - throw operation_obsolete(); - } - auto& trackerData = logData->peekTracker[peekId]; - if (sequence == 0 && trackerData.sequence_version.find(0) == trackerData.sequence_version.end()) { - trackerData.tag = req.tag; - trackerData.sequence_version[0].send(std::make_pair(req.begin, req.onlySpilled)); - } - auto seqBegin = 
trackerData.sequence_version.begin(); - // The peek cursor and this comparison need to agree about the maximum number of in-flight requests. - while (trackerData.sequence_version.size() && - seqBegin->first <= sequence - SERVER_KNOBS->PARALLEL_GET_MORE_REQUESTS) { - if (seqBegin->second.canBeSet()) { - seqBegin->second.sendError(operation_obsolete()); - } - trackerData.sequence_version.erase(seqBegin); - seqBegin = trackerData.sequence_version.begin(); - } - - if (trackerData.sequence_version.size() && sequence < seqBegin->first) { - throw operation_obsolete(); - } - - Future> fPrevPeekData = trackerData.sequence_version[sequence].getFuture(); - if (fPrevPeekData.isReady()) { - trackerData.unblockedPeeks++; - double t = now() - trackerData.lastUpdate; - if (t > trackerData.idleMax) - trackerData.idleMax = t; - trackerData.idleTime += t; - } - trackerData.lastUpdate = now(); - std::pair prevPeekData = wait(fPrevPeekData); - req.begin = std::max(prevPeekData.first, req.begin); - req.onlySpilled = prevPeekData.second; - wait(yield()); - } catch (Error& e) { - if (e.code() == error_code_timed_out || e.code() == error_code_operation_obsolete) { - req.reply.sendError(e); - return Void(); - } else { - throw; - } - } - } - - state double blockStart = now(); - - if (req.returnIfBlocked && logData->version.get() < req.begin) { - req.reply.sendError(end_of_stream()); - if (req.sequence.present()) { - auto& trackerData = logData->peekTracker[peekId]; - auto& sequenceData = trackerData.sequence_version[sequence + 1]; - if (!sequenceData.isSet()) { - sequenceData.send(std::make_pair(req.begin, req.onlySpilled)); - } - } - return Void(); - } - - //TraceEvent("TLogPeekMessages0", self->dbgid).detail("ReqBeginEpoch", req.begin.epoch).detail("ReqBeginSeq", req.begin.sequence).detail("Epoch", self->epoch()).detail("PersistentDataSeq", self->persistentDataSequence).detail("Tag1", req.tag1).detail("Tag2", req.tag2); - // Wait until we have something to return that the caller doesn't already have - if (logData->version.get() < req.begin) { - wait(logData->version.whenAtLeast(req.begin)); - wait(delay(SERVER_KNOBS->TLOG_PEEK_DELAY, g_network->getCurrentTask())); - } - - if (req.tag.locality == tagLocalityLogRouter) { - wait(self->concurrentLogRouterReads.take()); - state FlowLock::Releaser globalReleaser(self->concurrentLogRouterReads); - wait(delay(0.0, TaskPriority::Low)); - } - - if (req.begin <= logData->persistentDataDurableVersion && req.tag.locality != tagLocalityTxs && req.tag != txsTag) { - // Reading spilled data will almost always imply that the storage server is >5s behind the rest - // of the cluster. We shouldn't prioritize spending CPU on helping this server catch up - // slightly faster over keeping the rest of the cluster operating normally. - // txsTag is only ever peeked on recovery, and we would still wish to prioritize requests - // that impact recovery duration. 
- wait(delay(0, TaskPriority::TLogSpilledPeekReply)); - } - - state double workStart = now(); - - Version poppedVer = poppedVersion(logData, req.tag); - if (poppedVer > req.begin) { - TLogPeekReply rep; - rep.maxKnownVersion = logData->version.get(); - rep.minKnownCommittedVersion = logData->minKnownCommittedVersion; - rep.popped = poppedVer; - rep.end = poppedVer; - rep.onlySpilled = false; - - if (req.sequence.present()) { - auto& trackerData = logData->peekTracker[peekId]; - auto& sequenceData = trackerData.sequence_version[sequence + 1]; - trackerData.lastUpdate = now(); - if (trackerData.sequence_version.size() && sequence + 1 < trackerData.sequence_version.begin()->first) { - req.reply.sendError(operation_obsolete()); - if (!sequenceData.isSet()) - sequenceData.sendError(operation_obsolete()); - return Void(); - } - if (sequenceData.isSet()) { - if (sequenceData.getFuture().get().first != rep.end) { - TEST(true); // tlog peek second attempt ended at a different version - req.reply.sendError(operation_obsolete()); - return Void(); - } - } else { - sequenceData.send(std::make_pair(rep.end, rep.onlySpilled)); - } - rep.begin = req.begin; - } - - req.reply.send(rep); - return Void(); - } - - state Version endVersion = logData->version.get() + 1; - state bool onlySpilled = false; - - // grab messages from disk - //TraceEvent("TLogPeekMessages", self->dbgid).detail("ReqBeginEpoch", req.begin.epoch).detail("ReqBeginSeq", req.begin.sequence).detail("Epoch", self->epoch()).detail("PersistentDataSeq", self->persistentDataSequence).detail("Tag1", req.tag1).detail("Tag2", req.tag2); - if (req.begin <= logData->persistentDataDurableVersion) { - // Just in case the durable version changes while we are waiting for the read, we grab this data from memory. We - // may or may not actually send it depending on whether we get enough data from disk. SOMEDAY: Only do this if - // an initial attempt to read from disk results in insufficient data and the required data is no longer in - // memory SOMEDAY: Should we only send part of the messages we collected, to actually limit the size of the - // result? - - if (req.onlySpilled) { - endVersion = logData->persistentDataDurableVersion + 1; - } else { - peekMessagesFromMemory(logData, req.tag, req.begin, messages2, endVersion); - } - - if (req.tag.locality == tagLocalityTxs || req.tag == txsTag) { - RangeResult kvs = wait(self->persistentData->readRange( - KeyRangeRef(persistTagMessagesKey(logData->logId, req.tag, req.begin), - persistTagMessagesKey(logData->logId, req.tag, logData->persistentDataDurableVersion + 1)), - SERVER_KNOBS->DESIRED_TOTAL_BYTES, - SERVER_KNOBS->DESIRED_TOTAL_BYTES)); - - for (auto& kv : kvs) { - auto ver = decodeTagMessagesKey(kv.key); - messages << VERSION_HEADER << ver; - messages.serializeBytes(kv.value); - } - - if (kvs.expectedSize() >= SERVER_KNOBS->DESIRED_TOTAL_BYTES) { - endVersion = decodeTagMessagesKey(kvs.end()[-1].key) + 1; - onlySpilled = true; - } else { - messages.serializeBytes(messages2.toValue()); - } - } else { - // FIXME: Limit to approximately DESIRED_TOTATL_BYTES somehow. 
- RangeResult kvrefs = wait(self->persistentData->readRange( - KeyRangeRef( - persistTagMessageRefsKey(logData->logId, req.tag, req.begin), - persistTagMessageRefsKey(logData->logId, req.tag, logData->persistentDataDurableVersion + 1)), - SERVER_KNOBS->TLOG_SPILL_REFERENCE_MAX_BATCHES_PER_PEEK + 1)); - - //TraceEvent("TLogPeekResults", self->dbgid).detail("ForAddress", req.reply.getEndpoint().getPrimaryAddress()).detail("Tag1Results", s1).detail("Tag2Results", s2).detail("Tag1ResultsLim", kv1.size()).detail("Tag2ResultsLim", kv2.size()).detail("Tag1ResultsLast", kv1.size() ? kv1[0].key : "").detail("Tag2ResultsLast", kv2.size() ? kv2[0].key : "").detail("Limited", limited).detail("NextEpoch", next_pos.epoch).detail("NextSeq", next_pos.sequence).detail("NowEpoch", self->epoch()).detail("NowSeq", self->sequence.getNextSequence()); - - state std::vector> commitLocations; - state bool earlyEnd = false; - uint32_t mutationBytes = 0; - state uint64_t commitBytes = 0; - state Version firstVersion = std::numeric_limits::max(); - for (int i = 0; i < kvrefs.size() && i < SERVER_KNOBS->TLOG_SPILL_REFERENCE_MAX_BATCHES_PER_PEEK; i++) { - auto& kv = kvrefs[i]; - VectorRef spilledData; - BinaryReader r(kv.value, AssumeVersion(logData->protocolVersion)); - r >> spilledData; - for (const SpilledData& sd : spilledData) { - if (mutationBytes >= SERVER_KNOBS->DESIRED_TOTAL_BYTES) { - earlyEnd = true; - break; - } - if (sd.version >= req.begin) { - firstVersion = std::min(firstVersion, sd.version); - const IDiskQueue::location end = sd.start.lo + sd.length; - commitLocations.emplace_back(sd.start, end); - // This isn't perfect, because we aren't accounting for page boundaries, but should be - // close enough. - commitBytes += sd.length; - mutationBytes += sd.mutationBytes; - } - } - if (earlyEnd) - break; - } - earlyEnd = earlyEnd || (kvrefs.size() >= SERVER_KNOBS->TLOG_SPILL_REFERENCE_MAX_BATCHES_PER_PEEK + 1); - wait(self->peekMemoryLimiter.take(TaskPriority::TLogSpilledPeekReply, commitBytes)); - state FlowLock::Releaser memoryReservation(self->peekMemoryLimiter, commitBytes); - state std::vector>> messageReads; - messageReads.reserve(commitLocations.size()); - for (const auto& pair : commitLocations) { - messageReads.push_back(self->rawPersistentQueue->read(pair.first, pair.second, CheckHashes::True)); - } - commitLocations.clear(); - wait(waitForAll(messageReads)); - - state Version lastRefMessageVersion = 0; - state int index = 0; - loop { - if (index >= messageReads.size()) - break; - Standalone queueEntryData = messageReads[index].get(); - uint8_t valid; - const uint32_t length = *(uint32_t*)queueEntryData.begin(); - queueEntryData = queueEntryData.substr(4, queueEntryData.size() - 4); - BinaryReader rd(queueEntryData, IncludeVersion()); - state TLogQueueEntry entry; - rd >> entry >> valid; - ASSERT(valid == 0x01); - ASSERT(length + sizeof(valid) == queueEntryData.size()); - - messages << VERSION_HEADER << entry.version; - - std::vector rawMessages = - wait(parseMessagesForTag(entry.messages, req.tag, logData->logRouterTags)); - for (const StringRef& msg : rawMessages) { - messages.serializeBytes(msg); - } - - lastRefMessageVersion = entry.version; - index++; - } - - messageReads.clear(); - memoryReservation.release(); - - if (earlyEnd) { - endVersion = lastRefMessageVersion + 1; - onlySpilled = true; - } else { - messages.serializeBytes(messages2.toValue()); - } - } - } else { - if (req.onlySpilled) { - endVersion = logData->persistentDataDurableVersion + 1; - } else { - 
peekMessagesFromMemory(logData, req.tag, req.begin, messages, endVersion);
-        }
-
-        //TraceEvent("TLogPeekResults", self->dbgid).detail("ForAddress", req.reply.getEndpoint().getPrimaryAddress()).detail("MessageBytes", messages.getLength()).detail("NextEpoch", next_pos.epoch).detail("NextSeq", next_pos.sequence).detail("NowSeq", self->sequence.getNextSequence());
-    }
-
-    TLogPeekReply reply;
-    reply.maxKnownVersion = logData->version.get();
-    reply.minKnownCommittedVersion = logData->minKnownCommittedVersion;
-    reply.messages = messages.toValue();
-    reply.end = endVersion;
-    reply.onlySpilled = onlySpilled;
-
-    //TraceEvent("TlogPeek", self->dbgid).detail("LogId", logData->logId).detail("EndVer", reply.end).detail("MsgBytes", reply.messages.expectedSize()).detail("ForAddress", req.reply.getEndpoint().getPrimaryAddress());
-
-    if (req.sequence.present()) {
-        auto& trackerData = logData->peekTracker[peekId];
-        trackerData.lastUpdate = now();
-
-        double queueT = blockStart - queueStart;
-        double blockT = workStart - blockStart;
-        double workT = now() - workStart;
-
-        trackerData.totalPeeks++;
-        trackerData.replyBytes += reply.messages.size();
-
-        if (queueT > trackerData.queueMax)
-            trackerData.queueMax = queueT;
-        if (blockT > trackerData.blockMax)
-            trackerData.blockMax = blockT;
-        if (workT > trackerData.workMax)
-            trackerData.workMax = workT;
-
-        trackerData.queueTime += queueT;
-        trackerData.blockTime += blockT;
-        trackerData.workTime += workT;
-
-        auto& sequenceData = trackerData.sequence_version[sequence + 1];
-        if (trackerData.sequence_version.size() && sequence + 1 < trackerData.sequence_version.begin()->first) {
-            req.reply.sendError(operation_obsolete());
-            if (!sequenceData.isSet())
-                sequenceData.sendError(operation_obsolete());
-            return Void();
-        }
-        if (sequenceData.isSet()) {
-            trackerData.duplicatePeeks++;
-            if (sequenceData.getFuture().get().first != reply.end) {
-                TEST(true); // tlog peek second attempt ended at a different version (2)
-                req.reply.sendError(operation_obsolete());
-                return Void();
-            }
-        } else {
-            sequenceData.send(std::make_pair(reply.end, reply.onlySpilled));
-        }
-        reply.begin = req.begin;
-    }
-
-    req.reply.send(reply);
-    return Void();
-}
-
 ACTOR Future watchDegraded(TLogData* self) {
     if (g_network->isSimulated() && g_simulator.speedUpSimulation) {
         return Void();
     }
@@ -2765,7 +2430,8 @@ ACTOR Future serveTLogInterface(TLogData* self,
             }
         }
         when(TLogPeekRequest req = waitNext(tli.peekMessages.getFuture())) {
-            logData->addActor.send(tLogPeekMessages(self, req, logData));
+            logData->addActor.send(tLogPeekMessages(
+                req.reply, self, logData, req.begin, req.tag, req.returnIfBlocked, req.onlySpilled, req.sequence));
         }
         when(TLogPeekStreamRequest req = waitNext(tli.peekStreamMessages.getFuture())) {
             TraceEvent(SevDebug, "TLogPeekStream", logData->logId)
diff --git a/fdbserver/TLogServer.actor.cpp b/fdbserver/TLogServer.actor.cpp
index 3ca0926135..aa2afd5406 100644
--- a/fdbserver/TLogServer.actor.cpp
+++ b/fdbserver/TLogServer.actor.cpp
@@ -1587,67 +1587,66 @@ ACTOR Future> parseMessagesForTag(StringRef commitBlob, T
 }

 // Common logic to peek the TLog and build a TLogPeekReply, serving both streaming and normal peek requests
-ACTOR template 
-Future peekTLogAndSend(PromiseType replyPromise, TLogData* self,
-                             Reference logData,
-                             Version begin,
-                             Tag tag,
-                             bool returnIfBlocked = false,
-                             bool reqOnlySpilled = false,
-                             Optional> sequence = Optional>()) {
+ACTOR template 
+Future tLogPeekMessages(PromiseType replyPromise,
+                              TLogData* self,
+                              Reference logData,
+
Version reqBegin, + Tag reqTag, + bool reqReturnIfBlocked = false, + bool reqOnlySpilled = false, + Optional> reqSequence = Optional>()) { state BinaryWriter messages(Unversioned()); state BinaryWriter messages2(Unversioned()); - state int sequenceNum = -1; + state int sequence = -1; state UID peekId; state double queueStart = now(); - if (tag.locality == tagLocalityTxs && tag.id >= logData->txsTags && logData->txsTags > 0) { - tag.id = tag.id % logData->txsTags; + if (reqTag.locality == tagLocalityTxs && reqTag.id >= logData->txsTags && logData->txsTags > 0) { + reqTag.id = reqTag.id % logData->txsTags; } - // TODO: once the fake stream is replaced by ReplyPromiseStream, we can remove the code handling sequence requests - // STEP: a. mark obsolete sequence requests; b. wait previous sequence requests are handled in order - if (sequence.present()) { - try { - peekId = sequence.get().first; - sequenceNum = sequence.get().second; - if (sequenceNum >= SERVER_KNOBS->PARALLEL_GET_MORE_REQUESTS && - logData->peekTracker.find(peekId) == logData->peekTracker.end()) { - throw operation_obsolete(); - } - auto& trackerData = logData->peekTracker[peekId]; - if (sequenceNum == 0 && trackerData.sequence_version.find(0) == trackerData.sequence_version.end()) { - trackerData.tag = tag; - trackerData.sequence_version[0].send(std::make_pair(begin, reqOnlySpilled)); - } - auto seqBegin = trackerData.sequence_version.begin(); - // The peek cursor and this comparison need to agree about the maximum number of in-flight requests. - while (trackerData.sequence_version.size() && - seqBegin->first <= sequenceNum - SERVER_KNOBS->PARALLEL_GET_MORE_REQUESTS) { - if (seqBegin->second.canBeSet()) { - seqBegin->second.sendError(operation_obsolete()); - } - trackerData.sequence_version.erase(seqBegin); - seqBegin = trackerData.sequence_version.begin(); - } + if (reqSequence.present()) { + try { + peekId = reqSequence.get().first; + sequence = reqSequence.get().second; + if (sequence >= SERVER_KNOBS->PARALLEL_GET_MORE_REQUESTS && + logData->peekTracker.find(peekId) == logData->peekTracker.end()) { + throw operation_obsolete(); + } + auto& trackerData = logData->peekTracker[peekId]; + if (sequence == 0 && trackerData.sequence_version.find(0) == trackerData.sequence_version.end()) { + trackerData.tag = reqTag; + trackerData.sequence_version[0].send(std::make_pair(reqBegin, reqOnlySpilled)); + } + auto seqBegin = trackerData.sequence_version.begin(); + // The peek cursor and this comparison need to agree about the maximum number of in-flight requests. 
+ while (trackerData.sequence_version.size() && + seqBegin->first <= sequence - SERVER_KNOBS->PARALLEL_GET_MORE_REQUESTS) { + if (seqBegin->second.canBeSet()) { + seqBegin->second.sendError(operation_obsolete()); + } + trackerData.sequence_version.erase(seqBegin); + seqBegin = trackerData.sequence_version.begin(); + } - if (trackerData.sequence_version.size() && sequenceNum < seqBegin->first) { - throw operation_obsolete(); - } + if (trackerData.sequence_version.size() && sequence < seqBegin->first) { + throw operation_obsolete(); + } - Future> fPrevPeekData = trackerData.sequence_version[sequenceNum].getFuture(); - if (fPrevPeekData.isReady()) { - trackerData.unblockedPeeks++; - double t = now() - trackerData.lastUpdate; - if (t > trackerData.idleMax) - trackerData.idleMax = t; - trackerData.idleTime += t; - } - trackerData.lastUpdate = now(); - std::pair prevPeekData = wait(fPrevPeekData); - begin = std::max(prevPeekData.first, begin); - reqOnlySpilled = prevPeekData.second; - wait(yield()); + Future> fPrevPeekData = trackerData.sequence_version[sequence].getFuture(); + if (fPrevPeekData.isReady()) { + trackerData.unblockedPeeks++; + double t = now() - trackerData.lastUpdate; + if (t > trackerData.idleMax) + trackerData.idleMax = t; + trackerData.idleTime += t; + } + trackerData.lastUpdate = now(); + std::pair prevPeekData = wait(fPrevPeekData); + reqBegin = std::max(prevPeekData.first, reqBegin); + reqOnlySpilled = prevPeekData.second; + wait(yield()); } catch (Error& e) { if (e.code() == error_code_timed_out || e.code() == error_code_operation_obsolete) { replyPromise.sendError(e); @@ -1660,33 +1659,33 @@ Future peekTLogAndSend(PromiseType replyPromise, TLogData* self, state double blockStart = now(); - if (returnIfBlocked && logData->version.get() < begin) { - replyPromise.sendError(end_of_stream()); - if (sequence.present()) { + if (reqReturnIfBlocked && logData->version.get() < reqBegin) { + replyPromise.sendError(end_of_stream()); + if (reqSequence.present()) { auto& trackerData = logData->peekTracker[peekId]; - auto& sequenceData = trackerData.sequence_version[sequenceNum + 1]; + auto& sequenceData = trackerData.sequence_version[sequence + 1]; trackerData.lastUpdate = now(); if (!sequenceData.isSet()) { - sequenceData.send(std::make_pair(begin, reqOnlySpilled)); + sequenceData.send(std::make_pair(reqBegin, reqOnlySpilled)); } } return Void(); } - //TraceEvent("TLogPeekMessages0", self->dbgid).detail("ReqBeginEpoch", req.begin.epoch).detail("ReqBeginSeq", req.begin.sequence).detail("Epoch", self->epoch()).detail("PersistentDataSeq", self->persistentDataSequence).detail("Tag1", req.tag1).detail("Tag2", req.tag2); + //TraceEvent("TLogPeekMessages0", self->dbgid).detail("ReqBeginEpoch", reqBegin.epoch).detail("ReqBeginSeq", reqBegin.sequence).detail("Epoch", self->epoch()).detail("PersistentDataSeq", self->persistentDataSequence).detail("Tag1", reqTag1).detail("Tag2", reqTag2); // Wait until we have something to return that the caller doesn't already have - if (logData->version.get() < begin) { - wait(logData->version.whenAtLeast(begin)); + if (logData->version.get() < reqBegin) { + wait(logData->version.whenAtLeast(reqBegin)); wait(delay(SERVER_KNOBS->TLOG_PEEK_DELAY, g_network->getCurrentTask())); } - if (logData->locality != tagLocalitySatellite && tag.locality == tagLocalityLogRouter) { + if (logData->locality != tagLocalitySatellite && reqTag.locality == tagLocalityLogRouter) { wait(self->concurrentLogRouterReads.take()); state FlowLock::Releaser 
globalReleaser(self->concurrentLogRouterReads); wait(delay(0.0, TaskPriority::Low)); } - if (begin <= logData->persistentDataDurableVersion && tag.locality != tagLocalityTxs && tag != txsTag) { + if (reqBegin <= logData->persistentDataDurableVersion && reqTag.locality != tagLocalityTxs && reqTag != txsTag) { // Reading spilled data will almost always imply that the storage server is >5s behind the rest // of the cluster. We shouldn't prioritize spending CPU on helping this server catch up // slightly faster over keeping the rest of the cluster operating normally. @@ -1697,9 +1696,8 @@ Future peekTLogAndSend(PromiseType replyPromise, TLogData* self, state double workStart = now(); - Version poppedVer = poppedVersion(logData, tag); - if (poppedVer > begin) { - // reply with an empty message and let the next reply start from poppedVer + Version poppedVer = poppedVersion(logData, reqTag); + if (poppedVer > reqBegin) { TLogPeekReply rep; rep.maxKnownVersion = logData->version.get(); rep.minKnownCommittedVersion = logData->minKnownCommittedVersion; @@ -1707,13 +1705,11 @@ Future peekTLogAndSend(PromiseType replyPromise, TLogData* self, rep.end = poppedVer; rep.onlySpilled = false; - // TODO: once the fake stream is replaced by ReplyPromiseStream, we can remove the code handling sequence - // requests. - if (sequence.present()) { + if (reqSequence.present()) { auto& trackerData = logData->peekTracker[peekId]; - auto& sequenceData = trackerData.sequence_version[sequenceNum + 1]; + auto& sequenceData = trackerData.sequence_version[sequence + 1]; trackerData.lastUpdate = now(); - if (trackerData.sequence_version.size() && sequenceNum + 1 < trackerData.sequence_version.begin()->first) { + if (trackerData.sequence_version.size() && sequence + 1 < trackerData.sequence_version.begin()->first) { replyPromise.sendError(operation_obsolete()); if (!sequenceData.isSet()) sequenceData.sendError(operation_obsolete()); @@ -1721,14 +1717,14 @@ Future peekTLogAndSend(PromiseType replyPromise, TLogData* self, } if (sequenceData.isSet()) { if (sequenceData.getFuture().get().first != rep.end) { - TEST(true); // xz tlog peek second attempt ended at a different version + TEST(true); // tlog peek second attempt ended at a different version replyPromise.sendError(operation_obsolete()); return Void(); } } else { sequenceData.send(std::make_pair(rep.end, rep.onlySpilled)); } - rep.begin = begin; + rep.begin = reqBegin; } replyPromise.send(rep); @@ -1739,8 +1735,8 @@ Future peekTLogAndSend(PromiseType replyPromise, TLogData* self, state bool onlySpilled = false; // grab messages from disk - //TraceEvent("TLogPeekMessages", self->dbgid).detail("ReqBeginEpoch", req.begin.epoch).detail("ReqBeginSeq", req.begin.sequence).detail("Epoch", self->epoch()).detail("PersistentDataSeq", self->persistentDataSequence).detail("Tag1", req.tag1).detail("Tag2", req.tag2); - if (begin <= logData->persistentDataDurableVersion) { + //TraceEvent("TLogPeekMessages", self->dbgid).detail("ReqBeginEpoch", reqBegin.epoch).detail("ReqBeginSeq", reqBegin.sequence).detail("Epoch", self->epoch()).detail("PersistentDataSeq", self->persistentDataSequence).detail("Tag1", reqTag1).detail("Tag2", reqTag2); + if (reqBegin <= logData->persistentDataDurableVersion) { // Just in case the durable version changes while we are waiting for the read, we grab this data from memory. We // may or may not actually send it depending on whether we get enough data from disk. 
SOMEDAY: Only do this if // an initial attempt to read from disk results in insufficient data and the required data is no longer in @@ -1750,13 +1746,13 @@ Future peekTLogAndSend(PromiseType replyPromise, TLogData* self, if (reqOnlySpilled) { endVersion = logData->persistentDataDurableVersion + 1; } else { - peekMessagesFromMemory(logData, tag, begin, messages2, endVersion); + peekMessagesFromMemory(logData, reqTag, reqBegin, messages2, endVersion); } - if (logData->shouldSpillByValue(tag)) { + if (logData->shouldSpillByValue(reqTag)) { RangeResult kvs = wait(self->persistentData->readRange( - KeyRangeRef(persistTagMessagesKey(logData->logId, tag, begin), - persistTagMessagesKey(logData->logId, tag, logData->persistentDataDurableVersion + 1)), + KeyRangeRef(persistTagMessagesKey(logData->logId, reqTag, reqBegin), + persistTagMessagesKey(logData->logId, reqTag, logData->persistentDataDurableVersion + 1)), SERVER_KNOBS->DESIRED_TOTAL_BYTES, SERVER_KNOBS->DESIRED_TOTAL_BYTES)); @@ -1775,11 +1771,12 @@ Future peekTLogAndSend(PromiseType replyPromise, TLogData* self, } else { // FIXME: Limit to approximately DESIRED_TOTATL_BYTES somehow. RangeResult kvrefs = wait(self->persistentData->readRange( - KeyRangeRef(persistTagMessageRefsKey(logData->logId, tag, begin), - persistTagMessageRefsKey(logData->logId, tag, logData->persistentDataDurableVersion + 1)), + KeyRangeRef( + persistTagMessageRefsKey(logData->logId, reqTag, reqBegin), + persistTagMessageRefsKey(logData->logId, reqTag, logData->persistentDataDurableVersion + 1)), SERVER_KNOBS->TLOG_SPILL_REFERENCE_MAX_BATCHES_PER_PEEK + 1)); - //TraceEvent("TLogPeekResults", self->dbgid).detail("ForAddress", req.reply.getEndpoint().getPrimaryAddress()).detail("Tag1Results", s1).detail("Tag2Results", s2).detail("Tag1ResultsLim", kv1.size()).detail("Tag2ResultsLim", kv2.size()).detail("Tag1ResultsLast", kv1.size() ? kv1[0].key : "").detail("Tag2ResultsLast", kv2.size() ? kv2[0].key : "").detail("Limited", limited).detail("NextEpoch", next_pos.epoch).detail("NextSeq", next_pos.sequence).detail("NowEpoch", self->epoch()).detail("NowSeq", self->sequence.getNextSequence()); + //TraceEvent("TLogPeekResults", self->dbgid).detail("ForAddress", replyPromise.getEndpoint().getPrimaryAddress()).detail("Tag1Results", s1).detail("Tag2Results", s2).detail("Tag1ResultsLim", kv1.size()).detail("Tag2ResultsLim", kv2.size()).detail("Tag1ResultsLast", kv1.size() ? kv1[0].key : "").detail("Tag2ResultsLast", kv2.size() ? 
kv2[0].key : "").detail("Limited", limited).detail("NextEpoch", next_pos.epoch).detail("NextSeq", next_pos.sequence).detail("NowEpoch", self->epoch()).detail("NowSeq", self->sequence.getNextSequence()); state std::vector> commitLocations; state bool earlyEnd = false; @@ -1796,7 +1793,7 @@ Future peekTLogAndSend(PromiseType replyPromise, TLogData* self, earlyEnd = true; break; } - if (sd.version >= begin) { + if (sd.version >= reqBegin) { firstVersion = std::min(firstVersion, sd.version); const IDiskQueue::location end = sd.start.lo + sd.length; commitLocations.emplace_back(sd.start, end); @@ -1838,13 +1835,13 @@ Future peekTLogAndSend(PromiseType replyPromise, TLogData* self, messages << VERSION_HEADER << entry.version; std::vector rawMessages = - wait(parseMessagesForTag(entry.messages, tag, logData->logRouterTags)); + wait(parseMessagesForTag(entry.messages, reqTag, logData->logRouterTags)); for (const StringRef& msg : rawMessages) { messages.serializeBytes(msg); DEBUG_TAGS_AND_MESSAGE("TLogPeekFromDisk", entry.version, msg) .detail("UID", self->dbgid) .detail("LogId", logData->logId) - .detail("PeekTag", tag); + .detail("PeekTag", reqTag); } lastRefMessageVersion = entry.version; @@ -1865,10 +1862,10 @@ Future peekTLogAndSend(PromiseType replyPromise, TLogData* self, if (reqOnlySpilled) { endVersion = logData->persistentDataDurableVersion + 1; } else { - peekMessagesFromMemory(logData, tag, begin, messages, endVersion); + peekMessagesFromMemory(logData, reqTag, reqBegin, messages, endVersion); } - //TraceEvent("TLogPeekResults", self->dbgid).detail("ForAddress", req.reply.getEndpoint().getPrimaryAddress()).detail("MessageBytes", messages.getLength()).detail("NextEpoch", next_pos.epoch).detail("NextSeq", next_pos.sequence).detail("NowSeq", self->sequence.getNextSequence()); + //TraceEvent("TLogPeekResults", self->dbgid).detail("ForAddress", replyPromise.getEndpoint().getPrimaryAddress()).detail("MessageBytes", messages.getLength()).detail("NextEpoch", next_pos.epoch).detail("NextSeq", next_pos.sequence).detail("NowSeq", self->sequence.getNextSequence()); } TLogPeekReply reply; @@ -1878,12 +1875,12 @@ Future peekTLogAndSend(PromiseType replyPromise, TLogData* self, reply.end = endVersion; reply.onlySpilled = onlySpilled; - //TraceEvent("TlogPeek", self->dbgid).detail("LogId", logData->logId).detail("Tag", req.tag.toString()). - // detail("BeginVer", req.begin).detail("EndVer", reply.end). + //TraceEvent("TlogPeek", self->dbgid).detail("LogId", logData->logId).detail("Tag", reqTag.toString()). + // detail("BeginVer", reqBegin).detail("EndVer", reply.end). // detail("MsgBytes", reply.messages.expectedSize()). 
-    //       detail("ForAddress", req.reply.getEndpoint().getPrimaryAddress());
+    //       detail("ForAddress", replyPromise.getEndpoint().getPrimaryAddress());

-    if (sequence.present()) {
+    if (reqSequence.present()) {
         auto& trackerData = logData->peekTracker[peekId];
         trackerData.lastUpdate = now();
@@ -1905,11 +1902,11 @@ Future peekTLogAndSend(PromiseType replyPromise, TLogData* self,
         trackerData.blockTime += blockT;
         trackerData.workTime += workT;

-        auto& sequenceData = trackerData.sequence_version[sequenceNum + 1];
-        if (trackerData.sequence_version.size() && sequenceNum + 1 < trackerData.sequence_version.begin()->first) {
+        auto& sequenceData = trackerData.sequence_version[sequence + 1];
+        if (trackerData.sequence_version.size() && sequence + 1 < trackerData.sequence_version.begin()->first) {
             replyPromise.sendError(operation_obsolete());
             if (!sequenceData.isSet()) {
-                // It would technically be more correct to .send({req.begin, req.onlySpilled}), as the next
+                // It would technically be more correct to .send({reqBegin, reqOnlySpilled}), as the next
                 // request might still be in the window of active requests, but LogSystemPeekCursor will
                 // throw away all future responses upon getting an operation_obsolete(), so computing a
                 // response will probably be a waste of CPU.
@@ -1920,21 +1917,20 @@ Future peekTLogAndSend(PromiseType replyPromise, TLogData* self,
         if (sequenceData.isSet()) {
             trackerData.duplicatePeeks++;
             if (sequenceData.getFuture().get().first != reply.end) {
-                TEST(true); // xz tlog peek second attempt ended at a different version (2)
+                TEST(true); // tlog peek second attempt ended at a different version (2)
                 replyPromise.sendError(operation_obsolete());
                 return Void();
             }
         } else {
             sequenceData.send(std::make_pair(reply.end, reply.onlySpilled));
         }
-        reply.begin = begin;
+        reply.begin = reqBegin;
     }

     replyPromise.send(reply);
     return Void();
 }
-
 // This actor keeps pushing TLogPeekStreamReply until the TLog is removed from the cluster or needs to recover
 ACTOR Future tLogPeekStream(TLogData* self, TLogPeekStreamRequest req, Reference logData) {
     self->activePeekStreams++;
@@ -1947,9 +1943,10 @@ ACTOR Future tLogPeekStream(TLogData* self, TLogPeekStreamRequest req, Ref
         state Promise promise;
         state Future future(promise.getFuture());
         try {
-            wait(req.reply.onReady() && peekTLogAndSend(promise, self, logData, begin, req.tag, req.returnIfBlocked, onlySpilled));
+            wait(req.reply.onReady() &&
+                 tLogPeekMessages(promise, self, logData, begin, req.tag, req.returnIfBlocked, onlySpilled));
             ASSERT(future.isReady());
-            if(future.isError()) {
+            if (future.isError()) {
                 throw future.getError();
             }
@@ -1976,343 +1973,6 @@ ACTOR Future tLogPeekStream(TLogData* self, TLogPeekStreamRequest req, Ref
     }
 }
-ACTOR Future tLogPeekMessages(TLogData* self, TLogPeekRequest req, Reference logData) {
-    state BinaryWriter messages(Unversioned());
-    state BinaryWriter messages2(Unversioned());
-    state int sequence = -1;
-    state UID peekId;
-    state double queueStart = now();
-
-    if (req.tag.locality == tagLocalityTxs && req.tag.id >= logData->txsTags && logData->txsTags > 0) {
-        req.tag.id = req.tag.id % logData->txsTags;
-    }
-
-    if (req.sequence.present()) {
-        try {
-            peekId = req.sequence.get().first;
-            sequence = req.sequence.get().second;
-            if (sequence >= SERVER_KNOBS->PARALLEL_GET_MORE_REQUESTS &&
-                logData->peekTracker.find(peekId) == logData->peekTracker.end()) {
-                throw operation_obsolete();
-            }
-            auto& trackerData = logData->peekTracker[peekId];
-            if (sequence == 0 && trackerData.sequence_version.find(0) ==
trackerData.sequence_version.end()) { - trackerData.tag = req.tag; - trackerData.sequence_version[0].send(std::make_pair(req.begin, req.onlySpilled)); - } - auto seqBegin = trackerData.sequence_version.begin(); - // The peek cursor and this comparison need to agree about the maximum number of in-flight requests. - while (trackerData.sequence_version.size() && - seqBegin->first <= sequence - SERVER_KNOBS->PARALLEL_GET_MORE_REQUESTS) { - if (seqBegin->second.canBeSet()) { - seqBegin->second.sendError(operation_obsolete()); - } - trackerData.sequence_version.erase(seqBegin); - seqBegin = trackerData.sequence_version.begin(); - } - - if (trackerData.sequence_version.size() && sequence < seqBegin->first) { - throw operation_obsolete(); - } - - Future> fPrevPeekData = trackerData.sequence_version[sequence].getFuture(); - if (fPrevPeekData.isReady()) { - trackerData.unblockedPeeks++; - double t = now() - trackerData.lastUpdate; - if (t > trackerData.idleMax) - trackerData.idleMax = t; - trackerData.idleTime += t; - } - trackerData.lastUpdate = now(); - std::pair prevPeekData = wait(fPrevPeekData); - req.begin = std::max(prevPeekData.first, req.begin); - req.onlySpilled = prevPeekData.second; - wait(yield()); - } catch (Error& e) { - if (e.code() == error_code_timed_out || e.code() == error_code_operation_obsolete) { - req.reply.sendError(e); - return Void(); - } else { - throw; - } - } - } - - state double blockStart = now(); - - if (req.returnIfBlocked && logData->version.get() < req.begin) { - req.reply.sendError(end_of_stream()); - if (req.sequence.present()) { - auto& trackerData = logData->peekTracker[peekId]; - auto& sequenceData = trackerData.sequence_version[sequence + 1]; - trackerData.lastUpdate = now(); - if (!sequenceData.isSet()) { - sequenceData.send(std::make_pair(req.begin, req.onlySpilled)); - } - } - return Void(); - } - - //TraceEvent("TLogPeekMessages0", self->dbgid).detail("ReqBeginEpoch", req.begin.epoch).detail("ReqBeginSeq", req.begin.sequence).detail("Epoch", self->epoch()).detail("PersistentDataSeq", self->persistentDataSequence).detail("Tag1", req.tag1).detail("Tag2", req.tag2); - // Wait until we have something to return that the caller doesn't already have - if (logData->version.get() < req.begin) { - wait(logData->version.whenAtLeast(req.begin)); - wait(delay(SERVER_KNOBS->TLOG_PEEK_DELAY, g_network->getCurrentTask())); - } - - if (logData->locality != tagLocalitySatellite && req.tag.locality == tagLocalityLogRouter) { - wait(self->concurrentLogRouterReads.take()); - state FlowLock::Releaser globalReleaser(self->concurrentLogRouterReads); - wait(delay(0.0, TaskPriority::Low)); - } - - if (req.begin <= logData->persistentDataDurableVersion && req.tag.locality != tagLocalityTxs && req.tag != txsTag) { - // Reading spilled data will almost always imply that the storage server is >5s behind the rest - // of the cluster. We shouldn't prioritize spending CPU on helping this server catch up - // slightly faster over keeping the rest of the cluster operating normally. - // txsTag is only ever peeked on recovery, and we would still wish to prioritize requests - // that impact recovery duration. 
- wait(delay(0, TaskPriority::TLogSpilledPeekReply)); - } - - state double workStart = now(); - - Version poppedVer = poppedVersion(logData, req.tag); - if (poppedVer > req.begin) { - TLogPeekReply rep; - rep.maxKnownVersion = logData->version.get(); - rep.minKnownCommittedVersion = logData->minKnownCommittedVersion; - rep.popped = poppedVer; - rep.end = poppedVer; - rep.onlySpilled = false; - - if (req.sequence.present()) { - auto& trackerData = logData->peekTracker[peekId]; - auto& sequenceData = trackerData.sequence_version[sequence + 1]; - trackerData.lastUpdate = now(); - if (trackerData.sequence_version.size() && sequence + 1 < trackerData.sequence_version.begin()->first) { - req.reply.sendError(operation_obsolete()); - if (!sequenceData.isSet()) - sequenceData.sendError(operation_obsolete()); - return Void(); - } - if (sequenceData.isSet()) { - if (sequenceData.getFuture().get().first != rep.end) { - TEST(true); // tlog peek second attempt ended at a different version - req.reply.sendError(operation_obsolete()); - return Void(); - } - } else { - sequenceData.send(std::make_pair(rep.end, rep.onlySpilled)); - } - rep.begin = req.begin; - } - - req.reply.send(rep); - return Void(); - } - - state Version endVersion = logData->version.get() + 1; - state bool onlySpilled = false; - - // grab messages from disk - //TraceEvent("TLogPeekMessages", self->dbgid).detail("ReqBeginEpoch", req.begin.epoch).detail("ReqBeginSeq", req.begin.sequence).detail("Epoch", self->epoch()).detail("PersistentDataSeq", self->persistentDataSequence).detail("Tag1", req.tag1).detail("Tag2", req.tag2); - if (req.begin <= logData->persistentDataDurableVersion) { - // Just in case the durable version changes while we are waiting for the read, we grab this data from memory. We - // may or may not actually send it depending on whether we get enough data from disk. SOMEDAY: Only do this if - // an initial attempt to read from disk results in insufficient data and the required data is no longer in - // memory SOMEDAY: Should we only send part of the messages we collected, to actually limit the size of the - // result? - - if (req.onlySpilled) { - endVersion = logData->persistentDataDurableVersion + 1; - } else { - peekMessagesFromMemory(logData, req.tag, req.begin, messages2, endVersion); - } - - if (logData->shouldSpillByValue(req.tag)) { - RangeResult kvs = wait(self->persistentData->readRange( - KeyRangeRef(persistTagMessagesKey(logData->logId, req.tag, req.begin), - persistTagMessagesKey(logData->logId, req.tag, logData->persistentDataDurableVersion + 1)), - SERVER_KNOBS->DESIRED_TOTAL_BYTES, - SERVER_KNOBS->DESIRED_TOTAL_BYTES)); - - for (auto& kv : kvs) { - auto ver = decodeTagMessagesKey(kv.key); - messages << VERSION_HEADER << ver; - messages.serializeBytes(kv.value); - } - - if (kvs.expectedSize() >= SERVER_KNOBS->DESIRED_TOTAL_BYTES) { - endVersion = decodeTagMessagesKey(kvs.end()[-1].key) + 1; - onlySpilled = true; - } else { - messages.serializeBytes(messages2.toValue()); - } - } else { - // FIXME: Limit to approximately DESIRED_TOTATL_BYTES somehow. 
- RangeResult kvrefs = wait(self->persistentData->readRange( - KeyRangeRef( - persistTagMessageRefsKey(logData->logId, req.tag, req.begin), - persistTagMessageRefsKey(logData->logId, req.tag, logData->persistentDataDurableVersion + 1)), - SERVER_KNOBS->TLOG_SPILL_REFERENCE_MAX_BATCHES_PER_PEEK + 1)); - - //TraceEvent("TLogPeekResults", self->dbgid).detail("ForAddress", req.reply.getEndpoint().getPrimaryAddress()).detail("Tag1Results", s1).detail("Tag2Results", s2).detail("Tag1ResultsLim", kv1.size()).detail("Tag2ResultsLim", kv2.size()).detail("Tag1ResultsLast", kv1.size() ? kv1[0].key : "").detail("Tag2ResultsLast", kv2.size() ? kv2[0].key : "").detail("Limited", limited).detail("NextEpoch", next_pos.epoch).detail("NextSeq", next_pos.sequence).detail("NowEpoch", self->epoch()).detail("NowSeq", self->sequence.getNextSequence()); - - state std::vector> commitLocations; - state bool earlyEnd = false; - uint32_t mutationBytes = 0; - state uint64_t commitBytes = 0; - state Version firstVersion = std::numeric_limits::max(); - for (int i = 0; i < kvrefs.size() && i < SERVER_KNOBS->TLOG_SPILL_REFERENCE_MAX_BATCHES_PER_PEEK; i++) { - auto& kv = kvrefs[i]; - VectorRef spilledData; - BinaryReader r(kv.value, AssumeVersion(logData->protocolVersion)); - r >> spilledData; - for (const SpilledData& sd : spilledData) { - if (mutationBytes >= SERVER_KNOBS->DESIRED_TOTAL_BYTES) { - earlyEnd = true; - break; - } - if (sd.version >= req.begin) { - firstVersion = std::min(firstVersion, sd.version); - const IDiskQueue::location end = sd.start.lo + sd.length; - commitLocations.emplace_back(sd.start, end); - // This isn't perfect, because we aren't accounting for page boundaries, but should be - // close enough. - commitBytes += sd.length; - mutationBytes += sd.mutationBytes; - } - } - if (earlyEnd) - break; - } - earlyEnd = earlyEnd || (kvrefs.size() >= SERVER_KNOBS->TLOG_SPILL_REFERENCE_MAX_BATCHES_PER_PEEK + 1); - wait(self->peekMemoryLimiter.take(TaskPriority::TLogSpilledPeekReply, commitBytes)); - state FlowLock::Releaser memoryReservation(self->peekMemoryLimiter, commitBytes); - state std::vector>> messageReads; - messageReads.reserve(commitLocations.size()); - for (const auto& pair : commitLocations) { - messageReads.push_back(self->rawPersistentQueue->read(pair.first, pair.second, CheckHashes::True)); - } - commitLocations.clear(); - wait(waitForAll(messageReads)); - - state Version lastRefMessageVersion = 0; - state int index = 0; - loop { - if (index >= messageReads.size()) - break; - Standalone queueEntryData = messageReads[index].get(); - uint8_t valid; - const uint32_t length = *(uint32_t*)queueEntryData.begin(); - queueEntryData = queueEntryData.substr(4, queueEntryData.size() - 4); - BinaryReader rd(queueEntryData, IncludeVersion()); - state TLogQueueEntry entry; - rd >> entry >> valid; - ASSERT(valid == 0x01); - ASSERT(length + sizeof(valid) == queueEntryData.size()); - - messages << VERSION_HEADER << entry.version; - - std::vector rawMessages = - wait(parseMessagesForTag(entry.messages, req.tag, logData->logRouterTags)); - for (const StringRef& msg : rawMessages) { - messages.serializeBytes(msg); - DEBUG_TAGS_AND_MESSAGE("TLogPeekFromDisk", entry.version, msg) - .detail("UID", self->dbgid) - .detail("LogId", logData->logId) - .detail("PeekTag", req.tag); - } - - lastRefMessageVersion = entry.version; - index++; - } - - messageReads.clear(); - memoryReservation.release(); - - if (earlyEnd) { - endVersion = lastRefMessageVersion + 1; - onlySpilled = true; - } else { - 
messages.serializeBytes(messages2.toValue()); - } - } - } else { - if (req.onlySpilled) { - endVersion = logData->persistentDataDurableVersion + 1; - } else { - peekMessagesFromMemory(logData, req.tag, req.begin, messages, endVersion); - } - - //TraceEvent("TLogPeekResults", self->dbgid).detail("ForAddress", req.reply.getEndpoint().getPrimaryAddress()).detail("MessageBytes", messages.getLength()).detail("NextEpoch", next_pos.epoch).detail("NextSeq", next_pos.sequence).detail("NowSeq", self->sequence.getNextSequence()); - } - - TLogPeekReply reply; - reply.maxKnownVersion = logData->version.get(); - reply.minKnownCommittedVersion = logData->minKnownCommittedVersion; - reply.messages = messages.toValue(); - reply.end = endVersion; - reply.onlySpilled = onlySpilled; - - //TraceEvent("TlogPeek", self->dbgid).detail("LogId", logData->logId).detail("Tag", req.tag.toString()). - // detail("BeginVer", req.begin).detail("EndVer", reply.end). - // detail("MsgBytes", reply.messages.expectedSize()). - // detail("ForAddress", req.reply.getEndpoint().getPrimaryAddress()); - - if (req.sequence.present()) { - auto& trackerData = logData->peekTracker[peekId]; - trackerData.lastUpdate = now(); - - double queueT = blockStart - queueStart; - double blockT = workStart - blockStart; - double workT = now() - workStart; - - trackerData.totalPeeks++; - trackerData.replyBytes += reply.messages.size(); - - if (queueT > trackerData.queueMax) - trackerData.queueMax = queueT; - if (blockT > trackerData.blockMax) - trackerData.blockMax = blockT; - if (workT > trackerData.workMax) - trackerData.workMax = workT; - - trackerData.queueTime += queueT; - trackerData.blockTime += blockT; - trackerData.workTime += workT; - - auto& sequenceData = trackerData.sequence_version[sequence + 1]; - if (trackerData.sequence_version.size() && sequence + 1 < trackerData.sequence_version.begin()->first) { - req.reply.sendError(operation_obsolete()); - if (!sequenceData.isSet()) { - // It would technically be more correct to .send({req.begin, req.onlySpilled}), as the next - // request might still be in the window of active requests, but LogSystemPeekCursor will - // throw away all future responses upon getting an operation_obsolete(), so computing a - // response will probably be a waste of CPU. 
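
The timing block above splits one peek into three measured phases. A small sketch of that bookkeeping in plain C++ (field names follow trackerData; the struct itself is illustrative):

    #include <algorithm>

    struct PeekPhaseStats {
        double queueTime = 0, blockTime = 0, workTime = 0;
        double queueMax = 0, blockMax = 0, workMax = 0;
        long totalPeeks = 0;

        // queueStart: request arrival; blockStart: dequeued and running;
        // workStart: data available; now: reply assembled.
        void record(double queueStart, double blockStart, double workStart, double now) {
            double queueT = blockStart - queueStart; // waiting behind earlier peeks
            double blockT = workStart - blockStart;  // blocked on versions/durability
            double workT  = now - workStart;         // building the reply
            queueTime += queueT; blockTime += blockT; workTime += workT;
            queueMax = std::max(queueMax, queueT);
            blockMax = std::max(blockMax, blockT);
            workMax  = std::max(workMax, workT);
            ++totalPeeks;
        }
    };
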
- sequenceData.sendError(operation_obsolete()); - } - return Void(); - } - if (sequenceData.isSet()) { - trackerData.duplicatePeeks++; - if (sequenceData.getFuture().get().first != reply.end) { - TEST(true); // tlog peek second attempt ended at a different version (2) - req.reply.sendError(operation_obsolete()); - return Void(); - } - } else { - sequenceData.send(std::make_pair(reply.end, reply.onlySpilled)); - } - reply.begin = req.begin; - } - - req.reply.send(reply); - return Void(); -} - - ACTOR Future doQueueCommit(TLogData* self, Reference logData, std::vector> missingFinalCommit) { @@ -2814,7 +2474,8 @@ ACTOR Future serveTLogInterface(TLogData* self, logData->addActor.send(tLogPeekStream(self, req, logData)); } when(TLogPeekRequest req = waitNext(tli.peekMessages.getFuture())) { - logData->addActor.send(tLogPeekMessages(self, req, logData)); + logData->addActor.send(tLogPeekMessages( + req.reply, self, logData, req.begin, req.tag, req.returnIfBlocked, req.onlySpilled, req.sequence)); } when(TLogPopRequest req = waitNext(tli.popMessages.getFuture())) { logData->addActor.send(tLogPop(self, req, logData)); diff --git a/flow/Knobs.cpp b/flow/Knobs.cpp index 25c6e32b80..7a8f1e24f7 100644 --- a/flow/Knobs.cpp +++ b/flow/Knobs.cpp @@ -184,7 +184,7 @@ void FlowKnobs::initialize(Randomize randomize, IsSimulated isSimulated) { init( FAST_NETWORK_LATENCY, 800e-6 ); init( SLOW_NETWORK_LATENCY, 100e-3 ); init( MAX_CLOGGING_LATENCY, 0 ); if( randomize && BUGGIFY ) MAX_CLOGGING_LATENCY = 0.1 * deterministicRandom()->random01(); - init( MAX_BUGGIFIED_DELAY, 0 ); // if( randomize && BUGGIFY ) MAX_BUGGIFIED_DELAY = 0.2 * deterministicRandom()->random01(); + init( MAX_BUGGIFIED_DELAY, 0 ); if( randomize && BUGGIFY ) MAX_BUGGIFIED_DELAY = 0.2 * deterministicRandom()->random01(); init( SIM_CONNECT_ERROR_MODE, deterministicRandom()->randomInt(0,3) ); //Tracefiles From ae2268f9f2bc1d0c89c1a65e54f336e1f000f114 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Sat, 31 Jul 2021 22:07:43 -0700 Subject: [PATCH 22/29] 200k simulation: check stream sequence; delay in GetMore loop --- fdbserver/LogSystemPeekCursor.actor.cpp | 12 ++++++++++-- fdbserver/OldTLogServer_4_6.actor.cpp | 1 + fdbserver/OldTLogServer_6_0.actor.cpp | 1 + fdbserver/OldTLogServer_6_2.actor.cpp | 1 + fdbserver/TLogServer.actor.cpp | 1 + 5 files changed, 14 insertions(+), 2 deletions(-) diff --git a/fdbserver/LogSystemPeekCursor.actor.cpp b/fdbserver/LogSystemPeekCursor.actor.cpp index c210f89f29..f719c7c83a 100644 --- a/fdbserver/LogSystemPeekCursor.actor.cpp +++ b/fdbserver/LogSystemPeekCursor.actor.cpp @@ -333,6 +333,7 @@ ACTOR Future serverPeekStreamGetMore(ILogSystem::ServerPeekCursor* self, T loop { try { + state Version expectedBegin = self->messageVersion.version; state Future fPeekReply = self->peekReplyStream.present() ? 
map(waitAndForward(self->peekReplyStream.get().getFuture()), [](const TLogPeekStreamReply& r) { return r.rep; }) @@ -350,7 +351,11 @@ ACTOR Future serverPeekStreamGetMore(ILogSystem::ServerPeekCursor* self, T self->interf->get().interf().peekStreamMessages.getEndpoint().getPrimaryAddress(), fPeekReply) : Never())) { + if (res.begin.get() != expectedBegin) { + throw operation_obsolete(); + } updateCursorWithReply(self, res); + expectedBegin = res.end; TraceEvent("SPC_GetMoreB", self->randomID) .detail("Has", self->hasMessage()) .detail("End", res.end) @@ -364,8 +369,11 @@ ACTOR Future serverPeekStreamGetMore(ILogSystem::ServerPeekCursor* self, T } } catch (Error& e) { TraceEvent(SevDebug, "SPC_GetMoreB_Error", self->randomID).detail("Error", e.what()); + + self->peekReplyStream.reset(); if (e.code() == error_code_connection_failed || e.code() == error_code_operation_obsolete) { - self->peekReplyStream.reset(); + // NOTE: delay in order to avoid the endless retry loop block other tasks + wait(delay(0)); } else if (e.code() == error_code_end_of_stream) { self->end.reset(self->messageVersion.version); return Void(); @@ -418,7 +426,7 @@ Future ILogSystem::ServerPeekCursor::getMore(TaskPriority taskID) { return Void(); if (!more.isValid() || more.isReady()) { // TODO: remove locality check when log router support streaming peek - if (false && usePeekStream && tag.locality >= 0) { + if (usePeekStream && tag.locality >= 0) { more = serverPeekStreamGetMore(this, taskID); } else if (parallelGetMore || onlySpilled || futureResults.size()) { more = serverPeekParallelGetMore(this, taskID); diff --git a/fdbserver/OldTLogServer_4_6.actor.cpp b/fdbserver/OldTLogServer_4_6.actor.cpp index 35d142b9f7..ea0f6ba22e 100644 --- a/fdbserver/OldTLogServer_4_6.actor.cpp +++ b/fdbserver/OldTLogServer_4_6.actor.cpp @@ -1157,6 +1157,7 @@ ACTOR Future tLogPeekStream(TLogData* self, TLogPeekStreamRequest req, Ref } reply.rep = future.get(); + reply.rep.begin = begin; req.reply.send(reply); begin = reply.rep.end; onlySpilled = reply.rep.onlySpilled; diff --git a/fdbserver/OldTLogServer_6_0.actor.cpp b/fdbserver/OldTLogServer_6_0.actor.cpp index 5c27581b2c..fa08f0e3f3 100644 --- a/fdbserver/OldTLogServer_6_0.actor.cpp +++ b/fdbserver/OldTLogServer_6_0.actor.cpp @@ -1469,6 +1469,7 @@ ACTOR Future tLogPeekStream(TLogData* self, TLogPeekStreamRequest req, Ref } reply.rep = future.get(); + reply.rep.begin = begin; req.reply.send(reply); begin = reply.rep.end; onlySpilled = reply.rep.onlySpilled; diff --git a/fdbserver/OldTLogServer_6_2.actor.cpp b/fdbserver/OldTLogServer_6_2.actor.cpp index 81025d93de..181960cec9 100644 --- a/fdbserver/OldTLogServer_6_2.actor.cpp +++ b/fdbserver/OldTLogServer_6_2.actor.cpp @@ -1898,6 +1898,7 @@ ACTOR Future tLogPeekStream(TLogData* self, TLogPeekStreamRequest req, Ref } reply.rep = future.get(); + reply.rep.begin = begin; req.reply.send(reply); begin = reply.rep.end; onlySpilled = reply.rep.onlySpilled; diff --git a/fdbserver/TLogServer.actor.cpp b/fdbserver/TLogServer.actor.cpp index aa2afd5406..746064ce91 100644 --- a/fdbserver/TLogServer.actor.cpp +++ b/fdbserver/TLogServer.actor.cpp @@ -1951,6 +1951,7 @@ ACTOR Future tLogPeekStream(TLogData* self, TLogPeekStreamRequest req, Ref } reply.rep = future.get(); + reply.rep.begin = begin; req.reply.send(reply); begin = reply.rep.end; onlySpilled = reply.rep.onlySpilled; From fd74a16f35e2e3a9fe21f4917e165bdbaecdcc7f Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Mon, 2 Aug 2021 14:24:20 -0700 Subject: [PATCH 23/29] format code --- 
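
The expectedBegin check added by patch 22 above enforces that streamed replies form a contiguous chain: each reply must begin exactly where the previous one ended, otherwise the stream is stale and the cursor throws operation_obsolete so the caller tears it down and reconnects. A compact plain-C++ model of that invariant (Reply stands in for the rep field of TLogPeekStreamReply):

    #include <stdexcept>

    using Version = long long;
    struct Reply { Version begin, end; };

    struct StreamContinuity {
        Version expectedBegin; // starts at the cursor's current message version

        void onReply(const Reply& r) {
            if (r.begin != expectedBegin)
                throw std::runtime_error("operation_obsolete: gap in peek stream");
            expectedBegin = r.end; // the next reply must start here
        }
    };

This is also why the server-side hunks in this patch set reply.rep.begin = begin before sending: that field exists so the client can run this check.
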
fdbclient/ServerKnobs.cpp | 4 ++-- fdbclient/ServerKnobs.h | 4 ++-- fdbserver/LogRouter.actor.cpp | 9 +++++---- fdbserver/LogSystemPeekCursor.actor.cpp | 7 ++++--- fdbserver/TLogInterface.h | 4 ++-- flow/error_definitions.h | 1 - 6 files changed, 15 insertions(+), 14 deletions(-) diff --git a/fdbclient/ServerKnobs.cpp b/fdbclient/ServerKnobs.cpp index 36d599ba03..290452012a 100644 --- a/fdbclient/ServerKnobs.cpp +++ b/fdbclient/ServerKnobs.cpp @@ -64,8 +64,8 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( TLOG_MESSAGE_BLOCK_BYTES, 10e6 ); init( TLOG_MESSAGE_BLOCK_OVERHEAD_FACTOR, double(TLOG_MESSAGE_BLOCK_BYTES) / (TLOG_MESSAGE_BLOCK_BYTES - MAX_MESSAGE_SIZE) ); //1.0121466709838096006362758832473 init( PEEK_TRACKER_EXPIRATION_TIME, 600 ); if( randomize && BUGGIFY ) PEEK_TRACKER_EXPIRATION_TIME = deterministicRandom()->coinflip() ? 0.1 : 120; - init( PEEK_USEING_STREAMING, true ); - init( PARALLEL_GET_MORE_REQUESTS, 32 ); if( randomize && BUGGIFY ) PARALLEL_GET_MORE_REQUESTS = 2; + init( PEEK_USEING_STREAMING, true ); + init( PARALLEL_GET_MORE_REQUESTS, 32 ); if( randomize && BUGGIFY ) PARALLEL_GET_MORE_REQUESTS = 2; init( MULTI_CURSOR_PRE_FETCH_LIMIT, 10 ); init( MAX_QUEUE_COMMIT_BYTES, 15e6 ); if( randomize && BUGGIFY ) MAX_QUEUE_COMMIT_BYTES = 5000; init( DESIRED_OUTSTANDING_MESSAGES, 5000 ); if( randomize && BUGGIFY ) DESIRED_OUTSTANDING_MESSAGES = deterministicRandom()->randomInt(0,100); diff --git a/fdbclient/ServerKnobs.h b/fdbclient/ServerKnobs.h index ea7ca9ca3a..23b3049668 100644 --- a/fdbclient/ServerKnobs.h +++ b/fdbclient/ServerKnobs.h @@ -41,8 +41,8 @@ public: // often, so that versions always advance smoothly // TLogs - bool PEEK_USEING_STREAMING; - double TLOG_TIMEOUT; // tlog OR commit proxy failure - master's reaction time + bool PEEK_USEING_STREAMING; + double TLOG_TIMEOUT; // tlog OR commit proxy failure - master's reaction time double TLOG_SLOW_REJOIN_WARN_TIMEOUT_SECS; // Warns if a tlog takes too long to rejoin double RECOVERY_TLOG_SMART_QUORUM_DELAY; // smaller might be better for bug amplification double TLOG_STORAGE_MIN_UPDATE_INTERVAL; diff --git a/fdbserver/LogRouter.actor.cpp b/fdbserver/LogRouter.actor.cpp index 06806e649b..aae2edb05b 100644 --- a/fdbserver/LogRouter.actor.cpp +++ b/fdbserver/LogRouter.actor.cpp @@ -441,10 +441,11 @@ Version poppedVersion(LogRouterData* self, Tag tag) { return tagData->popped; } +// TODO: enable streaming peek log from log router // This actor keep pushing TLogPeekStreamReply until it's removed from the cluster or should recover -ACTOR Future logRouterPeekStream(LogRouterData* self, TLogPeekStreamRequest req) { - return Void(); -} +// ACTOR Future logRouterPeekStream(LogRouterData* self, TLogPeekStreamRequest req) { +// return Void(); +// } ACTOR Future logRouterPeekMessages(LogRouterData* self, TLogPeekRequest req) { state BinaryWriter messages(Unversioned()); @@ -652,7 +653,7 @@ ACTOR Future logRouterCore(TLogInterface interf, } when(TLogPeekStreamRequest req = waitNext(interf.peekStreamMessages.getFuture())) { // addActor.send(logRouterPeekStream(&logRouterData, req)); - // FIXME: temporarily disable streaming peek from LogRouter + // FIXME: currently LogRouter doesn't support streaming peek request TraceEvent(SevError, "LogRouterPeekStream", logRouterData.dbgid) .detail("Token", interf.peekStreamMessages.getEndpoint().token); req.reply.sendError(operation_failed()); diff --git a/fdbserver/LogSystemPeekCursor.actor.cpp b/fdbserver/LogSystemPeekCursor.actor.cpp index 
88faa618d8..953f5debdd 100644 --- a/fdbserver/LogSystemPeekCursor.actor.cpp +++ b/fdbserver/LogSystemPeekCursor.actor.cpp @@ -52,7 +52,8 @@ ILogSystem::ServerPeekCursor::ServerPeekCursor(ReferencerandomUniqueID()), returnIfBlocked(returnIfBlocked), onlySpilled(false), parallelGetMore(parallelGetMore), sequence(0), lastReset(0), - resetCheck(Void()), slowReplies(0), fastReplies(0), unknownReplies(0), usePeekStream(SERVER_KNOBS->PEEK_USEING_STREAMING) { + resetCheck(Void()), slowReplies(0), fastReplies(0), unknownReplies(0), + usePeekStream(SERVER_KNOBS->PEEK_USEING_STREAMING) { this->results.maxKnownVersion = 0; this->results.minKnownCommittedVersion = 0; TraceEvent("SPC_Starting", randomID) @@ -361,8 +362,8 @@ ACTOR Future serverPeekStreamGetMore(ILogSystem::ServerPeekCursor* self, T .detail("End", res.end) .detail("Popped", res.popped.present() ? res.popped.get() : 0); - // NOTE: delay is needed here since TLog need to be scheduled to response if there are TLog and SS - // on the same machine + // NOTE: delay is necessary here since ReplyPromiseStream delivers reply on high priority. Here we + // change the priority to the intended one. wait(delay(0, taskID)); return Void(); } diff --git a/fdbserver/TLogInterface.h b/fdbserver/TLogInterface.h index 9f147189fe..322c7ddb35 100644 --- a/fdbserver/TLogInterface.h +++ b/fdbserver/TLogInterface.h @@ -83,8 +83,8 @@ struct TLogInterface { streams.push_back(disablePopRequest.getReceiver()); streams.push_back(enablePopRequest.getReceiver()); streams.push_back(snapRequest.getReceiver()); - streams.push_back(peekStreamMessages.getReceiver(TaskPriority::TLogPeek)); - FlowTransport::transport().addEndpoints(streams); + streams.push_back(peekStreamMessages.getReceiver(TaskPriority::TLogPeek)); + FlowTransport::transport().addEndpoints(streams); } template diff --git a/flow/error_definitions.h b/flow/error_definitions.h index bfe57bfd06..b69801cfd7 100755 --- a/flow/error_definitions.h +++ b/flow/error_definitions.h @@ -33,7 +33,6 @@ // clang-format off ERROR( success, 0, "Success" ) ERROR( end_of_stream, 1, "End of stream" ) -ERROR( no_action_needed, 2, "No action needed" ) ERROR( operation_failed, 1000, "Operation failed") ERROR( wrong_shard_server, 1001, "Shard is not available from this server") ERROR( operation_obsolete, 1002, "Operation result no longer necessary") From 9986d2b0b6acfb4557fc71b33d87cf65ff78a87e Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Mon, 2 Aug 2021 22:33:17 -0700 Subject: [PATCH 24/29] change log severity --- fdbserver/LogSystemPeekCursor.actor.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/fdbserver/LogSystemPeekCursor.actor.cpp b/fdbserver/LogSystemPeekCursor.actor.cpp index 77ee0296f4..36f62cd6a5 100644 --- a/fdbserver/LogSystemPeekCursor.actor.cpp +++ b/fdbserver/LogSystemPeekCursor.actor.cpp @@ -56,11 +56,10 @@ ILogSystem::ServerPeekCursor::ServerPeekCursor(Referenceresults.maxKnownVersion = 0; this->results.minKnownCommittedVersion = 0; - TraceEvent("SPC_Starting", randomID) + TraceEvent(SevDebug, "SPC_Starting", randomID) .detail("Tag", tag.toString()) .detail("Begin", begin) - .detail("End", end) - .backtrace(); + .detail("End", end); } ILogSystem::ServerPeekCursor::ServerPeekCursor(TLogPeekReply const& results, @@ -357,7 +356,7 @@ ACTOR Future serverPeekStreamGetMore(ILogSystem::ServerPeekCursor* self, T } updateCursorWithReply(self, res); expectedBegin = res.end; - TraceEvent("SPC_GetMoreB", self->randomID) + TraceEvent(SevDebug, "SPC_GetMoreB", self->randomID) .detail("Has", 
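
To make the scheduling NOTE above concrete: ReplyPromiseStream delivers its replies at high priority, so without an explicit yield the rest of getMore would keep running at that priority too. The fragment below is a condensed, annotated reading of that code path in serverPeekStreamGetMore (flow-style, assuming the surrounding actor context):

    // Reply was delivered at ReplyPromiseStream's high delivery priority.
    updateCursorWithReply(self, res);
    // Yield once at the priority the caller actually requested, so that e.g. a
    // storage server colocated with the TLog is not starved by peek processing.
    wait(delay(0, taskID));
    return Void();
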
self->hasMessage()) .detail("End", res.end) .detail("Popped", res.popped.present() ? res.popped.get() : 0); From 80a5120df8e0bb80f6a8b56c4cdcf4a568af68e8 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Thu, 5 Aug 2021 19:51:17 -0700 Subject: [PATCH 25/29] support LogRouter peek from TLog --- fdbserver/LogSystemPeekCursor.actor.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fdbserver/LogSystemPeekCursor.actor.cpp b/fdbserver/LogSystemPeekCursor.actor.cpp index 36f62cd6a5..935da90b8b 100644 --- a/fdbserver/LogSystemPeekCursor.actor.cpp +++ b/fdbserver/LogSystemPeekCursor.actor.cpp @@ -368,13 +368,13 @@ ACTOR Future serverPeekStreamGetMore(ILogSystem::ServerPeekCursor* self, T } } } catch (Error& e) { - TraceEvent(SevDebug, "SPC_GetMoreB_Error", self->randomID).detail("Error", e.what()); - - self->peekReplyStream.reset(); + TraceEvent(SevDebug, "SPC_GetMoreB_Error").error(e, true); if (e.code() == error_code_connection_failed || e.code() == error_code_operation_obsolete) { // NOTE: delay in order to avoid the endless retry loop block other tasks + self->peekReplyStream.reset(); wait(delay(0)); } else if (e.code() == error_code_end_of_stream) { + self->peekReplyStream.reset(); self->end.reset(self->messageVersion.version); return Void(); } else { @@ -425,8 +425,8 @@ Future ILogSystem::ServerPeekCursor::getMore(TaskPriority taskID) { if (hasMessage() && !parallelGetMore) return Void(); if (!more.isValid() || more.isReady()) { - // TODO: remove locality check when log router support streaming peek - if (usePeekStream && tag.locality >= 0) { + // TODO: add tagLocalityRemoteLog when log router support streaming peek + if (usePeekStream && (tag.locality >= 0 || tag.locality == tagLocalityLogRouter)) { more = serverPeekStreamGetMore(this, taskID); } else if (parallelGetMore || onlySpilled || futureResults.size()) { more = serverPeekParallelGetMore(this, taskID); From 2263626cdcc59a5e984ff48f62e54cb5b7630d4e Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Sat, 7 Aug 2021 09:53:22 -0700 Subject: [PATCH 26/29] 200k test clean: enable remote Log pull from LogRouter --- fdbserver/LogRouter.actor.cpp | 125 ++++++++++++++++-------- fdbserver/LogSystemPeekCursor.actor.cpp | 2 +- fdbserver/OldTLogServer_4_6.actor.cpp | 35 +++---- fdbserver/OldTLogServer_6_0.actor.cpp | 37 +++---- fdbserver/OldTLogServer_6_2.actor.cpp | 11 +-- fdbserver/TLogServer.actor.cpp | 7 +- 6 files changed, 119 insertions(+), 98 deletions(-) diff --git a/fdbserver/LogRouter.actor.cpp b/fdbserver/LogRouter.actor.cpp index aae2edb05b..f26ea539af 100644 --- a/fdbserver/LogRouter.actor.cpp +++ b/fdbserver/LogRouter.actor.cpp @@ -441,33 +441,35 @@ Version poppedVersion(LogRouterData* self, Tag tag) { return tagData->popped; } -// TODO: enable streaming peek log from log router -// This actor keep pushing TLogPeekStreamReply until it's removed from the cluster or should recover -// ACTOR Future logRouterPeekStream(LogRouterData* self, TLogPeekStreamRequest req) { -// return Void(); -// } - -ACTOR Future logRouterPeekMessages(LogRouterData* self, TLogPeekRequest req) { +// Common logics to peek TLog and create TLogPeekReply that serves both streaming peek or normal peek request +ACTOR template +Future logRouterPeekMessages(PromiseType replyPromise, + LogRouterData* self, + Version reqBegin, + Tag reqTag, + bool reqReturnIfBlocked = false, + bool reqOnlySpilled = false, + Optional> reqSequence = Optional>()) { state BinaryWriter messages(Unversioned()); state int sequence = -1; state UID peekId; - if 
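
Patch 25 above settles which errors reset the peek stream and which end the cursor. A condensed decision table of that catch block in plain C++ (the enum values are illustrative stand-ins for FDB's connection_failed, operation_obsolete, and end_of_stream codes):

    enum class PeekError { ConnectionFailed, OperationObsolete, EndOfStream, Other };

    // Returns true when getMore() is finished; false means retry with a new stream.
    bool handleStreamError(PeekError e, bool& streamOpen, bool& cursorDone) {
        switch (e) {
        case PeekError::ConnectionFailed:
        case PeekError::OperationObsolete:
            streamOpen = false; // drop the stale stream; re-establish on retry
            return false;       // caller yields first (delay(0)) to avoid a hot loop
        case PeekError::EndOfStream:
            streamOpen = false;
            cursorDone = true;  // cursor's end is pinned to the current version
            return true;
        default:
            throw e;            // anything else propagates
        }
    }
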
(req.sequence.present()) { + if (reqSequence.present()) { try { - peekId = req.sequence.get().first; - sequence = req.sequence.get().second; + peekId = reqSequence.get().first; + sequence = reqSequence.get().second; if (sequence >= SERVER_KNOBS->PARALLEL_GET_MORE_REQUESTS && - self->peekTracker.find(peekId) == self->peekTracker.end()) { + self->peekTracker.find(peekId) == self->peekTracker.end()) { throw operation_obsolete(); } auto& trackerData = self->peekTracker[peekId]; if (sequence == 0 && trackerData.sequence_version.find(0) == trackerData.sequence_version.end()) { - trackerData.sequence_version[0].send(std::make_pair(req.begin, req.onlySpilled)); + trackerData.sequence_version[0].send(std::make_pair(reqBegin, reqOnlySpilled)); } auto seqBegin = trackerData.sequence_version.begin(); // The peek cursor and this comparison need to agree about the maximum number of in-flight requests. while (trackerData.sequence_version.size() && - seqBegin->first <= sequence - SERVER_KNOBS->PARALLEL_GET_MORE_REQUESTS) { + seqBegin->first <= sequence - SERVER_KNOBS->PARALLEL_GET_MORE_REQUESTS) { if (seqBegin->second.canBeSet()) { seqBegin->second.sendError(operation_obsolete()); } @@ -481,12 +483,12 @@ ACTOR Future logRouterPeekMessages(LogRouterData* self, TLogPeekRequest re trackerData.lastUpdate = now(); std::pair prevPeekData = wait(trackerData.sequence_version[sequence].getFuture()); - req.begin = prevPeekData.first; - req.onlySpilled = prevPeekData.second; + reqBegin = prevPeekData.first; + reqOnlySpilled = prevPeekData.second; wait(yield()); } catch (Error& e) { if (e.code() == error_code_timed_out || e.code() == error_code_operation_obsolete) { - req.reply.sendError(e); + replyPromise.sendError(e); return Void(); } else { throw; @@ -494,62 +496,62 @@ ACTOR Future logRouterPeekMessages(LogRouterData* self, TLogPeekRequest re } } - //TraceEvent("LogRouterPeek1", self->dbgid).detail("From", req.reply.getEndpoint().getPrimaryAddress()).detail("Ver", self->version.get()).detail("Begin", req.begin); - if (req.returnIfBlocked && self->version.get() < req.begin) { + //TraceEvent("LogRouterPeek1", self->dbgid).detail("From", replyPromise.getEndpoint().getPrimaryAddress()).detail("Ver", self->version.get()).detail("Begin", reqBegin); + if (reqReturnIfBlocked && self->version.get() < reqBegin) { //TraceEvent("LogRouterPeek2", self->dbgid); - req.reply.sendError(end_of_stream()); - if (req.sequence.present()) { + replyPromise.sendError(end_of_stream()); + if (reqSequence.present()) { auto& trackerData = self->peekTracker[peekId]; auto& sequenceData = trackerData.sequence_version[sequence + 1]; if (!sequenceData.isSet()) { - sequenceData.send(std::make_pair(req.begin, req.onlySpilled)); + sequenceData.send(std::make_pair(reqBegin, reqOnlySpilled)); } } return Void(); } - if (self->version.get() < req.begin) { - wait(self->version.whenAtLeast(req.begin)); + if (self->version.get() < reqBegin) { + wait(self->version.whenAtLeast(reqBegin)); wait(delay(SERVER_KNOBS->TLOG_PEEK_DELAY, g_network->getCurrentTask())); } - Version poppedVer = poppedVersion(self, req.tag); + Version poppedVer = poppedVersion(self, reqTag); - if (poppedVer > req.begin || req.begin < self->startVersion) { + if (poppedVer > reqBegin || reqBegin < self->startVersion) { // This should only happen if a packet is sent multiple times and the reply is not needed. // Since we are using popped differently, do not send a reply. 
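
Worth spelling out from the sequence handling above: parallel peeks are pipelined, so request N does not carry an authoritative begin version; it waits on sequence N-1's promise to learn where to start and whether the data was only spilled. A synchronous plain-C++ model of that chaining (the map stands in for trackerData.sequence_version; in the actor this lookup is a wait on a future):

    #include <map>
    #include <utility>

    using Version = long long;

    struct SequenceChain {
        std::map<int, std::pair<Version, bool>> finished; // seq -> (end, onlySpilled)

        std::pair<Version, bool> beginFor(int seq, Version firstBegin) {
            if (seq == 0)
                return { firstBegin, false }; // sequence 0 seeds the chain
            return finished.at(seq - 1);      // real code blocks until N-1 is done
        }
        void complete(int seq, Version end, bool onlySpilled) {
            finished[seq] = { end, onlySpilled };
        }
    };
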
TraceEvent(SevWarnAlways, "LogRouterPeekPopped", self->dbgid) - .detail("Begin", req.begin) - .detail("Popped", poppedVer) - .detail("Start", self->startVersion); - req.reply.send(Never()); - if (req.sequence.present()) { + .detail("Begin", reqBegin) + .detail("Popped", poppedVer) + .detail("Start", self->startVersion); + replyPromise.send(Never()); + if (reqSequence.present()) { auto& trackerData = self->peekTracker[peekId]; auto& sequenceData = trackerData.sequence_version[sequence + 1]; if (!sequenceData.isSet()) { - sequenceData.send(std::make_pair(req.begin, req.onlySpilled)); + sequenceData.send(std::make_pair(reqBegin, reqOnlySpilled)); } } return Void(); } Version endVersion = self->version.get() + 1; - peekMessagesFromMemory(self, req.tag, req.begin, messages, endVersion); + peekMessagesFromMemory(self, reqTag, reqBegin, messages, endVersion); TLogPeekReply reply; reply.maxKnownVersion = self->version.get(); reply.minKnownCommittedVersion = self->poppedVersion; - reply.messages = messages.toValue(); + reply.messages = StringRef(reply.arena, messages.toValue()); reply.popped = self->minPopped.get() >= self->startVersion ? self->minPopped.get() : 0; reply.end = endVersion; reply.onlySpilled = false; - if (req.sequence.present()) { + if (reqSequence.present()) { auto& trackerData = self->peekTracker[peekId]; trackerData.lastUpdate = now(); auto& sequenceData = trackerData.sequence_version[sequence + 1]; if (trackerData.sequence_version.size() && sequence + 1 < trackerData.sequence_version.begin()->first) { - req.reply.sendError(operation_obsolete()); + replyPromise.sendError(operation_obsolete()); if (!sequenceData.isSet()) sequenceData.sendError(operation_obsolete()); return Void(); @@ -557,20 +559,58 @@ ACTOR Future logRouterPeekMessages(LogRouterData* self, TLogPeekRequest re if (sequenceData.isSet()) { if (sequenceData.getFuture().get().first != reply.end) { TEST(true); // tlog peek second attempt ended at a different version - req.reply.sendError(operation_obsolete()); + replyPromise.sendError(operation_obsolete()); return Void(); } } else { sequenceData.send(std::make_pair(reply.end, reply.onlySpilled)); } - reply.begin = req.begin; + reply.begin = reqBegin; } - req.reply.send(reply); + replyPromise.send(reply); //TraceEvent("LogRouterPeek4", self->dbgid); return Void(); } +// TODO: enable streaming peek log from log router +// This actor keep pushing TLogPeekStreamReply until it's removed from the cluster or should recover +ACTOR Future logRouterPeekStream(LogRouterData* self, TLogPeekStreamRequest req) { + self->activePeekStreams++; + + state Version begin = req.begin; + state bool onlySpilled = false; + req.reply.setByteLimit(std::min(SERVER_KNOBS->MAXIMUM_PEEK_BYTES, req.limitBytes)); + loop { + state TLogPeekStreamReply reply; + state Promise promise; + state Future future(promise.getFuture()); + try { + wait(req.reply.onReady() && store(reply.rep, future) && + logRouterPeekMessages(promise, self, begin, req.tag, req.returnIfBlocked, onlySpilled)); + + reply.rep.begin = begin; + req.reply.send(reply); + begin = reply.rep.end; + onlySpilled = reply.rep.onlySpilled; + if (reply.rep.end > self->version.get()) { + wait(delay(SERVER_KNOBS->TLOG_PEEK_DELAY, g_network->getCurrentTask())); + } else { + wait(delay(0, g_network->getCurrentTask())); + } + } catch (Error& e) { + self->activePeekStreams--; + TraceEvent(SevDebug, "TLogPeekStreamEnd", self->dbgid).error(e, true); + + if (e.code() == error_code_end_of_stream || e.code() == error_code_operation_obsolete) { + 
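
The setByteLimit / onReady pair in logRouterPeekStream above is credit-based flow control: the server keeps at most a window of unacknowledged reply bytes in flight, and onReady suspends the loop until the client's acknowledgement (the acknowledgeToken carried in ReplyPromiseStreamReply) frees credit. A rough model of the window accounting in plain C++ (this is an assumption about ReplyPromiseStream's internals, sketched from the fields visible in this series):

    struct CreditWindow {
        long long limit;        // min(MAXIMUM_PEEK_BYTES, req.limitBytes)
        long long inFlight = 0; // sent but not yet acknowledged

        explicit CreditWindow(long long l) : limit(l) {}
        bool ready() const { return inFlight < limit; }  // the onReady() condition
        void sent(long long bytes) { inFlight += bytes; }
        void acked(long long bytes) { inFlight -= bytes; }
    };
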
req.reply.sendError(e); + return Void(); + } else { + throw; + } + } + } +} ACTOR Future cleanupPeekTrackers(LogRouterData* self) { loop { @@ -649,14 +689,15 @@ ACTOR Future logRouterCore(TLogInterface interf, logRouterData.logSystem->set(ILogSystem::fromServerDBInfo(logRouterData.dbgid, db->get(), true)); } when(TLogPeekRequest req = waitNext(interf.peekMessages.getFuture())) { - addActor.send(logRouterPeekMessages(&logRouterData, req)); + addActor.send(logRouterPeekMessages( + req.reply, &logRouterData, req.begin, req.tag, req.returnIfBlocked, req.onlySpilled, req.sequence)); } when(TLogPeekStreamRequest req = waitNext(interf.peekStreamMessages.getFuture())) { - // addActor.send(logRouterPeekStream(&logRouterData, req)); // FIXME: currently LogRouter doesn't support streaming peek request - TraceEvent(SevError, "LogRouterPeekStream", logRouterData.dbgid) + TraceEvent(SevDebug, "LogRouterPeekStream", logRouterData.dbgid) .detail("Token", interf.peekStreamMessages.getEndpoint().token); - req.reply.sendError(operation_failed()); + addActor.send(logRouterPeekStream(&logRouterData, req)); + // req.reply.sendError(operation_failed()); } when(TLogPopRequest req = waitNext(interf.popMessages.getFuture())) { // Request from remote tLog to pop data from LR diff --git a/fdbserver/LogSystemPeekCursor.actor.cpp b/fdbserver/LogSystemPeekCursor.actor.cpp index 935da90b8b..26cb4b6eec 100644 --- a/fdbserver/LogSystemPeekCursor.actor.cpp +++ b/fdbserver/LogSystemPeekCursor.actor.cpp @@ -426,7 +426,7 @@ Future ILogSystem::ServerPeekCursor::getMore(TaskPriority taskID) { return Void(); if (!more.isValid() || more.isReady()) { // TODO: add tagLocalityRemoteLog when log router support streaming peek - if (usePeekStream && (tag.locality >= 0 || tag.locality == tagLocalityLogRouter)) { + if (usePeekStream && (tag.locality >= 0 || tag.locality == tagLocalityLogRouter || tag.locality == tagLocalityRemoteLog)) { more = serverPeekStreamGetMore(this, taskID); } else if (parallelGetMore || onlySpilled || futureResults.size()) { more = serverPeekParallelGetMore(this, taskID); diff --git a/fdbserver/OldTLogServer_4_6.actor.cpp b/fdbserver/OldTLogServer_4_6.actor.cpp index bf0bf1639a..f50744cc1a 100644 --- a/fdbserver/OldTLogServer_4_6.actor.cpp +++ b/fdbserver/OldTLogServer_4_6.actor.cpp @@ -976,13 +976,13 @@ void peekMessagesFromMemory(Reference self, // Common logics to peek TLog and create TLogPeekReply that serves both streaming peek or normal peek request ACTOR template Future tLogPeekMessages(PromiseType replyPromise, - TLogData* self, - Reference logData, - Version reqBegin, - Tag reqTag, - bool reqReturnIfBlocked = false, - bool reqOnlySpilled = false, - Optional> reqSequence = Optional>()) { + TLogData* self, + Reference logData, + Version reqBegin, + Tag reqTag, + bool reqReturnIfBlocked = false, + bool reqOnlySpilled = false, + Optional> reqSequence = Optional>()) { state BinaryWriter messages(Unversioned()); state BinaryWriter messages2(Unversioned()); state int sequence = -1; @@ -994,7 +994,7 @@ Future tLogPeekMessages(PromiseType replyPromise, peekId = reqSequence.get().first; sequence = reqSequence.get().second; if (sequence >= SERVER_KNOBS->PARALLEL_GET_MORE_REQUESTS && - self->peekTracker.find(peekId) == self->peekTracker.end()) { + self->peekTracker.find(peekId) == self->peekTracker.end()) { throw operation_obsolete(); } if (sequence > 0) { @@ -1075,10 +1075,10 @@ Future tLogPeekMessages(PromiseType replyPromise, peekMessagesFromMemory(logData, reqTag, reqBegin, messages2, endVersion); RangeResult kvs 
= wait(self->persistentData->readRange( - KeyRangeRef(persistTagMessagesKey(logData->logId, oldTag, reqBegin), - persistTagMessagesKey(logData->logId, oldTag, logData->persistentDataDurableVersion + 1)), - SERVER_KNOBS->DESIRED_TOTAL_BYTES, - SERVER_KNOBS->DESIRED_TOTAL_BYTES)); + KeyRangeRef(persistTagMessagesKey(logData->logId, oldTag, reqBegin), + persistTagMessagesKey(logData->logId, oldTag, logData->persistentDataDurableVersion + 1)), + SERVER_KNOBS->DESIRED_TOTAL_BYTES, + SERVER_KNOBS->DESIRED_TOTAL_BYTES)); //TraceEvent("TLogPeekResults", self->dbgid).detail("ForAddress", replyPromise.getEndpoint().getPrimaryAddress()).detail("Tag1Results", s1).detail("Tag2Results", s2).detail("Tag1ResultsLim", kv1.size()).detail("Tag2ResultsLim", kv2.size()).detail("Tag1ResultsLast", kv1.size() ? kv1[0].key : "").detail("Tag2ResultsLast", kv2.size() ? kv2[0].key : "").detail("Limited", limited).detail("NextEpoch", next_pos.epoch).detail("NextSeq", next_pos.sequence).detail("NowEpoch", self->epoch()).detail("NowSeq", self->sequence.getNextSequence()); @@ -1148,14 +1148,9 @@ ACTOR Future tLogPeekStream(TLogData* self, TLogPeekStreamRequest req, Ref state Promise promise; state Future future(promise.getFuture()); try { - wait(req.reply.onReady() && - tLogPeekMessages(promise, self, logData, begin, req.tag, req.returnIfBlocked, onlySpilled)); - ASSERT(future.isReady()); - if (future.isError()) { - throw future.getError(); - } + wait(req.reply.onReady() && store(reply.rep, future) && + tLogPeekMessages(promise, self, logData, begin, req.tag, req.returnIfBlocked, onlySpilled)); - reply.rep = future.get(); reply.rep.begin = begin; req.reply.send(reply); begin = reply.rep.end; @@ -1344,7 +1339,7 @@ ACTOR Future serveTLogInterface(TLogData* self, loop choose { when(TLogPeekRequest req = waitNext(tli.peekMessages.getFuture())) { logData->addActor.send(tLogPeekMessages( - req.reply, self, logData, req.begin, req.tag, req.returnIfBlocked, req.onlySpilled, req.sequence)); + req.reply, self, logData, req.begin, req.tag, req.returnIfBlocked, req.onlySpilled, req.sequence)); } when(TLogPeekStreamRequest req = waitNext(tli.peekStreamMessages.getFuture())) { TraceEvent(SevDebug, "TLogPeekStream", logData->logId) diff --git a/fdbserver/OldTLogServer_6_0.actor.cpp b/fdbserver/OldTLogServer_6_0.actor.cpp index d6d2daf7c0..5d77b4e12f 100644 --- a/fdbserver/OldTLogServer_6_0.actor.cpp +++ b/fdbserver/OldTLogServer_6_0.actor.cpp @@ -1209,13 +1209,13 @@ void peekMessagesFromMemory(Reference self, // Common logics to peek TLog and create TLogPeekReply that serves both streaming peek or normal peek request ACTOR template Future tLogPeekMessages(PromiseType replyPromise, - TLogData* self, - Reference logData, - Version reqBegin, - Tag reqTag, - bool reqReturnIfBlocked = false, - bool reqOnlySpilled = false, - Optional> reqSequence = Optional>()) { + TLogData* self, + Reference logData, + Version reqBegin, + Tag reqTag, + bool reqReturnIfBlocked = false, + bool reqOnlySpilled = false, + Optional> reqSequence = Optional>()) { state BinaryWriter messages(Unversioned()); state BinaryWriter messages2(Unversioned()); state int sequence = -1; @@ -1231,7 +1231,7 @@ Future tLogPeekMessages(PromiseType replyPromise, peekId = reqSequence.get().first; sequence = reqSequence.get().second; if (sequence >= SERVER_KNOBS->PARALLEL_GET_MORE_REQUESTS && - logData->peekTracker.find(peekId) == logData->peekTracker.end()) { + logData->peekTracker.find(peekId) == logData->peekTracker.end()) { throw operation_obsolete(); } auto& trackerData = 
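
The store(reply.rep, future) rewrite above is the payoff of templating tLogPeekMessages on its reply sink: one actor body now serves both the classic request path and the streaming path. A condensed sketch of the two call shapes, abbreviated from the hunks in this patch:

    // One body, two kinds of reply sink (ReplyPromise<T> or Promise<T>):
    ACTOR template <class PromiseType>
    Future<Void> tLogPeekMessages(PromiseType replyPromise, /* ...peek args... */);

    // Request path: the reply promise goes straight back over the network.
    logData->addActor.send(tLogPeekMessages(
        req.reply, self, logData, req.begin, req.tag,
        req.returnIfBlocked, req.onlySpilled, req.sequence));

    // Streaming path: capture the reply locally, then wrap it in a stream reply.
    state Promise<TLogPeekReply> promise;
    state Future<TLogPeekReply> future(promise.getFuture());
    wait(req.reply.onReady()            // client has acknowledged enough bytes
         && store(reply.rep, future)    // stash the computed TLogPeekReply
         && tLogPeekMessages(promise, self, logData, begin, req.tag,
                             req.returnIfBlocked, onlySpilled));

Compared with the earlier version, store() also removes the manual ASSERT(future.isReady()) / future.getError() dance, since waiting on store completes, or throws, exactly when the future does.
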
logData->peekTracker[peekId]; @@ -1241,7 +1241,7 @@ Future tLogPeekMessages(PromiseType replyPromise, } auto seqBegin = trackerData.sequence_version.begin(); while (trackerData.sequence_version.size() && - seqBegin->first <= sequence - SERVER_KNOBS->PARALLEL_GET_MORE_REQUESTS) { + seqBegin->first <= sequence - SERVER_KNOBS->PARALLEL_GET_MORE_REQUESTS) { if (seqBegin->second.canBeSet()) { seqBegin->second.sendError(operation_obsolete()); } @@ -1369,10 +1369,10 @@ Future tLogPeekMessages(PromiseType replyPromise, } RangeResult kvs = wait(self->persistentData->readRange( - KeyRangeRef(persistTagMessagesKey(logData->logId, reqTag, reqBegin), - persistTagMessagesKey(logData->logId, reqTag, logData->persistentDataDurableVersion + 1)), - SERVER_KNOBS->DESIRED_TOTAL_BYTES, - SERVER_KNOBS->DESIRED_TOTAL_BYTES)); + KeyRangeRef(persistTagMessagesKey(logData->logId, reqTag, reqBegin), + persistTagMessagesKey(logData->logId, reqTag, logData->persistentDataDurableVersion + 1)), + SERVER_KNOBS->DESIRED_TOTAL_BYTES, + SERVER_KNOBS->DESIRED_TOTAL_BYTES)); //TraceEvent("TLogPeekResults", self->dbgid).detail("ForAddress", replyPromise.getEndpoint().address).detail("Tag1Results", s1).detail("Tag2Results", s2).detail("Tag1ResultsLim", kv1.size()).detail("Tag2ResultsLim", kv2.size()).detail("Tag1ResultsLast", kv1.size() ? kv1[0].key : "").detail("Tag2ResultsLast", kv2.size() ? kv2[0].key : "").detail("Limited", limited).detail("NextEpoch", next_pos.epoch).detail("NextSeq", next_pos.sequence).detail("NowEpoch", self->epoch()).detail("NowSeq", self->sequence.getNextSequence()); @@ -1460,14 +1460,9 @@ ACTOR Future tLogPeekStream(TLogData* self, TLogPeekStreamRequest req, Ref state Promise promise; state Future future(promise.getFuture()); try { - wait(req.reply.onReady() && - tLogPeekMessages(promise, self, logData, begin, req.tag, req.returnIfBlocked, onlySpilled)); - ASSERT(future.isReady()); - if (future.isError()) { - throw future.getError(); - } + wait(req.reply.onReady() && store(reply.rep, future) && + tLogPeekMessages(promise, self, logData, begin, req.tag, req.returnIfBlocked, onlySpilled)); - reply.rep = future.get(); reply.rep.begin = begin; req.reply.send(reply); begin = reply.rep.end; @@ -1986,7 +1981,7 @@ ACTOR Future serveTLogInterface(TLogData* self, } when(TLogPeekRequest req = waitNext(tli.peekMessages.getFuture())) { logData->addActor.send(tLogPeekMessages( - req.reply, self, logData, req.begin, req.tag, req.returnIfBlocked, req.onlySpilled, req.sequence)); + req.reply, self, logData, req.begin, req.tag, req.returnIfBlocked, req.onlySpilled, req.sequence)); } when(TLogPeekStreamRequest req = waitNext(tli.peekStreamMessages.getFuture())) { TraceEvent(SevDebug, "TLogPeekStream", logData->logId) diff --git a/fdbserver/OldTLogServer_6_2.actor.cpp b/fdbserver/OldTLogServer_6_2.actor.cpp index 9419b4cd9e..687ea0e638 100644 --- a/fdbserver/OldTLogServer_6_2.actor.cpp +++ b/fdbserver/OldTLogServer_6_2.actor.cpp @@ -1889,14 +1889,9 @@ ACTOR Future tLogPeekStream(TLogData* self, TLogPeekStreamRequest req, Ref state Promise promise; state Future future(promise.getFuture()); try { - wait(req.reply.onReady() && - tLogPeekMessages(promise, self, logData, begin, req.tag, req.returnIfBlocked, onlySpilled)); - ASSERT(future.isReady()); - if (future.isError()) { - throw future.getError(); - } + wait(req.reply.onReady() && store(reply.rep, future) && + tLogPeekMessages(promise, self, logData, begin, req.tag, req.returnIfBlocked, onlySpilled)); - reply.rep = future.get(); reply.rep.begin = begin; 
req.reply.send(reply); begin = reply.rep.end; @@ -2431,7 +2426,7 @@ ACTOR Future serveTLogInterface(TLogData* self, } when(TLogPeekRequest req = waitNext(tli.peekMessages.getFuture())) { logData->addActor.send(tLogPeekMessages( - req.reply, self, logData, req.begin, req.tag, req.returnIfBlocked, req.onlySpilled, req.sequence)); + req.reply, self, logData, req.begin, req.tag, req.returnIfBlocked, req.onlySpilled, req.sequence)); } when(TLogPeekStreamRequest req = waitNext(tli.peekStreamMessages.getFuture())) { TraceEvent(SevDebug, "TLogPeekStream", logData->logId) diff --git a/fdbserver/TLogServer.actor.cpp b/fdbserver/TLogServer.actor.cpp index b20d9f41cf..fea695645c 100644 --- a/fdbserver/TLogServer.actor.cpp +++ b/fdbserver/TLogServer.actor.cpp @@ -1942,14 +1942,9 @@ ACTOR Future tLogPeekStream(TLogData* self, TLogPeekStreamRequest req, Ref state Promise promise; state Future future(promise.getFuture()); try { - wait(req.reply.onReady() && + wait(req.reply.onReady() && store(reply.rep, future) && tLogPeekMessages(promise, self, logData, begin, req.tag, req.returnIfBlocked, onlySpilled)); - ASSERT(future.isReady()); - if (future.isError()) { - throw future.getError(); - } - reply.rep = future.get(); reply.rep.begin = begin; req.reply.send(reply); begin = reply.rep.end; From a97570bd065bc1796788d27fb1586457ccc4949c Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Wed, 11 Aug 2021 18:26:00 -0700 Subject: [PATCH 27/29] solve mis-spelling, trace log and format problems --- fdbclient/ServerKnobs.cpp | 2 +- fdbclient/ServerKnobs.h | 2 +- fdbserver/LogRouter.actor.cpp | 5 +++-- fdbserver/LogSystemPeekCursor.actor.cpp | 11 +++++------ fdbserver/OldTLogServer_4_6.actor.cpp | 4 +++- fdbserver/OldTLogServer_6_0.actor.cpp | 4 +++- fdbserver/OldTLogServer_6_2.actor.cpp | 4 +++- fdbserver/TLogInterface.h | 4 ++-- fdbserver/TLogServer.actor.cpp | 4 +++- flow/Trace.h | 1 + 10 files changed, 25 insertions(+), 16 deletions(-) diff --git a/fdbclient/ServerKnobs.cpp b/fdbclient/ServerKnobs.cpp index 290452012a..1e90c5a38b 100644 --- a/fdbclient/ServerKnobs.cpp +++ b/fdbclient/ServerKnobs.cpp @@ -64,7 +64,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( TLOG_MESSAGE_BLOCK_BYTES, 10e6 ); init( TLOG_MESSAGE_BLOCK_OVERHEAD_FACTOR, double(TLOG_MESSAGE_BLOCK_BYTES) / (TLOG_MESSAGE_BLOCK_BYTES - MAX_MESSAGE_SIZE) ); //1.0121466709838096006362758832473 init( PEEK_TRACKER_EXPIRATION_TIME, 600 ); if( randomize && BUGGIFY ) PEEK_TRACKER_EXPIRATION_TIME = deterministicRandom()->coinflip() ? 
0.1 : 120; - init( PEEK_USEING_STREAMING, true ); + init( PEEK_USING_STREAMING, true ); init( PARALLEL_GET_MORE_REQUESTS, 32 ); if( randomize && BUGGIFY ) PARALLEL_GET_MORE_REQUESTS = 2; init( MULTI_CURSOR_PRE_FETCH_LIMIT, 10 ); init( MAX_QUEUE_COMMIT_BYTES, 15e6 ); if( randomize && BUGGIFY ) MAX_QUEUE_COMMIT_BYTES = 5000; diff --git a/fdbclient/ServerKnobs.h b/fdbclient/ServerKnobs.h index 23b3049668..c905720898 100644 --- a/fdbclient/ServerKnobs.h +++ b/fdbclient/ServerKnobs.h @@ -41,7 +41,7 @@ public: // often, so that versions always advance smoothly // TLogs - bool PEEK_USEING_STREAMING; + bool PEEK_USING_STREAMING; double TLOG_TIMEOUT; // tlog OR commit proxy failure - master's reaction time double TLOG_SLOW_REJOIN_WARN_TIMEOUT_SECS; // Warns if a tlog takes too long to rejoin double RECOVERY_TLOG_SMART_QUORUM_DELAY; // smaller might be better for bug amplification diff --git a/fdbserver/LogRouter.actor.cpp b/fdbserver/LogRouter.actor.cpp index f26ea539af..22348fe8b3 100644 --- a/fdbserver/LogRouter.actor.cpp +++ b/fdbserver/LogRouter.actor.cpp @@ -573,7 +573,6 @@ Future logRouterPeekMessages(PromiseType replyPromise, return Void(); } -// TODO: enable streaming peek log from log router // This actor keep pushing TLogPeekStreamReply until it's removed from the cluster or should recover ACTOR Future logRouterPeekStream(LogRouterData* self, TLogPeekStreamRequest req) { self->activePeekStreams++; @@ -600,7 +599,9 @@ ACTOR Future logRouterPeekStream(LogRouterData* self, TLogPeekStreamReques } } catch (Error& e) { self->activePeekStreams--; - TraceEvent(SevDebug, "TLogPeekStreamEnd", self->dbgid).error(e, true); + TraceEvent(SevDebug, "TLogPeekStreamEnd", self->dbgid) + .detail("PeerAddr", req.reply.getEndpoint().getPrimaryAddress()) + .error(e, true); if (e.code() == error_code_end_of_stream || e.code() == error_code_operation_obsolete) { req.reply.sendError(e); diff --git a/fdbserver/LogSystemPeekCursor.actor.cpp b/fdbserver/LogSystemPeekCursor.actor.cpp index 769fd2ee57..4ab5aaf56f 100644 --- a/fdbserver/LogSystemPeekCursor.actor.cpp +++ b/fdbserver/LogSystemPeekCursor.actor.cpp @@ -38,7 +38,7 @@ ACTOR Future tryEstablishPeekStream(ILogSystem::ServerPeekCursor* self) { self->peekReplyStream = self->interf->get().interf().peekStreamMessages.getReplyStream(TLogPeekStreamRequest( self->messageVersion.version, self->tag, self->returnIfBlocked, std::numeric_limits::max())); TraceEvent(SevDebug, "SPC_StreamCreated", self->randomID) - .detail("PeerAddress", self->interf->get().interf().peekStreamMessages.getEndpoint().getPrimaryAddress()) + .detail("PeerAddr", self->interf->get().interf().peekStreamMessages.getEndpoint().getPrimaryAddress()) .detail("PeerToken", self->interf->get().interf().peekStreamMessages.getEndpoint().token); return Void(); } @@ -52,11 +52,11 @@ ILogSystem::ServerPeekCursor::ServerPeekCursor(ReferencerandomUniqueID()), returnIfBlocked(returnIfBlocked), onlySpilled(false), parallelGetMore(parallelGetMore), - usePeekStream(SERVER_KNOBS->PEEK_USEING_STREAMING), sequence(0), lastReset(0), resetCheck(Void()), slowReplies(0), + usePeekStream(SERVER_KNOBS->PEEK_USING_STREAMING), sequence(0), lastReset(0), resetCheck(Void()), slowReplies(0), fastReplies(0), unknownReplies(0) { this->results.maxKnownVersion = 0; this->results.minKnownCommittedVersion = 0; - TraceEvent(SevDebug, "SPC_Starting", randomID) + DisabledTraceEvent(SevDebug, "SPC_Starting", randomID) .detail("Tag", tag.toString()) .detail("Begin", begin) .detail("End", end); @@ -355,7 +355,7 @@ ACTOR Future 
serverPeekStreamGetMore(ILogSystem::ServerPeekCursor* self, T } updateCursorWithReply(self, res); expectedBegin = res.end; - TraceEvent(SevDebug, "SPC_GetMoreB", self->randomID) + DisabledTraceEvent(SevDebug, "SPC_GetMoreB", self->randomID) .detail("Has", self->hasMessage()) .detail("End", res.end) .detail("Popped", res.popped.present() ? res.popped.get() : 0); @@ -367,7 +367,7 @@ ACTOR Future serverPeekStreamGetMore(ILogSystem::ServerPeekCursor* self, T } } } catch (Error& e) { - TraceEvent(SevDebug, "SPC_GetMoreB_Error").error(e, true); + TraceEvent(SevDebug, "SPC_GetMoreB_Error", self->randomID).error(e, true); if (e.code() == error_code_connection_failed || e.code() == error_code_operation_obsolete) { // NOTE: delay in order to avoid the endless retry loop block other tasks self->peekReplyStream.reset(); @@ -424,7 +424,6 @@ Future ILogSystem::ServerPeekCursor::getMore(TaskPriority taskID) { if (hasMessage() && !parallelGetMore) return Void(); if (!more.isValid() || more.isReady()) { - // TODO: add tagLocalityRemoteLog when log router support streaming peek if (usePeekStream && (tag.locality >= 0 || tag.locality == tagLocalityLogRouter || tag.locality == tagLocalityRemoteLog)) { more = serverPeekStreamGetMore(this, taskID); } else if (parallelGetMore || onlySpilled || futureResults.size()) { diff --git a/fdbserver/OldTLogServer_4_6.actor.cpp b/fdbserver/OldTLogServer_4_6.actor.cpp index f50744cc1a..7cb4c565bf 100644 --- a/fdbserver/OldTLogServer_4_6.actor.cpp +++ b/fdbserver/OldTLogServer_4_6.actor.cpp @@ -1162,7 +1162,9 @@ ACTOR Future tLogPeekStream(TLogData* self, TLogPeekStreamRequest req, Ref } } catch (Error& e) { self->activePeekStreams--; - TraceEvent(SevDebug, "TLogPeekStreamEnd", logData->logId).error(e, true); + TraceEvent(SevDebug, "TLogPeekStreamEnd", logData->logId) + .detail("PeerAddr", req.reply.getEndpoint().getPrimaryAddress()) + .error(e, true); if (e.code() == error_code_end_of_stream || e.code() == error_code_operation_obsolete) { req.reply.sendError(e); diff --git a/fdbserver/OldTLogServer_6_0.actor.cpp b/fdbserver/OldTLogServer_6_0.actor.cpp index 5d77b4e12f..cc561d1f3d 100644 --- a/fdbserver/OldTLogServer_6_0.actor.cpp +++ b/fdbserver/OldTLogServer_6_0.actor.cpp @@ -1474,7 +1474,9 @@ ACTOR Future tLogPeekStream(TLogData* self, TLogPeekStreamRequest req, Ref } } catch (Error& e) { self->activePeekStreams--; - TraceEvent(SevDebug, "TLogPeekStreamEnd", logData->logId).error(e, true); + TraceEvent(SevDebug, "TLogPeekStreamEnd", logData->logId) + .detail("PeerAddr", req.reply.getEndpoint().getPrimaryAddress()) + .error(e, true); if (e.code() == error_code_end_of_stream || e.code() == error_code_operation_obsolete) { req.reply.sendError(e); diff --git a/fdbserver/OldTLogServer_6_2.actor.cpp b/fdbserver/OldTLogServer_6_2.actor.cpp index 687ea0e638..284afb1da3 100644 --- a/fdbserver/OldTLogServer_6_2.actor.cpp +++ b/fdbserver/OldTLogServer_6_2.actor.cpp @@ -1903,7 +1903,9 @@ ACTOR Future tLogPeekStream(TLogData* self, TLogPeekStreamRequest req, Ref } } catch (Error& e) { self->activePeekStreams--; - TraceEvent(SevDebug, "TLogPeekStreamEnd", logData->logId).error(e, true); + TraceEvent(SevDebug, "TLogPeekStreamEnd", logData->logId) + .detail("PeerAddr", req.reply.getEndpoint().getPrimaryAddress()) + .error(e, true); if (e.code() == error_code_end_of_stream || e.code() == error_code_operation_obsolete) { req.reply.sendError(e); diff --git a/fdbserver/TLogInterface.h b/fdbserver/TLogInterface.h index 322c7ddb35..1ca049927d 100644 --- a/fdbserver/TLogInterface.h +++ 
b/fdbserver/TLogInterface.h @@ -83,7 +83,7 @@ struct TLogInterface { streams.push_back(disablePopRequest.getReceiver()); streams.push_back(enablePopRequest.getReceiver()); streams.push_back(snapRequest.getReceiver()); - streams.push_back(peekStreamMessages.getReceiver(TaskPriority::TLogPeek)); + streams.push_back(peekStreamMessages.getReceiver(TaskPriority::TLogPeek)); FlowTransport::transport().addEndpoints(streams); } @@ -235,7 +235,7 @@ struct TLogPeekStreamRequest { Arena arena; Version begin; Tag tag; - bool returnIfBlocked; + bool returnIfBlocked; int limitBytes; ReplyPromiseStream reply; diff --git a/fdbserver/TLogServer.actor.cpp b/fdbserver/TLogServer.actor.cpp index f7853e23b2..6608468860 100644 --- a/fdbserver/TLogServer.actor.cpp +++ b/fdbserver/TLogServer.actor.cpp @@ -1953,7 +1953,9 @@ ACTOR Future tLogPeekStream(TLogData* self, TLogPeekStreamRequest req, Ref } } catch (Error& e) { self->activePeekStreams--; - TraceEvent(SevDebug, "TLogPeekStreamEnd", logData->logId).error(e, true); + TraceEvent(SevDebug, "TLogPeekStreamEnd", logData->logId) + .detail("PeerAddr", req.reply.getEndpoint().getPrimaryAddress()) + .error(e, true); if (e.code() == error_code_end_of_stream || e.code() == error_code_operation_obsolete) { req.reply.sendError(e); diff --git a/flow/Trace.h b/flow/Trace.h index 467422b9cc..aeaabb4373 100644 --- a/flow/Trace.h +++ b/flow/Trace.h @@ -599,4 +599,5 @@ extern TraceBatch g_traceBatch; #define DUMPTOKEN(name) \ TraceEvent("DumpToken", recruited.id()).detail("Name", #name).detail("Token", name.getEndpoint().token) +#define DisabledTraceEvent(...) false && TraceEvent() #endif From df7a801945f0e607af784205c57eb8e29bb35c47 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Thu, 12 Aug 2021 14:10:34 -0700 Subject: [PATCH 28/29] remove FIXME --- fdbclient/ServerKnobs.cpp | 2 +- fdbserver/LogRouter.actor.cpp | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/fdbclient/ServerKnobs.cpp b/fdbclient/ServerKnobs.cpp index 1e90c5a38b..0873b072dc 100644 --- a/fdbclient/ServerKnobs.cpp +++ b/fdbclient/ServerKnobs.cpp @@ -64,7 +64,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( TLOG_MESSAGE_BLOCK_BYTES, 10e6 ); init( TLOG_MESSAGE_BLOCK_OVERHEAD_FACTOR, double(TLOG_MESSAGE_BLOCK_BYTES) / (TLOG_MESSAGE_BLOCK_BYTES - MAX_MESSAGE_SIZE) ); //1.0121466709838096006362758832473 init( PEEK_TRACKER_EXPIRATION_TIME, 600 ); if( randomize && BUGGIFY ) PEEK_TRACKER_EXPIRATION_TIME = deterministicRandom()->coinflip() ? 
0.1 : 120; - init( PEEK_USING_STREAMING, true ); + init( PEEK_USING_STREAMING, true ); init( PARALLEL_GET_MORE_REQUESTS, 32 ); if( randomize && BUGGIFY ) PARALLEL_GET_MORE_REQUESTS = 2; init( MULTI_CURSOR_PRE_FETCH_LIMIT, 10 ); init( MAX_QUEUE_COMMIT_BYTES, 15e6 ); if( randomize && BUGGIFY ) MAX_QUEUE_COMMIT_BYTES = 5000; diff --git a/fdbserver/LogRouter.actor.cpp b/fdbserver/LogRouter.actor.cpp index 22348fe8b3..7e3316b28d 100644 --- a/fdbserver/LogRouter.actor.cpp +++ b/fdbserver/LogRouter.actor.cpp @@ -694,11 +694,9 @@ ACTOR Future logRouterCore(TLogInterface interf, req.reply, &logRouterData, req.begin, req.tag, req.returnIfBlocked, req.onlySpilled, req.sequence)); } when(TLogPeekStreamRequest req = waitNext(interf.peekStreamMessages.getFuture())) { - // FIXME: currently LogRouter doesn't support streaming peek request TraceEvent(SevDebug, "LogRouterPeekStream", logRouterData.dbgid) .detail("Token", interf.peekStreamMessages.getEndpoint().token); addActor.send(logRouterPeekStream(&logRouterData, req)); - // req.reply.sendError(operation_failed()); } when(TLogPopRequest req = waitNext(interf.popMessages.getFuture())) { // Request from remote tLog to pop data from LR From d12bda94ae3d5593c21a49a01c95e2f854cc9b0d Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Mon, 16 Aug 2021 16:33:20 -0700 Subject: [PATCH 29/29] disable trace log --- fdbserver/LogSystemPeekCursor.actor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdbserver/LogSystemPeekCursor.actor.cpp b/fdbserver/LogSystemPeekCursor.actor.cpp index 4ab5aaf56f..429e803c02 100644 --- a/fdbserver/LogSystemPeekCursor.actor.cpp +++ b/fdbserver/LogSystemPeekCursor.actor.cpp @@ -367,7 +367,7 @@ ACTOR Future serverPeekStreamGetMore(ILogSystem::ServerPeekCursor* self, T } } } catch (Error& e) { - TraceEvent(SevDebug, "SPC_GetMoreB_Error", self->randomID).error(e, true); + DisabledTraceEvent(SevDebug, "SPC_GetMoreB_Error", self->randomID).error(e, true); if (e.code() == error_code_connection_failed || e.code() == error_code_operation_obsolete) { // NOTE: delay in order to avoid the endless retry loop block other tasks self->peekReplyStream.reset();
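
A closing note on the DisabledTraceEvent macro this final patch uses: it relies on && short-circuiting, so the event object is never constructed and the chained .detail()/.error() calls are parsed but never evaluated. A self-contained illustration of the idiom in plain C++ (Event is a toy stand-in for TraceEvent; the explicit operator bool is what lets the expression sit on the right of && in this sketch):

    #include <iostream>

    struct Event {
        Event& detail(const char* key, int value) {
            std::cout << key << "=" << value << '\n';
            return *this;
        }
        explicit operator bool() const { return true; }
    };

    // Arguments are discarded; the right-hand side is never evaluated.
    #define DISABLED_EVENT(...) false && Event()

    int main() {
        DISABLED_EVENT("SPC_GetMoreB_Error").detail("Code", 1); // prints nothing
        Event().detail("Enabled", 2);                           // prints Enabled=2
        return 0;
    }

This keeps call sites compiling (and type-checked) while costing nothing at runtime, which is why the series can switch individual events off without deleting the lines.
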