mirror of
https://github.com/apple/foundationdb.git
synced 2025-05-15 02:18:39 +08:00
* Fix orphaned storage server due to force recovery The force recovery can roll back the transaction that adds a storage server. However, the storage server may now at version B > A, the recovery version. As a result, its peek to buddy TLog won't return TLogPeekReply::popped to trigger its exit, and instead getting a higher version C > B back. To the storage server, this means the message is empty, thus not removing itself and keeps peeking. The fix is to instead of using recovery version as the popped version for the SS, we use the recovery transaction version, which is the first transaction after the recovery. Force recovery bumps this version to a much higher version than the SS's version. So the TLog would set TLogPeekReply::popped to trigger the storage server exit. * Fix tlog peek to disallow return empty message between recoveredAt and recovery txn version This contract today is not explicitly set and can cause storage server to fail with assertion "rollbackVersion >= data->storageVersion()". This is because if such an empty version is returned, SS may advance its storage version to a value larger than the rollback version set in the recovery transaction. The fix is to block peek reply until recovery transaction has been received. * Move recoveryTxnReceived to be per LogData This is because a shared TLog can have a first generation TLog which is already setting the promise, thus later generations won't wait for the recovery version. For the current generation, all peeks need to wait, while for older generations, there is no need to wait (by checking if they are stopped). * For initial commit, poppedVersion needs to be at least 2 To get rid of the previous unsuccessful recovery's recruited seed storage servers.
3758 lines
152 KiB
C++
3758 lines
152 KiB
C++
/*
|
|
* TLogServer.actor.cpp
|
|
*
|
|
* This source file is part of the FoundationDB open source project
|
|
*
|
|
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
|
|
*
|
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
* you may not use this file except in compliance with the License.
|
|
* You may obtain a copy of the License at
|
|
*
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
*
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
* See the License for the specific language governing permissions and
|
|
* limitations under the License.
|
|
*/
|
|
|
|
#include "flow/Hash3.h"
|
|
#include "flow/UnitTest.h"
|
|
#include "fdbclient/NativeAPI.actor.h"
|
|
#include "fdbclient/Notified.h"
|
|
#include "fdbclient/KeyRangeMap.h"
|
|
#include "fdbclient/RunTransaction.actor.h"
|
|
#include "fdbclient/SystemData.h"
|
|
#include "fdbclient/FDBTypes.h"
|
|
#include "fdbclient/ManagementAPI.actor.h"
|
|
#include "fdbserver/WorkerInterface.actor.h"
|
|
#include "fdbserver/LogProtocolMessage.h"
|
|
#include "fdbserver/SpanContextMessage.h"
|
|
#include "fdbserver/TLogInterface.h"
|
|
#include "fdbserver/Knobs.h"
|
|
#include "fdbserver/IKeyValueStore.h"
|
|
#include "fdbserver/MutationTracking.h"
|
|
#include "flow/ActorCollection.h"
|
|
#include "fdbrpc/FailureMonitor.h"
|
|
#include "fdbserver/IDiskQueue.h"
|
|
#include "fdbrpc/sim_validation.h"
|
|
#include "fdbrpc/simulator.h"
|
|
#include "fdbrpc/Stats.h"
|
|
#include "fdbserver/ServerDBInfo.h"
|
|
#include "fdbserver/LogSystem.h"
|
|
#include "fdbserver/WaitFailure.h"
|
|
#include "fdbserver/RecoveryState.h"
|
|
#include "fdbserver/FDBExecHelper.actor.h"
|
|
#include "flow/Histogram.h"
|
|
#include "flow/actorcompiler.h" // This must be the last #include.
|
|
|
|
struct TLogQueueEntryRef {
|
|
UID id;
|
|
Version version;
|
|
Version knownCommittedVersion;
|
|
StringRef messages;
|
|
TLogQueueEntryRef() : version(0), knownCommittedVersion(0) {}
|
|
TLogQueueEntryRef(Arena& a, TLogQueueEntryRef const& from)
|
|
: id(from.id), version(from.version), knownCommittedVersion(from.knownCommittedVersion),
|
|
messages(a, from.messages) {}
|
|
|
|
// To change this serialization, ProtocolVersion::TLogQueueEntryRef must be updated, and downgrades need to be
|
|
// considered
|
|
template <class Ar>
|
|
void serialize(Ar& ar) {
|
|
serializer(ar, version, messages, knownCommittedVersion, id);
|
|
}
|
|
size_t expectedSize() const { return messages.expectedSize(); }
|
|
};
|
|
|
|
struct AlternativeTLogQueueEntryRef {
|
|
UID id;
|
|
Version version;
|
|
Version knownCommittedVersion;
|
|
std::vector<TagsAndMessage>* alternativeMessages;
|
|
|
|
AlternativeTLogQueueEntryRef() : version(0), knownCommittedVersion(0), alternativeMessages(nullptr) {}
|
|
|
|
template <class Ar>
|
|
void serialize(Ar& ar) {
|
|
ASSERT(!ar.isDeserializing && alternativeMessages);
|
|
uint32_t msgSize = expectedSize();
|
|
serializer(ar, version, msgSize);
|
|
for (auto& msg : *alternativeMessages) {
|
|
ar.serializeBytes(msg.message);
|
|
}
|
|
serializer(ar, knownCommittedVersion, id);
|
|
}
|
|
|
|
uint32_t expectedSize() const {
|
|
uint32_t msgSize = 0;
|
|
for (auto& msg : *alternativeMessages) {
|
|
msgSize += msg.message.size();
|
|
}
|
|
return msgSize;
|
|
}
|
|
};
|
|
|
|
typedef Standalone<TLogQueueEntryRef> TLogQueueEntry;
|
|
struct LogData;
|
|
struct TLogData;
|
|
|
|
struct TLogQueue final : public IClosable {
|
|
public:
|
|
TLogQueue(IDiskQueue* queue, UID dbgid) : queue(queue), dbgid(dbgid) {}
|
|
|
|
// Each packet in the queue is
|
|
// uint32_t payloadSize
|
|
// uint8_t payload[payloadSize] (begins with uint64_t protocolVersion via IncludeVersion)
|
|
// uint8_t validFlag
|
|
|
|
// TLogQueue is a durable queue of TLogQueueEntry objects with an interface similar to IDiskQueue
|
|
|
|
// TLogQueue pushes (but not commits) are atomic - after commit fails to return, a prefix of entire calls to push
|
|
// are durable. This is
|
|
// implemented on top of the weaker guarantee of IDiskQueue::commit (that a prefix of bytes is durable) using
|
|
// validFlag and by padding any incomplete packet with zeros after recovery.
|
|
|
|
// Before calling push, pop, or commit, the user must call readNext() until it throws
|
|
// end_of_stream(). It may not be called again thereafter.
|
|
Future<TLogQueueEntry> readNext(TLogData* tLog) { return readNext(this, tLog); }
|
|
|
|
Future<bool> initializeRecovery(IDiskQueue::location recoverAt) { return queue->initializeRecovery(recoverAt); }
|
|
|
|
template <class T>
|
|
void push(T const& qe, Reference<LogData> logData);
|
|
void forgetBefore(Version upToVersion, Reference<LogData> logData);
|
|
void pop(IDiskQueue::location upToLocation);
|
|
Future<Void> commit() { return queue->commit(); }
|
|
|
|
// Implements IClosable
|
|
Future<Void> getError() const override { return queue->getError(); }
|
|
Future<Void> onClosed() const override { return queue->onClosed(); }
|
|
void dispose() override {
|
|
queue->dispose();
|
|
delete this;
|
|
}
|
|
void close() override {
|
|
queue->close();
|
|
delete this;
|
|
}
|
|
|
|
private:
|
|
IDiskQueue* queue;
|
|
UID dbgid;
|
|
|
|
void updateVersionSizes(const TLogQueueEntry& result,
|
|
TLogData* tLog,
|
|
IDiskQueue::location start,
|
|
IDiskQueue::location end);
|
|
|
|
ACTOR static Future<TLogQueueEntry> readNext(TLogQueue* self, TLogData* tLog) {
|
|
state TLogQueueEntry result;
|
|
state int zeroFillSize = 0;
|
|
|
|
loop {
|
|
state IDiskQueue::location startloc = self->queue->getNextReadLocation();
|
|
Standalone<StringRef> h = wait(self->queue->readNext(sizeof(uint32_t)));
|
|
if (h.size() != sizeof(uint32_t)) {
|
|
if (h.size()) {
|
|
TEST(true); // Zero fill within size field
|
|
int payloadSize = 0;
|
|
memcpy(&payloadSize, h.begin(), h.size());
|
|
zeroFillSize = sizeof(uint32_t) - h.size(); // zero fill the size itself
|
|
zeroFillSize += payloadSize + 1; // and then the contents and valid flag
|
|
}
|
|
break;
|
|
}
|
|
|
|
state uint32_t payloadSize = *(uint32_t*)h.begin();
|
|
ASSERT(payloadSize < (100 << 20));
|
|
|
|
Standalone<StringRef> e = wait(self->queue->readNext(payloadSize + 1));
|
|
if (e.size() != payloadSize + 1) {
|
|
TEST(true); // Zero fill within payload
|
|
zeroFillSize = payloadSize + 1 - e.size();
|
|
break;
|
|
}
|
|
|
|
if (e[payloadSize]) {
|
|
ASSERT(e[payloadSize] == 1);
|
|
Arena a = e.arena();
|
|
ArenaReader ar(a, e.substr(0, payloadSize), IncludeVersion());
|
|
ar >> result;
|
|
const IDiskQueue::location endloc = self->queue->getNextReadLocation();
|
|
self->updateVersionSizes(result, tLog, startloc, endloc);
|
|
return result;
|
|
}
|
|
}
|
|
if (zeroFillSize) {
|
|
TEST(true); // Fixing a partial commit at the end of the tlog queue
|
|
for (int i = 0; i < zeroFillSize; i++)
|
|
self->queue->push(StringRef((const uint8_t*)"", 1));
|
|
}
|
|
throw end_of_stream();
|
|
}
|
|
};
|
|
|
|
////// Persistence format (for self->persistentData)
|
|
|
|
// Immutable keys
|
|
// persistFormat has been mostly invalidated by TLogVersion, and can probably be removed when
|
|
// 4.6's TLog code is removed.
|
|
static const KeyValueRef persistFormat(LiteralStringRef("Format"), LiteralStringRef("FoundationDB/LogServer/3/0"));
|
|
static const KeyRangeRef persistFormatReadableRange(LiteralStringRef("FoundationDB/LogServer/3/0"),
|
|
LiteralStringRef("FoundationDB/LogServer/4/0"));
|
|
static const KeyRangeRef persistProtocolVersionKeys(LiteralStringRef("ProtocolVersion/"),
|
|
LiteralStringRef("ProtocolVersion0"));
|
|
static const KeyRangeRef persistTLogSpillTypeKeys(LiteralStringRef("TLogSpillType/"),
|
|
LiteralStringRef("TLogSpillType0"));
|
|
static const KeyRangeRef persistRecoveryCountKeys =
|
|
KeyRangeRef(LiteralStringRef("DbRecoveryCount/"), LiteralStringRef("DbRecoveryCount0"));
|
|
|
|
// Updated on updatePersistentData()
|
|
static const KeyRangeRef persistCurrentVersionKeys =
|
|
KeyRangeRef(LiteralStringRef("version/"), LiteralStringRef("version0"));
|
|
static const KeyRangeRef persistKnownCommittedVersionKeys =
|
|
KeyRangeRef(LiteralStringRef("knownCommitted/"), LiteralStringRef("knownCommitted0"));
|
|
static const KeyRef persistRecoveryLocationKey = KeyRef(LiteralStringRef("recoveryLocation"));
|
|
static const KeyRangeRef persistLocalityKeys =
|
|
KeyRangeRef(LiteralStringRef("Locality/"), LiteralStringRef("Locality0"));
|
|
static const KeyRangeRef persistLogRouterTagsKeys =
|
|
KeyRangeRef(LiteralStringRef("LogRouterTags/"), LiteralStringRef("LogRouterTags0"));
|
|
static const KeyRangeRef persistTxsTagsKeys = KeyRangeRef(LiteralStringRef("TxsTags/"), LiteralStringRef("TxsTags0"));
|
|
static const KeyRange persistTagMessagesKeys = prefixRange(LiteralStringRef("TagMsg/"));
|
|
static const KeyRange persistTagMessageRefsKeys = prefixRange(LiteralStringRef("TagMsgRef/"));
|
|
static const KeyRange persistTagPoppedKeys = prefixRange(LiteralStringRef("TagPop/"));
|
|
|
|
static const KeyRef persistClusterIdKey = LiteralStringRef("clusterId");
|
|
|
|
static Key persistTagMessagesKey(UID id, Tag tag, Version version) {
|
|
BinaryWriter wr(Unversioned());
|
|
wr.serializeBytes(persistTagMessagesKeys.begin);
|
|
wr << id;
|
|
wr << tag;
|
|
wr << bigEndian64(version);
|
|
return wr.toValue();
|
|
}
|
|
|
|
static Key persistTagMessageRefsKey(UID id, Tag tag, Version version) {
|
|
BinaryWriter wr(Unversioned());
|
|
wr.serializeBytes(persistTagMessageRefsKeys.begin);
|
|
wr << id;
|
|
wr << tag;
|
|
wr << bigEndian64(version);
|
|
return wr.toValue();
|
|
}
|
|
|
|
static Key persistTagPoppedKey(UID id, Tag tag) {
|
|
BinaryWriter wr(Unversioned());
|
|
wr.serializeBytes(persistTagPoppedKeys.begin);
|
|
wr << id;
|
|
wr << tag;
|
|
return wr.toValue();
|
|
}
|
|
|
|
static Value persistTagPoppedValue(Version popped) {
|
|
return BinaryWriter::toValue(popped, Unversioned());
|
|
}
|
|
|
|
static Tag decodeTagPoppedKey(KeyRef id, KeyRef key) {
|
|
Tag s;
|
|
BinaryReader rd(key.removePrefix(persistTagPoppedKeys.begin).removePrefix(id), Unversioned());
|
|
rd >> s;
|
|
return s;
|
|
}
|
|
|
|
static Version decodeTagPoppedValue(ValueRef value) {
|
|
return BinaryReader::fromStringRef<Version>(value, Unversioned());
|
|
}
|
|
|
|
static StringRef stripTagMessagesKey(StringRef key) {
|
|
return key.substr(sizeof(UID) + sizeof(Tag) + persistTagMessagesKeys.begin.size());
|
|
}
|
|
|
|
static Version decodeTagMessagesKey(StringRef key) {
|
|
return bigEndian64(BinaryReader::fromStringRef<Version>(stripTagMessagesKey(key), Unversioned()));
|
|
}
|
|
|
|
struct SpilledData {
|
|
SpilledData() = default;
|
|
SpilledData(Version version, IDiskQueue::location start, uint32_t length, uint32_t mutationBytes)
|
|
: version(version), start(start), length(length), mutationBytes(mutationBytes) {}
|
|
|
|
template <class Ar>
|
|
void serialize(Ar& ar) {
|
|
serializer(ar, version, start, length, mutationBytes);
|
|
}
|
|
|
|
Version version = 0;
|
|
IDiskQueue::location start = 0;
|
|
uint32_t length = 0;
|
|
uint32_t mutationBytes = 0;
|
|
};
|
|
|
|
struct TLogData : NonCopyable {
|
|
AsyncTrigger newLogData;
|
|
// A process has only 1 SharedTLog, which holds data for multiple logs, so that it obeys its assigned memory limit.
|
|
// A process has only 1 active log and multiple non-active log from old generations.
|
|
// In the figure below, TLog [1-4] are logs from old generations.
|
|
// Because SS may need to pull data from old generation log, we keep Tlog [1-4].
|
|
//
|
|
// We always pop the disk queue from the oldest TLog, spill from the oldest TLog that still has
|
|
// data in memory, and commits to the disk queue come from the most recent TLog.
|
|
//
|
|
// SharedTLog
|
|
// +--------+--------+--------+--------+--------+
|
|
// | TLog 1 | TLog 2 | TLog 3 | TLog 4 | TLog 5 |
|
|
// +--------+--------+--------+--------+--------+
|
|
// ^ popOrder ^spillOrder ^committing
|
|
//
|
|
// ^popOrder is the location where SS reads the to-be-read data from tlog.
|
|
// ^committing is the location where the active TLog accepts the pushed data.
|
|
Deque<UID> popOrder;
|
|
Deque<UID> spillOrder;
|
|
std::map<UID, Reference<struct LogData>> id_data;
|
|
|
|
// The durable cluster ID identifies which cluster the tlogs persistent
|
|
// data is written from. This value is restored from disk when the tlog
|
|
// restarts.
|
|
UID durableClusterId;
|
|
// The cluster-controller cluster ID stores the cluster ID read from the txnStateStore.
|
|
// It is cached in this variable.
|
|
UID ccClusterId;
|
|
UID dbgid;
|
|
UID workerID;
|
|
|
|
IKeyValueStore* persistentData; // Durable data on disk that were spilled.
|
|
IDiskQueue* rawPersistentQueue; // The physical queue the persistentQueue below stores its data. Ideally, log
|
|
// interface should work without directly accessing rawPersistentQueue
|
|
TLogQueue* persistentQueue; // Logical queue the log operates on and persist its data.
|
|
|
|
std::deque<std::tuple<Version, int>> unknownCommittedVersions;
|
|
|
|
int64_t diskQueueCommitBytes;
|
|
AsyncVar<bool>
|
|
largeDiskQueueCommitBytes; // becomes true when diskQueueCommitBytes is greater than MAX_QUEUE_COMMIT_BYTES
|
|
|
|
Reference<AsyncVar<ServerDBInfo> const> dbInfo;
|
|
Database cx;
|
|
|
|
NotifiedVersion queueCommitEnd;
|
|
Version queueCommitBegin;
|
|
|
|
int64_t instanceID;
|
|
int64_t bytesInput;
|
|
int64_t bytesDurable;
|
|
int64_t targetVolatileBytes; // The number of bytes of mutations this TLog should hold in memory before spilling.
|
|
int64_t overheadBytesInput;
|
|
int64_t overheadBytesDurable;
|
|
int activePeekStreams = 0;
|
|
|
|
WorkerCache<TLogInterface> tlogCache;
|
|
FlowLock peekMemoryLimiter;
|
|
|
|
PromiseStream<Future<Void>> sharedActors;
|
|
Promise<Void> terminated;
|
|
FlowLock concurrentLogRouterReads;
|
|
FlowLock persistentDataCommitLock;
|
|
|
|
// Beginning of fields used by snapshot based backup and restore
|
|
double ignorePopDeadline; // time until which the ignorePopRequest will be
|
|
// honored
|
|
std::string ignorePopUid; // callers that set ignorePopRequest will set this
|
|
// extra state, used to validate the ownership of
|
|
// the set and for callers that unset will
|
|
// be able to match it up
|
|
std::string dataFolder; // folder where data is stored
|
|
Reference<AsyncVar<bool>> degraded;
|
|
// End of fields used by snapshot based backup and restore
|
|
|
|
std::vector<TagsAndMessage> tempTagMessages;
|
|
|
|
Reference<Histogram> commitLatencyDist;
|
|
|
|
TLogData(UID dbgid,
|
|
UID workerID,
|
|
IKeyValueStore* persistentData,
|
|
IDiskQueue* persistentQueue,
|
|
Reference<AsyncVar<ServerDBInfo> const> dbInfo,
|
|
Reference<AsyncVar<bool>> degraded,
|
|
std::string folder)
|
|
: dbgid(dbgid), workerID(workerID), persistentData(persistentData), rawPersistentQueue(persistentQueue),
|
|
persistentQueue(new TLogQueue(persistentQueue, dbgid)), diskQueueCommitBytes(0),
|
|
largeDiskQueueCommitBytes(false), dbInfo(dbInfo), queueCommitEnd(0), queueCommitBegin(0),
|
|
instanceID(deterministicRandom()->randomUniqueID().first()), bytesInput(0), bytesDurable(0),
|
|
targetVolatileBytes(SERVER_KNOBS->TLOG_SPILL_THRESHOLD), overheadBytesInput(0), overheadBytesDurable(0),
|
|
peekMemoryLimiter(SERVER_KNOBS->TLOG_SPILL_REFERENCE_MAX_PEEK_MEMORY_BYTES),
|
|
concurrentLogRouterReads(SERVER_KNOBS->CONCURRENT_LOG_ROUTER_READS), ignorePopDeadline(0), dataFolder(folder),
|
|
degraded(degraded), commitLatencyDist(Histogram::getHistogram(LiteralStringRef("tLog"),
|
|
LiteralStringRef("commit"),
|
|
Histogram::Unit::microseconds)) {
|
|
cx = openDBOnServer(dbInfo, TaskPriority::DefaultEndpoint, LockAware::True);
|
|
}
|
|
};
|
|
|
|
struct LogData : NonCopyable, public ReferenceCounted<LogData> {
|
|
struct TagData : NonCopyable, public ReferenceCounted<TagData> {
|
|
std::deque<std::pair<Version, LengthPrefixedStringRef>> versionMessages;
|
|
bool
|
|
nothingPersistent; // true means tag is *known* to have no messages in persistentData. false means nothing.
|
|
bool poppedRecently; // `popped` has changed since last updatePersistentData
|
|
Version popped; // see popped version tracking contract below
|
|
Version persistentPopped; // The popped version recorded in the btree.
|
|
Version versionForPoppedLocation; // `poppedLocation` was calculated at this popped version
|
|
IDiskQueue::location poppedLocation; // The location of the earliest commit with data for this tag.
|
|
bool unpoppedRecovered;
|
|
Tag tag;
|
|
|
|
TagData(Tag tag,
|
|
Version popped,
|
|
IDiskQueue::location poppedLocation,
|
|
bool nothingPersistent,
|
|
bool poppedRecently,
|
|
bool unpoppedRecovered)
|
|
: nothingPersistent(nothingPersistent), poppedRecently(poppedRecently), popped(popped), persistentPopped(0),
|
|
versionForPoppedLocation(0), poppedLocation(poppedLocation), unpoppedRecovered(unpoppedRecovered),
|
|
tag(tag) {}
|
|
|
|
TagData(TagData&& r) noexcept
|
|
: versionMessages(std::move(r.versionMessages)), nothingPersistent(r.nothingPersistent),
|
|
poppedRecently(r.poppedRecently), popped(r.popped), persistentPopped(r.persistentPopped),
|
|
versionForPoppedLocation(r.versionForPoppedLocation), poppedLocation(r.poppedLocation),
|
|
unpoppedRecovered(r.unpoppedRecovered), tag(r.tag) {}
|
|
void operator=(TagData&& r) noexcept {
|
|
versionMessages = std::move(r.versionMessages);
|
|
nothingPersistent = r.nothingPersistent;
|
|
poppedRecently = r.poppedRecently;
|
|
popped = r.popped;
|
|
persistentPopped = r.persistentPopped;
|
|
versionForPoppedLocation = r.versionForPoppedLocation;
|
|
poppedLocation = r.poppedLocation;
|
|
tag = r.tag;
|
|
unpoppedRecovered = r.unpoppedRecovered;
|
|
}
|
|
|
|
// Erase messages not needed to update *from* versions >= before (thus, messages with toversion <= before)
|
|
ACTOR Future<Void> eraseMessagesBefore(TagData* self,
|
|
Version before,
|
|
TLogData* tlogData,
|
|
Reference<LogData> logData,
|
|
TaskPriority taskID) {
|
|
while (!self->versionMessages.empty() && self->versionMessages.front().first < before) {
|
|
Version version = self->versionMessages.front().first;
|
|
std::pair<int, int>& sizes = logData->version_sizes[version];
|
|
int64_t messagesErased = 0;
|
|
|
|
while (!self->versionMessages.empty() && self->versionMessages.front().first == version) {
|
|
auto const& m = self->versionMessages.front();
|
|
++messagesErased;
|
|
|
|
if (self->tag.locality != tagLocalityTxs && self->tag != txsTag) {
|
|
sizes.first -= m.second.expectedSize();
|
|
} else {
|
|
sizes.second -= m.second.expectedSize();
|
|
}
|
|
|
|
self->versionMessages.pop_front();
|
|
}
|
|
|
|
int64_t bytesErased = messagesErased * SERVER_KNOBS->VERSION_MESSAGES_ENTRY_BYTES_WITH_OVERHEAD;
|
|
logData->bytesDurable += bytesErased;
|
|
tlogData->bytesDurable += bytesErased;
|
|
tlogData->overheadBytesDurable += bytesErased;
|
|
wait(yield(taskID));
|
|
}
|
|
|
|
return Void();
|
|
}
|
|
|
|
Future<Void> eraseMessagesBefore(Version before,
|
|
TLogData* tlogData,
|
|
Reference<LogData> logData,
|
|
TaskPriority taskID) {
|
|
return eraseMessagesBefore(this, before, tlogData, logData, taskID);
|
|
}
|
|
};
|
|
|
|
Map<Version, std::pair<IDiskQueue::location, IDiskQueue::location>>
|
|
versionLocation; // For the version of each entry that was push()ed, the [start, end) location of the serialized
|
|
// bytes
|
|
|
|
/*
|
|
Popped version tracking contract needed by log system to implement ILogCursor::popped():
|
|
|
|
- Log server tracks for each (possible) tag a popped_version
|
|
Impl: TagData::popped (in memory) and persistTagPoppedKeys (in persistentData)
|
|
- popped_version(tag) is <= the maximum version for which log server (or a predecessor) is ever asked to pop the
|
|
tag Impl: Only increased by tLogPop() in response to either a pop request or recovery from a predecessor
|
|
- popped_version(tag) is > the maximum version for which log server is unable to peek messages due to previous
|
|
pops (on this server or a predecessor) Impl: Increased by tLogPop() atomically with erasing messages from memory;
|
|
persisted by updatePersistentData() atomically with erasing messages from store; messages are not erased from queue
|
|
where popped_version is not persisted
|
|
- LockTLogReply returns all tags which either have messages, or which have nonzero popped_versions
|
|
Impl: tag_data is present for all such tags
|
|
- peek(tag, v) returns the popped_version for tag if that is greater than v
|
|
Impl: Check tag_data->popped (after all waits)
|
|
*/
|
|
|
|
AsyncTrigger stopCommit;
|
|
bool stopped, initialized;
|
|
DBRecoveryCount recoveryCount;
|
|
|
|
// If persistentDataVersion != persistentDurableDataVersion,
|
|
// then spilling is happening from persistentDurableDataVersion to persistentDataVersion.
|
|
// Data less than persistentDataDurableVersion is spilled on disk (or fully popped from the TLog);
|
|
VersionMetricHandle persistentDataVersion,
|
|
persistentDataDurableVersion; // The last version number in the portion of the log (written|durable) to
|
|
// persistentData
|
|
NotifiedVersion version;
|
|
NotifiedVersion queueCommittedVersion; // The disk queue has committed up until the queueCommittedVersion version.
|
|
Version queueCommittingVersion;
|
|
Version knownCommittedVersion; // The maximum version that a proxy has told us that is committed (all TLogs have
|
|
// ack'd a commit for this version).
|
|
Version durableKnownCommittedVersion, minKnownCommittedVersion;
|
|
Version queuePoppedVersion; // The disk queue has been popped up until the location which represents this version.
|
|
Version minPoppedTagVersion;
|
|
Tag minPoppedTag; // The tag that makes tLog hold its data and cause tLog's disk queue increasing.
|
|
|
|
Deque<std::pair<Version, Standalone<VectorRef<uint8_t>>>> messageBlocks;
|
|
std::vector<std::vector<Reference<TagData>>> tag_data; // tag.locality | tag.id
|
|
int unpoppedRecoveredTags;
|
|
std::map<Tag, Promise<Void>> waitingTags;
|
|
|
|
Reference<TagData> getTagData(Tag tag) {
|
|
int idx = tag.toTagDataIndex();
|
|
if (idx >= tag_data.size()) {
|
|
tag_data.resize(idx + 1);
|
|
}
|
|
if (tag.id >= tag_data[idx].size()) {
|
|
tag_data[idx].resize(tag.id + 1);
|
|
}
|
|
return tag_data[idx][tag.id];
|
|
}
|
|
|
|
// only callable after getTagData returns a null reference
|
|
Reference<TagData> createTagData(Tag tag,
|
|
Version popped,
|
|
bool nothingPersistent,
|
|
bool poppedRecently,
|
|
bool unpoppedRecovered) {
|
|
if (tag.locality != tagLocalityLogRouter && tag.locality != tagLocalityTxs && tag != txsTag && allTags.size() &&
|
|
!allTags.count(tag) && popped <= recoveredAt) {
|
|
popped = recoveredAt + 1;
|
|
}
|
|
auto newTagData = makeReference<TagData>(tag, popped, 0, nothingPersistent, poppedRecently, unpoppedRecovered);
|
|
tag_data[tag.toTagDataIndex()][tag.id] = newTagData;
|
|
return newTagData;
|
|
}
|
|
|
|
Map<Version, std::pair<int, int>> version_sizes;
|
|
|
|
CounterCollection cc;
|
|
Counter bytesInput;
|
|
Counter bytesDurable;
|
|
Counter blockingPeeks;
|
|
Counter blockingPeekTimeouts;
|
|
Counter emptyPeeks;
|
|
Counter nonEmptyPeeks;
|
|
std::map<Tag, LatencySample> blockingPeekLatencies;
|
|
std::map<Tag, LatencySample> peekVersionCounts;
|
|
|
|
UID logId;
|
|
ProtocolVersion protocolVersion;
|
|
Version newPersistentDataVersion;
|
|
Future<Void> removed;
|
|
PromiseStream<Future<Void>> addActor;
|
|
TLogData* tLogData;
|
|
Promise<Void> recoveryComplete, committingQueue;
|
|
Version unrecoveredBefore, recoveredAt;
|
|
Version recoveryTxnVersion;
|
|
Promise<Void> recoveryTxnReceived;
|
|
|
|
struct PeekTrackerData {
|
|
std::map<int, Promise<std::pair<Version, bool>>>
|
|
sequence_version; // second: Version is peeked begin version. bool is onlySpilled
|
|
double lastUpdate;
|
|
|
|
Tag tag;
|
|
|
|
double lastLogged;
|
|
int64_t totalPeeks;
|
|
int64_t replyBytes;
|
|
int64_t duplicatePeeks;
|
|
double queueTime;
|
|
double queueMax;
|
|
double blockTime;
|
|
double blockMax;
|
|
double workTime;
|
|
double workMax;
|
|
|
|
int64_t unblockedPeeks;
|
|
double idleTime;
|
|
double idleMax;
|
|
|
|
PeekTrackerData() : lastUpdate(0) { resetMetrics(); }
|
|
|
|
void resetMetrics() {
|
|
lastLogged = now();
|
|
totalPeeks = 0;
|
|
replyBytes = 0;
|
|
duplicatePeeks = 0;
|
|
queueTime = 0;
|
|
queueMax = 0;
|
|
blockTime = 0;
|
|
blockMax = 0;
|
|
workTime = 0;
|
|
workMax = 0;
|
|
unblockedPeeks = 0;
|
|
idleTime = 0;
|
|
idleMax = 0;
|
|
}
|
|
};
|
|
|
|
std::map<UID, PeekTrackerData> peekTracker;
|
|
|
|
Reference<AsyncVar<Reference<ILogSystem>>> logSystem;
|
|
Tag remoteTag;
|
|
bool isPrimary;
|
|
int logRouterTags;
|
|
Version logRouterPoppedVersion, logRouterPopToVersion;
|
|
int8_t locality;
|
|
UID recruitmentID;
|
|
TLogSpillType logSpillType;
|
|
std::set<Tag> allTags;
|
|
Future<Void> terminated;
|
|
FlowLock execOpLock;
|
|
bool execOpCommitInProgress;
|
|
int txsTags;
|
|
|
|
std::map<Tag, Version> toBePopped; // map of Tag->Version for all the pops
|
|
// that came when ignorePopRequest was set
|
|
|
|
explicit LogData(TLogData* tLogData,
|
|
TLogInterface interf,
|
|
Tag remoteTag,
|
|
bool isPrimary,
|
|
int logRouterTags,
|
|
int txsTags,
|
|
UID recruitmentID,
|
|
ProtocolVersion protocolVersion,
|
|
TLogSpillType logSpillType,
|
|
std::vector<Tag> tags,
|
|
std::string context)
|
|
: stopped(false), initialized(false), queueCommittingVersion(0), knownCommittedVersion(0),
|
|
durableKnownCommittedVersion(0), minKnownCommittedVersion(0), queuePoppedVersion(0), minPoppedTagVersion(0),
|
|
minPoppedTag(invalidTag), unpoppedRecoveredTags(0), cc("TLog", interf.id().toString()),
|
|
bytesInput("BytesInput", cc), bytesDurable("BytesDurable", cc), blockingPeeks("BlockingPeeks", cc),
|
|
blockingPeekTimeouts("BlockingPeekTimeouts", cc), emptyPeeks("EmptyPeeks", cc),
|
|
nonEmptyPeeks("NonEmptyPeeks", cc), logId(interf.id()), protocolVersion(protocolVersion),
|
|
newPersistentDataVersion(invalidVersion), tLogData(tLogData), unrecoveredBefore(1), recoveredAt(1),
|
|
recoveryTxnVersion(1), logSystem(new AsyncVar<Reference<ILogSystem>>()), remoteTag(remoteTag),
|
|
isPrimary(isPrimary), logRouterTags(logRouterTags), logRouterPoppedVersion(0), logRouterPopToVersion(0),
|
|
locality(tagLocalityInvalid), recruitmentID(recruitmentID), logSpillType(logSpillType),
|
|
allTags(tags.begin(), tags.end()), terminated(tLogData->terminated.getFuture()), execOpCommitInProgress(false),
|
|
txsTags(txsTags) {
|
|
startRole(Role::TRANSACTION_LOG,
|
|
interf.id(),
|
|
tLogData->workerID,
|
|
{ { "SharedTLog", tLogData->dbgid.shortString() } },
|
|
context);
|
|
addActor.send(traceRole(Role::TRANSACTION_LOG, interf.id()));
|
|
|
|
persistentDataVersion.init(LiteralStringRef("TLog.PersistentDataVersion"), cc.id);
|
|
persistentDataDurableVersion.init(LiteralStringRef("TLog.PersistentDataDurableVersion"), cc.id);
|
|
version.initMetric(LiteralStringRef("TLog.Version"), cc.id);
|
|
queueCommittedVersion.initMetric(LiteralStringRef("TLog.QueueCommittedVersion"), cc.id);
|
|
|
|
specialCounter(cc, "Version", [this]() { return this->version.get(); });
|
|
specialCounter(cc, "QueueCommittedVersion", [this]() { return this->queueCommittedVersion.get(); });
|
|
specialCounter(cc, "PersistentDataVersion", [this]() { return this->persistentDataVersion; });
|
|
specialCounter(cc, "PersistentDataDurableVersion", [this]() { return this->persistentDataDurableVersion; });
|
|
specialCounter(cc, "KnownCommittedVersion", [this]() { return this->knownCommittedVersion; });
|
|
specialCounter(cc, "QueuePoppedVersion", [this]() { return this->queuePoppedVersion; });
|
|
specialCounter(cc, "MinPoppedTagVersion", [this]() { return this->minPoppedTagVersion; });
|
|
// The locality and id of the tag that is responsible for making the TLog hold onto its oldest piece of data.
|
|
// If disk queues are growing and no one is sure why, then you shall look at this to find the tag responsible
|
|
// for why the TLog thinks it can't throw away data.
|
|
specialCounter(cc, "MinPoppedTagLocality", [this]() { return this->minPoppedTag.locality; });
|
|
specialCounter(cc, "MinPoppedTagId", [this]() { return this->minPoppedTag.id; });
|
|
specialCounter(cc, "SharedBytesInput", [tLogData]() { return tLogData->bytesInput; });
|
|
specialCounter(cc, "SharedBytesDurable", [tLogData]() { return tLogData->bytesDurable; });
|
|
specialCounter(cc, "SharedOverheadBytesInput", [tLogData]() { return tLogData->overheadBytesInput; });
|
|
specialCounter(cc, "SharedOverheadBytesDurable", [tLogData]() { return tLogData->overheadBytesDurable; });
|
|
specialCounter(cc, "PeekMemoryReserved", [tLogData]() { return tLogData->peekMemoryLimiter.activePermits(); });
|
|
specialCounter(cc, "PeekMemoryRequestsStalled", [tLogData]() { return tLogData->peekMemoryLimiter.waiters(); });
|
|
specialCounter(cc, "Generation", [this]() { return this->recoveryCount; });
|
|
specialCounter(cc, "ActivePeekStreams", [tLogData]() { return tLogData->activePeekStreams; });
|
|
}
|
|
|
|
~LogData() {
|
|
endRole(Role::TRANSACTION_LOG, logId, "Error", true);
|
|
|
|
if (!terminated.isReady()) {
|
|
tLogData->bytesDurable += bytesInput.getValue() - bytesDurable.getValue();
|
|
TraceEvent("TLogBytesWhenRemoved", logId)
|
|
.detail("SharedBytesInput", tLogData->bytesInput)
|
|
.detail("SharedBytesDurable", tLogData->bytesDurable)
|
|
.detail("LocalBytesInput", bytesInput.getValue())
|
|
.detail("LocalBytesDurable", bytesDurable.getValue());
|
|
|
|
ASSERT_ABORT(tLogData->bytesDurable <= tLogData->bytesInput);
|
|
|
|
Key logIdKey = BinaryWriter::toValue(logId, Unversioned());
|
|
tLogData->persistentData->clear(singleKeyRange(logIdKey.withPrefix(persistCurrentVersionKeys.begin)));
|
|
tLogData->persistentData->clear(
|
|
singleKeyRange(logIdKey.withPrefix(persistKnownCommittedVersionKeys.begin)));
|
|
tLogData->persistentData->clear(singleKeyRange(logIdKey.withPrefix(persistLocalityKeys.begin)));
|
|
tLogData->persistentData->clear(singleKeyRange(logIdKey.withPrefix(persistLogRouterTagsKeys.begin)));
|
|
tLogData->persistentData->clear(singleKeyRange(logIdKey.withPrefix(persistTxsTagsKeys.begin)));
|
|
tLogData->persistentData->clear(singleKeyRange(logIdKey.withPrefix(persistRecoveryCountKeys.begin)));
|
|
tLogData->persistentData->clear(singleKeyRange(logIdKey.withPrefix(persistProtocolVersionKeys.begin)));
|
|
tLogData->persistentData->clear(singleKeyRange(logIdKey.withPrefix(persistTLogSpillTypeKeys.begin)));
|
|
tLogData->persistentData->clear(singleKeyRange(logIdKey.withPrefix(persistRecoveryLocationKey)));
|
|
Key msgKey = logIdKey.withPrefix(persistTagMessagesKeys.begin);
|
|
tLogData->persistentData->clear(KeyRangeRef(msgKey, strinc(msgKey)));
|
|
Key msgRefKey = logIdKey.withPrefix(persistTagMessageRefsKeys.begin);
|
|
tLogData->persistentData->clear(KeyRangeRef(msgRefKey, strinc(msgRefKey)));
|
|
Key poppedKey = logIdKey.withPrefix(persistTagPoppedKeys.begin);
|
|
tLogData->persistentData->clear(KeyRangeRef(poppedKey, strinc(poppedKey)));
|
|
}
|
|
|
|
for (auto it = peekTracker.begin(); it != peekTracker.end(); ++it) {
|
|
for (auto seq : it->second.sequence_version) {
|
|
if (!seq.second.isSet()) {
|
|
seq.second.sendError(timed_out());
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
LogEpoch epoch() const { return recoveryCount; }
|
|
|
|
bool shouldSpillByValue(Tag t) const {
|
|
switch (logSpillType) {
|
|
case TLogSpillType::VALUE:
|
|
return true;
|
|
case TLogSpillType::REFERENCE:
|
|
return t.locality == tagLocalityTxs || t == txsTag;
|
|
default:
|
|
ASSERT(false);
|
|
return false;
|
|
}
|
|
}
|
|
|
|
bool shouldSpillByReference(Tag t) const { return !shouldSpillByValue(t); }
|
|
|
|
void unblockWaitingPeeks() {
|
|
if (SERVER_KNOBS->ENABLE_VERSION_VECTOR) {
|
|
for (auto& iter : waitingTags) {
|
|
TraceEvent("UnblockWaitingPeeks", tLogData->dbgid)
|
|
.detail("LogId", logId)
|
|
.detail("Tag", iter.first.toString());
|
|
iter.second.send(Void());
|
|
}
|
|
waitingTags.clear();
|
|
}
|
|
}
|
|
};
|
|
|
|
template <class T>
|
|
void TLogQueue::push(T const& qe, Reference<LogData> logData) {
|
|
BinaryWriter wr(Unversioned()); // outer framing is not versioned
|
|
wr << uint32_t(0);
|
|
IncludeVersion(ProtocolVersion::withTLogQueueEntryRef()).write(wr); // payload is versioned
|
|
wr << qe;
|
|
wr << uint8_t(1);
|
|
*(uint32_t*)wr.getData() = wr.getLength() - sizeof(uint32_t) - sizeof(uint8_t);
|
|
const IDiskQueue::location startloc = queue->getNextPushLocation();
|
|
// FIXME: push shouldn't return anything. We should call getNextPushLocation() again.
|
|
const IDiskQueue::location endloc = queue->push(wr.toValue());
|
|
//TraceEvent("TLogQueueVersionWritten", dbgid).detail("Size", wr.getLength() - sizeof(uint32_t) - sizeof(uint8_t)).detail("Loc", loc);
|
|
logData->versionLocation[qe.version] = std::make_pair(startloc, endloc);
|
|
}
|
|
|
|
void TLogQueue::forgetBefore(Version upToVersion, Reference<LogData> logData) {
|
|
// Keep only the given and all subsequent version numbers
|
|
// Find the first version >= upTo
|
|
auto v = logData->versionLocation.lower_bound(upToVersion);
|
|
if (v == logData->versionLocation.begin())
|
|
return;
|
|
|
|
if (v == logData->versionLocation.end()) {
|
|
v = logData->versionLocation.lastItem();
|
|
} else {
|
|
v.decrementNonEnd();
|
|
}
|
|
|
|
logData->versionLocation.erase(logData->versionLocation.begin(),
|
|
v); // ... and then we erase that previous version and all prior versions
|
|
}
|
|
|
|
void TLogQueue::pop(IDiskQueue::location upToLocation) {
|
|
queue->pop(upToLocation);
|
|
}
|
|
|
|
void TLogQueue::updateVersionSizes(const TLogQueueEntry& result,
|
|
TLogData* tLog,
|
|
IDiskQueue::location start,
|
|
IDiskQueue::location end) {
|
|
auto it = tLog->id_data.find(result.id);
|
|
if (it != tLog->id_data.end()) {
|
|
it->second->versionLocation[result.version] = std::make_pair(start, end);
|
|
}
|
|
}
|
|
|
|
ACTOR Future<Void> tLogLock(TLogData* self, ReplyPromise<TLogLockResult> reply, Reference<LogData> logData) {
|
|
state Version stopVersion = logData->version.get();
|
|
|
|
TEST(true); // TLog stopped by recovering cluster-controller
|
|
TEST(logData->stopped); // logData already stopped
|
|
TEST(!logData->stopped); // logData not yet stopped
|
|
|
|
TraceEvent("TLogStop", logData->logId)
|
|
.detail("Ver", stopVersion)
|
|
.detail("IsStopped", logData->stopped)
|
|
.detail("QueueCommitted", logData->queueCommittedVersion.get());
|
|
|
|
logData->stopped = true;
|
|
logData->unblockWaitingPeeks();
|
|
if (!logData->recoveryComplete.isSet()) {
|
|
logData->recoveryComplete.sendError(end_of_stream());
|
|
}
|
|
|
|
// Lock once the current version has been committed
|
|
wait(logData->queueCommittedVersion.whenAtLeast(stopVersion));
|
|
|
|
ASSERT(stopVersion == logData->version.get());
|
|
|
|
TLogLockResult result;
|
|
result.end = stopVersion;
|
|
result.knownCommittedVersion = logData->knownCommittedVersion;
|
|
result.unknownCommittedVersions = self->unknownCommittedVersions;
|
|
result.id = self->dbgid;
|
|
|
|
TraceEvent("TLogStop2", self->dbgid)
|
|
.detail("LogId", logData->logId)
|
|
.detail("Ver", stopVersion)
|
|
.detail("IsStopped", logData->stopped)
|
|
.detail("QueueCommitted", logData->queueCommittedVersion.get())
|
|
.detail("KnownCommitted", result.knownCommittedVersion);
|
|
|
|
reply.send(result);
|
|
return Void();
|
|
}
|
|
|
|
void updatePersistentPopped(TLogData* self, Reference<LogData> logData, Reference<LogData::TagData> data) {
|
|
if (!data->poppedRecently)
|
|
return;
|
|
self->persistentData->set(
|
|
KeyValueRef(persistTagPoppedKey(logData->logId, data->tag), persistTagPoppedValue(data->popped)));
|
|
data->poppedRecently = false;
|
|
data->persistentPopped = data->popped;
|
|
|
|
if (data->nothingPersistent)
|
|
return;
|
|
|
|
if (logData->shouldSpillByValue(data->tag)) {
|
|
self->persistentData->clear(KeyRangeRef(persistTagMessagesKey(logData->logId, data->tag, Version(0)),
|
|
persistTagMessagesKey(logData->logId, data->tag, data->popped)));
|
|
} else {
|
|
self->persistentData->clear(KeyRangeRef(persistTagMessageRefsKey(logData->logId, data->tag, Version(0)),
|
|
persistTagMessageRefsKey(logData->logId, data->tag, data->popped)));
|
|
}
|
|
|
|
if (data->popped > logData->persistentDataVersion) {
|
|
data->nothingPersistent = true;
|
|
}
|
|
}
|
|
|
|
ACTOR Future<Void> updatePoppedLocation(TLogData* self, Reference<LogData> logData, Reference<LogData::TagData> data) {
|
|
// For anything spilled by value, we do not need to track its popped location.
|
|
if (logData->shouldSpillByValue(data->tag)) {
|
|
return Void();
|
|
}
|
|
|
|
if (data->versionForPoppedLocation >= data->persistentPopped)
|
|
return Void();
|
|
data->versionForPoppedLocation = data->persistentPopped;
|
|
|
|
// Use persistentPopped and not popped, so that a pop update received after spilling doesn't cause
|
|
// us to remove data that still is pointed to by SpilledData in the btree.
|
|
if (data->persistentPopped <= logData->persistentDataVersion) {
|
|
// Recover the next needed location in the Disk Queue from the index.
|
|
RangeResult kvrefs = wait(self->persistentData->readRange(
|
|
KeyRangeRef(persistTagMessageRefsKey(logData->logId, data->tag, data->persistentPopped),
|
|
persistTagMessageRefsKey(logData->logId, data->tag, logData->persistentDataVersion + 1)),
|
|
1));
|
|
|
|
if (kvrefs.empty()) {
|
|
// Nothing was persistent after all.
|
|
data->nothingPersistent = true;
|
|
} else {
|
|
VectorRef<SpilledData> spilledData;
|
|
BinaryReader r(kvrefs[0].value, AssumeVersion(logData->protocolVersion));
|
|
r >> spilledData;
|
|
|
|
for (const SpilledData& sd : spilledData) {
|
|
if (sd.version >= data->persistentPopped) {
|
|
data->poppedLocation = sd.start;
|
|
data->versionForPoppedLocation = sd.version;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
if (data->persistentPopped >= logData->persistentDataVersion || data->nothingPersistent) {
|
|
// Then the location must be in memory.
|
|
auto locationIter = logData->versionLocation.lower_bound(data->persistentPopped);
|
|
if (locationIter != logData->versionLocation.end()) {
|
|
data->poppedLocation = locationIter->value.first;
|
|
data->versionForPoppedLocation = locationIter->key;
|
|
} else {
|
|
// No data on disk and no data in RAM.
|
|
// This TLog instance will be removed soon anyway, so we temporarily freeze our poppedLocation
|
|
// to avoid trying to track what the ending location of this TLog instance was.
|
|
}
|
|
}
|
|
|
|
return Void();
|
|
}
|
|
|
|
// It runs against the oldest TLog instance, calculates the first location in the disk queue that contains un-popped
|
|
// data, and then issues a pop to the disk queue at that location so that anything earlier can be
|
|
// removed/forgotten/overwritten. In effect, it applies the effect of TLogPop RPCs to disk.
|
|
ACTOR Future<Void> popDiskQueue(TLogData* self, Reference<LogData> logData) {
|
|
if (!logData->initialized)
|
|
return Void();
|
|
|
|
std::vector<Future<Void>> updates;
|
|
for (int tagLocality = 0; tagLocality < logData->tag_data.size(); tagLocality++) {
|
|
for (int tagId = 0; tagId < logData->tag_data[tagLocality].size(); tagId++) {
|
|
Reference<LogData::TagData> tagData = logData->tag_data[tagLocality][tagId];
|
|
if (tagData) {
|
|
updates.push_back(updatePoppedLocation(self, logData, tagData));
|
|
}
|
|
}
|
|
}
|
|
wait(waitForAll(updates));
|
|
|
|
IDiskQueue::location minLocation = 0;
|
|
Version minVersion = 0;
|
|
auto locationIter = logData->versionLocation.lower_bound(logData->persistentDataVersion);
|
|
if (locationIter != logData->versionLocation.end()) {
|
|
minLocation = locationIter->value.first;
|
|
minVersion = locationIter->key;
|
|
}
|
|
logData->minPoppedTagVersion = std::numeric_limits<Version>::max();
|
|
|
|
for (int tagLocality = 0; tagLocality < logData->tag_data.size(); tagLocality++) {
|
|
for (int tagId = 0; tagId < logData->tag_data[tagLocality].size(); tagId++) {
|
|
Reference<LogData::TagData> tagData = logData->tag_data[tagLocality][tagId];
|
|
if (tagData && logData->shouldSpillByReference(tagData->tag)) {
|
|
if (!tagData->nothingPersistent) {
|
|
minLocation = std::min(minLocation, tagData->poppedLocation);
|
|
minVersion = std::min(minVersion, tagData->popped);
|
|
}
|
|
if ((!tagData->nothingPersistent || tagData->versionMessages.size()) &&
|
|
tagData->popped < logData->minPoppedTagVersion) {
|
|
logData->minPoppedTagVersion = tagData->popped;
|
|
logData->minPoppedTag = tagData->tag;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
if (self->queueCommitEnd.get() > 0) {
|
|
Version lastCommittedVersion = logData->queueCommittedVersion.get();
|
|
IDiskQueue::location lastCommittedLocation = minLocation;
|
|
auto locationIter = logData->versionLocation.lower_bound(lastCommittedVersion);
|
|
if (locationIter != logData->versionLocation.end()) {
|
|
lastCommittedLocation = locationIter->value.first;
|
|
}
|
|
self->persistentQueue->pop(std::min(minLocation, lastCommittedLocation));
|
|
logData->queuePoppedVersion = std::max(logData->queuePoppedVersion, minVersion);
|
|
}
|
|
|
|
return Void();
|
|
}
|
|
|
|
ACTOR Future<Void> updatePersistentData(TLogData* self, Reference<LogData> logData, Version newPersistentDataVersion) {
|
|
state BinaryWriter wr(Unversioned());
|
|
// PERSIST: Changes self->persistentDataVersion and writes and commits the relevant changes
|
|
ASSERT(newPersistentDataVersion <= logData->version.get());
|
|
ASSERT(newPersistentDataVersion <= logData->queueCommittedVersion.get());
|
|
ASSERT(newPersistentDataVersion > logData->persistentDataVersion);
|
|
ASSERT(logData->persistentDataVersion == logData->persistentDataDurableVersion);
|
|
logData->newPersistentDataVersion = newPersistentDataVersion;
|
|
|
|
//TraceEvent("UpdatePersistentData", self->dbgid).detail("Seq", newPersistentDataSeq);
|
|
|
|
state bool anyData = false;
|
|
|
|
// For all existing tags
|
|
state int tagLocality = 0;
|
|
state int tagId = 0;
|
|
|
|
for (tagLocality = 0; tagLocality < logData->tag_data.size(); tagLocality++) {
|
|
for (tagId = 0; tagId < logData->tag_data[tagLocality].size(); tagId++) {
|
|
state Reference<LogData::TagData> tagData = logData->tag_data[tagLocality][tagId];
|
|
if (tagData) {
|
|
wait(tagData->eraseMessagesBefore(tagData->popped, self, logData, TaskPriority::UpdateStorage));
|
|
state Version currentVersion = 0;
|
|
// Clear recently popped versions from persistentData if necessary
|
|
updatePersistentPopped(self, logData, tagData);
|
|
state Version lastVersion = std::numeric_limits<Version>::min();
|
|
state IDiskQueue::location firstLocation = std::numeric_limits<IDiskQueue::location>::max();
|
|
// Transfer unpopped messages with version numbers less than newPersistentDataVersion to persistentData
|
|
state std::deque<std::pair<Version, LengthPrefixedStringRef>>::iterator msg =
|
|
tagData->versionMessages.begin();
|
|
state int refSpilledTagCount = 0;
|
|
wr = BinaryWriter(AssumeVersion(logData->protocolVersion));
|
|
// We prefix our spilled locations with a count, so that we can read this back out as a VectorRef.
|
|
wr << uint32_t(0);
|
|
while (msg != tagData->versionMessages.end() && msg->first <= newPersistentDataVersion) {
|
|
currentVersion = msg->first;
|
|
anyData = true;
|
|
tagData->nothingPersistent = false;
|
|
|
|
if (logData->shouldSpillByValue(tagData->tag)) {
|
|
wr = BinaryWriter(Unversioned());
|
|
for (; msg != tagData->versionMessages.end() && msg->first == currentVersion; ++msg) {
|
|
wr << msg->second.toStringRef();
|
|
}
|
|
self->persistentData->set(KeyValueRef(
|
|
persistTagMessagesKey(logData->logId, tagData->tag, currentVersion), wr.toValue()));
|
|
} else {
|
|
// spill everything else by reference
|
|
const IDiskQueue::location begin = logData->versionLocation[currentVersion].first;
|
|
const IDiskQueue::location end = logData->versionLocation[currentVersion].second;
|
|
ASSERT(end > begin && end.lo - begin.lo < std::numeric_limits<uint32_t>::max());
|
|
uint32_t length = static_cast<uint32_t>(end.lo - begin.lo);
|
|
refSpilledTagCount++;
|
|
|
|
uint32_t size = 0;
|
|
for (; msg != tagData->versionMessages.end() && msg->first == currentVersion; ++msg) {
|
|
// Fast forward until we find a new version.
|
|
size += msg->second.expectedSize();
|
|
}
|
|
|
|
SpilledData spilledData(currentVersion, begin, length, size);
|
|
wr << spilledData;
|
|
|
|
lastVersion = std::max(currentVersion, lastVersion);
|
|
firstLocation = std::min(begin, firstLocation);
|
|
|
|
if ((wr.getLength() + sizeof(SpilledData) >
|
|
SERVER_KNOBS->TLOG_SPILL_REFERENCE_MAX_BYTES_PER_BATCH)) {
|
|
*(uint32_t*)wr.getData() = refSpilledTagCount;
|
|
self->persistentData->set(KeyValueRef(
|
|
persistTagMessageRefsKey(logData->logId, tagData->tag, lastVersion), wr.toValue()));
|
|
tagData->poppedLocation = std::min(tagData->poppedLocation, firstLocation);
|
|
refSpilledTagCount = 0;
|
|
wr = BinaryWriter(AssumeVersion(logData->protocolVersion));
|
|
wr << uint32_t(0);
|
|
}
|
|
|
|
Future<Void> f = yield(TaskPriority::UpdateStorage);
|
|
if (!f.isReady()) {
|
|
wait(f);
|
|
msg = std::upper_bound(
|
|
tagData->versionMessages.begin(),
|
|
tagData->versionMessages.end(),
|
|
std::make_pair(currentVersion, LengthPrefixedStringRef()),
|
|
[](const auto& l, const auto& r) -> bool { return l.first < r.first; });
|
|
}
|
|
}
|
|
}
|
|
if (refSpilledTagCount > 0) {
|
|
*(uint32_t*)wr.getData() = refSpilledTagCount;
|
|
self->persistentData->set(
|
|
KeyValueRef(persistTagMessageRefsKey(logData->logId, tagData->tag, lastVersion), wr.toValue()));
|
|
tagData->poppedLocation = std::min(tagData->poppedLocation, firstLocation);
|
|
}
|
|
|
|
wait(yield(TaskPriority::UpdateStorage));
|
|
}
|
|
}
|
|
}
|
|
|
|
auto locationIter = logData->versionLocation.lower_bound(newPersistentDataVersion);
|
|
if (locationIter != logData->versionLocation.end()) {
|
|
self->persistentData->set(
|
|
KeyValueRef(persistRecoveryLocationKey, BinaryWriter::toValue(locationIter->value.first, Unversioned())));
|
|
}
|
|
|
|
self->persistentData->set(
|
|
KeyValueRef(BinaryWriter::toValue(logData->logId, Unversioned()).withPrefix(persistCurrentVersionKeys.begin),
|
|
BinaryWriter::toValue(newPersistentDataVersion, Unversioned())));
|
|
self->persistentData->set(KeyValueRef(
|
|
BinaryWriter::toValue(logData->logId, Unversioned()).withPrefix(persistKnownCommittedVersionKeys.begin),
|
|
BinaryWriter::toValue(logData->knownCommittedVersion, Unversioned())));
|
|
logData->persistentDataVersion = newPersistentDataVersion;
|
|
|
|
wait(self->persistentData->commit()); // SOMEDAY: This seems to be running pretty often, should we slow it down???
|
|
wait(delay(0, TaskPriority::UpdateStorage));
|
|
|
|
// Now that the changes we made to persistentData are durable, erase the data we moved from memory and the queue,
|
|
// increase bytesDurable accordingly, and update persistentDataDurableVersion.
|
|
|
|
TEST(anyData); // TLog moved data to persistentData
|
|
logData->persistentDataDurableVersion = newPersistentDataVersion;
|
|
for (tagLocality = 0; tagLocality < logData->tag_data.size(); tagLocality++) {
|
|
for (tagId = 0; tagId < logData->tag_data[tagLocality].size(); tagId++) {
|
|
if (logData->tag_data[tagLocality][tagId]) {
|
|
wait(logData->tag_data[tagLocality][tagId]->eraseMessagesBefore(
|
|
newPersistentDataVersion + 1, self, logData, TaskPriority::UpdateStorage));
|
|
wait(yield(TaskPriority::UpdateStorage));
|
|
}
|
|
}
|
|
}
|
|
|
|
logData->version_sizes.erase(logData->version_sizes.begin(),
|
|
logData->version_sizes.lower_bound(logData->persistentDataDurableVersion));
|
|
|
|
wait(yield(TaskPriority::UpdateStorage));
|
|
|
|
while (!logData->messageBlocks.empty() && logData->messageBlocks.front().first <= newPersistentDataVersion) {
|
|
int64_t bytesErased =
|
|
int64_t(logData->messageBlocks.front().second.size()) * SERVER_KNOBS->TLOG_MESSAGE_BLOCK_OVERHEAD_FACTOR;
|
|
logData->bytesDurable += bytesErased;
|
|
self->bytesDurable += bytesErased;
|
|
logData->messageBlocks.pop_front();
|
|
wait(yield(TaskPriority::UpdateStorage));
|
|
}
|
|
|
|
if (logData->bytesDurable.getValue() > logData->bytesInput.getValue() || self->bytesDurable > self->bytesInput) {
|
|
TraceEvent(SevError, "BytesDurableTooLarge", logData->logId)
|
|
.detail("SharedBytesInput", self->bytesInput)
|
|
.detail("SharedBytesDurable", self->bytesDurable)
|
|
.detail("LocalBytesInput", logData->bytesInput.getValue())
|
|
.detail("LocalBytesDurable", logData->bytesDurable.getValue());
|
|
}
|
|
|
|
ASSERT(logData->bytesDurable.getValue() <= logData->bytesInput.getValue());
|
|
ASSERT(self->bytesDurable <= self->bytesInput);
|
|
|
|
if (self->queueCommitEnd.get() > 0) {
|
|
// FIXME: Maintain a heap of tags ordered by version to make this O(1) instead of O(n).
|
|
Version minVersion = std::numeric_limits<Version>::max();
|
|
for (tagLocality = 0; tagLocality < logData->tag_data.size(); tagLocality++) {
|
|
for (tagId = 0; tagId < logData->tag_data[tagLocality].size(); tagId++) {
|
|
Reference<LogData::TagData> tagData = logData->tag_data[tagLocality][tagId];
|
|
if (tagData) {
|
|
if (logData->shouldSpillByValue(tagData->tag)) {
|
|
minVersion = std::min(minVersion, newPersistentDataVersion);
|
|
} else {
|
|
minVersion = std::min(minVersion, tagData->popped);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
if (minVersion != std::numeric_limits<Version>::max()) {
|
|
self->persistentQueue->forgetBefore(
|
|
newPersistentDataVersion,
|
|
logData); // SOMEDAY: this can cause a slow task (~0.5ms), presumably from erasing too many versions.
|
|
// Should we limit the number of versions cleared at a time?
|
|
}
|
|
}
|
|
logData->newPersistentDataVersion = invalidVersion;
|
|
|
|
return Void();
|
|
}
|
|
|
|
ACTOR Future<Void> tLogPopCore(TLogData* self, Tag inputTag, Version to, Reference<LogData> logData) {
|
|
state Version upTo = to;
|
|
int8_t tagLocality = inputTag.locality;
|
|
if (isPseudoLocality(tagLocality)) {
|
|
if (logData->logSystem->get().isValid()) {
|
|
// if the configuration change from multi-region to single region mode, the delayed pop created during
|
|
// multi-region stage should be skipped. Same thing applies to the backup worker
|
|
if (isPseudoLocality(inputTag.locality) &&
|
|
logData->logSystem->get()->hasPseudoLocality(inputTag.locality)) {
|
|
upTo = logData->logSystem->get()->popPseudoLocalityTag(inputTag, to);
|
|
tagLocality = tagLocalityLogRouter;
|
|
} else {
|
|
ASSERT_WE_THINK(tagLocality == tagLocalityLogRouterMapped);
|
|
TraceEvent(SevWarn, "TLogPopNoPseudoLocality", self->dbgid)
|
|
.detail("Locality", tagLocality)
|
|
.detail("Version", upTo);
|
|
return Void();
|
|
}
|
|
} else {
|
|
TraceEvent(SevWarn, "TLogPopNoLogSystem", self->dbgid)
|
|
.detail("Locality", tagLocality)
|
|
.detail("Version", upTo);
|
|
return Void();
|
|
}
|
|
}
|
|
state Tag tag(tagLocality, inputTag.id);
|
|
auto tagData = logData->getTagData(tag);
|
|
if (!tagData) {
|
|
tagData = logData->createTagData(tag, upTo, true, true, false);
|
|
} else if (upTo > tagData->popped) {
|
|
tagData->popped = upTo;
|
|
tagData->poppedRecently = true;
|
|
|
|
if (tagData->unpoppedRecovered && upTo > logData->recoveredAt) {
|
|
tagData->unpoppedRecovered = false;
|
|
logData->unpoppedRecoveredTags--;
|
|
TraceEvent("TLogPoppedTag", logData->logId)
|
|
.detail("Tags", logData->unpoppedRecoveredTags)
|
|
.detail("Tag", tag.toString())
|
|
.detail("DurableKCVer", logData->durableKnownCommittedVersion)
|
|
.detail("RecoveredAt", logData->recoveredAt);
|
|
if (logData->unpoppedRecoveredTags == 0 && logData->durableKnownCommittedVersion >= logData->recoveredAt &&
|
|
logData->recoveryComplete.canBeSet()) {
|
|
logData->recoveryComplete.send(Void());
|
|
}
|
|
}
|
|
|
|
uint64_t PoppedVersionLag = logData->persistentDataDurableVersion - logData->queuePoppedVersion;
|
|
if (SERVER_KNOBS->ENABLE_DETAILED_TLOG_POP_TRACE &&
|
|
(logData->queuePoppedVersion > 0) && // avoid generating massive events at beginning
|
|
(tagData->unpoppedRecovered ||
|
|
PoppedVersionLag >=
|
|
SERVER_KNOBS->TLOG_POPPED_VER_LAG_THRESHOLD_FOR_TLOGPOP_TRACE)) { // when recovery or long lag
|
|
TraceEvent("TLogPopDetails", logData->logId)
|
|
.detail("Tag", tagData->tag.toString())
|
|
.detail("UpTo", upTo)
|
|
.detail("PoppedVersionLag", PoppedVersionLag)
|
|
.detail("MinPoppedTag", logData->minPoppedTag.toString())
|
|
.detail("QueuePoppedVersion", logData->queuePoppedVersion)
|
|
.detail("UnpoppedRecovered", tagData->unpoppedRecovered ? "True" : "False")
|
|
.detail("NothingPersistent", tagData->nothingPersistent ? "True" : "False");
|
|
}
|
|
if (upTo > logData->persistentDataDurableVersion)
|
|
wait(tagData->eraseMessagesBefore(upTo, self, logData, TaskPriority::TLogPop));
|
|
//TraceEvent("TLogPop", logData->logId).detail("Tag", tag.toString()).detail("To", upTo);
|
|
}
|
|
return Void();
|
|
}
|
|
|
|
ACTOR Future<Void> processPopRequests(TLogData* self, Reference<LogData> logData) {
|
|
state std::vector<Future<Void>> ignoredPops;
|
|
state std::map<Tag, Version>::const_iterator it;
|
|
state int ignoredPopsPlayed = 0;
|
|
state std::map<Tag, Version> toBePopped;
|
|
|
|
while (now() < self->ignorePopDeadline) {
|
|
wait(delayUntil(self->ignorePopDeadline + 0.0001));
|
|
}
|
|
|
|
toBePopped = std::move(logData->toBePopped);
|
|
logData->toBePopped.clear();
|
|
self->ignorePopUid = "";
|
|
for (it = toBePopped.cbegin(); it != toBePopped.cend(); ++it) {
|
|
const auto& [tag, version] = *it;
|
|
TraceEvent("PlayIgnoredPop", logData->logId).detail("Tag", tag.toString()).detail("Version", version);
|
|
ignoredPops.push_back(tLogPopCore(self, tag, version, logData));
|
|
if (++ignoredPopsPlayed % SERVER_KNOBS->TLOG_POP_BATCH_SIZE == 0) {
|
|
TEST(true); // Yielding while processing pop requests
|
|
wait(yield());
|
|
}
|
|
}
|
|
wait(waitForAll(ignoredPops));
|
|
TraceEvent("ResetIgnorePopRequest", logData->logId).detail("IgnorePopDeadline", self->ignorePopDeadline);
|
|
return Void();
|
|
}
|
|
|
|
ACTOR Future<Void> tLogPop(TLogData* self, TLogPopRequest req, Reference<LogData> logData) {
|
|
if (now() < self->ignorePopDeadline) {
|
|
TraceEvent(SevDebug, "IgnoringPopRequest", logData->logId).detail("IgnorePopDeadline", self->ignorePopDeadline);
|
|
|
|
if (logData->toBePopped.empty()) {
|
|
logData->addActor.send(processPopRequests(self, logData));
|
|
}
|
|
|
|
auto& v = logData->toBePopped[req.tag];
|
|
v = std::max(v, req.to);
|
|
|
|
TraceEvent(SevDebug, "IgnoringPopRequest", logData->logId)
|
|
.detail("IgnorePopDeadline", self->ignorePopDeadline)
|
|
.detail("Tag", req.tag.toString())
|
|
.detail("Version", req.to);
|
|
} else {
|
|
wait(tLogPopCore(self, req.tag, req.to, logData));
|
|
}
|
|
req.reply.send(Void());
|
|
return Void();
|
|
}
|
|
|
|
// This function (and updatePersistentData, which is called by this function) run at a low priority and can soak up all
|
|
// CPU resources. For this reason, they employ aggressive use of yields to avoid causing slow tasks that could introduce
|
|
// latencies for more important work (e.g. commits).
|
|
// This actor is just a loop that calls updatePersistentData and popDiskQueue whenever
|
|
// (a) there's data to be spilled or (b) we should update metadata after some commits have been fully popped.
|
|
ACTOR Future<Void> updateStorage(TLogData* self) {
|
|
while (self->spillOrder.size() && !self->id_data.count(self->spillOrder.front())) {
|
|
self->spillOrder.pop_front();
|
|
}
|
|
|
|
if (!self->spillOrder.size()) {
|
|
wait(delay(BUGGIFY ? SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL
|
|
: SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL,
|
|
TaskPriority::UpdateStorage));
|
|
return Void();
|
|
}
|
|
|
|
state Reference<LogData> logData = self->id_data[self->spillOrder.front()];
|
|
state Version nextVersion = 0;
|
|
state int totalSize = 0;
|
|
|
|
state FlowLock::Releaser commitLockReleaser;
|
|
|
|
// FIXME: This policy for calculating the cache pop version could end up popping recent data in the remote DC after
|
|
// two consecutive recoveries.
|
|
// It also does not protect against spilling the cache tag directly, so it is theoretically possible to spill this
|
|
// tag; which is not intended to ever happen.
|
|
Optional<Version> cachePopVersion;
|
|
for (auto& it : self->id_data) {
|
|
if (!it.second->stopped) {
|
|
if (it.second->version.get() - it.second->unrecoveredBefore >
|
|
SERVER_KNOBS->MAX_VERSIONS_IN_FLIGHT + SERVER_KNOBS->MAX_CACHE_VERSIONS) {
|
|
cachePopVersion = it.second->version.get() - SERVER_KNOBS->MAX_CACHE_VERSIONS;
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (cachePopVersion.present()) {
|
|
state std::vector<Future<Void>> cachePopFutures;
|
|
for (auto& it : self->id_data) {
|
|
cachePopFutures.push_back(tLogPop(self, TLogPopRequest(cachePopVersion.get(), 0, cacheTag), it.second));
|
|
}
|
|
wait(waitForAll(cachePopFutures));
|
|
}
|
|
|
|
if (logData->stopped) {
|
|
if (self->bytesInput - self->bytesDurable >= self->targetVolatileBytes) {
|
|
while (logData->persistentDataDurableVersion != logData->version.get()) {
|
|
totalSize = 0;
|
|
Map<Version, std::pair<int, int>>::iterator sizeItr = logData->version_sizes.begin();
|
|
nextVersion = logData->version.get();
|
|
while (totalSize < SERVER_KNOBS->REFERENCE_SPILL_UPDATE_STORAGE_BYTE_LIMIT &&
|
|
sizeItr != logData->version_sizes.end()) {
|
|
totalSize += sizeItr->value.first + sizeItr->value.second;
|
|
++sizeItr;
|
|
nextVersion = sizeItr == logData->version_sizes.end() ? logData->version.get() : sizeItr->key;
|
|
}
|
|
|
|
wait(logData->queueCommittedVersion.whenAtLeast(nextVersion));
|
|
if (logData->queueCommittedVersion.get() == std::numeric_limits<Version>::max()) {
|
|
return Void();
|
|
}
|
|
wait(delay(0, TaskPriority::UpdateStorage));
|
|
|
|
//TraceEvent("TlogUpdatePersist", self->dbgid).detail("LogId", logData->logId).detail("NextVersion", nextVersion).detail("Version", logData->version.get()).detail("PersistentDataDurableVer", logData->persistentDataDurableVersion).detail("QueueCommitVer", logData->queueCommittedVersion.get()).detail("PersistDataVer", logData->persistentDataVersion);
|
|
if (nextVersion > logData->persistentDataVersion) {
|
|
wait(self->persistentDataCommitLock.take());
|
|
commitLockReleaser = FlowLock::Releaser(self->persistentDataCommitLock);
|
|
wait(updatePersistentData(self, logData, nextVersion));
|
|
// Concurrently with this loop, the last stopped TLog could have been removed.
|
|
if (self->popOrder.size()) {
|
|
wait(popDiskQueue(self, self->id_data[self->popOrder.front()]));
|
|
}
|
|
commitLockReleaser.release();
|
|
} else {
|
|
wait(delay(BUGGIFY ? SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL
|
|
: SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL,
|
|
TaskPriority::UpdateStorage));
|
|
}
|
|
|
|
if (logData->removed.isReady()) {
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (logData->persistentDataDurableVersion == logData->version.get()) {
|
|
self->spillOrder.pop_front();
|
|
}
|
|
wait(delay(0.0, TaskPriority::UpdateStorage));
|
|
} else {
|
|
wait(delay(BUGGIFY ? SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL
|
|
: SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL,
|
|
TaskPriority::UpdateStorage));
|
|
}
|
|
} else if (logData->initialized) {
|
|
ASSERT(self->spillOrder.size() == 1);
|
|
if (logData->version_sizes.empty()) {
|
|
nextVersion = logData->version.get();
|
|
} else {
|
|
// Double check that a running TLog wasn't wrongly affected by spilling locked SharedTLogs.
|
|
ASSERT_WE_THINK(self->targetVolatileBytes == SERVER_KNOBS->TLOG_SPILL_THRESHOLD);
|
|
Map<Version, std::pair<int, int>>::iterator sizeItr = logData->version_sizes.begin();
|
|
while (totalSize < SERVER_KNOBS->REFERENCE_SPILL_UPDATE_STORAGE_BYTE_LIMIT &&
|
|
sizeItr != logData->version_sizes.end() &&
|
|
(logData->bytesInput.getValue() - logData->bytesDurable.getValue() - totalSize >=
|
|
self->targetVolatileBytes ||
|
|
sizeItr->value.first == 0)) {
|
|
totalSize += sizeItr->value.first + sizeItr->value.second;
|
|
++sizeItr;
|
|
nextVersion = sizeItr == logData->version_sizes.end() ? logData->version.get() : sizeItr->key;
|
|
}
|
|
}
|
|
|
|
//TraceEvent("UpdateStorageVer", logData->logId).detail("NextVersion", nextVersion).detail("PersistentDataVersion", logData->persistentDataVersion).detail("TotalSize", totalSize);
|
|
|
|
wait(logData->queueCommittedVersion.whenAtLeast(nextVersion));
|
|
if (logData->queueCommittedVersion.get() == std::numeric_limits<Version>::max()) {
|
|
return Void();
|
|
}
|
|
wait(delay(0, TaskPriority::UpdateStorage));
|
|
|
|
if (nextVersion > logData->persistentDataVersion) {
|
|
wait(self->persistentDataCommitLock.take());
|
|
commitLockReleaser = FlowLock::Releaser(self->persistentDataCommitLock);
|
|
wait(updatePersistentData(self, logData, nextVersion));
|
|
if (self->popOrder.size()) {
|
|
wait(popDiskQueue(self, self->id_data[self->popOrder.front()]));
|
|
}
|
|
commitLockReleaser.release();
|
|
}
|
|
|
|
if (totalSize < SERVER_KNOBS->REFERENCE_SPILL_UPDATE_STORAGE_BYTE_LIMIT) {
|
|
wait(delay(BUGGIFY ? SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL
|
|
: SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL,
|
|
TaskPriority::UpdateStorage));
|
|
} else {
|
|
// recovery wants to commit to persistant data when updatePersistentData is not active, this delay ensures
|
|
// that immediately after updatePersist returns another one has not been started yet.
|
|
wait(delay(0.0, TaskPriority::UpdateStorage));
|
|
}
|
|
} else {
|
|
wait(delay(BUGGIFY ? SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL
|
|
: SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL,
|
|
TaskPriority::UpdateStorage));
|
|
}
|
|
return Void();
|
|
}
|
|
|
|
ACTOR Future<Void> updateStorageLoop(TLogData* self) {
|
|
wait(delay(0, TaskPriority::UpdateStorage));
|
|
|
|
loop { wait(updateStorage(self)); }
|
|
}
|
|
|
|
void commitMessages(TLogData* self,
|
|
Reference<LogData> logData,
|
|
Version version,
|
|
const std::vector<TagsAndMessage>& taggedMessages) {
|
|
// SOMEDAY: This method of copying messages is reasonably memory efficient, but it's still a lot of bytes copied.
|
|
// Find a way to do the memory allocation right as we receive the messages in the network layer.
|
|
|
|
int64_t addedBytes = 0;
|
|
int64_t overheadBytes = 0;
|
|
int expectedBytes = 0;
|
|
int txsBytes = 0;
|
|
|
|
if (!taggedMessages.size()) {
|
|
return;
|
|
}
|
|
|
|
int msgSize = 0;
|
|
for (auto& i : taggedMessages) {
|
|
msgSize += i.message.size();
|
|
}
|
|
|
|
// Grab the last block in the blocks list so we can share its arena
|
|
// We pop all of the elements of it to create a "fresh" vector that starts at the end of the previous vector
|
|
Standalone<VectorRef<uint8_t>> block;
|
|
if (logData->messageBlocks.empty()) {
|
|
block = Standalone<VectorRef<uint8_t>>();
|
|
block.reserve(block.arena(), std::max<int64_t>(SERVER_KNOBS->TLOG_MESSAGE_BLOCK_BYTES, msgSize));
|
|
} else {
|
|
block = logData->messageBlocks.back().second;
|
|
}
|
|
|
|
block.pop_front(block.size());
|
|
|
|
for (auto& msg : taggedMessages) {
|
|
if (msg.message.size() > block.capacity() - block.size()) {
|
|
logData->messageBlocks.emplace_back(version, block);
|
|
addedBytes += int64_t(block.size()) * SERVER_KNOBS->TLOG_MESSAGE_BLOCK_OVERHEAD_FACTOR;
|
|
block = Standalone<VectorRef<uint8_t>>();
|
|
block.reserve(block.arena(), std::max<int64_t>(SERVER_KNOBS->TLOG_MESSAGE_BLOCK_BYTES, msgSize));
|
|
}
|
|
|
|
DEBUG_TAGS_AND_MESSAGE("TLogCommitMessages", version, msg.getRawMessage(), logData->logId)
|
|
.detail("DebugID", self->dbgid);
|
|
block.append(block.arena(), msg.message.begin(), msg.message.size());
|
|
for (auto tag : msg.tags) {
|
|
if (logData->locality == tagLocalitySatellite) {
|
|
if (!(tag.locality == tagLocalityTxs || tag.locality == tagLocalityLogRouter || tag == txsTag)) {
|
|
continue;
|
|
}
|
|
} else if (!(logData->locality == tagLocalitySpecial || logData->locality == tag.locality ||
|
|
tag.locality < 0)) {
|
|
continue;
|
|
}
|
|
|
|
if (tag.locality == tagLocalityLogRouter) {
|
|
if (!logData->logRouterTags) {
|
|
continue;
|
|
}
|
|
tag.id = tag.id % logData->logRouterTags;
|
|
}
|
|
if (tag.locality == tagLocalityTxs) {
|
|
if (logData->txsTags > 0) {
|
|
tag.id = tag.id % logData->txsTags;
|
|
} else {
|
|
tag = txsTag;
|
|
}
|
|
}
|
|
Reference<LogData::TagData> tagData = logData->getTagData(tag);
|
|
if (!tagData) {
|
|
tagData = logData->createTagData(tag, 0, true, true, false);
|
|
}
|
|
|
|
if (version >= tagData->popped) {
|
|
tagData->versionMessages.emplace_back(
|
|
version, LengthPrefixedStringRef((uint32_t*)(block.end() - msg.message.size())));
|
|
if (tagData->versionMessages.back().second.expectedSize() > SERVER_KNOBS->MAX_MESSAGE_SIZE) {
|
|
TraceEvent(SevWarnAlways, "LargeMessage")
|
|
.detail("Size", tagData->versionMessages.back().second.expectedSize());
|
|
}
|
|
if (tag.locality != tagLocalityTxs && tag != txsTag) {
|
|
expectedBytes += tagData->versionMessages.back().second.expectedSize();
|
|
} else {
|
|
txsBytes += tagData->versionMessages.back().second.expectedSize();
|
|
}
|
|
if (SERVER_KNOBS->ENABLE_VERSION_VECTOR) {
|
|
auto iter = logData->waitingTags.find(tag);
|
|
if (iter != logData->waitingTags.end()) {
|
|
auto promise = iter->second;
|
|
logData->waitingTags.erase(iter);
|
|
promise.send(Void());
|
|
}
|
|
}
|
|
|
|
// The factor of VERSION_MESSAGES_OVERHEAD is intended to be an overestimate of the actual memory used
|
|
// to store this data in a std::deque. In practice, this number is probably something like 528/512
|
|
// ~= 1.03, but this could vary based on the implementation. There will also be a fixed overhead per
|
|
// std::deque, but its size should be trivial relative to the size of the TLog queue and can be thought
|
|
// of as increasing the capacity of the queue slightly.
|
|
overheadBytes += SERVER_KNOBS->VERSION_MESSAGES_ENTRY_BYTES_WITH_OVERHEAD;
|
|
}
|
|
}
|
|
|
|
msgSize -= msg.message.size();
|
|
}
|
|
logData->messageBlocks.emplace_back(version, block);
|
|
addedBytes += int64_t(block.size()) * SERVER_KNOBS->TLOG_MESSAGE_BLOCK_OVERHEAD_FACTOR;
|
|
addedBytes += overheadBytes;
|
|
|
|
logData->version_sizes[version] = std::make_pair(expectedBytes, txsBytes);
|
|
logData->bytesInput += addedBytes;
|
|
self->bytesInput += addedBytes;
|
|
self->overheadBytesInput += overheadBytes;
|
|
|
|
//TraceEvent("TLogPushed", self->dbgid).detail("Bytes", addedBytes).detail("MessageBytes", messages.size()).detail("Tags", tags.size()).detail("ExpectedBytes", expectedBytes).detail("MCount", mCount).detail("TCount", tCount);
|
|
}
|
|
|
|
void commitMessages(TLogData* self, Reference<LogData> logData, Version version, Arena arena, StringRef messages) {
|
|
ArenaReader rd(arena, messages, Unversioned());
|
|
self->tempTagMessages.clear();
|
|
while (!rd.empty()) {
|
|
TagsAndMessage tagsAndMsg;
|
|
tagsAndMsg.loadFromArena(&rd, nullptr);
|
|
self->tempTagMessages.push_back(std::move(tagsAndMsg));
|
|
}
|
|
commitMessages(self, logData, version, self->tempTagMessages);
|
|
}
|
|
|
|
Version poppedVersion(Reference<LogData> self, Tag tag) {
|
|
auto tagData = self->getTagData(tag);
|
|
if (!tagData) {
|
|
if (tag == txsTag || tag.locality == tagLocalityTxs) {
|
|
return 0;
|
|
}
|
|
return std::max(self->recoveredAt + 1, self->recoveryTxnVersion);
|
|
}
|
|
return tagData->popped;
|
|
}
|
|
|
|
std::deque<std::pair<Version, LengthPrefixedStringRef>>& getVersionMessages(Reference<LogData> self, Tag tag) {
|
|
auto tagData = self->getTagData(tag);
|
|
if (!tagData) {
|
|
static std::deque<std::pair<Version, LengthPrefixedStringRef>> empty;
|
|
return empty;
|
|
}
|
|
return tagData->versionMessages;
|
|
};
|
|
|
|
ACTOR Future<Void> waitForMessagesForTag(Reference<LogData> self, Tag reqTag, Version reqBegin, double timeout) {
|
|
self->blockingPeeks += 1;
|
|
auto tagData = self->getTagData(reqTag);
|
|
if (tagData.isValid() && !tagData->versionMessages.empty() && tagData->versionMessages.back().first >= reqBegin) {
|
|
return Void();
|
|
}
|
|
choose {
|
|
when(wait(self->waitingTags[reqTag].getFuture())) {
|
|
// we want the caller to finish first, otherwise the data structure it is building might not be complete
|
|
wait(delay(0.0));
|
|
}
|
|
when(wait(delay(timeout))) { self->blockingPeekTimeouts += 1; }
|
|
}
|
|
return Void();
|
|
}
|
|
|
|
void peekMessagesFromMemory(Reference<LogData> self,
|
|
Tag tag,
|
|
Version begin,
|
|
BinaryWriter& messages,
|
|
Version& endVersion) {
|
|
ASSERT(!messages.getLength());
|
|
|
|
auto& deque = getVersionMessages(self, tag);
|
|
//TraceEvent("TLogPeekMem", self->dbgid).detail("Tag", req.tag1).detail("PDS", self->persistentDataSequence).detail("PDDS", self->persistentDataDurableSequence).detail("Oldest", map1.empty() ? 0 : map1.begin()->key ).detail("OldestMsgCount", map1.empty() ? 0 : map1.begin()->value.size());
|
|
|
|
begin = std::max(begin, self->persistentDataDurableVersion + 1);
|
|
auto it = std::lower_bound(deque.begin(),
|
|
deque.end(),
|
|
std::make_pair(begin, LengthPrefixedStringRef()),
|
|
[](const auto& l, const auto& r) -> bool { return l.first < r.first; });
|
|
|
|
Version currentVersion = -1;
|
|
for (; it != deque.end(); ++it) {
|
|
if (it->first != currentVersion) {
|
|
if (messages.getLength() >= SERVER_KNOBS->DESIRED_TOTAL_BYTES) {
|
|
endVersion = currentVersion + 1;
|
|
//TraceEvent("TLogPeekMessagesReached2", self->dbgid);
|
|
break;
|
|
}
|
|
|
|
currentVersion = it->first;
|
|
messages << VERSION_HEADER << currentVersion;
|
|
}
|
|
|
|
// We need the 4 byte length prefix to be a TagsAndMessage format, but that prefix is added as part of StringRef
|
|
// serialization.
|
|
int offset = messages.getLength();
|
|
messages << it->second.toStringRef();
|
|
void* data = messages.getData();
|
|
DEBUG_TAGS_AND_MESSAGE(
|
|
"TLogPeek", currentVersion, StringRef((uint8_t*)data + offset, messages.getLength() - offset), self->logId)
|
|
.detail("PeekTag", tag);
|
|
}
|
|
}
|
|
|
|
ACTOR Future<std::vector<StringRef>> parseMessagesForTag(StringRef commitBlob, Tag tag, int logRouters) {
|
|
// See the comment in LogSystem.cpp for the binary format of commitBlob.
|
|
state std::vector<StringRef> relevantMessages;
|
|
state BinaryReader rd(commitBlob, AssumeVersion(g_network->protocolVersion()));
|
|
while (!rd.empty()) {
|
|
TagsAndMessage tagsAndMessage;
|
|
tagsAndMessage.loadFromArena(&rd, nullptr);
|
|
for (Tag t : tagsAndMessage.tags) {
|
|
if (t == tag || (tag.locality == tagLocalityLogRouter && t.locality == tagLocalityLogRouter &&
|
|
t.id % logRouters == tag.id)) {
|
|
// Mutations that are in the partially durable span between known comitted version and
|
|
// recovery version get copied to the new log generation. These commits might have had more
|
|
// log router tags than what now exist, so we mod them down to what we have.
|
|
relevantMessages.push_back(tagsAndMessage.getRawMessage());
|
|
break;
|
|
}
|
|
}
|
|
wait(yield());
|
|
}
|
|
return relevantMessages;
|
|
}
|
|
|
|
// Common logics to peek TLog and create TLogPeekReply that serves both streaming peek or normal peek request
|
|
ACTOR template <typename PromiseType>
|
|
Future<Void> tLogPeekMessages(PromiseType replyPromise,
|
|
TLogData* self,
|
|
Reference<LogData> logData,
|
|
Version reqBegin,
|
|
Tag reqTag,
|
|
bool reqReturnIfBlocked = false,
|
|
bool reqOnlySpilled = false,
|
|
Optional<std::pair<UID, int>> reqSequence = Optional<std::pair<UID, int>>()) {
|
|
state BinaryWriter messages(Unversioned());
|
|
state BinaryWriter messages2(Unversioned());
|
|
state int sequence = -1;
|
|
state UID peekId;
|
|
state double queueStart = now();
|
|
|
|
if (reqTag.locality == tagLocalityTxs && reqTag.id >= logData->txsTags && logData->txsTags > 0) {
|
|
reqTag.id = reqTag.id % logData->txsTags;
|
|
}
|
|
|
|
if (reqSequence.present()) {
|
|
try {
|
|
peekId = reqSequence.get().first;
|
|
sequence = reqSequence.get().second;
|
|
if (sequence >= SERVER_KNOBS->PARALLEL_GET_MORE_REQUESTS &&
|
|
logData->peekTracker.find(peekId) == logData->peekTracker.end()) {
|
|
throw operation_obsolete();
|
|
}
|
|
auto& trackerData = logData->peekTracker[peekId];
|
|
if (sequence == 0 && trackerData.sequence_version.find(0) == trackerData.sequence_version.end()) {
|
|
trackerData.tag = reqTag;
|
|
trackerData.sequence_version[0].send(std::make_pair(reqBegin, reqOnlySpilled));
|
|
}
|
|
auto seqBegin = trackerData.sequence_version.begin();
|
|
// The peek cursor and this comparison need to agree about the maximum number of in-flight requests.
|
|
while (trackerData.sequence_version.size() &&
|
|
seqBegin->first <= sequence - SERVER_KNOBS->PARALLEL_GET_MORE_REQUESTS) {
|
|
if (seqBegin->second.canBeSet()) {
|
|
seqBegin->second.sendError(operation_obsolete());
|
|
}
|
|
trackerData.sequence_version.erase(seqBegin);
|
|
seqBegin = trackerData.sequence_version.begin();
|
|
}
|
|
|
|
if (trackerData.sequence_version.size() && sequence < seqBegin->first) {
|
|
throw operation_obsolete();
|
|
}
|
|
|
|
Future<std::pair<Version, bool>> fPrevPeekData = trackerData.sequence_version[sequence].getFuture();
|
|
if (fPrevPeekData.isReady()) {
|
|
trackerData.unblockedPeeks++;
|
|
double t = now() - trackerData.lastUpdate;
|
|
if (t > trackerData.idleMax)
|
|
trackerData.idleMax = t;
|
|
trackerData.idleTime += t;
|
|
}
|
|
trackerData.lastUpdate = now();
|
|
std::pair<Version, bool> prevPeekData = wait(fPrevPeekData);
|
|
reqBegin = std::max(prevPeekData.first, reqBegin);
|
|
reqOnlySpilled = prevPeekData.second;
|
|
wait(yield());
|
|
} catch (Error& e) {
|
|
if (e.code() == error_code_timed_out || e.code() == error_code_operation_obsolete) {
|
|
replyPromise.sendError(e);
|
|
return Void();
|
|
} else {
|
|
throw;
|
|
}
|
|
}
|
|
}
|
|
|
|
state double blockStart = now();
|
|
|
|
if (reqReturnIfBlocked && logData->version.get() < reqBegin) {
|
|
replyPromise.sendError(end_of_stream());
|
|
if (reqSequence.present()) {
|
|
auto& trackerData = logData->peekTracker[peekId];
|
|
auto& sequenceData = trackerData.sequence_version[sequence + 1];
|
|
trackerData.lastUpdate = now();
|
|
if (!sequenceData.isSet()) {
|
|
sequenceData.send(std::make_pair(reqBegin, reqOnlySpilled));
|
|
}
|
|
}
|
|
return Void();
|
|
}
|
|
|
|
DisabledTraceEvent("TLogPeekMessages0", self->dbgid)
|
|
.detail("LogId", logData->logId)
|
|
.detail("Tag", reqTag.toString())
|
|
.detail("ReqBegin", reqBegin)
|
|
.detail("Version", logData->version.get())
|
|
.detail("RecoveredAt", logData->recoveredAt);
|
|
// Wait until we have something to return that the caller doesn't already have
|
|
if (logData->version.get() < reqBegin) {
|
|
wait(logData->version.whenAtLeast(reqBegin));
|
|
wait(delay(SERVER_KNOBS->TLOG_PEEK_DELAY, g_network->getCurrentTask()));
|
|
}
|
|
if (!logData->stopped && reqTag.locality != tagLocalityTxs && reqTag != txsTag) {
|
|
// Make sure the peek reply has the recovery txn for the current TLog.
|
|
// Older generation TLog has been stopped and doesn't wait here.
|
|
// Similarly during recovery, reading transaction state store
|
|
// doesn't wait here.
|
|
wait(logData->recoveryTxnReceived.getFuture());
|
|
}
|
|
|
|
if (logData->locality != tagLocalitySatellite && reqTag.locality == tagLocalityLogRouter) {
|
|
wait(self->concurrentLogRouterReads.take());
|
|
state FlowLock::Releaser globalReleaser(self->concurrentLogRouterReads);
|
|
wait(delay(0.0, TaskPriority::Low));
|
|
}
|
|
|
|
if (reqBegin <= logData->persistentDataDurableVersion && reqTag.locality != tagLocalityTxs && reqTag != txsTag) {
|
|
// Reading spilled data will almost always imply that the storage server is >5s behind the rest
|
|
// of the cluster. We shouldn't prioritize spending CPU on helping this server catch up
|
|
// slightly faster over keeping the rest of the cluster operating normally.
|
|
// txsTag is only ever peeked on recovery, and we would still wish to prioritize requests
|
|
// that impact recovery duration.
|
|
wait(delay(0, TaskPriority::TLogSpilledPeekReply));
|
|
}
|
|
|
|
state double workStart = now();
|
|
|
|
state Version poppedVer = poppedVersion(logData, reqTag);
|
|
|
|
if (SERVER_KNOBS->ENABLE_VERSION_VECTOR && poppedVer <= reqBegin &&
|
|
reqBegin > logData->persistentDataDurableVersion && !reqOnlySpilled && reqTag.locality >= 0 &&
|
|
!reqReturnIfBlocked) {
|
|
state double startTime = now();
|
|
// TODO (version vector) check if this should be included in "status details" json
|
|
// TODO (version vector) all tags may be too many, instead, standard deviation?
|
|
wait(waitForMessagesForTag(logData, reqTag, reqBegin, SERVER_KNOBS->BLOCKING_PEEK_TIMEOUT));
|
|
double latency = now() - startTime;
|
|
if (logData->blockingPeekLatencies.find(reqTag) == logData->blockingPeekLatencies.end()) {
|
|
UID ssID = nondeterministicRandom()->randomUniqueID();
|
|
std::string s = "BlockingPeekLatencies-" + reqTag.toString();
|
|
logData->blockingPeekLatencies.try_emplace(
|
|
reqTag, s, ssID, SERVER_KNOBS->LATENCY_METRICS_LOGGING_INTERVAL, SERVER_KNOBS->LATENCY_SAMPLE_SIZE);
|
|
}
|
|
LatencySample& sample = logData->blockingPeekLatencies.at(reqTag);
|
|
sample.addMeasurement(latency);
|
|
poppedVer = poppedVersion(logData, reqTag);
|
|
}
|
|
|
|
DisabledTraceEvent("TLogPeekMessages1", self->dbgid)
|
|
.detail("LogId", logData->logId)
|
|
.detail("Tag", reqTag.toString())
|
|
.detail("ReqBegin", reqBegin)
|
|
.detail("PoppedVer", poppedVer);
|
|
if (poppedVer > reqBegin) {
|
|
TLogPeekReply rep;
|
|
rep.maxKnownVersion = logData->version.get();
|
|
rep.minKnownCommittedVersion = logData->minKnownCommittedVersion;
|
|
rep.popped = poppedVer;
|
|
rep.end = poppedVer;
|
|
rep.onlySpilled = false;
|
|
|
|
if (reqSequence.present()) {
|
|
auto& trackerData = logData->peekTracker[peekId];
|
|
auto& sequenceData = trackerData.sequence_version[sequence + 1];
|
|
trackerData.lastUpdate = now();
|
|
if (trackerData.sequence_version.size() && sequence + 1 < trackerData.sequence_version.begin()->first) {
|
|
replyPromise.sendError(operation_obsolete());
|
|
if (!sequenceData.isSet())
|
|
sequenceData.sendError(operation_obsolete());
|
|
return Void();
|
|
}
|
|
if (sequenceData.isSet()) {
|
|
if (sequenceData.getFuture().get().first != rep.end) {
|
|
TEST(true); // tlog peek second attempt ended at a different version
|
|
replyPromise.sendError(operation_obsolete());
|
|
return Void();
|
|
}
|
|
} else {
|
|
sequenceData.send(std::make_pair(rep.end, rep.onlySpilled));
|
|
}
|
|
rep.begin = reqBegin;
|
|
}
|
|
|
|
replyPromise.send(rep);
|
|
return Void();
|
|
}
|
|
|
|
state Version endVersion;
|
|
state bool onlySpilled;
|
|
|
|
// Run the peek logic in a loop to account for the case where there is no data to return to the caller, and we may
|
|
// want to wait a little bit instead of just sending back an empty message. This feature is controlled by a knob.
|
|
loop {
|
|
endVersion = logData->version.get() + 1;
|
|
onlySpilled = false;
|
|
|
|
// grab messages from disk
|
|
DisabledTraceEvent("TLogPeekMessages2", self->dbgid)
|
|
.detail("ReqBegin", reqBegin)
|
|
.detail("Tag", reqTag.toString());
|
|
if (reqBegin <= logData->persistentDataDurableVersion) {
|
|
// Just in case the durable version changes while we are waiting for the read, we grab this data from
|
|
// memory. We may or may not actually send it depending on whether we get enough data from disk. SOMEDAY:
|
|
// Only do this if an initial attempt to read from disk results in insufficient data and the required data
|
|
// is no longer in memory SOMEDAY: Should we only send part of the messages we collected, to actually limit
|
|
// the size of the result?
|
|
|
|
if (reqOnlySpilled) {
|
|
endVersion = logData->persistentDataDurableVersion + 1;
|
|
} else {
|
|
peekMessagesFromMemory(logData, reqTag, reqBegin, messages2, endVersion);
|
|
}
|
|
|
|
if (logData->shouldSpillByValue(reqTag)) {
|
|
RangeResult kvs = wait(self->persistentData->readRange(
|
|
KeyRangeRef(
|
|
persistTagMessagesKey(logData->logId, reqTag, reqBegin),
|
|
persistTagMessagesKey(logData->logId, reqTag, logData->persistentDataDurableVersion + 1)),
|
|
SERVER_KNOBS->DESIRED_TOTAL_BYTES,
|
|
SERVER_KNOBS->DESIRED_TOTAL_BYTES));
|
|
|
|
for (auto& kv : kvs) {
|
|
auto ver = decodeTagMessagesKey(kv.key);
|
|
messages << VERSION_HEADER << ver;
|
|
messages.serializeBytes(kv.value);
|
|
}
|
|
|
|
if (kvs.expectedSize() >= SERVER_KNOBS->DESIRED_TOTAL_BYTES) {
|
|
endVersion = decodeTagMessagesKey(kvs.end()[-1].key) + 1;
|
|
onlySpilled = true;
|
|
} else {
|
|
messages.serializeBytes(messages2.toValue());
|
|
}
|
|
} else {
|
|
// FIXME: Limit to approximately DESIRED_TOTATL_BYTES somehow.
|
|
RangeResult kvrefs = wait(self->persistentData->readRange(
|
|
KeyRangeRef(
|
|
persistTagMessageRefsKey(logData->logId, reqTag, reqBegin),
|
|
persistTagMessageRefsKey(logData->logId, reqTag, logData->persistentDataDurableVersion + 1)),
|
|
SERVER_KNOBS->TLOG_SPILL_REFERENCE_MAX_BATCHES_PER_PEEK + 1));
|
|
|
|
//TraceEvent("TLogPeekResults", self->dbgid).detail("ForAddress", replyPromise.getEndpoint().getPrimaryAddress()).detail("Tag1Results", s1).detail("Tag2Results", s2).detail("Tag1ResultsLim", kv1.size()).detail("Tag2ResultsLim", kv2.size()).detail("Tag1ResultsLast", kv1.size() ? kv1[0].key : "").detail("Tag2ResultsLast", kv2.size() ? kv2[0].key : "").detail("Limited", limited).detail("NextEpoch", next_pos.epoch).detail("NextSeq", next_pos.sequence).detail("NowEpoch", self->epoch()).detail("NowSeq", self->sequence.getNextSequence());
|
|
|
|
state std::vector<std::pair<IDiskQueue::location, IDiskQueue::location>> commitLocations;
|
|
state bool earlyEnd = false;
|
|
uint32_t mutationBytes = 0;
|
|
state uint64_t commitBytes = 0;
|
|
state Version firstVersion = std::numeric_limits<Version>::max();
|
|
for (int i = 0; i < kvrefs.size() && i < SERVER_KNOBS->TLOG_SPILL_REFERENCE_MAX_BATCHES_PER_PEEK; i++) {
|
|
auto& kv = kvrefs[i];
|
|
VectorRef<SpilledData> spilledData;
|
|
BinaryReader r(kv.value, AssumeVersion(logData->protocolVersion));
|
|
r >> spilledData;
|
|
for (const SpilledData& sd : spilledData) {
|
|
if (mutationBytes >= SERVER_KNOBS->DESIRED_TOTAL_BYTES) {
|
|
earlyEnd = true;
|
|
break;
|
|
}
|
|
if (sd.version >= reqBegin) {
|
|
firstVersion = std::min(firstVersion, sd.version);
|
|
const IDiskQueue::location end = sd.start.lo + sd.length;
|
|
commitLocations.emplace_back(sd.start, end);
|
|
// This isn't perfect, because we aren't accounting for page boundaries, but should be
|
|
// close enough.
|
|
commitBytes += sd.length;
|
|
mutationBytes += sd.mutationBytes;
|
|
}
|
|
}
|
|
if (earlyEnd)
|
|
break;
|
|
}
|
|
earlyEnd = earlyEnd || (kvrefs.size() >= SERVER_KNOBS->TLOG_SPILL_REFERENCE_MAX_BATCHES_PER_PEEK + 1);
|
|
wait(self->peekMemoryLimiter.take(TaskPriority::TLogSpilledPeekReply, commitBytes));
|
|
state FlowLock::Releaser memoryReservation(self->peekMemoryLimiter, commitBytes);
|
|
state std::vector<Future<Standalone<StringRef>>> messageReads;
|
|
messageReads.reserve(commitLocations.size());
|
|
for (const auto& pair : commitLocations) {
|
|
messageReads.push_back(self->rawPersistentQueue->read(pair.first, pair.second, CheckHashes::True));
|
|
}
|
|
commitLocations.clear();
|
|
wait(waitForAll(messageReads));
|
|
|
|
state Version lastRefMessageVersion = 0;
|
|
state int index = 0;
|
|
loop {
|
|
if (index >= messageReads.size())
|
|
break;
|
|
Standalone<StringRef> queueEntryData = messageReads[index].get();
|
|
uint8_t valid;
|
|
const uint32_t length = *(uint32_t*)queueEntryData.begin();
|
|
queueEntryData = queueEntryData.substr(4, queueEntryData.size() - 4);
|
|
BinaryReader rd(queueEntryData, IncludeVersion());
|
|
state TLogQueueEntry entry;
|
|
rd >> entry >> valid;
|
|
ASSERT(valid == 0x01);
|
|
ASSERT(length + sizeof(valid) == queueEntryData.size());
|
|
|
|
messages << VERSION_HEADER << entry.version;
|
|
|
|
std::vector<StringRef> rawMessages =
|
|
wait(parseMessagesForTag(entry.messages, reqTag, logData->logRouterTags));
|
|
for (const StringRef& msg : rawMessages) {
|
|
messages.serializeBytes(msg);
|
|
DEBUG_TAGS_AND_MESSAGE("TLogPeekFromDisk", entry.version, msg, logData->logId)
|
|
.detail("DebugID", self->dbgid)
|
|
.detail("PeekTag", reqTag);
|
|
}
|
|
|
|
lastRefMessageVersion = entry.version;
|
|
index++;
|
|
}
|
|
|
|
messageReads.clear();
|
|
memoryReservation.release();
|
|
|
|
if (earlyEnd) {
|
|
endVersion = lastRefMessageVersion + 1;
|
|
onlySpilled = true;
|
|
} else {
|
|
messages.serializeBytes(messages2.toValue());
|
|
}
|
|
}
|
|
} else {
|
|
if (reqOnlySpilled) {
|
|
endVersion = logData->persistentDataDurableVersion + 1;
|
|
} else {
|
|
peekMessagesFromMemory(logData, reqTag, reqBegin, messages, endVersion);
|
|
}
|
|
|
|
//TraceEvent("TLogPeekResults", self->dbgid).detail("ForAddress", replyPromise.getEndpoint().getPrimaryAddress()).detail("MessageBytes", messages.getLength()).detail("NextEpoch", next_pos.epoch).detail("NextSeq", next_pos.sequence).detail("NowSeq", self->sequence.getNextSequence());
|
|
}
|
|
|
|
// Reply the peek request when
|
|
// - Have data return to the caller, or
|
|
// - Batching empty peek is disabled, or
|
|
// - Batching empty peek interval has been reached.
|
|
if (messages.getLength() > 0 || !SERVER_KNOBS->PEEK_BATCHING_EMPTY_MSG ||
|
|
(now() - blockStart > SERVER_KNOBS->PEEK_BATCHING_EMPTY_MSG_INTERVAL)) {
|
|
break;
|
|
}
|
|
|
|
state Version waitUntilVersion = logData->version.get() + 1;
|
|
|
|
// Currently, from `reqBegin` to logData->version are all empty peeks. Wait for more versions, or the empty
|
|
// batching interval has expired.
|
|
wait(logData->version.whenAtLeast(waitUntilVersion) ||
|
|
delay(SERVER_KNOBS->PEEK_BATCHING_EMPTY_MSG_INTERVAL - (now() - blockStart)));
|
|
if (logData->version.get() < waitUntilVersion) {
|
|
break; // We know that from `reqBegin` to logData->version are all empty messages. Skip re-executing the
|
|
// peek logic.
|
|
}
|
|
}
|
|
|
|
TLogPeekReply reply;
|
|
reply.maxKnownVersion = logData->version.get();
|
|
reply.minKnownCommittedVersion = logData->minKnownCommittedVersion;
|
|
reply.messages = StringRef(reply.arena, messages.toValue());
|
|
reply.end = endVersion;
|
|
reply.onlySpilled = onlySpilled;
|
|
|
|
DisabledTraceEvent("TLogPeekMessages4", self->dbgid)
|
|
.detail("LogId", logData->logId)
|
|
.detail("Tag", reqTag.toString())
|
|
.detail("ReqBegin", reqBegin)
|
|
.detail("EndVer", reply.end)
|
|
.detail("MsgBytes", reply.messages.expectedSize());
|
|
|
|
if (reqSequence.present()) {
|
|
auto& trackerData = logData->peekTracker[peekId];
|
|
trackerData.lastUpdate = now();
|
|
|
|
double queueT = blockStart - queueStart;
|
|
double blockT = workStart - blockStart;
|
|
double workT = now() - workStart;
|
|
|
|
trackerData.totalPeeks++;
|
|
trackerData.replyBytes += reply.messages.size();
|
|
|
|
if (queueT > trackerData.queueMax)
|
|
trackerData.queueMax = queueT;
|
|
if (blockT > trackerData.blockMax)
|
|
trackerData.blockMax = blockT;
|
|
if (workT > trackerData.workMax)
|
|
trackerData.workMax = workT;
|
|
|
|
trackerData.queueTime += queueT;
|
|
trackerData.blockTime += blockT;
|
|
trackerData.workTime += workT;
|
|
|
|
auto& sequenceData = trackerData.sequence_version[sequence + 1];
|
|
if (trackerData.sequence_version.size() && sequence + 1 < trackerData.sequence_version.begin()->first) {
|
|
replyPromise.sendError(operation_obsolete());
|
|
if (!sequenceData.isSet()) {
|
|
// It would technically be more correct to .send({reqBegin, reqOnlySpilled}), as the next
|
|
// request might still be in the window of active requests, but LogSystemPeekCursor will
|
|
// throw away all future responses upon getting an operation_obsolete(), so computing a
|
|
// response will probably be a waste of CPU.
|
|
sequenceData.sendError(operation_obsolete());
|
|
}
|
|
return Void();
|
|
}
|
|
if (sequenceData.isSet()) {
|
|
trackerData.duplicatePeeks++;
|
|
if (sequenceData.getFuture().get().first != reply.end) {
|
|
TEST(true); // tlog peek second attempt ended at a different version (2)
|
|
replyPromise.sendError(operation_obsolete());
|
|
return Void();
|
|
}
|
|
} else {
|
|
sequenceData.send(std::make_pair(reply.end, reply.onlySpilled));
|
|
}
|
|
reply.begin = reqBegin;
|
|
}
|
|
|
|
replyPromise.send(reply);
|
|
return Void();
|
|
}
|
|
|
|
// This actor keep pushing TLogPeekStreamReply until it's removed from the cluster or should recover
|
|
ACTOR Future<Void> tLogPeekStream(TLogData* self, TLogPeekStreamRequest req, Reference<LogData> logData) {
|
|
self->activePeekStreams++;
|
|
|
|
state Version begin = req.begin;
|
|
state bool onlySpilled = false;
|
|
req.reply.setByteLimit(std::min(SERVER_KNOBS->MAXIMUM_PEEK_BYTES, req.limitBytes));
|
|
loop {
|
|
state TLogPeekStreamReply reply;
|
|
state Promise<TLogPeekReply> promise;
|
|
state Future<TLogPeekReply> future(promise.getFuture());
|
|
try {
|
|
wait(req.reply.onReady() && store(reply.rep, future) &&
|
|
tLogPeekMessages(promise, self, logData, begin, req.tag, req.returnIfBlocked, onlySpilled));
|
|
|
|
reply.rep.begin = begin;
|
|
req.reply.send(reply);
|
|
begin = reply.rep.end;
|
|
onlySpilled = reply.rep.onlySpilled;
|
|
if (reply.rep.end > logData->version.get()) {
|
|
wait(delay(SERVER_KNOBS->TLOG_PEEK_DELAY, g_network->getCurrentTask()));
|
|
} else {
|
|
wait(delay(0, g_network->getCurrentTask()));
|
|
}
|
|
} catch (Error& e) {
|
|
self->activePeekStreams--;
|
|
TraceEvent(SevDebug, "TLogPeekStreamEnd", logData->logId)
|
|
.errorUnsuppressed(e)
|
|
.detail("PeerAddr", req.reply.getEndpoint().getPrimaryAddress());
|
|
|
|
if (e.code() == error_code_end_of_stream || e.code() == error_code_operation_obsolete) {
|
|
req.reply.sendError(e);
|
|
return Void();
|
|
} else {
|
|
throw;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
ACTOR Future<Void> doQueueCommit(TLogData* self,
|
|
Reference<LogData> logData,
|
|
std::vector<Reference<LogData>> missingFinalCommit) {
|
|
state Version ver = logData->version.get();
|
|
state Version commitNumber = self->queueCommitBegin + 1;
|
|
state Version knownCommittedVersion = logData->knownCommittedVersion;
|
|
self->queueCommitBegin = commitNumber;
|
|
logData->queueCommittingVersion = ver;
|
|
|
|
g_network->setCurrentTask(TaskPriority::TLogCommitReply);
|
|
Future<Void> c = self->persistentQueue->commit();
|
|
self->diskQueueCommitBytes = 0;
|
|
self->largeDiskQueueCommitBytes.set(false);
|
|
|
|
wait(ioDegradedOrTimeoutError(
|
|
c, SERVER_KNOBS->MAX_STORAGE_COMMIT_TIME, self->degraded, SERVER_KNOBS->TLOG_DEGRADED_DURATION));
|
|
if (g_network->isSimulated() && !g_simulator.speedUpSimulation && BUGGIFY_WITH_PROB(0.0001)) {
|
|
wait(delay(6.0));
|
|
}
|
|
wait(self->queueCommitEnd.whenAtLeast(commitNumber - 1));
|
|
|
|
// Calling check_yield instead of yield to avoid a destruction ordering problem in simulation
|
|
if (g_network->check_yield(g_network->getCurrentTask())) {
|
|
wait(delay(0, g_network->getCurrentTask()));
|
|
}
|
|
|
|
ASSERT(ver > logData->queueCommittedVersion.get());
|
|
|
|
logData->durableKnownCommittedVersion = knownCommittedVersion;
|
|
if (logData->unpoppedRecoveredTags == 0 && knownCommittedVersion >= logData->recoveredAt &&
|
|
logData->recoveryComplete.canBeSet()) {
|
|
TraceEvent("TLogRecoveryComplete", logData->logId)
|
|
.detail("Tags", logData->unpoppedRecoveredTags)
|
|
.detail("DurableKCVer", logData->durableKnownCommittedVersion)
|
|
.detail("RecoveredAt", logData->recoveredAt);
|
|
logData->recoveryComplete.send(Void());
|
|
}
|
|
|
|
//TraceEvent("TLogCommitDurable", self->dbgid).detail("Version", ver);
|
|
if (logData->logSystem->get() &&
|
|
(!logData->isPrimary || logData->logRouterPoppedVersion < logData->logRouterPopToVersion)) {
|
|
logData->logRouterPoppedVersion = ver;
|
|
logData->logSystem->get()->pop(ver, logData->remoteTag, knownCommittedVersion, logData->locality);
|
|
}
|
|
|
|
logData->queueCommittedVersion.set(ver);
|
|
self->queueCommitEnd.set(commitNumber);
|
|
|
|
for (auto& it : missingFinalCommit) {
|
|
TraceEvent("TLogCommitMissingFinalCommit", self->dbgid)
|
|
.detail("LogId", logData->logId)
|
|
.detail("Version", it->version.get())
|
|
.detail("QueueVer", it->queueCommittedVersion.get());
|
|
TEST(true); // A TLog was replaced before having a chance to commit its queue
|
|
it->queueCommittedVersion.set(it->version.get());
|
|
}
|
|
return Void();
|
|
}
|
|
|
|
ACTOR Future<Void> commitQueue(TLogData* self) {
|
|
state Reference<LogData> logData;
|
|
state std::vector<Reference<LogData>> missingFinalCommit;
|
|
|
|
loop {
|
|
int foundCount = 0;
|
|
for (auto it : self->id_data) {
|
|
if (!it.second->stopped) {
|
|
logData = it.second;
|
|
foundCount++;
|
|
} else if (it.second->version.get() >
|
|
std::max(it.second->queueCommittingVersion, it.second->queueCommittedVersion.get())) {
|
|
missingFinalCommit.push_back(it.second);
|
|
}
|
|
}
|
|
|
|
ASSERT(foundCount < 2);
|
|
if (!foundCount) {
|
|
wait(self->newLogData.onTrigger());
|
|
continue;
|
|
}
|
|
|
|
TraceEvent("CommitQueueNewLog", self->dbgid)
|
|
.detail("LogId", logData->logId)
|
|
.detail("Version", logData->version.get())
|
|
.detail("Committing", logData->queueCommittingVersion)
|
|
.detail("Commmitted", logData->queueCommittedVersion.get());
|
|
if (logData->committingQueue.canBeSet()) {
|
|
logData->committingQueue.send(Void());
|
|
}
|
|
|
|
loop {
|
|
if (logData->stopped && logData->version.get() == std::max(logData->queueCommittingVersion,
|
|
logData->queueCommittedVersion.get())) {
|
|
wait(logData->queueCommittedVersion.whenAtLeast(logData->version.get()));
|
|
break;
|
|
}
|
|
|
|
choose {
|
|
when(wait(logData->version.whenAtLeast(
|
|
std::max(logData->queueCommittingVersion, logData->queueCommittedVersion.get()) + 1))) {
|
|
while (self->queueCommitBegin != self->queueCommitEnd.get() &&
|
|
!self->largeDiskQueueCommitBytes.get()) {
|
|
wait(self->queueCommitEnd.whenAtLeast(self->queueCommitBegin) ||
|
|
self->largeDiskQueueCommitBytes.onChange());
|
|
}
|
|
if (logData->queueCommittedVersion.get() == std::numeric_limits<Version>::max()) {
|
|
break;
|
|
}
|
|
self->sharedActors.send(doQueueCommit(self, logData, missingFinalCommit));
|
|
missingFinalCommit.clear();
|
|
}
|
|
when(wait(self->newLogData.onTrigger())) {}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
ACTOR Future<Void> tLogCommit(TLogData* self,
|
|
TLogCommitRequest req,
|
|
Reference<LogData> logData,
|
|
PromiseStream<Void> warningCollectorInput) {
|
|
state Span span("TLog:tLogCommit"_loc, req.spanContext);
|
|
state Optional<UID> tlogDebugID;
|
|
if (req.debugID.present()) {
|
|
tlogDebugID = nondeterministicRandom()->randomUniqueID();
|
|
g_traceBatch.addAttach("CommitAttachID", req.debugID.get().first(), tlogDebugID.get().first());
|
|
g_traceBatch.addEvent("CommitDebug", tlogDebugID.get().first(), "TLog.tLogCommit.BeforeWaitForVersion");
|
|
}
|
|
|
|
if (req.prevVersion == logData->recoveredAt) {
|
|
logData->recoveryTxnVersion = req.version;
|
|
}
|
|
logData->minKnownCommittedVersion = std::max(logData->minKnownCommittedVersion, req.minKnownCommittedVersion);
|
|
|
|
wait(logData->version.whenAtLeast(req.prevVersion));
|
|
|
|
// Calling check_yield instead of yield to avoid a destruction ordering problem in simulation
|
|
if (g_network->check_yield(g_network->getCurrentTask())) {
|
|
wait(delay(0, g_network->getCurrentTask()));
|
|
}
|
|
|
|
state double waitStartT = 0;
|
|
while (self->bytesInput - self->bytesDurable >= SERVER_KNOBS->TLOG_HARD_LIMIT_BYTES && !logData->stopped) {
|
|
if (now() - waitStartT >= 1) {
|
|
TraceEvent(SevWarn, "TLogUpdateLag", logData->logId)
|
|
.detail("Version", logData->version.get())
|
|
.detail("PersistentDataVersion", logData->persistentDataVersion)
|
|
.detail("PersistentDataDurableVersion", logData->persistentDataDurableVersion);
|
|
waitStartT = now();
|
|
}
|
|
wait(delayJittered(.005, TaskPriority::TLogCommit));
|
|
}
|
|
|
|
if (logData->stopped) {
|
|
req.reply.sendError(tlog_stopped());
|
|
return Void();
|
|
}
|
|
|
|
state double beforeCommitT = now();
|
|
|
|
// Not a duplicate (check relies on critical section between here self->version.set() below!)
|
|
state bool isNotDuplicate = (logData->version.get() == req.prevVersion);
|
|
if (isNotDuplicate) {
|
|
if (req.debugID.present())
|
|
g_traceBatch.addEvent("CommitDebug", tlogDebugID.get().first(), "TLog.tLogCommit.Before");
|
|
|
|
//TraceEvent("TLogCommit", logData->logId).detail("Version", req.version);
|
|
commitMessages(self, logData, req.version, req.arena, req.messages);
|
|
|
|
logData->knownCommittedVersion = std::max(logData->knownCommittedVersion, req.knownCommittedVersion);
|
|
|
|
TLogQueueEntryRef qe;
|
|
// Log the changes to the persistent queue, to be committed by commitQueue()
|
|
qe.version = req.version;
|
|
qe.knownCommittedVersion = logData->knownCommittedVersion;
|
|
qe.messages = req.messages;
|
|
qe.id = logData->logId;
|
|
self->persistentQueue->push(qe, logData);
|
|
|
|
self->diskQueueCommitBytes += qe.expectedSize();
|
|
if (self->diskQueueCommitBytes > SERVER_KNOBS->MAX_QUEUE_COMMIT_BYTES) {
|
|
self->largeDiskQueueCommitBytes.set(true);
|
|
}
|
|
// Notifies the commitQueue actor to commit persistentQueue, and also unblocks tLogPeekMessages actors
|
|
logData->version.set(req.version);
|
|
if (logData->recoveryTxnReceived.canBeSet() &&
|
|
(req.prevVersion == 0 || req.prevVersion == logData->recoveredAt)) {
|
|
TraceEvent("TLogInfo", self->dbgid)
|
|
.detail("Log", logData->logId)
|
|
.detail("Prev", req.prevVersion)
|
|
.detail("RecoveredAt", logData->recoveredAt)
|
|
.detail("RecoveryTxnVersion", req.version);
|
|
logData->recoveryTxnReceived.send(Void());
|
|
}
|
|
if (SERVER_KNOBS->ENABLE_VERSION_VECTOR_TLOG_UNICAST) {
|
|
self->unknownCommittedVersions.push_front(std::make_tuple(req.version, req.tLogCount));
|
|
while (!self->unknownCommittedVersions.empty() &&
|
|
std::get<0>(self->unknownCommittedVersions.back()) <= req.knownCommittedVersion) {
|
|
self->unknownCommittedVersions.pop_back();
|
|
}
|
|
}
|
|
|
|
if (req.debugID.present())
|
|
g_traceBatch.addEvent("CommitDebug", tlogDebugID.get().first(), "TLog.tLogCommit.AfterTLogCommit");
|
|
}
|
|
// Send replies only once all prior messages have been received and committed.
|
|
state Future<Void> stopped = logData->stopCommit.onTrigger();
|
|
wait(
|
|
timeoutWarning(logData->queueCommittedVersion.whenAtLeast(req.version) || stopped, 0.1, warningCollectorInput));
|
|
|
|
if (stopped.isReady()) {
|
|
ASSERT(logData->stopped);
|
|
req.reply.sendError(tlog_stopped());
|
|
return Void();
|
|
}
|
|
|
|
if (isNotDuplicate) {
|
|
self->commitLatencyDist->sampleSeconds(now() - beforeCommitT);
|
|
}
|
|
|
|
if (req.debugID.present())
|
|
g_traceBatch.addEvent("CommitDebug", tlogDebugID.get().first(), "TLog.tLogCommit.After");
|
|
|
|
req.reply.send(logData->durableKnownCommittedVersion);
|
|
return Void();
|
|
}
|
|
|
|
ACTOR Future<Void> initPersistentState(TLogData* self, Reference<LogData> logData) {
|
|
wait(self->persistentDataCommitLock.take());
|
|
state FlowLock::Releaser commitLockReleaser(self->persistentDataCommitLock);
|
|
|
|
// PERSIST: Initial setup of persistentData for a brand new tLog for a new database
|
|
state IKeyValueStore* storage = self->persistentData;
|
|
wait(ioTimeoutError(storage->init(), SERVER_KNOBS->TLOG_MAX_CREATE_DURATION));
|
|
storage->set(persistFormat);
|
|
storage->set(
|
|
KeyValueRef(BinaryWriter::toValue(logData->logId, Unversioned()).withPrefix(persistCurrentVersionKeys.begin),
|
|
BinaryWriter::toValue(logData->version.get(), Unversioned())));
|
|
storage->set(KeyValueRef(
|
|
BinaryWriter::toValue(logData->logId, Unversioned()).withPrefix(persistKnownCommittedVersionKeys.begin),
|
|
BinaryWriter::toValue(logData->knownCommittedVersion, Unversioned())));
|
|
storage->set(KeyValueRef(BinaryWriter::toValue(logData->logId, Unversioned()).withPrefix(persistLocalityKeys.begin),
|
|
BinaryWriter::toValue(logData->locality, Unversioned())));
|
|
storage->set(
|
|
KeyValueRef(BinaryWriter::toValue(logData->logId, Unversioned()).withPrefix(persistLogRouterTagsKeys.begin),
|
|
BinaryWriter::toValue(logData->logRouterTags, Unversioned())));
|
|
storage->set(KeyValueRef(BinaryWriter::toValue(logData->logId, Unversioned()).withPrefix(persistTxsTagsKeys.begin),
|
|
BinaryWriter::toValue(logData->txsTags, Unversioned())));
|
|
storage->set(
|
|
KeyValueRef(BinaryWriter::toValue(logData->logId, Unversioned()).withPrefix(persistRecoveryCountKeys.begin),
|
|
BinaryWriter::toValue(logData->recoveryCount, Unversioned())));
|
|
storage->set(
|
|
KeyValueRef(BinaryWriter::toValue(logData->logId, Unversioned()).withPrefix(persistProtocolVersionKeys.begin),
|
|
BinaryWriter::toValue(logData->protocolVersion, Unversioned())));
|
|
storage->set(
|
|
KeyValueRef(BinaryWriter::toValue(logData->logId, Unversioned()).withPrefix(persistTLogSpillTypeKeys.begin),
|
|
BinaryWriter::toValue(logData->logSpillType, AssumeVersion(logData->protocolVersion))));
|
|
|
|
for (auto tag : logData->allTags) {
|
|
ASSERT(!logData->getTagData(tag));
|
|
logData->createTagData(tag, 0, true, true, true);
|
|
updatePersistentPopped(self, logData, logData->getTagData(tag));
|
|
}
|
|
|
|
TraceEvent("TLogInitCommit", logData->logId).log();
|
|
wait(ioTimeoutError(self->persistentData->commit(), SERVER_KNOBS->TLOG_MAX_CREATE_DURATION));
|
|
return Void();
|
|
}
|
|
|
|
ACTOR Future<UID> getClusterId(TLogData* self) {
|
|
state ReadYourWritesTransaction tr(self->cx);
|
|
loop {
|
|
try {
|
|
tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
|
|
tr.setOption(FDBTransactionOptions::LOCK_AWARE);
|
|
Optional<Value> clusterId = wait(tr.get(clusterIdKey));
|
|
if (clusterId.present()) {
|
|
return BinaryReader::fromStringRef<UID>(clusterId.get(), Unversioned());
|
|
} else {
|
|
return UID();
|
|
}
|
|
} catch (Error& e) {
|
|
wait(tr.onError(e));
|
|
}
|
|
}
|
|
}
|
|
|
|
ACTOR Future<Void> rejoinClusterController(TLogData* self,
|
|
TLogInterface tli,
|
|
DBRecoveryCount recoveryCount,
|
|
Future<Void> registerWithCC,
|
|
bool isPrimary) {
|
|
state LifetimeToken lastMasterLifetime;
|
|
loop {
|
|
auto const& inf = self->dbInfo->get();
|
|
bool isDisplaced =
|
|
!std::count(inf.priorCommittedLogServers.begin(), inf.priorCommittedLogServers.end(), tli.id());
|
|
if (isPrimary) {
|
|
isDisplaced =
|
|
isDisplaced && inf.recoveryCount >= recoveryCount && inf.recoveryState != RecoveryState::UNINITIALIZED;
|
|
} else {
|
|
isDisplaced = isDisplaced &&
|
|
((inf.recoveryCount > recoveryCount && inf.recoveryState != RecoveryState::UNINITIALIZED) ||
|
|
(inf.recoveryCount == recoveryCount && inf.recoveryState == RecoveryState::FULLY_RECOVERED));
|
|
}
|
|
isDisplaced = isDisplaced && !inf.logSystemConfig.hasTLog(tli.id());
|
|
if (isDisplaced) {
|
|
state TraceEvent ev("TLogDisplaced", tli.id());
|
|
ev.detail("Reason", "DBInfoDoesNotContain")
|
|
.detail("RecoveryCount", recoveryCount)
|
|
.detail("InfRecoveryCount", inf.recoveryCount)
|
|
.detail("RecoveryState", (int)inf.recoveryState)
|
|
.detail("LogSysConf", describe(inf.logSystemConfig.tLogs))
|
|
.detail("PriorLogs", describe(inf.priorCommittedLogServers))
|
|
.detail("OldLogGens", inf.logSystemConfig.oldTLogs.size());
|
|
// Read and cache cluster ID before displacing this tlog. We want
|
|
// to avoid removing the tlogs data if it has joined a new cluster
|
|
// with a different cluster ID.
|
|
state UID clusterId = wait(getClusterId(self));
|
|
ASSERT(clusterId.isValid());
|
|
self->ccClusterId = clusterId;
|
|
ev.detail("ClusterId", clusterId).detail("SelfClusterId", self->durableClusterId);
|
|
if (BUGGIFY)
|
|
wait(delay(SERVER_KNOBS->BUGGIFY_WORKER_REMOVED_MAX_LAG * deterministicRandom()->random01()));
|
|
throw worker_removed();
|
|
}
|
|
|
|
if (registerWithCC.isReady()) {
|
|
if (!lastMasterLifetime.isEqual(self->dbInfo->get().masterLifetime)) {
|
|
// The TLogRejoinRequest is needed to establish communications with a new master, which doesn't have our
|
|
// TLogInterface
|
|
TLogRejoinRequest req(tli);
|
|
TraceEvent("TLogRejoining", tli.id())
|
|
.detail("ClusterController", self->dbInfo->get().clusterInterface.id())
|
|
.detail("DbInfoMasterLifeTime", self->dbInfo->get().masterLifetime.toString())
|
|
.detail("LastMasterLifeTime", lastMasterLifetime.toString());
|
|
choose {
|
|
when(TLogRejoinReply rep = wait(
|
|
brokenPromiseToNever(self->dbInfo->get().clusterInterface.tlogRejoin.getReply(req)))) {
|
|
if (rep.masterIsRecovered)
|
|
lastMasterLifetime = self->dbInfo->get().masterLifetime;
|
|
}
|
|
when(wait(self->dbInfo->onChange())) {}
|
|
}
|
|
} else {
|
|
wait(self->dbInfo->onChange());
|
|
}
|
|
} else {
|
|
wait(registerWithCC || self->dbInfo->onChange());
|
|
}
|
|
}
|
|
}
|
|
|
|
ACTOR Future<Void> respondToRecovered(TLogInterface tli, Promise<Void> recoveryComplete) {
|
|
state bool finishedRecovery = true;
|
|
try {
|
|
wait(recoveryComplete.getFuture());
|
|
} catch (Error& e) {
|
|
if (e.code() != error_code_end_of_stream) {
|
|
throw;
|
|
}
|
|
finishedRecovery = false;
|
|
}
|
|
TraceEvent("TLogRespondToRecovered", tli.id()).detail("Finished", finishedRecovery);
|
|
loop {
|
|
TLogRecoveryFinishedRequest req = waitNext(tli.recoveryFinished.getFuture());
|
|
if (finishedRecovery) {
|
|
req.reply.send(Void());
|
|
} else {
|
|
req.reply.send(Never());
|
|
}
|
|
}
|
|
}
|
|
|
|
ACTOR Future<Void> cleanupPeekTrackers(LogData* logData) {
|
|
loop {
|
|
double minTimeUntilExpiration = SERVER_KNOBS->PEEK_TRACKER_EXPIRATION_TIME;
|
|
auto it = logData->peekTracker.begin();
|
|
while (it != logData->peekTracker.end()) {
|
|
double timeUntilExpiration = it->second.lastUpdate + SERVER_KNOBS->PEEK_TRACKER_EXPIRATION_TIME - now();
|
|
if (timeUntilExpiration < 1.0e-6) {
|
|
for (auto seq : it->second.sequence_version) {
|
|
if (!seq.second.isSet()) {
|
|
seq.second.sendError(timed_out());
|
|
}
|
|
}
|
|
it = logData->peekTracker.erase(it);
|
|
} else {
|
|
minTimeUntilExpiration = std::min(minTimeUntilExpiration, timeUntilExpiration);
|
|
++it;
|
|
}
|
|
}
|
|
|
|
wait(delay(minTimeUntilExpiration));
|
|
}
|
|
}
|
|
|
|
ACTOR Future<Void> logPeekTrackers(LogData* logData) {
|
|
loop {
|
|
int64_t logThreshold = 1;
|
|
if (logData->peekTracker.size() > SERVER_KNOBS->PEEK_LOGGING_AMOUNT) {
|
|
std::vector<int64_t> peekCounts;
|
|
peekCounts.reserve(logData->peekTracker.size());
|
|
for (auto& it : logData->peekTracker) {
|
|
peekCounts.push_back(it.second.totalPeeks);
|
|
}
|
|
size_t pivot = peekCounts.size() - SERVER_KNOBS->PEEK_LOGGING_AMOUNT;
|
|
std::nth_element(peekCounts.begin(), peekCounts.begin() + pivot, peekCounts.end());
|
|
logThreshold = std::max<int64_t>(1, peekCounts[pivot]);
|
|
}
|
|
int logCount = 0;
|
|
for (auto& it : logData->peekTracker) {
|
|
if (it.second.totalPeeks >= logThreshold) {
|
|
logCount++;
|
|
TraceEvent("PeekMetrics", logData->logId)
|
|
.detail("Tag", it.second.tag.toString())
|
|
.detail("Elapsed", now() - it.second.lastLogged)
|
|
.detail("MeanReplyBytes", it.second.replyBytes / it.second.totalPeeks)
|
|
.detail("TotalPeeks", it.second.totalPeeks)
|
|
.detail("UnblockedPeeks", it.second.unblockedPeeks)
|
|
.detail("DuplicatePeeks", it.second.duplicatePeeks)
|
|
.detail("Sequence",
|
|
it.second.sequence_version.size() ? it.second.sequence_version.begin()->first : -1)
|
|
.detail("IdleSeconds", it.second.idleTime)
|
|
.detail("IdleMax", it.second.idleMax)
|
|
.detail("QueueSeconds", it.second.queueTime)
|
|
.detail("QueueMax", it.second.queueMax)
|
|
.detail("BlockSeconds", it.second.blockTime)
|
|
.detail("BlockMax", it.second.blockMax)
|
|
.detail("WorkSeconds", it.second.workTime)
|
|
.detail("WorkMax", it.second.workMax);
|
|
it.second.resetMetrics();
|
|
}
|
|
}
|
|
|
|
wait(delay(SERVER_KNOBS->PEEK_LOGGING_DELAY * std::max(1, logCount)));
|
|
}
|
|
}
|
|
|
|
void getQueuingMetrics(TLogData* self, Reference<LogData> logData, TLogQueuingMetricsRequest const& req) {
|
|
TLogQueuingMetricsReply reply;
|
|
reply.localTime = now();
|
|
reply.instanceID = self->instanceID;
|
|
reply.bytesInput = self->bytesInput;
|
|
reply.bytesDurable = self->bytesDurable;
|
|
reply.storageBytes = self->persistentData->getStorageBytes();
|
|
// FIXME: Add the knownCommittedVersion to this message and change ratekeeper to use that version.
|
|
reply.v = logData->durableKnownCommittedVersion;
|
|
req.reply.send(reply);
|
|
}
|
|
|
|
ACTOR Future<Void> tLogSnapCreate(TLogSnapRequest snapReq, TLogData* self, Reference<LogData> logData) {
|
|
if (self->ignorePopUid != snapReq.snapUID.toString()) {
|
|
snapReq.reply.sendError(operation_failed());
|
|
return Void();
|
|
}
|
|
ExecCmdValueString snapArg(snapReq.snapPayload);
|
|
try {
|
|
int err = wait(execHelper(&snapArg, snapReq.snapUID, self->dataFolder, snapReq.role.toString()));
|
|
|
|
std::string uidStr = snapReq.snapUID.toString();
|
|
TraceEvent("ExecTraceTLog")
|
|
.detail("Uid", uidStr)
|
|
.detail("Status", err)
|
|
.detail("Role", snapReq.role)
|
|
.detail("Value", self->dataFolder)
|
|
.detail("ExecPayload", snapReq.snapPayload)
|
|
.detail("PersistentDataVersion", logData->persistentDataVersion)
|
|
.detail("PersistentDatadurableVersion", logData->persistentDataDurableVersion)
|
|
.detail("QueueCommittedVersion", logData->queueCommittedVersion.get())
|
|
.detail("Version", logData->version.get());
|
|
|
|
if (err != 0) {
|
|
throw operation_failed();
|
|
}
|
|
snapReq.reply.send(Void());
|
|
} catch (Error& e) {
|
|
TraceEvent("TLogExecHelperError").errorUnsuppressed(e);
|
|
if (e.code() != error_code_operation_cancelled) {
|
|
snapReq.reply.sendError(e);
|
|
} else {
|
|
throw e;
|
|
}
|
|
}
|
|
return Void();
|
|
}
|
|
|
|
ACTOR Future<Void> tLogEnablePopReq(TLogEnablePopRequest enablePopReq, TLogData* self, Reference<LogData> logData) {
|
|
if (self->ignorePopUid != enablePopReq.snapUID.toString()) {
|
|
TraceEvent(SevWarn, "TLogPopDisableEnableUidMismatch")
|
|
.detail("IgnorePopUid", self->ignorePopUid)
|
|
.detail("UidStr", enablePopReq.snapUID.toString());
|
|
enablePopReq.reply.sendError(operation_failed());
|
|
return Void();
|
|
}
|
|
TraceEvent("EnableTLogPlayAllIgnoredPops2", logData->logId)
|
|
.detail("UidStr", enablePopReq.snapUID.toString())
|
|
.detail("IgnorePopUid", self->ignorePopUid)
|
|
.detail("IgnorePopDeadline", self->ignorePopDeadline)
|
|
.detail("PersistentDataVersion", logData->persistentDataVersion)
|
|
.detail("PersistentDataDurableVersion", logData->persistentDataDurableVersion)
|
|
.detail("QueueCommittedVersion", logData->queueCommittedVersion.get())
|
|
.detail("Version", logData->version.get());
|
|
self->ignorePopDeadline = 0;
|
|
wait(processPopRequests(self, logData));
|
|
enablePopReq.reply.send(Void());
|
|
return Void();
|
|
}
|
|
|
|
ACTOR Future<Void> serveTLogInterface(TLogData* self,
|
|
TLogInterface tli,
|
|
Reference<LogData> logData,
|
|
PromiseStream<Void> warningCollectorInput) {
|
|
state Future<Void> dbInfoChange = Void();
|
|
|
|
loop choose {
|
|
when(wait(dbInfoChange)) {
|
|
dbInfoChange = self->dbInfo->onChange();
|
|
bool found = false;
|
|
if (self->dbInfo->get().recoveryState >= RecoveryState::ACCEPTING_COMMITS) {
|
|
for (auto& logs : self->dbInfo->get().logSystemConfig.tLogs) {
|
|
if (std::count(logs.tLogs.begin(), logs.tLogs.end(), logData->logId)) {
|
|
found = true;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
if (found && self->dbInfo->get().logSystemConfig.recruitmentID == logData->recruitmentID) {
|
|
logData->logSystem->set(ILogSystem::fromServerDBInfo(self->dbgid, self->dbInfo->get()));
|
|
if (!logData->isPrimary) {
|
|
logData->logSystem->get()->pop(logData->logRouterPoppedVersion,
|
|
logData->remoteTag,
|
|
logData->durableKnownCommittedVersion,
|
|
logData->locality);
|
|
}
|
|
|
|
if (!logData->isPrimary && logData->stopped) {
|
|
TraceEvent("TLogAlreadyStopped", self->dbgid).detail("LogId", logData->logId);
|
|
logData->removed = logData->removed && logData->logSystem->get()->endEpoch();
|
|
}
|
|
} else {
|
|
logData->logSystem->set(Reference<ILogSystem>());
|
|
}
|
|
|
|
// Persist cluster ID once cluster has recovered.
|
|
auto ccClusterId = self->dbInfo->get().clusterId;
|
|
if (self->dbInfo->get().recoveryState == RecoveryState::FULLY_RECOVERED &&
|
|
!self->durableClusterId.isValid()) {
|
|
ASSERT(ccClusterId.isValid());
|
|
self->durableClusterId = ccClusterId;
|
|
self->persistentData->set(
|
|
KeyValueRef(persistClusterIdKey, BinaryWriter::toValue(ccClusterId, Unversioned())));
|
|
wait(self->persistentData->commit());
|
|
}
|
|
}
|
|
when(TLogPeekStreamRequest req = waitNext(tli.peekStreamMessages.getFuture())) {
|
|
TraceEvent(SevDebug, "TLogPeekStream", logData->logId)
|
|
.detail("Token", tli.peekStreamMessages.getEndpoint().token);
|
|
logData->addActor.send(tLogPeekStream(self, req, logData));
|
|
}
|
|
when(TLogPeekRequest req = waitNext(tli.peekMessages.getFuture())) {
|
|
logData->addActor.send(tLogPeekMessages(
|
|
req.reply, self, logData, req.begin, req.tag, req.returnIfBlocked, req.onlySpilled, req.sequence));
|
|
}
|
|
when(TLogPopRequest req = waitNext(tli.popMessages.getFuture())) {
|
|
logData->addActor.send(tLogPop(self, req, logData));
|
|
}
|
|
when(TLogCommitRequest req = waitNext(tli.commit.getFuture())) {
|
|
//TraceEvent("TLogCommitReq", logData->logId).detail("Ver", req.version).detail("PrevVer", req.prevVersion).detail("LogVer", logData->version.get());
|
|
ASSERT(logData->isPrimary);
|
|
TEST(logData->stopped); // TLogCommitRequest while stopped
|
|
if (!logData->stopped)
|
|
logData->addActor.send(tLogCommit(self, req, logData, warningCollectorInput));
|
|
else
|
|
req.reply.sendError(tlog_stopped());
|
|
}
|
|
when(ReplyPromise<TLogLockResult> reply = waitNext(tli.lock.getFuture())) {
|
|
logData->addActor.send(tLogLock(self, reply, logData));
|
|
}
|
|
when(TLogQueuingMetricsRequest req = waitNext(tli.getQueuingMetrics.getFuture())) {
|
|
getQueuingMetrics(self, logData, req);
|
|
}
|
|
when(TLogConfirmRunningRequest req = waitNext(tli.confirmRunning.getFuture())) {
|
|
if (req.debugID.present()) {
|
|
UID tlogDebugID = nondeterministicRandom()->randomUniqueID();
|
|
g_traceBatch.addAttach("TransactionAttachID", req.debugID.get().first(), tlogDebugID.first());
|
|
g_traceBatch.addEvent("TransactionDebug", tlogDebugID.first(), "TLogServer.TLogConfirmRunningRequest");
|
|
}
|
|
if (!logData->stopped)
|
|
req.reply.send(Void());
|
|
else
|
|
req.reply.sendError(tlog_stopped());
|
|
}
|
|
when(TLogDisablePopRequest req = waitNext(tli.disablePopRequest.getFuture())) {
|
|
if (self->ignorePopUid != "") {
|
|
TraceEvent(SevWarn, "TLogPopDisableonDisable")
|
|
.detail("IgnorePopUid", self->ignorePopUid)
|
|
.detail("UidStr", req.snapUID.toString())
|
|
.detail("PersistentDataVersion", logData->persistentDataVersion)
|
|
.detail("PersistentDatadurableVersion", logData->persistentDataDurableVersion)
|
|
.detail("QueueCommittedVersion", logData->queueCommittedVersion.get())
|
|
.detail("Version", logData->version.get());
|
|
req.reply.sendError(operation_failed());
|
|
} else {
|
|
// FIXME: As part of reverting snapshot V1, make ignorePopUid a UID instead of string
|
|
self->ignorePopUid = req.snapUID.toString();
|
|
self->ignorePopDeadline = now() + SERVER_KNOBS->TLOG_IGNORE_POP_AUTO_ENABLE_DELAY;
|
|
req.reply.send(Void());
|
|
}
|
|
}
|
|
when(TLogEnablePopRequest enablePopReq = waitNext(tli.enablePopRequest.getFuture())) {
|
|
logData->addActor.send(tLogEnablePopReq(enablePopReq, self, logData));
|
|
}
|
|
when(TLogSnapRequest snapReq = waitNext(tli.snapRequest.getFuture())) {
|
|
logData->addActor.send(tLogSnapCreate(snapReq, self, logData));
|
|
}
|
|
}
|
|
}
|
|
|
|
void removeLog(TLogData* self, Reference<LogData> logData) {
|
|
TraceEvent("TLogRemoved", self->dbgid)
|
|
.detail("LogId", logData->logId)
|
|
.detail("Input", logData->bytesInput.getValue())
|
|
.detail("Durable", logData->bytesDurable.getValue());
|
|
logData->stopped = true;
|
|
logData->unblockWaitingPeeks();
|
|
if (!logData->recoveryComplete.isSet()) {
|
|
logData->recoveryComplete.sendError(end_of_stream());
|
|
}
|
|
|
|
logData->addActor = PromiseStream<Future<Void>>(); // there could be items still in the promise stream if one of the
|
|
// actors threw an error immediately
|
|
self->id_data.erase(logData->logId);
|
|
|
|
while (self->popOrder.size() && !self->id_data.count(self->popOrder.front())) {
|
|
self->popOrder.pop_front();
|
|
}
|
|
|
|
if (self->id_data.size() == 0) {
|
|
throw worker_removed();
|
|
}
|
|
if (logData->queueCommittingVersion == 0) {
|
|
// If the removed tlog never attempted a queue commit, the update storage loop could become stuck waiting for
|
|
// queueCommittedVersion to advance.
|
|
logData->queueCommittedVersion.set(std::numeric_limits<Version>::max());
|
|
}
|
|
}
|
|
|
|
// remote tLog pull data from log routers
|
|
ACTOR Future<Void> pullAsyncData(TLogData* self,
|
|
Reference<LogData> logData,
|
|
std::vector<Tag> tags,
|
|
Version beginVersion,
|
|
Optional<Version> endVersion,
|
|
bool poppedIsKnownCommitted) {
|
|
state Future<Void> dbInfoChange = Void();
|
|
state Reference<ILogSystem::IPeekCursor> r;
|
|
state Version tagAt = beginVersion;
|
|
state Version lastVer = 0;
|
|
|
|
if (endVersion.present()) {
|
|
TraceEvent("TLogRestoreReplicationFactor", self->dbgid)
|
|
.detail("LogId", logData->logId)
|
|
.detail("Locality", logData->locality)
|
|
.detail("RecoverFrom", beginVersion)
|
|
.detail("RecoverTo", endVersion.get());
|
|
}
|
|
|
|
while (!endVersion.present() || logData->version.get() < endVersion.get()) {
|
|
loop {
|
|
choose {
|
|
when(wait(r ? r->getMore(TaskPriority::TLogCommit) : Never())) { break; }
|
|
when(wait(dbInfoChange)) {
|
|
if (logData->logSystem->get()) {
|
|
r = logData->logSystem->get()->peek(logData->logId, tagAt, endVersion, tags, true);
|
|
} else {
|
|
r = Reference<ILogSystem::IPeekCursor>();
|
|
}
|
|
dbInfoChange = logData->logSystem->onChange();
|
|
}
|
|
}
|
|
}
|
|
|
|
state double waitStartT = 0;
|
|
while (self->bytesInput - self->bytesDurable >= SERVER_KNOBS->TLOG_HARD_LIMIT_BYTES && !logData->stopped) {
|
|
if (now() - waitStartT >= 1) {
|
|
TraceEvent(SevWarn, "TLogUpdateLag", logData->logId)
|
|
.detail("Version", logData->version.get())
|
|
.detail("PersistentDataVersion", logData->persistentDataVersion)
|
|
.detail("PersistentDataDurableVersion", logData->persistentDataDurableVersion);
|
|
waitStartT = now();
|
|
}
|
|
wait(delayJittered(.005, TaskPriority::TLogCommit));
|
|
}
|
|
|
|
state Version ver = 0;
|
|
state std::vector<TagsAndMessage> messages;
|
|
state bool pullingRecoveryData = endVersion.present() && endVersion.get() == logData->recoveredAt;
|
|
loop {
|
|
state bool foundMessage = r->hasMessage();
|
|
if (!foundMessage || r->version().version != ver) {
|
|
ASSERT(r->version().version > lastVer);
|
|
if (ver) {
|
|
if (logData->stopped || (endVersion.present() && ver > endVersion.get())) {
|
|
return Void();
|
|
}
|
|
|
|
if (poppedIsKnownCommitted) {
|
|
logData->knownCommittedVersion = std::max(logData->knownCommittedVersion, r->popped());
|
|
logData->minKnownCommittedVersion =
|
|
std::max(logData->minKnownCommittedVersion, r->getMinKnownCommittedVersion());
|
|
}
|
|
|
|
commitMessages(self, logData, ver, messages);
|
|
|
|
if (self->terminated.isSet()) {
|
|
return Void();
|
|
}
|
|
|
|
// Log the changes to the persistent queue, to be committed by commitQueue()
|
|
AlternativeTLogQueueEntryRef qe;
|
|
qe.version = ver;
|
|
qe.knownCommittedVersion = logData->knownCommittedVersion;
|
|
qe.alternativeMessages = &messages;
|
|
qe.id = logData->logId;
|
|
self->persistentQueue->push(qe, logData);
|
|
|
|
self->diskQueueCommitBytes += qe.expectedSize();
|
|
if (self->diskQueueCommitBytes > SERVER_KNOBS->MAX_QUEUE_COMMIT_BYTES) {
|
|
self->largeDiskQueueCommitBytes.set(true);
|
|
}
|
|
|
|
// Notifies the commitQueue actor to commit persistentQueue, and also unblocks tLogPeekMessages
|
|
// actors
|
|
logData->version.set(ver);
|
|
if (logData->recoveryTxnReceived.canBeSet() && !pullingRecoveryData && ver > logData->recoveredAt) {
|
|
TraceEvent("TLogInfo", self->dbgid)
|
|
.detail("Log", logData->logId)
|
|
.detail("RecoveredAt", logData->recoveredAt)
|
|
.detail("RecoveryTxnVersion", ver);
|
|
logData->recoveryTxnReceived.send(Void());
|
|
}
|
|
wait(yield(TaskPriority::TLogCommit));
|
|
}
|
|
lastVer = ver;
|
|
ver = r->version().version;
|
|
messages.clear();
|
|
|
|
if (!foundMessage) {
|
|
ver--;
|
|
if (ver > logData->version.get()) {
|
|
if (logData->stopped || (endVersion.present() && ver > endVersion.get())) {
|
|
return Void();
|
|
}
|
|
|
|
if (poppedIsKnownCommitted) {
|
|
logData->knownCommittedVersion = std::max(logData->knownCommittedVersion, r->popped());
|
|
logData->minKnownCommittedVersion =
|
|
std::max(logData->minKnownCommittedVersion, r->getMinKnownCommittedVersion());
|
|
}
|
|
|
|
if (self->terminated.isSet()) {
|
|
return Void();
|
|
}
|
|
|
|
// Log the changes to the persistent queue, to be committed by commitQueue()
|
|
TLogQueueEntryRef qe;
|
|
qe.version = ver;
|
|
qe.knownCommittedVersion = logData->knownCommittedVersion;
|
|
qe.messages = StringRef();
|
|
qe.id = logData->logId;
|
|
self->persistentQueue->push(qe, logData);
|
|
|
|
self->diskQueueCommitBytes += qe.expectedSize();
|
|
if (self->diskQueueCommitBytes > SERVER_KNOBS->MAX_QUEUE_COMMIT_BYTES) {
|
|
self->largeDiskQueueCommitBytes.set(true);
|
|
}
|
|
|
|
// Notifies the commitQueue actor to commit persistentQueue, and also unblocks tLogPeekMessages
|
|
// actors
|
|
logData->version.set(ver);
|
|
wait(yield(TaskPriority::TLogCommit));
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
|
|
messages.emplace_back(r->getMessageWithTags(), r->getTags());
|
|
r->nextMessage();
|
|
}
|
|
|
|
tagAt = std::max(r->version().version, logData->version.get() + 1);
|
|
}
|
|
return Void();
|
|
}
|
|
|
|
ACTOR Future<Void> tLogCore(TLogData* self,
|
|
Reference<LogData> logData,
|
|
TLogInterface tli,
|
|
bool pulledRecoveryVersions) {
|
|
if (logData->removed.isReady()) {
|
|
wait(delay(0)); // to avoid iterator invalidation in restorePersistentState when removed is already ready
|
|
ASSERT(logData->removed.isError());
|
|
|
|
if (logData->removed.getError().code() != error_code_worker_removed) {
|
|
throw logData->removed.getError();
|
|
}
|
|
|
|
removeLog(self, logData);
|
|
return Void();
|
|
}
|
|
|
|
state PromiseStream<Void> warningCollectorInput;
|
|
state Future<Void> warningCollector =
|
|
timeoutWarningCollector(warningCollectorInput.getFuture(), 1.0, "TLogQueueCommitSlow", self->dbgid);
|
|
state Future<Void> error = actorCollection(logData->addActor.getFuture());
|
|
|
|
logData->addActor.send(waitFailureServer(tli.waitFailure.getFuture()));
|
|
logData->addActor.send(logData->removed);
|
|
// FIXME: update tlogMetrics to include new information, or possibly only have one copy for the shared instance
|
|
logData->addActor.send(traceCounters("TLogMetrics",
|
|
logData->logId,
|
|
SERVER_KNOBS->STORAGE_LOGGING_DELAY,
|
|
&logData->cc,
|
|
logData->logId.toString() + "/TLogMetrics",
|
|
[self = self](TraceEvent& te) {
|
|
StorageBytes sbTlog = self->persistentData->getStorageBytes();
|
|
te.detail("KvstoreBytesUsed", sbTlog.used);
|
|
te.detail("KvstoreBytesFree", sbTlog.free);
|
|
te.detail("KvstoreBytesAvailable", sbTlog.available);
|
|
te.detail("KvstoreBytesTotal", sbTlog.total);
|
|
te.detail("KvstoreBytesTemp", sbTlog.temp);
|
|
|
|
StorageBytes sbQueue = self->rawPersistentQueue->getStorageBytes();
|
|
te.detail("QueueDiskBytesUsed", sbQueue.used);
|
|
te.detail("QueueDiskBytesFree", sbQueue.free);
|
|
te.detail("QueueDiskBytesAvailable", sbQueue.available);
|
|
te.detail("QueueDiskBytesTotal", sbQueue.total);
|
|
te.detail("QueueDiskBytesTemp", sbQueue.temp);
|
|
}));
|
|
|
|
logData->addActor.send(serveTLogInterface(self, tli, logData, warningCollectorInput));
|
|
logData->addActor.send(cleanupPeekTrackers(logData.getPtr()));
|
|
logData->addActor.send(logPeekTrackers(logData.getPtr()));
|
|
|
|
if (!logData->isPrimary) {
|
|
std::vector<Tag> tags;
|
|
tags.push_back(logData->remoteTag);
|
|
logData->addActor.send(
|
|
pullAsyncData(self,
|
|
logData,
|
|
tags,
|
|
pulledRecoveryVersions ? logData->recoveredAt + 1 : logData->unrecoveredBefore,
|
|
Optional<Version>(),
|
|
true));
|
|
}
|
|
|
|
try {
|
|
wait(error);
|
|
throw internal_error();
|
|
} catch (Error& e) {
|
|
if (e.code() != error_code_worker_removed)
|
|
throw;
|
|
|
|
removeLog(self, logData);
|
|
return Void();
|
|
}
|
|
}
|
|
|
|
ACTOR Future<Void> checkEmptyQueue(TLogData* self) {
|
|
TraceEvent("TLogCheckEmptyQueueBegin", self->dbgid).log();
|
|
try {
|
|
bool recoveryFinished = wait(self->persistentQueue->initializeRecovery(0));
|
|
if (recoveryFinished)
|
|
return Void();
|
|
TLogQueueEntry r = wait(self->persistentQueue->readNext(self));
|
|
throw internal_error();
|
|
} catch (Error& e) {
|
|
if (e.code() != error_code_end_of_stream)
|
|
throw;
|
|
TraceEvent("TLogCheckEmptyQueueEnd", self->dbgid).log();
|
|
return Void();
|
|
}
|
|
}
|
|
|
|
ACTOR Future<Void> checkRecovered(TLogData* self) {
|
|
TraceEvent("TLogCheckRecoveredBegin", self->dbgid).log();
|
|
Optional<Value> v = wait(self->persistentData->readValue(StringRef()));
|
|
TraceEvent("TLogCheckRecoveredEnd", self->dbgid).log();
|
|
return Void();
|
|
}
|
|
|
|
// Recovery persistent state of tLog from disk
|
|
ACTOR Future<Void> restorePersistentState(TLogData* self,
|
|
LocalityData locality,
|
|
Promise<Void> oldLog,
|
|
Promise<Void> recovered,
|
|
PromiseStream<InitializeTLogRequest> tlogRequests) {
|
|
state double startt = now();
|
|
state Reference<LogData> logData;
|
|
state KeyRange tagKeys;
|
|
// PERSIST: Read basic state from persistentData; replay persistentQueue but don't erase it
|
|
|
|
TraceEvent("TLogRestorePersistentState", self->dbgid).log();
|
|
|
|
state IKeyValueStore* storage = self->persistentData;
|
|
wait(storage->init());
|
|
state Future<Optional<Value>> fFormat = storage->readValue(persistFormat.key);
|
|
state Future<Optional<Value>> fRecoveryLocation = storage->readValue(persistRecoveryLocationKey);
|
|
state Future<Optional<Value>> fClusterId = storage->readValue(persistClusterIdKey);
|
|
state Future<RangeResult> fVers = storage->readRange(persistCurrentVersionKeys);
|
|
state Future<RangeResult> fKnownCommitted = storage->readRange(persistKnownCommittedVersionKeys);
|
|
state Future<RangeResult> fLocality = storage->readRange(persistLocalityKeys);
|
|
state Future<RangeResult> fLogRouterTags = storage->readRange(persistLogRouterTagsKeys);
|
|
state Future<RangeResult> fTxsTags = storage->readRange(persistTxsTagsKeys);
|
|
state Future<RangeResult> fRecoverCounts = storage->readRange(persistRecoveryCountKeys);
|
|
state Future<RangeResult> fProtocolVersions = storage->readRange(persistProtocolVersionKeys);
|
|
state Future<RangeResult> fTLogSpillTypes = storage->readRange(persistTLogSpillTypeKeys);
|
|
|
|
// FIXME: metadata in queue?
|
|
|
|
wait(waitForAll(std::vector{ fFormat, fRecoveryLocation, fClusterId }));
|
|
wait(waitForAll(std::vector{ fVers,
|
|
fKnownCommitted,
|
|
fLocality,
|
|
fLogRouterTags,
|
|
fTxsTags,
|
|
fRecoverCounts,
|
|
fProtocolVersions,
|
|
fTLogSpillTypes }));
|
|
|
|
if (fClusterId.get().present()) {
|
|
self->durableClusterId = BinaryReader::fromStringRef<UID>(fClusterId.get().get(), Unversioned());
|
|
}
|
|
|
|
if (fFormat.get().present() && !persistFormatReadableRange.contains(fFormat.get().get())) {
|
|
// FIXME: remove when we no longer need to test upgrades from 4.X releases
|
|
if (g_network->isSimulated()) {
|
|
TraceEvent("ElapsedTime").detail("SimTime", now()).detail("RealTime", 0).detail("RandomUnseed", 0);
|
|
flushAndExit(0);
|
|
}
|
|
|
|
TraceEvent(SevError, "UnsupportedDBFormat", self->dbgid)
|
|
.detail("Format", fFormat.get().get())
|
|
.detail("Expected", persistFormat.value.toString());
|
|
throw worker_recovery_failed();
|
|
}
|
|
|
|
if (!fFormat.get().present()) {
|
|
RangeResult v = wait(self->persistentData->readRange(KeyRangeRef(StringRef(), LiteralStringRef("\xff")), 1));
|
|
if (!v.size()) {
|
|
TEST(true); // The DB is completely empty, so it was never initialized. Delete it.
|
|
throw worker_removed();
|
|
} else {
|
|
// This should never happen
|
|
TraceEvent(SevError, "NoDBFormatKey", self->dbgid).detail("FirstKey", v[0].key);
|
|
ASSERT(false);
|
|
throw worker_recovery_failed();
|
|
}
|
|
}
|
|
|
|
state std::vector<Future<ErrorOr<Void>>> removed;
|
|
|
|
ASSERT(fFormat.get().get() == LiteralStringRef("FoundationDB/LogServer/3/0"));
|
|
|
|
ASSERT(fVers.get().size() == fRecoverCounts.get().size());
|
|
|
|
state std::map<UID, int8_t> id_locality;
|
|
for (auto it : fLocality.get()) {
|
|
id_locality[BinaryReader::fromStringRef<UID>(it.key.removePrefix(persistLocalityKeys.begin), Unversioned())] =
|
|
BinaryReader::fromStringRef<int8_t>(it.value, Unversioned());
|
|
}
|
|
|
|
state std::map<UID, int> id_logRouterTags;
|
|
for (auto it : fLogRouterTags.get()) {
|
|
id_logRouterTags[BinaryReader::fromStringRef<UID>(it.key.removePrefix(persistLogRouterTagsKeys.begin),
|
|
Unversioned())] =
|
|
BinaryReader::fromStringRef<int>(it.value, Unversioned());
|
|
}
|
|
|
|
state std::map<UID, int> id_txsTags;
|
|
for (auto it : fTxsTags.get()) {
|
|
id_txsTags[BinaryReader::fromStringRef<UID>(it.key.removePrefix(persistTxsTagsKeys.begin), Unversioned())] =
|
|
BinaryReader::fromStringRef<int>(it.value, Unversioned());
|
|
}
|
|
|
|
state std::map<UID, Version> id_knownCommitted;
|
|
for (auto it : fKnownCommitted.get()) {
|
|
id_knownCommitted[BinaryReader::fromStringRef<UID>(it.key.removePrefix(persistKnownCommittedVersionKeys.begin),
|
|
Unversioned())] =
|
|
BinaryReader::fromStringRef<Version>(it.value, Unversioned());
|
|
}
|
|
|
|
state IDiskQueue::location minimumRecoveryLocation = 0;
|
|
if (fRecoveryLocation.get().present()) {
|
|
minimumRecoveryLocation =
|
|
BinaryReader::fromStringRef<IDiskQueue::location>(fRecoveryLocation.get().get(), Unversioned());
|
|
}
|
|
|
|
state int idx = 0;
|
|
state Promise<Void> registerWithCC;
|
|
state std::map<UID, TLogInterface> id_interf;
|
|
state std::vector<std::pair<Version, UID>> logsByVersion;
|
|
for (idx = 0; idx < fVers.get().size(); idx++) {
|
|
state KeyRef rawId = fVers.get()[idx].key.removePrefix(persistCurrentVersionKeys.begin);
|
|
UID id1 = BinaryReader::fromStringRef<UID>(rawId, Unversioned());
|
|
UID id2 = BinaryReader::fromStringRef<UID>(
|
|
fRecoverCounts.get()[idx].key.removePrefix(persistRecoveryCountKeys.begin), Unversioned());
|
|
ASSERT(id1 == id2);
|
|
|
|
TLogInterface recruited(id1, self->dbgid, locality);
|
|
recruited.initEndpoints();
|
|
|
|
DUMPTOKEN(recruited.peekMessages);
|
|
DUMPTOKEN(recruited.peekStreamMessages);
|
|
DUMPTOKEN(recruited.popMessages);
|
|
DUMPTOKEN(recruited.commit);
|
|
DUMPTOKEN(recruited.lock);
|
|
DUMPTOKEN(recruited.getQueuingMetrics);
|
|
DUMPTOKEN(recruited.confirmRunning);
|
|
DUMPTOKEN(recruited.waitFailure);
|
|
DUMPTOKEN(recruited.recoveryFinished);
|
|
DUMPTOKEN(recruited.disablePopRequest);
|
|
DUMPTOKEN(recruited.enablePopRequest);
|
|
DUMPTOKEN(recruited.snapRequest);
|
|
|
|
ProtocolVersion protocolVersion =
|
|
BinaryReader::fromStringRef<ProtocolVersion>(fProtocolVersions.get()[idx].value, Unversioned());
|
|
TLogSpillType logSpillType = BinaryReader::fromStringRef<TLogSpillType>(fTLogSpillTypes.get()[idx].value,
|
|
AssumeVersion(protocolVersion));
|
|
|
|
// We do not need the remoteTag, because we will not be loading any additional data
|
|
logData = makeReference<LogData>(self,
|
|
recruited,
|
|
Tag(),
|
|
true,
|
|
id_logRouterTags[id1],
|
|
id_txsTags[id1],
|
|
UID(),
|
|
protocolVersion,
|
|
logSpillType,
|
|
std::vector<Tag>(),
|
|
"Restored");
|
|
logData->locality = id_locality[id1];
|
|
logData->stopped = true;
|
|
logData->unblockWaitingPeeks();
|
|
self->id_data[id1] = logData;
|
|
id_interf[id1] = recruited;
|
|
|
|
logData->knownCommittedVersion = id_knownCommitted[id1];
|
|
Version ver = BinaryReader::fromStringRef<Version>(fVers.get()[idx].value, Unversioned());
|
|
logData->persistentDataVersion = ver;
|
|
logData->persistentDataDurableVersion = ver;
|
|
logData->version.set(ver);
|
|
logData->recoveryCount =
|
|
BinaryReader::fromStringRef<DBRecoveryCount>(fRecoverCounts.get()[idx].value, Unversioned());
|
|
logData->removed =
|
|
rejoinClusterController(self, recruited, logData->recoveryCount, registerWithCC.getFuture(), false);
|
|
removed.push_back(errorOr(logData->removed));
|
|
logsByVersion.emplace_back(ver, id1);
|
|
|
|
TraceEvent("TLogPersistentStateRestore", self->dbgid)
|
|
.detail("LogId", logData->logId)
|
|
.detail("Ver", ver)
|
|
.detail("RecoveryCount", logData->recoveryCount);
|
|
// Restore popped keys. Pop operations that took place after the last (committed) updatePersistentDataVersion
|
|
// might be lost, but that is fine because we will get the corresponding data back, too.
|
|
tagKeys = prefixRange(rawId.withPrefix(persistTagPoppedKeys.begin));
|
|
loop {
|
|
if (logData->removed.isReady())
|
|
break;
|
|
RangeResult data = wait(self->persistentData->readRange(tagKeys, BUGGIFY ? 3 : 1 << 30, 1 << 20));
|
|
if (!data.size())
|
|
break;
|
|
((KeyRangeRef&)tagKeys) = KeyRangeRef(keyAfter(data.back().key, tagKeys.arena()), tagKeys.end);
|
|
|
|
for (auto& kv : data) {
|
|
Tag tag = decodeTagPoppedKey(rawId, kv.key);
|
|
Version popped = decodeTagPoppedValue(kv.value);
|
|
TraceEvent("TLogRestorePopped", logData->logId).detail("Tag", tag.toString()).detail("To", popped);
|
|
auto tagData = logData->getTagData(tag);
|
|
ASSERT(!tagData);
|
|
logData->createTagData(tag, popped, false, false, false);
|
|
logData->getTagData(tag)->persistentPopped = popped;
|
|
}
|
|
}
|
|
}
|
|
|
|
std::sort(logsByVersion.begin(), logsByVersion.end());
|
|
for (const auto& pair : logsByVersion) {
|
|
// TLogs that have been fully spilled won't have queue entries read in the loop below.
|
|
self->popOrder.push_back(pair.second);
|
|
}
|
|
logsByVersion.clear();
|
|
|
|
state Future<Void> allRemoved = waitForAll(removed);
|
|
state UID lastId = UID(1, 1); // initialized so it will not compare equal to a default UID
|
|
state double recoverMemoryLimit = SERVER_KNOBS->TLOG_RECOVER_MEMORY_LIMIT;
|
|
if (BUGGIFY)
|
|
recoverMemoryLimit =
|
|
std::max<double>(SERVER_KNOBS->BUGGIFY_RECOVER_MEMORY_LIMIT, (double)SERVER_KNOBS->TLOG_SPILL_THRESHOLD);
|
|
|
|
try {
|
|
bool recoveryFinished = wait(self->persistentQueue->initializeRecovery(minimumRecoveryLocation));
|
|
if (recoveryFinished)
|
|
throw end_of_stream();
|
|
loop {
|
|
if (allRemoved.isReady()) {
|
|
TEST(true); // all tlogs removed during queue recovery
|
|
throw worker_removed();
|
|
}
|
|
choose {
|
|
when(TLogQueueEntry qe = wait(self->persistentQueue->readNext(self))) {
|
|
if (qe.id != lastId) {
|
|
lastId = qe.id;
|
|
auto it = self->id_data.find(qe.id);
|
|
if (it != self->id_data.end()) {
|
|
logData = it->second;
|
|
} else {
|
|
logData = Reference<LogData>();
|
|
}
|
|
}
|
|
|
|
//TraceEvent("TLogRecoveredQE", self->dbgid).detail("LogId", qe.id).detail("Ver", qe.version).detail("MessageBytes", qe.messages.size()).detail("Tags", qe.tags.size())
|
|
// .detail("Tag0", qe.tags.size() ? qe.tags[0].tag : invalidTag).detail("Version",
|
|
// logData->version.get());
|
|
|
|
if (logData) {
|
|
if (!self->spillOrder.size() || self->spillOrder.back() != qe.id) {
|
|
self->spillOrder.push_back(qe.id);
|
|
}
|
|
logData->knownCommittedVersion =
|
|
std::max(logData->knownCommittedVersion, qe.knownCommittedVersion);
|
|
if (qe.version > logData->version.get()) {
|
|
commitMessages(self, logData, qe.version, qe.arena(), qe.messages);
|
|
logData->version.set(qe.version);
|
|
logData->queueCommittedVersion.set(qe.version);
|
|
|
|
while (self->bytesInput - self->bytesDurable >= recoverMemoryLimit) {
|
|
TEST(true); // Flush excess data during TLog queue recovery
|
|
TraceEvent("FlushLargeQueueDuringRecovery", self->dbgid)
|
|
.detail("LogId", logData->logId)
|
|
.detail("BytesInput", self->bytesInput)
|
|
.detail("BytesDurable", self->bytesDurable)
|
|
.detail("Version", logData->version.get())
|
|
.detail("PVer", logData->persistentDataVersion);
|
|
|
|
choose {
|
|
when(wait(updateStorage(self))) {}
|
|
when(wait(allRemoved)) { throw worker_removed(); }
|
|
}
|
|
}
|
|
} else {
|
|
// Updating persistRecoveryLocation and persistCurrentVersion at the same time,
|
|
// transactionally, should mean that we never read any TLogQueueEntry that has already
|
|
// been spilled.
|
|
ASSERT_WE_THINK(qe.version == logData->version.get());
|
|
}
|
|
}
|
|
}
|
|
when(wait(allRemoved)) { throw worker_removed(); }
|
|
}
|
|
}
|
|
} catch (Error& e) {
|
|
if (e.code() != error_code_end_of_stream)
|
|
throw;
|
|
}
|
|
|
|
TraceEvent("TLogRestorePersistentStateDone", self->dbgid).detail("Took", now() - startt);
|
|
TEST(now() - startt >= 1.0); // TLog recovery took more than 1 second
|
|
|
|
for (auto it : self->id_data) {
|
|
if (it.second->queueCommittedVersion.get() == 0) {
|
|
TraceEvent("TLogZeroVersion", self->dbgid).detail("LogId", it.first);
|
|
it.second->queueCommittedVersion.set(it.second->version.get());
|
|
}
|
|
it.second->recoveryComplete.sendError(end_of_stream());
|
|
self->sharedActors.send(tLogCore(self, it.second, id_interf[it.first], false));
|
|
}
|
|
|
|
if (registerWithCC.canBeSet())
|
|
registerWithCC.send(Void());
|
|
return Void();
|
|
}
|
|
|
|
bool tlogTerminated(TLogData* self, IKeyValueStore* persistentData, TLogQueue* persistentQueue, Error const& e) {
|
|
// Dispose the IKVS (destroying its data permanently) only if this shutdown is definitely permanent. Otherwise just
|
|
// close it.
|
|
if (e.code() == error_code_worker_removed || e.code() == error_code_recruitment_failed) {
|
|
persistentData->dispose();
|
|
persistentQueue->dispose();
|
|
} else {
|
|
persistentData->close();
|
|
persistentQueue->close();
|
|
}
|
|
|
|
if (e.code() == error_code_worker_removed || e.code() == error_code_recruitment_failed ||
|
|
e.code() == error_code_file_not_found || e.code() == error_code_invalid_cluster_id) {
|
|
TraceEvent("TLogTerminated", self->dbgid).errorUnsuppressed(e);
|
|
return true;
|
|
} else
|
|
return false;
|
|
}
|
|
|
|
ACTOR Future<Void> updateLogSystem(TLogData* self,
|
|
Reference<LogData> logData,
|
|
LogSystemConfig recoverFrom,
|
|
Reference<AsyncVar<Reference<ILogSystem>>> logSystem) {
|
|
loop {
|
|
bool found = self->dbInfo->get().logSystemConfig.recruitmentID == logData->recruitmentID;
|
|
if (found) {
|
|
if (self->dbInfo->get().logSystemConfig.isNextGenerationOf(recoverFrom)) {
|
|
logSystem->set(ILogSystem::fromOldLogSystemConfig(
|
|
logData->logId, self->dbInfo->get().myLocality, self->dbInfo->get().logSystemConfig));
|
|
} else if (self->dbInfo->get().logSystemConfig.isEqualIds(recoverFrom)) {
|
|
logSystem->set(ILogSystem::fromLogSystemConfig(
|
|
logData->logId, self->dbInfo->get().myLocality, self->dbInfo->get().logSystemConfig, false, true));
|
|
} else if (self->dbInfo->get().recoveryState >= RecoveryState::ACCEPTING_COMMITS) {
|
|
logSystem->set(ILogSystem::fromLogSystemConfig(
|
|
logData->logId, self->dbInfo->get().myLocality, self->dbInfo->get().logSystemConfig, true));
|
|
} else {
|
|
found = false;
|
|
}
|
|
}
|
|
if (!found) {
|
|
logSystem->set(Reference<ILogSystem>());
|
|
} else {
|
|
logData->logSystem->get()->pop(logData->logRouterPoppedVersion,
|
|
logData->remoteTag,
|
|
logData->durableKnownCommittedVersion,
|
|
logData->locality);
|
|
}
|
|
TraceEvent("TLogUpdate", self->dbgid)
|
|
.detail("LogId", logData->logId)
|
|
.detail("RecruitmentID", logData->recruitmentID)
|
|
.detail("DbRecruitmentID", self->dbInfo->get().logSystemConfig.recruitmentID)
|
|
.detail("RecoverFrom", recoverFrom.toString())
|
|
.detail("DbInfo", self->dbInfo->get().logSystemConfig.toString())
|
|
.detail("Found", found)
|
|
.detail("LogSystem", (bool)logSystem->get())
|
|
.detail("RecoveryState", (int)self->dbInfo->get().recoveryState);
|
|
for (const auto& it : self->dbInfo->get().logSystemConfig.oldTLogs) {
|
|
TraceEvent("TLogUpdateOld", self->dbgid).detail("LogId", logData->logId).detail("DbInfo", it.toString());
|
|
}
|
|
wait(self->dbInfo->onChange());
|
|
}
|
|
}
|
|
|
|
void stopAllTLogs(TLogData* self, UID newLogId) {
|
|
for (auto it : self->id_data) {
|
|
if (!it.second->stopped) {
|
|
TraceEvent("TLogStoppedByNewRecruitment", self->dbgid)
|
|
.detail("LogId", it.second->logId)
|
|
.detail("StoppedId", it.first.toString())
|
|
.detail("RecruitedId", newLogId)
|
|
.detail("EndEpoch", it.second->logSystem->get().getPtr() != 0);
|
|
if (!it.second->isPrimary && it.second->logSystem->get()) {
|
|
it.second->removed = it.second->removed && it.second->logSystem->get()->endEpoch();
|
|
}
|
|
if (it.second->committingQueue.canBeSet()) {
|
|
it.second->committingQueue.sendError(worker_removed());
|
|
}
|
|
}
|
|
it.second->stopped = true;
|
|
it.second->unblockWaitingPeeks();
|
|
if (!it.second->recoveryComplete.isSet()) {
|
|
it.second->recoveryComplete.sendError(end_of_stream());
|
|
}
|
|
it.second->stopCommit.trigger();
|
|
}
|
|
}
|
|
|
|
// Start the tLog role for a worker
|
|
ACTOR Future<Void> tLogStart(TLogData* self, InitializeTLogRequest req, LocalityData locality) {
|
|
state TLogInterface recruited(self->dbgid, locality);
|
|
recruited.initEndpoints();
|
|
|
|
DUMPTOKEN(recruited.peekMessages);
|
|
DUMPTOKEN(recruited.peekStreamMessages);
|
|
DUMPTOKEN(recruited.popMessages);
|
|
DUMPTOKEN(recruited.commit);
|
|
DUMPTOKEN(recruited.lock);
|
|
DUMPTOKEN(recruited.getQueuingMetrics);
|
|
DUMPTOKEN(recruited.confirmRunning);
|
|
DUMPTOKEN(recruited.waitFailure);
|
|
DUMPTOKEN(recruited.recoveryFinished);
|
|
DUMPTOKEN(recruited.disablePopRequest);
|
|
DUMPTOKEN(recruited.enablePopRequest);
|
|
DUMPTOKEN(recruited.snapRequest);
|
|
|
|
stopAllTLogs(self, recruited.id());
|
|
|
|
bool recovering = (req.recoverFrom.logSystemType == LogSystemType::tagPartitioned);
|
|
state Reference<LogData> logData = makeReference<LogData>(self,
|
|
recruited,
|
|
req.remoteTag,
|
|
req.isPrimary,
|
|
req.logRouterTags,
|
|
req.txsTags,
|
|
req.recruitmentID,
|
|
g_network->protocolVersion(),
|
|
req.spillType,
|
|
req.allTags,
|
|
recovering ? "Recovered" : "Recruited");
|
|
self->id_data[recruited.id()] = logData;
|
|
logData->locality = req.locality;
|
|
logData->recoveryCount = req.epoch;
|
|
logData->removed = rejoinClusterController(self, recruited, req.epoch, Future<Void>(Void()), req.isPrimary);
|
|
self->popOrder.push_back(recruited.id());
|
|
self->spillOrder.push_back(recruited.id());
|
|
|
|
TraceEvent("TLogStart", logData->logId).detail("RecoveryCount", logData->recoveryCount);
|
|
|
|
state Future<Void> updater;
|
|
state bool pulledRecoveryVersions = false;
|
|
try {
|
|
if (logData->removed.isReady()) {
|
|
throw logData->removed.getError();
|
|
}
|
|
|
|
if (recovering) {
|
|
logData->unrecoveredBefore = req.startVersion;
|
|
state Version recoverAt = req.recoverAt;
|
|
logData->recoveredAt = recoverAt;
|
|
logData->knownCommittedVersion = req.startVersion - 1;
|
|
logData->persistentDataVersion = logData->unrecoveredBefore - 1;
|
|
logData->persistentDataDurableVersion = logData->unrecoveredBefore - 1;
|
|
logData->queueCommittedVersion.set(logData->unrecoveredBefore - 1);
|
|
logData->version.set(logData->unrecoveredBefore - 1);
|
|
|
|
logData->unpoppedRecoveredTags = req.allTags.size();
|
|
wait(initPersistentState(self, logData) || logData->removed);
|
|
|
|
TraceEvent("TLogRecover", self->dbgid)
|
|
.detail("LogId", logData->logId)
|
|
.detail("At", recoverAt)
|
|
.detail("Known", req.knownCommittedVersion)
|
|
.detail("Unrecovered", logData->unrecoveredBefore)
|
|
.detail("Tags", describe(req.recoverTags))
|
|
.detail("Locality", req.locality)
|
|
.detail("LogRouterTags", logData->logRouterTags);
|
|
|
|
if (logData->recoveryComplete.isSet()) {
|
|
throw worker_removed();
|
|
}
|
|
|
|
updater = updateLogSystem(self, logData, req.recoverFrom, logData->logSystem);
|
|
|
|
logData->initialized = true;
|
|
self->newLogData.trigger();
|
|
|
|
if ((req.isPrimary || req.recoverFrom.logRouterTags == 0) && !logData->stopped &&
|
|
logData->unrecoveredBefore <= recoverAt) {
|
|
if (req.recoverFrom.logRouterTags > 0 && req.locality != tagLocalitySatellite) {
|
|
logData->logRouterPopToVersion = recoverAt;
|
|
std::vector<Tag> tags;
|
|
tags.push_back(logData->remoteTag);
|
|
wait(pullAsyncData(self, logData, tags, logData->unrecoveredBefore, recoverAt, true) ||
|
|
logData->removed || logData->stopCommit.onTrigger());
|
|
} else if (!req.recoverTags.empty()) {
|
|
ASSERT(logData->unrecoveredBefore > req.knownCommittedVersion);
|
|
wait(pullAsyncData(
|
|
self, logData, req.recoverTags, req.knownCommittedVersion + 1, recoverAt, false) ||
|
|
logData->removed || logData->stopCommit.onTrigger());
|
|
}
|
|
pulledRecoveryVersions = true;
|
|
logData->knownCommittedVersion = recoverAt;
|
|
}
|
|
|
|
state Version lastVersionPrevEpoch = req.recoverAt;
|
|
|
|
if ((req.isPrimary || req.recoverFrom.logRouterTags == 0) &&
|
|
logData->version.get() < lastVersionPrevEpoch && !logData->stopped) {
|
|
// Log the changes to the persistent queue, to be committed by commitQueue()
|
|
TLogQueueEntryRef qe;
|
|
qe.version = lastVersionPrevEpoch;
|
|
qe.knownCommittedVersion = logData->knownCommittedVersion;
|
|
qe.messages = StringRef();
|
|
qe.id = logData->logId;
|
|
self->persistentQueue->push(qe, logData);
|
|
|
|
self->diskQueueCommitBytes += qe.expectedSize();
|
|
if (self->diskQueueCommitBytes > SERVER_KNOBS->MAX_QUEUE_COMMIT_BYTES) {
|
|
self->largeDiskQueueCommitBytes.set(true);
|
|
}
|
|
logData->version.set(lastVersionPrevEpoch);
|
|
}
|
|
|
|
if (logData->recoveryComplete.isSet()) {
|
|
throw worker_removed();
|
|
}
|
|
|
|
logData->addActor.send(respondToRecovered(recruited, logData->recoveryComplete));
|
|
} else {
|
|
// Brand new tlog, initialization has already been done by caller
|
|
wait(initPersistentState(self, logData) || logData->removed);
|
|
|
|
if (logData->recoveryComplete.isSet()) {
|
|
throw worker_removed();
|
|
}
|
|
|
|
logData->initialized = true;
|
|
self->newLogData.trigger();
|
|
|
|
logData->recoveryComplete.send(Void());
|
|
}
|
|
wait(logData->committingQueue.getFuture() || logData->removed);
|
|
} catch (Error& e) {
|
|
req.reply.sendError(recruitment_failed());
|
|
|
|
if (e.code() != error_code_worker_removed) {
|
|
throw;
|
|
}
|
|
|
|
wait(delay(0.0)); // if multiple recruitment requests were already in the promise stream make sure they are all
|
|
// started before any are removed
|
|
|
|
removeLog(self, logData);
|
|
return Void();
|
|
}
|
|
|
|
req.reply.send(recruited);
|
|
|
|
TraceEvent("TLogReady", logData->logId)
|
|
.detail("AllTags", describe(req.allTags))
|
|
.detail("Locality", logData->locality);
|
|
|
|
updater = Void();
|
|
wait(tLogCore(self, logData, recruited, pulledRecoveryVersions));
|
|
return Void();
|
|
}
|
|
|
|
ACTOR Future<Void> startSpillingInTenSeconds(TLogData* self, UID tlogId, Reference<AsyncVar<UID>> activeSharedTLog) {
|
|
wait(delay(10));
|
|
if (activeSharedTLog->get() != tlogId) {
|
|
// TODO: This should fully spill, but currently doing so will cause us to no longer update poppedVersion
|
|
// and QuietDatabase will hang thinking our TLog is behind.
|
|
TraceEvent("SharedTLogBeginSpilling", self->dbgid).detail("NowActive", activeSharedTLog->get());
|
|
self->targetVolatileBytes = SERVER_KNOBS->REFERENCE_SPILL_UPDATE_STORAGE_BYTE_LIMIT * 2;
|
|
} else {
|
|
TraceEvent("SharedTLogSkipSpilling", self->dbgid).detail("NowActive", activeSharedTLog->get());
|
|
}
|
|
return Void();
|
|
}
|
|
|
|
// New tLog (if !recoverFrom.size()) or restore from network
|
|
ACTOR Future<Void> tLog(IKeyValueStore* persistentData,
|
|
IDiskQueue* persistentQueue,
|
|
Reference<AsyncVar<ServerDBInfo> const> db,
|
|
LocalityData locality,
|
|
PromiseStream<InitializeTLogRequest> tlogRequests,
|
|
UID tlogId,
|
|
UID workerID,
|
|
bool restoreFromDisk,
|
|
Promise<Void> oldLog,
|
|
Promise<Void> recovered,
|
|
std::string folder,
|
|
Reference<AsyncVar<bool>> degraded,
|
|
Reference<AsyncVar<UID>> activeSharedTLog) {
|
|
state TLogData self(tlogId, workerID, persistentData, persistentQueue, db, degraded, folder);
|
|
state Future<Void> error = actorCollection(self.sharedActors.getFuture());
|
|
|
|
TraceEvent("SharedTlog", tlogId).log();
|
|
try {
|
|
try {
|
|
if (restoreFromDisk) {
|
|
wait(restorePersistentState(&self, locality, oldLog, recovered, tlogRequests));
|
|
} else {
|
|
wait(ioTimeoutError(checkEmptyQueue(&self) && checkRecovered(&self),
|
|
SERVER_KNOBS->TLOG_MAX_CREATE_DURATION));
|
|
}
|
|
|
|
// Disk errors need a chance to kill this actor.
|
|
wait(delay(0.000001));
|
|
|
|
if (recovered.canBeSet())
|
|
recovered.send(Void());
|
|
|
|
self.sharedActors.send(commitQueue(&self));
|
|
self.sharedActors.send(updateStorageLoop(&self));
|
|
self.sharedActors.send(traceRole(Role::SHARED_TRANSACTION_LOG, tlogId));
|
|
state Future<Void> activeSharedChange = Void();
|
|
|
|
loop {
|
|
choose {
|
|
when(state InitializeTLogRequest req = waitNext(tlogRequests.getFuture())) {
|
|
ASSERT(req.clusterId.isValid());
|
|
// Durably persist the cluster ID if it is not already
|
|
// durable and the cluster has progressed far enough
|
|
// through recovery. To avoid different partitions from
|
|
// persisting different cluster IDs, we need to wait
|
|
// until a single cluster ID has been persisted in the
|
|
// txnStateStore before finally writing it to disk.
|
|
auto recoveryState = self.dbInfo->get().recoveryState;
|
|
if (!self.durableClusterId.isValid() && recoveryState >= RecoveryState::ACCEPTING_COMMITS) {
|
|
self.durableClusterId = req.clusterId;
|
|
// Will let commit loop durably write the cluster ID.
|
|
self.persistentData->set(
|
|
KeyValueRef(persistClusterIdKey, BinaryWriter::toValue(req.clusterId, Unversioned())));
|
|
}
|
|
|
|
if (!self.tlogCache.exists(req.recruitmentID)) {
|
|
self.tlogCache.set(req.recruitmentID, req.reply.getFuture());
|
|
self.sharedActors.send(
|
|
self.tlogCache.removeOnReady(req.recruitmentID, tLogStart(&self, req, locality)));
|
|
} else {
|
|
forwardPromise(req.reply, self.tlogCache.get(req.recruitmentID));
|
|
}
|
|
}
|
|
when(wait(error)) { throw internal_error(); }
|
|
when(wait(activeSharedChange)) {
|
|
if (activeSharedTLog->get() == tlogId) {
|
|
TraceEvent("SharedTLogNowActive", self.dbgid).detail("NowActive", activeSharedTLog->get());
|
|
self.targetVolatileBytes = SERVER_KNOBS->TLOG_SPILL_THRESHOLD;
|
|
} else {
|
|
stopAllTLogs(&self, tlogId);
|
|
TraceEvent("SharedTLogQueueSpilling", self.dbgid)
|
|
.detail("NowActive", activeSharedTLog->get());
|
|
self.sharedActors.send(startSpillingInTenSeconds(&self, tlogId, activeSharedTLog));
|
|
}
|
|
activeSharedChange = activeSharedTLog->onChange();
|
|
}
|
|
}
|
|
}
|
|
} catch (Error& e) {
|
|
if (e.code() != error_code_worker_removed) {
|
|
throw;
|
|
}
|
|
// Don't need to worry about deleting data if there is no durable
|
|
// cluster ID.
|
|
if (!self.durableClusterId.isValid()) {
|
|
throw;
|
|
}
|
|
// When a tlog joins a new cluster and has data for an old cluster,
|
|
// it should automatically exclude itself to avoid being used in
|
|
// the new cluster.
|
|
auto recoveryState = self.dbInfo->get().recoveryState;
|
|
if (recoveryState == RecoveryState::FULLY_RECOVERED && self.ccClusterId.isValid() &&
|
|
self.durableClusterId.isValid() && self.ccClusterId != self.durableClusterId) {
|
|
state NetworkAddress address = g_network->getLocalAddress();
|
|
wait(excludeServers(self.cx, { AddressExclusion{ address.ip, address.port } }));
|
|
TraceEvent(SevWarnAlways, "TLogBelongsToExistingCluster")
|
|
.detail("ClusterId", self.durableClusterId)
|
|
.detail("NewClusterId", self.ccClusterId);
|
|
}
|
|
// If the tlog has a valid durable cluster ID, we don't want it to
|
|
// wipe its data! Throw this error to signal to `tlogTerminated` to
|
|
// close the persistent data store instead of deleting it.
|
|
throw invalid_cluster_id();
|
|
}
|
|
} catch (Error& e) {
|
|
self.terminated.send(Void());
|
|
TraceEvent("TLogError", tlogId).errorUnsuppressed(e);
|
|
if (recovered.canBeSet())
|
|
recovered.send(Void());
|
|
|
|
while (!tlogRequests.isEmpty()) {
|
|
tlogRequests.getFuture().pop().reply.sendError(recruitment_failed());
|
|
}
|
|
|
|
for (auto& it : self.id_data) {
|
|
if (!it.second->recoveryComplete.isSet()) {
|
|
it.second->recoveryComplete.sendError(end_of_stream());
|
|
}
|
|
}
|
|
|
|
if (tlogTerminated(&self, persistentData, self.persistentQueue, e)) {
|
|
return Void();
|
|
} else {
|
|
throw;
|
|
}
|
|
}
|
|
}
|
|
|
|
// UNIT TESTS
|
|
struct DequeAllocatorStats {
|
|
static int64_t allocatedBytes;
|
|
};
|
|
|
|
int64_t DequeAllocatorStats::allocatedBytes = 0;
|
|
|
|
template <class T>
|
|
struct DequeAllocator : std::allocator<T> {
|
|
template <typename U>
|
|
struct rebind {
|
|
typedef DequeAllocator<U> other;
|
|
};
|
|
|
|
DequeAllocator() {}
|
|
|
|
template <typename U>
|
|
DequeAllocator(DequeAllocator<U> const& u) : std::allocator<T>(u) {}
|
|
|
|
T* allocate(std::size_t n, std::allocator<void>::const_pointer hint = 0) {
|
|
DequeAllocatorStats::allocatedBytes += n * sizeof(T);
|
|
// fprintf(stderr, "Allocating %lld objects for %lld bytes (total allocated: %lld)\n", n, n * sizeof(T),
|
|
// DequeAllocatorStats::allocatedBytes);
|
|
return std::allocator<T>::allocate(n, hint);
|
|
}
|
|
void deallocate(T* p, std::size_t n) {
|
|
DequeAllocatorStats::allocatedBytes -= n * sizeof(T);
|
|
// fprintf(stderr, "Deallocating %lld objects for %lld bytes (total allocated: %lld)\n", n, n * sizeof(T),
|
|
// DequeAllocatorStats::allocatedBytes);
|
|
return std::allocator<T>::deallocate(p, n);
|
|
}
|
|
};
|
|
|
|
TEST_CASE("Lfdbserver/tlogserver/VersionMessagesOverheadFactor") {
|
|
|
|
typedef std::pair<Version, LengthPrefixedStringRef> TestType; // type used by versionMessages
|
|
|
|
for (int i = 1; i < 9; ++i) {
|
|
for (int j = 0; j < 20; ++j) {
|
|
DequeAllocatorStats::allocatedBytes = 0;
|
|
DequeAllocator<TestType> allocator;
|
|
std::deque<TestType, DequeAllocator<TestType>> d(allocator);
|
|
|
|
int numElements = deterministicRandom()->randomInt(pow(10, i - 1), pow(10, i));
|
|
for (int k = 0; k < numElements; ++k) {
|
|
d.push_back(TestType());
|
|
}
|
|
|
|
int removedElements = 0; // deterministicRandom()->randomInt(0, numElements); // FIXME: the overhead factor
|
|
// does not accurately account for removal!
|
|
for (int k = 0; k < removedElements; ++k) {
|
|
d.pop_front();
|
|
}
|
|
|
|
int64_t dequeBytes = DequeAllocatorStats::allocatedBytes + sizeof(std::deque<TestType>);
|
|
int64_t insertedBytes = (numElements - removedElements) * sizeof(TestType);
|
|
double overheadFactor =
|
|
std::max<double>(insertedBytes, dequeBytes - 10000) /
|
|
insertedBytes; // We subtract 10K here as an estimated upper bound for the fixed cost of an std::deque
|
|
// fprintf(stderr, "%d elements (%d inserted, %d removed):\n", numElements-removedElements, numElements,
|
|
// removedElements); fprintf(stderr, "Allocated %lld bytes to store %lld bytes (%lf overhead factor)\n",
|
|
// dequeBytes, insertedBytes, overheadFactor);
|
|
ASSERT(overheadFactor * 1024 <= SERVER_KNOBS->VERSION_MESSAGES_OVERHEAD_FACTOR_1024THS);
|
|
}
|
|
}
|
|
|
|
return Void();
|
|
}
|