foundationdb/fdbserver/LogSystem.h
Sreenath Bodagala b0554b4554 Capture how fast an SS is catching up to its tLog-SS lag
Changes:
LogSystem.h, LogSystemPeekCursor.actor.cpp:
Add APIs to find the ID of the tLog from which an SS has fetched the latest
set of versions.

storageserver.actor.cpp:
Capture the number of latest set of versions fetched, the time (in seconds)
in which those versions were fetched, and the tLog from which they were
fetched. Add this information to a TraceLogEvent.

Capture how many versions an SS has fetched in the
2021-05-11 20:03:21 +00:00

1133 lines
43 KiB
C++

/*
* LogSystem.h
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2018 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef FDBSERVER_LOGSYSTEM_H
#define FDBSERVER_LOGSYSTEM_H
#include <set>
#include <vector>
#include "fdbserver/SpanContextMessage.h"
#include "fdbserver/TLogInterface.h"
#include "fdbserver/WorkerInterface.actor.h"
#include "fdbclient/DatabaseConfiguration.h"
#include "fdbserver/MutationTracking.h"
#include "flow/IndexedSet.h"
#include "flow/Knobs.h"
#include "fdbrpc/ReplicationPolicy.h"
#include "fdbrpc/Locality.h"
#include "fdbrpc/Replication.h"
struct DBCoreState;
struct TLogSet;
struct CoreTLogSet;
// Reference-counted bookkeeping record holding reply counters and the time of the last reset.
// NOTE(review): presumably consulted when deciding whether a connection should be reset -- confirm against users.
struct ConnectionResetInfo : public ReferenceCounted<ConnectionResetInfo> {
    double lastReset; // set to now() at construction
    Future<Void> resetCheck; // constructed from Void()
    int slowReplies; // starts at 0
    int fastReplies; // starts at 0

    // The initialisers are written in declaration order. Members are always initialised in declaration
    // order regardless of how the list is written, so listing resetCheck last was misleading (-Wreorder).
    ConnectionResetInfo() : lastReset(now()), resetCheck(Void()), slowReplies(0), fastReplies(0) {}
};
// The set of tLog servers, logRouters and backupWorkers for a log tag
class LogSet : NonCopyable, public ReferenceCounted<LogSet> {
public:
// Interfaces of the members of this set, one AsyncVar-wrapped optional interface per member.
std::vector<Reference<AsyncVar<OptionalInterface<TLogInterface>>>> logServers;
std::vector<Reference<AsyncVar<OptionalInterface<TLogInterface>>>> logRouters;
std::vector<Reference<AsyncVar<OptionalInterface<BackupInterface>>>> backupWorkers;
// NOTE(review): presumably one ConnectionResetInfo per log server -- confirm where this vector is filled.
std::vector<Reference<ConnectionResetInfo>> connectionResetTrackers;
int32_t tLogWriteAntiQuorum;
int32_t tLogReplicationFactor;
std::vector<LocalityData> tLogLocalities; // Stores the localities of the log servers
TLogVersion tLogVersion;
// Replication policy handed to selectReplicas() by satisfiesPolicy(), getPushLocations() and
// populateSatelliteTagLocations().
Reference<IReplicationPolicy> tLogPolicy;
// The next three members are rebuilt together by updateLocalitySet(): afterwards logIndexArray[i] == i and
// logEntryArray[i] is the entry returned when localities[i] was added to logServerSet.
Reference<LocalitySet> logServerSet;
std::vector<int> logIndexArray;
std::vector<LocalityEntry> logEntryArray;
bool isLocal; // true if the LogSet is in primary DC or primary DC's satellite
// Compared against tagLocalitySatellite / tagLocalitySpecial and against Tag::locality by the member functions.
int8_t locality;
Version startVersion;
std::vector<Future<TLogLockResult>> replies;
// Filled by populateSatelliteTagLocations(): one list of log-server indices per team. bestLocationFor() and
// getPushLocations() index it with 0 for txsTag and with tag.id + 1 for any other tag.
std::vector<std::vector<int>> satelliteTagLocations;
// Default state: no anti-quorum, replication factor 0, local, invalid locality, invalid start version.
LogSet()
: tLogWriteAntiQuorum(0), tLogReplicationFactor(0), isLocal(true), locality(tagLocalityInvalid),
startVersion(invalidVersion) {}
// Conversions from the TLogSet / CoreTLogSet representations; defined outside this header.
LogSet(const TLogSet& tlogSet);
LogSet(const CoreTLogSet& coreSet);
std::string logRouterString() {
std::string result;
for (int i = 0; i < logRouters.size(); i++) {
if (i > 0) {
result += ", ";
}
result += logRouters[i]->get().id().toString();
}
return result;
}
// Returns true when some entry of logRouters currently reports the interface ID `id`.
bool hasLogRouter(UID id) const {
    bool found = false;
    // Stop scanning as soon as a match has been seen.
    for (size_t i = 0; i < logRouters.size() && !found; ++i) {
        found = (logRouters[i]->get().id() == id);
    }
    return found;
}
// Returns true when some entry of backupWorkers currently reports the interface ID `id`.
bool hasBackupWorker(UID id) const {
    for (auto it = backupWorkers.begin(); it != backupWorkers.end(); ++it) {
        if ((*it)->get().id() == id) {
            return true;
        }
    }
    return false;
}
// Returns the interface IDs of all entries of logServers, joined with ", " (empty string when there are none).
std::string logServerString() {
    std::string joined;
    for (auto it = logServers.begin(); it != logServers.end(); ++it) {
        if (it != logServers.begin()) {
            joined += ", ";
        }
        joined += (*it)->get().id().toString();
    }
    return joined;
}
// Rebuilds satelliteTagLocations from scratch: for every team it picks a group of log-server indices that
// satisfies tLogPolicy, preferring the servers that have been used least so far, and puts the server that has
// been "best" (first in a team) least often at position 0 of the team. Asserts if some team cannot be completed
// and finishes by calling checkSatelliteTagLocations().
void populateSatelliteTagLocations(int logRouterTags, int oldLogRouterTags, int txsTags, int oldTxsTags) {
satelliteTagLocations.clear();
// One team per index 0..max(counts) inclusive, hence the + 1.
satelliteTagLocations.resize(std::max({ logRouterTags, oldLogRouterTags, txsTags, oldTxsTags }) + 1);
// server index -> number of teams in which that server was placed first ("best")
std::map<int, int> server_usedBest;
// (times used so far, server index); the set is ordered, so iteration visits the least-used servers first
std::set<std::pair<int, int>> used_servers;
for (int i = 0; i < tLogLocalities.size(); i++) {
used_servers.insert(std::make_pair(0, i));
}
// serverMap is a typed view of serverSet; it is handed pointers to elements of used_servers below.
Reference<LocalitySet> serverSet = Reference<LocalitySet>(new LocalityMap<std::pair<int, int>>());
LocalityMap<std::pair<int, int>>* serverMap = (LocalityMap<std::pair<int, int>>*)serverSet.getPtr();
std::vector<std::pair<int, int>> resultPairs;
for (int loc = 0; loc < satelliteTagLocations.size(); loc++) {
// Fill order vs. team index: the first logRouterTags iterations fill teams 1..logRouterTags, iteration
// number logRouterTags fills team 0, and every later iteration fills team == loc.
int team = loc;
if (loc < logRouterTags) {
team = loc + 1;
} else if (loc == logRouterTags) {
team = 0;
}
bool teamComplete = false;
alsoServers.resize(1);
serverMap->clear();
resultPairs.clear();
// Grow the candidate pool one server at a time (least used first) until the policy can be satisfied.
for (auto& used_idx : used_servers) {
auto entry = serverMap->add(tLogLocalities[used_idx.second], &used_idx);
// The first (least-used) candidate is always part of the team: it is the forced "also" server.
if (!resultPairs.size()) {
resultPairs.push_back(used_idx);
alsoServers[0] = entry;
}
resultEntries.clear();
if (serverSet->selectReplicas(tLogPolicy, alsoServers, resultEntries)) {
for (auto& entry : resultEntries) {
resultPairs.push_back(*serverMap->getObject(entry));
}
// Move the member that has been "best" least often to the front (ties keep the earlier member).
int firstBestUsed = server_usedBest[resultPairs[0].second];
for (int i = 1; i < resultPairs.size(); i++) {
int thisBestUsed = server_usedBest[resultPairs[i].second];
if (thisBestUsed < firstBestUsed) {
std::swap(resultPairs[0], resultPairs[i]);
firstBestUsed = thisBestUsed;
}
}
server_usedBest[resultPairs[0].second]++;
// Record the team and bump each member's use count (erase + reinsert because the count is part of
// the set key). used_servers is modified while being iterated, which is only safe because of the
// break right after this loop.
for (auto& res : resultPairs) {
satelliteTagLocations[team].push_back(res.second);
used_servers.erase(res);
res.first++;
used_servers.insert(res);
}
teamComplete = true;
break;
}
}
ASSERT(teamComplete);
}
checkSatelliteTagLocations();
}
// Measures how evenly satelliteTagLocations spreads teams over the log servers and reports the result in a
// "CheckSatelliteTagLocations" trace event.
//
// The event is SevInfo when both spreads (max - min of total uses, and of first-place uses) are at most 1.
// Otherwise it is SevError when running in simulation with no duplicate zone and a single DC, and
// SevWarnAlways in every other case.
// Precondition: every team in satelliteTagLocations is non-empty (team[0] is read unconditionally).
void checkSatelliteTagLocations() {
    // usedBest[i]: number of teams whose first entry is server i; used[i]: number of teams containing server i.
    std::vector<int> usedBest(tLogLocalities.size());
    std::vector<int> used(tLogLocalities.size());
    // Bind each team by const reference; iterating by value copied every team vector.
    for (const auto& team : satelliteTagLocations) {
        usedBest[team[0]]++;
        for (int loc : team) {
            used[loc]++;
        }
    }

    // The number of teams is an upper bound for any per-server count, so it seeds the minimum searches.
    int minUsedBest = satelliteTagLocations.size();
    int maxUsedBest = 0;
    for (auto i : usedBest) {
        minUsedBest = std::min(minUsedBest, i);
        maxUsedBest = std::max(maxUsedBest, i);
    }

    int minUsed = satelliteTagLocations.size();
    int maxUsed = 0;
    for (auto i : used) {
        minUsed = std::min(minUsed, i);
        maxUsed = std::max(maxUsed, i);
    }

    // NOTE(review): the scan stops at the first duplicate zone, so in that case `dcs` (and the NumOfDCs detail)
    // only covers the localities seen up to that point.
    bool foundDuplicate = false;
    std::set<Optional<Key>> zones;
    std::set<Optional<Key>> dcs;
    for (auto& loc : tLogLocalities) {
        if (zones.count(loc.zoneId())) {
            foundDuplicate = true;
            break;
        }
        zones.insert(loc.zoneId());
        dcs.insert(loc.dcId());
    }
    const bool moreThanOneDC = dcs.size() > 1;

    TraceEvent(((maxUsed - minUsed > 1) || (maxUsedBest - minUsedBest > 1))
                   ? (g_network->isSimulated() && !foundDuplicate && !moreThanOneDC ? SevError : SevWarnAlways)
                   : SevInfo,
               "CheckSatelliteTagLocations")
        .detail("MinUsed", minUsed)
        .detail("MaxUsed", maxUsed)
        .detail("MinUsedBest", minUsedBest)
        .detail("MaxUsedBest", maxUsedBest)
        .detail("DuplicateZones", foundDuplicate)
        .detail("NumOfDCs", dcs.size());
}
// Returns the index of the preferred log server for `tag`.
// For a satellite set this is the first member of the tag's team in satelliteTagLocations (team 0 for txsTag,
// team tag.id + 1 otherwise); for any other set it is an index taken modulo logServers.size().
int bestLocationFor(Tag tag) {
    const bool isTxs = (tag == txsTag);
    if (locality != tagLocalitySatellite) {
        // the following logic supports upgrades from 5.X
        return isTxs ? txsTagOld % logServers.size() : tag.id % logServers.size();
    }
    return satelliteTagLocations[isTxs ? 0 : tag.id + 1][0];
}
void updateLocalitySet(std::vector<LocalityData> const& localities) {
LocalityMap<int>* logServerMap;
logServerSet = Reference<LocalitySet>(new LocalityMap<int>());
logServerMap = (LocalityMap<int>*)logServerSet.getPtr();
logEntryArray.clear();
logEntryArray.reserve(localities.size());
logIndexArray.clear();
logIndexArray.reserve(localities.size());
for (int i = 0; i < localities.size(); i++) {
logIndexArray.push_back(i);
logEntryArray.push_back(logServerMap->add(localities[i], &logIndexArray.back()));
}
}
// Returns true when selectReplicas() needs no additional entries beyond `locations` to fulfil tLogPolicy.
// Asserts when selectReplicas() itself reports failure. Uses the resultEntries scratch buffer.
bool satisfiesPolicy(const std::vector<LocalityEntry>& locations) {
    resultEntries.clear();
    // Run the policy, assert if unable to satisfy
    const bool selected = logServerSet->selectReplicas(tLogPolicy, locations, resultEntries);
    ASSERT(selected);
    return resultEntries.empty();
}
// Appends to `locations` the indices (each shifted by locationOffset) of the log servers in this set that
// should receive a message carrying `tags`.
//  - Satellite sets: every member of the team of each txs / log-router tag; no policy selection is run.
//  - Other sets: bestLocationFor() of each applicable tag (or every server when allLocations is set), followed
//    by whatever additional servers selectReplicas() picks to fulfil tLogPolicy.
// Uses the private scratch buffers newLocations, alsoServers and resultEntries.
void getPushLocations(VectorRef<Tag> tags,
std::vector<int>& locations,
int locationOffset,
bool allLocations = false) {
if (locality == tagLocalitySatellite) {
for (auto& t : tags) {
if (t == txsTag || t.locality == tagLocalityTxs || t.locality == tagLocalityLogRouter) {
// Same team indexing as bestLocationFor(): team 0 for txsTag, team id + 1 otherwise.
for (int loc : satelliteTagLocations[t == txsTag ? 0 : t.id + 1]) {
locations.push_back(locationOffset + loc);
}
}
}
// NOTE(review): uniquify() is applied to the caller's whole vector, including entries that were already in
// it before this call -- presumably it sorts and removes duplicates; confirm.
uniquify(locations);
return;
}
newLocations.clear();
alsoServers.clear();
resultEntries.clear();
if (allLocations) {
// special handling for allLocations
TraceEvent("AllLocationsSet");
for (int i = 0; i < logServers.size(); i++) {
newLocations.push_back(i);
}
} else {
for (auto& t : tags) {
// A tag applies when this set has the special locality, when the localities match, or when the tag's
// locality is negative.
if (locality == tagLocalitySpecial || t.locality == locality || t.locality < 0) {
newLocations.push_back(bestLocationFor(t));
}
}
}
uniquify(newLocations);
if (newLocations.size())
alsoServers.reserve(newLocations.size());
// Convert locations to the also servers
for (auto location : newLocations) {
locations.push_back(locationOffset + location);
alsoServers.push_back(logEntryArray[location]);
}
// Run the policy, assert if unable to satisfy
bool result = logServerSet->selectReplicas(tLogPolicy, alsoServers, resultEntries);
ASSERT(result);
// Add the new servers to the location array
LocalityMap<int>* logServerMap = (LocalityMap<int>*)logServerSet.getPtr();
for (auto entry : resultEntries) {
locations.push_back(locationOffset + *logServerMap->getObject(entry));
}
//TraceEvent("GetPushLocations").detail("Policy", tLogPolicy->info())
// .detail("Results", locations.size()).detail("Selection", logServerSet->size())
// .detail("Included", alsoServers.size()).detail("Duration", timer() - t);
}
private:
// Scratch buffers shared by populateSatelliteTagLocations(), satisfiesPolicy() and getPushLocations(); each of
// those clears or resizes the buffers it uses before reading them, so no state is carried between calls.
// NOTE(review): presumably kept as members to avoid reallocating on every call -- confirm.
std::vector<LocalityEntry> alsoServers, resultEntries;
std::vector<int> newLocations;
};
struct ILogSystem {
// Represents a particular (possibly provisional) epoch of the log subsystem
// Abstract cursor over a stream of log messages. The concrete cursor types declared after it in ILogSystem
// (ServerPeekCursor, MergedPeekCursor, SetPeekCursor, MultiCursor, BufferedCursor) derive from it.
// Lifetime is managed through addref()/delref().
struct IPeekCursor {
// clones the peek cursor, however you cannot call getMore() on the cloned cursor.
virtual Reference<IPeekCursor> cloneNoMore() = 0;
// NOTE(review): presumably sets the protocol version used when decoding messages -- confirm in the
// implementations.
virtual void setProtocolVersion(ProtocolVersion version) = 0;
// if hasMessage() returns true, getMessage(), getMessageWithTags(), or reader() can be called.
// does not modify the cursor
virtual bool hasMessage() const = 0;
// pre: only callable if hasMessage() returns true
// return the tags associated with the message for the current sequence
virtual VectorRef<Tag> getTags() const = 0;
// pre: only callable if hasMessage() returns true
// returns the arena containing the contents of getMessage(), getMessageWithTags(), and reader()
virtual Arena& arena() = 0;
// pre: only callable if hasMessage() returns true
// returns an arena reader for the next message
// caller cannot call getMessage(), getMessageWithTags(), and reader()
// the caller must advance the reader before calling nextMessage()
virtual ArenaReader* reader() = 0;
// pre: only callable if hasMessage() returns true
// caller cannot call getMessage(), getMessageWithTags(), and reader()
// return the contents of the message for the current sequence
virtual StringRef getMessage() = 0;
// pre: only callable if hasMessage() returns true
// caller cannot call getMessage(), getMessageWithTags(), and reader()
// return the contents of the message for the current sequence
virtual StringRef getMessageWithTags() = 0;
// pre: only callable after getMessage(), getMessageWithTags(), or reader()
// post: hasMessage() and version() have been updated
// hasMessage() will never return false "in the middle" of a version (that is, if it does return false,
// version().subsequence will be zero) < FIXME: Can we lose this property?
virtual void nextMessage() = 0;
// advances the cursor to the supplied LogMessageVersion, and updates hasMessage
virtual void advanceTo(LogMessageVersion n) = 0;
// returns immediately if hasMessage() returns true.
// returns when either the result of hasMessage() or version() has changed, or a cursor has internally been
// exhausted.
virtual Future<Void> getMore(TaskPriority taskID = TaskPriority::TLogPeekReply) = 0;
// returns when the failure monitor detects that the servers associated with the cursor are failed
virtual Future<Void> onFailed() = 0;
// returns false if:
// (1) the failure monitor detects that the servers associated with the cursor are failed
// (2) the interface is not present
// (3) the cursor cannot return any more results
virtual bool isActive() const = 0;
// returns true if the cursor cannot return any more results
virtual bool isExhausted() const = 0;
// Returns the smallest possible message version which the current message (if any) or a subsequent message
// might have (If hasMessage(), this is therefore the message version of the current message)
virtual const LogMessageVersion& version() const = 0;
// So far, the cursor has returned all messages which both satisfy the criteria passed to peek() to create the
// cursor AND have (popped(),0) <= message version number <= version() Other messages might have been skipped
virtual Version popped() const = 0;
// Returns the maximum version known to have been pushed (not necessarily durably) into the log system (0 is
// always a possible result!)
// The default implementation returns 0; ServerPeekCursor is the only cursor type in this header that
// overrides it.
virtual Version getMaxKnownVersion() const { return 0; }
// NOTE(review): presumably the smallest known-committed version reported by the servers behind this cursor --
// confirm in the implementations.
virtual Version getMinKnownCommittedVersion() const = 0;
// Optional IDs of peek locations; an empty Optional means no ID is available.
// NOTE(review): presumably the ID of the tLog this cursor prefers ("primary") and the one it most recently
// fetched from ("current") -- the exact distinction is defined by the implementations; confirm.
virtual Optional<UID> getPrimaryPeekLocation() const = 0;
virtual Optional<UID> getCurrentPeekLocation() const = 0;
// Reference counting hooks; each concrete cursor forwards them to its own ReferenceCounted base.
virtual void addref() = 0;
virtual void delref() = 0;
};
// IPeekCursor implementation that holds a single log-server interface (`interf`) and a single `tag`.
// Only the reference-counting forwarders and getMaxKnownVersion() are defined here; everything else is
// defined outside this header.
struct ServerPeekCursor final : IPeekCursor, ReferenceCounted<ServerPeekCursor> {
Reference<AsyncVar<OptionalInterface<TLogInterface>>> interf;
const Tag tag;
// Peek reply held by the cursor; getMaxKnownVersion() reads results.maxKnownVersion.
TLogPeekReply results;
ArenaReader rd;
LogMessageVersion messageVersion, end;
Version poppedVersion;
TagsAndMessage messageAndTags;
bool hasMsg;
Future<Void> more;
UID randomID;
bool returnIfBlocked;
bool onlySpilled;
bool parallelGetMore;
int sequence;
Deque<Future<TLogPeekReply>> futureResults;
Future<Void> interfaceChanged;
// Same names as the fields of ConnectionResetInfo, plus unknownReplies.
// NOTE(review): presumably per-cursor reset bookkeeping -- confirm in the implementation.
double lastReset;
Future<Void> resetCheck;
int slowReplies;
int fastReplies;
int unknownReplies;
// Creates a cursor on `interf` for `tag` and the version range given by `begin` and `end`.
ServerPeekCursor(Reference<AsyncVar<OptionalInterface<TLogInterface>>> const& interf,
Tag tag,
Version begin,
Version end,
bool returnIfBlocked,
bool parallelGetMore);
// Creates a cursor from already-available state (no interface is supplied).
// NOTE(review): presumably what cloneNoMore() uses -- confirm.
ServerPeekCursor(TLogPeekReply const& results,
LogMessageVersion const& messageVersion,
LogMessageVersion const& end,
TagsAndMessage const& message,
bool hasMsg,
Version poppedVersion,
Tag tag);
Reference<IPeekCursor> cloneNoMore() override;
void setProtocolVersion(ProtocolVersion version) override;
Arena& arena() override;
ArenaReader* reader() override;
bool hasMessage() const override;
void nextMessage() override;
StringRef getMessage() override;
StringRef getMessageWithTags() override;
VectorRef<Tag> getTags() const override;
void advanceTo(LogMessageVersion n) override;
Future<Void> getMore(TaskPriority taskID = TaskPriority::TLogPeekReply) override;
Future<Void> onFailed() override;
bool isActive() const override;
bool isExhausted() const override;
const LogMessageVersion& version() const override;
Version popped() const override;
Version getMinKnownCommittedVersion() const override;
Optional<UID> getPrimaryPeekLocation() const override;
Optional<UID> getCurrentPeekLocation() const override;
// Both bases declare addref/delref; forward to the ReferenceCounted implementation.
void addref() override { ReferenceCounted<ServerPeekCursor>::addref(); }
void delref() override { ReferenceCounted<ServerPeekCursor>::delref(); }
Version getMaxKnownVersion() const override { return results.maxKnownVersion; }
};
// IPeekCursor implementation built on top of a vector of other cursors (`serverCursors`).
// NOTE(review): presumably merges the per-server streams of one log set into a single ordered stream, using
// bestServer / readQuorum / the replication policy to decide when a version is complete -- confirm in the
// implementation.
struct MergedPeekCursor final : IPeekCursor, ReferenceCounted<MergedPeekCursor> {
Reference<LogSet> logSet;
std::vector<Reference<IPeekCursor>> serverCursors;
std::vector<LocalityEntry> locations;
std::vector<std::pair<LogMessageVersion, int>> sortedVersions;
Tag tag;
int bestServer, currentCursor, readQuorum;
Optional<LogMessageVersion> nextVersion;
LogMessageVersion messageVersion;
bool hasNextMessage;
UID randomID;
int tLogReplicationFactor;
Future<Void> more;
// Wraps already-constructed cursors.
MergedPeekCursor(std::vector<Reference<ILogSystem::IPeekCursor>> const& serverCursors, Version begin);
// Takes log-server interfaces together with the locality/policy information of their set.
MergedPeekCursor(std::vector<Reference<AsyncVar<OptionalInterface<TLogInterface>>>> const& logServers,
int bestServer,
int readQuorum,
Tag tag,
Version begin,
Version end,
bool parallelGetMore,
std::vector<LocalityData> const& tLogLocalities,
Reference<IReplicationPolicy> const tLogPolicy,
int tLogReplicationFactor);
// Takes existing cursors plus an explicit position (messageVersion / nextVersion).
MergedPeekCursor(std::vector<Reference<IPeekCursor>> const& serverCursors,
LogMessageVersion const& messageVersion,
int bestServer,
int readQuorum,
Optional<LogMessageVersion> nextVersion,
Reference<LogSet> logSet,
int tLogReplicationFactor);
Reference<IPeekCursor> cloneNoMore() override;
void setProtocolVersion(ProtocolVersion version) override;
Arena& arena() override;
ArenaReader* reader() override;
// Non-virtual helpers specific to this cursor type; defined outside this header.
void calcHasMessage();
void updateMessage(bool usePolicy);
bool hasMessage() const override;
void nextMessage() override;
StringRef getMessage() override;
StringRef getMessageWithTags() override;
VectorRef<Tag> getTags() const override;
void advanceTo(LogMessageVersion n) override;
Future<Void> getMore(TaskPriority taskID = TaskPriority::TLogPeekReply) override;
Future<Void> onFailed() override;
bool isActive() const override;
bool isExhausted() const override;
const LogMessageVersion& version() const override;
Version popped() const override;
Version getMinKnownCommittedVersion() const override;
Optional<UID> getPrimaryPeekLocation() const override;
Optional<UID> getCurrentPeekLocation() const override;
// Both bases declare addref/delref; forward to the ReferenceCounted implementation.
void addref() override { ReferenceCounted<MergedPeekCursor>::addref(); }
void delref() override { ReferenceCounted<MergedPeekCursor>::delref(); }
};
// IPeekCursor implementation that spans several LogSets: `serverCursors` holds one vector of cursors per entry
// of `logSets`, and bestSet/bestServer and currentSet/currentCursor are positions within those two levels.
// NOTE(review): presumably the multi-set counterpart of MergedPeekCursor -- confirm in the implementation.
struct SetPeekCursor final : IPeekCursor, ReferenceCounted<SetPeekCursor> {
std::vector<Reference<LogSet>> logSets;
std::vector<std::vector<Reference<IPeekCursor>>> serverCursors;
Tag tag;
int bestSet, bestServer, currentSet, currentCursor;
std::vector<LocalityEntry> locations;
std::vector<std::pair<LogMessageVersion, int>> sortedVersions;
Optional<LogMessageVersion> nextVersion;
LogMessageVersion messageVersion;
bool hasNextMessage;
bool useBestSet;
UID randomID;
Future<Void> more;
// Takes the log sets and the range/tag to peek; no cursors are supplied.
SetPeekCursor(std::vector<Reference<LogSet>> const& logSets,
int bestSet,
int bestServer,
Tag tag,
Version begin,
Version end,
bool parallelGetMore);
// Takes existing cursors plus an explicit position (messageVersion / nextVersion).
SetPeekCursor(std::vector<Reference<LogSet>> const& logSets,
std::vector<std::vector<Reference<IPeekCursor>>> const& serverCursors,
LogMessageVersion const& messageVersion,
int bestSet,
int bestServer,
Optional<LogMessageVersion> nextVersion,
bool useBestSet);
Reference<IPeekCursor> cloneNoMore() override;
void setProtocolVersion(ProtocolVersion version) override;
Arena& arena() override;
ArenaReader* reader() override;
// Non-virtual helpers specific to this cursor type; defined outside this header.
void calcHasMessage();
void updateMessage(int logIdx, bool usePolicy);
bool hasMessage() const override;
void nextMessage() override;
StringRef getMessage() override;
StringRef getMessageWithTags() override;
VectorRef<Tag> getTags() const override;
void advanceTo(LogMessageVersion n) override;
Future<Void> getMore(TaskPriority taskID = TaskPriority::TLogPeekReply) override;
Future<Void> onFailed() override;
bool isActive() const override;
bool isExhausted() const override;
const LogMessageVersion& version() const override;
Version popped() const override;
Version getMinKnownCommittedVersion() const override;
Optional<UID> getPrimaryPeekLocation() const override;
Optional<UID> getCurrentPeekLocation() const override;
// Both bases declare addref/delref; forward to the ReferenceCounted implementation.
void addref() override { ReferenceCounted<SetPeekCursor>::addref(); }
void delref() override { ReferenceCounted<SetPeekCursor>::delref(); }
};
// IPeekCursor implementation holding a list of cursors together with a list of epoch-end versions.
// NOTE(review): presumably chains cursors of successive epochs, switching at the matching entry of epochEnds --
// confirm in the implementation.
struct MultiCursor final : IPeekCursor, ReferenceCounted<MultiCursor> {
std::vector<Reference<IPeekCursor>> cursors;
std::vector<LogMessageVersion> epochEnds;
Version poppedVersion;
MultiCursor(std::vector<Reference<IPeekCursor>> cursors, std::vector<LogMessageVersion> epochEnds);
Reference<IPeekCursor> cloneNoMore() override;
void setProtocolVersion(ProtocolVersion version) override;
Arena& arena() override;
ArenaReader* reader() override;
bool hasMessage() const override;
void nextMessage() override;
StringRef getMessage() override;
StringRef getMessageWithTags() override;
VectorRef<Tag> getTags() const override;
void advanceTo(LogMessageVersion n) override;
Future<Void> getMore(TaskPriority taskID = TaskPriority::TLogPeekReply) override;
Future<Void> onFailed() override;
bool isActive() const override;
bool isExhausted() const override;
const LogMessageVersion& version() const override;
Version popped() const override;
Version getMinKnownCommittedVersion() const override;
Optional<UID> getPrimaryPeekLocation() const override;
Optional<UID> getCurrentPeekLocation() const override;
// Both bases declare addref/delref; forward to the ReferenceCounted implementation.
void addref() override { ReferenceCounted<MultiCursor>::addref(); }
void delref() override { ReferenceCounted<MultiCursor>::delref(); }
};
// IPeekCursor implementation that keeps messages from several underlying cursors in local buffers
// (`cursorMessages` per cursor, `messages` combined).
// NOTE(review): presumably messages are pulled from every cursor and served in version order from `messages`
// -- confirm in the implementation.
struct BufferedCursor final : IPeekCursor, ReferenceCounted<BufferedCursor> {
// One buffered message: its bytes, tags and version, plus the arena member stored alongside them.
// Ordering and equality look at `version` only.
struct BufferedMessage {
Arena arena;
StringRef message;
VectorRef<Tag> tags;
LogMessageVersion version;
BufferedMessage() {}
// Message-less value carrying only a version.
explicit BufferedMessage(Version version) : version(version) {}
BufferedMessage(Arena arena,
StringRef message,
const VectorRef<Tag>& tags,
const LogMessageVersion& version)
: arena(arena), message(message), tags(tags), version(version) {}
bool operator<(BufferedMessage const& r) const { return version < r.version; }
bool operator==(BufferedMessage const& r) const { return version == r.version; }
};
std::vector<Reference<IPeekCursor>> cursors;
std::vector<Deque<BufferedMessage>> cursorMessages;
std::vector<BufferedMessage> messages;
int messageIndex;
LogMessageVersion messageVersion;
Version end;
bool hasNextMessage;
bool withTags;
bool knownUnique;
Version minKnownCommittedVersion;
Version poppedVersion;
Version initialPoppedVersion;
bool canDiscardPopped;
Future<Void> more;
int targetQueueSize;
UID randomID;
// FIXME: collectTags is needed to support upgrades from 5.X to 6.0. Remove this code when we no longer support
// that upgrade.
bool collectTags;
// Non-virtual helper specific to this cursor type; defined outside this header.
void combineMessages();
// Wraps already-constructed cursors.
BufferedCursor(std::vector<Reference<IPeekCursor>> cursors,
Version begin,
Version end,
bool withTags,
bool collectTags,
bool canDiscardPopped);
// Takes log-server interfaces and the tag/range to peek.
BufferedCursor(std::vector<Reference<AsyncVar<OptionalInterface<TLogInterface>>>> const& logServers,
Tag tag,
Version begin,
Version end,
bool parallelGetMore);
Reference<IPeekCursor> cloneNoMore() override;
void setProtocolVersion(ProtocolVersion version) override;
Arena& arena() override;
ArenaReader* reader() override;
bool hasMessage() const override;
void nextMessage() override;
StringRef getMessage() override;
StringRef getMessageWithTags() override;
VectorRef<Tag> getTags() const override;
void advanceTo(LogMessageVersion n) override;
Future<Void> getMore(TaskPriority taskID = TaskPriority::TLogPeekReply) override;
Future<Void> onFailed() override;
bool isActive() const override;
bool isExhausted() const override;
const LogMessageVersion& version() const override;
Version popped() const override;
Version getMinKnownCommittedVersion() const override;
Optional<UID> getPrimaryPeekLocation() const override;
Optional<UID> getCurrentPeekLocation() const override;
// Both bases declare addref/delref; forward to the ReferenceCounted implementation.
void addref() override { ReferenceCounted<BufferedCursor>::addref(); }
void delref() override { ReferenceCounted<BufferedCursor>::delref(); }
};
virtual void addref() = 0;
virtual void delref() = 0;
virtual std::string describe() const = 0;
virtual UID getDebugID() const = 0;
virtual void toCoreState(DBCoreState&) = 0;
virtual bool remoteStorageRecovered() = 0;
virtual Future<Void> onCoreStateChanged() = 0;
// Returns if and when the output of toCoreState() would change (for example, when older logs can be discarded from
// the state)
virtual void coreStateWritten(DBCoreState const& newState) = 0;
// Called when a core state has been written to the coordinators
virtual Future<Void> onError() = 0;
// Never returns normally, but throws an error if the subsystem stops working
// Future<Void> push( UID bundle, int64_t seq, VectorRef<TaggedMessageRef> messages );
virtual Future<Version> push(Version prevVersion,
Version version,
Version knownCommittedVersion,
Version minKnownCommittedVersion,
struct LogPushData& data,
SpanID const& spanContext,
Optional<UID> debugID = Optional<UID>()) = 0;
// Waits for the version number of the bundle (in this epoch) to be prevVersion (i.e. for all pushes ordered
// earlier) Puts the given messages into the bundle, each with the given tags, and with message versions (version,
// 0) - (version, N) Changes the version number of the bundle to be version (unblocking the next push) Returns when
// the preceding changes are durable. (Later we will need multiple return signals for different durability levels)
// If the current epoch has ended, push will not return, and the pushed messages will not be visible in any
// subsequent epoch (but may become visible in this epoch)
virtual Reference<IPeekCursor> peek(UID dbgid,
Version begin,
Optional<Version> end,
Tag tag,
bool parallelGetMore = false) = 0;
// Returns (via cursor interface) a stream of messages with the given tag and message versions >= (begin, 0),
// ordered by message version If pop was previously or concurrently called with upTo > begin, the cursor may not
// return all such messages. In that case cursor->popped() will be greater than begin to reflect that.
virtual Reference<IPeekCursor> peek(UID dbgid,
Version begin,
Optional<Version> end,
std::vector<Tag> tags,
bool parallelGetMore = false) = 0;
// Same contract as peek(), but for a set of tags
virtual Reference<IPeekCursor> peekSingle(
UID dbgid,
Version begin,
Tag tag,
std::vector<std::pair<Version, Tag>> history = std::vector<std::pair<Version, Tag>>()) = 0;
// Same contract as peek(), but blocks until the preferred log server(s) for the given tag are available (and is
// correspondingly less expensive)
virtual Reference<IPeekCursor> peekLogRouter(UID dbgid, Version begin, Tag tag) = 0;
// Same contract as peek(), but can only peek from the logs elected in the same generation.
// If the preferred log server is down, a different log from the same generation will merge results locally before
// sending them to the log router.
virtual Reference<IPeekCursor> peekTxs(UID dbgid,
Version begin,
int8_t peekLocality,
Version localEnd,
bool canDiscardPopped) = 0;
// Same contract as peek(), but only for peeking the txsLocality. It allows specifying a preferred peek locality.
virtual Future<Version> getTxsPoppedVersion() = 0;
virtual Version getKnownCommittedVersion() = 0;
virtual Future<Void> onKnownCommittedVersionChange() = 0;
virtual void popTxs(Version upTo, int8_t popLocality = tagLocalityInvalid) = 0;
virtual void pop(Version upTo,
Tag tag,
Version knownCommittedVersion = 0,
int8_t popLocality = tagLocalityInvalid) = 0;
// Permits, but does not require, the log subsystem to strip `tag` from any or all messages with message versions <
// (upTo,0) The popping of any given message may be arbitrarily delayed.
virtual Future<Void> confirmEpochLive(Optional<UID> debugID = Optional<UID>()) = 0;
// Returns success after confirming that pushes in the current epoch are still possible
virtual Future<Void> endEpoch() = 0;
// Ends the current epoch without starting a new one
static Reference<ILogSystem> fromServerDBInfo(
UID const& dbgid,
struct ServerDBInfo const& db,
bool useRecoveredAt = false,
Optional<PromiseStream<Future<Void>>> addActor = Optional<PromiseStream<Future<Void>>>());
static Reference<ILogSystem> fromLogSystemConfig(
UID const& dbgid,
struct LocalityData const&,
struct LogSystemConfig const&,
bool excludeRemote = false,
bool useRecoveredAt = false,
Optional<PromiseStream<Future<Void>>> addActor = Optional<PromiseStream<Future<Void>>>());
// Constructs a new ILogSystem implementation from the given ServerDBInfo/LogSystemConfig. Might return a null
// reference if there isn't a fully recovered log system available. The caller can peek() the returned log system
// and can push() if it has version numbers reserved for it and prevVersions
static Reference<ILogSystem> fromOldLogSystemConfig(UID const& dbgid,
struct LocalityData const&,
struct LogSystemConfig const&);
// Constructs a new ILogSystem implementation from the old log data within a ServerDBInfo/LogSystemConfig. Might
// return a null reference if there isn't a fully recovered log system available.
static Future<Void> recoverAndEndEpoch(Reference<AsyncVar<Reference<ILogSystem>>> const& outLogSystem,
UID const& dbgid,
DBCoreState const& oldState,
FutureStream<TLogRejoinRequest> const& rejoins,
LocalityData const& locality,
bool* forceRecovery);
// Constructs a new ILogSystem implementation based on the given oldState and rejoining log servers
// Ensures that any calls to push or confirmEpochLive in the current epoch but strictly later than change_epoch will
// not return Whenever changes in the set of available log servers require restarting recovery with a different end
// sequence, outLogSystem will be changed to a new ILogSystem
virtual Version getEnd() const = 0;
// Call only on an ILogSystem obtained from recoverAndEndEpoch()
// Returns the first unreadable version number of the recovered epoch (i.e. message version numbers < (get_end(), 0)
// will be readable)
// Returns the start version of current epoch for backup workers.
virtual Version getBackupStartVersion() const = 0;
// Plain record for one epoch: its number of log router tags and its begin/end versions.
struct EpochTagsVersionsInfo {
    int32_t logRouterTags; // Number of log router tags.
    Version epochBegin;
    Version epochEnd;

    // All three values must be supplied; there is no default constructor.
    explicit EpochTagsVersionsInfo(int32_t n, Version begin, Version end)
      : logRouterTags(n), epochBegin(begin), epochEnd(end) {}
};
// Returns EpochTagVersionsInfo for old epochs that this log system is aware of, excluding the current epoch.
virtual std::map<LogEpoch, EpochTagsVersionsInfo> getOldEpochTagsVersionsInfo() const = 0;
virtual Future<Reference<ILogSystem>> newEpoch(
struct RecruitFromConfigurationReply const& recr,
Future<struct RecruitRemoteFromConfigurationReply> const& fRemoteWorkers,
DatabaseConfiguration const& config,
LogEpoch recoveryCount,
int8_t primaryLocality,
int8_t remoteLocality,
std::vector<Tag> const& allTags,
Reference<AsyncVar<bool>> const& recruitmentStalled) = 0;
// Call only on an ILogSystem obtained from recoverAndEndEpoch()
// Returns an ILogSystem representing a new epoch immediately following this one. The new epoch is only provisional
// until the caller updates the coordinated DBCoreState
virtual LogSystemConfig getLogSystemConfig() const = 0;
// Returns the physical configuration of this LogSystem, that could be used to construct an equivalent LogSystem
// using fromLogSystemConfig()
virtual Standalone<StringRef> getLogsValue() const = 0;
virtual Future<Void> onLogSystemConfigChange() = 0;
// Returns when the log system configuration has changed due to a tlog rejoin.
virtual void getPushLocations(VectorRef<Tag> tags,
std::vector<int>& locations,
bool allLocations = false) const = 0;
void getPushLocations(std::vector<Tag> const& tags, std::vector<int>& locations, bool allLocations = false) {
getPushLocations(VectorRef<Tag>((Tag*)&tags.front(), tags.size()), locations, allLocations);
}
// hasRemoteLogs(): LogPushData consults this before every message; when it returns true, a tag obtained from
// getRandomRouterTag() is placed first in that message's tag list.
virtual bool hasRemoteLogs() const = 0;
// getRandomRouterTag(): NOTE(review): presumably picks one of the getLogRouterTags() router tags at random, as
// the name suggests -- the selection policy is not visible here; confirm against the implementations.
virtual Tag getRandomRouterTag() const = 0;
virtual int getLogRouterTags() const = 0; // Returns the number of router tags.
// getRandomTxsTag(): LogPushData::addTxsTag() uses this tag when getTLogVersion() >= TLogVersion::V4 and falls
// back to the fixed txsTag otherwise.
virtual Tag getRandomTxsTag() const = 0;
// Returns the TLogVersion of the current generation of TLogs.
// (This only exists because getLogSystemConfig is a significantly more expensive call.)
virtual TLogVersion getTLogVersion() const = 0;
// stopRejoins(): NOTE(review): presumably stops processing of tlog rejoin requests (see
// onLogSystemConfigChange()); the effect is not visible from this declaration -- confirm.
virtual void stopRejoins() = 0;
// Returns the pseudo tag to be popped for the given process class. If the
// process class doesn't use pseudo tag, return the same tag.
virtual Tag getPseudoPopTag(Tag tag, ProcessClass::ClassType type) const = 0;
// hasPseudoLocality(): NOTE(review): presumably reports whether `locality` is one of the pseudo localities
// in use by this log system -- confirm against the implementations.
virtual bool hasPseudoLocality(int8_t locality) const = 0;
// Returns the actual version to be popped from the log router tag for the given pseudo tag.
// For instance, a pseudo tag (-8, 2) means the actual popping tag is (-2, 2). Assuming there
// are multiple pseudo tags, the returned version is the min(all pseudo tags' "upTo" versions).
virtual Version popPseudoLocalityTag(Tag tag, Version upTo) = 0;
// setBackupWorkers(): Takes the replies to backup worker initialization requests. NOTE(review): presumably
// records the corresponding workers in the log system -- confirm.
virtual void setBackupWorkers(const std::vector<InitializeBackupReply>& replies) = 0;
// Removes a finished backup worker from log system and returns true. Returns false
// if the worker is not found.
virtual bool removeBackupWorker(const BackupWorkerDoneRequest& req) = 0;
// Accessor pair for the oldest backup epoch tracked by the log system. NOTE(review): what "oldest" is measured
// against (e.g. epochs that still have backup workers) is not visible here -- confirm.
virtual LogEpoch getOldestBackupEpoch() const = 0;
virtual void setOldestBackupEpoch(LogEpoch epoch) = 0;
};
struct LengthPrefixedStringRef {
	// Handle to a string that is stored in memory as a 4-byte length immediately followed by its bytes.
	// The handle itself is a single pointer (8 bytes vs 12 bytes for StringRef); the price is that the stored
	// string is 4 bytes bigger and that substring operations aren't efficient as they are with StringRef.
	// It's a good choice when there might be lots of references to the same exact string.

	// Points at the length prefix; the string bytes start right after it. Null for a default-constructed handle.
	uint32_t* length;

	LengthPrefixedStringRef() : length(nullptr) {}
	LengthPrefixedStringRef(uint32_t* length) : length(length) {}

	// Raw pointer to the length prefix (may be null).
	uint32_t* getLengthPtr() const { return length; }

	// Number of string bytes, excluding the 4-byte prefix. Asserts that the handle is non-null.
	int expectedSize() const {
		ASSERT(length);
		return *length;
	}

	// Views the bytes following the prefix as a StringRef of the stored length. Asserts that the handle is
	// non-null.
	StringRef toStringRef() const {
		ASSERT(length);
		uint8_t* const firstByte = reinterpret_cast<uint8_t*>(length + 1);
		return StringRef(firstByte, *length);
	}
};
// Comparison functor for pair-like types: orders two values by their `first` member only (using operator<),
// ignoring everything else in T.
template <class T>
struct CompareFirst {
	bool operator()(T const& a, T const& b) const {
		const auto& leftKey = a.first;
		const auto& rightKey = b.first;
		return leftKey < rightKey;
	}
};
// Structure to store serialized mutations sent from the proxy to the
// transaction logs. The serialization repeats with the following format:
//
// +----------------------+ +----------------------+ +----------+ +----------------+ +----------------------+
// | Message size | | Subsequence | | # of tags| | Tag | . . . . | Mutation |
// +----------------------+ +----------------------+ +----------+ +----------------+ +----------------------+
// <------- 32 bits ------> <------- 32 bits ------> <- 16 bits-> <---- 24 bits ---> <---- variable bits --->
//
// `Mutation` can be a serialized MutationRef or a special metadata message
// such as LogProtocolMessage or SpanContextMessage. The type of `Mutation` is
// uniquely identified by its first byte -- a value from MutationRef::Type.
//
struct LogPushData : NonCopyable {
// Log subsequences have to start at 1 (the MergedPeekCursor relies on this to make sure we never have !hasMessage()
// in the middle of data for a version
explicit LogPushData(Reference<ILogSystem> logSystem) : logSystem(logSystem), subsequence(1) {
for (auto& log : logSystem->getLogSystemConfig().tLogs) {
if (log.isLocal) {
for (int i = 0; i < log.tLogs.size(); i++) {
messagesWriter.push_back(BinaryWriter(AssumeVersion(g_network->protocolVersion())));
}
}
}
}
void addTxsTag() {
if (logSystem->getTLogVersion() >= TLogVersion::V4) {
next_message_tags.push_back(logSystem->getRandomTxsTag());
} else {
next_message_tags.push_back(txsTag);
}
}
// addTag() adds a tag for the *next* message to be added
void addTag(Tag tag) { next_message_tags.push_back(tag); }
template <class T>
void addTags(T tags) {
next_message_tags.insert(next_message_tags.end(), tags.begin(), tags.end());
}
// Add transaction info to be written before the first mutation in the transaction.
void addTransactionInfo(SpanID const& context) {
TEST(!spanContext.isValid()); // addTransactionInfo with invalid SpanID
spanContext = context;
writtenLocations.clear();
}
void writeMessage(StringRef rawMessageWithoutLength, bool usePreviousLocations) {
if (!usePreviousLocations) {
prev_tags.clear();
if (logSystem->hasRemoteLogs()) {
prev_tags.push_back(logSystem->getRandomRouterTag());
}
for (auto& tag : next_message_tags) {
prev_tags.push_back(tag);
}
msg_locations.clear();
logSystem->getPushLocations(prev_tags, msg_locations);
next_message_tags.clear();
}
uint32_t subseq = this->subsequence++;
uint32_t msgsize =
rawMessageWithoutLength.size() + sizeof(subseq) + sizeof(uint16_t) + sizeof(Tag) * prev_tags.size();
for (int loc : msg_locations) {
BinaryWriter& wr = messagesWriter[loc];
wr << msgsize << subseq << uint16_t(prev_tags.size());
for (auto& tag : prev_tags)
wr << tag;
wr.serializeBytes(rawMessageWithoutLength);
}
}
template <class T>
void writeTypedMessage(T const& item, bool metadataMessage = false, bool allLocations = false) {
prev_tags.clear();
if (logSystem->hasRemoteLogs()) {
prev_tags.push_back(logSystem->getRandomRouterTag());
}
for (auto& tag : next_message_tags) {
prev_tags.push_back(tag);
}
msg_locations.clear();
logSystem->getPushLocations(prev_tags, msg_locations, allLocations);
BinaryWriter bw(AssumeVersion(g_network->protocolVersion()));
// Metadata messages (currently LogProtocolMessage is the only metadata
// message) should be written before span information. If this isn't a
// metadata message, make sure all locations have had transaction info
// written to them. Mutations may have different sets of tags, so it
// is necessary to check all tag locations each time a mutation is
// written.
if (!metadataMessage) {
uint32_t subseq = this->subsequence++;
bool updatedLocation = false;
for (int loc : msg_locations) {
updatedLocation = writeTransactionInfo(loc, subseq) || updatedLocation;
}
// If this message doesn't write to any new locations, the
// subsequence wasn't actually used and can be decremented.
if (!updatedLocation) {
this->subsequence--;
TEST(true); // No new SpanContextMessage written to transaction logs
ASSERT(this->subsequence > 0);
}
} else {
// When writing a metadata message, make sure transaction state has
// been reset. If you are running into this assertion, make sure
// you are calling addTransactionInfo before each transaction.
ASSERT(writtenLocations.size() == 0);
}
uint32_t subseq = this->subsequence++;
bool first = true;
int firstOffset = -1, firstLength = -1;
for (int loc : msg_locations) {
BinaryWriter& wr = messagesWriter[loc];
if (first) {
firstOffset = wr.getLength();
wr << uint32_t(0) << subseq << uint16_t(prev_tags.size());
for (auto& tag : prev_tags)
wr << tag;
wr << item;
firstLength = wr.getLength() - firstOffset;
*(uint32_t*)((uint8_t*)wr.getData() + firstOffset) = firstLength - sizeof(uint32_t);
DEBUG_TAGS_AND_MESSAGE("ProxyPushLocations",
invalidVersion,
StringRef(((uint8_t*)wr.getData() + firstOffset), firstLength))
.detail("PushLocations", msg_locations);
first = false;
} else {
BinaryWriter& from = messagesWriter[msg_locations[0]];
wr.serializeBytes((uint8_t*)from.getData() + firstOffset, firstLength);
}
}
next_message_tags.clear();
}
Standalone<StringRef> getMessages(int loc) { return messagesWriter[loc].toValue(); }
private:
Reference<ILogSystem> logSystem;
std::vector<Tag> next_message_tags;
std::vector<Tag> prev_tags;
std::vector<BinaryWriter> messagesWriter;
std::vector<int> msg_locations;
// Stores message locations that have had span information written to them
// for the current transaction. Adding transaction info will reset this
// field.
std::unordered_set<int> writtenLocations;
uint32_t subsequence;
SpanID spanContext;
// Writes transaction info to the message stream at the given location if
// it has not already been written (for the current transaction). Returns
// true on a successful write, and false if the location has already been
// written.
bool writeTransactionInfo(int location, uint32_t subseq) {
if (!FLOW_KNOBS->WRITE_TRACING_ENABLED || logSystem->getTLogVersion() < TLogVersion::V6 ||
writtenLocations.count(location) != 0) {
return false;
}
TEST(true); // Wrote SpanContextMessage to a transaction log
writtenLocations.insert(location);
BinaryWriter& wr = messagesWriter[location];
SpanContextMessage contextMessage(spanContext);
int offset = wr.getLength();
wr << uint32_t(0) << subseq << uint16_t(prev_tags.size());
for (auto& tag : prev_tags)
wr << tag;
wr << contextMessage;
int length = wr.getLength() - offset;
*(uint32_t*)((uint8_t*)wr.getData() + offset) = length - sizeof(uint32_t);
return true;
}
};
#endif