foundationdb/fdbserver/worker.actor.cpp
Ata E Husain Bohra dfe9d184ff Refactor: ClusterController driving cluster-recovery state machine
At present, cluster recovery process consists of following steps:
1. ClusterController clusterWatchDatabase actor recruits
   master/sequencer process.
2. Sequencer process implements the cluster recovery state machine,
   responsible to recruit all other processes as well restore the
   cluster state.

Patch proposes a scheme where the cluster recovery state machine
is implemented and driven by the ClusterController process instead
of the Sequencer process.

Advantages of the scheme could be:
1. Simplified design where ClusterController recruits "sequencer"
   process like other worker processes compared to current scheme
   where "sequencer" process gets special treatment. In newer scheme
   sequencer is responsible for maintaining/providing
   "committed version" (as expected).
2. ClusterController is responsible for worker processes recruitment,
   the sequencer though orchestrating the recovery state machine, it
   need to reachout to the ClusterController for recruiting worker
   processes etc.

NOTE:
Patch has moved the recovery state machine code from
'sequencer' -> 'cluster-controller' process, however, necessary
updates were done for both functionality as well as performance
improvement reasons.

Next Steps:
Cluster recovery documentation will be updated in near future.
2021-12-22 14:06:27 -08:00

2708 lines
108 KiB
C++

/*
* worker.actor.cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2018 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <tuple>
#include <boost/lexical_cast.hpp>
#include "fdbrpc/Locality.h"
#include "fdbclient/GlobalConfig.actor.h"
#include "fdbclient/ProcessInterface.h"
#include "fdbclient/StorageServerInterface.h"
#include "fdbserver/Knobs.h"
#include "flow/ActorCollection.h"
#include "flow/ProtocolVersion.h"
#include "flow/SystemMonitor.h"
#include "flow/TDMetric.actor.h"
#include "fdbrpc/simulator.h"
#include "fdbclient/NativeAPI.actor.h"
#include "fdbserver/MetricLogger.actor.h"
#include "fdbserver/BackupInterface.h"
#include "fdbserver/RoleLineage.actor.h"
#include "fdbserver/WorkerInterface.actor.h"
#include "fdbserver/IKeyValueStore.h"
#include "fdbserver/WaitFailure.h"
#include "fdbserver/TesterInterface.actor.h" // for poisson()
#include "fdbserver/IDiskQueue.h"
#include "fdbclient/DatabaseContext.h"
#include "fdbserver/DataDistributorInterface.h"
#include "fdbserver/BlobManagerInterface.h"
#include "fdbserver/ServerDBInfo.h"
#include "fdbserver/FDBExecHelper.actor.h"
#include "fdbserver/CoordinationInterface.h"
#include "fdbserver/LocalConfiguration.h"
#include "fdbclient/MonitorLeader.h"
#include "fdbclient/ClientWorkerInterface.h"
#include "flow/Profiler.h"
#include "flow/ThreadHelper.actor.h"
#include "flow/Trace.h"
#include "flow/flow.h"
#include "flow/network.h"
#ifdef __linux__
#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
#endif
#if defined(__linux__) || defined(__FreeBSD__)
#ifdef USE_GPERFTOOLS
#include "gperftools/profiler.h"
#include "gperftools/heap-profiler.h"
#endif
#include <unistd.h>
#include <thread>
#include <execinfo.h>
#endif
#include "flow/actorcompiler.h" // This must be the last #include.
#if CENABLED(0, NOT_IN_CLEAN)
extern IKeyValueStore* keyValueStoreCompressTestData(IKeyValueStore* store);
#define KV_STORE(filename, uid) keyValueStoreCompressTestData(keyValueStoreSQLite(filename, uid))
#elif CENABLED(0, NOT_IN_CLEAN)
#define KV_STORE(filename, uid) keyValueStoreSQLite(filename, uid)
#else
#define KV_STORE(filename, uid) keyValueStoreMemory(filename, uid)
#endif
namespace {
RoleLineageCollector roleLineageCollector;
}
ACTOR Future<std::vector<Endpoint>> tryDBInfoBroadcast(RequestStream<UpdateServerDBInfoRequest> stream,
UpdateServerDBInfoRequest req) {
ErrorOr<std::vector<Endpoint>> rep =
wait(stream.getReplyUnlessFailedFor(req, SERVER_KNOBS->DBINFO_FAILED_DELAY, 0));
if (rep.present()) {
return rep.get();
}
req.broadcastInfo.push_back(stream.getEndpoint());
return req.broadcastInfo;
}
ACTOR Future<std::vector<Endpoint>> broadcastDBInfoRequest(UpdateServerDBInfoRequest req,
int sendAmount,
Optional<Endpoint> sender,
bool sendReply) {
state std::vector<Future<std::vector<Endpoint>>> replies;
state ReplyPromise<std::vector<Endpoint>> reply = req.reply;
resetReply(req);
int currentStream = 0;
std::vector<Endpoint> broadcastEndpoints = req.broadcastInfo;
for (int i = 0; i < sendAmount && currentStream < broadcastEndpoints.size(); i++) {
std::vector<Endpoint> endpoints;
RequestStream<UpdateServerDBInfoRequest> cur(broadcastEndpoints[currentStream++]);
while (currentStream < broadcastEndpoints.size() * (i + 1) / sendAmount) {
endpoints.push_back(broadcastEndpoints[currentStream++]);
}
req.broadcastInfo = endpoints;
replies.push_back(tryDBInfoBroadcast(cur, req));
resetReply(req);
}
wait(waitForAll(replies));
std::vector<Endpoint> notUpdated;
if (sender.present()) {
notUpdated.push_back(sender.get());
}
for (auto& it : replies) {
notUpdated.insert(notUpdated.end(), it.get().begin(), it.get().end());
}
if (sendReply) {
reply.send(notUpdated);
}
return notUpdated;
}
ACTOR static Future<Void> extractClientInfo(Reference<AsyncVar<ServerDBInfo> const> db,
Reference<AsyncVar<ClientDBInfo>> info) {
state std::vector<UID> lastCommitProxyUIDs;
state std::vector<CommitProxyInterface> lastCommitProxies;
state std::vector<UID> lastGrvProxyUIDs;
state std::vector<GrvProxyInterface> lastGrvProxies;
loop {
ClientDBInfo ni = db->get().client;
shrinkProxyList(ni, lastCommitProxyUIDs, lastCommitProxies, lastGrvProxyUIDs, lastGrvProxies);
info->set(ni);
wait(db->onChange());
}
}
Database openDBOnServer(Reference<AsyncVar<ServerDBInfo> const> const& db,
TaskPriority taskID,
LockAware lockAware,
EnableLocalityLoadBalance enableLocalityLoadBalance) {
auto info = makeReference<AsyncVar<ClientDBInfo>>();
auto cx = DatabaseContext::create(info,
extractClientInfo(db, info),
enableLocalityLoadBalance ? db->get().myLocality : LocalityData(),
enableLocalityLoadBalance,
taskID,
lockAware);
GlobalConfig::create(cx, db, std::addressof(db->get().client));
GlobalConfig::globalConfig().trigger(samplingFrequency, samplingProfilerUpdateFrequency);
GlobalConfig::globalConfig().trigger(samplingWindow, samplingProfilerUpdateWindow);
return cx;
}
struct ErrorInfo {
Error error;
const Role& role;
UID id;
ErrorInfo(Error e, const Role& role, UID id) : error(e), role(role), id(id) {}
template <class Ar>
void serialize(Ar&) {
ASSERT(false);
}
};
Error checkIOTimeout(Error const& e) {
// Convert all_errors to io_timeout if global timeout bool was set
bool timeoutOccurred = (bool)g_network->global(INetwork::enASIOTimedOut);
// In simulation, have to check global timed out flag for both this process and the machine process on which IO is
// done
if (g_network->isSimulated() && !timeoutOccurred)
timeoutOccurred = g_pSimulator->getCurrentProcess()->machine->machineProcess->global(INetwork::enASIOTimedOut);
if (timeoutOccurred) {
TEST(true); // Timeout occurred
Error timeout = io_timeout();
// Preserve injectedness of error
if (e.isInjectedFault())
timeout = timeout.asInjectedFault();
return timeout;
}
return e;
}
ACTOR Future<Void> forwardError(PromiseStream<ErrorInfo> errors, Role role, UID id, Future<Void> process) {
try {
wait(process);
errors.send(ErrorInfo(success(), role, id));
return Void();
} catch (Error& e) {
errors.send(ErrorInfo(e, role, id));
return Void();
}
}
ACTOR Future<Void> handleIOErrors(Future<Void> actor, IClosable* store, UID id, Future<Void> onClosed = Void()) {
state Future<ErrorOr<Void>> storeError = actor.isReady() ? Never() : errorOr(store->getError());
choose {
when(state ErrorOr<Void> e = wait(errorOr(actor))) {
if (e.isError() && e.getError().code() == error_code_please_reboot) {
// no need to wait.
} else {
wait(onClosed);
}
if (e.isError() && e.getError().code() == error_code_broken_promise && !storeError.isReady()) {
wait(delay(0.00001 + FLOW_KNOBS->MAX_BUGGIFIED_DELAY));
}
if (storeError.isReady())
throw storeError.get().getError();
if (e.isError())
throw e.getError();
else
return e.get();
}
when(ErrorOr<Void> e = wait(storeError)) {
TraceEvent("WorkerTerminatingByIOError", id).error(e.getError(), true);
actor.cancel();
// file_not_found can occur due to attempting to open a partially deleted DiskQueue, which should not be
// reported SevError.
if (e.getError().code() == error_code_file_not_found) {
TEST(true); // Worker terminated with file_not_found error
return Void();
}
throw e.getError();
}
}
}
ACTOR Future<Void> workerHandleErrors(FutureStream<ErrorInfo> errors) {
loop choose {
when(ErrorInfo _err = waitNext(errors)) {
ErrorInfo err = _err;
bool ok = err.error.code() == error_code_success || err.error.code() == error_code_please_reboot ||
err.error.code() == error_code_actor_cancelled ||
err.error.code() == error_code_coordinators_changed || // The worker server was cancelled
err.error.code() == error_code_shutdown_in_progress;
if (!ok) {
err.error = checkIOTimeout(err.error); // Possibly convert error to io_timeout
}
endRole(err.role, err.id, "Error", ok, err.error);
if (err.error.code() == error_code_please_reboot ||
(err.role == Role::SHARED_TRANSACTION_LOG &&
(err.error.code() == error_code_io_error || err.error.code() == error_code_io_timeout)))
throw err.error;
}
}
}
// Improve simulation code coverage by sometimes deferring the destruction of workerInterface (and therefore "endpoint
// not found" responses to clients
// for an extra second, so that clients are more likely to see broken_promise errors
ACTOR template <class T>
Future<Void> zombie(T workerInterface, Future<Void> worker) {
try {
wait(worker);
if (BUGGIFY)
wait(delay(1.0));
return Void();
} catch (Error& e) {
throw;
}
}
ACTOR Future<Void> loadedPonger(FutureStream<LoadedPingRequest> pings) {
state Standalone<StringRef> payloadBack(std::string(20480, '.'));
loop {
LoadedPingRequest pong = waitNext(pings);
LoadedReply rep;
rep.payload = (pong.loadReply ? payloadBack : LiteralStringRef(""));
rep.id = pong.id;
pong.reply.send(rep);
}
}
StringRef fileStoragePrefix = LiteralStringRef("storage-");
StringRef testingStoragePrefix = LiteralStringRef("testingstorage-");
StringRef fileLogDataPrefix = LiteralStringRef("log-");
StringRef fileVersionedLogDataPrefix = LiteralStringRef("log2-");
StringRef fileLogQueuePrefix = LiteralStringRef("logqueue-");
StringRef tlogQueueExtension = LiteralStringRef("fdq");
enum class FilesystemCheck {
FILES_ONLY,
DIRECTORIES_ONLY,
FILES_AND_DIRECTORIES,
};
struct KeyValueStoreSuffix {
KeyValueStoreType type;
std::string suffix;
FilesystemCheck check;
};
KeyValueStoreSuffix bTreeV1Suffix = { KeyValueStoreType::SSD_BTREE_V1, ".fdb", FilesystemCheck::FILES_ONLY };
KeyValueStoreSuffix bTreeV2Suffix = { KeyValueStoreType::SSD_BTREE_V2, ".sqlite", FilesystemCheck::FILES_ONLY };
KeyValueStoreSuffix memorySuffix = { KeyValueStoreType::MEMORY, "-0.fdq", FilesystemCheck::FILES_ONLY };
KeyValueStoreSuffix memoryRTSuffix = { KeyValueStoreType::MEMORY_RADIXTREE, "-0.fdr", FilesystemCheck::FILES_ONLY };
KeyValueStoreSuffix redwoodSuffix = { KeyValueStoreType::SSD_REDWOOD_V1, ".redwood-v1", FilesystemCheck::FILES_ONLY };
KeyValueStoreSuffix rocksdbSuffix = { KeyValueStoreType::SSD_ROCKSDB_V1,
".rocksdb",
FilesystemCheck::DIRECTORIES_ONLY };
std::string validationFilename = "_validate";
std::string filenameFromSample(KeyValueStoreType storeType, std::string folder, std::string sample_filename) {
if (storeType == KeyValueStoreType::SSD_BTREE_V1)
return joinPath(folder, sample_filename);
else if (storeType == KeyValueStoreType::SSD_BTREE_V2)
return joinPath(folder, sample_filename);
else if (storeType == KeyValueStoreType::MEMORY || storeType == KeyValueStoreType::MEMORY_RADIXTREE)
return joinPath(folder, sample_filename.substr(0, sample_filename.size() - 5));
else if (storeType == KeyValueStoreType::SSD_REDWOOD_V1)
return joinPath(folder, sample_filename);
else if (storeType == KeyValueStoreType::SSD_ROCKSDB_V1)
return joinPath(folder, sample_filename);
UNREACHABLE();
}
std::string filenameFromId(KeyValueStoreType storeType, std::string folder, std::string prefix, UID id) {
if (storeType == KeyValueStoreType::SSD_BTREE_V1)
return joinPath(folder, prefix + id.toString() + ".fdb");
else if (storeType == KeyValueStoreType::SSD_BTREE_V2)
return joinPath(folder, prefix + id.toString() + ".sqlite");
else if (storeType == KeyValueStoreType::MEMORY || storeType == KeyValueStoreType::MEMORY_RADIXTREE)
return joinPath(folder, prefix + id.toString() + "-");
else if (storeType == KeyValueStoreType::SSD_REDWOOD_V1)
return joinPath(folder, prefix + id.toString() + ".redwood-v1");
else if (storeType == KeyValueStoreType::SSD_ROCKSDB_V1)
return joinPath(folder, prefix + id.toString() + ".rocksdb");
TraceEvent(SevError, "UnknownStoreType").detail("StoreType", storeType.toString());
UNREACHABLE();
}
struct TLogOptions {
TLogOptions() = default;
TLogOptions(TLogVersion v, TLogSpillType s) : version(v), spillType(s) {}
TLogVersion version = TLogVersion::DEFAULT;
TLogSpillType spillType = TLogSpillType::UNSET;
static ErrorOr<TLogOptions> FromStringRef(StringRef s) {
TLogOptions options;
for (StringRef key = s.eat("_"), value = s.eat("_"); s.size() != 0 || key.size();
key = s.eat("_"), value = s.eat("_")) {
if (key.size() != 0 && value.size() == 0)
return default_error_or();
if (key == LiteralStringRef("V")) {
ErrorOr<TLogVersion> tLogVersion = TLogVersion::FromStringRef(value);
if (tLogVersion.isError())
return tLogVersion.getError();
options.version = tLogVersion.get();
} else if (key == LiteralStringRef("LS")) {
ErrorOr<TLogSpillType> tLogSpillType = TLogSpillType::FromStringRef(value);
if (tLogSpillType.isError())
return tLogSpillType.getError();
options.spillType = tLogSpillType.get();
} else {
return default_error_or();
}
}
return options;
}
bool operator==(const TLogOptions& o) {
return version == o.version && (spillType == o.spillType || version >= TLogVersion::V5);
}
std::string toPrefix() const {
std::string toReturn = "";
switch (version) {
case TLogVersion::UNSET:
ASSERT(false);
case TLogVersion::V2:
return "";
case TLogVersion::V3:
case TLogVersion::V4:
toReturn =
"V_" + boost::lexical_cast<std::string>(version) + "_LS_" + boost::lexical_cast<std::string>(spillType);
break;
case TLogVersion::V5:
case TLogVersion::V6:
toReturn = "V_" + boost::lexical_cast<std::string>(version);
break;
}
ASSERT_WE_THINK(FromStringRef(toReturn).get() == *this);
return toReturn + "-";
}
};
TLogFn tLogFnForOptions(TLogOptions options) {
switch (options.version) {
case TLogVersion::V2:
if (options.spillType == TLogSpillType::REFERENCE)
ASSERT(false);
return oldTLog_6_0::tLog;
case TLogVersion::V3:
case TLogVersion::V4:
if (options.spillType == TLogSpillType::VALUE)
return oldTLog_6_0::tLog;
else
return oldTLog_6_2::tLog;
case TLogVersion::V5:
case TLogVersion::V6:
return tLog;
default:
ASSERT(false);
}
return tLog;
}
struct DiskStore {
enum COMPONENT { TLogData, Storage, UNSET };
UID storeID = UID();
std::string filename = ""; // For KVStoreMemory just the base filename to be passed to IDiskQueue
COMPONENT storedComponent = UNSET;
KeyValueStoreType storeType = KeyValueStoreType::END;
TLogOptions tLogOptions;
};
std::vector<DiskStore> getDiskStores(std::string folder,
std::string suffix,
KeyValueStoreType type,
FilesystemCheck check) {
std::vector<DiskStore> result;
std::vector<std::string> files;
if (check == FilesystemCheck::FILES_ONLY || check == FilesystemCheck::FILES_AND_DIRECTORIES) {
files = platform::listFiles(folder, suffix);
}
if (check == FilesystemCheck::DIRECTORIES_ONLY || check == FilesystemCheck::FILES_AND_DIRECTORIES) {
for (const auto& directory : platform::listDirectories(folder)) {
if (StringRef(directory).endsWith(suffix)) {
files.push_back(directory);
}
}
}
for (int idx = 0; idx < files.size(); idx++) {
DiskStore store;
store.storeType = type;
StringRef filename = StringRef(files[idx]);
Standalone<StringRef> prefix;
if (filename.startsWith(fileStoragePrefix)) {
store.storedComponent = DiskStore::Storage;
prefix = fileStoragePrefix;
} else if (filename.startsWith(testingStoragePrefix)) {
store.storedComponent = DiskStore::Storage;
prefix = testingStoragePrefix;
} else if (filename.startsWith(fileVersionedLogDataPrefix)) {
store.storedComponent = DiskStore::TLogData;
// Use the option string that's in the file rather than tLogOptions.toPrefix(),
// because they might be different if a new option was introduced in this version.
StringRef optionsString = filename.removePrefix(fileVersionedLogDataPrefix).eat("-");
TraceEvent("DiskStoreVersioned").detail("Filename", filename);
ErrorOr<TLogOptions> tLogOptions = TLogOptions::FromStringRef(optionsString);
if (tLogOptions.isError()) {
TraceEvent(SevWarn, "DiskStoreMalformedFilename").detail("Filename", filename);
continue;
}
TraceEvent("DiskStoreVersionedSuccess").detail("Filename", filename);
store.tLogOptions = tLogOptions.get();
prefix = filename.substr(0, fileVersionedLogDataPrefix.size() + optionsString.size() + 1);
} else if (filename.startsWith(fileLogDataPrefix)) {
TraceEvent("DiskStoreUnversioned").detail("Filename", filename);
store.storedComponent = DiskStore::TLogData;
store.tLogOptions.version = TLogVersion::V2;
store.tLogOptions.spillType = TLogSpillType::VALUE;
prefix = fileLogDataPrefix;
} else {
continue;
}
store.storeID = UID::fromString(files[idx].substr(prefix.size(), 32));
store.filename = filenameFromSample(type, folder, files[idx]);
result.push_back(store);
}
return result;
}
std::vector<DiskStore> getDiskStores(std::string folder) {
auto result = getDiskStores(folder, bTreeV1Suffix.suffix, bTreeV1Suffix.type, bTreeV1Suffix.check);
auto result1 = getDiskStores(folder, bTreeV2Suffix.suffix, bTreeV2Suffix.type, bTreeV2Suffix.check);
result.insert(result.end(), result1.begin(), result1.end());
auto result2 = getDiskStores(folder, memorySuffix.suffix, memorySuffix.type, memorySuffix.check);
result.insert(result.end(), result2.begin(), result2.end());
auto result3 = getDiskStores(folder, redwoodSuffix.suffix, redwoodSuffix.type, redwoodSuffix.check);
result.insert(result.end(), result3.begin(), result3.end());
auto result4 = getDiskStores(folder, memoryRTSuffix.suffix, memoryRTSuffix.type, memoryRTSuffix.check);
result.insert(result.end(), result4.begin(), result4.end());
auto result5 = getDiskStores(folder, rocksdbSuffix.suffix, rocksdbSuffix.type, rocksdbSuffix.check);
result.insert(result.end(), result5.begin(), result5.end());
return result;
}
// Register the worker interf to cluster controller (cc) and
// re-register the worker when key roles interface, e.g., cc, dd, ratekeeper, change.
ACTOR Future<Void> registrationClient(Reference<AsyncVar<Optional<ClusterControllerFullInterface>> const> ccInterface,
WorkerInterface interf,
Reference<AsyncVar<ClusterControllerPriorityInfo>> asyncPriorityInfo,
ProcessClass initialClass,
Reference<AsyncVar<Optional<DataDistributorInterface>> const> ddInterf,
Reference<AsyncVar<Optional<RatekeeperInterface>> const> rkInterf,
Reference<AsyncVar<Optional<BlobManagerInterface>> const> bmInterf,
Reference<AsyncVar<bool> const> degraded,
Reference<IClusterConnectionRecord> connRecord,
Reference<AsyncVar<std::set<std::string>> const> issues,
Reference<LocalConfiguration> localConfig,
Reference<AsyncVar<ServerDBInfo>> dbInfo) {
// Keeps the cluster controller (as it may be re-elected) informed that this worker exists
// The cluster controller uses waitFailureClient to find out if we die, and returns from registrationReply
// (requiring us to re-register) The registration request piggybacks optional distributor interface if it exists.
state Generation requestGeneration = 0;
state ProcessClass processClass = initialClass;
state Reference<AsyncVar<Optional<std::pair<uint16_t, StorageServerInterface>>>> scInterf(
new AsyncVar<Optional<std::pair<uint16_t, StorageServerInterface>>>());
state Future<Void> cacheProcessFuture;
state Future<Void> cacheErrorsFuture;
state Optional<double> incorrectTime;
state bool firstReg = true;
loop {
state ClusterConnectionString storedConnectionString;
state bool upToDate = true;
if (connRecord) {
bool upToDateResult = wait(connRecord->upToDate(storedConnectionString));
upToDate = upToDateResult;
}
if (upToDate) {
incorrectTime = Optional<double>();
}
RegisterWorkerRequest request(interf,
initialClass,
processClass,
asyncPriorityInfo->get(),
requestGeneration++,
ddInterf->get(),
rkInterf->get(),
bmInterf->get(),
degraded->get(),
localConfig->lastSeenVersion(),
localConfig->configClassSet());
for (auto const& i : issues->get()) {
request.issues.push_back_deep(request.issues.arena(), i);
}
if (!upToDate) {
request.issues.push_back_deep(request.issues.arena(), LiteralStringRef("incorrect_cluster_file_contents"));
std::string connectionString = connRecord->getConnectionString().toString();
if (!incorrectTime.present()) {
incorrectTime = now();
}
// Don't log a SevWarnAlways initially to account for transient issues (e.g. someone else changing
// the file right before us)
TraceEvent(now() - incorrectTime.get() > 300 ? SevWarnAlways : SevWarn, "IncorrectClusterFileContents")
.detail("ClusterFile", connRecord->toString())
.detail("StoredConnectionString", storedConnectionString.toString())
.detail("CurrentConnectionString", connectionString);
}
auto peers = FlowTransport::transport().getIncompatiblePeers();
for (auto it = peers->begin(); it != peers->end();) {
if (now() - it->second.second > FLOW_KNOBS->INCOMPATIBLE_PEER_DELAY_BEFORE_LOGGING) {
request.incompatiblePeers.push_back(it->first);
it = peers->erase(it);
} else {
it++;
}
}
state bool ccInterfacePresent = ccInterface->get().present();
if (ccInterfacePresent) {
request.requestDbInfo = (ccInterface->get().get().id() != dbInfo->get().clusterInterface.id());
if (firstReg) {
request.requestDbInfo = true;
firstReg = false;
}
}
state Future<RegisterWorkerReply> registrationReply =
ccInterfacePresent ? brokenPromiseToNever(ccInterface->get().get().registerWorker.getReply(request))
: Never();
state double startTime = now();
loop choose {
when(RegisterWorkerReply reply = wait(registrationReply)) {
processClass = reply.processClass;
asyncPriorityInfo->set(reply.priorityInfo);
TraceEvent("WorkerRegisterReply")
.detail("CCID", ccInterface->get().get().id())
.detail("ProcessClass", reply.processClass.toString());
break;
}
when(wait(delay(SERVER_KNOBS->UNKNOWN_CC_TIMEOUT))) {
if (!ccInterfacePresent) {
TraceEvent(SevWarn, "WorkerRegisterTimeout").detail("WaitTime", now() - startTime);
}
}
when(wait(ccInterface->onChange())) { break; }
when(wait(ddInterf->onChange())) { break; }
when(wait(rkInterf->onChange())) { break; }
when(wait(bmInterf->onChange())) { break; }
when(wait(degraded->onChange())) { break; }
when(wait(FlowTransport::transport().onIncompatibleChanged())) { break; }
when(wait(issues->onChange())) { break; }
}
}
}
// Returns true if `address` is used in the db (indicated by `dbInfo`) transaction system and in the db's primary DC.
bool addressInDbAndPrimaryDc(const NetworkAddress& address, Reference<AsyncVar<ServerDBInfo> const> dbInfo) {
const auto& dbi = dbInfo->get();
if (dbi.master.addresses().contains(address)) {
return true;
}
if (dbi.distributor.present() && dbi.distributor.get().address() == address) {
return true;
}
if (dbi.ratekeeper.present() && dbi.ratekeeper.get().address() == address) {
return true;
}
if (dbi.blobManager.present() && dbi.blobManager.get().address() == address) {
return true;
}
for (const auto& resolver : dbi.resolvers) {
if (resolver.address() == address) {
return true;
}
}
for (const auto& grvProxy : dbi.client.grvProxies) {
if (grvProxy.addresses().contains(address)) {
return true;
}
}
for (const auto& commitProxy : dbi.client.commitProxies) {
if (commitProxy.addresses().contains(address)) {
return true;
}
}
auto localityIsInPrimaryDc = [&dbInfo](const LocalityData& locality) {
return locality.dcId() == dbInfo->get().master.locality.dcId();
};
for (const auto& logSet : dbi.logSystemConfig.tLogs) {
for (const auto& tlog : logSet.tLogs) {
if (!tlog.present()) {
continue;
}
if (!localityIsInPrimaryDc(tlog.interf().filteredLocality)) {
continue;
}
if (tlog.interf().addresses().contains(address)) {
return true;
}
}
}
return false;
}
bool addressesInDbAndPrimaryDc(const NetworkAddressList& addresses, Reference<AsyncVar<ServerDBInfo> const> dbInfo) {
return addressInDbAndPrimaryDc(addresses.address, dbInfo) ||
(addresses.secondaryAddress.present() && addressInDbAndPrimaryDc(addresses.secondaryAddress.get(), dbInfo));
}
namespace {
TEST_CASE("/fdbserver/worker/addressInDbAndPrimaryDc") {
// Setup a ServerDBInfo for test.
ServerDBInfo testDbInfo;
LocalityData testLocal;
testLocal.set(LiteralStringRef("dcid"), StringRef(std::to_string(1)));
testDbInfo.master.locality = testLocal;
// Manually set up a master address.
NetworkAddress testAddress(IPAddress(0x13131313), 1);
testDbInfo.master.getCommitVersion =
RequestStream<struct GetCommitVersionRequest>(Endpoint({ testAddress }, UID(1, 2)));
// First, create an empty TLogInterface, and check that it shouldn't be considered as in primary DC.
testDbInfo.logSystemConfig.tLogs.push_back(TLogSet());
testDbInfo.logSystemConfig.tLogs.back().tLogs.push_back(OptionalInterface<TLogInterface>());
ASSERT(!addressInDbAndPrimaryDc(g_network->getLocalAddress(), makeReference<AsyncVar<ServerDBInfo>>(testDbInfo)));
// Create a remote TLog. Although the remote TLog also uses the local address, it shouldn't be considered as
// in primary DC given the remote locality.
LocalityData fakeRemote;
fakeRemote.set(LiteralStringRef("dcid"), StringRef(std::to_string(2)));
TLogInterface remoteTlog(fakeRemote);
remoteTlog.initEndpoints();
testDbInfo.logSystemConfig.tLogs.back().tLogs.push_back(OptionalInterface(remoteTlog));
ASSERT(!addressInDbAndPrimaryDc(g_network->getLocalAddress(), makeReference<AsyncVar<ServerDBInfo>>(testDbInfo)));
// Next, create a local TLog. Now, the local address should be considered as in local DC.
TLogInterface localTlog(testLocal);
localTlog.initEndpoints();
testDbInfo.logSystemConfig.tLogs.back().tLogs.push_back(OptionalInterface(localTlog));
ASSERT(addressInDbAndPrimaryDc(g_network->getLocalAddress(), makeReference<AsyncVar<ServerDBInfo>>(testDbInfo)));
// Use the master's address to test, which should be considered as in local DC.
testDbInfo.logSystemConfig.tLogs.clear();
ASSERT(addressInDbAndPrimaryDc(testAddress, makeReference<AsyncVar<ServerDBInfo>>(testDbInfo)));
// Last, tests that proxies included in the ClientDbInfo are considered as local.
NetworkAddress grvProxyAddress(IPAddress(0x26262626), 1);
GrvProxyInterface grvProxyInterf;
grvProxyInterf.getConsistentReadVersion =
RequestStream<struct GetReadVersionRequest>(Endpoint({ grvProxyAddress }, UID(1, 2)));
testDbInfo.client.grvProxies.push_back(grvProxyInterf);
ASSERT(addressInDbAndPrimaryDc(grvProxyAddress, makeReference<AsyncVar<ServerDBInfo>>(testDbInfo)));
NetworkAddress commitProxyAddress(IPAddress(0x37373737), 1);
CommitProxyInterface commitProxyInterf;
commitProxyInterf.commit =
RequestStream<struct CommitTransactionRequest>(Endpoint({ commitProxyAddress }, UID(1, 2)));
testDbInfo.client.commitProxies.push_back(commitProxyInterf);
ASSERT(addressInDbAndPrimaryDc(commitProxyAddress, makeReference<AsyncVar<ServerDBInfo>>(testDbInfo)));
return Void();
}
} // namespace
bool addressInDbAndRemoteDc(const NetworkAddress& address, Reference<AsyncVar<ServerDBInfo> const> dbInfo) {
const auto& dbi = dbInfo->get();
for (const auto& logSet : dbi.logSystemConfig.tLogs) {
if (logSet.isLocal || logSet.locality == tagLocalitySatellite) {
continue;
}
for (const auto& tlog : logSet.tLogs) {
if (tlog.present() && tlog.interf().addresses().contains(address)) {
return true;
}
}
for (const auto& logRouter : logSet.logRouters) {
if (logRouter.present() && logRouter.interf().addresses().contains(address)) {
return true;
}
}
}
return false;
}
bool addressesInDbAndRemoteDc(const NetworkAddressList& addresses, Reference<AsyncVar<ServerDBInfo> const> dbInfo) {
return addressInDbAndRemoteDc(addresses.address, dbInfo) ||
(addresses.secondaryAddress.present() && addressInDbAndRemoteDc(addresses.secondaryAddress.get(), dbInfo));
}
namespace {
TEST_CASE("/fdbserver/worker/addressInDbAndRemoteDc") {
// Setup a ServerDBInfo for test.
ServerDBInfo testDbInfo;
LocalityData testLocal;
testLocal.set(LiteralStringRef("dcid"), StringRef(std::to_string(1)));
testDbInfo.master.locality = testLocal;
// First, create an empty TLogInterface, and check that it shouldn't be considered as in remote DC.
testDbInfo.logSystemConfig.tLogs.push_back(TLogSet());
testDbInfo.logSystemConfig.tLogs.back().isLocal = true;
testDbInfo.logSystemConfig.tLogs.back().tLogs.push_back(OptionalInterface<TLogInterface>());
ASSERT(!addressInDbAndRemoteDc(g_network->getLocalAddress(), makeReference<AsyncVar<ServerDBInfo>>(testDbInfo)));
TLogInterface localTlog(testLocal);
localTlog.initEndpoints();
testDbInfo.logSystemConfig.tLogs.back().tLogs.push_back(OptionalInterface(localTlog));
ASSERT(!addressInDbAndRemoteDc(g_network->getLocalAddress(), makeReference<AsyncVar<ServerDBInfo>>(testDbInfo)));
// Create a remote TLog, and it should be considered as in remote DC.
LocalityData fakeRemote;
fakeRemote.set(LiteralStringRef("dcid"), StringRef(std::to_string(2)));
TLogInterface remoteTlog(fakeRemote);
remoteTlog.initEndpoints();
testDbInfo.logSystemConfig.tLogs.push_back(TLogSet());
testDbInfo.logSystemConfig.tLogs.back().isLocal = false;
testDbInfo.logSystemConfig.tLogs.back().tLogs.push_back(OptionalInterface(remoteTlog));
ASSERT(addressInDbAndRemoteDc(g_network->getLocalAddress(), makeReference<AsyncVar<ServerDBInfo>>(testDbInfo)));
// Create a remote log router, and it should be considered as in remote DC.
NetworkAddress logRouterAddress(IPAddress(0x26262626), 1);
TLogInterface remoteLogRouter(fakeRemote);
remoteLogRouter.initEndpoints();
remoteLogRouter.peekMessages = RequestStream<struct TLogPeekRequest>(Endpoint({ logRouterAddress }, UID(1, 2)));
testDbInfo.logSystemConfig.tLogs.back().logRouters.push_back(OptionalInterface(remoteLogRouter));
ASSERT(addressInDbAndRemoteDc(logRouterAddress, makeReference<AsyncVar<ServerDBInfo>>(testDbInfo)));
// Create a satellite tlog, and it shouldn't be considered as in remote DC.
testDbInfo.logSystemConfig.tLogs.push_back(TLogSet());
testDbInfo.logSystemConfig.tLogs.back().locality = tagLocalitySatellite;
NetworkAddress satelliteTLogAddress(IPAddress(0x13131313), 1);
TLogInterface satelliteTLog(fakeRemote);
satelliteTLog.initEndpoints();
satelliteTLog.peekMessages = RequestStream<struct TLogPeekRequest>(Endpoint({ satelliteTLogAddress }, UID(1, 2)));
testDbInfo.logSystemConfig.tLogs.back().tLogs.push_back(OptionalInterface(satelliteTLog));
ASSERT(!addressInDbAndRemoteDc(satelliteTLogAddress, makeReference<AsyncVar<ServerDBInfo>>(testDbInfo)));
return Void();
}
} // namespace
// The actor that actively monitors the health of local and peer servers, and reports anomaly to the cluster controller.
ACTOR Future<Void> healthMonitor(Reference<AsyncVar<Optional<ClusterControllerFullInterface>> const> ccInterface,
WorkerInterface interf,
LocalityData locality,
Reference<AsyncVar<ServerDBInfo> const> dbInfo) {
loop {
Future<Void> nextHealthCheckDelay = Never();
if (dbInfo->get().recoveryState >= RecoveryState::ACCEPTING_COMMITS && ccInterface->get().present()) {
nextHealthCheckDelay = delay(SERVER_KNOBS->WORKER_HEALTH_MONITOR_INTERVAL);
const auto& allPeers = FlowTransport::transport().getAllPeers();
UpdateWorkerHealthRequest req;
bool workerInDb = false;
bool workerInPrimary = false;
if (addressesInDbAndPrimaryDc(interf.addresses(), dbInfo)) {
workerInDb = true;
workerInPrimary = true;
} else if (addressesInDbAndRemoteDc(interf.addresses(), dbInfo)) {
workerInDb = true;
workerInPrimary = false;
}
if (workerInDb) {
for (const auto& [address, peer] : allPeers) {
if (peer->pingLatencies.getPopulationSize() < SERVER_KNOBS->PEER_LATENCY_CHECK_MIN_POPULATION) {
// Ignore peers that don't have enough samples.
// TODO(zhewu): Currently, FlowTransport latency monitor clears ping latency samples on a
// regular
// basis, which may affect the measurement count. Currently,
// WORKER_HEALTH_MONITOR_INTERVAL is much smaller than the ping clearance interval,
// so it may be ok. If this ends to be a problem, we need to consider keep track of
// last ping latencies logged.
continue;
}
if ((workerInPrimary && addressInDbAndPrimaryDc(address, dbInfo)) ||
(!workerInPrimary && addressInDbAndRemoteDc(address, dbInfo))) {
// Only monitoring the servers that in the primary or remote DC's transaction systems.
// Note that currently we are not monitor storage servers, since lagging in storage servers
// today already can trigger server exclusion by data distributor.
if (peer->pingLatencies.percentile(SERVER_KNOBS->PEER_LATENCY_DEGRADATION_PERCENTILE) >
SERVER_KNOBS->PEER_LATENCY_DEGRADATION_THRESHOLD ||
peer->timeoutCount / (double)(peer->pingLatencies.getPopulationSize()) >
SERVER_KNOBS->PEER_TIMEOUT_PERCENTAGE_DEGRADATION_THRESHOLD) {
// This is a degraded peer.
TraceEvent("HealthMonitorDetectDegradedPeer")
.suppressFor(30)
.detail("Peer", address)
.detail("Elapsed", now() - peer->lastLoggedTime)
.detail("MinLatency", peer->pingLatencies.min())
.detail("MaxLatency", peer->pingLatencies.max())
.detail("MeanLatency", peer->pingLatencies.mean())
.detail("MedianLatency", peer->pingLatencies.median())
.detail("CheckedPercentile", SERVER_KNOBS->PEER_LATENCY_DEGRADATION_PERCENTILE)
.detail(
"CheckedPercentileLatency",
peer->pingLatencies.percentile(SERVER_KNOBS->PEER_LATENCY_DEGRADATION_PERCENTILE))
.detail("Count", peer->pingLatencies.getPopulationSize())
.detail("TimeoutCount", peer->timeoutCount);
req.degradedPeers.push_back(address);
}
}
}
}
if (!req.degradedPeers.empty()) {
req.address = FlowTransport::transport().getLocalAddress();
ccInterface->get().get().updateWorkerHealth.send(req);
}
}
choose {
when(wait(nextHealthCheckDelay)) {}
when(wait(ccInterface->onChange())) {}
when(wait(dbInfo->onChange())) {}
}
}
}
#if (defined(__linux__) || defined(__FreeBSD__)) && defined(USE_GPERFTOOLS)
// A set of threads that should be profiled
std::set<std::thread::id> profiledThreads;
// Returns whether or not a given thread should be profiled
int filter_in_thread(void* arg) {
return profiledThreads.count(std::this_thread::get_id()) > 0 ? 1 : 0;
}
#endif
// Enables the calling thread to be profiled
void registerThreadForProfiling() {
#if (defined(__linux__) || defined(__FreeBSD__)) && defined(USE_GPERFTOOLS)
// Not sure if this is actually needed, but a call to backtrace was advised here:
// http://groups.google.com/group/google-perftools/browse_thread/thread/0dfd74532e038eb8/2686d9f24ac4365f?pli=1
profiledThreads.insert(std::this_thread::get_id());
const int num_levels = 100;
void* pc[num_levels];
backtrace(pc, num_levels);
#endif
}
// Starts or stops the CPU profiler
void updateCpuProfiler(ProfilerRequest req) {
switch (req.type) {
case ProfilerRequest::Type::GPROF:
#if (defined(__linux__) || defined(__FreeBSD__)) && defined(USE_GPERFTOOLS) && !defined(VALGRIND)
switch (req.action) {
case ProfilerRequest::Action::ENABLE: {
const char* path = (const char*)req.outputFile.begin();
ProfilerOptions* options = new ProfilerOptions();
options->filter_in_thread = &filter_in_thread;
options->filter_in_thread_arg = nullptr;
ProfilerStartWithOptions(path, options);
break;
}
case ProfilerRequest::Action::DISABLE:
ProfilerStop();
break;
case ProfilerRequest::Action::RUN:
ASSERT(false); // User should have called runProfiler.
break;
}
#endif
break;
case ProfilerRequest::Type::FLOW:
switch (req.action) {
case ProfilerRequest::Action::ENABLE:
startProfiling(g_network, {}, req.outputFile);
break;
case ProfilerRequest::Action::DISABLE:
stopProfiling();
break;
case ProfilerRequest::Action::RUN:
ASSERT(false); // User should have called runProfiler.
break;
}
break;
default:
ASSERT(false);
break;
}
}
ACTOR Future<Void> runCpuProfiler(ProfilerRequest req) {
if (req.action == ProfilerRequest::Action::RUN) {
req.action = ProfilerRequest::Action::ENABLE;
updateCpuProfiler(req);
wait(delay(req.duration));
req.action = ProfilerRequest::Action::DISABLE;
updateCpuProfiler(req);
return Void();
} else {
updateCpuProfiler(req);
return Void();
}
}
void runHeapProfiler(const char* msg) {
#if defined(__linux__) && defined(USE_GPERFTOOLS) && !defined(VALGRIND)
if (IsHeapProfilerRunning()) {
HeapProfilerDump(msg);
} else {
TraceEvent("ProfilerError").detail("Message", "HeapProfiler not running");
}
#else
TraceEvent("ProfilerError").detail("Message", "HeapProfiler Unsupported");
#endif
}
ACTOR Future<Void> runProfiler(ProfilerRequest req) {
if (req.type == ProfilerRequest::Type::GPROF_HEAP) {
runHeapProfiler("User triggered heap dump");
} else {
wait(runCpuProfiler(req));
}
return Void();
}
bool checkHighMemory(int64_t threshold, bool* error) {
#if defined(__linux__) && defined(USE_GPERFTOOLS) && !defined(VALGRIND)
*error = false;
uint64_t page_size = sysconf(_SC_PAGESIZE);
int fd = open("/proc/self/statm", O_RDONLY | O_CLOEXEC);
if (fd < 0) {
TraceEvent("OpenStatmFileFailure").log();
*error = true;
return false;
}
const int buf_sz = 256;
char stat_buf[buf_sz];
ssize_t stat_nread = read(fd, stat_buf, buf_sz);
if (stat_nread < 0) {
TraceEvent("ReadStatmFileFailure").log();
*error = true;
return false;
}
uint64_t vmsize, rss;
sscanf(stat_buf, "%lu %lu", &vmsize, &rss);
rss *= page_size;
if (rss >= threshold) {
return true;
}
#else
TraceEvent("CheckHighMemoryUnsupported").log();
*error = true;
#endif
return false;
}
// Runs heap profiler when RSS memory usage is high.
ACTOR Future<Void> monitorHighMemory(int64_t threshold) {
if (threshold <= 0)
return Void();
loop {
bool err = false;
bool highmem = checkHighMemory(threshold, &err);
if (err)
break;
if (highmem)
runHeapProfiler("Highmem heap dump");
wait(delay(SERVER_KNOBS->HEAP_PROFILER_INTERVAL));
}
return Void();
}
struct TrackRunningStorage {
UID self;
KeyValueStoreType storeType;
std::set<std::pair<UID, KeyValueStoreType>>* runningStorages;
TrackRunningStorage(UID self,
KeyValueStoreType storeType,
std::set<std::pair<UID, KeyValueStoreType>>* runningStorages)
: self(self), storeType(storeType), runningStorages(runningStorages) {
runningStorages->emplace(self, storeType);
}
~TrackRunningStorage() { runningStorages->erase(std::make_pair(self, storeType)); };
};
ACTOR Future<Void> storageServerRollbackRebooter(std::set<std::pair<UID, KeyValueStoreType>>* runningStorages,
Future<Void> prevStorageServer,
KeyValueStoreType storeType,
std::string filename,
UID id,
LocalityData locality,
bool isTss,
Reference<AsyncVar<ServerDBInfo> const> db,
std::string folder,
ActorCollection* filesClosed,
int64_t memoryLimit,
IKeyValueStore* store) {
state TrackRunningStorage _(id, storeType, runningStorages);
loop {
ErrorOr<Void> e = wait(errorOr(prevStorageServer));
if (!e.isError())
return Void();
else if (e.getError().code() != error_code_please_reboot)
throw e.getError();
TraceEvent("StorageServerRequestedReboot", id).log();
StorageServerInterface recruited;
recruited.uniqueID = id;
recruited.locality = locality;
recruited.tssPairID =
isTss ? Optional<UID>(UID()) : Optional<UID>(); // set this here since we use its presence to determine
// whether this server is a tss or not
recruited.initEndpoints();
DUMPTOKEN(recruited.getValue);
DUMPTOKEN(recruited.getKey);
DUMPTOKEN(recruited.getKeyValues);
DUMPTOKEN(recruited.getKeyValuesAndFlatMap);
DUMPTOKEN(recruited.getShardState);
DUMPTOKEN(recruited.waitMetrics);
DUMPTOKEN(recruited.splitMetrics);
DUMPTOKEN(recruited.getReadHotRanges);
DUMPTOKEN(recruited.getRangeSplitPoints);
DUMPTOKEN(recruited.getStorageMetrics);
DUMPTOKEN(recruited.waitFailure);
DUMPTOKEN(recruited.getQueuingMetrics);
DUMPTOKEN(recruited.getKeyValueStoreType);
DUMPTOKEN(recruited.watchValue);
DUMPTOKEN(recruited.getKeyValuesStream);
DUMPTOKEN(recruited.getKeyValuesAndFlatMap);
prevStorageServer =
storageServer(store, recruited, db, folder, Promise<Void>(), Reference<IClusterConnectionRecord>(nullptr));
prevStorageServer = handleIOErrors(prevStorageServer, store, id, store->onClosed());
}
}
ACTOR Future<Void> storageCacheRollbackRebooter(Future<Void> prevStorageCache,
UID id,
LocalityData locality,
Reference<AsyncVar<ServerDBInfo> const> db) {
loop {
ErrorOr<Void> e = wait(errorOr(prevStorageCache));
if (!e.isError()) {
TraceEvent("StorageCacheRequestedReboot1", id).log();
return Void();
} else if (e.getError().code() != error_code_please_reboot &&
e.getError().code() != error_code_worker_removed) {
TraceEvent("StorageCacheRequestedReboot2", id).detail("Code", e.getError().code());
throw e.getError();
}
TraceEvent("StorageCacheRequestedReboot", id).log();
StorageServerInterface recruited;
recruited.uniqueID = deterministicRandom()->randomUniqueID(); // id;
recruited.locality = locality;
recruited.initEndpoints();
DUMPTOKEN(recruited.getValue);
DUMPTOKEN(recruited.getKey);
DUMPTOKEN(recruited.getKeyValues);
DUMPTOKEN(recruited.getShardState);
DUMPTOKEN(recruited.waitMetrics);
DUMPTOKEN(recruited.splitMetrics);
DUMPTOKEN(recruited.getStorageMetrics);
DUMPTOKEN(recruited.waitFailure);
DUMPTOKEN(recruited.getQueuingMetrics);
DUMPTOKEN(recruited.getKeyValueStoreType);
DUMPTOKEN(recruited.watchValue);
prevStorageCache = storageCacheServer(recruited, 0, db);
}
}
// FIXME: This will not work correctly in simulation as all workers would share the same roles map
std::set<std::pair<std::string, std::string>> g_roles;
Standalone<StringRef> roleString(std::set<std::pair<std::string, std::string>> roles, bool with_ids) {
std::string result;
for (auto& r : roles) {
if (!result.empty())
result.append(",");
result.append(r.first);
if (with_ids) {
result.append(":");
result.append(r.second);
}
}
return StringRef(result);
}
void startRole(const Role& role,
UID roleId,
UID workerId,
const std::map<std::string, std::string>& details,
const std::string& origination) {
if (role.includeInTraceRoles) {
addTraceRole(role.abbreviation);
}
TraceEvent ev("Role", roleId);
ev.detail("As", role.roleName)
.detail("Transition", "Begin")
.detail("Origination", origination)
.detail("OnWorker", workerId);
for (auto it = details.begin(); it != details.end(); it++)
ev.detail(it->first.c_str(), it->second);
ev.trackLatest(roleId.shortString() + ".Role");
// Update roles map, log Roles metrics
g_roles.insert({ role.roleName, roleId.shortString() });
StringMetricHandle(LiteralStringRef("Roles")) = roleString(g_roles, false);
StringMetricHandle(LiteralStringRef("RolesWithIDs")) = roleString(g_roles, true);
if (g_network->isSimulated())
g_simulator.addRole(g_network->getLocalAddress(), role.roleName);
}
void endRole(const Role& role, UID id, std::string reason, bool ok, Error e) {
{
TraceEvent ev("Role", id);
if (e.code() != invalid_error_code)
ev.error(e, true);
ev.detail("Transition", "End").detail("As", role.roleName).detail("Reason", reason);
ev.trackLatest(id.shortString() + ".Role");
}
if (!ok) {
std::string type = role.roleName + "Failed";
TraceEvent err(SevError, type.c_str(), id);
if (e.code() != invalid_error_code) {
err.error(e, true);
}
err.detail("Reason", reason);
}
latestEventCache.clear(id.shortString());
// Update roles map, log Roles metrics
g_roles.erase({ role.roleName, id.shortString() });
StringMetricHandle(LiteralStringRef("Roles")) = roleString(g_roles, false);
StringMetricHandle(LiteralStringRef("RolesWithIDs")) = roleString(g_roles, true);
if (g_network->isSimulated())
g_simulator.removeRole(g_network->getLocalAddress(), role.roleName);
if (role.includeInTraceRoles) {
removeTraceRole(role.abbreviation);
}
}
ACTOR Future<Void> traceRole(Role role, UID roleId) {
loop {
wait(delay(SERVER_KNOBS->WORKER_LOGGING_INTERVAL));
TraceEvent("Role", roleId).detail("Transition", "Refresh").detail("As", role.roleName);
}
}
ACTOR Future<Void> workerSnapCreate(WorkerSnapRequest snapReq, Standalone<StringRef> snapFolder) {
state ExecCmdValueString snapArg(snapReq.snapPayload);
try {
int err = wait(execHelper(&snapArg, snapReq.snapUID, snapFolder.toString(), snapReq.role.toString()));
std::string uidStr = snapReq.snapUID.toString();
TraceEvent("ExecTraceWorker")
.detail("Uid", uidStr)
.detail("Status", err)
.detail("Role", snapReq.role)
.detail("Value", snapFolder)
.detail("ExecPayload", snapReq.snapPayload);
if (err != 0) {
throw operation_failed();
}
if (snapReq.role.toString() == "storage") {
printStorageVersionInfo();
}
snapReq.reply.send(Void());
} catch (Error& e) {
TraceEvent("ExecHelperError").error(e, true /*includeCancelled*/);
if (e.code() != error_code_operation_cancelled) {
snapReq.reply.sendError(e);
} else {
throw e;
}
}
return Void();
}
// TODO: `issues` is right now only updated by `monitorTraceLogIssues` and thus is being `set` on every update.
// It could be changed to `insert` and `trigger` later if we want to use it as a generic way for the caller of this
// function to report issues to cluster controller.
ACTOR Future<Void> monitorTraceLogIssues(Reference<AsyncVar<std::set<std::string>>> issues) {
state bool pingTimeout = false;
loop {
wait(delay(SERVER_KNOBS->TRACE_LOG_FLUSH_FAILURE_CHECK_INTERVAL_SECONDS));
Future<Void> pingAck = pingTraceLogWriterThread();
try {
wait(timeoutError(pingAck, SERVER_KNOBS->TRACE_LOG_PING_TIMEOUT_SECONDS));
} catch (Error& e) {
if (e.code() == error_code_timed_out) {
pingTimeout = true;
} else {
throw;
}
}
std::set<std::string> _issues;
retrieveTraceLogIssues(_issues);
if (pingTimeout) {
// Ping trace log writer thread timeout.
_issues.insert("trace_log_writer_thread_unresponsive");
pingTimeout = false;
}
issues->set(_issues);
}
}
class SharedLogsKey {
TLogVersion logVersion;
TLogSpillType spillType;
KeyValueStoreType storeType;
public:
SharedLogsKey(const TLogOptions& options, KeyValueStoreType kvst)
: logVersion(options.version), spillType(options.spillType), storeType(kvst) {
if (logVersion >= TLogVersion::V5)
spillType = TLogSpillType::UNSET;
}
bool operator<(const SharedLogsKey& other) const {
return std::tie(logVersion, spillType, storeType) <
std::tie(other.logVersion, other.spillType, other.storeType);
}
};
struct SharedLogsValue {
Future<Void> actor = Void();
UID uid = UID();
PromiseStream<InitializeTLogRequest> requests;
SharedLogsValue() = default;
SharedLogsValue(Future<Void> actor, UID uid, PromiseStream<InitializeTLogRequest> requests)
: actor(actor), uid(uid), requests(requests) {}
};
ACTOR Future<Void> chaosMetricsLogger() {
auto res = g_network->global(INetwork::enChaosMetrics);
if (!res)
return Void();
state ChaosMetrics* chaosMetrics = static_cast<ChaosMetrics*>(res);
chaosMetrics->clear();
loop {
wait(delay(FLOW_KNOBS->CHAOS_LOGGING_INTERVAL));
TraceEvent e("ChaosMetrics");
double elapsed = now() - chaosMetrics->startTime;
e.detail("Elapsed", elapsed);
chaosMetrics->getFields(&e);
e.trackLatest("ChaosMetrics");
chaosMetrics->clear();
}
}
ACTOR Future<Void> workerServer(Reference<IClusterConnectionRecord> connRecord,
Reference<AsyncVar<Optional<ClusterControllerFullInterface>> const> ccInterface,
LocalityData locality,
Reference<AsyncVar<ClusterControllerPriorityInfo>> asyncPriorityInfo,
ProcessClass initialClass,
std::string folder,
int64_t memoryLimit,
std::string metricsConnFile,
std::string metricsPrefix,
Promise<Void> recoveredDiskFiles,
int64_t memoryProfileThreshold,
std::string _coordFolder,
std::string whitelistBinPaths,
Reference<AsyncVar<ServerDBInfo>> dbInfo,
ConfigDBType configDBType,
Reference<LocalConfiguration> localConfig) {
state PromiseStream<ErrorInfo> errors;
state Reference<AsyncVar<Optional<DataDistributorInterface>>> ddInterf(
new AsyncVar<Optional<DataDistributorInterface>>());
state Reference<AsyncVar<Optional<RatekeeperInterface>>> rkInterf(new AsyncVar<Optional<RatekeeperInterface>>());
state Reference<AsyncVar<Optional<BlobManagerInterface>>> bmInterf(new AsyncVar<Optional<BlobManagerInterface>>());
state Future<Void> handleErrors = workerHandleErrors(errors.getFuture()); // Needs to be stopped last
state ActorCollection errorForwarders(false);
state Future<Void> loggingTrigger = Void();
state double loggingDelay = SERVER_KNOBS->WORKER_LOGGING_INTERVAL;
state ActorCollection filesClosed(true);
state Promise<Void> stopping;
state WorkerCache<InitializeStorageReply> storageCache;
state Future<Void> metricsLogger;
state Future<Void> chaosMetricsActor;
state Reference<AsyncVar<bool>> degraded = FlowTransport::transport().getDegraded();
// tLogFnForOptions() can return a function that doesn't correspond with the FDB version that the
// TLogVersion represents. This can be done if the newer TLog doesn't support a requested option.
// As (store type, spill type) can map to the same TLogFn across multiple TLogVersions, we need to
// decide if we should collapse them into the same SharedTLog instance as well. The answer
// here is no, so that when running with log_version==3, all files should say V=3.
state std::map<SharedLogsKey, SharedLogsValue> sharedLogs;
state Reference<AsyncVar<UID>> activeSharedTLog(new AsyncVar<UID>());
state WorkerCache<InitializeBackupReply> backupWorkerCache;
state std::string coordFolder = abspath(_coordFolder);
state WorkerInterface interf(locality);
state std::set<std::pair<UID, KeyValueStoreType>> runningStorages;
interf.initEndpoints();
state Reference<AsyncVar<std::set<std::string>>> issues(new AsyncVar<std::set<std::string>>());
if (FLOW_KNOBS->ENABLE_CHAOS_FEATURES) {
TraceEvent(SevInfo, "ChaosFeaturesEnabled");
chaosMetricsActor = chaosMetricsLogger();
}
folder = abspath(folder);
if (metricsPrefix.size() > 0) {
if (metricsConnFile.size() > 0) {
try {
state Database db =
Database::createDatabase(metricsConnFile, Database::API_VERSION_LATEST, IsInternal::True, locality);
metricsLogger = runMetrics(db, KeyRef(metricsPrefix));
} catch (Error& e) {
TraceEvent(SevWarnAlways, "TDMetricsBadClusterFile").error(e).detail("ConnFile", metricsConnFile);
}
} else {
auto lockAware = metricsPrefix.size() && metricsPrefix[0] == '\xff' ? LockAware::True : LockAware::False;
metricsLogger =
runMetrics(openDBOnServer(dbInfo, TaskPriority::DefaultEndpoint, lockAware), KeyRef(metricsPrefix));
}
GlobalConfig::globalConfig().trigger(samplingFrequency, samplingProfilerUpdateFrequency);
}
errorForwarders.add(resetAfter(degraded,
SERVER_KNOBS->DEGRADED_RESET_INTERVAL,
false,
SERVER_KNOBS->DEGRADED_WARNING_LIMIT,
SERVER_KNOBS->DEGRADED_WARNING_RESET_DELAY,
"DegradedReset"));
errorForwarders.add(loadedPonger(interf.debugPing.getFuture()));
errorForwarders.add(waitFailureServer(interf.waitFailure.getFuture()));
errorForwarders.add(monitorTraceLogIssues(issues));
errorForwarders.add(testerServerCore(interf.testerInterface, connRecord, dbInfo, locality));
errorForwarders.add(monitorHighMemory(memoryProfileThreshold));
filesClosed.add(stopping.getFuture());
initializeSystemMonitorMachineState(SystemMonitorMachineState(
folder, locality.dcId(), locality.zoneId(), locality.machineId(), g_network->getLocalAddress().ip));
{
auto recruited = interf;
DUMPTOKEN(recruited.clientInterface.reboot);
DUMPTOKEN(recruited.clientInterface.profiler);
DUMPTOKEN(recruited.tLog);
DUMPTOKEN(recruited.master);
DUMPTOKEN(recruited.commitProxy);
DUMPTOKEN(recruited.grvProxy);
DUMPTOKEN(recruited.resolver);
DUMPTOKEN(recruited.storage);
DUMPTOKEN(recruited.debugPing);
DUMPTOKEN(recruited.coordinationPing);
DUMPTOKEN(recruited.waitFailure);
DUMPTOKEN(recruited.setMetricsRate);
DUMPTOKEN(recruited.eventLogRequest);
DUMPTOKEN(recruited.traceBatchDumpRequest);
DUMPTOKEN(recruited.updateServerDBInfo);
}
state std::vector<Future<Void>> recoveries;
try {
std::vector<DiskStore> stores = getDiskStores(folder);
bool validateDataFiles = deleteFile(joinPath(folder, validationFilename));
for (int f = 0; f < stores.size(); f++) {
DiskStore s = stores[f];
// FIXME: Error handling
if (s.storedComponent == DiskStore::Storage) {
LocalLineage _;
getCurrentLineage()->modify(&RoleLineage::role) = ProcessClass::ClusterRole::Storage;
IKeyValueStore* kv =
openKVStore(s.storeType, s.filename, s.storeID, memoryLimit, false, validateDataFiles);
Future<Void> kvClosed = kv->onClosed();
filesClosed.add(kvClosed);
// std::string doesn't have startsWith
std::string tssPrefix = testingStoragePrefix.toString();
// TODO might be more efficient to mark a boolean on DiskStore in getDiskStores, but that kind of breaks
// the abstraction since DiskStore also applies to storage cache + tlog
bool isTss = s.filename.find(tssPrefix) != std::string::npos;
Role ssRole = isTss ? Role::TESTING_STORAGE_SERVER : Role::STORAGE_SERVER;
StorageServerInterface recruited;
recruited.uniqueID = s.storeID;
recruited.locality = locality;
recruited.tssPairID =
isTss ? Optional<UID>(UID())
: Optional<UID>(); // presence of optional is used as source of truth for tss vs not. Value
// gets overridden later in restoreDurableState
recruited.initEndpoints();
std::map<std::string, std::string> details;
details["StorageEngine"] = s.storeType.toString();
details["IsTSS"] = isTss ? "Yes" : "No";
startRole(ssRole, recruited.id(), interf.id(), details, "Restored");
DUMPTOKEN(recruited.getValue);
DUMPTOKEN(recruited.getKey);
DUMPTOKEN(recruited.getKeyValues);
DUMPTOKEN(recruited.getShardState);
DUMPTOKEN(recruited.waitMetrics);
DUMPTOKEN(recruited.splitMetrics);
DUMPTOKEN(recruited.getReadHotRanges);
DUMPTOKEN(recruited.getRangeSplitPoints);
DUMPTOKEN(recruited.getStorageMetrics);
DUMPTOKEN(recruited.waitFailure);
DUMPTOKEN(recruited.getQueuingMetrics);
DUMPTOKEN(recruited.getKeyValueStoreType);
DUMPTOKEN(recruited.watchValue);
DUMPTOKEN(recruited.getKeyValuesStream);
DUMPTOKEN(recruited.getKeyValuesAndFlatMap);
Promise<Void> recovery;
Future<Void> f = storageServer(kv, recruited, dbInfo, folder, recovery, connRecord);
recoveries.push_back(recovery.getFuture());
f = handleIOErrors(f, kv, s.storeID, kvClosed);
f = storageServerRollbackRebooter(&runningStorages,
f,
s.storeType,
s.filename,
recruited.id(),
recruited.locality,
isTss,
dbInfo,
folder,
&filesClosed,
memoryLimit,
kv);
errorForwarders.add(forwardError(errors, ssRole, recruited.id(), f));
} else if (s.storedComponent == DiskStore::TLogData) {
LocalLineage _;
getCurrentLineage()->modify(&RoleLineage::role) = ProcessClass::ClusterRole::TLog;
std::string logQueueBasename;
const std::string filename = basename(s.filename);
if (StringRef(filename).startsWith(fileLogDataPrefix)) {
logQueueBasename = fileLogQueuePrefix.toString();
} else {
StringRef optionsString = StringRef(filename).removePrefix(fileVersionedLogDataPrefix).eat("-");
logQueueBasename = fileLogQueuePrefix.toString() + optionsString.toString() + "-";
}
ASSERT_WE_THINK(abspath(parentDirectory(s.filename)) == folder);
IKeyValueStore* kv = openKVStore(s.storeType, s.filename, s.storeID, memoryLimit, validateDataFiles);
const DiskQueueVersion dqv =
s.tLogOptions.version >= TLogVersion::V3 ? DiskQueueVersion::V1 : DiskQueueVersion::V0;
const int64_t diskQueueWarnSize =
s.tLogOptions.spillType == TLogSpillType::VALUE ? 10 * SERVER_KNOBS->TARGET_BYTES_PER_TLOG : -1;
IDiskQueue* queue = openDiskQueue(joinPath(folder, logQueueBasename + s.storeID.toString() + "-"),
tlogQueueExtension.toString(),
s.storeID,
dqv,
diskQueueWarnSize);
filesClosed.add(kv->onClosed());
filesClosed.add(queue->onClosed());
std::map<std::string, std::string> details;
details["StorageEngine"] = s.storeType.toString();
startRole(Role::SHARED_TRANSACTION_LOG, s.storeID, interf.id(), details, "Restored");
Promise<Void> oldLog;
Promise<Void> recovery;
TLogFn tLogFn = tLogFnForOptions(s.tLogOptions);
auto& logData = sharedLogs[SharedLogsKey(s.tLogOptions, s.storeType)];
// FIXME: Shouldn't if logData.first isValid && !isReady, shouldn't we
// be sending a fake InitializeTLogRequest rather than calling tLog() ?
Future<Void> tl =
tLogFn(kv,
queue,
dbInfo,
locality,
!logData.actor.isValid() || logData.actor.isReady() ? logData.requests
: PromiseStream<InitializeTLogRequest>(),
s.storeID,
interf.id(),
true,
oldLog,
recovery,
folder,
degraded,
activeSharedTLog);
recoveries.push_back(recovery.getFuture());
activeSharedTLog->set(s.storeID);
tl = handleIOErrors(tl, kv, s.storeID);
tl = handleIOErrors(tl, queue, s.storeID);
if (!logData.actor.isValid() || logData.actor.isReady()) {
logData.actor = oldLog.getFuture() || tl;
logData.uid = s.storeID;
}
errorForwarders.add(forwardError(errors, Role::SHARED_TRANSACTION_LOG, s.storeID, tl));
}
}
bool hasCache = false;
// start cache role if we have the right process class
if (initialClass.classType() == ProcessClass::StorageCacheClass) {
hasCache = true;
StorageServerInterface recruited;
recruited.locality = locality;
recruited.initEndpoints();
std::map<std::string, std::string> details;
startRole(Role::STORAGE_CACHE, recruited.id(), interf.id(), details);
// DUMPTOKEN(recruited.getVersion);
DUMPTOKEN(recruited.getValue);
DUMPTOKEN(recruited.getKey);
DUMPTOKEN(recruited.getKeyValues);
DUMPTOKEN(recruited.getKeyValuesAndFlatMap);
DUMPTOKEN(recruited.getShardState);
DUMPTOKEN(recruited.waitMetrics);
DUMPTOKEN(recruited.splitMetrics);
DUMPTOKEN(recruited.getStorageMetrics);
DUMPTOKEN(recruited.waitFailure);
DUMPTOKEN(recruited.getQueuingMetrics);
DUMPTOKEN(recruited.getKeyValueStoreType);
DUMPTOKEN(recruited.watchValue);
auto f = storageCacheServer(recruited, 0, dbInfo);
f = storageCacheRollbackRebooter(f, recruited.id(), recruited.locality, dbInfo);
errorForwarders.add(forwardError(errors, Role::STORAGE_CACHE, recruited.id(), f));
}
std::map<std::string, std::string> details;
details["Locality"] = locality.toString();
details["DataFolder"] = folder;
details["StoresPresent"] = format("%d", stores.size());
details["CachePresent"] = hasCache ? "true" : "false";
startRole(Role::WORKER, interf.id(), interf.id(), details);
errorForwarders.add(traceRole(Role::WORKER, interf.id()));
wait(waitForAll(recoveries));
recoveredDiskFiles.send(Void());
errorForwarders.add(registrationClient(ccInterface,
interf,
asyncPriorityInfo,
initialClass,
ddInterf,
rkInterf,
bmInterf,
degraded,
connRecord,
issues,
localConfig,
dbInfo));
if (configDBType != ConfigDBType::DISABLED) {
errorForwarders.add(localConfig->consume(interf.configBroadcastInterface));
}
if (SERVER_KNOBS->ENABLE_WORKER_HEALTH_MONITOR) {
errorForwarders.add(healthMonitor(ccInterface, interf, locality, dbInfo));
}
TraceEvent("RecoveriesComplete", interf.id());
loop choose {
when(UpdateServerDBInfoRequest req = waitNext(interf.updateServerDBInfo.getFuture())) {
ServerDBInfo localInfo = BinaryReader::fromStringRef<ServerDBInfo>(
req.serializedDbInfo, AssumeVersion(g_network->protocolVersion()));
localInfo.myLocality = locality;
if (localInfo.infoGeneration < dbInfo->get().infoGeneration &&
localInfo.clusterInterface == dbInfo->get().clusterInterface) {
std::vector<Endpoint> rep = req.broadcastInfo;
rep.push_back(interf.updateServerDBInfo.getEndpoint());
req.reply.send(rep);
} else {
Optional<Endpoint> notUpdated;
if (!ccInterface->get().present() || localInfo.clusterInterface != ccInterface->get().get()) {
notUpdated = interf.updateServerDBInfo.getEndpoint();
} else if (localInfo.infoGeneration > dbInfo->get().infoGeneration ||
dbInfo->get().clusterInterface != ccInterface->get().get()) {
TraceEvent("GotServerDBInfoChange")
.detail("ChangeID", localInfo.id)
.detail("MasterID", localInfo.master.id())
.detail("RatekeeperID",
localInfo.ratekeeper.present() ? localInfo.ratekeeper.get().id() : UID())
.detail("DataDistributorID",
localInfo.distributor.present() ? localInfo.distributor.get().id() : UID())
.detail("BlobManagerID",
localInfo.blobManager.present() ? localInfo.blobManager.get().id() : UID());
dbInfo->set(localInfo);
}
errorForwarders.add(
success(broadcastDBInfoRequest(req, SERVER_KNOBS->DBINFO_SEND_AMOUNT, notUpdated, true)));
}
}
when(RebootRequest req = waitNext(interf.clientInterface.reboot.getFuture())) {
state RebootRequest rebootReq = req;
// If suspendDuration is INT_MAX, the trace will not be logged if it was inside the next block
// Also a useful trace to have even if suspendDuration is 0
TraceEvent("RebootRequestSuspendingProcess").detail("Duration", req.waitForDuration);
if (req.waitForDuration) {
flushTraceFileVoid();
setProfilingEnabled(0);
g_network->stop();
threadSleep(req.waitForDuration);
}
if (rebootReq.checkData) {
Reference<IAsyncFile> checkFile =
wait(IAsyncFileSystem::filesystem()->open(joinPath(folder, validationFilename),
IAsyncFile::OPEN_CREATE | IAsyncFile::OPEN_READWRITE,
0600));
wait(checkFile->sync());
}
if (g_network->isSimulated()) {
TraceEvent("SimulatedReboot").detail("Deletion", rebootReq.deleteData);
if (rebootReq.deleteData) {
throw please_reboot_delete();
}
throw please_reboot();
} else {
TraceEvent("ProcessReboot").log();
ASSERT(!rebootReq.deleteData);
flushAndExit(0);
}
}
when(SetFailureInjection req = waitNext(interf.clientInterface.setFailureInjection.getFuture())) {
if (FLOW_KNOBS->ENABLE_CHAOS_FEATURES) {
if (req.diskFailure.present()) {
auto diskFailureInjector = DiskFailureInjector::injector();
diskFailureInjector->setDiskFailure(req.diskFailure.get().stallInterval,
req.diskFailure.get().stallPeriod,
req.diskFailure.get().throttlePeriod);
} else if (req.flipBits.present()) {
auto bitFlipper = BitFlipper::flipper();
bitFlipper->setBitFlipPercentage(req.flipBits.get().percentBitFlips);
}
req.reply.send(Void());
} else {
req.reply.sendError(client_invalid_operation());
}
}
when(ProfilerRequest req = waitNext(interf.clientInterface.profiler.getFuture())) {
state ProfilerRequest profilerReq = req;
// There really isn't a great "filepath sanitizer" or "filepath escape" function available,
// thus we instead enforce a different requirement. One can only write to a file that's
// beneath the working directory, and we remove the ability to do any symlink or ../..
// tricks by resolving all paths through `abspath` first.
try {
std::string realLogDir = abspath(SERVER_KNOBS->LOG_DIRECTORY);
std::string realOutPath = abspath(realLogDir + "/" + profilerReq.outputFile.toString());
if (realLogDir.size() < realOutPath.size() &&
strncmp(realLogDir.c_str(), realOutPath.c_str(), realLogDir.size()) == 0) {
profilerReq.outputFile = realOutPath;
uncancellable(runProfiler(profilerReq));
profilerReq.reply.send(Void());
} else {
profilerReq.reply.sendError(client_invalid_operation());
}
} catch (Error& e) {
profilerReq.reply.sendError(e);
}
}
when(RecruitMasterRequest req = waitNext(interf.master.getFuture())) {
LocalLineage _;
getCurrentLineage()->modify(&RoleLineage::role) = ProcessClass::ClusterRole::Master;
MasterInterface recruited;
recruited.locality = locality;
recruited.initEndpoints();
startRole(Role::MASTER, recruited.id(), interf.id());
DUMPTOKEN(recruited.waitFailure);
DUMPTOKEN(recruited.getCommitVersion);
DUMPTOKEN(recruited.getLiveCommittedVersion);
DUMPTOKEN(recruited.reportLiveCommittedVersion);
DUMPTOKEN(recruited.updateRecoveryData);
// printf("Recruited as masterServer\n");
Future<Void> masterProcess = masterServer(
recruited, dbInfo, ccInterface, ServerCoordinators(connRecord), req.lifetime, req.forceRecovery);
errorForwarders.add(
zombie(recruited, forwardError(errors, Role::MASTER, recruited.id(), masterProcess)));
req.reply.send(recruited);
}
when(InitializeDataDistributorRequest req = waitNext(interf.dataDistributor.getFuture())) {
LocalLineage _;
getCurrentLineage()->modify(&RoleLineage::role) = ProcessClass::ClusterRole::DataDistributor;
DataDistributorInterface recruited(locality, req.reqId);
recruited.initEndpoints();
if (ddInterf->get().present()) {
recruited = ddInterf->get().get();
TEST(true); // Recruited while already a data distributor.
} else {
startRole(Role::DATA_DISTRIBUTOR, recruited.id(), interf.id());
DUMPTOKEN(recruited.waitFailure);
Future<Void> dataDistributorProcess = dataDistributor(recruited, dbInfo);
errorForwarders.add(forwardError(
errors,
Role::DATA_DISTRIBUTOR,
recruited.id(),
setWhenDoneOrError(dataDistributorProcess, ddInterf, Optional<DataDistributorInterface>())));
ddInterf->set(Optional<DataDistributorInterface>(recruited));
}
TraceEvent("DataDistributorReceived", req.reqId).detail("DataDistributorId", recruited.id());
req.reply.send(recruited);
}
when(InitializeRatekeeperRequest req = waitNext(interf.ratekeeper.getFuture())) {
LocalLineage _;
getCurrentLineage()->modify(&RoleLineage::role) = ProcessClass::ClusterRole::Ratekeeper;
RatekeeperInterface recruited(locality, req.reqId);
recruited.initEndpoints();
if (rkInterf->get().present()) {
recruited = rkInterf->get().get();
TEST(true); // Recruited while already a ratekeeper.
} else {
startRole(Role::RATEKEEPER, recruited.id(), interf.id());
DUMPTOKEN(recruited.waitFailure);
DUMPTOKEN(recruited.getRateInfo);
DUMPTOKEN(recruited.haltRatekeeper);
DUMPTOKEN(recruited.reportCommitCostEstimation);
Future<Void> ratekeeperProcess = ratekeeper(recruited, dbInfo);
errorForwarders.add(
forwardError(errors,
Role::RATEKEEPER,
recruited.id(),
setWhenDoneOrError(ratekeeperProcess, rkInterf, Optional<RatekeeperInterface>())));
rkInterf->set(Optional<RatekeeperInterface>(recruited));
}
TraceEvent("Ratekeeper_InitRequest", req.reqId).detail("RatekeeperId", recruited.id());
req.reply.send(recruited);
}
when(InitializeBlobManagerRequest req = waitNext(interf.blobManager.getFuture())) {
LocalLineage _;
getCurrentLineage()->modify(&RoleLineage::role) = ProcessClass::ClusterRole::BlobManager;
BlobManagerInterface recruited(locality, req.reqId);
recruited.initEndpoints();
if (bmInterf->get().present()) {
recruited = bmInterf->get().get();
TEST(true); // Recruited while already a blob manager.
} else {
startRole(Role::BLOB_MANAGER, recruited.id(), interf.id());
DUMPTOKEN(recruited.waitFailure);
DUMPTOKEN(recruited.haltBlobManager);
Future<Void> blobManagerProcess = blobManager(recruited, dbInfo, req.epoch);
errorForwarders.add(forwardError(
errors,
Role::BLOB_MANAGER,
recruited.id(),
setWhenDoneOrError(blobManagerProcess, bmInterf, Optional<BlobManagerInterface>())));
bmInterf->set(Optional<BlobManagerInterface>(recruited));
}
TraceEvent("BlobManagerReceived", req.reqId).detail("BlobManagerId", recruited.id());
req.reply.send(recruited);
}
when(InitializeBackupRequest req = waitNext(interf.backup.getFuture())) {
if (!backupWorkerCache.exists(req.reqId)) {
LocalLineage _;
getCurrentLineage()->modify(&RoleLineage::role) = ProcessClass::ClusterRole::Backup;
BackupInterface recruited(locality);
recruited.initEndpoints();
startRole(Role::BACKUP, recruited.id(), interf.id());
DUMPTOKEN(recruited.waitFailure);
ReplyPromise<InitializeBackupReply> backupReady = req.reply;
backupWorkerCache.set(req.reqId, backupReady.getFuture());
Future<Void> backupProcess = backupWorker(recruited, req, dbInfo);
backupProcess = storageCache.removeOnReady(req.reqId, backupProcess);
errorForwarders.add(forwardError(errors, Role::BACKUP, recruited.id(), backupProcess));
TraceEvent("BackupInitRequest", req.reqId).detail("BackupId", recruited.id());
InitializeBackupReply reply(recruited, req.backupEpoch);
backupReady.send(reply);
} else {
forwardPromise(req.reply, backupWorkerCache.get(req.reqId));
}
}
when(InitializeTLogRequest req = waitNext(interf.tLog.getFuture())) {
// For now, there's a one-to-one mapping of spill type to TLogVersion.
// With future work, a particular version of the TLog can support multiple
// different spilling strategies, at which point SpillType will need to be
// plumbed down into tLogFn.
if (req.logVersion < TLogVersion::MIN_RECRUITABLE) {
TraceEvent(SevError, "InitializeTLogInvalidLogVersion")
.detail("Version", req.logVersion)
.detail("MinRecruitable", TLogVersion::MIN_RECRUITABLE);
req.reply.sendError(internal_error());
}
LocalLineage _;
getCurrentLineage()->modify(&RoleLineage::role) = ProcessClass::ClusterRole::TLog;
TLogOptions tLogOptions(req.logVersion, req.spillType);
TLogFn tLogFn = tLogFnForOptions(tLogOptions);
auto& logData = sharedLogs[SharedLogsKey(tLogOptions, req.storeType)];
logData.requests.send(req);
if (!logData.actor.isValid() || logData.actor.isReady()) {
UID logId = deterministicRandom()->randomUniqueID();
std::map<std::string, std::string> details;
details["ForMaster"] = req.recruitmentID.shortString();
details["StorageEngine"] = req.storeType.toString();
// FIXME: start role for every tlog instance, rather that just for the shared actor, also use a
// different role type for the shared actor
startRole(Role::SHARED_TRANSACTION_LOG, logId, interf.id(), details);
const StringRef prefix =
req.logVersion > TLogVersion::V2 ? fileVersionedLogDataPrefix : fileLogDataPrefix;
std::string filename =
filenameFromId(req.storeType, folder, prefix.toString() + tLogOptions.toPrefix(), logId);
IKeyValueStore* data = openKVStore(req.storeType, filename, logId, memoryLimit);
const DiskQueueVersion dqv =
tLogOptions.version >= TLogVersion::V3 ? DiskQueueVersion::V1 : DiskQueueVersion::V0;
IDiskQueue* queue = openDiskQueue(
joinPath(folder,
fileLogQueuePrefix.toString() + tLogOptions.toPrefix() + logId.toString() + "-"),
tlogQueueExtension.toString(),
logId,
dqv);
filesClosed.add(data->onClosed());
filesClosed.add(queue->onClosed());
Future<Void> tLogCore = tLogFn(data,
queue,
dbInfo,
locality,
logData.requests,
logId,
interf.id(),
false,
Promise<Void>(),
Promise<Void>(),
folder,
degraded,
activeSharedTLog);
tLogCore = handleIOErrors(tLogCore, data, logId);
tLogCore = handleIOErrors(tLogCore, queue, logId);
errorForwarders.add(forwardError(errors, Role::SHARED_TRANSACTION_LOG, logId, tLogCore));
logData.actor = tLogCore;
logData.uid = logId;
}
activeSharedTLog->set(logData.uid);
}
when(InitializeStorageRequest req = waitNext(interf.storage.getFuture())) {
// We want to prevent double recruiting on a worker unless we try to recruit something
// with a different storage engine (otherwise storage migration won't work for certain
// configuration). Additionally we also need to allow double recruitment for seed servers.
// The reason for this is that a storage will only remove itself if after it was able
// to read the system key space. But if recovery fails right after a `configure new ...`
// was run it won't be able to do so.
if (!storageCache.exists(req.reqId) &&
(std::all_of(runningStorages.begin(),
runningStorages.end(),
[&req](const auto& p) { return p.second != req.storeType; }) ||
req.seedTag != invalidTag)) {
ASSERT(req.clusterId.isValid());
LocalLineage _;
getCurrentLineage()->modify(&RoleLineage::role) = ProcessClass::ClusterRole::Storage;
bool isTss = req.tssPairIDAndVersion.present();
StorageServerInterface recruited(req.interfaceId);
recruited.locality = locality;
recruited.tssPairID = isTss ? req.tssPairIDAndVersion.get().first : Optional<UID>();
recruited.initEndpoints();
std::map<std::string, std::string> details;
details["StorageEngine"] = req.storeType.toString();
details["IsTSS"] = std::to_string(isTss);
Role ssRole = isTss ? Role::TESTING_STORAGE_SERVER : Role::STORAGE_SERVER;
startRole(ssRole, recruited.id(), interf.id(), details);
DUMPTOKEN(recruited.getValue);
DUMPTOKEN(recruited.getKey);
DUMPTOKEN(recruited.getKeyValues);
DUMPTOKEN(recruited.getShardState);
DUMPTOKEN(recruited.waitMetrics);
DUMPTOKEN(recruited.splitMetrics);
DUMPTOKEN(recruited.getReadHotRanges);
DUMPTOKEN(recruited.getRangeSplitPoints);
DUMPTOKEN(recruited.getStorageMetrics);
DUMPTOKEN(recruited.waitFailure);
DUMPTOKEN(recruited.getQueuingMetrics);
DUMPTOKEN(recruited.getKeyValueStoreType);
DUMPTOKEN(recruited.watchValue);
DUMPTOKEN(recruited.getKeyValuesStream);
DUMPTOKEN(recruited.getKeyValuesAndFlatMap);
// printf("Recruited as storageServer\n");
std::string filename =
filenameFromId(req.storeType,
folder,
isTss ? testingStoragePrefix.toString() : fileStoragePrefix.toString(),
recruited.id());
IKeyValueStore* data = openKVStore(req.storeType, filename, recruited.id(), memoryLimit);
Future<Void> kvClosed = data->onClosed();
filesClosed.add(kvClosed);
ReplyPromise<InitializeStorageReply> storageReady = req.reply;
storageCache.set(req.reqId, storageReady.getFuture());
Future<Void> s = storageServer(data,
recruited,
req.seedTag,
req.clusterId,
isTss ? req.tssPairIDAndVersion.get().second : 0,
storageReady,
dbInfo,
folder);
s = handleIOErrors(s, data, recruited.id(), kvClosed);
s = storageCache.removeOnReady(req.reqId, s);
s = storageServerRollbackRebooter(&runningStorages,
s,
req.storeType,
filename,
recruited.id(),
recruited.locality,
isTss,
dbInfo,
folder,
&filesClosed,
memoryLimit,
data);
errorForwarders.add(forwardError(errors, ssRole, recruited.id(), s));
} else if (storageCache.exists(req.reqId)) {
forwardPromise(req.reply, storageCache.get(req.reqId));
} else {
TraceEvent("AttemptedDoubleRecruitement", interf.id()).detail("ForRole", "StorageServer");
errorForwarders.add(map(delay(0.5), [reply = req.reply](Void) {
reply.sendError(recruitment_failed());
return Void();
}));
}
}
when(InitializeBlobWorkerRequest req = waitNext(interf.blobWorker.getFuture())) {
BlobWorkerInterface recruited(locality, req.interfaceId);
recruited.initEndpoints();
startRole(Role::BLOB_WORKER, recruited.id(), interf.id());
ReplyPromise<InitializeBlobWorkerReply> blobWorkerReady = req.reply;
Future<Void> bw = blobWorker(recruited, blobWorkerReady, dbInfo);
errorForwarders.add(forwardError(errors, Role::BLOB_WORKER, recruited.id(), bw));
}
when(InitializeCommitProxyRequest req = waitNext(interf.commitProxy.getFuture())) {
LocalLineage _;
getCurrentLineage()->modify(&RoleLineage::role) = ProcessClass::ClusterRole::CommitProxy;
CommitProxyInterface recruited;
recruited.processId = locality.processId();
recruited.provisional = false;
recruited.initEndpoints();
std::map<std::string, std::string> details;
details["ForMaster"] = req.master.id().shortString();
startRole(Role::COMMIT_PROXY, recruited.id(), interf.id(), details);
DUMPTOKEN(recruited.commit);
DUMPTOKEN(recruited.getConsistentReadVersion);
DUMPTOKEN(recruited.getKeyServersLocations);
DUMPTOKEN(recruited.getStorageServerRejoinInfo);
DUMPTOKEN(recruited.waitFailure);
DUMPTOKEN(recruited.txnState);
// printf("Recruited as commitProxyServer\n");
errorForwarders.add(zombie(recruited,
forwardError(errors,
Role::COMMIT_PROXY,
recruited.id(),
commitProxyServer(recruited, req, dbInfo, whitelistBinPaths))));
req.reply.send(recruited);
}
when(InitializeGrvProxyRequest req = waitNext(interf.grvProxy.getFuture())) {
LocalLineage _;
getCurrentLineage()->modify(&RoleLineage::role) = ProcessClass::ClusterRole::GrvProxy;
GrvProxyInterface recruited;
recruited.processId = locality.processId();
recruited.provisional = false;
recruited.initEndpoints();
std::map<std::string, std::string> details;
details["ForMaster"] = req.master.id().shortString();
startRole(Role::GRV_PROXY, recruited.id(), interf.id(), details);
DUMPTOKEN(recruited.getConsistentReadVersion);
DUMPTOKEN(recruited.waitFailure);
DUMPTOKEN(recruited.getHealthMetrics);
// printf("Recruited as grvProxyServer\n");
errorForwarders.add(zombie(
recruited,
forwardError(errors, Role::GRV_PROXY, recruited.id(), grvProxyServer(recruited, req, dbInfo))));
req.reply.send(recruited);
}
when(InitializeResolverRequest req = waitNext(interf.resolver.getFuture())) {
LocalLineage _;
getCurrentLineage()->modify(&RoleLineage::role) = ProcessClass::ClusterRole::Resolver;
ResolverInterface recruited;
recruited.locality = locality;
recruited.initEndpoints();
std::map<std::string, std::string> details;
startRole(Role::RESOLVER, recruited.id(), interf.id(), details);
DUMPTOKEN(recruited.resolve);
DUMPTOKEN(recruited.metrics);
DUMPTOKEN(recruited.split);
DUMPTOKEN(recruited.waitFailure);
errorForwarders.add(zombie(
recruited, forwardError(errors, Role::RESOLVER, recruited.id(), resolver(recruited, req, dbInfo))));
req.reply.send(recruited);
}
when(InitializeLogRouterRequest req = waitNext(interf.logRouter.getFuture())) {
LocalLineage _;
getCurrentLineage()->modify(&RoleLineage::role) = ProcessClass::ClusterRole::LogRouter;
TLogInterface recruited(locality);
recruited.initEndpoints();
std::map<std::string, std::string> details;
startRole(Role::LOG_ROUTER, recruited.id(), interf.id(), details);
DUMPTOKEN(recruited.peekMessages);
DUMPTOKEN(recruited.peekStreamMessages);
DUMPTOKEN(recruited.popMessages);
DUMPTOKEN(recruited.commit);
DUMPTOKEN(recruited.lock);
DUMPTOKEN(recruited.getQueuingMetrics);
DUMPTOKEN(recruited.confirmRunning);
DUMPTOKEN(recruited.waitFailure);
DUMPTOKEN(recruited.recoveryFinished);
DUMPTOKEN(recruited.disablePopRequest);
DUMPTOKEN(recruited.enablePopRequest);
DUMPTOKEN(recruited.snapRequest);
errorForwarders.add(
zombie(recruited,
forwardError(errors, Role::LOG_ROUTER, recruited.id(), logRouter(recruited, req, dbInfo))));
req.reply.send(recruited);
}
when(CoordinationPingMessage m = waitNext(interf.coordinationPing.getFuture())) {
TraceEvent("CoordinationPing", interf.id())
.detail("CCID", m.clusterControllerId)
.detail("TimeStep", m.timeStep);
}
when(SetMetricsLogRateRequest req = waitNext(interf.setMetricsRate.getFuture())) {
TraceEvent("LoggingRateChange", interf.id())
.detail("OldDelay", loggingDelay)
.detail("NewLogPS", req.metricsLogsPerSecond);
if (req.metricsLogsPerSecond != 0) {
loggingDelay = 1.0 / req.metricsLogsPerSecond;
loggingTrigger = Void();
}
}
when(EventLogRequest req = waitNext(interf.eventLogRequest.getFuture())) {
TraceEventFields e;
if (req.getLastError)
e = latestEventCache.getLatestError();
else
e = latestEventCache.get(req.eventName.toString());
req.reply.send(e);
}
when(TraceBatchDumpRequest req = waitNext(interf.traceBatchDumpRequest.getFuture())) {
g_traceBatch.dump();
req.reply.send(Void());
}
when(DiskStoreRequest req = waitNext(interf.diskStoreRequest.getFuture())) {
Standalone<VectorRef<UID>> ids;
for (DiskStore d : getDiskStores(folder)) {
bool included = true;
if (!req.includePartialStores) {
if (d.storeType == KeyValueStoreType::SSD_BTREE_V1) {
included = fileExists(d.filename + ".fdb-wal");
} else if (d.storeType == KeyValueStoreType::SSD_BTREE_V2) {
included = fileExists(d.filename + ".sqlite-wal");
} else if (d.storeType == KeyValueStoreType::SSD_REDWOOD_V1) {
included = fileExists(d.filename + "0.pagerlog") && fileExists(d.filename + "1.pagerlog");
} else if (d.storeType == KeyValueStoreType::SSD_ROCKSDB_V1) {
included = fileExists(joinPath(d.filename, "CURRENT")) &&
fileExists(joinPath(d.filename, "IDENTITY"));
} else if (d.storeType == KeyValueStoreType::MEMORY) {
included = fileExists(d.filename + "1.fdq");
} else {
ASSERT(d.storeType == KeyValueStoreType::MEMORY_RADIXTREE);
included = fileExists(d.filename + "1.fdr");
}
if (d.storedComponent == DiskStore::COMPONENT::TLogData && included) {
included = false;
// The previous code assumed that d.filename is a filename. But that is not true.
// d.filename is a path. Removing a prefix and adding a new one just makes a broken
// directory name. So fileExists would always return false.
// Weirdly, this doesn't break anything, as tested by taking a clean check of FDB,
// setting included to false always, and then running correctness. So I'm just
// improving the situation by actually marking it as broken.
// FIXME: this whole thing
/*
std::string logDataBasename;
StringRef filename = d.filename;
if (filename.startsWith(fileLogDataPrefix)) {
logDataBasename = fileLogQueuePrefix.toString() +
d.filename.substr(fileLogDataPrefix.size()); } else { StringRef optionsString =
filename.removePrefix(fileVersionedLogDataPrefix).eat("-"); logDataBasename =
fileLogQueuePrefix.toString() + optionsString.toString() + "-";
}
TraceEvent("DiskStoreRequest").detail("FilenameBasename", logDataBasename);
if (fileExists(logDataBasename + "0.fdq") && fileExists(logDataBasename + "1.fdq")) {
included = true;
}
*/
}
}
if (included) {
ids.push_back(ids.arena(), d.storeID);
}
}
req.reply.send(ids);
}
when(wait(loggingTrigger)) {
systemMonitor();
loggingTrigger = delay(loggingDelay, TaskPriority::FlushTrace);
}
when(state WorkerSnapRequest snapReq = waitNext(interf.workerSnapReq.getFuture())) {
Standalone<StringRef> snapFolder = StringRef(folder);
if (snapReq.role.toString() == "coord") {
snapFolder = coordFolder;
}
errorForwarders.add(workerSnapCreate(snapReq, snapFolder));
}
when(wait(errorForwarders.getResult())) {}
when(wait(handleErrors)) {}
}
} catch (Error& err) {
// Make sure actors are cancelled before "recovery" promises are destructed.
for (auto f : recoveries)
f.cancel();
state Error e = err;
bool ok = e.code() == error_code_please_reboot || e.code() == error_code_actor_cancelled ||
e.code() == error_code_please_reboot_delete;
endRole(Role::WORKER, interf.id(), "WorkerError", ok, e);
errorForwarders.clear(false);
sharedLogs.clear();
if (e.code() !=
error_code_actor_cancelled) { // We get cancelled e.g. when an entire simulation times out, but in that case
// we won't be restarted and don't need to wait for shutdown
stopping.send(Void());
wait(filesClosed.getResult()); // Wait for complete shutdown of KV stores
wait(delay(0.0)); // Unwind the callstack to make sure that IAsyncFile references are all gone
TraceEvent(SevInfo, "WorkerShutdownComplete", interf.id());
}
throw e;
}
}
ACTOR Future<Void> extractClusterInterface(Reference<AsyncVar<Optional<ClusterControllerFullInterface>> const> in,
Reference<AsyncVar<Optional<ClusterInterface>>> out) {
loop {
if (in->get().present())
out->set(in->get().get().clientInterface);
else
out->set(Optional<ClusterInterface>());
wait(in->onChange());
}
}
static std::set<int> const& normalWorkerErrors() {
static std::set<int> s;
if (s.empty()) {
s.insert(error_code_please_reboot);
s.insert(error_code_please_reboot_delete);
}
return s;
}
ACTOR Future<Void> fileNotFoundToNever(Future<Void> f) {
try {
wait(f);
return Void();
} catch (Error& e) {
if (e.code() == error_code_file_not_found) {
TraceEvent(SevWarn, "ClusterCoordinatorFailed").error(e);
return Never();
}
throw;
}
}
ACTOR Future<Void> printTimeout() {
wait(delay(5));
if (!g_network->isSimulated()) {
fprintf(stderr, "Warning: FDBD has not joined the cluster after 5 seconds.\n");
fprintf(stderr, " Check configuration and availability using the 'status' command with the fdbcli\n");
}
return Void();
}
ACTOR Future<Void> printOnFirstConnected(Reference<AsyncVar<Optional<ClusterInterface>> const> ci) {
state Future<Void> timeoutFuture = printTimeout();
loop {
choose {
when(wait(ci->get().present() ? IFailureMonitor::failureMonitor().onStateEqual(
ci->get().get().openDatabase.getEndpoint(), FailureStatus(false))
: Never())) {
printf("FDBD joined cluster.\n");
TraceEvent("FDBDConnected").log();
return Void();
}
when(wait(ci->onChange())) {}
}
}
}
ClusterControllerPriorityInfo getCCPriorityInfo(std::string filePath, ProcessClass processClass) {
if (!fileExists(filePath))
return ClusterControllerPriorityInfo(ProcessClass(processClass.classType(), ProcessClass::CommandLineSource)
.machineClassFitness(ProcessClass::ClusterController),
false,
ClusterControllerPriorityInfo::FitnessUnknown);
std::string contents(readFileBytes(filePath, 1000));
BinaryReader br(StringRef(contents), IncludeVersion());
ClusterControllerPriorityInfo priorityInfo(
ProcessClass::UnsetFit, false, ClusterControllerPriorityInfo::FitnessUnknown);
br >> priorityInfo;
if (!br.empty()) {
if (g_network->isSimulated()) {
ASSERT(false);
} else {
TraceEvent(SevWarnAlways, "FitnessFileCorrupted").detail("filePath", filePath);
return ClusterControllerPriorityInfo(ProcessClass(processClass.classType(), ProcessClass::CommandLineSource)
.machineClassFitness(ProcessClass::ClusterController),
false,
ClusterControllerPriorityInfo::FitnessUnknown);
}
}
return priorityInfo;
}
ACTOR Future<Void> monitorAndWriteCCPriorityInfo(std::string filePath,
Reference<AsyncVar<ClusterControllerPriorityInfo>> asyncPriorityInfo) {
loop {
wait(asyncPriorityInfo->onChange());
std::string contents(BinaryWriter::toValue(asyncPriorityInfo->get(),
IncludeVersion(ProtocolVersion::withClusterControllerPriorityInfo()))
.toString());
atomicReplace(filePath, contents, false);
}
}
ACTOR Future<UID> createAndLockProcessIdFile(std::string folder) {
state UID processIDUid;
platform::createDirectory(folder);
loop {
try {
state std::string lockFilePath = joinPath(folder, "processId");
state ErrorOr<Reference<IAsyncFile>> lockFile = wait(errorOr(IAsyncFileSystem::filesystem(g_network)->open(
lockFilePath, IAsyncFile::OPEN_READWRITE | IAsyncFile::OPEN_LOCK, 0600)));
if (lockFile.isError() && lockFile.getError().code() == error_code_file_not_found &&
!fileExists(lockFilePath)) {
Reference<IAsyncFile> _lockFile = wait(IAsyncFileSystem::filesystem()->open(
lockFilePath,
IAsyncFile::OPEN_ATOMIC_WRITE_AND_CREATE | IAsyncFile::OPEN_CREATE | IAsyncFile::OPEN_LOCK |
IAsyncFile::OPEN_READWRITE,
0600));
lockFile = _lockFile;
processIDUid = deterministicRandom()->randomUniqueID();
BinaryWriter wr(IncludeVersion(ProtocolVersion::withProcessIDFile()));
wr << processIDUid;
wait(lockFile.get()->write(wr.getData(), wr.getLength(), 0));
wait(lockFile.get()->sync());
} else {
if (lockFile.isError())
throw lockFile.getError(); // If we've failed to open the file, throw an exception
int64_t fileSize = wait(lockFile.get()->size());
state Key fileData = makeString(fileSize);
wait(success(lockFile.get()->read(mutateString(fileData), fileSize, 0)));
try {
processIDUid = BinaryReader::fromStringRef<UID>(fileData, IncludeVersion());
return processIDUid;
} catch (Error& e) {
if (!g_network->isSimulated()) {
throw;
}
lockFile = ErrorOr<Reference<IAsyncFile>>();
wait(IAsyncFileSystem::filesystem()->deleteFile(lockFilePath, true));
}
}
} catch (Error& e) {
if (e.code() == error_code_actor_cancelled) {
throw;
}
if (!e.isInjectedFault()) {
fprintf(stderr,
"ERROR: error creating or opening process id file `%s'.\n",
joinPath(folder, "processId").c_str());
}
TraceEvent(SevError, "OpenProcessIdError").error(e);
throw;
}
}
}
ACTOR Future<MonitorLeaderInfo> monitorLeaderWithDelayedCandidacyImplOneGeneration(
Reference<IClusterConnectionRecord> connRecord,
Reference<AsyncVar<Value>> result,
MonitorLeaderInfo info) {
state ClusterConnectionString ccf = info.intermediateConnRecord->getConnectionString();
state std::vector<NetworkAddress> addrs = ccf.coordinators();
state ElectionResultRequest request;
state int index = 0;
state int successIndex = 0;
request.key = ccf.clusterKey();
request.coordinators = ccf.coordinators();
deterministicRandom()->randomShuffle(addrs);
loop {
LeaderElectionRegInterface interf(addrs[index]);
request.reply = ReplyPromise<Optional<LeaderInfo>>();
ErrorOr<Optional<LeaderInfo>> leader = wait(interf.electionResult.tryGetReply(request));
if (leader.present()) {
if (leader.get().present()) {
if (leader.get().get().forward) {
info.intermediateConnRecord = connRecord->makeIntermediateRecord(
ClusterConnectionString(leader.get().get().serializedInfo.toString()));
return info;
}
if (connRecord != info.intermediateConnRecord) {
if (!info.hasConnected) {
TraceEvent(SevWarnAlways, "IncorrectClusterFileContentsAtConnection")
.detail("ClusterFile", connRecord->toString())
.detail("StoredConnectionString", connRecord->getConnectionString().toString())
.detail("CurrentConnectionString",
info.intermediateConnRecord->getConnectionString().toString());
}
connRecord->setConnectionString(info.intermediateConnRecord->getConnectionString());
info.intermediateConnRecord = connRecord;
}
info.hasConnected = true;
connRecord->notifyConnected();
request.knownLeader = leader.get().get().changeID;
ClusterControllerPriorityInfo info = leader.get().get().getPriorityInfo();
if (leader.get().get().serializedInfo.size() && !info.isExcluded &&
(info.dcFitness == ClusterControllerPriorityInfo::FitnessPrimary ||
info.dcFitness == ClusterControllerPriorityInfo::FitnessPreferred ||
info.dcFitness == ClusterControllerPriorityInfo::FitnessUnknown)) {
result->set(leader.get().get().serializedInfo);
} else {
result->set(Value());
}
}
successIndex = index;
} else {
index = (index + 1) % addrs.size();
if (index == successIndex) {
wait(delay(CLIENT_KNOBS->COORDINATOR_RECONNECTION_DELAY));
}
}
}
}
ACTOR Future<Void> monitorLeaderWithDelayedCandidacyImplInternal(Reference<IClusterConnectionRecord> connRecord,
Reference<AsyncVar<Value>> outSerializedLeaderInfo) {
state MonitorLeaderInfo info(connRecord);
loop {
MonitorLeaderInfo _info =
wait(monitorLeaderWithDelayedCandidacyImplOneGeneration(connRecord, outSerializedLeaderInfo, info));
info = _info;
}
}
template <class LeaderInterface>
Future<Void> monitorLeaderWithDelayedCandidacyImpl(
Reference<IClusterConnectionRecord> const& connRecord,
Reference<AsyncVar<Optional<LeaderInterface>>> const& outKnownLeader) {
LeaderDeserializer<LeaderInterface> deserializer;
auto serializedInfo = makeReference<AsyncVar<Value>>();
Future<Void> m = monitorLeaderWithDelayedCandidacyImplInternal(connRecord, serializedInfo);
return m || deserializer(serializedInfo, outKnownLeader);
}
ACTOR Future<Void> monitorLeaderWithDelayedCandidacy(
Reference<IClusterConnectionRecord> connRecord,
Reference<AsyncVar<Optional<ClusterControllerFullInterface>>> currentCC,
Reference<AsyncVar<ClusterControllerPriorityInfo>> asyncPriorityInfo,
Future<Void> recoveredDiskFiles,
LocalityData locality,
Reference<AsyncVar<ServerDBInfo>> dbInfo,
ConfigDBType configDBType) {
state Future<Void> monitor = monitorLeaderWithDelayedCandidacyImpl(connRecord, currentCC);
state Future<Void> timeout;
wait(recoveredDiskFiles);
loop {
if (currentCC->get().present() && dbInfo->get().clusterInterface == currentCC->get().get() &&
IFailureMonitor::failureMonitor()
.getState(currentCC->get().get().registerWorker.getEndpoint())
.isAvailable()) {
timeout = Future<Void>();
} else if (!timeout.isValid()) {
timeout =
delay(SERVER_KNOBS->MIN_DELAY_CC_WORST_FIT_CANDIDACY_SECONDS +
(deterministicRandom()->random01() * (SERVER_KNOBS->MAX_DELAY_CC_WORST_FIT_CANDIDACY_SECONDS -
SERVER_KNOBS->MIN_DELAY_CC_WORST_FIT_CANDIDACY_SECONDS)));
}
choose {
when(wait(currentCC->onChange())) {}
when(wait(dbInfo->onChange())) {}
when(wait(currentCC->get().present() ? IFailureMonitor::failureMonitor().onStateChanged(
currentCC->get().get().registerWorker.getEndpoint())
: Never())) {}
when(wait(timeout.isValid() ? timeout : Never())) {
monitor.cancel();
wait(clusterController(
connRecord, currentCC, asyncPriorityInfo, recoveredDiskFiles, locality, configDBType));
return Void();
}
}
}
}
extern void setupStackSignal();
ACTOR Future<Void> serveProtocolInfo() {
state RequestStream<ProtocolInfoRequest> protocolInfo(
PeerCompatibilityPolicy{ RequirePeer::AtLeast, ProtocolVersion::withStableInterfaces() });
protocolInfo.makeWellKnownEndpoint(WLTOKEN_PROTOCOL_INFO, TaskPriority::DefaultEndpoint);
loop {
state ProtocolInfoRequest req = waitNext(protocolInfo.getFuture());
req.reply.send(ProtocolInfoReply{ g_network->protocolVersion() });
}
}
// Handles requests from ProcessInterface, an interface meant for direct
// communication between the client and FDB processes.
ACTOR Future<Void> serveProcess() {
state ProcessInterface process;
process.getInterface.makeWellKnownEndpoint(WLTOKEN_PROCESS, TaskPriority::DefaultEndpoint);
loop {
choose {
when(GetProcessInterfaceRequest req = waitNext(process.getInterface.getFuture())) {
req.reply.send(process);
}
when(ActorLineageRequest req = waitNext(process.actorLineage.getFuture())) {
state SampleCollection sampleCollector;
auto samples = sampleCollector->get(req.timeStart, req.timeEnd);
std::vector<SerializedSample> serializedSamples;
for (const auto& samplePtr : samples) {
auto serialized = SerializedSample{ .time = samplePtr->time };
for (const auto& [waitState, pair] : samplePtr->data) {
if (waitState >= req.waitStateStart && waitState <= req.waitStateEnd) {
serialized.data[waitState] = std::string(pair.first, pair.second);
}
}
serializedSamples.push_back(std::move(serialized));
}
ActorLineageReply reply{ serializedSamples };
req.reply.send(reply);
}
}
}
}
ACTOR Future<Void> fdbd(Reference<IClusterConnectionRecord> connRecord,
LocalityData localities,
ProcessClass processClass,
std::string dataFolder,
std::string coordFolder,
int64_t memoryLimit,
std::string metricsConnFile,
std::string metricsPrefix,
int64_t memoryProfileThreshold,
std::string whitelistBinPaths,
std::string configPath,
std::map<std::string, std::string> manualKnobOverrides,
ConfigDBType configDBType) {
state std::vector<Future<Void>> actors;
state Promise<Void> recoveredDiskFiles;
state Reference<LocalConfiguration> localConfig =
makeReference<LocalConfiguration>(dataFolder, configPath, manualKnobOverrides);
// setupStackSignal();
getCurrentLineage()->modify(&RoleLineage::role) = ProcessClass::Worker;
// FIXME: Initializing here causes simulation issues, these must be fixed
/*
if (configDBType != ConfigDBType::DISABLED) {
wait(localConfig->initialize());
}
*/
actors.push_back(serveProtocolInfo());
actors.push_back(serveProcess());
try {
ServerCoordinators coordinators(connRecord);
if (g_network->isSimulated()) {
whitelistBinPaths = ",, random_path, /bin/snap_create.sh,,";
}
TraceEvent("StartingFDBD")
.detail("ZoneID", localities.zoneId())
.detail("MachineId", localities.machineId())
.detail("DiskPath", dataFolder)
.detail("CoordPath", coordFolder)
.detail("WhiteListBinPath", whitelistBinPaths);
// SOMEDAY: start the services on the machine in a staggered fashion in simulation?
// Endpoints should be registered first before any process trying to connect to it.
// So coordinationServer actor should be the first one executed before any other.
if (coordFolder.size()) {
// SOMEDAY: remove the fileNotFound wrapper and make DiskQueue construction safe from errors setting up
// their files
actors.push_back(fileNotFoundToNever(coordinationServer(coordFolder, coordinators.ccr, configDBType)));
}
state UID processIDUid = wait(createAndLockProcessIdFile(dataFolder));
localities.set(LocalityData::keyProcessId, processIDUid.toString());
// Only one process can execute on a dataFolder from this point onwards
std::string fitnessFilePath = joinPath(dataFolder, "fitness");
auto cc = makeReference<AsyncVar<Optional<ClusterControllerFullInterface>>>();
auto ci = makeReference<AsyncVar<Optional<ClusterInterface>>>();
auto asyncPriorityInfo =
makeReference<AsyncVar<ClusterControllerPriorityInfo>>(getCCPriorityInfo(fitnessFilePath, processClass));
auto dbInfo = makeReference<AsyncVar<ServerDBInfo>>();
actors.push_back(reportErrors(monitorAndWriteCCPriorityInfo(fitnessFilePath, asyncPriorityInfo),
"MonitorAndWriteCCPriorityInfo"));
if (processClass.machineClassFitness(ProcessClass::ClusterController) == ProcessClass::NeverAssign) {
actors.push_back(reportErrors(monitorLeader(connRecord, cc), "ClusterController"));
} else if (processClass.machineClassFitness(ProcessClass::ClusterController) == ProcessClass::WorstFit &&
SERVER_KNOBS->MAX_DELAY_CC_WORST_FIT_CANDIDACY_SECONDS > 0) {
actors.push_back(reportErrors(monitorLeaderWithDelayedCandidacy(connRecord,
cc,
asyncPriorityInfo,
recoveredDiskFiles.getFuture(),
localities,
dbInfo,
configDBType),
"ClusterController"));
} else {
actors.push_back(reportErrors(
clusterController(
connRecord, cc, asyncPriorityInfo, recoveredDiskFiles.getFuture(), localities, configDBType),
"ClusterController"));
}
actors.push_back(reportErrors(extractClusterInterface(cc, ci), "ExtractClusterInterface"));
actors.push_back(reportErrorsExcept(workerServer(connRecord,
cc,
localities,
asyncPriorityInfo,
processClass,
dataFolder,
memoryLimit,
metricsConnFile,
metricsPrefix,
recoveredDiskFiles,
memoryProfileThreshold,
coordFolder,
whitelistBinPaths,
dbInfo,
configDBType,
localConfig),
"WorkerServer",
UID(),
&normalWorkerErrors()));
state Future<Void> firstConnect = reportErrors(printOnFirstConnected(ci), "ClusterFirstConnectedError");
wait(quorum(actors, 1));
ASSERT(false); // None of these actors should terminate normally
throw internal_error();
} catch (Error& e) {
// Make sure actors are cancelled before recoveredDiskFiles is destructed.
// Otherwise, these actors may get a broken promise error.
for (auto f : actors)
f.cancel();
Error err = checkIOTimeout(e);
throw err;
}
}
const Role Role::WORKER("Worker", "WK", false);
const Role Role::STORAGE_SERVER("StorageServer", "SS");
const Role Role::TESTING_STORAGE_SERVER("TestingStorageServer", "ST");
const Role Role::TRANSACTION_LOG("TLog", "TL");
const Role Role::SHARED_TRANSACTION_LOG("SharedTLog", "SL", false);
const Role Role::COMMIT_PROXY("CommitProxyServer", "CP");
const Role Role::GRV_PROXY("GrvProxyServer", "GP");
const Role Role::MASTER("MasterServer", "MS");
const Role Role::RESOLVER("Resolver", "RV");
const Role Role::CLUSTER_CONTROLLER("ClusterController", "CC");
const Role Role::TESTER("Tester", "TS");
const Role Role::LOG_ROUTER("LogRouter", "LR");
const Role Role::DATA_DISTRIBUTOR("DataDistributor", "DD");
const Role Role::RATEKEEPER("Ratekeeper", "RK");
const Role Role::BLOB_MANAGER("BlobManager", "BM");
const Role Role::BLOB_WORKER("BlobWorker", "BW");
const Role Role::STORAGE_CACHE("StorageCache", "SC");
const Role Role::COORDINATOR("Coordinator", "CD");
const Role Role::BACKUP("Backup", "BK");