/*
* RestoreLoader.actor.cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// This file implements the functions and actors used by the RestoreLoader role.
// The RestoreLoader role starts with the restoreLoaderCore actor
#include "flow/UnitTest.h"
#include "fdbclient/BackupContainer.h"
#include "fdbclient/BackupAgent.actor.h"
#include "fdbserver/EncryptedMutationMessage.h"
#include "fdbserver/RestoreLoader.actor.h"
#include "fdbserver/RestoreRoleCommon.actor.h"
#include "fdbserver/MutationTracking.h"
#include "fdbserver/StorageMetrics.h"
#include "flow/actorcompiler.h" // This must be the last #include.
// SerializedMutationListMap: Buffered mutation lists from data blocks in log files
// Key is the signature/version of the mutation list; Value.first is the mutation list, which may come from multiple
// data blocks of a log file; Value.second is the largest part number of the mutation list, which is used to sanity
// check that the data blocks for the same mutation list are concatenated in increasing order of part number.
typedef std::map<Standalone<StringRef>, std::pair<Standalone<StringRef>, uint32_t>> SerializedMutationListMap;
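// Illustrative (hypothetical) contents after processing part 0 and part 1 of the mutation list committed at
// version V:
//   key   = StringRef((uint8_t*)&V, sizeof(Version))   // raw bytes of the commit version, used as the id
//   value = { <part 0 blob> + <part 1 blob>, 1 }        // concatenated value blobs and the largest part seen
// See concatenateBackupMutationForLogFile() below for how these entries are built.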
std::vector<UID> getApplierIDs(std::map<Key, UID>& rangeToApplier);
void splitMutation(const KeyRangeMap<UID>& krMap,
MutationRef m,
Arena& mvector_arena,
VectorRef<MutationRef>& mvector,
Arena& nodeIDs_arena,
VectorRef<UID>& nodeIDs);
void _parseSerializedMutation(KeyRangeMap<Version>* pRangeVersions,
std::map<LoadingParam, VersionedMutationsMap>::iterator kvOpsIter,
SerializedMutationListMap* mutationMap,
std::map<LoadingParam, SampledMutationsVec>::iterator samplesIter,
LoaderCounters* cc,
const RestoreAsset& asset);
void handleRestoreSysInfoRequest(const RestoreSysInfoRequest& req, Reference<RestoreLoaderData> self);
ACTOR Future<Void> handleLoadFileRequest(RestoreLoadFileRequest req, Reference<RestoreLoaderData> self);
ACTOR Future<Void> handleSendMutationsRequest(RestoreSendMutationsToAppliersRequest req,
Reference<RestoreLoaderData> self);
ACTOR Future<Void> sendMutationsToApplier(
std::priority_queue<RestoreLoaderSchedSendLoadParamRequest>* sendLoadParamQueue,
std::map<int, int>* inflightSendLoadParamReqs,
NotifiedVersion* finishedBatch,
VersionedMutationsMap* pkvOps,
int batchIndex,
RestoreAsset asset,
bool isRangeFile,
std::map<Key, UID>* pRangeToApplier,
std::map<UID, RestoreApplierInterface>* pApplierInterfaces);
ACTOR static Future<Void> _parseLogFileToMutationsOnLoader(NotifiedVersion* pProcessedFileOffset,
SerializedMutationListMap* mutationMap,
Reference<IBackupContainer> bc,
RestoreAsset asset);
ACTOR static Future<Void> parseLogFileToMutationsOnLoader(NotifiedVersion* pProcessedFileOffset,
SerializedMutationListMap* mutationMap,
Reference<IBackupContainer> bc,
RestoreAsset asset);
ACTOR static Future<Void> _parseRangeFileToMutationsOnLoader(
std::map<LoadingParam, VersionedMutationsMap>::iterator kvOpsIter,
std::map<LoadingParam, SampledMutationsVec>::iterator samplesIter,
LoaderCounters* cc,
Reference<IBackupContainer> bc,
Version version,
RestoreAsset asset);
ACTOR Future<Void> handleFinishVersionBatchRequest(RestoreVersionBatchRequest req, Reference<RestoreLoaderData> self);
// Dispatch requests based on the node's busyness (i.e., CPU usage for now) and the requests' priorities.
// Requests for earlier version batches are preferred, which means
// sendMutations requests are preferred over loadFile requests.
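// In outline (a summary of the loop below, not a normative spec): each iteration (1) dispatches at most one
// sendMutations request, and only if it is for the next version batch or CPU usage is low; (2) backs off when
// inflight send/load requests or CPU usage exceed their knob thresholds; (3) releases queued per-LoadingParam send
// requests, current version batch first and a bounded number for future batches; (4) dispatches up to
// FASTRESTORE_SCHED_LOAD_REQ_BATCHSIZE load-file requests; and (5) waits for new requests when all queues are empty.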
ACTOR Future<Void> dispatchRequests(Reference<RestoreLoaderData> self) {
try {
state int curVBInflightReqs = 0;
state int sendLoadParams = 0;
state int lastLoadReqs = 0;
loop {
TraceEvent(SevDebug, "FastRestoreLoaderDispatchRequests", self->id())
.detail("SendingQueue", self->sendingQueue.size())
.detail("LoadingQueue", self->loadingQueue.size())
.detail("SendingLoadParamQueue", self->sendLoadParamQueue.size())
.detail("InflightSendingReqs", self->inflightSendingReqs)
.detail("InflightSendingReqsThreshold", SERVER_KNOBS->FASTRESTORE_SCHED_INFLIGHT_SEND_REQS)
.detail("InflightLoadingReqs", self->inflightLoadingReqs)
.detail("InflightLoadingReqsThreshold", SERVER_KNOBS->FASTRESTORE_SCHED_INFLIGHT_LOAD_REQS)
.detail("LastLoadFileRequests", lastLoadReqs)
.detail("LoadFileRequestsBatchThreshold", SERVER_KNOBS->FASTRESTORE_SCHED_LOAD_REQ_BATCHSIZE)
.detail("LastDispatchSendLoadParamReqsForCurrentVB", curVBInflightReqs)
.detail("LastDispatchSendLoadParamReqsForFutureVB", sendLoadParams)
.detail("CpuUsage", self->cpuUsage)
.detail("TargetCpuUsage", SERVER_KNOBS->FASTRESTORE_SCHED_TARGET_CPU_PERCENT)
.detail("MaxCpuUsage", SERVER_KNOBS->FASTRESTORE_SCHED_MAX_CPU_PERCENT);
// TODO: Pop old requests whose version batch <= finishedBatch.get()
// TODO2: Simulate that a delayed request can become too old by introducing an artificial delay
if (SERVER_KNOBS->FASTRESTORE_EXPENSIVE_VALIDATION) {
// Sanity check: All requests before and in finishedBatch must have been processed; otherwise,
// those requests may cause a segmentation fault after the applier removes the batch data
if (!self->loadingQueue.empty() && self->loadingQueue.top().batchIndex <= self->finishedBatch.get()) {
// Still has pending requests from an earlier or the current batchIndex, which should not
// happen
TraceEvent(SevError, "FastRestoreLoaderSchedulerHasOldLoadFileRequests")
.detail("FinishedBatchIndex", self->finishedBatch.get())
.detail("PendingRequest", self->loadingQueue.top().toString());
}
if (!self->sendingQueue.empty() && self->sendingQueue.top().batchIndex <= self->finishedBatch.get()) {
TraceEvent(SevError, "FastRestoreLoaderSchedulerHasOldSendRequests")
.detail("FinishedBatchIndex", self->finishedBatch.get())
.detail("PendingRequest", self->sendingQueue.top().toString());
}
if (!self->sendLoadParamQueue.empty() &&
self->sendLoadParamQueue.top().batchIndex <= self->finishedBatch.get()) {
TraceEvent(SevError, "FastRestoreLoaderSchedulerHasOldSendLoadParamRequests")
.detail("FinishedBatchIndex", self->finishedBatch.get())
.detail("PendingRequest", self->sendLoadParamQueue.top().toString());
}
}
if (!self->sendingQueue.empty()) {
// Only release one sendMutations request at a time because it sends all data for a version batch
// and takes a large amount of resources
const RestoreSendMutationsToAppliersRequest& req = self->sendingQueue.top();
// Dispatch the request if it is the next version batch to process or if cpu usage is low
if (req.batchIndex - 1 == self->finishedSendingVB ||
self->cpuUsage < SERVER_KNOBS->FASTRESTORE_SCHED_TARGET_CPU_PERCENT) {
self->addActor.send(handleSendMutationsRequest(req, self));
self->sendingQueue.pop();
}
}
// Decide when the node should pause processing other requests, e.g., load file requests
// TODO: Revisit if we should have (self->inflightSendingReqs > 0 && self->inflightLoadingReqs > 0)
if ((self->inflightSendingReqs > 0 && self->inflightLoadingReqs > 0) &&
(self->inflightSendingReqs >= SERVER_KNOBS->FASTRESTORE_SCHED_INFLIGHT_SEND_REQS ||
self->inflightLoadingReqs >= SERVER_KNOBS->FASTRESTORE_SCHED_INFLIGHT_LOAD_REQS ||
(self->inflightSendingReqs >= 1 &&
self->cpuUsage >= SERVER_KNOBS->FASTRESTORE_SCHED_TARGET_CPU_PERCENT) ||
self->cpuUsage >= SERVER_KNOBS->FASTRESTORE_SCHED_MAX_CPU_PERCENT)) {
if (self->inflightSendingReqs >= SERVER_KNOBS->FASTRESTORE_SCHED_INFLIGHT_SEND_REQS) {
TraceEvent(SevWarn, "FastRestoreLoaderTooManyInflightRequests")
.detail("VersionBatchesBlockedAtSendingMutationsToAppliers", self->inflightSendingReqs)
.detail("CpuUsage", self->cpuUsage)
.detail("InflightSendingReq", self->inflightSendingReqs)
.detail("InflightSendingReqThreshold", SERVER_KNOBS->FASTRESTORE_SCHED_INFLIGHT_SEND_REQS)
.detail("InflightLoadingReq", self->inflightLoadingReqs)
.detail("InflightLoadingReqThreshold", SERVER_KNOBS->FASTRESTORE_SCHED_INFLIGHT_LOAD_REQS);
}
wait(delay(SERVER_KNOBS->FASTRESTORE_SCHED_UPDATE_DELAY));
updateProcessStats(self);
continue;
}
// Dispatch queued requests of sending mutations per loading param
while (!self->sendLoadParamQueue.empty()) { // dispatch current VB first
const RestoreLoaderSchedSendLoadParamRequest& req = self->sendLoadParamQueue.top();
if (req.batchIndex - 1 > self->finishedSendingVB) { // future VB
break;
} else {
req.toSched.send(Void());
self->sendLoadParamQueue.pop();
}
}
sendLoadParams = 0;
curVBInflightReqs = self->inflightSendLoadParamReqs[self->finishedSendingVB + 1];
while (!self->sendLoadParamQueue.empty()) {
const RestoreLoaderSchedSendLoadParamRequest& req = self->sendLoadParamQueue.top();
if (curVBInflightReqs >= SERVER_KNOBS->FASTRESTORE_SCHED_INFLIGHT_SENDPARAM_THRESHOLD ||
sendLoadParams >= SERVER_KNOBS->FASTRESTORE_SCHED_SEND_FUTURE_VB_REQS_BATCH) {
// Too many future VB requests are released
break;
} else {
req.toSched.send(Void());
self->sendLoadParamQueue.pop();
sendLoadParams++;
}
}
// Dispatch loading backup file requests
lastLoadReqs = 0;
while (!self->loadingQueue.empty()) {
if (lastLoadReqs >= SERVER_KNOBS->FASTRESTORE_SCHED_LOAD_REQ_BATCHSIZE) {
break;
}
const RestoreLoadFileRequest& req = self->loadingQueue.top();
if (req.batchIndex <= self->finishedBatch.get()) {
TraceEvent(SevError, "FastRestoreLoaderDispatchRestoreLoadFileRequestTooOld")
.detail("FinishedBatchIndex", self->finishedBatch.get())
.detail("RequestBatchIndex", req.batchIndex);
req.reply.send(RestoreLoadFileReply(req.param, true));
self->loadingQueue.pop();
ASSERT(false); // Assert so we can easily check if this ever happens
} else {
self->addActor.send(handleLoadFileRequest(req, self));
self->loadingQueue.pop();
lastLoadReqs++;
}
}
if (self->cpuUsage >= SERVER_KNOBS->FASTRESTORE_SCHED_TARGET_CPU_PERCENT) {
wait(delay(SERVER_KNOBS->FASTRESTORE_SCHED_UPDATE_DELAY));
}
updateProcessStats(self);
if (self->loadingQueue.empty() && self->sendingQueue.empty() && self->sendLoadParamQueue.empty()) {
TraceEvent(SevDebug, "FastRestoreLoaderDispatchRequestsWaitOnRequests", self->id())
.detail("HasPendingRequests", self->hasPendingRequests->get());
self->hasPendingRequests->set(false);
wait(self->hasPendingRequests->onChange()); // CAREFUL: Improper request release may cause the restore to get stuck here
}
}
} catch (Error& e) {
if (e.code() != error_code_actor_cancelled) {
TraceEvent(SevError, "FastRestoreLoaderDispatchRequests").errorUnsuppressed(e);
throw e;
}
}
return Void();
}
ACTOR Future<Void> restoreLoaderCore(RestoreLoaderInterface loaderInterf,
int nodeIndex,
Database cx,
RestoreControllerInterface ci) {
state Reference<RestoreLoaderData> self = makeReference<RestoreLoaderData>(loaderInterf.id(), nodeIndex, ci);
state Future<Void> error = actorCollection(self->addActor.getFuture());
state ActorCollection actors(false); // actors whose errors can be ignored
state Future<Void> exitRole = Never();
state bool hasQueuedRequests = false;
actors.add(updateProcessMetrics(self));
actors.add(traceProcessMetrics(self, "RestoreLoader"));
self->addActor.send(dispatchRequests(self));
loop {
state std::string requestTypeStr = "[Init]";
try {
choose {
when(RestoreSimpleRequest req = waitNext(loaderInterf.heartbeat.getFuture())) {
requestTypeStr = "heartbeat";
actors.add(handleHeartbeat(req, loaderInterf.id()));
}
when(RestoreSysInfoRequest req = waitNext(loaderInterf.updateRestoreSysInfo.getFuture())) {
requestTypeStr = "updateRestoreSysInfo";
handleRestoreSysInfoRequest(req, self);
}
when(RestoreLoadFileRequest req = waitNext(loaderInterf.loadFile.getFuture())) {
requestTypeStr = "loadFile";
hasQueuedRequests = !self->loadingQueue.empty() || !self->sendingQueue.empty();
self->initBackupContainer(req.param.url, req.param.proxy);
self->loadingQueue.push(req);
if (!hasQueuedRequests) {
self->hasPendingRequests->set(true);
}
}
when(RestoreSendMutationsToAppliersRequest req = waitNext(loaderInterf.sendMutations.getFuture())) {
requestTypeStr = "sendMutations";
hasQueuedRequests = !self->loadingQueue.empty() || !self->sendingQueue.empty();
self->sendingQueue.push(req);
if (!hasQueuedRequests) {
self->hasPendingRequests->set(true);
}
}
when(RestoreVersionBatchRequest req = waitNext(loaderInterf.initVersionBatch.getFuture())) {
requestTypeStr = "initVersionBatch";
actors.add(handleInitVersionBatchRequest(req, self));
}
when(RestoreVersionBatchRequest req = waitNext(loaderInterf.finishVersionBatch.getFuture())) {
requestTypeStr = "finishVersionBatch";
actors.add(handleFinishVersionBatchRequest(req, self));
}
when(RestoreFinishRequest req = waitNext(loaderInterf.finishRestore.getFuture())) {
requestTypeStr = "finishRestore";
handleFinishRestoreRequest(req, self);
if (req.terminate) {
exitRole = Void();
}
}
when(wait(actors.getResult())) {}
when(wait(exitRole)) {
TraceEvent("FastRestoreLoaderCoreExitRole", self->id());
break;
}
when(wait(error)) { TraceEvent("FastRestoreLoaderActorCollectionError", self->id()); }
}
} catch (Error& e) {
bool isError = e.code() != error_code_operation_cancelled; // == error_code_broken_promise
TraceEvent(isError ? SevError : SevWarnAlways, "FastRestoreLoaderError", self->id())
.errorUnsuppressed(e)
.detail("RequestType", requestTypeStr);
actors.clear(false);
break;
}
}
return Void();
}
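// Helper for logMutationTooOld() below: returns true when every range file whose key range intersects keyRange has
// version >= v, i.e., the restored range data already covers the log mutation committed at version v and the
// mutation can be skipped.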
static inline bool _logMutationTooOld(KeyRangeMap<Version>* pRangeVersions, KeyRangeRef keyRange, Version v) {
ASSERT(pRangeVersions != nullptr);
auto ranges = pRangeVersions->intersectingRanges(keyRange);
Version minVersion = MAX_VERSION;
for (auto r = ranges.begin(); r != ranges.end(); ++r) {
minVersion = std::min(minVersion, r->value());
}
ASSERT(minVersion != MAX_VERSION); // pRangeVersions is initialized to the entire keyspace, so ranges cannot be empty
return minVersion >= v;
}
static inline bool logMutationTooOld(KeyRangeMap<Version>* pRangeVersions, MutationRef mutation, Version v) {
return isRangeMutation(mutation)
? _logMutationTooOld(pRangeVersions, KeyRangeRef(mutation.param1, mutation.param2), v)
: _logMutationTooOld(pRangeVersions, KeyRangeRef(singleKeyRange(mutation.param1)), v);
}
// Assume: Only update the local data if it (applierInterf) has not been set
void handleRestoreSysInfoRequest(const RestoreSysInfoRequest& req, Reference<RestoreLoaderData> self) {
TraceEvent("FastRestoreLoader", self->id()).detail("HandleRestoreSysInfoRequest", self->id());
ASSERT(self.isValid());
// The loader has received the appliers interfaces
if (!self->appliersInterf.empty()) {
req.reply.send(RestoreCommonReply(self->id()));
return;
}
self->appliersInterf = req.sysInfo.appliers;
// Update rangeVersions
ASSERT(req.rangeVersions.size() > 0); // At least the min version of range files will be used
ASSERT(self->rangeVersions.size() == 1); // rangeVersions has not been set
for (auto rv = req.rangeVersions.begin(); rv != req.rangeVersions.end(); ++rv) {
self->rangeVersions.insert(rv->first, rv->second);
}
// Debug message for range version in each loader
auto ranges = self->rangeVersions.ranges();
int i = 0;
for (auto r = ranges.begin(); r != ranges.end(); ++r) {
TraceEvent("FastRestoreLoader", self->id())
.detail("RangeIndex", i++)
.detail("RangeBegin", r->begin())
.detail("RangeEnd", r->end())
.detail("Version", r->value());
}
req.reply.send(RestoreCommonReply(self->id()));
}
// Parse a data block in a partitioned mutation log file and store mutations
// into "kvOpsIter" and samples into "samplesIter".
ACTOR static Future<Void> _parsePartitionedLogFileOnLoader(
KeyRangeMap<Version>* pRangeVersions,
NotifiedVersion* processedFileOffset,
std::map<LoadingParam, VersionedMutationsMap>::iterator kvOpsIter,
std::map<LoadingParam, SampledMutationsVec>::iterator samplesIter,
LoaderCounters* cc,
Reference<IBackupContainer> bc,
RestoreAsset asset) {
state Standalone<StringRef> buf = makeString(asset.len);
state Reference<IAsyncFile> file = wait(bc->readFile(asset.filename));
int rLen = wait(file->read(mutateString(buf), asset.len, asset.offset));
if (rLen != asset.len)
throw restore_bad_read();
simulateBlobFailure();
TraceEvent("FastRestoreLoaderDecodingLogFile")
.detail("BatchIndex", asset.batchIndex)
.detail("Filename", asset.filename)
.detail("Offset", asset.offset)
.detail("Length", asset.len);
// Ensure data blocks in the same file are processed in order
wait(processedFileOffset->whenAtLeast(asset.offset));
ASSERT(processedFileOffset->get() == asset.offset);
Arena tempArena;
StringRefReader reader(buf, restore_corrupted_data());
try {
// Read block header
if (reader.consume<int32_t>() != PARTITIONED_MLOG_VERSION)
throw restore_unsupported_file_version();
VersionedMutationsMap& kvOps = kvOpsIter->second;
while (1) {
// If EOF is reached or the first key length byte is 0xFF, then the end of the block has been reached.
if (reader.eof() || *reader.rptr == 0xFF)
break;
// Deserialize messages written in saveMutationsToFile().
LogMessageVersion msgVersion;
msgVersion.version = reader.consumeNetworkUInt64();
msgVersion.sub = reader.consumeNetworkUInt32();
int msgSize = reader.consumeNetworkInt32();
const uint8_t* message = reader.consume(msgSize);
// Skip mutations out of the version range
if (!asset.isInVersionRange(msgVersion.version))
continue;
VersionedMutationsMap::iterator it;
bool inserted;
std::tie(it, inserted) = kvOps.emplace(msgVersion, MutationsVec());
// A clear mutation can be split into multiple mutations with the same (version, sub).
// See saveMutationsToFile(). Current tests only use one key range per backup, thus
// only one clear mutation is generated (i.e., always inserted).
ASSERT(inserted);
ArenaReader rd(buf.arena(), StringRef(message, msgSize), AssumeVersion(g_network->protocolVersion()));
if (EncryptedMutationMessage::isNextIn(rd)) {
throw encrypt_unsupported();
}
MutationRef mutation;
rd >> mutation;
// Skip mutations whose commitVersion < the range kv's version
if (logMutationTooOld(pRangeVersions, mutation, msgVersion.version)) {
cc->oldLogMutations += 1;
continue;
}
// Should this mutation be skipped?
if (mutation.param1 >= asset.range.end ||
(isRangeMutation(mutation) && mutation.param2 < asset.range.begin) ||
(!isRangeMutation(mutation) && mutation.param1 < asset.range.begin)) {
continue;
}
// Only apply mutation within the asset.range
ASSERT(asset.removePrefix.size() == 0);
if (isRangeMutation(mutation)) {
mutation.param1 = mutation.param1 >= asset.range.begin ? mutation.param1 : asset.range.begin;
mutation.param2 = mutation.param2 < asset.range.end ? mutation.param2 : asset.range.end;
// Remove prefix or add prefix when we restore to a new key space
if (asset.hasPrefix()) { // Avoid creating new Key
mutation.param1 =
mutation.param1.removePrefix(asset.removePrefix).withPrefix(asset.addPrefix, tempArena);
mutation.param2 =
mutation.param2.removePrefix(asset.removePrefix).withPrefix(asset.addPrefix, tempArena);
}
} else {
if (asset.hasPrefix()) { // Avoid creating new Key
mutation.param1 =
mutation.param1.removePrefix(asset.removePrefix).withPrefix(asset.addPrefix, tempArena);
}
}
TraceEvent(SevFRMutationInfo, "FastRestoreDecodePartitionedLogFile")
.detail("CommitVersion", msgVersion.toString())
.detail("ParsedMutation", mutation.toString());
it->second.push_back_deep(it->second.arena(), mutation);
cc->loadedLogBytes += mutation.totalSize();
// Sample data similarly to how the SS samples kvs
ByteSampleInfo sampleInfo = isKeyValueInSample(KeyValueRef(mutation.param1, mutation.param2));
if (sampleInfo.inSample) {
cc->sampledLogBytes += sampleInfo.sampledSize;
samplesIter->second.push_back_deep(samplesIter->second.arena(),
SampledMutation(mutation.param1, sampleInfo.sampledSize));
}
}
// Make sure any remaining bytes in the block are 0xFF
for (auto b : reader.remainder()) {
if (b != 0xFF)
throw restore_corrupted_data_padding();
}
} catch (Error& e) {
TraceEvent(SevWarn, "FileRestoreCorruptLogFileBlock")
.error(e)
.detail("BatchIndex", asset.batchIndex)
.detail("Filename", file->getFilename())
.detail("BlockOffset", asset.offset)
.detail("BlockLen", asset.len);
throw;
}
processedFileOffset->set(asset.offset + asset.len);
return Void();
}
// Wrapper of _parsePartitionedLogFileOnLoader that retries on blob errors
ACTOR static Future<Void> parsePartitionedLogFileOnLoader(
KeyRangeMap<Version>* pRangeVersions,
NotifiedVersion* processedFileOffset,
std::map<LoadingParam, VersionedMutationsMap>::iterator kvOpsIter,
std::map<LoadingParam, SampledMutationsVec>::iterator samplesIter,
LoaderCounters* cc,
Reference<IBackupContainer> bc,
RestoreAsset asset) {
state int readFileRetries = 0;
loop {
try {
wait(_parsePartitionedLogFileOnLoader(
pRangeVersions, processedFileOffset, kvOpsIter, samplesIter, cc, bc, asset));
break;
} catch (Error& e) {
if (e.code() == error_code_restore_bad_read || e.code() == error_code_restore_unsupported_file_version ||
e.code() == error_code_restore_corrupted_data_padding) { // non-retryable error
TraceEvent(SevError, "FastRestoreFileRestoreCorruptedPartitionedLogFileBlock").error(e);
throw;
} else if (e.code() == error_code_http_request_failed || e.code() == error_code_connection_failed ||
e.code() == error_code_timed_out || e.code() == error_code_lookup_failed) {
// blob http request failure, retry
TraceEvent(SevWarnAlways, "FastRestoreDecodedPartitionedLogFileConnectionFailure")
.error(e)
.detail("Retries", ++readFileRetries);
wait(delayJittered(0.1));
} else {
TraceEvent(SevError, "FastRestoreParsePartitionedLogFileOnLoaderUnexpectedError").error(e);
throw;
}
}
}
return Void();
}
ACTOR Future<Void> _processLoadingParam(KeyRangeMap<Version>* pRangeVersions,
LoadingParam param,
Reference<LoaderBatchData> batchData,
UID loaderID,
Reference<IBackupContainer> bc) {
// Temporary data structure for parsing log files into (version, <K, V, mutationType>)
// Must use Standalone to save mutations; otherwise, the MutationRef memory will be corrupted
// mutationMap: Key is the unique identifier for a batch of mutation logs at the same version
state SerializedMutationListMap mutationMap;
state NotifiedVersion processedFileOffset(0);
state std::vector<Future<Void>> fileParserFutures;
state std::map<LoadingParam, VersionedMutationsMap>::iterator kvOpsPerLPIter = batchData->kvOpsPerLP.end();
state std::map<LoadingParam, SampledMutationsVec>::iterator samplesIter = batchData->sampleMutations.end();
TraceEvent("FastRestoreLoaderProcessLoadingParam", loaderID)
.detail("BatchIndex", param.asset.batchIndex)
.detail("LoadingParam", param.toString());
ASSERT(param.blockSize > 0);
ASSERT(param.asset.offset % param.blockSize == 0); // File parsing must start at a block boundary.
ASSERT(batchData->kvOpsPerLP.find(param) == batchData->kvOpsPerLP.end());
// NOTE: a map's iterator is guaranteed to be stable, but a pointer may not be.
bool inserted;
std::tie(kvOpsPerLPIter, inserted) = batchData->kvOpsPerLP.emplace(param, VersionedMutationsMap());
ASSERT(inserted);
std::tie(samplesIter, inserted) = batchData->sampleMutations.emplace(param, SampledMutationsVec());
ASSERT(inserted);
for (int64_t j = param.asset.offset; j < param.asset.len; j += param.blockSize) {
RestoreAsset subAsset = param.asset;
subAsset.offset = j;
subAsset.len = std::min<int64_t>(param.blockSize, param.asset.len - j);
if (param.isRangeFile) {
fileParserFutures.push_back(_parseRangeFileToMutationsOnLoader(
kvOpsPerLPIter, samplesIter, &batchData->counters, bc, param.rangeVersion.get(), subAsset));
} else {
// TODO: Sanity check that the log file's version range overlaps the restored version range
if (param.isPartitionedLog()) {
fileParserFutures.push_back(parsePartitionedLogFileOnLoader(pRangeVersions,
&processedFileOffset,
kvOpsPerLPIter,
samplesIter,
&batchData->counters,
bc,
subAsset));
} else {
fileParserFutures.push_back(
parseLogFileToMutationsOnLoader(&processedFileOffset, &mutationMap, bc, subAsset));
}
}
}
wait(waitForAll(fileParserFutures));
if (!param.isRangeFile && !param.isPartitionedLog()) {
_parseSerializedMutation(
pRangeVersions, kvOpsPerLPIter, &mutationMap, samplesIter, &batchData->counters, param.asset);
}
TraceEvent("FastRestoreLoaderProcessLoadingParamDone", loaderID)
.detail("BatchIndex", param.asset.batchIndex)
.detail("LoadingParam", param.toString());
return Void();
}
// A loader can process multiple RestoreLoadFileRequest in parallel.
ACTOR Future<Void> handleLoadFileRequest(RestoreLoadFileRequest req, Reference<RestoreLoaderData> self) {
state Reference<LoaderBatchData> batchData = self->batch[req.batchIndex];
state bool isDuplicated = true;
state bool printTrace = false;
ASSERT(batchData.isValid());
ASSERT(req.batchIndex > self->finishedBatch.get());
bool paramExist = batchData->processedFileParams.find(req.param) != batchData->processedFileParams.end();
bool isReady = paramExist ? batchData->processedFileParams[req.param].isReady() : false;
batchData->loadFileReqs += 1;
printTrace = (batchData->loadFileReqs % 10 == 1);
// TODO: Make the actor priority lower than sendMutation priority. (Unsure it will help performance though)
TraceEvent(printTrace ? SevInfo : SevFRDebugInfo, "FastRestoreLoaderPhaseLoadFile", self->id())
.detail("BatchIndex", req.batchIndex)
.detail("ProcessLoadParam", req.param.toString())
.detail("NotProcessed", !paramExist)
.detail("Processed", isReady)
.detail("CurrentMemory", getSystemStatistics().processMemory);
// The loader destroys batchData once the batch finishes and self->finishedBatch.set(req.batchIndex) is called;
ASSERT(self->finishedBatch.get() < req.batchIndex);
wait(isSchedulable(self, req.batchIndex, __FUNCTION__));
if (batchData->processedFileParams.find(req.param) == batchData->processedFileParams.end()) {
TraceEvent(SevFRDebugInfo, "FastRestoreLoadFile", self->id())
.detail("BatchIndex", req.batchIndex)
.detail("ProcessLoadParam", req.param.toString());
ASSERT(batchData->sampleMutations.find(req.param) == batchData->sampleMutations.end());
batchData->processedFileParams[req.param] =
_processLoadingParam(&self->rangeVersions, req.param, batchData, self->id(), self->bc);
self->inflightLoadingReqs++;
isDuplicated = false;
} else {
TraceEvent(SevFRDebugInfo, "FastRestoreLoadFile", self->id())
.detail("BatchIndex", req.batchIndex)
.detail("WaitOnProcessLoadParam", req.param.toString());
}
auto it = batchData->processedFileParams.find(req.param);
ASSERT(it != batchData->processedFileParams.end());
wait(it->second); // wait on the processing of the req.param.
// Send sampled mutations back to controller: batchData->sampleMutations[req.param]
std::vector<Future<RestoreCommonReply>> fSendSamples;
SampledMutationsVec& samples = batchData->sampleMutations[req.param];
SampledMutationsVec sampleBatch = SampledMutationsVec(); // sampleBatch: Standalone pointer to the created object
long sampleBatchSize = 0;
for (int i = 0; i < samples.size(); ++i) {
sampleBatchSize += samples[i].totalSize();
sampleBatch.push_back_deep(sampleBatch.arena(), samples[i]); // TODO: may not need deep copy
if (sampleBatchSize >= SERVER_KNOBS->FASTRESTORE_SAMPLE_MSG_BYTES) {
fSendSamples.push_back(self->ci.samples.getReply(
RestoreSamplesRequest(deterministicRandom()->randomUniqueID(), req.batchIndex, sampleBatch)));
sampleBatchSize = 0;
sampleBatch = SampledMutationsVec();
}
}
if (sampleBatchSize > 0) {
fSendSamples.push_back(self->ci.samples.getReply(
RestoreSamplesRequest(deterministicRandom()->randomUniqueID(), req.batchIndex, sampleBatch)));
sampleBatchSize = 0;
}
state int samplesMessages = fSendSamples.size();
try {
wait(waitForAll(fSendSamples));
} catch (Error& e) { // In case ci.samples throws broken_promise due to unstable network
if (e.code() == error_code_broken_promise || e.code() == error_code_operation_cancelled) {
TraceEvent(SevWarnAlways, "FastRestoreLoaderPhaseLoadFileSendSamples")
.errorUnsuppressed(e)
.detail("SamplesMessages", samplesMessages);
} else {
TraceEvent(SevError, "FastRestoreLoaderPhaseLoadFileSendSamplesUnexpectedError").errorUnsuppressed(e);
}
}
// Ack to the restore controller that the param has been processed
self->inflightLoadingReqs--;
req.reply.send(RestoreLoadFileReply(req.param, isDuplicated));
TraceEvent(printTrace ? SevInfo : SevFRDebugInfo, "FastRestoreLoaderPhaseLoadFileDone", self->id())
.detail("BatchIndex", req.batchIndex)
.detail("ProcessLoadParam", req.param.toString());
return Void();
}
// Send buffered mutations to appliers.
// Do not need to block on low memory usage because this actor should not increase memory usage.
ACTOR Future<Void> handleSendMutationsRequest(RestoreSendMutationsToAppliersRequest req,
Reference<RestoreLoaderData> self) {
state Reference<LoaderBatchData> batchData;
state Reference<LoaderBatchStatus> batchStatus;
state bool isDuplicated = true;
if (req.batchIndex <= self->finishedBatch.get()) {
TraceEvent(SevWarn, "FastRestoreLoaderRestoreSendMutationsToAppliersRequestTooOld")
.detail("FinishedBatchIndex", self->finishedBatch.get())
.detail("RequestBatchIndex", req.batchIndex);
req.reply.send(RestoreCommonReply(self->id(), isDuplicated));
return Void();
}
batchData = self->batch[req.batchIndex];
batchStatus = self->status[req.batchIndex];
ASSERT(batchData.isValid() && batchStatus.isValid());
// The loader destroys batchData once the batch finishes and self->finishedBatch.set(req.batchIndex) is called;
ASSERT(req.batchIndex > self->finishedBatch.get());
TraceEvent("FastRestoreLoaderPhaseSendMutations", self->id())
.detail("BatchIndex", req.batchIndex)
.detail("UseRangeFile", req.useRangeFile)
.detail("LoaderSendStatus", batchStatus->toString());
// The VB must finish loading phase before it can send mutations; update finishedLoadingVB for scheduler
self->finishedLoadingVB = std::max(self->finishedLoadingVB, req.batchIndex);
// Ensure each file is sent exactly once by using batchStatus->sendAllLogs and batchStatus->sendAllRanges
if (!req.useRangeFile) {
if (!batchStatus->sendAllLogs.present()) { // Has not sent
batchStatus->sendAllLogs = Never();
isDuplicated = false;
TraceEvent(SevInfo, "FastRestoreSendMutationsProcessLogRequest", self->id())
.detail("BatchIndex", req.batchIndex)
.detail("UseRangeFile", req.useRangeFile);
} else if (!batchStatus->sendAllLogs.get().isReady()) { // In the process of sending
TraceEvent(SevDebug, "FastRestoreSendMutationsWaitDuplicateLogRequest", self->id())
.detail("BatchIndex", req.batchIndex)
.detail("UseRangeFile", req.useRangeFile);
wait(batchStatus->sendAllLogs.get());
} else { // Already sent
TraceEvent(SevDebug, "FastRestoreSendMutationsSkipDuplicateLogRequest", self->id())
.detail("BatchIndex", req.batchIndex)
.detail("UseRangeFile", req.useRangeFile);
}
} else {
if (!batchStatus->sendAllRanges.present()) {
batchStatus->sendAllRanges = Never();
isDuplicated = false;
TraceEvent(SevInfo, "FastRestoreSendMutationsProcessRangeRequest", self->id())
.detail("BatchIndex", req.batchIndex)
.detail("UseRangeFile", req.useRangeFile);
} else if (!batchStatus->sendAllRanges.get().isReady()) {
TraceEvent(SevDebug, "FastRestoreSendMutationsWaitDuplicateRangeRequest", self->id())
.detail("BatchIndex", req.batchIndex)
.detail("UseRangeFile", req.useRangeFile);
wait(batchStatus->sendAllRanges.get());
} else {
TraceEvent(SevDebug, "FastRestoreSendMutationsSkipDuplicateRangeRequest", self->id())
.detail("BatchIndex", req.batchIndex)
.detail("UseRangeFile", req.useRangeFile);
}
}
if (!isDuplicated) {
self->inflightSendingReqs++;
std::vector<Future<Void>> fSendMutations;
batchData->rangeToApplier = req.rangeToApplier;
for (auto& [loadParam, kvOps] : batchData->kvOpsPerLP) {
if (loadParam.isRangeFile == req.useRangeFile) {
// Send the parsed mutations to the applier, which will apply them to the DB
fSendMutations.push_back(sendMutationsToApplier(&self->sendLoadParamQueue,
&self->inflightSendLoadParamReqs,
&self->finishedBatch,
&kvOps,
req.batchIndex,
loadParam.asset,
loadParam.isRangeFile,
&batchData->rangeToApplier,
&self->appliersInterf));
}
}
wait(waitForAll(fSendMutations));
self->inflightSendingReqs--;
if (req.useRangeFile) {
batchStatus->sendAllRanges = Void(); // Finish sending kvs parsed from range files
} else {
batchStatus->sendAllLogs = Void();
}
if ((batchStatus->sendAllRanges.present() && batchStatus->sendAllRanges.get().isReady()) &&
(batchStatus->sendAllLogs.present() && batchStatus->sendAllLogs.get().isReady())) {
// Both log and range files have been sent.
self->finishedSendingVB = std::max(self->finishedSendingVB, req.batchIndex);
batchData->kvOpsPerLP.clear();
}
}
TraceEvent("FastRestoreLoaderPhaseSendMutationsDone", self->id())
.detail("BatchIndex", req.batchIndex)
.detail("UseRangeFile", req.useRangeFile)
.detail("LoaderSendStatus", batchStatus->toString());
req.reply.send(RestoreCommonReply(self->id(), isDuplicated));
return Void();
}
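// Builds a KeyRangeMap from the sorted begin-key -> applierID map: each applier owns the range from its begin key
// up to the next applier's begin key, and the last applier owns up to normalKeys.end.
// Illustrative (hypothetical) example: { "" -> A, "m" -> B } becomes ["", "m") -> A and ["m", normalKeys.end) -> B.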
void buildApplierRangeMap(KeyRangeMap<UID>* krMap, std::map<Key, UID>* pRangeToApplier) {
std::map<Key, UID>::iterator beginKey = pRangeToApplier->begin();
std::map<Key, UID>::iterator endKey = std::next(beginKey, 1);
while (endKey != pRangeToApplier->end()) {
krMap->insert(KeyRangeRef(beginKey->first, endKey->first), beginKey->second);
beginKey = endKey;
endKey++;
}
if (beginKey != pRangeToApplier->end()) {
krMap->insert(KeyRangeRef(beginKey->first, normalKeys.end), beginKey->second);
}
}
// Assume: kvOps data are from the same RestoreAsset.
// Input: pkvOps: versioned kv mutations for the asset in the version batch (batchIndex)
// isRangeFile: is pkvOps from a range file? Lets the receiver (applier) know whether the mutations are log mutations;
// pRangeToApplier: range to applierID mapping, deciding which applier is responsible for which range
// pApplierInterfaces: applier interfaces to send the mutations to
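// Note on batching (a summary of the logic below): mutations are accumulated per applier in
// applierVersionedMutationsBuffer; whenever the accumulated msgSize reaches
// FASTRESTORE_LOADER_SEND_MUTATION_MSG_BYTES, one RestoreSendVersionedMutationsRequest per applier is sent via
// sendBatchRequests() and the buffers are reset; any remainder is flushed after the loop.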
ACTOR Future<Void> sendMutationsToApplier(
std::priority_queue<RestoreLoaderSchedSendLoadParamRequest>* sendLoadParamQueue,
std::map<int, int>* inflightSendLoadParamReqs,
NotifiedVersion* finishedBatch,
VersionedMutationsMap* pkvOps,
int batchIndex,
RestoreAsset asset,
bool isRangeFile,
std::map<Key, UID>* pRangeToApplier,
std::map<UID, RestoreApplierInterface>* pApplierInterfaces) {
state VersionedMutationsMap& kvOps = *pkvOps;
state VersionedMutationsMap::iterator kvOp = kvOps.begin();
state int kvCount = 0;
state int splitMutationIndex = 0;
state Version msgIndex = 1; // Monotonically increasing index for sent messages; must start at 1
state std::vector<UID> applierIDs = getApplierIDs(*pRangeToApplier);
state double msgSize = 0; // size of mutations in the message
// Wait for scheduler to kick it off
Promise<Void> toSched;
sendLoadParamQueue->push(RestoreLoaderSchedSendLoadParamRequest(batchIndex, toSched, now()));
wait(toSched.getFuture());
if (finishedBatch->get() >= batchIndex) {
TraceEvent(SevError, "FastRestoreLoaderSendMutationToApplierLateRequest")
.detail("FinishedBatchIndex", finishedBatch->get())
.detail("RequestBatchIndex", batchIndex);
ASSERT(false);
return Void();
}
(*inflightSendLoadParamReqs)[batchIndex]++;
TraceEvent("FastRestoreLoaderSendMutationToApplier")
.detail("IsRangeFile", isRangeFile)
.detail("EndVersion", asset.endVersion)
.detail("RestoreAsset", asset.toString());
// There should be no mutation at version asset.endVersion because it is exclusive
if (kvOps.lower_bound(LogMessageVersion(asset.endVersion)) != kvOps.end()) {
TraceEvent(SevError, "FastRestoreLoaderSendMutationToApplier")
.detail("BatchIndex", batchIndex)
.detail("RestoreAsset", asset.toString())
.detail("IsRangeFile", isRangeFile)
.detail("Data loss at version", asset.endVersion);
} else {
// Ensure there is a mutation request sent at endVersion, so that applier can advance its notifiedVersion
kvOps[LogMessageVersion(asset.endVersion)] = MutationsVec(); // Empty mutation vector will be handled by applier
}
splitMutationIndex = 0;
kvCount = 0;
// applierVersionedMutationsBuffer is the per-applier buffer of versioned mutations to be sent
state std::map<UID, VersionedMutationsVec> applierVersionedMutationsBuffer;
state int mIndex = 0;
state LogMessageVersion commitVersion;
state std::vector<Future<Void>> fSends;
for (auto& applierID : applierIDs) {
applierVersionedMutationsBuffer[applierID] = VersionedMutationsVec();
}
KeyRangeMap<UID> krMap;
buildApplierRangeMap(&krMap, pRangeToApplier);
for (kvOp = kvOps.begin(); kvOp != kvOps.end(); kvOp++) {
commitVersion = kvOp->first;
ASSERT(commitVersion.version >= asset.beginVersion);
ASSERT(commitVersion.version <= asset.endVersion); // endVersion is an empty commit to ensure progress
for (mIndex = 0; mIndex < kvOp->second.size(); mIndex++) {
MutationRef& kvm = kvOp->second[mIndex];
// Send the mutation to applier
if (isRangeMutation(kvm)) {
MutationsVec mvector;
Standalone<VectorRef<UID>> nodeIDs;
// Because using a vector of mutations causes overhead and range mutations should happen rarely,
// we handle range mutations and single-key mutations differently to avoid memory copies
splitMutation(krMap, kvm, mvector.arena(), mvector.contents(), nodeIDs.arena(), nodeIDs.contents());
ASSERT(mvector.size() == nodeIDs.size());
if (MUTATION_TRACKING_ENABLED) {
TraceEvent&& e = debugMutation("RestoreLoaderDebugSplit", commitVersion.version, kvm);
if (e.isEnabled()) {
int i = 0;
for (auto& [key, uid] : *pRangeToApplier) {
e.detail(format("Range%d", i).c_str(), printable(key))
.detail(format("UID%d", i).c_str(), uid.toString());
i++;
}
}
}
for (splitMutationIndex = 0; splitMutationIndex < mvector.size(); splitMutationIndex++) {
MutationRef mutation = mvector[splitMutationIndex];
UID applierID = nodeIDs[splitMutationIndex];
DEBUG_MUTATION("RestoreLoaderSplitMutation", commitVersion.version, mutation)
.detail("CommitVersion", commitVersion.toString());
// CAREFUL: The split mutations' lifetime is shorter than the for-loop
// Must use deep copy for split mutations
applierVersionedMutationsBuffer[applierID].push_back_deep(
applierVersionedMutationsBuffer[applierID].arena(), VersionedMutation(mutation, commitVersion));
msgSize += mutation.expectedSize();
kvCount++;
}
} else { // mutation operates on a particular key
std::map<Key, UID>::iterator itlow = pRangeToApplier->upper_bound(kvm.param1);
--itlow; // make sure itlow->first <= m.param1
ASSERT(itlow->first <= kvm.param1);
UID applierID = itlow->second;
kvCount++;
DEBUG_MUTATION("RestoreLoaderSendMutation", commitVersion.version, kvm)
.detail("Applier", applierID)
.detail("SubVersion", commitVersion.toString());
// kvm data is saved in pkvOps in batchData, so shallow copy is ok here.
applierVersionedMutationsBuffer[applierID].push_back(applierVersionedMutationsBuffer[applierID].arena(),
VersionedMutation(kvm, commitVersion));
msgSize += kvm.expectedSize();
}
// Batch mutations at multiple versions up to FASTRESTORE_LOADER_SEND_MUTATION_MSG_BYTES size
// to improve bandwidth from a loader to appliers
if (msgSize >= SERVER_KNOBS->FASTRESTORE_LOADER_SEND_MUTATION_MSG_BYTES) {
std::vector<std::pair<UID, RestoreSendVersionedMutationsRequest>> requests;
requests.reserve(applierIDs.size());
for (const UID& applierID : applierIDs) {
requests.emplace_back(
applierID,
RestoreSendVersionedMutationsRequest(
batchIndex, asset, msgIndex, isRangeFile, applierVersionedMutationsBuffer[applierID]));
}
TraceEvent(SevInfo, "FastRestoreLoaderSendMutationToApplier")
.detail("MessageIndex", msgIndex)
.detail("RestoreAsset", asset.toString())
.detail("Requests", requests.size());
fSends.push_back(sendBatchRequests(&RestoreApplierInterface::sendMutationVector,
*pApplierInterfaces,
requests,
TaskPriority::RestoreLoaderSendMutations));
msgIndex++;
msgSize = 0;
for (auto& applierID : applierIDs) {
applierVersionedMutationsBuffer[applierID] = VersionedMutationsVec();
}
}
} // Mutations at the same LogMessageVersion
} // all versions of mutations in the same file
// Send the remaining mutations in the applierMutationsBuffer
if (msgSize > 0) {
// TODO: Sanity check each asset has been received exactly once!
std::vector<std::pair<UID, RestoreSendVersionedMutationsRequest>> requests;
requests.reserve(applierIDs.size());
for (const UID& applierID : applierIDs) {
requests.emplace_back(
applierID,
RestoreSendVersionedMutationsRequest(
batchIndex, asset, msgIndex, isRangeFile, applierVersionedMutationsBuffer[applierID]));
}
TraceEvent(SevInfo, "FastRestoreLoaderSendMutationToApplier")
.detail("MessageIndex", msgIndex)
.detail("RestoreAsset", asset.toString())
.detail("Requests", requests.size());
fSends.push_back(sendBatchRequests(&RestoreApplierInterface::sendMutationVector,
*pApplierInterfaces,
requests,
TaskPriority::RestoreLoaderSendMutations));
}
wait(waitForAll(fSends));
(*inflightSendLoadParamReqs)[batchIndex]--;
if (finishedBatch->get() < batchIndex) {
kvOps = VersionedMutationsMap(); // Free memory for parsed mutations at the restore asset.
TraceEvent("FastRestoreLoaderSendMutationToApplierDone")
.detail("BatchIndex", batchIndex)
.detail("RestoreAsset", asset.toString())
.detail("Mutations", kvCount);
} else {
TraceEvent(SevWarnAlways, "FastRestoreLoaderSendMutationToApplierDoneTooLate")
.detail("BatchIndex", batchIndex)
.detail("FinishedBatchIndex", finishedBatch->get())
.detail("RestoreAsset", asset.toString())
.detail("Mutations", kvCount);
}
return Void();
}
// Splits a clear range mutation across appliers and puts the resulting split mutations and
// applier IDs into "mvector" and "nodeIDs" on return.
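// Illustrative (hypothetical) example: with krMap = { ["", "b") -> A, ["b", "d") -> B, ["d", "\xff") -> C },
// a ClearRange over ["a", "c") is split into Clear ["a", "b") for applier A and Clear ["b", "c") for applier B;
// each piece is the overlap between the mutation's range and that applier's range.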
void splitMutation(const KeyRangeMap<UID>& krMap,
MutationRef m,
Arena& mvector_arena,
VectorRef<MutationRef>& mvector,
Arena& nodeIDs_arena,
VectorRef<UID>& nodeIDs) {
TraceEvent(SevDebug, "FastRestoreSplitMutation").detail("Mutation", m);
ASSERT(mvector.empty());
ASSERT(nodeIDs.empty());
auto r = krMap.intersectingRanges(KeyRangeRef(m.param1, m.param2));
for (auto i = r.begin(); i != r.end(); ++i) {
// Calculate the overlap range
KeyRef rangeBegin = m.param1 > i->range().begin ? m.param1 : i->range().begin;
KeyRef rangeEnd = m.param2 < i->range().end ? m.param2 : i->range().end;
KeyRange krange1(KeyRangeRef(rangeBegin, rangeEnd));
mvector.push_back_deep(mvector_arena, MutationRef(MutationRef::ClearRange, rangeBegin, rangeEnd));
nodeIDs.push_back(nodeIDs_arena, i->cvalue());
}
}
// key_input format:
// [logRangeMutation.first][hash_value_of_commit_version:1B][bigEndian64(commitVersion)][bigEndian32(part)]
// value_input: serialized binary of mutations at the same version
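// Illustrative key layout (field widths in bytes, values hypothetical) for a batch committed at version V, part p:
//   [logRangeMutation.first (variable length)][hash(V) : 1][bigEndian64(V) : 8][bigEndian32(p) : 4]
// The parser below strips the variable-length prefix, consumes the 1-byte hash, then reads V and p.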
bool concatenateBackupMutationForLogFile(SerializedMutationListMap* pMutationMap,
Standalone<StringRef> key_input,
Standalone<StringRef> val_input,
const RestoreAsset& asset) {
SerializedMutationListMap& mutationMap = *pMutationMap;
const int key_prefix_len = sizeof(uint8_t) + sizeof(Version) + sizeof(uint32_t);
StringRefReader readerKey(key_input, restore_corrupted_data()); // read key_input!
int logRangeMutationFirstLength = key_input.size() - key_prefix_len;
bool concatenated = false;
ASSERT_WE_THINK(key_input.size() >= key_prefix_len);
if (logRangeMutationFirstLength > 0) {
// Strip out the [logRangeMutation.first]; otherwise, the following readerKey.consume will produce wrong value
readerKey.consume(logRangeMutationFirstLength);
}
readerKey.consume<uint8_t>(); // uint8_t hashValue = readerKey.consume<uint8_t>()
Version commitVersion = readerKey.consumeNetworkUInt64();
// Skip mutations not in [asset.beginVersion, asset.endVersion), the only range we are processing right now
if (!asset.isInVersionRange(commitVersion)) {
return false;
}
uint32_t part = readerKey.consumeNetworkUInt32();
// Use commitVersion as id
Standalone<StringRef> id = StringRef((uint8_t*)&commitVersion, sizeof(Version));
auto it = mutationMap.find(id);
if (it == mutationMap.end()) {
mutationMap.emplace(id, std::make_pair(val_input, 0));
if (part != 0) {
TraceEvent(SevError, "FastRestoreLoader")
.detail("FirstPartNotZero", part)
.detail("KeyInput", getHexString(key_input));
}
} else { // Concatenate the val string with the same commitVersion
it->second.first =
it->second.first.contents().withSuffix(val_input.contents()); // Assign the new Arena to the map's value
auto& currentPart = it->second.second;
if (part != (currentPart + 1)) {
// Check if the same range or log file has been processed more than once!
TraceEvent(SevError, "FastRestoreLoader")
.detail("CurrentPart1", currentPart)
.detail("CurrentPart2", part)
.detail("KeyInput", getHexString(key_input))
.detail("Hint", "Check if the same range or log file has been processed more than once");
}
currentPart = part;
concatenated = true;
}
return concatenated;
}
// Parse the kv pair (version, serialized_mutation), which are the results parsed from log file, into
// (version, <K, V, mutationType>) pair;
// Put the parsed versioned mutations into *pkvOps.
//
// Input key: [commitVersion_of_the_mutation_batch:uint64_t];
// Input value: [includeVersion:uint64_t][val_length:uint32_t][encoded_list_of_mutations], where
// includeVersion is the serialized version in the batch commit. It is not the commitVersion in Input key.
//
// val_length is always equal to (val.size() - 12); otherwise,
// we may not get the entire mutation list for the version.
// encoded_list_of_mutations: [mutation1][mutation2]...[mutationk], where
// a mutation is encoded as [type:uint32_t][keyLength:uint32_t][valueLength:uint32_t][keyContent][valueContent]
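// Illustrative (hypothetical) encoding of a single SetValue mutation with key "k" and value "v" inside
// encoded_list_of_mutations (fields little-endian):
//   [type=SetValue : 4B][keyLength=1 : 4B][valueLength=1 : 4B]['k']['v']
// The parsing loop below consumes exactly these five fields per mutation.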
void _parseSerializedMutation(KeyRangeMap<Version>* pRangeVersions,
std::map<LoadingParam, VersionedMutationsMap>::iterator kvOpsIter,
SerializedMutationListMap* pmutationMap,
std::map<LoadingParam, SampledMutationsVec>::iterator samplesIter,
LoaderCounters* cc,
const RestoreAsset& asset) {
VersionedMutationsMap& kvOps = kvOpsIter->second;
SampledMutationsVec& samples = samplesIter->second;
SerializedMutationListMap& mutationMap = *pmutationMap;
TraceEvent(SevFRMutationInfo, "FastRestoreLoaderParseSerializedLogMutation")
.detail("BatchIndex", asset.batchIndex)
.detail("RestoreAsset", asset.toString());
Arena tempArena;
for (auto& m : mutationMap) {
StringRef k = m.first.contents();
StringRef val = m.second.first.contents();
StringRefReader kReader(k, restore_corrupted_data());
uint64_t commitVersion = kReader.consume<uint64_t>(); // Consume little Endian data
// We have already filtered out commits not in [beginVersion, endVersion) when we concatenated the kv pairs from the log file
ASSERT_WE_THINK(asset.isInVersionRange(commitVersion));
StringRefReader vReader(val, restore_corrupted_data());
vReader.consume<uint64_t>(); // Consume the includeVersion
// TODO(xumengpanda): verify the protocol version is compatible and raise error if needed
// Parse little endian value, confirmed it is correct!
uint32_t val_length_decoded = vReader.consume<uint32_t>();
ASSERT(val_length_decoded == val.size() - sizeof(uint64_t) - sizeof(uint32_t));
int sub = 0;
while (1) {
// Stop when we reach the end of the string
if (vReader.eof()) { //|| *reader.rptr == 0xFF
break;
}
uint32_t type = vReader.consume<uint32_t>();
uint32_t kLen = vReader.consume<uint32_t>();
uint32_t vLen = vReader.consume<uint32_t>();
const uint8_t* k = vReader.consume(kLen);
const uint8_t* v = vReader.consume(vLen);
MutationRef mutation((MutationRef::Type)type, KeyRef(k, kLen), KeyRef(v, vLen));
// Should this mutation be skipped?
// Skip mutations whose commitVersion < the range kv's version
if (logMutationTooOld(pRangeVersions, mutation, commitVersion)) {
cc->oldLogMutations += 1;
continue;
}
if (mutation.param1 >= asset.range.end ||
(isRangeMutation(mutation) && mutation.param2 < asset.range.begin) ||
(!isRangeMutation(mutation) && mutation.param1 < asset.range.begin)) {
continue;
}
// Only apply mutation within the asset.range and apply removePrefix and addPrefix
ASSERT(asset.removePrefix.size() == 0);
if (isRangeMutation(mutation)) {
mutation.param1 = mutation.param1 >= asset.range.begin ? mutation.param1 : asset.range.begin;
mutation.param2 = mutation.param2 < asset.range.end ? mutation.param2 : asset.range.end;
// Remove prefix or add prefix if we restore data to a new key space
if (asset.hasPrefix()) { // Avoid creating new Key
mutation.param1 =
mutation.param1.removePrefix(asset.removePrefix).withPrefix(asset.addPrefix, tempArena);
mutation.param2 =
mutation.param2.removePrefix(asset.removePrefix).withPrefix(asset.addPrefix, tempArena);
}
} else {
if (asset.hasPrefix()) { // Avoid creating new Key
mutation.param1 =
mutation.param1.removePrefix(asset.removePrefix).withPrefix(asset.addPrefix, tempArena);
}
}
cc->loadedLogBytes += mutation.totalSize();
TraceEvent(SevFRMutationInfo, "FastRestoreDecodeLogFile")
.detail("CommitVersion", commitVersion)
.detail("ParsedMutation", mutation.toString());
auto it = kvOps.insert(std::make_pair(LogMessageVersion(commitVersion, sub++), MutationsVec()));
ASSERT(it.second); // inserted is true
ASSERT(sub < std::numeric_limits<int32_t>::max()); // range file mutation uses int32_max as subversion
it.first->second.push_back_deep(it.first->second.arena(), mutation);
// Sample data similarly to how the SS samples bytes
ByteSampleInfo sampleInfo = isKeyValueInSample(KeyValueRef(mutation.param1, mutation.param2));
if (sampleInfo.inSample) {
cc->sampledLogBytes += sampleInfo.sampledSize;
samples.push_back_deep(samples.arena(), SampledMutation(mutation.param1, sampleInfo.sampledSize));
}
ASSERT_WE_THINK(kLen >= 0 && kLen < val.size());
ASSERT_WE_THINK(vLen >= 0 && vLen < val.size());
}
}
}
// Parsing the data blocks in a range file
// kvOpsIter: saves the parsed versioned-mutations for the specific LoadingParam;
// samplesIter: saves the sampled mutations from the parsed versioned-mutations;
// bc: backup container to read the backup file
// version: the version the parsed mutations should be at
// asset: RestoreAsset about which backup data should be parsed
ACTOR static Future<Void> _parseRangeFileToMutationsOnLoader(
std::map<LoadingParam, VersionedMutationsMap>::iterator kvOpsIter,
std::map<LoadingParam, SampledMutationsVec>::iterator samplesIter,
LoaderCounters* cc,
Reference<IBackupContainer> bc,
Version version,
RestoreAsset asset) {
state VersionedMutationsMap& kvOps = kvOpsIter->second;
state SampledMutationsVec& sampleMutations = samplesIter->second;
TraceEvent(SevFRDebugInfo, "FastRestoreDecodedRangeFile")
.detail("BatchIndex", asset.batchIndex)
.detail("Filename", asset.filename)
.detail("Version", version)
.detail("BeginVersion", asset.beginVersion)
.detail("EndVersion", asset.endVersion)
.detail("RestoreAsset", asset.toString());
// Sanity check the range file is within the restored version range
ASSERT_WE_THINK(asset.isInVersionRange(version));
state Standalone<VectorRef<KeyValueRef>> blockData;
// should retry here
state int readFileRetries = 0;
loop {
try {
// The version of the key-value set is rangeFile.version; key-values in the same range file have the same
// version
Reference<IAsyncFile> inFile = wait(bc->readFile(asset.filename));
Standalone<VectorRef<KeyValueRef>> kvs =
wait(fileBackup::decodeRangeFileBlock(inFile, asset.offset, asset.len));
TraceEvent("FastRestoreLoaderDecodedRangeFile")
.detail("BatchIndex", asset.batchIndex)
.detail("Filename", asset.filename)
.detail("DataSize", kvs.contents().size());
blockData = kvs;
break;
} catch (Error& e) {
if (e.code() == error_code_restore_bad_read || e.code() == error_code_restore_unsupported_file_version ||
e.code() == error_code_restore_corrupted_data_padding) { // non-retryable error
TraceEvent(SevError, "FastRestoreFileRestoreCorruptedRangeFileBlock").error(e);
throw;
} else if (e.code() == error_code_http_request_failed || e.code() == error_code_connection_failed ||
e.code() == error_code_timed_out || e.code() == error_code_lookup_failed) {
// blob http request failure, retry
TraceEvent(SevWarnAlways, "FastRestoreDecodedRangeFileConnectionFailure")
.error(e)
.detail("Retries", ++readFileRetries);
wait(delayJittered(0.1));
} else {
TraceEvent(SevError, "FastRestoreParseRangeFileOnLoaderUnexpectedError").error(e);
throw;
}
}
}
// First and last key are the range for this file
KeyRange fileRange = KeyRangeRef(blockData.front().key, blockData.back().key);
// If fileRange doesn't intersect restore range then we're done.
if (!fileRange.intersects(asset.range)) {
return Void();
}
// We know the file range intersects the restore range but there could still be keys outside the restore range.
// Find the subvector of kv pairs that intersect the restore range.
// Note that the first and last keys are just the range endpoints for this file.
// They are metadata, not the real data.
int rangeStart = 1;
int rangeEnd = blockData.size() - 1; // [rangeStart, rangeEnd) is a half-open interval
// Slide rangeStart from the beginning; stop when a key inside the restore range is found.
// Move rangeStart and rangeEnd until they are within the restore range.
while (rangeStart < rangeEnd && !asset.range.contains(blockData[rangeStart].key)) {
++rangeStart;
}
// Slide rangeEnd from the back; stop when the key at (rangeEnd - 1) is inside the restore range
while (rangeEnd > rangeStart && !asset.range.contains(blockData[rangeEnd - 1].key)) {
--rangeEnd;
}
// Now data only contains the kv mutations within the restore range
VectorRef<KeyValueRef> data = blockData.slice(rangeStart, rangeEnd);
// Note we give INT_MAX as the sub sequence number to override any log mutations.
const LogMessageVersion msgVersion(version, std::numeric_limits<int32_t>::max());
// Convert KV in data into SET mutations of different keys in kvOps
Arena tempArena;
for (const KeyValueRef& kv : data) {
// NOTE: The KV pairs in range files are the real KV pairs in original DB.
MutationRef m(MutationRef::Type::SetValue, kv.key, kv.value);
// Remove prefix or add prefix in case we restore data to a different sub keyspace
if (asset.hasPrefix()) { // Avoid creating new Key
ASSERT(asset.removePrefix.size() == 0);
m.param1 = m.param1.removePrefix(asset.removePrefix).withPrefix(asset.addPrefix, tempArena);
}
cc->loadedRangeBytes += m.totalSize();
// We cache all kv operations into kvOps, and apply all kv operations later in one place
auto it = kvOps.insert(std::make_pair(msgVersion, MutationsVec()));
TraceEvent(SevFRMutationInfo, "FastRestoreDecodeRangeFile")
.detail("BatchIndex", asset.batchIndex)
.detail("CommitVersion", version)
.detail("ParsedMutationKV", m.toString());
it.first->second.push_back_deep(it.first->second.arena(), m);
// Sampling (FASTRESTORE_SAMPLING_PERCENT%) data
ByteSampleInfo sampleInfo = isKeyValueInSample(KeyValueRef(m.param1, m.param2));
if (sampleInfo.inSample) {
cc->sampledRangeBytes += sampleInfo.sampledSize;
sampleMutations.push_back_deep(sampleMutations.arena(), SampledMutation(m.param1, sampleInfo.sampledSize));
}
}
return Void();
}
// Parse data blocks in a log file into a vector of <string, string> pairs.
// Each pair.second contains the mutations at a version encoded in pair.first;
// Step 1: decodeLogFileBlock into <string, string> pairs;
// Step 2: Concatenate the second of pairs with the same pair.first.
// pProcessedFileOffset: ensure each data block is processed in order exactly once;
// pMutationMap: concatenated mutation list string at the mutation's commit version
ACTOR static Future<Void> _parseLogFileToMutationsOnLoader(NotifiedVersion* pProcessedFileOffset,
SerializedMutationListMap* pMutationMap,
Reference<IBackupContainer> bc,
RestoreAsset asset) {
Reference<IAsyncFile> inFile = wait(bc->readFile(asset.filename));
// decodeLogFileBlock() must read block by block!
state Standalone<VectorRef<KeyValueRef>> data =
wait(parallelFileRestore::decodeLogFileBlock(inFile, asset.offset, asset.len));
TraceEvent("FastRestoreLoaderDecodeLogFile")
.detail("BatchIndex", asset.batchIndex)
.detail("RestoreAsset", asset.toString())
.detail("DataSize", data.contents().size());
// Ensure data blocks in the same file are processed in order
wait(pProcessedFileOffset->whenAtLeast(asset.offset));
if (pProcessedFileOffset->get() == asset.offset) {
for (const KeyValueRef& kv : data) {
// Concatenate the backup param1 and param2 (KV) at the same version.
concatenateBackupMutationForLogFile(pMutationMap, kv.key, kv.value, asset);
}
pProcessedFileOffset->set(asset.offset + asset.len);
}
return Void();
}
// Wrapper of _parseLogFileToMutationsOnLoader that retries on blob errors
ACTOR static Future<Void> parseLogFileToMutationsOnLoader(NotifiedVersion* pProcessedFileOffset,
SerializedMutationListMap* pMutationMap,
Reference<IBackupContainer> bc,
RestoreAsset asset) {
state int readFileRetries = 0;
loop {
try {
wait(_parseLogFileToMutationsOnLoader(pProcessedFileOffset, pMutationMap, bc, asset));
break;
} catch (Error& e) {
if (e.code() == error_code_restore_bad_read || e.code() == error_code_restore_unsupported_file_version ||
e.code() == error_code_restore_corrupted_data_padding) { // non-retryable errors
TraceEvent(SevError, "FastRestoreFileRestoreCorruptedLogFileBlock").error(e);
throw;
} else if (e.code() == error_code_http_request_failed || e.code() == error_code_connection_failed ||
e.code() == error_code_timed_out || e.code() == error_code_lookup_failed) {
// Transient blob HTTP or connection failure; retry
TraceEvent(SevWarnAlways, "FastRestoreDecodedLogFileConnectionFailure")
.error(e)
.detail("Retries", ++readFileRetries);
wait(delayJittered(0.1));
} else {
TraceEvent(SevError, "FastRestoreParseLogFileToMutationsOnLoaderUnexpectedError").error(e);
throw;
}
}
}
return Void();
}
// Return applier IDs that are used to apply key-values
std::vector<UID> getApplierIDs(std::map<Key, UID>& rangeToApplier) {
std::vector<UID> applierIDs;
applierIDs.reserve(rangeToApplier.size());
for (auto& applier : rangeToApplier) {
applierIDs.push_back(applier.second);
}
ASSERT(!applierIDs.empty());
return applierIDs;
}
// Notify the loader that the version batch (req.batchIndex) has been applied.
// This determines up to which version batch the loader may release its delayed actors, even when the worker is low on memory
ACTOR Future<Void> handleFinishVersionBatchRequest(RestoreVersionBatchRequest req, Reference<RestoreLoaderData> self) {
// Ensure batch (i-1) is applied before batch i
TraceEvent("FastRestoreLoaderHandleFinishVersionBatch", self->id())
.detail("FinishedBatchIndex", self->finishedBatch.get())
.detail("RequestedBatchIndex", req.batchIndex);
wait(self->finishedBatch.whenAtLeast(req.batchIndex - 1));
if (self->finishedBatch.get() == req.batchIndex - 1) {
// Sanity check: All requests before and in this batchIndex must have been processed; otherwise,
// those requests may cause a segmentation fault after the applier removes the batch data
while (!self->loadingQueue.empty() && self->loadingQueue.top().batchIndex <= req.batchIndex) {
// There are still pending requests from the current or an earlier batchIndex, which should not happen
TraceEvent(SevWarn, "FastRestoreLoaderHasPendingLoadFileRequests")
.detail("PendingRequest", self->loadingQueue.top().toString());
self->loadingQueue.pop();
}
while (!self->sendingQueue.empty() && self->sendingQueue.top().batchIndex <= req.batchIndex) {
TraceEvent(SevWarn, "FastRestoreLoaderHasPendingSendRequests")
.detail("PendingRequest", self->sendingQueue.top().toString());
self->sendingQueue.pop();
}
while (!self->sendLoadParamQueue.empty() && self->sendLoadParamQueue.top().batchIndex <= req.batchIndex) {
TraceEvent(SevWarn, "FastRestoreLoaderHasPendingSendLoadParamRequests")
.detail("PendingRequest", self->sendLoadParamQueue.top().toString());
self->sendLoadParamQueue.pop();
}
self->finishedBatch.set(req.batchIndex);
// Clean up batchData
self->batch.erase(req.batchIndex);
self->status.erase(req.batchIndex);
}
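// Wake actors that were delayed due to memory pressure so they can re-check available memory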
if (self->delayedActors > 0) {
self->checkMemory.trigger();
}
req.reply.send(RestoreCommonReply(self->id(), false));
return Void();
}
namespace {
void oldSplitMutation(std::map<Key, UID>* pRangeToApplier,
MutationRef m,
Arena& mvector_arena,
VectorRef<MutationRef>& mvector,
Arena& nodeIDs_arena,
VectorRef<UID>& nodeIDs) {
// mvector[i] is the split mutation to be sent to applier nodeIDs[i]
ASSERT(mvector.empty());
ASSERT(nodeIDs.empty());
// key range [m.param1, m.param2)
std::map<Key, UID>::iterator itlow, itup; // we will return [itlow, itup)
itlow = pRangeToApplier->lower_bound(m.param1); // lower_bound returns the first element whose key is >= m.param1
if (itlow == pRangeToApplier->end()) {
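// All applier boundary keys are below m.param1, so the mutation falls entirely within the last applier's range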
--itlow;
mvector.push_back_deep(mvector_arena, m);
nodeIDs.push_back(nodeIDs_arena, itlow->second);
return;
}
if (itlow->first > m.param1) {
if (itlow != pRangeToApplier->begin()) {
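// Step back so the applier whose range contains m.param1 is included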
--itlow;
}
}
itup = pRangeToApplier->upper_bound(m.param2); // returns pRangeToApplier->end() if no key is greater than m.param2.
ASSERT(itup == pRangeToApplier->end() || itup->first > m.param2);
std::map<Key, UID>::iterator itApplier;
while (itlow != itup) {
Standalone<MutationRef> curm; // current mutation
curm.type = m.type;
// The first split mutation starts at m.param1;
// the later ones start at the corresponding rangeToApplier boundary.
if (m.param1 > itlow->first) {
curm.param1 = m.param1;
} else {
curm.param1 = itlow->first;
}
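// Remember the applier that owns the current split range before advancing itlow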
itApplier = itlow;
itlow++;
if (itlow == itup) {
ASSERT(m.param2 <= normalKeys.end);
curm.param2 = m.param2;
} else if (m.param2 < itlow->first) {
UNREACHABLE();
curm.param2 = m.param2;
} else {
curm.param2 = itlow->first;
}
ASSERT(curm.param1 <= curm.param2);
// itup->first > m.param2: (itup-1)'s range may lie entirely outside mutation m's range.
// Only add split mutations that overlap with mutation m
if (m.param1 < curm.param2 && m.param2 > curm.param1) {
mvector.push_back_deep(mvector_arena, curm);
nodeIDs.push_back(nodeIDs_arena, itApplier->second);
}
}
}
// Test splitMutation
TEST_CASE("/FastRestore/RestoreLoader/splitMutation") {
std::map<Key, UID> rangeToApplier;
MutationsVec mvector;
Standalone<VectorRef<UID>> nodeIDs;
// Prepare RangeToApplier
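// The first boundary starts at normalKeys.begin so every key maps to some applier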
rangeToApplier.emplace(normalKeys.begin, deterministicRandom()->randomUniqueID());
int numAppliers = deterministicRandom()->randomInt(1, 50);
for (int i = 0; i < numAppliers; ++i) {
Key k = Key(deterministicRandom()->randomAlphaNumeric(deterministicRandom()->randomInt(1, 1000)));
UID node = deterministicRandom()->randomUniqueID();
rangeToApplier.emplace(k, node);
TraceEvent("RangeToApplier").detail("Key", k).detail("Node", node);
}
Key k1 = Key(deterministicRandom()->randomAlphaNumeric(deterministicRandom()->randomInt(1, 500)));
Key k2 = Key(deterministicRandom()->randomAlphaNumeric(deterministicRandom()->randomInt(1, 1000)));
Key beginK = k1 < k2 ? k1 : k2;
Key endK = k1 < k2 ? k2 : k1;
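// Order the two random keys so the ClearRange mutation has param1 <= param2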
Standalone<MutationRef> mutation(MutationRef(MutationRef::ClearRange, beginK.contents(), endK.contents()));
// Method 1: Use old splitMutation
oldSplitMutation(
&rangeToApplier, mutation, mvector.arena(), mvector.contents(), nodeIDs.arena(), nodeIDs.contents());
ASSERT(mvector.size() == nodeIDs.size());
// Method 2: Use new intersection based method
KeyRangeMap<UID> krMap;
buildApplierRangeMap(&krMap, &rangeToApplier);
MutationsVec mvector2;
Standalone<VectorRef<UID>> nodeIDs2;
splitMutation(krMap, mutation, mvector2.arena(), mvector2.contents(), nodeIDs2.arena(), nodeIDs2.contents());
ASSERT(mvector2.size() == nodeIDs2.size());
ASSERT(mvector.size() == mvector2.size());
int splitMutationIndex = 0;
for (; splitMutationIndex < mvector.size(); splitMutationIndex++) {
MutationRef result = mvector[splitMutationIndex];
MutationRef result2 = mvector2[splitMutationIndex];
UID applierID = nodeIDs[splitMutationIndex];
UID applierID2 = nodeIDs2[splitMutationIndex];
KeyRange krange(KeyRangeRef(result.param1, result.param2));
KeyRange krange2(KeyRangeRef(result2.param1, result2.param2));
TraceEvent("Result")
.detail("KeyRange1", krange)
.detail("KeyRange2", krange2)
.detail("ApplierID1", applierID)
.detail("ApplierID2", applierID2);
if (krange != krange2 || applierID != applierID2) {
TraceEvent(SevError, "IncorrectResult")
.detail("Mutation", mutation)
.detail("KeyRange1", krange)
.detail("KeyRange2", krange2)
.detail("ApplierID1", applierID)
.detail("ApplierID2", applierID2);
}
}
return Void();
}
} // namespace