/* * FileBackupAgent.actor.cpp * * This source file is part of the FoundationDB open source project * * Copyright 2013-2024 Apple Inc. and the FoundationDB project authors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #include "fdbclient/BlobGranuleCommon.h" #include "fdbclient/CommitProxyInterface.h" #include "fdbclient/DatabaseConfiguration.h" #include "fdbclient/TenantEntryCache.actor.h" #include "fdbclient/TenantManagement.actor.h" #include "fdbclient/BlobRestoreCommon.h" #include "fdbrpc/TenantInfo.h" #include "fdbrpc/simulator.h" #include "flow/EncryptUtils.h" #include "flow/FastRef.h" #include "flow/flow.h" #include "fmt/format.h" #include "fdbclient/BackupAgent.actor.h" #include "fdbclient/BackupContainer.h" #include "fdbclient/BlobCipher.h" #include "fdbclient/ClientBooleanParams.h" #include "fdbclient/DatabaseContext.h" #include "fdbclient/FDBTypes.h" #include "fdbclient/GetEncryptCipherKeys.h" #include "fdbclient/JsonBuilder.h" #include "fdbclient/KeyBackedTypes.actor.h" #include "fdbclient/KeyRangeMap.h" #include "fdbclient/Knobs.h" #include "fdbclient/ManagementAPI.actor.h" #include "fdbclient/RestoreInterface.h" #include "fdbclient/Status.h" #include "fdbclient/SystemData.h" #include "fdbclient/TaskBucket.h" #include "fdbclient/Tenant.h" #include "flow/network.h" #include "flow/Trace.h" #include #include #include #include "flow/IAsyncFile.h" #include "flow/genericactors.actor.h" #include "flow/Hash3.h" #include "flow/xxhash.h" #include #include #include #include 
#include #include #include #include "flow/actorcompiler.h" // This must be the last #include.

Optional fileBackupAgentProxy = Optional();

#define SevFRTestInfo SevVerbose
// #define SevFRTestInfo SevInfo

// Renders a boolean as "Yes" or "No".
static std::string boolToYesOrNo(bool val) {
    return val ? std::string("Yes") : std::string("No");
}

// Returns the version as a decimal string, or "N/A" when no version is present.
static std::string versionToString(Optional version) {
    if (version.present())
        return std::to_string(version.get());
    else
        return "N/A";
}

// Formats the value through BackupAgentBase::formatTime(), or returns "N/A" when it is absent.
static std::string timeStampToString(Optional epochs) {
    if (!epochs.present())
        return "N/A";
    return BackupAgentBase::formatTime(epochs.get());
}

// Looks the version up via timeKeeperEpochsFromVersion(). An absent version short-circuits to an empty Optional
// without touching the transaction.
static Future> getTimestampFromVersion(Optional ver, Reference tr) {
    if (!ver.present())
        return Optional();
    return timeKeeperEpochsFromVersion(ver.get(), tr);
}

// Time format :
// <= 59 seconds
// <= 59.99 minutes
// <= 23.99 hours
// N.NN days
//
// The unit is chosen by the first threshold the value reaches: 86400 (days), 3600 (hours), 60 (minutes);
// anything smaller is printed as whole seconds.
std::string secondsToTimeFormat(int64_t seconds) {
    if (seconds >= 86400)
        return format("%.2f day(s)", seconds / 86400.0);
    else if (seconds >= 3600)
        return format("%.2f hour(s)", seconds / 3600.0);
    else if (seconds >= 60)
        return format("%.2f minute(s)", seconds / 60.0);
    else
        return format("%lld second(s)", seconds);
}

const Key FileBackupAgent::keyLastRestorable = "last_restorable"_sr;

// For convenience
typedef FileBackupAgent::ERestoreState ERestoreState;

// Maps a restore state to its lowercase text name. Any value not listed below yields "Unknown".
StringRef FileBackupAgent::restoreStateText(ERestoreState id) {
    switch (id) {
    case ERestoreState::UNINITIALIZED:
        return "uninitialized"_sr;
    case ERestoreState::QUEUED:
        return "queued"_sr;
    case ERestoreState::STARTING:
        return "starting"_sr;
    case ERestoreState::RUNNING:
        return "running"_sr;
    case ERestoreState::COMPLETED:
        return "completed"_sr;
    case ERestoreState::ABORTED:
        return "aborted"_sr;
    default:
        return "Unknown"_sr;
    }
}

// Constructs a temporary FileBackupAgent and returns the pause key of its task bucket.
Key FileBackupAgent::getPauseKey() {
    FileBackupAgent backupAgent;
    return backupAgent.taskBucket->getPauseKey();
}

// Reads the tag map (range limit 1e6) and returns one KeyBackedTag per entry, built from the entry's key and the
// map's prefix.
ACTOR Future> TagUidMap::getAll_impl(TagUidMap* tagsMap, Reference tr, Snapshot snapshot) {
    state Key prefix = tagsMap->prefix; // Copying it here as tagsMap lifetime is not tied to this actor
    TagMap::RangeResultType tagPairs = wait(tagsMap->getRange(tr, std::string(), {}, 1e6, snapshot));
    std::vector results;
    for (auto& p : tagPairs.results)
        results.push_back(KeyBackedTag(p.first, prefix));
    return results;
}

// Binds the tag to the property that TagUidMap(tagMapPrefix) exposes for tagName, and remembers both arguments.
KeyBackedTag::KeyBackedTag(std::string tagName, StringRef tagMapPrefix)
  : KeyBackedProperty(TagUidMap(tagMapPrefix).getProperty(tagName)), tagName(tagName), tagMapPrefix(tagMapPrefix) {}

// Key-backed configuration of a restore, rooted at fileRestorePrefixRange.begin and constructed from either a UID
// or a task.
//
// Most accessors below return configSpace.pack(__FUNCTION__sr): the accessor's own name is what gets packed into
// the key, so renaming an accessor changes the key it addresses.
class RestoreConfig : public KeyBackedTaskConfig {
public:
    RestoreConfig(UID uid = UID()) : KeyBackedTaskConfig(fileRestorePrefixRange.begin, uid) {}
    RestoreConfig(Reference task) : KeyBackedTaskConfig(fileRestorePrefixRange.begin, task) {}

    KeyBackedProperty stateEnum() { return configSpace.pack(__FUNCTION__sr); }
    // stateEnum() converted to text through FileBackupAgent::restoreStateText().
    Future stateText(Reference tr) {
        return map(stateEnum().getD(tr),
                   [](ERestoreState s) -> StringRef { return FileBackupAgent::restoreStateText(s); });
    }
    KeyBackedProperty addPrefix() { return configSpace.pack(__FUNCTION__sr); }
    KeyBackedProperty removePrefix() { return configSpace.pack(__FUNCTION__sr); }
    KeyBackedProperty onlyApplyMutationLogs() { return configSpace.pack(__FUNCTION__sr); }
    KeyBackedProperty inconsistentSnapshotOnly() { return configSpace.pack(__FUNCTION__sr); }
    KeyBackedProperty unlockDBAfterRestore() { return configSpace.pack(__FUNCTION__sr); }
    // XXX: Remove restoreRange() once it is safe to remove. It has been changed to restoreRanges
    KeyBackedProperty restoreRange() { return configSpace.pack(__FUNCTION__sr); }
    // XXX: Changed to restoreRangeSet. It can be removed.
    KeyBackedProperty> restoreRanges() { return configSpace.pack(__FUNCTION__sr); }
    KeyBackedSet restoreRangeSet() { return configSpace.pack(__FUNCTION__sr); }
    KeyBackedProperty batchFuture() { return configSpace.pack(__FUNCTION__sr); }
    KeyBackedProperty beginVersion() { return configSpace.pack(__FUNCTION__sr); }
    KeyBackedProperty restoreVersion() { return configSpace.pack(__FUNCTION__sr); }
    KeyBackedProperty firstConsistentVersion() { return configSpace.pack(__FUNCTION__sr); }

    KeyBackedProperty> sourceContainer() { return configSpace.pack(__FUNCTION__sr); }
    // Get the source container as a bare URL, without creating a container instance
    // (packs the literal "sourceContainer", i.e. the same name sourceContainer() packs via __FUNCTION__sr)
    KeyBackedProperty sourceContainerURL() { return configSpace.pack("sourceContainer"_sr); }

    // Total bytes written by all log and range restore tasks.
    KeyBackedBinaryValue bytesWritten() { return configSpace.pack(__FUNCTION__sr); }
    // File blocks that have had tasks created for them by the Dispatch task
    KeyBackedBinaryValue filesBlocksDispatched() { return configSpace.pack(__FUNCTION__sr); }
    // File blocks whose tasks have finished
    KeyBackedBinaryValue fileBlocksFinished() { return configSpace.pack(__FUNCTION__sr); }
    // Total number of files in the fileMap
    KeyBackedBinaryValue fileCount() { return configSpace.pack(__FUNCTION__sr); }
    // Total number of file blocks in the fileMap
    KeyBackedBinaryValue fileBlockCount() { return configSpace.pack(__FUNCTION__sr); }
    // True for blob granule restore
    KeyBackedBinaryValue isBlobGranuleRestore() { return configSpace.pack(__FUNCTION__sr); }

    Future> getRestoreRangesOrDefault(Reference tr) { return getRestoreRangesOrDefault_impl(this, tr); }

    // Collects the restore ranges, preferring the newest storage field:
    //   1. restoreRangeSet(), read in batches of batchSize (1 under BUGGIFY);
    //   2. restoreRanges(), if the set was empty;
    //   3. the single restoreRange(), if that was empty too.
    ACTOR static Future> getRestoreRangesOrDefault_impl(RestoreConfig* self, Reference tr) {
        state std::vector ranges;
        state int batchSize = BUGGIFY ? 1 : CLIENT_KNOBS->RESTORE_RANGES_READ_BATCH;
        state Optional begin;
        state Arena arena;
        loop {
            KeyBackedSet::RangeResultType rangeResult =
                wait(self->restoreRangeSet().getRange(tr, begin, {}, batchSize));
            ranges.insert(ranges.end(), rangeResult.results.begin(), rangeResult.results.end());
            if (!rangeResult.more) {
                break;
            }
            ASSERT(!rangeResult.results.empty());
            // Next batch starts from the last range read, with its end advanced by keyAfter().
            begin = KeyRangeRef(KeyRef(arena, ranges.back().begin), keyAfter(ranges.back().end, arena));
        }

        // fall back to original fields if the new field is empty
        if (ranges.empty()) {
            std::vector _ranges = wait(self->restoreRanges().getD(tr));
            ranges = _ranges;
            if (ranges.empty()) {
                KeyRange range = wait(self->restoreRange().getD(tr));
                ranges.push_back(range);
            }
        }
        return ranges;
    }

    // Describes a file to load blocks from during restore. Ordered by version and then fileName to enable
    // incrementally advancing through the map, saving the version and path of the next starting point.
    struct RestoreFile {
        Version version;
        std::string fileName;
        bool isRange{ false }; // false for log file
        int64_t blockSize{ 0 };
        int64_t fileSize{ 0 };
        Version endVersion{ ::invalidVersion }; // not meaningful for range files

        // Tuple layout: version, fileName, isRange, fileSize, blockSize, endVersion.
        // Note that fileSize precedes blockSize here, unlike the member declaration order above.
        Tuple pack() const {
            return Tuple::makeTuple(version, fileName, (int)isRange, fileSize, blockSize, endVersion);
        }
        // Inverse of pack(); reads the fields in the same tuple order.
        static RestoreFile unpack(Tuple const& t) {
            RestoreFile r;
            int i = 0;
            r.version = t.getInt(i++);
            r.fileName = t.getString(i++).toString();
            r.isRange = t.getInt(i++) != 0;
            r.fileSize = t.getInt(i++);
            r.blockSize = t.getInt(i++);
            r.endVersion = t.getInt(i++);
            return r;
        }
    };

    typedef KeyBackedSet FileSetT;
    FileSetT fileSet() { return configSpace.pack(__FUNCTION__sr); }

    // True unless the restore state is ABORTED, COMPLETED or UNINITIALIZED.
    Future isRunnable(Reference tr) {
        return map(stateEnum().getD(tr), [](ERestoreState s) -> bool {
            return s != ERestoreState::ABORTED && s != ERestoreState::COMPLETED &&
                   s != ERestoreState::UNINITIALIZED;
        });
    }

    // Traces the error and hands it to updateErrorInfo(). When this config has no valid uid, only a SevError
    // "FileRestoreErrorNoUID" trace is emitted and nothing is stored.
    Future logError(Database cx, Error e, std::string const& details, void* taskInstance = nullptr) {
        if (!uid.isValid()) {
            TraceEvent(SevError, "FileRestoreErrorNoUID").error(e).detail("Description", details);
            return Void();
        }
        TraceEvent t(SevWarn, "FileRestoreError");
        t.error(e)
            .detail("RestoreUID", uid)
            .detail("Description", details)
            .detail("TaskInstance", (uint64_t)taskInstance);
        // key_not_found could happen
        if (e.code() == error_code_key_not_found)
            t.backtrace();

        return updateErrorInfo(cx, e, details);
    }

    Key mutationLogPrefix() { return uidPrefixKey(applyLogKeys.begin, uid); }

    Key applyMutationsMapPrefix() { return uidPrefixKey(applyMutationsKeyVersionMapRange.begin, uid); }

    // Snapshot-reads the apply begin and end version keys for uid and returns (end - begin).
    // Returns 0 if either key is missing.
    ACTOR static Future getApplyVersionLag_impl(Reference tr, UID uid) {
        state Future> beginVal = tr->get(uidPrefixKey(applyMutationsBeginRange.begin, uid), Snapshot::True);
        state Future> endVal = tr->get(uidPrefixKey(applyMutationsEndRange.begin, uid), Snapshot::True);
        wait(success(beginVal) && success(endVal));

        if (!beginVal.get().present() || !endVal.get().present())
            return 0;

        Version beginVersion = BinaryReader::fromStringRef(beginVal.get().get(), Unversioned());
        Version endVersion = BinaryReader::fromStringRef(endVal.get().get(), Unversioned());
        return endVersion - beginVersion;
    }

    Future getApplyVersionLag(Reference tr) { return getApplyVersionLag_impl(tr, uid); }

    // Stores the prefixes in this config, clears any previous apply-mutations state for uid, then writes the
    // initial add/remove prefix, a zero key-version count and an invalidVersion entry at the map's start key.
    // NOTE(review): onlyApplyMutationLogs is not referenced in this body.
    void initApplyMutations(Reference tr,
                            Key addPrefix,
                            Key removePrefix,
                            OnlyApplyMutationLogs onlyApplyMutationLogs) {
        // Set these because they have to match the applyMutations values.
        this->addPrefix().set(tr, addPrefix);
        this->removePrefix().set(tr, removePrefix);

        clearApplyMutationsKeys(tr);

        // Initialize add/remove prefix, range version map count and set the map's start key to InvalidVersion
        tr->set(uidPrefixKey(applyMutationsAddPrefixRange.begin, uid), addPrefix);
        tr->set(uidPrefixKey(applyMutationsRemovePrefixRange.begin, uid), removePrefix);
        int64_t startCount = 0;
        tr->set(uidPrefixKey(applyMutationsKeyVersionCountRange.begin, uid), StringRef((uint8_t*)&startCount, 8));
        Key mapStart = uidPrefixKey(applyMutationsKeyVersionMapRange.begin, uid);
        tr->set(mapStart, BinaryWriter::toValue(invalidVersion, Unversioned()));
    }

    // Removes every apply-mutations key belonging to uid. Also sets COMMIT_ON_FIRST_PROXY on the transaction.
    void clearApplyMutationsKeys(Reference tr) {
        tr->setOption(FDBTransactionOptions::COMMIT_ON_FIRST_PROXY);

        // Clear add/remove prefix keys
        tr->clear(uidPrefixKey(applyMutationsAddPrefixRange.begin, uid));
        tr->clear(uidPrefixKey(applyMutationsRemovePrefixRange.begin, uid));

        // Clear range version map and count key
        tr->clear(uidPrefixKey(applyMutationsKeyVersionCountRange.begin, uid));
        Key mapStart = uidPrefixKey(applyMutationsKeyVersionMapRange.begin, uid);
        tr->clear(KeyRangeRef(mapStart, strinc(mapStart)));

        // Clear any loaded mutations that have not yet been applied
        Key mutationPrefix = mutationLogPrefix();
        tr->clear(KeyRangeRef(mutationPrefix, strinc(mutationPrefix)));

        // Clear end and begin versions (intentionally in this order)
        tr->clear(uidPrefixKey(applyMutationsEndRange.begin, uid));
        tr->clear(uidPrefixKey(applyMutationsBeginRange.begin, uid));
    }

    void setApplyBeginVersion(Reference tr, Version ver) {
        tr->set(uidPrefixKey(applyMutationsBeginRange.begin, uid), BinaryWriter::toValue(ver, Unversioned()));
    }

    // Stored apply begin version, or 0 when the key is absent.
    Future getApplyBeginVersion(Reference tr) {
        return map(tr->get(uidPrefixKey(applyMutationsBeginRange.begin, uid)),
                   [=](Optional const& value) -> Version {
                       return value.present() ? BinaryReader::fromStringRef(value.get(), Unversioned()) : 0;
                   });
    }

    void setApplyEndVersion(Reference tr, Version ver) {
        tr->set(uidPrefixKey(applyMutationsEndRange.begin, uid), BinaryWriter::toValue(ver, Unversioned()));
    }

    // Stored apply end version, or 0 when the key is absent.
    Future getApplyEndVersion(Reference tr) {
        return map(tr->get(uidPrefixKey(applyMutationsEndRange.begin, uid)),
                   [=](Optional const& value) -> Version {
                       return value.present() ? BinaryReader::fromStringRef(value.get(), Unversioned()) : 0;
                   });
    }

    // RUNNING   -> the apply begin version
    // COMPLETED -> restoreVersion()
    // otherwise -> -1
    ACTOR static Future getCurrentVersion_impl(RestoreConfig* self, Reference tr) {
        state ERestoreState status = wait(self->stateEnum().getD(tr));
        state Version version = -1;
        if (status == ERestoreState::RUNNING) {
            wait(store(version, self->getApplyBeginVersion(tr)));
        } else if (status == ERestoreState::COMPLETED) {
            wait(store(version, self->restoreVersion().getD(tr)));
        }
        return version;
    }

    Future getCurrentVersion(Reference tr) { return getCurrentVersion_impl(this, tr); }

    // Both actors below take the RestoreConfig by value (callers pass *this).
    ACTOR static Future getProgress_impl(RestoreConfig restore, Reference tr);
    Future getProgress(Reference tr) { return getProgress_impl(*this, tr); }

    ACTOR static Future getFullStatus_impl(RestoreConfig restore, Reference tr);
    Future getFullStatus(Reference tr) { return getFullStatus_impl(*this, tr); }
};

typedef RestoreConfig::RestoreFile RestoreFile;

// Builds a one-line progress summary for the restore and emits a "FileRestoreProgress" trace event with the same
// figures. Sets ACCESS_SYSTEM_KEYS and LOCK_AWARE on tr. All reads are issued before the single wait below.
ACTOR Future RestoreConfig::getProgress_impl(RestoreConfig restore, Reference tr) {
    tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
    tr->setOption(FDBTransactionOptions::LOCK_AWARE);

    state Future fileCount = restore.fileCount().getD(tr);
    state Future fileBlockCount = restore.fileBlockCount().getD(tr);
    state Future fileBlocksDispatched = restore.filesBlocksDispatched().getD(tr);
    state Future fileBlocksFinished = restore.fileBlocksFinished().getD(tr);
    state Future bytesWritten = restore.bytesWritten().getD(tr);
    state Future status = restore.stateText(tr);
    state Future currentVersion = restore.getCurrentVersion(tr);
    state Future lag = restore.getApplyVersionLag(tr);
    state Future firstConsistentVersion = restore.firstConsistentVersion().getD(tr);
    state Future tag = restore.tag().getD(tr);
    state Future> lastError = restore.lastError().getD(tr);

    // restore might no longer be valid after the first wait so make sure it is not needed anymore.
    state UID uid = restore.getUid();
    wait(success(fileCount) && success(fileBlockCount) && success(fileBlocksDispatched) &&
         success(fileBlocksFinished) && success(bytesWritten) && success(status) && success(currentVersion) &&
         success(lag) && success(firstConsistentVersion) && success(tag) && success(lastError));

    // A zero second element of lastError is treated as "no error recorded"; otherwise the age is derived from
    // the difference to the read version, divided by CORE_VERSIONSPERSECOND.
    std::string errstr = "None";
    if (lastError.get().second != 0)
        errstr = format("'%s' %" PRId64 "s ago.\n",
                        lastError.get().first.c_str(),
                        (tr->getReadVersion().get() - lastError.get().second) /
                            CLIENT_KNOBS->CORE_VERSIONSPERSECOND);

    TraceEvent("FileRestoreProgress")
        .detail("RestoreUID", uid)
        .detail("Tag", tag.get())
        .detail("State", status.get().toString())
        .detail("FileCount", fileCount.get())
        .detail("FileBlocksFinished", fileBlocksFinished.get())
        .detail("FileBlocksTotal", fileBlockCount.get())
        .detail("FileBlocksInProgress", fileBlocksDispatched.get() - fileBlocksFinished.get())
        .detail("BytesWritten", bytesWritten.get())
        .detail("CurrentVersion", currentVersion.get())
        .detail("FirstConsistentVersion", firstConsistentVersion.get())
        .detail("ApplyLag", lag.get())
        .detail("TaskInstance", THIS_ADDR);

    return format("Tag: %s UID: %s State: %s Blocks: %lld/%lld BlocksInProgress: %lld Files: %lld BytesWritten: "
                  "%lld CurrentVersion: %lld FirstConsistentVersion: %lld ApplyVersionLag: %lld LastError: %s",
                  tag.get().c_str(),
                  uid.toString().c_str(),
                  status.get().toString().c_str(),
                  fileBlocksFinished.get(),
                  fileBlockCount.get(),
                  fileBlocksDispatched.get() - fileBlocksFinished.get(),
                  fileCount.get(),
                  bytesWritten.get(),
                  currentVersion.get(),
                  firstConsistentVersion.get(),
                  lag.get(),
                  errstr.c_str());
}

// Extends the getProgress() summary with the source URL, every restore range, the add/remove prefixes and the
// restore version. Sets ACCESS_SYSTEM_KEYS and LOCK_AWARE on tr.
ACTOR Future RestoreConfig::getFullStatus_impl(RestoreConfig restore, Reference tr) {
    tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
    tr->setOption(FDBTransactionOptions::LOCK_AWARE);

    state Future> ranges = restore.getRestoreRangesOrDefault(tr);
    state Future addPrefix = restore.addPrefix().getD(tr);
    state Future removePrefix = restore.removePrefix().getD(tr);
    state Future url = restore.sourceContainerURL().getD(tr);
    state Future restoreVersion = restore.restoreVersion().getD(tr);
    state Future progress = restore.getProgress(tr);

    // restore might no longer be valid after the first wait so make sure it is not needed anymore.
    wait(success(ranges) && success(addPrefix) && success(removePrefix) && success(url) &&
         success(restoreVersion) && success(progress));

    std::string returnStr;
    returnStr = format("%s URL: %s", progress.get().c_str(), url.get().toString().c_str());
    for (auto& range : ranges.get()) {
        returnStr += format(" Range: '%s'-'%s'", printable(range.begin).c_str(), printable(range.end).c_str());
    }
    returnStr += format(" AddPrefix: '%s' RemovePrefix: '%s' Version: %lld",
                        printable(addPrefix.get()).c_str(),
                        printable(removePrefix.get()).c_str(),
                        restoreVersion.get());
    return returnStr;
}

// All subspaces are derived from fileBackupPrefixRange.begin. Both buckets are created with
// AccessSystemKeys::True and LockAware::True; the task bucket additionally with PriorityBatch::False.
FileBackupAgent::FileBackupAgent()
  : subspace(Subspace(fileBackupPrefixRange.begin))
    // The other subspaces have logUID -> value
    ,
    config(subspace.get(BackupAgentBase::keyConfig)),
    lastRestorable(subspace.get(FileBackupAgent::keyLastRestorable)),
    taskBucket(new TaskBucket(subspace.get(BackupAgentBase::keyTasks),
                              AccessSystemKeys::True,
                              PriorityBatch::False,
                              LockAware::True)),
    futureBucket(new FutureBucket(subspace.get(BackupAgentBase::keyFutures),
                                  AccessSystemKeys::True,
                                  LockAware::True)) {
}

namespace fileBackup {

// Return a block of contiguous padding bytes, growing if needed.
Value makePadding(int size) {
    // One shared buffer for all callers; it is only ever replaced by a larger one.
    static Value pad;
    if (pad.size() < size) {
        pad = makeString(size);
        // Every padding byte is 0xff.
        memset(mutateString(pad), '\xff', pad.size());
    }

    // Hand back just the first `size` bytes of the shared buffer.
    return pad.substr(0, size);
}

// Common interface of the range file writers in this file (EncryptedRangeFileWriter and RangeFileWriter both
// derive from it).
struct IRangeFileWriter {
public:
    virtual Future padEnd(bool final) = 0;
    virtual Future writeKV(Key k, Value v) = 0;
    virtual Future writeKey(Key k) = 0;
    virtual Future finish() = 0;
    virtual ~IRangeFileWriter() {}
};

// Cipher keys and IV that EncryptedRangeFileWriter uses to encrypt a block.
struct SnapshotFileBackupEncryptionKeys {
    Reference textCipherKey;
    Optional> headerCipherKey;
    StringRef ivRef;
};

// File Format handlers.
// Both Range and Log formats are designed to be readable starting at any BACKUP_RANGEFILE_BLOCK_SIZE boundary
// so they can be read in parallel.
//
// Writer instances must be kept alive while any member actors are in progress.
//
// EncryptedRangeFileWriter must be used as follows:
//   1 - writeKey(key) the queried key range begin
//   2 - writeKV(k, v) each kv pair to restore
//   3 - writeKey(key) the queried key range end
//   4 - finish()
//
// EncryptedRangeFileWriter will insert the required padding, header, and extra
// end/begin keys around the 1MB boundaries as needed.
//
// Example:
//   The range a-z is queried and returns c-j which covers 3 blocks across 2 tenants.
//   The client code writes keys in this sequence:
//             t1a t1c t1d t1e t1f t1g t2h t2i t2j t2z
//
//   H = header   P = padding   a...z = keys   v = value   | = block boundary
//
//   Encoded file:  H t1a t1cv t1dv t1ev P | H t1e t1ev t1fv t1gv t2 P | H t2 t2hv t2iv t2jv t2z
//   Decoded in blocks yields:
//           Block 1: range [t1a, t1e) with kv pairs t1cv, t1dv
//           Block 2: range [t1e, t2) with kv pairs t1ev, t1fv, t1gv
//           Block 3: range [t2, t2z) with kv pairs t2hv, t2iv, t2jv
//
//   NOTE: All blocks except for the final block will have one last
//   value which will not be used.  This isn't actually a waste since
//   if the next KV pair wouldn't fit within the block after the value
//   then the space after the final key to the next 1MB boundary would
//   just be padding anyway.
//
// NOTE: For the EncryptedRangeFileWriter blocks will be split either on the BACKUP_RANGEFILE_BLOCK_SIZE boundary or
// when a new tenant id is encountered. If a block is split for crossing tenant boundaries then the last key will be
// truncated to just the tenant prefix and the value will be empty (to avoid having sensitive data of one tenant be
// encrypted with a key for a different tenant)
struct EncryptedRangeFileWriter : public IRangeFileWriter {
    // Allocates a buffer of blockSize bytes and points wPtr at its start.
    // encryptHeader and dataPayloadStart are not set here; newBlock() assigns them when a block is opened.
    EncryptedRangeFileWriter(Database cx,
                             Arena* arena,
                             EncryptionAtRestMode encryptMode,
                             Optional>> tenantCache,
                             Reference file = Reference(),
                             int blockSize = 0)
      : cx(cx), arena(arena), file(file), encryptMode(encryptMode), tenantCache(tenantCache), blockSize(blockSize),
        blockEnd(0), fileVersion(BACKUP_AGENT_ENCRYPTED_SNAPSHOT_FILE_VERSION) {
        buffer = makeString(blockSize);
        wPtr = mutateString(buffer);
    }

    // Fetches the cipher keys referenced by the header, validates them against the header's cipher details, and
    // decrypts dataLen bytes starting at dataP (passing *arena to the decryptor).
    ACTOR static Future decryptImpl(Database cx,
                                    BlobCipherEncryptHeaderRef header,
                                    const uint8_t* dataP,
                                    int64_t dataLen,
                                    Arena* arena) {
        Reference const> dbInfo = cx->clientInfo;
        state BlobCipherEncryptHeaderRef headerRef = header;
        TextAndHeaderCipherKeys cipherKeys = wait(
            GetEncryptCipherKeys::getEncryptCipherKeys(dbInfo, headerRef, BlobCipherMetrics::RESTORE));
        EncryptHeaderCipherDetails cipherDetails = headerRef.getCipherDetails();
        cipherDetails.textCipherDetails.validateCipherDetailsWithCipherKey(cipherKeys.cipherTextKey);
        if (cipherDetails.headerCipherDetails.present()) {
            cipherDetails.headerCipherDetails.get().validateCipherDetailsWithCipherKey(cipherKeys.cipherHeaderKey);
        }
        DecryptBlobCipherAes256Ctr decryptor(
            cipherKeys.cipherTextKey, cipherKeys.cipherHeaderKey, headerRef.getIV(), BlobCipherMetrics::RESTORE);
        return decryptor.decrypt(dataP, dataLen, headerRef, *arena);
    }

    // Non-actor entry point; forwards to decryptImpl().
    static Future decrypt(Database cx,
                          BlobCipherEncryptHeaderRef header,
                          const uint8_t* dataP,
                          int64_t dataLen,
                          Arena* arena) {
        return decryptImpl(cx, header, dataP, dataLen, arena);
    }

    // Fetches the latest cipher keys for domainId and returns the text cipher key of the result.
    ACTOR static Future> refreshKey(EncryptedRangeFileWriter* self, EncryptCipherDomainId domainId) {
        Reference const> dbInfo = self->cx->clientInfo;
        TextAndHeaderCipherKeys cipherKeys = wait(GetEncryptCipherKeys::getLatestEncryptCipherKeysForDomain(
            dbInfo, domainId, BlobCipherMetrics::BACKUP));
        return cipherKeys.cipherTextKey;
    }

    // Encrypts the payload of the current block, i.e. the bytes between dataPayloadStart and wPtr, and copies the
    // ciphertext back over the same bytes. The serialized encryption header is copied into the slot that
    // newBlock() reserved (encryptHeader); its size must match that slot exactly.
    ACTOR static Future encrypt(EncryptedRangeFileWriter* self) {
        // TODO: HeaderCipher key not needed for 'no authentication encryption'
        ASSERT(self->cipherKeys.headerCipherKey.present() && self->cipherKeys.headerCipherKey.get().isValid() &&
               self->cipherKeys.textCipherKey.isValid());
        // Ensure that the keys we got are still valid before flushing the block
        if (self->cipherKeys.headerCipherKey.get()->isExpired() ||
            self->cipherKeys.headerCipherKey.get()->needsRefresh()) {
            Reference cipherKey = wait(refreshKey(self, self->cipherKeys.headerCipherKey.get()->getDomainId()));
            self->cipherKeys.headerCipherKey = cipherKey;
        }
        if (self->cipherKeys.textCipherKey->isExpired() || self->cipherKeys.textCipherKey->needsRefresh()) {
            Reference cipherKey = wait(refreshKey(self, self->cipherKeys.textCipherKey->getDomainId()));
            self->cipherKeys.textCipherKey = cipherKey;
        }
        EncryptBlobCipherAes265Ctr encryptor(
            self->cipherKeys.textCipherKey,
            self->cipherKeys.headerCipherKey,
            self->cipherKeys.ivRef.begin(),
            AES_256_IV_LENGTH,
            getEncryptAuthTokenMode(EncryptAuthTokenMode::ENCRYPT_HEADER_AUTH_TOKEN_MODE_SINGLE),
            BlobCipherMetrics::BACKUP);
        int64_t payloadSize = self->wPtr - self->dataPayloadStart;
        StringRef encryptedData;
        BlobCipherEncryptHeaderRef headerRef;
        encryptedData = encryptor.encrypt(self->dataPayloadStart, payloadSize, &headerRef, *self->arena);
        Standalone serialized = BlobCipherEncryptHeaderRef::toStringRef(headerRef);
        self->arena->dependsOn(serialized.arena());
        ASSERT(serialized.size() == self->encryptHeader.size());
        std::memcpy(mutateString(self->encryptHeader), serialized.begin(), self->encryptHeader.size());

        // re-write encrypted data to buffer
        std::memcpy(self->dataPayloadStart, encryptedData.begin(), payloadSize);
        return Void();
    }

    // Resolves the encryption domain of `key`, loads the latest text and header cipher keys for that domain into
    // self->cipherKeys, and generates a fresh random IV of AES_256_IV_LENGTH bytes in self->arena.
    ACTOR static Future updateEncryptionKeysCtx(EncryptedRangeFileWriter* self,
                                                KeyRef key,
                                                SnapshotBackupUseTenantCache checkTenantCache) {
        state EncryptCipherDomainId curDomainId =
            wait(getEncryptionDomainDetails(key, self->encryptMode, self->tenantCache, checkTenantCache));
        state Reference const> dbInfo = self->cx->clientInfo;

        // Get text and header cipher key
        TextAndHeaderCipherKeys textAndHeaderCipherKeys =
            wait(GetEncryptCipherKeys::getLatestEncryptCipherKeysForDomain(
                dbInfo, curDomainId, BlobCipherMetrics::BACKUP));
        self->cipherKeys.textCipherKey = textAndHeaderCipherKeys.cipherTextKey;
        self->cipherKeys.headerCipherKey = textAndHeaderCipherKeys.cipherHeaderKey;

        // Set ivRef
        self->cipherKeys.ivRef = makeString(AES_256_IV_LENGTH, *self->arena);
        deterministicRandom()->randomBytes(mutateString(self->cipherKeys.ivRef), AES_256_IV_LENGTH);
        return Void();
    }

    // Returns the number of bytes that have been written to the buffer
    static int64_t currentBufferSize(EncryptedRangeFileWriter* self) { return self->wPtr - self->buffer.begin(); }

    static int64_t expectedFileSize(EncryptedRangeFileWriter* self) {
        // Return what has already been written to file plus the size of the current buffer
        // which indicates how many bytes the file will contain once the buffer is written
        return self->file->size() + currentBufferSize(self);
    }

    // Appends `size` bytes from src at wPtr and advances wPtr. Asserts that the buffer has not grown beyond
    // blockSize (the check runs after the copy). A zero size is a no-op.
    static void copyToBuffer(EncryptedRangeFileWriter* self, const void* src, size_t size) {
        if (size > 0) {
            std::memcpy(self->wPtr, src, size);
            self->wPtr += size;
            ASSERT(currentBufferSize(self) <= self->blockSize);
        }
    }

    static void appendStringRefWithLenToBuffer(EncryptedRangeFileWriter* self, StringRef* s) {
        // Append the string length followed by the string to the buffer
        // (the length is written as a 32-bit value passed through bigEndian32())
        uint32_t lenBuf = bigEndian32((uint32_t)s->size());
        copyToBuffer(self, &lenBuf, sizeof(lenBuf));
        copyToBuffer(self, s->begin(), s->size());
    }

    // Determines the encryption domain for `key`:
    //   - system keys                                  -> SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID
    //   - key shorter than a tenant prefix, or
    //     CLUSTER_AWARE encryption mode                -> FDB_DEFAULT_ENCRYPT_DOMAIN_ID
    //   - checkTenantCache set, cache present, and the
    //     parsed tenant id is not found in the cache   -> FDB_DEFAULT_ENCRYPT_DOMAIN_ID
    //   - otherwise                                    -> the tenant id parsed from the key's prefix
    ACTOR static Future getEncryptionDomainDetails(
        KeyRef key,
        EncryptionAtRestMode encryptMode,
        Optional>> tenantCache,
        SnapshotBackupUseTenantCache checkTenantCache) {
        if (isSystemKey(key)) {
            return SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID;
        }
        if (key.size() < TenantAPI::PREFIX_SIZE || encryptMode.mode == EncryptionAtRestMode::CLUSTER_AWARE) {
            return FDB_DEFAULT_ENCRYPT_DOMAIN_ID;
        }
        // dealing with domain aware encryption so all keys should belong to a tenant
        KeyRef tenantPrefix = KeyRef(key.begin(), TenantAPI::PREFIX_SIZE);
        state int64_t tenantId = TenantAPI::prefixToId(tenantPrefix);
        // It's possible for the first and last key in a block (when writeKey is called) to not have a valid tenant
        // prefix, since they mark the start and end of a range, in that case we denote them as having a default encrypt
        // domain for the purpose of encrypting the block
        if (checkTenantCache && tenantCache.present()) {
            Optional> payload = wait(tenantCache.get()->getById(tenantId));
            if (!payload.present()) {
                return FDB_DEFAULT_ENCRYPT_DOMAIN_ID;
            }
        }
        return tenantId;
    }

    // Handles the first block and internal blocks. Ends current block if needed.
    // The final flag is used in simulation to pad the file's final block to a whole block size
    //
    // Buffer layout written for each new block: fileVersion, header size (uint32), reserved space for the
    // encryption header, then the payload. For every block but the first the payload starts with the `lastKey`
    // argument, followed by self->lastKey and, if writeValue is set, self->lastValue.
    // Throws backup_bad_block_size if bytesNeeded does not fit into the freshly opened block.
    ACTOR static Future newBlock(EncryptedRangeFileWriter* self,
                                 int bytesNeeded,
                                 KeyRef lastKey,
                                 bool writeValue,
                                 bool final = false) {
        // Write padding to finish current block if needed
        int bytesLeft = self->blockEnd - expectedFileSize(self);
        ASSERT(bytesLeft >= 0);
        if (bytesLeft > 0) {
            state Value paddingFFs = makePadding(bytesLeft);
            copyToBuffer(self, paddingFFs.begin(), bytesLeft);
        }

        if (expectedFileSize(self) > 0) {
            // write buffer to file since block is finished
            ASSERT(currentBufferSize(self) == self->blockSize);
            wait(encrypt(self));
            wait(self->file->append(self->buffer.begin(), self->blockSize));

            // reset write pointer to beginning of StringRef
            self->wPtr = mutateString(self->buffer);
        }

        if (final) {
            ASSERT(g_network->isSimulated());
            return Void();
        }

        // Set new blockEnd
        self->blockEnd += self->blockSize;

        // write Header
        copyToBuffer(self, (uint8_t*)&self->fileVersion, sizeof(self->fileVersion));

        // calculate encryption header size
        uint32_t headerSize = 0;
        EncryptAuthTokenMode authTokenMode =
            getEncryptAuthTokenMode(EncryptAuthTokenMode::ENCRYPT_HEADER_AUTH_TOKEN_MODE_SINGLE);
        EncryptAuthTokenAlgo authTokenAlgo = getAuthTokenAlgoFromMode(authTokenMode);
        headerSize = BlobCipherEncryptHeaderRef::getHeaderSize(
            CLIENT_KNOBS->ENCRYPT_HEADER_FLAGS_VERSION,
            getEncryptCurrentAlgoHeaderVersion(authTokenMode, authTokenAlgo),
            ENCRYPT_CIPHER_MODE_AES_256_CTR,
            authTokenMode,
            authTokenAlgo);
        ASSERT(headerSize > 0);
        // write header size to buffer
        copyToBuffer(self, (uint8_t*)&headerSize, sizeof(headerSize));
        // leave space for encryption header
        self->encryptHeader = StringRef(self->wPtr, headerSize);
        self->wPtr += headerSize;
        // Everything from here to wPtr is what encrypt() will encrypt.
        self->dataPayloadStart = self->wPtr;

        // If this is NOT the first block then write duplicate stuff needed from last block
        if (self->blockEnd > self->blockSize) {
            appendStringRefWithLenToBuffer(self, &lastKey);
            appendStringRefWithLenToBuffer(self, &self->lastKey);
            if (writeValue) {
                appendStringRefWithLenToBuffer(self, &self->lastValue);
            }
        }

        // There must now be room in the current block for bytesNeeded or the block size is too small
        if (expectedFileSize(self) + bytesNeeded > self->blockEnd) {
            throw backup_bad_block_size();
        }

        return Void();
    }

    // Pads and flushes the current block through newBlock(); does nothing if no bytes have been produced yet.
    Future padEnd(bool final) {
        if (expectedFileSize(this) > 0) {
            return newBlock(this, 0, StringRef(), true, final);
        }
        return Void();
    }

    // Ends the current block if necessary based on bytesNeeded.
    ACTOR static Future newBlockIfNeeded(EncryptedRangeFileWriter* self, int bytesNeeded) {
        if (expectedFileSize(self) + bytesNeeded > self->blockEnd) {
            wait(newBlock(self, bytesNeeded, self->lastKey, true));
        }
        return Void();
    }

    // Closes the current block at an encryption-domain boundary and opens the next one.
    // The closing entry is the key (truncated to the tenant prefix when k belongs to a tenant domain) with an
    // empty value. k and v are recorded as lastKey / lastValue before newBlock() runs, so newBlock() is what
    // writes them into the new block. The cipher keys are switched to k's domain only after newBlock() returns.
    ACTOR static Future handleTenantBondary(EncryptedRangeFileWriter* self,
                                            Key k,
                                            Value v,
                                            bool writeValue,
                                            EncryptCipherDomainId curKeyDomainId,
                                            SnapshotBackupUseTenantCache checkTenantCache) {
        state KeyRef endKey = k;
        // If we are crossing a boundary with a key that has a tenant prefix then truncate it
        if (curKeyDomainId != SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID && curKeyDomainId != FDB_DEFAULT_ENCRYPT_DOMAIN_ID) {
            endKey = StringRef(k.begin(), TenantAPI::PREFIX_SIZE);
        }

        state ValueRef newValue = StringRef();
        self->lastKey = k;
        self->lastValue = v;
        appendStringRefWithLenToBuffer(self, &endKey);
        appendStringRefWithLenToBuffer(self, &newValue);
        wait(newBlock(self, 0, endKey, writeValue));
        wait(updateEncryptionKeysCtx(self, self->lastKey, checkTenantCache));
        return Void();
    }

    // Compares the encryption domain of k with that of the previously written key. If they differ, the block is
    // split via handleTenantBondary() and true is returned; callers then skip their own append of k / v.
    ACTOR static Future finishCurTenantBlockStartNewIfNeeded(EncryptedRangeFileWriter* self,
                                                             Key k,
                                                             Value v,
                                                             bool writeValue,
                                                             SnapshotBackupUseTenantCache checkTenantCache) {
        // Don't want to start a new block if the current key or previous key is empty
        if (self->lastKey.size() == 0 || k.size() == 0) {
            return false;
        }
        state EncryptCipherDomainId curKeyDomainId =
            wait(getEncryptionDomainDetails(k, self->encryptMode, self->tenantCache, checkTenantCache));
        state EncryptCipherDomainId prevKeyDomainId =
            wait(getEncryptionDomainDetails(self->lastKey, self->encryptMode, self->tenantCache, checkTenantCache));
        if (curKeyDomainId != prevKeyDomainId) {
            CODE_PROBE(true, "crossed tenant boundaries");
            wait(handleTenantBondary(self, k, v, writeValue, curKeyDomainId, checkTenantCache));
            return true;
        }
        return false;
    }

    // Start a new block if needed, then write the key and value
    ACTOR static Future writeKV_impl(EncryptedRangeFileWriter* self, Key k, Value v) {
        // Load cipher keys on first use (or whenever the cached ones are not valid).
        if (!self->cipherKeys.headerCipherKey.present() || !self->cipherKeys.headerCipherKey.get().isValid() ||
            !self->cipherKeys.textCipherKey.isValid()) {
            wait(updateEncryptionKeysCtx(self, k, SnapshotBackupUseTenantCache::False));
        }
        // Two 32-bit length prefixes plus the key and value bytes.
        state int toWrite = sizeof(int32_t) + k.size() + sizeof(int32_t) + v.size();
        wait(newBlockIfNeeded(self, toWrite));
        bool createdNewBlock =
            wait(finishCurTenantBlockStartNewIfNeeded(self, k, v, true, SnapshotBackupUseTenantCache::False));
        if (createdNewBlock) {
            return Void();
        }
        appendStringRefWithLenToBuffer(self, &k);
        appendStringRefWithLenToBuffer(self, &v);
        self->lastKey = k;
        self->lastValue = v;
        return Void();
    }

    Future writeKV(Key k, Value v) { return writeKV_impl(this, k, v); }

    // Write begin key or end key.
    ACTOR static Future writeKey_impl(EncryptedRangeFileWriter* self, Key k) {
        // TODO (Nim): Is it possible to write empty begin and end keys?
        if (k.size() > 0 &&
            (!self->cipherKeys.headerCipherKey.present() || !self->cipherKeys.headerCipherKey.get().isValid() ||
             !self->cipherKeys.textCipherKey.isValid())) {
            wait(updateEncryptionKeysCtx(self, k, SnapshotBackupUseTenantCache::True));
        }

        // Need to account for extra "empty" value being written in the case of crossing tenant boundaries
        int toWrite = sizeof(uint32_t) + k.size() + sizeof(uint32_t);
        wait(newBlockIfNeeded(self, toWrite));
        // We want to check the tenant cache here since the first/last key for a block may not be a valid KV pair (in
        // which case we use the default domain)
        bool createdNewBlock =
            wait(finishCurTenantBlockStartNewIfNeeded(self, k, StringRef(), false, SnapshotBackupUseTenantCache::True));
        if (createdNewBlock) {
            return Void();
        }
        appendStringRefWithLenToBuffer(self, &k);
        self->lastKey = k;
        return Void();
    }

    Future writeKey(Key k) { return writeKey_impl(this, k); }

    // Encrypts and appends whatever is in the buffer. Unlike newBlock() this writes only the bytes produced so
    // far, without padding to blockSize.
    ACTOR static Future finish_impl(EncryptedRangeFileWriter* self) {
        // Write any outstanding bytes to the file
        if (currentBufferSize(self) > 0) {
            wait(encrypt(self));
            wait(self->file->append(self->buffer.begin(), currentBufferSize(self)));
        }
        return Void();
    }

    Future finish() { return finish_impl(this); }

    Database cx;
    Arena* arena;
    EncryptionAtRestMode encryptMode;
    Reference file;
    Optional>> tenantCache;
    int blockSize;

private:
    Standalone buffer; // holds the block currently being assembled (blockSize bytes)
    uint8_t* wPtr; // next write position inside buffer
    StringRef encryptHeader; // slot inside buffer reserved for the serialized encryption header
    uint8_t* dataPayloadStart; // first byte after the header slot; start of the region encrypt() processes
    int64_t blockEnd; // file offset at which the current block ends
    uint32_t fileVersion;
    Key lastKey;
    Key lastValue;
    SnapshotFileBackupEncryptionKeys cipherKeys;
};

// File Format handlers.
// Both Range and Log formats are designed to be readable starting at any BACKUP_RANGEFILE_BLOCK_SIZE boundary
// so they can be read in parallel.
//
// Writer instances must be kept alive while any member actors are in progress.
//
// RangeFileWriter must be used as follows:
//   1 - writeKey(key) the queried key range begin
//   2 - writeKV(k, v) each kv pair to restore
//   3 - writeKey(key) the queried key range end
//   4 - finish()
//
// RangeFileWriter will insert the required padding, header, and extra
// end/begin keys around the 1MB boundaries as needed.
//
// Example:
//   The range a-z is queried and returns c-j which covers 3 blocks.
//   The client code writes keys in this sequence:
//             a c d e f g h i j z
//
//   H = header   P = padding   a...z = keys  v = value | = block boundary
//
//   Encoded file:  H a cv dv ev P | H e ev fv gv hv P | H h hv iv jv z
//   Decoded in blocks yields:
//           Block 1: range [a, e) with kv pairs cv, dv
//           Block 2: range [e, h) with kv pairs ev, fv, gv
//           Block 3: range [h, z) with kv pairs hv, iv, jv
//
//   NOTE: All blocks except for the final block will have one last
//   value which will not be used.  This isn't actually a waste since
//   if the next KV pair wouldn't fit within the block after the value
//   then the space after the final key to the next 1MB boundary would
//   just be padding anyway.
struct RangeFileWriter : public IRangeFileWriter {
	// blockEnd starts at 0, so the first write always goes through newBlock() and emits a header.
	RangeFileWriter(Reference file = Reference(), int blockSize = 0)
	  : file(file), blockSize(blockSize), blockEnd(0), fileVersion(BACKUP_AGENT_SNAPSHOT_FILE_VERSION) {}

	// Handles the first block and internal blocks.  Ends current block if needed.
	// The final flag is used in simulation to pad the file's final block to a whole block size
	//
	// Steps, in order:
	//   1. Pad the file up to blockEnd if it is short of it.
	//   2. If final is set, stop here (asserting that we are in simulation).
	//   3. Advance blockEnd by blockSize and append fileVersion as the block header.
	//   4. For every block after the first, re-append lastKey twice and lastValue once: per the format
	//      example above, the first copy is the new block's begin key and the second copy plus lastValue
	//      repeat the previous block's last kv pair.
	//   5. Throw backup_bad_block_size if bytesNeeded still does not fit before blockEnd.
	ACTOR static Future newBlock(RangeFileWriter* self, int bytesNeeded, bool final = false) {
		// Write padding to finish current block if needed
		int bytesLeft = self->blockEnd - self->file->size();
		if (bytesLeft > 0) {
			state Value paddingFFs = makePadding(bytesLeft);
			wait(self->file->append(paddingFFs.begin(), bytesLeft));
		}

		if (final) {
			ASSERT(g_network->isSimulated());
			return Void();
		}

		// Set new blockEnd
		self->blockEnd += self->blockSize;

		// write Header
		wait(self->file->append((uint8_t*)&self->fileVersion, sizeof(self->fileVersion)));

		// If this is NOT the first block then write duplicate stuff needed from last block
		// (blockEnd was just advanced, so it exceeds blockSize only from the second block onward)
		if (self->blockEnd > self->blockSize) {
			wait(self->file->appendStringRefWithLen(self->lastKey));
			wait(self->file->appendStringRefWithLen(self->lastKey));
			wait(self->file->appendStringRefWithLen(self->lastValue));
		}

		// There must now be room in the current block for bytesNeeded or the block size is too small
		if (self->file->size() + bytesNeeded > self->blockEnd)
			throw backup_bad_block_size();

		return Void();
	}

	// Used in simulation only to create backup file sizes which are an integer multiple of the block size
	// Does nothing for an empty file; otherwise delegates to newBlock() with bytesNeeded == 0.
	Future padEnd(bool final) {
		ASSERT(g_network->isSimulated());
		if (file->size() > 0) {
			return newBlock(this, 0, final);
		}
		return Void();
	}

	// Ends the current block if necessary based on bytesNeeded.
	// Returns an already-completed future when bytesNeeded fits before blockEnd.
	Future newBlockIfNeeded(int bytesNeeded) {
		if (file->size() + bytesNeeded > blockEnd)
			return newBlock(this, bytesNeeded);
		return Void();
	}

	// Start a new block if needed, then write the key and value
	// The pair is remembered in lastKey/lastValue so that newBlock() can repeat it at the start of the next block.
	ACTOR static Future writeKV_impl(RangeFileWriter* self, Key k, Value v) {
		// Two length prefixes plus the key and value bytes
		int toWrite = sizeof(int32_t) + k.size() + sizeof(int32_t) + v.size();
		wait(self->newBlockIfNeeded(toWrite));
		wait(self->file->appendStringRefWithLen(k));
		wait(self->file->appendStringRefWithLen(v));
		self->lastKey = k;
		self->lastValue = v;
		return Void();
	}

	Future writeKV(Key k, Value v) { return writeKV_impl(this, k, v); }

	// Write begin key or end key.
	// Unlike writeKV_impl, this does not update lastKey/lastValue.
	ACTOR static Future writeKey_impl(RangeFileWriter* self, Key k) {
		// One length prefix plus the key bytes
		int toWrite = sizeof(uint32_t) + k.size();
		wait(self->newBlockIfNeeded(toWrite));
		wait(self->file->appendStringRefWithLen(k));
		return Void();
	}

	Future writeKey(Key k) { return writeKey_impl(this, k); }

	// Nothing is buffered by this writer, so there is nothing to flush here.
	Future finish() { return Void(); }

	Reference file;
	int blockSize;

private:
	int64_t blockEnd; // file offset at which the current block ends
	uint32_t fileVersion; // value appended as each block's header
	Key lastKey; // key of the most recent writeKV()
	Key lastValue; // value of the most recent writeKV()
};

// Decodes the key/value payload of one range file block from reader into results.
// The first entry appended is the block's begin key paired with an empty value.
ACTOR static Future decodeKVPairs(StringRefReader* reader, Standalone>* results, bool encryptedBlock, EncryptionAtRestMode encryptMode, Optional blockDomainId, Optional>> tenantCache) {
	// Read begin key, if this fails then block was invalid.
	state uint32_t kLen = reader->consumeNetworkUInt32();
	state const uint8_t* k = reader->consume(kLen);
	results->push_back(results->arena(), KeyValueRef(KeyRef(k, kLen), ValueRef()));
	state KeyRef prevKey = KeyRef(k, kLen);
	state bool done = false;
	state Optional prevDomainId;
	// Read kv pairs and end key
	while (1) {
		// Read a key.
kLen = reader->consumeNetworkUInt32(); k = reader->consume(kLen); // make sure that all keys in a block belong to exactly one tenant, // unless its the last key in which case it can be a truncated (different) tenant prefix if (encryptedBlock && g_network && g_network->isSimulated()) { ASSERT(blockDomainId.present()); state KeyRef curKey = KeyRef(k, kLen); if (!prevDomainId.present()) { EncryptCipherDomainId domainId = wait(EncryptedRangeFileWriter::getEncryptionDomainDetails( prevKey, encryptMode, tenantCache, SnapshotBackupUseTenantCache::False)); prevDomainId = domainId; } state EncryptCipherDomainId curDomainId = wait(EncryptedRangeFileWriter::getEncryptionDomainDetails( curKey, encryptMode, tenantCache, SnapshotBackupUseTenantCache::False)); if (!curKey.empty() && !prevKey.empty() && prevDomainId.get() != curDomainId) { ASSERT(!done); // Make sure that all tenant specific keys in a block have the correct prefix size if (curDomainId != SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID && curDomainId != FDB_DEFAULT_ENCRYPT_DOMAIN_ID && curKey.size() != TenantAPI::PREFIX_SIZE) { ASSERT(tenantCache.present()); Optional> payload = wait(tenantCache.get()->getById(curDomainId)); ASSERT(!payload.present()); } done = true; } // make sure that all keys (except the last key) in a block are encrypted using the correct key.; if (blockDomainId.get() != FDB_DEFAULT_ENCRYPT_DOMAIN_ID && !prevKey.empty()) { ASSERT_EQ(prevDomainId.get(), blockDomainId.get()); } prevKey = curKey; prevDomainId = curDomainId; } // If eof reached or first value len byte is 0xFF then a valid block end was reached. 
if (reader->eof() || *reader->rptr == 0xFF) { results->push_back(results->arena(), KeyValueRef(KeyRef(k, kLen), ValueRef())); break; } // Read a value, which must exist or the block is invalid state uint32_t vLen = reader->consumeNetworkUInt32(); state const uint8_t* v = reader->consume(vLen); if (tenantCache.present() && !isSystemKey(KeyRef(k, kLen))) { state int64_t tenantId = TenantAPI::extractTenantIdFromKeyRef(StringRef(k, kLen)); Optional> payload = wait(tenantCache.get()->getById(tenantId)); // The first and last KV pairs are not restored so if the tenant is not found for the last key then it's ok // to include it in the restore set if (!payload.present() && !(reader->eof() || *reader->rptr == 0xFF)) { TraceEvent(SevWarnAlways, "SnapshotRestoreTenantNotFound").detail("TenantId", tenantId); CODE_PROBE(true, "Snapshot restore tenant not found"); } else { results->push_back(results->arena(), KeyValueRef(KeyRef(k, kLen), ValueRef(v, vLen))); } } else { results->push_back(results->arena(), KeyValueRef(KeyRef(k, kLen), ValueRef(v, vLen))); } // If eof reached or first byte of next key len is 0xFF then a valid block end was reached. if (reader->eof() || *reader->rptr == 0xFF) break; } // Make sure any remaining bytes in the block are 0xFF for (auto b : reader->remainder()) if (b != 0xFF) throw restore_corrupted_data_padding(); return Void(); } static Reference getBackupContainerWithProxy(Reference _bc) { Reference bc = IBackupContainer::openContainer(_bc->getURL(), fileBackupAgentProxy, {}); return bc; } Standalone> decodeRangeFileBlock(const Standalone& buf) { Standalone> results({}, buf.arena()); StringRefReader reader(buf, restore_corrupted_data()); // Read header, currently only decoding BACKUP_AGENT_SNAPSHOT_FILE_VERSION if (reader.consume() != BACKUP_AGENT_SNAPSHOT_FILE_VERSION) throw restore_unsupported_file_version(); // Read begin key, if this fails then block was invalid. 
uint32_t kLen = reader.consumeNetworkUInt32(); const uint8_t* k = reader.consume(kLen); results.push_back(results.arena(), KeyValueRef(KeyRef(k, kLen), ValueRef())); // Read kv pairs and end key while (1) { // If eof reached or first value len byte is 0xFF then a valid block end was reached. if (reader.eof() || *reader.rptr == 0xFF) { break; } // Read a value, which must exist or the block is invalid uint32_t vLen = reader.consumeNetworkUInt32(); const uint8_t* v = reader.consume(vLen); results.push_back(results.arena(), KeyValueRef(KeyRef(k, kLen), ValueRef(v, vLen))); } // Make sure any remaining bytes in the block are 0xFF for (auto b : reader.remainder()) if (b != 0xFF) throw restore_corrupted_data_padding(); return results; } ACTOR Future>> decodeRangeFileBlock(Reference file, int64_t offset, int len, Database cx) { state Standalone buf = makeString(len); int rLen = wait(uncancellable(holdWhile(buf, file->read(mutateString(buf), len, offset)))); if (rLen != len) throw restore_bad_read(); simulateBlobFailure(); state Standalone> results({}, buf.arena()); state StringRefReader reader(buf, restore_corrupted_data()); state Arena arena; state DatabaseConfiguration config = wait(getDatabaseConfiguration(cx)); state Optional>> tenantCache; if (config.tenantMode == TenantMode::REQUIRED) { tenantCache = makeReference>(cx, TenantEntryCacheRefreshMode::WATCH); wait(tenantCache.get()->init()); } state EncryptionAtRestMode encryptMode = config.encryptionAtRestMode; state int64_t blockDomainId = TenantInfo::INVALID_TENANT; try { // Read header, currently only decoding BACKUP_AGENT_SNAPSHOT_FILE_VERSION or // BACKUP_AGENT_ENCRYPTED_SNAPSHOT_FILE_VERSION int32_t file_version = reader.consume(); ASSERT(!encryptMode.isEncryptionEnabled() || file_version == BACKUP_AGENT_ENCRYPTED_SNAPSHOT_FILE_VERSION); if (file_version == BACKUP_AGENT_SNAPSHOT_FILE_VERSION) { wait(decodeKVPairs(&reader, &results, false, encryptMode, Optional(), tenantCache)); } else if (file_version == 
BACKUP_AGENT_ENCRYPTED_SNAPSHOT_FILE_VERSION) { CODE_PROBE(true, "decoding encrypted block"); // read header size state uint32_t headerLen = reader.consume(); // read the encryption header state const uint8_t* headerStart = reader.consume(headerLen); StringRef headerS = StringRef(headerStart, headerLen); state BlobCipherEncryptHeaderRef encryptHeader; encryptHeader = BlobCipherEncryptHeaderRef::fromStringRef(headerS); blockDomainId = encryptHeader.getCipherDetails().textCipherDetails.encryptDomainId; if (config.tenantMode == TenantMode::REQUIRED && !isReservedEncryptDomain(blockDomainId)) { ASSERT(tenantCache.present()); Optional> payload = wait(tenantCache.get()->getById(blockDomainId)); if (!payload.present()) { throw tenant_not_found(); } } const uint8_t* dataPayloadStart = headerStart + headerLen; // calculate the total bytes read up to (and including) the header int64_t bytesRead = sizeof(int32_t) + sizeof(uint32_t) + headerLen; // get the size of the encrypted payload and decrypt it int64_t dataLen = len - bytesRead; StringRef decryptedData = wait(EncryptedRangeFileWriter::decrypt(cx, encryptHeader, dataPayloadStart, dataLen, &results.arena())); reader = StringRefReader(decryptedData, restore_corrupted_data()); wait(decodeKVPairs(&reader, &results, true, encryptMode, blockDomainId, tenantCache)); } else { throw restore_unsupported_file_version(); } return results; } catch (Error& e) { if (e.code() == error_code_encrypt_keys_fetch_failed || e.code() == error_code_encrypt_key_not_found) { ASSERT(!isReservedEncryptDomain(blockDomainId)); TraceEvent(SevWarnAlways, "SnapshotRestoreEncryptKeyFetchFailed").detail("TenantId", blockDomainId); CODE_PROBE(true, "Snapshot restore encrypt keys not found"); } else if (e.code() == error_code_tenant_not_found) { ASSERT(!isReservedEncryptDomain(blockDomainId)); TraceEvent(SevWarnAlways, "EncryptedSnapshotRestoreTenantNotFound").detail("TenantId", blockDomainId); CODE_PROBE(true, "Encrypted Snapshot restore tenant not found"); 
		}

		TraceEvent(SevWarn, "FileRestoreDecodeRangeFileBlockFailed")
		    .error(e)
		    .detail("Filename", file->getFilename())
		    .detail("BlockOffset", offset)
		    .detail("BlockLen", len)
		    .detail("ErrorRelativeOffset", reader.rptr - buf.begin())
		    .detail("ErrorAbsoluteOffset", reader.rptr - buf.begin() + offset);
		throw;
	}
}

// Very simple format compared to KeyRange files.
// Header, [Key, Value]... Key len
struct LogFileWriter {
	// blockEnd starts at 0, so the first writeKV() always opens a block and writes a header.
	LogFileWriter(Reference file = Reference(), int blockSize = 0)
	  : file(file), blockSize(blockSize), blockEnd(0) {}

	// Start a new block if needed, then write the key and value
	// Throws backup_bad_block_size if the pair still overruns blockEnd after a fresh block was started.
	ACTOR static Future writeKV_impl(LogFileWriter* self, Key k, Value v) {
		// If key and value do not fit in this block, end it and start a new one
		int toWrite = sizeof(int32_t) + k.size() + sizeof(int32_t) + v.size();
		if (self->file->size() + toWrite > self->blockEnd) {
			// Write padding if needed
			int bytesLeft = self->blockEnd - self->file->size();
			if (bytesLeft > 0) {
				state Value paddingFFs = makePadding(bytesLeft);
				wait(self->file->append(paddingFFs.begin(), bytesLeft));
			}

			// Set new blockEnd
			self->blockEnd += self->blockSize;

			// write the block header
			wait(self->file->append((uint8_t*)&BACKUP_AGENT_MLOG_VERSION, sizeof(BACKUP_AGENT_MLOG_VERSION)));
		}

		wait(self->file->appendStringRefWithLen(k));
		wait(self->file->appendStringRefWithLen(v));

		// At this point we should be in whatever the current block is or the block size is too small
		if (self->file->size() > self->blockEnd)
			throw backup_bad_block_size();

		return Void();
	}

	Future writeKV(Key k, Value v) { return writeKV_impl(this, k, v); }

	Reference file;
	int blockSize;

private:
	int64_t blockEnd; // file offset at which the current block ends
};

// Decodes one mutation log block held entirely in buf.
// Returns the kv pairs in file order, sharing buf's arena.
// Throws restore_unsupported_file_version on a header mismatch and restore_corrupted_data_padding
// if any byte after the last pair is not 0xFF.
Standalone> decodeMutationLogFileBlock(const Standalone& buf) {
	Standalone> results({}, buf.arena());
	StringRefReader reader(buf, restore_corrupted_data());

	// Read header, currently only decoding version BACKUP_AGENT_MLOG_VERSION
	if (reader.consume() != BACKUP_AGENT_MLOG_VERSION)
		throw restore_unsupported_file_version();

	// Read k/v pairs.
	// Block ends either at end of last value exactly or with 0xFF as first key len byte.
	while (1) {
		// If eof reached or first key len bytes is 0xFF then end of block was reached.
		if (reader.eof() || *reader.rptr == 0xFF)
			break;

		// Read key and value.  If anything throws then there is a problem.
		uint32_t kLen = reader.consumeNetworkUInt32();
		const uint8_t* k = reader.consume(kLen);
		uint32_t vLen = reader.consumeNetworkUInt32();
		const uint8_t* v = reader.consume(vLen);

		results.push_back(results.arena(), KeyValueRef(KeyRef(k, kLen), ValueRef(v, vLen)));
	}

	// Make sure any remaining bytes in the block are 0xFF
	for (auto b : reader.remainder())
		if (b != 0xFF)
			throw restore_corrupted_data_padding();

	return results;
}

// Reads len bytes at offset from file and decodes them as one mutation log block.
// Throws restore_bad_read if fewer than len bytes were read; decode errors are traced with the
// file name and block position and then rethrown.
ACTOR Future>> decodeMutationLogFileBlock(Reference file, int64_t offset, int len) {
	state Standalone buf = makeString(len);
	int rLen = wait(file->read(mutateString(buf), len, offset));
	if (rLen != len)
		throw restore_bad_read();

	try {
		return decodeMutationLogFileBlock(buf);
	} catch (Error& e) {
		TraceEvent(SevWarn, "FileRestoreCorruptLogFileBlock")
		    .error(e)
		    .detail("Filename", file->getFilename())
		    .detail("BlockOffset", offset)
		    .detail("BlockLen", len);
		throw;
	}
}

// Rejects a task whose version is newer than the given supported version.
// On rejection a warning is traced, the error is also logged to the task's BackupConfig when the
// task carries a uid parameter, and task_invalid_version is thrown.
ACTOR Future checkTaskVersion(Database cx, Reference task, StringRef name, uint32_t version) {
	uint32_t taskVersion = task->getVersion();
	if (taskVersion > version) {
		// Held as state because it is thrown after the wait() below
		state Error err = task_invalid_version();

		TraceEvent(SevWarn, "BA_BackupRangeTaskFuncExecute")
		    .detail("TaskVersion", taskVersion)
		    .detail("Name", name)
		    .detail("Version", version);
		if (KeyBackedTaskConfig::TaskParams.uid().exists(task)) {
			std::string msg = format("%s task version `%lu' is greater than supported version `%lu'",
			                         task->params[Task::reservedTaskParamKeyType].toString().c_str(),
			                         (unsigned long)taskVersion,
			                         (unsigned long)version);
			wait(BackupConfig(task).logError(cx, err, msg));
		}

		throw err;
	}

	return Void();
}

ACTOR static Future abortFiveZeroBackup(FileBackupAgent* backupAgent, Reference tr, std::string tagName) {
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); tr->setOption(FDBTransactionOptions::LOCK_AWARE); state Subspace tagNames = backupAgent->subspace.get(BackupAgentBase::keyTagName); Optional uidStr = wait(tr->get(tagNames.pack(Key(tagName)))); if (!uidStr.present()) { TraceEvent(SevWarn, "FileBackupAbortIncompatibleBackup_TagNotFound").detail("TagName", tagName.c_str()); return Void(); } state UID uid = BinaryReader::fromStringRef(uidStr.get(), Unversioned()); state Subspace statusSpace = backupAgent->subspace.get(BackupAgentBase::keyStates).get(uid.toString()); state Subspace globalConfig = backupAgent->subspace.get(BackupAgentBase::keyConfig).get(uid.toString()); state Subspace newConfigSpace = uidPrefixKey("uid->config/"_sr.withPrefix(fileBackupPrefixRange.begin), uid); Optional statusStr = wait(tr->get(statusSpace.pack(FileBackupAgent::keyStateStatus))); state EBackupState status = !statusStr.present() ? EBackupState::STATE_NEVERRAN : BackupAgentBase::getState(statusStr.get().toString()); TraceEvent(SevInfo, "FileBackupAbortIncompatibleBackup") .detail("TagName", tagName.c_str()) .detail("Status", BackupAgentBase::getStateText(status)); // Clear the folder id to prevent future tasks from executing at all tr->clear(singleKeyRange(StringRef(globalConfig.pack(FileBackupAgent::keyFolderId)))); // Clear the mutations logging config and data Key configPath = uidPrefixKey(logRangesRange.begin, uid); Key logsPath = uidPrefixKey(backupLogKeys.begin, uid); tr->clear(KeyRangeRef(configPath, strinc(configPath))); tr->clear(KeyRangeRef(logsPath, strinc(logsPath))); // Clear the new-style config space tr->clear(newConfigSpace.range()); Key statusKey = StringRef(statusSpace.pack(FileBackupAgent::keyStateStatus)); // Set old style state key to Aborted if it was Runnable if (backupAgent->isRunnable(status)) tr->set(statusKey, StringRef(FileBackupAgent::getStateText(EBackupState::STATE_ABORTED))); return Void(); } struct AbortFiveZeroBackupTask : TaskFuncBase { static 
StringRef name; ACTOR static Future _finish(Reference tr, Reference taskBucket, Reference futureBucket, Reference task) { state FileBackupAgent backupAgent; state std::string tagName = task->params[BackupAgentBase::keyConfigBackupTag].toString(); TraceEvent(SevInfo, "FileBackupCancelOldTask") .detail("Task", task->params[Task::reservedTaskParamKeyType]) .detail("TagName", tagName); wait(abortFiveZeroBackup(&backupAgent, tr, tagName)); wait(taskBucket->finish(tr, task)); return Void(); } StringRef getName() const override { TraceEvent(SevError, "FileBackupError") .detail("Cause", "AbortFiveZeroBackupTaskFunc::name() should never be called"); ASSERT(false); return StringRef(); } Future execute(Database cx, Reference tb, Reference fb, Reference task) override { return Future(Void()); }; Future finish(Reference tr, Reference tb, Reference fb, Reference task) override { return _finish(tr, tb, fb, task); }; }; StringRef AbortFiveZeroBackupTask::name = "abort_legacy_backup"_sr; REGISTER_TASKFUNC(AbortFiveZeroBackupTask); REGISTER_TASKFUNC_ALIAS(AbortFiveZeroBackupTask, file_backup_diff_logs); REGISTER_TASKFUNC_ALIAS(AbortFiveZeroBackupTask, file_backup_log_range); REGISTER_TASKFUNC_ALIAS(AbortFiveZeroBackupTask, file_backup_logs); REGISTER_TASKFUNC_ALIAS(AbortFiveZeroBackupTask, file_backup_range); REGISTER_TASKFUNC_ALIAS(AbortFiveZeroBackupTask, file_backup_restorable); REGISTER_TASKFUNC_ALIAS(AbortFiveZeroBackupTask, file_finish_full_backup); REGISTER_TASKFUNC_ALIAS(AbortFiveZeroBackupTask, file_finished_full_backup); REGISTER_TASKFUNC_ALIAS(AbortFiveZeroBackupTask, file_start_full_backup); ACTOR static Future abortFiveOneBackup(FileBackupAgent* backupAgent, Reference tr, std::string tagName) { tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); tr->setOption(FDBTransactionOptions::LOCK_AWARE); state KeyBackedTag tag = makeBackupTag(tagName); state UidAndAbortedFlagT current = wait(tag.getOrThrow(tr, Snapshot::False, backup_unneeded())); state BackupConfig 
config(current.first); EBackupState status = wait(config.stateEnum().getD(tr, Snapshot::False, EBackupState::STATE_NEVERRAN)); if (!backupAgent->isRunnable(status)) { throw backup_unneeded(); } TraceEvent(SevInfo, "FBA_AbortFileOneBackup") .detail("TagName", tagName.c_str()) .detail("Status", BackupAgentBase::getStateText(status)); // Cancel backup task through tag wait(tag.cancel(tr)); Key configPath = uidPrefixKey(logRangesRange.begin, config.getUid()); Key logsPath = uidPrefixKey(backupLogKeys.begin, config.getUid()); tr->clear(KeyRangeRef(configPath, strinc(configPath))); tr->clear(KeyRangeRef(logsPath, strinc(logsPath))); config.stateEnum().set(tr, EBackupState::STATE_ABORTED); return Void(); } struct AbortFiveOneBackupTask : TaskFuncBase { static StringRef name; ACTOR static Future _finish(Reference tr, Reference taskBucket, Reference futureBucket, Reference task) { state FileBackupAgent backupAgent; state BackupConfig config(task); state std::string tagName = wait(config.tag().getOrThrow(tr)); TraceEvent(SevInfo, "FileBackupCancelFiveOneTask") .detail("Task", task->params[Task::reservedTaskParamKeyType]) .detail("TagName", tagName); wait(abortFiveOneBackup(&backupAgent, tr, tagName)); wait(taskBucket->finish(tr, task)); return Void(); } StringRef getName() const override { TraceEvent(SevError, "FileBackupError") .detail("Cause", "AbortFiveOneBackupTaskFunc::name() should never be called"); ASSERT(false); return StringRef(); } Future execute(Database cx, Reference tb, Reference fb, Reference task) override { return Future(Void()); }; Future finish(Reference tr, Reference tb, Reference fb, Reference task) override { return _finish(tr, tb, fb, task); }; }; StringRef AbortFiveOneBackupTask::name = "abort_legacy_backup_5.2"_sr; REGISTER_TASKFUNC(AbortFiveOneBackupTask); REGISTER_TASKFUNC_ALIAS(AbortFiveOneBackupTask, file_backup_write_range); REGISTER_TASKFUNC_ALIAS(AbortFiveOneBackupTask, file_backup_dispatch_ranges); 
REGISTER_TASKFUNC_ALIAS(AbortFiveOneBackupTask, file_backup_write_logs); REGISTER_TASKFUNC_ALIAS(AbortFiveOneBackupTask, file_backup_erase_logs); REGISTER_TASKFUNC_ALIAS(AbortFiveOneBackupTask, file_backup_dispatch_logs); REGISTER_TASKFUNC_ALIAS(AbortFiveOneBackupTask, file_backup_finished); REGISTER_TASKFUNC_ALIAS(AbortFiveOneBackupTask, file_backup_write_snapshot_manifest); REGISTER_TASKFUNC_ALIAS(AbortFiveOneBackupTask, file_backup_start); std::function)> NOP_SETUP_TASK_FN = [](Reference task) { /* NOP */ }; ACTOR static Future addBackupTask(StringRef name, uint32_t version, Reference tr, Reference taskBucket, TaskCompletionKey completionKey, BackupConfig config, Reference waitFor = Reference(), std::function)> setupTaskFn = NOP_SETUP_TASK_FN, int priority = 0, SetValidation setValidation = SetValidation::True) { tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); tr->setOption(FDBTransactionOptions::LOCK_AWARE); Key doneKey = wait(completionKey.get(tr, taskBucket)); state Reference task(new Task(name, version, doneKey, priority)); // Bind backup config to new task wait(config.toTask(tr, task, setValidation)); // Set task specific params setupTaskFn(task); if (!waitFor) { return taskBucket->addTask(tr, task); } wait(waitFor->onSetAddTask(tr, taskBucket, task)); return "OnSetAddTask"_sr; } // Clears the backup ID from "backupStartedKey" to pause backup workers. ACTOR static Future clearBackupStartID(Reference tr, UID backupUid) { // If backup worker is not enabled, exit early. 
Optional started = wait(tr->get(backupStartedKey)); std::vector> ids; if (started.present()) { ids = decodeBackupStartedValue(started.get()); } auto it = std::find_if(ids.begin(), ids.end(), [=](const std::pair& p) { return p.first == backupUid; }); if (it != ids.end()) { ids.erase(it); } if (ids.empty()) { TraceEvent("ClearBackup").detail("BackupID", backupUid); tr->clear(backupStartedKey); } else { tr->set(backupStartedKey, encodeBackupStartedValue(ids)); } return Void(); } // Backup and Restore taskFunc definitions will inherit from one of the following classes which // servers to catch and log to the appropriate config any error that execute/finish didn't catch and log. struct RestoreTaskFuncBase : TaskFuncBase { Future handleError(Database cx, Reference task, Error const& error) final { return RestoreConfig(task).logError( cx, error, format("'%s' on '%s'", error.what(), task->params[Task::reservedTaskParamKeyType].printable().c_str())); } virtual std::string toString(Reference task) const { return ""; } }; struct BackupTaskFuncBase : TaskFuncBase { Future handleError(Database cx, Reference task, Error const& error) final { return BackupConfig(task).logError( cx, error, format("'%s' on '%s'", error.what(), task->params[Task::reservedTaskParamKeyType].printable().c_str())); } virtual std::string toString(Reference task) const { return ""; } }; ACTOR static Future>> getBlockOfShards(Reference tr, Key beginKey, Key endKey, int limit) { tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); tr->setOption(FDBTransactionOptions::LOCK_AWARE); state Standalone> results; RangeResult values = wait(tr->getRange( KeyRangeRef(keyAfter(beginKey.withPrefix(keyServersPrefix)), endKey.withPrefix(keyServersPrefix)), limit)); for (auto& s : values) { KeyRef k = s.key.removePrefix(keyServersPrefix); results.push_back_deep(results.arena(), k); } return results; } struct BackupRangeTaskFunc : BackupTaskFuncBase { static StringRef name; static constexpr uint32_t version = 1; 
static struct { static TaskParam beginKey() { return __FUNCTION__sr; } static TaskParam endKey() { return __FUNCTION__sr; } static TaskParam addBackupRangeTasks() { return __FUNCTION__sr; } } Params; std::string toString(Reference task) const override { return format("beginKey '%s' endKey '%s' addTasks %d", Params.beginKey().get(task).printable().c_str(), Params.endKey().get(task).printable().c_str(), Params.addBackupRangeTasks().get(task)); } StringRef getName() const override { return name; }; Future execute(Database cx, Reference tb, Reference fb, Reference task) override { return _execute(cx, tb, fb, task); }; Future finish(Reference tr, Reference tb, Reference fb, Reference task) override { return _finish(tr, tb, fb, task); }; // Finish (which flushes/syncs) the file, and then in a single transaction, make some range backup progress // durable. This means: // - increment the backup config's range bytes written // - update the range file map // - update the task begin key // - save/extend the task with the new params // Returns whether or not the caller should continue executing the task. ACTOR static Future finishRangeFile(Reference file, Database cx, Reference task, Reference taskBucket, KeyRange range, Version version) { wait(file->finish()); // Ignore empty ranges. if (range.empty()) return false; state Reference tr(new ReadYourWritesTransaction(cx)); state BackupConfig backup(task); state bool usedFile = false; // Avoid unnecessary conflict by prevent taskbucket's automatic timeout extension // because the following transaction loop extends and updates the task. wait(task->extendMutex.take()); state FlowLock::Releaser releaser(task->extendMutex, 1); loop { try { tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); tr->setOption(FDBTransactionOptions::LOCK_AWARE); // Update the start key of the task so if this transaction completes but the task then fails // when it is restarted it will continue where this execution left off. 
Params.beginKey().set(task, range.end); // Save and extend the task with the new begin parameter state Version newTimeout = wait(taskBucket->extendTimeout(tr, task, UpdateParams::True)); // Update the range bytes written in the backup config backup.rangeBytesWritten().atomicOp(tr, file->size(), MutationRef::AddValue); backup.snapshotRangeFileCount().atomicOp(tr, 1, MutationRef::AddValue); // See if there is already a file for this key which has an earlier begin, update the map if not. Optional s = wait(backup.snapshotRangeFileMap().get(tr, range.end)); if (!s.present() || s.get().begin >= range.begin) { backup.snapshotRangeFileMap().set( tr, range.end, { range.begin, version, file->getFileName(), file->size() }); usedFile = true; } wait(tr->commit()); task->timeoutVersion = newTimeout; break; } catch (Error& e) { wait(tr->onError(e)); } } return usedFile; } ACTOR static Future addTask(Reference tr, Reference taskBucket, Reference parentTask, int priority, Key begin, Key end, TaskCompletionKey completionKey, Reference waitFor = Reference(), Version scheduledVersion = invalidVersion) { Key key = wait(addBackupTask( BackupRangeTaskFunc::name, BackupRangeTaskFunc::version, tr, taskBucket, completionKey, BackupConfig(parentTask), waitFor, [=](Reference task) { Params.beginKey().set(task, begin); Params.endKey().set(task, end); Params.addBackupRangeTasks().set(task, false); if (scheduledVersion != invalidVersion) ReservedTaskParams::scheduledVersion().set(task, scheduledVersion); }, priority)); return key; } ACTOR static Future _execute(Database cx, Reference taskBucket, Reference futureBucket, Reference task) { state Reference lock(new FlowLock(CLIENT_KNOBS->BACKUP_LOCK_BYTES)); wait(checkTaskVersion(cx, task, BackupRangeTaskFunc::name, BackupRangeTaskFunc::version)); state Key beginKey = Params.beginKey().get(task); state Key endKey = Params.endKey().get(task); TraceEvent("FileBackupRangeStart") .suppressFor(60) .detail("BackupUID", BackupConfig(task).getUid()) 
.detail("BeginKey", Params.beginKey().get(task).printable()) .detail("EndKey", Params.endKey().get(task).printable()) .detail("TaskKey", task->key.printable()); // When a key range task saves the last chunk of progress and then the executor dies, when the task // continues its beginKey and endKey will be equal but there is no work to be done. if (beginKey == endKey) return Void(); // Find out if there is a shard boundary in(beginKey, endKey) Standalone> keys = wait(runRYWTransaction( cx, [=](Reference tr) { return getBlockOfShards(tr, beginKey, endKey, 1); })); if (keys.size() > 0) { Params.addBackupRangeTasks().set(task, true); return Void(); } // Read everything from beginKey to endKey, write it to an output file, run the output file processor, and // then set on_done. If we are still writing after X seconds, end the output file and insert a new // backup_range task for the remainder. state Reference outFile; state Version outVersion = invalidVersion; state Key lastKey; // retrieve kvData state PromiseStream results; state Future rc = readCommitted(cx, results, lock, KeyRangeRef(beginKey, endKey), Terminator::True, AccessSystemKeys::True, LockAware::True); state std::unique_ptr rangeFile; state BackupConfig backup(task); state Arena arena; DatabaseConfiguration config = wait(getDatabaseConfiguration(cx)); state EncryptionAtRestMode encryptMode = config.encryptionAtRestMode; state Optional>> tenantCache; if (encryptMode.mode == EncryptionAtRestMode::DOMAIN_AWARE) { tenantCache = makeReference>(cx, TenantEntryCacheRefreshMode::WATCH); wait(tenantCache.get()->init()); } // Don't need to check keepRunning(task) here because we will do that while finishing each output file, but // if bc is false then clearly the backup is no longer in progress Reference _bc = wait(backup.backupContainer().getD(cx.getReference())); if (!_bc) { return Void(); } state Reference bc = getBackupContainerWithProxy(_bc); state bool done = false; state int64_t nrKeys = 0; state Optional 
encryptionEnabled; loop { state RangeResultWithVersion values; try { RangeResultWithVersion _values = waitNext(results.getFuture()); values = _values; lock->release(values.first.expectedSize()); } catch (Error& e) { if (e.code() == error_code_end_of_stream) done = true; else throw; } // If we've seen a new read version OR hit the end of the stream, then if we were writing a file finish // it. if (values.second != outVersion || done) { if (outFile) { CODE_PROBE(outVersion != invalidVersion, "Backup range task wrote multiple versions"); state Key nextKey = done ? endKey : keyAfter(lastKey); wait(rangeFile->writeKey(nextKey)); if (BUGGIFY) { wait(rangeFile->padEnd(true)); } wait(rangeFile->finish()); bool usedFile = wait( finishRangeFile(outFile, cx, task, taskBucket, KeyRangeRef(beginKey, nextKey), outVersion)); TraceEvent("FileBackupWroteRangeFile") .suppressFor(60) .detail("BackupUID", backup.getUid()) .detail("Size", outFile->size()) .detail("Keys", nrKeys) .detail("ReadVersion", outVersion) .detail("BeginKey", beginKey.printable()) .detail("EndKey", nextKey.printable()) .detail("AddedFileToMap", usedFile); nrKeys = 0; beginKey = nextKey; } if (done) return Void(); // Start writing a new file after verifying this task should keep running as of a new read version // (which must be >= outVersion) outVersion = values.second; // block size must be at least large enough for 3 max size keys and 2 max size values + overhead so // 250k conservatively. state int blockSize = BUGGIFY ? 
deterministicRandom()->randomInt(250e3, 4e6) : CLIENT_KNOBS->BACKUP_RANGEFILE_BLOCK_SIZE; state Version snapshotBeginVersion; state int64_t snapshotRangeFileCount; state Reference tr(new ReadYourWritesTransaction(cx)); loop { try { tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); tr->setOption(FDBTransactionOptions::LOCK_AWARE); wait(taskBucket->keepRunning(tr, task) && storeOrThrow(snapshotBeginVersion, backup.snapshotBeginVersion().get(tr)) && store(encryptionEnabled, backup.enableSnapshotBackupEncryption().get(tr)) && store(snapshotRangeFileCount, backup.snapshotRangeFileCount().getD(tr))); break; } catch (Error& e) { wait(tr->onError(e)); } } Reference f = wait(bc->writeRangeFile(snapshotBeginVersion, snapshotRangeFileCount, outVersion, blockSize)); outFile = f; if (!encryptionEnabled.present() || !encryptionEnabled.get()) { encryptMode = EncryptionAtRestMode::DISABLED; } TraceEvent(SevDebug, "EncryptionMode").detail("EncryptMode", encryptMode.toString()); // Initialize range file writer and write begin key if (encryptMode.mode != EncryptionAtRestMode::DISABLED) { CODE_PROBE(true, "using encrypted snapshot file writer"); rangeFile = std::make_unique( cx, &arena, encryptMode, tenantCache, outFile, blockSize); } else { rangeFile = std::make_unique(outFile, blockSize); } wait(rangeFile->writeKey(beginKey)); } // write kvData to file, update lastKey and key count if (values.first.size() != 0) { state size_t i = 0; for (; i < values.first.size(); ++i) { wait(rangeFile->writeKV(values.first[i].key, values.first[i].value)); } lastKey = values.first.back().key; nrKeys += values.first.size(); } } } ACTOR static Future startBackupRangeInternal(Reference tr, Reference taskBucket, Reference futureBucket, Reference task, Reference onDone) { tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); tr->setOption(FDBTransactionOptions::LOCK_AWARE); state Key nextKey = Params.beginKey().get(task); state Key endKey = Params.endKey().get(task); state Standalone> keys = 
wait(getBlockOfShards(tr, nextKey, endKey, CLIENT_KNOBS->BACKUP_SHARD_TASK_LIMIT)); std::vector> addTaskVector; for (int idx = 0; idx < keys.size(); ++idx) { if (nextKey != keys[idx]) { addTaskVector.push_back(addTask(tr, taskBucket, task, task->getPriority(), nextKey, keys[idx], TaskCompletionKey::joinWith(onDone))); TraceEvent("FileBackupRangeSplit") .suppressFor(60) .detail("BackupUID", BackupConfig(task).getUid()) .detail("BeginKey", Params.beginKey().get(task).printable()) .detail("EndKey", Params.endKey().get(task).printable()) .detail("SliceBeginKey", nextKey.printable()) .detail("SliceEndKey", keys[idx].printable()); } nextKey = keys[idx]; } wait(waitForAll(addTaskVector)); if (nextKey != endKey) { // Add task to cover nextKey to the end, using the priority of the current task wait(success(addTask(tr, taskBucket, task, task->getPriority(), nextKey, endKey, TaskCompletionKey::joinWith(onDone), Reference(), task->getPriority()))); } return Void(); } ACTOR static Future _finish(Reference tr, Reference taskBucket, Reference futureBucket, Reference task) { state Reference taskFuture = futureBucket->unpack(task->params[Task::reservedTaskParamKeyDone]); if (Params.addBackupRangeTasks().get(task)) { wait(startBackupRangeInternal(tr, taskBucket, futureBucket, task, taskFuture)); } else { wait(taskFuture->set(tr, taskBucket)); } wait(taskBucket->finish(tr, task)); TraceEvent("FileBackupRangeFinish") .suppressFor(60) .detail("BackupUID", BackupConfig(task).getUid()) .detail("BeginKey", Params.beginKey().get(task).printable()) .detail("EndKey", Params.endKey().get(task).printable()) .detail("TaskKey", task->key.printable()); return Void(); } }; StringRef BackupRangeTaskFunc::name = "file_backup_write_range_5.2"_sr; REGISTER_TASKFUNC(BackupRangeTaskFunc); struct BackupSnapshotDispatchTask : BackupTaskFuncBase { static StringRef name; static constexpr uint32_t version = 1; static struct { // Set by Execute, used by Finish static TaskParam shardsBehind() { return 
// Execute phase of the snapshot dispatch task.
//
// Builds an in-memory map of the database's shards, marks shards that lie inside already-dispatched
// ranges as DONE and everything outside the configured backup ranges as SKIP, then dispatches
// BackupRangeTaskFunc tasks for randomly chosen remaining shards until the expected progress is reached.
//
// Values handed to the finish phase through the task's Params:
//   - snapshotFinished:    set to true when no NOT_DONE shard remains
//   - nextDispatchVersion: version at which the next dispatch should run
//   - shardsBehind:        number of shards dispatched beyond what a normal window would dispatch
ACTOR static Future _execute(Database cx, Reference taskBucket, Reference futureBucket, Reference task) {
    state Reference lock(new FlowLock(CLIENT_KNOBS->BACKUP_LOCK_BYTES));
    wait(checkTaskVersion(cx, task, name, version));

    // Only used for the DispatchTimeSeconds detail of the final trace event.
    state double startTime = timer();
    state Reference tr(new ReadYourWritesTransaction(cx));

    // The shard map will use 3 value classes. Exactly SKIP, exactly DONE, then any number >= NOT_DONE_MIN
    // which will mean not done. This is to enable an efficient coalesce() call to squash adjacent ranges which
    // are not yet finished to enable efficiently finding random database shards which are not done.
    state int notDoneSequence = NOT_DONE_MIN;
    state KeyRangeMap shardMap(notDoneSequence++);
    state Key beginKey = allKeys.begin;

    // Read all shard boundaries and add them to the map.
    // Each boundary gets its own, increasing, NOT_DONE value. The loop ends when a read returns no boundaries.
    loop {
        try {
            tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
            tr->setOption(FDBTransactionOptions::LOCK_AWARE);
            state Future>> shardBoundaries =
                getBlockOfShards(tr, beginKey, allKeys.end, CLIENT_KNOBS->TOO_MANY);
            wait(success(shardBoundaries) && taskBucket->keepRunning(tr, task));

            if (shardBoundaries.get().size() == 0)
                break;

            for (auto& boundary : shardBoundaries.get()) {
                shardMap.rawInsert(boundary, notDoneSequence++);
            }

            // Continue after the last boundary seen, in a fresh transaction.
            beginKey = keyAfter(shardBoundaries.get().back());
            tr->reset();
        } catch (Error& e) {
            wait(tr->onError(e));
        }
    }

    // Read required stuff from backup config
    state BackupConfig config(task);
    state Version recentReadVersion;
    state Version snapshotBeginVersion;
    state Version snapshotTargetEndVersion;
    state int64_t snapshotIntervalSeconds;
    state Optional latestSnapshotEndVersion;
    state std::vector backupRanges;
    state Optional snapshotBatchFutureKey;
    state Reference snapshotBatchFuture;
    state Optional snapshotBatchSize;

    tr->reset();
    loop {
        try {
            tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
            tr->setOption(FDBTransactionOptions::LOCK_AWARE);

            wait(store(snapshotBeginVersion, config.snapshotBeginVersion().getOrThrow(tr)) &&
                 store(snapshotTargetEndVersion, config.snapshotTargetEndVersion().getOrThrow(tr)) &&
                 store(backupRanges, config.backupRanges().getOrThrow(tr)) &&
                 store(snapshotIntervalSeconds, config.snapshotIntervalSeconds().getOrThrow(tr))
                 // The next three values are optional: they are read with get() rather than getOrThrow()
                 && store(snapshotBatchFutureKey, config.snapshotBatchFuture().get(tr)) &&
                 store(snapshotBatchSize, config.snapshotBatchSize().get(tr)) &&
                 store(latestSnapshotEndVersion, config.latestSnapshotEndVersion().get(tr)) &&
                 store(recentReadVersion, tr->getReadVersion()) && taskBucket->keepRunning(tr, task));

            // If the snapshot batch future key does not exist, this is the first execution of this dispatch
            // task so
            //    - create and set the snapshot batch future key
            //    - initialize the batch size to 0
            //    - initialize the target snapshot end version if it is not yet set
            //    - commit
            // NOTE(review): nothing in this branch writes snapshotTargetEndVersion, so the third bullet looks
            // stale - confirm.
            if (!snapshotBatchFutureKey.present()) {
                snapshotBatchFuture = futureBucket->future(tr);
                config.snapshotBatchFuture().set(tr, snapshotBatchFuture->pack());
                snapshotBatchSize = 0;
                config.snapshotBatchSize().set(tr, snapshotBatchSize.get());

                // The dispatch of this batch can take multiple separate executions if the executor fails
                // so store a completion key for the dispatch finish() to set when dispatching the batch is
                // done.
                state TaskCompletionKey dispatchCompletionKey = TaskCompletionKey::joinWith(snapshotBatchFuture);
                // this is a bad hack - but flow doesn't work well with lambda functions and capturing
                // state variables...
                auto cfg = &config;
                auto tx = &tr;
                wait(map(dispatchCompletionKey.get(tr, taskBucket), [cfg, tx](Key const& k) {
                    cfg->snapshotBatchDispatchDoneKey().set(*tx, k);
                    return Void();
                }));

                wait(tr->commit());
            } else {
                ASSERT(snapshotBatchSize.present());
                // Batch future key exists in the config so create future from it
                snapshotBatchFuture = makeReference(futureBucket, snapshotBatchFutureKey.get());
            }

            break;
        } catch (Error& e) {
            wait(tr->onError(e));
        }
    }

    // Read all dispatched ranges.
    // The dispatch map is read in pages; paging continues while the result reports `more`.
    state std::vector> dispatchBoundaries;
    tr->reset();
    beginKey = allKeys.begin;
    loop {
        try {
            tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
            tr->setOption(FDBTransactionOptions::LOCK_AWARE);

            state Future bounds = config.snapshotRangeDispatchMap().getRange(
                tr, beginKey, keyAfter(allKeys.end), CLIENT_KNOBS->TOO_MANY);
            wait(success(bounds) && taskBucket->keepRunning(tr, task) &&
                 store(recentReadVersion, tr->getReadVersion()));

            if (!bounds.get().results.empty()) {
                dispatchBoundaries.reserve(dispatchBoundaries.size() + bounds.get().results.size());
                dispatchBoundaries.insert(
                    dispatchBoundaries.end(), bounds.get().results.begin(), bounds.get().results.end());
            }

            if (!bounds.get().more)
                break;

            beginKey = keyAfter(bounds.get().results.back().first);
            tr->reset();
        } catch (Error& e) {
            wait(tr->onError(e));
        }
    }

    // The next few sections involve combining the results above.  Yields are used after operations
    // that could have operated on many thousands of things and in loops which could have many
    // thousands of iterations.
    // Declare some common iterators which must be state vars and will be used multiple times.
    state int i;
    state RangeMap::iterator iShard;
    state RangeMap::iterator iShardEnd;

    // Set anything inside a dispatched range to DONE.
    // Also ensure that the boundary values are true, false, [true, false]...
    if (dispatchBoundaries.size() > 0) {
        state bool lastValue = false;
        state Key lastKey;
        for (i = 0; i < dispatchBoundaries.size(); ++i) {
            const std::pair& boundary = dispatchBoundaries[i];

            // Values must alternate
            ASSERT(boundary.second == !lastValue);

            // If this was the end of a dispatched range
            if (!boundary.second) {
                // Ensure that the dispatched boundaries exist AND set all shard ranges in the dispatched range
                // to DONE.
                RangeMap::Ranges shardRanges = shardMap.modify(KeyRangeRef(lastKey, boundary.first));
                iShard = shardRanges.begin();
                iShardEnd = shardRanges.end();
                for (; iShard != iShardEnd; ++iShard) {
                    iShard->value() = DONE;
                    wait(yield());
                }
            }
            lastValue = dispatchBoundaries[i].second;
            lastKey = dispatchBoundaries[i].first;

            wait(yield());
        }
        // The last boundary must close a range.
        ASSERT(lastValue == false);
    }

    // Set anything outside the backup ranges to SKIP.  We can use insert() here instead of modify()
    // because it's OK to delete shard boundaries in the skipped ranges.
    if (backupRanges.size() > 0) {
        // Everything before the first backup range
        shardMap.insert(KeyRangeRef(allKeys.begin, backupRanges.front().begin), SKIP);
        wait(yield());

        // The gaps between consecutive backup ranges
        for (i = 0; i < backupRanges.size() - 1; ++i) {
            shardMap.insert(KeyRangeRef(backupRanges[i].end, backupRanges[i + 1].begin), SKIP);
            wait(yield());
        }

        // Everything after the last backup range
        shardMap.insert(KeyRangeRef(backupRanges.back().end, allKeys.end), SKIP);
        wait(yield());
    }

    state int countShardsDone = 0;
    state int countShardsNotDone = 0;

    // Scan through the shard map, counting the DONE and NOT_DONE shards. SKIP ranges are not counted.
    RangeMap::Ranges shardRanges = shardMap.ranges();
    iShard = shardRanges.begin();
    iShardEnd = shardRanges.end();
    for (; iShard != iShardEnd; ++iShard) {
        if (iShard->value() == DONE) {
            ++countShardsDone;
        } else if (iShard->value() >= NOT_DONE_MIN)
            ++countShardsNotDone;

        wait(yield());
    }

    // Coalesce the shard map to make random selection below more efficient.
    shardMap.coalesce(allKeys);
    wait(yield());

    // In this context "all" refers to all of the shards relevant for this particular backup
    state int countAllShards = countShardsDone + countShardsNotDone;

    // Nothing left to dispatch: report the snapshot as finished and stop here.
    if (countShardsNotDone == 0) {
        TraceEvent("FileBackupSnapshotDispatchFinished")
            .detail("BackupUID", config.getUid())
            .detail("AllShards", countAllShards)
            .detail("ShardsDone", countShardsDone)
            .detail("ShardsNotDone", countShardsNotDone)
            .detail("SnapshotBeginVersion", snapshotBeginVersion)
            .detail("SnapshotTargetEndVersion", snapshotTargetEndVersion)
            .detail("CurrentVersion", recentReadVersion)
            .detail("SnapshotIntervalSeconds", snapshotIntervalSeconds);
        Params.snapshotFinished().set(task, true);
        return Void();
    }

    // Decide when the next snapshot dispatch should run.
    state Version nextDispatchVersion;

    // In simulation, use snapshot interval / 5 to ensure multiple dispatches run
    // Otherwise, use the knob for the number of seconds between snapshot dispatch tasks.
    if (g_network->isSimulated())
        nextDispatchVersion =
            recentReadVersion + CLIENT_KNOBS->CORE_VERSIONSPERSECOND * (snapshotIntervalSeconds / 5.0);
    else
        nextDispatchVersion = recentReadVersion + CLIENT_KNOBS->CORE_VERSIONSPERSECOND *
                                                      CLIENT_KNOBS->BACKUP_SNAPSHOT_DISPATCH_INTERVAL_SEC;

    // If nextDispatchVersion is greater than snapshotTargetEndVersion (which could be in the past) then just
    // use the greater of recentReadVersion or snapshotTargetEndVersion.  Any range tasks created in this
    // dispatch will be scheduled at a random time between recentReadVersion and nextDispatchVersion, so
    // nextDispatchVersion shouldn't be less than recentReadVersion.
    if (nextDispatchVersion > snapshotTargetEndVersion)
        nextDispatchVersion = std::max(recentReadVersion, snapshotTargetEndVersion);

    Params.nextDispatchVersion().set(task, nextDispatchVersion);

    // Calculate number of shards that should be done before the next interval end
    // timeElapsed is between 0 and 1 and represents what portion of the shards we should have completed by now
    double timeElapsed;
    Version snapshotScheduledVersionInterval = snapshotTargetEndVersion - snapshotBeginVersion;
    if (snapshotTargetEndVersion > snapshotBeginVersion)
        timeElapsed = std::min(
            1.0, (double)(nextDispatchVersion - snapshotBeginVersion) / (snapshotScheduledVersionInterval));
    else
        timeElapsed = 1.0;

    state int countExpectedShardsDone = countAllShards * timeElapsed;
    state int countShardsToDispatch = std::max(0, countExpectedShardsDone - countShardsDone);

    // Calculate the number of shards that would have been dispatched by a normal (on-schedule)
    // BackupSnapshotDispatchTask given the dispatch window and the start and expected-end versions of the
    // current snapshot.
    int64_t dispatchWindow = nextDispatchVersion - recentReadVersion;

    // If the scheduled snapshot interval is 0 (such as for initial, as-fast-as-possible snapshot) then all
    // shards are considered late
    int countShardsExpectedPerNormalWindow;
    if (snapshotScheduledVersionInterval == 0) {
        countShardsExpectedPerNormalWindow = 0;
    } else {
        // A dispatchWindow of 0 means the target end version is <= now which also results in all shards being
        // considered late
        countShardsExpectedPerNormalWindow =
            (double(dispatchWindow) / snapshotScheduledVersionInterval) * countAllShards;
    }

    // The number of shards 'behind' the snapshot is the count of how many additional shards beyond normal are
    // being dispatched, if any.
    int countShardsBehind =
        std::max(0, countShardsToDispatch + snapshotBatchSize.get() - countShardsExpectedPerNormalWindow);
    Params.shardsBehind().set(task, countShardsBehind);

    TraceEvent("FileBackupSnapshotDispatchStats")
        .detail("BackupUID", config.getUid())
        .detail("AllShards", countAllShards)
        .detail("ShardsDone", countShardsDone)
        .detail("ShardsNotDone", countShardsNotDone)
        .detail("ExpectedShardsDone", countExpectedShardsDone)
        .detail("ShardsToDispatch", countShardsToDispatch)
        .detail("ShardsBehind", countShardsBehind)
        .detail("SnapshotBeginVersion", snapshotBeginVersion)
        .detail("SnapshotTargetEndVersion", snapshotTargetEndVersion)
        .detail("NextDispatchVersion", nextDispatchVersion)
        .detail("CurrentVersion", recentReadVersion)
        .detail("TimeElapsed", timeElapsed)
        .detail("SnapshotIntervalSeconds", snapshotIntervalSeconds);

    // Dispatch random shards to catch up to the expected progress
    while (countShardsToDispatch > 0) {
        // First select ranges to add
        state std::vector rangesToAdd;

        // Limit number of tasks added per transaction
        int taskBatchSize = BUGGIFY ? deterministicRandom()->randomInt(1, countShardsToDispatch + 1)
                                    : CLIENT_KNOBS->BACKUP_DISPATCH_ADDTASK_SIZE;
        int added = 0;

        while (countShardsToDispatch > 0 && added < taskBatchSize && shardMap.size() > 0) {
            // Get a random range.
            auto it = shardMap.randomRange();
            // Find a NOT_DONE range and add it to rangesToAdd.
            // The scan moves forward from the random start and gives up at the end of the map, in which
            // case the outer loop picks a new random start.
            while (1) {
                if (it->value() >= NOT_DONE_MIN) {
                    rangesToAdd.push_back(it->range());
                    it->value() = DONE;
                    shardMap.coalesce(Key(it->begin()));
                    ++added;
                    ++countShardsDone;
                    --countShardsToDispatch;
                    --countShardsNotDone;
                    break;
                }
                if (it->end() == shardMap.mapEnd)
                    break;
                ++it;
            }
        }

        state int64_t oldBatchSize = snapshotBatchSize.get();
        state int64_t newBatchSize = oldBatchSize + rangesToAdd.size();

        // Now add the selected ranges in a single transaction.
        tr->reset();
        loop {
            try {
                TraceEvent("FileBackupSnapshotDispatchAddingTasks")
                    .suppressFor(2)
                    .detail("TasksToAdd", rangesToAdd.size())
                    .detail("NewBatchSize", newBatchSize);

                tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
                tr->setOption(FDBTransactionOptions::LOCK_AWARE);

                // For each range, make sure it isn't set in the dispatched range map.
                state std::vector>> beginReads;
                state std::vector>> endReads;

                for (auto& range : rangesToAdd) {
                    beginReads.push_back(config.snapshotRangeDispatchMap().get(tr, range.begin));
                    endReads.push_back(config.snapshotRangeDispatchMap().get(tr, range.end));
                }

                wait(store(snapshotBatchSize.get(), config.snapshotBatchSize().getOrThrow(tr)) &&
                     waitForAll(beginReads) && waitForAll(endReads) && taskBucket->keepRunning(tr, task));

                // Snapshot batch size should be either oldBatchSize or newBatchSize. If new, this transaction
                // is already done.
                if (snapshotBatchSize.get() == newBatchSize) {
                    break;
                } else {
                    ASSERT(snapshotBatchSize.get() == oldBatchSize);
                    config.snapshotBatchSize().set(tr, newBatchSize);
                    snapshotBatchSize = newBatchSize;
                    config.snapshotDispatchLastShardsBehind().set(tr, Params.shardsBehind().get(task));
                    config.snapshotDispatchLastVersion().set(tr, tr->getReadVersion().get());
                }

                state std::vector> addTaskFutures;

                for (i = 0; i < beginReads.size(); ++i) {
                    KeyRange& range = rangesToAdd[i];

                    // This loop might have made changes to begin or end boundaries in a prior
                    // iteration.  If so, the updated values exist in the RYW cache so re-read both entries.
                    Optional beginValue = config.snapshotRangeDispatchMap().get(tr, range.begin).get();
                    Optional endValue = config.snapshotRangeDispatchMap().get(tr, range.end).get();

                    ASSERT(!beginValue.present() || !endValue.present() || beginValue != endValue);

                    // If begin is present, it must be a range end so value must be false
                    // If end is present, it must be a range begin so value must be true
                    if ((!beginValue.present() || !beginValue.get()) && (!endValue.present() || endValue.get())) {
                        // An existing boundary at either edge means the new range touches an already
                        // dispatched range there, so that boundary is erased; otherwise a new one is written.
                        if (beginValue.present()) {
                            config.snapshotRangeDispatchMap().erase(tr, range.begin);
                        } else {
                            config.snapshotRangeDispatchMap().set(tr, range.begin, true);
                        }
                        if (endValue.present()) {
                            config.snapshotRangeDispatchMap().erase(tr, range.end);
                        } else {
                            config.snapshotRangeDispatchMap().set(tr, range.end, false);
                        }

                        Version scheduledVersion = invalidVersion;
                        // If the next dispatch version is in the future, choose a random version at which to
                        // start the new task.
                        if (nextDispatchVersion > recentReadVersion)
                            scheduledVersion = recentReadVersion + deterministicRandom()->random01() *
                                                                       (nextDispatchVersion - recentReadVersion);

                        // Range tasks during the initial snapshot should run at a higher priority
                        int priority = latestSnapshotEndVersion.present() ? 0 : 1;
                        addTaskFutures.push_back(
                            success(BackupRangeTaskFunc::addTask(tr,
                                                                 taskBucket,
                                                                 task,
                                                                 priority,
                                                                 range.begin,
                                                                 range.end,
                                                                 TaskCompletionKey::joinWith(snapshotBatchFuture),
                                                                 Reference(),
                                                                 scheduledVersion)));

                        TraceEvent("FileBackupSnapshotRangeDispatched")
                            .suppressFor(2)
                            .detail("BackupUID", config.getUid())
                            .detail("CurrentVersion", recentReadVersion)
                            .detail("ScheduledVersion", scheduledVersion)
                            .detail("BeginKey", range.begin.printable())
                            .detail("EndKey", range.end.printable());
                    } else {
                        // This shouldn't happen because if the transaction was already done or if another
                        // execution of this task is making progress it should have been detected above.
                        ASSERT(false);
                    }
                }

                wait(waitForAll(addTaskFutures));
                wait(tr->commit());
                break;
            } catch (Error& e) {
                wait(tr->onError(e));
            }
        }
    }

    // The dispatch loop above may have handed out the last remaining shard.
    if (countShardsNotDone == 0) {
        TraceEvent("FileBackupSnapshotDispatchFinished")
            .detail("BackupUID", config.getUid())
            .detail("AllShards", countAllShards)
            .detail("ShardsDone", countShardsDone)
            .detail("ShardsNotDone", countShardsNotDone)
            .detail("SnapshotBeginVersion", snapshotBeginVersion)
            .detail("SnapshotTargetEndVersion", snapshotTargetEndVersion)
            .detail("CurrentVersion", recentReadVersion)
            .detail("SnapshotIntervalSeconds", snapshotIntervalSeconds)
            .detail("DispatchTimeSeconds", timer() - startTime);
        Params.snapshotFinished().set(task, true);
    }

    return Void();
}
// Finish phase of the snapshot dispatch task.
//
// Loads the batch future key and the dispatch-done key from the backup config, clears the per-batch
// config entries, records the latest shardsBehind value and read version, and then schedules the
// follow-up task: a snapshot manifest task when the execute phase flagged the snapshot as finished,
// another dispatch task otherwise. Either follow-up waits on the batch future and signals the done
// future of the current task. Finally the dispatch-done future is set and this task is finished.
ACTOR static Future _finish(Reference tr,
                            Reference taskBucket,
                            Reference futureBucket,
                            Reference task) {
    state BackupConfig backupConfig(task);

    // Fetch both keys before the config entries holding them are cleared below.
    state Key batchFutureKey;
    state Key dispatchDoneKey;
    wait(store(batchFutureKey, backupConfig.snapshotBatchFuture().getOrThrow(tr)) &&
         store(dispatchDoneKey, backupConfig.snapshotBatchDispatchDoneKey().getOrThrow(tr)));

    state Reference batchFuture = futureBucket->unpack(batchFutureKey);
    state Reference dispatchDoneFuture = futureBucket->unpack(dispatchDoneKey);

    // Drop the per-batch state from the config.
    backupConfig.snapshotBatchFuture().clear(tr);
    backupConfig.snapshotBatchDispatchDoneKey().clear(tr);
    backupConfig.snapshotBatchSize().clear(tr);

    // shardsBehind is written again here because the execute phase may not have created any shard tasks.
    backupConfig.snapshotDispatchLastShardsBehind().set(tr, Params.shardsBehind().getOrDefault(task, 0));
    backupConfig.snapshotDispatchLastVersion().set(tr, tr->getReadVersion().get());

    // The done future of the current task is passed on to whichever task comes next.
    state Reference doneFuture = task->getDoneFuture(futureBucket);

    if (!Params.snapshotFinished().getOrDefault(task, false)) {
        // Snapshot still in progress: queue another dispatch task at the version chosen by execute.
        wait(success(addTask(tr,
                             taskBucket,
                             task,
                             1,
                             TaskCompletionKey::signal(doneFuture),
                             batchFuture,
                             Params.nextDispatchVersion().get(task))));
    } else {
        // Snapshot finished: the next step is writing the snapshot manifest.
        wait(success(addSnapshotManifestTask(
            tr, taskBucket, task, TaskCompletionKey::signal(doneFuture), batchFuture)));
    }

    // Dispatching for this snapshot batch is complete, so set the batch done future.
    wait(dispatchDoneFuture->set(tr, taskBucket));

    wait(taskBucket->finish(tr, task));

    return Void();
}