/* *RocksDBCheckpointUtils.actor.cpp * * This source file is part of the FoundationDB open source project * * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #include "fdbserver/RocksDBCheckpointUtils.actor.h" #include "fdbclient/FDBTypes.h" #include "fdbclient/NativeAPI.actor.h" #include "fdbclient/StorageCheckpoint.h" #include "flow/Trace.h" #include "flow/flow.h" #include "flow/actorcompiler.h" // has to be last include namespace { class RocksDBCheckpointReader : public ICheckpointReader { public: RocksDBCheckpointReader(const CheckpointMetaData& checkpoint, UID logID) : checkpoint_(checkpoint), id_(logID), file_(Reference()), offset_(0) {} Future init(StringRef token) override; Future nextKeyValues(const int rowLimit, const int byteLimit) override { throw not_implemented(); } // Returns the next chunk of serialized checkpoint. Future> nextChunk(const int byteLimit) override; Future close() override; private: ACTOR static Future doInit(RocksDBCheckpointReader* self) { ASSERT(self != nullptr); try { state Reference _file = wait(IAsyncFileSystem::filesystem()->open( self->path_, IAsyncFile::OPEN_READONLY | IAsyncFile::OPEN_UNCACHED | IAsyncFile::OPEN_NO_AIO, 0)); self->file_ = _file; TraceEvent("RocksDBCheckpointReaderOpenFile").detail("File", self->path_); } catch (Error& e) { TraceEvent(SevWarnAlways, "ServerGetCheckpointFileFailure") .errorUnsuppressed(e) .detail("File", self->path_); throw e; } return Void(); } ACTOR static Future> getNextChunk(RocksDBCheckpointReader* self, int byteLimit) { int blockSize = std::min(64 * 1024, byteLimit); // Block size read from disk. state Standalone buf = makeAlignedString(_PAGE_SIZE, blockSize); int bytesRead = wait(self->file_->read(mutateString(buf), blockSize, self->offset_)); if (bytesRead == 0) { throw end_of_stream(); } self->offset_ += bytesRead; return buf.substr(0, bytesRead); } ACTOR static Future doClose(RocksDBCheckpointReader* self) { wait(delay(0, TaskPriority::FetchKeys)); delete self; return Void(); } CheckpointMetaData checkpoint_; UID id_; Reference file_; int offset_; std::string path_; }; Future RocksDBCheckpointReader::init(StringRef token) { ASSERT_EQ(this->checkpoint_.getFormat(), RocksDBColumnFamily); const std::string name = token.toString(); this->offset_ = 0; this->path_.clear(); const RocksDBColumnFamilyCheckpoint rocksCF = getRocksCF(this->checkpoint_); for (const auto& sstFile : rocksCF.sstFiles) { if (sstFile.name == name) { this->path_ = sstFile.db_path + sstFile.name; break; } } if (this->path_.empty()) { TraceEvent("RocksDBCheckpointReaderInitFileNotFound").detail("File", this->path_); return checkpoint_not_found(); } return doInit(this); } Future> RocksDBCheckpointReader::nextChunk(const int byteLimit) { return getNextChunk(this, byteLimit); } Future RocksDBCheckpointReader::close() { return doClose(this); } // Fetch a single sst file from storage server. If the file is fetch successfully, it will be recorded via cFun. ACTOR Future fetchCheckpointFile(Database cx, std::shared_ptr metaData, int idx, std::string dir, std::function(const CheckpointMetaData&)> cFun, int maxRetries = 3) { state RocksDBColumnFamilyCheckpoint rocksCF; ObjectReader reader(metaData->serializedCheckpoint.begin(), IncludeVersion()); reader.deserialize(rocksCF); // Skip fetched file. if (rocksCF.sstFiles[idx].fetched && rocksCF.sstFiles[idx].db_path == dir) { return Void(); } state std::string remoteFile = rocksCF.sstFiles[idx].name; state std::string localFile = dir + rocksCF.sstFiles[idx].name; state UID ssID = metaData->ssID; state Transaction tr(cx); state StorageServerInterface ssi; loop { try { tr.setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); Optional ss = wait(tr.get(serverListKeyFor(ssID))); if (!ss.present()) { throw checkpoint_not_found(); } ssi = decodeServerListValue(ss.get()); break; } catch (Error& e) { wait(tr.onError(e)); } } state int attempt = 0; loop { try { ++attempt; TraceEvent("FetchCheckpointFileBegin") .detail("RemoteFile", remoteFile) .detail("TargetUID", ssID.toString()) .detail("StorageServer", ssi.id().toString()) .detail("LocalFile", localFile) .detail("Attempt", attempt); wait(IAsyncFileSystem::filesystem()->deleteFile(localFile, true)); const int64_t flags = IAsyncFile::OPEN_ATOMIC_WRITE_AND_CREATE | IAsyncFile::OPEN_READWRITE | IAsyncFile::OPEN_CREATE | IAsyncFile::OPEN_UNCACHED | IAsyncFile::OPEN_NO_AIO; state int64_t offset = 0; state Reference asyncFile = wait(IAsyncFileSystem::filesystem()->open(localFile, flags, 0666)); state ReplyPromiseStream stream = ssi.fetchCheckpoint.getReplyStream(FetchCheckpointRequest(metaData->checkpointID, remoteFile)); TraceEvent("FetchCheckpointFileReceivingData") .detail("RemoteFile", remoteFile) .detail("TargetUID", ssID.toString()) .detail("StorageServer", ssi.id().toString()) .detail("LocalFile", localFile) .detail("Attempt", attempt); loop { state FetchCheckpointReply rep = waitNext(stream.getFuture()); wait(asyncFile->write(rep.data.begin(), rep.data.size(), offset)); wait(asyncFile->flush()); offset += rep.data.size(); } } catch (Error& e) { if (e.code() != error_code_end_of_stream) { TraceEvent("FetchCheckpointFileError") .errorUnsuppressed(e) .detail("RemoteFile", remoteFile) .detail("StorageServer", ssi.toString()) .detail("LocalFile", localFile) .detail("Attempt", attempt); if (attempt >= maxRetries) { throw e; } } else { wait(asyncFile->sync()); int64_t fileSize = wait(asyncFile->size()); TraceEvent("FetchCheckpointFileEnd") .detail("RemoteFile", remoteFile) .detail("StorageServer", ssi.toString()) .detail("LocalFile", localFile) .detail("Attempt", attempt) .detail("DataSize", offset) .detail("FileSize", fileSize); rocksCF.sstFiles[idx].db_path = dir; rocksCF.sstFiles[idx].fetched = true; metaData->serializedCheckpoint = ObjectWriter::toValue(rocksCF, IncludeVersion()); if (cFun) { wait(cFun(*metaData)); } return Void(); } } } } } // namespace ACTOR Future fetchRocksDBCheckpoint(Database cx, CheckpointMetaData initialState, std::string dir, std::function(const CheckpointMetaData&)> cFun) { TraceEvent("FetchRocksCheckpointBegin") .detail("InitialState", initialState.toString()) .detail("CheckpointDir", dir); state std::shared_ptr metaData = std::make_shared(initialState); if (metaData->format == RocksDBColumnFamily) { state RocksDBColumnFamilyCheckpoint rocksCF = getRocksCF(initialState); TraceEvent("RocksDBCheckpointMetaData").detail("RocksCF", rocksCF.toString()); state int i = 0; state std::vector> fs; for (; i < rocksCF.sstFiles.size(); ++i) { fs.push_back(fetchCheckpointFile(cx, metaData, i, dir, cFun)); TraceEvent("GetCheckpointFetchingFile") .detail("FileName", rocksCF.sstFiles[i].name) .detail("Server", metaData->ssID.toString()); } wait(waitForAll(fs)); } else { throw not_implemented(); } return *metaData; } ACTOR Future deleteRocksCFCheckpoint(CheckpointMetaData checkpoint) { ASSERT_EQ(checkpoint.getFormat(), RocksDBColumnFamily); RocksDBColumnFamilyCheckpoint rocksCF = getRocksCF(checkpoint); TraceEvent("DeleteRocksColumnFamilyCheckpoint", checkpoint.checkpointID) .detail("CheckpointID", checkpoint.checkpointID) .detail("RocksCF", rocksCF.toString()); state std::unordered_set dirs; for (const LiveFileMetaData& file : rocksCF.sstFiles) { dirs.insert(file.db_path); } state std::unordered_set::iterator it = dirs.begin(); for (; it != dirs.end(); ++it) { const std::string dir = *it; platform::eraseDirectoryRecursive(dir); TraceEvent("DeleteCheckpointRemovedDir", checkpoint.checkpointID) .detail("CheckpointID", checkpoint.checkpointID) .detail("Dir", dir); wait(delay(0, TaskPriority::FetchKeys)); } return Void(); } ICheckpointReader* newRocksDBCheckpointReader(const CheckpointMetaData& checkpoint, UID logID) { return new RocksDBCheckpointReader(checkpoint, logID); } RocksDBColumnFamilyCheckpoint getRocksCF(const CheckpointMetaData& checkpoint) { RocksDBColumnFamilyCheckpoint rocksCF; ObjectReader reader(checkpoint.serializedCheckpoint.begin(), IncludeVersion()); reader.deserialize(rocksCF); return rocksCF; }