/*
 * BlobGranuleServerCommon.actor.cpp
 *
 * This source file is part of the FoundationDB open source project
 *
 * Copyright 2013-2021 Apple Inc. and the FoundationDB project authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "fmt/format.h"
#include "fdbclient/BlobGranuleCommon.h"
#include "fdbclient/CommitTransaction.h"
#include "fdbclient/FDBTypes.h"
#include "fdbclient/ReadYourWrites.h"
#include "fdbclient/SystemData.h"
#include "fdbserver/BlobGranuleServerCommon.actor.h"
#include "fdbserver/Knobs.h"
#include "flow/Arena.h"
#include "flow/UnitTest.h"
#include "flow/actorcompiler.h" // has to be last include

// serialize change feed key as UID bytes, to use 16 bytes on disk
Key granuleIDToCFKey(UID granuleID) {
	BinaryWriter wr(Unversioned());
	wr << granuleID;
	return wr.toValue();
}

// parse change feed key back to UID, to be human-readable
// Inverse of granuleIDToCFKey: deserializes the 16-byte key back into a UID.
UID cfKeyToGranuleID(Key cfKey) {
	return BinaryReader::fromStringRef<UID>(cfKey, Unversioned());
}

// Gets the latest granule history node for range that was persisted.
// Reads at most one entry (reverse scan over the history key range for `range`) and
// returns an empty Optional when no history has been persisted for the range.
ACTOR Future<Optional<GranuleHistory>> getLatestGranuleHistory(Transaction* tr, KeyRange range) {
	state KeyRange historyRange = blobGranuleHistoryKeyRangeFor(range);
	// Reverse::True + limit 1 yields the newest (highest-version) history entry, if any.
	state RangeResult result = wait(tr->getRange(historyRange, 1, Snapshot::False, Reverse::True));

	ASSERT(result.size() <= 1);

	Optional<GranuleHistory> history;
	if (!result.empty()) {
		std::pair<KeyRange, Version> decodedKey = decodeBlobGranuleHistoryKey(result[0].key);
		// The history key must describe exactly the queried range.
		ASSERT(range == decodedKey.first);
		history = GranuleHistory(range, decodedKey.second, decodeBlobGranuleHistoryValue(result[0].value));
	}
	return history;
}

// Gets the files based on the file key range [startKey, endKey)
// and populates the files object accordingly.
// `startKey` is an in/out cursor: it is advanced past each consumed batch, so a caller
// whose transaction is retried (see loadHistoryFiles) resumes where it left off.
// Files are appended in key order; snapshot ('S') and delta ('D') lists must each stay
// strictly version-ordered, which the ASSERTs below enforce.
ACTOR Future<Void> readGranuleFiles(Transaction* tr, Key* startKey, Key endKey, GranuleFiles* files, UID granuleID) {
	loop {
		// BUGGIFY shrinks the batch limit to exercise the pagination (res.more) path in simulation.
		int lim = BUGGIFY ? 2 : 1000;
		RangeResult res = wait(tr->getRange(KeyRangeRef(*startKey, endKey), lim));
		for (auto& it : res) {
			UID gid;
			uint8_t fileType;
			Version version;

			Standalone<StringRef> filename;
			int64_t offset;
			int64_t length;
			int64_t fullFileLength;
			Optional<BlobGranuleCipherKeysMeta> cipherKeysMeta;

			std::tie(gid, version, fileType) = decodeBlobGranuleFileKey(it.key);
			// Every key in this range must belong to the requested granule.
			ASSERT(gid == granuleID);

			std::tie(filename, offset, length, fullFileLength, cipherKeysMeta) = decodeBlobGranuleFileValue(it.value);

			BlobFileIndex idx(version, filename.toString(), offset, length, fullFileLength, cipherKeysMeta);
			if (fileType == 'S') {
				// snapshot files must arrive in strictly increasing version order
				ASSERT(files->snapshotFiles.empty() || files->snapshotFiles.back().version < idx.version);
				files->snapshotFiles.push_back(idx);
			} else {
				// only two file types exist: 'S' (snapshot) and 'D' (delta)
				ASSERT(fileType == 'D');
				ASSERT(files->deltaFiles.empty() || files->deltaFiles.back().version < idx.version);
				files->deltaFiles.push_back(idx);
			}
		}
		if (res.more) {
			// advance the resumable cursor past the last key we consumed
			*startKey = keyAfter(res.back().key);
		} else {
			break;
		}
	}
	return Void();
}

// Wrapper around readGranuleFiles
// Gets all files belonging to the granule with id granule ID.
// Retries on transaction errors; `startKey` and `files` are `state` so progress
// accumulated before a retry is preserved (readGranuleFiles resumes from startKey).
ACTOR Future<GranuleFiles> loadHistoryFiles(Database cx, UID granuleID) {
	state KeyRange range = blobGranuleFileKeyRangeFor(granuleID);
	state Key startKey = range.begin;
	state GranuleFiles files;
	state Transaction tr(cx);

	loop {
		try {
			tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
			tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
			wait(readGranuleFiles(&tr, &startKey, range.end, &files, granuleID));
			return files;
		} catch (Error& e) {
			wait(tr.onError(e));
		}
	}
}

// Normally a beginVersion != 0 means the caller
// wants all mutations between beginVersion and readVersion, instead of
// the latest snapshot before readVersion + deltas after the snapshot. When canCollapse is set, the beginVersion is
// essentially just an optimization hint. The caller is still concerned with reconstructing rows at readVersion, it just
// knows it doesn't need anything before beginVersion.
// Normally this can eliminate the need for a snapshot and just return a small amount of deltas. But in a highly active
// key range, the granule may have a snapshot file at version X, where beginVersion < X <= readVersion. In this case, if
// the number of bytes in delta files between beginVersion and X is larger than the snapshot file at version X, it is
// strictly more efficient (in terms of files and bytes read) to just use the snapshot file at version X instead.
//
// To assist BlobGranule (snapshot and/or delta) file encryption, the routine, while populating snapshot and/or
// delta files, constructs the BlobFilePointerRef->cipherKeysMeta field. This approach avoids defining this method as an
// ACTOR, since fetching the desired EncryptionKey could otherwise involve reaching out to the EncryptKeyProxy or an
// external KMS.
// Selects the (optional) snapshot file and the delta files needed to reconstruct rows at
// readVersion, honoring beginVersion / canCollapse as described in the comment above.
// Results are appended to `chunk` (strings deep-copied into replyArena); the total delta
// bytes selected are added to deltaBytesCounter.
void GranuleFiles::getFiles(Version beginVersion,
                            Version readVersion,
                            bool canCollapse,
                            BlobGranuleChunkRef& chunk,
                            Arena& replyArena,
                            int64_t& deltaBytesCounter) const {
	BlobFileIndex dummyIndex; // for searching

	// if beginVersion == 0 or we can collapse, find the latest snapshot <= readVersion
	auto snapshotF = snapshotFiles.end();
	if (beginVersion == 0 || canCollapse) {
		dummyIndex.version = readVersion;
		snapshotF = std::lower_bound(snapshotFiles.begin(), snapshotFiles.end(), dummyIndex);
		if (snapshotF == snapshotFiles.end() || snapshotF->version > readVersion) {
			// lower_bound landed past readVersion (or at end); step back to the last snapshot <= readVersion
			ASSERT(snapshotF != snapshotFiles.begin());
			snapshotF--;
		}
		ASSERT(snapshotF != snapshotFiles.end());
		ASSERT(snapshotF->version <= readVersion);
	}

	auto deltaF = deltaFiles.end();
	if (beginVersion > 0) {
		// start deltas at the first delta file with version >= beginVersion
		dummyIndex.version = beginVersion;
		deltaF = std::lower_bound(deltaFiles.begin(), deltaFiles.end(), dummyIndex);
		if (canCollapse) {
			ASSERT(snapshotF != snapshotFiles.end());
			// If we can collapse, see if delta files up to snapshotVersion are smaller or larger than snapshotBytes in
			// total
			auto deltaFCopy = deltaF;
			int64_t snapshotBytes = snapshotF->length;
			while (deltaFCopy != deltaFiles.end() && deltaFCopy->version <= snapshotF->version && snapshotBytes > 0) {
				snapshotBytes -= deltaFCopy->length;
				deltaFCopy++;
			}
			// if delta files contain the same or more bytes as the snapshot with collapse, do the collapse
			if (snapshotBytes > 0) {
				// don't collapse, clear snapshotF and just do delta files
				snapshotF = snapshotFiles.end();
			} else {
				// do snapshot instead of previous deltas: skip deltas at or before the snapshot's version
				dummyIndex.version = snapshotF->version;
				deltaF = std::upper_bound(deltaFiles.begin(), deltaFiles.end(), dummyIndex);
				ASSERT(deltaF == deltaFiles.end() || deltaF->version > snapshotF->version);
			}
		}
	} else {
		// beginVersion == 0: serve the snapshot plus only the deltas strictly after it
		dummyIndex.version = snapshotF->version;
		deltaF = std::upper_bound(deltaFiles.begin(), deltaFiles.end(), dummyIndex);
		ASSERT(deltaF == deltaFiles.end() || deltaF->version > snapshotF->version);
	}

	// lastIncluded tracks the highest version already covered by the chunk, to keep
	// delta versions strictly increasing and to decide whether one more delta is needed.
	Version lastIncluded = invalidVersion;
	if (snapshotF != snapshotFiles.end()) {
		chunk.snapshotVersion = snapshotF->version;
		chunk.snapshotFile = BlobFilePointerRef(replyArena,
		                                        snapshotF->filename,
		                                        snapshotF->offset,
		                                        snapshotF->length,
		                                        snapshotF->fullFileLength,
		                                        snapshotF->cipherKeysMeta);
		lastIncluded = chunk.snapshotVersion;
	} else {
		chunk.snapshotVersion = invalidVersion;
	}

	while (deltaF != deltaFiles.end() && deltaF->version < readVersion) {
		chunk.deltaFiles.emplace_back_deep(replyArena,
		                                   deltaF->filename,
		                                   deltaF->offset,
		                                   deltaF->length,
		                                   deltaF->fullFileLength,
		                                   deltaF->cipherKeysMeta);
		deltaBytesCounter += deltaF->length;
		ASSERT(lastIncluded < deltaF->version);
		lastIncluded = deltaF->version;
		deltaF++;
	}
	// include last delta file that passes readVersion, if it exists
	if (deltaF != deltaFiles.end() && lastIncluded < readVersion) {
		chunk.deltaFiles.emplace_back_deep(replyArena,
		                                   deltaF->filename,
		                                   deltaF->offset,
		                                   deltaF->length,
		                                   deltaF->fullFileLength,
		                                   deltaF->cipherKeysMeta);
		deltaBytesCounter += deltaF->length;
		lastIncluded = deltaF->version;
	}
}

// Unit-test helper: deterministic filename for a file at version v.
static std::string makeTestFileName(Version v) {
	return "test" + std::to_string(v);
}

// Unit-test helper: a BlobFileIndex at version v of `len` bytes (offset 0, full length == len).
static BlobFileIndex makeTestFile(Version v, int64_t len) {
	return BlobFileIndex(v, makeTestFileName(v), 0, len, len);
}

// Unit-test helper: asserts that `actualFile` is the test file for `expectedVersion`.
static void checkFile(int expectedVersion, const BlobFilePointerRef& actualFile) {
	ASSERT(makeTestFileName(expectedVersion) == actualFile.filename.toString());
}

// Unit-test helper: runs getFiles with the given arguments, prints expected vs. actual
// file selections, and asserts they match.
static void checkFiles(const GranuleFiles& f,
                       Version beginVersion,
                       Version readVersion,
                       bool canCollapse,
                       Optional<int> expectedSnapshotVersion,
                       std::vector<int> expectedDeltaVersions) {
	Arena a;
	BlobGranuleChunkRef chunk;
	int64_t deltaBytes = 0;
	f.getFiles(beginVersion, readVersion, canCollapse, chunk, a, deltaBytes);
	fmt::print("results({0}, {1}, {2}):\nEXPECTED:\n snapshot={3}\n deltas ({4}):\n",
	           beginVersion,
	           readVersion,
	           canCollapse ? "T" : "F",
	           expectedSnapshotVersion.present() ? makeTestFileName(expectedSnapshotVersion.get()).c_str() : "<N/A>",
	           expectedDeltaVersions.size());
	for (int d : expectedDeltaVersions) {
		fmt::print(" {}\n", makeTestFileName(d));
	}
	fmt::print("ACTUAL:\n snapshot={0}\n deltas ({1}):\n",
	           chunk.snapshotFile.present() ? chunk.snapshotFile.get().filename.toString().c_str() : "<N/A>",
	           chunk.deltaFiles.size());
	for (auto& it : chunk.deltaFiles) {
		fmt::print(" {}\n", it.filename.toString());
	}
	printf("\n\n\n");

	ASSERT(expectedSnapshotVersion.present() == chunk.snapshotFile.present());
	if (expectedSnapshotVersion.present()) {
		checkFile(expectedSnapshotVersion.get(), chunk.snapshotFile.get());
	}
	ASSERT(expectedDeltaVersions.size() == chunk.deltaFiles.size());
	for (int i = 0; i < expectedDeltaVersions.size(); i++) {
		checkFile(expectedDeltaVersions[i], chunk.deltaFiles[i]);
	}
}

/*
 * Files:
 * S @ 100 (10 bytes)
 * D @ 150 (5 bytes)
 * D @ 200 (6 bytes)
 * S @ 200 (15 bytes)
 * D @ 250 (7 bytes)
 * D @ 300 (8 bytes)
 * S @ 300 (10 bytes)
 * D @ 350 (4 bytes)
 */
TEST_CASE("/blobgranule/server/common/granulefiles") {
	// simple cases first

	// single snapshot file, no deltas
	GranuleFiles files;
	files.snapshotFiles.push_back(makeTestFile(100, 10));

	printf("Just snapshot\n");

	checkFiles(files, 0, 100, false, 100, {});
	checkFiles(files, 0, 200, false, 100, {});

	printf("Small test\n");
	// add delta files with re-snapshot at end
	files.deltaFiles.push_back(makeTestFile(150, 5));
	files.deltaFiles.push_back(makeTestFile(200, 6));
	files.snapshotFiles.push_back(makeTestFile(200, 15));

	// check different read versions with beginVersion=0
	checkFiles(files, 0, 100, false, 100, {});
	checkFiles(files, 0, 101, false, 100, { 150 });
	checkFiles(files, 0, 149, false, 100, { 150 });
	checkFiles(files, 0, 150, false, 100, { 150 });
	checkFiles(files, 0, 151, false, 100, { 150, 200 });
	checkFiles(files, 0, 199, false, 100, { 150, 200 });
	checkFiles(files, 0, 200, false, 200, {});
	checkFiles(files, 0, 300, false, 200, {});

	// Test all cases of beginVersion + readVersion. Because delta files are smaller than snapshot at 200, this should
	// be the same with and without collapse
	checkFiles(files, 100, 200, false, Optional<int>(), { 150, 200 });
	checkFiles(files, 100, 300, false, Optional<int>(), { 150, 200 });
	checkFiles(files, 101, 199, false, Optional<int>(), { 150, 200 });
	checkFiles(files, 149, 151, false, Optional<int>(), { 150, 200 });
	checkFiles(files, 149, 150, false, Optional<int>(), { 150 });
	checkFiles(files, 150, 151, false, Optional<int>(), { 150, 200 });
	checkFiles(files, 151, 200, false, Optional<int>(), { 200 });

	checkFiles(files, 100, 200, true, Optional<int>(), { 150, 200 });
	checkFiles(files, 100, 300, true, Optional<int>(), { 150, 200 });
	checkFiles(files, 101, 199, true, Optional<int>(), { 150, 200 });
	checkFiles(files, 149, 151, true, Optional<int>(), { 150, 200 });
	checkFiles(files, 149, 150, true, Optional<int>(), { 150 });
	checkFiles(files, 150, 151, true, Optional<int>(), { 150, 200 });
	checkFiles(files, 151, 200, true, Optional<int>(), { 200 });

	printf("Larger test\n");
	// add more delta files and snapshots to check collapse logic
	files.deltaFiles.push_back(makeTestFile(250, 7));
	files.deltaFiles.push_back(makeTestFile(300, 8));
	files.snapshotFiles.push_back(makeTestFile(300, 10));
	files.deltaFiles.push_back(makeTestFile(350, 4));

	checkFiles(files, 0, 300, false, 300, {});
	checkFiles(files, 0, 301, false, 300, { 350 });
	checkFiles(files, 0, 400, false, 300, { 350 });

	// check delta files without collapse
	checkFiles(files, 100, 301, false, Optional<int>(), { 150, 200, 250, 300, 350 });
	checkFiles(files, 100, 300, false, Optional<int>(), { 150, 200, 250, 300 });
	checkFiles(files, 100, 251, false, Optional<int>(), { 150, 200, 250, 300 });
	checkFiles(files, 100, 250, false, Optional<int>(), { 150, 200, 250 });

	checkFiles(files, 151, 300, false, Optional<int>(), { 200, 250, 300 });
	checkFiles(files, 151, 301, false, Optional<int>(), { 200, 250, 300, 350 });
	checkFiles(files, 151, 400, false, Optional<int>(), { 200, 250, 300, 350 });

	checkFiles(files, 201, 300, false, Optional<int>(), { 250, 300 });
	checkFiles(files, 201, 301, false, Optional<int>(), { 250, 300, 350 });
	checkFiles(files, 201, 400, false, Optional<int>(), { 250, 300, 350 });

	checkFiles(files, 251, 300, false, Optional<int>(), { 300 });
	checkFiles(files, 251, 301, false, Optional<int>(), { 300, 350 });
	checkFiles(files, 251, 400, false, Optional<int>(), { 300, 350 });
	checkFiles(files, 301, 400, false, Optional<int>(), { 350 });
	checkFiles(files, 351, 400, false, Optional<int>(), {});

	// check with collapse
	// these 2 collapse because the delta files at 150+200+250+300 are larger than the snapshot at 300
	checkFiles(files, 100, 301, true, 300, { 350 });
	checkFiles(files, 100, 300, true, 300, {});
	// these 2 don't collapse because 150+200 delta files are smaller than the snapshot at 200
	checkFiles(files, 100, 251, true, Optional<int>(), { 150, 200, 250, 300 });
	checkFiles(files, 100, 250, true, Optional<int>(), { 150, 200, 250 });
	// these 3 do collapse because the delta files at 200+250+300 are larger than the snapshot at 300
	checkFiles(files, 151, 300, true, 300, {});
	checkFiles(files, 151, 301, true, 300, { 350 });
	checkFiles(files, 151, 400, true, 300, { 350 });
	// these 3 do collapse because the delta files at 250+300 are larger than the snapshot at 300
	checkFiles(files, 201, 300, true, 300, {});
	checkFiles(files, 201, 301, true, 300, { 350 });
	checkFiles(files, 201, 400, true, 300, { 350 });
	// these don't collapse because the delta file at 300 is smaller than the snapshot at 300
	checkFiles(files, 251, 300, true, Optional<int>(), { 300 });
	checkFiles(files, 251, 301, true, Optional<int>(), { 300, 350 });
	checkFiles(files, 251, 400, true, Optional<int>(), { 300, 350 });
	checkFiles(files, 301, 400, true, Optional<int>(), { 350 });
	checkFiles(files, 351, 400, true, Optional<int>(), {});

	return Void();
}

// FIXME: if credentials can expire, refresh periodically
ACTOR Future<Void>
// Fetches blob-storage metadata for the given tenants from the EncryptKeyProxy and
// wires the resulting BlobConnectionProvider into each tenant's GranuleTenantData.
// Retries (via the loop + dbInfo->onChange) until an EncryptKeyProxy is available and
// replies. Only called when BG_METADATA_SOURCE == "tenant".
loadBlobMetadataForTenants(BGTenantMap* self, std::vector<TenantMapEntry> tenantMapEntries) {
	ASSERT(SERVER_KNOBS->BG_METADATA_SOURCE == "tenant");
	ASSERT(!tenantMapEntries.empty());
	state std::vector<BlobMetadataDomainId> domainIds;
	for (auto& entry : tenantMapEntries) {
		domainIds.push_back(entry.id);
	}

	// FIXME: if one tenant gets an error, don't kill whole process
	// TODO: add latency metrics
	loop {
		Future<EKPGetLatestBlobMetadataReply> requestFuture;
		if (self->dbInfo.isValid() && self->dbInfo->get().encryptKeyProxy.present()) {
			EKPGetLatestBlobMetadataRequest req;
			req.domainIds = domainIds;
			requestFuture =
			    brokenPromiseToNever(self->dbInfo->get().encryptKeyProxy.get().getLatestBlobMetadata.getReply(req));
		} else {
			// no EncryptKeyProxy known yet; wait for a dbInfo change below
			requestFuture = Never();
		}
		choose {
			when(EKPGetLatestBlobMetadataReply rep = wait(requestFuture)) {
				ASSERT(rep.blobMetadataDetails.size() == domainIds.size());
				// not guaranteed to be in same order in the request as the response
				for (auto& metadata : rep.blobMetadataDetails) {
					// tenant may have been removed while the request was in flight
					auto info = self->tenantInfoById.find(metadata.domainId);
					if (info == self->tenantInfoById.end()) {
						continue;
					}
					auto dataEntry = self->tenantData.rangeContaining(info->second.prefix);
					ASSERT(dataEntry.begin() == info->second.prefix);
					dataEntry.cvalue()->setBStore(BlobConnectionProvider::newBlobConnectionProvider(metadata));
				}
				return Void();
			}
			// dbInfo changed (e.g. an EncryptKeyProxy appeared or moved): retry the request
			when(wait(self->dbInfo->onChange())) {}
		}
	}
}

// list of tenants that may or may not already exist.
// Registers each previously-unseen tenant (by id) in tenantInfoById and tenantData; when
// blob metadata comes from tenants, kicks off loadBlobMetadataForTenants for the new ones,
// otherwise marks their blob store as immediately loaded.
void BGTenantMap::addTenants(std::vector<std::pair<TenantName, TenantMapEntry>> tenants) {
	std::vector<TenantMapEntry> tenantsToLoad;
	for (auto entry : tenants) {
		// insert(...).second is true only for tenants we haven't seen before
		if (tenantInfoById.insert({ entry.second.id, entry.second }).second) {
			auto r = makeReference<GranuleTenantData>(entry.first, entry.second);
			// map the tenant's whole key prefix range to its data
			tenantData.insert(KeyRangeRef(entry.second.prefix, entry.second.prefix.withSuffix(normalKeys.end)), r);
			if (SERVER_KNOBS->BG_METADATA_SOURCE != "tenant") {
				// no per-tenant metadata fetch needed; blob store is ready now
				r->bstoreLoaded.send(Void());
			} else {
				tenantsToLoad.push_back(entry.second);
			}
		}
	}

	if (!tenantsToLoad.empty()) {
		addActor.send(loadBlobMetadataForTenants(this, tenantsToLoad));
	}
}

// TODO: implement
void BGTenantMap::removeTenants(std::vector<int64_t> tenantIds) {
	throw not_implemented();
}

// Looks up a tenant by id; returns an empty Optional when the tenant is unknown.
Optional<TenantMapEntry> BGTenantMap::getTenantById(int64_t id) {
	auto tenant = tenantInfoById.find(id);
	if (tenant == tenantInfoById.end()) {
		return {};
	} else {
		return tenant->second;
	}
}

// TODO: handle case where tenant isn't loaded yet
// Returns the GranuleTenantData whose registered prefix range fully contains keyRange;
// asserts (rather than returning an error) if the range straddles a tenant boundary.
Reference<GranuleTenantData> BGTenantMap::getDataForGranule(const KeyRangeRef& keyRange) {
	auto tenant = tenantData.rangeContaining(keyRange.begin);
	ASSERT(tenant.begin() <= keyRange.begin);
	ASSERT(tenant.end() >= keyRange.end);

	return tenant.cvalue();
}