Full integration with BlobConnectionProvider for blob granules

This commit is contained in:
Josh Slocum 2022-06-01 11:19:24 -05:00
parent ffa4255c65
commit 567b1d35f7
18 changed files with 349 additions and 234 deletions

View File

@ -21,7 +21,7 @@
#include <string> #include <string>
#include "flow/IRandom.h" #include "flow/IRandom.h"
#include "fdbserver/BlobConnectionProvider.h" #include "fdbclient/BlobConnectionProvider.h"
struct SingleBlobConnectionProvider : BlobConnectionProvider { struct SingleBlobConnectionProvider : BlobConnectionProvider {
public: public:

View File

@ -22,7 +22,7 @@
#define BLOB_CONNECTION_PROVIDER_H #define BLOB_CONNECTION_PROVIDER_H
#include "fdbclient/BackupContainerFileSystem.h" #include "fdbclient/BackupContainerFileSystem.h"
#include "fdbserver/BlobMetadataUtils.h" #include "fdbclient/BlobMetadataUtils.h"
struct BlobConnectionProvider : NonCopyable, ReferenceCounted<BlobConnectionProvider> { struct BlobConnectionProvider : NonCopyable, ReferenceCounted<BlobConnectionProvider> {
// chooses a partition and prepends the necessary prefix to the filename (if necessary) for writing a file, and // chooses a partition and prepends the necessary prefix to the filename (if necessary) for writing a file, and
@ -38,6 +38,8 @@ struct BlobConnectionProvider : NonCopyable, ReferenceCounted<BlobConnectionProv
static Reference<BlobConnectionProvider> newBlobConnectionProvider(std::string blobUrl); static Reference<BlobConnectionProvider> newBlobConnectionProvider(std::string blobUrl);
static Reference<BlobConnectionProvider> newBlobConnectionProvider(Standalone<BlobMetadataDetailsRef> blobMetadata); static Reference<BlobConnectionProvider> newBlobConnectionProvider(Standalone<BlobMetadataDetailsRef> blobMetadata);
// TODO add update impl
}; };
#endif #endif

View File

@ -38,11 +38,13 @@
// TODO could refactor the file reading code from here and the delta file function into another actor, // TODO could refactor the file reading code from here and the delta file function into another actor,
// then this part would also be testable? but meh // then this part would also be testable? but meh
ACTOR Future<Standalone<StringRef>> readFile(Reference<BackupContainerFileSystem> bstore, BlobFilePointerRef f) { ACTOR Future<Standalone<StringRef>> readFile(Reference<BlobConnectionProvider> bstoreProvider, BlobFilePointerRef f) {
try { try {
state Arena arena; state Arena arena;
// printf("Starting read of snapshot file %s\n", filename.c_str()); std::string fname = f.filename.toString();
state Reference<IAsyncFile> reader = wait(bstore->readFile(f.filename.toString())); state Reference<BackupContainerFileSystem> bstore = bstoreProvider->getForRead(fname);
// printf("Starting read of snapshot file %s\n", fname.c_str());
state Reference<IAsyncFile> reader = wait(bstore->readFile(fname));
// printf("Got snapshot file size %lld\n", size); // printf("Got snapshot file size %lld\n", size);
state uint8_t* data = new (arena) uint8_t[f.length]; state uint8_t* data = new (arena) uint8_t[f.length];
// printf("Reading %lld bytes from snapshot file %s\n", size, filename.c_str()); // printf("Reading %lld bytes from snapshot file %s\n", size, filename.c_str());
@ -66,7 +68,7 @@ ACTOR Future<RangeResult> readBlobGranule(BlobGranuleChunkRef chunk,
KeyRangeRef keyRange, KeyRangeRef keyRange,
Version beginVersion, Version beginVersion,
Version readVersion, Version readVersion,
Reference<BackupContainerFileSystem> bstore, Reference<BlobConnectionProvider> bstore,
Optional<BlobWorkerStats*> stats) { Optional<BlobWorkerStats*> stats) {
// TODO REMOVE with early replying // TODO REMOVE with early replying
@ -120,7 +122,7 @@ ACTOR Future<RangeResult> readBlobGranule(BlobGranuleChunkRef chunk,
// TODO probably should add things like limit/bytelimit at some point? // TODO probably should add things like limit/bytelimit at some point?
ACTOR Future<Void> readBlobGranules(BlobGranuleFileRequest request, ACTOR Future<Void> readBlobGranules(BlobGranuleFileRequest request,
BlobGranuleFileReply reply, BlobGranuleFileReply reply,
Reference<BackupContainerFileSystem> bstore, Reference<BlobConnectionProvider> bstore,
PromiseStream<RangeResult> results) { PromiseStream<RangeResult> results) {
// TODO for large amount of chunks, this should probably have some sort of buffer limit like ReplyPromiseStream. // TODO for large amount of chunks, this should probably have some sort of buffer limit like ReplyPromiseStream.
// Maybe just use ReplyPromiseStream instead of PromiseStream? // Maybe just use ReplyPromiseStream instead of PromiseStream?

View File

@ -29,6 +29,7 @@
#define BLOB_GRANULE_READER_CLIENT_H #define BLOB_GRANULE_READER_CLIENT_H
#include "fdbclient/BackupContainerFileSystem.h" #include "fdbclient/BackupContainerFileSystem.h"
#include "fdbclient/BlobConnectionProvider.h"
#include "fdbclient/BlobGranuleCommon.h" #include "fdbclient/BlobGranuleCommon.h"
#include "fdbclient/BlobGranuleFiles.h" #include "fdbclient/BlobGranuleFiles.h"
#include "fdbclient/BlobWorkerInterface.h" #include "fdbclient/BlobWorkerInterface.h"
@ -42,12 +43,12 @@ ACTOR Future<RangeResult> readBlobGranule(BlobGranuleChunkRef chunk,
KeyRangeRef keyRange, KeyRangeRef keyRange,
Version beginVersion, Version beginVersion,
Version readVersion, Version readVersion,
Reference<BackupContainerFileSystem> bstore, Reference<BlobConnectionProvider> bstore,
Optional<BlobWorkerStats*> stats = Optional<BlobWorkerStats*>()); Optional<BlobWorkerStats*> stats = Optional<BlobWorkerStats*>());
ACTOR Future<Void> readBlobGranules(BlobGranuleFileRequest request, ACTOR Future<Void> readBlobGranules(BlobGranuleFileRequest request,
BlobGranuleFileReply reply, BlobGranuleFileReply reply,
Reference<BackupContainerFileSystem> bstore, Reference<BlobConnectionProvider> bstore,
PromiseStream<RangeResult> results); PromiseStream<RangeResult> results);
#include "flow/unactorcompiler.h" #include "flow/unactorcompiler.h"

View File

@ -18,15 +18,18 @@ set(FDBCLIENT_SRCS
BackupContainerLocalDirectory.h BackupContainerLocalDirectory.h
BackupContainerS3BlobStore.actor.cpp BackupContainerS3BlobStore.actor.cpp
BackupContainerS3BlobStore.h BackupContainerS3BlobStore.h
ClientBooleanParams.cpp BlobConnectionProvider.h
ClientBooleanParams.h BlobConnectionProvider.cpp
BlobWorkerInterface.h BlobWorkerInterface.h
BlobGranuleReader.actor.cpp BlobGranuleReader.actor.cpp
BlobGranuleReader.actor.h BlobGranuleReader.actor.h
BlobGranuleCommon.h BlobGranuleCommon.h
BlobGranuleFiles.cpp BlobGranuleFiles.cpp
BlobGranuleFiles.h BlobGranuleFiles.h
BlobMetadataUtils.h
BlobWorkerCommon.h BlobWorkerCommon.h
ClientBooleanParams.cpp
ClientBooleanParams.h
ClientKnobCollection.cpp ClientKnobCollection.cpp
ClientKnobCollection.h ClientKnobCollection.h
ClientKnobs.cpp ClientKnobs.cpp

View File

@ -19,12 +19,13 @@
*/ */
#include "contrib/fmt-8.1.1/include/fmt/format.h" #include "contrib/fmt-8.1.1/include/fmt/format.h"
#include "fdbclient/SystemData.h"
#include "fdbclient/BlobGranuleCommon.h" #include "fdbclient/BlobGranuleCommon.h"
#include "fdbserver/BlobGranuleServerCommon.actor.h"
#include "fdbclient/CommitTransaction.h" #include "fdbclient/CommitTransaction.h"
#include "fdbclient/FDBTypes.h" #include "fdbclient/FDBTypes.h"
#include "fdbclient/ReadYourWrites.h" #include "fdbclient/ReadYourWrites.h"
#include "fdbclient/SystemData.h"
#include "fdbserver/BlobGranuleServerCommon.actor.h"
#include "fdbserver/Knobs.h"
#include "flow/Arena.h" #include "flow/Arena.h"
#include "flow/UnitTest.h" #include "flow/UnitTest.h"
#include "flow/actorcompiler.h" // has to be last include #include "flow/actorcompiler.h" // has to be last include
@ -357,3 +358,87 @@ TEST_CASE("/blobgranule/server/common/granulefiles") {
return Void(); return Void();
} }
// FIXME: if credentials can expire, refresh periodically
ACTOR Future<Void> loadBlobMetadataForTenants(BGTenantMap* self, std::vector<TenantMapEntry> tenantMapEntries) {
ASSERT(SERVER_KNOBS->BG_METADATA_SOURCE == "tenant");
ASSERT(!tenantMapEntries.empty());
state std::vector<BlobMetadataDomainId> domainIds;
for (auto& entry : tenantMapEntries) {
domainIds.push_back(entry.id);
}
// FIXME: if one tenant gets an error, don't kill whole process
// TODO: add latency metrics
loop {
Future<EKPGetLatestBlobMetadataReply> requestFuture;
if (self->dbInfo.isValid() && self->dbInfo->get().encryptKeyProxy.present()) {
EKPGetLatestBlobMetadataRequest req;
req.domainIds = domainIds;
requestFuture =
brokenPromiseToNever(self->dbInfo->get().encryptKeyProxy.get().getLatestBlobMetadata.getReply(req));
} else {
requestFuture = Never();
}
choose {
when(EKPGetLatestBlobMetadataReply rep = wait(requestFuture)) {
ASSERT(rep.blobMetadataDetails.size() == domainIds.size());
// not guaranteed to be in same order in the request as the response
for (auto& metadata : rep.blobMetadataDetails) {
auto info = self->tenantInfoById.find(metadata.domainId);
if (info == self->tenantInfoById.end()) {
continue;
}
auto dataEntry = self->tenantData.rangeContaining(info->second.prefix);
ASSERT(dataEntry.begin() == info->second.prefix);
dataEntry.cvalue()->setBStore(BlobConnectionProvider::newBlobConnectionProvider(metadata));
}
return Void();
}
when(wait(self->dbInfo->onChange())) {}
}
}
}
// list of tenants that may or may not already exist
void BGTenantMap::addTenants(std::vector<std::pair<TenantName, TenantMapEntry>> tenants) {
std::vector<TenantMapEntry> tenantsToLoad;
for (auto entry : tenants) {
if (tenantInfoById.insert({ entry.second.id, entry.second }).second) {
auto r = makeReference<GranuleTenantData>(entry.first, entry.second);
tenantData.insert(KeyRangeRef(entry.second.prefix, entry.second.prefix.withSuffix(normalKeys.end)), r);
if (SERVER_KNOBS->BG_METADATA_SOURCE != "tenant") {
r->bstoreLoaded.send(Void());
} else {
tenantsToLoad.push_back(entry.second);
}
}
}
if (!tenantsToLoad.empty()) {
addActor.send(loadBlobMetadataForTenants(this, tenantsToLoad));
}
}
// TODO: implement
void BGTenantMap::removeTenants(std::vector<int64_t> tenantIds) {
throw not_implemented();
}
Optional<TenantMapEntry> BGTenantMap::getTenantById(int64_t id) {
auto tenant = tenantInfoById.find(id);
if (tenant == tenantInfoById.end()) {
return {};
} else {
return tenant->second;
}
}
// TODO: handle case where tenant isn't loaded yet
Reference<GranuleTenantData> BGTenantMap::getDataForGranule(const KeyRangeRef& keyRange) {
auto tenant = tenantData.rangeContaining(keyRange.begin);
ASSERT(tenant.begin() <= keyRange.begin);
ASSERT(tenant.end() >= keyRange.end);
return tenant.cvalue();
}

View File

@ -29,7 +29,10 @@
#include "flow/flow.h" #include "flow/flow.h"
#include "fdbclient/CommitTransaction.h" #include "fdbclient/CommitTransaction.h"
#include "fdbclient/FDBTypes.h" #include "fdbclient/FDBTypes.h"
#include "fdbclient/BlobConnectionProvider.h"
#include "fdbclient/BlobGranuleCommon.h" #include "fdbclient/BlobGranuleCommon.h"
#include "fdbclient/Tenant.h"
#include "fdbserver/ServerDBInfo.h"
#include "flow/actorcompiler.h" // has to be last include #include "flow/actorcompiler.h" // has to be last include
struct GranuleHistory { struct GranuleHistory {
@ -79,4 +82,45 @@ ACTOR Future<Optional<GranuleHistory>> getLatestGranuleHistory(Transaction* tr,
ACTOR Future<Void> readGranuleFiles(Transaction* tr, Key* startKey, Key endKey, GranuleFiles* files, UID granuleID); ACTOR Future<Void> readGranuleFiles(Transaction* tr, Key* startKey, Key endKey, GranuleFiles* files, UID granuleID);
ACTOR Future<GranuleFiles> loadHistoryFiles(Database cx, UID granuleID); ACTOR Future<GranuleFiles> loadHistoryFiles(Database cx, UID granuleID);
// TODO: versioned like SS has?
struct GranuleTenantData : NonCopyable, ReferenceCounted<GranuleTenantData> {
TenantName name;
TenantMapEntry entry;
Reference<BlobConnectionProvider> bstore;
Promise<Void> bstoreLoaded;
GranuleTenantData() {}
GranuleTenantData(TenantName name, TenantMapEntry entry) : name(name), entry(entry) {}
void setBStore(Reference<BlobConnectionProvider> bs) {
ASSERT(bstoreLoaded.canBeSet());
bstore = bs;
bstoreLoaded.send(Void());
}
};
// TODO: add refreshing
struct BGTenantMap {
public:
void addTenants(std::vector<std::pair<TenantName, TenantMapEntry>>);
void removeTenants(std::vector<int64_t> tenantIds);
Optional<TenantMapEntry> getTenantById(int64_t id);
Reference<GranuleTenantData> getDataForGranule(const KeyRangeRef& keyRange);
KeyRangeMap<Reference<GranuleTenantData>> tenantData;
std::unordered_map<int64_t, TenantMapEntry> tenantInfoById;
Reference<AsyncVar<ServerDBInfo> const> dbInfo;
PromiseStream<Future<Void>> addActor;
BGTenantMap() {}
explicit BGTenantMap(const Reference<AsyncVar<ServerDBInfo> const> dbInfo) : dbInfo(dbInfo) {
collection = actorCollection(addActor.getFuture());
}
private:
Future<Void> collection;
};
#endif #endif

View File

@ -61,7 +61,7 @@ ACTOR Future<std::pair<RangeResult, Version>> readFromFDB(Database cx, KeyRange
// FIXME: typedef this pair type and/or chunk list // FIXME: typedef this pair type and/or chunk list
ACTOR Future<std::pair<RangeResult, Standalone<VectorRef<BlobGranuleChunkRef>>>> readFromBlob( ACTOR Future<std::pair<RangeResult, Standalone<VectorRef<BlobGranuleChunkRef>>>> readFromBlob(
Database cx, Database cx,
Reference<BackupContainerFileSystem> bstore, Reference<BlobConnectionProvider> bstore,
KeyRange range, KeyRange range,
Version beginVersion, Version beginVersion,
Version readVersion, Version readVersion,

View File

@ -37,7 +37,7 @@
ACTOR Future<std::pair<RangeResult, Standalone<VectorRef<BlobGranuleChunkRef>>>> readFromBlob( ACTOR Future<std::pair<RangeResult, Standalone<VectorRef<BlobGranuleChunkRef>>>> readFromBlob(
Database cx, Database cx,
Reference<BackupContainerFileSystem> bstore, Reference<BlobConnectionProvider> bstore,
KeyRange range, KeyRange range,
Version beginVersion, Version beginVersion,
Version readVersion, Version readVersion,

View File

@ -49,7 +49,7 @@
* The Blob Manager is responsible for managing range granules, and recruiting and monitoring Blob Workers. * The Blob Manager is responsible for managing range granules, and recruiting and monitoring Blob Workers.
*/ */
#define BM_DEBUG false #define BM_DEBUG true
void handleClientBlobRange(KeyRangeMap<bool>* knownBlobRanges, void handleClientBlobRange(KeyRangeMap<bool>* knownBlobRanges,
Arena& ar, Arena& ar,
@ -255,7 +255,7 @@ struct BlobManagerData : NonCopyable, ReferenceCounted<BlobManagerData> {
BlobManagerStats stats; BlobManagerStats stats;
Reference<BackupContainerFileSystem> bstore; Reference<BlobConnectionProvider> bstore;
std::unordered_map<UID, BlobWorkerInterface> workersById; std::unordered_map<UID, BlobWorkerInterface> workersById;
std::unordered_map<UID, BlobWorkerInfo> workerStats; // mapping between workerID -> workerStats std::unordered_map<UID, BlobWorkerInfo> workerStats; // mapping between workerID -> workerStats
@ -265,6 +265,7 @@ struct BlobManagerData : NonCopyable, ReferenceCounted<BlobManagerData> {
KeyRangeActorMap assignsInProgress; KeyRangeActorMap assignsInProgress;
KeyRangeMap<SplitEvaluation> splitEvaluations; KeyRangeMap<SplitEvaluation> splitEvaluations;
KeyRangeMap<bool> knownBlobRanges; KeyRangeMap<bool> knownBlobRanges;
BGTenantMap tenantData;
AsyncTrigger startRecruiting; AsyncTrigger startRecruiting;
Debouncer restartRecruiting; Debouncer restartRecruiting;
@ -282,18 +283,18 @@ struct BlobManagerData : NonCopyable, ReferenceCounted<BlobManagerData> {
// assigned sequence numbers // assigned sequence numbers
PromiseStream<RangeAssignment> rangesToAssign; PromiseStream<RangeAssignment> rangesToAssign;
BlobManagerData(UID id, Database db, Optional<Key> dcId) BlobManagerData(UID id, Reference<AsyncVar<ServerDBInfo> const> dbInfo, Database db, Optional<Key> dcId)
: id(id), db(db), dcId(dcId), stats(id, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, &workersById), : id(id), db(db), dcId(dcId), stats(id, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, &workersById),
knownBlobRanges(false, normalKeys.end), restartRecruiting(SERVER_KNOBS->DEBOUNCE_RECRUITING_DELAY), knownBlobRanges(false, normalKeys.end), tenantData(BGTenantMap(dbInfo)),
recruitingStream(0) {} restartRecruiting(SERVER_KNOBS->DEBOUNCE_RECRUITING_DELAY), recruitingStream(0) {}
// only initialize blob store if actually needed // only initialize blob store if actually needed
void initBStore() { void initBStore() {
if (!bstore.isValid()) { if (!bstore.isValid() && SERVER_KNOBS->BG_METADATA_SOURCE != "tenant") {
if (BM_DEBUG) { if (BM_DEBUG) {
fmt::print("BM {} constructing backup container from {}\n", epoch, SERVER_KNOBS->BG_URL.c_str()); fmt::print("BM {} constructing backup container from {}\n", epoch, SERVER_KNOBS->BG_URL.c_str());
} }
bstore = BackupContainerFileSystem::openContainerFS(SERVER_KNOBS->BG_URL, {}, {}); bstore = BlobConnectionProvider::newBlobConnectionProvider(SERVER_KNOBS->BG_URL);
if (BM_DEBUG) { if (BM_DEBUG) {
fmt::print("BM {} constructed backup container\n", epoch); fmt::print("BM {} constructed backup container\n", epoch);
} }
@ -840,11 +841,15 @@ ACTOR Future<Void> monitorClientRanges(Reference<BlobManagerData> bmData) {
throw internal_error(); throw internal_error();
} }
std::vector<std::pair<TenantName, TenantMapEntry>> tenants;
std::vector<Key> prefixes; std::vector<Key> prefixes;
for (auto& it : tenantResults) { for (auto& it : tenantResults) {
StringRef tenantName = it.key.removePrefix(tenantMapPrefix);
TenantMapEntry entry = decodeTenantEntry(it.value); TenantMapEntry entry = decodeTenantEntry(it.value);
tenants.push_back(std::pair(tenantName, entry));
prefixes.push_back(entry.prefix); prefixes.push_back(entry.prefix);
} }
bmData->tenantData.addTenants(tenants);
// make this look like knownBlobRanges // make this look like knownBlobRanges
std::sort(prefixes.begin(), prefixes.end()); std::sort(prefixes.begin(), prefixes.end());
@ -2279,6 +2284,30 @@ ACTOR Future<Void> canDeleteFullGranule(Reference<BlobManagerData> self, UID gra
return Void(); return Void();
} }
static Future<Void> deleteFile(Reference<BlobConnectionProvider> bstoreProvider, std::string filePath) {
Reference<BackupContainerFileSystem> bstore = bstoreProvider->getForRead(filePath);
return bstore->deleteFile(filePath);
}
// since all BM operations for a tenant are driven by discovering the tenant from the tenant map, it will always know
// about this tenant
ACTOR Future<Reference<BlobConnectionProvider>> getBStoreForGranule(Reference<BlobManagerData> self,
KeyRange granuleRange) {
if (self->bstore.isValid()) {
return self->bstore;
}
loop {
state Reference<GranuleTenantData> data = self->tenantData.getDataForGranule(granuleRange);
if (data.isValid()) {
wait(data->bstoreLoaded.getFuture());
return data->bstore;
} else {
// race on startup between loading tenant ranges and bgcc/purging. just wait
wait(delay(0.1));
}
}
}
/* /*
* Deletes all files pertaining to the granule with id granuleId and * Deletes all files pertaining to the granule with id granuleId and
* also removes the history entry for this granule from the system keyspace * also removes the history entry for this granule from the system keyspace
@ -2286,7 +2315,8 @@ ACTOR Future<Void> canDeleteFullGranule(Reference<BlobManagerData> self, UID gra
ACTOR Future<Void> fullyDeleteGranule(Reference<BlobManagerData> self, ACTOR Future<Void> fullyDeleteGranule(Reference<BlobManagerData> self,
UID granuleId, UID granuleId,
Key historyKey, Key historyKey,
Version purgeVersion) { Version purgeVersion,
KeyRange granuleRange) {
if (BM_DEBUG) { if (BM_DEBUG) {
fmt::print("Fully deleting granule {0}: init\n", granuleId.toString()); fmt::print("Fully deleting granule {0}: init\n", granuleId.toString());
} }
@ -2294,6 +2324,7 @@ ACTOR Future<Void> fullyDeleteGranule(Reference<BlobManagerData> self,
// if granule is still splitting and files are needed for new sub-granules to re-snapshot, we can only partially // if granule is still splitting and files are needed for new sub-granules to re-snapshot, we can only partially
// delete the granule, since we need to keep the last snapshot and deltas for splitting // delete the granule, since we need to keep the last snapshot and deltas for splitting
wait(canDeleteFullGranule(self, granuleId)); wait(canDeleteFullGranule(self, granuleId));
state Reference<BlobConnectionProvider> bstore = wait(getBStoreForGranule(self, granuleRange));
// get files // get files
GranuleFiles files = wait(loadHistoryFiles(self->db, granuleId)); GranuleFiles files = wait(loadHistoryFiles(self->db, granuleId));
@ -2303,13 +2334,13 @@ ACTOR Future<Void> fullyDeleteGranule(Reference<BlobManagerData> self,
for (auto snapshotFile : files.snapshotFiles) { for (auto snapshotFile : files.snapshotFiles) {
std::string fname = snapshotFile.filename; std::string fname = snapshotFile.filename;
deletions.emplace_back(self->bstore->deleteFile(fname)); deletions.push_back(deleteFile(bstore, fname));
filesToDelete.emplace_back(fname); filesToDelete.emplace_back(fname);
} }
for (auto deltaFile : files.deltaFiles) { for (auto deltaFile : files.deltaFiles) {
std::string fname = deltaFile.filename; std::string fname = deltaFile.filename;
deletions.emplace_back(self->bstore->deleteFile(fname)); deletions.push_back(deleteFile(bstore, fname));
filesToDelete.emplace_back(fname); filesToDelete.emplace_back(fname);
} }
@ -2371,11 +2402,16 @@ ACTOR Future<Void> fullyDeleteGranule(Reference<BlobManagerData> self,
* file might be deleted. We will need to ensure we don't rely on the granule's startVersion * file might be deleted. We will need to ensure we don't rely on the granule's startVersion
* (that's persisted as part of the key), but rather use the granule's first snapshot's version when needed * (that's persisted as part of the key), but rather use the granule's first snapshot's version when needed
*/ */
ACTOR Future<Void> partiallyDeleteGranule(Reference<BlobManagerData> self, UID granuleId, Version purgeVersion) { ACTOR Future<Void> partiallyDeleteGranule(Reference<BlobManagerData> self,
UID granuleId,
Version purgeVersion,
KeyRange granuleRange) {
if (BM_DEBUG) { if (BM_DEBUG) {
fmt::print("Partially deleting granule {0}: init\n", granuleId.toString()); fmt::print("Partially deleting granule {0}: init\n", granuleId.toString());
} }
state Reference<BlobConnectionProvider> bstore = wait(getBStoreForGranule(self, granuleRange));
// get files // get files
GranuleFiles files = wait(loadHistoryFiles(self->db, granuleId)); GranuleFiles files = wait(loadHistoryFiles(self->db, granuleId));
@ -2391,7 +2427,7 @@ ACTOR Future<Void> partiallyDeleteGranule(Reference<BlobManagerData> self, UID g
// if we already found the latestSnapshotVersion, this snapshot can be deleted // if we already found the latestSnapshotVersion, this snapshot can be deleted
if (latestSnapshotVersion != invalidVersion) { if (latestSnapshotVersion != invalidVersion) {
std::string fname = files.snapshotFiles[idx].filename; std::string fname = files.snapshotFiles[idx].filename;
deletions.emplace_back(self->bstore->deleteFile(fname)); deletions.push_back(deleteFile(bstore, fname));
deletedFileKeys.emplace_back(blobGranuleFileKeyFor(granuleId, files.snapshotFiles[idx].version, 'S')); deletedFileKeys.emplace_back(blobGranuleFileKeyFor(granuleId, files.snapshotFiles[idx].version, 'S'));
filesToDelete.emplace_back(fname); filesToDelete.emplace_back(fname);
} else if (files.snapshotFiles[idx].version <= purgeVersion) { } else if (files.snapshotFiles[idx].version <= purgeVersion) {
@ -2415,7 +2451,7 @@ ACTOR Future<Void> partiallyDeleteGranule(Reference<BlobManagerData> self, UID g
// otherwise deltaFile.version <= latestSnapshotVersion so delete it // otherwise deltaFile.version <= latestSnapshotVersion so delete it
// == should also be deleted because the last delta file before a snapshot would have the same version // == should also be deleted because the last delta file before a snapshot would have the same version
std::string fname = deltaFile.filename; std::string fname = deltaFile.filename;
deletions.emplace_back(self->bstore->deleteFile(fname)); deletions.push_back(deleteFile(bstore, fname));
deletedFileKeys.emplace_back(blobGranuleFileKeyFor(granuleId, deltaFile.version, 'D')); deletedFileKeys.emplace_back(blobGranuleFileKeyFor(granuleId, deltaFile.version, 'D'));
filesToDelete.emplace_back(fname); filesToDelete.emplace_back(fname);
} }
@ -2500,8 +2536,8 @@ ACTOR Future<Void> purgeRange(Reference<BlobManagerData> self, KeyRangeRef range
state std::queue<std::tuple<KeyRange, Version, Version>> historyEntryQueue; state std::queue<std::tuple<KeyRange, Version, Version>> historyEntryQueue;
// stacks of <granuleId, historyKey> and <granuleId> to track which granules to delete // stacks of <granuleId, historyKey> and <granuleId> to track which granules to delete
state std::vector<std::tuple<UID, Key>> toFullyDelete; state std::vector<std::tuple<UID, Key, KeyRange>> toFullyDelete;
state std::vector<UID> toPartiallyDelete; state std::vector<std::pair<UID, KeyRange>> toPartiallyDelete;
// track which granules we have already added to traversal // track which granules we have already added to traversal
// note: (startKey, startVersion) uniquely identifies a granule // note: (startKey, startVersion) uniquely identifies a granule
@ -2562,7 +2598,7 @@ ACTOR Future<Void> purgeRange(Reference<BlobManagerData> self, KeyRangeRef range
} }
while (!historyEntryQueue.empty()) { while (!historyEntryQueue.empty()) {
// process the node at the front of the queue and remove it // process the node at the front of the queue and remove it
KeyRange currRange; state KeyRange currRange;
state Version startVersion; state Version startVersion;
state Version endVersion; state Version endVersion;
std::tie(currRange, startVersion, endVersion) = historyEntryQueue.front(); std::tie(currRange, startVersion, endVersion) = historyEntryQueue.front();
@ -2612,12 +2648,12 @@ ACTOR Future<Void> purgeRange(Reference<BlobManagerData> self, KeyRangeRef range
if (BM_DEBUG) { if (BM_DEBUG) {
fmt::print("Granule {0} will be FULLY deleted\n", currHistoryNode.granuleID.toString()); fmt::print("Granule {0} will be FULLY deleted\n", currHistoryNode.granuleID.toString());
} }
toFullyDelete.push_back({ currHistoryNode.granuleID, historyKey }); toFullyDelete.push_back({ currHistoryNode.granuleID, historyKey, currRange });
} else if (startVersion < purgeVersion) { } else if (startVersion < purgeVersion) {
if (BM_DEBUG) { if (BM_DEBUG) {
fmt::print("Granule {0} will be partially deleted\n", currHistoryNode.granuleID.toString()); fmt::print("Granule {0} will be partially deleted\n", currHistoryNode.granuleID.toString());
} }
toPartiallyDelete.push_back({ currHistoryNode.granuleID }); toPartiallyDelete.push_back({ currHistoryNode.granuleID, currRange });
} }
// add all of the node's parents to the queue // add all of the node's parents to the queue
@ -2668,12 +2704,13 @@ ACTOR Future<Void> purgeRange(Reference<BlobManagerData> self, KeyRangeRef range
for (i = toFullyDelete.size() - 1; i >= 0; --i) { for (i = toFullyDelete.size() - 1; i >= 0; --i) {
state UID granuleId; state UID granuleId;
Key historyKey; Key historyKey;
std::tie(granuleId, historyKey) = toFullyDelete[i]; KeyRange keyRange;
std::tie(granuleId, historyKey, keyRange) = toFullyDelete[i];
// FIXME: consider batching into a single txn (need to take care of txn size limit) // FIXME: consider batching into a single txn (need to take care of txn size limit)
if (BM_DEBUG) { if (BM_DEBUG) {
fmt::print("About to fully delete granule {0}\n", granuleId.toString()); fmt::print("About to fully delete granule {0}\n", granuleId.toString());
} }
wait(fullyDeleteGranule(self, granuleId, historyKey, purgeVersion)); wait(fullyDeleteGranule(self, granuleId, historyKey, purgeVersion, range));
} }
if (BM_DEBUG) { if (BM_DEBUG) {
@ -2681,11 +2718,13 @@ ACTOR Future<Void> purgeRange(Reference<BlobManagerData> self, KeyRangeRef range
} }
for (i = toPartiallyDelete.size() - 1; i >= 0; --i) { for (i = toPartiallyDelete.size() - 1; i >= 0; --i) {
UID granuleId = toPartiallyDelete[i]; UID granuleId;
KeyRange range;
std::tie(granuleId, range) = toPartiallyDelete[i];
if (BM_DEBUG) { if (BM_DEBUG) {
fmt::print("About to partially delete granule {0}\n", granuleId.toString()); fmt::print("About to partially delete granule {0}\n", granuleId.toString());
} }
partialDeletions.emplace_back(partiallyDeleteGranule(self, granuleId, purgeVersion)); partialDeletions.emplace_back(partiallyDeleteGranule(self, granuleId, purgeVersion, range));
} }
wait(waitForAll(partialDeletions)); wait(waitForAll(partialDeletions));
@ -2906,9 +2945,10 @@ static void blobManagerExclusionSafetyCheck(Reference<BlobManagerData> self,
ACTOR Future<int64_t> bgccCheckGranule(Reference<BlobManagerData> bmData, KeyRange range) { ACTOR Future<int64_t> bgccCheckGranule(Reference<BlobManagerData> bmData, KeyRange range) {
state std::pair<RangeResult, Version> fdbResult = wait(readFromFDB(bmData->db, range)); state std::pair<RangeResult, Version> fdbResult = wait(readFromFDB(bmData->db, range));
state Reference<BlobConnectionProvider> bstore = wait(getBStoreForGranule(bmData, range));
std::pair<RangeResult, Standalone<VectorRef<BlobGranuleChunkRef>>> blobResult = std::pair<RangeResult, Standalone<VectorRef<BlobGranuleChunkRef>>> blobResult =
wait(readFromBlob(bmData->db, bmData->bstore, range, 0, fdbResult.second)); wait(readFromBlob(bmData->db, bstore, range, 0, fdbResult.second));
if (!compareFDBAndBlob(fdbResult.first, blobResult, range, fdbResult.second, BM_DEBUG)) { if (!compareFDBAndBlob(fdbResult.first, blobResult, range, fdbResult.second, BM_DEBUG)) {
++bmData->stats.ccMismatches; ++bmData->stats.ccMismatches;
@ -3008,6 +3048,7 @@ ACTOR Future<Void> blobManager(BlobManagerInterface bmInterf,
} }
state Reference<BlobManagerData> self = state Reference<BlobManagerData> self =
makeReference<BlobManagerData>(deterministicRandom()->randomUniqueID(), makeReference<BlobManagerData>(deterministicRandom()->randomUniqueID(),
dbInfo,
openDBOnServer(dbInfo, TaskPriority::DefaultEndpoint, LockAware::True), openDBOnServer(dbInfo, TaskPriority::DefaultEndpoint, LockAware::True),
bmInterf.locality.dcId()); bmInterf.locality.dcId());

View File

@ -27,19 +27,20 @@
#include "fdbclient/FDBTypes.h" #include "fdbclient/FDBTypes.h"
#include "fdbclient/SystemData.h" #include "fdbclient/SystemData.h"
#include "fdbclient/BackupContainerFileSystem.h" #include "fdbclient/BackupContainerFileSystem.h"
#include "fdbclient/BlobConnectionProvider.h"
#include "fdbclient/BlobGranuleCommon.h" #include "fdbclient/BlobGranuleCommon.h"
#include "fdbclient/BlobGranuleReader.actor.h" #include "fdbclient/BlobGranuleReader.actor.h"
#include "fdbclient/BlobMetadataUtils.h"
#include "fdbclient/BlobWorkerCommon.h" #include "fdbclient/BlobWorkerCommon.h"
#include "fdbclient/BlobWorkerInterface.h" #include "fdbclient/BlobWorkerInterface.h"
#include "fdbclient/DatabaseContext.h" #include "fdbclient/DatabaseContext.h"
#include "fdbclient/ManagementAPI.actor.h" #include "fdbclient/ManagementAPI.actor.h"
#include "fdbclient/NativeAPI.actor.h" #include "fdbclient/NativeAPI.actor.h"
#include "fdbclient/Notified.h" #include "fdbclient/Notified.h"
#include "fdbclient/Tenant.h"
#include "fdbserver/BlobMetadataUtils.h"
#include "fdbserver/Knobs.h" #include "fdbserver/Knobs.h"
#include "fdbserver/BlobGranuleServerCommon.actor.h" #include "fdbserver/BlobGranuleServerCommon.actor.h"
#include "fdbserver/BlobConnectionProvider.h"
#include "fdbserver/MutationTracking.h" #include "fdbserver/MutationTracking.h"
#include "fdbserver/WaitFailure.h" #include "fdbserver/WaitFailure.h"
#include "fdbserver/ServerDBInfo.h" #include "fdbserver/ServerDBInfo.h"
@ -51,7 +52,7 @@
#include "flow/actorcompiler.h" // has to be last include #include "flow/actorcompiler.h" // has to be last include
#define BW_DEBUG false #define BW_DEBUG true
#define BW_REQUEST_DEBUG false #define BW_REQUEST_DEBUG false
/* /*
@ -158,18 +159,6 @@ struct GranuleHistoryEntry : NonCopyable, ReferenceCounted<GranuleHistoryEntry>
: range(range), granuleID(granuleID), startVersion(startVersion), endVersion(endVersion) {} : range(range), granuleID(granuleID), startVersion(startVersion), endVersion(endVersion) {}
}; };
// TODO: does this need to be versioned like SS has?
struct GranuleTenantData : NonCopyable, ReferenceCounted<GranuleTenantData> {
TenantName name;
TenantMapEntry entry;
// TODO add other useful stuff like per-tenant blob connection, if necessary
Reference<BlobConnectionProvider> conn;
Promise<Void> connLoaded;
GranuleTenantData() {}
GranuleTenantData(TenantName name, TenantMapEntry entry) : name(name), entry(entry) {}
};
struct BlobWorkerData : NonCopyable, ReferenceCounted<BlobWorkerData> { struct BlobWorkerData : NonCopyable, ReferenceCounted<BlobWorkerData> {
UID id; UID id;
Database db; Database db;
@ -186,10 +175,9 @@ struct BlobWorkerData : NonCopyable, ReferenceCounted<BlobWorkerData> {
// FIXME: refactor out the parts of this that are just for interacting with blob stores from the backup business // FIXME: refactor out the parts of this that are just for interacting with blob stores from the backup business
// logic // logic
Reference<BackupContainerFileSystem> bstore; Reference<BlobConnectionProvider> bstore;
KeyRangeMap<GranuleRangeMetadata> granuleMetadata; KeyRangeMap<GranuleRangeMetadata> granuleMetadata;
KeyRangeMap<Reference<GranuleTenantData>> tenantData; BGTenantMap tenantData;
std::unordered_map<int64_t, TenantMapEntry> tenantInfoById;
// contains the history of completed granules before the existing ones. Maps to the latest one, and has // contains the history of completed granules before the existing ones. Maps to the latest one, and has
// back-pointers to earlier granules // back-pointers to earlier granules
@ -207,8 +195,8 @@ struct BlobWorkerData : NonCopyable, ReferenceCounted<BlobWorkerData> {
int changeFeedStreamReplyBufferSize = SERVER_KNOBS->BG_DELTA_FILE_TARGET_BYTES / 2; int changeFeedStreamReplyBufferSize = SERVER_KNOBS->BG_DELTA_FILE_TARGET_BYTES / 2;
BlobWorkerData(UID id, Database db) BlobWorkerData(UID id, Reference<AsyncVar<ServerDBInfo> const> dbInfo, Database db)
: id(id), db(db), stats(id, SERVER_KNOBS->WORKER_LOGGING_INTERVAL), : id(id), db(db), stats(id, SERVER_KNOBS->WORKER_LOGGING_INTERVAL), tenantData(BGTenantMap(dbInfo)),
initialSnapshotLock(SERVER_KNOBS->BLOB_WORKER_INITIAL_SNAPSHOT_PARALLELISM) {} initialSnapshotLock(SERVER_KNOBS->BLOB_WORKER_INITIAL_SNAPSHOT_PARALLELISM) {}
bool managerEpochOk(int64_t epoch) { bool managerEpochOk(int64_t epoch) {
@ -479,6 +467,7 @@ ACTOR Future<std::pair<BlobGranuleSplitState, Version>> getGranuleSplitState(Tra
// in flight. The synchronization happens after the s3 file is written, but before we update the FDB index of what files // in flight. The synchronization happens after the s3 file is written, but before we update the FDB index of what files
// exist. Before updating FDB, we ensure the version is committed and all previous delta files have updated FDB. // exist. Before updating FDB, we ensure the version is committed and all previous delta files have updated FDB.
ACTOR Future<BlobFileIndex> writeDeltaFile(Reference<BlobWorkerData> bwData, ACTOR Future<BlobFileIndex> writeDeltaFile(Reference<BlobWorkerData> bwData,
Reference<BlobConnectionProvider> bstore,
KeyRange keyRange, KeyRange keyRange,
UID granuleID, UID granuleID,
int64_t epoch, int64_t epoch,
@ -492,9 +481,9 @@ ACTOR Future<BlobFileIndex> writeDeltaFile(Reference<BlobWorkerData> bwData,
// Prefix filename with random chars both to avoid hotspotting on granuleID, and to have unique file names if // Prefix filename with random chars both to avoid hotspotting on granuleID, and to have unique file names if
// multiple blob workers try to create the exact same file at the same millisecond (which observably happens) // multiple blob workers try to create the exact same file at the same millisecond (which observably happens)
state std::string fname = deterministicRandom()->randomUniqueID().shortString() + "_" + granuleID.toString() + std::string fileName = deterministicRandom()->randomUniqueID().shortString() + "_" + granuleID.toString() + "_T" +
"_T" + std::to_string((uint64_t)(1000.0 * now())) + "_V" + std::to_string((uint64_t)(1000.0 * now())) + "_V" + std::to_string(currentDeltaVersion) +
std::to_string(currentDeltaVersion) + ".delta"; ".delta";
state Value serialized = ObjectWriter::toValue(deltasToWrite, Unversioned()); state Value serialized = ObjectWriter::toValue(deltasToWrite, Unversioned());
state size_t serializedSize = serialized.size(); state size_t serializedSize = serialized.size();
@ -502,7 +491,10 @@ ACTOR Future<BlobFileIndex> writeDeltaFile(Reference<BlobWorkerData> bwData,
// Free up deltasToWrite here to reduce memory // Free up deltasToWrite here to reduce memory
deltasToWrite = Standalone<GranuleDeltas>(); deltasToWrite = Standalone<GranuleDeltas>();
state Reference<IBackupFile> objectFile = wait(bwData->bstore->writeFile(fname)); state Reference<BackupContainerFileSystem> writeBStore;
state std::string fname;
std::tie(writeBStore, fname) = bstore->createForWrite(fileName);
state Reference<IBackupFile> objectFile = wait(writeBStore->writeFile(fname));
++bwData->stats.s3PutReqs; ++bwData->stats.s3PutReqs;
++bwData->stats.deltaFilesWritten; ++bwData->stats.deltaFilesWritten;
@ -577,12 +569,13 @@ ACTOR Future<BlobFileIndex> writeDeltaFile(Reference<BlobWorkerData> bwData,
} }
TEST(true); // Granule cleaning up delta file after error TEST(true); // Granule cleaning up delta file after error
++bwData->stats.s3DeleteReqs; ++bwData->stats.s3DeleteReqs;
bwData->addActor.send(bwData->bstore->deleteFile(fname)); bwData->addActor.send(writeBStore->deleteFile(fname));
throw e; throw e;
} }
} }
ACTOR Future<BlobFileIndex> writeSnapshot(Reference<BlobWorkerData> bwData, ACTOR Future<BlobFileIndex> writeSnapshot(Reference<BlobWorkerData> bwData,
Reference<BlobConnectionProvider> bstore,
KeyRange keyRange, KeyRange keyRange,
UID granuleID, UID granuleID,
int64_t epoch, int64_t epoch,
@ -592,7 +585,7 @@ ACTOR Future<BlobFileIndex> writeSnapshot(Reference<BlobWorkerData> bwData,
bool createGranuleHistory) { bool createGranuleHistory) {
// Prefix filename with random chars both to avoid hotspotting on granuleID, and to have unique file names if // Prefix filename with random chars both to avoid hotspotting on granuleID, and to have unique file names if
// multiple blob workers try to create the exact same file at the same millisecond (which observably happens) // multiple blob workers try to create the exact same file at the same millisecond (which observably happens)
state std::string fname = deterministicRandom()->randomUniqueID().shortString() + "_" + granuleID.toString() + state std::string fileName = deterministicRandom()->randomUniqueID().shortString() + "_" + granuleID.toString() +
"_T" + std::to_string((uint64_t)(1000.0 * now())) + "_V" + std::to_string(version) + "_T" + std::to_string((uint64_t)(1000.0 * now())) + "_V" + std::to_string(version) +
".snapshot"; ".snapshot";
state Standalone<GranuleSnapshot> snapshot; state Standalone<GranuleSnapshot> snapshot;
@ -644,7 +637,10 @@ ACTOR Future<BlobFileIndex> writeSnapshot(Reference<BlobWorkerData> bwData,
snapshot = Standalone<GranuleSnapshot>(); snapshot = Standalone<GranuleSnapshot>();
// write to blob using multi part upload // write to blob using multi part upload
state Reference<IBackupFile> objectFile = wait(bwData->bstore->writeFile(fname)); state Reference<BackupContainerFileSystem> writeBStore;
state std::string fname;
std::tie(writeBStore, fname) = bstore->createForWrite(fileName);
state Reference<IBackupFile> objectFile = wait(writeBStore->writeFile(fname));
++bwData->stats.s3PutReqs; ++bwData->stats.s3PutReqs;
++bwData->stats.snapshotFilesWritten; ++bwData->stats.snapshotFilesWritten;
@ -698,7 +694,7 @@ ACTOR Future<BlobFileIndex> writeSnapshot(Reference<BlobWorkerData> bwData,
} }
TEST(true); // Granule deleting snapshot file after error TEST(true); // Granule deleting snapshot file after error
++bwData->stats.s3DeleteReqs; ++bwData->stats.s3DeleteReqs;
bwData->addActor.send(bwData->bstore->deleteFile(fname)); bwData->addActor.send(writeBStore->deleteFile(fname));
throw e; throw e;
} }
@ -719,6 +715,7 @@ ACTOR Future<BlobFileIndex> writeSnapshot(Reference<BlobWorkerData> bwData,
} }
ACTOR Future<BlobFileIndex> dumpInitialSnapshotFromFDB(Reference<BlobWorkerData> bwData, ACTOR Future<BlobFileIndex> dumpInitialSnapshotFromFDB(Reference<BlobWorkerData> bwData,
Reference<BlobConnectionProvider> bstore,
Reference<GranuleMetadata> metadata, Reference<GranuleMetadata> metadata,
UID granuleID, UID granuleID,
Key cfKey, Key cfKey,
@ -747,6 +744,7 @@ ACTOR Future<BlobFileIndex> dumpInitialSnapshotFromFDB(Reference<BlobWorkerData>
ASSERT(lastReadVersion <= readVersion); ASSERT(lastReadVersion <= readVersion);
state PromiseStream<RangeResult> rowsStream; state PromiseStream<RangeResult> rowsStream;
state Future<BlobFileIndex> snapshotWriter = writeSnapshot(bwData, state Future<BlobFileIndex> snapshotWriter = writeSnapshot(bwData,
bstore,
metadata->keyRange, metadata->keyRange,
granuleID, granuleID,
metadata->originalEpoch, metadata->originalEpoch,
@ -805,6 +803,7 @@ ACTOR Future<BlobFileIndex> dumpInitialSnapshotFromFDB(Reference<BlobWorkerData>
// files might not be the current set of files in metadata, in the case of doing the initial snapshot of a granule that // files might not be the current set of files in metadata, in the case of doing the initial snapshot of a granule that
// was split. // was split.
ACTOR Future<BlobFileIndex> compactFromBlob(Reference<BlobWorkerData> bwData, ACTOR Future<BlobFileIndex> compactFromBlob(Reference<BlobWorkerData> bwData,
Reference<BlobConnectionProvider> bstore,
Reference<GranuleMetadata> metadata, Reference<GranuleMetadata> metadata,
UID granuleID, UID granuleID,
GranuleFiles files, GranuleFiles files,
@ -859,6 +858,7 @@ ACTOR Future<BlobFileIndex> compactFromBlob(Reference<BlobWorkerData> bwData,
try { try {
state PromiseStream<RangeResult> rowsStream; state PromiseStream<RangeResult> rowsStream;
state Future<BlobFileIndex> snapshotWriter = writeSnapshot(bwData, state Future<BlobFileIndex> snapshotWriter = writeSnapshot(bwData,
bstore,
metadata->keyRange, metadata->keyRange,
granuleID, granuleID,
metadata->originalEpoch, metadata->originalEpoch,
@ -867,7 +867,7 @@ ACTOR Future<BlobFileIndex> compactFromBlob(Reference<BlobWorkerData> bwData,
rowsStream, rowsStream,
false); false);
RangeResult newGranule = RangeResult newGranule =
wait(readBlobGranule(chunk, metadata->keyRange, 0, version, bwData->bstore, &bwData->stats)); wait(readBlobGranule(chunk, metadata->keyRange, 0, version, bstore, &bwData->stats));
bwData->stats.bytesReadFromS3ForCompaction += compactBytesRead; bwData->stats.bytesReadFromS3ForCompaction += compactBytesRead;
rowsStream.send(std::move(newGranule)); rowsStream.send(std::move(newGranule));
@ -906,6 +906,7 @@ struct CounterHolder {
}; };
ACTOR Future<BlobFileIndex> checkSplitAndReSnapshot(Reference<BlobWorkerData> bwData, ACTOR Future<BlobFileIndex> checkSplitAndReSnapshot(Reference<BlobWorkerData> bwData,
Reference<BlobConnectionProvider> bstore,
Reference<GranuleMetadata> metadata, Reference<GranuleMetadata> metadata,
UID granuleID, UID granuleID,
int64_t bytesInNewDeltaFiles, int64_t bytesInNewDeltaFiles,
@ -1012,7 +1013,7 @@ ACTOR Future<BlobFileIndex> checkSplitAndReSnapshot(Reference<BlobWorkerData> bw
wait(delay(FLOW_KNOBS->PREVENT_FAST_SPIN_DELAY)); wait(delay(FLOW_KNOBS->PREVENT_FAST_SPIN_DELAY));
} }
BlobFileIndex reSnapshotIdx = BlobFileIndex reSnapshotIdx =
wait(compactFromBlob(bwData, metadata, granuleID, metadata->files, reSnapshotVersion)); wait(compactFromBlob(bwData, bstore, metadata, granuleID, metadata->files, reSnapshotVersion));
return reSnapshotIdx; return reSnapshotIdx;
} }
@ -1305,7 +1306,9 @@ ACTOR Future<Void> waitVersionCommitted(Reference<BlobWorkerData> bwData,
// TODO: this is getting kind of large. Should try to split out this actor if it continues to grow? // TODO: this is getting kind of large. Should try to split out this actor if it continues to grow?
ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData, ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
Reference<GranuleMetadata> metadata, Reference<GranuleMetadata> metadata,
Future<GranuleStartState> assignFuture) { Future<GranuleStartState> assignFuture,
Future<Reference<BlobConnectionProvider>> bstoreFuture) {
state Reference<BlobConnectionProvider> bstore;
state std::deque<InFlightFile> inFlightFiles; state std::deque<InFlightFile> inFlightFiles;
state std::deque<Future<Void>> inFlightPops; state std::deque<Future<Void>> inFlightPops;
state Future<Void> oldChangeFeedFuture; state Future<Void> oldChangeFeedFuture;
@ -1327,9 +1330,9 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
// set resume snapshot so it's not valid until we pause to ask the blob manager for a re-snapshot // set resume snapshot so it's not valid until we pause to ask the blob manager for a re-snapshot
metadata->resumeSnapshot.send(Void()); metadata->resumeSnapshot.send(Void());
// before starting, make sure worker persists range assignment and acquires the granule lock // before starting, make sure worker persists range assignment, acquires the granule lock, and has a blob store
GranuleStartState _info = wait(assignFuture); wait(store(startState, assignFuture));
startState = _info; wait(store(bstore, bstoreFuture));
wait(delay(0, TaskPriority::BlobWorkerUpdateStorage)); wait(delay(0, TaskPriority::BlobWorkerUpdateStorage));
@ -1385,15 +1388,15 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
if (startState.blobFilesToSnapshot.present()) { if (startState.blobFilesToSnapshot.present()) {
startVersion = startState.previousDurableVersion; startVersion = startState.previousDurableVersion;
Future<BlobFileIndex> inFlightBlobSnapshot = compactFromBlob( Future<BlobFileIndex> inFlightBlobSnapshot = compactFromBlob(
bwData, metadata, startState.granuleID, startState.blobFilesToSnapshot.get(), startVersion); bwData, bstore, metadata, startState.granuleID, startState.blobFilesToSnapshot.get(), startVersion);
inFlightFiles.push_back(InFlightFile(inFlightBlobSnapshot, startVersion, 0, true)); inFlightFiles.push_back(InFlightFile(inFlightBlobSnapshot, startVersion, 0, true));
pendingSnapshots++; pendingSnapshots++;
metadata->durableSnapshotVersion.set(startState.blobFilesToSnapshot.get().snapshotFiles.back().version); metadata->durableSnapshotVersion.set(startState.blobFilesToSnapshot.get().snapshotFiles.back().version);
} else { } else {
ASSERT(startState.previousDurableVersion == invalidVersion); ASSERT(startState.previousDurableVersion == invalidVersion);
BlobFileIndex fromFDB = BlobFileIndex fromFDB = wait(
wait(dumpInitialSnapshotFromFDB(bwData, metadata, startState.granuleID, cfKey, &inFlightPops)); dumpInitialSnapshotFromFDB(bwData, bstore, metadata, startState.granuleID, cfKey, &inFlightPops));
newSnapshotFile = fromFDB; newSnapshotFile = fromFDB;
ASSERT(startState.changeFeedStartVersion <= fromFDB.version); ASSERT(startState.changeFeedStartVersion <= fromFDB.version);
startVersion = newSnapshotFile.version; startVersion = newSnapshotFile.version;
@ -1764,6 +1767,7 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
} }
Future<BlobFileIndex> dfFuture = Future<BlobFileIndex> dfFuture =
writeDeltaFile(bwData, writeDeltaFile(bwData,
bstore,
metadata->keyRange, metadata->keyRange,
startState.granuleID, startState.granuleID,
metadata->originalEpoch, metadata->originalEpoch,
@ -1825,6 +1829,7 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
int64_t versionsSinceLastSnapshot = int64_t versionsSinceLastSnapshot =
metadata->pendingDeltaVersion - metadata->pendingSnapshotVersion; metadata->pendingDeltaVersion - metadata->pendingSnapshotVersion;
Future<BlobFileIndex> inFlightBlobSnapshot = checkSplitAndReSnapshot(bwData, Future<BlobFileIndex> inFlightBlobSnapshot = checkSplitAndReSnapshot(bwData,
bstore,
metadata, metadata,
startState.granuleID, startState.granuleID,
metadata->bytesInNewDeltaFiles, metadata->bytesInNewDeltaFiles,
@ -2227,10 +2232,10 @@ ACTOR Future<Void> doBlobGranuleFileRequest(Reference<BlobWorkerData> bwData, Bl
state Optional<Key> tenantPrefix; state Optional<Key> tenantPrefix;
if (req.tenantInfo.name.present()) { if (req.tenantInfo.name.present()) {
ASSERT(req.tenantInfo.tenantId != TenantInfo::INVALID_TENANT); ASSERT(req.tenantInfo.tenantId != TenantInfo::INVALID_TENANT);
auto tenantEntry = bwData->tenantInfoById.find(req.tenantInfo.tenantId); Optional<TenantMapEntry> tenantEntry = bwData->tenantData.getTenantById(req.tenantInfo.tenantId);
if (tenantEntry != bwData->tenantInfoById.end()) { if (tenantEntry.present()) {
ASSERT(tenantEntry->second.id == req.tenantInfo.tenantId); ASSERT(tenantEntry.get().id == req.tenantInfo.tenantId);
tenantPrefix = tenantEntry->second.prefix; tenantPrefix = tenantEntry.get().prefix;
} else { } else {
TEST(true); // Blob worker unknown tenant TEST(true); // Blob worker unknown tenant
// FIXME - better way. Wait on retry here, or just have better model for tenant metadata? // FIXME - better way. Wait on retry here, or just have better model for tenant metadata?
@ -2744,11 +2749,39 @@ ACTOR Future<GranuleStartState> openGranule(Reference<BlobWorkerData> bwData, As
} }
} }
ACTOR Future<Reference<BlobConnectionProvider>> loadBStoreForTenant(Reference<BlobWorkerData> bwData,
KeyRange keyRange) {
state int retryCount = 0;
loop {
state Reference<GranuleTenantData> data = bwData->tenantData.getDataForGranule(keyRange);
if (data.isValid()) {
wait(data->bstoreLoaded.getFuture());
return data->bstore;
} else {
TEST(true); // bstore for unknown tenant
// Assume not loaded yet, just wait a bit. Could do sophisticated mechanism but will redo tenant loading to
// be versioned anyway
retryCount++;
TraceEvent(retryCount < 10 ? SevDebug : SevWarn, "BlobWorkerUnknownTenantForGranule", bwData->id)
.detail("KeyRange", keyRange);
wait(delay(0.1));
}
}
}
ACTOR Future<Void> start(Reference<BlobWorkerData> bwData, GranuleRangeMetadata* meta, AssignBlobRangeRequest req) { ACTOR Future<Void> start(Reference<BlobWorkerData> bwData, GranuleRangeMetadata* meta, AssignBlobRangeRequest req) {
ASSERT(meta->activeMetadata.isValid()); ASSERT(meta->activeMetadata.isValid());
Future<Reference<BlobConnectionProvider>> loadBStore;
if (SERVER_KNOBS->BG_METADATA_SOURCE != "tenant") {
loadBStore = Future<Reference<BlobConnectionProvider>>(bwData->bstore); // done
} else {
loadBStore = loadBStoreForTenant(bwData, req.keyRange);
}
meta->activeMetadata->originalReq = req; meta->activeMetadata->originalReq = req;
meta->assignFuture = openGranule(bwData, req); meta->assignFuture = openGranule(bwData, req);
meta->fileUpdaterFuture = blobGranuleUpdateFiles(bwData, meta->activeMetadata, meta->assignFuture); meta->fileUpdaterFuture = blobGranuleUpdateFiles(bwData, meta->activeMetadata, meta->assignFuture, loadBStore);
meta->historyLoaderFuture = blobGranuleLoadHistory(bwData, meta->activeMetadata, meta->assignFuture); meta->historyLoaderFuture = blobGranuleLoadHistory(bwData, meta->activeMetadata, meta->assignFuture);
wait(success(meta->assignFuture)); wait(success(meta->assignFuture));
return Void(); return Void();
@ -3115,72 +3148,6 @@ ACTOR Future<Void> monitorRemoval(Reference<BlobWorkerData> bwData) {
} }
} }
// FIXME: if credentials can expire, refresh periodically
ACTOR Future<Void> loadBlobMetadataForTenants(Reference<BlobWorkerData> bwData,
Reference<AsyncVar<ServerDBInfo> const> dbInfo,
std::vector<TenantMapEntry> tenantMapEntries) {
ASSERT(SERVER_KNOBS->BG_METADATA_SOURCE == "tenant");
ASSERT(!tenantMapEntries.empty());
state std::vector<BlobMetadataDomainId> domainIds;
for (auto& entry : tenantMapEntries) {
if (BW_DEBUG) {
fmt::print(
"BW {0} loading blob metadata for tenant {1}\n", bwData->id.shortString().substr(0, 5), entry.id);
}
domainIds.push_back(entry.id);
}
// FIXME: if one tenant gets an error, don't kill whole blob worker
// TODO: add latency metrics
loop {
Future<EKPGetLatestBlobMetadataReply> requestFuture;
if (dbInfo.isValid() && dbInfo->get().encryptKeyProxy.present()) {
EKPGetLatestBlobMetadataRequest req;
req.domainIds = domainIds;
requestFuture =
brokenPromiseToNever(dbInfo->get().encryptKeyProxy.get().getLatestBlobMetadata.getReply(req));
} else {
requestFuture = Never();
}
choose {
when(EKPGetLatestBlobMetadataReply rep = wait(requestFuture)) {
ASSERT(rep.blobMetadataDetails.size() == domainIds.size());
// not guaranteed to be in same order in the request as the response
for (auto& metadata : rep.blobMetadataDetails) {
auto info = bwData->tenantInfoById.find(metadata.domainId);
if (info == bwData->tenantInfoById.end()) {
TraceEvent(SevWarn, "BlobWorkerTenantDeletedWhileLoadMetadata", bwData->id)
.detail("TenantId", metadata.domainId);
continue;
}
auto dataEntry = bwData->tenantData.rangeContaining(info->second.prefix);
ASSERT(dataEntry.begin() == info->second.prefix);
dataEntry.cvalue()->conn = BlobConnectionProvider::newBlobConnectionProvider(metadata);
dataEntry.cvalue()->connLoaded.send(Void());
TraceEvent(SevDebug, "BlobWorkerTenantMetadataLoaded", bwData->id)
.detail("TenantId", metadata.domainId);
if (BW_DEBUG) {
fmt::print("BW {0} loaded blob metadata for {1}: {2}",
bwData->id.shortString().substr(0, 5),
metadata.domainId,
metadata.base.present() ? metadata.base.get().toString() : "");
if (metadata.partitions.empty()) {
fmt::print("\n");
} else {
fmt::print(" ({0})\n", metadata.partitions.size());
for (auto& it : metadata.partitions) {
fmt::print(" {0}\n", it.toString());
}
}
}
}
return Void();
}
when(wait(dbInfo->onChange())) {}
}
}
}
// Because change feeds send uncommitted data and explicit rollback messages, we speculatively buffer/write // Because change feeds send uncommitted data and explicit rollback messages, we speculatively buffer/write
// uncommitted data. This means we must ensure the data is actually committed before "committing" those writes in // uncommitted data. This means we must ensure the data is actually committed before "committing" those writes in
// the blob granule. The simplest way to do this is to have the blob worker do a periodic GRV, which is guaranteed // the blob granule. The simplest way to do this is to have the blob worker do a periodic GRV, which is guaranteed
@ -3213,7 +3180,7 @@ ACTOR Future<Void> runGRVChecks(Reference<BlobWorkerData> bwData) {
// FIXME: better way to do this? // FIXME: better way to do this?
// monitor system keyspace for new tenants // monitor system keyspace for new tenants
ACTOR Future<Void> monitorTenants(Reference<BlobWorkerData> bwData, Reference<AsyncVar<ServerDBInfo> const> dbInfo) { ACTOR Future<Void> monitorTenants(Reference<BlobWorkerData> bwData) {
loop { loop {
state Reference<ReadYourWritesTransaction> tr = makeReference<ReadYourWritesTransaction>(bwData->db); state Reference<ReadYourWritesTransaction> tr = makeReference<ReadYourWritesTransaction>(bwData->db);
loop { loop {
@ -3233,33 +3200,14 @@ ACTOR Future<Void> monitorTenants(Reference<BlobWorkerData> bwData, Reference<As
throw internal_error(); throw internal_error();
} }
std::vector<TenantMapEntry> tenantsToLoad; std::vector<std::pair<TenantName, TenantMapEntry>> tenants;
for (auto& it : tenantResults) { for (auto& it : tenantResults) {
// FIXME: handle removing/moving tenants! // FIXME: handle removing/moving tenants!
StringRef tenantName = it.key.removePrefix(tenantMapPrefix); StringRef tenantName = it.key.removePrefix(tenantMapPrefix);
TenantMapEntry entry = decodeTenantEntry(it.value); TenantMapEntry entry = decodeTenantEntry(it.value);
tenants.push_back(std::pair(tenantName, entry));
if (bwData->tenantInfoById.insert({ entry.id, entry }).second) {
if (BW_DEBUG) {
fmt::print("BW {0} found new tenant {1}: {2} {3}\n",
bwData->id.shortString().substr(0, 5),
tenantName.printable(),
entry.id,
entry.prefix.printable());
}
auto r = makeReference<GranuleTenantData>(tenantName, entry);
bwData->tenantData.insert(KeyRangeRef(entry.prefix, entry.prefix.withSuffix(normalKeys.end)),
r);
if (SERVER_KNOBS->BG_METADATA_SOURCE != "tenant") {
r->connLoaded.send(Void());
} else {
tenantsToLoad.push_back(entry);
}
}
}
if (!tenantsToLoad.empty()) {
bwData->addActor.send(loadBlobMetadataForTenants(bwData, dbInfo, tenantsToLoad));
} }
bwData->tenantData.addTenants(tenants);
state Future<Void> watchChange = tr->watch(tenantLastIdKey); state Future<Void> watchChange = tr->watch(tenantLastIdKey);
wait(tr->commit()); wait(tr->commit());
@ -3298,8 +3246,8 @@ static void handleGetGranuleAssignmentsRequest(Reference<BlobWorkerData> self,
ACTOR Future<Void> blobWorker(BlobWorkerInterface bwInterf, ACTOR Future<Void> blobWorker(BlobWorkerInterface bwInterf,
ReplyPromise<InitializeBlobWorkerReply> recruitReply, ReplyPromise<InitializeBlobWorkerReply> recruitReply,
Reference<AsyncVar<ServerDBInfo> const> dbInfo) { Reference<AsyncVar<ServerDBInfo> const> dbInfo) {
state Reference<BlobWorkerData> self( state Reference<BlobWorkerData> self(new BlobWorkerData(
new BlobWorkerData(bwInterf.id(), openDBOnServer(dbInfo, TaskPriority::DefaultEndpoint, LockAware::True))); bwInterf.id(), dbInfo, openDBOnServer(dbInfo, TaskPriority::DefaultEndpoint, LockAware::True)));
self->id = bwInterf.id(); self->id = bwInterf.id();
self->locality = bwInterf.locality; self->locality = bwInterf.locality;
@ -3310,19 +3258,21 @@ ACTOR Future<Void> blobWorker(BlobWorkerInterface bwInterf,
} }
try { try {
if (SERVER_KNOBS->BG_RANGE_SOURCE != "tenant") {
if (BW_DEBUG) { if (BW_DEBUG) {
fmt::print("BW constructing backup container from {0}\n", SERVER_KNOBS->BG_URL); fmt::print("BW constructing backup container from {0}\n", SERVER_KNOBS->BG_URL);
} }
self->bstore = BackupContainerFileSystem::openContainerFS(SERVER_KNOBS->BG_URL, {}, {}); self->bstore = BlobConnectionProvider::newBlobConnectionProvider(SERVER_KNOBS->BG_URL);
if (BW_DEBUG) { if (BW_DEBUG) {
printf("BW constructed backup container\n"); printf("BW constructed backup container\n");
} }
}
// register the blob worker to the system keyspace // register the blob worker to the system keyspace
wait(registerBlobWorker(self, bwInterf)); wait(registerBlobWorker(self, bwInterf));
} catch (Error& e) { } catch (Error& e) {
if (BW_DEBUG) { if (BW_DEBUG) {
fmt::print("BW got backup container init error {0}\n", e.name()); fmt::print("BW got init error {0}\n", e.name());
} }
// if any errors came up while initializing the blob worker, let the blob manager know // if any errors came up while initializing the blob worker, let the blob manager know
// that recruitment failed // that recruitment failed
@ -3342,7 +3292,7 @@ ACTOR Future<Void> blobWorker(BlobWorkerInterface bwInterf,
self->addActor.send(waitFailureServer(bwInterf.waitFailure.getFuture())); self->addActor.send(waitFailureServer(bwInterf.waitFailure.getFuture()));
self->addActor.send(runGRVChecks(self)); self->addActor.send(runGRVChecks(self));
if (SERVER_KNOBS->BG_RANGE_SOURCE == "tenant") { if (SERVER_KNOBS->BG_RANGE_SOURCE == "tenant") {
self->addActor.send(monitorTenants(self, dbInfo)); self->addActor.send(monitorTenants(self));
} }
state Future<Void> selfRemoved = monitorRemoval(self); state Future<Void> selfRemoved = monitorRemoval(self);

View File

@ -5,15 +5,12 @@ set(FDBSERVER_SRCS
BackupProgress.actor.cpp BackupProgress.actor.cpp
BackupProgress.actor.h BackupProgress.actor.h
BackupWorker.actor.cpp BackupWorker.actor.cpp
BlobConnectionProvider.h
BlobConnectionProvider.cpp
BlobGranuleServerCommon.actor.cpp BlobGranuleServerCommon.actor.cpp
BlobGranuleServerCommon.actor.h BlobGranuleServerCommon.actor.h
BlobGranuleValidation.actor.cpp BlobGranuleValidation.actor.cpp
BlobGranuleValidation.actor.h BlobGranuleValidation.actor.h
BlobManager.actor.cpp BlobManager.actor.cpp
BlobManagerInterface.h BlobManagerInterface.h
BlobMetadataUtils.h
BlobWorker.actor.cpp BlobWorker.actor.cpp
ClusterController.actor.cpp ClusterController.actor.cpp
ClusterController.actor.h ClusterController.actor.h

View File

@ -26,10 +26,10 @@
#include "flow/network.h" #include "flow/network.h"
#pragma once #pragma once
#include "fdbclient/BlobMetadataUtils.h"
#include "fdbclient/FDBTypes.h" #include "fdbclient/FDBTypes.h"
#include "fdbrpc/fdbrpc.h" #include "fdbrpc/fdbrpc.h"
#include "fdbrpc/Locality.h" #include "fdbrpc/Locality.h"
#include "fdbserver/BlobMetadataUtils.h"
struct EncryptKeyProxyInterface { struct EncryptKeyProxyInterface {
constexpr static FileIdentifier file_identifier = 1303419; constexpr static FileIdentifier file_identifier = 1303419;

View File

@ -28,7 +28,7 @@
#include "flow/Trace.h" #include "flow/Trace.h"
#include "flow/flow.h" #include "flow/flow.h"
#include "flow/network.h" #include "flow/network.h"
#include "fdbserver/BlobMetadataUtils.h" #include "fdbclient/BlobMetadataUtils.h"
struct KmsConnectorInterface { struct KmsConnectorInterface {
constexpr static FileIdentifier file_identifier = 2416711; constexpr static FileIdentifier file_identifier = 2416711;

View File

@ -20,6 +20,7 @@
#include "fdbserver/SimKmsConnector.h" #include "fdbserver/SimKmsConnector.h"
#include "contrib/fmt-8.1.1/include/fmt/format.h"
#include "fdbrpc/sim_validation.h" #include "fdbrpc/sim_validation.h"
#include "fdbserver/Knobs.h" #include "fdbserver/Knobs.h"
#include "flow/ActorCollection.h" #include "flow/ActorCollection.h"
@ -151,17 +152,21 @@ static Standalone<BlobMetadataDetailsRef> createBlobMetadata(BlobMetadataDomainI
// 0 == no partition, 1 == suffix partitioned, 2 == storage location partitioned // 0 == no partition, 1 == suffix partitioned, 2 == storage location partitioned
int type = deterministicRandom()->randomInt(0, 3); int type = deterministicRandom()->randomInt(0, 3);
int partitionCount = (type == 0) ? 0 : deterministicRandom()->randomInt(2, 12); int partitionCount = (type == 0) ? 0 : deterministicRandom()->randomInt(2, 12);
fmt::print("SimBlobMetadata ({})\n", domainId);
if (type == 0) { if (type == 0) {
// single storage location // single storage location
metadata.base = StringRef(metadata.arena(), "file://fdbblob/" + std::to_string(domainId) + "/"); metadata.base = StringRef(metadata.arena(), "file://fdbblob/" + std::to_string(domainId) + "/");
fmt::print(" {}\n", metadata.base.get().printable());
} }
if (type == 1) { if (type == 1) {
// simulate hash prefixing in s3 // simulate hash prefixing in s3
metadata.base = StringRef(metadata.arena(), "file://fdbblob/"); metadata.base = StringRef(metadata.arena(), "file://fdbblob/"_sr);
fmt::print(" {} ({})\n", metadata.base.get().printable(), partitionCount);
for (int i = 0; i < partitionCount; i++) { for (int i = 0; i < partitionCount; i++) {
metadata.partitions.push_back_deep(metadata.arena(), metadata.partitions.push_back_deep(metadata.arena(),
deterministicRandom()->randomUniqueID().shortString() + "-" + deterministicRandom()->randomUniqueID().shortString() + "-" +
std::to_string(domainId) + "/"); std::to_string(domainId) + "/");
fmt::print(" {}\n", metadata.partitions.back().printable());
} }
} }
if (type == 2) { if (type == 2) {
@ -169,6 +174,7 @@ static Standalone<BlobMetadataDetailsRef> createBlobMetadata(BlobMetadataDomainI
for (int i = 0; i < partitionCount; i++) { for (int i = 0; i < partitionCount; i++) {
metadata.partitions.push_back_deep( metadata.partitions.push_back_deep(
metadata.arena(), "file://fdbblob" + std::to_string(domainId) + "_" + std::to_string(i) + "/"); metadata.arena(), "file://fdbblob" + std::to_string(domainId) + "_" + std::to_string(i) + "/");
fmt::print(" {}\n", metadata.partitions.back().printable());
} }
} }
return metadata; return metadata;

View File

@ -30,6 +30,7 @@
#include "fdbclient/NativeAPI.actor.h" #include "fdbclient/NativeAPI.actor.h"
#include "fdbclient/ReadYourWrites.h" #include "fdbclient/ReadYourWrites.h"
#include "fdbclient/SystemData.h" #include "fdbclient/SystemData.h"
#include "fdbserver/BlobGranuleServerCommon.actor.h"
#include "fdbserver/BlobGranuleValidation.actor.h" #include "fdbserver/BlobGranuleValidation.actor.h"
#include "fdbserver/Knobs.h" #include "fdbserver/Knobs.h"
#include "fdbserver/TesterInterface.actor.h" #include "fdbserver/TesterInterface.actor.h"
@ -66,6 +67,7 @@ struct ThreadData : ReferenceCounted<ThreadData>, NonCopyable {
KeyRange directoryRange; KeyRange directoryRange;
TenantName tenantName; TenantName tenantName;
TenantMapEntry tenant; TenantMapEntry tenant;
Reference<BlobConnectionProvider> bstore;
// key + value gen data // key + value gen data
// in vector for efficient random selection // in vector for efficient random selection
@ -150,7 +152,6 @@ struct BlobGranuleCorrectnessWorkload : TestWorkload {
std::vector<Reference<ThreadData>> directories; std::vector<Reference<ThreadData>> directories;
std::vector<Future<Void>> clients; std::vector<Future<Void>> clients;
DatabaseConfiguration config; DatabaseConfiguration config;
Reference<BackupContainerFileSystem> bstore;
BlobGranuleCorrectnessWorkload(WorkloadContext const& wcx) : TestWorkload(wcx) { BlobGranuleCorrectnessWorkload(WorkloadContext const& wcx) : TestWorkload(wcx) {
doSetup = !clientId; // only do this on the "first" client doSetup = !clientId; // only do this on the "first" client
@ -245,6 +246,8 @@ struct BlobGranuleCorrectnessWorkload : TestWorkload {
} }
state int directoryIdx = 0; state int directoryIdx = 0;
state std::vector<std::pair<TenantName, TenantMapEntry>> tenants;
state BGTenantMap tenantData(self->dbInfo);
for (; directoryIdx < self->directories.size(); directoryIdx++) { for (; directoryIdx < self->directories.size(); directoryIdx++) {
// Set up the blob range first // Set up the blob range first
TenantMapEntry tenantEntry = wait(self->setUpTenant(cx, self->directories[directoryIdx]->tenantName)); TenantMapEntry tenantEntry = wait(self->setUpTenant(cx, self->directories[directoryIdx]->tenantName));
@ -252,32 +255,17 @@ struct BlobGranuleCorrectnessWorkload : TestWorkload {
self->directories[directoryIdx]->tenant = tenantEntry; self->directories[directoryIdx]->tenant = tenantEntry;
self->directories[directoryIdx]->directoryRange = self->directories[directoryIdx]->directoryRange =
KeyRangeRef(tenantEntry.prefix, tenantEntry.prefix.withSuffix(normalKeys.end)); KeyRangeRef(tenantEntry.prefix, tenantEntry.prefix.withSuffix(normalKeys.end));
tenants.push_back({ self->directories[directoryIdx]->tenantName, tenantEntry });
} }
tenantData.addTenants(tenants);
if (BGW_DEBUG) { // wait for tenant data to be loaded
printf("Initializing Blob Granule Correctness s3 stuff\n");
} for (directoryIdx = 0; directoryIdx < self->directories.size(); directoryIdx++) {
try { state Reference<GranuleTenantData> data =
if (g_network->isSimulated()) { tenantData.getDataForGranule(self->directories[directoryIdx]->directoryRange);
if (BGW_DEBUG) { wait(data->bstoreLoaded.getFuture());
printf("Blob Granule Correctness constructing simulated backup container\n"); self->directories[directoryIdx]->bstore = data->bstore;
}
self->bstore = BackupContainerFileSystem::openContainerFS("file://fdbblob/", {}, {});
} else {
if (BGW_DEBUG) {
printf("Blob Granule Correctness constructing backup container from %s\n",
SERVER_KNOBS->BG_URL.c_str());
}
self->bstore = BackupContainerFileSystem::openContainerFS(SERVER_KNOBS->BG_URL, {}, {});
if (BGW_DEBUG) {
printf("Blob Granule Correctness constructed backup container\n");
}
}
} catch (Error& e) {
if (BGW_DEBUG) {
printf("Blob Granule Correctness got backup container init error %s\n", e.name());
}
throw e;
} }
return Void(); return Void();
@ -307,8 +295,13 @@ struct BlobGranuleCorrectnessWorkload : TestWorkload {
try { try {
Version rv = wait(self->doGrv(&tr)); Version rv = wait(self->doGrv(&tr));
state Version readVersion = rv; state Version readVersion = rv;
std::pair<RangeResult, Standalone<VectorRef<BlobGranuleChunkRef>>> blob = wait(readFromBlob( std::pair<RangeResult, Standalone<VectorRef<BlobGranuleChunkRef>>> blob =
cx, self->bstore, normalKeys /* tenant handles range */, 0, readVersion, threadData->tenantName)); wait(readFromBlob(cx,
threadData->bstore,
normalKeys /* tenant handles range */,
0,
readVersion,
threadData->tenantName));
fmt::print("Directory {0} got {1} RV {2}\n", fmt::print("Directory {0} got {1} RV {2}\n",
threadData->directoryID, threadData->directoryID,
doSetup ? "initial" : "final", doSetup ? "initial" : "final",
@ -681,8 +674,8 @@ struct BlobGranuleCorrectnessWorkload : TestWorkload {
beginVersion = threadData->writeVersions[beginVersionIdx]; beginVersion = threadData->writeVersions[beginVersionIdx];
} }
std::pair<RangeResult, Standalone<VectorRef<BlobGranuleChunkRef>>> blob = std::pair<RangeResult, Standalone<VectorRef<BlobGranuleChunkRef>>> blob = wait(
wait(readFromBlob(cx, self->bstore, range, beginVersion, readVersion, threadData->tenantName)); readFromBlob(cx, threadData->bstore, range, beginVersion, readVersion, threadData->tenantName));
self->validateResult(threadData, blob, startKey, endKey, beginVersion, readVersion); self->validateResult(threadData, blob, startKey, endKey, beginVersion, readVersion);
int resultBytes = blob.first.expectedSize(); int resultBytes = blob.first.expectedSize();
@ -883,7 +876,7 @@ struct BlobGranuleCorrectnessWorkload : TestWorkload {
fmt::print("Directory {0} doing final data check @ {1}\n", threadData->directoryID, readVersion); fmt::print("Directory {0} doing final data check @ {1}\n", threadData->directoryID, readVersion);
} }
std::pair<RangeResult, Standalone<VectorRef<BlobGranuleChunkRef>>> blob = wait(readFromBlob( std::pair<RangeResult, Standalone<VectorRef<BlobGranuleChunkRef>>> blob = wait(readFromBlob(
cx, self->bstore, normalKeys /*tenant handles range*/, 0, readVersion, threadData->tenantName)); cx, threadData->bstore, normalKeys /*tenant handles range*/, 0, readVersion, threadData->tenantName));
result = self->validateResult(threadData, blob, 0, std::numeric_limits<uint32_t>::max(), 0, readVersion); result = self->validateResult(threadData, blob, 0, std::numeric_limits<uint32_t>::max(), 0, readVersion);
finalRowsValidated = blob.first.size(); finalRowsValidated = blob.first.size();

View File

@ -68,7 +68,7 @@ struct BlobGranuleVerifierWorkload : TestWorkload {
DatabaseConfiguration config; DatabaseConfiguration config;
Reference<BackupContainerFileSystem> bstore; Reference<BlobConnectionProvider> bstore;
AsyncVar<Standalone<VectorRef<KeyRangeRef>>> granuleRanges; AsyncVar<Standalone<VectorRef<KeyRangeRef>>> granuleRanges;
BlobGranuleVerifierWorkload(WorkloadContext const& wcx) : TestWorkload(wcx) { BlobGranuleVerifierWorkload(WorkloadContext const& wcx) : TestWorkload(wcx) {
@ -87,22 +87,13 @@ struct BlobGranuleVerifierWorkload : TestWorkload {
printf("Initializing Blob Granule Verifier s3 stuff\n"); printf("Initializing Blob Granule Verifier s3 stuff\n");
} }
try { try {
if (g_network->isSimulated()) {
if (BGV_DEBUG) { if (BGV_DEBUG) {
printf("Blob Granule Verifier constructing simulated backup container\n"); printf("Blob Granule Verifier constructing backup container from %s\n", SERVER_KNOBS->BG_URL.c_str());
} }
bstore = BackupContainerFileSystem::openContainerFS("file://fdbblob/", {}, {}); bstore = BlobConnectionProvider::newBlobConnectionProvider(SERVER_KNOBS->BG_URL);
} else {
if (BGV_DEBUG) {
printf("Blob Granule Verifier constructing backup container from %s\n",
SERVER_KNOBS->BG_URL.c_str());
}
bstore = BackupContainerFileSystem::openContainerFS(SERVER_KNOBS->BG_URL, {}, {});
if (BGV_DEBUG) { if (BGV_DEBUG) {
printf("Blob Granule Verifier constructed backup container\n"); printf("Blob Granule Verifier constructed backup container\n");
} }
}
} catch (Error& e) { } catch (Error& e) {
if (BGV_DEBUG) { if (BGV_DEBUG) {
printf("Blob Granule Verifier got backup container init error %s\n", e.name()); printf("Blob Granule Verifier got backup container init error %s\n", e.name());