mirror of
https://github.com/apple/foundationdb.git
synced 2025-05-15 02:18:39 +08:00
More granule purging fixes (#7756)
* Granule purge cannot delete history entry for fully deleting granule until all children are completely done splitting * Several purging fixes related to granule history * Fixed typo in refactor * fixing memory model for purgeRange * formatting * weakening granule purge test for now * cleanup * review comments
This commit is contained in:
parent
97cf3c99e7
commit
7f45cccb56
@ -3322,18 +3322,43 @@ ACTOR Future<GranuleFiles> loadHistoryFiles(Reference<BlobManagerData> bmData, U
|
||||
}
|
||||
}
|
||||
|
||||
ACTOR Future<Void> canDeleteFullGranule(Reference<BlobManagerData> self, UID granuleId) {
|
||||
ACTOR Future<bool> canDeleteFullGranule(Reference<BlobManagerData> self, UID granuleId) {
|
||||
state Transaction tr(self->db);
|
||||
state KeyRange splitRange = blobGranuleSplitKeyRangeFor(granuleId);
|
||||
state KeyRange checkRange = splitRange;
|
||||
state bool retry = false;
|
||||
|
||||
if (BM_PURGE_DEBUG) {
|
||||
fmt::print("BM {0} Fully delete granule check {1}\n", self->epoch, granuleId.toString());
|
||||
}
|
||||
|
||||
loop {
|
||||
try {
|
||||
tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
|
||||
tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
|
||||
|
||||
state RangeResult splitState = wait(tr.getRange(splitRange, SERVER_KNOBS->BG_MAX_SPLIT_FANOUT));
|
||||
int lim = SERVER_KNOBS->BG_MAX_SPLIT_FANOUT;
|
||||
if (BUGGIFY_WITH_PROB(0.1)) {
|
||||
lim = deterministicRandom()->randomInt(1, std::max(2, SERVER_KNOBS->BG_MAX_SPLIT_FANOUT));
|
||||
}
|
||||
state RangeResult splitState = wait(tr.getRange(checkRange, lim));
|
||||
// if first try and empty, splitting state is fully cleaned up
|
||||
if (!retry && checkRange == splitRange && splitState.empty() && !splitState.more) {
|
||||
if (BM_PURGE_DEBUG) {
|
||||
fmt::print("BM {0} Proceed with full deletion, no split state for {1}\n",
|
||||
self->epoch,
|
||||
granuleId.toString());
|
||||
}
|
||||
return true;
|
||||
}
|
||||
if (BM_PURGE_DEBUG) {
|
||||
fmt::print("BM {0} Full delete check found {1} split states for {2}\n",
|
||||
self->epoch,
|
||||
splitState.size(),
|
||||
granuleId.toString());
|
||||
}
|
||||
state int i = 0;
|
||||
state bool retry = false;
|
||||
|
||||
for (; i < splitState.size(); i++) {
|
||||
UID parent, child;
|
||||
BlobGranuleSplitState st;
|
||||
@ -3364,17 +3389,22 @@ ACTOR Future<Void> canDeleteFullGranule(Reference<BlobManagerData> self, UID gra
|
||||
if (retry) {
|
||||
tr.reset();
|
||||
wait(delay(1.0));
|
||||
retry = false;
|
||||
checkRange = splitRange;
|
||||
} else {
|
||||
if (splitState.empty() || !splitState.more) {
|
||||
break;
|
||||
}
|
||||
splitRange = KeyRangeRef(keyAfter(splitState.back().key), splitRange.end);
|
||||
checkRange = KeyRangeRef(keyAfter(splitState.back().key), checkRange.end);
|
||||
}
|
||||
} catch (Error& e) {
|
||||
wait(tr.onError(e));
|
||||
}
|
||||
}
|
||||
return Void();
|
||||
if (BM_PURGE_DEBUG) {
|
||||
fmt::print("BM {0} Full delete check {1} done. Not deleting history key\n", self->epoch, granuleId.toString());
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
static Future<Void> deleteFile(Reference<BlobConnectionProvider> bstoreProvider, std::string filePath) {
|
||||
@ -3415,7 +3445,10 @@ ACTOR Future<Void> fullyDeleteGranule(Reference<BlobManagerData> self,
|
||||
|
||||
// if granule is still splitting and files are needed for new sub-granules to re-snapshot, we can only partially
|
||||
// delete the granule, since we need to keep the last snapshot and deltas for splitting
|
||||
wait(canDeleteFullGranule(self, granuleId));
|
||||
// Or, if the granule isn't finalized (still needs the history entry for the old change feed id, because all data
|
||||
// from the old change feed hasn't yet been persisted in blob), we can delete the files but need to keep the granule
|
||||
// history entry.
|
||||
state bool canDeleteHistoryKey = wait(canDeleteFullGranule(self, granuleId));
|
||||
state Reference<BlobConnectionProvider> bstore = wait(getBStoreForGranule(self, granuleRange));
|
||||
|
||||
// get files
|
||||
@ -3465,7 +3498,9 @@ ACTOR Future<Void> fullyDeleteGranule(Reference<BlobManagerData> self,
|
||||
loop {
|
||||
try {
|
||||
KeyRange fileRangeKey = blobGranuleFileKeyRangeFor(granuleId);
|
||||
if (canDeleteHistoryKey) {
|
||||
tr.clear(historyKey);
|
||||
}
|
||||
tr.clear(fileRangeKey);
|
||||
wait(tr.commit());
|
||||
break;
|
||||
@ -3475,7 +3510,10 @@ ACTOR Future<Void> fullyDeleteGranule(Reference<BlobManagerData> self,
|
||||
}
|
||||
|
||||
if (BM_PURGE_DEBUG) {
|
||||
fmt::print("BM {0} Fully deleting granule {1}: success\n", self->epoch, granuleId.toString());
|
||||
fmt::print("BM {0} Fully deleting granule {1}: success {2}\n",
|
||||
self->epoch,
|
||||
granuleId.toString(),
|
||||
canDeleteHistoryKey ? "" : " ignoring history key!");
|
||||
}
|
||||
|
||||
TraceEvent("GranuleFullPurge", self->id)
|
||||
@ -3648,28 +3686,35 @@ ACTOR Future<Void> purgeRange(Reference<BlobManagerData> self, KeyRangeRef range
|
||||
state std::unordered_set<std::pair<std::string, Version>, boost::hash<std::pair<std::string, Version>>> visited;
|
||||
|
||||
// find all active granules (that comprise the range) and add to the queue
|
||||
state KeyRangeMap<UID>::Ranges activeRanges = self->workerAssignments.intersectingRanges(range);
|
||||
|
||||
state Transaction tr(self->db);
|
||||
tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
|
||||
tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
|
||||
|
||||
state KeyRangeMap<UID>::iterator activeRange;
|
||||
for (activeRange = activeRanges.begin(); activeRange != activeRanges.end(); ++activeRange) {
|
||||
auto ranges = self->workerAssignments.intersectingRanges(range);
|
||||
state std::vector<KeyRange> activeRanges;
|
||||
|
||||
// copy into state variable before waits
|
||||
for (auto& it : ranges) {
|
||||
activeRanges.push_back(it.range());
|
||||
}
|
||||
|
||||
state int rangeIdx;
|
||||
for (rangeIdx = 0; rangeIdx < activeRanges.size(); rangeIdx++) {
|
||||
state KeyRange activeRange = activeRanges[rangeIdx];
|
||||
if (BM_PURGE_DEBUG) {
|
||||
fmt::print("BM {0} Checking if active range [{1} - {2}), owned by BW {3}, should be purged\n",
|
||||
fmt::print("BM {0} Checking if active range [{1} - {2}) should be purged\n",
|
||||
self->epoch,
|
||||
activeRange.begin().printable(),
|
||||
activeRange.end().printable(),
|
||||
activeRange.value().toString());
|
||||
activeRange.begin.printable(),
|
||||
activeRange.end.printable());
|
||||
}
|
||||
|
||||
// assumption: purge boundaries must respect granule boundaries
|
||||
if (activeRange.begin() < range.begin || activeRange.end() > range.end) {
|
||||
if (activeRange.begin < range.begin || activeRange.end > range.end) {
|
||||
TraceEvent(SevWarn, "GranulePurgeRangesUnaligned", self->id)
|
||||
.detail("Epoch", self->epoch)
|
||||
.detail("PurgeRange", range)
|
||||
.detail("GranuleRange", activeRange.range());
|
||||
.detail("GranuleRange", activeRange);
|
||||
continue;
|
||||
}
|
||||
|
||||
@ -3682,24 +3727,23 @@ ACTOR Future<Void> purgeRange(Reference<BlobManagerData> self, KeyRangeRef range
|
||||
if (BM_PURGE_DEBUG) {
|
||||
fmt::print("BM {0} Fetching latest history entry for range [{1} - {2})\n",
|
||||
self->epoch,
|
||||
activeRange.begin().printable(),
|
||||
activeRange.end().printable());
|
||||
activeRange.begin.printable(),
|
||||
activeRange.end.printable());
|
||||
}
|
||||
// FIXME: doing this serially will likely be too slow for large purges
|
||||
Optional<GranuleHistory> history = wait(getLatestGranuleHistory(&tr, activeRange.range()));
|
||||
Optional<GranuleHistory> history = wait(getLatestGranuleHistory(&tr, activeRange));
|
||||
// TODO: can we tell from the krm that this range is not valid, so that we don't need to do a
|
||||
// get
|
||||
if (history.present()) {
|
||||
if (BM_PURGE_DEBUG) {
|
||||
fmt::print("BM {0} Adding range to history queue: [{1} - {2}) @ {3} ({4})\n",
|
||||
fmt::print("BM {0} Adding range to history queue: [{1} - {2}) @ {3}\n",
|
||||
self->epoch,
|
||||
activeRange.begin().printable(),
|
||||
activeRange.end().printable(),
|
||||
history.get().version,
|
||||
(void*)(activeRange.range().begin.begin()));
|
||||
activeRange.begin.printable(),
|
||||
activeRange.end.printable(),
|
||||
history.get().version);
|
||||
}
|
||||
visited.insert({ activeRange.range().begin.toString(), history.get().version });
|
||||
historyEntryQueue.push({ activeRange.range(), history.get().version, MAX_VERSION });
|
||||
visited.insert({ activeRange.begin.toString(), history.get().version });
|
||||
historyEntryQueue.push({ activeRange, history.get().version, MAX_VERSION });
|
||||
} else if (BM_PURGE_DEBUG) {
|
||||
fmt::print("BM {0} No history for range, ignoring\n", self->epoch);
|
||||
}
|
||||
|
@ -2489,7 +2489,7 @@ ACTOR Future<Void> blobGranuleLoadHistory(Reference<BlobWorkerData> bwData,
|
||||
}
|
||||
|
||||
auto prev = bwData->granuleHistory.intersectingRanges(curHistory.range);
|
||||
bool allLess = true;
|
||||
bool allLess = true; // if the version is less than all existing granules
|
||||
for (auto& it : prev) {
|
||||
if (it.cvalue().isValid() && curHistory.version >= it.cvalue()->endVersion) {
|
||||
allLess = false;
|
||||
@ -2526,7 +2526,9 @@ ACTOR Future<Void> blobGranuleLoadHistory(Reference<BlobWorkerData> bwData,
|
||||
}
|
||||
|
||||
state int pIdx = 0;
|
||||
state bool noParentsPresent = true;
|
||||
state bool anyParentsMissing = curHistory.value.parentVersions.empty();
|
||||
state std::vector<GranuleHistory> nexts;
|
||||
nexts.reserve(curHistory.value.parentVersions.size());
|
||||
// FIXME: parallelize this for all parents/all entries in queue?
|
||||
loop {
|
||||
if (pIdx >= curHistory.value.parentVersions.size()) {
|
||||
@ -2549,7 +2551,7 @@ ACTOR Future<Void> blobGranuleLoadHistory(Reference<BlobWorkerData> bwData,
|
||||
// curHistory.version]
|
||||
inserted.first->second.entry = makeReference<GranuleHistoryEntry>(
|
||||
next.range, next.value.granuleID, next.version, curHistory.version);
|
||||
queue.push_back(next);
|
||||
nexts.push_back(next);
|
||||
if (BW_HISTORY_DEBUG) {
|
||||
fmt::print("HL {0} {1}) [{2} - {3}) @ {4}: loaded parent [{5} - {6}) @ {7}\n",
|
||||
bwData->id.shortString().substr(0, 5),
|
||||
@ -2577,7 +2579,8 @@ ACTOR Future<Void> blobGranuleLoadHistory(Reference<BlobWorkerData> bwData,
|
||||
}
|
||||
ASSERT(inserted.first->second.entry->endVersion == curHistory.version);
|
||||
}
|
||||
noParentsPresent = false;
|
||||
} else {
|
||||
anyParentsMissing = true;
|
||||
}
|
||||
|
||||
pIdx++;
|
||||
@ -2586,16 +2589,21 @@ ACTOR Future<Void> blobGranuleLoadHistory(Reference<BlobWorkerData> bwData,
|
||||
}
|
||||
}
|
||||
|
||||
if (noParentsPresent) {
|
||||
if (anyParentsMissing) {
|
||||
if (BW_HISTORY_DEBUG) {
|
||||
fmt::print("HL {0} {1}) [{2} - {3}) @ {4}: root b/c no parents\n",
|
||||
fmt::print("HL {0} {1}) [{2} - {3}) @ {4}: root b/c parents missing\n",
|
||||
bwData->id.shortString().substr(0, 5),
|
||||
loadId,
|
||||
curHistory.range.begin.printable(),
|
||||
curHistory.range.end.printable(),
|
||||
curHistory.version);
|
||||
}
|
||||
|
||||
rootGranules.push(OrderedHistoryKey(curHistory.version, curHistory.value.granuleID));
|
||||
} else {
|
||||
for (auto& it : nexts) {
|
||||
queue.push_back(it);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -2808,17 +2816,21 @@ std::vector<std::pair<KeyRange, Future<GranuleFiles>>> loadHistoryChunks(Referen
|
||||
if (!it.cvalue().isValid()) {
|
||||
throw blob_granule_transaction_too_old();
|
||||
}
|
||||
if (expectedEndVersion != it.cvalue()->endVersion) {
|
||||
if (expectedEndVersion > it.cvalue()->endVersion) {
|
||||
if (BW_DEBUG) {
|
||||
// history must have been pruned up to live granule, but BW still has previous history cached.
|
||||
fmt::print("live granule history version {0} for [{1} - {2}) != history end version {3} for "
|
||||
"[{4} - {5})\n",
|
||||
"[{4} - {5}) on BW {6}\n",
|
||||
expectedEndVersion,
|
||||
keyRange.begin.printable(),
|
||||
keyRange.end.printable(),
|
||||
it.cvalue()->endVersion,
|
||||
it.begin().printable(),
|
||||
it.end().printable());
|
||||
it.end().printable(),
|
||||
bwData->id.toString().substr(0, 5));
|
||||
}
|
||||
throw blob_granule_transaction_too_old();
|
||||
}
|
||||
ASSERT(expectedEndVersion == it.cvalue()->endVersion);
|
||||
visited.insert(it.cvalue()->granuleID);
|
||||
queue.push_back(it.cvalue());
|
||||
|
||||
|
@ -65,6 +65,7 @@ struct BlobGranuleVerifierWorkload : TestWorkload {
|
||||
int64_t purges = 0;
|
||||
std::vector<Future<Void>> clients;
|
||||
bool enablePurging;
|
||||
bool strictPurgeChecking;
|
||||
|
||||
DatabaseConfiguration config;
|
||||
|
||||
@ -81,6 +82,11 @@ struct BlobGranuleVerifierWorkload : TestWorkload {
|
||||
timeTravelBufferSize = getOption(options, LiteralStringRef("timeTravelBufferSize"), 100000000);
|
||||
threads = getOption(options, LiteralStringRef("threads"), 1);
|
||||
enablePurging = getOption(options, LiteralStringRef("enablePurging"), false /*sharedRandomNumber % 2 == 0*/);
|
||||
sharedRandomNumber /= 2;
|
||||
// FIXME: re-enable this! There exist several bugs with purging active granules where a small amount of state
|
||||
// won't be cleaned up.
|
||||
strictPurgeChecking =
|
||||
getOption(options, LiteralStringRef("strictPurgeChecking"), false /*sharedRandomNumber % 2 == 0*/);
|
||||
ASSERT(threads >= 1);
|
||||
|
||||
if (BGV_DEBUG) {
|
||||
@ -308,6 +314,7 @@ struct BlobGranuleVerifierWorkload : TestWorkload {
|
||||
// if purged just before read, verify that purge cleaned up data by restarting blob workers and
|
||||
// reading older than the purge version
|
||||
if (doPurging) {
|
||||
if (self->strictPurgeChecking) {
|
||||
wait(self->killBlobWorkers(cx, self));
|
||||
if (BGV_DEBUG) {
|
||||
fmt::print("BGV Reading post-purge [{0} - {1}) @ {2}\n",
|
||||
@ -315,6 +322,7 @@ struct BlobGranuleVerifierWorkload : TestWorkload {
|
||||
oldRead.range.end.printable(),
|
||||
prevPurgeVersion);
|
||||
}
|
||||
}
|
||||
// ensure purge version exactly is still readable
|
||||
std::pair<RangeResult, Standalone<VectorRef<BlobGranuleChunkRef>>> versionRead1 =
|
||||
wait(readFromBlob(cx, self->bstore, oldRead.range, 0, prevPurgeVersion));
|
||||
@ -322,6 +330,7 @@ struct BlobGranuleVerifierWorkload : TestWorkload {
|
||||
fmt::print("BGV Post-purge first read:\n");
|
||||
printGranuleChunks(versionRead1.second);
|
||||
}
|
||||
if (self->strictPurgeChecking) {
|
||||
try {
|
||||
// read at purgeVersion - 1, should NOT be readable
|
||||
Version minSnapshotVersion = newPurgeVersion;
|
||||
@ -350,6 +359,7 @@ struct BlobGranuleVerifierWorkload : TestWorkload {
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// pick a random range
|
||||
int rIndex = deterministicRandom()->randomInt(0, self->granuleRanges.get().size());
|
||||
|
Loading…
x
Reference in New Issue
Block a user