mirror of
https://github.com/facebook/rocksdb.git
synced 2025-05-14 00:43:07 +08:00
Reduce risk of backup or checkpoint missing a WAL file (#10083)
Summary: We recently saw a case in crash test in which a WAL file in the middle of the list of live WALs was not included in the backup, so the DB was not openable due to missing WAL. We are not sure why, but this change should at least turn that into a backup-time failure by ensuring all the WAL files expected by the manifest (according to VersionSet) are included in `GetSortedWalFiles()` (used by `GetLiveFilesStorageInfo()`, `BackupEngine`, and `Checkpoint`) Related: to maximize the effectiveness of track_and_verify_wals_in_manifest with GetSortedWalFiles() during checkpoint/backup, we will now sync WAL in GetLiveFilesStorageInfo() when track_and_verify_wals_in_manifest=true. Pull Request resolved: https://github.com/facebook/rocksdb/pull/10083 Test Plan: added new unit test for the check in GetSortedWalFiles() Reviewed By: ajkr Differential Revision: D36791608 Pulled By: pdillinger fbshipit-source-id: a27bcf0213fc7ab177760fede50d4375d579afa6
This commit is contained in:
parent
d04df2752a
commit
a00cffaf69
@ -6,13 +6,14 @@
|
||||
* Add two-phase commit support to C API.
|
||||
* Add `rocksdb_transaction_get_writebatch_wi` and `rocksdb_transaction_rebuild_from_writebatch` to C API.
|
||||
* Add SingleDelete for DB in C API
|
||||
* Add User Defined Timestamp in C API.
|
||||
* Add User Defined Timestamp in C API.
|
||||
* `rocksdb_comparator_with_ts_create` to create timestamp aware comparator
|
||||
* Put, Get, Delete, SingleDelete, MultiGet APIs has corresponding timestamp aware APIs with suffix `with_ts`
|
||||
* And Add C API's for Transaction, SstFileWriter, Compaction as mentioned [here](https://github.com/facebook/rocksdb/wiki/User-defined-Timestamp-(Experimental))
|
||||
|
||||
### New Features
|
||||
* Add FileSystem::ReadAsync API in io_tracing
|
||||
* Add an extra sanity check in `GetSortedWalFiles()` (also used by `GetLiveFilesStorageInfo()`, `BackupEngine`, and `Checkpoint`) to reduce risk of successfully created backup or checkpoint failing to open because of missing WAL file.
|
||||
|
||||
### Behavior changes
|
||||
* DB::Open(), DB::OpenAsSecondary() will fail if a Logger cannot be created (#9984)
|
||||
|
@ -124,6 +124,9 @@ Status DBImpl::GetLiveFiles(std::vector<std::string>& ret,
|
||||
}
|
||||
|
||||
Status DBImpl::GetSortedWalFiles(VectorLogPtr& files) {
|
||||
// Record tracked WALs as a (minimum) cross-check for directory scan
|
||||
std::vector<uint64_t> required_by_manifest;
|
||||
|
||||
// If caller disabled deletions, this function should return files that are
|
||||
// guaranteed not to be deleted until deletions are re-enabled. We need to
|
||||
// wait for pending purges to finish since WalManager doesn't know which
|
||||
@ -137,6 +140,13 @@ Status DBImpl::GetSortedWalFiles(VectorLogPtr& files) {
|
||||
while (pending_purge_obsolete_files_ > 0 || bg_purge_scheduled_ > 0) {
|
||||
bg_cv_.Wait();
|
||||
}
|
||||
|
||||
// Record tracked WALs as a (minimum) cross-check for directory scan
|
||||
const auto& manifest_wals = versions_->GetWalSet().GetWals();
|
||||
required_by_manifest.reserve(manifest_wals.size());
|
||||
for (const auto& wal : manifest_wals) {
|
||||
required_by_manifest.push_back(wal.first);
|
||||
}
|
||||
}
|
||||
|
||||
Status s = wal_manager_.GetSortedWalFiles(files);
|
||||
@ -150,6 +160,29 @@ Status DBImpl::GetSortedWalFiles(VectorLogPtr& files) {
|
||||
assert(deletions_disabled.IsNotSupported());
|
||||
}
|
||||
|
||||
if (s.ok()) {
|
||||
// Verify includes those required by manifest (one sorted list is superset
|
||||
// of the other)
|
||||
auto required = required_by_manifest.begin();
|
||||
auto included = files.begin();
|
||||
|
||||
while (required != required_by_manifest.end()) {
|
||||
if (included == files.end() || *required < (*included)->LogNumber()) {
|
||||
// FAIL - did not find
|
||||
return Status::Corruption(
|
||||
"WAL file " + std::to_string(*required) +
|
||||
" required by manifest but not in directory list");
|
||||
}
|
||||
if (*required == (*included)->LogNumber()) {
|
||||
++required;
|
||||
++included;
|
||||
} else {
|
||||
assert(*required > (*included)->LogNumber());
|
||||
++included;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return s;
|
||||
}
|
||||
|
||||
@ -349,7 +382,13 @@ Status DBImpl::GetLiveFilesStorageInfo(
|
||||
TEST_SYNC_POINT("CheckpointImpl::CreateCheckpoint:SavedLiveFiles2");
|
||||
|
||||
if (s.ok()) {
|
||||
s = FlushWAL(false /* sync */);
|
||||
// To maximize the effectiveness of track_and_verify_wals_in_manifest,
|
||||
// sync WAL when it is enabled.
|
||||
s = FlushWAL(
|
||||
immutable_db_options_.track_and_verify_wals_in_manifest /* sync */);
|
||||
if (s.IsNotSupported()) { // read-only DB or similar
|
||||
s = Status::OK();
|
||||
}
|
||||
}
|
||||
|
||||
TEST_SYNC_POINT("CheckpointImpl::CreateCustomCheckpoint:AfterGetLive1");
|
||||
|
@ -2165,6 +2165,59 @@ TEST_F(DBWALTest, ReadOnlyRecoveryNoTruncate) {
|
||||
#endif // ROCKSDB_FALLOCATE_PRESENT
|
||||
#endif // ROCKSDB_PLATFORM_POSIX
|
||||
|
||||
TEST_F(DBWALTest, WalInManifestButNotInSortedWals) {
|
||||
Options options = CurrentOptions();
|
||||
options.track_and_verify_wals_in_manifest = true;
|
||||
options.wal_recovery_mode = WALRecoveryMode::kAbsoluteConsistency;
|
||||
|
||||
// Build a way to make wal files selectively go missing
|
||||
bool wals_go_missing = false;
|
||||
struct MissingWalFs : public FileSystemWrapper {
|
||||
MissingWalFs(const std::shared_ptr<FileSystem>& t,
|
||||
bool* _wals_go_missing_flag)
|
||||
: FileSystemWrapper(t), wals_go_missing_flag(_wals_go_missing_flag) {}
|
||||
bool* wals_go_missing_flag;
|
||||
IOStatus GetChildren(const std::string& dir, const IOOptions& io_opts,
|
||||
std::vector<std::string>* r,
|
||||
IODebugContext* dbg) override {
|
||||
IOStatus s = target_->GetChildren(dir, io_opts, r, dbg);
|
||||
if (s.ok() && *wals_go_missing_flag) {
|
||||
for (size_t i = 0; i < r->size();) {
|
||||
if (EndsWith(r->at(i), ".log")) {
|
||||
r->erase(r->begin() + i);
|
||||
} else {
|
||||
++i;
|
||||
}
|
||||
}
|
||||
}
|
||||
return s;
|
||||
}
|
||||
const char* Name() const override { return "MissingWalFs"; }
|
||||
};
|
||||
auto my_fs =
|
||||
std::make_shared<MissingWalFs>(env_->GetFileSystem(), &wals_go_missing);
|
||||
std::unique_ptr<Env> my_env(NewCompositeEnv(my_fs));
|
||||
options.env = my_env.get();
|
||||
|
||||
CreateAndReopenWithCF({"blah"}, options);
|
||||
|
||||
// Currently necessary to get a WAL tracked in manifest; see
|
||||
// https://github.com/facebook/rocksdb/issues/10080
|
||||
ASSERT_OK(Put(0, "x", "y"));
|
||||
ASSERT_OK(db_->SyncWAL());
|
||||
ASSERT_OK(Put(1, "x", "y"));
|
||||
ASSERT_OK(db_->SyncWAL());
|
||||
ASSERT_OK(Flush(1));
|
||||
|
||||
ASSERT_FALSE(dbfull()->GetVersionSet()->GetWalSet().GetWals().empty());
|
||||
std::vector<std::unique_ptr<LogFile>> wals;
|
||||
ASSERT_OK(db_->GetSortedWalFiles(wals));
|
||||
wals_go_missing = true;
|
||||
ASSERT_NOK(db_->GetSortedWalFiles(wals));
|
||||
wals_go_missing = false;
|
||||
Close();
|
||||
}
|
||||
|
||||
#endif // ROCKSDB_LITE
|
||||
|
||||
TEST_F(DBWALTest, WalTermTest) {
|
||||
|
Loading…
x
Reference in New Issue
Block a user