mirror of
https://github.com/facebook/rocksdb.git
synced 2025-05-14 08:53:08 +08:00
Handoff checksum during WAL replay (#10212)
Summary: Added checksum protection for write batch content from the point it is read from the WAL until the per key-value checksum is computed on the write batch. This gives full coverage on write batch integrity of WAL replay to memtable. Pull Request resolved: https://github.com/facebook/rocksdb/pull/10212 Test Plan: - Added unit test and the existing tests (replay code path covers the change in this PR): `make -j32 check` - Stress test: ran `db_stress` for 30min. - Perf regression: ``` # setup TEST_TMPDIR=/dev/shm/100MB_WAL_DB/ ./db_bench -benchmarks=fillrandom -write_buffer_size=1048576000 # benchmark db open time TEST_TMPDIR=/dev/shm/100MB_WAL_DB/ /usr/bin/time ./db_bench -use_existing_db=true -benchmarks=overwrite -write_buffer_size=1048576000 -writes=1 -report_open_timing=true For 20 runs, pre-PR avg: 3734.31ms, post-PR avg: 3790.06 ms (~1.5% regression). Pre-PR OpenDb: 3714.36 milliseconds OpenDb: 3622.71 milliseconds OpenDb: 3591.17 milliseconds OpenDb: 3674.7 milliseconds OpenDb: 3615.79 milliseconds OpenDb: 3982.83 milliseconds OpenDb: 3650.6 milliseconds OpenDb: 3809.26 milliseconds OpenDb: 3576.44 milliseconds OpenDb: 3638.12 milliseconds OpenDb: 3845.68 milliseconds OpenDb: 3677.32 milliseconds OpenDb: 3659.64 milliseconds OpenDb: 3837.55 milliseconds OpenDb: 3899.64 milliseconds OpenDb: 3840.72 milliseconds OpenDb: 3802.71 milliseconds OpenDb: 3573.27 milliseconds OpenDb: 3895.76 milliseconds OpenDb: 3778.02 milliseconds Post-PR: OpenDb: 3880.46 milliseconds OpenDb: 3709.02 milliseconds OpenDb: 3954.67 milliseconds OpenDb: 3955.64 milliseconds OpenDb: 3958.64 milliseconds OpenDb: 3631.28 milliseconds OpenDb: 3721 milliseconds OpenDb: 3729.89 milliseconds OpenDb: 3730.55 milliseconds OpenDb: 3966.32 milliseconds OpenDb: 3685.54 milliseconds OpenDb: 3573.17 milliseconds OpenDb: 3703.75 milliseconds OpenDb: 3873.62 milliseconds OpenDb: 3704.4 milliseconds OpenDb: 3820.98 milliseconds OpenDb: 3721.62 milliseconds OpenDb: 3770.86 milliseconds OpenDb: 3949.78 milliseconds OpenDb: 
3760.07 milliseconds ``` Reviewed By: ajkr Differential Revision: D37302092 Pulled By: cbi42 fbshipit-source-id: 7346e625f453ce4c0e5d708776cd1fb2af6b068b
This commit is contained in:
parent
caced09e79
commit
0ff7713112
@ -1108,9 +1108,11 @@ Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& wal_numbers,
|
|||||||
|
|
||||||
TEST_SYNC_POINT_CALLBACK("DBImpl::RecoverLogFiles:BeforeReadWal",
|
TEST_SYNC_POINT_CALLBACK("DBImpl::RecoverLogFiles:BeforeReadWal",
|
||||||
/*arg=*/nullptr);
|
/*arg=*/nullptr);
|
||||||
|
uint64_t record_checksum;
|
||||||
while (!stop_replay_by_wal_filter &&
|
while (!stop_replay_by_wal_filter &&
|
||||||
reader.ReadRecord(&record, &scratch,
|
reader.ReadRecord(&record, &scratch,
|
||||||
immutable_db_options_.wal_recovery_mode) &&
|
immutable_db_options_.wal_recovery_mode,
|
||||||
|
&record_checksum) &&
|
||||||
status.ok()) {
|
status.ok()) {
|
||||||
if (record.size() < WriteBatchInternal::kHeader) {
|
if (record.size() < WriteBatchInternal::kHeader) {
|
||||||
reporter.Corruption(record.size(),
|
reporter.Corruption(record.size(),
|
||||||
@ -1126,8 +1128,13 @@ Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& wal_numbers,
|
|||||||
if (!status.ok()) {
|
if (!status.ok()) {
|
||||||
return status;
|
return status;
|
||||||
}
|
}
|
||||||
status = WriteBatchInternal::UpdateProtectionInfo(&batch,
|
TEST_SYNC_POINT_CALLBACK(
|
||||||
8 /* bytes_per_key */);
|
"DBImpl::RecoverLogFiles:BeforeUpdateProtectionInfo:batch", &batch);
|
||||||
|
TEST_SYNC_POINT_CALLBACK(
|
||||||
|
"DBImpl::RecoverLogFiles:BeforeUpdateProtectionInfo:checksum",
|
||||||
|
&record_checksum);
|
||||||
|
status = WriteBatchInternal::UpdateProtectionInfo(
|
||||||
|
&batch, 8 /* bytes_per_key */, &record_checksum);
|
||||||
if (!status.ok()) {
|
if (!status.ok()) {
|
||||||
return status;
|
return status;
|
||||||
}
|
}
|
||||||
|
@ -627,6 +627,39 @@ INSTANTIATE_TEST_CASE_P(
|
|||||||
|
|
||||||
// TODO: add test for transactions
|
// TODO: add test for transactions
|
||||||
// TODO: add test for corrupted write batch with WAL disabled
|
// TODO: add test for corrupted write batch with WAL disabled
|
||||||
|
|
||||||
|
class DbKVChecksumWALToWriteBatchTest : public DBTestBase {
|
||||||
|
public:
|
||||||
|
DbKVChecksumWALToWriteBatchTest()
|
||||||
|
: DBTestBase("db_kv_checksum_test", /*env_do_fsync=*/false) {}
|
||||||
|
};
|
||||||
|
|
||||||
|
TEST_F(DbKVChecksumWALToWriteBatchTest, WriteBatchChecksumHandoff) {
|
||||||
|
Options options = CurrentOptions();
|
||||||
|
Reopen(options);
|
||||||
|
ASSERT_OK(db_->Put(WriteOptions(), "key", "val"));
|
||||||
|
std::string content = "";
|
||||||
|
SyncPoint::GetInstance()->SetCallBack(
|
||||||
|
"DBImpl::RecoverLogFiles:BeforeUpdateProtectionInfo:batch",
|
||||||
|
[&](void* batch_ptr) {
|
||||||
|
WriteBatch* batch = reinterpret_cast<WriteBatch*>(batch_ptr);
|
||||||
|
content.assign(batch->Data().data(), batch->GetDataSize());
|
||||||
|
Slice batch_content = batch->Data();
|
||||||
|
// Corrupt first bit
|
||||||
|
CorruptWriteBatch(&batch_content, 0, 1);
|
||||||
|
});
|
||||||
|
SyncPoint::GetInstance()->SetCallBack(
|
||||||
|
"DBImpl::RecoverLogFiles:BeforeUpdateProtectionInfo:checksum",
|
||||||
|
[&](void* checksum_ptr) {
|
||||||
|
// Verify that checksum is produced on the batch content
|
||||||
|
uint64_t checksum = *reinterpret_cast<uint64_t*>(checksum_ptr);
|
||||||
|
ASSERT_EQ(checksum, XXH3_64bits(content.data(), content.size()));
|
||||||
|
});
|
||||||
|
SyncPoint::GetInstance()->EnableProcessing();
|
||||||
|
ASSERT_TRUE(TryReopen(options).IsCorruption());
|
||||||
|
SyncPoint::GetInstance()->DisableProcessing();
|
||||||
|
};
|
||||||
|
|
||||||
} // namespace ROCKSDB_NAMESPACE
|
} // namespace ROCKSDB_NAMESPACE
|
||||||
|
|
||||||
int main(int argc, char** argv) {
|
int main(int argc, char** argv) {
|
||||||
|
@ -43,13 +43,17 @@ Reader::Reader(std::shared_ptr<Logger> info_log,
|
|||||||
first_record_read_(false),
|
first_record_read_(false),
|
||||||
compression_type_(kNoCompression),
|
compression_type_(kNoCompression),
|
||||||
compression_type_record_read_(false),
|
compression_type_record_read_(false),
|
||||||
uncompress_(nullptr) {}
|
uncompress_(nullptr),
|
||||||
|
hash_state_(nullptr) {}
|
||||||
|
|
||||||
Reader::~Reader() {
|
Reader::~Reader() {
|
||||||
delete[] backing_store_;
|
delete[] backing_store_;
|
||||||
if (uncompress_) {
|
if (uncompress_) {
|
||||||
delete uncompress_;
|
delete uncompress_;
|
||||||
}
|
}
|
||||||
|
if (hash_state_) {
|
||||||
|
XXH3_freeState(hash_state_);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// For kAbsoluteConsistency, on clean shutdown we don't expect any error
|
// For kAbsoluteConsistency, on clean shutdown we don't expect any error
|
||||||
@ -60,9 +64,15 @@ Reader::~Reader() {
|
|||||||
// TODO krad: Evaluate if we need to move to a more strict mode where we
|
// TODO krad: Evaluate if we need to move to a more strict mode where we
|
||||||
// restrict the inconsistency to only the last log
|
// restrict the inconsistency to only the last log
|
||||||
bool Reader::ReadRecord(Slice* record, std::string* scratch,
|
bool Reader::ReadRecord(Slice* record, std::string* scratch,
|
||||||
WALRecoveryMode wal_recovery_mode) {
|
WALRecoveryMode wal_recovery_mode, uint64_t* checksum) {
|
||||||
scratch->clear();
|
scratch->clear();
|
||||||
record->clear();
|
record->clear();
|
||||||
|
if (checksum != nullptr) {
|
||||||
|
if (hash_state_ == nullptr) {
|
||||||
|
hash_state_ = XXH3_createState();
|
||||||
|
}
|
||||||
|
XXH3_64bits_reset(hash_state_);
|
||||||
|
}
|
||||||
if (uncompress_) {
|
if (uncompress_) {
|
||||||
uncompress_->Reset();
|
uncompress_->Reset();
|
||||||
}
|
}
|
||||||
@ -86,6 +96,10 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch,
|
|||||||
// at the beginning of the next block.
|
// at the beginning of the next block.
|
||||||
ReportCorruption(scratch->size(), "partial record without end(1)");
|
ReportCorruption(scratch->size(), "partial record without end(1)");
|
||||||
}
|
}
|
||||||
|
if (checksum != nullptr) {
|
||||||
|
// No need to stream since the record is a single fragment
|
||||||
|
*checksum = XXH3_64bits(fragment.data(), fragment.size());
|
||||||
|
}
|
||||||
prospective_record_offset = physical_record_offset;
|
prospective_record_offset = physical_record_offset;
|
||||||
scratch->clear();
|
scratch->clear();
|
||||||
*record = fragment;
|
*record = fragment;
|
||||||
@ -101,6 +115,10 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch,
|
|||||||
// of a block followed by a kFullType or kFirstType record
|
// of a block followed by a kFullType or kFirstType record
|
||||||
// at the beginning of the next block.
|
// at the beginning of the next block.
|
||||||
ReportCorruption(scratch->size(), "partial record without end(2)");
|
ReportCorruption(scratch->size(), "partial record without end(2)");
|
||||||
|
XXH3_64bits_reset(hash_state_);
|
||||||
|
}
|
||||||
|
if (checksum != nullptr) {
|
||||||
|
XXH3_64bits_update(hash_state_, fragment.data(), fragment.size());
|
||||||
}
|
}
|
||||||
prospective_record_offset = physical_record_offset;
|
prospective_record_offset = physical_record_offset;
|
||||||
scratch->assign(fragment.data(), fragment.size());
|
scratch->assign(fragment.data(), fragment.size());
|
||||||
@ -113,6 +131,9 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch,
|
|||||||
ReportCorruption(fragment.size(),
|
ReportCorruption(fragment.size(),
|
||||||
"missing start of fragmented record(1)");
|
"missing start of fragmented record(1)");
|
||||||
} else {
|
} else {
|
||||||
|
if (checksum != nullptr) {
|
||||||
|
XXH3_64bits_update(hash_state_, fragment.data(), fragment.size());
|
||||||
|
}
|
||||||
scratch->append(fragment.data(), fragment.size());
|
scratch->append(fragment.data(), fragment.size());
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
@ -123,6 +144,10 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch,
|
|||||||
ReportCorruption(fragment.size(),
|
ReportCorruption(fragment.size(),
|
||||||
"missing start of fragmented record(2)");
|
"missing start of fragmented record(2)");
|
||||||
} else {
|
} else {
|
||||||
|
if (checksum != nullptr) {
|
||||||
|
XXH3_64bits_update(hash_state_, fragment.data(), fragment.size());
|
||||||
|
*checksum = XXH3_64bits_digest(hash_state_);
|
||||||
|
}
|
||||||
scratch->append(fragment.data(), fragment.size());
|
scratch->append(fragment.data(), fragment.size());
|
||||||
*record = Slice(*scratch);
|
*record = Slice(*scratch);
|
||||||
last_record_offset_ = prospective_record_offset;
|
last_record_offset_ = prospective_record_offset;
|
||||||
@ -509,7 +534,8 @@ void Reader::InitCompression(const CompressionTypeRecord& compression_record) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
bool FragmentBufferedReader::ReadRecord(Slice* record, std::string* scratch,
|
bool FragmentBufferedReader::ReadRecord(Slice* record, std::string* scratch,
|
||||||
WALRecoveryMode /*unused*/) {
|
WALRecoveryMode /*unused*/,
|
||||||
|
uint64_t* /* checksum */) {
|
||||||
assert(record != nullptr);
|
assert(record != nullptr);
|
||||||
assert(scratch != nullptr);
|
assert(scratch != nullptr);
|
||||||
record->clear();
|
record->clear();
|
||||||
|
@ -18,6 +18,7 @@
|
|||||||
#include "rocksdb/slice.h"
|
#include "rocksdb/slice.h"
|
||||||
#include "rocksdb/status.h"
|
#include "rocksdb/status.h"
|
||||||
#include "util/compression.h"
|
#include "util/compression.h"
|
||||||
|
#include "util/xxhash.h"
|
||||||
|
|
||||||
namespace ROCKSDB_NAMESPACE {
|
namespace ROCKSDB_NAMESPACE {
|
||||||
class Logger;
|
class Logger;
|
||||||
@ -61,12 +62,17 @@ class Reader {
|
|||||||
|
|
||||||
// Read the next record into *record. Returns true if read
|
// Read the next record into *record. Returns true if read
|
||||||
// successfully, false if we hit end of the input. May use
|
// successfully, false if we hit end of the input. May use
|
||||||
// "*scratch" as temporary storage. The contents filled in *record
|
// "*scratch" as temporary storage. The contents filled in *record
|
||||||
// will only be valid until the next mutating operation on this
|
// will only be valid until the next mutating operation on this
|
||||||
// reader or the next mutation to *scratch.
|
// reader or the next mutation to *scratch.
|
||||||
|
// If record_checksum is not nullptr, then this function will calculate the
|
||||||
|
// checksum of the record read and set record_checksum to it. The checksum is
|
||||||
|
// calculated from the original buffers that contain the contents of the
|
||||||
|
// record.
|
||||||
virtual bool ReadRecord(Slice* record, std::string* scratch,
|
virtual bool ReadRecord(Slice* record, std::string* scratch,
|
||||||
WALRecoveryMode wal_recovery_mode =
|
WALRecoveryMode wal_recovery_mode =
|
||||||
WALRecoveryMode::kTolerateCorruptedTailRecords);
|
WALRecoveryMode::kTolerateCorruptedTailRecords,
|
||||||
|
uint64_t* record_checksum = nullptr);
|
||||||
|
|
||||||
// Returns the physical offset of the last record returned by ReadRecord.
|
// Returns the physical offset of the last record returned by ReadRecord.
|
||||||
//
|
//
|
||||||
@ -145,6 +151,8 @@ class Reader {
|
|||||||
std::unique_ptr<char[]> uncompressed_buffer_;
|
std::unique_ptr<char[]> uncompressed_buffer_;
|
||||||
// Reusable uncompressed record
|
// Reusable uncompressed record
|
||||||
std::string uncompressed_record_;
|
std::string uncompressed_record_;
|
||||||
|
// Used for stream hashing log record
|
||||||
|
XXH3_state_t* hash_state_;
|
||||||
|
|
||||||
// Extend record types with the following special values
|
// Extend record types with the following special values
|
||||||
enum {
|
enum {
|
||||||
@ -191,7 +199,8 @@ class FragmentBufferedReader : public Reader {
|
|||||||
~FragmentBufferedReader() override {}
|
~FragmentBufferedReader() override {}
|
||||||
bool ReadRecord(Slice* record, std::string* scratch,
|
bool ReadRecord(Slice* record, std::string* scratch,
|
||||||
WALRecoveryMode wal_recovery_mode =
|
WALRecoveryMode wal_recovery_mode =
|
||||||
WALRecoveryMode::kTolerateCorruptedTailRecords) override;
|
WALRecoveryMode::kTolerateCorruptedTailRecords,
|
||||||
|
uint64_t* record_checksum = nullptr) override;
|
||||||
void UnmarkEOF() override;
|
void UnmarkEOF() override;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
@ -3063,7 +3063,8 @@ size_t WriteBatchInternal::AppendedByteSize(size_t leftByteSize,
|
|||||||
}
|
}
|
||||||
|
|
||||||
Status WriteBatchInternal::UpdateProtectionInfo(WriteBatch* wb,
|
Status WriteBatchInternal::UpdateProtectionInfo(WriteBatch* wb,
|
||||||
size_t bytes_per_key) {
|
size_t bytes_per_key,
|
||||||
|
uint64_t* checksum) {
|
||||||
if (bytes_per_key == 0) {
|
if (bytes_per_key == 0) {
|
||||||
if (wb->prot_info_ != nullptr) {
|
if (wb->prot_info_ != nullptr) {
|
||||||
wb->prot_info_.reset();
|
wb->prot_info_.reset();
|
||||||
@ -3076,7 +3077,14 @@ Status WriteBatchInternal::UpdateProtectionInfo(WriteBatch* wb,
|
|||||||
if (wb->prot_info_ == nullptr) {
|
if (wb->prot_info_ == nullptr) {
|
||||||
wb->prot_info_.reset(new WriteBatch::ProtectionInfo());
|
wb->prot_info_.reset(new WriteBatch::ProtectionInfo());
|
||||||
ProtectionInfoUpdater prot_info_updater(wb->prot_info_.get());
|
ProtectionInfoUpdater prot_info_updater(wb->prot_info_.get());
|
||||||
return wb->Iterate(&prot_info_updater);
|
Status s = wb->Iterate(&prot_info_updater);
|
||||||
|
if (s.ok() && checksum != nullptr) {
|
||||||
|
uint64_t expected_hash = XXH3_64bits(wb->rep_.data(), wb->rep_.size());
|
||||||
|
if (expected_hash != *checksum) {
|
||||||
|
return Status::Corruption("Write batch content corrupted.");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return s;
|
||||||
} else {
|
} else {
|
||||||
// Already protected.
|
// Already protected.
|
||||||
return Status::OK();
|
return Status::OK();
|
||||||
|
@ -240,7 +240,10 @@ class WriteBatchInternal {
|
|||||||
return wb.has_key_with_ts_;
|
return wb.has_key_with_ts_;
|
||||||
}
|
}
|
||||||
|
|
||||||
static Status UpdateProtectionInfo(WriteBatch* wb, size_t bytes_per_key);
|
// Update per-key value protection information on this write batch.
|
||||||
|
// If checksum is provided, the batch content is verified against the checksum.
|
||||||
|
static Status UpdateProtectionInfo(WriteBatch* wb, size_t bytes_per_key,
|
||||||
|
uint64_t* checksum = nullptr);
|
||||||
};
|
};
|
||||||
|
|
||||||
// LocalSavePoint is similar to a scope guard
|
// LocalSavePoint is similar to a scope guard
|
||||||
|
Loading…
x
Reference in New Issue
Block a user