mirror of
https://github.com/facebook/rocksdb.git
synced 2025-04-20 13:52:03 +08:00
Cap compaction_readahead_size by max_sectors_kb (#12937)
Summary: **Context/Summary:** https://github.com/facebook/rocksdb/issues/12038 reported a regression where compaction read ahead does not work when `compaction_readahead_size ` is greater than `max_sectors_kb` defined in linux (i.e, largest I/O size that the OS issues to a block device, see https://www.kernel.org/doc/Documentation/block/queue-sysfs.txt for more). This PR fixes it by capping the `compaction_readahead_size` by `max_sectors_kb` if any. A refactoring of reading queue sys file is also included to reuse existing code. Pull Request resolved: https://github.com/facebook/rocksdb/pull/12937 Test Plan: https://github.com/facebook/rocksdb/issues/12038#issuecomment-2327618031 verified the regression was fixed Reviewed By: anand1976 Differential Revision: D61350188 Pulled By: hx235 fbshipit-source-id: e10677f2f5854c22ebf6318b052557db94b98abe
This commit is contained in:
parent
d3296260c2
commit
f69a5fe8ee
@ -1461,8 +1461,6 @@ Status DBImpl::SetDBOptions(
|
||||
// TODO(xiez): clarify why apply optimize for read to write options
|
||||
file_options_for_compaction_ = fs_->OptimizeForCompactionTableRead(
|
||||
file_options_for_compaction_, immutable_db_options_);
|
||||
file_options_for_compaction_.compaction_readahead_size =
|
||||
mutable_db_options_.compaction_readahead_size;
|
||||
if (wal_other_option_changed || wal_size_option_changed) {
|
||||
WriteThread::Writer w;
|
||||
write_thread_.EnterUnbatched(&w, &mutex_);
|
||||
|
53
env/fs_posix.cc
vendored
53
env/fs_posix.cc
vendored
@ -51,6 +51,7 @@
|
||||
#include "env/io_posix.h"
|
||||
#include "monitoring/iostats_context_imp.h"
|
||||
#include "monitoring/thread_status_updater.h"
|
||||
#include "options/db_options.h"
|
||||
#include "port/lang.h"
|
||||
#include "port/port.h"
|
||||
#include "rocksdb/options.h"
|
||||
@ -930,6 +931,28 @@ class PosixFileSystem : public FileSystem {
|
||||
optimized.fallocate_with_keep_size = true;
|
||||
return optimized;
|
||||
}
|
||||
|
||||
FileOptions OptimizeForCompactionTableRead(
|
||||
const FileOptions& file_options,
|
||||
const ImmutableDBOptions& db_options) const override {
|
||||
FileOptions fo = FileOptions(file_options);
|
||||
#ifdef OS_LINUX
|
||||
// To fix https://github.com/facebook/rocksdb/issues/12038
|
||||
if (!file_options.use_direct_reads &&
|
||||
file_options.compaction_readahead_size > 0) {
|
||||
size_t system_limit =
|
||||
GetCompactionReadaheadSizeSystemLimit(db_options.db_paths);
|
||||
if (system_limit > 0 &&
|
||||
file_options.compaction_readahead_size > system_limit) {
|
||||
fo.compaction_readahead_size = system_limit;
|
||||
}
|
||||
}
|
||||
#else
|
||||
(void)db_options;
|
||||
#endif
|
||||
return fo;
|
||||
}
|
||||
|
||||
#ifdef OS_LINUX
|
||||
Status RegisterDbPaths(const std::vector<std::string>& paths) override {
|
||||
return logical_block_size_cache_.RefAndCacheLogicalBlockSize(paths);
|
||||
@ -942,6 +965,36 @@ class PosixFileSystem : public FileSystem {
|
||||
private:
|
||||
bool forceMmapOff_ = false; // do we override Env options?
|
||||
|
||||
#ifdef OS_LINUX
|
||||
// Get the minimum "linux system limit" (i.e, the largest I/O size that the OS
|
||||
// can issue to block devices under a directory, also known as
|
||||
// "max_sectors_kb" ) among `db_paths`.
|
||||
// Return 0 if no limit can be found or there is an error in
|
||||
// retrieving such limit.
|
||||
static size_t GetCompactionReadaheadSizeSystemLimit(
|
||||
const std::vector<DbPath>& db_paths) {
|
||||
Status s;
|
||||
size_t limit_kb = 0;
|
||||
|
||||
for (const auto& db_path : db_paths) {
|
||||
size_t dir_max_sectors_kb = 0;
|
||||
s = PosixHelper::GetMaxSectorsKBOfDirectory(db_path.path,
|
||||
&dir_max_sectors_kb);
|
||||
if (!s.ok()) {
|
||||
break;
|
||||
}
|
||||
|
||||
limit_kb = (limit_kb == 0) ? dir_max_sectors_kb
|
||||
: std::min(limit_kb, dir_max_sectors_kb);
|
||||
}
|
||||
|
||||
if (s.ok()) {
|
||||
return limit_kb * 1024;
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
// Returns true iff the named directory exists and is a directory.
|
||||
virtual bool DirExists(const std::string& dname) {
|
||||
struct stat statbuf;
|
||||
|
71
env/io_posix.cc
vendored
71
env/io_posix.cc
vendored
@ -455,38 +455,71 @@ size_t LogicalBlockSizeCache::GetLogicalBlockSize(const std::string& fname,
|
||||
|
||||
Status PosixHelper::GetLogicalBlockSizeOfDirectory(const std::string& directory,
|
||||
size_t* size) {
|
||||
return GetQueueSysfsFileValueofDirectory(directory,
|
||||
GetLogicalBlockSizeFileName(), size);
|
||||
}
|
||||
|
||||
Status PosixHelper::GetMaxSectorsKBOfDirectory(const std::string& directory,
|
||||
size_t* kb) {
|
||||
return GetQueueSysfsFileValueofDirectory(directory, GetMaxSectorsKBFileName(),
|
||||
kb);
|
||||
}
|
||||
|
||||
Status PosixHelper::GetQueueSysfsFileValueofDirectory(
|
||||
const std::string& directory, const std::string& file_name, size_t* value) {
|
||||
int fd = open(directory.c_str(), O_DIRECTORY | O_RDONLY);
|
||||
if (fd == -1) {
|
||||
return Status::IOError("Cannot open directory " + directory);
|
||||
}
|
||||
*size = PosixHelper::GetLogicalBlockSizeOfFd(fd);
|
||||
if (file_name == PosixHelper::GetLogicalBlockSizeFileName()) {
|
||||
*value = PosixHelper::GetLogicalBlockSizeOfFd(fd);
|
||||
} else if (file_name == PosixHelper::GetMaxSectorsKBFileName()) {
|
||||
*value = PosixHelper::GetMaxSectorsKBOfFd(fd);
|
||||
} else {
|
||||
assert(false);
|
||||
}
|
||||
close(fd);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
size_t PosixHelper::GetLogicalBlockSizeOfFd(int fd) {
|
||||
return GetQueueSysfsFileValueOfFd(fd, GetLogicalBlockSizeFileName(),
|
||||
kDefaultPageSize);
|
||||
}
|
||||
|
||||
size_t PosixHelper::GetMaxSectorsKBOfFd(int fd) {
|
||||
return GetQueueSysfsFileValueOfFd(fd, GetMaxSectorsKBFileName(),
|
||||
kDefaultMaxSectorsKB);
|
||||
}
|
||||
|
||||
size_t PosixHelper::GetQueueSysfsFileValueOfFd(
|
||||
int fd, const std::string& file_name, const size_t default_return_value) {
|
||||
#ifdef OS_LINUX
|
||||
struct stat buf;
|
||||
int result = fstat(fd, &buf);
|
||||
if (result == -1) {
|
||||
return kDefaultPageSize;
|
||||
return default_return_value;
|
||||
}
|
||||
|
||||
// Get device number
|
||||
if (major(buf.st_dev) == 0) {
|
||||
// Unnamed devices (e.g. non-device mounts), reserved as null device number.
|
||||
// These don't have an entry in /sys/dev/block/. Return a sensible default.
|
||||
return kDefaultPageSize;
|
||||
return default_return_value;
|
||||
}
|
||||
|
||||
// Reading queue/logical_block_size does not require special permissions.
|
||||
// Get device path
|
||||
const int kBufferSize = 100;
|
||||
char path[kBufferSize];
|
||||
char real_path[PATH_MAX + 1];
|
||||
snprintf(path, kBufferSize, "/sys/dev/block/%u:%u", major(buf.st_dev),
|
||||
minor(buf.st_dev));
|
||||
if (realpath(path, real_path) == nullptr) {
|
||||
return kDefaultPageSize;
|
||||
return default_return_value;
|
||||
}
|
||||
std::string device_dir(real_path);
|
||||
|
||||
// Get the queue sysfs file path
|
||||
if (!device_dir.empty() && device_dir.back() == '/') {
|
||||
device_dir.pop_back();
|
||||
}
|
||||
@ -500,11 +533,11 @@ size_t PosixHelper::GetLogicalBlockSizeOfFd(int fd) {
|
||||
// ../../devices/pci0000:17/0000:17:00.0/0000:18:00.0/nvme/nvme0/nvme0n1/nvme0n1p1
|
||||
size_t parent_end = device_dir.rfind('/', device_dir.length() - 1);
|
||||
if (parent_end == std::string::npos) {
|
||||
return kDefaultPageSize;
|
||||
return default_return_value;
|
||||
}
|
||||
size_t parent_begin = device_dir.rfind('/', parent_end - 1);
|
||||
if (parent_begin == std::string::npos) {
|
||||
return kDefaultPageSize;
|
||||
return default_return_value;
|
||||
}
|
||||
std::string parent =
|
||||
device_dir.substr(parent_begin + 1, parent_end - parent_begin - 1);
|
||||
@ -513,25 +546,37 @@ size_t PosixHelper::GetLogicalBlockSizeOfFd(int fd) {
|
||||
(child.compare(0, 4, "nvme") || child.find('p') != std::string::npos)) {
|
||||
device_dir = device_dir.substr(0, parent_end);
|
||||
}
|
||||
std::string fname = device_dir + "/queue/logical_block_size";
|
||||
std::string fname = device_dir + "/queue/" + file_name;
|
||||
|
||||
// Get value in the queue sysfs file
|
||||
FILE* fp;
|
||||
size_t size = 0;
|
||||
size_t value = 0;
|
||||
fp = fopen(fname.c_str(), "r");
|
||||
if (fp != nullptr) {
|
||||
char* line = nullptr;
|
||||
size_t len = 0;
|
||||
if (getline(&line, &len, fp) != -1) {
|
||||
sscanf(line, "%zu", &size);
|
||||
sscanf(line, "%zu", &value);
|
||||
}
|
||||
free(line);
|
||||
fclose(fp);
|
||||
}
|
||||
if (size != 0 && (size & (size - 1)) == 0) {
|
||||
return size;
|
||||
|
||||
if (file_name == GetLogicalBlockSizeFileName()) {
|
||||
if (value != 0 && (value & (value - 1)) == 0) {
|
||||
return value;
|
||||
}
|
||||
} else if (file_name == GetMaxSectorsKBFileName()) {
|
||||
if (value != 0) {
|
||||
return value;
|
||||
}
|
||||
} else {
|
||||
assert(false);
|
||||
}
|
||||
#endif
|
||||
(void)fd;
|
||||
return kDefaultPageSize;
|
||||
(void)file_name;
|
||||
return default_return_value;
|
||||
}
|
||||
|
||||
/*
|
||||
|
29
env/io_posix.h
vendored
29
env/io_posix.h
vendored
@ -53,10 +53,39 @@ IOStatus IOError(const std::string& context, const std::string& file_name,
|
||||
|
||||
class PosixHelper {
|
||||
public:
|
||||
static const std::string& GetLogicalBlockSizeFileName() {
|
||||
static const std::string kLogicalBlockSizeFileName = "logical_block_size";
|
||||
return kLogicalBlockSizeFileName;
|
||||
}
|
||||
static const std::string& GetMaxSectorsKBFileName() {
|
||||
static const std::string kMaxSectorsKBFileName = "max_sectors_kb";
|
||||
return kMaxSectorsKBFileName;
|
||||
}
|
||||
static size_t GetUniqueIdFromFile(int fd, char* id, size_t max_size);
|
||||
static size_t GetLogicalBlockSizeOfFd(int fd);
|
||||
static Status GetLogicalBlockSizeOfDirectory(const std::string& directory,
|
||||
size_t* size);
|
||||
|
||||
static Status GetMaxSectorsKBOfDirectory(const std::string& directory,
|
||||
size_t* kb);
|
||||
|
||||
private:
|
||||
static const size_t kDefaultMaxSectorsKB = 2 * 1024;
|
||||
|
||||
static size_t GetMaxSectorsKBOfFd(int fd);
|
||||
|
||||
// Return the value in the specified `file_name` under
|
||||
// `/sys/block/xxx/queue/` for the device where the file of `fd` is on.
|
||||
// If not found, then return the specified `default_return_value`
|
||||
static size_t GetQueueSysfsFileValueOfFd(int fd, const std::string& file_name,
|
||||
size_t default_return_value);
|
||||
|
||||
/// Return the value in the specified `file_name` under
|
||||
// `/sys/block/xxx/queue/` for the device where `directory` is on.
|
||||
// If not found, then return the specified `default_return_value`
|
||||
static Status GetQueueSysfsFileValueofDirectory(const std::string& directory,
|
||||
const std::string& file_name,
|
||||
size_t* value);
|
||||
};
|
||||
|
||||
/*
|
||||
|
@ -0,0 +1 @@
|
||||
Fix regression in issue #12038 due to `Options::compaction_readahead_size` greater than `max_sectors_kb` (i.e, largest I/O size that the OS issues to a block device defined in linux)
|
Loading…
x
Reference in New Issue
Block a user