Enable RocksDB metrics (#7739)

* add metrics

* Pass stats to metric logger

* Add debugID
This commit is contained in:
Yao Xiao 2022-08-05 00:38:59 -07:00 committed by GitHub
parent ff23d5994e
commit 38c79a0eb5
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -301,7 +301,8 @@ rocksdb::Options getOptions() {
options.IncreaseParallelism(SERVER_KNOBS->ROCKSDB_BACKGROUND_PARALLELISM); options.IncreaseParallelism(SERVER_KNOBS->ROCKSDB_BACKGROUND_PARALLELISM);
} }
// TODO: enable rocksdb metrics. options.statistics = rocksdb::CreateDBStatistics();
options.statistics->set_stats_level(rocksdb::kExceptHistogramOrTimers);
options.db_log_dir = SERVER_KNOBS->LOG_DIRECTORY; options.db_log_dir = SERVER_KNOBS->LOG_DIRECTORY;
return options; return options;
} }
@ -600,11 +601,10 @@ public:
return Void(); return Void();
} }
rocksdb::Status init() { rocksdb::Status init(rocksdb::Options options) {
// Open instance. // Open instance.
TraceEvent(SevInfo, "ShardedRocksShardManagerInitBegin", this->logId).detail("DataPath", path); TraceEvent(SevInfo, "ShardedRocksShardManagerInitBegin", this->logId).detail("DataPath", path);
std::vector<std::string> columnFamilies; std::vector<std::string> columnFamilies;
rocksdb::Options options = getOptions();
rocksdb::Status status = rocksdb::DB::ListColumnFamilies(options, path, &columnFamilies); rocksdb::Status status = rocksdb::DB::ListColumnFamilies(options, path, &columnFamilies);
rocksdb::ColumnFamilyOptions cfOptions = getCFOptions(); rocksdb::ColumnFamilyOptions cfOptions = getCFOptions();
@ -1041,9 +1041,7 @@ private:
class RocksDBMetrics { class RocksDBMetrics {
public: public:
RocksDBMetrics(); RocksDBMetrics(UID debugID, std::shared_ptr<rocksdb::Statistics> stats);
// Statistics
std::shared_ptr<rocksdb::Statistics> getStatsObjForRocksDB();
void logStats(rocksdb::DB* db); void logStats(rocksdb::DB* db);
// PerfContext // PerfContext
void resetPerfContext(); void resetPerfContext();
@ -1069,9 +1067,10 @@ public:
Reference<Histogram> getWriteHistogram(); Reference<Histogram> getWriteHistogram();
Reference<Histogram> getDeleteCompactRangeHistogram(); Reference<Histogram> getDeleteCompactRangeHistogram();
// Stat for Memory Usage // Stat for Memory Usage
void logMemUsagePerShard(std::string shardName, rocksdb::DB* db); void logMemUsage(rocksdb::DB* db);
private: private:
const UID debugID;
// Global Statistic Input to RocksDB DB instance // Global Statistic Input to RocksDB DB instance
std::shared_ptr<rocksdb::Statistics> stats; std::shared_ptr<rocksdb::Statistics> stats;
// Statistic Output from RocksDB // Statistic Output from RocksDB
@ -1158,9 +1157,8 @@ Reference<Histogram> RocksDBMetrics::getDeleteCompactRangeHistogram() {
return deleteCompactRangeHistogram; return deleteCompactRangeHistogram;
} }
RocksDBMetrics::RocksDBMetrics() { RocksDBMetrics::RocksDBMetrics(UID debugID, std::shared_ptr<rocksdb::Statistics> stats)
stats = rocksdb::CreateDBStatistics(); : debugID(debugID), stats(stats) {
stats->set_stats_level(rocksdb::kExceptHistogramOrTimers);
tickerStats = { tickerStats = {
{ "StallMicros", rocksdb::STALL_MICROS, 0 }, { "StallMicros", rocksdb::STALL_MICROS, 0 },
{ "BytesRead", rocksdb::BYTES_READ, 0 }, { "BytesRead", rocksdb::BYTES_READ, 0 },
@ -1343,16 +1341,8 @@ RocksDBMetrics::RocksDBMetrics() {
ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_DELETE_COMPACTRANGE_HISTOGRAM, Histogram::Unit::microseconds); ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_DELETE_COMPACTRANGE_HISTOGRAM, Histogram::Unit::microseconds);
} }
std::shared_ptr<rocksdb::Statistics> RocksDBMetrics::getStatsObjForRocksDB() {
// Zhe: reserved for statistic of RocksDBMetrics per shard
// ASSERT(shard != nullptr && shard->stats != nullptr);
// return shard->stats;
ASSERT(stats != nullptr);
return stats;
}
void RocksDBMetrics::logStats(rocksdb::DB* db) { void RocksDBMetrics::logStats(rocksdb::DB* db) {
TraceEvent e("ShardedRocksDBMetrics"); TraceEvent e(SevInfo, "ShardedRocksDBMetrics", debugID);
uint64_t stat; uint64_t stat;
for (auto& [name, ticker, cumulation] : tickerStats) { for (auto& [name, ticker, cumulation] : tickerStats) {
stat = stats->getTickerCount(ticker); stat = stats->getTickerCount(ticker);
@ -1364,18 +1354,10 @@ void RocksDBMetrics::logStats(rocksdb::DB* db) {
ASSERT(db->GetIntProperty(property, &stat)); ASSERT(db->GetIntProperty(property, &stat));
e.detail(name, stat); e.detail(name, stat);
} }
/*
stat = readIterPool->numReadIteratorsCreated();
e.detail("NumReadIteratorsCreated", stat - readIteratorPoolStats["NumReadIteratorsCreated"]);
readIteratorPoolStats["NumReadIteratorsCreated"] = stat;
stat = readIterPool->numTimesReadIteratorsReused();
e.detail("NumTimesReadIteratorsReused", stat - readIteratorPoolStats["NumTimesReadIteratorsReused"]);
readIteratorPoolStats["NumTimesReadIteratorsReused"] = stat;
*/
} }
void RocksDBMetrics::logMemUsagePerShard(std::string shardName, rocksdb::DB* db) { void RocksDBMetrics::logMemUsage(rocksdb::DB* db) {
TraceEvent e("ShardedRocksDBShardMemMetrics"); TraceEvent e(SevInfo, "ShardedRocksDBMemMetrics", debugID);
uint64_t stat; uint64_t stat;
ASSERT(db != nullptr); ASSERT(db != nullptr);
ASSERT(db->GetIntProperty(rocksdb::DB::Properties::kBlockCacheUsage, &stat)); ASSERT(db->GetIntProperty(rocksdb::DB::Properties::kBlockCacheUsage, &stat));
@ -1386,7 +1368,6 @@ void RocksDBMetrics::logMemUsagePerShard(std::string shardName, rocksdb::DB* db)
e.detail("AllMemtablesBytes", stat); e.detail("AllMemtablesBytes", stat);
ASSERT(db->GetIntProperty(rocksdb::DB::Properties::kBlockCachePinnedUsage, &stat)); ASSERT(db->GetIntProperty(rocksdb::DB::Properties::kBlockCachePinnedUsage, &stat));
e.detail("BlockCachePinnedUsage", stat); e.detail("BlockCachePinnedUsage", stat);
e.detail("Name", shardName);
} }
void RocksDBMetrics::resetPerfContext() { void RocksDBMetrics::resetPerfContext() {
@ -1401,7 +1382,7 @@ void RocksDBMetrics::setPerfContext(int index) {
} }
void RocksDBMetrics::logPerfContext(bool ignoreZeroMetric) { void RocksDBMetrics::logPerfContext(bool ignoreZeroMetric) {
TraceEvent e("ShardedRocksDBPerfContextMetrics"); TraceEvent e(SevInfo, "ShardedRocksDBPerfContextMetrics", debugID);
e.setMaxEventLength(20000); e.setMaxEventLength(20000);
for (auto& [name, metric, vals] : perfContextMetrics) { for (auto& [name, metric, vals] : perfContextMetrics) {
uint64_t s = 0; uint64_t s = 0;
@ -1563,18 +1544,31 @@ uint64_t RocksDBMetrics::getRocksdbPerfcontextMetric(int metric) {
return 0; return 0;
} }
ACTOR Future<Void> rocksDBAggregatedMetricsLogger(std::shared_ptr<RocksDBMetrics> rocksDBMetrics, rocksdb::DB* db) { ACTOR Future<Void> rocksDBAggregatedMetricsLogger(std::shared_ptr<ShardedRocksDBState> rState,
Future<Void> openFuture,
std::shared_ptr<RocksDBMetrics> rocksDBMetrics,
ShardManager* shardManager) {
try {
wait(openFuture);
state rocksdb::DB* db = shardManager->getDb();
loop { loop {
wait(delay(SERVER_KNOBS->ROCKSDB_METRICS_DELAY)); wait(delay(SERVER_KNOBS->ROCKSDB_METRICS_DELAY));
/* if (rState->closing) {
if (SERVER_KNOBS->ROCKSDB_ENABLE_STATISTIC) { break;
rocksDBMetrics->logStats(db);
} }
rocksDBMetrics->logStats(db);
rocksDBMetrics->logMemUsage(db);
if (SERVER_KNOBS->ROCKSDB_PERFCONTEXT_SAMPLE_RATE != 0) { if (SERVER_KNOBS->ROCKSDB_PERFCONTEXT_SAMPLE_RATE != 0) {
rocksDBMetrics->logPerfContext(true); rocksDBMetrics->logPerfContext(true);
}*/
} }
} }
} catch (Error& e) {
if (e.code() != error_code_actor_cancelled) {
TraceEvent(SevError, "ShardedRocksDBMetricsError").errorUnsuppressed(e);
}
}
return Void();
}
struct ShardedRocksDBKeyValueStore : IKeyValueStore { struct ShardedRocksDBKeyValueStore : IKeyValueStore {
using CF = rocksdb::ColumnFamilyHandle*; using CF = rocksdb::ColumnFamilyHandle*;
@ -1639,6 +1633,7 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore {
struct OpenAction : TypedAction<Writer, OpenAction> { struct OpenAction : TypedAction<Writer, OpenAction> {
ShardManager* shardManager; ShardManager* shardManager;
rocksdb::Options dbOptions;
ThreadReturnPromise<Void> done; ThreadReturnPromise<Void> done;
Optional<Future<Void>>& metrics; Optional<Future<Void>>& metrics;
const FlowLock* readLock; const FlowLock* readLock;
@ -1646,18 +1641,20 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore {
std::shared_ptr<RocksDBErrorListener> errorListener; std::shared_ptr<RocksDBErrorListener> errorListener;
OpenAction(ShardManager* shardManager, OpenAction(ShardManager* shardManager,
rocksdb::Options dbOptions,
Optional<Future<Void>>& metrics, Optional<Future<Void>>& metrics,
const FlowLock* readLock, const FlowLock* readLock,
const FlowLock* fetchLock, const FlowLock* fetchLock,
std::shared_ptr<RocksDBErrorListener> errorListener) std::shared_ptr<RocksDBErrorListener> errorListener)
: shardManager(shardManager), metrics(metrics), readLock(readLock), fetchLock(fetchLock), : shardManager(shardManager), dbOptions(dbOptions), metrics(metrics), readLock(readLock),
errorListener(errorListener) {} fetchLock(fetchLock), errorListener(errorListener) {}
double getTimeEstimate() const override { return SERVER_KNOBS->COMMIT_TIME_ESTIMATE; } double getTimeEstimate() const override { return SERVER_KNOBS->COMMIT_TIME_ESTIMATE; }
}; };
void action(OpenAction& a) { void action(OpenAction& a) {
auto status = a.shardManager->init(); auto status = a.shardManager->init(a.dbOptions);
if (!status.ok()) { if (!status.ok()) {
logRocksDBError(status, "Open"); logRocksDBError(status, "Open");
a.done.sendError(statusToError(status)); a.done.sendError(statusToError(status));
@ -2114,10 +2111,10 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore {
std::reverse(a.shardRanges.begin(), a.shardRanges.end()); std::reverse(a.shardRanges.begin(), a.shardRanges.end());
} }
// TODO: consider multi-thread reads. It's possible to read multiple shards in parallel. However, the number // TODO: consider multi-thread reads. It's possible to read multiple shards in parallel. However, the
// of rows to read needs to be calculated based on the previous read result. We may read more than we // number of rows to read needs to be calculated based on the previous read result. We may read more
// expected when parallel read is used when the previous result is not available. It's unlikely to get to // than we expected when parallel read is used when the previous result is not available. It's unlikely
// performance improvement when the actual number of rows to read is very small. // to get to performance improvement when the actual number of rows to read is very small.
int accumulatedBytes = 0; int accumulatedBytes = 0;
int numShards = 0; int numShards = 0;
for (auto& [shard, range] : a.shardRanges) { for (auto& [shard, range] : a.shardRanges) {
@ -2175,17 +2172,19 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore {
numReadWaiters(SERVER_KNOBS->ROCKSDB_READ_QUEUE_HARD_MAX - SERVER_KNOBS->ROCKSDB_READ_QUEUE_SOFT_MAX), numReadWaiters(SERVER_KNOBS->ROCKSDB_READ_QUEUE_HARD_MAX - SERVER_KNOBS->ROCKSDB_READ_QUEUE_SOFT_MAX),
numFetchWaiters(SERVER_KNOBS->ROCKSDB_FETCH_QUEUE_HARD_MAX - SERVER_KNOBS->ROCKSDB_FETCH_QUEUE_SOFT_MAX), numFetchWaiters(SERVER_KNOBS->ROCKSDB_FETCH_QUEUE_HARD_MAX - SERVER_KNOBS->ROCKSDB_FETCH_QUEUE_SOFT_MAX),
errorListener(std::make_shared<RocksDBErrorListener>()), errorFuture(errorListener->getFuture()), errorListener(std::make_shared<RocksDBErrorListener>()), errorFuture(errorListener->getFuture()),
shardManager(path, id), rocksDBMetrics(new RocksDBMetrics()) { shardManager(path, id), dbOptions(getOptions()),
// In simluation, run the reader/writer threads as Coro threads (i.e. in the network thread. The storage engine rocksDBMetrics(std::make_shared<RocksDBMetrics>(id, dbOptions.statistics)) {
// is still multi-threaded as background compaction threads are still present. Reads/writes to disk will also // In simluation, run the reader/writer threads as Coro threads (i.e. in the network thread. The storage
// block the network thread in a way that would be unacceptable in production but is a necessary evil here. When // engine is still multi-threaded as background compaction threads are still present. Reads/writes to disk
// performing the reads in background threads in simulation, the event loop thinks there is no work to do and // will also block the network thread in a way that would be unacceptable in production but is a necessary
// advances time faster than 1 sec/sec. By the time the blocking read actually finishes, simulation has advanced // evil here. When performing the reads in background threads in simulation, the event loop thinks there is
// time by more than 5 seconds, so every read fails with a transaction_too_old error. Doing blocking IO on the // no work to do and advances time faster than 1 sec/sec. By the time the blocking read actually finishes,
// main thread solves this issue. There are almost certainly better fixes, but my goal was to get a less // simulation has advanced time by more than 5 seconds, so every read fails with a transaction_too_old
// invasive change merged first and work on a more realistic version if/when we think that would provide // error. Doing blocking IO on the main thread solves this issue. There are almost certainly better fixes,
// substantially more confidence in the correctness. // but my goal was to get a less invasive change merged first and work on a more realistic version if/when
// TODO: Adapt the simulation framework to not advance time quickly when background reads/writes are occurring. // we think that would provide substantially more confidence in the correctness.
// TODO: Adapt the simulation framework to not advance time quickly when background reads/writes are
// occurring.
if (g_network->isSimulated()) { if (g_network->isSimulated()) {
writeThread = CoroThreadPool::createThreadPool(); writeThread = CoroThreadPool::createThreadPool();
readThreads = CoroThreadPool::createThreadPool(); readThreads = CoroThreadPool::createThreadPool();
@ -2233,14 +2232,15 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore {
Future<Void> init() override { Future<Void> init() override {
if (openFuture.isValid()) { if (openFuture.isValid()) {
return openFuture; return openFuture;
// Restore durable state if KVS is open. KVS will be re-initialized during rollback. To avoid the cost of // Restore durable state if KVS is open. KVS will be re-initialized during rollback. To avoid the cost
// opening and closing multiple rocksdb instances, we reconcile the shard map using persist shard mapping // of opening and closing multiple rocksdb instances, we reconcile the shard map using persist shard
// data. // mapping data.
} else { } else {
auto a = std::make_unique<Writer::OpenAction>( auto a = std::make_unique<Writer::OpenAction>(
&shardManager, metrics, &readSemaphore, &fetchSemaphore, errorListener); &shardManager, dbOptions, metrics, &readSemaphore, &fetchSemaphore, errorListener);
openFuture = a->done.getFuture(); openFuture = a->done.getFuture();
this->metrics = ShardManager::shardMetricsLogger(this->rState, openFuture, &shardManager); this->metrics = ShardManager::shardMetricsLogger(this->rState, openFuture, &shardManager) &&
rocksDBAggregatedMetricsLogger(this->rState, openFuture, rocksDBMetrics, &shardManager);
this->refreshHolder = refreshReadIteratorPools(this->rState, openFuture, shardManager.getAllShards()); this->refreshHolder = refreshReadIteratorPools(this->rState, openFuture, shardManager.getAllShards());
writeThread->post(a.release()); writeThread->post(a.release());
return openFuture; return openFuture;
@ -2433,6 +2433,7 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore {
std::shared_ptr<ShardedRocksDBState> rState; std::shared_ptr<ShardedRocksDBState> rState;
ShardManager shardManager; ShardManager shardManager;
rocksdb::Options dbOptions;
std::shared_ptr<RocksDBMetrics> rocksDBMetrics; std::shared_ptr<RocksDBMetrics> rocksDBMetrics;
std::string path; std::string path;
UID id; UID id;