mirror of
https://github.com/apple/foundationdb.git
synced 2025-06-01 18:56:00 +08:00
EaR: reduce metrics logging (#10453)
* EaR: reduce metrics logging BlobCipherMetrics used to break down by usage types (whehter it is for tlog, redwood, backup, etc), and these counters will be printed to trace log even when encryption is not enabled, or the specific usage is not happening on a node (e.g. a node with only stateless roles will also print blob cipher counters for redwood). We are reducing the BlobCipherMetrics loggings by: 1. Default to not breakdown the metrics by usage type, and the behavior is controlled by the knob `ENCRYPT_KEY_CACHE_ENABLE_DETAIL_LOGGING` 2. When the detail breakdown is enabled, the counters are lazily initialize 3. Even if the counters are initialized, they will not be logged if the count is 0 (so like if a node was recruited as tlog but then drops the tlog role later on, the tlog counter inside BlobCipherMetrics will not be logged anymore). * buggify BlobCipherMetrics detail logging knob * format
This commit is contained in:
parent
b4e2aef58b
commit
7048ad21a8
@ -343,42 +343,40 @@ void BlobCipherEncryptHeaderRef::validateEncryptionHeaderDetails(const BlobCiphe
|
||||
|
||||
// BlobCipherMetrics methods
|
||||
|
||||
const std::unordered_map<int, std::string> BlobCipherMetrics::usageTypeNames = {
|
||||
{ BlobCipherMetrics::UsageType::ALL, "" },
|
||||
{ BlobCipherMetrics::UsageType::TLOG, "TLog" },
|
||||
{ BlobCipherMetrics::UsageType::TLOG_POST_RESOLUTION, "TLogPostResolution" },
|
||||
{ BlobCipherMetrics::UsageType::KV_MEMORY, "KVMemory" },
|
||||
{ BlobCipherMetrics::UsageType::KV_REDWOOD, "KVRedwood" },
|
||||
{ BlobCipherMetrics::UsageType::BLOB_GRANULE, "BlobGranule" },
|
||||
{ BlobCipherMetrics::UsageType::BACKUP, "Backup" },
|
||||
{ BlobCipherMetrics::UsageType::RESTORE, "Restore" },
|
||||
{ BlobCipherMetrics::UsageType::TEST, "Test" },
|
||||
};
|
||||
|
||||
BlobCipherMetrics::CounterSet::CounterSet(CounterCollection& cc, std::string name)
|
||||
: encryptCPUTimeNS(name + "EncryptCPUTimeNS", cc), decryptCPUTimeNS(name + "DecryptCPUTimeNS", cc),
|
||||
: encryptCPUTimeNS(name + "EncryptCPUTimeNS", cc, true), decryptCPUTimeNS(name + "DecryptCPUTimeNS", cc, true),
|
||||
getCipherKeysLatency(name + "GetCipherKeysLatency",
|
||||
UID(),
|
||||
FLOW_KNOBS->ENCRYPT_KEY_CACHE_LOGGING_INTERVAL,
|
||||
FLOW_KNOBS->ENCRYPT_KEY_CACHE_LOGGING_SKETCH_ACCURACY),
|
||||
FLOW_KNOBS->ENCRYPT_KEY_CACHE_LOGGING_SKETCH_ACCURACY,
|
||||
true),
|
||||
getLatestCipherKeysLatency(name + "GetLatestCipherKeysLatency",
|
||||
UID(),
|
||||
FLOW_KNOBS->ENCRYPT_KEY_CACHE_LOGGING_INTERVAL,
|
||||
FLOW_KNOBS->ENCRYPT_KEY_CACHE_LOGGING_SKETCH_ACCURACY) {}
|
||||
FLOW_KNOBS->ENCRYPT_KEY_CACHE_LOGGING_SKETCH_ACCURACY,
|
||||
true) {}
|
||||
|
||||
BlobCipherMetrics::BlobCipherMetrics()
|
||||
: cc("BlobCipher"), cipherKeyCacheHit("CipherKeyCacheHit", cc), cipherKeyCacheMiss("CipherKeyCacheMiss", cc),
|
||||
cipherKeyCacheExpired("CipherKeyCacheExpired", cc), latestCipherKeyCacheHit("LatestCipherKeyCacheHit", cc),
|
||||
latestCipherKeyCacheMiss("LatestCipherKeyCacheMiss", cc),
|
||||
latestCipherKeyCacheNeedsRefresh("LatestCipherKeyCacheNeedsRefresh", cc),
|
||||
getCipherKeysLatency("GetCipherKeysLatency",
|
||||
UID(),
|
||||
FLOW_KNOBS->ENCRYPT_KEY_CACHE_LOGGING_INTERVAL,
|
||||
FLOW_KNOBS->ENCRYPT_KEY_CACHE_LOGGING_SKETCH_ACCURACY),
|
||||
getLatestCipherKeysLatency("GetLatestCipherKeysLatency",
|
||||
UID(),
|
||||
FLOW_KNOBS->ENCRYPT_KEY_CACHE_LOGGING_INTERVAL,
|
||||
FLOW_KNOBS->ENCRYPT_KEY_CACHE_LOGGING_SKETCH_ACCURACY),
|
||||
getBlobMetadataLatency("GetBlobMetadataLatency",
|
||||
UID(),
|
||||
FLOW_KNOBS->ENCRYPT_KEY_CACHE_LOGGING_INTERVAL,
|
||||
FLOW_KNOBS->ENCRYPT_KEY_CACHE_LOGGING_SKETCH_ACCURACY),
|
||||
counterSets({ CounterSet(cc, "TLog"),
|
||||
CounterSet(cc, "TLogPostResolution"),
|
||||
CounterSet(cc, "KVMemory"),
|
||||
CounterSet(cc, "KVRedwood"),
|
||||
CounterSet(cc, "BlobGranule"),
|
||||
CounterSet(cc, "Backup"),
|
||||
CounterSet(cc, "Restore"),
|
||||
CounterSet(cc, "Test") }) {
|
||||
FLOW_KNOBS->ENCRYPT_KEY_CACHE_LOGGING_SKETCH_ACCURACY) {
|
||||
specialCounter(cc, "CacheSize", []() { return BlobCipherKeyCache::getInstance()->getSize(); });
|
||||
traceFuture = cc.traceCounters("BlobCipherMetrics", UID(), FLOW_KNOBS->ENCRYPT_KEY_CACHE_LOGGING_INTERVAL);
|
||||
}
|
||||
|
@ -73,7 +73,8 @@ public:
|
||||
|
||||
// Order of this enum has to match initializer of counterSets.
|
||||
enum UsageType : int {
|
||||
TLOG = 0,
|
||||
ALL = 0,
|
||||
TLOG,
|
||||
TLOG_POST_RESOLUTION,
|
||||
KV_MEMORY,
|
||||
KV_REDWOOD,
|
||||
@ -84,7 +85,9 @@ public:
|
||||
MAX,
|
||||
};
|
||||
|
||||
struct CounterSet {
|
||||
static const std::unordered_map<int, std::string> usageTypeNames;
|
||||
|
||||
struct CounterSet : public ReferenceCounted<CounterSet> {
|
||||
Counter encryptCPUTimeNS;
|
||||
Counter decryptCPUTimeNS;
|
||||
LatencySample getCipherKeysLatency;
|
||||
@ -95,7 +98,12 @@ public:
|
||||
|
||||
static CounterSet& counters(UsageType t) {
|
||||
ASSERT(t < UsageType::MAX);
|
||||
return getInstance()->counterSets[int(t)];
|
||||
auto& counterSets = getInstance()->counterSets;
|
||||
int index = FLOW_KNOBS->ENCRYPT_KEY_CACHE_ENABLE_DETAIL_LOGGING ? (int)t : (int)UsageType::ALL;
|
||||
if (!counterSets[index].isValid()) {
|
||||
counterSets[index] = makeReference<CounterSet>(getInstance()->cc, usageTypeNames.at(index));
|
||||
}
|
||||
return *counterSets[index];
|
||||
}
|
||||
|
||||
private:
|
||||
@ -111,10 +119,8 @@ public:
|
||||
Counter latestCipherKeyCacheHit;
|
||||
Counter latestCipherKeyCacheMiss;
|
||||
Counter latestCipherKeyCacheNeedsRefresh;
|
||||
LatencySample getCipherKeysLatency;
|
||||
LatencySample getLatestCipherKeysLatency;
|
||||
LatencySample getBlobMetadataLatency;
|
||||
std::array<CounterSet, int(UsageType::MAX)> counterSets;
|
||||
std::array<Reference<CounterSet>, int(UsageType::MAX)> counterSets;
|
||||
};
|
||||
|
||||
std::string toString(BlobCipherMetrics::UsageType type);
|
||||
|
@ -164,7 +164,6 @@ Future<std::unordered_map<EncryptCipherDomainId, Reference<BlobCipherKey>>> _get
|
||||
when(wait(_onEncryptKeyProxyChange(db))) {}
|
||||
}
|
||||
double elapsed = now() - startTime;
|
||||
BlobCipherMetrics::getInstance()->getLatestCipherKeysLatency.addMeasurement(elapsed);
|
||||
BlobCipherMetrics::counters(usageType).getLatestCipherKeysLatency.addMeasurement(elapsed);
|
||||
return cipherKeys;
|
||||
}
|
||||
@ -302,7 +301,6 @@ Future<std::unordered_map<BlobCipherDetails, Reference<BlobCipherKey>>> _getEncr
|
||||
when(wait(_onEncryptKeyProxyChange(db))) {}
|
||||
}
|
||||
double elapsed = now() - startTime;
|
||||
BlobCipherMetrics::getInstance()->getCipherKeysLatency.addMeasurement(elapsed);
|
||||
BlobCipherMetrics::counters(usageType).getCipherKeysLatency.addMeasurement(elapsed);
|
||||
return cipherKeys;
|
||||
}
|
||||
|
@ -29,9 +29,9 @@
|
||||
#include <string>
|
||||
#include "flow/actorcompiler.h" // has to be last include
|
||||
|
||||
Counter::Counter(std::string const& name, CounterCollection& collection)
|
||||
Counter::Counter(std::string const& name, CounterCollection& collection, bool skipTraceOnSilentInterval)
|
||||
: name(name), interval_start(0), last_event(0), interval_sq_time(0), roughness_interval_start(0), interval_delta(0),
|
||||
interval_start_value(0) {
|
||||
interval_start_value(0), skip_trace_on_silent_interval(skipTraceOnSilentInterval) {
|
||||
metric.init(collection.getName() + "." + (char)toupper(name.at(0)) + name.substr(1), collection.getId());
|
||||
collection.addCounter(this);
|
||||
}
|
||||
@ -119,7 +119,9 @@ void CounterCollection::logToTraceEvent(TraceEvent& te) {
|
||||
}
|
||||
}
|
||||
}
|
||||
te.detail(c->getName().c_str(), c);
|
||||
if (!c->suppressTrace()) {
|
||||
te.detail(c->getName().c_str(), c);
|
||||
}
|
||||
c->resetInterval();
|
||||
}
|
||||
}
|
||||
@ -215,9 +217,14 @@ LatencyBands::~LatencyBands() {
|
||||
clearBands();
|
||||
}
|
||||
|
||||
LatencySample::LatencySample(std::string name, UID id, double loggingInterval, double accuracy)
|
||||
LatencySample::LatencySample(std::string name,
|
||||
UID id,
|
||||
double loggingInterval,
|
||||
double accuracy,
|
||||
bool skipTraceOnSilentInterval)
|
||||
: name(name), IMetric(knobToMetricModel(FLOW_KNOBS->METRICS_DATA_MODEL)), id(id), sampleEmit(now()), sketch(accuracy),
|
||||
latencySampleEventHolder(makeReference<EventCacheHolder>(id.toString() + "/" + name)) {
|
||||
latencySampleEventHolder(makeReference<EventCacheHolder>(id.toString() + "/" + name)),
|
||||
skipTraceOnSilentInterval(skipTraceOnSilentInterval) {
|
||||
logger = recurring([this]() { logSample(); }, loggingInterval);
|
||||
p50id = deterministicRandom()->randomUniqueID();
|
||||
p90id = deterministicRandom()->randomUniqueID();
|
||||
@ -231,6 +238,9 @@ void LatencySample::addMeasurement(double measurement) {
|
||||
}
|
||||
|
||||
void LatencySample::logSample() {
|
||||
if (skipTraceOnSilentInterval && sketch.getPopulationSize() == 0) {
|
||||
return;
|
||||
}
|
||||
double p25 = sketch.percentile(0.25);
|
||||
double p50 = sketch.mean();
|
||||
double p90 = sketch.percentile(0.9);
|
||||
|
@ -62,6 +62,7 @@ struct ICounter : public IMetric {
|
||||
virtual void resetInterval() = 0;
|
||||
|
||||
virtual void remove() {}
|
||||
virtual bool suppressTrace() const { return false; }
|
||||
};
|
||||
|
||||
template <>
|
||||
@ -115,7 +116,7 @@ struct Counter final : public ICounter, NonCopyable {
|
||||
public:
|
||||
typedef int64_t Value;
|
||||
|
||||
Counter(std::string const& name, CounterCollection& collection);
|
||||
Counter(std::string const& name, CounterCollection& collection, bool skipTraceOnSilentInterval = false);
|
||||
|
||||
void operator+=(Value delta);
|
||||
void operator++() { *this += 1; }
|
||||
@ -145,11 +146,14 @@ public:
|
||||
bool hasRate() const override { return true; }
|
||||
bool hasRoughness() const override { return true; }
|
||||
|
||||
bool suppressTrace() const override { return skip_trace_on_silent_interval && getIntervalDelta() == 0; }
|
||||
|
||||
private:
|
||||
std::string name;
|
||||
double interval_start, last_event, interval_sq_time, roughness_interval_start;
|
||||
Value interval_delta, interval_start_value;
|
||||
Int64MetricHandle metric;
|
||||
bool skip_trace_on_silent_interval;
|
||||
};
|
||||
|
||||
template <>
|
||||
@ -227,7 +231,11 @@ public:
|
||||
|
||||
class LatencySample : public IMetric {
|
||||
public:
|
||||
LatencySample(std::string name, UID id, double loggingInterval, double accuracy);
|
||||
LatencySample(std::string name,
|
||||
UID id,
|
||||
double loggingInterval,
|
||||
double accuracy,
|
||||
bool skipTraceOnSilentInterval = false);
|
||||
void addMeasurement(double measurement);
|
||||
|
||||
private:
|
||||
@ -248,6 +256,7 @@ private:
|
||||
|
||||
DDSketch<double> sketch;
|
||||
Future<Void> logger;
|
||||
bool skipTraceOnSilentInterval;
|
||||
|
||||
Reference<EventCacheHolder> latencySampleEventHolder;
|
||||
|
||||
|
@ -312,6 +312,7 @@ void FlowKnobs::initialize(Randomize randomize, IsSimulated isSimulated) {
|
||||
init( ENCRYPT_KEY_HEALTH_CHECK_INTERVAL, 10 );
|
||||
if ( randomize && BUGGIFY) { ENCRYPT_KEY_HEALTH_CHECK_INTERVAL = deterministicRandom()->randomInt(10, 60); }
|
||||
init( EKP_HEALTH_CHECK_REQUEST_TIMEOUT, 10.0);
|
||||
init( ENCRYPT_KEY_CACHE_ENABLE_DETAIL_LOGGING, false ); if( randomize && BUGGIFY ) ENCRYPT_KEY_CACHE_ENABLE_DETAIL_LOGGING = true;
|
||||
init( ENCRYPT_KEY_CACHE_LOGGING_INTERVAL, 5.0 );
|
||||
init( ENCRYPT_KEY_CACHE_LOGGING_SKETCH_ACCURACY, 0.01 );
|
||||
// Refer to EncryptUtil::EncryptAuthTokenAlgo for more details
|
||||
|
@ -374,6 +374,7 @@ public:
|
||||
int64_t ENCRYPT_KEY_REFRESH_INTERVAL;
|
||||
int64_t ENCRYPT_KEY_HEALTH_CHECK_INTERVAL;
|
||||
double EKP_HEALTH_CHECK_REQUEST_TIMEOUT;
|
||||
bool ENCRYPT_KEY_CACHE_ENABLE_DETAIL_LOGGING;
|
||||
double ENCRYPT_KEY_CACHE_LOGGING_INTERVAL;
|
||||
double ENCRYPT_KEY_CACHE_LOGGING_SKETCH_ACCURACY;
|
||||
bool ENCRYPT_HEADER_AUTH_TOKEN_ENABLED;
|
||||
|
Loading…
x
Reference in New Issue
Block a user