From 6581161dd31f62dcaa37636a44e98ec46925b998 Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Thu, 15 Aug 2019 11:07:04 -0700 Subject: [PATCH 1/4] Add ratekeeper's durability lag statistics to status --- documentation/sphinx/source/mr-status-json-schemas.rst.inc | 4 +++- fdbclient/Schemas.cpp | 4 +++- fdbserver/Ratekeeper.actor.cpp | 4 ++-- fdbserver/Status.actor.cpp | 2 ++ 4 files changed, 10 insertions(+), 4 deletions(-) diff --git a/documentation/sphinx/source/mr-status-json-schemas.rst.inc b/documentation/sphinx/source/mr-status-json-schemas.rst.inc index ff9ae86947..b0e10d953c 100644 --- a/documentation/sphinx/source/mr-status-json-schemas.rst.inc +++ b/documentation/sphinx/source/mr-status-json-schemas.rst.inc @@ -264,7 +264,9 @@ "limiting_queue_bytes_storage_server":0, "worst_queue_bytes_storage_server":0, "limiting_version_lag_storage_server":0, - "worst_version_lag_storage_server":0 + "worst_version_lag_storage_server":0, + "limiting_durability_lag_storage_server":0, + "worst_durability_lag_storage_server":0 }, "incompatible_connections":[ ], diff --git a/fdbclient/Schemas.cpp b/fdbclient/Schemas.cpp index 06fd8f4041..d54fbddd7f 100644 --- a/fdbclient/Schemas.cpp +++ b/fdbclient/Schemas.cpp @@ -286,7 +286,9 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema( "limiting_queue_bytes_storage_server":0, "worst_queue_bytes_storage_server":0, "limiting_version_lag_storage_server":0, - "worst_version_lag_storage_server":0 + "worst_version_lag_storage_server":0, + "limiting_durability_lag_storage_server":0, + "worst_durability_lag_storage_server":0 }, "incompatible_connections":[ diff --git a/fdbserver/Ratekeeper.actor.cpp b/fdbserver/Ratekeeper.actor.cpp index 4d9b50b93d..d966da49e1 100644 --- a/fdbserver/Ratekeeper.actor.cpp +++ b/fdbserver/Ratekeeper.actor.cpp @@ -685,8 +685,8 @@ void updateRate(RatekeeperData* self, RatekeeperLimits* limits) { .detail("TotalDiskUsageBytes", totalDiskUsageBytes) .detail("WorstStorageServerVersionLag", 
worstVersionLag) .detail("LimitingStorageServerVersionLag", limitingVersionLag) - .detail("WorstDurabilityLag", worstDurabilityLag) - .detail("LimitingDurabilityLag", limitingDurabilityLag) + .detail("WorstStorageServerDurabilityLag", worstDurabilityLag) + .detail("LimitingStorageServerDurabilityLag", limitingDurabilityLag) .trackLatest(name.c_str()); } } diff --git a/fdbserver/Status.actor.cpp b/fdbserver/Status.actor.cpp index 5c3f9d3bd7..bd51e7f7fd 100644 --- a/fdbserver/Status.actor.cpp +++ b/fdbserver/Status.actor.cpp @@ -1613,6 +1613,8 @@ ACTOR static Future workloadStatusFetcher(Reference 0) { From 02ba73917b4b8729c2ad29d07612cc913b98ea85 Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Thu, 15 Aug 2019 11:08:08 -0700 Subject: [PATCH 2/4] Add release note. --- documentation/sphinx/source/release-notes.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/documentation/sphinx/source/release-notes.rst b/documentation/sphinx/source/release-notes.rst index cde95d0889..9be1f56d5b 100644 --- a/documentation/sphinx/source/release-notes.rst +++ b/documentation/sphinx/source/release-notes.rst @@ -54,6 +54,7 @@ Status * ``connected_clients`` is now only a sample of the connected clients, rather than a complete list. `(PR #1902) `_. * Added ``max_protocol_clients`` to the ``supported_versions`` section, which provides a sample of connected clients which cannot connect to any higher protocol version. `(PR #1902) `_. * Clients which connect without specifying their supported versions are tracked as an ``Unknown`` version in the ``supported_versions`` section. [6.2.2] `(PR #1990) `_. +* Added ``worst_durability_lag_storage_server`` and ``limiting_durability_lag_storage_server`` to the ``cluster.qos`` section. These report the durability lag values being used by ratekeeper to potentially limit the transaction rate. [6.2.3] `(PR #2003) `_. Bindings -------- From bb72cdd36a2c99e2d0c0d00bad70d0ffc1fed0ab Mon Sep 17 00:00:00 2001 From: "A.J. 
Beamon" Date: Thu, 15 Aug 2019 13:42:39 -0700 Subject: [PATCH 3/4] Report lag with the usual "seconds" and "versions" fields. Rename and deprecate the qos.*version_lag_storage_server fields. --- .../source/mr-status-json-schemas.rst.inc | 18 ++++++++-- documentation/sphinx/source/release-notes.rst | 3 +- fdbclient/Schemas.cpp | 18 ++++++++-- fdbserver/Status.actor.cpp | 35 +++++++++---------- 4 files changed, 51 insertions(+), 23 deletions(-) diff --git a/documentation/sphinx/source/mr-status-json-schemas.rst.inc b/documentation/sphinx/source/mr-status-json-schemas.rst.inc index b0e10d953c..ba23a065da 100644 --- a/documentation/sphinx/source/mr-status-json-schemas.rst.inc +++ b/documentation/sphinx/source/mr-status-json-schemas.rst.inc @@ -265,8 +265,22 @@ "worst_queue_bytes_storage_server":0, "limiting_version_lag_storage_server":0, "worst_version_lag_storage_server":0, - "limiting_durability_lag_storage_server":0, - "worst_durability_lag_storage_server":0 + "limiting_data_lag_storage_server":{ + "versions":0, + "seconds":0.0 + }, + "worst_data_lag_storage_server":{ + "versions":0, + "seconds":0.0 + }, + "limiting_durability_lag_storage_server":{ + "versions":0, + "seconds":0.0 + }, + "worst_durability_lag_storage_server":{ + "versions":0, + "seconds":0.0 + } }, "incompatible_connections":[ ], diff --git a/documentation/sphinx/source/release-notes.rst b/documentation/sphinx/source/release-notes.rst index 9be1f56d5b..a183cc42a1 100644 --- a/documentation/sphinx/source/release-notes.rst +++ b/documentation/sphinx/source/release-notes.rst @@ -54,7 +54,8 @@ Status * ``connected_clients`` is now only a sample of the connected clients, rather than a complete list. `(PR #1902) `_. * Added ``max_protocol_clients`` to the ``supported_versions`` section, which provides a sample of connected clients which cannot connect to any higher protocol version. `(PR #1902) `_. 
* Clients which connect without specifying their supported versions are tracked as an ``Unknown`` version in the ``supported_versions`` section. [6.2.2] `(PR #1990) <https://github.com/apple/foundationdb/pull/1990>`_. -* Added ``worst_durability_lag_storage_server`` and ``limiting_durability_lag_storage_server`` to the ``cluster.qos`` section. These report the durability lag values being used by ratekeeper to potentially limit the transaction rate. [6.2.3] `(PR #2003) <https://github.com/apple/foundationdb/pull/2003>`_. +* Added ``worst_durability_lag_storage_server`` and ``limiting_durability_lag_storage_server`` to the ``cluster.qos`` section, each with subfields ``versions`` and ``seconds``. These report the durability lag values being used by ratekeeper to potentially limit the transaction rate. [6.2.3] `(PR #2003) <https://github.com/apple/foundationdb/pull/2003>`_. +* Added ``worst_data_lag_storage_server`` and ``limiting_data_lag_storage_server`` to the ``cluster.qos`` section, each with subfields ``versions`` and ``seconds``. These are meant to replace ``worst_version_lag_storage_server`` and ``limiting_version_lag_storage_server``, which are now deprecated. [6.2.3] `(PR #2003) <https://github.com/apple/foundationdb/pull/2003>`_. 
Bindings -------- diff --git a/fdbclient/Schemas.cpp b/fdbclient/Schemas.cpp index d54fbddd7f..79c0f85725 100644 --- a/fdbclient/Schemas.cpp +++ b/fdbclient/Schemas.cpp @@ -287,8 +287,22 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema( "worst_queue_bytes_storage_server":0, "limiting_version_lag_storage_server":0, "worst_version_lag_storage_server":0, - "limiting_durability_lag_storage_server":0, - "worst_durability_lag_storage_server":0 + "limiting_data_lag_storage_server":{ + "versions":0, + "seconds":0.0 + }, + "worst_data_lag_storage_server":{ + "versions":0, + "seconds":0.0 + }, + "limiting_durability_lag_storage_server":{ + "versions":0, + "seconds":0.0 + }, + "worst_durability_lag_storage_server":{ + "versions":0, + "seconds":0.0 + } }, "incompatible_connections":[ diff --git a/fdbserver/Status.actor.cpp b/fdbserver/Status.actor.cpp index bd51e7f7fd..e91303cb69 100644 --- a/fdbserver/Status.actor.cpp +++ b/fdbserver/Status.actor.cpp @@ -388,6 +388,13 @@ static JsonBuilderObject machineStatusFetcher(WorkerEvents mMetrics, vectorVERSIONS_PER_SECOND; + return lag; +} + struct MachineMemoryInfo { double memoryUsage; double numProcesses; @@ -474,17 +481,8 @@ struct RolesInfo { obj["read_latency_bands"] = addLatencyBandInfo(readLatencyMetrics); } - JsonBuilderObject dataLag; - dataLag["versions"] = versionLag; - dataLagSeconds = versionLag / (double)SERVER_KNOBS->VERSIONS_PER_SECOND; - dataLag["seconds"] = dataLagSeconds; - - JsonBuilderObject durabilityLag; - durabilityLag["versions"] = version - durableVersion; - durabilityLag["seconds"] = (version - durableVersion) / (double)SERVER_KNOBS->VERSIONS_PER_SECOND; - - obj["data_lag"] = dataLag; - obj["durability_lag"] = durabilityLag; + obj["data_lag"] = getLagObject(versionLag); + obj["durability_lag"] = getLagObject(version - durableVersion); } catch (Error& e) { if(e.code() != error_code_attribute_not_found) @@ -1611,10 +1609,15 @@ ACTOR static Future workloadStatusFetcher(Reference 0) 
{ @@ -2306,11 +2309,7 @@ ACTOR Future clusterGetStatus( incompatibleConnectionsArray.push_back(it.toString()); } statusObj["incompatible_connections"] = incompatibleConnectionsArray; - - StatusObject datacenterLag; - datacenterLag["versions"] = datacenterVersionDifference; - datacenterLag["seconds"] = datacenterVersionDifference / (double)SERVER_KNOBS->VERSIONS_PER_SECOND; - statusObj["datacenter_lag"] = datacenterLag; + statusObj["datacenter_lag"] = getLagObject(datacenterVersionDifference); int totalDegraded = 0; for(auto& it : workers) { From a148ddc7d55c4d842fca9069dcd8b5ed638fe8ec Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Thu, 15 Aug 2019 14:45:36 -0700 Subject: [PATCH 4/4] Fix spacing --- documentation/sphinx/source/mr-status-json-schemas.rst.inc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/documentation/sphinx/source/mr-status-json-schemas.rst.inc b/documentation/sphinx/source/mr-status-json-schemas.rst.inc index ba23a065da..c36fea9d7a 100644 --- a/documentation/sphinx/source/mr-status-json-schemas.rst.inc +++ b/documentation/sphinx/source/mr-status-json-schemas.rst.inc @@ -265,7 +265,7 @@ "worst_queue_bytes_storage_server":0, "limiting_version_lag_storage_server":0, "worst_version_lag_storage_server":0, - "limiting_data_lag_storage_server":{ + "limiting_data_lag_storage_server":{ "versions":0, "seconds":0.0 },