From fe03e6f82289b7634a58da6468d5a2ec37504617 Mon Sep 17 00:00:00 2001 From: "Bharadwaj V.R" Date: Tue, 15 Feb 2022 22:43:06 -0800 Subject: [PATCH 1/5] Introduce a new server knob and use it to test if storage servers are near the min bar for available space --- fdbclient/ServerKnobs.cpp | 1 + fdbclient/ServerKnobs.h | 1 + fdbserver/DDTeamCollection.actor.cpp | 6 +---- fdbserver/TCInfo.actor.cpp | 34 +++++++++++++++++++++++++++- fdbserver/TCInfo.h | 4 ++++ 5 files changed, 40 insertions(+), 6 deletions(-) diff --git a/fdbclient/ServerKnobs.cpp b/fdbclient/ServerKnobs.cpp index 752839ccc6..a24100870e 100644 --- a/fdbclient/ServerKnobs.cpp +++ b/fdbclient/ServerKnobs.cpp @@ -579,6 +579,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( MIN_AVAILABLE_SPACE, 1e8 ); init( MIN_AVAILABLE_SPACE_RATIO, 0.05 ); + init( AVAILABLE_SPACE_RATIO_CUTOFF_SAFETY_BUFFER, 0.03 ); init( TARGET_AVAILABLE_SPACE_RATIO, 0.30 ); init( AVAILABLE_SPACE_UPDATE_DELAY, 5.0 ); diff --git a/fdbclient/ServerKnobs.h b/fdbclient/ServerKnobs.h index a813bad097..1eac4c27e2 100644 --- a/fdbclient/ServerKnobs.h +++ b/fdbclient/ServerKnobs.h @@ -525,6 +525,7 @@ public: int64_t MIN_AVAILABLE_SPACE; double MIN_AVAILABLE_SPACE_RATIO; + double AVAILABLE_SPACE_RATIO_CUTOFF_SAFETY_BUFFER; double TARGET_AVAILABLE_SPACE_RATIO; double AVAILABLE_SPACE_UPDATE_DELAY; diff --git a/fdbserver/DDTeamCollection.actor.cpp b/fdbserver/DDTeamCollection.actor.cpp index c6dc10b0f1..b0effff1d9 100644 --- a/fdbserver/DDTeamCollection.actor.cpp +++ b/fdbserver/DDTeamCollection.actor.cpp @@ -5671,11 +5671,7 @@ TEST_CASE("/DataDistribution/GetTeam/ServerUtilizationNearCutoff") { std::pair>, bool> resTeam = req.reply.getFuture().get(); - std::set expectedServers{ UID(2, 0), UID(3, 0), UID(4, 0) }; - ASSERT(resTeam.first.present()); - auto servers = resTeam.first.get()->getServerIDs(); - const std::set selectedServers(servers.begin(), servers.end()); - ASSERT(expectedServers == selectedServers); + ASSERT(!resTeam.first.present()); return Void(); } diff --git a/fdbserver/TCInfo.actor.cpp b/fdbserver/TCInfo.actor.cpp index a430b0d5f7..6d4e65afa4 100644 --- a/fdbserver/TCInfo.actor.cpp +++ b/fdbserver/TCInfo.actor.cpp @@ -138,6 +138,23 @@ TCServerInfo::TCServerInfo(StorageServerInterface ssi, } } +bool TCServerInfo::hasHealthyAvailableSpace(double minAvailableSpaceRatio) const { + ASSERT(serverMetrics.present()); + + auto& metrics = serverMetrics.get(); + ASSERT(metrics.available.bytes >= 0); + ASSERT(metrics.capacity.bytes >= 0); + + double availableSpaceRatio; + if (metrics.capacity.bytes == 0) { + availableSpaceRatio = 0; + } else { + availableSpaceRatio = (((double)metrics.available.bytes) / metrics.capacity.bytes); + } + + return availableSpaceRatio >= minAvailableSpaceRatio; +} + Future TCServerInfo::updateServerMetrics() { return TCServerInfoImpl::updateServerMetrics(this); } @@ -319,8 +336,23 @@ double TCTeamInfo::getMinAvailableSpaceRatio(bool includeInFlight) const { return minRatio; } +bool TCTeamInfo::allServersHaveHealthyAvailableSpace() const { + bool result = true; + double minAvailableSpaceRatio = + SERVER_KNOBS->AVAILABLE_SPACE_RATIO_CUTOFF + SERVER_KNOBS->AVAILABLE_SPACE_RATIO_CUTOFF_SAFETY_BUFFER; + for (const auto& server : servers) { + if (!server->serverMetrics.present() || !server->hasHealthyAvailableSpace(minAvailableSpaceRatio)) { + result = false; + break; + } + } + + return result; +} + bool TCTeamInfo::hasHealthyAvailableSpace(double minRatio) const { - return getMinAvailableSpaceRatio() >= minRatio && getMinAvailableSpace() > SERVER_KNOBS->MIN_AVAILABLE_SPACE; + return getMinAvailableSpaceRatio() >= minRatio && getMinAvailableSpace() > SERVER_KNOBS->MIN_AVAILABLE_SPACE && + allServersHaveHealthyAvailableSpace(); } bool TCTeamInfo::isOptimal() const { diff --git a/fdbserver/TCInfo.h b/fdbserver/TCInfo.h index b64e415204..101dd845e9 100644 --- a/fdbserver/TCInfo.h +++ b/fdbserver/TCInfo.h @@ -71,6 +71,8 @@ public: return (storeType == configStoreType || storeType == KeyValueStoreType::END); } + bool hasHealthyAvailableSpace(double minAvailableSpaceRatio) const; + Future updateServerMetrics(); static Future updateServerMetrics(Reference server); @@ -180,4 +182,6 @@ private: // Calculate an "average" of the metrics replies that we received. Penalize teams from which we did not receive all // replies. int64_t getLoadAverage() const; + + bool allServersHaveHealthyAvailableSpace() const; }; From 3fe6a952f145f8ca2331a44810131cb01e535d48 Mon Sep 17 00:00:00 2001 From: "Bharadwaj V.R" Date: Wed, 16 Feb 2022 10:28:55 -0800 Subject: [PATCH 2/5] Merge with upstream tcinfo refactor and move the server knob init to be adjacent to related knobs --- fdbclient/ServerKnobs.cpp | 2 +- fdbclient/ServerKnobs.h | 2 +- fdbserver/TCInfo.actor.cpp | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/fdbclient/ServerKnobs.cpp b/fdbclient/ServerKnobs.cpp index a24100870e..d507bb7dd7 100644 --- a/fdbclient/ServerKnobs.cpp +++ b/fdbclient/ServerKnobs.cpp @@ -233,6 +233,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( STORAGE_METRICS_POLLING_DELAY, 2.0 ); if( randomize && BUGGIFY ) STORAGE_METRICS_POLLING_DELAY = 15.0; init( STORAGE_METRICS_RANDOM_DELAY, 0.2 ); init( AVAILABLE_SPACE_RATIO_CUTOFF, 0.05 ); + init( AVAILABLE_SPACE_RATIO_CUTOFF_SAFETY_BUFFER, 0.03 ); init( DESIRED_TEAMS_PER_SERVER, 5 ); if( randomize && BUGGIFY ) DESIRED_TEAMS_PER_SERVER = deterministicRandom()->randomInt(1, 10); init( MAX_TEAMS_PER_SERVER, 5*DESIRED_TEAMS_PER_SERVER ); init( DD_SHARD_SIZE_GRANULARITY, 5000000 ); @@ -579,7 +580,6 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( MIN_AVAILABLE_SPACE, 1e8 ); init( MIN_AVAILABLE_SPACE_RATIO, 0.05 ); - init( AVAILABLE_SPACE_RATIO_CUTOFF_SAFETY_BUFFER, 0.03 ); init( TARGET_AVAILABLE_SPACE_RATIO, 0.30 ); init( AVAILABLE_SPACE_UPDATE_DELAY, 5.0 ); diff --git a/fdbclient/ServerKnobs.h b/fdbclient/ServerKnobs.h index 1eac4c27e2..9f442f3470 100644 --- a/fdbclient/ServerKnobs.h +++ b/fdbclient/ServerKnobs.h @@ -185,6 +185,7 @@ public: double STORAGE_METRICS_POLLING_DELAY; double STORAGE_METRICS_RANDOM_DELAY; double AVAILABLE_SPACE_RATIO_CUTOFF; + double AVAILABLE_SPACE_RATIO_CUTOFF_SAFETY_BUFFER; int DESIRED_TEAMS_PER_SERVER; int MAX_TEAMS_PER_SERVER; int64_t DD_SHARD_SIZE_GRANULARITY; @@ -525,7 +526,6 @@ public: int64_t MIN_AVAILABLE_SPACE; double MIN_AVAILABLE_SPACE_RATIO; - double AVAILABLE_SPACE_RATIO_CUTOFF_SAFETY_BUFFER; double TARGET_AVAILABLE_SPACE_RATIO; double AVAILABLE_SPACE_UPDATE_DELAY; diff --git a/fdbserver/TCInfo.actor.cpp b/fdbserver/TCInfo.actor.cpp index 87212ead0f..e85a0aeb83 100644 --- a/fdbserver/TCInfo.actor.cpp +++ b/fdbserver/TCInfo.actor.cpp @@ -138,9 +138,9 @@ TCServerInfo::TCServerInfo(StorageServerInterface ssi, } bool TCServerInfo::hasHealthyAvailableSpace(double minAvailableSpaceRatio) const { - ASSERT(serverMetrics.present()); + ASSERT(serverMetricsPresent()); - auto& metrics = serverMetrics.get(); + auto& metrics = getServerMetrics(); ASSERT(metrics.available.bytes >= 0); ASSERT(metrics.capacity.bytes >= 0); @@ -403,7 +403,7 @@ bool TCTeamInfo::allServersHaveHealthyAvailableSpace() const { double minAvailableSpaceRatio = SERVER_KNOBS->AVAILABLE_SPACE_RATIO_CUTOFF + SERVER_KNOBS->AVAILABLE_SPACE_RATIO_CUTOFF_SAFETY_BUFFER; for (const auto& server : servers) { - if (!server->serverMetrics.present() || !server->hasHealthyAvailableSpace(minAvailableSpaceRatio)) { + if (!server->serverMetricsPresent() || !server->hasHealthyAvailableSpace(minAvailableSpaceRatio)) { result = false; break; } From 949f1f1c3e0bac2fa32fd5058b52d741512c81e7 Mon Sep 17 00:00:00 2001 From: "Bharadwaj V.R" Date: Wed, 16 Feb 2022 11:33:07 -0800 Subject: [PATCH 3/5] Switch to testing MIN_AVAILABLE_SPACE --- fdbclient/ServerKnobs.cpp | 2 +- fdbclient/ServerKnobs.h | 2 +- fdbserver/TCInfo.actor.cpp | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fdbclient/ServerKnobs.cpp b/fdbclient/ServerKnobs.cpp index d507bb7dd7..3c68f8c712 100644 --- a/fdbclient/ServerKnobs.cpp +++ b/fdbclient/ServerKnobs.cpp @@ -233,7 +233,6 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( STORAGE_METRICS_POLLING_DELAY, 2.0 ); if( randomize && BUGGIFY ) STORAGE_METRICS_POLLING_DELAY = 15.0; init( STORAGE_METRICS_RANDOM_DELAY, 0.2 ); init( AVAILABLE_SPACE_RATIO_CUTOFF, 0.05 ); - init( AVAILABLE_SPACE_RATIO_CUTOFF_SAFETY_BUFFER, 0.03 ); init( DESIRED_TEAMS_PER_SERVER, 5 ); if( randomize && BUGGIFY ) DESIRED_TEAMS_PER_SERVER = deterministicRandom()->randomInt(1, 10); init( MAX_TEAMS_PER_SERVER, 5*DESIRED_TEAMS_PER_SERVER ); init( DD_SHARD_SIZE_GRANULARITY, 5000000 ); @@ -580,6 +579,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( MIN_AVAILABLE_SPACE, 1e8 ); init( MIN_AVAILABLE_SPACE_RATIO, 0.05 ); + init( MIN_AVAILABLE_SPACE_RATIO_SAFETY_BUFFER, 0.03 ); init( TARGET_AVAILABLE_SPACE_RATIO, 0.30 ); init( AVAILABLE_SPACE_UPDATE_DELAY, 5.0 ); diff --git a/fdbclient/ServerKnobs.h b/fdbclient/ServerKnobs.h index 9f442f3470..5bb48fc3e2 100644 --- a/fdbclient/ServerKnobs.h +++ b/fdbclient/ServerKnobs.h @@ -185,7 +185,6 @@ public: double STORAGE_METRICS_POLLING_DELAY; double STORAGE_METRICS_RANDOM_DELAY; double AVAILABLE_SPACE_RATIO_CUTOFF; - double AVAILABLE_SPACE_RATIO_CUTOFF_SAFETY_BUFFER; int DESIRED_TEAMS_PER_SERVER; int MAX_TEAMS_PER_SERVER; int64_t DD_SHARD_SIZE_GRANULARITY; @@ -526,6 +525,7 @@ public: int64_t MIN_AVAILABLE_SPACE; double MIN_AVAILABLE_SPACE_RATIO; + double MIN_AVAILABLE_SPACE_RATIO_SAFETY_BUFFER; double TARGET_AVAILABLE_SPACE_RATIO; double AVAILABLE_SPACE_UPDATE_DELAY; diff --git a/fdbserver/TCInfo.actor.cpp b/fdbserver/TCInfo.actor.cpp index e85a0aeb83..5cf7f6b2dd 100644 --- a/fdbserver/TCInfo.actor.cpp +++ b/fdbserver/TCInfo.actor.cpp @@ -401,7 +401,7 @@ double TCTeamInfo::getMinAvailableSpaceRatio(bool includeInFlight) const { bool TCTeamInfo::allServersHaveHealthyAvailableSpace() const { bool result = true; double minAvailableSpaceRatio = - SERVER_KNOBS->AVAILABLE_SPACE_RATIO_CUTOFF + SERVER_KNOBS->AVAILABLE_SPACE_RATIO_CUTOFF_SAFETY_BUFFER; + SERVER_KNOBS->MIN_AVAILABLE_SPACE_RATIO + SERVER_KNOBS->MIN_AVAILABLE_SPACE_RATIO_SAFETY_BUFFER; for (const auto& server : servers) { if (!server->serverMetricsPresent() || !server->hasHealthyAvailableSpace(minAvailableSpaceRatio)) { result = false; From a54acb372004d5f5d368c2591b49e579573e56c3 Mon Sep 17 00:00:00 2001 From: "Bharadwaj V.R" Date: Wed, 16 Feb 2022 19:26:40 -0800 Subject: [PATCH 4/5] Temporarily lower safety buffer knob. AtomicBackupCorrectness needs fixing --- fdbclient/ServerKnobs.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdbclient/ServerKnobs.cpp b/fdbclient/ServerKnobs.cpp index 1b218fdf9c..21f1b38390 100644 --- a/fdbclient/ServerKnobs.cpp +++ b/fdbclient/ServerKnobs.cpp @@ -581,7 +581,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( MIN_AVAILABLE_SPACE, 1e8 ); init( MIN_AVAILABLE_SPACE_RATIO, 0.05 ); - init( MIN_AVAILABLE_SPACE_RATIO_SAFETY_BUFFER, 0.03 ); + init( MIN_AVAILABLE_SPACE_RATIO_SAFETY_BUFFER, 0.01 ); init( TARGET_AVAILABLE_SPACE_RATIO, 0.30 ); init( AVAILABLE_SPACE_UPDATE_DELAY, 5.0 ); From 8cb554a006960b3d9d587b834d98dbab6e132a29 Mon Sep 17 00:00:00 2001 From: "Bharadwaj V.R" Date: Wed, 23 Feb 2022 10:21:34 -0800 Subject: [PATCH 5/5] Fix formatting issues in DDTeamCollection.actor.cpp --- fdbserver/DDTeamCollection.actor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdbserver/DDTeamCollection.actor.cpp b/fdbserver/DDTeamCollection.actor.cpp index cf8d9c1708..e3e1c3d724 100644 --- a/fdbserver/DDTeamCollection.actor.cpp +++ b/fdbserver/DDTeamCollection.actor.cpp @@ -5644,7 +5644,7 @@ public: std::pair>, bool> resTeam = req.reply.getFuture().get(); - ASSERT(!resTeam.first.present()); + ASSERT(!resTeam.first.present()); return Void(); }