diff --git a/fdbserver/DDTeamCollection.actor.cpp b/fdbserver/DDTeamCollection.actor.cpp index 58507ffdf6..b145d0eb47 100644 --- a/fdbserver/DDTeamCollection.actor.cpp +++ b/fdbserver/DDTeamCollection.actor.cpp @@ -200,8 +200,9 @@ public: } int64_t bestLoadBytes = 0; + bool wigglingBestOption = false; // best option contains server in paused wiggle state Optional> bestOption; - std::vector> randomTeams; + std::vector> randomTeams; const std::set completeSources(req.completeSources.begin(), req.completeSources.end()); // Note: this block does not apply any filters from the request @@ -249,9 +250,18 @@ public: (!req.teamMustHaveShards || self->shardsAffectedByTeamFailure->hasShards(ShardsAffectedByTeamFailure::Team( self->teams[currentIndex]->getServerIDs(), self->primary)))) { + + // bestOption doesn't contain wiggling SS while current team does. Don't replace bestOption + // in this case + if (bestOption.present() && !wigglingBestOption && + self->teams[currentIndex]->hasWigglePausedServer()) { + continue; + } + bestLoadBytes = loadBytes; bestOption = self->teams[currentIndex]; bestIndex = currentIndex; + wigglingBestOption = self->teams[bestIndex]->hasWigglePausedServer(); } } } @@ -262,7 +272,7 @@ public: while (randomTeams.size() < SERVER_KNOBS->BEST_TEAM_OPTION_COUNT && nTries < SERVER_KNOBS->BEST_TEAM_MAX_TEAM_TRIES) { // If unhealthy team is majority, we may not find an ok dest in this while loop - Reference dest = deterministicRandom()->randomChoice(self->teams); + Reference dest = deterministicRandom()->randomChoice(self->teams); bool ok = dest->isHealthy() && (!req.preferLowerUtilization || dest->hasHealthyAvailableSpace(self->medianAvailableSpace)); @@ -298,8 +308,16 @@ public: int64_t loadBytes = randomTeams[i]->getLoadBytes(true, req.inflightPenalty); if (!bestOption.present() || (req.preferLowerUtilization && loadBytes < bestLoadBytes) || (!req.preferLowerUtilization && loadBytes > bestLoadBytes)) { + + // bestOption doesn't contain wiggling SS while current team does. Don't replace bestOption + // in this case + if (bestOption.present() && !wigglingBestOption && randomTeams[i]->hasWigglePausedServer()) { + continue; + } + bestLoadBytes = loadBytes; bestOption = randomTeams[i]; + wigglingBestOption = randomTeams[i]->hasWigglePausedServer(); } } } @@ -3611,6 +3629,10 @@ void DDTeamCollection::removeLaggingStorageServer(Key zoneId) { disableFailingLaggingServers.set(false); } +bool DDTeamCollection::isWigglePausedServer(const UID& server) const { + return pauseWiggle && pauseWiggle->get() && wigglingId == server; +} + std::vector DDTeamCollection::getRandomHealthyTeam(const UID& excludeServer) { std::vector candidates, backup; for (int i = 0; i < teams.size(); ++i) { diff --git a/fdbserver/DDTeamCollection.h b/fdbserver/DDTeamCollection.h index 86cd92e4cf..c307dd4598 100644 --- a/fdbserver/DDTeamCollection.h +++ b/fdbserver/DDTeamCollection.h @@ -594,6 +594,9 @@ public: void removeLaggingStorageServer(Key zoneId); + // whether server is under wiggling proces, but wiggle is paused for some healthy compliance. + bool isWigglePausedServer(const UID& server) const; + // Returns a random healthy team, which does not contain excludeServer. std::vector getRandomHealthyTeam(const UID& excludeServer); diff --git a/fdbserver/TCInfo.actor.cpp b/fdbserver/TCInfo.actor.cpp index 439a71255e..67a4e0a96c 100644 --- a/fdbserver/TCInfo.actor.cpp +++ b/fdbserver/TCInfo.actor.cpp @@ -154,6 +154,10 @@ bool TCServerInfo::hasHealthyAvailableSpace(double minAvailableSpaceRatio) const return availableSpaceRatio >= minAvailableSpaceRatio; } +bool TCServerInfo::isWigglePausedServer() const { + return collection && collection->isWigglePausedServer(id); +} + Future TCServerInfo::updateServerMetrics() { return TCServerInfoImpl::updateServerMetrics(this); } @@ -431,6 +435,14 @@ bool TCTeamInfo::hasServer(const UID& server) const { return std::find(serverIDs.begin(), serverIDs.end(), server) != serverIDs.end(); } +bool TCTeamInfo::hasWigglePausedServer() const { + for (const auto& server : servers) { + if (server->isWigglePausedServer()) + return true; + } + return false; +} + void TCTeamInfo::addServers(const std::vector& servers) { serverIDs.reserve(servers.size()); for (int i = 0; i < servers.size(); i++) { diff --git a/fdbserver/TCInfo.h b/fdbserver/TCInfo.h index da18d345e7..df08598c4e 100644 --- a/fdbserver/TCInfo.h +++ b/fdbserver/TCInfo.h @@ -97,6 +97,7 @@ public: // If a storage server does not reply its storeType, it will be tracked by failure monitor and removed. return (storeType == configStoreType || storeType == KeyValueStoreType::END); } + bool isWigglePausedServer() const; std::pair spaceBytes(bool includeInFlight = true) const; int64_t loadBytes() const; @@ -214,6 +215,7 @@ public: void delref() override { ReferenceCounted::delref(); } bool hasServer(const UID& server) const; + bool hasWigglePausedServer() const; void addServers(const std::vector& servers) override;