Revert "Properly set simulation test for perpetual storage wiggle and bug fixing"
commit ad576e8c20
parent 5faf082f83
@@ -52,7 +52,6 @@ class TCMachineTeamInfo;
 ACTOR Future<Void> checkAndRemoveInvalidLocalityAddr(DDTeamCollection* self);
 ACTOR Future<Void> removeWrongStoreType(DDTeamCollection* self);
 ACTOR Future<Void> waitForAllDataRemoved(Database cx, UID serverID, Version addedVersion, DDTeamCollection* teams);
-bool _exclusionSafetyCheck(vector<UID>& excludeServerIDs, DDTeamCollection* teamCollection);
 
 struct TCServerInfo : public ReferenceCounted<TCServerInfo> {
     UID id;
@@ -376,16 +375,14 @@ struct ServerStatus {
     LocalityData locality;
     ServerStatus()
       : isWiggling(false), isFailed(true), isUndesired(false), isWrongConfiguration(false), initialized(false) {}
-    ServerStatus(bool isFailed, bool isUndesired, bool isWiggling, LocalityData const& locality)
+    ServerStatus(bool isFailed, bool isUndesired, LocalityData const& locality)
       : isFailed(isFailed), isUndesired(isUndesired), locality(locality), isWrongConfiguration(false),
-        initialized(true), isWiggling(isWiggling) {}
+        initialized(true), isWiggling(false) {}
     bool isUnhealthy() const { return isFailed || isUndesired; }
-    const char* toString() const {
-        return isFailed ? "Failed" : isUndesired ? "Undesired" : isWiggling ? "Wiggling" : "Healthy";
-    }
+    const char* toString() const { return isFailed ? "Failed" : isUndesired ? "Undesired" : "Healthy"; }
 
     bool operator==(ServerStatus const& r) const {
-        return isFailed == r.isFailed && isUndesired == r.isUndesired && isWiggling == r.isWiggling &&
+        return isFailed == r.isFailed && isUndesired == r.isUndesired &&
                isWrongConfiguration == r.isWrongConfiguration && locality == r.locality && initialized == r.initialized;
     }
     bool operator!=(ServerStatus const& r) const { return !(*this == r); }
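The net change to ServerStatus is easier to read outside diff form. Below is a distilled, compilable sketch of the struct as the revert leaves it; a plain string stands in for FDB's LocalityData and unrelated members are omitted, both assumptions of this sketch rather than the real header:

```cpp
#include <cstdio>
#include <string>

// Stand-in for FDB's LocalityData; the real type carries zone/DC/process ids.
using LocalityData = std::string;

struct ServerStatus {
    bool isWiggling = false;
    bool isFailed = true;
    bool isUndesired = false;
    bool isWrongConfiguration = false;
    bool initialized = false;
    LocalityData locality;

    ServerStatus() = default;
    // After the revert, the constructor no longer takes isWiggling;
    // the flag always starts false and is set later by the tracker.
    ServerStatus(bool isFailed, bool isUndesired, LocalityData const& locality)
      : isWiggling(false), isFailed(isFailed), isUndesired(isUndesired), initialized(true), locality(locality) {}

    bool isUnhealthy() const { return isFailed || isUndesired; }
    // "Wiggling" also disappears from the printable status.
    const char* toString() const { return isFailed ? "Failed" : isUndesired ? "Undesired" : "Healthy"; }
};

int main() {
    ServerStatus s(false, true, "dc0/zone1/p2");
    std::printf("%s\n", s.toString()); // prints "Undesired"
}
```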
@@ -624,7 +621,6 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
     std::map<int,int> priority_teams;
     std::map<UID, Reference<TCServerInfo>> server_info;
     std::map<Key, std::vector<Reference<TCServerInfo>>> pid2server_info; // some process may serve as multiple storage servers
-    std::vector<AddressExclusion> wiggle_addresses; // collection of wiggling servers' address
     std::map<UID, Reference<TCServerInfo>> tss_info_by_pair;
     std::map<UID, Reference<TCServerInfo>> server_and_tss_info; // TODO could replace this with an efficient way to do a read-only concatenation of 2 data structures?
     std::map<Key, int> lagging_zones; // zone to number of storage servers lagging
@@ -2830,7 +2826,6 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
             this->excludedServers.get(addr) != DDTeamCollection::Status::NONE) {
             continue; // don't overwrite the value set by actor trackExcludedServer
         }
-        this->wiggle_addresses.push_back(addr);
         this->excludedServers.set(addr, DDTeamCollection::Status::WIGGLING);
         moveFutures.push_back(
             waitForAllDataRemoved(this->cx, info->lastKnownInterface.id(), info->addedVersion, this));
@@ -2842,19 +2837,19 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
         return moveFutures;
     }
 
-    // Include wiggled storage servers by setting their status from `WIGGLING`
+    // Include storage servers held on process of which the Process Id is “pid” by setting their status from `WIGGLING`
     // to `NONE`. The storage recruiter will recruit them as new storage servers
-    void includeStorageServersForWiggle() {
+    void includeStorageServersForWiggle(const Value& pid) {
         bool included = false;
-        for (auto& address : this->wiggle_addresses) {
-            if (!this->excludedServers.count(address) ||
-                this->excludedServers.get(address) != DDTeamCollection::Status::WIGGLING) {
+        for (auto& info : this->pid2server_info[pid]) {
+            AddressExclusion addr(info->lastKnownInterface.address().ip);
+            if (!this->excludedServers.count(addr) ||
+                this->excludedServers.get(addr) != DDTeamCollection::Status::WIGGLING) {
                 continue;
             }
             included = true;
-            this->excludedServers.set(address, DDTeamCollection::Status::NONE);
+            this->excludedServers.set(addr, DDTeamCollection::Status::NONE);
         }
-        this->wiggle_addresses.clear();
         if (included) {
             this->restartRecruiting.trigger();
         }
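The restored includeStorageServersForWiggle(pid) re-includes only the servers of one process, and only if the wiggler itself marked them WIGGLING. The underlying pattern — a process-id index into server info plus a tri-state exclusion map — can be exercised standalone; everything named below (the Status enum, sample pids and addresses) is illustrative rather than FDB API:

```cpp
#include <cstdio>
#include <map>
#include <string>
#include <vector>

enum class Status { NONE, WIGGLING, EXCLUDED, FAILED };

int main() {
    // One process may host several storage servers (pid2server_info analogue).
    std::map<std::string, std::vector<std::string>> pid2servers = {
        { "pid-a", { "10.0.0.1", "10.0.0.2" } },
        { "pid-b", { "10.0.1.1" } },
    };
    std::map<std::string, Status> excluded = {
        { "10.0.0.1", Status::WIGGLING },
        { "10.0.0.2", Status::EXCLUDED }, // set by an operator, not the wiggler
    };

    // "Include for wiggle": flip WIGGLING back to NONE, but only for the
    // chosen pid, and never touch statuses the wiggler did not set.
    bool included = false;
    for (const auto& addr : pid2servers["pid-a"]) {
        auto it = excluded.find(addr);
        if (it == excluded.end() || it->second != Status::WIGGLING)
            continue;
        it->second = Status::NONE;
        included = true;
    }
    if (included)
        std::printf("re-recruit storage on pid-a\n"); // restartRecruiting.trigger() analogue
}
```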
@@ -3536,7 +3531,8 @@ ACTOR Future<Void> teamTracker(DDTeamCollection* self, Reference<TCTeamInfo> tea
         }
         change.push_back(self->zeroHealthyTeams->onChange());
 
-        bool healthy = !badTeam && !anyUndesired && serversLeft == self->configuration.storageTeamSize;
+        bool healthy =
+            !badTeam && !anyUndesired && serversLeft == self->configuration.storageTeamSize && !anyWigglingServer;
         team->setHealthy(healthy); // Unhealthy teams won't be chosen by bestTeam
         bool optimal = team->isOptimal() && healthy;
         bool containsFailed = teamContainsFailedServer(self, team);
@@ -3833,12 +3829,10 @@ ACTOR Future<Void> trackExcludedServers(DDTeamCollection* self) {
 
         // Reset and reassign self->excludedServers based on excluded, but we only
         // want to trigger entries that are different
-        // Do not retrigger and double-overwrite failed or wiggling servers
+        // Do not retrigger and double-overwrite failed servers
         auto old = self->excludedServers.getKeys();
         for (const auto& o : old) {
-            if (!excluded.count(o) && !failed.count(o) &&
-                !(self->excludedServers.count(o) &&
-                  self->excludedServers.get(o) == DDTeamCollection::Status::WIGGLING)) {
+            if (!excluded.count(o) && !failed.count(o)) {
                 self->excludedServers.set(o, DDTeamCollection::Status::NONE);
             }
         }
@@ -3890,7 +3884,6 @@ ACTOR Future<vector<std::pair<StorageServerInterface, ProcessClass>>> getServerL
 // to a sorted PID set maintained by the data distributor. If now no storage server exists, the new Process ID is 0.
 ACTOR Future<Void> updateNextWigglingStoragePID(DDTeamCollection* teamCollection) {
     state ReadYourWritesTransaction tr(teamCollection->cx);
-    state Value writeValue = LiteralStringRef("0");
     loop {
         try {
             tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
@@ -3903,14 +3896,11 @@ ACTOR Future<Void> updateNextWigglingStoragePID(DDTeamCollection
                 auto nextIt = teamCollection->pid2server_info.upper_bound(value.get());
                 if (nextIt == teamCollection->pid2server_info.end()) {
                     tr.set(wigglingStorageServerKey, pid);
-                    writeValue = pid;
                 } else {
                     tr.set(wigglingStorageServerKey, nextIt->first);
-                    writeValue = nextIt->first;
                 }
             } else {
                 tr.set(wigglingStorageServerKey, pid);
-                writeValue = pid;
             }
         }
         wait(tr.commit());
@@ -3919,9 +3909,6 @@ ACTOR Future<Void> updateNextWigglingStoragePID(DDTeamCollection
             wait(tr.onError(e));
         }
     }
-    TraceEvent(SevDebug, "PerpetualNextWigglingStoragePID", teamCollection->distributorId)
-        .detail("WriteValue", writeValue);
-
     return Void();
 }
 
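updateNextWigglingStoragePID advances through storage process ids in sorted key order and wraps around when it runs off the end of pid2server_info, writing "0" when no storage process exists at all. A minimal sketch of that round-robin idiom with std::map::upper_bound (the function name is invented for illustration, and the wrap-to-begin simplifies the actor's restart-from-first-pid behavior):

```cpp
#include <cstdio>
#include <map>
#include <string>

// Pick the next key after `current` in sorted order, wrapping to the first
// key; returns "0" when the map is empty (mirroring the hunk above).
std::string nextWigglePid(const std::map<std::string, int>& pids, const std::string& current) {
    if (pids.empty())
        return "0";
    auto nextIt = pids.upper_bound(current);
    if (nextIt == pids.end())
        nextIt = pids.begin(); // wrap around
    return nextIt->first;
}

int main() {
    std::map<std::string, int> pids = { { "pid-a", 2 }, { "pid-b", 1 }, { "pid-c", 3 } };
    std::string cur = "pid-a";
    for (int i = 0; i < 4; ++i) {
        cur = nextWigglePid(pids, cur);
        std::printf("%s\n", cur); // pid-b, pid-c, pid-a, pid-b
    }
}
```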
@@ -3931,6 +3918,9 @@ ACTOR Future<Void> updateNextWigglingStoragePID(DDTeamCollection
 ACTOR Future<Void> perpetualStorageWiggleIterator(AsyncTrigger* stopSignal,
                                                   FutureStream<Void> finishStorageWiggleSignal,
                                                   DDTeamCollection* teamCollection) {
+    // initialize PID
+    wait(updateNextWigglingStoragePID(teamCollection));
+
     loop choose {
         when(wait(stopSignal->onTrigger())) { break; }
         when(waitNext(finishStorageWiggleSignal)) { wait(updateNextWigglingStoragePID(teamCollection)); }
@@ -3941,8 +3931,8 @@ ACTOR Future<Void> perpetualStorageWiggleIterator(AsyncTrigger* stopSignal,
 
 // Watch the value change of `wigglingStorageServerKey`.
 // Return the watch future and the current value of `wigglingStorageServerKey`.
-ACTOR Future<std::pair<Future<Void>, Value>> watchPerpetualStoragePIDChange(DDTeamCollection* self) {
-    state ReadYourWritesTransaction tr(self->cx);
+ACTOR Future<std::pair<Future<Void>, Value>> watchPerpetualStoragePIDChange(Database cx) {
+    state ReadYourWritesTransaction tr(cx);
     state Future<Void> watchFuture;
     state Value ret;
     loop {
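watchPerpetualStoragePIDChange returns the key's current value together with a future that fires on the next change, so the caller neither polls nor misses an update between read and watch. FDB gets that atomicity from a transaction; the sketch below only approximates the same contract with a mutex and a version counter, so treat it as an analogy rather than how flow watches are implemented:

```cpp
#include <condition_variable>
#include <cstdint>
#include <cstdio>
#include <mutex>
#include <string>
#include <thread>
#include <utility>

class WatchableValue {
    std::mutex m;
    std::condition_variable cv;
    std::string value = "pid-a";
    uint64_t version = 0;

public:
    // Atomically read the value and note its version: a later wait on that
    // version completes only for changes made after this read.
    std::pair<std::string, uint64_t> readAndWatch() {
        std::lock_guard<std::mutex> g(m);
        return { value, version };
    }
    void waitForChange(uint64_t sinceVersion) {
        std::unique_lock<std::mutex> l(m);
        cv.wait(l, [&] { return version > sinceVersion; });
    }
    void set(std::string v) {
        { std::lock_guard<std::mutex> g(m); value = std::move(v); ++version; }
        cv.notify_all();
    }
};

int main() {
    WatchableValue key;
    auto [cur, ver] = key.readAndWatch();
    std::printf("current: %s\n", cur.c_str());
    std::thread writer([&] { key.set("pid-b"); });
    key.waitForChange(ver); // cannot miss the post-read change
    std::printf("changed: %s\n", key.readAndWatch().first.c_str());
    writer.join();
}
```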
@@ -3970,7 +3960,7 @@ ACTOR Future<Void> perpetualStorageWiggler(AsyncTrigger* stopSignal,
                                            PromiseStream<Void> finishStorageWiggleSignal,
                                            DDTeamCollection* self,
                                            const DDEnabledState* ddEnabledState) {
-    state Future<Void> watchFuture = Never();
+    state Future<Void> watchFuture;
     state Future<Void> moveFinishFuture = Never();
     state Debouncer pauseWiggle(SERVER_KNOBS->DEBOUNCE_RECRUITING_DELAY);
     state AsyncTrigger restart;
@@ -3978,16 +3968,13 @@ ACTOR Future<Void> perpetualStorageWiggler(AsyncTrigger* stopSignal,
         delay(SERVER_KNOBS->DD_ZERO_HEALTHY_TEAM_DELAY, TaskPriority::DataDistributionLow);
     state int movingCount = 0;
     state bool isPaused = false;
-    state vector<UID> excludedServerIds;
-    state std::pair<Future<Void>, Value> res = wait(watchPerpetualStoragePIDChange(self));
+    ASSERT(!self->wigglingPid.present()); // only single process wiggle is allowed
+
+    state std::pair<Future<Void>, Value> res = wait(watchPerpetualStoragePIDChange(self->cx));
     watchFuture = res.first;
     self->wigglingPid = Optional<Key>(res.second);
 
-    // start with the initial pid
-    for (const auto& info : self->pid2server_info[self->wigglingPid.get()]) {
-        excludedServerIds.push_back(info->id);
-    }
-    if (self->teams.size() > 1 && _exclusionSafetyCheck(excludedServerIds, self)) { // pre-check health status
+    if (self->healthyTeamCount > 1) { // pre-check health status
         TEST(true); // start the first wiggling
 
         auto fv = self->excludeStorageServersForWiggle(self->wigglingPid.get());
@@ -4006,20 +3993,15 @@ ACTOR Future<Void> perpetualStorageWiggler(AsyncTrigger* stopSignal,
     choose {
         when(wait(stopSignal->onTrigger())) { break; }
         when(wait(watchFuture)) {
             ASSERT(!self->wigglingPid.present()); // the previous wiggle must be finished
             watchFuture = Never();
 
             // read new pid and set the next watch Future
-            wait(store(res, watchPerpetualStoragePIDChange(self)));
+            wait(store(res, watchPerpetualStoragePIDChange(self->cx)));
             watchFuture = res.first;
             self->wigglingPid = Optional<Key>(res.second);
             StringRef pid = self->wigglingPid.get();
 
-            // pre-check health status
-            excludedServerIds.clear();
-            for (const auto& info : self->pid2server_info[self->wigglingPid.get()]) {
-                excludedServerIds.push_back(info->id);
-            }
-            if (self->teams.size() > 1 && _exclusionSafetyCheck(excludedServerIds, self)) {
+            if (self->healthyTeamCount <= 1) { // pre-check health status
+                pauseWiggle.trigger();
+            } else {
                 TEST(true); // start wiggling
 
                 auto fv = self->excludeStorageServersForWiggle(pid);
@@ -4028,8 +4010,6 @@ ACTOR Future<Void> perpetualStorageWiggler(AsyncTrigger* stopSignal,
                 TraceEvent("PerpetualStorageWiggleStart", self->distributorId)
                     .detail("ProcessId", pid)
                     .detail("StorageCount", movingCount);
-            } else {
-                pauseWiggle.trigger();
             }
         }
         when(wait(restart.onTrigger())) {
@@ -4050,13 +4030,12 @@ ACTOR Future<Void> perpetualStorageWiggler(AsyncTrigger* stopSignal,
             StringRef pid = self->wigglingPid.get();
 
             moveFinishFuture = Never();
-            self->includeStorageServersForWiggle();
+            self->includeStorageServersForWiggle(pid);
             TraceEvent("PerpetualStorageWiggleFinish", self->distributorId)
                 .detail("ProcessId", pid.toString())
                 .detail("StorageCount", movingCount);
 
             self->wigglingPid.reset();
             watchFuture = res.first;
             finishStorageWiggleSignal.send(Void());
         }
         when(wait(self->zeroHealthyTeams->onChange())) {
@@ -4071,11 +4050,11 @@ ACTOR Future<Void> perpetualStorageWiggler(AsyncTrigger* stopSignal,
 
             if (count >= SERVER_KNOBS->DD_STORAGE_WIGGLE_PAUSE_THRESHOLD && !isPaused) {
                 pauseWiggle.trigger();
-            } else if (isPaused && count < SERVER_KNOBS->DD_STORAGE_WIGGLE_PAUSE_THRESHOLD &&
-                       self->teams.size() > 1 && _exclusionSafetyCheck(excludedServerIds, self)) {
+            } else if (count < SERVER_KNOBS->DD_STORAGE_WIGGLE_PAUSE_THRESHOLD && self->healthyTeamCount > 1 &&
+                       isPaused) {
                 restart.trigger();
             }
-            ddQueueCheck = delay(SERVER_KNOBS->CHECK_TEAM_DELAY, TaskPriority::DataDistributionLow);
+            ddQueueCheck = delay(SERVER_KNOBS->DD_ZERO_HEALTHY_TEAM_DELAY, TaskPriority::DataDistributionLow);
         }
         when(wait(pauseWiggle.onTrigger())) {
             if (self->wigglingPid.present()) {
@@ -4083,7 +4062,7 @@ ACTOR Future<Void> perpetualStorageWiggler(AsyncTrigger* stopSignal,
                 StringRef pid = self->wigglingPid.get();
                 isPaused = true;
                 moveFinishFuture = Never();
-                self->includeStorageServersForWiggle();
+                self->includeStorageServersForWiggle(pid);
                 TraceEvent("PerpetualStorageWigglePause", self->distributorId)
                     .detail("ProcessId", pid)
                     .detail("StorageCount", movingCount);
@@ -4093,9 +4072,7 @@ ACTOR Future<Void> perpetualStorageWiggler(AsyncTrigger* stopSignal,
     }
 
     if (self->wigglingPid.present()) {
-        self->includeStorageServersForWiggle();
-        TraceEvent("PerpetualStorageWiggleExitingPause", self->distributorId)
-            .detail("ProcessId", self->wigglingPid.get());
+        self->includeStorageServersForWiggle(self->wigglingPid.get());
         self->wigglingPid.reset();
     }
 
@@ -4111,7 +4088,7 @@ ACTOR Future<Void> monitorPerpetualStorageWiggle(DDTeamCollectio
     state AsyncTrigger stopWiggleSignal;
     state PromiseStream<Void> finishStorageWiggleSignal;
     state SignalableActorCollection collection;
-    state bool started = false;
 
     loop {
         state ReadYourWritesTransaction tr(teamCollection->cx);
         loop {
@@ -4126,18 +4103,16 @@ ACTOR Future<Void> monitorPerpetualStorageWiggle(DDTeamCollectio
             wait(tr.commit());
 
             ASSERT(speed == 1 || speed == 0);
-            if (speed == 1 && !started) {
+            if (speed == 1) {
                 collection.add(perpetualStorageWiggleIterator(
                     &stopWiggleSignal, finishStorageWiggleSignal.getFuture(), teamCollection));
                 collection.add(perpetualStorageWiggler(
                     &stopWiggleSignal, finishStorageWiggleSignal, teamCollection, ddEnabledState));
                 TraceEvent("PerpetualStorageWiggleOpen", teamCollection->distributorId);
-                started = true;
-            } else if (speed == 0 && started) {
+            } else {
                 stopWiggleSignal.trigger();
                 wait(collection.signalAndReset());
                 TraceEvent("PerpetualStorageWiggleClose", teamCollection->distributorId);
-                started = false;
             }
             wait(watchFuture);
             break;
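The `started` flag this hunk removes is a plain idempotence guard: act only on 0-to-1 and 1-to-0 transitions, so a watch that fires without a real value change cannot double-start or double-stop the wiggle actors. The guard pattern in isolation (all names invented for illustration):

```cpp
#include <cstdio>

// Only 0/1 transitions take effect; duplicate notifications are ignored.
class WiggleSwitch {
    bool started = false;

public:
    void onSpeedChange(int speed) {
        if (speed == 1 && !started) {
            std::printf("start wiggle workers\n");
            started = true;
        } else if (speed == 0 && started) {
            std::printf("stop wiggle workers\n");
            started = false;
        } // otherwise: no transition, nothing to do
    }
};

int main() {
    WiggleSwitch s;
    s.onSpeedChange(1);
    s.onSpeedChange(1); // ignored: already started
    s.onSpeedChange(0);
    s.onSpeedChange(0); // ignored: already stopped
}
```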
@@ -4435,7 +4410,7 @@ ACTOR Future<Void> storageServerTracker(
     bool isTss) {
 
     state Future<Void> failureTracker;
-    state ServerStatus status(false, false, false, server->lastKnownInterface.locality);
+    state ServerStatus status(false, false, server->lastKnownInterface.locality);
     state bool lastIsUnhealthy = false;
     state Future<Void> metricsTracker = serverMetricsPolling(server);
 
@@ -4452,7 +4427,6 @@ ACTOR Future<Void> storageServerTracker(
     loop {
         status.isUndesired = !self->disableFailingLaggingServers.get() && server->ssVersionTooFarBehind.get();
         status.isWrongConfiguration = false;
-        status.isWiggling = false;
         hasWrongDC = !isCorrectDC(self, server);
         hasInvalidLocality =
             !self->isValidLocality(self->configuration.storagePolicy, server->lastKnownInterface.locality);
@@ -4532,21 +4506,10 @@ ACTOR Future<Void> storageServerTracker(
             status.isWrongConfiguration = true;
         }
 
-        // An invalid wiggle server should set itself the right status. Otherwise, it cannot be re-included by
-        // wiggler.
-        auto invalidWiggleServer =
-            [](const AddressExclusion& addr, const DDTeamCollection* tc, const TCServerInfo* server) {
-                return server->lastKnownInterface.locality.processId() != tc->wigglingPid;
-            };
         // If the storage server is in the excluded servers list, it is undesired
         NetworkAddress a = server->lastKnownInterface.address();
         AddressExclusion worstAddr(a.ip, a.port);
         DDTeamCollection::Status worstStatus = self->excludedServers.get(worstAddr);
-
-        if (worstStatus == DDTeamCollection::Status::WIGGLING && invalidWiggleServer(worstAddr, self, server)) {
-            self->excludedServers.set(worstAddr, DDTeamCollection::Status::NONE);
-            worstStatus = DDTeamCollection::Status::NONE;
-        }
         otherChanges.push_back(self->excludedServers.onChange(worstAddr));
 
         for (int i = 0; i < 3; i++) {
@@ -4562,12 +4525,6 @@ ACTOR Future<Void> storageServerTracker(
             else if (i == 2)
                 testAddr = AddressExclusion(server->lastKnownInterface.secondaryAddress().get().ip);
             DDTeamCollection::Status testStatus = self->excludedServers.get(testAddr);
-
-            if (testStatus == DDTeamCollection::Status::WIGGLING && invalidWiggleServer(testAddr, self, server)) {
-                self->excludedServers.set(testAddr, DDTeamCollection::Status::NONE);
-                testStatus = DDTeamCollection::Status::NONE;
-            }
-
             if (testStatus > worstStatus) {
                 worstStatus = testStatus;
                 worstAddr = testAddr;
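In the restored tracker, a server's effective exclusion status is the most severe status across up to three address forms (ip:port, bare ip, secondary ip), which works because the status enum is ordered and `>` picks the worse entry. A standalone sketch of that fold; the enum values here are illustrative, the real ordering lives in DDTeamCollection::Status:

```cpp
#include <algorithm>
#include <cstdio>
#include <map>
#include <string>
#include <vector>

// Severity-ordered, so std::max selects the "worst" status.
enum class Status { NONE = 0, WIGGLING = 1, EXCLUDED = 2, FAILED = 3 };

int main() {
    std::map<std::string, Status> excluded = {
        { "10.0.0.1:4500", Status::NONE },
        { "10.0.0.1", Status::EXCLUDED }, // operator excluded the whole IP
    };
    // Candidate keys for one server: exact endpoint, then bare IP.
    std::vector<std::string> candidates = { "10.0.0.1:4500", "10.0.0.1" };

    Status worst = Status::NONE;
    for (const auto& addr : candidates) {
        auto it = excluded.find(addr);
        Status s = (it == excluded.end()) ? Status::NONE : it->second;
        worst = std::max(worst, s); // keep the most severe status seen
    }
    std::printf("worst status: %d\n", static_cast<int>(worst)); // 2 (EXCLUDED)
}
```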
@@ -4586,7 +4543,6 @@ ACTOR Future<Void> storageServerTracker(
             status.isWiggling = true;
             TraceEvent("PerpetualWigglingStorageServer", self->distributorId)
                 .detail("Server", server->id)
-                .detail("ProcessId", server->lastKnownInterface.locality.processId())
                 .detail("Address", worstAddr.toString());
         } else if (worstStatus == DDTeamCollection::Status::FAILED && !isTss) {
             TraceEvent(SevWarn, "FailedServerRemoveKeys", self->distributorId)
@@ -4651,14 +4607,11 @@ ACTOR Future<Void> storageServerTracker(
                     bool localityChanged = server->lastKnownInterface.locality != newInterface.first.locality;
                     bool machineLocalityChanged = server->lastKnownInterface.locality.zoneId().get() !=
                                                   newInterface.first.locality.zoneId().get();
-                    bool processIdChanged = server->lastKnownInterface.locality.processId().get() !=
-                                            newInterface.first.locality.processId().get();
                     TraceEvent("StorageServerInterfaceChanged", self->distributorId)
                         .detail("ServerID", server->id)
                         .detail("NewWaitFailureToken", newInterface.first.waitFailure.getEndpoint().token)
                         .detail("OldWaitFailureToken", server->lastKnownInterface.waitFailure.getEndpoint().token)
                         .detail("LocalityChanged", localityChanged)
-                        .detail("ProcessIdChanged", processIdChanged)
                         .detail("MachineLocalityChanged", machineLocalityChanged);
 
                     server->lastKnownInterface = newInterface.first;
@@ -4703,20 +4656,6 @@ ACTOR Future<Void> storageServerTracker(
                         ASSERT(destMachine.isValid());
                     }
 
-                    // update pid2server_info if the process id has changed
-                    if (processIdChanged) {
-                        self->pid2server_info[newInterface.first.locality.processId().get()].push_back(
-                            self->server_info[server->id]);
-                        // delete the old one
-                        auto& old_infos =
-                            self->pid2server_info[server->lastKnownInterface.locality.processId().get()];
-                        for (int i = 0; i < old_infos.size(); ++i) {
-                            if (old_infos[i].getPtr() == server) {
-                                std::swap(old_infos[i--], old_infos.back());
-                                old_infos.pop_back();
-                            }
-                        }
-                    }
                     // Ensure the server's server team belong to a machine team, and
                     // Get the newBadTeams due to the locality change
                     vector<Reference<TCTeamInfo>> newBadTeams;
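The removed pid2server_info maintenance erases entries with the classic swap-and-pop idiom: O(1) removal from an unordered vector, with the index decrement so the element swapped into slot i is re-examined on the next pass. In isolation:

```cpp
#include <cstdio>
#include <utility>
#include <vector>

// Remove every occurrence of `target` from an unordered vector:
// swap the match with the last element, pop it, and re-check index i.
void swapAndPopErase(std::vector<int>& v, int target) {
    for (int i = 0; i < (int)v.size(); ++i) {
        if (v[i] == target) {
            std::swap(v[i--], v.back()); // i-- so the swapped-in value is revisited
            v.pop_back();
        }
    }
}

int main() {
    std::vector<int> v = { 1, 7, 3, 7, 5 };
    swapAndPopErase(v, 7);
    for (int x : v)
        std::printf("%d ", x); // 1 5 3 (element order is not preserved)
}
```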
@@ -4763,8 +4702,7 @@ ACTOR Future<Void> storageServerTracker(
                     interfaceChanged = server->onInterfaceChanged;
                     // Old failureTracker for the old interface will be actorCancelled since the handler of the old
                     // actor now points to the new failure monitor actor.
-                    status = ServerStatus(
-                        status.isFailed, status.isUndesired, status.isWiggling, server->lastKnownInterface.locality);
+                    status = ServerStatus(status.isFailed, status.isUndesired, server->lastKnownInterface.locality);
 
                     // self->traceTeamCollectionInfo();
                     recordTeamCollectionInfo = true;
@@ -5524,10 +5462,8 @@ ACTOR Future<Void> dataDistributionTeamCollection(Reference<DDTeamCollection> te
     self->addActor.send(trackExcludedServers(self));
     self->addActor.send(monitorHealthyTeams(self));
     self->addActor.send(waitHealthyZoneChange(self));
+    self->addActor.send(monitorPerpetualStorageWiggle(self, ddEnabledState));
 
-    if (self->primary) { // the primary dc also handle the satellite dc's perpetual wiggling
-        self->addActor.send(monitorPerpetualStorageWiggle(self, ddEnabledState));
-    }
     // SOMEDAY: Monitor FF/serverList for (new) servers that aren't in allServers and add or remove them
 
     loop choose {
@@ -6279,30 +6215,6 @@ ACTOR Future<Void> ddSnapCreate(DistributorSnapRequest snapReq,
     return Void();
 }
 
-// Find size of set intersection of excludeServerIDs and serverIDs on each team and see if the leftover team is valid
-bool _exclusionSafetyCheck(vector<UID>& excludeServerIDs, DDTeamCollection* teamCollection) {
-    std::sort(excludeServerIDs.begin(), excludeServerIDs.end());
-    for (const auto& team : teamCollection->teams) {
-        vector<UID> teamServerIDs = team->getServerIDs();
-        std::sort(teamServerIDs.begin(), teamServerIDs.end());
-        TraceEvent(SevDebug, "DDExclusionSafetyCheck", teamCollection->distributorId)
-            .detail("Excluding", describe(excludeServerIDs))
-            .detail("Existing", team->getDesc());
-        // Find size of set intersection of both vectors and see if the leftover team is valid
-        vector<UID> intersectSet(teamServerIDs.size());
-        auto it = std::set_intersection(excludeServerIDs.begin(),
-                                        excludeServerIDs.end(),
-                                        teamServerIDs.begin(),
-                                        teamServerIDs.end(),
-                                        intersectSet.begin());
-        intersectSet.resize(it - intersectSet.begin());
-        if (teamServerIDs.size() - intersectSet.size() < SERVER_KNOBS->DD_EXCLUDE_MIN_REPLICAS) {
-            return false;
-        }
-    }
-    return true;
-}
-
 ACTOR Future<Void> ddExclusionSafetyCheck(DistributorExclusionSafetyCheckRequest req,
                                           Reference<DataDistributorData> self,
                                           Database cx) {
@@ -6332,7 +6244,26 @@ ACTOR Future<Void> ddExclusionSafetyCheck(DistributorExclusionSafetyCheckRequest
             }
         }
     }
-    reply.safe = _exclusionSafetyCheck(excludeServerIDs, self->teamCollection);
+    std::sort(excludeServerIDs.begin(), excludeServerIDs.end());
+    for (const auto& team : self->teamCollection->teams) {
+        vector<UID> teamServerIDs = team->getServerIDs();
+        std::sort(teamServerIDs.begin(), teamServerIDs.end());
+        TraceEvent(SevDebug, "DDExclusionSafetyCheck", self->ddId)
+            .detail("Excluding", describe(excludeServerIDs))
+            .detail("Existing", team->getDesc());
+        // Find size of set intersection of both vectors and see if the leftover team is valid
+        vector<UID> intersectSet(teamServerIDs.size());
+        auto it = std::set_intersection(excludeServerIDs.begin(),
+                                        excludeServerIDs.end(),
+                                        teamServerIDs.begin(),
+                                        teamServerIDs.end(),
+                                        intersectSet.begin());
+        intersectSet.resize(it - intersectSet.begin());
+        if (teamServerIDs.size() - intersectSet.size() < SERVER_KNOBS->DD_EXCLUDE_MIN_REPLICAS) {
+            reply.safe = false;
+            break;
+        }
+    }
     TraceEvent("DDExclusionSafetyCheckFinish", self->ddId);
     req.reply.send(reply);
     return Void();
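Both the deleted _exclusionSafetyCheck helper and the loop re-inlined above compute the same predicate: after removing the excluded servers, every team must keep at least DD_EXCLUDE_MIN_REPLICAS members, measured with std::set_intersection over sorted ID vectors. A self-contained version of that check (int stands in for FDB's UID, and the knob becomes a plain parameter):

```cpp
#include <algorithm>
#include <cstdio>
#include <vector>

using UID = int; // stand-in for FDB's 128-bit UID

// True iff every team keeps at least minReplicas servers once `exclude`
// is removed. Vectors are sorted here because std::set_intersection
// requires sorted input ranges.
bool exclusionSafetyCheck(std::vector<UID> exclude,
                          std::vector<std::vector<UID>> teams,
                          size_t minReplicas) {
    std::sort(exclude.begin(), exclude.end());
    for (auto& team : teams) {
        std::sort(team.begin(), team.end());
        std::vector<UID> overlap(team.size());
        auto it = std::set_intersection(
            exclude.begin(), exclude.end(), team.begin(), team.end(), overlap.begin());
        overlap.resize(it - overlap.begin());
        if (team.size() - overlap.size() < minReplicas)
            return false; // this team would drop below the replica floor
    }
    return true;
}

int main() {
    std::vector<std::vector<UID>> teams = { { 1, 2, 3 }, { 3, 4, 5 } };
    std::printf("%d\n", exclusionSafetyCheck({ 3 }, teams, 2));    // 1: both teams keep 2
    std::printf("%d\n", exclusionSafetyCheck({ 3, 4 }, teams, 2)); // 0: team {3,4,5} keeps 1
}
```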
@@ -6509,7 +6440,7 @@ std::unique_ptr<DDTeamCollection> testTeamCollection(int teamSize,
         interface.locality.set(LiteralStringRef("data_hall"), Standalone<StringRef>(std::to_string(id % 3)));
         collection->server_info[uid] = makeReference<TCServerInfo>(
             interface, collection.get(), ProcessClass(), true, collection->storageServerSet);
-        collection->server_status.set(uid, ServerStatus(false, false, false, interface.locality));
+        collection->server_status.set(uid, ServerStatus(false, false, interface.locality));
         collection->checkAndCreateMachine(collection->server_info[uid]);
     }
 
@@ -6566,7 +6497,7 @@ std::unique_ptr<DDTeamCollection> testMachineTeamCollection(int teamSize,
         collection->server_info[uid] = makeReference<TCServerInfo>(
             interface, collection.get(), ProcessClass(), true, collection->storageServerSet);
 
-        collection->server_status.set(uid, ServerStatus(false, false, false, interface.locality));
+        collection->server_status.set(uid, ServerStatus(false, false, interface.locality));
     }
 
     int totalServerIndex = collection->constructMachinesFromServers();
@@ -993,7 +993,7 @@ ACTOR Future<Void> dataDistributionRelocator(DDQueueData* self, RelocateData rd,
             allHealthy = true;
             anyWithSource = false;
             bestTeams.clear();
-            // Get team from teamCollections in different DCs and find the best one
+            // Get team from teamCollections in diffrent DCs and find the best one
             while (tciIndex < self->teamCollections.size()) {
                 double inflightPenalty = SERVER_KNOBS->INFLIGHT_PENALTY_HEALTHY;
                 if (rd.healthPriority == SERVER_KNOBS->PRIORITY_TEAM_UNHEALTHY ||
@@ -133,7 +133,7 @@ void ServerKnobs::initialize(bool randomize, ClientKnobs* clientKnobs, bool isSi
     init( PRIORITY_RECOVER_MOVE,                   110 );
     init( PRIORITY_REBALANCE_UNDERUTILIZED_TEAM,   120 );
     init( PRIORITY_REBALANCE_OVERUTILIZED_TEAM,    121 );
-    init( PRIORITY_PERPETUAL_STORAGE_WIGGLE,       139 );
+    init( PRIORITY_PERPETUAL_STORAGE_WIGGLE,       140 );
     init( PRIORITY_TEAM_HEALTHY,                   140 );
     init( PRIORITY_TEAM_CONTAINS_UNDESIRED_SERVER, 150 );
     init( PRIORITY_TEAM_REDUNDANT,                 200 );
@@ -890,7 +890,6 @@ ACTOR Future<Void> checkConsistency(Database cx,
     StringRef performTSSCheck = LiteralStringRef("false");
     if (doQuiescentCheck) {
         performQuiescent = LiteralStringRef("true");
-        spec.restorePerpetualWiggleSetting = false;
     }
     if (doCacheCheck) {
         performCacheCheck = LiteralStringRef("true");
@@ -1386,8 +1385,6 @@ ACTOR Future<Void> runTests(Reference<AsyncVar<Optional<struct ClusterController
     state bool useDB = false;
     state bool waitForQuiescenceBegin = false;
     state bool waitForQuiescenceEnd = false;
-    state bool restorePerpetualWiggleSetting = false;
-    state bool perpetualWiggleEnabled = false;
     state double startDelay = 0.0;
     state double databasePingDelay = 1e9;
     state ISimulator::BackupAgentType simBackupAgents = ISimulator::BackupAgentType::NoBackupAgents;
@@ -1402,8 +1399,6 @@ ACTOR Future<Void> runTests(Reference<AsyncVar<Optional<struct ClusterController
         waitForQuiescenceBegin = true;
     if (iter->waitForQuiescenceEnd)
         waitForQuiescenceEnd = true;
-    if (iter->restorePerpetualWiggleSetting)
-        restorePerpetualWiggleSetting = true;
     startDelay = std::max(startDelay, iter->startDelay);
     databasePingDelay = std::min(databasePingDelay, iter->databasePingDelay);
     if (iter->simBackupAgents != ISimulator::BackupAgentType::NoBackupAgents)
@@ -1442,15 +1437,6 @@ ACTOR Future<Void> runTests(Reference<AsyncVar<Optional<struct ClusterController
         } catch (Error& e) {
             TraceEvent(SevError, "TestFailure").error(e).detail("Reason", "Unable to set starting configuration");
         }
-        if (restorePerpetualWiggleSetting) {
-            std::string_view confView(reinterpret_cast<const char*>(startingConfiguration.begin()),
-                                      startingConfiguration.size());
-            const std::string setting = "perpetual_storage_wiggle:=";
-            auto pos = confView.find(setting);
-            if (pos != confView.npos && confView.at(pos + setting.size()) == '1') {
-                perpetualWiggleEnabled = true;
-            }
-        }
     }
 
     if (useDB && waitForQuiescenceBegin) {
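The deleted harness code decides whether to re-enable the wiggle after QuietDatabase by scanning the starting configuration string for perpetual_storage_wiggle:=1. The scan, lifted into a standalone function (with an explicit bounds check where the original relied on .at() throwing):

```cpp
#include <cstdio>
#include <string>
#include <string_view>

// Returns true when `conf` contains "perpetual_storage_wiggle:=1".
// Mirrors the removed harness logic; only the single character after
// ":=" is inspected, since the knob is 0 or 1.
bool wiggleEnabledIn(std::string_view conf) {
    constexpr std::string_view setting = "perpetual_storage_wiggle:=";
    auto pos = conf.find(setting);
    return pos != conf.npos && pos + setting.size() < conf.size() &&
           conf[pos + setting.size()] == '1';
}

int main() {
    std::string conf = "new single memory perpetual_storage_wiggle:=1";
    std::printf("%d\n", wiggleEnabledIn(conf));         // 1
    std::printf("%d\n", wiggleEnabledIn("triple ssd")); // 0
}
```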
@@ -1466,10 +1452,6 @@ ACTOR Future<Void> runTests(Reference<AsyncVar<Optional<struct ClusterController
             TraceEvent("QuietDatabaseStartExternalError").error(e);
             throw;
         }
-
-        if (perpetualWiggleEnabled) { // restore the enabled perpetual storage wiggle setting
-            wait(setPerpetualStorageWiggle(cx, true, true));
-        }
     }
 
     TraceEvent("TestsExpectedToPass").detail("Count", tests.size());
@@ -1777,7 +1777,6 @@ struct ConsistencyCheckWorkload : TestWorkload {
                 if (!found) {
                     TraceEvent("ConsistencyCheck_NoStorage")
                         .detail("Address", addr)
-                        .detail("ProcessId", workers[i].interf.locality.processId())
                         .detail("ProcessClassEqualToStorageClass",
                                 (int)(workers[i].processClass == ProcessClass::StorageClass));
                     missingStorage.push_back(workers[i].interf.locality.dcId());
@@ -159,7 +159,6 @@ public:
         simConnectionFailuresDisableDuration = 0;
         simBackupAgents = ISimulator::BackupAgentType::NoBackupAgents;
         simDrAgents = ISimulator::BackupAgentType::NoBackupAgents;
-        restorePerpetualWiggleSetting = true;
     }
     TestSpec(StringRef title,
              bool dump,
@@ -170,8 +169,8 @@ public:
       : title(title), dumpAfterTest(dump), clearAfterTest(clear), startDelay(startDelay), useDB(useDB), timeout(600),
         databasePingDelay(databasePingDelay), runConsistencyCheck(g_network->isSimulated()),
         runConsistencyCheckOnCache(false), runConsistencyCheckOnTSS(false), waitForQuiescenceBegin(true),
-        waitForQuiescenceEnd(true), restorePerpetualWiggleSetting(true), simCheckRelocationDuration(false),
-        simConnectionFailuresDisableDuration(0), simBackupAgents(ISimulator::BackupAgentType::NoBackupAgents),
+        waitForQuiescenceEnd(true), simCheckRelocationDuration(false), simConnectionFailuresDisableDuration(0),
+        simBackupAgents(ISimulator::BackupAgentType::NoBackupAgents),
         simDrAgents(ISimulator::BackupAgentType::NoBackupAgents) {
         phases = TestWorkload::SETUP | TestWorkload::EXECUTION | TestWorkload::CHECK | TestWorkload::METRICS;
         if (databasePingDelay < 0)
@@ -192,11 +191,6 @@ public:
     bool runConsistencyCheckOnTSS;
     bool waitForQuiescenceBegin;
    bool waitForQuiescenceEnd;
-    bool restorePerpetualWiggleSetting; // whether set perpetual_storage_wiggle as the value after run
-                                        // QuietDatabase. QuietDatabase always disables perpetual storage wiggle on
-                                        // purpose. If waitForQuiescenceBegin == true and we want to keep perpetual
-                                        // storage wiggle the same setting as before during testing, this value should
-                                        // be set true.
 
     bool simCheckRelocationDuration; // If set to true, then long duration relocations generate SevWarnAlways messages.
                                      // Once any workload sets this to true, it will be true for the duration of the