diff --git a/documentation/sphinx/source/release-notes/release-notes-620.rst b/documentation/sphinx/source/release-notes/release-notes-620.rst index 9c8e90e0d7..ea6b708af9 100644 --- a/documentation/sphinx/source/release-notes/release-notes-620.rst +++ b/documentation/sphinx/source/release-notes/release-notes-620.rst @@ -6,7 +6,7 @@ Release Notes 6.2.28 ====== -* Log detailed team collection information both periodically and on trigger. `(PR #3912) `_ +* Log detailed team collection information on trigger. `(PR #3912) `_ 6.2.27 diff --git a/fdbserver/DataDistribution.actor.cpp b/fdbserver/DataDistribution.actor.cpp index 4aaec75acb..a7a9192b71 100644 --- a/fdbserver/DataDistribution.actor.cpp +++ b/fdbserver/DataDistribution.actor.cpp @@ -631,8 +631,6 @@ struct DDTeamCollection : ReferenceCounted { int highestUtilizationTeam; AsyncTrigger printDetailedTeamsInfo; - double lastTeamCollectionInfoSnapshotTime; - double lastTeamCollectionInfoLoggingTriggeredTime; void resetLocalitySet() { storageServerSet = Reference(new LocalityMap()); @@ -773,13 +771,7 @@ struct DDTeamCollection : ReferenceCounted { TraceEvent(SevWarn, "DDTeamMedianAvailableSpaceTooSmall") .detail("MedianAvailableSpaceRatio", self->medianAvailableSpace) .detail("TargetAvailableSpaceRatio", SERVER_KNOBS->TARGET_AVAILABLE_SPACE_RATIO); - if (now() - self->lastTeamCollectionInfoLoggingTriggeredTime > - // It's meaningless to trigger the logging again if the snapshot is stale - SERVER_KNOBS->DD_TEAMS_INFO_SNAPSHOT_REFRESH_INTERVAL) { - // Trigger teams info print - self->printDetailedTeamsInfo.trigger(); - self->lastTeamCollectionInfoLoggingTriggeredTime = now(); - } + self->printDetailedTeamsInfo.trigger(); } } @@ -2586,8 +2578,8 @@ ACTOR Future waitUntilHealthy(DDTeamCollection* self, double extraDelay = } // Take a snapshot of necessary data structures from `DDTeamCollection` and print them out with yields to avoid slow -// task on the run loop. This function is very expensive to call due to the data it needs to copy. Use with caution. -ACTOR Future printSnapshotTeamsInfo(Reference self, std::string tcIdString) { +// task on the run loop. +ACTOR Future printSnapshotTeamsInfo(Reference self) { state DatabaseConfiguration configuration; state std::map> server_info; state std::map server_status; @@ -2599,90 +2591,99 @@ ACTOR Future printSnapshotTeamsInfo(Reference self, std: // state std::vector> machineLocalityMapRecordArray; state int traceEventsPrinted = 0; state std::vector serverIDs; + state double lastPrintTime; loop { wait(self->printDetailedTeamsInfo.onTrigger()); + if (now() - lastPrintTime < SERVER_KNOBS->DD_TEAMS_INFO_PRINT_INTERVAL) { + continue; + } + lastPrintTime = now(); traceEventsPrinted = 0; - if (now() - self->lastTeamCollectionInfoSnapshotTime >= SERVER_KNOBS->DD_TEAMS_INFO_SNAPSHOT_REFRESH_INTERVAL) { - // Refresh state snapshot - self->lastTeamCollectionInfoSnapshotTime = now(); - configuration = self->configuration; - server_info = self->server_info; - teams = self->teams; - machine_info = self->machine_info; - machineTeams = self->machineTeams; - // internedLocalityRecordKeyNameStrings = self->machineLocalityMap._keymap->_lookuparray; - // machineLocalityMapEntryArraySize = self->machineLocalityMap.size(); - // machineLocalityMapRecordArray = self->machineLocalityMap.getRecordArray(); - std::vector _uids = self->machineLocalityMap.getObjects(); - serverIDs = _uids; + double snapshotStart = now(); - auto const& keys = self->server_status.getKeys(); - for (auto const& key : keys) { - server_status.emplace(key, self->server_status.get(key)); - } + configuration = self->configuration; + server_info = self->server_info; + teams = self->teams; + machine_info = self->machine_info; + machineTeams = self->machineTeams; + // internedLocalityRecordKeyNameStrings = self->machineLocalityMap._keymap->_lookuparray; + // machineLocalityMapEntryArraySize = self->machineLocalityMap.size(); + // machineLocalityMapRecordArray = self->machineLocalityMap.getRecordArray(); + std::vector _uids = self->machineLocalityMap.getObjects(); + serverIDs = _uids; + + auto const& keys = self->server_status.getKeys(); + for (auto const& key : keys) { + server_status.emplace(key, self->server_status.get(key)); } + TraceEvent("DDPrintSnapshotTeasmInfo", self->distributorId) + .detail("SnapshotSpeed", now() - snapshotStart) + .detail("Primary", self->primary); + // Print to TraceEvents - TraceEvent("DDConfig") + TraceEvent("DDConfig", self->distributorId) .detail("StorageTeamSize", configuration.storageTeamSize) .detail("DesiredTeamsPerServer", SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER) .detail("MaxTeamsPerServer", SERVER_KNOBS->MAX_TEAMS_PER_SERVER) - .detail("TeamCollectionID", tcIdString); + .detail("Primary", self->primary); - TraceEvent("ServerInfo").detail("Size", server_info.size()).detail("TeamCollectionID", tcIdString); - state int i = 0; + TraceEvent("ServerInfo", self->distributorId) + .detail("Size", server_info.size()) + .detail("Primary", self->primary); + state int i; state std::map>::iterator server = server_info.begin(); - for (; i < server_info.size(); i++) { - TraceEvent("ServerInfo") + for (i = 0; i < server_info.size(); i++) { + TraceEvent("ServerInfo", self->distributorId) .detail("ServerInfoIndex", i) .detail("ServerID", server->first.toString()) .detail("ServerTeamOwned", server->second->teams.size()) .detail("MachineID", server->second->machine->machineID.contents().toString()) - .detail("TeamCollectionID", tcIdString); + .detail("Primary", self->primary); server++; if (++traceEventsPrinted % SERVER_KNOBS->DD_TEAMS_INFO_PRINT_YIELD_COUNT == 0) { wait(yield()); } } - i = 0; server = server_info.begin(); - for (; i < server_info.size(); i++) { + for (i = 0; i < server_info.size(); i++) { const UID& uid = server->first; - TraceEvent("ServerStatus", uid) + TraceEvent("ServerStatus", self->distributorId) + .detail("ServerUID", uid) .detail("Healthy", !server_status.at(uid).isUnhealthy()) .detail("MachineIsValid", server_info[uid]->machine.isValid()) .detail("MachineTeamSize", server_info[uid]->machine.isValid() ? server_info[uid]->machine->machineTeams.size() : -1) - .detail("TeamCollectionID", tcIdString); + .detail("Primary", self->primary); server++; if (++traceEventsPrinted % SERVER_KNOBS->DD_TEAMS_INFO_PRINT_YIELD_COUNT == 0) { wait(yield()); } } - i = 0; - TraceEvent("ServerTeamInfo").detail("Size", teams.size()).detail("TeamCollectionID", tcIdString); - for (; i < teams.size(); i++) { + TraceEvent("ServerTeamInfo", self->distributorId).detail("Size", teams.size()).detail("Primary", self->primary); + for (i = 0; i < teams.size(); i++) { const auto& team = teams[i]; - TraceEvent("ServerTeamInfo") + TraceEvent("ServerTeamInfo", self->distributorId) .detail("TeamIndex", i) .detail("Healthy", team->isHealthy()) .detail("TeamSize", team->size()) .detail("MemberIDs", team->getServerIDsStr()) - .detail("TeamCollectionID", tcIdString); + .detail("Primary", self->primary); if (++traceEventsPrinted % SERVER_KNOBS->DD_TEAMS_INFO_PRINT_YIELD_COUNT == 0) { wait(yield()); } } - i = 0; - TraceEvent("MachineInfo").detail("Size", machine_info.size()).detail("TeamCollectionID", tcIdString); + TraceEvent("MachineInfo", self->distributorId) + .detail("Size", machine_info.size()) + .detail("Primary", self->primary); state std::map, Reference>::iterator machine = machine_info.begin(); state bool isMachineHealthy = false; - for (; i < machine_info.size(); i++) { + for (i = 0; i < machine_info.size(); i++) { Reference _machine = machine->second; if (!_machine.isValid() || machine_info.find(_machine->machineID) == machine_info.end() || _machine->serversOnMachine.empty()) { @@ -2697,68 +2698,67 @@ ACTOR Future printSnapshotTeamsInfo(Reference self, std: } isMachineHealthy = false; - TraceEvent("MachineInfo") + TraceEvent("MachineInfo", self->distributorId) .detail("MachineInfoIndex", i) .detail("Healthy", isMachineHealthy) .detail("MachineID", machine->first.contents().toString()) .detail("MachineTeamOwned", machine->second->machineTeams.size()) .detail("ServerNumOnMachine", machine->second->serversOnMachine.size()) .detail("ServersID", machine->second->getServersIDStr()) - .detail("TeamCollectionID", tcIdString); + .detail("Primary", self->primary); machine++; if (++traceEventsPrinted % SERVER_KNOBS->DD_TEAMS_INFO_PRINT_YIELD_COUNT == 0) { wait(yield()); } } - i = 0; - TraceEvent("MachineTeamInfo").detail("Size", machineTeams.size()).detail("TeamCollectionID", tcIdString); - for (; i < machineTeams.size(); i++) { + TraceEvent("MachineTeamInfo", self->distributorId) + .detail("Size", machineTeams.size()) + .detail("Primary", self->primary); + for (i = 0; i < machineTeams.size(); i++) { const auto& team = machineTeams[i]; - TraceEvent("MachineTeamInfo") + TraceEvent("MachineTeamInfo", self->distributorId) .detail("TeamIndex", i) .detail("MachineIDs", team->getMachineIDsStr()) .detail("ServerTeams", team->serverTeams.size()) - .detail("TeamCollectionID", tcIdString); + .detail("Primary", self->primary); if (++traceEventsPrinted % SERVER_KNOBS->DD_TEAMS_INFO_PRINT_YIELD_COUNT == 0) { wait(yield()); } } // TODO: re-enable the following logging or remove them. - // i = 0; - // TraceEvent("LocalityRecordKeyName") + // TraceEvent("LocalityRecordKeyName", self->distributorId) // .detail("Size", internedLocalityRecordKeyNameStrings.size()) - // .detail("TeamCollectionID", tcIdString); - // for (; i < internedLocalityRecordKeyNameStrings.size(); i++) { - // TraceEvent("LocalityRecordKeyIndexName") + // .detail("Primary", self->primary); + // for (i = 0; i < internedLocalityRecordKeyNameStrings.size(); i++) { + // TraceEvent("LocalityRecordKeyIndexName", self->distributorId) // .detail("KeyIndex", i) // .detail("KeyName", internedLocalityRecordKeyNameStrings[i]) - // .detail("TeamCollectionID", tcIdString); + // .detail("Primary", self->primary); // if (++traceEventsPrinted % SERVER_KNOBS->DD_TEAMS_INFO_PRINT_YIELD_COUNT == 0) { // wait(yield()); // } // } - // i = 0; - // TraceEvent("MachineLocalityMap") + // TraceEvent("MachineLocalityMap", self->distributorId) // .detail("Size", machineLocalityMapEntryArraySize) - // .detail("TeamCollectionID", tcIdString); - // for (; i < serverIDs.size(); i++) { + // .detail("Primary", self->primary); + // for (i = 0; i < serverIDs.size(); i++) { // const auto& serverID = serverIDs[i]; // Reference record = machineLocalityMapRecordArray[i]; // if (record.isValid()) { - // TraceEvent("MachineLocalityMap") + // TraceEvent("MachineLocalityMap", self->distributorId) // .detail("LocalityIndex", i) // .detail("UID", serverID->toString()) // .detail("LocalityRecord", record->toString()) - // .detail("TeamCollectionID", tcIdString); + // .detail("Primary", self->primary); // } else { - // TraceEvent("MachineLocalityMap") + // TraceEvent("MachineLocalityMap", self->distributorId) // .detail("LocalityIndex", i) // .detail("UID", serverID->toString()) // .detail("LocalityRecord", "[NotFound]") - // .detail("TeamCollectionID", tcIdString); + // .detail("Primary", self->primary); // } // if (++traceEventsPrinted % SERVER_KNOBS->DD_TEAMS_INFO_PRINT_YIELD_COUNT == 0) { // wait(yield()); @@ -4527,11 +4527,11 @@ ACTOR Future dataDistribution(Reference self) teamCollectionsPtrs.push_back(remoteTeamCollection.getPtr()); remoteTeamCollection->teamCollections = teamCollectionsPtrs; actors.push_back( reportErrorsExcept( dataDistributionTeamCollection( remoteTeamCollection, initData, tcis[1], self->dbInfo ), "DDTeamCollectionSecondary", self->ddId, &normalDDQueueErrors() ) ); - actors.push_back(printSnapshotTeamsInfo(remoteTeamCollection, "DDTeamCollectionSecondary")); + actors.push_back(printSnapshotTeamsInfo(remoteTeamCollection)); } primaryTeamCollection->teamCollections = teamCollectionsPtrs; actors.push_back( reportErrorsExcept( dataDistributionTeamCollection( primaryTeamCollection, initData, tcis[0], self->dbInfo ), "DDTeamCollectionPrimary", self->ddId, &normalDDQueueErrors() ) ); - actors.push_back(printSnapshotTeamsInfo(primaryTeamCollection, "DDTeamCollectionPrimary")); + actors.push_back(printSnapshotTeamsInfo(primaryTeamCollection)); actors.push_back(yieldPromiseStream(output.getFuture(), input)); wait( waitForAll( actors ) ); diff --git a/fdbserver/Knobs.cpp b/fdbserver/Knobs.cpp index 5208b92f3d..f2bbd8738b 100644 --- a/fdbserver/Knobs.cpp +++ b/fdbserver/Knobs.cpp @@ -219,7 +219,7 @@ ServerKnobs::ServerKnobs(bool randomize, ClientKnobs* clientKnobs, bool isSimula init( DD_VALIDATE_LOCALITY, true ); if( randomize && BUGGIFY ) DD_VALIDATE_LOCALITY = false; init( DD_CHECK_INVALID_LOCALITY_DELAY, 60 ); if( randomize && BUGGIFY ) DD_CHECK_INVALID_LOCALITY_DELAY = 1 + deterministicRandom()->random01() * 600; init( DD_ENABLE_VERBOSE_TRACING, false ); if( randomize && BUGGIFY ) DD_ENABLE_VERBOSE_TRACING = true; - init( DD_TEAMS_INFO_SNAPSHOT_REFRESH_INTERVAL, 300 ); // Refresh snapshot of the detailed teams info every 5 mins + init( DD_TEAMS_INFO_PRINT_INTERVAL, 60 ); if( randomize && BUGGIFY ) DD_TEAMS_INFO_PRINT_INTERVAL = 10; init( DD_TEAMS_INFO_PRINT_YIELD_COUNT, 100 ); if( randomize && BUGGIFY ) DD_TEAMS_INFO_PRINT_YIELD_COUNT = deterministicRandom()->random01() * 1000 + 1; // TeamRemover diff --git a/fdbserver/Knobs.h b/fdbserver/Knobs.h index e9c55a5b8e..78000730be 100644 --- a/fdbserver/Knobs.h +++ b/fdbserver/Knobs.h @@ -182,7 +182,7 @@ public: bool DD_VALIDATE_LOCALITY; int DD_CHECK_INVALID_LOCALITY_DELAY; bool DD_ENABLE_VERBOSE_TRACING; - int DD_TEAMS_INFO_SNAPSHOT_REFRESH_INTERVAL; + int DD_TEAMS_INFO_PRINT_INTERVAL; int DD_TEAMS_INFO_PRINT_YIELD_COUNT; // TeamRemover to remove redundant teams