From 3787ddae894d1962374063a440114237fe8d3abd Mon Sep 17 00:00:00 2001 From: "Bharadwaj V.R" Date: Fri, 15 Apr 2022 09:04:52 -0700 Subject: [PATCH] Add trace-based status reporting for server version --- fdbserver/ClusterRecovery.actor.cpp | 7 ++++++- fdbserver/ClusterRecovery.actor.h | 3 +++ fdbserver/Status.actor.cpp | 26 ++++++++++++++++++++++++-- 3 files changed, 33 insertions(+), 3 deletions(-) diff --git a/fdbserver/ClusterRecovery.actor.cpp b/fdbserver/ClusterRecovery.actor.cpp index 62696aa9cd..402b82896d 100644 --- a/fdbserver/ClusterRecovery.actor.cpp +++ b/fdbserver/ClusterRecovery.actor.cpp @@ -1409,7 +1409,7 @@ ACTOR Future clusterRecoveryCore(Reference self) { wait(self->cstate.read()); if (self->cstate.prevDBState.lowestCompatibleServerVersion > currentProtocolVersion) { - TraceEvent(SevWarnAlways, "IncompatbleServerVersion", self->dbgid).log(); + TraceEvent(SevWarnAlways, "IncompatibleServerVersion", self->dbgid).log(); throw internal_error(); } @@ -1478,6 +1478,11 @@ ACTOR Future clusterRecoveryCore(Reference self) { } wait(self->cstate.write(newState) || recoverAndEndEpoch); + TraceEvent("SWVersionCompatibilityChecked", self->dbgid) + .detail("NewestServerVersion", self->cstate.myDBState.newestServerVersion) + .detail("LowestCompatibleVersion", self->cstate.myDBState.lowestCompatibleServerVersion) + .trackLatest(self->swVersionCheckedEventHolder->trackingKey); + self->recoveryState = RecoveryState::RECRUITING; state std::vector seedServers; diff --git a/fdbserver/ClusterRecovery.actor.h b/fdbserver/ClusterRecovery.actor.h index d0deef911f..810bd35f7a 100644 --- a/fdbserver/ClusterRecovery.actor.h +++ b/fdbserver/ClusterRecovery.actor.h @@ -22,6 +22,7 @@ // When actually compiled (NO_INTELLISENSE), include the generated version of this file. In intellisense use the source // version. 
+#include "flow/Trace.h"
 #include
 
 #if defined(NO_INTELLISENSE) && !defined(FDBSERVER_CLUSTERRECOVERY_ACTOR_G_H)
@@ -244,6 +245,7 @@ struct ClusterRecoveryData : NonCopyable, ReferenceCounted
 
 	Future logger;
 
+	Reference swVersionCheckedEventHolder;
 	Reference recoveredConfigEventHolder;
 	Reference clusterRecoveryStateEventHolder;
 	Reference clusterRecoveryGenerationsEventHolder;
@@ -273,6 +275,7 @@ struct ClusterRecoveryData : NonCopyable, ReferenceCounted
 	    backupWorkerDoneRequests("BackupWorkerDoneRequests", cc),
 	    getLiveCommittedVersionRequests("GetLiveCommittedVersionRequests", cc),
 	    reportLiveCommittedVersionRequests("ReportLiveCommittedVersionRequests", cc),
+	    swVersionCheckedEventHolder(makeReference("SWVersionCompatibilityChecked")),
 	    recoveredConfigEventHolder(makeReference("RecoveredConfig")) {
 		clusterRecoveryStateEventHolder = makeReference(
 		    getRecoveryEventName(ClusterRecoveryEventType::CLUSTER_RECOVERY_STATE_EVENT_NAME));
diff --git a/fdbserver/Status.actor.cpp b/fdbserver/Status.actor.cpp
index ee26a6970d..5fa1600ee7 100644
--- a/fdbserver/Status.actor.cpp
+++ b/fdbserver/Status.actor.cpp
@@ -1543,6 +1543,25 @@ ACTOR Future getNewestProtocolVersion(Database cx) {
 	}
 }
 
+// Fetches the latest "SWVersionCompatibilityChecked" trace event from ccWorker and returns its "NewestServerVersion"
+// field as a ProtocolVersion. Any error except actor cancellation yields a default-constructed ProtocolVersion.
+ACTOR Future<ProtocolVersion> getNewestProtocolVersion(Database cx, WorkerDetails ccWorker) {
+	try {
+		state Future<TraceEventFields> swVersionF = timeoutError(
+		    ccWorker.interf.eventLogRequest.getReply(EventLogRequest("SWVersionCompatibilityChecked"_sr)), 1.0);
+		wait(success(swVersionF));
+		const TraceEventFields& swVersionTrace = swVersionF.get();
+		// atoi() yields an int and reads decimal only, which cannot hold a 64-bit protocol version, so use strtoull.
+		// NOTE(review): base 16 assumes the trace detail is a hex rendering (with or without "0x") - confirm.
+		return ProtocolVersion(strtoull(swVersionTrace.getValue("NewestServerVersion").c_str(), nullptr, 16));
+	} catch (Error& e) {
+		// Cancellation must propagate; anything else is reported as an unknown (default) version.
+		if (e.code() == error_code_actor_cancelled)
+			throw;
+		return ProtocolVersion();
+	}
+}
+
 struct LoadConfigurationResult {
 	bool fullReplication;
 	Optional healthyZone;
@@ -2896,6 +2915,8 @@ ACTOR Future clusterGetStatus(
 			messages.push_back(message);
 		}
 
+		state ProtocolVersion newestProtocolVersion =
wait(getNewestProtocolVersion(cx, ccWorker)); + // construct status information for cluster subsections state int statusCode = (int)RecoveryStatus::END; state JsonBuilderObject recoveryStateStatus = wait( @@ -2933,10 +2954,11 @@ ACTOR Future clusterGetStatus( statusObj["protocol_version"] = format("%" PRIx64, g_network->protocolVersion().version()); statusObj["connection_string"] = coordinators.ccr->getConnectionString().toString(); statusObj["bounce_impact"] = getBounceImpactInfo(statusCode); - - ProtocolVersion newestProtocolVersion = wait(getNewestProtocolVersion(cx)); statusObj["latest_server_version"] = format("%" PRIx64, newestProtocolVersion.version()); + // ProtocolVersion newestProtocolVersion = wait(getNewestProtocolVersion(cx)); + // statusObj["latest_server_version"] = format("%" PRIx64, newestProtocolVersion.version()); + state Optional configuration; state Optional loadResult;