diff --git a/fdbclient/BackupContainer.actor.cpp b/fdbclient/BackupContainer.actor.cpp index b701a24648..c318a6591d 100644 --- a/fdbclient/BackupContainer.actor.cpp +++ b/fdbclient/BackupContainer.actor.cpp @@ -290,6 +290,7 @@ Reference<IBackupContainer> IBackupContainer::openContainer(const std::string& u u.eat("azure://"_sr); auto address = u.eat("/"_sr); if (address.endsWith(std::string(azure::storage_lite::constants::default_endpoint_suffix))) { + CODE_PROBE(true, "Azure backup url with standard azure storage account endpoint"); // <account>.<service>.core.windows.net/<resource_path> auto endPoint = address.toString(); auto accountName = address.eat("."_sr).toString(); @@ -305,6 +306,7 @@ Reference<IBackupContainer> IBackupContainer::openContainer(const std::string& u auto hostname = Hostname::parse(endpoint); auto resolvedAddress = hostname.resolveBlocking(); if (resolvedAddress.present()) { + CODE_PROBE(true, "Azure backup url with hostname in the endpoint"); parsedAddress = resolvedAddress.get(); } } catch (Error& e) { diff --git a/fdbclient/BackupContainerFileSystem.actor.cpp b/fdbclient/BackupContainerFileSystem.actor.cpp index 281ebb5839..b222153517 100644 --- a/fdbclient/BackupContainerFileSystem.actor.cpp +++ b/fdbclient/BackupContainerFileSystem.actor.cpp @@ -1525,6 +1525,7 @@ Reference<BackupContainerFileSystem> BackupContainerFileSystem::openContainerFS( u.eat("azure://"_sr); auto address = u.eat("/"_sr); if (address.endsWith(std::string(azure::storage_lite::constants::default_endpoint_suffix))) { + CODE_PROBE(true, "Azure backup url with standard azure storage account endpoint"); // <account>.<service>.core.windows.net/<resource_path> auto endPoint = address.toString(); auto accountName = address.eat("."_sr).toString(); @@ -1540,6 +1541,7 @@ Reference<BackupContainerFileSystem> BackupContainerFileSystem::openContainerFS( auto hostname = Hostname::parse(endpoint); auto resolvedAddress = hostname.resolveBlocking(); if (resolvedAddress.present()) { + CODE_PROBE(true, "Azure backup url with hostname in the endpoint"); parsedAddress = resolvedAddress.get(); } } catch (Error& e) { diff --git a/fdbclient/include/fdbclient/BackupContainerAzureBlobStore.h b/fdbclient/include/fdbclient/BackupContainerAzureBlobStore.h index f4b87ee69a..ed79a56078 100644 --- a/fdbclient/include/fdbclient/BackupContainerAzureBlobStore.h +++ b/fdbclient/include/fdbclient/BackupContainerAzureBlobStore.h @@ -25,9 +25,6 @@ #include "fdbclient/AsyncTaskThread.h" #include "fdbclient/BackupContainerFileSystem.h" -#include "constants.h" -#include "storage_credential.h" -#include "storage_account.h" #include "blob/blob_client.h" class BackupContainerAzureBlobStore final : public BackupContainerFileSystem, diff --git a/fdbserver/CommitProxyServer.actor.cpp b/fdbserver/CommitProxyServer.actor.cpp index 2d817bbaf4..cfdaff1c0e 100644 --- a/fdbserver/CommitProxyServer.actor.cpp +++ b/fdbserver/CommitProxyServer.actor.cpp @@ -2174,7 +2174,8 @@ ACTOR Future<Void> proxySnapCreate(ProxySnapRequest snapReq, ProxyCommitData* co TraceEvent("SnapCommitProxy_DDSnapResponseError") .errorUnsuppressed(e) .detail("SnapPayload", snapReq.snapPayload) - .detail("SnapUID", snapReq.snapUID); + .detail("SnapUID", snapReq.snapUID) + .detail("Retry", snapReqRetry); // Retry if we have network issues if (e.code() != error_code_request_maybe_delivered || ++snapReqRetry > SERVER_KNOBS->SNAP_NETWORK_FAILURE_RETRY_LIMIT) diff --git a/fdbserver/DataDistribution.actor.cpp b/fdbserver/DataDistribution.actor.cpp index e7904f8e52..a590423fbc 100644 --- a/fdbserver/DataDistribution.actor.cpp +++ b/fdbserver/DataDistribution.actor.cpp @@ -805,7 +805,8 @@ ACTOR Future<ErrorOr<Void>> trySendSnapReq(RequestStream<WorkerSnapRequest> stre if (reply.isError()) { TraceEvent("SnapDataDistributor_ReqError") .errorUnsuppressed(reply.getError()) - .detail("Peer", stream.getEndpoint().getPrimaryAddress()); + .detail("Peer", stream.getEndpoint().getPrimaryAddress()) + .detail("Retry", snapReqRetry); if (reply.getError().code() != error_code_request_maybe_delivered || ++snapReqRetry > SERVER_KNOBS->SNAP_NETWORK_FAILURE_RETRY_LIMIT) return ErrorOr<Void>(reply.getError()); @@ -911,6 +912,7 @@ ACTOR Future<std::map<NetworkAddress, std::pair<WorkerInterface, std::string>>> // get coordinators Optional<Value> coordinators = wait(tr.get(coordinatorsKey)); if (!coordinators.present()) { + CODE_PROBE(true, "Failed to read the coordinatorsKey"); throw operation_failed(); } ClusterConnectionString ccs(coordinators.get().toString()); @@ -1001,7 +1003,8 @@ ACTOR Future<Void> ddSnapCreateCore(DistributorSnapRequest snapReq, Reference<As TraceEvent("SnapDataDistributor_GotStatefulWorkers") .detail("SnapPayload", snapReq.snapPayload) - .detail("SnapUID", snapReq.snapUID); + .detail("SnapUID", snapReq.snapUID) + .detail("StorageFaultTolerance", storageFaultTolerance); // we need to snapshot storage nodes before snapshot any tlogs std::vector<Future<ErrorOr<Void>>> storageSnapReqs; @@ -1013,7 +1016,6 @@ ACTOR Future<Void> ddSnapCreateCore(DistributorSnapRequest snapReq, Reference<As } wait(waitForMost(storageSnapReqs, storageFaultTolerance, snap_storage_failed())); TraceEvent("SnapDataDistributor_AfterSnapStorage") - .detail("FaultTolerance", storageFaultTolerance) .detail("SnapPayload", snapReq.snapPayload) .detail("SnapUID", snapReq.snapUID); @@ -1336,14 +1338,14 @@ ACTOR Future<Void> dataDistributor(DataDistributorInterface di, Reference<AsyncV when(DistributorSnapRequest snapReq = waitNext(di.distributorSnapReq.getFuture())) { auto& snapUID = snapReq.snapUID; if (ddSnapReqResultMap.count(snapUID)) { - CODE_PROBE(true, "Data distributor received a duplicate finished snap request"); + CODE_PROBE(true, "Data distributor received a duplicate finished snapshot request"); auto result = ddSnapReqResultMap[snapUID]; result.isError() ? snapReq.reply.sendError(result.getError()) : snapReq.reply.send(result.get()); TraceEvent("RetryFinishedDistributorSnapRequest") .detail("SnapUID", snapUID) .detail("Result", result.isError() ? result.getError().code() : 0); } else if (ddSnapReqMap.count(snapReq.snapUID)) { - CODE_PROBE(true, "Data distributor received a duplicate ongoing snap request"); + CODE_PROBE(true, "Data distributor received a duplicate ongoing snapshot request"); TraceEvent("RetryOngoingDistributorSnapRequest").detail("SnapUID", snapUID); ASSERT(snapReq.snapPayload == ddSnapReqMap[snapUID].snapPayload); ddSnapReqMap[snapUID] = snapReq; diff --git a/fdbserver/include/fdbserver/CoordinationInterface.h b/fdbserver/include/fdbserver/CoordinationInterface.h index 35175eceab..3c6c904d4c 100644 --- a/fdbserver/include/fdbserver/CoordinationInterface.h +++ b/fdbserver/include/fdbserver/CoordinationInterface.h @@ -236,7 +236,7 @@ Future<Void> coordinationServer(std::string const& dataFolder, Reference<ConfigNode> const&, ConfigBroadcastInterface const&); -// Read a value of MovableValue and if old cluster key presents in it, update to the new key +// Read a value of MovableValue and if the old cluster key is nested in it, update it to the new key Optional<Value> updateCCSInMovableValue(ValueRef movableVal, KeyRef oldClusterKey, KeyRef newClusterKey); Future<Void> coordChangeClusterKey(std::string dataFolder, KeyRef newClusterKey, KeyRef oldClusterKey); diff --git a/fdbserver/worker.actor.cpp b/fdbserver/worker.actor.cpp index 1694c8dbec..0026788167 100644 --- a/fdbserver/worker.actor.cpp +++ b/fdbserver/worker.actor.cpp @@ -1236,13 +1236,9 @@ struct TrackRunningStorage { KeyValueStoreType storeType, std::set<std::pair<UID, KeyValueStoreType>>* runningStorages) : self(self), storeType(storeType), runningStorages(runningStorages) { - TraceEvent(SevDebug, "TrackingRunningStorageConstruction").detail("StorageID", self); runningStorages->emplace(self, storeType); } - ~TrackRunningStorage() { - runningStorages->erase(std::make_pair(self, storeType)); - TraceEvent(SevDebug, "TrackingRunningStorageDesctruction").detail("StorageID", self); - }; + ~TrackRunningStorage() { runningStorages->erase(std::make_pair(self, storeType)); }; }; ACTOR Future<Void> storageServerRollbackRebooter(std::set<std::pair<UID, KeyValueStoreType>>* runningStorages, @@ -2544,28 +2540,30 @@ ACTOR Future<Void> workerServer(Reference<IClusterConnectionRecord> connRecord, loggingTrigger = delay(loggingDelay, TaskPriority::FlushTrace); } when(state WorkerSnapRequest snapReq = waitNext(interf.workerSnapReq.getFuture())) { - std::string snapUID = snapReq.snapUID.toString() + snapReq.role.toString(); - if (snapReqResultMap.count(snapUID)) { - CODE_PROBE(true, "Worker received a duplicate finished snap request"); - auto result = snapReqResultMap[snapUID]; + std::string snapReqKey = snapReq.snapUID.toString() + snapReq.role.toString(); + if (snapReqResultMap.count(snapReqKey)) { + CODE_PROBE(true, "Worker received a duplicate finished snapshot request"); + auto result = snapReqResultMap[snapReqKey]; result.isError() ? snapReq.reply.sendError(result.getError()) : snapReq.reply.send(result.get()); TraceEvent("RetryFinishedWorkerSnapRequest") - .detail("SnapUID", snapUID) + .detail("SnapUID", snapReq.snapUID.toString()) .detail("Role", snapReq.role) - .detail("Result", result.isError() ? result.getError().code() : 0); - } else if (snapReqMap.count(snapUID)) { - CODE_PROBE(true, "Worker received a duplicate ongoing snap request"); - TraceEvent("RetryOngoingWorkerSnapRequest").detail("SnapUID", snapUID).detail("Role", snapReq.role); - ASSERT(snapReq.role == snapReqMap[snapUID].role); - ASSERT(snapReq.snapPayload == snapReqMap[snapUID].snapPayload); - snapReqMap[snapUID] = snapReq; + .detail("Result", result.isError() ? result.getError().code() : success().code()); + } else if (snapReqMap.count(snapReqKey)) { + CODE_PROBE(true, "Worker received a duplicate ongoing snapshot request"); + TraceEvent("RetryOngoingWorkerSnapRequest") + .detail("SnapUID", snapReq.snapUID.toString()) + .detail("Role", snapReq.role); + ASSERT(snapReq.role == snapReqMap[snapReqKey].role); + ASSERT(snapReq.snapPayload == snapReqMap[snapReqKey].snapPayload); + snapReqMap[snapReqKey] = snapReq; } else { - snapReqMap[snapUID] = snapReq; // set map point to the request + snapReqMap[snapReqKey] = snapReq; // set map point to the request if (g_network->isSimulated() && (now() - lastSnapTime) < SERVER_KNOBS->SNAP_MINIMUM_TIME_GAP) { // only allow duplicate snapshots on same process in a short time for different roles auto okay = (lastSnapReq.snapUID == snapReq.snapUID) && lastSnapReq.role != snapReq.role; TraceEvent(okay ? SevInfo : SevError, "RapidSnapRequestsOnSameProcess") - .detail("CurrSnapUID", snapUID) + .detail("CurrSnapUID", snapReqKey) .detail("PrevSnapUID", lastSnapReq.snapUID) .detail("CurrRole", snapReq.role) .detail("PrevRole", lastSnapReq.role) @@ -2577,8 +2575,8 @@ ACTOR Future<Void> workerServer(Reference<IClusterConnectionRecord> connRecord, &snapReqResultMap)); auto* snapReqResultMapPtr = &snapReqResultMap; errorForwarders.add(fmap( - [snapReqResultMapPtr, snapUID](Void _) { - snapReqResultMapPtr->erase(snapUID); + [snapReqResultMapPtr, snapReqKey](Void _) { + snapReqResultMapPtr->erase(snapReqKey); return Void(); }, delay(SERVER_KNOBS->SNAP_MINIMUM_TIME_GAP)));