Add traces for snapshot related updates (#7862)

* Add logging; fix typos in comments;

* format files
Chaoguang Lin 2022-08-13 03:10:20 -04:00 committed by GitHub
parent f103a10c37
commit a27d27c5ee
7 changed files with 33 additions and 31 deletions

View File

@@ -290,6 +290,7 @@ Reference<IBackupContainer> IBackupContainer::openContainer(const std::string& u
     u.eat("azure://"_sr);
     auto address = u.eat("/"_sr);
     if (address.endsWith(std::string(azure::storage_lite::constants::default_endpoint_suffix))) {
+        CODE_PROBE(true, "Azure backup url with standard azure storage account endpoint");
         // <account>.<service>.core.windows.net/<resource_path>
         auto endPoint = address.toString();
         auto accountName = address.eat("."_sr).toString();
@@ -305,6 +306,7 @@ Reference<IBackupContainer> IBackupContainer::openContainer(const std::string& u
         auto hostname = Hostname::parse(endpoint);
         auto resolvedAddress = hostname.resolveBlocking();
         if (resolvedAddress.present()) {
+            CODE_PROBE(true, "Azure backup url with hostname in the endpoint");
             parsedAddress = resolvedAddress.get();
         }
     } catch (Error& e) {
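The CODE_PROBE lines added throughout this commit mark branches that FoundationDB's simulation tests are expected to reach; each probe records, under a human-readable comment, whether the branch was ever hit during a test run. Below is a minimal sketch of how such a probe macro can be built. It is illustrative only: the SKETCH_CODE_PROBE name and the printf reporting are inventions here, not FDB's implementation, which integrates with the flow simulator and trace logging.

// Minimal sketch of a coverage-probe macro in the spirit of CODE_PROBE.
#include <cstdio>

struct ProbeHit {
    const char* comment;
    const char* file;
    int line;
    bool hit = false;
    void operator()() {
        if (!hit) {
            hit = true; // report each call site at most once
            std::printf("CodeProbeHit: %s (%s:%d)\n", comment, file, line);
        }
    }
};

#define SKETCH_CODE_PROBE(condition, comment)                     \
    do {                                                          \
        if (condition) {                                          \
            static ProbeHit probe{ comment, __FILE__, __LINE__ }; \
            probe();                                              \
        }                                                         \
    } while (0)

Making the probe object static means each call site is counted once per process, which is all the coverage question needs.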

View File

@@ -1525,6 +1525,7 @@ Reference<BackupContainerFileSystem> BackupContainerFileSystem::openContainerFS(
     u.eat("azure://"_sr);
     auto address = u.eat("/"_sr);
     if (address.endsWith(std::string(azure::storage_lite::constants::default_endpoint_suffix))) {
+        CODE_PROBE(true, "Azure backup url with standard azure storage account endpoint");
         // <account>.<service>.core.windows.net/<resource_path>
         auto endPoint = address.toString();
         auto accountName = address.eat("."_sr).toString();
@@ -1540,6 +1541,7 @@ Reference<BackupContainerFileSystem> BackupContainerFileSystem::openContainerFS(
         auto hostname = Hostname::parse(endpoint);
         auto resolvedAddress = hostname.resolveBlocking();
         if (resolvedAddress.present()) {
+            CODE_PROBE(true, "Azure backup url with hostname in the endpoint");
             parsedAddress = resolvedAddress.get();
         }
     } catch (Error& e) {
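Both openers parse the Azure URL by repeatedly consuming delimited pieces from the front of a StringRef. A rough standalone approximation of that eat-style parsing, using std::string_view; the eat helper below only mimics the semantics used in the hunks above, it is not FDB's StringRef::eat:

// Consume and return everything up to (and dropping) the first `sep`;
// if `sep` is absent, consume the whole remainder.
#include <cstdio>
#include <string_view>

std::string_view eat(std::string_view& s, std::string_view sep) {
    size_t pos = s.find(sep);
    std::string_view head = s.substr(0, pos);
    s = (pos == std::string_view::npos) ? std::string_view{} : s.substr(pos + sep.size());
    return head;
}

int main() {
    std::string_view u = "azure://myaccount.blob.core.windows.net/container/path";
    eat(u, "azure://");                           // drop the scheme
    std::string_view address = eat(u, "/");       // "myaccount.blob.core.windows.net"
    std::string_view account = eat(address, "."); // "myaccount"; `u` now holds "container/path"
    std::printf("account=%.*s path=%.*s\n",
                (int)account.size(), account.data(), (int)u.size(), u.data());
    return 0;
}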

View File

@@ -25,9 +25,6 @@
 #include "fdbclient/AsyncTaskThread.h"
 #include "fdbclient/BackupContainerFileSystem.h"
-#include "constants.h"
-#include "storage_credential.h"
-#include "storage_account.h"
 #include "blob/blob_client.h"

 class BackupContainerAzureBlobStore final : public BackupContainerFileSystem,

View File

@@ -2174,7 +2174,8 @@ ACTOR Future<Void> proxySnapCreate(ProxySnapRequest snapReq, ProxyCommitData* co
     TraceEvent("SnapCommitProxy_DDSnapResponseError")
         .errorUnsuppressed(e)
         .detail("SnapPayload", snapReq.snapPayload)
-        .detail("SnapUID", snapReq.snapUID);
+        .detail("SnapUID", snapReq.snapUID)
+        .detail("Retry", snapReqRetry);
     // Retry if we have network issues
     if (e.code() != error_code_request_maybe_delivered ||
         ++snapReqRetry > SERVER_KNOBS->SNAP_NETWORK_FAILURE_RETRY_LIMIT)
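The retry policy the new Retry detail reports on is narrow: a snapshot request is retried only when the failure is request_maybe_delivered, meaning the network cannot say whether the request arrived, and only while the retry count stays within SERVER_KNOBS->SNAP_NETWORK_FAILURE_RETRY_LIMIT. A standalone sketch of that decision, with hypothetical names and an assumed numeric error code:

// Retries are reserved for genuine network ambiguity; any definite error,
// or an exhausted budget, fails the snapshot instead.
bool shouldRetrySnapRequest(int errorCode, int& retryCount, int retryLimit) {
    const int kRequestMaybeDelivered = 1213; // assumed code, for illustration only
    if (errorCode != kRequestMaybeDelivered)
        return false;                  // a definite failure: do not retry
    return ++retryCount <= retryLimit; // retry while budget remains
}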

View File

@@ -805,7 +805,8 @@ ACTOR Future<ErrorOr<Void>> trySendSnapReq(RequestStream<WorkerSnapRequest> stre
     if (reply.isError()) {
         TraceEvent("SnapDataDistributor_ReqError")
             .errorUnsuppressed(reply.getError())
-            .detail("Peer", stream.getEndpoint().getPrimaryAddress());
+            .detail("Peer", stream.getEndpoint().getPrimaryAddress())
+            .detail("Retry", snapReqRetry);
         if (reply.getError().code() != error_code_request_maybe_delivered ||
             ++snapReqRetry > SERVER_KNOBS->SNAP_NETWORK_FAILURE_RETRY_LIMIT)
             return ErrorOr<Void>(reply.getError());
@@ -911,6 +912,7 @@ ACTOR Future<std::map<NetworkAddress, std::pair<WorkerInterface, std::string>>>
     // get coordinators
     Optional<Value> coordinators = wait(tr.get(coordinatorsKey));
     if (!coordinators.present()) {
+        CODE_PROBE(true, "Failed to read the coordinatorsKey");
         throw operation_failed();
     }
     ClusterConnectionString ccs(coordinators.get().toString());
@@ -1001,7 +1003,8 @@ ACTOR Future<Void> ddSnapCreateCore(DistributorSnapRequest snapReq, Reference<As
     TraceEvent("SnapDataDistributor_GotStatefulWorkers")
         .detail("SnapPayload", snapReq.snapPayload)
-        .detail("SnapUID", snapReq.snapUID);
+        .detail("SnapUID", snapReq.snapUID)
+        .detail("StorageFaultTolerance", storageFaultTolerance);

     // we need to snapshot storage nodes before snapshot any tlogs
     std::vector<Future<ErrorOr<Void>>> storageSnapReqs;
@@ -1013,7 +1016,6 @@ ACTOR Future<Void> ddSnapCreateCore(DistributorSnapRequest snapReq, Reference<As
     }

     wait(waitForMost(storageSnapReqs, storageFaultTolerance, snap_storage_failed()));
     TraceEvent("SnapDataDistributor_AfterSnapStorage")
-        .detail("FaultTolerance", storageFaultTolerance)
         .detail("SnapPayload", snapReq.snapPayload)
         .detail("SnapUID", snapReq.snapUID);
@@ -1336,14 +1338,14 @@ ACTOR Future<Void> dataDistributor(DataDistributorInterface di, Reference<AsyncV
     when(DistributorSnapRequest snapReq = waitNext(di.distributorSnapReq.getFuture())) {
         auto& snapUID = snapReq.snapUID;
         if (ddSnapReqResultMap.count(snapUID)) {
-            CODE_PROBE(true, "Data distributor received a duplicate finished snap request");
+            CODE_PROBE(true, "Data distributor received a duplicate finished snapshot request");
             auto result = ddSnapReqResultMap[snapUID];
             result.isError() ? snapReq.reply.sendError(result.getError()) : snapReq.reply.send(result.get());
             TraceEvent("RetryFinishedDistributorSnapRequest")
                 .detail("SnapUID", snapUID)
                 .detail("Result", result.isError() ? result.getError().code() : 0);
         } else if (ddSnapReqMap.count(snapReq.snapUID)) {
-            CODE_PROBE(true, "Data distributor received a duplicate ongoing snap request");
+            CODE_PROBE(true, "Data distributor received a duplicate ongoing snapshot request");
             TraceEvent("RetryOngoingDistributorSnapRequest").detail("SnapUID", snapUID);
             ASSERT(snapReq.snapPayload == ddSnapReqMap[snapUID].snapPayload);
             ddSnapReqMap[snapUID] = snapReq;
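The data distributor deduplicates retransmitted snapshot requests with two maps keyed by snapshot UID: one for in-flight requests and one for finished results. A finished UID replays its stored result; an ongoing UID just has its entry replaced so the newest reply channel wins. A standalone sketch of that dispatch, with simplified types standing in for FDB's request and ErrorOr types:

// Approximation of the duplicate-request handling in the hunk above.
#include <map>
#include <string>

enum class SnapOutcome { Started, ReplayedResult, ReplacedOngoing };

SnapOutcome handleSnapRequest(const std::string& snapUID,
                              std::map<std::string, bool>& ongoing,
                              std::map<std::string, int>& finished) {
    if (finished.count(snapUID))
        return SnapOutcome::ReplayedResult; // send finished[snapUID] back again
    if (ongoing.count(snapUID))
        return SnapOutcome::ReplacedOngoing; // same payload, adopt the new reply promise
    ongoing[snapUID] = true; // begin the snapshot work
    return SnapOutcome::Started;
}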

View File

@@ -236,7 +236,7 @@ Future<Void> coordinationServer(std::string const& dataFolder,
                                 Reference<ConfigNode> const&,
                                 ConfigBroadcastInterface const&);

-// Read a value of MovableValue and if old cluster key presents in it, update to the new key
+// Read a value of MovableValue and if the old cluster key is nested in it, update it to the new key
 Optional<Value> updateCCSInMovableValue(ValueRef movableVal, KeyRef oldClusterKey, KeyRef newClusterKey);

 Future<Void> coordChangeClusterKey(std::string dataFolder, KeyRef newClusterKey, KeyRef oldClusterKey);
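The reworded comment describes a conditional rewrite: return an updated value only when the old cluster key actually appears inside it. A rough standalone sketch of that shape, hypothetical and much simpler than the real MovableValue handling:

// If the value embeds the old cluster key, return a copy with the new key
// substituted; otherwise return nothing so the caller leaves the value alone.
#include <optional>
#include <string>

std::optional<std::string> updateKeyInValue(const std::string& value,
                                            const std::string& oldKey,
                                            const std::string& newKey) {
    size_t pos = value.find(oldKey);
    if (pos == std::string::npos)
        return std::nullopt; // old key absent: nothing to rewrite
    std::string updated = value;
    updated.replace(pos, oldKey.size(), newKey);
    return updated;
}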

View File

@@ -1236,13 +1236,9 @@ struct TrackRunningStorage {
                         KeyValueStoreType storeType,
                         std::set<std::pair<UID, KeyValueStoreType>>* runningStorages)
       : self(self), storeType(storeType), runningStorages(runningStorages) {
-        TraceEvent(SevDebug, "TrackingRunningStorageConstruction").detail("StorageID", self);
         runningStorages->emplace(self, storeType);
     }
-    ~TrackRunningStorage() {
-        runningStorages->erase(std::make_pair(self, storeType));
-        TraceEvent(SevDebug, "TrackingRunningStorageDesctruction").detail("StorageID", self);
-    };
+    ~TrackRunningStorage() { runningStorages->erase(std::make_pair(self, storeType)); };
 };

 ACTOR Future<Void> storageServerRollbackRebooter(std::set<std::pair<UID, KeyValueStoreType>>* runningStorages,
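TrackRunningStorage is an RAII guard: constructing it registers the storage in the live set and destroying it removes the entry, so membership exactly matches the guard's lifetime; this hunk drops the debug traces and keeps the one-line destructor. A sketch of the same pattern with simplified names (FDB's version pairs a UID with a KeyValueStoreType):

// RAII registration: the live set contains `id` exactly while the guard lives.
#include <set>

struct RunningGuard {
    int id;
    std::set<int>* running;
    RunningGuard(int id, std::set<int>* running) : id(id), running(running) {
        running->insert(id);
    }
    ~RunningGuard() { running->erase(id); }
    RunningGuard(const RunningGuard&) = delete; // copying would double-erase
    RunningGuard& operator=(const RunningGuard&) = delete;
};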
@@ -2544,28 +2540,30 @@ ACTOR Future<Void> workerServer(Reference<IClusterConnectionRecord> connRecord,
         loggingTrigger = delay(loggingDelay, TaskPriority::FlushTrace);
     }
     when(state WorkerSnapRequest snapReq = waitNext(interf.workerSnapReq.getFuture())) {
-        std::string snapUID = snapReq.snapUID.toString() + snapReq.role.toString();
-        if (snapReqResultMap.count(snapUID)) {
-            CODE_PROBE(true, "Worker received a duplicate finished snap request");
-            auto result = snapReqResultMap[snapUID];
+        std::string snapReqKey = snapReq.snapUID.toString() + snapReq.role.toString();
+        if (snapReqResultMap.count(snapReqKey)) {
+            CODE_PROBE(true, "Worker received a duplicate finished snapshot request");
+            auto result = snapReqResultMap[snapReqKey];
             result.isError() ? snapReq.reply.sendError(result.getError()) : snapReq.reply.send(result.get());
             TraceEvent("RetryFinishedWorkerSnapRequest")
-                .detail("SnapUID", snapUID)
+                .detail("SnapUID", snapReq.snapUID.toString())
                 .detail("Role", snapReq.role)
-                .detail("Result", result.isError() ? result.getError().code() : 0);
-        } else if (snapReqMap.count(snapUID)) {
-            CODE_PROBE(true, "Worker received a duplicate ongoing snap request");
-            TraceEvent("RetryOngoingWorkerSnapRequest").detail("SnapUID", snapUID).detail("Role", snapReq.role);
-            ASSERT(snapReq.role == snapReqMap[snapUID].role);
-            ASSERT(snapReq.snapPayload == snapReqMap[snapUID].snapPayload);
-            snapReqMap[snapUID] = snapReq;
+                .detail("Result", result.isError() ? result.getError().code() : success().code());
+        } else if (snapReqMap.count(snapReqKey)) {
+            CODE_PROBE(true, "Worker received a duplicate ongoing snapshot request");
+            TraceEvent("RetryOngoingWorkerSnapRequest")
+                .detail("SnapUID", snapReq.snapUID.toString())
+                .detail("Role", snapReq.role);
+            ASSERT(snapReq.role == snapReqMap[snapReqKey].role);
+            ASSERT(snapReq.snapPayload == snapReqMap[snapReqKey].snapPayload);
+            snapReqMap[snapReqKey] = snapReq;
         } else {
-            snapReqMap[snapUID] = snapReq; // set map point to the request
+            snapReqMap[snapReqKey] = snapReq; // set map point to the request
             if (g_network->isSimulated() && (now() - lastSnapTime) < SERVER_KNOBS->SNAP_MINIMUM_TIME_GAP) {
                 // only allow duplicate snapshots on same process in a short time for different roles
                 auto okay = (lastSnapReq.snapUID == snapReq.snapUID) && lastSnapReq.role != snapReq.role;
                 TraceEvent(okay ? SevInfo : SevError, "RapidSnapRequestsOnSameProcess")
-                    .detail("CurrSnapUID", snapUID)
+                    .detail("CurrSnapUID", snapReqKey)
                     .detail("PrevSnapUID", lastSnapReq.snapUID)
                     .detail("CurrRole", snapReq.role)
                     .detail("PrevRole", lastSnapReq.role)
@@ -2577,8 +2575,8 @@ ACTOR Future<Void> workerServer(Reference<IClusterConnectionRecord> connRecord,
                                        &snapReqResultMap));
         auto* snapReqResultMapPtr = &snapReqResultMap;
         errorForwarders.add(fmap(
-            [snapReqResultMapPtr, snapUID](Void _) {
-                snapReqResultMapPtr->erase(snapUID);
+            [snapReqResultMapPtr, snapReqKey](Void _) {
+                snapReqResultMapPtr->erase(snapReqKey);
                 return Void();
             },
             delay(SERVER_KNOBS->SNAP_MINIMUM_TIME_GAP)));
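Two details of the worker-side change are worth spelling out. First, the rename from snapUID to snapReqKey makes explicit that the dedup key is the snapshot UID concatenated with the role, so one process may legitimately serve the same snapshot UID for different roles. Second, the final hunk shows deferred cleanup: after SNAP_MINIMUM_TIME_GAP, the memoized result for snapReqKey is erased so the map cannot grow without bound. A standalone sketch of that idiom using std::thread; FDB expresses the same idea as fmap(..., delay(...)) on flow futures:

// After a grace period, drop the memoized result so a later duplicate is
// treated as a brand-new request. The caller must keep `results` and `lock`
// alive past the grace period, since the detached thread holds references.
#include <chrono>
#include <map>
#include <mutex>
#include <string>
#include <thread>

void eraseResultLater(std::map<std::string, int>& results,
                      std::mutex& lock,
                      std::string snapReqKey,
                      std::chrono::seconds gracePeriod) {
    std::thread([&results, &lock, key = std::move(snapReqKey), gracePeriod] {
        std::this_thread::sleep_for(gracePeriod);
        std::lock_guard<std::mutex> g(lock);
        results.erase(key);
    }).detach();
}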