Mirror of https://github.com/apple/foundationdb.git
Add traces for snapshot related updates (#7862)
* Add logging; fix typos in comments
* Format files
Commit: a27d27c5ee
Parent: f103a10c37
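Context for the diff below: the changes lean on two FoundationDB idioms. TraceEvent is a builder that accumulates structured key/value fields through chained .detail() calls and emits a single trace line when it goes out of scope, and CODE_PROBE tags a code path so simulation coverage tooling can verify the path was exercised. The following is a minimal, self-contained sketch of the chaining mechanics only; MockTraceEvent and MOCK_CODE_PROBE are illustrative stand-ins, not FoundationDB's actual implementation.

#include <iostream>
#include <sstream>
#include <string>

// Stand-in for TraceEvent: each .detail(k, v) appends a structured field,
// and the whole event is emitted when the object is destroyed.
class MockTraceEvent {
	std::ostringstream line;

public:
	explicit MockTraceEvent(const std::string& type) { line << "Event=" << type; }

	template <class T>
	MockTraceEvent& detail(const std::string& key, const T& value) {
		line << " " << key << "=" << value;
		return *this; // returning *this is what makes the .detail() chain work
	}

	~MockTraceEvent() { std::cout << line.str() << "\n"; }
};

// Stand-in for CODE_PROBE: marks a code path so test tooling can confirm
// the path was actually hit (here we just log it).
#define MOCK_CODE_PROBE(cond, comment) \
	do { \
		if (cond) \
			std::cout << "probe hit: " << (comment) << "\n"; \
	} while (0)

int main() {
	MOCK_CODE_PROBE(true, "Azure backup url with standard azure storage account endpoint");
	MockTraceEvent("SnapDataDistributor_GotStatefulWorkers")
	    .detail("SnapUID", "deadbeef")
	    .detail("StorageFaultTolerance", 1);
	return 0;
}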
@@ -290,6 +290,7 @@ Reference<IBackupContainer> IBackupContainer::openContainer(const std::string& u
 		u.eat("azure://"_sr);
 		auto address = u.eat("/"_sr);
 		if (address.endsWith(std::string(azure::storage_lite::constants::default_endpoint_suffix))) {
+			CODE_PROBE(true, "Azure backup url with standard azure storage account endpoint");
 			// <account>.<service>.core.windows.net/<resource_path>
 			auto endPoint = address.toString();
 			auto accountName = address.eat("."_sr).toString();
@@ -305,6 +306,7 @@ Reference<IBackupContainer> IBackupContainer::openContainer(const std::string& u
 			auto hostname = Hostname::parse(endpoint);
 			auto resolvedAddress = hostname.resolveBlocking();
 			if (resolvedAddress.present()) {
+				CODE_PROBE(true, "Azure backup url with hostname in the endpoint");
 				parsedAddress = resolvedAddress.get();
 			}
 		} catch (Error& e) {
@@ -1525,6 +1525,7 @@ Reference<BackupContainerFileSystem> BackupContainerFileSystem::openContainerFS(
 		u.eat("azure://"_sr);
 		auto address = u.eat("/"_sr);
 		if (address.endsWith(std::string(azure::storage_lite::constants::default_endpoint_suffix))) {
+			CODE_PROBE(true, "Azure backup url with standard azure storage account endpoint");
 			// <account>.<service>.core.windows.net/<resource_path>
 			auto endPoint = address.toString();
 			auto accountName = address.eat("."_sr).toString();
@@ -1540,6 +1541,7 @@ Reference<BackupContainerFileSystem> BackupContainerFileSystem::openContainerFS(
 			auto hostname = Hostname::parse(endpoint);
 			auto resolvedAddress = hostname.resolveBlocking();
 			if (resolvedAddress.present()) {
+				CODE_PROBE(true, "Azure backup url with hostname in the endpoint");
 				parsedAddress = resolvedAddress.get();
 			}
 		} catch (Error& e) {
@@ -25,9 +25,6 @@
 #include "fdbclient/AsyncTaskThread.h"
 #include "fdbclient/BackupContainerFileSystem.h"
 
-#include "constants.h"
-#include "storage_credential.h"
-#include "storage_account.h"
 #include "blob/blob_client.h"
 
 class BackupContainerAzureBlobStore final : public BackupContainerFileSystem,
@@ -2174,7 +2174,8 @@ ACTOR Future<Void> proxySnapCreate(ProxySnapRequest snapReq, ProxyCommitData* co
 		TraceEvent("SnapCommitProxy_DDSnapResponseError")
 		    .errorUnsuppressed(e)
 		    .detail("SnapPayload", snapReq.snapPayload)
-		    .detail("SnapUID", snapReq.snapUID);
+		    .detail("SnapUID", snapReq.snapUID)
+		    .detail("Retry", snapReqRetry);
 		// Retry if we have network issues
 		if (e.code() != error_code_request_maybe_delivered ||
 		    ++snapReqRetry > SERVER_KNOBS->SNAP_NETWORK_FAILURE_RETRY_LIMIT)
@@ -805,7 +805,8 @@ ACTOR Future<ErrorOr<Void>> trySendSnapReq(RequestStream<WorkerSnapRequest> stre
 		if (reply.isError()) {
 			TraceEvent("SnapDataDistributor_ReqError")
 			    .errorUnsuppressed(reply.getError())
-			    .detail("Peer", stream.getEndpoint().getPrimaryAddress());
+			    .detail("Peer", stream.getEndpoint().getPrimaryAddress())
+			    .detail("Retry", snapReqRetry);
 			if (reply.getError().code() != error_code_request_maybe_delivered ||
 			    ++snapReqRetry > SERVER_KNOBS->SNAP_NETWORK_FAILURE_RETRY_LIMIT)
 				return ErrorOr<Void>(reply.getError());
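Both retry-path hunks above share one policy: a snapshot RPC is retried only when the failure is error_code_request_maybe_delivered (the outcome of the request is unknown), and only until SNAP_NETWORK_FAILURE_RETRY_LIMIT attempts are used up; the newly added Retry detail records which attempt produced the trace. A rough, self-contained sketch of that policy in isolation, with stand-in error codes and knob value:

#include <functional>
#include <iostream>

enum class ErrorCode { success, request_maybe_delivered, operation_failed };

// Stand-in for SERVER_KNOBS->SNAP_NETWORK_FAILURE_RETRY_LIMIT.
constexpr int kSnapNetworkFailureRetryLimit = 10;

// Retry a request while the failure is a network-level "maybe delivered"
// error, giving up after the retry limit; any other error fails immediately.
ErrorCode sendWithRetry(const std::function<ErrorCode()>& trySend) {
	int retry = 0;
	for (;;) {
		ErrorCode e = trySend();
		if (e == ErrorCode::success)
			return e;
		std::cout << "Event=SnapReqError Retry=" << retry << "\n"; // mirrors the new .detail("Retry", ...)
		if (e != ErrorCode::request_maybe_delivered || ++retry > kSnapNetworkFailureRetryLimit)
			return e; // non-retryable error, or retry budget exhausted
	}
}

int main() {
	int calls = 0;
	// Fails twice with a retryable error, then succeeds.
	ErrorCode result = sendWithRetry([&] {
		return ++calls < 3 ? ErrorCode::request_maybe_delivered : ErrorCode::success;
	});
	std::cout << "calls=" << calls << " ok=" << (result == ErrorCode::success) << "\n";
	return 0;
}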
@@ -911,6 +912,7 @@ ACTOR Future<std::map<NetworkAddress, std::pair<WorkerInterface, std::string>>>
 	// get coordinators
 	Optional<Value> coordinators = wait(tr.get(coordinatorsKey));
 	if (!coordinators.present()) {
+		CODE_PROBE(true, "Failed to read the coordinatorsKey");
 		throw operation_failed();
 	}
 	ClusterConnectionString ccs(coordinators.get().toString());
@@ -1001,7 +1003,8 @@ ACTOR Future<Void> ddSnapCreateCore(DistributorSnapRequest snapReq, Reference<As
 
 	TraceEvent("SnapDataDistributor_GotStatefulWorkers")
 	    .detail("SnapPayload", snapReq.snapPayload)
-	    .detail("SnapUID", snapReq.snapUID);
+	    .detail("SnapUID", snapReq.snapUID)
+	    .detail("StorageFaultTolerance", storageFaultTolerance);
 
 	// we need to snapshot storage nodes before snapshot any tlogs
 	std::vector<Future<ErrorOr<Void>>> storageSnapReqs;
@@ -1013,7 +1016,6 @@ ACTOR Future<Void> ddSnapCreateCore(DistributorSnapRequest snapReq, Reference<As
 	}
 	wait(waitForMost(storageSnapReqs, storageFaultTolerance, snap_storage_failed()));
 	TraceEvent("SnapDataDistributor_AfterSnapStorage")
 	    .detail("FaultTolerance", storageFaultTolerance)
 	    .detail("SnapPayload", snapReq.snapPayload)
 	    .detail("SnapUID", snapReq.snapUID);
-
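The waitForMost call in this hunk gathers the per-storage snapshot futures but tolerates up to storageFaultTolerance failures before surfacing snap_storage_failed. A simplified synchronous sketch of that "all but N must succeed" rule (the real helper operates on futures; names here are stand-ins):

#include <iostream>
#include <stdexcept>
#include <vector>

// Succeed as long as no more than faultTolerance of the results failed;
// otherwise report the aggregate error. A synchronous stand-in for the
// future-based waitForMost used with the storage snapshot requests.
void waitForMostSketch(const std::vector<bool>& succeeded, int faultTolerance) {
	int failures = 0;
	for (bool ok : succeeded)
		if (!ok)
			++failures;
	if (failures > faultTolerance)
		throw std::runtime_error("snap_storage_failed");
}

int main() {
	waitForMostSketch({ true, false, true, true }, /*faultTolerance=*/1); // ok: 1 failure tolerated
	try {
		waitForMostSketch({ false, false, true }, /*faultTolerance=*/1);
	} catch (const std::runtime_error& e) {
		std::cout << e.what() << "\n"; // 2 failures exceed the tolerance
	}
	return 0;
}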
@@ -1336,14 +1338,14 @@ ACTOR Future<Void> dataDistributor(DataDistributorInterface di, Reference<AsyncV
 			when(DistributorSnapRequest snapReq = waitNext(di.distributorSnapReq.getFuture())) {
 				auto& snapUID = snapReq.snapUID;
 				if (ddSnapReqResultMap.count(snapUID)) {
-					CODE_PROBE(true, "Data distributor received a duplicate finished snap request");
+					CODE_PROBE(true, "Data distributor received a duplicate finished snapshot request");
 					auto result = ddSnapReqResultMap[snapUID];
 					result.isError() ? snapReq.reply.sendError(result.getError()) : snapReq.reply.send(result.get());
 					TraceEvent("RetryFinishedDistributorSnapRequest")
 					    .detail("SnapUID", snapUID)
 					    .detail("Result", result.isError() ? result.getError().code() : 0);
 				} else if (ddSnapReqMap.count(snapReq.snapUID)) {
-					CODE_PROBE(true, "Data distributor received a duplicate ongoing snap request");
+					CODE_PROBE(true, "Data distributor received a duplicate ongoing snapshot request");
 					TraceEvent("RetryOngoingDistributorSnapRequest").detail("SnapUID", snapUID);
 					ASSERT(snapReq.snapPayload == ddSnapReqMap[snapUID].snapPayload);
 					ddSnapReqMap[snapUID] = snapReq;
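The branch structure above is the distributor's duplicate-request handling: a retried request whose snapshot already finished is answered from a result map, a retry of an in-flight request is sanity-checked and re-registered so the reply reaches the latest requester, and anything else starts fresh. A condensed sketch of that three-way branch under simplified types (not the actor code itself):

#include <iostream>
#include <map>
#include <stdexcept>
#include <string>

struct SnapRequest {
	std::string snapUID;
	std::string payload;
};

std::map<std::string, int> snapResultMap; // finished: UID -> result code
std::map<std::string, SnapRequest> snapReqMap; // ongoing: UID -> request

void handleSnapRequest(const SnapRequest& req) {
	if (snapResultMap.count(req.snapUID)) {
		// Duplicate of a finished request: replay the stored result.
		std::cout << "replay result " << snapResultMap[req.snapUID] << " for " << req.snapUID << "\n";
	} else if (snapReqMap.count(req.snapUID)) {
		// Duplicate of an in-flight request: must match what we already
		// have, then refresh the stored request so the reply goes to the
		// latest requester.
		if (req.payload != snapReqMap[req.snapUID].payload)
			throw std::logic_error("duplicate request with different payload");
		snapReqMap[req.snapUID] = req;
	} else {
		// First sighting: register it and start the snapshot work.
		snapReqMap[req.snapUID] = req;
		std::cout << "start snapshot " << req.snapUID << "\n";
	}
}

int main() {
	handleSnapRequest({ "uid1", "p" }); // starts
	handleSnapRequest({ "uid1", "p" }); // recognized as ongoing
	snapReqMap.erase("uid1");
	snapResultMap["uid1"] = 0; // pretend it finished successfully
	handleSnapRequest({ "uid1", "p" }); // replayed from the result map
	return 0;
}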
@@ -236,7 +236,7 @@ Future<Void> coordinationServer(std::string const& dataFolder,
                                 Reference<ConfigNode> const&,
                                 ConfigBroadcastInterface const&);
 
-// Read a value of MovableValue and if old cluster key presents in it, update to the new key
+// Read a value of MovableValue and if the old cluster key is nested in it, update it to the new key
 Optional<Value> updateCCSInMovableValue(ValueRef movableVal, KeyRef oldClusterKey, KeyRef newClusterKey);
 
 Future<Void> coordChangeClusterKey(std::string dataFolder, KeyRef newClusterKey, KeyRef oldClusterKey);
@@ -1236,13 +1236,9 @@ struct TrackRunningStorage {
 	                    KeyValueStoreType storeType,
 	                    std::set<std::pair<UID, KeyValueStoreType>>* runningStorages)
 	  : self(self), storeType(storeType), runningStorages(runningStorages) {
-		TraceEvent(SevDebug, "TrackingRunningStorageConstruction").detail("StorageID", self);
 		runningStorages->emplace(self, storeType);
 	}
-	~TrackRunningStorage() {
-		runningStorages->erase(std::make_pair(self, storeType));
-		TraceEvent(SevDebug, "TrackingRunningStorageDesctruction").detail("StorageID", self);
-	};
+	~TrackRunningStorage() { runningStorages->erase(std::make_pair(self, storeType)); };
 };
 
 ACTOR Future<Void> storageServerRollbackRebooter(std::set<std::pair<UID, KeyValueStoreType>>* runningStorages,
@@ -2544,28 +2540,30 @@ ACTOR Future<Void> workerServer(Reference<IClusterConnectionRecord> connRecord,
 				loggingTrigger = delay(loggingDelay, TaskPriority::FlushTrace);
 			}
 			when(state WorkerSnapRequest snapReq = waitNext(interf.workerSnapReq.getFuture())) {
-				std::string snapUID = snapReq.snapUID.toString() + snapReq.role.toString();
-				if (snapReqResultMap.count(snapUID)) {
-					CODE_PROBE(true, "Worker received a duplicate finished snap request");
-					auto result = snapReqResultMap[snapUID];
+				std::string snapReqKey = snapReq.snapUID.toString() + snapReq.role.toString();
+				if (snapReqResultMap.count(snapReqKey)) {
+					CODE_PROBE(true, "Worker received a duplicate finished snapshot request");
+					auto result = snapReqResultMap[snapReqKey];
 					result.isError() ? snapReq.reply.sendError(result.getError()) : snapReq.reply.send(result.get());
 					TraceEvent("RetryFinishedWorkerSnapRequest")
-					    .detail("SnapUID", snapUID)
-					    .detail("Result", result.isError() ? result.getError().code() : 0);
-				} else if (snapReqMap.count(snapUID)) {
-					CODE_PROBE(true, "Worker received a duplicate ongoing snap request");
-					TraceEvent("RetryOngoingWorkerSnapRequest").detail("SnapUID", snapUID).detail("Role", snapReq.role);
-					ASSERT(snapReq.role == snapReqMap[snapUID].role);
-					ASSERT(snapReq.snapPayload == snapReqMap[snapUID].snapPayload);
-					snapReqMap[snapUID] = snapReq;
+					    .detail("SnapUID", snapReq.snapUID.toString())
+					    .detail("Role", snapReq.role)
+					    .detail("Result", result.isError() ? result.getError().code() : success().code());
+				} else if (snapReqMap.count(snapReqKey)) {
+					CODE_PROBE(true, "Worker received a duplicate ongoing snapshot request");
+					TraceEvent("RetryOngoingWorkerSnapRequest")
+					    .detail("SnapUID", snapReq.snapUID.toString())
+					    .detail("Role", snapReq.role);
+					ASSERT(snapReq.role == snapReqMap[snapReqKey].role);
+					ASSERT(snapReq.snapPayload == snapReqMap[snapReqKey].snapPayload);
+					snapReqMap[snapReqKey] = snapReq;
 				} else {
-					snapReqMap[snapUID] = snapReq; // set map point to the request
+					snapReqMap[snapReqKey] = snapReq; // set map point to the request
 					if (g_network->isSimulated() && (now() - lastSnapTime) < SERVER_KNOBS->SNAP_MINIMUM_TIME_GAP) {
 						// only allow duplicate snapshots on same process in a short time for different roles
 						auto okay = (lastSnapReq.snapUID == snapReq.snapUID) && lastSnapReq.role != snapReq.role;
 						TraceEvent(okay ? SevInfo : SevError, "RapidSnapRequestsOnSameProcess")
-						    .detail("CurrSnapUID", snapUID)
+						    .detail("CurrSnapUID", snapReqKey)
 						    .detail("PrevSnapUID", lastSnapReq.snapUID)
 						    .detail("CurrRole", snapReq.role)
 						    .detail("PrevRole", lastSnapReq.role)
@@ -2577,8 +2575,8 @@ ACTOR Future<Void> workerServer(Reference<IClusterConnectionRecord> connRecord,
 					                                          &snapReqResultMap));
 					auto* snapReqResultMapPtr = &snapReqResultMap;
 					errorForwarders.add(fmap(
-					    [snapReqResultMapPtr, snapUID](Void _) {
-						    snapReqResultMapPtr->erase(snapUID);
+					    [snapReqResultMapPtr, snapReqKey](Void _) {
+						    snapReqResultMapPtr->erase(snapReqKey);
 						    return Void();
 					    },
 					    delay(SERVER_KNOBS->SNAP_MINIMUM_TIME_GAP)));
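The worker-side hunks rename snapUID to snapReqKey because the key is really the snapshot UID concatenated with the role, which lets one process answer the same UID for two different roles; and the stored result is erased only after SNAP_MINIMUM_TIME_GAP so late retries can still be answered from the result map. A small sketch of that delayed-cleanup idea using a plain timer thread; the real code schedules the erase with fmap over an actor delay:

#include <chrono>
#include <iostream>
#include <map>
#include <mutex>
#include <string>
#include <thread>

std::map<std::string, int> snapReqResultMap; // (snapUID + role) -> result
std::mutex mapMutex;

// Keep a finished result around for gapSeconds so retried requests can be
// answered, then drop it; a stand-in for fmap(..., delay(SNAP_MINIMUM_TIME_GAP)).
void eraseAfterGap(const std::string& snapReqKey, double gapSeconds) {
	std::thread([snapReqKey, gapSeconds] {
		std::this_thread::sleep_for(std::chrono::duration<double>(gapSeconds));
		std::lock_guard<std::mutex> lock(mapMutex);
		snapReqResultMap.erase(snapReqKey);
	}).detach();
}

int main() {
	std::string snapReqKey = std::string("deadbeef") + "storage"; // UID + role
	{
		std::lock_guard<std::mutex> lock(mapMutex);
		snapReqResultMap[snapReqKey] = 0; // snapshot finished successfully
	}
	eraseAfterGap(snapReqKey, 0.1);
	std::this_thread::sleep_for(std::chrono::milliseconds(200));
	std::lock_guard<std::mutex> lock(mapMutex);
	std::cout << "still cached: " << snapReqResultMap.count(snapReqKey) << "\n"; // 0: gap elapsed
	return 0;
}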