Improve error messages for the snapshot command

This commit is contained in:
sramamoorthy 2019-08-23 11:56:06 -07:00
parent e0824f4915
commit 5d87443323
6 changed files with 48 additions and 30 deletions

View File

@ -172,7 +172,6 @@ public:
Counter transactionsMaybeCommitted; Counter transactionsMaybeCommitted;
Counter transactionsResourceConstrained; Counter transactionsResourceConstrained;
Counter transactionsProcessBehind; Counter transactionsProcessBehind;
Counter transactionWaitsForFullRecovery;
ContinuousSample<double> latencies, readLatencies, commitLatencies, GRVLatencies, mutationsPerCommit, bytesPerCommit; ContinuousSample<double> latencies, readLatencies, commitLatencies, GRVLatencies, mutationsPerCommit, bytesPerCommit;

View File

@ -519,7 +519,7 @@ DatabaseContext::DatabaseContext(
transactionCommittedMutations("CommittedMutations", cc), transactionCommittedMutationBytes("CommittedMutationBytes", cc), transactionsCommitStarted("CommitStarted", cc), transactionCommittedMutations("CommittedMutations", cc), transactionCommittedMutationBytes("CommittedMutationBytes", cc), transactionsCommitStarted("CommitStarted", cc),
transactionsCommitCompleted("CommitCompleted", cc), transactionsTooOld("TooOld", cc), transactionsFutureVersions("FutureVersions", cc), transactionsCommitCompleted("CommitCompleted", cc), transactionsTooOld("TooOld", cc), transactionsFutureVersions("FutureVersions", cc),
transactionsNotCommitted("NotCommitted", cc), transactionsMaybeCommitted("MaybeCommitted", cc), transactionsResourceConstrained("ResourceConstrained", cc), transactionsNotCommitted("NotCommitted", cc), transactionsMaybeCommitted("MaybeCommitted", cc), transactionsResourceConstrained("ResourceConstrained", cc),
transactionsProcessBehind("ProcessBehind", cc), transactionWaitsForFullRecovery("WaitsForFullRecovery", cc), outstandingWatches(0), transactionsProcessBehind("ProcessBehind", cc), outstandingWatches(0),
latencies(1000), readLatencies(1000), commitLatencies(1000), GRVLatencies(1000), mutationsPerCommit(1000), bytesPerCommit(1000), mvCacheInsertLocation(0), latencies(1000), readLatencies(1000), commitLatencies(1000), GRVLatencies(1000), mutationsPerCommit(1000), bytesPerCommit(1000), mvCacheInsertLocation(0),
healthMetricsLastUpdated(0), detailedHealthMetricsLastUpdated(0), internal(internal) healthMetricsLastUpdated(0), detailedHealthMetricsLastUpdated(0), internal(internal)
{ {
@ -548,7 +548,7 @@ DatabaseContext::DatabaseContext( const Error &err ) : deferredError(err), cc("T
transactionCommittedMutations("CommittedMutations", cc), transactionCommittedMutationBytes("CommittedMutationBytes", cc), transactionsCommitStarted("CommitStarted", cc), transactionCommittedMutations("CommittedMutations", cc), transactionCommittedMutationBytes("CommittedMutationBytes", cc), transactionsCommitStarted("CommitStarted", cc),
transactionsCommitCompleted("CommitCompleted", cc), transactionsTooOld("TooOld", cc), transactionsFutureVersions("FutureVersions", cc), transactionsCommitCompleted("CommitCompleted", cc), transactionsTooOld("TooOld", cc), transactionsFutureVersions("FutureVersions", cc),
transactionsNotCommitted("NotCommitted", cc), transactionsMaybeCommitted("MaybeCommitted", cc), transactionsResourceConstrained("ResourceConstrained", cc), transactionsNotCommitted("NotCommitted", cc), transactionsMaybeCommitted("MaybeCommitted", cc), transactionsResourceConstrained("ResourceConstrained", cc),
transactionsProcessBehind("ProcessBehind", cc), transactionWaitsForFullRecovery("WaitsForFullRecovery", cc), latencies(1000), readLatencies(1000), commitLatencies(1000), transactionsProcessBehind("ProcessBehind", cc), latencies(1000), readLatencies(1000), commitLatencies(1000),
GRVLatencies(1000), mutationsPerCommit(1000), bytesPerCommit(1000), GRVLatencies(1000), mutationsPerCommit(1000), bytesPerCommit(1000),
internal(false) {} internal(false) {}
@ -2705,10 +2705,7 @@ ACTOR static Future<Void> tryCommit( Database cx, Reference<TransactionLogInfo>
if (e.code() != error_code_transaction_too_old if (e.code() != error_code_transaction_too_old
&& e.code() != error_code_not_committed && e.code() != error_code_not_committed
&& e.code() != error_code_database_locked && e.code() != error_code_database_locked
&& e.code() != error_code_proxy_memory_limit_exceeded && e.code() != error_code_proxy_memory_limit_exceeded)
&& e.code() != error_code_transaction_not_permitted
&& e.code() != error_code_cluster_not_fully_recovered
&& e.code() != error_code_txn_exec_log_anti_quorum)
TraceEvent(SevError, "TryCommitError").error(e); TraceEvent(SevError, "TryCommitError").error(e);
if (trLogInfo) if (trLogInfo)
trLogInfo->addLog(FdbClientLogEvents::EventCommitError(startTime, static_cast<int>(e.code()), req)); trLogInfo->addLog(FdbClientLogEvents::EventCommitError(startTime, static_cast<int>(e.code()), req));
@ -3115,8 +3112,7 @@ Future<Void> Transaction::onError( Error const& e ) {
e.code() == error_code_commit_unknown_result || e.code() == error_code_commit_unknown_result ||
e.code() == error_code_database_locked || e.code() == error_code_database_locked ||
e.code() == error_code_proxy_memory_limit_exceeded || e.code() == error_code_proxy_memory_limit_exceeded ||
e.code() == error_code_process_behind || e.code() == error_code_process_behind)
e.code() == error_code_cluster_not_fully_recovered)
{ {
if(e.code() == error_code_not_committed) if(e.code() == error_code_not_committed)
++cx->transactionsNotCommitted; ++cx->transactionsNotCommitted;
@ -3126,9 +3122,6 @@ Future<Void> Transaction::onError( Error const& e ) {
++cx->transactionsResourceConstrained; ++cx->transactionsResourceConstrained;
if (e.code() == error_code_process_behind) if (e.code() == error_code_process_behind)
++cx->transactionsProcessBehind; ++cx->transactionsProcessBehind;
if (e.code() == error_code_cluster_not_fully_recovered) {
++cx->transactionWaitsForFullRecovery;
}
double backoff = getBackoff(e.code()); double backoff = getBackoff(e.code());
reset(); reset();

View File

@ -4147,7 +4147,7 @@ ACTOR Future<Void> ddSnapCreateCore(DistributorSnapRequest snapReq, Reference<As
std::vector<Future<Void>> disablePops; std::vector<Future<Void>> disablePops;
for (const auto & tlog : tlogs) { for (const auto & tlog : tlogs) {
disablePops.push_back( disablePops.push_back(
transformErrors(throwErrorOr(tlog.disablePopRequest.tryGetReply(TLogDisablePopRequest(snapReq.snapUID))), operation_failed()) transformErrors(throwErrorOr(tlog.disablePopRequest.tryGetReply(TLogDisablePopRequest(snapReq.snapUID))), snap_disable_tlog_pop_failed())
); );
} }
wait(waitForAll(disablePops)); wait(waitForAll(disablePops));
@ -4156,14 +4156,14 @@ ACTOR Future<Void> ddSnapCreateCore(DistributorSnapRequest snapReq, Reference<As
.detail("SnapPayload", snapReq.snapPayload) .detail("SnapPayload", snapReq.snapPayload)
.detail("SnapUID", snapReq.snapUID); .detail("SnapUID", snapReq.snapUID);
// snap local storage nodes // snap local storage nodes
std::vector<WorkerInterface> storageWorkers = wait(getStorageWorkers(cx, db, true /* localOnly */)); std::vector<WorkerInterface> storageWorkers = wait(transformErrors(getStorageWorkers(cx, db, true /* localOnly */), snap_storage_failed()));
TraceEvent("SnapDataDistributor_GotStorageWorkers") TraceEvent("SnapDataDistributor_GotStorageWorkers")
.detail("SnapPayload", snapReq.snapPayload) .detail("SnapPayload", snapReq.snapPayload)
.detail("SnapUID", snapReq.snapUID); .detail("SnapUID", snapReq.snapUID);
std::vector<Future<Void>> storageSnapReqs; std::vector<Future<Void>> storageSnapReqs;
for (const auto & worker : storageWorkers) { for (const auto & worker : storageWorkers) {
storageSnapReqs.push_back( storageSnapReqs.push_back(
transformErrors(throwErrorOr(worker.workerSnapReq.tryGetReply(WorkerSnapRequest(snapReq.snapPayload, snapReq.snapUID, LiteralStringRef("storage")))), operation_failed()) transformErrors(throwErrorOr(worker.workerSnapReq.tryGetReply(WorkerSnapRequest(snapReq.snapPayload, snapReq.snapUID, LiteralStringRef("storage")))), snap_storage_failed())
); );
} }
wait(waitForAll(storageSnapReqs)); wait(waitForAll(storageSnapReqs));
@ -4175,7 +4175,7 @@ ACTOR Future<Void> ddSnapCreateCore(DistributorSnapRequest snapReq, Reference<As
std::vector<Future<Void>> tLogSnapReqs; std::vector<Future<Void>> tLogSnapReqs;
for (const auto & tlog : tlogs) { for (const auto & tlog : tlogs) {
tLogSnapReqs.push_back( tLogSnapReqs.push_back(
transformErrors(throwErrorOr(tlog.snapRequest.tryGetReply(TLogSnapRequest(snapReq.snapPayload, snapReq.snapUID, LiteralStringRef("tlog")))), operation_failed()) transformErrors(throwErrorOr(tlog.snapRequest.tryGetReply(TLogSnapRequest(snapReq.snapPayload, snapReq.snapUID, LiteralStringRef("tlog")))), snap_tlog_failed())
); );
} }
wait(waitForAll(tLogSnapReqs)); wait(waitForAll(tLogSnapReqs));
@ -4187,7 +4187,7 @@ ACTOR Future<Void> ddSnapCreateCore(DistributorSnapRequest snapReq, Reference<As
std::vector<Future<Void>> enablePops; std::vector<Future<Void>> enablePops;
for (const auto & tlog : tlogs) { for (const auto & tlog : tlogs) {
enablePops.push_back( enablePops.push_back(
transformErrors(throwErrorOr(tlog.enablePopRequest.tryGetReply(TLogEnablePopRequest(snapReq.snapUID))), operation_failed()) transformErrors(throwErrorOr(tlog.enablePopRequest.tryGetReply(TLogEnablePopRequest(snapReq.snapUID))), snap_enable_tlog_pop_failed())
); );
} }
wait(waitForAll(enablePops)); wait(waitForAll(enablePops));
@ -4203,18 +4203,36 @@ ACTOR Future<Void> ddSnapCreateCore(DistributorSnapRequest snapReq, Reference<As
std::vector<Future<Void>> coordSnapReqs; std::vector<Future<Void>> coordSnapReqs;
for (const auto & worker : coordWorkers) { for (const auto & worker : coordWorkers) {
coordSnapReqs.push_back( coordSnapReqs.push_back(
transformErrors(throwErrorOr(worker.workerSnapReq.tryGetReply(WorkerSnapRequest(snapReq.snapPayload, snapReq.snapUID, LiteralStringRef("coord")))), operation_failed()) transformErrors(throwErrorOr(worker.workerSnapReq.tryGetReply(WorkerSnapRequest(snapReq.snapPayload, snapReq.snapUID, LiteralStringRef("coord")))), snap_coord_failed())
); );
} }
wait(waitForAll(coordSnapReqs)); wait(waitForAll(coordSnapReqs));
TraceEvent("SnapDataDistributor_AfterSnapCoords") TraceEvent("SnapDataDistributor_AfterSnapCoords")
.detail("SnapPayload", snapReq.snapPayload) .detail("SnapPayload", snapReq.snapPayload)
.detail("SnapUID", snapReq.snapUID); .detail("SnapUID", snapReq.snapUID);
} catch (Error& e) { } catch (Error& err) {
state Error e = err;
TraceEvent("SnapDataDistributor_SnapReqExit") TraceEvent("SnapDataDistributor_SnapReqExit")
.detail("SnapPayload", snapReq.snapPayload) .detail("SnapPayload", snapReq.snapPayload)
.detail("SnapUID", snapReq.snapUID) .detail("SnapUID", snapReq.snapUID)
.error(e, true /*includeCancelled */); .error(e, true /*includeCancelled */);
if (e.code() == error_code_snap_storage_failed
|| e.code() == error_code_snap_tlog_failed
|| e.code() == error_code_operation_cancelled) {
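// The snapshot failed (or was cancelled) at a stage that can leave TLog popping disabled:
// make a best-effort attempt to re-enable pops on the local TLogs before rethrowing.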
std::vector<TLogInterface> tlogs = db->get().logSystemConfig.allLocalLogs(false);
try {
std::vector<Future<Void>> enablePops;
for (const auto & tlog : tlogs) {
enablePops.push_back(
transformErrors(throwErrorOr(tlog.enablePopRequest.tryGetReply(TLogEnablePopRequest(snapReq.snapUID))), snap_enable_tlog_pop_failed())
);
}
wait(waitForAll(enablePops));
} catch (Error& error) {
TraceEvent(SevDebug, "IgnoreEnableTLogPopFailure");
}
}
throw e; throw e;
} }
return Void(); return Void();
@ -4235,7 +4253,7 @@ ACTOR Future<Void> ddSnapCreate(DistributorSnapRequest snapReq, Reference<AsyncV
TraceEvent("SnapDDCreateDBInfoChanged") TraceEvent("SnapDDCreateDBInfoChanged")
.detail("SnapPayload", snapReq.snapPayload) .detail("SnapPayload", snapReq.snapPayload)
.detail("SnapUID", snapReq.snapUID); .detail("SnapUID", snapReq.snapUID);
snapReq.reply.sendError(operation_failed()); snapReq.reply.sendError(snap_with_recovery_unsupported());
} }
when (wait(ddSnapCreateCore(snapReq, db))) { when (wait(ddSnapCreateCore(snapReq, db))) {
TraceEvent("SnapDDCreateSuccess") TraceEvent("SnapDDCreateSuccess")

View File

@ -1466,7 +1466,7 @@ ACTOR Future<Void> proxySnapCreate(ProxySnapRequest snapReq, ProxyCommitData* co
TraceEvent("SnapMasterProxy_WhiteListCheckFailed") TraceEvent("SnapMasterProxy_WhiteListCheckFailed")
.detail("SnapPayload", snapReq.snapPayload) .detail("SnapPayload", snapReq.snapPayload)
.detail("SnapUID", snapReq.snapUID); .detail("SnapUID", snapReq.snapUID);
throw transaction_not_permitted(); throw snap_path_not_whitelisted();
} }
// db fully recovered check // db fully recovered check
if (commitData->db->get().recoveryState != RecoveryState::FULLY_RECOVERED) { if (commitData->db->get().recoveryState != RecoveryState::FULLY_RECOVERED) {
@ -1478,7 +1478,7 @@ ACTOR Future<Void> proxySnapCreate(ProxySnapRequest snapReq, ProxyCommitData* co
TraceEvent("SnapMasterProxy_ClusterNotFullyRecovered") TraceEvent("SnapMasterProxy_ClusterNotFullyRecovered")
.detail("SnapPayload", snapReq.snapPayload) .detail("SnapPayload", snapReq.snapPayload)
.detail("SnapUID", snapReq.snapUID); .detail("SnapUID", snapReq.snapUID);
throw cluster_not_fully_recovered(); throw snap_not_fully_recovered_unsupported();
} }
auto result = auto result =
@ -1493,7 +1493,7 @@ ACTOR Future<Void> proxySnapCreate(ProxySnapRequest snapReq, ProxyCommitData* co
TraceEvent("SnapMasterProxy_LogAnitQuorumNotSupported") TraceEvent("SnapMasterProxy_LogAnitQuorumNotSupported")
.detail("SnapPayload", snapReq.snapPayload) .detail("SnapPayload", snapReq.snapPayload)
.detail("SnapUID", snapReq.snapUID); .detail("SnapUID", snapReq.snapUID);
throw txn_exec_log_anti_quorum(); throw snap_log_anti_quorum_unsupported();
} }
// send a snap request to DD // send a snap request to DD

View File

@ -211,7 +211,7 @@ public: // workload functions
wait(status); wait(status);
break; break;
} catch (Error& e) { } catch (Error& e) {
if (e.code() == error_code_txn_exec_log_anti_quorum) { if (e.code() == error_code_snap_log_anti_quorum_unsupported) {
snapFailed = true; snapFailed = true;
break; break;
} }
@ -298,12 +298,12 @@ public: // workload functions
wait(status); wait(status);
break; break;
} catch (Error& e) { } catch (Error& e) {
if (e.code() == error_code_cluster_not_fully_recovered || if (e.code() == error_code_snap_not_fully_recovered_unsupported ||
e.code() == error_code_txn_exec_log_anti_quorum) { e.code() == error_code_snap_log_anti_quorum_unsupported) {
snapFailed = true; snapFailed = true;
break; break;
} }
if (e.code() == error_code_transaction_not_permitted) { if (e.code() == error_code_snap_path_not_whitelisted) {
testedFailure = true; testedFailure = true;
break; break;
} }

View File

@ -65,9 +65,6 @@ ERROR( lookup_failed, 1041, "DNS lookup failed" )
ERROR( proxy_memory_limit_exceeded, 1042, "Proxy commit memory limit exceeded" ) ERROR( proxy_memory_limit_exceeded, 1042, "Proxy commit memory limit exceeded" )
ERROR( shutdown_in_progress, 1043, "Operation no longer supported due to shutdown" ) ERROR( shutdown_in_progress, 1043, "Operation no longer supported due to shutdown" )
ERROR( serialization_failed, 1044, "Failed to deserialize an object" ) ERROR( serialization_failed, 1044, "Failed to deserialize an object" )
ERROR( transaction_not_permitted, 1045, "Operation not permitted")
ERROR( cluster_not_fully_recovered, 1046, "Cluster not fully recovered")
ERROR( txn_exec_log_anti_quorum, 1047, "Execute Transaction not supported when log anti quorum is configured")
ERROR( connection_unreferenced, 1048, "No peer references for connection" ) ERROR( connection_unreferenced, 1048, "No peer references for connection" )
ERROR( connection_idle, 1049, "Connection closed after idle timeout" ) ERROR( connection_idle, 1049, "Connection closed after idle timeout" )
ERROR( disk_adapter_reset, 1050, "The disk queue adpater reset" ) ERROR( disk_adapter_reset, 1050, "The disk queue adpater reset" )
@ -206,6 +203,17 @@ ERROR( key_not_found, 2400, "Expected key is missing")
ERROR( json_malformed, 2401, "JSON string was malformed") ERROR( json_malformed, 2401, "JSON string was malformed")
ERROR( json_eof_expected, 2402, "JSON string did not terminate where expected") ERROR( json_eof_expected, 2402, "JSON string did not terminate where expected")
// 2500 - disk snapshot based backup errors
ERROR( snap_disable_tlog_pop_failed, 2500, "Snapshot error")
ERROR( snap_storage_failed, 2501, "Failed to snapshot storage nodes")
ERROR( snap_tlog_failed, 2502, "Failed to snapshot TLog nodes")
ERROR( snap_coord_failed, 2503, "Failed to snapshot coordinator nodes")
ERROR( snap_enable_tlog_pop_failed, 2504, "Snapshot error")
ERROR( snap_path_not_whitelisted, 2505, "Snapshot create binary path not whitelisted")
ERROR( snap_not_fully_recovered_unsupported, 2506, "Unsupported when the cluster is not fully recovered")
ERROR( snap_log_anti_quorum_unsupported, 2507, "Unsupported when log anti quorum is configured")
ERROR( snap_with_recovery_unsupported, 2508, "Cluster recovery during snapshot operation not supported")
// 4xxx Internal errors (those that should be generated only by bugs) are decimal 4xxx // 4xxx Internal errors (those that should be generated only by bugs) are decimal 4xxx
ERROR( unknown_error, 4000, "An unknown error occurred" ) // C++ exception not of type Error ERROR( unknown_error, 4000, "An unknown error occurred" ) // C++ exception not of type Error
ERROR( internal_error, 4100, "An internal error occurred" ) ERROR( internal_error, 4100, "An internal error occurred" )