diff --git a/fdbclient/DatabaseContext.h b/fdbclient/DatabaseContext.h index 2fc18d3200..7b4991f4a4 100644 --- a/fdbclient/DatabaseContext.h +++ b/fdbclient/DatabaseContext.h @@ -172,7 +172,6 @@ public: Counter transactionsMaybeCommitted; Counter transactionsResourceConstrained; Counter transactionsProcessBehind; - Counter transactionWaitsForFullRecovery; ContinuousSample latencies, readLatencies, commitLatencies, GRVLatencies, mutationsPerCommit, bytesPerCommit; diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index 97df886a6f..e934bdb80f 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -519,7 +519,7 @@ DatabaseContext::DatabaseContext( transactionCommittedMutations("CommittedMutations", cc), transactionCommittedMutationBytes("CommittedMutationBytes", cc), transactionsCommitStarted("CommitStarted", cc), transactionsCommitCompleted("CommitCompleted", cc), transactionsTooOld("TooOld", cc), transactionsFutureVersions("FutureVersions", cc), transactionsNotCommitted("NotCommitted", cc), transactionsMaybeCommitted("MaybeCommitted", cc), transactionsResourceConstrained("ResourceConstrained", cc), - transactionsProcessBehind("ProcessBehind", cc), transactionWaitsForFullRecovery("WaitsForFullRecovery", cc), outstandingWatches(0), + transactionsProcessBehind("ProcessBehind", cc), outstandingWatches(0), latencies(1000), readLatencies(1000), commitLatencies(1000), GRVLatencies(1000), mutationsPerCommit(1000), bytesPerCommit(1000), mvCacheInsertLocation(0), healthMetricsLastUpdated(0), detailedHealthMetricsLastUpdated(0), internal(internal) { @@ -548,7 +548,7 @@ DatabaseContext::DatabaseContext( const Error &err ) : deferredError(err), cc("T transactionCommittedMutations("CommittedMutations", cc), transactionCommittedMutationBytes("CommittedMutationBytes", cc), transactionsCommitStarted("CommitStarted", cc), transactionsCommitCompleted("CommitCompleted", cc), transactionsTooOld("TooOld", cc), 
transactionsFutureVersions("FutureVersions", cc), transactionsNotCommitted("NotCommitted", cc), transactionsMaybeCommitted("MaybeCommitted", cc), transactionsResourceConstrained("ResourceConstrained", cc), - transactionsProcessBehind("ProcessBehind", cc), transactionWaitsForFullRecovery("WaitsForFullRecovery", cc), latencies(1000), readLatencies(1000), commitLatencies(1000), + transactionsProcessBehind("ProcessBehind", cc), latencies(1000), readLatencies(1000), commitLatencies(1000), GRVLatencies(1000), mutationsPerCommit(1000), bytesPerCommit(1000), internal(false) {} @@ -2705,10 +2705,7 @@ ACTOR static Future tryCommit( Database cx, Reference if (e.code() != error_code_transaction_too_old && e.code() != error_code_not_committed && e.code() != error_code_database_locked - && e.code() != error_code_proxy_memory_limit_exceeded - && e.code() != error_code_transaction_not_permitted - && e.code() != error_code_cluster_not_fully_recovered - && e.code() != error_code_txn_exec_log_anti_quorum) + && e.code() != error_code_proxy_memory_limit_exceeded) TraceEvent(SevError, "TryCommitError").error(e); if (trLogInfo) trLogInfo->addLog(FdbClientLogEvents::EventCommitError(startTime, static_cast(e.code()), req)); @@ -3115,8 +3112,7 @@ Future Transaction::onError( Error const& e ) { e.code() == error_code_commit_unknown_result || e.code() == error_code_database_locked || e.code() == error_code_proxy_memory_limit_exceeded || - e.code() == error_code_process_behind || - e.code() == error_code_cluster_not_fully_recovered) + e.code() == error_code_process_behind) { if(e.code() == error_code_not_committed) ++cx->transactionsNotCommitted; @@ -3126,9 +3122,6 @@ Future Transaction::onError( Error const& e ) { ++cx->transactionsResourceConstrained; if (e.code() == error_code_process_behind) ++cx->transactionsProcessBehind; - if (e.code() == error_code_cluster_not_fully_recovered) { - ++cx->transactionWaitsForFullRecovery; - } double backoff = getBackoff(e.code()); reset(); diff --git 
a/fdbserver/DataDistribution.actor.cpp b/fdbserver/DataDistribution.actor.cpp index af34f1da0f..a978e081e4 100644 --- a/fdbserver/DataDistribution.actor.cpp +++ b/fdbserver/DataDistribution.actor.cpp @@ -4147,7 +4147,7 @@ ACTOR Future ddSnapCreateCore(DistributorSnapRequest snapReq, Reference> disablePops; for (const auto & tlog : tlogs) { disablePops.push_back( - transformErrors(throwErrorOr(tlog.disablePopRequest.tryGetReply(TLogDisablePopRequest(snapReq.snapUID))), operation_failed()) + transformErrors(throwErrorOr(tlog.disablePopRequest.tryGetReply(TLogDisablePopRequest(snapReq.snapUID))), snap_disable_tlog_pop_failed()) ); } wait(waitForAll(disablePops)); @@ -4156,14 +4156,14 @@ ACTOR Future ddSnapCreateCore(DistributorSnapRequest snapReq, Reference storageWorkers = wait(getStorageWorkers(cx, db, true /* localOnly */)); + std::vector storageWorkers = wait(transformErrors(getStorageWorkers(cx, db, true /* localOnly */), snap_storage_failed())); TraceEvent("SnapDataDistributor_GotStorageWorkers") .detail("SnapPayload", snapReq.snapPayload) .detail("SnapUID", snapReq.snapUID); std::vector> storageSnapReqs; for (const auto & worker : storageWorkers) { storageSnapReqs.push_back( - transformErrors(throwErrorOr(worker.workerSnapReq.tryGetReply(WorkerSnapRequest(snapReq.snapPayload, snapReq.snapUID, LiteralStringRef("storage")))), operation_failed()) + transformErrors(throwErrorOr(worker.workerSnapReq.tryGetReply(WorkerSnapRequest(snapReq.snapPayload, snapReq.snapUID, LiteralStringRef("storage")))), snap_storage_failed()) ); } wait(waitForAll(storageSnapReqs)); @@ -4175,7 +4175,7 @@ ACTOR Future ddSnapCreateCore(DistributorSnapRequest snapReq, Reference> tLogSnapReqs; for (const auto & tlog : tlogs) { tLogSnapReqs.push_back( - transformErrors(throwErrorOr(tlog.snapRequest.tryGetReply(TLogSnapRequest(snapReq.snapPayload, snapReq.snapUID, LiteralStringRef("tlog")))), operation_failed()) + 
transformErrors(throwErrorOr(tlog.snapRequest.tryGetReply(TLogSnapRequest(snapReq.snapPayload, snapReq.snapUID, LiteralStringRef("tlog")))), snap_tlog_failed()) ); } wait(waitForAll(tLogSnapReqs)); @@ -4187,7 +4187,7 @@ ACTOR Future ddSnapCreateCore(DistributorSnapRequest snapReq, Reference> enablePops; for (const auto & tlog : tlogs) { enablePops.push_back( - transformErrors(throwErrorOr(tlog.enablePopRequest.tryGetReply(TLogEnablePopRequest(snapReq.snapUID))), operation_failed()) + transformErrors(throwErrorOr(tlog.enablePopRequest.tryGetReply(TLogEnablePopRequest(snapReq.snapUID))), snap_enable_tlog_pop_failed()) ); } wait(waitForAll(enablePops)); @@ -4203,18 +4203,36 @@ ACTOR Future ddSnapCreateCore(DistributorSnapRequest snapReq, Reference> coordSnapReqs; for (const auto & worker : coordWorkers) { coordSnapReqs.push_back( - transformErrors(throwErrorOr(worker.workerSnapReq.tryGetReply(WorkerSnapRequest(snapReq.snapPayload, snapReq.snapUID, LiteralStringRef("coord")))), operation_failed()) + transformErrors(throwErrorOr(worker.workerSnapReq.tryGetReply(WorkerSnapRequest(snapReq.snapPayload, snapReq.snapUID, LiteralStringRef("coord")))), snap_coord_failed()) ); } wait(waitForAll(coordSnapReqs)); TraceEvent("SnapDataDistributor_AfterSnapCoords") .detail("SnapPayload", snapReq.snapPayload) .detail("SnapUID", snapReq.snapUID); - } catch (Error& e) { + } catch (Error& err) { + state Error e = err; TraceEvent("SnapDataDistributor_SnapReqExit") .detail("SnapPayload", snapReq.snapPayload) .detail("SnapUID", snapReq.snapUID) .error(e, true /*includeCancelled */); + if (e.code() == error_code_snap_storage_failed + || e.code() == error_code_snap_tlog_failed + || e.code() == error_code_operation_cancelled || e.code() == error_code_snap_disable_tlog_pop_failed) { + // best-effort re-enable of tlog pop on local tlog nodes; also done after a failed disable, since the parallel disable requests may have succeeded on some tlogs + std::vector tlogs = db->get().logSystemConfig.allLocalLogs(false); + try { + std::vector> enablePops; + for (const auto & tlog : tlogs) { + enablePops.push_back( + 
transformErrors(throwErrorOr(tlog.enablePopRequest.tryGetReply(TLogEnablePopRequest(snapReq.snapUID))), snap_enable_tlog_pop_failed()) + ); + } + wait(waitForAll(enablePops)); + } catch (Error& error) { + TraceEvent(SevDebug, "IgnoreEnableTLogPopFailure"); + } + } throw e; } return Void(); @@ -4235,7 +4253,7 @@ ACTOR Future ddSnapCreate(DistributorSnapRequest snapReq, Reference proxySnapCreate(ProxySnapRequest snapReq, ProxyCommitData* co TraceEvent("SnapMasterProxy_WhiteListCheckFailed") .detail("SnapPayload", snapReq.snapPayload) .detail("SnapUID", snapReq.snapUID); - throw transaction_not_permitted(); + throw snap_path_not_whitelisted(); } // db fully recovered check if (commitData->db->get().recoveryState != RecoveryState::FULLY_RECOVERED) { @@ -1478,7 +1478,7 @@ ACTOR Future proxySnapCreate(ProxySnapRequest snapReq, ProxyCommitData* co TraceEvent("SnapMasterProxy_ClusterNotFullyRecovered") .detail("SnapPayload", snapReq.snapPayload) .detail("SnapUID", snapReq.snapUID); - throw cluster_not_fully_recovered(); + throw snap_not_fully_recovered_unsupported(); } auto result = @@ -1493,7 +1493,7 @@ ACTOR Future proxySnapCreate(ProxySnapRequest snapReq, ProxyCommitData* co TraceEvent("SnapMasterProxy_LogAnitQuorumNotSupported") .detail("SnapPayload", snapReq.snapPayload) .detail("SnapUID", snapReq.snapUID); - throw txn_exec_log_anti_quorum(); + throw snap_log_anti_quorum_unsupported(); } // send a snap request to DD diff --git a/fdbserver/workloads/SnapTest.actor.cpp b/fdbserver/workloads/SnapTest.actor.cpp index 639f5fb1d1..aaed65ce11 100644 --- a/fdbserver/workloads/SnapTest.actor.cpp +++ b/fdbserver/workloads/SnapTest.actor.cpp @@ -211,7 +211,7 @@ public: // workload functions wait(status); break; } catch (Error& e) { - if (e.code() == error_code_txn_exec_log_anti_quorum) { + if (e.code() == error_code_snap_log_anti_quorum_unsupported) { snapFailed = true; break; } @@ -298,12 +298,12 @@ public: // workload functions wait(status); break; } catch (Error& e) { - if 
(e.code() == error_code_cluster_not_fully_recovered || - e.code() == error_code_txn_exec_log_anti_quorum) { + if (e.code() == error_code_snap_not_fully_recovered_unsupported || + e.code() == error_code_snap_log_anti_quorum_unsupported) { snapFailed = true; break; } - if (e.code() == error_code_transaction_not_permitted) { + if (e.code() == error_code_snap_path_not_whitelisted) { testedFailure = true; break; } diff --git a/flow/error_definitions.h b/flow/error_definitions.h index 0d95b9fda5..b489a2ea69 100755 --- a/flow/error_definitions.h +++ b/flow/error_definitions.h @@ -65,9 +65,6 @@ ERROR( lookup_failed, 1041, "DNS lookup failed" ) ERROR( proxy_memory_limit_exceeded, 1042, "Proxy commit memory limit exceeded" ) ERROR( shutdown_in_progress, 1043, "Operation no longer supported due to shutdown" ) ERROR( serialization_failed, 1044, "Failed to deserialize an object" ) -ERROR( transaction_not_permitted, 1045, "Operation not permitted") -ERROR( cluster_not_fully_recovered, 1046, "Cluster not fully recovered") -ERROR( txn_exec_log_anti_quorum, 1047, "Execute Transaction not supported when log anti quorum is configured") ERROR( connection_unreferenced, 1048, "No peer references for connection" ) ERROR( connection_idle, 1049, "Connection closed after idle timeout" ) ERROR( disk_adapter_reset, 1050, "The disk queue adpater reset" ) @@ -206,6 +203,17 @@ ERROR( key_not_found, 2400, "Expected key is missing") ERROR( json_malformed, 2401, "JSON string was malformed") ERROR( json_eof_expected, 2402, "JSON string did not terminate where expected") +// 2500 - disk snapshot based backup errors +ERROR( snap_disable_tlog_pop_failed, 2500, "Failed to disable tlog pops") +ERROR( snap_storage_failed, 2501, "Failed to snapshot storage nodes") +ERROR( snap_tlog_failed, 2502, "Failed to snapshot TLog nodes") +ERROR( snap_coord_failed, 2503, "Failed to snapshot coordinator nodes") +ERROR( snap_enable_tlog_pop_failed, 2504, "Failed to enable tlog pops") +ERROR( snap_path_not_whitelisted, 2505, "Snapshot create 
binary path not whitelisted") +ERROR( snap_not_fully_recovered_unsupported, 2506, "Unsupported when the cluster is not fully recovered") +ERROR( snap_log_anti_quorum_unsupported, 2507, "Unsupported when log anti quorum is configured") +ERROR( snap_with_recovery_unsupported, 2508, "Cluster recovery during snapshot operation not supported") + // 4xxx Internal errors (those that should be generated only by bugs) are decimal 4xxx ERROR( unknown_error, 4000, "An unknown error occurred" ) // C++ exception not of type Error ERROR( internal_error, 4100, "An internal error occurred" )