FastRestore:Debug trace for seg fault

Meng Xu 2020-08-17 20:34:33 -07:00
parent 01255b7ead
commit 7b7490efe7
7 changed files with 19 additions and 8 deletions

@@ -558,8 +558,8 @@ struct RestoreSendVersionedMutationsRequest : TimedRequest {
 	std::string toString() {
 		std::stringstream ss;
-		ss << "VersionBatchIndex:" << batchIndex << "RestoreAsset:" << asset.toString() << " msgIndex:" << msgIndex
-		   << " isRangeFile:" << isRangeFile << " versionedMutations.size:" << versionedMutations.size();
+		ss << "VersionBatchIndex:" << batchIndex << " msgIndex:" << msgIndex << " isRangeFile:" << isRangeFile
+		   << " versionedMutations.size:" << versionedMutations.size() << " RestoreAsset:" << asset.toString();
 		return ss.str();
 	}
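The reordering above is the debugging aid itself: asset.toString() moves to the end of the stream, which suggests it was the suspected source of the seg fault; with the cheap fields appended first, a crash inside the asset formatter leaves them visible in the partially built string. A minimal sketch of the pattern, with illustrative names not taken from the source:

    #include <sstream>
    #include <string>

    // Sketch: stream cheap, known-safe fields first and leave the suspect
    // accessor for last, so a crash inside it cannot hide the other values.
    std::string toStringForDebug(int batchIndex, int msgIndex, bool isRangeFile,
                                 const std::string& riskyAssetDescription) {
        std::stringstream ss;
        ss << "VersionBatchIndex:" << batchIndex << " msgIndex:" << msgIndex
           << " isRangeFile:" << isRangeFile;
        ss << " RestoreAsset:" << riskyAssetDescription; // suspected crash point goes last
        return ss.str();
    }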

@@ -651,6 +651,7 @@ void ServerKnobs::initialize(bool randomize, ClientKnobs* clientKnobs, bool isSi
 	init( FASTRESTORE_SCHED_LOAD_REQ_BATCHSIZE, 5 ); if( randomize && BUGGIFY ) { FASTRESTORE_SCHED_LOAD_REQ_BATCHSIZE = deterministicRandom()->random01() < 0.2 ? 1 : deterministicRandom()->random01() * 20 + 1;}
 	init( FASTRESTORE_SCHED_INFLIGHT_SENDPARAM_THRESHOLD, 10 ); if( randomize && BUGGIFY ) { FASTRESTORE_SCHED_INFLIGHT_SENDPARAM_THRESHOLD = deterministicRandom()->random01() < 0.2 ? 1 : deterministicRandom()->random01() * 30 + 1;}
 	init( FASTRESTORE_SCHED_SEND_FUTURE_VB_REQS_BATCH, 2 ); if( randomize && BUGGIFY ) { FASTRESTORE_SCHED_SEND_FUTURE_VB_REQS_BATCH = deterministicRandom()->random01() < 0.2 ? 1 : deterministicRandom()->random01() * 30 + 1;}
+	init( FASTRESTORE_NUM_TRACE_EVENTS, 100 ); if( randomize && BUGGIFY ) { FASTRESTORE_NUM_TRACE_EVENTS = deterministicRandom()->random01() < 0.2 ? 1 : deterministicRandom()->random01() * 500 + 1;}
 	init( REDWOOD_DEFAULT_PAGE_SIZE, 4096 );
 	init( REDWOOD_KVSTORE_CONCURRENT_READS, 64 );
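The new knob follows this file's BUGGIFY convention: under randomized simulation it collapses to the degenerate value 1 about 20% of the time and is otherwise drawn roughly uniformly from [1, 501), truncated to int. A standalone sketch of that distribution, substituting <random> for FDB's deterministicRandom(), which is not reproduced here:

    #include <cstdio>
    #include <random>

    int main() {
        std::mt19937 rng(42); // stand-in for FDB's seeded deterministicRandom()
        std::uniform_real_distribution<double> random01(0.0, 1.0);

        // Mirror the knob expression: 20% chance of 1, otherwise ~uniform
        // over [1, 501); assigning the double to int truncates.
        int numTraceEvents = random01(rng) < 0.2 ? 1 : random01(rng) * 500 + 1;
        std::printf("FASTRESTORE_NUM_TRACE_EVENTS = %d\n", numTraceEvents);
        return 0;
    }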

@@ -583,6 +583,7 @@ public:
 	int FASTRESTORE_SCHED_LOAD_REQ_BATCHSIZE; // number of load request to release at once
 	int FASTRESTORE_SCHED_INFLIGHT_SENDPARAM_THRESHOLD; // we can send future VB requests if it is less than this knob
 	int FASTRESTORE_SCHED_SEND_FUTURE_VB_REQS_BATCH; // number of future VB sendLoadingParam requests to process at once
+	int FASTRESTORE_NUM_TRACE_EVENTS;
 	int REDWOOD_DEFAULT_PAGE_SIZE; // Page size for new Redwood files
 	int REDWOOD_KVSTORE_CONCURRENT_READS; // Max number of simultaneous point or range reads in progress.

@@ -88,6 +88,7 @@ ACTOR Future<Void> restoreApplierCore(RestoreApplierInterface applierInterf, int
 				break;
 			}
 		}
+		TraceEvent("RestoreApplierCore", self->id()).detail("Request", requestTypeStr); // For debug only
 	} catch (Error& e) {
 		TraceEvent(SevWarn, "FastRestoreApplierError", self->id())
 		    .detail("RequestType", requestTypeStr)
@@ -121,7 +122,7 @@ ACTOR static Future<Void> handleSendMutationVectorRequest(RestoreSendVersionedMu
 	// Trace when the receive phase starts at a VB and when it finishes.
 	// This can help check if receiveMutations block applyMutation phase.
 	// If so, we need more sophisticated scheduler to ensure priority execution
-	printTrace = (batchData->receiveMutationReqs % 100 == 1);
+	printTrace = (batchData->receiveMutationReqs % SERVER_KNOBS->FASTRESTORE_NUM_TRACE_EVENTS == 0);
 	TraceEvent(printTrace ? SevInfo : SevFRDebugInfo, "FastRestoreApplierPhaseReceiveMutations", self->id())
 	    .detail("BatchIndex", req.batchIndex)
 	    .detail("RestoreAsset", req.asset.toString())
@@ -398,6 +399,7 @@ ACTOR static Future<Void> precomputeMutationsResult(Reference<ApplierBatchData>
 	std::map<Key, std::map<Key, StagingKey>::iterator> incompleteStagingKeys;
 	std::map<Key, StagingKey>::iterator stagingKeyIter = batchData->stagingKeys.begin();
 	int numKeysInBatch = 0;
+	int numGetTxns = 0;
 	double delayTime = 0; // Start transactions at different time to avoid overwhelming FDB.
 	for (; stagingKeyIter != batchData->stagingKeys.end(); stagingKeyIter++) {
 		if (!stagingKeyIter->second.hasBaseValue()) {
@@ -407,12 +409,14 @@ ACTOR static Future<Void> precomputeMutationsResult(Reference<ApplierBatchData>
 		if (numKeysInBatch == SERVER_KNOBS->FASTRESTORE_APPLIER_FETCH_KEYS_SIZE) {
 			fGetAndComputeKeys.push_back(getAndComputeStagingKeys(incompleteStagingKeys, delayTime, cx, applierID,
 			                                                      batchIndex, &batchData->counters));
+			numGetTxns++;
 			delayTime += SERVER_KNOBS->FASTRESTORE_TXN_EXTRA_DELAY;
 			numKeysInBatch = 0;
 			incompleteStagingKeys.clear();
 		}
 	}
 	if (numKeysInBatch > 0) {
+		numGetTxns++;
 		fGetAndComputeKeys.push_back(getAndComputeStagingKeys(incompleteStagingKeys, delayTime, cx, applierID,
 		                                                      batchIndex, &batchData->counters));
 	}
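This loop batches staging keys that lack a base value into getAndComputeStagingKeys transactions of FASTRESTORE_APPLIER_FETCH_KEYS_SIZE keys, staggering each transaction's start by FASTRESTORE_TXN_EXTRA_DELAY so the reads do not overwhelm the cluster; the new numGetTxns counter records how many such transactions were issued, for the trace event in the next hunk. A simplified, self-contained sketch of the same batch-and-stagger pattern, with generic strings in place of FDB staging keys:

    #include <cstdio>
    #include <string>
    #include <vector>

    // Sketch: flush keys into fixed-size batches, staggering each batch's
    // start delay; returns the number of batches (transactions) issued.
    int batchAndStagger(const std::vector<std::string>& keys, size_t batchSize,
                        double extraDelayPerTxn) {
        std::vector<std::string> batch;
        int numGetTxns = 0;
        double delayTime = 0; // later batches start later to spread the load
        for (const std::string& key : keys) {
            batch.push_back(key);
            if (batch.size() == batchSize) {
                std::printf("txn %d: %zu keys, delay %.1fs\n", numGetTxns, batch.size(), delayTime);
                numGetTxns++;
                delayTime += extraDelayPerTxn;
                batch.clear();
            }
        }
        if (!batch.empty()) { // final partial batch
            std::printf("txn %d: %zu keys, delay %.1fs\n", numGetTxns, batch.size(), delayTime);
            numGetTxns++;
        }
        return numGetTxns;
    }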
@@ -420,7 +424,8 @@ ACTOR static Future<Void> precomputeMutationsResult(Reference<ApplierBatchData>
 	TraceEvent("FastRestoreApplerPhasePrecomputeMutationsResult", applierID)
 	    .detail("BatchIndex", batchIndex)
 	    .detail("Step", "Compute the other staging keys")
-	    .detail("StagingKeys", batchData->stagingKeys.size());
+	    .detail("StagingKeys", batchData->stagingKeys.size())
+	    .detail("GetStagingKeyBatchTxns", numGetTxns);
 	// Pre-compute pendingMutations to other keys in stagingKeys that has base value
 	for (stagingKeyIter = batchData->stagingKeys.begin(); stagingKeyIter != batchData->stagingKeys.end();
 	     stagingKeyIter++) {

@@ -164,7 +164,10 @@ ACTOR Future<Void> recruitRestoreRoles(Reference<RestoreWorkerData> controllerWo
 			break;
 		}
-		TraceEvent("FastRestoreController", controllerData->id()).detail("WorkerNode", workerInterf.first);
+		TraceEvent("FastRestoreController", controllerData->id())
+		    .detail("WorkerNode", workerInterf.first)
+		    .detail("NodeRole", role)
+		    .detail("NodeIndex", nodeIndex);
 		requests.emplace_back(workerInterf.first,
 		                      RestoreRecruitRoleRequest(controllerWorker->controllerInterf.get(), role, nodeIndex));
 		nodeIndex++;

@@ -776,7 +776,7 @@ ACTOR Future<Void> sendMutationsToApplier(
 			    applierID, RestoreSendVersionedMutationsRequest(batchIndex, asset, msgIndex, isRangeFile,
 			                                                    applierVersionedMutationsBuffer[applierID]));
 		}
-		TraceEvent(SevDebug, "FastRestoreLoaderSendMutationToApplier")
+		TraceEvent(SevInfo, "FastRestoreLoaderSendMutationToApplier")
 		    .detail("MessageIndex", msgIndex)
 		    .detail("RestoreAsset", asset.toString())
 		    .detail("Requests", requests.size());
@@ -800,7 +800,7 @@ ACTOR Future<Void> sendMutationsToApplier(
 			                RestoreSendVersionedMutationsRequest(batchIndex, asset, msgIndex, isRangeFile,
 			                                                     applierVersionedMutationsBuffer[applierID]));
 		}
-		TraceEvent(SevDebug, "FastRestoreLoaderSendMutationToApplier")
+		TraceEvent(SevInfo, "FastRestoreLoaderSendMutationToApplier")
 		    .detail("MessageIndex", msgIndex)
 		    .detail("RestoreAsset", asset.toString())
 		    .detail("Requests", requests.size());

@@ -75,8 +75,9 @@ ACTOR Future<Void> handlerTerminateWorkerRequest(RestoreSimpleRequest req, Refer
 // Future: Multiple roles in a restore worker
 void handleRecruitRoleRequest(RestoreRecruitRoleRequest req, Reference<RestoreWorkerData> self,
                               ActorCollection* actors, Database cx) {
-	// Already recruited a role
+	// Future: Allow multiple restore roles on a restore worker. The design should easily allow this.
+	ASSERT(!self->loaderInterf.present() || !self->applierInterf.present()); // Only one role per worker for now
+	// Already recruited a role
 	if (self->loaderInterf.present()) {
 		ASSERT(req.role == RestoreRole::Loader);
 		req.reply.send(RestoreRecruitRoleReply(self->id(), RestoreRole::Loader, self->loaderInterf.get()));
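The new assertion pins down the invariant the recruitment code relies on: !loaderInterf.present() || !applierInterf.present() is De Morgan's rendering of "a worker never holds both roles at once". A tiny standalone check of that equivalence, with std::optional standing in for FDB's Optional:

    #include <cassert>
    #include <optional>

    int main() {
        std::optional<int> loaderInterf;       // stand-ins for the two role interfaces
        std::optional<int> applierInterf = 42; // exactly one role recruited

        // "Not both present" written two equivalent ways (De Morgan's law).
        assert((!loaderInterf.has_value() || !applierInterf.has_value()) ==
               !(loaderInterf.has_value() && applierInterf.has_value()));
        return 0;
    }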