FastRestore:LoaderScheduler:Add knobs

This commit is contained in:
Meng Xu 2020-08-15 21:40:24 -07:00
parent 7e302b5910
commit a035e7a872
4 changed files with 28 additions and 13 deletions

View File

@ -643,6 +643,11 @@ void ServerKnobs::initialize(bool randomize, ClientKnobs* clientKnobs, bool isSi
init( FASTRESTORE_USE_RANGE_FILE, true ); // Perf test only: set it to false will cause simulation failure
init( FASTRESTORE_USE_LOG_FILE, true ); // Perf test only: set it to false will cause simulation failure
init( FASTRESTORE_SAMPLE_MSG_BYTES, 1048576 ); if( randomize && BUGGIFY ) { FASTRESTORE_SAMPLE_MSG_BYTES = deterministicRandom()->random01() * 2048;}
init( FASTRESTORE_SCHED_UPDATE_DELAY, 0.5 ); if( randomize && BUGGIFY ) { FASTRESTORE_SCHED_UPDATE_DELAY = deterministicRandom()->random01() * 2;}
init( FASTRESTORE_SCHED_TARGET_CPU_PERCENT, 70 ); if( randomize && BUGGIFY ) { FASTRESTORE_SCHED_TARGET_CPU_PERCENT = deterministicRandom()->random01() * 100;}
init( FASTRESTORE_SCHED_MAX_CPU_PERCENT, 90 ); if( randomize && BUGGIFY ) { FASTRESTORE_SCHED_MAX_CPU_PERCENT = FASTRESTORE_SCHED_TARGET_CPU_PERCENT + deterministicRandom()->random01() * 100;}
init( FASTRESTORE_SCHED_INFLIGHT_LOAD_REQS, 20 ); if( randomize && BUGGIFY ) { FASTRESTORE_SCHED_INFLIGHT_LOAD_REQS = deterministicRandom()->random01() * 30;}
init( FASTRESTORE_SCHED_INFLIGHT_SEND_REQS, 3 ); if( randomize && BUGGIFY ) { FASTRESTORE_SCHED_INFLIGHT_SEND_REQS = deterministicRandom()->random01() < 0.2 ? 1 : deterministicRandom()->random01() * 5 + 1;}
init( REDWOOD_DEFAULT_PAGE_SIZE, 4096 );
init( REDWOOD_KVSTORE_CONCURRENT_READS, 64 );

View File

@ -575,6 +575,11 @@ public:
bool FASTRESTORE_USE_RANGE_FILE; // use range file in backup
bool FASTRESTORE_USE_LOG_FILE; // use log file in backup
int64_t FASTRESTORE_SAMPLE_MSG_BYTES; // sample message desired size
double FASTRESTORE_SCHED_UPDATE_DELAY; // delay in seconds before the loader scheduler refreshes its process metrics
int FASTRESTORE_SCHED_TARGET_CPU_PERCENT; // release as many requests as possible when cpu usage is below the knob
int FASTRESTORE_SCHED_MAX_CPU_PERCENT; // cpu percent at or above which the scheduler shall not release non-urgent requests
int FASTRESTORE_SCHED_INFLIGHT_LOAD_REQS; // number of inflight requests to load backup files
int FASTRESTORE_SCHED_INFLIGHT_SEND_REQS; // number of inflight requests for loaders to send mutations to appliers
int REDWOOD_DEFAULT_PAGE_SIZE; // Page size for new Redwood files
int REDWOOD_KVSTORE_CONCURRENT_READS; // Max number of simultaneous point or range reads in progress.

View File

@ -70,7 +70,8 @@ ACTOR Future<Void> dispatchRequests(Reference<RestoreLoaderData> self) {
while (!self->sendingQueue.empty()) {
const RestoreSendMutationsToAppliersRequest& req = self->sendingQueue.top();
// Dispatch the request if it is the next version batch to process or if cpu usage is low
if (req.batchIndex - 1 == self->finishedSendingVB || self->cpuUsage < 70) {
if (req.batchIndex - 1 == self->finishedSendingVB ||
self->cpuUsage < SERVER_KNOBS->FASTRESTORE_SCHED_TARGET_CPU_PERCENT) {
self->addActor.send(handleSendMutationsRequest(req, self));
self->sendingQueue.pop();
}
@ -78,29 +79,30 @@ ACTOR Future<Void> dispatchRequests(Reference<RestoreLoaderData> self) {
// and it takes large amount of resource
}
// When shall the node pause the process of more loading file requests
if (self->inflightSendingReqs >= 3 || (self->inflightSendingReqs >= 1 && self->cpuUsage >= 70) ||
self->cpuUsage >= 90) {
if (self->inflightSendingReqs >= 3) {
if ((self->inflightSendingReqs >= SERVER_KNOBS->FASTRESTORE_SCHED_INFLIGHT_SEND_REQS ||
(self->inflightSendingReqs >= 1 &&
self->cpuUsage >= SERVER_KNOBS->FASTRESTORE_SCHED_TARGET_CPU_PERCENT) ||
self->cpuUsage >= SERVER_KNOBS->FASTRESTORE_SCHED_MAX_CPU_PERCENT) &&
(self->inflightSendingReqs > 0 && self->inflightLoadingReqs > 0)) {
if (self->inflightSendingReqs >= SERVER_KNOBS->FASTRESTORE_SCHED_INFLIGHT_SEND_REQS) {
TraceEvent(SevWarn, "FastRestoreLoaderTooManyInflightSendingMutationRequests")
.detail("VersionBatchesBlockedAtSendingMutationsToAppliers", self->inflightSendingReqs)
.detail("Reason", "Sending mutations is too slow");
}
wait(delay(0.5)); // TODO: Knob
wait(delay(SERVER_KNOBS->FASTRESTORE_SCHED_UPDATE_DELAY));
updateProcessStats(self);
continue;
}
// Dispatch loading backup file requests
int releasedReq = 0;
while (!self->loadingQueue.empty()) {
const RestoreLoadFileRequest& req = self->loadingQueue.top();
self->addActor.send(handleLoadFileRequest(req, self));
++releasedReq;
self->loadingQueue.pop();
if (releasedReq > 10) { // TODO: Knob
if (self->inflightLoadingReqs > SERVER_KNOBS->FASTRESTORE_SCHED_INFLIGHT_LOAD_REQS) {
break;
}
const RestoreLoadFileRequest& req = self->loadingQueue.top();
self->addActor.send(handleLoadFileRequest(req, self));
self->loadingQueue.pop();
}
if (self->cpuUsage >= 70) {
if (self->cpuUsage >= SERVER_KNOBS->FASTRESTORE_SCHED_TARGET_CPU_PERCENT) {
wait(delay(0.1));
updateProcessStats(self);
}
@ -439,6 +441,7 @@ ACTOR Future<Void> handleLoadFileRequest(RestoreLoadFileRequest req, Reference<R
ASSERT(batchData->sampleMutations.find(req.param) == batchData->sampleMutations.end());
batchData->processedFileParams[req.param] =
_processLoadingParam(&self->rangeVersions, req.param, batchData, self->id(), self->bc);
self->inflightLoadingReqs++;
isDuplicated = false;
} else {
TraceEvent(SevFRDebugInfo, "FastRestoreLoadFile", self->id())
@ -483,6 +486,7 @@ ACTOR Future<Void> handleLoadFileRequest(RestoreLoadFileRequest req, Reference<R
}
// Ack restore controller the param is processed
self->inflightLoadingReqs--;
req.reply.send(RestoreLoadFileReply(req.param, isDuplicated));
TraceEvent(printTrace ? SevInfo : SevFRDebugInfo, "FastRestoreLoaderPhaseLoadFileDone", self->id())
.detail("BatchIndex", req.batchIndex)

View File

@ -146,6 +146,7 @@ struct RestoreLoaderData : RestoreRoleData, public ReferenceCounted<RestoreLoade
int finishedLoadingVB; // the max version batch index that finished loading file phase
int finishedSendingVB; // the max version batch index that finished sending mutations phase
int inflightSendingReqs; // number of sendingMutations requests released
int inflightLoadingReqs; // number of load backup file requests released; adjusted in handleLoadFileRequest
// addActor: add to actorCollection so that when an actor has error, the ActorCollection can catch the error.
// addActor is used to create the actorCollection when the RestoreController is created
@ -155,7 +156,7 @@ struct RestoreLoaderData : RestoreRoleData, public ReferenceCounted<RestoreLoade
void delref() { return ReferenceCounted<RestoreLoaderData>::delref(); }
explicit RestoreLoaderData(UID loaderInterfID, int assignedIndex, RestoreControllerInterface ci)
: ci(ci), finishedLoadingVB(0), finishedSendingVB(0) {
: ci(ci), finishedLoadingVB(0), finishedSendingVB(0), inflightSendingReqs(0), inflightLoadingReqs(0) {
nodeID = loaderInterfID;
nodeIndex = assignedIndex;
role = RestoreRole::Loader;