/*
 * Resolver.actor.cpp
 *
 * This source file is part of the FoundationDB open source project
 *
 * Copyright 2013-2018 Apple Inc. and the FoundationDB project authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "flow/ActorCollection.h"
#include "fdbclient/NativeAPI.actor.h"
#include "fdbserver/ResolverInterface.h"
#include "fdbserver/MasterInterface.h"
#include "fdbserver/WorkerInterface.actor.h"
#include "fdbserver/WaitFailure.h"
#include "fdbserver/Knobs.h"
#include "fdbserver/ServerDBInfo.h"
#include "fdbserver/Orderer.actor.h"
#include "fdbserver/ConflictSet.h"
#include "fdbserver/StorageMetrics.h"
#include "fdbclient/SystemData.h"
#include "flow/actorcompiler.h"  // This must be the last #include.

namespace { struct ProxyRequestsInfo { std::map outstandingBatches; Version lastVersion; ProxyRequestsInfo() : lastVersion(-1) {} }; } namespace{ struct Resolver : ReferenceCounted { UID dbgid; int proxyCount, resolverCount; NotifiedVersion version; AsyncVar neededVersion; Map>> recentStateTransactions; Deque> recentStateTransactionSizes; AsyncVar totalStateBytes; AsyncTrigger checkNeededVersion; std::map proxyInfoMap; ConflictSet *conflictSet; TransientStorageMetricSample iopsSample; Version debugMinRecentStateVersion; CounterCollection cc; Counter resolveBatchIn; Counter resolveBatchStart; Counter resolvedTransactions; Counter resolvedBytes; Counter resolvedReadConflictRanges; Counter resolvedWriteConflictRanges; Counter transactionsAccepted; Counter transactionsTooOld; Counter transactionsConflicted; Counter resolvedStateTransactions; Counter resolvedStateMutations; Counter resolvedStateBytes; Counter resolveBatchOut; Counter metricsRequests; Counter splitRequests; Future logger; Resolver( UID dbgid, int proxyCount, int resolverCount ) : dbgid(dbgid), proxyCount(proxyCount), resolverCount(resolverCount), version(-1), conflictSet( newConflictSet() ), iopsSample( SERVER_KNOBS->KEY_BYTES_PER_SAMPLE ), debugMinRecentStateVersion(0), cc("Resolver", dbgid.toString()), resolveBatchIn("ResolveBatchIn", cc), resolveBatchStart("ResolveBatchStart", cc), resolvedTransactions("ResolvedTransactions", cc), resolvedBytes("ResolvedBytes", cc), resolvedReadConflictRanges("ResolvedReadConflictRanges", cc), resolvedWriteConflictRanges("ResolvedWriteConflictRanges", cc), transactionsAccepted("TransactionsAccepted", cc), transactionsTooOld("TransactionsTooOld", cc), transactionsConflicted("TransactionsConflicted", cc), resolvedStateTransactions("ResolvedStateTransactions", cc), resolvedStateMutations("ResolvedStateMutations", cc), resolvedStateBytes("ResolvedStateBytes", cc), resolveBatchOut("ResolveBatchOut", cc), metricsRequests("MetricsRequests", cc), 
splitRequests("SplitRequests", cc) { specialCounter(cc, "Version", [this](){ return this->version.get(); }); specialCounter(cc, "NeededVersion", [this](){ return this->neededVersion.get(); }); specialCounter(cc, "TotalStateBytes", [this](){ return this->totalStateBytes.get(); }); logger = traceCounters("ResolverMetrics", dbgid, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, &cc, "ResolverMetrics"); } ~Resolver() { destroyConflictSet( conflictSet ); } }; } // namespace ACTOR Future resolveBatch( Reference self, ResolveTransactionBatchRequest req) { state Optional debugID; // The first request (prevVersion < 0) comes from the master state NetworkAddress proxyAddress = req.prevVersion >= 0 ? req.reply.getEndpoint().getPrimaryAddress() : NetworkAddress(); state ProxyRequestsInfo &proxyInfo = self->proxyInfoMap[proxyAddress]; ++self->resolveBatchIn; if(req.debugID.present()) { debugID = nondeterministicRandom()->randomUniqueID(); g_traceBatch.addAttach("CommitAttachID", req.debugID.get().first(), debugID.get().first()); g_traceBatch.addEvent("CommitDebug",debugID.get().first(),"Resolver.resolveBatch.Before"); } /*TraceEvent("ResolveBatchStart", self->dbgid).detail("From", proxyAddress).detail("Version", req.version).detail("PrevVersion", req.prevVersion).detail("StateTransactions", req.txnStateTransactions.size()) .detail("RecentStateTransactions", self->recentStateTransactionSizes.size()).detail("LastVersion", proxyInfo.lastVersion).detail("FirstVersion", self->recentStateTransactionSizes.empty() ? 
-1 : self->recentStateTransactionSizes.front().first) .detail("ResolverVersion", self->version.get());*/ while( self->totalStateBytes.get() > SERVER_KNOBS->RESOLVER_STATE_MEMORY_LIMIT && self->recentStateTransactionSizes.size() && proxyInfo.lastVersion > self->recentStateTransactionSizes.front().first && req.version > self->neededVersion.get() ) { /*TraceEvent("ResolveBatchDelay").detail("From", proxyAddress).detail("StateBytes", self->totalStateBytes.get()).detail("RecentStateTransactionSize", self->recentStateTransactionSizes.size()) .detail("LastVersion", proxyInfo.lastVersion).detail("RequestVersion", req.version).detail("NeededVersion", self->neededVersion.get()) .detail("RecentStateVer", self->recentStateTransactions.begin()->key);*/ wait( self->totalStateBytes.onChange() || self->neededVersion.onChange() ); } if(debugID.present()) { g_traceBatch.addEvent("CommitDebug",debugID.get().first(),"Resolver.resolveBatch.AfterQueueSizeCheck"); } loop { if( self->recentStateTransactionSizes.size() && proxyInfo.lastVersion <= self->recentStateTransactionSizes.front().first ) { self->neededVersion.set( std::max(self->neededVersion.get(), req.prevVersion) ); } choose { when(wait(self->version.whenAtLeast(req.prevVersion))) { break; } when(wait(self->checkNeededVersion.onTrigger())) { } } } if (check_yield(TaskPriority::DefaultEndpoint)) { wait( delay( 0, TaskPriority::Low ) || delay( SERVER_KNOBS->COMMIT_SLEEP_TIME ) ); // FIXME: Is this still right? g_network->setCurrentTask(TaskPriority::DefaultEndpoint); } if (self->version.get() == req.prevVersion) { // Not a duplicate (check relies on no waiting between here and self->version.set() below!) 
++self->resolveBatchStart; self->resolvedTransactions += req.transactions.size(); self->resolvedBytes += req.transactions.expectedSize(); if(proxyInfo.lastVersion > 0) { proxyInfo.outstandingBatches.erase(proxyInfo.outstandingBatches.begin(), proxyInfo.outstandingBatches.upper_bound(req.lastReceivedVersion)); } Version firstUnseenVersion = proxyInfo.lastVersion + 1; proxyInfo.lastVersion = req.version; if(req.debugID.present()) g_traceBatch.addEvent("CommitDebug", debugID.get().first(), "Resolver.resolveBatch.AfterOrderer"); ResolveTransactionBatchReply& reply = proxyInfo.outstandingBatches[req.version]; vector commitList; vector tooOldList; // Detect conflicts double expire = now() + SERVER_KNOBS->SAMPLE_EXPIRATION_TIME; ConflictBatch conflictBatch(self->conflictSet, &reply.conflictingKeyRangeMap, &reply.arena); int keys = 0; for(int t=0; tresolvedReadConflictRanges += req.transactions[t].read_conflict_ranges.size(); self->resolvedWriteConflictRanges += req.transactions[t].write_conflict_ranges.size(); keys += req.transactions[t].write_conflict_ranges.size()*2 + req.transactions[t].read_conflict_ranges.size()*2; if(self->resolverCount > 1) { for(auto it : req.transactions[t].write_conflict_ranges) self->iopsSample.addAndExpire( it.begin, SERVER_KNOBS->SAMPLE_OFFSET_PER_KEY + it.begin.size(), expire ); for(auto it : req.transactions[t].read_conflict_ranges) self->iopsSample.addAndExpire( it.begin, SERVER_KNOBS->SAMPLE_OFFSET_PER_KEY + it.begin.size(), expire ); } } conflictBatch.detectConflicts( req.version, req.version - SERVER_KNOBS->MAX_WRITE_TRANSACTION_LIFE_VERSIONS, commitList, &tooOldList); reply.debugID = req.debugID; reply.committed.resize( reply.arena, req.transactions.size() ); for(int c=0; ctransactionsAccepted += commitList.size(); self->transactionsTooOld += tooOldList.size(); self->transactionsConflicted += req.transactions.size() - commitList.size() - tooOldList.size(); ASSERT(req.prevVersion >= 0 || req.txnStateTransactions.size() == 0); // The 
master's request should not have any state transactions auto& stateTransactions = self->recentStateTransactions[ req.version ]; int64_t stateMutations = 0; int64_t stateBytes = 0; for(int t : req.txnStateTransactions) { stateMutations += req.transactions[t].mutations.size(); stateBytes += req.transactions[t].mutations.expectedSize(); stateTransactions.push_back_deep(stateTransactions.arena(), StateTransactionRef(reply.committed[t] == ConflictBatch::TransactionCommitted, req.transactions[t].mutations)); } self->resolvedStateTransactions += req.txnStateTransactions.size(); self->resolvedStateMutations += stateMutations; self->resolvedStateBytes += stateBytes; if(stateBytes > 0) self->recentStateTransactionSizes.push_back(std::make_pair(req.version, stateBytes)); ASSERT(req.version >= firstUnseenVersion); ASSERT(firstUnseenVersion >= self->debugMinRecentStateVersion); TEST(firstUnseenVersion == req.version); // Resolver first unseen version is current version auto stateTransactionItr = self->recentStateTransactions.lower_bound(firstUnseenVersion); auto endItr = self->recentStateTransactions.lower_bound(req.version); for(; stateTransactionItr != endItr; ++stateTransactionItr) { reply.stateMutations.push_back( reply.arena, stateTransactionItr->value); reply.arena.dependsOn( stateTransactionItr->value.arena() ); } //TraceEvent("ResolveBatch", self->dbgid).detail("PrevVersion", req.prevVersion).detail("Version", req.version).detail("StateTransactionVersions", self->recentStateTransactionSizes.size()).detail("StateBytes", stateBytes).detail("FirstVersion", self->recentStateTransactionSizes.empty() ? -1 : self->recentStateTransactionSizes.front().first).detail("StateMutationsIn", req.txnStateTransactions.size()).detail("StateMutationsOut", reply.stateMutations.size()).detail("From", proxyAddress); ASSERT(!proxyInfo.outstandingBatches.empty()); ASSERT(self->proxyInfoMap.size() <= self->proxyCount+1); // SOMEDAY: This is O(n) in number of proxies. 
O(log n) solution using appropriate data structure? Version oldestProxyVersion = req.version; for(auto itr = self->proxyInfoMap.begin(); itr != self->proxyInfoMap.end(); ++itr) { //TraceEvent("ResolveBatchProxyVersion", self->dbgid).detail("Proxy", itr->first).detail("Version", itr->second.lastVersion); if(itr->first.isValid()) { // Don't consider the first master request oldestProxyVersion = std::min(itr->second.lastVersion, oldestProxyVersion); } else { // The master's request version should never prevent us from clearing recentStateTransactions ASSERT(self->debugMinRecentStateVersion == 0 || self->debugMinRecentStateVersion > itr->second.lastVersion); } } TEST(oldestProxyVersion == req.version); // The proxy that sent this request has the oldest current version TEST(oldestProxyVersion != req.version); // The proxy that sent this request does not have the oldest current version bool anyPopped = false; if(firstUnseenVersion <= oldestProxyVersion && self->proxyInfoMap.size() == self->proxyCount+1) { TEST(true); // Deleting old state transactions self->recentStateTransactions.erase( self->recentStateTransactions.begin(), self->recentStateTransactions.upper_bound( oldestProxyVersion ) ); self->debugMinRecentStateVersion = oldestProxyVersion + 1; while(self->recentStateTransactionSizes.size() && self->recentStateTransactionSizes.front().first <= oldestProxyVersion) { anyPopped = true; stateBytes -= self->recentStateTransactionSizes.front().second; self->recentStateTransactionSizes.pop_front(); } } self->version.set( req.version ); bool breachedLimit = self->totalStateBytes.get() <= SERVER_KNOBS->RESOLVER_STATE_MEMORY_LIMIT && self->totalStateBytes.get() + stateBytes > SERVER_KNOBS->RESOLVER_STATE_MEMORY_LIMIT; self->totalStateBytes.setUnconditional(self->totalStateBytes.get() + stateBytes); if(anyPopped || breachedLimit) { self->checkNeededVersion.trigger(); } if(req.debugID.present()) g_traceBatch.addEvent("CommitDebug", debugID.get().first(), 
"Resolver.resolveBatch.After"); } else { TEST(true); // Duplicate resolve batch request //TraceEvent("DupResolveBatchReq", self->dbgid).detail("From", proxyAddress); } auto proxyInfoItr = self->proxyInfoMap.find(proxyAddress); if(proxyInfoItr != self->proxyInfoMap.end()) { auto batchItr = proxyInfoItr->second.outstandingBatches.find(req.version); if(batchItr != proxyInfoItr->second.outstandingBatches.end()) { req.reply.send(batchItr->second); } else { TEST(true); // No outstanding batches for version on proxy req.reply.send(Never()); } } else { ASSERT_WE_THINK(false); // The first non-duplicate request with this proxyAddress, including this one, should have inserted this item in the map! //TEST(true); // No prior proxy requests req.reply.send(Never()); } ++self->resolveBatchOut; return Void(); } ACTOR Future resolverCore( ResolverInterface resolver, InitializeResolverRequest initReq) { state Reference self( new Resolver(resolver.id(), initReq.proxyCount, initReq.resolverCount) ); state ActorCollection actors(false); state Future doPollMetrics = self->resolverCount > 1 ? Void() : Future(Never()); actors.add( waitFailureServer(resolver.waitFailure.getFuture()) ); actors.add( traceRole(Role::RESOLVER, resolver.id()) ); TraceEvent("ResolverInit", resolver.id()).detail("RecoveryCount", initReq.recoveryCount); loop choose { when ( ResolveTransactionBatchRequest batch = waitNext( resolver.resolve.getFuture() ) ) { actors.add( resolveBatch(self, batch) ); } when ( ResolutionMetricsRequest req = waitNext( resolver.metrics.getFuture() ) ) { ++self->metricsRequests; req.reply.send(self->iopsSample.getEstimate(allKeys)); } when ( ResolutionSplitRequest req = waitNext( resolver.split.getFuture() ) ) { ++self->splitRequests; ResolutionSplitReply rep; rep.key = self->iopsSample.splitEstimate(req.range, req.offset, req.front); rep.used = self->iopsSample.getEstimate(req.front ? 
KeyRangeRef(req.range.begin, rep.key) : KeyRangeRef(rep.key, req.range.end)); req.reply.send(rep); } when ( wait( actors.getResult() ) ) {} when (wait(doPollMetrics) ) { self->iopsSample.poll(); doPollMetrics = delay(SERVER_KNOBS->SAMPLE_POLL_TIME); } } } ACTOR Future checkRemoved( Reference> db, uint64_t recoveryCount, ResolverInterface myInterface ) { loop { if ( db->get().recoveryCount >= recoveryCount && !std::count(db->get().resolvers.begin(), db->get().resolvers.end(), myInterface) ) throw worker_removed(); wait( db->onChange() ); } } ACTOR Future resolver( ResolverInterface resolver, InitializeResolverRequest initReq, Reference> db ) { try { state Future core = resolverCore( resolver, initReq ); loop choose { when( wait( core ) ) { return Void(); } when( wait( checkRemoved( db, initReq.recoveryCount, resolver ) ) ) {} } } catch (Error& e) { if (e.code() == error_code_actor_cancelled || e.code() == error_code_worker_removed) { TraceEvent("ResolverTerminated", resolver.id()).error(e,true); return Void(); } throw; } }