/*
 * NativeAPI.actor.cpp
 *
 * This source file is part of the FoundationDB open source project
 *
 * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "fdbclient/NativeAPI.actor.h"

#include <algorithm>
#include <iterator>
#include <memory>
#include <regex>
#include <unordered_set>
#include <tuple>
#include <utility>
#include <vector>

#include "contrib/fmt-8.1.1/include/fmt/format.h"
#include "fdbclient/FDBOptions.g.h"
#include "fdbclient/FDBTypes.h"
#include "fdbrpc/FailureMonitor.h"
#include "fdbrpc/MultiInterface.h"

#include "fdbclient/ActorLineageProfiler.h"
#include "fdbclient/AnnotateActor.h"
#include "fdbclient/Atomic.h"
#include "fdbclient/BlobGranuleCommon.h"
#include "fdbclient/ClusterInterface.h"
#include "fdbclient/ClusterConnectionFile.h"
#include "fdbclient/CoordinationInterface.h"
#include "fdbclient/DatabaseContext.h"
#include "fdbclient/GlobalConfig.actor.h"
#include "fdbclient/IKnobCollection.h"
#include "fdbclient/JsonBuilder.h"
#include "fdbclient/KeyBackedTypes.h"
#include "fdbclient/KeyRangeMap.h"
#include "fdbclient/ManagementAPI.actor.h"
#include "fdbclient/NameLineage.h"
#include "fdbclient/CommitProxyInterface.h"
#include "fdbclient/MonitorLeader.h"
#include "fdbclient/MutationList.h"
#include "fdbclient/ReadYourWrites.h"
#include "fdbclient/ParallelStream.actor.h"
#include "fdbclient/SpecialKeySpace.actor.h"
#include "fdbclient/StorageServerInterface.h"
#include "fdbclient/SystemData.h"
#include "fdbclient/TransactionLineage.h"
#include "fdbclient/versions.h"
#include "fdbclient/WellKnownEndpoints.h"
#include "fdbrpc/LoadBalance.h"
#include "fdbrpc/Net2FileSystem.h"
#include "fdbrpc/simulator.h"
#include "fdbrpc/sim_validation.h"
#include "flow/Arena.h"
#include "flow/ActorCollection.h"
#include "flow/DeterministicRandom.h"
#include "flow/Error.h"
#include "flow/FastRef.h"
#include "flow/IRandom.h"
#include "flow/Trace.h"
#include "flow/ProtocolVersion.h"
#include "flow/flow.h"
#include "flow/genericactors.actor.h"
#include "flow/Knobs.h"
#include "flow/Platform.h"
#include "flow/SystemMonitor.h"
#include "flow/TLSConfig.actor.h"
#include "flow/Tracing.h"
#include "flow/UnitTest.h"
#include "flow/network.h"
#include "flow/serialize.h"

#ifdef ADDRESS_SANITIZER
#include <sanitizer/lsan_interface.h>
#endif

#ifdef WIN32
#define WIN32_LEAN_AND_MEAN
#include <Windows.h>
#undef min
#undef max
#else
#include <time.h>
#endif

#include "flow/actorcompiler.h" // This must be the last #include.
extern const char* getSourceVersion();

namespace {

TransactionLineageCollector transactionLineageCollector;
NameLineageCollector nameLineageCollector;

template <class Interface, class Request>
Future<REPLY_TYPE(Request)> loadBalance(
    DatabaseContext* ctx,
    const Reference<LocationInfo> alternatives,
    RequestStream<Request> Interface::*channel,
    const Request& request = Request(),
    TaskPriority taskID = TaskPriority::DefaultPromiseEndpoint,
    AtMostOnce atMostOnce =
        AtMostOnce::False, // if true, throws request_maybe_delivered() instead of retrying automatically
    QueueModel* model = nullptr) {
    if (alternatives->hasCaches) {
        return loadBalance(alternatives->locations(), channel, request, taskID, atMostOnce, model);
    }
    return fmap(
        [ctx](auto const& res) {
            if (res.cached) {
                ctx->updateCache.trigger();
            }
            return res;
        },
        loadBalance(alternatives->locations(), channel, request, taskID, atMostOnce, model));
}
} // namespace

FDB_BOOLEAN_PARAM(TransactionRecordLogInfo);
FDB_DEFINE_BOOLEAN_PARAM(UseProvisionalProxies);

// Used to determine whether or not client will load balance based on the number of GRVs released by each proxy
FDB_DEFINE_BOOLEAN_PARAM(BalanceOnRequests);

// Whether or not a request should include the tenant name
FDB_BOOLEAN_PARAM(UseTenant);

NetworkOptions networkOptions;
TLSConfig tlsConfig(TLSEndpointType::CLIENT);

// The default values, TRACE_DEFAULT_ROLL_SIZE and TRACE_DEFAULT_MAX_LOGS_SIZE are located in Trace.h.
NetworkOptions::NetworkOptions()
  : traceRollSize(TRACE_DEFAULT_ROLL_SIZE), traceMaxLogsSize(TRACE_DEFAULT_MAX_LOGS_SIZE), traceLogGroup("default"),
    traceFormat("xml"), traceClockSource("now"),
    supportedVersions(new ReferencedObject<Standalone<VectorRef<ClientVersionRef>>>()), runLoopProfilingEnabled(false),
    primaryClient(true) {}

static const Key CLIENT_LATENCY_INFO_PREFIX = LiteralStringRef("client_latency/");
static const Key CLIENT_LATENCY_INFO_CTR_PREFIX = LiteralStringRef("client_latency_counter/");

void DatabaseContext::addTssMapping(StorageServerInterface const& ssi, StorageServerInterface const& tssi) {
    auto result = tssMapping.find(ssi.id());
    // Update tss endpoint mapping if ss isn't in mapping, or the interface it mapped to changed
    if (result == tssMapping.end() ||
        result->second.getValue.getEndpoint().token.first() != tssi.getValue.getEndpoint().token.first()) {
        Reference<TSSMetrics> metrics;
        if (result == tssMapping.end()) {
            // new TSS pairing
            metrics = makeReference<TSSMetrics>();
            tssMetrics[tssi.id()] = metrics;
            tssMapping[ssi.id()] = tssi;
        } else {
            if (result->second.id() == tssi.id()) {
                metrics = tssMetrics[tssi.id()];
            } else {
                TEST(true); // SS now maps to new TSS! This will probably never happen in practice
                tssMetrics.erase(result->second.id());
                metrics = makeReference<TSSMetrics>();
                tssMetrics[tssi.id()] = metrics;
            }
            result->second = tssi;
        }

        // data requests duplicated for load and data comparison
        queueModel.updateTssEndpoint(ssi.getValue.getEndpoint().token.first(),
                                     TSSEndpointData(tssi.id(), tssi.getValue.getEndpoint(), metrics));
        queueModel.updateTssEndpoint(ssi.getKey.getEndpoint().token.first(),
                                     TSSEndpointData(tssi.id(), tssi.getKey.getEndpoint(), metrics));
        queueModel.updateTssEndpoint(ssi.getKeyValues.getEndpoint().token.first(),
                                     TSSEndpointData(tssi.id(), tssi.getKeyValues.getEndpoint(), metrics));
        queueModel.updateTssEndpoint(ssi.getMappedKeyValues.getEndpoint().token.first(),
                                     TSSEndpointData(tssi.id(), tssi.getMappedKeyValues.getEndpoint(), metrics));
        queueModel.updateTssEndpoint(ssi.getKeyValuesStream.getEndpoint().token.first(),
                                     TSSEndpointData(tssi.id(), tssi.getKeyValuesStream.getEndpoint(), metrics));

        // non-data requests duplicated for load
        queueModel.updateTssEndpoint(ssi.watchValue.getEndpoint().token.first(),
                                     TSSEndpointData(tssi.id(), tssi.watchValue.getEndpoint(), metrics));
        queueModel.updateTssEndpoint(ssi.splitMetrics.getEndpoint().token.first(),
                                     TSSEndpointData(tssi.id(), tssi.splitMetrics.getEndpoint(), metrics));
        queueModel.updateTssEndpoint(ssi.getReadHotRanges.getEndpoint().token.first(),
                                     TSSEndpointData(tssi.id(), tssi.getReadHotRanges.getEndpoint(), metrics));
        queueModel.updateTssEndpoint(ssi.getRangeSplitPoints.getEndpoint().token.first(),
                                     TSSEndpointData(tssi.id(), tssi.getRangeSplitPoints.getEndpoint(), metrics));
    }
}

void DatabaseContext::removeTssMapping(StorageServerInterface const& ssi) {
    auto result = tssMapping.find(ssi.id());
    if (result != tssMapping.end()) {
        tssMetrics.erase(ssi.id());
        tssMapping.erase(result);
        queueModel.removeTssEndpoint(ssi.getValue.getEndpoint().token.first());
        queueModel.removeTssEndpoint(ssi.getKey.getEndpoint().token.first());
        queueModel.removeTssEndpoint(ssi.getKeyValues.getEndpoint().token.first());
        queueModel.removeTssEndpoint(ssi.getMappedKeyValues.getEndpoint().token.first());
        queueModel.removeTssEndpoint(ssi.getKeyValuesStream.getEndpoint().token.first());

        queueModel.removeTssEndpoint(ssi.watchValue.getEndpoint().token.first());
        queueModel.removeTssEndpoint(ssi.splitMetrics.getEndpoint().token.first());
        queueModel.removeTssEndpoint(ssi.getReadHotRanges.getEndpoint().token.first());
        queueModel.removeTssEndpoint(ssi.getRangeSplitPoints.getEndpoint().token.first());
    }
}

void DatabaseContext::addSSIdTagMapping(const UID& uid, const Tag& tag) {
    ssidTagMapping[uid] = tag;
}

void DatabaseContext::getLatestCommitVersions(const Reference<LocationInfo>& locationInfo,
                                              Version readVersion,
                                              Reference<TransactionState> info,
                                              VersionVector& latestCommitVersions) {
    latestCommitVersions.clear();
    if (info->debugID.present()) {
        g_traceBatch.addEvent("TransactionDebug", info->debugID.get().first(), "NativeAPI.getLatestCommitVersions");
    }
    if (!info->readVersionObtainedFromGrvProxy) {
        return;
    }
    if (ssVersionVectorCache.getMaxVersion() != invalidVersion && readVersion > ssVersionVectorCache.getMaxVersion()) {
        TraceEvent(SevDebug, "GetLatestCommitVersions")
            .detail("ReadVersion", readVersion)
            .detail("VersionVector", ssVersionVectorCache.toString());
        ssVersionVectorCache.clear();
        throw stale_version_vector(); // TODO: investigate why
    }

    std::map<Version, std::set<Tag>> versionMap; // order the versions to be returned
    for (int i = 0; i < locationInfo->locations()->size(); i++) {
        UID uid = locationInfo->locations()->getId(i);
        if (ssidTagMapping.find(uid) != ssidTagMapping.end()) {
            Tag tag = ssidTagMapping[uid];
            if (ssVersionVectorCache.hasVersion(tag)) {
                Version commitVersion = ssVersionVectorCache.getVersion(tag); // latest commit version
                if (commitVersion < readVersion) {
                    versionMap[commitVersion].insert(tag);
                }
            }
        }
    }

    // insert the commit versions in the version vector.
    for (auto& iter : versionMap) {
        latestCommitVersions.setVersion(iter.second, iter.first);
    }
}

void updateCachedReadVersionShared(double t, Version v, DatabaseSharedState* p) {
    MutexHolder mutex(p->mutexLock);
    if (v >= p->grvCacheSpace.cachedReadVersion) {
        TraceEvent(SevDebug, "CacheReadVersionUpdate")
            .detail("Version", v)
            .detail("CurTime", t)
            .detail("LastVersion", p->grvCacheSpace.cachedReadVersion)
            .detail("LastTime", p->grvCacheSpace.lastGrvTime);
        p->grvCacheSpace.cachedReadVersion = v;
        if (t > p->grvCacheSpace.lastGrvTime) {
            p->grvCacheSpace.lastGrvTime = t;
        }
    }
}

void DatabaseContext::updateCachedReadVersion(double t, Version v) {
    if (sharedStatePtr) {
        return updateCachedReadVersionShared(t, v, sharedStatePtr);
    }
    if (v >= cachedReadVersion) {
        TraceEvent(SevDebug, "CachedReadVersionUpdate")
            .detail("Version", v)
            .detail("GrvStartTime", t)
            .detail("LastVersion", cachedReadVersion)
            .detail("LastTime", lastGrvTime);
        cachedReadVersion = v;
        // Since the time is based on the start of the request, it's possible that we
        // get a newer version with an older time.
        // (Request started earlier, but was latest to reach the proxy)
        // Only update time when strictly increasing (?)
        if (t > lastGrvTime) {
            lastGrvTime = t;
        }
    }
}

Version DatabaseContext::getCachedReadVersion() {
    if (sharedStatePtr) {
        MutexHolder mutex(sharedStatePtr->mutexLock);
        return sharedStatePtr->grvCacheSpace.cachedReadVersion;
    }
    return cachedReadVersion;
}

double DatabaseContext::getLastGrvTime() {
    if (sharedStatePtr) {
        MutexHolder mutex(sharedStatePtr->mutexLock);
        return sharedStatePtr->grvCacheSpace.lastGrvTime;
    }
    return lastGrvTime;
}
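// Note: the cached GRV state above lives either on this DatabaseContext or, when sharedStatePtr is
// set, in a process-wide DatabaseSharedState guarded by mutexLock. Versions only move forward, and
// because the timestamp records the *start* of the GRV request, a newer version can legitimately
// carry an older time; that is why version and time are advanced independently.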
Reference<StorageServerInfo> StorageServerInfo::getInterface(DatabaseContext* cx,
                                                             StorageServerInterface const& ssi,
                                                             LocalityData const& locality) {
    auto it = cx->server_interf.find(ssi.id());
    if (it != cx->server_interf.end()) {
        if (it->second->interf.getValue.getEndpoint().token != ssi.getValue.getEndpoint().token) {
            if (it->second->interf.locality == ssi.locality) {
                // FIXME: load balance holds pointers to individual members of the interface, and this assignment will
                // swap out the object they are pointing to. This is technically correct, but is very unnatural. We
                // may want to refactor load balance to take an AsyncVar<Reference<Interface>> so that it is notified
                // when the interface changes.
                it->second->interf = ssi;
            } else {
                it->second->notifyContextDestroyed();
                Reference<StorageServerInfo> loc(new StorageServerInfo(cx, ssi, locality));
                cx->server_interf[ssi.id()] = loc.getPtr();
                return loc;
            }
        }

        return Reference<StorageServerInfo>::addRef(it->second);
    }

    Reference<StorageServerInfo> loc(new StorageServerInfo(cx, ssi, locality));
    cx->server_interf[ssi.id()] = loc.getPtr();
    return loc;
}

void StorageServerInfo::notifyContextDestroyed() {
    cx = nullptr;
}

StorageServerInfo::~StorageServerInfo() {
    if (cx) {
        auto it = cx->server_interf.find(interf.id());
        if (it != cx->server_interf.end())
            cx->server_interf.erase(it);
        cx = nullptr;
    }
}

std::string printable(const VectorRef<KeyValueRef>& val) {
    std::string s;
    for (int i = 0; i < val.size(); i++)
        s = s + printable(val[i].key) + format(":%d ", val[i].value.size());
    return s;
}

std::string printable(const KeyValueRef& val) {
    return printable(val.key) + format(":%d ", val.value.size());
}

std::string printable(const VectorRef<StringRef>& val) {
    std::string s;
    for (int i = 0; i < val.size(); i++)
        s = s + printable(val[i]) + " ";
    return s;
}

std::string printable(const StringRef& val) {
    return val.printable();
}

std::string printable(const std::string& str) {
    return StringRef(str).printable();
}

std::string printable(const KeyRangeRef& range) {
    return printable(range.begin) + " - " + printable(range.end);
}

std::string printable(const VectorRef<KeyRangeRef>& val) {
    std::string s;
    for (int i = 0; i < val.size(); i++)
        s = s + printable(val[i]) + " ";
    return s;
}

int unhex(char c) {
    if (c >= '0' && c <= '9')
        return c - '0';
    if (c >= 'a' && c <= 'f')
        return c - 'a' + 10;
    if (c >= 'A' && c <= 'F')
        return c - 'A' + 10;
    UNREACHABLE();
}

std::string unprintable(std::string const& val) {
    std::string s;
    for (int i = 0; i < val.size(); i++) {
        char c = val[i];
        if (c == '\\') {
            if (++i == val.size())
                ASSERT(false);
            if (val[i] == '\\') {
                s += '\\';
            } else if (val[i] == 'x') {
                if (i + 2 >= val.size())
                    ASSERT(false);
                s += char((unhex(val[i + 1]) << 4) + unhex(val[i + 2]));
                i += 2;
            } else
                ASSERT(false);
        } else
            s += c;
    }
    return s;
}

void DatabaseContext::validateVersion(Version version) const {
    // Version could be 0 if the INITIALIZE_NEW_DATABASE option is set. In that case, it is illegal to perform any
    // reads. We throw client_invalid_operation because the caller didn't directly set the version, so the
    // version_invalid error might be confusing.
    if (version == 0) {
        throw client_invalid_operation();
    }
    if (switchable && version < minAcceptableReadVersion) {
        TEST(true); // Attempted to read a version lower than any this client has seen from the current cluster
        throw transaction_too_old();
    }

    ASSERT(version > 0 || version == latestVersion);
}

void validateOptionValuePresent(Optional<StringRef> value) {
    if (!value.present()) {
        throw invalid_option_value();
    }
}

void validateOptionValueNotPresent(Optional<StringRef> value) {
    if (value.present() && value.get().size() > 0) {
        throw invalid_option_value();
    }
}

void dumpMutations(const MutationListRef& mutations) {
    for (auto m = mutations.begin(); m; ++m) {
        switch (m->type) {
        case MutationRef::SetValue:
            printf("  '%s' := '%s'\n", printable(m->param1).c_str(), printable(m->param2).c_str());
            break;
        case MutationRef::AddValue:
            printf("  '%s' += '%s'", printable(m->param1).c_str(), printable(m->param2).c_str());
            break;
        case MutationRef::ClearRange:
            printf("  Clear ['%s','%s')\n", printable(m->param1).c_str(), printable(m->param2).c_str());
            break;
        default:
            printf("  Unknown mutation %d('%s','%s')\n",
                   m->type,
                   printable(m->param1).c_str(),
                   printable(m->param2).c_str());
            break;
        }
    }
}

template <>
void addref(DatabaseContext* ptr) {
    ptr->addref();
}
template <>
void delref(DatabaseContext* ptr) {
    ptr->delref();
}

void traceTSSErrors(const char* name, UID tssId, const std::unordered_map<int, uint64_t>& errorsByCode) {
    TraceEvent ev(name, tssId);
    for (auto& it : errorsByCode) {
        ev.detail("E" + std::to_string(it.first), it.second);
    }
}

/*
    For each request type, this will produce
    <Type>Count
    <Type>{SS,TSS}{Mean,P50,P90,P99}
    Example:
    GetValueLatencySSMean
*/
void traceSSOrTSSPercentiles(TraceEvent& ev, const std::string name, ContinuousSample<double>& sample) {
    ev.detail(name + "Mean", sample.mean());
    // don't log the larger percentiles unless we actually have enough samples to log the accurate percentile instead
    // of the largest sample in this window
    if (sample.getPopulationSize() >= 3) {
        ev.detail(name + "P50", sample.median());
    }
    if (sample.getPopulationSize() >= 10) {
        ev.detail(name + "P90", sample.percentile(0.90));
    }
    if (sample.getPopulationSize() >= 100) {
        ev.detail(name + "P99", sample.percentile(0.99));
    }
}

void traceTSSPercentiles(TraceEvent& ev,
                         const std::string name,
                         ContinuousSample<double>& ssSample,
                         ContinuousSample<double>& tssSample) {
    ASSERT(ssSample.getPopulationSize() == tssSample.getPopulationSize());
    ev.detail(name + "Count", ssSample.getPopulationSize());
    if (ssSample.getPopulationSize() > 0) {
        traceSSOrTSSPercentiles(ev, name + "SS", ssSample);
        traceSSOrTSSPercentiles(ev, name + "TSS", tssSample);
    }
}
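// Note: TSS here stands for Testing Storage Server. Each TSS shadows one storage server; the client
// sends the same requests to both (see addTssMapping above), and the logging below compares the
// latency samples and error histograms of each SS/TSS pair.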
ACTOR Future<Void> tssLogger(DatabaseContext* cx) {
    state double lastLogged = 0;
    loop {
        wait(delay(CLIENT_KNOBS->TSS_METRICS_LOGGING_INTERVAL, TaskPriority::FlushTrace));

        // Log each TSS pair separately
        for (const auto& it : cx->tssMetrics) {
            if (it.second->detailedMismatches.size()) {
                cx->tssMismatchStream.send(
                    std::pair<UID, std::vector<DetailedTSSMismatch>>(it.first, it.second->detailedMismatches));
            }

            // Do error histograms as separate event
            if (it.second->ssErrorsByCode.size()) {
                traceTSSErrors("TSS_SSErrors", it.first, it.second->ssErrorsByCode);
            }

            if (it.second->tssErrorsByCode.size()) {
                traceTSSErrors("TSS_TSSErrors", it.first, it.second->tssErrorsByCode);
            }

            TraceEvent tssEv("TSSClientMetrics", cx->dbId);
            tssEv.detail("TSSID", it.first)
                .detail("Elapsed", (lastLogged == 0) ? 0 : now() - lastLogged)
                .detail("Internal", cx->internal);

            it.second->cc.logToTraceEvent(tssEv);

            traceTSSPercentiles(tssEv, "GetValueLatency", it.second->SSgetValueLatency, it.second->TSSgetValueLatency);
            traceTSSPercentiles(
                tssEv, "GetKeyValuesLatency", it.second->SSgetKeyValuesLatency, it.second->TSSgetKeyValuesLatency);
            traceTSSPercentiles(tssEv, "GetKeyLatency", it.second->SSgetKeyLatency, it.second->TSSgetKeyLatency);
            traceTSSPercentiles(tssEv,
                                "GetMappedKeyValuesLatency",
                                it.second->SSgetMappedKeyValuesLatency,
                                it.second->TSSgetMappedKeyValuesLatency);

            it.second->clear();
        }

        lastLogged = now();
    }
}

ACTOR Future<Void> databaseLogger(DatabaseContext* cx) {
    state double lastLogged = 0;
    loop {
        wait(delay(CLIENT_KNOBS->SYSTEM_MONITOR_INTERVAL, TaskPriority::FlushTrace));

        if (!g_network->isSimulated()) {
            TraceEvent ev("TransactionMetrics", cx->dbId);

            ev.detail("Elapsed", (lastLogged == 0) ? 0 : now() - lastLogged)
                .detail("Cluster",
                        cx->getConnectionRecord()
                            ? cx->getConnectionRecord()->getConnectionString().clusterKeyName().toString()
                            : "")
                .detail("Internal", cx->internal);

            cx->cc.logToTraceEvent(ev);

            ev.detail("LocationCacheEntryCount", cx->locationCache.size());
            ev.detail("MeanLatency", cx->latencies.mean())
                .detail("MedianLatency", cx->latencies.median())
                .detail("Latency90", cx->latencies.percentile(0.90))
                .detail("Latency98", cx->latencies.percentile(0.98))
                .detail("MaxLatency", cx->latencies.max())
                .detail("MeanRowReadLatency", cx->readLatencies.mean())
                .detail("MedianRowReadLatency", cx->readLatencies.median())
                .detail("MaxRowReadLatency", cx->readLatencies.max())
                .detail("MeanGRVLatency", cx->GRVLatencies.mean())
                .detail("MedianGRVLatency", cx->GRVLatencies.median())
                .detail("MaxGRVLatency", cx->GRVLatencies.max())
                .detail("MeanCommitLatency", cx->commitLatencies.mean())
                .detail("MedianCommitLatency", cx->commitLatencies.median())
                .detail("MaxCommitLatency", cx->commitLatencies.max())
                .detail("MeanMutationsPerCommit", cx->mutationsPerCommit.mean())
                .detail("MedianMutationsPerCommit", cx->mutationsPerCommit.median())
                .detail("MaxMutationsPerCommit", cx->mutationsPerCommit.max())
                .detail("MeanBytesPerCommit", cx->bytesPerCommit.mean())
                .detail("MedianBytesPerCommit", cx->bytesPerCommit.median())
                .detail("MaxBytesPerCommit", cx->bytesPerCommit.max())
                .detail("NumLocalityCacheEntries", cx->locationCache.size());
            if (cx->anyBlobGranuleRequests) {
                ev.detail("MeanBGLatency", cx->bgLatencies.mean())
                    .detail("MedianBGLatency", cx->bgLatencies.median())
                    .detail("MaxBGLatency", cx->bgLatencies.max())
                    .detail("MeanBGGranulesPerRequest", cx->bgGranulesPerRequest.mean())
                    .detail("MedianBGGranulesPerRequest", cx->bgGranulesPerRequest.median())
                    .detail("MaxBGGranulesPerRequest", cx->bgGranulesPerRequest.max());
            }
        }

        cx->latencies.clear();
        cx->readLatencies.clear();
        cx->GRVLatencies.clear();
        cx->commitLatencies.clear();
        cx->mutationsPerCommit.clear();
        cx->bytesPerCommit.clear();
        cx->bgLatencies.clear();
        cx->bgGranulesPerRequest.clear();

        lastLogged = now();
    }
}

struct TrInfoChunk {
    ValueRef value;
    Key key;
};

ACTOR static Future<Void> transactionInfoCommitActor(Transaction* tr, std::vector<TrInfoChunk>* chunks) {
    state const Key clientLatencyAtomicCtr = CLIENT_LATENCY_INFO_CTR_PREFIX.withPrefix(fdbClientInfoPrefixRange.begin);
    state int retryCount = 0;
    loop {
        try {
            tr->reset();
            tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
            tr->setOption(FDBTransactionOptions::LOCK_AWARE);
            state Future<Standalone<StringRef>> vstamp = tr->getVersionstamp();
            int64_t numCommitBytes = 0;
            for (auto& chunk : *chunks) {
                tr->atomicOp(chunk.key, chunk.value, MutationRef::SetVersionstampedKey);
                numCommitBytes += chunk.key.size() + chunk.value.size() -
                                  4; // subtract number of bytes of key that denotes version stamp index
            }
            tr->atomicOp(clientLatencyAtomicCtr, StringRef((uint8_t*)&numCommitBytes, 8), MutationRef::AddValue);
            wait(tr->commit());
            return Void();
        } catch (Error& e) {
            retryCount++;
            if (retryCount == 10)
                throw;
            wait(tr->onError(e));
        }
    }
}

ACTOR static Future<Void> delExcessClntTxnEntriesActor(Transaction* tr, int64_t clientTxInfoSizeLimit) {
    state const Key clientLatencyName = CLIENT_LATENCY_INFO_PREFIX.withPrefix(fdbClientInfoPrefixRange.begin);
    state const Key clientLatencyAtomicCtr = CLIENT_LATENCY_INFO_CTR_PREFIX.withPrefix(fdbClientInfoPrefixRange.begin);
    TraceEvent(SevInfo, "DelExcessClntTxnEntriesCalled").log();
    loop {
        try {
            tr->reset();
            tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
            tr->setOption(FDBTransactionOptions::LOCK_AWARE);
            Optional<Value> ctrValue = wait(tr->get(KeyRef(clientLatencyAtomicCtr), Snapshot::True));
            if (!ctrValue.present()) {
                TraceEvent(SevInfo, "NumClntTxnEntriesNotFound").log();
                return Void();
            }
            state int64_t txInfoSize = 0;
            ASSERT(ctrValue.get().size() == sizeof(int64_t));
            memcpy(&txInfoSize, ctrValue.get().begin(), ctrValue.get().size());
            if (txInfoSize < clientTxInfoSizeLimit)
                return Void();
            int getRangeByteLimit = (txInfoSize - clientTxInfoSizeLimit) < CLIENT_KNOBS->TRANSACTION_SIZE_LIMIT
                                        ? (txInfoSize - clientTxInfoSizeLimit)
                                        : CLIENT_KNOBS->TRANSACTION_SIZE_LIMIT;
            GetRangeLimits limit(GetRangeLimits::ROW_LIMIT_UNLIMITED, getRangeByteLimit);
            RangeResult txEntries =
                wait(tr->getRange(KeyRangeRef(clientLatencyName, strinc(clientLatencyName)), limit));
            state int64_t numBytesToDel = 0;
            KeyRef endKey;
            for (auto& kv : txEntries) {
                endKey = kv.key;
                numBytesToDel += kv.key.size() + kv.value.size();
                if (txInfoSize - numBytesToDel <= clientTxInfoSizeLimit)
                    break;
            }
            if (numBytesToDel) {
                tr->clear(KeyRangeRef(txEntries[0].key, strinc(endKey)));
                TraceEvent(SevInfo, "DeletingExcessCntTxnEntries").detail("BytesToBeDeleted", numBytesToDel);
                int64_t bytesDel = -numBytesToDel;
                tr->atomicOp(clientLatencyAtomicCtr, StringRef((uint8_t*)&bytesDel, 8), MutationRef::AddValue);
                wait(tr->commit());
            }
            if (txInfoSize - numBytesToDel <= clientTxInfoSizeLimit)
                return Void();
        } catch (Error& e) {
            wait(tr->onError(e));
        }
    }
}

// Delref and addref self to give self a chance to get destroyed.
ACTOR static Future<Void> refreshTransaction(DatabaseContext* self, Transaction* tr) {
    *tr = Transaction();
    wait(delay(0)); // Give ourselves the chance to get cancelled if self was destroyed
    *tr = Transaction(Database(Reference<DatabaseContext>::addRef(self)));
    return Void();
}
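// Note: refreshTransaction is the pattern used by the long-running actors in this file that hold a
// raw DatabaseContext*: clearing the transaction first and waiting on delay(0) gives the actor a
// chance to be cancelled if the context was destroyed, before a new reference to it is taken.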
// The reason for getting a pointer to DatabaseContext instead of a reference counted object is because reference
// counting will increment reference count for DatabaseContext which holds the future of this actor. This creates a
// cyclic reference and hence this actor and Database object will not be destroyed at all.
ACTOR static Future<Void> clientStatusUpdateActor(DatabaseContext* cx) {
    state const std::string clientLatencyName =
        CLIENT_LATENCY_INFO_PREFIX.withPrefix(fdbClientInfoPrefixRange.begin).toString();
    state Transaction tr;
    state std::vector<TrInfoChunk> commitQ;
    state int txBytes = 0;
    loop {
        // Need to make sure that we eventually destroy tr. We can't rely on getting cancelled to do this because of
        // the cyclic reference to self.
        wait(refreshTransaction(cx, &tr));
        try {
            ASSERT(cx->clientStatusUpdater.outStatusQ.empty());
            cx->clientStatusUpdater.inStatusQ.swap(cx->clientStatusUpdater.outStatusQ);
            // Split Transaction Info into chunks
            state std::vector<TrInfoChunk> trChunksQ;
            for (auto& entry : cx->clientStatusUpdater.outStatusQ) {
                auto& bw = entry.second;
                int64_t value_size_limit = BUGGIFY
                                               ? deterministicRandom()->randomInt(1e3, CLIENT_KNOBS->VALUE_SIZE_LIMIT)
                                               : CLIENT_KNOBS->VALUE_SIZE_LIMIT;
                int num_chunks = (bw.getLength() + value_size_limit - 1) / value_size_limit;
                std::string random_id = deterministicRandom()->randomAlphaNumeric(16);
                std::string user_provided_id = entry.first.size() ? entry.first + "/" : "";
                for (int i = 0; i < num_chunks; i++) {
                    TrInfoChunk chunk;
                    BinaryWriter chunkBW(Unversioned());
                    chunkBW << bigEndian32(i + 1) << bigEndian32(num_chunks);
                    chunk.key = KeyRef(clientLatencyName + std::string(10, '\x00') + "/" + random_id + "/" +
                                       chunkBW.toValue().toString() + "/" + user_provided_id + std::string(4, '\x00'));
                    int32_t pos = littleEndian32(clientLatencyName.size());
                    memcpy(mutateString(chunk.key) + chunk.key.size() - sizeof(int32_t), &pos, sizeof(int32_t));
                    if (i == num_chunks - 1) {
                        chunk.value = ValueRef(static_cast<uint8_t*>(bw.getData()) + (i * value_size_limit),
                                               bw.getLength() - (i * value_size_limit));
                    } else {
                        chunk.value =
                            ValueRef(static_cast<uint8_t*>(bw.getData()) + (i * value_size_limit), value_size_limit);
                    }
                    trChunksQ.push_back(std::move(chunk));
                }
            }

            // Commit the chunks splitting into different transactions if needed
            state int64_t dataSizeLimit =
                BUGGIFY ? deterministicRandom()->randomInt(200e3, 1.5 * CLIENT_KNOBS->TRANSACTION_SIZE_LIMIT)
                        : 0.8 * CLIENT_KNOBS->TRANSACTION_SIZE_LIMIT;
            state std::vector<TrInfoChunk>::iterator tracking_iter = trChunksQ.begin();
            ASSERT(commitQ.empty() && (txBytes == 0));
            loop {
                state std::vector<TrInfoChunk>::iterator iter = tracking_iter;
                txBytes = 0;
                commitQ.clear();
                try {
                    while (iter != trChunksQ.end()) {
                        if (iter->value.size() + iter->key.size() + txBytes > dataSizeLimit) {
                            wait(transactionInfoCommitActor(&tr, &commitQ));
                            tracking_iter = iter;
                            commitQ.clear();
                            txBytes = 0;
                        }
                        commitQ.push_back(*iter);
                        txBytes += iter->value.size() + iter->key.size();
                        ++iter;
                    }
                    if (!commitQ.empty()) {
                        wait(transactionInfoCommitActor(&tr, &commitQ));
                        commitQ.clear();
                        txBytes = 0;
                    }
                    break;
                } catch (Error& e) {
                    if (e.code() == error_code_transaction_too_large) {
                        dataSizeLimit /= 2;
                        ASSERT(dataSizeLimit >= CLIENT_KNOBS->VALUE_SIZE_LIMIT + CLIENT_KNOBS->KEY_SIZE_LIMIT);
                    } else {
                        TraceEvent(SevWarnAlways, "ClientTrInfoErrorCommit").error(e).detail("TxBytes", txBytes);
                        commitQ.clear();
                        txBytes = 0;
                        throw;
                    }
                }
            }
            cx->clientStatusUpdater.outStatusQ.clear();
            wait(GlobalConfig::globalConfig().onInitialized());
            double sampleRate = GlobalConfig::globalConfig().get<double>(fdbClientInfoTxnSampleRate,
                                                                         std::numeric_limits<double>::infinity());
            double clientSamplingProbability =
                std::isinf(sampleRate) ? CLIENT_KNOBS->CSI_SAMPLING_PROBABILITY : sampleRate;
            int64_t sizeLimit = GlobalConfig::globalConfig().get<int64_t>(fdbClientInfoTxnSizeLimit, -1);
            int64_t clientTxnInfoSizeLimit = sizeLimit == -1 ? CLIENT_KNOBS->CSI_SIZE_LIMIT : sizeLimit;
            if (!trChunksQ.empty() && deterministicRandom()->random01() < clientSamplingProbability)
                wait(delExcessClntTxnEntriesActor(&tr, clientTxnInfoSizeLimit));

            wait(delay(CLIENT_KNOBS->CSI_STATUS_DELAY));
        } catch (Error& e) {
            if (e.code() == error_code_actor_cancelled) {
                throw;
            }
            cx->clientStatusUpdater.outStatusQ.clear();
            TraceEvent(SevWarnAlways, "UnableToWriteClientStatus").error(e);
            wait(delay(10.0));
        }
    }
}

ACTOR Future<Void> assertFailure(GrvProxyInterface remote, Future<ErrorOr<GetReadVersionReply>> reply) {
    try {
        ErrorOr<GetReadVersionReply> res = wait(reply);
        if (!res.isError()) {
            TraceEvent(SevError, "GotStaleReadVersion")
                .detail("Remote", remote.getConsistentReadVersion.getEndpoint().addresses.address.toString())
                .detail("Provisional", remote.provisional)
                .detail("ReadVersion", res.get().version);
            ASSERT_WE_THINK(false);
        }
    } catch (Error& e) {
        if (e.code() == error_code_actor_cancelled) {
            throw;
        }
        // we want this to fail -- so getting here is good, we'll just ignore the error.
    }
    return Void();
}

Future<Void> attemptGRVFromOldProxies(std::vector<GrvProxyInterface> oldProxies,
                                      std::vector<GrvProxyInterface> newProxies) {
    Span span(deterministicRandom()->randomUniqueID(), "VerifyCausalReadRisky"_loc);
    std::vector<Future<Void>> replies;
    replies.reserve(oldProxies.size());
    GetReadVersionRequest req(
        span.context, 1, TransactionPriority::IMMEDIATE, GetReadVersionRequest::FLAG_CAUSAL_READ_RISKY);
    TraceEvent evt("AttemptGRVFromOldProxies");
    evt.detail("NumOldProxies", oldProxies.size()).detail("NumNewProxies", newProxies.size());
    auto traceProxies = [&](std::vector<GrvProxyInterface>& proxies, std::string const& key) {
        for (int i = 0; i < proxies.size(); ++i) {
            auto k = key + std::to_string(i);
            evt.detail(k.c_str(), proxies[i].id());
        }
    };
    traceProxies(oldProxies, "OldProxy"s);
    traceProxies(newProxies, "NewProxy"s);
    evt.log();
    for (auto& i : oldProxies) {
        req.reply = ReplyPromise<GetReadVersionReply>();
        replies.push_back(assertFailure(i, i.getConsistentReadVersion.tryGetReply(req)));
    }
    return waitForAll(replies);
}
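// Note: attemptGRVFromOldProxies is a correctness probe rather than part of normal operation. After
// a recovery it asks GRV proxies of the *previous* generation for a CAUSAL_READ_RISKY read version;
// any successful reply would mean a stale read version escaped, which assertFailure reports as a
// SevError ("GotStaleReadVersion").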
ACTOR static Future<Void> monitorClientDBInfoChange(DatabaseContext* cx,
                                                    Reference<AsyncVar<ClientDBInfo> const> clientDBInfo,
                                                    AsyncTrigger* proxiesChangeTrigger) {
    state std::vector<CommitProxyInterface> curCommitProxies;
    state std::vector<GrvProxyInterface> curGrvProxies;
    state ActorCollection actors(false);
    state Future<Void> clientDBInfoOnChange = clientDBInfo->onChange();
    curCommitProxies = clientDBInfo->get().commitProxies;
    curGrvProxies = clientDBInfo->get().grvProxies;

    loop {
        choose {
            when(wait(clientDBInfoOnChange)) {
                clientDBInfoOnChange = clientDBInfo->onChange();
                if (clientDBInfo->get().commitProxies != curCommitProxies ||
                    clientDBInfo->get().grvProxies != curGrvProxies) {
                    // This condition is a bit complicated. Here we want to verify that we're unable to receive a read
                    // version from a proxy of an old generation after a successful recovery. The conditions are:
                    // 1. We only do this with a configured probability.
                    // 2. If the old set of Grv proxies is empty, there's nothing to do
                    // 3. If the new set of Grv proxies is empty, it means the recovery is not complete. So if an old
                    //    Grv proxy still gives out read versions, this would be correct behavior.
                    // 4. If we see a provisional proxy, it means the recovery didn't complete yet, so the same as (3)
                    //    applies.
                    if (deterministicRandom()->random01() < cx->verifyCausalReadsProp && !curGrvProxies.empty() &&
                        !clientDBInfo->get().grvProxies.empty() && !clientDBInfo->get().grvProxies[0].provisional) {
                        actors.add(attemptGRVFromOldProxies(curGrvProxies, clientDBInfo->get().grvProxies));
                    }
                    curCommitProxies = clientDBInfo->get().commitProxies;
                    curGrvProxies = clientDBInfo->get().grvProxies;
                    proxiesChangeTrigger->trigger();
                }
            }
            when(wait(actors.getResult())) { UNSTOPPABLE_ASSERT(false); }
        }
    }
}

void updateLocationCacheWithCaches(DatabaseContext* self,
                                   const std::map<UID, StorageServerInterface>& removed,
                                   const std::map<UID, StorageServerInterface>& added) {
    // TODO: this needs to be more clever in the future
    auto ranges = self->locationCache.ranges();
    for (auto iter = ranges.begin(); iter != ranges.end(); ++iter) {
        if (iter->value() && iter->value()->hasCaches) {
            auto& val = iter->value();
            std::vector<Reference<ReferencedInterface<StorageServerInterface>>> interfaces;
            interfaces.reserve(val->size() - removed.size() + added.size());
            for (int i = 0; i < val->size(); ++i) {
                const auto& interf = (*val)[i];
                if (removed.count(interf->interf.id()) == 0) {
                    interfaces.emplace_back(interf);
                }
            }
            for (const auto& p : added) {
                interfaces.push_back(makeReference<ReferencedInterface<StorageServerInterface>>(p.second));
            }
            iter->value() = makeReference<LocationInfo>(interfaces, true);
        }
    }
}

Reference<LocationInfo> addCaches(const Reference<LocationInfo>& loc,
                                  const std::vector<Reference<ReferencedInterface<StorageServerInterface>>>& other) {
    std::vector<Reference<ReferencedInterface<StorageServerInterface>>> interfaces;
    interfaces.reserve(loc->size() + other.size());
    for (int i = 0; i < loc->size(); ++i) {
        interfaces.emplace_back((*loc)[i]);
    }
    interfaces.insert(interfaces.end(), other.begin(), other.end());
    return makeReference<LocationInfo>(interfaces, true);
}

ACTOR Future<Void> updateCachedRanges(DatabaseContext* self, std::map<UID, StorageServerInterface>* cacheServers) {
    state Transaction tr;
    state Value trueValue = storageCacheValue(std::vector<uint16_t>{ 0 });
    state Value falseValue = storageCacheValue(std::vector<uint16_t>{});
    try {
        loop {
            // Need to make sure that we eventually destroy tr. We can't rely on getting cancelled to do this because
            // of the cyclic reference to self.
            tr = Transaction();
            wait(delay(0)); // Give ourselves the chance to get cancelled if self was destroyed
            wait(brokenPromiseToNever(self->updateCache.onTrigger())); // brokenPromiseToNever because self might get
                                                                       // destroyed elsewhere while we're waiting here.
            tr = Transaction(Database(Reference<DatabaseContext>::addRef(self)));
            tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
            tr.setOption(FDBTransactionOptions::READ_LOCK_AWARE);
            try {
                RangeResult range = wait(tr.getRange(storageCacheKeys, CLIENT_KNOBS->TOO_MANY));
                ASSERT(!range.more);
                std::vector<Reference<ReferencedInterface<StorageServerInterface>>> cacheInterfaces;
                cacheInterfaces.reserve(cacheServers->size());
                for (const auto& p : *cacheServers) {
                    cacheInterfaces.push_back(makeReference<ReferencedInterface<StorageServerInterface>>(p.second));
                }
                bool currCached = false;
                KeyRef begin, end;
                for (const auto& kv : range) {
                    // These booleans have to flip consistently
                    ASSERT(currCached == (kv.value == falseValue));
                    if (kv.value == trueValue) {
                        begin = kv.key.substr(storageCacheKeys.begin.size());
                        currCached = true;
                    } else {
                        currCached = false;
                        end = kv.key.substr(storageCacheKeys.begin.size());
                        KeyRangeRef cachedRange{ begin, end };
                        auto ranges = self->locationCache.containedRanges(cachedRange);
                        KeyRef containedRangesBegin, containedRangesEnd, prevKey;
                        if (!ranges.empty()) {
                            containedRangesBegin = ranges.begin().range().begin;
                        }
                        for (auto iter = ranges.begin(); iter != ranges.end(); ++iter) {
                            containedRangesEnd = iter->range().end;
                            if (iter->value() && !iter->value()->hasCaches) {
                                iter->value() = addCaches(iter->value(), cacheInterfaces);
                            }
                        }
                        auto iter = self->locationCache.rangeContaining(begin);
                        if (iter->value() && !iter->value()->hasCaches) {
                            if (end >= iter->range().end) {
                                Key endCopy = iter->range().end; // Copy because insertion invalidates iterator
                                self->locationCache.insert(KeyRangeRef{ begin, endCopy },
                                                           addCaches(iter->value(), cacheInterfaces));
                            } else {
                                self->locationCache.insert(KeyRangeRef{ begin, end },
                                                           addCaches(iter->value(), cacheInterfaces));
                            }
                        }
                        iter = self->locationCache.rangeContainingKeyBefore(end);
                        if (iter->value() && !iter->value()->hasCaches) {
                            Key beginCopy = iter->range().begin; // Copy because insertion invalidates iterator
                            self->locationCache.insert(KeyRangeRef{ beginCopy, end },
                                                       addCaches(iter->value(), cacheInterfaces));
                        }
                    }
                }
                wait(delay(2.0)); // we want to wait at least some small amount of time before
                                  // updating this list again
            } catch (Error& e) {
                wait(tr.onError(e));
            }
        }
    } catch (Error& e) {
        TraceEvent(SevError, "UpdateCachedRangesFailed").error(e);
        throw;
    }
}
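// Note: storageCacheKeys encodes cached ranges as alternating boundary markers: a key whose value
// is trueValue opens a cached range and the next falseValue key closes it. That is why the scan in
// updateCachedRanges asserts that currCached flips consistently while walking the range.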
// The reason for getting a pointer to DatabaseContext instead of a reference counted object is because reference
// counting will increment reference count for DatabaseContext which holds the future of this actor. This creates a
// cyclic reference and hence this actor and Database object will not be destroyed at all.
ACTOR Future<Void> monitorCacheList(DatabaseContext* self) {
    state Transaction tr;
    state std::map<UID, StorageServerInterface> cacheServerMap;
    state Future<Void> updateRanges = updateCachedRanges(self, &cacheServerMap);
    // if no caches are configured, we don't want to run this actor at all
    // so we just wait for the first trigger from a storage server
    wait(self->updateCache.onTrigger());
    try {
        loop {
            // Need to make sure that we eventually destroy tr. We can't rely on getting cancelled to do this because
            // of the cyclic reference to self.
            wait(refreshTransaction(self, &tr));
            try {
                RangeResult cacheList = wait(tr.getRange(storageCacheServerKeys, CLIENT_KNOBS->TOO_MANY));
                ASSERT(!cacheList.more);
                bool hasChanges = false;
                std::map<UID, StorageServerInterface> allCacheServers;
                for (auto kv : cacheList) {
                    auto ssi = BinaryReader::fromStringRef<StorageServerInterface>(kv.value, IncludeVersion());
                    allCacheServers.emplace(ssi.id(), ssi);
                }
                std::map<UID, StorageServerInterface> newCacheServers;
                std::map<UID, StorageServerInterface> deletedCacheServers;
                std::set_difference(allCacheServers.begin(),
                                    allCacheServers.end(),
                                    cacheServerMap.begin(),
                                    cacheServerMap.end(),
                                    std::insert_iterator<std::map<UID, StorageServerInterface>>(
                                        newCacheServers, newCacheServers.begin()));
                std::set_difference(cacheServerMap.begin(),
                                    cacheServerMap.end(),
                                    allCacheServers.begin(),
                                    allCacheServers.end(),
                                    std::insert_iterator<std::map<UID, StorageServerInterface>>(
                                        deletedCacheServers, deletedCacheServers.begin()));
                hasChanges = !(newCacheServers.empty() && deletedCacheServers.empty());
                if (hasChanges) {
                    updateLocationCacheWithCaches(self, deletedCacheServers, newCacheServers);
                }
                cacheServerMap = std::move(allCacheServers);
                wait(delay(5.0));
            } catch (Error& e) {
                wait(tr.onError(e));
            }
        }
    } catch (Error& e) {
        TraceEvent(SevError, "MonitorCacheListFailed").error(e);
        throw;
    }
}

ACTOR static Future<Void> handleTssMismatches(DatabaseContext* cx) {
    state Reference<ReadYourWritesTransaction> tr;
    state KeyBackedMap<UID, UID> tssMapDB = KeyBackedMap<UID, UID>(tssMappingKeys.begin);
    state KeyBackedMap<Tuple, std::string> tssMismatchDB = KeyBackedMap<Tuple, std::string>(tssMismatchKeys.begin);
    loop {
        // <tssid, list of detailed mismatch data>
        state std::pair<UID, std::vector<DetailedTSSMismatch>> data = waitNext(cx->tssMismatchStream.getFuture());
        // return to calling actor, don't do this as part of metrics loop
        wait(delay(0));
        // find ss pair id so we can remove it from the mapping
        state UID tssPairID;
        bool found = false;
        for (const auto& it : cx->tssMapping) {
            if (it.second.id() == data.first) {
                tssPairID = it.first;
                found = true;
                break;
            }
        }
        if (found) {
            state bool quarantine = CLIENT_KNOBS->QUARANTINE_TSS_ON_MISMATCH;
            TraceEvent(SevWarnAlways, quarantine ? "TSS_QuarantineMismatch" : "TSS_KillMismatch")
                .detail("TSSID", data.first.toString());
            TEST(quarantine); // Quarantining TSS because it got mismatch
            TEST(!quarantine); // Killing TSS because it got mismatch

            tr = makeReference<ReadYourWritesTransaction>(Database(Reference<DatabaseContext>::addRef(cx)));
            state int tries = 0;
            loop {
                try {
                    tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
                    tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
                    if (quarantine) {
                        tr->set(tssQuarantineKeyFor(data.first), LiteralStringRef(""));
                    } else {
                        tr->clear(serverTagKeyFor(data.first));
                    }
                    tssMapDB.erase(tr, tssPairID);

                    for (const DetailedTSSMismatch& d : data.second) {
                        // <tssid, time, mismatchid> -> mismatch data
                        tssMismatchDB.set(
                            tr,
                            Tuple().append(data.first.toString()).append(d.timestamp).append(d.mismatchId.toString()),
                            d.traceString);
                    }

                    wait(tr->commit());

                    break;
                } catch (Error& e) {
                    wait(tr->onError(e));
                }
                tries++;
                if (tries > 10) {
                    // Give up, it'll get another mismatch or a human will investigate eventually
                    TraceEvent("TSS_MismatchGaveUp").detail("TSSID", data.first.toString());
                    break;
                }
            }
            // clear out txn so that the extra DatabaseContext ref gets decref'd and we can free cx
            tr = makeReference<ReadYourWritesTransaction>();
        } else {
            TEST(true); // Not handling TSS with mismatch because it's already gone
        }
    }
}

ACTOR static Future<Void> backgroundGrvUpdater(DatabaseContext* cx) {
    state Transaction tr;
    state double grvDelay = 0.001;
    try {
        loop {
            if (CLIENT_KNOBS->FORCE_GRV_CACHE_OFF)
                return Void();
            wait(refreshTransaction(cx, &tr));
            state double curTime = now();
            state double lastTime = cx->getLastGrvTime();
            state double lastProxyTime = cx->lastProxyRequestTime;
            TraceEvent(SevDebug, "BackgroundGrvUpdaterBefore")
                .detail("CurTime", curTime)
                .detail("LastTime", lastTime)
                .detail("GrvDelay", grvDelay)
                .detail("CachedReadVersion", cx->getCachedReadVersion())
                .detail("CachedTime", cx->getLastGrvTime())
                .detail("Gap", curTime - lastTime)
                .detail("Bound", CLIENT_KNOBS->MAX_VERSION_CACHE_LAG - grvDelay);
            if (curTime - lastTime >= (CLIENT_KNOBS->MAX_VERSION_CACHE_LAG - grvDelay) ||
                curTime - lastProxyTime > CLIENT_KNOBS->MAX_PROXY_CONTACT_LAG) {
                try {
                    tr.setOption(FDBTransactionOptions::SKIP_GRV_CACHE);
                    wait(success(tr.getReadVersion()));
                    cx->lastProxyRequestTime = curTime;
                    grvDelay = (grvDelay + (now() - curTime)) / 2.0;
                    TraceEvent(SevDebug, "BackgroundGrvUpdaterSuccess")
                        .detail("GrvDelay", grvDelay)
                        .detail("CachedReadVersion", cx->getCachedReadVersion())
                        .detail("CachedTime", cx->getLastGrvTime());
                } catch (Error& e) {
                    TraceEvent(SevInfo, "BackgroundGrvUpdaterTxnError").errorUnsuppressed(e);
                    wait(tr.onError(e));
                }
            } else {
                wait(delay(std::max(
                    0.001,
                    std::min(CLIENT_KNOBS->MAX_PROXY_CONTACT_LAG - (curTime - lastProxyTime),
                             (CLIENT_KNOBS->MAX_VERSION_CACHE_LAG - grvDelay) - (curTime - lastTime)))));
            }
        }
    } catch (Error& e) {
        TraceEvent(SevInfo, "BackgroundGrvUpdaterFailed").errorUnsuppressed(e);
        throw;
    }
}
ACTOR static Future<HealthMetrics> getHealthMetricsActor(DatabaseContext* cx, bool detailed) {
    if (now() - cx->healthMetricsLastUpdated < CLIENT_KNOBS->AGGREGATE_HEALTH_METRICS_MAX_STALENESS) {
        if (detailed) {
            return cx->healthMetrics;
        } else {
            HealthMetrics result;
            result.update(cx->healthMetrics, false, false);
            return result;
        }
    }
    state bool sendDetailedRequest =
        detailed && now() - cx->detailedHealthMetricsLastUpdated > CLIENT_KNOBS->DETAILED_HEALTH_METRICS_MAX_STALENESS;
    loop {
        choose {
            when(wait(cx->onProxiesChanged())) {}
            when(GetHealthMetricsReply rep = wait(basicLoadBalance(cx->getGrvProxies(UseProvisionalProxies::False),
                                                                   &GrvProxyInterface::getHealthMetrics,
                                                                   GetHealthMetricsRequest(sendDetailedRequest)))) {
                cx->healthMetrics.update(rep.healthMetrics, detailed, true);
                if (detailed) {
                    cx->healthMetricsLastUpdated = now();
                    cx->detailedHealthMetricsLastUpdated = now();
                    return cx->healthMetrics;
                } else {
                    cx->healthMetricsLastUpdated = now();
                    HealthMetrics result;
                    result.update(cx->healthMetrics, false, false);
                    return result;
                }
            }
        }
    }
}

Future<HealthMetrics> DatabaseContext::getHealthMetrics(bool detailed = false) {
    return getHealthMetricsActor(this, detailed);
}

void DatabaseContext::registerSpecialKeySpaceModule(SpecialKeySpace::MODULE module,
                                                    SpecialKeySpace::IMPLTYPE type,
                                                    std::unique_ptr<SpecialKeyRangeReadImpl>&& impl,
                                                    int deprecatedVersion) {
    // if deprecated, add the implementation when the api version is less than the deprecated version
    if (deprecatedVersion == -1 || apiVersion < deprecatedVersion) {
        specialKeySpace->registerKeyRange(module, type, impl->getKeyRange(), impl.get());
        specialKeySpaceModules.push_back(std::move(impl));
    }
}

ACTOR Future<RangeResult> getWorkerInterfaces(Reference<IClusterConnectionRecord> clusterRecord);
ACTOR Future<Optional<Value>> getJSON(Database db);

struct WorkerInterfacesSpecialKeyImpl : SpecialKeyRangeReadImpl {
    Future<RangeResult> getRange(ReadYourWritesTransaction* ryw,
                                 KeyRangeRef kr,
                                 GetRangeLimits limitsHint) const override {
        if (ryw->getDatabase().getPtr() && ryw->getDatabase()->getConnectionRecord()) {
            Key prefix = Key(getKeyRange().begin);
            return map(getWorkerInterfaces(ryw->getDatabase()->getConnectionRecord()),
                       [prefix = prefix, kr = KeyRange(kr)](const RangeResult& in) {
                           RangeResult result;
                           for (const auto& [k_, v] : in) {
                               auto k = k_.withPrefix(prefix);
                               if (kr.contains(k))
                                   result.push_back_deep(result.arena(), KeyValueRef(k, v));
                           }

                           std::sort(result.begin(), result.end(), KeyValueRef::OrderByKey{});
                           return result;
                       });
        } else {
            return RangeResult();
        }
    }

    explicit WorkerInterfacesSpecialKeyImpl(KeyRangeRef kr) : SpecialKeyRangeReadImpl(kr) {}
};

struct SingleSpecialKeyImpl : SpecialKeyRangeReadImpl {
    Future<RangeResult> getRange(ReadYourWritesTransaction* ryw,
                                 KeyRangeRef kr,
                                 GetRangeLimits limitsHint) const override {
        ASSERT(kr.contains(k));
        return map(f(ryw), [k = k](Optional<Value> v) {
            RangeResult result;
            if (v.present()) {
                result.push_back_deep(result.arena(), KeyValueRef(k, v.get()));
            }
            return result;
        });
    }

    SingleSpecialKeyImpl(KeyRef k, const std::function<Future<Optional<Value>>(ReadYourWritesTransaction*)>& f)
      : SpecialKeyRangeReadImpl(singleKeyRange(k)), k(k), f(f) {}

private:
    Key k;
    std::function<Future<Optional<Value>>(ReadYourWritesTransaction*)> f;
};

class HealthMetricsRangeImpl : public SpecialKeyRangeAsyncImpl {
public:
    explicit HealthMetricsRangeImpl(KeyRangeRef kr);
    Future<RangeResult> getRange(ReadYourWritesTransaction* ryw,
                                 KeyRangeRef kr,
                                 GetRangeLimits limitsHint) const override;
};

static RangeResult healthMetricsToKVPairs(const HealthMetrics& metrics, KeyRangeRef kr) {
    RangeResult result;
    if (CLIENT_BUGGIFY)
        return result;
    if (kr.contains(LiteralStringRef("\xff\xff/metrics/health/aggregate")) && metrics.worstStorageDurabilityLag != 0) {
        json_spirit::mObject statsObj;
        statsObj["batch_limited"] = metrics.batchLimited;
        statsObj["tps_limit"] = metrics.tpsLimit;
        statsObj["worst_storage_durability_lag"] = metrics.worstStorageDurabilityLag;
        statsObj["limiting_storage_durability_lag"] = metrics.limitingStorageDurabilityLag;
        statsObj["worst_storage_queue"] = metrics.worstStorageQueue;
        statsObj["limiting_storage_queue"] = metrics.limitingStorageQueue;
        statsObj["worst_log_queue"] = metrics.worstTLogQueue;
        std::string statsString =
            json_spirit::write_string(json_spirit::mValue(statsObj), json_spirit::Output_options::raw_utf8);
        ValueRef bytes(result.arena(), statsString);
        result.push_back(result.arena(), KeyValueRef(LiteralStringRef("\xff\xff/metrics/health/aggregate"), bytes));
    }
    // tlog stats
    {
        int phase = 0; // Avoid comparing twice per loop iteration
        for (const auto& [uid, logStats] : metrics.tLogQueue) {
            StringRef k{
                StringRef(uid.toString()).withPrefix(LiteralStringRef("\xff\xff/metrics/health/log/"), result.arena())
            };
            if (phase == 0 && k >= kr.begin) {
                phase = 1;
            }
            if (phase == 1) {
                if (k < kr.end) {
                    json_spirit::mObject statsObj;
                    statsObj["log_queue"] = logStats;
                    std::string statsString =
                        json_spirit::write_string(json_spirit::mValue(statsObj), json_spirit::Output_options::raw_utf8);
                    ValueRef bytes(result.arena(), statsString);
                    result.push_back(result.arena(), KeyValueRef(k, bytes));
                } else {
                    break;
                }
            }
        }
    }
    // Storage stats
    {
        int phase = 0; // Avoid comparing twice per loop iteration
        for (const auto& [uid, storageStats] : metrics.storageStats) {
            StringRef k{ StringRef(uid.toString())
                             .withPrefix(LiteralStringRef("\xff\xff/metrics/health/storage/"), result.arena()) };
            if (phase == 0 && k >= kr.begin) {
                phase = 1;
            }
            if (phase == 1) {
                if (k < kr.end) {
                    json_spirit::mObject statsObj;
                    statsObj["storage_durability_lag"] = storageStats.storageDurabilityLag;
                    statsObj["storage_queue"] = storageStats.storageQueue;
                    statsObj["cpu_usage"] = storageStats.cpuUsage;
                    statsObj["disk_usage"] = storageStats.diskUsage;
                    std::string statsString =
                        json_spirit::write_string(json_spirit::mValue(statsObj), json_spirit::Output_options::raw_utf8);
                    ValueRef bytes(result.arena(), statsString);
                    result.push_back(result.arena(), KeyValueRef(k, bytes));
                } else {
                    break;
                }
            }
        }
    }
    return result;
}

ACTOR static Future<RangeResult> healthMetricsGetRangeActor(ReadYourWritesTransaction* ryw, KeyRangeRef kr) {
    HealthMetrics metrics = wait(ryw->getDatabase()->getHealthMetrics(
        /*detailed ("per process")*/ kr.intersects(
            KeyRangeRef(LiteralStringRef("\xff\xff/metrics/health/storage/"),
                        LiteralStringRef("\xff\xff/metrics/health/storage0"))) ||
        kr.intersects(KeyRangeRef(LiteralStringRef("\xff\xff/metrics/health/log/"),
                                  LiteralStringRef("\xff\xff/metrics/health/log0")))));
    return healthMetricsToKVPairs(metrics, kr);
}

HealthMetricsRangeImpl::HealthMetricsRangeImpl(KeyRangeRef kr) : SpecialKeyRangeAsyncImpl(kr) {}

Future<RangeResult> HealthMetricsRangeImpl::getRange(ReadYourWritesTransaction* ryw,
                                                     KeyRangeRef kr,
                                                     GetRangeLimits limitsHint) const {
    return healthMetricsGetRangeActor(ryw, kr);
}
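// Note: the \xff\xff/metrics/health/ module exposes one JSON document per key: "aggregate" holds
// cluster-wide limits, while per-process stats appear under .../log/<uid> and .../storage/<uid>.
// healthMetricsGetRangeActor only requests detailed metrics when the queried range intersects
// those per-process subspaces.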
KeyRangeRef toRelativeRange(KeyRangeRef range, KeyRef prefix) {
    if (prefix.empty()) {
        return range;
    } else {
        KeyRef begin = range.begin.startsWith(prefix) ? range.begin.removePrefix(prefix) : allKeys.begin;
        KeyRef end = range.end.startsWith(prefix) ? range.end.removePrefix(prefix) : allKeys.end;
        return KeyRangeRef(begin, end);
    }
}

DatabaseContext::DatabaseContext(Reference<AsyncVar<Reference<IClusterConnectionRecord>>> connectionRecord,
                                 Reference<AsyncVar<ClientDBInfo>> clientInfo,
                                 Reference<AsyncVar<Optional<ClientLeaderRegInterface>> const> coordinator,
                                 Future<Void> clientInfoMonitor,
                                 TaskPriority taskID,
                                 LocalityData const& clientLocality,
                                 EnableLocalityLoadBalance enableLocalityLoadBalance,
                                 LockAware lockAware,
                                 IsInternal internal,
                                 int apiVersion,
                                 IsSwitchable switchable,
                                 Optional<TenantName> defaultTenant)
  : lockAware(lockAware), switchable(switchable), connectionRecord(connectionRecord), proxyProvisional(false),
    clientLocality(clientLocality), enableLocalityLoadBalance(enableLocalityLoadBalance),
    defaultTenant(defaultTenant), internal(internal), cc("TransactionMetrics"),
    transactionReadVersions("ReadVersions", cc), transactionReadVersionsThrottled("ReadVersionsThrottled", cc),
    transactionReadVersionsCompleted("ReadVersionsCompleted", cc),
    transactionReadVersionBatches("ReadVersionBatches", cc),
    transactionBatchReadVersions("BatchPriorityReadVersions", cc),
    transactionDefaultReadVersions("DefaultPriorityReadVersions", cc),
    transactionImmediateReadVersions("ImmediatePriorityReadVersions", cc),
    transactionBatchReadVersionsCompleted("BatchPriorityReadVersionsCompleted", cc),
    transactionDefaultReadVersionsCompleted("DefaultPriorityReadVersionsCompleted", cc),
    transactionImmediateReadVersionsCompleted("ImmediatePriorityReadVersionsCompleted", cc),
    transactionLogicalReads("LogicalUncachedReads", cc), transactionPhysicalReads("PhysicalReadRequests", cc),
    transactionPhysicalReadsCompleted("PhysicalReadRequestsCompleted", cc),
    transactionGetKeyRequests("GetKeyRequests", cc), transactionGetValueRequests("GetValueRequests", cc),
    transactionGetRangeRequests("GetRangeRequests", cc),
    transactionGetMappedRangeRequests("GetMappedRangeRequests", cc),
    transactionGetRangeStreamRequests("GetRangeStreamRequests", cc), transactionWatchRequests("WatchRequests", cc),
    transactionGetAddressesForKeyRequests("GetAddressesForKeyRequests", cc), transactionBytesRead("BytesRead", cc),
    transactionKeysRead("KeysRead", cc), transactionMetadataVersionReads("MetadataVersionReads", cc),
    transactionCommittedMutations("CommittedMutations", cc),
    transactionCommittedMutationBytes("CommittedMutationBytes", cc), transactionSetMutations("SetMutations", cc),
    transactionClearMutations("ClearMutations", cc), transactionAtomicMutations("AtomicMutations", cc),
    transactionsCommitStarted("CommitStarted", cc), transactionsCommitCompleted("CommitCompleted", cc),
    transactionKeyServerLocationRequests("KeyServerLocationRequests", cc),
    transactionKeyServerLocationRequestsCompleted("KeyServerLocationRequestsCompleted", cc),
    transactionStatusRequests("StatusRequests", cc), transactionsTooOld("TooOld", cc),
    transactionsFutureVersions("FutureVersions", cc), transactionsNotCommitted("NotCommitted", cc),
    transactionsMaybeCommitted("MaybeCommitted", cc), transactionsResourceConstrained("ResourceConstrained", cc),
    transactionsProcessBehind("ProcessBehind", cc), transactionsThrottled("Throttled", cc),
    transactionsExpensiveClearCostEstCount("ExpensiveClearCostEstCount", cc),
    transactionGrvFullBatches("NumGrvFullBatches", cc), transactionGrvTimedOutBatches("NumGrvTimedOutBatches", cc),
    transactionsStaleVersionVectors("NumStaleVersionVectors", cc), latencies(1000), readLatencies(1000),
    commitLatencies(1000), GRVLatencies(1000), mutationsPerCommit(1000), bytesPerCommit(1000), bgLatencies(1000),
    bgGranulesPerRequest(1000), outstandingWatches(0), sharedStatePtr(nullptr), lastGrvTime(0.0),
    cachedReadVersion(0), lastRkBatchThrottleTime(0.0), lastRkDefaultThrottleTime(0.0), lastProxyRequestTime(0.0),
    transactionTracingSample(false), taskID(taskID), clientInfo(clientInfo), clientInfoMonitor(clientInfoMonitor),
    coordinator(coordinator), apiVersion(apiVersion), mvCacheInsertLocation(0), healthMetricsLastUpdated(0),
    detailedHealthMetricsLastUpdated(0), smoothMidShardSize(CLIENT_KNOBS->SHARD_STAT_SMOOTH_AMOUNT),
    specialKeySpace(std::make_unique<SpecialKeySpace>(specialKeys.begin, specialKeys.end, /* test */ false)),
    connectToDatabaseEventCacheHolder(format("ConnectToDatabase/%s", dbId.toString().c_str())) {
    dbId = deterministicRandom()->randomUniqueID();
    connected = (clientInfo->get().commitProxies.size() && clientInfo->get().grvProxies.size())
                    ? Void()
                    : clientInfo->onChange();

    metadataVersionCache.resize(CLIENT_KNOBS->METADATA_VERSION_CACHE_SIZE);
    maxOutstandingWatches = CLIENT_KNOBS->DEFAULT_MAX_OUTSTANDING_WATCHES;

    snapshotRywEnabled = apiVersionAtLeast(300) ? 1 : 0;

    logger = databaseLogger(this) && tssLogger(this);
    locationCacheSize = g_network->isSimulated() ? CLIENT_KNOBS->LOCATION_CACHE_EVICTION_SIZE_SIM
                                                 : CLIENT_KNOBS->LOCATION_CACHE_EVICTION_SIZE;
    tenantCacheSize = g_network->isSimulated() ? CLIENT_KNOBS->TENANT_CACHE_EVICTION_SIZE_SIM
                                               : CLIENT_KNOBS->TENANT_CACHE_EVICTION_SIZE;

    getValueSubmitted.init(LiteralStringRef("NativeAPI.GetValueSubmitted"));
    getValueCompleted.init(LiteralStringRef("NativeAPI.GetValueCompleted"));

    clientDBInfoMonitor = monitorClientDBInfoChange(this, clientInfo, &proxiesChangeTrigger);
    tssMismatchHandler = handleTssMismatches(this);
    clientStatusUpdater.actor = clientStatusUpdateActor(this);
    cacheListMonitor = monitorCacheList(this);

    smoothMidShardSize.reset(CLIENT_KNOBS->INIT_MID_SHARD_BYTES);

    if (apiVersionAtLeast(710)) {
        registerSpecialKeySpaceModule(
            SpecialKeySpace::MODULE::MANAGEMENT,
            SpecialKeySpace::IMPLTYPE::READWRITE,
            std::make_unique<TenantMapRangeImpl>(SpecialKeySpace::getManagementApiCommandRange("tenantmap")));
    }
    if (apiVersionAtLeast(700)) {
        registerSpecialKeySpaceModule(SpecialKeySpace::MODULE::ERRORMSG,
                                      SpecialKeySpace::IMPLTYPE::READONLY,
                                      std::make_unique<SingleSpecialKeyImpl>(
                                          SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::ERRORMSG).begin,
                                          [](ReadYourWritesTransaction* ryw) -> Future<Optional<Value>> {
                                              if (ryw->getSpecialKeySpaceErrorMsg().present())
                                                  return Optional<Value>(ryw->getSpecialKeySpaceErrorMsg().get());
                                              else
                                                  return Optional<Value>();
                                          }));
        registerSpecialKeySpaceModule(
            SpecialKeySpace::MODULE::MANAGEMENT,
            SpecialKeySpace::IMPLTYPE::READWRITE,
            std::make_unique<ManagementCommandsOptionsImpl>(
                KeyRangeRef(LiteralStringRef("options/"), LiteralStringRef("options0"))
                    .withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin)));
        registerSpecialKeySpaceModule(
            SpecialKeySpace::MODULE::MANAGEMENT,
            SpecialKeySpace::IMPLTYPE::READWRITE,
            std::make_unique<ExcludeServersRangeImpl>(SpecialKeySpace::getManagementApiCommandRange("exclude")));
        registerSpecialKeySpaceModule(
            SpecialKeySpace::MODULE::MANAGEMENT,
            SpecialKeySpace::IMPLTYPE::READWRITE,
            std::make_unique<FailedServersRangeImpl>(SpecialKeySpace::getManagementApiCommandRange("failed")));
        registerSpecialKeySpaceModule(SpecialKeySpace::MODULE::MANAGEMENT,
                                      SpecialKeySpace::IMPLTYPE::READWRITE,
                                      std::make_unique<ExcludedLocalitiesRangeImpl>(
                                          SpecialKeySpace::getManagementApiCommandRange("excludedlocality")));
        registerSpecialKeySpaceModule(SpecialKeySpace::MODULE::MANAGEMENT,
                                      SpecialKeySpace::IMPLTYPE::READWRITE,
                                      std::make_unique<FailedLocalitiesRangeImpl>(
                                          SpecialKeySpace::getManagementApiCommandRange("failedlocality")));
        registerSpecialKeySpaceModule(
            SpecialKeySpace::MODULE::MANAGEMENT,
            SpecialKeySpace::IMPLTYPE::READONLY,
            std::make_unique<ExclusionInProgressRangeImpl>(
                KeyRangeRef(LiteralStringRef("in_progress_exclusion/"), LiteralStringRef("in_progress_exclusion0"))
                    .withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin)));
        registerSpecialKeySpaceModule(
            SpecialKeySpace::MODULE::CONFIGURATION,
            SpecialKeySpace::IMPLTYPE::READWRITE,
            std::make_unique<ProcessClassRangeImpl>(
                KeyRangeRef(LiteralStringRef("process/class_type/"), LiteralStringRef("process/class_type0"))
                    .withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::CONFIGURATION).begin)));
        registerSpecialKeySpaceModule(
            SpecialKeySpace::MODULE::CONFIGURATION,
            SpecialKeySpace::IMPLTYPE::READONLY,
            std::make_unique<ProcessClassSourceRangeImpl>(
                KeyRangeRef(LiteralStringRef("process/class_source/"), LiteralStringRef("process/class_source0"))
                    .withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::CONFIGURATION).begin)));
        registerSpecialKeySpaceModule(
            SpecialKeySpace::MODULE::MANAGEMENT,
            SpecialKeySpace::IMPLTYPE::READWRITE,
            std::make_unique<LockDatabaseImpl>(
                singleKeyRange(LiteralStringRef("db_locked"))
                    .withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin)));
        registerSpecialKeySpaceModule(
            SpecialKeySpace::MODULE::MANAGEMENT,
            SpecialKeySpace::IMPLTYPE::READWRITE,
            std::make_unique<ConsistencyCheckImpl>(
                singleKeyRange(LiteralStringRef("consistency_check_suspended"))
                    .withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin)));
        registerSpecialKeySpaceModule(
            SpecialKeySpace::MODULE::GLOBALCONFIG,
            SpecialKeySpace::IMPLTYPE::READWRITE,
            std::make_unique<GlobalConfigImpl>(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::GLOBALCONFIG)));
        registerSpecialKeySpaceModule(
            SpecialKeySpace::MODULE::TRACING,
            SpecialKeySpace::IMPLTYPE::READWRITE,
            std::make_unique<TracingOptionsImpl>(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::TRACING)));
        registerSpecialKeySpaceModule(
            SpecialKeySpace::MODULE::CONFIGURATION,
            SpecialKeySpace::IMPLTYPE::READWRITE,
            std::make_unique<CoordinatorsImpl>(
                KeyRangeRef(LiteralStringRef("coordinators/"), LiteralStringRef("coordinators0"))
                    .withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::CONFIGURATION).begin)));
        registerSpecialKeySpaceModule(
            SpecialKeySpace::MODULE::MANAGEMENT,
            SpecialKeySpace::IMPLTYPE::READONLY,
            std::make_unique<CoordinatorsAutoImpl>(
                singleKeyRange(LiteralStringRef("auto_coordinators"))
                    .withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin)));
        registerSpecialKeySpaceModule(
            SpecialKeySpace::MODULE::MANAGEMENT,
            SpecialKeySpace::IMPLTYPE::READWRITE,
            std::make_unique<AdvanceVersionImpl>(
                singleKeyRange(LiteralStringRef("min_required_commit_version"))
                    .withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin)));
        registerSpecialKeySpaceModule(
            SpecialKeySpace::MODULE::MANAGEMENT,
            SpecialKeySpace::IMPLTYPE::READWRITE,
            std::make_unique<VersionEpochImpl>(
                singleKeyRange(LiteralStringRef("version_epoch"))
                    .withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin)));
        registerSpecialKeySpaceModule(
            SpecialKeySpace::MODULE::MANAGEMENT,
            SpecialKeySpace::IMPLTYPE::READWRITE,
            std::make_unique<ClientProfilingImpl>(
                KeyRangeRef(LiteralStringRef("profiling/"), LiteralStringRef("profiling0"))
                    .withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin)),
            /* deprecated */ 720);
        registerSpecialKeySpaceModule(
            SpecialKeySpace::MODULE::MANAGEMENT,
            SpecialKeySpace::IMPLTYPE::READWRITE,
            std::make_unique<MaintenanceImpl>(
                KeyRangeRef(LiteralStringRef("maintenance/"), LiteralStringRef("maintenance0"))
                    .withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin)));
        registerSpecialKeySpaceModule(
            SpecialKeySpace::MODULE::MANAGEMENT,
            SpecialKeySpace::IMPLTYPE::READWRITE,
            std::make_unique<DataDistributionImpl>(
                KeyRangeRef(LiteralStringRef("data_distribution/"), LiteralStringRef("data_distribution0"))
                    .withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin)));
        registerSpecialKeySpaceModule(
            SpecialKeySpace::MODULE::ACTORLINEAGE,
            SpecialKeySpace::IMPLTYPE::READONLY,
            std::make_unique<ActorLineageImpl>(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::ACTORLINEAGE)));
        registerSpecialKeySpaceModule(SpecialKeySpace::MODULE::ACTOR_PROFILER_CONF,
                                      SpecialKeySpace::IMPLTYPE::READWRITE,
                                      std::make_unique<ActorProfilerConf>(SpecialKeySpace::getModuleRange(
                                          SpecialKeySpace::MODULE::ACTOR_PROFILER_CONF)));
    }
    if (apiVersionAtLeast(630)) {
        registerSpecialKeySpaceModule(SpecialKeySpace::MODULE::TRANSACTION,
                                      SpecialKeySpace::IMPLTYPE::READONLY,
                                      std::make_unique<ConflictingKeysImpl>(conflictingKeysRange));
        registerSpecialKeySpaceModule(SpecialKeySpace::MODULE::TRANSACTION,
                                      SpecialKeySpace::IMPLTYPE::READONLY,
                                      std::make_unique<ReadConflictRangeImpl>(readConflictRangeKeysRange));
        registerSpecialKeySpaceModule(SpecialKeySpace::MODULE::TRANSACTION,
                                      SpecialKeySpace::IMPLTYPE::READONLY,
                                      std::make_unique<WriteConflictRangeImpl>(writeConflictRangeKeysRange));
        registerSpecialKeySpaceModule(SpecialKeySpace::MODULE::METRICS,
                                      SpecialKeySpace::IMPLTYPE::READONLY,
                                      std::make_unique<DDStatsRangeImpl>(ddStatsRange));
        registerSpecialKeySpaceModule(
            SpecialKeySpace::MODULE::METRICS,
            SpecialKeySpace::IMPLTYPE::READONLY,
            std::make_unique<HealthMetricsRangeImpl>(KeyRangeRef(LiteralStringRef("\xff\xff/metrics/health/"),
                                                                 LiteralStringRef("\xff\xff/metrics/health0"))));
        registerSpecialKeySpaceModule(
            SpecialKeySpace::MODULE::WORKERINTERFACE,
            SpecialKeySpace::IMPLTYPE::READONLY,
            std::make_unique<WorkerInterfacesSpecialKeyImpl>(KeyRangeRef(
                LiteralStringRef("\xff\xff/worker_interfaces/"), LiteralStringRef("\xff\xff/worker_interfaces0"))));
        registerSpecialKeySpaceModule(
            SpecialKeySpace::MODULE::STATUSJSON,
            SpecialKeySpace::IMPLTYPE::READONLY,
            std::make_unique<SingleSpecialKeyImpl>(LiteralStringRef("\xff\xff/status/json"),
                                                   [](ReadYourWritesTransaction* ryw) -> Future<Optional<Value>> {
                                                       if (ryw->getDatabase().getPtr() &&
                                                           ryw->getDatabase()->getConnectionRecord()) {
                                                           ++ryw->getDatabase()->transactionStatusRequests;
                                                           return getJSON(ryw->getDatabase());
                                                       } else {
                                                           return Optional<Value>();
                                                       }
                                                   }));
        registerSpecialKeySpaceModule(
            SpecialKeySpace::MODULE::CLUSTERFILEPATH,
            SpecialKeySpace::IMPLTYPE::READONLY,
            std::make_unique<SingleSpecialKeyImpl>(
                LiteralStringRef("\xff\xff/cluster_file_path"),
                [](ReadYourWritesTransaction* ryw) -> Future<Optional<Value>> {
                    try {
                        if (ryw->getDatabase().getPtr() && ryw->getDatabase()->getConnectionRecord()) {
                            Optional<Value> output =
                                StringRef(ryw->getDatabase()->getConnectionRecord()->getLocation());
                            return output;
                        }
                    } catch (Error& e) {
                        return e;
                    }
                    return Optional<Value>();
                }));
        registerSpecialKeySpaceModule(
            SpecialKeySpace::MODULE::CONNECTIONSTRING,
            SpecialKeySpace::IMPLTYPE::READONLY,
            std::make_unique<SingleSpecialKeyImpl>(
                LiteralStringRef("\xff\xff/connection_string"),
                [](ReadYourWritesTransaction* ryw) -> Future<Optional<Value>> {
                    try {
                        if (ryw->getDatabase().getPtr() && ryw->getDatabase()->getConnectionRecord()) {
                            Reference<IClusterConnectionRecord> f = ryw->getDatabase()->getConnectionRecord();
                            Optional<Value> output = StringRef(f->getConnectionString().toString());
                            return output;
                        }
                    } catch (Error& e) {
                        return e;
                    }
                    return Optional<Value>();
                }));
    }
    throttleExpirer = recurring([this]() { expireThrottles(); }, CLIENT_KNOBS->TAG_THROTTLE_EXPIRATION_INTERVAL);

    if (BUGGIFY) {
        DatabaseContext::debugUseTags = true;
    }
}
transactionReadVersionsCompleted("ReadVersionsCompleted", cc), transactionReadVersionBatches("ReadVersionBatches", cc), transactionBatchReadVersions("BatchPriorityReadVersions", cc), transactionDefaultReadVersions("DefaultPriorityReadVersions", cc), transactionImmediateReadVersions("ImmediatePriorityReadVersions", cc), transactionBatchReadVersionsCompleted("BatchPriorityReadVersionsCompleted", cc), transactionDefaultReadVersionsCompleted("DefaultPriorityReadVersionsCompleted", cc), transactionImmediateReadVersionsCompleted("ImmediatePriorityReadVersionsCompleted", cc), transactionLogicalReads("LogicalUncachedReads", cc), transactionPhysicalReads("PhysicalReadRequests", cc), transactionPhysicalReadsCompleted("PhysicalReadRequestsCompleted", cc), transactionGetKeyRequests("GetKeyRequests", cc), transactionGetValueRequests("GetValueRequests", cc), transactionGetRangeRequests("GetRangeRequests", cc), transactionGetMappedRangeRequests("GetMappedRangeRequests", cc), transactionGetRangeStreamRequests("GetRangeStreamRequests", cc), transactionWatchRequests("WatchRequests", cc), transactionGetAddressesForKeyRequests("GetAddressesForKeyRequests", cc), transactionBytesRead("BytesRead", cc), transactionKeysRead("KeysRead", cc), transactionMetadataVersionReads("MetadataVersionReads", cc), transactionCommittedMutations("CommittedMutations", cc), transactionCommittedMutationBytes("CommittedMutationBytes", cc), transactionSetMutations("SetMutations", cc), transactionClearMutations("ClearMutations", cc), transactionAtomicMutations("AtomicMutations", cc), transactionsCommitStarted("CommitStarted", cc), transactionsCommitCompleted("CommitCompleted", cc), transactionKeyServerLocationRequests("KeyServerLocationRequests", cc), transactionKeyServerLocationRequestsCompleted("KeyServerLocationRequestsCompleted", cc), transactionStatusRequests("StatusRequests", cc), transactionsTooOld("TooOld", cc), transactionsFutureVersions("FutureVersions", cc), transactionsNotCommitted("NotCommitted", cc), transactionsMaybeCommitted("MaybeCommitted", cc), transactionsResourceConstrained("ResourceConstrained", cc), transactionsProcessBehind("ProcessBehind", cc), transactionsThrottled("Throttled", cc), transactionsExpensiveClearCostEstCount("ExpensiveClearCostEstCount", cc), transactionGrvFullBatches("NumGrvFullBatches", cc), transactionGrvTimedOutBatches("NumGrvTimedOutBatches", cc), transactionsStaleVersionVectors("NumStaleVersionVectors", cc), latencies(1000), readLatencies(1000), commitLatencies(1000), GRVLatencies(1000), mutationsPerCommit(1000), bytesPerCommit(1000), bgLatencies(1000), bgGranulesPerRequest(1000), transactionTracingSample(false), smoothMidShardSize(CLIENT_KNOBS->SHARD_STAT_SMOOTH_AMOUNT), connectToDatabaseEventCacheHolder(format("ConnectToDatabase/%s", dbId.toString().c_str())) {} // Static constructor used by server processes to create a DatabaseContext // For internal (fdbserver) use only Database DatabaseContext::create(Reference> clientInfo, Future clientInfoMonitor, LocalityData clientLocality, EnableLocalityLoadBalance enableLocalityLoadBalance, TaskPriority taskID, LockAware lockAware, int apiVersion, IsSwitchable switchable) { return Database(new DatabaseContext(Reference>>(), clientInfo, makeReference>>(), clientInfoMonitor, taskID, clientLocality, enableLocalityLoadBalance, lockAware, IsInternal::True, apiVersion, switchable)); } DatabaseContext::~DatabaseContext() { cacheListMonitor.cancel(); clientDBInfoMonitor.cancel(); monitorTssInfoChange.cancel(); tssMismatchHandler.cancel(); if 
(grvUpdateHandler.isValid()) { grvUpdateHandler.cancel(); } if (sharedStatePtr) { sharedStatePtr->delRef(sharedStatePtr); } for (auto it = server_interf.begin(); it != server_interf.end(); it = server_interf.erase(it)) it->second->notifyContextDestroyed(); ASSERT_ABORT(server_interf.empty()); locationCache.insert(allKeys, Reference()); } Optional DatabaseContext::getCachedLocation(const Optional& tenantName, const KeyRef& key, Reverse isBackward) { TenantMapEntry tenantEntry; Arena arena; KeyRef resolvedKey = key; if (tenantName.present()) { auto itr = tenantCache.find(tenantName.get()); if (itr != tenantCache.end()) { tenantEntry = itr->second; resolvedKey = resolvedKey.withPrefix(tenantEntry.prefix, arena); } else { return Optional(); } } auto range = isBackward ? locationCache.rangeContainingKeyBefore(resolvedKey) : locationCache.rangeContaining(resolvedKey); if (range->value()) { return KeyRangeLocationInfo(tenantEntry, toRelativeRange(range->range(), tenantEntry.prefix), range->value()); } return Optional(); } bool DatabaseContext::getCachedLocations(const Optional& tenantName, const KeyRangeRef& range, std::vector& result, int limit, Reverse reverse) { result.clear(); TenantMapEntry tenantEntry; Arena arena; KeyRangeRef resolvedRange = range; if (tenantName.present()) { auto itr = tenantCache.find(tenantName.get()); if (itr != tenantCache.end()) { tenantEntry = itr->second; resolvedRange = resolvedRange.withPrefix(tenantEntry.prefix, arena); } else { return false; } } auto begin = locationCache.rangeContaining(resolvedRange.begin); auto end = locationCache.rangeContainingKeyBefore(resolvedRange.end); loop { auto r = reverse ? end : begin; if (!r->value()) { TEST(result.size()); // had some but not all cached locations result.clear(); return false; } result.emplace_back(tenantEntry, toRelativeRange(r->range() & resolvedRange, tenantEntry.prefix), r->value()); if (result.size() == limit || begin == end) { break; } if (reverse) --end; else ++begin; } return true; } void DatabaseContext::cacheTenant(const TenantName& tenant, const TenantMapEntry& tenantEntry) { if (tenantCacheSize > 0) { // Naive cache eviction just erases the entire cache when it gets full. // We don't expect a single client to fill the tenant cache typically, so this should work reasonably well. 
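// Concretely (illustrative size, not the knob default): with tenantCacheSize == 1000, the cache can
// briefly grow to 1001 entries; the next insert after that clears all of them and starts over with
// the single new entry.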
if (tenantCache.size() > tenantCacheSize) { tenantCache.clear(); } tenantCache[tenant] = tenantEntry; } } Reference DatabaseContext::setCachedLocation(const Optional& tenant, const TenantMapEntry& tenantEntry, const KeyRangeRef& absoluteKeys, const std::vector& servers) { if (tenant.present()) { cacheTenant(tenant.get(), tenantEntry); } std::vector>> serverRefs; serverRefs.reserve(servers.size()); for (const auto& interf : servers) { serverRefs.push_back(StorageServerInfo::getInterface(this, interf, clientLocality)); } int maxEvictionAttempts = 100, attempts = 0; auto loc = makeReference(serverRefs); while (locationCache.size() > locationCacheSize && attempts < maxEvictionAttempts) { TEST(true); // NativeAPI storage server locationCache entry evicted attempts++; auto r = locationCache.randomRange(); Key begin = r.begin(), end = r.end(); // insert invalidates r, so can't be passed a mere reference into it locationCache.insert(KeyRangeRef(begin, end), Reference()); } locationCache.insert(absoluteKeys, loc); return loc; } void DatabaseContext::invalidateCachedTenant(const TenantNameRef& tenant) { tenantCache.erase(tenant); } void DatabaseContext::invalidateCache(const KeyRef& tenantPrefix, const KeyRef& key, Reverse isBackward) { Arena arena; KeyRef resolvedKey = key; if (!tenantPrefix.empty()) { resolvedKey = resolvedKey.withPrefix(tenantPrefix, arena); } if (isBackward) { locationCache.rangeContainingKeyBefore(resolvedKey)->value() = Reference(); } else { locationCache.rangeContaining(resolvedKey)->value() = Reference(); } } void DatabaseContext::invalidateCache(const KeyRef& tenantPrefix, const KeyRangeRef& keys) { Arena arena; KeyRangeRef resolvedKeys = keys; if (!tenantPrefix.empty()) { resolvedKeys = resolvedKeys.withPrefix(tenantPrefix, arena); } auto rs = locationCache.intersectingRanges(resolvedKeys); Key begin = rs.begin().begin(), end = rs.end().begin(); // insert invalidates rs, so can't be passed a mere reference into it locationCache.insert(KeyRangeRef(begin, end), Reference()); } void DatabaseContext::setFailedEndpointOnHealthyServer(const Endpoint& endpoint) { if (failedEndpointsOnHealthyServersInfo.find(endpoint) == failedEndpointsOnHealthyServersInfo.end()) { failedEndpointsOnHealthyServersInfo[endpoint] = EndpointFailureInfo{ .startTime = now(), .lastRefreshTime = now() }; } } void DatabaseContext::updateFailedEndpointRefreshTime(const Endpoint& endpoint) { if (failedEndpointsOnHealthyServersInfo.find(endpoint) == failedEndpointsOnHealthyServersInfo.end()) { // The endpoint is not failed. Nothing to update. 
return; } failedEndpointsOnHealthyServersInfo[endpoint].lastRefreshTime = now(); } Optional DatabaseContext::getEndpointFailureInfo(const Endpoint& endpoint) { if (failedEndpointsOnHealthyServersInfo.find(endpoint) == failedEndpointsOnHealthyServersInfo.end()) { return Optional(); } return failedEndpointsOnHealthyServersInfo[endpoint]; } void DatabaseContext::clearFailedEndpointOnHealthyServer(const Endpoint& endpoint) { failedEndpointsOnHealthyServersInfo.erase(endpoint); } Future DatabaseContext::onProxiesChanged() const { return this->proxiesChangeTrigger.onTrigger(); } bool DatabaseContext::sampleReadTags() const { double sampleRate = GlobalConfig::globalConfig().get(transactionTagSampleRate, CLIENT_KNOBS->READ_TAG_SAMPLE_RATE); return sampleRate > 0 && deterministicRandom()->random01() <= sampleRate; } bool DatabaseContext::sampleOnCost(uint64_t cost) const { double sampleCost = GlobalConfig::globalConfig().get(transactionTagSampleCost, CLIENT_KNOBS->COMMIT_SAMPLE_COST); if (sampleCost <= 0) return false; return deterministicRandom()->random01() <= (double)cost / sampleCost; } int64_t extractIntOption(Optional value, int64_t minValue, int64_t maxValue) { validateOptionValuePresent(value); if (value.get().size() != 8) { throw invalid_option_value(); } int64_t passed = *((int64_t*)(value.get().begin())); if (passed > maxValue || passed < minValue) { throw invalid_option_value(); } return passed; } uint64_t extractHexOption(StringRef value) { char* end; uint64_t id = strtoull(value.toString().c_str(), &end, 16); if (*end) throw invalid_option_value(); return id; } void DatabaseContext::setOption(FDBDatabaseOptions::Option option, Optional value) { int defaultFor = FDBDatabaseOptions::optionInfo.getMustExist(option).defaultFor; if (defaultFor >= 0) { ASSERT(FDBTransactionOptions::optionInfo.find((FDBTransactionOptions::Option)defaultFor) != FDBTransactionOptions::optionInfo.end()); transactionDefaults.addOption((FDBTransactionOptions::Option)defaultFor, value.castTo>()); } else { switch (option) { case FDBDatabaseOptions::LOCATION_CACHE_SIZE: locationCacheSize = (int)extractIntOption(value, 0, std::numeric_limits::max()); break; case FDBDatabaseOptions::MACHINE_ID: clientLocality = LocalityData(clientLocality.processId(), value.present() ? Standalone(value.get()) : Optional>(), clientLocality.machineId(), clientLocality.dcId()); if (clientInfo->get().commitProxies.size()) commitProxies = makeReference(clientInfo->get().commitProxies); if (clientInfo->get().grvProxies.size()) grvProxies = makeReference(clientInfo->get().grvProxies, BalanceOnRequests::True); server_interf.clear(); locationCache.insert(allKeys, Reference()); break; case FDBDatabaseOptions::MAX_WATCHES: maxOutstandingWatches = (int)extractIntOption(value, 0, CLIENT_KNOBS->ABSOLUTE_MAX_WATCHES); break; case FDBDatabaseOptions::DATACENTER_ID: clientLocality = LocalityData(clientLocality.processId(), clientLocality.zoneId(), clientLocality.machineId(), value.present() ? 
Standalone(value.get()) : Optional>()); if (clientInfo->get().commitProxies.size()) commitProxies = makeReference(clientInfo->get().commitProxies); if (clientInfo->get().grvProxies.size()) grvProxies = makeReference(clientInfo->get().grvProxies, BalanceOnRequests::True); server_interf.clear(); locationCache.insert(allKeys, Reference()); break; case FDBDatabaseOptions::SNAPSHOT_RYW_ENABLE: validateOptionValueNotPresent(value); snapshotRywEnabled++; break; case FDBDatabaseOptions::SNAPSHOT_RYW_DISABLE: validateOptionValueNotPresent(value); snapshotRywEnabled--; break; case FDBDatabaseOptions::USE_CONFIG_DATABASE: validateOptionValueNotPresent(value); useConfigDatabase = true; break; case FDBDatabaseOptions::TEST_CAUSAL_READ_RISKY: verifyCausalReadsProp = double(extractIntOption(value, 0, 100)) / 100.0; break; default: break; } } } void DatabaseContext::addWatch() { if (outstandingWatches >= maxOutstandingWatches) throw too_many_watches(); ++outstandingWatches; } void DatabaseContext::removeWatch() { --outstandingWatches; ASSERT(outstandingWatches >= 0); } Future DatabaseContext::onConnected() { return connected; } ACTOR static Future switchConnectionRecordImpl(Reference connRecord, DatabaseContext* self) { TEST(true); // Switch connection file TraceEvent("SwitchConnectionRecord") .detail("ClusterFile", connRecord->toString()) .detail("ConnectionString", connRecord->getConnectionString().toString()); // Reset state from former cluster. self->commitProxies.clear(); self->grvProxies.clear(); self->minAcceptableReadVersion = std::numeric_limits::max(); self->tenantCache.clear(); self->invalidateCache(Key(), allKeys); self->ssVersionVectorCache.clear(); auto clearedClientInfo = self->clientInfo->get(); clearedClientInfo.commitProxies.clear(); clearedClientInfo.grvProxies.clear(); clearedClientInfo.id = deterministicRandom()->randomUniqueID(); self->clientInfo->set(clearedClientInfo); self->connectionRecord->set(connRecord); state Database db(Reference::addRef(self)); state Transaction tr(db); loop { tr.setOption(FDBTransactionOptions::READ_LOCK_AWARE); try { TraceEvent("SwitchConnectionRecordAttemptingGRV").log(); Version v = wait(tr.getReadVersion()); TraceEvent("SwitchConnectionRecordGotRV") .detail("ReadVersion", v) .detail("MinAcceptableReadVersion", self->minAcceptableReadVersion); ASSERT(self->minAcceptableReadVersion != std::numeric_limits::max()); self->connectionFileChangedTrigger.trigger(); return Void(); } catch (Error& e) { TraceEvent("SwitchConnectionRecordError").detail("Error", e.what()); wait(tr.onError(e)); } } } Reference DatabaseContext::getConnectionRecord() { if (connectionRecord) { return connectionRecord->get(); } return Reference(); } Future DatabaseContext::switchConnectionRecord(Reference standby) { ASSERT(switchable); return switchConnectionRecordImpl(standby, this); } Future DatabaseContext::connectionFileChanged() { return connectionFileChangedTrigger.onTrigger(); } void DatabaseContext::expireThrottles() { for (auto& priorityItr : throttledTags) { for (auto tagItr = priorityItr.second.begin(); tagItr != priorityItr.second.end();) { if (tagItr->second.expired()) { TEST(true); // Expiring client throttle tagItr = priorityItr.second.erase(tagItr); } else { ++tagItr; } } } } extern IPAddress determinePublicIPAutomatically(ClusterConnectionString& ccs); // Creates a database object that represents a connection to a cluster // This constructor uses a preallocated DatabaseContext that may have been created // on another thread Database Database::createDatabase(Reference 
connRecord, int apiVersion, IsInternal internal, LocalityData const& clientLocality, DatabaseContext* preallocatedDb) { if (!g_network) throw network_not_setup(); ASSERT(TraceEvent::isNetworkThread()); platform::ImageInfo imageInfo = platform::getImageInfo(); if (connRecord) { if (networkOptions.traceDirectory.present() && !traceFileIsOpen()) { g_network->initMetrics(); FlowTransport::transport().initMetrics(); initTraceEventMetrics(); auto publicIP = determinePublicIPAutomatically(connRecord->getConnectionString()); selectTraceFormatter(networkOptions.traceFormat); selectTraceClockSource(networkOptions.traceClockSource); addUniversalTraceField("ClientDescription", format("%s-%s-%" PRIu64, networkOptions.primaryClient ? "primary" : "external", FDB_VT_VERSION, getTraceThreadId())); openTraceFile(NetworkAddress(publicIP, ::getpid()), networkOptions.traceRollSize, networkOptions.traceMaxLogsSize, networkOptions.traceDirectory.get(), "trace", networkOptions.traceLogGroup, networkOptions.traceFileIdentifier, networkOptions.tracePartialFileSuffix); TraceEvent("ClientStart") .detail("SourceVersion", getSourceVersion()) .detail("Version", FDB_VT_VERSION) .detail("PackageName", FDB_VT_PACKAGE_NAME) .detailf("ActualTime", "%lld", DEBUG_DETERMINISM ? 0 : time(nullptr)) .detail("ApiVersion", apiVersion) .detail("ClientLibrary", imageInfo.fileName) .detailf("ImageOffset", "%p", imageInfo.offset) .detail("Primary", networkOptions.primaryClient) .trackLatest("ClientStart"); initializeSystemMonitorMachineState(SystemMonitorMachineState(IPAddress(publicIP))); systemMonitor(); uncancellable(recurring(&systemMonitor, CLIENT_KNOBS->SYSTEM_MONITOR_INTERVAL, TaskPriority::FlushTrace)); } } g_network->initTLS(); auto clientInfo = makeReference>(); auto coordinator = makeReference>>(); auto connectionRecord = makeReference>>(); connectionRecord->set(connRecord); Future clientInfoMonitor = monitorProxies(connectionRecord, clientInfo, coordinator, networkOptions.supportedVersions, StringRef(networkOptions.traceLogGroup)); DatabaseContext* db; if (preallocatedDb) { db = new (preallocatedDb) DatabaseContext(connectionRecord, clientInfo, coordinator, clientInfoMonitor, TaskPriority::DefaultEndpoint, clientLocality, EnableLocalityLoadBalance::True, LockAware::False, internal, apiVersion, IsSwitchable::True); } else { db = new DatabaseContext(connectionRecord, clientInfo, coordinator, clientInfoMonitor, TaskPriority::DefaultEndpoint, clientLocality, EnableLocalityLoadBalance::True, LockAware::False, internal, apiVersion, IsSwitchable::True); } auto database = Database(db); GlobalConfig::create( database, Reference const>(clientInfo), std::addressof(clientInfo->get())); GlobalConfig::globalConfig().trigger(samplingFrequency, samplingProfilerUpdateFrequency); GlobalConfig::globalConfig().trigger(samplingWindow, samplingProfilerUpdateWindow); TraceEvent("ConnectToDatabase", database->dbId) .detail("Version", FDB_VT_VERSION) .detail("ClusterFile", connRecord ? connRecord->toString() : "None") .detail("ConnectionString", connRecord ? 
connRecord->getConnectionString().toString() : "None") .detail("ClientLibrary", imageInfo.fileName) .detail("Primary", networkOptions.primaryClient) .detail("Internal", internal) .trackLatest(database->connectToDatabaseEventCacheHolder.trackingKey); return database; } Database Database::createDatabase(std::string connFileName, int apiVersion, IsInternal internal, LocalityData const& clientLocality) { Reference rccr = Reference( new ClusterConnectionFile(ClusterConnectionFile::lookupClusterFileName(connFileName).first)); return Database::createDatabase(rccr, apiVersion, internal, clientLocality); } Reference DatabaseContext::getWatchMetadata(int64_t tenantId, KeyRef key) const { const auto it = watchMap.find(std::make_pair(tenantId, key)); if (it == watchMap.end()) return Reference(); return it->second; } void DatabaseContext::setWatchMetadata(Reference metadata) { watchMap[std::make_pair(metadata->parameters->tenant.tenantId, metadata->parameters->key)] = metadata; } void DatabaseContext::deleteWatchMetadata(int64_t tenantId, KeyRef key) { watchMap.erase(std::make_pair(tenantId, key)); } void DatabaseContext::clearWatchMetadata() { watchMap.clear(); } const UniqueOrderedOptionList& Database::getTransactionDefaults() const { ASSERT(db); return db->transactionDefaults; } void setNetworkOption(FDBNetworkOptions::Option option, Optional value) { std::regex identifierRegex("^[a-zA-Z0-9_]*$"); switch (option) { // SOMEDAY: If the network is already started, should these five throw an error? case FDBNetworkOptions::TRACE_ENABLE: networkOptions.traceDirectory = value.present() ? value.get().toString() : ""; break; case FDBNetworkOptions::TRACE_ROLL_SIZE: validateOptionValuePresent(value); networkOptions.traceRollSize = extractIntOption(value, 0, std::numeric_limits::max()); break; case FDBNetworkOptions::TRACE_MAX_LOGS_SIZE: validateOptionValuePresent(value); networkOptions.traceMaxLogsSize = extractIntOption(value, 0, std::numeric_limits::max()); break; case FDBNetworkOptions::TRACE_FORMAT: validateOptionValuePresent(value); networkOptions.traceFormat = value.get().toString(); if (!validateTraceFormat(networkOptions.traceFormat)) { fprintf(stderr, "Unrecognized trace format: `%s'\n", networkOptions.traceFormat.c_str()); throw invalid_option_value(); } break; case FDBNetworkOptions::TRACE_FILE_IDENTIFIER: validateOptionValuePresent(value); networkOptions.traceFileIdentifier = value.get().toString(); if (networkOptions.traceFileIdentifier.length() > CLIENT_KNOBS->TRACE_LOG_FILE_IDENTIFIER_MAX_LENGTH) { fprintf(stderr, "Trace file identifier provided is too long.\n"); throw invalid_option_value(); } else if (!std::regex_match(networkOptions.traceFileIdentifier, identifierRegex)) { fprintf(stderr, "Trace file identifier should only contain alphanumerics and underscores.\n"); throw invalid_option_value(); } break; case FDBNetworkOptions::TRACE_LOG_GROUP: if (value.present()) { if (traceFileIsOpen()) { setTraceLogGroup(value.get().toString()); } else { networkOptions.traceLogGroup = value.get().toString(); } } break; case FDBNetworkOptions::TRACE_CLOCK_SOURCE: validateOptionValuePresent(value); networkOptions.traceClockSource = value.get().toString(); if (!validateTraceClockSource(networkOptions.traceClockSource)) { fprintf(stderr, "Unrecognized trace clock source: `%s'\n", networkOptions.traceClockSource.c_str()); throw invalid_option_value(); } break; case FDBNetworkOptions::TRACE_PARTIAL_FILE_SUFFIX: validateOptionValuePresent(value); networkOptions.tracePartialFileSuffix = value.get().toString(); 
break; case FDBNetworkOptions::KNOB: { validateOptionValuePresent(value); std::string optionValue = value.get().toString(); TraceEvent("SetKnob").detail("KnobString", optionValue); size_t eq = optionValue.find_first_of('='); if (eq == optionValue.npos) { TraceEvent(SevWarnAlways, "InvalidKnobString").detail("KnobString", optionValue); throw invalid_option_value(); } std::string knobName = optionValue.substr(0, eq); std::string knobValueString = optionValue.substr(eq + 1); try { auto knobValue = IKnobCollection::parseKnobValue(knobName, knobValueString, IKnobCollection::Type::CLIENT); if (g_network) { IKnobCollection::getMutableGlobalKnobCollection().setKnob(knobName, knobValue); } else { networkOptions.knobs[knobName] = knobValue; } } catch (Error& e) { TraceEvent(SevWarnAlways, "UnrecognizedKnob").detail("Knob", knobName.c_str()); fprintf(stderr, "FoundationDB client ignoring unrecognized knob option '%s'\n", knobName.c_str()); } break; } case FDBNetworkOptions::TLS_PLUGIN: validateOptionValuePresent(value); break; case FDBNetworkOptions::TLS_CERT_PATH: validateOptionValuePresent(value); tlsConfig.setCertificatePath(value.get().toString()); break; case FDBNetworkOptions::TLS_CERT_BYTES: { validateOptionValuePresent(value); tlsConfig.setCertificateBytes(value.get().toString()); break; } case FDBNetworkOptions::TLS_CA_PATH: { validateOptionValuePresent(value); tlsConfig.setCAPath(value.get().toString()); break; } case FDBNetworkOptions::TLS_CA_BYTES: { validateOptionValuePresent(value); tlsConfig.setCABytes(value.get().toString()); break; } case FDBNetworkOptions::TLS_PASSWORD: validateOptionValuePresent(value); tlsConfig.setPassword(value.get().toString()); break; case FDBNetworkOptions::TLS_KEY_PATH: validateOptionValuePresent(value); tlsConfig.setKeyPath(value.get().toString()); break; case FDBNetworkOptions::TLS_KEY_BYTES: { validateOptionValuePresent(value); tlsConfig.setKeyBytes(value.get().toString()); break; } case FDBNetworkOptions::TLS_VERIFY_PEERS: validateOptionValuePresent(value); tlsConfig.clearVerifyPeers(); tlsConfig.addVerifyPeers(value.get().toString()); break; case FDBNetworkOptions::CLIENT_BUGGIFY_ENABLE: enableBuggify(true, BuggifyType::Client); break; case FDBNetworkOptions::CLIENT_BUGGIFY_DISABLE: enableBuggify(false, BuggifyType::Client); break; case FDBNetworkOptions::CLIENT_BUGGIFY_SECTION_ACTIVATED_PROBABILITY: validateOptionValuePresent(value); clearBuggifySections(BuggifyType::Client); P_BUGGIFIED_SECTION_ACTIVATED[int(BuggifyType::Client)] = double(extractIntOption(value, 0, 100)) / 100.0; break; case FDBNetworkOptions::CLIENT_BUGGIFY_SECTION_FIRED_PROBABILITY: validateOptionValuePresent(value); P_BUGGIFIED_SECTION_FIRES[int(BuggifyType::Client)] = double(extractIntOption(value, 0, 100)) / 100.0; break; case FDBNetworkOptions::DISABLE_CLIENT_STATISTICS_LOGGING: validateOptionValueNotPresent(value); networkOptions.logClientInfo = false; break; case FDBNetworkOptions::SUPPORTED_CLIENT_VERSIONS: { // The multi-version API should be providing us these guarantees ASSERT(g_network); ASSERT(value.present()); Standalone> supportedVersions; std::vector supportedVersionsStrings = value.get().splitAny(LiteralStringRef(";")); for (StringRef versionString : supportedVersionsStrings) { #ifdef ADDRESS_SANITIZER __lsan_disable(); #endif // LSAN reports that we leak this allocation in client // tests, but I cannot seem to figure out why. AFAICT // it's not actually leaking. If it is a leak, it's only a few bytes. 
supportedVersions.push_back_deep(supportedVersions.arena(), ClientVersionRef(versionString)); #ifdef ADDRESS_SANITIZER __lsan_enable(); #endif } ASSERT(supportedVersions.size() > 0); networkOptions.supportedVersions->set(supportedVersions); break; } case FDBNetworkOptions::ENABLE_RUN_LOOP_PROFILING: // Same as ENABLE_SLOW_TASK_PROFILING validateOptionValueNotPresent(value); networkOptions.runLoopProfilingEnabled = true; break; case FDBNetworkOptions::DISTRIBUTED_CLIENT_TRACER: { validateOptionValuePresent(value); std::string tracer = value.get().toString(); if (tracer == "none" || tracer == "disabled") { openTracer(TracerType::DISABLED); } else if (tracer == "logfile" || tracer == "file" || tracer == "log_file") { openTracer(TracerType::LOG_FILE); } else if (tracer == "network_lossy") { openTracer(TracerType::NETWORK_LOSSY); } else { fprintf(stderr, "ERROR: Unknown or unsupported tracer: `%s'", tracer.c_str()); throw invalid_option_value(); } break; } case FDBNetworkOptions::EXTERNAL_CLIENT: networkOptions.primaryClient = false; break; default: break; } } // update the network busyness on a 1s cadence ACTOR Future monitorNetworkBusyness() { state double prevTime = now(); loop { wait(delay(CLIENT_KNOBS->NETWORK_BUSYNESS_MONITOR_INTERVAL, TaskPriority::FlushTrace)); double elapsed = now() - prevTime; // get elapsed time from last execution prevTime = now(); struct NetworkMetrics::PriorityStats& tracker = g_network->networkInfo.metrics.starvationTrackerNetworkBusyness; if (tracker.active) { // update metrics tracker.duration += now() - tracker.windowedTimer; tracker.maxDuration = std::max(tracker.maxDuration, now() - tracker.timer); tracker.windowedTimer = now(); } double busyFraction = std::min(elapsed, tracker.duration) / elapsed; // The burstiness score is an indicator of the maximum busyness spike over the measurement interval. // It scales linearly from 0 to 1 as the largest burst goes from the start to the saturation threshold. // This allows us to account for saturation that happens in smaller bursts than the measurement interval. // // Burstiness will not be calculated if the saturation threshold is smaller than the start threshold or // if either value is negative. 
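// Worked example (illustrative thresholds, not the actual knob values): with
// BUSYNESS_SPIKE_START_THRESHOLD = 0.1s and BUSYNESS_SPIKE_SATURATED_THRESHOLD = 0.5s, a maximum
// burst of 0.3s gives burstiness = (0.3 - 0.1) / (0.5 - 0.1) = 0.5, while any burst of 0.5s or
// longer clamps to 1.0.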
double burstiness = 0; if (CLIENT_KNOBS->BUSYNESS_SPIKE_START_THRESHOLD >= 0 && CLIENT_KNOBS->BUSYNESS_SPIKE_SATURATED_THRESHOLD >= CLIENT_KNOBS->BUSYNESS_SPIKE_START_THRESHOLD) { burstiness = std::min(1.0, std::max(0.0, tracker.maxDuration - CLIENT_KNOBS->BUSYNESS_SPIKE_START_THRESHOLD) / std::max(1e-6, CLIENT_KNOBS->BUSYNESS_SPIKE_SATURATED_THRESHOLD - CLIENT_KNOBS->BUSYNESS_SPIKE_START_THRESHOLD)); } g_network->networkInfo.metrics.networkBusyness = std::max(busyFraction, burstiness); tracker.duration = 0; tracker.maxDuration = 0; } } static void setupGlobalKnobs() { IKnobCollection::setGlobalKnobCollection(IKnobCollection::Type::CLIENT, Randomize::False, IsSimulated::False); for (const auto& [knobName, knobValue] : networkOptions.knobs) { IKnobCollection::getMutableGlobalKnobCollection().setKnob(knobName, knobValue); } } // Setup g_network and start monitoring for network busyness void setupNetwork(uint64_t transportId, UseMetrics useMetrics) { if (g_network) throw network_already_setup(); if (!networkOptions.logClientInfo.present()) networkOptions.logClientInfo = true; setupGlobalKnobs(); g_network = newNet2(tlsConfig, false, useMetrics || networkOptions.traceDirectory.present()); g_network->addStopCallback(Net2FileSystem::stop); FlowTransport::createInstance(true, transportId, WLTOKEN_RESERVED_COUNT); Net2FileSystem::newFileSystem(); uncancellable(monitorNetworkBusyness()); } void runNetwork() { if (!g_network) { throw network_not_setup(); } if (!g_network->checkRunnable()) { throw network_cannot_be_restarted(); } if (networkOptions.traceDirectory.present() && networkOptions.runLoopProfilingEnabled) { setupRunLoopProfiler(); } g_network->run(); if (networkOptions.traceDirectory.present()) systemMonitor(); } void stopNetwork() { if (!g_network) throw network_not_setup(); TraceEvent("ClientStopNetwork").log(); g_network->stop(); closeTraceFile(); } void DatabaseContext::updateProxies() { if (proxiesLastChange == clientInfo->get().id) return; proxiesLastChange = clientInfo->get().id; commitProxies.clear(); grvProxies.clear(); ssVersionVectorCache.clear(); bool commitProxyProvisional = false, grvProxyProvisional = false; if (clientInfo->get().commitProxies.size()) { commitProxies = makeReference(clientInfo->get().commitProxies); commitProxyProvisional = clientInfo->get().commitProxies[0].provisional; } if (clientInfo->get().grvProxies.size()) { grvProxies = makeReference(clientInfo->get().grvProxies, BalanceOnRequests::True); grvProxyProvisional = clientInfo->get().grvProxies[0].provisional; } if (clientInfo->get().commitProxies.size() && clientInfo->get().grvProxies.size()) { ASSERT(commitProxyProvisional == grvProxyProvisional); proxyProvisional = commitProxyProvisional; } } Reference DatabaseContext::getCommitProxies(UseProvisionalProxies useProvisionalProxies) { updateProxies(); if (proxyProvisional && !useProvisionalProxies) { return Reference(); } return commitProxies; } Reference DatabaseContext::getGrvProxies(UseProvisionalProxies useProvisionalProxies) { updateProxies(); if (proxyProvisional && !useProvisionalProxies) { return Reference(); } return grvProxies; } bool DatabaseContext::isCurrentGrvProxy(UID proxyId) const { for (const auto& proxy : clientInfo->get().grvProxies) { if (proxy.id() == proxyId) return true; } TEST(true); // stale GRV proxy detected return false; } // Actor which will wait until the MultiInterface returned by the DatabaseContext cx is not // nullptr ACTOR Future> getCommitProxiesFuture(DatabaseContext* cx, UseProvisionalProxies useProvisionalProxies) { 
loop { Reference commitProxies = cx->getCommitProxies(useProvisionalProxies); if (commitProxies) return commitProxies; wait(cx->onProxiesChanged()); } } // Returns a future which will not be set until the CommitProxyInfo of this DatabaseContext is not nullptr Future> DatabaseContext::getCommitProxiesFuture( UseProvisionalProxies useProvisionalProxies) { return ::getCommitProxiesFuture(this, useProvisionalProxies); } void GetRangeLimits::decrement(VectorRef const& data) { if (rows != GetRangeLimits::ROW_LIMIT_UNLIMITED) { ASSERT(data.size() <= rows); rows -= data.size(); } minRows = std::max(0, minRows - data.size()); if (bytes != GetRangeLimits::BYTE_LIMIT_UNLIMITED) bytes = std::max(0, bytes - (int)data.expectedSize() - (8 - (int)sizeof(KeyValueRef)) * data.size()); } void GetRangeLimits::decrement(KeyValueRef const& data) { minRows = std::max(0, minRows - 1); if (rows != GetRangeLimits::ROW_LIMIT_UNLIMITED) rows--; if (bytes != GetRangeLimits::BYTE_LIMIT_UNLIMITED) bytes = std::max(0, bytes - (int)8 - (int)data.expectedSize()); } void GetRangeLimits::decrement(VectorRef const& data) { if (rows != GetRangeLimits::ROW_LIMIT_UNLIMITED) { ASSERT(data.size() <= rows); rows -= data.size(); } minRows = std::max(0, minRows - data.size()); // TODO: For now, expectedSize only considers the size of the original key values, but not the underlying queries or // results. Also, double check it is correct when dealing with sizeof(MappedKeyValueRef). if (bytes != GetRangeLimits::BYTE_LIMIT_UNLIMITED) bytes = std::max(0, bytes - (int)data.expectedSize() - (8 - (int)sizeof(MappedKeyValueRef)) * data.size()); } void GetRangeLimits::decrement(MappedKeyValueRef const& data) { minRows = std::max(0, minRows - 1); if (rows != GetRangeLimits::ROW_LIMIT_UNLIMITED) rows--; // TODO: For now, expectedSize only considers the size of the original key values, but not the underlying queries or // results. Also, double check it is correct when dealing with sizeof(MappedKeyValueRef). if (bytes != GetRangeLimits::BYTE_LIMIT_UNLIMITED) bytes = std::max(0, bytes - (int)8 - (int)data.expectedSize()); } // True if either the row or byte limit has been reached bool GetRangeLimits::isReached() { return rows == 0 || (bytes == 0 && minRows == 0); } // True if data would cause the row or byte limit to be reached bool GetRangeLimits::reachedBy(VectorRef const& data) { return (rows != GetRangeLimits::ROW_LIMIT_UNLIMITED && data.size() >= rows) || (bytes != GetRangeLimits::BYTE_LIMIT_UNLIMITED && (int)data.expectedSize() + (8 - (int)sizeof(KeyValueRef)) * data.size() >= bytes && data.size() >= minRows); } bool GetRangeLimits::hasByteLimit() { return bytes != GetRangeLimits::BYTE_LIMIT_UNLIMITED; } bool GetRangeLimits::hasRowLimit() { return rows != GetRangeLimits::ROW_LIMIT_UNLIMITED; } bool GetRangeLimits::hasSatisfiedMinRows() { return hasByteLimit() && minRows == 0; } AddressExclusion AddressExclusion::parse(StringRef const& key) { // Must not change: serialized to the database! auto parsedIp = IPAddress::parse(key.toString()); if (parsedIp.present()) { return AddressExclusion(parsedIp.get()); } // Not a whole machine, includes `port'. 
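// For example (hypothetical inputs): "1.2.3.4" parses as a machine-wide exclusion, while
// "1.2.3.4:4500" excludes only the single process listening on that address and port.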
try { auto addr = NetworkAddress::parse(key.toString()); if (addr.isTLS()) { TraceEvent(SevWarnAlways, "AddressExclusionParseError") .detail("String", key) .detail("Description", "Address inclusion string should not include `:tls' suffix."); return AddressExclusion(); } return AddressExclusion(addr.ip, addr.port); } catch (Error&) { TraceEvent(SevWarnAlways, "AddressExclusionParseError").detail("String", key); return AddressExclusion(); } } Future> getValue(Reference const& trState, Key const& key, Future const& version, UseTenant const& useTenant = UseTenant::True, TransactionRecordLogInfo const& recordLogInfo = TransactionRecordLogInfo::True); Future getRange(Reference const& trState, Future const& fVersion, KeySelector const& begin, KeySelector const& end, GetRangeLimits const& limits, Reverse const& reverse, UseTenant const& useTenant); ACTOR Future> fetchServerInterface(Reference trState, Future ver, UID id) { Optional val = wait(getValue(trState, serverListKeyFor(id), ver, UseTenant::False, TransactionRecordLogInfo::False)); if (!val.present()) { // A storage server has been removed from serverList since we read keyServers return Optional(); } return decodeServerListValue(val.get()); } ACTOR Future>> transactionalGetServerInterfaces(Reference trState, Future ver, std::vector ids) { state std::vector>> serverListEntries; serverListEntries.reserve(ids.size()); for (int s = 0; s < ids.size(); s++) { serverListEntries.push_back(fetchServerInterface(trState, ver, ids[s])); } std::vector> serverListValues = wait(getAll(serverListEntries)); std::vector serverInterfaces; for (int s = 0; s < serverListValues.size(); s++) { if (!serverListValues[s].present()) { // A storage server has been removed from ServerList since we read keyServers return Optional>(); } serverInterfaces.push_back(serverListValues[s].get()); } return serverInterfaces; } void updateTssMappings(Database cx, const GetKeyServerLocationsReply& reply) { // Since a ss -> tss mapping is included in resultsTssMapping iff that SS is in results and has a tss pair, // all SS in results that do not have a mapping present must not have a tss pair. std::unordered_map ssiById; for (const auto& [_, shard] : reply.results) { for (auto& ssi : shard) { ssiById[ssi.id()] = &ssi; } } for (const auto& mapping : reply.resultsTssMapping) { auto ssi = ssiById.find(mapping.first); ASSERT(ssi != ssiById.end()); cx->addTssMapping(*ssi->second, mapping.second); ssiById.erase(mapping.first); } // if SS didn't have a mapping above, it's still in the ssiById map, so remove its tss mapping for (const auto& it : ssiById) { cx->removeTssMapping(*it.second); } } void updateTagMappings(Database cx, const GetKeyServerLocationsReply& reply) { for (const auto& mapping : reply.resultsTagMapping) { cx->addSSIdTagMapping(mapping.first, mapping.second); } } // If isBackward == true, returns the shard containing the key before 'key' (an infinitely long, inexpressible key). 
// Otherwise returns the shard containing key ACTOR Future getKeyLocation_internal(Database cx, Optional tenant, Key key, SpanID spanID, Optional debugID, UseProvisionalProxies useProvisionalProxies, Reverse isBackward, Version version) { state Span span("NAPI:getKeyLocation"_loc, spanID); if (isBackward) { ASSERT(key != allKeys.begin && key <= allKeys.end); } else { ASSERT(key < allKeys.end); } if (debugID.present()) g_traceBatch.addEvent("TransactionDebug", debugID.get().first(), "NativeAPI.getKeyLocation.Before"); try { loop { ++cx->transactionKeyServerLocationRequests; choose { when(wait(cx->onProxiesChanged())) {} when(GetKeyServerLocationsReply rep = wait(basicLoadBalance(cx->getCommitProxies(useProvisionalProxies), &CommitProxyInterface::getKeyServersLocations, GetKeyServerLocationsRequest(span.context, tenant.castTo(), key, Optional(), 100, isBackward, version, key.arena()), TaskPriority::DefaultPromiseEndpoint))) { ++cx->transactionKeyServerLocationRequestsCompleted; if (debugID.present()) g_traceBatch.addEvent( "TransactionDebug", debugID.get().first(), "NativeAPI.getKeyLocation.After"); ASSERT(rep.results.size() == 1); auto locationInfo = cx->setCachedLocation(tenant, rep.tenantEntry, rep.results[0].first, rep.results[0].second); updateTssMappings(cx, rep); return KeyRangeLocationInfo( rep.tenantEntry, KeyRange(toRelativeRange(rep.results[0].first, rep.tenantEntry.prefix), rep.arena), locationInfo); } } } } catch (Error& e) { if (e.code() == error_code_tenant_not_found) { ASSERT(tenant.present()); cx->invalidateCachedTenant(tenant.get()); } throw; } } // Checks if `endpoint` is failed on an otherwise healthy server. Returns true if we need to refresh the location // cache for the endpoint. bool checkOnlyEndpointFailed(const Database& cx, const Endpoint& endpoint) { if (IFailureMonitor::failureMonitor().onlyEndpointFailed(endpoint)) { // This endpoint is failed, but the server is still healthy. There are two cases in which this can happen: // - There was a recent bounce in the cluster in which the endpoints on the SSes were updated. // - The SS failed and was terminated on a server, but the server itself is kept running. // To account for the first case, we invalidate the cache and issue GetKeyLocation requests to the proxy to // update the cache with the new SS endpoints. However, if the failure is caused by the second case, the // requested key location will continue to be the failed endpoint until the data movement is finished. But // every read will generate a GetKeyLocation request to the proxies (and still get the failed endpoint // back), which may overload the proxy and affect data movement speed. Therefore, we only refresh the // location cache for a short period of time; after an initial grace period during which we keep retrying // key location resolution, we slow down and re-resolve it only once every // `LOCATION_CACHE_FAILED_ENDPOINT_RETRY_INTERVAL`.
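// Illustrative timeline (hypothetical knob values): with a 60s grace period and a 60s retry
// interval, every location lookup within 60s of the first failure refreshes the cache; after that,
// the cache is refreshed at most once per 60s until the endpoint recovers or the server itself is
// marked failed.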
cx->setFailedEndpointOnHealthyServer(endpoint); const auto& failureInfo = cx->getEndpointFailureInfo(endpoint); ASSERT(failureInfo.present()); if (now() - failureInfo.get().startTime < CLIENT_KNOBS->LOCATION_CACHE_ENDPOINT_FAILURE_GRACE_PERIOD || now() - failureInfo.get().lastRefreshTime > CLIENT_KNOBS->LOCATION_CACHE_FAILED_ENDPOINT_RETRY_INTERVAL) { cx->updateFailedEndpointRefreshTime(endpoint); return true; } } else { cx->clearFailedEndpointOnHealthyServer(endpoint); } return false; } template Future getKeyLocation(Database const& cx, Optional const& tenant, Key const& key, F StorageServerInterface::*member, SpanID spanID, Optional debugID, UseProvisionalProxies useProvisionalProxies, Reverse isBackward, Version version) { // we first check whether this range is cached Optional locationInfo = cx->getCachedLocation(tenant, key, isBackward); if (!locationInfo.present()) { return getKeyLocation_internal(cx, tenant, key, spanID, debugID, useProvisionalProxies, isBackward, version); } bool onlyEndpointFailedAndNeedRefresh = false; for (int i = 0; i < locationInfo.get().locations->size(); i++) { if (checkOnlyEndpointFailed(cx, locationInfo.get().locations->get(i, member).getEndpoint())) { onlyEndpointFailedAndNeedRefresh = true; } } if (onlyEndpointFailedAndNeedRefresh) { cx->invalidateCache(locationInfo.get().tenantEntry.prefix, key); // Refresh the cache with a new getKeyLocations made to proxies. return getKeyLocation_internal(cx, tenant, key, spanID, debugID, useProvisionalProxies, isBackward, version); } return locationInfo.get(); } template Future getKeyLocation(Reference trState, Key const& key, F StorageServerInterface::*member, Reverse isBackward, UseTenant useTenant, Version version) { auto f = getKeyLocation(trState->cx, useTenant ? trState->tenant() : Optional(), key, member, trState->spanID, trState->debugID, trState->useProvisionalProxies, isBackward, version); if (trState->tenant().present() && useTenant) { return map(f, [trState](const KeyRangeLocationInfo& locationInfo) { trState->tenantId = locationInfo.tenantEntry.id; return locationInfo; }); } else { return f; } } ACTOR Future> getKeyRangeLocations_internal( Database cx, Optional tenant, KeyRange keys, int limit, Reverse reverse, SpanID spanID, Optional debugID, UseProvisionalProxies useProvisionalProxies, Version version) { state Span span("NAPI:getKeyRangeLocations"_loc, spanID); if (debugID.present()) g_traceBatch.addEvent("TransactionDebug", debugID.get().first(), "NativeAPI.getKeyLocations.Before"); try { loop { ++cx->transactionKeyServerLocationRequests; choose { when(wait(cx->onProxiesChanged())) {} when(GetKeyServerLocationsReply _rep = wait(basicLoadBalance(cx->getCommitProxies(useProvisionalProxies), &CommitProxyInterface::getKeyServersLocations, GetKeyServerLocationsRequest(span.context, tenant.castTo(), keys.begin, keys.end, limit, reverse, version, keys.arena()), TaskPriority::DefaultPromiseEndpoint))) { ++cx->transactionKeyServerLocationRequestsCompleted; state GetKeyServerLocationsReply rep = _rep; if (debugID.present()) g_traceBatch.addEvent( "TransactionDebug", debugID.get().first(), "NativeAPI.getKeyLocations.After"); ASSERT(rep.results.size()); state std::vector results; state int shard = 0; for (; shard < rep.results.size(); shard++) { // FIXME: these shards are being inserted into the map sequentially, it would be much more CPU // efficient to save the map pairs and insert them all at once. 
results.emplace_back( rep.tenantEntry, (toRelativeRange(rep.results[shard].first, rep.tenantEntry.prefix) & keys), cx->setCachedLocation( tenant, rep.tenantEntry, rep.results[shard].first, rep.results[shard].second)); wait(yield()); } updateTssMappings(cx, rep); updateTagMappings(cx, rep); return results; } } } } catch (Error& e) { if (e.code() == error_code_tenant_not_found) { ASSERT(tenant.present()); cx->invalidateCachedTenant(tenant.get()); } throw; } } // Get the SS locations for each shard in the 'keys' key-range; // Returned vector size is the number of shards in the input keys key-range. // Each returned vector element is a <ShardRange, storage server location info> pair, where // ShardRange is the whole shard key-range, not a part of the given key range. // Example: If we query the function with key range (b, d), the returned list of pairs could be something like: // [([a, b1), locationInfo), ([b1, c), locationInfo), ([c, d1), locationInfo)]. template Future> getKeyRangeLocations(Database const& cx, Optional tenant, KeyRange const& keys, int limit, Reverse reverse, F StorageServerInterface::*member, SpanID const& spanID, Optional const& debugID, UseProvisionalProxies useProvisionalProxies, Version version) { ASSERT(!keys.empty()); std::vector locations; if (!cx->getCachedLocations(tenant, keys, locations, limit, reverse)) { return getKeyRangeLocations_internal( cx, tenant, keys, limit, reverse, spanID, debugID, useProvisionalProxies, version); } bool foundFailed = false; for (const auto& locationInfo : locations) { bool onlyEndpointFailedAndNeedRefresh = false; for (int i = 0; i < locationInfo.locations->size(); i++) { if (checkOnlyEndpointFailed(cx, locationInfo.locations->get(i, member).getEndpoint())) { onlyEndpointFailedAndNeedRefresh = true; } } if (onlyEndpointFailedAndNeedRefresh) { cx->invalidateCache(locationInfo.tenantEntry.prefix, locationInfo.range.begin); foundFailed = true; } } if (foundFailed) { // Refresh the cache with a new getKeyRangeLocations made to proxies. return getKeyRangeLocations_internal( cx, tenant, keys, limit, reverse, spanID, debugID, useProvisionalProxies, version); } return locations; } template Future> getKeyRangeLocations(Reference trState, KeyRange const& keys, int limit, Reverse reverse, F StorageServerInterface::*member, UseTenant useTenant, Version version) { auto f = getKeyRangeLocations(trState->cx, useTenant ? 
trState->tenant() : Optional(), keys, limit, reverse, member, trState->spanID, trState->debugID, trState->useProvisionalProxies, version); if (trState->tenant().present() && useTenant) { return map(f, [trState](const std::vector& locationInfo) { ASSERT(!locationInfo.empty()); trState->tenantId = locationInfo[0].tenantEntry.id; return locationInfo; }); } else { return f; } } ACTOR Future warmRange_impl(Reference trState, KeyRange keys, Future fVersion) { state int totalRanges = 0; state int totalRequests = 0; state Version version = wait(fVersion); loop { std::vector locations = wait(getKeyRangeLocations_internal(trState->cx, trState->tenant(), keys, CLIENT_KNOBS->WARM_RANGE_SHARD_LIMIT, Reverse::False, trState->spanID, trState->debugID, trState->useProvisionalProxies, version)); totalRanges += CLIENT_KNOBS->WARM_RANGE_SHARD_LIMIT; totalRequests++; if (locations.size() == 0 || totalRanges >= trState->cx->locationCacheSize || locations[locations.size() - 1].range.end >= keys.end) break; keys = KeyRangeRef(locations[locations.size() - 1].range.end, keys.end); if (totalRequests % 20 == 0) { // To avoid blocking the proxies from starting other transactions, occasionally get a read version. state Transaction tr(trState->cx, trState->tenant()); loop { try { tr.setOption(FDBTransactionOptions::LOCK_AWARE); tr.setOption(FDBTransactionOptions::CAUSAL_READ_RISKY); wait(success(tr.getReadVersion())); break; } catch (Error& e) { wait(tr.onError(e)); } } } } return Void(); } SpanID generateSpanID(bool transactionTracingSample, SpanID parentContext = SpanID()) { uint64_t txnId = deterministicRandom()->randomUInt64(); if (parentContext.isValid()) { if (parentContext.first() > 0) { txnId = parentContext.first(); } uint64_t tokenId = parentContext.second() > 0 ? deterministicRandom()->randomUInt64() : 0; return SpanID(txnId, tokenId); } else if (transactionTracingSample) { uint64_t tokenId = deterministicRandom()->random01() <= FLOW_KNOBS->TRACING_SAMPLE_RATE ? deterministicRandom()->randomUInt64() : 0; return SpanID(txnId, tokenId); } else { return SpanID(txnId, 0); } } TransactionState::TransactionState(Database cx, Optional tenant, TaskPriority taskID, SpanID spanID, Reference trLogInfo) : cx(cx), trLogInfo(trLogInfo), options(cx), taskID(taskID), spanID(spanID), readVersionObtainedFromGrvProxy(true), tenant_(tenant), tenantSet(tenant.present()) {} Reference TransactionState::cloneAndReset(Reference newTrLogInfo, bool generateNewSpan) const { SpanID newSpanID = generateNewSpan ? 
generateSpanID(cx->transactionTracingSample) : spanID; Reference newState = makeReference(cx, tenant_, cx->taskID, newSpanID, newTrLogInfo); if (!cx->apiVersionAtLeast(16)) { newState->options = options; } newState->numErrors = numErrors; newState->startTime = startTime; newState->committedVersion = committedVersion; newState->conflictingKeys = conflictingKeys; newState->tenantSet = tenantSet; return newState; } TenantInfo TransactionState::getTenantInfo() { Optional const& t = tenant(); if (options.rawAccess) { return TenantInfo(); } else if (!cx->internal && cx->clientInfo->get().tenantMode == TenantMode::REQUIRED && !t.present()) { throw tenant_name_required(); } else if (!t.present()) { return TenantInfo(); } else if (cx->clientInfo->get().tenantMode == TenantMode::DISABLED && t.present()) { throw tenants_disabled(); } ASSERT(tenantId != TenantInfo::INVALID_TENANT); return TenantInfo(t.get(), tenantId); } Optional const& TransactionState::tenant() { if (tenantSet) { return tenant_; } else { if (!tenant_.present() && !options.rawAccess) { tenant_ = cx->defaultTenant; } tenantSet = true; return tenant_; } } bool TransactionState::hasTenant() const { return tenantSet && tenant_.present(); } Future Transaction::warmRange(KeyRange keys) { return warmRange_impl(trState, keys, getReadVersion()); } ACTOR Future> getValue(Reference trState, Key key, Future version, UseTenant useTenant, TransactionRecordLogInfo recordLogInfo) { state Version ver = wait(version); state Span span("NAPI:getValue"_loc, trState->spanID); if (useTenant && trState->tenant().present()) { span.addTag("tenant"_sr, trState->tenant().get()); } span.addTag("key"_sr, key); trState->cx->validateVersion(ver); loop { state KeyRangeLocationInfo locationInfo = wait(getKeyLocation(trState, key, &StorageServerInterface::getValue, Reverse::False, useTenant, ver)); state Optional getValueID = Optional(); state uint64_t startTime; state double startTimeD; state VersionVector ssLatestCommitVersions; trState->cx->getLatestCommitVersions(locationInfo.locations, ver, trState, ssLatestCommitVersions); try { if (trState->debugID.present()) { getValueID = nondeterministicRandom()->randomUniqueID(); g_traceBatch.addAttach("GetValueAttachID", trState->debugID.get().first(), getValueID.get().first()); g_traceBatch.addEvent("GetValueDebug", getValueID.get().first(), "NativeAPI.getValue.Before"); //.detail("TaskID", g_network->getCurrentTask()); /*TraceEvent("TransactionDebugGetValueInfo", getValueID.get()) .detail("Key", key) .detail("ReqVersion", ver) .detail("Servers", describe(ssi.second->get()));*/ } ++trState->cx->getValueSubmitted; startTime = timer_int(); startTimeD = now(); ++trState->cx->transactionPhysicalReads; state GetValueReply reply; try { if (CLIENT_BUGGIFY_WITH_PROB(.01)) { throw deterministicRandom()->randomChoice( std::vector{ transaction_too_old(), future_version() }); } choose { when(wait(trState->cx->connectionFileChanged())) { throw transaction_too_old(); } when(GetValueReply _reply = wait(loadBalance( trState->cx.getPtr(), locationInfo.locations, &StorageServerInterface::getValue, GetValueRequest(span.context, useTenant ? trState->getTenantInfo() : TenantInfo(), key, ver, trState->cx->sampleReadTags() ? trState->options.readTags : Optional(), getValueID, ssLatestCommitVersions), TaskPriority::DefaultPromiseEndpoint, AtMostOnce::False, trState->cx->enableLocalityLoadBalance ? 
&trState->cx->queueModel : nullptr))) { reply = _reply; } } ++trState->cx->transactionPhysicalReadsCompleted; } catch (Error&) { ++trState->cx->transactionPhysicalReadsCompleted; throw; } double latency = now() - startTimeD; trState->cx->readLatencies.addSample(latency); if (trState->trLogInfo && recordLogInfo) { int valueSize = reply.value.present() ? reply.value.get().size() : 0; trState->trLogInfo->addLog(FdbClientLogEvents::EventGet( startTimeD, trState->cx->clientLocality.dcId(), latency, valueSize, key, trState->tenant())); } trState->cx->getValueCompleted->latency = timer_int() - startTime; trState->cx->getValueCompleted->log(); if (getValueID.present()) { g_traceBatch.addEvent("GetValueDebug", getValueID.get().first(), "NativeAPI.getValue.After"); //.detail("TaskID", g_network->getCurrentTask()); /*TraceEvent("TransactionDebugGetValueDone", getValueID.get()) .detail("Key", key) .detail("ReqVersion", ver) .detail("ReplySize", reply.value.present() ? reply.value.get().size() : -1);*/ } trState->cx->transactionBytesRead += reply.value.present() ? reply.value.get().size() : 0; ++trState->cx->transactionKeysRead; return reply.value; } catch (Error& e) { trState->cx->getValueCompleted->latency = timer_int() - startTime; trState->cx->getValueCompleted->log(); if (getValueID.present()) { g_traceBatch.addEvent("GetValueDebug", getValueID.get().first(), "NativeAPI.getValue.Error"); //.detail("TaskID", g_network->getCurrentTask()); /*TraceEvent("TransactionDebugGetValueDone", getValueID.get()) .detail("Key", key) .detail("ReqVersion", ver) .detail("ReplySize", reply.value.present() ? reply.value.get().size() : -1);*/ } if (e.code() == error_code_wrong_shard_server || e.code() == error_code_all_alternatives_failed || (e.code() == error_code_transaction_too_old && ver == latestVersion)) { trState->cx->invalidateCache(locationInfo.tenantEntry.prefix, key); wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, trState->taskID)); } else if (e.code() == error_code_unknown_tenant) { ASSERT(useTenant && trState->tenant().present()); trState->cx->invalidateCachedTenant(trState->tenant().get()); wait(delay(CLIENT_KNOBS->UNKNOWN_TENANT_RETRY_DELAY, trState->taskID)); } else { if (trState->trLogInfo && recordLogInfo) trState->trLogInfo->addLog(FdbClientLogEvents::EventGetError(startTimeD, trState->cx->clientLocality.dcId(), static_cast(e.code()), key, trState->tenant())); throw e; } } } } ACTOR Future getKey(Reference trState, KeySelector k, Future version, UseTenant useTenant = UseTenant::True) { wait(success(version)); state Optional getKeyID = Optional(); state Span span("NAPI:getKey"_loc, trState->spanID); if (trState->debugID.present()) { getKeyID = nondeterministicRandom()->randomUniqueID(); g_traceBatch.addAttach("GetKeyAttachID", trState->debugID.get().first(), getKeyID.get().first()); g_traceBatch.addEvent( "GetKeyDebug", getKeyID.get().first(), "NativeAPI.getKey.AfterVersion"); //.detail("StartKey", // k.getKey()).detail("Offset",k.offset).detail("OrEqual",k.orEqual); } loop { if (k.getKey() == allKeys.end) { if (k.offset > 0) { return allKeys.end; } k.orEqual = false; } else if (k.getKey() == allKeys.begin && k.offset <= 0) { return Key(); } Key locationKey(k.getKey(), k.arena()); state KeyRangeLocationInfo locationInfo = wait(getKeyLocation(trState, locationKey, &StorageServerInterface::getKey, Reverse{ k.isBackward() }, useTenant, version.get())); state VersionVector ssLatestCommitVersions; trState->cx->getLatestCommitVersions(locationInfo.locations, version.get(), trState, 
ssLatestCommitVersions); try { if (getKeyID.present()) g_traceBatch.addEvent( "GetKeyDebug", getKeyID.get().first(), "NativeAPI.getKey.Before"); //.detail("StartKey", // k.getKey()).detail("Offset",k.offset).detail("OrEqual",k.orEqual); ++trState->cx->transactionPhysicalReads; GetKeyRequest req(span.context, useTenant ? trState->getTenantInfo() : TenantInfo(), k, version.get(), trState->cx->sampleReadTags() ? trState->options.readTags : Optional(), getKeyID, ssLatestCommitVersions); req.arena.dependsOn(k.arena()); state GetKeyReply reply; try { choose { when(wait(trState->cx->connectionFileChanged())) { throw transaction_too_old(); } when(GetKeyReply _reply = wait(loadBalance( trState->cx.getPtr(), locationInfo.locations, &StorageServerInterface::getKey, req, TaskPriority::DefaultPromiseEndpoint, AtMostOnce::False, trState->cx->enableLocalityLoadBalance ? &trState->cx->queueModel : nullptr))) { reply = _reply; } } ++trState->cx->transactionPhysicalReadsCompleted; } catch (Error&) { ++trState->cx->transactionPhysicalReadsCompleted; throw; } if (getKeyID.present()) g_traceBatch.addEvent("GetKeyDebug", getKeyID.get().first(), "NativeAPI.getKey.After"); //.detail("NextKey",reply.sel.key).detail("Offset", // reply.sel.offset).detail("OrEqual", k.orEqual); k = reply.sel; if (!k.offset && k.orEqual) { return k.getKey(); } } catch (Error& e) { if (getKeyID.present()) g_traceBatch.addEvent("GetKeyDebug", getKeyID.get().first(), "NativeAPI.getKey.Error"); if (e.code() == error_code_wrong_shard_server || e.code() == error_code_all_alternatives_failed) { trState->cx->invalidateCache(locationInfo.tenantEntry.prefix, k.getKey(), Reverse{ k.isBackward() }); wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, trState->taskID)); } else if (e.code() == error_code_unknown_tenant) { ASSERT(useTenant && trState->tenant().present()); trState->cx->invalidateCachedTenant(trState->tenant().get()); wait(delay(CLIENT_KNOBS->UNKNOWN_TENANT_RETRY_DELAY, trState->taskID)); } else { TraceEvent(SevInfo, "GetKeyError").error(e).detail("AtKey", k.getKey()).detail("Offset", k.offset); throw e; } } } } ACTOR Future waitForCommittedVersion(Database cx, Version version, SpanID spanContext) { state Span span("NAPI:waitForCommittedVersion"_loc, { spanContext }); try { loop { choose { when(wait(cx->onProxiesChanged())) {} when(GetReadVersionReply v = wait(basicLoadBalance( cx->getGrvProxies(UseProvisionalProxies::False), &GrvProxyInterface::getConsistentReadVersion, GetReadVersionRequest( span.context, 0, TransactionPriority::IMMEDIATE, cx->ssVersionVectorCache.getMaxVersion()), cx->taskID))) { cx->minAcceptableReadVersion = std::min(cx->minAcceptableReadVersion, v.version); if (v.midShardSize > 0) cx->smoothMidShardSize.setTotal(v.midShardSize); if (cx->isCurrentGrvProxy(v.proxyId)) { cx->ssVersionVectorCache.applyDelta(v.ssVersionVectorDelta); } else { cx->ssVersionVectorCache.clear(); } if (v.version >= version) return v.version; // SOMEDAY: Do the wait on the server side, possibly use less expensive source of committed version // (causal consistency is not needed for this purpose) wait(delay(CLIENT_KNOBS->FUTURE_VERSION_RETRY_DELAY, cx->taskID)); } } } } catch (Error& e) { TraceEvent(SevError, "WaitForCommittedVersionError").error(e); throw; } } ACTOR Future getRawVersion(Reference trState) { state Span span("NAPI:getRawVersion"_loc, { trState->spanID }); loop { choose { when(wait(trState->cx->onProxiesChanged())) {} when(GetReadVersionReply v = wait(basicLoadBalance(trState->cx->getGrvProxies(UseProvisionalProxies::False), 
&GrvProxyInterface::getConsistentReadVersion, GetReadVersionRequest(trState->spanID, 0, TransactionPriority::IMMEDIATE, trState->cx->ssVersionVectorCache.getMaxVersion()), trState->cx->taskID))) { if (trState->cx->isCurrentGrvProxy(v.proxyId)) { trState->cx->ssVersionVectorCache.applyDelta(v.ssVersionVectorDelta); } else { trState->cx->ssVersionVectorCache.clear(); } return v.version; } } } } ACTOR Future readVersionBatcher( DatabaseContext* cx, FutureStream, Optional>> versionStream, uint32_t flags); ACTOR Future watchValue(Database cx, Reference parameters) { state Span span("NAPI:watchValue"_loc, parameters->spanID); state Version ver = parameters->version; cx->validateVersion(parameters->version); ASSERT(parameters->version != latestVersion); loop { state KeyRangeLocationInfo locationInfo = wait(getKeyLocation(cx, parameters->tenant.name, parameters->key, &StorageServerInterface::watchValue, parameters->spanID, parameters->debugID, parameters->useProvisionalProxies, Reverse::False, parameters->version)); try { state Optional watchValueID = Optional(); if (parameters->debugID.present()) { watchValueID = nondeterministicRandom()->randomUniqueID(); g_traceBatch.addAttach( "WatchValueAttachID", parameters->debugID.get().first(), watchValueID.get().first()); g_traceBatch.addEvent("WatchValueDebug", watchValueID.get().first(), "NativeAPI.watchValue.Before"); //.detail("TaskID", g_network->getCurrentTask()); } state WatchValueReply resp; choose { when(WatchValueReply r = wait( loadBalance(cx.getPtr(), locationInfo.locations, &StorageServerInterface::watchValue, WatchValueRequest(span.context, parameters->tenant, parameters->key, parameters->value, ver, cx->sampleReadTags() ? parameters->tags : Optional(), watchValueID), TaskPriority::DefaultPromiseEndpoint))) { resp = r; } when(wait(cx->connectionRecord ? 
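// A database created without a connection record can never switch clusters, so the alternative
// branch below waits forever instead of watching for connection-file changes.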
cx->connectionRecord->onChange() : Never())) { wait(Never()); }
			}
			if (watchValueID.present()) {
				g_traceBatch.addEvent("WatchValueDebug", watchValueID.get().first(), "NativeAPI.watchValue.After");
			}

			// FIXME: wait for known committed version on the storage server before replying,
			// cannot do this until the storage server is notified on knownCommittedVersion changes from tlog (faster
			// than the current update loop)
			Version v = wait(waitForCommittedVersion(cx, resp.version, span.context));

			// This check fails if there was a master failure between getting the response and getting the committed
			// version; the threshold depends on SERVER_KNOBS->MAX_VERSIONS_IN_FLIGHT
			if (v - resp.version < 50000000) {
				return resp.version;
			}
			ver = v;
		} catch (Error& e) {
			if (e.code() == error_code_wrong_shard_server || e.code() == error_code_all_alternatives_failed) {
				cx->invalidateCache(locationInfo.tenantEntry.prefix, parameters->key);
				wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, parameters->taskID));
			} else if (e.code() == error_code_unknown_tenant) {
				ASSERT(parameters->tenant.name.present());
				cx->invalidateCachedTenant(parameters->tenant.name.get());
				wait(delay(CLIENT_KNOBS->UNKNOWN_TENANT_RETRY_DELAY, parameters->taskID));
			} else if (e.code() == error_code_watch_cancelled || e.code() == error_code_process_behind) {
				// clang-format off
				TEST(e.code() == error_code_watch_cancelled); // Too many watches on the storage server, poll for changes instead
				TEST(e.code() == error_code_process_behind); // The storage servers are all behind
				// clang-format on
				wait(delay(CLIENT_KNOBS->WATCH_POLLING_TIME, parameters->taskID));
			} else if (e.code() == error_code_timed_out) {
				// The storage server occasionally times out watches in case they were cancelled
				TEST(true); // A watch timed out
				wait(delay(CLIENT_KNOBS->FUTURE_VERSION_RETRY_DELAY, parameters->taskID));
			} else {
				state Error err = e;
				wait(delay(CLIENT_KNOBS->FUTURE_VERSION_RETRY_DELAY, parameters->taskID));
				throw err;
			}
		}
	}
}

ACTOR Future<Void> watchStorageServerResp(int64_t tenantId, Key key, Database cx) {
	loop {
		try {
			state Reference<WatchMetadata> metadata = cx->getWatchMetadata(tenantId, key);
			if (!metadata.isValid())
				return Void();

			Version watchVersion = wait(watchValue(cx, metadata->parameters));

			metadata = cx->getWatchMetadata(tenantId, key);
			if (!metadata.isValid())
				return Void();

			// case 1: version_1 (SS) >= version_2 (map)
			if (watchVersion >= metadata->parameters->version) {
				cx->deleteWatchMetadata(tenantId, key);
				if (metadata->watchPromise.canBeSet())
					metadata->watchPromise.send(watchVersion);
			}
			// ABA happens
			else {
				TEST(true); // ABA issue where the version returned from the server is less than the version in the map

				// case 2: version_1 < version_2 and future_count == 1
				if (metadata->watchPromise.getFutureReferenceCount() == 1) {
					cx->deleteWatchMetadata(tenantId, key);
				}
			}
		} catch (Error& e) {
			if (e.code() == error_code_operation_cancelled) {
				throw e;
			}

			Reference<WatchMetadata> metadata = cx->getWatchMetadata(tenantId, key);
			if (!metadata.isValid()) {
				return Void();
			} else if (metadata->watchPromise.getFutureReferenceCount() == 1) {
				cx->deleteWatchMetadata(tenantId, key);
				return Void();
			} else if (e.code() == error_code_future_version) {
				continue;
			}
			cx->deleteWatchMetadata(tenantId, key);
			metadata->watchPromise.sendError(e);
			throw e;
		}
	}
}

ACTOR Future<Void> sameVersionDiffValue(Database cx, Reference<WatchParameters> parameters) {
	state ReadYourWritesTransaction tr(cx, parameters->tenant.name);
	loop {
		try {
			if (!parameters->tenant.name.present()) {
				tr.setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
			}

			state Optional<Value> valSS
= wait(tr.get(parameters->key)); Reference metadata = cx->getWatchMetadata(parameters->tenant.tenantId, parameters->key); // val_3 != val_1 (storage server value doesn't match value in map) if (metadata.isValid() && valSS != metadata->parameters->value) { cx->deleteWatchMetadata(parameters->tenant.tenantId, parameters->key); metadata->watchPromise.send(parameters->version); metadata->watchFutureSS.cancel(); } // val_3 == val_2 (storage server value matches value passed into the function -> new watch) if (valSS == parameters->value && tr.getTransactionState()->tenantId == parameters->tenant.tenantId) { metadata = makeReference(parameters); cx->setWatchMetadata(metadata); metadata->watchFutureSS = watchStorageServerResp(parameters->tenant.tenantId, parameters->key, cx); } // if val_3 != val_2 if (valSS != parameters->value) return Void(); // val_3 == val_2 wait(success(metadata->watchPromise.getFuture())); return Void(); } catch (Error& e) { wait(tr.onError(e)); } } } Future getWatchFuture(Database cx, Reference parameters) { Reference metadata = cx->getWatchMetadata(parameters->tenant.tenantId, parameters->key); // case 1: key not in map if (!metadata.isValid()) { metadata = makeReference(parameters); cx->setWatchMetadata(metadata); metadata->watchFutureSS = watchStorageServerResp(parameters->tenant.tenantId, parameters->key, cx); return success(metadata->watchPromise.getFuture()); } // case 2: val_1 == val_2 (received watch with same value as key already in the map so just update) else if (metadata->parameters->value == parameters->value) { if (parameters->version > metadata->parameters->version) { metadata->parameters = parameters; } return success(metadata->watchPromise.getFuture()); } // case 3: val_1 != val_2 && version_2 > version_1 (received watch with different value and a higher version so // recreate in SS) else if (parameters->version > metadata->parameters->version) { TEST(true); // Setting a watch that has a different value than the one in the map but a higher version (newer) cx->deleteWatchMetadata(parameters->tenant.tenantId, parameters->key); metadata->watchPromise.send(parameters->version); metadata->watchFutureSS.cancel(); metadata = makeReference(parameters); cx->setWatchMetadata(metadata); metadata->watchFutureSS = watchStorageServerResp(parameters->tenant.tenantId, parameters->key, cx); return success(metadata->watchPromise.getFuture()); } // case 5: val_1 != val_2 && version_1 == version_2 (received watch with different value but same version) else if (metadata->parameters->version == parameters->version) { TEST(true); // Setting a watch which has a different value than the one in the map but the same version return sameVersionDiffValue(cx, parameters); } TEST(true); // Setting a watch which has a different value than the one in the map but a lower version (older) // case 4: val_1 != val_2 && version_2 < version_1 return Void(); } ACTOR Future watchValueMap(Future version, TenantInfo tenant, Key key, Optional value, Database cx, TagSet tags, SpanID spanID, TaskPriority taskID, Optional debugID, UseProvisionalProxies useProvisionalProxies) { state Version ver = wait(version); wait(getWatchFuture( cx, makeReference(tenant, key, value, ver, tags, spanID, taskID, debugID, useProvisionalProxies))); return Void(); } template void transformRangeLimits(GetRangeLimits limits, Reverse reverse, GetKeyValuesFamilyRequest& req) { if (limits.bytes != 0) { if (!limits.hasRowLimit()) req.limit = CLIENT_KNOBS->REPLY_BYTE_LIMIT; // Can't get more than this many rows anyway else 
req.limit = std::min(CLIENT_KNOBS->REPLY_BYTE_LIMIT, limits.rows); if (reverse) req.limit *= -1; if (!limits.hasByteLimit()) req.limitBytes = CLIENT_KNOBS->REPLY_BYTE_LIMIT; else req.limitBytes = std::min(CLIENT_KNOBS->REPLY_BYTE_LIMIT, limits.bytes); } else { req.limitBytes = CLIENT_KNOBS->REPLY_BYTE_LIMIT; req.limit = reverse ? -limits.minRows : limits.minRows; } } template PublicRequestStream StorageServerInterface::*getRangeRequestStream() { if constexpr (std::is_same::value) { return &StorageServerInterface::getKeyValues; } else if (std::is_same::value) { return &StorageServerInterface::getMappedKeyValues; } else { UNREACHABLE(); } } ACTOR template Future getExactRange(Reference trState, Version version, KeyRange keys, Key mapper, GetRangeLimits limits, Reverse reverse, UseTenant useTenant) { state RangeResultFamily output; state Span span("NAPI:getExactRange"_loc, trState->spanID); if (useTenant && trState->tenant().present()) { span.addTag("tenant"_sr, trState->tenant().get()); } // printf("getExactRange( '%s', '%s' )\n", keys.begin.toString().c_str(), keys.end.toString().c_str()); loop { state std::vector locations = wait(getKeyRangeLocations(trState, keys, CLIENT_KNOBS->GET_RANGE_SHARD_LIMIT, reverse, getRangeRequestStream(), useTenant, version)); ASSERT(locations.size()); state int shard = 0; loop { const KeyRangeRef& range = locations[shard].range; GetKeyValuesFamilyRequest req; req.mapper = mapper; req.arena.dependsOn(mapper.arena()); req.tenantInfo = useTenant ? trState->getTenantInfo() : TenantInfo(); req.version = version; req.begin = firstGreaterOrEqual(range.begin); req.end = firstGreaterOrEqual(range.end); req.spanContext = span.context; trState->cx->getLatestCommitVersions( locations[shard].locations, req.version, trState, req.ssLatestCommitVersions); // keep shard's arena around in case of async tss comparison req.arena.dependsOn(locations[shard].range.arena()); transformRangeLimits(limits, reverse, req); ASSERT(req.limitBytes > 0 && req.limit != 0 && req.limit < 0 == reverse); // FIXME: buggify byte limits on internal functions that use them, instead of globally req.tags = trState->cx->sampleReadTags() ? trState->options.readTags : Optional(); req.debugID = trState->debugID; try { if (trState->debugID.present()) { g_traceBatch.addEvent( "TransactionDebug", trState->debugID.get().first(), "NativeAPI.getExactRange.Before"); /*TraceEvent("TransactionDebugGetExactRangeInfo", trState->debugID.get()) .detail("ReqBeginKey", req.begin.getKey()) .detail("ReqEndKey", req.end.getKey()) .detail("ReqLimit", req.limit) .detail("ReqLimitBytes", req.limitBytes) .detail("ReqVersion", req.version) .detail("Reverse", reverse) .detail("Servers", locations[shard].second->description());*/ } ++trState->cx->transactionPhysicalReads; state GetKeyValuesFamilyReply rep; try { choose { when(wait(trState->cx->connectionFileChanged())) { throw transaction_too_old(); } when(GetKeyValuesFamilyReply _rep = wait(loadBalance( trState->cx.getPtr(), locations[shard].locations, getRangeRequestStream(), req, TaskPriority::DefaultPromiseEndpoint, AtMostOnce::False, trState->cx->enableLocalityLoadBalance ? 
&trState->cx->queueModel : nullptr))) { rep = _rep; } } ++trState->cx->transactionPhysicalReadsCompleted; } catch (Error&) { ++trState->cx->transactionPhysicalReadsCompleted; throw; } if (trState->debugID.present()) g_traceBatch.addEvent( "TransactionDebug", trState->debugID.get().first(), "NativeAPI.getExactRange.After"); output.arena().dependsOn(rep.arena); output.append(output.arena(), rep.data.begin(), rep.data.size()); if (limits.hasRowLimit() && rep.data.size() > limits.rows) { TraceEvent(SevError, "GetExactRangeTooManyRows") .detail("RowLimit", limits.rows) .detail("DeliveredRows", output.size()); ASSERT(false); } limits.decrement(rep.data); if (limits.isReached()) { output.more = true; return output; } bool more = rep.more; // If the reply says there is more but we know that we finished the shard, then fix rep.more if (reverse && more && rep.data.size() > 0 && output[output.size() - 1].key == locations[shard].range.begin) more = false; if (more) { if (!rep.data.size()) { TraceEvent(SevError, "GetExactRangeError") .detail("Reason", "More data indicated but no rows present") .detail("LimitBytes", limits.bytes) .detail("LimitRows", limits.rows) .detail("OutputSize", output.size()) .detail("OutputBytes", output.expectedSize()) .detail("BlockSize", rep.data.size()) .detail("BlockBytes", rep.data.expectedSize()); ASSERT(false); } TEST(true); // GetKeyValuesFamilyReply.more in getExactRange // Make next request to the same shard with a beginning key just after the last key returned if (reverse) locations[shard].range = KeyRangeRef(locations[shard].range.begin, output[output.size() - 1].key); else locations[shard].range = KeyRangeRef(keyAfter(output[output.size() - 1].key), locations[shard].range.end); } if (!more || locations[shard].range.empty()) { TEST(true); // getExactrange (!more || locations[shard].first.empty()) if (shard == locations.size() - 1) { const KeyRangeRef& range = locations[shard].range; KeyRef begin = reverse ? keys.begin : range.end; KeyRef end = reverse ? range.begin : keys.end; if (begin >= end) { output.more = false; return output; } TEST(true); // Multiple requests of key locations keys = KeyRangeRef(begin, end); break; } ++shard; } // Soft byte limit - return results early if the user specified a byte limit and we got results // This can prevent problems where the desired range spans many shards and would be too slow to // fetch entirely. 
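// hasSatisfiedMinRows() becomes true once a byte limit is in effect and the caller's minimum row
// count has been counted down to zero (as tracked by GetRangeLimits::decrement), so returning a
// partial result here is acceptable.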
if (limits.hasSatisfiedMinRows() && output.size() > 0) { output.more = true; return output; } } catch (Error& e) { if (e.code() == error_code_wrong_shard_server || e.code() == error_code_all_alternatives_failed) { const KeyRangeRef& range = locations[shard].range; if (reverse) keys = KeyRangeRef(keys.begin, range.end); else keys = KeyRangeRef(range.begin, keys.end); trState->cx->invalidateCache(locations[0].tenantEntry.prefix, keys); wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, trState->taskID)); break; } else if (e.code() == error_code_unknown_tenant) { ASSERT(useTenant && trState->tenant().present()); trState->cx->invalidateCachedTenant(trState->tenant().get()); wait(delay(CLIENT_KNOBS->UNKNOWN_TENANT_RETRY_DELAY, trState->taskID)); break; } else { TraceEvent(SevInfo, "GetExactRangeError") .error(e) .detail("Tenant", trState->tenant()) .detail("ShardBegin", locations[shard].range.begin) .detail("ShardEnd", locations[shard].range.end); throw; } } } } } Future resolveKey(Reference trState, KeySelector const& key, Version const& version, UseTenant useTenant) { if (key.isFirstGreaterOrEqual()) return Future(key.getKey()); if (key.isFirstGreaterThan()) return Future(keyAfter(key.getKey())); return getKey(trState, key, version, useTenant); } ACTOR template Future getRangeFallback(Reference trState, Version version, KeySelector begin, KeySelector end, Key mapper, GetRangeLimits limits, Reverse reverse, UseTenant useTenant) { if (version == latestVersion) { state Transaction transaction(trState->cx); transaction.setOption(FDBTransactionOptions::CAUSAL_READ_RISKY); transaction.setOption(FDBTransactionOptions::LOCK_AWARE); transaction.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); Version ver = wait(transaction.getReadVersion()); version = ver; } Future fb = resolveKey(trState, begin, version, useTenant); state Future fe = resolveKey(trState, end, version, useTenant); state Key b = wait(fb); state Key e = wait(fe); if (b >= e) { return RangeResultFamily(); } // if e is allKeys.end, we have read through the end of the database/tenant // if b is allKeys.begin, we have either read through the beginning of the database/tenant, // or allKeys.begin exists in the database/tenant and will be part of the conflict range anyways RangeResultFamily _r = wait(getExactRange( trState, version, KeyRangeRef(b, e), mapper, limits, reverse, useTenant)); RangeResultFamily r = _r; if (b == allKeys.begin && ((reverse && !r.more) || !reverse)) r.readToBegin = true; // TODO: this currently causes us to have a conflict range that is too large if our end key resolves to the // key after the last key in the database. In that case, we don't need a conflict between the last key and // the end of the database. // // If fixed, the ConflictRange test can be updated to stop checking for this condition. 
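// readToBegin / readThroughEnd record whether the scan provably covered the extreme ends of the
// keyspace; getRangeFinished uses them to widen the conflict range to allKeys.begin / allKeys.end.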
if (e == allKeys.end && ((!reverse && !r.more) || reverse)) r.readThroughEnd = true; ASSERT(!limits.hasRowLimit() || r.size() <= limits.rows); // If we were limiting bytes and the returned range is twice the request (plus 10K) log a warning if (limits.hasByteLimit() && r.expectedSize() > size_t(limits.bytes + CLIENT_KNOBS->SYSTEM_KEY_SIZE_LIMIT + CLIENT_KNOBS->VALUE_SIZE_LIMIT + 1) && limits.minRows == 0) { TraceEvent(SevWarnAlways, "GetRangeFallbackTooMuchData") .detail("LimitBytes", limits.bytes) .detail("DeliveredBytes", r.expectedSize()) .detail("LimitRows", limits.rows) .detail("DeliveredRows", r.size()); } return r; } int64_t inline getRangeResultFamilyBytes(RangeResultRef result) { return result.expectedSize(); } int64_t inline getRangeResultFamilyBytes(MappedRangeResultRef result) { int64_t bytes = 0; for (const MappedKeyValueRef& mappedKeyValue : result) { bytes += mappedKeyValue.key.size() + mappedKeyValue.value.size(); auto& reqAndResult = mappedKeyValue.reqAndResult; if (std::holds_alternative(reqAndResult)) { auto getValue = std::get(reqAndResult); bytes += getValue.expectedSize(); } else if (std::holds_alternative(reqAndResult)) { auto getRange = std::get(reqAndResult); bytes += getRange.result.expectedSize(); } else { throw internal_error(); } } return bytes; } // TODO: Client should add mapped keys to conflict ranges. template // RangeResult or MappedRangeResult void getRangeFinished(Reference trState, double startTime, KeySelector begin, KeySelector end, Snapshot snapshot, Promise> conflictRange, Reverse reverse, RangeResultFamily result) { int64_t bytes = getRangeResultFamilyBytes(result); trState->cx->transactionBytesRead += bytes; trState->cx->transactionKeysRead += result.size(); if (trState->trLogInfo) { trState->trLogInfo->addLog(FdbClientLogEvents::EventGetRange(startTime, trState->cx->clientLocality.dcId(), now() - startTime, bytes, begin.getKey(), end.getKey(), trState->tenant())); } if (!snapshot) { Key rangeBegin; Key rangeEnd; if (result.readToBegin) { rangeBegin = allKeys.begin; } else if (((!reverse || !result.more || begin.offset > 1) && begin.offset > 0) || result.size() == 0) { rangeBegin = Key(begin.getKey(), begin.arena()); } else { rangeBegin = reverse ? result.end()[-1].key : result[0].key; } if (end.offset > begin.offset && end.getKey() < rangeBegin) { rangeBegin = Key(end.getKey(), end.arena()); } if (result.readThroughEnd) { rangeEnd = allKeys.end; } else if (((reverse || !result.more || end.offset <= 0) && end.offset <= 1) || result.size() == 0) { rangeEnd = Key(end.getKey(), end.arena()); } else { rangeEnd = keyAfter(reverse ? 
result[0].key : result.end()[-1].key); } if (begin.offset < end.offset && begin.getKey() > rangeEnd) { rangeEnd = Key(begin.getKey(), begin.arena()); } conflictRange.send(std::make_pair(rangeBegin, rangeEnd)); } } ACTOR template Future getRange(Reference trState, Future fVersion, KeySelector begin, KeySelector end, Key mapper, GetRangeLimits limits, Promise> conflictRange, Snapshot snapshot, Reverse reverse, UseTenant useTenant = UseTenant::True) { // state using RangeResultRefFamily = typename RangeResultFamily::RefType; state GetRangeLimits originalLimits(limits); state KeySelector originalBegin = begin; state KeySelector originalEnd = end; state RangeResultFamily output; state Span span("NAPI:getRange"_loc, trState->spanID); if (useTenant && trState->tenant().present()) { span.addTag("tenant"_sr, trState->tenant().get()); } try { state Version version = wait(fVersion); trState->cx->validateVersion(version); state double startTime = now(); state Version readVersion = version; // Needed for latestVersion requests; if more, make future requests at the // version that the first one completed // FIXME: Is this really right? Weaken this and see if there is a problem; // if so maybe there is a much subtler problem even with this. if (begin.getKey() == allKeys.begin && begin.offset < 1) { output.readToBegin = true; begin = KeySelector(firstGreaterOrEqual(begin.getKey()), begin.arena()); } ASSERT(!limits.isReached()); ASSERT((!limits.hasRowLimit() || limits.rows >= limits.minRows) && limits.minRows >= 0); loop { if (end.getKey() == allKeys.begin && (end.offset < 1 || end.isFirstGreaterOrEqual())) { getRangeFinished( trState, startTime, originalBegin, originalEnd, snapshot, conflictRange, reverse, output); return output; } Key locationKey = reverse ? Key(end.getKey(), end.arena()) : Key(begin.getKey(), begin.arena()); Reverse locationBackward{ reverse ? (end - 1).isBackward() : begin.isBackward() }; state KeyRangeLocationInfo beginServer = wait(getKeyLocation(trState, locationKey, getRangeRequestStream(), locationBackward, useTenant, version)); state KeyRange shard = beginServer.range; state bool modifiedSelectors = false; state GetKeyValuesFamilyRequest req; req.mapper = mapper; req.arena.dependsOn(mapper.arena()); req.tenantInfo = useTenant ? trState->getTenantInfo() : TenantInfo(); req.isFetchKeys = (trState->taskID == TaskPriority::FetchKeys); req.version = readVersion; trState->cx->getLatestCommitVersions( beginServer.locations, req.version, trState, req.ssLatestCommitVersions); // In case of async tss comparison, also make req arena depend on begin, end, and/or shard's arena depending // on which is used bool dependOnShard = false; if (reverse && (begin - 1).isDefinitelyLess(shard.begin) && (!begin.isFirstGreaterOrEqual() || begin.getKey() != shard.begin)) { // In this case we would be setting modifiedSelectors to true, but // not modifying anything req.begin = firstGreaterOrEqual(shard.begin); modifiedSelectors = true; req.arena.dependsOn(shard.arena()); dependOnShard = true; } else { req.begin = begin; req.arena.dependsOn(begin.arena()); } if (!reverse && end.isDefinitelyGreater(shard.end)) { req.end = firstGreaterOrEqual(shard.end); modifiedSelectors = true; if (!dependOnShard) { req.arena.dependsOn(shard.arena()); } } else { req.end = end; req.arena.dependsOn(end.arena()); } transformRangeLimits(limits, reverse, req); ASSERT(req.limitBytes > 0 && req.limit != 0 && req.limit < 0 == reverse); req.tags = trState->cx->sampleReadTags() ? 
trState->options.readTags : Optional<TagSet>();
			req.debugID = trState->debugID;
			req.spanContext = span.context;
			try {
				if (trState->debugID.present()) {
					g_traceBatch.addEvent(
					    "TransactionDebug", trState->debugID.get().first(), "NativeAPI.getRange.Before");
					/*TraceEvent("TransactionDebugGetRangeInfo", trState->debugID.get())
					    .detail("ReqBeginKey", req.begin.getKey())
					    .detail("ReqEndKey", req.end.getKey())
					    .detail("OriginalBegin", originalBegin.toString())
					    .detail("OriginalEnd", originalEnd.toString())
					    .detail("Begin", begin.toString())
					    .detail("End", end.toString())
					    .detail("Shard", shard)
					    .detail("ReqLimit", req.limit)
					    .detail("ReqLimitBytes", req.limitBytes)
					    .detail("ReqVersion", req.version)
					    .detail("Reverse", reverse)
					    .detail("ModifiedSelectors", modifiedSelectors)
					    .detail("Servers", beginServer.second->description());*/
				}
				++trState->cx->transactionPhysicalReads;
				state GetKeyValuesFamilyReply rep;
				try {
					if (CLIENT_BUGGIFY_WITH_PROB(.01)) {
						throw deterministicRandom()->randomChoice(
						    std::vector<Error>{ transaction_too_old(), future_version() });
					}
					// state AnnotateActor annotation(currentLineage);
					GetKeyValuesFamilyReply _rep =
					    wait(loadBalance(trState->cx.getPtr(),
					                     beginServer.locations,
					                     getRangeRequestStream<GetKeyValuesFamilyRequest>(),
					                     req,
					                     TaskPriority::DefaultPromiseEndpoint,
					                     AtMostOnce::False,
					                     trState->cx->enableLocalityLoadBalance ? &trState->cx->queueModel : nullptr));
					rep = _rep;
					++trState->cx->transactionPhysicalReadsCompleted;
				} catch (Error&) {
					++trState->cx->transactionPhysicalReadsCompleted;
					throw;
				}

				if (trState->debugID.present()) {
					g_traceBatch.addEvent("TransactionDebug",
					                      trState->debugID.get().first(),
					                      "NativeAPI.getRange.After"); //.detail("SizeOf", rep.data.size());
					/*TraceEvent("TransactionDebugGetRangeDone", trState->debugID.get())
					    .detail("ReqBeginKey", req.begin.getKey())
					    .detail("ReqEndKey", req.end.getKey())
					    .detail("RepIsMore", rep.more)
					    .detail("VersionReturned", rep.version)
					    .detail("RowsReturned", rep.data.size());*/
				}

				ASSERT(!rep.more || rep.data.size());
				ASSERT(!limits.hasRowLimit() || rep.data.size() <= limits.rows);

				limits.decrement(rep.data);

				if (reverse && begin.isLastLessOrEqual() && rep.data.size() &&
				    rep.data.end()[-1].key == begin.getKey()) {
					modifiedSelectors = false;
				}

				bool finished = limits.isReached() || (!modifiedSelectors && !rep.more) || limits.hasSatisfiedMinRows();
				bool readThrough = modifiedSelectors && !rep.more;

				// optimization: first request got all data--just return it
				if (finished && !output.size()) {
					bool readToBegin = output.readToBegin;
					bool readThroughEnd = output.readThroughEnd;

					using RangeResultRefFamily = typename RangeResultFamily::RefType;
					output = RangeResultFamily(
					    RangeResultRefFamily(rep.data, modifiedSelectors || limits.isReached() || rep.more), rep.arena);
					output.readToBegin = readToBegin;
					output.readThroughEnd = readThroughEnd;

					if (BUGGIFY && limits.hasByteLimit() && output.size() > std::max(1, originalLimits.minRows)) {
						// Copy instead of resizing because TSS may be using output's arena for comparison. This only
						// happens in simulation so it's fine
						RangeResultFamily copy;
						int newSize =
						    deterministicRandom()->randomInt(std::max(1, originalLimits.minRows), output.size());
						for (int i = 0; i < newSize; i++) {
							copy.push_back_deep(copy.arena(), output[i]);
						}
						output = copy;
						output.more = true;

						getRangeFinished(
						    trState, startTime, originalBegin, originalEnd, snapshot, conflictRange, reverse, output);
						return output;
					}

					if (readThrough) {
						output.arena().dependsOn(shard.arena());
						output.readThrough = reverse ?
shard.begin : shard.end; } getRangeFinished( trState, startTime, originalBegin, originalEnd, snapshot, conflictRange, reverse, output); return output; } output.arena().dependsOn(rep.arena); output.append(output.arena(), rep.data.begin(), rep.data.size()); if (finished) { if (readThrough) { output.arena().dependsOn(shard.arena()); output.readThrough = reverse ? shard.begin : shard.end; } output.more = modifiedSelectors || limits.isReached() || rep.more; getRangeFinished( trState, startTime, originalBegin, originalEnd, snapshot, conflictRange, reverse, output); return output; } if (readVersion == latestVersion) { readVersion = rep.version; // see above comment } if (!rep.more) { ASSERT(modifiedSelectors); TEST(true); // !GetKeyValuesFamilyReply.more and modifiedSelectors in getRange if (!rep.data.size()) { // VERSION_VECTOR change version to readVersion in getRangeFallback RangeResultFamily result = wait( getRangeFallback( trState, version, originalBegin, originalEnd, mapper, originalLimits, reverse, useTenant)); getRangeFinished( trState, startTime, originalBegin, originalEnd, snapshot, conflictRange, reverse, result); return result; } if (reverse) end = firstGreaterOrEqual(shard.begin); else begin = firstGreaterOrEqual(shard.end); } else { TEST(true); // GetKeyValuesFamilyReply.more in getRange if (reverse) end = firstGreaterOrEqual(output[output.size() - 1].key); else begin = firstGreaterThan(output[output.size() - 1].key); } } catch (Error& e) { if (trState->debugID.present()) { g_traceBatch.addEvent( "TransactionDebug", trState->debugID.get().first(), "NativeAPI.getRange.Error"); TraceEvent("TransactionDebugError", trState->debugID.get()).error(e); } if (e.code() == error_code_wrong_shard_server || e.code() == error_code_all_alternatives_failed || (e.code() == error_code_transaction_too_old && readVersion == latestVersion)) { trState->cx->invalidateCache(beginServer.tenantEntry.prefix, reverse ? end.getKey() : begin.getKey(), Reverse{ reverse ? (end - 1).isBackward() : begin.isBackward() }); if (e.code() == error_code_wrong_shard_server) { RangeResultFamily result = wait( getRangeFallback( trState, version, originalBegin, originalEnd, mapper, originalLimits, reverse, useTenant)); getRangeFinished( trState, startTime, originalBegin, originalEnd, snapshot, conflictRange, reverse, result); return result; } wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, trState->taskID)); } else if (e.code() == error_code_unknown_tenant) { ASSERT(useTenant && trState->tenant().present()); trState->cx->invalidateCachedTenant(trState->tenant().get()); wait(delay(CLIENT_KNOBS->UNKNOWN_TENANT_RETRY_DELAY, trState->taskID)); } else { if (trState->trLogInfo) trState->trLogInfo->addLog( FdbClientLogEvents::EventGetRangeError(startTime, trState->cx->clientLocality.dcId(), static_cast(e.code()), begin.getKey(), end.getKey(), trState->tenant())); throw e; } } } } catch (Error& e) { if (conflictRange.canBeSet()) { conflictRange.send(std::make_pair(Key(), Key())); } throw; } } template struct TSSDuplicateStreamData { PromiseStream stream; Promise tssComparisonDone; // empty constructor for optional? 
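// (This struct pairs a copy of the storage server's replies, which the consumer must forward into
// `stream`, with a promise that is fulfilled when the TSS comparison finishes, so the duplicate
// stream is torn down exactly once.)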
TSSDuplicateStreamData() {} TSSDuplicateStreamData(PromiseStream stream) : stream(stream) {} bool done() { return tssComparisonDone.getFuture().isReady(); } void setDone() { if (tssComparisonDone.canBeSet()) { tssComparisonDone.send(Void()); } } ~TSSDuplicateStreamData() {} }; // Error tracking here is weird, and latency doesn't really mean the same thing here as it does with normal tss // comparisons, so this is pretty much just counting mismatches ACTOR template static Future tssStreamComparison(Request request, TSSDuplicateStreamData streamData, ReplyPromiseStream tssReplyStream, TSSEndpointData tssData) { state bool ssEndOfStream = false; state bool tssEndOfStream = false; state Optional ssReply = Optional(); state Optional tssReply = Optional(); loop { // reset replies ssReply = Optional(); tssReply = Optional(); state double startTime = now(); // wait for ss response try { REPLYSTREAM_TYPE(Request) _ssReply = waitNext(streamData.stream.getFuture()); ssReply = _ssReply; } catch (Error& e) { if (e.code() == error_code_actor_cancelled) { streamData.setDone(); throw; } if (e.code() == error_code_end_of_stream) { // ss response will be set to empty, to compare to the SS response if it wasn't empty and cause a // mismatch ssEndOfStream = true; } else { tssData.metrics->ssError(e.code()); } TEST(e.code() != error_code_end_of_stream); // SS got error in TSS stream comparison } state double sleepTime = std::max(startTime + FLOW_KNOBS->LOAD_BALANCE_TSS_TIMEOUT - now(), 0.0); // wait for tss response try { choose { when(REPLYSTREAM_TYPE(Request) _tssReply = waitNext(tssReplyStream.getFuture())) { tssReply = _tssReply; } when(wait(delay(sleepTime))) { ++tssData.metrics->tssTimeouts; TEST(true); // Got TSS timeout in stream comparison } } } catch (Error& e) { if (e.code() == error_code_actor_cancelled) { streamData.setDone(); throw; } if (e.code() == error_code_end_of_stream) { // tss response will be set to empty, to compare to the SS response if it wasn't empty and cause a // mismatch tssEndOfStream = true; } else { tssData.metrics->tssError(e.code()); } TEST(e.code() != error_code_end_of_stream); // TSS got error in TSS stream comparison } if (!ssEndOfStream || !tssEndOfStream) { ++tssData.metrics->streamComparisons; } // if both are successful, compare if (ssReply.present() && tssReply.present()) { // compare results // FIXME: this code is pretty much identical to LoadBalance.h // TODO could add team check logic in if we added synchronous way to turn this into a fixed getRange request // and send it to the whole team and compare? I think it's fine to skip that for streaming though TEST(ssEndOfStream != tssEndOfStream); // SS or TSS stream finished early! // skip tss comparison if both are end of stream if ((!ssEndOfStream || !tssEndOfStream) && !TSS_doCompare(ssReply.get(), tssReply.get())) { TEST(true); // TSS mismatch in stream comparison TraceEvent mismatchEvent( (g_network->isSimulated() && g_simulator.tssMode == ISimulator::TSSMode::EnabledDropMutations) ? 
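// In EnabledDropMutations mode the simulator deliberately lets the TSS diverge, so a mismatch is
// expected there and is logged as a warning rather than an error.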
SevWarnAlways : SevError, TSS_mismatchTraceName(request)); mismatchEvent.setMaxEventLength(FLOW_KNOBS->TSS_LARGE_TRACE_SIZE); mismatchEvent.detail("TSSID", tssData.tssId); if (tssData.metrics->shouldRecordDetailedMismatch()) { TSS_traceMismatch(mismatchEvent, request, ssReply.get(), tssReply.get()); TEST(FLOW_KNOBS ->LOAD_BALANCE_TSS_MISMATCH_TRACE_FULL); // Tracing Full TSS Mismatch in stream comparison TEST(!FLOW_KNOBS->LOAD_BALANCE_TSS_MISMATCH_TRACE_FULL); // Tracing Partial TSS Mismatch in stream // comparison and storing the rest in FDB if (!FLOW_KNOBS->LOAD_BALANCE_TSS_MISMATCH_TRACE_FULL) { mismatchEvent.disable(); UID mismatchUID = deterministicRandom()->randomUniqueID(); tssData.metrics->recordDetailedMismatchData(mismatchUID, mismatchEvent.getFields().toString()); // record a summarized trace event instead TraceEvent summaryEvent((g_network->isSimulated() && g_simulator.tssMode == ISimulator::TSSMode::EnabledDropMutations) ? SevWarnAlways : SevError, TSS_mismatchTraceName(request)); summaryEvent.detail("TSSID", tssData.tssId).detail("MismatchId", mismatchUID); } } else { // don't record trace event mismatchEvent.disable(); } streamData.setDone(); return Void(); } } if (!ssReply.present() || !tssReply.present() || ssEndOfStream || tssEndOfStream) { // if both streams don't still have more data, stop comparison streamData.setDone(); return Void(); } } } // Currently only used for GetKeyValuesStream but could easily be plugged for other stream types // User of the stream has to forward the SS's responses to the returned promise stream, if it is set template Optional> maybeDuplicateTSSStreamFragment(Request& req, QueueModel* model, RequestStream const* ssStream) { if (model) { Optional tssData = model->getTssData(ssStream->getEndpoint().token.first()); if (tssData.present()) { TEST(true); // duplicating stream to TSS resetReply(req); // FIXME: optimize to avoid creating new netNotifiedQueueWithAcknowledgements for each stream duplication RequestStream tssRequestStream(tssData.get().endpoint); ReplyPromiseStream tssReplyStream = tssRequestStream.getReplyStream(req); PromiseStream ssDuplicateReplyStream; TSSDuplicateStreamData streamData(ssDuplicateReplyStream); model->addActor.send(tssStreamComparison(req, streamData, tssReplyStream, tssData.get())); return Optional>(streamData); } } return Optional>(); } // Streams all of the KV pairs in a target key range into a ParallelStream fragment ACTOR Future getRangeStreamFragment(Reference trState, ParallelStream::Fragment* results, Version version, KeyRange keys, GetRangeLimits limits, Snapshot snapshot, Reverse reverse, SpanID spanContext) { loop { state std::vector locations = wait(getKeyRangeLocations(trState, keys, CLIENT_KNOBS->GET_RANGE_SHARD_LIMIT, reverse, &StorageServerInterface::getKeyValuesStream, UseTenant::True, version)); ASSERT(locations.size()); state int shard = 0; loop { const KeyRange& range = locations[shard].range; state Optional> tssDuplicateStream; state GetKeyValuesStreamRequest req; req.tenantInfo = trState->getTenantInfo(); req.version = version; req.begin = firstGreaterOrEqual(range.begin); req.end = firstGreaterOrEqual(range.end); req.spanContext = spanContext; req.limit = reverse ? 
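// A negative row limit tells the storage server to scan the shard in reverse.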
-CLIENT_KNOBS->REPLY_BYTE_LIMIT : CLIENT_KNOBS->REPLY_BYTE_LIMIT; req.limitBytes = std::numeric_limits::max(); trState->cx->getLatestCommitVersions( locations[shard].locations, req.version, trState, req.ssLatestCommitVersions); // keep shard's arena around in case of async tss comparison req.arena.dependsOn(range.arena()); ASSERT(req.limitBytes > 0 && req.limit != 0 && req.limit < 0 == reverse); // FIXME: buggify byte limits on internal functions that use them, instead of globally req.tags = trState->cx->sampleReadTags() ? trState->options.readTags : Optional(); req.debugID = trState->debugID; try { if (trState->debugID.present()) { g_traceBatch.addEvent( "TransactionDebug", trState->debugID.get().first(), "NativeAPI.RangeStream.Before"); } ++trState->cx->transactionPhysicalReads; state GetKeyValuesStreamReply rep; if (locations[shard].locations->size() == 0) { wait(trState->cx->connectionFileChanged()); results->sendError(transaction_too_old()); return Void(); } state int useIdx = -1; loop { // FIXME: create a load balance function for this code so future users of reply streams do not have // to duplicate this code int count = 0; for (int i = 0; i < locations[shard].locations->size(); i++) { if (!IFailureMonitor::failureMonitor() .getState(locations[shard] .locations->get(i, &StorageServerInterface::getKeyValuesStream) .getEndpoint()) .failed) { if (deterministicRandom()->random01() <= 1.0 / ++count) { useIdx = i; } } } if (useIdx >= 0) { break; } std::vector> ok(locations[shard].locations->size()); for (int i = 0; i < ok.size(); i++) { ok[i] = IFailureMonitor::failureMonitor().onStateEqual( locations[shard] .locations->get(i, &StorageServerInterface::getKeyValuesStream) .getEndpoint(), FailureStatus(false)); } // Making this SevWarn means a lot of clutter if (now() - g_network->networkInfo.newestAlternativesFailure > 1 || deterministicRandom()->random01() < 0.01) { TraceEvent("AllAlternativesFailed") .detail("Alternatives", locations[shard].locations->description()); } wait(allAlternativesFailedDelay(quorum(ok, 1))); } state ReplyPromiseStream replyStream = locations[shard] .locations->get(useIdx, &StorageServerInterface::getKeyValuesStream) .getReplyStream(req); tssDuplicateStream = maybeDuplicateTSSStreamFragment( req, trState->cx->enableLocalityLoadBalance ? 
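// The queue model owns the SS-to-TSS mapping; when it is unavailable, maybeDuplicateTSSStreamFragment
// returns an empty Optional and no duplicate stream is created.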
&trState->cx->queueModel : nullptr, &locations[shard].locations->get(useIdx, &StorageServerInterface::getKeyValuesStream)); state bool breakAgain = false; loop { wait(results->onEmpty()); try { choose { when(wait(trState->cx->connectionFileChanged())) { results->sendError(transaction_too_old()); if (tssDuplicateStream.present() && !tssDuplicateStream.get().done()) { tssDuplicateStream.get().stream.sendError(transaction_too_old()); } return Void(); } when(GetKeyValuesStreamReply _rep = waitNext(replyStream.getFuture())) { rep = _rep; } } ++trState->cx->transactionPhysicalReadsCompleted; } catch (Error& e) { ++trState->cx->transactionPhysicalReadsCompleted; if (e.code() == error_code_broken_promise) { if (tssDuplicateStream.present() && !tssDuplicateStream.get().done()) { tssDuplicateStream.get().stream.sendError(connection_failed()); } throw connection_failed(); } if (e.code() != error_code_end_of_stream) { if (tssDuplicateStream.present() && !tssDuplicateStream.get().done()) { tssDuplicateStream.get().stream.sendError(e); } throw; } rep = GetKeyValuesStreamReply(); } if (trState->debugID.present()) g_traceBatch.addEvent( "TransactionDebug", trState->debugID.get().first(), "NativeAPI.getExactRange.After"); RangeResult output(RangeResultRef(rep.data, rep.more), rep.arena); if (tssDuplicateStream.present() && !tssDuplicateStream.get().done()) { // shallow copy the reply with an arena depends, and send it to the duplicate stream for TSS GetKeyValuesStreamReply replyCopy; replyCopy.version = rep.version; replyCopy.more = rep.more; replyCopy.cached = rep.cached; replyCopy.arena.dependsOn(rep.arena); replyCopy.data.append(replyCopy.arena, rep.data.begin(), rep.data.size()); tssDuplicateStream.get().stream.send(replyCopy); } int64_t bytes = 0; for (const KeyValueRef& kv : output) { bytes += kv.key.size() + kv.value.size(); } trState->cx->transactionBytesRead += bytes; trState->cx->transactionKeysRead += output.size(); // If the reply says there is more but we know that we finished the shard, then fix rep.more if (reverse && output.more && rep.data.size() > 0 && output[output.size() - 1].key == locations[shard].range.begin) { output.more = false; } if (output.more) { if (!rep.data.size()) { TraceEvent(SevError, "GetRangeStreamError") .detail("Reason", "More data indicated but no rows present") .detail("LimitBytes", limits.bytes) .detail("LimitRows", limits.rows) .detail("OutputSize", output.size()) .detail("OutputBytes", output.expectedSize()) .detail("BlockSize", rep.data.size()) .detail("BlockBytes", rep.data.expectedSize()); ASSERT(false); } TEST(true); // GetKeyValuesStreamReply.more in getRangeStream // Make next request to the same shard with a beginning key just after the last key returned if (reverse) locations[shard].range = KeyRangeRef(locations[shard].range.begin, output[output.size() - 1].key); else locations[shard].range = KeyRangeRef(keyAfter(output[output.size() - 1].key), locations[shard].range.end); } if (locations[shard].range.empty()) { output.more = false; } if (!output.more) { const KeyRange& range = locations[shard].range; if (shard == locations.size() - 1) { KeyRef begin = reverse ? keys.begin : range.end; KeyRef end = reverse ? range.begin : keys.end; if (begin >= end) { if (range.begin == allKeys.begin) { output.readToBegin = true; } if (range.end == allKeys.end) { output.readThroughEnd = true; } output.arena().dependsOn(keys.arena()); output.readThrough = reverse ? 
keys.begin : keys.end;
							results->send(std::move(output));
							results->finish();
							if (tssDuplicateStream.present() && !tssDuplicateStream.get().done()) {
								tssDuplicateStream.get().stream.sendError(end_of_stream());
							}
							return Void();
						}
						keys = KeyRangeRef(begin, end);
						breakAgain = true;
					} else {
						++shard;
					}
					output.arena().dependsOn(range.arena());
					output.readThrough = reverse ? range.begin : range.end;
					results->send(std::move(output));
					break;
				}

				ASSERT(output.size());
				if (keys.begin == allKeys.begin && !reverse) {
					output.readToBegin = true;
				}
				if (keys.end == allKeys.end && reverse) {
					output.readThroughEnd = true;
				}
				results->send(std::move(output));
			}
			if (breakAgain) {
				break;
			}
		} catch (Error& e) {
			// send errors to the TSS duplicate stream, including actor_cancelled
			if (tssDuplicateStream.present() && !tssDuplicateStream.get().done()) {
				tssDuplicateStream.get().stream.sendError(e);
			}
			if (e.code() == error_code_actor_cancelled) {
				throw;
			}
			if (e.code() == error_code_wrong_shard_server || e.code() == error_code_all_alternatives_failed ||
			    e.code() == error_code_connection_failed) {
				const KeyRangeRef& range = locations[shard].range;

				if (reverse)
					keys = KeyRangeRef(keys.begin, range.end);
				else
					keys = KeyRangeRef(range.begin, keys.end);

				trState->cx->invalidateCache(locations[0].tenantEntry.prefix, keys);

				wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, trState->taskID));
				break;
			} else if (e.code() == error_code_unknown_tenant) {
				ASSERT(trState->tenant().present());
				trState->cx->invalidateCachedTenant(trState->tenant().get());
				wait(delay(CLIENT_KNOBS->UNKNOWN_TENANT_RETRY_DELAY, trState->taskID));
				break;
			} else {
				results->sendError(e);
				return Void();
			}
		}
	}
}
}

ACTOR Future<Standalone<VectorRef<KeyRef>>> getRangeSplitPoints(Reference<TransactionState> trState,
                                                                KeyRange keys,
                                                                int64_t chunkSize,
                                                                Version version);

static KeyRange intersect(KeyRangeRef lhs, KeyRangeRef rhs) {
	return KeyRange(KeyRangeRef(std::max(lhs.begin, rhs.begin), std::min(lhs.end, rhs.end)));
}

// Divides the requested key range into 1MB fragments, creates a range stream for each fragment, and merges the
// results so the client gets them in order
ACTOR Future<Void> getRangeStream(Reference<TransactionState> trState,
                                  PromiseStream<RangeResult> _results,
                                  Future<Version> fVersion,
                                  KeySelector begin,
                                  KeySelector end,
                                  GetRangeLimits limits,
                                  Promise<std::pair<Key, Key>> conflictRange,
                                  Snapshot snapshot,
                                  Reverse reverse) {
	state ParallelStream<RangeResult> results(_results, CLIENT_KNOBS->RANGESTREAM_BUFFERED_FRAGMENTS_LIMIT);

	// FIXME: better handling to disable row limits
	ASSERT(!limits.hasRowLimit());
	state Span span("NAPI:getRangeStream"_loc, trState->spanID);

	state Version version = wait(fVersion);
	trState->cx->validateVersion(version);

	Future<Key> fb = resolveKey(trState, begin, version, UseTenant::True);
	state Future<Key> fe = resolveKey(trState, end, version, UseTenant::True);

	state Key b = wait(fb);
	state Key e = wait(fe);

	if (!snapshot) {
		// FIXME: this conflict range is too large, and should be updated continuously as results are returned
		conflictRange.send(std::make_pair(std::min(b, Key(begin.getKey(), begin.arena())),
		                                  std::max(e, Key(end.getKey(), end.arena()))));
	}

	if (b >= e) {
		wait(results.finish());
		return Void();
	}

	// if e is allKeys.end, we have read through the end of the database
	// if b is allKeys.begin, we have either read through the beginning of the database,
	// or allKeys.begin exists in the database and will be part of the conflict range anyways

	state std::vector<Future<Void>> outstandingRequests;
	while (b < e) {
		state KeyRangeLocationInfo locationInfo = wait(getKeyLocation(trState, reverse ?
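// In a reverse scan, locate the shard containing the end of the unread range; forward scans resolve
// the beginning instead.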
e : b, &StorageServerInterface::getKeyValuesStream, reverse, UseTenant::True, version)); state KeyRange shardIntersection = intersect(locationInfo.range, KeyRangeRef(b, e)); state Standalone> splitPoints = wait(getRangeSplitPoints(trState, shardIntersection, CLIENT_KNOBS->RANGESTREAM_FRAGMENT_SIZE, version)); state std::vector toSend; // state std::vector::iterator>> outstandingRequests; if (!splitPoints.empty()) { toSend.push_back(KeyRange(KeyRangeRef(shardIntersection.begin, splitPoints.front()), splitPoints.arena())); for (int i = 0; i < splitPoints.size() - 1; ++i) { toSend.push_back(KeyRange(KeyRangeRef(splitPoints[i], splitPoints[i + 1]), splitPoints.arena())); } toSend.push_back(KeyRange(KeyRangeRef(splitPoints.back(), shardIntersection.end), splitPoints.arena())); } else { toSend.push_back(KeyRange(KeyRangeRef(shardIntersection.begin, shardIntersection.end))); } state int idx = 0; state int useIdx = 0; for (; idx < toSend.size(); ++idx) { useIdx = reverse ? toSend.size() - idx - 1 : idx; if (toSend[useIdx].empty()) { continue; } ParallelStream::Fragment* fragment = wait(results.createFragment()); outstandingRequests.push_back(getRangeStreamFragment( trState, fragment, version, toSend[useIdx], limits, snapshot, reverse, span.context)); } if (reverse) { e = shardIntersection.begin; } else { b = shardIntersection.end; } } wait(waitForAll(outstandingRequests) && results.finish()); return Void(); } Future getRange(Reference const& trState, Future const& fVersion, KeySelector const& begin, KeySelector const& end, GetRangeLimits const& limits, Reverse const& reverse, UseTenant const& useTenant) { return getRange(trState, fVersion, begin, end, ""_sr, limits, Promise>(), Snapshot::True, reverse, useTenant); } bool DatabaseContext::debugUseTags = false; const std::vector DatabaseContext::debugTransactionTagChoices = { "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t" }; void debugAddTags(Reference trState) { int numTags = deterministicRandom()->randomInt(0, CLIENT_KNOBS->MAX_TAGS_PER_TRANSACTION + 1); for (int i = 0; i < numTags; ++i) { TransactionTag tag; if (deterministicRandom()->random01() < 0.7) { tag = TransactionTagRef(deterministicRandom()->randomChoice(DatabaseContext::debugTransactionTagChoices)); } else { int length = deterministicRandom()->randomInt(1, CLIENT_KNOBS->MAX_TRANSACTION_TAG_LENGTH + 1); uint8_t* s = new (tag.arena()) uint8_t[length]; for (int j = 0; j < length; ++j) { s[j] = (uint8_t)deterministicRandom()->randomInt(0, 256); } tag.contents() = TransactionTagRef(s, length); } if (deterministicRandom()->coinflip()) { trState->options.readTags.addTag(tag); } trState->options.tags.addTag(tag); } } Transaction::Transaction() : trState(makeReference(TaskPriority::DefaultEndpoint, generateSpanID(false))) {} Transaction::Transaction(Database const& cx, Optional const& tenant) : trState(makeReference(cx, tenant, cx->taskID, generateSpanID(cx->transactionTracingSample), createTrLogInfoProbabilistically(cx))), span(trState->spanID, "Transaction"_loc), backoff(CLIENT_KNOBS->DEFAULT_BACKOFF), tr(trState->spanID) { if (DatabaseContext::debugUseTags) { debugAddTags(trState); } } Transaction::~Transaction() { flushTrLogsIfEnabled(); cancelWatches(); } void Transaction::operator=(Transaction&& r) noexcept { flushTrLogsIfEnabled(); tr = std::move(r.tr); readVersion = std::move(r.readVersion); trState = std::move(r.trState); metadataVersion = std::move(r.metadataVersion); extraConflictRanges = std::move(r.extraConflictRanges); 
commitResult = std::move(r.commitResult); committing = std::move(r.committing); backoff = r.backoff; watches = r.watches; } void Transaction::flushTrLogsIfEnabled() { if (trState && trState->trLogInfo && trState->trLogInfo->logsAdded && trState->trLogInfo->trLogWriter.getData()) { ASSERT(trState->trLogInfo->flushed == false); trState->cx->clientStatusUpdater.inStatusQ.push_back( { trState->trLogInfo->identifier, std::move(trState->trLogInfo->trLogWriter) }); trState->trLogInfo->flushed = true; } } VersionVector Transaction::getVersionVector() const { return trState->cx->ssVersionVectorCache; } void Transaction::setVersion(Version v) { trState->startTime = now(); if (readVersion.isValid()) throw read_version_already_set(); if (v <= 0) throw version_invalid(); readVersion = v; trState->readVersionObtainedFromGrvProxy = false; } Future> Transaction::get(const Key& key, Snapshot snapshot) { ++trState->cx->transactionLogicalReads; ++trState->cx->transactionGetValueRequests; // ASSERT (key < allKeys.end); // There are no keys in the database with size greater than KEY_SIZE_LIMIT if (key.size() > (key.startsWith(systemKeys.begin) ? CLIENT_KNOBS->SYSTEM_KEY_SIZE_LIMIT : CLIENT_KNOBS->KEY_SIZE_LIMIT)) return Optional(); auto ver = getReadVersion(); /* if (!systemKeys.contains(key)) return Optional(Value()); */ if (!snapshot) tr.transaction.read_conflict_ranges.push_back(tr.arena, singleKeyRange(key, tr.arena)); UseTenant useTenant = UseTenant::True; if (key == metadataVersionKey) { // It is legal to read the metadata version key inside of a tenant. // This will return the global metadata version key. useTenant = UseTenant::False; ++trState->cx->transactionMetadataVersionReads; if (!ver.isReady() || metadataVersion.isSet()) { return metadataVersion.getFuture(); } else { if (ver.isError()) return ver.getError(); if (ver.get() == trState->cx->metadataVersionCache[trState->cx->mvCacheInsertLocation].first) { return trState->cx->metadataVersionCache[trState->cx->mvCacheInsertLocation].second; } Version v = ver.get(); int hi = trState->cx->mvCacheInsertLocation; int lo = (trState->cx->mvCacheInsertLocation + 1) % trState->cx->metadataVersionCache.size(); while (hi != lo) { int cu = hi > lo ? (hi + lo) / 2 : ((hi + trState->cx->metadataVersionCache.size() + lo) / 2) % trState->cx->metadataVersionCache.size(); if (v == trState->cx->metadataVersionCache[cu].first) { return trState->cx->metadataVersionCache[cu].second; } if (cu == lo) { break; } if (v < trState->cx->metadataVersionCache[cu].first) { hi = cu; } else { lo = (cu + 1) % trState->cx->metadataVersionCache.size(); } } } } return getValue(trState, key, ver, useTenant); } void Watch::setWatch(Future watchFuture) { this->watchFuture = watchFuture; // Cause the watch loop to go around and start waiting on watchFuture onSetWatchTrigger.send(Void()); } ACTOR Future getTenantMetadata(Reference trState, Key key, Version version) { KeyRangeLocationInfo locationInfo = wait(getKeyLocation(trState, key, &StorageServerInterface::getValue, Reverse::False, UseTenant::True, version)); return trState->getTenantInfo(); } Future populateAndGetTenant(Reference trState, Key const& key, Version version) { if (!trState->tenant().present() || key == metadataVersionKey) { return TenantInfo(); } else if (trState->tenantId != TenantInfo::INVALID_TENANT) { return trState->getTenantInfo(); } else { return getTenantMetadata(trState, key, version); } } // FIXME: This seems pretty horrible. Now a Database can't die until all of its watches do... 
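// Typical client-side watch flow (illustrative sketch only, not part of this file's API surface):
//
//   state Transaction tr(db);
//   state Future<Void> watchFut = tr.watch(makeReference<Watch>(key));
//   wait(tr.commit()); // a successful commit arms the watch (see onSetWatchTrigger below)
//   wait(watchFut);    // completes once the watched key's value changes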
ACTOR Future watch(Reference watch, Database cx, Future tenant, TagSet tags, SpanID spanID, TaskPriority taskID, Optional debugID, UseProvisionalProxies useProvisionalProxies) { try { choose { // RYOW write to value that is being watched (if applicable) // Errors when(wait(watch->onChangeTrigger.getFuture())) {} // NativeAPI finished commit and updated watchFuture when(wait(watch->onSetWatchTrigger.getFuture())) { state TenantInfo tenantInfo = wait(tenant); loop { choose { // NativeAPI watchValue future finishes or errors when(wait(watch->watchFuture)) { break; } when(wait(cx->connectionFileChanged())) { TEST(true); // Recreated a watch after switch cx->clearWatchMetadata(); watch->watchFuture = watchValueMap(cx->minAcceptableReadVersion, tenantInfo, watch->key, watch->value, cx, tags, spanID, taskID, debugID, useProvisionalProxies); } } } } } } catch (Error& e) { cx->removeWatch(); throw; } cx->removeWatch(); return Void(); } Future Transaction::getRawReadVersion() { return ::getRawVersion(trState); } Future Transaction::watch(Reference watch) { ++trState->cx->transactionWatchRequests; trState->cx->addWatch(); watches.push_back(watch); return ::watch( watch, trState->cx, populateAndGetTenant( trState, watch->key, readVersion.isValid() && readVersion.isReady() ? readVersion.get() : latestVersion), trState->options.readTags, trState->spanID, trState->taskID, trState->debugID, trState->useProvisionalProxies); } ACTOR Future>> getAddressesForKeyActor(Reference trState, Future ver, Key key) { state std::vector ssi; state Key resolvedKey = key; if (trState->tenant().present()) { state Version version = wait(ver); KeyRangeLocationInfo locationInfo = wait(getKeyLocation( trState, ""_sr, &StorageServerInterface::getValue, Reverse::False, UseTenant::True, version)); resolvedKey = key.withPrefix(locationInfo.tenantEntry.prefix); } // If key >= allKeys.end, then getRange will return a kv-pair with an empty value. This will result in our // serverInterfaces vector being empty, which will cause us to return an empty addresses list. state Key ksKey = keyServersKey(resolvedKey); state RangeResult serverTagResult = wait(getRange(trState, ver, lastLessOrEqual(serverTagKeys.begin), firstGreaterThan(serverTagKeys.end), GetRangeLimits(CLIENT_KNOBS->TOO_MANY), Reverse::False, UseTenant::False)); ASSERT(!serverTagResult.more && serverTagResult.size() < CLIENT_KNOBS->TOO_MANY); Future futureServerUids = getRange(trState, ver, lastLessOrEqual(ksKey), firstGreaterThan(ksKey), GetRangeLimits(1), Reverse::False, UseTenant::False); RangeResult serverUids = wait(futureServerUids); ASSERT(serverUids.size()); // every shard needs to have a team std::vector src; std::vector ignore; // 'ignore' is so named because it is the vector into which we decode the 'dest' servers in // the case where this key is being relocated. But 'src' is the canonical location until // the move is finished, because it could be cancelled at any time. decodeKeyServersValue(serverTagResult, serverUids[0].value, src, ignore); Optional> serverInterfaces = wait(transactionalGetServerInterfaces(trState, ver, src)); ASSERT(serverInterfaces.present()); // since this is happening transactionally, /FF/keyServers and /FF/serverList // need to be consistent with one another ssi = serverInterfaces.get(); Standalone> addresses; for (auto i : ssi) { std::string ipString = trState->options.includePort ? 
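// includePort is controlled by the INCLUDE_PORT_IN_ADDRESS transaction option.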
		                           i.address().toString() : i.address().ip.toString();
		char* c_string = new (addresses.arena()) char[ipString.length() + 1];
		strcpy(c_string, ipString.c_str());
		addresses.push_back(addresses.arena(), c_string);
	}
	return addresses;
}

Future<Standalone<VectorRef<const char*>>> Transaction::getAddressesForKey(const Key& key) {
	++trState->cx->transactionLogicalReads;
	++trState->cx->transactionGetAddressesForKeyRequests;
	auto ver = getReadVersion();

	return getAddressesForKeyActor(trState, ver, key);
}

ACTOR Future<Key> getKeyAndConflictRange(Reference<TransactionState> trState,
                                         KeySelector k,
                                         Future<Version> version,
                                         Promise<std::pair<Key, Key>> conflictRange) {
	try {
		Key rep = wait(getKey(trState, k, version));
		if (k.offset <= 0)
			conflictRange.send(std::make_pair(rep, k.orEqual ? keyAfter(k.getKey()) : Key(k.getKey(), k.arena())));
		else
			conflictRange.send(
			    std::make_pair(k.orEqual ? keyAfter(k.getKey()) : Key(k.getKey(), k.arena()), keyAfter(rep)));
		return rep;
	} catch (Error& e) {
		conflictRange.send(std::make_pair(Key(), Key()));
		throw;
	}
}

Future<Key> Transaction::getKey(const KeySelector& key, Snapshot snapshot) {
	++trState->cx->transactionLogicalReads;
	++trState->cx->transactionGetKeyRequests;
	if (snapshot)
		return ::getKey(trState, key, getReadVersion());

	Promise<std::pair<Key, Key>> conflictRange;
	extraConflictRanges.push_back(conflictRange.getFuture());
	return getKeyAndConflictRange(trState, key, getReadVersion(), conflictRange);
}

template <class GetKeyValuesFamilyRequest>
void increaseCounterForRequest(Database cx) {
	if constexpr (std::is_same<GetKeyValuesFamilyRequest, GetKeyValuesRequest>::value) {
		++cx->transactionGetRangeRequests;
	} else if (std::is_same<GetKeyValuesFamilyRequest, GetMappedKeyValuesRequest>::value) {
		++cx->transactionGetMappedRangeRequests;
	} else {
		UNREACHABLE();
	}
}

template <class GetKeyValuesFamilyRequest, class GetKeyValuesFamilyReply, class RangeResultFamily>
Future<RangeResultFamily> Transaction::getRangeInternal(const KeySelector& begin,
                                                        const KeySelector& end,
                                                        const Key& mapper,
                                                        GetRangeLimits limits,
                                                        Snapshot snapshot,
                                                        Reverse reverse) {
	++trState->cx->transactionLogicalReads;
	increaseCounterForRequest<GetKeyValuesFamilyRequest>(trState->cx);

	if (limits.isReached())
		return RangeResultFamily();

	if (!limits.isValid())
		return range_limits_invalid();

	ASSERT(limits.rows != 0);

	KeySelector b = begin;
	if (b.orEqual) {
		TEST(true); // Native begin orEqual==true
		b.removeOrEqual(b.arena());
	}

	KeySelector e = end;
	if (e.orEqual) {
		TEST(true); // Native end orEqual==true
		e.removeOrEqual(e.arena());
	}

	if (b.offset >= e.offset && b.getKey() >= e.getKey()) {
		TEST(true); // Native range inverted
		return RangeResultFamily();
	}

	if (!snapshot && !std::is_same_v<GetKeyValuesFamilyRequest, GetKeyValuesRequest>) {
		// Currently, NativeAPI does not support serialization for getMappedRange. You should consider using
		// ReadYourWrites APIs, which wrap around NativeAPI and provide serialization for getMappedRange. (Even if
		// you don't want RYW, you may use ReadYourWrites APIs with RYW disabled.)
		throw unsupported_operation();
	}
	Promise<std::pair<Key, Key>> conflictRange;
	if (!snapshot) {
		extraConflictRanges.push_back(conflictRange.getFuture());
	}

	return ::getRange<GetKeyValuesFamilyRequest, GetKeyValuesFamilyReply, RangeResultFamily>(
	    trState, getReadVersion(), b, e, mapper, limits, conflictRange, snapshot, reverse);
}

Future<RangeResult> Transaction::getRange(const KeySelector& begin,
                                          const KeySelector& end,
                                          GetRangeLimits limits,
                                          Snapshot snapshot,
                                          Reverse reverse) {
	return getRangeInternal<GetKeyValuesRequest, GetKeyValuesReply, RangeResult>(
	    begin, end, ""_sr, limits, snapshot, reverse);
}

Future<MappedRangeResult> Transaction::getMappedRange(const KeySelector& begin,
                                                      const KeySelector& end,
                                                      const Key& mapper,
                                                      GetRangeLimits limits,
                                                      Snapshot snapshot,
                                                      Reverse reverse) {
	return getRangeInternal<GetMappedKeyValuesRequest, GetMappedKeyValuesReply, MappedRangeResult>(
	    begin, end, mapper, limits, snapshot, reverse);
}

Future<RangeResult> Transaction::getRange(const KeySelector& begin,
                                          const KeySelector& end,
                                          int limit,
                                          Snapshot snapshot,
                                          Reverse reverse) {
	return getRange(begin, end, GetRangeLimits(limit), snapshot, reverse);
}

// A method for streaming data from the storage server that is more efficient than getRange when reading large amounts
// of data
Future<Void> Transaction::getRangeStream(const PromiseStream<Standalone<RangeResultRef>>& results,
                                         const KeySelector& begin,
                                         const KeySelector& end,
                                         GetRangeLimits limits,
                                         Snapshot snapshot,
                                         Reverse reverse) {
	++trState->cx->transactionLogicalReads;
	++trState->cx->transactionGetRangeStreamRequests;

	// FIXME: limits are not implemented yet, and this code has not been tested with reverse=true
	ASSERT(!limits.hasByteLimit() && !limits.hasRowLimit() && !reverse);

	KeySelector b = begin;
	if (b.orEqual) {
		TEST(true); // Native stream begin orEqual==true
		b.removeOrEqual(b.arena());
	}

	KeySelector e = end;
	if (e.orEqual) {
		TEST(true); // Native stream end orEqual==true
		e.removeOrEqual(e.arena());
	}

	if (b.offset >= e.offset && b.getKey() >= e.getKey()) {
		TEST(true); // Native stream range inverted
		results.sendError(end_of_stream());
		return Void();
	}

	Promise<std::pair<Key, Key>> conflictRange;
	if (!snapshot) {
		extraConflictRanges.push_back(conflictRange.getFuture());
	}

	return forwardErrors(
	    ::getRangeStream(trState, results, getReadVersion(), b, e, limits, conflictRange, snapshot, reverse), results);
}

Future<Void> Transaction::getRangeStream(const PromiseStream<Standalone<RangeResultRef>>& results,
                                         const KeySelector& begin,
                                         const KeySelector& end,
                                         int limit,
                                         Snapshot snapshot,
                                         Reverse reverse) {
	return getRangeStream(results, begin, end, GetRangeLimits(limit), snapshot, reverse);
}

void Transaction::addReadConflictRange(KeyRangeRef const& keys) {
	ASSERT(!keys.empty());

	// There aren't any keys in the database with size larger than KEY_SIZE_LIMIT, so if range contains large keys
	// we can translate it to an equivalent one with smaller keys
	KeyRef begin = keys.begin;
	KeyRef end = keys.end;

	if (begin.size() >
	    (begin.startsWith(systemKeys.begin) ? CLIENT_KNOBS->SYSTEM_KEY_SIZE_LIMIT : CLIENT_KNOBS->KEY_SIZE_LIMIT))
		begin = begin.substr(
		    0,
		    (begin.startsWith(systemKeys.begin) ? CLIENT_KNOBS->SYSTEM_KEY_SIZE_LIMIT : CLIENT_KNOBS->KEY_SIZE_LIMIT) +
		        1);
	if (end.size() >
	    (end.startsWith(systemKeys.begin) ? CLIENT_KNOBS->SYSTEM_KEY_SIZE_LIMIT : CLIENT_KNOBS->KEY_SIZE_LIMIT))
		end = end.substr(0,
		                 (end.startsWith(systemKeys.begin) ?
CLIENT_KNOBS->SYSTEM_KEY_SIZE_LIMIT : CLIENT_KNOBS->KEY_SIZE_LIMIT) + 1); KeyRangeRef r = KeyRangeRef(begin, end); if (r.empty()) { return; } tr.transaction.read_conflict_ranges.push_back_deep(tr.arena, r); } void Transaction::makeSelfConflicting() { BinaryWriter wr(Unversioned()); wr.serializeBytes(LiteralStringRef("\xFF/SC/")); wr << deterministicRandom()->randomUniqueID(); auto r = singleKeyRange(wr.toValue(), tr.arena); tr.transaction.read_conflict_ranges.push_back(tr.arena, r); tr.transaction.write_conflict_ranges.push_back(tr.arena, r); } void Transaction::set(const KeyRef& key, const ValueRef& value, AddConflictRange addConflictRange) { ++trState->cx->transactionSetMutations; if (key.size() > (key.startsWith(systemKeys.begin) ? CLIENT_KNOBS->SYSTEM_KEY_SIZE_LIMIT : CLIENT_KNOBS->KEY_SIZE_LIMIT)) throw key_too_large(); if (value.size() > CLIENT_KNOBS->VALUE_SIZE_LIMIT) throw value_too_large(); auto& req = tr; auto& t = req.transaction; auto r = singleKeyRange(key, req.arena); auto v = ValueRef(req.arena, value); t.mutations.emplace_back(req.arena, MutationRef::SetValue, r.begin, v); if (addConflictRange) { t.write_conflict_ranges.push_back(req.arena, r); } } void Transaction::atomicOp(const KeyRef& key, const ValueRef& operand, MutationRef::Type operationType, AddConflictRange addConflictRange) { ++trState->cx->transactionAtomicMutations; if (key.size() > (key.startsWith(systemKeys.begin) ? CLIENT_KNOBS->SYSTEM_KEY_SIZE_LIMIT : CLIENT_KNOBS->KEY_SIZE_LIMIT)) throw key_too_large(); if (operand.size() > CLIENT_KNOBS->VALUE_SIZE_LIMIT) throw value_too_large(); if (apiVersionAtLeast(510)) { if (operationType == MutationRef::Min) operationType = MutationRef::MinV2; else if (operationType == MutationRef::And) operationType = MutationRef::AndV2; } auto& req = tr; auto& t = req.transaction; auto r = singleKeyRange(key, req.arena); auto v = ValueRef(req.arena, operand); t.mutations.emplace_back(req.arena, operationType, r.begin, v); if (addConflictRange && operationType != MutationRef::SetVersionstampedKey) t.write_conflict_ranges.push_back(req.arena, r); TEST(true); // NativeAPI atomic operation } void Transaction::clear(const KeyRangeRef& range, AddConflictRange addConflictRange) { ++trState->cx->transactionClearMutations; auto& req = tr; auto& t = req.transaction; KeyRef begin = range.begin; KeyRef end = range.end; // There aren't any keys in the database with size larger than KEY_SIZE_LIMIT, so if range contains large keys // we can translate it to an equivalent one with smaller keys if (begin.size() > (begin.startsWith(systemKeys.begin) ? CLIENT_KNOBS->SYSTEM_KEY_SIZE_LIMIT : CLIENT_KNOBS->KEY_SIZE_LIMIT)) begin = begin.substr( 0, (begin.startsWith(systemKeys.begin) ? CLIENT_KNOBS->SYSTEM_KEY_SIZE_LIMIT : CLIENT_KNOBS->KEY_SIZE_LIMIT) + 1); if (end.size() > (end.startsWith(systemKeys.begin) ? CLIENT_KNOBS->SYSTEM_KEY_SIZE_LIMIT : CLIENT_KNOBS->KEY_SIZE_LIMIT)) end = end.substr( 0, (end.startsWith(systemKeys.begin) ? CLIENT_KNOBS->SYSTEM_KEY_SIZE_LIMIT : CLIENT_KNOBS->KEY_SIZE_LIMIT) + 1); auto r = KeyRangeRef(req.arena, KeyRangeRef(begin, end)); if (r.empty()) return; t.mutations.emplace_back(req.arena, MutationRef::ClearRange, r.begin, r.end); if (addConflictRange) t.write_conflict_ranges.push_back(req.arena, r); } void Transaction::clear(const KeyRef& key, AddConflictRange addConflictRange) { ++trState->cx->transactionClearMutations; // There aren't any keys in the database with size larger than KEY_SIZE_LIMIT if (key.size() > (key.startsWith(systemKeys.begin) ? 
CLIENT_KNOBS->SYSTEM_KEY_SIZE_LIMIT : CLIENT_KNOBS->KEY_SIZE_LIMIT)) return; auto& req = tr; auto& t = req.transaction; // efficient single key range clear range mutation, see singleKeyRange uint8_t* data = new (req.arena) uint8_t[key.size() + 1]; memcpy(data, key.begin(), key.size()); data[key.size()] = 0; t.mutations.emplace_back( req.arena, MutationRef::ClearRange, KeyRef(data, key.size()), KeyRef(data, key.size() + 1)); if (addConflictRange) t.write_conflict_ranges.emplace_back(req.arena, KeyRef(data, key.size()), KeyRef(data, key.size() + 1)); } void Transaction::addWriteConflictRange(const KeyRangeRef& keys) { ASSERT(!keys.empty()); auto& req = tr; auto& t = req.transaction; // There aren't any keys in the database with size larger than KEY_SIZE_LIMIT, so if range contains large keys // we can translate it to an equivalent one with smaller keys KeyRef begin = keys.begin; KeyRef end = keys.end; if (begin.size() > (begin.startsWith(systemKeys.begin) ? CLIENT_KNOBS->SYSTEM_KEY_SIZE_LIMIT : CLIENT_KNOBS->KEY_SIZE_LIMIT)) begin = begin.substr( 0, (begin.startsWith(systemKeys.begin) ? CLIENT_KNOBS->SYSTEM_KEY_SIZE_LIMIT : CLIENT_KNOBS->KEY_SIZE_LIMIT) + 1); if (end.size() > (end.startsWith(systemKeys.begin) ? CLIENT_KNOBS->SYSTEM_KEY_SIZE_LIMIT : CLIENT_KNOBS->KEY_SIZE_LIMIT)) end = end.substr( 0, (end.startsWith(systemKeys.begin) ? CLIENT_KNOBS->SYSTEM_KEY_SIZE_LIMIT : CLIENT_KNOBS->KEY_SIZE_LIMIT) + 1); KeyRangeRef r = KeyRangeRef(begin, end); if (r.empty()) { return; } t.write_conflict_ranges.push_back_deep(req.arena, r); } double Transaction::getBackoff(int errCode) { double returnedBackoff = backoff; if (errCode == error_code_tag_throttled) { auto priorityItr = trState->cx->throttledTags.find(trState->options.priority); for (auto& tag : trState->options.tags) { if (priorityItr != trState->cx->throttledTags.end()) { auto tagItr = priorityItr->second.find(tag); if (tagItr != priorityItr->second.end()) { TEST(true); // Returning throttle backoff returnedBackoff = std::max( returnedBackoff, std::min(CLIENT_KNOBS->TAG_THROTTLE_RECHECK_INTERVAL, tagItr->second.throttleDuration())); if (returnedBackoff == CLIENT_KNOBS->TAG_THROTTLE_RECHECK_INTERVAL) { break; } } } } } returnedBackoff *= deterministicRandom()->random01(); // Set backoff for next time if (errCode == error_code_proxy_memory_limit_exceeded) { backoff = std::min(backoff * CLIENT_KNOBS->BACKOFF_GROWTH_RATE, CLIENT_KNOBS->RESOURCE_CONSTRAINED_MAX_BACKOFF); } else { backoff = std::min(backoff * CLIENT_KNOBS->BACKOFF_GROWTH_RATE, trState->options.maxBackoff); } return returnedBackoff; } TransactionOptions::TransactionOptions(Database const& cx) { reset(cx); if (BUGGIFY) { commitOnFirstProxy = true; } } void TransactionOptions::clear() { maxBackoff = CLIENT_KNOBS->DEFAULT_MAX_BACKOFF; getReadVersionFlags = 0; sizeLimit = CLIENT_KNOBS->TRANSACTION_SIZE_LIMIT; maxTransactionLoggingFieldLength = 0; checkWritesEnabled = false; causalWriteRisky = false; commitOnFirstProxy = false; debugDump = false; lockAware = false; readOnly = false; firstInBatch = false; includePort = false; reportConflictingKeys = false; tags = TagSet{}; readTags = TagSet{}; priority = TransactionPriority::DEFAULT; expensiveClearCostEstimation = false; useGrvCache = false; skipGrvCache = false; rawAccess = false; } TransactionOptions::TransactionOptions() { clear(); } void TransactionOptions::reset(Database const& cx) { clear(); lockAware = cx->lockAware; if (cx->apiVersionAtLeast(630)) { includePort = true; } } void Transaction::resetImpl(bool generateNewSpan) { 
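	// Flushes any enabled transaction logs and rebuilds the per-transaction state. reset()
	// calls this with generateNewSpan = false so that retries of one logical transaction stay
	// under a single trace span, while fullReset() passes true and additionally restores the
	// default span and backoff (see the two callers just below).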
	flushTrLogsIfEnabled();
	trState = trState->cloneAndReset(createTrLogInfoProbabilistically(trState->cx), generateNewSpan);
	tr = CommitTransactionRequest(trState->spanID);
	readVersion = Future<Version>();
	metadataVersion = Promise<Optional<Key>>();
	extraConflictRanges.clear();
	commitResult = Promise<Void>();
	committing = Future<Void>();
	cancelWatches();
}

void Transaction::reset() {
	resetImpl(false);
}

void Transaction::fullReset() {
	resetImpl(true);
	span = Span(trState->spanID, "Transaction"_loc);
	backoff = CLIENT_KNOBS->DEFAULT_BACKOFF;
}

int Transaction::apiVersionAtLeast(int minVersion) const {
	return trState->cx->apiVersionAtLeast(minVersion);
}

class MutationBlock {
public:
	bool mutated;
	bool cleared;
	ValueRef setValue;

	MutationBlock() : mutated(false) {}
	MutationBlock(bool _cleared) : mutated(true), cleared(_cleared) {}
	MutationBlock(ValueRef value) : mutated(true), cleared(false), setValue(value) {}
};

bool compareBegin(KeyRangeRef lhs, KeyRangeRef rhs) {
	return lhs.begin < rhs.begin;
}

// If there is any intersection between the two given sets of ranges, returns a range that
// falls within the intersection
Optional<KeyRangeRef> intersects(VectorRef<KeyRangeRef> lhs, VectorRef<KeyRangeRef> rhs) {
	if (lhs.size() && rhs.size()) {
		std::sort(lhs.begin(), lhs.end(), compareBegin);
		std::sort(rhs.begin(), rhs.end(), compareBegin);

		int l = 0, r = 0;
		while (l < lhs.size() && r < rhs.size()) {
			if (lhs[l].end <= rhs[r].begin)
				l++;
			else if (rhs[r].end <= lhs[l].begin)
				r++;
			else
				return lhs[l] & rhs[r];
		}
	}

	return Optional<KeyRangeRef>();
}

ACTOR void checkWrites(Reference<TransactionState> trState,
                       Future<Void> committed,
                       Promise<Void> outCommitted,
                       CommitTransactionRequest req) {
	state Version version;
	try {
		wait(committed);
		// If the commit is successful, by definition the transaction still exists for now. Grab the version, and don't
		// use it again.
		version = trState->committedVersion;
		outCommitted.send(Void());
	} catch (Error& e) {
		outCommitted.sendError(e);
		return;
	}

	wait(delay(deterministicRandom()->random01())); // delay between 0 and 1 seconds

	state KeyRangeMap<MutationBlock> expectedValues;

	auto& mutations = req.transaction.mutations;
	state int mCount = mutations.size(); // debugging info for traceEvent

	for (int idx = 0; idx < mutations.size(); idx++) {
		if (mutations[idx].type == MutationRef::SetValue)
			expectedValues.insert(singleKeyRange(mutations[idx].param1), MutationBlock(mutations[idx].param2));
		else if (mutations[idx].type == MutationRef::ClearRange)
			expectedValues.insert(KeyRangeRef(mutations[idx].param1, mutations[idx].param2), MutationBlock(true));
	}

	try {
		state Transaction tr(trState->cx);
		tr.setVersion(version);
		state int checkedRanges = 0;
		state KeyRangeMap<MutationBlock>::Ranges ranges = expectedValues.ranges();
		state KeyRangeMap<MutationBlock>::iterator it = ranges.begin();
		for (; it != ranges.end(); ++it) {
			state MutationBlock m = it->value();
			if (m.mutated) {
				checkedRanges++;
				if (m.cleared) {
					RangeResult shouldBeEmpty = wait(tr.getRange(it->range(), 1));
					if (shouldBeEmpty.size()) {
						TraceEvent(SevError, "CheckWritesFailed")
						    .detail("Class", "Clear")
						    .detail("KeyBegin", it->range().begin)
						    .detail("KeyEnd", it->range().end);
						return;
					}
				} else {
					Optional<Value> val = wait(tr.get(it->range().begin));
					if (!val.present() || val.get() != m.setValue) {
						TraceEvent evt(SevError, "CheckWritesFailed");
						evt.detail("Class", "Set").detail("Key", it->range().begin).detail("Expected", m.setValue);
						if (!val.present())
							evt.detail("Actual", "_Value Missing_");
						else
							evt.detail("Actual", val.get());
						return;
					}
				}
			}
		}
		TraceEvent("CheckWritesSuccess")
		    .detail("Version", version)
		    .detail("MutationCount", mCount)
		    .detail("CheckedRanges", checkedRanges);
	} catch (Error& e) {
		bool ok = e.code() == error_code_transaction_too_old || e.code() == error_code_future_version;
		TraceEvent(ok ? SevWarn : SevError, "CheckWritesFailed").error(e);
		throw;
	}
}

ACTOR static Future<Void> commitDummyTransaction(Reference<TransactionState> trState, KeyRange range) {
	state Transaction tr(trState->cx);
	state int retries = 0;
	state Span span("NAPI:dummyTransaction"_loc, trState->spanID);
	tr.span.addParent(span.context);
	loop {
		try {
			TraceEvent("CommitDummyTransaction").detail("Key", range.begin).detail("Retries", retries);
			tr.trState->options = trState->options;
			tr.trState->taskID = trState->taskID;
			tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
			tr.setOption(FDBTransactionOptions::CAUSAL_WRITE_RISKY);
			tr.setOption(FDBTransactionOptions::LOCK_AWARE);
			tr.addReadConflictRange(range);
			tr.addWriteConflictRange(range);
			wait(tr.commit());
			return Void();
		} catch (Error& e) {
			TraceEvent("CommitDummyTransactionError")
			    .errorUnsuppressed(e)
			    .detail("Key", range.begin)
			    .detail("Retries", retries);
			wait(tr.onError(e));
		}
		++retries;
	}
}

void Transaction::cancelWatches(Error const& e) {
	for (int i = 0; i < watches.size(); ++i)
		if (!watches[i]->onChangeTrigger.isSet())
			watches[i]->onChangeTrigger.sendError(e);

	watches.clear();
}

void Transaction::setupWatches() {
	try {
		Future<Version> watchVersion = getCommittedVersion() > 0 ? getCommittedVersion() : getReadVersion();

		for (int i = 0; i < watches.size(); ++i)
			watches[i]->setWatch(watchValueMap(watchVersion,
			                                   trState->getTenantInfo(),
			                                   watches[i]->key,
			                                   watches[i]->value,
			                                   trState->cx,
			                                   trState->options.readTags,
			                                   trState->spanID,
			                                   trState->taskID,
			                                   trState->debugID,
			                                   trState->useProvisionalProxies));

		watches.clear();
	} catch (Error&) {
		ASSERT(false); // The above code must NOT throw because commit has already occurred.
		throw internal_error();
	}
}

ACTOR Future<Optional<ClientTrCommitCostEstimation>> estimateCommitCosts(Reference<TransactionState> trState,
                                                                         CommitTransactionRef const* transaction) {
	state ClientTrCommitCostEstimation trCommitCosts;
	state KeyRangeRef keyRange;
	state int i = 0;

	for (; i < transaction->mutations.size(); ++i) {
		auto* it = &transaction->mutations[i];

		if (it->type == MutationRef::Type::SetValue || it->isAtomicOp()) {
			trCommitCosts.opsCount++;
			trCommitCosts.writeCosts += getWriteOperationCost(it->expectedSize());
		} else if (it->type == MutationRef::Type::ClearRange) {
			trCommitCosts.opsCount++;
			keyRange = KeyRangeRef(it->param1, it->param2);
			if (trState->options.expensiveClearCostEstimation) {
				StorageMetrics m = wait(trState->cx->getStorageMetrics(keyRange, CLIENT_KNOBS->TOO_MANY));
				trCommitCosts.clearIdxCosts.emplace_back(i, getWriteOperationCost(m.bytes));
				trCommitCosts.writeCosts += getWriteOperationCost(m.bytes);
				++trCommitCosts.expensiveCostEstCount;
				++trState->cx->transactionsExpensiveClearCostEstCount;
			} else {
				std::vector<KeyRangeLocationInfo> locations =
				    wait(getKeyRangeLocations(trState,
				                              keyRange,
				                              CLIENT_KNOBS->TOO_MANY,
				                              Reverse::False,
				                              &StorageServerInterface::getShardState,
				                              UseTenant::True,
				                              latestVersion));
				if (locations.empty()) {
					continue;
				}

				uint64_t bytes = 0;
				if (locations.size() == 1) {
					bytes = CLIENT_KNOBS->INCOMPLETE_SHARD_PLUS;
				} else { // small clear on the boundary will hit two shards but be much smaller than the shard size
					bytes = CLIENT_KNOBS->INCOMPLETE_SHARD_PLUS * 2 +
					        (locations.size() - 2) * (int64_t)trState->cx->smoothMidShardSize.smoothTotal();
				}

				trCommitCosts.clearIdxCosts.emplace_back(i, getWriteOperationCost(bytes));
				trCommitCosts.writeCosts += getWriteOperationCost(bytes);
			}
		}
	}

	// sample on written bytes
	if (!trCommitCosts.writeCosts || !trState->cx->sampleOnCost(trCommitCosts.writeCosts))
		return Optional<ClientTrCommitCostEstimation>();

	// sample
clear op: the expectation of #sampledOp is every COMMIT_SAMPLE_COST sample once // we also scale the cost of mutations whose cost is less than COMMIT_SAMPLE_COST as scaledCost = // min(COMMIT_SAMPLE_COST, cost) If we have 4 transactions: A - 100 1-cost mutations: E[sampled ops] = 1, E[sampled // cost] = 100 B - 1 100-cost mutation: E[sampled ops] = 1, E[sampled cost] = 100 C - 50 2-cost mutations: E[sampled // ops] = 1, E[sampled cost] = 100 D - 1 150-cost mutation and 150 1-cost mutations: E[sampled ops] = 3, E[sampled // cost] = 150cost * 1 + 150 * 100cost * 0.01 = 300 ASSERT(trCommitCosts.writeCosts > 0); std::deque> newClearIdxCosts; for (const auto& [idx, cost] : trCommitCosts.clearIdxCosts) { if (trCommitCosts.writeCosts >= CLIENT_KNOBS->COMMIT_SAMPLE_COST) { double mul = trCommitCosts.writeCosts / std::max(1.0, (double)CLIENT_KNOBS->COMMIT_SAMPLE_COST); if (deterministicRandom()->random01() < cost * mul / trCommitCosts.writeCosts) { newClearIdxCosts.emplace_back( idx, cost < CLIENT_KNOBS->COMMIT_SAMPLE_COST ? CLIENT_KNOBS->COMMIT_SAMPLE_COST : cost); } } else if (deterministicRandom()->random01() < (double)cost / trCommitCosts.writeCosts) { newClearIdxCosts.emplace_back( idx, cost < CLIENT_KNOBS->COMMIT_SAMPLE_COST ? CLIENT_KNOBS->COMMIT_SAMPLE_COST : cost); } } trCommitCosts.clearIdxCosts.swap(newClearIdxCosts); return trCommitCosts; } // TODO: send the prefix as part of the commit request and ship it all the way // through to the storage servers void applyTenantPrefix(CommitTransactionRequest& req, Key tenantPrefix) { for (auto& m : req.transaction.mutations) { if (m.param1 != metadataVersionKey) { m.param1 = m.param1.withPrefix(tenantPrefix, req.arena); if (m.type == MutationRef::ClearRange) { m.param2 = m.param2.withPrefix(tenantPrefix, req.arena); } else if (m.type == MutationRef::SetVersionstampedKey) { uint8_t* key = mutateString(m.param1); int* offset = reinterpret_cast(&key[m.param1.size() - 4]); *offset += tenantPrefix.size(); } } } for (auto& rc : req.transaction.read_conflict_ranges) { if (rc.begin != metadataVersionKey) { rc = rc.withPrefix(tenantPrefix, req.arena); } } for (auto& wc : req.transaction.write_conflict_ranges) { if (wc.begin != metadataVersionKey) { wc = wc.withPrefix(tenantPrefix, req.arena); } } } ACTOR static Future tryCommit(Reference trState, CommitTransactionRequest req, Future readVersion) { state TraceInterval interval("TransactionCommit"); state double startTime = now(); state Span span("NAPI:tryCommit"_loc, trState->spanID); state Optional debugID = trState->debugID; if (debugID.present()) { TraceEvent(interval.begin()).detail("Parent", debugID.get()); } try { if (CLIENT_BUGGIFY) { throw deterministicRandom()->randomChoice(std::vector{ not_committed(), transaction_too_old(), proxy_memory_limit_exceeded(), commit_unknown_result() }); } if (req.tagSet.present() && trState->options.priority < TransactionPriority::IMMEDIATE) { wait(store(req.transaction.read_snapshot, readVersion) && store(req.commitCostEstimation, estimateCommitCosts(trState, &req.transaction))); } else { wait(store(req.transaction.read_snapshot, readVersion)); } state Key tenantPrefix; if (trState->tenant().present()) { KeyRangeLocationInfo locationInfo = wait(getKeyLocation(trState, ""_sr, &StorageServerInterface::getValue, Reverse::False, UseTenant::True, req.transaction.read_snapshot)); applyTenantPrefix(req, locationInfo.tenantEntry.prefix); tenantPrefix = locationInfo.tenantEntry.prefix; } req.tenantInfo = trState->getTenantInfo(); startTime = now(); state Optional commitID = 
Optional(); if (debugID.present()) { commitID = nondeterministicRandom()->randomUniqueID(); g_traceBatch.addAttach("CommitAttachID", debugID.get().first(), commitID.get().first()); g_traceBatch.addEvent("CommitDebug", commitID.get().first(), "NativeAPI.commit.Before"); } req.debugID = commitID; state Future reply; if (trState->options.commitOnFirstProxy) { if (trState->cx->clientInfo->get().firstCommitProxy.present()) { reply = throwErrorOr(brokenPromiseToMaybeDelivered( trState->cx->clientInfo->get().firstCommitProxy.get().commit.tryGetReply(req))); } else { const std::vector& proxies = trState->cx->clientInfo->get().commitProxies; reply = proxies.size() ? throwErrorOr(brokenPromiseToMaybeDelivered(proxies[0].commit.tryGetReply(req))) : Never(); } } else { reply = basicLoadBalance(trState->cx->getCommitProxies(trState->useProvisionalProxies), &CommitProxyInterface::commit, req, TaskPriority::DefaultPromiseEndpoint, AtMostOnce::True); } state double grvTime = now(); choose { when(wait(trState->cx->onProxiesChanged())) { reply.cancel(); throw request_maybe_delivered(); } when(CommitID ci = wait(reply)) { Version v = ci.version; if (v != invalidVersion) { if (CLIENT_BUGGIFY) { throw commit_unknown_result(); } trState->cx->updateCachedReadVersion(grvTime, v); if (debugID.present()) TraceEvent(interval.end()).detail("CommittedVersion", v); trState->committedVersion = v; if (v > trState->cx->metadataVersionCache[trState->cx->mvCacheInsertLocation].first) { trState->cx->mvCacheInsertLocation = (trState->cx->mvCacheInsertLocation + 1) % trState->cx->metadataVersionCache.size(); trState->cx->metadataVersionCache[trState->cx->mvCacheInsertLocation] = std::make_pair(v, ci.metadataVersion); } Standalone ret = makeString(10); placeVersionstamp(mutateString(ret), v, ci.txnBatchId); trState->versionstampPromise.send(ret); trState->numErrors = 0; ++trState->cx->transactionsCommitCompleted; trState->cx->transactionCommittedMutations += req.transaction.mutations.size(); trState->cx->transactionCommittedMutationBytes += req.transaction.mutations.expectedSize(); if (commitID.present()) g_traceBatch.addEvent("CommitDebug", commitID.get().first(), "NativeAPI.commit.After"); double latency = now() - startTime; trState->cx->commitLatencies.addSample(latency); trState->cx->latencies.addSample(now() - trState->startTime); if (trState->trLogInfo) trState->trLogInfo->addLog( FdbClientLogEvents::EventCommit_V2(startTime, trState->cx->clientLocality.dcId(), latency, req.transaction.mutations.size(), req.transaction.mutations.expectedSize(), ci.version, req, trState->tenant())); return Void(); } else { // clear the RYW transaction which contains previous conflicting keys trState->conflictingKeys.reset(); if (ci.conflictingKRIndices.present()) { trState->conflictingKeys = std::make_shared>(conflictingKeysFalse, specialKeys.end); state Standalone> conflictingKRIndices = ci.conflictingKRIndices.get(); // drop duplicate indices and merge overlapped ranges // Note: addReadConflictRange in native transaction object does not merge overlapped ranges state std::unordered_set mergedIds(conflictingKRIndices.begin(), conflictingKRIndices.end()); for (auto const& rCRIndex : mergedIds) { const KeyRangeRef kr = req.transaction.read_conflict_ranges[rCRIndex]; const KeyRange krWithPrefix = KeyRangeRef(kr.begin.removePrefix(tenantPrefix).withPrefix(conflictingKeysRange.begin), kr.end.removePrefix(tenantPrefix).withPrefix(conflictingKeysRange.begin)); trState->conflictingKeys->insert(krWithPrefix, conflictingKeysTrue); } } if 
(debugID.present()) TraceEvent(interval.end()).detail("Conflict", 1); if (commitID.present()) g_traceBatch.addEvent("CommitDebug", commitID.get().first(), "NativeAPI.commit.After"); throw not_committed(); } } } } catch (Error& e) { if (e.code() == error_code_request_maybe_delivered || e.code() == error_code_commit_unknown_result) { // We don't know if the commit happened, and it might even still be in flight. if (!trState->options.causalWriteRisky) { // Make sure it's not still in flight, either by ensuring the master we submitted to is dead, or the // version we submitted with is dead, or by committing a conflicting transaction successfully // if ( cx->getCommitProxies()->masterGeneration <= originalMasterGeneration ) // To ensure the original request is not in flight, we need a key range which intersects its read // conflict ranges We pick a key range which also intersects its write conflict ranges, since that // avoids potentially creating conflicts where there otherwise would be none We make the range as small // as possible (a single key range) to minimize conflicts The intersection will never be empty, because // if it were (since !causalWriteRisky) makeSelfConflicting would have been applied automatically to req KeyRangeRef selfConflictingRange = intersects(req.transaction.write_conflict_ranges, req.transaction.read_conflict_ranges).get(); TEST(true); // Waiting for dummy transaction to report commit_unknown_result wait(commitDummyTransaction(trState, singleKeyRange(selfConflictingRange.begin))); } // The user needs to be informed that we aren't sure whether the commit happened. Standard retry loops // retry it anyway (relying on transaction idempotence) but a client might do something else. throw commit_unknown_result(); } else if (e.code() == error_code_unknown_tenant) { ASSERT(trState->tenant().present()); trState->cx->invalidateCachedTenant(trState->tenant().get()); throw; } else { if (e.code() != error_code_transaction_too_old && e.code() != error_code_not_committed && e.code() != error_code_database_locked && e.code() != error_code_proxy_memory_limit_exceeded && e.code() != error_code_batch_transaction_throttled && e.code() != error_code_tag_throttled && e.code() != error_code_process_behind && e.code() != error_code_future_version && e.code() != error_code_tenant_not_found) { TraceEvent(SevError, "TryCommitError").error(e); } if (trState->trLogInfo) trState->trLogInfo->addLog(FdbClientLogEvents::EventCommitError( startTime, trState->cx->clientLocality.dcId(), static_cast(e.code()), req, trState->tenant())); throw; } } } Future Transaction::commitMutations() { try { // if this is a read-only transaction return immediately if (!tr.transaction.write_conflict_ranges.size() && !tr.transaction.mutations.size()) { trState->numErrors = 0; trState->committedVersion = invalidVersion; trState->versionstampPromise.sendError(no_commit_version()); return Void(); } ++trState->cx->transactionsCommitStarted; if (trState->options.readOnly) return transaction_read_only(); trState->cx->mutationsPerCommit.addSample(tr.transaction.mutations.size()); trState->cx->bytesPerCommit.addSample(tr.transaction.mutations.expectedSize()); if (trState->options.tags.size()) tr.tagSet = trState->options.tags; size_t transactionSize = getSize(); if (transactionSize > (uint64_t)FLOW_KNOBS->PACKET_WARNING) { TraceEvent(!g_network->isSimulated() ? 
SevWarnAlways : SevWarn, "LargeTransaction") .suppressFor(1.0) .detail("Size", transactionSize) .detail("NumMutations", tr.transaction.mutations.size()) .detail("ReadConflictSize", tr.transaction.read_conflict_ranges.expectedSize()) .detail("WriteConflictSize", tr.transaction.write_conflict_ranges.expectedSize()) .detail("DebugIdentifier", trState->trLogInfo ? trState->trLogInfo->identifier : ""); } if (!apiVersionAtLeast(300)) { transactionSize = tr.transaction.mutations.expectedSize(); // Old API versions didn't account for conflict ranges when // determining whether to throw transaction_too_large } if (transactionSize > trState->options.sizeLimit) { return transaction_too_large(); } if (!readVersion.isValid()) getReadVersion( GetReadVersionRequest::FLAG_CAUSAL_READ_RISKY); // sets up readVersion field. We had no reads, so no // need for (expensive) full causal consistency. bool isCheckingWrites = trState->options.checkWritesEnabled && deterministicRandom()->random01() < 0.01; for (int i = 0; i < extraConflictRanges.size(); i++) if (extraConflictRanges[i].isReady() && extraConflictRanges[i].get().first < extraConflictRanges[i].get().second) tr.transaction.read_conflict_ranges.emplace_back( tr.arena, extraConflictRanges[i].get().first, extraConflictRanges[i].get().second); if (!trState->options.causalWriteRisky && !intersects(tr.transaction.write_conflict_ranges, tr.transaction.read_conflict_ranges).present()) makeSelfConflicting(); if (isCheckingWrites) { // add all writes into the read conflict range... tr.transaction.read_conflict_ranges.append( tr.arena, tr.transaction.write_conflict_ranges.begin(), tr.transaction.write_conflict_ranges.size()); } if (trState->options.debugDump) { UID u = nondeterministicRandom()->randomUniqueID(); TraceEvent("TransactionDump", u).log(); for (auto i = tr.transaction.mutations.begin(); i != tr.transaction.mutations.end(); ++i) TraceEvent("TransactionMutation", u) .detail("T", i->type) .detail("P1", i->param1) .detail("P2", i->param2); } if (trState->options.lockAware) { tr.flags = tr.flags | CommitTransactionRequest::FLAG_IS_LOCK_AWARE; } if (trState->options.firstInBatch) { tr.flags = tr.flags | CommitTransactionRequest::FLAG_FIRST_IN_BATCH; } if (trState->options.reportConflictingKeys) { tr.transaction.report_conflicting_keys = true; } Future commitResult = tryCommit(trState, tr, readVersion); if (isCheckingWrites) { Promise committed; checkWrites(trState, commitResult, committed, tr); return committed.getFuture(); } return commitResult; } catch (Error& e) { TraceEvent("ClientCommitError").error(e); return Future(e); } catch (...) 
{ Error e(error_code_unknown_error); TraceEvent("ClientCommitError").error(e); return Future(e); } } ACTOR Future commitAndWatch(Transaction* self) { try { wait(self->commitMutations()); self->getDatabase()->transactionTracingSample = (self->getCommittedVersion() % 60000000) < (60000000 * FLOW_KNOBS->TRACING_SAMPLE_RATE); if (!self->watches.empty()) { self->setupWatches(); } if (!self->apiVersionAtLeast(700)) { self->reset(); } return Void(); } catch (Error& e) { if (e.code() != error_code_actor_cancelled) { if (!self->watches.empty()) { self->cancelWatches(e); } self->trState->versionstampPromise.sendError(transaction_invalid_version()); if (!self->apiVersionAtLeast(700)) { self->reset(); } } throw; } } Future Transaction::commit() { ASSERT(!committing.isValid()); committing = commitAndWatch(this); return committing; } void Transaction::setOption(FDBTransactionOptions::Option option, Optional value) { switch (option) { case FDBTransactionOptions::INITIALIZE_NEW_DATABASE: validateOptionValueNotPresent(value); if (readVersion.isValid()) throw read_version_already_set(); readVersion = Version(0); trState->options.causalWriteRisky = true; break; case FDBTransactionOptions::CAUSAL_READ_RISKY: validateOptionValueNotPresent(value); trState->options.getReadVersionFlags |= GetReadVersionRequest::FLAG_CAUSAL_READ_RISKY; break; case FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE: validateOptionValueNotPresent(value); trState->options.priority = TransactionPriority::IMMEDIATE; break; case FDBTransactionOptions::PRIORITY_BATCH: validateOptionValueNotPresent(value); trState->options.priority = TransactionPriority::BATCH; break; case FDBTransactionOptions::CAUSAL_WRITE_RISKY: validateOptionValueNotPresent(value); trState->options.causalWriteRisky = true; break; case FDBTransactionOptions::COMMIT_ON_FIRST_PROXY: validateOptionValueNotPresent(value); trState->options.commitOnFirstProxy = true; break; case FDBTransactionOptions::CHECK_WRITES_ENABLE: validateOptionValueNotPresent(value); trState->options.checkWritesEnabled = true; break; case FDBTransactionOptions::DEBUG_DUMP: validateOptionValueNotPresent(value); trState->options.debugDump = true; break; case FDBTransactionOptions::TRANSACTION_LOGGING_ENABLE: setOption(FDBTransactionOptions::DEBUG_TRANSACTION_IDENTIFIER, value); setOption(FDBTransactionOptions::LOG_TRANSACTION); break; case FDBTransactionOptions::DEBUG_TRANSACTION_IDENTIFIER: validateOptionValuePresent(value); if (value.get().size() > 100 || value.get().size() == 0) { throw invalid_option_value(); } if (trState->trLogInfo) { if (trState->trLogInfo->identifier.empty()) { trState->trLogInfo->identifier = value.get().printable(); } else if (trState->trLogInfo->identifier != value.get().printable()) { TraceEvent(SevWarn, "CannotChangeDebugTransactionIdentifier") .detail("PreviousIdentifier", trState->trLogInfo->identifier) .detail("NewIdentifier", value.get()); throw client_invalid_operation(); } } else { trState->trLogInfo = makeReference(value.get().printable(), TransactionLogInfo::DONT_LOG); trState->trLogInfo->maxFieldLength = trState->options.maxTransactionLoggingFieldLength; } if (trState->debugID.present()) { TraceEvent(SevInfo, "TransactionBeingTraced") .detail("DebugTransactionID", trState->trLogInfo->identifier) .detail("ServerTraceID", trState->debugID.get()); } break; case FDBTransactionOptions::LOG_TRANSACTION: validateOptionValueNotPresent(value); if (trState->trLogInfo && !trState->trLogInfo->identifier.empty()) { trState->trLogInfo->logTo(TransactionLogInfo::TRACE_LOG); } 
else { TraceEvent(SevWarn, "DebugTransactionIdentifierNotSet") .detail("Error", "Debug Transaction Identifier option must be set before logging the transaction"); throw client_invalid_operation(); } break; case FDBTransactionOptions::TRANSACTION_LOGGING_MAX_FIELD_LENGTH: validateOptionValuePresent(value); { int maxFieldLength = extractIntOption(value, -1, std::numeric_limits::max()); if (maxFieldLength == 0) { throw invalid_option_value(); } trState->options.maxTransactionLoggingFieldLength = maxFieldLength; } if (trState->trLogInfo) { trState->trLogInfo->maxFieldLength = trState->options.maxTransactionLoggingFieldLength; } break; case FDBTransactionOptions::SERVER_REQUEST_TRACING: validateOptionValueNotPresent(value); debugTransaction(deterministicRandom()->randomUniqueID()); if (trState->trLogInfo && !trState->trLogInfo->identifier.empty()) { TraceEvent(SevInfo, "TransactionBeingTraced") .detail("DebugTransactionID", trState->trLogInfo->identifier) .detail("ServerTraceID", trState->debugID.get()); } break; case FDBTransactionOptions::MAX_RETRY_DELAY: validateOptionValuePresent(value); trState->options.maxBackoff = extractIntOption(value, 0, std::numeric_limits::max()) / 1000.0; break; case FDBTransactionOptions::SIZE_LIMIT: validateOptionValuePresent(value); trState->options.sizeLimit = extractIntOption(value, 32, CLIENT_KNOBS->TRANSACTION_SIZE_LIMIT); break; case FDBTransactionOptions::LOCK_AWARE: validateOptionValueNotPresent(value); trState->options.lockAware = true; trState->options.readOnly = false; break; case FDBTransactionOptions::READ_LOCK_AWARE: validateOptionValueNotPresent(value); if (!trState->options.lockAware) { trState->options.lockAware = true; trState->options.readOnly = true; } break; case FDBTransactionOptions::FIRST_IN_BATCH: validateOptionValueNotPresent(value); trState->options.firstInBatch = true; break; case FDBTransactionOptions::USE_PROVISIONAL_PROXIES: validateOptionValueNotPresent(value); trState->options.getReadVersionFlags |= GetReadVersionRequest::FLAG_USE_PROVISIONAL_PROXIES; trState->useProvisionalProxies = UseProvisionalProxies::True; break; case FDBTransactionOptions::INCLUDE_PORT_IN_ADDRESS: validateOptionValueNotPresent(value); trState->options.includePort = true; break; case FDBTransactionOptions::TAG: validateOptionValuePresent(value); trState->options.tags.addTag(value.get()); break; case FDBTransactionOptions::AUTO_THROTTLE_TAG: validateOptionValuePresent(value); trState->options.tags.addTag(value.get()); trState->options.readTags.addTag(value.get()); break; case FDBTransactionOptions::SPAN_PARENT: validateOptionValuePresent(value); if (value.get().size() != 16) { throw invalid_option_value(); } span.addParent(BinaryReader::fromStringRef(value.get(), Unversioned())); break; case FDBTransactionOptions::REPORT_CONFLICTING_KEYS: validateOptionValueNotPresent(value); trState->options.reportConflictingKeys = true; break; case FDBTransactionOptions::EXPENSIVE_CLEAR_COST_ESTIMATION_ENABLE: validateOptionValueNotPresent(value); trState->options.expensiveClearCostEstimation = true; break; case FDBTransactionOptions::USE_GRV_CACHE: validateOptionValueNotPresent(value); if (trState->numErrors == 0) { trState->options.useGrvCache = true; } break; case FDBTransactionOptions::SKIP_GRV_CACHE: validateOptionValueNotPresent(value); trState->options.skipGrvCache = true; break; case FDBTransactionOptions::READ_SYSTEM_KEYS: case FDBTransactionOptions::ACCESS_SYSTEM_KEYS: case FDBTransactionOptions::RAW_ACCESS: // System key access implies raw access. 
Native API handles the raw access, // system key access is handled in RYW. validateOptionValueNotPresent(value); if (trState->hasTenant()) { Error e = invalid_option(); TraceEvent(SevWarn, "TenantTransactionRawAccess").error(e).detail("Tenant", trState->tenant()); throw e; } trState->options.rawAccess = true; break; default: break; } } ACTOR Future getConsistentReadVersion(SpanID parentSpan, DatabaseContext* cx, uint32_t transactionCount, TransactionPriority priority, uint32_t flags, TransactionTagMap tags, Optional debugID) { state Span span("NAPI:getConsistentReadVersion"_loc, parentSpan); ++cx->transactionReadVersionBatches; if (debugID.present()) g_traceBatch.addEvent("TransactionDebug", debugID.get().first(), "NativeAPI.getConsistentReadVersion.Before"); loop { try { state GetReadVersionRequest req(span.context, transactionCount, priority, cx->ssVersionVectorCache.getMaxVersion(), flags, tags, debugID); state Future onProxiesChanged = cx->onProxiesChanged(); choose { when(wait(onProxiesChanged)) { onProxiesChanged = cx->onProxiesChanged(); } when(GetReadVersionReply v = wait(basicLoadBalance(cx->getGrvProxies(UseProvisionalProxies( flags & GetReadVersionRequest::FLAG_USE_PROVISIONAL_PROXIES)), &GrvProxyInterface::getConsistentReadVersion, req, cx->taskID))) { if (tags.size() != 0) { auto& priorityThrottledTags = cx->throttledTags[priority]; for (auto& tag : tags) { auto itr = v.tagThrottleInfo.find(tag.first); if (itr == v.tagThrottleInfo.end()) { TEST(true); // Removing client throttle priorityThrottledTags.erase(tag.first); } else { TEST(true); // Setting client throttle auto result = priorityThrottledTags.try_emplace(tag.first, itr->second); if (!result.second) { result.first->second.update(itr->second); } } } } if (debugID.present()) g_traceBatch.addEvent( "TransactionDebug", debugID.get().first(), "NativeAPI.getConsistentReadVersion.After"); ASSERT(v.version > 0); cx->minAcceptableReadVersion = std::min(cx->minAcceptableReadVersion, v.version); if (cx->isCurrentGrvProxy(v.proxyId)) { cx->ssVersionVectorCache.applyDelta(v.ssVersionVectorDelta); } else { continue; // stale GRV reply, retry } return v; } } } catch (Error& e) { if (e.code() != error_code_broken_promise && e.code() != error_code_batch_transaction_throttled) TraceEvent(SevError, "GetConsistentReadVersionError").error(e); if (e.code() == error_code_batch_transaction_throttled && !cx->apiVersionAtLeast(630)) { wait(delayJittered(5.0)); } else { throw; } } } } ACTOR Future readVersionBatcher(DatabaseContext* cx, FutureStream versionStream, TransactionPriority priority, uint32_t flags) { state std::vector> requests; state PromiseStream> addActor; state Future collection = actorCollection(addActor.getFuture()); state Future timeout; state Optional debugID; state bool send_batch; state Reference batchSizeDist = Histogram::getHistogram(LiteralStringRef("GrvBatcher"), LiteralStringRef("ClientGrvBatchSize"), Histogram::Unit::countLinear, 0, CLIENT_KNOBS->MAX_BATCH_SIZE * 2); state Reference batchIntervalDist = Histogram::getHistogram(LiteralStringRef("GrvBatcher"), LiteralStringRef("ClientGrvBatchInterval"), Histogram::Unit::microseconds, 0, CLIENT_KNOBS->GRV_BATCH_TIMEOUT * 1000000 * 2); state Reference grvReplyLatencyDist = Histogram::getHistogram( LiteralStringRef("GrvBatcher"), LiteralStringRef("ClientGrvReplyLatency"), Histogram::Unit::microseconds); state double lastRequestTime = now(); state TransactionTagMap tags; // dynamic batching state PromiseStream replyTimes; state PromiseStream _errorStream; state double 
batchTime = 0; state Span span("NAPI:readVersionBatcher"_loc); loop { send_batch = false; choose { when(DatabaseContext::VersionRequest req = waitNext(versionStream)) { if (req.debugID.present()) { if (!debugID.present()) { debugID = nondeterministicRandom()->randomUniqueID(); } g_traceBatch.addAttach("TransactionAttachID", req.debugID.get().first(), debugID.get().first()); } span.addParent(req.spanContext); requests.push_back(req.reply); for (auto tag : req.tags) { ++tags[tag]; } if (requests.size() == CLIENT_KNOBS->MAX_BATCH_SIZE) { send_batch = true; ++cx->transactionGrvFullBatches; } else if (!timeout.isValid()) { timeout = delay(batchTime, TaskPriority::GetConsistentReadVersion); } } when(wait(timeout.isValid() ? timeout : Never())) { send_batch = true; ++cx->transactionGrvTimedOutBatches; } // dynamic batching monitors reply latencies when(double reply_latency = waitNext(replyTimes.getFuture())) { double target_latency = reply_latency * 0.5; batchTime = std::min(0.1 * target_latency + 0.9 * batchTime, CLIENT_KNOBS->GRV_BATCH_TIMEOUT); grvReplyLatencyDist->sampleSeconds(reply_latency); } when(wait(collection)) {} // for errors } if (send_batch) { int count = requests.size(); ASSERT(count); batchSizeDist->sampleRecordCounter(count); auto requestTime = now(); batchIntervalDist->sampleSeconds(requestTime - lastRequestTime); lastRequestTime = requestTime; // dynamic batching Promise GRVReply; requests.push_back(GRVReply); addActor.send(ready(timeReply(GRVReply.getFuture(), replyTimes))); Future batch = incrementalBroadcastWithError( getConsistentReadVersion(span.context, cx, count, priority, flags, std::move(tags), std::move(debugID)), std::move(requests), CLIENT_KNOBS->BROADCAST_BATCH_SIZE); span = Span("NAPI:readVersionBatcher"_loc); tags.clear(); debugID = Optional(); requests.clear(); addActor.send(batch); timeout = Future(); } } } ACTOR Future extractReadVersion(Reference trState, Location location, SpanID spanContext, Future f, Promise> metadataVersion) { state Span span(spanContext, location, { trState->spanID }); GetReadVersionReply rep = wait(f); double replyTime = now(); double latency = replyTime - trState->startTime; trState->cx->lastProxyRequestTime = trState->startTime; trState->cx->updateCachedReadVersion(trState->startTime, rep.version); if (rep.rkBatchThrottled) { trState->cx->lastRkBatchThrottleTime = replyTime; } if (rep.rkDefaultThrottled) { trState->cx->lastRkDefaultThrottleTime = replyTime; } trState->cx->GRVLatencies.addSample(latency); if (trState->trLogInfo) trState->trLogInfo->addLog(FdbClientLogEvents::EventGetVersion_V3(trState->startTime, trState->cx->clientLocality.dcId(), latency, trState->options.priority, rep.version, trState->tenant())); if (rep.locked && !trState->options.lockAware) throw database_locked(); ++trState->cx->transactionReadVersionsCompleted; switch (trState->options.priority) { case TransactionPriority::IMMEDIATE: ++trState->cx->transactionImmediateReadVersionsCompleted; break; case TransactionPriority::DEFAULT: ++trState->cx->transactionDefaultReadVersionsCompleted; break; case TransactionPriority::BATCH: ++trState->cx->transactionBatchReadVersionsCompleted; break; default: ASSERT(false); } if (trState->options.tags.size() != 0) { auto& priorityThrottledTags = trState->cx->throttledTags[trState->options.priority]; for (auto& tag : trState->options.tags) { auto itr = priorityThrottledTags.find(tag); if (itr != priorityThrottledTags.end()) { if (itr->second.expired()) { priorityThrottledTags.erase(itr); } else if 
(itr->second.throttleDuration() > 0) { TEST(true); // throttling transaction after getting read version ++trState->cx->transactionReadVersionsThrottled; throw tag_throttled(); } } } for (auto& tag : trState->options.tags) { auto itr = priorityThrottledTags.find(tag); if (itr != priorityThrottledTags.end()) { itr->second.addReleased(1); } } } if (rep.version > trState->cx->metadataVersionCache[trState->cx->mvCacheInsertLocation].first) { trState->cx->mvCacheInsertLocation = (trState->cx->mvCacheInsertLocation + 1) % trState->cx->metadataVersionCache.size(); trState->cx->metadataVersionCache[trState->cx->mvCacheInsertLocation] = std::make_pair(rep.version, rep.metadataVersion); } metadataVersion.send(rep.metadataVersion); if (trState->cx->isCurrentGrvProxy(rep.proxyId)) { trState->cx->ssVersionVectorCache.applyDelta(rep.ssVersionVectorDelta); } else { trState->cx->ssVersionVectorCache.clear(); } return rep.version; } bool rkThrottlingCooledDown(DatabaseContext* cx, TransactionPriority priority) { if (priority == TransactionPriority::IMMEDIATE) { return true; } else if (priority == TransactionPriority::BATCH) { if (cx->lastRkBatchThrottleTime == 0.0) { return true; } return (now() - cx->lastRkBatchThrottleTime > CLIENT_KNOBS->GRV_CACHE_RK_COOLDOWN); } else if (priority == TransactionPriority::DEFAULT) { if (cx->lastRkDefaultThrottleTime == 0.0) { return true; } return (now() - cx->lastRkDefaultThrottleTime > CLIENT_KNOBS->GRV_CACHE_RK_COOLDOWN); } return false; } Future Transaction::getReadVersion(uint32_t flags) { if (!readVersion.isValid()) { if (!CLIENT_KNOBS->FORCE_GRV_CACHE_OFF && !trState->options.skipGrvCache && (deterministicRandom()->random01() <= CLIENT_KNOBS->DEBUG_USE_GRV_CACHE_CHANCE || trState->options.useGrvCache) && rkThrottlingCooledDown(getDatabase().getPtr(), trState->options.priority)) { // Upon our first request to use cached RVs, start the background updater if (!trState->cx->grvUpdateHandler.isValid()) { trState->cx->grvUpdateHandler = backgroundGrvUpdater(getDatabase().getPtr()); } Version rv = trState->cx->getCachedReadVersion(); double lastTime = trState->cx->getLastGrvTime(); double requestTime = now(); if (requestTime - lastTime <= CLIENT_KNOBS->MAX_VERSION_CACHE_LAG && rv != Version(0)) { ASSERT(!debug_checkVersionTime(rv, requestTime, "CheckStaleness")); readVersion = rv; return readVersion; } // else go through regular GRV path } ++trState->cx->transactionReadVersions; flags |= trState->options.getReadVersionFlags; switch (trState->options.priority) { case TransactionPriority::IMMEDIATE: flags |= GetReadVersionRequest::PRIORITY_SYSTEM_IMMEDIATE; ++trState->cx->transactionImmediateReadVersions; break; case TransactionPriority::DEFAULT: flags |= GetReadVersionRequest::PRIORITY_DEFAULT; ++trState->cx->transactionDefaultReadVersions; break; case TransactionPriority::BATCH: flags |= GetReadVersionRequest::PRIORITY_BATCH; ++trState->cx->transactionBatchReadVersions; break; default: ASSERT(false); } if (trState->options.tags.size() != 0) { double maxThrottleDelay = 0.0; bool canRecheck = false; auto& priorityThrottledTags = trState->cx->throttledTags[trState->options.priority]; for (auto& tag : trState->options.tags) { auto itr = priorityThrottledTags.find(tag); if (itr != priorityThrottledTags.end()) { if (!itr->second.expired()) { maxThrottleDelay = std::max(maxThrottleDelay, itr->second.throttleDuration()); canRecheck = itr->second.canRecheck(); } else { priorityThrottledTags.erase(itr); } } } if (maxThrottleDelay > 0.0 && !canRecheck) { // TODO: allow delaying? 
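				// Enforce known tag throttles client-side: if every throttled tag still has time
				// remaining and none is due for a recheck, fail the transaction with tag_throttled
				// immediately rather than spending a GRV request on it.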
TEST(true); // Throttling tag before GRV request ++trState->cx->transactionReadVersionsThrottled; readVersion = tag_throttled(); return readVersion; } else { TEST(maxThrottleDelay > 0.0); // Rechecking throttle } for (auto& tag : trState->options.tags) { auto itr = priorityThrottledTags.find(tag); if (itr != priorityThrottledTags.end()) { itr->second.updateChecked(); } } } auto& batcher = trState->cx->versionBatcher[flags]; if (!batcher.actor.isValid()) { batcher.actor = readVersionBatcher(trState->cx.getPtr(), batcher.stream.getFuture(), trState->options.priority, flags); } Location location = "NAPI:getReadVersion"_loc; UID spanContext = generateSpanID(trState->cx->transactionTracingSample, trState->spanID); auto const req = DatabaseContext::VersionRequest(spanContext, trState->options.tags, trState->debugID); batcher.stream.send(req); trState->startTime = now(); readVersion = extractReadVersion(trState, location, spanContext, req.reply.getFuture(), metadataVersion); } return readVersion; } Optional Transaction::getCachedReadVersion() const { if (readVersion.isValid() && readVersion.isReady() && !readVersion.isError()) { return readVersion.get(); } else { return Optional(); } } Future> Transaction::getVersionstamp() { if (committing.isValid()) { return transaction_invalid_version(); } return trState->versionstampPromise.getFuture(); } // Gets the protocol version reported by a coordinator via the protocol info interface ACTOR Future getCoordinatorProtocol(NetworkAddressList coordinatorAddresses) { RequestStream requestStream{ Endpoint::wellKnown({ coordinatorAddresses }, WLTOKEN_PROTOCOL_INFO) }; ProtocolInfoReply reply = wait(retryBrokenPromise(requestStream, ProtocolInfoRequest{})); return reply.version; } // Gets the protocol version reported by a coordinator in its connect packet // If we are unable to get a version from the connect packet (e.g. because we lost connection with the peer), then this // function will return with an unset result. // If an expected version is given, this future won't return if the actual protocol version matches the expected version ACTOR Future> getCoordinatorProtocolFromConnectPacket( NetworkAddress coordinatorAddress, Optional expectedVersion) { state Reference> const> protocolVersion = FlowTransport::transport().getPeerProtocolAsyncVar(coordinatorAddress); loop { if (protocolVersion->get().present() && protocolVersion->get() != expectedVersion) { return protocolVersion->get(); } Future change = protocolVersion->onChange(); if (!protocolVersion->get().present()) { // If we still don't have any connection info after a timeout, retry sending the protocol version request change = timeout(change, FLOW_KNOBS->CONNECTION_MONITOR_TIMEOUT, Void()); } wait(change); if (!protocolVersion->get().present()) { return protocolVersion->get(); } } } // Returns the protocol version reported by the given coordinator // If an expected version is given, the future won't return until the protocol version is different than expected ACTOR Future getClusterProtocolImpl( Reference> const> coordinator, Optional expectedVersion) { state bool needToConnect = true; state Future protocolVersion = Never(); loop { if (!coordinator->get().present()) { wait(coordinator->onChange()); } else { Endpoint coordinatorEndpoint = coordinator->get().get().getLeader.getEndpoint(); if (needToConnect) { // Even though we typically rely on the connect packet to get the protocol version, we need to send some // request in order to start a connection. This protocol version request serves that purpose. 
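			// Whichever mechanism below answers first wins: the explicit protocol info endpoint
			// (newer servers) or the peer's connect packet (older servers). Illustrative use of
			// the public wrapper defined further down -- a hypothetical caller-side sketch, not
			// code from this file:
			//
			//   ProtocolVersion pv = wait(cx->getClusterProtocol());      // current version
			//   ProtocolVersion next = wait(cx->getClusterProtocol(pv));  // resolves only on change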
protocolVersion = getCoordinatorProtocol(coordinatorEndpoint.addresses); needToConnect = false; } choose { when(wait(coordinator->onChange())) { needToConnect = true; } when(ProtocolVersion pv = wait(protocolVersion)) { if (!expectedVersion.present() || expectedVersion.get() != pv) { return pv; } protocolVersion = Never(); } // Older versions of FDB don't have an endpoint to return the protocol version, so we get this info from // the connect packet when(Optional pv = wait(getCoordinatorProtocolFromConnectPacket( coordinatorEndpoint.getPrimaryAddress(), expectedVersion))) { if (pv.present()) { return pv.get(); } else { needToConnect = true; } } } } } } // Returns the protocol version reported by the coordinator this client is currently connected to // If an expected version is given, the future won't return until the protocol version is different than expected // Note: this will never return if the server is running a protocol from FDB 5.0 or older Future DatabaseContext::getClusterProtocol(Optional expectedVersion) { return getClusterProtocolImpl(coordinator, expectedVersion); } uint32_t Transaction::getSize() { auto s = tr.transaction.mutations.expectedSize() + tr.transaction.read_conflict_ranges.expectedSize() + tr.transaction.write_conflict_ranges.expectedSize(); return s; } Future Transaction::onError(Error const& e) { if (g_network->isSimulated() && ++trState->numErrors % 10 == 0) { TraceEvent(SevWarnAlways, "TransactionTooManyRetries").detail("NumRetries", trState->numErrors); } if (e.code() == error_code_success) { return client_invalid_operation(); } if (e.code() == error_code_not_committed || e.code() == error_code_commit_unknown_result || e.code() == error_code_database_locked || e.code() == error_code_proxy_memory_limit_exceeded || e.code() == error_code_process_behind || e.code() == error_code_batch_transaction_throttled || e.code() == error_code_tag_throttled) { if (e.code() == error_code_not_committed) ++trState->cx->transactionsNotCommitted; else if (e.code() == error_code_commit_unknown_result) ++trState->cx->transactionsMaybeCommitted; else if (e.code() == error_code_proxy_memory_limit_exceeded) ++trState->cx->transactionsResourceConstrained; else if (e.code() == error_code_process_behind) ++trState->cx->transactionsProcessBehind; else if (e.code() == error_code_batch_transaction_throttled || e.code() == error_code_tag_throttled) { ++trState->cx->transactionsThrottled; } double backoff = getBackoff(e.code()); reset(); return delay(backoff, trState->taskID); } if (e.code() == error_code_transaction_too_old || e.code() == error_code_future_version || e.code() == error_code_stale_version_vector) { if (e.code() == error_code_transaction_too_old) ++trState->cx->transactionsTooOld; else if (e.code() == error_code_future_version) ++trState->cx->transactionsFutureVersions; else if (e.code() == error_code_stale_version_vector) ++trState->cx->transactionsStaleVersionVectors; double maxBackoff = trState->options.maxBackoff; reset(); return delay(std::min(CLIENT_KNOBS->FUTURE_VERSION_RETRY_DELAY, maxBackoff), trState->taskID); } if (e.code() == error_code_unknown_tenant) { double maxBackoff = trState->options.maxBackoff; reset(); return delay(std::min(CLIENT_KNOBS->UNKNOWN_TENANT_RETRY_DELAY, maxBackoff), trState->taskID); } return e; } ACTOR Future getStorageMetricsLargeKeyRange(Database cx, KeyRange keys); ACTOR Future doGetStorageMetrics(Database cx, KeyRange keys, Reference locationInfo) { loop { try { WaitMetricsRequest req(keys, StorageMetrics(), StorageMetrics()); 
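			// A zero minimum and a negative byte maximum (set just below) make the bounds
			// trivially violated, so the storage server replies immediately with the shard's
			// current metrics instead of waiting for them to drift outside a band.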
req.min.bytes = 0; req.max.bytes = -1; StorageMetrics m = wait(loadBalance( locationInfo->locations(), &StorageServerInterface::waitMetrics, req, TaskPriority::DataDistribution)); return m; } catch (Error& e) { if (e.code() != error_code_wrong_shard_server && e.code() != error_code_all_alternatives_failed) { TraceEvent(SevError, "WaitStorageMetricsError").error(e); throw; } wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, TaskPriority::DataDistribution)); cx->invalidateCache(Key(), keys); StorageMetrics m = wait(getStorageMetricsLargeKeyRange(cx, keys)); return m; } } } ACTOR Future getStorageMetricsLargeKeyRange(Database cx, KeyRange keys) { state Span span("NAPI:GetStorageMetricsLargeKeyRange"_loc); std::vector locations = wait(getKeyRangeLocations(cx, Optional(), keys, std::numeric_limits::max(), Reverse::False, &StorageServerInterface::waitMetrics, span.context, Optional(), UseProvisionalProxies::False, latestVersion)); state int nLocs = locations.size(); state std::vector> fx(nLocs); state StorageMetrics total; KeyRef partBegin, partEnd; for (int i = 0; i < nLocs; i++) { partBegin = (i == 0) ? keys.begin : locations[i].range.begin; partEnd = (i == nLocs - 1) ? keys.end : locations[i].range.end; fx[i] = doGetStorageMetrics(cx, KeyRangeRef(partBegin, partEnd), locations[i].locations); } wait(waitForAll(fx)); for (int i = 0; i < nLocs; i++) { total += fx[i].get(); } return total; } ACTOR Future trackBoundedStorageMetrics(KeyRange keys, Reference location, StorageMetrics x, StorageMetrics halfError, PromiseStream deltaStream) { try { loop { WaitMetricsRequest req(keys, x - halfError, x + halfError); StorageMetrics nextX = wait(loadBalance(location->locations(), &StorageServerInterface::waitMetrics, req)); deltaStream.send(nextX - x); x = nextX; } } catch (Error& e) { deltaStream.sendError(e); throw e; } } ACTOR Future waitStorageMetricsMultipleLocations(std::vector locations, StorageMetrics min, StorageMetrics max, StorageMetrics permittedError) { state int nLocs = locations.size(); state std::vector> fx(nLocs); state StorageMetrics total; state PromiseStream deltas; state std::vector> wx(fx.size()); state StorageMetrics halfErrorPerMachine = permittedError * (0.5 / nLocs); state StorageMetrics maxPlus = max + halfErrorPerMachine * (nLocs - 1); state StorageMetrics minMinus = min - halfErrorPerMachine * (nLocs - 1); for (int i = 0; i < nLocs; i++) { WaitMetricsRequest req(locations[i].range, StorageMetrics(), StorageMetrics()); req.min.bytes = 0; req.max.bytes = -1; fx[i] = loadBalance(locations[i].locations->locations(), &StorageServerInterface::waitMetrics, req, TaskPriority::DataDistribution); } wait(waitForAll(fx)); // invariant: true total is between (total-permittedError/2, total+permittedError/2) for (int i = 0; i < nLocs; i++) total += fx[i].get(); if (!total.allLessOrEqual(maxPlus)) return total; if (!minMinus.allLessOrEqual(total)) return total; for (int i = 0; i < nLocs; i++) wx[i] = trackBoundedStorageMetrics( locations[i].range, locations[i].locations, fx[i].get(), halfErrorPerMachine, deltas); loop { StorageMetrics delta = waitNext(deltas.getFuture()); total += delta; if (!total.allLessOrEqual(maxPlus)) return total; if (!minMinus.allLessOrEqual(total)) return total; } } ACTOR Future extractMetrics(Future, int>> fMetrics) { std::pair, int> x = wait(fMetrics); return x.first.get(); } ACTOR Future>> getReadHotRanges(Database cx, KeyRange keys) { state Span span("NAPI:GetReadHotRanges"_loc); loop { int64_t shardLimit = 100; // Shard limit here does not really matter since 
// this function is currently only used // to find the read-hot sub ranges within a read-hot shard. std::vector locations = wait(getKeyRangeLocations(cx, Optional(), keys, shardLimit, Reverse::False, &StorageServerInterface::getReadHotRanges, span.context, Optional(), UseProvisionalProxies::False, latestVersion)); try { // TODO: how to handle this? // This function is called whenever a shard becomes read-hot. But somehow the shard was split across more // than one storage server after becoming read-hot and before this function is called, i.e. a race // condition. Should we abort and wait for the newly split shards to be hot again? state int nLocs = locations.size(); // if (nLocs > 1) { // TraceEvent("RHDDebug") // .detail("NumSSIs", nLocs) // .detail("KeysBegin", keys.begin.printable().c_str()) // .detail("KeysEnd", keys.end.printable().c_str()); // } state std::vector> fReplies(nLocs); KeyRef partBegin, partEnd; for (int i = 0; i < nLocs; i++) { partBegin = (i == 0) ? keys.begin : locations[i].range.begin; partEnd = (i == nLocs - 1) ? keys.end : locations[i].range.end; ReadHotSubRangeRequest req(KeyRangeRef(partBegin, partEnd)); fReplies[i] = loadBalance(locations[i].locations->locations(), &StorageServerInterface::getReadHotRanges, req, TaskPriority::DataDistribution); } wait(waitForAll(fReplies)); if (nLocs == 1) { TEST(true); // Single-shard read hot range request return fReplies[0].get().readHotRanges; } else { TEST(true); // Multi-shard read hot range request Standalone> results; for (int i = 0; i < nLocs; i++) { results.append(results.arena(), fReplies[i].get().readHotRanges.begin(), fReplies[i].get().readHotRanges.size()); results.arena().dependsOn(fReplies[i].get().readHotRanges.arena()); } return results; } } catch (Error& e) { if (e.code() != error_code_wrong_shard_server && e.code() != error_code_all_alternatives_failed) { TraceEvent(SevError, "GetReadHotSubRangesError").error(e); throw; } cx->invalidateCache(Key(), keys); wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, TaskPriority::DataDistribution)); } } } ACTOR Future, int>> waitStorageMetrics(Database cx, KeyRange keys, StorageMetrics min, StorageMetrics max, StorageMetrics permittedError, int shardLimit, int expectedShardCount) { state Span span("NAPI:WaitStorageMetrics"_loc, generateSpanID(cx->transactionTracingSample)); loop { std::vector locations = wait(getKeyRangeLocations(cx, Optional(), keys, shardLimit, Reverse::False, &StorageServerInterface::waitMetrics, span.context, Optional(), UseProvisionalProxies::False, latestVersion)); if (expectedShardCount >= 0 && locations.size() != expectedShardCount) { return std::make_pair(Optional(), locations.size()); } // SOMEDAY: Right now, if there are too many shards we delay and check again later. There may be a better // solution to this.
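// When the range spans several shards, waitStorageMetricsMultipleLocations (defined above,
// used just below) divides the error budget: each of the nLocs locations may be off by
// halfErrorPerMachine = permittedError / (2 * nLocs), so the summed estimate stays within
// permittedError / 2 of the true total, matching the invariant noted there.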
if (locations.size() < shardLimit) { try { Future fx; if (locations.size() > 1) { fx = waitStorageMetricsMultipleLocations(locations, min, max, permittedError); } else { WaitMetricsRequest req(keys, min, max); fx = loadBalance(locations[0].locations->locations(), &StorageServerInterface::waitMetrics, req, TaskPriority::DataDistribution); } StorageMetrics x = wait(fx); return std::make_pair(x, -1); } catch (Error& e) { if (e.code() != error_code_wrong_shard_server && e.code() != error_code_all_alternatives_failed) { TraceEvent(SevError, "WaitStorageMetricsError").error(e); throw; } cx->invalidateCache(Key(), keys); wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, TaskPriority::DataDistribution)); } } else { TraceEvent(SevWarn, "WaitStorageMetricsPenalty") .detail("Keys", keys) .detail("Limit", CLIENT_KNOBS->STORAGE_METRICS_SHARD_LIMIT) .detail("JitteredSecondsOfPenitence", CLIENT_KNOBS->STORAGE_METRICS_TOO_MANY_SHARDS_DELAY); wait(delayJittered(CLIENT_KNOBS->STORAGE_METRICS_TOO_MANY_SHARDS_DELAY, TaskPriority::DataDistribution)); // make sure that the next getKeyRangeLocations() call will actually re-fetch the range cx->invalidateCache(Key(), keys); } } } Future, int>> DatabaseContext::waitStorageMetrics( KeyRange const& keys, StorageMetrics const& min, StorageMetrics const& max, StorageMetrics const& permittedError, int shardLimit, int expectedShardCount) { return ::waitStorageMetrics(Database(Reference::addRef(this)), keys, min, max, permittedError, shardLimit, expectedShardCount); } Future DatabaseContext::getStorageMetrics(KeyRange const& keys, int shardLimit) { if (shardLimit > 0) { StorageMetrics m; m.bytes = -1; return extractMetrics(::waitStorageMetrics(Database(Reference::addRef(this)), keys, StorageMetrics(), m, StorageMetrics(), shardLimit, -1)); } else { return ::getStorageMetricsLargeKeyRange(Database(Reference::addRef(this)), keys); } } ACTOR Future>> waitDataDistributionMetricsList(Database cx, KeyRange keys, int shardLimit) { loop { choose { when(wait(cx->onProxiesChanged())) {} when(ErrorOr rep = wait(errorOr(basicLoadBalance(cx->getCommitProxies(UseProvisionalProxies::False), &CommitProxyInterface::getDDMetrics, GetDDMetricsRequest(keys, shardLimit))))) { if (rep.isError()) { throw rep.getError(); } return rep.get().storageMetricsList; } } } } Future>> DatabaseContext::getReadHotRanges(KeyRange const& keys) { return ::getReadHotRanges(Database(Reference::addRef(this)), keys); } ACTOR Future>> getRangeSplitPoints(Reference trState, KeyRange keys, int64_t chunkSize, Version version) { state Span span("NAPI:GetRangeSplitPoints"_loc, trState->spanID); loop { state std::vector locations = wait(getKeyRangeLocations(trState, keys, CLIENT_KNOBS->TOO_MANY, Reverse::False, &StorageServerInterface::getRangeSplitPoints, UseTenant::True, version)); try { state int nLocs = locations.size(); state std::vector> fReplies(nLocs); KeyRef partBegin, partEnd; for (int i = 0; i < nLocs; i++) { partBegin = (i == 0) ? keys.begin : locations[i].range.begin; partEnd = (i == nLocs - 1) ? 
keys.end : locations[i].range.end; SplitRangeRequest req(trState->getTenantInfo(), KeyRangeRef(partBegin, partEnd), chunkSize); fReplies[i] = loadBalance(locations[i].locations->locations(), &StorageServerInterface::getRangeSplitPoints, req, TaskPriority::DataDistribution); } wait(waitForAll(fReplies)); Standalone> results; results.push_back_deep(results.arena(), keys.begin); for (int i = 0; i < nLocs; i++) { if (i > 0) { results.push_back_deep(results.arena(), locations[i].range.begin); // Need this shard boundary } if (fReplies[i].get().splitPoints.size() > 0) { results.append( results.arena(), fReplies[i].get().splitPoints.begin(), fReplies[i].get().splitPoints.size()); results.arena().dependsOn(fReplies[i].get().splitPoints.arena()); } } if (results.back() != keys.end) { results.push_back_deep(results.arena(), keys.end); } return results; } catch (Error& e) { if (e.code() == error_code_wrong_shard_server || e.code() == error_code_all_alternatives_failed) { trState->cx->invalidateCache(locations[0].tenantEntry.prefix, keys); wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, TaskPriority::DataDistribution)); } else if (e.code() == error_code_unknown_tenant) { ASSERT(trState->tenant().present()); trState->cx->invalidateCachedTenant(trState->tenant().get()); wait(delay(CLIENT_KNOBS->UNKNOWN_TENANT_RETRY_DELAY, trState->taskID)); } else { TraceEvent(SevError, "GetRangeSplitPoints").error(e); throw; } } } } Future>> Transaction::getRangeSplitPoints(KeyRange const& keys, int64_t chunkSize) { return ::getRangeSplitPoints( trState, keys, chunkSize, readVersion.isValid() && readVersion.isReady() ? readVersion.get() : latestVersion); } #define BG_REQUEST_DEBUG false // the blob granule requests are a bit funky because they piggyback off the existing transaction to read from the system // keyspace ACTOR Future>> getBlobGranuleRangesActor(Transaction* self, KeyRange keyRange) { // FIXME: use streaming range read state KeyRange currentRange = keyRange; state Standalone> results; if (BG_REQUEST_DEBUG) { fmt::print("Getting Blob Granules for [{0} - {1})\n", keyRange.begin.printable(), keyRange.end.printable()); } self->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); loop { state RangeResult blobGranuleMapping = wait( krmGetRanges(self, blobGranuleMappingKeys.begin, currentRange, 1000, GetRangeLimits::BYTE_LIMIT_UNLIMITED)); for (int i = 0; i < blobGranuleMapping.size() - 1; i++) { if (blobGranuleMapping[i].value.size()) { results.push_back(results.arena(), KeyRangeRef(blobGranuleMapping[i].key, blobGranuleMapping[i + 1].key)); } } results.arena().dependsOn(blobGranuleMapping.arena()); if (blobGranuleMapping.more) { currentRange = KeyRangeRef(blobGranuleMapping.back().key, currentRange.end); } else { return results; } } } Future>> Transaction::getBlobGranuleRanges(const KeyRange& range) { return ::getBlobGranuleRangesActor(this, range); } // hack (for now) to get blob worker interface into load balance struct BWLocationInfo : MultiInterface> { using Locations = MultiInterface>; explicit BWLocationInfo(const std::vector>>& v) : Locations(v) {} }; ACTOR Future>> readBlobGranulesActor( Transaction* self, KeyRange range, Version begin, Optional read, Version* readVersionOut) { // read not present is "use transaction version" state RangeResult blobGranuleMapping; state Key granuleStartKey; state Key granuleEndKey; state KeyRange keyRange = range; state UID workerId; state int i; state Version rv; state Standalone> results; state double startTime = now(); if (read.present()) { rv = read.get(); } else 
{ Version _end = wait(self->getReadVersion()); rv = _end; } self->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); // Right now just read whole blob range assignments from DB // FIXME: eventually we probably want to cache this and invalidate similarly to storage servers. // Cache misses could still read from the DB, or we could add it to the Transaction State Store and // have proxies serve it from memory. RangeResult _bgMapping = wait(krmGetRanges(self, blobGranuleMappingKeys.begin, keyRange, 1000, GetRangeLimits::BYTE_LIMIT_UNLIMITED)); blobGranuleMapping = _bgMapping; if (blobGranuleMapping.more) { if (BG_REQUEST_DEBUG) { fmt::print( "BG Mapping for [{0} - {1}) too large!\n", keyRange.begin.printable(), keyRange.end.printable()); } TraceEvent(SevWarn, "BGMappingTooLarge").detail("Range", range).detail("Max", 1000); throw unsupported_operation(); } ASSERT(!blobGranuleMapping.more && blobGranuleMapping.size() < CLIENT_KNOBS->TOO_MANY); if (blobGranuleMapping.size() == 0) { if (BG_REQUEST_DEBUG) { printf("no blob worker assignments yet\n"); } throw blob_granule_transaction_too_old(); } if (BG_REQUEST_DEBUG) { fmt::print("Doing blob granule request @ {}\n", rv); fmt::print("blob worker assignments:\n"); } for (i = 0; i < blobGranuleMapping.size() - 1; i++) { granuleStartKey = blobGranuleMapping[i].key; granuleEndKey = blobGranuleMapping[i + 1].key; if (!blobGranuleMapping[i].value.size()) { if (BG_REQUEST_DEBUG) { fmt::print("Key range [{0} - {1}) missing worker assignment!\n", granuleStartKey.printable(), granuleEndKey.printable()); // TODO probably new exception type instead } throw blob_granule_transaction_too_old(); } workerId = decodeBlobGranuleMappingValue(blobGranuleMapping[i].value); if (workerId == UID()) { if (BG_REQUEST_DEBUG) { fmt::print("Key range [{0} - {1}) has no assigned worker yet!\n", granuleStartKey.printable(), granuleEndKey.printable()); } throw blob_granule_transaction_too_old(); } if (BG_REQUEST_DEBUG) { fmt::print( " [{0} - {1}): {2}\n", granuleStartKey.printable(), granuleEndKey.printable(), workerId.toString()); } if (!self->trState->cx->blobWorker_interf.count(workerId)) { Optional workerInterface = wait(self->get(blobWorkerListKeyFor(workerId))); // from the time the mapping was read from the db, the associated blob worker // could have died and so its interface wouldn't be present as part of the blobWorkerList // we persist in the db. So throw an error here to force a re-read of the mapping if (!workerInterface.present()) { // need to re-read mapping, throw transaction_too_old so client retries. TODO better error? // throw wrong_shard_server(); throw transaction_too_old(); } // FIXME: maybe just want to insert here if there are racing queries for the same worker or something?
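// Cache the decoded interface so later granule requests to the same worker skip the
// blobWorkerList read; if the worker has died since the mapping was read, the request
// below fails and the error handling further down sends the client back through a fresh
// mapping read.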
self->trState->cx->blobWorker_interf[workerId] = decodeBlobWorkerListValue(workerInterface.get()); if (BG_REQUEST_DEBUG) { fmt::print(" decoded worker interface for {0}\n", workerId.toString()); } } } // Make request for each granule for (i = 0; i < blobGranuleMapping.size() - 1; i++) { granuleStartKey = blobGranuleMapping[i].key; granuleEndKey = blobGranuleMapping[i + 1].key; // if this was a time travel and the request returned larger bounds, skip this chunk if (granuleEndKey <= keyRange.begin) { continue; } workerId = decodeBlobGranuleMappingValue(blobGranuleMapping[i].value); // prune first/last granules to requested range if (keyRange.begin > granuleStartKey) { granuleStartKey = keyRange.begin; } if (keyRange.end < granuleEndKey) { granuleEndKey = keyRange.end; } state BlobGranuleFileRequest req; req.keyRange = KeyRangeRef(StringRef(req.arena, granuleStartKey), StringRef(req.arena, granuleEndKey)); req.beginVersion = begin; req.readVersion = rv; req.canCollapseBegin = true; // TODO make this a parameter once we support it std::vector>> v; v.push_back( makeReference>(self->trState->cx->blobWorker_interf[workerId])); state Reference>> location = makeReference(v); // use load balance with one option for now for retry and error handling try { choose { when(BlobGranuleFileReply rep = wait(loadBalance(location, &BlobWorkerInterface::blobGranuleFileRequest, req, TaskPriority::DefaultPromiseEndpoint, AtMostOnce::False, nullptr))) { if (BG_REQUEST_DEBUG) { fmt::print("Blob granule request for [{0} - {1}) @ {2} - {3} got reply from {4}:\n", granuleStartKey.printable(), granuleEndKey.printable(), begin, rv, workerId.toString()); } results.arena().dependsOn(rep.arena); for (auto& chunk : rep.chunks) { if (BG_REQUEST_DEBUG) { fmt::print( "[{0} - {1})\n", chunk.keyRange.begin.printable(), chunk.keyRange.end.printable()); fmt::print(" SnapshotFile: {0}\n \n DeltaFiles:\n", chunk.snapshotFile.present() ? chunk.snapshotFile.get().toString().c_str() : ""); for (auto& df : chunk.deltaFiles) { fmt::print(" {0}\n", df.toString()); } fmt::print(" Deltas: ({0})", chunk.newDeltas.size()); if (chunk.newDeltas.size() > 0) { fmt::print(" with version [{0} - {1}]", chunk.newDeltas[0].version, chunk.newDeltas[chunk.newDeltas.size() - 1].version); } fmt::print(" IncludedVersion: {0}\n\n\n", chunk.includedVersion); } results.push_back(results.arena(), chunk); keyRange = KeyRangeRef(std::min(chunk.keyRange.end, keyRange.end), keyRange.end); } } // if we detect that this blob worker fails, cancel the request, as otherwise load balance will // retry indefinitely with one option when(wait(IFailureMonitor::failureMonitor().onStateEqual( location->get(0, &BlobWorkerInterface::blobGranuleFileRequest).getEndpoint(), FailureStatus(true)))) { if (BG_REQUEST_DEBUG) { fmt::print("readBlobGranules got BW {0} failed\n", workerId.toString()); } throw connection_failed(); } } } catch (Error& e) { if (BG_REQUEST_DEBUG) { fmt::print("BGReq got error {}\n", e.name()); } // worker is up but didn't actually have granule, or connection failed if (e.code() == error_code_wrong_shard_server || e.code() == error_code_connection_failed) { // need to re-read mapping, throw transaction_too_old so client retries. TODO better error? 
throw transaction_too_old(); } throw e; } } self->trState->cx->anyBlobGranuleRequests = true; self->trState->cx->bgGranulesPerRequest.addSample(results.size()); self->trState->cx->bgLatencies.addSample(now() - startTime); if (readVersionOut != nullptr) { *readVersionOut = rv; } return results; } Future>> Transaction::readBlobGranules(const KeyRange& range, Version begin, Optional readVersion, Version* readVersionOut) { return readBlobGranulesActor(this, range, begin, readVersion, readVersionOut); } ACTOR Future setPerpetualStorageWiggle(Database cx, bool enable, LockAware lockAware) { state ReadYourWritesTransaction tr(cx); loop { try { tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); if (lockAware) { tr.setOption(FDBTransactionOptions::LOCK_AWARE); } tr.set(perpetualStorageWiggleKey, enable ? "1"_sr : "0"_sr); wait(tr.commit()); break; } catch (Error& e) { wait(tr.onError(e)); } } return Void(); } ACTOR Future>> readStorageWiggleValues(Database cx, bool primary, bool use_system_priority) { state const Key readKey = perpetualStorageWiggleIDPrefix.withSuffix(primary ? "primary/"_sr : "remote/"_sr); state KeyBackedObjectMap metadataMap(readKey, IncludeVersion()); state Reference tr(new ReadYourWritesTransaction(cx)); state std::vector> res; // read the wiggling pairs loop { try { tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); tr->setOption(FDBTransactionOptions::READ_LOCK_AWARE); if (use_system_priority) { tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); } wait(store(res, metadataMap.getRange(tr, UID(0, 0), Optional(), CLIENT_KNOBS->TOO_MANY))); wait(tr->commit()); break; } catch (Error& e) { wait(tr->onError(e)); } } return res; } ACTOR Future splitStorageMetricsStream(PromiseStream resultStream, Database cx, KeyRange keys, StorageMetrics limit, StorageMetrics estimated) { state Span span("NAPI:SplitStorageMetricsStream"_loc); state Key beginKey = keys.begin; state Key globalLastKey = beginKey; resultStream.send(beginKey); // track used across loops state StorageMetrics globalUsed; loop { state std::vector locations = wait(getKeyRangeLocations(cx, Optional(), KeyRangeRef(beginKey, keys.end), CLIENT_KNOBS->STORAGE_METRICS_SHARD_LIMIT, Reverse::False, &StorageServerInterface::splitMetrics, span.context, Optional(), UseProvisionalProxies::False, latestVersion)); try { //TraceEvent("SplitStorageMetrics").detail("Locations", locations.size()); state StorageMetrics localUsed = globalUsed; state Key localLastKey = globalLastKey; state Standalone> results; state int i = 0; for (; i < locations.size(); i++) { SplitMetricsRequest req(locations[i].range, limit, localUsed, estimated, i == locations.size() - 1 && keys.end <= locations.back().range.end); SplitMetricsReply res = wait(loadBalance(locations[i].locations->locations(), &StorageServerInterface::splitMetrics, req, TaskPriority::DataDistribution)); if (res.splits.size() && res.splits[0] <= localLastKey) { // split points are out of order, possibly because // of moving data, throw error to retry ASSERT_WE_THINK(false); // FIXME: This seems impossible and doesn't seem to be covered by testing throw all_alternatives_failed(); } if (res.splits.size()) { results.append(results.arena(), res.splits.begin(), res.splits.size()); results.arena().dependsOn(res.splits.arena()); localLastKey = res.splits.back(); } localUsed = res.used; //TraceEvent("SplitStorageMetricsResult").detail("Used", used.bytes).detail("Location", i).detail("Size", res.splits.size()); } globalUsed = localUsed; // only truncate split at end if 
if (keys.end <= locations.back().range.end && globalUsed.allLessOrEqual(limit * CLIENT_KNOBS->STORAGE_METRICS_UNFAIR_SPLIT_LIMIT) && results.size() > 1) { results.resize(results.arena(), results.size() - 1); localLastKey = results.back(); } globalLastKey = localLastKey; for (auto& splitKey : results) { resultStream.send(splitKey); } if (keys.end <= locations.back().range.end) { resultStream.send(keys.end); resultStream.sendError(end_of_stream()); break; } else { beginKey = locations.back().range.end; } } catch (Error& e) { if (e.code() == error_code_operation_cancelled) { throw e; } if (e.code() != error_code_wrong_shard_server && e.code() != error_code_all_alternatives_failed) { TraceEvent(SevError, "SplitStorageMetricsStreamError").error(e); resultStream.sendError(e); throw; } cx->invalidateCache(Key(), keys); wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, TaskPriority::DataDistribution)); } } return Void(); } Future DatabaseContext::splitStorageMetricsStream(const PromiseStream& resultStream, KeyRange const& keys, StorageMetrics const& limit, StorageMetrics const& estimated) { return ::splitStorageMetricsStream( resultStream, Database(Reference::addRef(this)), keys, limit, estimated); } ACTOR Future>> splitStorageMetrics(Database cx, KeyRange keys, StorageMetrics limit, StorageMetrics estimated) { state Span span("NAPI:SplitStorageMetrics"_loc); loop { state std::vector locations = wait(getKeyRangeLocations(cx, Optional(), keys, CLIENT_KNOBS->STORAGE_METRICS_SHARD_LIMIT, Reverse::False, &StorageServerInterface::splitMetrics, span.context, Optional(), UseProvisionalProxies::False, latestVersion)); state StorageMetrics used; state Standalone> results; // SOMEDAY: Right now, if there are too many shards we delay and check again later. There may be a better // solution to this.
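// Non-streaming variant of the split above: walk the shards once collecting split points,
// and if the usage accumulated past the last split point is still within
// limit * STORAGE_METRICS_UNFAIR_SPLIT_LIMIT, drop that last split point so the tail
// merges with the preceding chunk instead of forming one undersized final range.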
if (locations.size() == CLIENT_KNOBS->STORAGE_METRICS_SHARD_LIMIT) { wait(delay(CLIENT_KNOBS->STORAGE_METRICS_TOO_MANY_SHARDS_DELAY, TaskPriority::DataDistribution)); cx->invalidateCache(Key(), keys); } else { results.push_back_deep(results.arena(), keys.begin); try { //TraceEvent("SplitStorageMetrics").detail("Locations", locations.size()); state int i = 0; for (; i < locations.size(); i++) { SplitMetricsRequest req(locations[i].range, limit, used, estimated, i == locations.size() - 1); SplitMetricsReply res = wait(loadBalance(locations[i].locations->locations(), &StorageServerInterface::splitMetrics, req, TaskPriority::DataDistribution)); if (res.splits.size() && res.splits[0] <= results.back()) { // split points are out of order, possibly because of // moving data, throw error to retry ASSERT_WE_THINK( false); // FIXME: This seems impossible and doesn't seem to be covered by testing throw all_alternatives_failed(); } if (res.splits.size()) { results.append(results.arena(), res.splits.begin(), res.splits.size()); results.arena().dependsOn(res.splits.arena()); } used = res.used; //TraceEvent("SplitStorageMetricsResult").detail("Used", used.bytes).detail("Location", i).detail("Size", res.splits.size()); } if (used.allLessOrEqual(limit * CLIENT_KNOBS->STORAGE_METRICS_UNFAIR_SPLIT_LIMIT) && results.size() > 1) { results.resize(results.arena(), results.size() - 1); } if (keys.end <= locations.back().range.end) { results.push_back_deep(results.arena(), keys.end); } return results; } catch (Error& e) { if (e.code() != error_code_wrong_shard_server && e.code() != error_code_all_alternatives_failed) { TraceEvent(SevError, "SplitStorageMetricsError").error(e); throw; } cx->invalidateCache(Key(), keys); wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, TaskPriority::DataDistribution)); } } } } Future>> DatabaseContext::splitStorageMetrics(KeyRange const& keys, StorageMetrics const& limit, StorageMetrics const& estimated) { return ::splitStorageMetrics(Database(Reference::addRef(this)), keys, limit, estimated); } void Transaction::checkDeferredError() const { trState->cx->checkDeferredError(); } Reference Transaction::createTrLogInfoProbabilistically(const Database& cx) { if (!cx->isError()) { double clientSamplingProbability = GlobalConfig::globalConfig().get( fdbClientInfoTxnSampleRate, CLIENT_KNOBS->CSI_SAMPLING_PROBABILITY); if (((networkOptions.logClientInfo.present() && networkOptions.logClientInfo.get()) || BUGGIFY) && deterministicRandom()->random01() < clientSamplingProbability && (!g_network->isSimulated() || !g_simulator.speedUpSimulation)) { return makeReference(TransactionLogInfo::DATABASE); } } return Reference(); } void Transaction::setTransactionID(uint64_t id) { ASSERT(getSize() == 0); trState->spanID = SpanID(id, trState->spanID.second()); } void Transaction::setToken(uint64_t token) { ASSERT(getSize() == 0); trState->spanID = SpanID(trState->spanID.first(), token); } void enableClientInfoLogging() { ASSERT(networkOptions.logClientInfo.present() == false); networkOptions.logClientInfo = true; TraceEvent(SevInfo, "ClientInfoLoggingEnabled").log(); } ACTOR Future snapCreate(Database cx, Standalone snapCmd, UID snapUID) { TraceEvent("SnapCreateEnter").detail("SnapCmd", snapCmd).detail("UID", snapUID); try { loop { choose { when(wait(cx->onProxiesChanged())) {} when(wait(basicLoadBalance(cx->getCommitProxies(UseProvisionalProxies::False), &CommitProxyInterface::proxySnapReq, ProxySnapRequest(snapCmd, snapUID, snapUID), cx->taskID, AtMostOnce::True))) { 
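// Reply received: a commit proxy accepted the snapshot request, so trace the exit and
// return. (If the commit-proxy set changes first, the onProxiesChanged branch above
// restarts the loop and resends the request.)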
TraceEvent("SnapCreateExit").detail("SnapCmd", snapCmd).detail("UID", snapUID); return Void(); } } } } catch (Error& e) { TraceEvent("SnapCreateError").error(e).detail("SnapCmd", snapCmd.toString()).detail("UID", snapUID); throw; } } ACTOR template static Future createCheckpointImpl(T tr, KeyRangeRef range, CheckpointFormat format) { ASSERT(!tr->getTenant().present()); TraceEvent("CreateCheckpointTransactionBegin").detail("Range", range); state RangeResult keyServers = wait(krmGetRanges(tr, keyServersPrefix, range)); ASSERT(!keyServers.more); state RangeResult UIDtoTagMap = wait(tr->getRange(serverTagKeys, CLIENT_KNOBS->TOO_MANY)); ASSERT(!UIDtoTagMap.more && UIDtoTagMap.size() < CLIENT_KNOBS->TOO_MANY); for (int i = 0; i < keyServers.size() - 1; ++i) { KeyRangeRef shard(keyServers[i].key, keyServers[i + 1].key); std::vector src; std::vector dest; decodeKeyServersValue(UIDtoTagMap, keyServers[i].value, src, dest); // The checkpoint request is sent to all replicas, in case any of them is unhealthy. // An alternative is to choose a healthy replica. const UID checkpointID = deterministicRandom()->randomUniqueID(); for (int idx = 0; idx < src.size(); ++idx) { CheckpointMetaData checkpoint(shard & range, format, src[idx], checkpointID); checkpoint.setState(CheckpointMetaData::Pending); tr->set(checkpointKeyFor(checkpointID), checkpointValue(checkpoint)); } TraceEvent("CreateCheckpointTransactionShard") .detail("Shard", shard) .detail("SrcServers", describe(src)) .detail("ServerSelected", describe(src)) .detail("CheckpointKey", checkpointKeyFor(checkpointID)) .detail("ReadVersion", tr->getReadVersion().get()); } return Void(); } Future createCheckpoint(Reference tr, KeyRangeRef range, CheckpointFormat format) { return holdWhile(tr, createCheckpointImpl(tr, range, format)); } Future createCheckpoint(Transaction* tr, KeyRangeRef range, CheckpointFormat format) { return createCheckpointImpl(tr, range, format); } // Gets CheckpointMetaData of the specific keyrange, version and format from one of the storage servers, if none of the // servers have the checkpoint, a checkpoint_not_found error is returned. ACTOR static Future getCheckpointMetaDataInternal(GetCheckpointRequest req, Reference alternatives, double timeout) { TraceEvent("GetCheckpointMetaDataInternalBegin") .detail("Range", req.range) .detail("Version", req.version) .detail("Format", static_cast(req.format)) .detail("Locations", alternatives->description()); state std::vector>> futures; state int index = 0; for (index = 0; index < alternatives->size(); ++index) { // For each shard, all storage servers are checked, only one is required. 
futures.push_back(errorOr(timeoutError(alternatives->getInterface(index).checkpoint.getReply(req), timeout))); } state Optional error; wait(waitForAll(futures)); TraceEvent("GetCheckpointMetaDataInternalWaitEnd").detail("Range", req.range).detail("Version", req.version); for (index = 0; index < futures.size(); ++index) { if (!futures[index].isReady()) { error = timed_out(); TraceEvent("GetCheckpointMetaDataInternalSSTimeout") .detail("Range", req.range) .detail("Version", req.version) .detail("StorageServer", alternatives->getInterface(index).uniqueID); continue; } if (futures[index].get().isError()) { const Error& e = futures[index].get().getError(); TraceEvent("GetCheckpointMetaDataInternalError") .errorUnsuppressed(e) .detail("Range", req.range) .detail("Version", req.version) .detail("StorageServer", alternatives->getInterface(index).uniqueID); if (e.code() != error_code_checkpoint_not_found || !error.present()) { error = e; } } else { return futures[index].get().get(); } } ASSERT(error.present()); throw error.get(); } ACTOR Future> getCheckpointMetaData(Database cx, KeyRange keys, Version version, CheckpointFormat format, double timeout) { state Span span("NAPI:GetCheckpoint"_loc); state int index = 0; state std::vector> futures; loop { TraceEvent("GetCheckpointBegin") .detail("Range", keys.toString()) .detail("Version", version) .detail("Format", static_cast(format)); try { state std::vector locations = wait(getKeyRangeLocations(cx, Optional(), keys, CLIENT_KNOBS->TOO_MANY, Reverse::False, &StorageServerInterface::checkpoint, span.context, Optional(), UseProvisionalProxies::False, latestVersion)); futures.clear(); for (index = 0; index < locations.size(); ++index) { futures.push_back(getCheckpointMetaDataInternal( GetCheckpointRequest(version, keys, format), locations[index].locations, timeout)); TraceEvent("GetCheckpointShardBegin") .detail("Range", locations[index].range) .detail("Version", version) .detail("StorageServers", locations[index].locations->description()); } choose { when(wait(cx->connectionFileChanged())) { cx->invalidateCache(KeyRef(), keys); } when(wait(waitForAll(futures))) { break; } when(wait(delay(timeout))) { TraceEvent("GetCheckpointTimeout").detail("Range", keys).detail("Version", version); } } } catch (Error& e) { TraceEvent("GetCheckpointError").errorUnsuppressed(e).detail("Range", keys.toString()); if (e.code() == error_code_wrong_shard_server || e.code() == error_code_all_alternatives_failed || e.code() == error_code_connection_failed || e.code() == error_code_broken_promise) { cx->invalidateCache(KeyRef(), keys); wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY)); } else { throw; } } } std::vector res; for (index = 0; index < futures.size(); ++index) { TraceEvent("GetCheckpointShardEnd").detail("Checkpoint", futures[index].get().toString()); res.push_back(futures[index].get()); } return res; } ACTOR Future checkSafeExclusions(Database cx, std::vector exclusions) { TraceEvent("ExclusionSafetyCheckBegin") .detail("NumExclusion", exclusions.size()) .detail("Exclusions", describe(exclusions)); state ExclusionSafetyCheckRequest req(exclusions); state bool ddCheck; try { loop { choose { when(wait(cx->onProxiesChanged())) {} when(ExclusionSafetyCheckReply _ddCheck = wait(basicLoadBalance(cx->getCommitProxies(UseProvisionalProxies::False), &CommitProxyInterface::exclusionSafetyCheckReq, req, cx->taskID))) { ddCheck = _ddCheck.safe; break; } } } } catch (Error& e) { if (e.code() != error_code_actor_cancelled) { TraceEvent("ExclusionSafetyCheckError") .error(e) 
.detail("NumExclusion", exclusions.size()) .detail("Exclusions", describe(exclusions)); } throw; } TraceEvent("ExclusionSafetyCheckCoordinators").log(); wait(cx->getConnectionRecord()->resolveHostnames()); state ClientCoordinators coordinatorList(cx->getConnectionRecord()); state std::vector>> leaderServers; leaderServers.reserve(coordinatorList.clientLeaderServers.size()); for (int i = 0; i < coordinatorList.clientLeaderServers.size(); i++) { leaderServers.push_back(retryBrokenPromise(coordinatorList.clientLeaderServers[i].getLeader, GetLeaderRequest(coordinatorList.clusterKey, UID()), TaskPriority::CoordinationReply)); } // Wait for quorum so we don't dismiss live coordinators as unreachable by acting too fast choose { when(wait(smartQuorum(leaderServers, leaderServers.size() / 2 + 1, 1.0))) {} when(wait(delay(3.0))) { TraceEvent("ExclusionSafetyCheckNoCoordinatorQuorum").log(); return false; } } int attemptCoordinatorExclude = 0; int coordinatorsUnavailable = 0; for (int i = 0; i < leaderServers.size(); i++) { NetworkAddress leaderAddress = coordinatorList.clientLeaderServers[i].getLeader.getEndpoint().getPrimaryAddress(); if (leaderServers[i].isReady()) { if ((std::count( exclusions.begin(), exclusions.end(), AddressExclusion(leaderAddress.ip, leaderAddress.port)) || std::count(exclusions.begin(), exclusions.end(), AddressExclusion(leaderAddress.ip)))) { attemptCoordinatorExclude++; } } else { coordinatorsUnavailable++; } } int faultTolerance = (leaderServers.size() - 1) / 2 - coordinatorsUnavailable; bool coordinatorCheck = (attemptCoordinatorExclude <= faultTolerance); TraceEvent("ExclusionSafetyCheckFinish") .detail("CoordinatorListSize", leaderServers.size()) .detail("NumExclusions", exclusions.size()) .detail("FaultTolerance", faultTolerance) .detail("AttemptCoordinatorExclude", attemptCoordinatorExclude) .detail("CoordinatorCheck", coordinatorCheck) .detail("DataDistributorCheck", ddCheck); return (ddCheck && coordinatorCheck); } ACTOR Future addInterfaceActor(std::map>* address_interface, Reference connectLock, KeyValue kv) { wait(connectLock->take()); state FlowLock::Releaser releaser(*connectLock); state ClientWorkerInterface workerInterf = BinaryReader::fromStringRef(kv.value, IncludeVersion()); state ClientLeaderRegInterface leaderInterf(workerInterf.address()); choose { when(Optional rep = wait(brokenPromiseToNever(leaderInterf.getLeader.getReply(GetLeaderRequest())))) { StringRef ip_port = kv.key.endsWith(LiteralStringRef(":tls")) ? kv.key.removeSuffix(LiteralStringRef(":tls")) : kv.key; (*address_interface)[ip_port] = std::make_pair(kv.value, leaderInterf); if (workerInterf.reboot.getEndpoint().addresses.secondaryAddress.present()) { Key full_ip_port2 = StringRef(workerInterf.reboot.getEndpoint().addresses.secondaryAddress.get().toString()); StringRef ip_port2 = full_ip_port2.endsWith(LiteralStringRef(":tls")) ? 
full_ip_port2.removeSuffix(LiteralStringRef(":tls")) : full_ip_port2; (*address_interface)[ip_port2] = std::make_pair(kv.value, leaderInterf); } } when(wait(delay(CLIENT_KNOBS->CLI_CONNECT_TIMEOUT))) {} // NOTE : change timeout time here if necessary } return Void(); } ACTOR static Future rebootWorkerActor(DatabaseContext* cx, ValueRef addr, bool check, int duration) { // ignore negative value if (duration < 0) duration = 0; // fetch the addresses of all workers state std::map> address_interface; if (!cx->getConnectionRecord()) return 0; RangeResult kvs = wait(getWorkerInterfaces(cx->getConnectionRecord())); ASSERT(!kvs.more); // Note: reuse this knob from fdbcli, change it if necessary Reference connectLock(new FlowLock(CLIENT_KNOBS->CLI_CONNECT_PARALLELISM)); std::vector> addInterfs; for (const auto& it : kvs) { addInterfs.push_back(addInterfaceActor(&address_interface, connectLock, it)); } wait(waitForAll(addInterfs)); if (!address_interface.count(addr)) return 0; BinaryReader::fromStringRef(address_interface[addr].first, IncludeVersion()) .reboot.send(RebootRequest(false, check, duration)); return 1; } Future DatabaseContext::rebootWorker(StringRef addr, bool check, int duration) { return rebootWorkerActor(this, addr, check, duration); } Future DatabaseContext::forceRecoveryWithDataLoss(StringRef dcId) { return forceRecovery(getConnectionRecord(), dcId); } ACTOR static Future createSnapshotActor(DatabaseContext* cx, UID snapUID, StringRef snapCmd) { wait(mgmtSnapCreate(cx->clone(), snapCmd, snapUID)); return Void(); } Future DatabaseContext::createSnapshot(StringRef uid, StringRef snapshot_command) { std::string uid_str = uid.toString(); if (!std::all_of(uid_str.begin(), uid_str.end(), [](unsigned char c) { return std::isxdigit(c); }) || uid_str.size() != 32) { // only 32-length hex string is considered as a valid UID throw snap_invalid_uid_string(); } return createSnapshotActor(this, UID::fromString(uid_str), snapshot_command); } void sharedStateDelRef(DatabaseSharedState* ssPtr) { if (--ssPtr->refCount == 0) { delete ssPtr; } } Future DatabaseContext::initSharedState() { ASSERT(!sharedStatePtr); // Don't re-initialize shared state if a pointer already exists DatabaseSharedState* newState = new DatabaseSharedState(); // Increment refcount by 1 on creation to account for the one held in MultiVersionApi map // Therefore, on initialization, refCount should be 2 (after also going to setSharedState) newState->refCount++; newState->delRef = &sharedStateDelRef; setSharedState(newState); return newState; } void DatabaseContext::setSharedState(DatabaseSharedState* p) { ASSERT(p->protocolVersion == currentProtocolVersion); sharedStatePtr = p; sharedStatePtr->refCount++; } ACTOR Future storageFeedVersionUpdater(StorageServerInterface interf, ChangeFeedStorageData* self) { state Promise destroyed = self->destroyed; loop { if (destroyed.isSet()) { return Void(); } if (self->version.get() < self->desired.get()) { wait(delay(CLIENT_KNOBS->CHANGE_FEED_EMPTY_BATCH_TIME) || self->version.whenAtLeast(self->desired.get())); if (destroyed.isSet()) { return Void(); } if (self->version.get() < self->desired.get()) { try { ChangeFeedVersionUpdateReply rep = wait(brokenPromiseToNever( interf.changeFeedVersionUpdate.getReply(ChangeFeedVersionUpdateRequest(self->desired.get())))); if (rep.version > self->version.get()) { self->version.set(rep.version); } } catch (Error& e) { if (e.code() == error_code_server_overloaded) { if (FLOW_KNOBS->PREVENT_FAST_SPIN_DELAY > CLIENT_KNOBS->CHANGE_FEED_EMPTY_BATCH_TIME) { 
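// The feed loop already waited up to CHANGE_FEED_EMPTY_BATCH_TIME before issuing the
// request, so only the remainder is slept here; the total back-off after
// server_overloaded then comes out to roughly PREVENT_FAST_SPIN_DELAY.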
wait(delay(FLOW_KNOBS->PREVENT_FAST_SPIN_DELAY - CLIENT_KNOBS->CHANGE_FEED_EMPTY_BATCH_TIME)); } } else { throw e; } } } } else { wait(self->desired.whenAtLeast(self->version.get() + 1)); } } } Reference DatabaseContext::getStorageData(StorageServerInterface interf) { // use token from interface since that changes on SS restart UID token = interf.waitFailure.getEndpoint().token; auto it = changeFeedUpdaters.find(token); if (it == changeFeedUpdaters.end()) { Reference newStorageUpdater = makeReference(); newStorageUpdater->id = interf.id(); newStorageUpdater->interfToken = token; newStorageUpdater->updater = storageFeedVersionUpdater(interf, newStorageUpdater.getPtr()); changeFeedUpdaters[token] = newStorageUpdater; return newStorageUpdater; } return it->second; } Version ChangeFeedData::getVersion() { return lastReturnedVersion.get(); } // This function is essentially bubbling the information about what has been processed from the server through the // change feed client. First it makes sure the server has returned all mutations up through the target version, then // that the native API has consumed and processed them, and finally that the fdb client has consumed all of the mutations. ACTOR Future changeFeedWaitLatest(Reference self, Version version) { // wait on SS to have sent up through version int desired = 0; int waiting = 0; std::vector> allAtLeast; for (auto& it : self->storageData) { if (it->version.get() < version) { waiting++; if (version > it->desired.get()) { it->desired.set(version); desired++; } allAtLeast.push_back(it->version.whenAtLeast(version)); } } wait(waitForAll(allAtLeast)); // then, wait on ss streams to have processed up through version std::vector> onEmpty; for (auto& it : self->streams) { if (!it.isEmpty()) { onEmpty.push_back(it.onEmpty()); } } if (onEmpty.size()) { wait(waitForAll(onEmpty)); } if (self->mutations.isEmpty()) { wait(delay(0)); } // wait for merge cursor to fully process everything it read from its individual promise streams, either until it is // done processing or we have up through the desired version while (self->lastReturnedVersion.get() < self->maxSeenVersion && self->lastReturnedVersion.get() < version) { Version target = std::min(self->maxSeenVersion, version); wait(self->lastReturnedVersion.whenAtLeast(target)); } // then, wait for client to have consumed up through version if (self->maxSeenVersion >= version) { // merge cursor may have something buffered but has not yet sent it to self->mutations, just wait for // lastReturnedVersion wait(self->lastReturnedVersion.whenAtLeast(version)); } else { // all mutations <= version are in self->mutations, wait for empty while (!self->mutations.isEmpty()) { wait(self->mutations.onEmpty()); wait(delay(0)); } } return Void(); } ACTOR Future changeFeedWhenAtLatest(Reference self, Version version) { if (version >= self->endVersion) { return Never(); } if (version <= self->getVersion()) { return Void(); } state Future lastReturned = self->lastReturnedVersion.whenAtLeast(version); loop { // only allowed to use empty versions if you're caught up Future waitEmptyVersion = (self->notAtLatest.get() == 0) ?
changeFeedWaitLatest(self, version) : Never(); choose { when(wait(waitEmptyVersion)) { break; } when(wait(lastReturned)) { break; } when(wait(self->refresh.getFuture())) {} when(wait(self->notAtLatest.onChange())) {} } } if (self->lastReturnedVersion.get() < version) { self->lastReturnedVersion.set(version); } ASSERT(self->getVersion() >= version); return Void(); } Future ChangeFeedData::whenAtLeast(Version version) { return changeFeedWhenAtLatest(Reference::addRef(this), version); } #define DEBUG_CF_CLIENT_TRACE false ACTOR Future partialChangeFeedStream(StorageServerInterface interf, PromiseStream> results, ReplyPromiseStream replyStream, Version begin, Version end, Reference feedData, Reference storageData, UID debugUID) { // calling lastReturnedVersion's callbacks could cause us to be cancelled state Promise refresh = feedData->refresh; state bool atLatestVersion = false; state Version nextVersion = begin; // We don't need to force every other partial stream to do an empty if we get an empty, but if we get actual // mutations back after sending an empty, we may need the other partial streams to get an empty, to advance the // merge cursor, so we can send the mutations we just got. // if lastEmpty != invalidVersion, we need to update the desired versions of the other streams BEFORE waiting // onReady once getting a reply state Version lastEmpty = invalidVersion; try { loop { if (nextVersion >= end) { results.sendError(end_of_stream()); return Void(); } choose { when(state ChangeFeedStreamReply rep = waitNext(replyStream.getFuture())) { // handle first empty mutation on stream establishment explicitly if (nextVersion == begin && rep.mutations.size() == 1 && rep.mutations[0].mutations.size() == 0 && rep.mutations[0].version == begin - 1) { continue; } if (DEBUG_CF_CLIENT_TRACE) { TraceEvent(SevDebug, "TraceChangeFeedClientMergeCursorReply", debugUID) .detail("SSID", storageData->id) .detail("AtLatest", atLatestVersion) .detail("FirstVersion", rep.mutations.front().version) .detail("LastVersion", rep.mutations.back().version) .detail("Count", rep.mutations.size()) .detail("MinStreamVersion", rep.minStreamVersion) .detail("PopVersion", rep.popVersion) .detail("RepAtLatest", rep.atLatestVersion); } if (rep.mutations.back().version > feedData->maxSeenVersion) { feedData->maxSeenVersion = rep.mutations.back().version; } if (rep.popVersion > feedData->popVersion) { feedData->popVersion = rep.popVersion; } if (lastEmpty != invalidVersion && !results.isEmpty()) { for (auto& it : feedData->storageData) { if (refresh.canBeSet() && lastEmpty > it->desired.get()) { it->desired.set(lastEmpty); } } lastEmpty = invalidVersion; } state int resultLoc = 0; while (resultLoc < rep.mutations.size()) { wait(results.onEmpty()); if (rep.mutations[resultLoc].version >= nextVersion) { results.send(rep.mutations[resultLoc]); if (DEBUG_CF_CLIENT_TRACE) { TraceEvent(SevDebug, "TraceChangeFeedClientMergeCursorSend", debugUID) .detail("Version", rep.mutations[resultLoc].version) .detail("Size", rep.mutations[resultLoc].mutations.size()); } // check refresh.canBeSet so that, if we are killed after calling one of these callbacks, we // just skip to the next wait and get actor_cancelled // FIXME: this is somewhat expensive to do every mutation. 
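// Raising each storage server's 'desired' version prompts storageFeedVersionUpdater() to
// poll that server forward, which in turn unblocks version.whenAtLeast() waiters in
// changeFeedWaitLatest().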
for (auto& it : feedData->storageData) { if (refresh.canBeSet() && rep.mutations[resultLoc].version > it->desired.get()) { it->desired.set(rep.mutations[resultLoc].version); } } } else { ASSERT(rep.mutations[resultLoc].mutations.empty()); } resultLoc++; } // if we got the empty version that went backwards, don't decrease nextVersion if (rep.mutations.back().version + 1 > nextVersion) { nextVersion = rep.mutations.back().version + 1; } if (refresh.canBeSet() && !atLatestVersion && rep.atLatestVersion) { atLatestVersion = true; feedData->notAtLatest.set(feedData->notAtLatest.get() - 1); } if (refresh.canBeSet() && rep.minStreamVersion > storageData->version.get()) { storageData->version.set(rep.minStreamVersion); } if (DEBUG_CF_CLIENT_TRACE) { TraceEvent(SevDebug, "TraceChangeFeedClientMergeCursorReplyDone", debugUID) .detail("AtLatestNow", atLatestVersion); } } when(wait(atLatestVersion && replyStream.isEmpty() && results.isEmpty() ? storageData->version.whenAtLeast(nextVersion) : Future(Never()))) { MutationsAndVersionRef empty; empty.version = storageData->version.get(); results.send(empty); nextVersion = storageData->version.get() + 1; if (DEBUG_CF_CLIENT_TRACE) { TraceEvent(SevDebug, "TraceChangeFeedClientMergeCursorSendEmpty", debugUID) .detail("Version", empty.version); } lastEmpty = empty.version; } when(wait(atLatestVersion && replyStream.isEmpty() && !results.isEmpty() ? results.onEmpty() : Future(Never()))) {} } } } catch (Error& e) { if (DEBUG_CF_CLIENT_TRACE) { TraceEvent(SevDebug, "TraceChangeFeedClientMergeCursorError", debugUID).errorUnsuppressed(e); } if (e.code() == error_code_actor_cancelled) { throw; } results.sendError(e); return Void(); } } ACTOR Future mergeChangeFeedStreamInternal(Reference results, std::vector> interfs, std::vector streams, Version* begin, Version end, UID mergeCursorUID) { state Promise refresh = results->refresh; // with empty version handling in the partial cursor, all streams will always have a next element with version >= // the minimum version of any stream's next element state std::priority_queue> mutations; if (DEBUG_CF_CLIENT_TRACE) { TraceEvent(SevDebug, "TraceChangeFeedClientMergeCursorStart", mergeCursorUID) .detail("StreamCount", interfs.size()) .detail("Begin", *begin) .detail("End", end); } // previous version of change feed may have put a mutation in the promise stream and then immediately died. 
// Wait for // that mutation first, so the promise stream always starts empty wait(results->mutations.onEmpty()); wait(delay(0)); ASSERT(results->mutations.isEmpty()); if (DEBUG_CF_CLIENT_TRACE) { TraceEvent(SevDebug, "TraceChangeFeedClientMergeCursorGotEmpty", mergeCursorUID); } // update lastReturned once the previous mutation has been consumed if (*begin - 1 > results->lastReturnedVersion.get()) { results->lastReturnedVersion.set(*begin - 1); } state int interfNum = 0; state std::vector streamsUsed; // initially, pull from all streams for (auto& stream : streams) { streamsUsed.push_back(stream); } state Version nextVersion; loop { // bring all of the streams up to date to ensure we have the latest element from each stream in mutations interfNum = 0; while (interfNum < streamsUsed.size()) { try { Standalone res = waitNext(streamsUsed[interfNum].results.getFuture()); streamsUsed[interfNum].next = res; mutations.push(streamsUsed[interfNum]); } catch (Error& e) { if (e.code() != error_code_end_of_stream) { throw e; } } interfNum++; } if (mutations.empty()) { throw end_of_stream(); } streamsUsed.clear(); // Without this delay, weird issues with the last stream getting on another stream's callstack can happen wait(delay(0)); // pop first item off queue - this will be mutation with the lowest version Standalone> nextOut; nextVersion = mutations.top().next.version; streamsUsed.push_back(mutations.top()); nextOut.push_back_deep(nextOut.arena(), mutations.top().next); mutations.pop(); // for each other stream that has mutations with the same version, add it to nextOut while (!mutations.empty() && mutations.top().next.version == nextVersion) { if (mutations.top().next.mutations.size() && mutations.top().next.mutations.front().param1 != lastEpochEndPrivateKey) { nextOut.back().mutations.append_deep( nextOut.arena(), mutations.top().next.mutations.begin(), mutations.top().next.mutations.size()); } streamsUsed.push_back(mutations.top()); mutations.pop(); } ASSERT(nextOut.size() == 1); ASSERT(nextVersion >= *begin); *begin = nextVersion + 1; if (DEBUG_CF_CLIENT_TRACE) { TraceEvent(SevDebug, "TraceChangeFeedClientMergeCursorSending", mergeCursorUID) .detail("Count", streamsUsed.size()) .detail("Version", nextVersion); } // send mutations at nextVersion to the client if (nextOut.back().mutations.empty()) { ASSERT(results->mutations.isEmpty()); } else { ASSERT(nextOut.back().version > results->lastReturnedVersion.get()); results->mutations.send(nextOut); wait(results->mutations.onEmpty()); wait(delay(0)); } if (nextVersion > results->lastReturnedVersion.get()) { results->lastReturnedVersion.set(nextVersion); } } } ACTOR Future mergeChangeFeedStream(Reference db, std::vector> interfs, Reference results, Key rangeID, Version* begin, Version end, int replyBufferSize, bool canReadPopped) { state std::vector> fetchers(interfs.size()); state std::vector> onErrors(interfs.size()); state std::vector streams(interfs.size()); TEST(interfs.size() > 10); // Large change feed merge cursor TEST(interfs.size() > 100); // Very large change feed merge cursor state UID mergeCursorUID = UID(); state std::vector debugUIDs; results->streams.clear(); for (auto& it : interfs) { ChangeFeedStreamRequest req; req.rangeID = rangeID; req.begin = *begin; req.end = end; req.range = it.second; req.canReadPopped = canReadPopped; // divide total buffer size among sub-streams, but keep individual streams large enough to be efficient req.replyBufferSize = replyBufferSize / interfs.size(); if (replyBufferSize != -1 && req.replyBufferSize <
CLIENT_KNOBS->CHANGE_FEED_STREAM_MIN_BYTES) { req.replyBufferSize = CLIENT_KNOBS->CHANGE_FEED_STREAM_MIN_BYTES; } req.debugUID = deterministicRandom()->randomUniqueID(); debugUIDs.push_back(req.debugUID); mergeCursorUID = UID(mergeCursorUID.first() ^ req.debugUID.first(), mergeCursorUID.second() ^ req.debugUID.second()); results->streams.push_back(it.first.changeFeedStream.getReplyStream(req)); } for (auto& it : results->storageData) { if (it->debugGetReferenceCount() == 2) { db->changeFeedUpdaters.erase(it->interfToken); } } results->maxSeenVersion = invalidVersion; results->storageData.clear(); Promise refresh = results->refresh; results->refresh = Promise(); for (int i = 0; i < interfs.size(); i++) { results->storageData.push_back(db->getStorageData(interfs[i].first)); } results->notAtLatest.set(interfs.size()); refresh.send(Void()); for (int i = 0; i < interfs.size(); i++) { if (DEBUG_CF_CLIENT_TRACE) { TraceEvent(SevDebug, "TraceChangeFeedClientMergeCursorInit", debugUIDs[i]) .detail("CursorDebugUID", mergeCursorUID) .detail("Idx", i) .detail("FeedID", rangeID) .detail("MergeRange", KeyRangeRef(interfs.front().second.begin, interfs.back().second.end)) .detail("PartialRange", interfs[i].second) .detail("Begin", *begin) .detail("End", end) .detail("CanReadPopped", canReadPopped); } onErrors[i] = results->streams[i].onError(); fetchers[i] = partialChangeFeedStream(interfs[i].first, streams[i].results, results->streams[i], *begin, end, results, results->storageData[i], debugUIDs[i]); } wait(waitForAny(onErrors) || mergeChangeFeedStreamInternal(results, interfs, streams, begin, end, mergeCursorUID)); return Void(); } ACTOR Future getChangeFeedRange(Reference db, Database cx, Key rangeID, Version begin = 0) { state Transaction tr(cx); state Key rangeIDKey = rangeID.withPrefix(changeFeedPrefix); auto cacheLoc = db->changeFeedCache.find(rangeID); if (cacheLoc != db->changeFeedCache.end()) { return cacheLoc->second; } loop { try { tr.setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); Version readVer = wait(tr.getReadVersion()); if (readVer < begin) { wait(delay(FLOW_KNOBS->PREVENT_FAST_SPIN_DELAY)); tr.reset(); } else { Optional val = wait(tr.get(rangeIDKey)); if (!val.present()) { throw change_feed_not_registered(); } if (db->changeFeedCache.size() > CLIENT_KNOBS->CHANGE_FEED_CACHE_SIZE) { db->changeFeedCache.clear(); } KeyRange range = std::get<0>(decodeChangeFeedValue(val.get())); db->changeFeedCache[rangeID] = range; return range; } } catch (Error& e) { wait(tr.onError(e)); } } } ACTOR Future singleChangeFeedStreamInternal(KeyRange range, Reference results, Key rangeID, Version* begin, Version end) { state Promise refresh = results->refresh; ASSERT(results->streams.size() == 1); ASSERT(results->storageData.size() == 1); state bool atLatest = false; // wait for any previous mutations in stream to be consumed wait(results->mutations.onEmpty()); wait(delay(0)); ASSERT(results->mutations.isEmpty()); // update lastReturned once the previous mutation has been consumed if (*begin - 1 > results->lastReturnedVersion.get()) { results->lastReturnedVersion.set(*begin - 1); } loop { state ChangeFeedStreamReply feedReply = waitNext(results->streams[0].getFuture()); *begin = feedReply.mutations.back().version + 1; if (feedReply.popVersion > results->popVersion) { results->popVersion = feedReply.popVersion; } // don't send completely empty set of mutations to promise stream bool anyMutations = false; for (auto& it : feedReply.mutations) { if (!it.mutations.empty()) { anyMutations = true; break; } } if 
(anyMutations) { // empty versions can come out of order, as we sometimes send explicit empty versions when restarting a // stream. Anything with mutations should be strictly greater than lastReturnedVersion ASSERT(feedReply.mutations.front().version > results->lastReturnedVersion.get()); results->mutations.send( Standalone>(feedReply.mutations, feedReply.arena)); // Because onEmpty returns here before the consuming process, we must do a delay(0) wait(results->mutations.onEmpty()); wait(delay(0)); } // check refresh.canBeSet so that, if we are killed after calling one of these callbacks, we just // skip to the next wait and get actor_cancelled if (feedReply.mutations.back().version > results->lastReturnedVersion.get()) { results->lastReturnedVersion.set(feedReply.mutations.back().version); } if (!refresh.canBeSet()) { try { // refresh is set if and only if this actor is cancelled wait(Future(Void())); // Catch any unexpected behavior if the above contract is broken ASSERT(false); } catch (Error& e) { ASSERT(e.code() == error_code_actor_cancelled); throw; } } if (!atLatest && feedReply.atLatestVersion) { atLatest = true; results->notAtLatest.set(0); } if (feedReply.minStreamVersion > results->storageData[0]->version.get()) { results->storageData[0]->version.set(feedReply.minStreamVersion); } } } ACTOR Future singleChangeFeedStream(Reference db, StorageServerInterface interf, KeyRange range, Reference results, Key rangeID, Version* begin, Version end, int replyBufferSize, bool canReadPopped) { state Database cx(db); state ChangeFeedStreamRequest req; req.rangeID = rangeID; req.begin = *begin; req.end = end; req.range = range; req.canReadPopped = canReadPopped; req.replyBufferSize = replyBufferSize; req.debugUID = deterministicRandom()->randomUniqueID(); if (DEBUG_CF_CLIENT_TRACE) { TraceEvent(SevDebug, "TraceChangeFeedClientSingleCursor", req.debugUID) .detail("FeedID", rangeID) .detail("Range", range) .detail("Begin", *begin) .detail("End", end) .detail("CanReadPopped", canReadPopped); } results->streams.clear(); for (auto& it : results->storageData) { if (it->debugGetReferenceCount() == 2) { db->changeFeedUpdaters.erase(it->interfToken); } } results->streams.push_back(interf.changeFeedStream.getReplyStream(req)); results->maxSeenVersion = invalidVersion; results->storageData.clear(); results->storageData.push_back(db->getStorageData(interf)); Promise refresh = results->refresh; results->refresh = Promise(); results->notAtLatest.set(1); refresh.send(Void()); wait(results->streams[0].onError() || singleChangeFeedStreamInternal(range, results, rangeID, begin, end)); return Void(); } ACTOR Future getChangeFeedStreamActor(Reference db, Reference results, Key rangeID, Version begin, Version end, KeyRange range, int replyBufferSize, bool canReadPopped) { state Database cx(db); state Span span("NAPI:GetChangeFeedStream"_loc); results->endVersion = end; state double sleepWithBackoff = CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY; state Version lastBeginVersion = invalidVersion; loop { state KeyRange keys; try { lastBeginVersion = begin; KeyRange fullRange = wait(getChangeFeedRange(db, cx, rangeID, begin)); keys = fullRange & range; state std::vector locations = wait(getKeyRangeLocations(cx, Optional(), keys, CLIENT_KNOBS->CHANGE_FEED_LOCATION_LIMIT, Reverse::False, &StorageServerInterface::changeFeedStream, span.context, Optional(), UseProvisionalProxies::False, latestVersion)); if (locations.size() >= CLIENT_KNOBS->CHANGE_FEED_LOCATION_LIMIT) { ASSERT_WE_THINK(false); throw unknown_change_feed(); } 
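// The loop below hand-rolls load balancing for reply streams (see the FIXME just below):
// for each location it reservoir-samples one endpoint, with probability 1.0 / ++count,
// from the replicas the failure monitor currently considers healthy; if none are healthy
// it waits for any replica to recover and starts the scan over.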
ACTOR Future<Void> getChangeFeedStreamActor(Reference<DatabaseContext> db,
                                            Reference<ChangeFeedData> results,
                                            Key rangeID,
                                            Version begin,
                                            Version end,
                                            KeyRange range,
                                            int replyBufferSize,
                                            bool canReadPopped) {
    state Database cx(db);
    state Span span("NAPI:GetChangeFeedStream"_loc);

    results->endVersion = end;

    state double sleepWithBackoff = CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY;
    state Version lastBeginVersion = invalidVersion;

    loop {
        state KeyRange keys;
        try {
            lastBeginVersion = begin;
            KeyRange fullRange = wait(getChangeFeedRange(db, cx, rangeID, begin));
            keys = fullRange & range;
            state std::vector<KeyRangeLocationInfo> locations =
                wait(getKeyRangeLocations(cx,
                                          Optional<TenantName>(),
                                          keys,
                                          CLIENT_KNOBS->CHANGE_FEED_LOCATION_LIMIT,
                                          Reverse::False,
                                          &StorageServerInterface::changeFeedStream,
                                          span.context,
                                          Optional<UID>(),
                                          UseProvisionalProxies::False,
                                          latestVersion));

            if (locations.size() >= CLIENT_KNOBS->CHANGE_FEED_LOCATION_LIMIT) {
                ASSERT_WE_THINK(false);
                throw unknown_change_feed();
            }

            state std::vector<int> chosenLocations(locations.size());
            state int loc = 0;
            while (loc < locations.size()) {
                // FIXME: create a load balance function for this code so future users of reply streams do not have
                // to duplicate this code
                int count = 0;
                int useIdx = -1;
                for (int i = 0; i < locations[loc].locations->size(); i++) {
                    if (!IFailureMonitor::failureMonitor()
                             .getState(locations[loc]
                                           .locations->get(i, &StorageServerInterface::changeFeedStream)
                                           .getEndpoint())
                             .failed) {
                        if (deterministicRandom()->random01() <= 1.0 / ++count) {
                            useIdx = i;
                        }
                    }
                }

                if (useIdx >= 0) {
                    chosenLocations[loc] = useIdx;
                    loc++;
                    continue;
                }

                std::vector<Future<Void>> ok(locations[loc].locations->size());
                for (int i = 0; i < ok.size(); i++) {
                    ok[i] = IFailureMonitor::failureMonitor().onStateEqual(
                        locations[loc].locations->get(i, &StorageServerInterface::changeFeedStream).getEndpoint(),
                        FailureStatus(false));
                }

                // Making this SevWarn means a lot of clutter
                if (now() - g_network->networkInfo.newestAlternativesFailure > 1 ||
                    deterministicRandom()->random01() < 0.01) {
                    TraceEvent("AllAlternativesFailed").detail("Alternatives", locations[0].locations->description());
                }

                wait(allAlternativesFailedDelay(quorum(ok, 1)));
                loc = 0;
            }

            if (locations.size() > 1) {
                std::vector<std::pair<StorageServerInterface, KeyRange>> interfs;
                for (int i = 0; i < locations.size(); i++) {
                    interfs.emplace_back(locations[i].locations->getInterface(chosenLocations[i]),
                                         locations[i].range & range);
                }
                TEST(true); // Change feed merge cursor
                // TODO (jslocum): validate connectionFileChanged behavior
                wait(mergeChangeFeedStream(db, interfs, results, rangeID, &begin, end, replyBufferSize, canReadPopped) ||
                     cx->connectionFileChanged());
            } else {
                TEST(true); // Change feed single cursor
                StorageServerInterface interf = locations[0].locations->getInterface(chosenLocations[0]);
                wait(singleChangeFeedStream(
                         db, interf, range, results, rangeID, &begin, end, replyBufferSize, canReadPopped) ||
                     cx->connectionFileChanged());
            }
        } catch (Error& e) {
            if (e.code() == error_code_actor_cancelled || e.code() == error_code_change_feed_popped) {
                for (auto& it : results->storageData) {
                    if (it->debugGetReferenceCount() == 2) {
                        db->changeFeedUpdaters.erase(it->interfToken);
                    }
                }
                results->streams.clear();
                results->storageData.clear();
                if (e.code() == error_code_change_feed_popped) {
                    TEST(true); // getChangeFeedStreamActor got popped
                    results->mutations.sendError(e);
                    results->refresh.sendError(e);
                } else {
                    results->refresh.sendError(change_feed_cancelled());
                }
                throw;
            }
            if (results->notAtLatest.get() == 0) {
                results->notAtLatest.set(1);
            }

            if (e.code() == error_code_wrong_shard_server || e.code() == error_code_all_alternatives_failed ||
                e.code() == error_code_connection_failed || e.code() == error_code_unknown_change_feed ||
                e.code() == error_code_broken_promise) {
                db->changeFeedCache.erase(rangeID);
                cx->invalidateCache(Key(), keys);
                if (begin == lastBeginVersion) {
                    // We didn't read anything since the last failure before failing again.
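                    // The retry delay grows geometrically: starting from
                    // CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY (call it d), successive no-progress
                    // retries wait d, 1.5d, 2.25d, ... until hitting the 1 second cap below.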
                    // Do exponential backoff, up to 1 second
                    sleepWithBackoff = std::min(1.0, sleepWithBackoff * 1.5);
                } else {
                    sleepWithBackoff = CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY;
                }
                wait(delay(sleepWithBackoff));
            } else {
                results->mutations.sendError(e);
                results->refresh.sendError(change_feed_cancelled());
                for (auto& it : results->storageData) {
                    if (it->debugGetReferenceCount() == 2) {
                        db->changeFeedUpdaters.erase(it->interfToken);
                    }
                }
                results->streams.clear();
                results->storageData.clear();
                return Void();
            }
        }
    }
}

Future<Void> DatabaseContext::getChangeFeedStream(Reference<ChangeFeedData> results,
                                                  Key rangeID,
                                                  Version begin,
                                                  Version end,
                                                  KeyRange range,
                                                  int replyBufferSize,
                                                  bool canReadPopped) {
    return getChangeFeedStreamActor(
        Reference<DatabaseContext>::addRef(this), results, rangeID, begin, end, range, replyBufferSize, canReadPopped);
}

ACTOR Future<std::vector<OverlappingChangeFeedEntry>> singleLocationOverlappingChangeFeeds(
    Database cx,
    Reference<LocationInfo> location,
    KeyRangeRef range,
    Version minVersion) {
    state OverlappingChangeFeedsRequest req;
    req.range = range;
    req.minVersion = minVersion;

    OverlappingChangeFeedsReply rep = wait(loadBalance(cx.getPtr(),
                                                       location,
                                                       &StorageServerInterface::overlappingChangeFeeds,
                                                       req,
                                                       TaskPriority::DefaultPromiseEndpoint,
                                                       AtMostOnce::False,
                                                       cx->enableLocalityLoadBalance ? &cx->queueModel : nullptr));
    return rep.rangeIds;
}

bool compareChangeFeedResult(const OverlappingChangeFeedEntry& i, const OverlappingChangeFeedEntry& j) {
    return i.rangeId < j.rangeId;
}

ACTOR Future<std::vector<OverlappingChangeFeedEntry>> getOverlappingChangeFeedsActor(Reference<DatabaseContext> db,
                                                                                     KeyRangeRef range,
                                                                                     Version minVersion) {
    state Database cx(db);
    state Span span("NAPI:GetOverlappingChangeFeeds"_loc);

    loop {
        try {
            state std::vector<KeyRangeLocationInfo> locations =
                wait(getKeyRangeLocations(cx,
                                          Optional<TenantName>(),
                                          range,
                                          CLIENT_KNOBS->CHANGE_FEED_LOCATION_LIMIT,
                                          Reverse::False,
                                          &StorageServerInterface::overlappingChangeFeeds,
                                          span.context,
                                          Optional<UID>(),
                                          UseProvisionalProxies::False,
                                          latestVersion));

            if (locations.size() >= CLIENT_KNOBS->CHANGE_FEED_LOCATION_LIMIT) {
                TraceEvent(SevError, "OverlappingRangeTooLarge")
                    .detail("Range", range)
                    .detail("Limit", CLIENT_KNOBS->CHANGE_FEED_LOCATION_LIMIT);
                wait(delay(1.0));
                throw all_alternatives_failed();
            }

            state std::vector<Future<std::vector<OverlappingChangeFeedEntry>>> allOverlappingRequests;
            for (auto& it : locations) {
                allOverlappingRequests.push_back(
                    singleLocationOverlappingChangeFeeds(cx, it.locations, it.range & range, minVersion));
            }
            wait(waitForAll(allOverlappingRequests));

            std::vector<OverlappingChangeFeedEntry> result;
            for (auto& it : allOverlappingRequests) {
                result.insert(result.end(), it.get().begin(), it.get().end());
            }
            std::sort(result.begin(), result.end(), compareChangeFeedResult);
            result.resize(std::unique(result.begin(), result.end()) - result.begin());
            return result;
        } catch (Error& e) {
            if (e.code() == error_code_wrong_shard_server || e.code() == error_code_all_alternatives_failed) {
                cx->invalidateCache(Key(), range);
                wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY));
            } else {
                throw e;
            }
        }
    }
}

Future<std::vector<OverlappingChangeFeedEntry>> DatabaseContext::getOverlappingChangeFeeds(KeyRangeRef range,
                                                                                           Version minVersion) {
    return getOverlappingChangeFeedsActor(Reference<DatabaseContext>::addRef(this), range, minVersion);
}
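// The per-shard replies above are merged client-side with the standard sort + std::unique +
// resize idiom. A self-contained illustration of the same technique (hypothetical values, not
// code from this file):
//
//     std::vector<int> ids = { 3, 1, 3, 2, 1 };
//     std::sort(ids.begin(), ids.end());
//     ids.resize(std::unique(ids.begin(), ids.end()) - ids.begin()); // ids == { 1, 2, 3 }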
ACTOR static Future<Void> popChangeFeedBackup(Database cx, Key rangeID, Version version) {
    state Transaction tr(cx);
    loop {
        try {
            tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
            state Key rangeIDKey = rangeID.withPrefix(changeFeedPrefix);
            Optional<Value> val = wait(tr.get(rangeIDKey));
            if (val.present()) {
                KeyRange range;
                Version popVersion;
                ChangeFeedStatus status;
                std::tie(range, popVersion, status) = decodeChangeFeedValue(val.get());
                if (version > popVersion) {
                    tr.set(rangeIDKey, changeFeedValue(range, version, status));
                }
            } else {
                throw change_feed_not_registered();
            }
            wait(tr.commit());
            return Void();
        } catch (Error& e) {
            wait(tr.onError(e));
        }
    }
}

ACTOR Future<Void> popChangeFeedMutationsActor(Reference<DatabaseContext> db, Key rangeID, Version version) {
    state Database cx(db);
    state Key rangeIDKey = rangeID.withPrefix(changeFeedPrefix);
    state Span span("NAPI:PopChangeFeedMutations"_loc);

    state KeyRange keys = wait(getChangeFeedRange(db, cx, rangeID));

    state std::vector<KeyRangeLocationInfo> locations =
        wait(getKeyRangeLocations(cx,
                                  Optional<TenantName>(),
                                  keys,
                                  3,
                                  Reverse::False,
                                  &StorageServerInterface::changeFeedPop,
                                  span.context,
                                  Optional<UID>(),
                                  UseProvisionalProxies::False,
                                  latestVersion));

    if (locations.size() > 2) {
        wait(popChangeFeedBackup(cx, rangeID, version));
        return Void();
    }

    bool foundFailed = false;
    for (int i = 0; i < locations.size() && !foundFailed; i++) {
        for (int j = 0; j < locations[i].locations->size() && !foundFailed; j++) {
            if (IFailureMonitor::failureMonitor()
                    .getState(locations[i].locations->get(j, &StorageServerInterface::changeFeedPop).getEndpoint())
                    .isFailed()) {
                foundFailed = true;
            }
        }
    }

    if (foundFailed) {
        wait(popChangeFeedBackup(cx, rangeID, version));
        return Void();
    }

    try {
        // FIXME: lookup both the src and dest shards as of the pop version to ensure all locations are popped
        std::vector<Future<Void>> popRequests;
        for (int i = 0; i < locations.size(); i++) {
            for (int j = 0; j < locations[i].locations->size(); j++) {
                popRequests.push_back(locations[i].locations->getInterface(j).changeFeedPop.getReply(
                    ChangeFeedPopRequest(rangeID, version, locations[i].range)));
            }
        }
        choose {
            when(wait(waitForAll(popRequests))) {}
            when(wait(delay(CLIENT_KNOBS->CHANGE_FEED_POP_TIMEOUT))) {
                wait(popChangeFeedBackup(cx, rangeID, version));
            }
        }
    } catch (Error& e) {
        if (e.code() != error_code_unknown_change_feed && e.code() != error_code_wrong_shard_server &&
            e.code() != error_code_all_alternatives_failed && e.code() != error_code_broken_promise &&
            e.code() != error_code_server_overloaded) {
            throw;
        }
        db->changeFeedCache.erase(rangeID);
        cx->invalidateCache(Key(), keys);
        wait(popChangeFeedBackup(cx, rangeID, version));
    }
    return Void();
}

Future<Void> DatabaseContext::popChangeFeedMutations(Key rangeID, Version version) {
    return popChangeFeedMutationsActor(Reference<DatabaseContext>::addRef(this), rangeID, version);
}

Reference<DatabaseContext::TransactionT> DatabaseContext::createTransaction() {
    return makeReference<ReadYourWritesTransaction>(Database(Reference<DatabaseContext>::addRef(this)));
}
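// purgeBlobGranulesActor below registers the purge request under a versionstamped key: it writes
// with a SetVersionstampedKey atomic op and reads the transaction's versionstamp after commit to
// learn which key the commit actually produced. A sketch of the pattern as used below (prefix
// and value stand in for the blob granule purge key and value):
//
//     tr.atomicOp(addVersionStampAtEnd(prefix), value, MutationRef::SetVersionstampedKey);
//     state Future<Standalone<StringRef>> fVs = tr.getVersionstamp(); // must be obtained before commit
//     wait(tr.commit());
//     Standalone<StringRef> vs = wait(fVs);
//     Key createdKey = prefix.withSuffix(vs); // the exact key the commit produced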
ACTOR Future<Key> purgeBlobGranulesActor(Reference<DatabaseContext> db,
                                         KeyRange range,
                                         Version purgeVersion,
                                         bool force) {
    state Database cx(db);
    state Transaction tr(cx);
    state Key purgeKey;

    // FIXME: implement force
    if (!force) {
        throw unsupported_operation();
    }

    loop {
        try {
            tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
            tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
            Value purgeValue = blobGranulePurgeValueFor(purgeVersion, range, force);
            tr.atomicOp(
                addVersionStampAtEnd(blobGranulePurgeKeys.begin), purgeValue, MutationRef::SetVersionstampedKey);
            tr.set(blobGranulePurgeChangeKey, deterministicRandom()->randomUniqueID().toString());
            state Future<Standalone<StringRef>> fTrVs = tr.getVersionstamp();
            wait(tr.commit());
            Standalone<StringRef> vs = wait(fTrVs);
            purgeKey = blobGranulePurgeKeys.begin.withSuffix(vs);
            if (BG_REQUEST_DEBUG) {
                fmt::print("purgeBlobGranules for range [{0} - {1}) at version {2} registered {3}\n",
                           range.begin.printable(),
                           range.end.printable(),
                           purgeVersion,
                           purgeKey.printable());
            }
            break;
        } catch (Error& e) {
            if (BG_REQUEST_DEBUG) {
                fmt::print("purgeBlobGranules for range [{0} - {1}) at version {2} encountered error {3}\n",
                           range.begin.printable(),
                           range.end.printable(),
                           purgeVersion,
                           e.name());
            }
            wait(tr.onError(e));
        }
    }
    return purgeKey;
}

Future<Key> DatabaseContext::purgeBlobGranules(KeyRange range, Version purgeVersion, bool force) {
    return purgeBlobGranulesActor(Reference<DatabaseContext>::addRef(this), range, purgeVersion, force);
}

ACTOR Future<Void> waitPurgeGranulesCompleteActor(Reference<DatabaseContext> db, Key purgeKey) {
    state Database cx(db);
    state Reference<ReadYourWritesTransaction> tr = makeReference<ReadYourWritesTransaction>(cx);

    loop {
        try {
            tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
            tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
            Optional<Value> purgeVal = wait(tr->get(purgeKey));
            if (!purgeVal.present()) {
                if (BG_REQUEST_DEBUG) {
                    fmt::print("purgeBlobGranules for {0} succeeded\n", purgeKey.printable());
                }
                return Void();
            }
            if (BG_REQUEST_DEBUG) {
                fmt::print("purgeBlobGranules for {0} watching\n", purgeKey.printable());
            }
            state Future<Void> watchFuture = tr->watch(purgeKey);
            wait(tr->commit());
            wait(watchFuture);
            tr->reset();
        } catch (Error& e) {
            wait(tr->onError(e));
        }
    }
}

Future<Void> DatabaseContext::waitPurgeGranulesComplete(Key purgeKey) {
    return waitPurgeGranulesCompleteActor(Reference<DatabaseContext>::addRef(this), purgeKey);
}
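// waitPurgeGranulesCompleteActor above relies on the standard FDB watch idiom: set a watch in a
// transaction, commit, then wait on the watch future, which fires when the key changes (here,
// when the purge record is cleared). The core of the idiom:
//
//     state Future<Void> watchFuture = tr->watch(purgeKey);
//     wait(tr->commit()); // the watch becomes active only after a successful commit
//     wait(watchFuture);  // resolves when purgeKey's value changes
//     tr->reset();        // loop around and re-check the key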