/*
 * Net2.actor.cpp
 *
 * This source file is part of the FoundationDB open source project
 *
 * Copyright 2013-2018 Apple Inc. and the FoundationDB project authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "boost/asio/buffer.hpp"
#include "boost/asio/ip/address.hpp"
#include "boost/system/system_error.hpp"
#include "flow/Platform.h"
#include "flow/Trace.h"
#include <algorithm>
#include <memory>

#define BOOST_SYSTEM_NO_LIB
#define BOOST_DATE_TIME_NO_LIB
#define BOOST_REGEX_NO_LIB
#include <boost/asio.hpp>
#include <boost/asio/ssl.hpp>
#include <boost/date_time/posix_time/posix_time_types.hpp>
#include <boost/range.hpp>
#include <boost/algorithm/string/join.hpp>
#include "flow/network.h"
#include "flow/IThreadPool.h"
#include "flow/ActorCollection.h"
#include "flow/ThreadSafeQueue.h"
#include "flow/ThreadHelper.actor.h"
#include "flow/TDMetric.actor.h"
#include "flow/AsioReactor.h"
#include "flow/Profiler.h"
#include "flow/ProtocolVersion.h"
#include "flow/TLSConfig.actor.h"
#include "flow/genericactors.actor.h"
#include "flow/Util.h"
// See the comment in TLSConfig.actor.h for the explanation of why this module breaking include was done.
#include "fdbrpc/IAsyncFile.h"

#ifdef WIN32
#include <mmsystem.h>
#endif

#include "flow/actorcompiler.h" // This must be the last #include.

#include

// Defined to track the stack limit
extern "C" intptr_t g_stackYieldLimit;
intptr_t g_stackYieldLimit = 0;

using namespace boost::asio::ip;

#if defined(__linux__) || defined(__FreeBSD__)
#include <execinfo.h>

std::atomic<int64_t> net2RunLoopIterations(0);
std::atomic<int64_t> net2RunLoopSleeps(0);

volatile size_t net2backtraces_max = 10000;
volatile void** volatile net2backtraces = nullptr;
volatile size_t net2backtraces_offset = 0;
volatile bool net2backtraces_overflow = false;
volatile int net2backtraces_count = 0;

volatile void** other_backtraces = nullptr;
sigset_t sigprof_set;

void initProfiling() {
    net2backtraces = new volatile void*[net2backtraces_max];
    other_backtraces = new volatile void*[net2backtraces_max];

    // According to folk wisdom, calling this once before setting up the signal handler makes
    // it async signal safe in practice :-/
    backtrace(const_cast<void**>(other_backtraces), net2backtraces_max);

    sigemptyset(&sigprof_set);
    sigaddset(&sigprof_set, SIGPROF);
}
#endif

DESCR struct SlowTask {
    int64_t clocks; //clocks
    int64_t duration; // ns
    int64_t priority; // priority level
    int64_t numYields; // count
};

namespace N2 { // No indent, it's the whole file

class Net2;
class Peer;
class Connection;

std::unique_ptr<Net2> g_net2 = 0;

class Task {
public:
    virtual void operator()() = 0;
};

struct OrderedTask {
    int64_t priority;
    TaskPriority taskID;
    Task* task;
    OrderedTask(int64_t priority, TaskPriority taskID, Task* task) : priority(priority), taskID(taskID), task(task) {}
    bool operator < (OrderedTask const& rhs) const { return priority < rhs.priority; }
};

thread_local INetwork* thread_network = 0;

class Net2 final : public INetwork, public INetworkConnections {
public:
    Net2(const TLSConfig& tlsConfig, bool useThreadPool, bool useMetrics);
    void initTLS(ETLSInitState targetState);
    void run();
    void initMetrics();

    // INetworkConnections interface
    Future<Reference<IConnection>> connect( NetworkAddress toAddr, std::string host ) override;
    Future<Reference<IUDPSocket>>
createUDPSocket(NetworkAddress toAddr) override; Future> createUDPSocket(bool isV6) override; Future> resolveTCPEndpoint( std::string host, std::string service) override; Reference listen( NetworkAddress localAddr ) override; // INetwork interface double now() const override { return currentTime; }; double timer() override { return ::timer(); }; double timer_monotonic() override { return ::timer_monotonic(); }; Future delay(double seconds, TaskPriority taskId) override; Future yield(TaskPriority taskID) override; bool check_yield(TaskPriority taskId) override; TaskPriority getCurrentTask() const override { return currentTaskID; } void setCurrentTask(TaskPriority taskID ) override { currentTaskID = taskID; priorityMetric = (int64_t)taskID; } void onMainThread( Promise&& signal, TaskPriority taskID ) override; bool isOnMainThread() const override { return thread_network == this; } void stop() override { if ( thread_network == this ) stopImmediately(); else onMainThreadVoid( [this] { this->stopImmediately(); }, nullptr ); } void addStopCallback( std::function fn ) override { if ( thread_network == this ) stopCallbacks.emplace_back(std::move(fn)); else onMainThreadVoid( [this, fn] { this->stopCallbacks.emplace_back(std::move(fn)); }, nullptr ); } bool isSimulated() const override { return false; } THREAD_HANDLE startThread( THREAD_FUNC_RETURN (*func) (void*), void *arg) override; void getDiskBytes( std::string const& directory, int64_t& free, int64_t& total ) override; bool isAddressOnThisHost(NetworkAddress const& addr) const override; void updateNow(){ currentTime = timer_monotonic(); } flowGlobalType global(int id) const override { return (globals.size() > id) ? globals[id] : nullptr; } void setGlobal(size_t id, flowGlobalType v) override { globals.resize(std::max(globals.size(),id+1)); globals[id] = v; } ProtocolVersion protocolVersion() override { return currentProtocolVersion; } std::vector globals; const TLSConfig& getTLSConfig() const override { return tlsConfig; } bool checkRunnable() override; bool useThreadPool; //private: ASIOReactor reactor; #ifndef TLS_DISABLED AsyncVar>> sslContextVar; Reference sslHandshakerPool; int sslHandshakerThreadsStarted; int sslPoolHandshakesInProgress; #endif TLSConfig tlsConfig; Future backgroundCertRefresh; ETLSInitState tlsInitializedState; INetworkConnections *network; // initially this, but can be changed int64_t tscBegin, tscEnd; double taskBegin; TaskPriority currentTaskID; uint64_t tasksIssued; TDMetricCollection tdmetrics; double currentTime; bool stopped; mutable std::map addressOnHostCache; std::atomic started; uint64_t numYields; NetworkMetrics::PriorityStats* lastPriorityStats; std::priority_queue> ready; ThreadSafeQueue threadReady; struct DelayedTask : OrderedTask { double at; DelayedTask(double at, int64_t priority, TaskPriority taskID, Task* task) : at(at), OrderedTask(priority, taskID, task) {} bool operator < (DelayedTask const& rhs) const { return at > rhs.at; } // Ordering is reversed for priority_queue }; std::priority_queue> timers; void checkForSlowTask(int64_t tscBegin, int64_t tscEnd, double duration, TaskPriority priority); bool check_yield(TaskPriority taskId, int64_t tscNow); void processThreadReady(); void trackAtPriority( TaskPriority priority, double now ); void stopImmediately() { stopped=true; decltype(ready) _1; ready.swap(_1); decltype(timers) _2; timers.swap(_2); } Future timeOffsetLogger; Future logTimeOffset(); Int64MetricHandle bytesReceived; Int64MetricHandle udpBytesReceived; Int64MetricHandle countWriteProbes; 
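// Run-loop and I/O counters, exported through TDMetric under the "Net2." prefix (see initMetrics()).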
Int64MetricHandle countReadProbes; Int64MetricHandle countReads; Int64MetricHandle countUDPReads; Int64MetricHandle countWouldBlock; Int64MetricHandle countWrites; Int64MetricHandle countUDPWrites; Int64MetricHandle countRunLoop; Int64MetricHandle countCantSleep; Int64MetricHandle countWontSleep; Int64MetricHandle countTimers; Int64MetricHandle countTasks; Int64MetricHandle countYields; Int64MetricHandle countYieldBigStack; Int64MetricHandle countYieldCalls; Int64MetricHandle countYieldCallsTrue; Int64MetricHandle countASIOEvents; Int64MetricHandle countRunLoopProfilingSignals; Int64MetricHandle countTLSPolicyFailures; Int64MetricHandle priorityMetric; DoubleMetricHandle countLaunchTime; DoubleMetricHandle countReactTime; BoolMetricHandle awakeMetric; EventMetricHandle slowTaskMetric; std::vector blobCredentialFiles; std::vector> stopCallbacks; }; static boost::asio::ip::address tcpAddress(IPAddress const& n) { if (n.isV6()) { return boost::asio::ip::address_v6(n.toV6()); } else { return boost::asio::ip::address_v4(n.toV4()); } } static IPAddress toIPAddress(boost::asio::ip::address const& addr) { if (addr.is_v4()) { return IPAddress(addr.to_v4().to_uint()); } else { return IPAddress(addr.to_v6().to_bytes()); } } static tcp::endpoint tcpEndpoint( NetworkAddress const& n ) { return tcp::endpoint(tcpAddress(n.ip), n.port); } static udp::endpoint udpEndpoint(NetworkAddress const& n) { return udp::endpoint(tcpAddress(n.ip), n.port); } class BindPromise { Promise p; const char* errContext; UID errID; public: BindPromise( const char* errContext, UID errID ) : errContext(errContext), errID(errID) {} BindPromise( BindPromise const& r ) : p(r.p), errContext(r.errContext), errID(r.errID) {} BindPromise(BindPromise&& r) noexcept : p(std::move(r.p)), errContext(r.errContext), errID(r.errID) {} Future getFuture() { return p.getFuture(); } void operator()( const boost::system::error_code& error, size_t bytesWritten=0 ) { try { if (error) { // Log the error... { TraceEvent evt(SevWarn, errContext, errID); evt.suppressFor(1.0).detail("ErrorCode", error.value()).detail("Message", error.message()); #ifndef TLS_DISABLED // There is no function in OpenSSL to use to check if an error code is from OpenSSL, // but all OpenSSL errors have a non-zero "library" code set in bits 24-32, and linux // error codes should never go that high. if (error.value() >= (1 << 24L)) { evt.detail("WhichMeans", TLSPolicy::ErrorString(error)); } #endif } p.sendError( connection_failed() ); } else p.send( Void() ); } catch (Error& e) { p.sendError(e); } catch (...) 
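// A non-Error exception still completes the promise, as unknown_error, so no waiter is left hanging.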
{ p.sendError(unknown_error()); } } }; struct SendBufferIterator { typedef boost::asio::const_buffer value_type; typedef std::forward_iterator_tag iterator_category; typedef size_t difference_type; typedef boost::asio::const_buffer* pointer; typedef boost::asio::const_buffer& reference; SendBuffer const* p; int limit; SendBufferIterator(SendBuffer const* p=0, int limit = std::numeric_limits::max()) : p(p), limit(limit) { ASSERT(limit > 0); } bool operator == (SendBufferIterator const& r) const { return p == r.p; } bool operator != (SendBufferIterator const& r) const { return p != r.p; } void operator++() { limit -= p->bytes_written - p->bytes_sent; if(limit > 0) p = p->next; else p = nullptr; } boost::asio::const_buffer operator*() const { return boost::asio::const_buffer(p->data() + p->bytes_sent, std::min(limit, p->bytes_written - p->bytes_sent)); } }; class Connection final : public IConnection, ReferenceCounted { public: void addref() override { ReferenceCounted::addref(); } void delref() override { ReferenceCounted::delref(); } void close() override { closeSocket(); } explicit Connection( boost::asio::io_service& io_service ) : id(nondeterministicRandom()->randomUniqueID()), socket(io_service) { } // This is not part of the IConnection interface, because it is wrapped by INetwork::connect() ACTOR static Future> connect( boost::asio::io_service* ios, NetworkAddress addr ) { state Reference self( new Connection(*ios) ); self->peer_address = addr; try { auto to = tcpEndpoint(addr); BindPromise p("N2_ConnectError", self->id); Future onConnected = p.getFuture(); self->socket.async_connect( to, std::move(p) ); wait( onConnected ); self->init(); return self; } catch (Error&) { // Either the connection failed, or was cancelled by the caller self->closeSocket(); throw; } } // This is not part of the IConnection interface, because it is wrapped by IListener::accept() void accept(NetworkAddress peerAddr) { this->peer_address = peerAddr; init(); } Future acceptHandshake() override { return Void(); } Future connectHandshake() override { return Void(); } // returns when write() can write at least one byte Future onWritable() override { ++g_net2->countWriteProbes; BindPromise p("N2_WriteProbeError", id); auto f = p.getFuture(); socket.async_write_some( boost::asio::null_buffers(), std::move(p) ); return f; } // returns when read() can read at least one byte Future onReadable() override { ++g_net2->countReadProbes; BindPromise p("N2_ReadProbeError", id); auto f = p.getFuture(); socket.async_read_some( boost::asio::null_buffers(), std::move(p) ); return f; } // Reads as many bytes as possible from the read buffer into [begin,end) and returns the number of bytes read (might be 0) int read( uint8_t* begin, uint8_t* end ) override { boost::system::error_code err; ++g_net2->countReads; size_t toRead = end-begin; size_t size = socket.read_some( boost::asio::mutable_buffers_1(begin, toRead), err ); g_net2->bytesReceived += size; //TraceEvent("ConnRead", this->id).detail("Bytes", size); if (err) { if (err == boost::asio::error::would_block) { ++g_net2->countWouldBlock; return 0; } onReadError(err); throw connection_failed(); } ASSERT( size ); // If the socket is closed, we expect an 'eof' error, not a zero return value return size; } // Writes as many bytes as possible from the given SendBuffer chain into the write buffer and returns the number of bytes written (might be 0) int write( SendBuffer const* data, int limit ) override { boost::system::error_code err; ++g_net2->countWrites; size_t sent = 
socket.write_some( boost::iterator_range(SendBufferIterator(data, limit), SendBufferIterator()), err ); if (err) { // Since there was an error, sent's value can't be used to infer that the buffer has data and the limit is positive so check explicitly. ASSERT(limit > 0); bool notEmpty = false; for(auto p = data; p; p = p->next) if(p->bytes_written - p->bytes_sent > 0) { notEmpty = true; break; } ASSERT(notEmpty); if (err == boost::asio::error::would_block) { ++g_net2->countWouldBlock; return 0; } onWriteError(err); throw connection_failed(); } ASSERT( sent ); // Make sure data was sent, and also this check will fail if the buffer chain was empty or the limit was not > 0. return sent; } NetworkAddress getPeerAddress() const override { return peer_address; } UID getDebugID() const override { return id; } tcp::socket& getSocket() { return socket; } private: UID id; tcp::socket socket; NetworkAddress peer_address; void init() { // Socket settings that have to be set after connect or accept succeeds socket.non_blocking(true); if (FLOW_KNOBS->FLOW_TCP_NODELAY & 1) { socket.set_option(boost::asio::ip::tcp::no_delay(true)); } if (FLOW_KNOBS->FLOW_TCP_QUICKACK & 1) { #ifdef __linux__ socket.set_option(boost::asio::detail::socket_option::boolean(true)); #else TraceEvent(SevWarn, "N2_InitWarn").detail("Message", "TCP_QUICKACK not supported"); #endif } platform::setCloseOnExec(socket.native_handle()); } void closeSocket() { boost::system::error_code error; socket.close(error); if (error) TraceEvent(SevWarn, "N2_CloseError", id).suppressFor(1.0).detail("ErrorCode", error.value()).detail("Message", error.message()); } void onReadError( const boost::system::error_code& error ) { TraceEvent(SevWarn, "N2_ReadError", id).suppressFor(1.0).detail("ErrorCode", error.value()).detail("Message", error.message()); closeSocket(); } void onWriteError( const boost::system::error_code& error ) { TraceEvent(SevWarn, "N2_WriteError", id).suppressFor(1.0).detail("ErrorCode", error.value()).detail("Message", error.message()); closeSocket(); } }; class ReadPromise { Promise p; const char* errContext; UID errID; std::shared_ptr endpoint = nullptr; public: ReadPromise(const char* errContext, UID errID) : errContext(errContext), errID(errID) {} ReadPromise(ReadPromise const& other) = default; ReadPromise(ReadPromise&& other) : p(std::move(other.p)), errContext(other.errContext), errID(other.errID) {} std::shared_ptr& getEndpoint() { return endpoint; } Future getFuture() { return p.getFuture(); } void operator()(const boost::system::error_code& error, size_t bytesWritten) { try { if (error) { TraceEvent evt(SevWarn, errContext, errID); evt.suppressFor(1.0).detail("ErrorCode", error.value()).detail("Message", error.message()); p.sendError(connection_failed()); } else { p.send(int(bytesWritten)); } } catch (Error& e) { p.sendError(e); } catch (...) 
{ p.sendError(unknown_error()); } } }; class UDPSocket : public IUDPSocket, ReferenceCounted { UID id; Optional toAddress; udp::socket socket; bool isPublic = false; public: ACTOR static Future> connect(boost::asio::io_service* io_service, Optional toAddress, bool isV6) { state Reference self(new UDPSocket(*io_service, toAddress, isV6)); ASSERT(!toAddress.present() || toAddress.get().ip.isV6() == isV6); if (!toAddress.present()) { return self; } try { if (toAddress.present()) { auto to = udpEndpoint(toAddress.get()); BindPromise p("N2_UDPConnectError", self->id); Future onConnected = p.getFuture(); self->socket.async_connect(to, std::move(p)); wait(onConnected); } self->init(); return self; } catch (...) { self->closeSocket(); throw; } } void close() override { closeSocket(); } Future receive(uint8_t* begin, uint8_t* end) override { ++g_net2->countUDPReads; ReadPromise p("N2_UDPReadError", id); auto res = p.getFuture(); socket.async_receive(boost::asio::mutable_buffer(begin, end - begin), std::move(p)); return fmap( [](int bytes) { g_net2->udpBytesReceived += bytes; return bytes; }, res); } Future receiveFrom(uint8_t* begin, uint8_t* end, NetworkAddress* sender) override { ++g_net2->countUDPReads; ReadPromise p("N2_UDPReadFromError", id); p.getEndpoint() = std::make_shared(); auto endpoint = p.getEndpoint().get(); auto res = p.getFuture(); socket.async_receive_from(boost::asio::mutable_buffer(begin, end - begin), *endpoint, std::move(p)); return fmap( [endpoint, sender](int bytes) { if (sender) { sender->port = endpoint->port(); sender->ip = toIPAddress(endpoint->address()); } g_net2->udpBytesReceived += bytes; return bytes; }, res); } Future send(uint8_t const* begin, uint8_t const* end) override { ++g_net2->countUDPWrites; ReadPromise p("N2_UDPWriteError", id); auto res = p.getFuture(); socket.async_send(boost::asio::const_buffer(begin, end - begin), std::move(p)); return res; } Future sendTo(uint8_t const* begin, uint8_t const* end, NetworkAddress const& peer) override { ++g_net2->countUDPWrites; ReadPromise p("N2_UDPWriteError", id); auto res = p.getFuture(); udp::endpoint toEndpoint = udpEndpoint(peer); socket.async_send_to(boost::asio::const_buffer(begin, end - begin), toEndpoint, std::move(p)); return res; } void bind(NetworkAddress const& addr) override { boost::system::error_code ec; socket.bind(udpEndpoint(addr), ec); if (ec) { Error x; if (ec.value() == EADDRINUSE) x = address_in_use(); else if (ec.value() == EADDRNOTAVAIL) x = invalid_local_address(); else x = bind_failed(); TraceEvent(SevWarnAlways, "Net2UDPBindError").error(x); throw x; } isPublic = true; } UID getDebugID() const override { return id; } void addref() override { ReferenceCounted::addref(); } void delref() override { ReferenceCounted::delref(); } NetworkAddress localAddress() const { auto endpoint = socket.local_endpoint(); return NetworkAddress(toIPAddress(endpoint.address()), endpoint.port(), isPublic, false); } private: UDPSocket(boost::asio::io_service& io_service, Optional toAddress, bool isV6) : id(nondeterministicRandom()->randomUniqueID()), socket(io_service, isV6 ? 
udp::v6() : udp::v4()) { } void closeSocket() { boost::system::error_code error; socket.close(error); if (error) TraceEvent(SevWarn, "N2_CloseError", id) .suppressFor(1.0) .detail("ErrorCode", error.value()) .detail("Message", error.message()); } void onReadError(const boost::system::error_code& error) { TraceEvent(SevWarn, "N2_UDPReadError", id) .suppressFor(1.0) .detail("ErrorCode", error.value()) .detail("Message", error.message()); closeSocket(); } void onWriteError(const boost::system::error_code& error) { TraceEvent(SevWarn, "N2_UDPWriteError", id) .suppressFor(1.0) .detail("ErrorCode", error.value()) .detail("Message", error.message()); closeSocket(); } void init() { socket.non_blocking(true); platform::setCloseOnExec(socket.native_handle()); } }; class Listener final : public IListener, ReferenceCounted { boost::asio::io_context& io_service; NetworkAddress listenAddress; tcp::acceptor acceptor; public: Listener( boost::asio::io_context& io_service, NetworkAddress listenAddress ) : io_service(io_service), listenAddress(listenAddress), acceptor( io_service, tcpEndpoint( listenAddress ) ) { platform::setCloseOnExec(acceptor.native_handle()); } void addref() override { ReferenceCounted::addref(); } void delref() override { ReferenceCounted::delref(); } // Returns one incoming connection when it is available Future> accept() override { return doAccept( this ); } NetworkAddress getListenAddress() const override { return listenAddress; } private: ACTOR static Future> doAccept( Listener* self ) { state Reference conn( new Connection( self->io_service ) ); state tcp::acceptor::endpoint_type peer_endpoint; try { BindPromise p("N2_AcceptError", UID()); auto f = p.getFuture(); self->acceptor.async_accept( conn->getSocket(), peer_endpoint, std::move(p) ); wait( f ); auto peer_address = peer_endpoint.address().is_v6() ? IPAddress(peer_endpoint.address().to_v6().to_bytes()) : IPAddress(peer_endpoint.address().to_v4().to_ulong()); conn->accept(NetworkAddress(peer_address, peer_endpoint.port())); return conn; } catch (...) { conn->close(); throw; } } }; #ifndef TLS_DISABLED typedef boost::asio::ssl::stream ssl_socket; struct SSLHandshakerThread final : IThreadPoolReceiver { SSLHandshakerThread() {} void init() override {} struct Handshake final : TypedAction { Handshake(ssl_socket &socket, ssl_socket::handshake_type type) : socket(socket), type(type) { } double getTimeEstimate() const override { return 0.001; } ThreadReturnPromise done; ssl_socket &socket; ssl_socket::handshake_type type; boost::system::error_code err; }; void action( Handshake &h) { try { h.socket.next_layer().non_blocking(false, h.err); if(!h.err.failed()) { h.socket.handshake(h.type, h.err); } if(!h.err.failed()) { h.socket.next_layer().non_blocking(true, h.err); } if(h.err.failed()) { TraceEvent(SevWarn, h.type == ssl_socket::handshake_type::client ? "N2_ConnectHandshakeError" : "N2_AcceptHandshakeError") .detail("ErrorCode", h.err.value()) .detail("ErrorMsg", h.err.message().c_str()) .detail("BackgroundThread", true); h.done.sendError(connection_failed()); } else { h.done.send(Void()); } } catch(...) { TraceEvent(SevWarn, h.type == ssl_socket::handshake_type::client ? 
"N2_ConnectHandshakeUnknownError" : "N2_AcceptHandshakeUnknownError") .detail("BackgroundThread", true); h.done.sendError(connection_failed()); } } }; class SSLConnection final : public IConnection, ReferenceCounted { public: void addref() override { ReferenceCounted::addref(); } void delref() override { ReferenceCounted::delref(); } void close() override { closeSocket(); } explicit SSLConnection( boost::asio::io_service& io_service, Reference> context ) : id(nondeterministicRandom()->randomUniqueID()), socket(io_service), ssl_sock(socket, context->mutate()), sslContext(context) { } // This is not part of the IConnection interface, because it is wrapped by INetwork::connect() ACTOR static Future> connect( boost::asio::io_service* ios, Reference> context, NetworkAddress addr ) { std::pair peerIP = std::make_pair(addr.ip, addr.port); auto iter(g_network->networkInfo.serverTLSConnectionThrottler.find(peerIP)); if(iter != g_network->networkInfo.serverTLSConnectionThrottler.end()) { if (now() < iter->second.second) { if(iter->second.first >= FLOW_KNOBS->TLS_CLIENT_CONNECTION_THROTTLE_ATTEMPTS) { TraceEvent("TLSOutgoingConnectionThrottlingWarning").suppressFor(1.0).detail("PeerIP", addr); wait(delay(FLOW_KNOBS->CONNECTION_MONITOR_TIMEOUT)); throw connection_failed(); } } else { g_network->networkInfo.serverTLSConnectionThrottler.erase(peerIP); } } state Reference self( new SSLConnection(*ios, context) ); self->peer_address = addr; try { auto to = tcpEndpoint(self->peer_address); BindPromise p("N2_ConnectError", self->id); Future onConnected = p.getFuture(); self->socket.async_connect( to, std::move(p) ); wait( onConnected ); self->init(); return self; } catch (Error& e) { // Either the connection failed, or was cancelled by the caller self->closeSocket(); throw; } } // This is not part of the IConnection interface, because it is wrapped by IListener::accept() void accept(NetworkAddress peerAddr) { this->peer_address = peerAddr; init(); } ACTOR static void doAcceptHandshake( Reference self, Promise connected ) { state Hold holder; try { Future onHandshook; // If the background handshakers are not all busy, use one if(N2::g_net2->sslPoolHandshakesInProgress < N2::g_net2->sslHandshakerThreadsStarted) { holder = Hold(&N2::g_net2->sslPoolHandshakesInProgress); auto handshake = new SSLHandshakerThread::Handshake(self->ssl_sock, boost::asio::ssl::stream_base::server); onHandshook = handshake->done.getFuture(); N2::g_net2->sslHandshakerPool->post(handshake); } else { // Otherwise use flow network thread BindPromise p("N2_AcceptHandshakeError", UID()); onHandshook = p.getFuture(); self->ssl_sock.async_handshake( boost::asio::ssl::stream_base::server, std::move(p) ); } wait( onHandshook ); wait(delay(0, TaskPriority::Handshake)); connected.send(Void()); } catch (...) 
{ self->closeSocket(); connected.sendError(connection_failed()); } } ACTOR static Future acceptHandshakeWrapper( Reference self ) { state std::pair peerIP; peerIP = std::make_pair(self->getPeerAddress().ip, static_cast(0)); auto iter(g_network->networkInfo.serverTLSConnectionThrottler.find(peerIP)); if(iter != g_network->networkInfo.serverTLSConnectionThrottler.end()) { if (now() < iter->second.second) { if(iter->second.first >= FLOW_KNOBS->TLS_SERVER_CONNECTION_THROTTLE_ATTEMPTS) { TraceEvent("TLSIncomingConnectionThrottlingWarning").suppressFor(1.0).detail("PeerIP", peerIP.first.toString()); wait(delay(FLOW_KNOBS->CONNECTION_MONITOR_TIMEOUT)); self->closeSocket(); throw connection_failed(); } } else { g_network->networkInfo.serverTLSConnectionThrottler.erase(peerIP); } } wait(g_network->networkInfo.handshakeLock->take()); state FlowLock::Releaser releaser(*g_network->networkInfo.handshakeLock); Promise connected; doAcceptHandshake(self, connected); try { choose { when(wait(connected.getFuture())) { return Void(); } when(wait(delay(FLOW_KNOBS->CONNECTION_MONITOR_TIMEOUT))) { throw connection_failed(); } } } catch (Error& e) { if(e.code() != error_code_actor_cancelled) { auto iter(g_network->networkInfo.serverTLSConnectionThrottler.find(peerIP)); if(iter != g_network->networkInfo.serverTLSConnectionThrottler.end()) { iter->second.first++; } else { g_network->networkInfo.serverTLSConnectionThrottler[peerIP] = std::make_pair(0,now() + FLOW_KNOBS->TLS_SERVER_CONNECTION_THROTTLE_TIMEOUT); } } // Either the connection failed, or was cancelled by the caller self->closeSocket(); throw; } } Future acceptHandshake() override { return acceptHandshakeWrapper( Reference::addRef(this) ); } ACTOR static void doConnectHandshake( Reference self, Promise connected) { state Hold holder; try { Future onHandshook; // If the background handshakers are not all busy, use one if(N2::g_net2->sslPoolHandshakesInProgress < N2::g_net2->sslHandshakerThreadsStarted) { holder = Hold(&N2::g_net2->sslPoolHandshakesInProgress); auto handshake = new SSLHandshakerThread::Handshake(self->ssl_sock, boost::asio::ssl::stream_base::client); onHandshook = handshake->done.getFuture(); N2::g_net2->sslHandshakerPool->post(handshake); } else { // Otherwise use flow network thread BindPromise p("N2_ConnectHandshakeError", self->id); onHandshook = p.getFuture(); self->ssl_sock.async_handshake( boost::asio::ssl::stream_base::client, std::move(p) ); } wait( onHandshook ); wait(delay(0, TaskPriority::Handshake)); connected.send(Void()); } catch (...) 
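// Any failure during the client-side handshake ends up here: close the socket and
// surface connection_failed through the 'connected' promise to connectHandshakeWrapper().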
{ self->closeSocket(); connected.sendError(connection_failed()); } } ACTOR static Future connectHandshakeWrapper( Reference self ) { wait(g_network->networkInfo.handshakeLock->take()); state FlowLock::Releaser releaser(*g_network->networkInfo.handshakeLock); Promise connected; doConnectHandshake(self, connected); try { choose { when(wait(connected.getFuture())) { return Void(); } when(wait(delay(FLOW_KNOBS->CONNECTION_MONITOR_TIMEOUT))) { throw connection_failed(); } } } catch (Error& e) { // Either the connection failed, or was cancelled by the caller if(e.code() != error_code_actor_cancelled) { std::pair peerIP = std::make_pair(self->peer_address.ip, self->peer_address.port); auto iter(g_network->networkInfo.serverTLSConnectionThrottler.find(peerIP)); if(iter != g_network->networkInfo.serverTLSConnectionThrottler.end()) { iter->second.first++; } else { g_network->networkInfo.serverTLSConnectionThrottler[peerIP] = std::make_pair(0,now() + FLOW_KNOBS->TLS_CLIENT_CONNECTION_THROTTLE_TIMEOUT); } } self->closeSocket(); throw; } } Future connectHandshake() override { return connectHandshakeWrapper( Reference::addRef(this) ); } // returns when write() can write at least one byte Future onWritable() override { ++g_net2->countWriteProbes; BindPromise p("N2_WriteProbeError", id); auto f = p.getFuture(); socket.async_write_some( boost::asio::null_buffers(), std::move(p) ); return f; } // returns when read() can read at least one byte Future onReadable() override { ++g_net2->countReadProbes; BindPromise p("N2_ReadProbeError", id); auto f = p.getFuture(); socket.async_read_some( boost::asio::null_buffers(), std::move(p) ); return f; } // Reads as many bytes as possible from the read buffer into [begin,end) and returns the number of bytes read (might be 0) int read( uint8_t* begin, uint8_t* end ) override { boost::system::error_code err; ++g_net2->countReads; size_t toRead = end-begin; size_t size = ssl_sock.read_some( boost::asio::mutable_buffers_1(begin, toRead), err ); g_net2->bytesReceived += size; //TraceEvent("ConnRead", this->id).detail("Bytes", size); if (err) { if (err == boost::asio::error::would_block) { ++g_net2->countWouldBlock; return 0; } onReadError(err); throw connection_failed(); } ASSERT( size ); // If the socket is closed, we expect an 'eof' error, not a zero return value return size; } // Writes as many bytes as possible from the given SendBuffer chain into the write buffer and returns the number of bytes written (might be 0) int write( SendBuffer const* data, int limit ) override { #ifdef __APPLE__ // For some reason, writing ssl_sock with more than 2016 bytes when socket is writeable sometimes results in a broken pipe error. limit = std::min(limit, 2016); #endif boost::system::error_code err; ++g_net2->countWrites; size_t sent = ssl_sock.write_some( boost::iterator_range(SendBufferIterator(data, limit), SendBufferIterator()), err ); if (err) { // Since there was an error, sent's value can't be used to infer that the buffer has data and the limit is positive so check explicitly. ASSERT(limit > 0); bool notEmpty = false; for(auto p = data; p; p = p->next) if(p->bytes_written - p->bytes_sent > 0) { notEmpty = true; break; } ASSERT(notEmpty); if (err == boost::asio::error::would_block) { ++g_net2->countWouldBlock; return 0; } onWriteError(err); throw connection_failed(); } ASSERT( sent ); // Make sure data was sent, and also this check will fail if the buffer chain was empty or the limit was not > 0. 
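// Note: 'sent' may cover only part of the buffer chain; callers are expected to treat this as a
// partial write and retry the remainder after onWritable() fires again.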
return sent; } NetworkAddress getPeerAddress() const override { return peer_address; } UID getDebugID() const override { return id; } tcp::socket& getSocket() { return socket; } ssl_socket& getSSLSocket() { return ssl_sock; } private: UID id; tcp::socket socket; ssl_socket ssl_sock; NetworkAddress peer_address; Reference> sslContext; void init() { // Socket settings that have to be set after connect or accept succeeds socket.non_blocking(true); socket.set_option(boost::asio::ip::tcp::no_delay(true)); platform::setCloseOnExec(socket.native_handle()); } void closeSocket() { boost::system::error_code cancelError; socket.cancel(cancelError); boost::system::error_code closeError; socket.close(closeError); boost::system::error_code shutdownError; ssl_sock.shutdown(shutdownError); } void onReadError( const boost::system::error_code& error ) { TraceEvent(SevWarn, "N2_ReadError", id).suppressFor(1.0).detail("ErrorCode", error.value()).detail("Message", error.message()); closeSocket(); } void onWriteError( const boost::system::error_code& error ) { TraceEvent(SevWarn, "N2_WriteError", id).suppressFor(1.0).detail("ErrorCode", error.value()).detail("Message", error.message()); closeSocket(); } }; class SSLListener final : public IListener, ReferenceCounted { boost::asio::io_context& io_service; NetworkAddress listenAddress; tcp::acceptor acceptor; AsyncVar>> *contextVar; public: SSLListener( boost::asio::io_context& io_service, AsyncVar>>* contextVar, NetworkAddress listenAddress ) : io_service(io_service), listenAddress(listenAddress), acceptor( io_service, tcpEndpoint( listenAddress ) ), contextVar(contextVar) { platform::setCloseOnExec(acceptor.native_handle()); } void addref() override { ReferenceCounted::addref(); } void delref() override { ReferenceCounted::delref(); } // Returns one incoming connection when it is available Future> accept() override { return doAccept( this ); } NetworkAddress getListenAddress() const override { return listenAddress; } private: ACTOR static Future> doAccept( SSLListener* self ) { state Reference conn( new SSLConnection( self->io_service, self->contextVar->get() ) ); state tcp::acceptor::endpoint_type peer_endpoint; try { BindPromise p("N2_AcceptError", UID()); auto f = p.getFuture(); self->acceptor.async_accept( conn->getSocket(), peer_endpoint, std::move(p) ); wait( f ); auto peer_address = peer_endpoint.address().is_v6() ? IPAddress(peer_endpoint.address().to_v6().to_bytes()) : IPAddress(peer_endpoint.address().to_v4().to_ulong()); conn->accept(NetworkAddress(peer_address, peer_endpoint.port(), false, true)); return conn; } catch (...) 
{ conn->close(); throw; } } }; #endif struct PromiseTask : public Task, public FastAllocated { Promise promise; PromiseTask() {} explicit PromiseTask(Promise&& promise) noexcept : promise(std::move(promise)) {} virtual void operator()() { promise.send(Void()); delete this; } }; // 5MB for loading files into memory Net2::Net2(const TLSConfig& tlsConfig, bool useThreadPool, bool useMetrics) : useThreadPool(useThreadPool), network(this), reactor(this), stopped(false), tasksIssued(0), // Until run() is called, yield() will always yield tscBegin(0), tscEnd(0), taskBegin(0), currentTaskID(TaskPriority::DefaultYield), numYields(0), lastPriorityStats(nullptr), tlsInitializedState(ETLSInitState::NONE), tlsConfig(tlsConfig), started(false) #ifndef TLS_DISABLED ,sslContextVar({ReferencedObject::from(boost::asio::ssl::context(boost::asio::ssl::context::tls))}), sslPoolHandshakesInProgress(0), sslHandshakerThreadsStarted(0) #endif { TraceEvent("Net2Starting"); // Set the global members if(useMetrics) { setGlobal(INetwork::enTDMetrics, (flowGlobalType) &tdmetrics); } setGlobal(INetwork::enNetworkConnections, (flowGlobalType) network); setGlobal(INetwork::enASIOService, (flowGlobalType) &reactor.ios); setGlobal(INetwork::enBlobCredentialFiles, &blobCredentialFiles); #ifdef __linux__ setGlobal(INetwork::enEventFD, (flowGlobalType) N2::ASIOReactor::newEventFD(reactor)); #endif updateNow(); } #ifndef TLS_DISABLED ACTOR static Future watchFileForChanges( std::string filename, AsyncTrigger* fileChanged ) { if (filename == "") { return Never(); } state bool firstRun = true; state bool statError = false; state std::time_t lastModTime = 0; loop { try { std::time_t modtime = wait(IAsyncFileSystem::filesystem()->lastWriteTime(filename)); if (firstRun) { lastModTime = modtime; firstRun = false; } if (lastModTime != modtime || statError) { lastModTime = modtime; statError = false; fileChanged->trigger(); } } catch (Error& e) { if (e.code() == error_code_io_error) { // EACCES, ELOOP, ENOENT all come out as io_error(), but are more of a system // configuration issue than an FDB problem. If we managed to load valid // certificates, then there's no point in crashing, but we should complain // loudly. IAsyncFile will log the error, but not necessarily as a warning. TraceEvent(SevWarnAlways, "TLSCertificateRefreshStatError").detail("File", filename); statError = true; } else { throw; } } wait(delay(FLOW_KNOBS->TLS_CERT_REFRESH_DELAY_SECONDS)); } } ACTOR static Future reloadCertificatesOnChange( TLSConfig config, std::function onPolicyFailure, AsyncVar>>* contextVar ) { if (FLOW_KNOBS->TLS_CERT_REFRESH_DELAY_SECONDS <= 0) { return Void(); } loop { // Early in bootup, the filesystem might not be initialized yet. Wait until it is. 
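// IAsyncFileSystem::filesystem() stays null until the async file layer is initialized, so poll
// once per second rather than failing the certificate-refresh actor outright.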
if (IAsyncFileSystem::filesystem() != nullptr) { break; } wait(delay(1.0)); } state int mismatches = 0; state AsyncTrigger fileChanged; state std::vector> lifetimes; lifetimes.push_back(watchFileForChanges(config.getCertificatePathSync(), &fileChanged)); lifetimes.push_back(watchFileForChanges(config.getKeyPathSync(), &fileChanged)); lifetimes.push_back(watchFileForChanges(config.getCAPathSync(), &fileChanged)); loop { wait( fileChanged.onTrigger() ); TraceEvent("TLSCertificateRefreshBegin"); try { LoadedTLSConfig loaded = wait( config.loadAsync() ); boost::asio::ssl::context context(boost::asio::ssl::context::tls); ConfigureSSLContext(loaded, &context, onPolicyFailure); TraceEvent(SevInfo, "TLSCertificateRefreshSucceeded"); mismatches = 0; contextVar->set(ReferencedObject::from(std::move(context))); } catch (Error &e) { if (e.code() == error_code_actor_cancelled) { throw; } // Some files didn't match up, they should in the future, and we'll retry then. mismatches++; TraceEvent(SevWarn, "TLSCertificateRefreshMismatch").error(e).detail("mismatches", mismatches); } } } #endif void Net2::initTLS(ETLSInitState targetState) { if(tlsInitializedState >= targetState) { return; } #ifndef TLS_DISABLED // Any target state must be higher than NONE so if the current state is NONE // then initialize the TLS config if(tlsInitializedState == ETLSInitState::NONE) { auto onPolicyFailure = [this]() { this->countTLSPolicyFailures++; }; try { boost::asio::ssl::context newContext(boost::asio::ssl::context::tls); const LoadedTLSConfig& loaded = tlsConfig.loadSync(); TraceEvent("Net2TLSConfig") .detail("CAPath", tlsConfig.getCAPathSync()) .detail("CertificatePath", tlsConfig.getCertificatePathSync()) .detail("KeyPath", tlsConfig.getKeyPathSync()) .detail("HasPassword", !loaded.getPassword().empty()) .detail("VerifyPeers", boost::algorithm::join(loaded.getVerifyPeers(), "|")); ConfigureSSLContext( tlsConfig.loadSync(), &newContext, onPolicyFailure ); sslContextVar.set(ReferencedObject::from(std::move(newContext))); } catch (Error& e) { TraceEvent("Net2TLSInitError").error(e); } backgroundCertRefresh = reloadCertificatesOnChange( tlsConfig, onPolicyFailure, &sslContextVar ); } // If a TLS connection is actually going to be used then start background threads if configured if(targetState > ETLSInitState::CONFIG) { int threadsToStart; switch(targetState) { case ETLSInitState::CONNECT: threadsToStart = FLOW_KNOBS->TLS_CLIENT_HANDSHAKE_THREADS; break; case ETLSInitState::LISTEN: threadsToStart = FLOW_KNOBS->TLS_SERVER_HANDSHAKE_THREADS; break; default: threadsToStart = 0; }; threadsToStart -= sslHandshakerThreadsStarted; if(threadsToStart > 0) { if(sslHandshakerThreadsStarted == 0) { #if defined(__linux__) if(mallopt(M_ARENA_MAX, FLOW_KNOBS->TLS_MALLOC_ARENA_MAX) != 1) { TraceEvent(SevWarn, "TLSMallocSetMaxArenasFailure").detail("MaxArenas", FLOW_KNOBS->TLS_MALLOC_ARENA_MAX); }; #endif sslHandshakerPool = createGenericThreadPool(FLOW_KNOBS->TLS_HANDSHAKE_THREAD_STACKSIZE); } for(int i = 0; i < threadsToStart; ++i) { ++sslHandshakerThreadsStarted; sslHandshakerPool->addThread(new SSLHandshakerThread()); } } } #endif tlsInitializedState = targetState; } ACTOR Future Net2::logTimeOffset() { loop { double processTime = timer_monotonic(); double systemTime = timer(); TraceEvent("ProcessTimeOffset").detailf("ProcessTime", "%lf", processTime).detailf("SystemTime", "%lf", systemTime).detailf("OffsetFromSystemTime", "%lf", processTime - systemTime); wait(::delay(FLOW_KNOBS->TIME_OFFSET_LOGGING_INTERVAL)); } } void 
Net2::initMetrics() { bytesReceived.init(LiteralStringRef("Net2.BytesReceived")); countWriteProbes.init(LiteralStringRef("Net2.CountWriteProbes")); countReadProbes.init(LiteralStringRef("Net2.CountReadProbes")); countReads.init(LiteralStringRef("Net2.CountReads")); countWouldBlock.init(LiteralStringRef("Net2.CountWouldBlock")); countWrites.init(LiteralStringRef("Net2.CountWrites")); countRunLoop.init(LiteralStringRef("Net2.CountRunLoop")); countCantSleep.init(LiteralStringRef("Net2.CountCantSleep")); countWontSleep.init(LiteralStringRef("Net2.CountWontSleep")); countTimers.init(LiteralStringRef("Net2.CountTimers")); countTasks.init(LiteralStringRef("Net2.CountTasks")); countYields.init(LiteralStringRef("Net2.CountYields")); countYieldBigStack.init(LiteralStringRef("Net2.CountYieldBigStack")); countYieldCalls.init(LiteralStringRef("Net2.CountYieldCalls")); countASIOEvents.init(LiteralStringRef("Net2.CountASIOEvents")); countYieldCallsTrue.init(LiteralStringRef("Net2.CountYieldCallsTrue")); countRunLoopProfilingSignals.init(LiteralStringRef("Net2.CountRunLoopProfilingSignals")); countTLSPolicyFailures.init(LiteralStringRef("Net2.CountTLSPolicyFailures")); priorityMetric.init(LiteralStringRef("Net2.Priority")); awakeMetric.init(LiteralStringRef("Net2.Awake")); slowTaskMetric.init(LiteralStringRef("Net2.SlowTask")); countLaunchTime.init(LiteralStringRef("Net2.CountLaunchTime")); countReactTime.init(LiteralStringRef("Net2.CountReactTime")); } bool Net2::checkRunnable() { return !started.exchange(true); } void Net2::run() { TraceEvent::setNetworkThread(); TraceEvent("Net2Running"); thread_network = this; #ifdef WIN32 if (timeBeginPeriod(1) != TIMERR_NOERROR) TraceEvent(SevError, "TimeBeginPeriodError"); #endif timeOffsetLogger = logTimeOffset(); const char *flow_profiler_enabled = getenv("FLOW_PROFILER_ENABLED"); if (flow_profiler_enabled != nullptr && *flow_profiler_enabled != '\0') { // The empty string check is to allow running `FLOW_PROFILER_ENABLED= ./fdbserver` to force disabling flow profiling at startup. startProfiling(this); } // Get the address to the launch function typedef void (*runCycleFuncPtr)(); runCycleFuncPtr runFunc = reinterpret_cast(reinterpret_cast(g_network->global(INetwork::enRunCycleFunc))); started.store(true); double nnow = timer_monotonic(); while(!stopped) { FDB_TRACE_PROBE(run_loop_begin); ++countRunLoop; if (runFunc) { tscBegin = timestampCounter(); taskBegin = nnow; trackAtPriority(TaskPriority::RunCycleFunction, taskBegin); runFunc(); double taskEnd = timer_monotonic(); trackAtPriority(TaskPriority::RunLoop, taskEnd); countLaunchTime += taskEnd - taskBegin; checkForSlowTask(tscBegin, timestampCounter(), taskEnd - taskBegin, TaskPriority::RunCycleFunction); } double sleepTime = 0; bool b = ready.empty(); if (b) { b = threadReady.canSleep(); if (!b) ++countCantSleep; } else ++countWontSleep; if (b) { sleepTime = 1e99; double sleepStart = timer_monotonic(); if (!timers.empty()) { sleepTime = timers.top().at - sleepStart; // + 500e-6? 
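// (With no pending timers sleepTime stays at 1e99, so ASIOReactor::sleep() below simply
// blocks in the reactor until I/O completes or another thread calls wake().)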
} if (sleepTime > 0) { #if defined(__linux__) // notify the run loop monitoring thread that we have gone idle net2RunLoopSleeps.fetch_add(1); #endif trackAtPriority(TaskPriority::Zero, sleepStart); awakeMetric = false; priorityMetric = 0; reactor.sleep(sleepTime); awakeMetric = true; } } tscBegin = timestampCounter(); taskBegin = timer_monotonic(); trackAtPriority(TaskPriority::ASIOReactor, taskBegin); reactor.react(); updateNow(); double now = this->currentTime; trackAtPriority(TaskPriority::RunLoop, now); countReactTime += now - taskBegin; checkForSlowTask(tscBegin, timestampCounter(), now - taskBegin, TaskPriority::ASIOReactor); if ((now-nnow) > FLOW_KNOBS->SLOW_LOOP_CUTOFF && nondeterministicRandom()->random01() < (now-nnow)*FLOW_KNOBS->SLOW_LOOP_SAMPLING_RATE) TraceEvent("SomewhatSlowRunLoopTop").detail("Elapsed", now - nnow); int numTimers = 0; while (!timers.empty() && timers.top().at < now) { ++numTimers; ++countTimers; ready.push( timers.top() ); timers.pop(); } countTimers += numTimers; FDB_TRACE_PROBE(run_loop_ready_timers, numTimers); processThreadReady(); tscBegin = timestampCounter(); tscEnd = tscBegin + FLOW_KNOBS->TSC_YIELD_TIME; taskBegin = timer_monotonic(); numYields = 0; TaskPriority minTaskID = TaskPriority::Max; int queueSize = ready.size(); FDB_TRACE_PROBE(run_loop_tasks_start, queueSize); while (!ready.empty()) { ++countTasks; currentTaskID = ready.top().taskID; if(currentTaskID < minTaskID) { trackAtPriority(currentTaskID, taskBegin); minTaskID = currentTaskID; } priorityMetric = static_cast(currentTaskID); Task* task = ready.top().task; ready.pop(); try { (*task)(); } catch (Error& e) { TraceEvent(SevError, "TaskError").error(e); } catch (...) { TraceEvent(SevError, "TaskError").error(unknown_error()); } double tscNow = timestampCounter(); double newTaskBegin = timer_monotonic(); if (check_yield(TaskPriority::Max, tscNow)) { checkForSlowTask(tscBegin, tscNow, newTaskBegin - taskBegin, currentTaskID); FDB_TRACE_PROBE(run_loop_yield); ++countYields; break; } taskBegin = newTaskBegin; tscBegin = tscNow; } trackAtPriority(TaskPriority::RunLoop, taskBegin); queueSize = ready.size(); FDB_TRACE_PROBE(run_loop_done, queueSize); #if defined(__linux__) if(FLOW_KNOBS->RUN_LOOP_PROFILING_INTERVAL > 0) { sigset_t orig_set; pthread_sigmask(SIG_BLOCK, &sigprof_set, &orig_set); size_t other_offset = net2backtraces_offset; bool was_overflow = net2backtraces_overflow; int signal_count = net2backtraces_count; countRunLoopProfilingSignals += signal_count; if (other_offset) { volatile void** _traces = net2backtraces; net2backtraces = other_backtraces; other_backtraces = _traces; net2backtraces_offset = 0; } net2backtraces_overflow = false; net2backtraces_count = 0; pthread_sigmask(SIG_SETMASK, &orig_set, nullptr); if (was_overflow) { TraceEvent("Net2RunLoopProfilerOverflow") .detail("SignalsReceived", signal_count) .detail("BackTraceHarvested", other_offset != 0); } if (other_offset) { size_t iter_offset = 0; while (iter_offset < other_offset) { ProfilingSample *ps = (ProfilingSample *)(other_backtraces + iter_offset); TraceEvent(SevWarn, "Net2RunLoopTrace").detailf("TraceTime", "%.6f", ps->timestamp).detail("Trace", platform::format_backtrace(ps->frames, ps->length)); iter_offset += ps->length + 2; } } // notify the run loop monitoring thread that we are making progress net2RunLoopIterations.fetch_add(1); } #endif nnow = timer_monotonic(); if ((nnow-now) > FLOW_KNOBS->SLOW_LOOP_CUTOFF && nondeterministicRandom()->random01() < (nnow-now)*FLOW_KNOBS->SLOW_LOOP_SAMPLING_RATE) 
TraceEvent("SomewhatSlowRunLoopBottom").detail("Elapsed", nnow - now); // This includes the time spent running tasks } for ( auto& fn : stopCallbacks ) { fn(); } #ifdef WIN32 timeEndPeriod(1); #endif } void Net2::trackAtPriority( TaskPriority priority, double now ) { if (lastPriorityStats == nullptr || priority != lastPriorityStats->priority) { // Start tracking current priority auto activeStatsItr = networkInfo.metrics.activeTrackers.try_emplace(priority, priority); activeStatsItr.first->second.active = true; activeStatsItr.first->second.windowedTimer = now; if(lastPriorityStats != nullptr) { // Stop tracking previous priority lastPriorityStats->active = false; lastPriorityStats->duration += now - lastPriorityStats->windowedTimer; } // Update starvation trackers TaskPriority lastPriority = (lastPriorityStats == nullptr) ? TaskPriority::Zero : lastPriorityStats->priority; for(auto& binStats : networkInfo.metrics.starvationTrackers) { if(binStats.priority > lastPriority && binStats.priority > priority) { break; } // Busy -> idle at binStats.priority if(binStats.priority > priority && binStats.priority <= lastPriority) { binStats.active = false; binStats.duration += now - binStats.windowedTimer; binStats.maxDuration = std::max(binStats.maxDuration, now - binStats.timer); } // Idle -> busy at binStats.priority else if(binStats.priority <= priority && binStats.priority > lastPriority) { binStats.active = true; binStats.timer = now; binStats.windowedTimer = now; } } lastPriorityStats = &activeStatsItr.first->second; } } void Net2::processThreadReady() { int numReady = 0; while (true) { Optional t = threadReady.pop(); if (!t.present()) break; t.get().priority -= ++tasksIssued; ASSERT( t.get().task != 0 ); ready.push( t.get() ); ++numReady; } FDB_TRACE_PROBE(run_loop_thread_ready, numReady); } void Net2::checkForSlowTask(int64_t tscBegin, int64_t tscEnd, double duration, TaskPriority priority) { int64_t elapsed = tscEnd-tscBegin; if (elapsed > FLOW_KNOBS->TSC_YIELD_TIME && tscBegin > 0) { int i = std::min(NetworkMetrics::SLOW_EVENT_BINS-1, log( elapsed/1e6 ) / log(2.)); ++networkInfo.metrics.countSlowEvents[i]; int64_t warnThreshold = g_network->isSimulated() ? 10e9 : 500e6; //printf("SlowTask: %d, %d yields\n", (int)(elapsed/1e6), numYields); slowTaskMetric->clocks = elapsed; slowTaskMetric->duration = (int64_t)(duration*1e9); slowTaskMetric->priority = static_cast(priority); slowTaskMetric->numYields = numYields; slowTaskMetric->log(); double sampleRate = std::min(1.0, (elapsed > warnThreshold) ? 1.0 : elapsed / 10e9); double slowTaskProfilingLogInterval = std::max(FLOW_KNOBS->RUN_LOOP_PROFILING_INTERVAL, FLOW_KNOBS->SLOWTASK_PROFILING_LOG_INTERVAL); if(slowTaskProfilingLogInterval > 0 && duration > slowTaskProfilingLogInterval) { sampleRate = 1; // Always include slow task events that could show up in our slow task profiling. } if ( !DEBUG_DETERMINISM && (nondeterministicRandom()->random01() < sampleRate )) TraceEvent(elapsed > warnThreshold ? SevWarnAlways : SevInfo, "SlowTask").detail("TaskID", priority).detail("MClocks", elapsed/1e6).detail("Duration", duration).detail("SampleRate", sampleRate).detail("NumYields", numYields); } } bool Net2::check_yield( TaskPriority taskID, int64_t tscNow ) { // SOMEDAY: Yield if there are lots of higher priority tasks queued? 
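// &taskID approximates the current stack pointer; if it has crossed g_stackYieldLimit the stack
// has grown unusually deep, so count the event and ask the caller to yield.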
if ((g_stackYieldLimit) && ( (intptr_t)&taskID < g_stackYieldLimit )) { ++countYieldBigStack; return true; } processThreadReady(); if (taskID == TaskPriority::DefaultYield) taskID = currentTaskID; if (!ready.empty() && ready.top().priority > int64_t(taskID)<<32) { return true; } if (tscNow < tscBegin) { return true; } if (tscNow > tscEnd) { ++numYields; return true; } return false; } bool Net2::check_yield( TaskPriority taskID ) { if(numYields > 0) { ++numYields; return true; } return check_yield(taskID, timestampCounter()); } Future Net2::yield( TaskPriority taskID ) { ++countYieldCalls; if (taskID == TaskPriority::DefaultYield) taskID = currentTaskID; if (check_yield(taskID)) { ++countYieldCallsTrue; return delay(0, taskID); } g_network->setCurrentTask(taskID); return Void(); } Future Net2::delay( double seconds, TaskPriority taskId ) { if (seconds <= 0.) { PromiseTask* t = new PromiseTask; this->ready.push( OrderedTask( (int64_t(taskId)<<32)-(++tasksIssued), taskId, t) ); return t->promise.getFuture(); } if (seconds >= 4e12) // Intervals that overflow an int64_t in microseconds (more than 100,000 years) are treated as infinite return Never(); double at = now() + seconds; PromiseTask* t = new PromiseTask; this->timers.push( DelayedTask( at, (int64_t(taskId)<<32)-(++tasksIssued), taskId, t ) ); return t->promise.getFuture(); } void Net2::onMainThread(Promise&& signal, TaskPriority taskID) { if (stopped) return; PromiseTask* p = new PromiseTask( std::move(signal) ); int64_t priority = int64_t(taskID)<<32; if ( thread_network == this ) { processThreadReady(); this->ready.push( OrderedTask( priority-(++tasksIssued), taskID, p ) ); } else { if (threadReady.push( OrderedTask( priority, taskID, p ) )) reactor.wake(); } } THREAD_HANDLE Net2::startThread( THREAD_FUNC_RETURN (*func) (void*), void *arg ) { return ::startThread(func, arg); } Future< Reference > Net2::connect( NetworkAddress toAddr, std::string host ) { #ifndef TLS_DISABLED initTLS(ETLSInitState::CONNECT); if ( toAddr.isTLS() ) { return SSLConnection::connect(&this->reactor.ios, this->sslContextVar.get(), toAddr); } #endif return Connection::connect(&this->reactor.ios, toAddr); } ACTOR static Future> resolveTCPEndpoint_impl( Net2 *self, std::string host, std::string service) { state tcp::resolver tcpResolver(self->reactor.ios); Promise> promise; state Future> result = promise.getFuture(); tcpResolver.async_resolve(tcp::resolver::query(host, service), [=](const boost::system::error_code &ec, tcp::resolver::iterator iter) { if(ec) { promise.sendError(lookup_failed()); return; } std::vector addrs; tcp::resolver::iterator end; while(iter != end) { auto endpoint = iter->endpoint(); auto addr = endpoint.address(); if (addr.is_v6()) { addrs.push_back(NetworkAddress(IPAddress(addr.to_v6().to_bytes()), endpoint.port())); } else { addrs.push_back(NetworkAddress(addr.to_v4().to_ulong(), endpoint.port())); } ++iter; } if(addrs.empty()) { promise.sendError(lookup_failed()); } else { promise.send(addrs); } }); wait(ready(result)); tcpResolver.cancel(); return result.get(); } Future> Net2::createUDPSocket(NetworkAddress toAddr) { return UDPSocket::connect(&reactor.ios, toAddr, toAddr.ip.isV6()); } Future> Net2::createUDPSocket(bool isV6) { return UDPSocket::connect(&reactor.ios, Optional(), isV6); } Future> Net2::resolveTCPEndpoint( std::string host, std::string service) { return resolveTCPEndpoint_impl(this, host, service); } bool Net2::isAddressOnThisHost(NetworkAddress const& addr) const { auto it = addressOnHostCache.find( addr.ip ); if (it != 
addressOnHostCache.end()) return it->second; if (addressOnHostCache.size() > 50000) addressOnHostCache.clear(); // Bound cache memory; should not really happen try { boost::asio::io_service ioService; boost::asio::ip::udp::socket socket(ioService); boost::asio::ip::udp::endpoint endpoint(tcpAddress(addr.ip), 1); socket.connect(endpoint); bool local = addr.ip.isV6() ? socket.local_endpoint().address().to_v6().to_bytes() == addr.ip.toV6() : socket.local_endpoint().address().to_v4().to_ulong() == addr.ip.toV4(); socket.close(); if (local) TraceEvent(SevInfo, "AddressIsOnHost").detail("Address", addr); return addressOnHostCache[ addr.ip ] = local; } catch(boost::system::system_error e) { TraceEvent(SevWarnAlways, "IsAddressOnHostError").detail("Address", addr).detail("ErrDesc", e.what()).detail("ErrCode", e.code().value()); return addressOnHostCache[ addr.ip ] = false; } } Reference Net2::listen( NetworkAddress localAddr ) { try { #ifndef TLS_DISABLED initTLS(ETLSInitState::LISTEN); if ( localAddr.isTLS() ) { return Reference(new SSLListener( reactor.ios, &this->sslContextVar, localAddr )); } #endif return Reference( new Listener( reactor.ios, localAddr ) ); } catch (boost::system::system_error const& e) { Error x; if(e.code().value() == EADDRINUSE) x = address_in_use(); else if(e.code().value() == EADDRNOTAVAIL) x = invalid_local_address(); else x = bind_failed(); TraceEvent("Net2ListenError").error(x).detail("Message", e.what()); throw x; } catch (std::exception const& e) { Error x = unknown_error(); TraceEvent("Net2ListenError").error(x).detail("Message", e.what()); throw x; } catch (Error &e ) { TraceEvent("Net2ListenError").error(e); throw e; } catch (...) { Error x = unknown_error(); TraceEvent("Net2ListenError").error(x); throw x; } } void Net2::getDiskBytes(std::string const& directory, int64_t& free, int64_t& total) { return ::getDiskBytes(directory, free, total); } #ifdef __linux__ #include #include #include #endif ASIOReactor::ASIOReactor(Net2* net) : network(net), firstTimer(ios), do_not_stop(ios) { #ifdef __linux__ // Reactor flags are used only for experimentation, and are platform-specific if (FLOW_KNOBS->REACTOR_FLAGS & 1) { prctl(PR_SET_TIMERSLACK, 1, 0, 0, 0); printf("Set timerslack to 1ns\n"); } if (FLOW_KNOBS->REACTOR_FLAGS & 2) { int ret; pthread_t this_thread = pthread_self(); struct sched_param params; params.sched_priority = sched_get_priority_max(SCHED_FIFO); ret = pthread_setschedparam(this_thread, SCHED_FIFO, ¶ms); if (ret != 0) printf("Error setting priority (%d %d)\n", ret, errno); else printf("Set scheduler mode to SCHED_FIFO\n"); } #endif } void ASIOReactor::sleep(double sleepTime) { if (sleepTime > FLOW_KNOBS->BUSY_WAIT_THRESHOLD) { if (FLOW_KNOBS->REACTOR_FLAGS & 4) { #ifdef __linux timespec tv; tv.tv_sec = 0; tv.tv_nsec = 20000; nanosleep(&tv, nullptr); #endif } else { sleepTime -= FLOW_KNOBS->BUSY_WAIT_THRESHOLD; if (sleepTime < 4e12) { this->firstTimer.expires_from_now(boost::posix_time::microseconds(int64_t(sleepTime*1e6))); this->firstTimer.async_wait(&nullWaitHandler); } setProfilingEnabled(0); // The following line generates false positives for slow task profiling ios.run_one(); setProfilingEnabled(1); this->firstTimer.cancel(); } ++network->countASIOEvents; } else if (sleepTime > 0) { if (!(FLOW_KNOBS->REACTOR_FLAGS & 8)) threadYield(); } } void ASIOReactor::react() { while (ios.poll_one()) ++network->countASIOEvents; // Make this a task? 
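// poll_one() only runs handlers that are already ready; react() never blocks here.
// Blocking happens only in ASIOReactor::sleep() above.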
} void ASIOReactor::wake() { ios.post( nullCompletionHandler ); } } // namespace net2 INetwork* newNet2(const TLSConfig& tlsConfig, bool useThreadPool, bool useMetrics) { try { N2::g_net2 = std::make_unique(tlsConfig, useThreadPool, useMetrics); } catch(boost::system::system_error e) { TraceEvent("Net2InitError").detail("Message", e.what()); throw unknown_error(); } catch(std::exception const& e) { TraceEvent("Net2InitError").detail("Message", e.what()); throw unknown_error(); } return N2::g_net2.get(); } struct TestGVR { Standalone key; int64_t version; Optional> debugID; Promise< Optional> > reply; TestGVR(){} template void serialize( Ar& ar ) { serializer(ar, key, version, debugID, reply); } }; template void startThreadF( F && func ) { struct Thing { F f; Thing( F && f ) : f(std::move(f)) {} THREAD_FUNC start(void* p) { Thing* self = (Thing*)p; self->f(); delete self; THREAD_RETURN; } }; Thing* t = new Thing(std::move(func)); startThread(Thing::start, t); } void net2_test() { /*printf("ThreadSafeQueue test\n"); printf(" Interface: "); ThreadSafeQueue tq; ASSERT( tq.canSleep() == true ); ASSERT( tq.push( 1 ) == true ) ; ASSERT( tq.push( 2 ) == false ); ASSERT( tq.push( 3 ) == false ); ASSERT( tq.pop().get() == 1 ); ASSERT( tq.pop().get() == 2 ); ASSERT( tq.push( 4 ) == false ); ASSERT( tq.pop().get() == 3 ); ASSERT( tq.pop().get() == 4 ); ASSERT( !tq.pop().present() ); printf("OK\n"); printf("Threaded: "); Event finished, finished2; int thread1Iterations = 1000000, thread2Iterations = 100000; if (thread1Iterations) startThreadF([&](){ printf("Thread1\n"); for(int i=0; i i = tq.pop(); if (i.present()) { int v = i.get(); ++c; if (mx[v>>20] != v) printf("Wrong value dequeued!\n"); ASSERT( mx[v>>20] == v ); mx[v>>20] = v + 1; } else { ++p; _mm_pause(); } if ((c&3)==0) tq.canSleep(); } printf("%d %d %x %x %s\n", c, p, mx[0], mx[1], mx[0]==thread1Iterations && mx[1]==(1<<20)+thread2Iterations ? "OK" : "FAIL"); finished.block(); finished2.block(); g_network = newNet2(); // for promise serialization below Endpoint destination; printf(" Used: %lld\n", FastAllocator<4096>::getTotalMemory()); char junk[100]; double before = timer(); vector reqs; reqs.reserve( 10000 ); int totalBytes = 0; for(int j=0; j<1000; j++) { UnsentPacketQueue unsent; ReliablePacketList reliable; reqs.resize(10000); for(int i=0; i<10000; i++) { TestGVR &req = reqs[i]; req.key = LiteralStringRef("Foobar"); SerializeSource what(req); SendBuffer* pb = unsent.getWriteBuffer(); ReliablePacket* rp = new ReliablePacket; // 0 PacketWriter wr(pb,rp,AssumeVersion(g_network->protocolVersion())); //BinaryWriter wr; SplitBuffer packetLen; uint32_t len = 0; wr.writeAhead(sizeof(len), &packetLen); wr << destination.token; //req.reply.getEndpoint(); what.serializePacketWriter(wr); //wr.serializeBytes(junk, 43); unsent.setWriteBuffer(wr.finish()); len = wr.size() - sizeof(len); packetLen.write(&len, sizeof(len)); //totalBytes += wr.getLength(); totalBytes += wr.size(); if (rp) reliable.insert(rp); } reqs.clear(); unsent.discardAll(); reliable.discardAll(); } printf("SimSend x 1Kx10K: %0.2f sec\n", timer()-before); printf(" Bytes: %d\n", totalBytes); printf(" Used: %lld\n", FastAllocator<4096>::getTotalMemory()); */ };