mirror of
https://github.com/h2o/h2o.git
synced 2025-05-19 13:22:13 +08:00
973 lines
36 KiB
C
973 lines
36 KiB
C
/*
|
|
* Copyright (c) 2014-2016 DeNA Co., Ltd., Kazuho Oku, Fastly, Inc.
|
|
*
|
|
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
* of this software and associated documentation files (the "Software"), to
|
|
* deal in the Software without restriction, including without limitation the
|
|
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
|
|
* sell copies of the Software, and to permit persons to whom the Software is
|
|
* furnished to do so, subject to the following conditions:
|
|
*
|
|
* The above copyright notice and this permission notice shall be included in
|
|
* all copies or substantial portions of the Software.
|
|
*
|
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
|
* IN THE SOFTWARE.
|
|
*/
|
|
#include <netinet/in.h>
|
|
#include <netinet/tcp.h>
|
|
#include <stdlib.h>
|
|
#include <sys/time.h>
|
|
#include <sys/uio.h>
|
|
#include <unistd.h>
|
|
#if H2O_USE_KTLS
|
|
#include <linux/tls.h>
|
|
#endif
|
|
#include "cloexec.h"
|
|
#include "h2o/linklist.h"
|
|
#if H2O_USE_IO_URING
|
|
#include "h2o/io_uring.h"
|
|
#endif
|
|
|
|
#if !defined(H2O_USE_ACCEPT4)
|
|
#ifdef __linux__
|
|
#if defined(__ANDROID__) && __ANDROID_API__ < 21
|
|
#define H2O_USE_ACCEPT4 0
|
|
#else
|
|
#define H2O_USE_ACCEPT4 1
|
|
#endif
|
|
#elif __FreeBSD__ >= 10
|
|
#define H2O_USE_ACCEPT4 1
|
|
#else
|
|
#define H2O_USE_ACCEPT4 0
|
|
#endif
|
|
#endif
|
|
|
|
struct st_h2o_evloop_socket_t {
|
|
h2o_socket_t super;
|
|
int fd;
|
|
int _flags;
|
|
h2o_evloop_t *loop;
|
|
size_t max_read_size;
|
|
struct st_h2o_evloop_socket_t *_next_pending;
|
|
struct st_h2o_evloop_socket_t *_next_statechanged;
|
|
struct {
|
|
uint64_t prev_loop;
|
|
uint64_t cur_loop;
|
|
uint64_t cur_run_count;
|
|
} bytes_written;
|
|
/**
|
|
* vector to be sent (or vec.callbacks is NULL when not used)
|
|
*/
|
|
h2o_sendvec_t sendvec;
|
|
};
|
|
|
|
static void link_to_pending(struct st_h2o_evloop_socket_t *sock);
|
|
static void link_to_statechanged(struct st_h2o_evloop_socket_t *sock);
|
|
static void write_pending(struct st_h2o_evloop_socket_t *sock);
|
|
static h2o_evloop_t *create_evloop(size_t sz);
|
|
static void update_now(h2o_evloop_t *loop);
|
|
static int32_t adjust_max_wait(h2o_evloop_t *loop, int32_t max_wait);
|
|
|
|
/* functions to be defined in the backends */
|
|
static int evloop_do_proceed(h2o_evloop_t *loop, int32_t max_wait);
|
|
static void evloop_do_dispose(h2o_evloop_t *loop);
|
|
static void evloop_do_on_socket_create(struct st_h2o_evloop_socket_t *sock);
|
|
static int evloop_do_on_socket_close(struct st_h2o_evloop_socket_t *sock);
|
|
static void evloop_do_on_socket_export(struct st_h2o_evloop_socket_t *sock);
|
|
|
|
#if H2O_USE_POLL || H2O_USE_EPOLL || H2O_USE_KQUEUE
|
|
/* explicitly specified */
|
|
#else
|
|
#if defined(__APPLE__) || defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__)
|
|
#define H2O_USE_KQUEUE 1
|
|
#elif defined(__linux)
|
|
#define H2O_USE_EPOLL 1
|
|
#if defined(SO_ZEROCOPY) && defined(SO_EE_ORIGIN_ZEROCOPY) && defined(MSG_ZEROCOPY)
|
|
#define H2O_USE_MSG_ZEROCOPY 1
|
|
#endif
|
|
#else
|
|
#define H2O_USE_POLL 1
|
|
#endif
|
|
#endif
|
|
#if !defined(H2O_USE_MSG_ZEROCOPY)
|
|
#define H2O_USE_MSG_ZEROCOPY 0
|
|
#endif
|
|
|
|
#if H2O_USE_POLL
|
|
#include "evloop/poll.c.h"
|
|
#elif H2O_USE_EPOLL
|
|
#include "evloop/epoll.c.h"
|
|
#elif H2O_USE_KQUEUE
|
|
#include "evloop/kqueue.c.h"
|
|
#else
|
|
#error "poller not specified"
|
|
#endif
|
|
|
|
size_t h2o_evloop_socket_max_read_size = 1024 * 1024; /* by default, we read up to 1MB at once */
|
|
size_t h2o_evloop_socket_max_write_size = 1024 * 1024; /* by default, we write up to 1MB at once */
|
|
|
|
void link_to_pending(struct st_h2o_evloop_socket_t *sock)
|
|
{
|
|
if (sock->_next_pending == sock) {
|
|
struct st_h2o_evloop_socket_t **slot = (sock->_flags & H2O_SOCKET_FLAG_IS_ACCEPTED_CONNECTION) != 0
|
|
? &sock->loop->_pending_as_server
|
|
: &sock->loop->_pending_as_client;
|
|
sock->_next_pending = *slot;
|
|
*slot = sock;
|
|
}
|
|
}
|
|
|
|
void link_to_statechanged(struct st_h2o_evloop_socket_t *sock)
|
|
{
|
|
if (sock->_next_statechanged == sock) {
|
|
sock->_next_statechanged = NULL;
|
|
*sock->loop->_statechanged.tail_ref = sock;
|
|
sock->loop->_statechanged.tail_ref = &sock->_next_statechanged;
|
|
}
|
|
}
|
|
|
|
static const char *on_read_core(int fd, h2o_buffer_t **input, size_t max_bytes)
|
|
{
|
|
ssize_t read_so_far = 0;
|
|
|
|
while (1) {
|
|
ssize_t rret;
|
|
h2o_iovec_t buf = h2o_buffer_try_reserve(input, max_bytes < 4096 ? max_bytes : 4096);
|
|
if (buf.base == NULL) {
|
|
/* memory allocation failed */
|
|
return h2o_socket_error_out_of_memory;
|
|
}
|
|
size_t read_size = buf.len <= INT_MAX / 2 ? buf.len : INT_MAX / 2 + 1;
|
|
if (read_size > max_bytes)
|
|
read_size = max_bytes;
|
|
while ((rret = read(fd, buf.base, read_size)) == -1 && errno == EINTR)
|
|
;
|
|
if (rret == -1) {
|
|
if (errno == EAGAIN)
|
|
break;
|
|
else
|
|
return h2o_socket_error_io;
|
|
} else if (rret == 0) {
|
|
if (read_so_far == 0)
|
|
return h2o_socket_error_closed; /* TODO notify close */
|
|
break;
|
|
}
|
|
(*input)->size += rret;
|
|
if (buf.len != rret)
|
|
break;
|
|
read_so_far += rret;
|
|
if (read_so_far >= max_bytes)
|
|
break;
|
|
}
|
|
return NULL;
|
|
}
|
|
|
|
static size_t write_vecs(struct st_h2o_evloop_socket_t *sock, h2o_iovec_t **bufs, size_t *bufcnt, int sendmsg_flags)
|
|
{
|
|
ssize_t wret;
|
|
|
|
while (*bufcnt != 0) {
|
|
/* write */
|
|
int iovcnt = *bufcnt < IOV_MAX ? (int)*bufcnt : IOV_MAX;
|
|
struct msghdr msg;
|
|
do {
|
|
msg = (struct msghdr){.msg_iov = (struct iovec *)*bufs, .msg_iovlen = iovcnt};
|
|
} while ((wret = sendmsg(sock->fd, &msg, sendmsg_flags)) == -1 && errno == EINTR);
|
|
SOCKET_PROBE(WRITEV, &sock->super, wret);
|
|
H2O_LOG_SOCK(writev, &sock->super, { PTLS_LOG_ELEMENT_SIGNED(ret, wret); });
|
|
|
|
if (wret == -1)
|
|
return errno == EAGAIN ? 0 : SIZE_MAX;
|
|
|
|
/* adjust the buffer, doing the write once again only if all IOV_MAX buffers being supplied were fully written */
|
|
while ((*bufs)->len <= wret) {
|
|
wret -= (*bufs)->len;
|
|
++*bufs;
|
|
--*bufcnt;
|
|
if (*bufcnt == 0) {
|
|
assert(wret == 0);
|
|
return 0;
|
|
}
|
|
}
|
|
if (wret != 0) {
|
|
return wret;
|
|
} else if (iovcnt < IOV_MAX) {
|
|
return 0;
|
|
}
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
static size_t write_core(struct st_h2o_evloop_socket_t *sock, h2o_iovec_t **bufs, size_t *bufcnt)
|
|
{
|
|
if (sock->super.ssl == NULL || sock->super.ssl->offload == H2O_SOCKET_SSL_OFFLOAD_ON) {
|
|
if (sock->super.ssl != NULL)
|
|
assert(!has_pending_ssl_bytes(sock->super.ssl));
|
|
return write_vecs(sock, bufs, bufcnt, 0);
|
|
}
|
|
|
|
/* SSL: flatten given vector if that has not been done yet; `*bufs` is guaranteed to have one slot available at the end; see
|
|
* `do_write_with_sendvec`, `init_write_buf`. */
|
|
if (sock->sendvec.callbacks != NULL) {
|
|
size_t veclen = flatten_sendvec(&sock->super, &sock->sendvec);
|
|
if (veclen == SIZE_MAX)
|
|
return SIZE_MAX;
|
|
sock->sendvec.callbacks = NULL;
|
|
(*bufs)[(*bufcnt)++] = h2o_iovec_init(sock->super._write_buf.flattened, veclen);
|
|
}
|
|
|
|
/* continue encrypting and writing, until we run out of data */
|
|
size_t first_buf_written = 0;
|
|
while (1) {
|
|
/* write bytes already encrypted, if any */
|
|
if (has_pending_ssl_bytes(sock->super.ssl)) {
|
|
h2o_iovec_t encbuf = h2o_iovec_init(sock->super.ssl->output.buf.base + sock->super.ssl->output.pending_off,
|
|
sock->super.ssl->output.buf.off - sock->super.ssl->output.pending_off);
|
|
h2o_iovec_t *encbufs = &encbuf;
|
|
size_t encbufcnt = 1, enc_written;
|
|
int sendmsg_flags = 0;
|
|
#if H2O_USE_MSG_ZEROCOPY
|
|
/* Use zero copy if amount of data to be written is no less than 4KB, and if the memory can be returned to
|
|
* `h2o_socket_zerocopy_buffer_allocator`. Latter is a short-cut. It is only under exceptional conditions (e.g., TLS
|
|
* stack adding a post-handshake message) that we'd see the buffer grow to a size that cannot be returned to the
|
|
* recycling allocator.
|
|
* Even though https://www.kernel.org/doc/html/v5.17/networking/msg_zerocopy.html recommends 10KB, 4KB has been chosen
|
|
* as the threshold, because we are likely to be using the non-temporal aesgcm engine and tx-nocache-copy, in which case
|
|
* copying sendmsg is going to be more costly than what the kernel documentation assumes. In a synthetic benchmark,
|
|
* changing from 16KB to 4KB increased the throughput by ~10%. */
|
|
if (sock->super.ssl->output.allocated_for_zerocopy && encbuf.len >= 4096 &&
|
|
sock->super.ssl->output.buf.capacity == h2o_socket_zerocopy_buffer_allocator.conf->memsize)
|
|
sendmsg_flags = MSG_ZEROCOPY;
|
|
#endif
|
|
if ((enc_written = write_vecs(sock, &encbufs, &encbufcnt, sendmsg_flags)) == SIZE_MAX) {
|
|
dispose_ssl_output_buffer(sock->super.ssl);
|
|
return SIZE_MAX;
|
|
}
|
|
if (sendmsg_flags != 0 && (encbufcnt == 0 || enc_written > 0)) {
|
|
zerocopy_buffers_push(sock->super._zerocopy, sock->super.ssl->output.buf.base);
|
|
if (!sock->super.ssl->output.zerocopy_owned) {
|
|
sock->super.ssl->output.zerocopy_owned = 1;
|
|
++h2o_socket_num_zerocopy_buffers_inflight;
|
|
}
|
|
}
|
|
/* if write is incomplete, record the advance and bail out */
|
|
if (encbufcnt != 0) {
|
|
sock->super.ssl->output.pending_off += enc_written;
|
|
break;
|
|
}
|
|
/* succeeded in writing all the encrypted data; free the buffer */
|
|
dispose_ssl_output_buffer(sock->super.ssl);
|
|
}
|
|
/* bail out if complete */
|
|
if (*bufcnt == 0 && sock->sendvec.callbacks == NULL)
|
|
break;
|
|
/* convert more cleartext to TLS records if possible, or bail out on fatal error */
|
|
if ((first_buf_written = generate_tls_records(&sock->super, bufs, bufcnt, first_buf_written)) == SIZE_MAX)
|
|
break;
|
|
/* as an optimization, if we have a flattened vector, release memory as soon as they have been encrypted */
|
|
if (*bufcnt == 0 && sock->super._write_buf.flattened != NULL) {
|
|
h2o_mem_free_recycle(&h2o_socket_ssl_buffer_allocator, sock->super._write_buf.flattened);
|
|
sock->super._write_buf.flattened = NULL;
|
|
}
|
|
}
|
|
|
|
return first_buf_written;
|
|
}
|
|
|
|
/**
|
|
* Sends contents of sendvec, and returns if operation has been successful, either completely or partially. Upon completion,
|
|
* `sendvec.vec.callbacks` is reset to NULL.
|
|
*/
|
|
static int sendvec_core(struct st_h2o_evloop_socket_t *sock)
|
|
{
|
|
size_t bytes_sent;
|
|
|
|
assert(sock->sendvec.len != 0);
|
|
|
|
/* send, and return an error if failed */
|
|
if ((bytes_sent = sock->sendvec.callbacks->send_(&sock->sendvec, sock->fd, sock->sendvec.len)) == SIZE_MAX)
|
|
return 0;
|
|
|
|
/* update offset, and return if we are not done yet */
|
|
if (sock->sendvec.len != 0)
|
|
return 1;
|
|
|
|
/* operation complete; mark as such */
|
|
sock->sendvec.callbacks = NULL;
|
|
return 1;
|
|
}
|
|
|
|
void write_pending(struct st_h2o_evloop_socket_t *sock)
|
|
{
|
|
assert(sock->super._cb.write != NULL);
|
|
|
|
/* write from buffer, if we have anything */
|
|
int ssl_needs_flatten = sock->sendvec.callbacks != NULL && sock->super.ssl != NULL
|
|
#if H2O_USE_KTLS
|
|
&& sock->super.ssl->offload != H2O_SOCKET_SSL_OFFLOAD_ON
|
|
#endif
|
|
;
|
|
if (sock->super._write_buf.cnt != 0 || has_pending_ssl_bytes(sock->super.ssl) || ssl_needs_flatten) {
|
|
size_t first_buf_written;
|
|
if ((first_buf_written = write_core(sock, &sock->super._write_buf.bufs, &sock->super._write_buf.cnt)) != SIZE_MAX) {
|
|
/* return if there's still pending data, adjusting buf[0] if necessary */
|
|
if (sock->super._write_buf.cnt != 0) {
|
|
sock->super._write_buf.bufs[0].base += first_buf_written;
|
|
sock->super._write_buf.bufs[0].len -= first_buf_written;
|
|
return;
|
|
} else if (has_pending_ssl_bytes(sock->super.ssl)) {
|
|
return;
|
|
}
|
|
}
|
|
}
|
|
|
|
/* either completed or failed */
|
|
dispose_write_buf(&sock->super);
|
|
|
|
/* send the vector, if we have one and if all buffered writes are complete */
|
|
if (sock->sendvec.callbacks != NULL && sock->super._write_buf.cnt == 0 && !has_pending_ssl_bytes(sock->super.ssl)) {
|
|
/* send, and upon partial send, return without changing state for another round */
|
|
if (sendvec_core(sock) && sock->sendvec.callbacks != NULL)
|
|
return;
|
|
}
|
|
|
|
/* operation completed or failed, schedule notification */
|
|
SOCKET_PROBE(WRITE_COMPLETE, &sock->super, sock->super._write_buf.cnt == 0 && !has_pending_ssl_bytes(sock->super.ssl));
|
|
H2O_LOG_SOCK(write_complete, &sock->super,
|
|
{ PTLS_LOG_ELEMENT_BOOL(success, sock->super._write_buf.cnt == 0 && !has_pending_ssl_bytes(sock->super.ssl)); });
|
|
sock->bytes_written.cur_loop = sock->super.bytes_written;
|
|
sock->_flags |= H2O_SOCKET_FLAG_IS_WRITE_NOTIFY;
|
|
link_to_pending(sock);
|
|
link_to_statechanged(sock); /* might need to disable the write polling */
|
|
}
|
|
|
|
static void read_on_ready(struct st_h2o_evloop_socket_t *sock)
|
|
{
|
|
const char *err = 0;
|
|
size_t prev_size = sock->super.input->size;
|
|
|
|
if ((sock->_flags & H2O_SOCKET_FLAG_DONT_READ) != 0)
|
|
goto Notify;
|
|
|
|
if ((err = on_read_core(sock->fd, sock->super.ssl == NULL ? &sock->super.input : &sock->super.ssl->input.encrypted,
|
|
sock->max_read_size)) != NULL)
|
|
goto Notify;
|
|
|
|
if (sock->super.ssl != NULL && sock->super.ssl->handshake.cb == NULL)
|
|
err = decode_ssl_input(&sock->super);
|
|
|
|
Notify:
|
|
/* the application may get notified even if no new data is avaiable. The
|
|
* behavior is intentional; it is designed as such so that the applications
|
|
* can update their timeout counters when a partial SSL record arrives.
|
|
*/
|
|
sock->super.bytes_read += sock->super.input->size - prev_size;
|
|
sock->super._cb.read(&sock->super, err);
|
|
}
|
|
|
|
void do_dispose_socket(h2o_socket_t *_sock)
|
|
{
|
|
struct st_h2o_evloop_socket_t *sock = (struct st_h2o_evloop_socket_t *)_sock;
|
|
|
|
dispose_write_buf(&sock->super);
|
|
|
|
sock->_flags = H2O_SOCKET_FLAG_IS_DISPOSED | (sock->_flags & H2O_SOCKET_FLAG__EPOLL_IS_REGISTERED);
|
|
|
|
/* Give backends chance to do the necessary cleanup, as well as giving them chance to switch to their own disposal method; e.g.,
|
|
* shutdown(SHUT_RDWR) with delays to reclaim all zero copy buffers. */
|
|
if (evloop_do_on_socket_close(sock))
|
|
return;
|
|
|
|
/* immediate close */
|
|
if (sock->fd != -1) {
|
|
close(sock->fd);
|
|
sock->fd = -1;
|
|
}
|
|
link_to_statechanged(sock);
|
|
}
|
|
|
|
void report_early_write_error(h2o_socket_t *_sock)
|
|
{
|
|
struct st_h2o_evloop_socket_t *sock = (struct st_h2o_evloop_socket_t *)_sock;
|
|
|
|
/* fill in _wreq.bufs with fake data to indicate error */
|
|
sock->super._write_buf.bufs = sock->super._write_buf.smallbufs;
|
|
sock->super._write_buf.cnt = 1;
|
|
*sock->super._write_buf.bufs = h2o_iovec_init(H2O_STRLIT("deadbeef"));
|
|
sock->_flags |= H2O_SOCKET_FLAG_IS_WRITE_NOTIFY;
|
|
link_to_pending(sock);
|
|
}
|
|
|
|
void do_write(h2o_socket_t *_sock, h2o_iovec_t *bufs, size_t bufcnt)
|
|
{
|
|
struct st_h2o_evloop_socket_t *sock = (struct st_h2o_evloop_socket_t *)_sock;
|
|
size_t first_buf_written;
|
|
|
|
/* Don't write too much; if more than 1MB have been already written in the current invocation of `h2o_evloop_run`, wait until
|
|
* the event loop notifies us that the socket is writable. */
|
|
if (sock->bytes_written.cur_run_count != sock->loop->run_count) {
|
|
sock->bytes_written.prev_loop = sock->bytes_written.cur_loop;
|
|
sock->bytes_written.cur_run_count = sock->loop->run_count;
|
|
} else if (sock->bytes_written.cur_loop - sock->bytes_written.prev_loop >= h2o_evloop_socket_max_write_size) {
|
|
init_write_buf(&sock->super, bufs, bufcnt, 0);
|
|
goto Schedule_Write;
|
|
}
|
|
|
|
/* try to write now */
|
|
if ((first_buf_written = write_core(sock, &bufs, &bufcnt)) == SIZE_MAX) {
|
|
report_early_write_error(&sock->super);
|
|
return;
|
|
}
|
|
if (bufcnt == 0 && !has_pending_ssl_bytes(sock->super.ssl)) {
|
|
/* write complete, schedule the callback */
|
|
if (sock->super._write_buf.flattened != NULL) {
|
|
h2o_mem_free_recycle(&h2o_socket_ssl_buffer_allocator, sock->super._write_buf.flattened);
|
|
sock->super._write_buf.flattened = NULL;
|
|
}
|
|
if (sock->sendvec.callbacks != NULL) {
|
|
if (!sendvec_core(sock)) {
|
|
report_early_write_error(&sock->super);
|
|
return;
|
|
}
|
|
if (sock->sendvec.callbacks != NULL)
|
|
goto Schedule_Write;
|
|
}
|
|
sock->bytes_written.cur_loop = sock->super.bytes_written;
|
|
sock->_flags |= H2O_SOCKET_FLAG_IS_WRITE_NOTIFY;
|
|
link_to_pending(sock);
|
|
return;
|
|
}
|
|
|
|
/* setup the buffer to send pending data */
|
|
init_write_buf(&sock->super, bufs, bufcnt, first_buf_written);
|
|
|
|
Schedule_Write:
|
|
link_to_statechanged(sock);
|
|
}
|
|
|
|
static int can_tls_offload(h2o_socket_t *sock)
|
|
{
|
|
#if H2O_USE_KTLS
|
|
if (sock->ssl->offload != H2O_SOCKET_SSL_OFFLOAD_NONE && sock->ssl->ptls != NULL) {
|
|
ptls_cipher_suite_t *cipher = ptls_get_cipher(sock->ssl->ptls);
|
|
switch (cipher->id) {
|
|
case PTLS_CIPHER_SUITE_AES_128_GCM_SHA256:
|
|
case PTLS_CIPHER_SUITE_AES_256_GCM_SHA384:
|
|
return 1;
|
|
default:
|
|
break;
|
|
}
|
|
}
|
|
#endif
|
|
|
|
return 0;
|
|
}
|
|
|
|
#if H2O_USE_KTLS
|
|
static void switch_to_ktls(struct st_h2o_evloop_socket_t *sock)
|
|
{
|
|
assert(sock->super.ssl->offload == H2O_SOCKET_SSL_OFFLOAD_TBD);
|
|
|
|
/* Postpone the decision, when we are still in the early stages of the connection, as we want to use userspace TLS for
|
|
* generating small TLS records. TODO: integrate with TLS record size calculation logic. */
|
|
if (sock->super.bytes_written < 65536)
|
|
return;
|
|
|
|
/* load the key to the kernel */
|
|
struct {
|
|
uint8_t key[PTLS_MAX_SECRET_SIZE];
|
|
uint8_t iv[PTLS_MAX_DIGEST_SIZE];
|
|
uint64_t seq;
|
|
union {
|
|
struct tls12_crypto_info_aes_gcm_128 aesgcm128;
|
|
struct tls12_crypto_info_aes_gcm_256 aesgcm256;
|
|
} tx_params;
|
|
size_t tx_params_size;
|
|
} keys;
|
|
|
|
/* at the moment, only TLS/1.3 connections using aes-gcm is supported */
|
|
if (sock->super.ssl->ptls == NULL)
|
|
goto Fail;
|
|
ptls_cipher_suite_t *cipher = ptls_get_cipher(sock->super.ssl->ptls);
|
|
switch (cipher->id) {
|
|
case PTLS_CIPHER_SUITE_AES_128_GCM_SHA256:
|
|
case PTLS_CIPHER_SUITE_AES_256_GCM_SHA384:
|
|
break;
|
|
default:
|
|
goto Fail;
|
|
}
|
|
if (ptls_get_traffic_keys(sock->super.ssl->ptls, 1, keys.key, keys.iv, &keys.seq) != 0)
|
|
goto Fail;
|
|
keys.seq = htobe64(keys.seq); /* converted to big endian ASAP */
|
|
|
|
#define SETUP_TX_PARAMS(target, type) \
|
|
do { \
|
|
keys.tx_params.target.info.version = TLS_1_3_VERSION; \
|
|
keys.tx_params.target.info.cipher_type = type; \
|
|
H2O_BUILD_ASSERT(sizeof(keys.tx_params.target.key) == cipher->aead->key_size); \
|
|
memcpy(keys.tx_params.target.key, keys.key, cipher->aead->key_size); \
|
|
H2O_BUILD_ASSERT(cipher->aead->iv_size == 12); \
|
|
H2O_BUILD_ASSERT(sizeof(keys.tx_params.target.salt) == 4); \
|
|
memcpy(keys.tx_params.target.salt, keys.iv, 4); \
|
|
H2O_BUILD_ASSERT(sizeof(keys.tx_params.target.iv) == 8); \
|
|
memcpy(keys.tx_params.target.iv, keys.iv + 4, 8); \
|
|
H2O_BUILD_ASSERT(sizeof(keys.tx_params.target.rec_seq) == sizeof(keys.seq)); \
|
|
memcpy(keys.tx_params.target.rec_seq, &keys.seq, sizeof(keys.seq)); \
|
|
keys.tx_params_size = sizeof(keys.tx_params.target); \
|
|
} while (0)
|
|
switch (cipher->id) {
|
|
case PTLS_CIPHER_SUITE_AES_128_GCM_SHA256:
|
|
SETUP_TX_PARAMS(aesgcm128, TLS_CIPHER_AES_GCM_128);
|
|
break;
|
|
case PTLS_CIPHER_SUITE_AES_256_GCM_SHA384:
|
|
SETUP_TX_PARAMS(aesgcm256, TLS_CIPHER_AES_GCM_256);
|
|
break;
|
|
default:
|
|
goto Fail;
|
|
}
|
|
#undef SETUP_TX_PARAMS
|
|
|
|
/* set to kernel */
|
|
if (setsockopt(sock->fd, SOL_TCP, TCP_ULP, "tls", sizeof("tls")) != 0)
|
|
goto Fail;
|
|
if (setsockopt(sock->fd, SOL_TLS, TLS_TX, &keys.tx_params, keys.tx_params_size) != 0)
|
|
goto Fail;
|
|
sock->super.ssl->offload = H2O_SOCKET_SSL_OFFLOAD_ON;
|
|
|
|
Exit:
|
|
ptls_clear_memory(&keys, sizeof(keys));
|
|
return;
|
|
|
|
Fail:
|
|
sock->super.ssl->offload = H2O_SOCKET_SSL_OFFLOAD_NONE;
|
|
goto Exit;
|
|
}
|
|
#endif
|
|
|
|
/**
|
|
* `bufs` should be an array capable of storing `bufcnt + 1` objects, as we will be flattening `sendvec` at the end of `bufs` before
|
|
* encryption; see `write_core`.
|
|
*/
|
|
static int do_write_with_sendvec(h2o_socket_t *_sock, h2o_iovec_t *bufs, size_t bufcnt, h2o_sendvec_t *sendvec)
|
|
{
|
|
struct st_h2o_evloop_socket_t *sock = (struct st_h2o_evloop_socket_t *)_sock;
|
|
|
|
assert(sendvec->callbacks->read_ != NULL);
|
|
assert(sock->sendvec.callbacks == NULL);
|
|
|
|
/* If userspace TLS is used, rely on `read_` which is a mandatory callback. Otherwise, rely on `send_` if it is available. */
|
|
if (sock->super.ssl != NULL) {
|
|
#if H2O_USE_KTLS
|
|
if (sock->super.ssl->offload == H2O_SOCKET_SSL_OFFLOAD_TBD)
|
|
switch_to_ktls(sock);
|
|
if (sock->super.ssl->offload == H2O_SOCKET_SSL_OFFLOAD_ON && sendvec->callbacks->send_ == NULL)
|
|
return 0;
|
|
#endif
|
|
} else {
|
|
if (sendvec->callbacks->send_ == NULL)
|
|
return 0;
|
|
}
|
|
|
|
/* handling writes with sendvec, here */
|
|
sock->sendvec = *sendvec;
|
|
do_write(&sock->super, bufs, bufcnt);
|
|
|
|
return 1;
|
|
}
|
|
|
|
int h2o_socket_get_fd(h2o_socket_t *_sock)
|
|
{
|
|
struct st_h2o_evloop_socket_t *sock = (struct st_h2o_evloop_socket_t *)_sock;
|
|
return sock->fd;
|
|
}
|
|
|
|
void do_read_start(h2o_socket_t *_sock)
|
|
{
|
|
struct st_h2o_evloop_socket_t *sock = (struct st_h2o_evloop_socket_t *)_sock;
|
|
|
|
link_to_statechanged(sock);
|
|
}
|
|
|
|
void do_read_stop(h2o_socket_t *_sock)
|
|
{
|
|
struct st_h2o_evloop_socket_t *sock = (struct st_h2o_evloop_socket_t *)_sock;
|
|
|
|
sock->_flags &= ~H2O_SOCKET_FLAG_IS_READ_READY;
|
|
link_to_statechanged(sock);
|
|
}
|
|
|
|
void h2o_socket_dont_read(h2o_socket_t *_sock, int dont_read)
|
|
{
|
|
struct st_h2o_evloop_socket_t *sock = (struct st_h2o_evloop_socket_t *)_sock;
|
|
|
|
if (dont_read) {
|
|
sock->_flags |= H2O_SOCKET_FLAG_DONT_READ;
|
|
} else {
|
|
sock->_flags &= ~H2O_SOCKET_FLAG_DONT_READ;
|
|
}
|
|
}
|
|
|
|
int do_export(h2o_socket_t *_sock, h2o_socket_export_t *info)
|
|
{
|
|
struct st_h2o_evloop_socket_t *sock = (void *)_sock;
|
|
|
|
assert((sock->_flags & H2O_SOCKET_FLAG_IS_DISPOSED) == 0);
|
|
evloop_do_on_socket_export(sock);
|
|
sock->_flags = H2O_SOCKET_FLAG_IS_DISPOSED | (sock->_flags & H2O_SOCKET_FLAG__EPOLL_IS_REGISTERED);
|
|
|
|
info->fd = sock->fd;
|
|
sock->fd = -1;
|
|
|
|
return 0;
|
|
}
|
|
|
|
h2o_socket_t *do_import(h2o_loop_t *loop, h2o_socket_export_t *info)
|
|
{
|
|
return h2o_evloop_socket_create(loop, info->fd, 0);
|
|
}
|
|
|
|
h2o_loop_t *h2o_socket_get_loop(h2o_socket_t *_sock)
|
|
{
|
|
struct st_h2o_evloop_socket_t *sock = (void *)_sock;
|
|
return sock->loop;
|
|
}
|
|
|
|
socklen_t get_sockname_uncached(h2o_socket_t *_sock, struct sockaddr *sa)
|
|
{
|
|
struct st_h2o_evloop_socket_t *sock = (void *)_sock;
|
|
socklen_t len = sizeof(struct sockaddr_storage);
|
|
if (getsockname(sock->fd, sa, &len) != 0)
|
|
return 0;
|
|
return len;
|
|
}
|
|
|
|
socklen_t get_peername_uncached(h2o_socket_t *_sock, struct sockaddr *sa)
|
|
{
|
|
struct st_h2o_evloop_socket_t *sock = (void *)_sock;
|
|
socklen_t len = sizeof(struct sockaddr_storage);
|
|
if (getpeername(sock->fd, sa, &len) != 0)
|
|
return 0;
|
|
return len;
|
|
}
|
|
|
|
static struct st_h2o_evloop_socket_t *create_socket(h2o_evloop_t *loop, int fd, int flags)
|
|
{
|
|
struct st_h2o_evloop_socket_t *sock;
|
|
|
|
sock = h2o_mem_alloc(sizeof(*sock));
|
|
memset(sock, 0, sizeof(*sock));
|
|
h2o_buffer_init(&sock->super.input, &h2o_socket_buffer_prototype);
|
|
sock->loop = loop;
|
|
sock->fd = fd;
|
|
sock->_flags = flags;
|
|
sock->max_read_size = h2o_evloop_socket_max_read_size; /* by default, we read up to 1MB at once */
|
|
sock->_next_pending = sock;
|
|
sock->_next_statechanged = sock;
|
|
|
|
evloop_do_on_socket_create(sock);
|
|
|
|
return sock;
|
|
}
|
|
|
|
/**
|
|
* Sets TCP_NODELAY if the given file descriptor is likely to be a TCP socket. The intent of this function is to reduce number of
|
|
* unnecessary system calls. Therefore, we skip setting TCP_NODELAY when it is certain that the socket is not a TCP socket,
|
|
* otherwise call setsockopt.
|
|
*/
|
|
static void set_nodelay_if_likely_tcp(int fd, struct sockaddr *sa)
|
|
{
|
|
if (sa != NULL && !(sa->sa_family == AF_INET || sa->sa_family == AF_INET6))
|
|
return;
|
|
|
|
int on = 1;
|
|
setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &on, sizeof(on));
|
|
}
|
|
|
|
h2o_socket_t *h2o_evloop_socket_create(h2o_evloop_t *loop, int fd, int flags)
|
|
{
|
|
/* It is the reponsibility of the event loop to modify the properties of a socket for its use (e.g., set O_NONBLOCK). */
|
|
fcntl(fd, F_SETFL, O_NONBLOCK);
|
|
set_nodelay_if_likely_tcp(fd, NULL);
|
|
|
|
return &create_socket(loop, fd, flags)->super;
|
|
}
|
|
|
|
h2o_socket_t *h2o_evloop_socket_accept(h2o_socket_t *_listener)
|
|
{
|
|
struct st_h2o_evloop_socket_t *listener = (struct st_h2o_evloop_socket_t *)_listener;
|
|
int fd;
|
|
h2o_socket_t *sock;
|
|
union {
|
|
struct sockaddr sa;
|
|
struct sockaddr_in sin4;
|
|
struct sockaddr_in6 sin6;
|
|
} peeraddr;
|
|
socklen_t peeraddrlen = sizeof(peeraddr);
|
|
|
|
#if H2O_USE_ACCEPT4
|
|
if ((fd = accept4(listener->fd, &peeraddr.sa, &peeraddrlen, SOCK_NONBLOCK | SOCK_CLOEXEC)) == -1)
|
|
return NULL;
|
|
sock = &create_socket(listener->loop, fd, H2O_SOCKET_FLAG_IS_ACCEPTED_CONNECTION)->super;
|
|
#else
|
|
if ((fd = cloexec_accept(listener->fd, &peeraddr.sa, &peeraddrlen)) == -1)
|
|
return NULL;
|
|
fcntl(fd, F_SETFL, O_NONBLOCK);
|
|
sock = &create_socket(listener->loop, fd, H2O_SOCKET_FLAG_IS_ACCEPTED_CONNECTION)->super;
|
|
#endif
|
|
if (peeraddrlen <= sizeof(peeraddr)) {
|
|
h2o_socket_setpeername(sock, &peeraddr.sa, peeraddrlen);
|
|
} else {
|
|
peeraddr.sa.sa_family = AF_UNSPEC;
|
|
}
|
|
|
|
/* note: even on linux, the accepted socket might not inherit TCP_NODELAY from the listening socket; see
|
|
* https://github.com/h2o/h2o/pull/2542#issuecomment-760700859 */
|
|
set_nodelay_if_likely_tcp(fd, &peeraddr.sa);
|
|
|
|
ptls_log_init_conn_state(&sock->_log_state, ptls_openssl_random_bytes);
|
|
switch (peeraddr.sa.sa_family) {
|
|
case AF_INET: /* store as v6-mapped v4 address */
|
|
ptls_build_v4_mapped_v6_address(&sock->_log_state.address, &peeraddr.sin4.sin_addr);
|
|
break;
|
|
case AF_INET6:
|
|
sock->_log_state.address = peeraddr.sin6.sin6_addr;
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
|
|
return sock;
|
|
}
|
|
|
|
h2o_socket_t *h2o_socket_connect(h2o_loop_t *loop, struct sockaddr *addr, socklen_t addrlen, h2o_socket_cb cb, const char **err)
|
|
{
|
|
int fd, connect_ret;
|
|
struct st_h2o_evloop_socket_t *sock;
|
|
|
|
if ((fd = cloexec_socket(addr->sa_family, SOCK_STREAM, 0)) == -1) {
|
|
if (err != NULL) {
|
|
*err = h2o_socket_error_socket_fail;
|
|
}
|
|
return NULL;
|
|
}
|
|
fcntl(fd, F_SETFL, O_NONBLOCK);
|
|
|
|
if (!((connect_ret = connect(fd, addr, addrlen)) == 0 || errno == EINPROGRESS)) {
|
|
if (err != NULL)
|
|
*err = h2o_socket_get_error_string(errno, h2o_socket_error_conn_fail);
|
|
close(fd);
|
|
return NULL;
|
|
}
|
|
|
|
sock = create_socket(loop, fd, H2O_SOCKET_FLAG_IS_CONNECTING);
|
|
set_nodelay_if_likely_tcp(fd, addr);
|
|
|
|
if (connect_ret == 0) {
|
|
/* connection has been established synchronously; notify the fact without going back to epoll */
|
|
sock->_flags |= H2O_SOCKET_FLAG_IS_WRITE_NOTIFY | H2O_SOCKET_FLAG_IS_CONNECTING_CONNECTED;
|
|
sock->super._cb.write = cb;
|
|
link_to_pending(sock);
|
|
} else {
|
|
h2o_socket_notify_write(&sock->super, cb);
|
|
}
|
|
return &sock->super;
|
|
}
|
|
|
|
void h2o_evloop_socket_set_max_read_size(h2o_socket_t *_sock, size_t max_size)
|
|
{
|
|
struct st_h2o_evloop_socket_t *sock = (void *)_sock;
|
|
sock->max_read_size = max_size;
|
|
}
|
|
|
|
h2o_evloop_t *create_evloop(size_t sz)
|
|
{
|
|
h2o_evloop_t *loop = h2o_mem_alloc(sz);
|
|
|
|
memset(loop, 0, sz);
|
|
loop->_statechanged.tail_ref = &loop->_statechanged.head;
|
|
update_now(loop);
|
|
/* 3 levels * 32-slots => 1 second goes into 2nd, becomes O(N) above approx. 31 seconds */
|
|
loop->_timeouts = h2o_timerwheel_create(3, loop->_now_millisec);
|
|
|
|
#if H2O_USE_IO_URING
|
|
h2o_io_uring_setup(loop);
|
|
#endif
|
|
|
|
return loop;
|
|
}
|
|
|
|
void update_now(h2o_evloop_t *loop)
|
|
{
|
|
gettimeofday(&loop->_tv_at, NULL);
|
|
loop->_now_nanosec = ((uint64_t)loop->_tv_at.tv_sec * 1000000 + loop->_tv_at.tv_usec) * 1000;
|
|
loop->_now_millisec = loop->_now_nanosec / 1000000;
|
|
}
|
|
|
|
int32_t adjust_max_wait(h2o_evloop_t *loop, int32_t max_wait)
|
|
{
|
|
uint64_t wake_at = h2o_timerwheel_get_wake_at(loop->_timeouts);
|
|
|
|
update_now(loop);
|
|
|
|
if (wake_at <= loop->_now_millisec) {
|
|
max_wait = 0;
|
|
} else {
|
|
uint64_t delta = wake_at - loop->_now_millisec;
|
|
if (delta < max_wait)
|
|
max_wait = (int32_t)delta;
|
|
}
|
|
|
|
return max_wait;
|
|
}
|
|
|
|
void h2o_socket_notify_write(h2o_socket_t *_sock, h2o_socket_cb cb)
|
|
{
|
|
struct st_h2o_evloop_socket_t *sock = (struct st_h2o_evloop_socket_t *)_sock;
|
|
assert(sock->super._cb.write == NULL);
|
|
assert(sock->super._write_buf.cnt == 0);
|
|
assert(!has_pending_ssl_bytes(sock->super.ssl));
|
|
|
|
sock->super._cb.write = cb;
|
|
link_to_statechanged(sock);
|
|
}
|
|
|
|
static void run_socket(struct st_h2o_evloop_socket_t *sock)
|
|
{
|
|
if ((sock->_flags & H2O_SOCKET_FLAG_IS_DISPOSED) != 0) {
|
|
/* is freed in updatestates phase */
|
|
return;
|
|
}
|
|
|
|
if ((sock->_flags & H2O_SOCKET_FLAG_IS_READ_READY) != 0) {
|
|
sock->_flags &= ~H2O_SOCKET_FLAG_IS_READ_READY;
|
|
read_on_ready(sock);
|
|
}
|
|
|
|
if ((sock->_flags & H2O_SOCKET_FLAG_IS_WRITE_NOTIFY) != 0) {
|
|
const char *err = NULL;
|
|
assert(sock->super._cb.write != NULL);
|
|
sock->_flags &= ~H2O_SOCKET_FLAG_IS_WRITE_NOTIFY;
|
|
if (sock->super._write_buf.cnt != 0 || has_pending_ssl_bytes(sock->super.ssl) || sock->sendvec.callbacks != NULL) {
|
|
/* error */
|
|
err = h2o_socket_error_io;
|
|
sock->super._write_buf.cnt = 0;
|
|
if (has_pending_ssl_bytes(sock->super.ssl))
|
|
dispose_ssl_output_buffer(sock->super.ssl);
|
|
sock->sendvec.callbacks = NULL;
|
|
} else if ((sock->_flags & H2O_SOCKET_FLAG_IS_CONNECTING) != 0) {
|
|
/* completion of connect; determine error if we do not know whether the connection has been successfully estabilshed */
|
|
if ((sock->_flags & H2O_SOCKET_FLAG_IS_CONNECTING_CONNECTED) == 0) {
|
|
int so_err = 0;
|
|
socklen_t l = sizeof(so_err);
|
|
so_err = 0;
|
|
if (getsockopt(sock->fd, SOL_SOCKET, SO_ERROR, &so_err, &l) != 0 || so_err != 0)
|
|
err = h2o_socket_get_error_string(so_err, h2o_socket_error_conn_fail);
|
|
}
|
|
sock->_flags &= ~(H2O_SOCKET_FLAG_IS_CONNECTING | H2O_SOCKET_FLAG_IS_CONNECTING_CONNECTED);
|
|
}
|
|
on_write_complete(&sock->super, err);
|
|
}
|
|
}
|
|
|
|
static void run_pending(h2o_evloop_t *loop)
|
|
{
|
|
struct st_h2o_evloop_socket_t *sock;
|
|
|
|
while (loop->_pending_as_server != NULL || loop->_pending_as_client != NULL) {
|
|
while ((sock = loop->_pending_as_client) != NULL) {
|
|
loop->_pending_as_client = sock->_next_pending;
|
|
sock->_next_pending = sock;
|
|
run_socket(sock);
|
|
}
|
|
if ((sock = loop->_pending_as_server) != NULL) {
|
|
loop->_pending_as_server = sock->_next_pending;
|
|
sock->_next_pending = sock;
|
|
run_socket(sock);
|
|
}
|
|
}
|
|
}
|
|
|
|
void h2o_evloop_destroy(h2o_evloop_t *loop)
|
|
{
|
|
struct st_h2o_evloop_socket_t *sock;
|
|
|
|
/* timeouts are governed by the application and MUST be destroyed prior to destroying the loop */
|
|
assert(h2o_timerwheel_get_wake_at(loop->_timeouts) == UINT64_MAX);
|
|
|
|
/* dispose all socket */
|
|
while ((sock = loop->_pending_as_client) != NULL) {
|
|
loop->_pending_as_client = sock->_next_pending;
|
|
sock->_next_pending = sock;
|
|
h2o_socket_close((h2o_socket_t *)sock);
|
|
}
|
|
while ((sock = loop->_pending_as_server) != NULL) {
|
|
loop->_pending_as_server = sock->_next_pending;
|
|
sock->_next_pending = sock;
|
|
h2o_socket_close((h2o_socket_t *)sock);
|
|
}
|
|
|
|
/* now all socket are disposedand and placed in linked list statechanged
|
|
* we can freeing memory in cycle by next_statechanged,
|
|
*/
|
|
while ((sock = loop->_statechanged.head) != NULL) {
|
|
loop->_statechanged.head = sock->_next_statechanged;
|
|
free(sock);
|
|
}
|
|
|
|
/* dispose backend-specific data */
|
|
evloop_do_dispose(loop);
|
|
|
|
/* lastly we need to free loop memory */
|
|
h2o_timerwheel_destroy(loop->_timeouts);
|
|
free(loop);
|
|
}
|
|
|
|
int h2o_evloop_run(h2o_evloop_t *loop, int32_t max_wait)
|
|
{
|
|
++loop->run_count;
|
|
|
|
/* Update socket states, poll, set readable flags, perform pending writes. */
|
|
if (evloop_do_proceed(loop, max_wait) != 0)
|
|
return -1;
|
|
|
|
/* Run the pending callbacks. */
|
|
run_pending(loop);
|
|
|
|
/* Run the expired timers at the same time invoking pending callbacks for every timer callback. This is an locality
|
|
* optimization; handles things like timeout -> write -> on_write_complete for each object.
|
|
* Expired timers are fetched and run at most 10 times, after which `h2o_evloop_run` returns even if there is a
|
|
* pending immediate timer. By doing so, we guarantee that the server can make progress by polling the socket, doing
|
|
* I/O, as well as running other operations coded in the caller of `h2s_evloop_run`, even if there is broken code
|
|
* that registers an immediate timer perpetually. */
|
|
for (int i = 0; i < 10; ++i) {
|
|
h2o_linklist_t expired;
|
|
h2o_linklist_init_anchor(&expired);
|
|
h2o_timerwheel_get_expired(loop->_timeouts, loop->_now_millisec, &expired);
|
|
if (h2o_linklist_is_empty(&expired))
|
|
break;
|
|
do {
|
|
h2o_timerwheel_entry_t *timer = H2O_STRUCT_FROM_MEMBER(h2o_timerwheel_entry_t, _link, expired.next);
|
|
h2o_linklist_unlink(&timer->_link);
|
|
timer->cb(timer);
|
|
run_pending(loop);
|
|
} while (!h2o_linklist_is_empty(&expired));
|
|
}
|
|
|
|
assert(loop->_pending_as_client == NULL);
|
|
assert(loop->_pending_as_server == NULL);
|
|
|
|
if (h2o_sliding_counter_is_running(&loop->exec_time_nanosec_counter)) {
|
|
update_now(loop);
|
|
h2o_sliding_counter_stop(&loop->exec_time_nanosec_counter, loop->_now_nanosec);
|
|
}
|
|
|
|
return 0;
|
|
}
|