On Thu, Nov 17, 2022 at 12:54:05AM +0100, Stefano Brivio wrote:On Wed, 16 Nov 2022 15:41:56 +1100 David Gibson <david(a)gibson.dropbear.id.au> wrote:Right, valgrind seems to have better modeling of the syscall here. Note that this is harder for a static tool to get right, because the amount that accept() writes is variable. I got a reply on that bug report, saying apparently clang-tidy *should* consider sa written here, and they weren't easily able to reproduce the problem. So, I'm not sure what's going on here :(.In pasta mode, tcp_sock_init[46]() create separate sockets to listen for spliced connections (these are bound to localhost) and non-spliced connections (these are bound to the host address). This introduces a subtle behavioural difference between pasta and passt: by default, pasta will listen only on a single host address, whereas passt will listen on all addresses (0.0.0.0 or ::). This also prevents us using some additional optimizations that only work with the unspecified (0.0.0.0 or ::) address. However, it turns out we don't need to do this. We can splice a connection if and only if it originates from the loopback address. Currently we ensure this by having the "spliced" listening sockets listening only on loopback. Instead, defer the decision about whether to splice a connection until after accept(), by checking if the connection was made from the loopback address. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- tcp.c | 127 +++++++++++++-------------------------------------- tcp_splice.c | 25 ++++++++-- tcp_splice.h | 5 +- 3 files changed, 55 insertions(+), 102 deletions(-) diff --git a/tcp.c b/tcp.c index e66a82a..4065da7 100644 --- a/tcp.c +++ b/tcp.c @@ -434,7 +434,6 @@ static const char *tcp_flag_str[] __attribute((__unused__)) = { }; /* Listening sockets, used for automatic port forwarding in pasta mode only */ -static int tcp_sock_init_lo [NUM_PORTS][IP_VERSIONS]; static int tcp_sock_init_ext [NUM_PORTS][IP_VERSIONS]; static int tcp_sock_ns [NUM_PORTS][IP_VERSIONS]; @@ -2851,21 +2850,31 @@ static void tcp_conn_from_sock(struct ctx *c, union epoll_ref ref, socklen_t sl; int s; + assert(ref.r.p.tcp.tcp.listen); + assert(!ref.r.p.tcp.tcp.splice); + if (c->tcp.conn_count >= TCP_MAX_CONNS) return; sl = sizeof(sa); + /* FIXME: Workaround clang-tidy not realizing that accept4() + * writes the socket address. See + * https://github.com/llvm/llvm-project/issues/58992 + */ + memset(&sa, 0, sizeof(struct sockaddr_in6)); s = accept4(ref.r.s, (struct sockaddr *)&sa, &sl, SOCK_NONBLOCK);Ah, interesting. That looks new by the way -- not even valgrind complained about this.Fixed.if (s < 0) return; conn = tc + c->tcp.conn_count++; - if (ref.r.p.tcp.tcp.splice) - tcp_splice_conn_from_sock(c, ref, &conn->splice, s); - else - tcp_tap_conn_from_sock(c, ref, &conn->tap, s, - (struct sockaddr *)&sa, now); + if (c->mode == MODE_PASTA && + tcp_splice_conn_from_sock(c, ref, &conn->splice, + s, (struct sockaddr *)&sa)) + return; + + tcp_tap_conn_from_sock(c, ref, &conn->tap, s, + (struct sockaddr *)&sa, now); } /** @@ -3018,47 +3027,16 @@ static void tcp_sock_init4(const struct ctx *c, const struct in_addr *addr, { in_port_t idx = port + c->tcp.fwd_in.delta[port]; union tcp_epoll_ref tref = { .tcp.listen = 1, .tcp.index = idx }; - bool spliced = false, tap = true; int s; - if (c->mode == MODE_PASTA) { - spliced = !addr || IN4_IS_ADDR_UNSPECIFIED(addr) || - IN4_IS_ADDR_LOOPBACK(addr); - - if (!addr) - addr = &c->ip4.addr; - - tap = !IN4_IS_ADDR_LOOPBACK(addr); - } - - if (tap) { - s = sock_l4(c, AF_INET, IPPROTO_TCP, addr, ifname, port, - tref.u32); - if (s >= 0) - tcp_sock_set_bufsize(c, s); - else - s = -1; - - if (c->tcp.fwd_in.mode == FWD_AUTO) - tcp_sock_init_ext[port][V4] = s; - } - - if (spliced) { - struct in_addr loopback = { htonl(INADDR_LOOPBACK) }; - tref.tcp.splice = 1; - - addr = &loopback; - - s = sock_l4(c, AF_INET, IPPROTO_TCP, addr, ifname, port, - tref.u32); - if (s >= 0) - tcp_sock_set_bufsize(c, s); - else - s = -1; + s = sock_l4(c, AF_INET, IPPROTO_TCP, addr, ifname, port, tref.u32); + if (s >= 0) + tcp_sock_set_bufsize(c, s); + else + s = -1; - if (c->tcp.fwd_out.mode == FWD_AUTO) - tcp_sock_init_lo[port][V4] = s; - } + if (c->tcp.fwd_in.mode == FWD_AUTO) + tcp_sock_init_ext[port][V4] = s; } /** @@ -3075,47 +3053,16 @@ static void tcp_sock_init6(const struct ctx *c, in_port_t idx = port + c->tcp.fwd_in.delta[port]; union tcp_epoll_ref tref = { .tcp.listen = 1, .tcp.v6 = 1, .tcp.index = idx }; - bool spliced = false, tap = true; int s; - if (c->mode == MODE_PASTA) { - spliced = !addr || - IN6_IS_ADDR_UNSPECIFIED(addr) || - IN6_IS_ADDR_LOOPBACK(addr); - - if (!addr) - addr = &c->ip6.addr; - - tap = !IN6_IS_ADDR_LOOPBACK(addr); - } - - if (tap) { - s = sock_l4(c, AF_INET6, IPPROTO_TCP, addr, ifname, port, - tref.u32); - if (s >= 0) - tcp_sock_set_bufsize(c, s); - else - s = -1; - - if (c->tcp.fwd_in.mode == FWD_AUTO) - tcp_sock_init_ext[port][V6] = s; - } - - if (spliced) { - tref.tcp.splice = 1; - - addr = &in6addr_loopback; - - s = sock_l4(c, AF_INET6, IPPROTO_TCP, addr, ifname, port, - tref.u32); - if (s >= 0) - tcp_sock_set_bufsize(c, s); - else - s = -1; + s = sock_l4(c, AF_INET6, IPPROTO_TCP, addr, ifname, port, tref.u32); + if (s >= 0) + tcp_sock_set_bufsize(c, s); + else + s = -1; - if (c->tcp.fwd_out.mode == FWD_AUTO) - tcp_sock_init_lo[port][V6] = s; - } + if (c->tcp.fwd_in.mode == FWD_AUTO) + tcp_sock_init_ext[port][V6] = s; } /** @@ -3144,7 +3091,7 @@ static void tcp_ns_sock_init4(const struct ctx *c, in_port_t port) { in_port_t idx = port + c->tcp.fwd_out.delta[port]; union tcp_epoll_ref tref = { .tcp.listen = 1, .tcp.outbound = 1, - .tcp.splice = 1, .tcp.index = idx }; + .tcp.index = idx }; struct in_addr loopback = { htonl(INADDR_LOOPBACK) }; int s; @@ -3169,8 +3116,7 @@ static void tcp_ns_sock_init6(const struct ctx *c, in_port_t port) { in_port_t idx = port + c->tcp.fwd_out.delta[port]; union tcp_epoll_ref tref = { .tcp.listen = 1, .tcp.outbound = 1, - .tcp.splice = 1, .tcp.v6 = 1, - .tcp.index = idx}; + .tcp.v6 = 1, .tcp.index = idx};Space missing here (from 14/32).Good idea, done.int s; assert(c->mode == MODE_PASTA); @@ -3337,7 +3283,6 @@ int tcp_init(struct ctx *c) memset(init_sock_pool6, 0xff, sizeof(init_sock_pool6)); memset(ns_sock_pool4, 0xff, sizeof(ns_sock_pool4)); memset(ns_sock_pool6, 0xff, sizeof(ns_sock_pool6)); - memset(tcp_sock_init_lo, 0xff, sizeof(tcp_sock_init_lo)); memset(tcp_sock_init_ext, 0xff, sizeof(tcp_sock_init_ext)); memset(tcp_sock_ns, 0xff, sizeof(tcp_sock_ns)); @@ -3445,16 +3390,6 @@ static int tcp_port_rebind(void *arg) close(tcp_sock_init_ext[port][V6]); tcp_sock_init_ext[port][V6] = -1; } - - if (tcp_sock_init_lo[port][V4] >= 0) { - close(tcp_sock_init_lo[port][V4]); - tcp_sock_init_lo[port][V4] = -1; - } - - if (tcp_sock_init_lo[port][V6] >= 0) { - close(tcp_sock_init_lo[port][V6]); - tcp_sock_init_lo[port][V6] = -1; - } continue; } diff --git a/tcp_splice.c b/tcp_splice.c index 7007501..30d49d4 100644 --- a/tcp_splice.c +++ b/tcp_splice.c @@ -502,19 +502,35 @@ static void tcp_splice_dir(struct tcp_splice_conn *conn, int ref_sock, } /** - * tcp_splice_conn_from_sock() - Initialize state for spliced connection + * tcp_splice_conn_from_sock() - Attempt to init state for a spliced connection * @c: Execution context * @ref: epoll reference of listening socket * @conn: connection structure to initialize * @s: Accepted socket + * @sa: Peer address of connection * + * Return: true if able to create a spliced connection, false otherwise * #syscalls:pasta setsockopt */ -void tcp_splice_conn_from_sock(struct ctx *c, union epoll_ref ref, - struct tcp_splice_conn *conn, int s) +bool tcp_splice_conn_from_sock(struct ctx *c, union epoll_ref ref, + struct tcp_splice_conn *conn, int s, + const struct sockaddr *sa) { assert(c->mode == MODE_PASTA); + if (ref.r.p.tcp.tcp.v6) { + const struct sockaddr_in6 *sa6 + = (const struct sockaddr_in6 *)sa;Maybe you could split declaration and assignment here.-- David Gibson | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/~dgibson+ if (!IN6_IS_ADDR_LOOPBACK(&sa6->sin6_addr)) + return false; + conn->flags = SPLICE_V6; + } else { + const struct sockaddr_in *sa4 = (const struct sockaddr_in *)sa; + if (!IN4_IS_ADDR_LOOPBACK(&sa4->sin_addr)) + return false; + conn->flags = 0; + } + if (setsockopt(s, SOL_TCP, TCP_QUICKACK, &((int){ 1 }), sizeof(int))) { trace("TCP (spliced): failed to set TCP_QUICKACK on %i", @@ -524,11 +540,12 @@ void tcp_splice_conn_from_sock(struct ctx *c, union epoll_ref ref, conn->c.spliced = true; c->tcp.splice_conn_count++; conn->a = s; - conn->flags = ref.r.p.tcp.tcp.v6 ? SPLICE_V6 : 0; if (tcp_splice_new(c, conn, ref.r.p.tcp.tcp.index, ref.r.p.tcp.tcp.outbound)) conn_flag(c, conn, CLOSING); + + return true; } /** diff --git a/tcp_splice.h b/tcp_splice.h index f9462ae..1a915dd 100644 --- a/tcp_splice.h +++ b/tcp_splice.h @@ -10,8 +10,9 @@ struct tcp_splice_conn; void tcp_sock_handler_splice(struct ctx *c, union epoll_ref ref, uint32_t events); -void tcp_splice_conn_from_sock(struct ctx *c, union epoll_ref ref, - struct tcp_splice_conn *conn, int s); +bool tcp_splice_conn_from_sock(struct ctx *c, union epoll_ref ref, + struct tcp_splice_conn *conn, int s, + const struct sockaddr *sa); void tcp_splice_init(struct ctx *c); #endif /* TCP_SPLICE_H */