On Tue, Oct 11, 2022 at 01:33:49AM +0200, Stefano Brivio wrote:It's indeed convenient to use the final destination port in the epoll reference data, so that we don't have to check the configured port deltas before establishing connections. However, this doesn't work if we need to access the original port information to know what to do once we receive a connection. The existing implementation resulted in inbound, spliced connections with a remapped destination port (and only those) to be attempted on the outbound side, as we would check the wrong position in port bitmaps to establish whether to use inbound or outbound sockets. For listening spliced sockets, set the port index in the epoll reference to the original port. Once we get a connection, use the port delta arrays to find the final destination port, and the original destination port to decide what socket pool we should use. This avoids the need for a reverse mapping table as implemented for UDP. Fixes: 33482d5bf293 ("passt: Add PASTA mode, major rework") Signed-off-by: Stefano Brivio <sbrivio(a)redhat.com> --- tcp.c | 19 ++++++++----------- tcp_splice.c | 18 ++++++++++++++---- 2 files changed, 22 insertions(+), 15 deletions(-) diff --git a/tcp.c b/tcp.c index 63153b6..cc08353 100644 --- a/tcp.c +++ b/tcp.c @@ -3084,15 +3084,16 @@ void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events, void tcp_sock_init(const struct ctx *c, int ns, sa_family_t af, const void *addr, const char *ifname, in_port_t port) { + union tcp_epoll_ref tref_spliced = { .tcp.listen = 1, .tcp.splice = 1 }; union tcp_epoll_ref tref = { .tcp.listen = 1 }; const void *bind_addr; int s; - if (ns) { + if (ns) tref.tcp.index = (in_port_t)(port + c->tcp.fwd_out.delta[port]); - } else { + else tref.tcp.index = (in_port_t)(port + c->tcp.fwd_in.delta[port]); - } + tref_spliced.tcp.index = (in_port_t) port; if (af == AF_INET || af == AF_UNSPEC) { if (!addr && c->mode == MODE_PASTA) @@ -3100,8 +3101,7 @@ void tcp_sock_init(const struct ctx *c, int ns, sa_family_t af, else bind_addr = addr; - tref.tcp.v6 = 0; - tref.tcp.splice = 0; + tref.tcp.v6 = tref_spliced.tcp.v6 = 0; if (!ns) { s = sock_l4(c, AF_INET, IPPROTO_TCP, bind_addr, ifname, @@ -3118,9 +3118,8 @@ void tcp_sock_init(const struct ctx *c, int ns, sa_family_t af, if (c->mode == MODE_PASTA) { bind_addr = &(uint32_t){ htonl(INADDR_LOOPBACK) }; - tref.tcp.splice = 1; s = sock_l4(c, AF_INET, IPPROTO_TCP, bind_addr, ifname, - port, tref.u32); + port, tref_spliced.u32); if (s >= 0) tcp_sock_set_bufsize(c, s); else @@ -3141,9 +3140,8 @@ void tcp_sock_init(const struct ctx *c, int ns, sa_family_t af, else bind_addr = addr; - tref.tcp.v6 = 1; + tref.tcp.v6 = tref_spliced.tcp.v6 = 1; - tref.tcp.splice = 0; if (!ns) { s = sock_l4(c, AF_INET6, IPPROTO_TCP, bind_addr, ifname, port, tref.u32); @@ -3159,9 +3157,8 @@ void tcp_sock_init(const struct ctx *c, int ns, sa_family_t af, if (c->mode == MODE_PASTA) { bind_addr = &in6addr_loopback; - tref.tcp.splice = 1; s = sock_l4(c, AF_INET6, IPPROTO_TCP, bind_addr, ifname, - port, tref.u32); + port, tref_spliced.u32); if (s >= 0) tcp_sock_set_bufsize(c, s); else diff --git a/tcp_splice.c b/tcp_splice.c index 96c31c8..bf5f62c 100644 --- a/tcp_splice.c +++ b/tcp_splice.c @@ -35,6 +35,7 @@ #include <fcntl.h> #include <limits.h> #include <stdint.h> +#include <stdbool.h> #include <string.h> #include <time.h> #include <unistd.h> @@ -512,13 +513,18 @@ static int tcp_splice_connect_ns(void *arg) static int tcp_splice_new(const struct ctx *c, struct tcp_splice_conn *conn, in_port_t port) { - struct tcp_splice_connect_ns_arg ns_arg = { c, conn, port, 0 }; int *p, i, s = -1; + bool inbound; - if (bitmap_isset(c->tcp.fwd_in.map, port)) + if (bitmap_isset(c->tcp.fwd_in.map, port)) { p = CONN_V6(conn) ? ns_sock_pool6 : ns_sock_pool4; - else + port += c->tcp.fwd_in.delta[port]; + inbound = true; + } else { p = CONN_V6(conn) ? init_sock_pool6 : init_sock_pool4; + port += c->tcp.fwd_out.delta[port]; + inbound = false; + }This smells wrong to me. AFAICT there's nothing inherently wrong with forwarding inbound connections to port 5015 to port 5016 in the namespace *and* forwarding outbound connections to port 5015 to port 5016 on the host. So I think rather than consulting the port map here, each conn object should know if its an inward or outward connection. To make that work with epoll, we'd also need a bit in the ref which says whether it's a socket listening on the host or in the ns.for (i = 0; i < TCP_SOCK_POOL_SIZE; i++, p++) { SWAP(s, *p); @@ -526,11 +532,15 @@ static int tcp_splice_new(const struct ctx *c, struct tcp_splice_conn *conn, break; } - if (s < 0 && bitmap_isset(c->tcp.fwd_in.map, port)) { + /* No socket available in namespace: create a new one for connect() */ + if (s < 0 && inbound) { + struct tcp_splice_connect_ns_arg ns_arg = { c, conn, port, 0 }; + NS_CALL(tcp_splice_connect_ns, &ns_arg); return ns_arg.ret; } + /* Otherwise, the socket will connect on the side it was created on */ return tcp_splice_connect(c, conn, s, port); }-- David Gibson | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/~dgibson