We've had some issues with DNS requests starting to fail in long running guests or containers using passt/pasta. With assistance from a reporter, I've managed to track this down to some bugs in our management of saved UDP socket fds. Link: https://bugs.passt.top/show_bug.cgi?id=57 David Gibson (2): udp: Consistently use -1 to indicate un-opened sockets in maps udp: Remove socket from udp_{tap,splice}_map when timed out conf.c | 1 + udp.c | 36 +++++++++++++++++++++++++++--------- udp.h | 1 + 3 files changed, 29 insertions(+), 9 deletions(-) -- 2.41.0
udp uses the udp_tap_map, udp_splice_ns and udp_splice_init tables to keep track of already opened sockets bound to specific ports. We need a way to indicate entries where a socket hasn't been opened, but the code isn't consistent if this is indicated by a 0 or a -1: * udp_splice_sendfrom() and udp_tap_handler() assume that 0 indicates an unopened socket * udp_sock_init() fills in -1 for a failure to open a socket * udp_timer_one() is somewhere in between, treating only strictly positive fds as valid -1 (or, at least, negative) is really the correct choice here, since 0 is a theoretically valid fd value (if very unlikely in practice). Change to use that consistently throughout. The table does need to be initialised to all -1 values before any calls to udp_sock_init() which can happen from conf_ports(). Because C doesn't make it easy to statically initialise non zero values in large tables, this does require a somewhat awkward call to initialise the table from conf(). This is the best approach I could see for the short term, with any luck it will go away at some point when those socket tables are replaced by a unified flow table. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- conf.c | 1 + udp.c | 26 +++++++++++++++++++++----- udp.h | 1 + 3 files changed, 23 insertions(+), 5 deletions(-) diff --git a/conf.c b/conf.c index a235b31..95b3e4b 100644 --- a/conf.c +++ b/conf.c @@ -1740,6 +1740,7 @@ void conf(struct ctx *c, int argc, char **argv) c->no_map_gw = 1; /* Inbound port options can be parsed now (after IPv4/IPv6 settings) */ + udp_portmap_clear(); optind = 1; do { name = getopt_long(argc, argv, optstring, options, NULL); diff --git a/udp.c b/udp.c index cadf393..a8473e3 100644 --- a/udp.c +++ b/udp.c @@ -238,6 +238,20 @@ static struct sockaddr_in6 udp6_localname = { static struct mmsghdr udp4_mh_splice [UDP_MAX_FRAMES]; static struct mmsghdr udp6_mh_splice [UDP_MAX_FRAMES]; +/** + * udp_portmap_clear() - Clear UDP port map before configuration + */ +void udp_portmap_clear(void) +{ + unsigned i; + + for (i = 0; i < NUM_PORTS; i++) { + udp_tap_map[V4][i].sock = udp_tap_map[V6][i].sock = -1; + udp_splice_ns[V4][i].sock = udp_splice_ns[V6][i].sock = -1; + udp_splice_init[V4][i].sock = udp_splice_init[V6][i].sock = -1; + } +} + /** * udp_invert_portmap() - Compute reverse port translations for return packets * @fwd: Port forwarding configuration to compute reverse map for @@ -521,7 +535,7 @@ static void udp_splice_sendfrom(const struct ctx *c, unsigned start, unsigned n, if (from_ns) { src += c->udp.fwd_in.rdelta[src]; s = udp_splice_init[v6][src].sock; - if (!s && allow_new) + if (s < 0 && allow_new) s = udp_splice_new(c, v6, src, false); if (s < 0) @@ -532,7 +546,7 @@ static void udp_splice_sendfrom(const struct ctx *c, unsigned start, unsigned n, } else { src += c->udp.fwd_out.rdelta[src]; s = udp_splice_ns[v6][src].sock; - if (!s && allow_new) { + if (s < 0 && allow_new) { struct udp_splice_new_ns_arg arg = { c, v6, src, -1, }; @@ -844,7 +858,9 @@ int udp_tap_handler(struct ctx *c, int af, const void *saddr, const void *daddr, s_in.sin_addr = c->ip4.addr_seen; } - if (!(s = udp_tap_map[V4][src].sock)) { + debug("UDP from tap src=%hu dst=%hu, s=%d", + src, dst, udp_tap_map[V4][src].sock); + if ((s = udp_tap_map[V4][src].sock) < 0) { union udp_epoll_ref uref = { .port = src }; in_addr_t bind_addr = { 0 }; const char *bind_if = NULL; @@ -894,7 +910,7 @@ int udp_tap_handler(struct ctx *c, int af, const void *saddr, const void *daddr, bind_addr = &c->ip6.addr_ll; } - if (!(s = udp_tap_map[V6][src].sock)) { + if ((s = udp_tap_map[V6][src].sock) < 0) { union udp_epoll_ref uref = { .v6 = 1, .port = src }; const char *bind_if = NULL; @@ -1160,7 +1176,7 @@ static void udp_timer_one(struct ctx *c, int v6, enum udp_act_type type, return; } - if (s > 0) { + if (s >= 0) { epoll_ctl(c->epollfd, EPOLL_CTL_DEL, s, NULL); close(s); bitmap_clear(udp_act[v6 ? V6 : V4][type], port); diff --git a/udp.h b/udp.h index 7837134..73faf21 100644 --- a/udp.h +++ b/udp.h @@ -8,6 +8,7 @@ #define UDP_TIMER_INTERVAL 1000 /* ms */ +void udp_portmap_clear(void); void udp_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events, const struct timespec *now); int udp_tap_handler(struct ctx *c, int af, const void *saddr, const void *daddr, -- 2.41.0
On Mon, 6 Nov 2023 13:17:08 +1100 David Gibson <david(a)gibson.dropbear.id.au> wrote:udp uses the udp_tap_map, udp_splice_ns and udp_splice_init tables to keep track of already opened sockets bound to specific ports. We need a way to indicate entries where a socket hasn't been opened, but the code isn't consistent if this is indicated by a 0 or a -1: * udp_splice_sendfrom() and udp_tap_handler() assume that 0 indicates an unopened socket * udp_sock_init() fills in -1 for a failure to open a socket * udp_timer_one() is somewhere in between, treating only strictly positive fds as valid -1 (or, at least, negative) is really the correct choice here, since 0 is a theoretically valid fd value (if very unlikely in practice).Not so unlikely, actually (see also commit 6943d41d6cd0, where I missed to fix the UDP equivalents). By default we close standard input after initialising the "tap" file descriptor, so, depending on configuration options, zero might very well happen to be a UDP socket. I even pondered for a while to open a dummy file descriptor after closing standard input just for the sake of having zero as a "reserved" value, but it's not guaranteed to work.Change to use that consistently throughout. The table does need to be initialised to all -1 values before any calls to udp_sock_init() which can happen from conf_ports(). Because C doesn't make it easy to statically initialise non zero values in large tables, this does require a somewhat awkward call to initialise the table from conf(). This is the best approach I could see for the short term, with any luck it will go away at some point when those socket tables are replaced by a unified flow table. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- conf.c | 1 + udp.c | 26 +++++++++++++++++++++----- udp.h | 1 + 3 files changed, 23 insertions(+), 5 deletions(-) diff --git a/conf.c b/conf.c index a235b31..95b3e4b 100644 --- a/conf.c +++ b/conf.c @@ -1740,6 +1740,7 @@ void conf(struct ctx *c, int argc, char **argv) c->no_map_gw = 1; /* Inbound port options can be parsed now (after IPv4/IPv6 settings) */ + udp_portmap_clear(); optind = 1; do { name = getopt_long(argc, argv, optstring, options, NULL); diff --git a/udp.c b/udp.c index cadf393..a8473e3 100644 --- a/udp.c +++ b/udp.c @@ -238,6 +238,20 @@ static struct sockaddr_in6 udp6_localname = { static struct mmsghdr udp4_mh_splice [UDP_MAX_FRAMES]; static struct mmsghdr udp6_mh_splice [UDP_MAX_FRAMES]; +/** + * udp_portmap_clear() - Clear UDP port map before configuration + */ +void udp_portmap_clear(void) +{ + unsigned i; + + for (i = 0; i < NUM_PORTS; i++) { + udp_tap_map[V4][i].sock = udp_tap_map[V6][i].sock = -1; + udp_splice_ns[V4][i].sock = udp_splice_ns[V6][i].sock = -1; + udp_splice_init[V4][i].sock = udp_splice_init[V6][i].sock = -1; + } +}For TCP we do: $ grep memset\(.*0xff tcp.c tcp_splice.c tcp.c: memset(init_sock_pool4, 0xff, sizeof(init_sock_pool4)); tcp.c: memset(init_sock_pool6, 0xff, sizeof(init_sock_pool6)); tcp.c: memset(tcp_sock_init_ext, 0xff, sizeof(tcp_sock_init_ext)); tcp.c: memset(tcp_sock_ns, 0xff, sizeof(tcp_sock_ns)); tcp_splice.c: memset(splice_pipe_pool, 0xff, sizeof(splice_pipe_pool)); tcp_splice.c: memset(&ns_sock_pool4, 0xff, sizeof(ns_sock_pool4)); tcp_splice.c: memset(&ns_sock_pool6, 0xff, sizeof(ns_sock_pool6)); ...given how common this is, perhaps we could introduce a helper. In any case, I'll go ahead and apply this now, as the issue is quite bad, we can change this detail later. -- Stefano
We save sockets bound to particular ports in udp_{tap,splice}_map for reuse later. If they're not used for a time, we time them out and close them. However, when that happened, we weren't actually removing the fds from the relevant map. That meant that later interactions on the same port could get a stale fd from the map. The stale fd might be closed, leading to unexpected EBADF errors, or it could have been re-used by a completely different socket bound to a different port, which could lead to us incorrectly forwarding packets. Link: https://bugs.passt.top/show_bug.cgi?id=57 Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- udp.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/udp.c b/udp.c index a8473e3..b40d1f3 100644 --- a/udp.c +++ b/udp.c @@ -1146,14 +1146,14 @@ static void udp_timer_one(struct ctx *c, int v6, enum udp_act_type type, { struct udp_splice_port *sp; struct udp_tap_port *tp; - int s = -1; + int *sockp = NULL; switch (type) { case UDP_ACT_TAP: tp = &udp_tap_map[v6 ? V6 : V4][port]; if (ts->tv_sec - tp->ts > UDP_CONN_TIMEOUT) { - s = tp->sock; + sockp = &tp->sock; tp->flags = 0; } @@ -1162,21 +1162,23 @@ static void udp_timer_one(struct ctx *c, int v6, enum udp_act_type type, sp = &udp_splice_init[v6 ? V6 : V4][port]; if (ts->tv_sec - sp->ts > UDP_CONN_TIMEOUT) - s = sp->sock; + sockp = &sp->sock; break; case UDP_ACT_SPLICE_NS: sp = &udp_splice_ns[v6 ? V6 : V4][port]; if (ts->tv_sec - sp->ts > UDP_CONN_TIMEOUT) - s = sp->sock; + sockp = &sp->sock; break; default: return; } - if (s >= 0) { + if (sockp && *sockp >= 0) { + int s = *sockp; + *sockp = -1; epoll_ctl(c->epollfd, EPOLL_CTL_DEL, s, NULL); close(s); bitmap_clear(udp_act[v6 ? V6 : V4][type], port); -- 2.41.0
On Mon, 6 Nov 2023 13:17:09 +1100 David Gibson <david(a)gibson.dropbear.id.au> wrote:We save sockets bound to particular ports in udp_{tap,splice}_map for reuse later. If they're not used for a time, we time them out and close them. However, when that happened, we weren't actually removing the fds from the relevant map. That meant that later interactions on the same port could get a stale fd from the map. The stale fd might be closed, leading to unexpected EBADF errors, or it could have been re-used by a completely different socket bound to a different port, which could lead to us incorrectly forwarding packets. Link: https://bugs.passt.top/show_bug.cgi?id=57Adding: Reported-by: Reported-by: Chris Kuhn <kuhnchris(a)kuhnchris.eu> Reported-by: Jay <bugs.passt.top(a)bitsbetwixt.com>Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- udp.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/udp.c b/udp.c index a8473e3..b40d1f3 100644 --- a/udp.c +++ b/udp.c @@ -1146,14 +1146,14 @@ static void udp_timer_one(struct ctx *c, int v6, enum udp_act_type type, { struct udp_splice_port *sp; struct udp_tap_port *tp; - int s = -1; + int *sockp = NULL; switch (type) { case UDP_ACT_TAP: tp = &udp_tap_map[v6 ? V6 : V4][port]; if (ts->tv_sec - tp->ts > UDP_CONN_TIMEOUT) { - s = tp->sock; + sockp = &tp->sock; tp->flags = 0; } @@ -1162,21 +1162,23 @@ static void udp_timer_one(struct ctx *c, int v6, enum udp_act_type type, sp = &udp_splice_init[v6 ? V6 : V4][port]; if (ts->tv_sec - sp->ts > UDP_CONN_TIMEOUT) - s = sp->sock; + sockp = &sp->sock; break; case UDP_ACT_SPLICE_NS: sp = &udp_splice_ns[v6 ? V6 : V4][port]; if (ts->tv_sec - sp->ts > UDP_CONN_TIMEOUT) - s = sp->sock; + sockp = &sp->sock; break; default: return; } - if (s >= 0) { + if (sockp && *sockp >= 0) { + int s = *sockp; + *sockp = -1; epoll_ctl(c->epollfd, EPOLL_CTL_DEL, s, NULL); close(s); bitmap_clear(udp_act[v6 ? V6 : V4][type], port);-- Stefano
On Mon, 6 Nov 2023 13:17:07 +1100 David Gibson <david(a)gibson.dropbear.id.au> wrote:We've had some issues with DNS requests starting to fail in long running guests or containers using passt/pasta. With assistance from a reporter, I've managed to track this down to some bugs in our management of saved UDP socket fds. Link: https://bugs.passt.top/show_bug.cgi?id=57 David Gibson (2): udp: Consistently use -1 to indicate un-opened sockets in maps udp: Remove socket from udp_{tap,splice}_map when timed outApplied. -- Stefano