On Thu, 4 Jul 2024 14:58:33 +1000 David Gibson <david(a)gibson.dropbear.id.au> wrote:When we receive datagrams on a socket, we need to split them into batches depending on how they need to be forwarded (either via a specific splice socket, or via tap). The logic to do this, is somewhat awkwardly split between udp_buf_sock_handler() itself, udp_splice_send() and udp_tap_send(). Move all the batching logic into udp_buf_sock_handler(), leaving udp_splice_send() to just send the prepared batch. udp_tap_send() reduces to just a call to tap_send_frames() so open-code that call in udp_buf_sock_handler(). This will allow separating the batching logic from the rest of the datagram forwarding logic, which we'll need for upcoming flow table support. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- udp.c | 128 ++++++++++++++++++---------------------------------------- 1 file changed, 39 insertions(+), 89 deletions(-) diff --git a/udp.c b/udp.c index 8ed59639..a317e986 100644 --- a/udp.c +++ b/udp.c @@ -501,42 +501,29 @@ static void udp_splice_prepare(struct mmsghdr *mmh, unsigned idx) } /** - * udp_splice_send() - Send datagrams from socket to socket + * udp_splice_send() - Send a batch of datagrams from socket to socket * @c: Execution context - * @start: Index of first datagram in udp[46]_l2_buf - * @n: Total number of datagrams in udp[46]_l2_buf pool - * @dst: Datagrams will be sent to this port (on destination side) + * @start: Index of batch's first datagram in udp[46]_l2_buf + * @n: Number of datagrams in batch + * @src: Source port for datagram (target side) + * @dst: Destination port for datagrams (target side) * @ref: epoll reference for origin socket * @now: Timestamp - * - * This consumes as many datagrams as are sendable via a single socket. It - * requires that udp_meta[(a)start].splicesrc is initialised, and will initialise - * udp_meta[].splicesrc for each datagram it consumes *and one more* (if - * present). - * - * Return: Number of datagrams forwarded */ -static unsigned udp_splice_send(const struct ctx *c, size_t start, size_t n, - in_port_t dst, union epoll_ref ref, - const struct timespec *now) +static void udp_splice_send(const struct ctx *c, size_t start, size_t n, + in_port_t src, in_port_t dst, + union epoll_ref ref, + const struct timespec *now) { - in_port_t src = udp_meta[start].splicesrc; - struct mmsghdr *mmh_recv; - unsigned int i = start; int s; - ASSERT(udp_meta[start].splicesrc >= 0); - ASSERT(ref.type == EPOLL_TYPE_UDP); - if (ref.udp.v6) { - mmh_recv = udp6_mh_recv; udp_spliceto.sa6 = (struct sockaddr_in6) { .sin6_family = AF_INET6, .sin6_addr = in6addr_loopback, .sin6_port = htons(dst), }; } else { - mmh_recv = udp4_mh_recv; udp_spliceto.sa4 = (struct sockaddr_in) { .sin_family = AF_INET, .sin_addr = in4addr_loopback, @@ -544,15 +531,6 @@ static unsigned udp_splice_send(const struct ctx *c, size_t start, size_t n, }; } - do { - udp_splice_prepare(mmh_recv, i); - - if (++i >= n) - break; - - udp_meta[i].splicesrc = udp_mmh_splice_port(ref, &mmh_recv[i]); - } while (udp_meta[i].splicesrc == src); - if (ref.udp.pif == PIF_SPLICE) { src += c->udp.fwd_in.rdelta[src]; s = udp_splice_init[ref.udp.v6][src].sock; @@ -560,7 +538,7 @@ static unsigned udp_splice_send(const struct ctx *c, size_t start, size_t n, s = udp_splice_new(c, ref.udp.v6, src, false); if (s < 0) - goto out; + return; udp_splice_ns[ref.udp.v6][dst].ts = now->tv_sec; udp_splice_init[ref.udp.v6][src].ts = now->tv_sec; @@ -577,15 +555,13 @@ static unsigned udp_splice_send(const struct ctx *c, size_t start, size_t n, s = arg.s; } if (s < 0) - goto out; + return; udp_splice_init[ref.udp.v6][dst].ts = now->tv_sec; udp_splice_ns[ref.udp.v6][src].ts = now->tv_sec; } - sendmmsg(s, udp_mh_splice + start, i - start, MSG_NOSIGNAL); -out: - return i - start; + sendmmsg(s, udp_mh_splice + start, n, MSG_NOSIGNAL); } /** @@ -725,7 +701,7 @@ static size_t udp_update_hdr6(const struct ctx *c, * @v6: Prepare for IPv6? * @now: Current timestamp */ -static void udp_tap_prepare(const struct ctx *c, struct mmsghdr *mmh, +static void udp_tap_prepare(const struct ctx *c, const struct mmsghdr *mmh, unsigned idx, in_port_t dstport, bool v6, const struct timespec *now) { @@ -752,49 +728,6 @@ static void udp_tap_prepare(const struct ctx *c, struct mmsghdr *mmh, (*tap_iov)[UDP_IOV_PAYLOAD].iov_len = l4len; } -/** - * udp_tap_send() - Prepare UDP datagrams and send to tap interface - * @c: Execution context - * @start: Index of first datagram in udp[46]_l2_buf pool - * @n: Total number of datagrams in udp[46]_l2_buf pool - * @dstport: Destination port number on destination side - * @ref: Epoll reference for origin socket - * @now: Current timestamp - * - * This consumes as many frames as are sendable via tap. It requires that - * udp_meta[(a)start].splicesrc is initialised, and will initialise - * udp_meta[].splicesrc for each frame it consumes *and one more* (if present). - * - * Return: Number of frames sent via tap - */ -static unsigned udp_tap_send(const struct ctx *c, size_t start, size_t n, - in_port_t dstport, union epoll_ref ref, - const struct timespec *now) -{ - struct mmsghdr *mmh_recv; - size_t i = start; - - ASSERT(udp_meta[start].splicesrc == -1); - ASSERT(ref.type == EPOLL_TYPE_UDP); - - if (ref.udp.v6) - mmh_recv = udp6_mh_recv; - else - mmh_recv = udp4_mh_recv; - - do { - udp_tap_prepare(c, mmh_recv, i, dstport, ref.udp.v6, now); - - if (++i >= n) - break; - - udp_meta[i].splicesrc = udp_mmh_splice_port(ref, &mmh_recv[i]); - } while (udp_meta[i].splicesrc == -1); - - tap_send_frames(c, &udp_l2_iov[start][0], UDP_NUM_IOVS, i - start); - return i - start; -} - /** * udp_sock_recv() - Receive datagrams from a socket * @c: Execution context @@ -842,7 +775,7 @@ void udp_buf_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t eve { struct mmsghdr *mmh_recv = ref.udp.v6 ? udp6_mh_recv : udp4_mh_recv; in_port_t dstport = ref.udp.port; - int n, m, i; + int n, i; if ((n = udp_sock_recv(c, ref.fd, events, mmh_recv)) <= 0) return; @@ -852,19 +785,36 @@ void udp_buf_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t eve else if (ref.udp.pif == PIF_HOST) dstport += c->udp.fwd_in.f.delta[dstport]; - /* We divide things into batches based on how we need to send them, + /* We divide datagrams into batches based on how we need to send them, * determined by udp_meta[i].splicesrc. To avoid either two passes * through the array, or recalculating splicesrc for a single entry, we - * have to populate it one entry *ahead* of the loop counter (if - * present). So we fill in entry 0 before the loop, then udp_*_send() - * populate one entry past where they consume. + * have to populate it one entry *ahead* of the loop counter. */ udp_meta[0].splicesrc = udp_mmh_splice_port(ref, mmh_recv); - for (i = 0; i < n; i += m) { - if (udp_meta[i].splicesrc >= 0) - m = udp_splice_send(c, i, n, dstport, ref, now); + for (i = 0; i < n; ) { + int batchsrc = udp_meta[i].splicesrc; + int batchstart = i; + + do { + if (batchsrc >= 0) + udp_splice_prepare(mmh_recv, i); + else + udp_tap_prepare(c, mmh_recv, i, dstport, + ref.udp.v6, now); + + if (++i >= n) + break; + + udp_meta[i].splicesrc = udp_mmh_splice_port(ref, + &mmh_recv[i]); + } while (udp_meta[i].splicesrc == batchsrc); + + if (batchsrc >= 0) + udp_splice_send(c, batchstart, i - batchstart, + batchsrc, dstport, ref, now); else - m = udp_tap_send(c, i, n, dstport, ref, now); + tap_send_frames(c, &udp_l2_iov[batchstart][0], + UDP_NUM_IOVS, i - batchstart); }The logic looks correct to me, but the nested loop makes it a bit hard to grasp. I'm wondering if we shouldn't rather have a single loop, always preparing the datagrams, noting down the previous udp_meta[i].splicesrc and the first index of a batch, and starting a new batch (sending the previous one) once the current udp_meta[i].splicesrc doesn't match the previous value. I tried to sketch this quickly but failed for the moment to come up with anything vaguely elegant, so I'm fine with either version. Nits: curly brackets around multiple lines. -- Stefano