On Fri, Jul 05, 2024 at 11:10:45AM +0200, Stefano Brivio wrote:On Thu, 4 Jul 2024 14:58:33 +1000 David Gibson <david(a)gibson.dropbear.id.au> wrote:I don't disagree it's pretty hard to follow, but I haven't really seen a better way.When we receive datagrams on a socket, we need to split them into batches depending on how they need to be forwarded (either via a specific splice socket, or via tap). The logic to do this, is somewhat awkwardly split between udp_buf_sock_handler() itself, udp_splice_send() and udp_tap_send(). Move all the batching logic into udp_buf_sock_handler(), leaving udp_splice_send() to just send the prepared batch. udp_tap_send() reduces to just a call to tap_send_frames() so open-code that call in udp_buf_sock_handler(). This will allow separating the batching logic from the rest of the datagram forwarding logic, which we'll need for upcoming flow table support. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- udp.c | 128 ++++++++++++++++++---------------------------------------- 1 file changed, 39 insertions(+), 89 deletions(-) diff --git a/udp.c b/udp.c index 8ed59639..a317e986 100644 --- a/udp.c +++ b/udp.c @@ -501,42 +501,29 @@ static void udp_splice_prepare(struct mmsghdr *mmh, unsigned idx) } /** - * udp_splice_send() - Send datagrams from socket to socket + * udp_splice_send() - Send a batch of datagrams from socket to socket * @c: Execution context - * @start: Index of first datagram in udp[46]_l2_buf - * @n: Total number of datagrams in udp[46]_l2_buf pool - * @dst: Datagrams will be sent to this port (on destination side) + * @start: Index of batch's first datagram in udp[46]_l2_buf + * @n: Number of datagrams in batch + * @src: Source port for datagram (target side) + * @dst: Destination port for datagrams (target side) * @ref: epoll reference for origin socket * @now: Timestamp - * - * This consumes as many datagrams as are sendable via a single socket. It - * requires that udp_meta[(a)start].splicesrc is initialised, and will initialise - * udp_meta[].splicesrc for each datagram it consumes *and one more* (if - * present). - * - * Return: Number of datagrams forwarded */ -static unsigned udp_splice_send(const struct ctx *c, size_t start, size_t n, - in_port_t dst, union epoll_ref ref, - const struct timespec *now) +static void udp_splice_send(const struct ctx *c, size_t start, size_t n, + in_port_t src, in_port_t dst, + union epoll_ref ref, + const struct timespec *now) { - in_port_t src = udp_meta[start].splicesrc; - struct mmsghdr *mmh_recv; - unsigned int i = start; int s; - ASSERT(udp_meta[start].splicesrc >= 0); - ASSERT(ref.type == EPOLL_TYPE_UDP); - if (ref.udp.v6) { - mmh_recv = udp6_mh_recv; udp_spliceto.sa6 = (struct sockaddr_in6) { .sin6_family = AF_INET6, .sin6_addr = in6addr_loopback, .sin6_port = htons(dst), }; } else { - mmh_recv = udp4_mh_recv; udp_spliceto.sa4 = (struct sockaddr_in) { .sin_family = AF_INET, .sin_addr = in4addr_loopback, @@ -544,15 +531,6 @@ static unsigned udp_splice_send(const struct ctx *c, size_t start, size_t n, }; } - do { - udp_splice_prepare(mmh_recv, i); - - if (++i >= n) - break; - - udp_meta[i].splicesrc = udp_mmh_splice_port(ref, &mmh_recv[i]); - } while (udp_meta[i].splicesrc == src); - if (ref.udp.pif == PIF_SPLICE) { src += c->udp.fwd_in.rdelta[src]; s = udp_splice_init[ref.udp.v6][src].sock; @@ -560,7 +538,7 @@ static unsigned udp_splice_send(const struct ctx *c, size_t start, size_t n, s = udp_splice_new(c, ref.udp.v6, src, false); if (s < 0) - goto out; + return; udp_splice_ns[ref.udp.v6][dst].ts = now->tv_sec; udp_splice_init[ref.udp.v6][src].ts = now->tv_sec; @@ -577,15 +555,13 @@ static unsigned udp_splice_send(const struct ctx *c, size_t start, size_t n, s = arg.s; } if (s < 0) - goto out; + return; udp_splice_init[ref.udp.v6][dst].ts = now->tv_sec; udp_splice_ns[ref.udp.v6][src].ts = now->tv_sec; } - sendmmsg(s, udp_mh_splice + start, i - start, MSG_NOSIGNAL); -out: - return i - start; + sendmmsg(s, udp_mh_splice + start, n, MSG_NOSIGNAL); } /** @@ -725,7 +701,7 @@ static size_t udp_update_hdr6(const struct ctx *c, * @v6: Prepare for IPv6? * @now: Current timestamp */ -static void udp_tap_prepare(const struct ctx *c, struct mmsghdr *mmh, +static void udp_tap_prepare(const struct ctx *c, const struct mmsghdr *mmh, unsigned idx, in_port_t dstport, bool v6, const struct timespec *now) { @@ -752,49 +728,6 @@ static void udp_tap_prepare(const struct ctx *c, struct mmsghdr *mmh, (*tap_iov)[UDP_IOV_PAYLOAD].iov_len = l4len; } -/** - * udp_tap_send() - Prepare UDP datagrams and send to tap interface - * @c: Execution context - * @start: Index of first datagram in udp[46]_l2_buf pool - * @n: Total number of datagrams in udp[46]_l2_buf pool - * @dstport: Destination port number on destination side - * @ref: Epoll reference for origin socket - * @now: Current timestamp - * - * This consumes as many frames as are sendable via tap. It requires that - * udp_meta[(a)start].splicesrc is initialised, and will initialise - * udp_meta[].splicesrc for each frame it consumes *and one more* (if present). - * - * Return: Number of frames sent via tap - */ -static unsigned udp_tap_send(const struct ctx *c, size_t start, size_t n, - in_port_t dstport, union epoll_ref ref, - const struct timespec *now) -{ - struct mmsghdr *mmh_recv; - size_t i = start; - - ASSERT(udp_meta[start].splicesrc == -1); - ASSERT(ref.type == EPOLL_TYPE_UDP); - - if (ref.udp.v6) - mmh_recv = udp6_mh_recv; - else - mmh_recv = udp4_mh_recv; - - do { - udp_tap_prepare(c, mmh_recv, i, dstport, ref.udp.v6, now); - - if (++i >= n) - break; - - udp_meta[i].splicesrc = udp_mmh_splice_port(ref, &mmh_recv[i]); - } while (udp_meta[i].splicesrc == -1); - - tap_send_frames(c, &udp_l2_iov[start][0], UDP_NUM_IOVS, i - start); - return i - start; -} - /** * udp_sock_recv() - Receive datagrams from a socket * @c: Execution context @@ -842,7 +775,7 @@ void udp_buf_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t eve { struct mmsghdr *mmh_recv = ref.udp.v6 ? udp6_mh_recv : udp4_mh_recv; in_port_t dstport = ref.udp.port; - int n, m, i; + int n, i; if ((n = udp_sock_recv(c, ref.fd, events, mmh_recv)) <= 0) return; @@ -852,19 +785,36 @@ void udp_buf_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t eve else if (ref.udp.pif == PIF_HOST) dstport += c->udp.fwd_in.f.delta[dstport]; - /* We divide things into batches based on how we need to send them, + /* We divide datagrams into batches based on how we need to send them, * determined by udp_meta[i].splicesrc. To avoid either two passes * through the array, or recalculating splicesrc for a single entry, we - * have to populate it one entry *ahead* of the loop counter (if - * present). So we fill in entry 0 before the loop, then udp_*_send() - * populate one entry past where they consume. + * have to populate it one entry *ahead* of the loop counter. */ udp_meta[0].splicesrc = udp_mmh_splice_port(ref, mmh_recv); - for (i = 0; i < n; i += m) { - if (udp_meta[i].splicesrc >= 0) - m = udp_splice_send(c, i, n, dstport, ref, now); + for (i = 0; i < n; ) { + int batchsrc = udp_meta[i].splicesrc; + int batchstart = i; + + do { + if (batchsrc >= 0) + udp_splice_prepare(mmh_recv, i); + else + udp_tap_prepare(c, mmh_recv, i, dstport, + ref.udp.v6, now); + + if (++i >= n) + break; + + udp_meta[i].splicesrc = udp_mmh_splice_port(ref, + &mmh_recv[i]); + } while (udp_meta[i].splicesrc == batchsrc); + + if (batchsrc >= 0) + udp_splice_send(c, batchstart, i - batchstart, + batchsrc, dstport, ref, now); else - m = udp_tap_send(c, i, n, dstport, ref, now); + tap_send_frames(c, &udp_l2_iov[batchstart][0], + UDP_NUM_IOVS, i - batchstart); }The logic looks correct to me, but the nested loop makes it a bit hard to grasp.I'm wondering if we shouldn't rather have a single loop, always preparing the datagrams, noting down the previous udp_meta[i].splicesrc and the first index of a batch, and starting a new batch (sending the previous one) once the current udp_meta[i].splicesrc doesn't match the previous value.I can't really picture what you have in mind here.I tried to sketch this quickly but failed for the moment to come up with anything vaguely elegant, so I'm fine with either version. Nits: curly brackets around multiple lines.That, at least, I can fix. -- David Gibson (he or they) | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you, not the other way | around. http://www.ozlabs.org/~dgibson