Re: [PATCH v2 5/5] tcp, udp: Pad batched frames for vhost-user modes to 60 bytes (802.3 minimum)

6 Dec 2025

On Fri, 5 Dec 2025 15:07:27 +1100
David Gibson  wrote:
...
On Fri, Dec 05, 2025 at 01:51:57AM +0100, Stefano Brivio wrote:
...
For both TCP and UDP, we request vhost-user buffers that are large
enough to reach ETH_ZLEN (60 bytes), so padding is just a matter of
increasing the appropriate iov_len and clearing bytes in the buffer
as needed.
Link: https://bugs.passt.top/show_bug.cgi?id=166
Signed-off-by: Stefano Brivio 
---
 tcp.c          |  2 --
 tcp_internal.h |  1 +
 tcp_vu.c       | 32 ++++++++++++++++++++++++++++----
 udp_vu.c       | 13 ++++++++++++-
 4 files changed, 41 insertions(+), 7 deletions(-)

diff --git a/tcp.c b/tcp.c
index c5486bc..8cd062f 100644
--- a/tcp.c
+++ b/tcp.c
@@ -341,8 +341,6 @@ enum {
 };
 #endif
-/* MSS rounding: see SET_MSS() */
-#define MSS_DEFAULT			536
 #define WINDOW_DEFAULT			14600		/* RFC 6928 */
#define RTO_INIT			1		/* s, RFC 6298 */
diff --git a/tcp_internal.h b/tcp_internal.h
index 5f8fb35..d2295c9 100644
--- a/tcp_internal.h
+++ b/tcp_internal.h
@@ -12,6 +12,7 @@
 #define BUF_DISCARD_SIZE	(1 << 20)
 #define DISCARD_IOV_NUM		DIV_ROUND_UP(MAX_WINDOW, BUF_DISCARD_SIZE)
+#define MSS_DEFAULT /* and minimum */	536 /* as it comes from minimum MTU */
 #define MSS4				ROUND_DOWN(IP_MAX_MTU -		   \
      				   sizeof(struct tcphdr) - \
      				   sizeof(struct iphdr),   \
diff --git a/tcp_vu.c b/tcp_vu.c
index 1c81ce3..638813c 100644
--- a/tcp_vu.c
+++ b/tcp_vu.c
@@ -60,6 +60,26 @@ static size_t tcp_vu_hdrlen(bool v6)
  return hdrlen;
 }
+/**
+ * tcp_vu_pad() - Pad 802.3 frame to minimum length (60 bytes) if needed
+ * @iov:	iovec array storing 802.3 frame with TCP segment inside
+ * @cnt:	Number of entries in @iov
+ */
+static void tcp_vu_pad(struct iovec *iov, size_t cnt)
+{
+	size_t l2len, pad;
+
+	ASSERT(iov_size(iov, cnt) >= sizeof(struct virtio_net_hdr_mrg_rxbuf));
+	l2len = iov_size(iov, cnt) - sizeof(struct virtio_net_hdr_mrg_rxbuf);
+	if (l2len >= ETH_ZLEN)
+		return;
+
+	pad = ETH_ZLEN - l2len;
+
+	memset((char *)iov[cnt - 1].iov_base + iov[cnt - 1].iov_len, 0, pad);
+	iov[cnt - 1].iov_len += pad;
This assumes there's enough allocated space in last buffer to extend
it this way.  I'm pretty sure that's true in practice, but it's not
super obvious from right here.
I can't see a way to avoid that without doing a substantial rework of
both paths.  But we should at least document the assumption in a
comment.
Oops, that's actually not guaranteed. The only assumption we have is
that the first buffer is at least 54 bytes long for IPv4:

	/* we guess the first iovec provided by the guest can embed
	 * all the headers needed by L2 frame
	 */
	ASSERT(iov[0].iov_len >= hdrlen);

because as far as I understand there's no known vhost-user
implementation that would give us less than 1k or so, and we request at
least 60 bytes from vu_collect(), but that might lead for example to a
situation with iov[n].iov_len = < 54, 1, 5 > and we would write 6 bytes
to the last element.

I could have switched to something like a iov_from_buf() call from the
60-byte eth_pad[] I added in 4/5, but there's no version adjusting
iov_len, and we probably wouldn't need it for anything else.

So I changed approach altogether: I'll just ASSERT() that we have at
least 60 bytes in the first buffer. It's just 6 bytes on top in the
worst case, and still much less than any buffer we might actually see
in practice.
...
Also, there's not anything really TCP specific about this, so we
should be able to re-use it for UDP, no?
Renamed to vu_pad() and moved to vu_common.c.
...
...
+}
+
 /**
  * tcp_vu_send_flag() - Send segment with flags to vhost-user (no payload)
  * @c:		Execution context
@@ -91,12 +111,11 @@ int tcp_vu_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags)
  vu_set_element(&flags_elem[0], NULL, &flags_iov[0]);
elem_cnt = vu_collect(vdev, vq, &flags_elem[0], 1,
-			      hdrlen + sizeof(struct tcp_syn_opts), NULL);
+			      MAX(hdrlen + sizeof(*opts), ETH_ZLEN), NULL);
  if (elem_cnt != 1)
      return -1;
-	ASSERT(flags_elem[0].in_sg[0].iov_len >=
-	       hdrlen + sizeof(struct tcp_syn_opts));
+	ASSERT(flags_elem[0].in_sg[0].iov_len >= hdrlen + sizeof(*opts));
vu_set_vnethdr(vdev, flags_elem[0].in_sg[0].iov_base, 1);
@@ -138,6 +157,8 @@ int tcp_vu_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags)
A few lines above here is where we truncate iov_len[] to match the
actual length of the frame.  Replace that with a call to the
pad/truncate function.
Ah, no need anymore, as the first buffer is now the only buffer that
might ever need padding.
...
...
tcp_fill_headers(c, conn, NULL, eh, ip4h, ip6h, th, &payload,
      	 NULL, seq, !*c->pcap);
+	tcp_vu_pad(&flags_elem[0].in_sg[0], 1);
+
  if (*c->pcap) {
      pcap_iov(&flags_elem[0].in_sg[0], 1,
      	 sizeof(struct virtio_net_hdr_mrg_rxbuf));
@@ -211,7 +232,8 @@ static ssize_t tcp_vu_sock_recv(const struct ctx *c, struct vu_virtq *vq,
cnt = vu_collect(vdev, vq, &elem[elem_cnt],
      		 VIRTQUEUE_MAX_SIZE - elem_cnt,
-				 MIN(mss, fillsize) + hdrlen, &frame_size);
+				 MAX(MIN(mss, fillsize) + hdrlen, ETH_ZLEN),
+				 &frame_size);
      if (cnt == 0)
      	break;
@@ -456,6 +478,8 @@ int tcp_vu_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
tcp_vu_prepare(c, conn, iov, buf_cnt, &check, !*c->pcap, push);
+		tcp_vu_pad(iov, buf_cnt);
+
      if (*c->pcap) {
      	pcap_iov(iov, buf_cnt,
      		 sizeof(struct virtio_net_hdr_mrg_rxbuf));
diff --git a/udp_vu.c b/udp_vu.c
index 099677f..33dbb9a 100644
--- a/udp_vu.c
+++ b/udp_vu.c
@@ -72,8 +72,8 @@ static int udp_vu_sock_recv(const struct ctx *c, struct vu_virtq *vq, int s,
 {
  const struct vu_dev *vdev = c->vdev;
  int iov_cnt, idx, iov_used;
+	size_t off, hdrlen, l2len;
  struct msghdr msg  = { 0 };
-	size_t off, hdrlen;
ASSERT(!c->no_udp);
@@ -116,6 +116,17 @@ static int udp_vu_sock_recv(const struct ctx *c, struct vu_virtq *vq, int s,
      iov_vu[idx].iov_len = off;
  iov_used = idx + !!off;
+	/* pad 802.3 frame to 60 bytes if needed */
+	l2len = *dlen + hdrlen - sizeof(struct virtio_net_hdr_mrg_rxbuf);
+	if (l2len < ETH_ZLEN) {
+		size_t pad = ETH_ZLEN - l2len;
+
+		memset((char *)iov_vu[idx].iov_base + iov_vu[idx].iov_len,
+		       0, pad);
+
+		iov_vu[idx].iov_len += pad;
+	}
+
  vu_set_vnethdr(vdev, iov_vu[0].iov_base, iov_used);
/* release unused buffers */
-- 
2.43.0
-- 
Stefano