[PATCH 00/10] RFC: Unify and simplify tap send path

David Gibson

8 Dec 2022 8 Dec '22

9:55 a.m.

Although we have an abstraction for the "slow path" (DHCP, NDP) guest bound packets, the TCP and UDP forwarding paths write directly to the tap fd. However, it turns out how they send frames to the tap device is more similar than it originally appears. This series unifies the low-level tap send functions for TCP and UDP, and makes some clean ups along the way. David Gibson (10): pcap: Introduce pcap_frame() helper pcap: Replace pcapm() with pcap_multiple() tcp: Combine two parts of passt tap send path together tcp: Don't keep compute total bytes in a message until we need it tcp: Improve interface to tcp_l2_buf_flush() tcp: Combine two parts of pasta tap send path together tap, tcp: Move tap send path to tap.c tcp,tap: Use different io vector bases depending on tap type udp: Use tap_send_frames() tap: Improve handling of partial frame sends pcap.c | 78 ++++++++----------------------- pcap.h | 3 +- tap.c | 108 ++++++++++++++++++++++++++++++++++++++++++ tap.h | 1 + tcp.c | 145 +++++++++++++-------------------------------------------- udp.c | 145 +++------------------------------------------------------ udp.h | 2 +- 7 files changed, 169 insertions(+), 313 deletions(-) -- 2.38.1

Show replies by date

David Gibson

8 Dec 8 Dec

9:55 a.m.

New subject: [PATCH 01/10] pcap: Introduce pcap_frame() helper

pcap(), pcapm() and pcapmm() duplicate some code for the actual writing to the capture file. The main purpose of pcapm() and pcapmm() not calling pcap() seems to be to avoid repeatedly calling gettimeofday(). We can accomplish that while still sharing code by adding a new helper which takes the packet timestamp as a parameter. Signed-off-by: David Gibson --- pcap.c | 59 +++++++++++++++++++++++----------------------------------- 1 file changed, 23 insertions(+), 36 deletions(-) diff --git a/pcap.c b/pcap.c index 836688d..c9ac5cf 100644 --- a/pcap.c +++ b/pcap.c @@ -65,6 +65,24 @@ struct pcap_pkthdr { uint32_t len; }; +/** + * pcap_frame() - Capture a single frame to pcap file with given timestamp + * @pkt: Pointer to data buffer, including L2 headers + * @len: L2 packet length + * @tv: Timestamp + */ +static void pcap_frame(const char *pkt, size_t len, const struct timeval *tv) +{ + struct pcap_pkthdr h; + + h.tv_sec = tv->tv_sec; + h.tv_usec = tv->tv_usec; + h.caplen = h.len = len; + + if (write(pcap_fd, &h, sizeof(h)) < 0 || write(pcap_fd, pkt, len) < 0) + debug("Cannot log packet, length %lu", len); +} + /** * pcap() - Capture a single frame to pcap file * @pkt: Pointer to data buffer, including L2 headers @@ -72,19 +90,13 @@ struct pcap_pkthdr { */ void pcap(const char *pkt, size_t len) { - struct pcap_pkthdr h; struct timeval tv; if (pcap_fd == -1) return; gettimeofday(&tv, NULL); - h.tv_sec = tv.tv_sec; - h.tv_usec = tv.tv_usec; - h.caplen = h.len = len; - - if (write(pcap_fd, &h, sizeof(h)) < 0 || write(pcap_fd, pkt, len) < 0) - debug("Cannot log packet, length %lu", len); + pcap_frame(pkt, len, &tv); } /** @@ -93,8 +105,6 @@ void pcap(const char *pkt, size_t len) */ void pcapm(const struct msghdr *mh) { - struct pcap_pkthdr h; - struct iovec *iov; struct timeval tv; unsigned int i; @@ -102,24 +112,12 @@ void pcapm(const struct msghdr *mh) return; gettimeofday(&tv, NULL); - h.tv_sec = tv.tv_sec; - h.tv_usec = tv.tv_usec; for (i = 0; i < mh->msg_iovlen; i++) { 
- iov = &mh->msg_iov[i]; - - h.caplen = h.len = iov->iov_len - 4; + const struct iovec *iov = &mh->msg_iov[i]; - if (write(pcap_fd, &h, sizeof(h)) < 0) - goto fail; - if (write(pcap_fd, (char *)iov->iov_base + 4, - iov->iov_len - 4) < 0) - goto fail; + pcap_frame((char *)iov->iov_base + 4, iov->iov_len - 4, &tv); } - - return; -fail: - debug("Cannot log packet, length %lu", iov->iov_len - 4); } /** @@ -128,7 +126,6 @@ fail: */ void pcapmm(const struct mmsghdr *mmh, unsigned int vlen) { - struct pcap_pkthdr h; struct iovec *iov; struct timeval tv; unsigned int i, j; @@ -137,8 +134,6 @@ void pcapmm(const struct mmsghdr *mmh, unsigned int vlen) return; gettimeofday(&tv, NULL); - h.tv_sec = tv.tv_sec; - h.tv_usec = tv.tv_usec; for (i = 0; i < vlen; i++) { const struct msghdr *mh = &mmh[i].msg_hdr; @@ -146,18 +141,10 @@ void pcapmm(const struct mmsghdr *mmh, unsigned int vlen) for (j = 0; j < mh->msg_iovlen; j++) { iov = &mh->msg_iov[j]; - h.caplen = h.len = iov->iov_len - 4; - - if (write(pcap_fd, &h, sizeof(h)) < 0) - goto fail; - if (write(pcap_fd, (char *)iov->iov_base + 4, - iov->iov_len - 4) < 0) - goto fail; + pcap_frame((char *)iov->iov_base + 4, iov->iov_len - 4, + &tv); } } - return; -fail: - debug("Cannot log packet, length %lu", iov->iov_len - 4); } /** -- 2.38.1

David Gibson

9:55 a.m.

New subject: [PATCH 02/10] pcap: Replace pcapm() with pcap_multiple()

pcapm() captures multiple frames from a msghdr, however the only thing it cares about in the msghdr is the list of buffers, where it assumes there is one frame to capture per buffer. That's what we want for its single caller but it's not the only obvious choice here (one frame per msghdr would arguably make more sense in isolation). In addition pcapm() has logic that only makes sense in the context of the passt-specific path it's called from: it skips the first 4 bytes of each buffer, because those have the qemu vnet_len rather than the frame proper. Make this clearer by replacing pcapm() with pcap_multiple() which more explicitly takes one struct iovec per frame, and parameterizes how much of each buffer to skip (i.e. the offset of the frame within the buffer). Signed-off-by: David Gibson --- pcap.c | 16 ++++++++-------- pcap.h | 2 +- tcp.c | 3 ++- 3 files changed, 11 insertions(+), 10 deletions(-) diff --git a/pcap.c b/pcap.c index c9ac5cf..8af9021 100644 --- a/pcap.c +++ b/pcap.c @@ -100,10 +100,12 @@ void pcap(const char *pkt, size_t len) } /** - * pcapm() - Capture multiple frames from message header to pcap file - * @mh: Pointer to sendmsg() message header buffer + * pcap_multiple() - Capture multiple frames + * @iov: Array of iovecs, one entry per frame + * @n: Number of frames to capture + * @offset: Offset of the frame within each iovec buffer */ -void pcapm(const struct msghdr *mh) +void pcap_multiple(const struct iovec *iov, unsigned int n, size_t offset) { struct timeval tv; unsigned int i; @@ -113,11 +115,9 @@ void pcapm(const struct msghdr *mh) gettimeofday(&tv, NULL); - for (i = 0; i < mh->msg_iovlen; i++) { - const struct iovec *iov = &mh->msg_iov[i]; - - pcap_frame((char *)iov->iov_base + 4, iov->iov_len - 4, &tv); - } + for (i = 0; i < n; i++) + pcap_frame((char *)iov[i].iov_base + offset, + iov[i].iov_len - offset, &tv); } /** diff --git a/pcap.h b/pcap.h index 9e1736c..eafc89b 100644 --- a/pcap.h +++ b/pcap.h @@ -7,7 +7,7 @@ #define PCAP_H void 
pcap(const char *pkt, size_t len); -void pcapm(const struct msghdr *mh); +void pcap_multiple(const struct iovec *iov, unsigned int n, size_t offset); void pcapmm(const struct mmsghdr *mmh, unsigned int vlen); void pcap_init(struct ctx *c); diff --git a/tcp.c b/tcp.c index cfdae06..ed65a9e 100644 --- a/tcp.c +++ b/tcp.c @@ -1468,7 +1468,8 @@ static void tcp_l2_buf_flush(struct ctx *c, struct msghdr *mh, } } *buf_used = *buf_bytes = 0; - pcapm(mh); + + pcap_multiple(mh->msg_iov, mh->msg_iovlen, sizeof(uint32_t)); } /** -- 2.38.1

David Gibson

9:55 a.m.

New subject: [PATCH 03/10] tcp: Combine two parts of passt tap send path together

tcp_l2_buf_flush() open codes the "primary" send of a message to the passt tap interface, but calls tcp_l2_buf_flush_part() to handle the case of a short send. Combine these two passt-specific operations into tcp_l2_buf_flush_passt() which is a little cleaner and will enable further cleanups. Signed-off-by: David Gibson --- tcp.c | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/tcp.c b/tcp.c index ed65a9e..fe788b4 100644 --- a/tcp.c +++ b/tcp.c @@ -1415,22 +1415,28 @@ static int tcp_l2_buf_write_one(struct ctx *c, const struct iovec *iov) } /** - * tcp_l2_buf_flush_part() - Ensure a complete last message on partial sendmsg() + * tcp_l2_buf_flush_passt() - Send a message on the passt tap interface * @c: Execution context * @mh: Message header that was partially sent by sendmsg() - * @sent: Bytes already sent + * @buf_bytes: Total number of bytes to send */ -static void tcp_l2_buf_flush_part(const struct ctx *c, - const struct msghdr *mh, size_t sent) +static void tcp_l2_buf_flush_passt(const struct ctx *c, + const struct msghdr *mh, size_t buf_bytes) { - size_t end = 0, missing; + size_t end = 0, missing, sent; struct iovec *iov; unsigned int i; + ssize_t n; char *p; + n = sendmsg(c->fd_tap, mh, MSG_NOSIGNAL | MSG_DONTWAIT); + if (n < 0 || ((sent = (size_t)n) == buf_bytes)) + return; + + /* Ensure a complete last message on partial sendmsg() */ for (i = 0, iov = mh->msg_iov; i < mh->msg_iovlen; i++, iov++) { end += iov->iov_len; - if (end >= sent) + if (end >= (size_t)sent) break; } @@ -1454,9 +1460,7 @@ static void tcp_l2_buf_flush(struct ctx *c, struct msghdr *mh, return; if (c->mode == MODE_PASST) { - size_t n = sendmsg(c->fd_tap, mh, MSG_NOSIGNAL | MSG_DONTWAIT); - if (n > 0 && n < *buf_bytes) - tcp_l2_buf_flush_part(c, mh, n); + tcp_l2_buf_flush_passt(c, mh, *buf_bytes); } else { size_t i; -- 2.38.1

David Gibson

9:55 a.m.

New subject: [PATCH 04/10] tcp: Don't compute total bytes in a message until we need it

tcp[46]_l2_buf_bytes keep track of the total number of bytes we have queued to send to the tap interface. tcp_l2_buf_flush_passt() uses this to determine if sendmsg() has sent all the data we requested, or whether we need to resend a trailing portion. However, the logic for finding where we're up to in the case of a short sendmsg() can equally well tell whether we've had one at all, without knowing the total number in advance. This does require an extra loop after each sendmsg(), but it's doing simple arithmetic on values we've already been accessing, and it leads to overall simpler code. Signed-off-by: David Gibson --- tcp.c | 46 +++++++++++++++++++--------------------------- 1 file changed, 19 insertions(+), 27 deletions(-) diff --git a/tcp.c b/tcp.c index fe788b4..6963d1c 100644 --- a/tcp.c +++ b/tcp.c @@ -476,7 +476,6 @@ static struct tcp4_l2_buf_t { tcp4_l2_buf[TCP_FRAMES_MEM]; static unsigned int tcp4_l2_buf_used; -static size_t tcp4_l2_buf_bytes; /** * tcp6_l2_buf_t - Pre-cooked IPv6 packet buffers for tap connections @@ -507,7 +506,6 @@ struct tcp6_l2_buf_t { tcp6_l2_buf[TCP_FRAMES_MEM]; static unsigned int tcp6_l2_buf_used; -static size_t tcp6_l2_buf_bytes; /* recvmsg()/sendmsg() data for tap */ static char tcp_buf_discard [MAX_WINDOW]; @@ -1418,19 +1416,17 @@ static int tcp_l2_buf_write_one(struct ctx *c, const struct iovec *iov) * tcp_l2_buf_flush_passt() - Send a message on the passt tap interface * @c: Execution context * @mh: Message header that was partially sent by sendmsg() - * @buf_bytes: Total number of bytes to send */ -static void tcp_l2_buf_flush_passt(const struct ctx *c, - const struct msghdr *mh, size_t buf_bytes) +static void tcp_l2_buf_flush_passt(const struct ctx *c, const struct msghdr *mh) { - size_t end = 0, missing, sent; + size_t end = 0, missing; struct iovec *iov; unsigned int i; - ssize_t n; + ssize_t sent; char *p; - n = sendmsg(c->fd_tap, mh, MSG_NOSIGNAL | MSG_DONTWAIT); - if (n < 0 || ((sent = (size_t)n) == buf_bytes)) + sent 
= sendmsg(c->fd_tap, mh, MSG_NOSIGNAL | MSG_DONTWAIT); + if (sent < 0) return; /* Ensure a complete last message on partial sendmsg() */ @@ -1441,6 +1437,9 @@ static void tcp_l2_buf_flush_passt(const struct ctx *c, } missing = end - sent; + if (!missing) + return; + p = (char *)iov->iov_base + iov->iov_len - missing; if (send(c->fd_tap, p, missing, MSG_NOSIGNAL)) debug("TCP: failed to flush %lu missing bytes to tap", missing); @@ -1451,19 +1450,18 @@ static void tcp_l2_buf_flush_passt(const struct ctx *c, * @c: Execution context * @mh: Message header pointing to buffers, msg_iovlen not set * @buf_used: Pointer to count of used buffers, set to 0 on return - * @buf_bytes: Pointer to count of buffer bytes, set to 0 on return */ static void tcp_l2_buf_flush(struct ctx *c, struct msghdr *mh, - unsigned int *buf_used, size_t *buf_bytes) + unsigned int *buf_used) { + size_t i; + if (!(mh->msg_iovlen = *buf_used)) return; if (c->mode == MODE_PASST) { - tcp_l2_buf_flush_passt(c, mh, *buf_bytes); + tcp_l2_buf_flush_passt(c, mh); } else { - size_t i; - for (i = 0; i < mh->msg_iovlen; i++) { struct iovec *iov = &mh->msg_iov[i]; @@ -1471,7 +1469,7 @@ static void tcp_l2_buf_flush(struct ctx *c, struct msghdr *mh, i--; } } - *buf_used = *buf_bytes = 0; + *buf_used = 0; pcap_multiple(mh->msg_iov, mh->msg_iovlen, sizeof(uint32_t)); } @@ -1484,17 +1482,14 @@ static void tcp_l2_flags_buf_flush(struct ctx *c) { struct msghdr mh = { 0 }; unsigned int *buf_used; - size_t *buf_bytes; mh.msg_iov = tcp6_l2_flags_iov; buf_used = &tcp6_l2_flags_buf_used; - buf_bytes = &tcp6_l2_flags_buf_bytes; - tcp_l2_buf_flush(c, &mh, buf_used, buf_bytes); + tcp_l2_buf_flush(c, &mh, buf_used); mh.msg_iov = tcp4_l2_flags_iov; buf_used = &tcp4_l2_flags_buf_used; - buf_bytes = &tcp4_l2_flags_buf_bytes; - tcp_l2_buf_flush(c, &mh, buf_used, buf_bytes); + tcp_l2_buf_flush(c, &mh, buf_used); } /** @@ -1505,17 +1500,14 @@ static void tcp_l2_data_buf_flush(struct ctx *c) { struct msghdr mh = { 0 }; unsigned int 
*buf_used; - size_t *buf_bytes; mh.msg_iov = tcp6_l2_iov; buf_used = &tcp6_l2_buf_used; - buf_bytes = &tcp6_l2_buf_bytes; - tcp_l2_buf_flush(c, &mh, buf_used, buf_bytes); + tcp_l2_buf_flush(c, &mh, buf_used); mh.msg_iov = tcp4_l2_iov; buf_used = &tcp4_l2_buf_used; - buf_bytes = &tcp4_l2_buf_bytes; - tcp_l2_buf_flush(c, &mh, buf_used, buf_bytes); + tcp_l2_buf_flush(c, &mh, buf_used); } /** @@ -2203,7 +2195,7 @@ static void tcp_data_to_tap(struct ctx *c, struct tcp_tap_conn *conn, len = tcp_l2_buf_fill_headers(c, conn, b, plen, check, seq); iov = tcp4_l2_iov + tcp4_l2_buf_used++; - tcp4_l2_buf_bytes += iov->iov_len = len + sizeof(b->vnet_len); + iov->iov_len = len + sizeof(b->vnet_len); if (tcp4_l2_buf_used > ARRAY_SIZE(tcp4_l2_buf) - 1) tcp_l2_data_buf_flush(c); } else if (CONN_V6(conn)) { @@ -2212,7 +2204,7 @@ static void tcp_data_to_tap(struct ctx *c, struct tcp_tap_conn *conn, len = tcp_l2_buf_fill_headers(c, conn, b, plen, NULL, seq); iov = tcp6_l2_iov + tcp6_l2_buf_used++; - tcp6_l2_buf_bytes += iov->iov_len = len + sizeof(b->vnet_len); + iov->iov_len = len + sizeof(b->vnet_len); if (tcp6_l2_buf_used > ARRAY_SIZE(tcp6_l2_buf) - 1) tcp_l2_data_buf_flush(c); } -- 2.38.1

David Gibson

9:55 a.m.

New subject: [PATCH 05/10] tcp: Improve interface to tcp_l2_buf_flush()

Currently this takes a msghdr, but the only thing we actually care about in there is the io vector. Make it take an io vector directly. We also have a weird side effect of zeroing @buf_used. Just pass this by value and zero it in the caller instead. Signed-off-by: David Gibson --- tcp.c | 59 ++++++++++++++++++++++++----------------------------------- 1 file changed, 24 insertions(+), 35 deletions(-) diff --git a/tcp.c b/tcp.c index 6963d1c..1d8708e 100644 --- a/tcp.c +++ b/tcp.c @@ -1415,22 +1415,27 @@ static int tcp_l2_buf_write_one(struct ctx *c, const struct iovec *iov) /** * tcp_l2_buf_flush_passt() - Send a message on the passt tap interface * @c: Execution context - * @mh: Message header that was partially sent by sendmsg() + * @iov: Pointer to array of buffers, one per frame + * @n: Number of buffers/frames to flush */ -static void tcp_l2_buf_flush_passt(const struct ctx *c, const struct msghdr *mh) +static void tcp_l2_buf_flush_passt(const struct ctx *c, + const struct iovec *iov, size_t n) { + struct msghdr mh = { + .msg_iov = (void *)iov, + .msg_iovlen = n, + }; size_t end = 0, missing; - struct iovec *iov; unsigned int i; ssize_t sent; char *p; - sent = sendmsg(c->fd_tap, mh, MSG_NOSIGNAL | MSG_DONTWAIT); + sent = sendmsg(c->fd_tap, &mh, MSG_NOSIGNAL | MSG_DONTWAIT); if (sent < 0) return; /* Ensure a complete last message on partial sendmsg() */ - for (i = 0, iov = mh->msg_iov; i < mh->msg_iovlen; i++, iov++) { + for (i = 0; i < n; i++, iov++) { end += iov->iov_len; if (end >= (size_t)sent) break; @@ -1448,30 +1453,24 @@ static void tcp_l2_buf_flush_passt(const struct ctx *c, const struct msghdr *mh) /** * tcp_l2_flags_buf_flush() - Send out buffers for segments with or without data * @c: Execution context - * @mh: Message header pointing to buffers, msg_iovlen not set - * @buf_used: Pointer to count of used buffers, set to 0 on return */ -static void tcp_l2_buf_flush(struct ctx *c, struct msghdr *mh, - unsigned int *buf_used) +static void 
tcp_l2_buf_flush(struct ctx *c, const struct iovec *iov, size_t n) { size_t i; - if (!(mh->msg_iovlen = *buf_used)) + if (!n) return; if (c->mode == MODE_PASST) { - tcp_l2_buf_flush_passt(c, mh); + tcp_l2_buf_flush_passt(c, iov, n); } else { - for (i = 0; i < mh->msg_iovlen; i++) { - struct iovec *iov = &mh->msg_iov[i]; - - if (tcp_l2_buf_write_one(c, iov)) + for (i = 0; i < n; i++) { + if (tcp_l2_buf_write_one(c, iov + i)) i--; } } - *buf_used = 0; - pcap_multiple(mh->msg_iov, mh->msg_iovlen, sizeof(uint32_t)); + pcap_multiple(iov, n, sizeof(uint32_t)); } /** @@ -1480,16 +1479,11 @@ static void tcp_l2_buf_flush(struct ctx *c, struct msghdr *mh, */ static void tcp_l2_flags_buf_flush(struct ctx *c) { - struct msghdr mh = { 0 }; - unsigned int *buf_used; + tcp_l2_buf_flush(c, tcp6_l2_flags_iov, tcp6_l2_flags_buf_used); + tcp6_l2_flags_buf_used = 0; - mh.msg_iov = tcp6_l2_flags_iov; - buf_used = &tcp6_l2_flags_buf_used; - tcp_l2_buf_flush(c, &mh, buf_used); - - mh.msg_iov = tcp4_l2_flags_iov; - buf_used = &tcp4_l2_flags_buf_used; - tcp_l2_buf_flush(c, &mh, buf_used); + tcp_l2_buf_flush(c, tcp4_l2_flags_iov, tcp4_l2_flags_buf_used); + tcp4_l2_flags_buf_used = 0; } /** @@ -1498,16 +1492,11 @@ static void tcp_l2_flags_buf_flush(struct ctx *c) */ static void tcp_l2_data_buf_flush(struct ctx *c) { - struct msghdr mh = { 0 }; - unsigned int *buf_used; - - mh.msg_iov = tcp6_l2_iov; - buf_used = &tcp6_l2_buf_used; - tcp_l2_buf_flush(c, &mh, buf_used); + tcp_l2_buf_flush(c, tcp6_l2_iov, tcp6_l2_buf_used); + tcp6_l2_buf_used = 0; - mh.msg_iov = tcp4_l2_iov; - buf_used = &tcp4_l2_buf_used; - tcp_l2_buf_flush(c, &mh, buf_used); + tcp_l2_buf_flush(c, tcp4_l2_iov, tcp4_l2_buf_used); + tcp4_l2_buf_used = 0; } /** -- 2.38.1

David Gibson

9:55 a.m.

New subject: [PATCH 06/10] tcp: Combine two parts of pasta tap send path together

tcp_l2_buf_flush() open codes the loop across each frame in a group, but calls tcp_l2_buf_write_one() to send each frame to the pasta tuntap device. Combine these two pasta-specific operations into tcp_l2_buf_flush_pasta() which is a little cleaner and will enable further cleanups. Signed-off-by: David Gibson --- tcp.c | 40 ++++++++++++++++++---------------------- 1 file changed, 18 insertions(+), 22 deletions(-) diff --git a/tcp.c b/tcp.c index 1d8708e..4cfcb84 100644 --- a/tcp.c +++ b/tcp.c @@ -1393,23 +1393,25 @@ static void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn); } while (0) /** - * tcp_l2_buf_write_one() - Write a single buffer to tap file descriptor + * tcp_l2_buf_flush_pasta() - Send frames on the pasta tap interface * @c: Execution context - * @iov: struct iovec item pointing to buffer - * @ts: Current timestamp - * - * Return: 0 on success, negative error code on failure (tap reset possible) + * @iov: Pointer to array of buffers, one per frame + * @n: Number of buffers/frames to flush */ -static int tcp_l2_buf_write_one(struct ctx *c, const struct iovec *iov) +static void tcp_l2_buf_flush_pasta(struct ctx *c, + const struct iovec *iov, size_t n) { - if (write(c->fd_tap, (char *)iov->iov_base + 4, iov->iov_len - 4) < 0) { - debug("tap write: %s", strerror(errno)); - if (errno != EAGAIN && errno != EWOULDBLOCK) - tap_handler(c, c->fd_tap, EPOLLERR, NULL); - return -errno; - } + size_t i; - return 0; + for (i = 0; i < n; i++) { + if (write(c->fd_tap, (char *)iov->iov_base + 4, + iov->iov_len - 4) < 0) { + debug("tap write: %s", strerror(errno)); + if (errno != EAGAIN && errno != EWOULDBLOCK) + tap_handler(c, c->fd_tap, EPOLLERR, NULL); + i--; + } + } } /** @@ -1456,19 +1458,13 @@ static void tcp_l2_buf_flush_passt(const struct ctx *c, */ static void tcp_l2_buf_flush(struct ctx *c, const struct iovec *iov, size_t n) { - size_t i; - if (!n) return; - if (c->mode == MODE_PASST) { + if (c->mode == MODE_PASST) tcp_l2_buf_flush_passt(c, iov, n); - 
} else { - for (i = 0; i < n; i++) { - if (tcp_l2_buf_write_one(c, iov + i)) - i--; - } - } + else + tcp_l2_buf_flush_pasta(c, iov, n); pcap_multiple(iov, n, sizeof(uint32_t)); } -- 2.38.1

David Gibson

9:55 a.m.

New subject: [PATCH 07/10] tap, tcp: Move tap send path to tap.c

The functions which do the final steps of sending TCP packets on through the tap interface - tcp_l2_buf_flush*() - no longer have anything that's actually specific to TCP in them, other than comments and names. Move them all to tap.c. Signed-off-by: David Gibson --- tap.c | 84 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ tap.h | 1 + tcp.c | 85 +++-------------------------------------------------------- 3 files changed, 89 insertions(+), 81 deletions(-) diff --git a/tap.c b/tap.c index 2e603ed..558a734 100644 --- a/tap.c +++ b/tap.c @@ -53,6 +53,7 @@ #include "netlink.h" #include "pasta.h" #include "packet.h" +#include "tap.h" #include "log.h" /* IPv4 (plus ARP) and IPv6 message batches from tap/guest to IP handlers */ @@ -302,6 +303,89 @@ void tap_icmp6_send(const struct ctx *c, debug("tap: failed to send %lu bytes (IPv6)", len); } +/** + * tap_send_frames_pasta() - Send multiple frames to the pasta tap + * @c: Execution context + * @iov: Array of buffers, each containing one frame + * @n: Number of buffers/frames in @iov + * + * #syscalls:pasta write + */ +static void tap_send_frames_pasta(struct ctx *c, + const struct iovec *iov, size_t n) +{ + size_t i; + + for (i = 0; i < n; i++) { + if (write(c->fd_tap, (char *)iov->iov_base + 4, + iov->iov_len - 4) < 0) { + debug("tap write: %s", strerror(errno)); + if (errno != EAGAIN && errno != EWOULDBLOCK) + tap_handler(c, c->fd_tap, EPOLLERR, NULL); + i--; + } + } +} + +/** + * tap_send_frames_passt() - Send multiple frames to the passt tap + * @c: Execution context + * @iov: Array of buffers, each containing one frame + * @n: Number of buffers/frames in @iov + * + * #syscalls:passt sendmsg send + */ +static void tap_send_frames_passt(const struct ctx *c, + const struct iovec *iov, size_t n) +{ + struct msghdr mh = { + .msg_iov = (void *)iov, + .msg_iovlen = n, + }; + size_t end = 0, missing; + unsigned int i; + ssize_t sent; + char *p; + + sent = sendmsg(c->fd_tap, &mh, MSG_NOSIGNAL | MSG_DONTWAIT); + if 
(sent < 0) + return; + + /* Ensure a complete last message on partial sendmsg() */ + for (i = 0; i < n; i++, iov++) { + end += iov->iov_len; + if (end >= (size_t)sent) + break; + } + + missing = end - sent; + if (!missing) + return; + + p = (char *)iov->iov_base + iov->iov_len - missing; + if (send(c->fd_tap, p, missing, MSG_NOSIGNAL)) + debug("tap: failed to flush %lu missing bytes to tap", missing); +} + +/** + * tap_send_frames() - Send out multiple prepared frames + * @c: Execution context + * @iov: Array of buffers, each containing one frame (with L2 headers) + * @n: Number of buffers/frames in @iov + */ +void tap_send_frames(struct ctx *c, const struct iovec *iov, size_t n) +{ + if (!n) + return; + + if (c->mode == MODE_PASST) + tap_send_frames_passt(c, iov, n); + else + tap_send_frames_pasta(c, iov, n); + + pcap_multiple(iov, n, sizeof(uint32_t)); +} + PACKET_POOL_DECL(pool_l4, UIO_MAXIOV, pkt_buf); /** diff --git a/tap.h b/tap.h index 674ab5c..ceac890 100644 --- a/tap.h +++ b/tap.h @@ -22,6 +22,7 @@ void tap_icmp6_send(const struct ctx *c, const struct in6_addr *src, const struct in6_addr *dst, void *in, size_t len); int tap_send(const struct ctx *c, const void *data, size_t len); +void tap_send_frames(struct ctx *c, const struct iovec *iov, size_t n); void tap_handler(struct ctx *c, int fd, uint32_t events, const struct timespec *now); void tap_sock_init(struct ctx *c); diff --git a/tcp.c b/tcp.c index 4cfcb84..f560b96 100644 --- a/tcp.c +++ b/tcp.c @@ -1392,93 +1392,16 @@ static void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn); tcp_rst_do(c, conn); \ } while (0) -/** - * tcp_l2_buf_flush_pasta() - Send frames on the pasta tap interface - * @c: Execution context - * @iov: Pointer to array of buffers, one per frame - * @n: Number of buffers/frames to flush - */ -static void tcp_l2_buf_flush_pasta(struct ctx *c, - const struct iovec *iov, size_t n) -{ - size_t i; - - for (i = 0; i < n; i++) { - if (write(c->fd_tap, (char *)iov->iov_base + 4, - 
iov->iov_len - 4) < 0) { - debug("tap write: %s", strerror(errno)); - if (errno != EAGAIN && errno != EWOULDBLOCK) - tap_handler(c, c->fd_tap, EPOLLERR, NULL); - i--; - } - } -} - -/** - * tcp_l2_buf_flush_passt() - Send a message on the passt tap interface - * @c: Execution context - * @iov: Pointer to array of buffers, one per frame - * @n: Number of buffers/frames to flush - */ -static void tcp_l2_buf_flush_passt(const struct ctx *c, - const struct iovec *iov, size_t n) -{ - struct msghdr mh = { - .msg_iov = (void *)iov, - .msg_iovlen = n, - }; - size_t end = 0, missing; - unsigned int i; - ssize_t sent; - char *p; - - sent = sendmsg(c->fd_tap, &mh, MSG_NOSIGNAL | MSG_DONTWAIT); - if (sent < 0) - return; - - /* Ensure a complete last message on partial sendmsg() */ - for (i = 0; i < n; i++, iov++) { - end += iov->iov_len; - if (end >= (size_t)sent) - break; - } - - missing = end - sent; - if (!missing) - return; - - p = (char *)iov->iov_base + iov->iov_len - missing; - if (send(c->fd_tap, p, missing, MSG_NOSIGNAL)) - debug("TCP: failed to flush %lu missing bytes to tap", missing); -} - -/** - * tcp_l2_flags_buf_flush() - Send out buffers for segments with or without data - * @c: Execution context - */ -static void tcp_l2_buf_flush(struct ctx *c, const struct iovec *iov, size_t n) -{ - if (!n) - return; - - if (c->mode == MODE_PASST) - tcp_l2_buf_flush_passt(c, iov, n); - else - tcp_l2_buf_flush_pasta(c, iov, n); - - pcap_multiple(iov, n, sizeof(uint32_t)); -} - /** * tcp_l2_flags_buf_flush() - Send out buffers for segments with no data (flags) * @c: Execution context */ static void tcp_l2_flags_buf_flush(struct ctx *c) { - tcp_l2_buf_flush(c, tcp6_l2_flags_iov, tcp6_l2_flags_buf_used); + tap_send_frames(c, tcp6_l2_flags_iov, tcp6_l2_flags_buf_used); tcp6_l2_flags_buf_used = 0; - tcp_l2_buf_flush(c, tcp4_l2_flags_iov, tcp4_l2_flags_buf_used); + tap_send_frames(c, tcp4_l2_flags_iov, tcp4_l2_flags_buf_used); tcp4_l2_flags_buf_used = 0; } @@ -1488,10 +1411,10 @@ 
static void tcp_l2_flags_buf_flush(struct ctx *c) */ static void tcp_l2_data_buf_flush(struct ctx *c) { - tcp_l2_buf_flush(c, tcp6_l2_iov, tcp6_l2_buf_used); + tap_send_frames(c, tcp6_l2_iov, tcp6_l2_buf_used); tcp6_l2_buf_used = 0; - tcp_l2_buf_flush(c, tcp4_l2_iov, tcp4_l2_buf_used); + tap_send_frames(c, tcp4_l2_iov, tcp4_l2_buf_used); tcp4_l2_buf_used = 0; } -- 2.38.1

David Gibson

9:55 a.m.

New subject: [PATCH 08/10] tcp,tap: Use different io vector bases depending on tap type

Currently tap_send_frames() expects the frames it is given to include the vnet_len field, even in pasta mode which doesn't use it (although it need not be initialized in that case). This will inconvenience future changes, so alter it to expect just the frame as appropriate for the tap backend type. We alter the TCP code which uses it to match, setting up the base of iovec to include or exclude the vnet_len as needed. Signed-off-by: David Gibson --- tap.c | 5 ++--- tcp.c | 30 ++++++++++++++++++++++-------- 2 files changed, 24 insertions(+), 11 deletions(-) diff --git a/tap.c b/tap.c index 558a734..2dd14f1 100644 --- a/tap.c +++ b/tap.c @@ -317,8 +317,7 @@ static void tap_send_frames_pasta(struct ctx *c, size_t i; for (i = 0; i < n; i++) { - if (write(c->fd_tap, (char *)iov->iov_base + 4, - iov->iov_len - 4) < 0) { + if (write(c->fd_tap, (char *)iov->iov_base, iov->iov_len) < 0) { debug("tap write: %s", strerror(errno)); if (errno != EAGAIN && errno != EWOULDBLOCK) tap_handler(c, c->fd_tap, EPOLLERR, NULL); @@ -383,7 +382,7 @@ void tap_send_frames(struct ctx *c, const struct iovec *iov, size_t n) else tap_send_frames_pasta(c, iov, n); - pcap_multiple(iov, n, sizeof(uint32_t)); + pcap_multiple(iov, n, c->mode == MODE_PASST ? 
sizeof(uint32_t) : 0); } PACKET_POOL_DECL(pool_l4, UIO_MAXIOV, pkt_buf); diff --git a/tcp.c b/tcp.c index f560b96..e5fa5ae 100644 --- a/tcp.c +++ b/tcp.c @@ -1053,8 +1053,9 @@ void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s, /** * tcp_sock4_iov_init() - Initialise scatter-gather L2 buffers for IPv4 sockets + * @ctx: Execution context */ -static void tcp_sock4_iov_init(void) +static void tcp_sock4_iov_init(const struct ctx *c) { struct iovec *iov; int i; @@ -1076,18 +1077,25 @@ static void tcp_sock4_iov_init(void) } for (i = 0, iov = tcp4_l2_iov; i < TCP_FRAMES_MEM; i++, iov++) { - iov->iov_base = &tcp4_l2_buf[i].vnet_len; + if (c->mode == MODE_PASTA) + iov->iov_base = &tcp4_l2_buf[i].eh; + else + iov->iov_base = &tcp4_l2_buf[i].vnet_len; iov->iov_len = MSS_DEFAULT; } for (i = 0, iov = tcp4_l2_flags_iov; i < TCP_FRAMES_MEM; i++, iov++) - iov->iov_base = &tcp4_l2_flags_buf[i].vnet_len; + if (c->mode == MODE_PASTA) + iov->iov_base = &tcp4_l2_flags_buf[i].eh; + else + iov->iov_base = &tcp4_l2_flags_buf[i].vnet_len; } /** * tcp_sock6_iov_init() - Initialise scatter-gather L2 buffers for IPv6 sockets + * @ctx: Execution context */ -static void tcp_sock6_iov_init(void) +static void tcp_sock6_iov_init(const struct ctx *c) { struct iovec *iov; int i; @@ -1109,12 +1117,18 @@ static void tcp_sock6_iov_init(void) } for (i = 0, iov = tcp6_l2_iov; i < TCP_FRAMES_MEM; i++, iov++) { - iov->iov_base = &tcp6_l2_buf[i].vnet_len; + if (c->mode == MODE_PASTA) + iov->iov_base = &tcp6_l2_buf[i].eh; + else + iov->iov_base = &tcp6_l2_buf[i].vnet_len; iov->iov_len = MSS_DEFAULT; } for (i = 0, iov = tcp6_l2_flags_iov; i < TCP_FRAMES_MEM; i++, iov++) - iov->iov_base = &tcp6_l2_flags_buf[i].vnet_len; + if (c->mode == MODE_PASTA) + iov->iov_base = &tcp6_l2_flags_buf[i].eh; + else + iov->iov_base = &tcp6_l2_flags_buf[i].vnet_len; } /** @@ -3131,10 +3145,10 @@ int tcp_init(struct ctx *c) tcp_l2_mh[i] = (struct mmsghdr) { .msg_hdr.msg_iovlen = 1 }; if (c->ifi4) - 
tcp_sock4_iov_init(); + tcp_sock4_iov_init(c); if (c->ifi6) - tcp_sock6_iov_init(); + tcp_sock6_iov_init(c); memset(init_sock_pool4, 0xff, sizeof(init_sock_pool4)); memset(init_sock_pool6, 0xff, sizeof(init_sock_pool6)); -- 2.38.1

David Gibson

9:55 a.m.

New subject: [PATCH 09/10] udp: Use tap_send_frames()

To send frames on the tap interface, the UDP code uses a fairly complicated two-level batching. First multiple frames are gathered into a single "message" for the qemu stream socket, then multiple messages are sent with sendmmsg(). We now have tap_send_frames() which already deals with sending a number of frames, including batching and handling partial sends. Use that to considerably simplify things. This does make a couple of behavioural changes: * We no longer split messages to keep them under 64kiB, which comments say is necessary. However the TCP code didn't have equivalent code, so either this isn't actually needed, or we should implement it for both (which is now easier since it can be done in one place). * Previously when we got a partial send on UDP, we would resend the remainder of the entire "message", including multiple frames. The common code now only resends the remainder of a single frame, simply dropping any frames which weren't even partially sent. This is what TCP always did and is probably a better idea for UDP too. 
Signed-off-by: David Gibson --- pcap.c | 27 ----------- pcap.h | 1 - udp.c | 145 +++------------------------------------------------------ udp.h | 2 +- 4 files changed, 8 insertions(+), 167 deletions(-) diff --git a/pcap.c b/pcap.c index 8af9021..36f16b5 100644 --- a/pcap.c +++ b/pcap.c @@ -120,33 +120,6 @@ void pcap_multiple(const struct iovec *iov, unsigned int n, size_t offset) iov[i].iov_len - offset, &tv); } -/** - * pcapm() - Capture multiple frames from multiple message headers to pcap file - * @mmh: Pointer to first sendmmsg() header - */ -void pcapmm(const struct mmsghdr *mmh, unsigned int vlen) -{ - struct iovec *iov; - struct timeval tv; - unsigned int i, j; - - if (pcap_fd == -1) - return; - - gettimeofday(&tv, NULL); - - for (i = 0; i < vlen; i++) { - const struct msghdr *mh = &mmh[i].msg_hdr; - - for (j = 0; j < mh->msg_iovlen; j++) { - iov = &mh->msg_iov[j]; - - pcap_frame((char *)iov->iov_base + 4, iov->iov_len - 4, - &tv); - } - } -} - /** * pcap_init() - Initialise pcap file * @c: Execution context diff --git a/pcap.h b/pcap.h index eafc89b..c2af3cf 100644 --- a/pcap.h +++ b/pcap.h @@ -8,7 +8,6 @@ void pcap(const char *pkt, size_t len); void pcap_multiple(const struct iovec *iov, unsigned int n, size_t offset); -void pcapmm(const struct mmsghdr *mmh, unsigned int vlen); void pcap_init(struct ctx *c); #endif /* PCAP_H */ diff --git a/udp.c b/udp.c index f7b9bdc..0c81283 100644 --- a/udp.c +++ b/udp.c @@ -228,9 +228,6 @@ static struct iovec udp6_l2_iov_tap [UDP_MAX_FRAMES]; static struct mmsghdr udp4_l2_mh_sock [UDP_MAX_FRAMES]; static struct mmsghdr udp6_l2_mh_sock [UDP_MAX_FRAMES]; -static struct mmsghdr udp4_l2_mh_tap [UDP_MAX_FRAMES]; -static struct mmsghdr udp6_l2_mh_tap [UDP_MAX_FRAMES]; - /* recvmmsg()/sendmmsg() data for "spliced" connections */ static struct iovec udp4_iov_splice [UDP_MAX_FRAMES]; static struct iovec udp6_iov_splice [UDP_MAX_FRAMES]; @@ -347,16 +344,11 @@ static void udp_sock4_iov_init(const struct ctx *c) mh->msg_iovlen = 
1; } - for (i = 0, h = udp4_l2_mh_tap; i < UDP_MAX_FRAMES; i++, h++) { - struct msghdr *mh = &h->msg_hdr; - + for (i = 0; i < UDP_MAX_FRAMES; i++, h++) { if (c->mode == MODE_PASTA) udp4_l2_iov_tap[i].iov_base = &udp4_l2_buf[i].eh; else udp4_l2_iov_tap[i].iov_base = &udp4_l2_buf[i].vnet_len; - - mh->msg_iov = &udp4_l2_iov_tap[i]; - mh->msg_iovlen = 1; } } @@ -392,16 +384,11 @@ static void udp_sock6_iov_init(const struct ctx *c) mh->msg_iovlen = 1; } - for (i = 0, h = udp6_l2_mh_tap; i < UDP_MAX_FRAMES; i++, h++) { - struct msghdr *mh = &h->msg_hdr; - + for (i = 0; i < UDP_MAX_FRAMES; i++, h++) { if (c->mode == MODE_PASTA) udp6_l2_iov_tap[i].iov_base = &udp6_l2_buf[i].eh; else udp6_l2_iov_tap[i].iov_base = &udp6_l2_buf[i].vnet_len; - - mh->msg_iov = &udp6_l2_iov_tap[i]; - mh->msg_iovlen = 1; } } @@ -741,102 +728,6 @@ static size_t udp_update_hdr6(const struct ctx *c, int n, in_port_t dstport, return buf_len; } -/** - * udp_tap_send_pasta() - Send datagrams to the pasta tap interface - * @c: Execution context - * @mmh: Array of message headers to send - * @n: Number of message headers to send - * - * #syscalls:pasta write - */ -static void udp_tap_send_pasta(const struct ctx *c, struct mmsghdr *mmh, - unsigned int n) -{ - unsigned int i, j; - - for (i = 0; i < n; i++) { - for (j = 0; j < mmh[i].msg_hdr.msg_iovlen; j++) { - struct iovec *iov = &mmh[i].msg_hdr.msg_iov[j]; - - /* We can't use writev() because the tap - * character device relies on the write() - * boundaries to discern frame boundaries - */ - if (write(c->fd_tap, iov->iov_base, iov->iov_len) < 0) - debug("tap write: %s", strerror(errno)); - else - pcap(iov->iov_base, iov->iov_len); - } - } -} - -/** - * udp_tap_send_passt() - Send datagrams to the passt tap interface - * @c: Execution context - * @mmh: Array of message headers to send - * @n: Number of message headers to send - * - * #syscalls:passt sendmmsg sendmsg - */ -static void udp_tap_send_passt(const struct ctx *c, struct mmsghdr *mmh, int n) -{ - 
struct msghdr *last_mh; - ssize_t missing = 0; - size_t msg_len = 0; - unsigned int i; - int ret; - - ret = sendmmsg(c->fd_tap, mmh, n, MSG_NOSIGNAL | MSG_DONTWAIT); - if (ret <= 0) - return; - - /* If we lose some messages to sendmmsg() here, fine, it's UDP. However, - * the last message needs to be delivered completely, otherwise qemu - * will fail to reassemble the next message and close the connection. Go - * through headers from the last sent message, counting bytes, and, if - * and as soon as we see more bytes than sendmmsg() sent, re-send the - * rest with a blocking call. - * - * In pictures, given this example: - * - * iov #0 iov #1 iov #2 iov #3 - * tap_mmh[ret - 1].msg_hdr: .... ...... ..... ...... - * tap_mmh[ret - 1].msg_len: 7 .... ... - * - * when 'msglen' reaches: 10 ^ - * and 'missing' below is: 3 --- - * - * re-send everything from here: ^-- ----- ------ - */ - last_mh = &mmh[ret - 1].msg_hdr; - for (i = 0; i < last_mh->msg_iovlen; i++) { - if (missing <= 0) { - msg_len += last_mh->msg_iov[i].iov_len; - missing = msg_len - mmh[ret - 1].msg_len; - } - - if (missing > 0) { - uint8_t **iov_base; - int first_offset; - - iov_base = (uint8_t **)&last_mh->msg_iov[i].iov_base; - first_offset = last_mh->msg_iov[i].iov_len - missing; - *iov_base += first_offset; - last_mh->msg_iov[i].iov_len = missing; - - last_mh->msg_iov = &last_mh->msg_iov[i]; - - if (sendmsg(c->fd_tap, last_mh, MSG_NOSIGNAL) < 0) - debug("UDP: %li bytes to tap missing", missing); - - *iov_base -= first_offset; - break; - } - } - - pcapmm(mmh, ret); -} - /** * udp_tap_send() - Prepare UDP datagrams and send to tap interface * @c: Execution context @@ -848,25 +739,18 @@ static void udp_tap_send_passt(const struct ctx *c, struct mmsghdr *mmh, int n) * * Return: size of tap frame with headers */ -static void udp_tap_send(const struct ctx *c, +static void udp_tap_send(struct ctx *c, unsigned int start, unsigned int n, in_port_t dstport, bool v6, const struct timespec *now) { - int msg_bufs = 
0, msg_i = 0; - struct mmsghdr *tap_mmh; struct iovec *tap_iov; - ssize_t msg_len = 0; unsigned int i; - if (v6) { - tap_mmh = udp6_l2_mh_tap; + if (v6) tap_iov = udp6_l2_iov_tap; - } else { - tap_mmh = udp4_l2_mh_tap; + else tap_iov = udp4_l2_iov_tap; - } - tap_mmh[0].msg_hdr.msg_iov = &tap_iov[start]; for (i = start; i < start + n; i++) { size_t buf_len; @@ -876,24 +760,9 @@ static void udp_tap_send(const struct ctx *c, buf_len = udp_update_hdr4(c, i, dstport, now); tap_iov[i].iov_len = buf_len; - - /* With bigger messages, qemu closes the connection. */ - if (c->mode == MODE_PASST && msg_bufs && - msg_len + buf_len > SHRT_MAX) { - tap_mmh[msg_i].msg_hdr.msg_iovlen = msg_bufs; - msg_i++; - tap_mmh[msg_i].msg_hdr.msg_iov = &tap_iov[i]; - msg_len = msg_bufs = 0; - } - msg_len += buf_len; - msg_bufs++; } - tap_mmh[msg_i].msg_hdr.msg_iovlen = msg_bufs; - if (c->mode == MODE_PASTA) - udp_tap_send_pasta(c, tap_mmh, msg_i + 1); - else - udp_tap_send_passt(c, tap_mmh, msg_i + 1); + tap_send_frames(c, tap_iov + start, n); } /** @@ -905,7 +774,7 @@ static void udp_tap_send(const struct ctx *c, * * #syscalls recvmmsg */ -void udp_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events, +void udp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events, const struct timespec *now) { in_port_t dstport = ref.r.p.udp.udp.port; diff --git a/udp.h b/udp.h index 2a03335..68082ea 100644 --- a/udp.h +++ b/udp.h @@ -8,7 +8,7 @@ #define UDP_TIMER_INTERVAL 1000 /* ms */ -void udp_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events, +void udp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events, const struct timespec *now); int udp_tap_handler(struct ctx *c, int af, const void *addr, const struct pool *p, const struct timespec *now); -- 2.38.1

David Gibson

9:55 a.m.

New subject: [PATCH 10/10] tap: Improve handling of partial frame sends

In passt mode, when writing frames to the qemu socket, we might get a short send. If we ignored this and carried on, the qemu socket would get out of sync, because the bytes we actually sent wouldn't correspond to the length header we already sent. tap_send_frames_passt() handles that by doing a blocking send to complete the message, but it has a few flaws: * We only attempt to resend once: although it's unlikely in practice, nothing prevents the blocking send() from also being short * We print a debug error if send() returns non-zero, but send() returns the number of bytes sent, so we actually want it to return the length of the remaining data. Correct those flaws and also be a bit more thorough about reporting problems here. Signed-off-by: David Gibson --- tap.c | 51 ++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 38 insertions(+), 13 deletions(-) diff --git a/tap.c b/tap.c index 2dd14f1..3325022 100644 --- a/tap.c +++ b/tap.c @@ -326,13 +326,39 @@ static void tap_send_frames_pasta(struct ctx *c, } } +/** + * tap_send_remainder() - Send remainder of a partiall sent frame + * @c: Execution context + * @iov: Partially sent buffer + * @offset: Number of bytes already sent from @iov + * + * #syscalls:passt send + */ +static void tap_send_remainder(const struct ctx *c, const struct iovec *iov, + size_t offset) +{ + const char *base = (char *)iov->iov_base; + size_t len = iov->iov_len; + + while (offset < len) { + ssize_t sent = send(c->fd_tap, base + offset, len - offset, + MSG_NOSIGNAL); + if (sent < 0) { + err("tap: partial frame send (missing %lu bytes): %s", + len - offset, strerror(errno)); + return; + } + offset += sent; + } +} + /** * tap_send_frames_passt() - Send multiple frames to the passt tap * @c: Execution context * @iov: Array of buffers, each containing one frame * @n: Number of buffers/frames in @iov * - * #syscalls:passt sendmsg send + * #syscalls:passt sendmsg */ static void tap_send_frames_passt(const struct ctx *c, const 
struct iovec *iov, size_t n) @@ -341,29 +367,28 @@ static void tap_send_frames_passt(const struct ctx *c, .msg_iov = (void *)iov, .msg_iovlen = n, }; - size_t end = 0, missing; unsigned int i; ssize_t sent; - char *p; sent = sendmsg(c->fd_tap, &mh, MSG_NOSIGNAL | MSG_DONTWAIT); if (sent < 0) return; - /* Ensure a complete last message on partial sendmsg() */ - for (i = 0; i < n; i++, iov++) { - end += iov->iov_len; - if (end >= (size_t)sent) + /* Check for any partial frames due to short send */ + for (i = 0; i < n; i++) { + if ((size_t)sent < iov[i].iov_len) break; + sent -= iov[i].iov_len; } - missing = end - sent; - if (!missing) - return; + if (i < n && sent) { + /* A partial frame was sent */ + tap_send_remainder(c, &iov[i], sent); + i++; + } - p = (char *)iov->iov_base + iov->iov_len - missing; - if (send(c->fd_tap, p, missing, MSG_NOSIGNAL)) - debug("tap: failed to flush %lu missing bytes to tap", missing); + if (i < n) + debug("tap: dropped %lu frames due to short send", n - i); } /** -- 2.38.1

934

Age (days ago)

934

Last active (days ago)

List overview

Download

10 comments

1 participant

participants (1)

David Gibson

[PATCH 00/10] RFC: Unify and simplify tap send path

tags

participants (1)