Rebased, plus latest changes based on feedback. Jon Maloy (3): tcp: move seq_to_tap update to when frame is queued tcp: leverage support of SO_PEEK_OFF socket option when available tcp: allow retransmit when peer receive window is zero tcp.c | 182 +++++++++++++++++++++++++++++++++++++++-------------- tcp_conn.h | 2 + 2 files changed, 138 insertions(+), 46 deletions(-) -- 2.42.0
commit a469fc393fa1 ("tcp, tap: Don't increase tap-side sequence counter for dropped frames") delayed update of conn->seq_to_tap until the moment the corresponding frame has been successfully pushed out. This has the advantage that we immediately can make a new attempt to transmit a frame after a failed transmit, rather than waiting for the peer to later discover a gap and trigger the fast retransmit mechanism to solve the problem. This approach has turned out to cause a problem with spurious sequence number updates during peer-initiated retransmits, and we have realized it may not be the best way to solve the above issue. We now restore the previous method, by updating the said field at the moment a frame is added to the outqueue. To retain the advantage of having a quick re-attempt based on local failure detection, we now scan through the part of the outqueue that had to be dropped, and restore the sequence counter for each affected connection to the most appropriate value. Signed-off-by: Jon Maloy <jmaloy(a)redhat.com> --- v2: - Re-spun loop in tcp_revert_seq() and some other changes based on feedback from Stefano Brivio. - Added paranoid test to avoid that seq_to_tap becomes lower than seq_ack_from_tap. 
--- tcp.c | 63 ++++++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 45 insertions(+), 18 deletions(-) diff --git a/tcp.c b/tcp.c index 21d0af0..21cbfba 100644 --- a/tcp.c +++ b/tcp.c @@ -411,13 +411,14 @@ static int tcp_sock_ns [NUM_PORTS][IP_VERSIONS]; static union inany_addr low_rtt_dst[LOW_RTT_TABLE_SIZE]; /** - * tcp_buf_seq_update - Sequences to update with length of frames once sent - * @seq: Pointer to sequence number sent to tap-side, to be updated - * @len: TCP payload length + * tcp_frame_ref - References needed by queued frames in case we need + * to revert corresponding connection sequence numbers + * @conn: Pointer to connection for this frame + * @seq: Sequence number of the corresponding frame */ -struct tcp_buf_seq_update { - uint32_t *seq; - uint16_t len; +struct tcp_frame_ref { + struct tcp_tap_conn *conn; + uint32_t seq; }; /* Static buffers */ @@ -461,7 +462,7 @@ static struct tcp_payload_t tcp4_payload[TCP_FRAMES_MEM]; static_assert(MSS4 <= sizeof(tcp4_payload[0].data), "MSS4 is greater than 65516"); -static struct tcp_buf_seq_update tcp4_seq_update[TCP_FRAMES_MEM]; +static struct tcp_frame_ref tcp4_frame_ref[TCP_FRAMES_MEM]; static unsigned int tcp4_payload_used; static struct tap_hdr tcp4_flags_tap_hdr[TCP_FRAMES_MEM]; @@ -483,7 +484,7 @@ static struct tcp_payload_t tcp6_payload[TCP_FRAMES_MEM]; static_assert(MSS6 <= sizeof(tcp6_payload[0].data), "MSS6 is greater than 65516"); -static struct tcp_buf_seq_update tcp6_seq_update[TCP_FRAMES_MEM]; +static struct tcp_frame_ref tcp6_frame_ref[TCP_FRAMES_MEM]; static unsigned int tcp6_payload_used; static struct tap_hdr tcp6_flags_tap_hdr[TCP_FRAMES_MEM]; @@ -1261,25 +1262,50 @@ static void tcp_flags_flush(const struct ctx *c) tcp4_flags_used = 0; } +/** + * tcp_revert_seq() - Revert affected conn->seq_to_tap after failed transmission + * @frames_ref: Array with connection and sequence number data + * @first: Index of entry corresponding to first dropped frame + * @last: Index 
of entry corresponding to last dropped frame + */ +static void tcp_revert_seq(struct tcp_frame_ref *frame_ref, int first, int last) +{ + struct tcp_tap_conn *conn; + int i; + + for (i = first; i <= last; i++) { + conn = frame_ref[i].conn; + + if (SEQ_LE(conn->seq_to_tap, frame_ref[i].seq)) + continue; + + conn->seq_to_tap = frame_ref[i].seq; + + if (SEQ_GE(conn->seq_to_tap, conn->seq_ack_from_tap)) + continue; + + conn->seq_to_tap = conn->seq_ack_from_tap; + } +} + /** * tcp_payload_flush() - Send out buffers for segments with data * @c: Execution context */ static void tcp_payload_flush(const struct ctx *c) { - unsigned i; size_t m; m = tap_send_frames(c, &tcp6_l2_iov[0][0], TCP_NUM_IOVS, tcp6_payload_used); - for (i = 0; i < m; i++) - *tcp6_seq_update[i].seq += tcp6_seq_update[i].len; + if (m != tcp6_payload_used) + tcp_revert_seq(tcp6_frame_ref, m, tcp6_payload_used - 1); tcp6_payload_used = 0; m = tap_send_frames(c, &tcp4_l2_iov[0][0], TCP_NUM_IOVS, tcp4_payload_used); - for (i = 0; i < m; i++) - *tcp4_seq_update[i].seq += tcp4_seq_update[i].len; + if (m != tcp4_payload_used) + tcp_revert_seq(tcp4_frame_ref, m, tcp4_payload_used - 1); tcp4_payload_used = 0; } @@ -2129,10 +2155,11 @@ static int tcp_sock_consume(const struct tcp_tap_conn *conn, uint32_t ack_seq) static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn, ssize_t dlen, int no_csum, uint32_t seq) { - uint32_t *seq_update = &conn->seq_to_tap; struct iovec *iov; size_t l4len; + conn->seq_to_tap = seq + dlen; + if (CONN_V4(conn)) { struct iovec *iov_prev = tcp4_l2_iov[tcp4_payload_used - 1]; const uint16_t *check = NULL; @@ -2142,8 +2169,8 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn, check = &iph->check; } - tcp4_seq_update[tcp4_payload_used].seq = seq_update; - tcp4_seq_update[tcp4_payload_used].len = dlen; + tcp4_frame_ref[tcp4_payload_used].conn = conn; + tcp4_frame_ref[tcp4_payload_used].seq = seq; iov = tcp4_l2_iov[tcp4_payload_used++]; l4len = 
tcp_l2_buf_fill_headers(c, conn, iov, dlen, check, seq); @@ -2151,8 +2178,8 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn, if (tcp4_payload_used > TCP_FRAMES_MEM - 1) tcp_payload_flush(c); } else if (CONN_V6(conn)) { - tcp6_seq_update[tcp6_payload_used].seq = seq_update; - tcp6_seq_update[tcp6_payload_used].len = dlen; + tcp6_frame_ref[tcp6_payload_used].conn = conn; + tcp6_frame_ref[tcp6_payload_used].seq = seq; iov = tcp6_l2_iov[tcp6_payload_used++]; l4len = tcp_l2_buf_fill_headers(c, conn, iov, dlen, NULL, seq); -- 2.42.0
On Sat, May 11, 2024 at 11:20:06AM -0400, Jon Maloy wrote: commit a469fc393fa1 ("tcp, tap: Don't increase tap-side sequence counter for dropped frames") delayed update of conn->seq_to_tap until the moment the corresponding frame has been successfully pushed out. This has the advantage that we immediately can make a new attempt to transmit a frame after a failed transmit, rather than waiting for the peer to later discover a gap and trigger the fast retransmit mechanism to solve the problem. This approach has turned out to cause a problem with spurious sequence number updates during peer-initiated retransmits, and we have realized it may not be the best way to solve the above issue. We now restore the previous method, by updating the said field at the moment a frame is added to the outqueue. To retain the advantage of having a quick re-attempt based on local failure detection, we now scan through the part of the outqueue that had to be dropped, and restore the sequence counter for each affected connection to the most appropriate value. Signed-off-by: Jon Maloy <jmaloy(a)redhat.com> --- v2: - Re-spun loop in tcp_revert_seq() and some other changes based on feedback from Stefano Brivio. - Added paranoid test to avoid that seq_to_tap becomes lower than seq_ack_from_tap. 
--- tcp.c | 63 ++++++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 45 insertions(+), 18 deletions(-) diff --git a/tcp.c b/tcp.c index 21d0af0..21cbfba 100644 --- a/tcp.c +++ b/tcp.c @@ -411,13 +411,14 @@ static int tcp_sock_ns [NUM_PORTS][IP_VERSIONS]; static union inany_addr low_rtt_dst[LOW_RTT_TABLE_SIZE]; /** - * tcp_buf_seq_update - Sequences to update with length of frames once sent - * @seq: Pointer to sequence number sent to tap-side, to be updated - * @len: TCP payload length + * tcp_frame_ref - References needed by queued frames in case we need + * to revert corresponding connection sequence numbers + * @conn: Pointer to connection for this frame + * @seq: Sequence number of the corresponding frame */ -struct tcp_buf_seq_update { - uint32_t *seq; - uint16_t len; +struct tcp_frame_ref { + struct tcp_tap_conn *conn; + uint32_t seq;As noted in another mail, I think we could get the sequence number from the actual frame buffer at revert time. That could be a follow up improvement, though.}; /* Static buffers */ @@ -461,7 +462,7 @@ static struct tcp_payload_t tcp4_payload[TCP_FRAMES_MEM]; static_assert(MSS4 <= sizeof(tcp4_payload[0].data), "MSS4 is greater than 65516"); -static struct tcp_buf_seq_update tcp4_seq_update[TCP_FRAMES_MEM]; +static struct tcp_frame_ref tcp4_frame_ref[TCP_FRAMES_MEM]; static unsigned int tcp4_payload_used; static struct tap_hdr tcp4_flags_tap_hdr[TCP_FRAMES_MEM]; @@ -483,7 +484,7 @@ static struct tcp_payload_t tcp6_payload[TCP_FRAMES_MEM]; static_assert(MSS6 <= sizeof(tcp6_payload[0].data), "MSS6 is greater than 65516"); -static struct tcp_buf_seq_update tcp6_seq_update[TCP_FRAMES_MEM]; +static struct tcp_frame_ref tcp6_frame_ref[TCP_FRAMES_MEM]; static unsigned int tcp6_payload_used; static struct tap_hdr tcp6_flags_tap_hdr[TCP_FRAMES_MEM]; @@ -1261,25 +1262,50 @@ static void tcp_flags_flush(const struct ctx *c) tcp4_flags_used = 0; } +/** + * tcp_revert_seq() - Revert affected conn->seq_to_tap after failed 
transmission + * @frames_ref: Array with connection and sequence number data + * @first: Index of entry corresponding to first dropped frame + * @last: Index of entry corresponding to last dropped frame + */ +static void tcp_revert_seq(struct tcp_frame_ref *frame_ref, int first, int last) +{ + struct tcp_tap_conn *conn; + int i; + + for (i = first; i <= last; i++) { + conn = frame_ref[i].conn;'conn' could be local to the loop body, and I think one of our static checkers is going to complain that it's not.+ + if (SEQ_LE(conn->seq_to_tap, frame_ref[i].seq)) + continue; + + conn->seq_to_tap = frame_ref[i].seq; + + if (SEQ_GE(conn->seq_to_tap, conn->seq_ack_from_tap)) + continue; + + conn->seq_to_tap = conn->seq_ack_from_tap; + } +} + /** * tcp_payload_flush() - Send out buffers for segments with data * @c: Execution context */ static void tcp_payload_flush(const struct ctx *c) { - unsigned i; size_t m; m = tap_send_frames(c, &tcp6_l2_iov[0][0], TCP_NUM_IOVS, tcp6_payload_used); - for (i = 0; i < m; i++) - *tcp6_seq_update[i].seq += tcp6_seq_update[i].len; + if (m != tcp6_payload_used) + tcp_revert_seq(tcp6_frame_ref, m, tcp6_payload_used - 1); tcp6_payload_used = 0; m = tap_send_frames(c, &tcp4_l2_iov[0][0], TCP_NUM_IOVS, tcp4_payload_used); - for (i = 0; i < m; i++) - *tcp4_seq_update[i].seq += tcp4_seq_update[i].len; + if (m != tcp4_payload_used) + tcp_revert_seq(tcp4_frame_ref, m, tcp4_payload_used - 1); tcp4_payload_used = 0; } @@ -2129,10 +2155,11 @@ static int tcp_sock_consume(const struct tcp_tap_conn *conn, uint32_t ack_seq) static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn, ssize_t dlen, int no_csum, uint32_t seq) { - uint32_t *seq_update = &conn->seq_to_tap; struct iovec *iov; size_t l4len; + conn->seq_to_tap = seq + dlen; + if (CONN_V4(conn)) { struct iovec *iov_prev = tcp4_l2_iov[tcp4_payload_used - 1]; const uint16_t *check = NULL; @@ -2142,8 +2169,8 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn, 
check = &iph->check; } - tcp4_seq_update[tcp4_payload_used].seq = seq_update; - tcp4_seq_update[tcp4_payload_used].len = dlen; + tcp4_frame_ref[tcp4_payload_used].conn = conn; + tcp4_frame_ref[tcp4_payload_used].seq = seq; iov = tcp4_l2_iov[tcp4_payload_used++]; l4len = tcp_l2_buf_fill_headers(c, conn, iov, dlen, check, seq); @@ -2151,8 +2178,8 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn, if (tcp4_payload_used > TCP_FRAMES_MEM - 1) tcp_payload_flush(c); } else if (CONN_V6(conn)) { - tcp6_seq_update[tcp6_payload_used].seq = seq_update; - tcp6_seq_update[tcp6_payload_used].len = dlen; + tcp6_frame_ref[tcp6_payload_used].conn = conn; + tcp6_frame_ref[tcp6_payload_used].seq = seq; iov = tcp6_l2_iov[tcp6_payload_used++]; l4len = tcp_l2_buf_fill_headers(c, conn, iov, dlen, NULL, seq);-- David Gibson | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/~dgibson
On Sat, 11 May 2024 11:20:06 -0400 Jon Maloy <jmaloy(a)redhat.com> wrote:commit a469fc393fa1 ("tcp, tap: Don't increase tap-side sequence counter for dropped frames") delayed update of conn->seq_to_tap until the moment the corresponding frame has been successfully pushed out. This has the advantage that we immediately can make a new attempt to transmit a frame after a failed trasnmit, rather than waiting for the peer to later discover a gap and trigger the fast retransmit mechanism to solve the problem. This approach has turned out to cause a problem with spurious sequence number updates during peer-initiated retransmits, and we have realized it may not be the best way to solve the above issue. We now restore the previous method, by updating the said field at the moment a frame is added to the outqueue. To retain the advantage of having a quick re-attempt based on local failure detection, we now scan through the part of the outqueue that had do be dropped, and restore the sequence counter for each affected connection to the most appropriate value. Signed-off-by: Jon Maloy <jmaloy(a)redhat.com> --- v2: - Re-spun loop in tcp_revert_seq() and some other changes based on feedback from Stefano Brivio. - Added paranoid test to avoid that seq_to_tap becomes lower than seq_ack_from_tap.Should we really fix it up there? More below.--- tcp.c | 63 ++++++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 45 insertions(+), 18 deletions(-) diff --git a/tcp.c b/tcp.c index 21d0af0..21cbfba 100644 --- a/tcp.c +++ b/tcp.c @@ -411,13 +411,14 @@ static int tcp_sock_ns [NUM_PORTS][IP_VERSIONS]; static union inany_addr low_rtt_dst[LOW_RTT_TABLE_SIZE]; /** - * tcp_buf_seq_update - Sequences to update with length of frames once sent - * @seq: Pointer to sequence number sent to tap-side, to be updated - * @len: TCP payload length + * tcp_frame_ref - References needed by queued frames in case we needI think the name isn't really indicative. 
If you don't like the tcp_conn_old_seq name I proposed, maybe something that refers to sequence numbers being reverted anyway? tcp_conn_revert_seq?+ * to revert corresponding connection sequence numbers + * @conn: Pointer to connection for this frame + * @seq: Sequence number of the corresponding frame */ -struct tcp_buf_seq_update { - uint32_t *seq; - uint16_t len; +struct tcp_frame_ref { + struct tcp_tap_conn *conn; + uint32_t seq; }; /* Static buffers */ @@ -461,7 +462,7 @@ static struct tcp_payload_t tcp4_payload[TCP_FRAMES_MEM]; static_assert(MSS4 <= sizeof(tcp4_payload[0].data), "MSS4 is greater than 65516"); -static struct tcp_buf_seq_update tcp4_seq_update[TCP_FRAMES_MEM]; +static struct tcp_frame_ref tcp4_frame_ref[TCP_FRAMES_MEM]; static unsigned int tcp4_payload_used; static struct tap_hdr tcp4_flags_tap_hdr[TCP_FRAMES_MEM]; @@ -483,7 +484,7 @@ static struct tcp_payload_t tcp6_payload[TCP_FRAMES_MEM]; static_assert(MSS6 <= sizeof(tcp6_payload[0].data), "MSS6 is greater than 65516"); -static struct tcp_buf_seq_update tcp6_seq_update[TCP_FRAMES_MEM]; +static struct tcp_frame_ref tcp6_frame_ref[TCP_FRAMES_MEM]; static unsigned int tcp6_payload_used; static struct tap_hdr tcp6_flags_tap_hdr[TCP_FRAMES_MEM]; @@ -1261,25 +1262,50 @@ static void tcp_flags_flush(const struct ctx *c) tcp4_flags_used = 0; } +/** + * tcp_revert_seq() - Revert affected conn->seq_to_tap after failed transmission + * @frames_ref: Array with connection and sequence number dataNit: it's frame_ref now.+ * @first: Index of entry corresponding to first dropped frame + * @last: Index of entry corresponding to last dropped frame + */ +static void tcp_revert_seq(struct tcp_frame_ref *frame_ref, int first, int last) +{ + struct tcp_tap_conn *conn; + int i; + + for (i = first; i <= last; i++) { + conn = frame_ref[i].conn; + + if (SEQ_LE(conn->seq_to_tap, frame_ref[i].seq)) + continue; + + conn->seq_to_tap = frame_ref[i].seq;So far, it all makes sense to me. 
Now, to the "paranoid" check you added here:+ if (SEQ_GE(conn->seq_to_tap, conn->seq_ack_from_tap)) + continue;let's say this is false. How did it happen? Did you actually see that happening? And in that case,+ conn->seq_to_tap = conn->seq_ack_from_tap;should we really fix it up here? If yes, I would add a debug() message and also a comment indicating that this isn't expected.+ } +} + /** * tcp_payload_flush() - Send out buffers for segments with data * @c: Execution context */ static void tcp_payload_flush(const struct ctx *c) { - unsigned i; size_t m; m = tap_send_frames(c, &tcp6_l2_iov[0][0], TCP_NUM_IOVS, tcp6_payload_used); - for (i = 0; i < m; i++) - *tcp6_seq_update[i].seq += tcp6_seq_update[i].len; + if (m != tcp6_payload_used) + tcp_revert_seq(tcp6_frame_ref, m, tcp6_payload_used - 1); tcp6_payload_used = 0; m = tap_send_frames(c, &tcp4_l2_iov[0][0], TCP_NUM_IOVS, tcp4_payload_used); - for (i = 0; i < m; i++) - *tcp4_seq_update[i].seq += tcp4_seq_update[i].len; + if (m != tcp4_payload_used) + tcp_revert_seq(tcp4_frame_ref, m, tcp4_payload_used - 1); tcp4_payload_used = 0; } @@ -2129,10 +2155,11 @@ static int tcp_sock_consume(const struct tcp_tap_conn *conn, uint32_t ack_seq) static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn, ssize_t dlen, int no_csum, uint32_t seq) { - uint32_t *seq_update = &conn->seq_to_tap; struct iovec *iov; size_t l4len; + conn->seq_to_tap = seq + dlen; + if (CONN_V4(conn)) { struct iovec *iov_prev = tcp4_l2_iov[tcp4_payload_used - 1]; const uint16_t *check = NULL; @@ -2142,8 +2169,8 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn, check = &iph->check; } - tcp4_seq_update[tcp4_payload_used].seq = seq_update; - tcp4_seq_update[tcp4_payload_used].len = dlen; + tcp4_frame_ref[tcp4_payload_used].conn = conn; + tcp4_frame_ref[tcp4_payload_used].seq = seq; iov = tcp4_l2_iov[tcp4_payload_used++]; l4len = tcp_l2_buf_fill_headers(c, conn, iov, dlen, check, seq); @@ -2151,8 +2178,8 @@ 
static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn, if (tcp4_payload_used > TCP_FRAMES_MEM - 1) tcp_payload_flush(c); } else if (CONN_V6(conn)) { - tcp6_seq_update[tcp6_payload_used].seq = seq_update; - tcp6_seq_update[tcp6_payload_used].len = dlen; + tcp6_frame_ref[tcp6_payload_used].conn = conn; + tcp6_frame_ref[tcp6_payload_used].seq = seq; iov = tcp6_l2_iov[tcp6_payload_used++]; l4len = tcp_l2_buf_fill_headers(c, conn, iov, dlen, NULL, seq);The rest looks good to me. -- Stefano
From linux-6.9.0 the kernel will contain commit 05ea491641d3 ("tcp: add support for SO_PEEK_OFF socket option"). This new feature makes it possible to call recvmsg(MSG_PEEK) and make it start reading data from a given offset set by the SO_PEEK_OFF socket option. This way, we can avoid repeated reading of already read bytes of a received message, hence saving read cycles when forwarding TCP messages in the host->namespace direction. In this commit, we add functionality to leverage this feature when available, while we fall back to the previous behavior when not. Measurements with iperf3 show that throughput increases by 15-20 percent in the host->namespace direction when this feature is used. Signed-off-by: Jon Maloy <jmaloy(a)redhat.com> --- v2: - Some smaller changes as suggested by David Gibson and Stefano Brivio. - Moved initial set_peek_offset(0) to only the locations where the socket is set to ESTABLISHED. - Removed the per-packet synchronization between sk_peek_off and already_sent. Instead only doing it in retransmit situations. - The problem I found when troubleshooting the occasionally occurring out-of-sync values between 'already_sent' and 'sk_peek_offset' may have deeper implications that we may need to investigate. v3: - Rebased to most recent version of tcp.c, plus the previous patch in this series. - Some changes based on feedback from PASST team --- tcp.c | 77 ++++++++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 63 insertions(+), 14 deletions(-) diff --git a/tcp.c b/tcp.c index 21cbfba..8297812 100644 --- a/tcp.c +++ b/tcp.c @@ -520,6 +520,9 @@ static struct iovec tcp6_l2_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS]; static struct iovec tcp4_l2_flags_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS]; static struct iovec tcp6_l2_flags_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS]; +/* Does the kernel support TCP_PEEK_OFF? 
*/ +static bool peek_offset_cap; + /* sendmsg() to socket */ static struct iovec tcp_iov [UIO_MAXIOV]; @@ -535,6 +538,20 @@ static_assert(ARRAY_SIZE(tc_hash) >= FLOW_MAX, int init_sock_pool4 [TCP_SOCK_POOL_SIZE]; int init_sock_pool6 [TCP_SOCK_POOL_SIZE]; +/** + * tcp_set_peek_offset() - Set SO_PEEK_OFF offset on a socket if supported + * @s: Socket to update + * @offset: Offset in bytes + */ +static void tcp_set_peek_offset(int s, int offset) +{ + if (!peek_offset_cap) + return; + + if (setsockopt(s, SOL_SOCKET, SO_PEEK_OFF, &offset, sizeof(offset))) + err("Failed to set SO_PEEK_OFF"); +} + /** * tcp_conn_epoll_events() - epoll events mask for given connection state * @events: Current connection events @@ -1280,11 +1297,14 @@ static void tcp_revert_seq(struct tcp_frame_ref *frame_ref, int first, int last) continue; conn->seq_to_tap = frame_ref[i].seq; + tcp_set_peek_offset(conn->sock, + conn->seq_to_tap - conn->seq_ack_from_tap); if (SEQ_GE(conn->seq_to_tap, conn->seq_ack_from_tap)) continue; conn->seq_to_tap = conn->seq_ack_from_tap; + tcp_set_peek_offset(conn->sock, 0); } } @@ -2203,42 +2223,52 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) uint32_t wnd_scaled = conn->wnd_from_tap << conn->ws_from_tap; int fill_bufs, send_bufs = 0, last_len, iov_rem = 0; int sendlen, len, dlen, v4 = CONN_V4(conn); + uint32_t max_send, seq, already_sent; int s = conn->sock, i, ret = 0; struct msghdr mh_sock = { 0 }; uint16_t mss = MSS_GET(conn); - uint32_t already_sent, seq; struct iovec *iov; + /* How much have we read/sent since last received ack ? */ already_sent = conn->seq_to_tap - conn->seq_ack_from_tap; - if (SEQ_LT(already_sent, 0)) { /* RFC 761, section 2.1. 
*/ flow_trace(conn, "ACK sequence gap: ACK for %u, sent: %u", conn->seq_ack_from_tap, conn->seq_to_tap); conn->seq_to_tap = conn->seq_ack_from_tap; already_sent = 0; + tcp_set_peek_offset(s, 0); } - if (!wnd_scaled || already_sent >= wnd_scaled) { + /* How much are we still allowed to send within current window ? */ + max_send = conn->seq_ack_from_tap + wnd_scaled - conn->seq_to_tap; + if (SEQ_LE(max_send, 0)) { + flow_trace(conn, "Empty window: win: %u, sent: %u", + wnd_scaled, conn->seq_to_tap); conn_flag(c, conn, STALLED); conn_flag(c, conn, ACK_FROM_TAP_DUE); return 0; } - /* Set up buffer descriptors we'll fill completely and partially. */ - fill_bufs = DIV_ROUND_UP(wnd_scaled - already_sent, mss); + /* Set up buffer descriptors to fill completely or partially. */ + fill_bufs = DIV_ROUND_UP(max_send, mss); if (fill_bufs > TCP_FRAMES) { fill_bufs = TCP_FRAMES; iov_rem = 0; } else { - iov_rem = (wnd_scaled - already_sent) % mss; + iov_rem = max_send % mss; } - mh_sock.msg_iov = iov_sock; - mh_sock.msg_iovlen = fill_bufs + 1; - - iov_sock[0].iov_base = tcp_buf_discard; - iov_sock[0].iov_len = already_sent; + /* Prepare iov according to kernel capability */ + if (!peek_offset_cap) { + mh_sock.msg_iov = iov_sock; + iov_sock[0].iov_base = tcp_buf_discard; + iov_sock[0].iov_len = already_sent; + mh_sock.msg_iovlen = fill_bufs + 1; + } else { + mh_sock.msg_iov = &iov_sock[1]; + mh_sock.msg_iovlen = fill_bufs; + } if (( v4 && tcp4_payload_used + fill_bufs > TCP_FRAMES_MEM) || (!v4 && tcp6_payload_used + fill_bufs > TCP_FRAMES_MEM)) { @@ -2279,7 +2309,10 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) return 0; } - sendlen = len - already_sent; + sendlen = len; + if (!peek_offset_cap) + sendlen -= already_sent; + if (sendlen <= 0) { conn_flag(c, conn, STALLED); return 0; @@ -2449,7 +2482,9 @@ static int tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn, flow_trace(conn, "fast re-transmit, ACK: %u, previous sequence: %u", max_ack_seq, 
conn->seq_to_tap); + conn->seq_to_tap = max_ack_seq; + tcp_set_peek_offset(conn->sock, 0); tcp_data_from_sock(c, conn); } @@ -2542,6 +2577,7 @@ static void tcp_conn_from_sock_finish(struct ctx *c, struct tcp_tap_conn *conn, conn->seq_ack_to_tap = conn->seq_from_tap; conn_event(c, conn, ESTABLISHED); + tcp_set_peek_offset(conn->sock, 0); /* The client might have sent data already, which we didn't * dequeue waiting for SYN,ACK from tap -- check now. @@ -2622,6 +2658,7 @@ int tcp_tap_handler(struct ctx *c, uint8_t pif, sa_family_t af, goto reset; conn_event(c, conn, ESTABLISHED); + tcp_set_peek_offset(conn->sock, 0); if (th->fin) { conn->seq_from_tap++; @@ -2788,7 +2825,7 @@ void tcp_listen_handler(struct ctx *c, union epoll_ref ref, union sockaddr_inany sa; socklen_t sl = sizeof(sa); union flow *flow; - int s; + int s = 0; if (c->no_tcp || !(flow = flow_alloc())) return; @@ -2875,6 +2912,7 @@ void tcp_timer_handler(struct ctx *c, union epoll_ref ref) flow_dbg(conn, "ACK timeout, retry"); conn->retrans++; conn->seq_to_tap = conn->seq_ack_from_tap; + tcp_set_peek_offset(conn->sock, 0); tcp_data_from_sock(c, conn); tcp_timer_ctl(c, conn); } @@ -3166,7 +3204,8 @@ static void tcp_sock_refill_init(const struct ctx *c) */ int tcp_init(struct ctx *c) { - unsigned b; + unsigned int b, optv = 0; + int s; for (b = 0; b < TCP_HASH_TABLE_SIZE; b++) tc_hash[b] = FLOW_SIDX_NONE; @@ -3190,6 +3229,16 @@ int tcp_init(struct ctx *c) NS_CALL(tcp_ns_socks_init, c); } + /* Probe for SO_PEEK_OFF support */ + s = socket(AF_INET, SOCK_STREAM | SOCK_CLOEXEC, IPPROTO_TCP); + if (s < 0) { + warn("Temporary TCP socket creation failed"); + } else { + if (!setsockopt(s, SOL_SOCKET, SO_PEEK_OFF, &optv, sizeof(int))) + peek_offset_cap = true; + close(s); + } + info("SO_PEEK_OFF%ssupported", peek_offset_cap ? " " : " not "); return 0; } -- 2.42.0
On Sat, May 11, 2024 at 11:20:07AM -0400, Jon Maloy wrote:Does this hunk actually related to the SO_PEEK_OFF change, or does it belong with the zero window fix in the next patch?From linux-6.9.0 the kernel will containcommit 05ea491641d3 ("tcp: add support for SO_PEEK_OFF socket option"). This new feature makes is possible to call recv_msg(MSG_PEEK) and make it start reading data from a given offset set by the SO_PEEK_OFF socket option. This way, we can avoid repeated reading of already read bytes of a received message, hence saving read cycles when forwarding TCP messages in the host->name space direction. In this commit, we add functionality to leverage this feature when available, while we fall back to the previous behavior when not. Measurements with iperf3 shows that throughput increases with 15-20 percent in the host->namespace direction when this feature is used. Signed-off-by: Jon Maloy <jmaloy(a)redhat.com> --- v2: - Some smaller changes as suggested by David Gibson and Stefano Brivio. - Moved initial set_peek_offset(0) to only the locations where the socket is set to ESTABLISHED. - Removed the per-packet synchronization between sk_peek_off and already_sent. Instead only doing it in retransmit situations. - The problem I found when trouble shooting the occasionally occurring out of synch values between 'already_sent' and 'sk_peek_offset' may have deeper implications that we may need to be investigate. v3: - Rebased to most recent version of tcp.c, plus the previous patch in this series. 
- Some changes based on feedback from PASST team --- tcp.c | 77 ++++++++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 63 insertions(+), 14 deletions(-) diff --git a/tcp.c b/tcp.c index 21cbfba..8297812 100644 --- a/tcp.c +++ b/tcp.c @@ -520,6 +520,9 @@ static struct iovec tcp6_l2_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS]; static struct iovec tcp4_l2_flags_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS]; static struct iovec tcp6_l2_flags_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS]; +/* Does the kernel support TCP_PEEK_OFF? */ +static bool peek_offset_cap; + /* sendmsg() to socket */ static struct iovec tcp_iov [UIO_MAXIOV]; @@ -535,6 +538,20 @@ static_assert(ARRAY_SIZE(tc_hash) >= FLOW_MAX, int init_sock_pool4 [TCP_SOCK_POOL_SIZE]; int init_sock_pool6 [TCP_SOCK_POOL_SIZE]; +/** + * tcp_set_peek_offset() - Set SO_PEEK_OFF offset on a socket if supported + * @s: Socket to update + * @offset: Offset in bytes + */ +static void tcp_set_peek_offset(int s, int offset) +{ + if (!peek_offset_cap) + return; + + if (setsockopt(s, SOL_SOCKET, SO_PEEK_OFF, &offset, sizeof(offset))) + err("Failed to set SO_PEEK_OFF"); +} + /** * tcp_conn_epoll_events() - epoll events mask for given connection state * @events: Current connection events @@ -1280,11 +1297,14 @@ static void tcp_revert_seq(struct tcp_frame_ref *frame_ref, int first, int last) continue; conn->seq_to_tap = frame_ref[i].seq; + tcp_set_peek_offset(conn->sock, + conn->seq_to_tap - conn->seq_ack_from_tap); if (SEQ_GE(conn->seq_to_tap, conn->seq_ack_from_tap)) continue; conn->seq_to_tap = conn->seq_ack_from_tap; + tcp_set_peek_offset(conn->sock, 0); } } @@ -2203,42 +2223,52 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) uint32_t wnd_scaled = conn->wnd_from_tap << conn->ws_from_tap; int fill_bufs, send_bufs = 0, last_len, iov_rem = 0; int sendlen, len, dlen, v4 = CONN_V4(conn); + uint32_t max_send, seq, already_sent; int s = conn->sock, i, ret = 0; struct msghdr mh_sock = { 0 }; uint16_t mss = 
MSS_GET(conn); - uint32_t already_sent, seq; struct iovec *iov; + /* How much have we read/sent since last received ack ? */ already_sent = conn->seq_to_tap - conn->seq_ack_from_tap; - if (SEQ_LT(already_sent, 0)) { /* RFC 761, section 2.1. */ flow_trace(conn, "ACK sequence gap: ACK for %u, sent: %u", conn->seq_ack_from_tap, conn->seq_to_tap); conn->seq_to_tap = conn->seq_ack_from_tap; already_sent = 0; + tcp_set_peek_offset(s, 0); } - if (!wnd_scaled || already_sent >= wnd_scaled) { + /* How much are we still allowed to send within current window ? */+ max_send = conn->seq_ack_from_tap + wnd_scaled - conn->seq_to_tap; + if (SEQ_LE(max_send, 0)) {IIUC, 'max_send' is an difference in sequence numbers, not an absolute sequence number, and should therefore be compared with a regular <= rather than SEQ_LE. Although it looks like you'd also need to invert the sign to make that work,+ flow_trace(conn, "Empty window: win: %u, sent: %u", + wnd_scaled, conn->seq_to_tap);conn_flag(c, conn, STALLED); conn_flag(c, conn, ACK_FROM_TAP_DUE); return 0; } - /* Set up buffer descriptors we'll fill completely and partially. */ - fill_bufs = DIV_ROUND_UP(wnd_scaled - already_sent, mss); + /* Set up buffer descriptors to fill completely or partially. 
*/ + fill_bufs = DIV_ROUND_UP(max_send, mss); if (fill_bufs > TCP_FRAMES) { fill_bufs = TCP_FRAMES; iov_rem = 0; } else { - iov_rem = (wnd_scaled - already_sent) % mss; + iov_rem = max_send % mss; } - mh_sock.msg_iov = iov_sock; - mh_sock.msg_iovlen = fill_bufs + 1; - - iov_sock[0].iov_base = tcp_buf_discard; - iov_sock[0].iov_len = already_sent; + /* Prepare iov according to kernel capability */ + if (!peek_offset_cap) { + mh_sock.msg_iov = iov_sock; + iov_sock[0].iov_base = tcp_buf_discard; + iov_sock[0].iov_len = already_sent; + mh_sock.msg_iovlen = fill_bufs + 1; + } else { + mh_sock.msg_iov = &iov_sock[1]; + mh_sock.msg_iovlen = fill_bufs; + } if (( v4 && tcp4_payload_used + fill_bufs > TCP_FRAMES_MEM) || (!v4 && tcp6_payload_used + fill_bufs > TCP_FRAMES_MEM)) { @@ -2279,7 +2309,10 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) return 0; } - sendlen = len - already_sent; + sendlen = len; + if (!peek_offset_cap) + sendlen -= already_sent; + if (sendlen <= 0) { conn_flag(c, conn, STALLED); return 0; @@ -2449,7 +2482,9 @@ static int tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn, flow_trace(conn, "fast re-transmit, ACK: %u, previous sequence: %u", max_ack_seq, conn->seq_to_tap); + conn->seq_to_tap = max_ack_seq; + tcp_set_peek_offset(conn->sock, 0); tcp_data_from_sock(c, conn); } @@ -2542,6 +2577,7 @@ static void tcp_conn_from_sock_finish(struct ctx *c, struct tcp_tap_conn *conn, conn->seq_ack_to_tap = conn->seq_from_tap; conn_event(c, conn, ESTABLISHED); + tcp_set_peek_offset(conn->sock, 0); /* The client might have sent data already, which we didn't * dequeue waiting for SYN,ACK from tap -- check now. 
@@ -2622,6 +2658,7 @@ int tcp_tap_handler(struct ctx *c, uint8_t pif, sa_family_t af, goto reset; conn_event(c, conn, ESTABLISHED); + tcp_set_peek_offset(conn->sock, 0); if (th->fin) { conn->seq_from_tap++; @@ -2788,7 +2825,7 @@ void tcp_listen_handler(struct ctx *c, union epoll_ref ref, union sockaddr_inany sa; socklen_t sl = sizeof(sa); union flow *flow; - int s; + int s = 0;This appears unrelated to the rest. Also wrong, since stdin is not a reasonable initial value for this fd.if (c->no_tcp || !(flow = flow_alloc())) return; @@ -2875,6 +2912,7 @@ void tcp_timer_handler(struct ctx *c, union epoll_ref ref) flow_dbg(conn, "ACK timeout, retry"); conn->retrans++; conn->seq_to_tap = conn->seq_ack_from_tap; + tcp_set_peek_offset(conn->sock, 0); tcp_data_from_sock(c, conn); tcp_timer_ctl(c, conn); } @@ -3166,7 +3204,8 @@ static void tcp_sock_refill_init(const struct ctx *c) */ int tcp_init(struct ctx *c) { - unsigned b; + unsigned int b, optv = 0; + int s; for (b = 0; b < TCP_HASH_TABLE_SIZE; b++) tc_hash[b] = FLOW_SIDX_NONE; @@ -3190,6 +3229,16 @@ int tcp_init(struct ctx *c) NS_CALL(tcp_ns_socks_init, c); } + /* Probe for SO_PEEK_OFF support */ + s = socket(AF_INET, SOCK_STREAM | SOCK_CLOEXEC, IPPROTO_TCP); + if (s < 0) { + warn("Temporary TCP socket creation failed"); + } else { + if (!setsockopt(s, SOL_SOCKET, SO_PEEK_OFF, &optv, sizeof(int))) + peek_offset_cap = true; + close(s); + } + info("SO_PEEK_OFF%ssupported", peek_offset_cap ? " " : " not "); return 0; }-- David Gibson | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/~dgibson
On Sat, 11 May 2024 11:20:07 -0400 Jon Maloy <jmaloy(a)redhat.com> wrote:From linux-6.9.0 the kernel will contain commit 05ea491641d3 ("tcp: add support for SO_PEEK_OFF socket option"). This new feature makes it possible to call recv_msg(MSG_PEEK) and make it start reading data from a given offset set by the SO_PEEK_OFF socket option. This way, we can avoid repeated reading of already read bytes of a received message, hence saving read cycles when forwarding TCP messages in the host->namespace direction. In this commit, we add functionality to leverage this feature when available, while we fall back to the previous behavior when not. Measurements with iperf3 show that throughput increases by 15-20 percent in the host->namespace direction when this feature is used. Signed-off-by: Jon Maloy <jmaloy(a)redhat.com> --- v2: - Some smaller changes as suggested by David Gibson and Stefano Brivio. - Moved initial set_peek_offset(0) to only the locations where the socket is set to ESTABLISHED. - Removed the per-packet synchronization between sk_peek_off and already_sent. Instead only doing it in retransmit situations. - The problem I found when troubleshooting the occasionally occurring out-of-sync values between 'already_sent' and 'sk_peek_offset' may have deeper implications that we may need to investigate. v3: - Rebased to most recent version of tcp.c, plus the previous patch in this series. - Some changes based on feedback from PASST team --- tcp.c | 77 ++++++++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 63 insertions(+), 14 deletions(-) diff --git a/tcp.c b/tcp.c index 21cbfba..8297812 100644 --- a/tcp.c +++ b/tcp.c @@ -520,6 +520,9 @@ static struct iovec tcp6_l2_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS]; static struct iovec tcp4_l2_flags_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS]; static struct iovec tcp6_l2_flags_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS]; +/* Does the kernel support TCP_PEEK_OFF? 
*/ +static bool peek_offset_cap; + /* sendmsg() to socket */ static struct iovec tcp_iov [UIO_MAXIOV]; @@ -535,6 +538,20 @@ static_assert(ARRAY_SIZE(tc_hash) >= FLOW_MAX, int init_sock_pool4 [TCP_SOCK_POOL_SIZE]; int init_sock_pool6 [TCP_SOCK_POOL_SIZE]; +/** + * tcp_set_peek_offset() - Set SO_PEEK_OFF offset on a socket if supported + * @s: Socket to update + * @offset: Offset in bytes + */ +static void tcp_set_peek_offset(int s, int offset) +{ + if (!peek_offset_cap) + return; + + if (setsockopt(s, SOL_SOCKET, SO_PEEK_OFF, &offset, sizeof(offset))) + err("Failed to set SO_PEEK_OFF");It would be nice to say on which socket and at which offset, in case it failed.+} + /** * tcp_conn_epoll_events() - epoll events mask for given connection state * @events: Current connection events @@ -1280,11 +1297,14 @@ static void tcp_revert_seq(struct tcp_frame_ref *frame_ref, int first, int last) continue; conn->seq_to_tap = frame_ref[i].seq; + tcp_set_peek_offset(conn->sock, + conn->seq_to_tap - conn->seq_ack_from_tap); if (SEQ_GE(conn->seq_to_tap, conn->seq_ack_from_tap)) continue; conn->seq_to_tap = conn->seq_ack_from_tap; + tcp_set_peek_offset(conn->sock, 0); } } @@ -2203,42 +2223,52 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) uint32_t wnd_scaled = conn->wnd_from_tap << conn->ws_from_tap; int fill_bufs, send_bufs = 0, last_len, iov_rem = 0; int sendlen, len, dlen, v4 = CONN_V4(conn); + uint32_t max_send, seq, already_sent; int s = conn->sock, i, ret = 0; struct msghdr mh_sock = { 0 }; uint16_t mss = MSS_GET(conn); - uint32_t already_sent, seq; struct iovec *iov; + /* How much have we read/sent since last received ack ? */ already_sent = conn->seq_to_tap - conn->seq_ack_from_tap; -Spurious change.if (SEQ_LT(already_sent, 0)) { /* RFC 761, section 2.1. 
*/ flow_trace(conn, "ACK sequence gap: ACK for %u, sent: %u", conn->seq_ack_from_tap, conn->seq_to_tap); conn->seq_to_tap = conn->seq_ack_from_tap; already_sent = 0; + tcp_set_peek_offset(s, 0); } - if (!wnd_scaled || already_sent >= wnd_scaled) { + /* How much are we still allowed to send within current window ? */ + max_send = conn->seq_ack_from_tap + wnd_scaled - conn->seq_to_tap;I'm not sure about the purpose of this whole part of the patch, even if I try to see it in the context of 3/3.+ if (SEQ_LE(max_send, 0)) { + flow_trace(conn, "Empty window: win: %u, sent: %u", + wnd_scaled, conn->seq_to_tap); conn_flag(c, conn, STALLED); conn_flag(c, conn, ACK_FROM_TAP_DUE); return 0; } - /* Set up buffer descriptors we'll fill completely and partially. */ - fill_bufs = DIV_ROUND_UP(wnd_scaled - already_sent, mss); + /* Set up buffer descriptors to fill completely or partially. */Spurious change... or did you really mean to change this comment?+ fill_bufs = DIV_ROUND_UP(max_send, mss); if (fill_bufs > TCP_FRAMES) { fill_bufs = TCP_FRAMES; iov_rem = 0; } else { - iov_rem = (wnd_scaled - already_sent) % mss; + iov_rem = max_send % mss; } - mh_sock.msg_iov = iov_sock; - mh_sock.msg_iovlen = fill_bufs + 1; - - iov_sock[0].iov_base = tcp_buf_discard; - iov_sock[0].iov_len = already_sent; + /* Prepare iov according to kernel capability */ + if (!peek_offset_cap) { + mh_sock.msg_iov = iov_sock; + iov_sock[0].iov_base = tcp_buf_discard; + iov_sock[0].iov_len = already_sent; + mh_sock.msg_iovlen = fill_bufs + 1; + } else { + mh_sock.msg_iov = &iov_sock[1]; + mh_sock.msg_iovlen = fill_bufs; + } if (( v4 && tcp4_payload_used + fill_bufs > TCP_FRAMES_MEM) || (!v4 && tcp6_payload_used + fill_bufs > TCP_FRAMES_MEM)) { @@ -2279,7 +2309,10 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) return 0; } - sendlen = len - already_sent; + sendlen = len; + if (!peek_offset_cap) + sendlen -= already_sent; + if (sendlen <= 0) { conn_flag(c, conn, STALLED); return 0; @@ 
-2449,7 +2482,9 @@ static int tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn, flow_trace(conn, "fast re-transmit, ACK: %u, previous sequence: %u", max_ack_seq, conn->seq_to_tap); +Spurious change.conn->seq_to_tap = max_ack_seq; + tcp_set_peek_offset(conn->sock, 0); tcp_data_from_sock(c, conn); } @@ -2542,6 +2577,7 @@ static void tcp_conn_from_sock_finish(struct ctx *c, struct tcp_tap_conn *conn, conn->seq_ack_to_tap = conn->seq_from_tap; conn_event(c, conn, ESTABLISHED); + tcp_set_peek_offset(conn->sock, 0); /* The client might have sent data already, which we didn't * dequeue waiting for SYN,ACK from tap -- check now. @@ -2622,6 +2658,7 @@ int tcp_tap_handler(struct ctx *c, uint8_t pif, sa_family_t af, goto reset; conn_event(c, conn, ESTABLISHED); + tcp_set_peek_offset(conn->sock, 0); if (th->fin) { conn->seq_from_tap++; @@ -2788,7 +2825,7 @@ void tcp_listen_handler(struct ctx *c, union epoll_ref ref, union sockaddr_inany sa; socklen_t sl = sizeof(sa); union flow *flow; - int s; + int s = 0; if (c->no_tcp || !(flow = flow_alloc())) return; @@ -2875,6 +2912,7 @@ void tcp_timer_handler(struct ctx *c, union epoll_ref ref) flow_dbg(conn, "ACK timeout, retry"); conn->retrans++; conn->seq_to_tap = conn->seq_ack_from_tap; + tcp_set_peek_offset(conn->sock, 0); tcp_data_from_sock(c, conn); tcp_timer_ctl(c, conn); } @@ -3166,7 +3204,8 @@ static void tcp_sock_refill_init(const struct ctx *c) */ int tcp_init(struct ctx *c) { - unsigned b; + unsigned int b, optv = 0; + int s; for (b = 0; b < TCP_HASH_TABLE_SIZE; b++) tc_hash[b] = FLOW_SIDX_NONE; @@ -3190,6 +3229,16 @@ int tcp_init(struct ctx *c) NS_CALL(tcp_ns_socks_init, c); } + /* Probe for SO_PEEK_OFF support */ + s = socket(AF_INET, SOCK_STREAM | SOCK_CLOEXEC, IPPROTO_TCP); + if (s < 0) { + warn("Temporary TCP socket creation failed"); + } else { + if (!setsockopt(s, SOL_SOCKET, SO_PEEK_OFF, &optv, sizeof(int))) + peek_offset_cap = true; + close(s); + } + info("SO_PEEK_OFF%ssupported", peek_offset_cap ? 
" " : " not ");Maintaining the extra newline before the return would be nice. The changes actually related to SO_PEEK_OFF look good to me otherwise.return 0; }-- Stefano
On 2024-05-14 13:22, Stefano Brivio wrote:On Sat, 11 May 2024 11:20:07 -0400 Jon Maloy <jmaloy(a)redhat.com> wrote:ok.From linux-6.9.0 the kernel will contain commit 05ea491641d3 ("tcp: add support for SO_PEEK_OFF socket option"). This new feature makes is possible to call recv_msg(MSG_PEEK) and make it start reading data from a given offset set by the SO_PEEK_OFF socket option. This way, we can avoid repeated reading of already read bytes of a received message, hence saving read cycles when forwarding TCP messages in the host->name space direction. In this commit, we add functionality to leverage this feature when available, while we fall back to the previous behavior when not. Measurements with iperf3 shows that throughput increases with 15-20 percent in the host->namespace direction when this feature is used. Signed-off-by: Jon Maloy <jmaloy(a)redhat.com> --- v2: - Some smaller changes as suggested by David Gibson and Stefano Brivio. - Moved initial set_peek_offset(0) to only the locations where the socket is set to ESTABLISHED. - Removed the per-packet synchronization between sk_peek_off and already_sent. Instead only doing it in retransmit situations. - The problem I found when trouble shooting the occasionally occurring out of synch values between 'already_sent' and 'sk_peek_offset' may have deeper implications that we may need to be investigate. v3: - Rebased to most recent version of tcp.c, plus the previous patch in this series. - Some changes based on feedback from PASST team --- tcp.c | 77 ++++++++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 63 insertions(+), 14 deletions(-) diff --git a/tcp.c b/tcp.c index 21cbfba..8297812 100644 --- a/tcp.c +++ b/tcp.c @@ -520,6 +520,9 @@ static struct iovec tcp6_l2_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS]; static struct iovec tcp4_l2_flags_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS]; static struct iovec tcp6_l2_flags_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS]; +/* Does the kernel support TCP_PEEK_OFF? 
*/ +static bool peek_offset_cap; + /* sendmsg() to socket */ static struct iovec tcp_iov [UIO_MAXIOV]; @@ -535,6 +538,20 @@ static_assert(ARRAY_SIZE(tc_hash) >= FLOW_MAX, int init_sock_pool4 [TCP_SOCK_POOL_SIZE]; int init_sock_pool6 [TCP_SOCK_POOL_SIZE]; +/** + * tcp_set_peek_offset() - Set SO_PEEK_OFF offset on a socket if supported + * @s: Socket to update + * @offset: Offset in bytes + */ +static void tcp_set_peek_offset(int s, int offset) +{ + if (!peek_offset_cap) + return; + + if (setsockopt(s, SOL_SOCKET, SO_PEEK_OFF, &offset, sizeof(offset))) + err("Failed to set SO_PEEK_OFF");It would be nice to say on which socket and at which offset, in case it failed.No. Intentional.+} + /** * tcp_conn_epoll_events() - epoll events mask for given connection state * @events: Current connection events @@ -1280,11 +1297,14 @@ static void tcp_revert_seq(struct tcp_frame_ref *frame_ref, int first, int last) continue; conn->seq_to_tap = frame_ref[i].seq; + tcp_set_peek_offset(conn->sock, + conn->seq_to_tap - conn->seq_ack_from_tap); if (SEQ_GE(conn->seq_to_tap, conn->seq_ack_from_tap)) continue; conn->seq_to_tap = conn->seq_ack_from_tap; + tcp_set_peek_offset(conn->sock, 0); } } @@ -2203,42 +2223,52 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) uint32_t wnd_scaled = conn->wnd_from_tap << conn->ws_from_tap; int fill_bufs, send_bufs = 0, last_len, iov_rem = 0; int sendlen, len, dlen, v4 = CONN_V4(conn); + uint32_t max_send, seq, already_sent; int s = conn->sock, i, ret = 0; struct msghdr mh_sock = { 0 }; uint16_t mss = MSS_GET(conn); - uint32_t already_sent, seq; struct iovec *iov; + /* How much have we read/sent since last received ack ? */ already_sent = conn->seq_to_tap - conn->seq_ack_from_tap; -Spurious change.To me it simply makes the code more readable, at least after the next patch. Also, we you pointed out that we shouldn't really be dealing with windows in this function, and that will be eliminated altogether in the next patch. 
I can move it back to the next patch, but the code will look the same.if (SEQ_LT(already_sent, 0)) { /* RFC 761, section 2.1. */ flow_trace(conn, "ACK sequence gap: ACK for %u, sent: %u", conn->seq_ack_from_tap, conn->seq_to_tap); conn->seq_to_tap = conn->seq_ack_from_tap; already_sent = 0; + tcp_set_peek_offset(s, 0); } - if (!wnd_scaled || already_sent >= wnd_scaled) { + /* How much are we still allowed to send within current window ? */ + max_send = conn->seq_ack_from_tap + wnd_scaled - conn->seq_to_tap;I'm not sure about the purpose of this whole part of the patch, even if I try to see it in the context of 3/3.Yes.+ if (SEQ_LE(max_send, 0)) { + flow_trace(conn, "Empty window: win: %u, sent: %u", + wnd_scaled, conn->seq_to_tap); conn_flag(c, conn, STALLED); conn_flag(c, conn, ACK_FROM_TAP_DUE); return 0; } - /* Set up buffer descriptors we'll fill completely and partially. */ - fill_bufs = DIV_ROUND_UP(wnd_scaled - already_sent, mss); + /* Set up buffer descriptors to fill completely or partially. */Spurious change... or did you really mean to change this comment?There is a reason I prefer printf() when debugging ;-) I will add it and post a new version shortly. 
///jon+ fill_bufs = DIV_ROUND_UP(max_send, mss); if (fill_bufs > TCP_FRAMES) { fill_bufs = TCP_FRAMES; iov_rem = 0; } else { - iov_rem = (wnd_scaled - already_sent) % mss; + iov_rem = max_send % mss; } - mh_sock.msg_iov = iov_sock; - mh_sock.msg_iovlen = fill_bufs + 1; - - iov_sock[0].iov_base = tcp_buf_discard; - iov_sock[0].iov_len = already_sent; + /* Prepare iov according to kernel capability */ + if (!peek_offset_cap) { + mh_sock.msg_iov = iov_sock; + iov_sock[0].iov_base = tcp_buf_discard; + iov_sock[0].iov_len = already_sent; + mh_sock.msg_iovlen = fill_bufs + 1; + } else { + mh_sock.msg_iov = &iov_sock[1]; + mh_sock.msg_iovlen = fill_bufs; + } if (( v4 && tcp4_payload_used + fill_bufs > TCP_FRAMES_MEM) || (!v4 && tcp6_payload_used + fill_bufs > TCP_FRAMES_MEM)) { @@ -2279,7 +2309,10 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) return 0; } - sendlen = len - already_sent; + sendlen = len; + if (!peek_offset_cap) + sendlen -= already_sent; + if (sendlen <= 0) { conn_flag(c, conn, STALLED); return 0; @@ -2449,7 +2482,9 @@ static int tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn, flow_trace(conn, "fast re-transmit, ACK: %u, previous sequence: %u", max_ack_seq, conn->seq_to_tap); +Spurious change.conn->seq_to_tap = max_ack_seq; + tcp_set_peek_offset(conn->sock, 0); tcp_data_from_sock(c, conn); } @@ -2542,6 +2577,7 @@ static void tcp_conn_from_sock_finish(struct ctx *c, struct tcp_tap_conn *conn, conn->seq_ack_to_tap = conn->seq_from_tap; conn_event(c, conn, ESTABLISHED); + tcp_set_peek_offset(conn->sock, 0); /* The client might have sent data already, which we didn't * dequeue waiting for SYN,ACK from tap -- check now. 
@@ -2622,6 +2658,7 @@ int tcp_tap_handler(struct ctx *c, uint8_t pif, sa_family_t af, goto reset; conn_event(c, conn, ESTABLISHED); + tcp_set_peek_offset(conn->sock, 0); if (th->fin) { conn->seq_from_tap++; @@ -2788,7 +2825,7 @@ void tcp_listen_handler(struct ctx *c, union epoll_ref ref, union sockaddr_inany sa; socklen_t sl = sizeof(sa); union flow *flow; - int s; + int s = 0; if (c->no_tcp || !(flow = flow_alloc())) return; @@ -2875,6 +2912,7 @@ void tcp_timer_handler(struct ctx *c, union epoll_ref ref) flow_dbg(conn, "ACK timeout, retry"); conn->retrans++; conn->seq_to_tap = conn->seq_ack_from_tap; + tcp_set_peek_offset(conn->sock, 0); tcp_data_from_sock(c, conn); tcp_timer_ctl(c, conn); } @@ -3166,7 +3204,8 @@ static void tcp_sock_refill_init(const struct ctx *c) */ int tcp_init(struct ctx *c) { - unsigned b; + unsigned int b, optv = 0; + int s; for (b = 0; b < TCP_HASH_TABLE_SIZE; b++) tc_hash[b] = FLOW_SIDX_NONE; @@ -3190,6 +3229,16 @@ int tcp_init(struct ctx *c) NS_CALL(tcp_ns_socks_init, c); } + /* Probe for SO_PEEK_OFF support */ + s = socket(AF_INET, SOCK_STREAM | SOCK_CLOEXEC, IPPROTO_TCP); + if (s < 0) { + warn("Temporary TCP socket creation failed"); + } else { + if (!setsockopt(s, SOL_SOCKET, SO_PEEK_OFF, &optv, sizeof(int))) + peek_offset_cap = true; + close(s); + } + info("SO_PEEK_OFF%ssupported", peek_offset_cap ? " " : " not ");Maintaining the extra newline before the return would be nice.The changes actually related to SO_PEEK_OFF look good to me otherwise. > return 0; > } >
On Tue, 14 May 2024 16:06:22 -0400 Jon Maloy <jmaloy(a)redhat.com> wrote:[...]No no, I really meant having one newline between info() and the return 0 below, to keep the separation we had before. I wasn't commenting about the info() itself or about the format.There is a reason I prefer printf() when debugging ;-)+ } + info("SO_PEEK_OFF%ssupported", peek_offset_cap ? " " : " not ");Maintaining the extra newline before the return would be nice.I will add it and post a new version shortly. ///jon-- StefanoThe changes actually related to SO_PEEK_OFF look good to me otherwise. > return 0; > } >
A bug in kernel TCP may lead to a deadlock where a zero window is sent from the peer, while it is unable to send out window updates even after reads have freed up enough buffer space to permit a larger window. In this situation, new window advertisements from the peer can only be triggered by packets arriving from this side. However, such packets are never sent, because the zero-window condition currently prevents this side from sending out any packets whatsoever to the peer. We notice that the above bug is triggered *only* after the peer has dropped an arriving packet because of severe memory squeeze, and that we hence always enter a retransmission situation when this occurs. This also means that it goes against the RFC 9293 recommendation that a previously advertised window never should shrink. RFC 9293 gives the solution to this situation. In chapter 3.6.1 we find the following statement: "A TCP receiver SHOULD NOT shrink the window, i.e., move the right window edge to the left (SHLD-14). However, a sending TCP peer MUST be robust against window shrinking, which may cause the "usable window" (see Section 3.8.6.2.1) to become negative (MUST-34). If this happens, the sender SHOULD NOT send new data (SHLD-15), but SHOULD retransmit normally the old unacknowledged data between SND.UNA and SND.UNA+SND.WND (SHLD-16). The sender MAY also retransmit old data beyond SND.UNA+SND.WND (MAY-7)" We never see the window become negative, but we interpret this as a recommendation to use the previously available window during retransmission even when the currently advertised window is zero. In case of a zero-window non-retransmission situation where there is no new data to be sent, we also add a simple zero-window probing feature. By sending an empty packet at regular timeout events we resolve the situation described above, since the peer receives the necessary trigger to advertise its window once it becomes non-zero again. 
It should be noted that although this solves the problem we have at hand, it is not a genuine solution to the kernel bug. There may well be TCP stacks around in other OS-es which don't do this, nor have keep-alive probing as an alternative way to solve the situation. Signed-off-by: Jon Maloy <jmaloy(a)redhat.com> --- v2: - Using previously advertised window during retransmission, instead of highest send sequence number in the cycle. v3: - Rebased to newest code - Changes based on feedback from PASST team - Sending out empty probe message at timer expiration when we are not in retransmit situation. --- tcp.c | 30 +++++++++++++++++++++--------- tcp_conn.h | 2 ++ 2 files changed, 23 insertions(+), 9 deletions(-) diff --git a/tcp.c b/tcp.c index 8297812..bd6bf35 100644 --- a/tcp.c +++ b/tcp.c @@ -1774,9 +1774,15 @@ static void tcp_get_tap_ws(struct tcp_tap_conn *conn, */ static void tcp_tap_window_update(struct tcp_tap_conn *conn, unsigned wnd) { + uint32_t wnd_upper; + wnd = MIN(MAX_WINDOW, wnd << conn->ws_from_tap); conn->wnd_from_tap = MIN(wnd >> conn->ws_from_tap, USHRT_MAX); + wnd_upper = conn->seq_ack_from_tap + wnd; + if (wnd && SEQ_GT(wnd_upper, conn->seq_wup_from_tap)) + conn->seq_wup_from_tap = wnd_upper; + /* FIXME: reflect the tap-side receiver's window back to the sock-side * sender by adjusting SO_RCVBUF? 
*/ } @@ -1809,6 +1815,7 @@ static void tcp_seq_init(const struct ctx *c, struct tcp_tap_conn *conn, ns = (now->tv_sec * 1000000000 + now->tv_nsec) >> 5; conn->seq_to_tap = ((uint32_t)(hash >> 32) ^ (uint32_t)hash) + ns; + conn->seq_wup_from_tap = conn->seq_to_tap; } /** @@ -2220,7 +2227,6 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn, */ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) { - uint32_t wnd_scaled = conn->wnd_from_tap << conn->ws_from_tap; int fill_bufs, send_bufs = 0, last_len, iov_rem = 0; int sendlen, len, dlen, v4 = CONN_V4(conn); uint32_t max_send, seq, already_sent; @@ -2241,10 +2247,11 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) } /* How much are we still allowed to send within current window ? */ - max_send = conn->seq_ack_from_tap + wnd_scaled - conn->seq_to_tap; + max_send = conn->seq_wup_from_tap - conn->seq_to_tap; if (SEQ_LE(max_send, 0)) { - flow_trace(conn, "Empty window: win: %u, sent: %u", - wnd_scaled, conn->seq_to_tap); + flow_trace(conn, "Empty window: win_upper: %u, sent: %u", + conn->seq_wup_from_tap, conn->seq_to_tap); + conn->seq_wup_from_tap = conn->seq_to_tap; conn_flag(c, conn, STALLED); conn_flag(c, conn, ACK_FROM_TAP_DUE); return 0; @@ -2380,7 +2387,7 @@ static int tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn, ASSERT(conn->events & ESTABLISHED); for (i = idx, iov_i = 0; i < (int)p->count; i++) { - uint32_t seq, seq_offset, ack_seq; + uint32_t seq, seq_offset, ack_seq, wnd; const struct tcphdr *th; char *data; size_t off; @@ -2413,11 +2420,12 @@ static int tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn, if (SEQ_GE(ack_seq, conn->seq_ack_from_tap) && SEQ_GE(ack_seq, max_ack_seq)) { /* Fast re-transmit */ + wnd = ntohs(th->window); retr = !len && !th->fin && ack_seq == max_ack_seq && - ntohs(th->window) == max_ack_seq_wnd; + (wnd == max_ack_seq_wnd || !wnd); - max_ack_seq_wnd = ntohs(th->window); + max_ack_seq_wnd = 
wnd; max_ack_seq = ack_seq; } } @@ -2480,8 +2488,9 @@ static int tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn, if (retr) { flow_trace(conn, - "fast re-transmit, ACK: %u, previous sequence: %u", - max_ack_seq, conn->seq_to_tap); + "fast re-transmit, seqno %u -> %u, win_upper: %u", + conn->seq_to_tap, max_ack_seq, + conn->seq_wup_from_tap); conn->seq_to_tap = max_ack_seq; tcp_set_peek_offset(conn->sock, 0); @@ -2931,6 +2940,9 @@ void tcp_timer_handler(struct ctx *c, union epoll_ref ref) flow_dbg(conn, "activity timeout"); tcp_rst(c, conn); } + /* No data sent recently? Keep connection alive. */ + if (conn->seq_to_tap == conn->seq_ack_from_tap) + tcp_send_flag(c, conn, ACK_IF_NEEDED); } } diff --git a/tcp_conn.h b/tcp_conn.h index d280b22..8ae20ef 100644 --- a/tcp_conn.h +++ b/tcp_conn.h @@ -30,6 +30,7 @@ * @wnd_to_tap: Sending window advertised to tap, unscaled (as sent) * @seq_to_tap: Next sequence for packets to tap * @seq_ack_from_tap: Last ACK number received from tap + * @seq_wup_from_tap: Right edge of last non-zero window from tap * @seq_from_tap: Next sequence for packets from tap (not actually sent) * @seq_ack_to_tap: Last ACK number sent to tap * @seq_init_from_tap: Initial sequence number from tap @@ -101,6 +102,7 @@ struct tcp_tap_conn { uint32_t seq_to_tap; uint32_t seq_ack_from_tap; + uint32_t seq_wup_from_tap; uint32_t seq_from_tap; uint32_t seq_ack_to_tap; uint32_t seq_init_from_tap; -- 2.42.0
On Sat, 11 May 2024 11:20:08 -0400 Jon Maloy <jmaloy(a)redhat.com> wrote:A bug in kernel TCP may lead to a deadlock where a zero window is sent from the peer, while it is unable to send out window updates even after reads have freed up enough buffer space to permit a larger window. In this situation, new window advertisemnts from the peer can only be triggered by packets arriving from this side. However, such packets are never sent, because the zero-window condition currently prevents this side from sending out any packets whatsoever to the peer. We notice that the above bug is triggered *only* after the peer has dropped an arriving packet because of severe memory squeeze, and that we hence always enter a retransmission situation when this occurs. This also means that it goes against the RFC 9293 recommendation that a previously advertised window never should shrink. RFC 9293 gives the solution to this situation. In chapter 3.6.1 we find the following statement: "A TCP receiver SHOULD NOT shrink the window, i.e., move the right window edge to the left (SHLD-14). However, a sending TCP peer MUST be robust against window shrinking, which may cause the "usable window" (see Section 3.8.6.2.1) to become negative (MUST-34). If this happens, the sender SHOULD NOT send new data (SHLD-15), but SHOULD retransmit normally the old unacknowledged data between SND.UNA and SND.UNA+SND.WND (SHLD-16). The sender MAY also retransmit old data beyond SND.UNA+SND.WND (MAY-7)" We never see the window become negative, but we interpret this as a recommendation to use the previously available window during retransmission even when the currently advertised window is zero. In case of a zero-window non-retransmission situation where there is no new data to be sent, we also add a simple zero-window probing feature. 
By sending an empty packet at regular timeout events we resolve the situation described above, since the peer receives the necessary trigger to advertise its window once it becomes non-zero again. It should be noted that although this solves the problem we have at hand, it is not a genuine solution to the kernel bug. There may well be TCP stacks around in other OS-es which don't do this, nor have keep-alive probing as an alternatve way to solve the situation. Signed-off-by: Jon Maloy <jmaloy(a)redhat.com> --- v2: - Using previously advertised window during retransmission, instead highest send sequencece number in the cycle. v3: - Rebased to newest code - Changes based on feedback from PASST team - Sending out empty probe message at timer expiration when we are not in retransmit situation. --- tcp.c | 30 +++++++++++++++++++++--------- tcp_conn.h | 2 ++ 2 files changed, 23 insertions(+), 9 deletions(-) diff --git a/tcp.c b/tcp.c index 8297812..bd6bf35 100644 --- a/tcp.c +++ b/tcp.c @@ -1774,9 +1774,15 @@ static void tcp_get_tap_ws(struct tcp_tap_conn *conn, */ static void tcp_tap_window_update(struct tcp_tap_conn *conn, unsigned wnd) { + uint32_t wnd_upper; + wnd = MIN(MAX_WINDOW, wnd << conn->ws_from_tap); conn->wnd_from_tap = MIN(wnd >> conn->ws_from_tap, USHRT_MAX); + wnd_upper = conn->seq_ack_from_tap + wnd; + if (wnd && SEQ_GT(wnd_upper, conn->seq_wup_from_tap)) + conn->seq_wup_from_tap = wnd_upper; + /* FIXME: reflect the tap-side receiver's window back to the sock-side * sender by adjusting SO_RCVBUF? 
*/ } @@ -1809,6 +1815,7 @@ static void tcp_seq_init(const struct ctx *c, struct tcp_tap_conn *conn, ns = (now->tv_sec * 1000000000 + now->tv_nsec) >> 5; conn->seq_to_tap = ((uint32_t)(hash >> 32) ^ (uint32_t)hash) + ns; + conn->seq_wup_from_tap = conn->seq_to_tap; } /** @@ -2220,7 +2227,6 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn, */ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) { - uint32_t wnd_scaled = conn->wnd_from_tap << conn->ws_from_tap; int fill_bufs, send_bufs = 0, last_len, iov_rem = 0; int sendlen, len, dlen, v4 = CONN_V4(conn); uint32_t max_send, seq, already_sent; @@ -2241,10 +2247,11 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) } /* How much are we still allowed to send within current window ? */ - max_send = conn->seq_ack_from_tap + wnd_scaled - conn->seq_to_tap; + max_send = conn->seq_wup_from_tap - conn->seq_to_tap; if (SEQ_LE(max_send, 0)) { - flow_trace(conn, "Empty window: win: %u, sent: %u", - wnd_scaled, conn->seq_to_tap); + flow_trace(conn, "Empty window: win_upper: %u, sent: %u", + conn->seq_wup_from_tap, conn->seq_to_tap); + conn->seq_wup_from_tap = conn->seq_to_tap; conn_flag(c, conn, STALLED); conn_flag(c, conn, ACK_FROM_TAP_DUE); return 0; @@ -2380,7 +2387,7 @@ static int tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn, ASSERT(conn->events & ESTABLISHED); for (i = idx, iov_i = 0; i < (int)p->count; i++) { - uint32_t seq, seq_offset, ack_seq; + uint32_t seq, seq_offset, ack_seq, wnd; const struct tcphdr *th; char *data; size_t off; @@ -2413,11 +2420,12 @@ static int tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn, if (SEQ_GE(ack_seq, conn->seq_ack_from_tap) && SEQ_GE(ack_seq, max_ack_seq)) { /* Fast re-transmit */ + wnd = ntohs(th->window); retr = !len && !th->fin && ack_seq == max_ack_seq && - ntohs(th->window) == max_ack_seq_wnd; + (wnd == max_ack_seq_wnd || !wnd);Just as a reminder, as I mentioned on Monday: this means 
we'll re-transmit whenever we get a pure window update (!len && !th->fin && ack_seq == max_ack_seq) with a zero window. The receiver is telling us it ran out of space, and wham, we flood them, as a punishment. I would let this check alone, and just add zero-window probing, plus whatever retransmission you mentioned from the RFC -- but not a fast re-transmit on a zero window.- max_ack_seq_wnd = ntohs(th->window); + max_ack_seq_wnd = wnd; max_ack_seq = ack_seq; } } @@ -2480,8 +2488,9 @@ static int tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn, if (retr) { flow_trace(conn, - "fast re-transmit, ACK: %u, previous sequence: %u", - max_ack_seq, conn->seq_to_tap); + "fast re-transmit, seqno %u -> %u, win_upper: %u", + conn->seq_to_tap, max_ack_seq,I'm not sure if "->" really conveys the meaning of "we're sending this sequence *because* of that acknowledgement number". I would rather keep the received acknowledged sequence before everything else, because that's the causal trigger for the retransmission.+ conn->seq_wup_from_tap); conn->seq_to_tap = max_ack_seq; tcp_set_peek_offset(conn->sock, 0); @@ -2931,6 +2940,9 @@ void tcp_timer_handler(struct ctx *c, union epoll_ref ref) flow_dbg(conn, "activity timeout"); tcp_rst(c, conn); } + /* No data sent recently? Keep connection alive. */ + if (conn->seq_to_tap == conn->seq_ack_from_tap) + tcp_send_flag(c, conn, ACK_IF_NEEDED);If the window is zero, this won't send anything, see the first condition in tcp_send_flag(). ACK_IF_NEEDED implies that that function should queue an ACK segment if we have data to acknowledge. Here, the flag you want is simply 'ACK'. But we should make sure that this can't be taken as a duplicate ACK, that is, we should only send this if seq_ack_to_tap == seq_from_tap. 
Otherwise, we shouldn't send anything, lest the peer retransmit anything that we didn't acknowledge yet.} } diff --git a/tcp_conn.h b/tcp_conn.h index d280b22..8ae20ef 100644 --- a/tcp_conn.h +++ b/tcp_conn.h @@ -30,6 +30,7 @@ * @wnd_to_tap: Sending window advertised to tap, unscaled (as sent) * @seq_to_tap: Next sequence for packets to tap * @seq_ack_from_tap: Last ACK number received from tap + * @seq_wup_from_tap: Right edge of last non-zero window from tap"Right edge" makes much more sense to me, and it also matches RFC language. Could we turn all the "wup" and "upper" references into something like "edge" or "right_edge"?* @seq_from_tap: Next sequence for packets from tap (not actually sent) * @seq_ack_to_tap: Last ACK number sent to tap * @seq_init_from_tap: Initial sequence number from tap @@ -101,6 +102,7 @@ struct tcp_tap_conn { uint32_t seq_to_tap; uint32_t seq_ack_from_tap; + uint32_t seq_wup_from_tap; uint32_t seq_from_tap; uint32_t seq_ack_to_tap; uint32_t seq_init_from_tap;-- Stefano
On 2024-05-14 13:46, Stefano Brivio wrote: On Sat, 11 May 2024 11:20:08 -0400 Jon Maloy <jmaloy(a)redhat.com> wrote: I think I have a good idea here. I'll use it in my next version. A bug in kernel TCP may lead to a deadlock where a zero window is sent from the peer, while it is unable to send out window updates even after reads have freed up enough buffer space to permit a larger window. In this situation, new window advertisements from the peer can only be triggered by packets arriving from this side. However, such packets are never sent, because the zero-window condition currently prevents this side from sending out any packets whatsoever to the peer. We notice that the above bug is triggered *only* after the peer has dropped an arriving packet because of severe memory squeeze, and that we hence always enter a retransmission situation when this occurs. This also means that it goes against the RFC 9293 recommendation that a previously advertised window never should shrink. RFC 9293 gives the solution to this situation. In chapter 3.6.1 we find the following statement: "A TCP receiver SHOULD NOT shrink the window, i.e., move the right window edge to the left (SHLD-14). However, a sending TCP peer MUST be robust against window shrinking, which may cause the "usable window" (see Section 3.8.6.2.1) to become negative (MUST-34). If this happens, the sender SHOULD NOT send new data (SHLD-15), but SHOULD retransmit normally the old unacknowledged data between SND.UNA and SND.UNA+SND.WND (SHLD-16). The sender MAY also retransmit old data beyond SND.UNA+SND.WND (MAY-7)" We never see the window become negative, but we interpret this as a recommendation to use the previously available window during retransmission even when the currently advertised window is zero. In case of a zero-window non-retransmission situation where there is no new data to be sent, we also add a simple zero-window probing feature. 
By sending an empty packet at regular timeout events we resolve the situation described above, since the peer receives the necessary trigger to advertise its window once it becomes non-zero again. It should be noted that although this solves the problem we have at hand, it is not a genuine solution to the kernel bug. There may well be TCP stacks around in other OS-es which don't do this, nor have keep-alive probing as an alternative way to solve the situation. Signed-off-by: Jon Maloy <jmaloy(a)redhat.com> --- v2: - Using previously advertised window during retransmission, instead of highest send sequence number in the cycle. v3: - Rebased to newest code - Changes based on feedback from PASST team - Sending out empty probe message at timer expiration when we are not in retransmit situation. --- tcp.c | 30 +++++++++++++++++++++--------- tcp_conn.h | 2 ++ 2 files changed, 23 insertions(+), 9 deletions(-) diff --git a/tcp.c b/tcp.c index 8297812..bd6bf35 100644 --- a/tcp.c +++ b/tcp.c @@ -1774,9 +1774,15 @@ static void tcp_get_tap_ws(struct tcp_tap_conn *conn, */ static void tcp_tap_window_update(struct tcp_tap_conn *conn, unsigned wnd) { + uint32_t wnd_upper; + wnd = MIN(MAX_WINDOW, wnd << conn->ws_from_tap); conn->wnd_from_tap = MIN(wnd >> conn->ws_from_tap, USHRT_MAX); + wnd_upper = conn->seq_ack_from_tap + wnd; + if (wnd && SEQ_GT(wnd_upper, conn->seq_wup_from_tap)) + conn->seq_wup_from_tap = wnd_upper; + /* FIXME: reflect the tap-side receiver's window back to the sock-side * sender by adjusting SO_RCVBUF? 
*/ } @@ -1809,6 +1815,7 @@ static void tcp_seq_init(const struct ctx *c, struct tcp_tap_conn *conn, ns = (now->tv_sec * 1000000000 + now->tv_nsec) >> 5; conn->seq_to_tap = ((uint32_t)(hash >> 32) ^ (uint32_t)hash) + ns; + conn->seq_wup_from_tap = conn->seq_to_tap; } /** @@ -2220,7 +2227,6 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn, */ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) { - uint32_t wnd_scaled = conn->wnd_from_tap << conn->ws_from_tap; int fill_bufs, send_bufs = 0, last_len, iov_rem = 0; int sendlen, len, dlen, v4 = CONN_V4(conn); uint32_t max_send, seq, already_sent; @@ -2241,10 +2247,11 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) } /* How much are we still allowed to send within current window ? */ - max_send = conn->seq_ack_from_tap + wnd_scaled - conn->seq_to_tap; + max_send = conn->seq_wup_from_tap - conn->seq_to_tap; if (SEQ_LE(max_send, 0)) { - flow_trace(conn, "Empty window: win: %u, sent: %u", - wnd_scaled, conn->seq_to_tap); + flow_trace(conn, "Empty window: win_upper: %u, sent: %u", + conn->seq_wup_from_tap, conn->seq_to_tap); + conn->seq_wup_from_tap = conn->seq_to_tap; conn_flag(c, conn, STALLED); conn_flag(c, conn, ACK_FROM_TAP_DUE); return 0; @@ -2380,7 +2387,7 @@ static int tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn, ASSERT(conn->events & ESTABLISHED); for (i = idx, iov_i = 0; i < (int)p->count; i++) { - uint32_t seq, seq_offset, ack_seq; + uint32_t seq, seq_offset, ack_seq, wnd; const struct tcphdr *th; char *data; size_t off; @@ -2413,11 +2420,12 @@ static int tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn, if (SEQ_GE(ack_seq, conn->seq_ack_from_tap) && SEQ_GE(ack_seq, max_ack_seq)) { /* Fast re-transmit */ + wnd = ntohs(th->window); retr = !len && !th->fin && ack_seq == max_ack_seq && - ntohs(th->window) == max_ack_seq_wnd; + (wnd == max_ack_seq_wnd || !wnd);Just as a reminder, as I mentioned on Monday: this means 
we'll re-transmit whenever we get a pure window update (!len && !th->fin && ack_seq == max_ack_seq) with a zero window. The receiver is telling us it ran out of space, and wham, we flood them, as a punishment. I would let this check alone, and just add zero-window probing, plus whatever retransmission you mentioned from the RFC -- but not a fast re-transmit on a zero window.It really means "we are rewinding seq_to_tap from X to Y". That it is caused by a duplicate ack is implicit.- max_ack_seq_wnd = ntohs(th->window); + max_ack_seq_wnd = wnd; max_ack_seq = ack_seq; } } @@ -2480,8 +2488,9 @@ static int tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn, if (retr) { flow_trace(conn, - "fast re-transmit, ACK: %u, previous sequence: %u", - max_ack_seq, conn->seq_to_tap); + "fast re-transmit, seqno %u -> %u, win_upper: %u", + conn->seq_to_tap, max_ack_seq,I'm not sure if "->" really conveys the meaning of "we're sending this sequence *because* of that acknowledgement number".I would rather keep the received acknowledged sequence before everything else, because that's the causal trigger for the retransmission.Ok. I missed that.+ conn->seq_wup_from_tap); conn->seq_to_tap = max_ack_seq; tcp_set_peek_offset(conn->sock, 0); @@ -2931,6 +2940,9 @@ void tcp_timer_handler(struct ctx *c, union epoll_ref ref) flow_dbg(conn, "activity timeout"); tcp_rst(c, conn); } + /* No data sent recently? Keep connection alive. */ + if (conn->seq_to_tap == conn->seq_ack_from_tap) + tcp_send_flag(c, conn, ACK_IF_NEEDED);If the window is zero, this won't send anything, see the first condition in tcp_send_flag(). ACK_IF_NEEDED implies that that function should queue an ACK segment if we have data to acknowledge.Here, the flag you want is simply 'ACK'. But we should make sure that this can't be taken as a duplicate ACK, that is, we should only send this if seq_ack_to_tap == seq_from_tap. 
Otherwise, we shouldn't send anything, lest the peer retransmit anything that we didn't acknowledge yet. But then we have no probing... Wasn't that the whole point of this? I tried to come up with something short, because the field name becomes impractically long. I am open to suggestions. ///jon} } diff --git a/tcp_conn.h b/tcp_conn.h index d280b22..8ae20ef 100644 --- a/tcp_conn.h +++ b/tcp_conn.h @@ -30,6 +30,7 @@ * @wnd_to_tap: Sending window advertised to tap, unscaled (as sent) * @seq_to_tap: Next sequence for packets to tap * @seq_ack_from_tap: Last ACK number received from tap + * @seq_wup_from_tap: Right edge of last non-zero window from tap"Right edge" makes much more sense to me, and it also matches RFC language. Could we turn all the "wup" and "upper" references into something like "edge" or "right_edge"?> * @seq_from_tap: Next sequence for packets from tap (not actually sent) > * @seq_ack_to_tap: Last ACK number sent to tap > * @seq_init_from_tap: Initial sequence number from tap > @@ -101,6 +102,7 @@ struct tcp_tap_conn { > > uint32_t seq_to_tap; > uint32_t seq_ack_from_tap; > + uint32_t seq_wup_from_tap; > uint32_t seq_from_tap; > uint32_t seq_ack_to_tap; > uint32_t seq_init_from_tap;
On Tue, 14 May 2024 16:19:16 -0400 Jon Maloy <jmaloy(a)redhat.com> wrote: On 2024-05-14 13:46, Stefano Brivio wrote: I wouldn't take that for granted, so much that with the current version of this patch, it's *not* necessarily caused by a duplicate acknowledgement. Anyway, it really doesn't look intuitive to me, and users have to figure out what's happening, too. On Sat, 11 May 2024 11:20:08 -0400 Jon Maloy <jmaloy(a)redhat.com> wrote: I think I have a good idea here. I'll use it in my next version. A bug in kernel TCP may lead to a deadlock where a zero window is sent from the peer, while it is unable to send out window updates even after reads have freed up enough buffer space to permit a larger window. In this situation, new window advertisements from the peer can only be triggered by packets arriving from this side. However, such packets are never sent, because the zero-window condition currently prevents this side from sending out any packets whatsoever to the peer. We notice that the above bug is triggered *only* after the peer has dropped an arriving packet because of severe memory squeeze, and that we hence always enter a retransmission situation when this occurs. This also means that it goes against the RFC 9293 recommendation that a previously advertised window never should shrink. RFC 9293 gives the solution to this situation. In chapter 3.6.1 we find the following statement: "A TCP receiver SHOULD NOT shrink the window, i.e., move the right window edge to the left (SHLD-14). However, a sending TCP peer MUST be robust against window shrinking, which may cause the "usable window" (see Section 3.8.6.2.1) to become negative (MUST-34). If this happens, the sender SHOULD NOT send new data (SHLD-15), but SHOULD retransmit normally the old unacknowledged data between SND.UNA and SND.UNA+SND.WND (SHLD-16). 
The sender MAY also retransmit old data beyond SND.UNA+SND.WND (MAY-7)" We never see the window become negative, but we interpret this as a recommendation to use the previously available window during retransmission even when the currently advertised window is zero. In case of a zero-window non-retransmission situation where there is no new data to be sent, we also add a simple zero-window probing feature. By sending an empty packet at regular timeout events we resolve the situation described above, since the peer receives the necessary trigger to advertise its window once it becomes non-zero again. It should be noted that although this solves the problem we have at hand, it is not a genuine solution to the kernel bug. There may well be TCP stacks around in other OS-es which don't do this, nor have keep-alive probing as an alternative way to solve the situation. Signed-off-by: Jon Maloy <jmaloy(a)redhat.com> --- v2: - Using previously advertised window during retransmission, instead of highest send sequence number in the cycle. v3: - Rebased to newest code - Changes based on feedback from PASST team - Sending out empty probe message at timer expiration when we are not in retransmit situation. --- tcp.c | 30 +++++++++++++++++++++--------- tcp_conn.h | 2 ++ 2 files changed, 23 insertions(+), 9 deletions(-) diff --git a/tcp.c b/tcp.c index 8297812..bd6bf35 100644 --- a/tcp.c +++ b/tcp.c @@ -1774,9 +1774,15 @@ static void tcp_get_tap_ws(struct tcp_tap_conn *conn, */ static void tcp_tap_window_update(struct tcp_tap_conn *conn, unsigned wnd) { + uint32_t wnd_upper; + wnd = MIN(MAX_WINDOW, wnd << conn->ws_from_tap); conn->wnd_from_tap = MIN(wnd >> conn->ws_from_tap, USHRT_MAX); + wnd_upper = conn->seq_ack_from_tap + wnd; + if (wnd && SEQ_GT(wnd_upper, conn->seq_wup_from_tap)) + conn->seq_wup_from_tap = wnd_upper; + /* FIXME: reflect the tap-side receiver's window back to the sock-side * sender by adjusting SO_RCVBUF? 
*/ } @@ -1809,6 +1815,7 @@ static void tcp_seq_init(const struct ctx *c, struct tcp_tap_conn *conn, ns = (now->tv_sec * 1000000000 + now->tv_nsec) >> 5; conn->seq_to_tap = ((uint32_t)(hash >> 32) ^ (uint32_t)hash) + ns; + conn->seq_wup_from_tap = conn->seq_to_tap; } /** @@ -2220,7 +2227,6 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn, */ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) { - uint32_t wnd_scaled = conn->wnd_from_tap << conn->ws_from_tap; int fill_bufs, send_bufs = 0, last_len, iov_rem = 0; int sendlen, len, dlen, v4 = CONN_V4(conn); uint32_t max_send, seq, already_sent; @@ -2241,10 +2247,11 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) } /* How much are we still allowed to send within current window ? */ - max_send = conn->seq_ack_from_tap + wnd_scaled - conn->seq_to_tap; + max_send = conn->seq_wup_from_tap - conn->seq_to_tap; if (SEQ_LE(max_send, 0)) { - flow_trace(conn, "Empty window: win: %u, sent: %u", - wnd_scaled, conn->seq_to_tap); + flow_trace(conn, "Empty window: win_upper: %u, sent: %u", + conn->seq_wup_from_tap, conn->seq_to_tap); + conn->seq_wup_from_tap = conn->seq_to_tap; conn_flag(c, conn, STALLED); conn_flag(c, conn, ACK_FROM_TAP_DUE); return 0; @@ -2380,7 +2387,7 @@ static int tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn, ASSERT(conn->events & ESTABLISHED); for (i = idx, iov_i = 0; i < (int)p->count; i++) { - uint32_t seq, seq_offset, ack_seq; + uint32_t seq, seq_offset, ack_seq, wnd; const struct tcphdr *th; char *data; size_t off; @@ -2413,11 +2420,12 @@ static int tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn, if (SEQ_GE(ack_seq, conn->seq_ack_from_tap) && SEQ_GE(ack_seq, max_ack_seq)) { /* Fast re-transmit */ + wnd = ntohs(th->window); retr = !len && !th->fin && ack_seq == max_ack_seq && - ntohs(th->window) == max_ack_seq_wnd; + (wnd == max_ack_seq_wnd || !wnd);Just as a reminder, as I mentioned on Monday: this means 
we'll re-transmit whenever we get a pure window update (!len && !th->fin && ack_seq == max_ack_seq) with a zero window. The receiver is telling us it ran out of space, and wham, we flood them, as a punishment. I would let this check alone, and just add zero-window probing, plus whatever retransmission you mentioned from the RFC -- but not a fast re-transmit on a zero window.It really means "we are rewinding seq_to_tap from X to Y". That it is caused by a duplicate ack is implicit.- max_ack_seq_wnd = ntohs(th->window); + max_ack_seq_wnd = wnd; max_ack_seq = ack_seq; } } @@ -2480,8 +2488,9 @@ static int tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn, if (retr) { flow_trace(conn, - "fast re-transmit, ACK: %u, previous sequence: %u", - max_ack_seq, conn->seq_to_tap); + "fast re-transmit, seqno %u -> %u, win_upper: %u", + conn->seq_to_tap, max_ack_seq,I'm not sure if "->" really conveys the meaning of "we're sending this sequence *because* of that acknowledgement number".We generally do. We would have no probing only in case an ACK for data that the peer *sent* us is due by us (the other way around). There, probing would mean causing the peer to re-transmit, which we don't want to trigger here.I would rather keep the received acknowledged sequence before everything else, because that's the causal trigger for the retransmission.Ok. I missed that.+ conn->seq_wup_from_tap); conn->seq_to_tap = max_ack_seq; tcp_set_peek_offset(conn->sock, 0); @@ -2931,6 +2940,9 @@ void tcp_timer_handler(struct ctx *c, union epoll_ref ref) flow_dbg(conn, "activity timeout"); tcp_rst(c, conn); } + /* No data sent recently? Keep connection alive. */ + if (conn->seq_to_tap == conn->seq_ack_from_tap) + tcp_send_flag(c, conn, ACK_IF_NEEDED);If the window is zero, this won't send anything, see the first condition in tcp_send_flag(). ACK_IF_NEEDED implies that that function should queue an ACK segment if we have data to acknowledge.Here, the flag you want is simply 'ACK'. 
But we should make sure that this can't be taken as a duplicate ACK, that is, we should only send this if seq_ack_to_tap == seq_from_tap. Otherwise, we shouldn't send anything, lest the peer retransmit anything that we didn't acknowledge yet. But then we have no probing... Wasn't that the whole point of this? @wnd_edge_from_tap? -- Stefano I tried to come up with something short, because the field name becomes impractically long. I am open to suggestions.} } diff --git a/tcp_conn.h b/tcp_conn.h index d280b22..8ae20ef 100644 --- a/tcp_conn.h +++ b/tcp_conn.h @@ -30,6 +30,7 @@ * @wnd_to_tap: Sending window advertised to tap, unscaled (as sent) * @seq_to_tap: Next sequence for packets to tap * @seq_ack_from_tap: Last ACK number received from tap + * @seq_wup_from_tap: Right edge of last non-zero window from tap"Right edge" makes much more sense to me, and it also matches RFC language. Could we turn all the "wup" and "upper" references into something like "edge" or "right_edge"?