[PATCH v9 0/5] Retry SYNs for inbound connections
When a client connects, SYN would be sent to guest only once. If the guest is not connected or ready at that time, the connection will be reset in 10s. These patches introduce the SYN retry mechanism using the similar backoff timeout as linux kernel. Also update the data retransmission timeout using the backoff timeout. v9: - Squash the patch of adding parameter c to tcp_timer_ctl() into the later one - Fix the cppcheck error - Update the return values of read_file(), remove the error message of it and add check for buf_size - Rename RTO_INIT_ACK to RTO_INIT_AFTER_SYN_RETRIES - Round up rto_max, fix the conversion in subtraction - Some other minor fixes related to wording v8: - Remove the TCP_/tcp_ suffix from certain macro and struct member names - Add parameter struct ctx *c to tcp_timer_ctl() - Modify rto_max type from size_t to int - Clamp syn_retries and syn_linear_timeouts with MAX_SYNCNT(127) - Some other minor fixes related to wording and formatting v7: - Update read_file() and read_file_integer() - Rename tcp_syn_params_init() to tcp_get_rto_params() - Modify the implementation of the timeout assignment - Add a patch to clamp the retry timeout Yumei Huang (5): tcp: Rename "retrans" to "retries" util: Introduce read_file() and read_file_integer() function tcp: Resend SYN for inbound connections tcp: Update data retransmission timeout tcp: Clamp the retry timeout tcp.c | 107 +++++++++++++++++++++++++++++++++++++++-------------- tcp.h | 6 +++ tcp_conn.h | 13 ++++--- util.c | 90 ++++++++++++++++++++++++++++++++++++++++++++ util.h | 1 + 5 files changed, 184 insertions(+), 33 deletions(-) -- 2.51.1
Rename "retrans" to "retries" so it can be used for SYN retries.
Signed-off-by: Yumei Huang
Signed-off-by: Yumei Huang
If a client connects while guest is not connected or ready yet,
resend SYN instead of just resetting connection after 10 seconds.
Use the same backoff calculation for the timeout as Linux kernel.
Link: https://bugs.passt.top/show_bug.cgi?id=153
Signed-off-by: Yumei Huang
Use an exponential backoff timeout for data retransmission according
to RFC 2988 and RFC 6298. Set the initial RTO to one second as discussed
in Appendix A of RFC 6298.
Also combine the macros defining the initial RTO for both SYN and ACK.
Signed-off-by: Yumei Huang
Clamp the TCP retry timeout as Linux kernel does. If a retry occurs
during the handshake and the RTO is below 3 seconds, re-initialise
it to 3 seconds for data retransmissions according to RFC 6298.
Suggested-by: Stefano Brivio
On Tue, Nov 25, 2025 at 03:26:36PM +0800, Yumei Huang wrote:
If a client connects while guest is not connected or ready yet, resend SYN instead of just resetting connection after 10 seconds.
Use the same backoff calculation for the timeout as Linux kernel.
Link: https://bugs.passt.top/show_bug.cgi?id=153 Signed-off-by: Yumei Huang
Reviewed-by: David Gibson
--- tcp.c | 77 ++++++++++++++++++++++++++++++++++++++++++++++------------- tcp.h | 4 ++++ 2 files changed, 65 insertions(+), 16 deletions(-)
diff --git a/tcp.c b/tcp.c index 05f0d8b..2887f2c 100644 --- a/tcp.c +++ b/tcp.c @@ -179,9 +179,11 @@ * * Timeouts are implemented by means of timerfd timers, set based on flags: * - * - SYN_TIMEOUT: if no ACK is received from tap/guest during handshake (flag - * ACK_FROM_TAP_DUE without ESTABLISHED event) within this time, reset the - * connection + * - SYN_TIMEOUT_INIT: if no SYN,ACK is received from tap/guest during + * handshake (flag ACK_FROM_TAP_DUE without ESTABLISHED event) within + * this time, resend SYN. It's the starting timeout for the first SYN + * retry. Retry for TCP_MAX_RETRIES or (syn_retries + syn_linear_timeouts)
English usage nit: s/for //
+ * times, reset the connection
English usage nit: s/, reset/, then reset/
* * - ACK_TIMEOUT: if no ACK segment was received from tap/guest, after sending * data (flag ACK_FROM_TAP_DUE with ESTABLISHED event), re-send data from the @@ -340,7 +342,7 @@ enum { #define WINDOW_DEFAULT 14600 /* RFC 6928 */
#define ACK_INTERVAL 10 /* ms */ -#define SYN_TIMEOUT 10 /* s */ +#define SYN_TIMEOUT_INIT 1 /* s, RFC 6928 */ #define ACK_TIMEOUT 2 #define FIN_TIMEOUT 60 #define ACT_TIMEOUT 7200 @@ -365,6 +367,13 @@ uint8_t tcp_migrate_rcv_queue [TCP_MIGRATE_RCV_QUEUE_MAX];
#define TCP_MIGRATE_RESTORE_CHUNK_MIN 1024 /* Try smaller when above this */
+#define SYN_RETRIES "/proc/sys/net/ipv4/tcp_syn_retries" +#define SYN_LINEAR_TIMEOUTS "/proc/sys/net/ipv4/tcp_syn_linear_timeouts" + +#define SYN_RETRIES_DEFAULT 6 +#define SYN_LINEAR_TIMEOUTS_DEFAULT 4 +#define MAX_SYNCNT 127 /* derived from kernel's limit */ + /* "Extended" data (not stored in the flow table) for TCP flow migration */ static struct tcp_tap_transfer_ext migrate_ext[FLOW_MAX];
@@ -543,11 +552,12 @@ static int tcp_epoll_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
/** * tcp_timer_ctl() - Set timerfd based on flags/events, create timerfd if needed + * @c: Execution context * @conn: Connection pointer * * #syscalls timerfd_create timerfd_settime */ -static void tcp_timer_ctl(struct tcp_tap_conn *conn) +static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn) { struct itimerspec it = { { 0 }, { 0 } };
@@ -584,10 +594,13 @@ static void tcp_timer_ctl(struct tcp_tap_conn *conn) if (conn->flags & ACK_TO_TAP_DUE) { it.it_value.tv_nsec = (long)ACK_INTERVAL * 1000 * 1000; } else if (conn->flags & ACK_FROM_TAP_DUE) { - if (!(conn->events & ESTABLISHED)) - it.it_value.tv_sec = SYN_TIMEOUT; - else + if (!(conn->events & ESTABLISHED)) { + int exp; + exp = (int)conn->retries - c->tcp.syn_linear_timeouts; + it.it_value.tv_sec = SYN_TIMEOUT_INIT << MAX(exp, 0); + } else { it.it_value.tv_sec = ACK_TIMEOUT; + } } else if (CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED)) { it.it_value.tv_sec = FIN_TIMEOUT; } else { @@ -631,7 +644,7 @@ void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn, * flags and factor this into the logic below. */ if (flag == ACK_FROM_TAP_DUE) - tcp_timer_ctl(conn); + tcp_timer_ctl(c, conn);
return; } @@ -647,7 +660,7 @@ void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn, if (flag == ACK_FROM_TAP_DUE || flag == ACK_TO_TAP_DUE || (flag == ~ACK_FROM_TAP_DUE && (conn->flags & ACK_TO_TAP_DUE)) || (flag == ~ACK_TO_TAP_DUE && (conn->flags & ACK_FROM_TAP_DUE))) - tcp_timer_ctl(conn); + tcp_timer_ctl(c, conn); }
/** @@ -703,7 +716,7 @@ void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn, }
if (CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED)) - tcp_timer_ctl(conn); + tcp_timer_ctl(c, conn); }
/** @@ -1771,7 +1784,7 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn, seq, conn->seq_from_tap);
tcp_send_flag(c, conn, ACK); - tcp_timer_ctl(conn); + tcp_timer_ctl(c, conn);
if (p->count == 1) { tcp_tap_window_update(c, conn, @@ -2422,11 +2435,21 @@ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref)
if (conn->flags & ACK_TO_TAP_DUE) { tcp_send_flag(c, conn, ACK_IF_NEEDED); - tcp_timer_ctl(conn); + tcp_timer_ctl(c, conn); } else if (conn->flags & ACK_FROM_TAP_DUE) { if (!(conn->events & ESTABLISHED)) { - flow_dbg(conn, "handshake timeout"); - tcp_rst(c, conn); + int max; + max = c->tcp.syn_retries + c->tcp.syn_linear_timeouts; + max = MIN(TCP_MAX_RETRIES, max); + if (conn->retries >= max) { + flow_dbg(conn, "handshake timeout"); + tcp_rst(c, conn); + } else { + flow_trace(conn, "SYN timeout, retry"); + tcp_send_flag(c, conn, SYN); + conn->retries++; + tcp_timer_ctl(c, conn); + } } else if (CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED)) { flow_dbg(conn, "FIN timeout"); tcp_rst(c, conn); @@ -2444,7 +2467,7 @@ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref) return;
tcp_data_from_sock(c, conn); - tcp_timer_ctl(conn); + tcp_timer_ctl(c, conn); } } else { struct itimerspec new = { { 0 }, { ACT_TIMEOUT, 0 } }; @@ -2782,6 +2805,26 @@ static socklen_t tcp_probe_tcp_info(void) return sl; }
+/** + * tcp_get_rto_params() - Get host kernel RTO parameters + * @c: Execution context + */ +static void tcp_get_rto_params(struct ctx *c) +{ + intmax_t v; + + v = read_file_integer(SYN_RETRIES, SYN_RETRIES_DEFAULT); + c->tcp.syn_retries = MIN(v, MAX_SYNCNT); + + v = read_file_integer(SYN_LINEAR_TIMEOUTS, SYN_LINEAR_TIMEOUTS_DEFAULT); + c->tcp.syn_linear_timeouts = MIN(v, MAX_SYNCNT); + + debug("Using TCP RTO parameters, syn_retries: %"PRIu8 + ", syn_linear_timeouts: %"PRIu8, + c->tcp.syn_retries, + c->tcp.syn_linear_timeouts); +} + /** * tcp_init() - Get initial sequence, hash secret, initialise per-socket data * @c: Execution context @@ -2792,6 +2835,8 @@ int tcp_init(struct ctx *c) { ASSERT(!c->no_tcp);
+ tcp_get_rto_params(c); + tcp_sock_iov_init(c);
memset(init_sock_pool4, 0xff, sizeof(init_sock_pool4)); diff --git a/tcp.h b/tcp.h index 0082386..37d7758 100644 --- a/tcp.h +++ b/tcp.h @@ -60,12 +60,16 @@ union tcp_listen_epoll_ref { * @fwd_out: Port forwarding configuration for outbound packets * @timer_run: Timestamp of most recent timer run * @pipe_size: Size of pipes for spliced connections + * @syn_retries: SYN retries using exponential backoff timeout + * @syn_linear_timeouts: SYN retries before using exponential backoff timeout */ struct tcp_ctx { struct fwd_ports fwd_in; struct fwd_ports fwd_out; struct timespec timer_run; size_t pipe_size; + uint8_t syn_retries; + uint8_t syn_linear_timeouts; };
#endif /* TCP_H */ -- 2.51.1
-- David Gibson (he or they) | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you, not the other way | around. http://www.ozlabs.org/~dgibson
On Tue, Nov 25, 2025 at 03:26:35PM +0800, Yumei Huang wrote:
Signed-off-by: Yumei Huang
--- util.c | 90 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ util.h | 1 + 2 files changed, 91 insertions(+) diff --git a/util.c b/util.c index ab23463..c56f920 100644 --- a/util.c +++ b/util.c @@ -589,6 +589,96 @@ int write_file(const char *path, const char *buf) return len == 0 ? 0 : -1; }
+/** + * read_file() - Read contents of file into a NULL-terminated buffer + * @path: Path to file to read + * @buf: Buffer to store file contents + * @buf_size: Size of buffer + * + * Return: number of bytes read on success, negative error code on failure + */ +static ssize_t read_file(const char *path, char *buf, size_t buf_size) +{ + int fd = open(path, O_RDONLY | O_CLOEXEC); + size_t total_read = 0; + + if (fd < 0) + return -errno; + + if (!buf_size) { + close(fd); + return -EINVAL; + }
Nit: usually it's preferable to put the error checks with the least "impact" first. So in this case, it would be slightly better to check buf_size first, before even opening the file. What you have is still correct though, so this isn't worth a respin.
+ + while (total_read < buf_size) { + ssize_t rc = read(fd, buf + total_read, buf_size - total_read); + + if (rc < 0) { + int errno_save = errno; + close(fd); + return -errno_save; + } + + if (rc == 0) + break; + + total_read += rc; + } + + close(fd); + + if (total_read == buf_size) { + buf[buf_size - 1] = '\0'; + return -ENOBUFS; + } + + buf[total_read] = '\0'; + + return total_read; +} + +/** + * read_file_integer() - Read an integer value from a file + * @path: Path to file to read + * @fallback: Default value if file can't be read + * + * Return: integer value, @fallback on failure + */ +intmax_t read_file_integer(const char *path, intmax_t fallback) +{ + ssize_t bytes_read; + char buf[BUFSIZ]; + intmax_t value; + char *end; + + bytes_read = read_file(path, buf, sizeof(buf)); + + if (bytes_read < 0) + goto error; + + if (bytes_read == 0) { + debug("Empty file %s", path); + goto error; + } + + errno = 0; + value = strtoimax(buf, &end, 10); + if (*end && *end != '\n') { + debug("Non-numeric content in %s", path); + goto error; + } + if (errno) { + debug("Out of range value in %s: %s", path, buf); + goto error; + } + + return value; + +error: + debug("Using %"PRIdMAX" as default value", fallback);
I think this does need to say what it's using this default for. So something like: "Couldn't read %s, using %"PRIdMAX"d as default value"
+ return fallback; +} + #ifdef __ia64__ /* Needed by do_clone() below: glibc doesn't export the prototype of __clone2(), * use the description from clone(2). diff --git a/util.h b/util.h index a0b2ada..6dec14b 100644 --- a/util.h +++ b/util.h @@ -229,6 +229,7 @@ void pidfile_write(int fd, pid_t pid); int __daemon(int pidfile_fd, int devnull_fd); int fls(unsigned long x); int write_file(const char *path, const char *buf); +intmax_t read_file_integer(const char *path, intmax_t fallback); int write_all_buf(int fd, const void *buf, size_t len); int write_remainder(int fd, const struct iovec *iov, size_t iovcnt, size_t skip); int read_all_buf(int fd, void *buf, size_t len); -- 2.51.1
-- David Gibson (he or they) | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you, not the other way | around. http://www.ozlabs.org/~dgibson
On Tue, Nov 25, 2025 at 03:26:38PM +0800, Yumei Huang wrote:
Clamp the TCP retry timeout as Linux kernel does. If a retry occurs during the handshake and the RTO is below 3 seconds, re-initialise it to 3 seconds for data retransmissions according to RFC 6298.
Suggested-by: Stefano Brivio
Signed-off-by: Yumei Huang
Reviewed-by: David Gibson
--- tcp.c | 25 ++++++++++++++++++++----- tcp.h | 2 ++ tcp_conn.h | 1 + 3 files changed, 23 insertions(+), 5 deletions(-)
diff --git a/tcp.c b/tcp.c index b3aa064..887b32f 100644 --- a/tcp.c +++ b/tcp.c @@ -187,6 +187,9 @@ * for established connections, or (syn_retries + syn_linear_timeouts) times * during the handshake, reset the connection * + * - RTO_INIT_AFTER_SYN_RETRIES: if SYN retries happened during handshake and + * RTO is less than this, re-initialise RTO to this for data retransmissions + * * - FIN_TIMEOUT: if a FIN segment was sent to tap/guest (flag ACK_FROM_TAP_DUE * with TAP_FIN_SENT event), and no ACK is received within this time, reset * the connection @@ -340,6 +343,7 @@ enum {
#define ACK_INTERVAL 10 /* ms */ #define RTO_INIT 1 /* s, RFC 6298 */ +#define RTO_INIT_AFTER_SYN_RETRIES 3 /* s, RFC 6298 */ #define FIN_TIMEOUT 60 #define ACT_TIMEOUT 7200
@@ -365,9 +369,11 @@ uint8_t tcp_migrate_rcv_queue [TCP_MIGRATE_RCV_QUEUE_MAX];
#define SYN_RETRIES "/proc/sys/net/ipv4/tcp_syn_retries" #define SYN_LINEAR_TIMEOUTS "/proc/sys/net/ipv4/tcp_syn_linear_timeouts" +#define RTO_MAX_MS "/proc/sys/net/ipv4/tcp_rto_max_ms"
#define SYN_RETRIES_DEFAULT 6 #define SYN_LINEAR_TIMEOUTS_DEFAULT 4 +#define RTO_MAX_MS_DEFAULT 120000 #define MAX_SYNCNT 127 /* derived from kernel's limit */
/* "Extended" data (not stored in the flow table) for TCP flow migration */ @@ -392,7 +398,7 @@ static const char *tcp_state_str[] __attribute((__unused__)) = {
static const char *tcp_flag_str[] __attribute((__unused__)) = { "STALLED", "LOCAL", "ACTIVE_CLOSE", "ACK_TO_TAP_DUE", - "ACK_FROM_TAP_DUE", "ACK_FROM_TAP_BLOCKS", + "ACK_FROM_TAP_DUE", "ACK_FROM_TAP_BLOCKS", "SYN_RETRIED", };
/* Listening sockets, used for automatic port forwarding in pasta mode only */ @@ -590,10 +596,13 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn) if (conn->flags & ACK_TO_TAP_DUE) { it.it_value.tv_nsec = (long)ACK_INTERVAL * 1000 * 1000; } else if (conn->flags & ACK_FROM_TAP_DUE) { - int exp = conn->retries; + int exp = conn->retries, timeout = RTO_INIT; if (!(conn->events & ESTABLISHED)) exp -= c->tcp.syn_linear_timeouts; - it.it_value.tv_sec = RTO_INIT << MAX(exp, 0); + else if (conn->flags & SYN_RETRIED) + timeout = MAX(timeout, RTO_INIT_AFTER_SYN_RETRIES); + timeout <<= MAX(exp, 0); + it.it_value.tv_sec = MIN(timeout, c->tcp.rto_max); } else if (CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED)) { it.it_value.tv_sec = FIN_TIMEOUT; } else { @@ -2441,6 +2450,7 @@ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref) flow_trace(conn, "SYN timeout, retry"); tcp_send_flag(c, conn, SYN); conn->retries++; + conn_flag(c, conn, SYN_RETRIED); tcp_timer_ctl(c, conn); } } else if (CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED)) { @@ -2812,10 +2822,15 @@ static void tcp_get_rto_params(struct ctx *c) v = read_file_integer(SYN_LINEAR_TIMEOUTS, SYN_LINEAR_TIMEOUTS_DEFAULT); c->tcp.syn_linear_timeouts = MIN(v, MAX_SYNCNT);
+ v = read_file_integer(RTO_MAX_MS, RTO_MAX_MS_DEFAULT); + c->tcp.rto_max = MIN(DIV_ROUND_UP(v, 1000), INT_MAX); + debug("Using TCP RTO parameters, syn_retries: %"PRIu8 - ", syn_linear_timeouts: %"PRIu8, + ", syn_linear_timeouts: %"PRIu8 + ", rto_max: %d", c->tcp.syn_retries, - c->tcp.syn_linear_timeouts); + c->tcp.syn_linear_timeouts, + c->tcp.rto_max); }
/** diff --git a/tcp.h b/tcp.h index 37d7758..6fb6f92 100644 --- a/tcp.h +++ b/tcp.h @@ -60,6 +60,7 @@ union tcp_listen_epoll_ref { * @fwd_out: Port forwarding configuration for outbound packets * @timer_run: Timestamp of most recent timer run * @pipe_size: Size of pipes for spliced connections + * @rto_max: Maximum retry timeout (in s) * @syn_retries: SYN retries using exponential backoff timeout * @syn_linear_timeouts: SYN retries before using exponential backoff timeout */ @@ -68,6 +69,7 @@ struct tcp_ctx { struct fwd_ports fwd_out; struct timespec timer_run; size_t pipe_size; + int rto_max; uint8_t syn_retries; uint8_t syn_linear_timeouts; }; diff --git a/tcp_conn.h b/tcp_conn.h index 923af36..e36910c 100644 --- a/tcp_conn.h +++ b/tcp_conn.h @@ -77,6 +77,7 @@ struct tcp_tap_conn { #define ACK_TO_TAP_DUE BIT(3) #define ACK_FROM_TAP_DUE BIT(4) #define ACK_FROM_TAP_BLOCKS BIT(5) +#define SYN_RETRIED BIT(6)
#define SNDBUF_BITS 24 unsigned int sndbuf :SNDBUF_BITS; -- 2.51.1
-- David Gibson (he or they) | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you, not the other way | around. http://www.ozlabs.org/~dgibson
On Wed, 26 Nov 2025 15:07:25 +1100
David Gibson
On Tue, Nov 25, 2025 at 03:26:35PM +0800, Yumei Huang wrote:
Signed-off-by: Yumei Huang
--- util.c | 90 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ util.h | 1 + 2 files changed, 91 insertions(+) diff --git a/util.c b/util.c index ab23463..c56f920 100644 --- a/util.c +++ b/util.c @@ -589,6 +589,96 @@ int write_file(const char *path, const char *buf) return len == 0 ? 0 : -1; }
+/** + * read_file() - Read contents of file into a NULL-terminated buffer + * @path: Path to file to read + * @buf: Buffer to store file contents + * @buf_size: Size of buffer + * + * Return: number of bytes read on success, negative error code on failure + */ +static ssize_t read_file(const char *path, char *buf, size_t buf_size) +{ + int fd = open(path, O_RDONLY | O_CLOEXEC); + size_t total_read = 0; + + if (fd < 0) + return -errno; + + if (!buf_size) { + close(fd); + return -EINVAL; + }
Nit: usually it's preferable to put the error checks with the least "impact" first. So in this case, it would be slightly better to check buf_size first, before even opening the file. What you have is still correct though, so this isn't worth a respin.
+ + while (total_read < buf_size) { + ssize_t rc = read(fd, buf + total_read, buf_size - total_read); + + if (rc < 0) { + int errno_save = errno; + close(fd); + return -errno_save; + } + + if (rc == 0) + break; + + total_read += rc; + } + + close(fd); + + if (total_read == buf_size) { + buf[buf_size - 1] = '\0'; + return -ENOBUFS; + } + + buf[total_read] = '\0'; + + return total_read; +} + +/** + * read_file_integer() - Read an integer value from a file + * @path: Path to file to read + * @fallback: Default value if file can't be read + * + * Return: integer value, @fallback on failure + */ +intmax_t read_file_integer(const char *path, intmax_t fallback) +{ + ssize_t bytes_read; + char buf[BUFSIZ]; + intmax_t value; + char *end; + + bytes_read = read_file(path, buf, sizeof(buf)); + + if (bytes_read < 0) + goto error; + + if (bytes_read == 0) { + debug("Empty file %s", path); + goto error; + } + + errno = 0; + value = strtoimax(buf, &end, 10); + if (*end && *end != '\n') { + debug("Non-numeric content in %s", path); + goto error; + } + if (errno) { + debug("Out of range value in %s: %s", path, buf); + goto error; + } + + return value; + +error: + debug("Using %"PRIdMAX" as default value", fallback);
I think this does need to say what it's using this default for. So something like: "Couldn't read %s, using %"PRIdMAX"d as default value"
Right, in some cases it's printed in the message just before (as long as we don't have multi-threading, at least), but in the most common case (kernel too old): --- $ ./pasta -d [...] 0.0163: Using 4 as default value 0.0163: Using 120000 as default value 0.0164: Using TCP RTO parameters, syn_retries: 6, syn_linear_timeouts: 4, rto_max: 120 --- The rest of the comments from myself and David aren't worth a re-spin, but I think this one is, because it might actually confuse users.
+ return fallback; +} + #ifdef __ia64__ /* Needed by do_clone() below: glibc doesn't export the prototype of __clone2(), * use the description from clone(2). diff --git a/util.h b/util.h index a0b2ada..6dec14b 100644 --- a/util.h +++ b/util.h @@ -229,6 +229,7 @@ void pidfile_write(int fd, pid_t pid); int __daemon(int pidfile_fd, int devnull_fd); int fls(unsigned long x); int write_file(const char *path, const char *buf); +intmax_t read_file_integer(const char *path, intmax_t fallback); int write_all_buf(int fd, const void *buf, size_t len); int write_remainder(int fd, const struct iovec *iov, size_t iovcnt, size_t skip); int read_all_buf(int fd, void *buf, size_t len); -- 2.51.1
-- Stefano
On Tue, 25 Nov 2025 15:26:37 +0800
Yumei Huang
Use an exponential backoff timeout for data retransmission according to RFC 2988 and RFC 6298. Set the initial RTO to one second as discussed in Appendix A of RFC 6298.
Also combine the macros defining the initial RTO for both SYN and ACK.
Signed-off-by: Yumei Huang
Reviewed-by: David Gibson --- tcp.c | 31 ++++++++++++------------------- 1 file changed, 12 insertions(+), 19 deletions(-) diff --git a/tcp.c b/tcp.c index 2887f2c..b3aa064 100644 --- a/tcp.c +++ b/tcp.c @@ -179,16 +179,13 @@ * * Timeouts are implemented by means of timerfd timers, set based on flags: * - * - SYN_TIMEOUT_INIT: if no SYN,ACK is received from tap/guest during - * handshake (flag ACK_FROM_TAP_DUE without ESTABLISHED event) within - * this time, resend SYN. It's the starting timeout for the first SYN - * retry. Retry for TCP_MAX_RETRIES or (syn_retries + syn_linear_timeouts) - * times, reset the connection - * - * - ACK_TIMEOUT: if no ACK segment was received from tap/guest, after sending - * data (flag ACK_FROM_TAP_DUE with ESTABLISHED event), re-send data from the - * socket and reset sequence to what was acknowledged. If this persists for - * more than TCP_MAX_RETRIES times in a row, reset the connection + * - RTO_INIT: if no ACK segment was received from tap/guest, either during + * handshake (flag ACK_FROM_TAP_DUE without ESTABLISHED event) or after + * sending data (flag ACK_FROM_TAP_DUE with ESTABLISHED event), re-send data + * from the socket and reset sequence to what was acknowledged. This is the + * timeout for the first retry, in seconds. Retry for TCP_MAX_RETRIES times + * for established connections, or (syn_retries + syn_linear_timeouts) times + * during the handshake, reset the connection
Same as David's comment on 3/5: ", then reset the connection". Otherwise, after saying "Retry x times, or y times, ..." is looks like "reset the connection" is another possibility. No, we do that in any case, but later (after retrying), hence "then". Sorry, I just realised this was already like that starting from v7 but we missed it.
* * - FIN_TIMEOUT: if a FIN segment was sent to tap/guest (flag ACK_FROM_TAP_DUE * with TAP_FIN_SENT event), and no ACK is received within this time, reset @@ -342,8 +339,7 @@ enum { #define WINDOW_DEFAULT 14600 /* RFC 6928 */
#define ACK_INTERVAL 10 /* ms */ -#define SYN_TIMEOUT_INIT 1 /* s, RFC 6928 */ -#define ACK_TIMEOUT 2 +#define RTO_INIT 1 /* s, RFC 6298 */ #define FIN_TIMEOUT 60 #define ACT_TIMEOUT 7200
@@ -594,13 +590,10 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn) if (conn->flags & ACK_TO_TAP_DUE) { it.it_value.tv_nsec = (long)ACK_INTERVAL * 1000 * 1000; } else if (conn->flags & ACK_FROM_TAP_DUE) { - if (!(conn->events & ESTABLISHED)) { - int exp; - exp = (int)conn->retries - c->tcp.syn_linear_timeouts; - it.it_value.tv_sec = SYN_TIMEOUT_INIT << MAX(exp, 0); - } else { - it.it_value.tv_sec = ACK_TIMEOUT; - } + int exp = conn->retries; + if (!(conn->events & ESTABLISHED)) + exp -= c->tcp.syn_linear_timeouts; + it.it_value.tv_sec = RTO_INIT << MAX(exp, 0); } else if (CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED)) { it.it_value.tv_sec = FIN_TIMEOUT; } else {
-- Stefano
On Tue, 25 Nov 2025 15:26:38 +0800
Yumei Huang
Clamp the TCP retry timeout as Linux kernel does. If a retry occurs during the handshake and the RTO is below 3 seconds, re-initialise it to 3 seconds for data retransmissions according to RFC 6298.
Suggested-by: Stefano Brivio
Signed-off-by: Yumei Huang --- tcp.c | 25 ++++++++++++++++++++----- tcp.h | 2 ++ tcp_conn.h | 1 + 3 files changed, 23 insertions(+), 5 deletions(-) diff --git a/tcp.c b/tcp.c index b3aa064..887b32f 100644 --- a/tcp.c +++ b/tcp.c @@ -187,6 +187,9 @@ * for established connections, or (syn_retries + syn_linear_timeouts) times * during the handshake, reset the connection * + * - RTO_INIT_AFTER_SYN_RETRIES: if SYN retries happened during handshake and + * RTO is less than this, re-initialise RTO to this for data retransmissions + * * - FIN_TIMEOUT: if a FIN segment was sent to tap/guest (flag ACK_FROM_TAP_DUE * with TAP_FIN_SENT event), and no ACK is received within this time, reset * the connection @@ -340,6 +343,7 @@ enum {
#define ACK_INTERVAL 10 /* ms */ #define RTO_INIT 1 /* s, RFC 6298 */ +#define RTO_INIT_AFTER_SYN_RETRIES 3 /* s, RFC 6298 */ #define FIN_TIMEOUT 60 #define ACT_TIMEOUT 7200
@@ -365,9 +369,11 @@ uint8_t tcp_migrate_rcv_queue [TCP_MIGRATE_RCV_QUEUE_MAX];
#define SYN_RETRIES "/proc/sys/net/ipv4/tcp_syn_retries" #define SYN_LINEAR_TIMEOUTS "/proc/sys/net/ipv4/tcp_syn_linear_timeouts" +#define RTO_MAX_MS "/proc/sys/net/ipv4/tcp_rto_max_ms"
#define SYN_RETRIES_DEFAULT 6 #define SYN_LINEAR_TIMEOUTS_DEFAULT 4 +#define RTO_MAX_MS_DEFAULT 120000
Nit: I think it would make more sense to have this in seconds, because you are rounding it anyway. If somebody changes this to 120001, you'll use 120000. At that point you can just say 120. More importantly, perhaps, if somebody changes this to 1200, you would be using 2000. I think this could be RTO_MAX_DEFAULT (with a comment saying it's in seconds) and later you can just multiply it by 1000 when you use it. The rest of the series looks good to me now.
#define MAX_SYNCNT 127 /* derived from kernel's limit */
/* "Extended" data (not stored in the flow table) for TCP flow migration */ @@ -392,7 +398,7 @@ static const char *tcp_state_str[] __attribute((__unused__)) = {
static const char *tcp_flag_str[] __attribute((__unused__)) = { "STALLED", "LOCAL", "ACTIVE_CLOSE", "ACK_TO_TAP_DUE", - "ACK_FROM_TAP_DUE", "ACK_FROM_TAP_BLOCKS", + "ACK_FROM_TAP_DUE", "ACK_FROM_TAP_BLOCKS", "SYN_RETRIED", };
/* Listening sockets, used for automatic port forwarding in pasta mode only */ @@ -590,10 +596,13 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn) if (conn->flags & ACK_TO_TAP_DUE) { it.it_value.tv_nsec = (long)ACK_INTERVAL * 1000 * 1000; } else if (conn->flags & ACK_FROM_TAP_DUE) { - int exp = conn->retries; + int exp = conn->retries, timeout = RTO_INIT; if (!(conn->events & ESTABLISHED)) exp -= c->tcp.syn_linear_timeouts; - it.it_value.tv_sec = RTO_INIT << MAX(exp, 0); + else if (conn->flags & SYN_RETRIED) + timeout = MAX(timeout, RTO_INIT_AFTER_SYN_RETRIES); + timeout <<= MAX(exp, 0); + it.it_value.tv_sec = MIN(timeout, c->tcp.rto_max); } else if (CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED)) { it.it_value.tv_sec = FIN_TIMEOUT; } else { @@ -2441,6 +2450,7 @@ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref) flow_trace(conn, "SYN timeout, retry"); tcp_send_flag(c, conn, SYN); conn->retries++; + conn_flag(c, conn, SYN_RETRIED); tcp_timer_ctl(c, conn); } } else if (CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED)) { @@ -2812,10 +2822,15 @@ static void tcp_get_rto_params(struct ctx *c) v = read_file_integer(SYN_LINEAR_TIMEOUTS, SYN_LINEAR_TIMEOUTS_DEFAULT); c->tcp.syn_linear_timeouts = MIN(v, MAX_SYNCNT);
+ v = read_file_integer(RTO_MAX_MS, RTO_MAX_MS_DEFAULT); + c->tcp.rto_max = MIN(DIV_ROUND_UP(v, 1000), INT_MAX); + debug("Using TCP RTO parameters, syn_retries: %"PRIu8 - ", syn_linear_timeouts: %"PRIu8, + ", syn_linear_timeouts: %"PRIu8 + ", rto_max: %d", c->tcp.syn_retries, - c->tcp.syn_linear_timeouts); + c->tcp.syn_linear_timeouts, + c->tcp.rto_max); }
/** diff --git a/tcp.h b/tcp.h index 37d7758..6fb6f92 100644 --- a/tcp.h +++ b/tcp.h @@ -60,6 +60,7 @@ union tcp_listen_epoll_ref { * @fwd_out: Port forwarding configuration for outbound packets * @timer_run: Timestamp of most recent timer run * @pipe_size: Size of pipes for spliced connections + * @rto_max: Maximum retry timeout (in s) * @syn_retries: SYN retries using exponential backoff timeout * @syn_linear_timeouts: SYN retries before using exponential backoff timeout */ @@ -68,6 +69,7 @@ struct tcp_ctx { struct fwd_ports fwd_out; struct timespec timer_run; size_t pipe_size; + int rto_max; uint8_t syn_retries; uint8_t syn_linear_timeouts; }; diff --git a/tcp_conn.h b/tcp_conn.h index 923af36..e36910c 100644 --- a/tcp_conn.h +++ b/tcp_conn.h @@ -77,6 +77,7 @@ struct tcp_tap_conn { #define ACK_TO_TAP_DUE BIT(3) #define ACK_FROM_TAP_DUE BIT(4) #define ACK_FROM_TAP_BLOCKS BIT(5) +#define SYN_RETRIED BIT(6)
#define SNDBUF_BITS 24 unsigned int sndbuf :SNDBUF_BITS;
-- Stefano
On Fri, Nov 28, 2025 at 6:33 AM Stefano Brivio
On Tue, 25 Nov 2025 15:26:38 +0800 Yumei Huang
wrote: Clamp the TCP retry timeout as Linux kernel does. If a retry occurs during the handshake and the RTO is below 3 seconds, re-initialise it to 3 seconds for data retransmissions according to RFC 6298.
Suggested-by: Stefano Brivio
Signed-off-by: Yumei Huang --- tcp.c | 25 ++++++++++++++++++++----- tcp.h | 2 ++ tcp_conn.h | 1 + 3 files changed, 23 insertions(+), 5 deletions(-) diff --git a/tcp.c b/tcp.c index b3aa064..887b32f 100644 --- a/tcp.c +++ b/tcp.c @@ -187,6 +187,9 @@ * for established connections, or (syn_retries + syn_linear_timeouts) times * during the handshake, reset the connection * + * - RTO_INIT_AFTER_SYN_RETRIES: if SYN retries happened during handshake and + * RTO is less than this, re-initialise RTO to this for data retransmissions + * * - FIN_TIMEOUT: if a FIN segment was sent to tap/guest (flag ACK_FROM_TAP_DUE * with TAP_FIN_SENT event), and no ACK is received within this time, reset * the connection @@ -340,6 +343,7 @@ enum {
#define ACK_INTERVAL 10 /* ms */ #define RTO_INIT 1 /* s, RFC 6298 */ +#define RTO_INIT_AFTER_SYN_RETRIES 3 /* s, RFC 6298 */ #define FIN_TIMEOUT 60 #define ACT_TIMEOUT 7200
@@ -365,9 +369,11 @@ uint8_t tcp_migrate_rcv_queue [TCP_MIGRATE_RCV_QUEUE_MAX];
#define SYN_RETRIES "/proc/sys/net/ipv4/tcp_syn_retries" #define SYN_LINEAR_TIMEOUTS "/proc/sys/net/ipv4/tcp_syn_linear_timeouts" +#define RTO_MAX_MS "/proc/sys/net/ipv4/tcp_rto_max_ms"
#define SYN_RETRIES_DEFAULT 6 #define SYN_LINEAR_TIMEOUTS_DEFAULT 4 +#define RTO_MAX_MS_DEFAULT 120000
Nit: I think it would make more sense to have this in seconds, because you are rounding it anyway.
I agree and will update.
If somebody changes this to 120001, you'll use 120000. At that point you can just say 120.
Just a little correction, as we are using DIV_ROUND_UP, when it's 120001, we will get 121000 instead of 120000.
More importantly, perhaps, if somebody changes this to 1200, you would be using 2000.
I think this could be RTO_MAX_DEFAULT (with a comment saying it's in seconds) and later you can just multiply it by 1000 when you use it.
The rest of the series looks good to me now.
Thanks for the review.
#define MAX_SYNCNT 127 /* derived from kernel's limit */
/* "Extended" data (not stored in the flow table) for TCP flow migration */ @@ -392,7 +398,7 @@ static const char *tcp_state_str[] __attribute((__unused__)) = {
static const char *tcp_flag_str[] __attribute((__unused__)) = { "STALLED", "LOCAL", "ACTIVE_CLOSE", "ACK_TO_TAP_DUE", - "ACK_FROM_TAP_DUE", "ACK_FROM_TAP_BLOCKS", + "ACK_FROM_TAP_DUE", "ACK_FROM_TAP_BLOCKS", "SYN_RETRIED", };
/* Listening sockets, used for automatic port forwarding in pasta mode only */ @@ -590,10 +596,13 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn) if (conn->flags & ACK_TO_TAP_DUE) { it.it_value.tv_nsec = (long)ACK_INTERVAL * 1000 * 1000; } else if (conn->flags & ACK_FROM_TAP_DUE) { - int exp = conn->retries; + int exp = conn->retries, timeout = RTO_INIT; if (!(conn->events & ESTABLISHED)) exp -= c->tcp.syn_linear_timeouts; - it.it_value.tv_sec = RTO_INIT << MAX(exp, 0); + else if (conn->flags & SYN_RETRIED) + timeout = MAX(timeout, RTO_INIT_AFTER_SYN_RETRIES); + timeout <<= MAX(exp, 0); + it.it_value.tv_sec = MIN(timeout, c->tcp.rto_max); } else if (CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED)) { it.it_value.tv_sec = FIN_TIMEOUT; } else { @@ -2441,6 +2450,7 @@ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref) flow_trace(conn, "SYN timeout, retry"); tcp_send_flag(c, conn, SYN); conn->retries++; + conn_flag(c, conn, SYN_RETRIED); tcp_timer_ctl(c, conn); } } else if (CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED)) { @@ -2812,10 +2822,15 @@ static void tcp_get_rto_params(struct ctx *c) v = read_file_integer(SYN_LINEAR_TIMEOUTS, SYN_LINEAR_TIMEOUTS_DEFAULT); c->tcp.syn_linear_timeouts = MIN(v, MAX_SYNCNT);
+ v = read_file_integer(RTO_MAX_MS, RTO_MAX_MS_DEFAULT); + c->tcp.rto_max = MIN(DIV_ROUND_UP(v, 1000), INT_MAX); + debug("Using TCP RTO parameters, syn_retries: %"PRIu8 - ", syn_linear_timeouts: %"PRIu8, + ", syn_linear_timeouts: %"PRIu8 + ", rto_max: %d", c->tcp.syn_retries, - c->tcp.syn_linear_timeouts); + c->tcp.syn_linear_timeouts, + c->tcp.rto_max); }
/** diff --git a/tcp.h b/tcp.h index 37d7758..6fb6f92 100644 --- a/tcp.h +++ b/tcp.h @@ -60,6 +60,7 @@ union tcp_listen_epoll_ref { * @fwd_out: Port forwarding configuration for outbound packets * @timer_run: Timestamp of most recent timer run * @pipe_size: Size of pipes for spliced connections + * @rto_max: Maximum retry timeout (in s) * @syn_retries: SYN retries using exponential backoff timeout * @syn_linear_timeouts: SYN retries before using exponential backoff timeout */ @@ -68,6 +69,7 @@ struct tcp_ctx { struct fwd_ports fwd_out; struct timespec timer_run; size_t pipe_size; + int rto_max; uint8_t syn_retries; uint8_t syn_linear_timeouts; }; diff --git a/tcp_conn.h b/tcp_conn.h index 923af36..e36910c 100644 --- a/tcp_conn.h +++ b/tcp_conn.h @@ -77,6 +77,7 @@ struct tcp_tap_conn { #define ACK_TO_TAP_DUE BIT(3) #define ACK_FROM_TAP_DUE BIT(4) #define ACK_FROM_TAP_BLOCKS BIT(5) +#define SYN_RETRIED BIT(6)
#define SNDBUF_BITS 24 unsigned int sndbuf :SNDBUF_BITS;
-- Stefano
-- Thanks, Yumei Huang
participants (3)
-
David Gibson
-
Stefano Brivio
-
Yumei Huang