[PATCH v2 00/32] Use dual stack sockets to listen for inbound TCP connections
When forwarding many ports, passt can consume a lot of kernel memory
because of the many listening sockets it opens. There are not a lot of
ways we can reduce that, but here's one.

Currently we create separate listening sockets for each port for both
IPv4 and IPv6. However, in Linux (and probably other platforms), it's
possible to listen for both IPv4 and IPv6 connections on an IPv6
socket. This series uses such dual stack sockets to halve the number
of listening sockets needed for TCP.

When forwarding all TCP and UDP ports, this reduces the kernel memory
used from around 677 MiB to around 487 MiB (kernel 6.0.8 on an x86_64
Fedora 37 machine).

This should also be possible for UDP, but that will require a mostly
separate implementation.

Changes since v2:
 * Assorted minor polishing based on Stefano's review

David Gibson (32):
  clang-tidy: Suppress warning about assignments in if statements
  style: Minor corrections to function comments
  tcp_splice: #include tcp_splice.h in tcp_splice.c
  tcp: Remove unused TCP_MAX_SOCKS constant
  tcp: Better helpers for converting between connection pointer and index
  tcp_splice: Helpers for converting from index to/from tcp_splice_conn
  tcp: Move connection state structures into a shared header
  tcp: Add connection union type
  tcp: Improved helpers to update connections after moving
  tcp: Unify spliced and non-spliced connection tables
  tcp: Unify tcp_defer_handler and tcp_splice_defer_handler()
  tcp: Partially unify tcp_timer() and tcp_splice_timer()
  tcp: Unify the IN_EPOLL flag
  tcp: Separate helpers to create ns listening sockets
  tcp: Unify part of spliced and non-spliced conn_from_sock path
  tcp: Use the same sockets to listen for spliced and non-spliced connections
  tcp: Remove splice from tcp_epoll_ref
  tcp: Don't store hash bucket in connection structures
  inany: Helper functions for handling addresses which could be IPv4 or IPv6
  tcp: Hash IPv4 and IPv4-mapped-IPv6 addresses the same
  tcp: Take tcp_hash_insert() address from struct tcp_conn
  tcp: Simplify tcp_hash_match() to take an inany_addr
  tcp: Unify initial sequence number calculation for IPv4 and IPv6
  tcp: Have tcp_seq_init() take its parameters from struct tcp_conn
  tcp: Fix small errors in tcp_seq_init() time handling
  tcp: Remove v6 flag from tcp_epoll_ref
  tcp: NAT IPv4-mapped IPv6 addresses like IPv4 addresses
  tcp_splice: Allow splicing of connections from IPv4-mapped loopback
  tcp: Consolidate tcp_sock_init[46]
  util: Allow sock_l4() to open dual stack sockets
  util: Always return -1 on error in sock_l4()
  tcp: Use dual stack sockets for port forwarding when possible

 Makefile     |  15 +-
 conf.c       |  12 +-
 inany.h      |  94 +++++
 siphash.c    |   2 +
 tap.c        |   6 +-
 tcp.c        | 981 ++++++++++++++++++++++----------------------
 tcp.h        |  11 +-
 tcp_conn.h   | 192 ++++++++++
 tcp_splice.c | 333 +++++++----------
 tcp_splice.h |  12 +-
 util.c       |  19 +-
 11 files changed, 891 insertions(+), 786 deletions(-)
 create mode 100644 inany.h
 create mode 100644 tcp_conn.h

-- 
2.38.1
clang-tools 15.0.0 appears to have added a new warning that will always
complain about assignments in if statements, which we use in a number of
places in passt/pasta. Encountered on Fedora 37 with
clang-tools-extra-15.0.0-3.fc37.x86_64.
Suppress the new warning so that we can compile and test.
Signed-off-by: David Gibson
Some style issues and a typo.
Signed-off-by: David Gibson
This obvious include was omitted, which means that declarations in the
header weren't checked against definitions in the .c file. This shows up
an old declaration for a function that is now static, and a duplicate
#define.
Signed-off-by: David Gibson
Presumably it meant something in the past, but it's no longer used.
Signed-off-by: David Gibson
The macro CONN_OR_NULL() is used to look up connections by index with
bounds checking. Replace it with an inline function, which means:
- Better type checking
- No danger of multiple evaluation of an @index with side effects
Also add a helper to perform the reverse translation: from connection
pointer to index. Introduce a macro for this which will make later
cleanups easier and safer.
Signed-off-by: David Gibson
Like we already have for non-spliced connections, create a CONN_IDX()
macro for looking up the index of spliced connection structures. Change
the name of the array of spliced connections to be different from that for
non-spliced connections (even though they're in different modules). This
will make subsequent changes a bit safer.
Signed-off-by: David Gibson
Currently spliced and non-spliced connections use completely independent
tracking structures. We want to unify these, so as a preliminary step move
the definitions for both variants into a new tcp_conn.h header, shared by
tcp.c and tcp_splice.c.
This requires renaming some #defines with the same name but different
meanings between the two cases. In the process we correct some places that
are slightly out of sync between the comments and the code for various
event bit names.
Signed-off-by: David Gibson
Currently, the tables for spliced and non-spliced connections are entirely
separate, with different types in different arrays. We want to unify them.
As a first step, create a union type which can represent either a spliced
or non-spliced connection. For them to be distinguishable, the individual
types need to have a common header added, with a bit indicating which type
this structure is.
This comes at the cost of increasing the size of tcp_tap_conn to over one
(64 byte) cacheline. This isn't ideal, but it makes things simpler for now
and we'll re-optimize this later.
Signed-off-by: David Gibson
On Thu, 17 Nov 2022 16:58:44 +1100
David Gibson
[...]
---
 tcp.c        |  4 ++++
 tcp_conn.h   | 30 ++++++++++++++++++++++++++++++
 tcp_splice.c |  2 ++
 3 files changed, 36 insertions(+)

diff --git a/tcp.c b/tcp.c
index 189041c..05eed85 100644
--- a/tcp.c
+++ b/tcp.c
@@ -288,6 +288,7 @@
 #include
 #include
 #include
+#include
 #include

 /* For struct tcp_info */
@@ -601,6 +602,7 @@ static inline struct tcp_tap_conn *conn_at_idx(int index)
 {
 	if ((index < 0) || (index >= TCP_MAX_CONNS))
 		return NULL;
+	assert(!(CONN(index)->c.spliced));
 	return CONN(index);
 }
@@ -2096,6 +2098,7 @@ static void tcp_conn_from_tap(struct ctx *c, int af, const void *addr,
 	}

 	conn = CONN(c->tcp.conn_count++);
+	conn->c.spliced = false;
 	conn->sock = s;
 	conn->timer = -1;
 	conn_event(c, conn, TAP_SYN_RCVD);
@@ -2764,6 +2767,7 @@ static void tcp_conn_from_sock(struct ctx *c, union epoll_ref ref,
 		return;

 	conn = CONN(c->tcp.conn_count++);
+	conn->c.spliced = false;
 	conn->sock = s;
 	conn->timer = -1;
 	conn->ws_to_tap = conn->ws_from_tap = 0;
diff --git a/tcp_conn.h b/tcp_conn.h
index db4c2d9..39d104a 100644
--- a/tcp_conn.h
+++ b/tcp_conn.h
@@ -11,8 +11,19 @@
 #define TCP_HASH_BUCKET_BITS	(TCP_CONN_INDEX_BITS + 1)

+/**
+ * struct tcp_conn_common - Common fields for spliced and non-spliced
+ * @spliced:	Is this a spliced connection?
+ */
+struct tcp_conn_common {
+	bool spliced	:1;
+};
+
+extern const char *tcp_common_flag_str[];
+
 /**
  * struct tcp_tap_conn - Descriptor for a TCP connection (not spliced)
+ * @c:			Fields common with tcp_splice_conn
  * @next_index:		Connection index of next item in hash chain, -1 for none
  * @tap_mss:		MSS advertised by tap/guest, rounded to 2 ^ TCP_MSS_BITS
  * @sock:		Socket descriptor number
@@ -40,6 +51,9 @@
  * @seq_init_from_tap:	Initial sequence number from tap
  */
 struct tcp_tap_conn {
+	/* Must be first element to match tcp_splice_conn */
+	struct tcp_conn_common c;
+
 	int	next_index	:TCP_CONN_INDEX_BITS + 2;

 #define TCP_RETRANS_BITS	3
@@ -122,6 +136,7 @@ struct tcp_tap_conn {

 /**
  * struct tcp_splice_conn - Descriptor for a spliced TCP connection
+ * @c:			Fields common with tcp_tap_conn
  * @a:			File descriptor number of socket for accepted connection
  * @pipe_a_b:		Pipe ends for splice() from @a to @b
  * @b:			File descriptor number of peer connected socket
@@ -134,6 +149,9 @@ struct tcp_tap_conn {
  * @b_written:		Bytes written to @b (not fully written from one @a read)
  */
 struct tcp_splice_conn {
+	/* Must be first element to match tcp_tap_conn */
+	struct tcp_conn_common c;
+
 	int a;
 	int pipe_a_b[2];
 	int b;
@@ -165,4 +183,16 @@ struct tcp_splice_conn {
 	uint32_t b_written;
 };

+/**
+ * union tcp_conn - Descriptor for a TCP connection (spliced or non-spliced)
+ * @c:		Fields common between all variants
+ * @tap:	Fields specific to non-spliced connections
+ * @splice:	Fields specific to spliced connections
+ */
+union tcp_conn {
+	struct tcp_conn_common c;
+	struct tcp_tap_conn tap;
+	struct tcp_splice_conn splice;
+};
Sorry, I could have noticed earlier: I understand that this is needed
to end up, at the end of the series, with a 64-byte tcp_conn, but it
doesn't really look like the most natural way of doing things. I would
have expected something like:
struct tcp_conn {
struct tcp_conn_common c;
union {
struct tcp_tap_conn tap;
struct tcp_splice_conn splice;
} u;
};
but sure, if we do this, then we have 3 bytes between 'c' and 'u', and
struct tcp_conn becomes 68 bytes long.
It also confuses Coverity Scan, because in tcp_table_compact() we have:
memset(hole, 0, sizeof(*hole));
and while the prototype is:
void tcp_table_compact(struct ctx *c, union tcp_conn *hole)
it sees that we're passing, from tcp_splice_destroy(), something
smaller than that (48 bytes), but we're zeroing the whole thing.
Of course, it's not a real issue, that space is reserved for a
connection slot anyway, but given there are no other issues reported,
I'd try to keep Coverity happy if possible.
First try, failed: check hole->c.spliced and, if set, zero only
sizeof(struct tcp_splice_conn) bytes. This looks like a false
positive.
Another try, which should probably work (I just hit the daily build
submission quota, grr): explicitly pass the union tcp_conn containing
our struct tcp_splice_conn. This patch does it:
---
diff --git a/tcp.c b/tcp.c
index 8874789..d635a8e 100644
--- a/tcp.c
+++ b/tcp.c
@@ -591,7 +591,7 @@ static size_t tcp6_l2_flags_buf_bytes;
union tcp_conn tc[TCP_MAX_CONNS];
#define CONN(index) (&tc[(index)].tap)
-#define CONN_IDX(conn) ((union tcp_conn *)(conn) - tc)
+#define CONN_IDX(conn) (TCP_TAP_TO_COMMON(conn) - tc)
/** conn_at_idx() - Find a connection by index, if present
* @index: Index of connection to lookup
@@ -1385,7 +1385,7 @@ static void tcp_conn_destroy(struct ctx *c, struct tcp_tap_conn *conn)
close(conn->timer);
tcp_hash_remove(c, conn);
- tcp_table_compact(c, (union tcp_conn *)conn);
+ tcp_table_compact(c, TCP_TAP_TO_COMMON(conn));
}
static void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn);
diff --git a/tcp_conn.h b/tcp_conn.h
index 4a8be29..fa407ad 100644
--- a/tcp_conn.h
+++ b/tcp_conn.h
@@ -176,6 +176,12 @@ union tcp_conn {
struct tcp_splice_conn splice;
};
+#define TCP_TAP_TO_COMMON(x) \
+ ((union tcp_conn *)((char *)(x) - offsetof(union tcp_conn, tap)))
+
+#define TCP_SPLICE_TO_COMMON(x) \
+ ((union tcp_conn *)((char *)(x) - offsetof(union tcp_conn, splice)))
+
/* TCP connections */
extern union tcp_conn tc[];
diff --git a/tcp_splice.c b/tcp_splice.c
index e2f0ce1..7d3f17e 100644
--- a/tcp_splice.c
+++ b/tcp_splice.c
@@ -37,6 +37,7 @@
#include
On Fri, Nov 18, 2022 at 01:25:18AM +0100, Stefano Brivio wrote:
On Thu, 17 Nov 2022 16:58:44 +1100 David Gibson
wrote:

[...]
Sorry, I could have noticed earlier: I understand that this is needed to end up, at the end of the series, with a 64-byte tcp_conn, but it doesn't really look like the most natural way of doing things. I would have expected something like:
struct tcp_conn {
	struct tcp_conn_common c;
	union {
		struct tcp_tap_conn tap;
		struct tcp_splice_conn splice;
	} u;
};
Heh... so... I actually went back and forth between these two options
about a dozen times while figuring out how to do this. Getting the
thing back down to 64 bytes is not actually the main reason I settled
on the current approach, although it does also make that easier.

The biggest reason I didn't use the union-in-struct approach is that I
wasn't sure if we wanted to introduce container_of mechanics (as you do
in the patch below). Without that we'd have to pass tcp_conn to a lot
of places that mostly just want a tap or splice variant, just so they
can get the index. That in turn also means we have to explicitly
reference into the union in a heap of places - I probably spent an hour
or two attempting this, getting bogged down in trivial text changes,
before giving up and going a different way.

And finally, as you note, with the union-of-structs approach the
compiler is able to pack other bitfields into the spare bits left by
the common header, which makes it much, much easier to get the whole
thing down to 64 bytes again.

In general I don't love the "make sure these structs have the same
header" approach over enforcing it with a union-in-struct, but that's
why I chose it here. Note also that we are already dealing with a
similar pattern for sockaddrs.
but sure, if we do this, then we have 3 bytes between 'c' and 'u', and struct tcp_conn becomes 68 bytes long.
Right. I toyed with a bunch of different options for trying to pull various fields out into the common part to fill that gap, and they were all really painful.
It also confuses Coverity Scan, because in tcp_table_compact() we have:
memset(hole, 0, sizeof(*hole));
and while the prototype is:
void tcp_table_compact(struct ctx *c, union tcp_conn *hole)
it sees that we're passing, from tcp_splice_destroy(), something smaller than that (48 bytes), but we're zeroing the whole thing.
Bother.
Of course, it's not a real issue, that space is reserved for a connection slot anyway, but given there are no other issues reported, I'd try to keep Coverity happy if possible.
First try, failed: check hole->c.spliced and, if set, zero only sizeof(struct tcp_splice_conn) bytes. This looks like a false positive.
Another try, which should probably work (I just hit the daily build submission quota, grr): explicitly pass the union tcp_conn containing our struct tcp_splice_conn. This patch does it:
---
diff --git a/tcp.c b/tcp.c
index 8874789..d635a8e 100644
--- a/tcp.c
+++ b/tcp.c
@@ -591,7 +591,7 @@ static size_t tcp6_l2_flags_buf_bytes;
 union tcp_conn tc[TCP_MAX_CONNS];

 #define CONN(index)		(&tc[(index)].tap)
-#define CONN_IDX(conn)		((union tcp_conn *)(conn) - tc)
+#define CONN_IDX(conn)		(TCP_TAP_TO_COMMON(conn) - tc)

 /** conn_at_idx() - Find a connection by index, if present
  * @index:	Index of connection to lookup
@@ -1385,7 +1385,7 @@ static void tcp_conn_destroy(struct ctx *c, struct tcp_tap_conn *conn)
 		close(conn->timer);

 	tcp_hash_remove(c, conn);
-	tcp_table_compact(c, (union tcp_conn *)conn);
+	tcp_table_compact(c, TCP_TAP_TO_COMMON(conn));
 }

 static void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn);
diff --git a/tcp_conn.h b/tcp_conn.h
index 4a8be29..fa407ad 100644
--- a/tcp_conn.h
+++ b/tcp_conn.h
@@ -176,6 +176,12 @@ union tcp_conn {
 	struct tcp_splice_conn splice;
 };

+#define TCP_TAP_TO_COMMON(x) \
+	((union tcp_conn *)((char *)(x) - offsetof(union tcp_conn, tap)))
+
+#define TCP_SPLICE_TO_COMMON(x) \
+	((union tcp_conn *)((char *)(x) - offsetof(union tcp_conn, splice)))
+
 /* TCP connections */
 extern union tcp_conn tc[];

diff --git a/tcp_splice.c b/tcp_splice.c
index e2f0ce1..7d3f17e 100644
--- a/tcp_splice.c
+++ b/tcp_splice.c
@@ -37,6 +37,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -74,7 +75,7 @@ static int splice_pipe_pool	[TCP_SPLICE_PIPE_POOL_SIZE][2][2];
 #define CONN_V4(x)		(!CONN_V6(x))
 #define CONN_HAS(conn, set)	((conn->events & (set)) == (set))
 #define CONN(index)		(&tc[(index)].splice)
-#define CONN_IDX(conn)		((union tcp_conn *)(conn) - tc)
+#define CONN_IDX(conn)		(TCP_SPLICE_TO_COMMON(conn) - tc)

 /* Display strings for connection events */
 static const char *tcp_splice_event_str[] __attribute((__unused__)) = {
@@ -283,7 +284,7 @@ void tcp_splice_destroy(struct ctx *c, struct tcp_splice_conn *conn)
 	debug("TCP (spliced): index %li, CLOSED", CONN_IDX(conn));

 	c->tcp.splice_conn_count--;
-	tcp_table_compact(c, (union tcp_conn *)conn);
+	tcp_table_compact(c, TCP_SPLICE_TO_COMMON(conn));
 }
Hmm... so TAP_TO_COMMON and SPLICE_TO_COMMON don't actually change the
pointer any more than the cast does, but you're hoping that will be
enough to convince Coverity? I'm a bit skeptical, but I guess we can
try it. I wonder if an explicit Coverity lint override might be a
better option here. Or, we could add padding to tcp_splice_conn so it
has the same size as tcp_tap_conn. I think that would probably
convince Coverity.
---
I can add it on top if you agree, assuming it works.
I also tried to actually turn tcp_conn into a struct. It takes 68 bytes, so I'm not pursuing this approach, but I'm including the diff just in case you have some quick idea to fix it up:
Like I said, I tried this in a bunch of ways, and just got mired in
irritating and not very productive code frobbing.
diff --git a/tcp.c b/tcp.c
index 8874789..6ee5675 100644
--- a/tcp.c
+++ b/tcp.c
@@ -588,10 +588,10 @@ static unsigned int tcp6_l2_flags_buf_used;
 static size_t tcp6_l2_flags_buf_bytes;

 /* TCP connections */
-union tcp_conn tc[TCP_MAX_CONNS];
+struct tcp_conn tc[TCP_MAX_CONNS];

-#define CONN(index)		(&tc[(index)].tap)
-#define CONN_IDX(conn)		((union tcp_conn *)(conn) - tc)
+#define CONN(index)		(&tc[(index)].u.tap)
+#define CONN_IDX(conn)		(TO_TCP_CONN(conn) - tc)

 /** conn_at_idx() - Find a connection by index, if present
  * @index:	Index of connection to lookup
@@ -602,7 +602,7 @@ static inline struct tcp_tap_conn *conn_at_idx(int index)
 {
 	if ((index < 0) || (index >= TCP_MAX_CONNS))
 		return NULL;
-	assert(!(CONN(index)->c.spliced));
+	assert(!TO_TCP_CONN(CONN(index))->c.spliced);
 	return CONN(index);
 }
@@ -660,13 +660,13 @@ static void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn,
  */
 static int tcp_epoll_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
 {
-	int m = conn->c.in_epoll ? EPOLL_CTL_MOD : EPOLL_CTL_ADD;
+	int m = TO_TCP_CONN(conn)->c.in_epoll ? EPOLL_CTL_MOD : EPOLL_CTL_ADD;
 	union epoll_ref ref = { .r.proto = IPPROTO_TCP, .r.s = conn->sock,
 				.r.p.tcp.tcp.index = CONN_IDX(conn) };
 	struct epoll_event ev = { .data.u64 = ref.u64 };

 	if (conn->events == CLOSED) {
-		if (conn->c.in_epoll)
+		if (TO_TCP_CONN(conn)->c.in_epoll)
 			epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->sock, &ev);
 		if (conn->timer != -1)
 			epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->timer, &ev);
@@ -678,7 +678,7 @@ static int tcp_epoll_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
 	if (epoll_ctl(c->epollfd, m, conn->sock, &ev))
 		return -errno;

-	conn->c.in_epoll = true;
+	TO_TCP_CONN(conn)->c.in_epoll = true;

 	if (conn->timer != -1) {
 		union epoll_ref ref_t = { .r.proto = IPPROTO_TCP,
@@ -1347,9 +1347,9 @@ static struct tcp_tap_conn *tcp_hash_lookup(const struct ctx *c,
  * @c:		Execution context
  * @hole:	Pointer to recently closed connection
  */
-void tcp_table_compact(struct ctx *c, union tcp_conn *hole)
+void tcp_table_compact(struct ctx *c, struct tcp_conn *hole)
 {
-	union tcp_conn *from;
+	struct tcp_conn *from;

 	if (CONN_IDX(hole) == --c->tcp.conn_count) {
 		debug("TCP: table compaction: maximum index was %li (%p)",
@@ -1361,14 +1361,15 @@ void tcp_table_compact(struct ctx *c, struct tcp_conn *hole)
 	from = tc + c->tcp.conn_count;
 	memcpy(hole, from, sizeof(*hole));

-	if (from->c.spliced)
-		tcp_splice_conn_update(c, &hole->splice);
+	if (TO_TCP_CONN(from)->c.spliced)
+		tcp_splice_conn_update(c, &hole->u.splice);
 	else
-		tcp_tap_conn_update(c, &from->tap, &hole->tap);
+		tcp_tap_conn_update(c, &from->u.tap, &hole->u.tap);

 	debug("TCP: table compaction (spliced=%d): old index %li, new index %li, "
 	      "from: %p, to: %p",
-	      from->c.spliced, CONN_IDX(from), CONN_IDX(hole), from, hole);
+	      TO_TCP_CONN(from)->c.spliced, CONN_IDX(from), CONN_IDX(hole),
+	      from, hole);

 	memset(from, 0, sizeof(*from));
 }
@@ -1385,7 +1386,7 @@ static void tcp_conn_destroy(struct ctx *c, struct tcp_tap_conn *conn)
 		close(conn->timer);

 	tcp_hash_remove(c, conn);
-	tcp_table_compact(c, (union tcp_conn *)conn);
+	tcp_table_compact(c, TO_TCP_CONN(conn));
 }

 static void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn);
@@ -1523,7 +1524,7 @@ void tcp_defer_handler(struct ctx *c)
 {
 	int max_conns = c->tcp.conn_count / 100 * TCP_CONN_PRESSURE;
 	int max_files = c->nofile / 100 * TCP_FILE_PRESSURE;
-	union tcp_conn *conn;
+	struct tcp_conn *conn;

 	tcp_l2_flags_buf_flush(c);
 	tcp_l2_data_buf_flush(c);
@@ -1533,12 +1534,12 @@ void tcp_defer_handler(struct ctx *c)
 		return;

 	for (conn = tc + c->tcp.conn_count - 1; conn >= tc; conn--) {
-		if (conn->c.spliced) {
-			if (conn->splice.flags & CLOSING)
-				tcp_splice_destroy(c, &conn->splice);
+		if (TO_TCP_CONN(conn)->c.spliced) {
+			if (conn->u.splice.flags & CLOSING)
+				tcp_splice_destroy(c, &conn->u.splice);
 		} else {
-			if (conn->tap.events == CLOSED)
-				tcp_conn_destroy(c, &conn->tap);
+			if (conn->u.tap.events == CLOSED)
+				tcp_conn_destroy(c, &conn->u.tap);
 		}
 	}

@@ -2086,7 +2087,7 @@ static void tcp_conn_from_tap(struct ctx *c, int af, const void *addr,
 	}

 	conn = CONN(c->tcp.conn_count++);
-	conn->c.spliced = false;
+	TO_TCP_CONN(conn)->c.spliced = false;
 	conn->sock = s;
 	conn->timer = -1;
 	conn_event(c, conn, TAP_SYN_RCVD);
@@ -2770,7 +2771,7 @@ static void tcp_tap_conn_from_sock(struct ctx *c, union epoll_ref ref,
 				   struct sockaddr *sa,
 				   const struct timespec *now)
 {
-	conn->c.spliced = false;
+	TO_TCP_CONN(conn)->c.spliced = false;
 	conn->sock = s;
 	conn->timer = -1;
 	conn->ws_to_tap = conn->ws_from_tap = 0;
@@ -2804,7 +2805,7 @@ static void tcp_conn_from_sock(struct ctx *c, union epoll_ref ref,
 			       const struct timespec *now)
 {
 	struct sockaddr_storage sa;
-	union tcp_conn *conn;
+	struct tcp_conn *conn;
 	socklen_t sl;
 	int s;

@@ -2826,11 +2827,11 @@ static void tcp_conn_from_sock(struct ctx *c, union epoll_ref ref,
 	conn = tc + c->tcp.conn_count++;

 	if (c->mode == MODE_PASTA &&
-	    tcp_splice_conn_from_sock(c, ref, &conn->splice,
+	    tcp_splice_conn_from_sock(c, ref, &conn->u.splice,
 				      s, (struct sockaddr *)&sa))
 		return;

-	tcp_tap_conn_from_sock(c, ref, &conn->tap, s,
+	tcp_tap_conn_from_sock(c, ref, &conn->u.tap, s,
 			       (struct sockaddr *)&sa, now);
 }

@@ -2961,7 +2962,7 @@ static void tcp_tap_sock_handler(struct ctx *c, struct tcp_tap_conn *conn,
 void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events,
 		      const struct timespec *now)
 {
-	union tcp_conn *conn;
+	struct tcp_conn *conn;

 	if (ref.r.p.tcp.tcp.timer) {
 		tcp_timer_handler(c, ref);
@@ -2975,10 +2976,10 @@ void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events,

 	conn = tc + ref.r.p.tcp.tcp.index;

-	if (conn->c.spliced)
-		tcp_splice_sock_handler(c, &conn->splice, ref.r.s, events);
+	if (TO_TCP_CONN(conn)->c.spliced)
+		tcp_splice_sock_handler(c, &conn->u.splice, ref.r.s, events);
 	else
-		tcp_tap_sock_handler(c, &conn->tap, events);
+		tcp_tap_sock_handler(c, &conn->u.tap, events);
 }

 /**
@@ -3370,7 +3371,7 @@ static int tcp_port_rebind(void *arg)
 void tcp_timer(struct ctx *c, const struct timespec *ts)
 {
 	struct tcp_sock_refill_arg refill_arg = { c, 0 };
-	union tcp_conn *conn;
+	struct tcp_conn *conn;

 	(void)ts;

@@ -3394,11 +3395,11 @@ void tcp_timer(struct ctx *c, const struct timespec *ts)
 	}

 	for (conn = tc + c->tcp.conn_count - 1; conn >= tc; conn--) {
-		if (conn->c.spliced) {
-			tcp_splice_timer(c, &conn->splice);
+		if (TO_TCP_CONN(conn)->c.spliced) {
+			tcp_splice_timer(c, &conn->u.splice);
 		} else {
-			if (conn->tap.events == CLOSED)
-				tcp_conn_destroy(c, &conn->tap);
+			if (conn->u.tap.events == CLOSED)
+				tcp_conn_destroy(c, &conn->u.tap);
 		}
 	}

diff --git a/tcp_conn.h b/tcp_conn.h
index 4a8be29..3df7905 100644
--- a/tcp_conn.h
+++ b/tcp_conn.h
@@ -23,7 +23,6 @@ extern const char *tcp_common_flag_str[];

 /**
  * struct tcp_tap_conn - Descriptor for a TCP connection (not spliced)
- * @c:			Fields common with tcp_splice_conn
  * @next_index:		Connection index of next item in hash chain, -1 for none
  * @tap_mss:		MSS advertised by tap/guest, rounded to 2 ^ TCP_MSS_BITS
  * @sock:		Socket descriptor number
@@ -47,9 +46,6 @@ extern const char *tcp_common_flag_str[];
  * @seq_init_from_tap:	Initial sequence number from tap
  */
 struct tcp_tap_conn {
-	/* Must be first element to match tcp_splice_conn */
-	struct tcp_conn_common c;
-
 	int	next_index	:TCP_CONN_INDEX_BITS + 2;

 #define TCP_RETRANS_BITS	3
@@ -118,7 +114,6 @@ struct tcp_tap_conn {

 /**
  * struct tcp_splice_conn - Descriptor for a spliced TCP connection
- * @c:			Fields common with tcp_tap_conn
  * @a:			File descriptor number of socket for accepted connection
  * @pipe_a_b:		Pipe ends for splice() from @a to @b
  * @b:			File descriptor number of peer connected socket
@@ -131,9 +126,6 @@ struct tcp_tap_conn {
  * @b_written:		Bytes written to @b (not fully written from one @a read)
  */
 struct tcp_splice_conn {
-	/* Must be first element to match tcp_tap_conn */
-	struct tcp_conn_common c;
-
 	int a;
 	int pipe_a_b[2];
 	int b;
@@ -165,22 +157,27 @@ struct tcp_splice_conn {
 };

 /**
- * union tcp_conn - Descriptor for a TCP connection (spliced or non-spliced)
+ * struct tcp_conn - Descriptor for a TCP connection (spliced or non-spliced)
  * @c:		Fields common between all variants
- * @tap:	Fields specific to non-spliced connections
- * @splice:	Fields specific to spliced connections
+ * @u.tap:	Fields specific to non-spliced connections
+ * @u.splice:	Fields specific to spliced connections
  */
-union tcp_conn {
+struct tcp_conn {
 	struct tcp_conn_common c;
-	struct tcp_tap_conn tap;
-	struct tcp_splice_conn splice;
+	union {
+		struct tcp_tap_conn tap;
+		struct tcp_splice_conn splice;
+	} u;
 };

+#define TO_TCP_CONN(x) \
+	((struct tcp_conn *)((char *)(x) - offsetof(struct tcp_conn, u)))
+
 /* TCP connections */
-extern union tcp_conn tc[];
+extern struct tcp_conn tc[];

 void tcp_splice_conn_update(struct ctx *c, struct tcp_splice_conn *new);
-void tcp_table_compact(struct ctx *c, union tcp_conn *hole);
+void tcp_table_compact(struct ctx *c, struct tcp_conn *hole);
 void tcp_splice_destroy(struct ctx *c, struct tcp_splice_conn *conn);
 void tcp_splice_timer(struct ctx *c, struct tcp_splice_conn *conn);
 void tcp_splice_pipe_refill(const struct ctx *c);
diff --git a/tcp_splice.c b/tcp_splice.c
index e2f0ce1..04fc513 100644
--- a/tcp_splice.c
+++ b/tcp_splice.c
@@ -37,6 +37,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -73,8 +74,8 @@ static int splice_pipe_pool	[TCP_SPLICE_PIPE_POOL_SIZE][2][2];
 #define CONN_V6(x)		(x->flags & SPLICE_V6)
 #define CONN_V4(x)		(!CONN_V6(x))
 #define CONN_HAS(conn, set)	((conn->events & (set)) == (set))
-#define CONN(index)		(&tc[(index)].splice)
-#define CONN_IDX(conn)		((union tcp_conn *)(conn) - tc)
+#define CONN(index)		(&tc[(index)].u.splice)
+#define CONN_IDX(conn)		(TO_TCP_CONN(conn) - tc)

 /* Display strings for connection events */
 static const char *tcp_splice_event_str[] __attribute((__unused__)) = {
@@ -165,7 +166,7 @@ static void conn_flag_do(const struct ctx *c, struct tcp_splice_conn *conn,
 static int tcp_splice_epoll_ctl(const struct ctx *c,
 				struct tcp_splice_conn *conn)
 {
-	int m = conn->c.in_epoll ? EPOLL_CTL_MOD : EPOLL_CTL_ADD;
+	int m = TO_TCP_CONN(conn)->c.in_epoll ? EPOLL_CTL_MOD : EPOLL_CTL_ADD;
 	union epoll_ref ref_a = { .r.proto = IPPROTO_TCP, .r.s = conn->a,
 				  .r.p.tcp.tcp.index = CONN_IDX(conn) };
 	union epoll_ref ref_b = { .r.proto = IPPROTO_TCP, .r.s = conn->b,
@@ -185,7 +186,7 @@ static int tcp_splice_epoll_ctl(const struct ctx *c,
 	    epoll_ctl(c->epollfd, m, conn->b, &ev_b))
 		goto delete;

-	conn->c.in_epoll = true;
+	TO_TCP_CONN(conn)->c.in_epoll = true;

 	return 0;

@@ -283,7 +284,7 @@ void tcp_splice_destroy(struct ctx *c, struct tcp_splice_conn *conn)
 	debug("TCP (spliced): index %li, CLOSED", CONN_IDX(conn));

 	c->tcp.splice_conn_count--;
-	tcp_table_compact(c, (union tcp_conn *)conn);
+	tcp_table_compact(c, TO_TCP_CONN(conn));
 }

 /**
@@ -535,7 +536,7 @@ bool tcp_splice_conn_from_sock(struct ctx *c, union epoll_ref ref,
 	if (setsockopt(s, SOL_TCP, TCP_QUICKACK, &((int){ 1 }), sizeof(int)))
 		trace("TCP (spliced): failed to set TCP_QUICKACK on %i", s);

-	conn->c.spliced = true;
+	TO_TCP_CONN(conn)->c.spliced = true;
 	c->tcp.splice_conn_count++;
 	conn->a = s;
---
I'm fine with this series in any case. If you don't have other ideas, I would just try to get rid of that warning (Out-of-bounds access, CWE-119) with the first diff here, or something similar.
-- David Gibson | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/~dgibson
On Fri, 18 Nov 2022 12:10:47 +1100
David Gibson
On Fri, Nov 18, 2022 at 01:25:18AM +0100, Stefano Brivio wrote:
[...]
It also confuses Coverity Scan, because in tcp_table_compact() we have:
memset(hole, 0, sizeof(*hole));
and while the prototype is:
void tcp_table_compact(struct ctx *c, union tcp_conn *hole)
it sees that we're passing, from tcp_splice_destroy(), something smaller than that (48 bytes), but we're zeroing the whole thing.
Bother.
Of course, it's not a real issue, that space is reserved for a connection slot anyway, but given there are no other issues reported, I'd try to keep Coverity happy if possible.
First try, failed: check hole->c.spliced and, if set, zero only sizeof(struct tcp_splice_conn) bytes. This looks like a false positive.
Another try, which should probably work (I just hit the daily build submission quota, grr): explicitly pass the union tcp_conn containing our struct tcp_splice_conn. This patch does it:
--- diff --git a/tcp.c b/tcp.c index 8874789..d635a8e 100644 --- a/tcp.c +++ b/tcp.c @@ -591,7 +591,7 @@ static size_t tcp6_l2_flags_buf_bytes; union tcp_conn tc[TCP_MAX_CONNS];
#define CONN(index) (&tc[(index)].tap) -#define CONN_IDX(conn) ((union tcp_conn *)(conn) - tc) +#define CONN_IDX(conn) (TCP_TAP_TO_COMMON(conn) - tc)
/** conn_at_idx() - Find a connection by index, if present * @index: Index of connection to lookup @@ -1385,7 +1385,7 @@ static void tcp_conn_destroy(struct ctx *c, struct tcp_tap_conn *conn) close(conn->timer);
tcp_hash_remove(c, conn); - tcp_table_compact(c, (union tcp_conn *)conn); + tcp_table_compact(c, TCP_TAP_TO_COMMON(conn)); }
static void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn); diff --git a/tcp_conn.h b/tcp_conn.h index 4a8be29..fa407ad 100644 --- a/tcp_conn.h +++ b/tcp_conn.h @@ -176,6 +176,12 @@ union tcp_conn { struct tcp_splice_conn splice; };
+#define TCP_TAP_TO_COMMON(x) \ + ((union tcp_conn *)((char *)(x) - offsetof(union tcp_conn, tap))) + +#define TCP_SPLICE_TO_COMMON(x) \ + ((union tcp_conn *)((char *)(x) - offsetof(union tcp_conn, splice))) + /* TCP connections */ extern union tcp_conn tc[];
diff --git a/tcp_splice.c b/tcp_splice.c index e2f0ce1..7d3f17e 100644 --- a/tcp_splice.c +++ b/tcp_splice.c @@ -37,6 +37,7 @@ #include
#include #include +#include #include #include #include @@ -74,7 +75,7 @@ static int splice_pipe_pool [TCP_SPLICE_PIPE_POOL_SIZE][2][2]; #define CONN_V4(x) (!CONN_V6(x)) #define CONN_HAS(conn, set) ((conn->events & (set)) == (set)) #define CONN(index) (&tc[(index)].splice) -#define CONN_IDX(conn) ((union tcp_conn *)(conn) - tc) +#define CONN_IDX(conn) (TCP_SPLICE_TO_COMMON(conn) - tc) /* Display strings for connection events */ static const char *tcp_splice_event_str[] __attribute((__unused__)) = { @@ -283,7 +284,7 @@ void tcp_splice_destroy(struct ctx *c, struct tcp_splice_conn *conn) debug("TCP (spliced): index %li, CLOSED", CONN_IDX(conn));
c->tcp.splice_conn_count--; - tcp_table_compact(c, (union tcp_conn *)conn); + tcp_table_compact(c, TCP_SPLICE_TO_COMMON(conn)); }
Hmm.. so TAP_TO_COMMON and SPLICE_TO_COMMON don't actually change the pointer any more than the cast does, but you're hoping that will be enough to convince coverity? I'm a bit skeptical, but I guess we can try it.
Right, that didn't actually work.
I wonder if an explicit Coverity lint override might be a better option here. Or, we could add padding to tcp_splice_conn so it has the same size as tcp_tap_conn. I think that would probably convince Coverity.
Padding would need to be calculated manually, and a specific override would need its own maintenance, with references to a proprietary tool. I'm not really fond of either. Given that we already have the pointers to the container union in the callers, we could just pass those instead, and use struct tcp_{tap,splice}_conn only in the splice- or "tap"-specific logic, and just there. I just posted a patch doing this.
-- 
Stefano
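The conversion discussed above leans on a basic property of C unions: every member starts at offset 0 of the containing object, so `offsetof(union tcp_conn, splice)` is zero and the `*_TO_COMMON` macros are pure type adjustments rather than pointer arithmetic. A minimal sketch of the pattern, with simplified stand-in structures (not the real passt definitions):

```c
#include <stddef.h>

/* Simplified stand-ins for the real passt connection structures */
struct tap_conn    { unsigned char flags; int sock; };
struct splice_conn { unsigned char flags; int a, b; };

/* Every union member lives at offset 0, so recovering the containing
 * union from a member pointer never moves the pointer: the macro below
 * only changes its type. */
union conn {
	struct tap_conn tap;
	struct splice_conn splice;
};

#define SPLICE_TO_COMMON(x) \
	((union conn *)((char *)(x) - offsetof(union conn, splice)))

static union conn table[8];

/* Index of a spliced connection's slot in the unified table */
static ptrdiff_t splice_idx(struct splice_conn *sc)
{
	return SPLICE_TO_COMMON(sc) - table;
}
```

This is the same round trip `CONN_IDX()` performs in the patches: member pointer, back to the union, then pointer subtraction against the table base.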
When we compact the connection tables (both spliced and non-spliced) we
need to move entries from one slot to another. That requires some updates
in the entries themselves. Add helpers to make all the necessary updates
for the spliced and non-spliced cases. This will simplify later cleanups.
Signed-off-by: David Gibson
Currently spliced and non-spliced connections are stored in completely
separate tables, so there are completely independent limits on the number
of spliced and non-spliced connections. This is a bit counter-intuitive.
More importantly, the fact that the tables are separate prevents us from
unifying some other logic between the two cases. So, merge these two
tables into one, using the 'c.spliced' common field to distinguish between
them when necessary.
For now we keep a common limit of 128k connections, whether they're spliced
or non-spliced, which means we save memory overall. If necessary we could
increase this to a 256k or higher total, which would cost memory but give
some more flexibility.
For now, the code paths which need to step through all extant connections
are still separate for the two cases, just skipping over entries which
aren't for them. We'll improve that in later patches.
Signed-off-by: David Gibson
These two functions each step through non-spliced and spliced connections
respectively and clean up entries for closed connections. To avoid
scanning the connection table twice, we merge these into a single function
which scans the unified table and performs the appropriate sort of cleanup
action on each one.
Signed-off-by: David Gibson
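The merged handler amounts to a single pass over the unified table that dispatches on the shared "spliced" flag. A simplified model of that shape (hypothetical names; the real code calls tcp_splice_destroy() or tcp_conn_destroy() per entry):

```c
#include <stdbool.h>
#include <stddef.h>

/* Toy model of one entry in the unified connection table */
struct conn { bool spliced; bool closed; };

/* Scan the table once, applying the appropriate cleanup per entry.
 * Returns splice_cleanups * 10 + tap_cleanups so a caller can check
 * both counters from a single value. */
static int scan_once(const struct conn *table, size_t n)
{
	int splice_cleanups = 0, tap_cleanups = 0;
	size_t i;

	for (i = 0; i < n; i++) {
		if (!table[i].closed)
			continue;
		if (table[i].spliced)
			splice_cleanups++;	/* would be tcp_splice_destroy() */
		else
			tap_cleanups++;		/* would be tcp_conn_destroy() */
	}
	return splice_cleanups * 10 + tap_cleanups;
}
```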
These two functions scan all the non-spliced and spliced connections
respectively and perform timed updates on them. Avoid scanning the now
unified table twice, by having tcp_timer scan it once calling the
relevant per-connection function for each one.
Signed-off-by: David Gibson
There is very little common between the tcp_tap_conn and tcp_splice_conn
structures. However, both do have an IN_EPOLL flag which has the same
meaning in each case, though it's stored in a different location.
Simplify things slightly by moving this bit into the common header of the
two structures.
Signed-off-by: David Gibson
tcp_sock_init*() can create either sockets listening on the host, or in
the pasta network namespace (with @ns==1). There are, however, a number
of differences in how these two cases work in practice. "ns" sockets
are only used in pasta mode, and they always lead to spliced connections
only. The functions are also only ever called in "ns" mode with a NULL
address and interface name, and it doesn't really make sense for them to
be called any other way.

Later changes will introduce further differences in behaviour between
these two cases, so it makes more sense to use separate functions for
creating the ns listening sockets than the regular external/host
listening sockets.
---
 conf.c |   6 +--
 tcp.c  | 132 ++++++++++++++++++++++++++++++++++++++-------------------
 tcp.h  |   4 +-
 3 files changed, 93 insertions(+), 49 deletions(-)

diff --git a/conf.c b/conf.c
index b07d661..4721c97 100644
--- a/conf.c
+++ b/conf.c
@@ -209,7 +209,7 @@ static int conf_ports(const struct ctx *c, char optname, const char *optarg,
 		for (i = 0; i < PORT_EPHEMERAL_MIN; i++) {
 			if (optname == 't')
-				tcp_sock_init(c, 0, AF_UNSPEC, NULL, NULL, i);
+				tcp_sock_init(c, AF_UNSPEC, NULL, NULL, i);
 			else if (optname == 'u')
 				udp_sock_init(c, 0, AF_UNSPEC, NULL, NULL, i);
 		}
@@ -287,7 +287,7 @@ static int conf_ports(const struct ctx *c, char optname, const char *optarg,
 			bitmap_set(fwd->map, i);
 			if (optname == 't')
-				tcp_sock_init(c, 0, af, addr, ifname, i);
+				tcp_sock_init(c, af, addr, ifname, i);
 			else if (optname == 'u')
 				udp_sock_init(c, 0, af, addr, ifname, i);
 		}
@@ -333,7 +333,7 @@ static int conf_ports(const struct ctx *c, char optname, const char *optarg,
 		fwd->delta[i] = mapped_range.first - orig_range.first;
 		if (optname == 't')
-			tcp_sock_init(c, 0, af, addr, ifname, i);
+			tcp_sock_init(c, af, addr, ifname, i);
 		else if (optname == 'u')
 			udp_sock_init(c, 0, af, addr, ifname, i);
 	}
diff --git a/tcp.c b/tcp.c
index 35fca31..306f928 100644
--- a/tcp.c
+++ b/tcp.c
@@ -2987,15 +2987,15 @@ void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events,
 /**
  * tcp_sock_init4() - Initialise listening sockets for a given IPv4 port
  * @c:		Execution context
- * @ns:		In pasta mode, if set, bind with loopback address in namespace
  * @addr:	Pointer to address for binding, NULL if not configured
  * @ifname:	Name of interface to bind to, NULL if not configured
  * @port:	Port, host order
  */
-static void tcp_sock_init4(const struct ctx *c, int ns, const struct in_addr *addr,
+static void tcp_sock_init4(const struct ctx *c, const struct in_addr *addr,
 			   const char *ifname, in_port_t port)
 {
-	union tcp_epoll_ref tref = { .tcp.listen = 1, .tcp.outbound = ns };
+	in_port_t idx = port + c->tcp.fwd_in.delta[port];
+	union tcp_epoll_ref tref = { .tcp.listen = 1, .tcp.index = idx };
 	bool spliced = false, tap = true;
 	int s;
@@ -3006,14 +3006,9 @@ static void tcp_sock_init4(const struct ctx *c, int ns, const struct in_addr *ad
 		if (!addr)
 			addr = &c->ip4.addr;
 
-		tap = !ns && !IN4_IS_ADDR_LOOPBACK(addr);
+		tap = !IN4_IS_ADDR_LOOPBACK(addr);
 	}
 
-	if (ns)
-		tref.tcp.index = (in_port_t)(port + c->tcp.fwd_out.delta[port]);
-	else
-		tref.tcp.index = (in_port_t)(port + c->tcp.fwd_in.delta[port]);
-
 	if (tap) {
 		s = sock_l4(c, AF_INET, IPPROTO_TCP, addr, ifname, port,
 			    tref.u32);
@@ -3039,29 +3034,25 @@ static void tcp_sock_init4(const struct ctx *c, int ns, const struct in_addr *ad
 		else
 			s = -1;
 
-		if (c->tcp.fwd_out.mode == FWD_AUTO) {
-			if (ns)
-				tcp_sock_ns[port][V4] = s;
-			else
-				tcp_sock_init_lo[port][V4] = s;
-		}
+		if (c->tcp.fwd_out.mode == FWD_AUTO)
+			tcp_sock_init_lo[port][V4] = s;
 	}
 }
 
 /**
  * tcp_sock_init6() - Initialise listening sockets for a given IPv6 port
  * @c:		Execution context
- * @ns:		In pasta mode, if set, bind with loopback address in namespace
  * @addr:	Pointer to address for binding, NULL if not configured
  * @ifname:	Name of interface to bind to, NULL if not configured
  * @port:	Port, host order
  */
-static void tcp_sock_init6(const struct ctx *c, int ns,
+static void tcp_sock_init6(const struct ctx *c,
 			   const struct in6_addr *addr,
 			   const char *ifname, in_port_t port)
 {
-	union tcp_epoll_ref tref = { .tcp.listen = 1, .tcp.outbound = ns,
-				     .tcp.v6 = 1 };
+	in_port_t idx = port + c->tcp.fwd_in.delta[port];
+	union tcp_epoll_ref tref = { .tcp.listen = 1, .tcp.v6 = 1,
+				     .tcp.index = idx };
 	bool spliced = false, tap = true;
 	int s;
@@ -3073,14 +3064,9 @@ static void tcp_sock_init6(const struct ctx *c, int ns,
 		if (!addr)
 			addr = &c->ip6.addr;
 
-		tap = !ns && !IN6_IS_ADDR_LOOPBACK(addr);
+		tap = !IN6_IS_ADDR_LOOPBACK(addr);
 	}
 
-	if (ns)
-		tref.tcp.index = (in_port_t)(port + c->tcp.fwd_out.delta[port]);
-	else
-		tref.tcp.index = (in_port_t)(port + c->tcp.fwd_in.delta[port]);
-
 	if (tap) {
 		s = sock_l4(c, AF_INET6, IPPROTO_TCP, addr, ifname, port,
 			    tref.u32);
@@ -3105,40 +3091,99 @@ static void tcp_sock_init6(const struct ctx *c, int ns,
 		else
 			s = -1;
 
-		if (c->tcp.fwd_out.mode == FWD_AUTO) {
-			if (ns)
-				tcp_sock_ns[port][V6] = s;
-			else
-				tcp_sock_init_lo[port][V6] = s;
-		}
+		if (c->tcp.fwd_out.mode == FWD_AUTO)
+			tcp_sock_init_lo[port][V6] = s;
 	}
 }
 
 /**
- * tcp_sock_init() - Initialise listening sockets for a given port
+ * tcp_sock_init() - Create listening sockets for a given host ("inbound") port
  * @c:		Execution context
- * @ns:		In pasta mode, if set, bind with loopback address in namespace
  * @af:		Address family to select a specific IP version, or AF_UNSPEC
  * @addr:	Pointer to address for binding, NULL if not configured
  * @ifname:	Name of interface to bind to, NULL if not configured
 * @port:	Port, host order
  */
-void tcp_sock_init(const struct ctx *c, int ns, sa_family_t af,
-		   const void *addr, const char *ifname, in_port_t port)
+void tcp_sock_init(const struct ctx *c, sa_family_t af, const void *addr,
+		   const char *ifname, in_port_t port)
 {
 	if ((af == AF_INET || af == AF_UNSPEC) && c->ifi4)
-		tcp_sock_init4(c, ns, addr, ifname, port);
+		tcp_sock_init4(c, addr, ifname, port);
 	if ((af == AF_INET6 || af == AF_UNSPEC) && c->ifi6)
-		tcp_sock_init6(c, ns, addr, ifname, port);
+		tcp_sock_init6(c, addr, ifname, port);
+}
+
+/**
+ * tcp_ns_sock_init4() - Init socket to listen for outbound IPv4 connections
+ * @c:		Execution context
+ * @port:	Port, host order
+ */
+static void tcp_ns_sock_init4(const struct ctx *c, in_port_t port)
+{
+	in_port_t idx = port + c->tcp.fwd_out.delta[port];
+	union tcp_epoll_ref tref = { .tcp.listen = 1, .tcp.outbound = 1,
+				     .tcp.splice = 1, .tcp.index = idx };
+	struct in_addr loopback = { htonl(INADDR_LOOPBACK) };
+	int s;
+
+	assert(c->mode == MODE_PASTA);
+
+	s = sock_l4(c, AF_INET, IPPROTO_TCP, &loopback, NULL, port, tref.u32);
+	if (s >= 0)
+		tcp_sock_set_bufsize(c, s);
+	else
+		s = -1;
+
+	if (c->tcp.fwd_out.mode == FWD_AUTO)
+		tcp_sock_ns[port][V4] = s;
 }
 
 /**
- * tcp_sock_init_ns() - Bind sockets in namespace for outbound connections
+ * tcp_ns_sock_init6() - Init socket to listen for outbound IPv6 connections
+ * @c:		Execution context
+ * @port:	Port, host order
+ */
+static void tcp_ns_sock_init6(const struct ctx *c, in_port_t port)
+{
+	in_port_t idx = port + c->tcp.fwd_out.delta[port];
+	union tcp_epoll_ref tref = { .tcp.listen = 1, .tcp.outbound = 1,
+				     .tcp.splice = 1, .tcp.v6 = 1,
+				     .tcp.index = idx };
+	int s;
+
+	assert(c->mode == MODE_PASTA);
+
+	s = sock_l4(c, AF_INET6, IPPROTO_TCP, &in6addr_loopback, NULL, port,
+		    tref.u32);
+	if (s >= 0)
+		tcp_sock_set_bufsize(c, s);
+	else
+		s = -1;
+
+	if (c->tcp.fwd_out.mode == FWD_AUTO)
+		tcp_sock_ns[port][V6] = s;
+}
+
+/**
+ * tcp_ns_sock_init() - Init socket to listen for spliced outbound connections
+ * @c:		Execution context
+ * @port:	Port, host order
+ */
+void tcp_ns_sock_init(const struct ctx *c, in_port_t port)
+{
+	if (c->ifi4)
+		tcp_ns_sock_init4(c, port);
+	if (c->ifi6)
+		tcp_ns_sock_init6(c, port);
+}
+
+/**
+ * tcp_ns_socks_init() - Bind sockets in namespace for outbound connections
  * @arg:	Execution context
  *
  * Return: 0
  */
-static int tcp_sock_init_ns(void *arg)
+static int tcp_ns_socks_init(void *arg)
 {
 	struct ctx *c = (struct ctx *)arg;
 	unsigned port;
@@ -3149,7 +3194,7 @@ static int tcp_sock_init_ns(void *arg)
 		if (!bitmap_isset(c->tcp.fwd_out.map, port))
 			continue;
 
-		tcp_sock_init(c, 1, AF_UNSPEC, NULL, NULL, port);
+		tcp_ns_sock_init(c, port);
 	}
 
 	return 0;
@@ -3279,7 +3324,7 @@ int tcp_init(struct ctx *c)
 	if (c->mode == MODE_PASTA) {
 		tcp_splice_init(c);
 
-		NS_CALL(tcp_sock_init_ns, c);
+		NS_CALL(tcp_ns_socks_init, c);
 
 		refill_arg.ns = 1;
 		NS_CALL(tcp_sock_refill, &refill_arg);
@@ -3364,8 +3409,7 @@ static int tcp_port_rebind(void *arg)
 			if ((a->c->ifi4 && tcp_sock_ns[port][V4] == -1) ||
 			    (a->c->ifi6 && tcp_sock_ns[port][V6] == -1))
-				tcp_sock_init(a->c, 1, AF_UNSPEC, NULL, NULL,
-					      port);
+				tcp_ns_sock_init(a->c, port);
 		}
 	} else {
 		for (port = 0; port < NUM_PORTS; port++) {
@@ -3398,7 +3442,7 @@ static int tcp_port_rebind(void *arg)
 			if ((a->c->ifi4 && tcp_sock_init_ext[port][V4] == -1) ||
 			    (a->c->ifi6 && tcp_sock_init_ext[port][V6] == -1))
-				tcp_sock_init(a->c, 0, AF_UNSPEC, NULL, NULL,
+				tcp_sock_init(a->c, AF_UNSPEC, NULL, NULL,
 					      port);
 		}
 	}
diff --git a/tcp.h b/tcp.h
index 49738ef..f4ed298 100644
--- a/tcp.h
+++ b/tcp.h
@@ -19,8 +19,8 @@ void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events,
 		      const struct timespec *now);
 int tcp_tap_handler(struct ctx *c, int af, const void *addr,
 		    const struct pool *p, const struct timespec *now);
-void tcp_sock_init(const struct ctx *c, int ns, sa_family_t af,
-		   const void *addr, const char *ifname, in_port_t port);
+void tcp_sock_init(const struct ctx *c, sa_family_t af, const void *addr,
+		   const char *ifname, in_port_t port);
 int tcp_init(struct ctx *c);
 void tcp_timer(struct ctx *c, const struct timespec *ts);
 void tcp_defer_handler(struct ctx *c);
-- 
2.38.1
In tcp_sock_handler() we split off to handle spliced sockets before
checking anything else. However the first steps of the "new connection"
path for each case are the same: allocate a connection entry and accept()
the connection.
Remove this duplication by making tcp_conn_from_sock() handle both spliced
and non-spliced cases, with help from more specific tcp_tap_conn_from_sock
and tcp_splice_conn_from_sock functions for the later stages which differ.
Signed-off-by: David Gibson
In pasta mode, tcp_sock_init[46]() create separate sockets to listen for
spliced connections (these are bound to localhost) and non-spliced
connections (these are bound to the host address). This introduces a
subtle behavioural difference between pasta and passt: by default, pasta
will listen only on a single host address, whereas passt will listen on
all addresses (0.0.0.0 or ::). This also prevents us using some additional
optimizations that only work with the unspecified (0.0.0.0 or ::) address.
However, it turns out we don't need to do this. We can splice a connection
if and only if it originates from the loopback address. Currently we
ensure this by having the "spliced" listening sockets listening only on
loopback. Instead, defer the decision about whether to splice a connection
until after accept(), by checking if the connection was made from the
loopback address.
Signed-off-by: David Gibson
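The post-accept() decision described above boils down to classifying the peer address: splice if and only if the connection originates from loopback. A hedged sketch of such a check, using standard sockaddr types (the helper name is invented, and the real series uses its inany helpers instead):

```c
#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdbool.h>
#include <string.h>
#include <sys/socket.h>

/* Decide after accept() whether a connection could be spliced: only
 * connections coming from a loopback address qualify.  On a dual stack
 * socket, IPv4 peers show up as IPv4-mapped IPv6 addresses. */
static bool can_splice(const struct sockaddr *sa)
{
	if (sa->sa_family == AF_INET) {
		const struct sockaddr_in *s4 = (const struct sockaddr_in *)sa;

		return ntohl(s4->sin_addr.s_addr) >> 24 == 127; /* 127/8 */
	}
	if (sa->sa_family == AF_INET6) {
		const struct sockaddr_in6 *s6 =
			(const struct sockaddr_in6 *)sa;
		const struct in6_addr *a = &s6->sin6_addr;

		if (IN6_IS_ADDR_V4MAPPED(a))	/* ::ffff:127.0.0.0/8 */
			return a->s6_addr[12] == 127;
		return !memcmp(a, &in6addr_loopback, sizeof(*a));
	}
	return false;
}
```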
Currently the epoll reference for tcp sockets includes a bit indicating
whether the socket maps to a spliced connection. However, the reference
also has the index of the connection structure which also indicates whether
it is spliced. We can therefore avoid the splice bit in the epoll_ref by
unifying the first part of the non-spliced and spliced handlers where we
look up the connection state.
Signed-off-by: David Gibson
Currently when we insert a connection into the hash table, we store its
bucket number so we can find it when removing entries. However, we can
recompute the hash value from other contents of the structure so we don't
need to store it. This brings the size of tcp_tap_conn down to 64 bytes
again, which means it will fit in a single cacheline on common machines.
This change also removes a non-obvious constraint that the hash table have
fewer than twice TCP_MAX_CONNS buckets, because of the way
TCP_HASH_BUCKET_BITS was constructed.
Signed-off-by: David Gibson
struct tcp_conn stores an address which could be IPv6 or IPv4 using a
union. We can do this without an additional tag by encoding IPv4 addresses
as IPv4-mapped IPv6 addresses.
This approach is useful more widely than this one place in tcp_conn, so
expose a new 'union inany_addr' like this from a new inany.h. Along with
that create a number of helper functions to make working with these "inany"
addresses easier.
Signed-off-by: David Gibson
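The idea is a 16-byte union that always holds an IPv6 address, with IPv4 addresses encoded as IPv4-mapped addresses (::ffff:a.b.c.d), so no discriminating tag is needed. A simplified sketch of what such a type and helpers might look like (names modelled on the series; the actual inany.h may differ in detail):

```c
#include <netinet/in.h>
#include <stdbool.h>
#include <string.h>

/* Address that can be IPv4 or IPv6: IPv4 is stored IPv4-mapped, so the
 * 16 bytes alone identify the address, with no separate family tag. */
union inany_addr {
	struct in6_addr a6;
	struct {
		unsigned char zero[10];
		unsigned char one[2];	/* 0xff, 0xff for IPv4-mapped */
		struct in_addr a4;
	} v4mapped;
};

/* Store an IPv4 address as ::ffff:a.b.c.d */
static void inany_from_v4(union inany_addr *ia, const struct in_addr *a4)
{
	memset(ia->v4mapped.zero, 0, sizeof(ia->v4mapped.zero));
	memset(ia->v4mapped.one, 0xff, sizeof(ia->v4mapped.one));
	ia->v4mapped.a4 = *a4;
}

/* Return the IPv4 part if this is an (IPv4-mapped) IPv4 address */
static const struct in_addr *inany_v4(const union inany_addr *ia)
{
	if (!IN6_IS_ADDR_V4MAPPED(&ia->a6))
		return NULL;
	return &ia->v4mapped.a4;
}

/* Since the representation is canonical, equality is a plain memcmp() */
static bool inany_equals(const union inany_addr *a,
			 const union inany_addr *b)
{
	return !memcmp(a, b, sizeof(*a));
}
```

Because IPv4 and IPv4-mapped IPv6 peers end up as the same bytes, hashing or comparing an inany_addr trivially treats the two cases identically, which is exactly what the following patches rely on.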
In the tcp_conn structure, we represent the address with an inany_addr
which could be an IPv4 or IPv6 address. However, we have different paths
which will calculate different hashes for IPv4 and equivalent IPv4-mapped
IPv6 addresses. This will cause problems for some future changes.
Make the hash function work the same for these two cases, by taking an
inany_addr directly. Since this represents IPv4 and IPv4-mapped IPv6
addresses the same way, it will trivially hash the same for both cases.
Callers are changed to construct an inany_addr from whatever they have.
Some of that will be elided in later changes.
Signed-off-by: David Gibson
tcp_hash_insert() takes an address to control which hash bucket the
connection will go into. However, an inany_addr representation of that
address is already stored in struct tcp_conn.
Now that we've made the hashing of IPv4 and IPv4-mapped IPv6 addresses
equivalent, we can simplify tcp_hash_insert() to use the address in
struct tcp_conn, rather than taking it as an extra parameter.
Signed-off-by: David Gibson
tcp_hash_match() can take either an IPv4 (struct in_addr) or IPv6 (struct
in6_addr) address. It has two different paths for each of those cases.
However, its only caller has already constructed an equivalent inany
representation of the address, so we can have tcp_hash_match take that
directly and use a simpler comparison with the inany_equals() helper.
Signed-off-by: David Gibson
tcp_seq_init() has separate paths for IPv4 and IPv6 addresses, which means
we will calculate different sequence numbers for IPv4 and equivalent
IPv4-mapped IPv6 addresses.
Change it to treat these the same by always converting the input address
into an inany_addr representation and use that to calculate the sequence
number.
Signed-off-by: David Gibson
tcp_seq_init() takes a number of parameters for the connection, but at
every call site, these are already populated in the tcp_conn structure.
Likewise we always store the result into the @seq_to_tap field.
Use this to simplify tcp_seq_init().
Signed-off-by: David Gibson
It looks like tcp_seq_init() is supposed to advance the sequence number
by one every 32ns. However we only right shift the ns part of the timespec
not the seconds part, meaning that we'll advance by an extra 32 steps on
each second.
I don't know if that's exploitable in any way, but it doesn't appear to be
the intent, nor what RFC 6528 suggests.
In addition, we convert from seconds to nanoseconds with a multiplication
by '1E9'. In C '1E9' is a floating point constant, forcing a conversion
to floating point and back for what should be an integer calculation
(confirmed with objdump and Makefile default compiler flags). Spell out
1000000000 in full to avoid that.
Signed-off-by: David Gibson
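In other words, the whole timestamp should be converted to nanoseconds first and only then shifted, using integer arithmetic throughout. A sketch of the corrected calculation (simplified names, not the actual tcp_seq_init() code):

```c
#include <stdint.h>
#include <time.h>

/* RFC 6528-style ISN time component: advance by one unit every 32 ns.
 * Convert the whole timespec to nanoseconds before the right shift,
 * rather than shifting tv_nsec alone, and spell out 1000000000 as an
 * integer: the constant 1E9 would silently force a round trip through
 * floating point. */
static uint32_t isn_offset(const struct timespec *now)
{
	uint64_t ns = (uint64_t)now->tv_sec * 1000000000ULL + now->tv_nsec;

	return (uint32_t)(ns >> 5);
}
```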
This bit in the TCP specific epoll reference indicates whether the
connection is IPv6 or IPv4. However the sites which refer to it are
already calling accept() which (optionally) returns an address for the
remote end of the connection. We can use the sa_family field in that
address to determine the connection type independent of the epoll
reference.
This does have a cost: for the spliced case, it means we now need to get
that address from accept() which introduces an extra copy_to_user().
However, in future we want to allow handling IPv4 connections through IPv6
sockets, which means we won't be able to determine the IP version at the
time we create the listening socket and epoll reference. So, at some point
we'll have to pay this cost anyway.
Signed-off-by: David Gibson
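Deriving the IP version from the address accept() fills in, rather than from the epoll reference, can be sketched like this (generic sockets code with an invented helper name, not the actual passt handler):

```c
#include <netinet/in.h>
#include <sys/socket.h>

/* Accept a connection and learn the IP version from the peer address
 * instead of encoding it in the listening socket's epoll reference.
 * Returns the accepted socket, with *af set to AF_INET or AF_INET6. */
static int accept_and_get_family(int listen_fd, int *af)
{
	struct sockaddr_storage sa;
	socklen_t sl = sizeof(sa);
	int s = accept(listen_fd, (struct sockaddr *)&sa, &sl);

	if (s >= 0)
		*af = sa.ss_family;
	return s;
}
```

With dual stack listeners this is the only option anyway: one AF_INET6 socket can yield both families, so the version simply cannot be fixed at listen time.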
passt usually doesn't NAT, but it does do so for the remapping of the
gateway address to refer to the host. Currently we perform this NAT with
slightly different rules on both IPv4 addresses and IPv6 addresses, but not
on IPv4-mapped IPv6 addresses. This means we won't correctly handle the
case of an IPv4 connection over an IPv6 socket, which is possible on Linux
(and probably other platforms).
Refactor tcp_conn_from_sock() to perform the NAT after converting either
address family into an inany_addr, so IPv4 and IPv4-mapped addresses
have the same representation.
With two new helpers this lets us remove the IPv4 and IPv6 specific paths
from tcp_conn_from_sock().
Signed-off-by: David Gibson
For non-spliced connections we now treat IPv4-mapped IPv6 addresses the
same as the corresponding IPv4 addresses. However currently we won't
splice a connection from ::ffff:127.0.0.1 the way we would one from
127.0.0.1. Correct this so that we can splice connections from IPv4
localhost that have been received on an IPv6 dual stack socket.
Signed-off-by: David Gibson
Previous cleanups mean that tcp_sock_init4() and tcp_sock_init6() are
almost identical, and the remaining differences can be easily
parameterized. Combine both into a single tcp_sock_init_af() function.
Signed-off-by: David Gibson
Currently, when instructed to open an IPv6 socket, sock_l4() explicitly
sets the IPV6_V6ONLY socket option so that the socket will only respond to
IPv6 connections. Linux (and probably other platforms) allow "dual stack"
sockets: IPv6 sockets which can also accept IPv4 connections.
Extend sock_l4() to be able to make such sockets, by passing AF_UNSPEC as
the address family and no bind address (binding to a specific address would
defeat the purpose). We add a Makefile define 'DUAL_STACK_SOCKETS' to
indicate availability of this feature on the target platform.
Signed-off-by: David Gibson
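The core of the change amounts to: create an AF_INET6 socket, clear IPV6_V6ONLY, and bind the unspecified address, so one socket accepts both families. A generic Linux sketch of that sequence (not sock_l4() itself):

```c
#include <netinet/in.h>
#include <sys/socket.h>
#include <unistd.h>

/* Open a dual stack TCP listener: a single IPv6 socket bound to ::,
 * with IPV6_V6ONLY cleared so it also accepts IPv4 connections (whose
 * peers then appear as IPv4-mapped IPv6 addresses). */
static int listen_dual_stack(in_port_t port)
{
	struct sockaddr_in6 sa = {
		.sin6_family = AF_INET6,
		.sin6_addr = IN6ADDR_ANY_INIT,
		.sin6_port = htons(port),
	};
	int no = 0;
	int s = socket(AF_INET6, SOCK_STREAM, 0);

	if (s < 0)
		return -1;

	/* The kernel default depends on net.ipv6.bindv6only, so clear
	 * the option explicitly rather than relying on it */
	if (setsockopt(s, IPPROTO_IPV6, IPV6_V6ONLY, &no, sizeof(no)) ||
	    bind(s, (struct sockaddr *)&sa, sizeof(sa)) ||
	    listen(s, SOMAXCONN)) {
		close(s);
		return -1;
	}
	return s;
}
```

Binding a specific address would defeat the purpose, as the commit message notes: only the unspecified address can receive traffic for both families.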
According to its doc comments, sock_l4() returns -1 on error. It does,
except in one case where it returns -EIO. Fix this inconsistency to match
the docs and always return -1.
Signed-off-by: David Gibson
Platforms like Linux allow IPv6 sockets to listen for IPv4 connections as
well as native IPv6 connections. By doing this we halve the number of
listening sockets we need for TCP (assuming passt/pasta is listening on the
same ports for IPv4 and IPv6). When forwarding many ports (e.g. -t all)
this can significantly reduce the amount of kernel memory that passt
consumes.
When forwarding all TCP and UDP ports for both IPv4 and IPv6 (-t all
-u all), this reduces kernel memory usage from ~677MiB to ~487MiB
(kernel version 6.0.8 on Fedora 37, x86_64).
Signed-off-by: David Gibson
On Thu, 17 Nov 2022 16:58:36 +1100
David Gibson
When forwarding many ports, passt can consume a lot of kernel memory because of the many listening sockets it opens. There are not a lot of ways we can reduce that, but here's one.
Currently we create separate listening sockets for each port for both IPv4 and IPv6. However in Linux (and probably other platforms), it's possible to listen for both IPv4 and IPv6 connections on an IPv6 socket. This series uses such dual stack sockets to halve the number of listening sockets needed for TCP. When forwarding all TCP and UDP ports, this reduces the kernel memory used from around 677 MiB to around 487 MiB (kernel 6.0.8 on an x86_64 Fedora 37 machine).
Applied, thanks! -- Stefano