On Thu, 17 Nov 2022 16:58:44 +1100
David Gibson wrote:
Currently, the tables for spliced and non-spliced connections are entirely
separate, with different types in different arrays. We want to unify them.
As a first step, create a union type which can represent either a spliced
or non-spliced connection. For them to be distinguishable, the individual
types need to have a common header added, with a bit indicating which type
this structure is.
This comes at the cost of increasing the size of tcp_tap_conn to over one
(64 byte) cacheline. This isn't ideal, but it makes things simpler for now
and we'll re-optimize this later.
Signed-off-by: David Gibson
---
tcp.c | 4 ++++
tcp_conn.h | 30 ++++++++++++++++++++++++++++++
tcp_splice.c | 2 ++
3 files changed, 36 insertions(+)
diff --git a/tcp.c b/tcp.c
index 189041c..05eed85 100644
--- a/tcp.c
+++ b/tcp.c
@@ -288,6 +288,7 @@
#include
#include
#include
+#include
#include /* For struct tcp_info */
@@ -601,6 +602,7 @@ static inline struct tcp_tap_conn *conn_at_idx(int index)
{
if ((index < 0) || (index >= TCP_MAX_CONNS))
return NULL;
+ assert(!(CONN(index)->c.spliced));
return CONN(index);
}
@@ -2096,6 +2098,7 @@ static void tcp_conn_from_tap(struct ctx *c, int af, const void *addr,
}
conn = CONN(c->tcp.conn_count++);
+ conn->c.spliced = false;
conn->sock = s;
conn->timer = -1;
conn_event(c, conn, TAP_SYN_RCVD);
@@ -2764,6 +2767,7 @@ static void tcp_conn_from_sock(struct ctx *c, union epoll_ref ref,
return;
conn = CONN(c->tcp.conn_count++);
+ conn->c.spliced = false;
conn->sock = s;
conn->timer = -1;
conn->ws_to_tap = conn->ws_from_tap = 0;
diff --git a/tcp_conn.h b/tcp_conn.h
index db4c2d9..39d104a 100644
--- a/tcp_conn.h
+++ b/tcp_conn.h
@@ -11,8 +11,19 @@
#define TCP_HASH_BUCKET_BITS (TCP_CONN_INDEX_BITS + 1)
+/**
+ * struct tcp_conn_common - Common fields for spliced and non-spliced
+ * @spliced: Is this a spliced connection?
+ */
+struct tcp_conn_common {
+ bool spliced :1;
+};
+
+extern const char *tcp_common_flag_str[];
+
/**
* struct tcp_tap_conn - Descriptor for a TCP connection (not spliced)
+ * @c: Fields common with tcp_splice_conn
* @next_index: Connection index of next item in hash chain, -1 for none
* @tap_mss: MSS advertised by tap/guest, rounded to 2 ^ TCP_MSS_BITS
* @sock: Socket descriptor number
@@ -40,6 +51,9 @@
* @seq_init_from_tap: Initial sequence number from tap
*/
struct tcp_tap_conn {
+ /* Must be first element to match tcp_splice_conn */
+ struct tcp_conn_common c;
+
int next_index :TCP_CONN_INDEX_BITS + 2;
#define TCP_RETRANS_BITS 3
@@ -122,6 +136,7 @@ struct tcp_tap_conn {
/**
* struct tcp_splice_conn - Descriptor for a spliced TCP connection
+ * @c: Fields common with tcp_tap_conn
* @a: File descriptor number of socket for accepted connection
* @pipe_a_b: Pipe ends for splice() from @a to @b
* @b: File descriptor number of peer connected socket
@@ -134,6 +149,9 @@ struct tcp_tap_conn {
* @b_written: Bytes written to @b (not fully written from one @a read)
*/
struct tcp_splice_conn {
+ /* Must be first element to match tcp_tap_conn */
+ struct tcp_conn_common c;
+
int a;
int pipe_a_b[2];
int b;
@@ -165,4 +183,16 @@ struct tcp_splice_conn {
uint32_t b_written;
};
+/**
+ * union tcp_conn - Descriptor for a TCP connection (spliced or non-spliced)
+ * @c: Fields common between all variants
+ * @tap: Fields specific to non-spliced connections
+ * @splice: Fields specific to spliced connections
+*/
+union tcp_conn {
+ struct tcp_conn_common c;
+ struct tcp_tap_conn tap;
+ struct tcp_splice_conn splice;
+};
Sorry, I could have noticed earlier: I understand that this is needed
to end up, at the end of the series, with a 64-byte tcp_conn, but it
doesn't really look like the most natural way of doing things. I would
have expected something like:
/**
 * struct tcp_conn - Descriptor for a TCP connection (spliced or non-spliced)
 * @c:		Fields common between all variants
 * @u.tap:	Fields specific to non-spliced connections
 * @u.splice:	Fields specific to spliced connections
 */
struct tcp_conn {
struct tcp_conn_common c;
union {
struct tcp_tap_conn tap;
struct tcp_splice_conn splice;
} u;
};
but sure, if we do this, then we have 3 bytes of padding between 'c' and
'u', and struct tcp_conn becomes 68 bytes long.
It also confuses Coverity Scan, because in tcp_table_compact() we have:
memset(hole, 0, sizeof(*hole));
and while the prototype is:
void tcp_table_compact(struct ctx *c, union tcp_conn *hole)
it sees that we're passing, from tcp_splice_destroy(), something
smaller than that (48 bytes), but we're zeroing the whole thing.
Of course, it's not a real issue, that space is reserved for a
connection slot anyway, but given there are no other issues reported,
I'd try to keep Coverity happy if possible.
First try, failed: check hole->c.spliced and, if set, zero only
sizeof(struct tcp_splice_conn) bytes. Coverity still reports the
warning, which looks like a false positive.
Another try, which should probably work (not verified yet: I just hit
the daily build submission quota, grr): explicitly pass the union
tcp_conn containing our struct tcp_splice_conn. This patch does it:
---
diff --git a/tcp.c b/tcp.c
index 8874789..d635a8e 100644
--- a/tcp.c
+++ b/tcp.c
@@ -591,7 +591,7 @@ static size_t tcp6_l2_flags_buf_bytes;
union tcp_conn tc[TCP_MAX_CONNS];
#define CONN(index) (&tc[(index)].tap)
-#define CONN_IDX(conn) ((union tcp_conn *)(conn) - tc)
+#define CONN_IDX(conn) (TCP_TAP_TO_COMMON(conn) - tc)
/** conn_at_idx() - Find a connection by index, if present
* @index: Index of connection to lookup
@@ -1385,7 +1385,7 @@ static void tcp_conn_destroy(struct ctx *c, struct tcp_tap_conn *conn)
close(conn->timer);
tcp_hash_remove(c, conn);
- tcp_table_compact(c, (union tcp_conn *)conn);
+ tcp_table_compact(c, TCP_TAP_TO_COMMON(conn));
}
static void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn);
diff --git a/tcp_conn.h b/tcp_conn.h
index 4a8be29..fa407ad 100644
--- a/tcp_conn.h
+++ b/tcp_conn.h
@@ -176,6 +176,12 @@ union tcp_conn {
struct tcp_splice_conn splice;
};
+#define TCP_TAP_TO_COMMON(x) \
+ ((union tcp_conn *)((char *)(x) - offsetof(union tcp_conn, tap)))
+
+#define TCP_SPLICE_TO_COMMON(x) \
+ ((union tcp_conn *)((char *)(x) - offsetof(union tcp_conn, splice)))
+
/* TCP connections */
extern union tcp_conn tc[];
diff --git a/tcp_splice.c b/tcp_splice.c
index e2f0ce1..7d3f17e 100644
--- a/tcp_splice.c
+++ b/tcp_splice.c
@@ -37,6 +37,7 @@
#include
#include
#include
+#include
#include
#include
#include
@@ -74,7 +75,7 @@ static int splice_pipe_pool [TCP_SPLICE_PIPE_POOL_SIZE][2][2];
#define CONN_V4(x) (!CONN_V6(x))
#define CONN_HAS(conn, set) ((conn->events & (set)) == (set))
#define CONN(index) (&tc[(index)].splice)
-#define CONN_IDX(conn) ((union tcp_conn *)(conn) - tc)
+#define CONN_IDX(conn) (TCP_SPLICE_TO_COMMON(conn) - tc)
/* Display strings for connection events */
static const char *tcp_splice_event_str[] __attribute((__unused__)) = {
@@ -283,7 +284,7 @@ void tcp_splice_destroy(struct ctx *c, struct tcp_splice_conn *conn)
debug("TCP (spliced): index %li, CLOSED", CONN_IDX(conn));
c->tcp.splice_conn_count--;
- tcp_table_compact(c, (union tcp_conn *)conn);
+ tcp_table_compact(c, TCP_SPLICE_TO_COMMON(conn));
}
/**
---
I can add it on top if you agree, assuming it works.
I also tried to actually turn tcp_conn into a struct. It takes 68 bytes,
so I'm not pursuing this approach, but I'm including the diff just in
case you have some quick idea to fix it up:
---
diff --git a/tcp.c b/tcp.c
index 8874789..6ee5675 100644
--- a/tcp.c
+++ b/tcp.c
@@ -588,10 +588,10 @@ static unsigned int tcp6_l2_flags_buf_used;
static size_t tcp6_l2_flags_buf_bytes;
/* TCP connections */
-union tcp_conn tc[TCP_MAX_CONNS];
+struct tcp_conn tc[TCP_MAX_CONNS];
-#define CONN(index) (&tc[(index)].tap)
-#define CONN_IDX(conn) ((union tcp_conn *)(conn) - tc)
+#define CONN(index) (&tc[(index)].u.tap)
+#define CONN_IDX(conn) (TO_TCP_CONN(conn) - tc)
/** conn_at_idx() - Find a connection by index, if present
* @index: Index of connection to lookup
@@ -602,7 +602,7 @@ static inline struct tcp_tap_conn *conn_at_idx(int index)
{
if ((index < 0) || (index >= TCP_MAX_CONNS))
return NULL;
- assert(!(CONN(index)->c.spliced));
+ assert(!TO_TCP_CONN(CONN(index))->c.spliced);
return CONN(index);
}
@@ -660,13 +660,13 @@ static void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn,
*/
static int tcp_epoll_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
{
- int m = conn->c.in_epoll ? EPOLL_CTL_MOD : EPOLL_CTL_ADD;
+ int m = TO_TCP_CONN(conn)->c.in_epoll ? EPOLL_CTL_MOD : EPOLL_CTL_ADD;
union epoll_ref ref = { .r.proto = IPPROTO_TCP, .r.s = conn->sock,
.r.p.tcp.tcp.index = CONN_IDX(conn) };
struct epoll_event ev = { .data.u64 = ref.u64 };
if (conn->events == CLOSED) {
- if (conn->c.in_epoll)
+ if (TO_TCP_CONN(conn)->c.in_epoll)
epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->sock, &ev);
if (conn->timer != -1)
epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->timer, &ev);
@@ -678,7 +678,7 @@ static int tcp_epoll_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
if (epoll_ctl(c->epollfd, m, conn->sock, &ev))
return -errno;
- conn->c.in_epoll = true;
+ TO_TCP_CONN(conn)->c.in_epoll = true;
if (conn->timer != -1) {
union epoll_ref ref_t = { .r.proto = IPPROTO_TCP,
@@ -1347,9 +1347,9 @@ static struct tcp_tap_conn *tcp_hash_lookup(const struct ctx *c,
* @c: Execution context
* @hole: Pointer to recently closed connection
*/
-void tcp_table_compact(struct ctx *c, union tcp_conn *hole)
+void tcp_table_compact(struct ctx *c, struct tcp_conn *hole)
{
- union tcp_conn *from;
+ struct tcp_conn *from;
if (CONN_IDX(hole) == --c->tcp.conn_count) {
debug("TCP: table compaction: maximum index was %li (%p)",
@@ -1361,14 +1361,15 @@ void tcp_table_compact(struct ctx *c, union tcp_conn *hole)
from = tc + c->tcp.conn_count;
memcpy(hole, from, sizeof(*hole));
- if (from->c.spliced)
- tcp_splice_conn_update(c, &hole->splice);
+ if (TO_TCP_CONN(from)->c.spliced)
+ tcp_splice_conn_update(c, &hole->u.splice);
else
- tcp_tap_conn_update(c, &from->tap, &hole->tap);
+ tcp_tap_conn_update(c, &from->u.tap, &hole->u.tap);
debug("TCP: table compaction (spliced=%d): old index %li, new index %li, "
"from: %p, to: %p",
- from->c.spliced, CONN_IDX(from), CONN_IDX(hole), from, hole);
+ TO_TCP_CONN(from)->c.spliced, CONN_IDX(from), CONN_IDX(hole),
+ from, hole);
memset(from, 0, sizeof(*from));
}
@@ -1385,7 +1386,7 @@ static void tcp_conn_destroy(struct ctx *c, struct tcp_tap_conn *conn)
close(conn->timer);
tcp_hash_remove(c, conn);
- tcp_table_compact(c, (union tcp_conn *)conn);
+ tcp_table_compact(c, TO_TCP_CONN(conn));
}
static void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn);
@@ -1523,7 +1524,7 @@ void tcp_defer_handler(struct ctx *c)
{
int max_conns = c->tcp.conn_count / 100 * TCP_CONN_PRESSURE;
int max_files = c->nofile / 100 * TCP_FILE_PRESSURE;
- union tcp_conn *conn;
+ struct tcp_conn *conn;
tcp_l2_flags_buf_flush(c);
tcp_l2_data_buf_flush(c);
@@ -1533,12 +1534,12 @@ void tcp_defer_handler(struct ctx *c)
return;
for (conn = tc + c->tcp.conn_count - 1; conn >= tc; conn--) {
- if (conn->c.spliced) {
- if (conn->splice.flags & CLOSING)
- tcp_splice_destroy(c, &conn->splice);
+ if (TO_TCP_CONN(conn)->c.spliced) {
+ if (conn->u.splice.flags & CLOSING)
+ tcp_splice_destroy(c, &conn->u.splice);
} else {
- if (conn->tap.events == CLOSED)
- tcp_conn_destroy(c, &conn->tap);
+ if (conn->u.tap.events == CLOSED)
+ tcp_conn_destroy(c, &conn->u.tap);
}
}
@@ -2086,7 +2087,7 @@ static void tcp_conn_from_tap(struct ctx *c, int af, const void *addr,
}
conn = CONN(c->tcp.conn_count++);
- conn->c.spliced = false;
+ TO_TCP_CONN(conn)->c.spliced = false;
conn->sock = s;
conn->timer = -1;
conn_event(c, conn, TAP_SYN_RCVD);
@@ -2770,7 +2771,7 @@ static void tcp_tap_conn_from_sock(struct ctx *c, union epoll_ref ref,
struct sockaddr *sa,
const struct timespec *now)
{
- conn->c.spliced = false;
+ TO_TCP_CONN(conn)->c.spliced = false;
conn->sock = s;
conn->timer = -1;
conn->ws_to_tap = conn->ws_from_tap = 0;
@@ -2804,7 +2805,7 @@ static void tcp_conn_from_sock(struct ctx *c, union epoll_ref ref,
const struct timespec *now)
{
struct sockaddr_storage sa;
- union tcp_conn *conn;
+ struct tcp_conn *conn;
socklen_t sl;
int s;
@@ -2826,11 +2827,11 @@ static void tcp_conn_from_sock(struct ctx *c, union epoll_ref ref,
conn = tc + c->tcp.conn_count++;
if (c->mode == MODE_PASTA &&
- tcp_splice_conn_from_sock(c, ref, &conn->splice,
+ tcp_splice_conn_from_sock(c, ref, &conn->u.splice,
s, (struct sockaddr *)&sa))
return;
- tcp_tap_conn_from_sock(c, ref, &conn->tap, s,
+ tcp_tap_conn_from_sock(c, ref, &conn->u.tap, s,
(struct sockaddr *)&sa, now);
}
@@ -2961,7 +2962,7 @@ static void tcp_tap_sock_handler(struct ctx *c, struct tcp_tap_conn *conn,
void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events,
const struct timespec *now)
{
- union tcp_conn *conn;
+ struct tcp_conn *conn;
if (ref.r.p.tcp.tcp.timer) {
tcp_timer_handler(c, ref);
@@ -2975,10 +2976,10 @@ void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events,
conn = tc + ref.r.p.tcp.tcp.index;
- if (conn->c.spliced)
- tcp_splice_sock_handler(c, &conn->splice, ref.r.s, events);
+ if (TO_TCP_CONN(conn)->c.spliced)
+ tcp_splice_sock_handler(c, &conn->u.splice, ref.r.s, events);
else
- tcp_tap_sock_handler(c, &conn->tap, events);
+ tcp_tap_sock_handler(c, &conn->u.tap, events);
}
/**
@@ -3370,7 +3371,7 @@ static int tcp_port_rebind(void *arg)
void tcp_timer(struct ctx *c, const struct timespec *ts)
{
struct tcp_sock_refill_arg refill_arg = { c, 0 };
- union tcp_conn *conn;
+ struct tcp_conn *conn;
(void)ts;
@@ -3394,11 +3395,11 @@ void tcp_timer(struct ctx *c, const struct timespec *ts)
}
for (conn = tc + c->tcp.conn_count - 1; conn >= tc; conn--) {
- if (conn->c.spliced) {
- tcp_splice_timer(c, &conn->splice);
+ if (TO_TCP_CONN(conn)->c.spliced) {
+ tcp_splice_timer(c, &conn->u.splice);
} else {
- if (conn->tap.events == CLOSED)
- tcp_conn_destroy(c, &conn->tap);
+ if (conn->u.tap.events == CLOSED)
+ tcp_conn_destroy(c, &conn->u.tap);
}
}
diff --git a/tcp_conn.h b/tcp_conn.h
index 4a8be29..3df7905 100644
--- a/tcp_conn.h
+++ b/tcp_conn.h
@@ -23,7 +23,6 @@ extern const char *tcp_common_flag_str[];
/**
* struct tcp_tap_conn - Descriptor for a TCP connection (not spliced)
- * @c: Fields common with tcp_splice_conn
* @next_index: Connection index of next item in hash chain, -1 for none
* @tap_mss: MSS advertised by tap/guest, rounded to 2 ^ TCP_MSS_BITS
* @sock: Socket descriptor number
@@ -47,9 +46,6 @@ extern const char *tcp_common_flag_str[];
* @seq_init_from_tap: Initial sequence number from tap
*/
struct tcp_tap_conn {
- /* Must be first element to match tcp_splice_conn */
- struct tcp_conn_common c;
-
int next_index :TCP_CONN_INDEX_BITS + 2;
#define TCP_RETRANS_BITS 3
@@ -118,7 +114,6 @@ struct tcp_tap_conn {
/**
* struct tcp_splice_conn - Descriptor for a spliced TCP connection
- * @c: Fields common with tcp_tap_conn
* @a: File descriptor number of socket for accepted connection
* @pipe_a_b: Pipe ends for splice() from @a to @b
* @b: File descriptor number of peer connected socket
@@ -131,9 +126,6 @@ struct tcp_tap_conn {
* @b_written: Bytes written to @b (not fully written from one @a read)
*/
struct tcp_splice_conn {
- /* Must be first element to match tcp_tap_conn */
- struct tcp_conn_common c;
-
int a;
int pipe_a_b[2];
int b;
@@ -165,22 +157,27 @@ struct tcp_splice_conn {
};
/**
- * union tcp_conn - Descriptor for a TCP connection (spliced or non-spliced)
+ * struct tcp_conn - Descriptor for a TCP connection (spliced or non-spliced)
* @c: Fields common between all variants
- * @tap: Fields specific to non-spliced connections
- * @splice: Fields specific to spliced connections
+ * @u.tap: Fields specific to non-spliced connections
+ * @u.splice: Fields specific to spliced connections
*/
-union tcp_conn {
+struct tcp_conn {
struct tcp_conn_common c;
- struct tcp_tap_conn tap;
- struct tcp_splice_conn splice;
+ union {
+ struct tcp_tap_conn tap;
+ struct tcp_splice_conn splice;
+ } u;
};
+#define TO_TCP_CONN(x) \
+ ((struct tcp_conn *)((char *)(x) - offsetof(struct tcp_conn, u)))
+
/* TCP connections */
-extern union tcp_conn tc[];
+extern struct tcp_conn tc[];
void tcp_splice_conn_update(struct ctx *c, struct tcp_splice_conn *new);
-void tcp_table_compact(struct ctx *c, union tcp_conn *hole);
+void tcp_table_compact(struct ctx *c, struct tcp_conn *hole);
void tcp_splice_destroy(struct ctx *c, struct tcp_splice_conn *conn);
void tcp_splice_timer(struct ctx *c, struct tcp_splice_conn *conn);
void tcp_splice_pipe_refill(const struct ctx *c);
diff --git a/tcp_splice.c b/tcp_splice.c
index e2f0ce1..04fc513 100644
--- a/tcp_splice.c
+++ b/tcp_splice.c
@@ -37,6 +37,7 @@
#include
#include
#include
+#include
#include
#include
#include
@@ -73,8 +74,8 @@ static int splice_pipe_pool [TCP_SPLICE_PIPE_POOL_SIZE][2][2];
#define CONN_V6(x) (x->flags & SPLICE_V6)
#define CONN_V4(x) (!CONN_V6(x))
#define CONN_HAS(conn, set) ((conn->events & (set)) == (set))
-#define CONN(index) (&tc[(index)].splice)
-#define CONN_IDX(conn) ((union tcp_conn *)(conn) - tc)
+#define CONN(index) (&tc[(index)].u.splice)
+#define CONN_IDX(conn) (TO_TCP_CONN(conn) - tc)
/* Display strings for connection events */
static const char *tcp_splice_event_str[] __attribute((__unused__)) = {
@@ -165,7 +166,7 @@ static void conn_flag_do(const struct ctx *c, struct tcp_splice_conn *conn,
static int tcp_splice_epoll_ctl(const struct ctx *c,
struct tcp_splice_conn *conn)
{
- int m = conn->c.in_epoll ? EPOLL_CTL_MOD : EPOLL_CTL_ADD;
+ int m = TO_TCP_CONN(conn)->c.in_epoll ? EPOLL_CTL_MOD : EPOLL_CTL_ADD;
union epoll_ref ref_a = { .r.proto = IPPROTO_TCP, .r.s = conn->a,
.r.p.tcp.tcp.index = CONN_IDX(conn) };
union epoll_ref ref_b = { .r.proto = IPPROTO_TCP, .r.s = conn->b,
@@ -185,7 +186,7 @@ static int tcp_splice_epoll_ctl(const struct ctx *c,
epoll_ctl(c->epollfd, m, conn->b, &ev_b))
goto delete;
- conn->c.in_epoll = true;
+ TO_TCP_CONN(conn)->c.in_epoll = true;
return 0;
@@ -283,7 +284,7 @@ void tcp_splice_destroy(struct ctx *c, struct tcp_splice_conn *conn)
debug("TCP (spliced): index %li, CLOSED", CONN_IDX(conn));
c->tcp.splice_conn_count--;
- tcp_table_compact(c, (union tcp_conn *)conn);
+ tcp_table_compact(c, TO_TCP_CONN(conn));
}
/**
@@ -535,7 +536,7 @@ bool tcp_splice_conn_from_sock(struct ctx *c, union epoll_ref ref,
if (setsockopt(s, SOL_TCP, TCP_QUICKACK, &((int){ 1 }), sizeof(int)))
trace("TCP (spliced): failed to set TCP_QUICKACK on %i", s);
- conn->c.spliced = true;
+ TO_TCP_CONN(conn)->c.spliced = true;
c->tcp.splice_conn_count++;
conn->a = s;
---
I'm fine with this series in any case. If you don't have other ideas,
I would just try to get rid of that warning (Out-of-bounds access,
CWE-119) with the first diff here, or something similar.
--
Stefano