On 17/10/2025 19:43, Stefano Brivio wrote:
Nits only:
On Fri, 17 Oct 2025 12:31:26 +0200 Laurent Vivier
wrote: The in_epoll boolean flag in tcp_tap_conn and tcp_splice_conn only tracked whether a connection was registered with epoll, not which epoll instance. This limited flexibility for future multi-epoll support.
Replace the boolean with a threadnb field in flow_common that identifies which thread (and thus which epoll instance) the flow is registered with. Use FLOW_THREADNB_INVALID to indicate when a flow is not registered with any epoll instance. A threadnb_to_epollfd[] mapping table translates thread numbers to their corresponding epoll file descriptors.
It wasn't really clear to me until I actually started digging into this change what NB meant there. For me it's "notifier block" before "number", but that didn't make sense either.
Some ideas (even though maybe as David suggested this name shouldn't have "thread" in it at all):
- f->thread / FLOW_THREAD_INVALID
- f->thread_id / FLOW_THREAD_ID_INVALID
...or f->epoll_id, reflecting David's observation?
epoll_id seems reasonable.
Add helper functions: - flow_in_epoll() to check if a flow is registered with epoll - flow_epollfd() to retrieve the epoll fd for a flow's thread - flow_thread_register() to register an epoll fd with a thread - flow_thread_set() to set the thread number of a flow
This change also simplifies tcp_timer_ctl() and conn_flag_do() by removing the need to pass the context 'c', since the epoll fd is now directly accessible from the flow structure via flow_epollfd().
Signed-off-by: Laurent Vivier
--- flow.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++- flow.h | 12 ++++++++++++ passt.c | 1 + tcp.c | 39 ++++++++++++++++++++------------------ tcp_conn.h | 8 +------- tcp_splice.c | 24 ++++++++++++------------ 6 files changed, 99 insertions(+), 38 deletions(-) diff --git a/flow.c b/flow.c index b14e9d8b63ff..d56bae776239 100644 --- a/flow.c +++ b/flow.c @@ -116,6 +116,7 @@ static_assert(ARRAY_SIZE(flow_proto) == FLOW_NUM_TYPES, unsigned flow_first_free; union flow flowtab[FLOW_MAX]; static const union flow *flow_new_entry; /* = NULL */
It would be nice to have an idea of how this table is organised without looking at how it's used, say:
/* Table of epoll file descriptors, indexed by thread number */
...or indexed by "epoll identifiers", perhaps, if we want to drop references to threads here.
I will drop reference to threads.
+static int threadnb_to_epollfd[FLOW_THREADNB_SIZE];
/* Hash table to index it */ #define FLOW_HASH_LOAD 70 /* % */ @@ -347,6 +348,55 @@ static void flow_set_state(struct flow_common *f, enum flow_state state) flow_log_details_(f, LOG_DEBUG, MAX(state, oldstate)); }
+/** + * flow_in_epoll() - Check if flow is registered with an epoll instance + * @f: Flow to check + * + * Return: true if flow is registered with epoll, false otherwise + */ +bool flow_in_epoll(const struct flow_common *f) +{ + return f->threadnb != FLOW_THREADNB_INVALID; +} + +/** + * flow_epollfd() - Get the epoll file descriptor for a flow + * @f: Flow to query + * + * Return: epoll file descriptor associated with the flow's thread + */ +int flow_epollfd(const struct flow_common *f) +{ + ASSERT(f->threadnb < FLOW_THREADNB_MAX); + + return threadnb_to_epollfd[f->threadnb]; +} + +/** + * flow_thread_set() - Associate a flow with a thread + * @f: Flow to update + * @threadnb: Thread number to associate with this flow + */ +void flow_thread_set(struct flow_common *f, int threadnb) +{ + ASSERT(threadnb < FLOW_THREADNB_MAX); + + f->threadnb = threadnb; +} + +/** + * flow_thread_register() - Initialize the threadnb -> epollfd mapping + * @threadnb: Thread number to associate to + * @epollfd: epoll file descriptor for the thread + */ +void flow_thread_register(int threadnb, int epollfd) +{ + ASSERT(threadnb < FLOW_THREADNB_MAX); + ASSERT(epollfd >= 0); + + threadnb_to_epollfd[threadnb] = epollfd; +} + /** * flow_initiate_() - Move flow to INI, setting pif[INISIDE] * @flow: Flow to change state @@ -548,6 +598,7 @@ union flow *flow_alloc(void)
flow_new_entry = flow; memset(flow, 0, sizeof(*flow)); + flow->f.threadnb = FLOW_THREADNB_INVALID; flow_set_state(&flow->f, FLOW_STATE_NEW);
return flow; @@ -827,7 +878,7 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now) case FLOW_TCP_SPLICE: closed = tcp_splice_flow_defer(&flow->tcp_splice); if (!closed && timer) - tcp_splice_timer(c, &flow->tcp_splice); + tcp_splice_timer(&flow->tcp_splice); break; case FLOW_PING4: case FLOW_PING6: diff --git a/flow.h b/flow.h index ef138b83add8..700d8b32c990 100644 --- a/flow.h +++ b/flow.h @@ -177,6 +177,8 @@ int flowside_connect(const struct ctx *c, int s, * @type: Type of packet flow * @pif[]: Interface for each side of the flow * @side[]: Information for each side of the flow + * @threadnb: Thread number flow is registered with + * (FLOW_THREADNB_INVALID if not)
You could phrase this more directly, say, "thread identifier, or FLOW_THREAD_INVALID", or "epollfd identifier, or EPOLLFD_ID_INVALID".
This is a structure describing flows, it's not surprising it's about the flow.
*/ struct flow_common { #ifdef __GNUC__ @@ -192,8 +194,14 @@ struct flow_common { #endif uint8_t pif[SIDES]; struct flowside side[SIDES]; +#define FLOW_THREADNB_BITS 8 + unsigned int threadnb:FLOW_THREADNB_BITS; };
+#define FLOW_THREADNB_SIZE (1 << FLOW_THREADNB_BITS) +#define FLOW_THREADNB_MAX (FLOW_THREADNB_SIZE - 1) +#define FLOW_THREADNB_INVALID FLOW_THREADNB_MAX + #define FLOW_INDEX_BITS 17 /* 128k - 1 */ #define FLOW_MAX MAX_FROM_BITS(FLOW_INDEX_BITS)
@@ -249,6 +257,10 @@ flow_sidx_t flow_lookup_sa(const struct ctx *c, uint8_t proto, uint8_t pif, union flow;
void flow_init(void); +bool flow_in_epoll(const struct flow_common *f); +int flow_epollfd(const struct flow_common *f); +void flow_thread_set(struct flow_common *f, int threadnb); +void flow_thread_register(int threadnb, int epollfd); void flow_defer_handler(const struct ctx *c, const struct timespec *now); int flow_migrate_source_early(struct ctx *c, const struct migrate_stage *stage, int fd); diff --git a/passt.c b/passt.c index af928111786b..37f2c897be84 100644 --- a/passt.c +++ b/passt.c @@ -285,6 +285,7 @@ int main(int argc, char **argv) c.epollfd = epoll_create1(EPOLL_CLOEXEC); if (c.epollfd == -1) die_perror("Failed to create epoll file descriptor"); + flow_thread_register(0, c.epollfd);
Temporary, I suppose. If not, maybe it deserves its own constant, such as EPOLLFD_ID_DEFAULT?
I agree
if (getrlimit(RLIMIT_NOFILE, &limit)) die_perror("Failed to get maximum value of open files limit"); diff --git a/tcp.c b/tcp.c index db9f17c0622f..8c49852b8454 100644 --- a/tcp.c +++ b/tcp.c @@ -504,25 +504,27 @@ static uint32_t tcp_conn_epoll_events(uint8_t events, uint8_t conn_flags) */ static int tcp_epoll_ctl(const struct ctx *c, struct tcp_tap_conn *conn) { - int m = conn->in_epoll ? EPOLL_CTL_MOD : EPOLL_CTL_ADD; + int m = flow_in_epoll(&conn->f) ? EPOLL_CTL_MOD : EPOLL_CTL_ADD; union epoll_ref ref = { .type = EPOLL_TYPE_TCP, .fd = conn->sock, .flowside = FLOW_SIDX(conn, !TAPSIDE(conn)), }; struct epoll_event ev = { .data.u64 = ref.u64 }; + int epollfd = flow_in_epoll(&conn->f) ? flow_epollfd(&conn->f) + : c->epollfd;
We usually align the second expression to the ternary operator, see also example in sock_l4_sa(), say:
int epollfd = flow_in_epoll(&conn->f) ? flow_epollfd(&conn->f) : c->epollfd;
I do like this usually. Missed this one when I change the name of the function.
if (conn->events == CLOSED) { - if (conn->in_epoll) - epoll_del(c->epollfd, conn->sock); + if (flow_in_epoll(&conn->f)) + epoll_del(epollfd, conn->sock); if (conn->timer != -1) - epoll_del(c->epollfd, conn->timer); + epoll_del(epollfd, conn->timer); return 0; }
ev.events = tcp_conn_epoll_events(conn->events, conn->flags);
- if (epoll_ctl(c->epollfd, m, conn->sock, &ev)) + if (epoll_ctl(epollfd, m, conn->sock, &ev)) return -errno;
- conn->in_epoll = true; + flow_thread_set(&conn->f, 0);
Not temporary I guess, maybe it's an EPOLLFD_ID_DEFAULT?
I agree
if (conn->timer != -1) { union epoll_ref ref_t = { .type = EPOLL_TYPE_TCP_TIMER, @@ -531,7 +533,8 @@ static int tcp_epoll_ctl(const struct ctx *c, struct tcp_tap_conn *conn) struct epoll_event ev_t = { .data.u64 = ref_t.u64, .events = EPOLLIN | EPOLLET };
- if (epoll_ctl(c->epollfd, EPOLL_CTL_MOD, conn->timer, &ev_t)) + if (epoll_ctl(flow_epollfd(&conn->f), EPOLL_CTL_MOD, + conn->timer, &ev_t)) return -errno; }
@@ -540,12 +543,11 @@ static int tcp_epoll_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
/** * tcp_timer_ctl() - Set timerfd based on flags/events, create timerfd if needed - * @c: Execution context * @conn: Connection pointer * * #syscalls timerfd_create timerfd_settime */ -static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn) +static void tcp_timer_ctl(struct tcp_tap_conn *conn) { struct itimerspec it = { { 0 }, { 0 } };
@@ -570,7 +572,8 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn) } conn->timer = fd;
- if (epoll_ctl(c->epollfd, EPOLL_CTL_ADD, conn->timer, &ev)) { + if (epoll_ctl(flow_epollfd(&conn->f), EPOLL_CTL_ADD,
I wouldn't find it outrageous if you assigned an epollfd local variable first so that this doesn't need two lines.
I will do...
+ conn->timer, &ev)) { flow_dbg_perror(conn, "failed to add timer"); close(conn->timer); conn->timer = -1; @@ -628,7 +631,7 @@ void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn, * flags and factor this into the logic below. */ if (flag == ACK_FROM_TAP_DUE) - tcp_timer_ctl(c, conn); + tcp_timer_ctl(conn);
return; } @@ -644,7 +647,7 @@ void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn, if (flag == ACK_FROM_TAP_DUE || flag == ACK_TO_TAP_DUE || (flag == ~ACK_FROM_TAP_DUE && (conn->flags & ACK_TO_TAP_DUE)) || (flag == ~ACK_TO_TAP_DUE && (conn->flags & ACK_FROM_TAP_DUE))) - tcp_timer_ctl(c, conn); + tcp_timer_ctl(conn); }
/** @@ -699,7 +702,7 @@ void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn, tcp_epoll_ctl(c, conn);
if (CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED)) - tcp_timer_ctl(c, conn); + tcp_timer_ctl(conn); }
/** @@ -1757,7 +1760,7 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn, seq, conn->seq_from_tap);
tcp_send_flag(c, conn, ACK); - tcp_timer_ctl(c, conn); + tcp_timer_ctl(conn);
if (p->count == 1) { tcp_tap_window_update(c, conn, @@ -2406,7 +2409,7 @@ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref)
if (conn->flags & ACK_TO_TAP_DUE) { tcp_send_flag(c, conn, ACK_IF_NEEDED); - tcp_timer_ctl(c, conn); + tcp_timer_ctl(conn); } else if (conn->flags & ACK_FROM_TAP_DUE) { if (!(conn->events & ESTABLISHED)) { flow_dbg(conn, "handshake timeout"); @@ -2428,7 +2431,7 @@ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref) return;
tcp_data_from_sock(c, conn); - tcp_timer_ctl(c, conn); + tcp_timer_ctl(conn); } } else { struct itimerspec new = { { 0 }, { ACT_TIMEOUT, 0 } }; @@ -3476,7 +3479,7 @@ int tcp_flow_migrate_source_ext(const struct ctx *c, if (c->migrate_no_linger) close(s); else - epoll_del(c->epollfd, s); + epoll_del(flow_epollfd(&conn->f), s);
/* Adjustments unrelated to FIN segments: sequence numbers we dumped are * based on the end of the queues. @@ -3625,7 +3628,7 @@ static int tcp_flow_repair_connect(const struct ctx *c, return rc; }
- conn->in_epoll = 0; + conn->f.threadnb = FLOW_THREADNB_INVALID; conn->timer = -1; conn->listening_sock = -1;
diff --git a/tcp_conn.h b/tcp_conn.h index 38b5c541f003..81333122d531 100644 --- a/tcp_conn.h +++ b/tcp_conn.h @@ -12,7 +12,6 @@ /** * struct tcp_tap_conn - Descriptor for a TCP connection (not spliced) * @f: Generic flow information - * @in_epoll: Is the connection in the epoll set? * @retrans: Number of retransmissions occurred due to ACK_TIMEOUT * @ws_from_tap: Window scaling factor advertised from tap/guest * @ws_to_tap: Window scaling factor advertised to tap/guest @@ -36,8 +35,6 @@ struct tcp_tap_conn { /* Must be first element */ struct flow_common f;
- bool in_epoll :1; - #define TCP_RETRANS_BITS 3 unsigned int retrans :TCP_RETRANS_BITS; #define TCP_MAX_RETRANS MAX_FROM_BITS(TCP_RETRANS_BITS) @@ -196,7 +193,6 @@ struct tcp_tap_transfer_ext { * @written: Bytes written (not fully written from one other side read) * @events: Events observed/actions performed on connection * @flags: Connection flags (attributes, not events) - * @in_epoll: Is the connection in the epoll set? */ struct tcp_splice_conn { /* Must be first element */ @@ -220,8 +216,6 @@ struct tcp_splice_conn { #define RCVLOWAT_SET(sidei_) ((sidei_) ? BIT(1) : BIT(0)) #define RCVLOWAT_ACT(sidei_) ((sidei_) ? BIT(3) : BIT(2)) #define CLOSING BIT(4) - - bool in_epoll :1; };
/* Socket pools */ @@ -245,7 +239,7 @@ int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd bool tcp_flow_is_established(const struct tcp_tap_conn *conn);
bool tcp_splice_flow_defer(struct tcp_splice_conn *conn); -void tcp_splice_timer(const struct ctx *c, struct tcp_splice_conn *conn); +void tcp_splice_timer(struct tcp_splice_conn *conn); int tcp_conn_pool_sock(int pool[]); int tcp_conn_sock(sa_family_t af); int tcp_sock_refill_pool(int pool[], sa_family_t af); diff --git a/tcp_splice.c b/tcp_splice.c index 6f21184bdc55..703bd7610890 100644 --- a/tcp_splice.c +++ b/tcp_splice.c @@ -149,7 +149,9 @@ static void tcp_splice_conn_epoll_events(uint16_t events, static int tcp_splice_epoll_ctl(const struct ctx *c, struct tcp_splice_conn *conn) { - int m = conn->in_epoll ? EPOLL_CTL_MOD : EPOLL_CTL_ADD; + int epollfd = flow_in_epoll(&conn->f) ? flow_epollfd(&conn->f) + : c->epollfd;
Same as above.
Given the new, more limited usage of c->epollfd, if it's not a temporary thing (I couldn't quite guess what your plan is), maybe worth updating its documentation in struct ctx?
In a later patch I remove epollfd from ctx and rely on the threadnb (0 for main) to get epollfd. Thansk, Laurent