In the Makefile we probe to create several defines based on the presence of particular fields in struct tcp_info. These defines are used for two purposes, neither of which they accomplish well: 1) Determining if the tcp_info fields are available at runtime. For this purpose the defines are Just Plain Wrong, since the runtime kernel may not be the same as the compile time kernel. We corrected this for tcp_snd_wnd, but not for tcpi_bytes_acked or tcpi_min_rtt 2) Allowing the source to compile against older kernel headers which don't have the fields in question. This works in theory, but it does mean we won't be able to use the fields, even if later run against a newer kernel. Furthermore, it's quite fragile: without much more thorough tests of builds in different environments that we're currently set up for, it's very easy to miss cases where we're accessing a field without protection from an #ifdef. For example we currently access tcpi_snd_wnd without #ifdefs in tcp_update_seqack_wnd(). Improve this with a different approach, borrowed from qemu (which has many instances of similar problems). Don't compile against linux/tcp.h, using netinet/tcp.h instead. Then for when we need an extension field, define a struct tcp_info_linux, copied from the kernel, with all the fields we're interested in. That may need updating from future kernel versions, but only when we want to use a new extension, so it shouldn't be frequent. This allows us to remove the HAS_SND_WND define entirely. We keep HAS_BYTES_ACKED and HAS_MIN_RTT now, since they're used for purpose (1), we'll fix that in a later patch. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- Makefile | 5 --- tcp.c | 30 ++++--------- tcp_info.h | 120 +++++++++++++++++++++++++++++++++++++++++++++++++ tcp_internal.h | 4 +- 4 files changed, 132 insertions(+), 27 deletions(-) create mode 100644 tcp_info.h diff --git a/Makefile b/Makefile index 74a9513..6faa501 100644 --- a/Makefile +++ b/Makefile @@ -67,11 +67,6 @@ PASST_HEADERS = arch.h arp.h checksum.h conf.h dhcp.h dhcpv6.h flow.h fwd.h \ udp.h udp_flow.h util.h HEADERS = $(PASST_HEADERS) seccomp.h -C := \#include <linux/tcp.h>\nstruct tcp_info x = { .tcpi_snd_wnd = 0 }; -ifeq ($(shell printf "$(C)" | $(CC) -S -xc - -o - >/dev/null 2>&1; echo $$?),0) - FLAGS += -DHAS_SND_WND -endif - C := \#include <linux/tcp.h>\nstruct tcp_info x = { .tcpi_bytes_acked = 0 }; ifeq ($(shell printf "$(C)" | $(CC) -S -xc - -o - >/dev/null 2>&1; echo $$?),0) FLAGS += -DHAS_BYTES_ACKED diff --git a/tcp.c b/tcp.c index 0d22e07..2a0b272 100644 --- a/tcp.c +++ b/tcp.c @@ -274,6 +274,7 @@ #include <net/if.h> #include <netinet/in.h> #include <netinet/ip.h> +#include <netinet/tcp.h> #include <stdint.h> #include <stdbool.h> #include <stddef.h> @@ -286,8 +287,6 @@ #include <time.h> #include <arpa/inet.h> -#include <linux/tcp.h> /* For struct tcp_info */ - #include "checksum.h" #include "util.h" #include "iov.h" @@ -303,6 +302,7 @@ #include "flow_table.h" #include "tcp_internal.h" +#include "tcp_info.h" #include "tcp_buf.h" /* MSS rounding: see SET_MSS() */ @@ -318,11 +318,6 @@ #define LOW_RTT_TABLE_SIZE 8 #define LOW_RTT_THRESHOLD 10 /* us */ -/* We need to include <linux/tcp.h> for tcpi_bytes_acked, instead of - * <netinet/tcp.h>, but that doesn't include a definition for SOL_TCP - */ -#define SOL_TCP IPPROTO_TCP - #define ACK_IF_NEEDED 0 /* See tcp_send_flag() */ #define CONN_IS_CLOSING(conn) \ @@ -365,14 +360,11 @@ char tcp_buf_discard [MAX_WINDOW]; /* Does the kernel support TCP_PEEK_OFF? */ bool peek_offset_cap; -#ifdef HAS_SND_WND + /* Does the kernel report sending window in TCP_INFO (kernel commit * 8f7baad7f035) */ bool snd_wnd_cap; -#else -#define snd_wnd_cap (false) -#endif /* sendmsg() to socket */ static struct iovec tcp_iov [UIO_MAXIOV]; @@ -678,7 +670,7 @@ static int tcp_rtt_dst_low(const struct tcp_tap_conn *conn) * @tinfo: Pointer to struct tcp_info for socket */ static void tcp_rtt_dst_check(const struct tcp_tap_conn *conn, - const struct tcp_info *tinfo) + const struct tcp_info_linux *tinfo) { #ifdef HAS_MIN_RTT const struct flowside *tapside = TAPFLOW(conn); @@ -1114,13 +1106,13 @@ size_t tcp_l2_buf_fill_headers(const struct tcp_tap_conn *conn, * Return: 1 if sequence or window were updated, 0 otherwise */ int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn, - bool force_seq, struct tcp_info *tinfo) + bool force_seq, struct tcp_info_linux *tinfo) { uint32_t prev_wnd_to_tap = conn->wnd_to_tap << conn->ws_to_tap; uint32_t prev_ack_to_tap = conn->seq_ack_to_tap; /* cppcheck-suppress [ctunullpointer, unmatchedSuppression] */ socklen_t sl = sizeof(*tinfo); - struct tcp_info tinfo_new; + struct tcp_info_linux tinfo_new; uint32_t new_wnd_to_tap = prev_wnd_to_tap; int s = conn->sock; @@ -1235,7 +1227,7 @@ int tcp_prepare_flags(const struct ctx *c, struct tcp_tap_conn *conn, int flags, struct tcphdr *th, struct tcp_syn_opts *opts, size_t *optlen) { - struct tcp_info tinfo = { 0 }; + struct tcp_info_linux tinfo = { 0 }; socklen_t sl = sizeof(tinfo); int s = conn->sock; @@ -2578,7 +2570,6 @@ static bool tcp_probe_peek_offset_cap(sa_family_t af) return ret; } -#ifdef HAS_SND_WND /** * tcp_probe_snd_wnd_cap() - Check if TCP_INFO reports tcpi_snd_wnd * @@ -2586,7 +2577,7 @@ static bool tcp_probe_peek_offset_cap(sa_family_t af) */ static bool tcp_probe_snd_wnd_cap(void) { - struct tcp_info tinfo; + struct tcp_info_linux tinfo; socklen_t sl = sizeof(tinfo); int s; @@ -2604,13 +2595,12 @@ static bool tcp_probe_snd_wnd_cap(void) close(s); - if (sl < (offsetof(struct tcp_info, tcpi_snd_wnd) + + if (sl < (offsetof(struct tcp_info_linux, tcpi_snd_wnd) + sizeof(tinfo.tcpi_snd_wnd))) return false; return true; } -#endif /* HAS_SND_WND */ /** * tcp_init() - Get initial sequence, hash secret, initialise per-socket data @@ -2645,9 +2635,7 @@ int tcp_init(struct ctx *c) (!c->ifi6 || tcp_probe_peek_offset_cap(AF_INET6)); debug("SO_PEEK_OFF%ssupported", peek_offset_cap ? " " : " not "); -#ifdef HAS_SND_WND snd_wnd_cap = tcp_probe_snd_wnd_cap(); -#endif debug("TCP_INFO tcpi_snd_wnd field%ssupported", snd_wnd_cap ? " " : " not "); diff --git a/tcp_info.h b/tcp_info.h new file mode 100644 index 0000000..59a1497 --- /dev/null +++ b/tcp_info.h @@ -0,0 +1,120 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later + * Copyright Red Hat + * + * Largely derived from include/linux/tcp.h in the Linux kernel + */ + +#ifndef TCP_INFO_H +#define TCP_INFO_H + +/* struct tcp_info_linux - Information from Linux TCP_INFO getsockopt() + * + * Some fields returned by TCP_INFO have been there for ages and are shared with + * BSD. struct tcp_info from netinet/tcp.h has only those fields. There are + * also a many Linux specific extensions to the structure, which are only found + * in the linux/tcp.h version of struct tcp_info. + * + * We want to use some of those extension fields, when available. We can test + * for availability in the runtime kernel using the length returned from + * getsockopt(). However, we won't necessarily be compiled against the same + * kernel headers as we'll run with, so compiling directly against linux/tcp.h + * means wrapping every field access in an #ifdef whose #else does the same + * thing as when the field is missing at runtime. This rapidly gets messy. + * + * Instead we here define struct tcp_info_linux which includes all the Linux + * extensions that we want to use. This taken from v6.11 of the kernel. + */ +struct tcp_info_linux { + uint8_t tcpi_state; + uint8_t tcpi_ca_state; + uint8_t tcpi_retransmits; + uint8_t tcpi_probes; + uint8_t tcpi_backoff; + uint8_t tcpi_options; + uint8_t tcpi_snd_wscale : 4, tcpi_rcv_wscale : 4; + uint8_t tcpi_delivery_rate_app_limited:1, tcpi_fastopen_client_fail:2; + + uint32_t tcpi_rto; + uint32_t tcpi_ato; + uint32_t tcpi_snd_mss; + uint32_t tcpi_rcv_mss; + + uint32_t tcpi_unacked; + uint32_t tcpi_sacked; + uint32_t tcpi_lost; + uint32_t tcpi_retrans; + uint32_t tcpi_fackets; + + /* Times. */ + uint32_t tcpi_last_data_sent; + uint32_t tcpi_last_ack_sent; + uint32_t tcpi_last_data_recv; + uint32_t tcpi_last_ack_recv; + + /* Metrics. */ + uint32_t tcpi_pmtu; + uint32_t tcpi_rcv_ssthresh; + uint32_t tcpi_rtt; + uint32_t tcpi_rttvar; + uint32_t tcpi_snd_ssthresh; + uint32_t tcpi_snd_cwnd; + uint32_t tcpi_advmss; + uint32_t tcpi_reordering; + + uint32_t tcpi_rcv_rtt; + uint32_t tcpi_rcv_space; + + uint32_t tcpi_total_retrans; + + /* Linux extensions */ + uint64_t tcpi_pacing_rate; + uint64_t tcpi_max_pacing_rate; + uint64_t tcpi_bytes_acked; /* RFC4898 tcpEStatsAppHCThruOctetsAcked */ + uint64_t tcpi_bytes_received; /* RFC4898 tcpEStatsAppHCThruOctetsReceived */ + uint32_t tcpi_segs_out; /* RFC4898 tcpEStatsPerfSegsOut */ + uint32_t tcpi_segs_in; /* RFC4898 tcpEStatsPerfSegsIn */ + + uint32_t tcpi_notsent_bytes; + uint32_t tcpi_min_rtt; + uint32_t tcpi_data_segs_in; /* RFC4898 tcpEStatsDataSegsIn */ + uint32_t tcpi_data_segs_out; /* RFC4898 tcpEStatsDataSegsOut */ + + uint64_t tcpi_delivery_rate; + + uint64_t tcpi_busy_time; /* Time (usec) busy sending data */ + uint64_t tcpi_rwnd_limited; /* Time (usec) limited by receive window */ + uint64_t tcpi_sndbuf_limited; /* Time (usec) limited by send buffer */ + + uint32_t tcpi_delivered; + uint32_t tcpi_delivered_ce; + + uint64_t tcpi_bytes_sent; /* RFC4898 tcpEStatsPerfHCDataOctetsOut */ + uint64_t tcpi_bytes_retrans; /* RFC4898 tcpEStatsPerfOctetsRetrans */ + uint32_t tcpi_dsack_dups; /* RFC4898 tcpEStatsStackDSACKDups */ + uint32_t tcpi_reord_seen; /* reordering events seen */ + + uint32_t tcpi_rcv_ooopack; /* Out-of-order packets received */ + + uint32_t tcpi_snd_wnd; /* peer's advertised receive window after + * scaling (bytes) + */ + uint32_t tcpi_rcv_wnd; /* local advertised receive window after + * scaling (bytes) + */ + + uint32_t tcpi_rehash; /* PLB or timeout triggered rehash attempts */ + + uint16_t tcpi_total_rto; /* Total number of RTO timeouts, including + * SYN/SYN-ACK and recurring timeouts. + */ + uint16_t tcpi_total_rto_recoveries; /* Total number of RTO + * recoveries, including any + * unfinished recovery. + */ + uint32_t tcpi_total_rto_time; /* Total time spent in RTO recoveries + * in milliseconds, including any + * unfinished recovery. + */ +}; + +#endif /* TCP_INFO_H */ diff --git a/tcp_internal.h b/tcp_internal.h index 1ab8ce2..a5a47df 100644 --- a/tcp_internal.h +++ b/tcp_internal.h @@ -175,12 +175,14 @@ void tcp_rst_do(const struct ctx *c, struct tcp_tap_conn *conn); tcp_rst_do(c, conn); \ } while (0) +struct tcp_info_linux; + size_t tcp_l2_buf_fill_headers(const struct tcp_tap_conn *conn, struct iovec *iov, size_t dlen, const uint16_t *check, uint32_t seq, bool no_tcp_csum); int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn, - bool force_seq, struct tcp_info *tinfo); + bool force_seq, struct tcp_info_linux *tinfo); int tcp_prepare_flags(const struct ctx *c, struct tcp_tap_conn *conn, int flags, struct tcphdr *th, struct tcp_syn_opts *opts, size_t *optlen); -- 2.47.0