We read the TTL/hop_limit from UDP packets arriving at outbound sockets and convey the value to the packets delivered to the internal peers via the tap interface. A prerequisite for this to work is that we eliminate the dual-stack listener socket and create separate IPv4 and IPv6 sockets for each bound UDP port. The extra memory required for that approach seems to be a showstopper, so this patch is posted mostly as documentation of the work done, maybe to be applied some time in the future if new conditions permit. Signed-off-by: Jon Maloy <jmaloy(a)redhat.com> --- udp.c | 69 ++++++++++++++++++++++++++++++++++++++++-------------- udp_flow.c | 18 ++++++++++++++ udp_vu.c | 3 +-- util.c | 5 ++++ 4 files changed, 76 insertions(+), 19 deletions(-) diff --git a/udp.c b/udp.c index 7cc050c..31b9203 100644 --- a/udp.c +++ b/udp.c @@ -181,7 +181,12 @@ enum udp_iov_idx { UDP_NUM_IOVS, }; +struct udp_cmsg { + char buf[CMSG_SPACE(sizeof(int)) + CMSG_SPACE(sizeof(struct in6_pktinfo))]; +}; + /* IOVs and msghdr arrays for receiving datagrams from sockets */ +static struct udp_cmsg udp_cmsg_recv [UDP_MAX_FRAMES]; static struct iovec udp_iov_recv [UDP_MAX_FRAMES]; static struct mmsghdr udp_mh_recv [UDP_MAX_FRAMES]; @@ -194,6 +199,40 @@ static struct mmsghdr udp_mh_splice [UDP_MAX_FRAMES]; /* IOVs for L2 frames */ static struct iovec udp_l2_iov [UDP_MAX_FRAMES][UDP_NUM_IOVS]; +static uint8_t udp4_read_ttl(const struct msghdr *mhdr) +{ + struct msghdr *mh = (struct msghdr *)mhdr; + struct cmsghdr *cm; + int ttl = 0; + + for (cm = CMSG_FIRSTHDR(mh); cm != NULL; cm = CMSG_NXTHDR(mh, cm)) { + if (cm->cmsg_level == IPPROTO_IP && cm->cmsg_type == IP_TTL) { + memcpy(&ttl, CMSG_DATA(cm), sizeof(ttl)); + ttl = *(int *) CMSG_DATA(cm); + break; + } + } + + return ttl; +} + +static uint8_t udp6_read_hop_limit(const struct msghdr *mhdr) +{ + struct msghdr *mh = (struct msghdr *)mhdr; + struct cmsghdr *cm; + int hop_limit = 0; + + for (cm = CMSG_FIRSTHDR(mh); cm != NULL; cm = 
CMSG_NXTHDR(mh, cm)) { + if (cm->cmsg_level == SOL_IPV6 && + cm->cmsg_type == IPV6_HOPLIMIT) { + memcpy(&hop_limit, CMSG_DATA(cm), sizeof(hop_limit)); + break; + } + } + + return hop_limit; +} + /** * udp_portmap_clear() - Clear UDP port map before configuration */ @@ -230,6 +269,7 @@ static void udp_iov_init_one(const struct ctx *c, size_t i) struct udp_meta_t *meta = &udp_meta[i]; struct iovec *siov = &udp_iov_recv[i]; struct iovec *tiov = udp_l2_iov[i]; + struct udp_cmsg *ucmsg = &udp_cmsg_recv[i]; *meta = (struct udp_meta_t) { .ip4h = L2_BUF_IP4_INIT(IPPROTO_UDP), @@ -243,6 +283,9 @@ static void udp_iov_init_one(const struct ctx *c, size_t i) mh->msg_iov = siov; mh->msg_iovlen = 1; + + mh->msg_control = ucmsg; + mh->msg_controllen = sizeof(*ucmsg); } /** @@ -271,8 +314,8 @@ static void udp_iov_init(const struct ctx *c) * Return: size of IPv4 payload (UDP header + data) */ size_t udp_update_hdr4(struct iphdr *ip4h, struct udp_payload_t *bp, - const struct flowside *toside, size_t dlen, - uint8_t ttl, bool no_udp_csum) + const struct flowside *toside, + size_t dlen, uint8_t ttl, bool no_udp_csum) { const struct in_addr *src = inany_v4(&toside->oaddr); const struct in_addr *dst = inany_v4(&toside->eaddr); @@ -285,6 +328,7 @@ size_t udp_update_hdr4(struct iphdr *ip4h, struct udp_payload_t *bp, ip4h->ttl = ttl; ip4h->daddr = dst->s_addr; ip4h->saddr = src->s_addr; + ip4h->ttl = ttl; ip4h->check = csum_ip4_header(l3len, IPPROTO_UDP, ttl, *src, *dst); bp->uh.source = htons(toside->oport); @@ -366,17 +410,20 @@ static void udp_tap_prepare(const struct mmsghdr *mmh, size_t l4len; if (!inany_v4(&toside->eaddr) || !inany_v4(&toside->oaddr)) { + uint8_t hop_limit = udp6_read_hop_limit(&mmh[idx].msg_hdr); + l4len = udp_update_hdr6(&bm->ip6h, bp, toside, mmh[idx].msg_len, - DEFAULT_TTL, no_udp_csum); + hop_limit, no_udp_csum); tap_hdr_update(&bm->taph, l4len + sizeof(bm->ip6h) + sizeof(udp6_eth_hdr)); (*tap_iov)[UDP_IOV_ETH] = IOV_OF_LVALUE(udp6_eth_hdr); 
(*tap_iov)[UDP_IOV_IP] = IOV_OF_LVALUE(bm->ip6h); } else { + uint8_t ttl = udp4_read_ttl(&mmh[idx].msg_hdr); + l4len = udp_update_hdr4(&bm->ip4h, bp, toside, - mmh[idx].msg_len, - DEFAULT_TTL, no_udp_csum); + mmh[idx].msg_len, ttl, no_udp_csum); tap_hdr_update(&bm->taph, l4len + sizeof(bm->ip4h) + sizeof(udp4_eth_hdr)); (*tap_iov)[UDP_IOV_ETH] = IOV_OF_LVALUE(udp4_eth_hdr); @@ -1094,18 +1141,6 @@ int udp_sock_init(const struct ctx *c, int ns, const union inany_addr *addr, ASSERT(!c->no_udp); - if (!addr && c->ifi4 && c->ifi6 && !ns) { - int s; - - /* Attempt to get a dual stack socket */ - s = pif_sock_l4(c, EPOLL_TYPE_UDP_LISTEN, PIF_HOST, - NULL, ifname, port, uref.u32); - udp_splice_init[V4][port] = s < 0 ? -1 : s; - udp_splice_init[V6][port] = s < 0 ? -1 : s; - if (IN_INTERVAL(0, FD_REF_MAX, s)) - return 0; - } - if ((!addr || inany_v4(addr)) && c->ifi4) { if (!ns) { r4 = pif_sock_l4(c, EPOLL_TYPE_UDP_LISTEN, PIF_HOST, diff --git a/udp_flow.c b/udp_flow.c index fea1cf3..8af0c1f 100644 --- a/udp_flow.c +++ b/udp_flow.c @@ -73,6 +73,8 @@ static int udp_flow_sock(const struct ctx *c, { const struct flowside *side = &uflow->f.side[sidei]; uint8_t pif = uflow->f.pif[sidei]; + int enable = 1; + union { flow_sidx_t sidx; uint32_t data; @@ -91,6 +93,22 @@ static int udp_flow_sock(const struct ctx *c, return rc; } + if (pif == PIF_HOST) { + if (inany_v4(&side->oaddr)) { + if (setsockopt(s, IPPROTO_IP, IP_RECVTTL, + &enable, sizeof(enable)) < 0) { + perror("setsockopt IP_RECVTTL"); + exit(1); + } + } else { + if (setsockopt(s, SOL_IPV6, IPV6_RECVHOPLIMIT, + &enable, sizeof(enable)) < 0) { + perror("setsockopt IPV6_RECVHOPLIMIT"); + exit(1); + } + } + } + /* It's possible, if unlikely, that we could receive some packets in * between the bind() and connect() which may or may not be for this * flow. Being UDP we could just discard them, but it's not ideal. 
diff --git a/udp_vu.c b/udp_vu.c index ef2257c..9871024 100644 --- a/udp_vu.c +++ b/udp_vu.c @@ -152,8 +152,7 @@ static size_t udp_vu_prepare(const struct ctx *c, *iph = (struct iphdr)L2_BUF_IP4_INIT(IPPROTO_UDP); - l4len = udp_update_hdr4(iph, bp, toside, dlen, - DEFAULT_TTL, true); + l4len = udp_update_hdr4(iph, bp, toside, dlen, 255, true); } else { struct ipv6hdr *ip6h = vu_ip(iov_vu[0].iov_base); struct udp_payload_t *bp = vu_payloadv6(iov_vu[0].iov_base); diff --git a/util.c b/util.c index 62a6003..7c06029 100644 --- a/util.c +++ b/util.c @@ -111,13 +111,18 @@ int sock_l4_sa(const struct ctx *c, enum epoll_type type, if (proto == IPPROTO_UDP) { int pktinfo = af == AF_INET ? IP_PKTINFO : IPV6_RECVPKTINFO; int recverr = af == AF_INET ? IP_RECVERR : IPV6_RECVERR; + int ttlopt = af == AF_INET ? IP_RECVTTL : IPV6_RECVHOPLIMIT; int level = af == AF_INET ? IPPROTO_IP : IPPROTO_IPV6; + int ttlevel = af == AF_INET ? IPPROTO_IP : SOL_IPV6; if (setsockopt(fd, level, recverr, &y, sizeof(y))) die_perror("Failed to set RECVERR on socket %i", fd); if (setsockopt(fd, level, pktinfo, &y, sizeof(y))) die_perror("Failed to set PKTINFO on socket %i", fd); + + if (setsockopt(fd, ttlevel, ttlopt, &y, sizeof(y))) + die_perror("Failed to set RECVTTL on socket %i", fd); } if (ifname && *ifname) { -- 2.48.1