On Sun, Dec 14, 2025 at 08:54:36PM -0500, Jon Maloy wrote:
We add subscriptions to RTMGRP_LINK, RTMGRP_IPV4_IFADDR, and RTMGRP_IPV6_IFADDR, so that we can receive notifications when link state or addresses change on the namespace interface.
When addresses are discovered via netlink:
- We mark them as non-permanent, which means they can be modified or deleted by subsequent events. - We apply the prefix indicated in the notification. - Update addr_seen to track the new address as the active one.
addr_seen isn't really about an "active" address. The expectation was that the guest would only use a single address, it just might not be the one we told it to. Now that we're aiming to allow multiple concurrent addresses, we can expect the guest to use all of them actively.
This provides the foundation for dynamic address monitoring, and supports runtime network changes.
Signed-off-by: Jon Maloy
--- epoll_type.h | 2 + netlink.c | 370 +++++++++++++++++++++++++++++++++++++++++++++++++++ netlink.h | 3 + passt.c | 5 + passt.h | 1 + tap.c | 6 +- 6 files changed, 384 insertions(+), 3 deletions(-) diff --git a/epoll_type.h b/epoll_type.h index a90ffb6..0a16d94 100644 --- a/epoll_type.h +++ b/epoll_type.h @@ -46,6 +46,8 @@ enum epoll_type { EPOLL_TYPE_REPAIR, /* Netlink neighbour subscription socket */ EPOLL_TYPE_NL_NEIGH, + /* Netlink link/address subscription socket */ + EPOLL_TYPE_NL_LINKADDR,
EPOLL_NUM_TYPES, }; diff --git a/netlink.c b/netlink.c index 82a2f0c..7492f17 100644 --- a/netlink.c +++ b/netlink.c @@ -35,6 +35,9 @@ #include "passt.h" #include "log.h" #include "ip.h" +#include "tap.h" +#include "arp.h" +#include "ndp.h" #include "netlink.h" #include "epoll_ctl.h"
@@ -59,6 +62,7 @@ int nl_sock = -1; int nl_sock_ns = -1; static int nl_sock_neigh = -1; +static int nl_sock_linkaddr = -1; static int nl_seq = 1;
/** @@ -91,6 +95,372 @@ static int nl_sock_init_do(void *arg) return 0; }
+/** + * nl_addr4_find() - Find an IPv4 address in the address array + * @c: Execution context + * @addr: Address to find + * + * Return: index if found, -1 otherwise + */ +static int nl_addr4_find(const struct ctx *c, const struct in_addr *addr) +{ + int i; + + for (i = 0; i < c->ip4.addr_count; i++) + if (IN4_ARE_ADDR_EQUAL(&c->ip4.addrs[i].addr, addr)) + return (int)i; + + return -1; +} + +/** + * nl_addr6_find() - Find an IPv6 address in the address array + * @c: Execution context + * @addr: Address to find + * + * Return: index if found, -1 otherwise + */ +static int nl_addr6_find(const struct ctx *c, const struct in6_addr *addr) +{ + int i; + + for (i = 0; i < c->ip6.addr_count; i++) + if (IN6_ARE_ADDR_EQUAL(&c->ip6.addrs[i].addr, addr)) + return (int)i; + + return -1; +} + +/** + * nl_addr4_add() - Add a discovered IPv4 address to the address array + * @c: Execution context + * @addr: Address to add + * @prefix_len: Prefix length + * + * Return: true if added or updated, false if array full or already permanent + */ +static bool nl_addr4_add(struct ctx *c, const struct in_addr *addr, + int prefix_len) +{ + int idx = nl_addr4_find(c, addr); + + if (idx >= 0) { + /* Address exists - if permanent, don't touch; else update */ + if (c->ip4.addrs[idx].permanent) + return false; + c->ip4.addrs[idx].prefix_len = prefix_len; + return true; + } + + /* New address - add if room */ + if (c->ip4.addr_count >= IP4_MAX_ADDRS) { + debug("IPv4 address array full, ignoring discovered address"); + return false; + } + + idx = c->ip4.addr_count++; + c->ip4.addrs[idx].addr = *addr; + c->ip4.addrs[idx].prefix_len = prefix_len; + c->ip4.addrs[idx].permanent = 0; + return true; +} + +/** + * nl_addr6_add() - Add a discovered IPv6 address to the address array + * @c: Execution context + * @addr: Address to add + * @prefix_len: Prefix length + * + * Return: true if added or updated, false if array full or already permanent + */ +static bool nl_addr6_add(struct ctx *c, const struct in6_addr *addr, + int prefix_len) +{ + int idx = nl_addr6_find(c, addr); + + if (idx >= 0) { + /* Address exists - if permanent, don't touch; else update */ + if (c->ip6.addrs[idx].permanent) + return false; + c->ip6.addrs[idx].prefix_len = prefix_len; + return true; + } + + /* New address - add if room */ + if (c->ip6.addr_count >= IP6_MAX_ADDRS) { + debug("IPv6 address array full, ignoring discovered address"); + return false; + } + + idx = c->ip6.addr_count++; + c->ip6.addrs[idx].addr = *addr; + c->ip6.addrs[idx].prefix_len = prefix_len; + c->ip6.addrs[idx].permanent = 0; + return true; +} + +/** + * nl_addr4_del() - Remove an IPv4 address from the array if not permanent + * @c: Execution context + * @addr: Address to remove + * + * Return: true if removed, false if not found or permanent + */ +static bool nl_addr4_del(struct ctx *c, const struct in_addr *addr) +{ + int i, idx = nl_addr4_find(c, addr); + + if (idx < 0) + return false; + + if (c->ip4.addrs[idx].permanent) + return false; + + /* Shift remaining entries down */ + c->ip4.addr_count--; + for (i = idx; i < c->ip4.addr_count; i++) + c->ip4.addrs[i] = c->ip4.addrs[i + 1]; + + return true; +} + +/** + * nl_addr6_del() - Remove an IPv6 address from the array if not permanent + * @c: Execution context + * @addr: Address to remove + * + * Return: true if removed, false if not found or permanent + */ +static bool nl_addr6_del(struct ctx *c, const struct in6_addr *addr) +{ + int i, idx = nl_addr6_find(c, addr); + + if (idx < 0) + return false; + + if (c->ip6.addrs[idx].permanent) + return false; + + /* Shift remaining entries down */ + c->ip6.addr_count--; + for (i = idx; i < c->ip6.addr_count; i++) + c->ip6.addrs[i] = c->ip6.addrs[i + 1]; + + return true; +}
All the functions above are more to do with the data structure storing the addresses than they are to do with netlink. Better to move them into... maybe ip.c? And use them from conf.c as well. Given the amount of near-duplication here, maybe it would be better to have a single table for v4 and v6 using inany_addr?
+/** + * nl_linkaddr_msg_read() - Parse and log a netlink link/addr message + * @c: Execution context + * @nh: Netlink message header + */ +static void nl_linkaddr_msg_read(struct ctx *c, const struct nlmsghdr *nh) +{ + if (nh->nlmsg_type == NLMSG_DONE || nh->nlmsg_type == NLMSG_ERROR) + return; + + if (nh->nlmsg_type == RTM_NEWLINK || nh->nlmsg_type == RTM_DELLINK) { + const struct ifinfomsg *ifm = NLMSG_DATA(nh); + struct rtattr *rta = IFLA_RTA(ifm); + size_t na = IFLA_PAYLOAD(nh); + const char *name = "?"; + bool up = !!(ifm->ifi_flags & IFF_UP); + bool running = !!(ifm->ifi_flags & IFF_RUNNING); + + for (; RTA_OK(rta, na); rta = RTA_NEXT(rta, na)) { + if (rta->rta_type == IFLA_IFNAME) { + name = (const char *)RTA_DATA(rta); + break; + } + } + + /* Update pasta interface UP state if this is our interface */ + if (c->mode == MODE_PASTA && + (unsigned int)ifm->ifi_index == c->pasta_ifi) { + c->pasta_ifi_up = up; + debug("Interface %s", up ? "UP" : "DOWN");
This only makes sense if we're listening to netlink messages in the guest netns, but the address stuff only makes sense listening to messages in the host netns.
+ } + + if (nh->nlmsg_type == RTM_NEWLINK) + debug("Link %s (idx=%d): %s %s", name, ifm->ifi_index, + up ? "UP" : "DOWN", running ? "RUNNING" : ""); + else + debug("Link %s (idx=%d): DELETED", name, ifm->ifi_index); + + return; + } + + if (nh->nlmsg_type == RTM_NEWADDR || nh->nlmsg_type == RTM_DELADDR) { + bool is_new = (nh->nlmsg_type == RTM_NEWADDR); + const struct ifaddrmsg *ifa = NLMSG_DATA(nh); + char addr_str[INET6_ADDRSTRLEN]; + struct rtattr *rta = IFA_RTA(ifa); + char ifname[IFNAMSIZ] = { 0 }; + size_t na = IFA_PAYLOAD(nh); + void *addr = NULL; + for (; RTA_OK(rta, na); rta = RTA_NEXT(rta, na)) { + if (ifa->ifa_family == AF_INET && + rta->rta_type == IFA_LOCAL) { + addr = RTA_DATA(rta); + break; + } else if (ifa->ifa_family == AF_INET6 && + rta->rta_type == IFA_ADDRESS) { + addr = RTA_DATA(rta); + break; + } + } + + if (!addr) + return; + + if_indextoname(ifa->ifa_index, ifname); + inet_ntop(ifa->ifa_family, addr, addr_str, sizeof(addr_str)); + + debug("%s addr on %s (index=%d): %s/%i%s", + is_new ? "NEW" : "DEL", ifname, ifa->ifa_index, addr_str, + ifa->ifa_prefixlen, + tap_is_ready(c) ? " (tap UP)" : " (tap DOWN)"); + + /* Only handle our pasta interface */ + if (c->mode != MODE_PASTA || ifa->ifa_index != c->pasta_ifi) + return;
Nope. This is a host netns event, so comparing to pasta_ifi makes no sense. We _should_ be comparing to ifi4 or ifi6 (depending on address family), and we should probably do that before we go parsing the details above. We should also probably check for --no-copy-addrs here, too. In the other direction, even for PASST mode we can store this address in our table, it just won't do anything until DHCP or whatever consults it.
+ + if (ifa->ifa_family == AF_INET) { + struct in_addr *a = (struct in_addr *)addr; + + if (!is_new) { + nl_addr4_del(c, a); + return; + } + + if (nl_addr4_add(c, a, ifa->ifa_prefixlen)) { + c->ip4.addr_seen = *a; + if (c->pasta_ifi_up && c->ifi4) { + debug("Sending ARP"); + arp_send_init_req(c);
What does this ARP request do? AFAICT we haven't actually added the address in the guest netns yet, so the guest won't respond to the ARP.
+ } + } + } else if (ifa->ifa_family == AF_INET6) { + struct in6_addr *a = (struct in6_addr *)addr; + + if (!is_new) { + nl_addr6_del(c, a); + return; + } + + if (nl_addr6_add(c, a, + ifa->ifa_prefixlen)) { + c->ip6.addr_seen = *a; + if (c->pasta_ifi_up && + c->ifi6 && !c->no_ndp) { + debug("Sending NDP"); + ndp_send_init_req(c);
Some question with this NDP.
+ } + } + } + } +} + +/** + * nl_linkaddr_notify_handler() - Handle events from link/addr notifier socket + * @c: Execution context + */ +void nl_linkaddr_notify_handler(struct ctx *c) +{ + char buf[NLBUFSIZ]; + + for (;;) { + ssize_t n = recv(nl_sock_linkaddr, buf, sizeof(buf), MSG_DONTWAIT); + struct nlmsghdr *nh = (struct nlmsghdr *)buf; + + if (n < 0) { + if (errno == EINTR) + continue; + if (errno != EAGAIN) + debug("recv() error: %s", strerror_(errno)); + break; + } + + debug("Received %zd bytes", n); + + for (; NLMSG_OK(nh, n); nh = NLMSG_NEXT(nh, n)) + nl_linkaddr_msg_read(c, nh); + } +} + +/** + * nl_linkaddr_init_do() - Actually create and bind the netlink socket + * @arg: Execution context (for namespace entry) or NULL + * + * Return: 0 on success, -1 on failure + */ +static int nl_linkaddr_init_do(void *arg) +{ + struct sockaddr_nl addr = { .nl_family = AF_NETLINK, + .nl_groups = RTMGRP_LINK | RTMGRP_IPV4_IFADDR | + RTMGRP_IPV6_IFADDR }; + + if (arg) + ns_enter((struct ctx *)arg); + + nl_sock_linkaddr = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, NETLINK_ROUTE);
Is there a reason to use an additional socket, rather than adding more events to the neighbour listening socket?
+ if (nl_sock_linkaddr < 0) { + debug("socket() failed: %s", strerror_(errno)); + return -1; + } + + if (bind(nl_sock_linkaddr, (struct sockaddr *)&addr, sizeof(addr)) < 0) { + debug("bind() failed: %s", strerror_(errno)); + close(nl_sock_linkaddr); + nl_sock_linkaddr = -1; + return -1; + } + + debug("socket fd=%d", nl_sock_linkaddr); + return 0; +} + +/** + * nl_linkaddr_notify_init() - Initialize link/address change notifier + * @c: Execution context + * + * Return: 0 on success, -1 on failure + */ +int nl_linkaddr_notify_init(const struct ctx *c) +{ + union epoll_ref ref = { .type = EPOLL_TYPE_NL_LINKADDR }; + struct epoll_event ev = { .events = EPOLLIN }; + + if (nl_sock_linkaddr >= 0) { + debug("notifier already initialized (fd=%d)", nl_sock_linkaddr); + return 0; + } + + /* Open the notifier socket in the namespace for pasta mode, + * or in the init namespace otherwise.
Definitely wrong. We're trying to watch host addresses so that we can copy them to the guest - therefore we always need to watch in the host netns. On pasta there might be reasons to *also* listen in the guest netns, but that would want a different handler to do different things.
+ */ + if (c->mode == MODE_PASTA) + NS_CALL(nl_linkaddr_init_do, (void *)c); + else + nl_linkaddr_init_do(NULL); + + if (nl_sock_linkaddr < 0) { + warn("Failed to create/bind link/addr notifier socket"); + return -1; + } + + ev.data.u64 = ref.u64; + if (epoll_ctl(c->epollfd, EPOLL_CTL_ADD, nl_sock_linkaddr, &ev) == -1) { + warn("epoll_ctl() failed on link/addr notifier socket: %s", + strerror_(errno)); + close(nl_sock_linkaddr); + nl_sock_linkaddr = -1; + return -1; + } + + return 0; +} /** * nl_sock_init() - Call nl_sock_init_do(), won't return on failure * @c: Execution context diff --git a/netlink.h b/netlink.h index 8f1e9b9..1796a72 100644 --- a/netlink.h +++ b/netlink.h @@ -33,4 +33,7 @@ int nl_link_set_flags(int s, unsigned int ifi, int nl_neigh_notify_init(const struct ctx *c); void nl_neigh_notify_handler(const struct ctx *c);
+int nl_linkaddr_notify_init(const struct ctx *c); +void nl_linkaddr_notify_handler(struct ctx *c); + #endif /* NETLINK_H */ diff --git a/passt.c b/passt.c index 5ed88d0..f274858 100644 --- a/passt.c +++ b/passt.c @@ -80,6 +80,7 @@ char *epoll_type_str[] = { [EPOLL_TYPE_REPAIR_LISTEN] = "TCP_REPAIR helper listening socket", [EPOLL_TYPE_REPAIR] = "TCP_REPAIR helper socket", [EPOLL_TYPE_NL_NEIGH] = "netlink neighbour notifier socket", + [EPOLL_TYPE_NL_LINKADDR] = "netlink link/address notifier socket", }; static_assert(ARRAY_SIZE(epoll_type_str) == EPOLL_NUM_TYPES, "epoll_type_str[] doesn't match enum epoll_type"); @@ -304,6 +305,9 @@ static void passt_worker(void *opaque, int nfds, struct epoll_event *events) case EPOLL_TYPE_NL_NEIGH: nl_neigh_notify_handler(c); break; + case EPOLL_TYPE_NL_LINKADDR: + nl_linkaddr_notify_handler(c); + break; default: /* Can't happen */ ASSERT(0); @@ -413,6 +417,7 @@ int main(int argc, char **argv)
fwd_neigh_table_init(&c); nl_neigh_notify_init(&c); + nl_linkaddr_notify_init(&c);
if (!c.foreground) { if ((devnull_fd = open("/dev/null", O_RDWR | O_CLOEXEC)) < 0) diff --git a/passt.h b/passt.h index 533f2cb..70ccaf1 100644 --- a/passt.h +++ b/passt.h @@ -264,6 +264,7 @@ struct ctx { char pasta_ifn[IF_NAMESIZE]; unsigned int pasta_ifi; int pasta_conf_ns; + int pasta_ifi_up; /* Namespace interface is UP */
int no_tcp; struct tcp_ctx tcp; diff --git a/tap.c b/tap.c index 0b96cc1..a2a4459 100644 --- a/tap.c +++ b/tap.c @@ -1363,10 +1363,10 @@ bool tap_is_ready(const struct ctx *c) return false;
if (c->mode == MODE_PASTA) { - /* If pasta_conf_ns is set, the interface was configured and - * brought up during initialization. If not, it's still down. + /* Check if the namespace interface is actually UP. + * This is tracked by netlink link notifications. */ - return c->pasta_conf_ns; + return c->pasta_ifi_up; }
return true; -- 2.51.1
-- David Gibson (he or they) | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you, not the other way | around. http://www.ozlabs.org/~dgibson