dev
Threads by month
- ----- 2026 -----
- May
- April
- March
- February
- January
- ----- 2025 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2024 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2023 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2022 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2021 -----
- December
- November
April 2026
- 7 participants
- 42 discussions
[PATCH v7 02/13] passt, pasta: Introduce unified multi-address data structures
by Jon Maloy 15 May '26
by Jon Maloy 15 May '26
15 May '26
As preparation for supporting multiple addresses per interface,
we replace the single addr/prefix_len fields with an array. Each
array entry is a new struct guest_addr (named inany_addr_entry in
earlier revisions of this series) containing an address in inany_addr
format, a prefix length in the matching IPv6/IPv4-mapped format,
and a set of flags.
Despite a lot of code refactoring, there are only two real functional
changes:
- The indicated IPv6 prefix length is now properly stored, instead
of being ignored and overridden with the hardcoded value 64, as
has been the case until now.
- Since even IPv4 addresses are now stored in IPv6 (IPv4-mapped)
format, we also store the corresponding prefix length in that format,
i.e. using the range [96,128] instead of [0,32].
Signed-off-by: Jon Maloy <jmaloy(a)redhat.com>
---
v2: -Using inany_addr instead of protocol specific addresses as
entry address field.
v3: -Merging into one array, directly in struct ctx
-Changed prefix_len and flags fields in struct inany_addr_entry
to uint8_t, since that makes the struct directly migratable
v4: -Updated according to changes in previous commits
-Updated according to feedback from David G.
-Squashed IP4_MASK macro commit into this one
v6: -Renamed and moved some definitions
-Introduced fwd_set_addr() and fwd_get_addr() already in this commit
-Eliminated first_v4/v6() functions, replaced with fwd_get_addr()
-Some other changes as suggested by David G.
-I kept the flag CONF_ADDR_LINKLOCAL, since it will be
needed later in an address selection function.
v7: -Introduced CONF_ADDR_GENERATED flag
-Other fixes based on feedback from David and Stefano.
-I changed the signature of inany_prefix_len(), but I did not change
its semantics, since the premise of David's comment is wrong: the
caller does *not* explicitly know whether it is dealing with an IPv4
address. In fact, there are examples later in this series where it may
be an IPv6 address, and the caller just trusts that it gets the return
value in the appropriate format.
-Introduced the inverse of inany_prefix_len(), called inany_prefix_len6(),
which always returns the prefix length in IPv6 or IPv4-mapped format.
The name of the function isn't great, but every alternative I came up
with was too long to be practical.
---
arp.c | 12 ++++-
conf.c | 143 ++++++++++++++++++++++++++++++-------------------------
dhcp.c | 14 ++++--
dhcpv6.c | 15 ++++--
fwd.c | 109 ++++++++++++++++++++++++++++++++++--------
fwd.h | 4 ++
inany.h | 41 ++++++++++++++++
ip.h | 2 +
ndp.c | 16 +++++--
passt.h | 67 ++++++++++++++++++++++----
pasta.c | 25 ++++++----
tap.c | 7 ++-
12 files changed, 340 insertions(+), 115 deletions(-)
diff --git a/arp.c b/arp.c
index bb042e9..a7fd82f 100644
--- a/arp.c
+++ b/arp.c
@@ -41,6 +41,8 @@
static bool ignore_arp(const struct ctx *c,
const struct arphdr *ah, const struct arpmsg *am)
{
+ const struct guest_addr *a;
+
if (ah->ar_hrd != htons(ARPHRD_ETHER) ||
ah->ar_pro != htons(ETH_P_IP) ||
ah->ar_hln != ETH_ALEN ||
@@ -54,7 +56,8 @@ static bool ignore_arp(const struct ctx *c,
return true;
/* Don't resolve the guest's assigned address, either. */
- if (!memcmp(am->tip, &c->ip4.addr, sizeof(am->tip)))
+ a = fwd_get_addr(c, AF_INET, 0, 0);
+ if (a && !memcmp(am->tip, inany_v4(&a->addr), sizeof(am->tip)))
return true;
return false;
@@ -123,12 +126,17 @@ int arp(const struct ctx *c, struct iov_tail *data)
*/
void arp_send_init_req(const struct ctx *c)
{
+ const struct guest_addr *a;
struct {
struct ethhdr eh;
struct arphdr ah;
struct arpmsg am;
} __attribute__((__packed__)) req;
+ a = fwd_get_addr(c, AF_INET, 0, 0);
+ if (!a)
+ return;
+
/* Ethernet header */
req.eh.h_proto = htons(ETH_P_ARP);
memcpy(req.eh.h_dest, MAC_BROADCAST, sizeof(req.eh.h_dest));
@@ -145,7 +153,7 @@ void arp_send_init_req(const struct ctx *c)
memcpy(req.am.sha, c->our_tap_mac, sizeof(req.am.sha));
memcpy(req.am.sip, &c->ip4.our_tap_addr, sizeof(req.am.sip));
memcpy(req.am.tha, MAC_BROADCAST, sizeof(req.am.tha));
- memcpy(req.am.tip, &c->ip4.addr, sizeof(req.am.tip));
+ memcpy(req.am.tip, inany_v4(&a->addr), sizeof(req.am.tip));
debug("Sending initial ARP request for guest MAC address");
tap_send_single(c, &req, sizeof(req));
diff --git a/conf.c b/conf.c
index f13fef6..591f561 100644
--- a/conf.c
+++ b/conf.c
@@ -728,13 +728,15 @@ static int conf_ip4_prefix(const char *arg)
/**
* conf_ip4() - Verify or detect IPv4 support, get relevant addresses
+ * @c: Execution context
* @ifi: Host interface to attempt (0 to determine one)
- * @ip4: IPv4 context (will be written)
*
* Return: interface index for IPv4, or 0 on failure.
*/
-static unsigned int conf_ip4(unsigned int ifi, struct ip4_ctx *ip4)
+static unsigned int conf_ip4(struct ctx *c, unsigned int ifi)
{
+ struct ip4_ctx *ip4 = &c->ip4;
+
if (!ifi)
ifi = nl_get_ext_if(nl_sock, AF_INET);
@@ -753,60 +755,57 @@ static unsigned int conf_ip4(unsigned int ifi, struct ip4_ctx *ip4)
}
}
- if (IN4_IS_ADDR_UNSPECIFIED(&ip4->addr)) {
+ if (!fwd_get_addr(c, AF_INET, 0, 0)) {
+ struct in_addr addr;
+ int prefix_len;
int rc = nl_addr_get(nl_sock, ifi, AF_INET,
- &ip4->addr, &ip4->prefix_len, NULL);
+ &addr, &prefix_len, NULL);
if (rc < 0) {
debug("Couldn't discover IPv4 address: %s",
strerror_(-rc));
return 0;
}
- }
+ if (IN4_IS_ADDR_UNSPECIFIED(&addr))
+ return 0;
- if (!ip4->prefix_len) {
- in_addr_t addr = ntohl(ip4->addr.s_addr);
- if (IN_CLASSA(addr))
- ip4->prefix_len = (32 - IN_CLASSA_NSHIFT);
- else if (IN_CLASSB(addr))
- ip4->prefix_len = (32 - IN_CLASSB_NSHIFT);
- else if (IN_CLASSC(addr))
- ip4->prefix_len = (32 - IN_CLASSC_NSHIFT);
- else
- ip4->prefix_len = 32;
+ fwd_set_addr(c, &inany_from_v4(addr), CONF_ADDR_HOST,
+ prefix_len);
+ ip4->addr_seen = addr;
}
- ip4->addr_seen = ip4->addr;
-
ip4->our_tap_addr = ip4->guest_gw;
- if (IN4_IS_ADDR_UNSPECIFIED(&ip4->addr))
- return 0;
-
return ifi;
}
/**
* conf_ip4_local() - Configure IPv4 addresses and attributes for local mode
- * @ip4: IPv4 context (will be written)
+ * @c: Execution context (will be written)
*/
-static void conf_ip4_local(struct ip4_ctx *ip4)
+static void conf_ip4_local(struct ctx *c)
{
- ip4->addr_seen = ip4->addr = IP4_LL_GUEST_ADDR;
- ip4->our_tap_addr = ip4->guest_gw = IP4_LL_GUEST_GW;
- ip4->prefix_len = IP4_LL_PREFIX_LEN;
+ struct ip4_ctx *ip4 = &c->ip4;
+ ip4->addr_seen = IP4_LL_GUEST_ADDR;
+ ip4->our_tap_addr = ip4->guest_gw = IP4_LL_GUEST_GW;
ip4->no_copy_addrs = ip4->no_copy_routes = true;
+ fwd_set_addr(c, &inany_from_v4(IP4_LL_GUEST_ADDR),
+ CONF_ADDR_GENERATED | CONF_ADDR_LINKLOCAL,
+ IP4_LL_PREFIX_LEN);
}
/**
* conf_ip6() - Verify or detect IPv6 support, get relevant addresses
+ * @c: Execution context
* @ifi: Host interface to attempt (0 to determine one)
- * @ip6: IPv6 context (will be written)
*
* Return: interface index for IPv6, or 0 on failure.
*/
-static unsigned int conf_ip6(unsigned int ifi, struct ip6_ctx *ip6)
+static unsigned int conf_ip6(struct ctx *c, unsigned int ifi)
{
+ struct ip6_ctx *ip6 = &c->ip6;
+ const struct guest_addr *a;
+ union inany_addr addr;
int prefix_len = 0;
int rc;
@@ -827,21 +826,28 @@ static unsigned int conf_ip6(unsigned int ifi, struct ip6_ctx *ip6)
}
}
- rc = nl_addr_get(nl_sock, ifi, AF_INET6,
- IN6_IS_ADDR_UNSPECIFIED(&ip6->addr) ? &ip6->addr : NULL,
+ rc = nl_addr_get(nl_sock, ifi, AF_INET6, &addr.a6,
&prefix_len, &ip6->our_tap_ll);
if (rc < 0) {
debug("Couldn't discover IPv6 address: %s", strerror_(-rc));
return 0;
}
- ip6->addr_seen = ip6->addr;
+ a = fwd_get_addr(c, AF_INET6, 0, 0);
+ if (!a) {
+ if (IN6_IS_ADDR_UNSPECIFIED(&addr))
+ return 0;
+
+ fwd_set_addr(c, &addr, CONF_ADDR_HOST, prefix_len);
+ ip6->addr_seen = addr.a6;
+ } else {
+ ip6->addr_seen = a->addr.a6;
+ }
if (IN6_IS_ADDR_LINKLOCAL(&ip6->guest_gw))
ip6->our_tap_ll = ip6->guest_gw;
- if (IN6_IS_ADDR_UNSPECIFIED(&ip6->addr) ||
- IN6_IS_ADDR_UNSPECIFIED(&ip6->our_tap_ll))
+ if (IN6_IS_ADDR_UNSPECIFIED(&ip6->our_tap_ll))
return 0;
return ifi;
@@ -849,13 +855,13 @@ static unsigned int conf_ip6(unsigned int ifi, struct ip6_ctx *ip6)
/**
* conf_ip6_local() - Configure IPv6 addresses and attributes for local mode
- * @ip6: IPv6 context (will be written)
+ * @c: Execution context (will be written)
*/
-static void conf_ip6_local(struct ip6_ctx *ip6)
+static void conf_ip6_local(struct ctx *c)
{
- ip6->our_tap_ll = ip6->guest_gw = IP6_LL_GUEST_GW;
+ c->ip6.our_tap_ll = c->ip6.guest_gw = IP6_LL_GUEST_GW;
- ip6->no_copy_addrs = ip6->no_copy_routes = true;
+ c->ip6.no_copy_addrs = c->ip6.no_copy_routes = true;
}
/**
@@ -1137,6 +1143,7 @@ enum passt_modes conf_mode(int argc, char *argv[])
static void conf_print(const struct ctx *c)
{
char buf[INANY_ADDRSTRLEN];
+ const struct guest_addr *a;
int i;
if (c->ifi4 > 0 || c->ifi6 > 0) {
@@ -1181,16 +1188,18 @@ static void conf_print(const struct ctx *c)
inet_ntop(AF_INET, &c->ip4.map_host_loopback,
buf, sizeof(buf)));
- if (!c->no_dhcp) {
+ a = fwd_get_addr(c, AF_INET, 0, 0);
+ if (a && !c->no_dhcp) {
uint32_t mask;
- mask = htonl(0xffffffff << (32 - c->ip4.prefix_len));
+ mask = IN4_MASK(inany_prefix_len(&a->addr,
+ a->prefix_len));
info("DHCP:");
info(" assign: %s",
- inet_ntop(AF_INET, &c->ip4.addr, buf, sizeof(buf)));
+ inany_ntop(&a->addr, buf, sizeof(buf)));
info(" mask: %s",
- inet_ntop(AF_INET, &mask, buf, sizeof(buf)));
+ inet_ntop(AF_INET, &mask, buf, sizeof(buf)));
info(" router: %s",
inet_ntop(AF_INET, &c->ip4.guest_gw,
buf, sizeof(buf)));
@@ -1201,8 +1210,8 @@ static void conf_print(const struct ctx *c)
break;
if (!i)
info("DNS:");
- inet_ntop(AF_INET, &c->ip4.dns[i], buf, sizeof(buf));
- info(" %s", buf);
+ info(" %s", inet_ntop(AF_INET, &c->ip4.dns[i],
+ buf, sizeof(buf)));
}
for (i = 0; *c->dns_search[i].n; i++) {
@@ -1227,13 +1236,14 @@ static void conf_print(const struct ctx *c)
else
goto dns6;
- info(" assign: %s",
- inet_ntop(AF_INET6, &c->ip6.addr, buf, sizeof(buf)));
+ a = fwd_get_addr(c, AF_INET6, 0, CONF_ADDR_LINKLOCAL);
+ if (a)
+ info(" assign: %s",
+ inany_ntop(&a->addr, buf, sizeof(buf)));
info(" router: %s",
inet_ntop(AF_INET6, &c->ip6.guest_gw, buf, sizeof(buf)));
info(" our link-local: %s",
- inet_ntop(AF_INET6, &c->ip6.our_tap_ll,
- buf, sizeof(buf)));
+ inet_ntop(AF_INET6, &c->ip6.our_tap_ll, buf, sizeof(buf)));
dns6:
for (i = 0; i < ARRAY_SIZE(c->ip6.dns); i++) {
@@ -1241,8 +1251,10 @@ dns6:
break;
if (!i)
info("DNS:");
- inet_ntop(AF_INET6, &c->ip6.dns[i], buf, sizeof(buf));
- info(" %s", buf);
+ info(" %s",
+ inet_ntop(AF_INET6, &c->ip6.dns[i],
+ buf, sizeof(buf)));
+
}
for (i = 0; *c->dns_search[i].n; i++) {
@@ -1886,19 +1898,16 @@ void conf(struct ctx *c, int argc, char **argv)
IN6_IS_ADDR_V4COMPAT(&addr.a6))
die("Invalid address: %s", optarg);
- if (inany_v4(&addr)) {
- c->ip4.addr = *inany_v4(&addr);
- c->ip4.prefix_len = prefix_len - 96;
- if (c->mode == MODE_PASTA)
- c->ip4.no_copy_addrs = true;
- } else {
- c->ip6.addr = addr.a6;
- if (c->mode == MODE_PASTA)
- c->ip6.no_copy_addrs = true;
- }
+ /* Legacy behaviour: replace existing address if any */
+ fwd_set_addr(c, &addr, CONF_ADDR_USER, prefix_len);
+ if (inany_v4(&addr))
+ c->ip4.no_copy_addrs = true;
+ else
+ c->ip6.no_copy_addrs = true;
break;
}
case 'n': {
+ struct guest_addr *a;
int plen;
if (addr_has_prefix_len)
@@ -1908,8 +1917,12 @@ void conf(struct ctx *c, int argc, char **argv)
if (plen < 0)
die("Invalid prefix length: %s", optarg);
- prefix_len_from_opt = plen + 96;
- c->ip4.prefix_len = plen;
+ prefix_len_from_opt = plen;
+
+ for_each_addr(a, c->addrs, c->addr_count, AF_INET) {
+ a->prefix_len = inany_prefix_len(&a->addr, plen);
+ break;
+ }
break;
}
case 'M':
@@ -2103,9 +2116,9 @@ void conf(struct ctx *c, int argc, char **argv)
nl_sock_init(c, false);
if (!v6_only && !c->splice_only)
- c->ifi4 = conf_ip4(ifi4, &c->ip4);
+ c->ifi4 = conf_ip4(c, ifi4);
if (!v4_only && !c->splice_only)
- c->ifi6 = conf_ip6(ifi6, &c->ip6);
+ c->ifi6 = conf_ip6(c, ifi6);
if (c->ifi4 && c->mtu < IPV4_MIN_MTU) {
warn("MTU %"PRIu16" is too small for IPv4 (minimum %u)",
@@ -2128,7 +2141,7 @@ void conf(struct ctx *c, int argc, char **argv)
if (!c->ifi4 && !v6_only) {
if (!c->splice_only) {
info("IPv4: no external interface as template, use local mode");
- conf_ip4_local(&c->ip4);
+ conf_ip4_local(c);
}
c->ifi4 = -1;
}
@@ -2136,7 +2149,7 @@ void conf(struct ctx *c, int argc, char **argv)
if (!c->ifi6 && !v4_only) {
if (!c->splice_only) {
info("IPv6: no external interface as template, use local mode");
- conf_ip6_local(&c->ip6);
+ conf_ip6_local(c);
}
c->ifi6 = -1;
}
@@ -2201,7 +2214,7 @@ void conf(struct ctx *c, int argc, char **argv)
if (!c->ifi6) {
c->no_ndp = 1;
c->no_dhcpv6 = 1;
- } else if (IN6_IS_ADDR_UNSPECIFIED(&c->ip6.addr)) {
+ } else if (!fwd_get_addr(c, AF_INET6, 0, 0)) {
c->no_dhcpv6 = 1;
}
diff --git a/dhcp.c b/dhcp.c
index 1ff8cba..f0fa212 100644
--- a/dhcp.c
+++ b/dhcp.c
@@ -303,6 +303,7 @@ static void opt_set_dns_search(const struct ctx *c, size_t max_len)
int dhcp(const struct ctx *c, struct iov_tail *data)
{
char macstr[ETH_ADDRSTRLEN];
+ const struct guest_addr *a;
size_t mlen, dlen, opt_len;
struct in_addr mask, dst;
struct ethhdr eh_storage;
@@ -313,6 +314,7 @@ int dhcp(const struct ctx *c, struct iov_tail *data)
const struct udphdr *uh;
struct msg m_storage;
struct msg const *m;
+ struct in_addr addr;
struct msg reply;
unsigned int i;
@@ -344,6 +346,10 @@ int dhcp(const struct ctx *c, struct iov_tail *data)
m->op != BOOTREQUEST)
return -1;
+ a = fwd_get_addr(c, AF_INET, 0, 0);
+ assert(a);
+ addr = *inany_v4(&a->addr);
+
reply.op = BOOTREPLY;
reply.htype = m->htype;
reply.hlen = m->hlen;
@@ -352,7 +358,7 @@ int dhcp(const struct ctx *c, struct iov_tail *data)
reply.secs = 0;
reply.flags = m->flags;
reply.ciaddr = m->ciaddr;
- reply.yiaddr = c->ip4.addr;
+ reply.yiaddr = addr;
reply.siaddr = 0;
reply.giaddr = m->giaddr;
memcpy(&reply.chaddr, m->chaddr, sizeof(reply.chaddr));
@@ -404,7 +410,7 @@ int dhcp(const struct ctx *c, struct iov_tail *data)
info(" from %s", eth_ntop(m->chaddr, macstr, sizeof(macstr)));
- mask.s_addr = htonl(0xffffffff << (32 - c->ip4.prefix_len));
+ mask.s_addr = IN4_MASK(inany_prefix_len(&a->addr, a->prefix_len));
memcpy(opts[1].s, &mask, sizeof(mask));
memcpy(opts[3].s, &c->ip4.guest_gw, sizeof(c->ip4.guest_gw));
memcpy(opts[54].s, &c->ip4.our_tap_addr, sizeof(c->ip4.our_tap_addr));
@@ -412,7 +418,7 @@ int dhcp(const struct ctx *c, struct iov_tail *data)
/* If the gateway is not on the assigned subnet, send an option 121
* (Classless Static Routing) adding a dummy route to it.
*/
- if ((c->ip4.addr.s_addr & mask.s_addr)
+ if ((addr.s_addr & mask.s_addr)
!= (c->ip4.guest_gw.s_addr & mask.s_addr)) {
/* a.b.c.d/32:0.0.0.0, 0:a.b.c.d */
opts[121].slen = 14;
@@ -471,7 +477,7 @@ int dhcp(const struct ctx *c, struct iov_tail *data)
if (m->flags & FLAG_BROADCAST)
dst = in4addr_broadcast;
else
- dst = c->ip4.addr;
+ dst = addr;
tap_udp4_send(c, c->ip4.our_tap_addr, 67, dst, 68, &reply, dlen);
diff --git a/dhcpv6.c b/dhcpv6.c
index 2db0944..0a064a9 100644
--- a/dhcpv6.c
+++ b/dhcpv6.c
@@ -318,7 +318,7 @@ static bool dhcpv6_opt(struct iov_tail *data, uint16_t type)
* false otherwise and @data is unmodified
*/
static bool dhcpv6_ia_notonlink(struct iov_tail *data,
- struct in6_addr *la)
+ const struct in6_addr *la)
{
int ia_types[2] = { OPT_IA_NA, OPT_IA_TA };
struct opt_ia_addr opt_addr_storage;
@@ -567,6 +567,7 @@ int dhcpv6(struct ctx *c, struct iov_tail *data,
struct opt_hdr client_id_storage;
/* cppcheck-suppress [variableScope,unmatchedSuppression] */
struct opt_ia_na ia_storage;
+ const struct guest_addr *a;
const struct in6_addr *src;
struct msg_hdr mh_storage;
const struct msg_hdr *mh;
@@ -574,6 +575,8 @@ int dhcpv6(struct ctx *c, struct iov_tail *data,
const struct udphdr *uh;
size_t mlen, n;
+ a = fwd_get_addr(c, AF_INET6, 0, CONF_ADDR_LINKLOCAL);
+
uh = IOV_REMOVE_HEADER(data, uh_storage);
if (!uh)
return -1;
@@ -627,7 +630,7 @@ int dhcpv6(struct ctx *c, struct iov_tail *data,
if (mh->type == TYPE_CONFIRM && server_id)
return -1;
- if (dhcpv6_ia_notonlink(data, &c->ip6.addr)) {
+ if (a && dhcpv6_ia_notonlink(data, &a->addr.a6)) {
dhcpv6_send_ia_notonlink(c, saddr, data, &client_id_base,
ntohs(client_id->l), mh->xid);
@@ -680,7 +683,8 @@ int dhcpv6(struct ctx *c, struct iov_tail *data,
resp.hdr.xid = mh->xid;
tap_udp6_send(c, src, 547, saddr, 546, mh->xid, &resp, n);
- c->ip6.addr_seen = c->ip6.addr;
+ if (a)
+ c->ip6.addr_seen = a->addr.a6;
return 1;
}
@@ -691,6 +695,7 @@ int dhcpv6(struct ctx *c, struct iov_tail *data,
*/
void dhcpv6_init(const struct ctx *c)
{
+ const struct guest_addr *a;
time_t y2k = 946684800; /* Epoch to 2000-01-01T00:00:00Z, no mktime() */
uint32_t duid_time;
@@ -704,5 +709,7 @@ void dhcpv6_init(const struct ctx *c)
memcpy(resp_not_on_link.server_id.duid_lladdr,
c->our_tap_mac, sizeof(c->our_tap_mac));
- resp.ia_addr.addr = c->ip6.addr;
+ a = fwd_get_addr(c, AF_INET6, 0, CONF_ADDR_LINKLOCAL);
+ if (a)
+ resp.ia_addr.addr = a->addr.a6;
}
diff --git a/fwd.c b/fwd.c
index bedbf98..14ce0a7 100644
--- a/fwd.c
+++ b/fwd.c
@@ -249,6 +249,61 @@ void fwd_neigh_table_init(const struct ctx *c)
fwd_neigh_table_update(c, &mga, c->our_tap_mac, true);
}
+/**
+ * fwd_set_addr() - Add or update an address in the unified address array
+ * @c: Execution context
+ * @addr: Address to add (IPv4-mapped or IPv6)
+ * @flags: CONF_ADDR_* flags for this address
+ * @prefix_len: Prefix length in IPv6 or IPv4 format
+ *
+ * Find the first existing entry of the same address family and
+ * overwrite it, or create a new one if none exists
+ */
+void fwd_set_addr(struct ctx *c, const union inany_addr *addr,
+ uint8_t flags, int prefix_len)
+{
+ struct guest_addr *a;
+
+ for_each_addr(a, c->addrs, c->addr_count, inany_af(addr)) {
+ goto found;
+ }
+
+ if (c->addr_count >= MAX_GUEST_ADDRS)
+ return;
+
+ a = &c->addrs[c->addr_count++];
+
+found:
+ a->addr = *addr;
+ a->prefix_len = inany_prefix_len6(addr, prefix_len);
+ a->flags = flags;
+}
+
+/**
+ * fwd_get_addr() - Get guest address entry matching criteria
+ * @c: Execution context
+ * @af: Address family (AF_INET, AF_INET6, or 0 for any)
+ * @incl: Flags that must be present (any-match)
+ * @excl: Flags that must not be present
+ *
+ * Return: first address entry matching criteria, or NULL
+ */
+const struct guest_addr *fwd_get_addr(const struct ctx *c, sa_family_t af,
+ uint8_t incl, uint8_t excl)
+{
+ const struct guest_addr *a;
+
+ for_each_addr(a, c->addrs, c->addr_count, af) {
+ if (incl && !(a->flags & incl))
+ continue;
+ if (a->flags & excl)
+ continue;
+ return a;
+ }
+
+ return NULL;
+}
+
/** fwd_probe_ephemeral() - Determine what ports this host considers ephemeral
*
* Work out what ports the host thinks are emphemeral and record it for later
@@ -941,8 +996,10 @@ static bool is_dns_flow(uint8_t proto, const struct flowside *ini)
* translation, false otherwise
*/
static bool fwd_guest_accessible4(const struct ctx *c,
- const struct in_addr *addr)
+ const struct in_addr *addr)
{
+ const struct guest_addr *a;
+
if (IN4_IS_ADDR_LOOPBACK(addr))
return false;
@@ -957,7 +1014,8 @@ static bool fwd_guest_accessible4(const struct ctx *c,
/* For IPv4, addr_seen is initialised to addr, so is always a valid
* address
*/
- if (IN4_ARE_ADDR_EQUAL(addr, &c->ip4.addr) ||
+ a = fwd_get_addr(c, AF_INET, 0, 0);
+ if ((a && IN4_ARE_ADDR_EQUAL(addr, inany_v4(&a->addr))) ||
IN4_ARE_ADDR_EQUAL(addr, &c->ip4.addr_seen))
return false;
@@ -975,10 +1033,13 @@ static bool fwd_guest_accessible4(const struct ctx *c,
static bool fwd_guest_accessible6(const struct ctx *c,
const struct in6_addr *addr)
{
+ const struct guest_addr *a;
+
if (IN6_IS_ADDR_LOOPBACK(addr))
return false;
- if (IN6_ARE_ADDR_EQUAL(addr, &c->ip6.addr))
+ a = fwd_get_addr(c, AF_INET6, 0, 0);
+ if (a && IN6_ARE_ADDR_EQUAL(addr, &a->addr.a6))
return false;
/* For IPv6, addr_seen starts unspecified, because we don't know what LL
@@ -1023,16 +1084,21 @@ static bool fwd_guest_accessible(const struct ctx *c,
static void nat_outbound(const struct ctx *c, const union inany_addr *addr,
union inany_addr *translated)
{
- if (inany_equals4(addr, &c->ip4.map_host_loopback))
+ const struct guest_addr *ga;
+
+ if (inany_equals4(addr, &c->ip4.map_host_loopback)) {
*translated = inany_loopback4;
- else if (inany_equals6(addr, &c->ip6.map_host_loopback))
+ } else if (inany_equals6(addr, &c->ip6.map_host_loopback)) {
*translated = inany_loopback6;
- else if (inany_equals4(addr, &c->ip4.map_guest_addr))
- *translated = inany_from_v4(c->ip4.addr);
- else if (inany_equals6(addr, &c->ip6.map_guest_addr))
- translated->a6 = c->ip6.addr;
- else
+ } else if (inany_equals4(addr, &c->ip4.map_guest_addr)) {
+ ga = fwd_get_addr(c, AF_INET, 0, 0);
+ *translated = ga ? ga->addr : inany_any4;
+ } else if (inany_equals6(addr, &c->ip6.map_guest_addr)) {
+ ga = fwd_get_addr(c, AF_INET6, 0, 0);
+ translated->a6 = ga ? ga->addr.a6 : in6addr_any;
+ } else {
*translated = *addr;
+ }
}
/**
@@ -1137,16 +1203,21 @@ bool nat_inbound(const struct ctx *c, const union inany_addr *addr,
} else if (!IN6_IS_ADDR_UNSPECIFIED(&c->ip6.map_host_loopback) &&
inany_equals6(addr, &in6addr_loopback)) {
translated->a6 = c->ip6.map_host_loopback;
- } else if (!IN4_IS_ADDR_UNSPECIFIED(&c->ip4.map_guest_addr) &&
- inany_equals4(addr, &c->ip4.addr)) {
- *translated = inany_from_v4(c->ip4.map_guest_addr);
- } else if (!IN6_IS_ADDR_UNSPECIFIED(&c->ip6.map_guest_addr) &&
- inany_equals6(addr, &c->ip6.addr)) {
- translated->a6 = c->ip6.map_guest_addr;
- } else if (fwd_guest_accessible(c, addr)) {
- *translated = *addr;
} else {
- return false;
+ const struct guest_addr *ga4 = fwd_get_addr(c, AF_INET, 0, 0);
+ const struct guest_addr *ga6 = fwd_get_addr(c, AF_INET6, 0, 0);
+
+ if (!IN4_IS_ADDR_UNSPECIFIED(&c->ip4.map_guest_addr) &&
+ ga4 && inany_equals(addr, &ga4->addr)) {
+ *translated = inany_from_v4(c->ip4.map_guest_addr);
+ } else if (!IN6_IS_ADDR_UNSPECIFIED(&c->ip6.map_guest_addr) &&
+ ga6 && inany_equals(addr, &ga6->addr)) {
+ translated->a6 = c->ip6.map_guest_addr;
+ } else if (fwd_guest_accessible(c, addr)) {
+ *translated = *addr;
+ } else {
+ return false;
+ }
}
return true;
diff --git a/fwd.h b/fwd.h
index 958eee2..c5a1068 100644
--- a/fwd.h
+++ b/fwd.h
@@ -23,6 +23,8 @@ struct flowside;
void fwd_probe_ephemeral(void);
bool fwd_port_is_ephemeral(in_port_t port);
+const struct guest_addr *fwd_get_addr(const struct ctx *c, sa_family_t af,
+ uint8_t incl, uint8_t excl);
/**
* struct fwd_rule - Forwarding rule governing a range of ports
@@ -141,5 +143,7 @@ void fwd_neigh_table_free(const struct ctx *c,
void fwd_neigh_mac_get(const struct ctx *c, const union inany_addr *addr,
uint8_t *mac);
void fwd_neigh_table_init(const struct ctx *c);
+void fwd_set_addr(struct ctx *c, const union inany_addr *addr,
+ uint8_t flags, int prefix_len);
#endif /* FWD_H */
diff --git a/inany.h b/inany.h
index 9891ed6..0450c45 100644
--- a/inany.h
+++ b/inany.h
@@ -102,6 +102,16 @@ static inline struct in_addr *inany_v4(const union inany_addr *addr)
return (struct in_addr *)&addr->v4mapped.a4;
}
+/** inany_af - Get address family of IPv[46] address
+ * @addr: IPv4 or IPv6 address
+ *
+ * Return: AF_INET for IPv4, AF_INET6 for IPv6
+ */
+static inline sa_family_t inany_af(const union inany_addr *addr)
+{
+ return inany_v4(addr) ? AF_INET : AF_INET6;
+}
+
/** inany_default_prefix_len() - Get default prefix length for address
* @addr: IPv4 or iPv6 address
*
@@ -115,6 +125,37 @@ static inline int inany_default_prefix_len(const union inany_addr *addr)
return v4 ? ip4_class_prefix_len(v4) + 96 : 64;
}
+
+/** inany_prefix_len() - Convert prefix length to native format
+ * @addr: IPv4 or IPv6 address
+ * @prefix_len: prefix length (any format, auto-detected)
+ *
+ * Return: prefix length in native format (0-32 for IPv4, 0-128 for IPv6)
+ */
+static inline int inany_prefix_len(const union inany_addr *addr,
+ int prefix_len)
+{
+ if (inany_v4(addr) && prefix_len >= 96)
+ return prefix_len - 96;
+
+ return prefix_len;
+}
+
+/** inany_prefix_len6() - Convert prefix length to generic format
+ * @addr: IPv4 or IPv6 address
+ * @prefix_len: prefix length (any format, auto-detected)
+ *
+ * Return: prefix length in generic format (96-128 for IPv4, 0-128 for IPv6)
+ */
+static inline int inany_prefix_len6(const union inany_addr *addr,
+ int prefix_len)
+{
+ if (inany_v4(addr) && prefix_len && prefix_len <= 32)
+ return prefix_len + 96;
+
+ return prefix_len;
+}
+
/** inany_equals - Compare two IPv[46] addresses
* @a, @b: IPv[46] addresses
*
diff --git a/ip.h b/ip.h
index d0de6c8..933d98c 100644
--- a/ip.h
+++ b/ip.h
@@ -19,6 +19,8 @@
(ntohl(((struct in_addr *)(a))->s_addr) >> IN_CLASSA_NSHIFT == IN_LOOPBACKNET)
#define IN4_IS_ADDR_MULTICAST(a) \
(IN_MULTICAST(ntohl(((struct in_addr *)(a))->s_addr)))
+#define IN4_MASK(prefix) \
+ ((prefix) <= 0 ? 0 : htonl(0xffffffff << (32 - (prefix))))
#define IN4_ARE_ADDR_EQUAL(a, b) \
(((struct in_addr *)(a))->s_addr == ((struct in_addr *)b)->s_addr)
#define IN4ADDR_LOOPBACK_INIT \
diff --git a/ndp.c b/ndp.c
index 1f2bcb0..3750fc5 100644
--- a/ndp.c
+++ b/ndp.c
@@ -257,7 +257,6 @@ static void ndp_ra(const struct ctx *c, const struct in6_addr *dst)
.valid_lifetime = ~0U,
.pref_lifetime = ~0U,
},
- .prefix = c->ip6.addr,
.source_ll = {
.header = {
.type = OPT_SRC_L2_ADDR,
@@ -265,8 +264,13 @@ static void ndp_ra(const struct ctx *c, const struct in6_addr *dst)
},
},
};
+ const struct guest_addr *a = fwd_get_addr(c, AF_INET6, 0, 0);
unsigned char *ptr = NULL;
+ ASSERT(a);
+
+ ra.prefix = a->addr.a6;
+
ptr = &ra.var[0];
if (c->mtu) {
@@ -460,6 +464,7 @@ first:
*/
void ndp_send_init_req(const struct ctx *c)
{
+ const struct guest_addr *a = fwd_get_addr(c, AF_INET6, 0, 0);
struct ndp_ns ns = {
.ih = {
.icmp6_type = NS,
@@ -468,8 +473,13 @@ void ndp_send_init_req(const struct ctx *c)
.icmp6_solicited = 0, /* Reserved */
.icmp6_override = 0, /* Reserved */
},
- .target_addr = c->ip6.addr
+ .target_addr = IN6ADDR_ANY_INIT
};
+
+ if (!a)
+ return;
+
+ ns.target_addr = a->addr.a6;
debug("Sending initial NDP NS request for guest MAC address");
- ndp_send(c, &c->ip6.addr, &ns, sizeof(ns));
+ ndp_send(c, &a->addr.a6, &ns, sizeof(ns));
}
diff --git a/passt.h b/passt.h
index b614bdf..f75656d 100644
--- a/passt.h
+++ b/passt.h
@@ -64,11 +64,28 @@ enum passt_modes {
MODE_VU,
};
+/* Maximum number of addresses in context address array */
+#define MAX_GUEST_ADDRS 32
+
+/**
+ * struct guest_addr - Unified IPv4/IPv6 address entry
+ * @addr: IPv4 (as mapped) or IPv6 address
+ * @prefix_len: Prefix length in IPv6/IPv4-mapped [0,128]/[96,128] format
+ * @flags: CONF_ADDR_* flags
+ */
+struct guest_addr {
+ union inany_addr addr;
+ uint8_t prefix_len;
+ uint8_t flags;
+#define CONF_ADDR_USER BIT(0) /* User set via -a */
+#define CONF_ADDR_HOST BIT(1) /* From host interface */
+#define CONF_ADDR_GENERATED BIT(2) /* Generated by PASST/PASTA */
+#define CONF_ADDR_LINKLOCAL BIT(3) /* Link-local address */
+};
+
/**
* struct ip4_ctx - IPv4 execution context
- * @addr: IPv4 address assigned to guest
* @addr_seen: Latest IPv4 address seen as source from tap
- * @prefixlen: IPv4 prefix length (netmask)
* @guest_gw: IPv4 gateway as seen by the guest
* @map_host_loopback: Outbound connections to this address are NATted to the
* host's 127.0.0.1
@@ -84,10 +101,7 @@ enum passt_modes {
* @no_copy_addrs: Don't copy all addresses when configuring namespace
*/
struct ip4_ctx {
- /* PIF_TAP addresses */
- struct in_addr addr;
struct in_addr addr_seen;
- int prefix_len;
struct in_addr guest_gw;
struct in_addr map_host_loopback;
struct in_addr map_guest_addr;
@@ -107,7 +121,6 @@ struct ip4_ctx {
/**
* struct ip6_ctx - IPv6 execution context
- * @addr: IPv6 address assigned to guest
* @addr_seen: Latest IPv6 global/site address seen as source from tap
* @addr_ll_seen: Latest IPv6 link-local address seen as source from tap
* @guest_gw: IPv6 gateway as seen by the guest
@@ -125,8 +138,6 @@ struct ip4_ctx {
* @no_copy_addrs: Don't copy all addresses when configuring namespace
*/
struct ip6_ctx {
- /* PIF_TAP addresses */
- struct in6_addr addr;
struct in6_addr addr_seen;
struct in6_addr addr_ll_seen;
struct in6_addr guest_gw;
@@ -181,6 +192,8 @@ struct ip6_ctx {
* @fqdn: Guest FQDN
* @ifi6: Template interface for IPv6, -1: none, 0: IPv6 disabled
* @ip6: IPv6 configuration
+ * @addrs: Unified address array for both IPv4 (mapped) and IPv6
+ * @addr_count: Number of active entries in @addrs array
* @pasta_ifn: Name of namespace interface for pasta
* @pasta_ifi: Index of namespace interface for pasta
* @pasta_conf_ns: Configure namespace after creating it
@@ -260,6 +273,9 @@ struct ctx {
int ifi6;
struct ip6_ctx ip6;
+ struct guest_addr addrs[MAX_GUEST_ADDRS];
+ int addr_count;
+
char pasta_ifn[IF_NAMESIZE];
unsigned int pasta_ifi;
int pasta_conf_ns;
@@ -301,6 +317,41 @@ struct ctx {
bool migrate_exit;
};
+/**
+ * next_addr_idx_() - Find next address index matching family filter
+ * @addrs: Array of guest addresses
+ * @count: Number of addresses in array
+ * @i: Starting index
+ * @af: Address family filter: AF_INET, AF_INET6, or 0 for all
+ *
+ * Return: next matching index, or count if none found
+ */
+static inline int next_addr_idx_(const struct guest_addr *addrs, int count,
+ int i, sa_family_t af)
+{
+ for (; i < count; i++) {
+ sa_family_t entry_af;
+
+ entry_af = inany_v4(&addrs[i].addr) ? AF_INET : AF_INET6;
+
+ if (af == AF_UNSPEC || af == entry_af)
+ return i;
+ }
+ return i;
+}
+
+/**
+ * for_each_addr() - Iterate over addresses in array
+ * @a: Pointer variable for current entry (struct guest_addr *)
+ * @addrs: Array of guest addresses (e.g., c->addrs)
+ * @count: Number of addresses (e.g., c->addr_count)
+ * @af: Address family filter: AF_INET, AF_INET6, or 0 for all
+ */
+#define for_each_addr(a, addrs, count, af) \
+ for (int i_ = next_addr_idx_((addrs), (count), 0, (af)); \
+ i_ < (count) && ((a) = &(addrs)[i_], true); \
+ i_ = next_addr_idx_((addrs), (count), i_ + 1, (af)))
+
void proto_update_l2_buf(const unsigned char *eth_d);
#endif /* PASST_H */
diff --git a/pasta.c b/pasta.c
index bab945f..c51e4cd 100644
--- a/pasta.c
+++ b/pasta.c
@@ -330,6 +330,8 @@ void pasta_ns_conf(struct ctx *c)
if (c->pasta_conf_ns) {
unsigned int flags = IFF_UP;
+ const struct guest_addr *a;
+ int plen;
if (c->mtu)
nl_link_set_mtu(nl_sock_ns, c->pasta_ifi, c->mtu);
@@ -341,10 +343,15 @@ void pasta_ns_conf(struct ctx *c)
if (c->ifi4) {
if (c->ip4.no_copy_addrs) {
- rc = nl_addr_set(nl_sock_ns, c->pasta_ifi,
- AF_INET,
- &c->ip4.addr,
- c->ip4.prefix_len);
+ a = fwd_get_addr(c, AF_INET, 0, 0);
+ if (a) {
+ plen = inany_prefix_len(&a->addr,
+ a->prefix_len);
+ rc = nl_addr_set(nl_sock_ns,
+ c->pasta_ifi, AF_INET,
+ inany_v4(&a->addr),
+ plen);
+ }
} else {
rc = nl_addr_dup(nl_sock, c->ifi4,
nl_sock_ns, c->pasta_ifi,
@@ -397,11 +404,13 @@ ipv4_done:
0, IFF_NOARP);
if (c->ip6.no_copy_addrs) {
- if (!IN6_IS_ADDR_UNSPECIFIED(&c->ip6.addr)) {
+ a = fwd_get_addr(c, AF_INET6, 0, 0);
+ if (a)
rc = nl_addr_set(nl_sock_ns,
- c->pasta_ifi, AF_INET6,
- &c->ip6.addr, 64);
- }
+ c->pasta_ifi,
+ AF_INET6,
+ &a->addr.a6,
+ a->prefix_len);
} else {
rc = nl_addr_dup(nl_sock, c->ifi6,
nl_sock_ns, c->pasta_ifi,
diff --git a/tap.c b/tap.c
index 59c45a3..eb93f74 100644
--- a/tap.c
+++ b/tap.c
@@ -936,8 +936,11 @@ resume:
c->ip6.addr_seen = *saddr;
}
- if (IN6_IS_ADDR_UNSPECIFIED(&c->ip6.addr))
- c->ip6.addr = *saddr;
+ if (!fwd_get_addr(c, AF_INET6, 0, 0)) {
+ union inany_addr addr = { .a6 = *saddr };
+
+ fwd_set_addr(c, &addr, CONF_ADDR_LINKLOCAL, 64);
+ }
} else if (!IN6_IS_ADDR_UNSPECIFIED(saddr)){
c->ip6.addr_seen = *saddr;
}
--
2.52.0
3
2
This series adds handling of multiple addresses into a unified address
array, so that a guest can see the same addresses on its own interface.
o All addresses are stored as union inany_addr
o User configured addresses are marked with a USER flag.
o Host provided addresses are marked with a HOST flag.
o Link local addresses are also marked with a LINKLOCAL flag.
o Addresses the guest is actually using are marked with an OBSERVED flag.
o Addresses eligible for DHCP assignments are marked with a DHCP flag.
o Addresses eligible for DHCPv6 advertisement are marked with a DHCPV6 flag.
o Addresses eligible for NDP advertisement are marked with an NDP flag.
v2:
- Added the earlier standalone CIDR commit to the head of the series.
- Replaced the guest namespace interface subscriptions with just
an address observation feature, so that it works with both PASTA
and PASST.
- Unified 'no_copy_addrs' and 'copy_addrs' code paths, as suggested
by David G.
- Multiple other changes, also based on feedback from David.
- Removed the host interface subscription patches, for now.
I intend to re-add them once this series is applied.
- Outstanding question: When do we add an IPv4 link local address
to the guest? Only in local/opaque mode? Only when
explicitly requested? Always?
v3:
- Unified the IPv4 and IPv6 arrays into one array
- Changed prefix_len to always be in IPv6/IPv4 mapped format
- Updated migration protocol to v3, handling multiple addresses
- Many other smaller changes, based on feedback from the PASST team
v4:
- Numerous changes based on feedback
- Added several new commits, mostly broken
out of the pre-existing ones.
v5: - Re-introduced multiple OBSERVED addresses. This actually
turned out to be cleaner and with more predictable behaviour
than allowing only one.
- Included the DHCP and NDP patches from previous versions,
improved and updated according to feedback from the team.
- Likewise re-included the host-side netlink commit to support
late binding.
v6: - Skipped late binding commit for now.
- Added commit for using a single print buffer in conf_print
- Added commit for reading and adding all addresses from
template interface.
- Added commit for refactoring pasta_ns_conf().
- Added separate address flags for DHCP, DHCPv6, and NDP,
so that those are easy to recognize for their respective
functions.
- Split DHCP and DHCPv6 address selection into separate commits.
- Updated migration protocol to v3 for multi-address support.
- Numerous other smaller changes, both after feedback from
David G. and issues I have identified myself.
v7: - Replaced commit #1 with one that fixes a return address
issue with DHCPv6
- Modified for_each_addr() macro to take 4 arguments
- Many more fixes and changes based on feedback and own
findings.
Jon Maloy (13):
dhcpv6: Fix reply destination to match client's source address
passt, pasta: Introduce unified multi-address data structures
fwd: Unify guest accessibility checks with unified address array
arp: Check all configured addresses in ARP filtering
conf: Allow multiple -a/--address options per address family
netlink, conf: Read all addresses from template interface at startup
netlink, pasta: refactor function pasta_ns_conf()
conf, pasta: Track observed guest IPv4 addresses in unified address
array
conf, pasta: Track observed guest IPv6 addresses in unified address
array
migrate: Update protocol to v3 for multi-address support
dhcp: Select address for DHCP distribution
dhcpv6: Select addresses for DHCPv6 distribution
ndp: Support advertising multiple prefixes in Router Advertisements
arp.c | 20 +++-
conf.c | 200 ++++++++++++++++++++---------------
dhcp.c | 22 ++--
dhcpv6.c | 115 +++++++++++---------
dhcpv6.h | 2 +-
fwd.c | 305 ++++++++++++++++++++++++++++++++++++++++--------------
fwd.h | 8 ++
inany.h | 44 ++++++++
ip.h | 2 +
migrate.c | 240 ++++++++++++++++++++++++++++++++++++++++--
ndp.c | 131 ++++++++++++++++-------
netlink.c | 70 +++++++------
netlink.h | 7 +-
passt.1 | 7 +-
passt.h | 78 +++++++++++---
pasta.c | 224 ++++++++++++++++++++-------------------
tap.c | 37 ++-----
tap.h | 2 -
18 files changed, 1054 insertions(+), 460 deletions(-)
--
2.52.0
2
13
[PATCH v3 00/10] vhost-user: Preparatory series for multiple iovec entries per virtqueue element
by Laurent Vivier 14 May '26
by Laurent Vivier 14 May '26
14 May '26
Currently, the vhost-user path assumes each virtqueue element contains
exactly one iovec entry covering the entire frame. This assumption
breaks as some virtio-net drivers (notably iPXE) provide descriptors where the
vnet header and the frame payload are in separate buffers, resulting in
two iovec entries per virtqueue element.
This series refactors the vhost-user data path so that frame lengths,
header sizes, and padding are tracked and passed explicitly rather than
being derived from iovec sizes. This decoupling is a prerequisite for
correctly handling padding of multi-buffer frames.
The changes in this series can be split in 3 groups:
- New iov helpers (patches 1-2):
iov_memset() and iov_memcpy() operate across iovec boundaries.
These are needed by the final patch to pad and copy frame data
when a frame spans multiple iovec entries.
- Structural refactoring (patches 3-5):
Move vnethdr setup into vu_flush(), separate virtqueue management
from socket I/O in the UDP path, and pass iov arrays explicitly
instead of using file-scoped state. These changes make it possible
to pass explicit frame lengths through the stack, which is required
to pad frames independently of iovec layout.
- Explicit length passing throughout the stack (patches 6-10):
Thread explicit L4, L2, frame, and data lengths through checksum,
pcap, vu_flush(), and tcp_fill_headers(), replacing lengths that
were previously derived from iovec sizes. With lengths tracked
explicitly, the final patch can centralise Ethernet frame padding
into vu_collect() and a new vu_pad() helper that correctly pads
frames spanning multiple iovec entries.
v3:
- csum_udp4()/csum_udp6()/udp_vu_csum receive payload length (dlen) rather than l4len
- Add a length parameter to write_remainder() and use it in pcap_frame()
v2:
- Rename iov_memcopy() to iov_memcpy() and use clearer parameter names
- Use clearer code in pcap_frame()
- Add braces around bodies in pcap.c and tcp_vu.c for style consistency
- Extract l2len variable in tap_add_packet() and tcp_vu_send_flag()
to avoid repeating the same expression
- Fix indentation alignment of iov_skip_bytes() arguments in tcp_vu.c
- Introduce fill_size variable in vu_flush()
- Reposition comment for ETH_ZLEN in vu_collect()
Laurent Vivier (10):
iov: Introduce iov_memset()
iov: Add iov_memcpy() to copy data between iovec arrays
vu_common: Move vnethdr setup into vu_flush()
udp_vu: Move virtqueue management from udp_vu_sock_recv() to its
caller
udp_vu: Pass iov explicitly to helpers instead of using file-scoped
array
checksum: Pass explicit L4 length to checksum functions
pcap: Pass explicit L2 length to pcap_iov()
vu_common: Pass explicit frame length to vu_flush()
tcp: Pass explicit data length to tcp_fill_headers()
vhost-user: Centralise Ethernet frame padding in vu_collect() and
vu_pad()
checksum.c | 43 +++++++-----
checksum.h | 6 +-
iov.c | 78 ++++++++++++++++++++++
iov.h | 5 ++
pcap.c | 28 +++++---
pcap.h | 2 +-
tap.c | 10 +--
tcp.c | 14 ++--
tcp_buf.c | 3 +-
tcp_internal.h | 2 +-
tcp_vu.c | 66 ++++++++++---------
udp.c | 5 +-
udp_vu.c | 173 +++++++++++++++++++++++++------------------------
util.c | 31 +++++++--
util.h | 3 +-
vu_common.c | 58 ++++++++++-------
vu_common.h | 5 +-
17 files changed, 338 insertions(+), 194 deletions(-)
--
2.53.0
3
21
[PATCH v8 0/3] vhost-user,udp: Handle multiple iovec entries per virtqueue element
by Laurent Vivier 13 May '26
by Laurent Vivier 13 May '26
13 May '26
Some virtio-net drivers (notably iPXE) provide descriptors where the
vnet header and the frame payload are in separate buffers, resulting in
two iovec entries per virtqueue element. Currently, the RX (host to
guest) path assumes a single iovec per element, which triggers:
ASSERTION FAILED in virtqueue_map_desc (virtio.c:403):
num_sg < max_num_sg
This series reworks the UDP vhost-user receive path to support multiple
iovec entries per element, fixing the iPXE crash.
This series only addresses the UDP path. TCP vhost-user will be
updated to use multi-iov elements in a subsequent series.
Based-on: 20260416155721.3807225-1-lvivier(a)redhat.com
v8:
- Add Reviewed-by from David for 1/3 and 2/3
- Remove VLA to use fixed size (VIRTQUEUE_MAX_SIZE)
- Rename udp_frame to datagram
- Use sizeof(uh) instead of sizeof(struct udphdr)
- Push back uh in memory in udp_vu_csum()
v7:
- In udp_vu_sock_to_tap(), introduce iov_still_needed variable
- Fix iov_tail @data doc comments in udp_vu_prepare()/udp_vu_csum()
- Don't rewrap the virtio checksum comment in udp_update_hdr6()
- Add NULL check for uh in udp_vu_csum()
v6:
- Rebased on top of
[PATCH 00/10] vhost-user: Preparatory series for multiple iovec entries per virtqueue element
v5:
- This version doesn't change the padding system compared to v4,
it's a complex task that will be addressed in another version
- reorder patches and add new patches
- remove IOV_PUT_HEADER()/with_header() and introduce IOV_PUSH_HEADER()
- don't use the iov_tail to provide the headers to the functions
- move vu_set_vnethdr() to vu_flush(), extract vu_queue_notify()
- move vu_flush() inside loop in tcp_vu_data_from_sock() to flush data
by frame and not by full data length
v4:
- rebase
- replace ASSERT() by assert()
v3:
- include the series "Decouple iovec management from virtqueues elements"
- because of this series, drop:
"vu_common: Accept explicit iovec counts in vu_set_element()"
"vu_common: Accept explicit iovec count per element in vu_init_elem()"
"vu_common: Prepare to use multibuffer with guest RX"
"vhost-user,udp: Use 2 iovec entries per element"
- drop "vu_common: Pass iov_tail to vu_set_vnethdr()"
as the spec ensures a buffer is big enough to contain the vnet header
- introduce "with_header()" and merge
"udp: Pass iov_tail to udp_update_hdr4()/udp_update_hdr6()" and
"udp_vu: Use iov_tail in udp_vu_prepare()"
to use it
v2:
- add iov_truncate(), iov_memset()
- remove iov_tail_truncate() and iov_tail_zero_end()
- manage 802.3 minimum frame size
Laurent Vivier (3):
udp_vu: Allow virtqueue elements with multiple iovec entries
iov: Introduce IOV_PUSH_HEADER() macro
udp: Pass iov_tail to udp_update_hdr4()/udp_update_hdr6()
iov.c | 22 ++++++++++
iov.h | 11 +++++
udp.c | 70 ++++++++++++++++-------------
udp_internal.h | 4 +-
udp_vu.c | 116 ++++++++++++++++++++++++++-----------------------
5 files changed, 135 insertions(+), 88 deletions(-)
--
2.53.0
3
8
[PATCH v6 0/4] vhost-user,tcp: Handle multiple iovec entries per virtqueue element
by Laurent Vivier 11 May '26
by Laurent Vivier 11 May '26
11 May '26
This is the TCP counterpart to the UDP multi-iov series. It converts
the TCP vhost-user receive path from direct pointer arithmetic (via
vu_eth(), vu_ip(), etc.) to the iov_tail abstraction, removing the
assumption that all headers reside in a single contiguous buffer.
With this series applied, the TCP path correctly handles virtio-net
drivers that provide multiple buffers per virtqueue element (e.g. iPXE
provides the vnet header in the first buffer and the frame payload in a
second one), matching the support already present in the UDP path.
Based-on: 20260416160926.3822963-1-lvivier(a)redhat.com
v6:
- Rebase on v8 of UDP series (tcp_update_csum() takes dlen
rather than l4len)
v5:
- Use l2len variable for pcap_iov() length in tcp_vu_send_flag()
- Add braces
- Move pcap_iov() before vu_flush()
- Remove vu_flush() from tcp_vu_send_dup(), let the caller handle it
v4:
- fix error during rebase, s/vu_pad_len/vu_pad/
v3:
- Rebased on top of
[PATCH 00/10] vhost-user: Preparatory series for multiple iovec entries per virtqueue element
v2:
- add "tcp: Encode checksum computation flags in a single parameter"
- remove IOV_PUT_HEADER()/with_header() and use IOV_PUSH_HEADER()
- don't use the iov_tail to provide the headers to the functions
Laurent Vivier (4):
tcp: Encode checksum computation flags in a single parameter
tcp_vu: Build headers on the stack and write them into the iovec
tcp_vu: Support multibuffer frames in tcp_vu_sock_recv()
tcp_vu: Support multibuffer frames in tcp_vu_send_flag()
iov.c | 1 -
tcp.c | 25 +--
tcp_buf.c | 23 +--
tcp_internal.h | 7 +-
tcp_vu.c | 403 +++++++++++++++++++++++++++++--------------------
vu_common.h | 20 ---
6 files changed, 270 insertions(+), 209 deletions(-)
--
2.53.0
3
13
07 May '26
The TCP window advertised to the guest/container must balance two
competing needs: large enough to trigger kernel socket buffer
auto-tuning, but not so large that sendmsg() partially fails, causing
retransmissions.
The current approach uses the difference (SNDBUF_GET() - SIOCOUTQ), but
SNDBUF_GET() returns a scaled value that only roughly accounts for
per-skb overhead. The clamped_scale approximation doesn't accurately
track the actual per-segment overhead, which can lead to both excessive
retransmissions and reduced throughput.
We now introduce the use of SO_MEMINFO to obtain SK_MEMINFO_SNDBUF and
SK_MEMINFO_WMEM_QUEUED from the kernel. The latter is presented in the
kernel's own accounting units, i.e. including the sk_buff overhead,
and matches exactly what the kernel's own sk_stream_memory_free()
function is using.
When data is queued and the overhead ratio is observable, we calculate
the per-segment overhead as (wmem_queued - sendq) / num_segments, then
determine how many additional segments should fit in the remaining
buffer space, considering the calculated per-mss overhead. This approach
treats segments as discrete quantities, and produces a more accurate
estimate of available buffer space than a linear scaling factor does.
When the ratio cannot be observed, e.g. because the queue is empty or
we are in a transient state, we fall back to the existing clamped_scale
calculation (scaling between 100% and 75% of buffer capacity).
When SO_MEMINFO succeeds, we also use SK_MEMINFO_SNDBUF directly to
set SNDBUF, avoiding a separate SO_SNDBUF getsockopt() call.
If SO_MEMINFO is unavailable, we fall back to the pre-existing
SNDBUF_GET() - SIOCOUTQ calculation.
Link: https://bugs.passt.top/show_bug.cgi?id=138
Link: https://github.com/containers/podman/issues/28219
Signed-off-by: Jon Maloy <jmaloy(a)redhat.com>
---
v2: Updated according to feedback from Stefano. Segment-based discrete
overhead calculation instead of linear ratio.
v3: Addressed Stefano's v2 feedback:
- Extracted window calculation into tcp_wnd_from_sndbuf()
- Use wmem_queued instead of SIOCOUTQ for fallback and SWS check
---
tcp.c | 137 ++++++++++++++++++++++++++++++++++-------------------
tcp_conn.h | 2 +-
2 files changed, 89 insertions(+), 50 deletions(-)
diff --git a/tcp.c b/tcp.c
index 43b8fdb..61160cf 100644
--- a/tcp.c
+++ b/tcp.c
@@ -295,6 +295,7 @@
#include <arpa/inet.h>
#include <linux/sockios.h>
+#include <linux/sock_diag.h>
#include "checksum.h"
#include "util.h"
@@ -1017,6 +1018,90 @@ size_t tcp_fill_headers(const struct ctx *c, struct tcp_tap_conn *conn,
return MAX(l3len + sizeof(struct ethhdr), ETH_ZLEN);
}
+/**
+ * tcp_wnd_from_sndbuf() - Calculate window from available send buffer space
+ * @s: Socket file descriptor
+ * @conn: Connection pointer
+ * @tinfo: tcp_info from kernel
+ *
+ * Return: window value to advertise, not scaled
+ */
+static uint32_t tcp_wnd_from_sndbuf(int s, struct tcp_tap_conn *conn,
+ const struct tcp_info_linux *tinfo)
+{
+ uint32_t rtt_ms_ceiling = DIV_ROUND_UP(tinfo->tcpi_rtt, 1000);
+ uint32_t mem[SK_MEMINFO_VARS];
+ socklen_t mem_sl = sizeof(mem);
+ int mss = MSS_GET(conn);
+ uint32_t limit, sendq;
+
+ if (ioctl(s, SIOCOUTQ, &sendq)) {
+ debug_perror("SIOCOUTQ on socket %i, assuming 0", s);
+ sendq = 0;
+ }
+
+ if (getsockopt(s, SOL_SOCKET, SO_MEMINFO, &mem, &mem_sl)) {
+ tcp_get_sndbuf(conn);
+
+ if (sendq > SNDBUF_GET(conn)) /* Due to memory pressure? */
+ limit = 0;
+ else
+ limit = SNDBUF_GET(conn) - sendq;
+ } else {
+ uint32_t sndbuf = mem[SK_MEMINFO_SNDBUF];
+ uint32_t wmemq = mem[SK_MEMINFO_WMEM_QUEUED];
+ uint32_t scaled = clamped_scale(sndbuf, sndbuf, SNDBUF_SMALL,
+ SNDBUF_BIG, 75);
+
+ SNDBUF_SET(conn, MIN(INT_MAX, scaled));
+
+ if (wmemq > sndbuf) {
+ limit = 0;
+ } else if (!sendq || !mss || wmemq <= sendq) {
+ limit = SNDBUF_GET(conn) - wmemq;
+ } else {
+ uint32_t used_segs = MAX(sendq / mss, 1);
+ uint32_t overhead = (wmemq - sendq) / used_segs;
+ uint32_t remaining = sndbuf - wmemq;
+ uint32_t avail_segs = remaining / (mss + overhead);
+
+ limit = avail_segs * mss;
+ }
+ }
+
+ /* If the sender uses mechanisms to prevent Silly Window
+ * Syndrome (SWS, described in RFC 813 Section 3) it's critical
+ * that, should the window ever become less than the MSS, we
+ * advertise a new value once it increases again to be above it.
+ *
+ * The mechanism to avoid SWS in the kernel is, implicitly,
+ * implemented by Nagle's algorithm (which was proposed after
+ * RFC 813).
+ *
+ * To this end, for simplicity, approximate a window value below
+ * the MSS to zero, as we already have mechanisms in place to
+ * force updates after the window becomes zero. This matches the
+ * suggestion from RFC 813, Section 4.
+ *
+ * But don't do this if, either:
+ *
+ * - there's nothing in the outbound queue: the size of the
+ * sending buffer is limiting us, and it won't increase if we
+ * don't send data, so there's no point in waiting, or
+ *
+ * - we haven't sent data in a while (somewhat arbitrarily, ten
+ * times the RTT), as that might indicate that the receiver
+ * will only process data in batches that are large enough,
+ * but we won't send enough to fill one because we're stuck
+ * with pending data in the outbound queue
+ */
+ if (limit < (uint32_t)MSS_GET(conn) && sendq &&
+ tinfo->tcpi_last_data_sent < rtt_ms_ceiling * 10)
+ limit = 0;
+
+ return MIN(tinfo->tcpi_snd_wnd, limit);
+}
+
/**
* tcp_update_seqack_wnd() - Update ACK sequence and window to guest/tap
* @c: Execution context
@@ -1124,56 +1209,10 @@ int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
}
}
- if ((conn->flags & LOCAL) || tcp_rtt_dst_low(conn)) {
+ if ((conn->flags & LOCAL) || tcp_rtt_dst_low(conn))
new_wnd_to_tap = tinfo->tcpi_snd_wnd;
- } else {
- unsigned rtt_ms_ceiling = DIV_ROUND_UP(tinfo->tcpi_rtt, 1000);
- uint32_t sendq;
- int limit;
-
- if (ioctl(s, SIOCOUTQ, &sendq)) {
- debug_perror("SIOCOUTQ on socket %i, assuming 0", s);
- sendq = 0;
- }
- tcp_get_sndbuf(conn);
-
- if ((int)sendq > SNDBUF_GET(conn)) /* Due to memory pressure? */
- limit = 0;
- else
- limit = SNDBUF_GET(conn) - (int)sendq;
-
- /* If the sender uses mechanisms to prevent Silly Window
- * Syndrome (SWS, described in RFC 813 Section 3) it's critical
- * that, should the window ever become less than the MSS, we
- * advertise a new value once it increases again to be above it.
- *
- * The mechanism to avoid SWS in the kernel is, implicitly,
- * implemented by Nagle's algorithm (which was proposed after
- * RFC 813).
- *
- * To this end, for simplicity, approximate a window value below
- * the MSS to zero, as we already have mechanisms in place to
- * force updates after the window becomes zero. This matches the
- * suggestion from RFC 813, Section 4.
- *
- * But don't do this if, either:
- *
- * - there's nothing in the outbound queue: the size of the
- * sending buffer is limiting us, and it won't increase if we
- * don't send data, so there's no point in waiting, or
- *
- * - we haven't sent data in a while (somewhat arbitrarily, ten
- * times the RTT), as that might indicate that the receiver
- * will only process data in batches that are large enough,
- * but we won't send enough to fill one because we're stuck
- * with pending data in the outbound queue
- */
- if (limit < MSS_GET(conn) && sendq &&
- tinfo->tcpi_last_data_sent < rtt_ms_ceiling * 10)
- limit = 0;
-
- new_wnd_to_tap = MIN((int)tinfo->tcpi_snd_wnd, limit);
- }
+ else
+ new_wnd_to_tap = tcp_wnd_from_sndbuf(s, conn, tinfo);
new_wnd_to_tap = MIN(new_wnd_to_tap, MAX_WINDOW);
if (!(conn->events & ESTABLISHED))
diff --git a/tcp_conn.h b/tcp_conn.h
index 6985426..9f5bee0 100644
--- a/tcp_conn.h
+++ b/tcp_conn.h
@@ -98,7 +98,7 @@ struct tcp_tap_conn {
#define SNDBUF_BITS 24
unsigned int sndbuf :SNDBUF_BITS;
#define SNDBUF_SET(conn, bytes) (conn->sndbuf = ((bytes) >> (32 - SNDBUF_BITS)))
-#define SNDBUF_GET(conn) (conn->sndbuf << (32 - SNDBUF_BITS))
+#define SNDBUF_GET(conn) ((uint32_t)(conn->sndbuf << (32 - SNDBUF_BITS)))
uint8_t seq_dup_ack_approx;
--
2.52.0
2
1
While working on pesto, I ran into a number of awkward errors with the
static checkers. This series reworks the invocation of the checkers
in a way that will let us deal with that. As a bonus, it also gives
us static checking for passt-repair and qrap. It also makes a few
other cleanups to the Makefile that seemed natural along the way.
v2:
- Fixed nasty test failure in test/build/build.py
David Gibson (13):
Makefile: Use make variables for static checker configuration
cppcheck: Split out essential defines into a BASE_CPPFLAGS variable
Makefile: Remove preprocessor flags from $(FLAGS)
Makefile: Remove non-standard $(FLAGS) variable
Makefile: Make conditional definition of $(BIN) clearer
Makefile: Use common binary compilation rule
Makefile: Remove unhelpful $(HEADERS) variable
Makefile: Add header dependencies for secondary binaries
Makefile: Split static checker targets
passt-repair: Split out inotify handling to its own function
passt-repair: Simplify construction of Unix path from inotify
passt-repair: Run static checkers
qrap: Run static checkers
Makefile | 106 ++++++++++++++++-----------
linux_dep.h | 2 +-
passt-repair.c | 171 ++++++++++++++++++++++++--------------------
qrap.c | 42 ++++++-----
test/build/build.py | 4 +-
5 files changed, 190 insertions(+), 135 deletions(-)
--
2.53.0
3
30
04 May '26
Although fwd_rule_add() performs some sanity checks on the rule it is
given, there are invalid rules we don't check for, assuming that its
callers will do that.
That won't be enough when we can get rules inserted by a dynamic update
client without going through the existing parsing code. So, add stricter
checks to fwd_rule_add(), which is now possible thanks to the capabilities
bits in the struct fwd_table. Where those duplicate existing checks in the
callers, remove the old copies.
Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au>
---
conf.c | 21 ---------------------
fwd.c | 52 +++++++++++++++++++++++++++++++++++++++++++++++-----
2 files changed, 47 insertions(+), 26 deletions(-)
diff --git a/conf.c b/conf.c
index 6e884e54..b470b0d8 100644
--- a/conf.c
+++ b/conf.c
@@ -176,8 +176,6 @@ static void conf_ports_range_except(struct fwd_table *fwd, uint8_t proto,
die("Invalid interface name: %s", ifname);
}
- assert(first != 0);
-
for (base = first; base <= last; base++) {
if (exclude && bitmap_isset(exclude, base))
continue;
@@ -310,10 +308,6 @@ static void conf_ports_spec(struct fwd_table *fwd, uint8_t proto,
if (p != ep) /* Garbage after the ranges */
goto bad;
- if (orig_range.first == 0) {
- die("Can't forward port 0 included in '%s'", spec);
- }
-
conf_ports_range_except(fwd, proto, addr, ifname,
orig_range.first, orig_range.last,
exclude,
@@ -356,11 +350,6 @@ static void conf_ports(char optname, const char *optarg, struct fwd_table *fwd)
return;
}
- if (proto == IPPROTO_TCP && !(fwd->caps & FWD_CAP_TCP))
- die("TCP port forwarding requested but TCP is disabled");
- if (proto == IPPROTO_UDP && !(fwd->caps & FWD_CAP_UDP))
- die("UDP port forwarding requested but UDP is disabled");
-
strncpy(buf, optarg, sizeof(buf) - 1);
if ((spec = strchr(buf, '/'))) {
@@ -405,16 +394,6 @@ static void conf_ports(char optname, const char *optarg, struct fwd_table *fwd)
addr = NULL;
}
- if (addr) {
- if (!(fwd->caps & FWD_CAP_IPV4) && inany_v4(addr)) {
- die("IPv4 is disabled, can't use -%c %s",
- optname, optarg);
- } else if (!(fwd->caps & FWD_CAP_IPV6) && !inany_v4(addr)) {
- die("IPv6 is disabled, can't use -%c %s",
- optname, optarg);
- }
- }
-
if (optname == 'T' || optname == 'U') {
assert(!addr && !ifname);
diff --git a/fwd.c b/fwd.c
index c7fd1a9d..979c1494 100644
--- a/fwd.c
+++ b/fwd.c
@@ -367,17 +367,59 @@ int fwd_rule_add(struct fwd_table *fwd, const struct fwd_rule *new)
new->first, new->last);
return -EINVAL;
}
+ if (!new->first) {
+ warn("Forwarding rule attempts to map from port 0");
+ return -EINVAL;
+ }
+ if (!new->to ||
+ (in_port_t)(new->to + new->last - new->first) < new->to) {
+ warn("Forwarding rule attempts to map to port 0");
+ return -EINVAL;
+ }
if (new->flags & ~allowed_flags) {
warn("Rule has invalid flags 0x%hhx",
new->flags & ~allowed_flags);
return -EINVAL;
}
- if (new->flags & FWD_DUAL_STACK_ANY &&
- !inany_equals(&new->addr, &inany_any6)) {
- char astr[INANY_ADDRSTRLEN];
+ if (new->flags & FWD_DUAL_STACK_ANY) {
+ if (!inany_equals(&new->addr, &inany_any6)) {
+ char astr[INANY_ADDRSTRLEN];
- warn("Dual stack rule has non-wildcard address %s",
- inany_ntop(&new->addr, astr, sizeof(astr)));
+ warn("Dual stack rule has non-wildcard address %s",
+ inany_ntop(&new->addr, astr, sizeof(astr)));
+ return -EINVAL;
+ }
+ if (!(fwd->caps & FWD_CAP_IPV4)) {
+ warn("Dual stack forward, but IPv4 not enabled");
+ return -EINVAL;
+ }
+ if (!(fwd->caps & FWD_CAP_IPV6)) {
+ warn("Dual stack forward, but IPv6 not enabled");
+ return -EINVAL;
+ }
+ } else {
+ if (inany_v4(&new->addr) && !(fwd->caps & FWD_CAP_IPV4)) {
+ warn("IPv4 forward, but IPv4 not enabled");
+ return -EINVAL;
+ }
+ if (!inany_v4(&new->addr) && !(fwd->caps & FWD_CAP_IPV6)) {
+ warn("IPv6 forward, but IPv6 not enabled");
+ return -EINVAL;
+ }
+ }
+ if (new->proto == IPPROTO_TCP) {
+ if (!(fwd->caps & FWD_CAP_TCP)) {
+ warn("Can't add TCP forwarding rule, TCP not enabled");
+ return -EINVAL;
+ }
+ } else if (new->proto == IPPROTO_UDP) {
+ if (!(fwd->caps & FWD_CAP_UDP)) {
+ warn("Can't add UDP forwarding rule, UDP not enabled");
+ return -EINVAL;
+ }
+ } else {
+ warn("Unsupported protocol 0x%hhx (%s) for forwarding rule",
+ new->proto, ipproto_name(new->proto));
return -EINVAL;
}
--
2.53.0
3
4
03 May '26
Here's the next draft of dynamic configuration updates. This now can
successfully update rules, though I've not tested it very extensively.
Patches 1..8/18 are preliminary reworks that make sense even without
pesto - feel free to apply if you're happy with them. I don't think
the rest should be applied yet; we need to at least harden it so passt
can't be blocked indefinitely by a client which sends a partial update
then waits.
Based on my earlier series reworking static checking invocation.
TODO:
- Don't allow a client which sends a partial configuration then
blocks also block passt
- Allow pesto to clear existing configuration, not just add
- Allow pesto to selectively delete existing rules, not just add
Changes in v5:
* If multiple clients connect at once, they're now blocked until the
first one finishes, instead of later ones being discarded
Changes in v4:
* Merged with remainder of forward rule parsing rework series
* Fix some bugs in rule checking pointed out by Laurent
* Significantly cleaned up option parsing code
* Changed from replacing all existing rules to adding new rules
(clear and remove still TBD)
* Somewhat simplified protocol (pif names and rules sent in a single
pass)
* pesto is now allocation free
* Fixed commit message and style nits pointed out by Stefano
Changes in v3:
* Removed already applied ASSERT() rename
* Renamed serialisation functions
* Incorporated Stefano's extensions, reworked and fixed
* Several additional cleanups / preliminary reworks
Changes in v2:
* Removed already applied cleanups
* Reworked assert() patch to handle -DNDEBUG properly
* Numerous extra patches:
* Factored out serialisation helpers and use them for migration as
well
* Reworked to allow ip.[ch] and inany.[ch] to be shared with pesto
* Reworks to share some forwarding rule datatypes with pesto
* Implemented sending pif names and current ruleset to pesto
David Gibson (18):
conf, fwd: Stricter rule checking in fwd_rule_add()
fwd_rule: Move ephemeral port probing to fwd_rule.c
fwd, conf: Move rule parsing code to fwd_rule.[ch]
fwd_rule: Move conflict checking back within fwd_rule_add()
fwd: Generalise fwd_rules_info()
pif: Limit pif names to 128 bytes
fwd_rule: Fix some format specifiers
tap, repair: Use SOCK_NONBLOCK and SOCK_CLOEXEC on Unix sockets
pesto: Introduce stub configuration tool
pesto, log: Share log.h (but not log.c) with pesto tool
pesto, conf: Have pesto connect to passt and check versions
pesto: Expose list of pifs to pesto and optionally display
ip: Prepare ip.[ch] for sharing with pesto tool
inany: Prepare inany.[ch] for sharing with pesto tool
pesto: Read current ruleset from passt/pasta and optionally display it
pesto: Parse and add new rules from command line
pesto, conf: Send updated rules from pesto back to passt/pasta
conf, fwd: Allow switching to new rules received from pesto
.gitignore | 2 +
Makefile | 54 ++--
common.h | 122 +++++++++
conf.c | 686 ++++++++++++++++++++++-----------------------------
conf.h | 2 +
epoll_type.h | 4 +
flow.c | 4 +-
fwd.c | 169 ++++---------
fwd.h | 41 +--
fwd_rule.c | 603 ++++++++++++++++++++++++++++++++++++++++++--
fwd_rule.h | 66 ++++-
inany.c | 19 +-
inany.h | 17 +-
ip.c | 56 +----
ip.h | 4 +-
lineread.c | 2 +-
log.h | 59 ++++-
passt.1 | 5 +
passt.c | 8 +
passt.h | 8 +
pesto.1 | 46 ++++
pesto.c | 470 +++++++++++++++++++++++++++++++++++
pesto.h | 55 +++++
pif.c | 2 +-
pif.h | 8 +-
repair.c | 9 +-
serialise.c | 7 +
serialise.h | 1 +
siphash.h | 13 +
tap.c | 64 ++++-
util.c | 2 +-
util.h | 110 +--------
32 files changed, 1921 insertions(+), 797 deletions(-)
create mode 100644 common.h
create mode 100644 pesto.1
create mode 100644 pesto.c
create mode 100644 pesto.h
--
2.53.0
3
28
03 May '26
The code parsing command line options into forwarding rules has now been
decoupled from most of passt/pasta's internals. This is good, because
we'll soon want to share it with a configuration update client.
Make the next step by moving this code into fwd_rule.[ch].
Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au>
Reviewed-by: Laurent Vivier <lvivier(a)redhat.com>
---
conf.c | 376 +------------------------------------------
fwd.c | 94 -----------
fwd.h | 33 ----
fwd_rule.c | 464 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
fwd_rule.h | 36 ++++-
5 files changed, 502 insertions(+), 501 deletions(-)
diff --git a/conf.c b/conf.c
index b470b0d8..5aacfe0f 100644
--- a/conf.c
+++ b/conf.c
@@ -13,7 +13,6 @@
*/
#include <arpa/inet.h>
-#include <ctype.h>
#include <errno.h>
#include <fcntl.h>
#include <getopt.h>
@@ -66,365 +65,6 @@
const char *pasta_default_ifn = "tap0";
-/**
- * port_range() - Represents a non-empty range of ports
- * @first: First port number in the range
- * @last: Last port number in the range (inclusive)
- *
- * Invariant: @last >= @first
- */
-struct port_range {
- in_port_t first, last;
-};
-
-/**
- * parse_port_range() - Parse a range of port numbers '<first>[-<last>]'
- * @s: String to parse
- * @endptr: Update to the character after the parsed range (similar to
- * strtol() etc.)
- * @range: Update with the parsed values on success
- *
- * Return: -EINVAL on parsing error, -ERANGE on out of range port
- * numbers, 0 on success
- */
-static int parse_port_range(const char *s, const char **endptr,
- struct port_range *range)
-{
- unsigned long first, last;
- char *ep;
-
- last = first = strtoul(s, &ep, 10);
- if (ep == s) /* Parsed nothing */
- return -EINVAL;
- if (*ep == '-') { /* we have a last value too */
- const char *lasts = ep + 1;
- last = strtoul(lasts, &ep, 10);
- if (ep == lasts) /* Parsed nothing */
- return -EINVAL;
- }
-
- if ((last < first) || (last >= NUM_PORTS))
- return -ERANGE;
-
- range->first = first;
- range->last = last;
- *endptr = ep;
-
- return 0;
-}
-
-/**
- * parse_keyword() - Parse a literal keyword
- * @s: String to parse
- * @endptr: Update to the character after the keyword
- * @kw: Keyword to accept
- *
- * Return: 0, if @s starts with @kw, -EINVAL if it does not
- */
-static int parse_keyword(const char *s, const char **endptr, const char *kw)
-{
- size_t len = strlen(kw);
-
- if (strlen(s) < len)
- return -EINVAL;
-
- if (memcmp(s, kw, len))
- return -EINVAL;
-
- *endptr = s + len;
- return 0;
-}
-
-/**
- * conf_ports_range_except() - Set up forwarding for a range of ports minus a
- * bitmap of exclusions
- * @fwd: Forwarding table to be updated
- * @proto: Protocol to forward
- * @addr: Listening address
- * @ifname: Listening interface
- * @first: First port to forward
- * @last: Last port to forward
- * @exclude: Bitmap of ports to exclude (may be NULL)
- * @to: Port to translate @first to when forwarding
- * @flags: Flags for forwarding entries
- */
-static void conf_ports_range_except(struct fwd_table *fwd, uint8_t proto,
- const union inany_addr *addr,
- const char *ifname,
- uint16_t first, uint16_t last,
- const uint8_t *exclude, uint16_t to,
- uint8_t flags)
-{
- struct fwd_rule rule = {
- .addr = addr ? *addr : inany_any6,
- .ifname = { 0 },
- .proto = proto,
- .flags = flags,
- };
- char rulestr[FWD_RULE_STRLEN];
- unsigned delta = to - first;
- unsigned base, i;
-
- if (!addr)
- rule.flags |= FWD_DUAL_STACK_ANY;
- if (ifname) {
- int ret;
-
- ret = snprintf(rule.ifname, sizeof(rule.ifname),
- "%s", ifname);
- if (ret <= 0 || (size_t)ret >= sizeof(rule.ifname))
- die("Invalid interface name: %s", ifname);
- }
-
- for (base = first; base <= last; base++) {
- if (exclude && bitmap_isset(exclude, base))
- continue;
-
- for (i = base; i <= last; i++) {
- if (exclude && bitmap_isset(exclude, i))
- break;
- }
-
- rule.first = base;
- rule.last = i - 1;
- rule.to = base + delta;
-
- fwd_rule_conflict_check(&rule, fwd->rules, fwd->count);
- if (fwd_rule_add(fwd, &rule) < 0)
- goto fail;
-
- base = i - 1;
- }
- return;
-
-fail:
- die("Unable to add rule %s",
- fwd_rule_fmt(&rule, rulestr, sizeof(rulestr)));
-}
-
-/*
- * for_each_chunk - Step through delimited chunks of a string
- * @p_: Pointer to start of each chunk (updated)
- * @ep_: Pointer to end of each chunk (updated)
- * @s_: String to step through
- * @sep_: String of all allowed delimiters
- */
-#define for_each_chunk(p_, ep_, s_, sep_) \
- for ((p_) = (s_); \
- (ep_) = (p_) + strcspn((p_), (sep_)), *(p_); \
- (p_) = *(ep_) ? (ep_) + 1 : (ep_))
-
-/**
- * conf_ports_spec() - Parse port range(s) specifier
- * @fwd: Forwarding table to be updated
- * @proto: Protocol to forward
- * @addr: Listening address for forwarding
- * @ifname: Interface name for listening
- * @spec: Port range(s) specifier
- */
-static void conf_ports_spec(struct fwd_table *fwd, uint8_t proto,
- const union inany_addr *addr, const char *ifname,
- const char *spec)
-{
- uint8_t exclude[PORT_BITMAP_SIZE] = { 0 };
- bool exclude_only = true;
- const char *p, *ep;
- uint8_t flags = 0;
- unsigned i;
-
- if (!strcmp(spec, "all")) {
- /* Treat "all" as equivalent to "": all non-ephemeral ports */
- spec = "";
- }
-
- /* Parse excluded ranges and "auto" in the first pass */
- for_each_chunk(p, ep, spec, ",") {
- struct port_range xrange;
-
- if (isdigit(*p)) {
- /* Include range, parse later */
- exclude_only = false;
- continue;
- }
-
- if (parse_keyword(p, &p, "auto") == 0) {
- if (p != ep) /* Garbage after the keyword */
- goto bad;
-
- if (!(fwd->caps & FWD_CAP_SCAN)) {
- die(
-"'auto' port forwarding is only allowed for pasta");
- }
-
- flags |= FWD_SCAN;
- continue;
- }
-
- /* Should be an exclude range */
- if (*p != '~')
- goto bad;
- p++;
-
- if (parse_port_range(p, &p, &xrange))
- goto bad;
- if (p != ep) /* Garbage after the range */
- goto bad;
-
- for (i = xrange.first; i <= xrange.last; i++)
- bitmap_set(exclude, i);
- }
-
- if (exclude_only) {
- /* Exclude ephemeral ports */
- fwd_port_map_ephemeral(exclude);
-
- conf_ports_range_except(fwd, proto, addr, ifname,
- 1, NUM_PORTS - 1, exclude,
- 1, flags | FWD_WEAK);
- return;
- }
-
- /* Now process base ranges, skipping exclusions */
- for_each_chunk(p, ep, spec, ",") {
- struct port_range orig_range, mapped_range;
-
- if (!isdigit(*p))
- /* Already parsed */
- continue;
-
- if (parse_port_range(p, &p, &orig_range))
- goto bad;
-
- if (*p == ':') { /* There's a range to map to as well */
- if (parse_port_range(p + 1, &p, &mapped_range))
- goto bad;
- if ((mapped_range.last - mapped_range.first) !=
- (orig_range.last - orig_range.first))
- goto bad;
- } else {
- mapped_range = orig_range;
- }
-
- if (p != ep) /* Garbage after the ranges */
- goto bad;
-
- conf_ports_range_except(fwd, proto, addr, ifname,
- orig_range.first, orig_range.last,
- exclude,
- mapped_range.first, flags);
- }
-
- return;
-bad:
- die("Invalid port specifier '%s'", spec);
-}
-
-/**
- * conf_ports() - Parse port configuration options, initialise UDP/TCP sockets
- * @optname: Short option name, t, T, u, or U
- * @optarg: Option argument (port specification)
- * @fwd: Forwarding table to be updated
- */
-static void conf_ports(char optname, const char *optarg, struct fwd_table *fwd)
-{
- union inany_addr addr_buf = inany_any6, *addr = &addr_buf;
- char buf[BUFSIZ], *spec, *ifname = NULL;
- uint8_t proto;
-
- if (optname == 't' || optname == 'T')
- proto = IPPROTO_TCP;
- else if (optname == 'u' || optname == 'U')
- proto = IPPROTO_UDP;
- else
- assert(0);
-
- if (!strcmp(optarg, "none")) {
- unsigned i;
-
- for (i = 0; i < fwd->count; i++) {
- if (fwd->rules[i].proto == proto) {
- die("-%c none conflicts with previous options",
- optname);
- }
- }
- return;
- }
-
- strncpy(buf, optarg, sizeof(buf) - 1);
-
- if ((spec = strchr(buf, '/'))) {
- *spec = 0;
- spec++;
-
- if (optname != 't' && optname != 'u')
- die("Listening address not allowed for -%c %s",
- optname, optarg);
-
- if ((ifname = strchr(buf, '%'))) {
- *ifname = 0;
- ifname++;
-
- /* spec is already advanced one past the '/',
- * so the length of the given ifname is:
- * (spec - ifname - 1)
- */
- if (spec - ifname - 1 >= IFNAMSIZ) {
- die("Interface name '%s' is too long (max %u)",
- ifname, IFNAMSIZ - 1);
- }
- }
-
- if (ifname == buf + 1) { /* Interface without address */
- addr = NULL;
- } else {
- char *p = buf;
-
- /* Allow square brackets for IPv4 too for convenience */
- if (*p == '[' && p[strlen(p) - 1] == ']') {
- p[strlen(p) - 1] = '\0';
- p++;
- }
-
- if (!inany_pton(p, addr))
- die("Bad forwarding address '%s'", p);
- }
- } else {
- spec = buf;
-
- addr = NULL;
- }
-
- if (optname == 'T' || optname == 'U') {
- assert(!addr && !ifname);
-
- if (!(fwd->caps & FWD_CAP_IFNAME)) {
- warn(
-"SO_BINDTODEVICE unavailable, forwarding only 127.0.0.1 and ::1 for '-%c %s'",
- optname, optarg);
-
- if (fwd->caps & FWD_CAP_IPV4) {
- conf_ports_spec(fwd, proto,
- &inany_loopback4, NULL, spec);
- }
- if (fwd->caps & FWD_CAP_IPV6) {
- conf_ports_spec(fwd, proto,
- &inany_loopback6, NULL, spec);
- }
- return;
- }
-
- ifname = "lo";
- }
-
- if (ifname && !(fwd->caps & FWD_CAP_IFNAME)) {
- die(
-"Device binding for '-%c %s' unsupported (requires kernel 5.7+)",
- optname, optarg);
- }
-
- conf_ports_spec(fwd, proto, addr, ifname, spec);
-}
-
/**
* add_dns4() - Possibly add the IPv4 address of a DNS resolver to configuration
* @c: Execution context
@@ -2160,16 +1800,16 @@ void conf(struct ctx *c, int argc, char **argv)
if (name == 't') {
opt_t = true;
- conf_ports(name, optarg, c->fwd[PIF_HOST]);
+ fwd_rule_parse(name, optarg, c->fwd[PIF_HOST]);
} else if (name == 'u') {
opt_u = true;
- conf_ports(name, optarg, c->fwd[PIF_HOST]);
+ fwd_rule_parse(name, optarg, c->fwd[PIF_HOST]);
} else if (name == 'T') {
opt_T = true;
- conf_ports(name, optarg, c->fwd[PIF_SPLICE]);
+ fwd_rule_parse(name, optarg, c->fwd[PIF_SPLICE]);
} else if (name == 'U') {
opt_U = true;
- conf_ports(name, optarg, c->fwd[PIF_SPLICE]);
+ fwd_rule_parse(name, optarg, c->fwd[PIF_SPLICE]);
}
} while (name != -1);
@@ -2221,13 +1861,13 @@ void conf(struct ctx *c, int argc, char **argv)
if (c->mode == MODE_PASTA) {
if (!opt_t)
- conf_ports('t', "auto", c->fwd[PIF_HOST]);
+ fwd_rule_parse('t', "auto", c->fwd[PIF_HOST]);
if (!opt_T)
- conf_ports('T', "auto", c->fwd[PIF_SPLICE]);
+ fwd_rule_parse('T', "auto", c->fwd[PIF_SPLICE]);
if (!opt_u)
- conf_ports('u', "auto", c->fwd[PIF_HOST]);
+ fwd_rule_parse('u', "auto", c->fwd[PIF_HOST]);
if (!opt_U)
- conf_ports('U', "auto", c->fwd[PIF_SPLICE]);
+ fwd_rule_parse('U', "auto", c->fwd[PIF_SPLICE]);
}
if (!c->quiet)
diff --git a/fwd.c b/fwd.c
index a6d75b74..728a783c 100644
--- a/fwd.c
+++ b/fwd.c
@@ -275,100 +275,6 @@ void fwd_rule_init(struct ctx *c)
c->fwd[PIF_SPLICE] = &fwd_out;
}
-/**
- * fwd_rule_add() - Validate and add a rule to a forwarding table
- * @fwd: Table to add to
- * @new: Rule to add
- *
- * Return: 0 on success, negative error code on failure
- */
-int fwd_rule_add(struct fwd_table *fwd, const struct fwd_rule *new)
-{
- /* Flags which can be set from the caller */
- const uint8_t allowed_flags = FWD_WEAK | FWD_SCAN | FWD_DUAL_STACK_ANY;
- unsigned num = (unsigned)new->last - new->first + 1;
- unsigned port;
-
- if (new->first > new->last) {
- warn("Rule has invalid port range %u-%u",
- new->first, new->last);
- return -EINVAL;
- }
- if (!new->first) {
- warn("Forwarding rule attempts to map from port 0");
- return -EINVAL;
- }
- if (!new->to ||
- (in_port_t)(new->to + new->last - new->first) < new->to) {
- warn("Forwarding rule attempts to map to port 0");
- return -EINVAL;
- }
- if (new->flags & ~allowed_flags) {
- warn("Rule has invalid flags 0x%hhx",
- new->flags & ~allowed_flags);
- return -EINVAL;
- }
- if (new->flags & FWD_DUAL_STACK_ANY) {
- if (!inany_equals(&new->addr, &inany_any6)) {
- char astr[INANY_ADDRSTRLEN];
-
- warn("Dual stack rule has non-wildcard address %s",
- inany_ntop(&new->addr, astr, sizeof(astr)));
- return -EINVAL;
- }
- if (!(fwd->caps & FWD_CAP_IPV4)) {
- warn("Dual stack forward, but IPv4 not enabled");
- return -EINVAL;
- }
- if (!(fwd->caps & FWD_CAP_IPV6)) {
- warn("Dual stack forward, but IPv6 not enabled");
- return -EINVAL;
- }
- } else {
- if (inany_v4(&new->addr) && !(fwd->caps & FWD_CAP_IPV4)) {
- warn("IPv4 forward, but IPv4 not enabled");
- return -EINVAL;
- }
- if (!inany_v4(&new->addr) && !(fwd->caps & FWD_CAP_IPV6)) {
- warn("IPv6 forward, but IPv6 not enabled");
- return -EINVAL;
- }
- }
- if (new->proto == IPPROTO_TCP) {
- if (!(fwd->caps & FWD_CAP_TCP)) {
- warn("Can't add TCP forwarding rule, TCP not enabled");
- return -EINVAL;
- }
- } else if (new->proto == IPPROTO_UDP) {
- if (!(fwd->caps & FWD_CAP_UDP)) {
- warn("Can't add UDP forwarding rule, UDP not enabled");
- return -EINVAL;
- }
- } else {
- warn("Unsupported protocol 0x%hhx (%s) for forwarding rule",
- new->proto, ipproto_name(new->proto));
- return -EINVAL;
- }
-
- if (fwd->count >= ARRAY_SIZE(fwd->rules)) {
- warn("Too many rules (maximum %u)", ARRAY_SIZE(fwd->rules));
- return -ENOSPC;
- }
- if ((fwd->sock_count + num) > ARRAY_SIZE(fwd->socks)) {
- warn("Rules require too many listening sockets (maximum %u)",
- ARRAY_SIZE(fwd->socks));
- return -ENOSPC;
- }
-
- fwd->rulesocks[fwd->count] = &fwd->socks[fwd->sock_count];
- for (port = new->first; port <= new->last; port++)
- fwd->rulesocks[fwd->count][port - new->first] = -1;
-
- fwd->rules[fwd->count++] = *new;
- fwd->sock_count += num;
- return 0;
-}
-
/**
* fwd_rule_match() - Does a prospective flow match a given forwarding rule?
* @rule: Forwarding rule
diff --git a/fwd.h b/fwd.h
index e664d1d0..8f845d09 100644
--- a/fwd.h
+++ b/fwd.h
@@ -20,8 +20,6 @@
struct flowside;
-#define FWD_RULE_BITS 8
-#define MAX_FWD_RULES MAX_FROM_BITS(FWD_RULE_BITS)
#define FWD_NO_HINT (-1)
/**
@@ -36,36 +34,6 @@ struct fwd_listen_ref {
unsigned rule :FWD_RULE_BITS;
};
-/* Maximum number of listening sockets (per pif)
- *
- * Rationale: This lets us listen on every port for two addresses and two
- * protocols (which we need for -T auto -U auto without SO_BINDTODEVICE), plus a
- * comfortable number of extras.
- */
-#define MAX_LISTEN_SOCKS (NUM_PORTS * 5)
-
-/**
- * struct fwd_table - Forwarding state (per initiating pif)
- * @caps: Forwarding capabilities for this initiating pif
- * @count: Number of forwarding rules
- * @rules: Array of forwarding rules
- * @rulesocks: Parallel array of @rules (@count valid entries) of pointers to
- * @socks entries giving the start of the corresponding rule's
- * sockets within the larger array
- * @sock_count: Number of entries used in @socks (for all rules combined)
- * @socks: Listening sockets for forwarding
- */
-struct fwd_table {
- uint32_t caps;
- unsigned count;
- struct fwd_rule rules[MAX_FWD_RULES];
- int *rulesocks[MAX_FWD_RULES];
- unsigned sock_count;
- int socks[MAX_LISTEN_SOCKS];
-};
-
-#define PORT_BITMAP_SIZE DIV_ROUND_UP(NUM_PORTS, 8)
-
/**
* struct fwd_scan - Port scanning state for a protocol+direction
* @scan4: /proc/net fd to scan for IPv4 ports when in AUTO mode
@@ -81,7 +49,6 @@ struct fwd_scan {
#define FWD_PORT_SCAN_INTERVAL 1000 /* ms */
void fwd_rule_init(struct ctx *c);
-int fwd_rule_add(struct fwd_table *fwd, const struct fwd_rule *new);
const struct fwd_rule *fwd_rule_search(const struct fwd_table *fwd,
const struct flowside *ini,
uint8_t proto, int hint);
diff --git a/fwd_rule.c b/fwd_rule.c
index 9d489827..cd3dec04 100644
--- a/fwd_rule.c
+++ b/fwd_rule.c
@@ -15,6 +15,7 @@
* Author: David Gibson <david(a)gibson.dropbear.id.au>
*/
+#include <ctype.h>
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
@@ -89,7 +90,7 @@ parse_err:
* fwd_port_map_ephemeral() - Mark ephemeral ports in a bitmap
* @map: Bitmap to update
*/
-void fwd_port_map_ephemeral(uint8_t *map)
+static void fwd_port_map_ephemeral(uint8_t *map)
{
unsigned port;
@@ -123,6 +124,7 @@ const union inany_addr *fwd_rule_addr(const struct fwd_rule *rule)
*/
__attribute__((noinline))
#endif
+/* cppcheck-suppress staticFunction */
const char *fwd_rule_fmt(const struct fwd_rule *rule, char *dst, size_t size)
{
const char *percent = *rule->ifname ? "%" : "";
@@ -199,8 +201,8 @@ static bool fwd_rule_conflicts(const struct fwd_rule *a, const struct fwd_rule *
* @rules: Existing rules against which to test
* @count: Number of rules in @rules
*/
-void fwd_rule_conflict_check(const struct fwd_rule *new,
- const struct fwd_rule *rules, size_t count)
+static void fwd_rule_conflict_check(const struct fwd_rule *new,
+ const struct fwd_rule *rules, size_t count)
{
unsigned i;
@@ -215,3 +217,459 @@ void fwd_rule_conflict_check(const struct fwd_rule *new,
fwd_rule_fmt(&rules[i], rulestr, sizeof(rulestr)));
}
}
+
+/**
+ * fwd_rule_add() - Validate and add a rule to a forwarding table
+ * @fwd: Table to add to
+ * @new: Rule to add
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+static int fwd_rule_add(struct fwd_table *fwd, const struct fwd_rule *new)
+{
+ /* Flags which can be set from the caller */
+ const uint8_t allowed_flags = FWD_WEAK | FWD_SCAN | FWD_DUAL_STACK_ANY;
+ unsigned num = (unsigned)new->last - new->first + 1;
+ unsigned port;
+
+ if (new->first > new->last) {
+ warn("Rule has invalid port range %u-%u",
+ new->first, new->last);
+ return -EINVAL;
+ }
+ if (!new->first) {
+ warn("Forwarding rule attempts to map from port 0");
+ return -EINVAL;
+ }
+ if (!new->to ||
+ (in_port_t)(new->to + new->last - new->first) < new->to) {
+ warn("Forwarding rule attempts to map to port 0");
+ return -EINVAL;
+ }
+ if (new->flags & ~allowed_flags) {
+ warn("Rule has invalid flags 0x%hhx",
+ new->flags & ~allowed_flags);
+ return -EINVAL;
+ }
+ if (new->flags & FWD_DUAL_STACK_ANY) {
+ if (!inany_equals(&new->addr, &inany_any6)) {
+ char astr[INANY_ADDRSTRLEN];
+
+ warn("Dual stack rule has non-wildcard address %s",
+ inany_ntop(&new->addr, astr, sizeof(astr)));
+ return -EINVAL;
+ }
+ if (!(fwd->caps & FWD_CAP_IPV4)) {
+ warn("Dual stack forward, but IPv4 not enabled");
+ return -EINVAL;
+ }
+ if (!(fwd->caps & FWD_CAP_IPV6)) {
+ warn("Dual stack forward, but IPv6 not enabled");
+ return -EINVAL;
+ }
+ } else {
+ if (inany_v4(&new->addr) && !(fwd->caps & FWD_CAP_IPV4)) {
+ warn("IPv4 forward, but IPv4 not enabled");
+ return -EINVAL;
+ }
+ if (!inany_v4(&new->addr) && !(fwd->caps & FWD_CAP_IPV6)) {
+ warn("IPv6 forward, but IPv6 not enabled");
+ return -EINVAL;
+ }
+ }
+ if (new->proto == IPPROTO_TCP) {
+ if (!(fwd->caps & FWD_CAP_TCP)) {
+ warn("Can't add TCP forwarding rule, TCP not enabled");
+ return -EINVAL;
+ }
+ } else if (new->proto == IPPROTO_UDP) {
+ if (!(fwd->caps & FWD_CAP_UDP)) {
+ warn("Can't add UDP forwarding rule, UDP not enabled");
+ return -EINVAL;
+ }
+ } else {
+ warn("Unsupported protocol 0x%hhx (%s) for forwarding rule",
+ new->proto, ipproto_name(new->proto));
+ return -EINVAL;
+ }
+
+ if (fwd->count >= ARRAY_SIZE(fwd->rules)) {
+ warn("Too many rules (maximum %u)", ARRAY_SIZE(fwd->rules));
+ return -ENOSPC;
+ }
+ if ((fwd->sock_count + num) > ARRAY_SIZE(fwd->socks)) {
+ warn("Rules require too many listening sockets (maximum %u)",
+ ARRAY_SIZE(fwd->socks));
+ return -ENOSPC;
+ }
+
+ fwd->rulesocks[fwd->count] = &fwd->socks[fwd->sock_count];
+ for (port = new->first; port <= new->last; port++)
+ fwd->rulesocks[fwd->count][port - new->first] = -1;
+
+ fwd->rules[fwd->count++] = *new;
+ fwd->sock_count += num;
+ return 0;
+}
+
+/**
+ * struct port_range - Represents a non-empty range of ports
+ * @first: First port number in the range
+ * @last: Last port number in the range (inclusive)
+ *
+ * Invariant: @last >= @first
+ */
+struct port_range {
+ in_port_t first, last;
+};
+
+/**
+ * parse_port_range() - Parse a range of port numbers '<first>[-<last>]'
+ * @s: String to parse
+ * @endptr: Update to the character after the parsed range (similar to
+ * strtol() etc.)
+ * @range: Update with the parsed values on success
+ *
+ * Return: -EINVAL on parsing error, -ERANGE on out of range port
+ * numbers, 0 on success
+ */
+static int parse_port_range(const char *s, const char **endptr,
+ struct port_range *range)
+{
+ unsigned long first, last;
+ char *ep;
+
+ last = first = strtoul(s, &ep, 10);
+ if (ep == s) /* Parsed nothing */
+ return -EINVAL;
+ if (*ep == '-') { /* we have a last value too */
+ const char *lasts = ep + 1;
+ last = strtoul(lasts, &ep, 10);
+ if (ep == lasts) /* Parsed nothing */
+ return -EINVAL;
+ }
+
+ if ((last < first) || (last >= NUM_PORTS))
+ return -ERANGE;
+
+ range->first = first;
+ range->last = last;
+ *endptr = ep;
+
+ return 0;
+}
+
+/**
+ * parse_keyword() - Parse a literal keyword
+ * @s: String to parse
+ * @endptr: Update to the character after the keyword
+ * @kw: Keyword to accept
+ *
+ * Return: 0, if @s starts with @kw, -EINVAL if it does not
+ */
+static int parse_keyword(const char *s, const char **endptr, const char *kw)
+{
+ size_t len = strlen(kw);
+
+ if (strlen(s) < len)
+ return -EINVAL;
+
+ if (memcmp(s, kw, len))
+ return -EINVAL;
+
+ *endptr = s + len;
+ return 0;
+}
+
+/**
+ * fwd_rule_range_except() - Set up forwarding for a range of ports minus a
+ * bitmap of exclusions
+ * @fwd: Forwarding table to be updated
+ * @proto: Protocol to forward
+ * @addr: Listening address
+ * @ifname: Listening interface
+ * @first: First port to forward
+ * @last: Last port to forward
+ * @exclude: Bitmap of ports to exclude (may be NULL)
+ * @to: Port to translate @first to when forwarding
+ * @flags: Flags for forwarding entries
+ */
+static void fwd_rule_range_except(struct fwd_table *fwd, uint8_t proto,
+ const union inany_addr *addr,
+ const char *ifname,
+ uint16_t first, uint16_t last,
+ const uint8_t *exclude, uint16_t to,
+ uint8_t flags)
+{
+ struct fwd_rule rule = {
+ .addr = addr ? *addr : inany_any6,
+ .ifname = { 0 },
+ .proto = proto,
+ .flags = flags,
+ };
+ char rulestr[FWD_RULE_STRLEN];
+ unsigned delta = to - first;
+ unsigned base, i;
+
+ if (!addr)
+ rule.flags |= FWD_DUAL_STACK_ANY;
+ if (ifname) {
+ int ret;
+
+ ret = snprintf(rule.ifname, sizeof(rule.ifname),
+ "%s", ifname);
+ if (ret <= 0 || (size_t)ret >= sizeof(rule.ifname))
+ die("Invalid interface name: %s", ifname);
+ }
+
+ for (base = first; base <= last; base++) {
+ if (exclude && bitmap_isset(exclude, base))
+ continue;
+
+ for (i = base; i <= last; i++) {
+ if (exclude && bitmap_isset(exclude, i))
+ break;
+ }
+
+ rule.first = base;
+ rule.last = i - 1;
+ rule.to = base + delta;
+
+ fwd_rule_conflict_check(&rule, fwd->rules, fwd->count);
+ if (fwd_rule_add(fwd, &rule) < 0)
+ goto fail;
+
+ base = i - 1;
+ }
+ return;
+
+fail:
+ die("Unable to add rule %s",
+ fwd_rule_fmt(&rule, rulestr, sizeof(rulestr)));
+}
+
+/*
+ * for_each_chunk - Step through delimited chunks of a string
+ * @p_: Pointer to start of each chunk (updated)
+ * @ep_: Pointer to end of each chunk (updated)
+ * @s_: String to step through
+ * @sep_: String of all allowed delimiters
+ */
+#define for_each_chunk(p_, ep_, s_, sep_) \
+ for ((p_) = (s_); \
+ (ep_) = (p_) + strcspn((p_), (sep_)), *(p_); \
+ (p_) = *(ep_) ? (ep_) + 1 : (ep_))
+
+/**
+ * fwd_rule_parse_ports() - Parse port range(s) specifier
+ * @fwd: Forwarding table to be updated
+ * @proto: Protocol to forward
+ * @addr: Listening address for forwarding
+ * @ifname: Interface name for listening
+ * @spec: Port range(s) specifier
+ */
+static void fwd_rule_parse_ports(struct fwd_table *fwd, uint8_t proto,
+ const union inany_addr *addr,
+ const char *ifname,
+ const char *spec)
+{
+ uint8_t exclude[PORT_BITMAP_SIZE] = { 0 };
+ bool exclude_only = true;
+ const char *p, *ep;
+ uint8_t flags = 0;
+ unsigned i;
+
+ if (!strcmp(spec, "all")) {
+ /* Treat "all" as equivalent to "": all non-ephemeral ports */
+ spec = "";
+ }
+
+ /* Parse excluded ranges and "auto" in the first pass */
+ for_each_chunk(p, ep, spec, ",") {
+ struct port_range xrange;
+
+ if (isdigit(*p)) {
+ /* Include range, parse later */
+ exclude_only = false;
+ continue;
+ }
+
+ if (parse_keyword(p, &p, "auto") == 0) {
+ if (p != ep) /* Garbage after the keyword */
+ goto bad;
+
+ if (!(fwd->caps & FWD_CAP_SCAN)) {
+ die(
+"'auto' port forwarding is only allowed for pasta");
+ }
+
+ flags |= FWD_SCAN;
+ continue;
+ }
+
+ /* Should be an exclude range */
+ if (*p != '~')
+ goto bad;
+ p++;
+
+ if (parse_port_range(p, &p, &xrange))
+ goto bad;
+ if (p != ep) /* Garbage after the range */
+ goto bad;
+
+ for (i = xrange.first; i <= xrange.last; i++)
+ bitmap_set(exclude, i);
+ }
+
+ if (exclude_only) {
+ /* Exclude ephemeral ports */
+ fwd_port_map_ephemeral(exclude);
+
+ fwd_rule_range_except(fwd, proto, addr, ifname,
+ 1, NUM_PORTS - 1, exclude,
+ 1, flags | FWD_WEAK);
+ return;
+ }
+
+ /* Now process base ranges, skipping exclusions */
+ for_each_chunk(p, ep, spec, ",") {
+ struct port_range orig_range, mapped_range;
+
+ if (!isdigit(*p))
+ /* Already parsed */
+ continue;
+
+ if (parse_port_range(p, &p, &orig_range))
+ goto bad;
+
+ if (*p == ':') { /* There's a range to map to as well */
+ if (parse_port_range(p + 1, &p, &mapped_range))
+ goto bad;
+ if ((mapped_range.last - mapped_range.first) !=
+ (orig_range.last - orig_range.first))
+ goto bad;
+ } else {
+ mapped_range = orig_range;
+ }
+
+ if (p != ep) /* Garbage after the ranges */
+ goto bad;
+
+ fwd_rule_range_except(fwd, proto, addr, ifname,
+ orig_range.first, orig_range.last,
+ exclude,
+ mapped_range.first, flags);
+ }
+
+ return;
+bad:
+ die("Invalid port specifier '%s'", spec);
+}
+
+/**
+ * fwd_rule_parse() - Parse port configuration option
+ * @optname: Short option name, t, T, u, or U
+ * @optarg: Option argument (port specification)
+ * @fwd: Forwarding table to be updated
+ */
+void fwd_rule_parse(char optname, const char *optarg, struct fwd_table *fwd)
+{
+ union inany_addr addr_buf = inany_any6, *addr = &addr_buf;
+ char buf[BUFSIZ], *spec, *ifname = NULL;
+ uint8_t proto;
+
+ if (optname == 't' || optname == 'T')
+ proto = IPPROTO_TCP;
+ else if (optname == 'u' || optname == 'U')
+ proto = IPPROTO_UDP;
+ else
+ assert(0);
+
+ if (!strcmp(optarg, "none")) {
+ unsigned i;
+
+ for (i = 0; i < fwd->count; i++) {
+ if (fwd->rules[i].proto == proto) {
+ die("-%c none conflicts with previous options",
+ optname);
+ }
+ }
+ return;
+ }
+
+ strncpy(buf, optarg, sizeof(buf) - 1);
+
+ if ((spec = strchr(buf, '/'))) {
+ *spec = 0;
+ spec++;
+
+ if (optname != 't' && optname != 'u')
+ die("Listening address not allowed for -%c %s",
+ optname, optarg);
+
+ if ((ifname = strchr(buf, '%'))) {
+ *ifname = 0;
+ ifname++;
+
+ /* spec is already advanced one past the '/',
+ * so the length of the given ifname is:
+ * (spec - ifname - 1)
+ */
+ if (spec - ifname - 1 >= IFNAMSIZ) {
+ die("Interface name '%s' is too long (max %u)",
+ ifname, IFNAMSIZ - 1);
+ }
+ }
+
+ if (ifname == buf + 1) { /* Interface without address */
+ addr = NULL;
+ } else {
+ char *p = buf;
+
+ /* Allow square brackets for IPv4 too for convenience */
+ if (*p == '[' && p[strlen(p) - 1] == ']') {
+ p[strlen(p) - 1] = '\0';
+ p++;
+ }
+
+ if (!inany_pton(p, addr))
+ die("Bad forwarding address '%s'", p);
+ }
+ } else {
+ spec = buf;
+
+ addr = NULL;
+ }
+
+ if (optname == 'T' || optname == 'U') {
+ assert(!addr && !ifname);
+
+ if (!(fwd->caps & FWD_CAP_IFNAME)) {
+ warn(
+"SO_BINDTODEVICE unavailable, forwarding only 127.0.0.1 and ::1 for '-%c %s'",
+ optname, optarg);
+
+ if (fwd->caps & FWD_CAP_IPV4) {
+ fwd_rule_parse_ports(fwd, proto,
+ &inany_loopback4, NULL,
+ spec);
+ }
+ if (fwd->caps & FWD_CAP_IPV6) {
+ fwd_rule_parse_ports(fwd, proto,
+ &inany_loopback6, NULL,
+ spec);
+ }
+ return;
+ }
+
+ ifname = "lo";
+ }
+
+ if (ifname && !(fwd->caps & FWD_CAP_IFNAME)) {
+ die(
+"Device binding for '-%c %s' unsupported (requires kernel 5.7+)",
+ optname, optarg);
+ }
+
+ fwd_rule_parse_ports(fwd, proto, addr, ifname, spec);
+}
diff --git a/fwd_rule.h b/fwd_rule.h
index 5c7b67aa..f0f4efda 100644
--- a/fwd_rule.h
+++ b/fwd_rule.h
@@ -19,6 +19,7 @@
/* Number of ports for both TCP and UDP */
#define NUM_PORTS (1U << 16)
+#define PORT_BITMAP_SIZE DIV_ROUND_UP(NUM_PORTS, 8)
/* Forwarding capability bits */
#define FWD_CAP_IPV4 BIT(0)
@@ -54,8 +55,38 @@ struct fwd_rule {
uint8_t flags;
};
+#define FWD_RULE_BITS 8
+#define MAX_FWD_RULES MAX_FROM_BITS(FWD_RULE_BITS)
+
+/* Maximum number of listening sockets (per pif)
+ *
+ * Rationale: This lets us listen on every port for two addresses and two
+ * protocols (which we need for -T auto -U auto without SO_BINDTODEVICE), plus a
+ * comfortable number of extras.
+ */
+#define MAX_LISTEN_SOCKS (NUM_PORTS * 5)
+
+/**
+ * struct fwd_table - Forwarding state (per initiating pif)
+ * @caps: Forwarding capabilities for this initiating pif
+ * @count: Number of forwarding rules
+ * @rules: Array of forwarding rules
+ * @rulesocks: Parallel array of @rules (@count valid entries) of pointers to
+ * @socks entries giving the start of the corresponding rule's
+ * sockets within the larger array
+ * @sock_count: Number of entries used in @socks (for all rules combined)
+ * @socks: Listening sockets for forwarding
+ */
+struct fwd_table {
+ uint32_t caps;
+ unsigned count;
+ struct fwd_rule rules[MAX_FWD_RULES];
+ int *rulesocks[MAX_FWD_RULES];
+ unsigned sock_count;
+ int socks[MAX_LISTEN_SOCKS];
+};
+
void fwd_probe_ephemeral(void);
-void fwd_port_map_ephemeral(uint8_t *map);
#define FWD_RULE_STRLEN \
(IPPROTO_STRLEN - 1 \
@@ -67,7 +98,6 @@ void fwd_port_map_ephemeral(uint8_t *map);
const union inany_addr *fwd_rule_addr(const struct fwd_rule *rule);
const char *fwd_rule_fmt(const struct fwd_rule *rule, char *dst, size_t size);
void fwd_rules_info(const struct fwd_rule *rules, size_t count);
-void fwd_rule_conflict_check(const struct fwd_rule *new,
- const struct fwd_rule *rules, size_t count);
+void fwd_rule_parse(char optname, const char *optarg, struct fwd_table *fwd);
#endif /* FWD_RULE_H */
--
2.53.0
3
3