On Tue, Sep 09, 2025 at 09:59:13PM -0400, Jon Maloy wrote:
We add a cache table to keep partial contents of the kernel ARP/NDP tables. This way, we drastically reduce the number of netlink calls to read those tables.
We create undefined cache entries representing non- or not-yet- existing ARP/NDP entries when needed. We add a short expiration time to each such entry, so that we can know when to make repeated calls to the kernel tables in the beginning. We also add an access counter to the entries, to ensure that the timer becomes longer and the call frequency abates over time if no ARP/NDP entry shows up.
For regular entries we use a much longer timer, with the purpose to update the entry in the rare case that a remote host changes its MAC address.
I don't think the description of expiry behaviour is correct any more
Signed-off-by: Jon Maloy
--- v5: - Moved to earlier in series to reduce rebase conflicts v6: - Sqashed the hash list commit and the FIFO/LRU queue commit - Removed hash lookup. We now only use linear lookup in a linked list - Eliminated dynamic memory allocation. - Ensured there is only one call to clock_gettime() - Using MAC_ZERO instead of the previously dedicated definitions
The last point doesn't appear to be correct...
--- conf.c | 1 + fwd.c | 176 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ fwd.h | 4 ++ util.c | 12 ++++ util.h | 1 + 5 files changed, 194 insertions(+)
diff --git a/conf.c b/conf.c index f47f48e..27b04d3 100644 --- a/conf.c +++ b/conf.c @@ -2122,6 +2122,7 @@ void conf(struct ctx *c, int argc, char **argv) c->udp.fwd_out.mode = fwd_default;
fwd_scan_ports_init(c); + fwd_mac_cache_init();
if (!c->quiet) conf_print(c); diff --git a/fwd.c b/fwd.c index 250cf56..c58ba9f 100644 --- a/fwd.c +++ b/fwd.c @@ -19,6 +19,8 @@ #include
#include #include +#include +#include #include "util.h" #include "ip.h" @@ -26,6 +28,8 @@ #include "passt.h" #include "lineread.h" #include "flow_table.h" +#include "inany.h" +#include "netlink.h"
/* Empheral port range: values from RFC 6335 */ static in_port_t fwd_ephemeral_min = (1 << 15) + (1 << 14); @@ -33,6 +37,178 @@ static in_port_t fwd_ephemeral_max = NUM_PORTS - 1;
#define PORT_RANGE_SYSCTL "/proc/sys/net/ipv4/ip_local_port_range"
+#define MAC_CACHE_SIZE 128 +#define MAC_CACHE_RENEWAL 3600 /* Refresh entry from ARP/NDP every hour */ + +/* Partial cache of ARP/NDP table contents */ +struct mac_cache_entry { + struct mac_cache_entry *prev; + struct mac_cache_entry *next; + struct timespec expiry; + union inany_addr addr; + uint8_t mac[ETH_ALEN]; + uint16_t access_count; +}; + +struct mac_cache_table { + struct mac_cache_entry mac_table[MAC_CACHE_SIZE]; + + /* FIFO/LRU queue */ + struct mac_cache_entry *head; + struct mac_cache_entry *tail; +}; + +static struct mac_cache_table mac_cache; +const uint8_t undefined_mac[6] = {0, 0, 0, 0, 0, 0};
...this is still here, and I don't see MAC_ZERO used anywhere.
+/** + * fwd_mac_cache_unlink() - Unlink entry from LRU queue + */ +static void fwd_mac_cache_unlink(struct mac_cache_entry *e) +{ + struct mac_cache_table *t = &mac_cache; + + if (e->prev) + e->prev->next = e->next; + else + t->head = e->next; + + if (e->next) + e->next->prev = e->prev; + else + t->tail = e->prev; + + e->prev = e->next = NULL; +} + +/** + * fwd_mac_cache_append_tail() - Add entry to tail of LRU queue + */ +static void fwd_mac_cache_append_to_tail(struct mac_cache_entry *e) +{ + struct mac_cache_table *t = &mac_cache; + + e->next = NULL; + e->prev = t->tail; + t->tail->next = e; + t->tail = e; +} + +/** + * fwd_mac_cache_move_to_tail() - Move entry to tail of LRU queue + */ +static void fwd_mac_cache_move_to_tail(struct mac_cache_entry *e) +{ + struct mac_cache_table *t = &mac_cache; + + if (t->tail == e) + return; + + fwd_mac_cache_unlink(e); + fwd_mac_cache_append_to_tail(e); +} + +/** + * mac_entry_set_expiry() - Set the time for a cache entry to expire + * @e: Cache entry + * @expiry: Expiration time, in seconds from current moment. + */ +static void mac_entry_set_expiry(struct mac_cache_entry *e, struct timespec *now, int expiry) +{ + e->expiry = *now; + e->expiry.tv_sec += expiry; +} + +/** + * fwd_mac_cache_find() - Find an entry in the ARP/NDP cache table + * @addr: IPv4 or IPv6 address, used as key for the lookup + * + * Return: Pointer to the entry on success, NULL on failure. + */ +static struct mac_cache_entry *fwd_mac_cache_find(const union inany_addr *addr) +{ + const struct mac_cache_table *t = &mac_cache; + struct mac_cache_entry *e = t->tail; + + for (e = t->tail; e; e = e->prev) { + if (inany_equals(&e->addr, addr)) + return e; + } + return NULL; +} + +/** + * fwd_neigh_mac_get() - Find a MAC address the ARP/NDP cache table
This is missing an "in" at least, and doesn't really distinguish this from fwd_mac_cache_find().
+ * @c: Execution context + * @addr: IPv4 or IPv6 address + * @ifi: Interface index + * @mac: Buffer for Ethernet MAC to return, found or default. + * + * Return: true if real MAC found, false if not found or failure + */ +bool fwd_neigh_mac_get(const struct ctx *c, const union inany_addr *addr, + uint8_t *mac) +{ + struct mac_cache_entry *e = fwd_mac_cache_find(addr); + int ifi = inany_v4(addr) ? c->ifi4 : c->ifi6; + bool refresh = false; + bool found = false; + struct timespec now; + + clock_gettime(CLOCK_MONOTONIC, &now); + + if (e) { + refresh = timespec_before(&e->expiry, &now); + } else { + /* Recycle least recently used entry */ + e = mac_cache.head; + e->addr = *addr; + memcpy(e->mac, undefined_mac, ETH_ALEN);
MAC_ZERO.
+ e->access_count = 0; + refresh = true; + } + + if (!refresh) { + found = !MAC_IS_ZERO(e->mac); + } else { + found = nl_neigh_mac_get(nl_sock, addr, ifi, e->mac); + mac_entry_set_expiry(e, &now, MAC_CACHE_RENEWAL); + } + + if (found) { + memcpy(mac, e->mac, ETH_ALEN); + } else { + /* Do linear back-off from new netlink calls if nothing found */ + mac_entry_set_expiry(e, &now, e->access_count++);
Do we need this? Particularly now we're treating negative and positive entries the same for initial expiry.
+ memcpy(mac, c->our_tap_mac, ETH_ALEN); + } + + /* Set to most recently used */ + fwd_mac_cache_move_to_tail(e); + + return found; +} + +/** + * fwd_mac_cache_init() - Initiate ARP/NDP cache table + */ +void fwd_mac_cache_init(void) +{ + struct mac_cache_table *t = &mac_cache; + struct mac_cache_entry *e; + int i; + + memset(t, 0, sizeof(*t)); + + for (i = 0; i < MAC_CACHE_SIZE; i++) { + e = &t->mac_table[i]; + e->prev = (i == 0) ? NULL : &t->mac_table[i - 1]; + e->next = (i < (MAC_CACHE_SIZE - 1)) ? &t->mac_table[i + 1] : NULL; + } + t->head = &t->mac_table[0]; + t->tail = &t->mac_table[MAC_CACHE_SIZE - 1]; +} + /** fwd_probe_ephemeral() - Determine what ports this host considers ephemeral * * Work out what ports the host thinks are emphemeral and record it for later diff --git a/fwd.h b/fwd.h index 65c7c96..728601f 100644 --- a/fwd.h +++ b/fwd.h @@ -57,4 +57,8 @@ uint8_t fwd_nat_from_splice(const struct ctx *c, uint8_t proto, uint8_t fwd_nat_from_host(const struct ctx *c, uint8_t proto, const struct flowside *ini, struct flowside *tgt);
+bool fwd_neigh_mac_get(const struct ctx *c, const union inany_addr *addr, + uint8_t *mac); +void fwd_mac_cache_init(void); + #endif /* FWD_H */ diff --git a/util.c b/util.c index c492f90..8a8846c 100644 --- a/util.c +++ b/util.c @@ -305,6 +305,18 @@ long timespec_diff_ms(const struct timespec *a, const struct timespec *b) return timespec_diff_us(a, b) / 1000; }
+/** + * timespec_before() - Check the relation between two points in time + * @a: Point in time to be tested + * @b: Point in time test a against + * Return: True if a comes before b, otherwise b + */ +bool timespec_before(const struct timespec *a, const struct timespec *b) +{ + return (a->tv_sec < b->tv_sec) || + (a->tv_sec == b->tv_sec && a->tv_nsec < b->tv_nsec); +} + /** * bitmap_set() - Set single bit in bitmap * @map: Pointer to bitmap diff --git a/util.h b/util.h index 2a8c38f..5ec3f22 100644 --- a/util.h +++ b/util.h @@ -207,6 +207,7 @@ int sock_unix(char *sock_path); void sock_probe_mem(struct ctx *c); long timespec_diff_ms(const struct timespec *a, const struct timespec *b); int64_t timespec_diff_us(const struct timespec *a, const struct timespec *b); +bool timespec_before(const struct timespec *a, const struct timespec *b); void bitmap_set(uint8_t *map, unsigned bit); void bitmap_clear(uint8_t *map, unsigned bit); bool bitmap_isset(const uint8_t *map, unsigned bit); -- 2.50.1
-- David Gibson (he or they) | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you, not the other way | around. http://www.ozlabs.org/~dgibson