[PATCH v2 0/4] Add vhost-user support to passt. (part 3)

newer
[PATCH 0/3] Assorted fixes related...

older
[PATCH] test: Speed up by cutting...

Laurent Vivier

12 Jul 2024 12 Jul '24

5:32 p.m.

This series of patches adds vhost-user support to passt and then allows passt to connect to QEMU network backend using virtqueue rather than a socket. With QEMU, rather than using to connect: -netdev stream,id=s,server=off,addr.type=unix,addr.path=/tmp/passt_1.socket we will use: -chardev socket,id=chr0,path=/tmp/passt_1.socket -netdev vhost-user,id=netdev0,chardev=chr0 -device virtio-net,netdev=netdev0 -object memory-backend-memfd,id=memfd0,share=on,size=$RAMSIZE -numa node,memdev=memfd0 The memory backend is needed to share data between passt and QEMU. Performance comparison between "-netdev stream" and "-netdev vhost-user": $ iperf3 -c localhost -p 10001 -t 60 -6 -u -b 50G socket: [ 5] 0.00-60.05 sec 95.6 GBytes 13.7 Gbits/sec 0.017 ms 6998988/10132413 (69%) receiver vhost-user: [ 5] 0.00-60.04 sec 237 GBytes 33.9 Gbits/sec 0.006 ms 53673/7813770 (0.69%) receiver $ iperf3 -c localhost -p 10001 -t 60 -4 -u -b 50G socket: [ 5] 0.00-60.05 sec 98.9 GBytes 14.1 Gbits/sec 0.018 ms 6260735/9501832 (66%) receiver vhost-user: [ 5] 0.00-60.05 sec 235 GBytes 33.7 Gbits/sec 0.008 ms 37581/7752699 (0.48%) receiver $ iperf3 -c localhost -p 10001 -t 60 -6 socket: [ 5] 0.00-60.00 sec 17.3 GBytes 2.48 Gbits/sec 0 sender [ 5] 0.00-60.06 sec 17.3 GBytes 2.48 Gbits/sec receiver vhost-user: [ 5] 0.00-60.00 sec 191 GBytes 27.4 Gbits/sec 0 sender [ 5] 0.00-60.05 sec 191 GBytes 27.3 Gbits/sec receiver $ iperf3 -c localhost -p 10001 -t 60 -4 socket: [ 5] 0.00-60.00 sec 15.6 GBytes 2.24 Gbits/sec 0 sender [ 5] 0.00-60.06 sec 15.6 GBytes 2.24 Gbits/sec receiver vhost-user: [ 5] 0.00-60.00 sec 189 GBytes 27.1 Gbits/sec 0 sender [ 5] 0.00-60.04 sec 189 GBytes 27.0 Gbits/sec receiver v2: - remove PATCH 4 - rewrite PATCH 2 and 3 to follow passt coding style - move some code from PATCH 3 to PATCH 4 (previously PATCH 5) - partially addressed David's comment on PATCH 5 Laurent Vivier (4): packet: replace struct desc by struct iovec vhost-user: introduce virtio API vhost-user: introduce vhost-user API vhost-user: add vhost-user Makefile | 4 +- checksum.c | 1 - conf.c | 24 +- iov.c | 1 - isolation.c | 15 +- packet.c | 97 ++-- packet.h | 16 +- passt.c | 16 +- passt.h | 10 + pcap.c | 1 - tap.c | 114 ++++- tap.h | 5 +- tcp.c | 17 +- tcp_vu.c | 560 +++++++++++++++++++++ tcp_vu.h | 12 + udp.c | 54 +- udp_internal.h | 39 ++ udp_vu.c | 240 +++++++++ udp_vu.h | 11 + util.h | 11 + vhost_user.c | 1273 ++++++++++++++++++++++++++++++++++++++++++++++++ vhost_user.h | 197 ++++++++ virtio.c | 605 +++++++++++++++++++++++ virtio.h | 190 ++++++++ 24 files changed, 3392 insertions(+), 121 deletions(-) create mode 100644 tcp_vu.c create mode 100644 tcp_vu.h create mode 100644 udp_internal.h create mode 100644 udp_vu.c create mode 100644 udp_vu.h create mode 100644 vhost_user.c create mode 100644 vhost_user.h create mode 100644 virtio.c create mode 100644 virtio.h -- 2.45.2

Show replies by date

Laurent Vivier

12 Jul 12 Jul

5:32 p.m.

New subject: [PATCH v2 1/4] packet: replace struct desc by struct iovec

To be able to manage buffers inside a shared memory provided by a VM via a vhost-user interface, we cannot rely on the fact that buffers are located in a pre-defined memory area and use a base address and a 32bit offset to address them. We need a 64bit address, so replace struct desc by struct iovec and update range checking. Signed-off-by: Laurent Vivier --- packet.c | 84 +++++++++++++++++++++++++++++++------------------------- packet.h | 14 ++-------- 2 files changed, 49 insertions(+), 49 deletions(-) diff --git a/packet.c b/packet.c index ccfc84607709..f7bb523c4ffa 100644 --- a/packet.c +++ b/packet.c @@ -22,6 +22,39 @@ #include "util.h" #include "log.h" +/** + * packet_check_range() - Check if a packet memory range is valid + * @p: Packet pool + * @offset: Offset of data range in packet descriptor + * @len: Length of desired data range + * @start: Start of the packet descriptor + * @func: For tracing: name of calling function, NULL means no trace() + * @line: For tracing: caller line of function call + * + * Return: 0 if the range is valid, -1 otherwise + */ +static int packet_check_range(const struct pool *p, size_t offset, size_t len, + const char *start, const char *func, int line) +{ + if (start < p->buf) { + if (func) { + trace("add packet start %p before buffer start %p, " + "%s:%i", (void *)start, (void *)p->buf, func, line); + } + return -1; + } + + if (start + len + offset > p->buf + p->buf_size) { + if (func) { + trace("packet offset plus length %lu from size %lu, " + "%s:%i", start - p->buf + len + offset, + p->buf_size, func, line); + } + return -1; + } + + return 0; +} /** * packet_add_do() - Add data as packet descriptor to given pool * @p: Existing pool @@ -41,34 +74,16 @@ void packet_add_do(struct pool *p, size_t len, const char *start, return; } - if (start < p->buf) { - trace("add packet start %p before buffer start %p, %s:%i", - (void *)start, (void *)p->buf, func, line); + if (packet_check_range(p, 0, len, start, func, line)) return; - } - - if (start + len > p->buf + p->buf_size) { - trace("add packet start %p, length: %zu, buffer end %p, %s:%i", - (void *)start, len, (void *)(p->buf + p->buf_size), - func, line); - return; - } if (len > UINT16_MAX) { trace("add packet length %zu, %s:%i", len, func, line); return; } -#if UINTPTR_MAX == UINT64_MAX - if ((uintptr_t)start - (uintptr_t)p->buf > UINT32_MAX) { - trace("add packet start %p, buffer start %p, %s:%i", - (void *)start, (void *)p->buf, func, line); - return; - } -#endif - - p->pkt[idx].offset = start - p->buf; - p->pkt[idx].len = len; + p->pkt[idx].iov_base = (void *)start; + p->pkt[idx].iov_len = len; p->count++; } @@ -96,36 +111,31 @@ void *packet_get_do(const struct pool *p, size_t idx, size_t offset, return NULL; } - if (len > UINT16_MAX || len + offset > UINT32_MAX) { + if (len > UINT16_MAX) { if (func) { - trace("packet data length %zu, offset %zu, %s:%i", - len, offset, func, line); + trace("packet data length %zu, %s:%i", + len, func, line); } return NULL; } - if (p->pkt[idx].offset + len + offset > p->buf_size) { + if (len + offset > p->pkt[idx].iov_len) { if (func) { - trace("packet offset plus length %zu from size %zu, " - "%s:%i", p->pkt[idx].offset + len + offset, - p->buf_size, func, line); + trace("data length %zu, offset %zu from length %zu, " + "%s:%i", len, offset, p->pkt[idx].iov_len, + func, line); } return NULL; } - if (len + offset > p->pkt[idx].len) { - if (func) { - trace("data length %zu, offset %zu from length %u, " - "%s:%i", len, offset, p->pkt[idx].len, - func, line); - } + if (packet_check_range(p, offset, len, p->pkt[idx].iov_base, + func, line)) return NULL; - } if (left) - *left = p->pkt[idx].len - offset - len; + *left = p->pkt[idx].iov_len - offset - len; - return p->buf + p->pkt[idx].offset + offset; + return (char *)p->pkt[idx].iov_base + offset; } /** diff --git a/packet.h b/packet.h index a784b07bbed5..8377dcf678bb 100644 --- a/packet.h +++ b/packet.h @@ -6,16 +6,6 @@ #ifndef PACKET_H #define PACKET_H -/** - * struct desc - Generic offset-based descriptor within buffer - * @offset: Offset of descriptor relative to buffer start, 32-bit limit - * @len: Length of descriptor, host order, 16-bit limit - */ -struct desc { - uint32_t offset; - uint16_t len; -}; - /** * struct pool - Generic pool of packets stored in a buffer * @buf: Buffer storing packet descriptors @@ -29,7 +19,7 @@ struct pool { size_t buf_size; size_t size; size_t count; - struct desc pkt[1]; + struct iovec pkt[1]; }; void packet_add_do(struct pool *p, size_t len, const char *start, @@ -54,7 +44,7 @@ struct _name ## _t { \ size_t buf_size; \ size_t size; \ size_t count; \ - struct desc pkt[_size]; \ + struct iovec pkt[_size]; \ } #define PACKET_POOL_INIT_NOCAST(_size, _buf, _buf_size) \ -- 2.45.2

David Gibson

15 Jul 15 Jul

6:59 a.m.

New subject: [PATCH v2 1/4] packet: replace struct desc by struct iovec

On Fri, Jul 12, 2024 at 05:32:41PM +0200, Laurent Vivier wrote:

...

To be able to manage buffers inside a shared memory provided by a VM via a vhost-user interface, we cannot rely on the fact that buffers are located in a pre-defined memory area and use a base address and a 32bit offset to address them.

We need a 64bit address, so replace struct desc by struct iovec and update range checking.

Signed-off-by: Laurent Vivier --- packet.c | 84 +++++++++++++++++++++++++++++++------------------------- packet.h | 14 ++-------- 2 files changed, 49 insertions(+), 49 deletions(-)

diff --git a/packet.c b/packet.c index ccfc84607709..f7bb523c4ffa 100644 --- a/packet.c +++ b/packet.c @@ -22,6 +22,39 @@ #include "util.h" #include "log.h"

+/** + * packet_check_range() - Check if a packet memory range is valid + * @p: Packet pool + * @offset: Offset of data range in packet descriptor + * @len: Length of desired data range + * @start: Start of the packet descriptor + * @func: For tracing: name of calling function, NULL means no trace() + * @line: For tracing: caller line of function call + * + * Return: 0 if the range is valid, -1 otherwise + */ +static int packet_check_range(const struct pool *p, size_t offset, size_t len, + const char *start, const char *func, int line) +{ + if (start < p->buf) { + if (func) {

Omitting the message entirely if func is not set doesn't seem correct. I believe printf() should format NULL pointers sanely (typically as "<null>"), so I think you can just leave out this check.

...

+ trace("add packet start %p before buffer start %p, " + "%s:%i", (void *)start, (void *)p->buf, func, line); + } + return -1; + } + + if (start + len + offset > p->buf + p->buf_size) {

It's not really clear to me why offset is needed in here. AIUI, offset is used when we want to talk about some piece of a larger packet/frame that's in the buffer. That's useful when we're dissecting packets, but surely we always want the whole frame/whatever to be within the buffer, so I don't know we need the extra complexity in this helper. I also think we should check for overflow on the LHS here, but that's pre-existing, so it doesn't need to go in this patch.

...

+ if (func) { + trace("packet offset plus length %lu from size %lu, " + "%s:%i", start - p->buf + len + offset, + p->buf_size, func, line); + } + return -1; + } + + return 0; +} /** * packet_add_do() - Add data as packet descriptor to given pool * @p: Existing pool @@ -41,34 +74,16 @@ void packet_add_do(struct pool *p, size_t len, const char *start, return; }

- if (start < p->buf) { - trace("add packet start %p before buffer start %p, %s:%i", - (void *)start, (void *)p->buf, func, line); + if (packet_check_range(p, 0, len, start, func, line)) return; - } - - if (start + len > p->buf + p->buf_size) { - trace("add packet start %p, length: %zu, buffer end %p, %s:%i", - (void *)start, len, (void *)(p->buf + p->buf_size), - func, line); - return; - }

if (len > UINT16_MAX) { trace("add packet length %zu, %s:%i", len, func, line); return; }

-#if UINTPTR_MAX == UINT64_MAX - if ((uintptr_t)start - (uintptr_t)p->buf > UINT32_MAX) { - trace("add packet start %p, buffer start %p, %s:%i", - (void *)start, (void *)p->buf, func, line); - return; - } -#endif - - p->pkt[idx].offset = start - p->buf; - p->pkt[idx].len = len; + p->pkt[idx].iov_base = (void *)start; + p->pkt[idx].iov_len = len;

p->count++; } @@ -96,36 +111,31 @@ void *packet_get_do(const struct pool *p, size_t idx, size_t offset, return NULL; }

- if (len > UINT16_MAX || len + offset > UINT32_MAX) { + if (len > UINT16_MAX) { if (func) { - trace("packet data length %zu, offset %zu, %s:%i", - len, offset, func, line); + trace("packet data length %zu, %s:%i", + len, func, line);

Should this be an assert? Seems like something is wrong in the caller, if they're trying to pass in a ludicrously long packet.

...

} return NULL; }

- if (p->pkt[idx].offset + len + offset > p->buf_size) { + if (len + offset > p->pkt[idx].iov_len) { if (func) { - trace("packet offset plus length %zu from size %zu, " - "%s:%i", p->pkt[idx].offset + len + offset, - p->buf_size, func, line); + trace("data length %zu, offset %zu from length %zu, " + "%s:%i", len, offset, p->pkt[idx].iov_len, + func, line); } return NULL; }

- if (len + offset > p->pkt[idx].len) { - if (func) { - trace("data length %zu, offset %zu from length %u, " - "%s:%i", len, offset, p->pkt[idx].len, - func, line); - } + if (packet_check_range(p, offset, len, p->pkt[idx].iov_base, + func, line)) return NULL; - }

if (left) - *left = p->pkt[idx].len - offset - len; + *left = p->pkt[idx].iov_len - offset - len;

- return p->buf + p->pkt[idx].offset + offset; + return (char *)p->pkt[idx].iov_base + offset; }

/** diff --git a/packet.h b/packet.h index a784b07bbed5..8377dcf678bb 100644 --- a/packet.h +++ b/packet.h @@ -6,16 +6,6 @@ #ifndef PACKET_H #define PACKET_H

-/** - * struct desc - Generic offset-based descriptor within buffer - * @offset: Offset of descriptor relative to buffer start, 32-bit limit - * @len: Length of descriptor, host order, 16-bit limit - */ -struct desc { - uint32_t offset; - uint16_t len; -}; - /** * struct pool - Generic pool of packets stored in a buffer * @buf: Buffer storing packet descriptors @@ -29,7 +19,7 @@ struct pool { size_t buf_size; size_t size; size_t count; - struct desc pkt[1]; + struct iovec pkt[1]; };

void packet_add_do(struct pool *p, size_t len, const char *start, @@ -54,7 +44,7 @@ struct _name ## _t { \ size_t buf_size; \ size_t size; \ size_t count; \ - struct desc pkt[_size]; \ + struct iovec pkt[_size]; \ }

#define PACKET_POOL_INIT_NOCAST(_size, _buf, _buf_size) \

-- David Gibson (he or they) | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you, not the other way | around. http://www.ozlabs.org/~dgibson

Stefano Brivio

19 Jul 19 Jul

11:28 p.m.

New subject: [PATCH v2 1/4] packet: replace struct desc by struct iovec

On Mon, 15 Jul 2024 14:59:42 +1000 David Gibson wrote:

...

On Fri, Jul 12, 2024 at 05:32:41PM +0200, Laurent Vivier wrote:

...
To be able to manage buffers inside a shared memory provided by a VM via a vhost-user interface, we cannot rely on the fact that buffers are located in a pre-defined memory area and use a base address and a 32bit offset to address them.

We need a 64bit address, so replace struct desc by struct iovec and update range checking.

Signed-off-by: Laurent Vivier --- packet.c | 84 +++++++++++++++++++++++++++++++------------------------- packet.h | 14 ++-------- 2 files changed, 49 insertions(+), 49 deletions(-)

diff --git a/packet.c b/packet.c index ccfc84607709..f7bb523c4ffa 100644 --- a/packet.c +++ b/packet.c @@ -22,6 +22,39 @@ #include "util.h" #include "log.h"

+/** + * packet_check_range() - Check if a packet memory range is valid + * @p: Packet pool + * @offset: Offset of data range in packet descriptor + * @len: Length of desired data range + * @start: Start of the packet descriptor + * @func: For tracing: name of calling function, NULL means no trace() + * @line: For tracing: caller line of function call + * + * Return: 0 if the range is valid, -1 otherwise + */ +static int packet_check_range(const struct pool *p, size_t offset, size_t len, + const char *start, const char *func, int line) +{ + if (start < p->buf) { + if (func) {

Omitting the message entirely if func is not set doesn't seem correct. I believe printf() should format NULL pointers sanely (typically as "<null>"), so I think you can just leave out this check.

That intention is actually pre-existing: look at the function comment (coming from packet_add_do()). Originally, I wanted to implement --trace like that: if no function name was given, no messages would be printed. Then I realised it wasn't really practical and changed to a static logging flag, but I still accidentally left this in commit bb708111833e ("treewide: Packet abstraction with mandatory boundary checks"). Anyway, yes, func is always passed, so there's no need for this check (and sure, there would be no _need_ anyway). We just need to fix the function comments.

...

...
+ trace("add packet start %p before buffer start %p, "

It's not "add" if it's called from packet_get_do(). As we print the function name anyway, we could drop "add " from this altogether, it should be clear enough.

...

...
+ "%s:%i", (void *)start, (void *)p->buf, func, line); + } + return -1; + } + + if (start + len + offset > p->buf + p->buf_size) {

It's not really clear to me why offset is needed in here. AIUI, offset is used when we want to talk about some piece of a larger packet/frame that's in the buffer. That's useful when we're dissecting packets,

...and that's packet_get_do()'s usage, passing a non-zero offset here (stricter check anyway), while:

...

but surely we always want the whole frame/whatever to be within the buffer,

packet_add_do() calls this with a zero offset, because the whole packet should fit. That is, this check replaces: if (start + len > p->buf + p->buf_size) { from packet_add_do(), and: if (p->pkt[idx].offset + len + offset > p->buf_size) { from packet_get_do(). It looks equivalent to me.

...

so I don't know we need the extra complexity in this helper.

I also think we should check for overflow on the LHS here, but that's pre-existing, so it doesn't need to go in this patch.

...
+ if (func) { + trace("packet offset plus length %lu from size %lu, " + "%s:%i", start - p->buf + len + offset, + p->buf_size, func, line); + } + return -1; + } + + return 0; +} /** * packet_add_do() - Add data as packet descriptor to given pool * @p: Existing pool @@ -41,34 +74,16 @@ void packet_add_do(struct pool *p, size_t len, const char *start, return; }

- if (start < p->buf) { - trace("add packet start %p before buffer start %p, %s:%i", - (void *)start, (void *)p->buf, func, line); + if (packet_check_range(p, 0, len, start, func, line)) return; - } - - if (start + len > p->buf + p->buf_size) { - trace("add packet start %p, length: %zu, buffer end %p, %s:%i", - (void *)start, len, (void *)(p->buf + p->buf_size), - func, line); - return; - }

if (len > UINT16_MAX) { trace("add packet length %zu, %s:%i", len, func, line); return; }

-#if UINTPTR_MAX == UINT64_MAX - if ((uintptr_t)start - (uintptr_t)p->buf > UINT32_MAX) { - trace("add packet start %p, buffer start %p, %s:%i", - (void *)start, (void *)p->buf, func, line); - return; - } -#endif - - p->pkt[idx].offset = start - p->buf; - p->pkt[idx].len = len; + p->pkt[idx].iov_base = (void *)start; + p->pkt[idx].iov_len = len;

p->count++; } @@ -96,36 +111,31 @@ void *packet_get_do(const struct pool *p, size_t idx, size_t offset, return NULL; }

- if (len > UINT16_MAX || len + offset > UINT32_MAX) { + if (len > UINT16_MAX) { if (func) { - trace("packet data length %zu, offset %zu, %s:%i", - len, offset, func, line); + trace("packet data length %zu, %s:%i", + len, func, line);

Should this be an assert? Seems like something is wrong in the caller, if they're trying to pass in a ludicrously long packet.

Maybe something is wrong in the caller, but these are sanity checks for security's sake, so if somebody finds out how to reach here with a ludicrously long packet, I think it's preferable to discard the packet rather than crashing and turning whatever issue into a vulnerability.

...

...
} return NULL; }

- if (p->pkt[idx].offset + len + offset > p->buf_size) { + if (len + offset > p->pkt[idx].iov_len) { if (func) { - trace("packet offset plus length %zu from size %zu, " - "%s:%i", p->pkt[idx].offset + len + offset, - p->buf_size, func, line); + trace("data length %zu, offset %zu from length %zu, " + "%s:%i", len, offset, p->pkt[idx].iov_len, + func, line); } return NULL; }

- if (len + offset > p->pkt[idx].len) { - if (func) { - trace("data length %zu, offset %zu from length %u, " - "%s:%i", len, offset, p->pkt[idx].len, - func, line); - } + if (packet_check_range(p, offset, len, p->pkt[idx].iov_base, + func, line)) return NULL; - }

if (left) - *left = p->pkt[idx].len - offset - len; + *left = p->pkt[idx].iov_len - offset - len;

- return p->buf + p->pkt[idx].offset + offset; + return (char *)p->pkt[idx].iov_base + offset; }

/** diff --git a/packet.h b/packet.h index a784b07bbed5..8377dcf678bb 100644 --- a/packet.h +++ b/packet.h @@ -6,16 +6,6 @@ #ifndef PACKET_H #define PACKET_H

-/** - * struct desc - Generic offset-based descriptor within buffer - * @offset: Offset of descriptor relative to buffer start, 32-bit limit - * @len: Length of descriptor, host order, 16-bit limit - */ -struct desc { - uint32_t offset; - uint16_t len; -}; - /** * struct pool - Generic pool of packets stored in a buffer * @buf: Buffer storing packet descriptors @@ -29,7 +19,7 @@ struct pool { size_t buf_size; size_t size; size_t count; - struct desc pkt[1]; + struct iovec pkt[1]; };

void packet_add_do(struct pool *p, size_t len, const char *start, @@ -54,7 +44,7 @@ struct _name ## _t { \ size_t buf_size; \ size_t size; \ size_t count; \ - struct desc pkt[_size]; \ + struct iovec pkt[_size]; \ }

#define PACKET_POOL_INIT_NOCAST(_size, _buf, _buf_size) \

-- Stefano

Laurent Vivier

12 Jul 12 Jul

5:32 p.m.

New subject: [PATCH v2 2/4] vhost-user: introduce virtio API

Add virtio.c and virtio.h that define the functions needed to manage virtqueues. Signed-off-by: Laurent Vivier --- Makefile | 4 +- util.h | 11 + virtio.c | 611 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ virtio.h | 190 +++++++++++++++++ 4 files changed, 814 insertions(+), 2 deletions(-) create mode 100644 virtio.c create mode 100644 virtio.h diff --git a/Makefile b/Makefile index 09fc461d087e..39613a7cf1f2 100644 --- a/Makefile +++ b/Makefile @@ -47,7 +47,7 @@ FLAGS += -DDUAL_STACK_SOCKETS=$(DUAL_STACK_SOCKETS) PASST_SRCS = arch.c arp.c checksum.c conf.c dhcp.c dhcpv6.c flow.c fwd.c \ icmp.c igmp.c inany.c iov.c ip.c isolation.c lineread.c log.c mld.c \ ndp.c netlink.c packet.c passt.c pasta.c pcap.c pif.c tap.c tcp.c \ - tcp_buf.c tcp_splice.c udp.c util.c + tcp_buf.c tcp_splice.c udp.c util.c virtio.c QRAP_SRCS = qrap.c SRCS = $(PASST_SRCS) $(QRAP_SRCS) @@ -57,7 +57,7 @@ PASST_HEADERS = arch.h arp.h checksum.h conf.h dhcp.h dhcpv6.h flow.h fwd.h \ flow_table.h icmp.h icmp_flow.h inany.h iov.h ip.h isolation.h \ lineread.h log.h ndp.h netlink.h packet.h passt.h pasta.h pcap.h pif.h \ siphash.h tap.h tcp.h tcp_buf.h tcp_conn.h tcp_internal.h tcp_splice.h \ - udp.h util.h + udp.h util.h virtio.h HEADERS = $(PASST_HEADERS) seccomp.h C := \#include \nstruct tcp_info x = { .tcpi_snd_wnd = 0 }; diff --git a/util.h b/util.h index eebb027be487..56c4e2e7b4fe 100644 --- a/util.h +++ b/util.h @@ -48,6 +48,9 @@ #define ROUND_DOWN(x, y) ((x) & ~((y) - 1)) #define ROUND_UP(x, y) (((x) + (y) - 1) & ~((y) - 1)) +#define ALIGN_DOWN(n, m) ((n) / (m) * (m)) +#define ALIGN_UP(n, m) ALIGN_DOWN((n) + (m) - 1, (m)) + #define MAX_FROM_BITS(n) (((1U << (n)) - 1)) #define BIT(n) (1UL << (n)) @@ -116,6 +119,14 @@ #define htonl_constant(x) (__bswap_constant_32(x)) #endif +static inline void barrier(void) { __asm__ __volatile__("" ::: "memory"); } +#define smp_mb() do { barrier(); __atomic_thread_fence(__ATOMIC_SEQ_CST); } while (0) +#define smp_mb_release() do { barrier(); __atomic_thread_fence(__ATOMIC_RELEASE); } while (0) +#define smp_mb_acquire() do { barrier(); __atomic_thread_fence(__ATOMIC_ACQUIRE); } while (0) + +#define smp_wmb() smp_mb_release() +#define smp_rmb() smp_mb_acquire() + #define NS_FN_STACK_SIZE (RLIMIT_STACK_VAL * 1024 / 8) int do_clone(int (*fn)(void *), char *stack_area, size_t stack_size, int flags, void *arg); diff --git a/virtio.c b/virtio.c new file mode 100644 index 000000000000..5f984f92cae0 --- /dev/null +++ b/virtio.c @@ -0,0 +1,611 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later + * Copyright Red Hat + * Author: Laurent Vivier + * + * virtio API, vring and virtqueue functions definition + */ + +/* some parts copied from QEMU subprojects/libvhost-user/libvhost-user.c */ + +#include +#include +#include +#include +#include +#include + +#include "util.h" +#include "virtio.h" + +#define VIRTQUEUE_MAX_SIZE 1024 + +/** + * vu_gpa_to_va() - Translate guest physical address to our virtual address. + * @dev: Vhost-user device + * @plen: Physical length to map (input), virtual address mapped (output) + * @guest_addr: Guest physical address + * + * Return: virtual address in our address space of the guest physical address + */ +static void *vu_gpa_to_va(struct vu_dev *dev, uint64_t *plen, uint64_t guest_addr) +{ + unsigned int i; + + if (*plen == 0) + return NULL; + + /* Find matching memory region. */ + for (i = 0; i < dev->nregions; i++) { + const struct vu_dev_region *r = &dev->regions[i]; + + if ((guest_addr >= r->gpa) && + (guest_addr < (r->gpa + r->size))) { + if ((guest_addr + *plen) > (r->gpa + r->size)) + *plen = r->gpa + r->size - guest_addr; + /* NOLINTNEXTLINE(performance-no-int-to-ptr) */ + return (void *)(guest_addr - r->gpa + r->mmap_addr + + r->mmap_offset); + } + } + + return NULL; +} + +/** + * vring_avail_flags() - Read the available ring flags + * @vq: Virtqueue + * + * Return: the available ring descriptor flags of the given virtqueue + */ +static inline uint16_t vring_avail_flags(const struct vu_virtq *vq) +{ + return le16toh(vq->vring.avail->flags); +} + +/** + * vring_avail_idx() - Read the available ring index + * @vq: Virtqueue + * + * Return: the available ring index of the given virtqueue + */ +static inline uint16_t vring_avail_idx(struct vu_virtq *vq) +{ + vq->shadow_avail_idx = le16toh(vq->vring.avail->idx); + + return vq->shadow_avail_idx; +} + +/** + * vring_avail_ring() - Read an available ring entry + * @vq: Virtqueue + * @i Index of the entry to read + * + * Return: the ring entry content (head of the descriptor chain) + */ +static inline uint16_t vring_avail_ring(const struct vu_virtq *vq, int i) +{ + return le16toh(vq->vring.avail->ring[i]); +} + +/** + * vring_get_used_event() - Get the used event from the available ring + * @vq Virtqueue + * + * Return: the used event (available only if VIRTIO_RING_F_EVENT_IDX is set) + * used_event is a performant alternative where the driver + * specifies how far the device can progress before a notification + * is required. In this case, virq_avail is defined as: + * struct virtq_avail { + * le16 flags; + * le16 idx; + * le16 ring[num]; + * le16 used_event; // Only if VIRTIO_F_EVENT_IDX + * }; + * If the idx field in the used ring (which determined where that + * descriptor index was placed) was equal to used_event, the device + * must send a notification. + * Otherwise the device should not send a notification. + */ +static inline uint16_t vring_get_used_event(const struct vu_virtq *vq) +{ + return vring_avail_ring(vq, vq->vring.num); +} + +/** + * virtqueue_get_head() - Get the head of the descriptor chain for a given + * index + * @vq: Virtqueue + * @idx: Available ring entry index + * @head: Head of the descriptor chain + */ +static void virtqueue_get_head(const struct vu_virtq *vq, + unsigned int idx, unsigned int *head) +{ + /* Grab the next descriptor number they're advertising, and increment + * the index we've seen. + */ + *head = vring_avail_ring(vq, idx % vq->vring.num); + + /* If their number is silly, that's a fatal mistake. */ + if (*head >= vq->vring.num) + vu_panic("Guest says index %u is available", *head); +} + +/** + * virtqueue_read_indirect_desc() - Copy virtio ring descriptors from guest + * memory + * @dev: Vhost-user device + * @desc: Destination address to copy the descriptors + * @addr: Guest memory address to copy from + * @len: Length of memory to copy + * + * Return: -1 if there is an error, 0 otherwise + */ +static int virtqueue_read_indirect_desc(struct vu_dev *dev, struct vring_desc *desc, + uint64_t addr, size_t len) +{ + uint64_t read_len; + + if (len > (VIRTQUEUE_MAX_SIZE * sizeof(struct vring_desc))) + return -1; + + if (len == 0) + return -1; + + while (len) { + const struct vring_desc *ori_desc; + + read_len = len; + ori_desc = vu_gpa_to_va(dev, &read_len, addr); + if (!ori_desc) + return -1; + + memcpy(desc, ori_desc, read_len); + len -= read_len; + addr += read_len; + desc += read_len / sizeof(struct vring_desc); + } + + return 0; +} + +/** + * enum virtqueue_read_desc_state - State in the descriptor chain + * @VIRTQUEUE_READ_DESC_ERROR Found an invalid descriptor + * @VIRTQUEUE_READ_DESC_DONE No more descriptor in the chain + * @VIRTQUEUE_READ_DESC_MORE there is more descriptors in the chain + */ +enum virtqueue_read_desc_state { + VIRTQUEUE_READ_DESC_ERROR = -1, + VIRTQUEUE_READ_DESC_DONE = 0, /* end of chain */ + VIRTQUEUE_READ_DESC_MORE = 1, /* more buffers in chain */ +}; + +/** + * virtqueue_read_next_desc() - Read the the next descriptor in the chain + * @desc: Virtio ring descriptors + * @i: Index of the current descriptor + * @max: Maximum value of the descriptor index + * @next: Index of the next descriptor in the chain (output value) + * + * Return: current chain descriptor state (error, next, done) + */ +static int virtqueue_read_next_desc(const struct vring_desc *desc, + int i, unsigned int max, unsigned int *next) +{ + /* If this descriptor says it doesn't chain, we're done. */ + if (!(le16toh(desc[i].flags) & VRING_DESC_F_NEXT)) + return VIRTQUEUE_READ_DESC_DONE; + + /* Check they're not leading us off end of descriptors. */ + *next = le16toh(desc[i].next); + /* Make sure compiler knows to grab that: we don't want it changing! */ + smp_wmb(); + + if (*next >= max) + return VIRTQUEUE_READ_DESC_ERROR; + + return VIRTQUEUE_READ_DESC_MORE; +} + +/** + * vu_queue_empty() - Check if virtqueue is empty + * @vq: Virtqueue + * + * Return: true if the virtqueue is empty, false otherwise + */ +bool vu_queue_empty(struct vu_virtq *vq) +{ + if (!vq->vring.avail) + return true; + + if (vq->shadow_avail_idx != vq->last_avail_idx) + return false; + + return vring_avail_idx(vq) == vq->last_avail_idx; +} + +/** + * vring_notify() - Check if a notification can be sent + * @dev: Vhost-user device + * @vq: Virtqueue + * + * Return: true if notification can be sent + */ +static bool vring_notify(const struct vu_dev *dev, struct vu_virtq *vq) +{ + uint16_t old, new; + bool v; + + /* We need to expose used array entries before checking used event. */ + smp_mb(); + + /* Always notify when queue is empty (when feature acknowledge) */ + if (vu_has_feature(dev, VIRTIO_F_NOTIFY_ON_EMPTY) && + !vq->inuse && vu_queue_empty(vq)) { + return true; + } + + if (!vu_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) + return !(vring_avail_flags(vq) & VRING_AVAIL_F_NO_INTERRUPT); + + v = vq->signalled_used_valid; + vq->signalled_used_valid = true; + old = vq->signalled_used; + new = vq->signalled_used = vq->used_idx; + return !v || vring_need_event(vring_get_used_event(vq), new, old); +} + +/** + * vu_queue_notify() - Send a notification the given virtqueue + * @dev: Vhost-user device + * @vq: Virtqueue + */ +/* cppcheck-suppress unusedFunction */ +void vu_queue_notify(const struct vu_dev *dev, struct vu_virtq *vq) +{ + if (!vq->vring.avail) + return; + + if (!vring_notify(dev, vq)) { + debug("skipped notify..."); + return; + } + + if (eventfd_write(vq->call_fd, 1) < 0) + vu_panic("Error writing eventfd: %s", strerror(errno)); +} + +/** + * vring_set_avail_event() - Set avail_event + * @vq: Virtqueue + * @val: Value to set to avail_event + * avail_event is used in the same way the used_event is in the + * avail_ring. + * struct virtq_used { + * le16 flags; + * le16 idx; + * struct virtq_used_elem ringnum]; + * le16 avail_event; // Only if VIRTIO_F_EVENT_IDX + * }; + * avail_event is used to advise the driver that notifications + * are unnecessary until the driver writes entry with an index + * specified by avail_event into the available ring. + */ +static inline void vring_set_avail_event(struct vu_virtq *vq, uint16_t val) +{ + uint16_t val_le = htole16(val); + + if (!vq->notification) + return; + + memcpy(&vq->vring.used->ring[vq->vring.num], &val_le, sizeof(uint16_t)); +} + +/** + * virtqueue_map_desc() - Translate descriptor ring physical address into our + * virtual address space + * @dev: Vhost-user device + * @p_num_sg: First iov entry to use (input), + * first iov entry not sued (output) + * @iov: Iov array to use to store buffer virtual addresses + * @max_num_sg: Maximum number of iov entries + * @pa: Guest physical address of the buffer to map into our virtual + * address + * @sz: Size of the buffer + * + * Return: false on error, true otherwise + */ +static bool virtqueue_map_desc(struct vu_dev *dev, + unsigned int *p_num_sg, struct iovec *iov, + unsigned int max_num_sg, + uint64_t pa, size_t sz) +{ + unsigned int num_sg = *p_num_sg; + + ASSERT(num_sg <= max_num_sg); + + if (!sz) + vu_panic("virtio: zero sized buffers are not allowed"); + + while (sz) { + uint64_t len = sz; + + if (num_sg == max_num_sg) + vu_panic("virtio: too many descriptors in indirect table"); + + iov[num_sg].iov_base = vu_gpa_to_va(dev, &len, pa); + if (iov[num_sg].iov_base == NULL) + vu_panic("virtio: invalid address for buffers"); + iov[num_sg].iov_len = len; + num_sg++; + sz -= len; + pa += len; + } + + *p_num_sg = num_sg; + return true; +} + +/** + * vu_queue_map_desc - Map the virqueue descriptor ring into our virtual + * address space + * @dev: Vhost-user device + * @vq: Virtqueue + * @idx: First descriptor ring entry to map + * @elem: Virtqueue element to store descriptor ring iov + * + * Return: -1 if there is an error, 0 otherwise + */ +static int vu_queue_map_desc(struct vu_dev *dev, struct vu_virtq *vq, unsigned int idx, + struct vu_virtq_element *elem) +{ + const struct vring_desc *desc = vq->vring.desc; + struct vring_desc desc_buf[VIRTQUEUE_MAX_SIZE]; + unsigned int out_num = 0, in_num = 0; + unsigned int max = vq->vring.num; + unsigned int i = idx; + uint64_t read_len; + int rc; + + if (le16toh(desc[i].flags) & VRING_DESC_F_INDIRECT) { + unsigned int desc_len; + uint64_t desc_addr; + + if (le32toh(desc[i].len) % sizeof(struct vring_desc)) + vu_panic("Invalid size for indirect buffer table"); + + /* loop over the indirect descriptor table */ + desc_addr = le64toh(desc[i].addr); + desc_len = le32toh(desc[i].len); + max = desc_len / sizeof(struct vring_desc); + read_len = desc_len; + desc = vu_gpa_to_va(dev, &read_len, desc_addr); + if (desc && read_len != desc_len) { + /* Failed to use zero copy */ + desc = NULL; + if (!virtqueue_read_indirect_desc(dev, desc_buf, desc_addr, desc_len)) + desc = desc_buf; + } + if (!desc) + vu_panic("Invalid indirect buffer table"); + i = 0; + } + + /* Collect all the descriptors */ + do { + if (le16toh(desc[i].flags) & VRING_DESC_F_WRITE) { + if (!virtqueue_map_desc(dev, &in_num, elem->in_sg, + elem->in_num, + le64toh(desc[i].addr), + le32toh(desc[i].len))) { + return -1; + } + } else { + if (in_num) + vu_panic("Incorrect order for descriptors"); + if (!virtqueue_map_desc(dev, &out_num, elem->out_sg, + elem->out_num, + le64toh(desc[i].addr), + le32toh(desc[i].len))) { + return -1; + } + } + + /* If we've got too many, that implies a descriptor loop. */ + if ((in_num + out_num) > max) + vu_panic("Looped descriptor"); + rc = virtqueue_read_next_desc(desc, i, max, &i); + } while (rc == VIRTQUEUE_READ_DESC_MORE); + + if (rc == VIRTQUEUE_READ_DESC_ERROR) + vu_panic("read descriptor error"); + + elem->index = idx; + elem->in_num = in_num; + elem->out_num = out_num; + + return 0; +} + +/** + * vu_queue_pop() - Pop an entry from the virtqueue + * @dev: Vhost-user device + * @vq: Virtqueue + * @elem: Virtqueue element to file with the entry information + * + * Return: -1 if there is an error, 0 otherwise + */ +/* cppcheck-suppress unusedFunction */ +int vu_queue_pop(struct vu_dev *dev, struct vu_virtq *vq, struct vu_virtq_element *elem) +{ + unsigned int head; + int ret; + + if (!vq->vring.avail) + return -1; + + if (vu_queue_empty(vq)) + return -1; + + /* + * Needed after vu_queue_empty(), see comment in + * virtqueue_num_heads(). + */ + smp_rmb(); + + if (vq->inuse >= vq->vring.num) + vu_panic("Virtqueue size exceeded"); + + virtqueue_get_head(vq, vq->last_avail_idx++, &head); + + if (vu_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) + vring_set_avail_event(vq, vq->last_avail_idx); + + ret = vu_queue_map_desc(dev, vq, head, elem); + + if (ret < 0) + return ret; + + vq->inuse++; + + return 0; +} + +/** + * vu_queue_detach_element() - Detach an element from the virqueue + * @dev: Vhost-user device + * @vq: Virtqueue + * @index: Index of the element to detach + * @len: Size of the element to detach + */ +void vu_queue_detach_element(struct vu_dev *dev, struct vu_virtq *vq, + unsigned int index, size_t len) +{ + (void)dev; + (void)index; + (void)len; + + vq->inuse--; + /* unmap, when DMA support is added */ +} + +/** + * vu_queue_unpop() - Push back a previously popped element from the virqueue + * @dev: Vhost-user device + * @vq: Virtqueue + * @index: Index of the element to unpop + * @len: Size of the element to unpop + */ +/* cppcheck-suppress unusedFunction */ +void vu_queue_unpop(struct vu_dev *dev, struct vu_virtq *vq, unsigned int index, size_t len) +{ + vq->last_avail_idx--; + vu_queue_detach_element(dev, vq, index, len); +} + +/** + * vu_queue_rewind() - Push back a given number of popped elements + * @dev: Vhost-user device + * @vq: Virtqueue + * @num: Number of element to unpop + */ +/* cppcheck-suppress unusedFunction */ +bool vu_queue_rewind(struct vu_dev *dev, struct vu_virtq *vq, unsigned int num) +{ + (void)dev; + if (num > vq->inuse) + return false; + + vq->last_avail_idx -= num; + vq->inuse -= num; + return true; +} + +/** + * vring_used_write() - Write an entry in the used ring + * @vq: Virtqueue + * @uelem: Entry to write + * @i: Index of the entry in the used ring + */ +static inline void vring_used_write(struct vu_virtq *vq, + const struct vring_used_elem *uelem, int i) +{ + struct vring_used *used = vq->vring.used; + + used->ring[i] = *uelem; +} + +/** + * vu_queue_fill_by_index() - Update information of a descriptor ring entry + * in the used ring + * @vq: Virtqueue + * @index: Descriptor ring index + * @len: Size of the element + * @idx: Used ring entry index + */ +void vu_queue_fill_by_index(struct vu_virtq *vq, unsigned int index, + unsigned int len, unsigned int idx) +{ + struct vring_used_elem uelem; + + if (!vq->vring.avail) + return; + + idx = (idx + vq->used_idx) % vq->vring.num; + + uelem.id = htole32(index); + uelem.len = htole32(len); + vring_used_write(vq, &uelem, idx); +} + +/** + * vu_queue_fill() - Update information of a given element in the used ring + * @dev: Vhost-user device + * @vq: Virtqueue + * @elem: Element information to fill + * @len: Size of the element + * @idx: Used ring entry index + */ +/* cppcheck-suppress unusedFunction */ +void vu_queue_fill(struct vu_virtq *vq, const struct vu_virtq_element *elem, + unsigned int len, unsigned int idx) +{ + vu_queue_fill_by_index(vq, elem->index, len, idx); +} + +/** + * vring_used_idx_set() - Set the descriptor ring current index + * @vq: Virtqueue + * @val: Value to set in the index + */ +static inline void vring_used_idx_set(struct vu_virtq *vq, uint16_t val) +{ + vq->vring.used->idx = htole16(val); + + vq->used_idx = val; +} + +/** + * vu_queue_flush() - Flush the virtqueue + * @vq: Virtqueue + * @count: Number of entry to flush + */ +/* cppcheck-suppress unusedFunction */ +void vu_queue_flush(struct vu_virtq *vq, unsigned int count) +{ + uint16_t old, new; + + if (!vq->vring.avail) + return; + + /* Make sure buffer is written before we update index. */ + smp_wmb(); + + old = vq->used_idx; + new = old + count; + vring_used_idx_set(vq, new); + vq->inuse -= count; + if ((int16_t)(new - vq->signalled_used) < (uint16_t)(new - old)) + vq->signalled_used_valid = false; +} diff --git a/virtio.h b/virtio.h new file mode 100644 index 000000000000..0a2cf6230139 --- /dev/null +++ b/virtio.h @@ -0,0 +1,190 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later + * Copyright Red Hat + * Author: Laurent Vivier + * + * virtio API, vring and virtqueue functions definition + */ + +#ifndef VIRTIO_H +#define VIRTIO_H + +#include +#include + +#define vu_panic(...) die( __VA_ARGS__ ) + +/* Maximum size of a virtqueue */ +#define VIRTQUEUE_MAX_SIZE 1024 + +/** + * struct vu_ring - Virtqueue rings + * @num: Size of the queue + * @desc: Descriptor ring + * @avail: Available ring + * @used: Used ring + * @log_guest_addr: Guest address for logging + * @flags: Vring flags + * VHOST_VRING_F_LOG is set if log address is valid + */ +struct vu_ring { + unsigned int num; + struct vring_desc *desc; + struct vring_avail *avail; + struct vring_used *used; + uint64_t log_guest_addr; + uint32_t flags; +}; + +/** + * struct vu_virtq - Virtqueue definition + * @vring: Virtqueue rings + * @last_avail_idx: Next head to pop + * @shadow_avail_idx: Last avail_idx read from VQ. + * @used_idx: Descriptor ring current index + * @signalled_used: Last used index value we have signalled on + * @signalled_used_valid: True if signalled_used if valid + * @notification: True if the queues notify (via event + * index or interrupt) + * @inuse: Number of entries in use + * @call_fd: The event file descriptor to signal when + * buffers are used. + * @kick_fd: The event file descriptor for adding + * buffers to the vring + * @err_fd: The event file descriptor to signal when + * error occurs + * @enable: True if the virtqueue is enabled + * @started: True if the virtqueue is started + * @vra: QEMU address of our rings + */ +struct vu_virtq { + struct vu_ring vring; + uint16_t last_avail_idx; + uint16_t shadow_avail_idx; + uint16_t used_idx; + uint16_t signalled_used; + bool signalled_used_valid; + bool notification; + unsigned int inuse; + int call_fd; + int kick_fd; + int err_fd; + unsigned int enable; + bool started; + struct vhost_vring_addr vra; +}; + +/** + * struct vu_dev_region - guest shared memory region + * @gpa: Guest physical address of the region + * @size: Memory size in bytes + * @qva: QEMU virtual address + * @mmap_offset: Offset where the region starts in the mapped memory + * @mmap_addr: Address of the mapped memory + */ +struct vu_dev_region { + uint64_t gpa; + uint64_t size; + uint64_t qva; + uint64_t mmap_offset; + uint64_t mmap_addr; +}; + +#define VHOST_USER_MAX_QUEUES 2 + +/* + * Set a reasonable maximum number of ram slots, which will be supported by + * any architecture. + */ +#define VHOST_USER_MAX_RAM_SLOTS 32 + +/** + * struct vu_dev + * @context: Execution context + * nregions: Number of shared memory regions + * @regions: Guest shared memory regions + * @features: Vhost-user features + * @protocol_features: Vhost-user protocol features + * @hdrlen: Virtio -net header length + */ +struct vu_dev { + uint32_t nregions; + struct vu_dev_region regions[VHOST_USER_MAX_RAM_SLOTS]; + struct vu_virtq vq[VHOST_USER_MAX_QUEUES]; + uint64_t features; + uint64_t protocol_features; + int hdrlen; +}; + +/** + * struct vu_virtq_element + * @index: Descriptor ring index + * @out_num: Number of outgoing iovec buffers + * @in_num: Number of incoming iovec buffers + * @in_sg: Incoming iovec buffers + * @out_sg: Outgoing iovec buffers + */ +struct vu_virtq_element { + unsigned int index; + unsigned int out_num; + unsigned int in_num; + struct iovec *in_sg; + struct iovec *out_sg; +}; + +/** + * has_feature() - Check a feature bit in a features set + * @features: Features set + * @fb: Feature bit to check + * + * Return: True if the feature bit is set + */ +static inline bool has_feature(uint64_t features, unsigned int fbit) +{ + return !!(features & (1ULL << fbit)); +} + +/** + * vu_has_feature() - Check if a virtio-net feature is available + * @vdev: Vhost-user device + * @bit: Feature to check + * + * Return: True if the feature is available + */ +static inline bool vu_has_feature(const struct vu_dev *vdev, + unsigned int fbit) +{ + return has_feature(vdev->features, fbit); +} + +/** + * vu_has_protocol_feature() - Check if a vhost-user feature is available + * @vdev: Vhost-user device + * @bit: Feature to check + * + * Return: True if the feature is available + */ +/* cppcheck-suppress unusedFunction */ +static inline bool vu_has_protocol_feature(const struct vu_dev *vdev, + unsigned int fbit) +{ + return has_feature(vdev->protocol_features, fbit); +} + +bool vu_queue_empty(struct vu_virtq *vq); +void vu_queue_notify(const struct vu_dev *dev, struct vu_virtq *vq); +int vu_queue_pop(struct vu_dev *dev, struct vu_virtq *vq, + struct vu_virtq_element *elem); +void vu_queue_detach_element(struct vu_dev *dev, struct vu_virtq *vq, + unsigned int index, size_t len); +void vu_queue_unpop(struct vu_dev *dev, struct vu_virtq *vq, + unsigned int index, size_t len); +bool vu_queue_rewind(struct vu_dev *dev, struct vu_virtq *vq, + unsigned int num); + +void vu_queue_fill_by_index(struct vu_virtq *vq, unsigned int index, + unsigned int len, unsigned int idx); +void vu_queue_fill(struct vu_virtq *vq, + const struct vu_virtq_element *elem, unsigned int len, + unsigned int idx); +void vu_queue_flush(struct vu_virtq *vq, unsigned int count); +#endif /* VIRTIO_H */ -- 2.45.2

David Gibson

17 Jul 17 Jul

7:21 a.m.

New subject: [PATCH v2 2/4] vhost-user: introduce virtio API

On Fri, Jul 12, 2024 at 05:32:42PM +0200, Laurent Vivier wrote:

...

Add virtio.c and virtio.h that define the functions needed to manage virtqueues.

Signed-off-by: Laurent Vivier --- Makefile | 4 +- util.h | 11 + virtio.c | 611 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ virtio.h | 190 +++++++++++++++++ 4 files changed, 814 insertions(+), 2 deletions(-) create mode 100644 virtio.c create mode 100644 virtio.h

diff --git a/Makefile b/Makefile index 09fc461d087e..39613a7cf1f2 100644 --- a/Makefile +++ b/Makefile @@ -47,7 +47,7 @@ FLAGS += -DDUAL_STACK_SOCKETS=$(DUAL_STACK_SOCKETS) PASST_SRCS = arch.c arp.c checksum.c conf.c dhcp.c dhcpv6.c flow.c fwd.c \ icmp.c igmp.c inany.c iov.c ip.c isolation.c lineread.c log.c mld.c \ ndp.c netlink.c packet.c passt.c pasta.c pcap.c pif.c tap.c tcp.c \ - tcp_buf.c tcp_splice.c udp.c util.c + tcp_buf.c tcp_splice.c udp.c util.c virtio.c QRAP_SRCS = qrap.c SRCS = $(PASST_SRCS) $(QRAP_SRCS)

@@ -57,7 +57,7 @@ PASST_HEADERS = arch.h arp.h checksum.h conf.h dhcp.h dhcpv6.h flow.h fwd.h \ flow_table.h icmp.h icmp_flow.h inany.h iov.h ip.h isolation.h \ lineread.h log.h ndp.h netlink.h packet.h passt.h pasta.h pcap.h pif.h \ siphash.h tap.h tcp.h tcp_buf.h tcp_conn.h tcp_internal.h tcp_splice.h \ - udp.h util.h + udp.h util.h virtio.h HEADERS = $(PASST_HEADERS) seccomp.h

C := \#include \nstruct tcp_info x = { .tcpi_snd_wnd = 0 }; diff --git a/util.h b/util.h index eebb027be487..56c4e2e7b4fe 100644 --- a/util.h +++ b/util.h @@ -48,6 +48,9 @@ #define ROUND_DOWN(x, y) ((x) & ~((y) - 1)) #define ROUND_UP(x, y) (((x) + (y) - 1) & ~((y) - 1))

+#define ALIGN_DOWN(n, m) ((n) / (m) * (m)) +#define ALIGN_UP(n, m) ALIGN_DOWN((n) + (m) - 1, (m))

Hrm. Aren't these equivalent to the ROUND_{UP,DOWN}() macros above. Or rather, I think the ALIGN versions are more general, since they'll work with y/m values that aren't powers of 2. I don't see any reason to have two versions, though, since I'm fairly confident the compiler will be able to convert the more general version to the more specific one as necessary.

...

#define MAX_FROM_BITS(n) (((1U << (n)) - 1))

#define BIT(n) (1UL << (n)) @@ -116,6 +119,14 @@ #define htonl_constant(x) (__bswap_constant_32(x)) #endif

+static inline void barrier(void) { __asm__ __volatile__("" ::: "memory"); } +#define smp_mb() do { barrier(); __atomic_thread_fence(__ATOMIC_SEQ_CST); } while (0) +#define smp_mb_release() do { barrier(); __atomic_thread_fence(__ATOMIC_RELEASE); } while (0) +#define smp_mb_acquire() do { barrier(); __atomic_thread_fence(__ATOMIC_ACQUIRE); } while (0) + +#define smp_wmb() smp_mb_release() +#define smp_rmb() smp_mb_acquire() + #define NS_FN_STACK_SIZE (RLIMIT_STACK_VAL * 1024 / 8) int do_clone(int (*fn)(void *), char *stack_area, size_t stack_size, int flags, void *arg); diff --git a/virtio.c b/virtio.c new file mode 100644 index 000000000000..5f984f92cae0 --- /dev/null +++ b/virtio.c @@ -0,0 +1,611 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later + * Copyright Red Hat + * Author: Laurent Vivier + * + * virtio API, vring and virtqueue functions definition + */

Nit: the convention in post passt source files is, SPDX, then description of this file, then copyright and authorship.

...

+ +/* some parts copied from QEMU subprojects/libvhost-user/libvhost-user.c */ + +#include +#include +#include +#include +#include +#include + +#include "util.h" +#include "virtio.h" + +#define VIRTQUEUE_MAX_SIZE 1024 + +/** + * vu_gpa_to_va() - Translate guest physical address to our virtual address. + * @dev: Vhost-user device + * @plen: Physical length to map (input), virtual address mapped (output) + * @guest_addr: Guest physical address + * + * Return: virtual address in our address space of the guest physical address + */ +static void *vu_gpa_to_va(struct vu_dev *dev, uint64_t *plen, uint64_t guest_addr) +{ + unsigned int i; + + if (*plen == 0) + return NULL; + + /* Find matching memory region. */ + for (i = 0; i < dev->nregions; i++) { + const struct vu_dev_region *r = &dev->regions[i]; + + if ((guest_addr >= r->gpa) && + (guest_addr < (r->gpa + r->size))) { + if ((guest_addr + *plen) > (r->gpa + r->size)) + *plen = r->gpa + r->size - guest_addr; + /* NOLINTNEXTLINE(performance-no-int-to-ptr) */ + return (void *)(guest_addr - r->gpa + r->mmap_addr + + r->mmap_offset); + } + } + + return NULL; +} + +/** + * vring_avail_flags() - Read the available ring flags + * @vq: Virtqueue + * + * Return: the available ring descriptor flags of the given virtqueue + */ +static inline uint16_t vring_avail_flags(const struct vu_virtq *vq) +{ + return le16toh(vq->vring.avail->flags); +} + +/** + * vring_avail_idx() - Read the available ring index + * @vq: Virtqueue + * + * Return: the available ring index of the given virtqueue + */ +static inline uint16_t vring_avail_idx(struct vu_virtq *vq) +{ + vq->shadow_avail_idx = le16toh(vq->vring.avail->idx); + + return vq->shadow_avail_idx; +} + +/** + * vring_avail_ring() - Read an available ring entry + * @vq: Virtqueue + * @i Index of the entry to read + * + * Return: the ring entry content (head of the descriptor chain) + */ +static inline uint16_t vring_avail_ring(const struct vu_virtq *vq, int i) +{ + return le16toh(vq->vring.avail->ring[i]); +} + +/** + * vring_get_used_event() - Get the used event from the available ring + * @vq Virtqueue + * + * Return: the used event (available only if VIRTIO_RING_F_EVENT_IDX is set) + * used_event is a performant alternative where the driver + * specifies how far the device can progress before a notification + * is required. In this case, virq_avail is defined as: + * struct virtq_avail { + * le16 flags; + * le16 idx; + * le16 ring[num]; + * le16 used_event; // Only if VIRTIO_F_EVENT_IDX + * }; + * If the idx field in the used ring (which determined where that + * descriptor index was placed) was equal to used_event, the device + * must send a notification. + * Otherwise the device should not send a notification. + */ +static inline uint16_t vring_get_used_event(const struct vu_virtq *vq) +{ + return vring_avail_ring(vq, vq->vring.num); +} + +/** + * virtqueue_get_head() - Get the head of the descriptor chain for a given + * index + * @vq: Virtqueue + * @idx: Available ring entry index + * @head: Head of the descriptor chain + */ +static void virtqueue_get_head(const struct vu_virtq *vq, + unsigned int idx, unsigned int *head) +{ + /* Grab the next descriptor number they're advertising, and increment + * the index we've seen. + */ + *head = vring_avail_ring(vq, idx % vq->vring.num); + + /* If their number is silly, that's a fatal mistake. */ + if (*head >= vq->vring.num) + vu_panic("Guest says index %u is available", *head); +} + +/** + * virtqueue_read_indirect_desc() - Copy virtio ring descriptors from guest + * memory + * @dev: Vhost-user device + * @desc: Destination address to copy the descriptors + * @addr: Guest memory address to copy from + * @len: Length of memory to copy + * + * Return: -1 if there is an error, 0 otherwise + */ +static int virtqueue_read_indirect_desc(struct vu_dev *dev, struct vring_desc *desc, + uint64_t addr, size_t len) +{ + uint64_t read_len; + + if (len > (VIRTQUEUE_MAX_SIZE * sizeof(struct vring_desc))) + return -1; + + if (len == 0) + return -1; + + while (len) { + const struct vring_desc *ori_desc; + + read_len = len; + ori_desc = vu_gpa_to_va(dev, &read_len, addr); + if (!ori_desc) + return -1; + + memcpy(desc, ori_desc, read_len); + len -= read_len; + addr += read_len; + desc += read_len / sizeof(struct vring_desc); + } + + return 0; +} + +/** + * enum virtqueue_read_desc_state - State in the descriptor chain + * @VIRTQUEUE_READ_DESC_ERROR Found an invalid descriptor + * @VIRTQUEUE_READ_DESC_DONE No more descriptor in the chain

Nit: grammar, "No more descriptors in the chain"

...

+ * @VIRTQUEUE_READ_DESC_MORE there is more descriptors in the chain

Nit: grammar, "there are" rather than "there is"

...

+ */ +enum virtqueue_read_desc_state { + VIRTQUEUE_READ_DESC_ERROR = -1, + VIRTQUEUE_READ_DESC_DONE = 0, /* end of chain */ + VIRTQUEUE_READ_DESC_MORE = 1, /* more buffers in chain */ +}; + +/** + * virtqueue_read_next_desc() - Read the the next descriptor in the chain + * @desc: Virtio ring descriptors + * @i: Index of the current descriptor + * @max: Maximum value of the descriptor index + * @next: Index of the next descriptor in the chain (output value) + * + * Return: current chain descriptor state (error, next, done) + */ +static int virtqueue_read_next_desc(const struct vring_desc *desc, + int i, unsigned int max, unsigned int *next) +{ + /* If this descriptor says it doesn't chain, we're done. */ + if (!(le16toh(desc[i].flags) & VRING_DESC_F_NEXT)) + return VIRTQUEUE_READ_DESC_DONE; + + /* Check they're not leading us off end of descriptors. */ + *next = le16toh(desc[i].next); + /* Make sure compiler knows to grab that: we don't want it changing! */ + smp_wmb(); + + if (*next >= max) + return VIRTQUEUE_READ_DESC_ERROR; + + return VIRTQUEUE_READ_DESC_MORE; +} + +/** + * vu_queue_empty() - Check if virtqueue is empty + * @vq: Virtqueue + * + * Return: true if the virtqueue is empty, false otherwise + */ +bool vu_queue_empty(struct vu_virtq *vq) +{ + if (!vq->vring.avail) + return true; + + if (vq->shadow_avail_idx != vq->last_avail_idx) + return false; + + return vring_avail_idx(vq) == vq->last_avail_idx; +} + +/** + * vring_notify() - Check if a notification can be sent + * @dev: Vhost-user device + * @vq: Virtqueue + * + * Return: true if notification can be sent + */

Maybe call this vring_can_notify() or something, since it doesn't actually do the notification.

...

+static bool vring_notify(const struct vu_dev *dev, struct vu_virtq *vq) +{ + uint16_t old, new; + bool v; + + /* We need to expose used array entries before checking used event. */ + smp_mb(); + + /* Always notify when queue is empty (when feature acknowledge) */ + if (vu_has_feature(dev, VIRTIO_F_NOTIFY_ON_EMPTY) && + !vq->inuse && vu_queue_empty(vq)) { + return true; + } + + if (!vu_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) + return !(vring_avail_flags(vq) & VRING_AVAIL_F_NO_INTERRUPT); + + v = vq->signalled_used_valid; + vq->signalled_used_valid = true; + old = vq->signalled_used; + new = vq->signalled_used = vq->used_idx; + return !v || vring_need_event(vring_get_used_event(vq), new, old); +} + +/** + * vu_queue_notify() - Send a notification the given virtqueue + * @dev: Vhost-user device + * @vq: Virtqueue + */ +/* cppcheck-suppress unusedFunction */ +void vu_queue_notify(const struct vu_dev *dev, struct vu_virtq *vq) +{ + if (!vq->vring.avail) + return; + + if (!vring_notify(dev, vq)) { + debug("skipped notify...");

Maybe give a bit more context in this message (like the fact that it's vhost-user related).

...

+ return; + } + + if (eventfd_write(vq->call_fd, 1) < 0) + vu_panic("Error writing eventfd: %s", strerror(errno)); +} + +/** + * vring_set_avail_event() - Set avail_event + * @vq: Virtqueue + * @val: Value to set to avail_event + * avail_event is used in the same way the used_event is in the + * avail_ring. + * struct virtq_used { + * le16 flags; + * le16 idx; + * struct virtq_used_elem ringnum]; + * le16 avail_event; // Only if VIRTIO_F_EVENT_IDX + * }; + * avail_event is used to advise the driver that notifications + * are unnecessary until the driver writes entry with an index + * specified by avail_event into the available ring. + */ +static inline void vring_set_avail_event(struct vu_virtq *vq, uint16_t val) +{ + uint16_t val_le = htole16(val); + + if (!vq->notification) + return; + + memcpy(&vq->vring.used->ring[vq->vring.num], &val_le, sizeof(uint16_t));

sizeof(val_le) would be preferred here.

...

+} + +/** + * virtqueue_map_desc() - Translate descriptor ring physical address into our + * virtual address space + * @dev: Vhost-user device + * @p_num_sg: First iov entry to use (input), + * first iov entry not sued (output)

s/sued/used/?

...

+ * @iov: Iov array to use to store buffer virtual addresses + * @max_num_sg: Maximum number of iov entries + * @pa: Guest physical address of the buffer to map into our virtual + * address + * @sz: Size of the buffer + * + * Return: false on error, true otherwise + */ +static bool virtqueue_map_desc(struct vu_dev *dev, + unsigned int *p_num_sg, struct iovec *iov, + unsigned int max_num_sg, + uint64_t pa, size_t sz) +{ + unsigned int num_sg = *p_num_sg; + + ASSERT(num_sg <= max_num_sg);

Shouldn't this be strictly

...

+ if (!sz) + vu_panic("virtio: zero sized buffers are not allowed");

IIUC this indicates a bug in the caller, so just ASSERT(sz) would be appropriate.

...

+ + while (sz) { + uint64_t len = sz; + + if (num_sg == max_num_sg) + vu_panic("virtio: too many descriptors in indirect table"); + + iov[num_sg].iov_base = vu_gpa_to_va(dev, &len, pa); + if (iov[num_sg].iov_base == NULL) + vu_panic("virtio: invalid address for buffers");

This could also be an ASSERT(), I think.

...

+ iov[num_sg].iov_len = len; + num_sg++; + sz -= len; + pa += len; + } + + *p_num_sg = num_sg; + return true; +} + +/** + * vu_queue_map_desc - Map the virqueue descriptor ring into our virtual + * address space + * @dev: Vhost-user device + * @vq: Virtqueue + * @idx: First descriptor ring entry to map + * @elem: Virtqueue element to store descriptor ring iov + * + * Return: -1 if there is an error, 0 otherwise + */ +static int vu_queue_map_desc(struct vu_dev *dev, struct vu_virtq *vq, unsigned int idx, + struct vu_virtq_element *elem) +{ + const struct vring_desc *desc = vq->vring.desc; + struct vring_desc desc_buf[VIRTQUEUE_MAX_SIZE]; + unsigned int out_num = 0, in_num = 0; + unsigned int max = vq->vring.num; + unsigned int i = idx; + uint64_t read_len; + int rc; + + if (le16toh(desc[i].flags) & VRING_DESC_F_INDIRECT) { + unsigned int desc_len; + uint64_t desc_addr; + + if (le32toh(desc[i].len) % sizeof(struct vring_desc)) + vu_panic("Invalid size for indirect buffer table"); + + /* loop over the indirect descriptor table */ + desc_addr = le64toh(desc[i].addr); + desc_len = le32toh(desc[i].len); + max = desc_len / sizeof(struct vring_desc); + read_len = desc_len; + desc = vu_gpa_to_va(dev, &read_len, desc_addr); + if (desc && read_len != desc_len) { + /* Failed to use zero copy */ + desc = NULL; + if (!virtqueue_read_indirect_desc(dev, desc_buf, desc_addr, desc_len)) + desc = desc_buf; + } + if (!desc) + vu_panic("Invalid indirect buffer table"); + i = 0; + } + + /* Collect all the descriptors */ + do { + if (le16toh(desc[i].flags) & VRING_DESC_F_WRITE) { + if (!virtqueue_map_desc(dev, &in_num, elem->in_sg, + elem->in_num, + le64toh(desc[i].addr), + le32toh(desc[i].len))) { + return -1; + } + } else { + if (in_num) + vu_panic("Incorrect order for descriptors"); + if (!virtqueue_map_desc(dev, &out_num, elem->out_sg, + elem->out_num, + le64toh(desc[i].addr), + le32toh(desc[i].len))) { + return -1; + } + } + + /* If we've got too many, that implies a descriptor loop. */ + if ((in_num + out_num) > max) + vu_panic("Looped descriptor"); + rc = virtqueue_read_next_desc(desc, i, max, &i); + } while (rc == VIRTQUEUE_READ_DESC_MORE); + + if (rc == VIRTQUEUE_READ_DESC_ERROR) + vu_panic("read descriptor error"); + + elem->index = idx; + elem->in_num = in_num; + elem->out_num = out_num; + + return 0; +} + +/** + * vu_queue_pop() - Pop an entry from the virtqueue + * @dev: Vhost-user device + * @vq: Virtqueue + * @elem: Virtqueue element to file with the entry information + * + * Return: -1 if there is an error, 0 otherwise + */ +/* cppcheck-suppress unusedFunction */ +int vu_queue_pop(struct vu_dev *dev, struct vu_virtq *vq, struct vu_virtq_element *elem) +{ + unsigned int head; + int ret; + + if (!vq->vring.avail) + return -1; + + if (vu_queue_empty(vq)) + return -1; + + /* + * Needed after vu_queue_empty(), see comment in + * virtqueue_num_heads(). + */ + smp_rmb(); + + if (vq->inuse >= vq->vring.num) + vu_panic("Virtqueue size exceeded"); + + virtqueue_get_head(vq, vq->last_avail_idx++, &head); + + if (vu_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) + vring_set_avail_event(vq, vq->last_avail_idx); + + ret = vu_queue_map_desc(dev, vq, head, elem); + + if (ret < 0) + return ret; + + vq->inuse++; + + return 0; +} + +/** + * vu_queue_detach_element() - Detach an element from the virqueue + * @dev: Vhost-user device + * @vq: Virtqueue + * @index: Index of the element to detach + * @len: Size of the element to detach + */ +void vu_queue_detach_element(struct vu_dev *dev, struct vu_virtq *vq, + unsigned int index, size_t len) +{ + (void)dev; + (void)index; + (void)len;

AFAICT this isn't used as a function pointer, so why include the unused parameter?

...

+ + vq->inuse--; + /* unmap, when DMA support is added */ +} + +/** + * vu_queue_unpop() - Push back a previously popped element from the virqueue + * @dev: Vhost-user device + * @vq: Virtqueue + * @index: Index of the element to unpop + * @len: Size of the element to unpop + */ +/* cppcheck-suppress unusedFunction */ +void vu_queue_unpop(struct vu_dev *dev, struct vu_virtq *vq, unsigned int index, size_t len) +{ + vq->last_avail_idx--; + vu_queue_detach_element(dev, vq, index, len); +} + +/** + * vu_queue_rewind() - Push back a given number of popped elements + * @dev: Vhost-user device + * @vq: Virtqueue + * @num: Number of element to unpop + */ +/* cppcheck-suppress unusedFunction */ +bool vu_queue_rewind(struct vu_dev *dev, struct vu_virtq *vq, unsigned int num) +{ + (void)dev;

Unused parameter again.

...

+ if (num > vq->inuse) + return false; + + vq->last_avail_idx -= num; + vq->inuse -= num; + return true; +} + +/** + * vring_used_write() - Write an entry in the used ring + * @vq: Virtqueue + * @uelem: Entry to write + * @i: Index of the entry in the used ring + */ +static inline void vring_used_write(struct vu_virtq *vq, + const struct vring_used_elem *uelem, int i) +{ + struct vring_used *used = vq->vring.used; + + used->ring[i] = *uelem; +} + +/** + * vu_queue_fill_by_index() - Update information of a descriptor ring entry + * in the used ring + * @vq: Virtqueue + * @index: Descriptor ring index + * @len: Size of the element + * @idx: Used ring entry index + */ +void vu_queue_fill_by_index(struct vu_virtq *vq, unsigned int index, + unsigned int len, unsigned int idx) +{ + struct vring_used_elem uelem; + + if (!vq->vring.avail) + return; + + idx = (idx + vq->used_idx) % vq->vring.num; + + uelem.id = htole32(index); + uelem.len = htole32(len); + vring_used_write(vq, &uelem, idx); +} + +/** + * vu_queue_fill() - Update information of a given element in the used ring + * @dev: Vhost-user device + * @vq: Virtqueue + * @elem: Element information to fill + * @len: Size of the element + * @idx: Used ring entry index + */ +/* cppcheck-suppress unusedFunction */ +void vu_queue_fill(struct vu_virtq *vq, const struct vu_virtq_element *elem, + unsigned int len, unsigned int idx) +{ + vu_queue_fill_by_index(vq, elem->index, len, idx); +} + +/** + * vring_used_idx_set() - Set the descriptor ring current index + * @vq: Virtqueue + * @val: Value to set in the index + */ +static inline void vring_used_idx_set(struct vu_virtq *vq, uint16_t val) +{ + vq->vring.used->idx = htole16(val); + + vq->used_idx = val; +} + +/** + * vu_queue_flush() - Flush the virtqueue + * @vq: Virtqueue + * @count: Number of entry to flush + */ +/* cppcheck-suppress unusedFunction */ +void vu_queue_flush(struct vu_virtq *vq, unsigned int count) +{ + uint16_t old, new; + + if (!vq->vring.avail) + return; + + /* Make sure buffer is written before we update index. */ + smp_wmb(); + + old = vq->used_idx; + new = old + count; + vring_used_idx_set(vq, new); + vq->inuse -= count; + if ((int16_t)(new - vq->signalled_used) < (uint16_t)(new - old))

This seems really weird: explicitly casting two sides of a comparison to different signedness. Is that an error or is there some subtle logic to it?

...

+ vq->signalled_used_valid = false; +} diff --git a/virtio.h b/virtio.h new file mode 100644 index 000000000000..0a2cf6230139 --- /dev/null +++ b/virtio.h @@ -0,0 +1,190 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later + * Copyright Red Hat + * Author: Laurent Vivier + * + * virtio API, vring and virtqueue functions definition + */ + +#ifndef VIRTIO_H +#define VIRTIO_H + +#include +#include + +#define vu_panic(...) die( __VA_ARGS__ ) + +/* Maximum size of a virtqueue */ +#define VIRTQUEUE_MAX_SIZE 1024 + +/** + * struct vu_ring - Virtqueue rings + * @num: Size of the queue + * @desc: Descriptor ring + * @avail: Available ring + * @used: Used ring + * @log_guest_addr: Guest address for logging + * @flags: Vring flags + * VHOST_VRING_F_LOG is set if log address is valid + */ +struct vu_ring { + unsigned int num; + struct vring_desc *desc; + struct vring_avail *avail; + struct vring_used *used; + uint64_t log_guest_addr; + uint32_t flags; +}; + +/** + * struct vu_virtq - Virtqueue definition + * @vring: Virtqueue rings + * @last_avail_idx: Next head to pop + * @shadow_avail_idx: Last avail_idx read from VQ. + * @used_idx: Descriptor ring current index + * @signalled_used: Last used index value we have signalled on + * @signalled_used_valid: True if signalled_used if valid + * @notification: True if the queues notify (via event + * index or interrupt) + * @inuse: Number of entries in use + * @call_fd: The event file descriptor to signal when + * buffers are used. + * @kick_fd: The event file descriptor for adding + * buffers to the vring + * @err_fd: The event file descriptor to signal when + * error occurs + * @enable: True if the virtqueue is enabled + * @started: True if the virtqueue is started + * @vra: QEMU address of our rings + */ +struct vu_virtq { + struct vu_ring vring; + uint16_t last_avail_idx; + uint16_t shadow_avail_idx; + uint16_t used_idx; + uint16_t signalled_used; + bool signalled_used_valid; + bool notification; + unsigned int inuse; + int call_fd; + int kick_fd; + int err_fd; + unsigned int enable; + bool started; + struct vhost_vring_addr vra; +}; + +/** + * struct vu_dev_region - guest shared memory region + * @gpa: Guest physical address of the region + * @size: Memory size in bytes + * @qva: QEMU virtual address

Is this actually the qemu virtual address? Or is it our virtual address?

...

+ * @mmap_offset: Offset where the region starts in the mapped memory + * @mmap_addr: Address of the mapped memory + */ +struct vu_dev_region { + uint64_t gpa; + uint64_t size; + uint64_t qva; + uint64_t mmap_offset; + uint64_t mmap_addr; +}; + +#define VHOST_USER_MAX_QUEUES 2 + +/* + * Set a reasonable maximum number of ram slots, which will be supported by + * any architecture. + */ +#define VHOST_USER_MAX_RAM_SLOTS 32 + +/** + * struct vu_dev + * @context: Execution context

This looks like a copypasta error.

...

+ * nregions: Number of shared memory regions

Missing '@'

...

+ * @regions: Guest shared memory regions + * @features: Vhost-user features + * @protocol_features: Vhost-user protocol features + * @hdrlen: Virtio -net header length + */ +struct vu_dev { + uint32_t nregions; + struct vu_dev_region regions[VHOST_USER_MAX_RAM_SLOTS]; + struct vu_virtq vq[VHOST_USER_MAX_QUEUES]; + uint64_t features; + uint64_t protocol_features; + int hdrlen; +}; + +/** + * struct vu_virtq_element + * @index: Descriptor ring index + * @out_num: Number of outgoing iovec buffers + * @in_num: Number of incoming iovec buffers + * @in_sg: Incoming iovec buffers + * @out_sg: Outgoing iovec buffers + */ +struct vu_virtq_element { + unsigned int index; + unsigned int out_num; + unsigned int in_num; + struct iovec *in_sg; + struct iovec *out_sg; +}; + +/** + * has_feature() - Check a feature bit in a features set + * @features: Features set + * @fb: Feature bit to check + * + * Return: True if the feature bit is set + */ +static inline bool has_feature(uint64_t features, unsigned int fbit) +{ + return !!(features & (1ULL << fbit)); +} + +/** + * vu_has_feature() - Check if a virtio-net feature is available + * @vdev: Vhost-user device + * @bit: Feature to check + * + * Return: True if the feature is available + */ +static inline bool vu_has_feature(const struct vu_dev *vdev, + unsigned int fbit) +{ + return has_feature(vdev->features, fbit); +} + +/** + * vu_has_protocol_feature() - Check if a vhost-user feature is available + * @vdev: Vhost-user device + * @bit: Feature to check + * + * Return: True if the feature is available + */ +/* cppcheck-suppress unusedFunction */ +static inline bool vu_has_protocol_feature(const struct vu_dev *vdev, + unsigned int fbit) +{ + return has_feature(vdev->protocol_features, fbit); +} + +bool vu_queue_empty(struct vu_virtq *vq); +void vu_queue_notify(const struct vu_dev *dev, struct vu_virtq *vq); +int vu_queue_pop(struct vu_dev *dev, struct vu_virtq *vq, + struct vu_virtq_element *elem); +void vu_queue_detach_element(struct vu_dev *dev, struct vu_virtq *vq, + unsigned int index, size_t len); +void vu_queue_unpop(struct vu_dev *dev, struct vu_virtq *vq, + unsigned int index, size_t len); +bool vu_queue_rewind(struct vu_dev *dev, struct vu_virtq *vq, + unsigned int num); + +void vu_queue_fill_by_index(struct vu_virtq *vq, unsigned int index, + unsigned int len, unsigned int idx); +void vu_queue_fill(struct vu_virtq *vq, + const struct vu_virtq_element *elem, unsigned int len, + unsigned int idx); +void vu_queue_flush(struct vu_virtq *vq, unsigned int count); +#endif /* VIRTIO_H */

-- David Gibson (he or they) | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you, not the other way | around. http://www.ozlabs.org/~dgibson

Laurent Vivier

14 Aug 14 Aug

2:47 p.m.

New subject: [PATCH v2 2/4] vhost-user: introduce virtio API

On 17/07/2024 07:21, David Gibson wrote:

...

On Fri, Jul 12, 2024 at 05:32:42PM +0200, Laurent Vivier wrote:

...
Add virtio.c and virtio.h that define the functions needed to manage virtqueues.

Signed-off-by: Laurent Vivier --- Makefile | 4 +- util.h | 11 + virtio.c | 611 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ virtio.h | 190 +++++++++++++++++ 4 files changed, 814 insertions(+), 2 deletions(-) create mode 100644 virtio.c create mode 100644 virtio.h

...

...
+ +/** + * struct vu_dev_region - guest shared memory region + * @gpa: Guest physical address of the region + * @size: Memory size in bytes + * @qva: QEMU virtual address

Is this actually the qemu virtual address? Or is it our virtual address?

It is actually QEMU virtual address, it's a virtual address provided by the front-end (in our case QEMU). https://qemu-project.gitlab.io/qemu/interop/vhost-user.html#memory-region-de... It's used in qva_to_va() to convert vring addresses that are in QEMU userspace (vhost-user) to our process userspace mapped address, while vu_gpa_to_va() is used to convert guest physical address (virtio) of buffers address stored vring descritors that are in guest physical address space to our process address space. vhost-user addresses are in QEMU virtual adress space, virtio addresses are in guest physical address space. Thansk, Laurent

David Gibson

15 Aug 15 Aug

6:52 a.m.

New subject: [PATCH v2 2/4] vhost-user: introduce virtio API

On Wed, Aug 14, 2024 at 02:47:36PM +0200, Laurent Vivier wrote:

...

On 17/07/2024 07:21, David Gibson wrote:

...
On Fri, Jul 12, 2024 at 05:32:42PM +0200, Laurent Vivier wrote:

...
Add virtio.c and virtio.h that define the functions needed to manage virtqueues.

Signed-off-by: Laurent Vivier --- Makefile | 4 +- util.h | 11 + virtio.c | 611 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ virtio.h | 190 +++++++++++++++++ 4 files changed, 814 insertions(+), 2 deletions(-) create mode 100644 virtio.c create mode 100644 virtio.h

...

...
+ +/** + * struct vu_dev_region - guest shared memory region + * @gpa: Guest physical address of the region + * @size: Memory size in bytes + * @qva: QEMU virtual address

Is this actually the qemu virtual address? Or is it our virtual address?

It is actually QEMU virtual address, it's a virtual address provided by the front-end (in our case QEMU).

Ok.

...

https://qemu-project.gitlab.io/qemu/interop/vhost-user.html#memory-region-de...

It's used in qva_to_va() to convert vring addresses that are in QEMU userspace (vhost-user) to our process userspace mapped address, while vu_gpa_to_va() is used to convert guest physical address (virtio) of buffers address stored vring descritors that are in guest physical address space to our process address space.

vhost-user addresses are in QEMU virtual adress space, virtio addresses are in guest physical address space.

Thansk, Laurent

-- David Gibson (he or they) | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you, not the other way | around. http://www.ozlabs.org/~dgibson

Stefano Brivio

19 Jul 19 Jul

11:29 p.m.

New subject: [PATCH v2 2/4] vhost-user: introduce virtio API

On Fri, 12 Jul 2024 17:32:42 +0200 Laurent Vivier wrote:

...

Add virtio.c and virtio.h that define the functions needed to manage virtqueues.

Signed-off-by: Laurent Vivier --- Makefile | 4 +- util.h | 11 + virtio.c | 611 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ virtio.h | 190 +++++++++++++++++ 4 files changed, 814 insertions(+), 2 deletions(-) create mode 100644 virtio.c create mode 100644 virtio.h

diff --git a/Makefile b/Makefile index 09fc461d087e..39613a7cf1f2 100644 --- a/Makefile +++ b/Makefile @@ -47,7 +47,7 @@ FLAGS += -DDUAL_STACK_SOCKETS=$(DUAL_STACK_SOCKETS) PASST_SRCS = arch.c arp.c checksum.c conf.c dhcp.c dhcpv6.c flow.c fwd.c \ icmp.c igmp.c inany.c iov.c ip.c isolation.c lineread.c log.c mld.c \ ndp.c netlink.c packet.c passt.c pasta.c pcap.c pif.c tap.c tcp.c \ - tcp_buf.c tcp_splice.c udp.c util.c + tcp_buf.c tcp_splice.c udp.c util.c virtio.c QRAP_SRCS = qrap.c SRCS = $(PASST_SRCS) $(QRAP_SRCS)

@@ -57,7 +57,7 @@ PASST_HEADERS = arch.h arp.h checksum.h conf.h dhcp.h dhcpv6.h flow.h fwd.h \ flow_table.h icmp.h icmp_flow.h inany.h iov.h ip.h isolation.h \ lineread.h log.h ndp.h netlink.h packet.h passt.h pasta.h pcap.h pif.h \ siphash.h tap.h tcp.h tcp_buf.h tcp_conn.h tcp_internal.h tcp_splice.h \ - udp.h util.h + udp.h util.h virtio.h HEADERS = $(PASST_HEADERS) seccomp.h

C := \#include \nstruct tcp_info x = { .tcpi_snd_wnd = 0 }; diff --git a/util.h b/util.h index eebb027be487..56c4e2e7b4fe 100644 --- a/util.h +++ b/util.h @@ -48,6 +48,9 @@ #define ROUND_DOWN(x, y) ((x) & ~((y) - 1)) #define ROUND_UP(x, y) (((x) + (y) - 1) & ~((y) - 1))

+#define ALIGN_DOWN(n, m) ((n) / (m) * (m)) +#define ALIGN_UP(n, m) ALIGN_DOWN((n) + (m) - 1, (m)) + #define MAX_FROM_BITS(n) (((1U << (n)) - 1))

#define BIT(n) (1UL << (n)) @@ -116,6 +119,14 @@ #define htonl_constant(x) (__bswap_constant_32(x)) #endif

+static inline void barrier(void) { __asm__ __volatile__("" ::: "memory"); } +#define smp_mb() do { barrier(); __atomic_thread_fence(__ATOMIC_SEQ_CST); } while (0) +#define smp_mb_release() do { barrier(); __atomic_thread_fence(__ATOMIC_RELEASE); } while (0) +#define smp_mb_acquire() do { barrier(); __atomic_thread_fence(__ATOMIC_ACQUIRE); } while (0) + +#define smp_wmb() smp_mb_release() +#define smp_rmb() smp_mb_acquire() + #define NS_FN_STACK_SIZE (RLIMIT_STACK_VAL * 1024 / 8) int do_clone(int (*fn)(void *), char *stack_area, size_t stack_size, int flags, void *arg); diff --git a/virtio.c b/virtio.c new file mode 100644 index 000000000000..5f984f92cae0 --- /dev/null +++ b/virtio.c @@ -0,0 +1,611 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later + * Copyright Red Hat + * Author: Laurent Vivier + * + * virtio API, vring and virtqueue functions definition + */ + +/* some parts copied from QEMU subprojects/libvhost-user/libvhost-user.c */

I think full attribution would be nice, even though not legally required in this case. See checksum.c for an example (and the comment to csum_avx2() there if it applies, but I don't think that part would be practical here).

...

+ +#include +#include +#include +#include +#include +#include + +#include "util.h" +#include "virtio.h" + +#define VIRTQUEUE_MAX_SIZE 1024 + +/** + * vu_gpa_to_va() - Translate guest physical address to our virtual address. + * @dev: Vhost-user device + * @plen: Physical length to map (input), virtual address mapped (output) + * @guest_addr: Guest physical address + * + * Return: virtual address in our address space of the guest physical address + */ +static void *vu_gpa_to_va(struct vu_dev *dev, uint64_t *plen, uint64_t guest_addr) +{ + unsigned int i; + + if (*plen == 0) + return NULL; + + /* Find matching memory region. */

Extra whitespace before */.

...

+ for (i = 0; i < dev->nregions; i++) { + const struct vu_dev_region *r = &dev->regions[i]; + + if ((guest_addr >= r->gpa) && + (guest_addr < (r->gpa + r->size))) { + if ((guest_addr + *plen) > (r->gpa + r->size)) + *plen = r->gpa + r->size - guest_addr; + /* NOLINTNEXTLINE(performance-no-int-to-ptr) */ + return (void *)(guest_addr - r->gpa + r->mmap_addr + + r->mmap_offset); + } + } + + return NULL; +} + +/** + * vring_avail_flags() - Read the available ring flags + * @vq: Virtqueue + * + * Return: the available ring descriptor flags of the given virtqueue + */ +static inline uint16_t vring_avail_flags(const struct vu_virtq *vq) +{ + return le16toh(vq->vring.avail->flags); +} + +/** + * vring_avail_idx() - Read the available ring index + * @vq: Virtqueue + * + * Return: the available ring index of the given virtqueue + */ +static inline uint16_t vring_avail_idx(struct vu_virtq *vq) +{ + vq->shadow_avail_idx = le16toh(vq->vring.avail->idx); + + return vq->shadow_avail_idx; +} + +/** + * vring_avail_ring() - Read an available ring entry + * @vq: Virtqueue + * @i Index of the entry to read

@i:

...

+ * + * Return: the ring entry content (head of the descriptor chain) + */ +static inline uint16_t vring_avail_ring(const struct vu_virtq *vq, int i) +{ + return le16toh(vq->vring.avail->ring[i]); +} + +/** + * vring_get_used_event() - Get the used event from the available ring + * @vq Virtqueue + * + * Return: the used event (available only if VIRTIO_RING_F_EVENT_IDX is set) + * used_event is a performant alternative where the driver

This is taken from QEMU's hw/virtio/virtio.c, not from subprojects/libvhost-user/libvhost-user.c.

...

+ * specifies how far the device can progress before a notification + * is required. In this case, virq_avail is defined as:

s/virq_avail/virtq_avail/, but...

...

+ * struct virtq_avail { + * le16 flags; + * le16 idx; + * le16 ring[num]; + * le16 used_event; // Only if VIRTIO_F_EVENT_IDX + * };

I don't understand why you describe this structure here. All this function returns is an index of a descriptor, right?

...

+ * If the idx field in the used ring (which determined where that + * descriptor index was placed) was equal to used_event, the device + * must send a notification. + * Otherwise the device should not send a notification. + */ +static inline uint16_t vring_get_used_event(const struct vu_virtq *vq) +{ + return vring_avail_ring(vq, vq->vring.num); +} + +/** + * virtqueue_get_head() - Get the head of the descriptor chain for a given + * index + * @vq: Virtqueue + * @idx: Available ring entry index + * @head: Head of the descriptor chain + */ +static void virtqueue_get_head(const struct vu_virtq *vq, + unsigned int idx, unsigned int *head) +{ + /* Grab the next descriptor number they're advertising, and increment + * the index we've seen. + */ + *head = vring_avail_ring(vq, idx % vq->vring.num); + + /* If their number is silly, that's a fatal mistake. */ + if (*head >= vq->vring.num) + vu_panic("Guest says index %u is available", *head);

I think David's comment in: https://archives.passt.top/passt-dev/ZnjgSNbIXxKrAllp@zatzit/ really referred to using die() in place of vu_panic(), instead of defining vu_panic() as die() and using it. Well, in any case, that would be my comment: why do why need vu_panic() at all?

...

+} + +/** + * virtqueue_read_indirect_desc() - Copy virtio ring descriptors from guest + * memory + * @dev: Vhost-user device + * @desc: Destination address to copy the descriptors + * @addr: Guest memory address to copy from + * @len: Length of memory to copy + * + * Return: -1 if there is an error, 0 otherwise + */ +static int virtqueue_read_indirect_desc(struct vu_dev *dev, struct vring_desc *desc, + uint64_t addr, size_t len) +{ + uint64_t read_len; + + if (len > (VIRTQUEUE_MAX_SIZE * sizeof(struct vring_desc))) + return -1; + + if (len == 0) + return -1; + + while (len) { + const struct vring_desc *ori_desc;

It took me a bit to understand that "ori" means... "orig". :) In general I'd say "orig" (ending with a consonant) is much clearer, that's what we use in another occurrence in passt and also what the Linux kernel generally uses.

...

+ + read_len = len; + ori_desc = vu_gpa_to_va(dev, &read_len, addr); + if (!ori_desc) + return -1; + + memcpy(desc, ori_desc, read_len); + len -= read_len; + addr += read_len; + desc += read_len / sizeof(struct vring_desc); + } + + return 0; +} + +/** + * enum virtqueue_read_desc_state - State in the descriptor chain + * @VIRTQUEUE_READ_DESC_ERROR Found an invalid descriptor + * @VIRTQUEUE_READ_DESC_DONE No more descriptor in the chain + * @VIRTQUEUE_READ_DESC_MORE there is more descriptors in the chain + */ +enum virtqueue_read_desc_state { + VIRTQUEUE_READ_DESC_ERROR = -1, + VIRTQUEUE_READ_DESC_DONE = 0, /* end of chain */ + VIRTQUEUE_READ_DESC_MORE = 1, /* more buffers in chain */ +}; + +/** + * virtqueue_read_next_desc() - Read the the next descriptor in the chain + * @desc: Virtio ring descriptors + * @i: Index of the current descriptor + * @max: Maximum value of the descriptor index + * @next: Index of the next descriptor in the chain (output value) + * + * Return: current chain descriptor state (error, next, done) + */ +static int virtqueue_read_next_desc(const struct vring_desc *desc, + int i, unsigned int max, unsigned int *next) +{ + /* If this descriptor says it doesn't chain, we're done. */ + if (!(le16toh(desc[i].flags) & VRING_DESC_F_NEXT)) + return VIRTQUEUE_READ_DESC_DONE; + + /* Check they're not leading us off end of descriptors. */ + *next = le16toh(desc[i].next); + /* Make sure compiler knows to grab that: we don't want it changing! */ + smp_wmb(); + + if (*next >= max) + return VIRTQUEUE_READ_DESC_ERROR; + + return VIRTQUEUE_READ_DESC_MORE; +} + +/** + * vu_queue_empty() - Check if virtqueue is empty + * @vq: Virtqueue + * + * Return: true if the virtqueue is empty, false otherwise + */ +bool vu_queue_empty(struct vu_virtq *vq) +{ + if (!vq->vring.avail) + return true; + + if (vq->shadow_avail_idx != vq->last_avail_idx) + return false; + + return vring_avail_idx(vq) == vq->last_avail_idx; +} + +/** + * vring_notify() - Check if a notification can be sent + * @dev: Vhost-user device + * @vq: Virtqueue + * + * Return: true if notification can be sent + */ +static bool vring_notify(const struct vu_dev *dev, struct vu_virtq *vq) +{ + uint16_t old, new; + bool v; + + /* We need to expose used array entries before checking used event. */ + smp_mb(); + + /* Always notify when queue is empty (when feature acknowledge) */ + if (vu_has_feature(dev, VIRTIO_F_NOTIFY_ON_EMPTY) && + !vq->inuse && vu_queue_empty(vq)) { + return true; + } + + if (!vu_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) + return !(vring_avail_flags(vq) & VRING_AVAIL_F_NO_INTERRUPT); + + v = vq->signalled_used_valid; + vq->signalled_used_valid = true; + old = vq->signalled_used; + new = vq->signalled_used = vq->used_idx; + return !v || vring_need_event(vring_get_used_event(vq), new, old); +} + +/** + * vu_queue_notify() - Send a notification the given virtqueue

s/the/to the/

...

+ * @dev: Vhost-user device + * @vq: Virtqueue + */ +/* cppcheck-suppress unusedFunction */ +void vu_queue_notify(const struct vu_dev *dev, struct vu_virtq *vq) +{ + if (!vq->vring.avail) + return; + + if (!vring_notify(dev, vq)) { + debug("skipped notify..."); + return; + } + + if (eventfd_write(vq->call_fd, 1) < 0) + vu_panic("Error writing eventfd: %s", strerror(errno)); +} + +/** + * vring_set_avail_event() - Set avail_event + * @vq: Virtqueue + * @val: Value to set to avail_event + * avail_event is used in the same way the used_event is in the + * avail_ring. + * struct virtq_used { + * le16 flags; + * le16 idx; + * struct virtq_used_elem ringnum]; + * le16 avail_event; // Only if VIRTIO_F_EVENT_IDX + * };

Same as above: why is this struct described here?

...

+ * avail_event is used to advise the driver that notifications + * are unnecessary until the driver writes entry with an index + * specified by avail_event into the available ring. + */ +static inline void vring_set_avail_event(struct vu_virtq *vq, uint16_t val) +{ + uint16_t val_le = htole16(val); + + if (!vq->notification) + return; + + memcpy(&vq->vring.used->ring[vq->vring.num], &val_le, sizeof(uint16_t)); +} + +/** + * virtqueue_map_desc() - Translate descriptor ring physical address into our + * virtual address space + * @dev: Vhost-user device + * @p_num_sg: First iov entry to use (input), + * first iov entry not sued (output) + * @iov: Iov array to use to store buffer virtual addresses + * @max_num_sg: Maximum number of iov entries + * @pa: Guest physical address of the buffer to map into our virtual + * address + * @sz: Size of the buffer + * + * Return: false on error, true otherwise + */ +static bool virtqueue_map_desc(struct vu_dev *dev, + unsigned int *p_num_sg, struct iovec *iov, + unsigned int max_num_sg, + uint64_t pa, size_t sz) +{ + unsigned int num_sg = *p_num_sg; + + ASSERT(num_sg <= max_num_sg); + + if (!sz) + vu_panic("virtio: zero sized buffers are not allowed"); + + while (sz) { + uint64_t len = sz; + + if (num_sg == max_num_sg) + vu_panic("virtio: too many descriptors in indirect table"); + + iov[num_sg].iov_base = vu_gpa_to_va(dev, &len, pa); + if (iov[num_sg].iov_base == NULL) + vu_panic("virtio: invalid address for buffers"); + iov[num_sg].iov_len = len; + num_sg++; + sz -= len; + pa += len; + } + + *p_num_sg = num_sg; + return true; +} + +/** + * vu_queue_map_desc - Map the virqueue descriptor ring into our virtual + * address space + * @dev: Vhost-user device + * @vq: Virtqueue + * @idx: First descriptor ring entry to map + * @elem: Virtqueue element to store descriptor ring iov + * + * Return: -1 if there is an error, 0 otherwise + */ +static int vu_queue_map_desc(struct vu_dev *dev, struct vu_virtq *vq, unsigned int idx, + struct vu_virtq_element *elem) +{ + const struct vring_desc *desc = vq->vring.desc; + struct vring_desc desc_buf[VIRTQUEUE_MAX_SIZE]; + unsigned int out_num = 0, in_num = 0; + unsigned int max = vq->vring.num; + unsigned int i = idx; + uint64_t read_len; + int rc; + + if (le16toh(desc[i].flags) & VRING_DESC_F_INDIRECT) { + unsigned int desc_len; + uint64_t desc_addr; + + if (le32toh(desc[i].len) % sizeof(struct vring_desc)) + vu_panic("Invalid size for indirect buffer table"); + + /* loop over the indirect descriptor table */ + desc_addr = le64toh(desc[i].addr); + desc_len = le32toh(desc[i].len); + max = desc_len / sizeof(struct vring_desc); + read_len = desc_len; + desc = vu_gpa_to_va(dev, &read_len, desc_addr); + if (desc && read_len != desc_len) { + /* Failed to use zero copy */ + desc = NULL; + if (!virtqueue_read_indirect_desc(dev, desc_buf, desc_addr, desc_len)) + desc = desc_buf; + } + if (!desc) + vu_panic("Invalid indirect buffer table"); + i = 0; + } + + /* Collect all the descriptors */ + do { + if (le16toh(desc[i].flags) & VRING_DESC_F_WRITE) { + if (!virtqueue_map_desc(dev, &in_num, elem->in_sg, + elem->in_num, + le64toh(desc[i].addr), + le32toh(desc[i].len))) { + return -1; + } + } else { + if (in_num) + vu_panic("Incorrect order for descriptors"); + if (!virtqueue_map_desc(dev, &out_num, elem->out_sg, + elem->out_num, + le64toh(desc[i].addr), + le32toh(desc[i].len))) { + return -1; + } + } + + /* If we've got too many, that implies a descriptor loop. */ + if ((in_num + out_num) > max) + vu_panic("Looped descriptor"); + rc = virtqueue_read_next_desc(desc, i, max, &i); + } while (rc == VIRTQUEUE_READ_DESC_MORE); + + if (rc == VIRTQUEUE_READ_DESC_ERROR) + vu_panic("read descriptor error"); + + elem->index = idx; + elem->in_num = in_num; + elem->out_num = out_num; + + return 0; +} + +/** + * vu_queue_pop() - Pop an entry from the virtqueue + * @dev: Vhost-user device + * @vq: Virtqueue + * @elem: Virtqueue element to file with the entry information + * + * Return: -1 if there is an error, 0 otherwise + */ +/* cppcheck-suppress unusedFunction */ +int vu_queue_pop(struct vu_dev *dev, struct vu_virtq *vq, struct vu_virtq_element *elem) +{ + unsigned int head; + int ret; + + if (!vq->vring.avail) + return -1; + + if (vu_queue_empty(vq)) + return -1; + + /* + * Needed after vu_queue_empty(), see comment in + * virtqueue_num_heads(). + */ + smp_rmb(); + + if (vq->inuse >= vq->vring.num) + vu_panic("Virtqueue size exceeded"); + + virtqueue_get_head(vq, vq->last_avail_idx++, &head); + + if (vu_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) + vring_set_avail_event(vq, vq->last_avail_idx); + + ret = vu_queue_map_desc(dev, vq, head, elem); + + if (ret < 0) + return ret; + + vq->inuse++; + + return 0; +} + +/** + * vu_queue_detach_element() - Detach an element from the virqueue + * @dev: Vhost-user device + * @vq: Virtqueue + * @index: Index of the element to detach + * @len: Size of the element to detach + */ +void vu_queue_detach_element(struct vu_dev *dev, struct vu_virtq *vq, + unsigned int index, size_t len) +{ + (void)dev; + (void)index; + (void)len; + + vq->inuse--; + /* unmap, when DMA support is added */ +} + +/** + * vu_queue_unpop() - Push back a previously popped element from the virqueue + * @dev: Vhost-user device + * @vq: Virtqueue + * @index: Index of the element to unpop + * @len: Size of the element to unpop + */ +/* cppcheck-suppress unusedFunction */ +void vu_queue_unpop(struct vu_dev *dev, struct vu_virtq *vq, unsigned int index, size_t len) +{ + vq->last_avail_idx--; + vu_queue_detach_element(dev, vq, index, len); +} + +/** + * vu_queue_rewind() - Push back a given number of popped elements + * @dev: Vhost-user device + * @vq: Virtqueue + * @num: Number of element to unpop + */ +/* cppcheck-suppress unusedFunction */ +bool vu_queue_rewind(struct vu_dev *dev, struct vu_virtq *vq, unsigned int num) +{ + (void)dev; + if (num > vq->inuse) + return false; + + vq->last_avail_idx -= num; + vq->inuse -= num; + return true; +} + +/** + * vring_used_write() - Write an entry in the used ring + * @vq: Virtqueue + * @uelem: Entry to write + * @i: Index of the entry in the used ring + */ +static inline void vring_used_write(struct vu_virtq *vq, + const struct vring_used_elem *uelem, int i) +{ + struct vring_used *used = vq->vring.used; + + used->ring[i] = *uelem; +} + +/** + * vu_queue_fill_by_index() - Update information of a descriptor ring entry + * in the used ring + * @vq: Virtqueue + * @index: Descriptor ring index + * @len: Size of the element + * @idx: Used ring entry index + */ +void vu_queue_fill_by_index(struct vu_virtq *vq, unsigned int index, + unsigned int len, unsigned int idx) +{ + struct vring_used_elem uelem; + + if (!vq->vring.avail) + return; + + idx = (idx + vq->used_idx) % vq->vring.num; + + uelem.id = htole32(index); + uelem.len = htole32(len); + vring_used_write(vq, &uelem, idx); +} + +/** + * vu_queue_fill() - Update information of a given element in the used ring + * @dev: Vhost-user device + * @vq: Virtqueue + * @elem: Element information to fill + * @len: Size of the element + * @idx: Used ring entry index + */ +/* cppcheck-suppress unusedFunction */ +void vu_queue_fill(struct vu_virtq *vq, const struct vu_virtq_element *elem, + unsigned int len, unsigned int idx) +{ + vu_queue_fill_by_index(vq, elem->index, len, idx); +} + +/** + * vring_used_idx_set() - Set the descriptor ring current index + * @vq: Virtqueue + * @val: Value to set in the index + */ +static inline void vring_used_idx_set(struct vu_virtq *vq, uint16_t val) +{ + vq->vring.used->idx = htole16(val); + + vq->used_idx = val; +} + +/** + * vu_queue_flush() - Flush the virtqueue + * @vq: Virtqueue + * @count: Number of entry to flush + */ +/* cppcheck-suppress unusedFunction */ +void vu_queue_flush(struct vu_virtq *vq, unsigned int count) +{ + uint16_t old, new; + + if (!vq->vring.avail) + return; + + /* Make sure buffer is written before we update index. */ + smp_wmb(); + + old = vq->used_idx; + new = old + count; + vring_used_idx_set(vq, new); + vq->inuse -= count; + if ((int16_t)(new - vq->signalled_used) < (uint16_t)(new - old)) + vq->signalled_used_valid = false; +} diff --git a/virtio.h b/virtio.h new file mode 100644 index 000000000000..0a2cf6230139 --- /dev/null +++ b/virtio.h @@ -0,0 +1,190 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later + * Copyright Red Hat + * Author: Laurent Vivier + * + * virtio API, vring and virtqueue functions definition + */ + +#ifndef VIRTIO_H +#define VIRTIO_H + +#include +#include + +#define vu_panic(...) die( __VA_ARGS__ ) + +/* Maximum size of a virtqueue */ +#define VIRTQUEUE_MAX_SIZE 1024 + +/** + * struct vu_ring - Virtqueue rings + * @num: Size of the queue + * @desc: Descriptor ring + * @avail: Available ring + * @used: Used ring + * @log_guest_addr: Guest address for logging + * @flags: Vring flags + * VHOST_VRING_F_LOG is set if log address is valid + */ +struct vu_ring { + unsigned int num; + struct vring_desc *desc; + struct vring_avail *avail; + struct vring_used *used; + uint64_t log_guest_addr; + uint32_t flags; +}; + +/** + * struct vu_virtq - Virtqueue definition + * @vring: Virtqueue rings + * @last_avail_idx: Next head to pop + * @shadow_avail_idx: Last avail_idx read from VQ. + * @used_idx: Descriptor ring current index + * @signalled_used: Last used index value we have signalled on + * @signalled_used_valid: True if signalled_used if valid + * @notification: True if the queues notify (via event + * index or interrupt) + * @inuse: Number of entries in use + * @call_fd: The event file descriptor to signal when + * buffers are used. + * @kick_fd: The event file descriptor for adding + * buffers to the vring + * @err_fd: The event file descriptor to signal when + * error occurs + * @enable: True if the virtqueue is enabled + * @started: True if the virtqueue is started + * @vra: QEMU address of our rings + */ +struct vu_virtq { + struct vu_ring vring; + uint16_t last_avail_idx; + uint16_t shadow_avail_idx; + uint16_t used_idx; + uint16_t signalled_used; + bool signalled_used_valid; + bool notification; + unsigned int inuse; + int call_fd; + int kick_fd; + int err_fd; + unsigned int enable; + bool started; + struct vhost_vring_addr vra; +}; + +/** + * struct vu_dev_region - guest shared memory region + * @gpa: Guest physical address of the region + * @size: Memory size in bytes + * @qva: QEMU virtual address + * @mmap_offset: Offset where the region starts in the mapped memory + * @mmap_addr: Address of the mapped memory + */ +struct vu_dev_region { + uint64_t gpa; + uint64_t size; + uint64_t qva; + uint64_t mmap_offset; + uint64_t mmap_addr; +}; + +#define VHOST_USER_MAX_QUEUES 2 + +/* + * Set a reasonable maximum number of ram slots, which will be supported by + * any architecture. + */ +#define VHOST_USER_MAX_RAM_SLOTS 32

See QEMU's commit 0fa6344c90a0 ("libvhost-user: Bump up VHOST_USER_MAX_RAM_SLOTS to 509"). I'm not sure if that, or other bits of the series posted at: https://lore.kernel.org/all/20240214151701.29906-1-david@redhat.com/ are actually relevant for us.

...

+ +/** + * struct vu_dev

Missing description. It represents a... vhost-user device, with guest mappings, I guess?

...

+ * @context: Execution context + * nregions: Number of shared memory regions + * @regions: Guest shared memory regions + * @features: Vhost-user features + * @protocol_features: Vhost-user protocol features + * @hdrlen: Virtio -net header length + */ +struct vu_dev { + uint32_t nregions; + struct vu_dev_region regions[VHOST_USER_MAX_RAM_SLOTS]; + struct vu_virtq vq[VHOST_USER_MAX_QUEUES]; + uint64_t features; + uint64_t protocol_features; + int hdrlen; +}; + +/** + * struct vu_virtq_element

And this is an element in the vhost-user virtqueue ring?

...

+ * @index: Descriptor ring index + * @out_num: Number of outgoing iovec buffers + * @in_num: Number of incoming iovec buffers + * @in_sg: Incoming iovec buffers + * @out_sg: Outgoing iovec buffers + */ +struct vu_virtq_element { + unsigned int index; + unsigned int out_num; + unsigned int in_num; + struct iovec *in_sg; + struct iovec *out_sg; +}; + +/** + * has_feature() - Check a feature bit in a features set + * @features: Features set + * @fb: Feature bit to check + * + * Return: True if the feature bit is set + */ +static inline bool has_feature(uint64_t features, unsigned int fbit) +{ + return !!(features & (1ULL << fbit)); +} + +/** + * vu_has_feature() - Check if a virtio-net feature is available + * @vdev: Vhost-user device + * @bit: Feature to check + * + * Return: True if the feature is available + */ +static inline bool vu_has_feature(const struct vu_dev *vdev, + unsigned int fbit) +{ + return has_feature(vdev->features, fbit); +} + +/** + * vu_has_protocol_feature() - Check if a vhost-user feature is available + * @vdev: Vhost-user device + * @bit: Feature to check + * + * Return: True if the feature is available + */ +/* cppcheck-suppress unusedFunction */ +static inline bool vu_has_protocol_feature(const struct vu_dev *vdev, + unsigned int fbit) +{ + return has_feature(vdev->protocol_features, fbit); +} + +bool vu_queue_empty(struct vu_virtq *vq); +void vu_queue_notify(const struct vu_dev *dev, struct vu_virtq *vq); +int vu_queue_pop(struct vu_dev *dev, struct vu_virtq *vq, + struct vu_virtq_element *elem); +void vu_queue_detach_element(struct vu_dev *dev, struct vu_virtq *vq, + unsigned int index, size_t len); +void vu_queue_unpop(struct vu_dev *dev, struct vu_virtq *vq, + unsigned int index, size_t len); +bool vu_queue_rewind(struct vu_dev *dev, struct vu_virtq *vq, + unsigned int num); + +void vu_queue_fill_by_index(struct vu_virtq *vq, unsigned int index, + unsigned int len, unsigned int idx); +void vu_queue_fill(struct vu_virtq *vq, + const struct vu_virtq_element *elem, unsigned int len, + unsigned int idx); +void vu_queue_flush(struct vu_virtq *vq, unsigned int count); +#endif /* VIRTIO_H */

-- Stefano

Laurent Vivier

12 Jul 12 Jul

5:32 p.m.

New subject: [PATCH v2 3/4] vhost-user: introduce vhost-user API

Add vhost_user.c and vhost_user.h that define the functions needed to implement vhost-user backend. Signed-off-by: Laurent Vivier --- Makefile | 4 +- iov.c | 1 - vhost_user.c | 1267 ++++++++++++++++++++++++++++++++++++++++++++++++++ vhost_user.h | 197 ++++++++ virtio.c | 5 - virtio.h | 2 +- 6 files changed, 1467 insertions(+), 9 deletions(-) create mode 100644 vhost_user.c create mode 100644 vhost_user.h diff --git a/Makefile b/Makefile index 39613a7cf1f2..b2da6ad62103 100644 --- a/Makefile +++ b/Makefile @@ -47,7 +47,7 @@ FLAGS += -DDUAL_STACK_SOCKETS=$(DUAL_STACK_SOCKETS) PASST_SRCS = arch.c arp.c checksum.c conf.c dhcp.c dhcpv6.c flow.c fwd.c \ icmp.c igmp.c inany.c iov.c ip.c isolation.c lineread.c log.c mld.c \ ndp.c netlink.c packet.c passt.c pasta.c pcap.c pif.c tap.c tcp.c \ - tcp_buf.c tcp_splice.c udp.c util.c virtio.c + tcp_buf.c tcp_splice.c udp.c util.c vhost_user.c virtio.c QRAP_SRCS = qrap.c SRCS = $(PASST_SRCS) $(QRAP_SRCS) @@ -57,7 +57,7 @@ PASST_HEADERS = arch.h arp.h checksum.h conf.h dhcp.h dhcpv6.h flow.h fwd.h \ flow_table.h icmp.h icmp_flow.h inany.h iov.h ip.h isolation.h \ lineread.h log.h ndp.h netlink.h packet.h passt.h pasta.h pcap.h pif.h \ siphash.h tap.h tcp.h tcp_buf.h tcp_conn.h tcp_internal.h tcp_splice.h \ - udp.h util.h virtio.h + udp.h util.h vhost_user.h virtio.h HEADERS = $(PASST_HEADERS) seccomp.h C := \#include \nstruct tcp_info x = { .tcpi_snd_wnd = 0 }; diff --git a/iov.c b/iov.c index 3f9e229a305f..3741db21790f 100644 --- a/iov.c +++ b/iov.c @@ -68,7 +68,6 @@ size_t iov_skip_bytes(const struct iovec *iov, size_t n, * * Returns: The number of bytes successfully copied. */ -/* cppcheck-suppress unusedFunction */ size_t iov_from_buf(const struct iovec *iov, size_t iov_cnt, size_t offset, const void *buf, size_t bytes) { diff --git a/vhost_user.c b/vhost_user.c new file mode 100644 index 000000000000..23ec4326995d --- /dev/null +++ b/vhost_user.c @@ -0,0 +1,1267 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later + * Copyright Red Hat + * Author: Laurent Vivier + * + * vhost-user API, command management and virtio interface + */ +/* some parts from QEMU subprojects/libvhost-user/libvhost-user.c */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "util.h" +#include "passt.h" +#include "tap.h" +#include "vhost_user.h" + +/* vhost-user version we are compatible with */ +#define VHOST_USER_VERSION 1 + +/** + * vu_print_capabilities() - print vhost-user capabilities + * this is part of the vhost-user backend + * convention. + */ +/* cppcheck-suppress unusedFunction */ +void vu_print_capabilities(void) +{ + printf("{\n"); + printf(" \"type\": \"net\"\n"); + printf("}\n"); + exit(EXIT_SUCCESS); +} + +/** + * vu_request_to_string() - convert a vhost-user request number to its name + * @req: request number + * + * Return: the name of request number + */ +static const char *vu_request_to_string(unsigned int req) +{ + if (req < VHOST_USER_MAX) { +#define REQ(req) [req] = #req + static const char * const vu_request_str[] = { + REQ(VHOST_USER_NONE), + REQ(VHOST_USER_GET_FEATURES), + REQ(VHOST_USER_SET_FEATURES), + REQ(VHOST_USER_SET_OWNER), + REQ(VHOST_USER_RESET_OWNER), + REQ(VHOST_USER_SET_MEM_TABLE), + REQ(VHOST_USER_SET_LOG_BASE), + REQ(VHOST_USER_SET_LOG_FD), + REQ(VHOST_USER_SET_VRING_NUM), + REQ(VHOST_USER_SET_VRING_ADDR), + REQ(VHOST_USER_SET_VRING_BASE), + REQ(VHOST_USER_GET_VRING_BASE), + REQ(VHOST_USER_SET_VRING_KICK), + REQ(VHOST_USER_SET_VRING_CALL), + REQ(VHOST_USER_SET_VRING_ERR), + REQ(VHOST_USER_GET_PROTOCOL_FEATURES), + REQ(VHOST_USER_SET_PROTOCOL_FEATURES), + REQ(VHOST_USER_GET_QUEUE_NUM), + REQ(VHOST_USER_SET_VRING_ENABLE), + REQ(VHOST_USER_SEND_RARP), + REQ(VHOST_USER_NET_SET_MTU), + REQ(VHOST_USER_SET_BACKEND_REQ_FD), + REQ(VHOST_USER_IOTLB_MSG), + REQ(VHOST_USER_SET_VRING_ENDIAN), + REQ(VHOST_USER_GET_CONFIG), + REQ(VHOST_USER_SET_CONFIG), + REQ(VHOST_USER_POSTCOPY_ADVISE), + REQ(VHOST_USER_POSTCOPY_LISTEN), + REQ(VHOST_USER_POSTCOPY_END), + REQ(VHOST_USER_GET_INFLIGHT_FD), + REQ(VHOST_USER_SET_INFLIGHT_FD), + REQ(VHOST_USER_GPU_SET_SOCKET), + REQ(VHOST_USER_VRING_KICK), + REQ(VHOST_USER_GET_MAX_MEM_SLOTS), + REQ(VHOST_USER_ADD_MEM_REG), + REQ(VHOST_USER_REM_MEM_REG), + REQ(VHOST_USER_MAX), + }; +#undef REQ + return vu_request_str[req]; + } + + return "unknown"; +} + +/** + * qva_to_va() - Translate front-end (QEMU) virtual address to our virtual + * address. + * @dev: Vhost-user device + * @qemu_addr: front-end userspace address + * + * Return: the memory address in our process virtual address space. + */ +static void *qva_to_va(struct vu_dev *dev, uint64_t qemu_addr) +{ + unsigned int i; + + /* Find matching memory region. */ + for (i = 0; i < dev->nregions; i++) { + const struct vu_dev_region *r = &dev->regions[i]; + + if ((qemu_addr >= r->qva) && (qemu_addr < (r->qva + r->size))) { + /* NOLINTNEXTLINE(performance-no-int-to-ptr) */ + return (void *)(qemu_addr - r->qva + r->mmap_addr + + r->mmap_offset); + } + } + + return NULL; +} + +/** + * vmsg_close_fds() - Close all file descriptors of a given message + * @vmsg: Vhost-user message with the list of the file descriptors + */ +static void vmsg_close_fds(const struct vhost_user_msg *vmsg) +{ + int i; + + for (i = 0; i < vmsg->fd_num; i++) + close(vmsg->fds[i]); +} + +/** + * vu_remove_watch() - Remove a file descriptor from an our passt epoll + * file descriptor + * @vdev: Vhost-user device + * @fd: file descriptor to remove + */ +static void vu_remove_watch(const struct vu_dev *vdev, int fd) +{ + (void)vdev; + (void)fd; +} + +/** + * vmsg_set_reply_u64() - Set reply payload.u64 and clear request flags + * and fd_num + * @vmsg: Vhost-user message + * @val: 64bit value to reply + */ +static void vmsg_set_reply_u64(struct vhost_user_msg *vmsg, uint64_t val) +{ + vmsg->hdr.flags = 0; /* defaults will be set by vu_send_reply() */ + vmsg->hdr.size = sizeof(vmsg->payload.u64); + vmsg->payload.u64 = val; + vmsg->fd_num = 0; +} + +/** + * vu_message_read_default() - Read incoming vhost-user message from the + * front-end + * @conn_fd: Vhost-user command socket + * @vmsg: Vhost-user message + * + * Return: -1 there is an error, + * 0 if recvmsg() has been interrupted, + * 1 if a message has been received + */ +static int vu_message_read_default(int conn_fd, struct vhost_user_msg *vmsg) +{ + char control[CMSG_SPACE(VHOST_MEMORY_BASELINE_NREGIONS * + sizeof(int))] = { 0 }; + struct iovec iov = { + .iov_base = (char *)vmsg, + .iov_len = VHOST_USER_HDR_SIZE, + }; + struct msghdr msg = { + .msg_iov = &iov, + .msg_iovlen = 1, + .msg_control = control, + .msg_controllen = sizeof(control), + }; + size_t fd_size; + struct cmsghdr *cmsg; + ssize_t ret, sz_payload; + + ret = recvmsg(conn_fd, &msg, MSG_DONTWAIT); + if (ret < 0) { + if (errno == EINTR || errno == EAGAIN || errno == EWOULDBLOCK) + return 0; + return -1; + } + + vmsg->fd_num = 0; + for (cmsg = CMSG_FIRSTHDR(&msg); cmsg != NULL; + cmsg = CMSG_NXTHDR(&msg, cmsg)) { + if (cmsg->cmsg_level == SOL_SOCKET && + cmsg->cmsg_type == SCM_RIGHTS) { + fd_size = cmsg->cmsg_len - CMSG_LEN(0); + vmsg->fd_num = fd_size / sizeof(int); + memcpy(vmsg->fds, CMSG_DATA(cmsg), fd_size); + break; + } + } + + sz_payload = vmsg->hdr.size; + if ((size_t)sz_payload > sizeof(vmsg->payload)) { + vu_panic("Error: too big message request: %d," + " size: vmsg->size: %zd, " + "while sizeof(vmsg->payload) = %zu", + vmsg->hdr.request, sz_payload, sizeof(vmsg->payload)); + } + + if (sz_payload) { + do { + ret = recv(conn_fd, &vmsg->payload, sz_payload, 0); + } while (ret < 0 && (errno == EINTR || errno == EAGAIN)); + + if (ret < sz_payload) + vu_panic("Error while reading: %s", + strerror(errno)); + } + + return 1; +} + +/** + * vu_message_write() - send a message to the front-end + * @conn_fd: Vhost-user command socket + * @vmsg: Vhost-user message + * + * #syscalls:vu sendmsg + */ +static void vu_message_write(int conn_fd, struct vhost_user_msg *vmsg) +{ + int rc; + const uint8_t *p = (uint8_t *)vmsg; + char control[CMSG_SPACE(VHOST_MEMORY_BASELINE_NREGIONS * sizeof(int))] = { 0 }; + struct iovec iov = { + .iov_base = (char *)vmsg, + .iov_len = VHOST_USER_HDR_SIZE, + }; + struct msghdr msg = { + .msg_iov = &iov, + .msg_iovlen = 1, + .msg_control = control, + }; + + memset(control, 0, sizeof(control)); + assert(vmsg->fd_num <= VHOST_MEMORY_BASELINE_NREGIONS); + if (vmsg->fd_num > 0) { + size_t fdsize = vmsg->fd_num * sizeof(int); + struct cmsghdr *cmsg; + + msg.msg_controllen = CMSG_SPACE(fdsize); + cmsg = CMSG_FIRSTHDR(&msg); + cmsg->cmsg_len = CMSG_LEN(fdsize); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + memcpy(CMSG_DATA(cmsg), vmsg->fds, fdsize); + } else { + msg.msg_controllen = 0; + } + + do { + rc = sendmsg(conn_fd, &msg, 0); + } while (rc < 0 && (errno == EINTR || errno == EAGAIN)); + + if (vmsg->hdr.size) { + do { + rc = write(conn_fd, p + VHOST_USER_HDR_SIZE, + vmsg->hdr.size); + } while (rc < 0 && (errno == EINTR || errno == EAGAIN)); + } + + if (rc <= 0) + vu_panic("Error while writing: %s", strerror(errno)); +} + +/** + * vu_send_reply() - Update message flags and send it to front-end + * @conn_fd: Vhost-user command socket + * @vmsg: Vhost-user message + */ +static void vu_send_reply(int conn_fd, struct vhost_user_msg *msg) +{ + msg->hdr.flags &= ~VHOST_USER_VERSION_MASK; + msg->hdr.flags |= VHOST_USER_VERSION; + msg->hdr.flags |= VHOST_USER_REPLY_MASK; + + vu_message_write(conn_fd, msg); +} + +/** + * vu_get_features_exec() - Provide back-end features bitmask to front-end + * @vmsg: Vhost-user message + * + * Return: true as a reply is requested + */ +static bool vu_get_features_exec(struct vhost_user_msg *msg) +{ + uint64_t features = + 1ULL << VIRTIO_F_VERSION_1 | + 1ULL << VIRTIO_NET_F_MRG_RXBUF | + 1ULL << VHOST_USER_F_PROTOCOL_FEATURES; + + vmsg_set_reply_u64(msg, features); + + debug("Sending back to guest u64: 0x%016"PRIx64, msg->payload.u64); + + return true; +} + +/** + * vu_set_enable_all_rings() - Enable/disable all the virqueues + * @vdev: Vhost-user device + * @enabled: New virtqueues state + */ +static void vu_set_enable_all_rings(struct vu_dev *vdev, bool enabled) +{ + uint16_t i; + + for (i = 0; i < VHOST_USER_MAX_QUEUES; i++) + vdev->vq[i].enable = enabled; +} + +/** + * vu_set_features_exec() - Enable features of the back-end + * @vdev: Vhost-user device + * @vmsg: Vhost-user message + * + * Return: false as no reply is requested + */ +static bool vu_set_features_exec(struct vu_dev *vdev, + struct vhost_user_msg *msg) +{ + debug("u64: 0x%016"PRIx64, msg->payload.u64); + + vdev->features = msg->payload.u64; + /* + * We only support devices conforming to VIRTIO 1.0 or + * later + */ + if (!vu_has_feature(vdev, VIRTIO_F_VERSION_1)) + vu_panic("virtio legacy devices aren't supported by passt"); + + if (!vu_has_feature(vdev, VHOST_USER_F_PROTOCOL_FEATURES)) + vu_set_enable_all_rings(vdev, true); + + /* virtio-net features */ + + if (vu_has_feature(vdev, VIRTIO_F_VERSION_1) || + vu_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF)) { + vdev->hdrlen = sizeof(struct virtio_net_hdr_mrg_rxbuf); + } else { + vdev->hdrlen = sizeof(struct virtio_net_hdr); + } + + return false; +} + +/** + * vu_set_owner_exec() - Session start flag, do nothing in our case + * + * Return: false as no reply is requested + */ +static bool vu_set_owner_exec(void) +{ + return false; +} + +/** + * map_ring() - Convert ring front-end (QEMU) addresses to our process + * virtual address space. + * @vdev: Vhost-user device + * @vq: Virtqueue + * + * Return: true if ring cannot be mapped to our address space + */ +static bool map_ring(struct vu_dev *vdev, struct vu_virtq *vq) +{ + vq->vring.desc = qva_to_va(vdev, vq->vra.desc_user_addr); + vq->vring.used = qva_to_va(vdev, vq->vra.used_user_addr); + vq->vring.avail = qva_to_va(vdev, vq->vra.avail_user_addr); + + debug("Setting virtq addresses:"); + debug(" vring_desc at %p", (void *)vq->vring.desc); + debug(" vring_used at %p", (void *)vq->vring.used); + debug(" vring_avail at %p", (void *)vq->vring.avail); + + return !(vq->vring.desc && vq->vring.used && vq->vring.avail); +} + +/** + * vu_packet_check_range() - Check if a given memory zone is contained in + * a mapped guest memory region + * @buf: Array of the available memory regions + * @offset: Offset of data range in packet descriptor + * @size: Length of desired data range + * @start: Start of the packet descriptor + * + * Return: 0 if the zone in a mapped memory region, -1 otherwise + */ +/* cppcheck-suppress unusedFunction */ +int vu_packet_check_range(void *buf, size_t offset, size_t len, + const char *start) +{ + struct vu_dev_region *dev_region; + + for (dev_region = buf; dev_region->mmap_addr; dev_region++) { + /* NOLINTNEXTLINE(performance-no-int-to-ptr) */ + char *m = (char *)dev_region->mmap_addr; + + if (m <= start && + start + offset + len < m + dev_region->mmap_offset + + dev_region->size) + return 0; + } + + return -1; +} + +/** + * vu_set_mem_table_exec() - Sets the memory map regions to be able to + * translate the vring addresses. + * @vdev: Vhost-user device + * @vmsg: Vhost-user message + * + * Return: false as no reply is requested + * + * #syscalls:vu mmap munmap + */ +static bool vu_set_mem_table_exec(struct vu_dev *vdev, + struct vhost_user_msg *msg) +{ + unsigned int i; + struct vhost_user_memory m = msg->payload.memory, *memory = &m; + + for (i = 0; i < vdev->nregions; i++) { + struct vu_dev_region *r = &vdev->regions[i]; + /* NOLINTNEXTLINE(performance-no-int-to-ptr) */ + void *mm = (void *)r->mmap_addr; + + if (mm) + munmap(mm, r->size + r->mmap_offset); + } + vdev->nregions = memory->nregions; + + debug("Nregions: %u", memory->nregions); + for (i = 0; i < vdev->nregions; i++) { + void *mmap_addr; + struct vhost_user_memory_region *msg_region = &memory->regions[i]; + struct vu_dev_region *dev_region = &vdev->regions[i]; + + debug("Region %d", i); + debug(" guest_phys_addr: 0x%016"PRIx64, + msg_region->guest_phys_addr); + debug(" memory_size: 0x%016"PRIx64, + msg_region->memory_size); + debug(" userspace_addr 0x%016"PRIx64, + msg_region->userspace_addr); + debug(" mmap_offset 0x%016"PRIx64, + msg_region->mmap_offset); + + dev_region->gpa = msg_region->guest_phys_addr; + dev_region->size = msg_region->memory_size; + dev_region->qva = msg_region->userspace_addr; + dev_region->mmap_offset = msg_region->mmap_offset; + + /* We don't use offset argument of mmap() since the + * mapped address has to be page aligned, and we use huge + * pages. + */ + mmap_addr = mmap(0, dev_region->size + dev_region->mmap_offset, + PROT_READ | PROT_WRITE, MAP_SHARED | + MAP_NORESERVE, msg->fds[i], 0); + + if (mmap_addr == MAP_FAILED) + vu_panic("region mmap error: %s", strerror(errno)); + + dev_region->mmap_addr = (uint64_t)(uintptr_t)mmap_addr; + debug(" mmap_addr: 0x%016"PRIx64, + dev_region->mmap_addr); + + close(msg->fds[i]); + } + + for (i = 0; i < VHOST_USER_MAX_QUEUES; i++) { + if (vdev->vq[i].vring.desc) { + if (map_ring(vdev, &vdev->vq[i])) + vu_panic("remapping queue %d during setmemtable", i); + } + } + + return false; +} + +/** + * vu_set_vring_num_exec() - Set the size of the queue (vring size) + * @vdev: Vhost-user device + * @vmsg: Vhost-user message + * + * Return: false as no reply is requested + */ +static bool vu_set_vring_num_exec(struct vu_dev *vdev, + struct vhost_user_msg *msg) +{ + unsigned int idx = msg->payload.state.index; + unsigned int num = msg->payload.state.num; + + debug("State.index: %u", idx); + debug("State.num: %u", num); + vdev->vq[idx].vring.num = num; + + return false; +} + +/** + * vu_set_vring_addr_exec() - Set the addresses of the vring + * @vdev: Vhost-user device + * @vmsg: Vhost-user message + * + * Return: false as no reply is requested + */ +static bool vu_set_vring_addr_exec(struct vu_dev *vdev, + struct vhost_user_msg *msg) +{ + struct vhost_vring_addr addr = msg->payload.addr, *vra = &addr; + struct vu_virtq *vq = &vdev->vq[vra->index]; + + debug("vhost_vring_addr:"); + debug(" index: %d", vra->index); + debug(" flags: %d", vra->flags); + debug(" desc_user_addr: 0x%016" PRIx64, (uint64_t)vra->desc_user_addr); + debug(" used_user_addr: 0x%016" PRIx64, (uint64_t)vra->used_user_addr); + debug(" avail_user_addr: 0x%016" PRIx64, (uint64_t)vra->avail_user_addr); + debug(" log_guest_addr: 0x%016" PRIx64, (uint64_t)vra->log_guest_addr); + + vq->vra = *vra; + vq->vring.flags = vra->flags; + vq->vring.log_guest_addr = vra->log_guest_addr; + + if (map_ring(vdev, vq)) + vu_panic("Invalid vring_addr message"); + + vq->used_idx = le16toh(vq->vring.used->idx); + + if (vq->last_avail_idx != vq->used_idx) { + debug("Last avail index != used index: %u != %u", + vq->last_avail_idx, vq->used_idx); + } + + return false; +} +/** + * vu_set_vring_base_exec() - Sets the next index to use for descriptors + * in this vring + * @vdev: Vhost-user device + * @vmsg: Vhost-user message + * + * Return: false as no reply is requested + */ +static bool vu_set_vring_base_exec(struct vu_dev *vdev, + struct vhost_user_msg *msg) +{ + unsigned int idx = msg->payload.state.index; + unsigned int num = msg->payload.state.num; + + debug("State.index: %u", idx); + debug("State.num: %u", num); + vdev->vq[idx].shadow_avail_idx = vdev->vq[idx].last_avail_idx = num; + + return false; +} + +/** + * vu_get_vring_base_exec() - Stops the vring and returns the current + * descriptor index or indices + * @vdev: Vhost-user device + * @vmsg: Vhost-user message + * + * Return: false as a reply is requested + */ +static bool vu_get_vring_base_exec(struct vu_dev *vdev, + struct vhost_user_msg *msg) +{ + unsigned int idx = msg->payload.state.index; + + debug("State.index: %u", idx); + msg->payload.state.num = vdev->vq[idx].last_avail_idx; + msg->hdr.size = sizeof(msg->payload.state); + + vdev->vq[idx].started = false; + + if (vdev->vq[idx].call_fd != -1) { + close(vdev->vq[idx].call_fd); + vdev->vq[idx].call_fd = -1; + } + if (vdev->vq[idx].kick_fd != -1) { + vu_remove_watch(vdev, vdev->vq[idx].kick_fd); + close(vdev->vq[idx].kick_fd); + vdev->vq[idx].kick_fd = -1; + } + + return true; +} + +/** + * vu_set_watch() - Add a file descriptor to the passt epoll file descriptor + * @vdev: vhost-user device + * @fd: file descriptor to add + */ +static void vu_set_watch(const struct vu_dev *vdev, int fd) +{ + (void)vdev; + (void)fd; +} + +/** + * vu_wait_queue() - wait new free entries in the virtqueue + * @vq: virtqueue to wait on + */ +static int vu_wait_queue(const struct vu_virtq *vq) +{ + eventfd_t kick_data; + ssize_t rc; + int status; + + /* wait the kernel to put new entries in the queue */ + + status = fcntl(vq->kick_fd, F_GETFL); + if (status == -1) + return -1; + + fcntl(vq->kick_fd, F_SETFL, status & ~O_NONBLOCK); + rc = eventfd_read(vq->kick_fd, &kick_data); + fcntl(vq->kick_fd, F_SETFL, status); + if (rc == -1) + return -1; + + return 0; +} + +/** + * vu_send() - Send a buffer to the front-end using the RX virtqueue + * @vdev: vhost-user device + * @buf: address of the buffer + * @size: size of the buffer + * + * Return: number of bytes sent, -1 if there is an error + */ +/* cppcheck-suppress unusedFunction */ +int vu_send(struct vu_dev *vdev, const void *buf, size_t size) +{ + size_t hdrlen = vdev->hdrlen; + struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE]; + struct vu_virtq_element elem[VIRTQUEUE_MAX_SIZE]; + struct iovec in_sg[VIRTQUEUE_MAX_SIZE]; + size_t lens[VIRTQUEUE_MAX_SIZE]; + size_t offset; + int i, j; + __virtio16 *num_buffers_ptr; + int in_sg_count; + + debug("vu_send size %zu hdrlen %zu", size, hdrlen); + + if (!vu_queue_enabled(vq) || !vu_queue_started(vq)) { + err("Got packet, but no available descriptors on RX virtq."); + return 0; + } + + offset = 0; + i = 0; + num_buffers_ptr = NULL; + in_sg_count = 0; + while (offset < size) { + size_t len; + int total; + int ret; + + total = 0; + + if (i == ARRAY_SIZE(elem) || + in_sg_count == ARRAY_SIZE(in_sg)) { + err("virtio-net unexpected long buffer chain"); + goto err; + } + + elem[i].out_num = 0; + elem[i].out_sg = NULL; + elem[i].in_num = ARRAY_SIZE(in_sg) - in_sg_count; + elem[i].in_sg = &in_sg[in_sg_count]; + + ret = vu_queue_pop(vdev, vq, &elem[i]); + if (ret < 0) { + if (vu_wait_queue(vq) != -1) + continue; + if (i) { + err("virtio-net unexpected empty queue: " + "i %d mergeable %d offset %zd, size %zd, " + "features 0x%" PRIx64, + i, vu_has_feature(vdev, + VIRTIO_NET_F_MRG_RXBUF), + offset, size, vdev->features); + } + offset = -1; + goto err; + } + in_sg_count += elem[i].in_num; + + if (elem[i].in_num < 1) { + err("virtio-net receive queue contains no in buffers"); + vu_queue_detach_element(vdev, vq, elem[i].index, 0); + offset = -1; + goto err; + } + + if (i == 0) { + struct virtio_net_hdr hdr = { + .flags = VIRTIO_NET_HDR_F_DATA_VALID, + .gso_type = VIRTIO_NET_HDR_GSO_NONE, + }; + + ASSERT(offset == 0); + ASSERT(elem[i].in_sg[0].iov_len >= hdrlen); + + len = iov_from_buf(elem[i].in_sg, elem[i].in_num, 0, + &hdr, sizeof(hdr)); + + num_buffers_ptr = (__virtio16 *)((char *)elem[i].in_sg[0].iov_base + + len); + + total += hdrlen; + } + + len = iov_from_buf(elem[i].in_sg, elem[i].in_num, total, + (char *)buf + offset, size - offset); + + total += len; + offset += len; + + /* If buffers can't be merged, at this point we + * must have consumed the complete packet. + * Otherwise, drop it. + */ + if (!vu_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF) && + offset < size) { + vu_queue_unpop(vdev, vq, elem[i].index, total); + goto err; + } + + lens[i] = total; + i++; + } + + if (num_buffers_ptr && vu_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF)) + *num_buffers_ptr = htole16(i); + + for (j = 0; j < i; j++) { + debug("filling total %zd idx %d", lens[j], j); + vu_queue_fill(vq, &elem[j], lens[j], j); + } + + vu_queue_flush(vq, i); + vu_queue_notify(vdev, vq); + + debug("sent %zu", offset); + + return offset; +err: + for (j = 0; j < i; j++) + vu_queue_detach_element(vdev, vq, elem[j].index, lens[j]); + + return offset; +} + +/** + * vu_handle_tx() - Receive data from the TX virqueue + * @vdev: vhost-user device + * @index: index of the virtqueue + */ +static void vu_handle_tx(struct vu_dev *vdev, int index) +{ + struct vu_virtq *vq = &vdev->vq[index]; + int hdrlen = vdev->hdrlen; + struct timespec now; + struct vu_virtq_element elem[VIRTQUEUE_MAX_SIZE]; + struct iovec out_sg[VIRTQUEUE_MAX_SIZE]; + int out_sg_count; + + int count; + + if (index % 2 != VHOST_USER_TX_QUEUE) { + debug("index %d is not a TX queue", index); + return; + } + + clock_gettime(CLOCK_MONOTONIC, &now); + + tap_flush_pools(); + + count = 0; + out_sg_count = 0; + while (1) { + int ret; + + ASSERT(index == VHOST_USER_TX_QUEUE); + + elem[count].out_num = 1; + elem[count].out_sg = &out_sg[out_sg_count]; + elem[count].in_num = 0; + elem[count].in_sg = NULL; + ret = vu_queue_pop(vdev, vq, &elem[count]); + if (ret < 0) + break; + out_sg_count += elem[count].out_num; + + if (elem[count].out_num < 1) { + debug("virtio-net header not in first element"); + break; + } + ASSERT(elem[count].out_num == 1); + + tap_add_packet(vdev->context, + elem[count].out_sg[0].iov_len - hdrlen, + (char *)elem[count].out_sg[0].iov_base + hdrlen); + count++; + } + tap_handler(vdev->context, &now); + + if (count) { + int i; + + for (i = 0; i < count; i++) + vu_queue_fill(vq, &elem[i], 0, i); + vu_queue_flush(vq, count); + vu_queue_notify(vdev, vq); + } +} + +/** + * vu_kick_cb() - Called on a kick event to start to receive data + * @vdev: vhost-user device + * @ref: epoll reference information + */ +/* cppcheck-suppress unusedFunction */ +void vu_kick_cb(struct vu_dev *vdev, union epoll_ref ref) +{ + eventfd_t kick_data; + ssize_t rc; + int idx; + + for (idx = 0; idx < VHOST_USER_MAX_QUEUES; idx++) + if (vdev->vq[idx].kick_fd == ref.fd) + break; + + if (idx == VHOST_USER_MAX_QUEUES) + return; + + rc = eventfd_read(ref.fd, &kick_data); + if (rc == -1) + vu_panic("kick eventfd_read(): %s", strerror(errno)); + + debug("Got kick_data: %016"PRIx64" idx:%d", + kick_data, idx); + if (idx % 2 == VHOST_USER_TX_QUEUE) + vu_handle_tx(vdev, idx); +} + +/** + * vu_check_queue_msg_file() - Check if a message is valid, + * close fds if NOFD bit is set + * @vmsg: Vhost-user message + */ +static void vu_check_queue_msg_file(struct vhost_user_msg *msg) +{ + int idx = msg->payload.u64 & VHOST_USER_VRING_IDX_MASK; + bool nofd = msg->payload.u64 & VHOST_USER_VRING_NOFD_MASK; + + if (idx >= VHOST_USER_MAX_QUEUES) + vu_panic("Invalid queue index: %u", idx); + + if (nofd) { + vmsg_close_fds(msg); + return; + } + + if (msg->fd_num != 1) + vu_panic("Invalid fds in request: %d", msg->hdr.request); +} + +/** + * vu_set_vring_kick_exec() - Set the event file descriptor for adding buffers + * to the vring + * @vdev: Vhost-user device + * @vmsg: Vhost-user message + * + * Return: false as no reply is requested + */ +static bool vu_set_vring_kick_exec(struct vu_dev *vdev, + struct vhost_user_msg *msg) +{ + int idx = msg->payload.u64 & VHOST_USER_VRING_IDX_MASK; + bool nofd = msg->payload.u64 & VHOST_USER_VRING_NOFD_MASK; + + debug("u64: 0x%016"PRIx64, msg->payload.u64); + + vu_check_queue_msg_file(msg); + + if (vdev->vq[idx].kick_fd != -1) { + vu_remove_watch(vdev, vdev->vq[idx].kick_fd); + close(vdev->vq[idx].kick_fd); + vdev->vq[idx].kick_fd = -1; + } + + /* cppcheck-suppress redundantAssignment */ + vdev->vq[idx].kick_fd = nofd ? -1 : msg->fds[0]; + debug("Got kick_fd: %d for vq: %d", vdev->vq[idx].kick_fd, idx); + + vdev->vq[idx].started = true; + + if (vdev->vq[idx].kick_fd != -1 && idx % 2 == VHOST_USER_TX_QUEUE) { + vu_set_watch(vdev, vdev->vq[idx].kick_fd); + debug("Waiting for kicks on fd: %d for vq: %d", + vdev->vq[idx].kick_fd, idx); + } + + return false; +} + +/** + * vu_set_vring_call_exec() - Set the event file descriptor to signal when + * buffers are used + * @vdev: Vhost-user device + * @vmsg: Vhost-user message + * + * Return: false as no reply is requested + */ +static bool vu_set_vring_call_exec(struct vu_dev *vdev, + struct vhost_user_msg *msg) +{ + int idx = msg->payload.u64 & VHOST_USER_VRING_IDX_MASK; + bool nofd = msg->payload.u64 & VHOST_USER_VRING_NOFD_MASK; + + debug("u64: 0x%016"PRIx64, msg->payload.u64); + + vu_check_queue_msg_file(msg); + + if (vdev->vq[idx].call_fd != -1) { + close(vdev->vq[idx].call_fd); + vdev->vq[idx].call_fd = -1; + } + + /* cppcheck-suppress redundantAssignment */ + vdev->vq[idx].call_fd = nofd ? -1 : msg->fds[0]; + + /* in case of I/O hang after reconnecting */ + if (vdev->vq[idx].call_fd != -1) + eventfd_write(msg->fds[0], 1); + + debug("Got call_fd: %d for vq: %d", vdev->vq[idx].call_fd, idx); + + return false; +} + +/** + * vu_set_vring_err_exec() - Set the event file descriptor to signal when + * error occurs + * @vdev: Vhost-user device + * @vmsg: Vhost-user message + * + * Return: false as no reply is requested + */ +static bool vu_set_vring_err_exec(struct vu_dev *vdev, + struct vhost_user_msg *msg) +{ + int idx = msg->payload.u64 & VHOST_USER_VRING_IDX_MASK; + bool nofd = msg->payload.u64 & VHOST_USER_VRING_NOFD_MASK; + + debug("u64: 0x%016"PRIx64, msg->payload.u64); + + vu_check_queue_msg_file(msg); + + if (vdev->vq[idx].err_fd != -1) { + close(vdev->vq[idx].err_fd); + vdev->vq[idx].err_fd = -1; + } + + /* cppcheck-suppress redundantAssignment */ + vdev->vq[idx].err_fd = nofd ? -1 : msg->fds[0]; + + return false; +} + +/** + * vu_get_protocol_features_exec() - Provide the protocol (vhost-user) features + * to the front-end + * @vdev: Vhost-user device + * @vmsg: Vhost-user message + * + * Return: false as a reply is requested + */ +static bool vu_get_protocol_features_exec(struct vhost_user_msg *msg) +{ + uint64_t features = 1ULL << VHOST_USER_PROTOCOL_F_REPLY_ACK; + + vmsg_set_reply_u64(msg, features); + + return true; +} + +/** + * vu_set_protocol_features_exec() - Enable protocol (vhost-user) features + * @vdev: Vhost-user device + * @vmsg: Vhost-user message + * + * Return: false as no reply is requested + */ +static bool vu_set_protocol_features_exec(struct vu_dev *vdev, + struct vhost_user_msg *msg) +{ + uint64_t features = msg->payload.u64; + + debug("u64: 0x%016"PRIx64, features); + + vdev->protocol_features = msg->payload.u64; + + if (vu_has_protocol_feature(vdev, + VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS) && + (!vu_has_protocol_feature(vdev, VHOST_USER_PROTOCOL_F_BACKEND_REQ) || + !vu_has_protocol_feature(vdev, VHOST_USER_PROTOCOL_F_REPLY_ACK))) { + /* + * The use case for using messages for kick/call is simulation, to make + * the kick and call synchronous. To actually get that behaviour, both + * of the other features are required. + * Theoretically, one could use only kick messages, or do them without + * having F_REPLY_ACK, but too many (possibly pending) messages on the + * socket will eventually cause the master to hang, to avoid this in + * scenarios where not desired enforce that the settings are in a way + * that actually enables the simulation case. + */ + vu_panic("F_IN_BAND_NOTIFICATIONS requires F_BACKEND_REQ && F_REPLY_ACK"); + return false; + } + + return false; +} + +/** + * vu_get_queue_num_exec() - Tell how many queues we support + * @vmsg: Vhost-user message + * + * Return: true as a reply is requested + */ +static bool vu_get_queue_num_exec(struct vhost_user_msg *msg) +{ + vmsg_set_reply_u64(msg, VHOST_USER_MAX_QUEUES); + return true; +} + +/** + * vu_set_vring_enable_exec() - Enable or disable corresponding vring + * @vdev: Vhost-user device + * @vmsg: Vhost-user message + * + * Return: false as no reply is requested + */ +static bool vu_set_vring_enable_exec(struct vu_dev *vdev, + struct vhost_user_msg *msg) +{ + unsigned int idx = msg->payload.state.index; + unsigned int enable = msg->payload.state.num; + + debug("State.index: %u", idx); + debug("State.enable: %u", enable); + + if (idx >= VHOST_USER_MAX_QUEUES) + vu_panic("Invalid vring_enable index: %u", idx); + + vdev->vq[idx].enable = enable; + return false; +} + +/** + * vu_init() - Initialize vhost-user device structure + * @c: execution context + * @vdev: vhost-user device + */ +/* cppcheck-suppress unusedFunction */ +void vu_init(struct ctx *c, struct vu_dev *vdev) +{ + int i; + + vdev->context = c; + vdev->hdrlen = 0; + for (i = 0; i < VHOST_USER_MAX_QUEUES; i++) + vdev->vq[i] = (struct vu_virtq){ + .call_fd = -1, + .kick_fd = -1, + .err_fd = -1, + .notification = true, + }; +} + +/** + * vu_cleanup() - Reset vhost-user device + * @vdev: vhost-user device + */ +void vu_cleanup(struct vu_dev *vdev) +{ + unsigned int i; + + for (i = 0; i < VHOST_USER_MAX_QUEUES; i++) { + struct vu_virtq *vq = &vdev->vq[i]; + + vq->started = false; + vq->notification = true; + + if (vq->call_fd != -1) { + close(vq->call_fd); + vq->call_fd = -1; + } + if (vq->err_fd != -1) { + close(vq->err_fd); + vq->err_fd = -1; + } + if (vq->kick_fd != -1) { + vu_remove_watch(vdev, vq->kick_fd); + close(vq->kick_fd); + vq->kick_fd = -1; + } + + vq->vring.desc = 0; + vq->vring.used = 0; + vq->vring.avail = 0; + } + vdev->hdrlen = 0; + + for (i = 0; i < vdev->nregions; i++) { + const struct vu_dev_region *r = &vdev->regions[i]; + /* NOLINTNEXTLINE(performance-no-int-to-ptr) */ + void *m = (void *)r->mmap_addr; + + if (m) + munmap(m, r->size + r->mmap_offset); + } + vdev->nregions = 0; +} + +/** + * vu_sock_reset() - Reset connection socket + * @vdev: vhost-user device + */ +static void vu_sock_reset(struct vu_dev *vdev) +{ + (void)vdev; +} + +/** + * tap_handler_vu() - Packet handler for vhost-user + * @vdev: vhost-user device + * @fd: vhost-user message socket + * @events: epoll events + */ +/* cppcheck-suppress unusedFunction */ +void tap_handler_vu(struct vu_dev *vdev, int fd, uint32_t events) +{ + struct vhost_user_msg msg = { 0 }; + bool need_reply, reply_requested; + int ret; + + if (events & (EPOLLRDHUP | EPOLLHUP | EPOLLERR)) { + vu_sock_reset(vdev); + return; + } + + + ret = vu_message_read_default(fd, &msg); + if (ret < 0) + vu_panic("Error while recvmsg: %s", strerror(errno)); + if (ret == 0) { + vu_sock_reset(vdev); + return; + } + debug("================ Vhost user message ================"); + debug("Request: %s (%d)", vu_request_to_string(msg.hdr.request), + msg.hdr.request); + debug("Flags: 0x%x", msg.hdr.flags); + debug("Size: %u", msg.hdr.size); + + need_reply = msg.hdr.flags & VHOST_USER_NEED_REPLY_MASK; + switch (msg.hdr.request) { + case VHOST_USER_GET_FEATURES: + reply_requested = vu_get_features_exec(&msg); + break; + case VHOST_USER_SET_FEATURES: + reply_requested = vu_set_features_exec(vdev, &msg); + break; + case VHOST_USER_GET_PROTOCOL_FEATURES: + reply_requested = vu_get_protocol_features_exec(&msg); + break; + case VHOST_USER_SET_PROTOCOL_FEATURES: + reply_requested = vu_set_protocol_features_exec(vdev, &msg); + break; + case VHOST_USER_GET_QUEUE_NUM: + reply_requested = vu_get_queue_num_exec(&msg); + break; + case VHOST_USER_SET_OWNER: + reply_requested = vu_set_owner_exec(); + break; + case VHOST_USER_SET_MEM_TABLE: + reply_requested = vu_set_mem_table_exec(vdev, &msg); + break; + case VHOST_USER_SET_VRING_NUM: + reply_requested = vu_set_vring_num_exec(vdev, &msg); + break; + case VHOST_USER_SET_VRING_ADDR: + reply_requested = vu_set_vring_addr_exec(vdev, &msg); + break; + case VHOST_USER_SET_VRING_BASE: + reply_requested = vu_set_vring_base_exec(vdev, &msg); + break; + case VHOST_USER_GET_VRING_BASE: + reply_requested = vu_get_vring_base_exec(vdev, &msg); + break; + case VHOST_USER_SET_VRING_KICK: + reply_requested = vu_set_vring_kick_exec(vdev, &msg); + break; + case VHOST_USER_SET_VRING_CALL: + reply_requested = vu_set_vring_call_exec(vdev, &msg); + break; + case VHOST_USER_SET_VRING_ERR: + reply_requested = vu_set_vring_err_exec(vdev, &msg); + break; + case VHOST_USER_SET_VRING_ENABLE: + reply_requested = vu_set_vring_enable_exec(vdev, &msg); + break; + case VHOST_USER_NONE: + vu_cleanup(vdev); + return; + default: + vu_panic("Unhandled request: %d", msg.hdr.request); + return; + } + + if (!reply_requested && need_reply) { + msg.payload.u64 = 0; + msg.hdr.flags = 0; + msg.hdr.size = sizeof(msg.payload.u64); + msg.fd_num = 0; + reply_requested = true; + } + + if (reply_requested) + vu_send_reply(fd, &msg); +} diff --git a/vhost_user.h b/vhost_user.h new file mode 100644 index 000000000000..b9e4bcf8e531 --- /dev/null +++ b/vhost_user.h @@ -0,0 +1,197 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later + * Copyright Red Hat + * Author: Laurent Vivier + * + * vhost-user API, command management and virtio interface + */ + +/* some parts from subprojects/libvhost-user/libvhost-user.h */ + +#ifndef VHOST_USER_H +#define VHOST_USER_H + +#include "virtio.h" +#include "iov.h" + +#define VHOST_USER_F_PROTOCOL_FEATURES 30 + +#define VHOST_MEMORY_BASELINE_NREGIONS 8 + +/** + * enum vhost_user_protocol_feature - List of available vhost-user features + */ +enum vhost_user_protocol_feature { + VHOST_USER_PROTOCOL_F_MQ = 0, + VHOST_USER_PROTOCOL_F_LOG_SHMFD = 1, + VHOST_USER_PROTOCOL_F_RARP = 2, + VHOST_USER_PROTOCOL_F_REPLY_ACK = 3, + VHOST_USER_PROTOCOL_F_NET_MTU = 4, + VHOST_USER_PROTOCOL_F_BACKEND_REQ = 5, + VHOST_USER_PROTOCOL_F_CROSS_ENDIAN = 6, + VHOST_USER_PROTOCOL_F_CRYPTO_SESSION = 7, + VHOST_USER_PROTOCOL_F_PAGEFAULT = 8, + VHOST_USER_PROTOCOL_F_CONFIG = 9, + VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD = 10, + VHOST_USER_PROTOCOL_F_HOST_NOTIFIER = 11, + VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD = 12, + VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS = 14, + VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS = 15, + + VHOST_USER_PROTOCOL_F_MAX +}; + +/** + * enum vhost_user_request - list of available vhost-user request + */ +enum vhost_user_request { + VHOST_USER_NONE = 0, + VHOST_USER_GET_FEATURES = 1, + VHOST_USER_SET_FEATURES = 2, + VHOST_USER_SET_OWNER = 3, + VHOST_USER_RESET_OWNER = 4, + VHOST_USER_SET_MEM_TABLE = 5, + VHOST_USER_SET_LOG_BASE = 6, + VHOST_USER_SET_LOG_FD = 7, + VHOST_USER_SET_VRING_NUM = 8, + VHOST_USER_SET_VRING_ADDR = 9, + VHOST_USER_SET_VRING_BASE = 10, + VHOST_USER_GET_VRING_BASE = 11, + VHOST_USER_SET_VRING_KICK = 12, + VHOST_USER_SET_VRING_CALL = 13, + VHOST_USER_SET_VRING_ERR = 14, + VHOST_USER_GET_PROTOCOL_FEATURES = 15, + VHOST_USER_SET_PROTOCOL_FEATURES = 16, + VHOST_USER_GET_QUEUE_NUM = 17, + VHOST_USER_SET_VRING_ENABLE = 18, + VHOST_USER_SEND_RARP = 19, + VHOST_USER_NET_SET_MTU = 20, + VHOST_USER_SET_BACKEND_REQ_FD = 21, + VHOST_USER_IOTLB_MSG = 22, + VHOST_USER_SET_VRING_ENDIAN = 23, + VHOST_USER_GET_CONFIG = 24, + VHOST_USER_SET_CONFIG = 25, + VHOST_USER_CREATE_CRYPTO_SESSION = 26, + VHOST_USER_CLOSE_CRYPTO_SESSION = 27, + VHOST_USER_POSTCOPY_ADVISE = 28, + VHOST_USER_POSTCOPY_LISTEN = 29, + VHOST_USER_POSTCOPY_END = 30, + VHOST_USER_GET_INFLIGHT_FD = 31, + VHOST_USER_SET_INFLIGHT_FD = 32, + VHOST_USER_GPU_SET_SOCKET = 33, + VHOST_USER_VRING_KICK = 35, + VHOST_USER_GET_MAX_MEM_SLOTS = 36, + VHOST_USER_ADD_MEM_REG = 37, + VHOST_USER_REM_MEM_REG = 38, + VHOST_USER_MAX +}; + +/** + * struct vhost_user_header - Vhost-user message header + * @request: Request type of the message + * @flags: Request flags + * @size: The following payload size + */ +struct vhost_user_header { + enum vhost_user_request request; + +#define VHOST_USER_VERSION_MASK 0x3 +#define VHOST_USER_REPLY_MASK (0x1 << 2) +#define VHOST_USER_NEED_REPLY_MASK (0x1 << 3) + uint32_t flags; + uint32_t size; /* the following payload size */ +} __attribute__ ((__packed__)); + +/** + * struct vhost_user_memory_region - Front-end shared memory region information + * @guest_phys_addr: Guest physical address of the region + * @memory_size: Memory size + * @userspace_addr: front-end (QEMU) userspace address + * @mmap_offset: region offset in the shared memory area + */ +struct vhost_user_memory_region { + uint64_t guest_phys_addr; + uint64_t memory_size; + uint64_t userspace_addr; + uint64_t mmap_offset; +}; + +/** + * struct vhost_user_memory - List of all the shared memory regions + * @nregions: Number of memory regions + * @padding: Padding + * @regions: Memory regions list + */ +struct vhost_user_memory { + uint32_t nregions; + uint32_t padding; + struct vhost_user_memory_region regions[VHOST_MEMORY_BASELINE_NREGIONS]; +}; + +/** + * union vhost_user_payload - Vhost-user message payload + * @u64: 64bit payload + * @state: Vring state payload + * @addr: Vring addresses payload + * vhost_user_memory: Memory regions information payload + */ +union vhost_user_payload { +#define VHOST_USER_VRING_IDX_MASK 0xff +#define VHOST_USER_VRING_NOFD_MASK (0x1 << 8) + uint64_t u64; + struct vhost_vring_state state; + struct vhost_vring_addr addr; + struct vhost_user_memory memory; +}; + +/** + * struct vhost_user_msg - Vhost-use message + * @hdr: Message header + * @payload: Message payload + * @fds: File descriptors associated with the message + * in the ancillary data. + * (shared memory or event file descriptors) + * @fd_num: Number of file descriptors + */ +struct vhost_user_msg { + struct vhost_user_header hdr; + union vhost_user_payload payload; + + int fds[VHOST_MEMORY_BASELINE_NREGIONS]; + int fd_num; +} __attribute__ ((__packed__)); +#define VHOST_USER_HDR_SIZE sizeof(struct vhost_user_header) + +/* index of the RX virtqueue */ +#define VHOST_USER_RX_QUEUE 0 +/* index of the TX virtqueue */ +#define VHOST_USER_TX_QUEUE 1 + +/** + * vu_queue_enabled - Return state of a virtqueue + * @vq: Virtqueue to check + * + * Return: true if the virqueue is enabled, false otherwise + */ +static inline bool vu_queue_enabled(const struct vu_virtq *vq) +{ + return vq->enable; +} + +/** + * vu_queue_started - Return state of a virtqueue + * @vq: Virtqueue to check + * + * Return: true if the virqueue is started, false otherwise + */ +static inline bool vu_queue_started(const struct vu_virtq *vq) +{ + return vq->started; +} + +int vu_send(struct vu_dev *vdev, const void *buf, size_t size); +void vu_print_capabilities(void); +void vu_init(struct ctx *c, struct vu_dev *vdev); +void vu_kick_cb(struct vu_dev *vdev, union epoll_ref ref); +void vu_cleanup(struct vu_dev *vdev); +void tap_handler_vu(struct vu_dev *vdev, int fd, uint32_t events); +#endif /* VHOST_USER_H */ diff --git a/virtio.c b/virtio.c index 5f984f92cae0..d712f30cc33d 100644 --- a/virtio.c +++ b/virtio.c @@ -261,7 +261,6 @@ static bool vring_notify(const struct vu_dev *dev, struct vu_virtq *vq) * @dev: Vhost-user device * @vq: Virtqueue */ -/* cppcheck-suppress unusedFunction */ void vu_queue_notify(const struct vu_dev *dev, struct vu_virtq *vq) { if (!vq->vring.avail) @@ -436,7 +435,6 @@ static int vu_queue_map_desc(struct vu_dev *dev, struct vu_virtq *vq, unsigned i * * Return: -1 if there is an error, 0 otherwise */ -/* cppcheck-suppress unusedFunction */ int vu_queue_pop(struct vu_dev *dev, struct vu_virtq *vq, struct vu_virtq_element *elem) { unsigned int head; @@ -497,7 +495,6 @@ void vu_queue_detach_element(struct vu_dev *dev, struct vu_virtq *vq, * @index: Index of the element to unpop * @len: Size of the element to unpop */ -/* cppcheck-suppress unusedFunction */ void vu_queue_unpop(struct vu_dev *dev, struct vu_virtq *vq, unsigned int index, size_t len) { vq->last_avail_idx--; @@ -567,7 +564,6 @@ void vu_queue_fill_by_index(struct vu_virtq *vq, unsigned int index, * @len: Size of the element * @idx: Used ring entry index */ -/* cppcheck-suppress unusedFunction */ void vu_queue_fill(struct vu_virtq *vq, const struct vu_virtq_element *elem, unsigned int len, unsigned int idx) { @@ -591,7 +587,6 @@ static inline void vring_used_idx_set(struct vu_virtq *vq, uint16_t val) * @vq: Virtqueue * @count: Number of entry to flush */ -/* cppcheck-suppress unusedFunction */ void vu_queue_flush(struct vu_virtq *vq, unsigned int count) { uint16_t old, new; diff --git a/virtio.h b/virtio.h index 0a2cf6230139..61fb2f9cbf20 100644 --- a/virtio.h +++ b/virtio.h @@ -107,6 +107,7 @@ struct vu_dev_region { * @hdrlen: Virtio -net header length */ struct vu_dev { + struct ctx *context; uint32_t nregions; struct vu_dev_region regions[VHOST_USER_MAX_RAM_SLOTS]; struct vu_virtq vq[VHOST_USER_MAX_QUEUES]; @@ -163,7 +164,6 @@ static inline bool vu_has_feature(const struct vu_dev *vdev, * * Return: True if the feature is available */ -/* cppcheck-suppress unusedFunction */ static inline bool vu_has_protocol_feature(const struct vu_dev *vdev, unsigned int fbit) { -- 2.45.2

Stefano Brivio

19 Jul 19 Jul

11:29 p.m.

New subject: [PATCH v2 3/4] vhost-user: introduce vhost-user API

On Fri, 12 Jul 2024 17:32:43 +0200 Laurent Vivier wrote:

...

Add vhost_user.c and vhost_user.h that define the functions needed to implement vhost-user backend.

Signed-off-by: Laurent Vivier --- Makefile | 4 +- iov.c | 1 - vhost_user.c | 1267 ++++++++++++++++++++++++++++++++++++++++++++++++++ vhost_user.h | 197 ++++++++ virtio.c | 5 - virtio.h | 2 +- 6 files changed, 1467 insertions(+), 9 deletions(-) create mode 100644 vhost_user.c create mode 100644 vhost_user.h

diff --git a/Makefile b/Makefile index 39613a7cf1f2..b2da6ad62103 100644 --- a/Makefile +++ b/Makefile @@ -47,7 +47,7 @@ FLAGS += -DDUAL_STACK_SOCKETS=$(DUAL_STACK_SOCKETS) PASST_SRCS = arch.c arp.c checksum.c conf.c dhcp.c dhcpv6.c flow.c fwd.c \ icmp.c igmp.c inany.c iov.c ip.c isolation.c lineread.c log.c mld.c \ ndp.c netlink.c packet.c passt.c pasta.c pcap.c pif.c tap.c tcp.c \ - tcp_buf.c tcp_splice.c udp.c util.c virtio.c + tcp_buf.c tcp_splice.c udp.c util.c vhost_user.c virtio.c QRAP_SRCS = qrap.c SRCS = $(PASST_SRCS) $(QRAP_SRCS)

@@ -57,7 +57,7 @@ PASST_HEADERS = arch.h arp.h checksum.h conf.h dhcp.h dhcpv6.h flow.h fwd.h \ flow_table.h icmp.h icmp_flow.h inany.h iov.h ip.h isolation.h \ lineread.h log.h ndp.h netlink.h packet.h passt.h pasta.h pcap.h pif.h \ siphash.h tap.h tcp.h tcp_buf.h tcp_conn.h tcp_internal.h tcp_splice.h \ - udp.h util.h virtio.h + udp.h util.h vhost_user.h virtio.h HEADERS = $(PASST_HEADERS) seccomp.h

C := \#include \nstruct tcp_info x = { .tcpi_snd_wnd = 0 }; diff --git a/iov.c b/iov.c index 3f9e229a305f..3741db21790f 100644 --- a/iov.c +++ b/iov.c @@ -68,7 +68,6 @@ size_t iov_skip_bytes(const struct iovec *iov, size_t n, * * Returns: The number of bytes successfully copied. */ -/* cppcheck-suppress unusedFunction */ size_t iov_from_buf(const struct iovec *iov, size_t iov_cnt, size_t offset, const void *buf, size_t bytes) { diff --git a/vhost_user.c b/vhost_user.c new file mode 100644 index 000000000000..23ec4326995d --- /dev/null +++ b/vhost_user.c @@ -0,0 +1,1267 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later + * Copyright Red Hat + * Author: Laurent Vivier + * + * vhost-user API, command management and virtio interface + */ +/* some parts from QEMU subprojects/libvhost-user/libvhost-user.c */

Same here about attribution.

...

+ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "util.h" +#include "passt.h" +#include "tap.h" +#include "vhost_user.h" + +/* vhost-user version we are compatible with */ +#define VHOST_USER_VERSION 1 + +/** + * vu_print_capabilities() - print vhost-user capabilities + * this is part of the vhost-user backend + * convention. + */ +/* cppcheck-suppress unusedFunction */ +void vu_print_capabilities(void) +{ + printf("{\n"); + printf(" \"type\": \"net\"\n"); + printf("}\n");

I think this should be info() (added bonus: it adds newlines by itself).

...

+ exit(EXIT_SUCCESS); +} + +/** + * vu_request_to_string() - convert a vhost-user request number to its name + * @req: request number + * + * Return: the name of request number + */ +static const char *vu_request_to_string(unsigned int req) +{ + if (req < VHOST_USER_MAX) { +#define REQ(req) [req] = #req + static const char * const vu_request_str[] = { + REQ(VHOST_USER_NONE), + REQ(VHOST_USER_GET_FEATURES), + REQ(VHOST_USER_SET_FEATURES), + REQ(VHOST_USER_SET_OWNER), + REQ(VHOST_USER_RESET_OWNER), + REQ(VHOST_USER_SET_MEM_TABLE), + REQ(VHOST_USER_SET_LOG_BASE), + REQ(VHOST_USER_SET_LOG_FD), + REQ(VHOST_USER_SET_VRING_NUM), + REQ(VHOST_USER_SET_VRING_ADDR), + REQ(VHOST_USER_SET_VRING_BASE), + REQ(VHOST_USER_GET_VRING_BASE), + REQ(VHOST_USER_SET_VRING_KICK), + REQ(VHOST_USER_SET_VRING_CALL), + REQ(VHOST_USER_SET_VRING_ERR), + REQ(VHOST_USER_GET_PROTOCOL_FEATURES), + REQ(VHOST_USER_SET_PROTOCOL_FEATURES), + REQ(VHOST_USER_GET_QUEUE_NUM), + REQ(VHOST_USER_SET_VRING_ENABLE), + REQ(VHOST_USER_SEND_RARP), + REQ(VHOST_USER_NET_SET_MTU), + REQ(VHOST_USER_SET_BACKEND_REQ_FD), + REQ(VHOST_USER_IOTLB_MSG), + REQ(VHOST_USER_SET_VRING_ENDIAN), + REQ(VHOST_USER_GET_CONFIG), + REQ(VHOST_USER_SET_CONFIG), + REQ(VHOST_USER_POSTCOPY_ADVISE), + REQ(VHOST_USER_POSTCOPY_LISTEN), + REQ(VHOST_USER_POSTCOPY_END), + REQ(VHOST_USER_GET_INFLIGHT_FD), + REQ(VHOST_USER_SET_INFLIGHT_FD), + REQ(VHOST_USER_GPU_SET_SOCKET), + REQ(VHOST_USER_VRING_KICK), + REQ(VHOST_USER_GET_MAX_MEM_SLOTS), + REQ(VHOST_USER_ADD_MEM_REG), + REQ(VHOST_USER_REM_MEM_REG), + REQ(VHOST_USER_MAX), + }; +#undef REQ + return vu_request_str[req]; + } + + return "unknown"; +} + +/** + * qva_to_va() - Translate front-end (QEMU) virtual address to our virtual + * address.

No period needed at the end of the description, it's not a proper sentence.

...

+ * @dev: Vhost-user device + * @qemu_addr: front-end userspace address + * + * Return: the memory address in our process virtual address space. + */ +static void *qva_to_va(struct vu_dev *dev, uint64_t qemu_addr)

This whole function is _almost_ the same as vu_gpa_to_va() from 2/4... could we just use/adjust that one with, say, 'plen' set to NULL?

...

+{ + unsigned int i; + + /* Find matching memory region. */ + for (i = 0; i < dev->nregions; i++) { + const struct vu_dev_region *r = &dev->regions[i]; + + if ((qemu_addr >= r->qva) && (qemu_addr < (r->qva + r->size))) { + /* NOLINTNEXTLINE(performance-no-int-to-ptr) */ + return (void *)(qemu_addr - r->qva + r->mmap_addr + + r->mmap_offset); + } + } + + return NULL; +} + +/** + * vmsg_close_fds() - Close all file descriptors of a given message + * @vmsg: Vhost-user message with the list of the file descriptors + */ +static void vmsg_close_fds(const struct vhost_user_msg *vmsg) +{ + int i; + + for (i = 0; i < vmsg->fd_num; i++) + close(vmsg->fds[i]); +} + +/** + * vu_remove_watch() - Remove a file descriptor from an our passt epoll + * file descriptor + * @vdev: Vhost-user device + * @fd: file descriptor to remove + */ +static void vu_remove_watch(const struct vu_dev *vdev, int fd) +{ + (void)vdev; + (void)fd; +} + +/** + * vmsg_set_reply_u64() - Set reply payload.u64 and clear request flags + * and fd_num + * @vmsg: Vhost-user message + * @val: 64bit value to reply + */ +static void vmsg_set_reply_u64(struct vhost_user_msg *vmsg, uint64_t val) +{ + vmsg->hdr.flags = 0; /* defaults will be set by vu_send_reply() */ + vmsg->hdr.size = sizeof(vmsg->payload.u64); + vmsg->payload.u64 = val; + vmsg->fd_num = 0; +} + +/** + * vu_message_read_default() - Read incoming vhost-user message from the + * front-end + * @conn_fd: Vhost-user command socket + * @vmsg: Vhost-user message + * + * Return: -1 there is an error, + * 0 if recvmsg() has been interrupted, + * 1 if a message has been received + */ +static int vu_message_read_default(int conn_fd, struct vhost_user_msg *vmsg) +{ + char control[CMSG_SPACE(VHOST_MEMORY_BASELINE_NREGIONS * + sizeof(int))] = { 0 }; + struct iovec iov = { + .iov_base = (char *)vmsg, + .iov_len = VHOST_USER_HDR_SIZE, + }; + struct msghdr msg = { + .msg_iov = &iov, + .msg_iovlen = 1, + .msg_control = control, + .msg_controllen = sizeof(control), + }; + size_t fd_size; + struct cmsghdr *cmsg; + ssize_t ret, sz_payload; + + ret = recvmsg(conn_fd, &msg, MSG_DONTWAIT); + if (ret < 0) { + if (errno == EINTR || errno == EAGAIN || errno == EWOULDBLOCK) + return 0; + return -1; + } + + vmsg->fd_num = 0; + for (cmsg = CMSG_FIRSTHDR(&msg); cmsg != NULL; + cmsg = CMSG_NXTHDR(&msg, cmsg)) { + if (cmsg->cmsg_level == SOL_SOCKET && + cmsg->cmsg_type == SCM_RIGHTS) { + fd_size = cmsg->cmsg_len - CMSG_LEN(0); + vmsg->fd_num = fd_size / sizeof(int); + memcpy(vmsg->fds, CMSG_DATA(cmsg), fd_size);

Coverity Scan is not really happy about using fd_size as received by recvmsg() without sanitising it. This isn't really security-relevant because if the hypervisor wants to affect its connectivity, it can already do so, but it would be nice to make this robust. I guess you could check that it doesn't exceed VHOST_MEMORY_BASELINE_NREGIONS? I see you have this as assert() in vu_message_write().

...

+ break; + } + } + + sz_payload = vmsg->hdr.size; + if ((size_t)sz_payload > sizeof(vmsg->payload)) { + vu_panic("Error: too big message request: %d,"

Same in this patch about using die() instead.

...

+ " size: vmsg->size: %zd, " + "while sizeof(vmsg->payload) = %zu", + vmsg->hdr.request, sz_payload, sizeof(vmsg->payload)); + } + + if (sz_payload) { + do { + ret = recv(conn_fd, &vmsg->payload, sz_payload, 0); + } while (ret < 0 && (errno == EINTR || errno == EAGAIN)); + + if (ret < sz_payload) + vu_panic("Error while reading: %s", + strerror(errno)); + } + + return 1; +} + +/** + * vu_message_write() - send a message to the front-end + * @conn_fd: Vhost-user command socket + * @vmsg: Vhost-user message + * + * #syscalls:vu sendmsg + */ +static void vu_message_write(int conn_fd, struct vhost_user_msg *vmsg) +{ + int rc; + const uint8_t *p = (uint8_t *)vmsg; + char control[CMSG_SPACE(VHOST_MEMORY_BASELINE_NREGIONS * sizeof(int))] = { 0 }; + struct iovec iov = { + .iov_base = (char *)vmsg, + .iov_len = VHOST_USER_HDR_SIZE, + }; + struct msghdr msg = { + .msg_iov = &iov, + .msg_iovlen = 1, + .msg_control = control, + }; + + memset(control, 0, sizeof(control)); + assert(vmsg->fd_num <= VHOST_MEMORY_BASELINE_NREGIONS); + if (vmsg->fd_num > 0) { + size_t fdsize = vmsg->fd_num * sizeof(int); + struct cmsghdr *cmsg; + + msg.msg_controllen = CMSG_SPACE(fdsize); + cmsg = CMSG_FIRSTHDR(&msg); + cmsg->cmsg_len = CMSG_LEN(fdsize); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + memcpy(CMSG_DATA(cmsg), vmsg->fds, fdsize); + } else { + msg.msg_controllen = 0; + } + + do { + rc = sendmsg(conn_fd, &msg, 0); + } while (rc < 0 && (errno == EINTR || errno == EAGAIN)); + + if (vmsg->hdr.size) { + do { + rc = write(conn_fd, p + VHOST_USER_HDR_SIZE, + vmsg->hdr.size); + } while (rc < 0 && (errno == EINTR || errno == EAGAIN)); + } + + if (rc <= 0) + vu_panic("Error while writing: %s", strerror(errno)); +} + +/** + * vu_send_reply() - Update message flags and send it to front-end + * @conn_fd: Vhost-user command socket + * @vmsg: Vhost-user message + */ +static void vu_send_reply(int conn_fd, struct vhost_user_msg *msg) +{ + msg->hdr.flags &= ~VHOST_USER_VERSION_MASK; + msg->hdr.flags |= VHOST_USER_VERSION; + msg->hdr.flags |= VHOST_USER_REPLY_MASK; + + vu_message_write(conn_fd, msg); +} + +/** + * vu_get_features_exec() - Provide back-end features bitmask to front-end + * @vmsg: Vhost-user message + * + * Return: true as a reply is requested + */ +static bool vu_get_features_exec(struct vhost_user_msg *msg) +{ + uint64_t features = + 1ULL << VIRTIO_F_VERSION_1 | + 1ULL << VIRTIO_NET_F_MRG_RXBUF | + 1ULL << VHOST_USER_F_PROTOCOL_FEATURES; + + vmsg_set_reply_u64(msg, features); + + debug("Sending back to guest u64: 0x%016"PRIx64, msg->payload.u64); + + return true; +} + +/** + * vu_set_enable_all_rings() - Enable/disable all the virqueues

s/virqueues/virtqueues/

...

+ * @vdev: Vhost-user device + * @enabled: New virtqueues state

Perhaps 'enable' (imperative) instead of 'enabled' (indicative), so that it's clear it's the (new) state we want?

...

+ */ +static void vu_set_enable_all_rings(struct vu_dev *vdev, bool enabled) +{ + uint16_t i; + + for (i = 0; i < VHOST_USER_MAX_QUEUES; i++) + vdev->vq[i].enable = enabled; +} + +/** + * vu_set_features_exec() - Enable features of the back-end + * @vdev: Vhost-user device + * @vmsg: Vhost-user message + * + * Return: false as no reply is requested + */ +static bool vu_set_features_exec(struct vu_dev *vdev, + struct vhost_user_msg *msg) +{ + debug("u64: 0x%016"PRIx64, msg->payload.u64); + + vdev->features = msg->payload.u64; + /*

No need to have an extra line on top: /* We only support ...

...

+ * We only support devices conforming to VIRTIO 1.0 or + * later + */ + if (!vu_has_feature(vdev, VIRTIO_F_VERSION_1)) + vu_panic("virtio legacy devices aren't supported by passt"); + + if (!vu_has_feature(vdev, VHOST_USER_F_PROTOCOL_FEATURES)) + vu_set_enable_all_rings(vdev, true); + + /* virtio-net features */ + + if (vu_has_feature(vdev, VIRTIO_F_VERSION_1) || + vu_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF)) { + vdev->hdrlen = sizeof(struct virtio_net_hdr_mrg_rxbuf); + } else { + vdev->hdrlen = sizeof(struct virtio_net_hdr); + } + + return false; +} + +/** + * vu_set_owner_exec() - Session start flag, do nothing in our case + * + * Return: false as no reply is requested + */ +static bool vu_set_owner_exec(void) +{ + return false; +} + +/** + * map_ring() - Convert ring front-end (QEMU) addresses to our process + * virtual address space. + * @vdev: Vhost-user device + * @vq: Virtqueue + * + * Return: true if ring cannot be mapped to our address space + */ +static bool map_ring(struct vu_dev *vdev, struct vu_virtq *vq) +{ + vq->vring.desc = qva_to_va(vdev, vq->vra.desc_user_addr); + vq->vring.used = qva_to_va(vdev, vq->vra.used_user_addr); + vq->vring.avail = qva_to_va(vdev, vq->vra.avail_user_addr); + + debug("Setting virtq addresses:"); + debug(" vring_desc at %p", (void *)vq->vring.desc); + debug(" vring_used at %p", (void *)vq->vring.used); + debug(" vring_avail at %p", (void *)vq->vring.avail); + + return !(vq->vring.desc && vq->vring.used && vq->vring.avail); +} + +/** + * vu_packet_check_range() - Check if a given memory zone is contained in + * a mapped guest memory region + * @buf: Array of the available memory regions + * @offset: Offset of data range in packet descriptor + * @size: Length of desired data range + * @start: Start of the packet descriptor + * + * Return: 0 if the zone in a mapped memory region, -1 otherwise + */ +/* cppcheck-suppress unusedFunction */ +int vu_packet_check_range(void *buf, size_t offset, size_t len, + const char *start) +{ + struct vu_dev_region *dev_region; + + for (dev_region = buf; dev_region->mmap_addr; dev_region++) { + /* NOLINTNEXTLINE(performance-no-int-to-ptr) */ + char *m = (char *)dev_region->mmap_addr; + + if (m <= start && + start + offset + len < m + dev_region->mmap_offset + + dev_region->size) + return 0; + } + + return -1; +} + +/** + * vu_set_mem_table_exec() - Sets the memory map regions to be able to + * translate the vring addresses. + * @vdev: Vhost-user device + * @vmsg: Vhost-user message + * + * Return: false as no reply is requested + * + * #syscalls:vu mmap munmap + */ +static bool vu_set_mem_table_exec(struct vu_dev *vdev, + struct vhost_user_msg *msg) +{ + unsigned int i; + struct vhost_user_memory m = msg->payload.memory, *memory = &m; + + for (i = 0; i < vdev->nregions; i++) { + struct vu_dev_region *r = &vdev->regions[i]; + /* NOLINTNEXTLINE(performance-no-int-to-ptr) */ + void *mm = (void *)r->mmap_addr; + + if (mm) + munmap(mm, r->size + r->mmap_offset); + } + vdev->nregions = memory->nregions; + + debug("Nregions: %u", memory->nregions); + for (i = 0; i < vdev->nregions; i++) { + void *mmap_addr; + struct vhost_user_memory_region *msg_region = &memory->regions[i]; + struct vu_dev_region *dev_region = &vdev->regions[i]; + + debug("Region %d", i); + debug(" guest_phys_addr: 0x%016"PRIx64, + msg_region->guest_phys_addr); + debug(" memory_size: 0x%016"PRIx64, + msg_region->memory_size); + debug(" userspace_addr 0x%016"PRIx64, + msg_region->userspace_addr); + debug(" mmap_offset 0x%016"PRIx64, + msg_region->mmap_offset); + + dev_region->gpa = msg_region->guest_phys_addr; + dev_region->size = msg_region->memory_size; + dev_region->qva = msg_region->userspace_addr; + dev_region->mmap_offset = msg_region->mmap_offset; + + /* We don't use offset argument of mmap() since the + * mapped address has to be page aligned, and we use huge + * pages. + */ + mmap_addr = mmap(0, dev_region->size + dev_region->mmap_offset, + PROT_READ | PROT_WRITE, MAP_SHARED | + MAP_NORESERVE, msg->fds[i], 0); + + if (mmap_addr == MAP_FAILED) + vu_panic("region mmap error: %s", strerror(errno)); + + dev_region->mmap_addr = (uint64_t)(uintptr_t)mmap_addr; + debug(" mmap_addr: 0x%016"PRIx64, + dev_region->mmap_addr); + + close(msg->fds[i]); + } + + for (i = 0; i < VHOST_USER_MAX_QUEUES; i++) { + if (vdev->vq[i].vring.desc) { + if (map_ring(vdev, &vdev->vq[i])) + vu_panic("remapping queue %d during setmemtable", i); + } + } + + return false; +} + +/** + * vu_set_vring_num_exec() - Set the size of the queue (vring size) + * @vdev: Vhost-user device + * @vmsg: Vhost-user message + * + * Return: false as no reply is requested + */ +static bool vu_set_vring_num_exec(struct vu_dev *vdev, + struct vhost_user_msg *msg) +{ + unsigned int idx = msg->payload.state.index; + unsigned int num = msg->payload.state.num; + + debug("State.index: %u", idx); + debug("State.num: %u", num); + vdev->vq[idx].vring.num = num; + + return false; +} + +/** + * vu_set_vring_addr_exec() - Set the addresses of the vring + * @vdev: Vhost-user device + * @vmsg: Vhost-user message + * + * Return: false as no reply is requested + */ +static bool vu_set_vring_addr_exec(struct vu_dev *vdev, + struct vhost_user_msg *msg) +{ + struct vhost_vring_addr addr = msg->payload.addr, *vra = &addr; + struct vu_virtq *vq = &vdev->vq[vra->index]; + + debug("vhost_vring_addr:"); + debug(" index: %d", vra->index); + debug(" flags: %d", vra->flags); + debug(" desc_user_addr: 0x%016" PRIx64, (uint64_t)vra->desc_user_addr); + debug(" used_user_addr: 0x%016" PRIx64, (uint64_t)vra->used_user_addr); + debug(" avail_user_addr: 0x%016" PRIx64, (uint64_t)vra->avail_user_addr); + debug(" log_guest_addr: 0x%016" PRIx64, (uint64_t)vra->log_guest_addr); + + vq->vra = *vra; + vq->vring.flags = vra->flags; + vq->vring.log_guest_addr = vra->log_guest_addr; + + if (map_ring(vdev, vq)) + vu_panic("Invalid vring_addr message"); + + vq->used_idx = le16toh(vq->vring.used->idx); + + if (vq->last_avail_idx != vq->used_idx) { + debug("Last avail index != used index: %u != %u", + vq->last_avail_idx, vq->used_idx); + } + + return false; +} +/** + * vu_set_vring_base_exec() - Sets the next index to use for descriptors + * in this vring + * @vdev: Vhost-user device + * @vmsg: Vhost-user message + * + * Return: false as no reply is requested + */ +static bool vu_set_vring_base_exec(struct vu_dev *vdev, + struct vhost_user_msg *msg) +{ + unsigned int idx = msg->payload.state.index; + unsigned int num = msg->payload.state.num; + + debug("State.index: %u", idx); + debug("State.num: %u", num); + vdev->vq[idx].shadow_avail_idx = vdev->vq[idx].last_avail_idx = num; + + return false; +} + +/** + * vu_get_vring_base_exec() - Stops the vring and returns the current + * descriptor index or indices + * @vdev: Vhost-user device + * @vmsg: Vhost-user message + * + * Return: false as a reply is requested + */ +static bool vu_get_vring_base_exec(struct vu_dev *vdev, + struct vhost_user_msg *msg) +{ + unsigned int idx = msg->payload.state.index; + + debug("State.index: %u", idx); + msg->payload.state.num = vdev->vq[idx].last_avail_idx; + msg->hdr.size = sizeof(msg->payload.state); + + vdev->vq[idx].started = false; + + if (vdev->vq[idx].call_fd != -1) { + close(vdev->vq[idx].call_fd); + vdev->vq[idx].call_fd = -1; + } + if (vdev->vq[idx].kick_fd != -1) { + vu_remove_watch(vdev, vdev->vq[idx].kick_fd); + close(vdev->vq[idx].kick_fd); + vdev->vq[idx].kick_fd = -1; + } + + return true; +} + +/** + * vu_set_watch() - Add a file descriptor to the passt epoll file descriptor + * @vdev: vhost-user device + * @fd: file descriptor to add + */ +static void vu_set_watch(const struct vu_dev *vdev, int fd) +{ + (void)vdev; + (void)fd; +} + +/** + * vu_wait_queue() - wait new free entries in the virtqueue + * @vq: virtqueue to wait on + */ +static int vu_wait_queue(const struct vu_virtq *vq) +{ + eventfd_t kick_data; + ssize_t rc; + int status; + + /* wait the kernel to put new entries in the queue */ + + status = fcntl(vq->kick_fd, F_GETFL); + if (status == -1) + return -1; + + fcntl(vq->kick_fd, F_SETFL, status & ~O_NONBLOCK);

Here, and two lines below, Coverity Scan complains about the fact that you're using fcntl() without checking the return value.

...

+ rc = eventfd_read(vq->kick_fd, &kick_data);

Extra whitespace after =.

...

+ fcntl(vq->kick_fd, F_SETFL, status); + if (rc == -1) + return -1; + + return 0; +} + +/** + * vu_send() - Send a buffer to the front-end using the RX virtqueue + * @vdev: vhost-user device + * @buf: address of the buffer + * @size: size of the buffer + * + * Return: number of bytes sent, -1 if there is an error + */ +/* cppcheck-suppress unusedFunction */ +int vu_send(struct vu_dev *vdev, const void *buf, size_t size) +{ + size_t hdrlen = vdev->hdrlen; + struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE]; + struct vu_virtq_element elem[VIRTQUEUE_MAX_SIZE]; + struct iovec in_sg[VIRTQUEUE_MAX_SIZE]; + size_t lens[VIRTQUEUE_MAX_SIZE]; + size_t offset; + int i, j; + __virtio16 *num_buffers_ptr; + int in_sg_count;

Can those be aligned in the usual way (from longest to shortest)?

...

+ + debug("vu_send size %zu hdrlen %zu", size, hdrlen); + + if (!vu_queue_enabled(vq) || !vu_queue_started(vq)) { + err("Got packet, but no available descriptors on RX virtq."); + return 0; + } + + offset = 0; + i = 0; + num_buffers_ptr = NULL; + in_sg_count = 0;

Could those be initialised when you declare them?

...

+ while (offset < size) { + size_t len; + int total; + int ret; + + total = 0; + + if (i == ARRAY_SIZE(elem) || + in_sg_count == ARRAY_SIZE(in_sg)) { + err("virtio-net unexpected long buffer chain"); + goto err; + } + + elem[i].out_num = 0; + elem[i].out_sg = NULL; + elem[i].in_num = ARRAY_SIZE(in_sg) - in_sg_count; + elem[i].in_sg = &in_sg[in_sg_count]; + + ret = vu_queue_pop(vdev, vq, &elem[i]); + if (ret < 0) { + if (vu_wait_queue(vq) != -1) + continue; + if (i) { + err("virtio-net unexpected empty queue: " + "i %d mergeable %d offset %zd, size %zd, " + "features 0x%" PRIx64, + i, vu_has_feature(vdev, + VIRTIO_NET_F_MRG_RXBUF), + offset, size, vdev->features); + } + offset = -1; + goto err; + } + in_sg_count += elem[i].in_num; + + if (elem[i].in_num < 1) { + err("virtio-net receive queue contains no in buffers"); + vu_queue_detach_element(vdev, vq, elem[i].index, 0); + offset = -1; + goto err; + } + + if (i == 0) { + struct virtio_net_hdr hdr = { + .flags = VIRTIO_NET_HDR_F_DATA_VALID, + .gso_type = VIRTIO_NET_HDR_GSO_NONE, + }; + + ASSERT(offset == 0); + ASSERT(elem[i].in_sg[0].iov_len >= hdrlen); + + len = iov_from_buf(elem[i].in_sg, elem[i].in_num, 0, + &hdr, sizeof(hdr)); + + num_buffers_ptr = (__virtio16 *)((char *)elem[i].in_sg[0].iov_base + + len); + + total += hdrlen;

Shouldn't this be 'total += len' or, alternatively, shouldn't there be a check that len == hdrlen?

...

+ } + + len = iov_from_buf(elem[i].in_sg, elem[i].in_num, total, + (char *)buf + offset, size - offset); + + total += len; + offset += len; + + /* If buffers can't be merged, at this point we + * must have consumed the complete packet. + * Otherwise, drop it. + */ + if (!vu_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF) && + offset < size) { + vu_queue_unpop(vdev, vq, elem[i].index, total); + goto err; + } + + lens[i] = total; + i++; + } + + if (num_buffers_ptr && vu_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF)) + *num_buffers_ptr = htole16(i); + + for (j = 0; j < i; j++) { + debug("filling total %zd idx %d", lens[j], j); + vu_queue_fill(vq, &elem[j], lens[j], j); + } + + vu_queue_flush(vq, i); + vu_queue_notify(vdev, vq); + + debug("sent %zu", offset);

It would be nice to be a bit more specific here ("vhost-user sent ..." or something like that).

...

+ + return offset; +err: + for (j = 0; j < i; j++) + vu_queue_detach_element(vdev, vq, elem[j].index, lens[j]); + + return offset; +} + +/** + * vu_handle_tx() - Receive data from the TX virqueue

s/virqueue/virtqueue/

...

+ * @vdev: vhost-user device + * @index: index of the virtqueue + */ +static void vu_handle_tx(struct vu_dev *vdev, int index) +{ + struct vu_virtq *vq = &vdev->vq[index]; + int hdrlen = vdev->hdrlen; + struct timespec now; + struct vu_virtq_element elem[VIRTQUEUE_MAX_SIZE];

Excess whitespace.

...

+ struct iovec out_sg[VIRTQUEUE_MAX_SIZE]; + int out_sg_count; +

Excess newline.

...

+ int count;

Could those be ordered in the usual way?

...

+ + if (index % 2 != VHOST_USER_TX_QUEUE) {

This, and similar checks below, are rather convoluted. The defines are misleading: /* index of the RX virtqueue */ #define VHOST_USER_RX_QUEUE 0 ...but no, 2 is a receive queue too. Perhaps it would be more readable to just have something like: #define VHOST_USER_IS_QUEUE_TX(n) (n % 2) #define VHOST_USER_IS_QUEUE_RX(n) (!(n % 2)) ?

...

+ debug("index %d is not a TX queue", index); + return; + } + + clock_gettime(CLOCK_MONOTONIC, &now);

I guess vu_kick_cb() could take a timestamp instead?

...

+ + tap_flush_pools(); + + count = 0; + out_sg_count = 0; + while (1) { + int ret; + + ASSERT(index == VHOST_USER_TX_QUEUE);

...why is this one here? 'index' doesn't actually change in this loop.

...

+ + elem[count].out_num = 1; + elem[count].out_sg = &out_sg[out_sg_count]; + elem[count].in_num = 0; + elem[count].in_sg = NULL; + ret = vu_queue_pop(vdev, vq, &elem[count]); + if (ret < 0) + break;

This (a bit hidden) is the intended loop termination condition. I wonder: should we add an upper limit to the packets that can be dequeued in one run, or there's no risk of this loop starving everything else for some other reason?

...

+ out_sg_count += elem[count].out_num; + + if (elem[count].out_num < 1) { + debug("virtio-net header not in first element"); + break; + } + ASSERT(elem[count].out_num == 1); + + tap_add_packet(vdev->context, + elem[count].out_sg[0].iov_len - hdrlen, + (char *)elem[count].out_sg[0].iov_base + hdrlen); + count++; + } + tap_handler(vdev->context, &now); + + if (count) { + int i; + + for (i = 0; i < count; i++) + vu_queue_fill(vq, &elem[i], 0, i); + vu_queue_flush(vq, count); + vu_queue_notify(vdev, vq); + } +} + +/** + * vu_kick_cb() - Called on a kick event to start to receive data + * @vdev: vhost-user device + * @ref: epoll reference information + */ +/* cppcheck-suppress unusedFunction */ +void vu_kick_cb(struct vu_dev *vdev, union epoll_ref ref) +{ + eventfd_t kick_data; + ssize_t rc; + int idx; + + for (idx = 0; idx < VHOST_USER_MAX_QUEUES; idx++) + if (vdev->vq[idx].kick_fd == ref.fd) + break; + + if (idx == VHOST_USER_MAX_QUEUES) + return; + + rc = eventfd_read(ref.fd, &kick_data);

Extra whitespace after =.

...

+ if (rc == -1) + vu_panic("kick eventfd_read(): %s", strerror(errno)); + + debug("Got kick_data: %016"PRIx64" idx:%d", + kick_data, idx); + if (idx % 2 == VHOST_USER_TX_QUEUE) + vu_handle_tx(vdev, idx); +} + +/** + * vu_check_queue_msg_file() - Check if a message is valid, + * close fds if NOFD bit is set + * @vmsg: Vhost-user message + */ +static void vu_check_queue_msg_file(struct vhost_user_msg *msg) +{ + int idx = msg->payload.u64 & VHOST_USER_VRING_IDX_MASK; + bool nofd = msg->payload.u64 & VHOST_USER_VRING_NOFD_MASK; + + if (idx >= VHOST_USER_MAX_QUEUES) + vu_panic("Invalid queue index: %u", idx); + + if (nofd) { + vmsg_close_fds(msg); + return; + } + + if (msg->fd_num != 1) + vu_panic("Invalid fds in request: %d", msg->hdr.request); +} + +/** + * vu_set_vring_kick_exec() - Set the event file descriptor for adding buffers + * to the vring + * @vdev: Vhost-user device + * @vmsg: Vhost-user message + * + * Return: false as no reply is requested + */ +static bool vu_set_vring_kick_exec(struct vu_dev *vdev, + struct vhost_user_msg *msg) +{ + int idx = msg->payload.u64 & VHOST_USER_VRING_IDX_MASK; + bool nofd = msg->payload.u64 & VHOST_USER_VRING_NOFD_MASK; + + debug("u64: 0x%016"PRIx64, msg->payload.u64); + + vu_check_queue_msg_file(msg); + + if (vdev->vq[idx].kick_fd != -1) { + vu_remove_watch(vdev, vdev->vq[idx].kick_fd); + close(vdev->vq[idx].kick_fd); + vdev->vq[idx].kick_fd = -1; + } + + /* cppcheck-suppress redundantAssignment */

Actually, it's not clear to me either: why is this assigned just above?

...

+ vdev->vq[idx].kick_fd = nofd ? -1 : msg->fds[0]; + debug("Got kick_fd: %d for vq: %d", vdev->vq[idx].kick_fd, idx); + + vdev->vq[idx].started = true; + + if (vdev->vq[idx].kick_fd != -1 && idx % 2 == VHOST_USER_TX_QUEUE) { + vu_set_watch(vdev, vdev->vq[idx].kick_fd); + debug("Waiting for kicks on fd: %d for vq: %d", + vdev->vq[idx].kick_fd, idx); + } + + return false; +} + +/** + * vu_set_vring_call_exec() - Set the event file descriptor to signal when + * buffers are used + * @vdev: Vhost-user device + * @vmsg: Vhost-user message + * + * Return: false as no reply is requested + */ +static bool vu_set_vring_call_exec(struct vu_dev *vdev, + struct vhost_user_msg *msg) +{ + int idx = msg->payload.u64 & VHOST_USER_VRING_IDX_MASK; + bool nofd = msg->payload.u64 & VHOST_USER_VRING_NOFD_MASK; + + debug("u64: 0x%016"PRIx64, msg->payload.u64); + + vu_check_queue_msg_file(msg); + + if (vdev->vq[idx].call_fd != -1) { + close(vdev->vq[idx].call_fd); + vdev->vq[idx].call_fd = -1; + } + + /* cppcheck-suppress redundantAssignment */ + vdev->vq[idx].call_fd = nofd ? -1 : msg->fds[0]; + + /* in case of I/O hang after reconnecting */ + if (vdev->vq[idx].call_fd != -1) + eventfd_write(msg->fds[0], 1); + + debug("Got call_fd: %d for vq: %d", vdev->vq[idx].call_fd, idx); + + return false; +} + +/** + * vu_set_vring_err_exec() - Set the event file descriptor to signal when + * error occurs + * @vdev: Vhost-user device + * @vmsg: Vhost-user message + * + * Return: false as no reply is requested + */ +static bool vu_set_vring_err_exec(struct vu_dev *vdev, + struct vhost_user_msg *msg) +{ + int idx = msg->payload.u64 & VHOST_USER_VRING_IDX_MASK; + bool nofd = msg->payload.u64 & VHOST_USER_VRING_NOFD_MASK; + + debug("u64: 0x%016"PRIx64, msg->payload.u64); + + vu_check_queue_msg_file(msg); + + if (vdev->vq[idx].err_fd != -1) { + close(vdev->vq[idx].err_fd); + vdev->vq[idx].err_fd = -1; + } + + /* cppcheck-suppress redundantAssignment */

...same here.

...

+ vdev->vq[idx].err_fd = nofd ? -1 : msg->fds[0]; + + return false; +} + +/** + * vu_get_protocol_features_exec() - Provide the protocol (vhost-user) features + * to the front-end + * @vdev: Vhost-user device + * @vmsg: Vhost-user message + * + * Return: false as a reply is requested + */ +static bool vu_get_protocol_features_exec(struct vhost_user_msg *msg) +{ + uint64_t features = 1ULL << VHOST_USER_PROTOCOL_F_REPLY_ACK; + + vmsg_set_reply_u64(msg, features); + + return true; +} + +/** + * vu_set_protocol_features_exec() - Enable protocol (vhost-user) features + * @vdev: Vhost-user device + * @vmsg: Vhost-user message + * + * Return: false as no reply is requested + */ +static bool vu_set_protocol_features_exec(struct vu_dev *vdev, + struct vhost_user_msg *msg) +{ + uint64_t features = msg->payload.u64; + + debug("u64: 0x%016"PRIx64, features); + + vdev->protocol_features = msg->payload.u64; + + if (vu_has_protocol_feature(vdev, + VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS) && + (!vu_has_protocol_feature(vdev, VHOST_USER_PROTOCOL_F_BACKEND_REQ) || + !vu_has_protocol_feature(vdev, VHOST_USER_PROTOCOL_F_REPLY_ACK))) { + /* + * The use case for using messages for kick/call is simulation, to make + * the kick and call synchronous. To actually get that behaviour, both + * of the other features are required. + * Theoretically, one could use only kick messages, or do them without + * having F_REPLY_ACK, but too many (possibly pending) messages on the + * socket will eventually cause the master to hang, to avoid this in + * scenarios where not desired enforce that the settings are in a way + * that actually enables the simulation case. + */ + vu_panic("F_IN_BAND_NOTIFICATIONS requires F_BACKEND_REQ && F_REPLY_ACK"); + return false; + } + + return false; +} + +/** + * vu_get_queue_num_exec() - Tell how many queues we support + * @vmsg: Vhost-user message + * + * Return: true as a reply is requested + */ +static bool vu_get_queue_num_exec(struct vhost_user_msg *msg) +{ + vmsg_set_reply_u64(msg, VHOST_USER_MAX_QUEUES); + return true; +} + +/** + * vu_set_vring_enable_exec() - Enable or disable corresponding vring + * @vdev: Vhost-user device + * @vmsg: Vhost-user message + * + * Return: false as no reply is requested + */ +static bool vu_set_vring_enable_exec(struct vu_dev *vdev, + struct vhost_user_msg *msg) +{ + unsigned int idx = msg->payload.state.index; + unsigned int enable = msg->payload.state.num; + + debug("State.index: %u", idx); + debug("State.enable: %u", enable); + + if (idx >= VHOST_USER_MAX_QUEUES) + vu_panic("Invalid vring_enable index: %u", idx); + + vdev->vq[idx].enable = enable; + return false; +} + +/** + * vu_init() - Initialize vhost-user device structure + * @c: execution context + * @vdev: vhost-user device + */ +/* cppcheck-suppress unusedFunction */ +void vu_init(struct ctx *c, struct vu_dev *vdev) +{ + int i; + + vdev->context = c; + vdev->hdrlen = 0; + for (i = 0; i < VHOST_USER_MAX_QUEUES; i++)

Curly brackets for multi-line blocks (for consistency, not needed otherwise).

...

+ vdev->vq[i] = (struct vu_virtq){ + .call_fd = -1, + .kick_fd = -1, + .err_fd = -1, + .notification = true, + }; +} + +/** + * vu_cleanup() - Reset vhost-user device + * @vdev: vhost-user device + */ +void vu_cleanup(struct vu_dev *vdev) +{ + unsigned int i; + + for (i = 0; i < VHOST_USER_MAX_QUEUES; i++) { + struct vu_virtq *vq = &vdev->vq[i]; + + vq->started = false; + vq->notification = true; + + if (vq->call_fd != -1) { + close(vq->call_fd); + vq->call_fd = -1; + } + if (vq->err_fd != -1) { + close(vq->err_fd); + vq->err_fd = -1; + } + if (vq->kick_fd != -1) { + vu_remove_watch(vdev, vq->kick_fd);

Excess whitespace.

...

+ close(vq->kick_fd); + vq->kick_fd = -1; + } + + vq->vring.desc = 0; + vq->vring.used = 0; + vq->vring.avail = 0; + } + vdev->hdrlen = 0; + + for (i = 0; i < vdev->nregions; i++) { + const struct vu_dev_region *r = &vdev->regions[i]; + /* NOLINTNEXTLINE(performance-no-int-to-ptr) */ + void *m = (void *)r->mmap_addr; + + if (m) + munmap(m, r->size + r->mmap_offset); + } + vdev->nregions = 0; +} + +/** + * vu_sock_reset() - Reset connection socket + * @vdev: vhost-user device + */ +static void vu_sock_reset(struct vu_dev *vdev) +{ + (void)vdev; +} + +/** + * tap_handler_vu() - Packet handler for vhost-user + * @vdev: vhost-user device + * @fd: vhost-user message socket + * @events: epoll events + */ +/* cppcheck-suppress unusedFunction */ +void tap_handler_vu(struct vu_dev *vdev, int fd, uint32_t events) +{ + struct vhost_user_msg msg = { 0 }; + bool need_reply, reply_requested; + int ret; + + if (events & (EPOLLRDHUP | EPOLLHUP | EPOLLERR)) { + vu_sock_reset(vdev); + return; + } +

Excess newline.

...

+ + ret = vu_message_read_default(fd, &msg); + if (ret < 0) + vu_panic("Error while recvmsg: %s", strerror(errno));

die() has also a die_perror() variant, by the way.

...

+ if (ret == 0) { + vu_sock_reset(vdev); + return; + } + debug("================ Vhost user message ================"); + debug("Request: %s (%d)", vu_request_to_string(msg.hdr.request), + msg.hdr.request); + debug("Flags: 0x%x", msg.hdr.flags); + debug("Size: %u", msg.hdr.size); + + need_reply = msg.hdr.flags & VHOST_USER_NEED_REPLY_MASK; + switch (msg.hdr.request) { + case VHOST_USER_GET_FEATURES: + reply_requested = vu_get_features_exec(&msg); + break; + case VHOST_USER_SET_FEATURES: + reply_requested = vu_set_features_exec(vdev, &msg); + break; + case VHOST_USER_GET_PROTOCOL_FEATURES: + reply_requested = vu_get_protocol_features_exec(&msg); + break; + case VHOST_USER_SET_PROTOCOL_FEATURES: + reply_requested = vu_set_protocol_features_exec(vdev, &msg); + break; + case VHOST_USER_GET_QUEUE_NUM: + reply_requested = vu_get_queue_num_exec(&msg); + break; + case VHOST_USER_SET_OWNER: + reply_requested = vu_set_owner_exec(); + break; + case VHOST_USER_SET_MEM_TABLE: + reply_requested = vu_set_mem_table_exec(vdev, &msg); + break; + case VHOST_USER_SET_VRING_NUM: + reply_requested = vu_set_vring_num_exec(vdev, &msg); + break; + case VHOST_USER_SET_VRING_ADDR: + reply_requested = vu_set_vring_addr_exec(vdev, &msg); + break; + case VHOST_USER_SET_VRING_BASE: + reply_requested = vu_set_vring_base_exec(vdev, &msg); + break; + case VHOST_USER_GET_VRING_BASE: + reply_requested = vu_get_vring_base_exec(vdev, &msg); + break; + case VHOST_USER_SET_VRING_KICK: + reply_requested = vu_set_vring_kick_exec(vdev, &msg); + break; + case VHOST_USER_SET_VRING_CALL: + reply_requested = vu_set_vring_call_exec(vdev, &msg); + break; + case VHOST_USER_SET_VRING_ERR: + reply_requested = vu_set_vring_err_exec(vdev, &msg); + break; + case VHOST_USER_SET_VRING_ENABLE: + reply_requested = vu_set_vring_enable_exec(vdev, &msg); + break; + case VHOST_USER_NONE: + vu_cleanup(vdev); + return; + default: + vu_panic("Unhandled request: %d", msg.hdr.request); + return; + } + + if (!reply_requested && need_reply) { + msg.payload.u64 = 0; + msg.hdr.flags = 0; + msg.hdr.size = sizeof(msg.payload.u64); + msg.fd_num = 0; + reply_requested = true; + } + + if (reply_requested) + vu_send_reply(fd, &msg); +} diff --git a/vhost_user.h b/vhost_user.h new file mode 100644 index 000000000000..b9e4bcf8e531 --- /dev/null +++ b/vhost_user.h @@ -0,0 +1,197 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later + * Copyright Red Hat + * Author: Laurent Vivier + * + * vhost-user API, command management and virtio interface + */ + +/* some parts from subprojects/libvhost-user/libvhost-user.h */ + +#ifndef VHOST_USER_H +#define VHOST_USER_H + +#include "virtio.h" +#include "iov.h" + +#define VHOST_USER_F_PROTOCOL_FEATURES 30 + +#define VHOST_MEMORY_BASELINE_NREGIONS 8 + +/** + * enum vhost_user_protocol_feature - List of available vhost-user features + */ +enum vhost_user_protocol_feature { + VHOST_USER_PROTOCOL_F_MQ = 0, + VHOST_USER_PROTOCOL_F_LOG_SHMFD = 1, + VHOST_USER_PROTOCOL_F_RARP = 2, + VHOST_USER_PROTOCOL_F_REPLY_ACK = 3, + VHOST_USER_PROTOCOL_F_NET_MTU = 4, + VHOST_USER_PROTOCOL_F_BACKEND_REQ = 5, + VHOST_USER_PROTOCOL_F_CROSS_ENDIAN = 6, + VHOST_USER_PROTOCOL_F_CRYPTO_SESSION = 7, + VHOST_USER_PROTOCOL_F_PAGEFAULT = 8, + VHOST_USER_PROTOCOL_F_CONFIG = 9, + VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD = 10, + VHOST_USER_PROTOCOL_F_HOST_NOTIFIER = 11, + VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD = 12, + VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS = 14, + VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS = 15, + + VHOST_USER_PROTOCOL_F_MAX +}; + +/** + * enum vhost_user_request - list of available vhost-user request + */ +enum vhost_user_request { + VHOST_USER_NONE = 0, + VHOST_USER_GET_FEATURES = 1, + VHOST_USER_SET_FEATURES = 2, + VHOST_USER_SET_OWNER = 3, + VHOST_USER_RESET_OWNER = 4, + VHOST_USER_SET_MEM_TABLE = 5, + VHOST_USER_SET_LOG_BASE = 6, + VHOST_USER_SET_LOG_FD = 7, + VHOST_USER_SET_VRING_NUM = 8, + VHOST_USER_SET_VRING_ADDR = 9, + VHOST_USER_SET_VRING_BASE = 10, + VHOST_USER_GET_VRING_BASE = 11, + VHOST_USER_SET_VRING_KICK = 12, + VHOST_USER_SET_VRING_CALL = 13, + VHOST_USER_SET_VRING_ERR = 14, + VHOST_USER_GET_PROTOCOL_FEATURES = 15, + VHOST_USER_SET_PROTOCOL_FEATURES = 16, + VHOST_USER_GET_QUEUE_NUM = 17, + VHOST_USER_SET_VRING_ENABLE = 18, + VHOST_USER_SEND_RARP = 19, + VHOST_USER_NET_SET_MTU = 20, + VHOST_USER_SET_BACKEND_REQ_FD = 21, + VHOST_USER_IOTLB_MSG = 22, + VHOST_USER_SET_VRING_ENDIAN = 23, + VHOST_USER_GET_CONFIG = 24, + VHOST_USER_SET_CONFIG = 25, + VHOST_USER_CREATE_CRYPTO_SESSION = 26, + VHOST_USER_CLOSE_CRYPTO_SESSION = 27, + VHOST_USER_POSTCOPY_ADVISE = 28, + VHOST_USER_POSTCOPY_LISTEN = 29, + VHOST_USER_POSTCOPY_END = 30, + VHOST_USER_GET_INFLIGHT_FD = 31, + VHOST_USER_SET_INFLIGHT_FD = 32, + VHOST_USER_GPU_SET_SOCKET = 33, + VHOST_USER_VRING_KICK = 35, + VHOST_USER_GET_MAX_MEM_SLOTS = 36, + VHOST_USER_ADD_MEM_REG = 37, + VHOST_USER_REM_MEM_REG = 38, + VHOST_USER_MAX +}; + +/** + * struct vhost_user_header - Vhost-user message header + * @request: Request type of the message + * @flags: Request flags + * @size: The following payload size + */ +struct vhost_user_header { + enum vhost_user_request request; + +#define VHOST_USER_VERSION_MASK 0x3 +#define VHOST_USER_REPLY_MASK (0x1 << 2) +#define VHOST_USER_NEED_REPLY_MASK (0x1 << 3) + uint32_t flags; + uint32_t size; /* the following payload size */ +} __attribute__ ((__packed__)); + +/** + * struct vhost_user_memory_region - Front-end shared memory region information + * @guest_phys_addr: Guest physical address of the region + * @memory_size: Memory size + * @userspace_addr: front-end (QEMU) userspace address + * @mmap_offset: region offset in the shared memory area + */ +struct vhost_user_memory_region { + uint64_t guest_phys_addr; + uint64_t memory_size; + uint64_t userspace_addr; + uint64_t mmap_offset; +}; + +/** + * struct vhost_user_memory - List of all the shared memory regions + * @nregions: Number of memory regions + * @padding: Padding + * @regions: Memory regions list + */ +struct vhost_user_memory { + uint32_t nregions; + uint32_t padding; + struct vhost_user_memory_region regions[VHOST_MEMORY_BASELINE_NREGIONS]; +}; + +/** + * union vhost_user_payload - Vhost-user message payload + * @u64: 64bit payload + * @state: Vring state payload + * @addr: Vring addresses payload + * vhost_user_memory: Memory regions information payload + */ +union vhost_user_payload { +#define VHOST_USER_VRING_IDX_MASK 0xff +#define VHOST_USER_VRING_NOFD_MASK (0x1 << 8) + uint64_t u64; + struct vhost_vring_state state; + struct vhost_vring_addr addr; + struct vhost_user_memory memory; +}; + +/** + * struct vhost_user_msg - Vhost-use message + * @hdr: Message header + * @payload: Message payload + * @fds: File descriptors associated with the message + * in the ancillary data. + * (shared memory or event file descriptors) + * @fd_num: Number of file descriptors + */ +struct vhost_user_msg { + struct vhost_user_header hdr; + union vhost_user_payload payload; + + int fds[VHOST_MEMORY_BASELINE_NREGIONS]; + int fd_num; +} __attribute__ ((__packed__)); +#define VHOST_USER_HDR_SIZE sizeof(struct vhost_user_header) + +/* index of the RX virtqueue */ +#define VHOST_USER_RX_QUEUE 0 +/* index of the TX virtqueue */ +#define VHOST_USER_TX_QUEUE 1 + +/** + * vu_queue_enabled - Return state of a virtqueue + * @vq: Virtqueue to check + * + * Return: true if the virqueue is enabled, false otherwise + */ +static inline bool vu_queue_enabled(const struct vu_virtq *vq) +{ + return vq->enable; +} + +/** + * vu_queue_started - Return state of a virtqueue + * @vq: Virtqueue to check + * + * Return: true if the virqueue is started, false otherwise + */ +static inline bool vu_queue_started(const struct vu_virtq *vq) +{ + return vq->started; +} + +int vu_send(struct vu_dev *vdev, const void *buf, size_t size); +void vu_print_capabilities(void); +void vu_init(struct ctx *c, struct vu_dev *vdev); +void vu_kick_cb(struct vu_dev *vdev, union epoll_ref ref); +void vu_cleanup(struct vu_dev *vdev); +void tap_handler_vu(struct vu_dev *vdev, int fd, uint32_t events); +#endif /* VHOST_USER_H */ diff --git a/virtio.c b/virtio.c index 5f984f92cae0..d712f30cc33d 100644 --- a/virtio.c +++ b/virtio.c @@ -261,7 +261,6 @@ static bool vring_notify(const struct vu_dev *dev, struct vu_virtq *vq) * @dev: Vhost-user device * @vq: Virtqueue */ -/* cppcheck-suppress unusedFunction */ void vu_queue_notify(const struct vu_dev *dev, struct vu_virtq *vq) { if (!vq->vring.avail) @@ -436,7 +435,6 @@ static int vu_queue_map_desc(struct vu_dev *dev, struct vu_virtq *vq, unsigned i * * Return: -1 if there is an error, 0 otherwise */ -/* cppcheck-suppress unusedFunction */ int vu_queue_pop(struct vu_dev *dev, struct vu_virtq *vq, struct vu_virtq_element *elem) { unsigned int head; @@ -497,7 +495,6 @@ void vu_queue_detach_element(struct vu_dev *dev, struct vu_virtq *vq, * @index: Index of the element to unpop * @len: Size of the element to unpop */ -/* cppcheck-suppress unusedFunction */ void vu_queue_unpop(struct vu_dev *dev, struct vu_virtq *vq, unsigned int index, size_t len) { vq->last_avail_idx--; @@ -567,7 +564,6 @@ void vu_queue_fill_by_index(struct vu_virtq *vq, unsigned int index, * @len: Size of the element * @idx: Used ring entry index */ -/* cppcheck-suppress unusedFunction */ void vu_queue_fill(struct vu_virtq *vq, const struct vu_virtq_element *elem, unsigned int len, unsigned int idx) { @@ -591,7 +587,6 @@ static inline void vring_used_idx_set(struct vu_virtq *vq, uint16_t val) * @vq: Virtqueue * @count: Number of entry to flush */ -/* cppcheck-suppress unusedFunction */ void vu_queue_flush(struct vu_virtq *vq, unsigned int count) { uint16_t old, new; diff --git a/virtio.h b/virtio.h index 0a2cf6230139..61fb2f9cbf20 100644 --- a/virtio.h +++ b/virtio.h @@ -107,6 +107,7 @@ struct vu_dev_region { * @hdrlen: Virtio -net header length */ struct vu_dev { + struct ctx *context; uint32_t nregions; struct vu_dev_region regions[VHOST_USER_MAX_RAM_SLOTS]; struct vu_virtq vq[VHOST_USER_MAX_QUEUES]; @@ -163,7 +164,6 @@ static inline bool vu_has_feature(const struct vu_dev *vdev, * * Return: True if the feature is available */ -/* cppcheck-suppress unusedFunction */ static inline bool vu_has_protocol_feature(const struct vu_dev *vdev, unsigned int fbit) {

...the rest looks good to me, but I didn't review 4/4 yet (it conflicts quite a bit with the flow table implementation and I didn't manage to apply it quickly). -- Stefano

Laurent Vivier

14 Aug 14 Aug

4:44 p.m.

New subject: [PATCH v2 3/4] vhost-user: introduce vhost-user API

On 19/07/2024 23:29, Stefano Brivio wrote:

...

On Fri, 12 Jul 2024 17:32:43 +0200 Laurent Vivier wrote:

...
Add vhost_user.c and vhost_user.h that define the functions needed to implement vhost-user backend.

Signed-off-by: Laurent Vivier --- Makefile | 4 +- iov.c | 1 - vhost_user.c | 1267 ++++++++++++++++++++++++++++++++++++++++++++++++++ vhost_user.h | 197 ++++++++ virtio.c | 5 - virtio.h | 2 +- 6 files changed, 1467 insertions(+), 9 deletions(-) create mode 100644 vhost_user.c create mode 100644 vhost_user.h

...

...
+/** + * vu_send() - Send a buffer to the front-end using the RX virtqueue + * @vdev: vhost-user device + * @buf: address of the buffer + * @size: size of the buffer + * + * Return: number of bytes sent, -1 if there is an error + */ +/* cppcheck-suppress unusedFunction */ +int vu_send(struct vu_dev *vdev, const void *buf, size_t size) +{ + size_t hdrlen = vdev->hdrlen; + struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE]; + struct vu_virtq_element elem[VIRTQUEUE_MAX_SIZE]; + struct iovec in_sg[VIRTQUEUE_MAX_SIZE]; + size_t lens[VIRTQUEUE_MAX_SIZE]; + size_t offset; + int i, j; + __virtio16 *num_buffers_ptr; + int in_sg_count;

Can those be aligned in the usual way (from longest to shortest)?

...
+ + debug("vu_send size %zu hdrlen %zu", size, hdrlen); + + if (!vu_queue_enabled(vq) || !vu_queue_started(vq)) { + err("Got packet, but no available descriptors on RX virtq."); + return 0; + } + + offset = 0; + i = 0; + num_buffers_ptr = NULL; + in_sg_count = 0;

Could those be initialised when you declare them?

...
+ while (offset < size) { + size_t len; + int total; + int ret; + + total = 0; + + if (i == ARRAY_SIZE(elem) || + in_sg_count == ARRAY_SIZE(in_sg)) { + err("virtio-net unexpected long buffer chain"); + goto err; + } + + elem[i].out_num = 0; + elem[i].out_sg = NULL; + elem[i].in_num = ARRAY_SIZE(in_sg) - in_sg_count; + elem[i].in_sg = &in_sg[in_sg_count]; + + ret = vu_queue_pop(vdev, vq, &elem[i]); + if (ret < 0) { + if (vu_wait_queue(vq) != -1) + continue; + if (i) { + err("virtio-net unexpected empty queue: " + "i %d mergeable %d offset %zd, size %zd, " + "features 0x%" PRIx64, + i, vu_has_feature(vdev, + VIRTIO_NET_F_MRG_RXBUF), + offset, size, vdev->features); + } + offset = -1; + goto err; + } + in_sg_count += elem[i].in_num; + + if (elem[i].in_num < 1) { + err("virtio-net receive queue contains no in buffers"); + vu_queue_detach_element(vdev, vq, elem[i].index, 0); + offset = -1; + goto err; + } + + if (i == 0) { + struct virtio_net_hdr hdr = { + .flags = VIRTIO_NET_HDR_F_DATA_VALID, + .gso_type = VIRTIO_NET_HDR_GSO_NONE, + }; + + ASSERT(offset == 0); + ASSERT(elem[i].in_sg[0].iov_len >= hdrlen); + + len = iov_from_buf(elem[i].in_sg, elem[i].in_num, 0, + &hdr, sizeof(hdr)); + + num_buffers_ptr = (__virtio16 *)((char *)elem[i].in_sg[0].iov_base + + len); + + total += hdrlen;

Shouldn't this be 'total += len' or, alternatively, shouldn't there be a check that len == hdrlen?

len is sizeof(virtio_net_hdr) but hdrlen can be either sizeof(struct virtio_net_hdr) or sizeof(struct virtio_net_hdr_mrg_rxbuf). It depends on VIRTIO_NET_F_MRG_RXBUF. We actually want to add hdrlen to total. struct virtio_net_hdr_mrg_rxbuf { struct virtio_net_hdr hdr; __virtio16 num_buffers; /* Number of merged rx buffers */ }; At this point we initialize hdr, num_buffers will be set later only if hdrlen is sizeof(struct virtio_net_hdr_mrg_rxbuf). Thanks, Laurent

Laurent Vivier

12 Jul 12 Jul

5:32 p.m.

New subject: [PATCH v2 4/4] vhost-user: add vhost-user

add virtio and vhost-user functions to connect with QEMU. $ ./passt --vhost-user and # qemu-system-x86_64 ... -m 4G \ -object memory-backend-memfd,id=memfd0,share=on,size=4G \ -numa node,memdev=memfd0 \ -chardev socket,id=chr0,path=/tmp/passt_1.socket \ -netdev vhost-user,id=netdev0,chardev=chr0 \ -device virtio-net,mac=9a:2b:2c:2d:2e:2f,netdev=netdev0 \ ... Signed-off-by: Laurent Vivier --- Makefile | 4 +- checksum.c | 1 - conf.c | 24 ++- isolation.c | 15 +- packet.c | 13 ++ packet.h | 2 + passt.c | 16 +- passt.h | 10 + pcap.c | 1 - tap.c | 114 +++++++--- tap.h | 5 +- tcp.c | 17 +- tcp_vu.c | 560 +++++++++++++++++++++++++++++++++++++++++++++++++ tcp_vu.h | 12 ++ udp.c | 54 ++--- udp_internal.h | 39 ++++ udp_vu.c | 240 +++++++++++++++++++++ udp_vu.h | 11 + vhost_user.c | 28 ++- virtio.c | 1 - 20 files changed, 1084 insertions(+), 83 deletions(-) create mode 100644 tcp_vu.c create mode 100644 tcp_vu.h create mode 100644 udp_internal.h create mode 100644 udp_vu.c create mode 100644 udp_vu.h diff --git a/Makefile b/Makefile index b2da6ad62103..d22388726099 100644 --- a/Makefile +++ b/Makefile @@ -47,7 +47,7 @@ FLAGS += -DDUAL_STACK_SOCKETS=$(DUAL_STACK_SOCKETS) PASST_SRCS = arch.c arp.c checksum.c conf.c dhcp.c dhcpv6.c flow.c fwd.c \ icmp.c igmp.c inany.c iov.c ip.c isolation.c lineread.c log.c mld.c \ ndp.c netlink.c packet.c passt.c pasta.c pcap.c pif.c tap.c tcp.c \ - tcp_buf.c tcp_splice.c udp.c util.c vhost_user.c virtio.c + tcp_buf.c tcp_splice.c tcp_vu.c udp.c udp_vu.c util.c vhost_user.c virtio.c QRAP_SRCS = qrap.c SRCS = $(PASST_SRCS) $(QRAP_SRCS) @@ -57,7 +57,7 @@ PASST_HEADERS = arch.h arp.h checksum.h conf.h dhcp.h dhcpv6.h flow.h fwd.h \ flow_table.h icmp.h icmp_flow.h inany.h iov.h ip.h isolation.h \ lineread.h log.h ndp.h netlink.h packet.h passt.h pasta.h pcap.h pif.h \ siphash.h tap.h tcp.h tcp_buf.h tcp_conn.h tcp_internal.h tcp_splice.h \ - udp.h util.h vhost_user.h virtio.h + tcp_vu.h udp.h udp_internal.h udp_vu.h util.h vhost_user.h virtio.h HEADERS = $(PASST_HEADERS) seccomp.h C := \#include \nstruct tcp_info x = { .tcpi_snd_wnd = 0 }; diff --git a/checksum.c b/checksum.c index 006614fcbb28..aa5b7ae1cb66 100644 --- a/checksum.c +++ b/checksum.c @@ -501,7 +501,6 @@ uint16_t csum(const void *buf, size_t len, uint32_t init) * * Return: 16-bit folded, complemented checksum */ -/* cppcheck-suppress unusedFunction */ uint16_t csum_iov(const struct iovec *iov, size_t n, uint32_t init) { unsigned int i; diff --git a/conf.c b/conf.c index 3c38cebcbd69..3eb5ba56df88 100644 --- a/conf.c +++ b/conf.c @@ -45,6 +45,7 @@ #include "lineread.h" #include "isolation.h" #include "log.h" +#include "vhost_user.h" /** * next_chunk - Return the next piece of a string delimited by a character @@ -744,9 +745,14 @@ static void usage(const char *name, FILE *f, int status) " default: same interface name as external one\n"); } else { fprintf(f, - " -s, --socket PATH UNIX domain socket path\n" + " -s, --socket, --socket-path PATH UNIX domain socket path\n" " default: probe free path starting from " UNIX_SOCK_PATH "\n", 1); + fprintf(f, + " --vhost-user Enable vhost-user mode\n" + " UNIX domain socket is provided by -s option\n" + " --print-capabilities print back-end capabilities in JSON format,\n" + " only meaningful for vhost-user mode\n"); } fprintf(f, @@ -1215,6 +1221,10 @@ void conf(struct ctx *c, int argc, char **argv) {"no-copy-routes", no_argument, NULL, 18 }, {"no-copy-addrs", no_argument, NULL, 19 }, {"netns-only", no_argument, NULL, 20 }, + {"vhost-user", no_argument, NULL, 21 }, + /* vhost-user backend program convention */ + {"print-capabilities", no_argument, NULL, 22 }, + {"socket-path", required_argument, NULL, 's' }, { 0 }, }; const char *logname = (c->mode == MODE_PASTA) ? "pasta" : "passt"; @@ -1344,14 +1354,12 @@ void conf(struct ctx *c, int argc, char **argv) sizeof(c->ip4.ifname_out), "%s", optarg); if (ret <= 0 || ret >= (int)sizeof(c->ip4.ifname_out)) die("Invalid interface name: %s", optarg); - break; case 16: ret = snprintf(c->ip6.ifname_out, sizeof(c->ip6.ifname_out), "%s", optarg); if (ret <= 0 || ret >= (int)sizeof(c->ip6.ifname_out)) die("Invalid interface name: %s", optarg); - break; case 17: if (c->mode != MODE_PASTA) @@ -1380,6 +1388,16 @@ void conf(struct ctx *c, int argc, char **argv) netns_only = 1; *userns = 0; break; + case 21: + if (c->mode == MODE_PASTA) { + err("--vhost-user is for passt mode only"); + usage(argv[0], stdout, EXIT_SUCCESS); + } + c->mode = MODE_VU; + break; + case 22: + vu_print_capabilities(); + break; case 'd': c->debug = 1; c->quiet = 0; diff --git a/isolation.c b/isolation.c index 4956d7e6f331..1a27f066c2ba 100644 --- a/isolation.c +++ b/isolation.c @@ -373,12 +373,19 @@ void isolate_postfork(const struct ctx *c) prctl(PR_SET_DUMPABLE, 0); - if (c->mode == MODE_PASTA) { - prog.len = (unsigned short)ARRAY_SIZE(filter_pasta); - prog.filter = filter_pasta; - } else { + switch (c->mode) { + case MODE_PASST: prog.len = (unsigned short)ARRAY_SIZE(filter_passt); prog.filter = filter_passt; + break; + case MODE_PASTA: + prog.len = (unsigned short)ARRAY_SIZE(filter_pasta); + prog.filter = filter_pasta; + break; + case MODE_VU: + prog.len = (unsigned short)ARRAY_SIZE(filter_vu); + prog.filter = filter_vu; + break; } if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) || diff --git a/packet.c b/packet.c index f7bb523c4ffa..a95d525f97f2 100644 --- a/packet.c +++ b/packet.c @@ -36,6 +36,19 @@ static int packet_check_range(const struct pool *p, size_t offset, size_t len, const char *start, const char *func, int line) { + ASSERT(p->buf); + + if (p->buf_size == 0) { + int ret; + + ret = vu_packet_check_range((void *)p->buf, offset, len, start); + + if (ret == -1 && func) + trace("cannot find region, %s:%i", func, line); + + return ret; + } + if (start < p->buf) { if (func) { trace("add packet start %p before buffer start %p, " diff --git a/packet.h b/packet.h index 8377dcf678bb..d32688d8a0a4 100644 --- a/packet.h +++ b/packet.h @@ -22,6 +22,8 @@ struct pool { struct iovec pkt[1]; }; +int vu_packet_check_range(void *buf, size_t offset, size_t len, + const char *start); void packet_add_do(struct pool *p, size_t len, const char *start, const char *func, int line); void *packet_get_do(const struct pool *p, const size_t idx, diff --git a/passt.c b/passt.c index e4d45daab011..6f3df4026c83 100644 --- a/passt.c +++ b/passt.c @@ -73,6 +73,8 @@ char *epoll_type_str[] = { [EPOLL_TYPE_TAP_PASTA] = "/dev/net/tun device", [EPOLL_TYPE_TAP_PASST] = "connected qemu socket", [EPOLL_TYPE_TAP_LISTEN] = "listening qemu socket", + [EPOLL_TYPE_VHOST_CMD] = "vhost-user command socket", + [EPOLL_TYPE_VHOST_KICK] = "vhost-user kick socket", }; static_assert(ARRAY_SIZE(epoll_type_str) == EPOLL_NUM_TYPES, "epoll_type_str[] doesn't match enum epoll_type"); @@ -205,6 +207,7 @@ int main(int argc, char **argv) struct rlimit limit; struct timespec now; struct sigaction sa; + struct vu_dev vdev; arch_avx2_exec(argv); @@ -259,6 +262,8 @@ int main(int argc, char **argv) pasta_netns_quit_init(&c); tap_sock_init(&c); + if (c.mode == MODE_VU) + vu_init(&c, &vdev); secret_init(&c); @@ -347,11 +352,20 @@ loop: tcp_timer_handler(&c, ref); break; case EPOLL_TYPE_UDP: - udp_buf_sock_handler(&c, ref, eventmask, &now); + if (c.mode == MODE_VU) + udp_vu_sock_handler(&c, ref, eventmask, &now); + else + udp_buf_sock_handler(&c, ref, eventmask, &now); break; case EPOLL_TYPE_PING: icmp_sock_handler(&c, ref); break; + case EPOLL_TYPE_VHOST_CMD: + tap_handler_vu(&vdev, c.fd_tap, eventmask); + break; + case EPOLL_TYPE_VHOST_KICK: + vu_kick_cb(&vdev, ref); + break; default: /* Can't happen */ ASSERT(0); diff --git a/passt.h b/passt.h index 21cf4c15c921..7dce022505b4 100644 --- a/passt.h +++ b/passt.h @@ -22,6 +22,8 @@ union epoll_ref; #include "fwd.h" #include "tcp.h" #include "udp.h" +#include "udp_vu.h" +#include "vhost_user.h" /** * enum epoll_type - Different types of fds we poll over @@ -51,6 +53,10 @@ enum epoll_type { EPOLL_TYPE_TAP_PASST, /* socket listening for qemu socket connections */ EPOLL_TYPE_TAP_LISTEN, + /* vhost-user command socket */ + EPOLL_TYPE_VHOST_CMD, + /* vhost-user kick event socket */ + EPOLL_TYPE_VHOST_KICK, EPOLL_NUM_TYPES, }; @@ -117,6 +123,7 @@ struct fqdn { enum passt_modes { MODE_PASST, MODE_PASTA, + MODE_VU, }; /** @@ -223,6 +230,7 @@ struct ip6_ctx { * @no_map_gw: Don't map connections, untracked UDP to gateway to host * @low_wmem: Low probed net.core.wmem_max * @low_rmem: Low probed net.core.rmem_max + * @vdev: vhost-user device */ struct ctx { enum passt_modes mode; @@ -286,6 +294,8 @@ struct ctx { int low_wmem; int low_rmem; + + struct vu_dev *vdev; }; void proto_update_l2_buf(const unsigned char *eth_d, diff --git a/pcap.c b/pcap.c index 46cc4b0d72b6..7e9c56090041 100644 --- a/pcap.c +++ b/pcap.c @@ -140,7 +140,6 @@ void pcap_multiple(const struct iovec *iov, size_t frame_parts, unsigned int n, * containing packet data to write, including L2 header * @iovcnt: Number of buffers (@iov entries) */ -/* cppcheck-suppress unusedFunction */ void pcap_iov(const struct iovec *iov, size_t iovcnt) { struct timespec now; diff --git a/tap.c b/tap.c index ec994a2ed4ed..56506a45e4dd 100644 --- a/tap.c +++ b/tap.c @@ -58,6 +58,7 @@ #include "packet.h" #include "tap.h" #include "log.h" +#include "vhost_user.h" /* IPv4 (plus ARP) and IPv6 message batches from tap/guest to IP handlers */ static PACKET_POOL_NOINIT(pool_tap4, TAP_MSGS, pkt_buf); @@ -78,16 +79,22 @@ void tap_send_single(const struct ctx *c, const void *data, size_t l2len) struct iovec iov[2]; size_t iovcnt = 0; - if (c->mode == MODE_PASST) { + switch (c->mode) { + case MODE_PASST: iov[iovcnt] = IOV_OF_LVALUE(vnet_len); iovcnt++; - } - - iov[iovcnt].iov_base = (void *)data; - iov[iovcnt].iov_len = l2len; - iovcnt++; + /* fall through */ + case MODE_PASTA: + iov[iovcnt].iov_base = (void *)data; + iov[iovcnt].iov_len = l2len; + iovcnt++; - tap_send_frames(c, iov, iovcnt, 1); + tap_send_frames(c, iov, iovcnt, 1); + break; + case MODE_VU: + vu_send(c->vdev, data, l2len); + break; + } } /** @@ -416,10 +423,18 @@ size_t tap_send_frames(const struct ctx *c, const struct iovec *iov, if (!nframes) return 0; - if (c->mode == MODE_PASTA) + switch (c->mode) { + case MODE_PASTA: m = tap_send_frames_pasta(c, iov, bufs_per_frame, nframes); - else + break; + case MODE_PASST: m = tap_send_frames_passt(c, iov, bufs_per_frame, nframes); + break; + case MODE_VU: + /* fall through */ + default: + ASSERT(0); + } if (m < nframes) debug("tap: failed to send %zu frames of %zu", @@ -977,7 +992,7 @@ void tap_add_packet(struct ctx *c, ssize_t l2len, char *p) * tap_sock_reset() - Handle closing or failure of connect AF_UNIX socket * @c: Execution context */ -static void tap_sock_reset(struct ctx *c) +void tap_sock_reset(struct ctx *c) { if (c->one_off) { info("Client closed connection, exiting"); @@ -988,6 +1003,8 @@ static void tap_sock_reset(struct ctx *c) epoll_ctl(c->epollfd, EPOLL_CTL_DEL, c->fd_tap, NULL); close(c->fd_tap); c->fd_tap = -1; + if (c->mode == MODE_VU) + vu_cleanup(c->vdev); } /** @@ -1180,11 +1197,17 @@ static void tap_sock_unix_init(struct ctx *c) ev.data.u64 = ref.u64; epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_tap_listen, &ev); - info("\nYou can now start qemu (>= 7.2, with commit 13c6be96618c):"); - info(" kvm ... -device virtio-net-pci,netdev=s -netdev stream,id=s,server=off,addr.type=unix,addr.path=%s", - c->sock_path); - info("or qrap, for earlier qemu versions:"); - info(" ./qrap 5 kvm ... -net socket,fd=5 -net nic,model=virtio"); + if (c->mode == MODE_VU) { + info("You can start qemu with:"); + info(" kvm ... -chardev socket,id=chr0,path=%s -netdev vhost-user,id=netdev0,chardev=chr0 -device virtio-net,netdev=netdev0 -object memory-backend-memfd,id=memfd0,share=on,size=$RAMSIZE -numa node,memdev=memfd0\n", + c->sock_path); + } else { + info("\nYou can now start qemu (>= 7.2, with commit 13c6be96618c):"); + info(" kvm ... -device virtio-net-pci,netdev=s -netdev stream,id=s,server=off,addr.type=unix,addr.path=%s", + c->sock_path); + info("or qrap, for earlier qemu versions:"); + info(" ./qrap 5 kvm ... -net socket,fd=5 -net nic,model=virtio"); + } } /** @@ -1194,8 +1217,8 @@ static void tap_sock_unix_init(struct ctx *c) */ void tap_listen_handler(struct ctx *c, uint32_t events) { - union epoll_ref ref = { .type = EPOLL_TYPE_TAP_PASST }; struct epoll_event ev = { 0 }; + union epoll_ref ref; int v = INT_MAX / 2; struct ucred ucred; socklen_t len; @@ -1235,7 +1258,13 @@ void tap_listen_handler(struct ctx *c, uint32_t events) trace("tap: failed to set SO_SNDBUF to %i", v); ref.fd = c->fd_tap; - ev.events = EPOLLIN | EPOLLET | EPOLLRDHUP; + if (c->mode == MODE_VU) { + ref.type = EPOLL_TYPE_VHOST_CMD; + ev.events = EPOLLIN | EPOLLRDHUP; + } else { + ref.type = EPOLL_TYPE_TAP_PASST; + ev.events = EPOLLIN | EPOLLRDHUP | EPOLLET; + } ev.data.u64 = ref.u64; epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_tap, &ev); } @@ -1296,21 +1325,47 @@ static void tap_sock_tun_init(struct ctx *c) epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_tap, &ev); } +void tap_sock_update_buf(void *base, size_t size) +{ + int i; + + pool_tap4_storage.buf = base; + pool_tap4_storage.buf_size = size; + pool_tap6_storage.buf = base; + pool_tap6_storage.buf_size = size; + + for (i = 0; i < TAP_SEQS; i++) { + tap4_l4[i].p.buf = base; + tap4_l4[i].p.buf_size = size; + tap6_l4[i].p.buf = base; + tap6_l4[i].p.buf_size = size; + } +} + /** * tap_sock_init() - Create and set up AF_UNIX socket or tuntap file descriptor * @c: Execution context */ void tap_sock_init(struct ctx *c) { - size_t sz = sizeof(pkt_buf); + size_t sz; + char *buf; int i; - pool_tap4_storage = PACKET_INIT(pool_tap4, TAP_MSGS, pkt_buf, sz); - pool_tap6_storage = PACKET_INIT(pool_tap6, TAP_MSGS, pkt_buf, sz); + if (c->mode == MODE_VU) { + buf = NULL; + sz = 0; + } else { + buf = pkt_buf; + sz = sizeof(pkt_buf); + } + + pool_tap4_storage = PACKET_INIT(pool_tap4, TAP_MSGS, buf, sz); + pool_tap6_storage = PACKET_INIT(pool_tap6, TAP_MSGS, buf, sz); for (i = 0; i < TAP_SEQS; i++) { - tap4_l4[i].p = PACKET_INIT(pool_l4, UIO_MAXIOV, pkt_buf, sz); - tap6_l4[i].p = PACKET_INIT(pool_l4, UIO_MAXIOV, pkt_buf, sz); + tap4_l4[i].p = PACKET_INIT(pool_l4, UIO_MAXIOV, buf, sz); + tap6_l4[i].p = PACKET_INIT(pool_l4, UIO_MAXIOV, buf, sz); } if (c->fd_tap != -1) { /* Passed as --fd */ @@ -1319,12 +1374,21 @@ void tap_sock_init(struct ctx *c) ASSERT(c->one_off); ref.fd = c->fd_tap; - if (c->mode == MODE_PASST) + switch (c->mode) { + case MODE_PASST: ref.type = EPOLL_TYPE_TAP_PASST; - else + ev.events = EPOLLIN | EPOLLET | EPOLLRDHUP; + break; + case MODE_PASTA: ref.type = EPOLL_TYPE_TAP_PASTA; + ev.events = EPOLLIN | EPOLLET | EPOLLRDHUP; + break; + case MODE_VU: + ref.type = EPOLL_TYPE_VHOST_CMD; + ev.events = EPOLLIN | EPOLLRDHUP; + break; + } - ev.events = EPOLLIN | EPOLLET | EPOLLRDHUP; ev.data.u64 = ref.u64; epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_tap, &ev); return; diff --git a/tap.h b/tap.h index d496bd0e4b99..d9c6d4f57093 100644 --- a/tap.h +++ b/tap.h @@ -40,7 +40,8 @@ static inline struct iovec tap_hdr_iov(const struct ctx *c, */ static inline void tap_hdr_update(struct tap_hdr *thdr, size_t l2len) { - thdr->vnet_len = htonl(l2len); + if (thdr) + thdr->vnet_len = htonl(l2len); } struct in_addr tap_ip4_daddr(const struct ctx *c); @@ -69,6 +70,8 @@ void tap_handler_pasta(struct ctx *c, uint32_t events, void tap_handler_passt(struct ctx *c, uint32_t events, const struct timespec *now); int tap_sock_unix_open(char *sock_path); +void tap_sock_reset(struct ctx *c); +void tap_sock_update_buf(void *base, size_t size); void tap_sock_init(struct ctx *c); void tap_flush_pools(void); void tap_handler(struct ctx *c, const struct timespec *now); diff --git a/tcp.c b/tcp.c index 698e7ecb821a..424486ff73be 100644 --- a/tcp.c +++ b/tcp.c @@ -304,6 +304,7 @@ #include "flow_table.h" #include "tcp_internal.h" #include "tcp_buf.h" +#include "tcp_vu.h" #define TCP_HASH_TABLE_LOAD 70 /* % */ #define TCP_HASH_TABLE_SIZE (FLOW_MAX * 100 / TCP_HASH_TABLE_LOAD) @@ -1049,7 +1050,10 @@ static size_t tcp_fill_headers4(const struct ctx *c, tcp_fill_header(th, conn, seq); - tcp_update_check_tcp4(iph, th); + if (c->mode != MODE_VU) + tcp_update_check_tcp4(iph, th); + else + th->check = 0; tap_hdr_update(taph, l3len + sizeof(struct ethhdr)); @@ -1094,7 +1098,10 @@ static size_t tcp_fill_headers6(const struct ctx *c, tcp_fill_header(th, conn, seq); - tcp_update_check_tcp6(ip6h, th); + if (c->mode != MODE_VU) + tcp_update_check_tcp6(ip6h, th); + else + th->check = 0; tap_hdr_update(taph, l4len + sizeof(*ip6h) + sizeof(struct ethhdr)); @@ -1362,6 +1369,9 @@ int tcp_prepare_flags(struct ctx *c, struct tcp_tap_conn *conn, */ int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) { + if (c->mode == MODE_VU) + return tcp_vu_send_flag(c, conn, flags); + return tcp_buf_send_flag(c, conn, flags); } @@ -1814,6 +1824,9 @@ static int tcp_sock_consume(const struct tcp_tap_conn *conn, uint32_t ack_seq) */ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) { + if (c->mode == MODE_VU) + return tcp_vu_data_from_sock(c, conn); + return tcp_buf_data_from_sock(c, conn); } diff --git a/tcp_vu.c b/tcp_vu.c new file mode 100644 index 000000000000..24fee3f63278 --- /dev/null +++ b/tcp_vu.c @@ -0,0 +1,560 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later + * Copyright Red Hat + * Author: Laurent Vivier + * + * tcp_vu.c - TCP L2 vhost-user management functions + */ + +#include +#include +#include + +#include + +#include + +#include +#include + +#include "util.h" +#include "ip.h" +#include "passt.h" +#include "siphash.h" +#include "inany.h" +#include "vhost_user.h" +#include "tcp.h" +#include "pcap.h" +#include "flow.h" +#include "tcp_conn.h" +#include "flow_table.h" +#include "tcp_vu.h" +#include "tcp_internal.h" +#include "checksum.h" + +#define CONN_V4(conn) (!!inany_v4(&(conn)->faddr)) +#define CONN_V6(conn) (!CONN_V4(conn)) + +/** + * struct tcp_payload_t - TCP header and data to send segments with payload + * @th: TCP header + * @data: TCP data + */ +struct tcp_payload_t { + struct tcphdr th; + uint8_t data[IP_MAX_MTU - sizeof(struct tcphdr)]; +}; + +/** + * struct tcp_flags_t - TCP header and data to send zero-length + * segments (flags) + * @th: TCP header + * @opts TCP options + */ +struct tcp_flags_t { + struct tcphdr th; + char opts[OPT_MSS_LEN + OPT_WS_LEN + 1]; +}; + +/* vhost-user */ +static const struct virtio_net_hdr vu_header = { + .flags = VIRTIO_NET_HDR_F_DATA_VALID, + .gso_type = VIRTIO_NET_HDR_GSO_NONE, +}; + +int tcp_vu_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) +{ + struct vu_dev *vdev = c->vdev; + struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE]; + size_t l2len, vnet_hdrlen, l4len, optlen; + struct virtio_net_hdr_mrg_rxbuf *vh; + struct iovec l2_iov[TCP_NUM_IOVS]; + struct vu_virtq_element elem; + struct iovec in_sg; + struct ethhdr *eh; + int nb_ack; + int ret; + + elem.out_num = 0; + elem.out_sg = NULL; + elem.in_num = 1; + elem.in_sg = &in_sg; + ret = vu_queue_pop(vdev, vq, &elem); + if (ret < 0) + return 0; + + if (elem.in_num < 1) { + err("virtio-net receive queue contains no in buffers"); + vu_queue_rewind(vdev, vq, 1); + return 0; + } + + vh = elem.in_sg[0].iov_base; + + vh->hdr = vu_header; + if (vu_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF)) { + vnet_hdrlen = sizeof(*vh); + vh->num_buffers = htole16(1); + } else { + vnet_hdrlen = sizeof(vh->hdr); + } + + l2_iov[TCP_IOV_TAP].iov_base = NULL; + l2_iov[TCP_IOV_TAP].iov_len = 0; + l2_iov[TCP_IOV_ETH].iov_base = (char *)elem.in_sg[0].iov_base + vnet_hdrlen; + l2_iov[TCP_IOV_ETH].iov_len = sizeof(struct ethhdr); + + eh = l2_iov[TCP_IOV_ETH].iov_base; + + memcpy(eh->h_dest, c->mac_guest, sizeof(eh->h_dest)); + memcpy(eh->h_source, c->mac, sizeof(eh->h_source)); + + if (CONN_V4(conn)) { + struct tcp_flags_t *payload; + struct iphdr *iph; + uint32_t seq; + + l2_iov[TCP_IOV_IP].iov_base = (char *)l2_iov[TCP_IOV_ETH].iov_base + + l2_iov[TCP_IOV_ETH].iov_len; + l2_iov[TCP_IOV_IP].iov_len = sizeof(struct iphdr); + l2_iov[TCP_IOV_PAYLOAD].iov_base = (char *)l2_iov[TCP_IOV_IP].iov_base + + l2_iov[TCP_IOV_IP].iov_len; + + eh->h_proto = htons(ETH_P_IP); + + iph = l2_iov[TCP_IOV_IP].iov_base; + *iph = (struct iphdr)L2_BUF_IP4_INIT(IPPROTO_TCP); + + payload = l2_iov[TCP_IOV_PAYLOAD].iov_base; + payload->th = (struct tcphdr){ + .doff = offsetof(struct tcp_flags_t, opts) / 4, + .ack = 1 + }; + + seq = conn->seq_to_tap; + ret = tcp_prepare_flags(c, conn, flags, &payload->th, payload->opts, &optlen); + if (ret <= 0) { + vu_queue_rewind(vdev, vq, 1); + return ret; + } + + l4len = tcp_l2_buf_fill_headers(c, conn, l2_iov, optlen, NULL, + seq); + /* cppcheck-suppress unreadVariable */ + l2_iov[TCP_IOV_PAYLOAD].iov_len = l4len; + + l2len = l4len + sizeof(*iph) + sizeof(struct ethhdr); + + if (*c->pcap) { + struct in_addr saddr, daddr; + uint32_t sum; + + saddr.s_addr = iph->saddr; + daddr.s_addr = iph->daddr; + sum = proto_ipv4_header_psum(l4len, + IPPROTO_TCP, + saddr, daddr); + + payload->th.check = 0; + payload->th.check = csum(&payload->th, optlen + sizeof(struct tcphdr), sum); + } + } else { + struct tcp_flags_t *payload; + struct ipv6hdr *ip6h; + uint32_t seq; + + l2_iov[TCP_IOV_IP].iov_base = (char *)l2_iov[TCP_IOV_ETH].iov_base + + l2_iov[TCP_IOV_ETH].iov_len; + l2_iov[TCP_IOV_IP].iov_len = sizeof(struct ipv6hdr); + l2_iov[TCP_IOV_PAYLOAD].iov_base = (char *)l2_iov[TCP_IOV_IP].iov_base + + l2_iov[TCP_IOV_IP].iov_len; + + eh->h_proto = htons(ETH_P_IPV6); + + ip6h = l2_iov[TCP_IOV_IP].iov_base; + *ip6h = (struct ipv6hdr)L2_BUF_IP6_INIT(IPPROTO_TCP); + + payload = l2_iov[TCP_IOV_PAYLOAD].iov_base; + payload->th = (struct tcphdr){ + .doff = offsetof(struct tcp_flags_t, opts) / 4, + .ack = 1 + }; + + seq = conn->seq_to_tap; + ret = tcp_prepare_flags(c, conn, flags, &payload->th, payload->opts, &optlen); + if (ret <= 0) { + vu_queue_rewind(vdev, vq, 1); + return ret; + } + + l4len = tcp_l2_buf_fill_headers(c, conn, l2_iov, optlen, NULL, + seq); + /* cppcheck-suppress unreadVariable */ + l2_iov[TCP_IOV_PAYLOAD].iov_len = l4len; + + l2len = l4len + sizeof(*ip6h) + sizeof(struct ethhdr); + + if (*c->pcap) { + uint32_t sum = proto_ipv6_header_psum(l4len, + IPPROTO_TCP, + &ip6h->saddr, + &ip6h->daddr); + + payload->th.check = 0; + payload->th.check = csum(&payload->th, optlen + sizeof(struct tcphdr), sum); + } + } + + pcap((void *)eh, l2len); + + l2len += vnet_hdrlen; + ASSERT(l2len <= elem.in_sg[0].iov_len); + + vu_queue_fill(vq, &elem, l2len, 0); + nb_ack = 1; + + if (flags & DUP_ACK) { + struct vu_virtq_element elem_dup; + struct iovec in_sg_dup; + + elem_dup.out_num = 0; + elem_dup.out_sg = NULL; + elem_dup.in_num = 1; + elem_dup.in_sg = &in_sg_dup; + ret = vu_queue_pop(vdev, vq, &elem_dup); + if (ret == 0) { + if (elem_dup.in_num < 1 || elem_dup.in_sg[0].iov_len < l2len) { + vu_queue_rewind(vdev, vq, 1); + } else { + memcpy(elem_dup.in_sg[0].iov_base, vh, l2len); + nb_ack++; + } + } + } + + vu_queue_flush(vq, nb_ack); + vu_queue_notify(vdev, vq); + + return 0; +} + +int tcp_vu_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) +{ + uint32_t wnd_scaled = conn->wnd_from_tap << conn->ws_from_tap; + static struct iovec iov_vu[VIRTQUEUE_MAX_SIZE]; + static struct vu_virtq_element elem[VIRTQUEUE_MAX_SIZE]; + static struct iovec in_sg[VIRTQUEUE_MAX_SIZE]; + struct vu_dev *vdev = c->vdev; + struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE]; + size_t l2_hdrlen, vnet_hdrlen, fillsize; + int s = conn->sock, v4 = CONN_V4(conn); + struct iovec l2_iov[TCP_NUM_IOVS]; + int i, ret, iov_cnt, iov_used; + struct msghdr mh_sock = { 0 }; + uint16_t mss = MSS_GET(conn); + static int in_sg_count; + uint32_t already_sent; + const uint16_t *check; + struct iovec *first; + bool has_mrg_rxbuf; + int segment_size; + int num_buffers; + ssize_t len; + + if (!vu_queue_enabled(vq) || !vu_queue_started(vq)) { + flow_err(conn, + "Got packet, but no available descriptors on RX virtq."); + return 0; + } + + already_sent = conn->seq_to_tap - conn->seq_ack_from_tap; + + if (SEQ_LT(already_sent, 0)) { + /* RFC 761, section 2.1. */ + flow_trace(conn, "ACK sequence gap: ACK for %u, sent: %u", + conn->seq_ack_from_tap, conn->seq_to_tap); + conn->seq_to_tap = conn->seq_ack_from_tap; + already_sent = 0; + } + + if (!wnd_scaled || already_sent >= wnd_scaled) { + conn_flag(c, conn, STALLED); + conn_flag(c, conn, ACK_FROM_TAP_DUE); + return 0; + } + + /* Set up buffer descriptors we'll fill completely and partially. */ + + fillsize = wnd_scaled; + + iov_vu[0].iov_base = tcp_buf_discard; + iov_vu[0].iov_len = already_sent; + fillsize -= already_sent; + + has_mrg_rxbuf = vu_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF); + if (has_mrg_rxbuf) + vnet_hdrlen = sizeof(struct virtio_net_hdr_mrg_rxbuf); + else + vnet_hdrlen = sizeof(struct virtio_net_hdr); + l2_hdrlen = vnet_hdrlen + sizeof(struct ethhdr) + sizeof(struct tcphdr); + if (v4) + l2_hdrlen += sizeof(struct iphdr); + else + l2_hdrlen += sizeof(struct ipv6hdr); + + iov_cnt = 0; + in_sg_count = 0; + segment_size = 0; + while (fillsize > 0 && iov_cnt < VIRTQUEUE_MAX_SIZE - 1 && + in_sg_count < ARRAY_SIZE(in_sg)) { + + elem[iov_cnt].out_num = 0; + elem[iov_cnt].out_sg = NULL; + elem[iov_cnt].in_num = ARRAY_SIZE(in_sg) - in_sg_count; + elem[iov_cnt].in_sg = &in_sg[in_sg_count]; + ret = vu_queue_pop(vdev, vq, &elem[iov_cnt]); + if (ret < 0) + break; + + if (elem[iov_cnt].in_num < 1) { + err("virtio-net receive queue contains no in buffers"); + goto err; + } + in_sg_count += elem[iov_cnt].in_num; + + ASSERT(elem[iov_cnt].in_num == 1); + ASSERT(elem[iov_cnt].in_sg[0].iov_len >= l2_hdrlen); + + if (segment_size == 0) { + iov_vu[iov_cnt + 1].iov_base = + (char *)elem[iov_cnt].in_sg[0].iov_base + l2_hdrlen; + iov_vu[iov_cnt + 1].iov_len = + elem[iov_cnt].in_sg[0].iov_len - l2_hdrlen; + } else { + iov_vu[iov_cnt + 1].iov_base = elem[iov_cnt].in_sg[0].iov_base; + iov_vu[iov_cnt + 1].iov_len = elem[iov_cnt].in_sg[0].iov_len; + } + + if (iov_vu[iov_cnt + 1].iov_len > fillsize) + iov_vu[iov_cnt + 1].iov_len = fillsize; + + segment_size += iov_vu[iov_cnt + 1].iov_len; + if (!has_mrg_rxbuf) { + segment_size = 0; + } else if (segment_size >= mss) { + iov_vu[iov_cnt + 1].iov_len -= segment_size - mss; + segment_size = 0; + } + fillsize -= iov_vu[iov_cnt + 1].iov_len; + + iov_cnt++; + } + if (iov_cnt == 0) + return 0; + + ret = 0; + mh_sock.msg_iov = iov_vu; + mh_sock.msg_iovlen = iov_cnt + 1; + + do + len = recvmsg(s, &mh_sock, MSG_PEEK); + while (len < 0 && errno == EINTR); + + if (len < 0) + goto err; + + if (!len) { + vu_queue_rewind(vdev, vq, iov_cnt); + if ((conn->events & (SOCK_FIN_RCVD | TAP_FIN_SENT)) == SOCK_FIN_RCVD) { + ret = tcp_vu_send_flag(c, conn, FIN | ACK); + if (ret) { + tcp_rst(c, conn); + return ret; + } + + conn_event(c, conn, TAP_FIN_SENT); + } + + return 0; + } + + len -= already_sent; + if (len <= 0) { + conn_flag(c, conn, STALLED); + vu_queue_rewind(vdev, vq, iov_cnt); + return 0; + } + + conn_flag(c, conn, ~STALLED); + + /* Likely, some new data was acked too. */ + tcp_update_seqack_wnd(c, conn, 0, NULL); + + /* initialize headers */ + iov_used = 0; + num_buffers = 0; + check = NULL; + segment_size = 0; + for (i = 0; i < iov_cnt && len; i++) { + + if (segment_size == 0) + first = &iov_vu[i + 1]; + + if (iov_vu[i + 1].iov_len > (size_t)len) + iov_vu[i + 1].iov_len = len; + + len -= iov_vu[i + 1].iov_len; + iov_used++; + + segment_size += iov_vu[i + 1].iov_len; + num_buffers++; + + if (segment_size >= mss || len == 0 || + i + 1 == iov_cnt || !has_mrg_rxbuf) { + char *base = (char *)first->iov_base - l2_hdrlen; + size_t size = first->iov_len + l2_hdrlen; + struct virtio_net_hdr_mrg_rxbuf *vh; + struct ethhdr *eh; + size_t l4len; + + vh = (struct virtio_net_hdr_mrg_rxbuf *)base; + + vh->hdr = vu_header; + if (has_mrg_rxbuf) + vh->num_buffers = htole16(num_buffers); + + l2_iov[TCP_IOV_TAP].iov_base = NULL; + l2_iov[TCP_IOV_TAP].iov_len = 0; + l2_iov[TCP_IOV_ETH].iov_base = base + vnet_hdrlen; + l2_iov[TCP_IOV_ETH].iov_len = sizeof(struct ethhdr); + + eh = l2_iov[TCP_IOV_ETH].iov_base; + + memcpy(eh->h_dest, c->mac_guest, sizeof(eh->h_dest)); + memcpy(eh->h_source, c->mac, sizeof(eh->h_source)); + + /* initialize header */ + if (v4) { + struct tcp_payload_t *payload; + struct iphdr *iph; + + l2_iov[TCP_IOV_IP].iov_base = (char *)l2_iov[TCP_IOV_ETH].iov_base + + l2_iov[TCP_IOV_ETH].iov_len; + l2_iov[TCP_IOV_IP].iov_len = sizeof(struct iphdr); + l2_iov[TCP_IOV_PAYLOAD].iov_base = (char *)l2_iov[TCP_IOV_IP].iov_base + + l2_iov[TCP_IOV_IP].iov_len; + + + eh->h_proto = htons(ETH_P_IP); + + iph = l2_iov[TCP_IOV_IP].iov_base; + *iph = (struct iphdr)L2_BUF_IP4_INIT(IPPROTO_TCP); + payload = l2_iov[TCP_IOV_PAYLOAD].iov_base; + payload->th = (struct tcphdr){ + .doff = offsetof(struct tcp_payload_t, data) / 4, + .ack = 1 + }; + + l4len = tcp_l2_buf_fill_headers(c, conn, l2_iov, + segment_size, + len ? check : NULL, + conn->seq_to_tap); + l2_iov[TCP_IOV_PAYLOAD].iov_len = l4len; + + if (*c->pcap) { + struct in_addr saddr, daddr; + uint32_t sum; + + saddr.s_addr = iph->saddr; + daddr.s_addr = iph->daddr; + sum = proto_ipv4_header_psum(l4len, + IPPROTO_TCP, + saddr, daddr); + first->iov_base = &payload->th; + first->iov_len = size - l2_hdrlen + sizeof(struct tcphdr); + payload->th.check = 0; + payload->th.check = csum_iov(first, num_buffers, sum); + } + + check = &iph->check; + } else { + struct tcp_payload_t *payload; + struct ipv6hdr *ip6h; + + l2_iov[TCP_IOV_IP].iov_base = (char *)l2_iov[TCP_IOV_ETH].iov_base + + l2_iov[TCP_IOV_ETH].iov_len; + l2_iov[TCP_IOV_IP].iov_len = sizeof(struct ipv6hdr); + l2_iov[TCP_IOV_PAYLOAD].iov_base = (char *)l2_iov[TCP_IOV_IP].iov_base + + l2_iov[TCP_IOV_IP].iov_len; + + + eh->h_proto = htons(ETH_P_IPV6); + + ip6h = l2_iov[TCP_IOV_IP].iov_base; + *ip6h = (struct ipv6hdr)L2_BUF_IP6_INIT(IPPROTO_TCP); + + payload = l2_iov[TCP_IOV_PAYLOAD].iov_base; + payload->th = (struct tcphdr){ + .doff = offsetof(struct tcp_payload_t, data) / 4, + .ack = 1 + }; +; + l4len = tcp_l2_buf_fill_headers(c, conn, l2_iov, + segment_size, + NULL, conn->seq_to_tap); + l2_iov[TCP_IOV_PAYLOAD].iov_len = l4len; + + if (*c->pcap) { + uint32_t sum = proto_ipv6_header_psum(l4len, + IPPROTO_TCP, + &ip6h->saddr, + &ip6h->daddr); + + first->iov_base = &payload->th; + first->iov_len = size - l2_hdrlen + sizeof(struct tcphdr); + + payload->th.check = 0; + payload->th.check = csum_iov(first, num_buffers, sum); + } + } + + /* set iov for pcap logging */ + first->iov_base = eh; + first->iov_len = size - vnet_hdrlen; + + pcap_iov(first, num_buffers); + + /* set iov_len for vu_queue_fill_by_index(); */ + + first->iov_base = base; + first->iov_len = size; + + conn->seq_to_tap += segment_size; + + segment_size = 0; + num_buffers = 0; + } + } + + /* release unused buffers */ + vu_queue_rewind(vdev, vq, iov_cnt - iov_used); + + /* send packets */ + for (i = 0; i < iov_used; i++) + vu_queue_fill(vq, &elem[i], iov_vu[i + 1].iov_len, i); + + vu_queue_flush(vq, iov_used); + vu_queue_notify(vdev, vq); + + conn_flag(c, conn, ACK_FROM_TAP_DUE); + + return 0; +err: + vu_queue_rewind(vdev, vq, iov_cnt); + + if (errno != EAGAIN && errno != EWOULDBLOCK) { + ret = -errno; + tcp_rst(c, conn); + } + + return ret; +} diff --git a/tcp_vu.h b/tcp_vu.h new file mode 100644 index 000000000000..99daba5b34ed --- /dev/null +++ b/tcp_vu.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later + * Copyright Red Hat + * Author: Laurent Vivier + */ + +#ifndef TCP_VU_H +#define TCP_VU_H + +int tcp_vu_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags); +int tcp_vu_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn); + +#endif /*TCP_VU_H */ diff --git a/udp.c b/udp.c index e089ef952b8a..2560094d4c21 100644 --- a/udp.c +++ b/udp.c @@ -121,9 +121,7 @@ #include "tap.h" #include "pcap.h" #include "log.h" - -#define UDP_CONN_TIMEOUT 180 /* s, timeout for ephemeral or local bind */ -#define UDP_MAX_FRAMES 32 /* max # of frames to receive at once */ +#include "udp_internal.h" /** * struct udp_tap_port - Port tracking based on tap-facing source port @@ -171,20 +169,8 @@ static uint8_t udp_act[IP_VERSIONS][UDP_ACT_TYPE_MAX][DIV_ROUND_UP(NUM_PORTS, 8) /* Static buffers */ -/** - * struct udp_payload_t - UDP header and data for inbound messages - * @uh: UDP header - * @data: UDP data - */ -static struct udp_payload_t { - struct udphdr uh; - char data[USHRT_MAX - sizeof(struct udphdr)]; -#ifdef __AVX2__ -} __attribute__ ((packed, aligned(32))) -#else -} __attribute__ ((packed, aligned(__alignof__(unsigned int)))) -#endif -udp_payload[UDP_MAX_FRAMES]; +/* UDP header and data for inbound messages */ +static struct udp_payload_t udp_payload[UDP_MAX_FRAMES]; /* Ethernet header for IPv4 frames */ static struct ethhdr udp4_eth_hdr; @@ -241,11 +227,11 @@ static struct mmsghdr udp6_l2_mh_sock [UDP_MAX_FRAMES]; /* recvmmsg()/sendmmsg() data for "spliced" connections */ static struct iovec udp_iov_splice [UDP_MAX_FRAMES]; -static struct sockaddr_in udp4_localname = { +struct sockaddr_in udp4_localname = { .sin_family = AF_INET, .sin_addr = IN4ADDR_LOOPBACK_INIT, }; -static struct sockaddr_in6 udp6_localname = { +struct sockaddr_in6 udp6_localname = { .sin6_family = AF_INET6, .sin6_addr = IN6ADDR_LOOPBACK_INIT, }; @@ -594,11 +580,11 @@ out: * * Return: size of IPv4 payload (UDP header + data) */ -static size_t udp_update_hdr4(const struct ctx *c, - struct iphdr *ip4h, const struct sockaddr_in *s_in, - struct udp_payload_t *bp, - in_port_t dstport, size_t dlen, - const struct timespec *now) +size_t udp_update_hdr4(const struct ctx *c, + struct iphdr *ip4h, const struct sockaddr_in *s_in, + struct udp_payload_t *bp, + in_port_t dstport, size_t dlen, + const struct timespec *now) { const struct in_addr dst = c->ip4.addr_seen; in_port_t srcport = ntohs(s_in->sin_port); @@ -633,7 +619,10 @@ static size_t udp_update_hdr4(const struct ctx *c, bp->uh.source = s_in->sin_port; bp->uh.dest = htons(dstport); bp->uh.len = htons(l4len); - csum_udp4(&bp->uh, src, dst, bp->data, dlen); + if (c->mode != MODE_VU) + csum_udp4(&bp->uh, src, dst, bp->data, dlen); + else + bp->uh.check = 0; return l4len; } @@ -650,11 +639,11 @@ static size_t udp_update_hdr4(const struct ctx *c, * * Return: size of IPv6 payload (UDP header + data) */ -static size_t udp_update_hdr6(const struct ctx *c, - struct ipv6hdr *ip6h, struct sockaddr_in6 *s_in6, - struct udp_payload_t *bp, - in_port_t dstport, size_t dlen, - const struct timespec *now) +size_t udp_update_hdr6(const struct ctx *c, + struct ipv6hdr *ip6h, struct sockaddr_in6 *s_in6, + struct udp_payload_t *bp, + in_port_t dstport, size_t dlen, + const struct timespec *now) { const struct in6_addr *src = &s_in6->sin6_addr; const struct in6_addr *dst = &c->ip6.addr_seen; @@ -705,7 +694,10 @@ static size_t udp_update_hdr6(const struct ctx *c, bp->uh.source = s_in6->sin6_port; bp->uh.dest = htons(dstport); bp->uh.len = ip6h->payload_len; - csum_udp6(&bp->uh, src, dst, bp->data, dlen); + if (c->mode != MODE_VU) + csum_udp6(&bp->uh, src, dst, bp->data, dlen); + else + bp->uh.check = 0xffff; /* zero checksum is invalid with IPv6 */ return l4len; } diff --git a/udp_internal.h b/udp_internal.h new file mode 100644 index 000000000000..898d1e103cb8 --- /dev/null +++ b/udp_internal.h @@ -0,0 +1,39 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later + * Copyright (c) 2021 Red Hat GmbH + * Author: Stefano Brivio + */ + +#ifndef UDP_INTERNAL_H +#define UDP_INTERNAL_H + +#define UDP_CONN_TIMEOUT 180 /* s, timeout for ephemeral or local bind */ +#define UDP_MAX_FRAMES 32 /* max # of frames to receive at once */ + +extern struct sockaddr_in udp4_localname; +extern struct sockaddr_in6 udp6_localname; + +/** + * struct udp_payload_t - UDP header and data for inbound messages + * @uh: UDP header + * @data: UDP data + */ +struct udp_payload_t { + struct udphdr uh; + char data[USHRT_MAX - sizeof(struct udphdr)]; +#ifdef __AVX2__ +} __attribute__ ((packed, aligned(32))); +#else +} __attribute__ ((packed, aligned(__alignof__(unsigned int)))); +#endif + +size_t udp_update_hdr4(const struct ctx *c, + struct iphdr *ip4h, const struct sockaddr_in *s_in, + struct udp_payload_t *bp, + in_port_t dstport, size_t dlen, + const struct timespec *now); +size_t udp_update_hdr6(const struct ctx *c, + struct ipv6hdr *ip6h, struct sockaddr_in6 *s_in6, + struct udp_payload_t *bp, + in_port_t dstport, size_t dlen, + const struct timespec *now); +#endif /* UDP_INTERNAL_H */ diff --git a/udp_vu.c b/udp_vu.c new file mode 100644 index 000000000000..22b9fbf53807 --- /dev/null +++ b/udp_vu.c @@ -0,0 +1,240 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later + * Copyright Red Hat + * Author: Laurent Vivier + * + * udp_vu.c - UDP L2 vhost-user management functions + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "checksum.h" +#include "util.h" +#include "ip.h" +#include "passt.h" +#include "pcap.h" +#include "log.h" +#include "vhost_user.h" +#include "udp_internal.h" +#include "udp_vu.h" + +/* vhost-user */ +static const struct virtio_net_hdr vu_header = { + .flags = VIRTIO_NET_HDR_F_DATA_VALID, + .gso_type = VIRTIO_NET_HDR_GSO_NONE, +}; + +static struct iovec iov_vu [VIRTQUEUE_MAX_SIZE]; +static struct vu_virtq_element elem [VIRTQUEUE_MAX_SIZE]; +static struct iovec in_sg[VIRTQUEUE_MAX_SIZE]; +static int in_sg_count; + +void udp_vu_sock_handler(const struct ctx *c, union epoll_ref ref, + uint32_t events, const struct timespec *now) +{ + struct vu_dev *vdev = c->vdev; + struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE]; + bool has_mrg_rxbuf, v6 = ref.udp.v6; + in_port_t dstport = ref.udp.port; + size_t l2_hdrlen, vnet_hdrlen; + struct msghdr msg; + int i, virtqueue_max; + + if (c->no_udp || !(events & EPOLLIN)) + return; + + has_mrg_rxbuf = vu_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF); + if (has_mrg_rxbuf) { + vnet_hdrlen = sizeof(struct virtio_net_hdr_mrg_rxbuf); + virtqueue_max = VIRTQUEUE_MAX_SIZE; + } else { + vnet_hdrlen = sizeof(struct virtio_net_hdr); + virtqueue_max = 1; + } + l2_hdrlen = vnet_hdrlen + sizeof(struct ethhdr) + sizeof(struct udphdr); + + if (v6) { + l2_hdrlen += sizeof(struct ipv6hdr); + + udp6_localname.sin6_port = htons(dstport); + msg.msg_name = &udp6_localname; + msg.msg_namelen = sizeof(udp6_localname); + } else { + l2_hdrlen += sizeof(struct iphdr); + + udp4_localname.sin_port = htons(dstport); + msg.msg_name = &udp4_localname; + msg.msg_namelen = sizeof(udp4_localname); + } + + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_flags = 0; + + for (i = 0; i < UDP_MAX_FRAMES; i++) { + struct virtio_net_hdr_mrg_rxbuf *vh; + size_t size, fillsize, off; + int iov_cnt, iov_used, idx; + struct ethhdr *eh; + ssize_t data_len; + size_t l4len; + char *base; + + fillsize = USHRT_MAX; + iov_cnt = 0; + in_sg_count = 0; + while (fillsize && iov_cnt < virtqueue_max && + in_sg_count < ARRAY_SIZE(in_sg)) { + int ret; + + elem[iov_cnt].out_num = 0; + elem[iov_cnt].out_sg = NULL; + elem[iov_cnt].in_num = ARRAY_SIZE(in_sg) - in_sg_count; + elem[iov_cnt].in_sg = &in_sg[in_sg_count]; + ret = vu_queue_pop(vdev, vq, &elem[iov_cnt]); + if (ret < 0) + break; + in_sg_count += elem[iov_cnt].in_num; + + if (elem[iov_cnt].in_num < 1) { + err("virtio-net receive queue contains no in buffers"); + vu_queue_rewind(vdev, vq, iov_cnt); + return; + } + ASSERT(elem[iov_cnt].in_num == 1); + ASSERT(elem[iov_cnt].in_sg[0].iov_len >= l2_hdrlen); + + if (iov_cnt == 0) { + base = elem[iov_cnt].in_sg[0].iov_base; + size = elem[iov_cnt].in_sg[0].iov_len; + + /* keep space for the headers */ + iov_vu[0].iov_base = base + l2_hdrlen; + iov_vu[0].iov_len = size - l2_hdrlen; + } else { + iov_vu[iov_cnt].iov_base = elem[iov_cnt].in_sg[0].iov_base; + iov_vu[iov_cnt].iov_len = elem[iov_cnt].in_sg[0].iov_len; + } + + if (iov_vu[iov_cnt].iov_len > fillsize) + iov_vu[iov_cnt].iov_len = fillsize; + + fillsize -= iov_vu[iov_cnt].iov_len; + + iov_cnt++; + } + if (iov_cnt == 0) + break; + + msg.msg_iov = iov_vu; + msg.msg_iovlen = iov_cnt; + + data_len = recvmsg(ref.fd, &msg, 0); + if (data_len < 0) { + vu_queue_rewind(vdev, vq, iov_cnt); + return; + } + + /* restore original values */ + iov_vu[0].iov_base = base; + iov_vu[0].iov_len = size; + + /* count the numbers of buffer filled by recvmsg() */ + idx = iov_skip_bytes(iov_vu, iov_cnt, l2_hdrlen + data_len, + &off); + /* adjust last iov length */ + if (idx < iov_cnt) + iov_vu[idx].iov_len = off; + iov_used = idx + !!off; + if (idx == 0) + size = iov_vu[0].iov_len; + + /* release unused buffers */ + vu_queue_rewind(vdev, vq, iov_cnt - iov_used); + + /* vnet_header */ + vh = (struct virtio_net_hdr_mrg_rxbuf *)base; + vh->hdr = vu_header; + if (has_mrg_rxbuf) + vh->num_buffers = htole16(iov_used); + + /* ethernet header */ + eh = (struct ethhdr *)(base + vnet_hdrlen); + + memcpy(eh->h_dest, c->mac_guest, sizeof(eh->h_dest)); + memcpy(eh->h_source, c->mac, sizeof(eh->h_source)); + + /* initialize header */ + if (v6) { + struct ipv6hdr *ip6h = (struct ipv6hdr *)(eh + 1); + struct udp_payload_t *bp = (struct udp_payload_t *)(ip6h + 1); + + eh->h_proto = htons(ETH_P_IPV6); + + *ip6h = (struct ipv6hdr)L2_BUF_IP6_INIT(IPPROTO_UDP); + + l4len = udp_update_hdr6(c, ip6h, &udp6_localname, bp, + dstport, data_len, now); + if (*c->pcap) { + uint32_t sum; + + sum = proto_ipv6_header_psum(l4len, IPPROTO_UDP, + &ip6h->saddr, + &ip6h->daddr); + + iov_vu[0].iov_base = &bp->uh; + iov_vu[0].iov_len = size - l2_hdrlen + + sizeof(bp->uh); + bp->uh.check = 0; /* by default, set to 0xffff */ + bp->uh.check = csum_iov(iov_vu, iov_used, sum); + } + } else { + struct iphdr *iph = (struct iphdr *)(eh + 1); + struct udp_payload_t *bp = (struct udp_payload_t *)(iph + 1); + + eh->h_proto = htons(ETH_P_IP); + + *iph = (struct iphdr)L2_BUF_IP4_INIT(IPPROTO_UDP); + + l4len = udp_update_hdr4(c, iph, &udp4_localname, bp, + dstport, data_len, now); + if (*c->pcap) { + uint32_t sum; + + sum = proto_ipv4_header_psum(l4len, IPPROTO_UDP, + /* cppcheck-suppress unknownEvaluationOrder */ + (struct in_addr){ .s_addr = iph->saddr }, + (struct in_addr){ .s_addr = iph->daddr }); + + iov_vu[0].iov_base = &bp->uh; + iov_vu[0].iov_len = size - l2_hdrlen + + sizeof(bp->uh); + bp->uh.check = csum_iov(iov_vu, iov_used, sum); + } + } + + /* set iov for pcap logging */ + iov_vu[0].iov_base = base + vnet_hdrlen; + iov_vu[0].iov_len = size - vnet_hdrlen; + pcap_iov(iov_vu, iov_used); + + /* set iov_len for vu_queue_fill_by_index(); */ + iov_vu[0].iov_base = base; + iov_vu[0].iov_len = size; + + /* send packets */ + for (i = 0; i < iov_used; i++) + vu_queue_fill(vq, &elem[i], iov_vu[i].iov_len, i); + + vu_queue_flush(vq, iov_used); + vu_queue_notify(vdev, vq); + } +} diff --git a/udp_vu.h b/udp_vu.h new file mode 100644 index 000000000000..d402980c21c1 --- /dev/null +++ b/udp_vu.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later + * Copyright Red Hat + * Author: Laurent Vivier + */ + +#ifndef UDP_VU_H +#define UDP_VU_H + +void udp_vu_sock_handler(const struct ctx *c, union epoll_ref ref, + uint32_t events, const struct timespec *now); +#endif /* UDP_VU_H */ diff --git a/vhost_user.c b/vhost_user.c index 23ec4326995d..496d9c1088ad 100644 --- a/vhost_user.c +++ b/vhost_user.c @@ -38,7 +38,6 @@ * this is part of the vhost-user backend * convention. */ -/* cppcheck-suppress unusedFunction */ void vu_print_capabilities(void) { printf("{\n"); @@ -149,8 +148,7 @@ static void vmsg_close_fds(const struct vhost_user_msg *vmsg) */ static void vu_remove_watch(const struct vu_dev *vdev, int fd) { - (void)vdev; - (void)fd; + epoll_ctl(vdev->context->epollfd, EPOLL_CTL_DEL, fd, NULL); } /** @@ -412,7 +410,6 @@ static bool map_ring(struct vu_dev *vdev, struct vu_virtq *vq) * * Return: 0 if the zone in a mapped memory region, -1 otherwise */ -/* cppcheck-suppress unusedFunction */ int vu_packet_check_range(void *buf, size_t offset, size_t len, const char *start) { @@ -503,6 +500,14 @@ static bool vu_set_mem_table_exec(struct vu_dev *vdev, } } + /* As vu_packet_check_range() has no access to the number of + * memory regions, mark the end of the array with mmap_addr = 0 + */ + ASSERT(vdev->nregions < VHOST_USER_MAX_RAM_SLOTS - 1); + vdev->regions[vdev->nregions].mmap_addr = 0; + + tap_sock_update_buf(vdev->regions, 0); + return false; } @@ -623,8 +628,12 @@ static bool vu_get_vring_base_exec(struct vu_dev *vdev, */ static void vu_set_watch(const struct vu_dev *vdev, int fd) { - (void)vdev; - (void)fd; + union epoll_ref ref = { .type = EPOLL_TYPE_VHOST_KICK, .fd = fd }; + struct epoll_event ev = { 0 }; + + ev.data.u64 = ref.u64; + ev.events = EPOLLIN; + epoll_ctl(vdev->context->epollfd, EPOLL_CTL_ADD, fd, &ev); } /** @@ -660,7 +669,6 @@ static int vu_wait_queue(const struct vu_virtq *vq) * * Return: number of bytes sent, -1 if there is an error */ -/* cppcheck-suppress unusedFunction */ int vu_send(struct vu_dev *vdev, const void *buf, size_t size) { size_t hdrlen = vdev->hdrlen; @@ -854,7 +862,6 @@ static void vu_handle_tx(struct vu_dev *vdev, int index) * @vdev: vhost-user device * @ref: epoll reference information */ -/* cppcheck-suppress unusedFunction */ void vu_kick_cb(struct vu_dev *vdev, union epoll_ref ref) { eventfd_t kick_data; @@ -1097,11 +1104,11 @@ static bool vu_set_vring_enable_exec(struct vu_dev *vdev, * @c: execution context * @vdev: vhost-user device */ -/* cppcheck-suppress unusedFunction */ void vu_init(struct ctx *c, struct vu_dev *vdev) { int i; + c->vdev = vdev; vdev->context = c; vdev->hdrlen = 0; for (i = 0; i < VHOST_USER_MAX_QUEUES; i++) @@ -1164,7 +1171,7 @@ void vu_cleanup(struct vu_dev *vdev) */ static void vu_sock_reset(struct vu_dev *vdev) { - (void)vdev; + tap_sock_reset(vdev->context); } /** @@ -1173,7 +1180,6 @@ static void vu_sock_reset(struct vu_dev *vdev) * @fd: vhost-user message socket * @events: epoll events */ -/* cppcheck-suppress unusedFunction */ void tap_handler_vu(struct vu_dev *vdev, int fd, uint32_t events) { struct vhost_user_msg msg = { 0 }; diff --git a/virtio.c b/virtio.c index d712f30cc33d..9e6e79382e2c 100644 --- a/virtio.c +++ b/virtio.c @@ -507,7 +507,6 @@ void vu_queue_unpop(struct vu_dev *dev, struct vu_virtq *vq, unsigned int index, * @vq: Virtqueue * @num: Number of element to unpop */ -/* cppcheck-suppress unusedFunction */ bool vu_queue_rewind(struct vu_dev *dev, struct vu_virtq *vq, unsigned int num) { (void)dev; -- 2.45.2

564

Age (days ago)

598

Last active (days ago)

List overview

Download

12 comments

3 participants

participants (3)

David Gibson
Laurent Vivier
Stefano Brivio

[PATCH v2 0/4] Add vhost-user support to passt. (part 3)

tags

participants (3)