Skip site navigation (1)Skip section navigation (2)
Date:      Sun, 21 Jul 2019 10:58:25 +0000 (UTC)
From:      Vincenzo Maffione <vmaffione@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-stable@freebsd.org, svn-src-stable-12@freebsd.org
Subject:   svn commit: r350193 - stable/12/usr.sbin/bhyve
Message-ID:  <201907211058.x6LAwPUs005642@repo.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: vmaffione
Date: Sun Jul 21 10:58:25 2019
New Revision: 350193
URL: https://svnweb.freebsd.org/changeset/base/350193

Log:
  MFC r349803
  
  bhyve: abstraction for network backends
  
  Bhyve can currently emulate two virtual NICs, namely virtio-net and e1000,
  and connect to the host network through two backends, namely tap and netmap.
  However, there is no interface between virtual NIC functionalities and
  backend functionalities. As a result, the backend code is duplicated between
  the two virtual NIC implementations and also within the same virtual NIC.
  Also, e1000 cannot currently use netmap as a backend.
  This patch introduces a network backend API between virtio-net/e1000 and
  tap/netmap, to improve code reuse and add missing functionalities.
  Virtual NICs and backends can negotiate virtio-net features, such as checksum
  offload and TSO. If the backend supports the features, it will propagate this
  information to the guest, so that the latter can make use of them. Currently,
  only netmap VALE ports support the features, but support should be added to
  tap in the future.
  
  Reviewed by:    jhb, bryanv
  Differential Revision:  https://reviews.freebsd.org/D20659

Added:
  stable/12/usr.sbin/bhyve/net_backends.c
     - copied unchanged from r349803, head/usr.sbin/bhyve/net_backends.c
  stable/12/usr.sbin/bhyve/net_backends.h
     - copied unchanged from r349803, head/usr.sbin/bhyve/net_backends.h
Modified:
  stable/12/usr.sbin/bhyve/Makefile
  stable/12/usr.sbin/bhyve/pci_e82545.c
  stable/12/usr.sbin/bhyve/pci_virtio_net.c

Modified: stable/12/usr.sbin/bhyve/Makefile
==============================================================================
--- stable/12/usr.sbin/bhyve/Makefile	Sun Jul 21 08:28:28 2019	(r350192)
+++ stable/12/usr.sbin/bhyve/Makefile	Sun Jul 21 10:58:25 2019	(r350193)
@@ -3,7 +3,7 @@
 #
 
 .include <src.opts.mk>
-CFLAGS+=-I${SRCTOP}/sys
+CFLAGS+=-I${SRCTOP}/sys -DWITHOUT_CAPSICUM
 .PATH:  ${SRCTOP}/sys/cam/ctl
 
 PROG=	bhyve
@@ -32,6 +32,7 @@ SRCS=	\
 	mem.c			\
 	mevent.c		\
 	mptbl.c			\
+	net_backends.c		\
 	net_utils.c		\
 	pci_ahci.c		\
 	pci_e82545.c		\

Copied: stable/12/usr.sbin/bhyve/net_backends.c (from r349803, head/usr.sbin/bhyve/net_backends.c)
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ stable/12/usr.sbin/bhyve/net_backends.c	Sun Jul 21 10:58:25 2019	(r350193, copy of r349803, head/usr.sbin/bhyve/net_backends.c)
@@ -0,0 +1,806 @@
+/*-
+ * Copyright (c) 2019 Vincenzo Maffione <vmaffione@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
+ * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
+ * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+ * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+ * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * This file implements multiple network backends (tap, netmap, ...),
+ * to be used by network frontends such as virtio-net and e1000.
+ * The API to access the backend (e.g. send/receive packets, negotiate
+ * features) is exported by net_backends.h.
+ */
+
+#include <sys/types.h>		/* u_short etc */
+#ifndef WITHOUT_CAPSICUM
+#include <sys/capsicum.h>
+#endif
+#include <sys/cdefs.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/uio.h>
+
+#include <net/if.h>
+#include <net/netmap.h>
+#include <net/netmap_virt.h>
+#define NETMAP_WITH_LIBS
+#include <net/netmap_user.h>
+
+#ifndef WITHOUT_CAPSICUM
+#include <capsicum_helpers.h>
+#endif
+#include <err.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <unistd.h>
+#include <sysexits.h>
+#include <assert.h>
+#include <pthread.h>
+#include <pthread_np.h>
+#include <poll.h>
+#include <assert.h>
+
+
+#include "iov.h"
+#include "mevent.h"
+#include "net_backends.h"
+
+#include <sys/linker_set.h>
+
+/*
+ * Each network backend registers a set of function pointers that are
+ * used to implement the net backends API.
+ * This might need to be exposed if we implement backends in separate files.
+ */
+struct net_backend {
+	const char *prefix;	/* prefix matching this backend */
+
+	/*
+	 * Routines used to initialize and cleanup the resources needed
+	 * by a backend. The cleanup function is used internally,
+	 * and should not be called by the frontend.
+	 */
+	int (*init)(struct net_backend *be, const char *devname,
+	    net_be_rxeof_t cb, void *param);
+	void (*cleanup)(struct net_backend *be);
+
+	/*
+	 * Called to serve a guest transmit request. The scatter-gather
+	 * vector provided by the caller has 'iovcnt' elements and contains
+	 * the packet to send.
+	 */
+	ssize_t (*send)(struct net_backend *be, struct iovec *iov, int iovcnt);
+
+	/*
+	 * Called to receive a packet from the backend. When the function
+	 * returns a positive value 'len', the scatter-gather vector
+	 * provided by the caller contains a packet with such length.
+	 * The function returns 0 if the backend doesn't have a new packet to
+	 * receive.
+	 */
+	ssize_t (*recv)(struct net_backend *be, struct iovec *iov, int iovcnt);
+
+	/*
+	 * Ask the backend for the virtio-net features it is able to
+	 * support. Possible features are TSO, UFO and checksum offloading
+	 * in both rx and tx direction and for both IPv4 and IPv6.
+	 */
+	uint64_t (*get_cap)(struct net_backend *be);
+
+	/*
+	 * Tell the backend to enable/disable the specified virtio-net
+	 * features (capabilities).
+	 */
+	int (*set_cap)(struct net_backend *be, uint64_t features,
+	    unsigned int vnet_hdr_len);
+
+	struct pci_vtnet_softc *sc;
+	int fd;
+
+	/*
+	 * Length of the virtio-net header used by the backend and the
+	 * frontend, respectively. A zero value means that the header
+	 * is not used.
+	 */
+	unsigned int be_vnet_hdr_len;
+	unsigned int fe_vnet_hdr_len;
+
+	/* Size of backend-specific private data. */
+	size_t priv_size;
+
+	/* Room for backend-specific data. */
+	char opaque[0];
+};
+
+SET_DECLARE(net_backend_set, struct net_backend);
+
+#define VNET_HDR_LEN	sizeof(struct virtio_net_rxhdr)
+
+#define WPRINTF(params) printf params
+
+/*
+ * The tap backend
+ */
+
+struct tap_priv {
+	struct mevent *mevp;
+};
+
+static void
+tap_cleanup(struct net_backend *be)
+{
+	struct tap_priv *priv = (struct tap_priv *)be->opaque;
+
+	if (priv->mevp) {
+		mevent_delete(priv->mevp);
+	}
+	if (be->fd != -1) {
+		close(be->fd);
+		be->fd = -1;
+	}
+}
+
+static int
+tap_init(struct net_backend *be, const char *devname,
+	 net_be_rxeof_t cb, void *param)
+{
+	struct tap_priv *priv = (struct tap_priv *)be->opaque;
+	char tbuf[80];
+	int fd;
+	int opt = 1;
+#ifndef WITHOUT_CAPSICUM
+	cap_rights_t rights;
+#endif
+
+	if (cb == NULL) {
+		WPRINTF(("TAP backend requires non-NULL callback\n"));
+		return (-1);
+	}
+
+	strcpy(tbuf, "/dev/");
+	strlcat(tbuf, devname, sizeof(tbuf));
+
+	fd = open(tbuf, O_RDWR);
+	if (fd == -1) {
+		WPRINTF(("open of tap device %s failed\n", tbuf));
+		goto error;
+	}
+
+	/*
+	 * Set non-blocking and register for read
+	 * notifications with the event loop
+	 */
+	if (ioctl(fd, FIONBIO, &opt) < 0) {
+		WPRINTF(("tap device O_NONBLOCK failed\n"));
+		goto error;
+	}
+
+#ifndef WITHOUT_CAPSICUM
+	cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE);
+	if (caph_rights_limit(fd, &rights) == -1)
+		errx(EX_OSERR, "Unable to apply rights for sandbox");
+#endif
+
+	priv->mevp = mevent_add(fd, EVF_READ, cb, param);
+	if (priv->mevp == NULL) {
+		WPRINTF(("Could not register event\n"));
+		goto error;
+	}
+
+	be->fd = fd;
+
+	return (0);
+
+error:
+	tap_cleanup(be);
+	return (-1);
+}
+
+/*
+ * Called to send a buffer chain out to the tap device
+ */
+static ssize_t
+tap_send(struct net_backend *be, struct iovec *iov, int iovcnt)
+{
+	return (writev(be->fd, iov, iovcnt));
+}
+
+static ssize_t
+tap_recv(struct net_backend *be, struct iovec *iov, int iovcnt)
+{
+	ssize_t ret;
+
+	/* Should never be called without a valid tap fd */
+	assert(be->fd != -1);
+
+	ret = readv(be->fd, iov, iovcnt);
+
+	if (ret < 0 && errno == EWOULDBLOCK) {
+		return (0);
+	}
+
+	return (ret);
+}
+
+static uint64_t
+tap_get_cap(struct net_backend *be)
+{
+
+	return (0); /* no capabilities for now */
+}
+
+static int
+tap_set_cap(struct net_backend *be, uint64_t features,
+		unsigned vnet_hdr_len)
+{
+
+	return ((features || vnet_hdr_len) ? -1 : 0);
+}
+
+static struct net_backend tap_backend = {
+	.prefix = "tap",
+	.priv_size = sizeof(struct tap_priv),
+	.init = tap_init,
+	.cleanup = tap_cleanup,
+	.send = tap_send,
+	.recv = tap_recv,
+	.get_cap = tap_get_cap,
+	.set_cap = tap_set_cap,
+};
+
+/* A clone of the tap backend, with a different prefix. */
+static struct net_backend vmnet_backend = {
+	.prefix = "vmnet",
+	.priv_size = sizeof(struct tap_priv),
+	.init = tap_init,
+	.cleanup = tap_cleanup,
+	.send = tap_send,
+	.recv = tap_recv,
+	.get_cap = tap_get_cap,
+	.set_cap = tap_set_cap,
+};
+
+DATA_SET(net_backend_set, tap_backend);
+DATA_SET(net_backend_set, vmnet_backend);
+
+/*
+ * The netmap backend
+ */
+
+/* The virtio-net features supported by netmap. */
+#define NETMAP_FEATURES (VIRTIO_NET_F_CSUM | VIRTIO_NET_F_HOST_TSO4 | \
+		VIRTIO_NET_F_HOST_TSO6 | VIRTIO_NET_F_HOST_UFO | \
+		VIRTIO_NET_F_GUEST_CSUM | VIRTIO_NET_F_GUEST_TSO4 | \
+		VIRTIO_NET_F_GUEST_TSO6 | VIRTIO_NET_F_GUEST_UFO)
+
+struct netmap_priv {
+	char ifname[IFNAMSIZ];
+	struct nm_desc *nmd;
+	uint16_t memid;
+	struct netmap_ring *rx;
+	struct netmap_ring *tx;
+	struct mevent *mevp;
+	net_be_rxeof_t cb;
+	void *cb_param;
+};
+
+static void
+nmreq_init(struct nmreq *req, char *ifname)
+{
+
+	memset(req, 0, sizeof(*req));
+	strlcpy(req->nr_name, ifname, sizeof(req->nr_name));
+	req->nr_version = NETMAP_API;
+}
+
+static int
+netmap_set_vnet_hdr_len(struct net_backend *be, int vnet_hdr_len)
+{
+	int err;
+	struct nmreq req;
+	struct netmap_priv *priv = (struct netmap_priv *)be->opaque;
+
+	nmreq_init(&req, priv->ifname);
+	req.nr_cmd = NETMAP_BDG_VNET_HDR;
+	req.nr_arg1 = vnet_hdr_len;
+	err = ioctl(be->fd, NIOCREGIF, &req);
+	if (err) {
+		WPRINTF(("Unable to set vnet header length %d\n",
+				vnet_hdr_len));
+		return (err);
+	}
+
+	be->be_vnet_hdr_len = vnet_hdr_len;
+
+	return (0);
+}
+
+static int
+netmap_has_vnet_hdr_len(struct net_backend *be, unsigned vnet_hdr_len)
+{
+	int prev_hdr_len = be->be_vnet_hdr_len;
+	int ret;
+
+	if (vnet_hdr_len == prev_hdr_len) {
+		return (1);
+	}
+
+	ret = netmap_set_vnet_hdr_len(be, vnet_hdr_len);
+	if (ret) {
+		return (0);
+	}
+
+	netmap_set_vnet_hdr_len(be, prev_hdr_len);
+
+	return (1);
+}
+
+static uint64_t
+netmap_get_cap(struct net_backend *be)
+{
+
+	return (netmap_has_vnet_hdr_len(be, VNET_HDR_LEN) ?
+	    NETMAP_FEATURES : 0);
+}
+
+static int
+netmap_set_cap(struct net_backend *be, uint64_t features,
+	       unsigned vnet_hdr_len)
+{
+
+	return (netmap_set_vnet_hdr_len(be, vnet_hdr_len));
+}
+
+static int
+netmap_init(struct net_backend *be, const char *devname,
+	    net_be_rxeof_t cb, void *param)
+{
+	struct netmap_priv *priv = (struct netmap_priv *)be->opaque;
+
+	strlcpy(priv->ifname, devname, sizeof(priv->ifname));
+	priv->ifname[sizeof(priv->ifname) - 1] = '\0';
+
+	priv->nmd = nm_open(priv->ifname, NULL, NETMAP_NO_TX_POLL, NULL);
+	if (priv->nmd == NULL) {
+		WPRINTF(("Unable to nm_open(): interface '%s', errno (%s)\n",
+			devname, strerror(errno)));
+		free(priv);
+		return (-1);
+	}
+
+	priv->memid = priv->nmd->req.nr_arg2;
+	priv->tx = NETMAP_TXRING(priv->nmd->nifp, 0);
+	priv->rx = NETMAP_RXRING(priv->nmd->nifp, 0);
+	priv->cb = cb;
+	priv->cb_param = param;
+	be->fd = priv->nmd->fd;
+
+	priv->mevp = mevent_add(be->fd, EVF_READ, cb, param);
+	if (priv->mevp == NULL) {
+		WPRINTF(("Could not register event\n"));
+		return (-1);
+	}
+
+	return (0);
+}
+
+static void
+netmap_cleanup(struct net_backend *be)
+{
+	struct netmap_priv *priv = (struct netmap_priv *)be->opaque;
+
+	if (priv->mevp) {
+		mevent_delete(priv->mevp);
+	}
+	if (priv->nmd) {
+		nm_close(priv->nmd);
+	}
+	be->fd = -1;
+}
+
+static ssize_t
+netmap_send(struct net_backend *be, struct iovec *iov,
+	    int iovcnt)
+{
+	struct netmap_priv *priv = (struct netmap_priv *)be->opaque;
+	struct netmap_ring *ring;
+	ssize_t totlen = 0;
+	int nm_buf_size;
+	int nm_buf_len;
+	uint32_t head;
+	void *nm_buf;
+	int j;
+
+	ring = priv->tx;
+	head = ring->head;
+	if (head == ring->tail) {
+		WPRINTF(("No space, drop %zu bytes\n", count_iov(iov, iovcnt)));
+		goto txsync;
+	}
+	nm_buf = NETMAP_BUF(ring, ring->slot[head].buf_idx);
+	nm_buf_size = ring->nr_buf_size;
+	nm_buf_len = 0;
+
+	for (j = 0; j < iovcnt; j++) {
+		int iov_frag_size = iov[j].iov_len;
+		void *iov_frag_buf = iov[j].iov_base;
+
+		totlen += iov_frag_size;
+
+		/*
+		 * Split each iovec fragment over more netmap slots, if
+		 * necessary.
+		 */
+		for (;;) {
+			int copylen;
+
+			copylen = iov_frag_size < nm_buf_size ? iov_frag_size : nm_buf_size;
+			memcpy(nm_buf, iov_frag_buf, copylen);
+
+			iov_frag_buf += copylen;
+			iov_frag_size -= copylen;
+			nm_buf += copylen;
+			nm_buf_size -= copylen;
+			nm_buf_len += copylen;
+
+			if (iov_frag_size == 0) {
+				break;
+			}
+
+			ring->slot[head].len = nm_buf_len;
+			ring->slot[head].flags = NS_MOREFRAG;
+			head = nm_ring_next(ring, head);
+			if (head == ring->tail) {
+				/*
+				 * We ran out of netmap slots while
+				 * splitting the iovec fragments.
+				 */
+				WPRINTF(("No space, drop %zu bytes\n",
+				   count_iov(iov, iovcnt)));
+				goto txsync;
+			}
+			nm_buf = NETMAP_BUF(ring, ring->slot[head].buf_idx);
+			nm_buf_size = ring->nr_buf_size;
+			nm_buf_len = 0;
+		}
+	}
+
+	/* Complete the last slot, which must not have NS_MOREFRAG set. */
+	ring->slot[head].len = nm_buf_len;
+	ring->slot[head].flags = 0;
+	head = nm_ring_next(ring, head);
+
+	/* Now update ring->head and ring->cur. */
+	ring->head = ring->cur = head;
+txsync:
+	ioctl(be->fd, NIOCTXSYNC, NULL);
+
+	return (totlen);
+}
+
+static ssize_t
+netmap_recv(struct net_backend *be, struct iovec *iov, int iovcnt)
+{
+	struct netmap_priv *priv = (struct netmap_priv *)be->opaque;
+	struct netmap_slot *slot = NULL;
+	struct netmap_ring *ring;
+	void *iov_frag_buf;
+	int iov_frag_size;
+	ssize_t totlen = 0;
+	uint32_t head;
+
+	assert(iovcnt);
+
+	ring = priv->rx;
+	head = ring->head;
+	iov_frag_buf = iov->iov_base;
+	iov_frag_size = iov->iov_len;
+
+	do {
+		int nm_buf_len;
+		void *nm_buf;
+
+		if (head == ring->tail) {
+			return (0);
+		}
+
+		slot = ring->slot + head;
+		nm_buf = NETMAP_BUF(ring, slot->buf_idx);
+		nm_buf_len = slot->len;
+
+		for (;;) {
+			int copylen = nm_buf_len < iov_frag_size ?
+			    nm_buf_len : iov_frag_size;
+
+			memcpy(iov_frag_buf, nm_buf, copylen);
+			nm_buf += copylen;
+			nm_buf_len -= copylen;
+			iov_frag_buf += copylen;
+			iov_frag_size -= copylen;
+			totlen += copylen;
+
+			if (nm_buf_len == 0) {
+				break;
+			}
+
+			iov++;
+			iovcnt--;
+			if (iovcnt == 0) {
+				/* No space to receive. */
+				WPRINTF(("Short iov, drop %zd bytes\n",
+				    totlen));
+				return (-ENOSPC);
+			}
+			iov_frag_buf = iov->iov_base;
+			iov_frag_size = iov->iov_len;
+		}
+
+		head = nm_ring_next(ring, head);
+
+	} while (slot->flags & NS_MOREFRAG);
+
+	/* Release slots to netmap. */
+	ring->head = ring->cur = head;
+
+	return (totlen);
+}
+
+static struct net_backend netmap_backend = {
+	.prefix = "netmap",
+	.priv_size = sizeof(struct netmap_priv),
+	.init = netmap_init,
+	.cleanup = netmap_cleanup,
+	.send = netmap_send,
+	.recv = netmap_recv,
+	.get_cap = netmap_get_cap,
+	.set_cap = netmap_set_cap,
+};
+
+/* A clone of the netmap backend, with a different prefix. */
+static struct net_backend vale_backend = {
+	.prefix = "vale",
+	.priv_size = sizeof(struct netmap_priv),
+	.init = netmap_init,
+	.cleanup = netmap_cleanup,
+	.send = netmap_send,
+	.recv = netmap_recv,
+	.get_cap = netmap_get_cap,
+	.set_cap = netmap_set_cap,
+};
+
+DATA_SET(net_backend_set, netmap_backend);
+DATA_SET(net_backend_set, vale_backend);
+
+/*
+ * Initialize a backend and attach to the frontend.
+ * This is called during frontend initialization.
+ *  @pbe is a pointer to the backend to be initialized
+ *  @devname is the backend-name as supplied on the command line,
+ * 	e.g. -s 2:0,frontend-name,backend-name[,other-args]
+ *  @cb is the receive callback supplied by the frontend,
+ *	and it is invoked in the event loop when a receive
+ *	event is generated in the hypervisor,
+ *  @param is a pointer to the frontend, and normally used as
+ *	the argument for the callback.
+ */
+int
+netbe_init(struct net_backend **ret, const char *devname, net_be_rxeof_t cb,
+    void *param)
+{
+	struct net_backend **pbe, *nbe, *tbe = NULL;
+	int err;
+
+	/*
+	 * Find the network backend that matches the user-provided
+	 * device name. net_backend_set is built using a linker set.
+	 */
+	SET_FOREACH(pbe, net_backend_set) {
+		if (strncmp(devname, (*pbe)->prefix,
+		    strlen((*pbe)->prefix)) == 0) {
+			tbe = *pbe;
+			assert(tbe->init != NULL);
+			assert(tbe->cleanup != NULL);
+			assert(tbe->send != NULL);
+			assert(tbe->recv != NULL);
+			assert(tbe->get_cap != NULL);
+			assert(tbe->set_cap != NULL);
+			break;
+		}
+	}
+
+	*ret = NULL;
+	if (tbe == NULL)
+		return (EINVAL);
+	nbe = calloc(1, sizeof(*nbe) + tbe->priv_size);
+	*nbe = *tbe;	/* copy the template */
+	nbe->fd = -1;
+	nbe->sc = param;
+	nbe->be_vnet_hdr_len = 0;
+	nbe->fe_vnet_hdr_len = 0;
+
+	/* Initialize the backend. */
+	err = nbe->init(nbe, devname, cb, param);
+	if (err) {
+		free(nbe);
+		return (err);
+	}
+
+	*ret = nbe;
+
+	return (0);
+}
+
+void
+netbe_cleanup(struct net_backend *be)
+{
+
+	if (be != NULL) {
+		be->cleanup(be);
+		free(be);
+	}
+}
+
+uint64_t
+netbe_get_cap(struct net_backend *be)
+{
+
+	assert(be != NULL);
+	return (be->get_cap(be));
+}
+
+int
+netbe_set_cap(struct net_backend *be, uint64_t features,
+	      unsigned vnet_hdr_len)
+{
+	int ret;
+
+	assert(be != NULL);
+
+	/* There are only three valid lengths, i.e., 0, 10 and 12. */
+	if (vnet_hdr_len && vnet_hdr_len != VNET_HDR_LEN
+		&& vnet_hdr_len != (VNET_HDR_LEN - sizeof(uint16_t)))
+		return (-1);
+
+	be->fe_vnet_hdr_len = vnet_hdr_len;
+
+	ret = be->set_cap(be, features, vnet_hdr_len);
+	assert(be->be_vnet_hdr_len == 0 ||
+	       be->be_vnet_hdr_len == be->fe_vnet_hdr_len);
+
+	return (ret);
+}
+
+static __inline struct iovec *
+iov_trim(struct iovec *iov, int *iovcnt, unsigned int tlen)
+{
+	struct iovec *riov;
+
+	/* XXX short-cut: assume first segment is >= tlen */
+	assert(iov[0].iov_len >= tlen);
+
+	iov[0].iov_len -= tlen;
+	if (iov[0].iov_len == 0) {
+		assert(*iovcnt > 1);
+		*iovcnt -= 1;
+		riov = &iov[1];
+	} else {
+		iov[0].iov_base = (void *)((uintptr_t)iov[0].iov_base + tlen);
+		riov = &iov[0];
+	}
+
+	return (riov);
+}
+
+ssize_t
+netbe_send(struct net_backend *be, struct iovec *iov, int iovcnt)
+{
+
+	assert(be != NULL);
+	if (be->be_vnet_hdr_len != be->fe_vnet_hdr_len) {
+		/*
+		 * The frontend uses a virtio-net header, but the backend
+		 * does not. We ignore it (as it must be all zeroes) and
+		 * strip it.
+		 */
+		assert(be->be_vnet_hdr_len == 0);
+		iov = iov_trim(iov, &iovcnt, be->fe_vnet_hdr_len);
+	}
+
+	return (be->send(be, iov, iovcnt));
+}
+
+/*
+ * Try to read a packet from the backend, without blocking.
+ * If no packets are available, return 0. In case of success, return
+ * the length of the packet just read. Return -1 in case of errors.
+ */
+ssize_t
+netbe_recv(struct net_backend *be, struct iovec *iov, int iovcnt)
+{
+	/* Length of prepended virtio-net header. */
+	unsigned int hlen = be->fe_vnet_hdr_len;
+	int ret;
+
+	assert(be != NULL);
+
+	if (hlen && hlen != be->be_vnet_hdr_len) {
+		/*
+		 * The frontend uses a virtio-net header, but the backend
+		 * does not. We need to prepend a zeroed header.
+		 */
+		struct virtio_net_rxhdr *vh;
+
+		assert(be->be_vnet_hdr_len == 0);
+
+		/*
+		 * Get a pointer to the rx header, and use the
+		 * data immediately following it for the packet buffer.
+		 */
+		vh = iov[0].iov_base;
+		iov = iov_trim(iov, &iovcnt, hlen);
+
+		/*
+		 * The only valid field in the rx packet header is the
+		 * number of buffers if merged rx bufs were negotiated.
+		 */
+		memset(vh, 0, hlen);
+		if (hlen == VNET_HDR_LEN) {
+			vh->vrh_bufs = 1;
+		}
+	}
+
+	ret = be->recv(be, iov, iovcnt);
+	if (ret > 0) {
+		ret += hlen;
+	}
+
+	return (ret);
+}
+
+/*
+ * Read a packet from the backend and discard it.
+ * Returns the size of the discarded packet or zero if no packet was available.
+ * A negative error code is returned in case of read error.
+ */
+ssize_t
+netbe_rx_discard(struct net_backend *be)
+{
+	/*
+	 * MP note: the dummybuf is only used to discard frames,
+	 * so there is no need for it to be per-vtnet or locked.
+	 * We only make it large enough for TSO-sized segment.
+	 */
+	static uint8_t dummybuf[65536 + 64];
+	struct iovec iov;
+
+	iov.iov_base = dummybuf;
+	iov.iov_len = sizeof(dummybuf);
+
+	return netbe_recv(be, &iov, 1);
+}
+

Copied: stable/12/usr.sbin/bhyve/net_backends.h (from r349803, head/usr.sbin/bhyve/net_backends.h)
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ stable/12/usr.sbin/bhyve/net_backends.h	Sun Jul 21 10:58:25 2019	(r350193, copy of r349803, head/usr.sbin/bhyve/net_backends.h)
@@ -0,0 +1,87 @@
+/*-
+ * Copyright (c) 2019 Vincenzo Maffione <vmaffione@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
+ * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
+ * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+ * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+ * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef __NET_BACKENDS_H__
+#define __NET_BACKENDS_H__
+
+#include <stdint.h>
+
+/* Opaque type representing a network backend. */
+typedef struct net_backend net_backend_t;
+
+/* Interface between network frontends and the network backends. */
+typedef void (*net_be_rxeof_t)(int, enum ev_type, void *param);
+int	netbe_init(net_backend_t **be, const char *devname, net_be_rxeof_t cb,
+            void *param);
+void	netbe_cleanup(net_backend_t *be);
+uint64_t netbe_get_cap(net_backend_t *be);
+int	 netbe_set_cap(net_backend_t *be, uint64_t cap,
+             unsigned vnet_hdr_len);
+ssize_t	netbe_send(net_backend_t *be, struct iovec *iov, int iovcnt);
+ssize_t	netbe_recv(net_backend_t *be, struct iovec *iov, int iovcnt);
+ssize_t	netbe_rx_discard(net_backend_t *be);
+
+
+/*
+ * Network device capabilities taken from the VirtIO standard.
+ * Despite the name, these capabilities can be used by different frontents
+ * (virtio-net, ptnet) and supported by different backends (netmap, tap, ...).
+ */
+#define	VIRTIO_NET_F_CSUM	(1 <<  0) /* host handles partial cksum */
+#define	VIRTIO_NET_F_GUEST_CSUM	(1 <<  1) /* guest handles partial cksum */
+#define	VIRTIO_NET_F_MAC	(1 <<  5) /* host supplies MAC */
+#define	VIRTIO_NET_F_GSO_DEPREC	(1 <<  6) /* deprecated: host handles GSO */
+#define	VIRTIO_NET_F_GUEST_TSO4	(1 <<  7) /* guest can rcv TSOv4 */
+#define	VIRTIO_NET_F_GUEST_TSO6	(1 <<  8) /* guest can rcv TSOv6 */
+#define	VIRTIO_NET_F_GUEST_ECN	(1 <<  9) /* guest can rcv TSO with ECN */
+#define	VIRTIO_NET_F_GUEST_UFO	(1 << 10) /* guest can rcv UFO */
+#define	VIRTIO_NET_F_HOST_TSO4	(1 << 11) /* host can rcv TSOv4 */
+#define	VIRTIO_NET_F_HOST_TSO6	(1 << 12) /* host can rcv TSOv6 */
+#define	VIRTIO_NET_F_HOST_ECN	(1 << 13) /* host can rcv TSO with ECN */
+#define	VIRTIO_NET_F_HOST_UFO	(1 << 14) /* host can rcv UFO */
+#define	VIRTIO_NET_F_MRG_RXBUF	(1 << 15) /* host can merge RX buffers */
+#define	VIRTIO_NET_F_STATUS	(1 << 16) /* config status field available */
+#define	VIRTIO_NET_F_CTRL_VQ	(1 << 17) /* control channel available */
+#define	VIRTIO_NET_F_CTRL_RX	(1 << 18) /* control channel RX mode support */
+#define	VIRTIO_NET_F_CTRL_VLAN	(1 << 19) /* control channel VLAN filtering */
+#define	VIRTIO_NET_F_GUEST_ANNOUNCE \
+				(1 << 21) /* guest can send gratuitous pkts */
+
+/*
+ * Fixed network header size
+ */
+struct virtio_net_rxhdr {
+	uint8_t		vrh_flags;
+	uint8_t		vrh_gso_type;
+	uint16_t	vrh_hdr_len;
+	uint16_t	vrh_gso_size;
+	uint16_t	vrh_csum_start;
+	uint16_t	vrh_csum_offset;
+	uint16_t	vrh_bufs;
+} __packed;
+
+#endif /* __NET_BACKENDS_H__ */

Modified: stable/12/usr.sbin/bhyve/pci_e82545.c
==============================================================================
--- stable/12/usr.sbin/bhyve/pci_e82545.c	Sun Jul 21 08:28:28 2019	(r350192)
+++ stable/12/usr.sbin/bhyve/pci_e82545.c	Sun Jul 21 10:58:25 2019	(r350193)
@@ -66,6 +66,7 @@ __FBSDID("$FreeBSD$");
 #include "pci_emul.h"
 #include "mevent.h"
 #include "net_utils.h"
+#include "net_backends.h"
 
 /* Hardware/register definitions XXX: move some to common code. */
 #define E82545_VENDOR_ID_INTEL			0x8086
@@ -245,11 +246,10 @@ struct  eth_uni {
 struct e82545_softc {
 	struct pci_devinst *esc_pi;
 	struct vmctx	*esc_ctx;
-	struct mevent   *esc_mevp;
 	struct mevent   *esc_mevpitr;
 	pthread_mutex_t	esc_mtx;
 	struct ether_addr esc_mac;
-	int		esc_tapfd;
+	net_backend_t	*esc_be;
 
 	/* General */
 	uint32_t	esc_CTRL;	/* x0000 device ctl */
@@ -355,7 +355,7 @@ struct e82545_softc {
 static void e82545_reset(struct e82545_softc *sc, int dev);
 static void e82545_rx_enable(struct e82545_softc *sc);
 static void e82545_rx_disable(struct e82545_softc *sc);
-static void e82545_tap_callback(int fd, enum ev_type type, void *param);
+static void e82545_rx_callback(int fd, enum ev_type type, void *param);
 static void e82545_tx_start(struct e82545_softc *sc);
 static void e82545_tx_enable(struct e82545_softc *sc);
 static void e82545_tx_disable(struct e82545_softc *sc);
@@ -824,11 +824,9 @@ e82545_bufsz(uint32_t rctl)
 	return (256);	/* Forbidden value. */
 }
 
-static uint8_t dummybuf[2048];
-
 /* XXX one packet at a time until this is debugged */
 static void
-e82545_tap_callback(int fd, enum ev_type type, void *param)
+e82545_rx_callback(int fd, enum ev_type type, void *param)
 {
 	struct e82545_softc *sc = param;
 	struct e1000_rx_desc *rxd;
@@ -843,7 +841,7 @@ e82545_tap_callback(int fd, enum ev_type type, void *p
 	if (!sc->esc_rx_enabled || sc->esc_rx_loopback) {
 		DPRINTF("rx disabled (!%d || %d) -- packet(s) dropped\r\n",
 		    sc->esc_rx_enabled, sc->esc_rx_loopback);
-		while (read(sc->esc_tapfd, dummybuf, sizeof(dummybuf)) > 0) {
+		while (netbe_rx_discard(sc->esc_be) > 0) {
 		}
 		goto done1;
 	}
@@ -856,7 +854,7 @@ e82545_tap_callback(int fd, enum ev_type type, void *p
 	if (left < maxpktdesc) {
 		DPRINTF("rx overflow (%d < %d) -- packet(s) dropped\r\n",
 		    left, maxpktdesc);
-		while (read(sc->esc_tapfd, dummybuf, sizeof(dummybuf)) > 0) {
+		while (netbe_rx_discard(sc->esc_be) > 0) {
 		}
 		goto done1;
 	}
@@ -873,9 +871,9 @@ e82545_tap_callback(int fd, enum ev_type type, void *p
 			    rxd->buffer_addr, bufsz);
 			vec[i].iov_len = bufsz;
 		}
-		len = readv(sc->esc_tapfd, vec, maxpktdesc);
+		len = netbe_recv(sc->esc_be, vec, maxpktdesc);
 		if (len <= 0) {
-			DPRINTF("tap: readv() returned %d\n", len);
+			DPRINTF("netbe_recv() returned %d\n", len);
 			goto done;
 		}
 
@@ -1050,10 +1048,10 @@ static void
 e82545_transmit_backend(struct e82545_softc *sc, struct iovec *iov, int iovcnt)
 {
 
-	if (sc->esc_tapfd == -1)
+	if (sc->esc_be == NULL)
 		return;

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201907211058.x6LAwPUs005642>