From owner-svn-soc-all@freebsd.org Fri Aug 5 13:50:39 2016 Return-Path: Delivered-To: svn-soc-all@mailman.ysv.freebsd.org Received: from mx1.freebsd.org (mx1.freebsd.org [IPv6:2001:1900:2254:206a::19:1]) by mailman.ysv.freebsd.org (Postfix) with ESMTP id 457B7BAFB6C for ; Fri, 5 Aug 2016 13:50:39 +0000 (UTC) (envelope-from vincenzo@FreeBSD.org) Received: from socsvn.freebsd.org (socsvn.freebsd.org [IPv6:2001:1900:2254:206a::50:2]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (Client did not present a certificate) by mx1.freebsd.org (Postfix) with ESMTPS id 3785118B1 for ; Fri, 5 Aug 2016 13:50:39 +0000 (UTC) (envelope-from vincenzo@FreeBSD.org) Received: from socsvn.freebsd.org ([127.0.1.124]) by socsvn.freebsd.org (8.15.2/8.15.2) with ESMTP id u75DodCS025663 for ; Fri, 5 Aug 2016 13:50:39 GMT (envelope-from vincenzo@FreeBSD.org) Received: (from www@localhost) by socsvn.freebsd.org (8.15.2/8.15.2/Submit) id u75Docov025660 for svn-soc-all@FreeBSD.org; Fri, 5 Aug 2016 13:50:38 GMT (envelope-from vincenzo@FreeBSD.org) Date: Fri, 5 Aug 2016 13:50:38 GMT Message-Id: <201608051350.u75Docov025660@socsvn.freebsd.org> X-Authentication-Warning: socsvn.freebsd.org: www set sender to vincenzo@FreeBSD.org using -f From: vincenzo@FreeBSD.org To: svn-soc-all@FreeBSD.org Subject: socsvn commit: r307216 - soc2016/vincenzo/head/usr.sbin/bhyve MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit X-BeenThere: svn-soc-all@freebsd.org X-Mailman-Version: 2.1.22 Precedence: list List-Id: SVN commit messages for the entire Summer of Code repository List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Fri, 05 Aug 2016 13:50:39 -0000 Author: vincenzo Date: Fri Aug 5 13:50:37 2016 New Revision: 307216 URL: http://svnweb.FreeBSD.org/socsvn/?view=rev&rev=307216 Log: net: add net_backends module Modified: soc2016/vincenzo/head/usr.sbin/bhyve/Makefile soc2016/vincenzo/head/usr.sbin/bhyve/pci_virtio_net.c Modified: soc2016/vincenzo/head/usr.sbin/bhyve/Makefile ============================================================================== --- soc2016/vincenzo/head/usr.sbin/bhyve/Makefile Fri Aug 5 09:08:00 2016 (r307215) +++ soc2016/vincenzo/head/usr.sbin/bhyve/Makefile Fri Aug 5 13:50:37 2016 (r307216) @@ -25,6 +25,7 @@ mem.c \ mevent.c \ mptbl.c \ + net_backends.c \ pci_ahci.c \ pci_emul.c \ pci_hostbridge.c \ @@ -50,6 +51,8 @@ LIBADD= vmmapi md pthread +CFLAGS=-I/home/vmaffione/git/netmap/sys + WARNS?= 2 .include Modified: soc2016/vincenzo/head/usr.sbin/bhyve/pci_virtio_net.c ============================================================================== --- soc2016/vincenzo/head/usr.sbin/bhyve/pci_virtio_net.c Fri Aug 5 09:08:00 2016 (r307215) +++ soc2016/vincenzo/head/usr.sbin/bhyve/pci_virtio_net.c Fri Aug 5 13:50:37 2016 (r307216) @@ -58,34 +58,12 @@ #include "pci_emul.h" #include "mevent.h" #include "virtio.h" +#include "net_backends.h" #define VTNET_RINGSZ 1024 #define VTNET_MAXSEGS 256 -/* - * Host capabilities. Note that we only offer a few of these. - */ -#define VIRTIO_NET_F_CSUM (1 << 0) /* host handles partial cksum */ -#define VIRTIO_NET_F_GUEST_CSUM (1 << 1) /* guest handles partial cksum */ -#define VIRTIO_NET_F_MAC (1 << 5) /* host supplies MAC */ -#define VIRTIO_NET_F_GSO_DEPREC (1 << 6) /* deprecated: host handles GSO */ -#define VIRTIO_NET_F_GUEST_TSO4 (1 << 7) /* guest can rcv TSOv4 */ -#define VIRTIO_NET_F_GUEST_TSO6 (1 << 8) /* guest can rcv TSOv6 */ -#define VIRTIO_NET_F_GUEST_ECN (1 << 9) /* guest can rcv TSO with ECN */ -#define VIRTIO_NET_F_GUEST_UFO (1 << 10) /* guest can rcv UFO */ -#define VIRTIO_NET_F_HOST_TSO4 (1 << 11) /* host can rcv TSOv4 */ -#define VIRTIO_NET_F_HOST_TSO6 (1 << 12) /* host can rcv TSOv6 */ -#define VIRTIO_NET_F_HOST_ECN (1 << 13) /* host can rcv TSO with ECN */ -#define VIRTIO_NET_F_HOST_UFO (1 << 14) /* host can rcv UFO */ -#define VIRTIO_NET_F_MRG_RXBUF (1 << 15) /* host can merge RX buffers */ -#define VIRTIO_NET_F_STATUS (1 << 16) /* config status field available */ -#define VIRTIO_NET_F_CTRL_VQ (1 << 17) /* control channel available */ -#define VIRTIO_NET_F_CTRL_RX (1 << 18) /* control channel RX mode support */ -#define VIRTIO_NET_F_CTRL_VLAN (1 << 19) /* control channel VLAN filtering */ -#define VIRTIO_NET_F_GUEST_ANNOUNCE \ - (1 << 21) /* guest can send gratuitous pkts */ - #define VTNET_S_HOSTCAPS \ ( VIRTIO_NET_F_MAC | VIRTIO_NET_F_MRG_RXBUF | VIRTIO_NET_F_STATUS | \ VIRTIO_F_NOTIFY_ON_EMPTY | VIRTIO_RING_F_INDIRECT_DESC) @@ -96,6 +74,7 @@ struct virtio_net_config { uint8_t mac[6]; uint16_t status; + uint16_t max_virtqueue_pairs; } __packed; /* @@ -108,19 +87,6 @@ #define VTNET_MAXQ 3 /* - * Fixed network header size - */ -struct virtio_net_rxhdr { - uint8_t vrh_flags; - uint8_t vrh_gso_type; - uint16_t vrh_hdr_len; - uint16_t vrh_gso_size; - uint16_t vrh_csum_start; - uint16_t vrh_csum_offset; - uint16_t vrh_bufs; -} __packed; - -/* * Debug printf */ static int pci_vtnet_debug; @@ -134,10 +100,8 @@ struct virtio_softc vsc_vs; struct vqueue_info vsc_queues[VTNET_MAXQ - 1]; pthread_mutex_t vsc_mtx; - struct mevent *vsc_mevp; - int vsc_tapfd; - struct nm_desc *vsc_nmd; + struct net_backend *vsc_be; int vsc_rx_ready; volatile int resetting; /* set and checked outside lock */ @@ -154,10 +118,6 @@ pthread_mutex_t tx_mtx; pthread_cond_t tx_cond; int tx_in_progress; - - void (*pci_vtnet_rx)(struct pci_vtnet_softc *sc); - void (*pci_vtnet_tx)(struct pci_vtnet_softc *sc, struct iovec *iov, - int iovcnt, int len); }; static void pci_vtnet_reset(void *); @@ -233,280 +193,47 @@ } /* - * Called to send a buffer chain out to the tap device - */ -static void -pci_vtnet_tap_tx(struct pci_vtnet_softc *sc, struct iovec *iov, int iovcnt, - int len) -{ - static char pad[60]; /* all zero bytes */ - - if (sc->vsc_tapfd == -1) - return; - - /* - * If the length is < 60, pad out to that and add the - * extra zero'd segment to the iov. It is guaranteed that - * there is always an extra iov available by the caller. - */ - if (len < 60) { - iov[iovcnt].iov_base = pad; - iov[iovcnt].iov_len = 60 - len; - iovcnt++; - } - (void) writev(sc->vsc_tapfd, iov, iovcnt); -} - -/* * Called when there is read activity on the tap file descriptor. * Each buffer posted by the guest is assumed to be able to contain * an entire ethernet frame + rx header. * MP note: the dummybuf is only used for discarding frames, so there * is no need for it to be per-vtnet or locked. */ -static uint8_t dummybuf[2048]; - -static __inline struct iovec * -rx_iov_trim(struct iovec *iov, int *niov, int tlen) -{ - struct iovec *riov; - - /* XXX short-cut: assume first segment is >= tlen */ - assert(iov[0].iov_len >= tlen); - - iov[0].iov_len -= tlen; - if (iov[0].iov_len == 0) { - assert(*niov > 1); - *niov -= 1; - riov = &iov[1]; - } else { - iov[0].iov_base = (void *)((uintptr_t)iov[0].iov_base + tlen); - riov = &iov[0]; - } - return (riov); -} - -static void -pci_vtnet_tap_rx(struct pci_vtnet_softc *sc) +void +pci_vtnet_rx_discard(struct pci_vtnet_softc *sc, struct iovec *iov) { - struct iovec iov[VTNET_MAXSEGS], *riov; - struct vqueue_info *vq; - void *vrx; - int len, n; - uint16_t idx; - - /* - * Should never be called without a valid tap fd - */ - assert(sc->vsc_tapfd != -1); - /* - * But, will be called when the rx ring hasn't yet - * been set up or the guest is resetting the device. + * MP note: the dummybuf is only used to discard frames, + * so there is no need for it to be per-vtnet or locked. + * We only make it large enough for TSO-sized segment. */ - if (!sc->vsc_rx_ready || sc->resetting) { - /* - * Drop the packet and try later. - */ - (void) read(sc->vsc_tapfd, dummybuf, sizeof(dummybuf)); - return; - } - - /* - * Check for available rx buffers - */ - vq = &sc->vsc_queues[VTNET_RXQ]; - if (!vq_has_descs(vq)) { - /* - * Drop the packet and try later. Interrupt on - * empty, if that's negotiated. - */ - (void) read(sc->vsc_tapfd, dummybuf, sizeof(dummybuf)); - vq_endchains(vq, 1); - return; - } - - do { - /* - * Get descriptor chain. - */ - n = vq_getchain(vq, &idx, iov, VTNET_MAXSEGS, NULL); - assert(n >= 1 && n <= VTNET_MAXSEGS); - - /* - * Get a pointer to the rx header, and use the - * data immediately following it for the packet buffer. - */ - vrx = iov[0].iov_base; - riov = rx_iov_trim(iov, &n, sc->rx_vhdrlen); - - len = readv(sc->vsc_tapfd, riov, n); - - if (len < 0 && errno == EWOULDBLOCK) { - /* - * No more packets, but still some avail ring - * entries. Interrupt if needed/appropriate. - */ - vq_retchain(vq); - vq_endchains(vq, 0); - return; - } - - /* - * The only valid field in the rx packet header is the - * number of buffers if merged rx bufs were negotiated. - */ - memset(vrx, 0, sc->rx_vhdrlen); - - if (sc->rx_merge) { - struct virtio_net_rxhdr *vrxh; + static uint8_t dummybuf[65536+64]; + int more; - vrxh = vrx; - vrxh->vrh_bufs = 1; - } - - /* - * Release this chain and handle more chains. - */ - vq_relchain(vq, idx, len + sc->rx_vhdrlen); - } while (vq_has_descs(vq)); - - /* Interrupt if needed, including for NOTIFY_ON_EMPTY. */ - vq_endchains(vq, 1); -} - -static __inline int -pci_vtnet_netmap_writev(struct nm_desc *nmd, struct iovec *iov, int iovcnt) -{ - int r, i; - int len = 0; - - for (r = nmd->cur_tx_ring; ; ) { - struct netmap_ring *ring = NETMAP_TXRING(nmd->nifp, r); - uint32_t cur, idx; - char *buf; - - if (nm_ring_empty(ring)) { - r++; - if (r > nmd->last_tx_ring) - r = nmd->first_tx_ring; - if (r == nmd->cur_tx_ring) - break; - continue; - } - cur = ring->cur; - idx = ring->slot[cur].buf_idx; - buf = NETMAP_BUF(ring, idx); - - for (i = 0; i < iovcnt; i++) { - if (len + iov[i].iov_len > 2048) - break; - memcpy(&buf[len], iov[i].iov_base, iov[i].iov_len); - len += iov[i].iov_len; - } - ring->slot[cur].len = len; - ring->head = ring->cur = nm_ring_next(ring, cur); - nmd->cur_tx_ring = r; - ioctl(nmd->fd, NIOCTXSYNC, NULL); - break; - } - - return (len); -} - -static __inline int -pci_vtnet_netmap_readv(struct nm_desc *nmd, struct iovec *iov, int iovcnt) -{ - int len = 0; - int i = 0; - int r; - - for (r = nmd->cur_rx_ring; ; ) { - struct netmap_ring *ring = NETMAP_RXRING(nmd->nifp, r); - uint32_t cur, idx; - char *buf; - size_t left; - - if (nm_ring_empty(ring)) { - r++; - if (r > nmd->last_rx_ring) - r = nmd->first_rx_ring; - if (r == nmd->cur_rx_ring) - break; - continue; - } - cur = ring->cur; - idx = ring->slot[cur].buf_idx; - buf = NETMAP_BUF(ring, idx); - left = ring->slot[cur].len; - - for (i = 0; i < iovcnt && left > 0; i++) { - if (iov[i].iov_len > left) - iov[i].iov_len = left; - memcpy(iov[i].iov_base, &buf[len], iov[i].iov_len); - len += iov[i].iov_len; - left -= iov[i].iov_len; - } - ring->head = ring->cur = nm_ring_next(ring, cur); - nmd->cur_rx_ring = r; - ioctl(nmd->fd, NIOCRXSYNC, NULL); - break; - } - for (; i < iovcnt; i++) - iov[i].iov_len = 0; - - return (len); -} - -/* - * Called to send a buffer chain out to the vale port - */ -static void -pci_vtnet_netmap_tx(struct pci_vtnet_softc *sc, struct iovec *iov, int iovcnt, - int len) -{ - static char pad[60]; /* all zero bytes */ - - if (sc->vsc_nmd == NULL) - return; - - /* - * If the length is < 60, pad out to that and add the - * extra zero'd segment to the iov. It is guaranteed that - * there is always an extra iov available by the caller. - */ - if (len < 60) { - iov[iovcnt].iov_base = pad; - iov[iovcnt].iov_len = 60 - len; - iovcnt++; - } - (void) pci_vtnet_netmap_writev(sc->vsc_nmd, iov, iovcnt); + iov[0].iov_base = dummybuf; + iov[0].iov_len = sizeof(dummybuf); + netbe_recv(sc->vsc_be, iov, 1, &more); } static void -pci_vtnet_netmap_rx(struct pci_vtnet_softc *sc) +pci_vtnet_rx(struct pci_vtnet_softc *sc) { - struct iovec iov[VTNET_MAXSEGS], *riov; + struct iovec iov[VTNET_MAXSEGS + 1]; struct vqueue_info *vq; - void *vrx; int len, n; uint16_t idx; + int more; /* - * Should never be called without a valid netmap descriptor - */ - assert(sc->vsc_nmd != NULL); - - /* - * But, will be called when the rx ring hasn't yet + * This will be called when the rx ring hasn't yet * been set up or the guest is resetting the device. */ if (!sc->vsc_rx_ready || sc->resetting) { /* * Drop the packet and try later. */ - (void) nm_nextpkt(sc->vsc_nmd, (void *)dummybuf); + pci_vtnet_rx_discard(sc, iov); return; } @@ -519,7 +246,7 @@ * Drop the packet and try later. Interrupt on * empty, if that's negotiated. */ - (void) nm_nextpkt(sc->vsc_nmd, (void *)dummybuf); + pci_vtnet_rx_discard(sc, iov); vq_endchains(vq, 1); return; } @@ -531,14 +258,7 @@ n = vq_getchain(vq, &idx, iov, VTNET_MAXSEGS, NULL); assert(n >= 1 && n <= VTNET_MAXSEGS); - /* - * Get a pointer to the rx header, and use the - * data immediately following it for the packet buffer. - */ - vrx = iov[0].iov_base; - riov = rx_iov_trim(iov, &n, sc->rx_vhdrlen); - - len = pci_vtnet_netmap_readv(sc->vsc_nmd, riov, n); + len = netbe_recv(sc->vsc_be, iov, n, &more); if (len == 0) { /* @@ -551,19 +271,6 @@ } /* - * The only valid field in the rx packet header is the - * number of buffers if merged rx bufs were negotiated. - */ - memset(vrx, 0, sc->rx_vhdrlen); - - if (sc->rx_merge) { - struct virtio_net_rxhdr *vrxh; - - vrxh = vrx; - vrxh->vrh_bufs = 1; - } - - /* * Release this chain and handle more chains. */ vq_relchain(vq, idx, len + sc->rx_vhdrlen); @@ -579,7 +286,7 @@ struct pci_vtnet_softc *sc = param; pthread_mutex_lock(&sc->rx_mtx); - sc->pci_vtnet_rx(sc); + pci_vtnet_rx(sc); pthread_mutex_unlock(&sc->rx_mtx); } @@ -603,27 +310,24 @@ { struct iovec iov[VTNET_MAXSEGS + 1]; int i, n; - int plen, tlen; + int len; uint16_t idx; /* - * Obtain chain of descriptors. The first one is - * really the header descriptor, so we need to sum - * up two lengths: packet length and transfer length. + * Obtain chain of descriptors. The first descriptor also + * contains the virtio-net header. */ n = vq_getchain(vq, &idx, iov, VTNET_MAXSEGS, NULL); assert(n >= 1 && n <= VTNET_MAXSEGS); - tlen = 0; + len = 0; for (i = 0; i < n; i++) { - tlen += iov[i].iov_len; + len += iov[i].iov_len; } - plen = tlen - iov[0].iov_len; - DPRINTF(("virtio: packet send, %d bytes, %d segs\n\r", plen, n)); - sc->pci_vtnet_tx(sc, &iov[1], n - 1, plen); + netbe_send(sc->vsc_be, iov, n, len, 0 /* more */); - /* chain is processed, release it and set tlen */ - vq_relchain(vq, idx, tlen); + /* chain is processed, release it and set len */ + vq_relchain(vq, idx, len); } static void @@ -731,68 +435,6 @@ return (0); } -static void -pci_vtnet_tap_setup(struct pci_vtnet_softc *sc, char *devname) -{ - char tbuf[80]; - - strcpy(tbuf, "/dev/"); - strlcat(tbuf, devname, sizeof(tbuf)); - - sc->pci_vtnet_rx = pci_vtnet_tap_rx; - sc->pci_vtnet_tx = pci_vtnet_tap_tx; - - sc->vsc_tapfd = open(tbuf, O_RDWR); - if (sc->vsc_tapfd == -1) { - WPRINTF(("open of tap device %s failed\n", tbuf)); - return; - } - - /* - * Set non-blocking and register for read - * notifications with the event loop - */ - int opt = 1; - if (ioctl(sc->vsc_tapfd, FIONBIO, &opt) < 0) { - WPRINTF(("tap device O_NONBLOCK failed\n")); - close(sc->vsc_tapfd); - sc->vsc_tapfd = -1; - } - - sc->vsc_mevp = mevent_add(sc->vsc_tapfd, - EVF_READ, - pci_vtnet_rx_callback, - sc); - if (sc->vsc_mevp == NULL) { - WPRINTF(("Could not register event\n")); - close(sc->vsc_tapfd); - sc->vsc_tapfd = -1; - } -} - -static void -pci_vtnet_netmap_setup(struct pci_vtnet_softc *sc, char *ifname) -{ - sc->pci_vtnet_rx = pci_vtnet_netmap_rx; - sc->pci_vtnet_tx = pci_vtnet_netmap_tx; - - sc->vsc_nmd = nm_open(ifname, NULL, 0, 0); - if (sc->vsc_nmd == NULL) { - WPRINTF(("open of netmap device %s failed\n", ifname)); - return; - } - - sc->vsc_mevp = mevent_add(sc->vsc_nmd->fd, - EVF_READ, - pci_vtnet_rx_callback, - sc); - if (sc->vsc_mevp == NULL) { - WPRINTF(("Could not register event\n")); - nm_close(sc->vsc_nmd); - sc->vsc_nmd = NULL; - } -} - static int pci_vtnet_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) { @@ -804,12 +446,20 @@ char *devname; char *vtopts; int mac_provided; + struct virtio_consts *vc; - sc = calloc(1, sizeof(struct pci_vtnet_softc)); + /* sc also contains a copy of the vtnet_vi_consts, + * because the capabilities change depending on + * the backend. + */ + sc = calloc(1, sizeof(struct pci_vtnet_softc) + + sizeof(struct virtio_consts)); + vc = (struct virtio_consts *)(sc + 1); + memcpy(vc, &vtnet_vi_consts, sizeof(*vc)); pthread_mutex_init(&sc->vsc_mtx, NULL); - vi_softc_linkup(&sc->vsc_vs, &vtnet_vi_consts, sc, pi, sc->vsc_queues); + vi_softc_linkup(&sc->vsc_vs, vc, sc, pi, sc->vsc_queues); sc->vsc_vs.vs_mtx = &sc->vsc_mtx; sc->vsc_queues[VTNET_RXQ].vq_qsize = VTNET_RINGSZ; @@ -822,12 +472,10 @@ #endif /* - * Attempt to open the tap device and read the MAC address + * Attempt to open the backend device and read the MAC address * if specified */ mac_provided = 0; - sc->vsc_tapfd = -1; - sc->vsc_nmd = NULL; if (opts != NULL) { int err; @@ -843,11 +491,12 @@ mac_provided = 1; } - if (strncmp(devname, "vale", 4) == 0) - pci_vtnet_netmap_setup(sc, devname); - if (strncmp(devname, "tap", 3) == 0 || - strncmp(devname, "vmnet", 5) == 0) - pci_vtnet_tap_setup(sc, devname); + sc->vsc_be = netbe_init(devname, pci_vtnet_rx_callback, sc); + if (!sc->vsc_be) { + WPRINTF(("net backend initialization failed\n")); + } else { + vc->vc_hv_caps |= netbe_get_features(sc->vsc_be); + } free(devname); } @@ -879,9 +528,8 @@ pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_NET); pci_set_cfgdata16(pi, PCIR_SUBVEND_0, VIRTIO_VENDOR); - /* Link is up if we managed to open tap device or vale port. */ - sc->vsc_config.status = (opts == NULL || sc->vsc_tapfd >= 0 || - sc->vsc_nmd != NULL); + /* Link is up if we managed to open backend device. */ + sc->vsc_config.status = (opts == NULL || sc->vsc_be); /* use BAR 1 to map MSI-X table and PBA, if we're using MSI-X */ if (vi_intr_init(&sc->vsc_vs, 1, fbsdrun_virtio_msix())) @@ -951,11 +599,14 @@ sc->vsc_features = negotiated_features; - if (!(sc->vsc_features & VIRTIO_NET_F_MRG_RXBUF)) { + if (!(negotiated_features & VIRTIO_NET_F_MRG_RXBUF)) { sc->rx_merge = 0; /* non-merge rx header is 2 bytes shorter */ sc->rx_vhdrlen -= 2; } + + /* Tell the backend to enable some features it has advertised. */ + netbe_set_features(sc->vsc_be, negotiated_features, sc->rx_vhdrlen); } struct pci_devemu pci_de_vnet = {