From owner-svn-src-user@FreeBSD.ORG Thu Mar 21 06:56:36 2013
Message-Id: <201303210656.r2L6uabL065797@svn.freebsd.org>
From: Bryan Venteicher <bryanv@FreeBSD.org>
Date: Thu, 21 Mar 2013 06:56:36 +0000 (UTC)
To: src-committers@freebsd.org, svn-src-user@freebsd.org
Subject: svn commit: r248565 - user/bryanv/vtnetmq/sys/dev/virtio/network

Author: bryanv
Date: Thu Mar 21 06:56:35 2013
New Revision: 248565
URL: http://svnweb.freebsd.org/changeset/base/248565

Log:
  Commit a development snapshot of the multiqueue vtnet driver.

  This commit contains lots of cleanup, bug fixes, and enhancements,
  such as:
    - improved Rx/Tx checksumming
    - better handling of the deferred transmit and interrupt handlers
    - per-queue statistics, exported via sysctl

  A lot of work still remains.
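For context, per-queue counters such as vrxs_ipackets are conventionally
exported through the sysctl(9) tree. The sketch below is illustrative only,
not the committed vtnet_setup_rxq_sysctl(): the vrxs_* field names are taken
from identifiers visible in the diff, while the node layout, the struct name
vtnet_rxq_stats, and the assumption that the counters are uint64_t are all
guesses.

	/*
	 * Illustrative sketch (not the committed code): hang one node per
	 * Rx queue off the device's sysctl tree and export its counters.
	 * Requires <sys/param.h> and <sys/sysctl.h>; the vtnet types come
	 * from if_vtnetvar.h.  Names other than the vrxs_* fields seen in
	 * the diff are assumptions.
	 */
	static void
	example_setup_rxq_sysctl(struct sysctl_ctx_list *ctx,
	    struct sysctl_oid_list *child, struct vtnet_rxq *rxq)
	{
		struct sysctl_oid *node;
		struct sysctl_oid_list *list;
		struct vtnet_rxq_stats *stats;
		char namebuf[16];

		/* One "rxqN" node per queue, e.g. dev.vtnet.0.rxq0. */
		snprintf(namebuf, sizeof(namebuf), "rxq%d", rxq->vtnrx_id);
		node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, namebuf,
		    CTLFLAG_RD, NULL, "Receive Queue");
		list = SYSCTL_CHILDREN(node);

		stats = &rxq->vtnrx_stats;
		SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "ipackets", CTLFLAG_RD,
		    &stats->vrxs_ipackets, "Receive packets");
		SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "ibytes", CTLFLAG_RD,
		    &stats->vrxs_ibytes, "Receive bytes");
		SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "csum_failed", CTLFLAG_RD,
		    &stats->vrxs_csum_failed, "Receive checksum offload failures");
	}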
Modified:
  user/bryanv/vtnetmq/sys/dev/virtio/network/if_vtnet.c
  user/bryanv/vtnetmq/sys/dev/virtio/network/if_vtnetvar.h
  user/bryanv/vtnetmq/sys/dev/virtio/network/virtio_net.h

Modified: user/bryanv/vtnetmq/sys/dev/virtio/network/if_vtnet.c
==============================================================================
--- user/bryanv/vtnetmq/sys/dev/virtio/network/if_vtnet.c	Wed Mar 20 21:47:05 2013	(r248564)
+++ user/bryanv/vtnetmq/sys/dev/virtio/network/if_vtnet.c	Thu Mar 21 06:56:35 2013	(r248565)
@@ -62,6 +62,7 @@ __FBSDID("$FreeBSD$");
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -78,6 +79,9 @@ __FBSDID("$FreeBSD$");
 
 #include "virtio_if.h"
 
+#include "opt_inet.h"
+#include "opt_inet6.h"
+
 static int	vtnet_modevent(module_t, int, void *);
 static int	vtnet_probe(device_t);
@@ -110,7 +114,7 @@ static int vtnet_rxq_replace_lro_nomgr_b
 static int	vtnet_rxq_replace_buf(struct vtnet_rxq *, struct mbuf *, int);
 static int	vtnet_rxq_enqueue_buf(struct vtnet_rxq *, struct mbuf *);
 static int	vtnet_rxq_new_buf(struct vtnet_rxq *);
-static int	vtnet_rx_csum(struct vtnet_softc *, struct mbuf *,
+static int	vtnet_rxq_csum(struct vtnet_rxq *, struct mbuf *,
 		    struct virtio_net_hdr *);
 static void	vtnet_rxq_discard_merged_bufs(struct vtnet_rxq *, int);
 static void	vtnet_rxq_discard_buf(struct vtnet_rxq *, struct mbuf *);
@@ -119,11 +123,15 @@ static void vtnet_rxq_input(struct vtnet
 		    struct virtio_net_hdr *);
 static int	vtnet_rxq_eof(struct vtnet_rxq *);
 static void	vtnet_rx_vq_intr(void *);
-static void	vtnet_rxq_taskqueue(void *, int);
+static void	vtnet_rxq_tq_intr(void *, int);
 
 static void	vtnet_txq_free_mbufs(struct vtnet_txq *);
+static int	vtnet_txq_offload_ctx(struct vtnet_txq *, struct mbuf *,
+		    int *, int *, int *);
+static int	vtnet_txq_offload_tso(struct vtnet_txq *, struct mbuf *, int,
+		    int, struct virtio_net_hdr *);
 static struct mbuf *
-		vtnet_tx_offload(struct vtnet_softc *, struct mbuf *,
+		vtnet_txq_offload(struct vtnet_txq *, struct mbuf *,
 		    struct virtio_net_hdr *);
 static int	vtnet_txq_enqueue_buf(struct vtnet_txq *, struct mbuf **,
 		    struct vtnet_tx_header *);
@@ -134,9 +142,9 @@ static void vtnet_start(struct ifnet *);
 #else
 static int	vtnet_txq_mq_start_locked(struct vtnet_txq *, struct mbuf *);
 static int	vtnet_txq_mq_start(struct ifnet *, struct mbuf *);
-static void	vtnet_txq_taskqueue(void *, int);
-static void	vtnet_txq_tq_start(struct vtnet_txq *);
+static void	vtnet_txq_tq_deferred(void *, int);
 #endif
+static void	vtnet_txq_tq_intr(void *, int);
 static void	vtnet_txq_eof(struct vtnet_txq *);
 static void	vtnet_tx_vq_intr(void *);
@@ -155,13 +163,15 @@ static void vtnet_drain_rxtx_queues(stru
 static void	vtnet_stop_rendezvous(struct vtnet_softc *);
 static void	vtnet_stop(struct vtnet_softc *);
 static int	vtnet_virtio_reinit(struct vtnet_softc *);
+static void	vtnet_init_rx_filters(struct vtnet_softc *);
 static int	vtnet_init_rx_queues(struct vtnet_softc *);
+static int	vtnet_init_tx_queues(struct vtnet_softc *);
+static int	vtnet_init_rxtx_queues(struct vtnet_softc *);
 static void	vtnet_set_active_vq_pairs(struct vtnet_softc *);
 static int	vtnet_reinit(struct vtnet_softc *);
 static void	vtnet_init_locked(struct vtnet_softc *);
 static void	vtnet_init(void *);
-static void	vtnet_init_rx_filters(struct vtnet_softc *);
 static void	vtnet_free_ctrl_vq(struct vtnet_softc *);
 static void	vtnet_exec_ctrl_cmd(struct vtnet_softc *, void *,
 		    struct sglist *, int, int);
@@ -186,6 +196,11 @@ static void vtnet_get_hwaddr(struct vtne
 static void	vtnet_set_hwaddr(struct vtnet_softc *);
 static void	vtnet_vlan_tag_remove(struct mbuf *);
 
+static void	vtnet_setup_rxq_sysctl(struct sysctl_ctx_list *,
+		    struct sysctl_oid_list *, struct vtnet_rxq *);
+static void	vtnet_setup_txq_sysctl(struct sysctl_ctx_list *,
+		    struct sysctl_oid_list *, struct vtnet_txq *);
+static void	vtnet_setup_queue_sysctl(struct vtnet_softc *);
 static void	vtnet_setup_sysctl(struct vtnet_softc *);
 
 static int	vtnet_rxq_enable_intr(struct vtnet_rxq *);
@@ -214,12 +229,14 @@ static int vtnet_rx_process_limit = 256;
 TUNABLE_INT("hw.vtnet.rx_process_limit", &vtnet_rx_process_limit);
 
 /*
- * Reducing the number of transmit completed interrupts can
- * improve performance. To do so, the define below keeps the
- * Tx vq interrupt disabled and adds calls to vtnet_txeof()
- * in the start and watchdog paths. The price to pay for this
- * is the m_free'ing of transmitted mbufs may be delayed until
- * the watchdog fires.
+ * Reducing the number of transmit completed interrupts can improve
+ * performance. To do so, the define below keeps the Tx vq interrupt
+ * disabled and adds calls to vtnet_txeof() in the start and watchdog
+ * paths. The price to pay for this is the m_free'ing of transmitted
+ * mbufs may be delayed until the watchdog fires.
+ *
+ * BMV: Reintroduce this later as a run-time option, if it makes
+ * sense after the EVENT_IDX feature is supported.
  */
 #define VTNET_TX_INTR_MODERATION
@@ -366,18 +383,15 @@ vtnet_attach(device_t dev)
 	error = virtio_setup_intr(dev, INTR_TYPE_NET);
 	if (error) {
 		device_printf(dev, "cannot setup virtqueue interrupts\n");
+		/* BMV: This will crash if it happens during boot! */
 		ether_ifdetach(sc->vtnet_ifp);
 		goto fail;
 	}
 
-	vtnet_start_taskqueues(sc);
-
-	/*
-	 * Even though this is a polling operation, it must be done after
-	 * interrupts have been setup.
-	 */
 	vtnet_attach_disable_promisc(sc);
 
+	vtnet_start_taskqueues(sc);
+
 fail:
 	if (error)
 		vtnet_detach(dev);
@@ -610,7 +624,7 @@ vtnet_init_rxq(struct vtnet_softc *sc, i
 	rxq->vtnrx_id = id;
 	rxq->vtnrx_process_limit = vtnet_rx_process_limit;
 
-	TASK_INIT(&rxq->vtnrx_task, 0, vtnet_rxq_taskqueue, rxq);
+	TASK_INIT(&rxq->vtnrx_intrtask, 0, vtnet_rxq_tq_intr, rxq);
 	rxq->vtnrx_tq = taskqueue_create(rxq->vtnrx_name, M_NOWAIT,
 	    taskqueue_thread_enqueue, &rxq->vtnrx_tq);
@@ -637,12 +651,13 @@ vtnet_init_txq(struct vtnet_softc *sc, i
 	if (txq->vtntx_br == NULL)
 		return (ENOMEM);
 
-	TASK_INIT(&txq->vtntx_task, 0, vtnet_txq_taskqueue, txq);
+	TASK_INIT(&txq->vtntx_defrtask, 0, vtnet_txq_tq_deferred, txq);
+#endif
+	TASK_INIT(&txq->vtntx_intrtask, 0, vtnet_txq_tq_intr, txq);
 	txq->vtntx_tq = taskqueue_create(txq->vtntx_name, M_NOWAIT,
 	    taskqueue_thread_enqueue, &txq->vtntx_tq);
 	if (txq->vtntx_tq == NULL)
 		return (ENOMEM);
-#endif
 
 	return (0);
 }
@@ -650,18 +665,18 @@ vtnet_init_txq(struct vtnet_softc *sc, i
 static int
 vtnet_alloc_rxtx_queues(struct vtnet_softc *sc)
 {
-	int i, pairs, error;
+	int i, npairs, error;
 
-	pairs = sc->vtnet_max_vq_pairs;
+	npairs = sc->vtnet_max_vq_pairs;
 
-	sc->vtnet_rxqs = malloc(sizeof(struct vtnet_rxq) * pairs, M_DEVBUF,
+	sc->vtnet_rxqs = malloc(sizeof(struct vtnet_rxq) * npairs, M_DEVBUF,
 	    M_NOWAIT | M_ZERO);
-	sc->vtnet_txqs = malloc(sizeof(struct vtnet_txq) * pairs, M_DEVBUF,
+	sc->vtnet_txqs = malloc(sizeof(struct vtnet_txq) * npairs, M_DEVBUF,
 	    M_NOWAIT | M_ZERO);
 	if (sc->vtnet_rxqs == NULL || sc->vtnet_txqs == NULL)
 		return (ENOMEM);
 
-	for (i = 0; i < pairs; i++) {
+	for (i = 0; i < npairs; i++) {
 		error = vtnet_init_rxq(sc, i);
 		if (error)
 			return (error);
@@ -670,6 +685,8 @@ vtnet_alloc_rxtx_queues(struct vtnet_sof
 			return (error);
 	}
 
+	vtnet_setup_queue_sysctl(sc);
+
 	return (0);
 }
@@ -1051,10 +1068,7 @@ vtnet_ioctl(struct ifnet *ifp, u_long cm
 
 	if (mask & (IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6 | IFCAP_LRO |
 	    IFCAP_VLAN_HWFILTER)) {
-		/*
-		 * These Rx features require us to renegotiate with
-		 * the host.
-		 */
+		/* These Rx features require us to renegotiate. */
 		reinit = 1;
 
 		/*
@@ -1152,6 +1166,9 @@ vtnet_rx_alloc_buf(struct vtnet_softc *s
 
 	clsize = sc->vtnet_rx_clsize;
 
+	KASSERT(nbufs == 1 || sc->vtnet_flags & VTNET_FLAG_LRO_NOMRG,
+	    ("%s: chained mbuf %d request without LRO_NOMRG", __func__, nbufs));
+
 	m_head = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, clsize);
 	if (m_head == NULL)
 		goto fail;
@@ -1159,19 +1176,15 @@ vtnet_rx_alloc_buf(struct vtnet_softc *s
 	m_head->m_len = clsize;
 	m_tail = m_head;
 
-	if (nbufs > 1) {
-		KASSERT(sc->vtnet_flags & VTNET_FLAG_LRO_NOMRG,
-		    ("%s: chained mbuf request without LRO_NOMRG", __func__));
-
-		for (i = 1; i < nbufs; i++) {
-			m = m_getjcl(M_NOWAIT, MT_DATA, 0, clsize);
-			if (m == NULL)
-				goto fail;
-
-			m->m_len = clsize;
-			m_tail->m_next = m;
-			m_tail = m;
-		}
+	/* Allocate the rest of the chain. */
+	for (i = 1; i < nbufs; i++) {
+		m = m_getjcl(M_NOWAIT, MT_DATA, 0, clsize);
+		if (m == NULL)
+			goto fail;
+
+		m->m_len = clsize;
+		m_tail->m_next = m;
+		m_tail = m;
 	}
 
 	if (m_tailp != NULL)
@@ -1293,27 +1306,25 @@ vtnet_rxq_replace_buf(struct vtnet_rxq *
 	    ("%s: chained mbuf without LRO_NOMRG", __func__));
 
 	if (m->m_next == NULL) {
-		/*
-		 * Simplified fast-path for the common case of just one mbuf.
-		 *
-		 * BMV: This is a lot like vtnet_rxq_new_buf().
-		 */
-		m->m_len = MIN(m->m_len, len);	/* BMV XXX */
+		/* Fast-path for the common case of just one mbuf. */
+		if (m->m_len < len)
+			return (EINVAL);
 
 		m_new = vtnet_rx_alloc_buf(sc, 1, NULL);
-		if (m_new != NULL) {
-			error = vtnet_rxq_enqueue_buf(rxq, m_new);
-			if (error) {
-				/*
-				 * The new mbuf is suppose to be an identical
-				 * copy of the one just dequeued so this is an
-				 * unexpected error.
-				 */
-				m_freem(m_new);
-				sc->vtnet_stats.rx_enq_replacement_failed++;
-			}
+		if (m_new == NULL)
+			return (ENOBUFS);
+
+		error = vtnet_rxq_enqueue_buf(rxq, m_new);
+		if (error) {
+			/*
+			 * The new mbuf is supposed to be an identical
+			 * copy of the one just dequeued so this is an
+			 * unexpected error.
+			 */
+			m_freem(m_new);
+			sc->vtnet_stats.rx_enq_replacement_failed++;
 		} else
-			error = ENOBUFS;
+			m->m_len = len;
 	} else
 		error = vtnet_rxq_replace_lro_nomgr_buf(rxq, m, len);
@@ -1327,7 +1338,6 @@ vtnet_rxq_enqueue_buf(struct vtnet_rxq *
 	struct sglist_seg segs[VTNET_MAX_RX_SEGS];
 	struct vtnet_softc *sc;
 	struct vtnet_rx_header *rxhdr;
-	struct virtio_net_hdr *hdr;
 	uint8_t *mdata;
 	int offset, error;
@@ -1342,24 +1352,15 @@ vtnet_rxq_enqueue_buf(struct vtnet_rxq *
 	if ((sc->vtnet_flags & VTNET_FLAG_MRG_RXBUFS) == 0) {
 		MPASS(sc->vtnet_hdr_size == sizeof(struct virtio_net_hdr));
 		rxhdr = (struct vtnet_rx_header *) mdata;
-		hdr = &rxhdr->vrh_hdr;
+		sglist_append(&sg, &rxhdr->vrh_hdr, sc->vtnet_hdr_size);
 		offset = sizeof(struct vtnet_rx_header);
-		sglist_append(&sg, hdr, sc->vtnet_hdr_size);
 	} else
 		offset = 0;
 
-	/*
-	 * XXX BMV: Either sglist_append() should never fail here ...
-	 */
-
-	error = sglist_append(&sg, mdata + offset, m->m_len - offset);
-	if (error)
-		return (error);
-
+	sglist_append(&sg, mdata + offset, m->m_len - offset);
 	if (m->m_next != NULL) {
 		error = sglist_append_mbuf(&sg, m->m_next);
-		if (error)
-			return (error);
+		MPASS(error == 0);
 	}
 
 	error = virtqueue_enqueue(rxq->vtnrx_vq, m, &sg, 0, sg.sg_nseg);
@@ -1388,39 +1389,35 @@ vtnet_rxq_new_buf(struct vtnet_rxq *rxq)
 }
 
 /*
- * Set the appropriate CSUM_* flags. Unfortunately, the information
- * provided is not directly useful to us. The VirtIO header gives the
- * offset of the checksum, which is all Linux needs, but this is not
- * how FreeBSD does things. We are forced to peek inside the packet
- * a bit.
- *
- * It would be nice if VirtIO gave us the L4 protocol or if FreeBSD
- * could accept the offsets and let the stack figure it out.
+ * Use the checksum offset in the VirtIO header to set the
+ * correct CSUM_* flags.
  */
 static int
-vtnet_rx_csum(struct vtnet_softc *sc, struct mbuf *m,
-    struct virtio_net_hdr *hdr)
+vtnet_rxq_csum_by_offset(struct vtnet_rxq *rxq, struct mbuf *m,
+    uint16_t eth_type, int ip_start, struct virtio_net_hdr *hdr)
 {
-	struct ether_header *eh;
-	struct ether_vlan_header *evh;
-	int offset;
-	uint16_t eth_type;
-
-	offset = hdr->csum_start + hdr->csum_offset;
-
-	if (offset < sizeof(struct ether_header) + sizeof(struct ip))
-		return (1);
-	if (m->m_len < offset)
-		return (1);
+	struct vtnet_softc *sc;
+#if defined(INET) || defined(INET6)
+	int offset = hdr->csum_start + hdr->csum_offset;
+#endif
 
-	eh = mtod(m, struct ether_header *);
-	eth_type = ntohs(eh->ether_type);
-	if (eth_type == ETHERTYPE_VLAN) {
-		evh = mtod(m, struct ether_vlan_header *);
-		eth_type = ntohs(evh->evl_proto);
-	}
+	sc = rxq->vtnrx_sc;
 
-	if (eth_type != ETHERTYPE_IP && eth_type != ETHERTYPE_IPV6) {
+	/* Only do a basic sanity check on the offset. */
+	switch (eth_type) {
+#if defined(INET)
+	case ETHERTYPE_IP:
+		if (__predict_false(offset < ip_start + sizeof(struct ip)))
+			return (1);
+		break;
+#endif
+#if defined(INET6)
+	case ETHERTYPE_IPV6:
+		if (__predict_false(offset < ip_start + sizeof(struct ip6_hdr)))
+			return (1);
+		break;
+#endif
+	default:
 		sc->vtnet_stats.rx_csum_bad_ethtype++;
 		return (1);
 	}
@@ -1428,8 +1425,8 @@ vtnet_rx_csum(struct vtnet_softc *sc, st
 	/*
 	 * Use the offset to determine the appropriate CSUM_* flags. This
 	 * is a bit dirty, but we can get by with it since the checksum
-	 * offsets happen to be different. The implied assumption is that
-	 * the host does not do IPv4 header checksum offloading.
+	 * offsets happen to be different. We assume the host does
+	 * not do IPv4 header checksum offloading.
 	 */
 	switch (hdr->csum_offset) {
 	case offsetof(struct udphdr, uh_sum):
@@ -1437,21 +1434,116 @@ vtnet_rx_csum(struct vtnet_softc *sc, st
 		m->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
 		m->m_pkthdr.csum_data = 0xFFFF;
 		break;
-
 	case offsetof(struct sctphdr, checksum):
 		m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID;
 		break;
-
 	default:
 		sc->vtnet_stats.rx_csum_bad_offset++;
 		return (1);
 	}
 
-	sc->vtnet_stats.rx_csum_offloaded++;
+	return (0);
+}
+
+static int
+vtnet_rxq_csum_by_parse(struct vtnet_rxq *rxq, struct mbuf *m,
+    uint16_t eth_type, int ip_start, struct virtio_net_hdr *hdr)
+{
+	struct vtnet_softc *sc;
+	int offset, proto;
+
+	sc = rxq->vtnrx_sc;
+
+	switch (eth_type) {
+#if defined(INET)
+	case ETHERTYPE_IP: {
+		struct ip *ip;
+
+		if (__predict_false(m->m_len < ip_start + sizeof(struct ip)))
+			return (1);
+		ip = (struct ip *)(m->m_data + ip_start);
+		proto = ip->ip_p;
+		offset = ip_start + (ip->ip_hl << 2);
+		break;
+	}
+#endif
+#if defined(INET6)
+	case ETHERTYPE_IPV6:
+		if (__predict_false(m->m_len < ip_start +
+		    sizeof(struct ip6_hdr)))
+			return (1);
+		offset = ip6_lasthdr(m, ip_start, IPPROTO_IPV6, &proto);
+		if (__predict_false(offset < 0))
+			return (1);
+		break;
+#endif
+	default:
+		sc->vtnet_stats.rx_csum_bad_ethtype++;
+		return (1);
+	}
+
+	switch (proto) {
+	case IPPROTO_TCP:
+		if (__predict_false(m->m_len < offset + sizeof(struct tcphdr)))
+			return (1);
+		m->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
+		m->m_pkthdr.csum_data = 0xFFFF;
+		break;
+	case IPPROTO_UDP:
+		if (__predict_false(m->m_len < offset + sizeof(struct udphdr)))
+			return (1);
+		m->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
+		m->m_pkthdr.csum_data = 0xFFFF;
+		break;
+	case IPPROTO_SCTP:
+		if (__predict_false(m->m_len < offset + sizeof(struct sctphdr)))
+			return (1);
+		m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID;
+		break;
+	default:
+		sc->vtnet_stats.rx_csum_bad_proto++;
+		return (1);
+	}
 
 	return (0);
 }
 
+/*
+ * Set the appropriate CSUM_* flags. Unfortunately, the information
+ * provided is not directly useful to us. The VirtIO header gives the
+ * offset of the checksum, which is all Linux needs, but this is not
+ * how FreeBSD does things. We are forced to peek inside the packet
+ * a bit.
+ *
+ * It would be nice if VirtIO gave us the L4 protocol or if FreeBSD
+ * could accept the offsets and let the stack figure it out.
+ */
+static int
+vtnet_rxq_csum(struct vtnet_rxq *rxq, struct mbuf *m,
+    struct virtio_net_hdr *hdr)
+{
+	struct ether_header *eh;
+	struct ether_vlan_header *evh;
+	uint16_t eth_type;
+	int offset, error;
+
+	eh = mtod(m, struct ether_header *);
+	eth_type = ntohs(eh->ether_type);
+	if (eth_type == ETHERTYPE_VLAN) {
+		/* BMV: We should handle nested VLAN tags too. */
+		evh = mtod(m, struct ether_vlan_header *);
+		eth_type = ntohs(evh->evl_proto);
+		offset = sizeof(struct ether_vlan_header);
+	} else
+		offset = sizeof(struct ether_header);
+
+	if (hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM)
+		error = vtnet_rxq_csum_by_offset(rxq, m, eth_type, offset, hdr);
+	else
+		error = vtnet_rxq_csum_by_parse(rxq, m, eth_type, offset, hdr);
+
+	return (error);
+}
+
 static void
 vtnet_rxq_discard_merged_bufs(struct vtnet_rxq *rxq, int nbufs)
 {
@@ -1496,12 +1588,12 @@ vtnet_rxq_merged_eof(struct vtnet_rxq *r
 	while (--nbufs > 0) {
 		m = virtqueue_dequeue(vq, &len);
 		if (m == NULL) {
-			ifp->if_ierrors++;
+			rxq->vtnrx_stats.vrxs_ierrors++;
 			goto fail;
 		}
 
 		if (vtnet_rxq_new_buf(rxq) != 0) {
-			ifp->if_iqdrops++;
+			rxq->vtnrx_stats.vrxs_discarded++;
 			vtnet_rxq_discard_buf(rxq, m);
 			if (nbufs > 1)
 				vtnet_rxq_discard_merged_bufs(rxq, nbufs);
@@ -1555,14 +1647,25 @@ vtnet_rxq_input(struct vtnet_rxq *rxq, s
 	m->m_pkthdr.flowid = rxq->vtnrx_id;
 	m->m_flags |= M_FLOWID;
 
-	if (ifp->if_capenable & IFCAP_RXCSUM &&
-	    hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
-		if (vtnet_rx_csum(sc, m, hdr) != 0)
-			sc->vtnet_stats.rx_csum_failed++;
+	/*
+	 * BMV: FreeBSD does not have the UNNECESSARY and PARTIAL checksum
+	 * distinction that Linux does. Need to reevaluate if performing
+	 * offloading for the NEEDS_CSUM case is really appropriate.
+	 */
+	if (hdr->flags & (VIRTIO_NET_HDR_F_NEEDS_CSUM |
+	    VIRTIO_NET_HDR_F_DATA_VALID)) {
+		if (vtnet_rxq_csum(rxq, m, hdr) == 0)
+			rxq->vtnrx_stats.vrxs_csum++;
+		else
+			rxq->vtnrx_stats.vrxs_csum_failed++;
 	}
 
-	ifp->if_ipackets++;
+	rxq->vtnrx_stats.vrxs_ipackets++;
+	rxq->vtnrx_stats.vrxs_ibytes += m->m_pkthdr.len;
+
+	/* VTNET_RXQ_UNLOCK(rxq); */
 	(*ifp->if_input)(ifp, m);
+	/* VTNET_RXQ_LOCK(rxq); */
 }
 
 static int
@@ -1587,10 +1690,6 @@ vtnet_rxq_eof(struct vtnet_rxq *rxq)
 
 	VTNET_RXQ_LOCK_ASSERT(rxq);
 
-	/*
-	 * `count` limits how many leading descriptors we dequeue. The
-	 * actual number could be higher if there are merged buffers.
-	 */
 	while (count-- > 0) {
 		m = virtqueue_dequeue(vq, &len);
 		if (m == NULL)
@@ -1598,7 +1697,7 @@ vtnet_rxq_eof(struct vtnet_rxq *rxq)
 		deq++;
 
 		if (len < sc->vtnet_hdr_size + ETHER_HDR_LEN) {
-			ifp->if_ierrors++;
+			rxq->vtnrx_stats.vrxs_discarded++;
 			vtnet_rxq_discard_buf(rxq, m);
 			continue;
 		}
@@ -1618,7 +1717,7 @@ vtnet_rxq_eof(struct vtnet_rxq *rxq)
 		}
 
 		if (vtnet_rxq_replace_buf(rxq, m, len) != 0) {
-			ifp->if_iqdrops++;
+			rxq->vtnrx_stats.vrxs_discarded++;
 			vtnet_rxq_discard_buf(rxq, m);
 			if (nbufs > 1)
 				vtnet_rxq_discard_merged_bufs(rxq, nbufs);
@@ -1642,7 +1741,7 @@ vtnet_rxq_eof(struct vtnet_rxq *rxq)
 			 * regular header.
 			 *
 			 * BMV: Is this memcpy() expensive? We know the mbuf data is
-			 * still valid after we adjust it.
+			 * still valid even after the m_adj().
 			 */
 			memcpy(hdr, mtod(m, void *), sizeof(struct virtio_net_hdr));
 			m_adj(m, adjsz);
@@ -1662,18 +1761,19 @@ vtnet_rx_vq_intr(void *xrxq)
 	struct vtnet_softc *sc;
 	struct vtnet_rxq *rxq;
 	struct ifnet *ifp;
-	int more;
+	int tries, more;
 
 	rxq = xrxq;
 	sc = rxq->vtnrx_sc;
 	ifp = sc->vtnet_ifp;
+	tries = 0;
 
 	if (__predict_false(rxq->vtnrx_id >= sc->vtnet_act_vq_pairs)) {
 		/*
-		 * Ignore this interrupt. Either the host generated a spurious
-		 * interrupt (probably unlikely) or we have multiqueue without
-		 * per-VQ MSIX so every queue needs to be polled (brain dead
-		 * configuration we could try harder to avoid).
+		 * Ignore this interrupt. Either this is a spurious interrupt
+		 * or multiqueue without per-VQ MSIX so every queue needs to
+		 * be polled (a brain dead configuration we could try harder
+		 * to avoid).
		 */
 		vtnet_rxq_disable_intr(rxq);
 		return;
@@ -1683,7 +1783,6 @@ again:
 	VTNET_RXQ_LOCK(rxq);
 
 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
-		vtnet_rxq_enable_intr(rxq);
 		VTNET_RXQ_UNLOCK(rxq);
 		return;
 	}
@@ -1692,21 +1791,47 @@ again:
 	if (more || vtnet_rxq_enable_intr(rxq) != 0) {
 		if (!more)
 			vtnet_rxq_disable_intr(rxq);
-		sc->vtnet_stats.rx_task_rescheduled++;
+		/*
+		 * This is an occasional condition or race (when !more),
+		 * so retry a few times before scheduling the taskqueue.
+		 */
+		rxq->vtnrx_stats.vrxs_rescheduled++;
+		VTNET_RXQ_UNLOCK(rxq);
+		if (tries++ < VTNET_INTR_DISABLE_RETRIES)
+			goto again;
+		taskqueue_enqueue(rxq->vtnrx_tq, &rxq->vtnrx_intrtask);
+	} else
 		VTNET_RXQ_UNLOCK(rxq);
-		goto again;
-	}
-
-	VTNET_RXQ_UNLOCK(rxq);
 }
 
 static void
-vtnet_rxq_taskqueue(void *xrxq, int pending)
+vtnet_rxq_tq_intr(void *xrxq, int pending)
 {
+	struct vtnet_softc *sc;
+	struct vtnet_rxq *rxq;
+	struct ifnet *ifp;
+	int more;
 
-	/*
-	 * BMV: Do stuff here when we defer in vtnet_rx_vq_intr().
-	 */
+	rxq = xrxq;
+	sc = rxq->vtnrx_sc;
+	ifp = sc->vtnet_ifp;
+
+	VTNET_RXQ_LOCK(rxq);
+
+	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
+		VTNET_RXQ_UNLOCK(rxq);
+		return;
+	}
+
+	more = vtnet_rxq_eof(rxq);
+	if (more || vtnet_rxq_enable_intr(rxq) != 0) {
+		if (!more)
+			vtnet_rxq_disable_intr(rxq);
+		rxq->vtnrx_stats.vrxs_rescheduled++;
+		taskqueue_enqueue(rxq->vtnrx_tq, &rxq->vtnrx_intrtask);
+	}
+
+	VTNET_RXQ_UNLOCK(rxq);
 }
 
 static void
@@ -1729,121 +1854,150 @@ vtnet_txq_free_mbufs(struct vtnet_txq *t
 }
 
 /*
- * BMV: Uggg ... rewrite this function.
+ * BMV: Much of this can go away once we finally have offsets in
+ * the mbuf packet header. Bug andre@.
  */
-static struct mbuf *
-vtnet_tx_offload(struct vtnet_softc *sc, struct mbuf *m,
-    struct virtio_net_hdr *hdr)
+static int
+vtnet_txq_offload_ctx(struct vtnet_txq *txq, struct mbuf *m,
+    int *etype, int *proto, int *start)
 {
-	struct ifnet *ifp;
-	struct ether_header *eh;
+	struct vtnet_softc *sc;
 	struct ether_vlan_header *evh;
-	struct ip *ip;
-	struct ip6_hdr *ip6;
-	struct tcphdr *tcp;
-	int ip_offset;
-	uint16_t eth_type, csum_start;
-	uint8_t ip_proto, gso_type;
+	int offset;
 
-	ifp = sc->vtnet_ifp;
+	sc = txq->vtntx_sc;
 
-	ip_offset = sizeof(struct ether_header);
-	if (m->m_len < ip_offset) {
-		if ((m = m_pullup(m, ip_offset)) == NULL)
-			return (NULL);
+	evh = mtod(m, struct ether_vlan_header *);
+	if (evh->evl_encap_proto == htons(ETHERTYPE_VLAN)) {
+		/* BMV: We should handle nested VLAN tags too. */
+		*etype = ntohs(evh->evl_proto);
+		offset = sizeof(struct ether_vlan_header);
+	} else {
+		*etype = ntohs(evh->evl_encap_proto);
+		offset = sizeof(struct ether_header);
 	}
 
-	eh = mtod(m, struct ether_header *);
-	eth_type = ntohs(eh->ether_type);
-	if (eth_type == ETHERTYPE_VLAN) {
-		ip_offset = sizeof(struct ether_vlan_header);
-		if (m->m_len < ip_offset) {
-			if ((m = m_pullup(m, ip_offset)) == NULL)
-				return (NULL);
-		}
-		evh = mtod(m, struct ether_vlan_header *);
-		eth_type = ntohs(evh->evl_proto);
+	switch (*etype) {
+#if defined(INET)
+	case ETHERTYPE_IP: {
+		struct ip *ip, iphdr;
+
+		if (__predict_false(m->m_len < offset + sizeof(struct ip))) {
+			m_copydata(m, offset, sizeof(struct ip),
+			    (caddr_t) &iphdr);
+			ip = &iphdr;
+		} else
+			ip = (struct ip *)(m->m_data + offset);
+		*proto = ip->ip_p;
+		*start = offset + (ip->ip_hl << 2);
+		break;
+	}
+#endif
+#if defined(INET6)
+	case ETHERTYPE_IPV6:
+		*proto = -1;
+		*start = ip6_lasthdr(m, offset, IPPROTO_IPV6, proto);
+		/* Assert the network stack sends us a valid packet. */
+		KASSERT(*start > offset,
+		    ("%s: mbuf %p start %d offset %d proto %d", __func__, m,
+		    *start, offset, *proto));
+		break;
+#endif
+	default:
+		sc->vtnet_stats.tx_csum_bad_ethtype++;
+		return (EINVAL);
 	}
 
-	switch (eth_type) {
-	case ETHERTYPE_IP:
-		if (m->m_len < ip_offset + sizeof(struct ip)) {
-			m = m_pullup(m, ip_offset + sizeof(struct ip));
-			if (m == NULL)
-				return (NULL);
-		}
+	return (0);
+}
 
-		ip = (struct ip *)(mtod(m, uint8_t *) + ip_offset);
-		ip_proto = ip->ip_p;
-		csum_start = ip_offset + (ip->ip_hl << 2);
-		gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
-		break;
+static int
+vtnet_txq_offload_tso(struct vtnet_txq *txq, struct mbuf *m, int eth_type,
+    int offset, struct virtio_net_hdr *hdr)
+{
+	static struct timeval lastecn;
+	static int curecn;
+	struct vtnet_softc *sc;
+	struct tcphdr *tcp, tcphdr;
 
-	case ETHERTYPE_IPV6:
-		if (m->m_len < ip_offset + sizeof(struct ip6_hdr)) {
-			m = m_pullup(m, ip_offset + sizeof(struct ip6_hdr));
-			if (m == NULL)
-				return (NULL);
-		}
+	sc = txq->vtntx_sc;
+
+	if (__predict_false(m->m_len < offset + sizeof(struct tcphdr))) {
+		m_copydata(m, offset, sizeof(struct tcphdr), (caddr_t) &tcphdr);
+		tcp = &tcphdr;
+	} else
+		tcp = (struct tcphdr *)(m->m_data + offset);
+
+	hdr->hdr_len = offset + (tcp->th_off << 2);
+	hdr->gso_size = m->m_pkthdr.tso_segsz;
+	hdr->gso_type = eth_type == ETHERTYPE_IP ? VIRTIO_NET_HDR_GSO_TCPV4 :
+	    VIRTIO_NET_HDR_GSO_TCPV6;
 
-		ip6 = (struct ip6_hdr *)(mtod(m, uint8_t *) + ip_offset);
+	if (tcp->th_flags & TH_CWR) {
 		/*
-		 * XXX Assume no extension headers are present. Presently,
-		 * this will always be true in the case of TSO, and FreeBSD
-		 * does not perform checksum offloading of IPv6 yet.
+		 * Drop if VIRTIO_NET_F_HOST_ECN was not negotiated. In FreeBSD
+		 * ECN support is not on a per-interface basis, but globally via
+		 * the net.inet.tcp.ecn.enable sysctl knob. The default is off.
 		 */
-		ip_proto = ip6->ip6_nxt;
-		csum_start = ip_offset + sizeof(struct ip6_hdr);
-		gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
-		break;
-
-	default:
-		return (m);
+		if ((sc->vtnet_flags & VTNET_FLAG_TSO_ECN) == 0) {
+			if (ppsratecheck(&lastecn, &curecn, 1))
+				if_printf(sc->vtnet_ifp,
+				    "TSO with ECN not negotiated with host\n");
+			return (ENOTSUP);
+		}
+		hdr->gso_type |= VIRTIO_NET_HDR_GSO_ECN;
 	}
 
-	if (m->m_pkthdr.csum_flags & VTNET_CSUM_OFFLOAD) {
-		hdr->flags |= VIRTIO_NET_HDR_F_NEEDS_CSUM;
-		hdr->csum_start = csum_start;
-		hdr->csum_offset = m->m_pkthdr.csum_data;
+	txq->vtntx_stats.vtxs_tso++;
 
-		sc->vtnet_stats.tx_csum_offloaded++;
-	}
+	return (0);
+}
 
-	if (m->m_pkthdr.csum_flags & CSUM_TSO) {
-		if (ip_proto != IPPROTO_TCP)
-			return (m);
+static struct mbuf *
+vtnet_txq_offload(struct vtnet_txq *txq, struct mbuf *m,
+    struct virtio_net_hdr *hdr)
+{
+	struct vtnet_softc *sc;
+	int flags, etype, csum_start, proto, error;
 
-		if (m->m_len < csum_start + sizeof(struct tcphdr)) {
-			m = m_pullup(m, csum_start + sizeof(struct tcphdr));
-			if (m == NULL)
-				return (NULL);
-		}
+	sc = txq->vtntx_sc;
+	flags = m->m_pkthdr.csum_flags;
 
-		tcp = (struct tcphdr *)(mtod(m, uint8_t *) + csum_start);
-		hdr->gso_type = gso_type;
-		hdr->hdr_len = csum_start + (tcp->th_off << 2);
-		hdr->gso_size = m->m_pkthdr.tso_segsz;
+	error = vtnet_txq_offload_ctx(txq, m, &etype, &proto, &csum_start);
+	if (error)
+		goto drop;
 
-		if (tcp->th_flags & TH_CWR) {
-			/*
-			 * Drop if we did not negotiate VIRTIO_NET_F_HOST_ECN.
-			 * ECN support is only configurable globally with the
-			 * net.inet.tcp.ecn.enable sysctl knob.
-			 */
-			if ((sc->vtnet_flags & VTNET_FLAG_TSO_ECN) == 0) {
-				if_printf(ifp, "TSO with ECN not supported "
-				    "by host\n");
-				m_freem(m);
-				return (NULL);
-			}
+	if ((etype == ETHERTYPE_IP && flags & VTNET_CSUM_OFFLOAD) ||
+	    (etype == ETHERTYPE_IPV6 && flags & VTNET_CSUM_OFFLOAD_IPV6)) {
+		/*
+		 * We could compare the IP protocol vs the CSUM_ flag too,
+		 * but that really should not be necessary.
+		 */
+		hdr->flags |= VIRTIO_NET_HDR_F_NEEDS_CSUM;
+		hdr->csum_start = csum_start;
+		hdr->csum_offset = m->m_pkthdr.csum_data;
+		txq->vtntx_stats.vtxs_csum++;
+	}
 
-			hdr->flags |= VIRTIO_NET_HDR_GSO_ECN;
+	if (flags & CSUM_TSO) {
+		if (__predict_false(proto != IPPROTO_TCP)) {
+			/* Likely failed to correctly parse the mbuf. */
+			sc->vtnet_stats.tx_tso_not_tcp++;
+			goto drop;
 		}
 
-		sc->vtnet_stats.tx_tso_offloaded++;
+		KASSERT(hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM,
+		    ("%s: mbuf %p TSO without checksum offload", __func__, m));
+
+		error = vtnet_txq_offload_tso(txq, m, etype, csum_start, hdr);
+		if (error)
+			goto drop;
 	}
 
 	return (m);
+
+drop:
+	m_freem(m);
+	return (NULL);
 }
 
 static int
@@ -1879,6 +2033,7 @@ again:
 
 		*m_head = m;
 		collapsed = 1;
+		txq->vtntx_stats.vtxs_collapsed++;
 		goto again;
 	}
@@ -1920,29 +2075,26 @@ vtnet_txq_encap(struct vtnet_txq *txq, s
 	 * The vtnet_hdr_size is used to enqueue the correct header size.
 	 */
 	hdr = &txhdr->vth_uhdr.hdr;
-
+	error = ENOBUFS;
 	if (m->m_flags & M_VLANTAG) {
 		m = ether_vlanencap(m, m->m_pkthdr.ether_vtag);
-		if ((*m_head = m) == NULL) {
-			error = ENOBUFS;
+		if ((*m_head = m) == NULL)
 			goto fail;
-		}
 		m->m_flags &= ~M_VLANTAG;
 	}
 
-	if (m->m_pkthdr.csum_flags != 0) {
-		m = vtnet_tx_offload(sc, m, hdr);
-		if ((*m_head = m) == NULL) {
-			error = ENOBUFS;
+	if (m->m_pkthdr.csum_flags & VTNET_CSUM_ALL_OFFLOAD) {
+		m = vtnet_txq_offload(txq, m, hdr);
+		if ((*m_head = m) == NULL)
 			goto fail;
-		}
 	}
 
 	error = vtnet_txq_enqueue_buf(txq, m_head, txhdr);
+	if (error == 0)
+		return (0);
 
 fail:
-	if (error)
-		uma_zfree(vtnet_tx_header_zone, txhdr);
+	uma_zfree(vtnet_tx_header_zone, txhdr);
 
 	return (error);
 }
@@ -2071,7 +2223,6 @@ vtnet_txq_mq_start(struct ifnet *ifp, st
 	sc = ifp->if_softc;
 	npairs = sc->vtnet_act_vq_pairs;
 
-	/* BMV: Is this the best way to determine which queue? */
 	if (m->m_flags & M_FLOWID)
 		i = m->m_pkthdr.flowid % npairs;
 	else
@@ -2084,14 +2235,14 @@ vtnet_txq_mq_start(struct ifnet *ifp, st
 		VTNET_TXQ_UNLOCK(txq);
 	} else {
 		error = drbr_enqueue(ifp, txq->vtntx_br, m);
-		vtnet_txq_tq_start(txq);
+		taskqueue_enqueue(txq->vtntx_tq, &txq->vtntx_defrtask);
 	}
 
 	return (error);
 }
 
 static void
-vtnet_txq_taskqueue(void *xtxq, int pending)
+vtnet_txq_tq_deferred(void *xtxq, int pending)
 {
 	struct vtnet_softc *sc;
 	struct vtnet_txq *txq;
@@ -2105,36 +2256,64 @@ vtnet_txq_taskqueue(void *xtxq, int pend

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
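The diff output is cut off just as the body of vtnet_txq_tq_deferred() begins.
A deferred-start handler of this shape conventionally re-takes the queue lock
and drains the buf_ring that vtnet_txq_mq_start() fed with drbr_enqueue(). The
following is a minimal sketch of that pattern, not the committed body (which
is truncated above); it assumes the VTNET_TXQ_* lock macros, the vtntx_*
fields, and the vtnet_txq_mq_start_locked() helper visible earlier in the
diff, and that passing a NULL mbuf simply drains the ring.

	/*
	 * Minimal sketch of a deferred-start task handler; the committed
	 * body is truncated above.  All names are taken from the diff,
	 * but their exact use here is an assumption.
	 */
	static void
	example_txq_tq_deferred(void *xtxq, int pending)
	{
		struct vtnet_txq *txq;
		struct ifnet *ifp;

		txq = xtxq;
		ifp = txq->vtntx_sc->vtnet_ifp;

		VTNET_TXQ_LOCK(txq);
		/* Drain any packets queued while the Tx lock was held. */
		if ((ifp->if_drv_flags & IFF_DRV_RUNNING) != 0 &&
		    !drbr_empty(ifp, txq->vtntx_br))
			vtnet_txq_mq_start_locked(txq, NULL);
		VTNET_TXQ_UNLOCK(txq);
	}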