From owner-svn-src-user@FreeBSD.ORG Thu Mar 21 06:56:36 2013
Message-Id: <201303210656.r2L6uabL065797@svn.freebsd.org>
From: Bryan Venteicher <bryanv@FreeBSD.org>
Date: Thu, 21 Mar 2013 06:56:36 +0000 (UTC)
To: src-committers@freebsd.org, svn-src-user@freebsd.org
Subject: svn commit: r248565 - user/bryanv/vtnetmq/sys/dev/virtio/network

Author: bryanv
Date: Thu Mar 21 06:56:35 2013
New Revision: 248565
URL: http://svnweb.freebsd.org/changeset/base/248565

Log:
  Commit a development snapshot of the multiqueue vtnet driver.

  This commit contains lots of cleanup, bug fixes, and enhancements,
  such as:
    - improved Rx/Tx checksumming
    - better handling of the deferred transmit and interrupt handlers
    - per-queue statistics, exported via sysctl

  A lot of work still remains.
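For context, per-queue counters such as vrxs_ipackets are conventionally
exported through the sysctl(9) tree. The sketch below is illustrative only,
not the committed vtnet_setup_rxq_sysctl(): the vrxs_* field names are taken
from identifiers visible in the diff, while the node layout, the struct name
vtnet_rxq_stats, and the assumption that the counters are uint64_t are all
guesses.

	/*
	 * Illustrative sketch (not the committed code): hang one node per
	 * Rx queue off the device's sysctl tree and export its counters.
	 * Requires <sys/param.h> and <sys/sysctl.h>; the vtnet types come
	 * from if_vtnetvar.h.  Names other than the vrxs_* fields seen in
	 * the diff are assumptions.
	 */
	static void
	example_setup_rxq_sysctl(struct sysctl_ctx_list *ctx,
	    struct sysctl_oid_list *child, struct vtnet_rxq *rxq)
	{
		struct sysctl_oid *node;
		struct sysctl_oid_list *list;
		struct vtnet_rxq_stats *stats;
		char namebuf[16];

		/* One "rxqN" node per queue, e.g. dev.vtnet.0.rxq0. */
		snprintf(namebuf, sizeof(namebuf), "rxq%d", rxq->vtnrx_id);
		node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, namebuf,
		    CTLFLAG_RD, NULL, "Receive Queue");
		list = SYSCTL_CHILDREN(node);

		stats = &rxq->vtnrx_stats;
		SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "ipackets", CTLFLAG_RD,
		    &stats->vrxs_ipackets, "Receive packets");
		SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "ibytes", CTLFLAG_RD,
		    &stats->vrxs_ibytes, "Receive bytes");
		SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "csum_failed", CTLFLAG_RD,
		    &stats->vrxs_csum_failed, "Receive checksum offload failures");
	}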
Modified:
  user/bryanv/vtnetmq/sys/dev/virtio/network/if_vtnet.c
  user/bryanv/vtnetmq/sys/dev/virtio/network/if_vtnetvar.h
  user/bryanv/vtnetmq/sys/dev/virtio/network/virtio_net.h

Modified: user/bryanv/vtnetmq/sys/dev/virtio/network/if_vtnet.c
==============================================================================
--- user/bryanv/vtnetmq/sys/dev/virtio/network/if_vtnet.c	Wed Mar 20 21:47:05 2013	(r248564)
+++ user/bryanv/vtnetmq/sys/dev/virtio/network/if_vtnet.c	Thu Mar 21 06:56:35 2013	(r248565)
@@ -62,6 +62,7 @@ __FBSDID("$FreeBSD$");
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -78,6 +79,9 @@ __FBSDID("$FreeBSD$");
 
 #include "virtio_if.h"
 
+#include "opt_inet.h"
+#include "opt_inet6.h"
+
 static int	vtnet_modevent(module_t, int, void *);
 static int	vtnet_probe(device_t);
@@ -110,7 +114,7 @@ static int vtnet_rxq_replace_lro_nomgr_b
 static int	vtnet_rxq_replace_buf(struct vtnet_rxq *, struct mbuf *, int);
 static int	vtnet_rxq_enqueue_buf(struct vtnet_rxq *, struct mbuf *);
 static int	vtnet_rxq_new_buf(struct vtnet_rxq *);
-static int	vtnet_rx_csum(struct vtnet_softc *, struct mbuf *,
+static int	vtnet_rxq_csum(struct vtnet_rxq *, struct mbuf *,
 		    struct virtio_net_hdr *);
 static void	vtnet_rxq_discard_merged_bufs(struct vtnet_rxq *, int);
 static void	vtnet_rxq_discard_buf(struct vtnet_rxq *, struct mbuf *);
@@ -119,11 +123,15 @@ static void vtnet_rxq_input(struct vtnet
 		    struct virtio_net_hdr *);
 static int	vtnet_rxq_eof(struct vtnet_rxq *);
 static void	vtnet_rx_vq_intr(void *);
-static void	vtnet_rxq_taskqueue(void *, int);
+static void	vtnet_rxq_tq_intr(void *, int);
 
 static void	vtnet_txq_free_mbufs(struct vtnet_txq *);
+static int	vtnet_txq_offload_ctx(struct vtnet_txq *, struct mbuf *,
+		    int *, int *, int *);
+static int	vtnet_txq_offload_tso(struct vtnet_txq *, struct mbuf *, int,
+		    int, struct virtio_net_hdr *);
 static struct mbuf *
-		vtnet_tx_offload(struct vtnet_softc *, struct mbuf *,
+		vtnet_txq_offload(struct vtnet_txq *, struct mbuf *,
 		    struct virtio_net_hdr *);
 static int	vtnet_txq_enqueue_buf(struct vtnet_txq *, struct mbuf **,
 		    struct vtnet_tx_header *);
@@ -134,9 +142,9 @@ static void vtnet_start(struct ifnet *);
 #else
 static int	vtnet_txq_mq_start_locked(struct vtnet_txq *, struct mbuf *);
 static int	vtnet_txq_mq_start(struct ifnet *, struct mbuf *);
-static void	vtnet_txq_taskqueue(void *, int);
-static void	vtnet_txq_tq_start(struct vtnet_txq *);
+static void	vtnet_txq_tq_deferred(void *, int);
 #endif
+static void	vtnet_txq_tq_intr(void *, int);
 static void	vtnet_txq_eof(struct vtnet_txq *);
 static void	vtnet_tx_vq_intr(void *);
@@ -155,13 +163,15 @@ static void vtnet_drain_rxtx_queues(stru
 static void	vtnet_stop_rendezvous(struct vtnet_softc *);
 static void	vtnet_stop(struct vtnet_softc *);
 static int	vtnet_virtio_reinit(struct vtnet_softc *);
+static void	vtnet_init_rx_filters(struct vtnet_softc *);
 static int	vtnet_init_rx_queues(struct vtnet_softc *);
+static int	vtnet_init_tx_queues(struct vtnet_softc *);
+static int	vtnet_init_rxtx_queues(struct vtnet_softc *);
 static void	vtnet_set_active_vq_pairs(struct vtnet_softc *);
 static int	vtnet_reinit(struct vtnet_softc *);
 static void	vtnet_init_locked(struct vtnet_softc *);
 static void	vtnet_init(void *);
-static void	vtnet_init_rx_filters(struct vtnet_softc *);
 static void	vtnet_free_ctrl_vq(struct vtnet_softc *);
 static void	vtnet_exec_ctrl_cmd(struct vtnet_softc *, void *,
 		    struct sglist *, int, int);
@@ -186,6 +196,11 @@ static void vtnet_get_hwaddr(struct vtne
 static void	vtnet_set_hwaddr(struct vtnet_softc *);
 static void	vtnet_vlan_tag_remove(struct mbuf *);
 
+static void	vtnet_setup_rxq_sysctl(struct sysctl_ctx_list *,
+		    struct sysctl_oid_list *, struct vtnet_rxq *);
+static void	vtnet_setup_txq_sysctl(struct sysctl_ctx_list *,
+		    struct sysctl_oid_list *, struct vtnet_txq *);
+static void	vtnet_setup_queue_sysctl(struct vtnet_softc *);
 static void	vtnet_setup_sysctl(struct vtnet_softc *);
 
 static int	vtnet_rxq_enable_intr(struct vtnet_rxq *);
@@ -214,12 +229,14 @@ static int vtnet_rx_process_limit = 256;
 TUNABLE_INT("hw.vtnet.rx_process_limit", &vtnet_rx_process_limit);
 
 /*
- * Reducing the number of transmit completed interrupts can
- * improve performance. To do so, the define below keeps the
- * Tx vq interrupt disabled and adds calls to vtnet_txeof()
- * in the start and watchdog paths. The price to pay for this
- * is the m_free'ing of transmitted mbufs may be delayed until
- * the watchdog fires.
+ * Reducing the number of transmit completed interrupts can improve
+ * performance. To do so, the define below keeps the Tx vq interrupt
+ * disabled and adds calls to vtnet_txeof() in the start and watchdog
+ * paths. The price to pay for this is the m_free'ing of transmitted
+ * mbufs may be delayed until the watchdog fires.
+ *
+ * BMV: Reintroduce this later as a run-time option, if it makes
+ * sense after the EVENT_IDX feature is supported.
  */
 #define VTNET_TX_INTR_MODERATION
@@ -366,18 +383,15 @@ vtnet_attach(device_t dev)
 	error = virtio_setup_intr(dev, INTR_TYPE_NET);
 	if (error) {
 		device_printf(dev, "cannot setup virtqueue interrupts\n");
+		/* BMV: This will crash if it happens during boot! */
 		ether_ifdetach(sc->vtnet_ifp);
 		goto fail;
 	}
 
-	vtnet_start_taskqueues(sc);
-
-	/*
-	 * Even though this is a polling operation, it must be done after
-	 * interrupts have been setup.
-	 */
 	vtnet_attach_disable_promisc(sc);
 
+	vtnet_start_taskqueues(sc);
+
 fail:
 	if (error)
 		vtnet_detach(dev);
@@ -610,7 +624,7 @@ vtnet_init_rxq(struct vtnet_softc *sc, i
 	rxq->vtnrx_id = id;
 	rxq->vtnrx_process_limit = vtnet_rx_process_limit;
 
-	TASK_INIT(&rxq->vtnrx_task, 0, vtnet_rxq_taskqueue, rxq);
+	TASK_INIT(&rxq->vtnrx_intrtask, 0, vtnet_rxq_tq_intr, rxq);
 	rxq->vtnrx_tq = taskqueue_create(rxq->vtnrx_name, M_NOWAIT,
 	    taskqueue_thread_enqueue, &rxq->vtnrx_tq);
@@ -637,12 +651,13 @@ vtnet_init_txq(struct vtnet_softc *sc, i
 	if (txq->vtntx_br == NULL)
 		return (ENOMEM);
 
-	TASK_INIT(&txq->vtntx_task, 0, vtnet_txq_taskqueue, txq);
+	TASK_INIT(&txq->vtntx_defrtask, 0, vtnet_txq_tq_deferred, txq);
+#endif
+	TASK_INIT(&txq->vtntx_intrtask, 0, vtnet_txq_tq_intr, txq);
 	txq->vtntx_tq = taskqueue_create(txq->vtntx_name, M_NOWAIT,
 	    taskqueue_thread_enqueue, &txq->vtntx_tq);
 	if (txq->vtntx_tq == NULL)
 		return (ENOMEM);
-#endif
 
 	return (0);
 }
@@ -650,18 +665,18 @@ vtnet_init_txq(struct vtnet_softc *sc, i
 static int
 vtnet_alloc_rxtx_queues(struct vtnet_softc *sc)
 {
-	int i, pairs, error;
+	int i, npairs, error;
 
-	pairs = sc->vtnet_max_vq_pairs;
+	npairs = sc->vtnet_max_vq_pairs;
 
-	sc->vtnet_rxqs = malloc(sizeof(struct vtnet_rxq) * pairs, M_DEVBUF,
+	sc->vtnet_rxqs = malloc(sizeof(struct vtnet_rxq) * npairs, M_DEVBUF,
 	    M_NOWAIT | M_ZERO);
-	sc->vtnet_txqs = malloc(sizeof(struct vtnet_txq) * pairs, M_DEVBUF,
+	sc->vtnet_txqs = malloc(sizeof(struct vtnet_txq) * npairs, M_DEVBUF,
 	    M_NOWAIT | M_ZERO);
 	if (sc->vtnet_rxqs == NULL || sc->vtnet_txqs == NULL)
 		return (ENOMEM);
 
-	for (i = 0; i < pairs; i++) {
+	for (i = 0; i < npairs; i++) {
 		error = vtnet_init_rxq(sc, i);
 		if (error)
 			return (error);
@@ -670,6 +685,8 @@ vtnet_alloc_rxtx_queues(struct vtnet_sof
 			return (error);
 	}
 
+	vtnet_setup_queue_sysctl(sc);
+
 	return (0);
 }
@@ -1051,10 +1068,7 @@ vtnet_ioctl(struct ifnet *ifp, u_long cm
 
 	if (mask & (IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6 | IFCAP_LRO |
 	    IFCAP_VLAN_HWFILTER)) {
-		/*
-		 * These Rx features require us to renegotiate with
-		 * the host.
-		 */
+		/* These Rx features require us to renegotiate. */
 		reinit = 1;
 
 		/*
@@ -1152,6 +1166,9 @@ vtnet_rx_alloc_buf(struct vtnet_softc *s
 
 	clsize = sc->vtnet_rx_clsize;
 
+	KASSERT(nbufs == 1 || sc->vtnet_flags & VTNET_FLAG_LRO_NOMRG,
+	    ("%s: chained mbuf %d request without LRO_NOMRG", __func__, nbufs));
+
 	m_head = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, clsize);
 	if (m_head == NULL)
 		goto fail;
@@ -1159,19 +1176,15 @@ vtnet_rx_alloc_buf(struct vtnet_softc *s
 	m_head->m_len = clsize;
 	m_tail = m_head;
 
-	if (nbufs > 1) {
-		KASSERT(sc->vtnet_flags & VTNET_FLAG_LRO_NOMRG,
-		    ("%s: chained mbuf request without LRO_NOMRG", __func__));
-
-		for (i = 1; i < nbufs; i++) {
-			m = m_getjcl(M_NOWAIT, MT_DATA, 0, clsize);
-			if (m == NULL)
-				goto fail;
-
-			m->m_len = clsize;
-			m_tail->m_next = m;
-			m_tail = m;
-		}
+	/* Allocate the rest of the chain. */
+	for (i = 1; i < nbufs; i++) {
+		m = m_getjcl(M_NOWAIT, MT_DATA, 0, clsize);
+		if (m == NULL)
+			goto fail;
+
+		m->m_len = clsize;
+		m_tail->m_next = m;
+		m_tail = m;
 	}
 
 	if (m_tailp != NULL)
@@ -1293,27 +1306,25 @@ vtnet_rxq_replace_buf(struct vtnet_rxq *
 	    ("%s: chained mbuf without LRO_NOMRG", __func__));
 
 	if (m->m_next == NULL) {
-		/*
-		 * Simplified fast-path for the common case of just one mbuf.
-		 *
-		 * BMV: This is a lot like vtnet_rxq_new_buf().
-		 */
-		m->m_len = MIN(m->m_len, len);	/* BMV XXX */
+		/* Fast-path for the common case of just one mbuf. */
+		if (m->m_len < len)
+			return (EINVAL);
 
 		m_new = vtnet_rx_alloc_buf(sc, 1, NULL);
-		if (m_new != NULL) {
-			error = vtnet_rxq_enqueue_buf(rxq, m_new);
-			if (error) {
-				/*
-				 * The new mbuf is suppose to be an identical
-				 * copy of the one just dequeued so this is an
-				 * unexpected error.
-				 */
-				m_freem(m_new);
-				sc->vtnet_stats.rx_enq_replacement_failed++;
-			}
+		if (m_new == NULL)
+			return (ENOBUFS);
+
+		error = vtnet_rxq_enqueue_buf(rxq, m_new);
+		if (error) {
+			/*
+			 * The new mbuf is supposed to be an identical
+			 * copy of the one just dequeued so this is an
+			 * unexpected error.
+			 */
+			m_freem(m_new);
+			sc->vtnet_stats.rx_enq_replacement_failed++;
 		} else
-			error = ENOBUFS;
+			m->m_len = len;
 	} else
 		error = vtnet_rxq_replace_lro_nomgr_buf(rxq, m, len);
@@ -1327,7 +1338,6 @@ vtnet_rxq_enqueue_buf(struct vtnet_rxq *
 	struct sglist_seg segs[VTNET_MAX_RX_SEGS];
 	struct vtnet_softc *sc;
 	struct vtnet_rx_header *rxhdr;
-	struct virtio_net_hdr *hdr;
 	uint8_t *mdata;
 	int offset, error;
@@ -1342,24 +1352,15 @@ vtnet_rxq_enqueue_buf(struct vtnet_rxq *
 	if ((sc->vtnet_flags & VTNET_FLAG_MRG_RXBUFS) == 0) {
 		MPASS(sc->vtnet_hdr_size == sizeof(struct virtio_net_hdr));
 		rxhdr = (struct vtnet_rx_header *) mdata;
-		hdr = &rxhdr->vrh_hdr;
+		sglist_append(&sg, &rxhdr->vrh_hdr, sc->vtnet_hdr_size);
 		offset = sizeof(struct vtnet_rx_header);
-		sglist_append(&sg, hdr, sc->vtnet_hdr_size);
 	} else
 		offset = 0;
 
-	/*
-	 * XXX BMV: Either sglist_append() should never fail here ...
-	 */
-
-	error = sglist_append(&sg, mdata + offset, m->m_len - offset);
-	if (error)
-		return (error);
-
+	sglist_append(&sg, mdata + offset, m->m_len - offset);
 	if (m->m_next != NULL) {
 		error = sglist_append_mbuf(&sg, m->m_next);
-		if (error)
-			return (error);
+		MPASS(error == 0);
 	}
 
 	error = virtqueue_enqueue(rxq->vtnrx_vq, m, &sg, 0, sg.sg_nseg);
@@ -1388,39 +1389,35 @@ vtnet_rxq_new_buf(struct vtnet_rxq *rxq)
 }
 
 /*
- * Set the appropriate CSUM_* flags. Unfortunately, the information
- * provided is not directly useful to us. The VirtIO header gives the
- * offset of the checksum, which is all Linux needs, but this is not
- * how FreeBSD does things. We are forced to peek inside the packet
- * a bit.
- *
- * It would be nice if VirtIO gave us the L4 protocol or if FreeBSD
- * could accept the offsets and let the stack figure it out.
+ * Use the checksum offset in the VirtIO header to set the
+ * correct CSUM_* flags.
  */
 static int
-vtnet_rx_csum(struct vtnet_softc *sc, struct mbuf *m,
-    struct virtio_net_hdr *hdr)
+vtnet_rxq_csum_by_offset(struct vtnet_rxq *rxq, struct mbuf *m,
+    uint16_t eth_type, int ip_start, struct virtio_net_hdr *hdr)
 {
-	struct ether_header *eh;
-	struct ether_vlan_header *evh;
-	int offset;
-	uint16_t eth_type;
-
-	offset = hdr->csum_start + hdr->csum_offset;
-
-	if (offset < sizeof(struct ether_header) + sizeof(struct ip))
-		return (1);
-	if (m->m_len < offset)
-		return (1);
+	struct vtnet_softc *sc;
+#if defined(INET) || defined(INET6)
+	int offset = hdr->csum_start + hdr->csum_offset;
+#endif
 
-	eh = mtod(m, struct ether_header *);
-	eth_type = ntohs(eh->ether_type);
-	if (eth_type == ETHERTYPE_VLAN) {
-		evh = mtod(m, struct ether_vlan_header *);
-		eth_type = ntohs(evh->evl_proto);
-	}
+	sc = rxq->vtnrx_sc;
 
-	if (eth_type != ETHERTYPE_IP && eth_type != ETHERTYPE_IPV6) {
+	/* Only do a basic sanity check on the offset. */
+	switch (eth_type) {
+#if defined(INET)
+	case ETHERTYPE_IP:
+		if (__predict_false(offset < ip_start + sizeof(struct ip)))
+			return (1);
+		break;
+#endif
+#if defined(INET6)
+	case ETHERTYPE_IPV6:
+		if (__predict_false(offset < ip_start + sizeof(struct ip6_hdr)))
+			return (1);
+		break;
+#endif
+	default:
 		sc->vtnet_stats.rx_csum_bad_ethtype++;
 		return (1);
 	}
@@ -1428,8 +1425,8 @@ vtnet_rx_csum(struct vtnet_softc *sc, st
 	/*
 	 * Use the offset to determine the appropriate CSUM_* flags. This
 	 * is a bit dirty, but we can get by with it since the checksum
-	 * offsets happen to be different. The implied assumption is that
-	 * the host does not do IPv4 header checksum offloading.
+	 * offsets happen to be different. We assume the host does
+	 * not do IPv4 header checksum offloading.
 	 */
 	switch (hdr->csum_offset) {
 	case offsetof(struct udphdr, uh_sum):
@@ -1437,21 +1434,116 @@ vtnet_rx_csum(struct vtnet_softc *sc, st
 		m->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
 		m->m_pkthdr.csum_data = 0xFFFF;
 		break;
-
 	case offsetof(struct sctphdr, checksum):
 		m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID;
 		break;
-
 	default:
 		sc->vtnet_stats.rx_csum_bad_offset++;
 		return (1);
 	}
 
-	sc->vtnet_stats.rx_csum_offloaded++;
+	return (0);
+}
+
+static int
+vtnet_rxq_csum_by_parse(struct vtnet_rxq *rxq, struct mbuf *m,
+    uint16_t eth_type, int ip_start, struct virtio_net_hdr *hdr)
+{
+	struct vtnet_softc *sc;
+	int offset, proto;
+
+	sc = rxq->vtnrx_sc;
+
+	switch (eth_type) {
+#if defined(INET)
+	case ETHERTYPE_IP: {
+		struct ip *ip;
+
+		if (__predict_false(m->m_len < ip_start + sizeof(struct ip)))
+			return (1);
+		ip = (struct ip *)(m->m_data + ip_start);
+		proto = ip->ip_p;
+		offset = ip_start + (ip->ip_hl << 2);
+		break;
+	}
+#endif
+#if defined(INET6)
+	case ETHERTYPE_IPV6:
+		if (__predict_false(m->m_len < ip_start +
+		    sizeof(struct ip6_hdr)))
+			return (1);
+		offset = ip6_lasthdr(m, ip_start, IPPROTO_IPV6, &proto);
+		if (__predict_false(offset < 0))
+			return (1);
+		break;
+#endif
+	default:
+		sc->vtnet_stats.rx_csum_bad_ethtype++;
+		return (1);
+	}
+
+	switch (proto) {
+	case IPPROTO_TCP:
+		if (__predict_false(m->m_len < offset + sizeof(struct tcphdr)))
+			return (1);
+		m->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
+		m->m_pkthdr.csum_data = 0xFFFF;
+		break;
+	case IPPROTO_UDP:
+		if (__predict_false(m->m_len < offset + sizeof(struct udphdr)))
+			return (1);
+		m->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
+		m->m_pkthdr.csum_data = 0xFFFF;
+		break;
+	case IPPROTO_SCTP:
+		if (__predict_false(m->m_len < offset + sizeof(struct sctphdr)))
+			return (1);
+		m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID;
+		break;
+	default:
+		sc->vtnet_stats.rx_csum_bad_proto++;
+		return (1);
+	}
 
 	return (0);
 }
 
+/*
+ * Set the appropriate CSUM_* flags. Unfortunately, the information
+ * provided is not directly useful to us. The VirtIO header gives the
+ * offset of the checksum, which is all Linux needs, but this is not
+ * how FreeBSD does things. We are forced to peek inside the packet
+ * a bit.
+ *
+ * It would be nice if VirtIO gave us the L4 protocol or if FreeBSD
+ * could accept the offsets and let the stack figure it out.
+ */
+static int
+vtnet_rxq_csum(struct vtnet_rxq *rxq, struct mbuf *m,
+    struct virtio_net_hdr *hdr)
+{
+	struct ether_header *eh;
+	struct ether_vlan_header *evh;
+	uint16_t eth_type;
+	int offset, error;
+
+	eh = mtod(m, struct ether_header *);
+	eth_type = ntohs(eh->ether_type);
+	if (eth_type == ETHERTYPE_VLAN) {
+		/* BMV: We should handle nested VLAN tags too. */
+		evh = mtod(m, struct ether_vlan_header *);
+		eth_type = ntohs(evh->evl_proto);
+		offset = sizeof(struct ether_vlan_header);
+	} else
+		offset = sizeof(struct ether_header);
+
+	if (hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM)
+		error = vtnet_rxq_csum_by_offset(rxq, m, eth_type, offset, hdr);
+	else
+		error = vtnet_rxq_csum_by_parse(rxq, m, eth_type, offset, hdr);
+
+	return (error);
+}
+
 static void
 vtnet_rxq_discard_merged_bufs(struct vtnet_rxq *rxq, int nbufs)
 {
@@ -1496,12 +1588,12 @@ vtnet_rxq_merged_eof(struct vtnet_rxq *r
 	while (--nbufs > 0) {
 		m = virtqueue_dequeue(vq, &len);
 		if (m == NULL) {
-			ifp->if_ierrors++;
+			rxq->vtnrx_stats.vrxs_ierrors++;
 			goto fail;
 		}
 
 		if (vtnet_rxq_new_buf(rxq) != 0) {
-			ifp->if_iqdrops++;
+			rxq->vtnrx_stats.vrxs_discarded++;
 			vtnet_rxq_discard_buf(rxq, m);
 			if (nbufs > 1)
 				vtnet_rxq_discard_merged_bufs(rxq, nbufs);
@@ -1555,14 +1647,25 @@ vtnet_rxq_input(struct vtnet_rxq *rxq, s
 	m->m_pkthdr.flowid = rxq->vtnrx_id;
 	m->m_flags |= M_FLOWID;
 
-	if (ifp->if_capenable & IFCAP_RXCSUM &&
-	    hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
-		if (vtnet_rx_csum(sc, m, hdr) != 0)
-			sc->vtnet_stats.rx_csum_failed++;
+	/*
+	 * BMV: FreeBSD does not have the UNNECESSARY and PARTIAL checksum
+	 * distinction that Linux does. Need to reevaluate if performing
+	 * offloading for the NEEDS_CSUM case is really appropriate.
+	 */
+	if (hdr->flags & (VIRTIO_NET_HDR_F_NEEDS_CSUM |
+	    VIRTIO_NET_HDR_F_DATA_VALID)) {
+		if (vtnet_rxq_csum(rxq, m, hdr) == 0)
+			rxq->vtnrx_stats.vrxs_csum++;
+		else
+			rxq->vtnrx_stats.vrxs_csum_failed++;
 	}
 
-	ifp->if_ipackets++;
+	rxq->vtnrx_stats.vrxs_ipackets++;
+	rxq->vtnrx_stats.vrxs_ibytes += m->m_pkthdr.len;
+
+	/* VTNET_RXQ_UNLOCK(rxq); */
 	(*ifp->if_input)(ifp, m);
+	/* VTNET_RXQ_LOCK(rxq); */
 }
 
 static int
@@ -1587,10 +1690,6 @@ vtnet_rxq_eof(struct vtnet_rxq *rxq)
 
 	VTNET_RXQ_LOCK_ASSERT(rxq);
 
-	/*
-	 * `count` limits how many leading descriptors we dequeue. The
-	 * actual number could be higher if there are merged buffers.
-	 */
 	while (count-- > 0) {
 		m = virtqueue_dequeue(vq, &len);
 		if (m == NULL)
@@ -1598,7 +1697,7 @@ vtnet_rxq_eof(struct vtnet_rxq *rxq)
 		deq++;
 
 		if (len < sc->vtnet_hdr_size + ETHER_HDR_LEN) {
-			ifp->if_ierrors++;
+			rxq->vtnrx_stats.vrxs_discarded++;
 			vtnet_rxq_discard_buf(rxq, m);
 			continue;
 		}
@@ -1618,7 +1717,7 @@ vtnet_rxq_eof(struct vtnet_rxq *rxq)
 		}
 
 		if (vtnet_rxq_replace_buf(rxq, m, len) != 0) {
-			ifp->if_iqdrops++;
+			rxq->vtnrx_stats.vrxs_discarded++;
 			vtnet_rxq_discard_buf(rxq, m);
 			if (nbufs > 1)
 				vtnet_rxq_discard_merged_bufs(rxq, nbufs);
@@ -1642,7 +1741,7 @@ vtnet_rxq_eof(struct vtnet_rxq *rxq)
 			 * regular header.
 			 *
 			 * BMV: Is this memcpy() expensive? We know the mbuf data is
-			 * still valid after we adjust it.
+			 * still valid even after the m_adj().
 			 */
 			memcpy(hdr, mtod(m, void *), sizeof(struct virtio_net_hdr));
 			m_adj(m, adjsz);
@@ -1662,18 +1761,19 @@ vtnet_rx_vq_intr(void *xrxq)
 	struct vtnet_softc *sc;
 	struct vtnet_rxq *rxq;
 	struct ifnet *ifp;
-	int more;
+	int tries, more;
 
 	rxq = xrxq;
 	sc = rxq->vtnrx_sc;
 	ifp = sc->vtnet_ifp;
+	tries = 0;
 
 	if (__predict_false(rxq->vtnrx_id >= sc->vtnet_act_vq_pairs)) {
 		/*
-		 * Ignore this interrupt. Either the host generated a spurious
-		 * interrupt (probably unlikely) or we have multiqueue without
-		 * per-VQ MSIX so every queue needs to be polled (brain dead
-		 * configuration we could try harder to avoid).
+		 * Ignore this interrupt. Either this is a spurious interrupt
+		 * or multiqueue without per-VQ MSIX so every queue needs to
+		 * be polled (a brain dead configuration we could try harder
+		 * to avoid).
		 */
 		vtnet_rxq_disable_intr(rxq);
 		return;
@@ -1683,7 +1783,6 @@ again:
 	VTNET_RXQ_LOCK(rxq);
 
 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
-		vtnet_rxq_enable_intr(rxq);
 		VTNET_RXQ_UNLOCK(rxq);
 		return;
 	}
@@ -1692,21 +1791,47 @@ again:
 	if (more || vtnet_rxq_enable_intr(rxq) != 0) {
 		if (!more)
 			vtnet_rxq_disable_intr(rxq);
-		sc->vtnet_stats.rx_task_rescheduled++;
+		/*
+		 * This is an occasional condition or race (when !more),
+		 * so retry a few times before scheduling the taskqueue.
+		 */
+		rxq->vtnrx_stats.vrxs_rescheduled++;
+		VTNET_RXQ_UNLOCK(rxq);
+		if (tries++ < VTNET_INTR_DISABLE_RETRIES)
+			goto again;
+		taskqueue_enqueue(rxq->vtnrx_tq, &rxq->vtnrx_intrtask);
+	} else
 		VTNET_RXQ_UNLOCK(rxq);
-		goto again;
-	}
-
-	VTNET_RXQ_UNLOCK(rxq);
 }
 
 static void
-vtnet_rxq_taskqueue(void *xrxq, int pending)
+vtnet_rxq_tq_intr(void *xrxq, int pending)
 {
+	struct vtnet_softc *sc;
+	struct vtnet_rxq *rxq;
+	struct ifnet *ifp;
+	int more;
 
-	/*
-	 * BMV: Do stuff here when we defer in vtnet_rx_vq_intr().
-	 */
+	rxq = xrxq;
+	sc = rxq->vtnrx_sc;
+	ifp = sc->vtnet_ifp;
+
+	VTNET_RXQ_LOCK(rxq);
+
+	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
+		VTNET_RXQ_UNLOCK(rxq);
+		return;
+	}
+
+	more = vtnet_rxq_eof(rxq);
+	if (more || vtnet_rxq_enable_intr(rxq) != 0) {
+		if (!more)
+			vtnet_rxq_disable_intr(rxq);
+		rxq->vtnrx_stats.vrxs_rescheduled++;
+		taskqueue_enqueue(rxq->vtnrx_tq, &rxq->vtnrx_intrtask);
+	}
+
+	VTNET_RXQ_UNLOCK(rxq);
 }
 
 static void
@@ -1729,121 +1854,150 @@ vtnet_txq_free_mbufs(struct vtnet_txq *t
 }
 
 /*
- * BMV: Uggg ... rewrite this function.
+ * BMV: Much of this can go away once we finally have offsets in
+ * the mbuf packet header. Bug andre@.
  */
-static struct mbuf *
-vtnet_tx_offload(struct vtnet_softc *sc, struct mbuf *m,
-    struct virtio_net_hdr *hdr)
+static int
+vtnet_txq_offload_ctx(struct vtnet_txq *txq, struct mbuf *m,
+    int *etype, int *proto, int *start)
 {
-	struct ifnet *ifp;
-	struct ether_header *eh;
+	struct vtnet_softc *sc;
 	struct ether_vlan_header *evh;
-	struct ip *ip;
-	struct ip6_hdr *ip6;
-	struct tcphdr *tcp;
-	int ip_offset;
-	uint16_t eth_type, csum_start;
-	uint8_t ip_proto, gso_type;
+	int offset;
 
-	ifp = sc->vtnet_ifp;
+	sc = txq->vtntx_sc;
 
-	ip_offset = sizeof(struct ether_header);
-	if (m->m_len < ip_offset) {
-		if ((m = m_pullup(m, ip_offset)) == NULL)
-			return (NULL);
+	evh = mtod(m, struct ether_vlan_header *);
+	if (evh->evl_encap_proto == htons(ETHERTYPE_VLAN)) {
+		/* BMV: We should handle nested VLAN tags too. */
+		*etype = ntohs(evh->evl_proto);
+		offset = sizeof(struct ether_vlan_header);
+	} else {
+		*etype = ntohs(evh->evl_encap_proto);
+		offset = sizeof(struct ether_header);
 	}
 
-	eh = mtod(m, struct ether_header *);
-	eth_type = ntohs(eh->ether_type);
-	if (eth_type == ETHERTYPE_VLAN) {
-		ip_offset = sizeof(struct ether_vlan_header);
-		if (m->m_len < ip_offset) {
-			if ((m = m_pullup(m, ip_offset)) == NULL)
-				return (NULL);
-		}
-		evh = mtod(m, struct ether_vlan_header *);
-		eth_type = ntohs(evh->evl_proto);
+	switch (*etype) {
+#if defined(INET)
+	case ETHERTYPE_IP: {
+		struct ip *ip, iphdr;
+
+		if (__predict_false(m->m_len < offset + sizeof(struct ip))) {
+			m_copydata(m, offset, sizeof(struct ip),
+			    (caddr_t) &iphdr);
+			ip = &iphdr;
+		} else
+			ip = (struct ip *)(m->m_data + offset);
+		*proto = ip->ip_p;
+		*start = offset + (ip->ip_hl << 2);
+		break;
+	}
+#endif
+#if defined(INET6)
+	case ETHERTYPE_IPV6:
+		*proto = -1;
+		*start = ip6_lasthdr(m, offset, IPPROTO_IPV6, proto);
+		/* Assert the network stack sends us a valid packet. */
+		KASSERT(*start > offset,
+		    ("%s: mbuf %p start %d offset %d proto %d", __func__, m,
+		    *start, offset, *proto));
+		break;
+#endif
+	default:
+		sc->vtnet_stats.tx_csum_bad_ethtype++;
+		return (EINVAL);
 	}
 
-	switch (eth_type) {
-	case ETHERTYPE_IP:
-		if (m->m_len < ip_offset + sizeof(struct ip)) {
-			m = m_pullup(m, ip_offset + sizeof(struct ip));
-			if (m == NULL)
-				return (NULL);
-		}
+	return (0);
+}
 
-		ip = (struct ip *)(mtod(m, uint8_t *) + ip_offset);
-		ip_proto = ip->ip_p;
-		csum_start = ip_offset + (ip->ip_hl << 2);
-		gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
-		break;
+static int
+vtnet_txq_offload_tso(struct vtnet_txq *txq, struct mbuf *m, int eth_type,
+    int offset, struct virtio_net_hdr *hdr)
+{
+	static struct timeval lastecn;
+	static int curecn;
+	struct vtnet_softc *sc;
+	struct tcphdr *tcp, tcphdr;
 
-	case ETHERTYPE_IPV6:
-		if (m->m_len < ip_offset + sizeof(struct ip6_hdr)) {
-			m = m_pullup(m, ip_offset + sizeof(struct ip6_hdr));
-			if (m == NULL)
-				return (NULL);
-		}
+	sc = txq->vtntx_sc;
+
+	if (__predict_false(m->m_len < offset + sizeof(struct tcphdr))) {
+		m_copydata(m, offset, sizeof(struct tcphdr), (caddr_t) &tcphdr);
+		tcp = &tcphdr;
+	} else
+		tcp = (struct tcphdr *)(m->m_data + offset);
+
+	hdr->hdr_len = offset + (tcp->th_off << 2);
+	hdr->gso_size = m->m_pkthdr.tso_segsz;
+	hdr->gso_type = eth_type == ETHERTYPE_IP ? VIRTIO_NET_HDR_GSO_TCPV4 :
+	    VIRTIO_NET_HDR_GSO_TCPV6;
 
-		ip6 = (struct ip6_hdr *)(mtod(m, uint8_t *) + ip_offset);
+	if (tcp->th_flags & TH_CWR) {
 		/*
-		 * XXX Assume no extension headers are present. Presently,
-		 * this will always be true in the case of TSO, and FreeBSD
-		 * does not perform checksum offloading of IPv6 yet.
+		 * Drop if VIRTIO_NET_F_HOST_ECN was not negotiated. In FreeBSD
+		 * ECN support is not on a per-interface basis, but globally via
+		 * the net.inet.tcp.ecn.enable sysctl knob. The default is off.
 		 */
-		ip_proto = ip6->ip6_nxt;
-		csum_start = ip_offset + sizeof(struct ip6_hdr);
-		gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
-		break;
-
-	default:
-		return (m);
+		if ((sc->vtnet_flags & VTNET_FLAG_TSO_ECN) == 0) {
+			if (ppsratecheck(&lastecn, &curecn, 1))
+				if_printf(sc->vtnet_ifp,
+				    "TSO with ECN not negotiated with host\n");
+			return (ENOTSUP);
+		}
+		hdr->gso_type |= VIRTIO_NET_HDR_GSO_ECN;
 	}
 
-	if (m->m_pkthdr.csum_flags & VTNET_CSUM_OFFLOAD) {
-		hdr->flags |= VIRTIO_NET_HDR_F_NEEDS_CSUM;
-		hdr->csum_start = csum_start;
-		hdr->csum_offset = m->m_pkthdr.csum_data;
+	txq->vtntx_stats.vtxs_tso++;
 
-		sc->vtnet_stats.tx_csum_offloaded++;
-	}
+	return (0);
+}
 
-	if (m->m_pkthdr.csum_flags & CSUM_TSO) {
-		if (ip_proto != IPPROTO_TCP)
-			return (m);
+static struct mbuf *
+vtnet_txq_offload(struct vtnet_txq *txq, struct mbuf *m,
+    struct virtio_net_hdr *hdr)
+{
+	struct vtnet_softc *sc;
+	int flags, etype, csum_start, proto, error;
 
-		if (m->m_len < csum_start + sizeof(struct tcphdr)) {
-			m = m_pullup(m, csum_start + sizeof(struct tcphdr));
-			if (m == NULL)
-				return (NULL);
-		}
+	sc = txq->vtntx_sc;
+	flags = m->m_pkthdr.csum_flags;
 
-		tcp = (struct tcphdr *)(mtod(m, uint8_t *) + csum_start);
-		hdr->gso_type = gso_type;
-		hdr->hdr_len = csum_start + (tcp->th_off << 2);
-		hdr->gso_size = m->m_pkthdr.tso_segsz;
+	error = vtnet_txq_offload_ctx(txq, m, &etype, &proto, &csum_start);
+	if (error)
+		goto drop;
 
-		if (tcp->th_flags & TH_CWR) {
-			/*
-			 * Drop if we did not negotiate VIRTIO_NET_F_HOST_ECN.
-			 * ECN support is only configurable globally with the
-			 * net.inet.tcp.ecn.enable sysctl knob.
-			 */
-			if ((sc->vtnet_flags & VTNET_FLAG_TSO_ECN) == 0) {
-				if_printf(ifp, "TSO with ECN not supported "
-				    "by host\n");
-				m_freem(m);
-				return (NULL);
-			}
+	if ((etype == ETHERTYPE_IP && flags & VTNET_CSUM_OFFLOAD) ||
+	    (etype == ETHERTYPE_IPV6 && flags & VTNET_CSUM_OFFLOAD_IPV6)) {
+		/*
+		 * We could compare the IP protocol vs the CSUM_ flag too,
+		 * but that really should not be necessary.
+		 */
+		hdr->flags |= VIRTIO_NET_HDR_F_NEEDS_CSUM;
+		hdr->csum_start = csum_start;
+		hdr->csum_offset = m->m_pkthdr.csum_data;
+		txq->vtntx_stats.vtxs_csum++;
+	}
 
-			hdr->flags |= VIRTIO_NET_HDR_GSO_ECN;
+	if (flags & CSUM_TSO) {
+		if (__predict_false(proto != IPPROTO_TCP)) {
+			/* Likely failed to correctly parse the mbuf. */
+			sc->vtnet_stats.tx_tso_not_tcp++;
+			goto drop;
 		}
 
-		sc->vtnet_stats.tx_tso_offloaded++;
+		KASSERT(hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM,
+		    ("%s: mbuf %p TSO without checksum offload", __func__, m));
+
+		error = vtnet_txq_offload_tso(txq, m, etype, csum_start, hdr);
+		if (error)
+			goto drop;
 	}
 
 	return (m);
+
+drop:
+	m_freem(m);
+	return (NULL);
 }
 
 static int
@@ -1879,6 +2033,7 @@ again:
 
 		*m_head = m;
 		collapsed = 1;
+		txq->vtntx_stats.vtxs_collapsed++;
 		goto again;
 	}
@@ -1920,29 +2075,26 @@ vtnet_txq_encap(struct vtnet_txq *txq, s
 	 * The vtnet_hdr_size is used to enqueue the correct header size.
 	 */
 	hdr = &txhdr->vth_uhdr.hdr;
-
+	error = ENOBUFS;
 	if (m->m_flags & M_VLANTAG) {
 		m = ether_vlanencap(m, m->m_pkthdr.ether_vtag);
-		if ((*m_head = m) == NULL) {
-			error = ENOBUFS;
+		if ((*m_head = m) == NULL)
 			goto fail;
-		}
 		m->m_flags &= ~M_VLANTAG;
 	}
 
-	if (m->m_pkthdr.csum_flags != 0) {
-		m = vtnet_tx_offload(sc, m, hdr);
-		if ((*m_head = m) == NULL) {
-			error = ENOBUFS;
+	if (m->m_pkthdr.csum_flags & VTNET_CSUM_ALL_OFFLOAD) {
+		m = vtnet_txq_offload(txq, m, hdr);
+		if ((*m_head = m) == NULL)
 			goto fail;
-		}
 	}
 
 	error = vtnet_txq_enqueue_buf(txq, m_head, txhdr);
+	if (error == 0)
+		return (0);
 
 fail:
-	if (error)
-		uma_zfree(vtnet_tx_header_zone, txhdr);
+	uma_zfree(vtnet_tx_header_zone, txhdr);
 
 	return (error);
 }
@@ -2071,7 +2223,6 @@ vtnet_txq_mq_start(struct ifnet *ifp, st
 	sc = ifp->if_softc;
 	npairs = sc->vtnet_act_vq_pairs;
 
-	/* BMV: Is this the best way to determine which queue? */
 	if (m->m_flags & M_FLOWID)
 		i = m->m_pkthdr.flowid % npairs;
 	else
@@ -2084,14 +2235,14 @@ vtnet_txq_mq_start(struct ifnet *ifp, st
 		VTNET_TXQ_UNLOCK(txq);
 	} else {
 		error = drbr_enqueue(ifp, txq->vtntx_br, m);
-		vtnet_txq_tq_start(txq);
+		taskqueue_enqueue(txq->vtntx_tq, &txq->vtntx_defrtask);
 	}
 
 	return (error);
 }
 
 static void
-vtnet_txq_taskqueue(void *xtxq, int pending)
+vtnet_txq_tq_deferred(void *xtxq, int pending)
 {
 	struct vtnet_softc *sc;
 	struct vtnet_txq *txq;
@@ -2105,36 +2256,64 @@ vtnet_txq_taskqueue(void *xtxq, int pend

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
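The diff output is cut off just as the body of vtnet_txq_tq_deferred() begins.
A deferred-start handler of this shape conventionally re-takes the queue lock
and drains the buf_ring that vtnet_txq_mq_start() fed with drbr_enqueue(). The
following is a minimal sketch of that pattern, not the committed body (which
is truncated above); it assumes the VTNET_TXQ_* lock macros, the vtntx_*
fields, and the vtnet_txq_mq_start_locked() helper visible earlier in the
diff, and that passing a NULL mbuf simply drains the ring.

	/*
	 * Minimal sketch of a deferred-start task handler; the committed
	 * body is truncated above.  All names are taken from the diff,
	 * but their exact use here is an assumption.
	 */
	static void
	example_txq_tq_deferred(void *xtxq, int pending)
	{
		struct vtnet_txq *txq;
		struct ifnet *ifp;

		txq = xtxq;
		ifp = txq->vtntx_sc->vtnet_ifp;

		VTNET_TXQ_LOCK(txq);
		/* Drain any packets queued while the Tx lock was held. */
		if ((ifp->if_drv_flags & IFF_DRV_RUNNING) != 0 &&
		    !drbr_empty(ifp, txq->vtntx_br))
			vtnet_txq_mq_start_locked(txq, NULL);
		VTNET_TXQ_UNLOCK(txq);
	}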