From: Doug Rabson <dfr@FreeBSD.org>
Date: Tue, 24 Feb 2009 16:39:58 +0000 (UTC)
To: src-committers@freebsd.org, svn-src-user@freebsd.org
Message-Id: <200902241639.n1OGdwHt070494@svn.freebsd.org>
Subject: svn commit: r188998 - user/dfr/xenhvm/6/sys/dev/xen/netfront

Author: dfr
Date: Tue Feb 24 16:39:58 2009
New Revision: 188998
URL: http://svn.freebsd.org/changeset/base/188998

Log:
  Merge TSO and LRO, mostly for diff reduction purposes (it's not supported
  in FreeBSD 6.x).

Modified:
  user/dfr/xenhvm/6/sys/dev/xen/netfront/   (props changed)
  user/dfr/xenhvm/6/sys/dev/xen/netfront/netfront.c

Modified: user/dfr/xenhvm/6/sys/dev/xen/netfront/netfront.c
==============================================================================
--- user/dfr/xenhvm/6/sys/dev/xen/netfront/netfront.c	Tue Feb 24 16:23:34 2009	(r188997)
+++ user/dfr/xenhvm/6/sys/dev/xen/netfront/netfront.c	Tue Feb 24 16:39:58 2009	(r188998)
@@ -28,6 +28,7 @@ __FBSDID("$FreeBSD$");
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -47,6 +48,10 @@ __FBSDID("$FreeBSD$");
 #include
 #include
 #include
+#if __FreeBSD_version >= 700000
+#include
+#include
+#endif
 #include
 #include
 
@@ -76,13 +81,22 @@ __FBSDID("$FreeBSD$");
 #include "xenbus_if.h"
 
-#define XN_CSUM_FEATURES (CSUM_TCP | CSUM_UDP)
+#define XN_CSUM_FEATURES (CSUM_TCP | CSUM_UDP | CSUM_TSO)
 
 #define GRANT_INVALID_REF 0
 
 #define NET_TX_RING_SIZE __RING_SIZE((netif_tx_sring_t *)0, PAGE_SIZE)
 #define NET_RX_RING_SIZE __RING_SIZE((netif_rx_sring_t *)0, PAGE_SIZE)
 
+/*
+ * Should the driver do LRO on the RX end
+ * this can be toggled on the fly, but the
+ * interface must be reset (down/up) for it
+ * to take effect.
+ */
+static int xn_enable_lro = 1;
+TUNABLE_INT("hw.xn.enable_lro", &xn_enable_lro);
+
 #ifdef CONFIG_XEN
 static int MODPARM_rx_copy = 0;
 module_param_named(rx_copy, MODPARM_rx_copy, bool, 0);
@@ -95,6 +109,7 @@ static const int MODPARM_rx_copy = 1;
 static const int MODPARM_rx_flip = 0;
 #endif
 
+#define MAX_SKB_FRAGS (65536/PAGE_SIZE + 2)
 #define RX_COPY_THRESHOLD 256
 
 #define net_ratelimit() 0
@@ -195,6 +210,9 @@ struct net_device_stats
 
 struct netfront_info {
	struct ifnet *xn_ifp;
+#if __FreeBSD_version >= 700000
+	struct lro_ctrl xn_lro;
+#endif
 
	struct net_device_stats stats;
	u_int tx_full;
@@ -339,28 +357,6 @@ xennet_get_rx_ref(struct netfront_info *
 #define DPRINTK(fmt, args...)
 #endif
 
-static __inline struct mbuf*
-makembuf (struct mbuf *buf)
-{
-	struct mbuf *m = NULL;
-
-	MGETHDR (m, M_DONTWAIT, MT_DATA);
-
-	if (! m)
-		return 0;
-
-	M_MOVE_PKTHDR(m, buf);
-
-	m_cljget(m, M_DONTWAIT, MJUMPAGESIZE);
-	m->m_pkthdr.len = buf->m_pkthdr.len;
-	m->m_len = buf->m_len;
-	m_copydata(buf, 0, buf->m_pkthdr.len, mtod(m,caddr_t) );
-
-	m->m_ext.ext_args = (caddr_t *)(uintptr_t)(vtophys(mtod(m,caddr_t)) >> PAGE_SHIFT);
-
-	return m;
-}
-
 /**
  * Read the 'mac' node at the given device's node in the store, and parse that
  * as colon-separated octets, placing result the given mac array. mac must be
@@ -420,6 +416,11 @@ netfront_attach(device_t dev)
		return err;
	}
 
+	SYSCTL_ADD_INT(device_get_sysctl_ctx(dev),
+	    SYSCTL_CHILDREN(device_get_sysctl_tree(dev)),
+	    OID_AUTO, "enable_lro", CTLTYPE_INT|CTLFLAG_RW,
+	    &xn_enable_lro, 0, "Large Receive Offload");
+
	return 0;
 }
 
@@ -500,7 +501,7 @@ talk_to_backend(device_t dev, struct net
		message = "writing feature-sg";
		goto abort_transaction;
	}
-#ifdef HAVE_TSO
+#if __FreeBSD_version >= 700000
	err = xenbus_printf(xbt, node, "feature-gso-tcpv4", "%d", 1);
	if (err) {
		message = "writing feature-gso-tcpv4";
@@ -868,6 +869,10 @@ static void
 xn_rxeof(struct netfront_info *np)
 {
	struct ifnet *ifp;
+#if __FreeBSD_version >= 700000
+	struct lro_ctrl *lro = &np->xn_lro;
+	struct lro_entry *queued;
+#endif
	struct netfront_rx_info rinfo;
	struct netif_rx_response *rx = &rinfo.rx;
	struct netif_extra_info *extras = rinfo.extras;
@@ -962,13 +967,35 @@ xn_rxeof(struct netfront_info *np)
		 * Do we really need to drop the rx lock?
		 */
		XN_RX_UNLOCK(np);
-		/* Pass it up. */
+#if __FreeBSD_version >= 700000
+		/* Use LRO if possible */
+		if ((ifp->if_capenable & IFCAP_LRO) == 0 ||
+		    lro->lro_cnt == 0 || tcp_lro_rx(lro, m, 0)) {
+			/*
+			 * If LRO fails, pass up to the stack
+			 * directly.
+			 */
+			(*ifp->if_input)(ifp, m);
+		}
+#else
		(*ifp->if_input)(ifp, m);
+#endif
		XN_RX_LOCK(np);
	}
 
	np->rx.rsp_cons = i;
 
+#if __FreeBSD_version >= 700000
+	/*
+	 * Flush any outstanding LRO work
+	 */
+	while (!SLIST_EMPTY(&lro->lro_active)) {
+		queued = SLIST_FIRST(&lro->lro_active);
+		SLIST_REMOVE_HEAD(&lro->lro_active, next);
+		tcp_lro_flush(lro, queued);
+	}
+#endif
+
 #if 0
	/* If we get a callback with very few responses, reduce fill target. */
	/* NB. Note exponential increase, linear decrease. */
@@ -989,6 +1016,7 @@ xn_txeof(struct netfront_info *np)
	RING_IDX i, prod;
	unsigned short id;
	struct ifnet *ifp;
+	netif_tx_response_t *txr;
	struct mbuf *m;
 
	XN_TX_LOCK_ASSERT(np);
@@ -1004,10 +1032,19 @@ xn_txeof(struct netfront_info *np)
		rmb(); /* Ensure we see responses up to 'rp'. */
 
		for (i = np->tx.rsp_cons; i != prod; i++) {
-			id = RING_GET_RESPONSE(&np->tx, i)->id;
+			txr = RING_GET_RESPONSE(&np->tx, i);
+			if (txr->status == NETIF_RSP_NULL)
+				continue;
+
+			id = txr->id;
			m = np->xn_cdata.xn_tx_chain[id];
 
-			ifp->if_opackets++;
+			/*
+			 * Increment packet count if this is the last
+			 * mbuf of the chain.
+			 */
+			if (!m->m_next)
+				ifp->if_opackets++;
			KASSERT(m != NULL, ("mbuf not found in xn_tx_chain"));
			M_ASSERTVALID(m);
			if (unlikely(gnttab_query_foreign_access(
@@ -1025,7 +1062,7 @@ xn_txeof(struct netfront_info *np)
			np->xn_cdata.xn_tx_chain[id] = NULL;
			add_id_to_freelist(np->xn_cdata.xn_tx_chain, id);
-			m_freem(m);
+			m_free(m);
		}
 
		np->tx.rsp_cons = prod;
 
@@ -1320,13 +1357,14 @@ xn_start_locked(struct ifnet *ifp)
 {
	int otherend_id;
	unsigned short id;
-	struct mbuf *m_head, *new_m;
+	struct mbuf *m_head, *m;
	struct netfront_info *sc;
	netif_tx_request_t *tx;
+	netif_extra_info_t *extra;
	RING_IDX i;
	grant_ref_t ref;
	u_long mfn, tx_bytes;
-	int notify;
+	int notify, nfrags;
 
	sc = ifp->if_softc;
	otherend_id = xenbus_get_otherend_id(sc->xbdev);
@@ -1346,36 +1384,96 @@ xn_start_locked(struct ifnet *ifp)
			break;
		}
 
-		id = get_id_from_freelist(sc->xn_cdata.xn_tx_chain);
+
+		/*
+		 * Defragment the mbuf if necessary.
+		 */
+		for (m = m_head, nfrags = 0; m; m = m->m_next)
+			nfrags++;
+		if (nfrags > MAX_SKB_FRAGS) {
+			m = m_defrag(m_head, M_DONTWAIT);
+			if (!m) {
+				m_freem(m_head);
+				break;
+			}
+			m_head = m;
+		}
 
		/*
		 * Start packing the mbufs in this chain into
		 * the fragment pointers. Stop when we run out
		 * of fragments or hit the end of the mbuf chain.
		 */
-		new_m = makembuf(m_head);
-		tx = RING_GET_REQUEST(&sc->tx, i);
-		tx->id = id;
-		ref = gnttab_claim_grant_reference(&sc->gref_tx_head);
-		KASSERT((short)ref >= 0, ("Negative ref"));
-		mfn = virt_to_mfn(mtod(new_m, vm_offset_t));
-		gnttab_grant_foreign_access_ref(ref, otherend_id,
-		    mfn, GNTMAP_readonly);
-		tx->gref = sc->grant_tx_ref[id] = ref;
-		tx->size = new_m->m_pkthdr.len;
-		if (new_m->m_pkthdr.csum_flags & CSUM_DELAY_DATA)
-			tx->flags = NETTXF_csum_blank | NETTXF_data_validated;
-		else
+		m = m_head;
+		extra = NULL;
+		for (m = m_head; m; m = m->m_next) {
+			tx = RING_GET_REQUEST(&sc->tx, i);
+			id = get_id_from_freelist(sc->xn_cdata.xn_tx_chain);
+			sc->xn_cdata.xn_tx_chain[id] = m;
+			tx->id = id;
+			ref = gnttab_claim_grant_reference(&sc->gref_tx_head);
+			KASSERT((short)ref >= 0, ("Negative ref"));
+			mfn = virt_to_mfn(mtod(m, vm_offset_t));
+			gnttab_grant_foreign_access_ref(ref, otherend_id,
+			    mfn, GNTMAP_readonly);
+			tx->gref = sc->grant_tx_ref[id] = ref;
+			tx->offset = mtod(m, vm_offset_t) & (PAGE_SIZE - 1);
			tx->flags = 0;
-		new_m->m_next = NULL;
-		new_m->m_nextpkt = NULL;
+			if (m == m_head) {
+				/*
+				 * The first fragment has the entire packet
+				 * size, subsequent fragments have just the
+				 * fragment size. The backend works out the
+				 * true size of the first fragment by
+				 * subtracting the sizes of the other
+				 * fragments.
+				 */
+				tx->size = m->m_pkthdr.len;
 
-		m_freem(m_head);
+				/*
+				 * The first fragment contains the
+				 * checksum flags and is optionally
+				 * followed by extra data for TSO etc.
+				 */
+				if (m->m_pkthdr.csum_flags
+				    & CSUM_DELAY_DATA) {
+					tx->flags |= (NETTXF_csum_blank
+					    | NETTXF_data_validated);
+				}
+#if __FreeBSD_version >= 700000
+				if (m->m_pkthdr.csum_flags & CSUM_TSO) {
+					struct netif_extra_info *gso =
+					    (struct netif_extra_info *)
+					    RING_GET_REQUEST(&sc->tx, ++i);
+
+					if (extra)
+						extra->flags |= XEN_NETIF_EXTRA_FLAG_MORE;
+					else
+						tx->flags |= NETTXF_extra_info;
+
+					gso->u.gso.size = m->m_pkthdr.tso_segsz;
+					gso->u.gso.type =
+					    XEN_NETIF_GSO_TYPE_TCPV4;
+					gso->u.gso.pad = 0;
+					gso->u.gso.features = 0;
+
+					gso->type = XEN_NETIF_EXTRA_TYPE_GSO;
+					gso->flags = 0;
+					extra = gso;
+				}
+#endif
+			} else {
+				tx->size = m->m_len;
+			}
+			if (m->m_next) {
+				tx->flags |= NETTXF_more_data;
+				i++;
+			}
+		}
 
-		sc->xn_cdata.xn_tx_chain[id] = new_m;
-		BPF_MTAP(ifp, new_m);
+		BPF_MTAP(ifp, m_head);
 
-		sc->stats.tx_bytes += new_m->m_pkthdr.len;
+		sc->stats.tx_bytes += m_head->m_pkthdr.len;
		sc->stats.tx_packets++;
	}
 
@@ -1517,12 +1615,39 @@ xn_ioctl(struct ifnet *ifp, u_long cmd,
		break;
	case SIOCSIFCAP:
		mask = ifr->ifr_reqcap ^ ifp->if_capenable;
-		if (mask & IFCAP_HWCSUM) {
-			if (IFCAP_HWCSUM & ifp->if_capenable)
-				ifp->if_capenable &= ~IFCAP_HWCSUM;
-			else
-				ifp->if_capenable |= IFCAP_HWCSUM;
+		if (mask & IFCAP_TXCSUM) {
+			if (IFCAP_TXCSUM & ifp->if_capenable) {
+				ifp->if_capenable &= ~(IFCAP_TXCSUM|IFCAP_TSO4);
+				ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP
+				    | CSUM_IP | CSUM_TSO);
+			} else {
+				ifp->if_capenable |= IFCAP_TXCSUM;
+				ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP
+				    | CSUM_IP);
+			}
		}
+		if (mask & IFCAP_RXCSUM) {
+			ifp->if_capenable ^= IFCAP_RXCSUM;
+		}
+#if __FreeBSD_version >= 700000
+		if (mask & IFCAP_TSO4) {
+			if (IFCAP_TSO4 & ifp->if_capenable) {
+				ifp->if_capenable &= ~IFCAP_TSO4;
+				ifp->if_hwassist &= ~CSUM_TSO;
+			} else if (IFCAP_TXCSUM & ifp->if_capenable) {
+				ifp->if_capenable |= IFCAP_TSO4;
+				ifp->if_hwassist |= CSUM_TSO;
+			} else {
+				DPRINTK("Xen requires tx checksum offload"
+				    " be enabled to use TSO\n");
+				error = EINVAL;
+			}
+		}
+		if (mask & IFCAP_LRO) {
+			ifp->if_capenable ^= IFCAP_LRO;
+		}
+#endif
		error = 0;
		break;
	case SIOCADDMULTI:
@@ -1733,6 +1858,18 @@ create_netdev(device_t dev)
	ifp->if_hwassist = XN_CSUM_FEATURES;
	ifp->if_capabilities = IFCAP_HWCSUM;
+#if __FreeBSD_version >= 700000
+	ifp->if_capabilities |= IFCAP_TSO4;
+	if (xn_enable_lro) {
+		int err = tcp_lro_init(&np->xn_lro);
+		if (err) {
+			device_printf(dev, "LRO initialization failed\n");
+			goto exit;
+		}
+		np->xn_lro.ifp = ifp;
+		ifp->if_capabilities |= IFCAP_LRO;
+	}
+#endif
	ifp->if_capenable = ifp->if_capabilities;
 
	ether_ifattach(ifp, np->mac);
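For readers who want the RX-side change in isolation: the pattern applied in xn_rxeof() above is the stock 7.x tcp_lro usage, hand each frame to tcp_lro_rx() when IFCAP_LRO is enabled, fall back to if_input() when LRO declines the mbuf, and flush the active list once per receive pass. The sketch below restates that pattern as a pair of standalone helpers; the softc layout (sc->ifp, sc->lro) and the function names are illustrative only and are not part of this commit.

/*
 * Illustrative sketch only (not from r188998): the LRO receive pattern
 * used in xn_rxeof(), restated as standalone helpers.  The struct layout
 * and helper names are hypothetical.
 */
#include <sys/param.h>
#include <sys/mbuf.h>
#include <sys/queue.h>
#include <sys/socket.h>
#include <net/if.h>
#include <net/if_var.h>
#include <netinet/in.h>
#include <netinet/tcp_lro.h>

struct xn_lro_sketch {
	struct ifnet	*ifp;
	struct lro_ctrl	 lro;	/* set up once with tcp_lro_init() */
};

static void
xn_rx_deliver_sketch(struct xn_lro_sketch *sc, struct mbuf *m)
{
	struct ifnet *ifp = sc->ifp;

	/*
	 * tcp_lro_rx() returns non-zero when it cannot merge the mbuf
	 * into an existing LRO entry; in that case the frame is passed
	 * to the stack unmodified, exactly as in the non-LRO path.
	 */
	if ((ifp->if_capenable & IFCAP_LRO) == 0 ||
	    sc->lro.lro_cnt == 0 || tcp_lro_rx(&sc->lro, m, 0) != 0)
		(*ifp->if_input)(ifp, m);
}

static void
xn_rx_flush_sketch(struct xn_lro_sketch *sc)
{
	struct lro_entry *queued;

	/* Push any merged-but-unflushed segments up to TCP. */
	while (!SLIST_EMPTY(&sc->lro.lro_active)) {
		queued = SLIST_FIRST(&sc->lro.lro_active);
		SLIST_REMOVE_HEAD(&sc->lro.lro_active, next);
		tcp_lro_flush(&sc->lro, queued);
	}
}

On the 6.x branch this code remains compiled out behind the __FreeBSD_version >= 700000 checks, which is the point of the log message: the merge is for diff reduction against 7.x, not to enable LRO/TSO on 6.x.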