From owner-svn-src-user@FreeBSD.ORG Thu Jun 11 05:36:49 2009 Return-Path: Delivered-To: svn-src-user@freebsd.org Received: from mx1.freebsd.org (mx1.freebsd.org [IPv6:2001:4f8:fff6::34]) by hub.freebsd.org (Postfix) with ESMTP id 64AD6106566C; Thu, 11 Jun 2009 05:36:49 +0000 (UTC) (envelope-from kmacy@FreeBSD.org) Received: from svn.freebsd.org (svn.freebsd.org [IPv6:2001:4f8:fff6::2c]) by mx1.freebsd.org (Postfix) with ESMTP id 51F088FC0A; Thu, 11 Jun 2009 05:36:49 +0000 (UTC) (envelope-from kmacy@FreeBSD.org) Received: from svn.freebsd.org (localhost [127.0.0.1]) by svn.freebsd.org (8.14.3/8.14.3) with ESMTP id n5B5anUW036660; Thu, 11 Jun 2009 05:36:49 GMT (envelope-from kmacy@svn.freebsd.org) Received: (from kmacy@localhost) by svn.freebsd.org (8.14.3/8.14.3/Submit) id n5B5anYY036659; Thu, 11 Jun 2009 05:36:49 GMT (envelope-from kmacy@svn.freebsd.org) Message-Id: <200906110536.n5B5anYY036659@svn.freebsd.org> From: Kip Macy Date: Thu, 11 Jun 2009 05:36:49 +0000 (UTC) To: src-committers@freebsd.org, svn-src-user@freebsd.org X-SVN-Group: user MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cc: Subject: svn commit: r193969 - user/kmacy/releng_7_2_xen/sys/dev/xen/netfront X-BeenThere: svn-src-user@freebsd.org X-Mailman-Version: 2.1.5 Precedence: list List-Id: "SVN commit messages for the experimental " user" src tree" List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Thu, 11 Jun 2009 05:36:49 -0000 Author: kmacy Date: Thu Jun 11 05:36:49 2009 New Revision: 193969 URL: http://svn.freebsd.org/changeset/base/193969 Log: integrate Adrian's changes from head Modified: user/kmacy/releng_7_2_xen/sys/dev/xen/netfront/netfront.c Modified: user/kmacy/releng_7_2_xen/sys/dev/xen/netfront/netfront.c ============================================================================== --- user/kmacy/releng_7_2_xen/sys/dev/xen/netfront/netfront.c Thu Jun 11 05:20:05 2009 (r193968) +++ user/kmacy/releng_7_2_xen/sys/dev/xen/netfront/netfront.c Thu Jun 11 05:36:49 2009 (r193969) @@ -24,11 +24,11 @@ __FBSDID("$FreeBSD$"); #include #include #include -#include #include #include #include #include +#include #include #include #include @@ -48,6 +48,10 @@ __FBSDID("$FreeBSD$"); #include #include #include +#if __FreeBSD_version >= 700000 +#include +#include +#endif #include #include @@ -64,23 +68,42 @@ __FBSDID("$FreeBSD$"); #include #include +#include #include #include #include #include #include -#include -#include #include #include +#include + #include "xenbus_if.h" +#define XN_CSUM_FEATURES (CSUM_TCP | CSUM_UDP | CSUM_TSO) + #define GRANT_INVALID_REF 0 #define NET_TX_RING_SIZE __RING_SIZE((netif_tx_sring_t *)0, PAGE_SIZE) #define NET_RX_RING_SIZE __RING_SIZE((netif_rx_sring_t *)0, PAGE_SIZE) +#if __FreeBSD_version >= 700000 +/* + * Should the driver do LRO on the RX end + * this can be toggled on the fly, but the + * interface must be reset (down/up) for it + * to take effect. + */ +static int xn_enable_lro = 1; +TUNABLE_INT("hw.xn.enable_lro", &xn_enable_lro); +#else + +#define IFCAP_TSO4 0 +#define CSUM_TSO 0 + +#endif + #ifdef CONFIG_XEN static int MODPARM_rx_copy = 0; module_param_named(rx_copy, MODPARM_rx_copy, bool, 0); @@ -93,6 +116,7 @@ static const int MODPARM_rx_copy = 1; static const int MODPARM_rx_flip = 0; #endif +#define MAX_SKB_FRAGS (65536/PAGE_SIZE + 2) #define RX_COPY_THRESHOLD 256 #define net_ratelimit() 0 @@ -153,6 +177,7 @@ static int xennet_get_responses(struct n */ struct xn_chain_data { struct mbuf *xn_tx_chain[NET_TX_RING_SIZE+1]; + int xn_tx_chain_cnt; struct mbuf *xn_rx_chain[NET_RX_RING_SIZE+1]; }; @@ -193,6 +218,9 @@ struct net_device_stats struct netfront_info { struct ifnet *xn_ifp; +#if __FreeBSD_version >= 700000 + struct lro_ctrl xn_lro; +#endif struct net_device_stats stats; u_int tx_full; @@ -325,38 +353,16 @@ xennet_get_rx_ref(struct netfront_info * return ref; } -#ifdef DEBUG - -#endif #define IPRINTK(fmt, args...) \ printf("[XEN] " fmt, ##args) #define WPRINTK(fmt, args...) \ printf("[XEN] " fmt, ##args) +#if 0 #define DPRINTK(fmt, args...) \ printf("[XEN] %s: " fmt, __func__, ##args) - - -static __inline struct mbuf* -makembuf (struct mbuf *buf) -{ - struct mbuf *m = NULL; - - MGETHDR (m, M_DONTWAIT, MT_DATA); - - if (! m) - return 0; - - M_MOVE_PKTHDR(m, buf); - - m_cljget(m, M_DONTWAIT, MJUMPAGESIZE); - m->m_pkthdr.len = buf->m_pkthdr.len; - m->m_len = buf->m_len; - m_copydata(buf, 0, buf->m_pkthdr.len, mtod(m,caddr_t) ); - - m->m_ext.ext_args = (caddr_t *)(uintptr_t)(vtophys(mtod(m,caddr_t)) >> PAGE_SHIFT); - - return m; -} +#else +#define DPRINTK(fmt, args...) +#endif /** * Read the 'mac' node at the given device's node in the store, and parse that @@ -417,6 +423,13 @@ netfront_attach(device_t dev) return err; } +#if __FreeBSD_version >= 700000 + SYSCTL_ADD_INT(device_get_sysctl_ctx(dev), + SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), + OID_AUTO, "enable_lro", CTLTYPE_INT|CTLFLAG_RW, + &xn_enable_lro, 0, "Large Receive Offload"); +#endif + return 0; } @@ -492,17 +505,12 @@ talk_to_backend(device_t dev, struct net message = "writing feature-rx-notify"; goto abort_transaction; } - err = xenbus_printf(xbt, node, "feature-no-csum-offload", "%d", 1); - if (err) { - message = "writing feature-no-csum-offload"; - goto abort_transaction; - } err = xenbus_printf(xbt, node, "feature-sg", "%d", 1); if (err) { message = "writing feature-sg"; goto abort_transaction; } -#ifdef HAVE_TSO +#if __FreeBSD_version >= 700000 err = xenbus_printf(xbt, node, "feature-gso-tcpv4", "%d", 1); if (err) { message = "writing feature-gso-tcpv4"; @@ -572,7 +580,7 @@ setup_device(device_t dev, struct netfro goto fail; error = bind_listening_port_to_irqhandler(xenbus_get_otherend_id(dev), - "xn", xn_intr, info, INTR_TYPE_NET | INTR_MPSAFE, &info->irq); + "xn", xn_intr, info, INTR_TYPE_NET | INTR_MPSAFE, &info->irq); if (error) { xenbus_dev_fatal(dev, error, @@ -590,6 +598,24 @@ setup_device(device_t dev, struct netfro } /** + * If this interface has an ipv4 address, send an arp for it. This + * helps to get the network going again after migrating hosts. + */ +static void +netfront_send_fake_arp(device_t dev, struct netfront_info *info) +{ + struct ifnet *ifp; + struct ifaddr *ifa; + + ifp = info->xn_ifp; + TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { + if (ifa->ifa_addr->sa_family == AF_INET) { + arp_ifinit(ifp, ifa); + } + } +} + +/** * Callback received when the backend's state changes. */ static void @@ -614,9 +640,7 @@ netfront_backend_changed(device_t dev, X if (network_connect(sc) != 0) break; xenbus_set_state(dev, XenbusStateConnected); -#ifdef notyet - (void)send_fake_arp(netdev); -#endif + netfront_send_fake_arp(dev, sc); break; case XenbusStateClosing: xenbus_set_state(dev, XenbusStateClosed); @@ -702,6 +726,10 @@ netif_release_tx_bufs(struct netfront_in np->grant_tx_ref[i]); np->grant_tx_ref[i] = GRANT_INVALID_REF; add_id_to_freelist(np->tx_mbufs, i); + np->xn_cdata.xn_tx_chain_cnt--; + if (np->xn_cdata.xn_tx_chain_cnt < 0) { + panic("netif_release_tx_bufs: tx_chain_cnt must be >= 0"); + } m_freem(m); } } @@ -871,6 +899,10 @@ static void xn_rxeof(struct netfront_info *np) { struct ifnet *ifp; +#if __FreeBSD_version >= 700000 + struct lro_ctrl *lro = &np->xn_lro; + struct lro_entry *queued; +#endif struct netfront_rx_info rinfo; struct netif_rx_response *rx = &rinfo.rx; struct netif_extra_info *extras = rinfo.extras; @@ -965,13 +997,35 @@ xn_rxeof(struct netfront_info *np) * Do we really need to drop the rx lock? */ XN_RX_UNLOCK(np); - /* Pass it up. */ +#if __FreeBSD_version >= 700000 + /* Use LRO if possible */ + if ((ifp->if_capenable & IFCAP_LRO) == 0 || + lro->lro_cnt == 0 || tcp_lro_rx(lro, m, 0)) { + /* + * If LRO fails, pass up to the stack + * directly. + */ + (*ifp->if_input)(ifp, m); + } +#else (*ifp->if_input)(ifp, m); +#endif XN_RX_LOCK(np); } np->rx.rsp_cons = i; +#if __FreeBSD_version >= 700000 + /* + * Flush any outstanding LRO work + */ + while (!SLIST_EMPTY(&lro->lro_active)) { + queued = SLIST_FIRST(&lro->lro_active); + SLIST_REMOVE_HEAD(&lro->lro_active, next); + tcp_lro_flush(lro, queued); + } +#endif + #if 0 /* If we get a callback with very few responses, reduce fill target. */ /* NB. Note exponential increase, linear decrease. */ @@ -992,6 +1046,7 @@ xn_txeof(struct netfront_info *np) RING_IDX i, prod; unsigned short id; struct ifnet *ifp; + netif_tx_response_t *txr; struct mbuf *m; XN_TX_LOCK_ASSERT(np); @@ -1007,12 +1062,21 @@ xn_txeof(struct netfront_info *np) rmb(); /* Ensure we see responses up to 'rp'. */ for (i = np->tx.rsp_cons; i != prod; i++) { - id = RING_GET_RESPONSE(&np->tx, i)->id; + txr = RING_GET_RESPONSE(&np->tx, i); + if (txr->status == NETIF_RSP_NULL) + continue; + + id = txr->id; m = np->xn_cdata.xn_tx_chain[id]; KASSERT(m != NULL, ("mbuf not found in xn_tx_chain")); M_ASSERTVALID(m); - ifp->if_opackets++; + /* + * Increment packet count if this is the last + * mbuf of the chain. + */ + if (!m->m_next) + ifp->if_opackets++; if (unlikely(gnttab_query_foreign_access( np->grant_tx_ref[id]) != 0)) { printf("network_tx_buf_gc: warning " @@ -1028,7 +1092,13 @@ xn_txeof(struct netfront_info *np) np->xn_cdata.xn_tx_chain[id] = NULL; add_id_to_freelist(np->xn_cdata.xn_tx_chain, id); - m_freem(m); + np->xn_cdata.xn_tx_chain_cnt--; + if (np->xn_cdata.xn_tx_chain_cnt < 0) { + panic("netif_release_tx_bufs: tx_chain_cnt must be >= 0"); + } + m_free(m); + /* Only mark the queue active if we've freed up at least one slot to try */ + ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; } np->tx.rsp_cons = prod; @@ -1045,7 +1115,6 @@ xn_txeof(struct netfront_info *np) prod + ((np->tx.sring->req_prod - prod) >> 1) + 1; mb(); - } while (prod != np->tx.sring->rsp_prod); out: @@ -1257,7 +1326,7 @@ xennet_get_responses(struct netfront_inf next: if (m == NULL) break; - + m->m_len = rx->status; m->m_data += rx->offset; m0->m_pkthdr.len += rx->status; @@ -1324,13 +1393,14 @@ xn_start_locked(struct ifnet *ifp) { int otherend_id; unsigned short id; - struct mbuf *m_head, *new_m; + struct mbuf *m_head, *m; struct netfront_info *sc; netif_tx_request_t *tx; + netif_extra_info_t *extra; RING_IDX i; grant_ref_t ref; u_long mfn, tx_bytes; - int notify; + int notify, nfrags; sc = ifp->if_softc; otherend_id = xenbus_get_otherend_id(sc->xbdev); @@ -1355,37 +1425,153 @@ xn_start_locked(struct ifnet *ifp) ifp->if_drv_flags |= IFF_DRV_OACTIVE; break; } - - id = get_id_from_freelist(sc->xn_cdata.xn_tx_chain); + + /* + * Defragment the mbuf if necessary. + */ + for (m = m_head, nfrags = 0; m; m = m->m_next) + nfrags++; + if (nfrags > MAX_SKB_FRAGS) { + m = m_defrag(m_head, M_DONTWAIT); + if (!m) { + m_freem(m_head); + break; + } + m_head = m; + } + + /* Determine how many fragments now exist */ + for (m = m_head, nfrags = 0; m; m = m->m_next) + nfrags++; + + /* + * Don't attempt to queue this packet if there aren't + * enough free entries in the chain. + * + * There isn't a 1:1 correspondance between the mbuf TX ring + * and the xenbus TX ring. + * xn_txeof() may need to be called to free up some slots. + * + * It is quite possible that this can be later eliminated if + * it turns out that partial * packets can be pushed into + * the ringbuffer, with fragments pushed in when further slots + * free up. + * + * It is also quite possible that the driver will lock up + * if the TX queue fills up with no RX traffic, and + * the mbuf ring is exhausted. The queue may need + * a swift kick to continue. + */ + + /* + * It is not +1 like the allocation because we need to keep + * slot [0] free for the freelist head + */ + if (sc->xn_cdata.xn_tx_chain_cnt + nfrags >= NET_TX_RING_SIZE) { + printf("xn_start_locked: xn_tx_chain_cnt (%d) + nfrags %d >= NET_TX_RING_SIZE (%d); must be full!\n", + (int) sc->xn_cdata.xn_tx_chain_cnt, + (int) nfrags, (int) NET_TX_RING_SIZE); + IF_PREPEND(&ifp->if_snd, m_head); + ifp->if_drv_flags |= IFF_DRV_OACTIVE; + break; + } + + /* + * Make sure there's actually space available in the + * Xen TX ring for this. Overcompensate for the possibility + * of having a TCP offload fragment just in case for now + * (the +1) rather than adding logic to accurately calculate + * the required size. + */ + if (RING_FREE_REQUESTS(&sc->tx) < (nfrags + 1)) { + printf("xn_start_locked: free ring slots (%d) < (nfrags + 1) (%d); must be full!\n", + (int) RING_FREE_REQUESTS(&sc->tx), + (int) (nfrags + 1)); + IF_PREPEND(&ifp->if_snd, m_head); + ifp->if_drv_flags |= IFF_DRV_OACTIVE; + break; + } /* * Start packing the mbufs in this chain into * the fragment pointers. Stop when we run out * of fragments or hit the end of the mbuf chain. */ - new_m = makembuf(m_head); - tx = RING_GET_REQUEST(&sc->tx, i); - tx->id = id; - ref = gnttab_claim_grant_reference(&sc->gref_tx_head); - KASSERT((short)ref >= 0, ("Negative ref")); - mfn = virt_to_mfn(mtod(new_m, vm_offset_t)); - gnttab_grant_foreign_access_ref(ref, otherend_id, - mfn, GNTMAP_readonly); - tx->gref = sc->grant_tx_ref[id] = ref; - tx->size = new_m->m_pkthdr.len; -#if 0 - tx->flags = (skb->ip_summed == CHECKSUM_HW) ? NETTXF_csum_blank : 0; -#endif - tx->flags = 0; - new_m->m_next = NULL; - new_m->m_nextpkt = NULL; + m = m_head; + extra = NULL; + for (m = m_head; m; m = m->m_next) { + tx = RING_GET_REQUEST(&sc->tx, i); + id = get_id_from_freelist(sc->xn_cdata.xn_tx_chain); + if (id == 0) + panic("xn_start_locked: was allocated the freelist head!\n"); + sc->xn_cdata.xn_tx_chain_cnt++; + if (sc->xn_cdata.xn_tx_chain_cnt >= NET_TX_RING_SIZE+1) + panic("xn_start_locked: tx_chain_cnt must be < NET_TX_RING_SIZE+1\n"); + sc->xn_cdata.xn_tx_chain[id] = m; + tx->id = id; + ref = gnttab_claim_grant_reference(&sc->gref_tx_head); + KASSERT((short)ref >= 0, ("Negative ref")); + mfn = virt_to_mfn(mtod(m, vm_offset_t)); + gnttab_grant_foreign_access_ref(ref, otherend_id, + mfn, GNTMAP_readonly); + tx->gref = sc->grant_tx_ref[id] = ref; + tx->offset = mtod(m, vm_offset_t) & (PAGE_SIZE - 1); + tx->flags = 0; + if (m == m_head) { + /* + * The first fragment has the entire packet + * size, subsequent fragments have just the + * fragment size. The backend works out the + * true size of the first fragment by + * subtracting the sizes of the other + * fragments. + */ + tx->size = m->m_pkthdr.len; - m_freem(m_head); + /* + * The first fragment contains the + * checksum flags and is optionally + * followed by extra data for TSO etc. + */ + if (m->m_pkthdr.csum_flags + & CSUM_DELAY_DATA) { + tx->flags |= (NETTXF_csum_blank + | NETTXF_data_validated); + } +#if __FreeBSD_version >= 700000 + if (m->m_pkthdr.csum_flags & CSUM_TSO) { + struct netif_extra_info *gso = + (struct netif_extra_info *) + RING_GET_REQUEST(&sc->tx, ++i); + + if (extra) + extra->flags |= XEN_NETIF_EXTRA_FLAG_MORE; + else + tx->flags |= NETTXF_extra_info; + + gso->u.gso.size = m->m_pkthdr.tso_segsz; + gso->u.gso.type = + XEN_NETIF_GSO_TYPE_TCPV4; + gso->u.gso.pad = 0; + gso->u.gso.features = 0; + + gso->type = XEN_NETIF_EXTRA_TYPE_GSO; + gso->flags = 0; + extra = gso; + } +#endif + } else { + tx->size = m->m_len; + } + if (m->m_next) { + tx->flags |= NETTXF_more_data; + i++; + } + } - sc->xn_cdata.xn_tx_chain[id] = new_m; - BPF_MTAP(ifp, new_m); + BPF_MTAP(ifp, m_head); - sc->stats.tx_bytes += new_m->m_pkthdr.len; + sc->stats.tx_bytes += m_head->m_pkthdr.len; sc->stats.tx_packets++; } @@ -1471,9 +1657,9 @@ xn_ioctl(struct ifnet *ifp, u_long cmd, if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) xn_ifinit_locked(sc); arp_ifinit(ifp, ifa); - XN_UNLOCK(sc); + XN_UNLOCK(sc); } else { - XN_UNLOCK(sc); + XN_UNLOCK(sc); error = ether_ioctl(ifp, cmd, data); } break; @@ -1527,12 +1713,39 @@ xn_ioctl(struct ifnet *ifp, u_long cmd, break; case SIOCSIFCAP: mask = ifr->ifr_reqcap ^ ifp->if_capenable; - if (mask & IFCAP_HWCSUM) { - if (IFCAP_HWCSUM & ifp->if_capenable) - ifp->if_capenable &= ~IFCAP_HWCSUM; - else - ifp->if_capenable |= IFCAP_HWCSUM; + if (mask & IFCAP_TXCSUM) { + if (IFCAP_TXCSUM & ifp->if_capenable) { + ifp->if_capenable &= ~(IFCAP_TXCSUM|IFCAP_TSO4); + ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP + | CSUM_IP | CSUM_TSO); + } else { + ifp->if_capenable |= IFCAP_TXCSUM; + ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP + | CSUM_IP); + } + } + if (mask & IFCAP_RXCSUM) { + ifp->if_capenable ^= IFCAP_RXCSUM; + } +#if __FreeBSD_version >= 700000 + if (mask & IFCAP_TSO4) { + if (IFCAP_TSO4 & ifp->if_capenable) { + ifp->if_capenable &= ~IFCAP_TSO4; + ifp->if_hwassist &= ~CSUM_TSO; + } else if (IFCAP_TXCSUM & ifp->if_capenable) { + ifp->if_capenable |= IFCAP_TSO4; + ifp->if_hwassist |= CSUM_TSO; + } else { + IPRINTK("Xen requires tx checksum offload" + " be enabled to use TSO\n"); + error = EINVAL; + } } + if (mask & IFCAP_LRO) { + ifp->if_capenable ^= IFCAP_LRO; + + } +#endif error = 0; break; case SIOCADDMULTI: @@ -1741,11 +1954,21 @@ create_netdev(device_t dev) ifp->if_mtu = ETHERMTU; ifp->if_snd.ifq_maxlen = NET_TX_RING_SIZE - 1; -#ifdef notyet ifp->if_hwassist = XN_CSUM_FEATURES; ifp->if_capabilities = IFCAP_HWCSUM; +#if __FreeBSD_version >= 700000 + ifp->if_capabilities |= IFCAP_TSO4; + if (xn_enable_lro) { + int err = tcp_lro_init(&np->xn_lro); + if (err) { + device_printf(dev, "LRO initialization failed\n"); + goto exit; + } + np->xn_lro.ifp = ifp; + ifp->if_capabilities |= IFCAP_LRO; + } +#endif ifp->if_capenable = ifp->if_capabilities; -#endif ether_ifattach(ifp, np->mac); callout_init(&np->xn_stat_ch, CALLOUT_MPSAFE);