From owner-svn-src-all@FreeBSD.ORG  Fri Aug 31 10:07:38 2012
From: Scott Long
Date: Fri, 31 Aug 2012 10:07:38 +0000 (UTC)
To: src-committers@freebsd.org, svn-src-all@freebsd.org,
	svn-src-head@freebsd.org
Subject: svn commit: r239940 - head/sys/dev/ixgbe
Message-Id: <201208311007.q7VA7cwT010999@svn.freebsd.org>

Author: scottl
Date: Fri Aug 31 10:07:38 2012
New Revision: 239940

URL: http://svn.freebsd.org/changeset/base/239940

Log:
  Heavily optimize the case of small RX packets of 160 bytes or less.  For
  this case, allocate a plain mbuf and copy the frame into it, then send
  the copy up the stack, leaving the original mbuf+cluster in place in the
  receive ring for immediate re-use.  This saves a trip through 2 of the 3
  zones of the compound mbuf allocator, a trip through busdma, and a trip
  through 1 of the 3 mbuf destructors.  For our workload at Netflix, this
  can lower CPU consumption by as much as 20%.

  The copy algorithm is based on investigative work from Luigi Rizzo
  earlier in the year.

  Reviewed by:	jfv
  Obtained from:	Netflix
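
As a rough illustration of the fast path the log describes (stand-in names
only; the driver's actual logic is in ixgbe_rxeof() and ixgbe_bcopy() in the
diff below): frames at or below a small threshold are copied, 32 bytes per
stride, into a separate small buffer so the original receive buffer can stay
in the ring for reuse.  The sketch assumes hypothetical names copy32 and
RX_COPY_LEN and plain userspace buffers in place of mbufs and clusters.

#include <stdint.h>
#include <stdio.h>

#define RX_COPY_LEN	160	/* stand-in for IXGBE_RX_COPY_LEN */

/*
 * Copy in 32-byte strides, 8 bytes at a time.  Assumes non-overlapping
 * buffers, both padded out to a multiple of 32 bytes, mirroring the
 * assumptions documented for ixgbe_bcopy() in the diff below.
 */
static inline void
copy32(const void *src, void *dst, int len)
{
	const uint64_t *s = src;
	uint64_t *d = dst;

	for (; len > 0; len -= 32) {
		*d++ = *s++;
		*d++ = *s++;
		*d++ = *s++;
		*d++ = *s++;
	}
}

int
main(void)
{
	uint64_t cluster[256] = { 0x0123456789abcdefULL };	/* simulated 2KB rx cluster */
	uint64_t copy[24];	/* 192 bytes: room for 160 plus stride padding */
	int plen = 60;		/* e.g. a bare TCP ACK */

	if (plen <= RX_COPY_LEN) {
		/* Fast path: copy the small frame, leave the cluster in the ring. */
		copy32(cluster, copy, plen);
		printf("copied %d-byte frame; original buffer reused\n", plen);
	} else {
		/* Large frame: hand the original buffer up and map a replacement. */
		printf("large frame: pass the buffer up the stack\n");
	}
	return (0);
}

In the driver the destination is a header mbuf whose m_data is advanced by
IXGBE_RX_COPY_ALIGN (MHLEN - IXGBE_RX_COPY_LEN, i.e. 168 - 160 = 8 bytes for
the typical MHLEN), and the copied length is capped at IXGBE_RX_COPY_LEN, so
the 32-byte-stride copy stays within the mbuf's built-in storage; the ixgbe.h
comment in the diff covers the cache-alignment trade-offs of that offset.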

Modified:
  head/sys/dev/ixgbe/ixgbe.c
  head/sys/dev/ixgbe/ixgbe.h
  head/sys/dev/ixgbe/ixgbe_osdep.h

Modified: head/sys/dev/ixgbe/ixgbe.c
==============================================================================
--- head/sys/dev/ixgbe/ixgbe.c	Fri Aug 31 09:42:46 2012	(r239939)
+++ head/sys/dev/ixgbe/ixgbe.c	Fri Aug 31 10:07:38 2012	(r239940)
@@ -3734,21 +3734,30 @@ no_split:
 			mp = rxbuf->m_pack;
 
 		mp->m_pkthdr.len = mp->m_len = adapter->rx_mbuf_sz;
-		/* Get the memory mapping */
-		error = bus_dmamap_load_mbuf_sg(rxr->ptag,
-		    rxbuf->pmap, mp, pseg, &nsegs, BUS_DMA_NOWAIT);
-		if (error != 0) {
-			printf("Refresh mbufs: payload dmamap load"
-			    " failure - %d\n", error);
-			m_free(mp);
-			rxbuf->m_pack = NULL;
-			goto update;
+
+		/* If we're dealing with an mbuf that was copied rather
+		 * than replaced, there's no need to go through busdma.
+		 */
+		if ((rxbuf->flags & IXGBE_RX_COPY) == 0) {
+			/* Get the memory mapping */
+			error = bus_dmamap_load_mbuf_sg(rxr->ptag,
+			    rxbuf->pmap, mp, pseg, &nsegs, BUS_DMA_NOWAIT);
+			if (error != 0) {
+				printf("Refresh mbufs: payload dmamap load"
+				    " failure - %d\n", error);
+				m_free(mp);
+				rxbuf->m_pack = NULL;
+				goto update;
+			}
+			rxbuf->m_pack = mp;
+			bus_dmamap_sync(rxr->ptag, rxbuf->pmap,
+			    BUS_DMASYNC_PREREAD);
+			rxbuf->paddr = rxr->rx_base[i].read.pkt_addr =
+			    htole64(pseg[0].ds_addr);
+		} else {
+			rxr->rx_base[i].read.pkt_addr = rxbuf->paddr;
+			rxbuf->flags &= ~IXGBE_RX_COPY;
 		}
-		rxbuf->m_pack = mp;
-		bus_dmamap_sync(rxr->ptag, rxbuf->pmap,
-		    BUS_DMASYNC_PREREAD);
-		rxr->rx_base[i].read.pkt_addr =
-		    htole64(pseg[0].ds_addr);
 
 		refreshed = TRUE;
 		/* Next is precalculated */
@@ -4061,6 +4070,7 @@ skip_head:
 	rxr->next_to_refresh = 0;
 	rxr->lro_enabled = FALSE;
 	rxr->rx_split_packets = 0;
+	rxr->rx_copies = 0;
 	rxr->rx_bytes = 0;
 	rxr->discard = FALSE;
 	rxr->vtag_strip = FALSE;
@@ -4618,14 +4628,37 @@ ixgbe_rxeof(struct ix_queue *que, int co
 			** that determines what we are
 			*/
 			sendmp = rbuf->fmp;
-			rbuf->m_pack = rbuf->fmp = NULL;
 			if (sendmp != NULL) {  /* secondary frag */
+				rbuf->m_pack = rbuf->fmp = NULL;
 				mp->m_flags &= ~M_PKTHDR;
 				sendmp->m_pkthdr.len += mp->m_len;
 			} else {
+				/*
+				 * Optimize.  This might be a small packet,
+				 * maybe just a TCP ACK.  Do a fast copy that
+				 * is cache aligned into a new mbuf, and
+				 * leave the old mbuf+cluster for re-use.
+				 */
+				if (eop && plen <= IXGBE_RX_COPY_LEN) {
+					prefetch(mp->m_data);
+					sendmp = m_gethdr(M_DONTWAIT, MT_DATA);
+					if (sendmp != NULL) {
+						sendmp->m_data +=
+						    IXGBE_RX_COPY_ALIGN;
+						ixgbe_bcopy(mp->m_data,
+						    sendmp->m_data, plen);
+						sendmp->m_len = plen;
+						rxr->rx_copies++;
+						rbuf->flags |= IXGBE_RX_COPY;
+					}
+				}
+				if (sendmp == NULL) {
+					rbuf->m_pack = rbuf->fmp = NULL;
+					sendmp = mp;
+				}
+
 				/* first desc of a non-ps chain */
-				sendmp = mp;
 				sendmp->m_flags |= M_PKTHDR;
 				sendmp->m_pkthdr.len = mp->m_len;
 				if (staterr & IXGBE_RXD_STAT_VP) {
@@ -5476,6 +5509,9 @@ ixgbe_add_hw_stats(struct adapter *adapt
 		SYSCTL_ADD_UQUAD(ctx, queue_list, OID_AUTO, "rx_bytes",
 				CTLFLAG_RD, &rxr->rx_bytes,
 				"Queue Bytes Received");
+		SYSCTL_ADD_UQUAD(ctx, queue_list, OID_AUTO, "rx_copies",
+				CTLFLAG_RD, &rxr->rx_copies,
+				"Copied RX Frames");
 		SYSCTL_ADD_INT(ctx, queue_list, OID_AUTO, "lro_queued",
 				CTLFLAG_RD, &lro->lro_queued, 0,
 				"LRO Queued");

Modified: head/sys/dev/ixgbe/ixgbe.h
==============================================================================
--- head/sys/dev/ixgbe/ixgbe.h	Fri Aug 31 09:42:46 2012	(r239939)
+++ head/sys/dev/ixgbe/ixgbe.h	Fri Aug 31 10:07:38 2012	(r239940)
@@ -154,6 +154,19 @@
 #define IXGBE_FC_HI		0x20000
 #define IXGBE_FC_LO		0x10000
 
+/*
+ * Used for optimizing small rx mbufs.  Effort is made to keep the copy
+ * small and aligned for the CPU L1 cache.
+ *
+ * MHLEN is typically 168 bytes, giving us 8-byte alignment.  Getting
+ * 32 byte alignment needed for the fast bcopy results in 8 bytes being
+ * wasted.  Getting 64 byte alignment, which _should_ be ideal for
+ * modern Intel CPUs, results in 40 bytes wasted and a significant drop
+ * in observed efficiency of the optimization, 97.9% -> 81.8%.
+ */
+#define IXGBE_RX_COPY_LEN	160
+#define IXGBE_RX_COPY_ALIGN	(MHLEN - IXGBE_RX_COPY_LEN)
+
 /* Keep older OS drivers building... */
 #if !defined(SYSCTL_ADD_UQUAD)
 #define SYSCTL_ADD_UQUAD SYSCTL_ADD_QUAD
@@ -245,6 +258,9 @@ struct ixgbe_rx_buf {
 	struct mbuf	*fmp;
 	bus_dmamap_t	hmap;
 	bus_dmamap_t	pmap;
+	u_int		flags;
+#define IXGBE_RX_COPY	0x01
+	uint64_t	paddr;
 };
 
 /*
@@ -339,6 +355,7 @@ struct rx_ring {
 	/* Soft stats */
 	u64			rx_irq;
 	u64			rx_split_packets;
+	u64			rx_copies;
 	u64			rx_packets;
 	u64			rx_bytes;
 	u64			rx_discarded;

Modified: head/sys/dev/ixgbe/ixgbe_osdep.h
==============================================================================
--- head/sys/dev/ixgbe/ixgbe_osdep.h	Fri Aug 31 09:42:46 2012	(r239939)
+++ head/sys/dev/ixgbe/ixgbe_osdep.h	Fri Aug 31 10:07:38 2012	(r239940)
@@ -143,6 +143,25 @@ void prefetch(void *x)
 #define prefetch(x)
 #endif
 
+/*
+ * Optimized bcopy thanks to Luigi Rizzo's investigative work.  Assumes
+ * non-overlapping regions and 32-byte padding on both src and dst.
+ */
+static __inline int
+ixgbe_bcopy(void *_src, void *_dst, int l)
+{
+	uint64_t *src = _src;
+	uint64_t *dst = _dst;
+
+	for (; l > 0; l -= 32) {
+		*dst++ = *src++;
+		*dst++ = *src++;
+		*dst++ = *src++;
+		*dst++ = *src++;
+	}
+	return (0);
+}
+
 struct ixgbe_osdep
 {
 	bus_space_tag_t    mem_bus_space_tag;