From owner-p4-projects@FreeBSD.ORG Wed Nov  8 00:40:57 2006
Return-Path: <owner-p4-projects@freebsd.org>
X-Original-To: p4-projects@freebsd.org
Delivered-To: p4-projects@freebsd.org
Received: by hub.freebsd.org (Postfix, from userid 32767)
	id 25B8416A4C9; Wed, 8 Nov 2006 00:40:57 +0000 (UTC)
X-Original-To: perforce@freebsd.org
Delivered-To: perforce@freebsd.org
Received: from mx1.FreeBSD.org (mx1.freebsd.org [216.136.204.125])
	by hub.freebsd.org (Postfix) with ESMTP id F084E16A47E
	for <perforce@freebsd.org>; Wed, 8 Nov 2006 00:40:56 +0000 (UTC)
	(envelope-from sam@freebsd.org)
Received: from repoman.freebsd.org (repoman.freebsd.org [216.136.204.115])
	by mx1.FreeBSD.org (Postfix) with ESMTP id BB78B43D9A
	for <perforce@freebsd.org>; Wed, 8 Nov 2006 00:40:20 +0000 (GMT)
	(envelope-from sam@freebsd.org)
Received: from repoman.freebsd.org (localhost [127.0.0.1])
	by repoman.freebsd.org (8.13.6/8.13.6) with ESMTP id kA80eJtJ092833
	for <perforce@freebsd.org>; Wed, 8 Nov 2006 00:40:19 GMT
	(envelope-from sam@freebsd.org)
Received: (from perforce@localhost)
	by repoman.freebsd.org (8.13.6/8.13.4/Submit) id kA80eJfa092830
	for perforce@freebsd.org; Wed, 8 Nov 2006 00:40:19 GMT
	(envelope-from sam@freebsd.org)
Date: Wed, 8 Nov 2006 00:40:19 GMT
Message-Id: <200611080040.kA80eJfa092830@repoman.freebsd.org>
X-Authentication-Warning: repoman.freebsd.org: perforce set sender to sam@freebsd.org using -f
From: Sam Leffler <sam@freebsd.org>
To: Perforce Change Reviews <perforce@freebsd.org>
Cc:
Subject: PERFORCE change 109499 for review
X-BeenThere: p4-projects@freebsd.org
X-Mailman-Version: 2.1.5
Precedence: list
List-Id: p4 projects tree changes
List-Unsubscribe: ,
List-Archive:
List-Post:
List-Help:
List-Subscribe: ,
X-List-Received-Date: Wed, 08 Nov 2006 00:40:57 -0000

http://perforce.freebsd.org/chv.cgi?CH=109499

Change 109499 by sam@sam_ebb on 2006/11/08 00:39:22

	Add multi-segment tx:
	o change ix_ne in npebuf to an NPE_MAXSEG array of descriptors
	  (3 for now, based on tracing traffic for NFS root mount and
	  normal traffic patterns running diskless)
	o bring in defrag code from ath to handle the case where the
	  mbuf chain doesn't fit

	Gets us >20% improvement for upstream TCP netperf on a
	WITNESS+INVARIANTS kernel.

	Note: rx buffers get NPE_MAXSEG-1 unused descriptors (~8 Kbytes
	right now); this can easily be reclaimed.

	Note: can optimize npebuf setup a bit in the tx path by unrolling
	the loop and eliminating the extraneous write to the uncached
	npebuf.

Affected files ...

.. //depot/projects/arm/src/sys/arm/xscale/ixp425/if_npe.c#15 edit
.. //depot/projects/arm/src/sys/arm/xscale/ixp425/if_npereg.h#3 edit

Differences ...
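Review aid (not part of the change): the reworked tx path below follows
the usual busdma scatter/gather pattern -- load the mbuf chain,
defragment on EFBIG, retry the load once, then chain one NPE descriptor
per DMA segment.  A condensed sketch of that flow, with error printing
trimmed and identifiers as in the npestart() hunks below:

	error = bus_dmamap_load_mbuf_sg(dma->mtag, npe->ix_map,
	    m, segs, &nseg, 0);
	if (error == EFBIG) {
		/* too many segments: compact the chain, retry once */
		n = npe_defrag(m, M_DONTWAIT, NPE_MAXSEG);
		if (n == NULL)
			return;		/* chain cannot be made to fit */
		m = n;
		error = bus_dmamap_load_mbuf_sg(dma->mtag, npe->ix_map,
		    m, segs, &nseg, 0);
	}
	/* one descriptor per segment; the NPE follows phys next ptrs */
	len = m->m_pkthdr.len;	/* frame length rides in descriptor 0 only */
	next = npe->ix_neaddr + sizeof(npe->ix_ne[0]);
	for (i = 0; i < nseg; i++) {
		npe->ix_ne[i].data = htobe32(segs[i].ds_addr);
		/* high 16 bits: buffer len; low 16: frame len (0 past seg 0) */
		npe->ix_ne[i].len = htobe32((segs[i].ds_len << 16) | len);
		npe->ix_ne[i].next = htobe32(next);
		len = 0;
		next += sizeof(npe->ix_ne[0]);
	}
	npe->ix_ne[i-1].next = 0;	/* terminate the chain */

Defragmentation runs only when the initial load fails with EFBIG, so
the common case pays nothing extra for the fallback.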
==== //depot/projects/arm/src/sys/arm/xscale/ixp425/if_npe.c#15 (text+ko) ====

@@ -380,7 +380,7 @@
 
 static int
 npe_dma_setup(struct npe_softc *sc, struct npedma *dma,
-	const char *name, int nbuf)
+	const char *name, int nbuf, int maxseg)
 {
 	int error, i;
 
@@ -391,7 +391,7 @@
 
 	/* DMA tag for mapped mbufs */
 	error = bus_dma_tag_create(NULL, 1, 0, BUS_SPACE_MAXADDR_32BIT,
-	    BUS_SPACE_MAXADDR, NULL, NULL, MCLBYTES, 1, MCLBYTES, 0,
+	    BUS_SPACE_MAXADDR, NULL, NULL, MCLBYTES, maxseg, MCLBYTES, 0,
 	    busdma_lock_mutex, &sc->sc_mtx, &dma->mtag);
 	if (error != 0) {
 		device_printf(sc->sc_dev, "unable to create %s mbuf dma tag, "
@@ -508,10 +508,11 @@
 		}
 	} else
 		sc->sc_miih = sc->sc_ioh;
-	error = npe_dma_setup(sc, &sc->txdma, "tx", NPE_MAX_TX_BUFFERS);
+	error = npe_dma_setup(sc, &sc->txdma, "tx", NPE_MAX_TX_BUFFERS,
+	    NPE_MAXSEG);
 	if (error != 0)
 		return error;
-	error = npe_dma_setup(sc, &sc->rxdma, "rx", NPE_MAX_RX_BUFFERS);
+	error = npe_dma_setup(sc, &sc->rxdma, "rx", NPE_MAX_RX_BUFFERS, 1);
 	if (error != 0)
 		return error;
 
@@ -753,6 +754,7 @@
 	uint32_t entry;
 
 	NPE_LOCK(sc);
+	/* XXX max # at a time? */
 	while (ixpqmgr_qread(qid, &entry) == 0) {
 		struct npebuf *npe = P2V(NPE_QM_Q_ADDR(entry));
 
@@ -786,7 +788,6 @@
 		m = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR);
 		if (m == NULL)
 			return ENOBUFS;
-		m->m_len = MCLBYTES;
 	}
 	KASSERT(m->m_ext.ext_size >= 1536 + ETHER_ALIGN,
 	    ("ext_size %d", m->m_ext.ext_size));
@@ -799,11 +800,11 @@
 		m_freem(m);
 		return error;
 	}
-	npe->ix_ne_data = htobe32(segs[0].ds_addr);
+	npe->ix_ne[0].data = htobe32(segs[0].ds_addr);
 	/* NB: NPE requires length be a multiple of 64 */
 	/* NB: buffer length is shifted in word */
-	npe->ix_ne_len = htobe32(segs[0].ds_len << 16);
-	npe->ix_ne_next = 0;
+	npe->ix_ne[0].len = htobe32(segs[0].ds_len << 16);
+	npe->ix_ne[0].next = 0;
 	npe->ix_m = m;
 	/* Flush the memory in the mbuf */
 	bus_dmamap_sync(dma->mtag, npe->ix_map, BUS_DMASYNC_PREREAD);
@@ -830,7 +831,7 @@
 		struct mbuf *m;
 
 		DPRINTF(sc, "%s: entry 0x%x neaddr 0x%x ne_len 0x%x\n",
-		    __func__, entry, npe->ix_neaddr, npe->ix_ne_len);/*XXX*/
+		    __func__, entry, npe->ix_neaddr, npe->ix_ne[0].len);
 		/*
 		 * Allocate a new mbuf to replenish the rx buffer.
 		 * If doing so fails we drop the rx'd frame so we
@@ -848,7 +849,7 @@
 			    BUS_DMASYNC_POSTREAD);
 
 			/* set m_len etc. per rx frame size */
-			mrx->m_len = be32toh(npe->ix_ne_len) & 0xffff;
+			mrx->m_len = be32toh(npe->ix_ne[0].len) & 0xffff;
 			mrx->m_pkthdr.len = mrx->m_len;
 			mrx->m_pkthdr.rcvif = ifp;
 			mrx->m_flags |= M_HASFCS;
@@ -862,8 +863,8 @@
 			}
 		} else {
 			m = npe->ix_m;
-			npe->ix_ne_len = htobe32(m->m_len << 16);
-			npe->ix_ne_next = 0;
+			npe->ix_ne[0].len = htobe32(m->m_len << 16);
+			npe->ix_ne[0].next = 0;
 			/* XXX? sync? */
 		}
 		bus_dmamap_sync(dma->buf_tag, dma->buf_map,
@@ -1000,27 +1001,88 @@
 	NPE_UNLOCK(sc);
 }
 
+/*
+ * Defragment an mbuf chain, returning at most maxfrags separate
+ * mbufs+clusters.  If this is not possible NULL is returned and
+ * the original mbuf chain is left in its present (potentially
+ * modified) state.  We use two techniques: collapsing consecutive
+ * mbufs and replacing consecutive mbufs by a cluster.
+ */
 static struct mbuf *
-npe_linearize(struct mbuf *m0, int how)
+npe_defrag(struct mbuf *m0, int how, int maxfrags)
 {
-	struct mbuf *m, *n;
+	struct mbuf *m, *n, *n2, **prev;
+	u_int curfrags;
 
-	if (m0->m_pkthdr.len > MHLEN)
-		n = m_getcl(how, MT_DATA, M_PKTHDR);
-	else
-		n = m_gethdr(how, MT_DATA);
-	if (n != NULL) {
-		n->m_len = 0;	/* NB: not initialized on alloc */
-		for (m = m0; m != NULL; m = m->m_next) {
-			bcopy(mtod(m, void *), mtod(n, char *) + n->m_len,
-			    m->m_len);
-			n->m_len += m->m_len;
+	/*
+	 * Calculate the current number of frags.
+	 */
+	curfrags = 0;
+	for (m = m0; m != NULL; m = m->m_next)
+		curfrags++;
+	/*
+	 * First, try to collapse mbufs.  Note that we always collapse
+	 * towards the front so we don't need to deal with moving the
+	 * pkthdr.  This may be suboptimal if the first mbuf has much
+	 * less data than the following.
+	 */
+	m = m0;
+again:
+	for (;;) {
+		n = m->m_next;
+		if (n == NULL)
+			break;
+		if ((m->m_flags & M_RDONLY) == 0 &&
+		    n->m_len < M_TRAILINGSPACE(m)) {
+			bcopy(mtod(n, void *), mtod(m, char *) + m->m_len,
+			    n->m_len);
+			m->m_len += n->m_len;
+			m->m_next = n->m_next;
+			m_free(n);
+			if (--curfrags <= maxfrags)
+				return m0;
+		} else
+			m = n;
+	}
+	KASSERT(maxfrags > 1,
+	    ("maxfrags %u, but normal collapse failed", maxfrags));
+	/*
+	 * Collapse consecutive mbufs to a cluster.
+	 */
+	prev = &m0->m_next;		/* NB: not the first mbuf */
+	while ((n = *prev) != NULL) {
+		if ((n2 = n->m_next) != NULL &&
+		    n->m_len + n2->m_len < MCLBYTES) {
+			m = m_getcl(how, MT_DATA, 0);
+			if (m == NULL)
+				goto bad;
+			bcopy(mtod(n, void *), mtod(m, void *), n->m_len);
+			bcopy(mtod(n2, void *), mtod(m, char *) + n->m_len,
+			    n2->m_len);
+			m->m_len = n->m_len + n2->m_len;
+			m->m_next = n2->m_next;
+			*prev = m;
+			m_free(n);
+			m_free(n2);
+			if (--curfrags <= maxfrags)	/* +1 cl -2 mbufs */
+				return m0;
+			/*
+			 * Still not there, try the normal collapse
+			 * again before we allocate another cluster.
+			 */
+			goto again;
 		}
-		/* NB: this works because we never change m_final->m_data */
-		m_move_pkthdr(n, m0);
+		prev = &n->m_next;
 	}
-	m_freem(m0);
-	return n;
+	/*
+	 * No place where we can collapse to a cluster; punt.
+	 * This can occur if, for example, you request 2 frags
+	 * but the packet requires that both be clusters (we
+	 * never reallocate the first mbuf to avoid moving the
+	 * packet header).
+	 */
+bad:
+	return NULL;
 }
 
 /*
@@ -1031,10 +1093,11 @@
 {
 	struct npe_softc *sc = ifp->if_softc;
 	struct npebuf *npe;
-	struct mbuf *m;
+	struct mbuf *m, *n;
 	struct npedma *dma = &sc->txdma;
-	bus_dma_segment_t segs[1];
-	int nseg, len;
+	bus_dma_segment_t segs[NPE_MAXSEG];
+	int nseg, len, error, i;
+	uint32_t next;
 
 	NPE_ASSERT_LOCKED(sc);
 	/* XXX can this happen? */
@@ -1049,19 +1112,27 @@
 			return;
 		}
 		npe = sc->tx_free;
-		if (m->m_next != NULL) {
-			m = npe_linearize(m, M_DONTWAIT);
-			if (m == NULL)
-				return;
+		error = bus_dmamap_load_mbuf_sg(dma->mtag, npe->ix_map,
+		    m, segs, &nseg, 0);
+		if (error == EFBIG) {
+			n = npe_defrag(m, M_DONTWAIT, NPE_MAXSEG);
+			if (n == NULL) {
+				if_printf(ifp, "%s: too many fragments %u\n",
+				    __func__, nseg);
+				m_freem(m);
+				return;	/* XXX? */
+			}
+			m = n;
+			error = bus_dmamap_load_mbuf_sg(dma->mtag, npe->ix_map,
+			    m, segs, &nseg, 0);
 		}
-		if (bus_dmamap_load_mbuf_sg(dma->mtag, npe->ix_map,
-		    m, segs, &nseg, 0) != 0) {
+		if (error != 0 || nseg == 0) {
+			if_printf(ifp, "%s: error %u nseg %u\n",
+			    __func__, error, nseg);
 			m_freem(m);
-			continue;
+			return;	/* XXX? */
 		}
 		sc->tx_free = npe->ix_next;
-		if (sc->tx_free == NULL)
-			ifp->if_drv_flags |= IFF_DRV_OACTIVE;
 
 		bus_dmamap_sync(dma->mtag, npe->ix_map, BUS_DMASYNC_PREWRITE);
@@ -1071,22 +1142,30 @@
 		BPF_MTAP(ifp, m);
 
 		npe->ix_m = m;
-		npe->ix_ne_data = htobe32(segs[0].ds_addr);
-		len = segs[0].ds_len;
-		/* NB: this sets both frame and buffer lengths */
-		npe->ix_ne_len = htobe32((len<<16) | len);
-		npe->ix_ne_next = 0;	/* NB: no chaining (yet) */
+		len = m->m_pkthdr.len;
+		next = npe->ix_neaddr + sizeof(npe->ix_ne[0]);
+		for (i = 0; i < nseg; i++) {
+			npe->ix_ne[i].data = htobe32(segs[i].ds_addr);
+			npe->ix_ne[i].len = htobe32((segs[i].ds_len<<16) | len);
+			npe->ix_ne[i].next = htobe32(next);
+
+			len = 0;		/* zero for segments > 1 */
+			next += sizeof(npe->ix_ne[0]);
+		}
+		npe->ix_ne[i-1].next = 0;	/* zero last in chain */
 		/* XXX flush descriptor instead of using uncached memory */
 
 		DPRINTF(sc, "%s: qwrite(%u, 0x%x) ne_data %x ne_len 0x%x\n",
 		    __func__, sc->tx_qid, npe->ix_neaddr,
-		    npe->ix_ne_data, npe->ix_ne_len);
+		    npe->ix_ne[0].data, npe->ix_ne[0].len);
 		/* stick it on the tx q */
 		/* XXX add vlan priority */
 		ixpqmgr_qwrite(sc->tx_qid, npe->ix_neaddr);
 
 		ifp->if_timer = 5;
 	}
+	if (sc->tx_free == NULL)
+		ifp->if_drv_flags |= IFF_DRV_OACTIVE;
 }
 
 void

==== //depot/projects/arm/src/sys/arm/xscale/ixp425/if_npereg.h#3 (text+ko) ====

@@ -67,16 +67,20 @@
  * of the Intel code all the s/w area is free for us to use as we
 * choose--only the npe area layout and alignment must be honored.
  */
+#define	NPE_MAXSEG	3		/* empirically selected */
+
 struct npebuf {
 	struct npebuf	*ix_next;	/* chain to next buffer */
 	void		*ix_m;		/* backpointer to mbuf */
 	uint32_t	ix_neaddr;	/* phys address of ix_ne */
 	bus_dmamap_t	ix_map;		/* bus dma map for associated data */
 	uint32_t	ix_reserved[4];
-	uint32_t	ix_ne[8];	/* NPE shared area, cacheline aligned */
-#define	ix_ne_next	ix_ne[0]	/* phys addr of next buffer */
-#define	ix_ne_len	ix_ne[1]	/* buffer length (bytes) */
-#define	ix_ne_data	ix_ne[2]	/* phys addr of data buffer */
+	struct {			/* NPE shared area, cacheline aligned */
+		uint32_t next;		/* phys addr of next segment */
+		uint32_t len;		/* buffer/segment length (bytes) */
+		uint32_t data;		/* phys addr of data segment */
+		uint32_t pad[5];	/* pad to cacheline */
+	} ix_ne[NPE_MAXSEG];
 };
 
 #define	NPE_PORTS_MAX		3
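
Review aid (not part of the change): each new ix_ne[] element is eight
32-bit words (next + len + data + pad[5]), i.e. 32 bytes, matching the
XScale cacheline, so every segment descriptor occupies the same
8-word/32-byte unit the old uint32_t ix_ne[8] did.  A compile-time
guard along these lines could make that explicit (a sketch using
CTASSERT and offsetof from the kernel headers, assuming 32-bit
pointers as on the ixp425; not part of this change):

	/* each NPE shared-area descriptor must fill one 32-byte cacheline */
	CTASSERT(sizeof(((struct npebuf *)0)->ix_ne[0]) == 32);
	/* ix_ne must sit at a cacheline-multiple offset within npebuf */
	CTASSERT((offsetof(struct npebuf, ix_ne) & 31) == 0);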