Date:      Tue, 13 Sep 2011 14:36:53 -0400
From:      Arnaud Lacombe <lacombar@gmail.com>
To:        Jack Vogel <jfvogel@gmail.com>
Cc:        freebsd-net@freebsd.org
Subject:   Re: FreeBSD 7-STABLE mbuf corruption
Message-ID:  <CACqU3MV7JRxQ_mNeHCk7RVyzETZLAcc3XL=xyZ-qqtPfRxkZeQ@mail.gmail.com>
In-Reply-To: <CAFOYbc=mu7rGU8LudLSzZwKfM3QHFw+PGEHHKD3rcA2=dxGfoQ@mail.gmail.com>
References:  <CACqU3MUs9Z9GeuGe=8iVp=MWV6eG-tO+kHb1znatsTq2uEqwvA@mail.gmail.com> <CACqU3MXf52tLajTfVCEiGGhtCuXsesrdM65LfsoGecuZj2tNwA@mail.gmail.com> <CAFOYbc=mu7rGU8LudLSzZwKfM3QHFw+PGEHHKD3rcA2=dxGfoQ@mail.gmail.com>


[-- Attachment #1 --]
Hi,

On Wed, Sep 7, 2011 at 7:57 PM, Jack Vogel <jfvogel@gmail.com> wrote:
> I have seen this, but I don't have any hot ideas right off the top of my
> head yet :(
>
I've now been running the following patches for 19h:
 - backport of kmacy@'s buf_ring(9) API, from FreeBSD 8 (from [0], see
attachment for full diff)
 - conversion of igb(4), from CURRENT, to use buf_ring(9) on FreeBSD
7.1 (see attachment)
 - all the original patches I already sent

It did not crash, yet. The only downside is that after 3h30 and ~4h,
the igb(4) queues' handlers started spinning indefinitely, breaking
network connectivity.

I would be tempted to say that the infinite loop is an igb(4) issue
(separate from the original crashes), and to link the crashes I was
seeing to a race in the legacy IFQ code...

 - Arnaud


[0]: roughly, a cherry-pick of r185162, r185164, r185193, r185543,
r186207, r186213, r191033, r191161, r191899, r193848 and r194518.
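
FWIW, after the conversion, the transmit path of a driver built on the
backported drbr/buf_ring(9) helpers roughly looks like the sketch below.
This is a simplified, hypothetical illustration only -- struct example_txq,
example_transmit() and example_encap() are made-up names; the real code is
in the attached IGB_MULTIQUEUE patch:

/*
 * Hypothetical sketch, not the actual igb(4) code: a single-queue
 * if_transmit built on the drbr/buf_ring(9) helpers from the attached diff.
 */
struct example_txq {
	struct mtx	 mtx;	/* serializes the single consumer */
	struct buf_ring	*br;	/* allocated with buf_ring_alloc(..., &mtx) */
};

static void example_encap(struct example_txq *, struct mbuf *);	/* hw-specific, not shown */

static int
example_transmit(struct ifnet *ifp, struct mbuf *m)
{
	struct example_txq *txq = ifp->if_softc;	/* one queue, for brevity */
	int err;

	/* Producer side is lock-free; drbr_enqueue() frees m on ENOBUFS. */
	if ((err = drbr_enqueue(ifp, txq->br, m)) != 0)
		return (err);

	/* Drain under the TX lock: the dequeue side is single-consumer. */
	if (mtx_trylock(&txq->mtx)) {
		while ((m = drbr_dequeue(ifp, txq->br)) != NULL)
			example_encap(txq, m);
		mtx_unlock(&txq->mtx);
	}
	return (0);
}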

> Jack
>
>
> On Wed, Sep 7, 2011 at 4:19 PM, Arnaud Lacombe <lacombar@gmail.com> wrote:
>>
>> Hi,
>>
>> On Mon, Sep 5, 2011 at 2:59 AM, Arnaud Lacombe <lacombar@gmail.com> wrote:
>> > Hi folks,
>> >
>> > We have been trying to track down bad mbuf management for about two
>> > weeks on a customized 7.1 base. I have finally been able to reproduce
>> > it with a stock FreeBSD 7-STABLE (kernel from r225276, userland from
>> > 7.4).
>> >
>> > With the help of the attached patches, I have just been able to
>> > trigger the following panic:
>> >
>> > panic: Corrupted unused flags, expected 0xffffffff00000000, got 0x0,
>> > flags 0x3
>> > cpuid = 1
>> > Uptime: 3d10h5m3s
>> > Cannot dump. No dump device defined
>> >
>> General form of the crash is:
>>
>> panic: Corrupted unused flags, expected 0xffffffff00000000, got
>> 0xbabe0000000000, flags 0xbabe0000babe00
>> cpuid = 0
>> KDB: stack backtrace:
>> db_trace_self_wrapper(c0874e29,0,c0835757,f4574c48,0,...) at
>> db_trace_self_wrapper+0x26
>> panic(c0835757,0,ffffffff,0,babe00,...) at panic+0x10b
>> igb_txeof(c6a25008,0,c0837083,5ea,17c,...) at igb_txeof+0x399
>> igb_msix_que(c6a2b800,0,c084d367,4b6,c69dd068,...) at igb_msix_que+0x7b
>> ithread_loop(c6a29090,f4574d38,c084d0db,31c,c6a16828,...) at
>> ithread_loop+0xc3
>> fork_exit(c061d520,c6a29090,f4574d38) at fork_exit+0xa6
>> fork_trampoline() at fork_trampoline+0x8
>> --- trap 0, eip = 0, esp = 0xf4574d70, ebp = 0 ---
>> Uptime: 1m42s
>>
>> It happens particularly easily when the box receives a wall of SYNs
>> (about 1000 connection attempts at once) every 5s or so.
>>
>>  - Arnaud
>>
>> >
>> > [cut stuff no one cares about...]
>
>

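As a side note, the "wall of SYN" trigger described above (about 1000
connection attempts at once, every 5s or so) is easy to approximate with a
trivial generator along the following lines. This is only an illustration
of that kind of load, not necessarily how it was produced here:

/*
 * Crude illustration of the load described above: fire ~1000 non-blocking
 * TCP connection attempts at <ip> <port> every 5 seconds.  Each connect()
 * puts a SYN on the wire before the socket is closed.
 */
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int
main(int argc, char **argv)
{
	struct sockaddr_in sin;
	int i, s;

	if (argc != 3) {
		fprintf(stderr, "usage: %s <ip> <port>\n", argv[0]);
		return (1);
	}
	memset(&sin, 0, sizeof(sin));
	sin.sin_family = AF_INET;
	sin.sin_port = htons((unsigned short)atoi(argv[2]));
	sin.sin_addr.s_addr = inet_addr(argv[1]);

	for (;;) {
		for (i = 0; i < 1000; i++) {
			if ((s = socket(AF_INET, SOCK_STREAM, 0)) < 0)
				continue;
			fcntl(s, F_SETFL, O_NONBLOCK);
			connect(s, (struct sockaddr *)&sin, sizeof(sin));
			close(s);
		}
		sleep(5);
	}
}
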
[-- Attachment #2 --]
diff --git a/sys/amd64/include/atomic.h b/sys/amd64/include/atomic.h
index 52c90c9..a9971cd 100644
--- a/sys/amd64/include/atomic.h
+++ b/sys/amd64/include/atomic.h
@@ -32,6 +32,10 @@
 #error this file needs sys/cdefs.h as a prerequisite
 #endif
 
+#define mb()	__asm__ __volatile__ ("mfence;": : :"memory")
+#define wmb()	__asm__ __volatile__ ("sfence;": : :"memory")
+#define rmb()	__asm__ __volatile__ ("lfence;": : :"memory")
+
 /*
  * Various simple operations on memory, each of which is atomic in the
  * presence of interrupts and multiple processors.
diff --git a/sys/arm/include/atomic.h b/sys/arm/include/atomic.h
index bed5a72..f0be3ae 100644
--- a/sys/arm/include/atomic.h
+++ b/sys/arm/include/atomic.h
@@ -45,6 +45,11 @@
 
 #include <sys/types.h>
 
+
+#define	mb()
+#define	wmb()
+#define	rmb()
+
 #ifndef I32_bit
 #define I32_bit (1 << 7)        /* IRQ disable */
 #endif
diff --git a/sys/conf/files b/sys/conf/files
index 8226e11..d5351c1 100644
--- a/sys/conf/files
+++ b/sys/conf/files
@@ -1676,6 +1676,7 @@ kern/subr_acl_posix1e.c		standard
 kern/subr_autoconf.c		standard
 kern/subr_blist.c		standard
 kern/subr_bus.c			standard
+kern/subr_bufring.c		standard
 kern/subr_clock.c		standard
 kern/subr_devstat.c		standard
 kern/subr_disk.c		standard
diff --git a/sys/dev/bce/if_bcereg.h b/sys/dev/bce/if_bcereg.h
index 723fd26..2bb589f 100644
--- a/sys/dev/bce/if_bcereg.h
+++ b/sys/dev/bce/if_bcereg.h
@@ -541,6 +541,8 @@ default: DBPRINT(sc, BCE_INSANE_PHY,					\
 
 #endif /* BCE_DEBUG */
 
+
+#if __FreeBSD_version < 800054
 #if defined(__i386__) || defined(__amd64__)
 #define mb()    __asm volatile("mfence" ::: "memory")
 #define wmb()   __asm volatile("sfence" ::: "memory")
@@ -550,6 +552,7 @@ default: DBPRINT(sc, BCE_INSANE_PHY,					\
 #define rmb()
 #define wmb()
 #endif
+#endif
 
 /****************************************************************************/
 /* Device identification definitions.                                       */
diff --git a/sys/dev/cxgb/cxgb_adapter.h b/sys/dev/cxgb/cxgb_adapter.h
index b54d340..a6099a0 100644
--- a/sys/dev/cxgb/cxgb_adapter.h
+++ b/sys/dev/cxgb/cxgb_adapter.h
@@ -42,6 +42,7 @@ $FreeBSD$
 #include <sys/socket.h>
 #include <sys/sockio.h>
 #include <sys/condvar.h>
+#include <sys/buf_ring.h>
 
 #include <net/ethernet.h>
 #include <net/if.h>
@@ -265,7 +266,7 @@ struct sge_txq {
 	 * mbuf touches
 	 */
 	struct mbuf_head cleanq;	
-	struct buf_ring txq_mr;
+	struct buf_ring *txq_mr;
 	struct mbuf     *immpkt;
 	uint32_t        txq_drops;
 	uint32_t        txq_skipped;
diff --git a/sys/dev/cxgb/cxgb_multiq.c b/sys/dev/cxgb/cxgb_multiq.c
index 045b094..dc667be 100644
--- a/sys/dev/cxgb/cxgb_multiq.c
+++ b/sys/dev/cxgb/cxgb_multiq.c
@@ -134,7 +134,7 @@ cxgb_pcpu_enqueue_packet_(struct sge_qset *qs, struct mbuf *m)
 		return (ENETDOWN);
 	}
 	txq = &qs->txq[TXQ_ETH];
-	err = buf_ring_enqueue(&txq->txq_mr, m);
+	err = drbr_enqueue(qs->port->ifp, txq->txq_mr, m);
 	if (err) {
 		txq->txq_drops++;
 		m_freem(m);
@@ -199,14 +199,11 @@ cxgb_dequeue_packet(struct sge_txq *txq, struct mbuf **m_vec)
 	}
 	sc = qs->port->adapter;
 
-	m = buf_ring_dequeue(&txq->txq_mr);
+	m = buf_ring_dequeue_sc(txq->txq_mr);
 	if (m == NULL) 
 		return (0);
 
 	count = 1;
-	KASSERT(m->m_type == MT_DATA,
-	    ("m=%p is bad mbuf type %d from ring cons=%d prod=%d", m,
-		m->m_type, txq->txq_mr.br_cons, txq->txq_mr.br_prod));
 	m_vec[0] = m;
 	if (m->m_pkthdr.tso_segsz > 0 || m->m_pkthdr.len > TX_WR_SIZE_MAX ||
 	    m->m_next != NULL || (cxgb_pcpu_tx_coalesce == 0)) {
@@ -214,14 +211,14 @@ cxgb_dequeue_packet(struct sge_txq *txq, struct mbuf **m_vec)
 	}
 
 	size = m->m_pkthdr.len;
-	for (m = buf_ring_peek(&txq->txq_mr); m != NULL;
-	     m = buf_ring_peek(&txq->txq_mr)) {
+	for (m = buf_ring_peek(txq->txq_mr); m != NULL;
+	     m = buf_ring_peek(txq->txq_mr)) {
 
 		if (m->m_pkthdr.tso_segsz > 0 ||
 		    size + m->m_pkthdr.len > TX_WR_SIZE_MAX || m->m_next != NULL)
 			break;
 
-		buf_ring_dequeue(&txq->txq_mr);
+		buf_ring_dequeue_sc(txq->txq_mr);
 		size += m->m_pkthdr.len;
 		m_vec[count++] = m;
 
@@ -372,7 +369,7 @@ cxgb_pcpu_free(struct sge_qset *qs)
 	mtx_lock(&txq->lock);
 	while ((m = mbufq_dequeue(&txq->sendq)) != NULL) 
 		m_freem(m);
-	while ((m = buf_ring_dequeue(&txq->txq_mr)) != NULL) 
+	while ((m = buf_ring_dequeue_sc(txq->txq_mr)) != NULL) 
 		m_freem(m);
 
 	t3_free_tx_desc_all(txq);
@@ -434,7 +431,7 @@ cxgb_pcpu_start_(struct sge_qset *qs, struct mbuf *immpkt, int tx_flush)
 		initerr = ENETDOWN;
 	else if (immpkt) {
 
-		if (!buf_ring_empty(&txq->txq_mr)) 
+		if (!buf_ring_empty(txq->txq_mr)) 
 			initerr = cxgb_pcpu_enqueue_packet_(qs, immpkt);
 		else
 			txq->immpkt = immpkt;
@@ -465,7 +462,7 @@ cxgb_pcpu_start_(struct sge_qset *qs, struct mbuf *immpkt, int tx_flush)
 	}
 
 	stopped = isset(&qs->txq_stopped, TXQ_ETH);
-	flush = (((!buf_ring_empty(&txq->txq_mr) || (!IFQ_DRV_IS_EMPTY(&pi->ifp->if_snd))) && !stopped) || txq->immpkt); 
+	flush = (((!buf_ring_empty(txq->txq_mr) || (!IFQ_DRV_IS_EMPTY(&pi->ifp->if_snd))) && !stopped) || txq->immpkt); 
 	max_desc = tx_flush ? TX_ETH_Q_SIZE : TX_START_MAX_DESC;
 
 	if (cxgb_debug)
@@ -476,7 +473,7 @@ cxgb_pcpu_start_(struct sge_qset *qs, struct mbuf *immpkt, int tx_flush)
 
 
 	if ((tx_flush && flush && err == 0) &&
-	    (!buf_ring_empty(&txq->txq_mr)  ||
+	    (!buf_ring_empty(txq->txq_mr)  ||
 		!IFQ_DRV_IS_EMPTY(&pi->ifp->if_snd))) {
 		struct thread *td = curthread;
 
@@ -526,7 +523,7 @@ cxgb_pcpu_start(struct ifnet *ifp, struct mbuf *immpkt)
 	txq = &qs->txq[TXQ_ETH];
 
 	if (((sc->tunq_coalesce == 0) ||
-		(buf_ring_count(&txq->txq_mr) >= TX_WR_COUNT_MAX) ||
+		(buf_ring_count(txq->txq_mr) >= TX_WR_COUNT_MAX) ||
 		(cxgb_pcpu_tx_coalesce == 0)) && mtx_trylock(&txq->lock)) {
 		if (cxgb_debug)
 			printf("doing immediate transmit\n");
@@ -534,12 +531,12 @@ cxgb_pcpu_start(struct ifnet *ifp, struct mbuf *immpkt)
 		txq->flags |= TXQ_TRANSMITTING;
 		err = cxgb_pcpu_start_(qs, immpkt, FALSE);
 		txq->flags &= ~TXQ_TRANSMITTING;
-		resid = (buf_ring_count(&txq->txq_mr) > 64) || (desc_reclaimable(txq) > 64);
+		resid = (buf_ring_count(txq->txq_mr) > 64) || (desc_reclaimable(txq) > 64);
 		mtx_unlock(&txq->lock);
 	} else if (immpkt) {
 		if (cxgb_debug)
 			printf("deferred coalesce=%jx ring_count=%d mtx_owned=%d\n",
-			    sc->tunq_coalesce, buf_ring_count(&txq->txq_mr), mtx_owned(&txq->lock));
+			    sc->tunq_coalesce, buf_ring_count(txq->txq_mr), mtx_owned(&txq->lock));
 		err = cxgb_pcpu_enqueue_packet_(qs, immpkt);
 	}
 	
@@ -591,7 +588,7 @@ cxgb_pcpu_start_proc(void *arg)
 
 		if ((qs->port->ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
 			idleticks = hz;
-			if (!buf_ring_empty(&txq->txq_mr) ||
+			if (!buf_ring_empty(txq->txq_mr) ||
 			    !mbufq_empty(&txq->sendq))
 				cxgb_pcpu_free(qs);
 			goto done;
@@ -616,11 +613,13 @@ cxgb_pcpu_start_proc(void *arg)
 			mtx_unlock(&qs->rspq.lock);
 		}
 #endif		
-		if ((!buf_ring_empty(&txq->txq_mr)) && err == 0) {
+		if ((!buf_ring_empty(txq->txq_mr)) && err == 0) {
+#if 0
 			if (cxgb_debug)
 				printf("head=%p cons=%d prod=%d\n",
 				    txq->sendq.head, txq->txq_mr.br_cons,
 				    txq->txq_mr.br_prod);
+#endif			
 			continue;
 		}
 	done:	
diff --git a/sys/dev/cxgb/cxgb_osdep.h b/sys/dev/cxgb/cxgb_osdep.h
index 29b9e2f..4f42290 100644
--- a/sys/dev/cxgb/cxgb_osdep.h
+++ b/sys/dev/cxgb/cxgb_osdep.h
@@ -155,9 +155,6 @@ void cxgb_log_tcb(struct adapter *sc, unsigned int tid);
 
 
 #if defined(__i386__) || defined(__amd64__)
-#define mb()    __asm volatile("mfence":::"memory")
-#define rmb()   __asm volatile("lfence":::"memory")
-#define wmb()   __asm volatile("sfence" ::: "memory")
 #define smp_mb() mb()
 
 #define L1_CACHE_BYTES 128
@@ -178,163 +175,11 @@ extern void kdb_backtrace(void);
 
 
 #else /* !i386 && !amd64 */
-#define mb()
-#define rmb()
-#define wmb()
 #define smp_mb()
 #define prefetch(x)
 #define L1_CACHE_BYTES 32
 #endif
 
-struct buf_ring {
-	caddr_t          *br_ring;
-	volatile uint32_t br_cons;
-	volatile uint32_t br_prod;
-	int               br_size;
-	struct mtx        br_lock;
-};
-
-struct buf_ring *buf_ring_alloc(int count, int flags);
-void buf_ring_free(struct buf_ring *);
-
-static __inline int
-buf_ring_count(struct buf_ring *mr)
-{
-	int size = mr->br_size;
-	uint32_t mask = size - 1;
-	
-	return ((size + mr->br_prod - mr->br_cons) & mask);
-}
-
-static __inline int
-buf_ring_empty(struct buf_ring *mr)
-{
-	return (mr->br_cons == mr->br_prod);
-}
-
-static __inline int
-buf_ring_full(struct buf_ring *mr)
-{
-	uint32_t mask;
-
-	mask = mr->br_size - 1;
-	return (mr->br_cons == ((mr->br_prod + 1) & mask));
-}
-
-/*
- * The producer and consumer are independently locked
- * this relies on the consumer providing his own serialization
- *
- */
-static __inline void *
-buf_ring_dequeue(struct buf_ring *mr)
-{
-	uint32_t prod, cons, mask;
-	caddr_t *ring, m;
-	
-	ring = (caddr_t *)mr->br_ring;
-	mask = mr->br_size - 1;
-	cons = mr->br_cons;
-	mb();
-	prod = mr->br_prod;
-	m = NULL;
-	if (cons != prod) {
-		m = ring[cons];
-		ring[cons] = NULL;
-		mr->br_cons = (cons + 1) & mask;
-		mb();
-	}
-	return (m);
-}
-
-#ifdef DEBUG_BUFRING
-static __inline void
-__buf_ring_scan(struct buf_ring *mr, void *m, char *file, int line)
-{
-	int i;
-
-	for (i = 0; i < mr->br_size; i++)
-		if (m == mr->br_ring[i])
-			panic("%s:%d m=%p present prod=%d cons=%d idx=%d", file,
-			    line, m, mr->br_prod, mr->br_cons, i);
-}
-
-static __inline void
-buf_ring_scan(struct buf_ring *mr, void *m, char *file, int line)
-{
-	mtx_lock(&mr->br_lock);
-	__buf_ring_scan(mr, m, file, line);
-	mtx_unlock(&mr->br_lock);
-}
-
-#else
-static __inline void
-__buf_ring_scan(struct buf_ring *mr, void *m, char *file, int line)
-{
-}
-
-static __inline void
-buf_ring_scan(struct buf_ring *mr, void *m, char *file, int line)
-{
-}
-#endif
-
-static __inline int
-__buf_ring_enqueue(struct buf_ring *mr, void *m, char *file, int line)
-{
-	
-	uint32_t prod, cons, mask;
-	int err;
-	
-	mask = mr->br_size - 1;
-	prod = mr->br_prod;
-	mb();
-	cons = mr->br_cons;
-	__buf_ring_scan(mr, m, file, line);
-	if (((prod + 1) & mask) != cons) {
-		KASSERT(mr->br_ring[prod] == NULL, ("overwriting entry"));
-		mr->br_ring[prod] = m;
-		mb();
-		mr->br_prod = (prod + 1) & mask;
-		err = 0;
-	} else
-		err = ENOBUFS;
-
-	return (err);
-}
-
-static __inline int
-buf_ring_enqueue_(struct buf_ring *mr, void *m, char *file, int line)
-{
-	int err;
-	
-	mtx_lock(&mr->br_lock);
-	err = __buf_ring_enqueue(mr, m, file, line);
-	mtx_unlock(&mr->br_lock);
-
-	return (err);
-}
-
-#define buf_ring_enqueue(mr, m) buf_ring_enqueue_((mr), (m), __FILE__, __LINE__)
-
-
-static __inline void *
-buf_ring_peek(struct buf_ring *mr)
-{
-	int prod, cons, mask;
-	caddr_t *ring, m;
-	
-	ring = (caddr_t *)mr->br_ring;
-	mask = mr->br_size - 1;
-	cons = mr->br_cons;
-	prod = mr->br_prod;
-	m = NULL;
-	if (cons != prod)
-		m = ring[cons];
-
-	return (m);
-}
-
 #define DBG_RX          (1 << 0)
 static const int debug_flags = DBG_RX;
 
diff --git a/sys/dev/cxgb/cxgb_sge.c b/sys/dev/cxgb/cxgb_sge.c
index 26233d9..1a00d42 100644
--- a/sys/dev/cxgb/cxgb_sge.c
+++ b/sys/dev/cxgb/cxgb_sge.c
@@ -1716,10 +1716,8 @@ t3_free_qset(adapter_t *sc, struct sge_qset *q)
 	t3_free_tx_desc_all(&q->txq[TXQ_ETH]);
 	
 	for (i = 0; i < SGE_TXQ_PER_SET; i++) 
-		if (q->txq[i].txq_mr.br_ring != NULL) {
-			free(q->txq[i].txq_mr.br_ring, M_DEVBUF);
-			mtx_destroy(&q->txq[i].txq_mr.br_lock);
-		}
+		if (q->txq[i].txq_mr != NULL) 
+			buf_ring_free(q->txq[i].txq_mr, M_DEVBUF);
 	for (i = 0; i < SGE_RXQ_PER_SET; ++i) {
 		if (q->fl[i].desc) {
 			mtx_lock_spin(&sc->sge.reg_lock);
@@ -1874,7 +1872,6 @@ t3_free_tx_desc(struct sge_txq *q, int reclaimable)
 				txsd->flags &= ~TX_SW_DESC_MAPPED;
 			}
 			m_freem_iovec(&txsd->mi);	
-			buf_ring_scan(&q->txq_mr, txsd->mi.mi_base, __FILE__, __LINE__);
 			txsd->mi.mi_base = NULL;
 #if defined(DIAGNOSTIC) && 0
 			if (m_get_priority(txsd->m[0]) != cidx) 
@@ -2272,14 +2269,12 @@ t3_sge_alloc_qset(adapter_t *sc, u_int id, int nports, int irq_vec_idx,
 	int i, header_size, ret = 0;
 
 	for (i = 0; i < SGE_TXQ_PER_SET; i++) {
-		if ((q->txq[i].txq_mr.br_ring = malloc(cxgb_txq_buf_ring_size*sizeof(struct mbuf *),
-			    M_DEVBUF, M_WAITOK|M_ZERO)) == NULL) {
+		
+		if ((q->txq[i].txq_mr = buf_ring_alloc(cxgb_txq_buf_ring_size,
+			    M_DEVBUF, M_WAITOK, &q->txq[i].lock)) == NULL) {
 			device_printf(sc->dev, "failed to allocate mbuf ring\n");
 			goto err;
 		}
-		q->txq[i].txq_mr.br_prod = q->txq[i].txq_mr.br_cons = 0;
-		q->txq[i].txq_mr.br_size = cxgb_txq_buf_ring_size;
-		mtx_init(&q->txq[i].txq_mr.br_lock, "txq mbuf ring", NULL, MTX_DEF);
 	}
 
 	init_qset_cntxt(q, id);
@@ -3496,12 +3491,14 @@ t3_add_configured_sysctls(adapter_t *sc)
 			SYSCTL_ADD_INT(ctx, txqpoidlist, OID_AUTO, "sendqlen",
 			    CTLFLAG_RD, &qs->txq[TXQ_ETH].sendq.qlen,
 			    0, "#tunneled packets waiting to be sent");
+#if 0			
 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "queue_pidx",
 			    CTLFLAG_RD, (uint32_t *)(uintptr_t)&qs->txq[TXQ_ETH].txq_mr.br_prod,
 			    0, "#tunneled packets queue producer index");
 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "queue_cidx",
 			    CTLFLAG_RD, (uint32_t *)(uintptr_t)&qs->txq[TXQ_ETH].txq_mr.br_cons,
 			    0, "#tunneled packets queue consumer index");
+#endif			
 			SYSCTL_ADD_INT(ctx, txqpoidlist, OID_AUTO, "processed",
 			    CTLFLAG_RD, &qs->txq[TXQ_ETH].processed,
 			    0, "#tunneled packets processed by the card");
diff --git a/sys/dev/cxgb/sys/cxgb_support.c b/sys/dev/cxgb/sys/cxgb_support.c
index e911dfc..18f799e 100644
--- a/sys/dev/cxgb/sys/cxgb_support.c
+++ b/sys/dev/cxgb/sys/cxgb_support.c
@@ -308,33 +308,3 @@ free:
 		uma_zfree(zone, vec[i]);
 }
 	
-struct buf_ring *
-buf_ring_alloc(int count, int flags)
-{
-	struct buf_ring *br;
-
-	KASSERT(powerof2(count), ("buf ring must be size power of 2"));
-	
-	br = malloc(sizeof(struct buf_ring), M_DEVBUF, flags|M_ZERO);
-	if (br == NULL)
-		return (NULL);
-	
-	br->br_ring = malloc(sizeof(caddr_t)*count, M_DEVBUF, flags|M_ZERO);
-	if (br->br_ring == NULL) {
-		free(br, M_DEVBUF);
-		return (NULL);
-	}
-	
-	mtx_init(&br->br_lock, "buf ring", NULL, MTX_DUPOK|MTX_DEF);
-	br->br_size = count;
-	br->br_prod = br->br_cons = 0;
-
-	return (br);
-}
-
-void
-buf_ring_free(struct buf_ring *br)
-{
-	free(br->br_ring, M_DEVBUF);
-	free(br, M_DEVBUF);
-}
diff --git a/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_resource.c b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_resource.c
index 9d3618f..47d9d8e 100644
--- a/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_resource.c
+++ b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_resource.c
@@ -102,7 +102,7 @@ static int __cxio_init_resource_fifo(struct buf_ring **fifo,
 	u32 rarray[16];
 	mtx_init(fifo_lock, "cxio fifo", NULL, MTX_DEF|MTX_DUPOK);
 
-	*fifo = buf_ring_alloc(nr, M_NOWAIT);
+	*fifo = buf_ring_alloc(nr, M_DEVBUF, M_NOWAIT, fifo_lock);
 	if (*fifo == NULL)
 		return (-ENOMEM);
 #if 0
@@ -134,7 +134,7 @@ static int __cxio_init_resource_fifo(struct buf_ring **fifo,
 			buf_ring_enqueue(*fifo, (void *) (uintptr_t)i);
 #if 0
 	for (i = 0; i < skip_low + skip_high; i++)
-		buf_ring_dequeue(*fifo);
+		buf_ring_dequeue_sc(*fifo);
 #endif	
 	return 0;
 }
@@ -161,7 +161,8 @@ static int cxio_init_qpid_fifo(struct cxio_rdev *rdev_p)
 
 	mtx_init(&rdev_p->rscp->qpid_fifo_lock, "qpid fifo", NULL, MTX_DEF);
 
-	rdev_p->rscp->qpid_fifo = buf_ring_alloc(T3_MAX_NUM_QP, M_NOWAIT);
+	rdev_p->rscp->qpid_fifo = buf_ring_alloc(T3_MAX_NUM_QP, M_DEVBUF,
+	    M_NOWAIT, &rdev_p->rscp->qpid_fifo_lock);
 	if (rdev_p->rscp->qpid_fifo == NULL)
 		return (-ENOMEM);
 
@@ -180,7 +181,7 @@ int cxio_hal_init_rhdl_resource(u32 nr_rhdl)
 
 void cxio_hal_destroy_rhdl_resource(void)
 {
-	buf_ring_free(rhdl_fifo);
+	buf_ring_free(rhdl_fifo, M_DEVBUF);
 }
 #endif
 
@@ -214,11 +215,11 @@ int cxio_hal_init_resource(struct cxio_rdev *rdev_p,
 		goto pdid_err;
 	return 0;
 pdid_err:
-	buf_ring_free(rscp->cqid_fifo);
+	buf_ring_free(rscp->cqid_fifo, M_DEVBUF);
 cqid_err:
-	buf_ring_free(rscp->qpid_fifo);
+	buf_ring_free(rscp->qpid_fifo, M_DEVBUF);
 qpid_err:
-	buf_ring_free(rscp->tpt_fifo);
+	buf_ring_free(rscp->tpt_fifo, M_DEVBUF);
 tpt_err:
 	return (-ENOMEM);
 }
@@ -231,7 +232,7 @@ static u32 cxio_hal_get_resource(struct buf_ring *fifo, struct mtx *lock)
 	u32 entry;
 	
 	mtx_lock(lock);
-	entry = (u32)(uintptr_t)buf_ring_dequeue(fifo);
+	entry = (u32)(uintptr_t)buf_ring_dequeue_sc(fifo);
 	mtx_unlock(lock);
 	return entry;
 }
@@ -288,10 +289,10 @@ void cxio_hal_put_pdid(struct cxio_hal_resource *rscp, u32 pdid)
 
 void cxio_hal_destroy_resource(struct cxio_hal_resource *rscp)
 {
-	buf_ring_free(rscp->tpt_fifo);
-	buf_ring_free(rscp->cqid_fifo);
-	buf_ring_free(rscp->qpid_fifo);
-	buf_ring_free(rscp->pdid_fifo);
+	buf_ring_free(rscp->tpt_fifo, M_DEVBUF);
+	buf_ring_free(rscp->cqid_fifo, M_DEVBUF);
+	buf_ring_free(rscp->qpid_fifo, M_DEVBUF);
+	buf_ring_free(rscp->pdid_fifo, M_DEVBUF);
 	free(rscp, M_DEVBUF);
 }
 
diff --git a/sys/dev/mxge/if_mxge_var.h b/sys/dev/mxge/if_mxge_var.h
index e5d176d..a91b4d8 100644
--- a/sys/dev/mxge/if_mxge_var.h
+++ b/sys/dev/mxge/if_mxge_var.h
@@ -298,6 +298,8 @@ struct mxge_media_type
 /* implement our own memory barriers, since bus_space_barrier
    cannot handle write-combining regions */
 
+#if __FreeBSD_version < 800053
+
 #if defined (__GNUC__)
   #if #cpu(i386) || defined __i386 || defined i386 || defined __i386__ || #cpu(x86_64) || defined __x86_64__
     #define mb()  __asm__ __volatile__ ("sfence;": : :"memory")
@@ -312,6 +314,8 @@ struct mxge_media_type
   #error "unknown compiler"
 #endif
 
+#endif
+
 static inline void
 mxge_pio_copy(volatile void *to_v, void *from_v, size_t size)
 {
diff --git a/sys/dev/nxge/xge-osdep.h b/sys/dev/nxge/xge-osdep.h
index 15adfe7..e8f4aba 100644
--- a/sys/dev/nxge/xge-osdep.h
+++ b/sys/dev/nxge/xge-osdep.h
@@ -242,8 +242,12 @@ typedef xge_pci_info_t             *pci_cfg_h;
 	mtx_unlock_flags(lockp, flags);                                        \
 }
 
+#if __FreeBSD_version > 800053
 /* Write memory barrier */
+#define xge_os_wmb()		wmb()	
+#else
 #define xge_os_wmb()
+#endif
 
 /* Delay (in micro seconds) */
 #define xge_os_udelay(us)            DELAY(us)
diff --git a/sys/i386/include/atomic.h b/sys/i386/include/atomic.h
index 06216fb..bbf2655 100644
--- a/sys/i386/include/atomic.h
+++ b/sys/i386/include/atomic.h
@@ -32,6 +32,21 @@
 #error this file needs sys/cdefs.h as a prerequisite
 #endif
 
+
+#if defined(I686_CPU)
+#define mb()	__asm__ __volatile__ ("mfence;": : :"memory")
+#define wmb()	__asm__ __volatile__ ("sfence;": : :"memory")
+#define rmb()	__asm__ __volatile__ ("lfence;": : :"memory")
+#else
+/*
+ * do we need a serializing instruction?
+ */
+#define mb()
+#define wmb()
+#define rmb()
+#endif
+
+
 /*
  * Various simple operations on memory, each of which is atomic in the
  * presence of interrupts and multiple processors.
diff --git a/sys/ia64/include/atomic.h b/sys/ia64/include/atomic.h
index 631193f..fdfcb9e 100644
--- a/sys/ia64/include/atomic.h
+++ b/sys/ia64/include/atomic.h
@@ -29,6 +29,10 @@
 #ifndef _MACHINE_ATOMIC_H_
 #define	_MACHINE_ATOMIC_H_
 
+#define	mb()
+#define	wmb()
+#define	rmb()
+
 /*
  * Various simple arithmetic on memory which is atomic in the presence
  * of interrupts and SMP safe.
diff --git a/sys/kern/subr_bufring.c b/sys/kern/subr_bufring.c
new file mode 100644
index 0000000..63938ea
--- /dev/null
+++ b/sys/kern/subr_bufring.c
@@ -0,0 +1,68 @@
+/**************************************************************************
+ *
+ * Copyright (c) 2007,2008 Kip Macy kmacy@freebsd.org
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *
+ * 2. The name of Kip Macy nor the names of other
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ ***************************************************************************/
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/ktr.h>
+#include <sys/buf_ring.h>
+
+
+struct buf_ring *
+buf_ring_alloc(int count, struct malloc_type *type, int flags, struct mtx *lock)
+{
+	struct buf_ring *br;
+
+	KASSERT(powerof2(count), ("buf ring must be size power of 2"));
+	
+	br = malloc(sizeof(struct buf_ring) + count*sizeof(caddr_t),
+	    type, flags|M_ZERO);
+	if (br == NULL)
+		return (NULL);
+#ifdef DEBUG_BUFRING
+	br->br_lock = lock;
+#endif	
+	br->br_prod_size = br->br_cons_size = count;
+	br->br_prod_mask = br->br_cons_mask = count-1;
+	br->br_prod_head = br->br_cons_head = 0;
+	br->br_prod_tail = br->br_cons_tail = 0;
+		
+	return (br);
+}
+
+void
+buf_ring_free(struct buf_ring *br, struct malloc_type *type)
+{
+	free(br, type);
+}
diff --git a/sys/net/if.c b/sys/net/if.c
index 77d4ec3..358d8f2 100644
--- a/sys/net/if.c
+++ b/sys/net/if.c
@@ -119,10 +119,10 @@ static void	if_freemulti(struct ifmultiaddr *);
 static void	if_grow(void);
 static void	if_init(void *);
 static void	if_check(void *);
-static void	if_qflush(struct ifaltq *);
 static void	if_route(struct ifnet *, int flag, int fam);
 static int	if_setflag(struct ifnet *, int, int, int *, int);
 static void	if_slowtimo(void *);
+static int	if_transmit(struct ifnet *ifp, struct mbuf *m);
 static void	if_unroute(struct ifnet *, int flag, int fam);
 static void	link_rtrequest(int, struct rtentry *, struct rt_addrinfo *);
 static int	if_rtdel(struct radix_node *, void *);
@@ -507,6 +507,28 @@ if_free_type(struct ifnet *ifp, u_char type)
 	free(ifp, M_IFNET);
 };
 
+void
+ifq_attach(struct ifaltq *ifq, struct ifnet *ifp)
+{
+	
+	mtx_init(&ifq->ifq_mtx, ifp->if_xname, "if send queue", MTX_DEF);
+
+	if (ifq->ifq_maxlen == 0) 
+		ifq->ifq_maxlen = ifqmaxlen;
+
+	ifq->altq_type = 0;
+	ifq->altq_disc = NULL;
+	ifq->altq_flags &= ALTQF_CANTCHANGE;
+	ifq->altq_tbr  = NULL;
+	ifq->altq_ifp  = ifp;
+}
+
+void
+ifq_detach(struct ifaltq *ifq)
+{
+	mtx_destroy(&ifq->ifq_mtx);
+}
+
 /*
  * Perform generic interface initalization tasks and attach the interface
  * to the list of "active" interfaces.
@@ -547,7 +569,15 @@ if_attach(struct ifnet *ifp)
 	getmicrotime(&ifp->if_lastchange);
 	ifp->if_data.ifi_epoch = time_uptime;
 	ifp->if_data.ifi_datalen = sizeof(struct if_data);
+	KASSERT((ifp->if_transmit == NULL && ifp->if_qflush == NULL) ||
+	    (ifp->if_transmit != NULL && ifp->if_qflush != NULL),
+	    ("transmit and qflush must both either be set or both be NULL"));
 
+	if (ifp->if_transmit == NULL) {
+		ifp->if_transmit = if_transmit;
+		ifp->if_qflush = if_qflush;
+	}
+	
 #ifdef MAC
 	mac_init_ifnet(ifp);
 	mac_create_ifnet(ifp);
@@ -559,7 +589,7 @@ if_attach(struct ifnet *ifp)
 	make_dev_alias(ifdev_byindex(ifp->if_index), "%s%d",
 	    net_cdevsw.d_name, ifp->if_index);
 
-	mtx_init(&ifp->if_snd.ifq_mtx, ifp->if_xname, "if send queue", MTX_DEF);
+	ifq_attach(&ifp->if_snd, ifp);
 
 	/*
 	 * create a Link Level name for this device
@@ -596,11 +626,15 @@ if_attach(struct ifnet *ifp)
 	ifa->ifa_refcnt = 1;
 	TAILQ_INSERT_HEAD(&ifp->if_addrhead, ifa, ifa_link);
 	ifp->if_broadcastaddr = NULL; /* reliably crash if used uninitialized */
+<<<<<<< HEAD
 	ifp->if_snd.altq_type = 0;
 	ifp->if_snd.altq_disc = NULL;
 	ifp->if_snd.altq_flags &= ALTQF_CANTCHANGE;
 	ifp->if_snd.altq_tbr  = NULL;
 	ifp->if_snd.altq_ifp  = ifp;
+=======
+
+>>>>>>> 1be87f0... - bump __FreeBSD version to reflect added buf_ring, memory barriers,
 
 	IFNET_WLOCK();
 	TAILQ_INSERT_TAIL(&ifnet, ifp, if_link);
@@ -849,7 +883,7 @@ if_detach(struct ifnet *ifp)
 	KNOTE_UNLOCKED(&ifp->if_klist, NOTE_EXIT);
 	knlist_clear(&ifp->if_klist, 0);
 	knlist_destroy(&ifp->if_klist);
-	mtx_destroy(&ifp->if_snd.ifq_mtx);
+	ifq_detach(&ifp->if_snd);
 	IF_AFDATA_DESTROY(ifp);
 	splx(s);
 }
@@ -1440,7 +1474,8 @@ if_unroute(struct ifnet *ifp, int flag, int fam)
 	TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link)
 		if (fam == PF_UNSPEC || (fam == ifa->ifa_addr->sa_family))
 			pfctlinput(PRC_IFDOWN, ifa->ifa_addr);
-	if_qflush(&ifp->if_snd);
+	ifp->if_qflush(ifp);
+
 #ifdef DEV_CARP
 	if (ifp->if_carp)
 		carp_carpdev_state(ifp->if_carp);
@@ -1567,11 +1602,13 @@ if_up(struct ifnet *ifp)
 /*
  * Flush an interface queue.
  */
-static void
-if_qflush(struct ifaltq *ifq)
+void
+if_qflush(struct ifnet *ifp)
 {
 	struct mbuf *m, *n;
-
+	struct ifaltq *ifq;
+	
+	ifq = &ifp->if_snd;
 	IFQ_LOCK(ifq);
 #ifdef ALTQ
 	if (ALTQ_IS_ENABLED(ifq))
@@ -2862,6 +2899,19 @@ if_start_deferred(void *context, int pending)
 	(ifp->if_start)(ifp);
 }
 
+/*
+ * Backwards compatibility interface for drivers 
+ * that have not implemented it
+ */
+static int
+if_transmit(struct ifnet *ifp, struct mbuf *m)
+{
+	int error;
+
+	IFQ_HANDOFF(ifp, m, error);
+	return (error);
+}
+
 int
 if_handoff(struct ifqueue *ifq, struct mbuf *m, struct ifnet *ifp, int adjust)
 {
diff --git a/sys/net/if_ethersubr.c b/sys/net/if_ethersubr.c
index fa43ae0..24acac8 100644
--- a/sys/net/if_ethersubr.c
+++ b/sys/net/if_ethersubr.c
@@ -383,7 +383,6 @@ bad:			if (m != NULL)
 int
 ether_output_frame(struct ifnet *ifp, struct mbuf *m)
 {
-	int error;
 #if defined(INET) || defined(INET6)
 	struct ip_fw *rule = ip_dn_claim_rule(m);
 
@@ -402,8 +401,7 @@ ether_output_frame(struct ifnet *ifp, struct mbuf *m)
 	 * Queue message on interface, update output statistics if
 	 * successful, and start output if interface not yet active.
 	 */
-	IFQ_HANDOFF(ifp, m, error);
-	return (error);
+	return ((ifp->if_transmit)(ifp, m));
 }
 
 #if defined(INET) || defined(INET6)
diff --git a/sys/net/if_fddisubr.c b/sys/net/if_fddisubr.c
index a2fd00b..61f8f56 100644
--- a/sys/net/if_fddisubr.c
+++ b/sys/net/if_fddisubr.c
@@ -336,7 +336,7 @@ fddi_output(ifp, m, dst, rt0)
 		}
 	}
 
-	IFQ_HANDOFF(ifp, m, error);
+	error = (ifp->if_transmit)(ifp, m);
 	if (error)
 		ifp->if_oerrors++;
 
diff --git a/sys/net/if_fwsubr.c b/sys/net/if_fwsubr.c
index b1c68d2..d53b318 100644
--- a/sys/net/if_fwsubr.c
+++ b/sys/net/if_fwsubr.c
@@ -249,7 +249,7 @@ firewire_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst,
 		 */
 		enc->ul[0] = htonl(enc->ul[0]);
 
-		IFQ_HANDOFF(ifp, m, error);
+		error = (ifp->if_transmit)(ifp, m);
 		return (error);
 	} else {
 		/*
@@ -309,7 +309,7 @@ firewire_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst,
 			enc->ul[0] = htonl(enc->ul[0]);
 			enc->ul[1] = htonl(enc->ul[1]);
 
-			IFQ_HANDOFF(ifp, m, error);
+			error = (ifp->if_transmit)(ifp, m);
 			if (error) {
 				if (mtail)
 					m_freem(mtail);
diff --git a/sys/net/if_lagg.c b/sys/net/if_lagg.c
index 42ca7c6..be0cf05 100644
--- a/sys/net/if_lagg.c
+++ b/sys/net/if_lagg.c
@@ -1374,12 +1374,8 @@ out:
 int
 lagg_enqueue(struct ifnet *ifp, struct mbuf *m)
 {
-	int error = 0;
 
-	IFQ_HANDOFF(ifp, m, error);
-	if (error)
-		ifp->if_oerrors++;
-	return (error);
+	return (ifp->if_transmit)(ifp, m);
 }
 
 /*
diff --git a/sys/net/if_tun.c b/sys/net/if_tun.c
index 0852ad5..82723f1 100644
--- a/sys/net/if_tun.c
+++ b/sys/net/if_tun.c
@@ -647,7 +647,7 @@ tunoutput(
 		}
 	}
 
-	IFQ_HANDOFF(ifp, m0, error);
+	error = (ifp->if_transmit)(ifp, m0);
 	if (error) {
 		ifp->if_collisions++;
 		return (ENOBUFS);
diff --git a/sys/net/if_var.h b/sys/net/if_var.h
index 92c8e80..d4f3823 100644
--- a/sys/net/if_var.h
+++ b/sys/net/if_var.h
@@ -77,6 +77,7 @@ struct  ifvlantrunk;
 #ifdef _KERNEL
 #include <sys/mbuf.h>
 #include <sys/eventhandler.h>
+#include <sys/buf_ring.h>
 #endif /* _KERNEL */
 #include <sys/lock.h>		/* XXX */
 #include <sys/mutex.h>		/* XXX */
@@ -186,7 +187,11 @@ struct ifnet {
 					/* protected by if_addr_mtx */
 	void	*if_pf_kif;
 	void	*if_lagg;		/* lagg glue */
-	void	*if_pspare[10];		/* multiq/TOE 3; vimage 3; general use 4 */
+	void	*if_pspare[8];		/* multiq/TOE 3; vimage 3; general use 4 */
+	void	(*if_qflush)	/* flush any queues */
+		(struct ifnet *);
+	int	(*if_transmit)	/* initiate output routine */
+		(struct ifnet *, struct mbuf *);
 	int	if_ispare[2];		/* general use 2 */
 };
 
@@ -536,6 +541,119 @@ do {									\
 	IFQ_PURGE(ifq);							\
 } while (0)
 
+#ifdef _KERNEL
+static __inline void
+drbr_stats_update(struct ifnet *ifp, int len, int mflags)
+{
+#ifndef NO_SLOW_STATS
+	ifp->if_obytes += len;
+	if (mflags & M_MCAST)
+		ifp->if_omcasts++;
+#endif
+}
+
+static __inline int
+drbr_enqueue(struct ifnet *ifp, struct buf_ring *br, struct mbuf *m)
+{	
+	int error = 0;
+	int len = m->m_pkthdr.len;
+	int mflags = m->m_flags;
+
+#ifdef ALTQ
+	if (ALTQ_IS_ENABLED(&ifp->if_snd)) {
+		IFQ_ENQUEUE(&ifp->if_snd, m, error);
+		return (error);
+	}
+#endif
+	if ((error = buf_ring_enqueue_bytes(br, m, len)) == ENOBUFS) {
+		br->br_drops++;
+		m_freem(m);
+	} else
+		drbr_stats_update(ifp, len, mflags);
+	
+	return (error);
+}
+
+static __inline void
+drbr_flush(struct ifnet *ifp, struct buf_ring *br)
+{
+	struct mbuf *m;
+
+#ifdef ALTQ
+	if (ifp != NULL && ALTQ_IS_ENABLED(&ifp->if_snd)) {
+		while (!IFQ_IS_EMPTY(&ifp->if_snd)) {
+			IFQ_DRV_DEQUEUE(&ifp->if_snd, m);
+			m_freem(m);
+		}
+	}
+#endif	
+	while ((m = buf_ring_dequeue_sc(br)) != NULL)
+		m_freem(m);
+}
+
+static __inline void
+drbr_free(struct buf_ring *br, struct malloc_type *type)
+{
+
+	drbr_flush(NULL, br);
+	buf_ring_free(br, type);
+}
+
+static __inline struct mbuf *
+drbr_dequeue(struct ifnet *ifp, struct buf_ring *br)
+{
+#ifdef ALTQ
+	struct mbuf *m;
+
+	if (ALTQ_IS_ENABLED(&ifp->if_snd)) {	
+		IFQ_DRV_DEQUEUE(&ifp->if_snd, m);
+		return (m);
+	}
+#endif
+	return (buf_ring_dequeue_sc(br));
+}
+
+static __inline struct mbuf *
+drbr_dequeue_cond(struct ifnet *ifp, struct buf_ring *br,
+    int (*func) (struct mbuf *, void *), void *arg) 
+{
+	struct mbuf *m;
+#ifdef ALTQ
+	/*
+	 * XXX need to evaluate / requeue 
+	 */
+	if (ALTQ_IS_ENABLED(&ifp->if_snd)) {	
+		IFQ_DRV_DEQUEUE(&ifp->if_snd, m);
+		return (m);
+	}
+#endif
+	m = buf_ring_peek(br);
+	if (m == NULL || func(m, arg) == 0)
+		return (NULL);
+
+	return (buf_ring_dequeue_sc(br));
+}
+
+static __inline int
+drbr_empty(struct ifnet *ifp, struct buf_ring *br)
+{
+#ifdef ALTQ
+	if (ALTQ_IS_ENABLED(&ifp->if_snd))
+		return (IFQ_DRV_IS_EMPTY(&ifp->if_snd));
+#endif
+	return (buf_ring_empty(br));
+}
+
+static __inline int
+drbr_inuse(struct ifnet *ifp, struct buf_ring *br)
+{
+#ifdef ALTQ
+	if (ALTQ_IS_ENABLED(&ifp->if_snd))
+		return (ifp->if_snd.ifq_len);
+#endif
+	return (buf_ring_count(br));
+}
+#endif
 /*
  * 72 was chosen below because it is the size of a TCP/IP
  * header (40) + the minimum mss (32).
@@ -677,6 +795,7 @@ void	if_free_type(struct ifnet *, u_char);
 void	if_initname(struct ifnet *, const char *, int);
 void	if_link_state_change(struct ifnet *, int);
 int	if_printf(struct ifnet *, const char *, ...) __printflike(2, 3);
+void	if_qflush(struct ifnet *);
 int	if_setlladdr(struct ifnet *, const u_char *, int);
 void	if_up(struct ifnet *);
 /*void	ifinit(void);*/ /* declared in systm.h for main() */
@@ -684,6 +803,9 @@ int	ifioctl(struct socket *, u_long, caddr_t, struct thread *);
 int	ifpromisc(struct ifnet *, int);
 struct	ifnet *ifunit(const char *);
 
+void	ifq_attach(struct ifaltq *, struct ifnet *ifp);
+void	ifq_detach(struct ifaltq *);
+
 struct	ifaddr *ifa_ifwithaddr(struct sockaddr *);
 struct	ifaddr *ifa_ifwithbroadaddr(struct sockaddr *);
 struct	ifaddr *ifa_ifwithdstaddr(struct sockaddr *);
diff --git a/sys/net/if_vlan.c b/sys/net/if_vlan.c
index c1881da..b1f7dee 100644
--- a/sys/net/if_vlan.c
+++ b/sys/net/if_vlan.c
@@ -871,7 +871,7 @@ vlan_start(struct ifnet *ifp)
 		 * Send it, precisely as ether_output() would have.
 		 * We are already running at splimp.
 		 */
-		IFQ_HANDOFF(p, m, error);
+		error = (p->if_transmit)(p, m);
 		if (!error)
 			ifp->if_opackets++;
 		else
diff --git a/sys/net80211/ieee80211_output.c b/sys/net80211/ieee80211_output.c
index c82de48..ee96325 100644
--- a/sys/net80211/ieee80211_output.c
+++ b/sys/net80211/ieee80211_output.c
@@ -233,7 +233,22 @@ ieee80211_raw_xmit(struct ieee80211_node *ni, struct mbuf *m,
 	if_start(ifp);
 	ifp->if_opackets++;
 
+#error "r185164 cherry-pick conflicted here, please fix."
+<<<<<<< HEAD
 	return 0;
+=======
+		/* XXX defer if_start calls? */
+		error = (parent->if_transmit)(parent, m);
+		if (error != 0) {
+			/* NB: IFQ_HANDOFF reclaims mbuf */
+			ieee80211_free_node(ni);
+		} else {
+			ifp->if_opackets++;
+		}
+		ic->ic_lastdata = ticks;
+	}
+#undef IS_DWDS
+>>>>>>> eee4f1f... convert calls to IFQ_HANDOFF to if_transmit
 }
 
 /*
diff --git a/sys/netgraph/ng_iface.c b/sys/netgraph/ng_iface.c
index 5b76c29..6358d89 100644
--- a/sys/netgraph/ng_iface.c
+++ b/sys/netgraph/ng_iface.c
@@ -401,7 +401,7 @@ ng_iface_output(struct ifnet *ifp, struct mbuf *m,
 			return (ENOBUFS);
 		}
 		*(sa_family_t *)m->m_data = dst->sa_family;
-		IFQ_HANDOFF(ifp, m, error);
+		error = (ifp->if_transmit)(ifp, m);
 	} else
 		error = ng_iface_send(ifp, m, dst->sa_family);
 
diff --git a/sys/powerpc/include/atomic.h b/sys/powerpc/include/atomic.h
index d515a6a..4ac9f0c 100644
--- a/sys/powerpc/include/atomic.h
+++ b/sys/powerpc/include/atomic.h
@@ -39,6 +39,10 @@
 #define	__ATOMIC_BARRIER					\
     __asm __volatile("sync" : : : "memory")
 
+#define mb()	__ATOMIC_BARRIER
+#define	wmb()	mb()
+#define	rmb()	mb()
+
 /*
  * atomic_add(p, v)
  * { *p += v; }
diff --git a/sys/sparc64/include/atomic.h b/sys/sparc64/include/atomic.h
index fe36791..d663fbc 100644
--- a/sys/sparc64/include/atomic.h
+++ b/sys/sparc64/include/atomic.h
@@ -40,6 +40,10 @@
 #define	__ASI_ATOMIC	ASI_P
 #endif
 
+#define mb()	__asm__ __volatile__ ("membar #MemIssue": : :"memory")
+#define wmb()	mb()
+#define rmb()	mb()
+
 /*
  * Various simple arithmetic on memory which is atomic in the presence
  * of interrupts and multiple processors.  See atomic(9) for details.
diff --git a/sys/sun4v/include/atomic.h b/sys/sun4v/include/atomic.h
index fe36791..c5005fa 100644
--- a/sys/sun4v/include/atomic.h
+++ b/sys/sun4v/include/atomic.h
@@ -33,6 +33,10 @@
 
 #include <machine/cpufunc.h>
 
+#define mb()	__asm__ __volatile__ ("membar #MemIssue": : :"memory")
+#define wmb()	mb()
+#define rmb()	mb()
+
 /* Userland needs different ASI's. */
 #ifdef _KERNEL
 #define	__ASI_ATOMIC	ASI_N
diff --git a/sys/sys/buf_ring.h b/sys/sys/buf_ring.h
new file mode 100644
index 0000000..efa667d
--- /dev/null
+++ b/sys/sys/buf_ring.h
@@ -0,0 +1,279 @@
+/**************************************************************************
+ *
+ * Copyright (c) 2007-2009 Kip Macy kmacy@freebsd.org
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *
+ * 2. The name of Kip Macy nor the names of other
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ *
+ ***************************************************************************/
+
+#ifndef	_SYS_BUF_RING_H_
+#define	_SYS_BUF_RING_H_
+
+#include <machine/cpu.h>
+
+#if defined(INVARIANTS) && !defined(DEBUG_BUFRING)
+#define DEBUG_BUFRING 1
+#endif
+
+#ifdef DEBUG_BUFRING
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#endif
+
+struct buf_ring {
+	volatile uint32_t	br_prod_head;
+	volatile uint32_t	br_prod_tail;	
+	int              	br_prod_size;
+	int              	br_prod_mask;
+	uint64_t		br_drops;
+	uint64_t		br_prod_bufs;
+	uint64_t		br_prod_bytes;
+	/*
+	 * Pad out to next L2 cache line
+	 */
+	uint64_t	  	_pad0[11];
+
+	volatile uint32_t	br_cons_head;
+	volatile uint32_t	br_cons_tail;
+	int		 	br_cons_size;
+	int              	br_cons_mask;
+	
+	/*
+	 * Pad out to next L2 cache line
+	 */
+	uint64_t	  	_pad1[14];
+#ifdef DEBUG_BUFRING
+	struct mtx		*br_lock;
+#endif	
+	void			*br_ring[0];
+};
+
+/*
+ * multi-producer safe lock-free ring buffer enqueue
+ *
+ */
+static __inline int
+buf_ring_enqueue_bytes(struct buf_ring *br, void *buf, int nbytes)
+{
+	uint32_t prod_head, prod_next;
+	uint32_t cons_tail;
+	int success;
+#ifdef DEBUG_BUFRING
+	int i;
+	for (i = br->br_cons_head; i != br->br_prod_head;
+	     i = ((i + 1) & br->br_cons_mask))
+		if(br->br_ring[i] == buf)
+			panic("buf=%p already enqueue at %d prod=%d cons=%d",
+			    buf, i, br->br_prod_tail, br->br_cons_tail);
+#endif	
+	critical_enter();
+	do {
+		prod_head = br->br_prod_head;
+		cons_tail = br->br_cons_tail;
+
+		prod_next = (prod_head + 1) & br->br_prod_mask;
+		
+		if (prod_next == cons_tail) {
+			critical_exit();
+			return (ENOBUFS);
+		}
+		
+		success = atomic_cmpset_int(&br->br_prod_head, prod_head,
+		    prod_next);
+	} while (success == 0);
+#ifdef DEBUG_BUFRING
+	if (br->br_ring[prod_head] != NULL)
+		panic("dangling value in enqueue");
+#endif	
+	br->br_ring[prod_head] = buf;
+	wmb();
+
+	/*
+	 * If there are other enqueues in progress
+	 * that preceded us, we need to wait for them
+	 * to complete 
+	 */   
+	while (br->br_prod_tail != prod_head)
+		cpu_spinwait();
+	br->br_prod_bufs++;
+	br->br_prod_bytes += nbytes;
+	br->br_prod_tail = prod_next;
+	critical_exit();
+	return (0);
+}
+
+static __inline int
+buf_ring_enqueue(struct buf_ring *br, void *buf)
+{
+
+	return (buf_ring_enqueue_bytes(br, buf, 0));
+}
+
+/*
+ * multi-consumer safe dequeue 
+ *
+ */
+static __inline void *
+buf_ring_dequeue_mc(struct buf_ring *br)
+{
+	uint32_t cons_head, cons_next;
+	uint32_t prod_tail;
+	void *buf;
+	int success;
+
+	critical_enter();
+	do {
+		cons_head = br->br_cons_head;
+		prod_tail = br->br_prod_tail;
+
+		cons_next = (cons_head + 1) & br->br_cons_mask;
+		
+		if (cons_head == prod_tail) {
+			critical_exit();
+			return (NULL);
+		}
+		
+		success = atomic_cmpset_int(&br->br_cons_head, cons_head,
+		    cons_next);
+	} while (success == 0);		
+
+	buf = br->br_ring[cons_head];
+#ifdef DEBUG_BUFRING
+	br->br_ring[cons_head] = NULL;
+#endif
+	rmb();
+	
+	/*
+	 * If there are other dequeues in progress
+	 * that preceeded us, we need to wait for them
+	 * that preceded us, we need to wait for them
+	 */   
+	while (br->br_cons_tail != cons_head)
+		cpu_spinwait();
+
+	br->br_cons_tail = cons_next;
+	critical_exit();
+
+	return (buf);
+}
+
+/*
+ * single-consumer dequeue 
+ * use where dequeue is protected by a lock
+ * e.g. a network driver's tx queue lock
+ */
+static __inline void *
+buf_ring_dequeue_sc(struct buf_ring *br)
+{
+	uint32_t cons_head, cons_next, cons_next_next;
+	uint32_t prod_tail;
+	void *buf;
+	
+	cons_head = br->br_cons_head;
+	prod_tail = br->br_prod_tail;
+	
+	cons_next = (cons_head + 1) & br->br_cons_mask;
+	cons_next_next = (cons_head + 2) & br->br_cons_mask;
+	
+	if (cons_head == prod_tail) 
+		return (NULL);
+
+#ifdef PREFETCH_DEFINED	
+	if (cons_next != prod_tail) {		
+		prefetch(br->br_ring[cons_next]);
+		if (cons_next_next != prod_tail) 
+			prefetch(br->br_ring[cons_next_next]);
+	}
+#endif
+	br->br_cons_head = cons_next;
+	buf = br->br_ring[cons_head];
+
+#ifdef DEBUG_BUFRING
+	br->br_ring[cons_head] = NULL;
+	if (!mtx_owned(br->br_lock))
+		panic("lock not held on single consumer dequeue");
+	if (br->br_cons_tail != cons_head)
+		panic("inconsistent list cons_tail=%d cons_head=%d",
+		    br->br_cons_tail, cons_head);
+#endif
+	br->br_cons_tail = cons_next;
+	return (buf);
+}
+
+/*
+ * return a pointer to the first entry in the ring
+ * without modifying it, or NULL if the ring is empty
+ * race-prone if not protected by a lock
+ */
+static __inline void *
+buf_ring_peek(struct buf_ring *br)
+{
+
+#ifdef DEBUG_BUFRING
+	if ((br->br_lock != NULL) && !mtx_owned(br->br_lock))
+		panic("lock not held on single consumer dequeue");
+#endif	
+	/*
+	 * I believe it is safe to not have a memory barrier
+	 * here because we control cons and tail is worst case
+	 * a lagging indicator so, worst case, we might
+	 * return NULL immediately after a buffer has been enqueued
+	 */
+	if (br->br_cons_head == br->br_prod_tail)
+		return (NULL);
+	
+	return (br->br_ring[br->br_cons_head]);
+}
+
+static __inline int
+buf_ring_full(struct buf_ring *br)
+{
+
+	return (((br->br_prod_head + 1) & br->br_prod_mask) == br->br_cons_tail);
+}
+
+static __inline int
+buf_ring_empty(struct buf_ring *br)
+{
+
+	return (br->br_cons_head == br->br_prod_tail);
+}
+
+static __inline int
+buf_ring_count(struct buf_ring *br)
+{
+
+	return ((br->br_prod_size + br->br_prod_tail - br->br_cons_tail)
+	    & br->br_prod_mask);
+}
+
+struct buf_ring *buf_ring_alloc(int count, struct malloc_type *type, int flags,
+    struct mtx *);
+void buf_ring_free(struct buf_ring *br, struct malloc_type *type);
+
+
+
+#endif

[-- Attachment #3 --]
From 6ef23c48a71766641f47716a9e3dc275c2315a2e Mon Sep 17 00:00:00 2001
From: Arnaud Lacombe <lacombar@gmail.com>
Date: Mon, 12 Sep 2011 22:32:45 -0400
Subject: [PATCH] IGB_MULTIQUEUE

---
 sys/dev/e1000/if_igb.c |   34 ++++++++++++++++++----------------
 sys/dev/e1000/if_igb.h |    4 ++--
 2 files changed, 20 insertions(+), 18 deletions(-)

diff --git a/sys/dev/e1000/if_igb.c b/sys/dev/e1000/if_igb.c
index 4944e56..de1fae9 100644
--- a/sys/dev/e1000/if_igb.c
+++ b/sys/dev/e1000/if_igb.c
@@ -42,7 +42,7 @@
 
 #include <sys/param.h>
 #include <sys/systm.h>
-#if __FreeBSD_version >= 800000
+#ifdef IGB_MULTIQUEUE
 #include <sys/buf_ring.h>
 #endif
 #include <sys/bus.h>
@@ -172,7 +172,7 @@ static int	igb_detach(device_t);
 static int	igb_shutdown(device_t);
 static int	igb_suspend(device_t);
 static int	igb_resume(device_t);
-#if __FreeBSD_version >= 800000
+#ifdef IGB_MULTIQUEUE
 static int	igb_mq_start(struct ifnet *, struct mbuf *);
 static int	igb_mq_start_locked(struct ifnet *,
 		    struct tx_ring *, struct mbuf *);
@@ -800,7 +800,7 @@ igb_resume(device_t dev)
 {
 	struct adapter *adapter = device_get_softc(dev);
 	struct ifnet *ifp = adapter->ifp;
-#if __FreeBSD_version >= 800000
+#ifdef IGB_MULTIQUEUE
 	struct tx_ring *txr = adapter->tx_rings;
 #endif
 
@@ -810,7 +810,7 @@ igb_resume(device_t dev)
 
 	if ((ifp->if_flags & IFF_UP) &&
 	    (ifp->if_drv_flags & IFF_DRV_RUNNING)) {
-#if __FreeBSD_version < 800000
+#ifndef IGB_MULTIQUEUE
 		igb_start(ifp);
 #else
 		for (int i = 0; i < adapter->num_queues; i++, txr++) {
@@ -828,7 +828,7 @@ igb_resume(device_t dev)
 }
 
 
-#if __FreeBSD_version < 800000
+#ifndef IGB_MULTIQUEUE
 /*********************************************************************
  *  Transmit entry point
  *
@@ -905,7 +905,7 @@ igb_start(struct ifnet *ifp)
 	return;
 }
 
-#else /* __FreeBSD_version >= 800000 */
+#else /* defined(IGB_MULTIQUEUE) */
 /*
 ** Multiqueue Transmit driver
 **
@@ -918,9 +918,11 @@ igb_mq_start(struct ifnet *ifp, struct mbuf *m)
 	struct tx_ring		*txr;
 	int 			i = 0, err = 0;
 
+#if __FreeBSD_version >= 800000
 	/* Which queue to use */
 	if ((m->m_flags & M_FLOWID) != 0)
 		i = m->m_pkthdr.flowid % adapter->num_queues;
+#endif
 
 	txr = &adapter->tx_rings[i];
 	que = &adapter->queues[i];
@@ -1024,7 +1026,7 @@ igb_qflush(struct ifnet *ifp)
 	}
 	if_qflush(ifp);
 }
-#endif /* __FreeBSD_version < 800000 */
+#endif /* !defined(IGB_MULTIQUEUE) */
 
 /*********************************************************************
  *  Ioctl entry point
@@ -1358,7 +1360,7 @@ igb_handle_que(void *context, int pending)
 		IGB_TX_LOCK(txr);
 		if (igb_txeof(txr))
 			more = TRUE;
-#if __FreeBSD_version >= 800000
+#ifdef IGB_MULTIQUEUE
 		if (!drbr_empty(ifp, txr->br))
 			igb_mq_start_locked(ifp, txr, NULL);
 #else
@@ -1482,7 +1484,7 @@ igb_poll(struct ifnet *ifp, enum poll_cmd cmd, int count)
 	do {
 		more = igb_txeof(txr);
 	} while (loop-- && more);
-#if __FreeBSD_version >= 800000
+#ifdef IGB_MULTIQUEUE
 	if (!drbr_empty(ifp, txr->br))
 		igb_mq_start_locked(ifp, txr, NULL);
 #else
@@ -2235,7 +2237,7 @@ igb_allocate_legacy(struct adapter *adapter)
 {
 	device_t		dev = adapter->dev;
 	struct igb_queue	*que = adapter->queues;
-#if __FreeBSD_version >= 800000
+#ifdef IGB_MULTIQUEUE
 	struct tx_ring		*txr = adapter->tx_rings;
 #endif
 	int			error, rid = 0;
@@ -2256,7 +2258,7 @@ igb_allocate_legacy(struct adapter *adapter)
 		return (ENXIO);
 	}
 
-#if __FreeBSD_version >= 800000
+#ifdef IGB_MULTIQUEUE
 	TASK_INIT(&txr->txq_task, 0, igb_deferred_mq_start, txr);
 #endif
 
@@ -2330,7 +2332,7 @@ igb_allocate_msix(struct adapter *adapter)
 		*/
 		if (adapter->num_queues > 1)
 			bus_bind_intr(dev, que->res, i);
-#if __FreeBSD_version >= 800000
+#ifdef IGB_MULTIQUEUE
 		TASK_INIT(&que->txr->txq_task, 0, igb_deferred_mq_start,
 		    que->txr);
 #endif
@@ -2832,7 +2834,7 @@ igb_setup_interface(device_t dev, struct adapter *adapter)
 	ifp->if_softc = adapter;
 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
 	ifp->if_ioctl = igb_ioctl;
-#if __FreeBSD_version >= 800000
+#ifdef IGB_MULTIQUEUE
 	ifp->if_transmit = igb_mq_start;
 	ifp->if_qflush = igb_qflush;
 #else
@@ -3076,7 +3078,7 @@ igb_allocate_queues(struct adapter *adapter)
 			error = ENOMEM;
 			goto err_tx_desc;
         	}
-#if __FreeBSD_version >= 800000
+#ifdef IGB_MULTIQUEUE
 		/* Allocate a buf ring */
 		txr->br = buf_ring_alloc(IGB_BR_SIZE, M_DEVBUF,
 		    M_WAITOK, &txr->tx_mtx);
@@ -3137,7 +3139,7 @@ err_tx_desc:
 		igb_dma_free(adapter, &txr->txdma);
 	free(adapter->rx_rings, M_DEVBUF);
 rx_fail:
-#if __FreeBSD_version >= 800000
+#ifdef IGB_MULTIQUEUE
 	buf_ring_free(txr->br, M_DEVBUF);
 #endif
 	free(adapter->tx_rings, M_DEVBUF);
@@ -3381,7 +3383,7 @@ igb_free_transmit_buffers(struct tx_ring *txr)
 			tx_buffer->map = NULL;
 		}
 	}
-#if __FreeBSD_version >= 800000
+#ifdef IGB_MULTIQUEUE
 	if (txr->br != NULL)
 		buf_ring_free(txr->br, M_DEVBUF);
 #endif
diff --git a/sys/dev/e1000/if_igb.h b/sys/dev/e1000/if_igb.h
index 80abf6e..85dbcef 100644
--- a/sys/dev/e1000/if_igb.h
+++ b/sys/dev/e1000/if_igb.h
@@ -293,7 +293,7 @@ struct tx_ring {
 	u32			next_to_clean;
 	volatile u16		tx_avail;
 	struct igb_tx_buffer	*tx_buffers;
-#if __FreeBSD_version >= 800000
+#ifdef IGB_MULTIQUEUE
 	struct buf_ring		*br;
 #endif
 	bus_dma_tag_t		txtag;
@@ -527,7 +527,7 @@ igb_rx_unrefreshed(struct rx_ring *rxr)
 	cur |= new;				\
 }
 
-#if __FreeBSD_version >= 800000 && __FreeBSD_version < 800504
+#if defined(IGB_MULTIQUEUE) && __FreeBSD_version < 800504
 static __inline int
 drbr_needs_enqueue(struct ifnet *ifp, struct buf_ring *br)
 {
-- 
1.7.6.153.g78432

