Date: Tue, 5 Feb 2013 06:49:37 -0500 From: Randall Stewart <rrs@lakerest.net> To: John Baldwin <jhb@FreeBSD.org> Cc: freebsd-net <freebsd-net@FreeBSD.org>, Robert Watson <rwatson@FreeBSD.org>, Kip Macy <kmacy@FreeBSD.org>, jlv@FreeBSD.org Subject: Re: Driver patch to look at... Message-ID: <39571D84-A8C0-46A4-8EFA-CF74D862EAAE@lakerest.net> In-Reply-To: <45AD1046-A630-4C96-B4D2-B8A7D6A6DC45@lakerest.net> References: <D3AA078A-CD19-4228-A019-BE9C985895E2@lakerest.net> <201302041524.58699.jhb@freebsd.org> <45AD1046-A630-4C96-B4D2-B8A7D6A6DC45@lakerest.net>
next in thread | previous in thread | raw e-mail | index | archive | help
--Apple-Mail=_E172FC28-2504-451F-A867-0842517F6CD4
Content-Transfer-Encoding: quoted-printable
Content-Type: text/plain;
charset=windows-1252
John:
Here is an updated patch, per your suggestions. Note that I also
expanded and the only driver that uses these methods I did not touch
is the cxgb, but thats because I am not really sure it has the problem=85 =
it
does not quite enqueue the same way (it appears) that the other drivers =
do ;-)
R
--Apple-Mail=_E172FC28-2504-451F-A867-0842517F6CD4
Content-Disposition: attachment;
filename=driver_patch.txt
Content-Type: text/plain;
x-unix-mode=0644;
name="driver_patch.txt"
Content-Transfer-Encoding: quoted-printable
Index: dev/e1000/if_em.c
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D
--- dev/e1000/if_em.c (revision 246323)
+++ dev/e1000/if_em.c (working copy)
@@ -894,7 +894,7 @@ static int
em_mq_start_locked(struct ifnet *ifp, struct tx_ring *txr, struct mbuf =
*m)
{
struct adapter *adapter =3D txr->adapter;
- struct mbuf *next;
+ struct mbuf *next, *snext;
int err =3D 0, enq =3D 0;
=20
if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=3D=
@@ -905,22 +905,25 @@ em_mq_start_locked(struct ifnet *ifp, struct tx_ri
}
=20
enq =3D 0;
- if (m =3D=3D NULL) {
- next =3D drbr_dequeue(ifp, txr->br);
- } else if (drbr_needs_enqueue(ifp, txr->br)) {
- if ((err =3D drbr_enqueue(ifp, txr->br, m)) !=3D 0)
+ if (m) {
+ err =3D drbr_enqueue(ifp, txr->br, m);
+ if (err) {
return (err);
- next =3D drbr_dequeue(ifp, txr->br);
- } else
- next =3D m;
+ }
+ }=20
=20
/* Process the queue */
- while (next !=3D NULL) {
+ while ((next =3D drbr_peek(ifp, txr->br)) !=3D NULL) {
+ snext =3D next;
if ((err =3D em_xmit(txr, &next)) !=3D 0) {
- if (next !=3D NULL)
- err =3D drbr_enqueue(ifp, txr->br, =
next);
- break;
+ if (next =3D=3D NULL) {
+ drbr_advance(ifp, txr->br);
+ } else {
+ drbr_putback(ifp, txr->br, next, snext);
+ }
+ break;
}
+ drbr_advance(ifp, txr->br);
enq++;
ifp->if_obytes +=3D next->m_pkthdr.len;
if (next->m_flags & M_MCAST)
@@ -928,7 +931,6 @@ em_mq_start_locked(struct ifnet *ifp, struct tx_ri
ETHER_BPF_MTAP(ifp, next);
if ((ifp->if_drv_flags & IFF_DRV_RUNNING) =3D=3D 0)
break;
- next =3D drbr_dequeue(ifp, txr->br);
}
=20
if (enq > 0) {
Index: dev/e1000/if_igb.c
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D
--- dev/e1000/if_igb.c (revision 246323)
+++ dev/e1000/if_igb.c (working copy)
@@ -981,7 +981,7 @@ static int
igb_mq_start_locked(struct ifnet *ifp, struct tx_ring *txr)
{
struct adapter *adapter =3D txr->adapter;
- struct mbuf *next;
+ struct mbuf *next, *snext;
int err =3D 0, enq;
=20
IGB_TX_LOCK_ASSERT(txr);
@@ -994,12 +994,23 @@ igb_mq_start_locked(struct ifnet *ifp, struct tx_r
enq =3D 0;
=20
/* Process the queue */
- while ((next =3D drbr_dequeue(ifp, txr->br)) !=3D NULL) {
+ while ((next =3D drbr_peek(ifp, txr->br)) !=3D NULL) {
+ snext =3D next;
if ((err =3D igb_xmit(txr, &next)) !=3D 0) {
- if (next !=3D NULL)
- err =3D drbr_enqueue(ifp, txr->br, =
next);
+ if (next =3D=3D NULL) {
+ /* It was freed, move forward */
+ drbr_advance(ifp, txr->br);
+ } else {
+ /*=20
+ * Still have one left, it may not be
+ * the same since the transmit function
+ * may have changed it.
+ */
+ drbr_putback(ifp, txr->br, next, snext);
+ }
break;
}
+ drbr_advance(ifp, txr->br);
enq++;
ifp->if_obytes +=3D next->m_pkthdr.len;
if (next->m_flags & M_MCAST)
Index: dev/oce/oce_if.c
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D
--- dev/oce/oce_if.c (revision 246323)
+++ dev/oce/oce_if.c (working copy)
@@ -1154,6 +1154,7 @@ oce_multiq_transmit(struct ifnet *ifp, struct mbuf
POCE_SOFTC sc =3D ifp->if_softc;
int status =3D 0, queue_index =3D 0;
struct mbuf *next =3D NULL;
+ struct mbuf *snext;
struct buf_ring *br =3D NULL;
=20
br =3D wq->br;
@@ -1166,29 +1167,28 @@ oce_multiq_transmit(struct ifnet *ifp, struct =
mbuf
return status;
}
=20
- if (m =3D=3D NULL)
- next =3D drbr_dequeue(ifp, br); =09
- else if (drbr_needs_enqueue(ifp, br)) {
+ if (m) {
if ((status =3D drbr_enqueue(ifp, br, m)) !=3D 0)
return status;
- next =3D drbr_dequeue(ifp, br);
- } else
- next =3D m;
-
- while (next !=3D NULL) {
+ }=20
+ while ((next =3D drbr_peek(ifp, br)) !=3D NULL) {
+ snext =3D next;
if (oce_tx(sc, &next, queue_index)) {
- if (next !=3D NULL) {
+ if (next =3D=3D NULL) {
+ drbr_advance(ifp, br);
+ } else {
+ drbr_putback(ifp, br, next, snext);
wq->tx_stats.tx_stops ++;
ifp->if_drv_flags |=3D IFF_DRV_OACTIVE;
status =3D drbr_enqueue(ifp, br, next);
} =20
break;
}
+ drbr_advance(ifp, br);
ifp->if_obytes +=3D next->m_pkthdr.len;
if (next->m_flags & M_MCAST)
ifp->if_omcasts++;
ETHER_BPF_MTAP(ifp, next);
- next =3D drbr_dequeue(ifp, br);
}
=20
return status;
Index: dev/ixgbe/ixgbe.c
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D
--- dev/ixgbe/ixgbe.c (revision 246323)
+++ dev/ixgbe/ixgbe.c (working copy)
@@ -821,7 +821,7 @@ static int
ixgbe_mq_start_locked(struct ifnet *ifp, struct tx_ring *txr, struct =
mbuf *m)
{
struct adapter *adapter =3D txr->adapter;
- struct mbuf *next;
+ struct mbuf *next, *snext;
int enqueued, err =3D 0;
=20
if (((ifp->if_drv_flags & IFF_DRV_RUNNING) =3D=3D 0) ||
@@ -832,22 +832,25 @@ ixgbe_mq_start_locked(struct ifnet *ifp, struct tx
}
=20
enqueued =3D 0;
- if (m =3D=3D NULL) {
- next =3D drbr_dequeue(ifp, txr->br);
- } else if (drbr_needs_enqueue(ifp, txr->br)) {
- if ((err =3D drbr_enqueue(ifp, txr->br, m)) !=3D 0)
+ if (m) {
+ err =3D drbr_enqueue(ifp, txr->br, m);
+ if (err) {
return (err);
- next =3D drbr_dequeue(ifp, txr->br);
- } else
- next =3D m;
+ }
+ }
=20
/* Process the queue */
- while (next !=3D NULL) {
+ while ((next =3D drbr_peek(ifp, txr->br)) !=3D NULL) {
+ snext =3D next;
if ((err =3D ixgbe_xmit(txr, &next)) !=3D 0) {
- if (next !=3D NULL)
- err =3D drbr_enqueue(ifp, txr->br, =
next);
+ if (next =3D=3D NULL) {
+ drbr_advance(ifp, txr->br);
+ } else {
+ drbr_putback(ifp, txr->br, next, snext);
+ }
break;
}
+ drbr_advance(ifp, txr->br);
enqueued++;
/* Send a copy of the frame to the BPF listener */
ETHER_BPF_MTAP(ifp, next);
@@ -855,7 +858,6 @@ ixgbe_mq_start_locked(struct ifnet *ifp, struct tx
break;
if (txr->tx_avail < IXGBE_TX_OP_THRESHOLD)
ixgbe_txeof(txr);
- next =3D drbr_dequeue(ifp, txr->br);
}
=20
if (enqueued > 0) {
Index: dev/ixgbe/ixv.c
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D
--- dev/ixgbe/ixv.c (revision 246323)
+++ dev/ixgbe/ixv.c (working copy)
@@ -605,7 +605,7 @@ static int
ixv_mq_start_locked(struct ifnet *ifp, struct tx_ring *txr, struct mbuf =
*m)
{
struct adapter *adapter =3D txr->adapter;
- struct mbuf *next;
+ struct mbuf *next, *snext;
int enqueued, err =3D 0;
=20
if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=3D=
@@ -620,22 +620,24 @@ ixv_mq_start_locked(struct ifnet *ifp, struct tx_r
ixv_txeof(txr);
=20
enqueued =3D 0;
- if (m =3D=3D NULL) {
- next =3D drbr_dequeue(ifp, txr->br);
- } else if (drbr_needs_enqueue(ifp, txr->br)) {
- if ((err =3D drbr_enqueue(ifp, txr->br, m)) !=3D 0)
+ if (m) {
+ err =3D drbr_enqueue(ifp, txr->br, m);
+ if (err) {
return (err);
- next =3D drbr_dequeue(ifp, txr->br);
- } else
- next =3D m;
-
+ }
+ }
/* Process the queue */
- while (next !=3D NULL) {
+ while ((next =3D drbr_peek(ifp, txr->br)) !=3D NULL) {
+ snext =3D next;
if ((err =3D ixv_xmit(txr, &next)) !=3D 0) {
- if (next !=3D NULL)
- err =3D drbr_enqueue(ifp, txr->br, =
next);
+ if (next =3D=3D NULL) {
+ drbr_advance(ifp, txr->br);
+ } else {
+ drbr_putback(ifp, txr->br, next, snext);
+ }
break;
}
+ drbr_advance(ifp, txr->br);
enqueued++;
ifp->if_obytes +=3D next->m_pkthdr.len;
if (next->m_flags & M_MCAST)
@@ -648,7 +650,6 @@ ixv_mq_start_locked(struct ifnet *ifp, struct tx_r
ifp->if_drv_flags |=3D IFF_DRV_OACTIVE;
break;
}
- next =3D drbr_dequeue(ifp, txr->br);
}
=20
if (enqueued > 0) {
Index: dev/bxe/if_bxe.c
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D
--- dev/bxe/if_bxe.c (revision 246323)
+++ dev/bxe/if_bxe.c (working copy)
@@ -9491,7 +9491,7 @@ bxe_tx_mq_start_locked(struct ifnet *ifp,
struct bxe_fastpath *fp, struct mbuf *m)
{
struct bxe_softc *sc;
- struct mbuf *next;
+ struct mbuf *next, *snext;
int depth, rc, tx_count;
=20
sc =3D fp->sc;
@@ -9506,24 +9506,16 @@ bxe_tx_mq_start_locked(struct ifnet *ifp,
=20
BXE_FP_LOCK_ASSERT(fp);
=20
- if (m =3D=3D NULL) {
- /* No new work, check for pending frames. */
- next =3D drbr_dequeue(ifp, fp->br);
- } else if (drbr_needs_enqueue(ifp, fp->br)) {
- /* Both new and pending work, maintain packet order. */
+ if (m) {
rc =3D drbr_enqueue(ifp, fp->br, m);
if (rc !=3D 0) {
fp->tx_soft_errors++;
goto bxe_tx_mq_start_locked_exit;
}
- next =3D drbr_dequeue(ifp, fp->br);
- } else
- /* New work only, nothing pending. */
- next =3D m;
-
+ }
/* Keep adding entries while there are frames to send. */
- while (next !=3D NULL) {
-
+ while ((next =3D drbr_peek(ifp, fp->br)) !=3D NULL) {
+ snext =3D next;
/* The transmit mbuf now belongs to us, keep track of =
it. */
fp->tx_mbuf_alloc++;
=20
@@ -9537,23 +9529,22 @@ bxe_tx_mq_start_locked(struct ifnet *ifp,
if (__predict_false(rc !=3D 0)) {
fp->tx_encap_failures++;
/* Very Bad Frames(tm) may have been dropped. */
- if (next !=3D NULL) {
+ if (next =3D=3D NULL) {
+ drbr_advance(ifp, fp->br);
+ } else {
+ drbr_putback(ifp, fp->br, next, snext);
/*
* Mark the TX queue as full and save
* the frame.
*/
ifp->if_drv_flags |=3D IFF_DRV_OACTIVE;
fp->tx_frame_deferred++;
-
- /* This may reorder frame. */
- rc =3D drbr_enqueue(ifp, fp->br, next);
fp->tx_mbuf_alloc--;
}
-
/* Stop looking for more work. */
break;
}
-
+ drbr_advance(ifp, fp->br);
/* The transmit frame was enqueued successfully. */
tx_count++;
=20
@@ -9574,8 +9565,6 @@ bxe_tx_mq_start_locked(struct ifnet *ifp,
ifp->if_drv_flags &=3D ~IFF_DRV_OACTIVE;
break;
}
-
- next =3D drbr_dequeue(ifp, fp->br);
}
=20
/* No TX packets were dequeued. */
Index: net/if_var.h
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D
--- net/if_var.h (revision 246323)
+++ net/if_var.h (working copy)
@@ -622,6 +622,47 @@ drbr_enqueue(struct ifnet *ifp, struct buf_ring *b
}
=20
static __inline void
+drbr_putback(struct ifnet *ifp, struct buf_ring *br, struct mbuf *new, =
struct mbuf *prev)
+{
+ /*
+ * The top of the list needs to be swapped=20
+ * for this one.
+ */
+#ifdef ALTQ
+ struct mbuf *m;
+ if (ifp !=3D NULL && ALTQ_IS_ENABLED(&ifp->if_snd)) {
+ /*=20
+ * Peek in altq case dequeued it
+ * so put it back.
+ */
+ IFQ_DRV_PREPEND(ifq, new);
+ return;
+ }
+#endif
+ if (new !=3D prev)=20
+ buf_ring_swap(br, new, prev);
+}
+
+static __inline struct mbuf *
+drbr_peek(struct ifnet *ifp, struct buf_ring *br)
+{
+#ifdef ALTQ
+ struct mbuf *m;
+ if (ifp !=3D NULL && ALTQ_IS_ENABLED(&ifp->if_snd)) {
+ /*=20
+ * Pull it off like a dequeue
+ * since drbr_advance() does nothing
+ * for altq and drbr_putback() will
+ * use the old prepend function.
+ */
+ IFQ_DEQUEUE(&ifp->if_snd, m);
+ return (m);
+ }
+#endif
+ return(buf_ring_peek(br));
+}
+
+static __inline void
drbr_flush(struct ifnet *ifp, struct buf_ring *br)
{
struct mbuf *m;
@@ -656,6 +697,17 @@ drbr_dequeue(struct ifnet *ifp, struct buf_ring *b
return (buf_ring_dequeue_sc(br));
}
=20
+static __inline void
+drbr_advance(struct ifnet *ifp, struct buf_ring *br)
+{
+#ifdef ALTQ
+ /* Nothing to do here since peek dequeues in altq case */
+ return;
+#endif
+ return (buf_ring_advance_sc(br));
+}
+
+
static __inline struct mbuf *
drbr_dequeue_cond(struct ifnet *ifp, struct buf_ring *br,
int (*func) (struct mbuf *, void *), void *arg)=20
Index: sys/buf_ring.h
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D
--- sys/buf_ring.h (revision 246323)
+++ sys/buf_ring.h (working copy)
@@ -208,6 +208,53 @@ buf_ring_dequeue_sc(struct buf_ring *br)
}
=20
/*
+ * single-consumer advance after a peek
+ * use where it is protected by a lock
+ * e.g. a network driver's tx queue lock
+ */
+static __inline void
+buf_ring_advance_sc(struct buf_ring *br)
+{
+ uint32_t cons_head, cons_next;
+ uint32_t prod_tail;
+ void *buf;
+=09
+ cons_head =3D br->br_cons_head;
+ prod_tail =3D br->br_prod_tail;
+=09
+ cons_next =3D (cons_head + 1) & br->br_cons_mask;
+=09
+ if (cons_head =3D=3D prod_tail)=20
+ return;
+
+ br->br_cons_head =3D cons_next;
+ buf =3D br->br_ring[cons_head];
+ br->br_cons_tail =3D cons_next;
+}
+
+
+/*
+ * Used to return a differnt mbuf to the
+ * top of the ring. This can happen if
+ * the driver changed the packets (some defragmentation
+ * for example) and then realized the transmit
+ * ring was full. In such a case the old packet
+ * is now freed, but we want the order of the actual
+ * data (being sent in the new packet) to remain
+ * the same.
+ */
+static __inline void
+buf_ring_swap(struct buf_ring *br, void *new, void *old)
+{
+ int ret;
+ if (br->br_cons_head =3D=3D br->br_prod_tail)=20
+ /* Huh? */
+ return;
+ ret =3D atomic_cmpset_long((uint64_t =
*)&br->br_ring[br->br_cons_head], (uint64_t)old, (uint64_t)new);
+ KASSERT(ret, ("Swap out failed old:%p new:%p ret:%d", old, new, =
ret));
+}
+
+/*
* return a pointer to the first entry in the ring
* without modifying it, or NULL if the ring is empty
* race-prone if not protected by a lock
Index: ofed/drivers/net/mlx4/en_tx.c
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D
--- ofed/drivers/net/mlx4/en_tx.c (revision 246323)
+++ ofed/drivers/net/mlx4/en_tx.c (working copy)
@@ -919,7 +919,7 @@ mlx4_en_transmit_locked(struct ifnet *dev, int tx_
{
struct mlx4_en_priv *priv =3D netdev_priv(dev);
struct mlx4_en_tx_ring *ring;
- struct mbuf *next;
+ struct mbuf *next, *snext;
int enqueued, err =3D 0;
=20
ring =3D &priv->tx_ring[tx_ind];
@@ -931,22 +931,22 @@ mlx4_en_transmit_locked(struct ifnet *dev, int tx_
}
=20
enqueued =3D 0;
- if (m =3D=3D NULL) {
- next =3D drbr_dequeue(dev, ring->br);
- } else if (drbr_needs_enqueue(dev, ring->br)) {
+ if (m) {
if ((err =3D drbr_enqueue(dev, ring->br, m)) !=3D 0)
return (err);
- next =3D drbr_dequeue(dev, ring->br);
- } else
- next =3D m;
-
+ }
/* Process the queue */
- while (next !=3D NULL) {
+ while ((next =3D drbr_peek(ifp, txr->br)) !=3D NULL) {
+ snext =3D next;
if ((err =3D mlx4_en_xmit(dev, tx_ind, &next)) !=3D 0) {
- if (next !=3D NULL)
- err =3D drbr_enqueue(dev, ring->br, =
next);
+ if (next =3D=3D NULL) {
+ drbr_advance(ifp, txr->br);
+ } else {
+ drbr_putback(ifp, txr->br, next, snext);
+ }
break;
}
+ drbr_advance(ifp, txr->br);
enqueued++;
dev->if_obytes +=3D next->m_pkthdr.len;
if (next->m_flags & M_MCAST)
@@ -955,7 +955,6 @@ mlx4_en_transmit_locked(struct ifnet *dev, int tx_
ETHER_BPF_MTAP(dev, next);
if ((dev->if_drv_flags & IFF_DRV_RUNNING) =3D=3D 0)
break;
- next =3D drbr_dequeue(dev, ring->br);
}
=20
if (enqueued > 0)
--Apple-Mail=_E172FC28-2504-451F-A867-0842517F6CD4
Content-Transfer-Encoding: quoted-printable
Content-Type: text/plain;
charset=us-ascii
On Feb 5, 2013, at 5:49 AM, Randy Stewart wrote:
>=20
> On Feb 4, 2013, at 3:24 PM, John Baldwin wrote:
>=20
>> On Monday, February 04, 2013 12:22:49 pm Randy Stewart wrote:
>>> All:
>>>=20
>>> I have been working with TCP in gigabit networks (igb driver =
actually) and have
>>> found a very nasty problem with the way the driver is doing its put =
back when
>>> it fills the out-bound transmit queue.
>>>=20
>>> Basically it has taken a packet from the head of the ring buffer, =
and then=20
>>> realizes it can't fit it into the transmit queue. So it just =
re-enqueue's it
>>> into the ring buffer. Whats wrong with that? Well most of the time =
there
>>> are anywhere from 10-50 packets (maybe more) in that ring buffer =
when you are
>>> operating at full speed (or trying to). This means you will see 10 =
duplicate
>>> ACKs, do a fast retransmit and cut your cwnd in half.. not very nice =
actually.
>>>=20
>>> The patch I have attached makes it so that
>>>=20
>>> 1) There are ways to swap back.
>>> 2) Use the peek in the ring buffer and only
>>> dequeue the packet if we put it into the transmit ring
>>> 3) If something goes wrong and the transmit frees the packet we =
dequeue it.
>>> 4) If the transmit changed it (defrag etc) then swap out the new =
mbuf that
>>> has taken its place.
>>>=20
>>> I have fixed the four intel drivers that had this systemic issue, =
but there
>>> are still more to fix.
>>>=20
>>> Comments/review .. rotten egg's etc.. would be most welcome before
>>> I commit this..
>>=20
>> Does this only happen in drivers that use buffering?
>=20
> Yep, there are a lot of drivers that *do not* use the drbr_xxxx() =
functions and
> for those they do the IFQ_DRV_PREPEND().. its only the newer drivers =
like the
> intel 1Gig and 10Gig ones that seem effected
>=20
> Also effected are :
>=20
> bxe
> cxgb
> oce
> en
>=20
> I have not fixed those yet.
>=20
>> I seem to recall that
>> drivers using IFQ would just stuff the packet at the head of the IFQ =
via
>> IFQ_DRV_PREPEND() in this case so it is still the next packet to =
transmit.
>> See, for example, this bit in dc_start_locked():
>>=20
>> for (queued =3D 0; !IFQ_DRV_IS_EMPTY(&ifp->if_snd); ) {
>> /*
>> * If there's no way we can send any packets, return =
now.
>> */
>> if (sc->dc_cdata.dc_tx_cnt > DC_TX_LIST_CNT - =
DC_TX_LIST_RSVD) {
>> ifp->if_drv_flags |=3D IFF_DRV_OACTIVE;
>> break;
>> }
>> IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head);
>> if (m_head =3D=3D NULL)
>> break;
>>=20
>> if (dc_encap(sc, &m_head)) {
>> if (m_head =3D=3D NULL)
>> break;
>> IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
>> ifp->if_drv_flags |=3D IFF_DRV_OACTIVE;
>> break;
>> }
>>=20
>> It sounds like drbr/buf_ring just don't handle this case correctly? =
It
>> seems a shame to have to duplicate so much code in the various =
drivers to
>> fix this, but that seems to be par for the course when using =
buf_ring. :(
>> (buggy in edge cases and lots of duplicated code that is).
>=20
>> Also, doing the drbr_swap() just so that drbr_dequeue() returns what =
you
>> just swapped in seems... odd. It seems that it would be nicer =
instead
>> to have some sort of drbr_peek() / drbr_advance() where the latter =
just
>> skips over whatever the current head is? Then you could have =
something
>> like:
>>=20
>> while ((next =3D drbr_peek()) !=3D NULL) {
>> if (!foo_encap(&next)) {
>> if (next =3D=3D NULL)
>> drbr_advance();
>> break;
>> }
>> drbr_advance();
>> }
>>=20
>=20
> That was what I originally did (without the rename), but there is a =
sure crash waiting in that.
> The only difference from what I originally had was just =
drbr_dequeue().. but
> I was being a bit lazy and not wanting to add yet another function to =
the=20
> drbr_xxxx code since essential it would be a clone of drbr_dequeue() =
without
> returning the mbuf.
>=20
> The crash potential here is in that foo_encap(&next) often times will =
return
> a different mbuf (at least in the igb driver it does). I believe its =
due
> to either the m_pullup() which could change the lead mbuf you want
> in the drbr_ring, or the m_defrag all within igb_xmit. Thus you have
> to track what comes back from the !foo_encap() call and compare it to=20=
> see if you must swap it out.=20
>=20
>=20
>> I guess the sticky widget is the case of ATLQ as you need to dequeue =
the
>> packet always in the ALTQ case and put it back if the encap fails.
>=20
> Yeah ALTQ is ugly and IMO we need to re-write it anyway.. maybe =
re-think
> this whole layer ;-0
>=20
>> For
>> your patch it's not clear to me how that works. It seems that if the
>> encap routine frees the mbuf you try to dereference a freed pointer =
when
>> you call drbr_dequeue().
>=20
> Hmm you are right.. I forgot how we keep those using the mbuf =
itself...
>=20
>> I really think you will instead need some sort
>> of 'drbr_putback()' and have 'drbr_peek()' dequeue in the ALTQ case =
and
>> use 'drbr_putback()' to put it back (PREPEND) in the ALTQ case.
>=20
> We could do that but drbr_putback() would probably need both the old
> and new pointers and then we could make it do the ring_swap() to put
> the right mbuf in place..
>=20
> Let me go explore that and come up with a better patch ;-)
>=20
> R
>=20
>=20
>>=20
>> --=20
>> John Baldwin
>> _______________________________________________
>> freebsd-net@freebsd.org mailing list
>> http://lists.freebsd.org/mailman/listinfo/freebsd-net
>> To unsubscribe, send any mail to =
"freebsd-net-unsubscribe@freebsd.org"
>>=20
>=20
> -----
> Randall Stewart
> randall@lakerest.net
>=20
>=20
>=20
>=20
> _______________________________________________
> freebsd-net@freebsd.org mailing list
> http://lists.freebsd.org/mailman/listinfo/freebsd-net
> To unsubscribe, send any mail to "freebsd-net-unsubscribe@freebsd.org"
>=20
------------------------------
Randall Stewart
803-317-4952 (cell)
--Apple-Mail=_E172FC28-2504-451F-A867-0842517F6CD4--
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?39571D84-A8C0-46A4-8EFA-CF74D862EAAE>
