Skip site navigation (1)Skip section navigation (2)
Date:      Mon, 4 Feb 2013 12:22:49 -0500
From:      Randy Stewart <randall@lakerest.net>
To:        John Baldwin <jhb@FreeBSD.org>, jv@FreeBSD.org, George Nevile Neil <gnn@neville-neil.com>, Robert Watson <rwatson@FreeBSD.org>, Kip Macy <kmacy@FreeBSD.org>
Cc:        freebsd-net <freebsd-net@FreeBSD.org>
Subject:   Driver patch to look at...
Message-ID:  <D3AA078A-CD19-4228-A019-BE9C985895E2@lakerest.net>

next in thread | raw e-mail | index | archive | help

--Apple-Mail=_70D37FD7-3A2F-4331-BEC7-37AA9F73B8FC
Content-Transfer-Encoding: quoted-printable
Content-Type: text/plain;
	charset=us-ascii

All:

I have been working with TCP in gigabit networks (igb driver actually) =
and have
found a very nasty problem with the way the driver is doing its put back =
when
it fills the out-bound transmit queue.

Basically it has taken a packet from the head of the ring buffer, and =
then=20
realizes it can't fit it into the transmit queue. So it just =
re-enqueue's it
into the ring buffer. Whats wrong with that? Well most of the time there
are anywhere from 10-50 packets (maybe more) in that ring buffer when =
you are
operating at full speed (or trying to). This means you will see 10 =
duplicate
ACKs, do a fast retransmit and cut your cwnd in half.. not very nice =
actually.

The patch I have attached makes it so that

1) There are ways to swap back.
2) Use the peek in the ring buffer and only
   dequeue the packet if we put it into the transmit ring
3) If something goes wrong and the transmit frees the packet we dequeue =
it.
4) If the transmit changed it (defrag etc) then swap out the new mbuf =
that
   has taken its place.

I have fixed the four intel drivers that had this systemic issue, but =
there
are still more to fix.

Comments/review .. rotten egg's etc.. would be most welcome before
I commit this..

Jack are you out there?

Thanks

R


--Apple-Mail=_70D37FD7-3A2F-4331-BEC7-37AA9F73B8FC
Content-Disposition: attachment;
	filename=driver_patch.txt
Content-Type: text/plain;
	x-unix-mode=0644;
	name="driver_patch.txt"
Content-Transfer-Encoding: quoted-printable

Index: dev/e1000/if_em.c
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D
--- dev/e1000/if_em.c	(revision 246323)
+++ dev/e1000/if_em.c	(working copy)
@@ -894,7 +894,7 @@ static int
 em_mq_start_locked(struct ifnet *ifp, struct tx_ring *txr, struct mbuf =
*m)
 {
 	struct adapter  *adapter =3D txr->adapter;
-        struct mbuf     *next;
+        struct mbuf     *next, *next, *dequeued;
         int             err =3D 0, enq =3D 0;
=20
 	if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=3D=

@@ -905,22 +905,27 @@ em_mq_start_locked(struct ifnet *ifp, struct tx_ri
 	}
=20
 	enq =3D 0;
-	if (m =3D=3D NULL) {
-		next =3D drbr_dequeue(ifp, txr->br);
-	} else if (drbr_needs_enqueue(ifp, txr->br)) {
-		if ((err =3D drbr_enqueue(ifp, txr->br, m)) !=3D 0)
+	if (m) {
+		err =3D drbr_enqueue(ifp, txr->br, m);
+		if (err) {
 			return (err);
-		next =3D drbr_dequeue(ifp, txr->br);
-	} else
-		next =3D m;
+		}
+	}=20
=20
 	/* Process the queue */
-	while (next !=3D NULL) {
+	while ((next =3D drbr_peek(ifp, txr->br)) !=3D NULL) {
+		snext =3D next;
 		if ((err =3D em_xmit(txr, &next)) !=3D 0) {
-                        if (next !=3D NULL)
-                                err =3D drbr_enqueue(ifp, txr->br, =
next);
-                        break;
+			if (next =3D=3D NULL) {
+				dequeued =3D drbr_dequeue(ifp, txr->br);
+				KASSERT(dequeued =3D=3D snext, =
("dequeued incorrect packet from buf_ring"));
+			} else if (next !=3D snext) {
+				drbr_swap(ifp, txr->br, next, snext);
+			}
+			break;
 		}
+		dequeued =3D drbr_dequeue(ifp, txr->br);
+		KASSERT(dequeued =3D=3D snext, ("dequeued incorrect =
packet from buf_ring"));
 		enq++;
 		ifp->if_obytes +=3D next->m_pkthdr.len;
 		if (next->m_flags & M_MCAST)
Index: dev/e1000/if_igb.c
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D
--- dev/e1000/if_igb.c	(revision 246323)
+++ dev/e1000/if_igb.c	(working copy)
@@ -981,7 +981,7 @@ static int
 igb_mq_start_locked(struct ifnet *ifp, struct tx_ring *txr)
 {
 	struct adapter  *adapter =3D txr->adapter;
-        struct mbuf     *next;
+        struct mbuf     *next, *snext, *dequeued;
         int             err =3D 0, enq;
=20
 	IGB_TX_LOCK_ASSERT(txr);
@@ -994,12 +994,21 @@ igb_mq_start_locked(struct ifnet *ifp, struct tx_r
 	enq =3D 0;
=20
 	/* Process the queue */
-	while ((next =3D drbr_dequeue(ifp, txr->br)) !=3D NULL) {
+	while ((next =3D drbr_peek(ifp, txr->br)) !=3D NULL) {
+		snext =3D next;
 		if ((err =3D igb_xmit(txr, &next)) !=3D 0) {
-			if (next !=3D NULL)
-				err =3D drbr_enqueue(ifp, txr->br, =
next);
+			if (next =3D=3D NULL) {
+				/* It was freed, dequeue it */
+				dequeued =3D drbr_dequeue(ifp, txr->br);
+				KASSERT(dequeued =3D=3D snext, =
("dequeued incorrect packet from buf_ring"));
+			} else if (next !=3D snext) {
+				/* it was changed -- defrag? pullup? */
+				drbr_swap(ifp, txr->br, next, snext);
+			}
 			break;
 		}
+		dequeued =3D drbr_dequeue(ifp, txr->br);
+		KASSERT(dequeued =3D=3D snext, ("dequeued incorrect =
packet from buf_ring"));
 		enq++;
 		ifp->if_obytes +=3D next->m_pkthdr.len;
 		if (next->m_flags & M_MCAST)
Index: dev/ixgbe/ixgbe.c
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D
--- dev/ixgbe/ixgbe.c	(revision 246323)
+++ dev/ixgbe/ixgbe.c	(working copy)
@@ -821,7 +821,7 @@ static int
 ixgbe_mq_start_locked(struct ifnet *ifp, struct tx_ring *txr, struct =
mbuf *m)
 {
 	struct adapter  *adapter =3D txr->adapter;
-        struct mbuf     *next;
+        struct mbuf     *next, *snext, *dequeued;
         int             enqueued, err =3D 0;
=20
 	if (((ifp->if_drv_flags & IFF_DRV_RUNNING) =3D=3D 0) ||
@@ -832,22 +832,27 @@ ixgbe_mq_start_locked(struct ifnet *ifp, struct tx
 	}
=20
 	enqueued =3D 0;
-	if (m =3D=3D NULL) {
-		next =3D drbr_dequeue(ifp, txr->br);
-	} else if (drbr_needs_enqueue(ifp, txr->br)) {
-		if ((err =3D drbr_enqueue(ifp, txr->br, m)) !=3D 0)
+	if (m) {
+		err =3D drbr_enqueue(ifp, txr->br, m);
+		if (err) {
 			return (err);
-		next =3D drbr_dequeue(ifp, txr->br);
-	} else
-		next =3D m;
+		}
+	}
=20
 	/* Process the queue */
-	while (next !=3D NULL) {
+	while ((next =3D drbr_peek(ifp, txr->br)) !=3D NULL) {
+		snext =3D next;
 		if ((err =3D ixgbe_xmit(txr, &next)) !=3D 0) {
-			if (next !=3D NULL)
-				err =3D drbr_enqueue(ifp, txr->br, =
next);
+			if (next =3D=3D NULL) {
+				dequeued =3D drbr_dequeue(ifp, txr->br);
+				KASSERT(dequeued =3D=3D snext, =
("dequeued incorrect packet from buf_ring"));
+			} else if (next !=3D snext) {
+				drbr_swap(ifp, txr->br, next, snext);
+			}
 			break;
 		}
+		dequeued =3D drbr_dequeue(ifp, txr->br);
+		KASSERT(dequeued =3D=3D snext, ("dequeued incorrect =
packet from buf_ring"));
 		enqueued++;
 		/* Send a copy of the frame to the BPF listener */
 		ETHER_BPF_MTAP(ifp, next);
Index: dev/ixgbe/ixv.c
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D
--- dev/ixgbe/ixv.c	(revision 246323)
+++ dev/ixgbe/ixv.c	(working copy)
@@ -605,7 +605,7 @@ static int
 ixv_mq_start_locked(struct ifnet *ifp, struct tx_ring *txr, struct mbuf =
*m)
 {
 	struct adapter  *adapter =3D txr->adapter;
-        struct mbuf     *next;
+        struct mbuf     *next, *snext, *dequeued;
         int             enqueued, err =3D 0;
=20
 	if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=3D=

@@ -620,22 +620,26 @@ ixv_mq_start_locked(struct ifnet *ifp, struct tx_r
 		ixv_txeof(txr);
=20
 	enqueued =3D 0;
-	if (m =3D=3D NULL) {
-		next =3D drbr_dequeue(ifp, txr->br);
-	} else if (drbr_needs_enqueue(ifp, txr->br)) {
-		if ((err =3D drbr_enqueue(ifp, txr->br, m)) !=3D 0)
+	if (m) {
+		err =3D drbr_enqueue(ifp, txr->br, m);
+		if (err) {
 			return (err);
-		next =3D drbr_dequeue(ifp, txr->br);
-	} else
-		next =3D m;
-
+		}
+	}
 	/* Process the queue */
-	while (next !=3D NULL) {
+	while ((next =3D drbr_peek(ifp, txr->br)) !=3D NULL) {
+		snext =3D next;
 		if ((err =3D ixv_xmit(txr, &next)) !=3D 0) {
-			if (next !=3D NULL)
-				err =3D drbr_enqueue(ifp, txr->br, =
next);
+			if (next =3D=3D NULL) {
+				dequeued =3D drbr_dequeue(ifp, txr->br);
+				KASSERT(dequeued =3D=3D snext, =
("dequeued incorrect packet from buf_ring"));
+			} else if (next !=3D snext) {
+				drbr_swap(ifp, txr->br, next, snext);
+			}
 			break;
 		}
+		dequeued =3D drbr_dequeue(ifp, txr->br);
+		KASSERT(dequeued =3D=3D snext, ("dequeued incorrect =
packet from buf_ring"));
 		enqueued++;
 		ifp->if_obytes +=3D next->m_pkthdr.len;
 		if (next->m_flags & M_MCAST)
Index: net/if_var.h
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D
--- net/if_var.h	(revision 246323)
+++ net/if_var.h	(working copy)
@@ -622,6 +622,41 @@ drbr_enqueue(struct ifnet *ifp, struct buf_ring *b
 }
=20
 static __inline void
+drbr_swap(struct ifnet *ifp, struct buf_ring *br, struct mbuf *new, =
struct mbuf *prev)
+{
+	/*
+	 * The top of the list needs to be swapped=20
+	 * for this one.
+	 */
+#ifdef ALTQ
+	struct mbuf *m;
+	if (ifp !=3D NULL && ALTQ_IS_ENABLED(&ifp->if_snd)) {
+		/* Pull it off and put it back in */
+		IFQ_DEQUEUE(&ifp->if_snd, m);
+		KASSERT(m =3D=3D prev, ("Swap out failed to find prev =
mbuf"));
+		IFQ_DRV_DEQUEUE(&ifp->if_snd, new);
+		return;
+	}
+#endif
+	buf_ring_swap(br, new, prev);
+}
+
+static __inline struct mbuf *
+drbr_peek(struct ifnet *ifp, struct buf_ring *br)
+{
+#ifdef ALTQ
+	struct mbuf *m;
+	if (ifp !=3D NULL && ALTQ_IS_ENABLED(&ifp->if_snd)) {
+		/* Pull it off and put it back in */
+		IFQ_DEQUEUE(&ifp->if_snd, m);
+		IFQ_DRV_DEQUEUE(&ifp->if_snd, m);
+		return (m);
+	}
+#endif
+	return(buf_ring_peek(br));
+}
+
+static __inline void
 drbr_flush(struct ifnet *ifp, struct buf_ring *br)
 {
 	struct mbuf *m;
Index: sys/buf_ring.h
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D
--- sys/buf_ring.h	(revision 246323)
+++ sys/buf_ring.h	(working copy)
@@ -208,6 +208,27 @@ buf_ring_dequeue_sc(struct buf_ring *br)
 }
=20
 /*
+ * Used to return a differnt mbuf to the
+ * top of the ring. This can happen if
+ * the driver changed the packets (some deframentation
+ * for example) and then realized the transmit
+ * ring was full. In such a case the old packet
+ * is now freed, but we want the order of the actual
+ * data (being sent in the new packet) to remain
+ * the same.
+ */
+static __inline void
+buf_ring_swap(struct buf_ring *br, void *new, void *old)
+{
+	int ret;
+	if (br->br_cons_head =3D=3D br->br_prod_tail)=20
+		/* Huh? */
+		return;
+	ret =3D atomic_cmpset_long((uint64_t =
*)&br->br_ring[br->br_cons_head], (uint64_t)old, (uint64_t)new);
+	KASSERT(ret, ("Swap out failed old:%p new:%p ret:%d", old, new, =
ret));
+}
+
+/*
  * return a pointer to the first entry in the ring
  * without modifying it, or NULL if the ring is empty
  * race-prone if not protected by a lock

--Apple-Mail=_70D37FD7-3A2F-4331-BEC7-37AA9F73B8FC
Content-Transfer-Encoding: 7bit
Content-Type: text/plain;
	charset=us-ascii


-----
Randall Stewart
randall@lakerest.net





--Apple-Mail=_70D37FD7-3A2F-4331-BEC7-37AA9F73B8FC--



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?D3AA078A-CD19-4228-A019-BE9C985895E2>