From owner-freebsd-net@FreeBSD.ORG Mon Feb 4 17:22:54 2013 Return-Path: Delivered-To: freebsd-net@FreeBSD.org Received: from mx1.freebsd.org (mx1.freebsd.org [IPv6:2001:1900:2254:206a::19:1]) by hub.freebsd.org (Postfix) with ESMTP id 6934DB79; Mon, 4 Feb 2013 17:22:54 +0000 (UTC) (envelope-from randall@lakerest.net) Received: from lakerest.net (lakerest.net [70.155.160.98]) by mx1.freebsd.org (Postfix) with ESMTP id C658EC23; Mon, 4 Feb 2013 17:22:50 +0000 (UTC) Received: from [10.1.1.101] (bsd4.lakerest.net [70.155.160.102]) (authenticated bits=0) by lakerest.net (8.14.4/8.14.3) with ESMTP id r14HN5AZ028472 (version=TLSv1/SSLv3 cipher=AES128-SHA bits=128 verify=NOT); Mon, 4 Feb 2013 12:23:05 -0500 (EST) (envelope-from randall@lakerest.net) From: Randy Stewart Content-Type: multipart/mixed; boundary="Apple-Mail=_70D37FD7-3A2F-4331-BEC7-37AA9F73B8FC" Subject: Driver patch to look at... Date: Mon, 4 Feb 2013 12:22:49 -0500 Message-Id: To: John Baldwin , jv@FreeBSD.org, George Nevile Neil , Robert Watson , Kip Macy Mime-Version: 1.0 (Apple Message framework v1283) X-Mailer: Apple Mail (2.1283) Cc: freebsd-net X-BeenThere: freebsd-net@freebsd.org X-Mailman-Version: 2.1.14 Precedence: list List-Id: Networking and TCP/IP with FreeBSD List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Mon, 04 Feb 2013 17:22:54 -0000 --Apple-Mail=_70D37FD7-3A2F-4331-BEC7-37AA9F73B8FC Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset=us-ascii All: I have been working with TCP in gigabit networks (igb driver actually) = and have found a very nasty problem with the way the driver is doing its put back = when it fills the out-bound transmit queue. Basically it has taken a packet from the head of the ring buffer, and = then=20 realizes it can't fit it into the transmit queue. So it just = re-enqueue's it into the ring buffer. Whats wrong with that? Well most of the time there are anywhere from 10-50 packets (maybe more) in that ring buffer when = you are operating at full speed (or trying to). This means you will see 10 = duplicate ACKs, do a fast retransmit and cut your cwnd in half.. not very nice = actually. The patch I have attached makes it so that 1) There are ways to swap back. 2) Use the peek in the ring buffer and only dequeue the packet if we put it into the transmit ring 3) If something goes wrong and the transmit frees the packet we dequeue = it. 4) If the transmit changed it (defrag etc) then swap out the new mbuf = that has taken its place. I have fixed the four intel drivers that had this systemic issue, but = there are still more to fix. Comments/review .. rotten egg's etc.. would be most welcome before I commit this.. Jack are you out there? Thanks R --Apple-Mail=_70D37FD7-3A2F-4331-BEC7-37AA9F73B8FC Content-Disposition: attachment; filename=driver_patch.txt Content-Type: text/plain; x-unix-mode=0644; name="driver_patch.txt" Content-Transfer-Encoding: quoted-printable Index: dev/e1000/if_em.c =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D --- dev/e1000/if_em.c (revision 246323) +++ dev/e1000/if_em.c (working copy) @@ -894,7 +894,7 @@ static int em_mq_start_locked(struct ifnet *ifp, struct tx_ring *txr, struct mbuf = *m) { struct adapter *adapter =3D txr->adapter; - struct mbuf *next; + struct mbuf *next, *next, *dequeued; int err =3D 0, enq =3D 0; =20 if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=3D= @@ -905,22 +905,27 @@ em_mq_start_locked(struct ifnet *ifp, struct tx_ri } =20 enq =3D 0; - if (m =3D=3D NULL) { - next =3D drbr_dequeue(ifp, txr->br); - } else if (drbr_needs_enqueue(ifp, txr->br)) { - if ((err =3D drbr_enqueue(ifp, txr->br, m)) !=3D 0) + if (m) { + err =3D drbr_enqueue(ifp, txr->br, m); + if (err) { return (err); - next =3D drbr_dequeue(ifp, txr->br); - } else - next =3D m; + } + }=20 =20 /* Process the queue */ - while (next !=3D NULL) { + while ((next =3D drbr_peek(ifp, txr->br)) !=3D NULL) { + snext =3D next; if ((err =3D em_xmit(txr, &next)) !=3D 0) { - if (next !=3D NULL) - err =3D drbr_enqueue(ifp, txr->br, = next); - break; + if (next =3D=3D NULL) { + dequeued =3D drbr_dequeue(ifp, txr->br); + KASSERT(dequeued =3D=3D snext, = ("dequeued incorrect packet from buf_ring")); + } else if (next !=3D snext) { + drbr_swap(ifp, txr->br, next, snext); + } + break; } + dequeued =3D drbr_dequeue(ifp, txr->br); + KASSERT(dequeued =3D=3D snext, ("dequeued incorrect = packet from buf_ring")); enq++; ifp->if_obytes +=3D next->m_pkthdr.len; if (next->m_flags & M_MCAST) Index: dev/e1000/if_igb.c =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D --- dev/e1000/if_igb.c (revision 246323) +++ dev/e1000/if_igb.c (working copy) @@ -981,7 +981,7 @@ static int igb_mq_start_locked(struct ifnet *ifp, struct tx_ring *txr) { struct adapter *adapter =3D txr->adapter; - struct mbuf *next; + struct mbuf *next, *snext, *dequeued; int err =3D 0, enq; =20 IGB_TX_LOCK_ASSERT(txr); @@ -994,12 +994,21 @@ igb_mq_start_locked(struct ifnet *ifp, struct tx_r enq =3D 0; =20 /* Process the queue */ - while ((next =3D drbr_dequeue(ifp, txr->br)) !=3D NULL) { + while ((next =3D drbr_peek(ifp, txr->br)) !=3D NULL) { + snext =3D next; if ((err =3D igb_xmit(txr, &next)) !=3D 0) { - if (next !=3D NULL) - err =3D drbr_enqueue(ifp, txr->br, = next); + if (next =3D=3D NULL) { + /* It was freed, dequeue it */ + dequeued =3D drbr_dequeue(ifp, txr->br); + KASSERT(dequeued =3D=3D snext, = ("dequeued incorrect packet from buf_ring")); + } else if (next !=3D snext) { + /* it was changed -- defrag? pullup? */ + drbr_swap(ifp, txr->br, next, snext); + } break; } + dequeued =3D drbr_dequeue(ifp, txr->br); + KASSERT(dequeued =3D=3D snext, ("dequeued incorrect = packet from buf_ring")); enq++; ifp->if_obytes +=3D next->m_pkthdr.len; if (next->m_flags & M_MCAST) Index: dev/ixgbe/ixgbe.c =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D --- dev/ixgbe/ixgbe.c (revision 246323) +++ dev/ixgbe/ixgbe.c (working copy) @@ -821,7 +821,7 @@ static int ixgbe_mq_start_locked(struct ifnet *ifp, struct tx_ring *txr, struct = mbuf *m) { struct adapter *adapter =3D txr->adapter; - struct mbuf *next; + struct mbuf *next, *snext, *dequeued; int enqueued, err =3D 0; =20 if (((ifp->if_drv_flags & IFF_DRV_RUNNING) =3D=3D 0) || @@ -832,22 +832,27 @@ ixgbe_mq_start_locked(struct ifnet *ifp, struct tx } =20 enqueued =3D 0; - if (m =3D=3D NULL) { - next =3D drbr_dequeue(ifp, txr->br); - } else if (drbr_needs_enqueue(ifp, txr->br)) { - if ((err =3D drbr_enqueue(ifp, txr->br, m)) !=3D 0) + if (m) { + err =3D drbr_enqueue(ifp, txr->br, m); + if (err) { return (err); - next =3D drbr_dequeue(ifp, txr->br); - } else - next =3D m; + } + } =20 /* Process the queue */ - while (next !=3D NULL) { + while ((next =3D drbr_peek(ifp, txr->br)) !=3D NULL) { + snext =3D next; if ((err =3D ixgbe_xmit(txr, &next)) !=3D 0) { - if (next !=3D NULL) - err =3D drbr_enqueue(ifp, txr->br, = next); + if (next =3D=3D NULL) { + dequeued =3D drbr_dequeue(ifp, txr->br); + KASSERT(dequeued =3D=3D snext, = ("dequeued incorrect packet from buf_ring")); + } else if (next !=3D snext) { + drbr_swap(ifp, txr->br, next, snext); + } break; } + dequeued =3D drbr_dequeue(ifp, txr->br); + KASSERT(dequeued =3D=3D snext, ("dequeued incorrect = packet from buf_ring")); enqueued++; /* Send a copy of the frame to the BPF listener */ ETHER_BPF_MTAP(ifp, next); Index: dev/ixgbe/ixv.c =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D --- dev/ixgbe/ixv.c (revision 246323) +++ dev/ixgbe/ixv.c (working copy) @@ -605,7 +605,7 @@ static int ixv_mq_start_locked(struct ifnet *ifp, struct tx_ring *txr, struct mbuf = *m) { struct adapter *adapter =3D txr->adapter; - struct mbuf *next; + struct mbuf *next, *snext, *dequeued; int enqueued, err =3D 0; =20 if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=3D= @@ -620,22 +620,26 @@ ixv_mq_start_locked(struct ifnet *ifp, struct tx_r ixv_txeof(txr); =20 enqueued =3D 0; - if (m =3D=3D NULL) { - next =3D drbr_dequeue(ifp, txr->br); - } else if (drbr_needs_enqueue(ifp, txr->br)) { - if ((err =3D drbr_enqueue(ifp, txr->br, m)) !=3D 0) + if (m) { + err =3D drbr_enqueue(ifp, txr->br, m); + if (err) { return (err); - next =3D drbr_dequeue(ifp, txr->br); - } else - next =3D m; - + } + } /* Process the queue */ - while (next !=3D NULL) { + while ((next =3D drbr_peek(ifp, txr->br)) !=3D NULL) { + snext =3D next; if ((err =3D ixv_xmit(txr, &next)) !=3D 0) { - if (next !=3D NULL) - err =3D drbr_enqueue(ifp, txr->br, = next); + if (next =3D=3D NULL) { + dequeued =3D drbr_dequeue(ifp, txr->br); + KASSERT(dequeued =3D=3D snext, = ("dequeued incorrect packet from buf_ring")); + } else if (next !=3D snext) { + drbr_swap(ifp, txr->br, next, snext); + } break; } + dequeued =3D drbr_dequeue(ifp, txr->br); + KASSERT(dequeued =3D=3D snext, ("dequeued incorrect = packet from buf_ring")); enqueued++; ifp->if_obytes +=3D next->m_pkthdr.len; if (next->m_flags & M_MCAST) Index: net/if_var.h =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D --- net/if_var.h (revision 246323) +++ net/if_var.h (working copy) @@ -622,6 +622,41 @@ drbr_enqueue(struct ifnet *ifp, struct buf_ring *b } =20 static __inline void +drbr_swap(struct ifnet *ifp, struct buf_ring *br, struct mbuf *new, = struct mbuf *prev) +{ + /* + * The top of the list needs to be swapped=20 + * for this one. + */ +#ifdef ALTQ + struct mbuf *m; + if (ifp !=3D NULL && ALTQ_IS_ENABLED(&ifp->if_snd)) { + /* Pull it off and put it back in */ + IFQ_DEQUEUE(&ifp->if_snd, m); + KASSERT(m =3D=3D prev, ("Swap out failed to find prev = mbuf")); + IFQ_DRV_DEQUEUE(&ifp->if_snd, new); + return; + } +#endif + buf_ring_swap(br, new, prev); +} + +static __inline struct mbuf * +drbr_peek(struct ifnet *ifp, struct buf_ring *br) +{ +#ifdef ALTQ + struct mbuf *m; + if (ifp !=3D NULL && ALTQ_IS_ENABLED(&ifp->if_snd)) { + /* Pull it off and put it back in */ + IFQ_DEQUEUE(&ifp->if_snd, m); + IFQ_DRV_DEQUEUE(&ifp->if_snd, m); + return (m); + } +#endif + return(buf_ring_peek(br)); +} + +static __inline void drbr_flush(struct ifnet *ifp, struct buf_ring *br) { struct mbuf *m; Index: sys/buf_ring.h =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D --- sys/buf_ring.h (revision 246323) +++ sys/buf_ring.h (working copy) @@ -208,6 +208,27 @@ buf_ring_dequeue_sc(struct buf_ring *br) } =20 /* + * Used to return a differnt mbuf to the + * top of the ring. This can happen if + * the driver changed the packets (some deframentation + * for example) and then realized the transmit + * ring was full. In such a case the old packet + * is now freed, but we want the order of the actual + * data (being sent in the new packet) to remain + * the same. + */ +static __inline void +buf_ring_swap(struct buf_ring *br, void *new, void *old) +{ + int ret; + if (br->br_cons_head =3D=3D br->br_prod_tail)=20 + /* Huh? */ + return; + ret =3D atomic_cmpset_long((uint64_t = *)&br->br_ring[br->br_cons_head], (uint64_t)old, (uint64_t)new); + KASSERT(ret, ("Swap out failed old:%p new:%p ret:%d", old, new, = ret)); +} + +/* * return a pointer to the first entry in the ring * without modifying it, or NULL if the ring is empty * race-prone if not protected by a lock --Apple-Mail=_70D37FD7-3A2F-4331-BEC7-37AA9F73B8FC Content-Transfer-Encoding: 7bit Content-Type: text/plain; charset=us-ascii ----- Randall Stewart randall@lakerest.net --Apple-Mail=_70D37FD7-3A2F-4331-BEC7-37AA9F73B8FC--