Date: Thu, 27 Mar 2014 18:44:01 -0400 (EDT) From: Rick Macklem <rmacklem@uoguelph.ca> To: Christopher Forgeron <csforgeron@gmail.com> Cc: Pyun YongHyeon <pyunyh@gmail.com>, Andre Oppermann <andre@freebsd.org>, FreeBSD Net <freebsd-net@freebsd.org>, Markus Gebert <markus.gebert@hostpoint.ch>, Jack Vogel <jfvogel@gmail.com>, Garrett Wollman <wollman@freebsd.org> Subject: Re: 9.2 ixgbe tx queue hang Message-ID: <1292881633.1858906.1395960241007.JavaMail.root@uoguelph.ca> In-Reply-To: <CAB2_NwB7PnJfyzfgf4n7tqkKqxqgno%2B%2Bf9xY8_aV6AJ-mgPMYw@mail.gmail.com>
next in thread | previous in thread | raw e-mail | index | archive | help
[-- Attachment #1 --]
Christopher Forgeron wrote:
>
>
>
>
>
>
> On Wed, Mar 26, 2014 at 9:35 PM, Rick Macklem < rmacklem@uoguelph.ca
> > wrote:
>
>
>
>
> I've suggested in the other thread what you suggested in a recent
> post...ie. to change the default, at least until the propagation
> of driver set values is resolved.
>
> rick
>
>
>
> I wonder if we need to worry about propagating values up from the
> sub-if's - Setting the default in if.c means this is set for all
> if's, and it's a simple 1 line code change. If a specific 'if' needs
> a different value, it can be set before ether_attach() is called.
>
>
> I'm more concerned with the equation we use to calculate if_hw_tsomax
> - Are we considering the right variables? Are we thinking on the
> wrong OSI layer for headers?
>
Well, I'm pragmatic (which means I mostly care about some fix that works),
but it seems to me that:
- The problem is that some TSO enabled network drivers/hardware can only
handle 32 transmit segments (or 32 mbufs in the chain for the TSO packet
to be transmitted, if that is clearer).
--> Since the problem is in certain drivers, it seems that those drivers
should be where the long term fix goes.
--> Since some hardware can't handle more than 32, it seems that the
driver should be able to specify that limit, which tcp_output() can
then apply.
I have an untested patch that does this by adding if_hw_tsomaxseg.
(The attachment called tsomaxseg.patch.)
Changing if_hw_tsomax or its default value is just a hack that gets tcp_output()
to apply a limit that the driver can then fix to 32 mbufs in the chain via
m_defrag().
Since if_hw_tsomax (and if_hw_tsomaxseg in the untested patch) aren't
propagated up through lagg, that needs to be fixed.
(Yet another attached untested patch called lagg.patch.)
As I said before, I don't see these patches getting tested/reviewed etc
in time for 9.3, so I think reducing the default value of if_hw_tsomax
is a reasonable short term hack to work around the problem.
(And it sounds like Pyun YongHyeon has volunteered to fix many of the
drivers, where the 32 limit isn't a hardware one.)
rick
[-- Attachment #2 --]
--- kern/uipc_sockbuf.c.sav 2014-01-30 20:27:17.000000000 -0500
+++ kern/uipc_sockbuf.c 2014-01-30 22:12:08.000000000 -0500
@@ -965,6 +965,39 @@ sbsndptr(struct sockbuf *sb, u_int off,
}
/*
+ * Return the first mbuf for the provided offset.
+ */
+struct mbuf *
+sbsndmbuf(struct sockbuf *sb, u_int off, long *first_len)
+{
+ struct mbuf *m;
+
+ KASSERT(sb->sb_mb != NULL, ("%s: sb_mb is NULL", __func__));
+
+ *first_len = 0;
+ /*
+ * Is off below stored offset? Happens on retransmits.
+ * If so, just use sb_mb.
+ */
+ if (sb->sb_sndptr == NULL || sb->sb_sndptroff > off)
+ m = sb->sb_mb;
+ else {
+ m = sb->sb_sndptr;
+ off -= sb->sb_sndptroff;
+ }
+ while (off > 0 && m != NULL) {
+ if (off < m->m_len)
+ break;
+ off -= m->m_len;
+ m = m->m_next;
+ }
+ if (m != NULL)
+ *first_len = m->m_len - off;
+
+ return (m);
+}
+
+/*
* Drop a record off the front of a sockbuf and move the next record to the
* front.
*/
--- sys/sockbuf.h.sav 2014-01-30 20:42:28.000000000 -0500
+++ sys/sockbuf.h 2014-01-30 22:08:43.000000000 -0500
@@ -153,6 +153,8 @@ int sbreserve_locked(struct sockbuf *sb,
struct thread *td);
struct mbuf *
sbsndptr(struct sockbuf *sb, u_int off, u_int len, u_int *moff);
+struct mbuf *
+ sbsndmbuf(struct sockbuf *sb, u_int off, long *first_len);
void sbtoxsockbuf(struct sockbuf *sb, struct xsockbuf *xsb);
int sbwait(struct sockbuf *sb);
int sblock(struct sockbuf *sb, int flags);
--- netinet/tcp_input.c.sav 2014-01-30 19:37:52.000000000 -0500
+++ netinet/tcp_input.c 2014-01-30 19:39:07.000000000 -0500
@@ -3627,6 +3627,7 @@ tcp_mss(struct tcpcb *tp, int offer)
if (cap.ifcap & CSUM_TSO) {
tp->t_flags |= TF_TSO;
tp->t_tsomax = cap.tsomax;
+ tp->t_tsomaxsegs = cap.tsomaxsegs;
}
}
--- netinet/tcp_output.c.sav 2014-01-30 18:55:15.000000000 -0500
+++ netinet/tcp_output.c 2014-01-30 22:18:56.000000000 -0500
@@ -166,8 +166,8 @@ int
tcp_output(struct tcpcb *tp)
{
struct socket *so = tp->t_inpcb->inp_socket;
- long len, recwin, sendwin;
- int off, flags, error = 0; /* Keep compiler happy */
+ long len, recwin, sendwin, tso_tlen;
+ int cnt, off, flags, error = 0; /* Keep compiler happy */
struct mbuf *m;
struct ip *ip = NULL;
struct ipovly *ipov = NULL;
@@ -780,6 +780,24 @@ send:
}
/*
+ * Limit the number of TSO transmit segments (mbufs
+ * in mbuf list) to tp->t_tsomaxsegs.
+ */
+ cnt = 0;
+ m = sbsndmbuf(&so->so_snd, off, &tso_tlen);
+ while (m != NULL && cnt < tp->t_tsomaxsegs &&
+ tso_tlen < len) {
+ if (cnt > 0)
+ tso_tlen += m->m_len;
+ cnt++;
+ m = m->m_next;
+ }
+ if (m != NULL && tso_tlen < len) {
+ len = tso_tlen;
+ sendalot = 1;
+ }
+
+ /*
* Prevent the last segment from being
* fractional unless the send sockbuf can
* be emptied.
--- netinet/tcp_subr.c.sav 2014-01-30 19:44:35.000000000 -0500
+++ netinet/tcp_subr.c 2014-01-30 20:56:12.000000000 -0500
@@ -1800,6 +1800,12 @@ tcp_maxmtu(struct in_conninfo *inc, stru
ifp->if_hwassist & CSUM_TSO)
cap->ifcap |= CSUM_TSO;
cap->tsomax = ifp->if_hw_tsomax;
+#ifdef notyet
+ cap->tsomaxsegs = ifp->if_hw_tsomaxsegs;
+#endif
+ if (cap->tsomaxsegs == 0)
+ cap->tsomaxsegs =
+ TCPTSO_MAX_TX_SEGS_DEFAULT;
}
RTFREE(sro.ro_rt);
}
--- netinet/tcp_var.h.sav 2014-01-30 19:39:22.000000000 -0500
+++ netinet/tcp_var.h 2014-01-30 20:52:57.000000000 -0500
@@ -209,6 +209,7 @@ struct tcpcb {
u_int t_keepcnt; /* number of keepalives before close */
u_int t_tsomax; /* tso burst length limit */
+ u_int t_tsomaxsegs; /* tso burst segment limit */
uint32_t t_ispare[8]; /* 5 UTO, 3 TBD */
void *t_pspare2[4]; /* 4 TBD */
@@ -268,6 +269,11 @@ struct tcpcb {
#define TCPOOB_HAVEDATA 0x01
#define TCPOOB_HADDATA 0x02
+/*
+ * Default value for TSO maximum number of transmit segments (count of mbufs).
+ */
+#define TCPTSO_MAX_TX_SEGS_DEFAULT 30
+
#ifdef TCP_SIGNATURE
/*
* Defines which are needed by the xform_tcp module and tcp_[in|out]put
@@ -333,6 +339,7 @@ struct hc_metrics_lite { /* must stay in
struct tcp_ifcap {
int ifcap;
u_int tsomax;
+ u_int tsomaxsegs;
};
#ifndef _NETINET_IN_PCB_H_
[-- Attachment #3 --]
--- net/if_lagg.c.sav 2014-03-26 21:04:36.000000000 -0400
+++ net/if_lagg.c 2014-03-26 21:07:57.000000000 -0400
@@ -425,6 +425,7 @@ lagg_capabilities(struct lagg_softc *sc)
struct lagg_port *lp;
int cap = ~0, ena = ~0;
u_long hwa = ~0UL;
+ u_int hw_tsomax = IP_MAXPACKET;
LAGG_WLOCK_ASSERT(sc);
@@ -433,6 +434,8 @@ lagg_capabilities(struct lagg_softc *sc)
cap &= lp->lp_ifp->if_capabilities;
ena &= lp->lp_ifp->if_capenable;
hwa &= lp->lp_ifp->if_hwassist;
+ if (lp->lp_ifp->if_hw_tsomax < hw_tsomax)
+ hw_tsomax = lp->lp_ifp->if_hw_tsomax;
}
cap = (cap == ~0 ? 0 : cap);
ena = (ena == ~0 ? 0 : ena);
@@ -440,10 +443,12 @@ lagg_capabilities(struct lagg_softc *sc)
if (sc->sc_ifp->if_capabilities != cap ||
sc->sc_ifp->if_capenable != ena ||
- sc->sc_ifp->if_hwassist != hwa) {
+ sc->sc_ifp->if_hwassist != hwa ||
+ sc->sc_ifp->if_hw_tsomax != hw_tsomax) {
sc->sc_ifp->if_capabilities = cap;
sc->sc_ifp->if_capenable = ena;
sc->sc_ifp->if_hwassist = hwa;
+ sc->sc_ifp->if_hw_tsomax = hw_tsomax;
getmicrotime(&sc->sc_ifp->if_lastchange);
if (sc->sc_ifflags & IFF_DEBUG)
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?1292881633.1858906.1395960241007.JavaMail.root>
