Skip site navigation (1)Skip section navigation (2)
Date:      Wed, 30 May 2012 07:59:24 -0700
From:      Colin Percival <cperciva@freebsd.org>
To:        freebsd-net@freebsd.org
Subject:   [please review] TSO mbuf chain length limiting patch
Message-ID:  <4FC635CC.5030608@freebsd.org>

next in thread | raw e-mail | index | archive | help
This is a multi-part message in MIME format.
--------------080401060809040608040406
Content-Type: text/plain; charset=ISO-8859-1
Content-Transfer-Encoding: 7bit

Hi all,

The Xen virtual network interface has an issue (ok, really the issue is with
the linux back-end, but that's what most people are using) where it can't
handle scatter-gather writes with lots of pieces, aka. long mbuf chains.
This currently bites us hard with TSO enabled, since it produces said long
mbuf chains.

I've attached a patch which:
(1) adds a IFCAP_MB_CHAIN_LIMIT "capability" and a if_mbuf_chain_limit field
to struct ifnet,
(2) sets these in netfront when the IFCAP_TSO4 flag is set,
(3) extends tcp_maxmtu to read the if_mbuf_chain_limit value (it doesn't
exactly fit here, but this is where we look up interface properties...),
(4) adds a tx_chain_limit field to struct tcpcb,
(5) makes tcp_mss_update set tx_chain_limit using tcp_maxmtu,
(6) adds a new m_copy_nbufs function which truncates the copied mbuf chain
after a specified number of mbufs, and returns the copied data length,
(7) uses these in tcp_output (rearranging some code so that if the output
length is modified, statistics are updated with the correct value).

I'm hoping to commit this and MFC it before 9.1-RELEASE; I believe I've
avoided breaking any ABIs.  In my testing on EC2 with 10 GbE and TSO turned
on, I get ~250-300 Mbps without this patch and 3-4 Gbps with this patch;
this replaces a patch I was using in my EC2 builds which did (6)&(7) above
with a hard-coded maximum mbuf chain length.

Please tell me all the things I did wrong. :-)

-- 
Colin Percival
Security Officer Emeritus, FreeBSD | The power to serve
Founder, Tarsnap | www.tarsnap.com | Online backups for the truly paranoid


--------------080401060809040608040406
Content-Type: text/plain; charset=us-ascii;
 name="tx_chain_limit.patch"
Content-Transfer-Encoding: 7bit
Content-Disposition: attachment;
 filename="tx_chain_limit.patch"

Index: netinet/tcp_input.c
===================================================================
--- netinet/tcp_input.c	(revision 235780)
+++ netinet/tcp_input.c	(working copy)
@@ -3301,6 +3301,7 @@
 	struct inpcb *inp = tp->t_inpcb;
 	struct hc_metrics_lite metrics;
 	int origoffer;
+	int tx_chain_limit = 0;
 #ifdef INET6
 	int isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0;
 	size_t min_protoh = isipv6 ?
@@ -3321,8 +3322,10 @@
 	/* Initialize. */
 #ifdef INET6
 	if (isipv6) {
-		maxmtu = tcp_maxmtu6(&inp->inp_inc, mtuflags);
+		maxmtu = tcp_maxmtu6_ext(&inp->inp_inc, mtuflags,
+		    &tx_chain_limit);
 		tp->t_maxopd = tp->t_maxseg = V_tcp_v6mssdflt;
+		tp->t_tx_chain_limit = tx_chain_limit;
 	}
 #endif
 #if defined(INET) && defined(INET6)
@@ -3330,8 +3333,10 @@
 #endif
 #ifdef INET
 	{
-		maxmtu = tcp_maxmtu(&inp->inp_inc, mtuflags);
+		maxmtu = tcp_maxmtu_ext(&inp->inp_inc, mtuflags,
+		    &tx_chain_limit);
 		tp->t_maxopd = tp->t_maxseg = V_tcp_mssdflt;
+		tp->t_tx_chain_limit = tx_chain_limit;
 	}
 #endif
 
Index: netinet/tcp_subr.c
===================================================================
--- netinet/tcp_subr.c	(revision 235780)
+++ netinet/tcp_subr.c	(working copy)
@@ -1708,7 +1708,7 @@
  * tcp_mss_update to get the peer/interface MTU.
  */
 u_long
-tcp_maxmtu(struct in_conninfo *inc, int *flags)
+tcp_maxmtu_ext(struct in_conninfo *inc, int *flags, int * tx_chain_limit)
 {
 	struct route sro;
 	struct sockaddr_in *dst;
@@ -1738,15 +1738,26 @@
 			    ifp->if_hwassist & CSUM_TSO)
 				*flags |= CSUM_TSO;
 		}
+		if (tx_chain_limit != NULL) {
+			if (ifp->if_capabilities & IFCAP_MB_CHAIN_LIMIT)
+				*tx_chain_limit = ifp->if_mbuf_chain_limit;
+		}
 		RTFREE(sro.ro_rt);
 	}
 	return (maxmtu);
 }
+
+u_long
+tcp_maxmtu(struct in_conninfo *inc, int *flags)
+{
+
+	return (tcp_maxmtu_ext(inc, flags, NULL));
+}
 #endif /* INET */
 
 #ifdef INET6
 u_long
-tcp_maxmtu6(struct in_conninfo *inc, int *flags)
+tcp_maxmtu6_ext(struct in_conninfo *inc, int *flags, int * tx_chain_limit)
 {
 	struct route_in6 sro6;
 	struct ifnet *ifp;
@@ -1775,11 +1786,22 @@
 			    ifp->if_hwassist & CSUM_TSO)
 				*flags |= CSUM_TSO;
 		}
+		if (tx_chain_limit != NULL) {
+			if (ifp->if_capabilities & IFCAP_MB_CHAIN_LIMIT)
+				*tx_chain_limit = ifp->if_mbuf_chain_limit;
+		}
 		RTFREE(sro6.ro_rt);
 	}
 
 	return (maxmtu);
 }
+
+u_long
+tcp_maxmtu6(struct in_conninfo *inc, int *flags)
+{
+
+	return (tcp_maxmtu6_ext(inc, flags, NULL));
+}
 #endif /* INET6 */
 
 #ifdef IPSEC
Index: netinet/tcp_var.h
===================================================================
--- netinet/tcp_var.h	(revision 235780)
+++ netinet/tcp_var.h	(working copy)
@@ -208,7 +208,8 @@
 	u_int	t_keepintvl;		/* interval between keepalives */
 	u_int	t_keepcnt;		/* number of keepalives before close */
 
-	uint32_t t_ispare[8];		/* 5 UTO, 3 TBD */
+	uint32_t t_tx_chain_limit;	/* max # chained mbufs to tx for TSO */
+	uint32_t t_ispare[7];		/* 5 UTO, 2 TBD */
 	void	*t_pspare2[4];		/* 4 TBD */
 	uint64_t _pad[6];		/* 6 TBD (1-2 CC/RTT?) */
 };
@@ -674,7 +675,9 @@
 #endif
 void	 tcp_input(struct mbuf *, int);
 u_long	 tcp_maxmtu(struct in_conninfo *, int *);
+u_long	 tcp_maxmtu_ext(struct in_conninfo *, int *, int *);
 u_long	 tcp_maxmtu6(struct in_conninfo *, int *);
+u_long	 tcp_maxmtu6_ext(struct in_conninfo *, int *, int *);
 void	 tcp_mss_update(struct tcpcb *, int, int, struct hc_metrics_lite *,
 	    int *);
 void	 tcp_mss(struct tcpcb *, int);
Index: netinet/tcp_output.c
===================================================================
--- netinet/tcp_output.c	(revision 235780)
+++ netinet/tcp_output.c	(working copy)
@@ -801,16 +801,6 @@
 		struct mbuf *mb;
 		u_int moff;
 
-		if ((tp->t_flags & TF_FORCEDATA) && len == 1)
-			TCPSTAT_INC(tcps_sndprobe);
-		else if (SEQ_LT(tp->snd_nxt, tp->snd_max) || sack_rxmit) {
-			tp->t_sndrexmitpack++;
-			TCPSTAT_INC(tcps_sndrexmitpack);
-			TCPSTAT_ADD(tcps_sndrexmitbyte, len);
-		} else {
-			TCPSTAT_INC(tcps_sndpack);
-			TCPSTAT_ADD(tcps_sndbyte, len);
-		}
 		MGETHDR(m, M_DONTWAIT, MT_DATA);
 		if (m == NULL) {
 			SOCKBUF_UNLOCK(&so->so_snd);
@@ -842,7 +832,8 @@
 			    mtod(m, caddr_t) + hdrlen);
 			m->m_len += len;
 		} else {
-			m->m_next = m_copy(mb, moff, (int)len);
+			m->m_next = m_copy_nbufs(mb, moff, (int)len,
+			    M_DONTWAIT, &len, tp->t_tx_chain_limit);
 			if (m->m_next == NULL) {
 				SOCKBUF_UNLOCK(&so->so_snd);
 				(void) m_free(m);
@@ -851,6 +842,18 @@
 			}
 		}
 
+		/* Update stats here as m_copy_nbufs may have adjusted len. */
+		if ((tp->t_flags & TF_FORCEDATA) && len == 1)
+			TCPSTAT_INC(tcps_sndprobe);
+		else if (SEQ_LT(tp->snd_nxt, tp->snd_max) || sack_rxmit) {
+			tp->t_sndrexmitpack++;
+			TCPSTAT_INC(tcps_sndrexmitpack);
+			TCPSTAT_ADD(tcps_sndrexmitbyte, len);
+		} else {
+			TCPSTAT_INC(tcps_sndpack);
+			TCPSTAT_ADD(tcps_sndbyte, len);
+		}
+
 		/*
 		 * If we're sending everything we've got, set PUSH.
 		 * (This will keep happy those implementations which only
Index: sys/mbuf.h
===================================================================
--- sys/mbuf.h	(revision 235780)
+++ sys/mbuf.h	(working copy)
@@ -894,6 +894,7 @@
 		    int, int, int, int);
 struct mbuf	*m_copypacket(struct mbuf *, int);
 void		 m_copy_pkthdr(struct mbuf *, struct mbuf *);
+struct mbuf	*m_copy_nbufs(struct mbuf *, int, int, int, long *, uint32_t);
 struct mbuf	*m_copyup(struct mbuf *n, int len, int dstoff);
 struct mbuf	*m_defrag(struct mbuf *, int);
 void		 m_demote(struct mbuf *, int);
Index: kern/uipc_mbuf.c
===================================================================
--- kern/uipc_mbuf.c	(revision 235780)
+++ kern/uipc_mbuf.c	(working copy)
@@ -525,12 +525,14 @@
  * only their reference counts are incremented.
  */
 struct mbuf *
-m_copym(struct mbuf *m, int off0, int len, int wait)
+m_copy_nbufs(struct mbuf *m, int off0, int len, int wait, long * outlen,
+    uint32_t nbufmax)
 {
 	struct mbuf *n, **np;
 	int off = off0;
 	struct mbuf *top;
 	int copyhdr = 0;
+	int len_orig = len;
 
 	KASSERT(off >= 0, ("m_copym, negative off %d", off));
 	KASSERT(len >= 0, ("m_copym, negative len %d", len));
@@ -546,7 +548,7 @@
 	}
 	np = &top;
 	top = 0;
-	while (len > 0) {
+	while (len > 0 && --nbufmax != 0) {
 		if (m == NULL) {
 			KASSERT(len == M_COPYALL, 
 			    ("m_copym, length > size of mbuf chain"));
@@ -584,6 +586,9 @@
 	if (top == NULL)
 		mbstat.m_mcfail++;	/* XXX: No consistency. */
 
+	if (outlen)
+		*outlen = len_orig - len;
+
 	return (top);
 nospace:
 	m_freem(top);
@@ -591,6 +596,13 @@
 	return (NULL);
 }
 
+struct mbuf *
+m_copym(struct mbuf *m, int off0, int len, int wait)
+{
+
+	return (m_copy_nbufs(m, off0, len, wait, NULL, 0));
+}
+
 /*
  * Returns mbuf chain with new head for the prepending case.
  * Copies from mbuf (chain) n from off for len to mbuf (chain) m
Index: dev/xen/netfront/netfront.c
===================================================================
--- dev/xen/netfront/netfront.c	(revision 235780)
+++ dev/xen/netfront/netfront.c	(working copy)
@@ -2040,6 +2040,9 @@
 	if (val) {
 		np->xn_ifp->if_capabilities |= IFCAP_TSO4|IFCAP_LRO;
 		printf(" feature-gso-tcp4");
+
+		np->xn_ifp->if_capabilities |= IFCAP_MB_CHAIN_LIMIT;
+		np->xn_ifp->if_mbuf_chain_limit = np->maxfrags;
 	}
 
 	printf("\n");
Index: net/if.h
===================================================================
--- net/if.h	(revision 235780)
+++ net/if.h	(working copy)
@@ -230,6 +230,7 @@
 #define	IFCAP_VLAN_HWTSO	0x40000 /* can do IFCAP_TSO on VLANs */
 #define	IFCAP_LINKSTATE		0x80000 /* the runtime link state is dynamic */
 #define	IFCAP_NETMAP		0x100000 /* netmap mode supported/enabled */
+#define	IFCAP_MB_CHAIN_LIMIT	0x200000 /* TX mbuf chain length limited */
 
 #define IFCAP_HWCSUM	(IFCAP_RXCSUM | IFCAP_TXCSUM)
 #define	IFCAP_TSO	(IFCAP_TSO4 | IFCAP_TSO6)
Index: net/if_var.h
===================================================================
--- net/if_var.h	(revision 235780)
+++ net/if_var.h	(working copy)
@@ -205,7 +205,8 @@
 	 * be used with care where binary compatibility is required.
 	 */
 	char	if_cspare[3];
-	int	if_ispare[4];
+	int	if_mbuf_chain_limit;	/* if IFCAP_MB_CHAIN_LIMIT */
+	int	if_ispare[3];
 	void	*if_pspare[8];		/* 1 netmap, 7 TDB */
 };
 


--------------080401060809040608040406--



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?4FC635CC.5030608>