Skip site navigation (1)Skip section navigation (2)
Date:      Mon, 17 Mar 2014 21:26:23 -0400 (EDT)
From:      Rick Macklem <rmacklem@uoguelph.ca>
To:        FreeBSD Filesystems <freebsd-fs@freebsd.org>
Cc:        Alexander Motin <mav@freebsd.org>
Subject:   review/test: NFS patch to use pagesize mbuf clusters
Message-ID:  <570922189.23999456.1395105983047.JavaMail.root@uoguelph.ca>
In-Reply-To: <1351117550.23999435.1395105975009.JavaMail.root@uoguelph.ca>

next in thread | previous in thread | raw e-mail | index | archive | help

[-- Attachment #1 --]
Hi,

Several of the TSO capable network interfaces have a limit of
32 mbufs in the transmit mbuf chain (the drivers call these transmit
segments, which I admit I find confusing).

For a 64K read/readdir reply or 64K write request, NFS passes
a list of 34 mbufs down to TCP. TCP will split the list, since
it is slightly more than 64K bytes, but that split will normally
be a copy by reference of the last mbuf cluster. As such, normally
the network interface will get a list of 34 mbufs.

For TSO enabled interfaces that are limited to 32 mbufs in the
list, the usual workaround in the driver is to copy { real copy,
not copy by reference } the list to 32 mbuf clusters via m_defrag().
(A few drivers use m_collapse() which is less likely to succeed.)

As a workaround to this problem, the attached patch modifies NFS
to use larger pagesize clusters, so that the 64K RPC message is
in 18 mbufs (assuming a 4K pagesize).

Testing on my slow hardware which does not have TSO capability
shows it to be performance neutral, but I believe avoiding the
overhead of copying via m_defrag() { and possible failures
resulting in the message never being transmitted } makes this
patch worth doing.

As such, I'd like to request review and/or testing of this patch
by anyone who can do so.

Thanks in advance for your help, rick
ps: If you don't get the attachment, just email and I'll
    send you a copy.

[-- Attachment #2 --]
--- fs/nfsserver/nfs_nfsdport.c.sav2	2014-01-26 18:54:29.000000000 -0500
+++ fs/nfsserver/nfs_nfsdport.c	2014-03-16 23:22:56.000000000 -0400
@@ -566,8 +566,7 @@ nfsvno_readlink(struct vnode *vp, struct
 	len = 0;
 	i = 0;
 	while (len < NFS_MAXPATHLEN) {
-		NFSMGET(mp);
-		MCLGET(mp, M_WAITOK);
+		NFSMCLGET(mp, M_NOWAIT);
 		mp->m_len = NFSMSIZ(mp);
 		if (len == 0) {
 			mp3 = mp2 = mp;
@@ -621,7 +620,7 @@ nfsvno_read(struct vnode *vp, off_t off,
     struct thread *p, struct mbuf **mpp, struct mbuf **mpendp)
 {
 	struct mbuf *m;
-	int i;
+	int do_pagesize, i;
 	struct iovec *iv;
 	struct iovec *iv2;
 	int error = 0, len, left, siz, tlen, ioflag = 0;
@@ -630,14 +629,33 @@ nfsvno_read(struct vnode *vp, off_t off,
 	struct nfsheur *nh;
 
 	len = left = NFSM_RNDUP(cnt);
+	do_pagesize = 0;
+#if MJUMPAGESIZE != MCLBYTES
+	if (left > MCLBYTES)
+		do_pagesize = 1;
+#endif
 	m3 = NULL;
 	/*
 	 * Generate the mbuf list with the uio_iov ref. to it.
 	 */
 	i = 0;
 	while (left > 0) {
-		NFSMGET(m);
-		MCLGET(m, M_WAITOK);
+		/*
+		 * For large reads, try and acquire MJUMPAGESIZE clusters.
+		 * However, do so with M_NOWAIT so the thread can't get
+		 * stuck sleeping on "btalloc".
+		 * If this fails, use NFSMCLGET(..M_NOWAIT), which does an
+		 * MGET(..M_WAITOK) followed by a MCLGET(..M_NOWAIT).  The
+		 * MCLGET(..M_NOWAIT) may not get a cluster, but will drain
+		 * the mbuf cluster zone when it fails.
+		 * As such, an mbuf will always be allocated and most likely
+		 * it will have a cluster.
+		 */
+		m = NULL;
+		if (do_pagesize != 0)
+			m = m_getjcl(M_NOWAIT, MT_DATA, 0, MJUMPAGESIZE);
+		if (m == NULL)
+			NFSMCLGET(m, M_NOWAIT);
 		m->m_len = 0;
 		siz = min(M_TRAILINGSPACE(m), left);
 		left -= siz;
@@ -1653,10 +1671,10 @@ again:
 	if (siz == 0) {
 		vput(vp);
 		if (nd->nd_flag & ND_NFSV2) {
-			NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
+			NFSM_BUILD_PAGEMBCL(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
 		} else {
 			nfsrv_postopattr(nd, getret, &at);
-			NFSM_BUILD(tl, u_int32_t *, 4 * NFSX_UNSIGNED);
+			NFSM_BUILD_PAGEMBCL(tl, u_int32_t *, 4 * NFSX_UNSIGNED);
 			txdr_hyper(at.na_filerev, tl);
 			tl += 2;
 		}
@@ -1708,7 +1726,7 @@ again:
 	 */
 	if (nd->nd_flag & ND_NFSV3) {
 		nfsrv_postopattr(nd, getret, &at);
-		NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
+		NFSM_BUILD_PAGEMBCL(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
 		txdr_hyper(at.na_filerev, tl);
 		dirlen = NFSX_V3POSTOPATTR + NFSX_VERF + 2 * NFSX_UNSIGNED;
 	} else {
@@ -1734,20 +1752,24 @@ again:
 			 * the dirent entry.
 			 */
 			if (nd->nd_flag & ND_NFSV3) {
-				NFSM_BUILD(tl, u_int32_t *, 3 * NFSX_UNSIGNED);
+				NFSM_BUILD_PAGEMBCL(tl, u_int32_t *,
+				    3 * NFSX_UNSIGNED);
 				*tl++ = newnfs_true;
 				*tl++ = 0;
 			} else {
-				NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
+				NFSM_BUILD_PAGEMBCL(tl, u_int32_t *,
+				    2 * NFSX_UNSIGNED);
 				*tl++ = newnfs_true;
 			}
 			*tl = txdr_unsigned(dp->d_fileno);
 			(void) nfsm_strtom(nd, dp->d_name, nlen);
 			if (nd->nd_flag & ND_NFSV3) {
-				NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
+				NFSM_BUILD_PAGEMBCL(tl, u_int32_t *,
+				    2 * NFSX_UNSIGNED);
 				*tl++ = 0;
 			} else
-				NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
+				NFSM_BUILD_PAGEMBCL(tl, u_int32_t *,
+				    NFSX_UNSIGNED);
 			*tl = txdr_unsigned(*cookiep);
 		}
 		cpos += dp->d_reclen;
@@ -1757,7 +1779,7 @@ again:
 	}
 	if (cpos < cend)
 		eofflag = 0;
-	NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
+	NFSM_BUILD_PAGEMBCL(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
 	*tl++ = newnfs_false;
 	if (eofflag)
 		*tl = newnfs_true;
@@ -1928,7 +1950,7 @@ again:
 		vput(vp);
 		if (nd->nd_flag & ND_NFSV3)
 			nfsrv_postopattr(nd, getret, &at);
-		NFSM_BUILD(tl, u_int32_t *, 4 * NFSX_UNSIGNED);
+		NFSM_BUILD_PAGEMBCL(tl, u_int32_t *, 4 * NFSX_UNSIGNED);
 		txdr_hyper(at.na_filerev, tl);
 		tl += 2;
 		*tl++ = newnfs_false;
@@ -2031,7 +2053,7 @@ again:
 	} else {
 		dirlen = NFSX_VERF + 2 * NFSX_UNSIGNED;
 	}
-	NFSM_BUILD(tl, u_int32_t *, NFSX_VERF);
+	NFSM_BUILD_PAGEMBCL(tl, u_int32_t *, NFSX_VERF);
 	txdr_hyper(at.na_filerev, tl);
 
 	/*
@@ -2186,12 +2208,14 @@ again:
 			 * Build the directory record xdr
 			 */
 			if (nd->nd_flag & ND_NFSV3) {
-				NFSM_BUILD(tl, u_int32_t *, 3 * NFSX_UNSIGNED);
+				NFSM_BUILD_PAGEMBCL(tl, u_int32_t *,
+				    3 * NFSX_UNSIGNED);
 				*tl++ = newnfs_true;
 				*tl++ = 0;
 				*tl = txdr_unsigned(dp->d_fileno);
 				dirlen += nfsm_strtom(nd, dp->d_name, nlen);
-				NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
+				NFSM_BUILD_PAGEMBCL(tl, u_int32_t *,
+				    2 * NFSX_UNSIGNED);
 				*tl++ = 0;
 				*tl = txdr_unsigned(*cookiep);
 				nfsrv_postopattr(nd, 0, nvap);
@@ -2200,7 +2224,8 @@ again:
 				if (nvp != NULL)
 					vput(nvp);
 			} else {
-				NFSM_BUILD(tl, u_int32_t *, 3 * NFSX_UNSIGNED);
+				NFSM_BUILD_PAGEMBCL(tl, u_int32_t *,
+				    3 * NFSX_UNSIGNED);
 				*tl++ = newnfs_true;
 				*tl++ = 0;
 				*tl = txdr_unsigned(*cookiep);
@@ -2267,7 +2292,7 @@ again:
 	} else if (cpos < cend)
 		eofflag = 0;
 	if (!nd->nd_repstat) {
-		NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
+		NFSM_BUILD_PAGEMBCL(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
 		*tl++ = newnfs_false;
 		if (eofflag)
 			*tl = newnfs_true;
--- fs/nfsclient/nfs_clcomsubs.c.sav2	2014-02-01 20:47:07.000000000 -0500
+++ fs/nfsclient/nfs_clcomsubs.c	2014-03-16 23:22:06.000000000 -0400
@@ -155,7 +155,7 @@ nfscl_reqstart(struct nfsrv_descript *nd
 	 * Get the first mbuf for the request.
 	 */
 	if (nfs_bigrequest[procnum])
-		NFSMCLGET(mb, M_WAITOK);
+		NFSMCLGET(mb, M_NOWAIT);
 	else
 		NFSMGET(mb);
 	mbuf_setlen(mb, 0);
@@ -267,9 +267,29 @@ nfsm_uiombuf(struct nfsrv_descript *nd, 
 		while (left > 0) {
 			mlen = M_TRAILINGSPACE(mp);
 			if (mlen == 0) {
-				if (clflg)
-					NFSMCLGET(mp, M_WAITOK);
-				else
+				if (clflg != 0) {
+					/*
+					 * For large writes, try and acquire
+					 * MJUMPAGESIZE clusters.
+					 * However, do so with M_NOWAIT so the
+					 * thread can't get stuck sleeping on
+					 * "btalloc".  If this fails, use
+					 * NFSMCLGET(..M_NOWAIT), which does an
+					 * MGET(..M_WAITOK) followed by a
+					 * MCLGE T(..M_NOWAIT). This may not get
+					 * a cluster, but will drain the mbuf
+					 * cluster zone when it fails.
+					 * As such, an mbuf will always be
+					 * allocated and most likely it will
+					 * have a cluster.
+					 */
+#if MJUMPAGESIZE != MCLBYTES
+					mp = m_getjcl(M_NOWAIT, MT_DATA, 0,
+					    MJUMPAGESIZE);
+					if (mp == NULL)
+#endif
+						NFSMCLGET(mp, M_NOWAIT);
+				} else
 					NFSMGET(mp);
 				mbuf_setlen(mp, 0);
 				mbuf_setnext(mp2, mp);
--- fs/nfs/nfsm_subs.h.sav2	2014-02-01 19:51:12.000000000 -0500
+++ fs/nfs/nfsm_subs.h	2014-03-13 18:54:27.000000000 -0400
@@ -89,6 +89,37 @@ nfsm_build(struct nfsrv_descript *nd, in
 
 #define	NFSM_BUILD(a, c, s)	((a) = (c)nfsm_build(nd, (s)))
 
+/*
+ * Same as above, but allocates MJUMPAGESIZE mbuf clusters, if possible.
+ */
+static __inline void *
+nfsm_build_pagembcl(struct nfsrv_descript *nd, int siz)
+{
+	void *retp;
+	struct mbuf *mb2;
+
+	if (siz > M_TRAILINGSPACE(nd->nd_mb)) {
+		mb2 = NULL;
+#if MJUMPAGESIZE != MCLBYTES
+		mb2 = m_getjcl(M_NOWAIT, MT_DATA, 0, MJUMPAGESIZE);
+#endif
+		if (mb2 == NULL)
+			NFSMCLGET(mb2, M_NOWAIT);
+		if (siz > MLEN)
+			panic("build > MLEN");
+		mbuf_setlen(mb2, 0);
+		nd->nd_bpos = NFSMTOD(mb2, caddr_t);
+		nd->nd_mb->m_next = mb2;
+		nd->nd_mb = mb2;
+	}
+	retp = (void *)(nd->nd_bpos);
+	nd->nd_mb->m_len += siz;
+	nd->nd_bpos += siz;
+	return (retp);
+}
+
+#define	NFSM_BUILD_PAGEMBCL(a, c, s)	((a) = (c)nfsm_build_pagembcl(nd, (s)))
+
 static __inline void *
 nfsm_dissect(struct nfsrv_descript *nd, int siz)
 {
--- fs/nfs/nfsport.h.sav2	2014-02-13 19:03:22.000000000 -0500
+++ fs/nfs/nfsport.h	2014-02-13 19:14:24.000000000 -0500
@@ -138,6 +138,8 @@
 
 /*
  * Allocate mbufs. Must succeed and never set the mbuf ptr to NULL.
+ * Note that when NFSMCLGET(m, M_NOWAIT) is done, it still must allocate
+ * an mbuf (and can sleep), but might not get a cluster, in the worst case.
  */
 #define	NFSMGET(m)	do { 					\
 		MGET((m), M_WAITOK, MT_DATA); 			\

Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?570922189.23999456.1395105983047.JavaMail.root>