Date: Mon, 17 Mar 2014 21:26:23 -0400 (EDT)
From: Rick Macklem <rmacklem@uoguelph.ca>
To: FreeBSD Filesystems <freebsd-fs@freebsd.org>
Cc: Alexander Motin <mav@freebsd.org>
Subject: review/test: NFS patch to use pagesize mbuf clusters
Message-ID: <570922189.23999456.1395105983047.JavaMail.root@uoguelph.ca>
In-Reply-To: <1351117550.23999435.1395105975009.JavaMail.root@uoguelph.ca>
[-- Attachment #1 --]
Hi,
Several of the TSO-capable network interfaces have a limit of
32 mbufs in the transmit mbuf chain (the drivers call these transmit
segments, which I admit I find confusing).
For a 64K read/readdir reply or 64K write request, NFS passes
a list of 34 mbufs down to TCP. TCP will split the list, since
it is slightly more than 64K bytes, but that split will normally
be a copy by reference of the last mbuf cluster. As such, normally
the network interface will get a list of 34 mbufs.
For TSO-enabled interfaces that are limited to 32 mbufs in the
list, the usual workaround in the driver is to copy { real copy,
not copy by reference } the list to 32 mbuf clusters via m_defrag().
(A few drivers use m_collapse() which is less likely to succeed.)
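
For anyone not familiar with that driver-side workaround, it usually
looks something like the sketch below. The helper name and the
hard-coded limit of 32 are made up for illustration (real drivers
typically do this from their encap/start routine, often after a DMA
load fails with EFBIG); it is not taken from any particular driver.

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/errno.h>
#include <sys/mbuf.h>

static int
example_fixup_tx_chain(struct mbuf **m_head)
{
	struct mbuf *m;
	int nsegs;

	/* Count the mbufs in the chain (the "transmit segments"). */
	nsegs = 0;
	for (m = *m_head; m != NULL; m = m->m_next)
		nsegs++;

	if (nsegs > 32) {
		/*
		 * Real copy of the data into a shorter chain of 2K
		 * clusters. This copy (and its possible failure) is
		 * the overhead the patch tries to avoid triggering.
		 */
		m = m_defrag(*m_head, M_NOWAIT);
		if (m == NULL) {
			/* Copy failed; the packet is never transmitted. */
			m_freem(*m_head);
			*m_head = NULL;
			return (ENOBUFS);
		}
		*m_head = m;
	}
	return (0);
}
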
As a workaround to this problem, the attached patch modifies NFS
to use larger pagesize clusters, so that the 64K RPC message is
in 18 mbufs (assuming a 4K pagesize).
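(To put numbers on that: with the regular 2K MCLBYTES clusters, 64K
of data takes 32 clusters plus a couple of small mbufs for the RPC
header, which is where the 34 mbuf chain above comes from; with 4K
MJUMPAGESIZE clusters the same data fits in 16 clusters, so the
chain drops to roughly 18 mbufs.)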
Testing on my slow hardware, which does not have TSO capability,
shows it to be performance neutral, but I believe avoiding the
overhead of copying via m_defrag() { and possible failures
resulting in the message never being transmitted } makes this
patch worth doing.
As such, I'd like to request review and/or testing of this patch
by anyone who can do so.
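
In case it helps reviewers, the heart of the change is an allocation
pattern roughly like this (condensed from the nfsvno_read() and
nfsm_uiombuf() hunks below, so take the patch itself as the
authoritative version):

	m = NULL;
#if MJUMPAGESIZE != MCLBYTES
	/* Try for a pagesize cluster, but don't sleep on "btalloc". */
	m = m_getjcl(M_NOWAIT, MT_DATA, 0, MJUMPAGESIZE);
#endif
	if (m == NULL)
		/* Always returns an mbuf, but maybe without a cluster. */
		NFSMCLGET(m, M_NOWAIT);
	m->m_len = 0;
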
Thanks in advance for your help, rick
ps: If you don't get the attachment, just email me and I'll
send you a copy.
[-- Attachment #2 --]
--- fs/nfsserver/nfs_nfsdport.c.sav2 2014-01-26 18:54:29.000000000 -0500
+++ fs/nfsserver/nfs_nfsdport.c 2014-03-16 23:22:56.000000000 -0400
@@ -566,8 +566,7 @@ nfsvno_readlink(struct vnode *vp, struct
len = 0;
i = 0;
while (len < NFS_MAXPATHLEN) {
- NFSMGET(mp);
- MCLGET(mp, M_WAITOK);
+ NFSMCLGET(mp, M_NOWAIT);
mp->m_len = NFSMSIZ(mp);
if (len == 0) {
mp3 = mp2 = mp;
@@ -621,7 +620,7 @@ nfsvno_read(struct vnode *vp, off_t off,
struct thread *p, struct mbuf **mpp, struct mbuf **mpendp)
{
struct mbuf *m;
- int i;
+ int do_pagesize, i;
struct iovec *iv;
struct iovec *iv2;
int error = 0, len, left, siz, tlen, ioflag = 0;
@@ -630,14 +629,33 @@ nfsvno_read(struct vnode *vp, off_t off,
struct nfsheur *nh;
len = left = NFSM_RNDUP(cnt);
+ do_pagesize = 0;
+#if MJUMPAGESIZE != MCLBYTES
+ if (left > MCLBYTES)
+ do_pagesize = 1;
+#endif
m3 = NULL;
/*
* Generate the mbuf list with the uio_iov ref. to it.
*/
i = 0;
while (left > 0) {
- NFSMGET(m);
- MCLGET(m, M_WAITOK);
+ /*
+ * For large reads, try and acquire MJUMPAGESIZE clusters.
+ * However, do so with M_NOWAIT so the thread can't get
+ * stuck sleeping on "btalloc".
+ * If this fails, use NFSMCLGET(..M_NOWAIT), which does an
+ * MGET(..M_WAITOK) followed by a MCLGET(..M_NOWAIT). The
+ * MCLGET(..M_NOWAIT) may not get a cluster, but will drain
+ * the mbuf cluster zone when it fails.
+ * As such, an mbuf will always be allocated and most likely
+ * it will have a cluster.
+ */
+ m = NULL;
+ if (do_pagesize != 0)
+ m = m_getjcl(M_NOWAIT, MT_DATA, 0, MJUMPAGESIZE);
+ if (m == NULL)
+ NFSMCLGET(m, M_NOWAIT);
m->m_len = 0;
siz = min(M_TRAILINGSPACE(m), left);
left -= siz;
@@ -1653,10 +1671,10 @@ again:
if (siz == 0) {
vput(vp);
if (nd->nd_flag & ND_NFSV2) {
- NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
+ NFSM_BUILD_PAGEMBCL(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
} else {
nfsrv_postopattr(nd, getret, &at);
- NFSM_BUILD(tl, u_int32_t *, 4 * NFSX_UNSIGNED);
+ NFSM_BUILD_PAGEMBCL(tl, u_int32_t *, 4 * NFSX_UNSIGNED);
txdr_hyper(at.na_filerev, tl);
tl += 2;
}
@@ -1708,7 +1726,7 @@ again:
*/
if (nd->nd_flag & ND_NFSV3) {
nfsrv_postopattr(nd, getret, &at);
- NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
+ NFSM_BUILD_PAGEMBCL(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
txdr_hyper(at.na_filerev, tl);
dirlen = NFSX_V3POSTOPATTR + NFSX_VERF + 2 * NFSX_UNSIGNED;
} else {
@@ -1734,20 +1752,24 @@ again:
* the dirent entry.
*/
if (nd->nd_flag & ND_NFSV3) {
- NFSM_BUILD(tl, u_int32_t *, 3 * NFSX_UNSIGNED);
+ NFSM_BUILD_PAGEMBCL(tl, u_int32_t *,
+ 3 * NFSX_UNSIGNED);
*tl++ = newnfs_true;
*tl++ = 0;
} else {
- NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
+ NFSM_BUILD_PAGEMBCL(tl, u_int32_t *,
+ 2 * NFSX_UNSIGNED);
*tl++ = newnfs_true;
}
*tl = txdr_unsigned(dp->d_fileno);
(void) nfsm_strtom(nd, dp->d_name, nlen);
if (nd->nd_flag & ND_NFSV3) {
- NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
+ NFSM_BUILD_PAGEMBCL(tl, u_int32_t *,
+ 2 * NFSX_UNSIGNED);
*tl++ = 0;
} else
- NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
+ NFSM_BUILD_PAGEMBCL(tl, u_int32_t *,
+ NFSX_UNSIGNED);
*tl = txdr_unsigned(*cookiep);
}
cpos += dp->d_reclen;
@@ -1757,7 +1779,7 @@ again:
}
if (cpos < cend)
eofflag = 0;
- NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
+ NFSM_BUILD_PAGEMBCL(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
*tl++ = newnfs_false;
if (eofflag)
*tl = newnfs_true;
@@ -1928,7 +1950,7 @@ again:
vput(vp);
if (nd->nd_flag & ND_NFSV3)
nfsrv_postopattr(nd, getret, &at);
- NFSM_BUILD(tl, u_int32_t *, 4 * NFSX_UNSIGNED);
+ NFSM_BUILD_PAGEMBCL(tl, u_int32_t *, 4 * NFSX_UNSIGNED);
txdr_hyper(at.na_filerev, tl);
tl += 2;
*tl++ = newnfs_false;
@@ -2031,7 +2053,7 @@ again:
} else {
dirlen = NFSX_VERF + 2 * NFSX_UNSIGNED;
}
- NFSM_BUILD(tl, u_int32_t *, NFSX_VERF);
+ NFSM_BUILD_PAGEMBCL(tl, u_int32_t *, NFSX_VERF);
txdr_hyper(at.na_filerev, tl);
/*
@@ -2186,12 +2208,14 @@ again:
* Build the directory record xdr
*/
if (nd->nd_flag & ND_NFSV3) {
- NFSM_BUILD(tl, u_int32_t *, 3 * NFSX_UNSIGNED);
+ NFSM_BUILD_PAGEMBCL(tl, u_int32_t *,
+ 3 * NFSX_UNSIGNED);
*tl++ = newnfs_true;
*tl++ = 0;
*tl = txdr_unsigned(dp->d_fileno);
dirlen += nfsm_strtom(nd, dp->d_name, nlen);
- NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
+ NFSM_BUILD_PAGEMBCL(tl, u_int32_t *,
+ 2 * NFSX_UNSIGNED);
*tl++ = 0;
*tl = txdr_unsigned(*cookiep);
nfsrv_postopattr(nd, 0, nvap);
@@ -2200,7 +2224,8 @@ again:
if (nvp != NULL)
vput(nvp);
} else {
- NFSM_BUILD(tl, u_int32_t *, 3 * NFSX_UNSIGNED);
+ NFSM_BUILD_PAGEMBCL(tl, u_int32_t *,
+ 3 * NFSX_UNSIGNED);
*tl++ = newnfs_true;
*tl++ = 0;
*tl = txdr_unsigned(*cookiep);
@@ -2267,7 +2292,7 @@ again:
} else if (cpos < cend)
eofflag = 0;
if (!nd->nd_repstat) {
- NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
+ NFSM_BUILD_PAGEMBCL(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
*tl++ = newnfs_false;
if (eofflag)
*tl = newnfs_true;
--- fs/nfsclient/nfs_clcomsubs.c.sav2 2014-02-01 20:47:07.000000000 -0500
+++ fs/nfsclient/nfs_clcomsubs.c 2014-03-16 23:22:06.000000000 -0400
@@ -155,7 +155,7 @@ nfscl_reqstart(struct nfsrv_descript *nd
* Get the first mbuf for the request.
*/
if (nfs_bigrequest[procnum])
- NFSMCLGET(mb, M_WAITOK);
+ NFSMCLGET(mb, M_NOWAIT);
else
NFSMGET(mb);
mbuf_setlen(mb, 0);
@@ -267,9 +267,29 @@ nfsm_uiombuf(struct nfsrv_descript *nd,
while (left > 0) {
mlen = M_TRAILINGSPACE(mp);
if (mlen == 0) {
- if (clflg)
- NFSMCLGET(mp, M_WAITOK);
- else
+ if (clflg != 0) {
+ /*
+ * For large writes, try and acquire
+ * MJUMPAGESIZE clusters.
+ * However, do so with M_NOWAIT so the
+ * thread can't get stuck sleeping on
+ * "btalloc". If this fails, use
+ * NFSMCLGET(..M_NOWAIT), which does an
+ * MGET(..M_WAITOK) followed by a
+ * MCLGET(..M_NOWAIT). This may not get
+ * a cluster, but will drain the mbuf
+ * cluster zone when it fails.
+ * As such, an mbuf will always be
+ * allocated and most likely it will
+ * have a cluster.
+ */
+#if MJUMPAGESIZE != MCLBYTES
+ mp = m_getjcl(M_NOWAIT, MT_DATA, 0,
+ MJUMPAGESIZE);
+ if (mp == NULL)
+#endif
+ NFSMCLGET(mp, M_NOWAIT);
+ } else
NFSMGET(mp);
mbuf_setlen(mp, 0);
mbuf_setnext(mp2, mp);
--- fs/nfs/nfsm_subs.h.sav2 2014-02-01 19:51:12.000000000 -0500
+++ fs/nfs/nfsm_subs.h 2014-03-13 18:54:27.000000000 -0400
@@ -89,6 +89,37 @@ nfsm_build(struct nfsrv_descript *nd, in
#define NFSM_BUILD(a, c, s) ((a) = (c)nfsm_build(nd, (s)))
+/*
+ * Same as above, but allocates MJUMPAGESIZE mbuf clusters, if possible.
+ */
+static __inline void *
+nfsm_build_pagembcl(struct nfsrv_descript *nd, int siz)
+{
+ void *retp;
+ struct mbuf *mb2;
+
+ if (siz > M_TRAILINGSPACE(nd->nd_mb)) {
+ mb2 = NULL;
+#if MJUMPAGESIZE != MCLBYTES
+ mb2 = m_getjcl(M_NOWAIT, MT_DATA, 0, MJUMPAGESIZE);
+#endif
+ if (mb2 == NULL)
+ NFSMCLGET(mb2, M_NOWAIT);
+ if (siz > MLEN)
+ panic("build > MLEN");
+ mbuf_setlen(mb2, 0);
+ nd->nd_bpos = NFSMTOD(mb2, caddr_t);
+ nd->nd_mb->m_next = mb2;
+ nd->nd_mb = mb2;
+ }
+ retp = (void *)(nd->nd_bpos);
+ nd->nd_mb->m_len += siz;
+ nd->nd_bpos += siz;
+ return (retp);
+}
+
+#define NFSM_BUILD_PAGEMBCL(a, c, s) ((a) = (c)nfsm_build_pagembcl(nd, (s)))
+
static __inline void *
nfsm_dissect(struct nfsrv_descript *nd, int siz)
{
--- fs/nfs/nfsport.h.sav2 2014-02-13 19:03:22.000000000 -0500
+++ fs/nfs/nfsport.h 2014-02-13 19:14:24.000000000 -0500
@@ -138,6 +138,8 @@
/*
* Allocate mbufs. Must succeed and never set the mbuf ptr to NULL.
+ * Note that when NFSMCLGET(m, M_NOWAIT) is done, it still must allocate
+ * an mbuf (and can sleep), but might not get a cluster, in the worst case.
*/
#define NFSMGET(m) do { \
MGET((m), M_WAITOK, MT_DATA); \
