Skip site navigation (1)Skip section navigation (2)
Date:      Thu, 6 Feb 2014 20:58:35 -0500 (EST)
From:      Rick Macklem <rmacklem@uoguelph.ca>
To:        Garrett Wollman <wollman@freebsd.org>
Cc:        freebsd-net@freebsd.org, Alexander Motin <mav@freebsd.org>
Subject:   Re: Terrible NFS performance under 9.2-RELEASE?
Message-ID:  <1261040377.1982994.1391738315028.JavaMail.root@uoguelph.ca>
In-Reply-To: <CABXB=RRFfparjXm7_f6aaWHHbpUBoDWOLsjTyWdZmyKx3d2zAw@mail.gmail.com>

next in thread | previous in thread | raw e-mail | index | archive | help

[-- Attachment #1 --]
Garrett Wollman wrote:
> The real big improvement, which I have not tried to implement, would
> be to use physical pages (via sfbufs) by sharing the inner loop of
> sendfile(2).  Since I use ZFS as my backing filesystem, I'm not sure
> this would have any benefit for me, but it should be a measurable
> improvement for UFS-backed NFS servers.
(I didn't have this email handy, so I just cut/pasted this paragraph.)

Well, I'm far from sure this is a good idea at this point, but I've
attached a patch that seems to work for a short test of an export of
a UFS volume. (If there is no v_object for ZFS vnodes, it will print
out a message with the error# for EINVAL.) It would need some serious
review and testing before it would be anywhere near ready for head.
(For example, since I hold a LK_SHARED locked vnode which was not
 VI_DOOMED when it was locked, I don't know if I need to acquire a
 reference count on the vm object or if I need to check for OBJ_DEAD?)
The patch checks for OBJ_DEAD, but does not acquire a reference count.
(The reference count part is #ifdef notnow.)

Like most of these things, I don't see a measurable difference on my
old single core i386 hardware with 100Mbps networking, but that only
indicates it might not be a serious regression.

The patch doesn't have yours applied to it, but it should be easy
to integrate the two, since it just adds a call to nfsrv_file_mbuf()
at the beginning of nfsvno_read().

Good luck with the testing, rick
ps: It would be nice to know if it works for ZFS?
pss: If the patch doesn't make it through on the list and you want a
     copy, just email me.


[-- Attachment #2 --]
--- kern/uipc_syscalls.c.sav3	2014-02-04 19:22:52.000000000 -0500
+++ kern/uipc_syscalls.c	2014-02-06 18:23:47.000000000 -0500
@@ -1950,6 +1950,176 @@ freebsd4_sendfile(struct thread *td, str
 }
 #endif /* COMPAT_FREEBSD4 */
 
+/*
+ * The inner loop of kern_sendfile() extracted out so that it can be
+ * used by the NFS server as well.
+ */
+int
+kern_sendfile_mbuf(struct vnode *vp, struct vm_object *obj, struct mbuf **mp,
+    struct mbuf **mtailp, off_t off, off_t *remp, int bsize, int *donep,
+    int space, int mnw, int wait_forall, int flags, int ioflags, void *sfsp,
+    struct ucred *acred, struct thread *td)
+{
+	off_t xfsize, startrem;
+	vm_pindex_t pindex;
+	vm_offset_t pgoff;
+	struct vm_page *pg;
+	struct mbuf *m0, *m, *mtail;
+	struct sf_buf *sf;
+	int error = 0, loopbytes;
+	ssize_t resid;
+	int readahead = sfreadahead * MAXBSIZE;
+	struct sendfile_sync *sfs = (struct sendfile_sync *)sfsp;
+
+	m = *mp;
+	mtail = *mtailp;
+	startrem = *remp;
+	loopbytes = 0;
+	/*
+	 * Loop and construct maximum sized mbuf chain to be bulk
+	 * dumped into socket buffer.
+	 */
+	while (space > loopbytes) {
+		/*
+		 * Calculate the amount to transfer.
+		 * Not to exceed a page, the EOF,
+		 * or the passed in nbytes.
+		 */
+		pgoff = (vm_offset_t)(off & PAGE_MASK);
+		*remp = startrem - loopbytes;
+		xfsize = omin(PAGE_SIZE - pgoff, *remp);
+		xfsize = omin(space - loopbytes, xfsize);
+		if (xfsize <= 0) {
+			*donep = 1;		/* all data sent */
+			break;
+		}
+
+		/*
+		 * Attempt to look up the page.  Allocate
+		 * if not found or wait and loop if busy.
+		 */
+		pindex = OFF_TO_IDX(off);
+		VM_OBJECT_WLOCK(obj);
+		pg = vm_page_grab(obj, pindex, VM_ALLOC_NOBUSY |
+		    VM_ALLOC_NORMAL | VM_ALLOC_WIRED | VM_ALLOC_RETRY);
+
+		/*
+		 * Check if page is valid for what we need,
+		 * otherwise initiate I/O.
+		 * If we already turned some pages into mbufs,
+		 * send them off before we come here again and
+		 * block.
+		 */
+		if (pg->valid && vm_page_is_valid(pg, pgoff, xfsize))
+			VM_OBJECT_WUNLOCK(obj);
+		else if (m != NULL && wait_forall == 0)
+			error = EAGAIN;	/* send what we already got */
+		else if (flags & SF_NODISKIO)
+			error = EBUSY;
+		else {
+			VM_OBJECT_WUNLOCK(obj);
+
+			/*
+			 * Get the page from backing store.
+			 * XXXMAC: Because we don't have fp->f_cred
+			 * here, we pass in NOCRED.  This is probably
+			 * wrong, but is consistent with our original
+			 * implementation.
+			 */
+			error = vn_rdwr(UIO_READ, vp, NULL, readahead,
+			    trunc_page(off), UIO_NOCOPY, ioflags |
+			    ((readahead / bsize) << IO_SEQSHIFT),
+			    td->td_ucred, acred, &resid, td);
+			SFSTAT_INC(sf_iocnt);
+			if (error)
+				VM_OBJECT_WLOCK(obj);
+		}
+		if (error) {
+			vm_page_lock(pg);
+			vm_page_unwire(pg, 0);
+			/*
+			 * See if anyone else might know about
+			 * this page.  If not and it is not valid,
+			 * then free it.
+			 */
+			if (pg->wire_count == 0 && pg->valid == 0 &&
+			    !vm_page_busied(pg))
+				vm_page_free(pg);
+			vm_page_unlock(pg);
+			VM_OBJECT_WUNLOCK(obj);
+			if (error == EAGAIN && wait_forall == 0)
+				error = 0;	/* not a real error */
+			break;
+		}
+
+		/*
+		 * Get a sendfile buf.  When allocating the
+		 * first buffer for mbuf chain, we usually
+		 * wait as long as necessary, but this wait
+		 * can be interrupted.  For consequent
+		 * buffers, do not sleep, since several
+		 * threads might exhaust the buffers and then
+		 * deadlock.
+		 */
+		sf = sf_buf_alloc(pg, (mnw || (m != NULL && wait_forall == 0)) ?
+		    SFB_NOWAIT : SFB_CATCH);
+		if (sf == NULL) {
+			SFSTAT_INC(sf_allocfail);
+			vm_page_lock(pg);
+			vm_page_unwire(pg, 0);
+			KASSERT(pg->object != NULL,
+			    ("kern_sendfile: object disappeared"));
+			vm_page_unlock(pg);
+			if (m == NULL)
+				error = (mnw ? EAGAIN : EINTR);
+			break;
+		}
+
+		/*
+		 * Get an mbuf and set it up as having
+		 * external storage.
+		 */
+		m0 = m_get((mnw ? M_NOWAIT : M_WAITOK), MT_DATA);
+		if (m0 == NULL) {
+			error = (mnw ? EAGAIN : ENOBUFS);
+			sf_buf_mext(NULL, sf);
+			break;
+		}
+		if (m_extadd(m0, (caddr_t )sf_buf_kva(sf), PAGE_SIZE,
+		    sf_buf_mext, sfs, sf, M_RDONLY, EXT_SFBUF,
+		    (mnw ? M_NOWAIT : M_WAITOK)) != 0) {
+			error = (mnw ? EAGAIN : ENOBUFS);
+			sf_buf_mext(NULL, sf);
+			m_freem(m0);
+			break;
+		}
+		m0->m_data = (char *)sf_buf_kva(sf) + pgoff;
+		m0->m_len = xfsize;
+
+		/* Append to mbuf chain. */
+		if (mtail != NULL)
+			mtail->m_next = m0;
+		else if (m != NULL)
+			m_last(m)->m_next = m0;
+		else
+			m = m0;
+		mtail = m0;
+
+		/* Keep track of bits processed. */
+		loopbytes += xfsize;
+		off += xfsize;
+
+		if (sfs != NULL) {
+			mtx_lock(&sfs->mtx);
+			sfs->count++;
+			mtx_unlock(&sfs->mtx);
+		}
+	}
+	*mp = m;
+	*mtailp = mtail;
+	return (error);
+}
+
 int
 kern_sendfile(struct thread *td, struct sendfile_args *uap,
     struct uio *hdr_uio, struct uio *trl_uio, int compat)
@@ -1959,10 +2129,8 @@ kern_sendfile(struct thread *td, struct 
 	struct vm_object *obj = NULL;
 	struct socket *so = NULL;
 	struct mbuf *m = NULL;
-	struct sf_buf *sf;
-	struct vm_page *pg;
 	struct vattr va;
-	off_t off, xfsize, fsbytes = 0, sbytes = 0, rem = 0;
+	off_t off, fsbytes = 0, sbytes = 0, rem = 0;
 	int error, hdrlen = 0, mnw = 0;
 	int bsize;
 	struct sendfile_sync *sfs = NULL;
@@ -2105,7 +2273,6 @@ kern_sendfile(struct thread *td, struct 
 	 */
 	for (off = uap->offset; ; ) {
 		struct mbuf *mtail;
-		int loopbytes;
 		int space;
 		int done;
 
@@ -2114,7 +2281,6 @@ kern_sendfile(struct thread *td, struct 
 			break;
 
 		mtail = NULL;
-		loopbytes = 0;
 		space = 0;
 		done = 0;
 
@@ -2193,157 +2359,15 @@ retry_space:
 			goto done;
 		}
 
-		/*
-		 * Loop and construct maximum sized mbuf chain to be bulk
-		 * dumped into socket buffer.
-		 */
-		while (space > loopbytes) {
-			vm_pindex_t pindex;
-			vm_offset_t pgoff;
-			struct mbuf *m0;
-
-			/*
-			 * Calculate the amount to transfer.
-			 * Not to exceed a page, the EOF,
-			 * or the passed in nbytes.
-			 */
-			pgoff = (vm_offset_t)(off & PAGE_MASK);
-			if (uap->nbytes)
-				rem = (uap->nbytes - fsbytes - loopbytes);
-			else
-				rem = va.va_size -
-				    uap->offset - fsbytes - loopbytes;
-			xfsize = omin(PAGE_SIZE - pgoff, rem);
-			xfsize = omin(space - loopbytes, xfsize);
-			if (xfsize <= 0) {
-				done = 1;		/* all data sent */
-				break;
-			}
-
-			/*
-			 * Attempt to look up the page.  Allocate
-			 * if not found or wait and loop if busy.
-			 */
-			pindex = OFF_TO_IDX(off);
-			VM_OBJECT_WLOCK(obj);
-			pg = vm_page_grab(obj, pindex, VM_ALLOC_NOBUSY |
-			    VM_ALLOC_NORMAL | VM_ALLOC_WIRED | VM_ALLOC_RETRY);
-
-			/*
-			 * Check if page is valid for what we need,
-			 * otherwise initiate I/O.
-			 * If we already turned some pages into mbufs,
-			 * send them off before we come here again and
-			 * block.
-			 */
-			if (pg->valid && vm_page_is_valid(pg, pgoff, xfsize))
-				VM_OBJECT_WUNLOCK(obj);
-			else if (m != NULL)
-				error = EAGAIN;	/* send what we already got */
-			else if (uap->flags & SF_NODISKIO)
-				error = EBUSY;
-			else {
-				ssize_t resid;
-				int readahead = sfreadahead * MAXBSIZE;
-
-				VM_OBJECT_WUNLOCK(obj);
-
-				/*
-				 * Get the page from backing store.
-				 * XXXMAC: Because we don't have fp->f_cred
-				 * here, we pass in NOCRED.  This is probably
-				 * wrong, but is consistent with our original
-				 * implementation.
-				 */
-				error = vn_rdwr(UIO_READ, vp, NULL, readahead,
-				    trunc_page(off), UIO_NOCOPY, IO_NODELOCKED |
-				    IO_VMIO | ((readahead / bsize) << IO_SEQSHIFT),
-				    td->td_ucred, NOCRED, &resid, td);
-				SFSTAT_INC(sf_iocnt);
-				if (error)
-					VM_OBJECT_WLOCK(obj);
-			}
-			if (error) {
-				vm_page_lock(pg);
-				vm_page_unwire(pg, 0);
-				/*
-				 * See if anyone else might know about
-				 * this page.  If not and it is not valid,
-				 * then free it.
-				 */
-				if (pg->wire_count == 0 && pg->valid == 0 &&
-				    !vm_page_busied(pg))
-					vm_page_free(pg);
-				vm_page_unlock(pg);
-				VM_OBJECT_WUNLOCK(obj);
-				if (error == EAGAIN)
-					error = 0;	/* not a real error */
-				break;
-			}
-
-			/*
-			 * Get a sendfile buf.  When allocating the
-			 * first buffer for mbuf chain, we usually
-			 * wait as long as necessary, but this wait
-			 * can be interrupted.  For consequent
-			 * buffers, do not sleep, since several
-			 * threads might exhaust the buffers and then
-			 * deadlock.
-			 */
-			sf = sf_buf_alloc(pg, (mnw || m != NULL) ? SFB_NOWAIT :
-			    SFB_CATCH);
-			if (sf == NULL) {
-				SFSTAT_INC(sf_allocfail);
-				vm_page_lock(pg);
-				vm_page_unwire(pg, 0);
-				KASSERT(pg->object != NULL,
-				    ("kern_sendfile: object disappeared"));
-				vm_page_unlock(pg);
-				if (m == NULL)
-					error = (mnw ? EAGAIN : EINTR);
-				break;
-			}
-
-			/*
-			 * Get an mbuf and set it up as having
-			 * external storage.
-			 */
-			m0 = m_get((mnw ? M_NOWAIT : M_WAITOK), MT_DATA);
-			if (m0 == NULL) {
-				error = (mnw ? EAGAIN : ENOBUFS);
-				sf_buf_mext(NULL, sf);
-				break;
-			}
-			if (m_extadd(m0, (caddr_t )sf_buf_kva(sf), PAGE_SIZE,
-			    sf_buf_mext, sfs, sf, M_RDONLY, EXT_SFBUF,
-			    (mnw ? M_NOWAIT : M_WAITOK)) != 0) {
-				error = (mnw ? EAGAIN : ENOBUFS);
-				sf_buf_mext(NULL, sf);
-				m_freem(m0);
-				break;
-			}
-			m0->m_data = (char *)sf_buf_kva(sf) + pgoff;
-			m0->m_len = xfsize;
-
-			/* Append to mbuf chain. */
-			if (mtail != NULL)
-				mtail->m_next = m0;
-			else if (m != NULL)
-				m_last(m)->m_next = m0;
-			else
-				m = m0;
-			mtail = m0;
-
-			/* Keep track of bits processed. */
-			loopbytes += xfsize;
-			off += xfsize;
-
-			if (sfs != NULL) {
-				mtx_lock(&sfs->mtx);
-				sfs->count++;
-				mtx_unlock(&sfs->mtx);
-			}
-		}
+		/* Call kern_sendfile_mbuf() for the inner loop. */
+		if (uap->nbytes)
+			rem = (uap->nbytes - fsbytes);
+		else
+			rem = va.va_size -
+			    uap->offset - fsbytes;
+		error = kern_sendfile_mbuf(vp, obj, &m, &mtail, off, &rem,
+		    bsize, &done, space, mnw, 0, uap->flags,
+		    IO_NODELOCKED | IO_VMIO, sfs, NOCRED, td);
 
 		VOP_UNLOCK(vp, 0);
 
--- fs/nfsserver/nfs_nfsdport.c.sav3	2014-02-06 18:42:02.000000000 -0500
+++ fs/nfsserver/nfs_nfsdport.c	2014-02-06 18:26:49.000000000 -0500
@@ -74,6 +74,8 @@ static uint32_t nfsv4_sysid = 0;
 
 static int nfssvc_srvcall(struct thread *, struct nfssvc_args *,
     struct ucred *);
+static int nfsrv_file_mbuf(struct vnode *, struct nfsvattr *, off_t,
+    int, struct mbuf **, struct mbuf **, struct ucred *, struct thread *);
 
 int nfsrv_enable_crossmntpt = 1;
 static int nfs_commit_blks;
@@ -617,8 +619,9 @@ out:
  * Read vnode op call into mbuf list.
  */
 int
-nfsvno_read(struct vnode *vp, off_t off, int cnt, struct ucred *cred,
-    struct thread *p, struct mbuf **mpp, struct mbuf **mpendp)
+nfsvno_read(struct vnode *vp, struct nfsvattr *nvap, off_t off, int cnt,
+    struct ucred *cred, struct thread *p, struct mbuf **mpp,
+    struct mbuf **mpendp)
 {
 	struct mbuf *m;
 	int i;
@@ -629,6 +632,9 @@ nfsvno_read(struct vnode *vp, off_t off,
 	struct uio io, *uiop = &io;
 	struct nfsheur *nh;
 
+	error = nfsrv_file_mbuf(vp, nvap, off, cnt, mpp, mpendp, cred, p);
+	if (error == 0)
+		goto out;
 	len = left = NFSM_RNDUP(cnt);
 	m3 = NULL;
 	/*
@@ -3295,6 +3301,98 @@ nfsrv_backupstable(void)
 	}
 }
 
+/*
+ * This function uses kern_sendfile_mbuf() to generate a list of mbufs
+ * that can be used by the NFS server read reply.
+ */
+int
+nfsrv_file_mbuf(struct vnode *vp, struct nfsvattr *nvap, off_t off,
+    int nbytes, struct mbuf **mp, struct mbuf **mendp, struct ucred *acred,
+    struct thread *td)
+{
+	struct vm_object *obj = NULL;
+	struct mbuf *m, *m2;
+	off_t rem;
+	int bsize, cnt, done, error, i;
+	char *cp;
+
+	*mp = NULL;
+	*mendp = NULL;
+	error = 0;
+	bsize = vp->v_mount->mnt_stat.f_iosize;
+	if (nvap->na_size > off)
+		rem = nvap->na_size - off;
+	else
+		goto out;	/* Nothing to read, so we are done. */
+	rem = omin(rem, nbytes);
+	cnt = rem;
+	cnt = NFSM_RNDUP(cnt) - cnt;	/* Cnt of bytes of trailing 0s. */
+	obj = vp->v_object;
+	if (obj != NULL) {
+		/*
+		 * Do we need to acquire a reference cnt on the obj like
+		 * kern_sendfile() does, even if we never unlock vp?
+		 */
+		VM_OBJECT_WLOCK(obj);
+		if ((obj->flags & OBJ_DEAD) == 0) {
+#ifdef notnow
+			vm_object_reference_locked(obj);
+#endif
+			VM_OBJECT_WUNLOCK(obj);
+		} else {
+			VM_OBJECT_WUNLOCK(obj);
+			obj = NULL;
+		}
+	}
+	if (obj == NULL)
+		error = EINVAL;
+
+	/* Setup the args for kern_sendfile_mbuf(). */
+	done = 0;
+	if (error == 0)
+		error = kern_sendfile_mbuf(vp, obj, mp, mendp, off, &rem, bsize,
+		    &done, 2 * MAXBSIZE, 0, 1, 0, IO_NODELOCKED | IO_VMIO |
+		    IO_NOMACCHECK, NULL, acred, td);
+	if (error == 0 && cnt > 0) {
+		/*
+		 * Sun XDR requires that data be filled to a multiple
+		 * of 4 bytes with 0 bytes.
+		 * Since the list of mbufs returned by kern_sendfile_mbuf
+		 * reference pages for a file and can't be modified, this
+		 * requires an additional mbuf.
+		 */
+		m = *mendp;
+		NFSMGET(m2);
+		m->m_next = m2;
+		*mendp = m2;
+		cp = mtod(m2, char *);
+		m2->m_len = cnt;
+		for (i = 0; i < cnt; i++)
+			*cp++ = '\0';
+	}
+	if (error == ERESTART)
+		panic("nfsrv_file_mbuf: ERESTART");
+out:
+#ifdef notnow
+	if (obj != NULL) {
+		NFSVOPUNLOCK(vp, 0);
+		vm_object_deallocate(obj);
+		NFSVOPLOCK(vp, LK_SHARED | LK_RETRY);
+		if (error == 0 && (vp->v_iflag & VI_DOOMED) != 0)
+			error = ESTALE;
+	}
+#endif
+	if (error != 0) {
+		if (*mp != NULL) {
+			m_freem(*mp);
+			*mp = NULL;
+			*mendp = NULL;
+		}
+printf("eo nfsrv_file_mbuf err=%d\n", error);
+	}
+	return (error);
+}
+
 extern int (*nfsd_call_nfsd)(struct thread *, struct nfssvc_args *);
 
 /*
--- fs/nfsserver/nfs_nfsdserv.c.sav3	2014-02-05 21:35:16.000000000 -0500
+++ fs/nfsserver/nfs_nfsdserv.c	2014-02-05 21:35:45.000000000 -0500
@@ -726,7 +726,7 @@ nfsrvd_read(struct nfsrv_descript *nd, _
 		cnt = reqlen;
 	m3 = NULL;
 	if (cnt > 0) {
-		nd->nd_repstat = nfsvno_read(vp, off, cnt, nd->nd_cred, p,
+		nd->nd_repstat = nfsvno_read(vp, &nva, off, cnt, nd->nd_cred, p,
 		    &m3, &m2);
 		if (!(nd->nd_flag & ND_NFSV4)) {
 			getret = nfsvno_getattr(vp, &nva, nd->nd_cred, p, 1);
--- fs/nfs/nfs_var.h.sav3	2014-02-05 22:14:38.000000000 -0500
+++ fs/nfs/nfs_var.h	2014-02-05 22:17:29.000000000 -0500
@@ -579,8 +579,8 @@ void nfsvno_setpathbuf(struct nameidata 
 void nfsvno_relpathbuf(struct nameidata *);
 int nfsvno_readlink(vnode_t, struct ucred *, NFSPROC_T *, mbuf_t *,
     mbuf_t *, int *);
-int nfsvno_read(vnode_t, off_t, int, struct ucred *, NFSPROC_T *,
-    mbuf_t *, mbuf_t *);
+int nfsvno_read(vnode_t, struct nfsvattr *, off_t, int, struct ucred *,
+    NFSPROC_T *, mbuf_t *, mbuf_t *);
 int nfsvno_write(vnode_t, off_t, int, int, int, mbuf_t,
     char *, struct ucred *, NFSPROC_T *);
 int nfsvno_createsub(struct nfsrv_descript *, struct nameidata *,
--- sys/syscallsubr.h.sav3	2014-02-04 20:30:44.000000000 -0500
+++ sys/syscallsubr.h	2014-02-06 18:24:48.000000000 -0500
@@ -253,6 +253,11 @@ int	kern_wait6(struct thread *td, enum i
 int	kern_writev(struct thread *td, int fd, struct uio *auio);
 int	kern_socketpair(struct thread *td, int domain, int type, int protocol,
 	    int *rsv);
+int	kern_sendfile_mbuf(struct vnode *vp, struct vm_object *obj,
+	    struct mbuf **mp, struct mbuf **mtailp, off_t off, off_t *remp,
+	    int bsize, int *donep, int space, int mnw, int wait_forall,
+	    int flags, int ioflags, void *sfs, struct ucred *acred,
+	    struct thread *td);
 
 /* flags for kern_sigaction */
 #define	KSA_OSIGSET	0x0001	/* uses osigact_t */

Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?1261040377.1982994.1391738315028.JavaMail.root>