Date: Thu, 6 Feb 2014 20:58:35 -0500 (EST) From: Rick Macklem <rmacklem@uoguelph.ca> To: Garrett Wollman <wollman@freebsd.org> Cc: freebsd-net@freebsd.org, Alexander Motin <mav@freebsd.org> Subject: Re: Terrible NFS performance under 9.2-RELEASE? Message-ID: <1261040377.1982994.1391738315028.JavaMail.root@uoguelph.ca> In-Reply-To: <CABXB=RRFfparjXm7_f6aaWHHbpUBoDWOLsjTyWdZmyKx3d2zAw@mail.gmail.com>
next in thread | previous in thread | raw e-mail | index | archive | help
[-- Attachment #1 --]
Garrett Wollman wrote:
> The real big improvement, which I have not tried to implement, would
> be to use physical pages (via sfbufs) by sharing the inner loop of
> sendfile(2). Since I use ZFS as my backing filesystem, I'm not sure
> this would have any benefit for me, but it should be a measurable
> improvement for UFS-backed NFS servers.
(I didn't have this email handy, so I just cut/pasted this paragraph.)
Well, I'm far from sure this is a good idea at this point, but I've
attached a patch that seems to work for a short test of an export of
a UFS volume. (If there is no v_object for ZFS vnodes, it will print
out a message with the error# for EINVAL.) It would need some serious
review and testing before it would be anywhere near ready for head.
(For example, since I hold a LK_SHARED locked vnode which was not
VI_DOOMED when it was locked, I don't know if I need to acquire a
reference count on the vm object or if I need to check for OBJ_DEAD?)
The patch checks for OBJ_DEAD, but does not acquire a reference count.
(The reference count part is #ifdef notnow.)
Like most of these things, I don't see a measurable difference on my
old single core i386 hardware with 100Mbps networking, but that only
indicates it might not be a serious regression.
The patch doesn't have yours applied to it, but it should be easy
to integrate the two, since it just adds a call to nfsrv_file_mbuf()
at the beginning of nfsvno_read().
Good luck with the testing, rick
ps: It would be nice to know if it works for ZFS?
pss: If the patch doesn't make it through on the list and you want a
copy, just email me.
[-- Attachment #2 --]
--- kern/uipc_syscalls.c.sav3 2014-02-04 19:22:52.000000000 -0500
+++ kern/uipc_syscalls.c 2014-02-06 18:23:47.000000000 -0500
@@ -1950,6 +1950,176 @@ freebsd4_sendfile(struct thread *td, str
}
#endif /* COMPAT_FREEBSD4 */
+/*
+ * The inner loop of kern_sendfile() extracted out so that it can be
+ * used by the NFS server as well.
+ */
+int
+kern_sendfile_mbuf(struct vnode *vp, struct vm_object *obj, struct mbuf **mp,
+ struct mbuf **mtailp, off_t off, off_t *remp, int bsize, int *donep,
+ int space, int mnw, int wait_forall, int flags, int ioflags, void *sfsp,
+ struct ucred *acred, struct thread *td)
+{
+ off_t xfsize, startrem;
+ vm_pindex_t pindex;
+ vm_offset_t pgoff;
+ struct vm_page *pg;
+ struct mbuf *m0, *m, *mtail;
+ struct sf_buf *sf;
+ int error = 0, loopbytes;
+ ssize_t resid;
+ int readahead = sfreadahead * MAXBSIZE;
+ struct sendfile_sync *sfs = (struct sendfile_sync *)sfsp;
+
+ m = *mp;
+ mtail = *mtailp;
+ startrem = *remp;
+ loopbytes = 0;
+ /*
+ * Loop and construct maximum sized mbuf chain to be bulk
+ * dumped into socket buffer.
+ */
+ while (space > loopbytes) {
+ /*
+ * Calculate the amount to transfer.
+ * Not to exceed a page, the EOF,
+ * or the passed in nbytes.
+ */
+ pgoff = (vm_offset_t)(off & PAGE_MASK);
+ *remp = startrem - loopbytes;
+ xfsize = omin(PAGE_SIZE - pgoff, *remp);
+ xfsize = omin(space - loopbytes, xfsize);
+ if (xfsize <= 0) {
+ *donep = 1; /* all data sent */
+ break;
+ }
+
+ /*
+ * Attempt to look up the page. Allocate
+ * if not found or wait and loop if busy.
+ */
+ pindex = OFF_TO_IDX(off);
+ VM_OBJECT_WLOCK(obj);
+ pg = vm_page_grab(obj, pindex, VM_ALLOC_NOBUSY |
+ VM_ALLOC_NORMAL | VM_ALLOC_WIRED | VM_ALLOC_RETRY);
+
+ /*
+ * Check if page is valid for what we need,
+ * otherwise initiate I/O.
+ * If we already turned some pages into mbufs,
+ * send them off before we come here again and
+ * block.
+ */
+ if (pg->valid && vm_page_is_valid(pg, pgoff, xfsize))
+ VM_OBJECT_WUNLOCK(obj);
+ else if (m != NULL && wait_forall == 0)
+ error = EAGAIN; /* send what we already got */
+ else if (flags & SF_NODISKIO)
+ error = EBUSY;
+ else {
+ VM_OBJECT_WUNLOCK(obj);
+
+ /*
+ * Get the page from backing store.
+ * XXXMAC: Because we don't have fp->f_cred
+ * here, we pass in NOCRED. This is probably
+ * wrong, but is consistent with our original
+ * implementation.
+ */
+ error = vn_rdwr(UIO_READ, vp, NULL, readahead,
+ trunc_page(off), UIO_NOCOPY, ioflags |
+ ((readahead / bsize) << IO_SEQSHIFT),
+ td->td_ucred, acred, &resid, td);
+ SFSTAT_INC(sf_iocnt);
+ if (error)
+ VM_OBJECT_WLOCK(obj);
+ }
+ if (error) {
+ vm_page_lock(pg);
+ vm_page_unwire(pg, 0);
+ /*
+ * See if anyone else might know about
+ * this page. If not and it is not valid,
+ * then free it.
+ */
+ if (pg->wire_count == 0 && pg->valid == 0 &&
+ !vm_page_busied(pg))
+ vm_page_free(pg);
+ vm_page_unlock(pg);
+ VM_OBJECT_WUNLOCK(obj);
+ if (error == EAGAIN && wait_forall == 0)
+ error = 0; /* not a real error */
+ break;
+ }
+
+ /*
+ * Get a sendfile buf. When allocating the
+ * first buffer for mbuf chain, we usually
+ * wait as long as necessary, but this wait
+ * can be interrupted. For consequent
+ * buffers, do not sleep, since several
+ * threads might exhaust the buffers and then
+ * deadlock.
+ */
+ sf = sf_buf_alloc(pg, (mnw || (m != NULL && wait_forall == 0)) ?
+ SFB_NOWAIT : SFB_CATCH);
+ if (sf == NULL) {
+ SFSTAT_INC(sf_allocfail);
+ vm_page_lock(pg);
+ vm_page_unwire(pg, 0);
+ KASSERT(pg->object != NULL,
+ ("kern_sendfile: object disappeared"));
+ vm_page_unlock(pg);
+ if (m == NULL)
+ error = (mnw ? EAGAIN : EINTR);
+ break;
+ }
+
+ /*
+ * Get an mbuf and set it up as having
+ * external storage.
+ */
+ m0 = m_get((mnw ? M_NOWAIT : M_WAITOK), MT_DATA);
+ if (m0 == NULL) {
+ error = (mnw ? EAGAIN : ENOBUFS);
+ sf_buf_mext(NULL, sf);
+ break;
+ }
+ if (m_extadd(m0, (caddr_t )sf_buf_kva(sf), PAGE_SIZE,
+ sf_buf_mext, sfs, sf, M_RDONLY, EXT_SFBUF,
+ (mnw ? M_NOWAIT : M_WAITOK)) != 0) {
+ error = (mnw ? EAGAIN : ENOBUFS);
+ sf_buf_mext(NULL, sf);
+ m_freem(m0);
+ break;
+ }
+ m0->m_data = (char *)sf_buf_kva(sf) + pgoff;
+ m0->m_len = xfsize;
+
+ /* Append to mbuf chain. */
+ if (mtail != NULL)
+ mtail->m_next = m0;
+ else if (m != NULL)
+ m_last(m)->m_next = m0;
+ else
+ m = m0;
+ mtail = m0;
+
+ /* Keep track of bits processed. */
+ loopbytes += xfsize;
+ off += xfsize;
+
+ if (sfs != NULL) {
+ mtx_lock(&sfs->mtx);
+ sfs->count++;
+ mtx_unlock(&sfs->mtx);
+ }
+ }
+ *mp = m;
+ *mtailp = mtail;
+ return (error);
+}
+
int
kern_sendfile(struct thread *td, struct sendfile_args *uap,
struct uio *hdr_uio, struct uio *trl_uio, int compat)
@@ -1959,10 +2129,8 @@ kern_sendfile(struct thread *td, struct
struct vm_object *obj = NULL;
struct socket *so = NULL;
struct mbuf *m = NULL;
- struct sf_buf *sf;
- struct vm_page *pg;
struct vattr va;
- off_t off, xfsize, fsbytes = 0, sbytes = 0, rem = 0;
+ off_t off, fsbytes = 0, sbytes = 0, rem = 0;
int error, hdrlen = 0, mnw = 0;
int bsize;
struct sendfile_sync *sfs = NULL;
@@ -2105,7 +2273,6 @@ kern_sendfile(struct thread *td, struct
*/
for (off = uap->offset; ; ) {
struct mbuf *mtail;
- int loopbytes;
int space;
int done;
@@ -2114,7 +2281,6 @@ kern_sendfile(struct thread *td, struct
break;
mtail = NULL;
- loopbytes = 0;
space = 0;
done = 0;
@@ -2193,157 +2359,15 @@ retry_space:
goto done;
}
- /*
- * Loop and construct maximum sized mbuf chain to be bulk
- * dumped into socket buffer.
- */
- while (space > loopbytes) {
- vm_pindex_t pindex;
- vm_offset_t pgoff;
- struct mbuf *m0;
-
- /*
- * Calculate the amount to transfer.
- * Not to exceed a page, the EOF,
- * or the passed in nbytes.
- */
- pgoff = (vm_offset_t)(off & PAGE_MASK);
- if (uap->nbytes)
- rem = (uap->nbytes - fsbytes - loopbytes);
- else
- rem = va.va_size -
- uap->offset - fsbytes - loopbytes;
- xfsize = omin(PAGE_SIZE - pgoff, rem);
- xfsize = omin(space - loopbytes, xfsize);
- if (xfsize <= 0) {
- done = 1; /* all data sent */
- break;
- }
-
- /*
- * Attempt to look up the page. Allocate
- * if not found or wait and loop if busy.
- */
- pindex = OFF_TO_IDX(off);
- VM_OBJECT_WLOCK(obj);
- pg = vm_page_grab(obj, pindex, VM_ALLOC_NOBUSY |
- VM_ALLOC_NORMAL | VM_ALLOC_WIRED | VM_ALLOC_RETRY);
-
- /*
- * Check if page is valid for what we need,
- * otherwise initiate I/O.
- * If we already turned some pages into mbufs,
- * send them off before we come here again and
- * block.
- */
- if (pg->valid && vm_page_is_valid(pg, pgoff, xfsize))
- VM_OBJECT_WUNLOCK(obj);
- else if (m != NULL)
- error = EAGAIN; /* send what we already got */
- else if (uap->flags & SF_NODISKIO)
- error = EBUSY;
- else {
- ssize_t resid;
- int readahead = sfreadahead * MAXBSIZE;
-
- VM_OBJECT_WUNLOCK(obj);
-
- /*
- * Get the page from backing store.
- * XXXMAC: Because we don't have fp->f_cred
- * here, we pass in NOCRED. This is probably
- * wrong, but is consistent with our original
- * implementation.
- */
- error = vn_rdwr(UIO_READ, vp, NULL, readahead,
- trunc_page(off), UIO_NOCOPY, IO_NODELOCKED |
- IO_VMIO | ((readahead / bsize) << IO_SEQSHIFT),
- td->td_ucred, NOCRED, &resid, td);
- SFSTAT_INC(sf_iocnt);
- if (error)
- VM_OBJECT_WLOCK(obj);
- }
- if (error) {
- vm_page_lock(pg);
- vm_page_unwire(pg, 0);
- /*
- * See if anyone else might know about
- * this page. If not and it is not valid,
- * then free it.
- */
- if (pg->wire_count == 0 && pg->valid == 0 &&
- !vm_page_busied(pg))
- vm_page_free(pg);
- vm_page_unlock(pg);
- VM_OBJECT_WUNLOCK(obj);
- if (error == EAGAIN)
- error = 0; /* not a real error */
- break;
- }
-
- /*
- * Get a sendfile buf. When allocating the
- * first buffer for mbuf chain, we usually
- * wait as long as necessary, but this wait
- * can be interrupted. For consequent
- * buffers, do not sleep, since several
- * threads might exhaust the buffers and then
- * deadlock.
- */
- sf = sf_buf_alloc(pg, (mnw || m != NULL) ? SFB_NOWAIT :
- SFB_CATCH);
- if (sf == NULL) {
- SFSTAT_INC(sf_allocfail);
- vm_page_lock(pg);
- vm_page_unwire(pg, 0);
- KASSERT(pg->object != NULL,
- ("kern_sendfile: object disappeared"));
- vm_page_unlock(pg);
- if (m == NULL)
- error = (mnw ? EAGAIN : EINTR);
- break;
- }
-
- /*
- * Get an mbuf and set it up as having
- * external storage.
- */
- m0 = m_get((mnw ? M_NOWAIT : M_WAITOK), MT_DATA);
- if (m0 == NULL) {
- error = (mnw ? EAGAIN : ENOBUFS);
- sf_buf_mext(NULL, sf);
- break;
- }
- if (m_extadd(m0, (caddr_t )sf_buf_kva(sf), PAGE_SIZE,
- sf_buf_mext, sfs, sf, M_RDONLY, EXT_SFBUF,
- (mnw ? M_NOWAIT : M_WAITOK)) != 0) {
- error = (mnw ? EAGAIN : ENOBUFS);
- sf_buf_mext(NULL, sf);
- m_freem(m0);
- break;
- }
- m0->m_data = (char *)sf_buf_kva(sf) + pgoff;
- m0->m_len = xfsize;
-
- /* Append to mbuf chain. */
- if (mtail != NULL)
- mtail->m_next = m0;
- else if (m != NULL)
- m_last(m)->m_next = m0;
- else
- m = m0;
- mtail = m0;
-
- /* Keep track of bits processed. */
- loopbytes += xfsize;
- off += xfsize;
-
- if (sfs != NULL) {
- mtx_lock(&sfs->mtx);
- sfs->count++;
- mtx_unlock(&sfs->mtx);
- }
- }
+ /* Call kern_sendfile_mbuf() for the inner loop. */
+ if (uap->nbytes)
+ rem = (uap->nbytes - fsbytes);
+ else
+ rem = va.va_size -
+ uap->offset - fsbytes;
+ error = kern_sendfile_mbuf(vp, obj, &m, &mtail, off, &rem,
+ bsize, &done, space, mnw, 0, uap->flags,
+ IO_NODELOCKED | IO_VMIO, sfs, NOCRED, td);
VOP_UNLOCK(vp, 0);
--- fs/nfsserver/nfs_nfsdport.c.sav3 2014-02-06 18:42:02.000000000 -0500
+++ fs/nfsserver/nfs_nfsdport.c 2014-02-06 18:26:49.000000000 -0500
@@ -74,6 +74,8 @@ static uint32_t nfsv4_sysid = 0;
static int nfssvc_srvcall(struct thread *, struct nfssvc_args *,
struct ucred *);
+static int nfsrv_file_mbuf(struct vnode *, struct nfsvattr *, off_t,
+ int, struct mbuf **, struct mbuf **, struct ucred *, struct thread *);
int nfsrv_enable_crossmntpt = 1;
static int nfs_commit_blks;
@@ -617,8 +619,9 @@ out:
* Read vnode op call into mbuf list.
*/
int
-nfsvno_read(struct vnode *vp, off_t off, int cnt, struct ucred *cred,
- struct thread *p, struct mbuf **mpp, struct mbuf **mpendp)
+nfsvno_read(struct vnode *vp, struct nfsvattr *nvap, off_t off, int cnt,
+ struct ucred *cred, struct thread *p, struct mbuf **mpp,
+ struct mbuf **mpendp)
{
struct mbuf *m;
int i;
@@ -629,6 +632,9 @@ nfsvno_read(struct vnode *vp, off_t off,
struct uio io, *uiop = &io;
struct nfsheur *nh;
+ error = nfsrv_file_mbuf(vp, nvap, off, cnt, mpp, mpendp, cred, p);
+ if (error == 0)
+ goto out;
len = left = NFSM_RNDUP(cnt);
m3 = NULL;
/*
@@ -3295,6 +3301,98 @@ nfsrv_backupstable(void)
}
}
+/*
+ * This function uses kern_sendfile_mbuf() to generate a list of mbufs
+ * that can be used by the NFS server read reply.
+ */
+int
+nfsrv_file_mbuf(struct vnode *vp, struct nfsvattr *nvap, off_t off,
+ int nbytes, struct mbuf **mp, struct mbuf **mendp, struct ucred *acred,
+ struct thread *td)
+{
+ struct vm_object *obj = NULL;
+ struct mbuf *m, *m2;
+ off_t rem;
+ int bsize, cnt, done, error, i;
+ char *cp;
+
+ *mp = NULL;
+ *mendp = NULL;
+ error = 0;
+ bsize = vp->v_mount->mnt_stat.f_iosize;
+ if (nvap->na_size > off)
+ rem = nvap->na_size - off;
+ else
+ goto out; /* Nothing to read, so we are done. */
+ rem = omin(rem, nbytes);
+ cnt = rem;
+ cnt = NFSM_RNDUP(cnt) - cnt; /* Cnt of bytes of trailing 0s. */
+ obj = vp->v_object;
+ if (obj != NULL) {
+ /*
+ * Do we need to acquire a reference cnt on the obj like
+ * kern_sendfile() does, even if we never unlock vp?
+ */
+ VM_OBJECT_WLOCK(obj);
+ if ((obj->flags & OBJ_DEAD) == 0) {
+#ifdef notnow
+ vm_object_reference_locked(obj);
+#endif
+ VM_OBJECT_WUNLOCK(obj);
+ } else {
+ VM_OBJECT_WUNLOCK(obj);
+ obj = NULL;
+ }
+ }
+ if (obj == NULL)
+ error = EINVAL;
+
+ /* Setup the args for kern_sendfile_mbuf(). */
+ done = 0;
+ if (error == 0)
+ error = kern_sendfile_mbuf(vp, obj, mp, mendp, off, &rem, bsize,
+ &done, 2 * MAXBSIZE, 0, 1, 0, IO_NODELOCKED | IO_VMIO |
+ IO_NOMACCHECK, NULL, acred, td);
+ if (error == 0 && cnt > 0) {
+ /*
+ * Sun XDR requires that data be filled to a multiple
+ * of 4 bytes with 0 bytes.
+ * Since the list of mbufs returned by kern_sendfile_mbuf
+ * reference pages for a file and can't be modified, this
+ * requires an additional mbuf.
+ */
+ m = *mendp;
+ NFSMGET(m2);
+ m->m_next = m2;
+ *mendp = m2;
+ cp = mtod(m2, char *);
+ m2->m_len = cnt;
+ for (i = 0; i < cnt; i++)
+ *cp++ = '\0';
+ }
+ if (error == ERESTART)
+ panic("nfsrv_file_mbuf: ERESTART");
+out:
+#ifdef notnow
+ if (obj != NULL) {
+ NFSVOPUNLOCK(vp, 0);
+ vm_object_deallocate(obj);
+ NFSVOPLOCK(vp, LK_SHARED | LK_RETRY);
+ if (error == 0 && (vp->v_iflag & VI_DOOMED) != 0)
+ error = ESTALE;
+ }
+#endif
+ if (error != 0) {
+ if (*mp != NULL) {
+ m_freem(*mp);
+ *mp = NULL;
+ *mendp = NULL;
+ }
+printf("eo nfsrv_file_mbuf err=%d\n", error);
+ }
+ return (error);
+}
+
extern int (*nfsd_call_nfsd)(struct thread *, struct nfssvc_args *);
/*
--- fs/nfsserver/nfs_nfsdserv.c.sav3 2014-02-05 21:35:16.000000000 -0500
+++ fs/nfsserver/nfs_nfsdserv.c 2014-02-05 21:35:45.000000000 -0500
@@ -726,7 +726,7 @@ nfsrvd_read(struct nfsrv_descript *nd, _
cnt = reqlen;
m3 = NULL;
if (cnt > 0) {
- nd->nd_repstat = nfsvno_read(vp, off, cnt, nd->nd_cred, p,
+ nd->nd_repstat = nfsvno_read(vp, &nva, off, cnt, nd->nd_cred, p,
&m3, &m2);
if (!(nd->nd_flag & ND_NFSV4)) {
getret = nfsvno_getattr(vp, &nva, nd->nd_cred, p, 1);
--- fs/nfs/nfs_var.h.sav3 2014-02-05 22:14:38.000000000 -0500
+++ fs/nfs/nfs_var.h 2014-02-05 22:17:29.000000000 -0500
@@ -579,8 +579,8 @@ void nfsvno_setpathbuf(struct nameidata
void nfsvno_relpathbuf(struct nameidata *);
int nfsvno_readlink(vnode_t, struct ucred *, NFSPROC_T *, mbuf_t *,
mbuf_t *, int *);
-int nfsvno_read(vnode_t, off_t, int, struct ucred *, NFSPROC_T *,
- mbuf_t *, mbuf_t *);
+int nfsvno_read(vnode_t, struct nfsvattr *, off_t, int, struct ucred *,
+ NFSPROC_T *, mbuf_t *, mbuf_t *);
int nfsvno_write(vnode_t, off_t, int, int, int, mbuf_t,
char *, struct ucred *, NFSPROC_T *);
int nfsvno_createsub(struct nfsrv_descript *, struct nameidata *,
--- sys/syscallsubr.h.sav3 2014-02-04 20:30:44.000000000 -0500
+++ sys/syscallsubr.h 2014-02-06 18:24:48.000000000 -0500
@@ -253,6 +253,11 @@ int kern_wait6(struct thread *td, enum i
int kern_writev(struct thread *td, int fd, struct uio *auio);
int kern_socketpair(struct thread *td, int domain, int type, int protocol,
int *rsv);
+int kern_sendfile_mbuf(struct vnode *vp, struct vm_object *obj,
+ struct mbuf **mp, struct mbuf **mtailp, off_t off, off_t *remp,
+ int bsize, int *donep, int space, int mnw, int wait_forall,
+ int flags, int ioflags, void *sfs, struct ucred *acred,
+ struct thread *td);
/* flags for kern_sigaction */
#define KSA_OSIGSET 0x0001 /* uses osigact_t */
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?1261040377.1982994.1391738315028.JavaMail.root>
