Date: Wed, 7 May 2003 12:38:33 -0700 (PDT) From: Don Lewis <truckman@FreeBSD.org> To: current@FreeBSD.org Subject: Re: CFR: NFS server vnode locking patch Message-ID: <200305071938.h47JcXM7033234@gw.catspoiler.org> In-Reply-To: <200305071936.h47JaMM7033225@gw.catspoiler.org>
next in thread | previous in thread | raw e-mail | index | archive | help
On 7 May, To: current@freebsd.org wrote:
> I managed to find some time to take a closer look at vnode locking in
> the NFS server code and found that the situation was worse than I
> initially thought. I've put together a patch that seems to fix all the
> bugs that I found. With this patch, the code passes the simple tests
> that I wrote as well as NFS mounting a local directory on /usr/obj and
> running "make -j10 buildworld" (after I cranked up vfs.hirunningspace
> and vfs.lorunningspace by 50x to avoid the wdrain bio deadlock I
> mentioned yesterday), all with the DEBUG_VFS_LOCKS kernel option
> enabled.
>
> The NFS server code was in bad shape from being hacked on too many times
> before I touched it and it looks like it has accumulated some historical
> baggage, and my changes certainly don't help. I attempted to match the
> existing style and control flow since I wanted to minimize the changes
> at the time to avoid introducing new bugs, but this meant that I had to
> duplicate some code in a number of places.
>
> I saw two possible ways of getting the initial dirp attributes. One was
> to set LOCKPARENT on the first lookup() call in nfs_namei() and cap
> VOP_GETATTR() at that point. I chose the other possible implementation,
> which was to temporarily lock the dirp and call VOP_GETATTR() before the
> loop, because this change was simpler.
>
> The NFS server code badly needs a rewrite by someone who understands it
> well.
>
> I'm hoping to get enough review and testing of this patch so that I can
> get re approval to fix vnode locking in the NFS server code for
> 5.1.
It might help to actually include the patch ...
Index: sys/nfsserver/nfs.h
===================================================================
RCS file: /home/ncvs/src/sys/nfsserver/nfs.h,v
retrieving revision 1.68
diff -u -r1.68 nfs.h
--- sys/nfsserver/nfs.h 24 Jul 2002 22:27:35 -0000 1.68
+++ sys/nfsserver/nfs.h 5 May 2003 00:34:59 -0000
@@ -332,7 +332,8 @@
int netaddr_match(int, union nethostaddr *, struct sockaddr *);
int nfs_namei(struct nameidata *, fhandle_t *, int,
struct nfssvc_sock *, struct sockaddr *, struct mbuf **,
- caddr_t *, struct vnode **, struct thread *, int);
+ caddr_t *, struct vnode **, int, struct vattr *, int *,
+ struct thread *, int);
void nfsm_adj(struct mbuf *, int, int);
int nfsm_mbuftouio(struct mbuf **, struct uio *, int, caddr_t *);
void nfsrv_initcache(void);
Index: sys/nfsserver/nfs_serv.c
===================================================================
RCS file: /home/ncvs/src/sys/nfsserver/nfs_serv.c,v
retrieving revision 1.133
diff -u -r1.133 nfs_serv.c
--- sys/nfsserver/nfs_serv.c 24 Apr 2003 04:31:25 -0000 1.133
+++ sys/nfsserver/nfs_serv.c 7 May 2003 02:20:28 -0000
@@ -467,7 +467,7 @@
nd.ni_cnd.cn_nameiop = LOOKUP;
nd.ni_cnd.cn_flags = LOCKLEAF | SAVESTART;
error = nfs_namei(&nd, fhp, len, slp, nam, &md, &dpos,
- &dirp, td, pubflag);
+ &dirp, v3, &dirattr, &dirattr_ret, td, pubflag);
/*
* namei failure, only dirp to cleanup. Clear out garbarge from
@@ -476,9 +476,6 @@
if (error) {
if (dirp) {
- if (v3)
- dirattr_ret = VOP_GETATTR(dirp, &dirattr, cred,
- td);
vrele(dirp);
dirp = NULL;
}
@@ -551,9 +548,6 @@
}
if (dirp) {
- if (v3)
- dirattr_ret = VOP_GETATTR(dirp, &dirattr, cred,
- td);
vrele(dirp);
dirp = NULL;
}
@@ -1630,15 +1624,10 @@
* prior to calling nfsm_reply ( which might goto nfsmout ).
*/
error = nfs_namei(&nd, fhp, len, slp, nam, &md, &dpos,
- &dirp, td, FALSE);
- if (dirp) {
- if (v3) {
- dirfor_ret = VOP_GETATTR(dirp, &dirfor, cred,
- td);
- } else {
- vrele(dirp);
- dirp = NULL;
- }
+ &dirp, v3, &dirfor, &dirfor_ret, td, FALSE);
+ if (dirp && !v3) {
+ vrele(dirp);
+ dirp = NULL;
}
if (error) {
nfsm_reply(NFSX_WCCDATA(v3));
@@ -1809,9 +1798,27 @@
if (exclusive_flag && !error &&
bcmp(cverf, (caddr_t)&vap->va_atime, NFSX_V3CREATEVERF))
error = EEXIST;
- diraft_ret = VOP_GETATTR(dirp, &diraft, cred, td);
- vrele(dirp);
- dirp = NULL;
+ if (dirp == nd.ni_dvp)
+ diraft_ret = VOP_GETATTR(dirp, &diraft, cred, td);
+ else {
+ /* Drop the other locks to avoid deadlock. */
+ if (nd.ni_dvp) {
+ if (nd.ni_dvp == nd.ni_vp)
+ vrele(nd.ni_dvp);
+ else
+ vput(nd.ni_dvp);
+ }
+ if (nd.ni_vp)
+ vput(nd.ni_vp);
+ nd.ni_dvp = NULL;
+ nd.ni_vp = NULL;
+
+ if (vn_lock(dirp, LK_EXCLUSIVE | LK_RETRY, td) == 0) {
+ diraft_ret =
+ VOP_GETATTR(dirp, &diraft, cred, td);
+ VOP_UNLOCK(dirp, 0, td);
+ }
+ }
}
ereply:
nfsm_reply(NFSX_SRVFH(v3) + NFSX_FATTR(v3) + NFSX_WCCDATA(v3));
@@ -1906,9 +1913,7 @@
*/
error = nfs_namei(&nd, fhp, len, slp, nam, &md, &dpos,
- &dirp, td, FALSE);
- if (dirp)
- dirfor_ret = VOP_GETATTR(dirp, &dirfor, cred, td);
+ &dirp, v3, &dirfor, &dirfor_ret, td, FALSE);
if (error) {
nfsm_reply(NFSX_WCCDATA(1));
nfsm_srvwcc_data(dirfor_ret, &dirfor, diraft_ret, &diraft);
@@ -2006,10 +2011,10 @@
vp = NULL;
nd.ni_vp = NULL;
}
- diraft_ret = VOP_GETATTR(dirp, &diraft, cred, td);
if (dirp) {
- vrele(dirp);
- dirp = NULL;
+ if (vn_lock(dirp, LK_EXCLUSIVE | LK_RETRY, td) == 0)
+ diraft_ret = VOP_GETATTR(dirp, &diraft, cred, td);
+ VOP_UNLOCK(dirp, 0, td);
}
ereply:
nfsm_reply(NFSX_SRVFH(1) + NFSX_POSTOPATTR(1) + NFSX_WCCDATA(1));
@@ -2085,15 +2090,10 @@
nd.ni_cnd.cn_nameiop = DELETE;
nd.ni_cnd.cn_flags = LOCKPARENT | LOCKLEAF;
error = nfs_namei(&nd, fhp, len, slp, nam, &md, &dpos,
- &dirp, td, FALSE);
- if (dirp) {
- if (v3) {
- dirfor_ret = VOP_GETATTR(dirp, &dirfor, cred,
- td);
- } else {
- vrele(dirp);
- dirp = NULL;
- }
+ &dirp, v3, &dirfor, &dirfor_ret, td, FALSE);
+ if (dirp && !v3) {
+ vrele(dirp);
+ dirp = NULL;
}
if (error == 0) {
if (nd.ni_vp->v_type == VDIR) {
@@ -2114,9 +2114,27 @@
}
}
if (dirp && v3) {
- diraft_ret = VOP_GETATTR(dirp, &diraft, cred, td);
- vrele(dirp);
- dirp = NULL;
+ if (dirp == nd.ni_dvp)
+ diraft_ret = VOP_GETATTR(dirp, &diraft, cred, td);
+ else {
+ /* Drop the other locks to avoid deadlock. */
+ if (nd.ni_dvp) {
+ if (nd.ni_dvp == nd.ni_vp)
+ vrele(nd.ni_dvp);
+ else
+ vput(nd.ni_dvp);
+ }
+ if (nd.ni_vp)
+ vput(nd.ni_vp);
+ nd.ni_dvp = NULL;
+ nd.ni_vp = NULL;
+
+ if (vn_lock(dirp, LK_EXCLUSIVE | LK_RETRY, td) == 0) {
+ diraft_ret =
+ VOP_GETATTR(dirp, &diraft, cred, td);
+ VOP_UNLOCK(dirp, 0, td);
+ }
+ }
}
ereply:
nfsm_reply(NFSX_WCCDATA(v3));
@@ -2200,15 +2218,10 @@
fromnd.ni_cnd.cn_nameiop = DELETE;
fromnd.ni_cnd.cn_flags = WANTPARENT | SAVESTART;
error = nfs_namei(&fromnd, ffhp, len, slp, nam, &md,
- &dpos, &fdirp, td, FALSE);
- if (fdirp) {
- if (v3) {
- fdirfor_ret = VOP_GETATTR(fdirp, &fdirfor, cred,
- td);
- } else {
- vrele(fdirp);
- fdirp = NULL;
- }
+ &dpos, &fdirp, v3, &fdirfor, &fdirfor_ret, td, FALSE);
+ if (fdirp && !v3) {
+ vrele(fdirp);
+ fdirp = NULL;
}
if (error) {
nfsm_reply(2 * NFSX_WCCDATA(v3));
@@ -2227,15 +2240,10 @@
tond.ni_cnd.cn_nameiop = RENAME;
tond.ni_cnd.cn_flags = LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART;
error = nfs_namei(&tond, tfhp, len2, slp, nam, &md,
- &dpos, &tdirp, td, FALSE);
- if (tdirp) {
- if (v3) {
- tdirfor_ret = VOP_GETATTR(tdirp, &tdirfor, cred,
- td);
- } else {
- vrele(tdirp);
- tdirp = NULL;
- }
+ &dpos, &tdirp, v3, &tdirfor, &tdirfor_ret, td, FALSE);
+ if (tdirp && !v3) {
+ vrele(tdirp);
+ tdirp = NULL;
}
if (error)
goto out1;
@@ -2318,12 +2326,30 @@
/* fall through */
out1:
- if (fdirp)
- fdiraft_ret = VOP_GETATTR(fdirp, &fdiraft, cred, td);
- if (tdirp)
- tdiraft_ret = VOP_GETATTR(tdirp, &tdiraft, cred, td);
nfsm_reply(2 * NFSX_WCCDATA(v3));
if (v3) {
+ /* Release existing locks to prevent deadlock. */
+ if (tond.ni_dvp) {
+ if (tond.ni_dvp == tond.ni_vp)
+ vrele(tond.ni_dvp);
+ else
+ vput(tond.ni_dvp);
+ }
+ if (tond.ni_vp)
+ vput(tond.ni_vp);
+ tond.ni_dvp = NULL;
+ tond.ni_vp = NULL;
+
+ if (fdirp &&
+ vn_lock(fdirp, LK_EXCLUSIVE | LK_RETRY, td) == 0) {
+ fdiraft_ret = VOP_GETATTR(fdirp, &fdiraft, cred, td);
+ VOP_UNLOCK(fdirp, 0, td);
+ }
+ if (tdirp &&
+ vn_lock(tdirp, LK_EXCLUSIVE | LK_RETRY, td) == 0) {
+ tdiraft_ret = VOP_GETATTR(tdirp, &tdiraft, cred, td);
+ VOP_UNLOCK(tdirp, 0, td);
+ }
nfsm_srvwcc_data(fdirfor_ret, &fdirfor, fdiraft_ret, &fdiraft);
nfsm_srvwcc_data(tdirfor_ret, &tdirfor, tdiraft_ret, &tdiraft);
}
@@ -2407,7 +2433,7 @@
nfsm_srvmtofh(dfhp);
nfsm_srvnamesiz(len);
- error = nfsrv_fhtovp(fhp, FALSE, &vp, cred, slp, nam, &rdonly, TRUE);
+ error = nfsrv_fhtovp(fhp, TRUE, &vp, cred, slp, nam, &rdonly, TRUE);
if (error) {
nfsm_reply(NFSX_POSTOPATTR(v3) + NFSX_WCCDATA(v3));
if (v3) {
@@ -2418,47 +2444,77 @@
error = 0;
goto nfsmout;
}
+ if (v3)
+ getret = VOP_GETATTR(vp, &at, cred, td);
if (vp->v_type == VDIR) {
error = EPERM; /* POSIX */
goto out1;
}
+ VOP_UNLOCK(vp, 0, td);
nd.ni_cnd.cn_cred = cred;
nd.ni_cnd.cn_nameiop = CREATE;
nd.ni_cnd.cn_flags = LOCKPARENT;
error = nfs_namei(&nd, dfhp, len, slp, nam, &md, &dpos,
- &dirp, td, FALSE);
- if (dirp) {
- if (v3) {
- dirfor_ret = VOP_GETATTR(dirp, &dirfor, cred,
- td);
- } else {
- vrele(dirp);
- dirp = NULL;
- }
+ &dirp, v3, &dirfor, &dirfor_ret, td, FALSE);
+ if (dirp && !v3) {
+ vrele(dirp);
+ dirp = NULL;
+ }
+ if (error) {
+ vrele(vp);
+ vp = NULL;
+ goto out2;
}
- if (error)
- goto out1;
-
xp = nd.ni_vp;
if (xp != NULL) {
error = EEXIST;
- goto out;
+ vrele(vp);
+ vp = NULL;
+ goto out2;
}
xp = nd.ni_dvp;
- if (vp->v_mount != xp->v_mount)
+ if (vp->v_mount != xp->v_mount) {
error = EXDEV;
-out:
- if (!error) {
- error = VOP_LINK(nd.ni_dvp, vp, &nd.ni_cnd);
- NDFREE(&nd, NDF_ONLY_PNBUF);
+ vrele(vp);
+ vp = NULL;
+ goto out2;
}
+ if ((error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td)) != 0) {
+ vrele(vp);
+ vp = NULL;
+ goto out2;
+ }
+ error = VOP_LINK(nd.ni_dvp, vp, &nd.ni_cnd);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
/* fall through */
out1:
if (v3)
getret = VOP_GETATTR(vp, &at, cred, td);
- if (dirp)
- diraft_ret = VOP_GETATTR(dirp, &diraft, cred, td);
+out2:
+ if (dirp) {
+ if (dirp == nd.ni_dvp)
+ diraft_ret = VOP_GETATTR(dirp, &diraft, cred, td);
+ else {
+ /* Release existing locks to prevent deadlock. */
+ if (nd.ni_dvp) {
+ if (nd.ni_dvp == nd.ni_vp)
+ vrele(nd.ni_dvp);
+ else
+ vput(nd.ni_dvp);
+ }
+ if (nd.ni_vp)
+ vrele(nd.ni_vp);
+ nd.ni_dvp = NULL;
+ nd.ni_vp = NULL;
+
+ if (vn_lock(dirp, LK_EXCLUSIVE | LK_RETRY, td) == 0) {
+ diraft_ret =
+ VOP_GETATTR(dirp, &diraft, cred, td);
+ VOP_UNLOCK(dirp, 0, td);
+ }
+ }
+ }
ereply:
nfsm_reply(NFSX_POSTOPATTR(v3) + NFSX_WCCDATA(v3));
if (v3) {
@@ -2473,7 +2529,7 @@
if (dirp)
vrele(dirp);
if (vp)
- vrele(vp);
+ vput(vp);
if (nd.ni_dvp) {
if (nd.ni_dvp == nd.ni_vp)
vrele(nd.ni_dvp);
@@ -2534,15 +2590,10 @@
nd.ni_cnd.cn_nameiop = CREATE;
nd.ni_cnd.cn_flags = LOCKPARENT | SAVESTART;
error = nfs_namei(&nd, fhp, len, slp, nam, &md, &dpos,
- &dirp, td, FALSE);
- if (dirp) {
- if (v3) {
- dirfor_ret = VOP_GETATTR(dirp, &dirfor, cred,
- td);
- } else {
- vrele(dirp);
- dirp = NULL;
- }
+ &dirp, v3, &dirfor, &dirfor_ret, td, FALSE);
+ if (dirp && !v3) {
+ vrele(dirp);
+ dirp = NULL;
}
if (error)
goto out;
@@ -2629,10 +2680,9 @@
FREE(pathcp, M_TEMP);
pathcp = NULL;
}
- if (dirp) {
+ if (dirp && vn_lock(dirp, LK_EXCLUSIVE | LK_RETRY, td) == 0) {
diraft_ret = VOP_GETATTR(dirp, &diraft, cred, td);
- vrele(dirp);
- dirp = NULL;
+ VOP_UNLOCK(dirp, 0, td);
}
if (nd.ni_startdir) {
vrele(nd.ni_startdir);
@@ -2719,15 +2769,10 @@
nd.ni_cnd.cn_flags = LOCKPARENT;
error = nfs_namei(&nd, fhp, len, slp, nam, &md, &dpos,
- &dirp, td, FALSE);
- if (dirp) {
- if (v3) {
- dirfor_ret = VOP_GETATTR(dirp, &dirfor, cred,
- td);
- } else {
- vrele(dirp);
- dirp = NULL;
- }
+ &dirp, v3, &dirfor, &dirfor_ret, td, FALSE);
+ if (dirp && !v3) {
+ vrele(dirp);
+ dirp = NULL;
}
if (error) {
nfsm_reply(NFSX_WCCDATA(v3));
@@ -2778,8 +2823,33 @@
error = VOP_GETATTR(nd.ni_vp, vap, cred, td);
}
out:
- if (dirp)
- diraft_ret = VOP_GETATTR(dirp, &diraft, cred, td);
+ if (dirp) {
+ if (dirp == nd.ni_dvp) {
+ diraft_ret = VOP_GETATTR(dirp, &diraft, cred, td);
+ } else {
+ /* Release existing locks to prevent deadlock. */
+ if (nd.ni_dvp) {
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ if (nd.ni_dvp == nd.ni_vp && vpexcl)
+ vrele(nd.ni_dvp);
+ else
+ vput(nd.ni_dvp);
+ }
+ if (nd.ni_vp) {
+ if (vpexcl)
+ vput(nd.ni_vp);
+ else
+ vrele(nd.ni_vp);
+ }
+ nd.ni_dvp = NULL;
+ nd.ni_vp = NULL;
+ if (vn_lock(dirp, LK_EXCLUSIVE | LK_RETRY, td) == 0) {
+ diraft_ret =
+ VOP_GETATTR(dirp, &diraft, cred, td);
+ VOP_UNLOCK(dirp, 0, td);
+ }
+ }
+ }
nfsm_reply(NFSX_SRVFH(v3) + NFSX_POSTOPATTR(v3) + NFSX_WCCDATA(v3));
if (v3) {
if (!error) {
@@ -2859,15 +2929,10 @@
nd.ni_cnd.cn_nameiop = DELETE;
nd.ni_cnd.cn_flags = LOCKPARENT | LOCKLEAF;
error = nfs_namei(&nd, fhp, len, slp, nam, &md, &dpos,
- &dirp, td, FALSE);
- if (dirp) {
- if (v3) {
- dirfor_ret = VOP_GETATTR(dirp, &dirfor, cred,
- td);
- } else {
- vrele(dirp);
- dirp = NULL;
- }
+ &dirp, v3, &dirfor, &dirfor_ret, td, FALSE);
+ if (dirp && !v3) {
+ vrele(dirp);
+ dirp = NULL;
}
if (error) {
nfsm_reply(NFSX_WCCDATA(v3));
@@ -2902,8 +2967,28 @@
error = VOP_RMDIR(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd);
NDFREE(&nd, NDF_ONLY_PNBUF);
- if (dirp)
- diraft_ret = VOP_GETATTR(dirp, &diraft, cred, td);
+ if (dirp) {
+ if (dirp == nd.ni_dvp)
+ diraft_ret = VOP_GETATTR(dirp, &diraft, cred, td);
+ else {
+ /* Release existing locks to prevent deadlock. */
+ if (nd.ni_dvp) {
+ if (nd.ni_dvp == nd.ni_vp)
+ vrele(nd.ni_dvp);
+ else
+ vput(nd.ni_dvp);
+ }
+ if (nd.ni_vp)
+ vput(nd.ni_vp);
+ nd.ni_dvp = NULL;
+ nd.ni_vp = NULL;
+ if (vn_lock(dirp, LK_EXCLUSIVE | LK_RETRY, td) == 0) {
+ diraft_ret =
+ VOP_GETATTR(dirp, &diraft, cred, td);
+ VOP_UNLOCK(dirp, 0, td);
+ }
+ }
+ }
nfsm_reply(NFSX_WCCDATA(v3));
error = 0;
if (v3)
Index: sys/nfsserver/nfs_srvsubs.c
===================================================================
RCS file: /home/ncvs/src/sys/nfsserver/nfs_srvsubs.c,v
retrieving revision 1.120
diff -u -r1.120 nfs_srvsubs.c
--- sys/nfsserver/nfs_srvsubs.c 19 Feb 2003 05:47:39 -0000 1.120
+++ sys/nfsserver/nfs_srvsubs.c 6 May 2003 02:56:15 -0000
@@ -592,7 +592,8 @@
int
nfs_namei(struct nameidata *ndp, fhandle_t *fhp, int len,
struct nfssvc_sock *slp, struct sockaddr *nam, struct mbuf **mdp,
- caddr_t *dposp, struct vnode **retdirp, struct thread *td, int pubflag)
+ caddr_t *dposp, struct vnode **retdirp, int v3, struct vattr *retdirattrp,
+ int *retdirattr_retp, struct thread *td, int pubflag)
{
int i, rem;
struct mbuf *md;
@@ -602,6 +603,7 @@
struct vnode *dp;
int error, rdonly, linklen;
struct componentname *cnp = &ndp->ni_cnd;
+ int lockleaf = (cnp->cn_flags & LOCKLEAF) != 0;
*retdirp = NULL;
cnp->cn_flags |= NOMACCHECK;
@@ -664,6 +666,12 @@
* to the returned pointer
*/
*retdirp = dp;
+ if (v3) {
+ vn_lock(dp, LK_EXCLUSIVE | LK_RETRY, td);
+ *retdirattr_retp = VOP_GETATTR(dp, retdirattrp,
+ ndp->ni_cnd.cn_cred, td);
+ VOP_UNLOCK(dp, 0, td);
+ }
if (pubflag) {
/*
@@ -736,6 +744,8 @@
VREF(dp);
ndp->ni_startdir = dp;
+ if (!lockleaf)
+ cnp->cn_flags |= LOCKLEAF;
for (;;) {
cnp->cn_nameptr = cnp->cn_pnbuf;
/*
@@ -761,6 +771,8 @@
cnp->cn_flags |= HASBUF;
else
uma_zfree(namei_zone, cnp->cn_pnbuf);
+ if (ndp->ni_vp && !lockleaf)
+ VOP_UNLOCK(ndp->ni_vp, 0, td);
break;
}
@@ -840,6 +852,8 @@
ndp->ni_startdir = ndp->ni_dvp;
ndp->ni_dvp = NULL;
}
+ if (!lockleaf)
+ cnp->cn_flags &= ~LOCKLEAF;
/*
* nfs_namei() guarentees that fields will not contain garbage
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?200305071938.h47JcXM7033234>
