Date: Tue, 2 Oct 2001 16:12:51 -0700 (PDT) From: Matt Dillon <dillon@earth.backplane.com> To: Ian Dowse <iedowse@maths.tcd.ie> Cc: Yevgeniy Aleynikov <eugenea@infospace.com>, peter@FreeBSD.ORG, ache@FreeBSD.ORG, mckusick@mckusick.com, Ken Pizzini <kenp@infospace.com>, hackers@FreeBSD.ORG Subject: patch #3 (was Re: bleh. Re: ufs_rename panic) Message-ID: <200110022312.f92NCpp60948@earth.backplane.com>
next in thread | raw e-mail | index | archive | help
Here is the latest patch I have. It appears to completely solve the problem. I have shims in unionfs and nfs for the moment. The patch is against -stable. * Implements SOFTLOCKLEAF namei() option * Implements EAGAIN error & appropriate tsleep/retry code * Universal for rename() & rmdir(). Final patch will also probably implement it on unlink() to solve (pre-existing) unlink/rename regular file race. * Tested very well w/ UFS, should be compatible with and work for direct access to other filesystems that still use IN_RENAME. * Tested for collision probability. Answer: Very low even when one tries on purpose. There is no need to implement a more sophisticated fine-grained tsleep. -Matt Index: kern/vfs_lookup.c =================================================================== RCS file: /home/ncvs/src/sys/kern/vfs_lookup.c,v retrieving revision 1.38.2.3 diff -u -r1.38.2.3 vfs_lookup.c --- kern/vfs_lookup.c 2001/08/31 19:36:49 1.38.2.3 +++ kern/vfs_lookup.c 2001/10/02 20:06:33 @@ -372,6 +372,11 @@ error = EISDIR; goto bad; } + if (cnp->cn_flags & SOFTLOCKLEAF) { + if ((error = vsetsoftlock(dp)) != 0) + goto bad; + VREF(dp); + } if (wantparent) { ndp->ni_dvp = dp; VREF(dp); @@ -565,13 +570,17 @@ error = EROFS; goto bad2; } + if (cnp->cn_flags & SOFTLOCKLEAF) { + if ((error = vsetsoftlock(dp)) != 0) + goto bad2; + VREF(dp); + } if (cnp->cn_flags & SAVESTART) { ndp->ni_startdir = ndp->ni_dvp; VREF(ndp->ni_startdir); } if (!wantparent) vrele(ndp->ni_dvp); - if ((cnp->cn_flags & LOCKLEAF) == 0) VOP_UNLOCK(dp, 0, p); return (0); @@ -654,6 +663,11 @@ error = ENOTDIR; goto bad; } + if (cnp->cn_flags & SOFTLOCKLEAF) { + if ((error = vsetsoftlock(dp)) != 0) + goto bad; + VREF(dp); + } if (!(cnp->cn_flags & LOCKLEAF)) VOP_UNLOCK(dp, 0, p); *vpp = dp; @@ -707,6 +721,11 @@ error = EROFS; goto bad2; } + if (cnp->cn_flags & SOFTLOCKLEAF) { + if ((error = vsetsoftlock(dp)) != 0) + goto bad2; + VREF(dp); + } /* ASSERT(dvp == ndp->ni_startdir) */ if (cnp->cn_flags & SAVESTART) VREF(dvp); @@ -715,8 +734,9 @@ vrele(dvp); if (vn_canvmio(dp) == TRUE && - ((cnp->cn_flags & (NOOBJ|LOCKLEAF)) == LOCKLEAF)) + ((cnp->cn_flags & (NOOBJ|LOCKLEAF)) == LOCKLEAF)) { vfs_object_create(dp, cnp->cn_proc, cnp->cn_cred); + } if ((cnp->cn_flags & LOCKLEAF) == 0) VOP_UNLOCK(dp, 0, p); Index: kern/vfs_subr.c =================================================================== RCS file: /home/ncvs/src/sys/kern/vfs_subr.c,v retrieving revision 1.249.2.11 diff -u -r1.249.2.11 vfs_subr.c --- kern/vfs_subr.c 2001/09/11 09:49:53 1.249.2.11 +++ kern/vfs_subr.c 2001/10/02 22:55:38 @@ -130,6 +132,8 @@ #endif struct nfs_public nfs_pub; /* publicly exported FS */ static vm_zone_t vnode_zone; +static int vagain_count = 1; +static int vagain_waiting = 0; /* * The workitem queue. @@ -2927,6 +2963,13 @@ struct nameidata *ndp; const uint flags; { + if (!(flags & NDF_NO_FREE_SOFTLOCKLEAF) && + (ndp->ni_cnd.cn_flags & SOFTLOCKLEAF) && + ndp->ni_vp) { + vclearsoftlock(ndp->ni_vp); + ndp->ni_cnd.cn_flags &= ~SOFTLOCKLEAF; + vrele(ndp->ni_vp); + } if (!(flags & NDF_NO_FREE_PNBUF) && (ndp->ni_cnd.cn_flags & HASBUF)) { zfree(namei_zone, ndp->ni_cnd.cn_pnbuf); @@ -2955,3 +2998,55 @@ ndp->ni_startdir = NULL; } } + +/* + * vsetsoftlock() - set the VSOFTLOCK flag on the vnode, return + * EAGAIN if it has already been set by someone else. + * + * note: we could further refine the collision by setting a VSOFTLOCKCOLL + * flag and then only waking up waiters when the colliding vnode is + * released. However, this sort of collision does not happen often + * enough for such an addition to yield any improvement in performance. + */ + +int +vsetsoftlock(struct vnode *vp) +{ + int s; + int error = 0; + + s = splbio(); + if (vp->v_flag & VSOFTLOCK) + error = EAGAIN; + else + vp->v_flag |= VSOFTLOCK; + splx(s); + return(error); +} + +void +vclearsoftlock(struct vnode *vp) +{ + int s; + + s = splbio(); + vp->v_flag &= ~VSOFTLOCK; + splx(s); + if (++vagain_count == 0) + vagain_count = 1; + if (vagain_waiting) { + vagain_waiting = 0; + wakeup(&vagain_waiting); + } +} + +void +vagain(int *gc) +{ + while (*gc && *gc == vagain_count) { + vagain_waiting = 1; + tsleep(&vagain_waiting, PVFS, "vagain", 0); + } + *gc = vagain_count; +} + Index: kern/vfs_syscalls.c =================================================================== RCS file: /home/ncvs/src/sys/kern/vfs_syscalls.c,v retrieving revision 1.151.2.9 diff -u -r1.151.2.9 vfs_syscalls.c --- kern/vfs_syscalls.c 2001/08/12 10:48:00 1.151.2.9 +++ kern/vfs_syscalls.c 2001/10/02 22:19:22 @@ -2631,12 +2631,18 @@ register struct vnode *tvp, *fvp, *tdvp; struct nameidata fromnd, tond; int error; + int gc = 0; +again: + vagain(&gc); bwillwrite(); - NDINIT(&fromnd, DELETE, WANTPARENT | SAVESTART, UIO_USERSPACE, - SCARG(uap, from), p); - if ((error = namei(&fromnd)) != 0) + NDINIT(&fromnd, DELETE, WANTPARENT | SAVESTART | SOFTLOCKLEAF, + UIO_USERSPACE, SCARG(uap, from), p); + if ((error = namei(&fromnd)) != 0) { + if (error == EAGAIN) + goto again; return (error); + } fvp = fromnd.ni_vp; NDINIT(&tond, RENAME, LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART | NOOBJ, UIO_USERSPACE, SCARG(uap, to), p); @@ -2646,7 +2652,7 @@ /* Translate error code for rename("dir1", "dir2/."). */ if (error == EISDIR && fvp->v_type == VDIR) error = EINVAL; - NDFREE(&fromnd, NDF_ONLY_PNBUF); + NDFREE(&fromnd, NDF_ONLY_PNBUF & NDF_ONLY_SOFTLOCKLEAF); vrele(fromnd.ni_dvp); vrele(fvp); goto out1; @@ -2683,12 +2689,14 @@ if (tvp) { VOP_LEASE(tvp, p, p->p_ucred, LEASE_WRITE); } + fromnd.ni_cnd.cn_flags &= ~SOFTLOCKLEAF; /* XXX hack */ error = VOP_RENAME(fromnd.ni_dvp, fromnd.ni_vp, &fromnd.ni_cnd, tond.ni_dvp, tond.ni_vp, &tond.ni_cnd); - NDFREE(&fromnd, NDF_ONLY_PNBUF); + fromnd.ni_cnd.cn_flags |= SOFTLOCKLEAF; + NDFREE(&fromnd, NDF_ONLY_PNBUF & NDF_ONLY_SOFTLOCKLEAF); NDFREE(&tond, NDF_ONLY_PNBUF); } else { - NDFREE(&fromnd, NDF_ONLY_PNBUF); + NDFREE(&fromnd, NDF_ONLY_PNBUF & NDF_ONLY_SOFTLOCKLEAF); NDFREE(&tond, NDF_ONLY_PNBUF); if (tdvp == tvp) vrele(tdvp); @@ -2782,13 +2790,19 @@ { register struct vnode *vp; int error; + int gc = 0; struct nameidata nd; +again: + vagain(&gc); bwillwrite(); - NDINIT(&nd, DELETE, LOCKPARENT | LOCKLEAF, UIO_USERSPACE, - SCARG(uap, path), p); - if ((error = namei(&nd)) != 0) + NDINIT(&nd, DELETE, LOCKPARENT | LOCKLEAF | SOFTLOCKLEAF, + UIO_USERSPACE, SCARG(uap, path), p); + if ((error = namei(&nd)) != 0) { + if (error == EAGAIN) + goto again; return (error); + } vp = nd.ni_vp; if (vp->v_type != VDIR) { error = ENOTDIR; @@ -2812,7 +2826,7 @@ error = VOP_RMDIR(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd); } out: - NDFREE(&nd, NDF_ONLY_PNBUF); + NDFREE(&nd, NDF_ONLY_PNBUF & NDF_ONLY_SOFTLOCKLEAF); if (nd.ni_dvp == vp) vrele(nd.ni_dvp); else Index: miscfs/union/union_vnops.c =================================================================== RCS file: /home/ncvs/src/sys/miscfs/union/Attic/union_vnops.c,v retrieving revision 1.72 diff -u -r1.72 union_vnops.c --- miscfs/union/union_vnops.c 1999/12/15 23:02:14 1.72 +++ miscfs/union/union_vnops.c 2001/10/02 22:48:40 @@ -1380,6 +1380,7 @@ struct vnode *fvp = ap->a_fvp; struct vnode *tdvp = ap->a_tdvp; struct vnode *tvp = ap->a_tvp; + int setsoftlock = 0; /* * Figure out what fdvp to pass to our upper or lower vnode. If we @@ -1456,6 +1457,16 @@ fvp = un->un_uppervp; VREF(fvp); vrele(ap->a_fvp); + if ((error = vsetsoftlock(fvp)) != 0) { + /* + * XXX for now do not support a retry request + */ + if (error == EAGAIN) + error = EINVAL; + goto bad; + } + setsoftlock = 1; + VREF(fvp); /* for softlock */ } /* @@ -1505,11 +1516,15 @@ } /* - * VOP_RENAME releases/vputs prior to returning, so we have no - * cleanup to do. + * VOP_RENAME releases/vputs prior to returning, except for the + * SOFTLOCK, so we have minimal cleanup to do. */ - - return (VOP_RENAME(fdvp, fvp, ap->a_fcnp, tdvp, tvp, ap->a_tcnp)); + error = VOP_RENAME(fdvp, fvp, ap->a_fcnp, tdvp, tvp, ap->a_tcnp); + if (setsoftlock) { + vclearsoftlock(fvp); + vrele(fvp); + } + return(error); /* * Error. We still have to release / vput the various elements. @@ -1517,8 +1532,13 @@ bad: vrele(fdvp); - if (fvp) + if (fvp) { + if (setsoftlock) { + vclearsoftlock(fvp); + vrele(fvp); + } vrele(fvp); + } vput(tdvp); if (tvp != NULLVP) { if (tvp != tdvp) @@ -1583,7 +1603,16 @@ if ((uppervp = union_lock_upper(un, p)) != NULLVP) { if (union_dowhiteout(un, cnp->cn_cred, p)) cnp->cn_flags |= DOWHITEOUT; - error = VOP_RMDIR(upperdvp, uppervp, ap->a_cnp); + if ((error = vsetsoftlock(upperdvp)) != 0) { + /* + * XXX for now do not support a retry request + */ + if (error == EAGAIN) + error = EINVAL; + + } else { + error = VOP_RMDIR(upperdvp, uppervp, ap->a_cnp); + } union_unlock_upper(uppervp, p); } else { error = union_mkwhiteout( Index: nfs/nfs_serv.c =================================================================== RCS file: /home/ncvs/src/sys/nfs/Attic/nfs_serv.c,v retrieving revision 1.93 diff -u -r1.93 nfs_serv.c --- nfs/nfs_serv.c 1999/12/18 19:20:05 1.93 +++ nfs/nfs_serv.c 2001/10/02 23:03:34 @@ -2194,7 +2194,7 @@ saved_uid = cred->cr_uid; fromnd.ni_cnd.cn_cred = cred; fromnd.ni_cnd.cn_nameiop = DELETE; - fromnd.ni_cnd.cn_flags = WANTPARENT | SAVESTART; + fromnd.ni_cnd.cn_flags = WANTPARENT | SAVESTART | SOFTLOCKLEAF; error = nfs_namei(&fromnd, ffhp, len, slp, nam, &md, &dpos, &fdirp, procp, (nfsd->nd_flag & ND_KERBAUTH), FALSE); if (fdirp) { @@ -2207,6 +2207,8 @@ } } if (error) { + if (error == EAGAIN) /* XXX EAGAIN not yet supported */ + error = EINVAL; nfsm_reply(2 * NFSX_WCCDATA(v3)); nfsm_srvwcc_data(fdirfor_ret, &fdirfor, fdiraft_ret, &fdiraft); nfsm_srvwcc_data(tdirfor_ret, &tdirfor, tdiraft_ret, &tdiraft); @@ -2300,13 +2302,17 @@ if (tvp) { nqsrv_getl(tvp, ND_WRITE); } + fromnd.ni_cnd.cn_flags &= ~SOFTLOCKLEAF; /* XXX hack */ error = VOP_RENAME(fromnd.ni_dvp, fromnd.ni_vp, &fromnd.ni_cnd, tond.ni_dvp, tond.ni_vp, &tond.ni_cnd); - fromnd.ni_dvp = NULL; - fromnd.ni_vp = NULL; - tond.ni_dvp = NULL; - tond.ni_vp = NULL; - if (error) { + /* XXX this is a @!#$!@#$@#$ mess */ + vclearsoftlock(fromnd.ni_vp); /* cleanup after SOFTLOCKLEAF */ + vrele(fromnd.ni_vp); /* cleanup after SOFTLOCKLEAF */ + fromnd.ni_dvp = NULL; /* clear for fall through */ + fromnd.ni_vp = NULL; /* clear for fall through */ + tond.ni_dvp = NULL; /* clear for fall through */ + tond.ni_vp = NULL; /* clear for fall through */ + if (error) { /* clear for fall through */ fromnd.ni_cnd.cn_flags &= ~HASBUF; tond.ni_cnd.cn_flags &= ~HASBUF; } @@ -2354,7 +2360,7 @@ vrele(fdirp); if (fromnd.ni_startdir) vrele(fromnd.ni_startdir); - NDFREE(&fromnd, NDF_ONLY_PNBUF); + NDFREE(&fromnd, NDF_ONLY_PNBUF & NDF_ONLY_SOFTLOCKLEAF); if (fromnd.ni_dvp) vrele(fromnd.ni_dvp); if (fromnd.ni_vp) @@ -2826,7 +2832,7 @@ nfsm_srvnamesiz(len); nd.ni_cnd.cn_cred = cred; nd.ni_cnd.cn_nameiop = DELETE; - nd.ni_cnd.cn_flags = LOCKPARENT | LOCKLEAF; + nd.ni_cnd.cn_flags = LOCKPARENT | LOCKLEAF | SOFTLOCKLEAF; error = nfs_namei(&nd, fhp, len, slp, nam, &md, &dpos, &dirp, procp, (nfsd->nd_flag & ND_KERBAUTH), FALSE); if (dirp) { @@ -2839,6 +2845,8 @@ } } if (error) { + if (error == EAGAIN) /* XXX EAGAIN not yet supported */ + error = EINVAL; nfsm_reply(NFSX_WCCDATA(v3)); nfsm_srvwcc_data(dirfor_ret, &dirfor, diraft_ret, &diraft); error = 0; @@ -2871,7 +2879,7 @@ nqsrv_getl(vp, ND_WRITE); error = VOP_RMDIR(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd); } - NDFREE(&nd, NDF_ONLY_PNBUF); + NDFREE(&nd, NDF_ONLY_PNBUF & NDF_ONLY_SOFTLOCKLEAF); if (dirp) diraft_ret = VOP_GETATTR(dirp, &diraft, cred, procp); @@ -2883,7 +2891,7 @@ /* fall through */ nfsmout: - NDFREE(&nd, NDF_ONLY_PNBUF); + NDFREE(&nd, NDF_ONLY_PNBUF & NDF_ONLY_SOFTLOCKLEAF); if (dirp) vrele(dirp); if (nd.ni_dvp) { Index: nfs/nfs_subs.c =================================================================== RCS file: /home/ncvs/src/sys/nfs/Attic/nfs_subs.c,v retrieving revision 1.90.2.1 diff -u -r1.90.2.1 nfs_subs.c --- nfs/nfs_subs.c 2000/10/28 16:27:27 1.90.2.1 +++ nfs/nfs_subs.c 2001/10/02 23:01:03 @@ -1729,6 +1729,10 @@ out: if (error) { zfree(namei_zone, cnp->cn_pnbuf); + if ((cnp->cn_flags & SOFTLOCKLEAF) && ndp->ni_vp) { + vclearsoftlock(ndp->ni_vp); + vrele(ndp->ni_vp); + } ndp->ni_vp = NULL; ndp->ni_dvp = NULL; ndp->ni_startdir = NULL; Index: sys/namei.h =================================================================== RCS file: /home/ncvs/src/sys/sys/namei.h,v retrieving revision 1.29.2.2 diff -u -r1.29.2.2 namei.h --- sys/namei.h 2001/09/30 21:12:54 1.29.2.2 +++ sys/namei.h 2001/10/02 18:43:34 @@ -114,7 +114,7 @@ #define FOLLOW 0x0040 /* follow symbolic links */ #define NOOBJ 0x0080 /* don't create object */ #define NOFOLLOW 0x0000 /* do not follow symbolic links (pseudo) */ -#define MODMASK 0x00fc /* mask of operational modifiers */ +#define MODMASK (0x00fc|SOFTLOCKLEAF) /* * Namei parameter descriptors. * @@ -143,7 +143,7 @@ #define WILLBEDIR 0x080000 /* new files will be dirs; allow trailing / */ #define ISUNICODE 0x100000 /* current component name is unicode*/ #define PDIRUNLOCK 0x200000 /* file system lookup() unlocked parent dir */ -#define PARAMASK 0x1fff00 /* mask of parameter descriptors */ +#define SOFTLOCKLEAF 0x400000 /* set VSOFTLOCK in vnode */ /* * Initialization of an nameidata structure. */ @@ -180,7 +180,9 @@ #define NDF_NO_VP_PUT 0x0000000c #define NDF_NO_STARTDIR_RELE 0x00000010 #define NDF_NO_FREE_PNBUF 0x00000020 +#define NDF_NO_FREE_SOFTLOCKLEAF 0x00000040 #define NDF_ONLY_PNBUF (~NDF_NO_FREE_PNBUF) +#define NDF_ONLY_SOFTLOCKLEAF (~NDF_NO_FREE_SOFTLOCKLEAF) void NDFREE __P((struct nameidata *, const uint)); Index: sys/vnode.h =================================================================== RCS file: /home/ncvs/src/sys/sys/vnode.h,v retrieving revision 1.111.2.12 diff -u -r1.111.2.12 vnode.h --- sys/vnode.h 2001/09/22 09:21:48 1.111.2.12 +++ sys/vnode.h 2001/10/02 22:54:37 @@ -169,6 +169,7 @@ #define VTBFREE 0x100000 /* This vnode is on the to-be-freelist */ #define VONWORKLST 0x200000 /* On syncer work-list */ #define VMOUNT 0x400000 /* Mount in progress */ +#define VSOFTLOCK 0x800000 /* Soft lock on vnode (directory entry) */ /* * Vnode attributes. A field value of VNOVAL represents a field whose value @@ -630,6 +632,9 @@ void vrele __P((struct vnode *vp)); void vref __P((struct vnode *vp)); void vbusy __P((struct vnode *vp)); +int vsetsoftlock __P((struct vnode *vp)); +void vclearsoftlock __P((struct vnode *vp)); +void vagain __P((int *gc)); extern vop_t **default_vnodeop_p; extern vop_t **spec_vnodeop_p; Index: ufs/ufs/ufs_vnops.c =================================================================== RCS file: /home/ncvs/src/sys/ufs/ufs/ufs_vnops.c,v retrieving revision 1.131.2.4 diff -u -r1.131.2.4 ufs_vnops.c --- ufs/ufs/ufs_vnops.c 2001/08/28 17:28:49 1.131.2.4 +++ ufs/ufs/ufs_vnops.c 2001/10/02 19:16:40 @@ -997,13 +997,11 @@ * Avoid ".", "..", and aliases of "." for obvious reasons. */ if ((fcnp->cn_namelen == 1 && fcnp->cn_nameptr[0] == '.') || - dp == ip || (fcnp->cn_flags | tcnp->cn_flags) & ISDOTDOT || - (ip->i_flag & IN_RENAME)) { + dp == ip || (fcnp->cn_flags | tcnp->cn_flags) & ISDOTDOT) { VOP_UNLOCK(fvp, 0, p); error = EINVAL; goto abortit; } - ip->i_flag |= IN_RENAME; oldparent = dp->i_number; doingdirectory = 1; } @@ -1224,13 +1222,12 @@ return (0); } /* - * Ensure that the directory entry still exists and has not - * changed while the new name has been entered. If the source is - * a file then the entry may have been unlinked or renamed. In - * either case there is no further work to be done. If the source - * is a directory then it cannot have been rmdir'ed; the IN_RENAME - * flag ensures that it cannot be moved by another rename or removed - * by a rmdir. + * The source has been VSOFTLOCKd by the caller, preventing rename + * or rmdir, but not prevent unlink. At the moment we allow the unlink + * race and so do not panic if relookup fails or the inodes do not + * match, because files can be hardlinked and our flag is on the vnode + * rather then the directory entry in the namei cache. But source + * directories must still exist. */ if (xp != ip) { if (doingdirectory) @@ -1248,7 +1245,6 @@ cache_purge(fdvp); } error = ufs_dirremove(fdvp, xp, fcnp->cn_flags, 0); - xp->i_flag &= ~IN_RENAME; } VN_KNOTE(fvp, NOTE_RENAME); if (dp) @@ -1263,13 +1259,10 @@ vput(ITOV(xp)); vput(ITOV(dp)); out: - if (doingdirectory) - ip->i_flag &= ~IN_RENAME; if (vn_lock(fvp, LK_EXCLUSIVE, p) == 0) { ip->i_effnlink--; ip->i_nlink--; ip->i_flag |= IN_CHANGE; - ip->i_flag &= ~IN_RENAME; if (DOINGSOFTDEP(fvp)) softdep_change_linkcnt(ip); vput(fvp); @@ -1503,18 +1496,16 @@ dp = VTOI(dvp); /* - * Do not remove a directory that is in the process of being renamed. * Verify the directory is empty (and valid). Rmdir ".." will not be * valid since ".." will contain a reference to the current directory * and thus be non-empty. Do not allow the removal of mounted on * directories (this can happen when an NFS exported filesystem * tries to remove a locally mounted on directory). + * + * The syscall wrapper already protects us from attempting to remove + * a directory that is undergoing a rename. */ error = 0; - if (ip->i_flag & IN_RENAME) { - error = EINVAL; - goto out; - } if (ip->i_effnlink != 2 || !ufs_dirempty(ip, dp->i_number, cnp->cn_cred)) { error = ENOTEMPTY; To Unsubscribe: send mail to majordomo@FreeBSD.org with "unsubscribe freebsd-hackers" in the body of the message
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?200110022312.f92NCpp60948>