From owner-freebsd-bugs Wed Aug 16 12:40:30 2000 Delivered-To: freebsd-bugs@freebsd.org Received: from freefall.freebsd.org (freefall.FreeBSD.ORG [204.216.27.21]) by hub.freebsd.org (Postfix) with ESMTP id 74E6737B817 for ; Wed, 16 Aug 2000 12:40:03 -0700 (PDT) (envelope-from gnats@FreeBSD.org) Received: (from gnats@localhost) by freefall.freebsd.org (8.9.3/8.9.2) id MAA01501; Wed, 16 Aug 2000 12:40:03 -0700 (PDT) (envelope-from gnats@FreeBSD.org) Date: Wed, 16 Aug 2000 12:40:03 -0700 (PDT) Message-Id: <200008161940.MAA01501@freefall.freebsd.org> To: freebsd-bugs@FreeBSD.org Cc: From: Tor.Egge@fast.no Subject: Re: kern/20632: stacking mount_null causes an error: mount_null: Resource deadlock avoided Reply-To: Tor.Egge@fast.no Sender: owner-freebsd-bugs@FreeBSD.ORG Precedence: bulk X-Loop: FreeBSD.org The following reply was made to PR kern/20632; it has been noted by GNATS. From: Tor.Egge@fast.no To: sheldonh@uunet.co.za Cc: akr@m17n.org, semen@iclub.nsu.ru Subject: Re: kern/20632: stacking mount_null causes an error: mount_null: Resource deadlock avoided Date: Wed, 16 Aug 2000 20:52:29 +0200 > > Unfortunately the patch doesn't fix the problem. > > I've asked Ustimenko Semen , the author of the > patch, to take a look at your PR. Here is yet another nullfs patch for -current. It lacks some of the features in Ustimenko's patch (e.g. support for nfs exporting nfs mounted file systems), but it avoids the aliasing problem. Index: sbin/mount_null/mount_null.c =================================================================== RCS file: /home/ncvs/src/sbin/mount_null/mount_null.c,v retrieving revision 1.14 diff -u -r1.14 mount_null.c --- sbin/mount_null/mount_null.c 2000/07/28 11:54:08 1.14 +++ sbin/mount_null/mount_null.c 2000/08/06 19:16:28 @@ -101,9 +101,11 @@ (void)checkpath(argv[0], target); (void)checkpath(argv[1], source); +#if 0 if (subdir(target, source) || subdir(source, target)) errx(EX_USAGE, "%s (%s) and %s are not distinct paths", argv[0], target, argv[1]); +#endif args.target = target; Index: sys/kern/vfs_subr.c =================================================================== RCS file: /home/ncvs/src/sys/kern/vfs_subr.c,v retrieving revision 1.267 diff -u -r1.267 vfs_subr.c --- sys/kern/vfs_subr.c 2000/07/24 05:28:29 1.267 +++ sys/kern/vfs_subr.c 2000/08/06 19:16:46 @@ -711,7 +711,7 @@ */ simple_lock(&vp->v_interlock); object = vp->v_object; - if (object != NULL) { + if (object != NULL && (void *) vp == object->handle) { vm_object_page_remove(object, 0, 0, (flags & V_SAVE) ? TRUE : FALSE); } @@ -841,6 +841,8 @@ int s; KASSERT(bp->b_vp == NULL, ("bgetvp: not free")); + if (vp->v_tag == VT_NULL) + panic("bgetvp: null node"); vhold(vp); bp->b_vp = vp; @@ -1190,6 +1192,8 @@ } vn_syncer_add_to_worklist(newvp, delay); } + if (newvp->v_tag == VT_NULL) + panic("reassignbuf: null node dirty list"); bp->b_xflags |= BX_VNDIRTY; tbp = TAILQ_FIRST(listheadp); if (tbp == NULL || @@ -1243,6 +1247,8 @@ TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs); } } else { + if (newvp->v_tag == VT_NULL) + panic("reassignbuf: null node clean list"); bp->b_xflags |= BX_VNCLEAN; TAILQ_INSERT_TAIL(&newvp->v_cleanblkhd, bp, b_vnbufs); if ((newvp->v_flag & VONWORKLST) && @@ -1683,7 +1689,7 @@ vinvalbuf(vp, 0, NOCRED, p, 0, 0); } - if ((obj = vp->v_object) != NULL) { + if ((obj = vp->v_object) != NULL && (void *) vp == obj->handle) { if (obj->ref_count == 0) { /* * vclean() may be called twice. The first time @@ -2529,7 +2535,8 @@ simple_lock(&vp->v_interlock); if (vp->v_object && - (vp->v_object->flags & OBJ_MIGHTBEDIRTY)) { + (void *) vp == vp->v_object->handle && + (vp->v_object->flags & OBJ_MIGHTBEDIRTY)) { if (!vget(vp, LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY | LK_NOOBJ, curproc)) { if (vp->v_object) { @@ -2613,6 +2620,21 @@ s = splbio(); simple_lock(&vnode_free_list_slock); + if ((vp->v_flag & VFREE) != 0) + panic("vfree called on doomed node"); + if ((vp->v_flag & VFREE) != 0) + panic("vfree called on free node"); + if (vp->v_holdcnt != 0) + panic("vfree called with holdcnt=%d (>0)", vp->v_holdcnt); + if (vp->v_usecount != 0) + panic("vfree called with usecnt=%d (>0)", vp->v_usecount); + if (vp->v_object != NULL && + vp->v_object->handle == (void *) vp && + (vp->v_object->resident_page_count != 0 || + vp->v_object->ref_count != 0)) + panic("vfree called with object RPC: %d, RC: %d", + vp->v_object->resident_page_count, + vp->v_object->ref_count); KASSERT((vp->v_flag & VFREE) == 0, ("vnode already free")); if (vp->v_flag & VAGE) { TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist); Index: sys/miscfs/nullfs/null.h =================================================================== RCS file: /home/ncvs/src/sys/miscfs/nullfs/null.h,v retrieving revision 1.13 diff -u -r1.13 null.h --- sys/miscfs/nullfs/null.h 2000/05/26 02:04:58 1.13 +++ sys/miscfs/nullfs/null.h 2000/08/06 19:16:47 @@ -55,7 +55,15 @@ LIST_ENTRY(null_node) null_hash; /* Hash list */ struct vnode *null_lowervp; /* VREFed once */ struct vnode *null_vnode; /* Back pointer */ + int flags; /* flags (see below) */ + pid_t lockholder; /* owner of exclusive lock */ }; + +/* Null node flags */ + +#define NN_LOCK 0x01 +#define NN_WANT 0x02 +#define NN_FSYNC_SKIPPED 0x04 extern int nullfs_init __P((struct vfsconf *vfsp)); extern int null_node_create __P((struct mount *mp, struct vnode *target, struct vnode **vpp)); Index: sys/miscfs/nullfs/null_subr.c =================================================================== RCS file: /home/ncvs/src/sys/miscfs/nullfs/null_subr.c,v retrieving revision 1.23 diff -u -r1.23 null_subr.c --- sys/miscfs/nullfs/null_subr.c 2000/05/26 02:04:59 1.23 +++ sys/miscfs/nullfs/null_subr.c 2000/08/06 19:16:47 @@ -45,6 +45,8 @@ #include #include #include +#include +#include #define LOG2_SIZEVNODE 7 /* log2(sizeof struct vnode) */ #define NNULLNODECACHE 16 @@ -94,6 +96,7 @@ struct null_node_hashhead *hd; struct null_node *a; struct vnode *vp; + int retry; /* * Find hash base, and then search the (two-way) linked @@ -102,6 +105,7 @@ * reference count (but NOT the lower vnode's VREF counter). */ hd = NULL_NHASH(lowervp); + retry = 0; loop: for (a = hd->lh_first; a != 0; a = a->null_hash.le_next) { if (a->null_lowervp == lowervp && NULLTOV(a)->v_mount == mp) { @@ -112,6 +116,9 @@ * the lower node. */ if (vget(vp, 0, p)) { + retry++; + if (retry < 2) + goto loop; printf ("null_node_find: vget failed.\n"); goto loop; }; @@ -138,6 +145,7 @@ struct null_node *xp; struct vnode *othervp, *vp; int error; + int waslocked; /* * Do the MALLOC before the getnewvnode since doing so afterward @@ -157,6 +165,8 @@ xp->null_vnode = vp; vp->v_data = xp; xp->null_lowervp = lowervp; + xp->flags = 0; + xp->lockholder = 0; /* * Before we insert our new node onto the hash chains, * check to see if someone else has beaten us to it. @@ -167,12 +177,41 @@ FREE(xp, M_TEMP); vp->v_type = VBAD; /* node is discarded */ vp->v_usecount = 0; /* XXX */ + if (VSHOULDFREE(vp)) + vfree(vp); *vpp = othervp; return 0; }; VREF(lowervp); /* Extra VREF will be vrele'd in null_node_create */ hd = NULL_NHASH(lowervp); LIST_INSERT_HEAD(hd, xp, null_hash); + /* + * Create the lower VM object, if needed + */ + if ((lowervp->v_type == VREG) && + ((lowervp->v_object == NULL) || + (lowervp->v_object->flags & OBJ_DEAD))) { + waslocked = VOP_ISLOCKED(lowervp, NULL); /* XXX: Wrong */ + if (!waslocked) { + simple_lock(&lowervp->v_interlock); + vn_lock(lowervp, + LK_EXCLUSIVE | LK_INTERLOCK | LK_RETRY, + curproc); + } + vfs_object_create(lowervp, curproc, curproc->p_ucred); + if (!waslocked) { + simple_lock(&lowervp->v_interlock); + VOP_UNLOCK(lowervp, LK_INTERLOCK, curproc); + } + + } + vp->v_object = lowervp->v_object; /* XXX: share object */ + if (vp->v_object != NULL) { + if ((lowervp->v_flag & VOBJBUF) != 0) + vp->v_flag |= VOBJBUF; + if ((lowervp->v_flag & VTEXT) != 0) + vp->v_flag |= VTEXT; + } return 0; } Index: sys/miscfs/nullfs/null_vfsops.c =================================================================== RCS file: /home/ncvs/src/sys/miscfs/nullfs/null_vfsops.c,v retrieving revision 1.37 diff -u -r1.37 null_vfsops.c --- sys/miscfs/nullfs/null_vfsops.c 2000/07/28 11:54:09 1.37 +++ sys/miscfs/nullfs/null_vfsops.c 2000/08/06 19:16:47 @@ -93,7 +93,6 @@ struct vnode *nullm_rootvp; struct null_mount *xmp; u_int size; - int isvnunlocked = 0; #ifdef DEBUG printf("nullfs_mount(mp = %p)\n", (void *)mp); @@ -115,46 +114,23 @@ return (error); /* - * Unlock lower node to avoid deadlock. - * (XXX) VOP_ISLOCKED is needed? - */ - if ((mp->mnt_vnodecovered->v_op == null_vnodeop_p) && - VOP_ISLOCKED(mp->mnt_vnodecovered, NULL)) { - VOP_UNLOCK(mp->mnt_vnodecovered, 0, p); - isvnunlocked = 1; - } - /* * Find lower node */ - NDINIT(ndp, LOOKUP, FOLLOW|WANTPARENT|LOCKLEAF, - UIO_USERSPACE, args.target, p); + NDINIT(ndp, LOOKUP, FOLLOW | LOCKLEAF, + UIO_USERSPACE, args.target, p); error = namei(ndp); - /* - * Re-lock vnode. - */ - if (isvnunlocked && !VOP_ISLOCKED(mp->mnt_vnodecovered, NULL)) - vn_lock(mp->mnt_vnodecovered, LK_EXCLUSIVE | LK_RETRY, p); - if (error) return (error); NDFREE(ndp, NDF_ONLY_PNBUF); - /* - * Sanity check on lower vnode - */ lowerrootvp = ndp->ni_vp; - vrele(ndp->ni_dvp); - ndp->ni_dvp = NULLVP; - /* - * Check multi null mount to avoid `lock against myself' panic. + * Sanity check on lower vnode */ - if (lowerrootvp == VTONULL(mp->mnt_vnodecovered)->null_lowervp) { -#ifdef DEBUG - printf("nullfs_mount: multi null mount?\n"); -#endif - return (EDEADLK); + if (VOP_ISLOCKED(mp->mnt_vnodecovered, NULL)) { + vput(lowerrootvp); + return EBUSY; } xmp = (struct null_mount *) malloc(sizeof(struct null_mount), @@ -171,19 +147,18 @@ */ error = null_node_create(mp, lowerrootvp, &vp); /* - * Unlock the node (either the lower or the alias) - */ - VOP_UNLOCK(vp, 0, p); - /* * Make sure the node alias worked */ if (error) { - vrele(lowerrootvp); + vput(lowerrootvp); free(xmp, M_NULLFSMNT); /* XXX */ return (error); } - /* + * Unlock the node (either the lower or the alias) + */ + VOP_UNLOCK(vp, 0, p); + /* * Keep a held reference to the root vnode. * It is vrele'd in nullfs_unmount. */ @@ -269,7 +244,8 @@ /* * And blow it away for future re-use */ - vgone(nullm_rootvp); + if (nullm_rootvp->v_mount == mp) + vgone(nullm_rootvp); /* * Finally, throw away the null_mount structure */ @@ -297,18 +273,7 @@ */ vp = MOUNTTONULLMOUNT(mp)->nullm_rootvp; VREF(vp); - if (VOP_ISLOCKED(vp, NULL)) { - /* - * XXX - * Should we check type of node? - */ -#ifdef DEBUG - printf("nullfs_root: multi null mount?\n"); -#endif - vrele(vp); - return (EDEADLK); - } else - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); *vpp = vp; return 0; } Index: sys/miscfs/nullfs/null_vnops.c =================================================================== RCS file: /home/ncvs/src/sys/miscfs/nullfs/null_vnops.c,v retrieving revision 1.39 diff -u -r1.39 null_vnops.c --- sys/miscfs/nullfs/null_vnops.c 2000/04/18 15:15:23 1.39 +++ sys/miscfs/nullfs/null_vnops.c 2000/08/06 19:16:51 @@ -182,7 +182,12 @@ #include #include #include +#include +#include +#include #include +#include +#include static int null_bug_bypass = 0; /* for debugging: enables bypass printf'ing */ SYSCTL_INT(_debug, OID_AUTO, nullfs_bug_bypass, CTLFLAG_RW, @@ -191,12 +196,15 @@ static int null_access __P((struct vop_access_args *ap)); static int null_getattr __P((struct vop_getattr_args *ap)); static int null_inactive __P((struct vop_inactive_args *ap)); +static int null_islocked __P((struct vop_islocked_args *ap)); static int null_lock __P((struct vop_lock_args *ap)); static int null_lookup __P((struct vop_lookup_args *ap)); static int null_print __P((struct vop_print_args *ap)); static int null_reclaim __P((struct vop_reclaim_args *ap)); static int null_setattr __P((struct vop_setattr_args *ap)); static int null_unlock __P((struct vop_unlock_args *ap)); +static int null_fsync __P((struct vop_fsync_args *ap)); +static int null_getwritemount __P((struct vop_getwritemount_args *ap)); /* * This is the 10-Apr-92 bypass routine. @@ -236,6 +244,7 @@ struct vnode ***vppp; struct vnodeop_desc *descp = ap->a_desc; int reles, i; + struct mount *old_mount; if (null_bug_bypass) printf ("null_bypass: %s\n", descp->vdesc_name); @@ -288,6 +297,11 @@ */ error = VCALL(*(vps_p[0]), descp->vdesc_offset, ap); + if (descp->vdesc_vp_offsets[0] != VDESC_NO_OFFSET && + old_vps[0] != NULLVP) + old_mount = old_vps[0]->v_mount; + else + old_mount = NULL; /* * Maintain the illusion of call-by-value * by restoring vnodes in the argument structure @@ -323,7 +337,7 @@ vppp = VOPARG_OFFSETTO(struct vnode***, descp->vdesc_vpp_offset,ap); if (*vppp) - error = null_node_create(old_vps[0]->v_mount, **vppp, *vppp); + error = null_node_create(old_mount, **vppp, *vppp); } out: @@ -496,10 +510,31 @@ struct proc *a_p; } */ *ap; { + struct null_node *xp; + pid_t pid; + xp = VTONULL(ap->a_vp); + if ((xp->flags & NN_LOCK) != 0) { + pid = (ap->a_p == NULL) ? LK_KERNPROC : ap->a_p->p_pid; + if (xp->lockholder == pid) { + panic("null_lock: recursive lock"); + } else { + if ((ap->a_flags & LK_NOWAIT) != 0) + return EBUSY; + while ((xp->flags & NN_LOCK) != 0) { + xp->flags |= NN_WANT; + tsleep(xp, PINOD, "nnlock", 0); + } + } + } + vop_nolock(ap); - if ((ap->a_flags & LK_TYPE_MASK) == LK_DRAIN) - return (0); + if ((ap->a_flags & LK_TYPE_MASK) == LK_DRAIN) { + pid = (ap->a_p == NULL) ? LK_KERNPROC : ap->a_p->p_pid; + xp->flags |= NN_LOCK; + xp->lockholder = pid; + return 0; + } ap->a_flags &= ~LK_INTERLOCK; return (null_bypass((struct vop_generic_args *)ap)); } @@ -517,28 +552,58 @@ struct proc *a_p; } */ *ap; { + struct vnode *vp = ap->a_vp; + struct null_node *xp; + pid_t pid; + + xp = VTONULL(vp); + if ((xp->flags & NN_LOCK) != 0) { + pid = (ap->a_p == NULL) ? LK_KERNPROC : ap->a_p->p_pid; + if (xp->lockholder == pid) { + xp->flags &= ~ (NN_LOCK | NN_FSYNC_SKIPPED); + xp->lockholder = 0; + if ((xp->flags & NN_WANT) != 0) { + xp->flags &= ~NN_WANT; + wakeup(xp); + } + return 0; + } + } + vop_nounlock(ap); ap->a_flags &= ~LK_INTERLOCK; return (null_bypass((struct vop_generic_args *)ap)); } static int +null_islocked(ap) + struct vop_islocked_args /* { + struct vnode *a_vp; + } */ *ap; +{ + struct vnode *vp; + struct null_node *xp; + + vp = ap->a_vp; + xp = VTONULL(vp); + if (xp == NULL || (xp->flags & NN_LOCK) != 0) + return LK_EXCLUSIVE; + + return (null_bypass((struct vop_generic_args *)ap)); +} + +static int null_inactive(ap) struct vop_inactive_args /* { struct vnode *a_vp; struct proc *a_p; } */ *ap; { - struct vnode *vp = ap->a_vp; - struct null_node *xp = VTONULL(vp); - struct vnode *lowervp = xp->null_lowervp; /* * Do nothing (and _don't_ bypass). * Wait to vrele lowervp until reclaim, * so that until then our null_node is in the * cache and reusable. - * We still have to tell the lower layer the vnode - * is now inactive though. * * NEEDSWORK: Someday, consider inactive'ing * the lowervp and then trying to reactivate it @@ -546,8 +611,8 @@ * like they do in the name lookup cache code. * That's too much work for now. */ - VOP_INACTIVE(lowervp, ap->a_p); VOP_UNLOCK(ap->a_vp, 0, ap->a_p); + vrecycle(ap->a_vp, (struct simplelock *) 0, ap->a_p); return (0); } @@ -561,21 +626,85 @@ struct vnode *vp = ap->a_vp; struct null_node *xp = VTONULL(vp); struct vnode *lowervp = xp->null_lowervp; + vm_object_t object; /* * Note: in vop_reclaim, vp->v_op == dead_vnodeop_p, * so we can't call VOPs on ourself. */ + + object = vp->v_object; + if (object != lowervp->v_object) + panic("null_reclaim: wrong object"); + if (object != NULL) { + if ((void *) lowervp != object->handle) + panic("null_reclaim: wrong object handle"); + vp->v_object = NULL; + vp->v_flag &= ~(VTEXT|VOBJBUF); + if (lowervp->v_usecount < 1 + object->ref_count) + panic("null_reclaim: lowervp->v_usecount too low"); + } /* After this assignment, this node will not be re-used. */ xp->null_lowervp = NULLVP; LIST_REMOVE(xp, null_hash); FREE(vp->v_data, M_TEMP); vp->v_data = NULL; + if (lowervp->v_usecount < 1) + panic("null_reclaim: lowervp->v_usecount < 1"); vrele (lowervp); return (0); } static int +null_fsync(ap) + struct vop_fsync_args /* { + struct vnodeop_desc *a_desc; + struct vnode *a_vp; + struct ucred *a_cred; + int a_waitfor; + struct proc *a_p; + } */ *ap; +{ + struct vnode *vp; + struct null_node *xp; + pid_t pid; + + vp = ap->a_vp; + xp = VTONULL(vp); + + if (!TAILQ_EMPTY(&vp->v_dirtyblkhd)) + panic("dirty buffers on null vnode"); + if (!TAILQ_EMPTY(&vp->v_cleanblkhd)) + panic("clean buffers on null vnode"); + + if ((xp->flags & NN_LOCK) != 0) { + pid = (ap->a_p == NULL) ? LK_KERNPROC : ap->a_p->p_pid; + if (xp->lockholder == pid) + if (/* vp->v_object != NULL && + vp->v_object->handle != (void *) vp && */ + (xp->flags & NN_FSYNC_SKIPPED) == 0) { + xp->flags |= NN_FSYNC_SKIPPED; + return 0; + } + } + return (null_bypass((struct vop_generic_args *)ap)); +} + +int +null_getwritemount(ap) + struct vop_getwritemount_args /* { + struct vnode *a_vp; + struct mount **a_mpp; + } */ *ap; +{ + if (VTONULL(ap->a_vp) == NULL) { + return VOP_GETWRITEMOUNT(ap->a_vp->v_mount->mnt_vnodecovered, ap->a_mpp); + } else + return (null_bypass((struct vop_generic_args *) ap)); +} + + +static int null_print(ap) struct vop_print_args /* { struct vnode *a_vp; @@ -595,12 +724,15 @@ { &vop_access_desc, (vop_t *) null_access }, { &vop_getattr_desc, (vop_t *) null_getattr }, { &vop_inactive_desc, (vop_t *) null_inactive }, + { &vop_islocked_desc, (vop_t *) null_islocked }, { &vop_lock_desc, (vop_t *) null_lock }, { &vop_lookup_desc, (vop_t *) null_lookup }, { &vop_print_desc, (vop_t *) null_print }, { &vop_reclaim_desc, (vop_t *) null_reclaim }, { &vop_setattr_desc, (vop_t *) null_setattr }, { &vop_unlock_desc, (vop_t *) null_unlock }, + { &vop_fsync_desc, (vop_t *) null_fsync }, + { &vop_getwritemount_desc, (vop_t *) null_getwritemount }, { NULL, NULL } }; static struct vnodeopv_desc null_vnodeop_opv_desc = Index: sys/vm/vnode_pager.c =================================================================== RCS file: /home/ncvs/src/sys/vm/vnode_pager.c,v retrieving revision 1.124 diff -u -r1.124 vnode_pager.c --- sys/vm/vnode_pager.c 2000/07/11 22:07:57 1.124 +++ sys/vm/vnode_pager.c 2000/08/06 19:17:02 @@ -154,7 +154,7 @@ vp->v_usecount++; } else { object->ref_count++; - vp->v_usecount++; + ((struct vnode *) object->handle)->v_usecount++; } vp->v_flag &= ~VOLOCK; - Tor Egge To Unsubscribe: send mail to majordomo@FreeBSD.org with "unsubscribe freebsd-bugs" in the body of the message