Skip site navigation (1)Skip section navigation (2)
Date:      Tue, 18 Sep 2012 09:14:22 -0600
From:      "Justin T. Gibbs" <gibbs@scsiguy.com>
To:        fs@FreeBSD.org
Subject:   ZFS: Deadlock during vnode recycling
Message-ID:  <76CBA055-021F-458D-8978-E9A973D9B783@scsiguy.com>

next in thread | raw e-mail | index | archive | help
One of our systems became unresponsive due to an inability to recycle
vnodes.  We tracked this down to a deadlock in zfs_zget().  I've attached
the stack trace from the vnlru process to the end of this email.

We are currently testing the following patch. Since this issue is hard to
replicate I would appreciate review and feedback before I commit it to
FreeBSD.

Thanks,
Justin

Patch
===8<===8<===8<===8<===8<===8<===8<===8<===8<===8<===8<===8<===8<===8<
Change 635310 by justing@justing_ns1_spectrabsd on 2012/09/17 15:30:14

	For most vnode consumers of ZFS, the appropriate behavior
	when encountering a vnode that is in the process of being
	reclaimed is to wait for that process to complete and then
	allocate a new vnode.  This behavior is enforced in zfs_zget()
	by checking for the VI_DOOMED vnode flag.  In the case of
	the thread actually reclaiming the vnode, zfs_zget() must
	return the current vnode, otherwise a deadlock will occur.
	
	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h:
		Create a virtual znode field, z_reclaim_td, which is
		implemented as a macro that redirects to z_task.ta_context.
	
		z_task is only used by the reclaim code to perform the
		final cleanup of a znode in a secondary thread.  Since
		this can only occur after any calls to zfs_zget(), it
		is safe to reuse the ta_context field.
	
	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c:
		In zfs_freebsd_reclaim(), record curthread in the
		znode being reclaimed.
	
	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c:
		o Null out z_reclaim_td when znode_ts are constructed.
	
		o In zfs_zget(), return a "doomed vnode" if the current
		  thread is actively reclaiming this object.

Affected files ...

... //SpectraBSD/stable/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h#2 edit
... //SpectraBSD/stable/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c#3 edit
... //SpectraBSD/stable/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c#2 edit

Differences ...

==== //SpectraBSD/stable/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h#2 (text) ====

@@ -241,6 +241,7 @@
 	struct task	z_task;
 } znode_t;
 
+#define	z_reclaim_td z_task.ta_context
 
 /*
  * Convert between znode pointers and vnode pointers

==== //SpectraBSD/stable/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c#3 (text) ====

@@ -6083,6 +6083,13 @@
 
 	ASSERT(zp != NULL);
 
+ 	/*
+	 * Mark the znode so that operations that typically block
+	 * waiting for reclamation to complete will return the current,
+	 * "doomed vnode", for this thread.
+	 */
+	zp->z_reclaim_td = curthread;
+
 	/*
 	 * Destroy the vm object and flush associated pages.
 	 */

==== //SpectraBSD/stable/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c#2 (text) ====

@@ -158,6 +158,7 @@
 	zp->z_dirlocks = NULL;
 	zp->z_acl_cached = NULL;
 	zp->z_moved = 0;
+	zp->z_reclaim_td = NULL;
 	return (0);
 }
 
@@ -1192,7 +1193,8 @@
 				dying = 1;
 			else {
 				VN_HOLD(vp);
-				if ((vp->v_iflag & VI_DOOMED) != 0) {
+				if ((vp->v_iflag & VI_DOOMED) != 0 &&
+				    zp->z_reclaim_td != curthread) {
 					dying = 1;
 					/*
 					 * Don't VN_RELE() vnode here, because

vnlru_proc debug session
===8<===8<===8<===8<===8<===8<===8<===8<===8<===8<===8<===8<===8<===8<
#0  sched_switch (td=0xfffffe000f87b470, newtd=0xfffffe000d36c8e0, flags=Variable "flags" is not available.
) at /usr/src/sys/kern/sched_ule.c:1927
#1  0xffffffff8057f2b6 in mi_switch (flags=260, newtd=0x0) at /usr/src/sys/kern/kern_synch.c:485
#2  0xffffffff805b8982 in sleepq_timedwait (wchan=0xfffffe05c7515640, pri=0) at /usr/src/sys/kern/subr_sleepqueue.c:658
#3  0xffffffff8057f89f in _sleep (ident=0xfffffe05c7515640, lock=0x0, priority=Variable "priority" is not available.
) at /usr/src/sys/kern/kern_synch.c:246
#4  0xffffffff81093035 in zfs_zget (zfsvfs=0xfffffe001de4c000, obj_num=81963, zpp=0xffffff8c60dc51b0) at /usr/src/sys/modules/zfs/../../cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c:1224
#5  0xffffffff810bec9a in zfs_get_data (arg=0xfffffe001de4c000, lr=0xffffff820f5330b8, buf=0x0, zio=0xfffffe0584625000) at /usr/src/sys/modules/zfs/../../cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c:1142
#6  0xffffffff81096891 in zil_commit (zilog=0xfffffe001c382800, foid=Variable "foid" is not available.
) at /usr/src/sys/modules/zfs/../../cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c:1048
#7  0xffffffff810bceb0 in zfs_freebsd_write (ap=Variable "ap" is not available.
) at /usr/src/sys/modules/zfs/../../cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c:1083
#8  0xffffffff8081f112 in VOP_WRITE_APV (vop=0xffffffff8112cf40, a=0xffffff8c60dc5680) at vnode_if.c:951
#9  0xffffffff807b1a6b in vnode_pager_generic_putpages (vp=0xfffffe05c76171e0, ma=0xffffff8c60dc5890, bytecount=Variable "bytecount" is not available.
) at vnode_if.h:413
#10 0xffffffff807b1749 in vnode_pager_putpages (object=0xfffffe05e9ee9bc8, m=0xffffff8c60dc5890, count=61440, sync=1, rtvals=0xffffff8c60dc57a0) at vnode_if.h:1189
#11 0xffffffff807aaee0 in vm_pageout_flush (mc=0xffffff8c60dc5890, count=15, flags=1, mreq=0, prunlen=0xffffff8c60dc594c, eio=0xffffff8c60dc59c0) at vm_pager.h:145
#12 0xffffffff807a3da3 in vm_object_page_collect_flush (object=Variable "object" is not available.
) at /usr/src/sys/vm/vm_object.c:936
#13 0xffffffff807a3f23 in vm_object_page_clean (object=0xfffffe05e9ee9bc8, start=Variable "start" is not available.
) at /usr/src/sys/vm/vm_object.c:861
#14 0xffffffff807a42d4 in vm_object_terminate (object=0xfffffe05e9ee9bc8) at /usr/src/sys/vm/vm_object.c:706
#15 0xffffffff807b241e in vnode_destroy_vobject (vp=0xfffffe05c76171e0) at /usr/src/sys/vm/vnode_pager.c:167
#16 0xffffffff810beec7 in zfs_freebsd_reclaim (ap=Variable "ap" is not available.
) at /usr/src/sys/modules/zfs/../../cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c:6146
#17 0xffffffff806101e1 in vgonel (vp=0xfffffe05c76171e0) at vnode_if.h:830
#18 0xffffffff80616379 in vnlru_proc () at /usr/src/sys/kern/vfs_subr.c:734

(kgdb) frame 4
#4  0xffffffff81093035 in zfs_zget (zfsvfs=0xfffffe001de4c000, obj_num=81963, zpp=0xffffff8c60dc51b0) at /usr/src/sys/modules/zfs/../../cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c:1224
1224                                    tsleep(zp, 0, "zcollide", 1);
(kgdb) l
1219                                    sa_buf_rele(db, NULL);
1220                                    mutex_exit(&zp->z_lock);
1221                                    ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
1222                                    if (vp != NULL)
1223                                            VN_RELE(vp);
1224                                    tsleep(zp, 0, "zcollide", 1);
1225                                    goto again;
1226                            }
1227                            *zpp = zp;
1228                            err = 0;
(kgdb)





Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?76CBA055-021F-458D-8978-E9A973D9B783>