Skip site navigation (1)Skip section navigation (2)
Date:      Mon, 16 Dec 2013 11:08:11 +0100 (CET)
From:      krichy@tvnetwork.hu
To:        freebsd-fs@freebsd.org
Subject:   kern/184677
Message-ID:  <alpine.DEB.2.10.1312161059150.7004@krichy.tvnetwork.hu>

next in thread | raw e-mail | index | archive | help

[-- Attachment #1 --]
Dear devs,

I've attached a patch, which makes the recursive lockmgr disappear, and 
makes the reported bug to disappear. I dont know if I followed any 
guidelines well, or not, but at least it works for me. Please some 
ZFS/FreeBSD fs expert review it, and fix it where it needed.

But unfortunately, my original problem is still not solved, maybe the same 
as Ryan's: 
http://lists.freebsd.org/pipermail/freebsd-fs/2013-December/018707.html

Tracing the problem down is that zfsctl_snapdir_lookup() tries to acquire 
spa_namespace_lock while when finishing a zfs send -R does a 
zfsdev_close(), and that also holds the same mutex. And this causes a 
deadlock scenario. I looked at illumos's code, and for some reason they 
use another mutex on zfsdev_close(), which therefore may not deadlock with 
zfsctl_snapdir_lookup(). But I am still investigating the problem.

I would like to help making ZFS more stable on freebsd also with its whole 
functionality. I would be very thankful if some expert would give some 
advice, how to solve these bugs. PJD, Steven, Xin?

Thanks in advance,


Kojedzinszky Richard
Euronet Magyarorszag Informatikai Zrt.
[-- Attachment #2 --]
commit 7965e01fca79187d815b8a86570f50547d00e531
Author: Richard Kojedzinszky <krichy@cflinux.hu>
Date:   Mon Dec 16 09:59:57 2013 +0100

    ZFS lock ordering fix

diff --git a/sys/cddl/compat/opensolaris/kern/opensolaris_lookup.c b/sys/cddl/compat/opensolaris/kern/opensolaris_lookup.c
index 94383d6..225521a 100644
--- a/sys/cddl/compat/opensolaris/kern/opensolaris_lookup.c
+++ b/sys/cddl/compat/opensolaris/kern/opensolaris_lookup.c
@@ -81,6 +81,8 @@ traverse(vnode_t **cvpp, int lktype)
 	 * progress on this vnode.
 	 */
 
+	vn_lock(cvp, lktype);
+
 	for (;;) {
 		/*
 		 * Reached the end of the mount chain?
@@ -89,13 +91,7 @@ traverse(vnode_t **cvpp, int lktype)
 		if (vfsp == NULL)
 			break;
 		error = vfs_busy(vfsp, 0);
-		/*
-		 * tvp is NULL for *cvpp vnode, which we can't unlock.
-		 */
-		if (tvp != NULL)
-			vput(cvp);
-		else
-			vrele(cvp);
+		vput(cvp);
 		if (error)
 			return (error);
 
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/gfs.c b/sys/cddl/contrib/opensolaris/uts/common/fs/gfs.c
index 59944a1..ce43fff 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/gfs.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/gfs.c
@@ -448,7 +448,7 @@ gfs_lookup_dot(vnode_t **vpp, vnode_t *dvp, vnode_t *pvp, const char *nm)
 			VN_HOLD(pvp);
 			*vpp = pvp;
 		}
-		vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY);
+		vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY | LK_CANRECURSE);
 		return (0);
 	}
 
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c
index 28ab1fa..9a05976 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c
@@ -112,6 +112,30 @@ snapentry_compare(const void *a, const void *b)
 		return (0);
 }
 
+static void
+snapdir_entry_remove_free(zfsctl_snapdir_t *sdp, zfs_snapentry_t *sep)
+{
+	avl_remove(&sdp->sd_snaps, sep);
+	kmem_free(sep->se_name, strlen(sep->se_name) + 1);
+	kmem_free(sep, sizeof (zfs_snapentry_t));
+}
+
+static zfsctl_snapdir_t*
+snapshot_get_snapdir(vnode_t *vp, vnode_t **dvpp)
+{
+	gfs_dir_t *dp = vp->v_data;
+	*dvpp = dp->gfsd_file.gfs_parent;
+	zfsctl_snapdir_t *sdp;
+
+	VN_HOLD(*dvpp);
+	VOP_UNLOCK(vp, 0);
+	vn_lock(*dvpp, LK_SHARED | LK_RETRY | LK_CANRECURSE);
+	sdp = (*dvpp)->v_data;
+	VOP_UNLOCK(*dvpp, 0);
+
+	return (sdp);
+}
+
 #ifdef sun
 vnodeops_t *zfsctl_ops_root;
 vnodeops_t *zfsctl_ops_snapdir;
@@ -1012,7 +1036,13 @@ zfsctl_snapdir_lookup(ap)
 			/*
 			 * The snapshot was unmounted behind our backs,
 			 * try to remount it.
+			 * Concurrent zfsctl_snapshot_inactive() would remove our entry
+			 * so do this ourselves, and make a fresh new mount.
 			 */
+			snapdir_entry_remove_free(sdp, sep);
+			vput(*vpp);
+			/* find new place for sep entry */
+			avl_find(&sdp->sd_snaps, &search, &where);
 			VERIFY(zfsctl_snapshot_zname(dvp, nm, MAXNAMELEN, snapname) == 0);
 			goto domount;
 		} else {
@@ -1028,6 +1058,7 @@ zfsctl_snapdir_lookup(ap)
 		return (err);
 	}
 
+domount:
 	/*
 	 * The requested snapshot is not currently mounted, look it up.
 	 */
@@ -1068,7 +1099,6 @@ zfsctl_snapdir_lookup(ap)
 	avl_insert(&sdp->sd_snaps, sep, where);
 
 	dmu_objset_rele(snap, FTAG);
-domount:
 	mountpoint_len = strlen(dvp->v_vfsp->mnt_stat.f_mntonname) +
 	    strlen("/" ZFS_CTLDIR_NAME "/snapshot/") + strlen(nm) + 1;
 	mountpoint = kmem_alloc(mountpoint_len, KM_SLEEP);
@@ -1350,9 +1380,7 @@ zfsctl_snapdir_inactive(ap)
 	 */
 	mutex_enter(&sdp->sd_lock);
 	while ((sep = avl_first(&sdp->sd_snaps)) != NULL) {
-		avl_remove(&sdp->sd_snaps, sep);
-		kmem_free(sep->se_name, strlen(sep->se_name) + 1);
-		kmem_free(sep, sizeof (zfs_snapentry_t));
+		snapdir_entry_remove_free(sdp, sep);
 	}
 	mutex_exit(&sdp->sd_lock);
 	gfs_dir_inactive(vp);
@@ -1463,17 +1491,19 @@ zfsctl_snapshot_inactive(ap)
 	zfs_snapentry_t *sep, *next;
 	int locked;
 	vnode_t *dvp;
+	gfs_dir_t *dp;
 
-	if (vp->v_count > 0)
-		goto end;
-
-	VERIFY(gfs_dir_lookup(vp, "..", &dvp, cr, 0, NULL, NULL) == 0);
-	sdp = dvp->v_data;
-	VOP_UNLOCK(dvp, 0);
+	/* This is for accessing the real parent directly, without a possible deadlock
+	 * with zfsctl_snapdir_lookup(). The release of lock on vp and lock on dvp provides
+	 * the same lock order as in zfsctl_snapshot_lookup().
+	 */
+	sdp = snapshot_get_snapdir(vp, &dvp);
 
 	if (!(locked = MUTEX_HELD(&sdp->sd_lock)))
 		mutex_enter(&sdp->sd_lock);
 
+	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+
 	ASSERT(!vn_ismntpt(vp));
 
 	sep = avl_first(&sdp->sd_snaps);
@@ -1481,9 +1511,7 @@ zfsctl_snapshot_inactive(ap)
 		next = AVL_NEXT(&sdp->sd_snaps, sep);
 
 		if (sep->se_root == vp) {
-			avl_remove(&sdp->sd_snaps, sep);
-			kmem_free(sep->se_name, strlen(sep->se_name) + 1);
-			kmem_free(sep, sizeof (zfs_snapentry_t));
+			snapdir_entry_remove_free(sdp, sep);
 			break;
 		}
 		sep = next;
@@ -1494,7 +1522,6 @@ zfsctl_snapshot_inactive(ap)
 		mutex_exit(&sdp->sd_lock);
 	VN_RELE(dvp);
 
-end:
 	/*
 	 * Dispose of the vnode for the snapshot mount point.
 	 * This is safe to do because once this entry has been removed
@@ -1595,20 +1622,18 @@ zfsctl_snapshot_lookup(ap)
 static int
 zfsctl_snapshot_vptocnp(struct vop_vptocnp_args *ap)
 {
-	zfsvfs_t *zfsvfs = ap->a_vp->v_vfsp->vfs_data;
-	vnode_t *dvp, *vp;
+	vnode_t *vp = ap->a_vp;
+	vnode_t *dvp;
 	zfsctl_snapdir_t *sdp;
 	zfs_snapentry_t *sep;
 	int error;
 
-	ASSERT(zfsvfs->z_ctldir != NULL);
-	error = zfsctl_root_lookup(zfsvfs->z_ctldir, "snapshot", &dvp,
-	    NULL, 0, NULL, kcred, NULL, NULL, NULL);
-	if (error != 0)
-		return (error);
-	sdp = dvp->v_data;
+	sdp = snapshot_get_snapdir(vp, &dvp);
 
 	mutex_enter(&sdp->sd_lock);
+
+	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+
 	sep = avl_first(&sdp->sd_snaps);
 	while (sep != NULL) {
 		vp = sep->se_root;

Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?alpine.DEB.2.10.1312161059150.7004>