Skip site navigation (1)Skip section navigation (2)
Date:      Sun, 30 Sep 2012 15:24:03 +0300
From:      Konstantin Belousov <kostikbel@gmail.com>
To:        Pawel Jakub Dawidek <pjd@freebsd.org>
Cc:        freebsd-fs@freebsd.org, avg@freebsd.org
Subject:   Re: panic: _sx_xlock_hard: recursed on non-recursive sx zfsvfs->z_hold_mtx[i] @ ...cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c:1407
Message-ID:  <20120930122403.GB35915@deviant.kiev.zoral.com.ua>
In-Reply-To: <20120929154101.GK1402@garage.freebsd.pl>
References:  <505DB4E6.8030407@smeets.im> <20120924224606.GE79077@ithaqua.etoilebsd.net> <20120925090840.GD35915@deviant.kiev.zoral.com.ua> <20120929154101.GK1402@garage.freebsd.pl>

next in thread | previous in thread | raw e-mail | index | archive | help

--N+dhEFW7Y2Uiel/w
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline
Content-Transfer-Encoding: quoted-printable

On Sat, Sep 29, 2012 at 05:41:02PM +0200, Pawel Jakub Dawidek wrote:
> On Tue, Sep 25, 2012 at 12:08:40PM +0300, Konstantin Belousov wrote:
> > On Tue, Sep 25, 2012 at 12:46:07AM +0200, Baptiste Daroussin wrote:
> > > Hi,
> > >=20
> > > I have the exact same problem making: tinderbox and poudriere highly
> > > unusable.
> > >
> > > This is really problematic because pointyhat also rely on nullfs and
> > > zfs, which means we can't upgrade the building nodes if we need to for
> > > example.
> > >
> > > regards, Bapt
> >=20
> > This is zfs bug. Filesystems shall not call getnewvnode() while holding
> > internal locks. At least not the locks which are needed during reclaim.
> > Nullfs changes amplified the probability of the problematic situation,
> > since now nullfs vnodes are indeed cached instead of being recreated
> > on each access, so the overall count of used vnodes could be twice as
> > high.
> >=20
> > You might try to increase the kern.maxvnodes to reduce the probability =
of
> > the recursive calls into vnlnru() from getnewvnode(). But for real, bug
> > needs to be fixed in zfs.
>=20
> With all FreeBSD's VFS constraints, it is really hard to breath,
> especially within file system that was not designed with our VFS
> complexity in mind.
>=20
> For example it would be nice of VFS to not reclaim vnodes from
> getnewvnode() and leave this task entirely to the vnlru process.
> It is pretty obvious layering violation to me - file system code needs
> new vnode, it calls VFS routine to allocate one, which then calls file
> system again to reclaim one of its vnodes.
The postponing of the reclaim when vnode reserve goes low to the vnlru
process does not solve anything, since you only change the recursion into
the deadlock.

I discussed an approach for this issue with avg. Basic idea is presented
in the untested patch below. You can specify that some count of
the free vnodes must be present for some dynamic scope, started by
getnewvnode_reserve() function. While staying inside the reserved
pool, getnewvnode() calls would not recurse into vnlru(). The scope is
finished with getnewvnode_drop_reserve().

The getnewvnode_reserve() shall be called while no locks are held.

What do you thing ?
>=20
> It would also be nice to handle EAGAIN from VOP_RECLAIM(). Currently we
> panic on error. This would be useful to return if some of the locks
> cannot be acquired immediately. ZFS reclaim already discovers potential
> deadlocks and defer some reclamation portion to separate thread.

diff --git a/sys/kern/subr_trap.c b/sys/kern/subr_trap.c
index 24960fd..66e3201 100644
--- a/sys/kern/subr_trap.c
+++ b/sys/kern/subr_trap.c
@@ -154,6 +154,8 @@ userret(struct thread *td, struct trapframe *frame)
 	    ("userret: Returning with sleep disabled"));
 	KASSERT(td->td_pinned =3D=3D 0,
 	    ("userret: Returning with with pinned thread"));
+	KASSERT(td->td_vp_reserv =3D=3D 0,
+	    ("userret: Returning while holding vnode reservation"));
 #ifdef VIMAGE
 	/* Unfortunately td_vnet_lpush needs VNET_DEBUG. */
 	VNET_ASSERT(curvnet =3D=3D NULL,
diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c
index 5c781c2..0de53f0 100644
--- a/sys/kern/vfs_subr.c
+++ b/sys/kern/vfs_subr.c
@@ -935,34 +935,22 @@ vtryrecycle(struct vnode *vp)
 }
=20
 /*
- * Return the next vnode from the free list.
+ * Wait for available vnodes.
  */
-int
-getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops,
-    struct vnode **vpp)
+static int
+getnewvnode_wait(int suspended)
 {
-	struct vnode *vp =3D NULL;
-	struct bufobj *bo;
=20
-	CTR3(KTR_VFS, "%s: mp %p with tag %s", __func__, mp, tag);
-	mtx_lock(&vnode_free_list_mtx);
-	/*
-	 * Lend our context to reclaim vnodes if they've exceeded the max.
-	 */
-	if (freevnodes > wantfreevnodes)
-		vnlru_free(1);
-	/*
-	 * Wait for available vnodes.
-	 */
+	mtx_assert(&vnode_free_list_mtx, MA_OWNED);
 	if (numvnodes > desiredvnodes) {
-		if (mp !=3D NULL && (mp->mnt_kern_flag & MNTK_SUSPEND)) {
+		if (suspended) {
 			/*
 			 * File system is beeing suspended, we cannot risk a
 			 * deadlock here, so allocate new vnode anyway.
 			 */
 			if (freevnodes > wantfreevnodes)
 				vnlru_free(freevnodes - wantfreevnodes);
-			goto alloc;
+			return (0);
 		}
 		if (vnlruproc_sig =3D=3D 0) {
 			vnlruproc_sig =3D 1;	/* avoid unnecessary wakeups */
@@ -970,16 +958,75 @@ getnewvnode(const char *tag, struct mount *mp, struct=
 vop_vector *vops,
 		}
 		msleep(&vnlruproc_sig, &vnode_free_list_mtx, PVFS,
 		    "vlruwk", hz);
-#if 0	/* XXX Not all VFS_VGET/ffs_vget callers check returns. */
-		if (numvnodes > desiredvnodes) {
-			mtx_unlock(&vnode_free_list_mtx);
-			return (ENFILE);
+	}
+	return (numvnodes > desiredvnodes ? ENFILE : 0);
+}
+
+void
+getnewvnode_reserve(u_int count)
+{
+	struct thread *td;
+
+	td =3D curthread;
+	mtx_lock(&vnode_free_list_mtx);
+	while (count > 0) {
+		if (getnewvnode_wait(0) =3D=3D 0) {
+			count--;
+			td->td_vp_reserv++;
+			numvnodes++;
 		}
-#endif
 	}
-alloc:
+	mtx_unlock(&vnode_free_list_mtx);
+}
+
+void
+getnewvnode_drop_reserve(void)
+{
+	struct thread *td;
+
+	td =3D curthread;
+	mtx_lock(&vnode_free_list_mtx);
+	KASSERT(numvnodes >=3D td->td_vp_reserv, ("reserve too large"));
+	numvnodes -=3D td->td_vp_reserv;
+	mtx_unlock(&vnode_free_list_mtx);
+	td->td_vp_reserv =3D 0;
+}
+
+/*
+ * Return the next vnode from the free list.
+ */
+int
+getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops,
+    struct vnode **vpp)
+{
+	struct vnode *vp;
+	struct bufobj *bo;
+	struct thread *td;
+	int error;
+
+	CTR3(KTR_VFS, "%s: mp %p with tag %s", __func__, mp, tag);
+	vp =3D NULL;
+	td =3D curthread;
+	if (td->td_vp_reserv > 0) {
+		td->td_vp_reserv -=3D 1;
+		goto alloc;
+	}
+	mtx_lock(&vnode_free_list_mtx);
+	/*
+	 * Lend our context to reclaim vnodes if they've exceeded the max.
+	 */
+	if (freevnodes > wantfreevnodes)
+		vnlru_free(1);
+	error =3D getnewvnode_wait(mp !=3D NULL && (mp->mnt_kern_flag &
+	    MNTK_SUSPEND));
+#if 0	/* XXX Not all VFS_VGET/ffs_vget callers check returns. */
+	if (error !=3D 0) {
+		mtx_unlock(&vnode_free_list_mtx);
+		return (error);
+#endif
 	numvnodes++;
 	mtx_unlock(&vnode_free_list_mtx);
+alloc:
 	vp =3D (struct vnode *) uma_zalloc(vnode_zone, M_WAITOK|M_ZERO);
 	/*
 	 * Setup locks.
diff --git a/sys/sys/proc.h b/sys/sys/proc.h
index 4c4aa2f..0aae0ce 100644
--- a/sys/sys/proc.h
+++ b/sys/sys/proc.h
@@ -272,6 +272,7 @@ struct thread {
 	struct osd	td_osd;		/* (k) Object specific data. */
 	struct vm_map_entry *td_map_def_user; /* (k) Deferred entries. */
 	pid_t		td_dbg_forked;	/* (c) Child pid for debugger. */
+	u_int		td_vp_reserv;	/* (k) Count of reserved vnodes. */
 #define	td_endzero td_sigmask
=20
 /* Copied during fork1() or create_thread(). */
diff --git a/sys/sys/vnode.h b/sys/sys/vnode.h
index 1bde8b9..029458f 100644
--- a/sys/sys/vnode.h
+++ b/sys/sys/vnode.h
@@ -600,6 +600,8 @@ void	cvtstat(struct stat *st, struct ostat *ost);
 void	cvtnstat(struct stat *sb, struct nstat *nsb);
 int	getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops,
 	    struct vnode **vpp);
+void	getnewvnode_reserve(u_int count);
+void	getnewvnode_drop_reserve(void);
 int	insmntque1(struct vnode *vp, struct mount *mp,
 	    void (*dtr)(struct vnode *, void *), void *dtr_arg);
 int	insmntque(struct vnode *vp, struct mount *mp);

--N+dhEFW7Y2Uiel/w
Content-Type: application/pgp-signature

-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.4.12 (FreeBSD)

iEYEARECAAYFAlBoOeMACgkQC3+MBN1Mb4hk4QCglh9bUwmlet/PEwgkmQyuGjbN
9PoAn2VmBWDLTwMwOBYYwdo+IYDI+tDp
=QYIV
-----END PGP SIGNATURE-----

--N+dhEFW7Y2Uiel/w--



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?20120930122403.GB35915>