Skip site navigation (1)Skip section navigation (2)
Date:      Thu, 18 Mar 2021 15:36:39 GMT
From:      Mateusz Guzik <mjg@FreeBSD.org>
To:        src-committers@FreeBSD.org, dev-commits-src-all@FreeBSD.org, dev-commits-src-branches@FreeBSD.org
Subject:   git: faa41af1fed3 - releng/13.0 - vfs: fix vnlru marker handling for filtered/unfiltered cases
Message-ID:  <202103181536.12IFadcY007002@gitrepo.freebsd.org>

next in thread | raw e-mail | index | archive | help
The branch releng/13.0 has been updated by mjg:

URL: https://cgit.FreeBSD.org/src/commit/?id=faa41af1fed350327cc542cb240ca2c6e1e8ba0c

commit faa41af1fed350327cc542cb240ca2c6e1e8ba0c
Author:     Mateusz Guzik <mjg@FreeBSD.org>
AuthorDate: 2021-03-17 21:33:47 +0000
Commit:     Mateusz Guzik <mjg@FreeBSD.org>
CommitDate: 2021-03-18 15:36:22 +0000

    vfs: fix vnlru marker handling for filtered/unfiltered cases
    
    The global list has a marker with an invariant that free vnodes are
    placed somewhere past that. A caller which performs filtering (like ZFS)
    can move said marker all the way to the end, across free vnodes which
    don't match. Then a caller which does not perform filtering will fail to
    find them. This makes vn_alloc_hard sleep for 1 second instead of
    reclaiming, resulting in significant stalls.
    
    Fix the problem by requiring an explicit marker by callers which do
    filtering.
    
    As a temporary measure extend vnlru_free to restart if it fails to
    reclaim anything.
    
    Big thanks go to the reporter for testing several iterations of the
    patch.
    
    Reported by:    Yamagi <lists yamagi.org>
    Tested by:      Yamagi <lists yamagi.org>
    Reviewed by:    kib
    Differential Revision:  https://reviews.freebsd.org/D29324
    Approved by:    re (gjb)
    
    (cherry picked from commit e9272225e6bed840b00eef1c817b188c172338ee)
---
 sys/contrib/openzfs/module/os/freebsd/zfs/arc_os.c | 14 ++++-
 sys/kern/vfs_subr.c                                | 72 +++++++++++++++++++---
 sys/sys/vnode.h                                    |  3 +
 3 files changed, 80 insertions(+), 9 deletions(-)

diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/arc_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/arc_os.c
index 4fc7468bfa47..0d5cffbe8d1e 100644
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/arc_os.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/arc_os.c
@@ -51,6 +51,9 @@
 #include <sys/vm.h>
 #include <sys/vmmeter.h>
 
+static struct sx arc_vnlru_lock;
+static struct vnode *arc_vnlru_marker;
+
 extern struct vfsops zfs_vfsops;
 
 uint_t zfs_arc_free_target = 0;
@@ -157,7 +160,9 @@ arc_prune_task(void *arg)
 
 	arc_reduce_target_size(ptob(nr_scan));
 	free(arg, M_TEMP);
-	vnlru_free(nr_scan, &zfs_vfsops);
+	sx_xlock(&arc_vnlru_lock);
+	vnlru_free_vfsops(nr_scan, &zfs_vfsops, arc_vnlru_marker);
+	sx_xunlock(&arc_vnlru_lock);
 }
 
 /*
@@ -234,7 +239,8 @@ arc_lowmem_init(void)
 {
 	arc_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem, arc_lowmem, NULL,
 	    EVENTHANDLER_PRI_FIRST);
-
+	arc_vnlru_marker = vnlru_alloc_marker();
+	sx_init(&arc_vnlru_lock, "arc vnlru lock");
 }
 
 void
@@ -242,6 +248,10 @@ arc_lowmem_fini(void)
 {
 	if (arc_event_lowmem != NULL)
 		EVENTHANDLER_DEREGISTER(vm_lowmem, arc_event_lowmem);
+	if (arc_vnlru_marker != NULL) {
+		vnlru_free_marker(arc_vnlru_marker);
+		sx_destroy(&arc_vnlru_lock);
+	}
 }
 
 void
diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c
index 04cd0e0175f9..74470805411c 100644
--- a/sys/kern/vfs_subr.c
+++ b/sys/kern/vfs_subr.c
@@ -1216,9 +1216,9 @@ SYSCTL_INT(_debug, OID_AUTO, max_vnlru_free, CTLFLAG_RW, &max_vnlru_free,
  * Attempt to reduce the free list by the requested amount.
  */
 static int
-vnlru_free_locked(int count, struct vfsops *mnt_op)
+vnlru_free_impl(int count, struct vfsops *mnt_op, struct vnode *mvp)
 {
-	struct vnode *vp, *mvp;
+	struct vnode *vp;
 	struct mount *mp;
 	int ocount;
 
@@ -1226,7 +1226,6 @@ vnlru_free_locked(int count, struct vfsops *mnt_op)
 	if (count > max_vnlru_free)
 		count = max_vnlru_free;
 	ocount = count;
-	mvp = vnode_list_free_marker;
 	vp = mvp;
 	for (;;) {
 		if (count == 0) {
@@ -1268,13 +1267,72 @@ vnlru_free_locked(int count, struct vfsops *mnt_op)
 	return (ocount - count);
 }
 
+static int
+vnlru_free_locked(int count)
+{
+
+	mtx_assert(&vnode_list_mtx, MA_OWNED);
+	return (vnlru_free_impl(count, NULL, vnode_list_free_marker));
+}
+
+void
+vnlru_free_vfsops(int count, struct vfsops *mnt_op, struct vnode *mvp)
+{
+
+	MPASS(mnt_op != NULL);
+	MPASS(mvp != NULL);
+	VNPASS(mvp->v_type == VMARKER, mvp);
+	mtx_lock(&vnode_list_mtx);
+	vnlru_free_impl(count, mnt_op, mvp);
+	mtx_unlock(&vnode_list_mtx);
+}
+
+/*
+ * Temporary binary compat, don't use. Call vnlru_free_vfsops instead.
+ */
 void
 vnlru_free(int count, struct vfsops *mnt_op)
 {
+	struct vnode *mvp;
+
+	if (count == 0)
+		return;
+	mtx_lock(&vnode_list_mtx);
+	mvp = vnode_list_free_marker;
+	if (vnlru_free_impl(count, mnt_op, mvp) == 0) {
+		/*
+		 * It is possible the marker was moved over eligible vnodes by
+		 * callers which filtered by different ops. If so, start from
+		 * scratch.
+		 */
+		if (vnlru_read_freevnodes() > 0) {
+			TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist);
+			TAILQ_INSERT_HEAD(&vnode_list, mvp, v_vnodelist);
+		}
+		vnlru_free_impl(count, mnt_op, mvp);
+	}
+	mtx_unlock(&vnode_list_mtx);
+}
+
+struct vnode *
+vnlru_alloc_marker(void)
+{
+	struct vnode *mvp;
 
+	mvp = vn_alloc_marker(NULL);
+	mtx_lock(&vnode_list_mtx);
+	TAILQ_INSERT_BEFORE(vnode_list_free_marker, mvp, v_vnodelist);
+	mtx_unlock(&vnode_list_mtx);
+	return (mvp);
+}
+
+void
+vnlru_free_marker(struct vnode *mvp)
+{
 	mtx_lock(&vnode_list_mtx);
-	vnlru_free_locked(count, mnt_op);
+	TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist);
 	mtx_unlock(&vnode_list_mtx);
+	vn_free_marker(mvp);
 }
 
 static void
@@ -1423,7 +1481,7 @@ vnlru_proc(void)
 		 * try to reduce it by discarding from the free list.
 		 */
 		if (rnumvnodes > desiredvnodes) {
-			vnlru_free_locked(rnumvnodes - desiredvnodes, NULL);
+			vnlru_free_locked(rnumvnodes - desiredvnodes);
 			rnumvnodes = atomic_load_long(&numvnodes);
 		}
 		/*
@@ -1615,7 +1673,7 @@ vn_alloc_hard(struct mount *mp)
 	 * should be chosen so that we never wait or even reclaim from
 	 * the free list to below its target minimum.
 	 */
-	if (vnlru_free_locked(1, NULL) > 0)
+	if (vnlru_free_locked(1) > 0)
 		goto alloc;
 	if (mp == NULL || (mp->mnt_kern_flag & MNTK_SUSPEND) == 0) {
 		/*
@@ -1625,7 +1683,7 @@ vn_alloc_hard(struct mount *mp)
 		msleep(&vnlruproc_sig, &vnode_list_mtx, PVFS, "vlruwk", hz);
 		if (atomic_load_long(&numvnodes) + 1 > desiredvnodes &&
 		    vnlru_read_freevnodes() > 1)
-			vnlru_free_locked(1, NULL);
+			vnlru_free_locked(1);
 	}
 alloc:
 	rnumvnodes = atomic_fetchadd_long(&numvnodes, 1) + 1;
diff --git a/sys/sys/vnode.h b/sys/sys/vnode.h
index 639a16881e09..450e2747b773 100644
--- a/sys/sys/vnode.h
+++ b/sys/sys/vnode.h
@@ -821,7 +821,10 @@ void	vfs_timestamp(struct timespec *);
 void	vfs_write_resume(struct mount *mp, int flags);
 int	vfs_write_suspend(struct mount *mp, int flags);
 int	vfs_write_suspend_umnt(struct mount *mp);
+struct vnode *vnlru_alloc_marker(void);
+void	vnlru_free_marker(struct vnode *);
 void	vnlru_free(int, struct vfsops *);
+void	vnlru_free_vfsops(int, struct vfsops *, struct vnode *);
 int	vop_stdbmap(struct vop_bmap_args *);
 int	vop_stdfdatasync_buf(struct vop_fdatasync_args *);
 int	vop_stdfsync(struct vop_fsync_args *);



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?202103181536.12IFadcY007002>