Date: Mon, 13 Jan 2020 02:34:02 +0000 (UTC)
From: Mateusz Guzik <mjg@FreeBSD.org>
To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject: svn commit: r356670 - in head/sys: fs/nfsserver kern sys
Message-ID: <202001130234.00D2Y2GP027222@repo.freebsd.org>
Author: mjg
Date: Mon Jan 13 02:34:02 2020
New Revision: 356670
URL: https://svnweb.freebsd.org/changeset/base/356670

Log:
  vfs: add per-mount vnode lazy list and use it for deferred inactive + msync

  This obviates the need to scan the entire active list looking for vnodes
  of interest.

  msync is handled by adding all vnodes with write count to the lazy list.

  deferred inactive directly adds vnodes as it sets the VI_DEFINACT flag.

  Vnodes get dequeued from the list when their hold count reaches 0.

  Newly added MNT_VNODE_FOREACH_LAZY* macros support filtering so that
  spurious locking is avoided in the common case.

  Reviewed by:	jeff
  Tested by:	pho (in a larger patch, previous version)
  Differential Revision:	https://reviews.freebsd.org/D22995

Modified:
  head/sys/fs/nfsserver/nfs_nfsdport.c
  head/sys/kern/vfs_default.c
  head/sys/kern/vfs_mount.c
  head/sys/kern/vfs_subr.c
  head/sys/sys/mount.h
  head/sys/sys/vnode.h

Modified: head/sys/fs/nfsserver/nfs_nfsdport.c
==============================================================================
--- head/sys/fs/nfsserver/nfs_nfsdport.c	Mon Jan 13 02:31:51 2020	(r356669)
+++ head/sys/fs/nfsserver/nfs_nfsdport.c	Mon Jan 13 02:34:02 2020	(r356670)
@@ -3318,6 +3318,7 @@ nfsd_mntinit(void)
 	nfsv4root_mnt.mnt_flag = (MNT_RDONLY | MNT_EXPORTED);
 	TAILQ_INIT(&nfsv4root_mnt.mnt_nvnodelist);
 	TAILQ_INIT(&nfsv4root_mnt.mnt_activevnodelist);
+	TAILQ_INIT(&nfsv4root_mnt.mnt_lazyvnodelist);
 	nfsv4root_mnt.mnt_export = NULL;
 	TAILQ_INIT(&nfsv4root_opt);
 	TAILQ_INIT(&nfsv4root_newopt);
@@ -3325,6 +3326,7 @@ nfsd_mntinit(void)
 	nfsv4root_mnt.mnt_optnew = &nfsv4root_newopt;
 	nfsv4root_mnt.mnt_nvnodelistsize = 0;
 	nfsv4root_mnt.mnt_activevnodelistsize = 0;
+	nfsv4root_mnt.mnt_lazyvnodelistsize = 0;
 }

 /*

Modified: head/sys/kern/vfs_default.c
==============================================================================
--- head/sys/kern/vfs_default.c	Mon Jan 13 02:31:51 2020	(r356669)
+++ head/sys/kern/vfs_default.c	Mon Jan 13 02:34:02 2020	(r356670)
@@ -1230,6 +1230,8 @@ vop_stdadd_writecount(struct vop_add_writecount_args *
 		VNASSERT(vp->v_writecount + ap->a_inc >= 0, vp,
 		    ("neg writecount increment %d", ap->a_inc));
 		vp->v_writecount += ap->a_inc;
+		if (vp->v_writecount > 0 && vp->v_mount != NULL)
+			vlazy(vp);
 		error = 0;
 	}
 	VI_UNLOCK(vp);

Modified: head/sys/kern/vfs_mount.c
==============================================================================
--- head/sys/kern/vfs_mount.c	Mon Jan 13 02:31:51 2020	(r356669)
+++ head/sys/kern/vfs_mount.c	Mon Jan 13 02:34:02 2020	(r356670)
@@ -506,6 +506,8 @@ vfs_mount_alloc(struct vnode *vp, struct vfsconf *vfsp
 	mp->mnt_activevnodelistsize = 0;
 	TAILQ_INIT(&mp->mnt_tmpfreevnodelist);
 	mp->mnt_tmpfreevnodelistsize = 0;
+	TAILQ_INIT(&mp->mnt_lazyvnodelist);
+	mp->mnt_lazyvnodelistsize = 0;
 	if (mp->mnt_ref != 0 || mp->mnt_lockref != 0 ||
 	    mp->mnt_writeopcount != 0)
 		panic("%s: non-zero counters on new mp %p\n", __func__, mp);
@@ -575,6 +577,8 @@ vfs_mount_destroy(struct mount *mp)
 		panic("vfs_mount_destroy: nonzero activevnodelistsize");
 	if (mp->mnt_tmpfreevnodelistsize != 0)
 		panic("vfs_mount_destroy: nonzero tmpfreevnodelistsize");
+	if (mp->mnt_lazyvnodelistsize != 0)
+		panic("vfs_mount_destroy: nonzero lazyvnodelistsize");
 	if (mp->mnt_lockref != 0)
 		panic("vfs_mount_destroy: nonzero lock refcount");
 	MNT_IUNLOCK(mp);

Modified: head/sys/kern/vfs_subr.c
==============================================================================
--- head/sys/kern/vfs_subr.c	Mon Jan 13 02:31:51 2020	(r356669)
+++ head/sys/kern/vfs_subr.c	Mon Jan 13 02:34:02 2020	(r356670)
@@ -1810,6 +1810,15 @@ delmntque(struct vnode *vp)
 		mp->mnt_activevnodelistsize--;
 		mtx_unlock(&mp->mnt_listmtx);
 	}
+	if (vp->v_mflag & VMP_LAZYLIST) {
+		mtx_lock(&mp->mnt_listmtx);
+		if (vp->v_mflag & VMP_LAZYLIST) {
+			vp->v_mflag &= ~VMP_LAZYLIST;
+			TAILQ_REMOVE(&mp->mnt_lazyvnodelist, vp, v_lazylist);
+			mp->mnt_lazyvnodelistsize--;
+		}
+		mtx_unlock(&mp->mnt_listmtx);
+	}
 	vp->v_mount = NULL;
 	VI_UNLOCK(vp);
 	VNASSERT(mp->mnt_nvnodelistsize > 0, vp,
@@ -3038,6 +3047,25 @@ vrefcnt(struct vnode *vp)
 	return (vp->v_usecount);
 }

+void
+vlazy(struct vnode *vp)
+{
+	struct mount *mp;
+
+	VNASSERT(vp->v_holdcnt > 0, vp, ("%s: vnode not held", __func__));
+
+	if ((vp->v_mflag & VMP_LAZYLIST) != 0)
+		return;
+	mp = vp->v_mount;
+	mtx_lock(&mp->mnt_listmtx);
+	if ((vp->v_mflag & VMP_LAZYLIST) == 0) {
+		vp->v_mflag |= VMP_LAZYLIST;
+		TAILQ_INSERT_TAIL(&mp->mnt_lazyvnodelist, vp, v_lazylist);
+		mp->mnt_lazyvnodelistsize++;
+	}
+	mtx_unlock(&mp->mnt_listmtx);
+}
+
 static void
 vdefer_inactive(struct vnode *vp)
 {
@@ -3054,6 +3082,7 @@ vdefer_inactive(struct vnode *vp)
 		vdropl(vp);
 		return;
 	}
+	vlazy(vp);
 	vp->v_iflag |= VI_DEFINACT;
 	VI_UNLOCK(vp);
 	counter_u64_add(deferred_inact, 1);
@@ -3329,6 +3358,11 @@ vdrop_deactivate(struct vnode *vp)
 	    ("vdrop: freeing when we shouldn't"));
 	mp = vp->v_mount;
 	mtx_lock(&mp->mnt_listmtx);
+	if (vp->v_mflag & VMP_LAZYLIST) {
+		vp->v_mflag &= ~VMP_LAZYLIST;
+		TAILQ_REMOVE(&mp->mnt_lazyvnodelist, vp, v_lazylist);
+		mp->mnt_lazyvnodelistsize--;
+	}
 	if (vp->v_iflag & VI_ACTIVE) {
 		vp->v_iflag &= ~VI_ACTIVE;
 		TAILQ_REMOVE(&mp->mnt_activevnodelist, vp, v_actfreelist);
@@ -3906,7 +3940,9 @@ vn_printf(struct vnode *vp, const char *fmt, ...)
 	}
 	if (vp->v_mflag & VMP_TMPMNTFREELIST)
 		strlcat(buf, "|VMP_TMPMNTFREELIST", sizeof(buf));
-	flags = vp->v_mflag & ~(VMP_TMPMNTFREELIST);
+	if (vp->v_mflag & VMP_LAZYLIST)
+		strlcat(buf, "|VMP_LAZYLIST", sizeof(buf));
+	flags = vp->v_mflag & ~(VMP_TMPMNTFREELIST | VMP_LAZYLIST);
 	if (flags != 0) {
 		snprintf(buf2, sizeof(buf2), "|VMP(0x%lx)", flags);
 		strlcat(buf, buf2, sizeof(buf));
@@ -4126,6 +4162,8 @@ DB_SHOW_COMMAND(mount, db_show_mount)
 	db_printf("    mnt_nvnodelistsize = %d\n", mp->mnt_nvnodelistsize);
 	db_printf("    mnt_activevnodelistsize = %d\n",
 	    mp->mnt_activevnodelistsize);
+	db_printf("    mnt_lazyvnodelistsize = %d\n",
+	    mp->mnt_lazyvnodelistsize);
 	db_printf("    mnt_writeopcount = %d (with %d in the struct)\n",
 	    vfs_mount_fetch_counter(mp, MNT_COUNT_WRITEOPCOUNT), mp->mnt_writeopcount);
 	db_printf("    mnt_maxsymlinklen = %d\n", mp->mnt_maxsymlinklen);
@@ -4477,6 +4515,13 @@ vfs_deferred_inactive(struct vnode *vp, int lkflags)
 	vdefer_inactive_cond(vp);
 }

+static int
+vfs_periodic_inactive_filter(struct vnode *vp, void *arg)
+{
+
+	return (vp->v_iflag & VI_DEFINACT);
+}
+
 static void __noinline
 vfs_periodic_inactive(struct mount *mp, int flags)
 {
@@ -4487,7 +4532,7 @@ vfs_periodic_inactive(struct mount *mp, int flags)
 	if (flags != MNT_WAIT)
 		lkflags |= LK_NOWAIT;
-	MNT_VNODE_FOREACH_ACTIVE(vp, mp, mvp) {
+	MNT_VNODE_FOREACH_LAZY(vp, mp, mvp, vfs_periodic_inactive_filter, NULL) {
 		if ((vp->v_iflag & VI_DEFINACT) == 0) {
 			VI_UNLOCK(vp);
 			continue;
 		}
@@ -4502,12 +4547,27 @@ vfs_want_msync(struct vnode *vp)
 {
 	struct vm_object *obj;

+	/*
+	 * This test may be performed without any locks held.
+	 * We rely on vm_object's type stability.
+	 */
 	if (vp->v_vflag & VV_NOSYNC)
 		return (false);
 	obj = vp->v_object;
 	return (obj != NULL && vm_object_mightbedirty(obj));
 }

+static int
+vfs_periodic_msync_inactive_filter(struct vnode *vp, void *arg __unused)
+{
+
+	if (vp->v_vflag & VV_NOSYNC)
+		return (false);
+	if (vp->v_iflag & VI_DEFINACT)
+		return (true);
+	return (vfs_want_msync(vp));
+}
+
 static void __noinline
 vfs_periodic_msync_inactive(struct mount *mp, int flags)
 {
@@ -4527,7 +4587,7 @@ vfs_periodic_msync_inactive(struct mount *mp, int flag
 		objflags = OBJPC_SYNC;
 	}

-	MNT_VNODE_FOREACH_ACTIVE(vp, mp, mvp) {
+	MNT_VNODE_FOREACH_LAZY(vp, mp, mvp, vfs_periodic_msync_inactive_filter, NULL) {
 		seen_defer = false;
 		if (vp->v_iflag & VI_DEFINACT) {
 			vp->v_iflag &= ~VI_DEFINACT;
@@ -6235,4 +6295,213 @@ __mnt_vnode_markerfree_active(struct vnode **mvp, stru
 	TAILQ_REMOVE(&mp->mnt_activevnodelist, *mvp, v_actfreelist);
 	mtx_unlock(&mp->mnt_listmtx);
 	mnt_vnode_markerfree_active(mvp, mp);
+}
+
+/*
+ * These are helper functions for filesystems to traverse their
+ * lazy vnodes.  See MNT_VNODE_FOREACH_LAZY() in sys/mount.h
+ */
+static void
+mnt_vnode_markerfree_lazy(struct vnode **mvp, struct mount *mp)
+{
+
+	KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
+
+	MNT_ILOCK(mp);
+	MNT_REL(mp);
+	MNT_IUNLOCK(mp);
+	free(*mvp, M_VNODE_MARKER);
+	*mvp = NULL;
+}
+
+/*
+ * Relock the mp mount vnode list lock with the vp vnode interlock in the
+ * conventional lock order during mnt_vnode_next_lazy iteration.
+ *
+ * On entry, the mount vnode list lock is held and the vnode interlock is not.
+ * The list lock is dropped and reacquired.  On success, both locks are held.
+ * On failure, the mount vnode list lock is held but the vnode interlock is
+ * not, and the procedure may have yielded.
+ */
+static bool
+mnt_vnode_next_lazy_relock(struct vnode *mvp, struct mount *mp,
+    struct vnode *vp)
+{
+	const struct vnode *tmp;
+	bool held, ret;
+
+	VNASSERT(mvp->v_mount == mp && mvp->v_type == VMARKER &&
+	    TAILQ_NEXT(mvp, v_lazylist) != NULL, mvp,
+	    ("%s: bad marker", __func__));
+	VNASSERT(vp->v_mount == mp && vp->v_type != VMARKER, vp,
+	    ("%s: inappropriate vnode", __func__));
+	ASSERT_VI_UNLOCKED(vp, __func__);
+	mtx_assert(&mp->mnt_listmtx, MA_OWNED);
+
+	ret = false;
+
+	TAILQ_REMOVE(&mp->mnt_lazyvnodelist, mvp, v_lazylist);
+	TAILQ_INSERT_BEFORE(vp, mvp, v_lazylist);
+
+	/*
+	 * Use a hold to prevent vp from disappearing while the mount vnode
+	 * list lock is dropped and reacquired.  Normally a hold would be
+	 * acquired with vhold(), but that might try to acquire the vnode
+	 * interlock, which would be a LOR with the mount vnode list lock.
+	 */
+	held = refcount_acquire_if_not_zero(&vp->v_holdcnt);
+	mtx_unlock(&mp->mnt_listmtx);
+	if (!held)
+		goto abort;
+	VI_LOCK(vp);
+	if (!refcount_release_if_not_last(&vp->v_holdcnt)) {
+		vdropl(vp);
+		goto abort;
+	}
+	mtx_lock(&mp->mnt_listmtx);
+
+	/*
+	 * Determine whether the vnode is still the next one after the marker,
+	 * excepting any other markers.  If the vnode has not been doomed by
+	 * vgone() then the hold should have ensured that it remained on the
+	 * lazy list.  If it has been doomed but is still on the lazy list,
+	 * don't abort, but rather skip over it (avoid spinning on doomed
+	 * vnodes).
+	 */
+	tmp = mvp;
+	do {
+		tmp = TAILQ_NEXT(tmp, v_lazylist);
+	} while (tmp != NULL && tmp->v_type == VMARKER);
+	if (tmp != vp) {
+		mtx_unlock(&mp->mnt_listmtx);
+		VI_UNLOCK(vp);
+		goto abort;
+	}
+
+	ret = true;
+	goto out;
+abort:
+	maybe_yield();
+	mtx_lock(&mp->mnt_listmtx);
+out:
+	if (ret)
+		ASSERT_VI_LOCKED(vp, __func__);
+	else
+		ASSERT_VI_UNLOCKED(vp, __func__);
+	mtx_assert(&mp->mnt_listmtx, MA_OWNED);
+	return (ret);
+}
+
+static struct vnode *
+mnt_vnode_next_lazy(struct vnode **mvp, struct mount *mp, mnt_lazy_cb_t *cb,
+    void *cbarg)
+{
+	struct vnode *vp, *nvp;
+
+	mtx_assert(&mp->mnt_listmtx, MA_OWNED);
+	KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
+restart:
+	vp = TAILQ_NEXT(*mvp, v_lazylist);
+	while (vp != NULL) {
+		if (vp->v_type == VMARKER) {
+			vp = TAILQ_NEXT(vp, v_lazylist);
+			continue;
+		}
+		/*
+		 * See if we want to process the vnode. Note we may encounter a
+		 * long string of vnodes we don't care about and hog the list
+		 * as a result. Check for it and requeue the marker.
+		 */
+		if (VN_IS_DOOMED(vp) || !cb(vp, cbarg)) {
+			if (!should_yield()) {
+				vp = TAILQ_NEXT(vp, v_lazylist);
+				continue;
+			}
+			TAILQ_REMOVE(&mp->mnt_lazyvnodelist, *mvp,
+			    v_lazylist);
+			TAILQ_INSERT_AFTER(&mp->mnt_lazyvnodelist, vp, *mvp,
+			    v_lazylist);
+			mtx_unlock(&mp->mnt_listmtx);
+			kern_yield(PRI_USER);
+			mtx_lock(&mp->mnt_listmtx);
+			goto restart;
+		}
+		/*
+		 * Try-lock because this is the wrong lock order.  If that does
+		 * not succeed, drop the mount vnode list lock and try to
+		 * reacquire it and the vnode interlock in the right order.
+		 */
+		if (!VI_TRYLOCK(vp) &&
+		    !mnt_vnode_next_lazy_relock(*mvp, mp, vp))
+			goto restart;
+		KASSERT(vp->v_type != VMARKER, ("locked marker %p", vp));
+		KASSERT(vp->v_mount == mp || vp->v_mount == NULL,
+		    ("alien vnode on the lazy list %p %p", vp, mp));
+		if (vp->v_mount == mp && !VN_IS_DOOMED(vp))
+			break;
+		nvp = TAILQ_NEXT(vp, v_lazylist);
+		VI_UNLOCK(vp);
+		vp = nvp;
+	}
+	TAILQ_REMOVE(&mp->mnt_lazyvnodelist, *mvp, v_lazylist);
+
+	/* Check if we are done */
+	if (vp == NULL) {
+		mtx_unlock(&mp->mnt_listmtx);
+		mnt_vnode_markerfree_lazy(mvp, mp);
+		return (NULL);
+	}
+	TAILQ_INSERT_AFTER(&mp->mnt_lazyvnodelist, vp, *mvp, v_lazylist);
+	mtx_unlock(&mp->mnt_listmtx);
+	ASSERT_VI_LOCKED(vp, "lazy iter");
+	KASSERT((vp->v_iflag & VI_ACTIVE) != 0, ("Non-active vp %p", vp));
+	return (vp);
+}
+
+struct vnode *
+__mnt_vnode_next_lazy(struct vnode **mvp, struct mount *mp, mnt_lazy_cb_t *cb,
+    void *cbarg)
+{
+
+	if (should_yield())
+		kern_yield(PRI_USER);
+	mtx_lock(&mp->mnt_listmtx);
+	return (mnt_vnode_next_lazy(mvp, mp, cb, cbarg));
+}
+
+struct vnode *
+__mnt_vnode_first_lazy(struct vnode **mvp, struct mount *mp, mnt_lazy_cb_t *cb,
+    void *cbarg)
+{
+	struct vnode *vp;
+
+	*mvp = malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO);
+	MNT_ILOCK(mp);
+	MNT_REF(mp);
+	MNT_IUNLOCK(mp);
+	(*mvp)->v_type = VMARKER;
+	(*mvp)->v_mount = mp;
+
+	mtx_lock(&mp->mnt_listmtx);
+	vp = TAILQ_FIRST(&mp->mnt_lazyvnodelist);
+	if (vp == NULL) {
+		mtx_unlock(&mp->mnt_listmtx);
+		mnt_vnode_markerfree_lazy(mvp, mp);
+		return (NULL);
+	}
+	TAILQ_INSERT_BEFORE(vp, *mvp, v_lazylist);
+	return (mnt_vnode_next_lazy(mvp, mp, cb, cbarg));
+}
+
+void
+__mnt_vnode_markerfree_lazy(struct vnode **mvp, struct mount *mp)
+{
+
+	if (*mvp == NULL)
+		return;
+
+	mtx_lock(&mp->mnt_listmtx);
+	TAILQ_REMOVE(&mp->mnt_lazyvnodelist, *mvp, v_lazylist);
+	mtx_unlock(&mp->mnt_listmtx);
+	mnt_vnode_markerfree_lazy(mvp, mp);
 }

Modified: head/sys/sys/mount.h
==============================================================================
--- head/sys/sys/mount.h	Mon Jan 13 02:31:51 2020	(r356669)
+++ head/sys/sys/mount.h	Mon Jan 13 02:34:02 2020	(r356670)
@@ -223,6 +223,8 @@ struct mount {
 	int		mnt_activevnodelistsize;/* (l) # of active vnodes */
 	struct vnodelst	mnt_tmpfreevnodelist;	/* (l) list of free vnodes */
 	int		mnt_tmpfreevnodelistsize;/* (l) # of free vnodes */
+	struct vnodelst	mnt_lazyvnodelist;	/* (l) list of lazy vnodes */
+	int		mnt_lazyvnodelistsize;	/* (l) # of lazy vnodes */
 	struct lock	mnt_explock;		/* vfs_export walkers lock */
 	TAILQ_ENTRY(mount) mnt_upper_link;	/* (m) we in the all uppers */
 	TAILQ_HEAD(, mount) mnt_uppers;		/* (m) upper mounts over us*/
@@ -266,6 +268,24 @@ void __mnt_vnode_markerfree_active(struct vno

 #define MNT_VNODE_FOREACH_ACTIVE_ABORT(mp, mvp)				\
 	__mnt_vnode_markerfree_active(&(mvp), (mp))
+
+/*
+ * Definitions for MNT_VNODE_FOREACH_LAZY.
+ */
+typedef int mnt_lazy_cb_t(struct vnode *, void *);
+struct vnode *__mnt_vnode_next_lazy(struct vnode **mvp, struct mount *mp,
+    mnt_lazy_cb_t *cb, void *cbarg);
+struct vnode *__mnt_vnode_first_lazy(struct vnode **mvp, struct mount *mp,
+    mnt_lazy_cb_t *cb, void *cbarg);
+void __mnt_vnode_markerfree_lazy(struct vnode **mvp, struct mount *mp);
+
+#define MNT_VNODE_FOREACH_LAZY(vp, mp, mvp, cb, cbarg)			\
+	for (vp = __mnt_vnode_first_lazy(&(mvp), (mp), (cb), (cbarg));	\
+		(vp) != NULL; 						\
+		vp = __mnt_vnode_next_lazy(&(mvp), (mp), (cb), (cbarg)))
+
+#define MNT_VNODE_FOREACH_LAZY_ABORT(mp, mvp)				\
+	__mnt_vnode_markerfree_lazy(&(mvp), (mp))

 #define	MNT_ILOCK(mp)	mtx_lock(&(mp)->mnt_mtx)
 #define	MNT_ITRYLOCK(mp) mtx_trylock(&(mp)->mnt_mtx)

Modified: head/sys/sys/vnode.h
==============================================================================
--- head/sys/sys/vnode.h	Mon Jan 13 02:31:51 2020	(r356669)
+++ head/sys/sys/vnode.h	Mon Jan 13 02:34:02 2020	(r356670)
@@ -148,6 +148,7 @@ struct vnode {
 	 * The machinery of being a vnode
 	 */
 	TAILQ_ENTRY(vnode) v_actfreelist;	/* l vnode active/free lists */
+	TAILQ_ENTRY(vnode) v_lazylist;		/* l vnode lazy list */
 	struct bufobj	v_bufobj;		/* * Buffer cache object */

 	/*
@@ -260,6 +261,7 @@ struct xvnode {
#define	VV_READLINK	0x2000	/* fdescfs linux vnode */

 #define	VMP_TMPMNTFREELIST	0x0001	/* Vnode is on mnt's tmp free list */
+#define	VMP_LAZYLIST		0x0002	/* Vnode is on mnt's lazy list */

 /*
  * Vnode attributes.  A field value of VNOVAL represents a field whose value
@@ -653,6 +655,7 @@ int	vaccess_acl_posix1e(enum vtype type, uid_t file_ui
 	    struct ucred *cred, int *privused);
 void	vattr_null(struct vattr *vap);
 int	vcount(struct vnode *vp);
+void	vlazy(struct vnode *);
 void	vdrop(struct vnode *);
 void	vdropl(struct vnode *);
 int	vflush(struct mount *mp, int rootrefs, int flags, struct thread *td);
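
For anyone wanting to adopt the new iterator in a filesystem, the usage pattern
mirrors MNT_VNODE_FOREACH_ACTIVE: the filter callback runs under the mount
vnode list lock but without the vnode interlock, and only decides whether the
iterator should bother locking and returning the vnode; the loop body then
re-checks state under the interlock, exactly as vfs_periodic_inactive() does
above.  The sketch below is illustrative only and not part of the commit;
example_walk_lazy() and skip_clean_filter() are made-up names.

#include <sys/param.h>
#include <sys/mount.h>
#include <sys/vnode.h>

/*
 * Hypothetical consumer of MNT_VNODE_FOREACH_LAZY(), modeled on
 * vfs_periodic_inactive() from the diff above.
 */
static int
skip_clean_filter(struct vnode *vp, void *arg __unused)
{

	/* Unlocked pre-check: only vnodes with deferred work get visited. */
	return (vp->v_iflag & VI_DEFINACT);
}

static void
example_walk_lazy(struct mount *mp)
{
	struct vnode *vp, *mvp;

	MNT_VNODE_FOREACH_LAZY(vp, mp, mvp, skip_clean_filter, NULL) {
		/* The iterator returns vp with its interlock held. */
		if ((vp->v_iflag & VI_DEFINACT) == 0) {
			VI_UNLOCK(vp);
			continue;
		}
		vp->v_iflag &= ~VI_DEFINACT;
		/* ... per-vnode work goes here ... */
		VI_UNLOCK(vp);
	}
}

A consumer that leaves the loop early needs MNT_VNODE_FOREACH_LAZY_ABORT(mp, mvp)
to remove and free the marker vnode, the same convention as the existing
MNT_VNODE_FOREACH_ACTIVE_ABORT.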