Date: Mon, 13 Jan 2020 02:39:41 +0000 (UTC) From: Mateusz Guzik <mjg@FreeBSD.org> To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org Subject: svn commit: r356673 - in head/sys: kern sys Message-ID: <202001130239.00D2df0x028071@repo.freebsd.org>
next in thread | raw e-mail | index | archive | help
Author: mjg Date: Mon Jan 13 02:39:41 2020 New Revision: 356673 URL: https://svnweb.freebsd.org/changeset/base/356673 Log: vfs: per-cpu batched requeuing of free vnodes Constant requeuing adds significant lock contention in certain workloads. Lessen the problem by batching it. Per-cpu areas are locked in order to synchronize against UMA freeing memory. vnode's v_mflag is converted to short to prevent the struct from growing. Sample result from an incremental make -s -j 104 bzImage on tmpfs: stock: 122.38s user 1780.45s system 6242% cpu 30.480 total patched: 144.84s user 985.90s system 4856% cpu 23.282 total Reviewed by: jeff Tested by: pho (in a larger patch, previous version) Differential Revision: https://reviews.freebsd.org/D22998 Modified: head/sys/kern/vfs_subr.c head/sys/sys/vnode.h Modified: head/sys/kern/vfs_subr.c ============================================================================== --- head/sys/kern/vfs_subr.c Mon Jan 13 02:37:25 2020 (r356672) +++ head/sys/kern/vfs_subr.c Mon Jan 13 02:39:41 2020 (r356673) @@ -295,6 +295,16 @@ static int stat_rush_requests; /* number of times I/O SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, "Number of times I/O speeded up (rush requests)"); +#define VDBATCH_SIZE 8 +struct vdbatch { + u_int index; + struct mtx lock; + struct vnode *tab[VDBATCH_SIZE]; +}; +DPCPU_DEFINE_STATIC(struct vdbatch, vd); + +static void vdbatch_dequeue(struct vnode *vp); + /* * When shutting down the syncer, run it at four times normal speed. 
*/ @@ -552,6 +562,8 @@ vnode_init(void *mem, int size, int flags) */ rangelock_init(&vp->v_rl); + vp->v_dbatchcpu = NOCPU; + mtx_lock(&vnode_list_mtx); TAILQ_INSERT_BEFORE(vnode_list_free_marker, vp, v_vnodelist); mtx_unlock(&vnode_list_mtx); @@ -568,6 +580,7 @@ vnode_fini(void *mem, int size) struct bufobj *bo; vp = mem; + vdbatch_dequeue(vp); mtx_lock(&vnode_list_mtx); TAILQ_REMOVE(&vnode_list, vp, v_vnodelist); mtx_unlock(&vnode_list_mtx); @@ -602,8 +615,9 @@ vnode_fini(void *mem, int size) static void vntblinit(void *dummy __unused) { + struct vdbatch *vd; + int cpu, physvnodes, virtvnodes; u_int i; - int physvnodes, virtvnodes; /* * Desiredvnodes is a function of the physical memory size and the @@ -669,6 +683,12 @@ vntblinit(void *dummy __unused) for (i = 1; i <= sizeof(struct vnode); i <<= 1) vnsz2log++; vnsz2log--; + + CPU_FOREACH(cpu) { + vd = DPCPU_ID_PTR((cpu), vd); + bzero(vd, sizeof(*vd)); + mtx_init(&vd->lock, "vdbatch", NULL, MTX_DEF); + } } SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL); @@ -3199,7 +3219,99 @@ vholdnz(struct vnode *vp) #endif } +static void __noinline +vdbatch_process(struct vdbatch *vd) +{ + struct vnode *vp; + int i; + + mtx_assert(&vd->lock, MA_OWNED); + MPASS(vd->index == VDBATCH_SIZE); + + mtx_lock(&vnode_list_mtx); + for (i = 0; i < VDBATCH_SIZE; i++) { + vp = vd->tab[i]; + TAILQ_REMOVE(&vnode_list, vp, v_vnodelist); + TAILQ_INSERT_TAIL(&vnode_list, vp, v_vnodelist); + MPASS(vp->v_dbatchcpu != NOCPU); + vp->v_dbatchcpu = NOCPU; + } + bzero(vd->tab, sizeof(vd->tab)); + vd->index = 0; + mtx_unlock(&vnode_list_mtx); +} + +static void +vdbatch_enqueue(struct vnode *vp) +{ + struct vdbatch *vd; + + ASSERT_VI_LOCKED(vp, __func__); + VNASSERT(!VN_IS_DOOMED(vp), vp, + ("%s: deferring requeue of a doomed vnode", __func__)); + + if (vp->v_dbatchcpu != NOCPU) { + VI_UNLOCK(vp); + return; + } + + /* + * A hack: pin us to the current CPU so that we know what to put in + * ->v_dbatchcpu. 
+ */ + sched_pin(); + vd = DPCPU_PTR(vd); + mtx_lock(&vd->lock); + MPASS(vd->index < VDBATCH_SIZE); + MPASS(vd->tab[vd->index] == NULL); + vp->v_dbatchcpu = curcpu; + vd->tab[vd->index] = vp; + vd->index++; + VI_UNLOCK(vp); + if (vd->index == VDBATCH_SIZE) + vdbatch_process(vd); + mtx_unlock(&vd->lock); + sched_unpin(); +} + /* + * This routine must only be called for vnodes which are about to be + * deallocated. Supporting dequeue for arbitrary vnodes would require + * validating that the locked batch matches. + */ +static void +vdbatch_dequeue(struct vnode *vp) +{ + struct vdbatch *vd; + int i; + short cpu; + + VNASSERT(vp->v_type == VBAD || vp->v_type == VNON, vp, + ("%s: called for a used vnode\n", __func__)); + + cpu = vp->v_dbatchcpu; + if (cpu == NOCPU) + return; + + vd = DPCPU_ID_PTR(cpu, vd); + mtx_lock(&vd->lock); + for (i = 0; i < vd->index; i++) { + if (vd->tab[i] != vp) + continue; + vp->v_dbatchcpu = NOCPU; + vd->index--; + vd->tab[i] = vd->tab[vd->index]; + vd->tab[vd->index] = NULL; + break; + } + mtx_unlock(&vd->lock); + /* + * Either we dequeued the vnode above or the target CPU beat us to it. + */ + MPASS(vp->v_dbatchcpu == NOCPU); +} + +/* * Drop the hold count of the vnode. If this is the last reference to * the vnode we place it on the free list unless it has been vgone'd * (marked VIRF_DOOMED) in which case we will free it. 
@@ -3236,12 +3348,8 @@ vdrop_deactivate(struct vnode *vp) mp->mnt_lazyvnodelistsize--; mtx_unlock(&mp->mnt_listmtx); } - mtx_lock(&vnode_list_mtx); - TAILQ_REMOVE(&vnode_list, vp, v_vnodelist); - TAILQ_INSERT_TAIL(&vnode_list, vp, v_vnodelist); - mtx_unlock(&vnode_list_mtx); atomic_add_long(&freevnodes, 1); - VI_UNLOCK(vp); + vdbatch_enqueue(vp); } void Modified: head/sys/sys/vnode.h ============================================================================== --- head/sys/sys/vnode.h Mon Jan 13 02:37:25 2020 (r356672) +++ head/sys/sys/vnode.h Mon Jan 13 02:39:41 2020 (r356673) @@ -171,7 +171,8 @@ struct vnode { u_int v_usecount; /* I ref count of users */ u_int v_iflag; /* i vnode flags (see below) */ u_int v_vflag; /* v vnode flags */ - u_int v_mflag; /* l mnt-specific vnode flags */ + u_short v_mflag; /* l mnt-specific vnode flags */ + short v_dbatchcpu; /* i LRU requeue deferral batch */ int v_writecount; /* I ref count of writers or (negative) text users */ u_int v_hash;
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?202001130239.00D2df0x028071>