From: Mateusz Guzik <mjg@FreeBSD.org>
Date: Mon, 13 Jan 2020 02:39:41 +0000 (UTC)
To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject: svn commit: r356673 - in head/sys: kern sys

Author: mjg
Date: Mon Jan 13 02:39:41 2020
New Revision: 356673
URL: https://svnweb.freebsd.org/changeset/base/356673

Log:
  vfs: per-cpu batched requeuing of free vnodes

  Constant requeuing adds significant lock contention in certain
  workloads. Lessen the problem by batching it.

  Per-cpu areas are locked in order to synchronize against UMA freeing
  memory.

  vnode's v_mflag is converted to short to prevent the struct from
  growing.
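  The scheme in miniature, as a userspace sketch (hypothetical names;
  pthread mutexes stand in for the kernel mutexes and a single batch
  stands in for the per-CPU DPCPU areas -- this is not the committed
  code, which is in the diff below):

	#include <pthread.h>
	#include <stddef.h>

	#define BATCH_SIZE 8		/* mirrors VDBATCH_SIZE in the patch */

	struct node {
		struct node *next;	/* stand-in for the vnode list linkage */
	};

	/* The contended global lock; now taken once per BATCH_SIZE requeues. */
	static pthread_mutex_t global_list_lock = PTHREAD_MUTEX_INITIALIZER;

	/* One of these would exist per CPU; a single instance suffices here. */
	struct batch {
		pthread_mutex_t lock;
		unsigned int index;
		struct node *tab[BATCH_SIZE];
	};

	static void
	batch_flush(struct batch *b)
	{
		unsigned int i;

		/* One global acquisition amortized over BATCH_SIZE requeues. */
		pthread_mutex_lock(&global_list_lock);
		for (i = 0; i < BATCH_SIZE; i++) {
			/* ... requeue b->tab[i] to the global list tail ... */
			b->tab[i] = NULL;
		}
		b->index = 0;
		pthread_mutex_unlock(&global_list_lock);
	}

	static void
	batch_enqueue(struct batch *b, struct node *n)
	{
		/* Stage under the cheap local lock; flush only when full. */
		pthread_mutex_lock(&b->lock);
		b->tab[b->index++] = n;
		if (b->index == BATCH_SIZE)
			batch_flush(b);
		pthread_mutex_unlock(&b->lock);
	}

	int
	main(void)
	{
		static struct batch b = { .lock = PTHREAD_MUTEX_INITIALIZER };
		struct node nodes[2 * BATCH_SIZE];
		size_t i;

		for (i = 0; i < 2 * BATCH_SIZE; i++)
			batch_enqueue(&b, &nodes[i]);
		return (0);
	}

  A full batch pays one global lock acquisition per BATCH_SIZE requeues
  instead of one per requeue.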
  Sample result from an incremental make -s -j 104 bzImage on tmpfs:

  stock:   122.38s user 1780.45s system 6242% cpu 30.480 total
  patched: 144.84s user  985.90s system 4856% cpu 23.282 total

  Reviewed by:	jeff
  Tested by:	pho (in a larger patch, previous version)
  Differential Revision:	https://reviews.freebsd.org/D22998

Modified:
  head/sys/kern/vfs_subr.c
  head/sys/sys/vnode.h

Modified: head/sys/kern/vfs_subr.c
==============================================================================
--- head/sys/kern/vfs_subr.c	Mon Jan 13 02:37:25 2020	(r356672)
+++ head/sys/kern/vfs_subr.c	Mon Jan 13 02:39:41 2020	(r356673)
@@ -295,6 +295,16 @@ static int stat_rush_requests;	/* number of times I/O
 SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW,
     &stat_rush_requests, 0, "Number of times I/O speeded up (rush requests)");
 
+#define	VDBATCH_SIZE 8
+struct vdbatch {
+	u_int index;
+	struct mtx lock;
+	struct vnode *tab[VDBATCH_SIZE];
+};
+DPCPU_DEFINE_STATIC(struct vdbatch, vd);
+
+static void	vdbatch_dequeue(struct vnode *vp);
+
 /*
  * When shutting down the syncer, run it at four times normal speed.
  */
@@ -552,6 +562,8 @@ vnode_init(void *mem, int size, int flags)
 	 */
 	rangelock_init(&vp->v_rl);
 
+	vp->v_dbatchcpu = NOCPU;
+
 	mtx_lock(&vnode_list_mtx);
 	TAILQ_INSERT_BEFORE(vnode_list_free_marker, vp, v_vnodelist);
 	mtx_unlock(&vnode_list_mtx);
@@ -568,6 +580,7 @@ vnode_fini(void *mem, int size)
 	struct bufobj *bo;
 
 	vp = mem;
+	vdbatch_dequeue(vp);
 	mtx_lock(&vnode_list_mtx);
 	TAILQ_REMOVE(&vnode_list, vp, v_vnodelist);
 	mtx_unlock(&vnode_list_mtx);
@@ -602,8 +615,9 @@ vnode_fini(void *mem, int size)
 static void
 vntblinit(void *dummy __unused)
 {
+	struct vdbatch *vd;
+	int cpu, physvnodes, virtvnodes;
 	u_int i;
-	int physvnodes, virtvnodes;
 
 	/*
 	 * Desiredvnodes is a function of the physical memory size and the
@@ -669,6 +683,12 @@ vntblinit(void *dummy __unused)
 	for (i = 1; i <= sizeof(struct vnode); i <<= 1)
 		vnsz2log++;
 	vnsz2log--;
+
+	CPU_FOREACH(cpu) {
+		vd = DPCPU_ID_PTR((cpu), vd);
+		bzero(vd, sizeof(*vd));
+		mtx_init(&vd->lock, "vdbatch", NULL, MTX_DEF);
+	}
 }
 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL);
 
@@ -3199,7 +3219,99 @@ vholdnz(struct vnode *vp)
 #endif
 }
 
+static void __noinline
+vdbatch_process(struct vdbatch *vd)
+{
+	struct vnode *vp;
+	int i;
+
+	mtx_assert(&vd->lock, MA_OWNED);
+	MPASS(vd->index == VDBATCH_SIZE);
+
+	mtx_lock(&vnode_list_mtx);
+	for (i = 0; i < VDBATCH_SIZE; i++) {
+		vp = vd->tab[i];
+		TAILQ_REMOVE(&vnode_list, vp, v_vnodelist);
+		TAILQ_INSERT_TAIL(&vnode_list, vp, v_vnodelist);
+		MPASS(vp->v_dbatchcpu != NOCPU);
+		vp->v_dbatchcpu = NOCPU;
+	}
+	bzero(vd->tab, sizeof(vd->tab));
+	vd->index = 0;
+	mtx_unlock(&vnode_list_mtx);
+}
+
+static void
+vdbatch_enqueue(struct vnode *vp)
+{
+	struct vdbatch *vd;
+
+	ASSERT_VI_LOCKED(vp, __func__);
+	VNASSERT(!VN_IS_DOOMED(vp), vp,
+	    ("%s: deferring requeue of a doomed vnode", __func__));
+
+	if (vp->v_dbatchcpu != NOCPU) {
+		VI_UNLOCK(vp);
+		return;
+	}
+
+	/*
+	 * A hack: pin us to the current CPU so that we know what to put in
+	 * ->v_dbatchcpu.
+	 */
+	sched_pin();
+	vd = DPCPU_PTR(vd);
+	mtx_lock(&vd->lock);
+	MPASS(vd->index < VDBATCH_SIZE);
+	MPASS(vd->tab[vd->index] == NULL);
+	vp->v_dbatchcpu = curcpu;
+	vd->tab[vd->index] = vp;
+	vd->index++;
+	VI_UNLOCK(vp);
+	if (vd->index == VDBATCH_SIZE)
+		vdbatch_process(vd);
+	mtx_unlock(&vd->lock);
+	sched_unpin();
+}
+
 /*
+ * This routine must only be called for vnodes which are about to be
+ * deallocated. Supporting dequeue for arbitrary vnodes would require
+ * validating that the locked batch matches.
+ */
+static void
+vdbatch_dequeue(struct vnode *vp)
+{
+	struct vdbatch *vd;
+	int i;
+	short cpu;
+
+	VNASSERT(vp->v_type == VBAD || vp->v_type == VNON, vp,
+	    ("%s: called for a used vnode\n", __func__));
+
+	cpu = vp->v_dbatchcpu;
+	if (cpu == NOCPU)
+		return;
+
+	vd = DPCPU_ID_PTR(cpu, vd);
+	mtx_lock(&vd->lock);
+	for (i = 0; i < vd->index; i++) {
+		if (vd->tab[i] != vp)
+			continue;
+		vp->v_dbatchcpu = NOCPU;
+		vd->index--;
+		vd->tab[i] = vd->tab[vd->index];
+		vd->tab[vd->index] = NULL;
+		break;
+	}
+	mtx_unlock(&vd->lock);
+	/*
+	 * Either we dequeued the vnode above or the target CPU beat us to it.
+	 */
+	MPASS(vp->v_dbatchcpu == NOCPU);
+}
+
+/*
  * Drop the hold count of the vnode. If this is the last reference to
  * the vnode we place it on the free list unless it has been vgone'd
  * (marked VIRF_DOOMED) in which case we will free it.
@@ -3236,12 +3348,8 @@ vdrop_deactivate(struct vnode *vp)
 		mp->mnt_lazyvnodelistsize--;
 		mtx_unlock(&mp->mnt_listmtx);
 	}
-	mtx_lock(&vnode_list_mtx);
-	TAILQ_REMOVE(&vnode_list, vp, v_vnodelist);
-	TAILQ_INSERT_TAIL(&vnode_list, vp, v_vnodelist);
-	mtx_unlock(&vnode_list_mtx);
 	atomic_add_long(&freevnodes, 1);
-	VI_UNLOCK(vp);
+	vdbatch_enqueue(vp);
 }
 
 void

Modified: head/sys/sys/vnode.h
==============================================================================
--- head/sys/sys/vnode.h	Mon Jan 13 02:37:25 2020	(r356672)
+++ head/sys/sys/vnode.h	Mon Jan 13 02:39:41 2020	(r356673)
@@ -171,7 +171,8 @@ struct vnode {
 	u_int	v_usecount;	/* I ref count of users */
 	u_int	v_iflag;	/* i vnode flags (see below) */
 	u_int	v_vflag;	/* v vnode flags */
-	u_int	v_mflag;	/* l mnt-specific vnode flags */
+	u_short	v_mflag;	/* l mnt-specific vnode flags */
+	short	v_dbatchcpu;	/* i LRU requeue deferral batch */
 	int	v_writecount;	/* I ref count of writers or
 				   (negative) text users */
 	u_int	v_hash;
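The vnode.h hunk relies on struct layout: splitting the old u_int v_mflag
into a u_short plus a 16-bit v_dbatchcpu reuses space the field already
occupied. A minimal standalone illustration of that claim (hypothetical
struct names; only the neighboring fields are mirrored, not the real
struct vnode):

	#include <stdio.h>

	struct before {			/* r356672 layout */
		unsigned int v_usecount;
		unsigned int v_iflag;
		unsigned int v_vflag;
		unsigned int v_mflag;	/* full u_int */
		int v_writecount;
	};

	struct after {			/* r356673 layout */
		unsigned int v_usecount;
		unsigned int v_iflag;
		unsigned int v_vflag;
		unsigned short v_mflag;	/* shrunk to 16 bits */
		short v_dbatchcpu;	/* new field in the reclaimed 16 bits */
		int v_writecount;
	};

	int
	main(void)
	{
		printf("before: %zu bytes\n", sizeof(struct before));
		printf("after:  %zu bytes\n", sizeof(struct after));
		return (0);
	}

On common ILP32/LP64 ABIs both structs come out the same size, which is
the point of narrowing v_mflag.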