Date:      Tue, 20 Feb 2018 00:06:07 +0000 (UTC)
From:      Jeff Roberson <jeff@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject:   svn commit: r329612 - in head/sys: kern sys
Message-ID:  <201802200006.w1K067ZL032311@repo.freebsd.org>

Author: jeff
Date: Tue Feb 20 00:06:07 2018
New Revision: 329612
URL: https://svnweb.freebsd.org/changeset/base/329612

Log:
  Further parallelize the buffer cache.
  
  Provide multiple clean queues partitioned into 'domains'.  Each domain manages
  its own bufspace and has its own bufspace daemon.  Each domain has a set of
  subqueues indexed by the current cpuid to reduce lock contention on the cleanq.
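  
  As a rough illustration of that layout, here is a userland sketch of the
  domain/per-cpu subqueue structures (the xbuf/xbufdomain names, the pthread
  locks, and xbinsfree() are illustrative stand-ins for the kernel's
  bufdomain/bufqueue code in the diff below, not the kernel code itself):
  
  /*
   * Minimal sketch: each domain owns NCPU subqueues plus a shared cleanq.
   * Releasing a clean buf only contends on the releasing cpu's subqueue.
   */
  #include <pthread.h>
  #include <sys/queue.h>
  
  #define NCPU     8   /* assumed cpu count for the sketch */
  #define NDOMAINS 2   /* e.g. one domain per chunk of buffer space */
  
  struct xbuf {
      TAILQ_ENTRY(xbuf) b_freelist;
      int b_domain;     /* owning domain, fixed when the buf is allocated */
      int b_subqueue;   /* subqueue the buf was last released onto */
  };
  
  struct xbufqueue {
      pthread_mutex_t bq_lock;
      TAILQ_HEAD(, xbuf) bq_queue;
      int bq_len;
  };
  
  struct xbufdomain {
      struct xbufqueue bd_subq[NCPU + 1]; /* per-cpu queues + shared cleanq */
      struct xbufqueue *bd_cleanq;        /* points at bd_subq[NCPU] */
      long bd_bufspace;
      long bd_bufspacethresh;
  };
  
  static struct xbufdomain domains[NDOMAINS];
  
  static void
  xbd_init(struct xbufdomain *bd)
  {
      int i;
  
      for (i = 0; i <= NCPU; i++) {
          pthread_mutex_init(&bd->bd_subq[i].bq_lock, NULL);
          TAILQ_INIT(&bd->bd_subq[i].bq_queue);
      }
      bd->bd_cleanq = &bd->bd_subq[NCPU];
  }
  
  /* Release a clean buf onto the releasing cpu's subqueue in its domain. */
  static void
  xbinsfree(struct xbuf *bp, int cpuid)
  {
      struct xbufqueue *bq = &domains[bp->b_domain].bd_subq[cpuid];
  
      pthread_mutex_lock(&bq->bq_lock);
      TAILQ_INSERT_TAIL(&bq->bq_queue, bp, b_freelist);
      bp->b_subqueue = cpuid;
      bq->bq_len++;
      pthread_mutex_unlock(&bq->bq_lock);
  }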
  
  Refine the sleep/wakeup around the bufspace daemon to use atomics as much as
  possible.
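  
  The pattern can be sketched with C11 atomics and a pthread condition
  variable (daemon_wakeup()/daemon_wait() and the userland primitives are
  illustrative stand-ins for the bd_running/msleep() logic in the diff):
  the waker pays for the lock only on the asleep-to-awake transition, and
  the daemon clears the flag before re-checking its limits so a concurrent
  wakeup cannot be lost.
  
  #include <pthread.h>
  #include <stdatomic.h>
  #include <stdbool.h>
  
  struct daemon_state {
      atomic_int running;         /* nonzero while the daemon is awake */
      pthread_mutex_t run_lock;
      pthread_cond_t run_cv;
  };
  
  /* Waker: skip the lock entirely unless the daemon may be sleeping. */
  static void
  daemon_wakeup(struct daemon_state *ds)
  {
      if (atomic_fetch_add(&ds->running, 1) == 0) {
          pthread_mutex_lock(&ds->run_lock);
          atomic_store(&ds->running, 1);   /* collapse any overshoot */
          pthread_cond_signal(&ds->run_cv);
          pthread_mutex_unlock(&ds->run_lock);
      }
  }
  
  /*
   * Daemon: clear 'running' before deciding to sleep.  The kernel re-reads
   * its space and free-buffer limits at this point (wakers update them
   * before checking bd_running), so a wakeup cannot be missed.
   */
  static void
  daemon_wait(struct daemon_state *ds, bool below_limits)
  {
      pthread_mutex_lock(&ds->run_lock);
      atomic_store(&ds->running, 0);
      if (below_limits)
          pthread_cond_wait(&ds->run_cv, &ds->run_lock);
      else
          atomic_store(&ds->running, 1);   /* stay awake; skip the sleep */
      pthread_mutex_unlock(&ds->run_lock);
  }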
  
  Add a B_REUSE flag that is used to requeue bufs during the scan to approximate
  LRU rather than locking the queue on every use of a frequently accessed buf.
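  
  A sketch of that second-chance scheme (the xbuf names and XB_REUSE value
  are illustrative; the scan half mirrors the buf_recycle() hunk in the
  diff below): a reuse merely sets a flag, and the recycling scan requeues
  flagged bufs to the tail instead of freeing them, which approximates LRU
  without taking the queue lock on every access.
  
  #include <stddef.h>
  #include <sys/queue.h>      /* FreeBSD's queue.h, for TAILQ_FOREACH_SAFE */
  
  #define XB_REUSE 0x0001     /* illustrative flag value */
  
  struct xbuf {
      TAILQ_ENTRY(xbuf) b_freelist;
      int b_flags;
  };
  TAILQ_HEAD(xbufq, xbuf);
  
  /* On reuse, only mark the buf; do not touch the queue or its lock. */
  static void
  xbuf_touch(struct xbuf *bp)
  {
      bp->b_flags |= XB_REUSE;
  }
  
  /* Scan for a victim, giving recently reused bufs a second chance. */
  static struct xbuf *
  xbuf_choose_victim(struct xbufq *q)
  {
      struct xbuf *bp, *nbp;
  
      TAILQ_FOREACH_SAFE(bp, q, b_freelist, nbp) {
          if (bp->b_flags & XB_REUSE) {
              TAILQ_REMOVE(q, bp, b_freelist);
              TAILQ_INSERT_TAIL(q, bp, b_freelist);
              bp->b_flags &= ~XB_REUSE;
              continue;
          }
          return (bp);    /* oldest buf not recently reused */
      }
      return (NULL);
  }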
  
  Implement bufspace_reserve with only atomic_fetchadd to avoid loop restarts.
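  
  A minimal C11 sketch of that reservation scheme (xbufspace_reserve() and
  its parameters are illustrative; the kernel version operates on a
  domain's bd_bufspace and also wakes the bufspace daemon on the threshold
  transition, as the diff shows): the space is charged speculatively with
  one fetch-add and backed out on failure, rather than looping on a
  compare-and-swap.
  
  #include <errno.h>
  #include <stdatomic.h>
  
  static atomic_long xbufspace;   /* stand-in for a domain's bd_bufspace */
  
  static int
  xbufspace_reserve(long size, long limit)
  {
      long space;
  
      space = atomic_fetch_add(&xbufspace, size);
      if (space + size > limit) {
          /* Over the limit: undo the speculative charge and fail. */
          atomic_fetch_sub(&xbufspace, size);
          return (ENOSPC);
      }
      return (0);
  }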
  
  Reviewed by:	markj
  Tested by:	pho
  Sponsored by:	Netflix, Dell/EMC Isilon
  Differential Revision:	https://reviews.freebsd.org/D14274

Modified:
  head/sys/kern/vfs_bio.c
  head/sys/kern/vfs_subr.c
  head/sys/sys/buf.h
  head/sys/sys/bufobj.h

Modified: head/sys/kern/vfs_bio.c
==============================================================================
--- head/sys/kern/vfs_bio.c	Mon Feb 19 22:56:04 2018	(r329611)
+++ head/sys/kern/vfs_bio.c	Tue Feb 20 00:06:07 2018	(r329612)
@@ -51,6 +51,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/systm.h>
 #include <sys/bio.h>
 #include <sys/conf.h>
+#include <sys/counter.h>
 #include <sys/buf.h>
 #include <sys/devicestat.h>
 #include <sys/eventhandler.h>
@@ -105,7 +106,6 @@ caddr_t unmapped_buf;
 
 /* Used below and for softdep flushing threads in ufs/ffs/ffs_softdep.c */
 struct proc *bufdaemonproc;
-struct proc *bufspacedaemonproc;
 
 static int inmem(struct vnode *vp, daddr_t blkno);
 static void vm_hold_free_pages(struct buf *bp, int newbsize);
@@ -124,11 +124,8 @@ static int vfs_bio_clcheck(struct vnode *vp, int size,
 static void breada(struct vnode *, daddr_t *, int *, int, struct ucred *, int,
 		void (*)(struct buf *));
 static int buf_flush(struct vnode *vp, int);
-static int buf_recycle(bool);
-static int buf_scan(bool);
 static int flushbufqueues(struct vnode *, int, int);
 static void buf_daemon(void);
-static void bremfreel(struct buf *bp);
 static __inline void bd_wakeup(void);
 static int sysctl_runningspace(SYSCTL_HANDLER_ARGS);
 static void bufkva_reclaim(vmem_t *, int);
@@ -137,28 +134,17 @@ static int buf_import(void *, void **, int, int, int);
 static void buf_release(void *, void **, int);
 static void maxbcachebuf_adjust(void);
 
-#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
-    defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
 static int sysctl_bufspace(SYSCTL_HANDLER_ARGS);
-#endif
-
 int vmiodirenable = TRUE;
 SYSCTL_INT(_vfs, OID_AUTO, vmiodirenable, CTLFLAG_RW, &vmiodirenable, 0,
     "Use the VM system for directory writes");
 long runningbufspace;
 SYSCTL_LONG(_vfs, OID_AUTO, runningbufspace, CTLFLAG_RD, &runningbufspace, 0,
     "Amount of presently outstanding async buffer io");
-static long bufspace;
-#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
-    defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
 SYSCTL_PROC(_vfs, OID_AUTO, bufspace, CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RD,
-    &bufspace, 0, sysctl_bufspace, "L", "Virtual memory used for buffers");
-#else
-SYSCTL_LONG(_vfs, OID_AUTO, bufspace, CTLFLAG_RD, &bufspace, 0,
-    "Physical memory used for buffers");
-#endif
-static long bufkvaspace;
-SYSCTL_LONG(_vfs, OID_AUTO, bufkvaspace, CTLFLAG_RD, &bufkvaspace, 0,
+    NULL, 0, sysctl_bufspace, "L", "Physical memory used for buffers");
+static counter_u64_t bufkvaspace;
+SYSCTL_COUNTER_U64(_vfs, OID_AUTO, bufkvaspace, CTLFLAG_RD, &bufkvaspace,
     "Kernel virtual memory used for buffers");
 static long maxbufspace;
 SYSCTL_LONG(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RW, &maxbufspace, 0,
@@ -178,11 +164,11 @@ SYSCTL_LONG(_vfs, OID_AUTO, hibufspace, CTLFLAG_RW, &h
 long bufspacethresh;
 SYSCTL_LONG(_vfs, OID_AUTO, bufspacethresh, CTLFLAG_RW, &bufspacethresh,
     0, "Bufspace consumed before waking the daemon to free some");
-static int buffreekvacnt;
-SYSCTL_INT(_vfs, OID_AUTO, buffreekvacnt, CTLFLAG_RW, &buffreekvacnt, 0,
+static counter_u64_t buffreekvacnt;
+SYSCTL_COUNTER_U64(_vfs, OID_AUTO, buffreekvacnt, CTLFLAG_RW, &buffreekvacnt,
     "Number of times we have freed the KVA space from some buffer");
-static int bufdefragcnt;
-SYSCTL_INT(_vfs, OID_AUTO, bufdefragcnt, CTLFLAG_RW, &bufdefragcnt, 0,
+static counter_u64_t bufdefragcnt;
+SYSCTL_COUNTER_U64(_vfs, OID_AUTO, bufdefragcnt, CTLFLAG_RW, &bufdefragcnt,
     "Number of times we have had to repeat buffer allocation to defragment");
 static long lorunningspace;
 SYSCTL_PROC(_vfs, OID_AUTO, lorunningspace, CTLTYPE_LONG | CTLFLAG_MPSAFE |
@@ -225,24 +211,26 @@ SYSCTL_INT(_vfs, OID_AUTO, lofreebuffers, CTLFLAG_RW, 
 static int hifreebuffers;
 SYSCTL_INT(_vfs, OID_AUTO, hifreebuffers, CTLFLAG_RW, &hifreebuffers, 0,
    "Threshold for clean buffer recycling");
-static int getnewbufcalls;
-SYSCTL_INT(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RW, &getnewbufcalls, 0,
-   "Number of calls to getnewbuf");
-static int getnewbufrestarts;
-SYSCTL_INT(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RW, &getnewbufrestarts, 0,
+static counter_u64_t getnewbufcalls;
+SYSCTL_COUNTER_U64(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RD,
+   &getnewbufcalls, "Number of calls to getnewbuf");
+static counter_u64_t getnewbufrestarts;
+SYSCTL_COUNTER_U64(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RD,
+    &getnewbufrestarts,
     "Number of times getnewbuf has had to restart a buffer acquisition");
-static int mappingrestarts;
-SYSCTL_INT(_vfs, OID_AUTO, mappingrestarts, CTLFLAG_RW, &mappingrestarts, 0,
+static counter_u64_t mappingrestarts;
+SYSCTL_COUNTER_U64(_vfs, OID_AUTO, mappingrestarts, CTLFLAG_RD,
+    &mappingrestarts,
     "Number of times getblk has had to restart a buffer mapping for "
     "unmapped buffer");
-static int numbufallocfails;
-SYSCTL_INT(_vfs, OID_AUTO, numbufallocfails, CTLFLAG_RW, &numbufallocfails, 0,
-    "Number of times buffer allocations failed");
+static counter_u64_t numbufallocfails;
+SYSCTL_COUNTER_U64(_vfs, OID_AUTO, numbufallocfails, CTLFLAG_RW,
+    &numbufallocfails, "Number of times buffer allocations failed");
 static int flushbufqtarget = 100;
 SYSCTL_INT(_vfs, OID_AUTO, flushbufqtarget, CTLFLAG_RW, &flushbufqtarget, 0,
     "Amount of work to do in flushbufqueues when helping bufdaemon");
-static long notbufdflushes;
-SYSCTL_LONG(_vfs, OID_AUTO, notbufdflushes, CTLFLAG_RD, &notbufdflushes, 0,
+static counter_u64_t notbufdflushes;
+SYSCTL_COUNTER_U64(_vfs, OID_AUTO, notbufdflushes, CTLFLAG_RD, &notbufdflushes,
     "Number of dirty buffer flushes done by the bufdaemon helpers");
 static long barrierwrites;
 SYSCTL_LONG(_vfs, OID_AUTO, barrierwrites, CTLFLAG_RW, &barrierwrites, 0,
@@ -266,11 +254,6 @@ static struct mtx_padalign __exclusive_cache_line bdlo
 static struct mtx_padalign __exclusive_cache_line rbreqlock;
 
 /*
- * Lock that protects needsbuffer and the sleeps/wakeups surrounding it.
- */
-static struct rwlock_padalign __exclusive_cache_line nblock;
-
-/*
  * Lock that protects bdirtywait.
  */
 static struct mtx_padalign __exclusive_cache_line bdirtylock;
@@ -283,11 +266,6 @@ static struct mtx_padalign __exclusive_cache_line bdir
 static int bd_request;
 
 /*
- * Request/wakeup point for the bufspace daemon.
- */
-static int bufspace_request;
-
-/*
  * Request for the buf daemon to write more buffers than is indicated by
  * lodirtybuf.  This may be necessary to push out excess dependencies or
  * defragment the address space where a simple count of the number of dirty
@@ -302,15 +280,6 @@ static int bd_speedupreq;
  */
 static int runningbufreq;
 
-/* 
- * Synchronization (sleep/wakeup) variable for buffer requests.
- * Can contain the VFS_BIO_NEED flags defined below; setting/clearing is done
- * by and/or.
- * Used in numdirtywakeup(), bufspace_wakeup(), bwillwrite(),
- * getnewbuf(), and getblk().
- */
-static volatile int needsbuffer;
-
 /*
  * Synchronization for bwillwrite() waiters.
  */
@@ -323,29 +292,69 @@ static int bdirtywait;
 #define QUEUE_EMPTY	1	/* empty buffer headers */
 #define QUEUE_DIRTY	2	/* B_DELWRI buffers */
 #define QUEUE_CLEAN	3	/* non-B_DELWRI buffers */
-#define QUEUE_SENTINEL	1024	/* not an queue index, but mark for sentinel */
+#define QUEUE_SENTINEL	4	/* not an queue index, but mark for sentinel */
 
-/* Maximum number of clean buffer queues. */
-#define	CLEAN_QUEUES	16
+struct bufqueue {
+	struct mtx_padalign	bq_lock;
+	TAILQ_HEAD(, buf)	bq_queue;
+	uint8_t			bq_index;
+	uint16_t		bq_subqueue;
+	int			bq_len;
+} __aligned(CACHE_LINE_SIZE);
 
+#define	BQ_LOCKPTR(bq)		(&(bq)->bq_lock)
+#define	BQ_LOCK(bq)		mtx_lock(BQ_LOCKPTR((bq)))
+#define	BQ_UNLOCK(bq)		mtx_unlock(BQ_LOCKPTR((bq)))
+#define	BQ_ASSERT_LOCKED(bq)	mtx_assert(BQ_LOCKPTR((bq)), MA_OWNED)
+
+struct bufqueue __exclusive_cache_line bqempty;
+struct bufqueue __exclusive_cache_line bqdirty;
+
+struct bufdomain {
+	struct bufqueue	bd_subq[MAXCPU + 1]; /* Per-cpu sub queues + global */
+	struct bufqueue	*bd_cleanq;
+	struct mtx_padalign bd_run_lock;
+	/* Constants */
+	long		bd_maxbufspace;
+	long		bd_hibufspace;
+	long 		bd_lobufspace;
+	long 		bd_bufspacethresh;
+	int		bd_hifreebuffers;
+	int		bd_lofreebuffers;
+	int		bd_lim;
+	/* atomics */
+	int		bd_wanted;
+	int  __aligned(CACHE_LINE_SIZE)	bd_running;
+	long __aligned(CACHE_LINE_SIZE) bd_bufspace;
+	int __aligned(CACHE_LINE_SIZE)	bd_freebuffers;
+} __aligned(CACHE_LINE_SIZE);
+
+#define	BD_LOCKPTR(bd)		(&(bd)->bd_cleanq->bq_lock)
+#define	BD_LOCK(bd)		mtx_lock(BD_LOCKPTR((bd)))
+#define	BD_UNLOCK(bd)		mtx_unlock(BD_LOCKPTR((bd)))
+#define	BD_ASSERT_LOCKED(bd)	mtx_assert(BD_LOCKPTR((bd)), MA_OWNED)
+#define	BD_RUN_LOCKPTR(bd)	(&(bd)->bd_run_lock)
+#define	BD_RUN_LOCK(bd)		mtx_lock(BD_RUN_LOCKPTR((bd)))
+#define	BD_RUN_UNLOCK(bd)	mtx_unlock(BD_RUN_LOCKPTR((bd)))
+#define	BD_DOMAIN(bd)		(bd - bdclean)
+
+/* Maximum number of clean buffer domains. */
+#define	CLEAN_DOMAINS	8
+
 /* Configured number of clean queues. */
-static int clean_queues;
+static int __read_mostly clean_domains;
 
-/* Maximum number of buffer queues. */
-#define BUFFER_QUEUES	(QUEUE_CLEAN + CLEAN_QUEUES)
+struct bufdomain __exclusive_cache_line bdclean[CLEAN_DOMAINS];
 
-/* Queues for free buffers with various properties */
-static TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES] = { { 0 } };
-#ifdef INVARIANTS
-static int bq_len[BUFFER_QUEUES];
-#endif
+static void bq_remove(struct bufqueue *bq, struct buf *bp);
+static void bq_insert(struct bufqueue *bq, struct buf *bp, bool unlock);
+static int buf_recycle(struct bufdomain *, bool kva);
+static void bq_init(struct bufqueue *bq, int qindex, int cpu,
+	    const char *lockname);
+static void bd_init(struct bufdomain *bd);
+static int bd_flushall(struct bufdomain *bd);
 
 /*
- * Lock for each bufqueue
- */
-static struct mtx_padalign __exclusive_cache_line bqlocks[BUFFER_QUEUES];
-
-/*
  * per-cpu empty buffer cache.
  */
 uma_zone_t buf_zone;
@@ -391,46 +400,34 @@ sysctl_bufspace(SYSCTL_HANDLER_ARGS)
 {
 	long lvalue;
 	int ivalue;
+	int i;
 
+	lvalue = 0;
+	for (i = 0; i < clean_domains; i++)
+		lvalue += bdclean[i].bd_bufspace;
 	if (sizeof(int) == sizeof(long) || req->oldlen >= sizeof(long))
-		return (sysctl_handle_long(oidp, arg1, arg2, req));
-	lvalue = *(long *)arg1;
+		return (sysctl_handle_long(oidp, &lvalue, 0, req));
 	if (lvalue > INT_MAX)
 		/* On overflow, still write out a long to trigger ENOMEM. */
 		return (sysctl_handle_long(oidp, &lvalue, 0, req));
 	ivalue = lvalue;
 	return (sysctl_handle_int(oidp, &ivalue, 0, req));
 }
-#endif
-
+#else
 static int
-bqcleanq(void)
+sysctl_bufspace(SYSCTL_HANDLER_ARGS)
 {
-	static int nextq;
+	long lvalue;
+	int i;
 
-	return ((atomic_fetchadd_int(&nextq, 1) % clean_queues) + QUEUE_CLEAN);
+	lvalue = 0;
+	for (i = 0; i < clean_domains; i++)
+		lvalue += bdclean[i].bd_bufspace;
+	return (sysctl_handle_int(oidp, &lvalue, 0, req));
 }
+#endif
 
-static int
-bqisclean(int qindex)
-{
-
-	return (qindex >= QUEUE_CLEAN && qindex < QUEUE_CLEAN + CLEAN_QUEUES);
-}
-
 /*
- *	bqlock:
- *
- *	Return the appropriate queue lock based on the index.
- */
-static inline struct mtx *
-bqlock(int qindex)
-{
-
-	return (struct mtx *)&bqlocks[qindex];
-}
-
-/*
  *	bdirtywakeup:
  *
  *	Wakeup any bwillwrite() waiters.
@@ -481,47 +478,50 @@ bdirtyadd(void)
 }
 
 /*
- *	bufspace_wakeup:
+ *	bufspace_daemon_wakeup:
  *
- *	Called when buffer space is potentially available for recovery.
- *	getnewbuf() will block on this flag when it is unable to free 
- *	sufficient buffer space.  Buffer space becomes recoverable when 
- *	bp's get placed back in the queues.
+ *	Wakeup the daemons responsible for freeing clean bufs.
  */
 static void
-bufspace_wakeup(void)
+bufspace_daemon_wakeup(struct bufdomain *bd)
 {
 
 	/*
-	 * If someone is waiting for bufspace, wake them up.
-	 *
-	 * Since needsbuffer is set prior to doing an additional queue
-	 * scan it is safe to check for the flag prior to acquiring the
-	 * lock.  The thread that is preparing to scan again before
-	 * blocking would discover the buf we released.
+	 * avoid the lock if the daemon is running.
 	 */
-	if (needsbuffer) {
-		rw_rlock(&nblock);
-		if (atomic_cmpset_int(&needsbuffer, 1, 0) == 1)
-			wakeup(__DEVOLATILE(void *, &needsbuffer));
-		rw_runlock(&nblock);
+	if (atomic_fetchadd_int(&bd->bd_running, 1) == 0) {
+		BD_RUN_LOCK(bd);
+		atomic_store_int(&bd->bd_running, 1);
+		wakeup(&bd->bd_running);
+		BD_RUN_UNLOCK(bd);
 	}
 }
 
 /*
- *	bufspace_daemonwakeup:
+ *	bufspace_daemon_wait:
  *
- *	Wakeup the daemon responsible for freeing clean bufs.
+ *	Sleep until the domain falls below a limit or one second passes.
  */
 static void
-bufspace_daemonwakeup(void)
+bufspace_daemon_wait(struct bufdomain *bd)
 {
-	rw_rlock(&nblock);
-	if (bufspace_request == 0) {
-		bufspace_request = 1;
-		wakeup(&bufspace_request);
+	/*
+	 * Re-check our limits and sleep.  bd_running must be
+	 * cleared prior to checking the limits to avoid missed
+	 * wakeups.  The waker will adjust one of bufspace or
+	 * freebuffers prior to checking bd_running.
+	 */
+	BD_RUN_LOCK(bd);
+	atomic_store_int(&bd->bd_running, 0);
+	if (bd->bd_bufspace < bd->bd_bufspacethresh &&
+	    bd->bd_freebuffers > bd->bd_lofreebuffers) {
+		msleep(&bd->bd_running, BD_RUN_LOCKPTR(bd), PRIBIO|PDROP,
+		    "-", hz);
+	} else {
+		/* Avoid spurious wakeups while running. */
+		atomic_store_int(&bd->bd_running, 1);
+		BD_RUN_UNLOCK(bd);
 	}
-	rw_runlock(&nblock);
 }
 
 /*
@@ -533,20 +533,22 @@ bufspace_daemonwakeup(void)
 static void
 bufspace_adjust(struct buf *bp, int bufsize)
 {
+	struct bufdomain *bd;
 	long space;
 	int diff;
 
 	KASSERT((bp->b_flags & B_MALLOC) == 0,
 	    ("bufspace_adjust: malloc buf %p", bp));
+	bd = &bdclean[bp->b_domain];
 	diff = bufsize - bp->b_bufsize;
 	if (diff < 0) {
-		atomic_subtract_long(&bufspace, -diff);
-		bufspace_wakeup();
+		atomic_subtract_long(&bd->bd_bufspace, -diff);
 	} else {
-		space = atomic_fetchadd_long(&bufspace, diff);
+		space = atomic_fetchadd_long(&bd->bd_bufspace, diff);
 		/* Wake up the daemon on the transition. */
-		if (space < bufspacethresh && space + diff >= bufspacethresh)
-			bufspace_daemonwakeup();
+		if (space < bd->bd_bufspacethresh &&
+		    space + diff >= bd->bd_bufspacethresh)
+			bufspace_daemon_wakeup(bd);
 	}
 	bp->b_bufsize = bufsize;
 }
@@ -558,24 +560,25 @@ bufspace_adjust(struct buf *bp, int bufsize)
  *	different space limit than data.
  */
 static int
-bufspace_reserve(int size, bool metadata)
+bufspace_reserve(struct bufdomain *bd, int size, bool metadata)
 {
-	long limit;
+	long limit, new;
 	long space;
 
 	if (metadata)
-		limit = maxbufspace;
+		limit = bd->bd_maxbufspace;
 	else
-		limit = hibufspace;
-	do {
-		space = bufspace;
-		if (space + size > limit)
-			return (ENOSPC);
-	} while (atomic_cmpset_long(&bufspace, space, space + size) == 0);
+		limit = bd->bd_hibufspace;
+	space = atomic_fetchadd_long(&bd->bd_bufspace, size);
+	new = space + size;
+	if (new > limit) {
+		atomic_subtract_long(&bd->bd_bufspace, size);
+		return (ENOSPC);
+	}
 
 	/* Wake up the daemon on the transition. */
-	if (space < bufspacethresh && space + size >= bufspacethresh)
-		bufspace_daemonwakeup();
+	if (space < bd->bd_bufspacethresh && new >= bd->bd_bufspacethresh)
+		bufspace_daemon_wakeup(bd);
 
 	return (0);
 }
@@ -586,21 +589,22 @@ bufspace_reserve(int size, bool metadata)
  *	Release reserved bufspace after bufspace_adjust() has consumed it.
  */
 static void
-bufspace_release(int size)
+bufspace_release(struct bufdomain *bd, int size)
 {
-	atomic_subtract_long(&bufspace, size);
-	bufspace_wakeup();
+
+	atomic_subtract_long(&bd->bd_bufspace, size);
 }
 
 /*
  *	bufspace_wait:
  *
  *	Wait for bufspace, acting as the buf daemon if a locked vnode is
- *	supplied.  needsbuffer must be set in a safe fashion prior to
- *	polling for space.  The operation must be re-tried on return.
+ *	supplied.  bd_wanted must be set prior to polling for space.  The
+ *	operation must be re-tried on return.
  */
 static void
-bufspace_wait(struct vnode *vp, int gbflags, int slpflag, int slptimeo)
+bufspace_wait(struct bufdomain *bd, struct vnode *vp, int gbflags,
+    int slpflag, int slptimeo)
 {
 	struct thread *td;
 	int error, fl, norunbuf;
@@ -609,11 +613,11 @@ bufspace_wait(struct vnode *vp, int gbflags, int slpfl
 		return;
 
 	td = curthread;
-	rw_wlock(&nblock);
-	while (needsbuffer != 0) {
+	BD_LOCK(bd);
+	while (bd->bd_wanted) {
 		if (vp != NULL && vp->v_type != VCHR &&
 		    (td->td_pflags & TDP_BUFNEED) == 0) {
-			rw_wunlock(&nblock);
+			BD_UNLOCK(bd);
 			/*
 			 * getblk() is called with a vnode locked, and
 			 * some majority of the dirty buffers may as
@@ -636,18 +640,18 @@ bufspace_wait(struct vnode *vp, int gbflags, int slpfl
 			td->td_pflags |= TDP_BUFNEED | TDP_NORUNNINGBUF;
 			fl = buf_flush(vp, flushbufqtarget);
 			td->td_pflags &= norunbuf;
-			rw_wlock(&nblock);
+			BD_LOCK(bd);
 			if (fl != 0)
 				continue;
-			if (needsbuffer == 0)
+			if (bd->bd_wanted == 0)
 				break;
 		}
-		error = rw_sleep(__DEVOLATILE(void *, &needsbuffer), &nblock,
+		error = msleep(&bd->bd_wanted, BD_LOCKPTR(bd),
 		    (PRIBIO + 4) | slpflag, "newbuf", slptimeo);
 		if (error != 0)
 			break;
 	}
-	rw_wunlock(&nblock);
+	BD_UNLOCK(bd);
 }
 
 
@@ -659,10 +663,13 @@ bufspace_wait(struct vnode *vp, int gbflags, int slpfl
  *	block nor work to reclaim buffers.
  */
 static void
-bufspace_daemon(void)
+bufspace_daemon(void *arg)
 {
+	struct bufdomain *bd;
+
+	bd = arg;
 	for (;;) {
-		kproc_suspend_check(bufspacedaemonproc);
+		kproc_suspend_check(curproc);
 
 		/*
 		 * Free buffers from the clean queue until we meet our
@@ -689,46 +696,25 @@ bufspace_daemon(void)
 		 *	which will inefficiently trade bufs with bqrelse
 		 *	until we return to condition 2.
 		 */
-		while (bufspace > lobufspace ||
-		    numfreebuffers < hifreebuffers) {
-			if (buf_recycle(false) != 0) {
-				atomic_set_int(&needsbuffer, 1);
-				if (buf_recycle(false) != 0) {
-					rw_wlock(&nblock);
-					if (needsbuffer)
-						rw_sleep(__DEVOLATILE(void *,
-						    &needsbuffer), &nblock,
-						    PRIBIO|PDROP, "bufspace",
-						    hz/10);
-					else
-						rw_wunlock(&nblock);
-				}
+		do {
+			if (buf_recycle(bd, false) != 0) {
+				if (bd_flushall(bd))
+					continue;
+				BD_LOCK(bd);
+				if (bd->bd_wanted) {
+					msleep(&bd->bd_wanted, BD_LOCKPTR(bd),
+					    PRIBIO|PDROP, "bufspace", hz/10);
+				} else
+					BD_UNLOCK(bd);
 			}
 			maybe_yield();
-		}
+		} while (bd->bd_bufspace > bd->bd_lobufspace ||
+		    bd->bd_freebuffers < bd->bd_hifreebuffers);
 
-		/*
-		 * Re-check our limits under the exclusive nblock.
-		 */
-		rw_wlock(&nblock);
-		if (bufspace < bufspacethresh &&
-		    numfreebuffers > lofreebuffers) {
-			bufspace_request = 0;
-			rw_sleep(&bufspace_request, &nblock, PRIBIO|PDROP,
-			    "-", hz);
-		} else
-			rw_wunlock(&nblock);
+		bufspace_daemon_wait(bd);
 	}
 }
 
-static struct kproc_desc bufspace_kp = {
-	"bufspacedaemon",
-	bufspace_daemon,
-	&bufspacedaemonproc
-};
-SYSINIT(bufspacedaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start,
-    &bufspace_kp);
-
 /*
  *	bufmallocadjust:
  *
@@ -842,7 +828,7 @@ vfs_buf_test_cache(struct buf *bp, vm_ooffset_t foff, 
 }
 
 /* Wake up the buffer daemon if necessary */
-static __inline void
+static void
 bd_wakeup(void)
 {
 
@@ -1038,19 +1024,12 @@ bufinit(void)
 	KASSERT(maxbcachebuf >= MAXBSIZE,
 	    ("maxbcachebuf (%d) must be >= MAXBSIZE (%d)\n", maxbcachebuf,
 	    MAXBSIZE));
-	mtx_init(&bqlocks[QUEUE_DIRTY], "bufq dirty lock", NULL, MTX_DEF);
-	mtx_init(&bqlocks[QUEUE_EMPTY], "bufq empty lock", NULL, MTX_DEF);
-	for (i = QUEUE_CLEAN; i < QUEUE_CLEAN + CLEAN_QUEUES; i++)
-		mtx_init(&bqlocks[i], "bufq clean lock", NULL, MTX_DEF);
+	bq_init(&bqempty, QUEUE_EMPTY, -1, "bufq empty lock");
+	bq_init(&bqdirty, QUEUE_DIRTY, -1, "bufq dirty lock");
 	mtx_init(&rbreqlock, "runningbufspace lock", NULL, MTX_DEF);
-	rw_init(&nblock, "needsbuffer lock");
 	mtx_init(&bdlock, "buffer daemon lock", NULL, MTX_DEF);
 	mtx_init(&bdirtylock, "dirty buf lock", NULL, MTX_DEF);
 
-	/* next, make a null set of free lists */
-	for (i = 0; i < BUFFER_QUEUES; i++)
-		TAILQ_INIT(&bufqueues[i]);
-
 	unmapped_buf = (caddr_t)kva_alloc(MAXPHYS);
 
 	/* finally, initialize each buffer header and stick on empty q */
@@ -1060,15 +1039,14 @@ bufinit(void)
 		bp->b_flags = B_INVAL;
 		bp->b_rcred = NOCRED;
 		bp->b_wcred = NOCRED;
-		bp->b_qindex = QUEUE_EMPTY;
+		bp->b_qindex = QUEUE_NONE;
+		bp->b_domain = -1;
+		bp->b_subqueue = mp_ncpus;
 		bp->b_xflags = 0;
 		bp->b_data = bp->b_kvabase = unmapped_buf;
 		LIST_INIT(&bp->b_dep);
 		BUF_LOCKINIT(bp);
-		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
-#ifdef INVARIANTS
-		bq_len[QUEUE_EMPTY]++;
-#endif
+		bq_insert(&bqempty, bp, false);
 	}
 
 	/*
@@ -1150,8 +1128,31 @@ bufinit(void)
 	 * One queue per-256mb up to the max.  More queues gives better
 	 * concurrency but less accurate LRU.
 	 */
-	clean_queues = MIN(howmany(maxbufspace, 256*1024*1024), CLEAN_QUEUES);
+	clean_domains = MIN(howmany(maxbufspace, 256*1024*1024), CLEAN_DOMAINS);
+	for (i = 0 ; i < clean_domains; i++) {
+		struct bufdomain *bd;
 
+		bd = &bdclean[i];
+		bd_init(bd);
+		bd->bd_freebuffers = nbuf / clean_domains;
+		bd->bd_hifreebuffers = hifreebuffers / clean_domains;
+		bd->bd_lofreebuffers = lofreebuffers / clean_domains;
+		bd->bd_bufspace = 0;
+		bd->bd_maxbufspace = maxbufspace / clean_domains;
+		bd->bd_hibufspace = hibufspace / clean_domains;
+		bd->bd_lobufspace = lobufspace / clean_domains;
+		bd->bd_bufspacethresh = bufspacethresh / clean_domains;
+		/* Don't allow more than 2% of bufs in the per-cpu caches. */
+		bd->bd_lim = nbuf / clean_domains / 50 / mp_ncpus;
+	}
+	getnewbufcalls = counter_u64_alloc(M_WAITOK);
+	getnewbufrestarts = counter_u64_alloc(M_WAITOK);
+	mappingrestarts = counter_u64_alloc(M_WAITOK);
+	numbufallocfails = counter_u64_alloc(M_WAITOK);
+	notbufdflushes = counter_u64_alloc(M_WAITOK);
+	buffreekvacnt = counter_u64_alloc(M_WAITOK);
+	bufdefragcnt = counter_u64_alloc(M_WAITOK);
+	bufkvaspace = counter_u64_alloc(M_WAITOK);
 }
 
 #ifdef INVARIANTS
@@ -1326,58 +1327,92 @@ bpmap_qenter(struct buf *bp)
 	    (vm_offset_t)(bp->b_offset & PAGE_MASK));
 }
 
+static struct bufqueue *
+bufqueue(struct buf *bp)
+{
+
+	switch (bp->b_qindex) {
+	case QUEUE_NONE:
+		/* FALLTHROUGH */
+	case QUEUE_SENTINEL:
+		return (NULL);
+	case QUEUE_EMPTY:
+		return (&bqempty);
+	case QUEUE_DIRTY:
+		return (&bqdirty);
+	case QUEUE_CLEAN:
+		return (&bdclean[bp->b_domain].bd_subq[bp->b_subqueue]);
+	default:
+		break;
+	}
+	panic("bufqueue(%p): Unhandled type %d\n", bp, bp->b_qindex);
+}
+
 /*
+ * Return the locked bufqueue that bp is a member of.
+ */
+static struct bufqueue *
+bufqueue_acquire(struct buf *bp)
+{
+	struct bufqueue *bq, *nbq;
+
+	/*
+	 * bp can be pushed from a per-cpu queue to the
+	 * cleanq while we're waiting on the lock.  Retry
+	 * if the queues don't match.
+	 */
+	bq = bufqueue(bp);
+	BQ_LOCK(bq);
+	for (;;) {
+		nbq = bufqueue(bp);
+		if (bq == nbq)
+			break;
+		BQ_UNLOCK(bq);
+		BQ_LOCK(nbq);
+		bq = nbq;
+	}
+	return (bq);
+}
+
+/*
  *	binsfree:
  *
- *	Insert the buffer into the appropriate free list.
+ *	Insert the buffer into the appropriate free list.  Requires a
+ *	locked buffer on entry and buffer is unlocked before return.
  */
 static void
 binsfree(struct buf *bp, int qindex)
 {
-	struct mtx *olock, *nlock;
+	struct bufdomain *bd;
+	struct bufqueue *bq;
 
-	if (qindex != QUEUE_EMPTY) {
-		BUF_ASSERT_XLOCKED(bp);
-	}
+	KASSERT(qindex == QUEUE_CLEAN || qindex == QUEUE_DIRTY,
+	    ("binsfree: Invalid qindex %d", qindex));
+	BUF_ASSERT_XLOCKED(bp);
 
 	/*
-	 * Stick to the same clean queue for the lifetime of the buf to
-	 * limit locking below.  Otherwise pick ont sequentially.
-	 */
-	if (qindex == QUEUE_CLEAN) {
-		if (bqisclean(bp->b_qindex))
-			qindex = bp->b_qindex;
-		else
-			qindex = bqcleanq();
-	}
-
-	/*
 	 * Handle delayed bremfree() processing.
 	 */
-	nlock = bqlock(qindex);
 	if (bp->b_flags & B_REMFREE) {
-		olock = bqlock(bp->b_qindex);
-		mtx_lock(olock);
-		bremfreel(bp);
-		if (olock != nlock) {
-			mtx_unlock(olock);
-			mtx_lock(nlock);
+		if (bp->b_qindex == qindex) {
+			bp->b_flags |= B_REUSE;
+			bp->b_flags &= ~B_REMFREE;
+			BUF_UNLOCK(bp);
+			return;
 		}
+		bq = bufqueue_acquire(bp);
+		bq_remove(bq, bp);
+		BQ_UNLOCK(bq);
+	}
+	if (qindex == QUEUE_CLEAN) {
+		bd = &bdclean[bp->b_domain];
+		if (bd->bd_lim != 0)
+			bq = &bd->bd_subq[PCPU_GET(cpuid)];
+		else
+			bq = bd->bd_cleanq;
 	} else
-		mtx_lock(nlock);
-
-	if (bp->b_qindex != QUEUE_NONE)
-		panic("binsfree: free buffer onto another queue???");
-
-	bp->b_qindex = qindex;
-	if (bp->b_flags & B_AGE)
-		TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist);
-	else
-		TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp, b_freelist);
-#ifdef INVARIANTS
-	bq_len[bp->b_qindex]++;
-#endif
-	mtx_unlock(nlock);
+		bq = &bqdirty;
+	bq_insert(bq, bp, true);
 }
 
 /*
@@ -1404,10 +1439,9 @@ buf_free(struct buf *bp)
 	if (!LIST_EMPTY(&bp->b_dep))
 		buf_deallocate(bp);
 	bufkva_free(bp);
+	atomic_add_int(&bdclean[bp->b_domain].bd_freebuffers, 1);
 	BUF_UNLOCK(bp);
 	uma_zfree(buf_zone, bp);
-	atomic_add_int(&numfreebuffers, 1);
-	bufspace_wakeup();
 }
 
 /*
@@ -1424,15 +1458,15 @@ buf_import(void *arg, void **store, int cnt, int domai
 	struct buf *bp;
 	int i;
 
-	mtx_lock(&bqlocks[QUEUE_EMPTY]);
+	BQ_LOCK(&bqempty);
 	for (i = 0; i < cnt; i++) {
-		bp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]);
+		bp = TAILQ_FIRST(&bqempty.bq_queue);
 		if (bp == NULL)
 			break;
-		bremfreel(bp);
+		bq_remove(&bqempty, bp);
 		store[i] = bp;
 	}
-	mtx_unlock(&bqlocks[QUEUE_EMPTY]);
+	BQ_UNLOCK(&bqempty);
 
 	return (i);
 }
@@ -1445,10 +1479,21 @@ buf_import(void *arg, void **store, int cnt, int domai
 static void
 buf_release(void *arg, void **store, int cnt)
 {
+	struct bufqueue *bq;
+	struct buf *bp;
         int i;
 
-        for (i = 0; i < cnt; i++)
-		binsfree(store[i], QUEUE_EMPTY);
+	bq = &bqempty;
+	BQ_LOCK(bq);
+        for (i = 0; i < cnt; i++) {
+		bp = store[i];
+		/* Inline bq_insert() to batch locking. */
+		TAILQ_INSERT_TAIL(&bq->bq_queue, bp, b_freelist);
+		bp->b_flags &= ~(B_AGE | B_REUSE);
+		bq->bq_len++;
+		bp->b_qindex = bq->bq_index;
+	}
+	BQ_UNLOCK(bq);
 }
 
 /*
@@ -1457,22 +1502,31 @@ buf_release(void *arg, void **store, int cnt)
  *	Allocate an empty buffer header.
  */
 static struct buf *
-buf_alloc(void)
+buf_alloc(struct bufdomain *bd)
 {
 	struct buf *bp;
+	int freebufs;
 
-	bp = uma_zalloc(buf_zone, M_NOWAIT);
+	/*
+	 * We can only run out of bufs in the buf zone if the average buf
+	 * is less than BKVASIZE.  In this case the actual wait/block will
+	 * come from buf_reycle() failing to flush one of these small bufs.
+	 * come from buf_recycle() failing to flush one of these small bufs.
+	bp = NULL;
+	freebufs = atomic_fetchadd_int(&bd->bd_freebuffers, -1);
+	if (freebufs > 0)
+		bp = uma_zalloc(buf_zone, M_NOWAIT);
 	if (bp == NULL) {
-		bufspace_daemonwakeup();
-		atomic_add_int(&numbufallocfails, 1);
+		atomic_fetchadd_int(&bd->bd_freebuffers, 1);
+		bufspace_daemon_wakeup(bd);
+		counter_u64_add(numbufallocfails, 1);
 		return (NULL);
 	}
-
 	/*
-	 * Wake-up the bufspace daemon on transition.
+	 * Wake-up the bufspace daemon on transition below threshold.
 	 */
-	if (atomic_fetchadd_int(&numfreebuffers, -1) == lofreebuffers)
-		bufspace_daemonwakeup();
+	if (freebufs == bd->bd_lofreebuffers)
+		bufspace_daemon_wakeup(bd);
 
 	if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
 		panic("getnewbuf_empty: Locked buf %p on free queue.", bp);
@@ -1488,6 +1542,7 @@ buf_alloc(void)
 	KASSERT(bp->b_kvasize == 0, ("bp: %p still has kva\n", bp));
 	KASSERT(bp->b_bufsize == 0, ("bp: %p still has bufspace\n", bp));
 
+	bp->b_domain = BD_DOMAIN(bd);
 	bp->b_flags = 0;
 	bp->b_ioflags = 0;
 	bp->b_xflags = 0;
@@ -1512,22 +1567,26 @@ buf_alloc(void)
 }
 
 /*
- *	buf_qrecycle:
+ *	buf_recycle:
  *
  *	Free a buffer from the given bufqueue.  kva controls whether the
  *	freed buf must own some kva resources.  This is used for
  *	defragmenting.
  */
 static int
-buf_qrecycle(int qindex, bool kva)
+buf_recycle(struct bufdomain *bd, bool kva)
 {
+	struct bufqueue *bq;
 	struct buf *bp, *nbp;
 
 	if (kva)
-		atomic_add_int(&bufdefragcnt, 1);
+		counter_u64_add(bufdefragcnt, 1);
 	nbp = NULL;
-	mtx_lock(&bqlocks[qindex]);
-	nbp = TAILQ_FIRST(&bufqueues[qindex]);
+	bq = bd->bd_cleanq;
+	BQ_LOCK(bq);
+	KASSERT(BQ_LOCKPTR(bq) == BD_LOCKPTR(bd),
+	    ("buf_recycle: Locks don't match"));
+	nbp = TAILQ_FIRST(&bq->bq_queue);
 
 	/*
 	 * Run scan, possibly freeing data and/or kva mappings on the fly
@@ -1551,6 +1610,18 @@ buf_qrecycle(int qindex, bool kva)
 			continue;
 
 		/*
+		 * Implement a second chance algorithm for frequently
+		 * accessed buffers.
+		 */
+		if ((bp->b_flags & B_REUSE) != 0) {
+			TAILQ_REMOVE(&bq->bq_queue, bp, b_freelist);
+			TAILQ_INSERT_TAIL(&bq->bq_queue, bp, b_freelist);
+			bp->b_flags &= ~B_REUSE;
+			BUF_UNLOCK(bp);
+			continue;
+		}
+
+		/*
 		 * Skip buffers with background writes in progress.
 		 */
 		if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
@@ -1558,14 +1629,18 @@ buf_qrecycle(int qindex, bool kva)
 			continue;
 		}
 
-		KASSERT(bp->b_qindex == qindex,
-		    ("getnewbuf: inconsistent queue %d bp %p", qindex, bp));
+		KASSERT(bp->b_qindex == QUEUE_CLEAN,
+		    ("buf_recycle: inconsistent queue %d bp %p",
+		    bp->b_qindex, bp));
+		KASSERT(bp->b_domain == BD_DOMAIN(bd),
+		    ("getnewbuf: queue domain %d doesn't match request %d",
+		    bp->b_domain, (int)BD_DOMAIN(bd)));
 		/*
 		 * NOTE:  nbp is now entirely invalid.  We can only restart
 		 * the scan from this point on.
 		 */
-		bremfreel(bp);
-		mtx_unlock(&bqlocks[qindex]);
+		bq_remove(bq, bp);
+		BQ_UNLOCK(bq);
 
 		/*
 		 * Requeue the background write buffer with error and
@@ -1573,70 +1648,21 @@ buf_qrecycle(int qindex, bool kva)
 		 */
 		if ((bp->b_vflags & BV_BKGRDERR) != 0) {
 			bqrelse(bp);
-			mtx_lock(&bqlocks[qindex]);
-			nbp = TAILQ_FIRST(&bufqueues[qindex]);
+			BQ_LOCK(bq);
+			nbp = TAILQ_FIRST(&bq->bq_queue);
 			continue;
 		}
 		bp->b_flags |= B_INVAL;
 		brelse(bp);
 		return (0);
 	}
-	mtx_unlock(&bqlocks[qindex]);
+	bd->bd_wanted = 1;
+	BQ_UNLOCK(bq);
 
 	return (ENOBUFS);
 }

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***


