Date:      Tue, 13 Mar 2018 18:33:51 +0000 (UTC)
From:      Mark Johnston <markj@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-user@freebsd.org
Subject:   svn commit: r330873 - in user/markj/vm-playground/sys: kern sys vm
Message-ID:  <201803131833.w2DIXpWU071061@repo.freebsd.org>

Author: markj
Date: Tue Mar 13 18:33:50 2018
New Revision: 330873
URL: https://svnweb.freebsd.org/changeset/base/330873

Log:
  Merge from user/jeff/numa at r330828.

Modified:
  user/markj/vm-playground/sys/kern/kern_cpuset.c
  user/markj/vm-playground/sys/kern/vfs_bio.c
  user/markj/vm-playground/sys/sys/_bitset.h
  user/markj/vm-playground/sys/sys/domainset.h
  user/markj/vm-playground/sys/sys/proc.h
  user/markj/vm-playground/sys/vm/vm_domainset.c
  user/markj/vm-playground/sys/vm/vm_page.c
  user/markj/vm-playground/sys/vm/vm_pageout.c
  user/markj/vm-playground/sys/vm/vm_phys.h
  user/markj/vm-playground/sys/vm/vm_reserv.c
  user/markj/vm-playground/sys/vm/vnode_pager.c
Directory Properties:
  user/markj/vm-playground/   (props changed)

Modified: user/markj/vm-playground/sys/kern/kern_cpuset.c
==============================================================================
--- user/markj/vm-playground/sys/kern/kern_cpuset.c	Tue Mar 13 18:30:26 2018	(r330872)
+++ user/markj/vm-playground/sys/kern/kern_cpuset.c	Tue Mar 13 18:33:50 2018	(r330873)
@@ -37,6 +37,8 @@ __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
+#include <sys/sysctl.h>
+#include <sys/ctype.h>
 #include <sys/sysproto.h>
 #include <sys/jail.h>
 #include <sys/kernel.h>
@@ -112,6 +114,9 @@ __FBSDID("$FreeBSD$");
  * meaning 'curthread'.  It may query available cpus for that tid with a
  * getaffinity call using (CPU_LEVEL_CPUSET, CPU_WHICH_PID, -1, ...).
  */
+
+LIST_HEAD(domainlist, domainset);
+
 static uma_zone_t cpuset_zone;
 static uma_zone_t domainset_zone;
 static struct mtx cpuset_lock;
@@ -119,6 +124,7 @@ static struct setlist cpuset_ids;
 static struct domainlist cpuset_domains;
 static struct unrhdr *cpuset_unr;
 static struct cpuset *cpuset_zero, *cpuset_default, *cpuset_kernel;
+static struct domainset domainset0, domainset2;
 
 /* Return the size of cpuset_t at the kernel level */
 SYSCTL_INT(_kern_sched, OID_AUTO, cpusetsize, CTLFLAG_RD | CTLFLAG_CAPRD,
@@ -477,11 +483,24 @@ _domainset_create(struct domainset *domain, struct dom
 /*
  * Create or lookup a domainset based on the key held in 'domain'.
  */
-static struct domainset *
+struct domainset *
 domainset_create(const struct domainset *domain)
 {
 	struct domainset *ndomain;
 
+	/*
+	 * Validate the policy.  It must specify a usable policy number
+	 * with only valid domains.  A PREFER policy must include the
+	 * preferred domain in the mask.
+	 */
+	if (domain->ds_policy <= DOMAINSET_POLICY_INVALID ||
+	    domain->ds_policy > DOMAINSET_POLICY_MAX)
+		return (NULL);
+	if (domain->ds_policy == DOMAINSET_POLICY_PREFER &&
+	    !DOMAINSET_ISSET(domain->ds_prefer, &domain->ds_mask))
+		return (NULL);
+	if (!DOMAINSET_SUBSET(&domainset0.ds_mask, &domain->ds_mask))
+		return (NULL);
 	ndomain = uma_zalloc(domainset_zone, M_WAITOK | M_ZERO);
 	domainset_copy(domain, ndomain);
 	return _domainset_create(ndomain, NULL);
@@ -1132,6 +1151,55 @@ out:
 	return (error);
 }
 
+static int
+bitset_strprint(char *buf, size_t bufsiz, const struct bitset *set, int setlen)
+{
+	size_t bytes;
+	int i, once;
+	char *p;
+
+	once = 0;
+	p = buf;
+	for (i = 0; i < __bitset_words(setlen); i++) {
+		if (once != 0) {
+			if (bufsiz < 1)
+				return (0);
+			*p = ',';
+			p++;
+			bufsiz--;
+		} else
+			once = 1;
+		if (bufsiz < sizeof(__STRING(ULONG_MAX)))
+			return (0);
+		bytes = snprintf(p, bufsiz, "%lx", set->__bits[i]);
+		p += bytes;
+		bufsiz -= bytes;
+	}
+	return (p - buf);
+}
+
+static int
+bitset_strscan(struct bitset *set, int setlen, const char *buf)
+{
+	int i, ret;
+	const char *p;
+
+	BIT_ZERO(setlen, set);
+	p = buf;
+	for (i = 0; i < __bitset_words(setlen); i++) {
+		if (*p == ',') {
+			p++;
+			continue;
+		}
+		ret = sscanf(p, "%lx", &set->__bits[i]);
+		if (ret == 0 || ret == -1)
+			break;
+		while (isxdigit(*p))
+			p++;
+	}
+	return (p - buf);
+}
+
 /*
  * Return a string representing a valid layout for a cpuset_t object.
  * It expects an incoming buffer at least sized as CPUSETBUFSIZ.
@@ -1139,19 +1207,9 @@ out:
 char *
 cpusetobj_strprint(char *buf, const cpuset_t *set)
 {
-	char *tbuf;
-	size_t i, bytesp, bufsiz;
 
-	tbuf = buf;
-	bytesp = 0;
-	bufsiz = CPUSETBUFSIZ;
-
-	for (i = 0; i < (_NCPUWORDS - 1); i++) {
-		bytesp = snprintf(tbuf, bufsiz, "%lx,", set->__bits[i]);
-		bufsiz -= bytesp;
-		tbuf += bytesp;
-	}
-	snprintf(tbuf, bufsiz, "%lx", set->__bits[_NCPUWORDS - 1]);
+	bitset_strprint(buf, CPUSETBUFSIZ, (const struct bitset *)set,
+	    CPU_SETSIZE);
 	return (buf);
 }
 
@@ -1162,37 +1220,71 @@ cpusetobj_strprint(char *buf, const cpuset_t *set)
 int
 cpusetobj_strscan(cpuset_t *set, const char *buf)
 {
-	u_int nwords;
-	int i, ret;
+	char p;
 
 	if (strlen(buf) > CPUSETBUFSIZ - 1)
 		return (-1);
 
-	/* Allow to pass a shorter version of the mask when necessary. */
-	nwords = 1;
-	for (i = 0; buf[i] != '\0'; i++)
-		if (buf[i] == ',')
-			nwords++;
-	if (nwords > _NCPUWORDS)
+	p = buf[bitset_strscan((struct bitset *)set, CPU_SETSIZE, buf)];
+	if (p != '\0')
 		return (-1);
 
-	CPU_ZERO(set);
-	for (i = 0; i < (nwords - 1); i++) {
-		ret = sscanf(buf, "%lx,", &set->__bits[i]);
-		if (ret == 0 || ret == -1)
-			return (-1);
-		buf = strstr(buf, ",");
-		if (buf == NULL)
-			return (-1);
-		buf++;
-	}
-	ret = sscanf(buf, "%lx", &set->__bits[nwords - 1]);
-	if (ret == 0 || ret == -1)
-		return (-1);
 	return (0);
 }
 
 /*
+ * Handle a domainset specifier in the sysctl tree.  A pointer to a pointer to
+ * a domainset is in arg1.  If the user specifies a valid domainset, the
+ * pointer is updated.
+ *
+ * Format is:
+ * hex mask word 0,hex mask word 1,...:decimal policy:decimal preferred
+ */
+int
+sysctl_handle_domainset(SYSCTL_HANDLER_ARGS)
+{
+	char buf[DOMAINSETBUFSIZ];
+	struct domainset *dset;
+	struct domainset key;
+	char *p;
+	int error;
+
+	dset = *(struct domainset **)arg1;
+	error = 0;
+
+	if (dset != NULL) {
+		p = buf + bitset_strprint(buf, DOMAINSETBUFSIZ,
+		    (const struct bitset *)&dset->ds_mask, DOMAINSET_SETSIZE);
+		sprintf(p, ":%d:%d", dset->ds_policy, dset->ds_prefer);
+	} else
+		sprintf(buf, "<NULL>");
+	error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
+	if (error != 0 || req->newptr == NULL)
+		return (error);
+
+	/*
+	 * Read in and validate the string.
+	 */
+	memset(&key, 0, sizeof(key));
+	p = &buf[bitset_strscan((struct bitset *)&key.ds_mask,
+	    DOMAINSET_SETSIZE, buf)];
+	if (p == buf)
+		return (EINVAL);
+	if (sscanf(p, ":%hd:%hhd", &key.ds_policy, &key.ds_prefer) != 2)
+		return (EINVAL);
+
+	/* domainset_create() validates the policy. */
+	dset = domainset_create(&key);
+	if (dset == NULL)
+		return (EINVAL);
+	*(struct domainset **)arg1 = dset;
+
+	return (error);
+}
+
+#ifdef DDB
+
+/*
  * Apply an anonymous mask or a domain to a single thread.
  */
 static int
@@ -1256,8 +1348,6 @@ cpuset_setithread(lwpid_t id, int cpu)
 /*
  * Create the domainset for cpuset 0, 1 and cpuset 2.
  */
-static struct domainset domainset0, domainset2;
-
 void
 domainset_zero(void)
 {
@@ -2079,8 +2169,6 @@ out:
 	return (error);
 }
 
-#ifdef DDB
-BITSET_DEFINE(bitset, 1);
 static void
 ddb_display_bitset(const struct bitset *set, int size)
 {

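The sysctl_handle_domainset() handler added above accepts strings of the form
"mask[,mask...]:policy:prefer".  Below is a minimal user-space sketch of the
same parse, using one unsigned long mask word instead of the kernel's struct
bitset; the names (struct dset_key, parse_dset) are illustrative only and not
part of this commit.

#include <stdio.h>

/*
 * Hypothetical user-space model of the "mask[,mask...]:policy:prefer"
 * format handled by sysctl_handle_domainset().  One mask word is parsed
 * for brevity; the kernel walks __bitset_words() words and then
 * validates the result in domainset_create().
 */
struct dset_key {
	unsigned long	mask;		/* stands in for ds_mask word 0 */
	short		policy;		/* stands in for ds_policy */
	signed char	prefer;		/* stands in for ds_prefer */
};

static int
parse_dset(const char *buf, struct dset_key *key)
{

	/* Same conversions the kernel uses: %lx, then ":%hd:%hhd". */
	if (sscanf(buf, "%lx:%hd:%hhd", &key->mask, &key->policy,
	    &key->prefer) != 3)
		return (-1);
	return (0);
}

int
main(void)
{
	struct dset_key key;

	/* Domains {0,1}, policy 3, preferred domain 1, in the new format. */
	if (parse_dset("3:3:1", &key) == 0)
		printf("mask=%#lx policy=%d prefer=%d\n", key.mask,
		    key.policy, key.prefer);
	return (0);
}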
Modified: user/markj/vm-playground/sys/kern/vfs_bio.c
==============================================================================
--- user/markj/vm-playground/sys/kern/vfs_bio.c	Tue Mar 13 18:30:26 2018	(r330872)
+++ user/markj/vm-playground/sys/kern/vfs_bio.c	Tue Mar 13 18:33:50 2018	(r330873)
@@ -50,6 +50,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bio.h>
+#include <sys/bitset.h>
 #include <sys/conf.h>
 #include <sys/counter.h>
 #include <sys/buf.h>
@@ -100,6 +101,7 @@ struct	buf_ops buf_ops_bio = {
 	.bop_bdflush	=	bufbdflush,
 };
 
+struct bufdomain;
 static struct buf *buf;		/* buffer header pool */
 extern struct buf *swbuf;	/* Swap buffer header pool. */
 caddr_t unmapped_buf;
@@ -123,8 +125,8 @@ static int vfs_bio_clcheck(struct vnode *vp, int size,
 		daddr_t lblkno, daddr_t blkno);
 static void breada(struct vnode *, daddr_t *, int *, int, struct ucred *, int,
 		void (*)(struct buf *));
-static int buf_flush(struct vnode *vp, int);
-static int flushbufqueues(struct vnode *, int, int);
+static int buf_flush(struct vnode *vp, struct bufdomain *, int);
+static int flushbufqueues(struct vnode *, struct bufdomain *, int, int);
 static void buf_daemon(void);
 static __inline void bd_wakeup(void);
 static int sysctl_runningspace(SYSCTL_HANDLER_ARGS);
@@ -133,6 +135,7 @@ static void bufkva_free(struct buf *);
 static int buf_import(void *, void **, int, int, int);
 static void buf_release(void *, void **, int);
 static void maxbcachebuf_adjust(void);
+static inline struct bufdomain *bufdomain(struct buf *);
 
 static int sysctl_bufspace(SYSCTL_HANDLER_ARGS);
 int vmiodirenable = TRUE;
@@ -147,22 +150,22 @@ static counter_u64_t bufkvaspace;
 SYSCTL_COUNTER_U64(_vfs, OID_AUTO, bufkvaspace, CTLFLAG_RD, &bufkvaspace,
     "Kernel virtual memory used for buffers");
 static long maxbufspace;
-SYSCTL_LONG(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RW, &maxbufspace, 0,
+SYSCTL_LONG(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RD, &maxbufspace, 0,
     "Maximum allowed value of bufspace (including metadata)");
 static long bufmallocspace;
 SYSCTL_LONG(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD, &bufmallocspace, 0,
     "Amount of malloced memory for buffers");
 static long maxbufmallocspace;
-SYSCTL_LONG(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW, &maxbufmallocspace,
+SYSCTL_LONG(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RD, &maxbufmallocspace,
     0, "Maximum amount of malloced memory for buffers");
 static long lobufspace;
-SYSCTL_LONG(_vfs, OID_AUTO, lobufspace, CTLFLAG_RW, &lobufspace, 0,
+SYSCTL_LONG(_vfs, OID_AUTO, lobufspace, CTLFLAG_RD, &lobufspace, 0,
     "Minimum amount of buffers we want to have");
 long hibufspace;
-SYSCTL_LONG(_vfs, OID_AUTO, hibufspace, CTLFLAG_RW, &hibufspace, 0,
+SYSCTL_LONG(_vfs, OID_AUTO, hibufspace, CTLFLAG_RD, &hibufspace, 0,
     "Maximum allowed value of bufspace (excluding metadata)");
 long bufspacethresh;
-SYSCTL_LONG(_vfs, OID_AUTO, bufspacethresh, CTLFLAG_RW, &bufspacethresh,
+SYSCTL_LONG(_vfs, OID_AUTO, bufspacethresh, CTLFLAG_RD, &bufspacethresh,
     0, "Bufspace consumed before waking the daemon to free some");
 static counter_u64_t buffreekvacnt;
 SYSCTL_COUNTER_U64(_vfs, OID_AUTO, buffreekvacnt, CTLFLAG_RW, &buffreekvacnt,
@@ -190,26 +193,27 @@ SYSCTL_INT(_vfs, OID_AUTO, altbufferflushes, CTLFLAG_R
 static int recursiveflushes;
 SYSCTL_INT(_vfs, OID_AUTO, recursiveflushes, CTLFLAG_RW, &recursiveflushes,
     0, "Number of flushes skipped due to being recursive");
-static int numdirtybuffers;
-SYSCTL_INT(_vfs, OID_AUTO, numdirtybuffers, CTLFLAG_RD, &numdirtybuffers, 0,
+static int sysctl_numdirtybuffers(SYSCTL_HANDLER_ARGS);
+SYSCTL_PROC(_vfs, OID_AUTO, numdirtybuffers,
+    CTLTYPE_INT|CTLFLAG_MPSAFE|CTLFLAG_RD, NULL, 0, sysctl_numdirtybuffers, "I",
     "Number of buffers that are dirty (has unwritten changes) at the moment");
 static int lodirtybuffers;
-SYSCTL_INT(_vfs, OID_AUTO, lodirtybuffers, CTLFLAG_RW, &lodirtybuffers, 0,
+SYSCTL_INT(_vfs, OID_AUTO, lodirtybuffers, CTLFLAG_RD, &lodirtybuffers, 0,
     "How many buffers we want to have free before bufdaemon can sleep");
 static int hidirtybuffers;
-SYSCTL_INT(_vfs, OID_AUTO, hidirtybuffers, CTLFLAG_RW, &hidirtybuffers, 0,
+SYSCTL_INT(_vfs, OID_AUTO, hidirtybuffers, CTLFLAG_RD, &hidirtybuffers, 0,
     "When the number of dirty buffers is considered severe");
 int dirtybufthresh;
-SYSCTL_INT(_vfs, OID_AUTO, dirtybufthresh, CTLFLAG_RW, &dirtybufthresh,
+SYSCTL_INT(_vfs, OID_AUTO, dirtybufthresh, CTLFLAG_RD, &dirtybufthresh,
     0, "Number of bdwrite to bawrite conversions to clear dirty buffers");
 static int numfreebuffers;
 SYSCTL_INT(_vfs, OID_AUTO, numfreebuffers, CTLFLAG_RD, &numfreebuffers, 0,
     "Number of free buffers");
 static int lofreebuffers;
-SYSCTL_INT(_vfs, OID_AUTO, lofreebuffers, CTLFLAG_RW, &lofreebuffers, 0,
+SYSCTL_INT(_vfs, OID_AUTO, lofreebuffers, CTLFLAG_RD, &lofreebuffers, 0,
    "Target number of free buffers");
 static int hifreebuffers;
-SYSCTL_INT(_vfs, OID_AUTO, hifreebuffers, CTLFLAG_RW, &hifreebuffers, 0,
+SYSCTL_INT(_vfs, OID_AUTO, hifreebuffers, CTLFLAG_RD, &hifreebuffers, 0,
    "Threshold for clean buffer recycling");
 static counter_u64_t getnewbufcalls;
 SYSCTL_COUNTER_U64(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RD,
@@ -308,10 +312,10 @@ struct bufqueue {
 #define	BQ_ASSERT_LOCKED(bq)	mtx_assert(BQ_LOCKPTR((bq)), MA_OWNED)
 
 struct bufqueue __exclusive_cache_line bqempty;
-struct bufqueue __exclusive_cache_line bqdirty;
 
 struct bufdomain {
 	struct bufqueue	bd_subq[MAXCPU + 1]; /* Per-cpu sub queues + global */
+	struct bufqueue bd_dirtyq;
 	struct bufqueue	*bd_cleanq;
 	struct mtx_padalign bd_run_lock;
 	/* Constants */
@@ -321,10 +325,14 @@ struct bufdomain {
 	long 		bd_bufspacethresh;
 	int		bd_hifreebuffers;
 	int		bd_lofreebuffers;
+	int		bd_hidirtybuffers;
+	int		bd_lodirtybuffers;
+	int		bd_dirtybufthresh;
 	int		bd_lim;
 	/* atomics */
 	int		bd_wanted;
-	int  __aligned(CACHE_LINE_SIZE)	bd_running;
+	int __aligned(CACHE_LINE_SIZE)	bd_numdirtybuffers;
+	int __aligned(CACHE_LINE_SIZE)	bd_running;
 	long __aligned(CACHE_LINE_SIZE) bd_bufspace;
 	int __aligned(CACHE_LINE_SIZE)	bd_freebuffers;
 } __aligned(CACHE_LINE_SIZE);
@@ -336,15 +344,19 @@ struct bufdomain {
 #define	BD_RUN_LOCKPTR(bd)	(&(bd)->bd_run_lock)
 #define	BD_RUN_LOCK(bd)		mtx_lock(BD_RUN_LOCKPTR((bd)))
 #define	BD_RUN_UNLOCK(bd)	mtx_unlock(BD_RUN_LOCKPTR((bd)))
-#define	BD_DOMAIN(bd)		(bd - bdclean)
+#define	BD_DOMAIN(bd)		(bd - bdomain)
 
-/* Maximum number of clean buffer domains. */
-#define	CLEAN_DOMAINS	8
+/* Maximum number of buffer domains. */
+#define	BUF_DOMAINS	8
 
+BITSET_DEFINE(bufdomainset, BUF_DOMAINS);
+struct bufdomainset bdlodirty;		/* Domains > lodirty */
+struct bufdomainset bdhidirty;		/* Domains > hidirty */
+
 /* Configured number of clean queues. */
-static int __read_mostly clean_domains;
+static int __read_mostly buf_domains;
 
-struct bufdomain __exclusive_cache_line bdclean[CLEAN_DOMAINS];
+struct bufdomain __exclusive_cache_line bdomain[BUF_DOMAINS];
 
 static void bq_remove(struct bufqueue *bq, struct buf *bp);
 static void bq_insert(struct bufqueue *bq, struct buf *bp, bool unlock);
@@ -403,8 +415,8 @@ sysctl_bufspace(SYSCTL_HANDLER_ARGS)
 	int i;
 
 	lvalue = 0;
-	for (i = 0; i < clean_domains; i++)
-		lvalue += bdclean[i].bd_bufspace;
+	for (i = 0; i < buf_domains; i++)
+		lvalue += bdomain[i].bd_bufspace;
 	if (sizeof(int) == sizeof(long) || req->oldlen >= sizeof(long))
 		return (sysctl_handle_long(oidp, &lvalue, 0, req));
 	if (lvalue > INT_MAX)
@@ -421,12 +433,24 @@ sysctl_bufspace(SYSCTL_HANDLER_ARGS)
 	int i;
 
 	lvalue = 0;
-	for (i = 0; i < clean_domains; i++)
-		lvalue += bdclean[i].bd_bufspace;
+	for (i = 0; i < buf_domains; i++)
+		lvalue += bdomain[i].bd_bufspace;
 	return (sysctl_handle_long(oidp, &lvalue, 0, req));
 }
 #endif
 
+static int
+sysctl_numdirtybuffers(SYSCTL_HANDLER_ARGS)
+{
+	int value;
+	int i;
+
+	value = 0;
+	for (i = 0; i < buf_domains; i++)
+		value += bdomain[i].bd_numdirtybuffers;
+	return (sysctl_handle_int(oidp, &value, 0, req));
+}
+
 /*
  *	bdirtywakeup:
  *
@@ -444,18 +468,59 @@ bdirtywakeup(void)
 }
 
 /*
+ *	bd_clear:
+ *
+ *	Clear a domain from the appropriate bitsets when dirtybuffers
+ *	is decremented.
+ */
+static void
+bd_clear(struct bufdomain *bd)
+{
+
+	mtx_lock(&bdirtylock);
+	if (bd->bd_numdirtybuffers <= bd->bd_lodirtybuffers)
+		BIT_CLR(BUF_DOMAINS, BD_DOMAIN(bd), &bdlodirty);
+	if (bd->bd_numdirtybuffers <= bd->bd_hidirtybuffers)
+		BIT_CLR(BUF_DOMAINS, BD_DOMAIN(bd), &bdhidirty);
+	mtx_unlock(&bdirtylock);
+}
+
+/*
+ *	bd_set:
+ *
+ *	Set a domain in the appropriate bitsets when dirtybuffers
+ *	is incremented.
+ */
+static void
+bd_set(struct bufdomain *bd)
+{
+
+	mtx_lock(&bdirtylock);
+	if (bd->bd_numdirtybuffers > bd->bd_lodirtybuffers)
+		BIT_SET(BUF_DOMAINS, BD_DOMAIN(bd), &bdlodirty);
+	if (bd->bd_numdirtybuffers > bd->bd_hidirtybuffers)
+		BIT_SET(BUF_DOMAINS, BD_DOMAIN(bd), &bdhidirty);
+	mtx_unlock(&bdirtylock);
+}
+
+/*
  *	bdirtysub:
  *
  *	Decrement the numdirtybuffers count by one and wakeup any
  *	threads blocked in bwillwrite().
  */
 static void
-bdirtysub(void)
+bdirtysub(struct buf *bp)
 {
+	struct bufdomain *bd;
+	int num;
 
-	if (atomic_fetchadd_int(&numdirtybuffers, -1) ==
-	    (lodirtybuffers + hidirtybuffers) / 2)
+	bd = bufdomain(bp);
+	num = atomic_fetchadd_int(&bd->bd_numdirtybuffers, -1);
+	if (num == (bd->bd_lodirtybuffers + bd->bd_hidirtybuffers) / 2)
 		bdirtywakeup();
+	if (num == bd->bd_lodirtybuffers || num == bd->bd_hidirtybuffers)
+		bd_clear(bd);
 }
 
 /*
@@ -465,16 +530,21 @@ bdirtysub(void)
  *	daemon if needed.
  */
 static void
-bdirtyadd(void)
+bdirtyadd(struct buf *bp)
 {
+	struct bufdomain *bd;
+	int num;
 
 	/*
 	 * Only do the wakeup once as we cross the boundary.  The
 	 * buf daemon will keep running until the condition clears.
 	 */
-	if (atomic_fetchadd_int(&numdirtybuffers, 1) ==
-	    (lodirtybuffers + hidirtybuffers) / 2)
+	bd = bufdomain(bp);
+	num = atomic_fetchadd_int(&bd->bd_numdirtybuffers, 1);
+	if (num == (bd->bd_lodirtybuffers + bd->bd_hidirtybuffers) / 2)
 		bd_wakeup();
+	if (num == bd->bd_lodirtybuffers || num == bd->bd_hidirtybuffers)
+		bd_set(bd);
 }
 
 /*
@@ -539,11 +609,11 @@ bufspace_adjust(struct buf *bp, int bufsize)
 
 	KASSERT((bp->b_flags & B_MALLOC) == 0,
 	    ("bufspace_adjust: malloc buf %p", bp));
-	bd = &bdclean[bp->b_domain];
+	bd = bufdomain(bp);
 	diff = bufsize - bp->b_bufsize;
 	if (diff < 0) {
 		atomic_subtract_long(&bd->bd_bufspace, -diff);
-	} else {
+	} else if (diff > 0) {
 		space = atomic_fetchadd_long(&bd->bd_bufspace, diff);
 		/* Wake up the daemon on the transition. */
 		if (space < bd->bd_bufspacethresh &&
@@ -638,7 +708,7 @@ bufspace_wait(struct bufdomain *bd, struct vnode *vp, 
 			 * recursion.
 			 */
 			td->td_pflags |= TDP_BUFNEED | TDP_NORUNNINGBUF;
-			fl = buf_flush(vp, flushbufqtarget);
+			fl = buf_flush(vp, bd, flushbufqtarget);
 			td->td_pflags &= norunbuf;
 			BD_LOCK(bd);
 			if (fl != 0)
@@ -700,7 +770,6 @@ bufspace_daemon(void *arg)
 			if (buf_recycle(bd, false) != 0) {
 				if (bd_flushall(bd))
 					continue;
-				bd_speedup();
 				BD_LOCK(bd);
 				if (bd->bd_wanted) {
 					msleep(&bd->bd_wanted, BD_LOCKPTR(bd),
@@ -1026,7 +1095,6 @@ bufinit(void)
 	    ("maxbcachebuf (%d) must be >= MAXBSIZE (%d)\n", maxbcachebuf,
 	    MAXBSIZE));
 	bq_init(&bqempty, QUEUE_EMPTY, -1, "bufq empty lock");
-	bq_init(&bqdirty, QUEUE_DIRTY, -1, "bufq dirty lock");
 	mtx_init(&rbreqlock, "runningbufspace lock", NULL, MTX_DEF);
 	mtx_init(&bdlock, "buffer daemon lock", NULL, MTX_DEF);
 	mtx_init(&bdirtylock, "dirty buf lock", NULL, MTX_DEF);
@@ -1094,7 +1162,6 @@ bufinit(void)
 	 */
 	hidirtybuffers = nbuf / 4 + 20;
 	dirtybufthresh = hidirtybuffers * 9 / 10;
-	numdirtybuffers = 0;
 	/*
 	 * To support extreme low-memory systems, make sure hidirtybuffers
 	 * cannot eat up all available buffer space.  This occurs when our
@@ -1129,22 +1196,26 @@ bufinit(void)
 	 * One queue per 256MB up to the max.  More queues give better
 	 * concurrency but less accurate LRU.
 	 */
-	clean_domains = MIN(howmany(maxbufspace, 256*1024*1024), CLEAN_DOMAINS);
-	for (i = 0 ; i < clean_domains; i++) {
+	buf_domains = MIN(howmany(maxbufspace, 256*1024*1024), BUF_DOMAINS);
+	for (i = 0 ; i < buf_domains; i++) {
 		struct bufdomain *bd;
 
-		bd = &bdclean[i];
+		bd = &bdomain[i];
 		bd_init(bd);
-		bd->bd_freebuffers = nbuf / clean_domains;
-		bd->bd_hifreebuffers = hifreebuffers / clean_domains;
-		bd->bd_lofreebuffers = lofreebuffers / clean_domains;
+		bd->bd_freebuffers = nbuf / buf_domains;
+		bd->bd_hifreebuffers = hifreebuffers / buf_domains;
+		bd->bd_lofreebuffers = lofreebuffers / buf_domains;
 		bd->bd_bufspace = 0;
-		bd->bd_maxbufspace = maxbufspace / clean_domains;
-		bd->bd_hibufspace = hibufspace / clean_domains;
-		bd->bd_lobufspace = lobufspace / clean_domains;
-		bd->bd_bufspacethresh = bufspacethresh / clean_domains;
+		bd->bd_maxbufspace = maxbufspace / buf_domains;
+		bd->bd_hibufspace = hibufspace / buf_domains;
+		bd->bd_lobufspace = lobufspace / buf_domains;
+		bd->bd_bufspacethresh = bufspacethresh / buf_domains;
+		bd->bd_numdirtybuffers = 0;
+		bd->bd_hidirtybuffers = hidirtybuffers / buf_domains;
+		bd->bd_lodirtybuffers = lodirtybuffers / buf_domains;
+		bd->bd_dirtybufthresh = dirtybufthresh / buf_domains;
 		/* Don't allow more than 2% of bufs in the per-cpu caches. */
-		bd->bd_lim = nbuf / clean_domains / 50 / mp_ncpus;
+		bd->bd_lim = nbuf / buf_domains / 50 / mp_ncpus;
 	}
 	getnewbufcalls = counter_u64_alloc(M_WAITOK);
 	getnewbufrestarts = counter_u64_alloc(M_WAITOK);
@@ -1328,6 +1399,13 @@ bpmap_qenter(struct buf *bp)
 	    (vm_offset_t)(bp->b_offset & PAGE_MASK));
 }
 
+static inline struct bufdomain *
+bufdomain(struct buf *bp)
+{
+
+	return (&bdomain[bp->b_domain]);
+}
+
 static struct bufqueue *
 bufqueue(struct buf *bp)
 {
@@ -1340,9 +1418,9 @@ bufqueue(struct buf *bp)
 	case QUEUE_EMPTY:
 		return (&bqempty);
 	case QUEUE_DIRTY:
-		return (&bqdirty);
+		return (&bufdomain(bp)->bd_dirtyq);
 	case QUEUE_CLEAN:
-		return (&bdclean[bp->b_domain].bd_subq[bp->b_subqueue]);
+		return (&bufdomain(bp)->bd_subq[bp->b_subqueue]);
 	default:
 		break;
 	}
@@ -1405,14 +1483,14 @@ binsfree(struct buf *bp, int qindex)
 		bq_remove(bq, bp);
 		BQ_UNLOCK(bq);
 	}
+	bd = bufdomain(bp);
 	if (qindex == QUEUE_CLEAN) {
-		bd = &bdclean[bp->b_domain];
 		if (bd->bd_lim != 0)
 			bq = &bd->bd_subq[PCPU_GET(cpuid)];
 		else
 			bq = bd->bd_cleanq;
 	} else
-		bq = &bqdirty;
+		bq = &bd->bd_dirtyq;
 	bq_insert(bq, bp, true);
 }
 
@@ -1440,7 +1518,7 @@ buf_free(struct buf *bp)
 	if (!LIST_EMPTY(&bp->b_dep))
 		buf_deallocate(bp);
 	bufkva_free(bp);
-	atomic_add_int(&bdclean[bp->b_domain].bd_freebuffers, 1);
+	atomic_add_int(&bufdomain(bp)->bd_freebuffers, 1);
 	BUF_UNLOCK(bp);
 	uma_zfree(buf_zone, bp);
 }
@@ -1716,9 +1794,10 @@ bd_init(struct bufdomain *bd)
 	int domain;
 	int i;
 
-	domain = bd - bdclean;
+	domain = bd - bdomain;
 	bd->bd_cleanq = &bd->bd_subq[mp_maxid + 1];
 	bq_init(bd->bd_cleanq, QUEUE_CLEAN, mp_maxid + 1, "bufq clean lock");
+	bq_init(&bd->bd_dirtyq, QUEUE_DIRTY, -1, "bufq dirty lock");
 	for (i = 0; i <= mp_maxid; i++)
 		bq_init(&bd->bd_subq[i], QUEUE_CLEAN, i,
 		    "bufq clean subqueue lock");
@@ -1810,7 +1889,7 @@ bq_insert(struct bufqueue *bq, struct buf *bp, bool un
 	if (bp->b_qindex != QUEUE_NONE)
 		panic("bq_insert: free buffer %p onto another queue?", bp);
 
-	bd = &bdclean[bp->b_domain];
+	bd = bufdomain(bp);
 	if (bp->b_flags & B_AGE) {
 		/* Place this buf directly on the real queue. */
 		if (bq->bq_index == QUEUE_CLEAN)
@@ -1927,8 +2006,8 @@ bufkva_reclaim(vmem_t *vmem, int flags)
 
 	done = false;
 	for (i = 0; i < 5; i++) {
-		for (q = 0; q < clean_domains; q++)
-			if (buf_recycle(&bdclean[q], true) != 0)
+		for (q = 0; q < buf_domains; q++)
+			if (buf_recycle(&bdomain[q], true) != 0)
 				done = true;
 		if (done)
 			break;
@@ -2320,7 +2399,7 @@ bdirty(struct buf *bp)
 	if ((bp->b_flags & B_DELWRI) == 0) {
 		bp->b_flags |= /* XXX B_DONE | */ B_DELWRI;
 		reassignbuf(bp);
-		bdirtyadd();
+		bdirtyadd(bp);
 	}
 }
 
@@ -2348,7 +2427,7 @@ bundirty(struct buf *bp)
 	if (bp->b_flags & B_DELWRI) {
 		bp->b_flags &= ~B_DELWRI;
 		reassignbuf(bp);
-		bdirtysub();
+		bdirtysub(bp);
 	}
 	/*
 	 * Since it is now being written, we can clear its deferred write flag.
@@ -2420,9 +2499,9 @@ void
 bwillwrite(void)
 {
 
-	if (numdirtybuffers >= hidirtybuffers) {
+	if (buf_dirty_count_severe()) {
 		mtx_lock(&bdirtylock);
-		while (numdirtybuffers >= hidirtybuffers) {
+		while (buf_dirty_count_severe()) {
 			bdirtywait = 1;
 			msleep(&bdirtywait, &bdirtylock, (PRIBIO + 4),
 			    "flswai", 0);
@@ -2438,7 +2517,7 @@ int
 buf_dirty_count_severe(void)
 {
 
-	return(numdirtybuffers >= hidirtybuffers);
+	return (!BIT_EMPTY(BUF_DOMAINS, &bdhidirty));
 }
 
 /*
@@ -2523,7 +2602,7 @@ brelse(struct buf *bp)
 		if (!LIST_EMPTY(&bp->b_dep))
 			buf_deallocate(bp);
 		if (bp->b_flags & B_DELWRI)
-			bdirtysub();
+			bdirtysub(bp);
 		bp->b_flags &= ~(B_DELWRI | B_CACHE);
 		if ((bp->b_flags & B_VMIO) == 0) {
 			allocbuf(bp, 0);
@@ -3136,9 +3215,9 @@ getnewbuf(struct vnode *vp, int slpflag, int slptimeo,
 	else
 		metadata = false;
 	if (vp == NULL)
-		bd = &bdclean[0];
+		bd = &bdomain[0];
 	else
-		bd = &bdclean[vp->v_bufobj.bo_domain];
+		bd = &bdomain[vp->v_bufobj.bo_domain];
 
 	counter_u64_add(getnewbufcalls, 1);
 	reserved = false;
@@ -3184,11 +3263,11 @@ static struct kproc_desc buf_kp = {
 SYSINIT(bufdaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start, &buf_kp);
 
 static int
-buf_flush(struct vnode *vp, int target)
+buf_flush(struct vnode *vp, struct bufdomain *bd, int target)
 {
 	int flushed;
 
-	flushed = flushbufqueues(vp, target, 0);
+	flushed = flushbufqueues(vp, bd, target, 0);
 	if (flushed == 0) {
 		/*
 		 * Could not find any buffers without rollback
@@ -3197,7 +3276,7 @@ buf_flush(struct vnode *vp, int target)
 		 */
 		if (vp != NULL && target > 2)
 			target /= 2;
-		flushbufqueues(vp, target, 1);
+		flushbufqueues(vp, bd, target, 1);
 	}
 	return (flushed);
 }
@@ -3205,6 +3284,8 @@ buf_flush(struct vnode *vp, int target)
 static void
 buf_daemon()
 {
+	struct bufdomain *bd;
+	int speedupreq;
 	int lodirty;
 	int i;
 
@@ -3217,11 +3298,11 @@ buf_daemon()
 	/*
 	 * Start the buf clean daemons as children threads.
 	 */
-	for (i = 0 ; i < clean_domains; i++) {
+	for (i = 0 ; i < buf_domains; i++) {
 		int error;
 
 		error = kthread_add((void (*)(void *))bufspace_daemon,
-		    &bdclean[i], curproc, NULL, 0, 0, "bufspacedaemon-%d", i);
+		    &bdomain[i], curproc, NULL, 0, 0, "bufspacedaemon-%d", i);
 		if (error)
 			panic("error %d spawning bufspace daemon", error);
 	}
@@ -3236,20 +3317,30 @@ buf_daemon()
 		mtx_unlock(&bdlock);
 
 		kproc_suspend_check(bufdaemonproc);
-		lodirty = lodirtybuffers;
-		if (bd_speedupreq) {
-			lodirty = numdirtybuffers / 2;
-			bd_speedupreq = 0;
-		}
+
 		/*
-		 * Do the flush.  Limit the amount of in-transit I/O we
-		 * allow to build up, otherwise we would completely saturate
-		 * the I/O system.
+		 * Save speedupreq for this pass and reset to capture new
+		 * requests.
 		 */
-		while (numdirtybuffers > lodirty) {
-			if (buf_flush(NULL, numdirtybuffers - lodirty) == 0)
-				break;
-			kern_yield(PRI_USER);
+		speedupreq = bd_speedupreq;
+		bd_speedupreq = 0;
+
+		/*
+		 * Flush each domain sequentially according to its level and
+		 * the speedup request.
+		 */
+		for (i = 0; i < buf_domains; i++) {
+			bd = &bdomain[i];
+			if (speedupreq)
+				lodirty = bd->bd_numdirtybuffers / 2;
+			else
+				lodirty = bd->bd_lodirtybuffers;
+			while (bd->bd_numdirtybuffers > lodirty) {
+				if (buf_flush(NULL, bd,
+				    bd->bd_numdirtybuffers - lodirty) == 0)
+					break;
+				kern_yield(PRI_USER);
+			}
 		}
 
 		/*
@@ -3263,7 +3354,7 @@ buf_daemon()
 		 * to avoid endless loops on unlockable buffers.
 		 */
 		mtx_lock(&bdlock);
-		if (numdirtybuffers <= lodirtybuffers) {
+		if (BIT_EMPTY(BUF_DOMAINS, &bdlodirty)) {
 			/*
 			 * We reached our low water mark, reset the
 			 * request and sleep until we are needed again.
@@ -3302,7 +3393,8 @@ SYSCTL_INT(_vfs, OID_AUTO, flushwithdeps, CTLFLAG_RW, 
     0, "Number of buffers flushed with dependencies that require rollbacks");
 
 static int
-flushbufqueues(struct vnode *lvp, int target, int flushdeps)
+flushbufqueues(struct vnode *lvp, struct bufdomain *bd, int target,
+    int flushdeps)
 {
 	struct bufqueue *bq;
 	struct buf *sentinel;
@@ -3315,7 +3407,7 @@ flushbufqueues(struct vnode *lvp, int target, int flus
 	bool unlock;
 
 	flushed = 0;
-	bq = &bqdirty;
+	bq = &bd->bd_dirtyq;
 	bp = NULL;
 	sentinel = malloc(sizeof(struct buf), M_TEMP, M_WAITOK | M_ZERO);
 	sentinel->b_qindex = QUEUE_SENTINEL;
@@ -3651,7 +3743,7 @@ bp_unmapped_get_kva(struct buf *bp, daddr_t blkno, int
 			panic("GB_NOWAIT_BD and GB_UNMAPPED %p", bp);
 		}
 		counter_u64_add(mappingrestarts, 1);
-		bufspace_wait(&bdclean[bp->b_domain], bp->b_vp, gbflags, 0, 0);
+		bufspace_wait(bufdomain(bp), bp->b_vp, gbflags, 0, 0);
 	}
 has_addr:
 	if (need_mapping) {
@@ -3849,7 +3941,7 @@ loop:
 		 */
 		if (flags & GB_NOCREAT)
 			return NULL;
-		if (bdclean[bo->bo_domain].bd_freebuffers == 0 &&
+		if (bdomain[bo->bo_domain].bd_freebuffers == 0 &&
 		    TD_IS_IDLETHREAD(curthread))
 			return NULL;
 
@@ -3906,7 +3998,7 @@ loop:
 		if (gbincore(bo, blkno)) {
 			BO_UNLOCK(bo);
 			bp->b_flags |= B_INVAL;
-			bufspace_release(&bdclean[bp->b_domain], maxsize);
+			bufspace_release(bufdomain(bp), maxsize);
 			brelse(bp);
 			goto loop;
 		}
@@ -3941,7 +4033,7 @@ loop:
 		}
 
 		allocbuf(bp, size);
-		bufspace_release(&bdclean[bp->b_domain], maxsize);
+		bufspace_release(bufdomain(bp), maxsize);
 		bp->b_flags &= ~B_DONE;
 	}
 	CTR4(KTR_BUF, "getblk(%p, %ld, %d) = %p", vp, (long)blkno, size, bp);
@@ -3970,7 +4062,7 @@ geteblk(int size, int flags)
 			return (NULL);
 	}
 	allocbuf(bp, size);
-	bufspace_release(&bdclean[bp->b_domain], maxsize);
+	bufspace_release(bufdomain(bp), maxsize);
 	bp->b_flags |= B_INVAL;	/* b_dep cleared by getnewbuf() */
 	BUF_ASSERT_HELD(bp);
 	return (bp);
@@ -4839,7 +4931,7 @@ bufobj_init(struct bufobj *bo, void *private)
 	static volatile int bufobj_cleanq;
 
         bo->bo_domain =
-            atomic_fetchadd_int(&bufobj_cleanq, 1) % clean_domains;
+            atomic_fetchadd_int(&bufobj_cleanq, 1) % buf_domains;
         rw_init(BO_LOCKPTR(bo), "bufobj interlock");
         bo->bo_private = private;
         TAILQ_INIT(&bo->bo_clean.bv_hd);
@@ -5184,10 +5276,9 @@ DB_SHOW_COMMAND(bufqueues, bufqueues)
 	int i, j;
 
 	db_printf("bqempty: %d\n", bqempty.bq_len);
-	db_printf("bqdirty: %d\n", bqdirty.bq_len);
 
-	for (i = 0; i < clean_domains; i++) {
-		bd = &bdclean[i];
+	for (i = 0; i < buf_domains; i++) {
+		bd = &bdomain[i];
 		db_printf("Buf domain %d\n", i);
 		db_printf("\tfreebufs\t%d\n", bd->bd_freebuffers);
 		db_printf("\tlofreebufs\t%d\n", bd->bd_lofreebuffers);
@@ -5199,7 +5290,13 @@ DB_SHOW_COMMAND(bufqueues, bufqueues)
 		db_printf("\tlobufspace\t%ld\n", bd->bd_lobufspace);
 		db_printf("\tbufspacethresh\t%ld\n", bd->bd_bufspacethresh);
 		db_printf("\n");
+		db_printf("\tnumdirtybuffers\t%d\n", bd->bd_numdirtybuffers);
+		db_printf("\tlodirtybuffers\t%d\n", bd->bd_lodirtybuffers);
+		db_printf("\thidirtybuffers\t%d\n", bd->bd_hidirtybuffers);
+		db_printf("\tdirtybufthresh\t%d\n", bd->bd_dirtybufthresh);
+		db_printf("\n");
 		db_printf("\tcleanq count\t%d\n", bd->bd_cleanq->bq_len);
+		db_printf("\tdirtyq count\t%d\n", bd->bd_dirtyq.bq_len);
 		db_printf("\twakeup\t\t%d\n", bd->bd_wanted);
 		db_printf("\tlim\t\t%d\n", bd->bd_lim);
 		db_printf("\tCPU ");

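The dirty-buffer accounting above moves the global numdirtybuffers counter
into each bufdomain and keeps two global bitsets, bdlodirty and bdhidirty,
naming the domains currently above their low and high marks, so that
buf_dirty_count_severe() reduces to a bitset-emptiness test.  A rough
user-space model of that bookkeeping follows; it uses plain unsigned masks
instead of BITSET(9) and recomputes membership on every call, where the
kernel takes bdirtylock only on exact threshold crossings.  All names below
are illustrative.

#include <stdio.h>

#define	NDOM	4			/* stands in for BUF_DOMAINS */

static int	numdirty[NDOM], lodirty[NDOM], hidirty[NDOM];
static unsigned	lomask, himask;		/* model bdlodirty / bdhidirty */

/*
 * Mirror of bdirtyadd()/bdirtysub() plus bd_set()/bd_clear(): adjust a
 * domain's counter and keep the global masks in sync with its thresholds.
 */
static void
dirty_adjust(int dom, int delta)
{

	numdirty[dom] += delta;
	if (numdirty[dom] > lodirty[dom])
		lomask |= 1u << dom;
	else
		lomask &= ~(1u << dom);
	if (numdirty[dom] > hidirty[dom])
		himask |= 1u << dom;
	else
		himask &= ~(1u << dom);
}

/* Mirror of buf_dirty_count_severe(): is any domain over its high mark? */
static int
dirty_severe(void)
{

	return (himask != 0);
}

int
main(void)
{
	int i;

	for (i = 0; i < NDOM; i++) {
		lodirty[i] = 10;
		hidirty[i] = 20;
	}
	for (i = 0; i < 25; i++)	/* dirty 25 buffers in domain 1 */
		dirty_adjust(1, 1);
	printf("severe=%d lomask=%#x himask=%#x\n", dirty_severe(),
	    lomask, himask);
	return (0);
}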
Modified: user/markj/vm-playground/sys/sys/_bitset.h
==============================================================================
--- user/markj/vm-playground/sys/sys/_bitset.h	Tue Mar 13 18:30:26 2018	(r330872)
+++ user/markj/vm-playground/sys/sys/_bitset.h	Tue Mar 13 18:33:50 2018	(r330873)
@@ -57,4 +57,10 @@ struct t {								\
  */
 #define BITSET_DEFINE_VAR(t)	BITSET_DEFINE(t, 1)
 
+/*
+ * Define a default type that can be used while manually specifying size
+ * to every call.
+ */
+BITSET_DEFINE(bitset, 1);
+
 #endif /* !_SYS__BITSET_H_ */

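The default one-word type defined above is what lets size-generic helpers
take any bitset through a cast plus an explicit size, as bitset_strprint()
and bitset_strscan() in kern_cpuset.c do for both cpuset_t and domainset_t.
A simplified user-space illustration of the pattern (cut-down macros, not
the real sys/bitset.h):

#include <limits.h>
#include <stdio.h>

#define	_BITSET_BITS		(sizeof(long) * CHAR_BIT)
#define	__bitset_words(_s)	(((_s) + _BITSET_BITS - 1) / _BITSET_BITS)
#define	BITSET_DEFINE(t, _s)						\
struct t {								\
	long	__bits[__bitset_words(_s)];				\
}

BITSET_DEFINE(bitset, 1);	/* generic one-word view, as in _bitset.h */
BITSET_DEFINE(cpuset, 256);	/* a concrete, larger set */

/*
 * Size-generic helper: the caller passes the real size alongside a cast
 * to the one-word view type, exactly how the kernel helpers are called.
 */
static int
bitset_count(const struct bitset *set, int setlen)
{
	int i, n;

	n = 0;
	for (i = 0; i < __bitset_words(setlen); i++)
		n += __builtin_popcountl(set->__bits[i]);
	return (n);
}

int
main(void)
{
	struct cpuset cs = { .__bits = { 0xf, 0x1 } };

	/* Counts bits across all four words of the 256-bit set: prints 5. */
	printf("%d\n", bitset_count((const struct bitset *)&cs, 256));
	return (0);
}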
Modified: user/markj/vm-playground/sys/sys/domainset.h
==============================================================================
--- user/markj/vm-playground/sys/sys/domainset.h	Tue Mar 13 18:30:26 2018	(r330872)
+++ user/markj/vm-playground/sys/sys/domainset.h	Tue Mar 13 18:33:50 2018	(r330873)
@@ -28,8 +28,8 @@
  * $FreeBSD$
  */
 
-#ifndef _SYS_DOMAINSETSET_H_
-#define	_SYS_DOMAINSETSET_H_
+#ifndef _SYS_DOMAINSET_H_
+#define	_SYS_DOMAINSET_H_
 
 #include <sys/_domainset.h>
 
@@ -38,8 +38,12 @@
 #define	_NDOMAINSETBITS			_BITSET_BITS
 #define	_NDOMAINSETWORDS		__bitset_words(DOMAINSET_SETSIZE)
 
-#define	DOMAINSETSETBUFSIZ	((2 + sizeof(long) * 2) * _NDOMAINSETWORDS)
+#define	DOMAINSETBUFSIZ							\
+	    (((2 + sizeof(long) * 2) * _NDOMAINSETWORDS) +		\
+	    sizeof("::") + sizeof(__XSTRING(DOMAINSET_POLICY_MAX)) +	\
+	    sizeof(__XSTRING(MAXMEMDOM)))
 
+
 #define	DOMAINSET_CLR(n, p)		BIT_CLR(DOMAINSET_SETSIZE, n, p)
 #define	DOMAINSET_COPY(f, t)		BIT_COPY(DOMAINSET_SETSIZE, f, t)
 #define	DOMAINSET_ISSET(n, p)		BIT_ISSET(DOMAINSET_SETSIZE, n, p)
@@ -77,9 +81,6 @@
 #define	DOMAINSET_POLICY_MAX		DOMAINSET_POLICY_INTERLEAVE
 
 #ifdef _KERNEL
-#include <sys/queue.h>
-LIST_HEAD(domainlist, domainset);
-
 #if MAXMEMDOM < 256
 typedef	uint8_t		domainid_t;
 #else

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***

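The DOMAINSETBUFSIZ expansion in the domainset.h hunk above sizes the
conversion buffer as one hex-word slot (digits plus separator) per mask word,
plus room for the two ':' separators and the decimal policy and
preferred-domain fields.  A worked instance of the arithmetic, assuming LP64,
one mask word, and illustrative values of 4 for DOMAINSET_POLICY_MAX and 8
for MAXMEMDOM (the real constants depend on the kernel configuration):

#include <stdio.h>

#define	XSTRING_(x)	#x
#define	XSTRING(x)	XSTRING_(x)	/* like __XSTRING() in sys/cdefs.h */

int
main(void)
{
	size_t words = 1;	/* _NDOMAINSETWORDS when MAXMEMDOM <= 64 */
	size_t sz;

	/* (16 hex digits + separator + slack) per word, then ":p:d". */
	sz = ((2 + sizeof(long) * 2) * words) +
	    sizeof("::") + sizeof(XSTRING(4)) + sizeof(XSTRING(8));
	printf("%zu\n", sz);	/* 18 + 3 + 2 + 2 = 25 on LP64 */
	return (0);
}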