Skip site navigation (1)Skip section navigation (2)
Date:      Thu, 29 Mar 2018 02:54:51 +0000 (UTC)
From:      Jeff Roberson <jeff@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject:   svn commit: r331723 - in head: share/man/man9 sys/kern sys/sys sys/vm usr.bin/cpuset
Message-ID:  <201803290254.w2T2spJB077720@repo.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: jeff
Date: Thu Mar 29 02:54:50 2018
New Revision: 331723
URL: https://svnweb.freebsd.org/changeset/base/331723

Log:
  Implement several enhancements to NUMA policies.
  
  Add a new "interleave" allocation policy which stripes pages across
  domains with a stride or width keeping contiguity within a multi-page
  region.
  
  Move the kernel to the dedicated numbered cpuset #2 making it possible
  to assign kernel threads and memory policy separately from user.  This
  also eliminates the need for the complicated interrupt binding code.
  
  Add a sysctl API for viewing and manipulating domainsets.  Refactor some
  of the cpuset_t manipulation code using the generic bitset type so that
  it can be used for both.  This probably belongs in a dedicated subr file.
  
  Attempt to improve the include situation.
  
  Reviewed by:	kib
  Discussed with:	jhb (cpuset parts)
  Tested by:	pho (before review feedback)
  Sponsored by:	Netflix, Dell/EMC Isilon
  Differential Revision:	https://reviews.freebsd.org/D14839

Added:
  head/share/man/man9/domainset.9   (contents, props changed)
Modified:
  head/share/man/man9/Makefile
  head/sys/kern/kern_cpuset.c
  head/sys/kern/kern_kthread.c
  head/sys/sys/_bitset.h
  head/sys/sys/cpuset.h
  head/sys/sys/domainset.h
  head/sys/sys/proc.h
  head/sys/vm/vm_domainset.c
  head/sys/vm/vm_domainset.h
  head/sys/vm/vm_page.c
  head/sys/vm/vnode_pager.c
  head/usr.bin/cpuset/cpuset.1
  head/usr.bin/cpuset/cpuset.c

Modified: head/share/man/man9/Makefile
==============================================================================
--- head/share/man/man9/Makefile	Thu Mar 29 02:50:57 2018	(r331722)
+++ head/share/man/man9/Makefile	Thu Mar 29 02:54:50 2018	(r331723)
@@ -118,6 +118,7 @@ MAN=	accept_filter.9 \
 	disk.9 \
 	dnv.9 \
 	domain.9 \
+	domainset.9 \
 	dpcpu.9 \
 	drbr.9 \
 	driver.9 \

Added: head/share/man/man9/domainset.9
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ head/share/man/man9/domainset.9	Thu Mar 29 02:54:50 2018	(r331723)
@@ -0,0 +1,128 @@
+.\" Copyright (c) 2018 Jeffrey Roberson <jeff@FreeBSD.org>
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\"    notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\"    notice, this list of conditions and the following disclaimer in the
+.\"    documentation and/or other materials provided with the distribution.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS''
+.\" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+.\" TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+.\" PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE
+.\" LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+.\" CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+.\" SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+.\" INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+.\" CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+.\" ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+.\" POSSIBILITY OF SUCH DAMAGE.
+.\"
+.\" $FreeBSD$
+.\"
+.Dd March 24, 2018
+.Dt DOMAINSET 9
+.Os
+.Sh NAME
+.Nm domainset(9)
+\(em
+.Nm domainset_create ,
+.Nm sysctl_handle_domainset
+.Nd domainset functions and operation
+.Sh SYNOPSIS
+.In sys/_domainset.h
+.In sys/domainset.h
+.\"
+.Bd -literal -offset indent
+struct domainset {
+        domainset_t     ds_mask;
+        uint16_t        ds_policy;
+        domainid_t      ds_prefer;
+	...
+};
+.Ed
+.Pp
+.Ft struct domainset *
+.Fn domainset_create "const struct domainset *key"
+.Ft int
+.Fn sysctl_handle_domainset "SYSCTL_HANDLER_ARGS"
+.Sh DESCRIPTION
+The
+.Nm
+API provides memory domain allocation policy for NUMA machines.
+Each
+.Vt domainset
+contains a bitmask of allowed domains, an integer policy, and an optional
+preferred domain.
+Together, these specify a search order for memory allocations as well as
+the ability to restrict threads and objects to a subset of available
+memory domains for system partitioning and resource management.
+.Pp
+Every thread in the system and optionally every
+.Vt vm_object_t ,
+which is used to represent files and other memory sources, has
+a reference to a
+.Vt struct domainset .
+The domainset associated with the object is consulted first and the system
+falls back to the thread policy if none exists.
+.Pp
+The allocation policy has the following possible values:
+.Bl -tag -width "foo"
+.It Dv DOMAINSET_POLICY_ROUNDROBIN
+Memory is allocated from each domain in the mask in a round-robin fashion.
+This distributes bandwidth evenly among available domains.
+This policy can specify a single domain for a fixed allocation.
+.It Dv DOMAINSET_POLICY_FIRSTTOUCH
+Memory is allocated from the node that it is first accessed on.
+Allocation falls back to round-robin if the current domain is not in the
+allowed set or is out of memory.
+This policy optimizes for locality but may give pessimal results if the
+memory is accessed from many CPUs that are not in the local domain.
+.It Dv DOMAINSET_POLICY_PREFER
+Memory is allocated from the node in the
+.Vt prefer
+member.  The preferred node must be set in the allowed mask.
+If the preferred node is out of memory, the allocation falls back to
+round-robin among the allowed domains.
+.It Dv DOMAINSET_POLICY_INTERLEAVE
+Memory is allocated in a striped fashion with multiple pages
+allocated to each domain in the set according to the offset within
+the object.
+The stripe width is object dependent and may be as large as a
+super-page (2MB on amd64).
+This gives good distribution among memory domains while keeping system
+efficiency higher and is preferable to round-robin for general use.
+.El
+.Pp
+The
+.Fn domainset_create
+function takes a partially filled in domainset as a key and returns a
+valid domainset or NULL.
+It is critical that consumers not use domainsets that have not been
+returned by this function.
+A
+.Vt domainset
+is an immutable type that is shared among all matching keys and must
+not be modified after return.
+.Pp
+The
+.Fn sysctl_handle_domainset
+function is provided as a convenience for modifying or viewing domainsets
+that are not accessible via
+.Xr cpuset 2 .
+It is intended for use with 
+.Xr sysctl 9 .
+.Pp
+.Sh SEE ALSO
+.Xr cpuset 1 ,
+.Xr cpuset 2 ,
+.Xr cpuset_setdomain 2 ,
+.Xr bitset 9
+.Sh HISTORY
+.In sys/domainset.h
+first appeared in
+.Fx 12.0 .

Modified: head/sys/kern/kern_cpuset.c
==============================================================================
--- head/sys/kern/kern_cpuset.c	Thu Mar 29 02:50:57 2018	(r331722)
+++ head/sys/kern/kern_cpuset.c	Thu Mar 29 02:54:50 2018	(r331723)
@@ -37,6 +37,8 @@ __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
+#include <sys/sysctl.h>
+#include <sys/ctype.h>
 #include <sys/sysproto.h>
 #include <sys/jail.h>
 #include <sys/kernel.h>
@@ -63,9 +65,7 @@ __FBSDID("$FreeBSD$");
 #include <vm/uma.h>
 #include <vm/vm.h>
 #include <vm/vm_object.h>
-#include <vm/vm_page.h>
-#include <vm/vm_param.h>
-#include <vm/vm_phys.h>
+#include <vm/vm_extern.h>
 
 #ifdef DDB
 #include <ddb/ddb.h>
@@ -112,13 +112,17 @@ __FBSDID("$FreeBSD$");
  * meaning 'curthread'.  It may query available cpus for that tid with a
  * getaffinity call using (CPU_LEVEL_CPUSET, CPU_WHICH_PID, -1, ...).
  */
+
+LIST_HEAD(domainlist, domainset);
+
 static uma_zone_t cpuset_zone;
 static uma_zone_t domainset_zone;
 static struct mtx cpuset_lock;
 static struct setlist cpuset_ids;
 static struct domainlist cpuset_domains;
 static struct unrhdr *cpuset_unr;
-static struct cpuset *cpuset_zero, *cpuset_default;
+static struct cpuset *cpuset_zero, *cpuset_default, *cpuset_kernel;
+static struct domainset domainset0, domainset2;
 
 /* Return the size of cpuset_t at the kernel level */
 SYSCTL_INT(_kern_sched, OID_AUTO, cpusetsize, CTLFLAG_RD | CTLFLAG_CAPRD,
@@ -445,6 +449,7 @@ static struct domainset *
 _domainset_create(struct domainset *domain, struct domainlist *freelist)
 {
 	struct domainset *ndomain;
+	int i, j, max;
 
 	mtx_lock_spin(&cpuset_lock);
 	LIST_FOREACH(ndomain, &cpuset_domains, ds_link)
@@ -457,7 +462,10 @@ _domainset_create(struct domainset *domain, struct dom
 	if (ndomain == NULL) {
 		LIST_INSERT_HEAD(&cpuset_domains, domain, ds_link);
 		domain->ds_cnt = DOMAINSET_COUNT(&domain->ds_mask);
-		domain->ds_max = DOMAINSET_FLS(&domain->ds_mask) + 1;
+		max = DOMAINSET_FLS(&domain->ds_mask) + 1;
+		for (i = 0, j = 0; i < max; i++)
+			if (DOMAINSET_ISSET(i, &domain->ds_mask))
+				domain->ds_order[j++] = i;
 	}
 	mtx_unlock_spin(&cpuset_lock);
 	if (ndomain == NULL)
@@ -473,11 +481,24 @@ _domainset_create(struct domainset *domain, struct dom
 /*
  * Create or lookup a domainset based on the key held in 'domain'.
  */
-static struct domainset *
+struct domainset *
 domainset_create(const struct domainset *domain)
 {
 	struct domainset *ndomain;
 
+	/*
+	 * Validate the policy.  It must specify a useable policy number with
+	 * only valid domains.  Preferred must include the preferred domain
+	 * in the mask.
+	 */
+	if (domain->ds_policy <= DOMAINSET_POLICY_INVALID ||
+	    domain->ds_policy > DOMAINSET_POLICY_MAX)
+		return (NULL);
+	if (domain->ds_policy == DOMAINSET_POLICY_PREFER &&
+	    !DOMAINSET_ISSET(domain->ds_prefer, &domain->ds_mask))
+		return (NULL);
+	if (!DOMAINSET_SUBSET(&domainset0.ds_mask, &domain->ds_mask))
+		return (NULL);
 	ndomain = uma_zalloc(domainset_zone, M_WAITOK | M_ZERO);
 	domainset_copy(domain, ndomain);
 	return _domainset_create(ndomain, NULL);
@@ -507,7 +528,7 @@ domainset_notify(void)
 		PROC_UNLOCK(p);
 	}
 	sx_sunlock(&allproc_lock);
-	kernel_object->domain.dr_policy = cpuset_default->cs_domain;
+	kernel_object->domain.dr_policy = cpuset_kernel->cs_domain;
 }
 
 /*
@@ -1128,6 +1149,55 @@ out:
 	return (error);
 }
 
+static int
+bitset_strprint(char *buf, size_t bufsiz, const struct bitset *set, int setlen)
+{
+	size_t bytes;
+	int i, once;
+	char *p;
+
+	once = 0;
+	p = buf;
+	for (i = 0; i < __bitset_words(setlen); i++) {
+		if (once != 0) {
+			if (bufsiz < 1)
+				return (0);
+			*p = ',';
+			p++;
+			bufsiz--;
+		} else
+			once = 1;
+		if (bufsiz < sizeof(__STRING(ULONG_MAX)))
+			return (0);
+		bytes = snprintf(p, bufsiz, "%lx", set->__bits[i]);
+		p += bytes;
+		bufsiz -= bytes;
+	}
+	return (p - buf);
+}
+
+static int
+bitset_strscan(struct bitset *set, int setlen, const char *buf)
+{
+	int i, ret;
+	const char *p;
+
+	BIT_ZERO(setlen, set);
+	p = buf;
+	for (i = 0; i < __bitset_words(setlen); i++) {
+		if (*p == ',') {
+			p++;
+			continue;
+		}
+		ret = sscanf(p, "%lx", &set->__bits[i]);
+		if (ret == 0 || ret == -1)
+			break;
+		while (isxdigit(*p))
+			p++;
+	}
+	return (p - buf);
+}
+
 /*
  * Return a string representing a valid layout for a cpuset_t object.
  * It expects an incoming buffer at least sized as CPUSETBUFSIZ.
@@ -1135,19 +1205,9 @@ out:
 char *
 cpusetobj_strprint(char *buf, const cpuset_t *set)
 {
-	char *tbuf;
-	size_t i, bytesp, bufsiz;
 
-	tbuf = buf;
-	bytesp = 0;
-	bufsiz = CPUSETBUFSIZ;
-
-	for (i = 0; i < (_NCPUWORDS - 1); i++) {
-		bytesp = snprintf(tbuf, bufsiz, "%lx,", set->__bits[i]);
-		bufsiz -= bytesp;
-		tbuf += bytesp;
-	}
-	snprintf(tbuf, bufsiz, "%lx", set->__bits[_NCPUWORDS - 1]);
+	bitset_strprint(buf, CPUSETBUFSIZ, (const struct bitset *)set,
+	    CPU_SETSIZE);
 	return (buf);
 }
 
@@ -1158,37 +1218,71 @@ cpusetobj_strprint(char *buf, const cpuset_t *set)
 int
 cpusetobj_strscan(cpuset_t *set, const char *buf)
 {
-	u_int nwords;
-	int i, ret;
+	char p;
 
 	if (strlen(buf) > CPUSETBUFSIZ - 1)
 		return (-1);
 
-	/* Allow to pass a shorter version of the mask when necessary. */
-	nwords = 1;
-	for (i = 0; buf[i] != '\0'; i++)
-		if (buf[i] == ',')
-			nwords++;
-	if (nwords > _NCPUWORDS)
+	p = buf[bitset_strscan((struct bitset *)set, CPU_SETSIZE, buf)];
+	if (p != '\0')
 		return (-1);
 
-	CPU_ZERO(set);
-	for (i = 0; i < (nwords - 1); i++) {
-		ret = sscanf(buf, "%lx,", &set->__bits[i]);
-		if (ret == 0 || ret == -1)
-			return (-1);
-		buf = strstr(buf, ",");
-		if (buf == NULL)
-			return (-1);
-		buf++;
-	}
-	ret = sscanf(buf, "%lx", &set->__bits[nwords - 1]);
-	if (ret == 0 || ret == -1)
-		return (-1);
 	return (0);
 }
 
 /*
+ * Handle a domainset specifier in the sysctl tree.  A pointer to a pointer to
+ * a domainset is in arg1.  If the user specifies a valid domainset the
+ * pointer is updated.
+ *
+ * Format is:
+ * hex mask word 0,hex mask word 1,...:decimal policy:decimal preferred
+ */
+int
+sysctl_handle_domainset(SYSCTL_HANDLER_ARGS)
+{
+	char buf[DOMAINSETBUFSIZ];
+	struct domainset *dset;
+	struct domainset key;
+	int policy, prefer, error;
+	char *p;
+
+	dset = *(struct domainset **)arg1;
+	error = 0;
+
+	if (dset != NULL) {
+		p = buf + bitset_strprint(buf, DOMAINSETBUFSIZ,
+		    (const struct bitset *)&dset->ds_mask, DOMAINSET_SETSIZE);
+		sprintf(p, ":%d:%d", dset->ds_policy, dset->ds_prefer);
+	} else
+		sprintf(buf, "<NULL>");
+	error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
+	if (error != 0 || req->newptr == NULL)
+		return (error);
+
+	/*
+	 * Read in and validate the string.
+	 */
+	memset(&key, 0, sizeof(key));
+	p = &buf[bitset_strscan((struct bitset *)&key.ds_mask,
+	    DOMAINSET_SETSIZE, buf)];
+	if (p == buf)
+		return (EINVAL);
+	if (sscanf(p, ":%d:%d", &policy, &prefer) != 2)
+		return (EINVAL);
+	key.ds_policy = policy;
+	key.ds_prefer = prefer;
+
+	/* domainset_create() validates the policy. */
+	dset = domainset_create(&key);
+	if (dset == NULL)
+		return (EINVAL);
+	*(struct domainset **)arg1 = dset;
+
+	return (error);
+}
+
+/*
  * Apply an anonymous mask or a domain to a single thread.
  */
 static int
@@ -1239,95 +1333,19 @@ cpuset_setthread(lwpid_t id, cpuset_t *mask)
 int
 cpuset_setithread(lwpid_t id, int cpu)
 {
-	struct setlist cpusets;
-	struct cpuset *nset, *rset;
-	struct cpuset *parent, *old_set;
-	struct thread *td;
-	struct proc *p;
-	cpusetid_t cs_id;
 	cpuset_t mask;
-	int error;
 
-	cpuset_freelist_init(&cpusets, 1);
-	rset = uma_zalloc(cpuset_zone, M_WAITOK | M_ZERO);
-	cs_id = CPUSET_INVALID;
-
 	CPU_ZERO(&mask);
 	if (cpu == NOCPU)
 		CPU_COPY(cpuset_root, &mask);
 	else
 		CPU_SET(cpu, &mask);
-
-	error = cpuset_which(CPU_WHICH_TID, id, &p, &td, &old_set);
-	if (error != 0 || ((cs_id = alloc_unr(cpuset_unr)) == CPUSET_INVALID))
-		goto out;
-
-	/* cpuset_which() returns with PROC_LOCK held. */
-	old_set = td->td_cpuset;
-
-	if (cpu == NOCPU) {
-		nset = LIST_FIRST(&cpusets);
-		LIST_REMOVE(nset, cs_link);
-
-		/*
-		 * roll back to default set. We're not using cpuset_shadow()
-		 * here because we can fail CPU_SUBSET() check. This can happen
-		 * if default set does not contain all CPUs.
-		 */
-		error = _cpuset_create(nset, cpuset_default, &mask, NULL,
-		    CPUSET_INVALID);
-
-		goto applyset;
-	}
-
-	if (old_set->cs_id == 1 || (old_set->cs_id == CPUSET_INVALID &&
-	    old_set->cs_parent->cs_id == 1)) {
-
-		/*
-		 * Current set is either default (1) or
-		 * shadowed version of default set.
-		 *
-		 * Allocate new root set to be able to shadow it
-		 * with any mask.
-		 */
-		error = _cpuset_create(rset, cpuset_zero,
-		    &cpuset_zero->cs_mask, NULL, cs_id);
-		if (error != 0) {
-			PROC_UNLOCK(p);
-			goto out;
-		}
-		rset->cs_flags |= CPU_SET_ROOT;
-		parent = rset;
-		rset = NULL;
-		cs_id = CPUSET_INVALID;
-	} else {
-		/* Assume existing set was already allocated by previous call */
-		parent = old_set;
-		old_set = NULL;
-	}
-
-	error = cpuset_shadow(parent, &nset, &mask, NULL, &cpusets, NULL);
-applyset:
-	if (error == 0) {
-		thread_lock(td);
-		old_set = cpuset_update_thread(td, nset);
-		thread_unlock(td);
-	} else
-		old_set = NULL;
-	PROC_UNLOCK(p);
-	if (old_set != NULL)
-		cpuset_rel(old_set);
-out:
-	cpuset_freelist_free(&cpusets);
-	if (rset != NULL)
-		uma_zfree(cpuset_zone, rset);
-	if (cs_id != CPUSET_INVALID)
-		free_unr(cpuset_unr, cs_id);
-	return (error);
+	return _cpuset_setthread(id, &mask, NULL);
 }
 
-static struct domainset domainset0;
-
+/*
+ * Create the domainset for cpuset 0, 1 and cpuset 2.
+ */
 void
 domainset_zero(void)
 {
@@ -1340,14 +1358,17 @@ domainset_zero(void)
 	DOMAINSET_ZERO(&dset->ds_mask);
 	for (i = 0; i < vm_ndomains; i++)
 		DOMAINSET_SET(i, &dset->ds_mask);
-	dset->ds_policy = DOMAINSET_POLICY_ROUNDROBIN;
+	dset->ds_policy = DOMAINSET_POLICY_FIRSTTOUCH;
 	dset->ds_prefer = -1;
 	curthread->td_domain.dr_policy = _domainset_create(dset, NULL);
-	kernel_object->domain.dr_policy = curthread->td_domain.dr_policy;
+
+	domainset_copy(dset, &domainset2);
+	domainset2.ds_policy = DOMAINSET_POLICY_INTERLEAVE;
+	kernel_object->domain.dr_policy = _domainset_create(&domainset2, NULL);
 }
 
 /*
- * Creates system-wide cpusets and the cpuset for thread0 including two
+ * Creates system-wide cpusets and the cpuset for thread0 including three
  * sets:
  * 
  * 0 - The root set which should represent all valid processors in the
@@ -1357,6 +1378,8 @@ domainset_zero(void)
  * 1 - The default set which all processes are a member of until changed.
  *     This allows an administrator to move all threads off of given cpus to
  *     dedicate them to high priority tasks or save power etc.
+ * 2 - The kernel set which allows restriction and policy to be applied only
+ *     to kernel threads and the kernel_object.
  */
 struct cpuset *
 cpuset_thread0(void)
@@ -1366,12 +1389,12 @@ cpuset_thread0(void)
 	int i;
 
 	cpuset_zone = uma_zcreate("cpuset", sizeof(struct cpuset), NULL, NULL,
-	    NULL, NULL, UMA_ALIGN_PTR, 0);
+	    NULL, NULL, UMA_ALIGN_CACHE, 0);
 	domainset_zone = uma_zcreate("domainset", sizeof(struct domainset),
-	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
+	    NULL, NULL, NULL, NULL, UMA_ALIGN_CACHE, 0);
 
 	/*
-	 * Create the root system set for the whole machine.  Doesn't use
+	 * Create the root system set (0) for the whole machine.  Doesn't use
 	 * cpuset_create() due to NULL parent.
 	 */
 	set = uma_zalloc(cpuset_zone, M_WAITOK | M_ZERO);
@@ -1385,12 +1408,20 @@ cpuset_thread0(void)
 	cpuset_root = &set->cs_mask;
 
 	/*
-	 * Now derive a default, modifiable set from that to give out.
+	 * Now derive a default (1), modifiable set from that to give out.
 	 */
 	set = uma_zalloc(cpuset_zone, M_WAITOK | M_ZERO);
 	error = _cpuset_create(set, cpuset_zero, NULL, NULL, 1);
 	KASSERT(error == 0, ("Error creating default set: %d\n", error));
 	cpuset_default = set;
+	/*
+	 * Create the kernel set (2).
+	 */
+	set = uma_zalloc(cpuset_zone, M_WAITOK | M_ZERO);
+	error = _cpuset_create(set, cpuset_zero, NULL, NULL, 2);
+	KASSERT(error == 0, ("Error creating kernel set: %d\n", error));
+	set->cs_domain = &domainset2;
+	cpuset_kernel = set;
 
 	/*
 	 * Initialize the unit allocator. 0 and 1 are allocated above.
@@ -1407,9 +1438,21 @@ cpuset_thread0(void)
 	CPU_COPY(&all_cpus, &cpuset_domain[0]);
 domains_set:
 
-	return (set);
+	return (cpuset_default);
 }
 
+void
+cpuset_kernthread(struct thread *td)
+{
+	struct cpuset *set;
+
+	thread_lock(td);
+	set = td->td_cpuset;
+	td->td_cpuset = cpuset_ref(cpuset_kernel);
+	thread_unlock(td);
+	cpuset_rel(set);
+}
+
 /*
  * Create a cpuset, which would be cpuset_create() but
  * mark the new 'set' as root.
@@ -2108,7 +2151,7 @@ out:
 }
 
 #ifdef DDB
-BITSET_DEFINE(bitset, 1);
+
 static void
 ddb_display_bitset(const struct bitset *set, int size)
 {
@@ -2164,9 +2207,8 @@ DB_SHOW_COMMAND(domainsets, db_show_domainsets)
 	struct domainset *set;
 
 	LIST_FOREACH(set, &cpuset_domains, ds_link) {
-		db_printf("set=%p policy %d prefer %d cnt %d max %d\n",
-		    set, set->ds_policy, set->ds_prefer, set->ds_cnt,
-		    set->ds_max);
+		db_printf("set=%p policy %d prefer %d cnt %d\n",
+		    set, set->ds_policy, set->ds_prefer, set->ds_cnt);
 		db_printf("  mask =");
 		ddb_display_domainset(&set->ds_mask);
 		db_printf("\n");

Modified: head/sys/kern/kern_kthread.c
==============================================================================
--- head/sys/kern/kern_kthread.c	Thu Mar 29 02:50:57 2018	(r331722)
+++ head/sys/kern/kern_kthread.c	Thu Mar 29 02:54:50 2018	(r331723)
@@ -131,7 +131,7 @@ kproc_create(void (*func)(void *), void *arg,
 	cpu_fork_kthread_handler(td, func, arg);
 
 	/* Avoid inheriting affinity from a random parent. */
-	cpuset_setthread(td->td_tid, cpuset_root);
+	cpuset_kernthread(td);
 	thread_lock(td);
 	TD_SET_CAN_RUN(td);
 	sched_prio(td, PVM);
@@ -309,7 +309,7 @@ kthread_add(void (*func)(void *), void *arg, struct pr
 	tidhash_add(newtd);
 
 	/* Avoid inheriting affinity from a random parent. */
-	cpuset_setthread(newtd->td_tid, cpuset_root);
+	cpuset_kernthread(newtd);
 
 	/* Delay putting it on the run queue until now. */
 	if (!(flags & RFSTOPPED)) {

Modified: head/sys/sys/_bitset.h
==============================================================================
--- head/sys/sys/_bitset.h	Thu Mar 29 02:50:57 2018	(r331722)
+++ head/sys/sys/_bitset.h	Thu Mar 29 02:54:50 2018	(r331723)
@@ -57,4 +57,10 @@ struct t {								\
  */
 #define BITSET_DEFINE_VAR(t)	BITSET_DEFINE(t, 1)
 
+/*
+ * Define a default type that can be used while manually specifying size
+ * to every call.
+ */
+BITSET_DEFINE(bitset, 1);
+
 #endif /* !_SYS__BITSET_H_ */

Modified: head/sys/sys/cpuset.h
==============================================================================
--- head/sys/sys/cpuset.h	Thu Mar 29 02:50:57 2018	(r331722)
+++ head/sys/sys/cpuset.h	Thu Mar 29 02:54:50 2018	(r331723)
@@ -139,6 +139,7 @@ int	cpuset_create_root(struct prison *, struct cpuset 
 int	cpuset_setproc_update_set(struct proc *, struct cpuset *);
 int	cpuset_which(cpuwhich_t, id_t, struct proc **,
 	    struct thread **, struct cpuset **);
+void	cpuset_kernthread(struct thread *);
 
 char	*cpusetobj_strprint(char *, const cpuset_t *);
 int	cpusetobj_strscan(cpuset_t *, const char *);

Modified: head/sys/sys/domainset.h
==============================================================================
--- head/sys/sys/domainset.h	Thu Mar 29 02:50:57 2018	(r331722)
+++ head/sys/sys/domainset.h	Thu Mar 29 02:54:50 2018	(r331723)
@@ -28,8 +28,8 @@
  * $FreeBSD$
  */
 
-#ifndef _SYS_DOMAINSETSET_H_
-#define	_SYS_DOMAINSETSET_H_
+#ifndef _SYS_DOMAINSET_H_
+#define	_SYS_DOMAINSET_H_
 
 #include <sys/_domainset.h>
 
@@ -38,8 +38,12 @@
 #define	_NDOMAINSETBITS			_BITSET_BITS
 #define	_NDOMAINSETWORDS		__bitset_words(DOMAINSET_SETSIZE)
 
-#define	DOMAINSETSETBUFSIZ	((2 + sizeof(long) * 2) * _NDOMAINSETWORDS)
+#define	DOMAINSETBUFSIZ							\
+	    (((2 + sizeof(long) * 2) * _NDOMAINSETWORDS) +		\
+	    sizeof("::") + sizeof(__XSTRING(DOMAINSET_POLICY_MAX)) +	\
+	    sizeof(__XSTRING(MAXMEMDOM)))
 
+
 #define	DOMAINSET_CLR(n, p)		BIT_CLR(DOMAINSET_SETSIZE, n, p)
 #define	DOMAINSET_COPY(f, t)		BIT_COPY(DOMAINSET_SETSIZE, f, t)
 #define	DOMAINSET_ISSET(n, p)		BIT_ISSET(DOMAINSET_SETSIZE, n, p)
@@ -73,23 +77,37 @@
 #define	DOMAINSET_POLICY_ROUNDROBIN	1
 #define	DOMAINSET_POLICY_FIRSTTOUCH	2
 #define	DOMAINSET_POLICY_PREFER		3
-#define	DOMAINSET_POLICY_MAX		DOMAINSET_POLICY_PREFER
+#define	DOMAINSET_POLICY_INTERLEAVE	4
+#define	DOMAINSET_POLICY_MAX		DOMAINSET_POLICY_INTERLEAVE
 
 #ifdef _KERNEL
-#include <sys/queue.h>
-LIST_HEAD(domainlist, domainset);
+#if MAXMEMDOM < 256
+typedef	uint8_t		domainid_t;
+#else
+typedef uint16_t	domainid_t;
+#endif
 
 struct domainset {
 	LIST_ENTRY(domainset)	ds_link;
 	domainset_t	ds_mask;	/* allowed domains. */
 	uint16_t	ds_policy;	/* Policy type. */
-	int16_t		ds_prefer;	/* Preferred domain or -1. */
-	uint16_t	ds_cnt;		/* popcnt from above. */
-	uint16_t	ds_max;		/* Maximum domain in set. */
+	domainid_t	ds_prefer;	/* Preferred domain or -1. */
+	domainid_t	ds_cnt;		/* popcnt from above. */
+	domainid_t	ds_order[MAXMEMDOM];  /* nth domain table. */
 };
 
 void domainset_zero(void);
 
+/*
+ * Add a domainset to the system based on a key initializing policy, prefer,
+ * and mask.  Do not create and directly use domainset structures.  The
+ * returned value will not match the key pointer.
+ */
+struct domainset *domainset_create(const struct domainset *);
+#ifdef _SYS_SYSCTL_H_
+int sysctl_handle_domainset(SYSCTL_HANDLER_ARGS);
+#endif
+
 #else
 __BEGIN_DECLS
 int	cpuset_getdomain(cpulevel_t, cpuwhich_t, id_t, size_t, domainset_t *,
@@ -99,4 +117,4 @@ int	cpuset_setdomain(cpulevel_t, cpuwhich_t, id_t, siz
 
 __END_DECLS
 #endif
-#endif /* !_SYS_DOMAINSETSET_H_ */
+#endif /* !_SYS_DOMAINSET_H_ */

Modified: head/sys/sys/proc.h
==============================================================================
--- head/sys/sys/proc.h	Thu Mar 29 02:50:57 2018	(r331722)
+++ head/sys/sys/proc.h	Thu Mar 29 02:54:50 2018	(r331723)
@@ -67,7 +67,7 @@
 #include <sys/ucontext.h>
 #include <sys/ucred.h>
 #include <sys/types.h>
-#include <sys/domainset.h>
+#include <sys/_domainset.h>
 
 #include <machine/proc.h>		/* Machine-dependent proc substruct. */
 #ifdef _KERNEL

Modified: head/sys/vm/vm_domainset.c
==============================================================================
--- head/sys/vm/vm_domainset.c	Thu Mar 29 02:50:57 2018	(r331722)
+++ head/sys/vm/vm_domainset.c	Thu Mar 29 02:54:50 2018	(r331723)
@@ -56,11 +56,14 @@ __FBSDID("$FreeBSD$");
  * assumed that most allocations are successful.
  */
 
+static int vm_domainset_default_stride = 64;
+
 /*
  * Determine which policy is to be used for this allocation.
  */
 static void
-vm_domainset_iter_domain(struct vm_domainset_iter *di, struct vm_object *obj)
+vm_domainset_iter_init(struct vm_domainset_iter *di, struct vm_object *obj,
+    vm_pindex_t pindex)
 {
 	struct domainset *domain;
 
@@ -76,18 +79,33 @@ vm_domainset_iter_domain(struct vm_domainset_iter *di,
 		di->di_domain = curthread->td_domain.dr_policy;
 		di->di_iter = &curthread->td_domain.dr_iterator;
 	}
+	di->di_policy = di->di_domain->ds_policy;
+	if (di->di_policy == DOMAINSET_POLICY_INTERLEAVE) {
+		if (vm_object_reserv(obj)) {
+			/*
+			 * Color the pindex so we end up on the correct
+			 * reservation boundary.
+			 */
+			pindex += obj->pg_color;
+			pindex >>= VM_LEVEL_0_ORDER;
+		} else
+			pindex /= vm_domainset_default_stride;
+		/*
+		 * Offset pindex so the first page of each object does
+		 * not end up in domain 0.
+		 */
+		if (obj != NULL)
+			pindex += (((uintptr_t)obj) / sizeof(*obj));
+		di->di_offset = pindex;
+	}
 }
 
 static void
 vm_domainset_iter_rr(struct vm_domainset_iter *di, int *domain)
 {
-	int d;
 
-	d = *di->di_iter;
-	do {
-		d = (d + 1) % di->di_domain->ds_max;
-	} while (!DOMAINSET_ISSET(d, &di->di_domain->ds_mask));
-	*di->di_iter = *domain = d;
+	*domain = di->di_domain->ds_order[
+	    ++(*di->di_iter) % di->di_domain->ds_cnt];
 }
 
 static void
@@ -95,27 +113,38 @@ vm_domainset_iter_prefer(struct vm_domainset_iter *di,
 {
 	int d;
 
-	d = *di->di_iter;
 	do {
-		d = (d + 1) % di->di_domain->ds_max;
-	} while (!DOMAINSET_ISSET(d, &di->di_domain->ds_mask) || 
-	    d == di->di_domain->ds_prefer);
-	*di->di_iter = *domain = d;
+		d = di->di_domain->ds_order[
+		    ++(*di->di_iter) % di->di_domain->ds_cnt];
+	} while (d == di->di_domain->ds_prefer);
+	*domain = d;
 }
 
 static void
+vm_domainset_iter_interleave(struct vm_domainset_iter *di, int *domain)
+{
+	int d;
+
+	d = di->di_offset % di->di_domain->ds_cnt;
+	*di->di_iter = d;
+	*domain = di->di_domain->ds_order[d];
+}
+
+static void
 vm_domainset_iter_next(struct vm_domainset_iter *di, int *domain)
 {
 
 	KASSERT(di->di_n > 0,
 	    ("vm_domainset_iter_first: Invalid n %d", di->di_n));
-	switch (di->di_domain->ds_policy) {
+	switch (di->di_policy) {
 	case DOMAINSET_POLICY_FIRSTTOUCH:
 		/*
 		 * To prevent impossible allocations we convert an invalid
 		 * first-touch to round-robin.
 		 */
 		/* FALLTHROUGH */
+	case DOMAINSET_POLICY_INTERLEAVE:
+		/* FALLTHROUGH */
 	case DOMAINSET_POLICY_ROUNDROBIN:
 		vm_domainset_iter_rr(di, domain);
 		break;
@@ -124,7 +153,7 @@ vm_domainset_iter_next(struct vm_domainset_iter *di, i
 		break;
 	default:
 		panic("vm_domainset_iter_first: Unknown policy %d",
-		    di->di_domain->ds_policy);
+		    di->di_policy);
 	}
 	KASSERT(*domain < vm_ndomains,
 	    ("vm_domainset_iter_next: Invalid domain %d", *domain));
@@ -134,11 +163,15 @@ static void
 vm_domainset_iter_first(struct vm_domainset_iter *di, int *domain)
 {
 
-	switch (di->di_domain->ds_policy) {
+	switch (di->di_policy) {
 	case DOMAINSET_POLICY_FIRSTTOUCH:
 		*domain = PCPU_GET(domain);
 		if (DOMAINSET_ISSET(*domain, &di->di_domain->ds_mask)) {
-			di->di_n = 1;
+			/*
+			 * Add an extra iteration because we will visit the
+			 * current domain a second time in the rr iterator.
+			 */
+			di->di_n = di->di_domain->ds_cnt + 1;
 			break;
 		}
 		/*
@@ -154,9 +187,13 @@ vm_domainset_iter_first(struct vm_domainset_iter *di, 
 		*domain = di->di_domain->ds_prefer;
 		di->di_n = di->di_domain->ds_cnt;
 		break;
+	case DOMAINSET_POLICY_INTERLEAVE:
+		vm_domainset_iter_interleave(di, domain);
+		di->di_n = di->di_domain->ds_cnt;
+		break;
 	default:
 		panic("vm_domainset_iter_first: Unknown policy %d",
-		    di->di_domain->ds_policy);
+		    di->di_policy);
 	}
 	KASSERT(di->di_n > 0,
 	    ("vm_domainset_iter_first: Invalid n %d", di->di_n));
@@ -166,10 +203,10 @@ vm_domainset_iter_first(struct vm_domainset_iter *di, 
 
 void
 vm_domainset_iter_page_init(struct vm_domainset_iter *di, struct vm_object *obj,
-    int *domain, int *req)
+    vm_pindex_t pindex, int *domain, int *req)
 {
 
-	vm_domainset_iter_domain(di, obj);
+	vm_domainset_iter_init(di, obj, pindex);
 	di->di_flags = *req;
 	*req = (di->di_flags & ~(VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL)) |
 	    VM_ALLOC_NOWAIT;
@@ -213,7 +250,9 @@ vm_domainset_iter_malloc_init(struct vm_domainset_iter
     struct vm_object *obj, int *domain, int *flags)
 {
 
-	vm_domainset_iter_domain(di, obj);
+	vm_domainset_iter_init(di, obj, 0);
+	if (di->di_policy == DOMAINSET_POLICY_INTERLEAVE)
+		di->di_policy = DOMAINSET_POLICY_ROUNDROBIN;
 	di->di_flags = *flags;
 	*flags = (di->di_flags & ~M_WAITOK) | M_NOWAIT;
 	vm_domainset_iter_first(di, domain);
@@ -253,7 +292,7 @@ vm_domainset_iter_page(struct vm_domainset_iter *di, i
 
 void
 vm_domainset_iter_page_init(struct vm_domainset_iter *di,
-            struct vm_object *obj, int *domain, int *flags)
+            struct vm_object *obj, vm_pindex_t pindex, int *domain, int *flags)
 {
 
 	*domain = 0;

Modified: head/sys/vm/vm_domainset.h
==============================================================================
--- head/sys/vm/vm_domainset.h	Thu Mar 29 02:50:57 2018	(r331722)
+++ head/sys/vm/vm_domainset.h	Thu Mar 29 02:54:50 2018	(r331723)
@@ -33,13 +33,15 @@
 struct vm_domainset_iter {
 	struct domainset	*di_domain;
 	int			*di_iter;
+	vm_pindex_t		di_offset;
+	int			di_policy;
 	int			di_flags;
 	int			di_n;
 };
 
 int	vm_domainset_iter_page(struct vm_domainset_iter *, int *, int *);
 void	vm_domainset_iter_page_init(struct vm_domainset_iter *,
-	    struct vm_object *, int *, int *);
+	    struct vm_object *, vm_pindex_t, int *, int *);
 int	vm_domainset_iter_malloc(struct vm_domainset_iter *, int *, int *);
 void	vm_domainset_iter_malloc_init(struct vm_domainset_iter *,
 	    struct vm_object *, int *, int *);

Modified: head/sys/vm/vm_page.c
==============================================================================
--- head/sys/vm/vm_page.c	Thu Mar 29 02:50:57 2018	(r331722)
+++ head/sys/vm/vm_page.c	Thu Mar 29 02:54:50 2018	(r331723)
@@ -1660,7 +1660,7 @@ vm_page_alloc_after(vm_object_t object, vm_pindex_t pi
 	vm_page_t m;
 	int domain;
 
-	vm_domainset_iter_page_init(&di, object, &domain, &req);
+	vm_domainset_iter_page_init(&di, object, pindex, &domain, &req);
 	do {
 		m = vm_page_alloc_domain_after(object, pindex, domain, req,
 		    mpred);
@@ -1893,7 +1893,7 @@ vm_page_alloc_contig(vm_object_t object, vm_pindex_t p
 	vm_page_t m;
 	int domain;
 
-	vm_domainset_iter_page_init(&di, object, &domain, &req);
+	vm_domainset_iter_page_init(&di, object, pindex, &domain, &req);
 	do {
 		m = vm_page_alloc_contig_domain(object, pindex, domain, req,
 		    npages, low, high, alignment, boundary, memattr);
@@ -2092,7 +2092,7 @@ vm_page_alloc_freelist(int freelist, int req)
 	vm_page_t m;
 	int domain;
 
-	vm_domainset_iter_page_init(&di, kernel_object, &domain, &req);
+	vm_domainset_iter_page_init(&di, NULL, 0, &domain, &req);
 	do {
 		m = vm_page_alloc_freelist_domain(domain, freelist, req);
 		if (m != NULL)
@@ -2691,7 +2691,7 @@ vm_page_reclaim_contig(int req, u_long npages, vm_padd
 	int domain;
 	bool ret;
 
-	vm_domainset_iter_page_init(&di, kernel_object, &domain, &req);
+	vm_domainset_iter_page_init(&di, NULL, 0, &domain, &req);
 	do {
 		ret = vm_page_reclaim_contig_domain(domain, req, npages, low,
 		    high, alignment, boundary);

Modified: head/sys/vm/vnode_pager.c
==============================================================================
--- head/sys/vm/vnode_pager.c	Thu Mar 29 02:50:57 2018	(r331722)
+++ head/sys/vm/vnode_pager.c	Thu Mar 29 02:54:50 2018	(r331723)
@@ -59,6 +59,7 @@ __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
+#include <sys/sysctl.h>
 #include <sys/proc.h>
 #include <sys/vnode.h>
 #include <sys/mount.h>
@@ -69,6 +70,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/conf.h>
 #include <sys/rwlock.h>
 #include <sys/sf_buf.h>

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201803290254.w2T2spJB077720>