Skip site navigation (1)Skip section navigation (2)
Date:      Fri, 12 Jan 2018 22:48:23 +0000 (UTC)
From:      Jeff Roberson <jeff@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject:   svn commit: r327895 - in head: lib/libc/sys sys/arm/arm sys/compat/freebsd32 sys/conf sys/ddb sys/kern sys/netpfil/ipfw sys/sys sys/vm sys/x86/acpica usr.bin/cpuset usr.bin/numactl
Message-ID:  <201801122248.w0CMmNdF082700@repo.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: jeff
Date: Fri Jan 12 22:48:23 2018
New Revision: 327895
URL: https://svnweb.freebsd.org/changeset/base/327895

Log:
  Implement 'domainset', a cpuset based NUMA policy mechanism.  This allows
  userspace to control NUMA policy administratively and programmatically.
  
  Implement domainset based iterators in the page layer.
  
  Remove the now legacy numa_* syscalls.
  
  Cleanup some header pollution created by having seq.h in proc.h.
  
  Reviewed by:	markj, kib
  Discussed with:	alc
  Tested by:	pho
  Sponsored by:	Netflix, Dell/EMC Isilon
  Differential Revision:	https://reviews.freebsd.org/D13403

Deleted:
  head/sys/kern/kern_numa.c
  head/sys/sys/_vm_domain.h
  head/sys/vm/vm_domain.c
  head/sys/vm/vm_domain.h
Modified:
  head/lib/libc/sys/Symbol.map
  head/sys/arm/arm/machdep_ptrace.c
  head/sys/compat/freebsd32/freebsd32_misc.c
  head/sys/compat/freebsd32/syscalls.master
  head/sys/conf/files
  head/sys/ddb/db_run.c
  head/sys/kern/init_main.c
  head/sys/kern/init_sysent.c
  head/sys/kern/kern_cpuset.c
  head/sys/kern/kern_exit.c
  head/sys/kern/kern_fork.c
  head/sys/kern/kern_thr.c
  head/sys/kern/kern_thread.c
  head/sys/kern/makesyscalls.sh
  head/sys/kern/sched_4bsd.c
  head/sys/kern/sched_ule.c
  head/sys/kern/subr_kdb.c
  head/sys/kern/syscalls.master
  head/sys/netpfil/ipfw/dn_sched_fq_codel.c
  head/sys/sys/cpuset.h
  head/sys/sys/proc.h
  head/sys/sys/syscallsubr.h
  head/sys/vm/vm_fault.c
  head/sys/vm/vm_object.c
  head/sys/vm/vm_object.h
  head/sys/vm/vm_page.c
  head/sys/vm/vm_page.h
  head/sys/vm/vm_phys.c
  head/sys/vm/vm_phys.h
  head/sys/x86/acpica/srat.c
  head/usr.bin/cpuset/cpuset.c
  head/usr.bin/numactl/numactl.c

Modified: head/lib/libc/sys/Symbol.map
==============================================================================
--- head/lib/libc/sys/Symbol.map	Fri Jan 12 21:50:18 2018	(r327894)
+++ head/lib/libc/sys/Symbol.map	Fri Jan 12 22:48:23 2018	(r327895)
@@ -398,6 +398,8 @@ FBSD_1.5 {
 	mknodat;
 	stat;
 	statfs;
+	cpuset_getdomain;
+	cpuset_setdomain;
 };
 
 FBSDprivate_1.0 {
@@ -1022,4 +1024,8 @@ FBSDprivate_1.0 {
 	gssd_syscall;
 	__libc_interposing_slot;
 	__libc_sigwait;
+	_cpuset_getdomain;
+	__sys_cpuset_getdomain;
+	_cpuset_setdomain;
+	__sys_cpuset_setdomain;
 };

Modified: head/sys/arm/arm/machdep_ptrace.c
==============================================================================
--- head/sys/arm/arm/machdep_ptrace.c	Fri Jan 12 21:50:18 2018	(r327894)
+++ head/sys/arm/arm/machdep_ptrace.c	Fri Jan 12 22:48:23 2018	(r327895)
@@ -32,6 +32,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/param.h>
 #include <sys/proc.h>
 #include <sys/ptrace.h>
+#include <sys/lock.h>
 #include <sys/mutex.h>
 
 #include <machine/machdep.h>

Modified: head/sys/compat/freebsd32/freebsd32_misc.c
==============================================================================
--- head/sys/compat/freebsd32/freebsd32_misc.c	Fri Jan 12 21:50:18 2018	(r327894)
+++ head/sys/compat/freebsd32/freebsd32_misc.c	Fri Jan 12 22:48:23 2018	(r327895)
@@ -3017,6 +3017,24 @@ freebsd32_cpuset_setaffinity(struct thread *td,
 }
 
 int
+freebsd32_cpuset_getdomain(struct thread *td,
+    struct freebsd32_cpuset_getdomain_args *uap)
+{
+
+	return (kern_cpuset_getdomain(td, uap->level, uap->which,
+	    PAIR32TO64(id_t,uap->id), uap->domainsetsize, uap->mask, uap->policy));
+}
+
+int
+freebsd32_cpuset_setdomain(struct thread *td,
+    struct freebsd32_cpuset_setdomain_args *uap)
+{
+
+	return (kern_cpuset_setdomain(td, uap->level, uap->which,
+	    PAIR32TO64(id_t,uap->id), uap->domainsetsize, uap->mask, uap->policy));
+}
+
+int
 freebsd32_nmount(struct thread *td,
     struct freebsd32_nmount_args /* {
     	struct iovec *iovp;

Modified: head/sys/compat/freebsd32/syscalls.master
==============================================================================
--- head/sys/compat/freebsd32/syscalls.master	Fri Jan 12 21:50:18 2018	(r327894)
+++ head/sys/compat/freebsd32/syscalls.master	Fri Jan 12 22:48:23 2018	(r327895)
@@ -1086,12 +1086,8 @@
 547	AUE_FUTIMESAT	STD	{ int freebsd32_utimensat(int fd, \
 				    char *path, \
 				    struct timespec *times, int flag); }
-548	AUE_NULL	NOPROTO	{ int numa_getaffinity(cpuwhich_t which, \
-				    id_t id, \
-				    struct vm_domain_policy *policy); }
-549	AUE_NULL	NOPROTO	{ int numa_setaffinity(cpuwhich_t which, \
-				    id_t id, \
-				    const struct vm_domain_policy *policy); }
+548	AUE_NULL	UNIMPL	numa_getaffinity
+549	AUE_NULL	UNIMPL	numa_setaffinity
 550	AUE_FSYNC	NOPROTO	{ int fdatasync(int fd); }
 551	AUE_FSTAT	STD	{ int freebsd32_fstat(int fd, \
 				    struct stat32 *ub); }
@@ -1119,4 +1115,13 @@
 				    struct kevent32 *eventlist, \
 				    int nevents, \
 				    const struct timespec32 *timeout); }
+561	AUE_NULL	STD	{ int freebsd32_cpuset_getdomain(cpulevel_t level, \
+				    cpuwhich_t which, uint32_t id1, uint32_t id2, \
+				    size_t domainsetsize, domainset_t *mask, \
+				    int *policy); }
+562	AUE_NULL	STD	{ int freebsd32_cpuset_setdomain(cpulevel_t level, \
+				    cpuwhich_t which, uint32_t id1, uint32_t id2, \
+				    size_t domainsetsize, domainset_t *mask, \
+				    int policy); }
+
 ; vim: syntax=off

Modified: head/sys/conf/files
==============================================================================
--- head/sys/conf/files	Fri Jan 12 21:50:18 2018	(r327894)
+++ head/sys/conf/files	Fri Jan 12 22:48:23 2018	(r327895)
@@ -3787,7 +3787,6 @@ kern/kern_module.c		standard
 kern/kern_mtxpool.c		standard
 kern/kern_mutex.c		standard
 kern/kern_ntptime.c		standard
-kern/kern_numa.c		standard
 kern/kern_osd.c			standard
 kern/kern_physio.c		standard
 kern/kern_pmc.c			standard
@@ -4837,7 +4836,7 @@ vm/swap_pager.c			standard
 vm/uma_core.c			standard
 vm/uma_dbg.c			standard
 vm/memguard.c			optional DEBUG_MEMGUARD
-vm/vm_domain.c			standard
+vm/vm_domainset.c		standard
 vm/vm_fault.c			standard
 vm/vm_glue.c			standard
 vm/vm_init.c			standard

Modified: head/sys/ddb/db_run.c
==============================================================================
--- head/sys/ddb/db_run.c	Fri Jan 12 21:50:18 2018	(r327894)
+++ head/sys/ddb/db_run.c	Fri Jan 12 22:48:23 2018	(r327895)
@@ -40,6 +40,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/param.h>
 #include <sys/kdb.h>
 #include <sys/proc.h>
+#include <sys/systm.h>
 
 #include <machine/kdb.h>
 #include <machine/pcb.h>

Modified: head/sys/kern/init_main.c
==============================================================================
--- head/sys/kern/init_main.c	Fri Jan 12 21:50:18 2018	(r327894)
+++ head/sys/kern/init_main.c	Fri Jan 12 22:48:23 2018	(r327895)
@@ -89,7 +89,6 @@ __FBSDID("$FreeBSD$");
 #include <vm/vm_param.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
-#include <vm/vm_domain.h>
 #include <sys/copyright.h>
 
 #include <ddb/ddb.h>
@@ -497,10 +496,7 @@ proc0_init(void *dummy __unused)
 	td->td_flags = TDF_INMEM;
 	td->td_pflags = TDP_KTHREAD;
 	td->td_cpuset = cpuset_thread0();
-	vm_domain_policy_init(&td->td_vm_dom_policy);
-	vm_domain_policy_set(&td->td_vm_dom_policy, VM_POLICY_NONE, -1);
-	vm_domain_policy_init(&p->p_vm_dom_policy);
-	vm_domain_policy_set(&p->p_vm_dom_policy, VM_POLICY_NONE, -1);
+	td->td_domain.dr_policy = td->td_cpuset->cs_domain;
 	prison0_init();
 	p->p_peers = 0;
 	p->p_leader = p;

Modified: head/sys/kern/init_sysent.c
==============================================================================
--- head/sys/kern/init_sysent.c	Fri Jan 12 21:50:18 2018	(r327894)
+++ head/sys/kern/init_sysent.c	Fri Jan 12 22:48:23 2018	(r327895)
@@ -599,8 +599,8 @@ struct sysent sysent[] = {
 	{ AS(ppoll_args), (sy_call_t *)sys_ppoll, AUE_POLL, NULL, 0, 0, 0, SY_THR_STATIC },	/* 545 = ppoll */
 	{ AS(futimens_args), (sy_call_t *)sys_futimens, AUE_FUTIMES, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC },	/* 546 = futimens */
 	{ AS(utimensat_args), (sy_call_t *)sys_utimensat, AUE_FUTIMESAT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC },	/* 547 = utimensat */
-	{ AS(numa_getaffinity_args), (sy_call_t *)sys_numa_getaffinity, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC },	/* 548 = numa_getaffinity */
-	{ AS(numa_setaffinity_args), (sy_call_t *)sys_numa_setaffinity, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC },	/* 549 = numa_setaffinity */
+	{ 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT },			/* 548 = numa_getaffinity */
+	{ 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT },			/* 549 = numa_setaffinity */
 	{ AS(fdatasync_args), (sy_call_t *)sys_fdatasync, AUE_FSYNC, NULL, 0, 0, 0, SY_THR_STATIC },	/* 550 = fdatasync */
 	{ AS(fstat_args), (sy_call_t *)sys_fstat, AUE_FSTAT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC },	/* 551 = fstat */
 	{ AS(fstatat_args), (sy_call_t *)sys_fstatat, AUE_FSTATAT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC },	/* 552 = fstatat */
@@ -612,4 +612,6 @@ struct sysent sysent[] = {
 	{ AS(fhstatfs_args), (sy_call_t *)sys_fhstatfs, AUE_FHSTATFS, NULL, 0, 0, 0, SY_THR_STATIC },	/* 558 = fhstatfs */
 	{ AS(mknodat_args), (sy_call_t *)sys_mknodat, AUE_MKNODAT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC },	/* 559 = mknodat */
 	{ AS(kevent_args), (sy_call_t *)sys_kevent, AUE_KEVENT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC },	/* 560 = kevent */
+	{ AS(cpuset_getdomain_args), (sy_call_t *)sys_cpuset_getdomain, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC },	/* 561 = cpuset_getdomain */
+	{ AS(cpuset_setdomain_args), (sy_call_t *)sys_cpuset_setdomain, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC },	/* 562 = cpuset_setdomain */
 };

Modified: head/sys/kern/kern_cpuset.c
==============================================================================
--- head/sys/kern/kern_cpuset.c	Fri Jan 12 21:50:18 2018	(r327894)
+++ head/sys/kern/kern_cpuset.c	Fri Jan 12 22:48:23 2018	(r327895)
@@ -51,17 +51,21 @@ __FBSDID("$FreeBSD$");
 #include <sys/syscallsubr.h>
 #include <sys/capsicum.h>
 #include <sys/cpuset.h>
+#include <sys/domainset.h>
 #include <sys/sx.h>
 #include <sys/queue.h>
 #include <sys/libkern.h>
 #include <sys/limits.h>
 #include <sys/bus.h>
 #include <sys/interrupt.h>
+#include <sys/vmmeter.h>
 
 #include <vm/uma.h>
 #include <vm/vm.h>
+#include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_param.h>
+#include <vm/vm_phys.h>
 
 #ifdef DDB
 #include <ddb/ddb.h>
@@ -109,8 +113,10 @@ __FBSDID("$FreeBSD$");
  * getaffinity call using (CPU_LEVEL_CPUSET, CPU_WHICH_PID, -1, ...).
  */
 static uma_zone_t cpuset_zone;
+static uma_zone_t domainset_zone;
 static struct mtx cpuset_lock;
 static struct setlist cpuset_ids;
+static struct domainlist cpuset_domains;
 static struct unrhdr *cpuset_unr;
 static struct cpuset *cpuset_zero, *cpuset_default;
 
@@ -121,7 +127,33 @@ SYSCTL_INT(_kern_sched, OID_AUTO, cpusetsize, CTLFLAG_
 cpuset_t *cpuset_root;
 cpuset_t cpuset_domain[MAXMEMDOM];
 
+static int domainset_valid(const struct domainset *, const struct domainset *);
+
 /*
+ * Find the first non-anonymous set starting from 'set'.
+ */
+static struct cpuset *
+cpuset_getbase(struct cpuset *set)
+{
+
+	if (set->cs_id == CPUSET_INVALID)
+		set = set->cs_parent;
+	return (set);
+}
+
+/*
+ * Walks up the tree from 'set' to find the root.
+ */
+static struct cpuset *
+cpuset_getroot(struct cpuset *set)
+{
+
+	while ((set->cs_flags & CPU_SET_ROOT) == 0 && set->cs_parent != NULL)
+		set = set->cs_parent;
+	return (set);
+}
+
+/*
  * Acquire a reference to a cpuset, all pointers must be tracked with refs.
  */
 struct cpuset *
@@ -140,12 +172,7 @@ static struct cpuset *
 cpuset_refroot(struct cpuset *set)
 {
 
-	for (; set->cs_parent != NULL; set = set->cs_parent)
-		if (set->cs_flags & CPU_SET_ROOT)
-			break;
-	cpuset_ref(set);
-
-	return (set);
+	return (cpuset_ref(cpuset_getroot(set)));
 }
 
 /*
@@ -157,11 +184,7 @@ static struct cpuset *
 cpuset_refbase(struct cpuset *set)
 {
 
-	if (set->cs_id == CPUSET_INVALID)
-		set = set->cs_parent;
-	cpuset_ref(set);
-
-	return (set);
+	return (cpuset_ref(cpuset_getbase(set)));
 }
 
 /*
@@ -257,17 +280,25 @@ cpuset_lookup(cpusetid_t setid, struct thread *td)
  * will have no valid cpu based on restrictions from the parent.
  */
 static int
-_cpuset_create(struct cpuset *set, struct cpuset *parent, const cpuset_t *mask,
-    cpusetid_t id)
+_cpuset_create(struct cpuset *set, struct cpuset *parent,
+    const cpuset_t *mask, struct domainset *domain, cpusetid_t id)
 {
 
+	if (domain == NULL)
+		domain = parent->cs_domain;
+	if (mask == NULL)
+		mask = &parent->cs_mask;
 	if (!CPU_OVERLAP(&parent->cs_mask, mask))
 		return (EDEADLK);
+	/* The domain must be prepared ahead of time. */
+	if (!domainset_valid(parent->cs_domain, domain))
+		return (EDEADLK);
 	CPU_COPY(mask, &set->cs_mask);
 	LIST_INIT(&set->cs_children);
 	refcount_init(&set->cs_ref, 1);
 	set->cs_flags = 0;
 	mtx_lock_spin(&cpuset_lock);
+	set->cs_domain = domain;
 	CPU_AND(&set->cs_mask, &parent->cs_mask);
 	set->cs_id = id;
 	set->cs_parent = cpuset_ref(parent);
@@ -294,8 +325,8 @@ cpuset_create(struct cpuset **setp, struct cpuset *par
 	id = alloc_unr(cpuset_unr);
 	if (id == -1)
 		return (ENFILE);
-	*setp = set = uma_zalloc(cpuset_zone, M_WAITOK);
-	error = _cpuset_create(set, parent, mask, id);
+	*setp = set = uma_zalloc(cpuset_zone, M_WAITOK | M_ZERO);
+	error = _cpuset_create(set, parent, mask, NULL, id);
 	if (error == 0)
 		return (0);
 	free_unr(cpuset_unr, id);
@@ -304,7 +335,207 @@ cpuset_create(struct cpuset **setp, struct cpuset *par
 	return (error);
 }
 
+static void
+cpuset_freelist_add(struct setlist *list, int count)
+{
+	struct cpuset *set;
+	int i;
+
+	for (i = 0; i < count; i++) {
+		set = uma_zalloc(cpuset_zone, M_ZERO | M_WAITOK);
+		LIST_INSERT_HEAD(list, set, cs_link);
+	}
+}
+
+static void
+cpuset_freelist_init(struct setlist *list, int count)
+{
+
+	LIST_INIT(list);
+	cpuset_freelist_add(list, count);
+}
+
+static void
+cpuset_freelist_free(struct setlist *list)
+{
+	struct cpuset *set;
+
+	while ((set = LIST_FIRST(list)) != NULL) {
+		LIST_REMOVE(set, cs_link);
+		uma_zfree(cpuset_zone, set);
+	}
+}
+
+static void
+domainset_freelist_add(struct domainlist *list, int count)
+{
+	struct domainset *set;
+	int i;
+
+	for (i = 0; i < count; i++) {
+		set = uma_zalloc(domainset_zone, M_ZERO | M_WAITOK);
+		LIST_INSERT_HEAD(list, set, ds_link);
+	}
+}
+
+static void
+domainset_freelist_init(struct domainlist *list, int count)
+{
+
+	LIST_INIT(list);
+	domainset_freelist_add(list, count);
+}
+
+static void
+domainset_freelist_free(struct domainlist *list)
+{
+	struct domainset *set;
+
+	while ((set = LIST_FIRST(list)) != NULL) {
+		LIST_REMOVE(set, ds_link);
+		uma_zfree(domainset_zone, set);
+	}
+}
+
+/* Copy a domainset preserving mask and policy. */
+static void
+domainset_copy(const struct domainset *from, struct domainset *to)
+{
+
+	DOMAINSET_COPY(&from->ds_mask, &to->ds_mask);
+	to->ds_policy = from->ds_policy;
+	to->ds_prefer = from->ds_prefer;
+}
+
+/* Return 1 if mask and policy are equal, otherwise 0. */
+static int
+domainset_equal(const struct domainset *one, const struct domainset *two)
+{
+
+	return (DOMAINSET_CMP(&one->ds_mask, &two->ds_mask) == 0 &&
+	    one->ds_policy == two->ds_policy &&
+	    one->ds_prefer == two->ds_prefer);
+}
+
+/* Return 1 if child is a valid subset of parent. */
+static int
+domainset_valid(const struct domainset *parent, const struct domainset *child)
+{
+	if (child->ds_policy != DOMAINSET_POLICY_PREFER)
+		return (DOMAINSET_SUBSET(&parent->ds_mask, &child->ds_mask));
+	return (DOMAINSET_ISSET(child->ds_prefer, &parent->ds_mask));
+}
+
+static int
+domainset_restrict(const struct domainset *parent,
+    const struct domainset *child)
+{
+	if (child->ds_policy != DOMAINSET_POLICY_PREFER)
+		return (DOMAINSET_OVERLAP(&parent->ds_mask, &child->ds_mask));
+	return (DOMAINSET_ISSET(child->ds_prefer, &parent->ds_mask));
+}
+
 /*
+ * Lookup or create a domainset.  The key is provided in ds_mask and
+ * ds_policy.  If the domainset does not yet exist the storage in
+ * 'domain' is used to insert.  Otherwise this storage is freed to the
+ * domainset_zone and the existing domainset is returned.
+ */
+static struct domainset *
+_domainset_create(struct domainset *domain, struct domainlist *freelist)
+{
+	struct domainset *ndomain;
+
+	mtx_lock_spin(&cpuset_lock);
+	LIST_FOREACH(ndomain, &cpuset_domains, ds_link)
+		if (domainset_equal(ndomain, domain))
+			break;
+	/*
+	 * If the domain does not yet exist we insert it and initialize
+	 * various iteration helpers which are not part of the key.
+	 */
+	if (ndomain == NULL) {
+		LIST_INSERT_HEAD(&cpuset_domains, domain, ds_link);
+		domain->ds_cnt = DOMAINSET_COUNT(&domain->ds_mask);
+		domain->ds_max = DOMAINSET_FLS(&domain->ds_mask) + 1;
+	}
+	mtx_unlock_spin(&cpuset_lock);
+	if (ndomain == NULL)
+		return (domain);
+	if (freelist != NULL)
+		LIST_INSERT_HEAD(freelist, domain, ds_link);
+	else
+		uma_zfree(domainset_zone, domain);
+	return (ndomain);
+	
+}
+
+/*
+ * Create or lookup a domainset based on the key held in 'domain'.
+ */
+static struct domainset *
+domainset_create(const struct domainset *domain)
+{
+	struct domainset *ndomain;
+
+	ndomain = uma_zalloc(domainset_zone, M_WAITOK | M_ZERO);
+	domainset_copy(domain, ndomain);
+	return _domainset_create(ndomain, NULL);
+}
+
+/*
+ * Update thread domainset pointers.
+ */
+static void
+domainset_notify(void)
+{
+	struct thread *td;
+	struct proc *p;
+
+	sx_slock(&allproc_lock);
+	FOREACH_PROC_IN_SYSTEM(p) {
+		PROC_LOCK(p);
+		if (p->p_state == PRS_NEW) {
+			PROC_UNLOCK(p);
+			continue;
+		}
+		FOREACH_THREAD_IN_PROC(p, td) {
+			thread_lock(td);
+			td->td_domain.dr_policy = td->td_cpuset->cs_domain;
+			thread_unlock(td);
+		}
+		PROC_UNLOCK(p);
+	}
+	sx_sunlock(&allproc_lock);
+	kernel_object->domain.dr_policy = cpuset_default->cs_domain;
+}
+
+/*
+ * Create a new set that is a subset of a parent.
+ */
+static struct domainset *
+domainset_shadow(const struct domainset *pdomain,
+    const struct domainset *domain, struct domainlist *freelist)
+{
+	struct domainset *ndomain;
+
+	ndomain = LIST_FIRST(freelist);
+	LIST_REMOVE(ndomain, ds_link);
+
+	/*
+	 * Initialize the key from the request.
+	 */
+	domainset_copy(domain, ndomain);
+
+	/*
+	 * Restrict the key by the parent.
+	 */
+	DOMAINSET_AND(&ndomain->ds_mask, &pdomain->ds_mask);
+
+	return _domainset_create(ndomain, freelist);
+}
+
+/*
  * Recursively check for errors that would occur from applying mask to
  * the tree of sets starting at 'set'.  Checks for sets that would become
  * empty as well as RDONLY flags.
@@ -376,10 +607,12 @@ cpuset_modify(struct cpuset *set, cpuset_t *mask)
 	 * Verify that we have access to this set of
 	 * cpus.
 	 */
-	root = set->cs_parent;
-	if (root && !CPU_SUBSET(&root->cs_mask, mask))
-		return (EINVAL);
+	root = cpuset_getroot(set);
 	mtx_lock_spin(&cpuset_lock);
+	if (root && !CPU_SUBSET(&root->cs_mask, mask)) {
+		error = EINVAL;
+		goto out;
+	}
 	error = cpuset_testupdate(set, mask, 0);
 	if (error)
 		goto out;
@@ -392,6 +625,141 @@ out:
 }
 
 /*
+ * Recursively check for errors that would occur from applying mask to
+ * the tree of sets starting at 'set'.  Checks for sets that would become
+ * empty as well as RDONLY flags.
+ */
+static int
+cpuset_testupdate_domain(struct cpuset *set, struct domainset *dset,
+    struct domainset *orig, int *count, int check_mask)
+{
+	struct cpuset *nset;
+	struct domainset *domain;
+	struct domainset newset;
+	int error;
+
+	mtx_assert(&cpuset_lock, MA_OWNED);
+	if (set->cs_flags & CPU_SET_RDONLY)
+		return (EPERM);
+	domain = set->cs_domain;
+	domainset_copy(domain, &newset);
+	if (!domainset_equal(domain, orig)) {
+		if (!domainset_restrict(domain, dset))
+			return (EDEADLK);
+		DOMAINSET_AND(&newset.ds_mask, &dset->ds_mask);
+		/* Count the number of domains that are changing. */
+		(*count)++;
+	}
+	error = 0;
+	LIST_FOREACH(nset, &set->cs_children, cs_siblings) 
+		if ((error = cpuset_testupdate_domain(nset, &newset, domain,
+		    count, 1)) != 0)
+			break;
+	return (error);
+}
+
+/*
+ * Applies the mask 'mask' without checking for empty sets or permissions.
+ */
+static void
+cpuset_update_domain(struct cpuset *set, struct domainset *domain,
+    struct domainset *orig, struct domainlist *domains)
+{
+	struct cpuset *nset;
+
+	mtx_assert(&cpuset_lock, MA_OWNED);
+	/*
+	 * If this domainset has changed from the parent we must calculate
+	 * a new set.  Otherwise it simply inherits from the parent.  When
+	 * we inherit from the parent we get a new mask and policy.  If the
+	 * set is modified from the parent we keep the policy and only
+	 * update the mask.
+	 */
+	if (set->cs_domain != orig) {
+		orig = set->cs_domain;
+		set->cs_domain = domainset_shadow(domain, orig, domains);
+	} else
+		set->cs_domain = domain;
+	LIST_FOREACH(nset, &set->cs_children, cs_siblings) 
+		cpuset_update_domain(nset, set->cs_domain, orig, domains);
+
+	return;
+}
+
+/*
+ * Modify the set 'set' to use a copy the domainset provided.  Apply this new
+ * mask to restrict all children in the tree.  Checks for validity before
+ * applying the changes.
+ */
+static int
+cpuset_modify_domain(struct cpuset *set, struct domainset *domain)
+{
+	struct domainlist domains;
+	struct domainset temp;
+	struct domainset *dset;
+	struct cpuset *root;
+	int ndomains, needed;
+	int error;
+
+	error = priv_check(curthread, PRIV_SCHED_CPUSET);
+	if (error)
+		return (error);
+	/*
+	 * In case we are called from within the jail
+	 * we do not allow modifying the dedicated root
+	 * cpuset of the jail but may still allow to
+	 * change child sets.
+	 */
+	if (jailed(curthread->td_ucred) &&
+	    set->cs_flags & CPU_SET_ROOT)
+		return (EPERM);
+	domainset_freelist_init(&domains, 0);
+	domain = domainset_create(domain);
+	ndomains = needed = 0;
+	do {
+		if (ndomains < needed) {
+			domainset_freelist_add(&domains, needed - ndomains);
+			ndomains = needed;
+		}
+		root = cpuset_getroot(set);
+		mtx_lock_spin(&cpuset_lock);
+		dset = root->cs_domain;
+		/*
+		 * Verify that we have access to this set of domains.
+		 */
+		if (root && !domainset_valid(dset, domain)) {
+			error = EINVAL;
+			goto out;
+		}
+		/*
+		 * If applying prefer we keep the current set as the fallback.
+		 */
+		if (domain->ds_policy == DOMAINSET_POLICY_PREFER)
+			DOMAINSET_COPY(&set->cs_domain->ds_mask,
+			    &domain->ds_mask);
+		/*
+		 * Determine whether we can apply this set of domains and
+		 * how many new domain structures it will require.
+		 */
+		domainset_copy(domain, &temp);
+		needed = 0;
+		error = cpuset_testupdate_domain(set, &temp, set->cs_domain,
+		    &needed, 0);
+		if (error)
+			goto out;
+	} while (ndomains < needed);
+	dset = set->cs_domain;
+	cpuset_update_domain(set, domain, dset, &domains);
+out:
+	mtx_unlock_spin(&cpuset_lock);
+	domainset_freelist_free(&domains);
+	if (error == 0)
+		domainset_notify();
+
+	return (error);
+}
+
+/*
  * Resolve the 'which' parameter of several cpuset apis.
  *
  * For WHICH_PID and WHICH_TID return a locked proc and valid proc/tid.  Also
@@ -481,44 +849,203 @@ cpuset_which(cpuwhich_t which, id_t id, struct proc **
 	return (0);
 }
 
+static int
+cpuset_testshadow(struct cpuset *set, const cpuset_t *mask,
+    const struct domainset *domain)
+{
+	struct cpuset *parent;
+	struct domainset *dset;
+
+	parent = cpuset_getbase(set);
+	/*
+	 * If we are restricting a cpu mask it must be a subset of the
+	 * parent or invalid CPUs have been specified.
+	 */
+	if (mask != NULL && !CPU_SUBSET(&parent->cs_mask, mask))
+		return (EINVAL);
+
+	/*
+	 * If we are restricting a domain mask it must be a subset of the
+	 * parent or invalid domains have been specified.
+	 */
+	dset = parent->cs_domain;
+	if (domain != NULL && !domainset_valid(dset, domain))
+		return (EINVAL);
+
+	return (0);
+}
+
 /*
  * Create an anonymous set with the provided mask in the space provided by
- * 'fset'.  If the passed in set is anonymous we use its parent otherwise
+ * 'nset'.  If the passed in set is anonymous we use its parent otherwise
  * the new set is a child of 'set'.
  */
 static int
-cpuset_shadow(struct cpuset *set, struct cpuset *fset, const cpuset_t *mask)
+cpuset_shadow(struct cpuset *set, struct cpuset **nsetp,
+   const cpuset_t *mask, const struct domainset *domain,
+   struct setlist *cpusets, struct domainlist *domains)
 {
 	struct cpuset *parent;
+	struct cpuset *nset;
+	struct domainset *dset;
+	struct domainset *d;
+	int error;
 
-	if (set->cs_id == CPUSET_INVALID)
-		parent = set->cs_parent;
+	error = cpuset_testshadow(set, mask, domain);
+	if (error)
+		return (error);
+
+	parent = cpuset_getbase(set);
+	dset = parent->cs_domain;
+	if (mask == NULL)
+		mask = &set->cs_mask;
+	if (domain != NULL)
+		d = domainset_shadow(dset, domain, domains);
 	else
-		parent = set;
-	if (!CPU_SUBSET(&parent->cs_mask, mask))
+		d = set->cs_domain;
+	nset = LIST_FIRST(cpusets);
+	error = _cpuset_create(nset, parent, mask, d, CPUSET_INVALID);
+	if (error == 0) {
+		LIST_REMOVE(nset, cs_link);
+		*nsetp = nset;
+	}
+	return (error);
+}
+
+static struct cpuset *
+cpuset_update_thread(struct thread *td, struct cpuset *nset)
+{
+	struct cpuset *tdset;
+
+	tdset = td->td_cpuset;
+	td->td_cpuset = nset;
+	td->td_domain.dr_policy = nset->cs_domain;
+	sched_affinity(td);
+
+	return (tdset);
+}
+
+static int
+cpuset_setproc_test_maskthread(struct cpuset *tdset, cpuset_t *mask,
+    struct domainset *domain)
+{
+	struct cpuset *parent;
+
+	parent = cpuset_getbase(tdset);
+	if (mask == NULL)
+		mask = &tdset->cs_mask;
+	if (domain == NULL)
+		domain = tdset->cs_domain;
+	return cpuset_testshadow(parent, mask, domain);
+}
+
+static int
+cpuset_setproc_maskthread(struct cpuset *tdset, cpuset_t *mask,
+    struct domainset *domain, struct cpuset **nsetp,
+    struct setlist *freelist, struct domainlist *domainlist)
+{
+	struct cpuset *parent;
+
+	parent = cpuset_getbase(tdset);
+	if (mask == NULL)
+		mask = &tdset->cs_mask;
+	if (domain == NULL)
+		domain = tdset->cs_domain;
+	return cpuset_shadow(parent, nsetp, mask, domain, freelist,
+	    domainlist);
+}
+
+static int
+cpuset_setproc_setthread_mask(struct cpuset *tdset, struct cpuset *set,
+    cpuset_t *mask, struct domainset *domain)
+{
+	struct cpuset *parent;
+
+	parent = cpuset_getbase(tdset);
+
+	/*
+	 * If the thread restricted its mask then apply that same
+	 * restriction to the new set, otherwise take it wholesale.
+	 */
+	if (CPU_CMP(&tdset->cs_mask, &parent->cs_mask) != 0) {
+		CPU_COPY(&tdset->cs_mask, mask);
+		CPU_AND(mask, &set->cs_mask);
+	} else
+		CPU_COPY(&set->cs_mask, mask);
+
+	/*
+	 * If the thread restricted the domain then we apply the
+	 * restriction to the new set but retain the policy.
+	 */
+	if (tdset->cs_domain != parent->cs_domain) {
+		domainset_copy(tdset->cs_domain, domain);
+		DOMAINSET_AND(&domain->ds_mask, &set->cs_domain->ds_mask);
+	} else
+		domainset_copy(set->cs_domain, domain);
+
+	if (CPU_EMPTY(mask) || DOMAINSET_EMPTY(&domain->ds_mask))
 		return (EDEADLK);
-	return (_cpuset_create(fset, parent, mask, CPUSET_INVALID));
+
+	return (0);
 }
 
+static int
+cpuset_setproc_test_setthread(struct cpuset *tdset, struct cpuset *set)
+{
+	struct domainset domain;
+	cpuset_t mask;
+
+	if (tdset->cs_id != CPUSET_INVALID)
+		return (0);
+	return cpuset_setproc_setthread_mask(tdset, set, &mask, &domain);
+}
+
+static int
+cpuset_setproc_setthread(struct cpuset *tdset, struct cpuset *set,
+    struct cpuset **nsetp, struct setlist *freelist,
+    struct domainlist *domainlist)
+{
+	struct domainset domain;
+	cpuset_t mask;
+	int error;
+
+	/*
+	 * If we're replacing on a thread that has not constrained the
+	 * original set we can simply accept the new set.
+	 */
+	if (tdset->cs_id != CPUSET_INVALID) {
+		*nsetp = cpuset_ref(set);
+		return (0);
+	}
+	error = cpuset_setproc_setthread_mask(tdset, set, &mask, &domain);
+	if (error)
+		return (error);
+
+	return cpuset_shadow(tdset, nsetp, &mask, &domain, freelist,
+	    domainlist);
+}
+
 /*
- * Handle two cases for replacing the base set or mask of an entire process.
+ * Handle three cases for updating an entire process.
  *
- * 1) Set is non-null and mask is null.  This reparents all anonymous sets
- *    to the provided set and replaces all non-anonymous td_cpusets with the
- *    provided set.
- * 2) Mask is non-null and set is null.  This replaces or creates anonymous
- *    sets for every thread with the existing base as a parent.
+ * 1) Set is non-null.  This reparents all anonymous sets to the provided
+ *    set and replaces all non-anonymous td_cpusets with the provided set.
+ * 2) Mask is non-null.  This replaces or creates anonymous sets for every
+ *    thread with the existing base as a parent.
+ * 3) domain is non-null.  This creates anonymous sets for every thread
+ *    and replaces the domain set.
  *
  * This is overly complicated because we can't allocate while holding a 
  * spinlock and spinlocks must be held while changing and examining thread
  * state.
  */
 static int
-cpuset_setproc(pid_t pid, struct cpuset *set, cpuset_t *mask)
+cpuset_setproc(pid_t pid, struct cpuset *set, cpuset_t *mask,
+    struct domainset *domain)
 {
 	struct setlist freelist;
 	struct setlist droplist;
-	struct cpuset *tdset;
+	struct domainlist domainlist;
 	struct cpuset *nset;
 	struct thread *td;
 	struct proc *p;
@@ -533,7 +1060,9 @@ cpuset_setproc(pid_t pid, struct cpuset *set, cpuset_t
 	 * 2) If enough cpusets have not been allocated release the locks and
 	 *    allocate them.  Loop.
 	 */
-	LIST_INIT(&freelist);
+	cpuset_freelist_init(&freelist, 1);
+	domainset_freelist_init(&domainlist, 1);
+	nfree = 1;
 	LIST_INIT(&droplist);
 	nfree = 0;
 	for (;;) {
@@ -544,39 +1073,27 @@ cpuset_setproc(pid_t pid, struct cpuset *set, cpuset_t
 			break;
 		threads = p->p_numthreads;
 		PROC_UNLOCK(p);
-		for (; nfree < threads; nfree++) {
-			nset = uma_zalloc(cpuset_zone, M_WAITOK);
-			LIST_INSERT_HEAD(&freelist, nset, cs_link);
+		if (nfree < threads) {
+			cpuset_freelist_add(&freelist, threads - nfree);
+			domainset_freelist_add(&domainlist, threads - nfree);
+			nfree = threads;
 		}
 	}
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	/*
 	 * Now that the appropriate locks are held and we have enough cpusets,
-	 * make sure the operation will succeed before applying changes.  The
+	 * make sure the operation will succeed before applying changes. The
 	 * proc lock prevents td_cpuset from changing between calls.
 	 */
 	error = 0;
 	FOREACH_THREAD_IN_PROC(p, td) {
 		thread_lock(td);
-		tdset = td->td_cpuset;
-		/*
-		 * Verify that a new mask doesn't specify cpus outside of
-		 * the set the thread is a member of.
-		 */
-		if (mask) {
-			if (tdset->cs_id == CPUSET_INVALID)
-				tdset = tdset->cs_parent;
-			if (!CPU_SUBSET(&tdset->cs_mask, mask))
-				error = EDEADLK;
-		/*
-		 * Verify that a new set won't leave an existing thread
-		 * mask without a cpu to run on.  It can, however, restrict
-		 * the set.
-		 */
-		} else if (tdset->cs_id == CPUSET_INVALID) {
-			if (!CPU_OVERLAP(&set->cs_mask, &tdset->cs_mask))
-				error = EDEADLK;
-		}
+		if (set != NULL)
+			error = cpuset_setproc_test_setthread(td->td_cpuset,
+			    set);
+		else
+			error = cpuset_setproc_test_maskthread(td->td_cpuset,
+			    mask, domain);
 		thread_unlock(td);
 		if (error)
 			goto unlock_out;
@@ -588,33 +1105,17 @@ cpuset_setproc(pid_t pid, struct cpuset *set, cpuset_t
 	 */
 	FOREACH_THREAD_IN_PROC(p, td) {
 		thread_lock(td);
-		/*
-		 * If we presently have an anonymous set or are applying a
-		 * mask we must create an anonymous shadow set.  That is
-		 * either parented to our existing base or the supplied set.
-		 *
-		 * If we have a base set with no anonymous shadow we simply
-		 * replace it outright.
-		 */
-		tdset = td->td_cpuset;
-		if (tdset->cs_id == CPUSET_INVALID || mask) {
-			nset = LIST_FIRST(&freelist);
-			LIST_REMOVE(nset, cs_link);
-			if (mask)
-				error = cpuset_shadow(tdset, nset, mask);
-			else
-				error = _cpuset_create(nset, set,
-				    &tdset->cs_mask, CPUSET_INVALID);
-			if (error) {
-				LIST_INSERT_HEAD(&freelist, nset, cs_link);
-				thread_unlock(td);
-				break;
-			}
-		} else
-			nset = cpuset_ref(set);
-		cpuset_rel_defer(&droplist, tdset);
-		td->td_cpuset = nset;
-		sched_affinity(td);
+		if (set != NULL)
+			error = cpuset_setproc_setthread(td->td_cpuset, set,
+			    &nset, &freelist, &domainlist);
+		else
+			error = cpuset_setproc_maskthread(td->td_cpuset, mask,
+			    domain, &nset, &freelist, &domainlist);
+		if (error) {
+			thread_unlock(td);
+			break;
+		}
+		cpuset_rel_defer(&droplist, cpuset_update_thread(td, nset));
 		thread_unlock(td);
 	}
 unlock_out:
@@ -622,10 +1123,8 @@ unlock_out:
 out:
 	while ((nset = LIST_FIRST(&droplist)) != NULL)
 		cpuset_rel_complete(nset);
-	while ((nset = LIST_FIRST(&freelist)) != NULL) {
-		LIST_REMOVE(nset, cs_link);
-		uma_zfree(cpuset_zone, nset);
-	}
+	cpuset_freelist_free(&freelist);
+	domainset_freelist_free(&domainlist);
 	return (error);
 }
 
@@ -690,46 +1189,57 @@ cpusetobj_strscan(cpuset_t *set, const char *buf)

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201801122248.w0CMmNdF082700>