Date: Mon, 1 Oct 2018 14:14:22 +0000 (UTC) From: Andrew Gallatin <gallatin@FreeBSD.org> To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org Subject: svn commit: r339043 - in head/sys: kern vm x86/acpica Message-ID: <201810011414.w91EEMlQ038867@repo.freebsd.org>
next in thread | raw e-mail | index | archive | help
Author: gallatin Date: Mon Oct 1 14:14:21 2018 New Revision: 339043 URL: https://svnweb.freebsd.org/changeset/base/339043 Log: Allow empty NUMA memory domains to support Threadripper2 The AMD Threadripper 2990WX is basically a slightly crippled Epyc. Rather than having 4 memory controllers, one per NUMA domain, it has only 2 memory controllers enabled. This means that only 2 of the 4 NUMA domains can be populated with physical memory, and the others are empty. Add support to FreeBSD for empty NUMA domains by: - creating empty memory domains when parsing the SRAT table, rather than failing to parse the table - not running the pageout daemon threads in empty domains - adding defensive code to UMA to avoid allocating from empty domains - adding defensive code to cpuset to avoid binding to an empty domain Thanks to Jeff for suggesting this strategy. Reviewed by: alc, markj Approved by: re (gjb@) Differential Revision: https://reviews.freebsd.org/D1683 Modified: head/sys/kern/kern_cpuset.c head/sys/vm/uma_core.c head/sys/vm/vm_kern.c head/sys/vm/vm_pageout.c head/sys/vm/vm_pagequeue.h head/sys/x86/acpica/srat.c Modified: head/sys/kern/kern_cpuset.c ============================================================================== --- head/sys/kern/kern_cpuset.c Mon Oct 1 14:05:31 2018 (r339042) +++ head/sys/kern/kern_cpuset.c Mon Oct 1 14:14:21 2018 (r339043) @@ -65,7 +65,12 @@ __FBSDID("$FreeBSD$"); #include <vm/uma.h> #include <vm/vm.h> #include <vm/vm_object.h> +#include <vm/vm_page.h> +#include <vm/vm_pageout.h> #include <vm/vm_extern.h> +#include <vm/vm_param.h> +#include <vm/vm_phys.h> +#include <vm/vm_pagequeue.h> #ifdef DDB #include <ddb/ddb.h> @@ -479,6 +484,26 @@ _domainset_create(struct domainset *domain, struct dom } /* + * Are any of the domains in the mask empty? If so, silently + * remove them. If only empty domains are present, we must + * return failure. 
+ */ +static bool +domainset_empty_vm(struct domainset *domain) +{ + int i, max; + + max = DOMAINSET_FLS(&domain->ds_mask) + 1; + for (i = 0; i < max; i++) { + if (DOMAINSET_ISSET(i, &domain->ds_mask) && + VM_DOMAIN_EMPTY(i)) + DOMAINSET_CLR(i, &domain->ds_mask); + } + + return (DOMAINSET_EMPTY(&domain->ds_mask)); +} + +/* * Create or lookup a domainset based on the key held in 'domain'. */ struct domainset * @@ -1360,6 +1385,7 @@ domainset_zero(void) DOMAINSET_SET(i, &dset->ds_mask); dset->ds_policy = DOMAINSET_POLICY_FIRSTTOUCH; dset->ds_prefer = -1; + (void)domainset_empty_vm(dset); curthread->td_domain.dr_policy = _domainset_create(dset, NULL); domainset_copy(dset, &domainset2); @@ -2086,6 +2112,13 @@ kern_cpuset_setdomain(struct thread *td, cpulevel_t le /* This will be constrained by domainset_shadow(). */ DOMAINSET_FILL(&domain.ds_mask); } + + /* + * When given an impossible policy, fall back to interleaving + * across all domains + */ + if (domainset_empty_vm(&domain)) + domainset_copy(&domainset2, &domain); switch (level) { case CPU_LEVEL_ROOT: Modified: head/sys/vm/uma_core.c ============================================================================== --- head/sys/vm/uma_core.c Mon Oct 1 14:05:31 2018 (r339042) +++ head/sys/vm/uma_core.c Mon Oct 1 14:14:21 2018 (r339043) @@ -84,6 +84,7 @@ __FBSDID("$FreeBSD$"); #include <vm/vm_pageout.h> #include <vm/vm_param.h> #include <vm/vm_phys.h> +#include <vm/vm_pagequeue.h> #include <vm/vm_map.h> #include <vm/vm_kern.h> #include <vm/vm_extern.h> @@ -2469,9 +2470,11 @@ zalloc_start: if (bucket != NULL) bucket_free(zone, bucket, udata); - if (zone->uz_flags & UMA_ZONE_NUMA) + if (zone->uz_flags & UMA_ZONE_NUMA) { domain = PCPU_GET(domain); - else + if (VM_DOMAIN_EMPTY(domain)) + domain = UMA_ANYDOMAIN; + } else domain = UMA_ANYDOMAIN; /* Short-circuit for zones without buckets and low memory. 
*/ @@ -2647,7 +2650,11 @@ keg_fetch_slab(uma_keg_t keg, uma_zone_t zone, int rdo rdomain = 0; rr = rdomain == UMA_ANYDOMAIN; if (rr) { - keg->uk_cursor = (keg->uk_cursor + 1) % vm_ndomains; + start = keg->uk_cursor; + do { + keg->uk_cursor = (keg->uk_cursor + 1) % vm_ndomains; + domain = keg->uk_cursor; + } while (VM_DOMAIN_EMPTY(domain) && domain != start); domain = start = keg->uk_cursor; /* Only block on the second pass. */ if ((flags & (M_WAITOK | M_NOVM)) == M_WAITOK) @@ -2698,8 +2705,11 @@ again: LIST_INSERT_HEAD(&dom->ud_part_slab, slab, us_link); return (slab); } - if (rr) - domain = (domain + 1) % vm_ndomains; + if (rr) { + do { + domain = (domain + 1) % vm_ndomains; + } while (VM_DOMAIN_EMPTY(domain) && domain != start); + } } while (domain != start); /* Retry domain scan with blocking. */ @@ -2903,6 +2913,8 @@ zone_alloc_bucket(uma_zone_t zone, void *udata, int do uma_bucket_t bucket; int max; + CTR1(KTR_UMA, "zone_alloc:_bucket domain %d)", domain); + /* Don't wait for buckets, preserve caller's NOVM setting. */ bucket = bucket_alloc(zone, udata, M_NOWAIT | (flags & M_NOVM)); if (bucket == NULL) @@ -2970,6 +2982,11 @@ zone_alloc_item(uma_zone_t zone, void *udata, int doma item = NULL; + if (domain != UMA_ANYDOMAIN) { + /* avoid allocs targeting empty domains */ + if (VM_DOMAIN_EMPTY(domain)) + domain = UMA_ANYDOMAIN; + } if (zone->uz_import(zone->uz_arg, &item, 1, domain, flags) != 1) goto fail; atomic_add_long(&zone->uz_allocs, 1); @@ -3139,9 +3156,11 @@ zfree_start: /* We are no longer associated with this CPU. 
*/ critical_exit(); - if ((zone->uz_flags & UMA_ZONE_NUMA) != 0) + if ((zone->uz_flags & UMA_ZONE_NUMA) != 0) { domain = PCPU_GET(domain); - else + if (VM_DOMAIN_EMPTY(domain)) + domain = UMA_ANYDOMAIN; + } else domain = 0; zdom = &zone->uz_domain[0]; @@ -3588,7 +3607,9 @@ uma_prealloc(uma_zone_t zone, int items) dom = &keg->uk_domain[slab->us_domain]; LIST_INSERT_HEAD(&dom->ud_free_slab, slab, us_link); slabs--; - domain = (domain + 1) % vm_ndomains; + do { + domain = (domain + 1) % vm_ndomains; + } while (VM_DOMAIN_EMPTY(domain)); } KEG_UNLOCK(keg); } @@ -3678,6 +3699,11 @@ uma_large_malloc_domain(vm_size_t size, int domain, in vm_offset_t addr; uma_slab_t slab; + if (domain != UMA_ANYDOMAIN) { + /* avoid allocs targeting empty domains */ + if (VM_DOMAIN_EMPTY(domain)) + domain = UMA_ANYDOMAIN; + } slab = zone_alloc_item(slabzone, NULL, domain, wait); if (slab == NULL) return (NULL); Modified: head/sys/vm/vm_kern.c ============================================================================== --- head/sys/vm/vm_kern.c Mon Oct 1 14:05:31 2018 (r339042) +++ head/sys/vm/vm_kern.c Mon Oct 1 14:14:21 2018 (r339043) @@ -502,6 +502,8 @@ kmem_back(vm_object_t object, vm_offset_t addr, vm_siz */ if (vm_ndomains > 1) { domain = (addr >> KVA_QUANTUM_SHIFT) % vm_ndomains; + while (VM_DOMAIN_EMPTY(domain)) + domain++; next = roundup2(addr + 1, KVA_QUANTUM); if (next > end || next < start) next = end; Modified: head/sys/vm/vm_pageout.c ============================================================================== --- head/sys/vm/vm_pageout.c Mon Oct 1 14:05:31 2018 (r339042) +++ head/sys/vm/vm_pageout.c Mon Oct 1 14:14:21 2018 (r339043) @@ -2082,6 +2082,13 @@ vm_pageout(void) if (error != 0) panic("starting laundry for domain 0, error %d", error); for (i = 1; i < vm_ndomains; i++) { + if (VM_DOMAIN_EMPTY(i)) { + if (bootverbose) + printf("domain %d empty; skipping pageout\n", + i); + continue; + } + error = kthread_add(vm_pageout_worker, (void *)(uintptr_t)i, curproc, NULL, 0, 
0, "dom%d", i); if (error != 0) { Modified: head/sys/vm/vm_pagequeue.h ============================================================================== --- head/sys/vm/vm_pagequeue.h Mon Oct 1 14:05:31 2018 (r339042) +++ head/sys/vm/vm_pagequeue.h Mon Oct 1 14:14:21 2018 (r339043) @@ -151,7 +151,8 @@ struct vm_domain { extern struct vm_domain vm_dom[MAXMEMDOM]; -#define VM_DOMAIN(n) (&vm_dom[(n)]) +#define VM_DOMAIN(n) (&vm_dom[(n)]) +#define VM_DOMAIN_EMPTY(n) (vm_dom[(n)].vmd_page_count == 0) #define vm_pagequeue_assert_locked(pq) mtx_assert(&(pq)->pq_mutex, MA_OWNED) #define vm_pagequeue_lock(pq) mtx_lock(&(pq)->pq_mutex) Modified: head/sys/x86/acpica/srat.c ============================================================================== --- head/sys/x86/acpica/srat.c Mon Oct 1 14:05:31 2018 (r339042) +++ head/sys/x86/acpica/srat.c Mon Oct 1 14:14:21 2018 (r339043) @@ -311,8 +311,20 @@ check_domains(void) } for (i = 0; i <= max_apic_id; i++) if (cpus[i].enabled && !cpus[i].has_memory) { - printf("SRAT: No memory found for CPU %d\n", i); - return (ENXIO); + found = 0; + for (j = 0; j < num_mem && !found; j++) { + if (mem_info[j].domain == cpus[i].domain) + found = 1; + } + if (!found) { + if (bootverbose) + printf("SRAT: mem dom %d is empty\n", + cpus[i].domain); + mem_info[num_mem].start = 0; + mem_info[num_mem].end = 0; + mem_info[num_mem].domain = cpus[i].domain; + num_mem++; + } } return (0); }
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201810011414.w91EEMlQ038867>