Date: Mon, 4 Apr 2016 19:31:47 +0300 From: Andriy Gapon <avg@FreeBSD.org> To: FreeBSD Current <freebsd-current@FreeBSD.org> Subject: [HEADSUP] new x86 smp topology detection code Message-ID: <570296F3.8090307@FreeBSD.org> In-Reply-To: <201604041609.u34G9TCd022548@repo.freebsd.org> References: <201604041609.u34G9TCd022548@repo.freebsd.org>
next in thread | previous in thread | raw e-mail | index | archive | help
I've just committed new code for detecting SMP (processor and cache) topology on x86 systems. Please be aware. If you get any panics or crashes that look like they might be caused by this change please send a copy of a report to me. Another thing to watch is kern.sched.topology_spec. Please check if the reported topology reasonably matches what you expect on your system. You can install the hwloc package (devel/hwloc) and then run lstopo -p --no-io to double-check the topology (--output-format ascii would produce a nice ASCII-art diagram). I hope that you see only improvements :-) -------- Forwarded Message -------- Subject: svn commit: r297558 - in head/sys: kern sys x86/x86 Date: Mon, 4 Apr 2016 16:09:29 +0000 (UTC) From: Andriy Gapon <avg@freebsd.org> To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org Author: avg Date: Mon Apr 4 16:09:29 2016 New Revision: 297558 URL: https://svnweb.freebsd.org/changeset/base/297558 Log: new x86 smp topology detection code Previously, the code determined a topology of processing units (hardware threads, cores, packages) and then deduced a cache topology using certain assumptions. The new code builds a topology that includes both processing units and caches using the information provided by the hardware. At the moment, the discovered full topology is used only to create a scheduling topology for SCHED_ULE. There is no KPI for other kernel uses. 
Summary: - based on APIC ID derivation rules for Intel and AMD CPUs - can handle non-uniform topologies - requires homogeneous APIC ID assignment (same bit widths for ID components) - topology for dual-node AMD CPUs may not be optimal - topology for latest AMD CPU models may not be optimal as the code is several years old - supports only thread/package/core/cache nodes Todo: - AMD dual-node processors - latest AMD processors - NUMA nodes - checking for homogeneity of the APIC ID assignment across packages - more flexible cache placement within topology - expose topology to userland, e.g., via sysctl nodes Long term todo: - KPI for CPU sharing and affinity with respect to various resources (e.g., two logical processors may share the same FPU, etc) Reviewed by: mav Tested by: mav MFC after: 1 month Differential Revision: https://reviews.freebsd.org/D2728 Modified: head/sys/kern/subr_smp.c head/sys/sys/smp.h head/sys/x86/x86/mp_x86.c Modified: head/sys/kern/subr_smp.c ============================================================================== --- head/sys/kern/subr_smp.c Mon Apr 4 15:56:14 2016 (r297557) +++ head/sys/kern/subr_smp.c Mon Apr 4 16:09:29 2016 (r297558) @@ -39,6 +39,7 @@ __FBSDID("$FreeBSD$"); #include <sys/proc.h> #include <sys/bus.h> #include <sys/lock.h> +#include <sys/malloc.h> #include <sys/mutex.h> #include <sys/pcpu.h> #include <sys/sched.h> @@ -51,6 +52,10 @@ __FBSDID("$FreeBSD$"); #include "opt_sched.h" #ifdef SMP +MALLOC_DEFINE(M_TOPO, "toponodes", "SMP topology data"); +#endif + +#ifdef SMP volatile cpuset_t stopped_cpus; volatile cpuset_t started_cpus; volatile cpuset_t suspended_cpus; @@ -556,7 +561,7 @@ smp_rendezvous(void (* setup_func)(void smp_rendezvous_cpus(all_cpus, setup_func, action_func, teardown_func, arg); } -static struct cpu_group group[MAXCPU]; +static struct cpu_group group[MAXCPU * MAX_CACHE_LEVELS + 1]; struct cpu_group * smp_topo(void) @@ -616,6 +621,17 @@ smp_topo(void) } struct cpu_group * +smp_topo_alloc(u_int count) 
+{ + static u_int index; + u_int curr; + + curr = index; + index += count; + return (&group[curr]); +} + +struct cpu_group * smp_topo_none(void) { struct cpu_group *top; @@ -861,3 +877,233 @@ sysctl_kern_smp_active(SYSCTL_HANDLER_AR return (error); } + +#ifdef SMP +void +topo_init_node(struct topo_node *node) +{ + + bzero(node, sizeof(*node)); + TAILQ_INIT(&node->children); +} + +void +topo_init_root(struct topo_node *root) +{ + + topo_init_node(root); + root->type = TOPO_TYPE_SYSTEM; +} + +struct topo_node * +topo_add_node_by_hwid(struct topo_node *parent, int hwid, + topo_node_type type, uintptr_t subtype) +{ + struct topo_node *node; + + TAILQ_FOREACH_REVERSE(node, &parent->children, + topo_children, siblings) { + if (node->hwid == hwid + && node->type == type && node->subtype == subtype) { + return (node); + } + } + + node = malloc(sizeof(*node), M_TOPO, M_WAITOK); + topo_init_node(node); + node->parent = parent; + node->hwid = hwid; + node->type = type; + node->subtype = subtype; + TAILQ_INSERT_TAIL(&parent->children, node, siblings); + parent->nchildren++; + + return (node); +} + +struct topo_node * +topo_find_node_by_hwid(struct topo_node *parent, int hwid, + topo_node_type type, uintptr_t subtype) +{ + + struct topo_node *node; + + TAILQ_FOREACH(node, &parent->children, siblings) { + if (node->hwid == hwid + && node->type == type && node->subtype == subtype) { + return (node); + } + } + + return (NULL); +} + +void +topo_promote_child(struct topo_node *child) +{ + struct topo_node *next; + struct topo_node *node; + struct topo_node *parent; + + parent = child->parent; + next = TAILQ_NEXT(child, siblings); + TAILQ_REMOVE(&parent->children, child, siblings); + TAILQ_INSERT_HEAD(&parent->children, child, siblings); + + while (next != NULL) { + node = next; + next = TAILQ_NEXT(node, siblings); + TAILQ_REMOVE(&parent->children, node, siblings); + TAILQ_INSERT_AFTER(&parent->children, child, node, siblings); + child = node; + } +} + +struct topo_node * 
+topo_next_node(struct topo_node *top, struct topo_node *node) +{ + struct topo_node *next; + + if ((next = TAILQ_FIRST(&node->children)) != NULL) + return (next); + + if ((next = TAILQ_NEXT(node, siblings)) != NULL) + return (next); + + while ((node = node->parent) != top) + if ((next = TAILQ_NEXT(node, siblings)) != NULL) + return (next); + + return (NULL); +} + +struct topo_node * +topo_next_nonchild_node(struct topo_node *top, struct topo_node *node) +{ + struct topo_node *next; + + if ((next = TAILQ_NEXT(node, siblings)) != NULL) + return (next); + + while ((node = node->parent) != top) + if ((next = TAILQ_NEXT(node, siblings)) != NULL) + return (next); + + return (NULL); +} + +void +topo_set_pu_id(struct topo_node *node, cpuid_t id) +{ + + KASSERT(node->type == TOPO_TYPE_PU, + ("topo_set_pu_id: wrong node type: %u", node->type)); + KASSERT(CPU_EMPTY(&node->cpuset) && node->cpu_count == 0, + ("topo_set_pu_id: cpuset already not empty")); + node->id = id; + CPU_SET(id, &node->cpuset); + node->cpu_count = 1; + node->subtype = 1; + + while ((node = node->parent) != NULL) { + if (CPU_ISSET(id, &node->cpuset)) + break; + CPU_SET(id, &node->cpuset); + node->cpu_count++; + } +} + +int +topo_analyze(struct topo_node *topo_root, int all, + int *pkg_count, int *cores_per_pkg, int *thrs_per_core) +{ + struct topo_node *pkg_node; + struct topo_node *core_node; + struct topo_node *pu_node; + int thrs_per_pkg; + int cpp_counter; + int tpc_counter; + int tpp_counter; + + *pkg_count = 0; + *cores_per_pkg = -1; + *thrs_per_core = -1; + thrs_per_pkg = -1; + pkg_node = topo_root; + while (pkg_node != NULL) { + if (pkg_node->type != TOPO_TYPE_PKG) { + pkg_node = topo_next_node(topo_root, pkg_node); + continue; + } + if (!all && CPU_EMPTY(&pkg_node->cpuset)) { + pkg_node = topo_next_nonchild_node(topo_root, pkg_node); + continue; + } + + (*pkg_count)++; + + cpp_counter = 0; + tpp_counter = 0; + core_node = pkg_node; + while (core_node != NULL) { + if (core_node->type == 
TOPO_TYPE_CORE) { + if (!all && CPU_EMPTY(&core_node->cpuset)) { + core_node = + topo_next_nonchild_node(pkg_node, + core_node); + continue; + } + + cpp_counter++; + + tpc_counter = 0; + pu_node = core_node; + while (pu_node != NULL) { + if (pu_node->type == TOPO_TYPE_PU && + (all || !CPU_EMPTY(&pu_node->cpuset))) + tpc_counter++; + pu_node = topo_next_node(core_node, + pu_node); + } + + if (*thrs_per_core == -1) + *thrs_per_core = tpc_counter; + else if (*thrs_per_core != tpc_counter) + return (0); + + core_node = topo_next_nonchild_node(pkg_node, + core_node); + } else { + /* PU node directly under PKG. */ + if (core_node->type == TOPO_TYPE_PU && + (all || !CPU_EMPTY(&core_node->cpuset))) + tpp_counter++; + core_node = topo_next_node(pkg_node, + core_node); + } + } + + if (*cores_per_pkg == -1) + *cores_per_pkg = cpp_counter; + else if (*cores_per_pkg != cpp_counter) + return (0); + if (thrs_per_pkg == -1) + thrs_per_pkg = tpp_counter; + else if (thrs_per_pkg != tpp_counter) + return (0); + + pkg_node = topo_next_nonchild_node(topo_root, pkg_node); + } + + KASSERT(*pkg_count > 0, + ("bug in topology or analysis")); + if (*cores_per_pkg == 0) { + KASSERT(*thrs_per_core == -1 && thrs_per_pkg > 0, + ("bug in topology or analysis")); + *thrs_per_core = thrs_per_pkg; + } + + return (1); +} +#endif /* SMP */ + Modified: head/sys/sys/smp.h ============================================================================== --- head/sys/sys/smp.h Mon Apr 4 15:56:14 2016 (r297557) +++ head/sys/sys/smp.h Mon Apr 4 16:09:29 2016 (r297558) @@ -17,9 +17,52 @@ #ifndef LOCORE #include <sys/cpuset.h> +#include <sys/queue.h> /* - * Topology of a NUMA or HTT system. + * Types of nodes in the topological tree. + */ +typedef enum { + /* No node has this type; can be used in topo API calls. */ + TOPO_TYPE_DUMMY, + /* Processing unit aka computing unit aka logical CPU. */ + TOPO_TYPE_PU, + /* Physical subdivision of a package. */ + TOPO_TYPE_CORE, + /* CPU L1/L2/L3 cache. 
*/ + TOPO_TYPE_CACHE, + /* Package aka chip, equivalent to socket. */ + TOPO_TYPE_PKG, + /* NUMA node. */ + TOPO_TYPE_NODE, + /* Other logical or physical grouping of PUs. */ + /* E.g. PUs on the same dye, or PUs sharing an FPU. */ + TOPO_TYPE_GROUP, + /* The whole system. */ + TOPO_TYPE_SYSTEM +} topo_node_type; + +/* Hardware indenitifier of a topology component. */ +typedef unsigned int hwid_t; +/* Logical CPU idenitifier. */ +typedef int cpuid_t; + +/* A node in the topology. */ +struct topo_node { + struct topo_node *parent; + TAILQ_HEAD(topo_children, topo_node) children; + TAILQ_ENTRY(topo_node) siblings; + cpuset_t cpuset; + topo_node_type type; + uintptr_t subtype; + hwid_t hwid; + cpuid_t id; + int nchildren; + int cpu_count; +}; + +/* + * Scheduling topology of a NUMA or SMP system. * * The top level topology is an array of pointers to groups. Each group * contains a bitmask of cpus in its group or subgroups. It may also @@ -52,6 +95,8 @@ typedef struct cpu_group *cpu_group_t; #define CG_SHARE_L2 2 #define CG_SHARE_L3 3 +#define MAX_CACHE_LEVELS CG_SHARE_L3 + /* * Behavior modifiers for load balancing and affinity. */ @@ -60,10 +105,29 @@ typedef struct cpu_group *cpu_group_t; #define CG_FLAG_THREAD (CG_FLAG_HTT | CG_FLAG_SMT) /* Any threading. */ /* - * Convenience routines for building topologies. + * Convenience routines for building and traversing topologies. 
*/ #ifdef SMP +void topo_init_node(struct topo_node *node); +void topo_init_root(struct topo_node *root); +struct topo_node * topo_add_node_by_hwid(struct topo_node *parent, int hwid, + topo_node_type type, uintptr_t subtype); +struct topo_node * topo_find_node_by_hwid(struct topo_node *parent, int hwid, + topo_node_type type, uintptr_t subtype); +void topo_promote_child(struct topo_node *child); +struct topo_node * topo_next_node(struct topo_node *top, + struct topo_node *node); +struct topo_node * topo_next_nonchild_node(struct topo_node *top, + struct topo_node *node); +void topo_set_pu_id(struct topo_node *node, cpuid_t id); +int topo_analyze(struct topo_node *topo_root, int all, int *pkg_count, + int *cores_per_pkg, int *thrs_per_core); + +#define TOPO_FOREACH(i, root) \ + for (i = root; i != NULL; i = topo_next_node(root, i)) + struct cpu_group *smp_topo(void); +struct cpu_group *smp_topo_alloc(u_int count); struct cpu_group *smp_topo_none(void); struct cpu_group *smp_topo_1level(int l1share, int l1count, int l1flags); struct cpu_group *smp_topo_2level(int l2share, int l2count, int l1share, Modified: head/sys/x86/x86/mp_x86.c ============================================================================== --- head/sys/x86/x86/mp_x86.c Mon Apr 4 15:56:14 2016 (r297557) +++ head/sys/x86/x86/mp_x86.c Mon Apr 4 16:09:29 2016 (r297558) @@ -133,19 +133,28 @@ volatile int aps_ready = 0; * the APs. 
*/ struct cpu_info cpu_info[MAX_APIC_ID + 1]; -int cpu_apic_ids[MAXCPU]; int apic_cpuids[MAX_APIC_ID + 1]; +int cpu_apic_ids[MAXCPU]; /* Holds pending bitmap based IPIs per CPU */ volatile u_int cpu_ipi_pending[MAXCPU]; -int cpu_logical; /* logical cpus per core */ -int cpu_cores; /* cores per package */ - static void release_aps(void *dummy); -static u_int hyperthreading_cpus; /* logical cpus sharing L1 cache */ static int hyperthreading_allowed = 1; +SYSCTL_INT(_machdep, OID_AUTO, hyperthreading_allowed, CTLFLAG_RDTUN, + &hyperthreading_allowed, 0, "Use Intel HTT logical CPUs"); + +static struct topo_node topo_root; + +static int pkg_id_shift; +static int core_id_shift; +static int disabled_cpus; + +struct cache_info { + int id_shift; + int present; +} static caches[MAX_CACHE_LEVELS]; void mem_range_AP_init(void) @@ -155,60 +164,125 @@ mem_range_AP_init(void) mem_range_softc.mr_op->initAP(&mem_range_softc); } -static void -topo_probe_amd(void) +/* + * Round up to the next power of two, if necessary, and then + * take log2. + * Returns -1 if argument is zero. + */ +static __inline int +mask_width(u_int x) { - int core_id_bits; - int id; - /* AMD processors do not support HTT. */ - cpu_logical = 1; + return (fls(x << (1 - powerof2(x))) - 1); +} + +static int +add_deterministic_cache(int type, int level, int share_count) +{ - if ((amd_feature2 & AMDID2_CMP) == 0) { - cpu_cores = 1; - return; + if (type == 0) + return (0); + if (type > 3) { + printf("unexpected cache type %d\n", type); + return (1); } - - core_id_bits = (cpu_procinfo2 & AMDID_COREID_SIZE) >> - AMDID_COREID_SIZE_SHIFT; - if (core_id_bits == 0) { - cpu_cores = (cpu_procinfo2 & AMDID_CMP_CORES) + 1; - return; + if (type == 2) /* ignore instruction cache */ + return (1); + if (level == 0 || level > MAX_CACHE_LEVELS) { + printf("unexpected cache level %d\n", type); + return (1); } - /* Fam 10h and newer should get here. */ - for (id = 0; id <= MAX_APIC_ID; id++) { - /* Check logical CPU availability. 
*/ - if (!cpu_info[id].cpu_present || cpu_info[id].cpu_disabled) - continue; - /* Check if logical CPU has the same package ID. */ - if ((id >> core_id_bits) != (boot_cpu_id >> core_id_bits)) - continue; - cpu_cores++; + if (caches[level - 1].present) { + printf("WARNING: multiple entries for L%u data cache\n", level); + printf("%u => %u\n", caches[level - 1].id_shift, + mask_width(share_count)); + } + caches[level - 1].id_shift = mask_width(share_count); + caches[level - 1].present = 1; + + if (caches[level - 1].id_shift > pkg_id_shift) { + printf("WARNING: L%u data cache covers more " + "APIC IDs than a package\n", level); + printf("%u > %u\n", caches[level - 1].id_shift, pkg_id_shift); + caches[level - 1].id_shift = pkg_id_shift; + } + if (caches[level - 1].id_shift < core_id_shift) { + printf("WARNING: L%u data cache covers less " + "APIC IDs than a core\n", level); + printf("%u < %u\n", caches[level - 1].id_shift, core_id_shift); + caches[level - 1].id_shift = core_id_shift; } + + return (1); } -/* - * Round up to the next power of two, if necessary, and then - * take log2. - * Returns -1 if argument is zero. - */ -static __inline int -mask_width(u_int x) +static void +topo_probe_amd(void) { + u_int p[4]; + int level; + int share_count; + int type; + int i; - return (fls(x << (1 - powerof2(x))) - 1); + /* No multi-core capability. */ + if ((amd_feature2 & AMDID2_CMP) == 0) + return; + + /* For families 10h and newer. */ + pkg_id_shift = (cpu_procinfo2 & AMDID_COREID_SIZE) >> + AMDID_COREID_SIZE_SHIFT; + + /* For 0Fh family. 
*/ + if (pkg_id_shift == 0) + pkg_id_shift = + mask_width((cpu_procinfo2 & AMDID_CMP_CORES) + 1); + + if ((amd_feature2 & AMDID2_TOPOLOGY) != 0) { + for (i = 0; ; i++) { + cpuid_count(0x8000001d, i, p); + type = p[0] & 0x1f; + level = (p[0] >> 5) & 0x7; + share_count = 1 + ((p[0] >> 14) & 0xfff); + + if (!add_deterministic_cache(type, level, share_count)) + break; + } + } else { + if (cpu_exthigh >= 0x80000005) { + cpuid_count(0x80000005, 0, p); + if (((p[2] >> 24) & 0xff) != 0) { + caches[0].id_shift = 0; + caches[0].present = 1; + } + } + if (cpu_exthigh >= 0x80000006) { + cpuid_count(0x80000006, 0, p); + if (((p[2] >> 16) & 0xffff) != 0) { + caches[1].id_shift = 0; + caches[1].present = 1; + } + if (((p[3] >> 18) & 0x3fff) != 0) { + + /* + * TODO: Account for dual-node processors + * where each node within a package has its own + * L3 cache. + */ + caches[2].id_shift = pkg_id_shift; + caches[2].present = 1; + } + } + } } static void -topo_probe_0x4(void) +topo_probe_intel_0x4(void) { u_int p[4]; - int pkg_id_bits; - int core_id_bits; int max_cores; int max_logical; - int id; /* Both zero and one here mean one logical processor per package. */ max_logical = (cpu_feature & CPUID_HTT) != 0 ? @@ -216,180 +290,432 @@ topo_probe_0x4(void) if (max_logical <= 1) return; - /* - * Because of uniformity assumption we examine only - * those logical processors that belong to the same - * package as BSP. Further, we count number of - * logical processors that belong to the same core - * as BSP thus deducing number of threads per core. - */ if (cpu_high >= 0x4) { cpuid_count(0x04, 0, p); max_cores = ((p[0] >> 26) & 0x3f) + 1; } else max_cores = 1; - core_id_bits = mask_width(max_logical/max_cores); - if (core_id_bits < 0) - return; - pkg_id_bits = core_id_bits + mask_width(max_cores); - - for (id = 0; id <= MAX_APIC_ID; id++) { - /* Check logical CPU availability. 
*/ - if (!cpu_info[id].cpu_present || cpu_info[id].cpu_disabled) - continue; - /* Check if logical CPU has the same package ID. */ - if ((id >> pkg_id_bits) != (boot_cpu_id >> pkg_id_bits)) - continue; - cpu_cores++; - /* Check if logical CPU has the same package and core IDs. */ - if ((id >> core_id_bits) == (boot_cpu_id >> core_id_bits)) - cpu_logical++; - } - KASSERT(cpu_cores >= 1 && cpu_logical >= 1, - ("topo_probe_0x4 couldn't find BSP")); - - cpu_cores /= cpu_logical; - hyperthreading_cpus = cpu_logical; + core_id_shift = mask_width(max_logical/max_cores); + KASSERT(core_id_shift >= 0, + ("intel topo: max_cores > max_logical\n")); + pkg_id_shift = core_id_shift + mask_width(max_cores); } static void -topo_probe_0xb(void) +topo_probe_intel_0xb(void) { u_int p[4]; int bits; - int cnt; - int i; - int logical; int type; - int x; + int i; + + /* Fall back if CPU leaf 11 doesn't really exist. */ + cpuid_count(0x0b, 0, p); + if (p[1] == 0) { + topo_probe_intel_0x4(); + return; + } /* We only support three levels for now. */ - for (i = 0; i < 3; i++) { + for (i = 0; ; i++) { cpuid_count(0x0b, i, p); - /* Fall back if CPU leaf 11 doesn't really exist. */ - if (i == 0 && p[1] == 0) { - topo_probe_0x4(); - return; - } - bits = p[0] & 0x1f; - logical = p[1] &= 0xffff; type = (p[2] >> 8) & 0xff; - if (type == 0 || logical == 0) + + if (type == 0) break; - /* - * Because of uniformity assumption we examine only - * those logical processors that belong to the same - * package as BSP. 
- */ - for (cnt = 0, x = 0; x <= MAX_APIC_ID; x++) { - if (!cpu_info[x].cpu_present || - cpu_info[x].cpu_disabled) - continue; - if (x >> bits == boot_cpu_id >> bits) - cnt++; - } + + /* TODO: check for duplicate (re-)assignment */ if (type == CPUID_TYPE_SMT) - cpu_logical = cnt; + core_id_shift = bits; else if (type == CPUID_TYPE_CORE) - cpu_cores = cnt; + pkg_id_shift = bits; + else + printf("unknown CPU level type %d\n", type); + } + + if (pkg_id_shift < core_id_shift) { + printf("WARNING: core covers more APIC IDs than a package\n"); + core_id_shift = pkg_id_shift; + } +} + +static void +topo_probe_intel_caches(void) +{ + u_int p[4]; + int level; + int share_count; + int type; + int i; + + if (cpu_high < 0x4) { + /* + * Available cache level and sizes can be determined + * via CPUID leaf 2, but that requires a huge table of hardcoded + * values, so for now just assume L1 and L2 caches potentially + * shared only by HTT processing units, if HTT is present. + */ + caches[0].id_shift = pkg_id_shift; + caches[0].present = 1; + caches[1].id_shift = pkg_id_shift; + caches[1].present = 1; + return; + } + + for (i = 0; ; i++) { + cpuid_count(0x4, i, p); + type = p[0] & 0x1f; + level = (p[0] >> 5) & 0x7; + share_count = 1 + ((p[0] >> 14) & 0xfff); + + if (!add_deterministic_cache(type, level, share_count)) + break; } - if (cpu_logical == 0) - cpu_logical = 1; - cpu_cores /= cpu_logical; +} + +static void +topo_probe_intel(void) +{ + + /* + * See Intel(R) 64 Architecture Processor + * Topology Enumeration article for details. + * + * Note that 0x1 <= cpu_high < 4 case should be + * compatible with topo_probe_intel_0x4() logic when + * CPUID.1:EBX[23:16] > 0 (cpu_cores will be 1) + * or it should trigger the fallback otherwise. 
+ */ + if (cpu_high >= 0xb) + topo_probe_intel_0xb(); + else if (cpu_high >= 0x1) + topo_probe_intel_0x4(); + + topo_probe_intel_caches(); } /* - * Both topology discovery code and code that consumes topology - * information assume top-down uniformity of the topology. - * That is, all physical packages must be identical and each - * core in a package must have the same number of threads. * Topology information is queried only on BSP, on which this * code runs and for which it can query CPUID information. - * Then topology is extrapolated on all packages using the - * uniformity assumption. + * Then topology is extrapolated on all packages using an + * assumption that APIC ID to hardware component ID mapping is + * homogenious. + * That doesn't necesserily imply that the topology is uniform. */ void topo_probe(void) { static int cpu_topo_probed = 0; + struct x86_topo_layer { + int type; + int subtype; + int id_shift; + } topo_layers[MAX_CACHE_LEVELS + 3]; + struct topo_node *parent; + struct topo_node *node; + int layer; + int nlayers; + int node_id; + int i; if (cpu_topo_probed) return; CPU_ZERO(&logical_cpus_mask); + if (mp_ncpus <= 1) - cpu_cores = cpu_logical = 1; + ; /* nothing */ else if (cpu_vendor_id == CPU_VENDOR_AMD) topo_probe_amd(); - else if (cpu_vendor_id == CPU_VENDOR_INTEL) { - /* - * See Intel(R) 64 Architecture Processor - * Topology Enumeration article for details. - * - * Note that 0x1 <= cpu_high < 4 case should be - * compatible with topo_probe_0x4() logic when - * CPUID.1:EBX[23:16] > 0 (cpu_cores will be 1) - * or it should trigger the fallback otherwise. 
- */ - if (cpu_high >= 0xb) - topo_probe_0xb(); - else if (cpu_high >= 0x1) - topo_probe_0x4(); - } + else if (cpu_vendor_id == CPU_VENDOR_INTEL) + topo_probe_intel(); + + KASSERT(pkg_id_shift >= core_id_shift, + ("bug in APIC topology discovery")); + + nlayers = 0; + bzero(topo_layers, sizeof(topo_layers)); + + topo_layers[nlayers].type = TOPO_TYPE_PKG; + topo_layers[nlayers].id_shift = pkg_id_shift; + if (bootverbose) + printf("Package ID shift: %u\n", topo_layers[nlayers].id_shift); + nlayers++; /* - * Fallback: assume each logical CPU is in separate - * physical package. That is, no multi-core, no SMT. - */ - if (cpu_cores == 0 || cpu_logical == 0) - cpu_cores = cpu_logical = 1; + * Consider all caches to be within a package/chip + * and "in front" of all sub-components like + * cores and hardware threads. + */ + for (i = MAX_CACHE_LEVELS - 1; i >= 0; --i) { + if (caches[i].present) { + KASSERT(caches[i].id_shift <= pkg_id_shift, + ("bug in APIC topology discovery")); + KASSERT(caches[i].id_shift >= core_id_shift, + ("bug in APIC topology discovery")); + + topo_layers[nlayers].type = TOPO_TYPE_CACHE; + topo_layers[nlayers].subtype = i + 1; + topo_layers[nlayers].id_shift = caches[i].id_shift; + if (bootverbose) + printf("L%u cache ID shift: %u\n", + topo_layers[nlayers].subtype, + topo_layers[nlayers].id_shift); + nlayers++; + } + } + + if (pkg_id_shift > core_id_shift) { + topo_layers[nlayers].type = TOPO_TYPE_CORE; + topo_layers[nlayers].id_shift = core_id_shift; + if (bootverbose) + printf("Core ID shift: %u\n", + topo_layers[nlayers].id_shift); + nlayers++; + } + + topo_layers[nlayers].type = TOPO_TYPE_PU; + topo_layers[nlayers].id_shift = 0; + nlayers++; + + topo_init_root(&topo_root); + for (i = 0; i <= MAX_APIC_ID; ++i) { + if (!cpu_info[i].cpu_present) + continue; + + parent = &topo_root; + for (layer = 0; layer < nlayers; ++layer) { + node_id = i >> topo_layers[layer].id_shift; + parent = topo_add_node_by_hwid(parent, node_id, + 
topo_layers[layer].type, + topo_layers[layer].subtype); + } + } + + parent = &topo_root; + for (layer = 0; layer < nlayers; ++layer) { + node_id = boot_cpu_id >> topo_layers[layer].id_shift; + node = topo_find_node_by_hwid(parent, node_id, + topo_layers[layer].type, + topo_layers[layer].subtype); + topo_promote_child(node); + parent = node; + } + cpu_topo_probed = 1; } -struct cpu_group * -cpu_topo(void) +/* + * Assign logical CPU IDs to local APICs. + */ +void +assign_cpu_ids(void) { - int cg_flags; + struct topo_node *node; + u_int smt_mask; + + smt_mask = (1u << core_id_shift) - 1; /* - * Determine whether any threading flags are - * necessry. + * Assign CPU IDs to local APIC IDs and disable any CPUs + * beyond MAXCPU. CPU 0 is always assigned to the BSP. */ - topo_probe(); - if (cpu_logical > 1 && hyperthreading_cpus) - cg_flags = CG_FLAG_HTT; - else if (cpu_logical > 1) - cg_flags = CG_FLAG_SMT; + mp_ncpus = 0; + TOPO_FOREACH(node, &topo_root) { + if (node->type != TOPO_TYPE_PU) + continue; + + if ((node->hwid & smt_mask) != (boot_cpu_id & smt_mask)) + cpu_info[node->hwid].cpu_hyperthread = 1; + + if (resource_disabled("lapic", node->hwid)) { + if (node->hwid != boot_cpu_id) + cpu_info[node->hwid].cpu_disabled = 1; + else + printf("Cannot disable BSP, APIC ID = %d\n", + node->hwid); + } + + if (!hyperthreading_allowed && + cpu_info[node->hwid].cpu_hyperthread) + cpu_info[node->hwid].cpu_disabled = 1; + + if (mp_ncpus >= MAXCPU) + cpu_info[node->hwid].cpu_disabled = 1; + + if (cpu_info[node->hwid].cpu_disabled) { + disabled_cpus++; + continue; + } + + cpu_apic_ids[mp_ncpus] = node->hwid; + apic_cpuids[node->hwid] = mp_ncpus; + topo_set_pu_id(node, mp_ncpus); + mp_ncpus++; + } + + KASSERT(mp_maxid >= mp_ncpus - 1, + ("%s: counters out of sync: max %d, count %d", __func__, mp_maxid, + mp_ncpus)); +} + +/* + * Print various information about the SMP system hardware and setup. 
+ */ +void +cpu_mp_announce(void) +{ + struct topo_node *node; + const char *hyperthread; + int pkg_count; + int cores_per_pkg; + int thrs_per_core; + + printf("FreeBSD/SMP: "); + if (topo_analyze(&topo_root, 1, &pkg_count, + &cores_per_pkg, &thrs_per_core)) { + printf("%d package(s)", pkg_count); + if (cores_per_pkg > 0) + printf(" x %d core(s)", cores_per_pkg); + if (thrs_per_core > 1) + printf(" x %d hardware threads", thrs_per_core); + } else { + printf("Non-uniform topology"); + } + printf("\n"); + + if (disabled_cpus) { + printf("FreeBSD/SMP Online: "); + if (topo_analyze(&topo_root, 0, &pkg_count, + &cores_per_pkg, &thrs_per_core)) { + printf("%d package(s)", pkg_count); + if (cores_per_pkg > 0) + printf(" x %d core(s)", cores_per_pkg); + if (thrs_per_core > 1) + printf(" x %d hardware threads", thrs_per_core); + } else { + printf("Non-uniform topology"); + } + printf("\n"); + } + + if (!bootverbose) + return; + + TOPO_FOREACH(node, &topo_root) { + switch (node->type) { + case TOPO_TYPE_PKG: + printf("Package HW ID = %u (%#x)\n", + node->hwid, node->hwid); + break; + case TOPO_TYPE_CORE: + printf("\tCore HW ID = %u (%#x)\n", *** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?570296F3.8090307>