Date: Mon, 23 Aug 2021 23:22:19 GMT
From: Konstantin Belousov <kib@FreeBSD.org>
To: src-committers@FreeBSD.org, dev-commits-src-all@FreeBSD.org, dev-commits-src-branches@FreeBSD.org
Subject: git: c946f699856f - stable/13 - amd64: rework AP startup
Message-ID: <202108232322.17NNMJ7r094581@gitrepo.freebsd.org>
The branch stable/13 has been updated by kib:

URL: https://cgit.FreeBSD.org/src/commit/?id=c946f699856f6737a5256d7c9f746ac8035339ee

commit c946f699856f6737a5256d7c9f746ac8035339ee
Author:     Konstantin Belousov <kib@FreeBSD.org>
AuthorDate: 2021-07-10 19:38:42 +0000
Commit:     Konstantin Belousov <kib@FreeBSD.org>
CommitDate: 2021-08-23 23:21:12 +0000

    amd64: rework AP startup

    (cherry picked from commit d6717f877872e62d9df1e0ce2d8856620c993924)
---
 sys/amd64/amd64/machdep.c    |   4 +-
 sys/amd64/amd64/mp_machdep.c | 187 ++++++++++++++++---------------------------
 sys/amd64/amd64/mpboot.S     |  64 +++++++--------
 sys/amd64/include/smp.h      |   3 +-
 sys/x86/x86/mp_x86.c         |   5 --
 sys/x86/xen/pv.c             |   1 -
 6 files changed, 96 insertions(+), 168 deletions(-)

diff --git a/sys/amd64/amd64/machdep.c b/sys/amd64/amd64/machdep.c
index 93030cbe7126..840570be534a 100644
--- a/sys/amd64/amd64/machdep.c
+++ b/sys/amd64/amd64/machdep.c
@@ -187,7 +187,6 @@ struct init_ops init_ops = {
 	.early_delay =			i8254_delay,
 	.parse_memmap =			native_parse_memmap,
 #ifdef SMP
-	.mp_bootaddress =		mp_bootaddress,
 	.start_all_aps =		native_start_all_aps,
 #endif
 #ifdef DEV_PCI
@@ -1288,8 +1287,7 @@ getmemsize(caddr_t kmdp, u_int64_t first)
 	 * is configured to support APs and APs for the system start
 	 * in real mode mode (e.g. SMP bare metal).
 	 */
-	if (init_ops.mp_bootaddress)
-		init_ops.mp_bootaddress(physmap, &physmap_idx);
+	alloc_ap_trampoline(physmap, &physmap_idx);
 
 	/* call pmap initialization to make new kernel address space */
 	pmap_bootstrap(&first);
diff --git a/sys/amd64/amd64/mp_machdep.c b/sys/amd64/amd64/mp_machdep.c
index d1064262891f..082a58ada48f 100644
--- a/sys/amd64/amd64/mp_machdep.c
+++ b/sys/amd64/amd64/mp_machdep.c
@@ -106,6 +106,7 @@ char *dbg_stack;
 void *bootpcpu;
 
 extern u_int mptramp_la57;
+extern u_int mptramp_nx;
 
 /*
  * Local data and functions.
@@ -113,86 +114,6 @@ extern u_int mptramp_la57;
 
 static int	start_ap(int apic_id);
 
-static bool
-is_kernel_paddr(vm_paddr_t pa)
-{
-
-	return (pa >= trunc_2mpage(btext - KERNBASE) &&
-	    pa < round_page(_end - KERNBASE));
-}
-
-static bool
-is_mpboot_good(vm_paddr_t start, vm_paddr_t end)
-{
-
-	return (start + AP_BOOTPT_SZ <= GiB(4) && atop(end) < Maxmem);
-}
-
-/*
- * Calculate usable address in base memory for AP trampoline code.
- */
-void
-mp_bootaddress(vm_paddr_t *physmap, unsigned int *physmap_idx)
-{
-	vm_paddr_t start, end;
-	unsigned int i;
-	bool allocated;
-
-	alloc_ap_trampoline(physmap, physmap_idx);
-
-	/*
-	 * Find a memory region big enough below the 4GB boundary to
-	 * store the initial page tables.  Region must be mapped by
-	 * the direct map.
-	 *
-	 * Note that it needs to be aligned to a page boundary.
-	 */
-	allocated = false;
-	for (i = *physmap_idx; i <= *physmap_idx; i -= 2) {
-		/*
-		 * First, try to chomp at the start of the physmap region.
-		 * Kernel binary might claim it already.
-		 */
-		start = round_page(physmap[i]);
-		end = start + AP_BOOTPT_SZ;
-		if (start < end && end <= physmap[i + 1] &&
-		    is_mpboot_good(start, end) &&
-		    !is_kernel_paddr(start) && !is_kernel_paddr(end - 1)) {
-			allocated = true;
-			physmap[i] = end;
-			break;
-		}
-
-		/*
-		 * Second, try to chomp at the end.  Again, check
-		 * against kernel.
-		 */
-		end = trunc_page(physmap[i + 1]);
-		start = end - AP_BOOTPT_SZ;
-		if (start < end && start >= physmap[i] &&
-		    is_mpboot_good(start, end) &&
-		    !is_kernel_paddr(start) && !is_kernel_paddr(end - 1)) {
-			allocated = true;
-			physmap[i + 1] = start;
-			break;
-		}
-	}
-	if (allocated) {
-		mptramp_pagetables = start;
-		if (physmap[i] == physmap[i + 1] && *physmap_idx != 0) {
-			memmove(&physmap[i], &physmap[i + 2],
-			    sizeof(*physmap) * (*physmap_idx - i + 2));
-			*physmap_idx -= 2;
-		}
-	} else {
-		mptramp_pagetables = trunc_page(boot_address) - AP_BOOTPT_SZ;
-		if (bootverbose)
-			printf(
-"Cannot find enough space for the initial AP page tables, placing them at %#x",
-			    mptramp_pagetables);
-	}
-}
-
 /*
  * Initialize the IPI handlers and start up the AP's.
  */
@@ -244,6 +165,9 @@ cpu_mp_start(void)
 	assign_cpu_ids();
 
 	mptramp_la57 = la57;
+	mptramp_nx = pg_nx != 0;
+	MPASS(kernel_pmap->pm_cr3 < (1UL << 32));
+	mptramp_pagetables = kernel_pmap->pm_cr3;
 
 	/* Start each Application Processor */
 	init_ops.start_all_aps();
@@ -398,55 +322,67 @@ mp_realloc_pcpu(int cpuid, int domain)
 int
 native_start_all_aps(void)
 {
-	u_int64_t *pt5, *pt4, *pt3, *pt2;
+	vm_page_t m_pml4, m_pdp, m_pd[4];
+	pml5_entry_t old_pml45;
+	pml4_entry_t *v_pml4;
+	pdp_entry_t *v_pdp;
+	pd_entry_t *v_pd;
 	u_int32_t mpbioswarmvec;
-	int apic_id, cpu, domain, i, xo;
+	int apic_id, cpu, domain, i;
 	u_char mpbiosreason;
 
 	mtx_init(&ap_boot_mtx, "ap boot", NULL, MTX_SPIN);
 
-	/* copy the AP 1st level boot code */
-	bcopy(mptramp_start, (void *)PHYS_TO_DMAP(boot_address), bootMP_size);
-
-	/* Locate the page tables, they'll be below the trampoline */
+	/* Create a transient 1:1 mapping of low 4G */
 	if (la57) {
-		pt5 = (uint64_t *)PHYS_TO_DMAP(mptramp_pagetables);
-		xo = 1;
+		m_pml4 = pmap_page_alloc_below_4g(true);
+		v_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pml4));
 	} else {
-		xo = 0;
+		v_pml4 = &kernel_pmap->pm_pmltop[0];
 	}
-	pt4 = (uint64_t *)PHYS_TO_DMAP(mptramp_pagetables + xo * PAGE_SIZE);
-	pt3 = pt4 + (PAGE_SIZE) / sizeof(u_int64_t);
-	pt2 = pt3 + (PAGE_SIZE) / sizeof(u_int64_t);
-
-	/* Create the initial 1GB replicated page tables */
-	for (i = 0; i < 512; i++) {
-		if (la57) {
-			pt5[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables +
-			    PAGE_SIZE);
-			pt5[i] |= PG_V | PG_RW | PG_U;
-		}
-
-		/*
-		 * Each slot of the level 4 pages points to the same
-		 * level 3 page.
-		 */
-		pt4[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables +
-		    (xo + 1) * PAGE_SIZE);
-		pt4[i] |= PG_V | PG_RW | PG_U;
-
-		/*
-		 * Each slot of the level 3 pages points to the same
-		 * level 2 page.
-		 */
-		pt3[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables +
-		    ((xo + 2) * PAGE_SIZE));
-		pt3[i] |= PG_V | PG_RW | PG_U;
-
-		/* The level 2 page slots are mapped with 2MB pages for 1GB. */
-		pt2[i] = i * (2 * 1024 * 1024);
-		pt2[i] |= PG_V | PG_RW | PG_PS | PG_U;
+	m_pdp = pmap_page_alloc_below_4g(true);
+	v_pdp = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pdp));
+	m_pd[0] = pmap_page_alloc_below_4g(false);
+	v_pd = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pd[0]));
+	for (i = 0; i < NPDEPG; i++)
+		v_pd[i] = (i << PDRSHIFT) | X86_PG_V | X86_PG_RW | X86_PG_A |
+		    X86_PG_M | PG_PS;
+	m_pd[1] = pmap_page_alloc_below_4g(false);
+	v_pd = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pd[1]));
+	for (i = 0; i < NPDEPG; i++)
+		v_pd[i] = (NBPDP + (i << PDRSHIFT)) | X86_PG_V | X86_PG_RW |
+		    X86_PG_A | X86_PG_M | PG_PS;
+	m_pd[2] = pmap_page_alloc_below_4g(false);
+	v_pd = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pd[2]));
+	for (i = 0; i < NPDEPG; i++)
+		v_pd[i] = (2UL * NBPDP + (i << PDRSHIFT)) | X86_PG_V |
+		    X86_PG_RW | X86_PG_A | X86_PG_M | PG_PS;
+	m_pd[3] = pmap_page_alloc_below_4g(false);
+	v_pd = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pd[3]));
+	for (i = 0; i < NPDEPG; i++)
+		v_pd[i] = (3UL * NBPDP + (i << PDRSHIFT)) | X86_PG_V |
+		    X86_PG_RW | X86_PG_A | X86_PG_M | PG_PS;
+	v_pdp[0] = VM_PAGE_TO_PHYS(m_pd[0]) | X86_PG_V |
+	    X86_PG_RW | X86_PG_A | X86_PG_M;
+	v_pdp[1] = VM_PAGE_TO_PHYS(m_pd[1]) | X86_PG_V |
+	    X86_PG_RW | X86_PG_A | X86_PG_M;
+	v_pdp[2] = VM_PAGE_TO_PHYS(m_pd[2]) | X86_PG_V |
+	    X86_PG_RW | X86_PG_A | X86_PG_M;
+	v_pdp[3] = VM_PAGE_TO_PHYS(m_pd[3]) | X86_PG_V |
+	    X86_PG_RW | X86_PG_A | X86_PG_M;
+	old_pml45 = kernel_pmap->pm_pmltop[0];
+	if (la57) {
+		kernel_pmap->pm_pmltop[0] = VM_PAGE_TO_PHYS(m_pml4) |
+		    X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M;
 	}
+	v_pml4[0] = VM_PAGE_TO_PHYS(m_pdp) | X86_PG_V |
+	    X86_PG_RW | X86_PG_A | X86_PG_M;
+	pmap_invalidate_all(kernel_pmap);
+
+	/* copy the AP 1st level boot code */
+	bcopy(mptramp_start, (void *)PHYS_TO_DMAP(boot_address), bootMP_size);
+	if (bootverbose)
+		printf("AP boot address %#x\n", boot_address);
 
 	/* save the current value of the warm-start vector */
 	if (!efi_boot)
@@ -517,6 +453,17 @@ native_start_all_aps(void)
 	outb(CMOS_REG, BIOS_RESET);
 	outb(CMOS_DATA, mpbiosreason);
 
+	/* Destroy transient 1:1 mapping */
+	kernel_pmap->pm_pmltop[0] = old_pml45;
+	invlpg(0);
+	if (la57)
+		vm_page_free(m_pml4);
+	vm_page_free(m_pd[3]);
+	vm_page_free(m_pd[2]);
+	vm_page_free(m_pd[1]);
+	vm_page_free(m_pd[0]);
+	vm_page_free(m_pdp);
+
 	/* number of APs actually started */
 	return (mp_naps);
 }
diff --git a/sys/amd64/amd64/mpboot.S b/sys/amd64/amd64/mpboot.S
index afdcffa573a4..1b5657d3bef8 100644
--- a/sys/amd64/amd64/mpboot.S
+++ b/sys/amd64/amd64/mpboot.S
@@ -95,12 +95,25 @@ protmode:
 	 * is later enabled.
 	 */
 	mov	%cr4, %eax
-	orl	$CR4_PAE, %eax
+	orl	$(CR4_PAE | CR4_PGE), %eax
 	cmpb	$0, mptramp_la57-mptramp_start(%ebx)
 	je	1f
 	orl	$CR4_LA57, %eax
 1:	mov	%eax, %cr4
 
+	/*
+	 * If the BSP reported NXE support, enable EFER.NXE for all APs
+	 * prior to loading %cr3.  This avoids page faults if the AP
+	 * encounters memory marked with the NX bit prior to detecting and
+	 * enabling NXE support.
+	 */
+	cmpb	$0,mptramp_nx-mptramp_start(%ebx)
+	je	2f
+	movl	$MSR_EFER, %ecx
+	rdmsr
+	orl	$EFER_NXE, %eax
+	wrmsr
+2:
 	/*
 	 * Enable EFER.LME so that we get long mode when all the prereqs are
 	 * in place.  In this case, it turns on when CR0_PG is finally enabled.
@@ -112,12 +125,13 @@ protmode:
 	wrmsr
 
 	/*
-	 * Point to the embedded page tables for startup.  Note that this
-	 * only gets accessed after we're actually in 64 bit mode, however
-	 * we can only set the bottom 32 bits of %cr3 in this state.  This
-	 * means we are required to use a temporary page table that is below
-	 * the 4GB limit.  %ebx is still our relocation base.  We could just
-	 * subtract 3 * PAGE_SIZE, but that would be too easy.
+	 * Load kernel page table pointer into %cr3.
+	 * %ebx is still our relocation base.
+	 *
+	 * Note that this only gets accessed after we're actually in 64 bit
+	 * mode, however we can only set the bottom 32 bits of %cr3 in this
+	 * state.  This means we depend on the kernel page table being
+	 * allocated from the low 4G.
 	 */
 	leal	mptramp_pagetables-mptramp_start(%ebx),%eax
 	movl	(%eax), %eax
@@ -155,10 +169,8 @@ jmp_64:
 	/*
 	 * Yeehar!  We're running in 64 bit mode!  We can mostly ignore our
 	 * segment registers, and get on with it.
-	 * Note that we are running at the correct virtual address, but with
-	 * a 1:1 1GB mirrored mapping over entire address space.  We had better
-	 * switch to a real %cr3 promptly so that we can get to the direct map
-	 * space. Remember that jmp is relative and that we've been relocated,
+	 * We are running at the correct virtual address space.
+	 * Note that the jmp is relative and that we've been relocated,
 	 * so use an indirect jump.
 	 */
 	.code64
@@ -220,6 +232,10 @@ mptramp_pagetables:
 mptramp_la57:
 	.long	0
 
+	.globl	mptramp_nx
+mptramp_nx:
+	.long	0
+
 /*
  * The pseudo descriptor for lgdt to use.
  */
@@ -243,32 +259,6 @@ bootMP_size:
 	.code64
 	.p2align 4,0
 entry_64:
-	/*
-	 * If the BSP reported NXE support, enable EFER.NXE for all APs
-	 * prior to loading %cr3.  This avoids page faults if the AP
-	 * encounters memory marked with the NX bit prior to detecting and
-	 * enabling NXE support.
-	 */
-	movq	pg_nx, %rbx
-	testq	%rbx, %rbx
-	je	1f
-	movl	$MSR_EFER, %ecx
-	rdmsr
-	orl	$EFER_NXE, %eax
-	wrmsr
-
-1:
-	/*
-	 * Load a real %cr3 that has all the direct map stuff and switches
-	 * off the 1GB replicated mirror.  Load a stack pointer and jump
-	 * into AP startup code in C.
-	 */
-	cmpl	$0, la57
-	jne	2f
-	movq	KPML4phys, %rax
-	jmp	3f
-2:	movq	KPML5phys, %rax
-3:	movq	%rax, %cr3
 	movq	bootSTK, %rsp
 
 	/*
diff --git a/sys/amd64/include/smp.h b/sys/amd64/include/smp.h
index 8fbd89da0e57..84ee73cef723 100644
--- a/sys/amd64/include/smp.h
+++ b/sys/amd64/include/smp.h
@@ -38,8 +38,7 @@ inthand_t
 	IDTVEC(rendezvous_pti);
 
 void	invlop_handler(void);
-int	native_start_all_aps(void);
-void	mp_bootaddress(vm_paddr_t *, unsigned int *);
+int	native_start_all_aps(void);
 
 #endif /* !LOCORE */
 #endif /* SMP */
diff --git a/sys/x86/x86/mp_x86.c b/sys/x86/x86/mp_x86.c
index f1c1e45e79b8..441a766f87fb 100644
--- a/sys/x86/x86/mp_x86.c
+++ b/sys/x86/x86/mp_x86.c
@@ -1070,11 +1070,6 @@ init_secondary_tail(void)
 	}
 
 #ifdef __amd64__
-	/*
-	 * Enable global pages TLB extension
-	 * This also implicitly flushes the TLB
-	 */
-	load_cr4(rcr4() | CR4_PGE);
 	if (pmap_pcid_enabled)
 		load_cr4(rcr4() | CR4_PCIDE);
 	load_ds(_udatasel);
diff --git a/sys/x86/xen/pv.c b/sys/x86/xen/pv.c
index 2fd698772f9d..59c5b464aace 100644
--- a/sys/x86/xen/pv.c
+++ b/sys/x86/xen/pv.c
@@ -134,7 +134,6 @@ struct init_ops xen_pvh_init_ops = {
 	.early_delay			= xen_delay,
 	.parse_memmap			= xen_pvh_parse_memmap,
 #ifdef SMP
-	.mp_bootaddress			= mp_bootaddress,
 	.start_all_aps			= native_start_all_aps,
 #endif
 	.msi_init			= msi_init,
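The heart of the change is the transient identity map built in
native_start_all_aps(): four page-directory pages, each holding 512
entries that map 2MB apiece, together cover exactly the low 4G
(512 * 2MB = 1G per page, times four pages).  The stand-alone sketch
below restates that arithmetic outside the kernel; the constants mirror
the x86 definitions used in the hunk, but pte_t and fill_identity_pd()
are illustrative names, not FreeBSD APIs.

#include <stdint.h>

#define NPDEPG		512		/* page-directory entries per page */
#define PDRSHIFT	21		/* log2 of a 2MB superpage */
#define NBPDP		(1ULL << 30)	/* bytes mapped by one PDP entry: 1G */

#define PG_V		0x001		/* valid */
#define PG_RW		0x002		/* writable */
#define PG_A		0x020		/* accessed */
#define PG_M		0x040		/* modified */
#define PG_PS		0x080		/* 2MB superpage */

typedef uint64_t pte_t;

/*
 * Fill page-directory page 'n' (0..3) so that its 512 entries
 * identity-map [n * 1G, (n + 1) * 1G) with 2MB superpages, mirroring
 * the four v_pd loops in the mp_machdep.c hunk above.
 */
static void
fill_identity_pd(pte_t pd[NPDEPG], unsigned n)
{
	unsigned i;

	for (i = 0; i < NPDEPG; i++)
		pd[i] = ((uint64_t)n * NBPDP + ((uint64_t)i << PDRSHIFT)) |
		    PG_V | PG_RW | PG_A | PG_M | PG_PS;
}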
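The mpboot.S hunks also move EFER.NXE enablement from the 64-bit
entry_64 path into 32-bit protected mode, before %cr3 is loaded, so an
AP cannot take a reserved-bit page fault on an NX-marked page table
entry before it has turned NX support on.  A C-flavored restatement of
that ordering follows; rdmsr/wrmsr/load_cr3 here are toy stand-ins for
the raw instructions, and only MSR_EFER and EFER_NXE are the
architectural values.

#include <stdint.h>

#define MSR_EFER	0xc0000080	/* extended feature enable register */
#define EFER_NXE	(1ULL << 11)	/* no-execute enable */

/* Toy machine state standing in for the real MSR and %cr3. */
static uint64_t fake_efer;
static uint64_t fake_cr3;

static uint64_t rdmsr(uint32_t msr) { (void)msr; return (fake_efer); }
static void wrmsr(uint32_t msr, uint64_t v) { (void)msr; fake_efer = v; }
static void load_cr3(uint64_t pa) { fake_cr3 = pa; }

/*
 * The trampoline's new ordering: NXE first, then %cr3, so that once
 * paging is on, NX-marked PTEs are legal rather than faulting.
 */
static void
ap_enable_paging(int bsp_has_nx, uint64_t kernel_cr3)
{
	if (bsp_has_nx)
		wrmsr(MSR_EFER, rdmsr(MSR_EFER) | EFER_NXE);
	load_cr3(kernel_cr3);	/* must be a sub-4G physical address */
}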