Date:      Tue, 27 Jul 2021 17:20:03 GMT
From:      Konstantin Belousov <kib@FreeBSD.org>
To:        src-committers@FreeBSD.org, dev-commits-src-all@FreeBSD.org, dev-commits-src-main@FreeBSD.org
Subject:   git: d6717f877872 - main - amd64: rework AP startup
Message-ID:  <202107271720.16RHK3i8013282@gitrepo.freebsd.org>

The branch main has been updated by kib:

URL: https://cgit.FreeBSD.org/src/commit/?id=d6717f877872e62d9df1e0ce2d8856620c993924

commit d6717f877872e62d9df1e0ce2d8856620c993924
Author:     Konstantin Belousov <kib@FreeBSD.org>
AuthorDate: 2021-07-10 19:38:42 +0000
Commit:     Konstantin Belousov <kib@FreeBSD.org>
CommitDate: 2021-07-27 17:11:15 +0000

    amd64: rework AP startup
    
    Stop using a temporary page table with a 1:1 mapping of the low 1G
    replicated over the whole VA.  Instead, use a 1:1 mapping of the low
    4G, temporarily installed in the normal kernel page table.
    
    The benefits are:
    - there is now one less step for the startup asm to perform
    - the startup code still needs to sit below 1M because the CPU starts
      in real mode.  But everything else can be located anywhere in the
      low 4G, because it is accessed from non-paged 32-bit protected mode.
      Note that the kernel page table root page is in the low 4G, as is
      the kernel itself.
    - the page table pages can be allocated by the normal allocator; there
      is no need to carve them out of the phys_avail segments very early
      in boot.  The allocation of the page for the startup code itself
      still requires some magic.  The pages are freed after the APs are
      ignited.
    - la57 startup for the APs is less tricky: we directly load the final
      page table and do not need to tweak the paging mode (see the sketch
      after this list).
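    
    In outline, start_all_aps() now builds the transient mapping as
    follows.  This is a condensed sketch of the diff below, with the la57
    case and the teardown omitted, and with the four explicit
    page-directory blocks of the committed code folded into a loop:
    
        /* Build a 1:1 map of the low 4G out of 2M superpages. */
        m_pdp = pmap_page_alloc_below_4g(true);
        v_pdp = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pdp));
        for (k = 0; k < 4; k++) {
                m_pd[k] = pmap_page_alloc_below_4g(false);
                v_pd = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pd[k]));
                /* 512 PDEs x 2M = 1G of identity mapping per PD page. */
                for (i = 0; i < NPDEPG; i++)
                        v_pd[i] = ((pd_entry_t)k * NBPDP +
                            ((pd_entry_t)i << PDRSHIFT)) | X86_PG_V |
                            X86_PG_RW | X86_PG_A | X86_PG_M | PG_PS;
                v_pdp[k] = VM_PAGE_TO_PHYS(m_pd[k]) | X86_PG_V |
                    X86_PG_RW | X86_PG_A | X86_PG_M;
        }
        /* Hang the PDP off slot 0 of the kernel page table root. */
        kernel_pmap->pm_pmltop[0] = VM_PAGE_TO_PHYS(m_pdp) | X86_PG_V |
            X86_PG_RW | X86_PG_A | X86_PG_M;
        pmap_invalidate_all(kernel_pmap);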
    
    Reviewed by:    markj
    Tested by:      pho
    Sponsored by:   The FreeBSD Foundation
    MFC after:      1 week
    Differential revision:  https://reviews.freebsd.org/D31121
---
 sys/amd64/amd64/machdep.c    |   2 +-
 sys/amd64/amd64/mp_machdep.c | 187 ++++++++++++++++---------------------------
 sys/amd64/amd64/mpboot.S     |  64 +++++++--------
 sys/amd64/include/smp.h      |   1 -
 sys/x86/x86/mp_x86.c         |   5 --
 5 files changed, 95 insertions(+), 164 deletions(-)

diff --git a/sys/amd64/amd64/machdep.c b/sys/amd64/amd64/machdep.c
index 1bd2c8c0afe0..49e245e1fdfe 100644
--- a/sys/amd64/amd64/machdep.c
+++ b/sys/amd64/amd64/machdep.c
@@ -1279,7 +1279,7 @@ getmemsize(caddr_t kmdp, u_int64_t first)
 	 * in real mode mode (e.g. SMP bare metal).
 	 */
 #ifdef SMP
-	mp_bootaddress(physmap, &physmap_idx);
+	alloc_ap_trampoline(physmap, &physmap_idx);
 #endif
 
 	/* call pmap initialization to make new kernel address space */
diff --git a/sys/amd64/amd64/mp_machdep.c b/sys/amd64/amd64/mp_machdep.c
index 886ea3734a3f..df0270c543e6 100644
--- a/sys/amd64/amd64/mp_machdep.c
+++ b/sys/amd64/amd64/mp_machdep.c
@@ -105,6 +105,7 @@ static char *nmi_stack;
 static char *dbg_stack;
 
 extern u_int mptramp_la57;
+extern u_int mptramp_nx;
 
 /*
  * Local data and functions.
@@ -112,86 +113,6 @@ extern u_int mptramp_la57;
 
 static int	start_ap(int apic_id);
 
-static bool
-is_kernel_paddr(vm_paddr_t pa)
-{
-
-	return (pa >= trunc_2mpage(btext - KERNBASE) &&
-	   pa < round_page(_end - KERNBASE));
-}
-
-static bool
-is_mpboot_good(vm_paddr_t start, vm_paddr_t end)
-{
-
-	return (start + AP_BOOTPT_SZ <= GiB(4) && atop(end) < Maxmem);
-}
-
-/*
- * Calculate usable address in base memory for AP trampoline code.
- */
-void
-mp_bootaddress(vm_paddr_t *physmap, unsigned int *physmap_idx)
-{
-	vm_paddr_t start, end;
-	unsigned int i;
-	bool allocated;
-
-	alloc_ap_trampoline(physmap, physmap_idx);
-
-	/*
-	 * Find a memory region big enough below the 4GB boundary to
-	 * store the initial page tables.  Region must be mapped by
-	 * the direct map.
-	 *
-	 * Note that it needs to be aligned to a page boundary.
-	 */
-	allocated = false;
-	for (i = *physmap_idx; i <= *physmap_idx; i -= 2) {
-		/*
-		 * First, try to chomp at the start of the physmap region.
-		 * Kernel binary might claim it already.
-		 */
-		start = round_page(physmap[i]);
-		end = start + AP_BOOTPT_SZ;
-		if (start < end && end <= physmap[i + 1] &&
-		    is_mpboot_good(start, end) &&
-		    !is_kernel_paddr(start) && !is_kernel_paddr(end - 1)) {
-			allocated = true;
-			physmap[i] = end;
-			break;
-		}
-
-		/*
-		 * Second, try to chomp at the end.  Again, check
-		 * against kernel.
-		 */
-		end = trunc_page(physmap[i + 1]);
-		start = end - AP_BOOTPT_SZ;
-		if (start < end && start >= physmap[i] &&
-		    is_mpboot_good(start, end) &&
-		    !is_kernel_paddr(start) && !is_kernel_paddr(end - 1)) {
-			allocated = true;
-			physmap[i + 1] = start;
-			break;
-		}
-	}
-	if (allocated) {
-		mptramp_pagetables = start;
-		if (physmap[i] == physmap[i + 1] && *physmap_idx != 0) {
-			memmove(&physmap[i], &physmap[i + 2],
-			    sizeof(*physmap) * (*physmap_idx - i + 2));
-			*physmap_idx -= 2;
-		}
-	} else {
-		mptramp_pagetables = trunc_page(boot_address) - AP_BOOTPT_SZ;
-		if (bootverbose)
-			printf(
-"Cannot find enough space for the initial AP page tables, placing them at %#x",
-			    mptramp_pagetables);
-	}
-}
-
 /*
  * Initialize the IPI handlers and start up the AP's.
  */
@@ -243,6 +164,9 @@ cpu_mp_start(void)
 	assign_cpu_ids();
 
 	mptramp_la57 = la57;
+	mptramp_nx = pg_nx != 0;
+	MPASS(kernel_pmap->pm_cr3 < (1UL << 32));
+	mptramp_pagetables = kernel_pmap->pm_cr3;
 
 	/* Start each Application Processor */
 	start_all_aps();
@@ -399,55 +323,67 @@ mp_realloc_pcpu(int cpuid, int domain)
 int
 start_all_aps(void)
 {
-	u_int64_t *pt5, *pt4, *pt3, *pt2;
+	vm_page_t m_pml4, m_pdp, m_pd[4];
+	pml5_entry_t old_pml45;
+	pml4_entry_t *v_pml4;
+	pdp_entry_t *v_pdp;
+	pd_entry_t *v_pd;
 	u_int32_t mpbioswarmvec;
-	int apic_id, cpu, domain, i, xo;
+	int apic_id, cpu, domain, i;
 	u_char mpbiosreason;
 
 	mtx_init(&ap_boot_mtx, "ap boot", NULL, MTX_SPIN);
 
-	/* copy the AP 1st level boot code */
-	bcopy(mptramp_start, (void *)PHYS_TO_DMAP(boot_address), bootMP_size);
-
-	/* Locate the page tables, they'll be below the trampoline */
+	/* Create a transient 1:1 mapping of low 4G */
 	if (la57) {
-		pt5 = (uint64_t *)PHYS_TO_DMAP(mptramp_pagetables);
-		xo = 1;
+		m_pml4 = pmap_page_alloc_below_4g(true);
+		v_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pml4));
 	} else {
-		xo = 0;
+		v_pml4 = &kernel_pmap->pm_pmltop[0];
 	}
-	pt4 = (uint64_t *)PHYS_TO_DMAP(mptramp_pagetables + xo * PAGE_SIZE);
-	pt3 = pt4 + (PAGE_SIZE) / sizeof(u_int64_t);
-	pt2 = pt3 + (PAGE_SIZE) / sizeof(u_int64_t);
-
-	/* Create the initial 1GB replicated page tables */
-	for (i = 0; i < 512; i++) {
-		if (la57) {
-			pt5[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables +
-			    PAGE_SIZE);
-			pt5[i] |= PG_V | PG_RW | PG_U;
-		}
-
-		/*
-		 * Each slot of the level 4 pages points to the same
-		 * level 3 page.
-		 */
-		pt4[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables +
-		    (xo + 1) * PAGE_SIZE);
-		pt4[i] |= PG_V | PG_RW | PG_U;
-
-		/*
-		 * Each slot of the level 3 pages points to the same
-		 * level 2 page.
-		 */
-		pt3[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables +
-		    ((xo + 2) * PAGE_SIZE));
-		pt3[i] |= PG_V | PG_RW | PG_U;
-
-		/* The level 2 page slots are mapped with 2MB pages for 1GB. */
-		pt2[i] = i * (2 * 1024 * 1024);
-		pt2[i] |= PG_V | PG_RW | PG_PS | PG_U;
+	m_pdp = pmap_page_alloc_below_4g(true);
+	v_pdp = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pdp));
+	m_pd[0] = pmap_page_alloc_below_4g(false);
+	v_pd = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pd[0]));
+	for (i = 0; i < NPDEPG; i++)
+		v_pd[i] = (i << PDRSHIFT) | X86_PG_V | X86_PG_RW | X86_PG_A |
+		    X86_PG_M | PG_PS;
+	m_pd[1] = pmap_page_alloc_below_4g(false);
+	v_pd = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pd[1]));
+	for (i = 0; i < NPDEPG; i++)
+		v_pd[i] = (NBPDP + (i << PDRSHIFT)) | X86_PG_V | X86_PG_RW |
+		    X86_PG_A | X86_PG_M | PG_PS;
+	m_pd[2] = pmap_page_alloc_below_4g(false);
+	v_pd = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pd[2]));
+	for (i = 0; i < NPDEPG; i++)
+		v_pd[i] = (2UL * NBPDP + (i << PDRSHIFT)) | X86_PG_V |
+		    X86_PG_RW | X86_PG_A | X86_PG_M | PG_PS;
+	m_pd[3] = pmap_page_alloc_below_4g(false);
+	v_pd = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pd[3]));
+	for (i = 0; i < NPDEPG; i++)
+		v_pd[i] = (3UL * NBPDP + (i << PDRSHIFT)) | X86_PG_V |
+		    X86_PG_RW | X86_PG_A | X86_PG_M | PG_PS;
+	v_pdp[0] = VM_PAGE_TO_PHYS(m_pd[0]) | X86_PG_V |
+	    X86_PG_RW | X86_PG_A | X86_PG_M;
+	v_pdp[1] = VM_PAGE_TO_PHYS(m_pd[1]) | X86_PG_V |
+	    X86_PG_RW | X86_PG_A | X86_PG_M;
+	v_pdp[2] = VM_PAGE_TO_PHYS(m_pd[2]) | X86_PG_V |
+	    X86_PG_RW | X86_PG_A | X86_PG_M;
+	v_pdp[3] = VM_PAGE_TO_PHYS(m_pd[3]) | X86_PG_V |
+	    X86_PG_RW | X86_PG_A | X86_PG_M;
+	old_pml45 = kernel_pmap->pm_pmltop[0];
+	if (la57) {
+		kernel_pmap->pm_pmltop[0] = VM_PAGE_TO_PHYS(m_pml4) |
+		    X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M;
 	}
+	v_pml4[0] = VM_PAGE_TO_PHYS(m_pdp) | X86_PG_V |
+	    X86_PG_RW | X86_PG_A | X86_PG_M;
+	pmap_invalidate_all(kernel_pmap);
+
+	/* copy the AP 1st level boot code */
+	bcopy(mptramp_start, (void *)PHYS_TO_DMAP(boot_address), bootMP_size);
+	if (bootverbose)
+		printf("AP boot address %#x\n", boot_address);
 
 	/* save the current value of the warm-start vector */
 	if (!efi_boot)
@@ -515,6 +451,17 @@ start_all_aps(void)
 	outb(CMOS_REG, BIOS_RESET);
 	outb(CMOS_DATA, mpbiosreason);
 
+	/* Destroy transient 1:1 mapping */
+	kernel_pmap->pm_pmltop[0] = old_pml45;
+	invlpg(0);
+	if (la57)
+		vm_page_free(m_pml4);
+	vm_page_free(m_pd[3]);
+	vm_page_free(m_pd[2]);
+	vm_page_free(m_pd[1]);
+	vm_page_free(m_pd[0]);
+	vm_page_free(m_pdp);
+
 	/* number of APs actually started */
 	return (mp_naps);
 }
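
A note on the new MPASS in cpu_mp_start() above: the trampoline loads
%cr3 while the AP is still in 32-bit protected mode, where only the low
32 bits of the register can be set, so the root of the kernel page
table must have been allocated below 4G.  With an explanatory comment
added, the assertion from the hunk above reads:

	/*
	 * Only a 32-bit %cr3 can be loaded from the trampoline, so
	 * the kernel page table root must reside in the low 4G.
	 */
	MPASS(kernel_pmap->pm_cr3 < (1UL << 32));
	mptramp_pagetables = kernel_pmap->pm_cr3;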
diff --git a/sys/amd64/amd64/mpboot.S b/sys/amd64/amd64/mpboot.S
index e525102b5d3d..b6c599a2aff3 100644
--- a/sys/amd64/amd64/mpboot.S
+++ b/sys/amd64/amd64/mpboot.S
@@ -95,12 +95,25 @@ protmode:
 	 * is later enabled.
 	 */
 	mov	%cr4, %eax
-	orl	$CR4_PAE, %eax
+	orl	$(CR4_PAE | CR4_PGE), %eax
 	cmpb	$0, mptramp_la57-mptramp_start(%ebx)
 	je	1f
 	orl	$CR4_LA57, %eax
 1:	mov	%eax, %cr4
 
+	/*
+	 * If the BSP reported NXE support, enable EFER.NXE for all APs
+	 * prior to loading %cr3. This avoids page faults if the AP
+	 * encounters memory marked with the NX bit prior to detecting and
+	 * enabling NXE support.
+	 */
+	cmpb	$0,mptramp_nx-mptramp_start(%ebx)
+	je	2f
+	movl	$MSR_EFER, %ecx
+	rdmsr
+	orl	$EFER_NXE, %eax
+	wrmsr
+2:
 	/*
 	 * Enable EFER.LME so that we get long mode when all the prereqs are
 	 * in place.  In this case, it turns on when CR0_PG is finally enabled.
@@ -112,12 +125,13 @@ protmode:
 	wrmsr
 
 	/*
-	 * Point to the embedded page tables for startup.  Note that this
-	 * only gets accessed after we're actually in 64 bit mode, however
-	 * we can only set the bottom 32 bits of %cr3 in this state.  This
-	 * means we are required to use a temporary page table that is below
-	 * the 4GB limit.  %ebx is still our relocation base.  We could just
-	 * subtract 3 * PAGE_SIZE, but that would be too easy.
+	 * Load kernel page table pointer into %cr3.
+	 * %ebx is still our relocation base.
+	 *
+	 * Note that this only gets accessed after we're actually in 64 bit
+	 * mode, however we can only set the bottom 32 bits of %cr3 in this
+	 * state.  This means we depend on the kernel page table being
+	 * allocated from the low 4G.
 	 */
 	leal	mptramp_pagetables-mptramp_start(%ebx),%eax
 	movl	(%eax), %eax
@@ -155,10 +169,8 @@ jmp_64:
 	/*
 	 * Yeehar!  We're running in 64 bit mode!  We can mostly ignore our
 	 * segment registers, and get on with it.
-	 * Note that we are running at the correct virtual address, but with
-	 * a 1:1 1GB mirrored mapping over entire address space.  We had better
-	 * switch to a real %cr3 promptly so that we can get to the direct map
-	 * space. Remember that jmp is relative and that we've been relocated,
+	 * We are running at the correct virtual address space.
+	 * Note that the jmp is relative and that we've been relocated,
 	 * so use an indirect jump.
 	 */
 	.code64
@@ -220,6 +232,10 @@ mptramp_pagetables:
 mptramp_la57:
 	.long	0
 
+	.globl	mptramp_nx
+mptramp_nx:
+	.long	0
+
 	/*
 	 * The pseudo descriptor for lgdt to use.
 	 */
@@ -243,31 +259,5 @@ bootMP_size:
 	.code64
 	.p2align 4,0
 entry_64:
-	/*
-	 * If the BSP reported NXE support, enable EFER.NXE for all APs
-	 * prior to loading %cr3. This avoids page faults if the AP
-	 * encounters memory marked with the NX bit prior to detecting and
-	 * enabling NXE support.
-	 */
-	movq	pg_nx, %rbx
-	testq	%rbx, %rbx
-	je	1f
-	movl	$MSR_EFER, %ecx
-	rdmsr
-	orl	$EFER_NXE, %eax
-	wrmsr
-
-1:
-	/*
-	 * Load a real %cr3 that has all the direct map stuff and switches
-	 * off the 1GB replicated mirror.  Load a stack pointer and jump
-	 * into AP startup code in C.
-	*/
-	cmpl	$0, la57
-	jne	2f
-	movq	KPML4phys, %rax
-	jmp	3f
-2:	movq	KPML5phys, %rax
-3:	movq	%rax, %cr3
 	movq	bootSTK, %rsp
 	jmp	init_secondary
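
With this change, the NXE enable moves from the 64-bit entry point into
32-bit protected mode, before %cr3 is loaded.  In C terms, the
trampoline now performs the equivalent of the following sketch (the
committed code is the assembly above):

	/*
	 * Turn on no-execute before paging is enabled, so that the AP
	 * never runs with paging on but NXE off, and thus cannot fault
	 * on NX-marked kernel mappings.
	 */
	if (mptramp_nx != 0)
		wrmsr(MSR_EFER, rdmsr(MSR_EFER) | EFER_NXE);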
diff --git a/sys/amd64/include/smp.h b/sys/amd64/include/smp.h
index ac9ed5f61a23..cb8bbdcd7260 100644
--- a/sys/amd64/include/smp.h
+++ b/sys/amd64/include/smp.h
@@ -39,7 +39,6 @@ inthand_t
 
 void	invlop_handler(void);
 int	start_all_aps(void);
-void	mp_bootaddress(vm_paddr_t *, unsigned int *);
 
 #endif /* !LOCORE */
 #endif /* SMP */
diff --git a/sys/x86/x86/mp_x86.c b/sys/x86/x86/mp_x86.c
index 1f22b3a7886a..c98ac12a7c3f 100644
--- a/sys/x86/x86/mp_x86.c
+++ b/sys/x86/x86/mp_x86.c
@@ -1065,11 +1065,6 @@ init_secondary_tail(void)
 	}
 
 #ifdef __amd64__
-	/*
-	 * Enable global pages TLB extension
-	 * This also implicitly flushes the TLB 
-	 */
-	load_cr4(rcr4() | CR4_PGE);
 	if (pmap_pcid_enabled)
 		load_cr4(rcr4() | CR4_PCIDE);
 	load_ds(_udatasel);
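
The CR4_PGE enable dropped from init_secondary_tail() is now redundant:
the trampoline sets PGE together with PAE (the "orl $(CR4_PAE |
CR4_PGE)" hunk above), so each AP arrives here with global pages
already enabled.  For reference, the removed code amounted to:

	/* Now done in mpboot.S, before paging is enabled. */
	load_cr4(rcr4() | CR4_PGE);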


