Date:      Sun, 23 Aug 2020 20:19:05 +0000 (UTC)
From:      Konstantin Belousov <kib@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject:   svn commit: r364527 - in head/sys: amd64/amd64 amd64/include amd64/linux amd64/vmm/amd amd64/vmm/intel cddl/dev/dtrace/amd64
Message-ID:  <202008232019.07NKJ53S016121@repo.freebsd.org>

Author: kib
Date: Sun Aug 23 20:19:04 2020
New Revision: 364527
URL: https://svnweb.freebsd.org/changeset/base/364527

Log:
  amd64 pmap: LA57 AKA 5-level paging
  
  Since LA57 was moved into the main SDM document with revision 072, it
  seems that we should have support for it, and silicon is coming.
  
  This patch makes pmap support both LA48 and LA57 hardware.  The
  selection of the page table depth is done at startup; the kernel
  always receives control from the loader with 4-level paging.  It is
  not clear how the UEFI spec will adopt LA57; for instance, it could
  eventually hand out control in LA57 mode.
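
  The selection is gated on a CPUID feature bit and the vm.pmap.la57
  loader tunable, as in pmap_bootstrap_la57() in the patch below;
  condensed:

  	if ((cpu_stdext_feature2 & CPUID_STDEXT2_LA57) == 0)
  		return;			/* CPU lacks 5-level paging */
  	if (!TUNABLE_INT_FETCH("vm.pmap.la57", &la57))
  		la57 = 1;		/* default to enabled when supported */
  	if (!la57)
  		return;			/* administrator opted out */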
  
  Switching from LA48 to LA57 requires turning off long mode, setting
  LA57 in CR4, and then re-entering long mode.  This is somewhat
  delicate and is done in pmap_bootstrap_la57().  AP startup in LA57
  mode is much easier: we only need to set a bit in CR4 and load the
  right value into CR3.
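
  The BSP switch cannot be written as plain long-mode C, since CR0.PG
  cannot be cleared from 64-bit mode; that is what the identity-mapped
  la57_trampoline in locore.S below is for.  The AP side can be
  sketched with the cpufunc.h register accessors, purely as a schematic
  for the assembly in mpboot.S:

  	/* Before enabling paging: request 5-level translation. */
  	load_cr4(rcr4() | CR4_PAE | CR4_LA57);
  	/* After the jump into long mode: switch to the real kernel tables. */
  	load_cr3(la57 ? KPML5phys : KPML4phys);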
  
  I decided not to change the kernel map for now.  A single PML5 entry
  is created that points to the existing kernel_pml4 (KPML4phys) page,
  plus a pml5 entry that creates our recursive mapping for
  vtopte()/vtopde().  This decision is motivated by the fact that we
  cannot overcommit KVA, so the larger space there is unusable until
  machines start providing wider physical memory addressing.  Another
  reason is that I do not want to break our fragile autotuning, so the
  KVA expansion is not included in this first step.  A nice side effect
  is that minidumps remain compatible.
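
  Concretely, the kernel-side PML5 ends up with just two interesting
  slots (condensed from pmap_bootstrap_la57()/pmap_pinit_pml5() in the
  patch below):

  	/* Alias the whole existing kernel PML4 under one PML5 slot. */
  	v_pml5[pmap_pml5e_index(UPT_MAX_ADDRESS)] = KPML4phys |
  	    X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M | pg_g;
  	/* Self-referential slot, backing vtopte()/vtopde(). */
  	v_pml5[PML5PML5I] = KPML5phys | X86_PG_RW | X86_PG_V | pg_nx;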
  
  On the other hand, a (very) large address space is definitely and
  immediately useful for some userspace applications.
  
  For userspace, the numbering of pte entries (or page table pages) is
  always done for 5-level structures, even if we operate in 4-level
  mode.  The pmap_is_la57() function is added to report the mode of the
  specified pmap.  It exists not to allow simultaneous 4-/5-level
  operation (which the hardware does not allow), but to accommodate
  EPT, which has separate level control and in principle might not
  allow 5-level EPT even though x86 paging supports it.  Anyway, it
  does not seem critical to have 5-level EPT support now.
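
  For example, _pmap_unwire_ptp() (see the patch below) classifies a
  page table page purely by its pindex, using the same 5-level-sized
  ranges in both modes; roughly:

  	if (m->pindex >= NUPDE + NUPDPE + NUPML4E) {
  		/* a PML4 page; only present when pmap_is_la57(pmap) */
  	} else if (m->pindex >= NUPDE + NUPDPE) {
  		/* a PDP page */
  	} else if (m->pindex >= NUPDE) {
  		/* a PD page */
  	} else {
  		/* a PT page */
  	}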
  
  Tested by:	pho (LA48 hardware)
  Reviewed by:	alc
  Sponsored by:	The FreeBSD Foundation
  Differential revision:	https://reviews.freebsd.org/D25273

Modified:
  head/sys/amd64/amd64/elf_machdep.c
  head/sys/amd64/amd64/genassym.c
  head/sys/amd64/amd64/locore.S
  head/sys/amd64/amd64/mp_machdep.c
  head/sys/amd64/amd64/mpboot.S
  head/sys/amd64/amd64/pmap.c
  head/sys/amd64/include/md_var.h
  head/sys/amd64/include/param.h
  head/sys/amd64/include/pmap.h
  head/sys/amd64/include/proc.h
  head/sys/amd64/include/vmparam.h
  head/sys/amd64/linux/linux_sysvec.c
  head/sys/amd64/vmm/amd/svm.c
  head/sys/amd64/vmm/intel/vmx.c
  head/sys/cddl/dev/dtrace/amd64/dtrace_subr.c

Modified: head/sys/amd64/amd64/elf_machdep.c
==============================================================================
--- head/sys/amd64/amd64/elf_machdep.c	Sun Aug 23 20:14:57 2020	(r364526)
+++ head/sys/amd64/amd64/elf_machdep.c	Sun Aug 23 20:19:04 2020	(r364527)
@@ -49,7 +49,7 @@ __FBSDID("$FreeBSD$");
 #include <machine/fpu.h>
 #include <machine/md_var.h>
 
-struct sysentvec elf64_freebsd_sysvec = {
+struct sysentvec elf64_freebsd_sysvec_la48 = {
 	.sv_size	= SYS_MAXSYSCALL,
 	.sv_table	= sysent,
 	.sv_errsize	= 0,
@@ -64,9 +64,9 @@ struct sysentvec elf64_freebsd_sysvec = {
 	.sv_imgact_try	= NULL,
 	.sv_minsigstksz	= MINSIGSTKSZ,
 	.sv_minuser	= VM_MIN_ADDRESS,
-	.sv_maxuser	= VM_MAXUSER_ADDRESS,
-	.sv_usrstack	= USRSTACK,
-	.sv_psstrings	= PS_STRINGS,
+	.sv_maxuser	= VM_MAXUSER_ADDRESS_LA48,
+	.sv_usrstack	= USRSTACK_LA48,
+	.sv_psstrings	= PS_STRINGS_LA48,
 	.sv_stackprot	= VM_PROT_ALL,
 	.sv_copyout_auxargs = __elfN(freebsd_copyout_auxargs),
 	.sv_copyout_strings	= exec_copyout_strings,
@@ -78,15 +78,65 @@ struct sysentvec elf64_freebsd_sysvec = {
 	.sv_set_syscall_retval = cpu_set_syscall_retval,
 	.sv_fetch_syscall_args = cpu_fetch_syscall_args,
 	.sv_syscallnames = syscallnames,
-	.sv_shared_page_base = SHAREDPAGE,
+	.sv_shared_page_base = SHAREDPAGE_LA48,
 	.sv_shared_page_len = PAGE_SIZE,
 	.sv_schedtail	= NULL,
 	.sv_thread_detach = NULL,
 	.sv_trap	= NULL,
 	.sv_stackgap	= elf64_stackgap,
 };
-INIT_SYSENTVEC(elf64_sysvec, &elf64_freebsd_sysvec);
 
+struct sysentvec elf64_freebsd_sysvec_la57 = {
+	.sv_size	= SYS_MAXSYSCALL,
+	.sv_table	= sysent,
+	.sv_errsize	= 0,
+	.sv_errtbl	= NULL,
+	.sv_transtrap	= NULL,
+	.sv_fixup	= __elfN(freebsd_fixup),
+	.sv_sendsig	= sendsig,
+	.sv_sigcode	= sigcode,
+	.sv_szsigcode	= &szsigcode,
+	.sv_name	= "FreeBSD ELF64",
+	.sv_coredump	= __elfN(coredump),
+	.sv_imgact_try	= NULL,
+	.sv_minsigstksz	= MINSIGSTKSZ,
+	.sv_minuser	= VM_MIN_ADDRESS,
+	.sv_maxuser	= VM_MAXUSER_ADDRESS_LA57,
+	.sv_usrstack	= USRSTACK_LA57,
+	.sv_psstrings	= PS_STRINGS_LA57,
+	.sv_stackprot	= VM_PROT_ALL,
+	.sv_copyout_auxargs = __elfN(freebsd_copyout_auxargs),
+	.sv_copyout_strings	= exec_copyout_strings,
+	.sv_setregs	= exec_setregs,
+	.sv_fixlimit	= NULL,
+	.sv_maxssiz	= NULL,
+	.sv_flags	= SV_ABI_FREEBSD | SV_ASLR | SV_LP64 | SV_SHP |
+			    SV_TIMEKEEP,
+	.sv_set_syscall_retval = cpu_set_syscall_retval,
+	.sv_fetch_syscall_args = cpu_fetch_syscall_args,
+	.sv_syscallnames = syscallnames,
+	.sv_shared_page_base = SHAREDPAGE_LA57,
+	.sv_shared_page_len = PAGE_SIZE,
+	.sv_schedtail	= NULL,
+	.sv_thread_detach = NULL,
+	.sv_trap	= NULL,
+	.sv_stackgap	= elf64_stackgap,
+};
+
+static void
+amd64_init_sysvecs(void *arg)
+{
+	amd64_lower_shared_page(&elf64_freebsd_sysvec_la48);
+	if (la57) {
+		exec_sysvec_init(&elf64_freebsd_sysvec_la57);
+		exec_sysvec_init_secondary(&elf64_freebsd_sysvec_la57,
+		    &elf64_freebsd_sysvec_la48);
+	} else {
+		exec_sysvec_init(&elf64_freebsd_sysvec_la48);
+	}
+}
+SYSINIT(elf64_sysvec, SI_SUB_EXEC, SI_ORDER_ANY, amd64_init_sysvecs, NULL);
+
 void
 amd64_lower_shared_page(struct sysentvec *sv)
 {
@@ -98,29 +148,57 @@ amd64_lower_shared_page(struct sysentvec *sv)
 	}
 }
 
-/*
- * Do this fixup before INIT_SYSENTVEC (SI_ORDER_ANY) because the latter
- * uses the value of sv_shared_page_base.
- */
-SYSINIT(elf64_sysvec_fixup, SI_SUB_EXEC, SI_ORDER_FIRST,
-	(sysinit_cfunc_t) amd64_lower_shared_page,
-	&elf64_freebsd_sysvec);
+static boolean_t
+freebsd_brand_info_la57_img_compat(struct image_params *imgp,
+    int32_t *osrel __unused, uint32_t *fctl0)
+{
+	if ((imgp->proc->p_md.md_flags & P_MD_LA57) != 0)
+		return (TRUE);
+	if (fctl0 == NULL || (*fctl0 & NT_FREEBSD_FCTL_LA48) != 0)
+		return (FALSE);
+	if ((imgp->proc->p_md.md_flags & P_MD_LA48) != 0)
+		return (FALSE);
+	return (TRUE);
+}
 
-static Elf64_Brandinfo freebsd_brand_info = {
+static Elf64_Brandinfo freebsd_brand_info_la48 = {
 	.brand		= ELFOSABI_FREEBSD,
 	.machine	= EM_X86_64,
 	.compat_3_brand	= "FreeBSD",
 	.emul_path	= NULL,
 	.interp_path	= "/libexec/ld-elf.so.1",
-	.sysvec		= &elf64_freebsd_sysvec,
+	.sysvec		= &elf64_freebsd_sysvec_la48,
 	.interp_newpath	= NULL,
 	.brand_note	= &elf64_freebsd_brandnote,
-	.flags		= BI_CAN_EXEC_DYN | BI_BRAND_NOTE
+	.flags		= BI_CAN_EXEC_DYN | BI_BRAND_NOTE,
 };
 
+static Elf64_Brandinfo freebsd_brand_info_la57 = {
+	.brand		= ELFOSABI_FREEBSD,
+	.machine	= EM_X86_64,
+	.compat_3_brand	= "FreeBSD",
+	.emul_path	= NULL,
+	.interp_path	= "/libexec/ld-elf.so.1",
+	.sysvec		= &elf64_freebsd_sysvec_la57,
+	.interp_newpath	= NULL,
+	.brand_note	= &elf64_freebsd_brandnote,
+	.flags		= BI_CAN_EXEC_DYN | BI_BRAND_NOTE,
+	.header_supported = freebsd_brand_info_la57_img_compat,
+};
+
+static void
+sysinit_register_elf64_brand_entries(void *arg __unused)
+{
+	/*
+	 * _57 must go first so it can either claim the image or hand
+	 * it to _48.
+	 */
+	if (la57)
+		elf64_insert_brand_entry(&freebsd_brand_info_la57);
+	elf64_insert_brand_entry(&freebsd_brand_info_la48);
+}
 SYSINIT(elf64, SI_SUB_EXEC, SI_ORDER_FIRST,
-	(sysinit_cfunc_t) elf64_insert_brand_entry,
-	&freebsd_brand_info);
+    sysinit_register_elf64_brand_entries, NULL);
 
 static Elf64_Brandinfo freebsd_brand_oinfo = {
 	.brand		= ELFOSABI_FREEBSD,
@@ -128,15 +206,14 @@ static Elf64_Brandinfo freebsd_brand_oinfo = {
 	.compat_3_brand	= "FreeBSD",
 	.emul_path	= NULL,
 	.interp_path	= "/usr/libexec/ld-elf.so.1",
-	.sysvec		= &elf64_freebsd_sysvec,
+	.sysvec		= &elf64_freebsd_sysvec_la48,
 	.interp_newpath	= NULL,
 	.brand_note	= &elf64_freebsd_brandnote,
 	.flags		= BI_CAN_EXEC_DYN | BI_BRAND_NOTE
 };
 
 SYSINIT(oelf64, SI_SUB_EXEC, SI_ORDER_ANY,
-	(sysinit_cfunc_t) elf64_insert_brand_entry,
-	&freebsd_brand_oinfo);
+    (sysinit_cfunc_t)elf64_insert_brand_entry, &freebsd_brand_oinfo);
 
 static Elf64_Brandinfo kfreebsd_brand_info = {
 	.brand		= ELFOSABI_FREEBSD,
@@ -144,15 +221,14 @@ static Elf64_Brandinfo kfreebsd_brand_info = {
 	.compat_3_brand	= "FreeBSD",
 	.emul_path	= NULL,
 	.interp_path	= "/lib/ld-kfreebsd-x86-64.so.1",
-	.sysvec		= &elf64_freebsd_sysvec,
+	.sysvec		= &elf64_freebsd_sysvec_la48,
 	.interp_newpath	= NULL,
 	.brand_note	= &elf64_kfreebsd_brandnote,
 	.flags		= BI_CAN_EXEC_DYN | BI_BRAND_NOTE_MANDATORY
 };
 
 SYSINIT(kelf64, SI_SUB_EXEC, SI_ORDER_ANY,
-	(sysinit_cfunc_t) elf64_insert_brand_entry,
-	&kfreebsd_brand_info);
+    (sysinit_cfunc_t)elf64_insert_brand_entry, &kfreebsd_brand_info);
 
 void
 elf64_dump_thread(struct thread *td, void *dst, size_t *off)

Modified: head/sys/amd64/amd64/genassym.c
==============================================================================
--- head/sys/amd64/amd64/genassym.c	Sun Aug 23 20:14:57 2020	(r364526)
+++ head/sys/amd64/amd64/genassym.c	Sun Aug 23 20:19:04 2020	(r364527)
@@ -99,11 +99,10 @@ ASSYM(TDP_KTHREAD, TDP_KTHREAD);
 ASSYM(PAGE_SIZE, PAGE_SIZE);
 ASSYM(NPTEPG, NPTEPG);
 ASSYM(NPDEPG, NPDEPG);
-ASSYM(addr_PTmap, addr_PTmap);
-ASSYM(addr_PDmap, addr_PDmap);
-ASSYM(addr_PDPmap, addr_PDPmap);
-ASSYM(addr_PML4map, addr_PML4map);
-ASSYM(addr_PML4pml4e, addr_PML4pml4e);
+ASSYM(addr_P4Tmap, addr_P4Tmap);
+ASSYM(addr_P4Dmap, addr_P4Dmap);
+ASSYM(addr_P5Tmap, addr_P5Tmap);
+ASSYM(addr_P5Dmap, addr_P5Dmap);
 ASSYM(PDESIZE, sizeof(pd_entry_t));
 ASSYM(PTESIZE, sizeof(pt_entry_t));
 ASSYM(PAGE_SHIFT, PAGE_SHIFT);

Modified: head/sys/amd64/amd64/locore.S
==============================================================================
--- head/sys/amd64/amd64/locore.S	Sun Aug 23 20:14:57 2020	(r364526)
+++ head/sys/amd64/amd64/locore.S	Sun Aug 23 20:19:04 2020	(r364527)
@@ -36,13 +36,8 @@
 /*
  * Compiled KERNBASE location
  */
-	.globl	kernbase,loc_PTmap,loc_PDmap,loc_PDPmap,loc_PML4map,loc_PML4pml4e,dmapbase,dmapend
+	.globl	kernbase, loc_PTmap, loc_PDmap, loc_PDPmap, dmapbase, dmapend
 	.set	kernbase,KERNBASE
-	.set	loc_PTmap,addr_PTmap
-	.set	loc_PDmap,addr_PDmap
-	.set	loc_PDPmap,addr_PDPmap
-	.set	loc_PML4map,addr_PML4map
-	.set	loc_PML4pml4e,addr_PML4pml4e
 	.set	dmapbase,DMAP_MIN_ADDRESS
 	.set	dmapend,DMAP_MAX_ADDRESS
 
@@ -81,6 +76,62 @@ NON_GPROF_ENTRY(btext)
 	call	mi_startup		/* autoconfiguration, mountroot etc */
 0:	hlt
 	jmp	0b
+
+/* la57_trampoline(%rdi pml5) */
+NON_GPROF_ENTRY(la57_trampoline)
+	movq	%rsp,%r11
+	movq	%rbx,%r10
+	leaq	la57_trampoline_end(%rip),%rsp
+
+	movq	%cr0,%rdx
+	lgdtq	la57_trampoline_gdt_desc(%rip)
+
+	pushq	$(2<<3)
+	leaq	l1(%rip),%rax
+	leaq	l2(%rip),%rbx
+
+	pushq	%rax
+	lretq
+	.code32
+
+l1:	movl	$(3<<3),%eax
+	movl	%eax,%ss
+
+	movl	%edx,%eax
+	andl	$~CR0_PG,%eax
+	movl	%eax,%cr0
+
+	movl	%cr4,%eax
+	orl	$CR4_LA57,%eax
+	movl	%eax,%cr4
+
+	movl	%edi,%cr3
+	movl	%edx,%cr0
+
+	pushl	$(1<<3)
+	pushl	%ebx
+	lretl
+	.code64
+
+l2:	movq	%r11,%rsp
+	movq	%r10,%rbx
+	retq
+	.p2align 4,0
+NON_GPROF_ENTRY(la57_trampoline_gdt_desc)
+	.word	la57_trampoline_end - la57_trampoline_gdt
+	.long	0		/* filled by pmap_bootstrap_la57 */
+	.p2align 4,0
+NON_GPROF_ENTRY(la57_trampoline_gdt)
+	.long	0x00000000	/* null desc */
+	.long	0x00000000
+	.long	0x00000000	/* 64bit code */
+	.long	0x00209800
+	.long	0x0000ffff	/* 32bit code */
+	.long	0x00cf9b00
+	.long	0x0000ffff	/* universal data */
+	.long	0x00cf9300
+	.dcb.l	16,0
+NON_GPROF_ENTRY(la57_trampoline_end)
 
 	.bss
 	ALIGN_DATA			/* just to be sure */

Modified: head/sys/amd64/amd64/mp_machdep.c
==============================================================================
--- head/sys/amd64/amd64/mp_machdep.c	Sun Aug 23 20:14:57 2020	(r364526)
+++ head/sys/amd64/amd64/mp_machdep.c	Sun Aug 23 20:19:04 2020	(r364527)
@@ -96,7 +96,7 @@ __FBSDID("$FreeBSD$");
 
 #define GiB(v)			(v ## ULL << 30)
 
-#define	AP_BOOTPT_SZ		(PAGE_SIZE * 3)
+#define	AP_BOOTPT_SZ		(PAGE_SIZE * 4)
 
 /* Temporary variables for init_secondary()  */
 char *doublefault_stack;
@@ -104,6 +104,8 @@ char *mce_stack;
 char *nmi_stack;
 char *dbg_stack;
 
+extern u_int mptramp_la57;
+
 /*
  * Local data and functions.
  */
@@ -240,6 +242,8 @@ cpu_mp_start(void)
 
 	assign_cpu_ids();
 
+	mptramp_la57 = la57;
+
 	/* Start each Application Processor */
 	init_ops.start_all_aps();
 
@@ -395,9 +399,9 @@ mp_realloc_pcpu(int cpuid, int domain)
 int
 native_start_all_aps(void)
 {
-	u_int64_t *pt4, *pt3, *pt2;
+	u_int64_t *pt5, *pt4, *pt3, *pt2;
 	u_int32_t mpbioswarmvec;
-	int apic_id, cpu, domain, i;
+	int apic_id, cpu, domain, i, xo;
 	u_char mpbiosreason;
 
 	mtx_init(&ap_boot_mtx, "ap boot", NULL, MTX_SPIN);
@@ -406,18 +410,38 @@ native_start_all_aps(void)
 	bcopy(mptramp_start, (void *)PHYS_TO_DMAP(boot_address), bootMP_size);
 
 	/* Locate the page tables, they'll be below the trampoline */
-	pt4 = (uint64_t *)PHYS_TO_DMAP(mptramp_pagetables);
+	if (la57) {
+		pt5 = (uint64_t *)PHYS_TO_DMAP(mptramp_pagetables);
+		xo = 1;
+	} else {
+		xo = 0;
+	}
+	pt4 = (uint64_t *)PHYS_TO_DMAP(mptramp_pagetables + xo * PAGE_SIZE);
 	pt3 = pt4 + (PAGE_SIZE) / sizeof(u_int64_t);
 	pt2 = pt3 + (PAGE_SIZE) / sizeof(u_int64_t);
 
 	/* Create the initial 1GB replicated page tables */
 	for (i = 0; i < 512; i++) {
-		/* Each slot of the level 4 pages points to the same level 3 page */
-		pt4[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables + PAGE_SIZE);
+		if (la57) {
+			pt5[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables +
+			    PAGE_SIZE);
+			pt5[i] |= PG_V | PG_RW | PG_U;
+		}
+
+		/*
+		 * Each slot of the level 4 pages points to the same
+		 * level 3 page.
+		 */
+		pt4[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables +
+		    (xo + 1) * PAGE_SIZE);
 		pt4[i] |= PG_V | PG_RW | PG_U;
 
-		/* Each slot of the level 3 pages points to the same level 2 page */
-		pt3[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables + (2 * PAGE_SIZE));
+		/*
+		 * Each slot of the level 3 pages points to the same
+		 * level 2 page.
+		 */
+		pt3[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables +
+		    ((xo + 2) * PAGE_SIZE));
 		pt3[i] |= PG_V | PG_RW | PG_U;
 
 		/* The level 2 page slots are mapped with 2MB pages for 1GB. */

Modified: head/sys/amd64/amd64/mpboot.S
==============================================================================
--- head/sys/amd64/amd64/mpboot.S	Sun Aug 23 20:14:57 2020	(r364526)
+++ head/sys/amd64/amd64/mpboot.S	Sun Aug 23 20:19:04 2020	(r364527)
@@ -90,10 +90,16 @@ protmode:
 	mov	$bootdata-gdt, %eax
 	mov	%ax, %ds
 
-	/* Turn on the PAE bit for when paging is enabled */
+	/*
+	 * Turn on the PAE bit and optionally the LA57 bit for when paging
+	 * is later enabled.
+	 */
 	mov	%cr4, %eax
 	orl	$CR4_PAE, %eax
-	mov	%eax, %cr4
+	cmpb	$0, mptramp_la57-mptramp_start(%ebx)
+	je	1f
+	orl	$CR4_LA57, %eax
+1:	mov	%eax, %cr4
 
 	/*
 	 * Enable EFER.LME so that we get long mode when all the prereqs are
@@ -132,9 +138,9 @@ protmode:
 	/*
 	 * At this point paging is enabled, and we are in "compatibility" mode.
 	 * We do another far jump to reload %cs with the 64 bit selector.
-	 * %cr3 points to a 4-level page table page.
+	 * %cr3 points to a 4- or 5-level page table.
 	 * We cannot yet jump all the way to the kernel because we can only
-	 * specify a 32 bit linear address.  So, yet another trampoline.
+	 * specify a 32 bit linear address.  So, we use yet another trampoline.
 	 *
 	 * The following instruction is:
 	 * ljmp $kernelcode-gdt, $tramp_64-mptramp_start
@@ -209,6 +215,11 @@ gdtend:
 mptramp_pagetables:
 	.long	0
 
+	/* 5-level paging ? */
+	.globl	mptramp_la57
+mptramp_la57:
+	.long	0
+
 	/*
 	 * The pseudo descriptor for lgdt to use.
 	 */
@@ -251,8 +262,12 @@ entry_64:
 	 * Load a real %cr3 that has all the direct map stuff and switches
 	 * off the 1GB replicated mirror.  Load a stack pointer and jump
 	 * into AP startup code in C.
-	 */
+	*/
+	cmpl	$0, la57
+	jne	2f
 	movq	KPML4phys, %rax
-	movq	%rax, %cr3
+	jmp	3f
+2:	movq	KPML5phys, %rax
+3:	movq	%rax, %cr3
 	movq	bootSTK, %rsp
 	jmp	init_secondary

Modified: head/sys/amd64/amd64/pmap.c
==============================================================================
--- head/sys/amd64/amd64/pmap.c	Sun Aug 23 20:14:57 2020	(r364526)
+++ head/sys/amd64/amd64/pmap.c	Sun Aug 23 20:19:04 2020	(r364527)
@@ -398,6 +398,19 @@ static int pg_ps_enabled = 1;
 SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
     &pg_ps_enabled, 0, "Are large page mappings enabled?");
 
+int __read_frequently la57 = 0;
+SYSCTL_INT(_vm_pmap, OID_AUTO, la57, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
+    &la57, 0,
+    "5-level paging for host is enabled");
+
+static bool
+pmap_is_la57(pmap_t pmap)
+{
+	if (pmap->pm_type == PT_X86)
+		return (la57);
+	return (false);		/* XXXKIB handle EPT */
+}
+
 #define	PAT_INDEX_SIZE	8
 static int pat_index[PAT_INDEX_SIZE];	/* cache mode to PAT index conversion */
 
@@ -405,7 +418,10 @@ static u_int64_t	KPTphys;	/* phys addr of kernel level
 static u_int64_t	KPDphys;	/* phys addr of kernel level 2 */
 u_int64_t		KPDPphys;	/* phys addr of kernel level 3 */
 u_int64_t		KPML4phys;	/* phys addr of kernel level 4 */
+u_int64_t		KPML5phys;	/* phys addr of kernel level 5,
+					   if supported */
 
+static pml4_entry_t	*kernel_pml4;
 static u_int64_t	DMPDphys;	/* phys addr of direct mapped level 2 */
 static u_int64_t	DMPDPphys;	/* phys addr of direct mapped level 3 */
 static int		ndmpdpphys;	/* number of DMPDPphys pages */
@@ -1257,7 +1273,7 @@ static void pmap_update_pde(pmap_t pmap, vm_offset_t v
 static void pmap_update_pde_invalidate(pmap_t, vm_offset_t va, pd_entry_t pde);
 
 static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex,
-		struct rwlock **lockp);
+		struct rwlock **lockp, vm_offset_t va);
 static pd_entry_t *pmap_alloc_pde(pmap_t pmap, vm_offset_t va, vm_page_t *pdpgp,
 		struct rwlock **lockp);
 static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va,
@@ -1271,22 +1287,87 @@ static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry
 /* Inline functions */
 /********************/
 
-/* Return a non-clipped PD index for a given VA */
+/*
+ * Return non-clipped indexes for a given VA; these are the page table
+ * page indexes at the corresponding level.
+ */
 static __inline vm_pindex_t
 pmap_pde_pindex(vm_offset_t va)
 {
 	return (va >> PDRSHIFT);
 }
 
+static __inline vm_pindex_t
+pmap_pdpe_pindex(vm_offset_t va)
+{
+	return (NUPDE + (va >> PDPSHIFT));
+}
 
+static __inline vm_pindex_t
+pmap_pml4e_pindex(vm_offset_t va)
+{
+	return (NUPDE + NUPDPE + (va >> PML4SHIFT));
+}
+
+static __inline vm_pindex_t
+pmap_pml5e_pindex(vm_offset_t va)
+{
+	return (NUPDE + NUPDPE + NUPML4E + (va >> PML5SHIFT));
+}
+
+static __inline pml4_entry_t *
+pmap_pml5e(pmap_t pmap, vm_offset_t va)
+{
+
+	MPASS(pmap_is_la57(pmap));
+	return (&pmap->pm_pmltop[pmap_pml5e_index(va)]);
+}
+
+static __inline pml4_entry_t *
+pmap_pml5e_u(pmap_t pmap, vm_offset_t va)
+{
+
+	MPASS(pmap_is_la57(pmap));
+	return (&pmap->pm_pmltopu[pmap_pml5e_index(va)]);
+}
+
+static __inline pml4_entry_t *
+pmap_pml5e_to_pml4e(pml5_entry_t *pml5e, vm_offset_t va)
+{
+	pml4_entry_t *pml4e;
+
+	/* XXX MPASS(pmap_is_la57(pmap); */
+	pml4e = (pml4_entry_t *)PHYS_TO_DMAP(*pml5e & PG_FRAME);
+	return (&pml4e[pmap_pml4e_index(va)]);
+}
+
 /* Return a pointer to the PML4 slot that corresponds to a VA */
 static __inline pml4_entry_t *
 pmap_pml4e(pmap_t pmap, vm_offset_t va)
 {
+	pml5_entry_t *pml5e;
+	pml4_entry_t *pml4e;
+	pt_entry_t PG_V;
 
-	return (&pmap->pm_pml4[pmap_pml4e_index(va)]);
+	if (pmap_is_la57(pmap)) {
+		pml5e = pmap_pml5e(pmap, va);
+		PG_V = pmap_valid_bit(pmap);
+		if ((*pml5e & PG_V) == 0)
+			return (NULL);
+		pml4e = (pml4_entry_t *)PHYS_TO_DMAP(*pml5e & PG_FRAME);
+	} else {
+		pml4e = pmap->pm_pmltop;
+	}
+	return (&pml4e[pmap_pml4e_index(va)]);
 }
 
+static __inline pml4_entry_t *
+pmap_pml4e_u(pmap_t pmap, vm_offset_t va)
+{
+	MPASS(!pmap_is_la57(pmap));
+	return (&pmap->pm_pmltopu[pmap_pml4e_index(va)]);
+}
+
 /* Return a pointer to the PDP slot that corresponds to a VA */
 static __inline pdp_entry_t *
 pmap_pml4e_to_pdpe(pml4_entry_t *pml4e, vm_offset_t va)
@@ -1306,7 +1387,7 @@ pmap_pdpe(pmap_t pmap, vm_offset_t va)
 
 	PG_V = pmap_valid_bit(pmap);
 	pml4e = pmap_pml4e(pmap, va);
-	if ((*pml4e & PG_V) == 0)
+	if (pml4e == NULL || (*pml4e & PG_V) == 0)
 		return (NULL);
 	return (pmap_pml4e_to_pdpe(pml4e, va));
 }
@@ -1387,21 +1468,37 @@ pmap_resident_count_dec(pmap_t pmap, int count)
 PMAP_INLINE pt_entry_t *
 vtopte(vm_offset_t va)
 {
-	u_int64_t mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);
+	u_int64_t mask;
 
 	KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopte on a uva/gpa 0x%0lx", va));
 
-	return (PTmap + ((va >> PAGE_SHIFT) & mask));
+	if (la57) {
+		mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT +
+		    NPML4EPGSHIFT + NPML5EPGSHIFT)) - 1);
+		return (P5Tmap + ((va >> PAGE_SHIFT) & mask));
+	} else {
+		mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT +
+		    NPML4EPGSHIFT)) - 1);
+		return (P4Tmap + ((va >> PAGE_SHIFT) & mask));
+	}
 }
 
 static __inline pd_entry_t *
 vtopde(vm_offset_t va)
 {
-	u_int64_t mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);
+	u_int64_t mask;
 
 	KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopde on a uva/gpa 0x%0lx", va));
 
-	return (PDmap + ((va >> PDRSHIFT) & mask));
+	if (la57) {
+		mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT +
+		    NPML4EPGSHIFT + NPML5EPGSHIFT)) - 1);
+		return (P5Dmap + ((va >> PDRSHIFT) & mask));
+	} else {
+		mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT +
+		    NPML4EPGSHIFT)) - 1);
+		return (P4Dmap + ((va >> PDRSHIFT) & mask));
+	}
 }
 
 static u_int64_t
@@ -1658,6 +1755,8 @@ create_pagetables(vm_paddr_t *firstaddr)
 		p4_p[KPML4BASE + i] = KPDPphys + ptoa(i);
 		p4_p[KPML4BASE + i] |= X86_PG_RW | X86_PG_V;
 	}
+
+	kernel_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys);
 }
 
 /*
@@ -1730,7 +1829,7 @@ pmap_bootstrap(vm_paddr_t *firstaddr)
 	 * later unmapped (using pmap_remove()) and freed.
 	 */
 	PMAP_LOCK_INIT(kernel_pmap);
-	kernel_pmap->pm_pml4 = (pdp_entry_t *)PHYS_TO_DMAP(KPML4phys);
+	kernel_pmap->pm_pmltop = kernel_pml4;
 	kernel_pmap->pm_cr3 = KPML4phys;
 	kernel_pmap->pm_ucr3 = PMAP_NO_CR3;
 	CPU_FILL(&kernel_pmap->pm_active);	/* don't allow deactivation */
@@ -1891,6 +1990,148 @@ pmap_init_pat(void)
 	load_cr4(cr4);
 }
 
+extern const char la57_trampoline[], la57_trampoline_gdt_desc[],
+    la57_trampoline_gdt[], la57_trampoline_end[];
+
+static void
+pmap_bootstrap_la57(void *arg __unused)
+{
+	char *v_code;
+	pml5_entry_t *v_pml5;
+	pml4_entry_t *v_pml4;
+	pdp_entry_t *v_pdp;
+	pd_entry_t *v_pd;
+	pt_entry_t *v_pt;
+	vm_page_t m_code, m_pml4, m_pdp, m_pd, m_pt, m_pml5;
+	void (*la57_tramp)(uint64_t pml5);
+	struct region_descriptor r_gdt;
+
+	if ((cpu_stdext_feature2 & CPUID_STDEXT2_LA57) == 0)
+		return;
+	if (!TUNABLE_INT_FETCH("vm.pmap.la57", &la57))
+		la57 = 1;
+	if (!la57)
+		return;
+
+	r_gdt.rd_limit = NGDT * sizeof(struct user_segment_descriptor) - 1;
+	r_gdt.rd_base = (long)__pcpu[0].pc_gdt;
+
+	m_code = vm_page_alloc_contig(NULL, 0,
+	    VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY | VM_ALLOC_ZERO | VM_ALLOC_NOOBJ,
+	    1, 0, (1ULL << 32), PAGE_SIZE, 0, VM_MEMATTR_DEFAULT);
+	if ((m_code->flags & PG_ZERO) == 0)
+		pmap_zero_page(m_code);
+	v_code = (char *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_code));
+	m_pml5 = vm_page_alloc_contig(NULL, 0,
+	    VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY | VM_ALLOC_ZERO | VM_ALLOC_NOOBJ,
+	    1, 0, (1ULL << 32), PAGE_SIZE, 0, VM_MEMATTR_DEFAULT);
+	if ((m_pml5->flags & PG_ZERO) == 0)
+		pmap_zero_page(m_pml5);
+	KPML5phys = VM_PAGE_TO_PHYS(m_pml5);
+	v_pml5 = (pml5_entry_t *)PHYS_TO_DMAP(KPML5phys);
+	m_pml4 = vm_page_alloc_contig(NULL, 0,
+	    VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY | VM_ALLOC_ZERO | VM_ALLOC_NOOBJ,
+	    1, 0, (1ULL << 32), PAGE_SIZE, 0, VM_MEMATTR_DEFAULT);
+	if ((m_pml4->flags & PG_ZERO) == 0)
+		pmap_zero_page(m_pml4);
+	v_pml4 = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pml4));
+	m_pdp = vm_page_alloc_contig(NULL, 0,
+	    VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY | VM_ALLOC_ZERO | VM_ALLOC_NOOBJ,
+	    1, 0, (1ULL << 32), PAGE_SIZE, 0, VM_MEMATTR_DEFAULT);
+	if ((m_pdp->flags & PG_ZERO) == 0)
+		pmap_zero_page(m_pdp);
+	v_pdp = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pdp));
+	m_pd = vm_page_alloc_contig(NULL, 0,
+	    VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY | VM_ALLOC_ZERO | VM_ALLOC_NOOBJ,
+	    1, 0, (1ULL << 32), PAGE_SIZE, 0, VM_MEMATTR_DEFAULT);
+	if ((m_pd->flags & PG_ZERO) == 0)
+		pmap_zero_page(m_pd);
+	v_pd = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pd));
+	m_pt = vm_page_alloc_contig(NULL, 0,
+	    VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY | VM_ALLOC_ZERO | VM_ALLOC_NOOBJ,
+	    1, 0, (1ULL << 32), PAGE_SIZE, 0, VM_MEMATTR_DEFAULT);
+	if ((m_pt->flags & PG_ZERO) == 0)
+		pmap_zero_page(m_pt);
+	v_pt = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pt));
+
+	/*
+	 * Map m_code 1:1, it appears below 4G in KVA due to physical
+	 * address being below 4G.  Since kernel KVA is in upper half,
+	 * the pml4e should be zero and free for temporary use.
+	 */
+	kernel_pmap->pm_pmltop[pmap_pml4e_index(VM_PAGE_TO_PHYS(m_code))] =
+	    VM_PAGE_TO_PHYS(m_pdp) | X86_PG_V | X86_PG_RW | X86_PG_A |
+	    X86_PG_M;
+	v_pdp[pmap_pdpe_index(VM_PAGE_TO_PHYS(m_code))] =
+	    VM_PAGE_TO_PHYS(m_pd) | X86_PG_V | X86_PG_RW | X86_PG_A |
+	    X86_PG_M;
+	v_pd[pmap_pde_index(VM_PAGE_TO_PHYS(m_code))] =
+	    VM_PAGE_TO_PHYS(m_pt) | X86_PG_V | X86_PG_RW | X86_PG_A |
+	    X86_PG_M;
+	v_pt[pmap_pte_index(VM_PAGE_TO_PHYS(m_code))] =
+	    VM_PAGE_TO_PHYS(m_code) | X86_PG_V | X86_PG_RW | X86_PG_A |
+	    X86_PG_M;
+
+	/*
+	 * Add pml5 entry at top of KVA pointing to existing pml4 table,
+	 * entering all existing kernel mappings into level 5 table.
+	 */
+	v_pml5[pmap_pml5e_index(UPT_MAX_ADDRESS)] = KPML4phys | X86_PG_V |
+	    X86_PG_RW | X86_PG_A | X86_PG_M | pg_g;
+
+	/*
+	 * Add pml5 entry for 1:1 trampoline mapping after LA57 is turned on.
+	 */
+	v_pml5[pmap_pml5e_index(VM_PAGE_TO_PHYS(m_code))] =
+	    VM_PAGE_TO_PHYS(m_pml4) | X86_PG_V | X86_PG_RW | X86_PG_A |
+	    X86_PG_M;
+	v_pml4[pmap_pml4e_index(VM_PAGE_TO_PHYS(m_code))] =
+	    VM_PAGE_TO_PHYS(m_pdp) | X86_PG_V | X86_PG_RW | X86_PG_A |
+	    X86_PG_M;
+
+	/*
+	 * Copy and call the 48->57 trampoline, hope we return there, alive.
+	 */
+	bcopy(la57_trampoline, v_code, la57_trampoline_end - la57_trampoline);
+	*(u_long *)(v_code + 2 + (la57_trampoline_gdt_desc - la57_trampoline)) =
+	    la57_trampoline_gdt - la57_trampoline + VM_PAGE_TO_PHYS(m_code);
+	la57_tramp = (void (*)(uint64_t))VM_PAGE_TO_PHYS(m_code);
+	la57_tramp(KPML5phys);
+
+	/*
+	 * The GDT was necessarily reset by the trampoline; switch back to our GDT.
+	 */
+	lgdt(&r_gdt);
+	wrmsr(MSR_GSBASE, (uint64_t)&__pcpu[0]);
+	load_ds(_udatasel);
+	load_es(_udatasel);
+	load_fs(_ufssel);
+	ssdtosyssd(&gdt_segs[GPROC0_SEL],
+	    (struct system_segment_descriptor *)&__pcpu[0].pc_gdt[GPROC0_SEL]);
+	ltr(GSEL(GPROC0_SEL, SEL_KPL));
+
+	/*
+	 * Now unmap the trampoline, and free the pages.
+	 * Clear pml5 entry used for 1:1 trampoline mapping.
+	 */
+	pte_clear(&v_pml5[pmap_pml5e_index(VM_PAGE_TO_PHYS(m_code))]);
+	invlpg((vm_offset_t)v_code);
+	vm_page_free(m_code);
+	vm_page_free(m_pdp);
+	vm_page_free(m_pd);
+	vm_page_free(m_pt);
+
+	/* 
+	 * Recursively map PML5 to itself in order to get PTmap and
+	 * PDmap.
+	 */
+	v_pml5[PML5PML5I] = KPML5phys | X86_PG_RW | X86_PG_V | pg_nx;
+
+	kernel_pmap->pm_cr3 = KPML5phys;
+	kernel_pmap->pm_pmltop = v_pml5;
+}
+SYSINIT(la57, SI_SUB_KMEM, SI_ORDER_ANY, pmap_bootstrap_la57, NULL);
+
 /*
  *	Initialize a vm_page's machine-dependent fields.
  */
@@ -2190,7 +2431,8 @@ pmap_init(void)
 		}
 		for (i = 0; i < lm_ents; i++) {
 			m = pmap_large_map_getptp_unlocked();
-			kernel_pmap->pm_pml4[LMSPML4I + i] = X86_PG_V |
+			/* XXXKIB la57 */
+			kernel_pml4[LMSPML4I + i] = X86_PG_V |
 			    X86_PG_RW | X86_PG_A | X86_PG_M | pg_nx |
 			    VM_PAGE_TO_PHYS(m);
 		}
@@ -3566,44 +3808,57 @@ pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t
 static void
 _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
 {
+	pml5_entry_t *pml5;
+	pml4_entry_t *pml4;
+	pdp_entry_t *pdp;
+	pd_entry_t *pd;
+	vm_page_t pdpg, pdppg, pml4pg;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+
 	/*
 	 * unmap the page table page
 	 */
-	if (m->pindex >= NUPDE + NUPDPE) {
+	if (m->pindex >= NUPDE + NUPDPE + NUPML4E) {
+		/* PML4 page */
+		MPASS(pmap_is_la57(pmap));
+		pml5 = pmap_pml5e(pmap, va);
+		*pml5 = 0;
+		if (pmap->pm_pmltopu != NULL && va <= VM_MAXUSER_ADDRESS) {
+			pml5 = pmap_pml5e_u(pmap, va);
+			*pml5 = 0;
+		}
+	} else if (m->pindex >= NUPDE + NUPDPE) {
 		/* PDP page */
-		pml4_entry_t *pml4;
 		pml4 = pmap_pml4e(pmap, va);
 		*pml4 = 0;
-		if (pmap->pm_pml4u != NULL && va <= VM_MAXUSER_ADDRESS) {
-			pml4 = &pmap->pm_pml4u[pmap_pml4e_index(va)];
+		if (!pmap_is_la57(pmap) && pmap->pm_pmltopu != NULL &&
+		    va <= VM_MAXUSER_ADDRESS) {
+			pml4 = pmap_pml4e_u(pmap, va);
 			*pml4 = 0;
 		}
 	} else if (m->pindex >= NUPDE) {
 		/* PD page */
-		pdp_entry_t *pdp;
 		pdp = pmap_pdpe(pmap, va);
 		*pdp = 0;
 	} else {
 		/* PTE page */
-		pd_entry_t *pd;
 		pd = pmap_pde(pmap, va);
 		*pd = 0;
 	}
 	pmap_resident_count_dec(pmap, 1);
 	if (m->pindex < NUPDE) {
 		/* We just released a PT, unhold the matching PD */
-		vm_page_t pdpg;
-
 		pdpg = PHYS_TO_VM_PAGE(*pmap_pdpe(pmap, va) & PG_FRAME);
 		pmap_unwire_ptp(pmap, va, pdpg, free);
 	} else if (m->pindex < NUPDE + NUPDPE) {
 		/* We just released a PD, unhold the matching PDP */
-		vm_page_t pdppg;
-
 		pdppg = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, va) & PG_FRAME);
 		pmap_unwire_ptp(pmap, va, pdppg, free);
+	} else if (m->pindex < NUPDE + NUPDPE + NUPML4E && pmap_is_la57(pmap)) {
+		/* We just released a PDP, unhold the matching PML4 */
+		pml4pg = PHYS_TO_VM_PAGE(*pmap_pml5e(pmap, va) & PG_FRAME);
+		pmap_unwire_ptp(pmap, va, pml4pg, free);
 	}
 
 	/* 
@@ -3659,9 +3914,9 @@ pmap_pinit0(pmap_t pmap)
 	int i;
 
 	PMAP_LOCK_INIT(pmap);
-	pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys);
-	pmap->pm_pml4u = NULL;
-	pmap->pm_cr3 = KPML4phys;
+	pmap->pm_pmltop = kernel_pmap->pm_pmltop;
+	pmap->pm_pmltopu = NULL;
+	pmap->pm_cr3 = kernel_pmap->pm_cr3;
 	/* hack to keep pmap_pti_pcid_invalidate() alive */
 	pmap->pm_ucr3 = PMAP_NO_CR3;
 	pmap->pm_root.rt_root = 0;
@@ -3714,20 +3969,61 @@ pmap_pinit_pml4(vm_page_t pml4pg)
 
 	/* install large map entries if configured */
 	for (i = 0; i < lm_ents; i++)
-		pm_pml4[LMSPML4I + i] = kernel_pmap->pm_pml4[LMSPML4I + i];
+		pm_pml4[LMSPML4I + i] = kernel_pmap->pm_pmltop[LMSPML4I + i];
 }
 
+void
+pmap_pinit_pml5(vm_page_t pml5pg)
+{
+	pml5_entry_t *pm_pml5;
+
+	pm_pml5 = (pml5_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml5pg));
+
+	/*
+	 * Add pml5 entry at top of KVA pointing to existing pml4 table,
+	 * entering all existing kernel mappings into level 5 table.
+	 */
+	pm_pml5[pmap_pml5e_index(UPT_MAX_ADDRESS)] = KPML4phys | X86_PG_V |
+	    X86_PG_RW | X86_PG_A | X86_PG_M | pg_g |
+	    pmap_cache_bits(kernel_pmap, VM_MEMATTR_DEFAULT, FALSE);
+
+	/* 
+	 * Install self-referential address mapping entry.
+	 */
+	pm_pml5[PML5PML5I] = VM_PAGE_TO_PHYS(pml5pg) |
+	    X86_PG_RW | X86_PG_V | X86_PG_M | X86_PG_A |
+	    pmap_cache_bits(kernel_pmap, VM_MEMATTR_DEFAULT, FALSE);
+}
+
 static void
-pmap_pinit_pml4_pti(vm_page_t pml4pg)
+pmap_pinit_pml4_pti(vm_page_t pml4pgu)
 {
-	pml4_entry_t *pm_pml4;
+	pml4_entry_t *pm_pml4u;
 	int i;
 
-	pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pg));
+	pm_pml4u = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pgu));
 	for (i = 0; i < NPML4EPG; i++)
-		pm_pml4[i] = pti_pml4[i];
+		pm_pml4u[i] = pti_pml4[i];
 }
 
+static void
+pmap_pinit_pml5_pti(vm_page_t pml5pgu)
+{
+	pml5_entry_t *pm_pml5u;
+
+	pm_pml5u = (pml5_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml5pgu));
+
+	/*
+	 * Add pml5 entry at top of KVA pointing to existing pml4 pti
+	 * table, entering all kernel mappings needed for usermode
+	 * into level 5 table.
+	 */
+	pm_pml5u[pmap_pml5e_index(UPT_MAX_ADDRESS)] =
+	    pmap_kextract((vm_offset_t)pti_pml4) |
+	    X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M | pg_g |
+	    pmap_cache_bits(kernel_pmap, VM_MEMATTR_DEFAULT, FALSE);
+}
+
 /*
  * Initialize a preallocated and zeroed pmap structure,
  * such as one in a vmspace structure.
@@ -3735,29 +4031,30 @@ pmap_pinit_pml4_pti(vm_page_t pml4pg)
 int
 pmap_pinit_type(pmap_t pmap, enum pmap_type pm_type, int flags)
 {
-	vm_page_t pml4pg, pml4pgu;
-	vm_paddr_t pml4phys;
+	vm_page_t pmltop_pg, pmltop_pgu;
+	vm_paddr_t pmltop_phys;
 	int i;
 
 	/*
 	 * allocate the page directory page
 	 */
-	pml4pg = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
+	pmltop_pg = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
 	    VM_ALLOC_WIRED | VM_ALLOC_ZERO | VM_ALLOC_WAITOK);
 
-	pml4phys = VM_PAGE_TO_PHYS(pml4pg);
-	pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(pml4phys);
+	pmltop_phys = VM_PAGE_TO_PHYS(pmltop_pg);
+	pmap->pm_pmltop = (pml5_entry_t *)PHYS_TO_DMAP(pmltop_phys);
+
 	CPU_FOREACH(i) {
 		pmap->pm_pcids[i].pm_pcid = PMAP_PCID_NONE;
 		pmap->pm_pcids[i].pm_gen = 0;
 	}
 	pmap->pm_cr3 = PMAP_NO_CR3;	/* initialize to an invalid value */
 	pmap->pm_ucr3 = PMAP_NO_CR3;
-	pmap->pm_pml4u = NULL;
+	pmap->pm_pmltopu = NULL;
 
 	pmap->pm_type = pm_type;
-	if ((pml4pg->flags & PG_ZERO) == 0)
-		pagezero(pmap->pm_pml4);
+	if ((pmltop_pg->flags & PG_ZERO) == 0)
+		pagezero(pmap->pm_pmltop);
 
 	/*
 	 * Do not install the host kernel mappings in the nested page
@@ -3766,15 +4063,21 @@ pmap_pinit_type(pmap_t pmap, enum pmap_type pm_type, i
 	 * Install minimal kernel mappings in PTI case.
 	 */
 	if (pm_type == PT_X86) {
-		pmap->pm_cr3 = pml4phys;
-		pmap_pinit_pml4(pml4pg);
+		pmap->pm_cr3 = pmltop_phys;
+		if (pmap_is_la57(pmap))
+			pmap_pinit_pml5(pmltop_pg);
+		else
+			pmap_pinit_pml4(pmltop_pg);
 		if ((curproc->p_md.md_flags & P_MD_KPTI) != 0) {

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***


