Date: Sun, 23 Aug 2020 20:19:05 +0000 (UTC)
From: Konstantin Belousov <kib@FreeBSD.org>
To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject: svn commit: r364527 - in head/sys: amd64/amd64 amd64/include amd64/linux amd64/vmm/amd amd64/vmm/intel cddl/dev/dtrace/amd64
Message-ID: <202008232019.07NKJ53S016121@repo.freebsd.org>
Author: kib
Date: Sun Aug 23 20:19:04 2020
New Revision: 364527
URL: https://svnweb.freebsd.org/changeset/base/364527

Log:
  amd64 pmap: LA57 AKA 5-level paging

  Since LA57 was moved to the main SDM document with revision 072, it
  seems that we should have support for it, and silicon is coming.

  This patch makes pmap support both LA48 and LA57 hardware.  The page
  table level is selected at startup; the kernel always receives control
  from the loader with 4-level paging.  It is not clear how the UEFI spec
  will adopt LA57; for instance, it could sometimes hand out control in
  LA57 mode.

  Switching from LA48 to LA57 requires turning off long mode, requesting
  LA57 in CR4, and then re-entering long mode.  This is somewhat delicate
  and is done in pmap_bootstrap_la57().  AP startup in LA57 mode is much
  easier: we only need to toggle a bit in CR4 and load the right value
  into CR3.

  I decided not to change the kernel map for now.  A single PML5 entry is
  created that points to the existing kernel_pml4 (KPML4phys) page, and a
  pml5 entry creates our recursive mapping for vtopte()/vtopde().  This
  decision is motivated by the fact that we cannot overcommit KVA, so the
  large space there is unusable until machines start providing wider
  physical memory addressing.  Another reason is that I do not want to
  break our fragile autotuning, so the KVA expansion is not included in
  this first step.  A nice side effect is that minidumps stay compatible.

  On the other hand, a (very) large address space is immediately useful
  for some userspace applications.

  For userspace, numbering of pte entries (or page table pages) is always
  done for 5-level structures even if we operate in 4-level mode.  The
  pmap_is_la57() function is added to report the mode of the specified
  pmap; this exists not to allow simultaneous 4-/5-level pmaps (which the
  hardware does not allow), but to accommodate EPT, which has separate
  level control and in principle might not allow 5-level EPT even though
  x86 paging supports it.  Anyway, 5-level EPT support does not seem
  critical to have now.
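  As a quick, standalone illustration of the 4-/5-level selection
  described above (this sketch is not part of the commit; the shift
  constants mirror the amd64 headers, and the la57 flag stands in for
  the kernel's boot-time setting):

/*
 * Standalone sketch, not from the commit: a single boot-time "la57"
 * flag selecting between 4- and 5-level index arithmetic.
 * PML4SHIFT/PML5SHIFT/NPML4EPGSHIFT mirror sys/amd64/include/pmap.h;
 * everything else here is illustrative only.
 */
#include <stdint.h>
#include <stdio.h>

#define	PML4SHIFT	39	/* LA48: top level translates bits 47..39 */
#define	PML5SHIFT	48	/* LA57: top level translates bits 56..48 */
#define	NPML4EPGSHIFT	9	/* 512 entries per page table page */

static int la57 = 1;		/* the kernel derives this from CPUID/tunable */

/* Index of the top-level entry that maps va in the current mode. */
static uint64_t
pmap_top_index(uint64_t va)
{

	if (la57)
		return ((va >> PML5SHIFT) & ((1ULL << NPML4EPGSHIFT) - 1));
	return ((va >> PML4SHIFT) & ((1ULL << NPML4EPGSHIFT) - 1));
}

int
main(void)
{
	uint64_t va = 0x00007fffffffe000ULL;	/* near the top of LA48 user VA */

	printf("top-level index for %#jx: %ju\n", (uintmax_t)va,
	    (uintmax_t)pmap_top_index(va));
	return (0);
}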
Tested by: pho (LA48 hardware) Reviewed by: alc Sponsored by: The FreeBSD Foundation Differential revision: https://reviews.freebsd.org/D25273 Modified: head/sys/amd64/amd64/elf_machdep.c head/sys/amd64/amd64/genassym.c head/sys/amd64/amd64/locore.S head/sys/amd64/amd64/mp_machdep.c head/sys/amd64/amd64/mpboot.S head/sys/amd64/amd64/pmap.c head/sys/amd64/include/md_var.h head/sys/amd64/include/param.h head/sys/amd64/include/pmap.h head/sys/amd64/include/proc.h head/sys/amd64/include/vmparam.h head/sys/amd64/linux/linux_sysvec.c head/sys/amd64/vmm/amd/svm.c head/sys/amd64/vmm/intel/vmx.c head/sys/cddl/dev/dtrace/amd64/dtrace_subr.c Modified: head/sys/amd64/amd64/elf_machdep.c ============================================================================== --- head/sys/amd64/amd64/elf_machdep.c Sun Aug 23 20:14:57 2020 (r364526) +++ head/sys/amd64/amd64/elf_machdep.c Sun Aug 23 20:19:04 2020 (r364527) @@ -49,7 +49,7 @@ __FBSDID("$FreeBSD$"); #include <machine/fpu.h> #include <machine/md_var.h> -struct sysentvec elf64_freebsd_sysvec = { +struct sysentvec elf64_freebsd_sysvec_la48 = { .sv_size = SYS_MAXSYSCALL, .sv_table = sysent, .sv_errsize = 0, @@ -64,9 +64,9 @@ struct sysentvec elf64_freebsd_sysvec = { .sv_imgact_try = NULL, .sv_minsigstksz = MINSIGSTKSZ, .sv_minuser = VM_MIN_ADDRESS, - .sv_maxuser = VM_MAXUSER_ADDRESS, - .sv_usrstack = USRSTACK, - .sv_psstrings = PS_STRINGS, + .sv_maxuser = VM_MAXUSER_ADDRESS_LA48, + .sv_usrstack = USRSTACK_LA48, + .sv_psstrings = PS_STRINGS_LA48, .sv_stackprot = VM_PROT_ALL, .sv_copyout_auxargs = __elfN(freebsd_copyout_auxargs), .sv_copyout_strings = exec_copyout_strings, @@ -78,15 +78,65 @@ struct sysentvec elf64_freebsd_sysvec = { .sv_set_syscall_retval = cpu_set_syscall_retval, .sv_fetch_syscall_args = cpu_fetch_syscall_args, .sv_syscallnames = syscallnames, - .sv_shared_page_base = SHAREDPAGE, + .sv_shared_page_base = SHAREDPAGE_LA48, .sv_shared_page_len = PAGE_SIZE, .sv_schedtail = NULL, .sv_thread_detach = NULL, .sv_trap = NULL, .sv_stackgap = elf64_stackgap, }; -INIT_SYSENTVEC(elf64_sysvec, &elf64_freebsd_sysvec); +struct sysentvec elf64_freebsd_sysvec_la57 = { + .sv_size = SYS_MAXSYSCALL, + .sv_table = sysent, + .sv_errsize = 0, + .sv_errtbl = NULL, + .sv_transtrap = NULL, + .sv_fixup = __elfN(freebsd_fixup), + .sv_sendsig = sendsig, + .sv_sigcode = sigcode, + .sv_szsigcode = &szsigcode, + .sv_name = "FreeBSD ELF64", + .sv_coredump = __elfN(coredump), + .sv_imgact_try = NULL, + .sv_minsigstksz = MINSIGSTKSZ, + .sv_minuser = VM_MIN_ADDRESS, + .sv_maxuser = VM_MAXUSER_ADDRESS_LA57, + .sv_usrstack = USRSTACK_LA57, + .sv_psstrings = PS_STRINGS_LA57, + .sv_stackprot = VM_PROT_ALL, + .sv_copyout_auxargs = __elfN(freebsd_copyout_auxargs), + .sv_copyout_strings = exec_copyout_strings, + .sv_setregs = exec_setregs, + .sv_fixlimit = NULL, + .sv_maxssiz = NULL, + .sv_flags = SV_ABI_FREEBSD | SV_ASLR | SV_LP64 | SV_SHP | + SV_TIMEKEEP, + .sv_set_syscall_retval = cpu_set_syscall_retval, + .sv_fetch_syscall_args = cpu_fetch_syscall_args, + .sv_syscallnames = syscallnames, + .sv_shared_page_base = SHAREDPAGE_LA57, + .sv_shared_page_len = PAGE_SIZE, + .sv_schedtail = NULL, + .sv_thread_detach = NULL, + .sv_trap = NULL, + .sv_stackgap = elf64_stackgap, +}; + +static void +amd64_init_sysvecs(void *arg) +{ + amd64_lower_shared_page(&elf64_freebsd_sysvec_la48); + if (la57) { + exec_sysvec_init(&elf64_freebsd_sysvec_la57); + exec_sysvec_init_secondary(&elf64_freebsd_sysvec_la57, + &elf64_freebsd_sysvec_la48); + } else { + exec_sysvec_init(&elf64_freebsd_sysvec_la48); + 
} +} +SYSINIT(elf64_sysvec, SI_SUB_EXEC, SI_ORDER_ANY, amd64_init_sysvecs, NULL); + void amd64_lower_shared_page(struct sysentvec *sv) { @@ -98,29 +148,57 @@ amd64_lower_shared_page(struct sysentvec *sv) } } -/* - * Do this fixup before INIT_SYSENTVEC (SI_ORDER_ANY) because the latter - * uses the value of sv_shared_page_base. - */ -SYSINIT(elf64_sysvec_fixup, SI_SUB_EXEC, SI_ORDER_FIRST, - (sysinit_cfunc_t) amd64_lower_shared_page, - &elf64_freebsd_sysvec); +static boolean_t +freebsd_brand_info_la57_img_compat(struct image_params *imgp, + int32_t *osrel __unused, uint32_t *fctl0) +{ + if ((imgp->proc->p_md.md_flags & P_MD_LA57) != 0) + return (TRUE); + if (fctl0 == NULL || (*fctl0 & NT_FREEBSD_FCTL_LA48) != 0) + return (FALSE); + if ((imgp->proc->p_md.md_flags & P_MD_LA48) != 0) + return (FALSE); + return (TRUE); +} -static Elf64_Brandinfo freebsd_brand_info = { +static Elf64_Brandinfo freebsd_brand_info_la48 = { .brand = ELFOSABI_FREEBSD, .machine = EM_X86_64, .compat_3_brand = "FreeBSD", .emul_path = NULL, .interp_path = "/libexec/ld-elf.so.1", - .sysvec = &elf64_freebsd_sysvec, + .sysvec = &elf64_freebsd_sysvec_la48, .interp_newpath = NULL, .brand_note = &elf64_freebsd_brandnote, - .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE + .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE, }; +static Elf64_Brandinfo freebsd_brand_info_la57 = { + .brand = ELFOSABI_FREEBSD, + .machine = EM_X86_64, + .compat_3_brand = "FreeBSD", + .emul_path = NULL, + .interp_path = "/libexec/ld-elf.so.1", + .sysvec = &elf64_freebsd_sysvec_la57, + .interp_newpath = NULL, + .brand_note = &elf64_freebsd_brandnote, + .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE, + .header_supported = freebsd_brand_info_la57_img_compat, +}; + +static void +sysinit_register_elf64_brand_entries(void *arg __unused) +{ + /* + * _57 must go first so it can either claim the image or hand + * it to _48. 
+ */ + if (la57) + elf64_insert_brand_entry(&freebsd_brand_info_la57); + elf64_insert_brand_entry(&freebsd_brand_info_la48); +} SYSINIT(elf64, SI_SUB_EXEC, SI_ORDER_FIRST, - (sysinit_cfunc_t) elf64_insert_brand_entry, - &freebsd_brand_info); + sysinit_register_elf64_brand_entries, NULL); static Elf64_Brandinfo freebsd_brand_oinfo = { .brand = ELFOSABI_FREEBSD, @@ -128,15 +206,14 @@ static Elf64_Brandinfo freebsd_brand_oinfo = { .compat_3_brand = "FreeBSD", .emul_path = NULL, .interp_path = "/usr/libexec/ld-elf.so.1", - .sysvec = &elf64_freebsd_sysvec, + .sysvec = &elf64_freebsd_sysvec_la48, .interp_newpath = NULL, .brand_note = &elf64_freebsd_brandnote, .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE }; SYSINIT(oelf64, SI_SUB_EXEC, SI_ORDER_ANY, - (sysinit_cfunc_t) elf64_insert_brand_entry, - &freebsd_brand_oinfo); + (sysinit_cfunc_t)elf64_insert_brand_entry, &freebsd_brand_oinfo); static Elf64_Brandinfo kfreebsd_brand_info = { .brand = ELFOSABI_FREEBSD, @@ -144,15 +221,14 @@ static Elf64_Brandinfo kfreebsd_brand_info = { .compat_3_brand = "FreeBSD", .emul_path = NULL, .interp_path = "/lib/ld-kfreebsd-x86-64.so.1", - .sysvec = &elf64_freebsd_sysvec, + .sysvec = &elf64_freebsd_sysvec_la48, .interp_newpath = NULL, .brand_note = &elf64_kfreebsd_brandnote, .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE_MANDATORY }; SYSINIT(kelf64, SI_SUB_EXEC, SI_ORDER_ANY, - (sysinit_cfunc_t) elf64_insert_brand_entry, - &kfreebsd_brand_info); + (sysinit_cfunc_t)elf64_insert_brand_entry, &kfreebsd_brand_info); void elf64_dump_thread(struct thread *td, void *dst, size_t *off) Modified: head/sys/amd64/amd64/genassym.c ============================================================================== --- head/sys/amd64/amd64/genassym.c Sun Aug 23 20:14:57 2020 (r364526) +++ head/sys/amd64/amd64/genassym.c Sun Aug 23 20:19:04 2020 (r364527) @@ -99,11 +99,10 @@ ASSYM(TDP_KTHREAD, TDP_KTHREAD); ASSYM(PAGE_SIZE, PAGE_SIZE); ASSYM(NPTEPG, NPTEPG); ASSYM(NPDEPG, NPDEPG); -ASSYM(addr_PTmap, addr_PTmap); -ASSYM(addr_PDmap, addr_PDmap); -ASSYM(addr_PDPmap, addr_PDPmap); -ASSYM(addr_PML4map, addr_PML4map); -ASSYM(addr_PML4pml4e, addr_PML4pml4e); +ASSYM(addr_P4Tmap, addr_P4Tmap); +ASSYM(addr_P4Dmap, addr_P4Dmap); +ASSYM(addr_P5Tmap, addr_P5Tmap); +ASSYM(addr_P5Dmap, addr_P5Dmap); ASSYM(PDESIZE, sizeof(pd_entry_t)); ASSYM(PTESIZE, sizeof(pt_entry_t)); ASSYM(PAGE_SHIFT, PAGE_SHIFT); Modified: head/sys/amd64/amd64/locore.S ============================================================================== --- head/sys/amd64/amd64/locore.S Sun Aug 23 20:14:57 2020 (r364526) +++ head/sys/amd64/amd64/locore.S Sun Aug 23 20:19:04 2020 (r364527) @@ -36,13 +36,8 @@ /* * Compiled KERNBASE location */ - .globl kernbase,loc_PTmap,loc_PDmap,loc_PDPmap,loc_PML4map,loc_PML4pml4e,dmapbase,dmapend + .globl kernbase, loc_PTmap, loc_PDmap, loc_PDPmap, dmapbase, dmapend .set kernbase,KERNBASE - .set loc_PTmap,addr_PTmap - .set loc_PDmap,addr_PDmap - .set loc_PDPmap,addr_PDPmap - .set loc_PML4map,addr_PML4map - .set loc_PML4pml4e,addr_PML4pml4e .set dmapbase,DMAP_MIN_ADDRESS .set dmapend,DMAP_MAX_ADDRESS @@ -81,6 +76,62 @@ NON_GPROF_ENTRY(btext) call mi_startup /* autoconfiguration, mountroot etc */ 0: hlt jmp 0b + +/* la57_trampoline(%rdi pml5) */ +NON_GPROF_ENTRY(la57_trampoline) + movq %rsp,%r11 + movq %rbx,%r10 + leaq la57_trampoline_end(%rip),%rsp + + movq %cr0,%rdx + lgdtq la57_trampoline_gdt_desc(%rip) + + pushq $(2<<3) + leaq l1(%rip),%rax + leaq l2(%rip),%rbx + + pushq %rax + lretq + .code32 + +l1: movl $(3<<3),%eax + movl %eax,%ss + + movl %edx,%eax + andl 
$~CR0_PG,%eax + movl %eax,%cr0 + + movl %cr4,%eax + orl $CR4_LA57,%eax + movl %eax,%cr4 + + movl %edi,%cr3 + movl %edx,%cr0 + + pushl $(1<<3) + pushl %ebx + lretl + .code64 + +l2: movq %r11,%rsp + movq %r10,%rbx + retq + .p2align 4,0 +NON_GPROF_ENTRY(la57_trampoline_gdt_desc) + .word la57_trampoline_end - la57_trampoline_gdt + .long 0 /* filled by pmap_bootstrap_la57 */ + .p2align 4,0 +NON_GPROF_ENTRY(la57_trampoline_gdt) + .long 0x00000000 /* null desc */ + .long 0x00000000 + .long 0x00000000 /* 64bit code */ + .long 0x00209800 + .long 0x0000ffff /* 32bit code */ + .long 0x00cf9b00 + .long 0x0000ffff /* universal data */ + .long 0x00cf9300 + .dcb.l 16,0 +NON_GPROF_ENTRY(la57_trampoline_end) .bss ALIGN_DATA /* just to be sure */ Modified: head/sys/amd64/amd64/mp_machdep.c ============================================================================== --- head/sys/amd64/amd64/mp_machdep.c Sun Aug 23 20:14:57 2020 (r364526) +++ head/sys/amd64/amd64/mp_machdep.c Sun Aug 23 20:19:04 2020 (r364527) @@ -96,7 +96,7 @@ __FBSDID("$FreeBSD$"); #define GiB(v) (v ## ULL << 30) -#define AP_BOOTPT_SZ (PAGE_SIZE * 3) +#define AP_BOOTPT_SZ (PAGE_SIZE * 4) /* Temporary variables for init_secondary() */ char *doublefault_stack; @@ -104,6 +104,8 @@ char *mce_stack; char *nmi_stack; char *dbg_stack; +extern u_int mptramp_la57; + /* * Local data and functions. */ @@ -240,6 +242,8 @@ cpu_mp_start(void) assign_cpu_ids(); + mptramp_la57 = la57; + /* Start each Application Processor */ init_ops.start_all_aps(); @@ -395,9 +399,9 @@ mp_realloc_pcpu(int cpuid, int domain) int native_start_all_aps(void) { - u_int64_t *pt4, *pt3, *pt2; + u_int64_t *pt5, *pt4, *pt3, *pt2; u_int32_t mpbioswarmvec; - int apic_id, cpu, domain, i; + int apic_id, cpu, domain, i, xo; u_char mpbiosreason; mtx_init(&ap_boot_mtx, "ap boot", NULL, MTX_SPIN); @@ -406,18 +410,38 @@ native_start_all_aps(void) bcopy(mptramp_start, (void *)PHYS_TO_DMAP(boot_address), bootMP_size); /* Locate the page tables, they'll be below the trampoline */ - pt4 = (uint64_t *)PHYS_TO_DMAP(mptramp_pagetables); + if (la57) { + pt5 = (uint64_t *)PHYS_TO_DMAP(mptramp_pagetables); + xo = 1; + } else { + xo = 0; + } + pt4 = (uint64_t *)PHYS_TO_DMAP(mptramp_pagetables + xo * PAGE_SIZE); pt3 = pt4 + (PAGE_SIZE) / sizeof(u_int64_t); pt2 = pt3 + (PAGE_SIZE) / sizeof(u_int64_t); /* Create the initial 1GB replicated page tables */ for (i = 0; i < 512; i++) { - /* Each slot of the level 4 pages points to the same level 3 page */ - pt4[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables + PAGE_SIZE); + if (la57) { + pt5[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables + + PAGE_SIZE); + pt5[i] |= PG_V | PG_RW | PG_U; + } + + /* + * Each slot of the level 4 pages points to the same + * level 3 page. + */ + pt4[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables + + (xo + 1) * PAGE_SIZE); pt4[i] |= PG_V | PG_RW | PG_U; - /* Each slot of the level 3 pages points to the same level 2 page */ - pt3[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables + (2 * PAGE_SIZE)); + /* + * Each slot of the level 3 pages points to the same + * level 2 page. + */ + pt3[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables + + ((xo + 2) * PAGE_SIZE)); pt3[i] |= PG_V | PG_RW | PG_U; /* The level 2 page slots are mapped with 2MB pages for 1GB. 
*/ Modified: head/sys/amd64/amd64/mpboot.S ============================================================================== --- head/sys/amd64/amd64/mpboot.S Sun Aug 23 20:14:57 2020 (r364526) +++ head/sys/amd64/amd64/mpboot.S Sun Aug 23 20:19:04 2020 (r364527) @@ -90,10 +90,16 @@ protmode: mov $bootdata-gdt, %eax mov %ax, %ds - /* Turn on the PAE bit for when paging is enabled */ + /* + * Turn on the PAE bit and optionally the LA57 bit for when paging + * is later enabled. + */ mov %cr4, %eax orl $CR4_PAE, %eax - mov %eax, %cr4 + cmpb $0, mptramp_la57-mptramp_start(%ebx) + je 1f + orl $CR4_LA57, %eax +1: mov %eax, %cr4 /* * Enable EFER.LME so that we get long mode when all the prereqs are @@ -132,9 +138,9 @@ protmode: /* * At this point paging is enabled, and we are in "compatibility" mode. * We do another far jump to reload %cs with the 64 bit selector. - * %cr3 points to a 4-level page table page. + * %cr3 points to a 4- or 5-level page table. * We cannot yet jump all the way to the kernel because we can only - * specify a 32 bit linear address. So, yet another trampoline. + * specify a 32 bit linear address. So, we use yet another trampoline. * * The following instruction is: * ljmp $kernelcode-gdt, $tramp_64-mptramp_start @@ -209,6 +215,11 @@ gdtend: mptramp_pagetables: .long 0 + /* 5-level paging ? */ + .globl mptramp_la57 +mptramp_la57: + .long 0 + /* * The pseudo descriptor for lgdt to use. */ @@ -251,8 +262,12 @@ entry_64: * Load a real %cr3 that has all the direct map stuff and switches * off the 1GB replicated mirror. Load a stack pointer and jump * into AP startup code in C. - */ + */ + cmpl $0, la57 + jne 2f movq KPML4phys, %rax - movq %rax, %cr3 + jmp 3f +2: movq KPML5phys, %rax +3: movq %rax, %cr3 movq bootSTK, %rsp jmp init_secondary Modified: head/sys/amd64/amd64/pmap.c ============================================================================== --- head/sys/amd64/amd64/pmap.c Sun Aug 23 20:14:57 2020 (r364526) +++ head/sys/amd64/amd64/pmap.c Sun Aug 23 20:19:04 2020 (r364527) @@ -398,6 +398,19 @@ static int pg_ps_enabled = 1; SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &pg_ps_enabled, 0, "Are large page mappings enabled?"); +int __read_frequently la57 = 0; +SYSCTL_INT(_vm_pmap, OID_AUTO, la57, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, + &la57, 0, + "5-level paging for host is enabled"); + +static bool +pmap_is_la57(pmap_t pmap) +{ + if (pmap->pm_type == PT_X86) + return (la57); + return (false); /* XXXKIB handle EPT */ +} + #define PAT_INDEX_SIZE 8 static int pat_index[PAT_INDEX_SIZE]; /* cache mode to PAT index conversion */ @@ -405,7 +418,10 @@ static u_int64_t KPTphys; /* phys addr of kernel level static u_int64_t KPDphys; /* phys addr of kernel level 2 */ u_int64_t KPDPphys; /* phys addr of kernel level 3 */ u_int64_t KPML4phys; /* phys addr of kernel level 4 */ +u_int64_t KPML5phys; /* phys addr of kernel level 5, + if supported */ +static pml4_entry_t *kernel_pml4; static u_int64_t DMPDphys; /* phys addr of direct mapped level 2 */ static u_int64_t DMPDPphys; /* phys addr of direct mapped level 3 */ static int ndmpdpphys; /* number of DMPDPphys pages */ @@ -1257,7 +1273,7 @@ static void pmap_update_pde(pmap_t pmap, vm_offset_t v static void pmap_update_pde_invalidate(pmap_t, vm_offset_t va, pd_entry_t pde); static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, - struct rwlock **lockp); + struct rwlock **lockp, vm_offset_t va); static pd_entry_t *pmap_alloc_pde(pmap_t pmap, vm_offset_t va, vm_page_t *pdpgp, struct rwlock 
**lockp); static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, @@ -1271,22 +1287,87 @@ static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry /* Inline functions */ /********************/ -/* Return a non-clipped PD index for a given VA */ +/* + * Return a non-clipped indexes for a given VA, which are page table + * pages indexes at the corresponding level. + */ static __inline vm_pindex_t pmap_pde_pindex(vm_offset_t va) { return (va >> PDRSHIFT); } +static __inline vm_pindex_t +pmap_pdpe_pindex(vm_offset_t va) +{ + return (NUPDE + (va >> PDPSHIFT)); +} +static __inline vm_pindex_t +pmap_pml4e_pindex(vm_offset_t va) +{ + return (NUPDE + NUPDPE + (va >> PML4SHIFT)); +} + +static __inline vm_pindex_t +pmap_pml5e_pindex(vm_offset_t va) +{ + return (NUPDE + NUPDPE + NUPML4E + (va >> PML5SHIFT)); +} + +static __inline pml4_entry_t * +pmap_pml5e(pmap_t pmap, vm_offset_t va) +{ + + MPASS(pmap_is_la57(pmap)); + return (&pmap->pm_pmltop[pmap_pml5e_index(va)]); +} + +static __inline pml4_entry_t * +pmap_pml5e_u(pmap_t pmap, vm_offset_t va) +{ + + MPASS(pmap_is_la57(pmap)); + return (&pmap->pm_pmltopu[pmap_pml5e_index(va)]); +} + +static __inline pml4_entry_t * +pmap_pml5e_to_pml4e(pml5_entry_t *pml5e, vm_offset_t va) +{ + pml4_entry_t *pml4e; + + /* XXX MPASS(pmap_is_la57(pmap); */ + pml4e = (pml4_entry_t *)PHYS_TO_DMAP(*pml5e & PG_FRAME); + return (&pml4e[pmap_pml4e_index(va)]); +} + /* Return a pointer to the PML4 slot that corresponds to a VA */ static __inline pml4_entry_t * pmap_pml4e(pmap_t pmap, vm_offset_t va) { + pml5_entry_t *pml5e; + pml4_entry_t *pml4e; + pt_entry_t PG_V; - return (&pmap->pm_pml4[pmap_pml4e_index(va)]); + if (pmap_is_la57(pmap)) { + pml5e = pmap_pml5e(pmap, va); + PG_V = pmap_valid_bit(pmap); + if ((*pml5e & PG_V) == 0) + return (NULL); + pml4e = (pml4_entry_t *)PHYS_TO_DMAP(*pml5e & PG_FRAME); + } else { + pml4e = pmap->pm_pmltop; + } + return (&pml4e[pmap_pml4e_index(va)]); } +static __inline pml4_entry_t * +pmap_pml4e_u(pmap_t pmap, vm_offset_t va) +{ + MPASS(!pmap_is_la57(pmap)); + return (&pmap->pm_pmltopu[pmap_pml4e_index(va)]); +} + /* Return a pointer to the PDP slot that corresponds to a VA */ static __inline pdp_entry_t * pmap_pml4e_to_pdpe(pml4_entry_t *pml4e, vm_offset_t va) @@ -1306,7 +1387,7 @@ pmap_pdpe(pmap_t pmap, vm_offset_t va) PG_V = pmap_valid_bit(pmap); pml4e = pmap_pml4e(pmap, va); - if ((*pml4e & PG_V) == 0) + if (pml4e == NULL || (*pml4e & PG_V) == 0) return (NULL); return (pmap_pml4e_to_pdpe(pml4e, va)); } @@ -1387,21 +1468,37 @@ pmap_resident_count_dec(pmap_t pmap, int count) PMAP_INLINE pt_entry_t * vtopte(vm_offset_t va) { - u_int64_t mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1); + u_int64_t mask; KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopte on a uva/gpa 0x%0lx", va)); - return (PTmap + ((va >> PAGE_SHIFT) & mask)); + if (la57) { + mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT + + NPML4EPGSHIFT + NPML5EPGSHIFT)) - 1); + return (P5Tmap + ((va >> PAGE_SHIFT) & mask)); + } else { + mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT + + NPML4EPGSHIFT)) - 1); + return (P4Tmap + ((va >> PAGE_SHIFT) & mask)); + } } static __inline pd_entry_t * vtopde(vm_offset_t va) { - u_int64_t mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1); + u_int64_t mask; KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopde on a uva/gpa 0x%0lx", va)); - return (PDmap + ((va >> PDRSHIFT) & mask)); + if (la57) { + mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT + + NPML4EPGSHIFT + NPML5EPGSHIFT)) - 1); + return 
(P5Dmap + ((va >> PDRSHIFT) & mask)); + } else { + mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT + + NPML4EPGSHIFT)) - 1); + return (P4Dmap + ((va >> PDRSHIFT) & mask)); + } } static u_int64_t @@ -1658,6 +1755,8 @@ create_pagetables(vm_paddr_t *firstaddr) p4_p[KPML4BASE + i] = KPDPphys + ptoa(i); p4_p[KPML4BASE + i] |= X86_PG_RW | X86_PG_V; } + + kernel_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys); } /* @@ -1730,7 +1829,7 @@ pmap_bootstrap(vm_paddr_t *firstaddr) * later unmapped (using pmap_remove()) and freed. */ PMAP_LOCK_INIT(kernel_pmap); - kernel_pmap->pm_pml4 = (pdp_entry_t *)PHYS_TO_DMAP(KPML4phys); + kernel_pmap->pm_pmltop = kernel_pml4; kernel_pmap->pm_cr3 = KPML4phys; kernel_pmap->pm_ucr3 = PMAP_NO_CR3; CPU_FILL(&kernel_pmap->pm_active); /* don't allow deactivation */ @@ -1891,6 +1990,148 @@ pmap_init_pat(void) load_cr4(cr4); } +extern const char la57_trampoline[], la57_trampoline_gdt_desc[], + la57_trampoline_gdt[], la57_trampoline_end[]; + +static void +pmap_bootstrap_la57(void *arg __unused) +{ + char *v_code; + pml5_entry_t *v_pml5; + pml4_entry_t *v_pml4; + pdp_entry_t *v_pdp; + pd_entry_t *v_pd; + pt_entry_t *v_pt; + vm_page_t m_code, m_pml4, m_pdp, m_pd, m_pt, m_pml5; + void (*la57_tramp)(uint64_t pml5); + struct region_descriptor r_gdt; + + if ((cpu_stdext_feature2 & CPUID_STDEXT2_LA57) == 0) + return; + if (!TUNABLE_INT_FETCH("vm.pmap.la57", &la57)) + la57 = 1; + if (!la57) + return; + + r_gdt.rd_limit = NGDT * sizeof(struct user_segment_descriptor) - 1; + r_gdt.rd_base = (long)__pcpu[0].pc_gdt; + + m_code = vm_page_alloc_contig(NULL, 0, + VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY | VM_ALLOC_ZERO | VM_ALLOC_NOOBJ, + 1, 0, (1ULL << 32), PAGE_SIZE, 0, VM_MEMATTR_DEFAULT); + if ((m_code->flags & PG_ZERO) == 0) + pmap_zero_page(m_code); + v_code = (char *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_code)); + m_pml5 = vm_page_alloc_contig(NULL, 0, + VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY | VM_ALLOC_ZERO | VM_ALLOC_NOOBJ, + 1, 0, (1ULL << 32), PAGE_SIZE, 0, VM_MEMATTR_DEFAULT); + if ((m_pml5->flags & PG_ZERO) == 0) + pmap_zero_page(m_pml5); + KPML5phys = VM_PAGE_TO_PHYS(m_pml5); + v_pml5 = (pml5_entry_t *)PHYS_TO_DMAP(KPML5phys); + m_pml4 = vm_page_alloc_contig(NULL, 0, + VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY | VM_ALLOC_ZERO | VM_ALLOC_NOOBJ, + 1, 0, (1ULL << 32), PAGE_SIZE, 0, VM_MEMATTR_DEFAULT); + if ((m_pml4->flags & PG_ZERO) == 0) + pmap_zero_page(m_pml4); + v_pml4 = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pml4)); + m_pdp = vm_page_alloc_contig(NULL, 0, + VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY | VM_ALLOC_ZERO | VM_ALLOC_NOOBJ, + 1, 0, (1ULL << 32), PAGE_SIZE, 0, VM_MEMATTR_DEFAULT); + if ((m_pdp->flags & PG_ZERO) == 0) + pmap_zero_page(m_pdp); + v_pdp = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pdp)); + m_pd = vm_page_alloc_contig(NULL, 0, + VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY | VM_ALLOC_ZERO | VM_ALLOC_NOOBJ, + 1, 0, (1ULL << 32), PAGE_SIZE, 0, VM_MEMATTR_DEFAULT); + if ((m_pd->flags & PG_ZERO) == 0) + pmap_zero_page(m_pd); + v_pd = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pd)); + m_pt = vm_page_alloc_contig(NULL, 0, + VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY | VM_ALLOC_ZERO | VM_ALLOC_NOOBJ, + 1, 0, (1ULL << 32), PAGE_SIZE, 0, VM_MEMATTR_DEFAULT); + if ((m_pt->flags & PG_ZERO) == 0) + pmap_zero_page(m_pt); + v_pt = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pt)); + + /* + * Map m_code 1:1, it appears below 4G in KVA due to physical + * address being below 4G. Since kernel KVA is in upper half, + * the pml4e should be zero and free for temporary use. 
+ */ + kernel_pmap->pm_pmltop[pmap_pml4e_index(VM_PAGE_TO_PHYS(m_code))] = + VM_PAGE_TO_PHYS(m_pdp) | X86_PG_V | X86_PG_RW | X86_PG_A | + X86_PG_M; + v_pdp[pmap_pdpe_index(VM_PAGE_TO_PHYS(m_code))] = + VM_PAGE_TO_PHYS(m_pd) | X86_PG_V | X86_PG_RW | X86_PG_A | + X86_PG_M; + v_pd[pmap_pde_index(VM_PAGE_TO_PHYS(m_code))] = + VM_PAGE_TO_PHYS(m_pt) | X86_PG_V | X86_PG_RW | X86_PG_A | + X86_PG_M; + v_pt[pmap_pte_index(VM_PAGE_TO_PHYS(m_code))] = + VM_PAGE_TO_PHYS(m_code) | X86_PG_V | X86_PG_RW | X86_PG_A | + X86_PG_M; + + /* + * Add pml5 entry at top of KVA pointing to existing pml4 table, + * entering all existing kernel mappings into level 5 table. + */ + v_pml5[pmap_pml5e_index(UPT_MAX_ADDRESS)] = KPML4phys | X86_PG_V | + X86_PG_RW | X86_PG_A | X86_PG_M | pg_g; + + /* + * Add pml5 entry for 1:1 trampoline mapping after LA57 is turned on. + */ + v_pml5[pmap_pml5e_index(VM_PAGE_TO_PHYS(m_code))] = + VM_PAGE_TO_PHYS(m_pml4) | X86_PG_V | X86_PG_RW | X86_PG_A | + X86_PG_M; + v_pml4[pmap_pml4e_index(VM_PAGE_TO_PHYS(m_code))] = + VM_PAGE_TO_PHYS(m_pdp) | X86_PG_V | X86_PG_RW | X86_PG_A | + X86_PG_M; + + /* + * Copy and call the 48->57 trampoline, hope we return there, alive. + */ + bcopy(la57_trampoline, v_code, la57_trampoline_end - la57_trampoline); + *(u_long *)(v_code + 2 + (la57_trampoline_gdt_desc - la57_trampoline)) = + la57_trampoline_gdt - la57_trampoline + VM_PAGE_TO_PHYS(m_code); + la57_tramp = (void (*)(uint64_t))VM_PAGE_TO_PHYS(m_code); + la57_tramp(KPML5phys); + + /* + * gdt was necessary reset, switch back to our gdt. + */ + lgdt(&r_gdt); + wrmsr(MSR_GSBASE, (uint64_t)&__pcpu[0]); + load_ds(_udatasel); + load_es(_udatasel); + load_fs(_ufssel); + ssdtosyssd(&gdt_segs[GPROC0_SEL], + (struct system_segment_descriptor *)&__pcpu[0].pc_gdt[GPROC0_SEL]); + ltr(GSEL(GPROC0_SEL, SEL_KPL)); + + /* + * Now unmap the trampoline, and free the pages. + * Clear pml5 entry used for 1:1 trampoline mapping. + */ + pte_clear(&v_pml5[pmap_pml5e_index(VM_PAGE_TO_PHYS(m_code))]); + invlpg((vm_offset_t)v_code); + vm_page_free(m_code); + vm_page_free(m_pdp); + vm_page_free(m_pd); + vm_page_free(m_pt); + + /* + * Recursively map PML5 to itself in order to get PTmap and + * PDmap. + */ + v_pml5[PML5PML5I] = KPML5phys | X86_PG_RW | X86_PG_V | pg_nx; + + kernel_pmap->pm_cr3 = KPML5phys; + kernel_pmap->pm_pmltop = v_pml5; +} +SYSINIT(la57, SI_SUB_KMEM, SI_ORDER_ANY, pmap_bootstrap_la57, NULL); + /* * Initialize a vm_page's machine-dependent fields. 
*/ @@ -2190,7 +2431,8 @@ pmap_init(void) } for (i = 0; i < lm_ents; i++) { m = pmap_large_map_getptp_unlocked(); - kernel_pmap->pm_pml4[LMSPML4I + i] = X86_PG_V | + /* XXXKIB la57 */ + kernel_pml4[LMSPML4I + i] = X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M | pg_nx | VM_PAGE_TO_PHYS(m); } @@ -3566,44 +3808,57 @@ pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) { + pml5_entry_t *pml5; + pml4_entry_t *pml4; + pdp_entry_t *pdp; + pd_entry_t *pd; + vm_page_t pdpg, pdppg, pml4pg; PMAP_LOCK_ASSERT(pmap, MA_OWNED); + /* * unmap the page table page */ - if (m->pindex >= NUPDE + NUPDPE) { + if (m->pindex >= NUPDE + NUPDPE + NUPML4E) { + /* PML4 page */ + MPASS(pmap_is_la57(pmap)); + pml5 = pmap_pml5e(pmap, va); + *pml5 = 0; + if (pmap->pm_pmltopu != NULL && va <= VM_MAXUSER_ADDRESS) { + pml5 = pmap_pml5e_u(pmap, va); + *pml5 = 0; + } + } else if (m->pindex >= NUPDE + NUPDPE) { /* PDP page */ - pml4_entry_t *pml4; pml4 = pmap_pml4e(pmap, va); *pml4 = 0; - if (pmap->pm_pml4u != NULL && va <= VM_MAXUSER_ADDRESS) { - pml4 = &pmap->pm_pml4u[pmap_pml4e_index(va)]; + if (!pmap_is_la57(pmap) && pmap->pm_pmltopu != NULL && + va <= VM_MAXUSER_ADDRESS) { + pml4 = pmap_pml4e_u(pmap, va); *pml4 = 0; } } else if (m->pindex >= NUPDE) { /* PD page */ - pdp_entry_t *pdp; pdp = pmap_pdpe(pmap, va); *pdp = 0; } else { /* PTE page */ - pd_entry_t *pd; pd = pmap_pde(pmap, va); *pd = 0; } pmap_resident_count_dec(pmap, 1); if (m->pindex < NUPDE) { /* We just released a PT, unhold the matching PD */ - vm_page_t pdpg; - pdpg = PHYS_TO_VM_PAGE(*pmap_pdpe(pmap, va) & PG_FRAME); pmap_unwire_ptp(pmap, va, pdpg, free); } else if (m->pindex < NUPDE + NUPDPE) { /* We just released a PD, unhold the matching PDP */ - vm_page_t pdppg; - pdppg = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, va) & PG_FRAME); pmap_unwire_ptp(pmap, va, pdppg, free); + } else if (m->pindex < NUPDE + NUPDPE + NUPML4E && pmap_is_la57(pmap)) { + /* We just released a PDP, unhold the matching PML4 */ + pml4pg = PHYS_TO_VM_PAGE(*pmap_pml5e(pmap, va) & PG_FRAME); + pmap_unwire_ptp(pmap, va, pml4pg, free); } /* @@ -3659,9 +3914,9 @@ pmap_pinit0(pmap_t pmap) int i; PMAP_LOCK_INIT(pmap); - pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys); - pmap->pm_pml4u = NULL; - pmap->pm_cr3 = KPML4phys; + pmap->pm_pmltop = kernel_pmap->pm_pmltop; + pmap->pm_pmltopu = NULL; + pmap->pm_cr3 = kernel_pmap->pm_cr3; /* hack to keep pmap_pti_pcid_invalidate() alive */ pmap->pm_ucr3 = PMAP_NO_CR3; pmap->pm_root.rt_root = 0; @@ -3714,20 +3969,61 @@ pmap_pinit_pml4(vm_page_t pml4pg) /* install large map entries if configured */ for (i = 0; i < lm_ents; i++) - pm_pml4[LMSPML4I + i] = kernel_pmap->pm_pml4[LMSPML4I + i]; + pm_pml4[LMSPML4I + i] = kernel_pmap->pm_pmltop[LMSPML4I + i]; } +void +pmap_pinit_pml5(vm_page_t pml5pg) +{ + pml5_entry_t *pm_pml5; + + pm_pml5 = (pml5_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml5pg)); + + /* + * Add pml5 entry at top of KVA pointing to existing pml4 table, + * entering all existing kernel mappings into level 5 table. + */ + pm_pml5[pmap_pml5e_index(UPT_MAX_ADDRESS)] = KPML4phys | X86_PG_V | + X86_PG_RW | X86_PG_A | X86_PG_M | pg_g | + pmap_cache_bits(kernel_pmap, VM_MEMATTR_DEFAULT, FALSE); + + /* + * Install self-referential address mapping entry. 
+ */ + pm_pml5[PML5PML5I] = VM_PAGE_TO_PHYS(pml5pg) | + X86_PG_RW | X86_PG_V | X86_PG_M | X86_PG_A | + pmap_cache_bits(kernel_pmap, VM_MEMATTR_DEFAULT, FALSE); +} + static void -pmap_pinit_pml4_pti(vm_page_t pml4pg) +pmap_pinit_pml4_pti(vm_page_t pml4pgu) { - pml4_entry_t *pm_pml4; + pml4_entry_t *pm_pml4u; int i; - pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pg)); + pm_pml4u = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pgu)); for (i = 0; i < NPML4EPG; i++) - pm_pml4[i] = pti_pml4[i]; + pm_pml4u[i] = pti_pml4[i]; } +static void +pmap_pinit_pml5_pti(vm_page_t pml5pgu) +{ + pml5_entry_t *pm_pml5u; + + pm_pml5u = (pml5_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml5pgu)); + + /* + * Add pml5 entry at top of KVA pointing to existing pml4 pti + * table, entering all kernel mappings needed for usermode + * into level 5 table. + */ + pm_pml5u[pmap_pml5e_index(UPT_MAX_ADDRESS)] = + pmap_kextract((vm_offset_t)pti_pml4) | + X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M | pg_g | + pmap_cache_bits(kernel_pmap, VM_MEMATTR_DEFAULT, FALSE); +} + /* * Initialize a preallocated and zeroed pmap structure, * such as one in a vmspace structure. @@ -3735,29 +4031,30 @@ pmap_pinit_pml4_pti(vm_page_t pml4pg) int pmap_pinit_type(pmap_t pmap, enum pmap_type pm_type, int flags) { - vm_page_t pml4pg, pml4pgu; - vm_paddr_t pml4phys; + vm_page_t pmltop_pg, pmltop_pgu; + vm_paddr_t pmltop_phys; int i; /* * allocate the page directory page */ - pml4pg = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | + pmltop_pg = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO | VM_ALLOC_WAITOK); - pml4phys = VM_PAGE_TO_PHYS(pml4pg); - pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(pml4phys); + pmltop_phys = VM_PAGE_TO_PHYS(pmltop_pg); + pmap->pm_pmltop = (pml5_entry_t *)PHYS_TO_DMAP(pmltop_phys); + CPU_FOREACH(i) { pmap->pm_pcids[i].pm_pcid = PMAP_PCID_NONE; pmap->pm_pcids[i].pm_gen = 0; } pmap->pm_cr3 = PMAP_NO_CR3; /* initialize to an invalid value */ pmap->pm_ucr3 = PMAP_NO_CR3; - pmap->pm_pml4u = NULL; + pmap->pm_pmltopu = NULL; pmap->pm_type = pm_type; - if ((pml4pg->flags & PG_ZERO) == 0) - pagezero(pmap->pm_pml4); + if ((pmltop_pg->flags & PG_ZERO) == 0) + pagezero(pmap->pm_pmltop); /* * Do not install the host kernel mappings in the nested page @@ -3766,15 +4063,21 @@ pmap_pinit_type(pmap_t pmap, enum pmap_type pm_type, i * Install minimal kernel mappings in PTI case. */ if (pm_type == PT_X86) { - pmap->pm_cr3 = pml4phys; - pmap_pinit_pml4(pml4pg); + pmap->pm_cr3 = pmltop_phys; + if (pmap_is_la57(pmap)) + pmap_pinit_pml5(pmltop_pg); + else + pmap_pinit_pml4(pmltop_pg); if ((curproc->p_md.md_flags & P_MD_KPTI) != 0) { *** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
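The diff output above is truncated by the archive.  As a companion to the log's note that page table page numbering is always done for 5-level structures, here is a small standalone sketch of that stacked pindex scheme; it is not part of the commit, the NUP* counts are derived under an assumed LA57 sizing (half of the top level reserved for userspace), and the authoritative definitions live in sys/amd64/include/pmap.h:

/*
 * Standalone sketch, not from the commit: page table pages of a user
 * pmap are numbered in one flat pindex space, PT pages first, then PD,
 * PDP, and PML4 pages, regardless of whether the pmap runs with 4 or
 * 5 levels.  The counts below assume LA57 sizing and are illustrative.
 */
#include <stdint.h>
#include <stdio.h>

#define	PDRSHIFT	21		/* bits translated below a PDE (2M) */
#define	PDPSHIFT	30		/* bits translated below a PDPE (1G) */
#define	PML4SHIFT	39
#define	PML5SHIFT	48

#define	NUPML5E		(512ULL / 2)		/* user PML4 pages (assumed) */
#define	NUPML4E		(NUPML5E * 512)		/* user PDP pages */
#define	NUPDPE		(NUPML4E * 512)		/* user PD pages */
#define	NUPDE		(NUPDPE * 512)		/* user PT pages */

static uint64_t
pde_pindex(uint64_t va)
{

	return (va >> PDRSHIFT);
}

static uint64_t
pdpe_pindex(uint64_t va)
{

	return (NUPDE + (va >> PDPSHIFT));
}

static uint64_t
pml4e_pindex(uint64_t va)
{

	return (NUPDE + NUPDPE + (va >> PML4SHIFT));
}

static uint64_t
pml5e_pindex(uint64_t va)
{

	return (NUPDE + NUPDPE + NUPML4E + (va >> PML5SHIFT));
}

int
main(void)
{
	uint64_t va = 0x00007fffffffe000ULL;

	printf("PT page %ju, PD page %ju, PDP page %ju, PML4 page %ju\n",
	    (uintmax_t)pde_pindex(va), (uintmax_t)pdpe_pindex(va),
	    (uintmax_t)pml4e_pindex(va), (uintmax_t)pml5e_pindex(va));
	return (0);
}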