Date: Sun, 8 Aug 2021 19:53:03 GMT From: Konstantin Belousov <kib@FreeBSD.org> To: src-committers@FreeBSD.org, dev-commits-src-all@FreeBSD.org, dev-commits-src-main@FreeBSD.org Subject: git: f75caed644a5 - main - amd64 UEFI loader: stop copying staging area to 2M physical Message-ID: <202108081953.178Jr3Av034565@gitrepo.freebsd.org>
next in thread | raw e-mail | index | archive | help
The branch main has been updated by kib: URL: https://cgit.FreeBSD.org/src/commit/?id=f75caed644a5c8c342a1ea5e7a6d5251f82ed0b1 commit f75caed644a5c8c342a1ea5e7a6d5251f82ed0b1 Author: Konstantin Belousov <kib@FreeBSD.org> AuthorDate: 2021-07-10 19:55:56 +0000 Commit: Konstantin Belousov <kib@FreeBSD.org> CommitDate: 2021-08-08 19:52:29 +0000 amd64 UEFI loader: stop copying staging area to 2M physical On amd64, add a possibility to activate kernel with staging area in place. Add 'copy_staging' command to control this. For now, by default the old mode of copying kernel to 2M phys is retained. It is going to be changed in several weeks. On amd64, add some slop to the staging area to satisfy both requirements of the kernel startup allocator, and to have space for minor staging data increase after the final size is calculated. Add a new command 'staging_slop' to control its size. Improve staging area resizing, in particular, reallocate it anew if we cannot grow it neither down nor up. Reviewed by: kevans, markj Discussed with: emaste (the delivery plan) Sponsored by: The FreeBSD Foundation MFC after: 1 week Differential revision: https://reviews.freebsd.org/D31121 --- stand/common/bootstrap.h | 3 + stand/common/load_elf.c | 15 ++ stand/efi/loader/arch/amd64/elf64_freebsd.c | 142 +++++++++++++---- stand/efi/loader/bootinfo.c | 6 +- stand/efi/loader/copy.c | 232 ++++++++++++++++++++++++---- stand/efi/loader/loader_efi.h | 10 ++ 6 files changed, 348 insertions(+), 60 deletions(-) diff --git a/stand/common/bootstrap.h b/stand/common/bootstrap.h index b7e1f8553f47..eb4e50203133 100644 --- a/stand/common/bootstrap.h +++ b/stand/common/bootstrap.h @@ -228,6 +228,9 @@ struct preloaded_file size_t f_size; /* file size */ struct kernel_module *f_modules; /* list of modules if any */ struct preloaded_file *f_next; /* next file */ +#ifdef __amd64__ + bool f_kernphys_relocatable; +#endif }; struct file_format diff --git a/stand/common/load_elf.c b/stand/common/load_elf.c index f1a9ff8e0c22..9ae91036dbb4 100644 --- a/stand/common/load_elf.c +++ b/stand/common/load_elf.c @@ -207,6 +207,18 @@ static int elf_section_header_convert(const Elf_Ehdr *ehdr, Elf_Shdr *shdr) #undef CONVERT_SWITCH #undef CONVERT_FIELD + +#ifdef __amd64__ +static bool +is_kernphys_relocatable(elf_file_t ef) +{ + Elf_Sym sym; + + return (__elfN(lookup_symbol)(ef, "kernphys", &sym, STT_OBJECT) == 0 && + sym.st_size == 8); +} +#endif + static int __elfN(load_elf_header)(char *filename, elf_file_t ef) { @@ -434,6 +446,9 @@ __elfN(loadfile_raw)(char *filename, uint64_t dest, /* Load OK, return module pointer */ *result = (struct preloaded_file *)fp; err = 0; +#ifdef __amd64__ + fp->f_kernphys_relocatable = is_kernphys_relocatable(&ef); +#endif goto out; ioerr: diff --git a/stand/efi/loader/arch/amd64/elf64_freebsd.c b/stand/efi/loader/arch/amd64/elf64_freebsd.c index a950ca55e843..d0c8ef96eeea 100644 --- a/stand/efi/loader/arch/amd64/elf64_freebsd.c +++ b/stand/efi/loader/arch/amd64/elf64_freebsd.c @@ -82,7 +82,11 @@ struct file_format *file_formats[] = { static pml4_entry_t *PT4; static pdp_entry_t *PT3; +static pdp_entry_t *PT3_l, *PT3_u; static pd_entry_t *PT2; +static pd_entry_t *PT2_l0, *PT2_l1, *PT2_l2, *PT2_l3, *PT2_u0, *PT2_u1; + +extern EFI_PHYSICAL_ADDRESS staging; static void (*trampoline)(uint64_t stack, void *copy_finish, uint64_t kernend, uint64_t modulep, pml4_entry_t *pagetable, uint64_t entry); @@ -105,6 +109,12 @@ elf64_exec(struct preloaded_file *fp) ACPI_TABLE_RSDP *rsdp; char buf[24]; int revision; + bool copy_auto; + + copy_auto = copy_staging == COPY_STAGING_AUTO; + if (copy_auto) + copy_staging = fp->f_kernphys_relocatable ? + COPY_STAGING_DISABLE : COPY_STAGING_ENABLE; /* * Report the RSDP to the kernel. While this can be found with @@ -151,57 +161,133 @@ elf64_exec(struct preloaded_file *fp) } if ((md = file_findmetadata(fp, MODINFOMD_ELFHDR)) == NULL) - return(EFTYPE); + return (EFTYPE); ehdr = (Elf_Ehdr *)&(md->md_data); - trampcode = (vm_offset_t)0x0000000040000000; + trampcode = copy_staging == COPY_STAGING_ENABLE ? + (vm_offset_t)0x0000000040000000 /* 1G */ : + (vm_offset_t)0x0000000100000000; /* 4G */; err = BS->AllocatePages(AllocateMaxAddress, EfiLoaderData, 1, (EFI_PHYSICAL_ADDRESS *)&trampcode); + if (EFI_ERROR(err)) { + printf("Unable to allocate trampoline\n"); + if (copy_auto) + copy_staging = COPY_STAGING_AUTO; + return (ENOMEM); + } bzero((void *)trampcode, EFI_PAGE_SIZE); trampstack = trampcode + EFI_PAGE_SIZE - 8; bcopy((void *)&amd64_tramp, (void *)trampcode, amd64_tramp_size); trampoline = (void *)trampcode; - PT4 = (pml4_entry_t *)0x0000000040000000; - err = BS->AllocatePages(AllocateMaxAddress, EfiLoaderData, 3, - (EFI_PHYSICAL_ADDRESS *)&PT4); - bzero(PT4, 3 * EFI_PAGE_SIZE); + if (copy_staging == COPY_STAGING_ENABLE) { + PT4 = (pml4_entry_t *)0x0000000040000000; + err = BS->AllocatePages(AllocateMaxAddress, EfiLoaderData, 3, + (EFI_PHYSICAL_ADDRESS *)&PT4); + if (EFI_ERROR(err)) { + printf("Unable to allocate trampoline page table\n"); + BS->FreePages(trampcode, 1); + if (copy_auto) + copy_staging = COPY_STAGING_AUTO; + return (ENOMEM); + } + bzero(PT4, 3 * EFI_PAGE_SIZE); + PT3 = &PT4[512]; + PT2 = &PT3[512]; + + /* + * This is kinda brutal, but every single 1GB VM + * memory segment points to the same first 1GB of + * physical memory. But it is more than adequate. + */ + for (i = 0; i < NPTEPG; i++) { + /* + * Each slot of the L4 pages points to the + * same L3 page. + */ + PT4[i] = (pml4_entry_t)PT3; + PT4[i] |= PG_V | PG_RW; + + /* + * Each slot of the L3 pages points to the + * same L2 page. + */ + PT3[i] = (pdp_entry_t)PT2; + PT3[i] |= PG_V | PG_RW; + + /* + * The L2 page slots are mapped with 2MB pages for 1GB. + */ + PT2[i] = (pd_entry_t)i * (2 * 1024 * 1024); + PT2[i] |= PG_V | PG_RW | PG_PS; + } + } else { + PT4 = (pml4_entry_t *)0x0000000100000000; /* 4G */ + err = BS->AllocatePages(AllocateMaxAddress, EfiLoaderData, 9, + (EFI_PHYSICAL_ADDRESS *)&PT4); + if (EFI_ERROR(err)) { + printf("Unable to allocate trampoline page table\n"); + BS->FreePages(trampcode, 9); + if (copy_auto) + copy_staging = COPY_STAGING_AUTO; + return (ENOMEM); + } - PT3 = &PT4[512]; - PT2 = &PT3[512]; + bzero(PT4, 9 * EFI_PAGE_SIZE); + + PT3_l = &PT4[NPML4EPG * 1]; + PT3_u = &PT4[NPML4EPG * 2]; + PT2_l0 = &PT4[NPML4EPG * 3]; + PT2_l1 = &PT4[NPML4EPG * 4]; + PT2_l2 = &PT4[NPML4EPG * 5]; + PT2_l3 = &PT4[NPML4EPG * 6]; + PT2_u0 = &PT4[NPML4EPG * 7]; + PT2_u1 = &PT4[NPML4EPG * 8]; + + /* 1:1 mapping of lower 4G */ + PT4[0] = (pml4_entry_t)PT3_l | PG_V | PG_RW; + PT3_l[0] = (pdp_entry_t)PT2_l0 | PG_V | PG_RW; + PT3_l[1] = (pdp_entry_t)PT2_l1 | PG_V | PG_RW; + PT3_l[2] = (pdp_entry_t)PT2_l2 | PG_V | PG_RW; + PT3_l[3] = (pdp_entry_t)PT2_l3 | PG_V | PG_RW; + for (i = 0; i < 4 * NPDEPG; i++) { + PT2_l0[i] = ((pd_entry_t)i << PDRSHIFT) | PG_V | + PG_RW | PG_PS; + } - /* - * This is kinda brutal, but every single 1GB VM memory segment points - * to the same first 1GB of physical memory. But it is more than - * adequate. - */ - for (i = 0; i < 512; i++) { - /* Each slot of the L4 pages points to the same L3 page. */ - PT4[i] = (pml4_entry_t)PT3; - PT4[i] |= PG_V | PG_RW; - - /* Each slot of the L3 pages points to the same L2 page. */ - PT3[i] = (pdp_entry_t)PT2; - PT3[i] |= PG_V | PG_RW; - - /* The L2 page slots are mapped with 2MB pages for 1GB. */ - PT2[i] = i * (2 * 1024 * 1024); - PT2[i] |= PG_V | PG_RW | PG_PS; + /* mapping of kernel 2G below top */ + PT4[NPML4EPG - 1] = (pml4_entry_t)PT3_u | PG_V | PG_RW; + PT3_u[NPDPEPG - 2] = (pdp_entry_t)PT2_u0 | PG_V | PG_RW; + PT3_u[NPDPEPG - 1] = (pdp_entry_t)PT2_u1 | PG_V | PG_RW; + /* compat mapping of phys @0 */ + PT2_u0[0] = PG_PS | PG_V | PG_RW; + /* this maps past staging area */ + for (i = 1; i < 2 * NPDEPG; i++) { + PT2_u0[i] = ((pd_entry_t)staging + + ((pd_entry_t)i - 1) * NBPDR) | + PG_V | PG_RW | PG_PS; + } } + printf("staging %#lx (%scoping) tramp %p PT4 %p\n", + staging, copy_staging == COPY_STAGING_ENABLE ? "" : "not ", + trampoline, PT4); printf("Start @ 0x%lx ...\n", ehdr->e_entry); efi_time_fini(); err = bi_load(fp->f_args, &modulep, &kernend, true); if (err != 0) { efi_time_init(); - return(err); + if (copy_auto) + copy_staging = COPY_STAGING_AUTO; + return (err); } dev_cleanup(); - trampoline(trampstack, efi_copy_finish, kernend, modulep, PT4, - ehdr->e_entry); + trampoline(trampstack, copy_staging == COPY_STAGING_ENABLE ? + efi_copy_finish : efi_copy_finish_nop, kernend, modulep, + PT4, ehdr->e_entry); panic("exec returned"); } diff --git a/stand/efi/loader/bootinfo.c b/stand/efi/loader/bootinfo.c index 9924901d29e6..f4501f18f14c 100644 --- a/stand/efi/loader/bootinfo.c +++ b/stand/efi/loader/bootinfo.c @@ -65,6 +65,8 @@ int bi_load(char *args, vm_offset_t *modulep, vm_offset_t *kernendp, extern EFI_SYSTEM_TABLE *ST; +int boot_services_gone; + static int bi_getboothowto(char *kargs) { @@ -396,8 +398,10 @@ bi_load_efi_data(struct preloaded_file *kfp, bool exit_bs) if (!exit_bs) break; status = BS->ExitBootServices(IH, efi_mapkey); - if (!EFI_ERROR(status)) + if (!EFI_ERROR(status)) { + boot_services_gone = 1; break; + } } if (retry == 0) { diff --git a/stand/efi/loader/copy.c b/stand/efi/loader/copy.c index e723b61e3bca..b8ed4c8e027e 100644 --- a/stand/efi/loader/copy.c +++ b/stand/efi/loader/copy.c @@ -39,6 +39,11 @@ __FBSDID("$FreeBSD$"); #include "loader_efi.h" +#define M(x) ((x) * 1024 * 1024) +#define G(x) (1UL * (x) * 1024 * 1024 * 1024) + +extern int boot_services_gone; + #if defined(__i386__) || defined(__amd64__) #include <machine/cpufunc.h> #include <machine/specialreg.h> @@ -175,24 +180,142 @@ out: #ifndef EFI_STAGING_SIZE #if defined(__arm__) -#define EFI_STAGING_SIZE 32 +#define EFI_STAGING_SIZE M(32) +#else +#define EFI_STAGING_SIZE M(64) +#endif +#endif + +#if defined(__aarch64__) || defined(__amd64__) || defined(__arm__) || \ + defined(__riscv) +#define EFI_STAGING_2M_ALIGN 1 #else -#define EFI_STAGING_SIZE 64 +#define EFI_STAGING_2M_ALIGN 0 #endif + +#if defined(__amd64__) +#define EFI_STAGING_SLOP M(8) +#else +#define EFI_STAGING_SLOP 0 #endif +static u_long staging_slop = EFI_STAGING_SLOP; + EFI_PHYSICAL_ADDRESS staging, staging_end, staging_base; int stage_offset_set = 0; ssize_t stage_offset; +static void +efi_copy_free(void) +{ + BS->FreePages(staging_base, (staging_end - staging_base) / + EFI_PAGE_SIZE); + stage_offset_set = 0; + stage_offset = 0; +} + +#ifdef __amd64__ +int copy_staging = COPY_STAGING_ENABLE; + +static int +command_copy_staging(int argc, char *argv[]) +{ + static const char *const mode[3] = { + [COPY_STAGING_ENABLE] = "enable", + [COPY_STAGING_DISABLE] = "disable", + [COPY_STAGING_AUTO] = "auto", + }; + int prev, res; + + res = CMD_OK; + if (argc > 2) { + res = CMD_ERROR; + } else if (argc == 2) { + prev = copy_staging; + if (strcmp(argv[1], "enable") == 0) + copy_staging = COPY_STAGING_ENABLE; + else if (strcmp(argv[1], "disable") == 0) + copy_staging = COPY_STAGING_DISABLE; + else if (strcmp(argv[1], "auto") == 0) + copy_staging = COPY_STAGING_AUTO; + else { + printf("usage: copy_staging enable|disable|auto\n"); + res = CMD_ERROR; + } + if (res == CMD_OK && prev != copy_staging) { + printf("changed copy_staging, unloading kernel\n"); + unload(); + efi_copy_free(); + efi_copy_init(); + } + } else { + printf("copy staging: %s\n", mode[copy_staging]); + } + return (res); +} +COMMAND_SET(copy_staging, "copy_staging", "copy staging", command_copy_staging); +#endif + +static int +command_staging_slop(int argc, char *argv[]) +{ + char *endp; + u_long new, prev; + int res; + + res = CMD_OK; + if (argc > 2) { + res = CMD_ERROR; + } else if (argc == 2) { + new = strtoul(argv[1], &endp, 0); + if (*endp != '\0') { + printf("invalid slop value\n"); + res = CMD_ERROR; + } + if (res == CMD_OK && staging_slop != new) { + printf("changed slop, unloading kernel\n"); + unload(); + efi_copy_free(); + efi_copy_init(); + } + } else { + printf("staging slop %#lx\n", staging_slop); + } + return (res); +} +COMMAND_SET(staging_slop, "staging_slop", "set staging slop", + command_staging_slop); + +#if defined(__i386__) || defined(__amd64__) +/* + * The staging area must reside in the the first 1GB or 4GB physical + * memory: see elf64_exec() in + * boot/efi/loader/arch/amd64/elf64_freebsd.c. + */ +static EFI_PHYSICAL_ADDRESS +get_staging_max(void) +{ + EFI_PHYSICAL_ADDRESS res; + +#if defined(__i386__) + res = G(1); +#elif defined(__amd64__) + res = copy_staging == COPY_STAGING_ENABLE ? G(1) : G(4); +#endif + return (res); +} +#define EFI_ALLOC_METHOD AllocateMaxAddress +#else +#define EFI_ALLOC_METHOD AllocateAnyPages +#endif + int efi_copy_init(void) { EFI_STATUS status; - unsigned long nr_pages; - nr_pages = EFI_SIZE_TO_PAGES((EFI_STAGING_SIZE) * 1024 * 1024); + nr_pages = EFI_SIZE_TO_PAGES((EFI_STAGING_SIZE)); #if defined(__i386__) || defined(__amd64__) /* @@ -203,18 +326,10 @@ efi_copy_init(void) if (running_on_hyperv()) efi_verify_staging_size(&nr_pages); - /* - * The staging area must reside in the the first 1GB physical - * memory: see elf64_exec() in - * boot/efi/loader/arch/amd64/elf64_freebsd.c. - */ - staging = 1024*1024*1024; - status = BS->AllocatePages(AllocateMaxAddress, EfiLoaderData, - nr_pages, &staging); -#else - status = BS->AllocatePages(AllocateAnyPages, EfiLoaderData, - nr_pages, &staging); + staging = get_staging_max(); #endif + status = BS->AllocatePages(EFI_ALLOC_METHOD, EfiLoaderData, + nr_pages, &staging); if (EFI_ERROR(status)) { printf("failed to allocate staging area: %lu\n", EFI_ERROR_CODE(status)); @@ -223,7 +338,7 @@ efi_copy_init(void) staging_base = staging; staging_end = staging + nr_pages * EFI_PAGE_SIZE; -#if defined(__aarch64__) || defined(__arm__) || defined(__riscv) +#if EFI_STAGING_2M_ALIGN /* * Round the kernel load address to a 2MiB value. This is needed * because the kernel builds a page table based on where it has @@ -231,7 +346,7 @@ efi_copy_init(void) * either a 1MiB or 2MiB page for this we need to make sure it * is correctly aligned for both cases. */ - staging = roundup2(staging, 2 * 1024 * 1024); + staging = roundup2(staging, M(2)); #endif return (0); @@ -240,20 +355,42 @@ efi_copy_init(void) static bool efi_check_space(vm_offset_t end) { - EFI_PHYSICAL_ADDRESS addr; + EFI_PHYSICAL_ADDRESS addr, new_base, new_staging; EFI_STATUS status; unsigned long nr_pages; + end = roundup2(end, EFI_PAGE_SIZE); + /* There is already enough space */ - if (end <= staging_end) + if (end + staging_slop <= staging_end) return (true); - end = roundup2(end, EFI_PAGE_SIZE); - nr_pages = EFI_SIZE_TO_PAGES(end - staging_end); + if (boot_services_gone) { + if (end <= staging_end) + return (true); + panic("efi_check_space: cannot expand staging area " + "after boot services were exited\n"); + } + + /* + * Add slop at the end: + * 1. amd64 kernel expects to do some very early allocations + * by carving out memory after kernend. Slop guarantees + * that it does not ovewrite anything useful. + * 2. It seems that initial calculation of the staging size + * could be somewhat smaller than actually copying in after + * boot services are exited. Slop avoids calling + * BS->AllocatePages() when it cannot work. + */ + end += staging_slop; + nr_pages = EFI_SIZE_TO_PAGES(end - staging_end); #if defined(__i386__) || defined(__amd64__) - /* X86 needs all memory to be allocated under the 1G boundary */ - if (end > 1024*1024*1024) + /* + * i386 needs all memory to be allocated under the 1G boundary. + * amd64 needs all memory to be allocated under the 1G or 4G boundary. + */ + if (end > get_staging_max()) goto before_staging; #endif @@ -268,14 +405,12 @@ efi_check_space(vm_offset_t end) before_staging: /* Try allocating space before the previous allocation */ - if (staging < nr_pages * EFI_PAGE_SIZE) { - printf("Not enough space before allocation\n"); - return (false); - } + if (staging < nr_pages * EFI_PAGE_SIZE) + goto expand; addr = staging - nr_pages * EFI_PAGE_SIZE; -#if defined(__aarch64__) || defined(__arm__) || defined(__riscv) +#if EFI_STAGING_2M_ALIGN /* See efi_copy_init for why this is needed */ - addr = rounddown2(addr, 2 * 1024 * 1024); + addr = rounddown2(addr, M(2)); #endif nr_pages = EFI_SIZE_TO_PAGES(staging_base - addr); status = BS->AllocatePages(AllocateAddress, EfiLoaderData, nr_pages, @@ -288,11 +423,42 @@ before_staging: staging_base = addr; memmove((void *)(uintptr_t)staging_base, (void *)(uintptr_t)staging, staging_end - staging); - stage_offset -= (staging - staging_base); + stage_offset -= staging - staging_base; staging = staging_base; return (true); } +expand: + nr_pages = EFI_SIZE_TO_PAGES(end - (vm_offset_t)staging); +#if EFI_STAGING_2M_ALIGN + nr_pages += M(2) / EFI_PAGE_SIZE; +#endif +#if defined(__i386__) || defined(__amd64__) + new_base = get_staging_max(); +#endif + status = BS->AllocatePages(EFI_ALLOC_METHOD, EfiLoaderData, + nr_pages, &new_base); + if (!EFI_ERROR(status)) { +#if EFI_STAGING_2M_ALIGN + new_staging = roundup2(new_base, M(2)); +#else + new_staging = new_base; +#endif + /* + * Move the old allocation and update the state so + * translation still works. + */ + memcpy((void *)(uintptr_t)new_staging, + (void *)(uintptr_t)staging, staging_end - staging); + BS->FreePages(staging_base, (staging_end - staging_base) / + EFI_PAGE_SIZE); + stage_offset -= staging - new_staging; + staging = new_staging; + staging_end = new_base + nr_pages * EFI_PAGE_SIZE; + staging_base = new_base; + return (true); + } + printf("efi_check_space: Unable to expand staging area\n"); return (false); } @@ -335,7 +501,6 @@ efi_copyout(const vm_offset_t src, void *dest, const size_t len) return (len); } - ssize_t efi_readin(readin_handle_t fd, vm_offset_t dest, const size_t len) { @@ -364,3 +529,8 @@ efi_copy_finish(void) while (src < last) *dst++ = *src++; } + +void +efi_copy_finish_nop(void) +{ +} diff --git a/stand/efi/loader/loader_efi.h b/stand/efi/loader/loader_efi.h index 4d077514e423..8254d16b1592 100644 --- a/stand/efi/loader/loader_efi.h +++ b/stand/efi/loader/loader_efi.h @@ -34,6 +34,15 @@ #include <stand.h> #include <readin.h> +#ifdef __amd64__ +enum { + COPY_STAGING_ENABLE, + COPY_STAGING_DISABLE, + COPY_STAGING_AUTO, +}; +extern int copy_staging; +#endif + int efi_autoload(void); int efi_copy_init(void); @@ -44,5 +53,6 @@ ssize_t efi_readin(readin_handle_t fd, vm_offset_t dest, const size_t len); void * efi_translate(vm_offset_t ptr); void efi_copy_finish(void); +void efi_copy_finish_nop(void); #endif /* _LOADER_EFI_COPY_H_ */
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?202108081953.178Jr3Av034565>