andling of 1GB largepage mappings List-Id: Commits to the stable branches of the FreeBSD src repository List-Archive: https://lists.freebsd.org/archives/dev-commits-src-branches List-Help: List-Post: List-Subscribe: List-Unsubscribe: X-BeenThere: dev-commits-src-branches@freebsd.org Sender: owner-dev-commits-src-branches@FreeBSD.org MIME-Version: 1.0 Content-Type: text/plain; charset=utf-8 Content-Transfer-Encoding: 8bit X-Git-Committer: markj X-Git-Repository: src X-Git-Refname: refs/heads/stable/15 X-Git-Reftype: branch X-Git-Commit: 9331e62e8b80b3047470ede2845664d89583302a Auto-Submitted: auto-generated Date: Tue, 21 Apr 2026 15:43:03 +0000 Message-Id: <69e79b07.344d5.39410391@gitrepo.freebsd.org> The branch stable/15 has been updated by markj: URL: https://cgit.FreeBSD.org/src/commit/?id=9331e62e8b80b3047470ede2845664d89583302a commit 9331e62e8b80b3047470ede2845664d89583302a Author: Mark Johnston AuthorDate: 2026-03-31 13:37:43 +0000 Commit: Mark Johnston CommitDate: 2026-04-21 15:42:41 +0000 pkru: Fix handling of 1GB largepage mappings pmap_pkru_update_range() did not handle the case where a PDPE has PG_PS set. More generally, the SET_PKRU and CLEAR_PKRU sysarch implementations did not check whether the request covers a "boundary" vm map entry. Fix this, add the missing PG_PS test, and add some tests. Approved by: so Security: FreeBSD-SA-26:11.amd64 Security: CVE-2026-6386 Reported by: Nicholas Carlini Reviewed by: kib, alc Differential Revision: https://reviews.freebsd.org/D56184 --- lib/libsys/x86/pkru.3 | 3 + sys/amd64/amd64/pmap.c | 20 +++- sys/amd64/amd64/sys_machdep.c | 43 +++++++-- sys/vm/vm_map.c | 32 +++++++ sys/vm/vm_map.h | 1 + tests/sys/posixshm/posixshm_test.c | 187 +++++++++++++++++++++++++++++++++++++ 6 files changed, 274 insertions(+), 12 deletions(-) diff --git a/lib/libsys/x86/pkru.3 b/lib/libsys/x86/pkru.3 index 95bc66c979ac..033dc07c4b06 100644 --- a/lib/libsys/x86/pkru.3 +++ b/lib/libsys/x86/pkru.3 @@ -179,6 +179,9 @@ The supplied argument for .Fn x86_pkru_protect_range has reserved bits set. +.It Bq Er EINVAL +The range of the request partially covers a mapping of an object created by +.Xr shm_create_largepage 3 . .It Bq Er EFAULT The supplied address range does not completely fit into the user-managed address range. diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c index 956e9c5e78d2..497c85c3f0c2 100644 --- a/sys/amd64/amd64/pmap.c +++ b/sys/amd64/amd64/pmap.c @@ -11551,7 +11551,7 @@ pmap_pkru_update_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, u_int keyidx) { pml4_entry_t *pml4e; - pdp_entry_t *pdpe; + pdp_entry_t newpdpe, *pdpe; pd_entry_t newpde, ptpaddr, *pde; pt_entry_t newpte, *ptep, pte; vm_offset_t va, va_next; @@ -11577,6 +11577,22 @@ pmap_pkru_update_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, va_next = eva; continue; } + if ((*pdpe & PG_PS) != 0) { + va_next = (va + NBPDP) & ~PDPMASK; + if (va_next < va) + va_next = eva; + KASSERT(va_next <= eva, + ("partial update of non-transparent 1G mapping " + "pdpe %#lx va %#lx eva %#lx va_next %#lx", + *pdpe, va, eva, va_next)); + newpdpe = (*pdpe & ~X86_PG_PKU_MASK) | + X86_PG_PKU(keyidx); + if (newpdpe != *pdpe) { + *pdpe = newpdpe; + changed = true; + } + continue; + } va_next = (va + NBPDR) & ~PDRMASK; if (va_next < va) @@ -11629,8 +11645,6 @@ pmap_pkru_check_uargs(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, if (pmap->pm_type != PT_X86 || keyidx > PMAP_MAX_PKRU_IDX || (flags & ~(AMD64_PKRU_PERSIST | AMD64_PKRU_EXCL)) != 0) return (EINVAL); - if (eva <= sva || eva > VM_MAXUSER_ADDRESS) - return (EFAULT); if ((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0) return (ENOTSUP); return (0); diff --git a/sys/amd64/amd64/sys_machdep.c b/sys/amd64/amd64/sys_machdep.c index 51f55687bbcf..1df73a25c05e 100644 --- a/sys/amd64/amd64/sys_machdep.c +++ b/sys/amd64/amd64/sys_machdep.c @@ -30,7 +30,6 @@ * SUCH DAMAGE. */ -#include #include "opt_capsicum.h" #include "opt_ktrace.h" @@ -369,32 +368,58 @@ sysarch(struct thread *td, struct sysarch_args *uap) break; case I386_SET_PKRU: - case AMD64_SET_PKRU: + case AMD64_SET_PKRU: { + vm_offset_t addr, start, end; + vm_size_t len; + + addr = (uintptr_t)a64pkru.addr; + len = a64pkru.len; + /* * Read-lock the map to synchronize with parallel * pmap_vmspace_copy() on fork. */ map = &td->td_proc->p_vmspace->vm_map; vm_map_lock_read(map); - error = pmap_pkru_set(PCPU_GET(curpmap), - (vm_offset_t)a64pkru.addr, (vm_offset_t)a64pkru.addr + - a64pkru.len, a64pkru.keyidx, a64pkru.flags); + if (len == 0 || !vm_map_check_boundary(map, addr, addr + len)) { + vm_map_unlock_read(map); + error = EINVAL; + break; + } + start = trunc_page(addr); + end = round_page(addr + len); + error = pmap_pkru_set(PCPU_GET(curpmap), start, end, + a64pkru.keyidx, a64pkru.flags); vm_map_unlock_read(map); break; + } case I386_CLEAR_PKRU: - case AMD64_CLEAR_PKRU: + case AMD64_CLEAR_PKRU: { + vm_offset_t addr, start, end; + vm_size_t len; + if (a64pkru.flags != 0 || a64pkru.keyidx != 0) { error = EINVAL; break; } + + addr = (uintptr_t)a64pkru.addr; + len = a64pkru.len; + map = &td->td_proc->p_vmspace->vm_map; vm_map_lock_read(map); - error = pmap_pkru_clear(PCPU_GET(curpmap), - (vm_offset_t)a64pkru.addr, - (vm_offset_t)a64pkru.addr + a64pkru.len); + if (len == 0 || !vm_map_check_boundary(map, addr, addr + len)) { + vm_map_unlock_read(map); + error = EINVAL; + break; + } + start = trunc_page(addr); + end = round_page(addr + len); + error = pmap_pkru_clear(PCPU_GET(curpmap), start, end); vm_map_unlock_read(map); break; + } case AMD64_DISABLE_TLSBASE: clear_pcb_flags(pcb, PCB_TLSBASE); diff --git a/sys/vm/vm_map.c b/sys/vm/vm_map.c index b8295bb2108d..63bdce9d60f8 100644 --- a/sys/vm/vm_map.c +++ b/sys/vm/vm_map.c @@ -4162,6 +4162,38 @@ vm_map_check_protection(vm_map_t map, vm_offset_t start, vm_offset_t end, return (TRUE); } +/* + * Check whether the specified range partially overlaps a map entry with + * fixed boundaries, and return false if so. + * + * The map must be locked. + */ +bool +vm_map_check_boundary(vm_map_t map, vm_offset_t start, vm_offset_t end) +{ + vm_map_entry_t entry; + int bdry_idx; + + if (!vm_map_range_valid(map, start, end)) + return (false); + if (start == end) + return (true); + + if (vm_map_lookup_entry(map, start, &entry)) { + bdry_idx = MAP_ENTRY_SPLIT_BOUNDARY_INDEX(entry); + if (bdry_idx != 0 && + (start & (pagesizes[bdry_idx] - 1)) != 0) + return (false); + } + if (vm_map_lookup_entry(map, end - 1, &entry)) { + bdry_idx = MAP_ENTRY_SPLIT_BOUNDARY_INDEX(entry); + if (bdry_idx != 0 && + (end & (pagesizes[bdry_idx] - 1)) != 0) + return (false); + } + return (true); +} + /* * * vm_map_copy_swap_object: diff --git a/sys/vm/vm_map.h b/sys/vm/vm_map.h index 6af3dba42685..0b0edb24a64d 100644 --- a/sys/vm/vm_map.h +++ b/sys/vm/vm_map.h @@ -479,6 +479,7 @@ vm_map_entry_read_succ(void *token, struct vm_map_entry *const clone, #endif /* ! _KERNEL */ #ifdef _KERNEL +bool vm_map_check_boundary(vm_map_t, vm_offset_t, vm_offset_t); boolean_t vm_map_check_protection (vm_map_t, vm_offset_t, vm_offset_t, vm_prot_t); int vm_map_delete(vm_map_t, vm_offset_t, vm_offset_t); int vm_map_find(vm_map_t, vm_object_t, vm_ooffset_t, vm_offset_t *, vm_size_t, diff --git a/tests/sys/posixshm/posixshm_test.c b/tests/sys/posixshm/posixshm_test.c index 680a443b6eac..8333faa90594 100644 --- a/tests/sys/posixshm/posixshm_test.c +++ b/tests/sys/posixshm/posixshm_test.c @@ -38,10 +38,17 @@ #include #include +#ifdef __amd64__ +#include +#endif + #include #include #include +#include +#include #include +#include #include #include #include @@ -1889,6 +1896,183 @@ ATF_TC_BODY(largepage_pipe, tc) } } +#ifdef __amd64__ +static sigjmp_buf jmpbuf; +static _Atomic(void *) faultaddr; +static _Atomic(int) faultsig; + +#define KEY_RW 1 +#define KEY_RO 2 +#define KEY_WO 3 +#define KEY_NO 4 +#define VAL 0xdeadfacec0debeef +static void +set_keys(void) +{ + int error; + + error = x86_pkru_set_perm(KEY_RW, 1, 1); + ATF_REQUIRE(error == 0); + error = x86_pkru_set_perm(KEY_RO, 1, 0); + ATF_REQUIRE(error == 0); + error = x86_pkru_set_perm(KEY_WO, 0, 1); + ATF_REQUIRE(error == 0); + error = x86_pkru_set_perm(KEY_NO, 0, 0); + ATF_REQUIRE(error == 0); +} + +static void +sigsegv(int sig, siginfo_t *si, void *uc __unused) +{ + faultsig = sig; + faultaddr = si->si_addr; + siglongjmp(jmpbuf, 1); +} + +static bool +try_read(volatile uint64_t *p, uint64_t *outp) +{ + if (sigsetjmp(jmpbuf, 1) == 0) { + *outp = *p; + return (true); + } else { + atomic_signal_fence(memory_order_relaxed); + ATF_REQUIRE(faultsig == SIGSEGV); + ATF_REQUIRE(faultaddr == p); + set_keys(); /* PKRU is not restored by siglongjmp? */ + return (false); + } +} + +static bool +try_write(volatile uint64_t *p, uint64_t val) +{ + if (sigsetjmp(jmpbuf, 1) == 0) { + *p = val; + return (true); + } else { + atomic_signal_fence(memory_order_relaxed); + ATF_REQUIRE(faultsig == SIGSEGV); + ATF_REQUIRE(faultaddr == p); + set_keys(); /* PKRU is not restored by siglongjmp? */ + return (false); + } +} + +ATF_TC_WITHOUT_HEAD(largepage_pkru); +ATF_TC_BODY(largepage_pkru, tc) +{ + size_t ps[MAXPAGESIZES]; + struct sigaction sa; + char *addr, *addr1; + int error, fd, pscnt; + + memset(&sa, 0, sizeof(sa)); + sa.sa_sigaction = sigsegv; + sa.sa_flags = SA_SIGINFO; + sigemptyset(&sa.sa_mask); + error = sigaction(SIGSEGV, &sa, NULL); + ATF_REQUIRE(error == 0); + + pscnt = pagesizes(ps, true); + + for (int i = 1; i < pscnt; i++) { + uint64_t val; + + fd = shm_open_large(i, SHM_LARGEPAGE_ALLOC_DEFAULT, ps[i]); + addr = mmap(NULL, ps[i], PROT_READ | PROT_WRITE, MAP_SHARED, fd, + 0); + ATF_REQUIRE_MSG(addr != MAP_FAILED, + "mmap(%zu bytes) failed; error=%d", ps[i], errno); + + /* + * Ensure that the page is faulted into the pmap. + */ + memset(addr, 0, ps[i]); + + set_keys(); + + /* + * Make sure we can't partially cover a largepage mapping. + */ + error = x86_pkru_protect_range(addr, PAGE_SIZE, KEY_RW, 0); + ATF_REQUIRE_ERRNO(EINVAL, error != 0); + error = x86_pkru_protect_range(addr, ps[i] - PAGE_SIZE, KEY_RW, + 0); + ATF_REQUIRE_ERRNO(EINVAL, error != 0); + error = x86_pkru_protect_range(addr + PAGE_SIZE, ps[i] - PAGE_SIZE, + KEY_RW, 0); + ATF_REQUIRE_ERRNO(EINVAL, error != 0); + error = x86_pkru_protect_range(addr + 1, ps[i], KEY_RW, 0); + ATF_REQUIRE_ERRNO(EINVAL, error != 0); + + /* + * Make sure that protections are honoured. + */ + for (int j = 1; j <= 4; j++) { + volatile uint64_t *addr64; + + error = x86_pkru_protect_range(addr, ps[i], 0, 0); + ATF_REQUIRE(error == 0); + + addr64 = (volatile uint64_t *)(void *)addr; + *addr64 = VAL; + + error = x86_pkru_protect_range(addr, ps[i], j, 0); + ATF_REQUIRE(error == 0); + switch (j) { + case KEY_RW: + ATF_REQUIRE(try_write(addr64, VAL)); + ATF_REQUIRE(try_read(addr64, &val)); + ATF_REQUIRE(val == VAL); + break; + case KEY_RO: + ATF_REQUIRE(try_read(addr64, &val)); + ATF_REQUIRE(val == VAL); + ATF_REQUIRE(!try_write(addr64, VAL)); + break; + case KEY_WO: + /* !access implies !modify */ + case KEY_NO: + ATF_REQUIRE(!try_read(addr64, &val)); + ATF_REQUIRE(!try_write(addr64, VAL)); + break; + default: + __unreachable(); + } + } + error = munmap(addr, ps[i]); + ATF_CHECK(error == 0); + + /* + * Try mapping a large page in a region partially covered by a + * key. + * + * Rather than detecting the mismatch when the logical mapping + * is created, we currently only fail once pmap_enter() is + * called from the fault handler. This is not ideal and might + * be improved in the future. + */ + error = x86_pkru_protect_range(addr, ps[i], 0, 0); + ATF_REQUIRE(error == 0); + error = x86_pkru_protect_range(addr + PAGE_SIZE, + ps[i] - PAGE_SIZE, KEY_RW, 0); + ATF_REQUIRE(error == 0); + + addr1 = mmap(addr, ps[i], PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_FIXED, fd, 0); + ATF_REQUIRE(addr1 != MAP_FAILED); + ATF_REQUIRE(addr == addr1); + ATF_REQUIRE(!try_read((volatile uint64_t *)(void *)addr, &val)); + ATF_REQUIRE(!try_write((volatile uint64_t *)(void *)addr, VAL)); + } +} +#undef KEY_RW +#undef KEY_RO +#undef KEY_WO +#undef KEY_NO +#endif + ATF_TC_WITHOUT_HEAD(largepage_reopen); ATF_TC_BODY(largepage_reopen, tc) { @@ -1979,6 +2163,9 @@ ATF_TP_ADD_TCS(tp) ATF_TP_ADD_TC(tp, largepage_mprotect); ATF_TP_ADD_TC(tp, largepage_minherit); ATF_TP_ADD_TC(tp, largepage_pipe); +#ifdef __amd64__ + ATF_TP_ADD_TC(tp, largepage_pkru); +#endif ATF_TP_ADD_TC(tp, largepage_reopen); return (atf_no_error());