Skip site navigation (1)Skip section navigation (2)
Date:      Wed, 20 Feb 2019 09:51:13 +0000 (UTC)
From:      Konstantin Belousov <kib@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject:   svn commit: r344353 - in head/sys: amd64/amd64 amd64/include arm/include arm64/include i386/include mips/include powerpc/include riscv/include sparc64/include vm x86/include
Message-ID:  <201902200951.x1K9pDQs001745@repo.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: kib
Date: Wed Feb 20 09:51:13 2019
New Revision: 344353
URL: https://svnweb.freebsd.org/changeset/base/344353

Log:
  Add kernel support for Intel userspace protection keys feature on
  Skylake Xeons.
  
  See SDM rev. 68 Vol 3 4.6.2 Protection Keys and the description of the
  RDPKRU and WRPKRU instructions.
  
  Reviewed by:	markj
  Tested by:	pho
  Sponsored by:	The FreeBSD Foundation
  MFC after:	2 weeks
  Differential revision:	https://reviews.freebsd.org/D18893

Modified:
  head/sys/amd64/amd64/initcpu.c
  head/sys/amd64/amd64/pmap.c
  head/sys/amd64/amd64/sys_machdep.c
  head/sys/amd64/amd64/trap.c
  head/sys/amd64/include/pmap.h
  head/sys/arm/include/pmap.h
  head/sys/arm64/include/pmap.h
  head/sys/i386/include/pmap.h
  head/sys/mips/include/pmap.h
  head/sys/powerpc/include/pmap.h
  head/sys/riscv/include/pmap.h
  head/sys/sparc64/include/pmap.h
  head/sys/vm/vm_fault.c
  head/sys/vm/vm_map.c
  head/sys/x86/include/sysarch.h

Modified: head/sys/amd64/amd64/initcpu.c
==============================================================================
--- head/sys/amd64/amd64/initcpu.c	Wed Feb 20 09:46:44 2019	(r344352)
+++ head/sys/amd64/amd64/initcpu.c	Wed Feb 20 09:51:13 2019	(r344353)
@@ -233,6 +233,9 @@ initializecpu(void)
 	if (cpu_stdext_feature & CPUID_STDEXT_FSGSBASE)
 		cr4 |= CR4_FSGSBASE;
 
+	if (cpu_stdext_feature2 & CPUID_STDEXT2_PKU)
+		cr4 |= CR4_PKE;
+
 	/*
 	 * Postpone enabling the SMEP on the boot CPU until the page
 	 * tables are switched from the boot loader identity mapping

Modified: head/sys/amd64/amd64/pmap.c
==============================================================================
--- head/sys/amd64/amd64/pmap.c	Wed Feb 20 09:46:44 2019	(r344352)
+++ head/sys/amd64/amd64/pmap.c	Wed Feb 20 09:51:13 2019	(r344353)
@@ -48,7 +48,7 @@
  */
 /*-
  * Copyright (c) 2003 Networks Associates Technology, Inc.
- * Copyright (c) 2014-2018 The FreeBSD Foundation
+ * Copyright (c) 2014-2019 The FreeBSD Foundation
  * All rights reserved.
  *
  * This software was developed for the FreeBSD Project by Jake Burkholder,
@@ -121,6 +121,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/mman.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
+#include <sys/rangeset.h>
 #include <sys/rwlock.h>
 #include <sys/sx.h>
 #include <sys/turnstile.h>
@@ -155,6 +156,7 @@ __FBSDID("$FreeBSD$");
 #ifdef SMP
 #include <machine/smp.h>
 #endif
+#include <machine/sysarch.h>
 #include <machine/tss.h>
 
 static __inline boolean_t
@@ -285,6 +287,13 @@ pmap_modified_bit(pmap_t pmap)
 	return (mask);
 }
 
+static __inline pt_entry_t
+pmap_pku_mask_bit(pmap_t pmap)
+{
+
+	return (pmap->pm_type == PT_X86 ? X86_PG_PKU_MASK : 0);
+}
+
 #if !defined(DIAGNOSTIC)
 #ifdef __GNUC_GNU_INLINE__
 #define PMAP_INLINE	__attribute__((__gnu_inline__)) inline
@@ -424,6 +433,22 @@ static pml4_entry_t *pti_pml4;
 static vm_pindex_t pti_pg_idx;
 static bool pti_finalized;
 
+struct pmap_pkru_range {
+	struct rs_el	pkru_rs_el;
+	u_int		pkru_keyidx;
+	int		pkru_flags;
+};
+
+static uma_zone_t pmap_pkru_ranges_zone;
+static bool pmap_pkru_same(pmap_t pmap, vm_offset_t sva, vm_offset_t eva);
+static pt_entry_t pmap_pkru_get(pmap_t pmap, vm_offset_t va);
+static void pmap_pkru_on_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva);
+static void *pkru_dup_range(void *ctx, void *data);
+static void pkru_free_range(void *ctx, void *node);
+static int pmap_pkru_copy(pmap_t dst_pmap, pmap_t src_pmap);
+static int pmap_pkru_deassign(pmap_t pmap, vm_offset_t sva, vm_offset_t eva);
+static void pmap_pkru_deassign_all(pmap_t pmap);
+
 static int
 pmap_pcid_save_cnt_proc(SYSCTL_HANDLER_ARGS)
 {
@@ -2846,6 +2871,12 @@ pmap_pinit0(pmap_t pmap)
 		pmap->pm_pcids[i].pm_gen = 1;
 	}
 	pmap_activate_boot(pmap);
+
+	if ((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) {
+		pmap_pkru_ranges_zone = uma_zcreate("pkru ranges",
+		    sizeof(struct pmap_pkru_range), NULL, NULL, NULL, NULL,
+		    UMA_ALIGN_PTR, 0);
+	}
 }
 
 void
@@ -2934,6 +2965,10 @@ pmap_pinit_type(pmap_t pmap, enum pmap_type pm_type, i
 			pmap_pinit_pml4_pti(pml4pgu);
 			pmap->pm_ucr3 = VM_PAGE_TO_PHYS(pml4pgu);
 		}
+		if ((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) {
+			rangeset_init(&pmap->pm_pkru, pkru_dup_range,
+			    pkru_free_range, pmap, M_NOWAIT);
+		}
 	}
 
 	pmap->pm_root.rt_root = 0;
@@ -3230,6 +3265,9 @@ pmap_release(pmap_t pmap)
 		vm_page_unwire_noq(m);
 		vm_page_free(m);
 	}
+	if (pmap->pm_type == PT_X86 &&
+	    (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0)
+		rangeset_fini(&pmap->pm_pkru);
 }
 
 static int
@@ -4060,7 +4098,7 @@ pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde, v
 {
 	pd_entry_t newpde, oldpde;
 	pt_entry_t *firstpte, newpte;
-	pt_entry_t PG_A, PG_G, PG_M, PG_RW, PG_V;
+	pt_entry_t PG_A, PG_G, PG_M, PG_PKU_MASK, PG_RW, PG_V;
 	vm_paddr_t mptepa;
 	vm_page_t mpte;
 	struct spglist free;
@@ -4073,6 +4111,7 @@ pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde, v
 	PG_RW = pmap_rw_bit(pmap);
 	PG_V = pmap_valid_bit(pmap);
 	PG_PTE_CACHE = pmap_cache_mask(pmap, 0);
+	PG_PKU_MASK = pmap_pku_mask_bit(pmap);
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	oldpde = *pde;
@@ -4505,6 +4544,7 @@ pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t 
 out:
 	if (anyvalid)
 		pmap_invalidate_all(pmap);
+	pmap_pkru_on_remove(pmap, sva, eva);
 	PMAP_UNLOCK(pmap);
 	pmap_delayed_invl_finished();
 	vm_page_free_pages_toq(&free, true);
@@ -4816,7 +4856,7 @@ pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offs
 {
 	pd_entry_t newpde;
 	pt_entry_t *firstpte, oldpte, pa, *pte;
-	pt_entry_t PG_G, PG_A, PG_M, PG_RW, PG_V;
+	pt_entry_t PG_G, PG_A, PG_M, PG_RW, PG_V, PG_PKU_MASK;
 	vm_page_t mpte;
 	int PG_PTE_CACHE;
 
@@ -4825,6 +4865,7 @@ pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offs
 	PG_M = pmap_modified_bit(pmap);
 	PG_V = pmap_valid_bit(pmap);
 	PG_RW = pmap_rw_bit(pmap);
+	PG_PKU_MASK = pmap_pku_mask_bit(pmap);
 	PG_PTE_CACHE = pmap_cache_mask(pmap, 0);
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
@@ -5052,6 +5093,8 @@ retry:
 
 	origpte = *pte;
 	pv = NULL;
+	if (va < VM_MAXUSER_ADDRESS && pmap->pm_type == PT_X86)
+		newpte |= pmap_pkru_get(pmap, va);
 
 	/*
 	 * Is the specified virtual address already mapped?
@@ -5271,6 +5314,25 @@ pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t
 		    " in pmap %p", va, pmap);
 		return (KERN_RESOURCE_SHORTAGE);
 	}
+
+	/*
+	 * If pkru is not same for the whole pde range, return failure
+	 * and let vm_fault() cope.  Check after pde allocation, since
+	 * it could sleep.
+	 */
+	if (!pmap_pkru_same(pmap, va, va + NBPDR)) {
+		SLIST_INIT(&free);
+		if (pmap_unwire_ptp(pmap, va, pdpg, &free)) {
+			pmap_invalidate_page(pmap, va);
+			vm_page_free_pages_toq(&free, true);
+		}
+		return (KERN_FAILURE);
+	}
+	if (va < VM_MAXUSER_ADDRESS && pmap->pm_type == PT_X86) {
+		newpde &= ~X86_PG_PKU_MASK;
+		newpde |= pmap_pkru_get(pmap, va);
+	}
+
 	pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg));
 	pde = &pde[pmap_pde_index(va)];
 	oldpde = *pde;
@@ -5530,7 +5592,7 @@ pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, v
 	if ((prot & VM_PROT_EXECUTE) == 0)
 		newpte |= pg_nx;
 	if (va < VM_MAXUSER_ADDRESS)
-		newpte |= PG_U;
+		newpte |= PG_U | pmap_pkru_get(pmap, va);
 	pte_store(pte, newpte);
 	return (mpte);
 }
@@ -5906,6 +5968,36 @@ out:
 	PMAP_UNLOCK(dst_pmap);
 }
 
+int
+pmap_vmspace_copy(pmap_t dst_pmap, pmap_t src_pmap)
+{
+	int error;
+
+	if (dst_pmap->pm_type != src_pmap->pm_type ||
+	    dst_pmap->pm_type != PT_X86 ||
+	    (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0)
+		return (0);
+	for (;;) {
+		if (dst_pmap < src_pmap) {
+			PMAP_LOCK(dst_pmap);
+			PMAP_LOCK(src_pmap);
+		} else {
+			PMAP_LOCK(src_pmap);
+			PMAP_LOCK(dst_pmap);
+		}
+		error = pmap_pkru_copy(dst_pmap, src_pmap);
+		/* Clean up partial copy on failure due to no memory. */
+		if (error == ENOMEM)
+			pmap_pkru_deassign_all(dst_pmap);
+		PMAP_UNLOCK(src_pmap);
+		PMAP_UNLOCK(dst_pmap);
+		if (error != ENOMEM)
+			break;
+		vm_wait(NULL);
+	}
+	return (error);
+}
+
 /*
  * Zero the specified hardware page.
  */
@@ -6305,6 +6397,7 @@ pmap_remove_pages(pmap_t pmap)
 	if (lock != NULL)
 		rw_wunlock(lock);
 	pmap_invalidate_all(pmap);
+	pmap_pkru_deassign_all(pmap);
 	PMAP_UNLOCK(pmap);
 	vm_page_free_pages_toq(&free, true);
 }
@@ -8939,6 +9032,285 @@ pmap_pti_remove_kva(vm_offset_t sva, vm_offset_t eva)
 	}
 	pmap_invalidate_range(kernel_pmap, sva, eva);
 	VM_OBJECT_WUNLOCK(pti_obj);
+}
+
+static void *
+pkru_dup_range(void *ctx __unused, void *data)
+{
+	struct pmap_pkru_range *node, *new_node;
+
+	new_node = uma_zalloc(pmap_pkru_ranges_zone, M_NOWAIT);
+	if (new_node == NULL)
+		return (NULL);
+	node = data;
+	memcpy(new_node, node, sizeof(*node));
+	return (new_node);
+}
+
+static void
+pkru_free_range(void *ctx __unused, void *node)
+{
+
+	uma_zfree(pmap_pkru_ranges_zone, node);
+}
+
+static int
+pmap_pkru_assign(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, u_int keyidx,
+    int flags)
+{
+	struct pmap_pkru_range *ppr;
+	int error;
+
+	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+	MPASS(pmap->pm_type == PT_X86);
+	MPASS((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0);
+	if ((flags & AMD64_PKRU_EXCL) != 0 &&
+	    !rangeset_check_empty(&pmap->pm_pkru, sva, eva))
+		return (EBUSY);
+	ppr = uma_zalloc(pmap_pkru_ranges_zone, M_NOWAIT);
+	if (ppr == NULL)
+		return (ENOMEM);
+	ppr->pkru_keyidx = keyidx;
+	ppr->pkru_flags = flags & AMD64_PKRU_PERSIST;
+	error = rangeset_insert(&pmap->pm_pkru, sva, eva, ppr);
+	if (error != 0)
+		uma_zfree(pmap_pkru_ranges_zone, ppr);
+	return (error);
+}
+
+static int
+pmap_pkru_deassign(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
+{
+
+	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+	MPASS(pmap->pm_type == PT_X86);
+	MPASS((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0);
+	return (rangeset_remove(&pmap->pm_pkru, sva, eva));
+}
+
+static void
+pmap_pkru_deassign_all(pmap_t pmap)
+{
+
+	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+	if (pmap->pm_type == PT_X86 &&
+	    (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0)
+		rangeset_remove_all(&pmap->pm_pkru);
+}
+
+static bool
+pmap_pkru_same(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
+{
+	struct pmap_pkru_range *ppr, *prev_ppr;
+	vm_offset_t va;
+
+	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+	if (pmap->pm_type != PT_X86 ||
+	    (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0 ||
+	    sva >= VM_MAXUSER_ADDRESS)
+		return (true);
+	MPASS(eva <= VM_MAXUSER_ADDRESS);
+	for (va = sva, prev_ppr = NULL; va < eva;) {
+		ppr = rangeset_lookup(&pmap->pm_pkru, va);
+		if ((ppr == NULL) ^ (prev_ppr == NULL))
+			return (false);
+		if (ppr == NULL) {
+			va += PAGE_SIZE;
+			continue;
+		}
+		if (prev_ppr->pkru_keyidx != ppr->pkru_keyidx)
+			return (false);
+		va = ppr->pkru_rs_el.re_end;
+	}
+	return (true);
+}
+
+static pt_entry_t
+pmap_pkru_get(pmap_t pmap, vm_offset_t va)
+{
+	struct pmap_pkru_range *ppr;
+
+	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+	if (pmap->pm_type != PT_X86 ||
+	    (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0 ||
+	    va >= VM_MAXUSER_ADDRESS)
+		return (0);
+	ppr = rangeset_lookup(&pmap->pm_pkru, va);
+	if (ppr != NULL)
+		return (X86_PG_PKU(ppr->pkru_keyidx));
+	return (0);
+}
+
+static bool
+pred_pkru_on_remove(void *ctx __unused, void *r)
+{
+	struct pmap_pkru_range *ppr;
+
+	ppr = r;
+	return ((ppr->pkru_flags & AMD64_PKRU_PERSIST) == 0);
+}
+
+static void
+pmap_pkru_on_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
+{
+
+	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+	if (pmap->pm_type == PT_X86 &&
+	    (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) {
+		rangeset_remove_pred(&pmap->pm_pkru, sva, eva,
+		    pred_pkru_on_remove);
+	}
+}
+
+static int
+pmap_pkru_copy(pmap_t dst_pmap, pmap_t src_pmap)
+{
+
+	PMAP_LOCK_ASSERT(dst_pmap, MA_OWNED);
+	PMAP_LOCK_ASSERT(src_pmap, MA_OWNED);
+	MPASS(dst_pmap->pm_type == PT_X86);
+	MPASS(src_pmap->pm_type == PT_X86);
+	MPASS((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0);
+	if (src_pmap->pm_pkru.rs_data_ctx == NULL)
+		return (0);
+	return (rangeset_copy(&dst_pmap->pm_pkru, &src_pmap->pm_pkru));
+}
+
+static void
+pmap_pkru_update_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
+    u_int keyidx)
+{
+	pml4_entry_t *pml4e;
+	pdp_entry_t *pdpe;
+	pd_entry_t newpde, ptpaddr, *pde;
+	pt_entry_t newpte, *ptep, pte;
+	vm_offset_t va, va_next;
+	bool changed;
+
+	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+	MPASS(pmap->pm_type == PT_X86);
+	MPASS(keyidx <= PMAP_MAX_PKRU_IDX);
+
+	for (changed = false, va = sva; va < eva; va = va_next) {
+		pml4e = pmap_pml4e(pmap, va);
+		if ((*pml4e & X86_PG_V) == 0) {
+			va_next = (va + NBPML4) & ~PML4MASK;
+			if (va_next < va)
+				va_next = eva;
+			continue;
+		}
+
+		pdpe = pmap_pml4e_to_pdpe(pml4e, va);
+		if ((*pdpe & X86_PG_V) == 0) {
+			va_next = (va + NBPDP) & ~PDPMASK;
+			if (va_next < va)
+				va_next = eva;
+			continue;
+		}
+
+		va_next = (va + NBPDR) & ~PDRMASK;
+		if (va_next < va)
+			va_next = eva;
+
+		pde = pmap_pdpe_to_pde(pdpe, va);
+		ptpaddr = *pde;
+		if (ptpaddr == 0)
+			continue;
+
+		MPASS((ptpaddr & X86_PG_V) != 0);
+		if ((ptpaddr & PG_PS) != 0) {
+			if (va + NBPDR == va_next && eva >= va_next) {
+				newpde = (ptpaddr & ~X86_PG_PKU_MASK) |
+				    X86_PG_PKU(keyidx);
+				if (newpde != ptpaddr) {
+					*pde = newpde;
+					changed = true;
+				}
+				continue;
+			} else if (!pmap_demote_pde(pmap, pde, va)) {
+				continue;
+			}
+		}
+
+		if (va_next > eva)
+			va_next = eva;
+
+		for (ptep = pmap_pde_to_pte(pde, va); va != va_next;
+		    ptep++, va += PAGE_SIZE) {
+			pte = *ptep;
+			if ((pte & X86_PG_V) == 0)
+				continue;
+			newpte = (pte & ~X86_PG_PKU_MASK) | X86_PG_PKU(keyidx);
+			if (newpte != pte) {
+				*ptep = newpte;
+				changed = true;
+			}
+		}
+	}
+	if (changed)
+		pmap_invalidate_range(pmap, sva, eva);
+}
+
+static int
+pmap_pkru_check_uargs(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
+    u_int keyidx, int flags)
+{
+
+	if (pmap->pm_type != PT_X86 || keyidx > PMAP_MAX_PKRU_IDX ||
+	    (flags & ~(AMD64_PKRU_PERSIST | AMD64_PKRU_EXCL)) != 0)
+		return (EINVAL);
+	if (eva <= sva || eva > VM_MAXUSER_ADDRESS)
+		return (EFAULT);
+	if ((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0)
+		return (ENOTSUP);
+	return (0);
+}
+
+int
+pmap_pkru_set(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, u_int keyidx,
+    int flags)
+{
+	int error;
+
+	sva = trunc_page(sva);
+	eva = round_page(eva);
+	error = pmap_pkru_check_uargs(pmap, sva, eva, keyidx, flags);
+	if (error != 0)
+		return (error);
+	for (;;) {
+		PMAP_LOCK(pmap);
+		error = pmap_pkru_assign(pmap, sva, eva, keyidx, flags);
+		if (error == 0)
+			pmap_pkru_update_range(pmap, sva, eva, keyidx);
+		PMAP_UNLOCK(pmap);
+		if (error != ENOMEM)
+			break;
+		vm_wait(NULL);
+	}
+	return (error);
+}
+
+int
+pmap_pkru_clear(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
+{
+	int error;
+
+	sva = trunc_page(sva);
+	eva = round_page(eva);
+	error = pmap_pkru_check_uargs(pmap, sva, eva, 0, 0);
+	if (error != 0)
+		return (error);
+	for (;;) {
+		PMAP_LOCK(pmap);
+		error = pmap_pkru_deassign(pmap, sva, eva);
+		if (error == 0)
+			pmap_pkru_update_range(pmap, sva, eva, 0);
+		PMAP_UNLOCK(pmap);
+		if (error != ENOMEM)
+			break;
+		vm_wait(NULL);
+	}
+	return (error);
 }
 
 #include "opt_ddb.h"

Modified: head/sys/amd64/amd64/sys_machdep.c
==============================================================================
--- head/sys/amd64/amd64/sys_machdep.c	Wed Feb 20 09:46:44 2019	(r344352)
+++ head/sys/amd64/amd64/sys_machdep.c	Wed Feb 20 09:51:13 2019	(r344353)
@@ -44,6 +44,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
+#include <sys/pcpu.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/smp.h>
@@ -53,6 +54,7 @@ __FBSDID("$FreeBSD$");
 #include <vm/vm.h>
 #include <vm/pmap.h>
 #include <vm/vm_kern.h>		/* for kernel_map */
+#include <vm/vm_map.h>
 #include <vm/vm_extern.h>
 
 #include <machine/frame.h>
@@ -170,13 +172,16 @@ update_gdt_fsbase(struct thread *td, uint32_t base)
 int
 sysarch(struct thread *td, struct sysarch_args *uap)
 {
-	int error = 0;
-	struct pcb *pcb = curthread->td_pcb;
+	struct pcb *pcb;
+	struct vm_map *map;
 	uint32_t i386base;
 	uint64_t a64base;
 	struct i386_ioperm_args iargs;
 	struct i386_get_xfpustate i386xfpu;
+	struct i386_set_pkru i386pkru;
 	struct amd64_get_xfpustate a64xfpu;
+	struct amd64_set_pkru a64pkru;
+	int error;
 
 #ifdef CAPABILITY_MODE
 	/*
@@ -194,11 +199,15 @@ sysarch(struct thread *td, struct sysarch_args *uap)
 		case I386_GET_GSBASE:
 		case I386_SET_GSBASE:
 		case I386_GET_XFPUSTATE:
+		case I386_SET_PKRU:
+		case I386_CLEAR_PKRU:
 		case AMD64_GET_FSBASE:
 		case AMD64_SET_FSBASE:
 		case AMD64_GET_GSBASE:
 		case AMD64_SET_GSBASE:
 		case AMD64_GET_XFPUSTATE:
+		case AMD64_SET_PKRU:
+		case AMD64_CLEAR_PKRU:
 			break;
 
 		case I386_SET_IOPERM:
@@ -214,6 +223,10 @@ sysarch(struct thread *td, struct sysarch_args *uap)
 
 	if (uap->op == I386_GET_LDT || uap->op == I386_SET_LDT)
 		return (sysarch_ldt(td, uap, UIO_USERSPACE));
+
+	error = 0;
+	pcb = td->td_pcb;
+
 	/*
 	 * XXXKIB check that the BSM generation code knows to encode
 	 * the op argument.
@@ -233,11 +246,27 @@ sysarch(struct thread *td, struct sysarch_args *uap)
 		a64xfpu.addr = (void *)(uintptr_t)i386xfpu.addr;
 		a64xfpu.len = i386xfpu.len;
 		break;
+	case I386_SET_PKRU:
+	case I386_CLEAR_PKRU:
+		if ((error = copyin(uap->parms, &i386pkru,
+		    sizeof(struct i386_set_pkru))) != 0)
+			return (error);
+		a64pkru.addr = (void *)(uintptr_t)i386pkru.addr;
+		a64pkru.len = i386pkru.len;
+		a64pkru.keyidx = i386pkru.keyidx;
+		a64pkru.flags = i386pkru.flags;
+		break;
 	case AMD64_GET_XFPUSTATE:
 		if ((error = copyin(uap->parms, &a64xfpu,
 		    sizeof(struct amd64_get_xfpustate))) != 0)
 			return (error);
 		break;
+	case AMD64_SET_PKRU:
+	case AMD64_CLEAR_PKRU:
+		if ((error = copyin(uap->parms, &a64pkru,
+		    sizeof(struct amd64_set_pkru))) != 0)
+			return (error);
+		break;
 	default:
 		break;
 	}
@@ -324,6 +353,34 @@ sysarch(struct thread *td, struct sysarch_args *uap)
 		fpugetregs(td);
 		error = copyout((char *)(get_pcb_user_save_td(td) + 1),
 		    a64xfpu.addr, a64xfpu.len);
+		break;
+
+	case I386_SET_PKRU:
+	case AMD64_SET_PKRU:
+		/*
+		 * Read-lock the map to synchronize with parallel
+		 * pmap_vmspace_copy() on fork.
+		 */
+		map = &td->td_proc->p_vmspace->vm_map;
+		vm_map_lock_read(map);
+		error = pmap_pkru_set(PCPU_GET(curpmap),
+		    (vm_offset_t)a64pkru.addr, (vm_offset_t)a64pkru.addr +
+		    a64pkru.len, a64pkru.keyidx, a64pkru.flags);
+		vm_map_unlock_read(map);
+		break;
+
+	case I386_CLEAR_PKRU:
+	case AMD64_CLEAR_PKRU:
+		if (a64pkru.flags != 0 || a64pkru.keyidx != 0) {
+			error = EINVAL;
+			break;
+		}
+		map = &td->td_proc->p_vmspace->vm_map;
+		vm_map_lock_read(map);
+		error = pmap_pkru_clear(PCPU_GET(curpmap),
+		    (vm_offset_t)a64pkru.addr,
+		    (vm_offset_t)a64pkru.addr + a64pkru.len);
+		vm_map_unlock(map);
 		break;
 
 	default:

Modified: head/sys/amd64/amd64/trap.c
==============================================================================
--- head/sys/amd64/amd64/trap.c	Wed Feb 20 09:46:44 2019	(r344352)
+++ head/sys/amd64/amd64/trap.c	Wed Feb 20 09:51:13 2019	(r344353)
@@ -808,6 +808,20 @@ trap_pfault(struct trapframe *frame, int usermode)
 	}
 
 	/*
+	 * User-mode protection key violation (PKU).  May happen
+	 * either from usermode or from kernel if copyin accessed
+	 * key-protected mapping.
+	 */
+	if ((frame->tf_err & PGEX_PK) != 0) {
+		if (eva > VM_MAXUSER_ADDRESS) {
+			trap_fatal(frame, eva);
+			return (-1);
+		}
+		rv = KERN_PROTECTION_FAILURE;
+		goto after_vmfault;
+	}
+
+	/*
 	 * If nx protection of the usermode portion of kernel page
 	 * tables caused trap, panic.
 	 */
@@ -842,6 +856,7 @@ trap_pfault(struct trapframe *frame, int usermode)
 #endif
 		return (0);
 	}
+after_vmfault:
 	if (!usermode) {
 		if (td->td_intr_nesting_level == 0 &&
 		    curpcb->pcb_onfault != NULL) {

Modified: head/sys/amd64/include/pmap.h
==============================================================================
--- head/sys/amd64/include/pmap.h	Wed Feb 20 09:46:44 2019	(r344352)
+++ head/sys/amd64/include/pmap.h	Wed Feb 20 09:51:13 2019	(r344353)
@@ -66,6 +66,7 @@
 #define	X86_PG_AVAIL2	0x400	/*   <	programmers use		*/
 #define	X86_PG_AVAIL3	0x800	/*    \				*/
 #define	X86_PG_PDE_PAT	0x1000	/* PAT	PAT index		*/
+#define	X86_PG_PKU(idx)	((pt_entry_t)idx << 59)
 #define	X86_PG_NX	(1ul<<63) /* No-execute */
 #define	X86_PG_AVAIL(x)	(1ul << (x))
 
@@ -73,6 +74,10 @@
 #define	X86_PG_PDE_CACHE (X86_PG_PDE_PAT | X86_PG_NC_PWT | X86_PG_NC_PCD)
 #define	X86_PG_PTE_CACHE (X86_PG_PTE_PAT | X86_PG_NC_PWT | X86_PG_NC_PCD)
 
+/* Protection keys indexes */
+#define	PMAP_MAX_PKRU_IDX	0xf
+#define	X86_PG_PKU_MASK		X86_PG_PKU(PMAP_MAX_PKRU_IDX)
+
 /*
  * Intel extended page table (EPT) bit definitions.
  */
@@ -120,7 +125,7 @@
  * (PTE) page mappings have identical settings for the following fields:
  */
 #define	PG_PTE_PROMOTE	(PG_NX | PG_MANAGED | PG_W | PG_G | PG_PTE_CACHE | \
-	    PG_M | PG_A | PG_U | PG_RW | PG_V)
+	    PG_M | PG_A | PG_U | PG_RW | PG_V | PG_PKU_MASK)
 
 /*
  * Page Protection Exception bits
@@ -242,6 +247,8 @@
 #include <sys/_cpuset.h>
 #include <sys/_lock.h>
 #include <sys/_mutex.h>
+#include <sys/_pctrie.h>
+#include <sys/_rangeset.h>
 
 #include <vm/_vm_radix.h>
 
@@ -336,6 +343,7 @@ struct pmap {
 	long			pm_eptgen;	/* EPT pmap generation id */
 	int			pm_flags;
 	struct pmap_pcids	pm_pcids[MAXCPU];
+	struct rangeset		pm_pkru;
 };
 
 /* flags */
@@ -454,6 +462,10 @@ void	pmap_pti_pcid_invalidate(uint64_t ucr3, uint64_t 
 void	pmap_pti_pcid_invlpg(uint64_t ucr3, uint64_t kcr3, vm_offset_t va);
 void	pmap_pti_pcid_invlrng(uint64_t ucr3, uint64_t kcr3, vm_offset_t sva,
 	    vm_offset_t eva);
+int	pmap_pkru_clear(pmap_t pmap, vm_offset_t sva, vm_offset_t eva);
+int	pmap_pkru_set(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
+	    u_int keyidx, int flags);
+int	pmap_vmspace_copy(pmap_t dst_pmap, pmap_t src_pmap);
 #endif /* _KERNEL */
 
 /* Return various clipped indexes for a given VA */

Modified: head/sys/arm/include/pmap.h
==============================================================================
--- head/sys/arm/include/pmap.h	Wed Feb 20 09:46:44 2019	(r344352)
+++ head/sys/arm/include/pmap.h	Wed Feb 20 09:51:13 2019	(r344353)
@@ -71,5 +71,12 @@ void pmap_kremove_device(vm_offset_t, vm_size_t);
 vm_paddr_t pmap_kextract(vm_offset_t);
 #define vtophys(va)	pmap_kextract((vm_offset_t)(va))
 
+static inline int
+pmap_vmspace_copy(pmap_t dst_pmap __unused, pmap_t src_pmap __unused)
+{
+
+	return (0);
+}
+
 #endif	/* _KERNEL */
 #endif	/* !_MACHINE_PMAP_H_ */

Modified: head/sys/arm64/include/pmap.h
==============================================================================
--- head/sys/arm64/include/pmap.h	Wed Feb 20 09:46:44 2019	(r344352)
+++ head/sys/arm64/include/pmap.h	Wed Feb 20 09:51:13 2019	(r344353)
@@ -171,6 +171,13 @@ struct pcb *pmap_switch(struct thread *, struct thread
 
 #define	pmap_page_is_mapped(m)	(!TAILQ_EMPTY(&(m)->md.pv_list))
 
+static inline int
+pmap_vmspace_copy(pmap_t dst_pmap __unused, pmap_t src_pmap __unused)
+{
+
+	return (0);
+}
+
 #endif	/* _KERNEL */
 
 #endif	/* !LOCORE */

Modified: head/sys/i386/include/pmap.h
==============================================================================
--- head/sys/i386/include/pmap.h	Wed Feb 20 09:46:44 2019	(r344352)
+++ head/sys/i386/include/pmap.h	Wed Feb 20 09:51:13 2019	(r344353)
@@ -244,6 +244,13 @@ extern vm_offset_t virtual_end;
 #define	pmap_page_is_write_mapped(m)	(((m)->aflags & PGA_WRITEABLE) != 0)
 #define	pmap_unmapbios(va, sz)	pmap_unmapdev((va), (sz))
 
+static inline int
+pmap_vmspace_copy(pmap_t dst_pmap __unused, pmap_t src_pmap __unused)
+{
+
+	return (0);
+}
+
 struct sf_buf;
 
 /*

Modified: head/sys/mips/include/pmap.h
==============================================================================
--- head/sys/mips/include/pmap.h	Wed Feb 20 09:46:44 2019	(r344352)
+++ head/sys/mips/include/pmap.h	Wed Feb 20 09:51:13 2019	(r344353)
@@ -185,6 +185,13 @@ int pmap_emulate_modified(pmap_t pmap, vm_offset_t va)
 void pmap_page_set_memattr(vm_page_t, vm_memattr_t);
 int pmap_change_attr(vm_offset_t, vm_size_t, vm_memattr_t);
 
+static inline int
+pmap_vmspace_copy(pmap_t dst_pmap __unused, pmap_t src_pmap __unused)
+{
+
+	return (0);
+}
+
 #endif				/* _KERNEL */
 
 #endif				/* !LOCORE */

Modified: head/sys/powerpc/include/pmap.h
==============================================================================
--- head/sys/powerpc/include/pmap.h	Wed Feb 20 09:46:44 2019	(r344352)
+++ head/sys/powerpc/include/pmap.h	Wed Feb 20 09:51:13 2019	(r344353)
@@ -288,6 +288,13 @@ vm_offset_t pmap_early_io_map(vm_paddr_t pa, vm_size_t
 void pmap_early_io_unmap(vm_offset_t va, vm_size_t size);
 void pmap_track_page(pmap_t pmap, vm_offset_t va);
 
+static inline int
+pmap_vmspace_copy(pmap_t dst_pmap __unused, pmap_t src_pmap __unused)
+{
+
+	return (0);
+}
+
 #endif
 
 #endif /* !_MACHINE_PMAP_H_ */

Modified: head/sys/riscv/include/pmap.h
==============================================================================
--- head/sys/riscv/include/pmap.h	Wed Feb 20 09:46:44 2019	(r344352)
+++ head/sys/riscv/include/pmap.h	Wed Feb 20 09:51:13 2019	(r344353)
@@ -166,6 +166,13 @@ bool	pmap_get_tables(pmap_t, vm_offset_t, pd_entry_t *
 
 int pmap_fault_fixup(pmap_t, vm_offset_t, vm_prot_t);
 
+static inline int
+pmap_vmspace_copy(pmap_t dst_pmap __unused, pmap_t src_pmap __unused)
+{
+
+	return (0);
+}
+
 #endif	/* _KERNEL */
 
 #endif	/* !LOCORE */

Modified: head/sys/sparc64/include/pmap.h
==============================================================================
--- head/sys/sparc64/include/pmap.h	Wed Feb 20 09:46:44 2019	(r344352)
+++ head/sys/sparc64/include/pmap.h	Wed Feb 20 09:51:13 2019	(r344353)
@@ -128,4 +128,11 @@ SYSCTL_DECL(_debug_pmap_stats);
 
 #endif
 
+static inline int
+pmap_vmspace_copy(pmap_t dst_pmap __unused, pmap_t src_pmap __unused)
+{
+
+	return (0);
+}
+
 #endif /* !_MACHINE_PMAP_H_ */

Modified: head/sys/vm/vm_fault.c
==============================================================================
--- head/sys/vm/vm_fault.c	Wed Feb 20 09:46:44 2019	(r344352)
+++ head/sys/vm/vm_fault.c	Wed Feb 20 09:51:13 2019	(r344353)
@@ -481,8 +481,20 @@ vm_fault_populate(struct faultstate *fs, vm_prot_t pro
 			    fault_flags, true);
 		}
 		VM_OBJECT_WUNLOCK(fs->first_object);
-		pmap_enter(fs->map->pmap, vaddr, m, prot, fault_type | (wired ?
-		    PMAP_ENTER_WIRED : 0), psind);
+		rv = pmap_enter(fs->map->pmap, vaddr, m, prot, fault_type |
+		    (wired ? PMAP_ENTER_WIRED : 0), psind);
+#if defined(__amd64__)
+		if (psind > 0 && rv == KERN_FAILURE) {
+			for (i = 0; i < npages; i++) {
+				rv = pmap_enter(fs->map->pmap, vaddr + ptoa(i),
+				    &m[i], prot, fault_type |
+				    (wired ? PMAP_ENTER_WIRED : 0), 0);
+				MPASS(rv == KERN_SUCCESS);
+			}
+		}
+#else
+		MPASS(rv == KERN_SUCCESS);
+#endif
 		VM_OBJECT_WLOCK(fs->first_object);
 		m_mtx = NULL;
 		for (i = 0; i < npages; i++) {

Modified: head/sys/vm/vm_map.c
==============================================================================
--- head/sys/vm/vm_map.c	Wed Feb 20 09:46:44 2019	(r344352)
+++ head/sys/vm/vm_map.c	Wed Feb 20 09:51:13 2019	(r344353)
@@ -3544,7 +3544,7 @@ vmspace_fork(struct vmspace *vm1, vm_ooffset_t *fork_c
 	vm_map_t new_map, old_map;
 	vm_map_entry_t new_entry, old_entry;
 	vm_object_t object;
-	int locked;
+	int error, locked;
 	vm_inherit_t inh;
 
 	old_map = &vm1->vm_map;
@@ -3553,6 +3553,7 @@ vmspace_fork(struct vmspace *vm1, vm_ooffset_t *fork_c
 	    pmap_pinit);
 	if (vm2 == NULL)
 		return (NULL);
+
 	vm2->vm_taddr = vm1->vm_taddr;
 	vm2->vm_daddr = vm1->vm_daddr;
 	vm2->vm_maxsaddr = vm1->vm_maxsaddr;
@@ -3563,7 +3564,17 @@ vmspace_fork(struct vmspace *vm1, vm_ooffset_t *fork_c
 	locked = vm_map_trylock(new_map); /* trylock to silence WITNESS */
 	KASSERT(locked, ("vmspace_fork: lock failed"));
 
+	error = pmap_vmspace_copy(new_map->pmap, old_map->pmap);
+	if (error != 0) {
+		sx_xunlock(&old_map->lock);
+		sx_xunlock(&new_map->lock);
+		vm_map_process_deferred();
+		vmspace_free(vm2);
+		return (NULL);
+	}
+
 	new_map->anon_loc = old_map->anon_loc;
+
 	old_entry = old_map->header.next;
 
 	while (old_entry != &old_map->header) {

Modified: head/sys/x86/include/sysarch.h
==============================================================================
--- head/sys/x86/include/sysarch.h	Wed Feb 20 09:46:44 2019	(r344352)
+++ head/sys/x86/include/sysarch.h	Wed Feb 20 09:51:13 2019	(r344353)
@@ -52,6 +52,8 @@
 #define	I386_GET_GSBASE		9
 #define	I386_SET_GSBASE		10
 #define	I386_GET_XFPUSTATE	11
+#define	I386_SET_PKRU		12
+#define	I386_CLEAR_PKRU		13
 
 /* Leave space for 0-127 for to avoid translating syscalls */
 #define	AMD64_GET_FSBASE	128
@@ -59,7 +61,13 @@
 #define	AMD64_GET_GSBASE	130
 #define	AMD64_SET_GSBASE	131
 #define	AMD64_GET_XFPUSTATE	132
+#define	AMD64_SET_PKRU		133
+#define	AMD64_CLEAR_PKRU	134
 
+/* Flags for AMD64_SET_PKRU */
+#define	AMD64_PKRU_EXCL		0x0001
+#define	AMD64_PKRU_PERSIST	0x0002
+
 struct i386_ioperm_args {
 	unsigned int start;
 	unsigned int length;
@@ -94,11 +102,25 @@ struct i386_get_xfpustate {
 	int len;
 };
 
+struct i386_set_pkru {
+	unsigned int addr;
+	unsigned int len;
+	unsigned int keyidx;
+	int flags;
+};
+
 struct amd64_get_xfpustate {
 	void *addr;
 	int len;
 };
 #endif
+
+struct amd64_set_pkru {
+	void *addr;
+	unsigned long len;
+	unsigned int keyidx;
+	int flags;
+};
 
 #ifndef _KERNEL
 union descriptor;



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201902200951.x1K9pDQs001745>