Date:      Wed, 13 Feb 2019 17:19:37 +0000 (UTC)
From:      Mark Johnston <markj@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject:   svn commit: r344106 - in head/sys: riscv/include riscv/riscv vm
Message-ID:  <201902131719.x1DHJbv3025077@repo.freebsd.org>

Author: markj
Date: Wed Feb 13 17:19:37 2019
New Revision: 344106
URL: https://svnweb.freebsd.org/changeset/base/344106

Log:
  Implement transparent 2MB superpage promotion for RISC-V.
  
  This includes support for pmap_enter(..., psind=1) as described in the
  commit log message for r321378.
  
  The changes are largely modelled after amd64.  arm64 has more stringent
  requirements around superpage creation to avoid the possibility of TLB
  conflict aborts, and these requirements do not apply to RISC-V, which,
  like amd64, permits simultaneous caching of 4KB and 2MB translations for
  a given page.  RISC-V's PTE format includes only two software bits, and
  as these are already consumed we do not have an analogue for amd64's
  PG_PROMOTED.  Instead, pmap_remove_l2() always invalidates the entire
  2MB address range.
  
  pmap_ts_referenced() is modified to clear PTE_A, now that we support
  both hardware- and software-managed reference and dirty bits.  Also
  fix pmap_fault_fixup() so that it does not set PTE_A or PTE_D on kernel
  mappings.
  
  Reviewed by:	kib (earlier version)
  Discussed with:	jhb
  Sponsored by:	The FreeBSD Foundation
  Differential Revision:	https://reviews.freebsd.org/D18863
  Differential Revision:	https://reviews.freebsd.org/D18864
  Differential Revision:	https://reviews.freebsd.org/D18865
  Differential Revision:	https://reviews.freebsd.org/D18866
  Differential Revision:	https://reviews.freebsd.org/D18867
  Differential Revision:	https://reviews.freebsd.org/D18868
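
The core of the change is 4KB-to-2MB promotion.  As a rough illustration of
what the new PTE_PROMOTE mask (added to pte.h below) is for, the following
sketch shows the kind of eligibility test a promotion must pass: the 512 L3
entries backing a 2MB-aligned region must map physically contiguous,
2MB-aligned memory and must agree in every attribute covered by PTE_PROMOTE.
This is illustrative only and the helper name pmap_l2_can_promote() is
hypothetical; the committed pmap_promote_l2() additionally reconciles
writable-but-clean entries, stashes the now-idle L3 page table page with
pmap_insert_pt_page() so a later demotion can reuse it, and installs the new
L2 entry.

/*
 * Illustrative sketch only -- not the committed code.  Returns true if
 * the 512 L3 entries under "l2" could be replaced by a single 2MB L2
 * mapping.
 */
static bool
pmap_l2_can_promote(pd_entry_t *l2)
{
	pt_entry_t *firstl3, *l3, firstl3e, l3e;
	vm_paddr_t pa;
	int i;

	/* Locate the L3 page table that this L2 entry points to. */
	firstl3 = (pt_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(pmap_load(l2)));
	firstl3e = pmap_load(firstl3);

	/* The first 4KB frame must itself be 2MB-aligned. */
	pa = PTE_TO_PHYS(firstl3e);
	if ((firstl3e & PTE_V) == 0 || (pa & L2_OFFSET) != 0)
		return (false);

	/*
	 * The remaining 511 entries must map the physically contiguous
	 * frames that follow and must agree in every attribute that
	 * PTE_PROMOTE covers (validity, permissions, A/D, global/user,
	 * and the software managed/wired bits).
	 */
	for (i = 1, l3 = firstl3 + 1; i < Ln_ENTRIES; i++, l3++) {
		l3e = pmap_load(l3);
		pa += PAGE_SIZE;
		if (PTE_TO_PHYS(l3e) != pa)
			return (false);
		if ((l3e & PTE_PROMOTE) != (firstl3e & PTE_PROMOTE))
			return (false);
	}
	return (true);
}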

Modified:
  head/sys/riscv/include/param.h
  head/sys/riscv/include/pmap.h
  head/sys/riscv/include/pte.h
  head/sys/riscv/include/vmparam.h
  head/sys/riscv/riscv/pmap.c
  head/sys/vm/vm_fault.c

Modified: head/sys/riscv/include/param.h
==============================================================================
--- head/sys/riscv/include/param.h	Wed Feb 13 16:02:55 2019	(r344105)
+++ head/sys/riscv/include/param.h	Wed Feb 13 17:19:37 2019	(r344106)
@@ -82,7 +82,7 @@
 #define	PAGE_SIZE	(1 << PAGE_SHIFT)	/* Page size */
 #define	PAGE_MASK	(PAGE_SIZE - 1)
 
-#define	MAXPAGESIZES	1		/* maximum number of supported page sizes */
+#define	MAXPAGESIZES	3	/* maximum number of supported page sizes */
 
 #ifndef KSTACK_PAGES
 #define	KSTACK_PAGES	4	/* pages of kernel stack (with pcb) */

Modified: head/sys/riscv/include/pmap.h
==============================================================================
--- head/sys/riscv/include/pmap.h	Wed Feb 13 16:02:55 2019	(r344105)
+++ head/sys/riscv/include/pmap.h	Wed Feb 13 17:19:37 2019	(r344106)
@@ -44,6 +44,8 @@
 #include <sys/_lock.h>
 #include <sys/_mutex.h>
 
+#include <vm/_vm_radix.h>
+
 #ifdef _KERNEL
 
 #define	vtophys(va)	pmap_kextract((vm_offset_t)(va))
@@ -80,6 +82,7 @@ struct pmap {
 	pd_entry_t		*pm_l1;
 	TAILQ_HEAD(,pv_chunk)	pm_pvchunk;	/* list of mappings in pmap */
 	LIST_ENTRY(pmap)	pm_list;	/* List of all pmaps */
+	struct vm_radix		pm_root;
 };
 
 typedef struct pv_entry {
@@ -139,6 +142,7 @@ void	pmap_kenter_device(vm_offset_t, vm_size_t, vm_pad
 vm_paddr_t pmap_kextract(vm_offset_t va);
 void	pmap_kremove(vm_offset_t);
 void	pmap_kremove_device(vm_offset_t, vm_size_t);
+bool	pmap_ps_enabled(pmap_t);
 
 void	*pmap_mapdev(vm_offset_t, vm_size_t);
 void	*pmap_mapbios(vm_paddr_t, vm_size_t);

Modified: head/sys/riscv/include/pte.h
==============================================================================
--- head/sys/riscv/include/pte.h	Wed Feb 13 16:02:55 2019	(r344105)
+++ head/sys/riscv/include/pte.h	Wed Feb 13 17:19:37 2019	(r344106)
@@ -62,7 +62,8 @@ typedef	uint64_t	pn_t;			/* page number */
 #define	L3_SIZE 	(1 << L3_SHIFT)
 #define	L3_OFFSET 	(L3_SIZE - 1)
 
-#define	Ln_ENTRIES	(1 << 9)
+#define	Ln_ENTRIES_SHIFT 9
+#define	Ln_ENTRIES	(1 << Ln_ENTRIES_SHIFT)
 #define	Ln_ADDR_MASK	(Ln_ENTRIES - 1)
 
 /* Bits 9:8 are reserved for software */
@@ -79,6 +80,8 @@ typedef	uint64_t	pn_t;			/* page number */
 #define	PTE_RWX		(PTE_R | PTE_W | PTE_X)
 #define	PTE_RX		(PTE_R | PTE_X)
 #define	PTE_KERN	(PTE_V | PTE_R | PTE_W | PTE_A | PTE_D)
+#define	PTE_PROMOTE	(PTE_V | PTE_RWX | PTE_D | PTE_A | PTE_G | PTE_U | \
+			 PTE_SW_MANAGED | PTE_SW_WIRED)
 
 #define	PTE_PPN0_S	10
 #define	PTE_PPN1_S	19

Modified: head/sys/riscv/include/vmparam.h
==============================================================================
--- head/sys/riscv/include/vmparam.h	Wed Feb 13 16:02:55 2019	(r344105)
+++ head/sys/riscv/include/vmparam.h	Wed Feb 13 17:19:37 2019	(r344106)
@@ -99,10 +99,10 @@
 #define	VM_NFREEORDER		12
 
 /*
- * Disable superpage reservations.
+ * Enable superpage reservations: 1 level.
  */
 #ifndef	VM_NRESERVLEVEL
-#define	VM_NRESERVLEVEL		0
+#define	VM_NRESERVLEVEL		1
 #endif
 
 /*

Modified: head/sys/riscv/riscv/pmap.c
==============================================================================
--- head/sys/riscv/riscv/pmap.c	Wed Feb 13 16:02:55 2019	(r344105)
+++ head/sys/riscv/riscv/pmap.c	Wed Feb 13 17:19:37 2019	(r344106)
@@ -118,6 +118,7 @@ __FBSDID("$FreeBSD$");
  */
 
 #include <sys/param.h>
+#include <sys/bitstring.h>
 #include <sys/bus.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
@@ -145,6 +146,7 @@ __FBSDID("$FreeBSD$");
 #include <vm/vm_extern.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_pager.h>
+#include <vm/vm_phys.h>
 #include <vm/vm_radix.h>
 #include <vm/vm_reserv.h>
 #include <vm/uma.h>
@@ -154,9 +156,8 @@ __FBSDID("$FreeBSD$");
 #include <machine/pcb.h>
 #include <machine/sbi.h>
 
-#define	NPDEPG		(PAGE_SIZE/(sizeof (pd_entry_t)))
-#define	NUPDE			(NPDEPG * NPDEPG)
-#define	NUSERPGTBLS		(NUPDE + NPDEPG)
+#define	NUL1E		(Ln_ENTRIES * Ln_ENTRIES)
+#define	NUL2E		(Ln_ENTRIES * NUL1E)
 
 #if !defined(DIAGNOSTIC)
 #ifdef __GNUC_GNU_INLINE__
@@ -175,11 +176,12 @@ __FBSDID("$FreeBSD$");
 #endif
 
 #define	pmap_l2_pindex(v)	((v) >> L2_SHIFT)
+#define	pa_to_pvh(pa)		(&pv_table[pa_index(pa)])
 
 #define	NPV_LIST_LOCKS	MAXCPU
 
 #define	PHYS_TO_PV_LIST_LOCK(pa)	\
-			(&pv_list_locks[pa_index(pa) % NPV_LIST_LOCKS])
+			(&pv_list_locks[pmap_l2_pindex(pa) % NPV_LIST_LOCKS])
 
 #define	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa)	do {	\
 	struct rwlock **_lockp = (lockp);		\
@@ -230,13 +232,52 @@ CTASSERT((DMAP_MAX_ADDRESS  & ~L1_OFFSET) == DMAP_MAX_
 static struct rwlock_padalign pvh_global_lock;
 static struct mtx_padalign allpmaps_lock;
 
+static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0,
+    "VM/pmap parameters");
+
+static int superpages_enabled = 1;
+SYSCTL_INT(_vm_pmap, OID_AUTO, superpages_enabled,
+    CTLFLAG_RDTUN, &superpages_enabled, 0,
+    "Enable support for transparent superpages");
+
+static SYSCTL_NODE(_vm_pmap, OID_AUTO, l2, CTLFLAG_RD, 0,
+    "2MB page mapping counters");
+
+static u_long pmap_l2_demotions;
+SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, demotions, CTLFLAG_RD,
+    &pmap_l2_demotions, 0,
+    "2MB page demotions");
+
+static u_long pmap_l2_mappings;
+SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, mappings, CTLFLAG_RD,
+    &pmap_l2_mappings, 0,
+    "2MB page mappings");
+
+static u_long pmap_l2_p_failures;
+SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, p_failures, CTLFLAG_RD,
+    &pmap_l2_p_failures, 0,
+    "2MB page promotion failures");
+
+static u_long pmap_l2_promotions;
+SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, promotions, CTLFLAG_RD,
+    &pmap_l2_promotions, 0,
+    "2MB page promotions");
+
 /*
  * Data for the pv entry allocation mechanism
  */
 static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks);
 static struct mtx pv_chunks_mutex;
 static struct rwlock pv_list_locks[NPV_LIST_LOCKS];
+static struct md_page *pv_table;
+static struct md_page pv_dummy;
 
+/*
+ * Internal flags for pmap_enter()'s helper functions.
+ */
+#define	PMAP_ENTER_NORECLAIM	0x1000000	/* Don't reclaim PV entries. */
+#define	PMAP_ENTER_NOREPLACE	0x2000000	/* Don't replace mappings. */
+
 static void	free_pv_chunk(struct pv_chunk *pc);
 static void	free_pv_entry(pmap_t pmap, pv_entry_t pv);
 static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp);
@@ -244,6 +285,11 @@ static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, 
 static void	pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
 static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
 		    vm_offset_t va);
+static bool	pmap_demote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va);
+static bool	pmap_demote_l2_locked(pmap_t pmap, pd_entry_t *l2,
+		    vm_offset_t va, struct rwlock **lockp);
+static int	pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2,
+		    u_int flags, vm_page_t m, struct rwlock **lockp);
 static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
     vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp);
 static int pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t sva,
@@ -254,9 +300,9 @@ static boolean_t pmap_try_insert_pv_entry(pmap_t pmap,
 static vm_page_t _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex,
 		struct rwlock **lockp);
 
-static void _pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m,
+static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m,
     struct spglist *free);
-static int pmap_unuse_l3(pmap_t, vm_offset_t, pd_entry_t, struct spglist *);
+static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *);
 
 #define	pmap_clear(pte)			pmap_store(pte, 0)
 #define	pmap_clear_bits(pte, bits)	atomic_clear_64(pte, bits)
@@ -636,7 +682,8 @@ pmap_page_init(vm_page_t m)
 void
 pmap_init(void)
 {
-	int i;
+	vm_size_t s;
+	int i, pv_npg;
 
 	/*
 	 * Initialize the pv chunk and pmap list mutexes.
@@ -649,6 +696,24 @@ pmap_init(void)
 	 */
 	for (i = 0; i < NPV_LIST_LOCKS; i++)
 		rw_init(&pv_list_locks[i], "pmap pv list");
+
+	/*
+	 * Calculate the size of the pv head table for superpages.
+	 */
+	pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, L2_SIZE);
+
+	/*
+	 * Allocate memory for the pv head table for superpages.
+	 */
+	s = (vm_size_t)(pv_npg * sizeof(struct md_page));
+	s = round_page(s);
+	pv_table = (struct md_page *)kmem_malloc(s, M_WAITOK | M_ZERO);
+	for (i = 0; i < pv_npg; i++)
+		TAILQ_INIT(&pv_table[i].pv_list);
+	TAILQ_INIT(&pv_dummy.pv_list);
+
+	if (superpages_enabled)
+		pagesizes[1] = L2_SIZE;
 }
 
 #ifdef SMP
@@ -999,6 +1064,13 @@ pmap_qremove(vm_offset_t sva, int count)
 	pmap_invalidate_range(kernel_pmap, sva, va);
 }
 
+bool
+pmap_ps_enabled(pmap_t pmap __unused)
+{
+
+	return (superpages_enabled);
+}
+
 /***************************************************
  * Page table page management routines.....
  ***************************************************/
@@ -1018,6 +1090,34 @@ pmap_add_delayed_free_list(vm_page_t m, struct spglist
 		m->flags &= ~PG_ZERO;
 	SLIST_INSERT_HEAD(free, m, plinks.s.ss);
 }
+
+/*
+ * Inserts the specified page table page into the specified pmap's collection
+ * of idle page table pages.  Each of a pmap's page table pages is responsible
+ * for mapping a distinct range of virtual addresses.  The pmap's collection is
+ * ordered by this virtual address range.
+ */
+static __inline int
+pmap_insert_pt_page(pmap_t pmap, vm_page_t ml3)
+{
+
+	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+	return (vm_radix_insert(&pmap->pm_root, ml3));
+}
+
+/*
+ * Removes the page table page mapping the specified virtual address from the
+ * specified pmap's collection of idle page table pages, and returns it.
+ * Otherwise, returns NULL if there is no page table page corresponding to the
+ * specified virtual address.
+ */
+static __inline vm_page_t
+pmap_remove_pt_page(pmap_t pmap, vm_offset_t va)
+{
+
+	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+	return (vm_radix_remove(&pmap->pm_root, pmap_l2_pindex(va)));
+}
 	
 /*
  * Decrements a page table page's wire count, which is used to record the
@@ -1026,12 +1126,12 @@ pmap_add_delayed_free_list(vm_page_t m, struct spglist
  * page table page was unmapped and FALSE otherwise.
  */
 static inline boolean_t
-pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
+pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
 {
 
 	--m->wire_count;
 	if (m->wire_count == 0) {
-		_pmap_unwire_l3(pmap, va, m, free);
+		_pmap_unwire_ptp(pmap, va, m, free);
 		return (TRUE);
 	} else {
 		return (FALSE);
@@ -1039,36 +1139,30 @@ pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t 
 }
 
 static void
-_pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
+_pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
 {
 	vm_paddr_t phys;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
-	/*
-	 * unmap the page table page
-	 */
-	if (m->pindex >= NUPDE) {
-		/* PD page */
+	if (m->pindex >= NUL1E) {
 		pd_entry_t *l1;
 		l1 = pmap_l1(pmap, va);
 		pmap_clear(l1);
 		pmap_distribute_l1(pmap, pmap_l1_index(va), 0);
 	} else {
-		/* PTE page */
 		pd_entry_t *l2;
 		l2 = pmap_l2(pmap, va);
 		pmap_clear(l2);
 	}
 	pmap_resident_count_dec(pmap, 1);
-	if (m->pindex < NUPDE) {
+	if (m->pindex < NUL1E) {
 		pd_entry_t *l1;
-		/* We just released a PT, unhold the matching PD */
 		vm_page_t pdpg;
 
 		l1 = pmap_l1(pmap, va);
 		phys = PTE_TO_PHYS(pmap_load(l1));
 		pdpg = PHYS_TO_VM_PAGE(phys);
-		pmap_unwire_l3(pmap, va, pdpg, free);
+		pmap_unwire_ptp(pmap, va, pdpg, free);
 	}
 	pmap_invalidate_page(pmap, va);
 
@@ -1082,24 +1176,20 @@ _pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t
 }
 
 /*
- * After removing an l3 entry, this routine is used to
+ * After removing a page table entry, this routine is used to
  * conditionally free the page, and manage the hold/wire counts.
  */
 static int
-pmap_unuse_l3(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde,
+pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde,
     struct spglist *free)
 {
-	vm_paddr_t phys;
 	vm_page_t mpte;
 
 	if (va >= VM_MAXUSER_ADDRESS)
 		return (0);
 	KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0"));
-
-	phys = PTE_TO_PHYS(ptepde);
-
-	mpte = PHYS_TO_VM_PAGE(phys);
-	return (pmap_unwire_l3(pmap, va, mpte, free));
+	mpte = PHYS_TO_VM_PAGE(PTE_TO_PHYS(ptepde));
+	return (pmap_unwire_ptp(pmap, va, mpte, free));
 }
 
 void
@@ -1140,6 +1230,8 @@ pmap_pinit(pmap_t pmap)
 	LIST_INSERT_HEAD(&allpmaps, pmap, pm_list);
 	mtx_unlock(&allpmaps_lock);
 
+	vm_radix_init(&pmap->pm_root);
+
 	return (1);
 }
 
@@ -1193,11 +1285,11 @@ _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, str
 	 * it isn't already there.
 	 */
 
-	if (ptepindex >= NUPDE) {
+	if (ptepindex >= NUL1E) {
 		pd_entry_t *l1;
 		vm_pindex_t l1index;
 
-		l1index = ptepindex - NUPDE;
+		l1index = ptepindex - NUL1E;
 		l1 = &pmap->pm_l1[l1index];
 
 		pn = (VM_PAGE_TO_PHYS(m) / PAGE_SIZE);
@@ -1213,7 +1305,7 @@ _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, str
 		l1 = &pmap->pm_l1[l1index];
 		if (pmap_load(l1) == 0) {
 			/* recurse for allocating page dir */
-			if (_pmap_alloc_l3(pmap, NUPDE + l1index,
+			if (_pmap_alloc_l3(pmap, NUL1E + l1index,
 			    lockp) == NULL) {
 				vm_page_unwire_noq(m);
 				vm_page_free_zero(m);
@@ -1241,6 +1333,29 @@ _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, str
 }
 
 static vm_page_t
+pmap_alloc_l2(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
+{
+	pd_entry_t *l1;
+	vm_page_t l2pg;
+	vm_pindex_t l2pindex;
+
+retry:
+	l1 = pmap_l1(pmap, va);
+	if (l1 != NULL && (pmap_load(l1) & PTE_RWX) == 0) {
+		/* Add a reference to the L2 page. */
+		l2pg = PHYS_TO_VM_PAGE(PTE_TO_PHYS(pmap_load(l1)));
+		l2pg->wire_count++;
+	} else {
+		/* Allocate a L2 page. */
+		l2pindex = pmap_l2_pindex(va) >> Ln_ENTRIES_SHIFT;
+		l2pg = _pmap_alloc_l3(pmap, NUL2E + l2pindex, lockp);
+		if (l2pg == NULL && lockp != NULL)
+			goto retry;
+	}
+	return (l2pg);
+}
+
+static vm_page_t
 pmap_alloc_l3(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
 {
 	vm_pindex_t ptepindex;
@@ -1599,6 +1714,79 @@ retry:
 }
 
 /*
+ * Ensure that the number of spare PV entries in the specified pmap meets or
+ * exceeds the given count, "needed".
+ *
+ * The given PV list lock may be released.
+ */
+static void
+reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp)
+{
+	struct pch new_tail;
+	struct pv_chunk *pc;
+	vm_page_t m;
+	int avail, free;
+	bool reclaimed;
+
+	rw_assert(&pvh_global_lock, RA_LOCKED);
+	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+	KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL"));
+
+	/*
+	 * Newly allocated PV chunks must be stored in a private list until
+	 * the required number of PV chunks have been allocated.  Otherwise,
+	 * reclaim_pv_chunk() could recycle one of these chunks.  In
+	 * contrast, these chunks must be added to the pmap upon allocation.
+	 */
+	TAILQ_INIT(&new_tail);
+retry:
+	avail = 0;
+	TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) {
+		bit_count((bitstr_t *)pc->pc_map, 0,
+		    sizeof(pc->pc_map) * NBBY, &free);
+		if (free == 0)
+			break;
+		avail += free;
+		if (avail >= needed)
+			break;
+	}
+	for (reclaimed = false; avail < needed; avail += _NPCPV) {
+		m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
+		    VM_ALLOC_WIRED);
+		if (m == NULL) {
+			m = reclaim_pv_chunk(pmap, lockp);
+			if (m == NULL)
+				goto retry;
+			reclaimed = true;
+		}
+		/* XXX PV STATS */
+#if 0
+		dump_add_page(m->phys_addr);
+#endif
+		pc = (void *)PHYS_TO_DMAP(m->phys_addr);
+		pc->pc_pmap = pmap;
+		pc->pc_map[0] = PC_FREE0;
+		pc->pc_map[1] = PC_FREE1;
+		pc->pc_map[2] = PC_FREE2;
+		TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
+		TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
+
+		/*
+		 * The reclaim might have freed a chunk from the current pmap.
+		 * If that chunk contained available entries, we need to
+		 * re-count the number of available entries.
+		 */
+		if (reclaimed)
+			goto retry;
+	}
+	if (!TAILQ_EMPTY(&new_tail)) {
+		mtx_lock(&pv_chunks_mutex);
+		TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru);
+		mtx_unlock(&pv_chunks_mutex);
+	}
+}
+
+/*
  * First find and then remove the pv entry for the specified pmap and virtual
  * address from the specified pv list.  Returns the pv entry if found and NULL
  * otherwise.  This operation can be performed on pv lists for either 4KB or
@@ -1632,7 +1820,7 @@ pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_off
 
 	pv = pmap_pvh_remove(pvh, pmap, va);
 
-	KASSERT(pv != NULL, ("pmap_pvh_free: pv not found"));
+	KASSERT(pv != NULL, ("pmap_pvh_free: pv not found for %#lx", va));
 	free_pv_entry(pmap, pv);
 }
 
@@ -1660,6 +1848,222 @@ pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, 
 }
 
 /*
+ * After demotion from a 2MB page mapping to 512 4KB page mappings,
+ * destroy the pv entry for the 2MB page mapping and reinstantiate the pv
+ * entries for each of the 4KB page mappings.
+ */
+static void __unused
+pmap_pv_demote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
+    struct rwlock **lockp)
+{
+	struct md_page *pvh;
+	struct pv_chunk *pc;
+	pv_entry_t pv;
+	vm_page_t m;
+	vm_offset_t va_last;
+	int bit, field;
+
+	rw_assert(&pvh_global_lock, RA_LOCKED);
+	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
+
+	/*
+	 * Transfer the 2mpage's pv entry for this mapping to the first
+	 * page's pv list.  Once this transfer begins, the pv list lock
+	 * must not be released until the last pv entry is reinstantiated.
+	 */
+	pvh = pa_to_pvh(pa);
+	va &= ~L2_OFFSET;
+	pv = pmap_pvh_remove(pvh, pmap, va);
+	KASSERT(pv != NULL, ("pmap_pv_demote_l2: pv not found"));
+	m = PHYS_TO_VM_PAGE(pa);
+	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
+	m->md.pv_gen++;
+	/* Instantiate the remaining 511 pv entries. */
+	va_last = va + L2_SIZE - PAGE_SIZE;
+	for (;;) {
+		pc = TAILQ_FIRST(&pmap->pm_pvchunk);
+		KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0 ||
+		    pc->pc_map[2] != 0, ("pmap_pv_demote_l2: missing spare"));
+		for (field = 0; field < _NPCM; field++) {
+			while (pc->pc_map[field] != 0) {
+				bit = ffsl(pc->pc_map[field]) - 1;
+				pc->pc_map[field] &= ~(1ul << bit);
+				pv = &pc->pc_pventry[field * 64 + bit];
+				va += PAGE_SIZE;
+				pv->pv_va = va;
+				m++;
+				KASSERT((m->oflags & VPO_UNMANAGED) == 0,
+			    ("pmap_pv_demote_l2: page %p is not managed", m));
+				TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
+				m->md.pv_gen++;
+				if (va == va_last)
+					goto out;
+			}
+		}
+		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
+		TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
+	}
+out:
+	if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) {
+		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
+		TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
+	}
+	/* XXX PV stats */
+}
+
+#if VM_NRESERVLEVEL > 0
+static void
+pmap_pv_promote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
+    struct rwlock **lockp)
+{
+	struct md_page *pvh;
+	pv_entry_t pv;
+	vm_page_t m;
+	vm_offset_t va_last;
+
+	rw_assert(&pvh_global_lock, RA_LOCKED);
+	KASSERT((va & L2_OFFSET) == 0,
+	    ("pmap_pv_promote_l2: misaligned va %#lx", va));
+
+	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
+
+	m = PHYS_TO_VM_PAGE(pa);
+	pv = pmap_pvh_remove(&m->md, pmap, va);
+	KASSERT(pv != NULL, ("pmap_pv_promote_l2: pv for %#lx not found", va));
+	pvh = pa_to_pvh(pa);
+	TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
+	pvh->pv_gen++;
+
+	va_last = va + L2_SIZE - PAGE_SIZE;
+	do {
+		m++;
+		va += PAGE_SIZE;
+		pmap_pvh_free(&m->md, pmap, va);
+	} while (va < va_last);
+}
+#endif /* VM_NRESERVLEVEL > 0 */
+
+/*
+ * Create the PV entry for a 2MB page mapping.  Always returns true unless the
+ * flag PMAP_ENTER_NORECLAIM is specified.  If that flag is specified, returns
+ * false if the PV entry cannot be allocated without resorting to reclamation.
+ */
+static bool
+pmap_pv_insert_l2(pmap_t pmap, vm_offset_t va, pd_entry_t l2e, u_int flags,
+    struct rwlock **lockp)
+{
+	struct md_page *pvh;
+	pv_entry_t pv;
+	vm_paddr_t pa;
+
+	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+	/* Pass NULL instead of the lock pointer to disable reclamation. */
+	if ((pv = get_pv_entry(pmap, (flags & PMAP_ENTER_NORECLAIM) != 0 ?
+	    NULL : lockp)) == NULL)
+		return (false);
+	pv->pv_va = va;
+	pa = PTE_TO_PHYS(l2e);
+	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
+	pvh = pa_to_pvh(pa);
+	TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
+	pvh->pv_gen++;
+	return (true);
+}
+
+static void
+pmap_remove_kernel_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va)
+{
+	pt_entry_t newl2, oldl2;
+	vm_page_t ml3;
+	vm_paddr_t ml3pa;
+
+	KASSERT(!VIRT_IN_DMAP(va), ("removing direct mapping of %#lx", va));
+	KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap));
+	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+
+	ml3 = pmap_remove_pt_page(pmap, va);
+	if (ml3 == NULL)
+		panic("pmap_remove_kernel_l2: Missing pt page");
+
+	ml3pa = VM_PAGE_TO_PHYS(ml3);
+	newl2 = ml3pa | PTE_V;
+
+	/*
+	 * Initialize the page table page.
+	 */
+	pagezero((void *)PHYS_TO_DMAP(ml3pa));
+
+	/*
+	 * Demote the mapping.
+	 */
+	oldl2 = pmap_load_store(l2, newl2);
+	KASSERT(oldl2 == 0, ("%s: found existing mapping at %p: %#lx",
+	    __func__, l2, oldl2));
+}
+
+/*
+ * pmap_remove_l2: Do the things to unmap a level 2 superpage.
+ */
+static int
+pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva,
+    pd_entry_t l1e, struct spglist *free, struct rwlock **lockp)
+{
+	struct md_page *pvh;
+	pt_entry_t oldl2;
+	vm_offset_t eva, va;
+	vm_page_t m, ml3;
+
+	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+	KASSERT((sva & L2_OFFSET) == 0, ("pmap_remove_l2: sva is not aligned"));
+	oldl2 = pmap_load_clear(l2);
+	KASSERT((oldl2 & PTE_RWX) != 0,
+	    ("pmap_remove_l2: L2e %lx is not a superpage mapping", oldl2));
+
+	/*
+	 * The sfence.vma documentation states that it is sufficient to specify
+	 * a single address within a superpage mapping.  However, since we do
+	 * not perform any invalidation upon promotion, TLBs may still be
+	 * caching 4KB mappings within the superpage, so we must invalidate the
+	 * entire range.
+	 */
+	pmap_invalidate_range(pmap, sva, sva + L2_SIZE);
+	if ((oldl2 & PTE_SW_WIRED) != 0)
+		pmap->pm_stats.wired_count -= L2_SIZE / PAGE_SIZE;
+	pmap_resident_count_dec(pmap, L2_SIZE / PAGE_SIZE);
+	if ((oldl2 & PTE_SW_MANAGED) != 0) {
+		CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, PTE_TO_PHYS(oldl2));
+		pvh = pa_to_pvh(PTE_TO_PHYS(oldl2));
+		pmap_pvh_free(pvh, pmap, sva);
+		eva = sva + L2_SIZE;
+		for (va = sva, m = PHYS_TO_VM_PAGE(PTE_TO_PHYS(oldl2));
+		    va < eva; va += PAGE_SIZE, m++) {
+			if ((oldl2 & PTE_D) != 0)
+				vm_page_dirty(m);
+			if ((oldl2 & PTE_A) != 0)
+				vm_page_aflag_set(m, PGA_REFERENCED);
+			if (TAILQ_EMPTY(&m->md.pv_list) &&
+			    TAILQ_EMPTY(&pvh->pv_list))
+				vm_page_aflag_clear(m, PGA_WRITEABLE);
+		}
+	}
+	if (pmap == kernel_pmap) {
+		pmap_remove_kernel_l2(pmap, l2, sva);
+	} else {
+		ml3 = pmap_remove_pt_page(pmap, sva);
+		if (ml3 != NULL) {
+			pmap_resident_count_dec(pmap, 1);
+			KASSERT(ml3->wire_count == Ln_ENTRIES,
+			    ("pmap_remove_l2: l3 page wire count error"));
+			ml3->wire_count = 1;
+			vm_page_unwire_noq(ml3);
+			pmap_add_delayed_free_list(ml3, free, FALSE);
+		}
+	}
+	return (pmap_unuse_pt(pmap, sva, l1e, free));
+}
+
+/*
  * pmap_remove_l3: do the things to unmap a page in a process
  */
 static int
@@ -1687,7 +2091,7 @@ pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_
 		pmap_pvh_free(&m->md, pmap, va);
 	}
 
-	return (pmap_unuse_l3(pmap, va, l2e, free));
+	return (pmap_unuse_pt(pmap, va, l2e, free));
 }
 
 /*
@@ -1699,11 +2103,11 @@ pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_
 void
 pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 {
+	struct spglist free;
 	struct rwlock *lock;
 	vm_offset_t va, va_next;
-	pd_entry_t *l1, *l2;
-	pt_entry_t l3_pte, *l3;
-	struct spglist free;
+	pd_entry_t *l1, *l2, l2e;
+	pt_entry_t *l3;
 
 	/*
 	 * Perform an unsynchronized read.  This is, however, safe.
@@ -1739,16 +2143,22 @@ pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t 
 		l2 = pmap_l1_to_l2(l1, sva);
 		if (l2 == NULL)
 			continue;
-
-		l3_pte = pmap_load(l2);
-
-		/*
-		 * Weed out invalid mappings.
-		 */
-		if (l3_pte == 0)
+		if ((l2e = pmap_load(l2)) == 0)
 			continue;
-		if ((pmap_load(l2) & PTE_RX) != 0)
-			continue;
+		if ((l2e & PTE_RWX) != 0) {
+			if (sva + L2_SIZE == va_next && eva >= va_next) {
+				(void)pmap_remove_l2(pmap, l2, sva,
+				    pmap_load(l1), &free, &lock);
+				continue;
+			} else if (!pmap_demote_l2_locked(pmap, l2, sva,
+			    &lock)) {
+				/*
+				 * The large page mapping was destroyed.
+				 */
+				continue;
+			}
+			l2e = pmap_load(l2);
+		}
 
 		/*
 		 * Limit our scan to either the end of the va represented
@@ -1761,8 +2171,6 @@ pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t 
 		va = va_next;
 		for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++,
 		    sva += L3_SIZE) {
-			if (l3 == NULL)
-				panic("l3 == NULL");
 			if (pmap_load(l3) == 0) {
 				if (va != va_next) {
 					pmap_invalidate_range(pmap, va, sva);
@@ -1772,8 +2180,7 @@ pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t 
 			}
 			if (va == va_next)
 				va = sva;
-			if (pmap_remove_l3(pmap, l3, sva, l3_pte, &free,
-			    &lock)) {
+			if (pmap_remove_l3(pmap, l3, sva, l2e, &free, &lock)) {
 				sva += L3_SIZE;
 				break;
 			}
@@ -1783,7 +2190,7 @@ pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t 
 	}
 	if (lock != NULL)
 		rw_wunlock(lock);
-	rw_runlock(&pvh_global_lock);	
+	rw_runlock(&pvh_global_lock);
 	PMAP_UNLOCK(pmap);
 	vm_page_free_pages_toq(&free, false);
 }
@@ -1804,42 +2211,54 @@ pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t 
 void
 pmap_remove_all(vm_page_t m)
 {
-	pv_entry_t pv;
-	pmap_t pmap;
-	pt_entry_t *l3, tl3;
-	pd_entry_t *l2, tl2;
 	struct spglist free;
+	struct md_page *pvh;
+	pmap_t pmap;
+	pt_entry_t *l3, l3e;
+	pd_entry_t *l2, l2e;
+	pv_entry_t pv;
+	vm_offset_t va;
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_remove_all: page %p is not managed", m));
 	SLIST_INIT(&free);
+	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
+	    pa_to_pvh(VM_PAGE_TO_PHYS(m));
+
 	rw_wlock(&pvh_global_lock);
+	while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) {
+		pmap = PV_PMAP(pv);
+		PMAP_LOCK(pmap);
+		va = pv->pv_va;
+		l2 = pmap_l2(pmap, va);
+		(void)pmap_demote_l2(pmap, l2, va);
+		PMAP_UNLOCK(pmap);
+	}
 	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
 		pmap = PV_PMAP(pv);
 		PMAP_LOCK(pmap);
 		pmap_resident_count_dec(pmap, 1);
 		l2 = pmap_l2(pmap, pv->pv_va);
 		KASSERT(l2 != NULL, ("pmap_remove_all: no l2 table found"));
-		tl2 = pmap_load(l2);
+		l2e = pmap_load(l2);
 
-		KASSERT((tl2 & PTE_RX) == 0,
-		    ("pmap_remove_all: found a table when expecting "
-		    "a block in %p's pv list", m));
+		KASSERT((l2e & PTE_RX) == 0,
+		    ("pmap_remove_all: found a superpage in %p's pv list", m));
 
 		l3 = pmap_l2_to_l3(l2, pv->pv_va);
-		tl3 = pmap_load_clear(l3);
+		l3e = pmap_load_clear(l3);
 		pmap_invalidate_page(pmap, pv->pv_va);
-		if (tl3 & PTE_SW_WIRED)
+		if (l3e & PTE_SW_WIRED)
 			pmap->pm_stats.wired_count--;
-		if ((tl3 & PTE_A) != 0)
+		if ((l3e & PTE_A) != 0)
 			vm_page_aflag_set(m, PGA_REFERENCED);
 
 		/*
 		 * Update the vm_page_t clean and reference bits.
 		 */
-		if ((tl3 & PTE_D) != 0)
+		if ((l3e & PTE_D) != 0)
 			vm_page_dirty(m);
-		pmap_unuse_l3(pmap, pv->pv_va, pmap_load(l2), &free);
+		pmap_unuse_pt(pmap, pv->pv_va, pmap_load(l2), &free);
 		TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
 		m->md.pv_gen++;
 		free_pv_entry(pmap, pv);
@@ -1857,10 +2276,12 @@ pmap_remove_all(vm_page_t m)
 void
 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
 {
-	pd_entry_t *l1, *l2;
+	pd_entry_t *l1, *l2, l2e;
 	pt_entry_t *l3, l3e, mask;
 	vm_page_t m;
-	vm_offset_t va_next;
+	vm_paddr_t pa;
+	vm_offset_t va, va_next;
+	bool anychanged, pv_lists_locked;
 
 	if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
 		pmap_remove(pmap, sva, eva);
@@ -1871,12 +2292,14 @@ pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t
 	    (VM_PROT_WRITE | VM_PROT_EXECUTE))
 		return;
 
+	anychanged = false;
+	pv_lists_locked = false;
 	mask = 0;
 	if ((prot & VM_PROT_WRITE) == 0)
 		mask |= PTE_W | PTE_D;
 	if ((prot & VM_PROT_EXECUTE) == 0)
 		mask |= PTE_X;
-
+resume:
 	PMAP_LOCK(pmap);
 	for (; sva < eva; sva = va_next) {
 		l1 = pmap_l1(pmap, sva);
@@ -1892,10 +2315,41 @@ pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t
 			va_next = eva;
 
 		l2 = pmap_l1_to_l2(l1, sva);
-		if (l2 == NULL || pmap_load(l2) == 0)
+		if (l2 == NULL || (l2e = pmap_load(l2)) == 0)
 			continue;
-		if ((pmap_load(l2) & PTE_RX) != 0)
-			continue;
+		if ((l2e & PTE_RWX) != 0) {
+			if (sva + L2_SIZE == va_next && eva >= va_next) {
+retryl2:
+				if ((l2e & (PTE_SW_MANAGED | PTE_D)) ==
+				    (PTE_SW_MANAGED | PTE_D)) {
+					pa = PTE_TO_PHYS(l2e);
+					for (va = sva, m = PHYS_TO_VM_PAGE(pa);
+					    va < va_next; m++, va += PAGE_SIZE)
+						vm_page_dirty(m);
+				}
+				if (!atomic_fcmpset_long(l2, &l2e, l2e & ~mask))
+					goto retryl2;
+				anychanged = true;
+			} else {
+				if (!pv_lists_locked) {
+					pv_lists_locked = true;
+					if (!rw_try_rlock(&pvh_global_lock)) {
+						if (anychanged)
+							pmap_invalidate_all(
+							    pmap);
+						PMAP_UNLOCK(pmap);
+						rw_rlock(&pvh_global_lock);
+						goto resume;
+					}
+				}
+				if (!pmap_demote_l2(pmap, l2, sva)) {
+					/*
+					 * The large page mapping was destroyed.
+					 */
+					continue;
+				}
+			}
+		}
 
 		if (va_next > eva)
 			va_next = eva;
@@ -1903,7 +2357,7 @@ pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t
 		for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++,
 		    sva += L3_SIZE) {
 			l3e = pmap_load(l3);
-retry:
+retryl3:
 			if ((l3e & PTE_V) == 0)
 				continue;
 			if ((prot & VM_PROT_WRITE) == 0 &&
@@ -1913,60 +2367,236 @@ retry:
 				vm_page_dirty(m);
 			}
 			if (!atomic_fcmpset_long(l3, &l3e, l3e & ~mask))
-				goto retry;
-			/* XXX: Use pmap_invalidate_range */
-			pmap_invalidate_page(pmap, sva);
+				goto retryl3;
+			anychanged = true;
 		}
 	}
+	if (anychanged)
+		pmap_invalidate_all(pmap);
+	if (pv_lists_locked)
+		rw_runlock(&pvh_global_lock);
 	PMAP_UNLOCK(pmap);
 }
 
 int
 pmap_fault_fixup(pmap_t pmap, vm_offset_t va, vm_prot_t ftype)
 {
-	pt_entry_t orig_l3;
-	pt_entry_t new_l3;
-	pt_entry_t *l3;
+	pd_entry_t *l2, l2e;

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
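
The log's first paragraph refers to pmap_enter(..., psind=1); the matching
pmap_enter_l2() prototype is visible in the diff above, but its body lies in
the truncated portion (see the changeset URL at the top).  As a rough,
hypothetical sketch of what that interface implies -- not the committed code
-- the fragment below builds a leaf L2 entry for a 2MB-aligned page run and
hands it to pmap_enter_l2() with the new internal flags; the wrapper name
pmap_enter_2m_sketch() is invented and its attribute handling is simplified.

/*
 * Hypothetical wrapper, for illustration only.
 */
static int
pmap_enter_2m_sketch(pmap_t pmap, vm_offset_t va, vm_page_t m,
    vm_prot_t prot, struct rwlock **lockp)
{
	pd_entry_t new_l2;
	pn_t pn;

	/* Both the VA and the backing PA must be 2MB-aligned. */
	KASSERT((va & L2_OFFSET) == 0, ("misaligned va %#lx", va));
	KASSERT((VM_PAGE_TO_PHYS(m) & L2_OFFSET) == 0,
	    ("misaligned pa %#lx", VM_PAGE_TO_PHYS(m)));

	/* Leaf L2 entry: physical page number plus permission bits. */
	pn = VM_PAGE_TO_PHYS(m) >> PAGE_SHIFT;
	new_l2 = (pd_entry_t)(pn << PTE_PPN0_S) | PTE_V | PTE_R | PTE_A;
	if ((prot & VM_PROT_WRITE) != 0)
		new_l2 |= PTE_W;
	if ((prot & VM_PROT_EXECUTE) != 0)
		new_l2 |= PTE_X;
	if (va < VM_MAXUSER_ADDRESS)
		new_l2 |= PTE_U;
	if ((m->oflags & VPO_UNMANAGED) == 0)
		new_l2 |= PTE_SW_MANAGED;

	/*
	 * PMAP_ENTER_NOREPLACE and PMAP_ENTER_NORECLAIM are the internal
	 * flags introduced in the diff; pmap_enter_l2() either creates
	 * the 2MB mapping or reports that it could not.
	 */
	return (pmap_enter_l2(pmap, va, new_l2, PMAP_ENTER_NOSLEEP |
	    PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, m, lockp));
}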


