Date:      Fri, 26 Oct 2012 03:02:39 +0000 (UTC)
From:      Alan Cox <alc@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject:   svn commit: r242121 - in head/sys: conf ia64/ia64 ia64/include
Message-ID:  <201210260302.q9Q32dFO007302@svn.freebsd.org>

Author: alc
Date: Fri Oct 26 03:02:39 2012
New Revision: 242121
URL: http://svn.freebsd.org/changeset/base/242121

Log:
  Port the new PV entry allocator from amd64/i386.  This allocator has two
  advantages.  First, PV entries are roughly half the size.  Second, this
  allocator doesn't access the paging queues, and thus it allows for the
  removal of the page queues lock from this pmap.
  
  Replace all uses of the page queues lock by a R/W lock that is private
  to this pmap.
  
  Tested by:	marcel
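
  [Context for the size claim above: under the old allocator, each ia64 pv
  entry carried its own pmap pointer plus linkage on both a per-page and a
  per-pmap list; in the chunk-based allocator an entry holds only the
  virtual address and the per-page linkage, and the owning pmap is recovered
  from the page-sized chunk that contains it (PV_PMAP() in the diff below).
  The host-side sketch that follows replays that arithmetic with mock
  structs; the field layouts and constants mirror the diff, but they are
  illustrative stand-ins, not the kernel's exact declarations (those live in
  the pmap.h hunk, which is truncated from this mail).]

	/*
	 * Illustrative, self-contained sketch (LP64 host) of the chunk
	 * geometry implied by the CTASSERTs in the pmap.c hunk below.
	 * Mock structs only; not the committed kernel declarations.
	 */
	#include <assert.h>
	#include <stdint.h>
	#include <stdio.h>

	#define	IA64_PAGE_SIZE	8192	/* ia64 uses 8 KB pages */
	#define	NBBY		8
	#define	_NPCM		6	/* bitmap words per chunk */
	#define	_NPCPV		337	/* pv entries per chunk */

	/* Old allocator: one UMA allocation per mapping, 48 bytes here. */
	struct old_pv_entry {
		uint64_t	pv_pmap;	/* was: pmap_t pv_pmap */
		uint64_t	pv_va;
		uint64_t	pv_list[2];	/* TAILQ_ENTRY, page's pv list */
		uint64_t	pv_plist[2];	/* TAILQ_ENTRY, pmap's pv list */
	};

	/* New allocator: the pmap is implied by the containing chunk. */
	struct new_pv_entry {
		uint64_t	pv_va;
		uint64_t	pv_list[2];	/* TAILQ_ENTRY, page's pv list */
	};

	int
	main(void)
	{
		/* PC_FREE0_4 covers five full 64-bit words; PC_FREE5 the last 17 bits. */
		uint64_t pc_free5 = (1UL << (_NPCPV - 5 * 64)) - 1;

		printf("old pv_entry: %zu bytes, new pv_entry: %zu bytes\n",
		    sizeof(struct old_pv_entry), sizeof(struct new_pv_entry));
		/* "Roughly half the size": 24 bytes versus 48 bytes. */
		assert(sizeof(struct new_pv_entry) * 2 <= sizeof(struct old_pv_entry));
		assert(5 * 64 + 17 == _NPCPV);		/* CTASSERT(_NPCPV == 337) */
		assert(pc_free5 == 0x1ffffUL);		/* matches PC_FREE5 */
		assert(_NPCM * 64 >= _NPCPV);		/* bitmap covers every entry */
		/* Chunk header (pc_pmap, two TAILQ_ENTRYs, pc_map[]) plus 337
		 * entries must fit within one 8 KB page. */
		assert(8 + 16 + 16 + _NPCM * 8 +
		    _NPCPV * sizeof(struct new_pv_entry) <= IA64_PAGE_SIZE);
		return (0);
	}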

Modified:
  head/sys/conf/options.ia64
  head/sys/ia64/ia64/pmap.c
  head/sys/ia64/include/pmap.h

Modified: head/sys/conf/options.ia64
==============================================================================
--- head/sys/conf/options.ia64	Fri Oct 26 02:50:16 2012	(r242120)
+++ head/sys/conf/options.ia64	Fri Oct 26 03:02:39 2012	(r242121)
@@ -11,6 +11,8 @@ UWX_TRACE_ENABLE	opt_global.h
 
 COMPAT_FREEBSD32	opt_compat.h
 
+PV_STATS		opt_pmap.h
+
 EXCEPTION_TRACING	opt_xtrace.h
 
 VGA_ALT_SEQACCESS	opt_vga.h

Modified: head/sys/ia64/ia64/pmap.c
==============================================================================
--- head/sys/ia64/ia64/pmap.c	Fri Oct 26 02:50:16 2012	(r242120)
+++ head/sys/ia64/ia64/pmap.c	Fri Oct 26 03:02:39 2012	(r242121)
@@ -48,12 +48,15 @@
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
+#include "opt_pmap.h"
+
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/mman.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
+#include <sys/rwlock.h>
 #include <sys/smp.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
@@ -108,16 +111,18 @@ __FBSDID("$FreeBSD$");
 /* XXX move to a header. */
 extern uint64_t ia64_gateway_page[];
 
-#ifndef PMAP_SHPGPERPROC
-#define PMAP_SHPGPERPROC 200
-#endif
-
 #if !defined(DIAGNOSTIC)
 #define PMAP_INLINE __inline
 #else
 #define PMAP_INLINE
 #endif
 
+#ifdef PV_STATS
+#define PV_STAT(x)	do { x ; } while (0)
+#else
+#define PV_STAT(x)	do { } while (0)
+#endif
+
 #define	pmap_accessed(lpte)		((lpte)->pte & PTE_ACCESSED)
 #define	pmap_dirty(lpte)		((lpte)->pte & PTE_DIRTY)
 #define	pmap_exec(lpte)			((lpte)->pte & PTE_AR_RX)
@@ -187,10 +192,21 @@ static uint64_t *pmap_ridmap;
 struct mtx pmap_ridmutex;
 
 /*
+ * Isolate the global pv list lock from data and other locks to prevent false
+ * sharing within the cache.
+ */
+static struct {
+	struct rwlock	lock;
+	char		padding[CACHE_LINE_SIZE - sizeof(struct rwlock)];
+} pvh_global __aligned(CACHE_LINE_SIZE);
+
+#define	pvh_global_lock	pvh_global.lock
+
+/*
  * Data for the pv entry allocation mechanism
  */
-static uma_zone_t pvzone;
-static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0;
+static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks);
+static int pv_entry_count;
 
 /*
  * Data for allocating PTEs for user processes.
@@ -224,8 +240,10 @@ SYSCTL_PROC(_machdep_vhpt, OID_AUTO, pop
 
 static struct ia64_lpte *pmap_find_vhpt(vm_offset_t va);
 
-static PMAP_INLINE void	free_pv_entry(pv_entry_t pv);
-static pv_entry_t get_pv_entry(pmap_t locked_pmap);
+static void free_pv_chunk(struct pv_chunk *pc);
+static void free_pv_entry(pmap_t pmap, pv_entry_t pv);
+static pv_entry_t get_pv_entry(pmap_t pmap, boolean_t try);
+static vm_page_t pmap_pv_reclaim(pmap_t locked_pmap);
 
 static void	pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
 		    vm_page_t m, vm_prot_t prot);
@@ -402,9 +420,14 @@ pmap_bootstrap()
 	PMAP_LOCK_INIT(kernel_pmap);
 	for (i = 0; i < IA64_VM_MINKERN_REGION; i++)
 		kernel_pmap->pm_rid[i] = 0;
-	TAILQ_INIT(&kernel_pmap->pm_pvlist);
+	TAILQ_INIT(&kernel_pmap->pm_pvchunk);
 	PCPU_SET(md.current_pmap, kernel_pmap);
 
+ 	/*
+	 * Initialize the global pv list lock.
+	 */
+	rw_init(&pvh_global_lock, "pmap pv global");
+
 	/* Region 5 is mapped via the VHPT. */
 	ia64_set_rr(IA64_RR_BASE(5), (5 << 8) | (PAGE_SHIFT << 2) | 1);
 
@@ -449,7 +472,6 @@ pmap_page_init(vm_page_t m)
 {
 
 	TAILQ_INIT(&m->md.pv_list);
-	m->md.pv_list_count = 0;
 	m->md.memattr = VM_MEMATTR_DEFAULT;
 }
 
@@ -461,19 +483,6 @@ pmap_page_init(vm_page_t m)
 void
 pmap_init(void)
 {
-	int shpgperproc = PMAP_SHPGPERPROC;
-
-	/*
-	 * Initialize the address space (zone) for the pv entries.  Set a
-	 * high water mark so that the system can recover from excessive
-	 * numbers of pv entries.
-	 */
-	pvzone = uma_zcreate("PV ENTRY", sizeof(struct pv_entry), NULL, NULL,
-	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM | UMA_ZONE_NOFREE);
-	TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc);
-	pv_entry_max = shpgperproc * maxproc + cnt.v_page_count;
-	TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max);
-	pv_entry_high_water = 9 * (pv_entry_max / 10);
 
 	ptezone = uma_zcreate("PT ENTRY", sizeof (struct ia64_lpte), 
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM|UMA_ZONE_NOFREE);
@@ -615,7 +624,7 @@ pmap_pinit(struct pmap *pmap)
 	PMAP_LOCK_INIT(pmap);
 	for (i = 0; i < IA64_VM_MINKERN_REGION; i++)
 		pmap->pm_rid[i] = pmap_allocate_rid();
-	TAILQ_INIT(&pmap->pm_pvlist);
+	TAILQ_INIT(&pmap->pm_pvchunk);
 	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
 	return (1);
 }
@@ -683,14 +692,222 @@ pmap_growkernel(vm_offset_t addr)
  * page management routines.
  ***************************************************/
 
+CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE);
+CTASSERT(_NPCM == 6);
+CTASSERT(_NPCPV == 337);
+
+static __inline struct pv_chunk *
+pv_to_chunk(pv_entry_t pv)
+{
+
+	return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK));
+}
+
+#define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap)
+
+#define	PC_FREE0_4	0xfffffffffffffffful
+#define	PC_FREE5	0x000000000001fffful
+
+static const u_long pc_freemask[_NPCM] = {
+	PC_FREE0_4, PC_FREE0_4, PC_FREE0_4,
+	PC_FREE0_4, PC_FREE0_4, PC_FREE5
+};
+
+static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters");
+
+SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
+    "Current number of pv entries");
+
+#ifdef PV_STATS
+static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;
+
+SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
+    "Current number of pv entry chunks");
+SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0,
+    "Current number of pv entry chunks allocated");
+SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
+    "Current number of pv entry chunks frees");
+SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0,
+    "Number of times tried to get a chunk page but failed.");
+
+static long pv_entry_frees, pv_entry_allocs;
+static int pv_entry_spare;
+
+SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
+    "Current number of pv entry frees");
+SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0,
+    "Current number of pv entry allocs");
+SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
+    "Current number of spare pv entries");
+#endif
+
+/*
+ * We are in a serious low memory condition.  Resort to
+ * drastic measures to free some pages so we can allocate
+ * another pv entry chunk.
+ */
+static vm_page_t
+pmap_pv_reclaim(pmap_t locked_pmap)
+{
+	struct pch newtail;
+	struct pv_chunk *pc;
+	struct ia64_lpte *pte;
+	pmap_t pmap;
+	pv_entry_t pv;
+	vm_offset_t va;
+	vm_page_t m, m_pc;
+	u_long inuse;
+	int bit, field, freed, idx;
+
+	PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
+	pmap = NULL;
+	m_pc = NULL;
+	TAILQ_INIT(&newtail);
+	while ((pc = TAILQ_FIRST(&pv_chunks)) != NULL) {
+		TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
+		if (pmap != pc->pc_pmap) {
+			if (pmap != NULL) {
+				if (pmap != locked_pmap) {
+					pmap_switch(locked_pmap);
+					PMAP_UNLOCK(pmap);
+				}
+			}
+			pmap = pc->pc_pmap;
+			/* Avoid deadlock and lock recursion. */
+			if (pmap > locked_pmap)
+				PMAP_LOCK(pmap);
+			else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap)) {
+				pmap = NULL;
+				TAILQ_INSERT_TAIL(&newtail, pc, pc_lru);
+				continue;
+			}
+			pmap_switch(pmap);
+		}
+
+		/*
+		 * Destroy every non-wired, 8 KB page mapping in the chunk.
+		 */
+		freed = 0;
+		for (field = 0; field < _NPCM; field++) {
+			for (inuse = ~pc->pc_map[field] & pc_freemask[field];
+			    inuse != 0; inuse &= ~(1UL << bit)) {
+				bit = ffsl(inuse) - 1;
+				idx = field * sizeof(inuse) * NBBY + bit;
+				pv = &pc->pc_pventry[idx];
+				va = pv->pv_va;
+				pte = pmap_find_vhpt(va);
+				KASSERT(pte != NULL, ("pte"));
+				if (pmap_wired(pte))
+					continue;
+				pmap_remove_vhpt(va);
+				pmap_invalidate_page(va);
+				m = PHYS_TO_VM_PAGE(pmap_ppn(pte));
+				if (pmap_accessed(pte))
+					vm_page_aflag_set(m, PGA_REFERENCED);
+				if (pmap_dirty(pte))
+					vm_page_dirty(m);
+				pmap_free_pte(pte, va);
+				TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
+				if (TAILQ_EMPTY(&m->md.pv_list))
+					vm_page_aflag_clear(m, PGA_WRITEABLE);
+				pc->pc_map[field] |= 1UL << bit;
+				freed++;
+			}
+		}
+		if (freed == 0) {
+			TAILQ_INSERT_TAIL(&newtail, pc, pc_lru);
+			continue;
+		}
+		/* Every freed mapping is for a 8 KB page. */
+		pmap->pm_stats.resident_count -= freed;
+		PV_STAT(pv_entry_frees += freed);
+		PV_STAT(pv_entry_spare += freed);
+		pv_entry_count -= freed;
+		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
+		for (field = 0; field < _NPCM; field++)
+			if (pc->pc_map[field] != pc_freemask[field]) {
+				TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc,
+				    pc_list);
+				TAILQ_INSERT_TAIL(&newtail, pc, pc_lru);
+
+				/*
+				 * One freed pv entry in locked_pmap is
+				 * sufficient.
+				 */
+				if (pmap == locked_pmap)
+					goto out;
+				break;
+			}
+		if (field == _NPCM) {
+			PV_STAT(pv_entry_spare -= _NPCPV);
+			PV_STAT(pc_chunk_count--);
+			PV_STAT(pc_chunk_frees++);
+			/* Entire chunk is free; return it. */
+			m_pc = PHYS_TO_VM_PAGE(IA64_RR_MASK((vm_offset_t)pc));
+			break;
+		}
+	}
+out:
+	TAILQ_CONCAT(&pv_chunks, &newtail, pc_lru);
+	if (pmap != NULL) {
+		if (pmap != locked_pmap) {
+			pmap_switch(locked_pmap);
+			PMAP_UNLOCK(pmap);
+		}
+	}
+	return (m_pc);
+}
+
 /*
  * free the pv_entry back to the free list
  */
-static PMAP_INLINE void
-free_pv_entry(pv_entry_t pv)
+static void
+free_pv_entry(pmap_t pmap, pv_entry_t pv)
 {
+	struct pv_chunk *pc;
+	int bit, field, idx;
+
+	rw_assert(&pvh_global_lock, RA_WLOCKED);
+	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+	PV_STAT(pv_entry_frees++);
+	PV_STAT(pv_entry_spare++);
 	pv_entry_count--;
-	uma_zfree(pvzone, pv);
+	pc = pv_to_chunk(pv);
+	idx = pv - &pc->pc_pventry[0];
+	field = idx / (sizeof(u_long) * NBBY);
+	bit = idx % (sizeof(u_long) * NBBY);
+	pc->pc_map[field] |= 1ul << bit;
+	for (idx = 0; idx < _NPCM; idx++)
+		if (pc->pc_map[idx] != pc_freemask[idx]) {
+			/*
+			 * 98% of the time, pc is already at the head of the
+			 * list.  If it isn't already, move it to the head.
+			 */
+			if (__predict_false(TAILQ_FIRST(&pmap->pm_pvchunk) !=
+			    pc)) {
+				TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
+				TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc,
+				    pc_list);
+			}
+			return;
+		}
+	TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
+	free_pv_chunk(pc);
+}
+
+static void
+free_pv_chunk(struct pv_chunk *pc)
+{
+	vm_page_t m;
+
+ 	TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
+	PV_STAT(pv_entry_spare -= _NPCPV);
+	PV_STAT(pc_chunk_count--);
+	PV_STAT(pc_chunk_frees++);
+	/* entire chunk is free, return it */
+	m = PHYS_TO_VM_PAGE(IA64_RR_MASK((vm_offset_t)pc));
+	vm_page_unwire(m, 0);
+	vm_page_free(m);
 }
 
 /*
@@ -698,82 +915,67 @@ free_pv_entry(pv_entry_t pv)
  * when needed.
  */
 static pv_entry_t
-get_pv_entry(pmap_t locked_pmap)
+get_pv_entry(pmap_t pmap, boolean_t try)
 {
-	static const struct timeval printinterval = { 60, 0 };
-	static struct timeval lastprint;
-	struct vpgqueues *vpq;
-	struct ia64_lpte *pte;
-	pmap_t oldpmap, pmap;
-	pv_entry_t allocated_pv, next_pv, pv;
-	vm_offset_t va;
+	struct pv_chunk *pc;
+	pv_entry_t pv;
 	vm_page_t m;
+	int bit, field, idx;
 
-	PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
-	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
-	allocated_pv = uma_zalloc(pvzone, M_NOWAIT);
-	if (allocated_pv != NULL) {
-		pv_entry_count++;
-		if (pv_entry_count > pv_entry_high_water)
-			pagedaemon_wakeup();
-		else
-			return (allocated_pv);
-	}
-
-	/*
-	 * Reclaim pv entries: At first, destroy mappings to inactive
-	 * pages.  After that, if a pv entry is still needed, destroy
-	 * mappings to active pages.
-	 */
-	if (ratecheck(&lastprint, &printinterval))
-		printf("Approaching the limit on PV entries, "
-		    "increase the vm.pmap.shpgperproc tunable.\n");
-	vpq = &vm_page_queues[PQ_INACTIVE];
+	rw_assert(&pvh_global_lock, RA_WLOCKED);
+	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+	PV_STAT(pv_entry_allocs++);
+	pv_entry_count++;
 retry:
-	TAILQ_FOREACH(m, &vpq->pl, pageq) {
-		if ((m->flags & PG_MARKER) != 0 || m->hold_count || m->busy)
-			continue;
-		TAILQ_FOREACH_SAFE(pv, &m->md.pv_list, pv_list, next_pv) {
-			va = pv->pv_va;
-			pmap = pv->pv_pmap;
-			/* Avoid deadlock and lock recursion. */
-			if (pmap > locked_pmap)
-				PMAP_LOCK(pmap);
-			else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap))
-				continue;
-			pmap->pm_stats.resident_count--;
-			oldpmap = pmap_switch(pmap);
-			pte = pmap_find_vhpt(va);
-			KASSERT(pte != NULL, ("pte"));
-			pmap_remove_vhpt(va);
-			pmap_invalidate_page(va);
-			pmap_switch(oldpmap);
-			if (pmap_accessed(pte))
-				vm_page_aflag_set(m, PGA_REFERENCED);
-			if (pmap_dirty(pte))
-				vm_page_dirty(m);
-			pmap_free_pte(pte, va);
-			TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist);
-			m->md.pv_list_count--;
-			TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
-			if (pmap != locked_pmap)
-				PMAP_UNLOCK(pmap);
-			if (allocated_pv == NULL)
-				allocated_pv = pv;
-			else
-				free_pv_entry(pv);
+	pc = TAILQ_FIRST(&pmap->pm_pvchunk);
+	if (pc != NULL) {
+		for (field = 0; field < _NPCM; field++) {
+			if (pc->pc_map[field]) {
+				bit = ffsl(pc->pc_map[field]) - 1;
+				break;
+			}
+		}
+		if (field < _NPCM) {
+			idx = field * sizeof(pc->pc_map[field]) * NBBY + bit;
+			pv = &pc->pc_pventry[idx];
+			pc->pc_map[field] &= ~(1ul << bit);
+			/* If this was the last item, move it to tail */
+			for (field = 0; field < _NPCM; field++)
+				if (pc->pc_map[field] != 0) {
+					PV_STAT(pv_entry_spare--);
+					return (pv);	/* not full, return */
+				}
+			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
+			TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
+			PV_STAT(pv_entry_spare--);
+			return (pv);
 		}
-		if (TAILQ_EMPTY(&m->md.pv_list))
-			vm_page_aflag_clear(m, PGA_WRITEABLE);
 	}
-	if (allocated_pv == NULL) {
-		if (vpq == &vm_page_queues[PQ_INACTIVE]) {
-			vpq = &vm_page_queues[PQ_ACTIVE];
-			goto retry;
+	/* No free items, allocate another chunk */
+	m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
+	    VM_ALLOC_WIRED);
+	if (m == NULL) {
+		if (try) {
+			pv_entry_count--;
+			PV_STAT(pc_chunk_tryfail++);
+			return (NULL);
 		}
-		panic("get_pv_entry: increase the vm.pmap.shpgperproc tunable");
+		m = pmap_pv_reclaim(pmap);
+		if (m == NULL)
+			goto retry;
 	}
-	return (allocated_pv);
+	PV_STAT(pc_chunk_count++);
+	PV_STAT(pc_chunk_allocs++);
+	pc = (struct pv_chunk *)IA64_PHYS_TO_RR7(VM_PAGE_TO_PHYS(m));
+	pc->pc_pmap = pmap;
+	pc->pc_map[0] = pc_freemask[0] & ~1ul;	/* preallocated bit 0 */
+	for (field = 1; field < _NPCM; field++)
+		pc->pc_map[field] = pc_freemask[field];
+	TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
+	pv = &pc->pc_pventry[0];
+	TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
+	PV_STAT(pv_entry_spare += _NPCPV - 1);
+	return (pv);
 }
 
 /*
@@ -785,15 +987,10 @@ pmap_try_insert_pv_entry(pmap_t pmap, vm
 	pv_entry_t pv;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
-	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
-	if (pv_entry_count < pv_entry_high_water && 
-	    (pv = uma_zalloc(pvzone, M_NOWAIT)) != NULL) {
-		pv_entry_count++;
+	rw_assert(&pvh_global_lock, RA_WLOCKED);
+	if ((pv = get_pv_entry(pmap, TRUE)) != NULL) {
 		pv->pv_va = va;
-		pv->pv_pmap = pmap;
-		TAILQ_INSERT_TAIL(&pmap->pm_pvlist, pv, pv_plist);
 		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
-		m->md.pv_list_count++;
 		return (TRUE);
 	} else
 		return (FALSE);
@@ -899,28 +1096,21 @@ pmap_find_vhpt(vm_offset_t va)
 static int
 pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va, pv_entry_t pv)
 {
+
+	rw_assert(&pvh_global_lock, RA_WLOCKED);
 	if (!pv) {
-		if (m->md.pv_list_count < pmap->pm_stats.resident_count) {
-			TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
-				if (pmap == pv->pv_pmap && va == pv->pv_va) 
-					break;
-			}
-		} else {
-			TAILQ_FOREACH(pv, &pmap->pm_pvlist, pv_plist) {
-				if (va == pv->pv_va) 
-					break;
-			}
+		TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
+			if (pmap == PV_PMAP(pv) && va == pv->pv_va) 
+				break;
 		}
 	}
 
 	if (pv) {
 		TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
-		m->md.pv_list_count--;
 		if (TAILQ_FIRST(&m->md.pv_list) == NULL)
 			vm_page_aflag_clear(m, PGA_WRITEABLE);
 
-		TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist);
-		free_pv_entry(pv);
+		free_pv_entry(pmap, pv);
 		return 0;
 	} else {
 		return ENOENT;
@@ -936,15 +1126,10 @@ pmap_insert_entry(pmap_t pmap, vm_offset
 {
 	pv_entry_t pv;
 
-	pv = get_pv_entry(pmap);
-	pv->pv_pmap = pmap;
+	rw_assert(&pvh_global_lock, RA_WLOCKED);
+	pv = get_pv_entry(pmap, FALSE);
 	pv->pv_va = va;
-
-	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
-	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
-	TAILQ_INSERT_TAIL(&pmap->pm_pvlist, pv, pv_plist);
 	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
-	m->md.pv_list_count++;
 }
 
 /*
@@ -1317,53 +1502,32 @@ pmap_map(vm_offset_t *virt, vm_offset_t 
  *
  *	It is assumed that the start and end are properly
  *	rounded to the page size.
+ *
+ *	Sparsely used ranges are inefficiently removed.  The VHPT is
+ *	probed for every page within the range.  XXX
  */
 void
 pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 {
 	pmap_t oldpmap;
 	vm_offset_t va;
-	pv_entry_t npv, pv;
 	struct ia64_lpte *pte;
 
+	/*
+	 * Perform an unsynchronized read.  This is, however, safe.
+	 */
 	if (pmap->pm_stats.resident_count == 0)
 		return;
 
-	vm_page_lock_queues();
+	rw_wlock(&pvh_global_lock);
 	PMAP_LOCK(pmap);
 	oldpmap = pmap_switch(pmap);
-
-	/*
-	 * special handling of removing one page.  a very
-	 * common operation and easy to short circuit some
-	 * code.
-	 */
-	if (sva + PAGE_SIZE == eva) {
-		pte = pmap_find_vhpt(sva);
+	for (va = sva; va < eva; va += PAGE_SIZE) {
+		pte = pmap_find_vhpt(va);
 		if (pte != NULL)
-			pmap_remove_pte(pmap, pte, sva, 0, 1);
-		goto out;
-	}
-
-	if (pmap->pm_stats.resident_count < ((eva - sva) >> PAGE_SHIFT)) {
-		TAILQ_FOREACH_SAFE(pv, &pmap->pm_pvlist, pv_plist, npv) {
-			va = pv->pv_va;
-			if (va >= sva && va < eva) {
-				pte = pmap_find_vhpt(va);
-				KASSERT(pte != NULL, ("pte"));
-				pmap_remove_pte(pmap, pte, va, pv, 1);
-			}
-		}
-	} else {
-		for (va = sva; va < eva; va += PAGE_SIZE) {
-			pte = pmap_find_vhpt(va);
-			if (pte != NULL)
-				pmap_remove_pte(pmap, pte, va, 0, 1);
-		}
+			pmap_remove_pte(pmap, pte, va, 0, 1);
 	}
-
-out:
-	vm_page_unlock_queues();
+	rw_wunlock(&pvh_global_lock);
 	pmap_switch(oldpmap);
 	PMAP_UNLOCK(pmap);
 }
@@ -1389,10 +1553,10 @@ pmap_remove_all(vm_page_t m)
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_remove_all: page %p is not managed", m));
-	vm_page_lock_queues();
+	rw_wlock(&pvh_global_lock);
 	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
 		struct ia64_lpte *pte;
-		pmap_t pmap = pv->pv_pmap;
+		pmap_t pmap = PV_PMAP(pv);
 		vm_offset_t va = pv->pv_va;
 
 		PMAP_LOCK(pmap);
@@ -1406,7 +1570,7 @@ pmap_remove_all(vm_page_t m)
 		PMAP_UNLOCK(pmap);
 	}
 	vm_page_aflag_clear(m, PGA_WRITEABLE);
-	vm_page_unlock_queues();
+	rw_wunlock(&pvh_global_lock);
 }
 
 /*
@@ -1485,7 +1649,7 @@ pmap_enter(pmap_t pmap, vm_offset_t va, 
 	struct ia64_lpte *pte;
 	boolean_t icache_inval, managed;
 
-	vm_page_lock_queues();
+	rw_wlock(&pvh_global_lock);
 	PMAP_LOCK(pmap);
 	oldpmap = pmap_switch(pmap);
 
@@ -1500,9 +1664,9 @@ pmap_enter(pmap_t pmap, vm_offset_t va, 
 	while ((pte = pmap_find_pte(va)) == NULL) {
 		pmap_switch(oldpmap);
 		PMAP_UNLOCK(pmap);
-		vm_page_unlock_queues();
+		rw_wunlock(&pvh_global_lock);
 		VM_WAIT;
-		vm_page_lock_queues();
+		rw_wlock(&pvh_global_lock);
 		PMAP_LOCK(pmap);
 		oldpmap = pmap_switch(pmap);
 	}
@@ -1591,7 +1755,7 @@ validate:
 
 	if ((prot & VM_PROT_WRITE) != 0 && managed)
 		vm_page_aflag_set(m, PGA_WRITEABLE);
-	vm_page_unlock_queues();
+	rw_wunlock(&pvh_global_lock);
 	pmap_switch(oldpmap);
 	PMAP_UNLOCK(pmap);
 }
@@ -1619,14 +1783,14 @@ pmap_enter_object(pmap_t pmap, vm_offset
 	VM_OBJECT_LOCK_ASSERT(m_start->object, MA_OWNED);
 	psize = atop(end - start);
 	m = m_start;
-	vm_page_lock_queues();
+	rw_wlock(&pvh_global_lock);
 	PMAP_LOCK(pmap);
 	oldpmap = pmap_switch(pmap);
 	while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
 		pmap_enter_quick_locked(pmap, start + ptoa(diff), m, prot);
 		m = TAILQ_NEXT(m, listq);
 	}
-	vm_page_unlock_queues();
+	rw_wunlock(&pvh_global_lock);
 	pmap_switch(oldpmap);
  	PMAP_UNLOCK(pmap);
 }
@@ -1645,11 +1809,11 @@ pmap_enter_quick(pmap_t pmap, vm_offset_
 {
 	pmap_t oldpmap;
 
-	vm_page_lock_queues();
+	rw_wlock(&pvh_global_lock);
 	PMAP_LOCK(pmap);
 	oldpmap = pmap_switch(pmap);
 	pmap_enter_quick_locked(pmap, va, m, prot);
-	vm_page_unlock_queues();
+	rw_wunlock(&pvh_global_lock);
 	pmap_switch(oldpmap);
 	PMAP_UNLOCK(pmap);
 }
@@ -1664,7 +1828,7 @@ pmap_enter_quick_locked(pmap_t pmap, vm_
 	KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva ||
 	    (m->oflags & VPO_UNMANAGED) != 0,
 	    ("pmap_enter_quick_locked: managed mapping within the clean submap"));
-	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
+	rw_assert(&pvh_global_lock, RA_WLOCKED);
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 
 	if ((pte = pmap_find_pte(va)) == NULL)
@@ -1845,9 +2009,9 @@ pmap_page_exists_quick(pmap_t pmap, vm_p
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_page_exists_quick: page %p is not managed", m));
 	rv = FALSE;
-	vm_page_lock_queues();
+	rw_wlock(&pvh_global_lock);
 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
-		if (pv->pv_pmap == pmap) {
+		if (PV_PMAP(pv) == pmap) {
 			rv = TRUE;
 			break;
 		}
@@ -1855,7 +2019,7 @@ pmap_page_exists_quick(pmap_t pmap, vm_p
 		if (loops >= 16)
 			break;
 	}
-	vm_page_unlock_queues();
+	rw_wunlock(&pvh_global_lock);
 	return (rv);
 }
 
@@ -1876,9 +2040,9 @@ pmap_page_wired_mappings(vm_page_t m)
 	count = 0;
 	if ((m->oflags & VPO_UNMANAGED) != 0)
 		return (count);
-	vm_page_lock_queues();
+	rw_wlock(&pvh_global_lock);
 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
-		pmap = pv->pv_pmap;
+		pmap = PV_PMAP(pv);
 		PMAP_LOCK(pmap);
 		oldpmap = pmap_switch(pmap);
 		pte = pmap_find_vhpt(pv->pv_va);
@@ -1888,7 +2052,7 @@ pmap_page_wired_mappings(vm_page_t m)
 		pmap_switch(oldpmap);
 		PMAP_UNLOCK(pmap);
 	}
-	vm_page_unlock_queues();
+	rw_wunlock(&pvh_global_lock);
 	return (count);
 }
 
@@ -1903,33 +2067,62 @@ pmap_page_wired_mappings(vm_page_t m)
 void
 pmap_remove_pages(pmap_t pmap)
 {
-	pmap_t oldpmap;
-	pv_entry_t pv, npv;
+	struct pv_chunk *pc, *npc;
+	struct ia64_lpte *pte;
+	pv_entry_t pv;
+	vm_offset_t va;
+	vm_page_t m;
+	u_long inuse, bitmask;
+	int allfree, bit, field, idx;
 
 	if (pmap != vmspace_pmap(curthread->td_proc->p_vmspace)) {
 		printf("warning: %s called with non-current pmap\n",
 		    __func__);
 		return;
 	}
-
-	vm_page_lock_queues();
+	rw_wlock(&pvh_global_lock);
 	PMAP_LOCK(pmap);
-	oldpmap = pmap_switch(pmap);
-
-	for (pv = TAILQ_FIRST(&pmap->pm_pvlist); pv; pv = npv) {
-		struct ia64_lpte *pte;
-
-		npv = TAILQ_NEXT(pv, pv_plist);
-
-		pte = pmap_find_vhpt(pv->pv_va);
-		KASSERT(pte != NULL, ("pte"));
-		if (!pmap_wired(pte))
-			pmap_remove_pte(pmap, pte, pv->pv_va, pv, 1);
+	TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
+		allfree = 1;
+		for (field = 0; field < _NPCM; field++) {
+			inuse = ~pc->pc_map[field] & pc_freemask[field];
+			while (inuse != 0) {
+				bit = ffsl(inuse) - 1;
+				bitmask = 1UL << bit;
+				idx = field * sizeof(inuse) * NBBY + bit;
+				pv = &pc->pc_pventry[idx];
+				inuse &= ~bitmask;
+				va = pv->pv_va;
+				pte = pmap_find_vhpt(va);
+				KASSERT(pte != NULL, ("pte"));
+				if (pmap_wired(pte)) {
+					allfree = 0;
+					continue;
+				}
+				pmap_remove_vhpt(va);
+				pmap_invalidate_page(va);
+				m = PHYS_TO_VM_PAGE(pmap_ppn(pte));
+				if (pmap_dirty(pte))
+					vm_page_dirty(m);
+				pmap_free_pte(pte, va);
+				/* Mark free */
+				PV_STAT(pv_entry_frees++);
+				PV_STAT(pv_entry_spare++);
+				pv_entry_count--;
+				pc->pc_map[field] |= bitmask;
+				pmap->pm_stats.resident_count--;
+				TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
+				if (TAILQ_EMPTY(&m->md.pv_list))
+					vm_page_aflag_clear(m, PGA_WRITEABLE);
+			}
+		}
+		if (allfree) {
+			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
+			free_pv_chunk(pc);
+		}
 	}
-
-	pmap_switch(oldpmap);
+	rw_wunlock(&pvh_global_lock);
 	PMAP_UNLOCK(pmap);
-	vm_page_unlock_queues();
 }
 
 /*
@@ -1948,16 +2141,17 @@ int
 pmap_ts_referenced(vm_page_t m)
 {
 	struct ia64_lpte *pte;
-	pmap_t oldpmap;
+	pmap_t oldpmap, pmap;
 	pv_entry_t pv;
 	int count = 0;
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_ts_referenced: page %p is not managed", m));
-	vm_page_lock_queues();
+	rw_wlock(&pvh_global_lock);
 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
-		PMAP_LOCK(pv->pv_pmap);
-		oldpmap = pmap_switch(pv->pv_pmap);
+		pmap = PV_PMAP(pv);
+		PMAP_LOCK(pmap);
+		oldpmap = pmap_switch(pmap);
 		pte = pmap_find_vhpt(pv->pv_va);
 		KASSERT(pte != NULL, ("pte"));
 		if (pmap_accessed(pte)) {
@@ -1966,9 +2160,9 @@ pmap_ts_referenced(vm_page_t m)
 			pmap_invalidate_page(pv->pv_va);
 		}
 		pmap_switch(oldpmap);
-		PMAP_UNLOCK(pv->pv_pmap);
+		PMAP_UNLOCK(pmap);
 	}
-	vm_page_unlock_queues();
+	rw_wunlock(&pvh_global_lock);
 	return (count);
 }
 
@@ -1982,7 +2176,7 @@ boolean_t
 pmap_is_modified(vm_page_t m)
 {
 	struct ia64_lpte *pte;
-	pmap_t oldpmap;
+	pmap_t oldpmap, pmap;
 	pv_entry_t pv;
 	boolean_t rv;
 
@@ -1999,19 +2193,20 @@ pmap_is_modified(vm_page_t m)
 	if ((m->oflags & VPO_BUSY) == 0 &&
 	    (m->aflags & PGA_WRITEABLE) == 0)
 		return (rv);
-	vm_page_lock_queues();
+	rw_wlock(&pvh_global_lock);
 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
-		PMAP_LOCK(pv->pv_pmap);
-		oldpmap = pmap_switch(pv->pv_pmap);
+		pmap = PV_PMAP(pv);
+		PMAP_LOCK(pmap);
+		oldpmap = pmap_switch(pmap);
 		pte = pmap_find_vhpt(pv->pv_va);
 		pmap_switch(oldpmap);
 		KASSERT(pte != NULL, ("pte"));
 		rv = pmap_dirty(pte) ? TRUE : FALSE;
-		PMAP_UNLOCK(pv->pv_pmap);
+		PMAP_UNLOCK(pmap);
 		if (rv)
 			break;
 	}
-	vm_page_unlock_queues();
+	rw_wunlock(&pvh_global_lock);
 	return (rv);
 }
 
@@ -2042,26 +2237,27 @@ boolean_t
 pmap_is_referenced(vm_page_t m)
 {
 	struct ia64_lpte *pte;
-	pmap_t oldpmap;
+	pmap_t oldpmap, pmap;
 	pv_entry_t pv;
 	boolean_t rv;
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_is_referenced: page %p is not managed", m));
 	rv = FALSE;
-	vm_page_lock_queues();
+	rw_wlock(&pvh_global_lock);
 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
-		PMAP_LOCK(pv->pv_pmap);
-		oldpmap = pmap_switch(pv->pv_pmap);
+		pmap = PV_PMAP(pv);
+		PMAP_LOCK(pmap);
+		oldpmap = pmap_switch(pmap);
 		pte = pmap_find_vhpt(pv->pv_va);
 		pmap_switch(oldpmap);
 		KASSERT(pte != NULL, ("pte"));
 		rv = pmap_accessed(pte) ? TRUE : FALSE;
-		PMAP_UNLOCK(pv->pv_pmap);
+		PMAP_UNLOCK(pmap);
 		if (rv)
 			break;
 	}
-	vm_page_unlock_queues();
+	rw_wunlock(&pvh_global_lock);
 	return (rv);
 }
 
@@ -2072,7 +2268,7 @@ void
 pmap_clear_modify(vm_page_t m)
 {
 	struct ia64_lpte *pte;
-	pmap_t oldpmap;
+	pmap_t oldpmap, pmap;
 	pv_entry_t pv;
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
@@ -2088,10 +2284,11 @@ pmap_clear_modify(vm_page_t m)
 	 */
 	if ((m->aflags & PGA_WRITEABLE) == 0)
 		return;
-	vm_page_lock_queues();
+	rw_wlock(&pvh_global_lock);
 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
-		PMAP_LOCK(pv->pv_pmap);
-		oldpmap = pmap_switch(pv->pv_pmap);
+		pmap = PV_PMAP(pv);
+		PMAP_LOCK(pmap);
+		oldpmap = pmap_switch(pmap);
 		pte = pmap_find_vhpt(pv->pv_va);
 		KASSERT(pte != NULL, ("pte"));
 		if (pmap_dirty(pte)) {
@@ -2099,9 +2296,9 @@ pmap_clear_modify(vm_page_t m)
 			pmap_invalidate_page(pv->pv_va);
 		}
 		pmap_switch(oldpmap);
-		PMAP_UNLOCK(pv->pv_pmap);
+		PMAP_UNLOCK(pmap);
 	}
-	vm_page_unlock_queues();
+	rw_wunlock(&pvh_global_lock);
 }
 
 /*
@@ -2113,15 +2310,16 @@ void
 pmap_clear_reference(vm_page_t m)
 {
 	struct ia64_lpte *pte;
-	pmap_t oldpmap;
+	pmap_t oldpmap, pmap;

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
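The bitmap bookkeeping in get_pv_entry() and free_pv_entry() above maps each
entry's index within its chunk to a (word, bit) position in pc_map[], where a
set bit means the slot is free; when every word equals pc_freemask[], the whole
chunk page can be handed back to the VM system via free_pv_chunk().  The small
userland sketch below replays that index arithmetic; the constants mirror the
diff, but the program itself is illustrative only and is not part of the commit.

	/*
	 * Hedged sketch of the pc_map allocate/free arithmetic used by
	 * get_pv_entry() and free_pv_entry() in the diff above.
	 */
	#include <assert.h>
	#include <stdint.h>
	#include <strings.h>		/* ffsl() */

	#define	NBBY		8
	#define	_NPCM		6
	#define	_NPCPV		337

	#define	PC_FREE0_4	0xfffffffffffffffful
	#define	PC_FREE5	0x000000000001fffful

	static const unsigned long pc_freemask[_NPCM] = {
		PC_FREE0_4, PC_FREE0_4, PC_FREE0_4,
		PC_FREE0_4, PC_FREE0_4, PC_FREE5
	};

	int
	main(void)
	{
		unsigned long pc_map[_NPCM];
		int bit, field, idx;

		/* A fresh chunk with entry 0 handed out, as in get_pv_entry(). */
		for (field = 0; field < _NPCM; field++)
			pc_map[field] = pc_freemask[field];
		pc_map[0] &= ~1ul;

		/* Allocate: find the first word with a free bit, then the bit. */
		for (field = 0; field < _NPCM && pc_map[field] == 0; field++)
			;
		bit = ffsl((long)pc_map[field]) - 1;
		idx = field * (int)sizeof(pc_map[field]) * NBBY + bit;
		assert(idx == 1);		/* entry 1 is the next free slot */
		pc_map[field] &= ~(1ul << bit);

		/* Free: recover (field, bit) from the index, as in free_pv_entry(). */
		field = idx / (int)(sizeof(unsigned long) * NBBY);
		bit = idx % (int)(sizeof(unsigned long) * NBBY);
		pc_map[field] |= 1ul << bit;

		/* The last word only ever holds 17 bits: 337 = 5 * 64 + 17. */
		assert((pc_freemask[_NPCM - 1] & ~PC_FREE5) == 0);
		return (0);
	}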


