From owner-svn-src-stable@FreeBSD.ORG Wed Sep 5 20:40:12 2012
Delivered-To: svn-src-stable@freebsd.org
Received: from mx1.freebsd.org (mx1.freebsd.org [69.147.83.52]) by hub.freebsd.org (Postfix) with ESMTP id 5B3F7106566B; Wed, 5 Sep 2012 20:40:12 +0000 (UTC) (envelope-from kib@FreeBSD.org)
Received: from svn.freebsd.org (svn.freebsd.org [IPv6:2001:4f8:fff6::2c]) by mx1.freebsd.org (Postfix) with ESMTP id 4367E8FC0A; Wed, 5 Sep 2012 20:40:12 +0000 (UTC)
Received: from svn.freebsd.org (localhost [127.0.0.1]) by svn.freebsd.org (8.14.4/8.14.4) with ESMTP id q85KeCtO098724; Wed, 5 Sep 2012 20:40:12 GMT (envelope-from kib@svn.freebsd.org)
Received: (from kib@localhost) by svn.freebsd.org (8.14.4/8.14.4/Submit) id q85KeCTe098721; Wed, 5 Sep 2012 20:40:12 GMT (envelope-from kib@svn.freebsd.org)
Message-Id: <201209052040.q85KeCTe098721@svn.freebsd.org>
From: Konstantin Belousov
Date: Wed, 5 Sep 2012 20:40:12 +0000 (UTC)
To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-stable@freebsd.org, svn-src-stable-9@freebsd.org
X-SVN-Group: stable-9
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Subject: svn commit: r240151 - in stable/9/sys: amd64/amd64 amd64/include i386/i386 i386/xen kern mips/mips sparc64/sparc64 vm
X-BeenThere: svn-src-stable@freebsd.org
X-Mailman-Version: 2.1.5
Precedence: list
List-Id: SVN commit messages for all the -stable branches of the src tree
X-List-Received-Date: Wed, 05 Sep 2012 20:40:12 -0000

Author: kib
Date: Wed Sep 5 20:40:11 2012
New Revision: 240151
URL: http://svn.freebsd.org/changeset/base/240151

Log:
  MFC r233122,r237086,r237228,r237264,r237290,r237404,r237414,r237513,r237551,
  r237592,r237604,r237623,r237684,r237733,r237813,r237855,r238124,r238126,
  r238163,r238414,r238610,r238889,r238970,r239072,r239137,r240126 (all by alc):

  Add fine-grained PV chunk and list locking to the amd64 pmap, enabling
  concurrent execution of the following functions on different pmaps:

	pmap_change_wiring()
	pmap_copy()
	pmap_enter()
	pmap_enter_object()
	pmap_enter_quick()
	pmap_page_exists_quick()
	pmap_page_is_mapped()
	pmap_protect()
	pmap_remove()
	pmap_remove_pages()

  Requested and approved by:	alc

Modified:
  stable/9/sys/amd64/amd64/pmap.c
  stable/9/sys/amd64/include/cpufunc.h
  stable/9/sys/i386/i386/pmap.c
  stable/9/sys/i386/xen/pmap.c
  stable/9/sys/kern/subr_witness.c
  stable/9/sys/mips/mips/pmap.c
  stable/9/sys/sparc64/sparc64/pmap.c
  stable/9/sys/vm/vm_map.c
  stable/9/sys/vm/vm_page.c

Directory Properties:
  stable/9/sys/   (props changed)

Modified: stable/9/sys/amd64/amd64/pmap.c ============================================================================== --- stable/9/sys/amd64/amd64/pmap.c Wed Sep 5 19:01:39 2012 (r240150) +++ stable/9/sys/amd64/amd64/pmap.c Wed Sep 5 20:40:11 2012 (r240151) @@ -168,6 +168,39 @@ __FBSDID("$FreeBSD$"); #define pa_index(pa) ((pa) >> PDRSHIFT) #define pa_to_pvh(pa) (&pv_table[pa_index(pa)]) +#define NPV_LIST_LOCKS MAXCPU + +#define PHYS_TO_PV_LIST_LOCK(pa) \ + (&pv_list_locks[pa_index(pa) % NPV_LIST_LOCKS]) + +#define CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa) do { \ + struct rwlock **_lockp = (lockp); \ + struct rwlock *_new_lock; \ + \ + _new_lock = PHYS_TO_PV_LIST_LOCK(pa); \ + if (_new_lock != *_lockp) { \ + if (*_lockp != NULL) \ + rw_wunlock(*_lockp); \ + *_lockp = _new_lock; \ + rw_wlock(*_lockp); \ + } \ +} while (0) + +#define CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m) \ + CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, 
VM_PAGE_TO_PHYS(m)) + +#define RELEASE_PV_LIST_LOCK(lockp) do { \ + struct rwlock **_lockp = (lockp); \ + \ + if (*_lockp != NULL) { \ + rw_wunlock(*_lockp); \ + *_lockp = NULL; \ + } \ +} while (0) + +#define VM_PAGE_TO_PV_LIST_LOCK(m) \ + PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m)) + struct pmap kernel_pmap_store; vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ @@ -214,7 +247,8 @@ static struct { * Data for the pv entry allocation mechanism */ static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks); -static long pv_entry_count; +static struct mtx pv_chunks_mutex; +static struct rwlock pv_list_locks[NPV_LIST_LOCKS]; static struct md_page *pv_table; /* @@ -230,10 +264,17 @@ static caddr_t crashdumpmap; static void free_pv_chunk(struct pv_chunk *pc); static void free_pv_entry(pmap_t pmap, pv_entry_t pv); -static pv_entry_t get_pv_entry(pmap_t pmap, boolean_t try); -static void pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa); -static boolean_t pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa); -static void pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa); +static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp); +static int popcnt_pc_map_elem(uint64_t elem); +static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp); +static void reserve_pv_entries(pmap_t pmap, int needed, + struct rwlock **lockp); +static void pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, + struct rwlock **lockp); +static boolean_t pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, + struct rwlock **lockp); +static void pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, + struct rwlock **lockp); static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va); static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va); @@ -241,12 +282,14 @@ static int pmap_pvh_wired_mappings(struc static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode); static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va); +static boolean_t pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde, + vm_offset_t va, struct rwlock **lockp); static boolean_t pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe, vm_offset_t va); static boolean_t pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m, - vm_prot_t prot); + vm_prot_t prot, struct rwlock **lockp); static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, - vm_page_t m, vm_prot_t prot, vm_page_t mpte); + vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp); static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte); static void pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte); static boolean_t pmap_is_modified_pvh(struct md_page *pvh); @@ -254,32 +297,34 @@ static boolean_t pmap_is_referenced_pvh( static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode); static vm_page_t pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va); static void pmap_pde_attr(pd_entry_t *pde, int cache_bits); -static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va); +static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, + struct rwlock **lockp); static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot); static void pmap_pte_attr(pt_entry_t *pte, int cache_bits); static int pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, - vm_page_t *free); + vm_page_t *free, 
struct rwlock **lockp); static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, - vm_offset_t sva, pd_entry_t ptepde, vm_page_t *free); + vm_offset_t sva, pd_entry_t ptepde, vm_page_t *free, + struct rwlock **lockp); static void pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte); static void pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, vm_page_t *free); -static void pmap_remove_entry(struct pmap *pmap, vm_page_t m, - vm_offset_t va); -static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m); static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, - vm_page_t m); + vm_page_t m, struct rwlock **lockp); static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde); static void pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde); -static vm_page_t pmap_allocpde(pmap_t pmap, vm_offset_t va, int flags); -static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags); +static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, + struct rwlock **lockp); +static vm_page_t pmap_allocpde(pmap_t pmap, vm_offset_t va, + struct rwlock **lockp); +static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, + struct rwlock **lockp); -static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, int flags); -static int _pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m, - vm_page_t* free); +static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, + vm_page_t *free); static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, vm_page_t *); static vm_offset_t pmap_kmem_choose(vm_offset_t addr); @@ -597,7 +642,7 @@ pmap_bootstrap(vm_paddr_t *firstaddr) /* * Initialize the global pv list lock. */ - rw_init(&pvh_global_lock, "pvh global"); + rw_init(&pvh_global_lock, "pmap pv global"); /* * Reserve some special page table entries/VA space for temporary @@ -763,6 +808,17 @@ pmap_init(void) } /* + * Initialize the pv chunk list mutex. + */ + mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF); + + /* + * Initialize the pool of pv list locks. + */ + for (i = 0; i < NPV_LIST_LOCKS; i++) + rw_init(&pv_list_locks[i], "pmap pv list"); + + /* * Calculate the size of the pv head table for superpages. */ for (i = 0; phys_avail[i + 1]; i += 2); @@ -1501,23 +1557,25 @@ pmap_remove_pt_page(pmap_t pmap, vm_page } /* - * This routine unholds page table pages, and if the hold count - * drops to zero, then it decrements the wire count. + * Decrements a page table page's wire count, which is used to record the + * number of valid page table entries within the page. If the wire count + * drops to zero, then the page table page is unmapped. Returns TRUE if the + * page table page was unmapped and FALSE otherwise. 
*/ -static __inline int -pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_page_t *free) +static inline boolean_t +pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_page_t *free) { --m->wire_count; - if (m->wire_count == 0) - return (_pmap_unwire_pte_hold(pmap, va, m, free)); - else - return (0); + if (m->wire_count == 0) { + _pmap_unwire_ptp(pmap, va, m, free); + return (TRUE); + } else + return (FALSE); } -static int -_pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m, - vm_page_t *free) +static void +_pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_page_t *free) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); @@ -1546,14 +1604,14 @@ _pmap_unwire_pte_hold(pmap_t pmap, vm_of vm_page_t pdpg; pdpg = PHYS_TO_VM_PAGE(*pmap_pdpe(pmap, va) & PG_FRAME); - pmap_unwire_pte_hold(pmap, va, pdpg, free); + pmap_unwire_ptp(pmap, va, pdpg, free); } if (m->pindex >= NUPDE && m->pindex < (NUPDE + NUPDPE)) { /* We just released a PD, unhold the matching PDP */ vm_page_t pdppg; pdppg = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, va) & PG_FRAME); - pmap_unwire_pte_hold(pmap, va, pdppg, free); + pmap_unwire_ptp(pmap, va, pdppg, free); } /* @@ -1568,8 +1626,6 @@ _pmap_unwire_pte_hold(pmap_t pmap, vm_of * *ALL* TLB shootdown is done */ pmap_add_delayed_free_list(m, free, TRUE); - - return (1); } /* @@ -1585,7 +1641,7 @@ pmap_unuse_pt(pmap_t pmap, vm_offset_t v return (0); KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0")); mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME); - return (pmap_unwire_pte_hold(pmap, va, mpte, free)); + return (pmap_unwire_ptp(pmap, va, mpte, free)); } void @@ -1644,8 +1700,10 @@ pmap_pinit(pmap_t pmap) } /* - * this routine is called if the page table page is not - * mapped correctly. + * This routine is called if the desired page table page does not exist. + * + * If page table page allocation fails, this routine may sleep before + * returning NULL. It sleeps only if a lock pointer was given. * * Note: If a page allocation fails at page table level two or three, * one or two pages may be held during the wait, only to be released @@ -1653,25 +1711,23 @@ pmap_pinit(pmap_t pmap) * race conditions. */ static vm_page_t -_pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, int flags) +_pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp) { vm_page_t m, pdppg, pdpg; - KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT || - (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK, - ("_pmap_allocpte: flags is neither M_NOWAIT nor M_WAITOK")); - PMAP_LOCK_ASSERT(pmap, MA_OWNED); + /* * Allocate a page table page. 
*/ if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) { - if (flags & M_WAITOK) { + if (lockp != NULL) { + RELEASE_PV_LIST_LOCK(lockp); PMAP_UNLOCK(pmap); - rw_wunlock(&pvh_global_lock); + rw_runlock(&pvh_global_lock); VM_WAIT; - rw_wlock(&pvh_global_lock); + rw_rlock(&pvh_global_lock); PMAP_LOCK(pmap); } @@ -1712,7 +1768,7 @@ _pmap_allocpte(pmap_t pmap, vm_pindex_t if ((*pml4 & PG_V) == 0) { /* Have to allocate a new pdp, recurse */ if (_pmap_allocpte(pmap, NUPDE + NUPDPE + pml4index, - flags) == NULL) { + lockp) == NULL) { --m->wire_count; atomic_subtract_int(&cnt.v_wire_count, 1); vm_page_free_zero(m); @@ -1745,7 +1801,7 @@ _pmap_allocpte(pmap_t pmap, vm_pindex_t if ((*pml4 & PG_V) == 0) { /* Have to allocate a new pd, recurse */ if (_pmap_allocpte(pmap, NUPDE + pdpindex, - flags) == NULL) { + lockp) == NULL) { --m->wire_count; atomic_subtract_int(&cnt.v_wire_count, 1); vm_page_free_zero(m); @@ -1759,7 +1815,7 @@ _pmap_allocpte(pmap_t pmap, vm_pindex_t if ((*pdp & PG_V) == 0) { /* Have to allocate a new pd, recurse */ if (_pmap_allocpte(pmap, NUPDE + pdpindex, - flags) == NULL) { + lockp) == NULL) { --m->wire_count; atomic_subtract_int(&cnt.v_wire_count, 1); @@ -1785,15 +1841,12 @@ _pmap_allocpte(pmap_t pmap, vm_pindex_t } static vm_page_t -pmap_allocpde(pmap_t pmap, vm_offset_t va, int flags) +pmap_allocpde(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) { vm_pindex_t pdpindex, ptepindex; pdp_entry_t *pdpe; vm_page_t pdpg; - KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT || - (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK, - ("pmap_allocpde: flags is neither M_NOWAIT nor M_WAITOK")); retry: pdpe = pmap_pdpe(pmap, va); if (pdpe != NULL && (*pdpe & PG_V) != 0) { @@ -1804,24 +1857,20 @@ retry: /* Allocate a pd page. */ ptepindex = pmap_pde_pindex(va); pdpindex = ptepindex >> NPDPEPGSHIFT; - pdpg = _pmap_allocpte(pmap, NUPDE + pdpindex, flags); - if (pdpg == NULL && (flags & M_WAITOK)) + pdpg = _pmap_allocpte(pmap, NUPDE + pdpindex, lockp); + if (pdpg == NULL && lockp != NULL) goto retry; } return (pdpg); } static vm_page_t -pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags) +pmap_allocpte(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) { vm_pindex_t ptepindex; pd_entry_t *pd; vm_page_t m; - KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT || - (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK, - ("pmap_allocpte: flags is neither M_NOWAIT nor M_WAITOK")); - /* * Calculate pagetable page index */ @@ -1837,7 +1886,7 @@ retry: * normal 4K page. */ if (pd != NULL && (*pd & (PG_PS | PG_V)) == (PG_PS | PG_V)) { - if (!pmap_demote_pde(pmap, pd, va)) { + if (!pmap_demote_pde_locked(pmap, pd, va, lockp)) { /* * Invalidation of the 2MB page mapping may have caused * the deallocation of the underlying PD page. @@ -1858,8 +1907,8 @@ retry: * Here if the pte page isn't mapped, or if it has been * deallocated. 
*/ - m = _pmap_allocpte(pmap, ptepindex, flags); - if (m == NULL && (flags & M_WAITOK)) + m = _pmap_allocpte(pmap, ptepindex, lockp); + if (m == NULL && lockp != NULL) goto retry; } return (m); @@ -2023,9 +2072,6 @@ pv_to_chunk(pv_entry_t pv) static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 }; -SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0, - "Current number of pv entries"); - #ifdef PV_STATS static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail; @@ -2038,13 +2084,15 @@ SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_ SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0, "Number of times tried to get a chunk page but failed."); -static long pv_entry_frees, pv_entry_allocs; +static long pv_entry_frees, pv_entry_allocs, pv_entry_count; static int pv_entry_spare; SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0, "Current number of pv entry frees"); SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0, "Current number of pv entry allocs"); +SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0, + "Current number of pv entries"); SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0, "Current number of spare pv entries"); #endif @@ -2054,14 +2102,16 @@ SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_ * drastic measures to free some pages so we can allocate * another pv entry chunk. * + * Returns NULL if PV entries were reclaimed from the specified pmap. + * * We do not, however, unmap 2mpages because subsequent accesses will * allocate per-page pv entries until repromotion occurs, thereby * exacerbating the shortage of free pv entries. */ static vm_page_t -pmap_pv_reclaim(pmap_t locked_pmap) +reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp) { - struct pch newtail; + struct pch new_tail; struct pv_chunk *pc; struct md_page *pvh; pd_entry_t *pde; @@ -2073,13 +2123,16 @@ pmap_pv_reclaim(pmap_t locked_pmap) uint64_t inuse; int bit, field, freed; - rw_assert(&pvh_global_lock, RA_WLOCKED); + rw_assert(&pvh_global_lock, RA_LOCKED); PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED); + KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL")); pmap = NULL; free = m_pc = NULL; - TAILQ_INIT(&newtail); + TAILQ_INIT(&new_tail); + mtx_lock(&pv_chunks_mutex); while ((pc = TAILQ_FIRST(&pv_chunks)) != NULL && free == NULL) { TAILQ_REMOVE(&pv_chunks, pc, pc_lru); + mtx_unlock(&pv_chunks_mutex); if (pmap != pc->pc_pmap) { if (pmap != NULL) { pmap_invalidate_all(pmap); @@ -2088,11 +2141,14 @@ pmap_pv_reclaim(pmap_t locked_pmap) } pmap = pc->pc_pmap; /* Avoid deadlock and lock recursion. 
*/ - if (pmap > locked_pmap) + if (pmap > locked_pmap) { + RELEASE_PV_LIST_LOCK(lockp); PMAP_LOCK(pmap); - else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap)) { + } else if (pmap != locked_pmap && + !PMAP_TRYLOCK(pmap)) { pmap = NULL; - TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); + TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru); + mtx_lock(&pv_chunks_mutex); continue; } } @@ -2121,6 +2177,7 @@ pmap_pv_reclaim(pmap_t locked_pmap) vm_page_dirty(m); if ((tpte & PG_A) != 0) vm_page_aflag_set(m, PGA_REFERENCED); + CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { @@ -2136,32 +2193,36 @@ pmap_pv_reclaim(pmap_t locked_pmap) } } if (freed == 0) { - TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); + TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru); + mtx_lock(&pv_chunks_mutex); continue; } /* Every freed mapping is for a 4 KB page. */ pmap_resident_count_dec(pmap, freed); - PV_STAT(pv_entry_frees += freed); - PV_STAT(pv_entry_spare += freed); - pv_entry_count -= freed; + PV_STAT(atomic_add_long(&pv_entry_frees, freed)); + PV_STAT(atomic_add_int(&pv_entry_spare, freed)); + PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); if (pc->pc_map[0] == PC_FREE0 && pc->pc_map[1] == PC_FREE1 && pc->pc_map[2] == PC_FREE2) { - PV_STAT(pv_entry_spare -= _NPCPV); - PV_STAT(pc_chunk_count--); - PV_STAT(pc_chunk_frees++); + PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV)); + PV_STAT(atomic_subtract_int(&pc_chunk_count, 1)); + PV_STAT(atomic_add_int(&pc_chunk_frees, 1)); /* Entire chunk is free; return it. */ m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); dump_drop_page(m_pc->phys_addr); + mtx_lock(&pv_chunks_mutex); break; } TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); - TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); + TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru); + mtx_lock(&pv_chunks_mutex); /* One freed pv entry in locked_pmap is sufficient. */ if (pmap == locked_pmap) break; } - TAILQ_CONCAT(&pv_chunks, &newtail, pc_lru); + TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru); + mtx_unlock(&pv_chunks_mutex); if (pmap != NULL) { pmap_invalidate_all(pmap); if (pmap != locked_pmap) @@ -2187,11 +2248,11 @@ free_pv_entry(pmap_t pmap, pv_entry_t pv struct pv_chunk *pc; int idx, field, bit; - rw_assert(&pvh_global_lock, RA_WLOCKED); + rw_assert(&pvh_global_lock, RA_LOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); - PV_STAT(pv_entry_frees++); - PV_STAT(pv_entry_spare++); - pv_entry_count--; + PV_STAT(atomic_add_long(&pv_entry_frees, 1)); + PV_STAT(atomic_add_int(&pv_entry_spare, 1)); + PV_STAT(atomic_subtract_long(&pv_entry_count, 1)); pc = pv_to_chunk(pv); idx = pv - &pc->pc_pventry[0]; field = idx / 64; @@ -2215,10 +2276,12 @@ free_pv_chunk(struct pv_chunk *pc) { vm_page_t m; + mtx_lock(&pv_chunks_mutex); TAILQ_REMOVE(&pv_chunks, pc, pc_lru); - PV_STAT(pv_entry_spare -= _NPCPV); - PV_STAT(pc_chunk_count--); - PV_STAT(pc_chunk_frees++); + mtx_unlock(&pv_chunks_mutex); + PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV)); + PV_STAT(atomic_subtract_int(&pc_chunk_count, 1)); + PV_STAT(atomic_add_int(&pc_chunk_frees, 1)); /* entire chunk is free, return it */ m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); dump_drop_page(m->phys_addr); @@ -2227,20 +2290,24 @@ free_pv_chunk(struct pv_chunk *pc) } /* - * get a new pv_entry, allocating a block from the system - * when needed. + * Returns a new PV entry, allocating a new PV chunk from the system when + * needed. 
If this PV chunk allocation fails and a PV list lock pointer was + * given, a PV chunk is reclaimed from an arbitrary pmap. Otherwise, NULL is + * returned. + * + * The given PV list lock may be released. */ static pv_entry_t -get_pv_entry(pmap_t pmap, boolean_t try) +get_pv_entry(pmap_t pmap, struct rwlock **lockp) { int bit, field; pv_entry_t pv; struct pv_chunk *pc; vm_page_t m; - rw_assert(&pvh_global_lock, RA_WLOCKED); + rw_assert(&pvh_global_lock, RA_LOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); - PV_STAT(pv_entry_allocs++); + PV_STAT(atomic_add_long(&pv_entry_allocs, 1)); retry: pc = TAILQ_FIRST(&pmap->pm_pvchunk); if (pc != NULL) { @@ -2260,8 +2327,8 @@ retry: TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); } - pv_entry_count++; - PV_STAT(pv_entry_spare--); + PV_STAT(atomic_add_long(&pv_entry_count, 1)); + PV_STAT(atomic_subtract_int(&pv_entry_spare, 1)); return (pv); } } @@ -2269,31 +2336,121 @@ retry: m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED); if (m == NULL) { - if (try) { + if (lockp == NULL) { PV_STAT(pc_chunk_tryfail++); return (NULL); } - m = pmap_pv_reclaim(pmap); + m = reclaim_pv_chunk(pmap, lockp); if (m == NULL) goto retry; } - PV_STAT(pc_chunk_count++); - PV_STAT(pc_chunk_allocs++); + PV_STAT(atomic_add_int(&pc_chunk_count, 1)); + PV_STAT(atomic_add_int(&pc_chunk_allocs, 1)); dump_add_page(m->phys_addr); pc = (void *)PHYS_TO_DMAP(m->phys_addr); pc->pc_pmap = pmap; pc->pc_map[0] = PC_FREE0 & ~1ul; /* preallocated bit 0 */ pc->pc_map[1] = PC_FREE1; pc->pc_map[2] = PC_FREE2; + mtx_lock(&pv_chunks_mutex); TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); + mtx_unlock(&pv_chunks_mutex); pv = &pc->pc_pventry[0]; TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); - pv_entry_count++; - PV_STAT(pv_entry_spare += _NPCPV - 1); + PV_STAT(atomic_add_long(&pv_entry_count, 1)); + PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1)); return (pv); } /* + * Returns the number of one bits within the given PV chunk map element. + */ +static int +popcnt_pc_map_elem(uint64_t elem) +{ + int count; + + /* + * This simple method of counting the one bits performs well because + * the given element typically contains more zero bits than one bits. + */ + count = 0; + for (; elem != 0; elem &= elem - 1) + count++; + return (count); +} + +/* + * Ensure that the number of spare PV entries in the specified pmap meets or + * exceeds the given count, "needed". + * + * The given PV list lock may be released. + */ +static void +reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp) +{ + struct pch new_tail; + struct pv_chunk *pc; + int avail, free; + vm_page_t m; + + rw_assert(&pvh_global_lock, RA_LOCKED); + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL")); + + /* + * Newly allocated PV chunks must be stored in a private list until + * the required number of PV chunks have been allocated. Otherwise, + * reclaim_pv_chunk() could recycle one of these chunks. In + * contrast, these chunks must be added to the pmap upon allocation. 
+ */ + TAILQ_INIT(&new_tail); +retry: + avail = 0; + TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) { + if ((cpu_feature2 & CPUID2_POPCNT) == 0) { + free = popcnt_pc_map_elem(pc->pc_map[0]); + free += popcnt_pc_map_elem(pc->pc_map[1]); + free += popcnt_pc_map_elem(pc->pc_map[2]); + } else { + free = popcntq(pc->pc_map[0]); + free += popcntq(pc->pc_map[1]); + free += popcntq(pc->pc_map[2]); + } + if (free == 0) + break; + avail += free; + if (avail >= needed) + break; + } + for (; avail < needed; avail += _NPCPV) { + m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | + VM_ALLOC_WIRED); + if (m == NULL) { + m = reclaim_pv_chunk(pmap, lockp); + if (m == NULL) + goto retry; + } + PV_STAT(atomic_add_int(&pc_chunk_count, 1)); + PV_STAT(atomic_add_int(&pc_chunk_allocs, 1)); + dump_add_page(m->phys_addr); + pc = (void *)PHYS_TO_DMAP(m->phys_addr); + pc->pc_pmap = pmap; + pc->pc_map[0] = PC_FREE0; + pc->pc_map[1] = PC_FREE1; + pc->pc_map[2] = PC_FREE2; + TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); + TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru); + PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV)); + } + if (!TAILQ_EMPTY(&new_tail)) { + mtx_lock(&pv_chunks_mutex); + TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru); + mtx_unlock(&pv_chunks_mutex); + } +} + +/* * First find and then remove the pv entry for the specified pmap and virtual * address from the specified pv list. Returns the pv entry if found and NULL * otherwise. This operation can be performed on pv lists for either 4KB or @@ -2304,7 +2461,7 @@ pmap_pvh_remove(struct md_page *pvh, pma { pv_entry_t pv; - rw_assert(&pvh_global_lock, RA_WLOCKED); + rw_assert(&pvh_global_lock, RA_LOCKED); TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) { if (pmap == PV_PMAP(pv) && va == pv->pv_va) { TAILQ_REMOVE(&pvh->pv_list, pv, pv_list); @@ -2320,20 +2477,26 @@ pmap_pvh_remove(struct md_page *pvh, pma * entries for each of the 4KB page mappings. */ static void -pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) +pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, + struct rwlock **lockp) { struct md_page *pvh; + struct pv_chunk *pc; pv_entry_t pv; vm_offset_t va_last; vm_page_t m; + int bit, field; - rw_assert(&pvh_global_lock, RA_WLOCKED); + rw_assert(&pvh_global_lock, RA_LOCKED); + PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT((pa & PDRMASK) == 0, ("pmap_pv_demote_pde: pa is not 2mpage aligned")); + CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); /* * Transfer the 2mpage's pv entry for this mapping to the first - * page's pv list. + * page's pv list. Once this transfer begins, the pv list lock + * must not be released until the last pv entry is reinstantiated. */ pvh = pa_to_pvh(pa); va = trunc_2mpage(va); @@ -2342,14 +2505,37 @@ pmap_pv_demote_pde(pmap_t pmap, vm_offse m = PHYS_TO_VM_PAGE(pa); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); /* Instantiate the remaining NPTEPG - 1 pv entries. 
*/ + PV_STAT(atomic_add_long(&pv_entry_allocs, NPTEPG - 1)); va_last = va + NBPDR - PAGE_SIZE; - do { - m++; - KASSERT((m->oflags & VPO_UNMANAGED) == 0, - ("pmap_pv_demote_pde: page %p is not managed", m)); - va += PAGE_SIZE; - pmap_insert_entry(pmap, va, m); - } while (va < va_last); + for (;;) { + pc = TAILQ_FIRST(&pmap->pm_pvchunk); + KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0 || + pc->pc_map[2] != 0, ("pmap_pv_demote_pde: missing spare")); + for (field = 0; field < _NPCM; field++) { + while (pc->pc_map[field]) { + bit = bsfq(pc->pc_map[field]); + pc->pc_map[field] &= ~(1ul << bit); + pv = &pc->pc_pventry[field * 64 + bit]; + va += PAGE_SIZE; + pv->pv_va = va; + m++; + KASSERT((m->oflags & VPO_UNMANAGED) == 0, + ("pmap_pv_demote_pde: page %p is not managed", m)); + TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); + if (va == va_last) + goto out; + } + } + TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); + TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); + } +out: + if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) { + TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); + TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); + } + PV_STAT(atomic_add_long(&pv_entry_count, NPTEPG - 1)); + PV_STAT(atomic_subtract_int(&pv_entry_spare, NPTEPG - 1)); } /* @@ -2358,23 +2544,25 @@ pmap_pv_demote_pde(pmap_t pmap, vm_offse * for the 2MB page mapping. */ static void -pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) +pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, + struct rwlock **lockp) { struct md_page *pvh; pv_entry_t pv; vm_offset_t va_last; vm_page_t m; - rw_assert(&pvh_global_lock, RA_WLOCKED); + rw_assert(&pvh_global_lock, RA_LOCKED); KASSERT((pa & PDRMASK) == 0, ("pmap_pv_promote_pde: pa is not 2mpage aligned")); + CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); /* - * Transfer the first page's pv entry for this mapping to the - * 2mpage's pv list. Aside from avoiding the cost of a call - * to get_pv_entry(), a transfer avoids the possibility that - * get_pv_entry() calls pmap_collect() and that pmap_collect() - * removes one of the mappings that is being promoted. + * Transfer the first page's pv entry for this mapping to the 2mpage's + * pv list. Aside from avoiding the cost of a call to get_pv_entry(), + * a transfer avoids the possibility that get_pv_entry() calls + * reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the + * mappings that is being promoted. */ m = PHYS_TO_VM_PAGE(pa); va = trunc_2mpage(va); @@ -2406,48 +2594,22 @@ pmap_pvh_free(struct md_page *pvh, pmap_ free_pv_entry(pmap, pv); } -static void -pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va) -{ - struct md_page *pvh; - - rw_assert(&pvh_global_lock, RA_WLOCKED); - pmap_pvh_free(&m->md, pmap, va); - if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { - pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); - if (TAILQ_EMPTY(&pvh->pv_list)) - vm_page_aflag_clear(m, PGA_WRITEABLE); - } -} - /* - * Create a pv entry for page at pa for - * (pmap, va). - */ -static void -pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m) -{ - pv_entry_t pv; - - rw_assert(&pvh_global_lock, RA_WLOCKED); - PMAP_LOCK_ASSERT(pmap, MA_OWNED); - pv = get_pv_entry(pmap, FALSE); - pv->pv_va = va; - TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); -} - -/* - * Conditionally create a pv entry. + * Conditionally create the PV entry for a 4KB page mapping if the required + * memory can be allocated without resorting to reclamation. 
*/ static boolean_t -pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m) +pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m, + struct rwlock **lockp) { pv_entry_t pv; - rw_assert(&pvh_global_lock, RA_WLOCKED); + rw_assert(&pvh_global_lock, RA_LOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); - if ((pv = get_pv_entry(pmap, TRUE)) != NULL) { + /* Pass NULL instead of the lock pointer to disable reclamation. */ + if ((pv = get_pv_entry(pmap, NULL)) != NULL) { pv->pv_va = va; + CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); return (TRUE); } else @@ -2455,17 +2617,22 @@ pmap_try_insert_pv_entry(pmap_t pmap, vm } /* - * Create the pv entry for a 2MB page mapping. + * Conditionally create the PV entry for a 2MB page mapping if the required + * memory can be allocated without resorting to reclamation. */ static boolean_t -pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) +pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, + struct rwlock **lockp) { struct md_page *pvh; pv_entry_t pv; - rw_assert(&pvh_global_lock, RA_WLOCKED); - if ((pv = get_pv_entry(pmap, TRUE)) != NULL) { + rw_assert(&pvh_global_lock, RA_LOCKED); + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + /* Pass NULL instead of the lock pointer to disable reclamation. */ + if ((pv = get_pv_entry(pmap, NULL)) != NULL) { pv->pv_va = va; + CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); pvh = pa_to_pvh(pa); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_list); return (TRUE); @@ -2494,6 +2661,20 @@ pmap_fill_ptp(pt_entry_t *firstpte, pt_e static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) { + struct rwlock *lock; + boolean_t rv; + + lock = NULL; + rv = pmap_demote_pde_locked(pmap, pde, va, &lock); + if (lock != NULL) + rw_wunlock(lock); + return (rv); +} + +static boolean_t +pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, + struct rwlock **lockp) +{ pd_entry_t newpde, oldpde; pt_entry_t *firstpte, newpte; vm_paddr_t mptepa; @@ -2528,7 +2709,8 @@ pmap_demote_pde(pmap_t pmap, pd_entry_t DMAP_MAX_ADDRESS ? VM_ALLOC_INTERRUPT : VM_ALLOC_NORMAL) | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) { free = NULL; - pmap_remove_pde(pmap, pde, trunc_2mpage(va), &free); + pmap_remove_pde(pmap, pde, trunc_2mpage(va), &free, + lockp); pmap_invalidate_page(pmap, trunc_2mpage(va)); pmap_free_zero_pages(free); CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#lx" @@ -2568,6 +2750,17 @@ pmap_demote_pde(pmap_t pmap, pd_entry_t pmap_fill_ptp(firstpte, newpte); /* + * The spare PV entries must be reserved prior to demoting the + * mapping, that is, prior to changing the PDE. Otherwise, the state + * of the PDE and the PV lists will be inconsistent, which can result + * in reclaim_pv_chunk() attempting to remove a PV entry from the + * wrong PV list and pmap_pv_demote_pde() failing to find the expected + * PV entry for the 2MB page mapping that is being demoted. + */ + if ((oldpde & PG_MANAGED) != 0) + reserve_pv_entries(pmap, NPTEPG - 1, lockp); + + /* * Demote the mapping. This pmap is locked. The old PDE has * PG_A set. If the old PDE has PG_RW set, it also has PG_M * set. Thus, there is no danger of a race with another @@ -2586,18 +2779,12 @@ pmap_demote_pde(pmap_t pmap, pd_entry_t pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va)); /* - * Demote the pv entry. This depends on the earlier demotion - * of the mapping. 
Specifically, the (re)creation of a per- - * page pv entry might trigger the execution of pmap_collect(), - * which might reclaim a newly (re)created per-page pv entry - * and destroy the associated mapping. In order to destroy - * the mapping, the PDE must have already changed from mapping - * the 2mpage to referencing the page table page. + * Demote the PV entry. */ if ((oldpde & PG_MANAGED) != 0) - pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME); + pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME, lockp); - pmap_pde_demotions++; + atomic_add_long(&pmap_pde_demotions, 1); CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#lx" " in pmap %p", va, pmap); return (TRUE); @@ -2608,7 +2795,7 @@ pmap_demote_pde(pmap_t pmap, pd_entry_t */ static int pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, - vm_page_t *free) + vm_page_t *free, struct rwlock **lockp) { struct md_page *pvh; pd_entry_t oldpde; @@ -2630,6 +2817,7 @@ pmap_remove_pde(pmap_t pmap, pd_entry_t pmap_invalidate_page(kernel_pmap, sva); pmap_resident_count_dec(pmap, NBPDR / PAGE_SIZE); if (oldpde & PG_MANAGED) { + CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, oldpde & PG_PS_FRAME); pvh = pa_to_pvh(oldpde & PG_PS_FRAME); pmap_pvh_free(pvh, pmap, sva); *** DIFF OUTPUT TRUNCATED AT 1000 LINES ***