Date:      Thu, 22 Mar 2018 19:21:11 +0000 (UTC)
From:      Jeff Roberson <jeff@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject:   svn commit: r331369 - head/sys/vm
Message-ID:  <201803221921.w2MJLBi7058560@repo.freebsd.org>

Author: jeff
Date: Thu Mar 22 19:21:11 2018
New Revision: 331369
URL: https://svnweb.freebsd.org/changeset/base/331369

Log:
  Lock reservations with a dedicated lock in each reservation.  Protect the
  vmd_free_count with atomics.
  
  This allows us to allocate and free from reservations without the free
  lock, except where a superpage is allocated from the physical layer,
  which is roughly 1/512 of the operations on amd64.
  
  Use the counter API to eliminate cache contention on counters.
  
  Reviewed by:	markj
  Tested by:	pho
  Sponsored by:	Netflix, Dell/EMC Isilon
  Differential Revision:	https://reviews.freebsd.org/D14707
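
The core of the change is visible in the new vm_domain_allocate() in the
diff below: pages are carved out of vmd_free_count with a compare-and-swap
loop instead of under the domain free lock.  A minimal userland sketch of
the same pattern, with C11 atomics standing in for the kernel's
atomic_fcmpset_int() and invented names (free_count, try_allocate) in
place of the real fields:

#include <stdatomic.h>
#include <stdbool.h>

static atomic_uint free_count;		/* stand-in for vmd_free_count */

/*
 * Reserve "npages" pages unless that would drop the count below
 * "limit" (the per-request-class floor).  Sketch only; the kernel
 * code additionally wakes the page daemon and updates the min/severe
 * bitsets on downward threshold transitions.
 */
static bool
try_allocate(unsigned npages, unsigned limit)
{
	unsigned old, new;

	limit += npages;
	old = atomic_load(&free_count);
	do {
		if (old < limit)
			return (false);
		new = old - npages;
		/* On failure, "old" is reloaded, as with fcmpset. */
	} while (!atomic_compare_exchange_weak(&free_count, &old, new));
	return (true);
}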

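The statistics formerly kept as plain longs become counter(9) counters
updated with counter_u64_add(), so each CPU bumps its own cache line.
A hedged sketch of that API, using only the standard alloc/add/fetch
calls from sys/counter.h; "example_events" is an invented name, the
real counters in this commit being vm_reserv_broken, vm_reserv_freed
and vm_reserv_reclaimed:

#include <sys/param.h>
#include <sys/counter.h>
#include <sys/malloc.h>

static counter_u64_t example_events;

static void
example_init(void)
{
	/* Allocates the per-CPU storage backing the counter. */
	example_events = counter_u64_alloc(M_WAITOK);
}

static void
example_record(void)
{
	/* A cheap per-CPU increment; no shared cache line is touched. */
	counter_u64_add(example_events, 1);
}

static uint64_t
example_read(void)
{
	/* Sums the per-CPU values; only readers pay this cost. */
	return (counter_u64_fetch(example_events));
}
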
Modified:
  head/sys/vm/vm_page.c
  head/sys/vm/vm_pagequeue.h
  head/sys/vm/vm_reserv.c
  head/sys/vm/vm_reserv.h

Modified: head/sys/vm/vm_page.c
==============================================================================
--- head/sys/vm/vm_page.c	Thu Mar 22 19:11:43 2018	(r331368)
+++ head/sys/vm/vm_page.c	Thu Mar 22 19:21:11 2018	(r331369)
@@ -177,7 +177,6 @@ static uma_zone_t fakepg_zone;
 static void vm_page_alloc_check(vm_page_t m);
 static void vm_page_clear_dirty_mask(vm_page_t m, vm_page_bits_t pagebits);
 static void vm_page_enqueue(uint8_t queue, vm_page_t m);
-static void vm_page_free_phys(struct vm_domain *vmd, vm_page_t m);
 static void vm_page_init(void *dummy);
 static int vm_page_insert_after(vm_page_t m, vm_object_t object,
     vm_pindex_t pindex, vm_page_t mpred);
@@ -1677,10 +1676,10 @@ vm_page_alloc_after(vm_object_t object, vm_pindex_t pi
  * for the request class and false otherwise.
  */
 int
-vm_domain_available(struct vm_domain *vmd, int req, int npages)
+vm_domain_allocate(struct vm_domain *vmd, int req, int npages)
 {
+	u_int limit, old, new;
 
-	vm_domain_free_assert_locked(vmd);
 	req = req & VM_ALLOC_CLASS_MASK;
 
 	/*
@@ -1688,15 +1687,34 @@ vm_domain_available(struct vm_domain *vmd, int req, in
 	 */
 	if (curproc == pageproc && req != VM_ALLOC_INTERRUPT)
 		req = VM_ALLOC_SYSTEM;
+	if (req == VM_ALLOC_INTERRUPT)
+		limit = 0;
+	else if (req == VM_ALLOC_SYSTEM)
+		limit = vmd->vmd_interrupt_free_min;
+	else
+		limit = vmd->vmd_free_reserved;
 
-	if (vmd->vmd_free_count >= npages + vmd->vmd_free_reserved ||
-	    (req == VM_ALLOC_SYSTEM &&
-	    vmd->vmd_free_count >= npages + vmd->vmd_interrupt_free_min) ||
-	    (req == VM_ALLOC_INTERRUPT &&
-	    vmd->vmd_free_count >= npages))
-		return (1);
+	/*
+	 * Attempt to reserve the pages.  Fail if we're below the limit.
+	 */
+	limit += npages;
+	old = vmd->vmd_free_count;
+	do {
+		if (old < limit)
+			return (0);
+		new = old - npages;
+	} while (atomic_fcmpset_int(&vmd->vmd_free_count, &old, new) == 0);
 
-	return (0);
+	/* Wake the page daemon if we've crossed the threshold. */
+	if (vm_paging_needed(vmd, new) && !vm_paging_needed(vmd, old))
+		pagedaemon_wakeup(vmd->vmd_domain);
+
+	/* Only update bitsets on transitions. */
+	if ((old >= vmd->vmd_free_min && new < vmd->vmd_free_min) ||
+	    (old >= vmd->vmd_free_severe && new < vmd->vmd_free_severe))
+		vm_domain_set(vmd);
+
+	return (1);
 }
 
 vm_page_t
@@ -1723,44 +1741,34 @@ vm_page_alloc_domain_after(vm_object_t object, vm_pind
 again:
 	m = NULL;
 #if VM_NRESERVLEVEL > 0
+	/*
+	 * Can we allocate the page from a reservation?
+	 */
 	if (vm_object_reserv(object) &&
-	    (m = vm_reserv_extend(req, object, pindex, domain, mpred))
-	    != NULL) {
+	    ((m = vm_reserv_extend(req, object, pindex, domain, mpred)) != NULL ||
+	    (m = vm_reserv_alloc_page(req, object, pindex, domain, mpred)) != NULL)) {
 		domain = vm_phys_domain(m);
 		vmd = VM_DOMAIN(domain);
 		goto found;
 	}
 #endif
 	vmd = VM_DOMAIN(domain);
-	vm_domain_free_lock(vmd);
-	if (vm_domain_available(vmd, req, 1)) {
+	if (vm_domain_allocate(vmd, req, 1)) {
 		/*
-		 * Can we allocate the page from a reservation?
+		 * If not, allocate it from the free page queues.
 		 */
+		vm_domain_free_lock(vmd);
+		m = vm_phys_alloc_pages(domain, object != NULL ?
+		    VM_FREEPOOL_DEFAULT : VM_FREEPOOL_DIRECT, 0);
+		vm_domain_free_unlock(vmd);
+		if (m == NULL) {
+			vm_domain_freecnt_inc(vmd, 1);
 #if VM_NRESERVLEVEL > 0
-		if (!vm_object_reserv(object) ||
-		    (m = vm_reserv_alloc_page(object, pindex,
-		    domain, mpred)) == NULL)
+			if (vm_reserv_reclaim_inactive(domain))
+				goto again;
 #endif
-		{
-			/*
-			 * If not, allocate it from the free page queues.
-			 */
-			m = vm_phys_alloc_pages(domain, object != NULL ?
-			    VM_FREEPOOL_DEFAULT : VM_FREEPOOL_DIRECT, 0);
-#if VM_NRESERVLEVEL > 0
-			if (m == NULL && vm_reserv_reclaim_inactive(domain)) {
-				m = vm_phys_alloc_pages(domain,
-				    object != NULL ?
-				    VM_FREEPOOL_DEFAULT : VM_FREEPOOL_DIRECT,
-				    0);
-			}
-#endif
 		}
 	}
-	if (m != NULL)
-		vm_domain_freecnt_dec(vmd, 1);
-	vm_domain_free_unlock(vmd);
 	if (m == NULL) {
 		/*
 		 * Not allocatable, give up.
@@ -1775,9 +1783,7 @@ again:
 	 */
 	KASSERT(m != NULL, ("missing page"));
 
-#if VM_NRESERVLEVEL > 0
 found:
-#endif
 	vm_page_alloc_check(m);
 
 	/*
@@ -1934,9 +1940,14 @@ vm_page_alloc_contig_domain(vm_object_t object, vm_pin
 	 */
 again:
 #if VM_NRESERVLEVEL > 0
+	/*
+	 * Can we allocate the pages from a reservation?
+	 */
 	if (vm_object_reserv(object) &&
-	    (m_ret = vm_reserv_extend_contig(req, object, pindex, domain,
-	    npages, low, high, alignment, boundary, mpred)) != NULL) {
+	    ((m_ret = vm_reserv_extend_contig(req, object, pindex, domain,
+	    npages, low, high, alignment, boundary, mpred)) != NULL ||
+	    (m_ret = vm_reserv_alloc_contig(req, object, pindex, domain,
+	    npages, low, high, alignment, boundary, mpred)) != NULL)) {
 		domain = vm_phys_domain(m_ret);
 		vmd = VM_DOMAIN(domain);
 		goto found;
@@ -1944,31 +1955,23 @@ again:
 #endif
 	m_ret = NULL;
 	vmd = VM_DOMAIN(domain);
-	vm_domain_free_lock(vmd);
-	if (vm_domain_available(vmd, req, npages)) {
+	if (vm_domain_allocate(vmd, req, npages)) {
 		/*
-		 * Can we allocate the pages from a reservation?
+		 * allocate them from the free page queues.
 		 */
+		vm_domain_free_lock(vmd);
+		m_ret = vm_phys_alloc_contig(domain, npages, low, high,
+		    alignment, boundary);
+		vm_domain_free_unlock(vmd);
+		if (m_ret == NULL) {
+			vm_domain_freecnt_inc(vmd, npages);
 #if VM_NRESERVLEVEL > 0
-retry:
-		if (!vm_object_reserv(object) ||
-		    (m_ret = vm_reserv_alloc_contig(object, pindex, domain,
-		    npages, low, high, alignment, boundary, mpred)) == NULL)
+			if (vm_reserv_reclaim_contig(domain, npages, low,
+			    high, alignment, boundary))
+				goto again;
 #endif
-			/*
-			 * If not, allocate them from the free page queues.
-			 */
-			m_ret = vm_phys_alloc_contig(domain, npages, low, high,
-			    alignment, boundary);
-#if VM_NRESERVLEVEL > 0
-		if (m_ret == NULL && vm_reserv_reclaim_contig(
-		    domain, npages, low, high, alignment, boundary))
-			goto retry;
-#endif
+		}
 	}
-	if (m_ret != NULL)
-		vm_domain_freecnt_dec(vmd, npages);
-	vm_domain_free_unlock(vmd);
 	if (m_ret == NULL) {
 		if (vm_domain_alloc_fail(vmd, object, req))
 			goto again;
@@ -2109,13 +2112,14 @@ vm_page_alloc_freelist_domain(int domain, int freelist
 	 */
 	vmd = VM_DOMAIN(domain);
 again:
-	vm_domain_free_lock(vmd);
-	if (vm_domain_available(vmd, req, 1))
+	if (vm_domain_allocate(vmd, req, 1)) {
+		vm_domain_free_lock(vmd);
 		m = vm_phys_alloc_freelist_pages(domain, freelist,
 		    VM_FREEPOOL_DIRECT, 0);
-	if (m != NULL)
-		vm_domain_freecnt_dec(vmd, 1);
-	vm_domain_free_unlock(vmd);
+		vm_domain_free_unlock(vmd);
+		if (m == NULL)
+			vm_domain_freecnt_inc(vmd, 1);
+	}
 	if (m == NULL) {
 		if (vm_domain_alloc_fail(vmd, NULL, req))
 			goto again;
@@ -2491,8 +2495,9 @@ retry:
 					vm_page_remque(m);
 					vm_page_replace_checked(m_new, object,
 					    m->pindex, m);
-					m->valid = 0;
-					vm_page_undirty(m);
+					if (vm_page_free_prep(m, false))
+						SLIST_INSERT_HEAD(&free, m,
+						    plinks.s.ss);
 
 					/*
 					 * The new page must be deactivated
@@ -2504,10 +2509,12 @@ retry:
 					m->flags &= ~PG_ZERO;
 					vm_page_remque(m);
 					vm_page_remove(m);
+					if (vm_page_free_prep(m, false))
+						SLIST_INSERT_HEAD(&free, m,
+						    plinks.s.ss);
 					KASSERT(m->dirty == 0,
 					    ("page %p is dirty", m));
 				}
-				SLIST_INSERT_HEAD(&free, m, plinks.s.ss);
 			} else
 				error = EBUSY;
 unlock:
@@ -2548,7 +2555,7 @@ unlock:
 		do {
 			MPASS(vm_phys_domain(m) == domain);
 			SLIST_REMOVE_HEAD(&free, plinks.s.ss);
-			vm_page_free_phys(vmd, m);
+			vm_phys_free_pages(m, 0);
 			cnt++;
 		} while ((m = SLIST_FIRST(&free)) != NULL);
 		vm_domain_free_unlock(vmd);
@@ -3159,24 +3166,12 @@ vm_page_free_prep(vm_page_t m, bool pagequeue_locked)
 	if (pmap_page_get_memattr(m) != VM_MEMATTR_DEFAULT)
 		pmap_page_set_memattr(m, VM_MEMATTR_DEFAULT);
 
-	return (true);
-}
-
-/*
- * Insert the page into the physical memory allocator's free page
- * queues.  This is the last step to free a page.  The caller is
- * responsible for adjusting the free page count.
- */
-static void
-vm_page_free_phys(struct vm_domain *vmd, vm_page_t m)
-{
-
-	vm_domain_free_assert_locked(vmd);
-
 #if VM_NRESERVLEVEL > 0
-	if (!vm_reserv_free_page(m))
+	if (vm_reserv_free_page(m))
+		return (false);
 #endif
-		vm_phys_free_pages(m, 0);
+
+	return (true);
 }
 
 void
@@ -3200,7 +3195,7 @@ vm_page_free_phys_pglist(struct pglist *tq)
 			vmd = vm_pagequeue_domain(m);
 			vm_domain_free_lock(vmd);
 		}
-		vm_page_free_phys(vmd, m);
+		vm_phys_free_pages(m, 0);
 		cnt++;
 	}
 	if (vmd != NULL) {
@@ -3227,7 +3222,7 @@ vm_page_free_toq(vm_page_t m)
 		return;
 	vmd = vm_pagequeue_domain(m);
 	vm_domain_free_lock(vmd);
-	vm_page_free_phys(vmd, m);
+	vm_phys_free_pages(m, 0);
 	vm_domain_free_unlock(vmd);
 	vm_domain_freecnt_inc(vmd, 1);
 }

Modified: head/sys/vm/vm_pagequeue.h
==============================================================================
--- head/sys/vm/vm_pagequeue.h	Thu Mar 22 19:11:43 2018	(r331368)
+++ head/sys/vm/vm_pagequeue.h	Thu Mar 22 19:21:11 2018	(r331369)
@@ -180,7 +180,7 @@ vm_pagequeue_cnt_add(struct vm_pagequeue *pq, int adde
 
 void vm_domain_set(struct vm_domain *vmd);
 void vm_domain_clear(struct vm_domain *vmd);
-int vm_domain_available(struct vm_domain *vmd, int req, int npages);
+int vm_domain_allocate(struct vm_domain *vmd, int req, int npages);
 
 /*
  *      vm_pagequeue_domain:
@@ -265,23 +265,6 @@ vm_domain_freecnt_inc(struct vm_domain *vmd, int adj)
 	    new >= vmd->vmd_pageout_free_min)))
 		vm_domain_clear(vmd);
 }
-
-static inline void
-vm_domain_freecnt_dec(struct vm_domain *vmd, int adj)
-{
-	u_int old, new;
-
-	old = atomic_fetchadd_int(&vmd->vmd_free_count, -adj);
-	new = old - adj;
-	KASSERT(new >= 0, ("vm_domain_freecnt_dec: free count underflow"));
-	if (vm_paging_needed(vmd, new) && !vm_paging_needed(vmd, old))
-		pagedaemon_wakeup(vmd->vmd_domain);
-	/* Only update bitsets on transitions. */
-	if ((old >= vmd->vmd_free_min && new < vmd->vmd_free_min) ||
-	    (old >= vmd->vmd_free_severe && new < vmd->vmd_free_severe))
-		vm_domain_set(vmd);
-}
-
 
 #endif	/* _KERNEL */
 #endif				/* !_VM_PAGEQUEUE_ */

Modified: head/sys/vm/vm_reserv.c
==============================================================================
--- head/sys/vm/vm_reserv.c	Thu Mar 22 19:11:43 2018	(r331368)
+++ head/sys/vm/vm_reserv.c	Thu Mar 22 19:21:11 2018	(r331369)
@@ -45,6 +45,8 @@ __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/kernel.h>
+#include <sys/counter.h>
+#include <sys/ktr.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
@@ -54,6 +56,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 #include <sys/vmmeter.h>
+#include <sys/smp.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
@@ -166,22 +169,37 @@ popmap_is_set(popmap_t popmap[], int i)
  *
  * A partially populated reservation can be broken and reclaimed at any time.
  *
- * f - vm_domain_free_lock
+ * r - vm_reserv_lock
+ * d - vm_reserv_domain_lock
  * o - vm_reserv_object_lock
  * c - constant after boot
  */
 struct vm_reserv {
-	TAILQ_ENTRY(vm_reserv) partpopq;	/* (f) per-domain queue. */
-	LIST_ENTRY(vm_reserv) objq;		/* (o, f) object queue */
-	vm_object_t	object;			/* (o, f) containing object */
-	vm_pindex_t	pindex;			/* (o, f) offset in object */
+	struct mtx	lock;			/* reservation lock. */
+	TAILQ_ENTRY(vm_reserv) partpopq;	/* (d) per-domain queue. */
+	LIST_ENTRY(vm_reserv) objq;		/* (o, r) object queue */
+	vm_object_t	object;			/* (o, r) containing object */
+	vm_pindex_t	pindex;			/* (o, r) offset in object */
 	vm_page_t	pages;			/* (c) first page  */
-	int		domain;			/* (c) NUMA domain. */
-	int		popcnt;			/* (f) # of pages in use */
-	char		inpartpopq;		/* (f) */
-	popmap_t	popmap[NPOPMAP];	/* (f) bit vector, used pages */
+	uint16_t	domain;			/* (c) NUMA domain. */
+	uint16_t	popcnt;			/* (r) # of pages in use */
+	char		inpartpopq;		/* (d) */
+	popmap_t	popmap[NPOPMAP];	/* (r) bit vector, used pages */
 };
 
+#define	vm_reserv_lockptr(rv)		(&(rv)->lock)
+#define	vm_reserv_assert_locked(rv)					\
+	    mtx_assert(vm_reserv_lockptr(rv), MA_OWNED)
+#define	vm_reserv_lock(rv)		mtx_lock(vm_reserv_lockptr(rv))
+#define	vm_reserv_trylock(rv)		mtx_trylock(vm_reserv_lockptr(rv))
+#define	vm_reserv_unlock(rv)		mtx_unlock(vm_reserv_lockptr(rv))
+
+static struct mtx_padalign vm_reserv_domain_locks[MAXMEMDOM];
+
+#define	vm_reserv_domain_lockptr(d)	&vm_reserv_domain_locks[(d)]
+#define	vm_reserv_domain_lock(d)	mtx_lock(vm_reserv_domain_lockptr(d))
+#define	vm_reserv_domain_unlock(d)	mtx_unlock(vm_reserv_domain_lockptr(d))
+
 /*
  * The reservation array
  *
@@ -218,13 +236,13 @@ static TAILQ_HEAD(, vm_reserv) vm_rvq_partpop[MAXMEMDO
 
 static SYSCTL_NODE(_vm, OID_AUTO, reserv, CTLFLAG_RD, 0, "Reservation Info");
 
-static long vm_reserv_broken;
-SYSCTL_LONG(_vm_reserv, OID_AUTO, broken, CTLFLAG_RD,
-    &vm_reserv_broken, 0, "Cumulative number of broken reservations");
+static counter_u64_t vm_reserv_broken = EARLY_COUNTER;
+SYSCTL_COUNTER_U64(_vm_reserv, OID_AUTO, broken, CTLFLAG_RD,
+    &vm_reserv_broken, "Cumulative number of broken reservations");
 
-static long vm_reserv_freed;
-SYSCTL_LONG(_vm_reserv, OID_AUTO, freed, CTLFLAG_RD,
-    &vm_reserv_freed, 0, "Cumulative number of freed reservations");
+static counter_u64_t vm_reserv_freed = EARLY_COUNTER;
+SYSCTL_COUNTER_U64(_vm_reserv, OID_AUTO, freed, CTLFLAG_RD,
+    &vm_reserv_freed, "Cumulative number of freed reservations");
 
 static int sysctl_vm_reserv_fullpop(SYSCTL_HANDLER_ARGS);
 
@@ -236,9 +254,9 @@ static int sysctl_vm_reserv_partpopq(SYSCTL_HANDLER_AR
 SYSCTL_OID(_vm_reserv, OID_AUTO, partpopq, CTLTYPE_STRING | CTLFLAG_RD, NULL, 0,
     sysctl_vm_reserv_partpopq, "A", "Partially populated reservation queues");
 
-static long vm_reserv_reclaimed;
-SYSCTL_LONG(_vm_reserv, OID_AUTO, reclaimed, CTLFLAG_RD,
-    &vm_reserv_reclaimed, 0, "Cumulative number of reclaimed reservations");
+static counter_u64_t vm_reserv_reclaimed = EARLY_COUNTER;
+SYSCTL_COUNTER_U64(_vm_reserv, OID_AUTO, reclaimed, CTLFLAG_RD,
+    &vm_reserv_reclaimed, "Cumulative number of reclaimed reservations");
 
 /*
  * The object lock pool is used to synchronize the rvq.  We can not use a
@@ -313,12 +331,12 @@ sysctl_vm_reserv_partpopq(SYSCTL_HANDLER_ARGS)
 		for (level = -1; level <= VM_NRESERVLEVEL - 2; level++) {
 			counter = 0;
 			unused_pages = 0;
-			vm_domain_free_lock(VM_DOMAIN(domain));
+			vm_reserv_domain_lock(domain);
 			TAILQ_FOREACH(rv, &vm_rvq_partpop[domain], partpopq) {
 				counter++;
 				unused_pages += VM_LEVEL_0_NPAGES - rv->popcnt;
 			}
-			vm_domain_free_unlock(VM_DOMAIN(domain));
+			vm_reserv_domain_unlock(domain);
 			sbuf_printf(&sbuf, "%6d, %7d, %6dK, %6d\n",
 			    domain, level,
 			    unused_pages * ((int)PAGE_SIZE / 1024), counter);
@@ -337,6 +355,9 @@ vm_reserv_remove(vm_reserv_t rv)
 {
 	vm_object_t object;
 
+	vm_reserv_assert_locked(rv);
+	CTR5(KTR_VM, "%s: rv %p object %p popcnt %d inpartpop %d",
+	    __FUNCTION__, rv, rv->object, rv->popcnt, rv->inpartpopq);
 	KASSERT(rv->object != NULL,
 	    ("vm_reserv_remove: reserv %p is free", rv));
 	KASSERT(!rv->inpartpopq,
@@ -356,6 +377,11 @@ vm_reserv_insert(vm_reserv_t rv, vm_object_t object, v
 {
 	int i;
 
+	vm_reserv_assert_locked(rv);
+	CTR6(KTR_VM,
+	    "%s: rv %p(%p) object %p new %p popcnt %d",
+	    __FUNCTION__, rv, rv->pages, rv->object, object,
+	   rv->popcnt);
 	KASSERT(rv->object == NULL,
 	    ("vm_reserv_insert: reserv %p isn't free", rv));
 	KASSERT(rv->popcnt == 0,
@@ -377,14 +403,15 @@ vm_reserv_insert(vm_reserv_t rv, vm_object_t object, v
  * becomes zero, the reservation is destroyed.  Additionally, moves the
  * reservation to the tail of the partially populated reservation queue if the
  * population count is non-zero.
- *
- * The free page queue lock must be held.
  */
 static void
 vm_reserv_depopulate(vm_reserv_t rv, int index)
 {
+	struct vm_domain *vmd;
 
-	vm_domain_free_assert_locked(VM_DOMAIN(rv->domain));
+	vm_reserv_assert_locked(rv);
+	CTR5(KTR_VM, "%s: rv %p object %p popcnt %d inpartpop %d",
+	    __FUNCTION__, rv, rv->object, rv->popcnt, rv->inpartpopq);
 	KASSERT(rv->object != NULL,
 	    ("vm_reserv_depopulate: reserv %p is free", rv));
 	KASSERT(popmap_is_set(rv->popmap, index),
@@ -395,10 +422,7 @@ vm_reserv_depopulate(vm_reserv_t rv, int index)
 	KASSERT(rv->domain >= 0 && rv->domain < vm_ndomains,
 	    ("vm_reserv_depopulate: reserv %p's domain is corrupted %d",
 	    rv, rv->domain));
-	if (rv->inpartpopq) {
-		TAILQ_REMOVE(&vm_rvq_partpop[rv->domain], rv, partpopq);
-		rv->inpartpopq = FALSE;
-	} else {
+	if (rv->popcnt == VM_LEVEL_0_NPAGES) {
 		KASSERT(rv->pages->psind == 1,
 		    ("vm_reserv_depopulate: reserv %p is already demoted",
 		    rv));
@@ -406,14 +430,25 @@ vm_reserv_depopulate(vm_reserv_t rv, int index)
 	}
 	popmap_clear(rv->popmap, index);
 	rv->popcnt--;
+	vm_reserv_domain_lock(rv->domain);
+	if (rv->inpartpopq) {
+		TAILQ_REMOVE(&vm_rvq_partpop[rv->domain], rv, partpopq);
+		rv->inpartpopq = FALSE;
+	}
+	if (rv->popcnt != 0) {
+		rv->inpartpopq = TRUE;
+		TAILQ_INSERT_TAIL(&vm_rvq_partpop[rv->domain], rv, partpopq);
+	}
+	vm_reserv_domain_unlock(rv->domain);
+	vmd = VM_DOMAIN(rv->domain);
 	if (rv->popcnt == 0) {
 		vm_reserv_remove(rv);
+		vm_domain_free_lock(vmd);
 		vm_phys_free_pages(rv->pages, VM_LEVEL_0_ORDER);
-		vm_reserv_freed++;
-	} else {
-		rv->inpartpopq = TRUE;
-		TAILQ_INSERT_TAIL(&vm_rvq_partpop[rv->domain], rv, partpopq);
+		vm_domain_free_unlock(vmd);
+		counter_u64_add(vm_reserv_freed, 1);
 	}
+	vm_domain_freecnt_inc(vmd, 1);
 }
 
 /*
@@ -484,7 +519,9 @@ static void
 vm_reserv_populate(vm_reserv_t rv, int index)
 {
 
-	vm_domain_free_assert_locked(VM_DOMAIN(rv->domain));
+	vm_reserv_assert_locked(rv);
+	CTR5(KTR_VM, "%s: rv %p object %p popcnt %d inpartpop %d",
+	    __FUNCTION__, rv, rv->object, rv->popcnt, rv->inpartpopq);
 	KASSERT(rv->object != NULL,
 	    ("vm_reserv_populate: reserv %p is free", rv));
 	KASSERT(popmap_is_clear(rv->popmap, index),
@@ -497,17 +534,23 @@ vm_reserv_populate(vm_reserv_t rv, int index)
 	KASSERT(rv->domain >= 0 && rv->domain < vm_ndomains,
 	    ("vm_reserv_populate: reserv %p's domain is corrupted %d",
 	    rv, rv->domain));
+	popmap_set(rv->popmap, index);
+	rv->popcnt++;
+	vm_reserv_domain_lock(rv->domain);
 	if (rv->inpartpopq) {
 		TAILQ_REMOVE(&vm_rvq_partpop[rv->domain], rv, partpopq);
 		rv->inpartpopq = FALSE;
 	}
-	popmap_set(rv->popmap, index);
-	rv->popcnt++;
 	if (rv->popcnt < VM_LEVEL_0_NPAGES) {
 		rv->inpartpopq = TRUE;
 		TAILQ_INSERT_TAIL(&vm_rvq_partpop[rv->domain], rv, partpopq);
-	} else
+	} else {
+		KASSERT(rv->pages->psind == 0,
+		    ("vm_reserv_populate: reserv %p is already promoted",
+		    rv));
 		rv->pages->psind = 1;
+	}
+	vm_reserv_domain_unlock(rv->domain);
 }
 
 /*
@@ -572,31 +615,29 @@ vm_reserv_extend_contig(int req, vm_object_t object, v
 		return (NULL);
 	domain = rv->domain;
 	vmd = VM_DOMAIN(domain);
-	vm_domain_free_lock(vmd);
-	if (rv->object != object || !vm_domain_available(vmd, req, npages)) {
-		m = NULL;
+	vm_reserv_lock(rv);
+	if (rv->object != object)
 		goto out;
-	}
 	m = &rv->pages[index];
 	pa = VM_PAGE_TO_PHYS(m);
 	if (pa < low || pa + size > high || (pa & (alignment - 1)) != 0 ||
-	    ((pa ^ (pa + size - 1)) & ~(boundary - 1)) != 0) {
-		m = NULL;
+	    ((pa ^ (pa + size - 1)) & ~(boundary - 1)) != 0)
 		goto out;
-	}
 	/* Handle vm_page_rename(m, new_object, ...). */
 	for (i = 0; i < npages; i++) {
-		if (popmap_is_set(rv->popmap, index + i)) {
-			m = NULL;
+		if (popmap_is_set(rv->popmap, index + i))
 			goto out;
-		}
 	}
+	if (!vm_domain_allocate(vmd, req, npages))
+		goto out;
 	for (i = 0; i < npages; i++)
 		vm_reserv_populate(rv, index + i);
-	vm_domain_freecnt_dec(vmd, npages);
-out:
-	vm_domain_free_unlock(vmd);
+	vm_reserv_unlock(rv);
 	return (m);
+
+out:
+	vm_reserv_unlock(rv);
+	return (NULL);
 }
 
 /*
@@ -618,10 +659,11 @@ out:
  * The object and free page queue must be locked.
  */
 vm_page_t
-vm_reserv_alloc_contig(vm_object_t object, vm_pindex_t pindex, int domain,
+vm_reserv_alloc_contig(int req, vm_object_t object, vm_pindex_t pindex, int domain,
     u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment,
     vm_paddr_t boundary, vm_page_t mpred)
 {
+	struct vm_domain *vmd;
 	vm_paddr_t pa, size;
 	vm_page_t m, m_ret, msucc;
 	vm_pindex_t first, leftcap, rightcap;
@@ -629,7 +671,6 @@ vm_reserv_alloc_contig(vm_object_t object, vm_pindex_t
 	u_long allocpages, maxpages, minpages;
 	int i, index, n;
 
-	vm_domain_free_assert_locked(VM_DOMAIN(domain));
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	KASSERT(npages != 0, ("vm_reserv_alloc_contig: npages is 0"));
 
@@ -737,9 +778,19 @@ vm_reserv_alloc_contig(vm_object_t object, vm_pindex_t
 	 * specified index may not be the first page within the first new
 	 * reservation.
 	 */
-	m = vm_phys_alloc_contig(domain, allocpages, low, high, ulmax(alignment,
-	    VM_LEVEL_0_SIZE), boundary > VM_LEVEL_0_SIZE ? boundary : 0);
-	if (m == NULL)
+	m = NULL;
+	vmd = VM_DOMAIN(domain);
+	if (vm_domain_allocate(vmd, req, npages)) {
+		vm_domain_free_lock(vmd);
+		m = vm_phys_alloc_contig(domain, allocpages, low, high,
+		    ulmax(alignment, VM_LEVEL_0_SIZE),
+		    boundary > VM_LEVEL_0_SIZE ? boundary : 0);
+		vm_domain_free_unlock(vmd);
+		if (m == NULL) {
+			vm_domain_freecnt_inc(vmd, npages);
+			return (NULL);
+		}
+	} else
 		return (NULL);
 	KASSERT(vm_phys_domain(m) == domain,
 	    ("vm_reserv_alloc_contig: Page domain does not match requested."));
@@ -757,6 +808,7 @@ vm_reserv_alloc_contig(vm_object_t object, vm_pindex_t
 		KASSERT(rv->pages == m,
 		    ("vm_reserv_alloc_contig: reserv %p's pages is corrupted",
 		    rv));
+		vm_reserv_lock(rv);
 		vm_reserv_insert(rv, object, first);
 		n = ulmin(VM_LEVEL_0_NPAGES - index, npages);
 		for (i = 0; i < n; i++)
@@ -766,6 +818,7 @@ vm_reserv_alloc_contig(vm_object_t object, vm_pindex_t
 			m_ret = &rv->pages[index];
 			index = 0;
 		}
+		vm_reserv_unlock(rv);
 		m += VM_LEVEL_0_NPAGES;
 		first += VM_LEVEL_0_NPAGES;
 		allocpages -= VM_LEVEL_0_NPAGES;
@@ -813,18 +866,20 @@ vm_reserv_extend(int req, vm_object_t object, vm_pinde
 	vmd = VM_DOMAIN(domain);
 	index = VM_RESERV_INDEX(object, pindex);
 	m = &rv->pages[index];
-	vm_domain_free_lock(vmd);
-	if (vm_domain_available(vmd, req, 1) == 0 ||
-	    /* Handle reclaim race. */
-	    rv->object != object ||
+	vm_reserv_lock(rv);
+	/* Handle reclaim race. */
+	if (rv->object != object ||
 	    /* Handle vm_page_rename(m, new_object, ...). */
-	    popmap_is_set(rv->popmap, index))
+	    popmap_is_set(rv->popmap, index)) {
 		m = NULL;
-	if (m != NULL) {
-		vm_reserv_populate(rv, index);
-		vm_domain_freecnt_dec(vmd, 1);
+		goto out;
 	}
-	vm_domain_free_unlock(vmd);
+	if (vm_domain_allocate(vmd, req, 1) == 0)
+		m = NULL;
+	else
+		vm_reserv_populate(rv, index);
+out:
+	vm_reserv_unlock(rv);
 
 	return (m);
 }
@@ -840,15 +895,15 @@ vm_reserv_extend(int req, vm_object_t object, vm_pinde
  * The object and free page queue must be locked.
  */
 vm_page_t
-vm_reserv_alloc_page(vm_object_t object, vm_pindex_t pindex, int domain,
+vm_reserv_alloc_page(int req, vm_object_t object, vm_pindex_t pindex, int domain,
     vm_page_t mpred)
 {
+	struct vm_domain *vmd;
 	vm_page_t m, msucc;
 	vm_pindex_t first, leftcap, rightcap;
 	vm_reserv_t rv;
 	int index;
 
-	vm_domain_free_assert_locked(VM_DOMAIN(domain));
 	VM_OBJECT_ASSERT_WLOCKED(object);
 
 	/*
@@ -917,15 +972,28 @@ vm_reserv_alloc_page(vm_object_t object, vm_pindex_t p
 	/*
 	 * Allocate and populate the new reservation.
 	 */
-	m = vm_phys_alloc_pages(domain, VM_FREEPOOL_DEFAULT, VM_LEVEL_0_ORDER);
-	if (m == NULL)
+	m = NULL;
+	vmd = VM_DOMAIN(domain);
+	if (vm_domain_allocate(vmd, req, 1)) {
+		vm_domain_free_lock(vmd);
+		m = vm_phys_alloc_pages(domain, VM_FREEPOOL_DEFAULT,
+		    VM_LEVEL_0_ORDER);
+		vm_domain_free_unlock(vmd);
+		if (m == NULL) {
+			vm_domain_freecnt_inc(vmd, 1);
+			return (NULL);
+		}
+	} else
 		return (NULL);
 	rv = vm_reserv_from_page(m);
+	vm_reserv_lock(rv);
 	KASSERT(rv->pages == m,
 	    ("vm_reserv_alloc_page: reserv %p's pages is corrupted", rv));
 	vm_reserv_insert(rv, object, first);
 	index = VM_RESERV_INDEX(object, pindex);
 	vm_reserv_populate(rv, index);
+	vm_reserv_unlock(rv);
+
 	return (&rv->pages[index]);
 }
 
@@ -942,7 +1010,9 @@ vm_reserv_break(vm_reserv_t rv)
 {
 	int begin_zeroes, hi, i, lo;
 
-	vm_domain_free_assert_locked(VM_DOMAIN(rv->domain));
+	vm_reserv_assert_locked(rv);
+	CTR5(KTR_VM, "%s: rv %p object %p popcnt %d inpartpop %d",
+	    __FUNCTION__, rv, rv->object, rv->popcnt, rv->inpartpopq);
 	vm_reserv_remove(rv);
 	rv->pages->psind = 0;
 	i = hi = 0;
@@ -981,12 +1051,14 @@ vm_reserv_break(vm_reserv_t rv)
 		if (i != NPOPMAP)
 			/* Convert from ffsl() to ordinary bit numbering. */
 			hi--;
+		vm_domain_free_lock(VM_DOMAIN(rv->domain));
 		vm_phys_free_contig(&rv->pages[begin_zeroes], NBPOPMAP * i +
 		    hi - begin_zeroes);
+		vm_domain_free_unlock(VM_DOMAIN(rv->domain));
 	} while (i < NPOPMAP);
 	KASSERT(rv->popcnt == 0,
 	    ("vm_reserv_break: reserv %p's popcnt is corrupted", rv));
-	vm_reserv_broken++;
+	counter_u64_add(vm_reserv_broken, 1);
 }
 
 /*
@@ -996,7 +1068,6 @@ void
 vm_reserv_break_all(vm_object_t object)
 {
 	vm_reserv_t rv;
-	struct vm_domain *vmd;
 
 	/*
 	 * This access of object->rvq is unsynchronized so that the
@@ -1005,27 +1076,22 @@ vm_reserv_break_all(vm_object_t object)
 	 * lock prevents new additions, so we are guaranteed that when
 	 * it returns NULL the object is properly empty.
 	 */
-	vmd = NULL;
 	while ((rv = LIST_FIRST(&object->rvq)) != NULL) {
-		if (vmd != VM_DOMAIN(rv->domain)) {
-			if (vmd != NULL)
-				vm_domain_free_unlock(vmd);
-			vmd = VM_DOMAIN(rv->domain);
-			vm_domain_free_lock(vmd);
-		}
+		vm_reserv_lock(rv);
 		/* Reclaim race. */
-		if (rv->object != object)
+		if (rv->object != object) {
+			vm_reserv_unlock(rv);
 			continue;
-		KASSERT(rv->object == object,
-		    ("vm_reserv_break_all: reserv %p is corrupted", rv));
+		}
+		vm_reserv_domain_lock(rv->domain);
 		if (rv->inpartpopq) {
 			TAILQ_REMOVE(&vm_rvq_partpop[rv->domain], rv, partpopq);
 			rv->inpartpopq = FALSE;
 		}
+		vm_reserv_domain_unlock(rv->domain);
 		vm_reserv_break(rv);
+		vm_reserv_unlock(rv);
 	}
-	if (vmd != NULL)
-		vm_domain_free_unlock(vmd);
 }
 
 /*
@@ -1038,13 +1104,21 @@ boolean_t
 vm_reserv_free_page(vm_page_t m)
 {
 	vm_reserv_t rv;
+	boolean_t ret;
 
 	rv = vm_reserv_from_page(m);
 	if (rv->object == NULL)
 		return (FALSE);
-	vm_domain_free_assert_locked(VM_DOMAIN(rv->domain));
-	vm_reserv_depopulate(rv, m - rv->pages);
-	return (TRUE);
+	vm_reserv_lock(rv);
+	/* Re-validate after lock. */
+	if (rv->object != NULL) {
+		vm_reserv_depopulate(rv, m - rv->pages);
+		ret = TRUE;
+	} else
+		ret = FALSE;
+	vm_reserv_unlock(rv);
+
+	return (ret);
 }
 
 /*
@@ -1058,6 +1132,7 @@ vm_reserv_init(void)
 {
 	vm_paddr_t paddr;
 	struct vm_phys_seg *seg;
+	struct vm_reserv *rv;
 	int i, segind;
 
 	/*
@@ -1068,15 +1143,22 @@ vm_reserv_init(void)
 		seg = &vm_phys_segs[segind];
 		paddr = roundup2(seg->start, VM_LEVEL_0_SIZE);
 		while (paddr + VM_LEVEL_0_SIZE <= seg->end) {
-			vm_reserv_array[paddr >> VM_LEVEL_0_SHIFT].pages =
-			    PHYS_TO_VM_PAGE(paddr);
-			vm_reserv_array[paddr >> VM_LEVEL_0_SHIFT].domain =
-			    seg->domain;
+			rv = &vm_reserv_array[paddr >> VM_LEVEL_0_SHIFT];
+			rv->pages = PHYS_TO_VM_PAGE(paddr);
+			rv->domain = seg->domain;
+			mtx_init(&rv->lock, "vm reserv", NULL, MTX_DEF);
 			paddr += VM_LEVEL_0_SIZE;
 		}
 	}
-	for (i = 0; i < MAXMEMDOM; i++)
+	for (i = 0; i < MAXMEMDOM; i++) {
+		mtx_init(&vm_reserv_domain_locks[i], "VM reserv domain", NULL,
+		    MTX_DEF);
 		TAILQ_INIT(&vm_rvq_partpop[i]);
+	}
+
+	for (i = 0; i < VM_RESERV_OBJ_LOCK_COUNT; i++)
+		mtx_init(&vm_reserv_object_mtx[i], "resv obj lock", NULL,
+		    MTX_DEF);
 }
 
 /*
@@ -1091,7 +1173,6 @@ vm_reserv_is_page_free(vm_page_t m)
 	rv = vm_reserv_from_page(m);
 	if (rv->object == NULL)
 		return (false);
-	vm_domain_free_assert_locked(VM_DOMAIN(rv->domain));
 	return (popmap_is_clear(rv->popmap, m - rv->pages));
 }
 
@@ -1131,7 +1212,10 @@ static void
 vm_reserv_reclaim(vm_reserv_t rv)
 {
 
-	vm_domain_free_assert_locked(VM_DOMAIN(rv->domain));
+	vm_reserv_assert_locked(rv);
+	CTR5(KTR_VM, "%s: rv %p object %p popcnt %d inpartpop %d",
+	    __FUNCTION__, rv, rv->object, rv->popcnt, rv->inpartpopq);
+	vm_reserv_domain_lock(rv->domain);
 	KASSERT(rv->inpartpopq,
 	    ("vm_reserv_reclaim: reserv %p's inpartpopq is FALSE", rv));
 	KASSERT(rv->domain >= 0 && rv->domain < vm_ndomains,
@@ -1139,8 +1223,9 @@ vm_reserv_reclaim(vm_reserv_t rv)
 	    rv, rv->domain));
 	TAILQ_REMOVE(&vm_rvq_partpop[rv->domain], rv, partpopq);
 	rv->inpartpopq = FALSE;
+	vm_reserv_domain_unlock(rv->domain);
 	vm_reserv_break(rv);
-	vm_reserv_reclaimed++;
+	counter_u64_add(vm_reserv_reclaimed, 1);
 }
 
 /*
@@ -1155,9 +1240,14 @@ vm_reserv_reclaim_inactive(int domain)
 {
 	vm_reserv_t rv;
 
-	vm_domain_free_assert_locked(VM_DOMAIN(domain));
-	if ((rv = TAILQ_FIRST(&vm_rvq_partpop[domain])) != NULL) {
+	while ((rv = TAILQ_FIRST(&vm_rvq_partpop[domain])) != NULL) {
+		vm_reserv_lock(rv);
+		if (rv != TAILQ_FIRST(&vm_rvq_partpop[domain])) {
+			vm_reserv_unlock(rv);
+			continue;
+		}
 		vm_reserv_reclaim(rv);
+		vm_reserv_unlock(rv);
 		return (TRUE);
 	}
 	return (FALSE);
@@ -1176,14 +1266,16 @@ vm_reserv_reclaim_contig(int domain, u_long npages, vm
     vm_paddr_t high, u_long alignment, vm_paddr_t boundary)
 {
 	vm_paddr_t pa, size;
-	vm_reserv_t rv;
+	vm_reserv_t rv, rvn;
 	int hi, i, lo, low_index, next_free;
 
-	vm_domain_free_assert_locked(VM_DOMAIN(domain));
 	if (npages > VM_LEVEL_0_NPAGES - 1)
 		return (FALSE);
 	size = npages << PAGE_SHIFT;
-	TAILQ_FOREACH(rv, &vm_rvq_partpop[domain], partpopq) {
+	vm_reserv_domain_lock(domain);
+again:
+	for (rv = TAILQ_FIRST(&vm_rvq_partpop[domain]); rv != NULL; rv = rvn) {
+		rvn = TAILQ_NEXT(rv, partpopq);
 		pa = VM_PAGE_TO_PHYS(&rv->pages[VM_LEVEL_0_NPAGES - 1]);
 		if (pa + PAGE_SIZE - size < low) {
 			/* This entire reservation is too low; go to next. */
@@ -1194,6 +1286,17 @@ vm_reserv_reclaim_contig(int domain, u_long npages, vm
 			/* This entire reservation is too high; go to next. */
 			continue;
 		}
+		if (vm_reserv_trylock(rv) == 0) {
+			vm_reserv_domain_unlock(domain);
+			vm_reserv_lock(rv);
+			if (!rv->inpartpopq) {
+				vm_reserv_domain_lock(domain);
+				if (!rvn->inpartpopq)
+					goto again;
+				continue;
+			}
+		} else
+			vm_reserv_domain_unlock(domain);
 		if (pa < low) {
 			/* Start the search for free pages at "low". */
 			low_index = (low + PAGE_MASK - pa) >> PAGE_SHIFT;
@@ -1239,6 +1342,7 @@ vm_reserv_reclaim_contig(int domain, u_long npages, vm
 				if ((NBPOPMAP * i - next_free) * PAGE_SIZE >=
 				    size) {
 					vm_reserv_reclaim(rv);
+					vm_reserv_unlock(rv);
 					return (TRUE);
 				}
 				hi = ffsl(rv->popmap[i]);
@@ -1249,10 +1353,16 @@ vm_reserv_reclaim_contig(int domain, u_long npages, vm
 			if ((NBPOPMAP * i + hi - next_free) * PAGE_SIZE >=
 			    size) {
 				vm_reserv_reclaim(rv);
+				vm_reserv_unlock(rv);
 				return (TRUE);
 			}
 		} while (i < NPOPMAP);
+		vm_reserv_unlock(rv);
+		vm_reserv_domain_lock(domain);
+		if (rvn != NULL && !rvn->inpartpopq)
+			goto again;
 	}
+	vm_reserv_domain_unlock(domain);
 	return (FALSE);
 }
 
@@ -1270,7 +1380,11 @@ vm_reserv_rename(vm_page_t m, vm_object_t new_object, 
 	VM_OBJECT_ASSERT_WLOCKED(new_object);
 	rv = vm_reserv_from_page(m);
 	if (rv->object == old_object) {
-		vm_domain_free_lock(VM_DOMAIN(rv->domain));
+		vm_reserv_lock(rv);
+		CTR6(KTR_VM,
+		    "%s: rv %p object %p new %p popcnt %d inpartpop %d",

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
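
One subtlety in the vm_reserv_reclaim_contig() hunk above: the partpop
queue is walked under vm_reserv_domain_lock, but the per-reservation
lock must be taken too, so the code trylocks first and, on failure,
drops the queue lock, blocks on the reservation lock, and revalidates
inpartpopq before proceeding.  A rough pthreads sketch of that ordering
dance, with illustrative names:

#include <pthread.h>
#include <stdbool.h>

struct resv {
	pthread_mutex_t	lock;		/* vm_reserv_lock analogue */
	bool		inpartpopq;	/* still on the partpop queue? */
};

static pthread_mutex_t domain_lock = PTHREAD_MUTEX_INITIALIZER;

/*
 * Called with domain_lock held while walking the queue; returns with
 * only rv->lock held.  A "false" return means the reservation changed
 * while we slept and the caller must rescan the queue.
 */
static bool
lock_resv_from_queue(struct resv *rv)
{
	if (pthread_mutex_trylock(&rv->lock) == 0) {
		pthread_mutex_unlock(&domain_lock);
		return (true);
	}
	pthread_mutex_unlock(&domain_lock);
	pthread_mutex_lock(&rv->lock);
	if (rv->inpartpopq)
		return (true);
	pthread_mutex_unlock(&rv->lock);
	return (false);
}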


