Date:      Fri, 22 Nov 2019 16:28:52 +0000 (UTC)
From:      Mark Johnston <markj@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject:   svn commit: r355001 - head/sys/vm
Message-ID:  <201911221628.xAMGSq4R078678@repo.freebsd.org>

Author: markj
Date: Fri Nov 22 16:28:52 2019
New Revision: 355001
URL: https://svnweb.freebsd.org/changeset/base/355001

Log:
  Fix locking in vm_reserv_reclaim_contig().
  
  We were not properly handling the case where the trylock of the
  reservation fails, in which case we could leak the reservation lock.
  
  Introduce a marker reservation to implement precise scanning in
  vm_reserv_reclaim_contig().  Before, a race could result in early
  termination of the scan in rare situations.  Use the marker's lock to
  serialize scans of the partpop queue so that a global marker structure
  can be used.  Modify vm_reserv_reclaim_inactive() to handle the presence
  of a marker while minimizing the hold time of domain-global locks.
  
  Reviewed by:	alc, jeff, kib
  MFC after:	2 weeks
  Sponsored by:	The FreeBSD Foundation
  Differential Revision:	https://reviews.freebsd.org/D22392
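
  [Editorial note.]  For context, the pre-change scan in
  vm_reserv_reclaim_contig() can be read from the removed lines in the
  diff below.  Condensed, it was (the BUG annotation is editorial, not
  part of the original source):

	vm_reserv_domain_lock(domain);
again:
	for (rv = TAILQ_FIRST(&vm_rvd[domain].partpop); rv != NULL; rv = rvn) {
		rvn = TAILQ_NEXT(rv, partpopq);
		/* ... skip reservations outside [low, high) ... */
		if (vm_reserv_trylock(rv) == 0) {
			vm_reserv_domain_unlock(domain);
			vm_reserv_lock(rv);
			if (!rv->inpartpopq) {
				/*
				 * BUG: rv's lock is not released on
				 * either path out of this block.
				 */
				vm_reserv_domain_lock(domain);
				if (!rvn->inpartpopq)
					goto again;
				continue;
			}
		} else
			vm_reserv_domain_unlock(domain);
		/* ... test rv and reclaim it on success ... */
	}

  Besides the leaked lock, the cached successor rvn was revalidated only
  through its inpartpopq flag, so a reservation that was dequeued and
  requeued while the domain lock was dropped passed the check even though
  the scan's position was no longer meaningful; that is the race behind
  the rare early termination mentioned above.  The marker introduced in
  this change makes the resume position exact.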

Modified:
  head/sys/vm/vm_reserv.c
  head/sys/vm/vm_reserv.h

Modified: head/sys/vm/vm_reserv.c
==============================================================================
--- head/sys/vm/vm_reserv.c	Fri Nov 22 16:25:00 2019	(r355000)
+++ head/sys/vm/vm_reserv.c	Fri Nov 22 16:28:52 2019	(r355001)
@@ -181,10 +181,11 @@ popmap_is_set(popmap_t popmap[], int i)
  *
  * A partially populated reservation can be broken and reclaimed at any time.
  *
- * r - vm_reserv_lock
+ * c - constant after boot
  * d - vm_reserv_domain_lock
  * o - vm_reserv_object_lock
- * c - constant after boot
+ * r - vm_reserv_lock
+ * s - vm_reserv_domain_scan_lock
  */
 struct vm_reserv {
 	struct mtx	lock;			/* reservation lock. */
@@ -200,6 +201,8 @@ struct vm_reserv {
 	popmap_t	popmap[NPOPMAP_MAX];	/* (r) bit vector, used pages */
 };
 
+TAILQ_HEAD(vm_reserv_queue, vm_reserv);
+
 #define	vm_reserv_lockptr(rv)		(&(rv)->lock)
 #define	vm_reserv_assert_locked(rv)					\
 	    mtx_assert(vm_reserv_lockptr(rv), MA_OWNED)
@@ -238,18 +241,26 @@ static vm_reserv_t vm_reserv_array;
  * is the least recently changed, partially populated reservation.
  *
  * Access to this queue is synchronized by the per-domain reservation lock.
+ * Threads reclaiming free pages from the queue must hold the per-domain scan
+ * lock.
  */
 struct vm_reserv_domain {
-	struct mtx lock;
-	TAILQ_HEAD(, vm_reserv) partpop;
+	struct mtx 		lock;
+	struct vm_reserv_queue	partpop;	/* (d) */
+	struct vm_reserv	marker;		/* (d, s) scan marker/lock */
 } __aligned(CACHE_LINE_SIZE);
 
 static struct vm_reserv_domain vm_rvd[MAXMEMDOM];
 
 #define	vm_reserv_domain_lockptr(d)	(&vm_rvd[(d)].lock)
+#define	vm_reserv_domain_assert_locked(d)	\
+	mtx_assert(vm_reserv_domain_lockptr(d), MA_OWNED)
 #define	vm_reserv_domain_lock(d)	mtx_lock(vm_reserv_domain_lockptr(d))
 #define	vm_reserv_domain_unlock(d)	mtx_unlock(vm_reserv_domain_lockptr(d))
 
+#define	vm_reserv_domain_scan_lock(d)	mtx_lock(&vm_rvd[(d)].marker.lock)
+#define	vm_reserv_domain_scan_unlock(d)	mtx_unlock(&vm_rvd[(d)].marker.lock)
+
 static SYSCTL_NODE(_vm, OID_AUTO, reserv, CTLFLAG_RD, 0, "Reservation Info");
 
 static counter_u64_t vm_reserv_broken = EARLY_COUNTER;
@@ -350,6 +361,8 @@ sysctl_vm_reserv_partpopq(SYSCTL_HANDLER_ARGS)
 			unused_pages = 0;
 			vm_reserv_domain_lock(domain);
 			TAILQ_FOREACH(rv, &vm_rvd[domain].partpop, partpopq) {
+				if (rv == &vm_rvd[domain].marker)
+					continue;
 				counter++;
 				unused_pages += VM_LEVEL_0_NPAGES - rv->popcnt;
 			}
@@ -1037,7 +1050,8 @@ vm_reserv_init(void)
 	vm_paddr_t paddr;
 	struct vm_phys_seg *seg;
 	struct vm_reserv *rv;
-	int i, segind;
+	struct vm_reserv_domain *rvd;
+	int i, j, segind;
 
 	/*
 	 * Initialize the reservation array.  Specifically, initialize the
@@ -1056,8 +1070,18 @@ vm_reserv_init(void)
 		}
 	}
 	for (i = 0; i < MAXMEMDOM; i++) {
-		mtx_init(&vm_rvd[i].lock, "VM reserv domain", NULL, MTX_DEF);
-		TAILQ_INIT(&vm_rvd[i].partpop);
+		rvd = &vm_rvd[i];
+		mtx_init(&rvd->lock, "vm reserv domain", NULL, MTX_DEF);
+		TAILQ_INIT(&rvd->partpop);
+		mtx_init(&rvd->marker.lock, "vm reserv marker", NULL, MTX_DEF);
+
+		/*
+		 * Fully populated reservations should never be present in the
+		 * partially populated reservation queues.
+		 */
+		rvd->marker.popcnt = VM_LEVEL_0_NPAGES;
+		for (j = 0; j < NBPOPMAP; j++)
+			popmap_set(rvd->marker.popmap, j);
 	}
 
 	for (i = 0; i < VM_RESERV_OBJ_LOCK_COUNT; i++)
@@ -1107,50 +1131,71 @@ vm_reserv_level_iffullpop(vm_page_t m)
 }
 
 /*
- * Breaks the given partially populated reservation, releasing its free pages
- * to the physical memory allocator.
+ * Remove a partially populated reservation from the queue.
  */
 static void
-vm_reserv_reclaim(vm_reserv_t rv)
+vm_reserv_dequeue(vm_reserv_t rv)
 {
 
+	vm_reserv_domain_assert_locked(rv->domain);
 	vm_reserv_assert_locked(rv);
 	CTR5(KTR_VM, "%s: rv %p object %p popcnt %d inpartpop %d",
 	    __FUNCTION__, rv, rv->object, rv->popcnt, rv->inpartpopq);
-	vm_reserv_domain_lock(rv->domain);
 	KASSERT(rv->inpartpopq,
 	    ("vm_reserv_reclaim: reserv %p's inpartpopq is FALSE", rv));
-	KASSERT(rv->domain < vm_ndomains,
-	    ("vm_reserv_reclaim: reserv %p's domain is corrupted %d",
-	    rv, rv->domain));
+
 	TAILQ_REMOVE(&vm_rvd[rv->domain].partpop, rv, partpopq);
 	rv->inpartpopq = FALSE;
-	vm_reserv_domain_unlock(rv->domain);
+}
+
+/*
+ * Breaks the given partially populated reservation, releasing its free pages
+ * to the physical memory allocator.
+ */
+static void
+vm_reserv_reclaim(vm_reserv_t rv)
+{
+
+	vm_reserv_assert_locked(rv);
+	CTR5(KTR_VM, "%s: rv %p object %p popcnt %d inpartpop %d",
+	    __FUNCTION__, rv, rv->object, rv->popcnt, rv->inpartpopq);
+	if (rv->inpartpopq) {
+		vm_reserv_domain_lock(rv->domain);
+		vm_reserv_dequeue(rv);
+		vm_reserv_domain_unlock(rv->domain);
+	}
 	vm_reserv_break(rv);
 	counter_u64_add(vm_reserv_reclaimed, 1);
 }
 
 /*
- * Breaks the reservation at the head of the partially populated reservation
+ * Breaks a reservation near the head of the partially populated reservation
  * queue, releasing its free pages to the physical memory allocator.  Returns
  * TRUE if a reservation is broken and FALSE otherwise.
  */
-boolean_t
+bool
 vm_reserv_reclaim_inactive(int domain)
 {
 	vm_reserv_t rv;
 
-	while ((rv = TAILQ_FIRST(&vm_rvd[domain].partpop)) != NULL) {
-		vm_reserv_lock(rv);
-		if (rv != TAILQ_FIRST(&vm_rvd[domain].partpop)) {
-			vm_reserv_unlock(rv);
-			continue;
+	vm_reserv_domain_lock(domain);
+	TAILQ_FOREACH(rv, &vm_rvd[domain].partpop, partpopq) {
+		/*
+		 * A locked reservation is likely being updated or reclaimed,
+		 * so just skip ahead.
+		 */
+		if (rv != &vm_rvd[domain].marker && vm_reserv_trylock(rv)) {
+			vm_reserv_dequeue(rv);
+			break;
 		}
+	}
+	vm_reserv_domain_unlock(domain);
+	if (rv != NULL) {
 		vm_reserv_reclaim(rv);
 		vm_reserv_unlock(rv);
-		return (TRUE);
+		return (true);
 	}
-	return (FALSE);
+	return (false);
 }
 
 /*
@@ -1236,20 +1281,23 @@ vm_reserv_test_contig(vm_reserv_t rv, u_long npages, v
  * contiguous physical memory.  If a satisfactory reservation is found, it is
  * broken.  Returns true if a reservation is broken and false otherwise.
  */
-boolean_t
+bool
 vm_reserv_reclaim_contig(int domain, u_long npages, vm_paddr_t low,
     vm_paddr_t high, u_long alignment, vm_paddr_t boundary)
 {
+	struct vm_reserv_queue *queue;
 	vm_paddr_t pa, size;
-	vm_reserv_t rv, rvn;
+	vm_reserv_t marker, rv, rvn;
 
 	if (npages > VM_LEVEL_0_NPAGES - 1)
 		return (false);
+	marker = &vm_rvd[domain].marker;
+	queue = &vm_rvd[domain].partpop;
 	size = npages << PAGE_SHIFT;
+
+	vm_reserv_domain_scan_lock(domain);
 	vm_reserv_domain_lock(domain);
-again:
-	for (rv = TAILQ_FIRST(&vm_rvd[domain].partpop); rv != NULL; rv = rvn) {
-		rvn = TAILQ_NEXT(rv, partpopq);
+	TAILQ_FOREACH_SAFE(rv, queue, partpopq, rvn) {
 		pa = VM_PAGE_TO_PHYS(&rv->pages[0]);
 		if (pa + VM_LEVEL_0_SIZE - size < low) {
 			/* This entire reservation is too low; go to next. */
@@ -1259,29 +1307,35 @@ again:
 			/* This entire reservation is too high; go to next. */
 			continue;
 		}
+
 		if (vm_reserv_trylock(rv) == 0) {
+			TAILQ_INSERT_AFTER(queue, rv, marker, partpopq);
 			vm_reserv_domain_unlock(domain);
 			vm_reserv_lock(rv);
-			if (!rv->inpartpopq) {
+			if (!rv->inpartpopq ||
+			    TAILQ_NEXT(rv, partpopq) != marker) {
+				vm_reserv_unlock(rv);
 				vm_reserv_domain_lock(domain);
-				if (!rvn->inpartpopq)
-					goto again;
+				rvn = TAILQ_NEXT(marker, partpopq);
+				TAILQ_REMOVE(queue, marker, partpopq);
 				continue;
 			}
-		} else
-			vm_reserv_domain_unlock(domain);
+			vm_reserv_domain_lock(domain);
+			TAILQ_REMOVE(queue, marker, partpopq);
+		}
+		vm_reserv_domain_unlock(domain);
 		if (vm_reserv_test_contig(rv, npages, low, high,
 		    alignment, boundary)) {
+			vm_reserv_domain_scan_unlock(domain);
 			vm_reserv_reclaim(rv);
 			vm_reserv_unlock(rv);
 			return (true);
 		}
 		vm_reserv_unlock(rv);
 		vm_reserv_domain_lock(domain);
-		if (rvn != NULL && !rvn->inpartpopq)
-			goto again;
 	}
 	vm_reserv_domain_unlock(domain);
+	vm_reserv_domain_scan_unlock(domain);
 	return (false);
 }
 

Modified: head/sys/vm/vm_reserv.h
==============================================================================
--- head/sys/vm/vm_reserv.h	Fri Nov 22 16:25:00 2019	(r355000)
+++ head/sys/vm/vm_reserv.h	Fri Nov 22 16:28:52 2019	(r355001)
@@ -59,10 +59,10 @@ void		vm_reserv_init(void);
 bool		vm_reserv_is_page_free(vm_page_t m);
 int		vm_reserv_level(vm_page_t m);
 int		vm_reserv_level_iffullpop(vm_page_t m);
-boolean_t	vm_reserv_reclaim_contig(int domain, u_long npages,
+bool		vm_reserv_reclaim_contig(int domain, u_long npages,
 		    vm_paddr_t low, vm_paddr_t high, u_long alignment,
 		    vm_paddr_t boundary);
-boolean_t	vm_reserv_reclaim_inactive(int domain);
+bool		vm_reserv_reclaim_inactive(int domain);
 void		vm_reserv_rename(vm_page_t m, vm_object_t new_object,
 		    vm_object_t old_object, vm_pindex_t old_object_offset);
 int		vm_reserv_size(int level);


