Message-Id: <201911221628.xAMGSq4R078678@repo.freebsd.org>
From: Mark Johnston <markj@FreeBSD.org>
Date: Fri, 22 Nov 2019 16:28:52 +0000 (UTC)
To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject: svn commit: r355001 - head/sys/vm

Author: markj
Date: Fri Nov 22 16:28:52 2019
New Revision: 355001
URL: https://svnweb.freebsd.org/changeset/base/355001

Log:
  Fix locking in vm_reserv_reclaim_contig().

  We were not properly handling the case where the trylock of the
  reservation fails, in which case we could leak the reservation lock.

  Introduce a marker reservation to implement precise scanning in
  vm_reserv_reclaim_contig().  Before, a race could result in early
  termination of the scan in rare situations.  Use the marker's lock to
  serialize scans of the partpop queue so that a global marker structure
  can be used.  Modify vm_reserv_reclaim_inactive() to handle the
  presence of a marker while minimizing the hold time of domain-global
  locks.
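  (For context, not part of the commit message: the fix relies on the
  classic marker-node technique for scanning a queue whose lock must be
  dropped mid-scan.  The userspace C sketch below is illustrative only;
  every identifier in it, such as struct node, scan(), and queue_lock, is
  invented for this example, and the per-reservation trylocking of the
  kernel code is elided, so it assumes nodes outlive the scan.  It shows
  why a marker lets a scan resume precisely instead of restarting from
  the head.)

#include <sys/queue.h>	/* FreeBSD TAILQ macros */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct node {
	int	val;
	bool	is_marker;	/* Markers are never visited or freed. */
	TAILQ_ENTRY(node) link;
};

TAILQ_HEAD(nodeq, node);

static struct nodeq queue = TAILQ_HEAD_INITIALIZER(queue);
static pthread_mutex_t queue_lock = PTHREAD_MUTEX_INITIALIZER;

/*
 * Visit every element, dropping the queue lock around the work callback.
 * The marker records our position: the queue may be reshaped while the
 * lock is dropped, but the marker stays put, so the scan resumes exactly
 * where it left off.  (Restarting from the head on queue changes, as the
 * pre-r355001 code did, could instead terminate the scan early.)
 */
static void
scan(void (*work)(struct node *))
{
	struct node marker = { .is_marker = true };
	struct node *n, *next;

	pthread_mutex_lock(&queue_lock);
	TAILQ_FOREACH_SAFE(n, &queue, link, next) {
		if (n->is_marker)
			continue;	/* Skip other scanners' markers. */
		TAILQ_INSERT_AFTER(&queue, n, &marker, link);
		pthread_mutex_unlock(&queue_lock);

		work(n);		/* Others may modify the queue here. */

		pthread_mutex_lock(&queue_lock);
		next = TAILQ_NEXT(&marker, link);	/* New resume point. */
		TAILQ_REMOVE(&queue, &marker, link);
	}
	pthread_mutex_unlock(&queue_lock);
}

static void
print_node(struct node *n)
{
	printf("visited %d\n", n->val);
}

int
main(void)
{
	struct node nodes[4];

	for (int i = 0; i < 4; i++) {
		nodes[i].val = i;
		nodes[i].is_marker = false;
		TAILQ_INSERT_TAIL(&queue, &nodes[i], link);
	}
	scan(print_node);
	return (0);
}

  The committed code additionally trylocks each reservation and takes a
  dedicated scan lock so that one global marker per domain suffices; the
  sketch sidesteps that by giving each scan its own stack-allocated
  marker.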
  Reviewed by:	alc, jeff, kib
  MFC after:	2 weeks
  Sponsored by:	The FreeBSD Foundation
  Differential Revision:	https://reviews.freebsd.org/D22392

Modified:
  head/sys/vm/vm_reserv.c
  head/sys/vm/vm_reserv.h

Modified: head/sys/vm/vm_reserv.c
==============================================================================
--- head/sys/vm/vm_reserv.c	Fri Nov 22 16:25:00 2019	(r355000)
+++ head/sys/vm/vm_reserv.c	Fri Nov 22 16:28:52 2019	(r355001)
@@ -181,10 +181,11 @@ popmap_is_set(popmap_t popmap[], int i)
  *
  * A partially populated reservation can be broken and reclaimed at any time.
  *
- * r - vm_reserv_lock
+ * c - constant after boot
  * d - vm_reserv_domain_lock
  * o - vm_reserv_object_lock
- * c - constant after boot
+ * r - vm_reserv_lock
+ * s - vm_reserv_domain_scan_lock
  */
 struct vm_reserv {
 	struct mtx	lock;			/* reservation lock. */
@@ -200,6 +201,8 @@ struct vm_reserv {
 	popmap_t	popmap[NPOPMAP_MAX];	/* (r) bit vector, used pages */
 };
 
+TAILQ_HEAD(vm_reserv_queue, vm_reserv);
+
 #define	vm_reserv_lockptr(rv)		(&(rv)->lock)
 #define	vm_reserv_assert_locked(rv)					\
 	mtx_assert(vm_reserv_lockptr(rv), MA_OWNED)
@@ -238,18 +241,26 @@ static vm_reserv_t vm_reserv_array;
  * is the least recently changed, partially populated reservation.
  *
  * Access to this queue is synchronized by the per-domain reservation lock.
+ * Threads reclaiming free pages from the queue must hold the per-domain scan
+ * lock.
  */
 struct vm_reserv_domain {
-	struct mtx lock;
-	TAILQ_HEAD(, vm_reserv) partpop;
+	struct mtx	lock;
+	struct vm_reserv_queue partpop;	/* (d) */
+	struct vm_reserv marker;	/* (d, s) scan marker/lock */
 } __aligned(CACHE_LINE_SIZE);
 
 static struct vm_reserv_domain vm_rvd[MAXMEMDOM];
 
 #define	vm_reserv_domain_lockptr(d)	(&vm_rvd[(d)].lock)
+#define	vm_reserv_domain_assert_locked(d)	\
+	mtx_assert(vm_reserv_domain_lockptr(d), MA_OWNED)
 #define	vm_reserv_domain_lock(d)	mtx_lock(vm_reserv_domain_lockptr(d))
 #define	vm_reserv_domain_unlock(d)	mtx_unlock(vm_reserv_domain_lockptr(d))
 
+#define	vm_reserv_domain_scan_lock(d)	mtx_lock(&vm_rvd[(d)].marker.lock)
+#define	vm_reserv_domain_scan_unlock(d)	mtx_unlock(&vm_rvd[(d)].marker.lock)
+
 static SYSCTL_NODE(_vm, OID_AUTO, reserv, CTLFLAG_RD, 0, "Reservation Info");
 
 static counter_u64_t vm_reserv_broken = EARLY_COUNTER;
@@ -350,6 +361,8 @@ sysctl_vm_reserv_partpopq(SYSCTL_HANDLER_ARGS)
 		unused_pages = 0;
 		vm_reserv_domain_lock(domain);
 		TAILQ_FOREACH(rv, &vm_rvd[domain].partpop, partpopq) {
+			if (rv == &vm_rvd[domain].marker)
+				continue;
 			counter++;
 			unused_pages += VM_LEVEL_0_NPAGES - rv->popcnt;
 		}
@@ -1037,7 +1050,8 @@ vm_reserv_init(void)
 	vm_paddr_t paddr;
 	struct vm_phys_seg *seg;
 	struct vm_reserv *rv;
-	int i, segind;
+	struct vm_reserv_domain *rvd;
+	int i, j, segind;
 
 	/*
 	 * Initialize the reservation array.  Specifically, initialize the
@@ -1056,8 +1070,18 @@
 		}
 	}
 	for (i = 0; i < MAXMEMDOM; i++) {
-		mtx_init(&vm_rvd[i].lock, "VM reserv domain", NULL, MTX_DEF);
-		TAILQ_INIT(&vm_rvd[i].partpop);
+		rvd = &vm_rvd[i];
+		mtx_init(&rvd->lock, "vm reserv domain", NULL, MTX_DEF);
+		TAILQ_INIT(&rvd->partpop);
+		mtx_init(&rvd->marker.lock, "vm reserv marker", NULL, MTX_DEF);
+
+		/*
+		 * Fully populated reservations should never be present in the
+		 * partially populated reservation queues.
+		 */
+		rvd->marker.popcnt = VM_LEVEL_0_NPAGES;
+		for (j = 0; j < NBPOPMAP; j++)
+			popmap_set(rvd->marker.popmap, j);
 	}
 
 	for (i = 0; i < VM_RESERV_OBJ_LOCK_COUNT; i++)
@@ -1107,50 +1131,71 @@ vm_reserv_level_iffullpop(vm_page_t m)
 }
 
 /*
- * Breaks the given partially populated reservation, releasing its free pages
- * to the physical memory allocator.
+ * Remove a partially populated reservation from the queue.
  */
 static void
-vm_reserv_reclaim(vm_reserv_t rv)
+vm_reserv_dequeue(vm_reserv_t rv)
 {
 
+	vm_reserv_domain_assert_locked(rv->domain);
 	vm_reserv_assert_locked(rv);
 	CTR5(KTR_VM, "%s: rv %p object %p popcnt %d inpartpop %d",
 	    __FUNCTION__, rv, rv->object, rv->popcnt, rv->inpartpopq);
-	vm_reserv_domain_lock(rv->domain);
 	KASSERT(rv->inpartpopq,
 	    ("vm_reserv_reclaim: reserv %p's inpartpopq is FALSE", rv));
-	KASSERT(rv->domain < vm_ndomains,
-	    ("vm_reserv_reclaim: reserv %p's domain is corrupted %d",
-	    rv, rv->domain));
+
 	TAILQ_REMOVE(&vm_rvd[rv->domain].partpop, rv, partpopq);
 	rv->inpartpopq = FALSE;
-	vm_reserv_domain_unlock(rv->domain);
+}
+
+/*
+ * Breaks the given partially populated reservation, releasing its free pages
+ * to the physical memory allocator.
+ */
+static void
+vm_reserv_reclaim(vm_reserv_t rv)
+{
+
+	vm_reserv_assert_locked(rv);
+	CTR5(KTR_VM, "%s: rv %p object %p popcnt %d inpartpop %d",
+	    __FUNCTION__, rv, rv->object, rv->popcnt, rv->inpartpopq);
+	if (rv->inpartpopq) {
+		vm_reserv_domain_lock(rv->domain);
+		vm_reserv_dequeue(rv);
+		vm_reserv_domain_unlock(rv->domain);
+	}
 	vm_reserv_break(rv);
 	counter_u64_add(vm_reserv_reclaimed, 1);
 }
 
 /*
- * Breaks the reservation at the head of the partially populated reservation
+ * Breaks a reservation near the head of the partially populated reservation
  * queue, releasing its free pages to the physical memory allocator.  Returns
  * TRUE if a reservation is broken and FALSE otherwise.
  */
-boolean_t
+bool
 vm_reserv_reclaim_inactive(int domain)
 {
 	vm_reserv_t rv;
 
-	while ((rv = TAILQ_FIRST(&vm_rvd[domain].partpop)) != NULL) {
-		vm_reserv_lock(rv);
-		if (rv != TAILQ_FIRST(&vm_rvd[domain].partpop)) {
-			vm_reserv_unlock(rv);
-			continue;
+	vm_reserv_domain_lock(domain);
+	TAILQ_FOREACH(rv, &vm_rvd[domain].partpop, partpopq) {
+		/*
+		 * A locked reservation is likely being updated or reclaimed,
+		 * so just skip ahead.
+		 */
+		if (rv != &vm_rvd[domain].marker && vm_reserv_trylock(rv)) {
+			vm_reserv_dequeue(rv);
+			break;
 		}
+	}
+	vm_reserv_domain_unlock(domain);
+	if (rv != NULL) {
 		vm_reserv_reclaim(rv);
 		vm_reserv_unlock(rv);
-		return (TRUE);
+		return (true);
 	}
-	return (FALSE);
+	return (false);
 }
 
 /*
@@ -1236,20 +1281,23 @@ vm_reserv_test_contig(vm_reserv_t rv, u_long npages, v
  * contiguous physical memory.  If a satisfactory reservation is found, it is
  * broken.  Returns true if a reservation is broken and false otherwise.
  */
-boolean_t
+bool
 vm_reserv_reclaim_contig(int domain, u_long npages, vm_paddr_t low,
     vm_paddr_t high, u_long alignment, vm_paddr_t boundary)
 {
+	struct vm_reserv_queue *queue;
 	vm_paddr_t pa, size;
-	vm_reserv_t rv, rvn;
+	vm_reserv_t marker, rv, rvn;
 
 	if (npages > VM_LEVEL_0_NPAGES - 1)
 		return (false);
+	marker = &vm_rvd[domain].marker;
+	queue = &vm_rvd[domain].partpop;
 	size = npages << PAGE_SHIFT;
+
+	vm_reserv_domain_scan_lock(domain);
 	vm_reserv_domain_lock(domain);
-again:
-	for (rv = TAILQ_FIRST(&vm_rvd[domain].partpop); rv != NULL; rv = rvn) {
-		rvn = TAILQ_NEXT(rv, partpopq);
+	TAILQ_FOREACH_SAFE(rv, queue, partpopq, rvn) {
 		pa = VM_PAGE_TO_PHYS(&rv->pages[0]);
 		if (pa + VM_LEVEL_0_SIZE - size < low) {
 			/* This entire reservation is too low; go to next. */
@@ -1259,29 +1307,35 @@ again:
 			/* This entire reservation is too high; go to next. */
 			continue;
 		}
+
 		if (vm_reserv_trylock(rv) == 0) {
+			TAILQ_INSERT_AFTER(queue, rv, marker, partpopq);
 			vm_reserv_domain_unlock(domain);
 			vm_reserv_lock(rv);
-			if (!rv->inpartpopq) {
+			if (!rv->inpartpopq ||
+			    TAILQ_NEXT(rv, partpopq) != marker) {
+				vm_reserv_unlock(rv);
 				vm_reserv_domain_lock(domain);
-				if (!rvn->inpartpopq)
-					goto again;
+				rvn = TAILQ_NEXT(marker, partpopq);
+				TAILQ_REMOVE(queue, marker, partpopq);
 				continue;
 			}
-		} else
-			vm_reserv_domain_unlock(domain);
+			vm_reserv_domain_lock(domain);
+			TAILQ_REMOVE(queue, marker, partpopq);
+		}
+		vm_reserv_domain_unlock(domain);
 		if (vm_reserv_test_contig(rv, npages, low, high, alignment,
 		    boundary)) {
+			vm_reserv_domain_scan_unlock(domain);
 			vm_reserv_reclaim(rv);
 			vm_reserv_unlock(rv);
 			return (true);
 		}
 		vm_reserv_unlock(rv);
 		vm_reserv_domain_lock(domain);
-		if (rvn != NULL && !rvn->inpartpopq)
-			goto again;
 	}
 	vm_reserv_domain_unlock(domain);
+	vm_reserv_domain_scan_unlock(domain);
 	return (false);
 }
 

Modified: head/sys/vm/vm_reserv.h
==============================================================================
--- head/sys/vm/vm_reserv.h	Fri Nov 22 16:25:00 2019	(r355000)
+++ head/sys/vm/vm_reserv.h	Fri Nov 22 16:28:52 2019	(r355001)
@@ -59,10 +59,10 @@ void		vm_reserv_init(void);
 bool		vm_reserv_is_page_free(vm_page_t m);
 int		vm_reserv_level(vm_page_t m);
 int		vm_reserv_level_iffullpop(vm_page_t m);
-boolean_t	vm_reserv_reclaim_contig(int domain, u_long npages,
+bool		vm_reserv_reclaim_contig(int domain, u_long npages,
 		    vm_paddr_t low, vm_paddr_t high, u_long alignment,
 		    vm_paddr_t boundary);
-boolean_t	vm_reserv_reclaim_inactive(int domain);
+bool		vm_reserv_reclaim_inactive(int domain);
 void		vm_reserv_rename(vm_page_t m, vm_object_t new_object,
 		    vm_object_t old_object, vm_pindex_t old_object_offset);
 int		vm_reserv_size(int level);
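  (Usage note, not from the commit: both predicates now return plain bool,
  with true meaning "a reservation was broken, so free pages may have
  appeared," which makes them natural retry gates for an allocator
  fallback path.  The sketch below is hypothetical; try_alloc_contig() is
  an invented stand-in for the real physical-memory allocator entry
  point, and only the loop structure is the point.)

/*
 * Hypothetical caller: retry a contiguous allocation as long as
 * vm_reserv_reclaim_contig() reports that it actually broke a
 * reservation, i.e., that it released pages which may now satisfy
 * the request.
 */
static bool
alloc_contig_with_reclaim(int domain, u_long npages, vm_paddr_t low,
    vm_paddr_t high, u_long alignment, vm_paddr_t boundary)
{

	do {
		if (try_alloc_contig(domain, npages, low, high, alignment,
		    boundary))	/* invented allocator hook */
			return (true);
	} while (vm_reserv_reclaim_contig(domain, npages, low, high,
	    alignment, boundary));
	return (false);
}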