From owner-svn-src-head@FreeBSD.ORG Wed Jun 5 23:53:00 2013 Return-Path: Delivered-To: svn-src-head@freebsd.org Received: from mx1.freebsd.org (mx1.freebsd.org [IPv6:2001:1900:2254:206a::19:1]) by hub.freebsd.org (Postfix) with ESMTP id 9AC0A9AA; Wed, 5 Jun 2013 23:53:00 +0000 (UTC) (envelope-from jeff@FreeBSD.org) Received: from svn.freebsd.org (svn.freebsd.org [IPv6:2001:1900:2254:2068::e6a:0]) by mx1.freebsd.org (Postfix) with ESMTP id 7C5AB1A41; Wed, 5 Jun 2013 23:53:00 +0000 (UTC) Received: from svn.freebsd.org ([127.0.1.70]) by svn.freebsd.org (8.14.7/8.14.7) with ESMTP id r55Nr0pv096759; Wed, 5 Jun 2013 23:53:00 GMT (envelope-from jeff@svn.freebsd.org) Received: (from jeff@localhost) by svn.freebsd.org (8.14.7/8.14.5/Submit) id r55Nr0nx096758; Wed, 5 Jun 2013 23:53:00 GMT (envelope-from jeff@svn.freebsd.org) Message-Id: <201306052353.r55Nr0nx096758@svn.freebsd.org> From: Jeff Roberson Date: Wed, 5 Jun 2013 23:53:00 +0000 (UTC) To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org Subject: svn commit: r251446 - head/sys/kern X-SVN-Group: head MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit X-BeenThere: svn-src-head@freebsd.org X-Mailman-Version: 2.1.14 Precedence: list List-Id: SVN commit messages for the src tree for head/-current List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Wed, 05 Jun 2013 23:53:00 -0000 Author: jeff Date: Wed Jun 5 23:53:00 2013 New Revision: 251446 URL: http://svnweb.freebsd.org/changeset/base/251446 Log: - Consolidate duplicate code into support functions. - Split the bqlock into bqclean and bqdirty locks. - Only acquire the wakeup synchronization locks when we cross a threshold requiring them. - Restructure the way flushbufqueues() targets work so they are more smp friendly and sane. 
Reviewed by: kib Discussed with: mckusick, attilio Sponsored by: EMC / Isilon Storage Division M vfs_bio.c Modified: head/sys/kern/vfs_bio.c Modified: head/sys/kern/vfs_bio.c ============================================================================== --- head/sys/kern/vfs_bio.c Wed Jun 5 23:28:29 2013 (r251445) +++ head/sys/kern/vfs_bio.c Wed Jun 5 23:53:00 2013 (r251446) @@ -113,10 +113,11 @@ static void vfs_setdirty_locked_object(s static void vfs_vmio_release(struct buf *bp); static int vfs_bio_clcheck(struct vnode *vp, int size, daddr_t lblkno, daddr_t blkno); -static int buf_do_flush(struct vnode *vp); +static int buf_flush(struct vnode *vp, int); static int flushbufqueues(struct vnode *, int, int); static void buf_daemon(void); static void bremfreel(struct buf *bp); +static __inline void bd_wakeup(void); #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) static int sysctl_bufspace(SYSCTL_HANDLER_ARGS); @@ -217,8 +218,8 @@ SYSCTL_INT(_vfs, OID_AUTO, mappingrestar static int flushbufqtarget = 100; SYSCTL_INT(_vfs, OID_AUTO, flushbufqtarget, CTLFLAG_RW, &flushbufqtarget, 0, "Amount of work to do in flushbufqueues when helping bufdaemon"); -static long notbufdflashes; -SYSCTL_LONG(_vfs, OID_AUTO, notbufdflashes, CTLFLAG_RD, &notbufdflashes, 0, +static long notbufdflushes; +SYSCTL_LONG(_vfs, OID_AUTO, notbufdflushes, CTLFLAG_RD, &notbufdflushes, 0, "Number of dirty buffer flushes done by the bufdaemon helpers"); static long barrierwrites; SYSCTL_LONG(_vfs, OID_AUTO, barrierwrites, CTLFLAG_RW, &barrierwrites, 0, @@ -228,6 +229,37 @@ SYSCTL_INT(_vfs, OID_AUTO, unmapped_buf_ "Permit the use of the unmapped i/o"); /* + * Lock for the non-dirty bufqueues + */ +static struct mtx_padalign bqclean; + +/* + * Lock for the dirty queue. + */ +static struct mtx_padalign bqdirty; + +/* + * This lock synchronizes access to bd_request. 
+ */ +static struct mtx_padalign bdlock; + +/* + * This lock protects the runningbufreq and synchronizes runningbufwakeup and + * waitrunningbufspace(). + */ +static struct mtx_padalign rbreqlock; + +/* + * Lock that protects needsbuffer and the sleeps/wakeups surrounding it. + */ +static struct mtx_padalign nblock; + +/* + * Lock that protects bdirtywait. + */ +static struct mtx_padalign bdirtylock; + +/* * Wakeup point for bufdaemon, as well as indicator of whether it is already * active. Set to 1 when the bufdaemon is already "on" the queue, 0 when it * is idling. @@ -243,11 +275,6 @@ static int bd_request; static int bd_speedupreq; /* - * This lock synchronizes access to bd_request. - */ -static struct mtx bdlock; - -/* * bogus page -- for I/O to/from partially complete buffers * this is a temporary solution to the problem, but it is not * really that bad. it would be better to split the buffer @@ -263,25 +290,19 @@ vm_page_t bogus_page; */ static int runningbufreq; -/* - * This lock protects the runningbufreq and synchronizes runningbufwakeup and - * waitrunningbufspace(). - */ -static struct mtx rbreqlock; - /* * Synchronization (sleep/wakeup) variable for buffer requests. * Can contain the VFS_BIO_NEED flags defined below; setting/clearing is done * by and/or. - * Used in numdirtywakeup(), bufspacewakeup(), bufcountwakeup(), bwillwrite(), + * Used in numdirtywakeup(), bufspacewakeup(), bufcountadd(), bwillwrite(), * getnewbuf(), and getblk(). */ static int needsbuffer; /* - * Lock that protects needsbuffer and the sleeps/wakeups surrounding it. + * Synchronization for bwillwrite() waiters. */ -static struct mtx nblock; +static int bdirtywait; /* * Definitions for the buffer free lists. @@ -301,9 +322,6 @@ static TAILQ_HEAD(bqueues, buf) bufqueue static int bq_len[BUFFER_QUEUES]; #endif -/* Lock for the bufqueues */ -static struct mtx bqlock; - /* * Single global constant for BUF_WMESG, to avoid getting multiple references. 
* buf_wmesg is referred from macros. @@ -311,7 +329,6 @@ static struct mtx bqlock; const char *buf_wmesg = BUF_WMESG; #define VFS_BIO_NEED_ANY 0x01 /* any freeable buffer */ -#define VFS_BIO_NEED_DIRTYFLUSH 0x02 /* waiting for dirty buffer flush */ #define VFS_BIO_NEED_FREE 0x04 /* wait for free bufs, hi hysteresis */ #define VFS_BIO_NEED_BUFSPACE 0x08 /* wait for buf space, lo hysteresis */ @@ -337,25 +354,69 @@ sysctl_bufspace(SYSCTL_HANDLER_ARGS) #ifdef DIRECTIO extern void ffs_rawread_setup(void); #endif /* DIRECTIO */ + /* - * numdirtywakeup: + * bqlock: * - * If someone is blocked due to there being too many dirty buffers, - * and numdirtybuffers is now reasonable, wake them up. + * Return the appropriate queue lock based on the index. */ - -static __inline void -numdirtywakeup(int level) +static inline struct mtx * +bqlock(int qindex) { - if (numdirtybuffers <= level) { - mtx_lock(&nblock); - if (needsbuffer & VFS_BIO_NEED_DIRTYFLUSH) { - needsbuffer &= ~VFS_BIO_NEED_DIRTYFLUSH; - wakeup(&needsbuffer); - } - mtx_unlock(&nblock); + if (qindex == QUEUE_DIRTY) + return (struct mtx *)(&bqdirty); + return (struct mtx *)(&bqclean); +} + +/* + * bdirtywakeup: + * + * Wakeup any bwillwrite() waiters. + */ +static void +bdirtywakeup(void) +{ + mtx_lock(&bdirtylock); + if (bdirtywait) { + bdirtywait = 0; + wakeup(&bdirtywait); } + mtx_unlock(&bdirtylock); +} + +/* + * bdirtysub: + * + * Decrement the numdirtybuffers count by one and wakeup any + * threads blocked in bwillwrite(). + */ +static void +bdirtysub(void) +{ + + if (atomic_fetchadd_int(&numdirtybuffers, -1) == + (lodirtybuffers + hidirtybuffers) / 2) + bdirtywakeup(); +} + +/* + * bdirtyadd: + * + * Increment the numdirtybuffers count by one and wakeup the buf + * daemon if needed. + */ +static void +bdirtyadd(void) +{ + + /* + * Only do the wakeup once as we cross the boundary. The + * buf daemon will keep running until the condition clears. 
+ */ + if (atomic_fetchadd_int(&numdirtybuffers, 1) == + (lodirtybuffers + hidirtybuffers) / 2) + bd_wakeup(); } /* @@ -385,36 +446,59 @@ bufspacewakeup(void) } /* - * runningbufwakeup() - in-progress I/O accounting. + * runningwakeup: * + * Wake up processes that are waiting on asynchronous writes to fall + * below lorunningspace. + */ +static void +runningwakeup(void) +{ + + mtx_lock(&rbreqlock); + if (runningbufreq) { + runningbufreq = 0; + wakeup(&runningbufreq); + } + mtx_unlock(&rbreqlock); +} + +/* + * runningbufwakeup: + * + * Decrement the outstanding write count according. */ void runningbufwakeup(struct buf *bp) { + long space, bspace; - if (bp->b_runningbufspace) { - atomic_subtract_long(&runningbufspace, bp->b_runningbufspace); - bp->b_runningbufspace = 0; - mtx_lock(&rbreqlock); - if (runningbufreq && runningbufspace <= lorunningspace) { - runningbufreq = 0; - wakeup(&runningbufreq); - } - mtx_unlock(&rbreqlock); - } + if (bp->b_runningbufspace == 0) + return; + space = atomic_fetchadd_long(&runningbufspace, -bp->b_runningbufspace); + bspace = bp->b_runningbufspace; + bp->b_runningbufspace = 0; + /* + * Only acquire the lock and wakeup on the transition from exceeding + * the threshold to falling below it. + */ + if (space < lorunningspace) + return; + if (space - bspace > lorunningspace) + return; + runningwakeup(); } /* - * bufcountwakeup: + * bufcountadd: * * Called when a buffer has been added to one of the free queues to * account for the buffer and to wakeup anyone waiting for free buffers. * This typically occurs when large amounts of metadata are being handled * by the buffer cache ( else buffer space runs out first, usually ). */ - static __inline void -bufcountwakeup(struct buf *bp) +bufcountadd(struct buf *bp) { int old; @@ -435,6 +519,30 @@ bufcountwakeup(struct buf *bp) } /* + * bufcountsub: + * + * Decrement the numfreebuffers count as needed. 
+ */ +static void +bufcountsub(struct buf *bp) +{ + int old; + + /* + * Fixup numfreebuffers count. If the buffer is invalid or not + * delayed-write, the buffer was free and we must decrement + * numfreebuffers. + */ + if ((bp->b_flags & B_INVAL) || (bp->b_flags & B_DELWRI) == 0) { + KASSERT((bp->b_flags & B_INFREECNT) != 0, + ("buf %p not counted in numfreebuffers", bp)); + bp->b_flags &= ~B_INFREECNT; + old = atomic_fetchadd_int(&numfreebuffers, -1); + KASSERT(old > 0, ("numfreebuffers dropped to %d", old - 1)); + } +} + +/* * waitrunningbufspace() * * runningbufspace is a measure of the amount of I/O currently @@ -442,9 +550,6 @@ bufcountwakeup(struct buf *bp) * prevent creating huge backups of pending writes to a device. * Only asynchronous writes are governed by this function. * - * Reads will adjust runningbufspace, but will not block based on it. - * The read load has a side effect of reducing the allowed write load. - * * This does NOT turn an async write into a sync write. It waits * for earlier writes to complete and generally returns before the * caller's write has reached the device. 
@@ -485,13 +590,12 @@ vfs_buf_test_cache(struct buf *bp, } /* Wake up the buffer daemon if necessary */ -static __inline -void -bd_wakeup(int dirtybuflevel) +static __inline void +bd_wakeup(void) { mtx_lock(&bdlock); - if (bd_request == 0 && numdirtybuffers >= dirtybuflevel) { + if (bd_request == 0) { bd_request = 1; wakeup(&bd_request); } @@ -501,7 +605,6 @@ bd_wakeup(int dirtybuflevel) /* * bd_speedup - speedup the buffer cache flushing code */ - void bd_speedup(void) { @@ -656,10 +759,12 @@ bufinit(void) struct buf *bp; int i; - mtx_init(&bqlock, "buf queue lock", NULL, MTX_DEF); + mtx_init(&bqclean, "bufq clean lock", NULL, MTX_DEF); + mtx_init(&bqdirty, "bufq dirty lock", NULL, MTX_DEF); mtx_init(&rbreqlock, "runningbufspace lock", NULL, MTX_DEF); mtx_init(&nblock, "needsbuffer lock", NULL, MTX_DEF); mtx_init(&bdlock, "buffer daemon lock", NULL, MTX_DEF); + mtx_init(&bdirtylock, "dirty buf lock", NULL, MTX_DEF); /* next, make a null set of free lists */ for (i = 0; i < BUFFER_QUEUES; i++) @@ -831,15 +936,60 @@ bfreekva(struct buf *bp) } /* + * binsfree: + * + * Insert the buffer into the appropriate free list. + */ +static void +binsfree(struct buf *bp, int qindex) +{ + struct mtx *olock, *nlock; + + BUF_ASSERT_XLOCKED(bp); + + olock = bqlock(bp->b_qindex); + nlock = bqlock(qindex); + mtx_lock(olock); + /* Handle delayed bremfree() processing. */ + if (bp->b_flags & B_REMFREE) + bremfreel(bp); + + if (bp->b_qindex != QUEUE_NONE) + panic("binsfree: free buffer onto another queue???"); + + bp->b_qindex = qindex; + if (olock != nlock) { + mtx_unlock(olock); + mtx_lock(nlock); + } + if (bp->b_flags & B_AGE) + TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist); + else + TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp, b_freelist); +#ifdef INVARIANTS + bq_len[bp->b_qindex]++; +#endif + mtx_unlock(nlock); + + /* + * Something we can maybe free or reuse. 
+ */ + if (bp->b_bufsize && !(bp->b_flags & B_DELWRI)) + bufspacewakeup(); + + if ((bp->b_flags & B_INVAL) || !(bp->b_flags & B_DELWRI)) + bufcountadd(bp); +} + +/* * bremfree: * - * Mark the buffer for removal from the appropriate free list in brelse. + * Mark the buffer for removal from the appropriate free list. * */ void bremfree(struct buf *bp) { - int old; CTR3(KTR_BUF, "bremfree(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT((bp->b_flags & B_REMFREE) == 0, @@ -849,14 +999,7 @@ bremfree(struct buf *bp) BUF_ASSERT_XLOCKED(bp); bp->b_flags |= B_REMFREE; - /* Fixup numfreebuffers count. */ - if ((bp->b_flags & B_INVAL) || (bp->b_flags & B_DELWRI) == 0) { - KASSERT((bp->b_flags & B_INFREECNT) != 0, - ("buf %p not counted in numfreebuffers", bp)); - bp->b_flags &= ~B_INFREECNT; - old = atomic_fetchadd_int(&numfreebuffers, -1); - KASSERT(old > 0, ("numfreebuffers dropped to %d", old - 1)); - } + bufcountsub(bp); } /* @@ -868,28 +1011,30 @@ bremfree(struct buf *bp) void bremfreef(struct buf *bp) { - mtx_lock(&bqlock); + struct mtx *qlock; + + qlock = bqlock(bp->b_qindex); + mtx_lock(qlock); bremfreel(bp); - mtx_unlock(&bqlock); + mtx_unlock(qlock); } /* * bremfreel: * * Removes a buffer from the free list, must be called with the - * bqlock held. + * correct qlock held. */ static void bremfreel(struct buf *bp) { - int old; CTR3(KTR_BUF, "bremfreel(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(bp->b_qindex != QUEUE_NONE, ("bremfreel: buffer %p not on a queue.", bp)); BUF_ASSERT_XLOCKED(bp); - mtx_assert(&bqlock, MA_OWNED); + mtx_assert(bqlock(bp->b_qindex), MA_OWNED); TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist); #ifdef INVARIANTS @@ -906,18 +1051,7 @@ bremfreel(struct buf *bp) bp->b_flags &= ~B_REMFREE; return; } - /* - * Fixup numfreebuffers count. If the buffer is invalid or not - * delayed-write, the buffer was free and we must decrement - * numfreebuffers. 
- */ - if ((bp->b_flags & B_INVAL) || (bp->b_flags & B_DELWRI) == 0) { - KASSERT((bp->b_flags & B_INFREECNT) != 0, - ("buf %p not counted in numfreebuffers", bp)); - bp->b_flags &= ~B_INFREECNT; - old = atomic_fetchadd_int(&numfreebuffers, -1); - KASSERT(old > 0, ("numfreebuffers dropped to %d", old - 1)); - } + bufcountsub(bp); } /* @@ -1018,6 +1152,7 @@ bufwrite(struct buf *bp) { int oldflags; struct vnode *vp; + long space; int vp_md; CTR3(KTR_BUF, "bufwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); @@ -1065,7 +1200,7 @@ bufwrite(struct buf *bp) * Normal bwrites pipeline writes */ bp->b_runningbufspace = bp->b_bufsize; - atomic_add_long(&runningbufspace, bp->b_runningbufspace); + space = atomic_fetchadd_long(&runningbufspace, bp->b_runningbufspace); if (!TD_IS_IDLETHREAD(curthread)) curthread->td_ru.ru_oublock++; @@ -1078,7 +1213,7 @@ bufwrite(struct buf *bp) int rtval = bufwait(bp); brelse(bp); return (rtval); - } else { + } else if (space > hirunningspace) { /* * don't allow the async write to saturate the I/O * system. We will not deadlock here because @@ -1213,13 +1348,6 @@ bdwrite(struct buf *bp) bqrelse(bp); /* - * Wakeup the buffer flushing daemon if we have a lot of dirty - * buffers (midpoint between our recovery point and our stall - * point). - */ - bd_wakeup((lodirtybuffers + hidirtybuffers) / 2); - - /* * note: we cannot initiate I/O from a bdwrite even if we wanted to, * due to the softdep code. */ @@ -1259,8 +1387,7 @@ bdirty(struct buf *bp) if ((bp->b_flags & B_DELWRI) == 0) { bp->b_flags |= /* XXX B_DONE | */ B_DELWRI; reassignbuf(bp); - atomic_add_int(&numdirtybuffers, 1); - bd_wakeup((lodirtybuffers + hidirtybuffers) / 2); + bdirtyadd(); } } @@ -1288,8 +1415,7 @@ bundirty(struct buf *bp) if (bp->b_flags & B_DELWRI) { bp->b_flags &= ~B_DELWRI; reassignbuf(bp); - atomic_subtract_int(&numdirtybuffers, 1); - numdirtywakeup(lodirtybuffers); + bdirtysub(); } /* * Since it is now being written, we can clear its deferred write flag. 
@@ -1357,20 +1483,18 @@ bbarrierwrite(struct buf *bp) * of any vnodes we attempt to avoid the situation where a locked vnode * prevents the various system daemons from flushing related buffers. */ - void bwillwrite(void) { if (numdirtybuffers >= hidirtybuffers) { - mtx_lock(&nblock); + mtx_lock(&bdirtylock); while (numdirtybuffers >= hidirtybuffers) { - bd_wakeup(1); - needsbuffer |= VFS_BIO_NEED_DIRTYFLUSH; - msleep(&needsbuffer, &nblock, - (PRIBIO + 4), "flswai", 0); + bdirtywait = 1; + msleep(&bdirtywait, &bdirtylock, (PRIBIO + 4), + "flswai", 0); } - mtx_unlock(&nblock); + mtx_unlock(&bdirtylock); } } @@ -1403,6 +1527,8 @@ buf_vm_page_count_severe(void) void brelse(struct buf *bp) { + int qindex; + CTR3(KTR_BUF, "brelse(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), @@ -1441,10 +1567,8 @@ brelse(struct buf *bp) bp->b_flags |= B_INVAL; if (!LIST_EMPTY(&bp->b_dep)) buf_deallocate(bp); - if (bp->b_flags & B_DELWRI) { - atomic_subtract_int(&numdirtybuffers, 1); - numdirtywakeup(lodirtybuffers); - } + if (bp->b_flags & B_DELWRI) + bdirtysub(); bp->b_flags &= ~(B_DELWRI | B_CACHE); if ((bp->b_flags & B_VMIO) == 0) { if (bp->b_bufsize) @@ -1591,15 +1715,6 @@ brelse(struct buf *bp) brelvp(bp); } - /* enqueue */ - mtx_lock(&bqlock); - /* Handle delayed bremfree() processing. 
*/ - if (bp->b_flags & B_REMFREE) - bremfreel(bp); - - if (bp->b_qindex != QUEUE_NONE) - panic("brelse: free buffer onto another queue???"); - /* * If the buffer has junk contents signal it and eventually * clean up B_DELWRI and diassociate the vnode so that gbincore() @@ -1620,54 +1735,26 @@ brelse(struct buf *bp) bp->b_xflags &= ~(BX_BKGRDWRITE | BX_ALTDATA); if (bp->b_vflags & BV_BKGRDINPROG) panic("losing buffer 1"); - if (bp->b_kvasize) { - bp->b_qindex = QUEUE_EMPTYKVA; - } else { - bp->b_qindex = QUEUE_EMPTY; - } - TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist); + if (bp->b_kvasize) + qindex = QUEUE_EMPTYKVA; + else + qindex = QUEUE_EMPTY; + bp->b_flags |= B_AGE; /* buffers with junk contents */ } else if (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF) || (bp->b_ioflags & BIO_ERROR)) { bp->b_xflags &= ~(BX_BKGRDWRITE | BX_ALTDATA); if (bp->b_vflags & BV_BKGRDINPROG) panic("losing buffer 2"); - bp->b_qindex = QUEUE_CLEAN; - TAILQ_INSERT_HEAD(&bufqueues[QUEUE_CLEAN], bp, b_freelist); + qindex = QUEUE_CLEAN; + bp->b_flags |= B_AGE; /* remaining buffers */ - } else { - if (bp->b_flags & B_DELWRI) - bp->b_qindex = QUEUE_DIRTY; - else - bp->b_qindex = QUEUE_CLEAN; - if (bp->b_flags & B_AGE) { - TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, - b_freelist); - } else { - TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp, - b_freelist); - } - } -#ifdef INVARIANTS - bq_len[bp->b_qindex]++; -#endif - mtx_unlock(&bqlock); - - /* - * Fixup numfreebuffers count. The bp is on an appropriate queue - * unless locked. We then bump numfreebuffers if it is not B_DELWRI. - * We've already handled the B_INVAL case ( B_DELWRI will be clear - * if B_INVAL is set ). 
- */ - - if (!(bp->b_flags & B_DELWRI)) - bufcountwakeup(bp); + } else if (bp->b_flags & B_DELWRI) + qindex = QUEUE_DIRTY; + else + qindex = QUEUE_CLEAN; - /* - * Something we can maybe free or reuse - */ - if (bp->b_bufsize || bp->b_kvasize) - bufspacewakeup(); + binsfree(bp, qindex); bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF | B_DIRECT); if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY)) @@ -1690,7 +1777,7 @@ brelse(struct buf *bp) void bqrelse(struct buf *bp) { - struct bufobj *bo; + int qindex; CTR3(KTR_BUF, "bqrelse(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), @@ -1701,71 +1788,40 @@ bqrelse(struct buf *bp) BUF_UNLOCK(bp); return; } + bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF); - bo = bp->b_bufobj; if (bp->b_flags & B_MANAGED) { - if (bp->b_flags & B_REMFREE) { - mtx_lock(&bqlock); - bremfreel(bp); - mtx_unlock(&bqlock); - } - bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF); - BUF_UNLOCK(bp); - return; + if (bp->b_flags & B_REMFREE) + bremfreef(bp); + goto out; } - mtx_lock(&bqlock); - /* Handle delayed bremfree() processing. */ - if (bp->b_flags & B_REMFREE) - bremfreel(bp); - - if (bp->b_qindex != QUEUE_NONE) - panic("bqrelse: free buffer onto another queue???"); /* buffers with stale but valid contents */ if (bp->b_flags & B_DELWRI) { - bp->b_qindex = QUEUE_DIRTY; - TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp, b_freelist); -#ifdef INVARIANTS - bq_len[bp->b_qindex]++; -#endif + qindex = QUEUE_DIRTY; } else { + if ((bp->b_flags & B_DELWRI) == 0 && + (bp->b_xflags & BX_VNDIRTY)) + panic("bqrelse: not dirty"); /* * BKGRDINPROG can only be set with the buf and bufobj * locks both held. We tolerate a race to clear it here. 
*/ - if (!buf_vm_page_count_severe() || - (bp->b_vflags & BV_BKGRDINPROG)) { - bp->b_qindex = QUEUE_CLEAN; - TAILQ_INSERT_TAIL(&bufqueues[QUEUE_CLEAN], bp, - b_freelist); -#ifdef INVARIANTS - bq_len[QUEUE_CLEAN]++; -#endif - } else { + if (buf_vm_page_count_severe() && + (bp->b_vflags & BV_BKGRDINPROG) == 0) { /* * We are too low on memory, we have to try to free * the buffer (most importantly: the wired pages * making up its backing store) *now*. */ - mtx_unlock(&bqlock); brelse(bp); return; } + qindex = QUEUE_CLEAN; } - mtx_unlock(&bqlock); - - if ((bp->b_flags & B_INVAL) || !(bp->b_flags & B_DELWRI)) - bufcountwakeup(bp); + binsfree(bp, qindex); - /* - * Something we can maybe free or reuse. - */ - if (bp->b_bufsize && !(bp->b_flags & B_DELWRI)) - bufspacewakeup(); - - bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF); - if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY)) - panic("bqrelse: not dirty"); +out: /* unlock */ BUF_UNLOCK(bp); } @@ -2000,7 +2056,7 @@ getnewbuf_bufd_help(struct vnode *vp, in char *waitmsg; int fl, flags, norunbuf; - mtx_assert(&bqlock, MA_OWNED); + mtx_assert(&bqclean, MA_OWNED); if (defrag) { flags = VFS_BIO_NEED_BUFSPACE; @@ -2015,7 +2071,7 @@ getnewbuf_bufd_help(struct vnode *vp, in mtx_lock(&nblock); needsbuffer |= flags; mtx_unlock(&nblock); - mtx_unlock(&bqlock); + mtx_unlock(&bqclean); bd_speedup(); /* heeeelp */ if ((gbflags & GB_NOWAIT_BD) != 0) @@ -2038,7 +2094,7 @@ getnewbuf_bufd_help(struct vnode *vp, in (td->td_pflags & TDP_NORUNNINGBUF); /* play bufdaemon */ td->td_pflags |= TDP_BUFNEED | TDP_NORUNNINGBUF; - fl = buf_do_flush(vp); + fl = buf_flush(vp, flushbufqtarget); td->td_pflags &= norunbuf; mtx_lock(&nblock); if (fl != 0) @@ -2060,7 +2116,7 @@ getnewbuf_reuse_bp(struct buf *bp, int q CTR6(KTR_BUF, "getnewbuf(%p) vp %p flags %X kvasize %d bufsize %d " "queue %d (recycling)", bp, bp->b_vp, bp->b_flags, bp->b_kvasize, bp->b_bufsize, qindex); - mtx_assert(&bqlock, MA_NOTOWNED); + mtx_assert(&bqclean, 
MA_NOTOWNED); /* * Note: we no longer distinguish between VMIO and non-VMIO @@ -2156,7 +2212,7 @@ restart: * where we cannot backup. */ nbp = NULL; - mtx_lock(&bqlock); + mtx_lock(&bqclean); if (!defrag && unmapped) { nqindex = QUEUE_EMPTY; nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]); @@ -2267,14 +2323,14 @@ restart: ("getnewbuf: inconsistent queue %d bp %p", qindex, bp)); bremfreel(bp); - mtx_unlock(&bqlock); + mtx_unlock(&bqclean); /* * NOTE: nbp is now entirely invalid. We can only restart * the scan from this point on. */ getnewbuf_reuse_bp(bp, qindex); - mtx_assert(&bqlock, MA_NOTOWNED); + mtx_assert(&bqclean, MA_NOTOWNED); /* * If we are defragging then free the buffer. @@ -2335,10 +2391,6 @@ restart: * We have insufficient buffer space * buffer_map is too fragmented ( space reservation fails ) * If we have to flush dirty buffers ( but we try to avoid this ) - * - * To avoid VFS layer recursion we do not flush dirty buffers ourselves. - * Instead we ask the buf daemon to do it for us. We attempt to - * avoid piecemeal wakeups of the pageout daemon. */ static struct buf * getnewbuf(struct vnode *vp, int slpflag, int slptimeo, int size, int maxsize, @@ -2379,11 +2431,11 @@ restart: * Generally we are sleeping due to insufficient buffer space. */ if (bp == NULL) { - mtx_assert(&bqlock, MA_OWNED); + mtx_assert(&bqclean, MA_OWNED); getnewbuf_bufd_help(vp, gbflags, slpflag, slptimeo, defrag); - mtx_assert(&bqlock, MA_NOTOWNED); + mtx_assert(&bqclean, MA_NOTOWNED); } else if ((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) == GB_UNMAPPED) { - mtx_assert(&bqlock, MA_NOTOWNED); + mtx_assert(&bqclean, MA_NOTOWNED); bfreekva(bp); bp->b_flags |= B_UNMAPPED; @@ -2393,7 +2445,7 @@ restart: atomic_add_long(&unmapped_bufspace, bp->b_kvasize); atomic_add_int(&bufreusecnt, 1); } else { - mtx_assert(&bqlock, MA_NOTOWNED); + mtx_assert(&bqclean, MA_NOTOWNED); /* * We finally have a valid bp. 
We aren't quite out of the @@ -2464,18 +2516,20 @@ static struct kproc_desc buf_kp = { SYSINIT(bufdaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start, &buf_kp); static int -buf_do_flush(struct vnode *vp) +buf_flush(struct vnode *vp, int target) { int flushed; - flushed = flushbufqueues(vp, QUEUE_DIRTY, 0); + flushed = flushbufqueues(vp, target, 0); if (flushed == 0) { /* * Could not find any buffers without rollback * dependencies, so just write the first one * in the hopes of eventually making progress. */ - flushbufqueues(vp, QUEUE_DIRTY, 1); + if (vp != NULL && target > 2) + target /= 2; + flushbufqueues(vp, target, 1); } return (flushed); } @@ -2483,7 +2537,7 @@ buf_do_flush(struct vnode *vp) static void buf_daemon() { - int lodirtysave; + int lodirty; /* * This process needs to be suspended prior to shutdown sync. @@ -2501,23 +2555,21 @@ buf_daemon() mtx_unlock(&bdlock); kproc_suspend_check(bufdaemonproc); - lodirtysave = lodirtybuffers; + lodirty = lodirtybuffers; if (bd_speedupreq) { - lodirtybuffers = numdirtybuffers / 2; + lodirty = numdirtybuffers / 2; bd_speedupreq = 0; } /* * Do the flush. Limit the amount of in-transit I/O we * allow to build up, otherwise we would completely saturate - * the I/O system. Wakeup any waiting processes before we - * normally would so they can run in parallel with our drain. + * the I/O system. */ - while (numdirtybuffers > lodirtybuffers) { - if (buf_do_flush(NULL) == 0) + while (numdirtybuffers > lodirty) { + if (buf_flush(NULL, numdirtybuffers - lodirty) == 0) break; kern_yield(PRI_USER); } - lodirtybuffers = lodirtysave; /* * Only clear bd_request if we have reached our low water @@ -2526,8 +2578,8 @@ buf_daemon() * built up, within reason. * * If we were unable to hit our low water mark and couldn't - * find any flushable buffers, we sleep half a second. - * Otherwise we loop immediately. + * find any flushable buffers, we sleep for a short period + * to avoid endless loops on unlockable buffers. 
*/ mtx_lock(&bdlock); if (numdirtybuffers <= lodirtybuffers) { @@ -2537,6 +2589,14 @@ buf_daemon() * The sleep is just so the suspend code works. */ bd_request = 0; + /* + * Do an extra wakeup in case dirty threshold + * changed via sysctl and the explicit transition + * out of shortfall was missed. + */ + bdirtywakeup(); + if (runningbufspace <= lorunningspace) + runningwakeup(); msleep(&bd_request, &bdlock, PVM, "psleep", hz); } else { /* @@ -2561,7 +2621,7 @@ SYSCTL_INT(_vfs, OID_AUTO, flushwithdeps 0, "Number of buffers flushed with dependecies that require rollbacks"); static int -flushbufqueues(struct vnode *lvp, int queue, int flushdeps) +flushbufqueues(struct vnode *lvp, int target, int flushdeps) { struct buf *sentinel; struct vnode *vp; @@ -2569,19 +2629,14 @@ flushbufqueues(struct vnode *lvp, int qu struct buf *bp; int hasdeps; int flushed; - int target; + int queue; - if (lvp == NULL) { - target = numdirtybuffers - lodirtybuffers; - if (flushdeps && target > 2) - target /= 2; - } else - target = flushbufqtarget; flushed = 0; + queue = QUEUE_DIRTY; bp = NULL; sentinel = malloc(sizeof(struct buf), M_TEMP, M_WAITOK | M_ZERO); sentinel->b_qindex = QUEUE_SENTINEL; - mtx_lock(&bqlock); + mtx_lock(&bqdirty); TAILQ_INSERT_HEAD(&bufqueues[queue], sentinel, b_freelist); while (flushed != target) { bp = TAILQ_NEXT(sentinel, b_freelist); @@ -2620,11 +2675,10 @@ flushbufqueues(struct vnode *lvp, int qu } if (bp->b_flags & B_INVAL) { *** DIFF OUTPUT TRUNCATED AT 1000 LINES ***