From owner-svn-src-all@freebsd.org Wed Oct 14 02:10:08 2015 Return-Path: Delivered-To: svn-src-all@mailman.ysv.freebsd.org Received: from mx1.freebsd.org (mx1.freebsd.org [IPv6:2001:1900:2254:206a::19:1]) by mailman.ysv.freebsd.org (Postfix) with ESMTP id B0C8EA127F1; Wed, 14 Oct 2015 02:10:08 +0000 (UTC) (envelope-from jeff@FreeBSD.org) Received: from repo.freebsd.org (repo.freebsd.org [IPv6:2610:1c1:1:6068::e6a:0]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (Client did not present a certificate) by mx1.freebsd.org (Postfix) with ESMTPS id 6EBD81AC2; Wed, 14 Oct 2015 02:10:08 +0000 (UTC) (envelope-from jeff@FreeBSD.org) Received: from repo.freebsd.org ([127.0.1.37]) by repo.freebsd.org (8.15.2/8.15.2) with ESMTP id t9E2A7dP056597; Wed, 14 Oct 2015 02:10:07 GMT (envelope-from jeff@FreeBSD.org) Received: (from jeff@localhost) by repo.freebsd.org (8.15.2/8.15.2/Submit) id t9E2A79H056595; Wed, 14 Oct 2015 02:10:07 GMT (envelope-from jeff@FreeBSD.org) Message-Id: <201510140210.t9E2A79H056595@repo.freebsd.org> X-Authentication-Warning: repo.freebsd.org: jeff set sender to jeff@FreeBSD.org using -f From: Jeff Roberson Date: Wed, 14 Oct 2015 02:10:07 +0000 (UTC) To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org Subject: svn commit: r289279 - in head/sys: kern vm X-SVN-Group: head MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit X-BeenThere: svn-src-all@freebsd.org X-Mailman-Version: 2.1.20 Precedence: list List-Id: "SVN commit messages for the entire src tree \(except for " user" and " projects" \)" List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Wed, 14 Oct 2015 02:10:08 -0000 Author: jeff Date: Wed Oct 14 02:10:07 2015 New Revision: 289279 URL: https://svnweb.freebsd.org/changeset/base/289279 Log: Parallelize the buffer cache and rewrite getnewbuf(). 
This results in an 8x performance improvement in a micro benchmark on a 4 socket machine. - Get buffer headers from a per-cpu uma cache that sits in front of the free queue. - Use a per-cpu quantum cache in vmem to eliminate contention for kva. - Use multiple clean queues according to buffer cache size to eliminate clean queue lock contention. - Introduce a bufspace daemon that attempts to prevent getnewbuf() callers from blocking or doing direct recycling. - Close some bufspace allocation races that could lead to endless recycling. - Further the transition to a more modern style of small functions grouped by prefix in order to manage growing complexity. Sponsored by: EMC / Isilon Reviewed by: kib Tested by: pho Modified: head/sys/kern/vfs_bio.c head/sys/vm/vm_init.c Modified: head/sys/kern/vfs_bio.c ============================================================================== --- head/sys/kern/vfs_bio.c Wed Oct 14 00:43:29 2015 (r289278) +++ head/sys/kern/vfs_bio.c Wed Oct 14 02:10:07 2015 (r289279) @@ -63,6 +63,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include @@ -100,6 +101,7 @@ caddr_t unmapped_buf; /* Used below and for softdep flushing threads in ufs/ffs/ffs_softdep.c */ struct proc *bufdaemonproc; +struct proc *bufspacedaemonproc; static int inmem(struct vnode *vp, daddr_t blkno); static void vm_hold_free_pages(struct buf *bp, int newbsize); @@ -116,11 +118,18 @@ static void vfs_vmio_extend(struct buf * static int vfs_bio_clcheck(struct vnode *vp, int size, daddr_t lblkno, daddr_t blkno); static int buf_flush(struct vnode *vp, int); +static int buf_recycle(bool); +static int buf_scan(bool); static int flushbufqueues(struct vnode *, int, int); static void buf_daemon(void); static void bremfreel(struct buf *bp); static __inline void bd_wakeup(void); static int sysctl_runningspace(SYSCTL_HANDLER_ARGS); +static void bufkva_reclaim(vmem_t *, int); +static void bufkva_free(struct buf *); +static int buf_import(void *, void 
**, int, int); +static void buf_release(void *, void **, int); + #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) static int sysctl_bufspace(SYSCTL_HANDLER_ARGS); @@ -145,23 +154,23 @@ static long bufkvaspace; SYSCTL_LONG(_vfs, OID_AUTO, bufkvaspace, CTLFLAG_RD, &bufkvaspace, 0, "Kernel virtual memory used for buffers"); static long maxbufspace; -SYSCTL_LONG(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RD, &maxbufspace, 0, - "Maximum allowed value of bufspace (including buf_daemon)"); +SYSCTL_LONG(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RW, &maxbufspace, 0, + "Maximum allowed value of bufspace (including metadata)"); static long bufmallocspace; SYSCTL_LONG(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD, &bufmallocspace, 0, "Amount of malloced memory for buffers"); static long maxbufmallocspace; -SYSCTL_LONG(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW, &maxbufmallocspace, 0, - "Maximum amount of malloced memory for buffers"); +SYSCTL_LONG(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW, &maxbufmallocspace, + 0, "Maximum amount of malloced memory for buffers"); static long lobufspace; -SYSCTL_LONG(_vfs, OID_AUTO, lobufspace, CTLFLAG_RD, &lobufspace, 0, +SYSCTL_LONG(_vfs, OID_AUTO, lobufspace, CTLFLAG_RW, &lobufspace, 0, "Minimum amount of buffers we want to have"); long hibufspace; -SYSCTL_LONG(_vfs, OID_AUTO, hibufspace, CTLFLAG_RD, &hibufspace, 0, - "Maximum allowed value of bufspace (excluding buf_daemon)"); -static int bufreusecnt; -SYSCTL_INT(_vfs, OID_AUTO, bufreusecnt, CTLFLAG_RW, &bufreusecnt, 0, - "Number of times we have reused a buffer"); +SYSCTL_LONG(_vfs, OID_AUTO, hibufspace, CTLFLAG_RW, &hibufspace, 0, + "Maximum allowed value of bufspace (excluding metadata)"); +long bufspacethresh; +SYSCTL_LONG(_vfs, OID_AUTO, bufspacethresh, CTLFLAG_RW, &bufspacethresh, + 0, "Bufspace consumed before waking the daemon to free some"); static int buffreekvacnt; SYSCTL_INT(_vfs, OID_AUTO, buffreekvacnt, CTLFLAG_RW, 
&buffreekvacnt, 0, "Number of times we have freed the KVA space from some buffer"); @@ -205,10 +214,10 @@ SYSCTL_INT(_vfs, OID_AUTO, numfreebuffer "Number of free buffers"); static int lofreebuffers; SYSCTL_INT(_vfs, OID_AUTO, lofreebuffers, CTLFLAG_RW, &lofreebuffers, 0, - "XXX Unused"); + "Target number of free buffers"); static int hifreebuffers; SYSCTL_INT(_vfs, OID_AUTO, hifreebuffers, CTLFLAG_RW, &hifreebuffers, 0, - "XXX Complicatedly unused"); + "Threshold for clean buffer recycling"); static int getnewbufcalls; SYSCTL_INT(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RW, &getnewbufcalls, 0, "Number of calls to getnewbuf"); @@ -219,6 +228,9 @@ static int mappingrestarts; SYSCTL_INT(_vfs, OID_AUTO, mappingrestarts, CTLFLAG_RW, &mappingrestarts, 0, "Number of times getblk has had to restart a buffer mapping for " "unmapped buffer"); +static int numbufallocfails; +SYSCTL_INT(_vfs, OID_AUTO, numbufallocfails, CTLFLAG_RW, &numbufallocfails, 0, + "Number of times buffer allocations failed"); static int flushbufqtarget = 100; SYSCTL_INT(_vfs, OID_AUTO, flushbufqtarget, CTLFLAG_RW, &flushbufqtarget, 0, "Amount of work to do in flushbufqueues when helping bufdaemon"); @@ -233,16 +245,6 @@ SYSCTL_INT(_vfs, OID_AUTO, unmapped_buf_ "Permit the use of the unmapped i/o"); /* - * Lock for the non-dirty bufqueues - */ -static struct mtx_padalign bqclean; - -/* - * Lock for the dirty queue. - */ -static struct mtx_padalign bqdirty; - -/* * This lock synchronizes access to bd_request. */ static struct mtx_padalign bdlock; @@ -271,6 +273,11 @@ static struct mtx_padalign bdirtylock; static int bd_request; /* + * Request/wakeup point for the bufspace daemon. + */ +static int bufspace_request; + +/* * Request for the buf daemon to write more buffers than is indicated by * lodirtybuf. 
This may be necessary to push out excess dependencies or * defragment the address space where a simple count of the number of dirty @@ -298,7 +305,7 @@ static int runningbufreq; * Synchronization (sleep/wakeup) variable for buffer requests. * Can contain the VFS_BIO_NEED flags defined below; setting/clearing is done * by and/or. - * Used in numdirtywakeup(), bufspacewakeup(), bufcountadd(), bwillwrite(), + * Used in numdirtywakeup(), bufspace_wakeup(), bwillwrite(), * getnewbuf(), and getblk(). */ static volatile int needsbuffer; @@ -311,14 +318,21 @@ static int bdirtywait; /* * Definitions for the buffer free lists. */ -#define BUFFER_QUEUES 4 /* number of free buffer queues */ - #define QUEUE_NONE 0 /* on no queue */ -#define QUEUE_CLEAN 1 /* non-B_DELWRI buffers */ +#define QUEUE_EMPTY 1 /* empty buffer headers */ #define QUEUE_DIRTY 2 /* B_DELWRI buffers */ -#define QUEUE_EMPTY 3 /* empty buffer headers */ +#define QUEUE_CLEAN 3 /* non-B_DELWRI buffers */ #define QUEUE_SENTINEL 1024 /* not an queue index, but mark for sentinel */ +/* Maximum number of clean buffer queues. */ +#define CLEAN_QUEUES 16 + +/* Configured number of clean queues. */ +static int clean_queues; + +/* Maximum number of buffer queues. */ +#define BUFFER_QUEUES (QUEUE_CLEAN + CLEAN_QUEUES) + /* Queues for free buffers with various properties */ static TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES] = { { 0 } }; #ifdef INVARIANTS @@ -326,15 +340,21 @@ static int bq_len[BUFFER_QUEUES]; #endif /* + * Lock for each bufqueue + */ +static struct mtx_padalign bqlocks[BUFFER_QUEUES]; + +/* + * per-cpu empty buffer cache. + */ +uma_zone_t buf_zone; + +/* * Single global constant for BUF_WMESG, to avoid getting multiple references. * buf_wmesg is referred from macros. 
*/ const char *buf_wmesg = BUF_WMESG; -#define VFS_BIO_NEED_ANY 0x01 /* any freeable buffer */ -#define VFS_BIO_NEED_FREE 0x04 /* wait for free bufs, hi hysteresis */ -#define VFS_BIO_NEED_BUFSPACE 0x08 /* wait for buf space, lo hysteresis */ - static int sysctl_runningspace(SYSCTL_HANDLER_ARGS) { @@ -382,6 +402,21 @@ sysctl_bufspace(SYSCTL_HANDLER_ARGS) } #endif +static int +bqcleanq(void) +{ + static int nextq; + + return ((atomic_fetchadd_int(&nextq, 1) % clean_queues) + QUEUE_CLEAN); +} + +static int +bqisclean(int qindex) +{ + + return (qindex >= QUEUE_CLEAN && qindex < QUEUE_CLEAN + CLEAN_QUEUES); +} + /* * bqlock: * @@ -391,9 +426,7 @@ static inline struct mtx * bqlock(int qindex) { - if (qindex == QUEUE_DIRTY) - return (struct mtx *)(&bqdirty); - return (struct mtx *)(&bqclean); + return (struct mtx *)&bqlocks[qindex]; } /* @@ -447,62 +480,255 @@ bdirtyadd(void) } /* - * bufspacewakeup: + * bufspace_wakeup: * * Called when buffer space is potentially available for recovery. * getnewbuf() will block on this flag when it is unable to free * sufficient buffer space. Buffer space becomes recoverable when * bp's get placed back in the queues. */ -static __inline void -bufspacewakeup(void) +static void +bufspace_wakeup(void) { - int need_wakeup, on; /* - * If someone is waiting for bufspace, wake them up. Even - * though we may not have freed the kva space yet, the waiting - * process will be able to now. + * If someone is waiting for bufspace, wake them up. + * + * Since needsbuffer is set prior to doing an additional queue + * scan it is safe to check for the flag prior to acquiring the + * lock. The thread that is preparing to scan again before + * blocking would discover the buf we released. */ + if (needsbuffer) { + rw_rlock(&nblock); + if (atomic_cmpset_int(&needsbuffer, 1, 0) == 1) + wakeup(__DEVOLATILE(void *, &needsbuffer)); + rw_runlock(&nblock); + } +} + +/* + * bufspace_daemonwakeup: + * + * Wakeup the daemon responsible for freeing clean bufs. 
+ */ +static void +bufspace_daemonwakeup(void) +{ rw_rlock(&nblock); - for (;;) { - need_wakeup = 0; - on = needsbuffer; - if ((on & VFS_BIO_NEED_BUFSPACE) == 0) - break; - need_wakeup = 1; - if (atomic_cmpset_rel_int(&needsbuffer, on, - on & ~VFS_BIO_NEED_BUFSPACE)) - break; + if (bufspace_request == 0) { + bufspace_request = 1; + wakeup(&bufspace_request); } - if (need_wakeup) - wakeup(__DEVOLATILE(void *, &needsbuffer)); rw_runlock(&nblock); } /* - * bufspaceadjust: + * bufspace_adjust: * * Adjust the reported bufspace for a KVA managed buffer, possibly * waking any waiters. */ static void -bufspaceadjust(struct buf *bp, int bufsize) +bufspace_adjust(struct buf *bp, int bufsize) { + long space; int diff; KASSERT((bp->b_flags & B_MALLOC) == 0, - ("bufspaceadjust: malloc buf %p", bp)); + ("bufspace_adjust: malloc buf %p", bp)); diff = bufsize - bp->b_bufsize; if (diff < 0) { atomic_subtract_long(&bufspace, -diff); - bufspacewakeup(); - } else - atomic_add_long(&bufspace, diff); + bufspace_wakeup(); + } else { + space = atomic_fetchadd_long(&bufspace, diff); + /* Wake up the daemon on the transition. */ + if (space < bufspacethresh && space + diff >= bufspacethresh) + bufspace_daemonwakeup(); + } bp->b_bufsize = bufsize; } /* + * bufspace_reserve: + * + * Reserve bufspace before calling allocbuf(). metadata has a + * different space limit than data. + */ +static int +bufspace_reserve(int size, bool metadata) +{ + long limit; + long space; + + if (metadata) + limit = maxbufspace; + else + limit = hibufspace; + do { + space = bufspace; + if (space + size > limit) + return (ENOSPC); + } while (atomic_cmpset_long(&bufspace, space, space + size) == 0); + + /* Wake up the daemon on the transition. */ + if (space < bufspacethresh && space + size >= bufspacethresh) + bufspace_daemonwakeup(); + + return (0); +} + +/* + * bufspace_release: + * + * Release reserved bufspace after bufspace_adjust() has consumed it. 
+ */ +static void +bufspace_release(int size) +{ + atomic_subtract_long(&bufspace, size); + bufspace_wakeup(); +} + +/* + * bufspace_wait: + * + * Wait for bufspace, acting as the buf daemon if a locked vnode is + * supplied. needsbuffer must be set in a safe fashion prior to + * polling for space. The operation must be re-tried on return. + */ +static void +bufspace_wait(struct vnode *vp, int gbflags, int slpflag, int slptimeo) +{ + struct thread *td; + int error, fl, norunbuf; + + if ((gbflags & GB_NOWAIT_BD) != 0) + return; + + td = curthread; + rw_wlock(&nblock); + while (needsbuffer != 0) { + if (vp != NULL && vp->v_type != VCHR && + (td->td_pflags & TDP_BUFNEED) == 0) { + rw_wunlock(&nblock); + /* + * getblk() is called with a vnode locked, and + * some majority of the dirty buffers may as + * well belong to the vnode. Flushing the + * buffers there would make a progress that + * cannot be achieved by the buf_daemon, that + * cannot lock the vnode. + */ + norunbuf = ~(TDP_BUFNEED | TDP_NORUNNINGBUF) | + (td->td_pflags & TDP_NORUNNINGBUF); + + /* + * Play bufdaemon. The getnewbuf() function + * may be called while the thread owns lock + * for another dirty buffer for the same + * vnode, which makes it impossible to use + * VOP_FSYNC() there, due to the buffer lock + * recursion. + */ + td->td_pflags |= TDP_BUFNEED | TDP_NORUNNINGBUF; + fl = buf_flush(vp, flushbufqtarget); + td->td_pflags &= norunbuf; + rw_wlock(&nblock); + if (fl != 0) + continue; + if (needsbuffer == 0) + break; + } + error = rw_sleep(__DEVOLATILE(void *, &needsbuffer), &nblock, + (PRIBIO + 4) | slpflag, "newbuf", slptimeo); + if (error != 0) + break; + } + rw_wunlock(&nblock); +} + + +/* + * bufspace_daemon: + * + * buffer space management daemon. Tries to maintain some marginal + * amount of free buffer space so that requesting processes neither + * block nor work to reclaim buffers. 
+ */ +static void +bufspace_daemon(void) +{ + for (;;) { + kproc_suspend_check(bufspacedaemonproc); + + /* + * Free buffers from the clean queue until we meet our + * targets. + * + * Theory of operation: The buffer cache is most efficient + * when some free buffer headers and space are always + * available to getnewbuf(). This daemon attempts to prevent + * the excessive blocking and synchronization associated + * with shortfall. It goes through three phases according + * demand: + * + * 1) The daemon wakes up voluntarily once per-second + * during idle periods when the counters are below + * the wakeup thresholds (bufspacethresh, lofreebuffers). + * + * 2) The daemon wakes up as we cross the thresholds + * ahead of any potential blocking. This may bounce + * slightly according to the rate of consumption and + * release. + * + * 3) The daemon and consumers are starved for working + * clean buffers. This is the 'bufspace' sleep below + * which will inefficiently trade bufs with bqrelse + * until we return to condition 2. + */ + while (bufspace > lobufspace || + numfreebuffers < hifreebuffers) { + if (buf_recycle(false) != 0) { + atomic_set_int(&needsbuffer, 1); + if (buf_recycle(false) != 0) { + rw_wlock(&nblock); + if (needsbuffer) + rw_sleep(__DEVOLATILE(void *, + &needsbuffer), &nblock, + PRIBIO|PDROP, "bufspace", + hz/10); + else + rw_wunlock(&nblock); + } + } + maybe_yield(); + } + + /* + * Re-check our limits under the exclusive nblock. 
+ */ + rw_wlock(&nblock); + if (bufspace < bufspacethresh && + numfreebuffers > lofreebuffers) { + bufspace_request = 0; + rw_sleep(&bufspace_request, &nblock, PRIBIO|PDROP, + "-", hz); + } else + rw_wunlock(&nblock); + } +} + +static struct kproc_desc bufspace_kp = { + "bufspacedaemon", + bufspace_daemon, + &bufspacedaemonproc +}; +SYSINIT(bufspacedaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start, + &bufspace_kp); + +/* * bufmallocadjust: * * Adjust the reported bufspace for a malloc managed buffer, possibly @@ -516,10 +742,9 @@ bufmallocadjust(struct buf *bp, int bufs KASSERT((bp->b_flags & B_MALLOC) != 0, ("bufmallocadjust: non-malloc buf %p", bp)); diff = bufsize - bp->b_bufsize; - if (diff < 0) { + if (diff < 0) atomic_subtract_long(&bufmallocspace, -diff); - bufspacewakeup(); - } else + else atomic_add_long(&bufmallocspace, diff); bp->b_bufsize = bufsize; } @@ -571,67 +796,6 @@ runningbufwakeup(struct buf *bp) } /* - * bufcountadd: - * - * Called when a buffer has been added to one of the free queues to - * account for the buffer and to wakeup anyone waiting for free buffers. - * This typically occurs when large amounts of metadata are being handled - * by the buffer cache ( else buffer space runs out first, usually ). 
- */ -static __inline void -bufcountadd(struct buf *bp) -{ - int mask, need_wakeup, old, on; - - KASSERT((bp->b_flags & B_INFREECNT) == 0, - ("buf %p already counted as free", bp)); - bp->b_flags |= B_INFREECNT; - old = atomic_fetchadd_int(&numfreebuffers, 1); - KASSERT(old >= 0 && old < nbuf, - ("numfreebuffers climbed to %d", old + 1)); - mask = VFS_BIO_NEED_ANY; - if (numfreebuffers >= hifreebuffers) - mask |= VFS_BIO_NEED_FREE; - rw_rlock(&nblock); - for (;;) { - need_wakeup = 0; - on = needsbuffer; - if (on == 0) - break; - need_wakeup = 1; - if (atomic_cmpset_rel_int(&needsbuffer, on, on & ~mask)) - break; - } - if (need_wakeup) - wakeup(__DEVOLATILE(void *, &needsbuffer)); - rw_runlock(&nblock); -} - -/* - * bufcountsub: - * - * Decrement the numfreebuffers count as needed. - */ -static void -bufcountsub(struct buf *bp) -{ - int old; - - /* - * Fixup numfreebuffers count. If the buffer is invalid or not - * delayed-write, the buffer was free and we must decrement - * numfreebuffers. 
- */ - if ((bp->b_flags & B_INVAL) || (bp->b_flags & B_DELWRI) == 0) { - KASSERT((bp->b_flags & B_INFREECNT) != 0, - ("buf %p not counted in numfreebuffers", bp)); - bp->b_flags &= ~B_INFREECNT; - old = atomic_fetchadd_int(&numfreebuffers, -1); - KASSERT(old > 0, ("numfreebuffers dropped to %d", old - 1)); - } -} - -/* * waitrunningbufspace() * * runningbufspace is a measure of the amount of I/O currently @@ -847,8 +1011,10 @@ bufinit(void) int i; CTASSERT(MAXBCACHEBUF >= MAXBSIZE); - mtx_init(&bqclean, "bufq clean lock", NULL, MTX_DEF); - mtx_init(&bqdirty, "bufq dirty lock", NULL, MTX_DEF); + mtx_init(&bqlocks[QUEUE_DIRTY], "bufq dirty lock", NULL, MTX_DEF); + mtx_init(&bqlocks[QUEUE_EMPTY], "bufq empty lock", NULL, MTX_DEF); + for (i = QUEUE_CLEAN; i < QUEUE_CLEAN + CLEAN_QUEUES; i++) + mtx_init(&bqlocks[i], "bufq clean lock", NULL, MTX_DEF); mtx_init(&rbreqlock, "runningbufspace lock", NULL, MTX_DEF); rw_init(&nblock, "needsbuffer lock"); mtx_init(&bdlock, "buffer daemon lock", NULL, MTX_DEF); @@ -864,7 +1030,7 @@ bufinit(void) for (i = 0; i < nbuf; i++) { bp = &buf[i]; bzero(bp, sizeof *bp); - bp->b_flags = B_INVAL | B_INFREECNT; + bp->b_flags = B_INVAL; bp->b_rcred = NOCRED; bp->b_wcred = NOCRED; bp->b_qindex = QUEUE_EMPTY; @@ -881,18 +1047,19 @@ bufinit(void) /* * maxbufspace is the absolute maximum amount of buffer space we are * allowed to reserve in KVM and in real terms. The absolute maximum - * is nominally used by buf_daemon. hibufspace is the nominal maximum - * used by most other processes. The differential is required to - * ensure that buf_daemon is able to run when other processes might - * be blocked waiting for buffer space. + * is nominally used by metadata. hibufspace is the nominal maximum + * used by most other requests. The differential is required to + * ensure that metadata deadlocks don't occur. * * maxbufspace is based on BKVASIZE. 
Allocating buffers larger then * this may result in KVM fragmentation which is not handled optimally - * by the system. + * by the system. XXX This is less true with vmem. We could use + * PAGE_SIZE. */ maxbufspace = (long)nbuf * BKVASIZE; hibufspace = lmax(3 * maxbufspace / 4, maxbufspace - MAXBCACHEBUF * 10); - lobufspace = hibufspace - MAXBCACHEBUF; + lobufspace = (hibufspace / 20) * 19; /* 95% */ + bufspacethresh = lobufspace + (hibufspace - lobufspace) / 2; /* * Note: The 16 MiB upper limit for hirunningspace was chosen @@ -906,44 +1073,61 @@ bufinit(void) 16 * 1024 * 1024), 1024 * 1024); lorunningspace = roundup((hirunningspace * 2) / 3, MAXBCACHEBUF); -/* - * Limit the amount of malloc memory since it is wired permanently into - * the kernel space. Even though this is accounted for in the buffer - * allocation, we don't want the malloced region to grow uncontrolled. - * The malloc scheme improves memory utilization significantly on average - * (small) directories. - */ + /* + * Limit the amount of malloc memory since it is wired permanently into + * the kernel space. Even though this is accounted for in the buffer + * allocation, we don't want the malloced region to grow uncontrolled. + * The malloc scheme improves memory utilization significantly on + * average (small) directories. + */ maxbufmallocspace = hibufspace / 20; -/* - * Reduce the chance of a deadlock occuring by limiting the number - * of delayed-write dirty buffers we allow to stack up. - */ + /* + * Reduce the chance of a deadlock occuring by limiting the number + * of delayed-write dirty buffers we allow to stack up. + */ hidirtybuffers = nbuf / 4 + 20; dirtybufthresh = hidirtybuffers * 9 / 10; numdirtybuffers = 0; -/* - * To support extreme low-memory systems, make sure hidirtybuffers cannot - * eat up all available buffer space. This occurs when our minimum cannot - * be met. We try to size hidirtybuffers to 3/4 our buffer space assuming - * BKVASIZE'd buffers. 
- */ + /* + * To support extreme low-memory systems, make sure hidirtybuffers + * cannot eat up all available buffer space. This occurs when our + * minimum cannot be met. We try to size hidirtybuffers to 3/4 our + * buffer space assuming BKVASIZE'd buffers. + */ while ((long)hidirtybuffers * BKVASIZE > 3 * hibufspace / 4) { hidirtybuffers >>= 1; } lodirtybuffers = hidirtybuffers / 2; -/* - * Try to keep the number of free buffers in the specified range, - * and give special processes (e.g. like buf_daemon) access to an - * emergency reserve. - */ - lofreebuffers = nbuf / 18 + 5; - hifreebuffers = 2 * lofreebuffers; + /* + * lofreebuffers should be sufficient to avoid stalling waiting on + * buf headers under heavy utilization. The bufs in per-cpu caches + * are counted as free but will be unavailable to threads executing + * on other cpus. + * + * hifreebuffers is the free target for the bufspace daemon. This + * should be set appropriately to limit work per-iteration. + */ + lofreebuffers = MIN((nbuf / 25) + (20 * mp_ncpus), 128 * mp_ncpus); + hifreebuffers = (3 * lofreebuffers) / 2; numfreebuffers = nbuf; bogus_page = vm_page_alloc(NULL, 0, VM_ALLOC_NOOBJ | VM_ALLOC_NORMAL | VM_ALLOC_WIRED); + + /* Setup the kva and free list allocators. */ + vmem_set_reclaim(buffer_arena, bufkva_reclaim); + buf_zone = uma_zcache_create("buf free cache", sizeof(struct buf), + NULL, NULL, NULL, NULL, buf_import, buf_release, NULL, 0); + + /* + * Size the clean queue according to the amount of buffer space. + * One queue per-256mb up to the max. More queues gives better + * concurrency but less accurate LRU. + */ + clean_queues = MIN(howmany(maxbufspace, 256*1024*1024), CLEAN_QUEUES); + } #ifdef INVARIANTS @@ -1129,10 +1313,25 @@ binsfree(struct buf *bp, int qindex) { struct mtx *olock, *nlock; - BUF_ASSERT_XLOCKED(bp); + if (qindex != QUEUE_EMPTY) { + BUF_ASSERT_XLOCKED(bp); + } + + /* + * Stick to the same clean queue for the lifetime of the buf to + * limit locking below. 
Otherwise pick ont sequentially. + */ + if (qindex == QUEUE_CLEAN) { + if (bqisclean(bp->b_qindex)) + qindex = bp->b_qindex; + else + qindex = bqcleanq(); + } + /* + * Handle delayed bremfree() processing. + */ nlock = bqlock(qindex); - /* Handle delayed bremfree() processing. */ if (bp->b_flags & B_REMFREE) { olock = bqlock(bp->b_qindex); mtx_lock(olock); @@ -1156,15 +1355,263 @@ binsfree(struct buf *bp, int qindex) bq_len[bp->b_qindex]++; #endif mtx_unlock(nlock); +} + +/* + * buf_free: + * + * Free a buffer to the buf zone once it no longer has valid contents. + */ +static void +buf_free(struct buf *bp) +{ + + if (bp->b_flags & B_REMFREE) + bremfreef(bp); + if (bp->b_vflags & BV_BKGRDINPROG) + panic("losing buffer 1"); + if (bp->b_rcred != NOCRED) { + crfree(bp->b_rcred); + bp->b_rcred = NOCRED; + } + if (bp->b_wcred != NOCRED) { + crfree(bp->b_wcred); + bp->b_wcred = NOCRED; + } + if (!LIST_EMPTY(&bp->b_dep)) + buf_deallocate(bp); + bufkva_free(bp); + BUF_UNLOCK(bp); + uma_zfree(buf_zone, bp); + atomic_add_int(&numfreebuffers, 1); + bufspace_wakeup(); +} + +/* + * buf_import: + * + * Import bufs into the uma cache from the buf list. The system still + * expects a static array of bufs and much of the synchronization + * around bufs assumes type stable storage. As a result, UMA is used + * only as a per-cpu cache of bufs still maintained on a global list. + */ +static int +buf_import(void *arg, void **store, int cnt, int flags) +{ + struct buf *bp; + int i; + + mtx_lock(&bqlocks[QUEUE_EMPTY]); + for (i = 0; i < cnt; i++) { + bp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]); + if (bp == NULL) + break; + bremfreel(bp); + store[i] = bp; + } + mtx_unlock(&bqlocks[QUEUE_EMPTY]); + + return (i); +} + +/* + * buf_release: + * + * Release bufs from the uma cache back to the buffer queues. 
+ */ +static void +buf_release(void *arg, void **store, int cnt) +{ + int i; + + for (i = 0; i < cnt; i++) + binsfree(store[i], QUEUE_EMPTY); +} + +/* + * buf_alloc: + * + * Allocate an empty buffer header. + */ +static struct buf * +buf_alloc(void) +{ + struct buf *bp; + + bp = uma_zalloc(buf_zone, M_NOWAIT); + if (bp == NULL) { + bufspace_daemonwakeup(); + atomic_add_int(&numbufallocfails, 1); + return (NULL); + } + + /* + * Wake-up the bufspace daemon on transition. + */ + if (atomic_fetchadd_int(&numfreebuffers, -1) == lofreebuffers) + bufspace_daemonwakeup(); + + if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) + panic("getnewbuf_empty: Locked buf %p on free queue.", bp); + + KASSERT(bp->b_vp == NULL, + ("bp: %p still has vnode %p.", bp, bp->b_vp)); + KASSERT((bp->b_flags & (B_DELWRI | B_NOREUSE)) == 0, + ("invalid buffer %p flags %#x", bp, bp->b_flags)); + KASSERT((bp->b_xflags & (BX_VNCLEAN|BX_VNDIRTY)) == 0, + ("bp: %p still on a buffer list. xflags %X", bp, bp->b_xflags)); + KASSERT(bp->b_npages == 0, + ("bp: %p still has %d vm pages\n", bp, bp->b_npages)); + KASSERT(bp->b_kvasize == 0, ("bp: %p still has kva\n", bp)); + KASSERT(bp->b_bufsize == 0, ("bp: %p still has bufspace\n", bp)); + + bp->b_flags = 0; + bp->b_ioflags = 0; + bp->b_xflags = 0; + bp->b_vflags = 0; + bp->b_vp = NULL; + bp->b_blkno = bp->b_lblkno = 0; + bp->b_offset = NOOFFSET; + bp->b_iodone = 0; + bp->b_error = 0; + bp->b_resid = 0; + bp->b_bcount = 0; + bp->b_npages = 0; + bp->b_dirtyoff = bp->b_dirtyend = 0; + bp->b_bufobj = NULL; + bp->b_pin_count = 0; + bp->b_data = bp->b_kvabase = unmapped_buf; + bp->b_fsprivate1 = NULL; + bp->b_fsprivate2 = NULL; + bp->b_fsprivate3 = NULL; + LIST_INIT(&bp->b_dep); + + return (bp); +} + +/* + * buf_qrecycle: + * + * Free a buffer from the given bufqueue. kva controls whether the + * freed buf must own some kva resources. This is used for + * defragmenting. 
+ */ +static int +buf_qrecycle(int qindex, bool kva) +{ + struct buf *bp, *nbp; + + if (kva) + atomic_add_int(&bufdefragcnt, 1); + nbp = NULL; + mtx_lock(&bqlocks[qindex]); + nbp = TAILQ_FIRST(&bufqueues[qindex]); + + /* + * Run scan, possibly freeing data and/or kva mappings on the fly + * depending. + */ + while ((bp = nbp) != NULL) { + /* + * Calculate next bp (we can only use it if we do not + * release the bqlock). + */ + nbp = TAILQ_NEXT(bp, b_freelist); + + /* + * If we are defragging then we need a buffer with + * some kva to reclaim. + */ + if (kva && bp->b_kvasize == 0) + continue; + + if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) + continue; + + /* + * Skip buffers with background writes in progress. + */ + if ((bp->b_vflags & BV_BKGRDINPROG) != 0) { + BUF_UNLOCK(bp); + continue; + } + + KASSERT(bp->b_qindex == qindex, + ("getnewbuf: inconsistent queue %d bp %p", qindex, bp)); + /* + * NOTE: nbp is now entirely invalid. We can only restart + * the scan from this point on. + */ + bremfreel(bp); + mtx_unlock(&bqlocks[qindex]); + + /* + * Requeue the background write buffer with error and + * restart the scan. + */ + if ((bp->b_vflags & BV_BKGRDERR) != 0) { + bqrelse(bp); + mtx_lock(&bqlocks[qindex]); + nbp = TAILQ_FIRST(&bufqueues[qindex]); + continue; + } + bp->b_flags |= B_INVAL; + brelse(bp); + return (0); + } + mtx_unlock(&bqlocks[qindex]); + + return (ENOBUFS); +} + +/* + * buf_recycle: + * + * Iterate through all clean queues until we find a buf to recycle or + * exhaust the search. + */ +static int +buf_recycle(bool kva) +{ + int qindex, first_qindex; + + qindex = first_qindex = bqcleanq(); + do { + if (buf_qrecycle(qindex, kva) == 0) + return (0); + if (++qindex == QUEUE_CLEAN + clean_queues) + qindex = QUEUE_CLEAN; + } while (qindex != first_qindex); + + return (ENOBUFS); +} + +/* + * buf_scan: + * + * Scan the clean queues looking for a buffer to recycle. 
needsbuffer + * is set on failure so that the caller may optionally bufspace_wait() + * in a race-free fashion. + */ +static int +buf_scan(bool defrag) +{ + int error; /* - * Something we can maybe free or reuse. - */ - if (bp->b_bufsize && !(bp->b_flags & B_DELWRI)) - bufspacewakeup(); - - if ((bp->b_flags & B_INVAL) || !(bp->b_flags & B_DELWRI)) - bufcountadd(bp); + * To avoid heavy synchronization and wakeup races we set + * needsbuffer and re-poll before failing. This ensures that + * no frees can be missed between an unsuccessful poll and + * going to sleep in a synchronized fashion. + */ *** DIFF OUTPUT TRUNCATED AT 1000 LINES ***