From owner-svn-src-head@freebsd.org Tue Feb 20 00:06:08 2018
Return-Path:
Delivered-To: svn-src-head@mailman.ysv.freebsd.org
Received: from mx1.freebsd.org (mx1.freebsd.org [IPv6:2610:1c1:1:606c::19:1])
 by mailman.ysv.freebsd.org (Postfix) with ESMTP id D13EFF0BD27;
 Tue, 20 Feb 2018 00:06:07 +0000 (UTC) (envelope-from jeff@FreeBSD.org)
Received: from mxrelay.nyi.freebsd.org (mxrelay.nyi.freebsd.org [IPv6:2610:1c1:1:606c::19:3])
 (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
 (Client CN "mxrelay.nyi.freebsd.org", Issuer "Let's Encrypt Authority X3" (verified OK))
 by mx1.freebsd.org (Postfix) with ESMTPS id 83DE773D5C;
 Tue, 20 Feb 2018 00:06:07 +0000 (UTC) (envelope-from jeff@FreeBSD.org)
Received: from repo.freebsd.org (repo.freebsd.org [IPv6:2610:1c1:1:6068::e6a:0])
 (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
 (Client did not present a certificate)
 by mxrelay.nyi.freebsd.org (Postfix) with ESMTPS id 7E88221897;
 Tue, 20 Feb 2018 00:06:07 +0000 (UTC) (envelope-from jeff@FreeBSD.org)
Received: from repo.freebsd.org ([127.0.1.37]) by repo.freebsd.org (8.15.2/8.15.2)
 with ESMTP id w1K067oB032313; Tue, 20 Feb 2018 00:06:07 GMT (envelope-from jeff@FreeBSD.org)
Received: (from jeff@localhost) by repo.freebsd.org (8.15.2/8.15.2/Submit)
 id w1K067ZL032311; Tue, 20 Feb 2018 00:06:07 GMT (envelope-from jeff@FreeBSD.org)
Message-Id: <201802200006.w1K067ZL032311@repo.freebsd.org>
X-Authentication-Warning: repo.freebsd.org: jeff set sender to jeff@FreeBSD.org using -f
From: Jeff Roberson
Date: Tue, 20 Feb 2018 00:06:07 +0000 (UTC)
To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject: svn commit: r329612 - in head/sys: kern sys
X-SVN-Group: head
X-SVN-Commit-Author: jeff
X-SVN-Commit-Paths: in head/sys: kern sys
X-SVN-Commit-Revision: 329612
X-SVN-Commit-Repository: base
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
X-BeenThere: svn-src-head@freebsd.org
X-Mailman-Version: 2.1.25
Precedence: list
List-Id: SVN commit messages for the src tree for head/-current
List-Unsubscribe: ,
List-Archive:
List-Post:
List-Help:
List-Subscribe: ,
X-List-Received-Date: Tue, 20 Feb 2018 00:06:08 -0000

Author: jeff
Date: Tue Feb 20 00:06:07 2018
New Revision: 329612
URL: https://svnweb.freebsd.org/changeset/base/329612

Log:
  Further parallelize the buffer cache.

  Provide multiple clean queues partitioned into 'domains'.  Each domain
  manages its own bufspace and has its own bufspace daemon.  Each domain
  has a set of subqueues indexed by the current cpuid to reduce lock
  contention on the cleanq.

  Refine the sleep/wakeup around the bufspace daemon to use atomics as
  much as possible.

  Add a B_REUSE flag that is used to requeue bufs during the scan to
  approximate LRU rather than locking the queue on every use of a
  frequently accessed buf.

  Implement bufspace_reserve with only atomic_fetchadd to avoid loop
  restarts.
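  The reservation fast path mentioned above amounts to an optimistic
  charge that is backed out on failure, so the common case needs no
  compare-and-set loop.  A condensed sketch of the bufspace_reserve()
  hunk in the diff below (field names approximate the patch):

	static int
	bufspace_reserve(struct bufdomain *bd, int size, bool metadata)
	{
		long limit, space, new;

		/* Metadata may consume space up to a higher limit than data. */
		limit = metadata ? bd->bd_maxbufspace : bd->bd_hibufspace;
		/* Optimistically charge the domain... */
		space = atomic_fetchadd_long(&bd->bd_bufspace, size);
		new = space + size;
		if (new > limit) {
			/* ...and back the charge out if we went over. */
			atomic_subtract_long(&bd->bd_bufspace, size);
			return (ENOSPC);
		}
		/* Wake the domain's bufspace daemon on the threshold crossing. */
		if (space < bd->bd_bufspacethresh && new >= bd->bd_bufspacethresh)
			bufspace_daemon_wakeup(bd);
		return (0);
	}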
Reviewed by: markj Tested by: pho Sponsored by: Netflix, Dell/EMC Isilon Differential Revision: https://reviews.freebsd.org/D14274 Modified: head/sys/kern/vfs_bio.c head/sys/kern/vfs_subr.c head/sys/sys/buf.h head/sys/sys/bufobj.h Modified: head/sys/kern/vfs_bio.c ============================================================================== --- head/sys/kern/vfs_bio.c Mon Feb 19 22:56:04 2018 (r329611) +++ head/sys/kern/vfs_bio.c Tue Feb 20 00:06:07 2018 (r329612) @@ -51,6 +51,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include @@ -105,7 +106,6 @@ caddr_t unmapped_buf; /* Used below and for softdep flushing threads in ufs/ffs/ffs_softdep.c */ struct proc *bufdaemonproc; -struct proc *bufspacedaemonproc; static int inmem(struct vnode *vp, daddr_t blkno); static void vm_hold_free_pages(struct buf *bp, int newbsize); @@ -124,11 +124,8 @@ static int vfs_bio_clcheck(struct vnode *vp, int size, static void breada(struct vnode *, daddr_t *, int *, int, struct ucred *, int, void (*)(struct buf *)); static int buf_flush(struct vnode *vp, int); -static int buf_recycle(bool); -static int buf_scan(bool); static int flushbufqueues(struct vnode *, int, int); static void buf_daemon(void); -static void bremfreel(struct buf *bp); static __inline void bd_wakeup(void); static int sysctl_runningspace(SYSCTL_HANDLER_ARGS); static void bufkva_reclaim(vmem_t *, int); @@ -137,28 +134,17 @@ static int buf_import(void *, void **, int, int, int); static void buf_release(void *, void **, int); static void maxbcachebuf_adjust(void); -#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ - defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) static int sysctl_bufspace(SYSCTL_HANDLER_ARGS); -#endif - int vmiodirenable = TRUE; SYSCTL_INT(_vfs, OID_AUTO, vmiodirenable, CTLFLAG_RW, &vmiodirenable, 0, "Use the VM system for directory writes"); long runningbufspace; SYSCTL_LONG(_vfs, OID_AUTO, runningbufspace, CTLFLAG_RD, &runningbufspace, 0, "Amount of presently outstanding async buffer io"); -static long bufspace; -#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ - defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) SYSCTL_PROC(_vfs, OID_AUTO, bufspace, CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RD, - &bufspace, 0, sysctl_bufspace, "L", "Virtual memory used for buffers"); -#else -SYSCTL_LONG(_vfs, OID_AUTO, bufspace, CTLFLAG_RD, &bufspace, 0, - "Physical memory used for buffers"); -#endif -static long bufkvaspace; -SYSCTL_LONG(_vfs, OID_AUTO, bufkvaspace, CTLFLAG_RD, &bufkvaspace, 0, + NULL, 0, sysctl_bufspace, "L", "Physical memory used for buffers"); +static counter_u64_t bufkvaspace; +SYSCTL_COUNTER_U64(_vfs, OID_AUTO, bufkvaspace, CTLFLAG_RD, &bufkvaspace, "Kernel virtual memory used for buffers"); static long maxbufspace; SYSCTL_LONG(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RW, &maxbufspace, 0, @@ -178,11 +164,11 @@ SYSCTL_LONG(_vfs, OID_AUTO, hibufspace, CTLFLAG_RW, &h long bufspacethresh; SYSCTL_LONG(_vfs, OID_AUTO, bufspacethresh, CTLFLAG_RW, &bufspacethresh, 0, "Bufspace consumed before waking the daemon to free some"); -static int buffreekvacnt; -SYSCTL_INT(_vfs, OID_AUTO, buffreekvacnt, CTLFLAG_RW, &buffreekvacnt, 0, +static counter_u64_t buffreekvacnt; +SYSCTL_COUNTER_U64(_vfs, OID_AUTO, buffreekvacnt, CTLFLAG_RW, &buffreekvacnt, "Number of times we have freed the KVA space from some buffer"); -static int bufdefragcnt; -SYSCTL_INT(_vfs, OID_AUTO, bufdefragcnt, CTLFLAG_RW, &bufdefragcnt, 0, +static counter_u64_t bufdefragcnt; +SYSCTL_COUNTER_U64(_vfs, OID_AUTO, 
bufdefragcnt, CTLFLAG_RW, &bufdefragcnt, "Number of times we have had to repeat buffer allocation to defragment"); static long lorunningspace; SYSCTL_PROC(_vfs, OID_AUTO, lorunningspace, CTLTYPE_LONG | CTLFLAG_MPSAFE | @@ -225,24 +211,26 @@ SYSCTL_INT(_vfs, OID_AUTO, lofreebuffers, CTLFLAG_RW, static int hifreebuffers; SYSCTL_INT(_vfs, OID_AUTO, hifreebuffers, CTLFLAG_RW, &hifreebuffers, 0, "Threshold for clean buffer recycling"); -static int getnewbufcalls; -SYSCTL_INT(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RW, &getnewbufcalls, 0, - "Number of calls to getnewbuf"); -static int getnewbufrestarts; -SYSCTL_INT(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RW, &getnewbufrestarts, 0, +static counter_u64_t getnewbufcalls; +SYSCTL_COUNTER_U64(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RD, + &getnewbufcalls, "Number of calls to getnewbuf"); +static counter_u64_t getnewbufrestarts; +SYSCTL_COUNTER_U64(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RD, + &getnewbufrestarts, "Number of times getnewbuf has had to restart a buffer acquisition"); -static int mappingrestarts; -SYSCTL_INT(_vfs, OID_AUTO, mappingrestarts, CTLFLAG_RW, &mappingrestarts, 0, +static counter_u64_t mappingrestarts; +SYSCTL_COUNTER_U64(_vfs, OID_AUTO, mappingrestarts, CTLFLAG_RD, + &mappingrestarts, "Number of times getblk has had to restart a buffer mapping for " "unmapped buffer"); -static int numbufallocfails; -SYSCTL_INT(_vfs, OID_AUTO, numbufallocfails, CTLFLAG_RW, &numbufallocfails, 0, - "Number of times buffer allocations failed"); +static counter_u64_t numbufallocfails; +SYSCTL_COUNTER_U64(_vfs, OID_AUTO, numbufallocfails, CTLFLAG_RW, + &numbufallocfails, "Number of times buffer allocations failed"); static int flushbufqtarget = 100; SYSCTL_INT(_vfs, OID_AUTO, flushbufqtarget, CTLFLAG_RW, &flushbufqtarget, 0, "Amount of work to do in flushbufqueues when helping bufdaemon"); -static long notbufdflushes; -SYSCTL_LONG(_vfs, OID_AUTO, notbufdflushes, CTLFLAG_RD, ¬bufdflushes, 0, +static counter_u64_t notbufdflushes; +SYSCTL_COUNTER_U64(_vfs, OID_AUTO, notbufdflushes, CTLFLAG_RD, ¬bufdflushes, "Number of dirty buffer flushes done by the bufdaemon helpers"); static long barrierwrites; SYSCTL_LONG(_vfs, OID_AUTO, barrierwrites, CTLFLAG_RW, &barrierwrites, 0, @@ -266,11 +254,6 @@ static struct mtx_padalign __exclusive_cache_line bdlo static struct mtx_padalign __exclusive_cache_line rbreqlock; /* - * Lock that protects needsbuffer and the sleeps/wakeups surrounding it. - */ -static struct rwlock_padalign __exclusive_cache_line nblock; - -/* * Lock that protects bdirtywait. */ static struct mtx_padalign __exclusive_cache_line bdirtylock; @@ -283,11 +266,6 @@ static struct mtx_padalign __exclusive_cache_line bdir static int bd_request; /* - * Request/wakeup point for the bufspace daemon. - */ -static int bufspace_request; - -/* * Request for the buf daemon to write more buffers than is indicated by * lodirtybuf. This may be necessary to push out excess dependencies or * defragment the address space where a simple count of the number of dirty @@ -302,15 +280,6 @@ static int bd_speedupreq; */ static int runningbufreq; -/* - * Synchronization (sleep/wakeup) variable for buffer requests. - * Can contain the VFS_BIO_NEED flags defined below; setting/clearing is done - * by and/or. - * Used in numdirtywakeup(), bufspace_wakeup(), bwillwrite(), - * getnewbuf(), and getblk(). - */ -static volatile int needsbuffer; - /* * Synchronization for bwillwrite() waiters. 
*/ @@ -323,29 +292,69 @@ static int bdirtywait; #define QUEUE_EMPTY 1 /* empty buffer headers */ #define QUEUE_DIRTY 2 /* B_DELWRI buffers */ #define QUEUE_CLEAN 3 /* non-B_DELWRI buffers */ -#define QUEUE_SENTINEL 1024 /* not an queue index, but mark for sentinel */ +#define QUEUE_SENTINEL 4 /* not an queue index, but mark for sentinel */ -/* Maximum number of clean buffer queues. */ -#define CLEAN_QUEUES 16 +struct bufqueue { + struct mtx_padalign bq_lock; + TAILQ_HEAD(, buf) bq_queue; + uint8_t bq_index; + uint16_t bq_subqueue; + int bq_len; +} __aligned(CACHE_LINE_SIZE); +#define BQ_LOCKPTR(bq) (&(bq)->bq_lock) +#define BQ_LOCK(bq) mtx_lock(BQ_LOCKPTR((bq))) +#define BQ_UNLOCK(bq) mtx_unlock(BQ_LOCKPTR((bq))) +#define BQ_ASSERT_LOCKED(bq) mtx_assert(BQ_LOCKPTR((bq)), MA_OWNED) + +struct bufqueue __exclusive_cache_line bqempty; +struct bufqueue __exclusive_cache_line bqdirty; + +struct bufdomain { + struct bufqueue bd_subq[MAXCPU + 1]; /* Per-cpu sub queues + global */ + struct bufqueue *bd_cleanq; + struct mtx_padalign bd_run_lock; + /* Constants */ + long bd_maxbufspace; + long bd_hibufspace; + long bd_lobufspace; + long bd_bufspacethresh; + int bd_hifreebuffers; + int bd_lofreebuffers; + int bd_lim; + /* atomics */ + int bd_wanted; + int __aligned(CACHE_LINE_SIZE) bd_running; + long __aligned(CACHE_LINE_SIZE) bd_bufspace; + int __aligned(CACHE_LINE_SIZE) bd_freebuffers; +} __aligned(CACHE_LINE_SIZE); + +#define BD_LOCKPTR(bd) (&(bd)->bd_cleanq->bq_lock) +#define BD_LOCK(bd) mtx_lock(BD_LOCKPTR((bd))) +#define BD_UNLOCK(bd) mtx_unlock(BD_LOCKPTR((bd))) +#define BD_ASSERT_LOCKED(bd) mtx_assert(BD_LOCKPTR((bd)), MA_OWNED) +#define BD_RUN_LOCKPTR(bd) (&(bd)->bd_run_lock) +#define BD_RUN_LOCK(bd) mtx_lock(BD_RUN_LOCKPTR((bd))) +#define BD_RUN_UNLOCK(bd) mtx_unlock(BD_RUN_LOCKPTR((bd))) +#define BD_DOMAIN(bd) (bd - bdclean) + +/* Maximum number of clean buffer domains. */ +#define CLEAN_DOMAINS 8 + /* Configured number of clean queues. */ -static int clean_queues; +static int __read_mostly clean_domains; -/* Maximum number of buffer queues. */ -#define BUFFER_QUEUES (QUEUE_CLEAN + CLEAN_QUEUES) +struct bufdomain __exclusive_cache_line bdclean[CLEAN_DOMAINS]; -/* Queues for free buffers with various properties */ -static TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES] = { { 0 } }; -#ifdef INVARIANTS -static int bq_len[BUFFER_QUEUES]; -#endif +static void bq_remove(struct bufqueue *bq, struct buf *bp); +static void bq_insert(struct bufqueue *bq, struct buf *bp, bool unlock); +static int buf_recycle(struct bufdomain *, bool kva); +static void bq_init(struct bufqueue *bq, int qindex, int cpu, + const char *lockname); +static void bd_init(struct bufdomain *bd); +static int bd_flushall(struct bufdomain *bd); /* - * Lock for each bufqueue - */ -static struct mtx_padalign __exclusive_cache_line bqlocks[BUFFER_QUEUES]; - -/* * per-cpu empty buffer cache. */ uma_zone_t buf_zone; @@ -391,46 +400,34 @@ sysctl_bufspace(SYSCTL_HANDLER_ARGS) { long lvalue; int ivalue; + int i; + lvalue = 0; + for (i = 0; i < clean_domains; i++) + lvalue += bdclean[i].bd_bufspace; if (sizeof(int) == sizeof(long) || req->oldlen >= sizeof(long)) - return (sysctl_handle_long(oidp, arg1, arg2, req)); - lvalue = *(long *)arg1; + return (sysctl_handle_long(oidp, &lvalue, 0, req)); if (lvalue > INT_MAX) /* On overflow, still write out a long to trigger ENOMEM. 
*/ return (sysctl_handle_long(oidp, &lvalue, 0, req)); ivalue = lvalue; return (sysctl_handle_int(oidp, &ivalue, 0, req)); } -#endif - +#else static int -bqcleanq(void) +sysctl_bufspace(SYSCTL_HANDLER_ARGS) { - static int nextq; + long lvalue; + int i; - return ((atomic_fetchadd_int(&nextq, 1) % clean_queues) + QUEUE_CLEAN); + lvalue = 0; + for (i = 0; i < clean_domains; i++) + lvalue += bdclean[i].bd_bufspace; + return (sysctl_handle_int(oidp, &lvalue, 0, req)); } +#endif -static int -bqisclean(int qindex) -{ - - return (qindex >= QUEUE_CLEAN && qindex < QUEUE_CLEAN + CLEAN_QUEUES); -} - /* - * bqlock: - * - * Return the appropriate queue lock based on the index. - */ -static inline struct mtx * -bqlock(int qindex) -{ - - return (struct mtx *)&bqlocks[qindex]; -} - -/* * bdirtywakeup: * * Wakeup any bwillwrite() waiters. @@ -481,47 +478,50 @@ bdirtyadd(void) } /* - * bufspace_wakeup: + * bufspace_daemon_wakeup: * - * Called when buffer space is potentially available for recovery. - * getnewbuf() will block on this flag when it is unable to free - * sufficient buffer space. Buffer space becomes recoverable when - * bp's get placed back in the queues. + * Wakeup the daemons responsible for freeing clean bufs. */ static void -bufspace_wakeup(void) +bufspace_daemon_wakeup(struct bufdomain *bd) { /* - * If someone is waiting for bufspace, wake them up. - * - * Since needsbuffer is set prior to doing an additional queue - * scan it is safe to check for the flag prior to acquiring the - * lock. The thread that is preparing to scan again before - * blocking would discover the buf we released. + * avoid the lock if the daemon is running. */ - if (needsbuffer) { - rw_rlock(&nblock); - if (atomic_cmpset_int(&needsbuffer, 1, 0) == 1) - wakeup(__DEVOLATILE(void *, &needsbuffer)); - rw_runlock(&nblock); + if (atomic_fetchadd_int(&bd->bd_running, 1) == 0) { + BD_RUN_LOCK(bd); + atomic_store_int(&bd->bd_running, 1); + wakeup(&bd->bd_running); + BD_RUN_UNLOCK(bd); } } /* - * bufspace_daemonwakeup: + * bufspace_daemon_wait: * - * Wakeup the daemon responsible for freeing clean bufs. + * Sleep until the domain falls below a limit or one second passes. */ static void -bufspace_daemonwakeup(void) +bufspace_daemon_wait(struct bufdomain *bd) { - rw_rlock(&nblock); - if (bufspace_request == 0) { - bufspace_request = 1; - wakeup(&bufspace_request); + /* + * Re-check our limits and sleep. bd_running must be + * cleared prior to checking the limits to avoid missed + * wakeups. The waker will adjust one of bufspace or + * freebuffers prior to checking bd_running. + */ + BD_RUN_LOCK(bd); + atomic_store_int(&bd->bd_running, 0); + if (bd->bd_bufspace < bd->bd_bufspacethresh && + bd->bd_freebuffers > bd->bd_lofreebuffers) { + msleep(&bd->bd_running, BD_RUN_LOCKPTR(bd), PRIBIO|PDROP, + "-", hz); + } else { + /* Avoid spurious wakeups while running. */ + atomic_store_int(&bd->bd_running, 1); + BD_RUN_UNLOCK(bd); } - rw_runlock(&nblock); } /* @@ -533,20 +533,22 @@ bufspace_daemonwakeup(void) static void bufspace_adjust(struct buf *bp, int bufsize) { + struct bufdomain *bd; long space; int diff; KASSERT((bp->b_flags & B_MALLOC) == 0, ("bufspace_adjust: malloc buf %p", bp)); + bd = &bdclean[bp->b_domain]; diff = bufsize - bp->b_bufsize; if (diff < 0) { - atomic_subtract_long(&bufspace, -diff); - bufspace_wakeup(); + atomic_subtract_long(&bd->bd_bufspace, -diff); } else { - space = atomic_fetchadd_long(&bufspace, diff); + space = atomic_fetchadd_long(&bd->bd_bufspace, diff); /* Wake up the daemon on the transition. 
*/ - if (space < bufspacethresh && space + diff >= bufspacethresh) - bufspace_daemonwakeup(); + if (space < bd->bd_bufspacethresh && + space + diff >= bd->bd_bufspacethresh) + bufspace_daemon_wakeup(bd); } bp->b_bufsize = bufsize; } @@ -558,24 +560,25 @@ bufspace_adjust(struct buf *bp, int bufsize) * different space limit than data. */ static int -bufspace_reserve(int size, bool metadata) +bufspace_reserve(struct bufdomain *bd, int size, bool metadata) { - long limit; + long limit, new; long space; if (metadata) - limit = maxbufspace; + limit = bd->bd_maxbufspace; else - limit = hibufspace; - do { - space = bufspace; - if (space + size > limit) - return (ENOSPC); - } while (atomic_cmpset_long(&bufspace, space, space + size) == 0); + limit = bd->bd_hibufspace; + space = atomic_fetchadd_long(&bd->bd_bufspace, size); + new = space + size; + if (new > limit) { + atomic_subtract_long(&bd->bd_bufspace, size); + return (ENOSPC); + } /* Wake up the daemon on the transition. */ - if (space < bufspacethresh && space + size >= bufspacethresh) - bufspace_daemonwakeup(); + if (space < bd->bd_bufspacethresh && new >= bd->bd_bufspacethresh) + bufspace_daemon_wakeup(bd); return (0); } @@ -586,21 +589,22 @@ bufspace_reserve(int size, bool metadata) * Release reserved bufspace after bufspace_adjust() has consumed it. */ static void -bufspace_release(int size) +bufspace_release(struct bufdomain *bd, int size) { - atomic_subtract_long(&bufspace, size); - bufspace_wakeup(); + + atomic_subtract_long(&bd->bd_bufspace, size); } /* * bufspace_wait: * * Wait for bufspace, acting as the buf daemon if a locked vnode is - * supplied. needsbuffer must be set in a safe fashion prior to - * polling for space. The operation must be re-tried on return. + * supplied. bd_wanted must be set prior to polling for space. The + * operation must be re-tried on return. */ static void -bufspace_wait(struct vnode *vp, int gbflags, int slpflag, int slptimeo) +bufspace_wait(struct bufdomain *bd, struct vnode *vp, int gbflags, + int slpflag, int slptimeo) { struct thread *td; int error, fl, norunbuf; @@ -609,11 +613,11 @@ bufspace_wait(struct vnode *vp, int gbflags, int slpfl return; td = curthread; - rw_wlock(&nblock); - while (needsbuffer != 0) { + BD_LOCK(bd); + while (bd->bd_wanted) { if (vp != NULL && vp->v_type != VCHR && (td->td_pflags & TDP_BUFNEED) == 0) { - rw_wunlock(&nblock); + BD_UNLOCK(bd); /* * getblk() is called with a vnode locked, and * some majority of the dirty buffers may as @@ -636,18 +640,18 @@ bufspace_wait(struct vnode *vp, int gbflags, int slpfl td->td_pflags |= TDP_BUFNEED | TDP_NORUNNINGBUF; fl = buf_flush(vp, flushbufqtarget); td->td_pflags &= norunbuf; - rw_wlock(&nblock); + BD_LOCK(bd); if (fl != 0) continue; - if (needsbuffer == 0) + if (bd->bd_wanted == 0) break; } - error = rw_sleep(__DEVOLATILE(void *, &needsbuffer), &nblock, + error = msleep(&bd->bd_wanted, BD_LOCKPTR(bd), (PRIBIO + 4) | slpflag, "newbuf", slptimeo); if (error != 0) break; } - rw_wunlock(&nblock); + BD_UNLOCK(bd); } @@ -659,10 +663,13 @@ bufspace_wait(struct vnode *vp, int gbflags, int slpfl * block nor work to reclaim buffers. */ static void -bufspace_daemon(void) +bufspace_daemon(void *arg) { + struct bufdomain *bd; + + bd = arg; for (;;) { - kproc_suspend_check(bufspacedaemonproc); + kproc_suspend_check(curproc); /* * Free buffers from the clean queue until we meet our @@ -689,46 +696,25 @@ bufspace_daemon(void) * which will inefficiently trade bufs with bqrelse * until we return to condition 2. 
*/ - while (bufspace > lobufspace || - numfreebuffers < hifreebuffers) { - if (buf_recycle(false) != 0) { - atomic_set_int(&needsbuffer, 1); - if (buf_recycle(false) != 0) { - rw_wlock(&nblock); - if (needsbuffer) - rw_sleep(__DEVOLATILE(void *, - &needsbuffer), &nblock, - PRIBIO|PDROP, "bufspace", - hz/10); - else - rw_wunlock(&nblock); - } + do { + if (buf_recycle(bd, false) != 0) { + if (bd_flushall(bd)) + continue; + BD_LOCK(bd); + if (bd->bd_wanted) { + msleep(&bd->bd_wanted, BD_LOCKPTR(bd), + PRIBIO|PDROP, "bufspace", hz/10); + } else + BD_UNLOCK(bd); } maybe_yield(); - } + } while (bd->bd_bufspace > bd->bd_lobufspace || + bd->bd_freebuffers < bd->bd_hifreebuffers); - /* - * Re-check our limits under the exclusive nblock. - */ - rw_wlock(&nblock); - if (bufspace < bufspacethresh && - numfreebuffers > lofreebuffers) { - bufspace_request = 0; - rw_sleep(&bufspace_request, &nblock, PRIBIO|PDROP, - "-", hz); - } else - rw_wunlock(&nblock); + bufspace_daemon_wait(bd); } } -static struct kproc_desc bufspace_kp = { - "bufspacedaemon", - bufspace_daemon, - &bufspacedaemonproc -}; -SYSINIT(bufspacedaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start, - &bufspace_kp); - /* * bufmallocadjust: * @@ -842,7 +828,7 @@ vfs_buf_test_cache(struct buf *bp, vm_ooffset_t foff, } /* Wake up the buffer daemon if necessary */ -static __inline void +static void bd_wakeup(void) { @@ -1038,19 +1024,12 @@ bufinit(void) KASSERT(maxbcachebuf >= MAXBSIZE, ("maxbcachebuf (%d) must be >= MAXBSIZE (%d)\n", maxbcachebuf, MAXBSIZE)); - mtx_init(&bqlocks[QUEUE_DIRTY], "bufq dirty lock", NULL, MTX_DEF); - mtx_init(&bqlocks[QUEUE_EMPTY], "bufq empty lock", NULL, MTX_DEF); - for (i = QUEUE_CLEAN; i < QUEUE_CLEAN + CLEAN_QUEUES; i++) - mtx_init(&bqlocks[i], "bufq clean lock", NULL, MTX_DEF); + bq_init(&bqempty, QUEUE_EMPTY, -1, "bufq empty lock"); + bq_init(&bqdirty, QUEUE_DIRTY, -1, "bufq dirty lock"); mtx_init(&rbreqlock, "runningbufspace lock", NULL, MTX_DEF); - rw_init(&nblock, "needsbuffer lock"); mtx_init(&bdlock, "buffer daemon lock", NULL, MTX_DEF); mtx_init(&bdirtylock, "dirty buf lock", NULL, MTX_DEF); - /* next, make a null set of free lists */ - for (i = 0; i < BUFFER_QUEUES; i++) - TAILQ_INIT(&bufqueues[i]); - unmapped_buf = (caddr_t)kva_alloc(MAXPHYS); /* finally, initialize each buffer header and stick on empty q */ @@ -1060,15 +1039,14 @@ bufinit(void) bp->b_flags = B_INVAL; bp->b_rcred = NOCRED; bp->b_wcred = NOCRED; - bp->b_qindex = QUEUE_EMPTY; + bp->b_qindex = QUEUE_NONE; + bp->b_domain = -1; + bp->b_subqueue = mp_ncpus; bp->b_xflags = 0; bp->b_data = bp->b_kvabase = unmapped_buf; LIST_INIT(&bp->b_dep); BUF_LOCKINIT(bp); - TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist); -#ifdef INVARIANTS - bq_len[QUEUE_EMPTY]++; -#endif + bq_insert(&bqempty, bp, false); } /* @@ -1150,8 +1128,31 @@ bufinit(void) * One queue per-256mb up to the max. More queues gives better * concurrency but less accurate LRU. 
*/ - clean_queues = MIN(howmany(maxbufspace, 256*1024*1024), CLEAN_QUEUES); + clean_domains = MIN(howmany(maxbufspace, 256*1024*1024), CLEAN_DOMAINS); + for (i = 0 ; i < clean_domains; i++) { + struct bufdomain *bd; + bd = &bdclean[i]; + bd_init(bd); + bd->bd_freebuffers = nbuf / clean_domains; + bd->bd_hifreebuffers = hifreebuffers / clean_domains; + bd->bd_lofreebuffers = lofreebuffers / clean_domains; + bd->bd_bufspace = 0; + bd->bd_maxbufspace = maxbufspace / clean_domains; + bd->bd_hibufspace = hibufspace / clean_domains; + bd->bd_lobufspace = lobufspace / clean_domains; + bd->bd_bufspacethresh = bufspacethresh / clean_domains; + /* Don't allow more than 2% of bufs in the per-cpu caches. */ + bd->bd_lim = nbuf / clean_domains / 50 / mp_ncpus; + } + getnewbufcalls = counter_u64_alloc(M_WAITOK); + getnewbufrestarts = counter_u64_alloc(M_WAITOK); + mappingrestarts = counter_u64_alloc(M_WAITOK); + numbufallocfails = counter_u64_alloc(M_WAITOK); + notbufdflushes = counter_u64_alloc(M_WAITOK); + buffreekvacnt = counter_u64_alloc(M_WAITOK); + bufdefragcnt = counter_u64_alloc(M_WAITOK); + bufkvaspace = counter_u64_alloc(M_WAITOK); } #ifdef INVARIANTS @@ -1326,58 +1327,92 @@ bpmap_qenter(struct buf *bp) (vm_offset_t)(bp->b_offset & PAGE_MASK)); } +static struct bufqueue * +bufqueue(struct buf *bp) +{ + + switch (bp->b_qindex) { + case QUEUE_NONE: + /* FALLTHROUGH */ + case QUEUE_SENTINEL: + return (NULL); + case QUEUE_EMPTY: + return (&bqempty); + case QUEUE_DIRTY: + return (&bqdirty); + case QUEUE_CLEAN: + return (&bdclean[bp->b_domain].bd_subq[bp->b_subqueue]); + default: + break; + } + panic("bufqueue(%p): Unhandled type %d\n", bp, bp->b_qindex); +} + /* + * Return the locked bufqueue that bp is a member of. + */ +static struct bufqueue * +bufqueue_acquire(struct buf *bp) +{ + struct bufqueue *bq, *nbq; + + /* + * bp can be pushed from a per-cpu queue to the + * cleanq while we're waiting on the lock. Retry + * if the queues don't match. + */ + bq = bufqueue(bp); + BQ_LOCK(bq); + for (;;) { + nbq = bufqueue(bp); + if (bq == nbq) + break; + BQ_UNLOCK(bq); + BQ_LOCK(nbq); + bq = nbq; + } + return (bq); +} + +/* * binsfree: * - * Insert the buffer into the appropriate free list. + * Insert the buffer into the appropriate free list. Requires a + * locked buffer on entry and buffer is unlocked before return. */ static void binsfree(struct buf *bp, int qindex) { - struct mtx *olock, *nlock; + struct bufdomain *bd; + struct bufqueue *bq; - if (qindex != QUEUE_EMPTY) { - BUF_ASSERT_XLOCKED(bp); - } + KASSERT(qindex == QUEUE_CLEAN || qindex == QUEUE_DIRTY, + ("binsfree: Invalid qindex %d", qindex)); + BUF_ASSERT_XLOCKED(bp); /* - * Stick to the same clean queue for the lifetime of the buf to - * limit locking below. Otherwise pick ont sequentially. - */ - if (qindex == QUEUE_CLEAN) { - if (bqisclean(bp->b_qindex)) - qindex = bp->b_qindex; - else - qindex = bqcleanq(); - } - - /* * Handle delayed bremfree() processing. 
*/ - nlock = bqlock(qindex); if (bp->b_flags & B_REMFREE) { - olock = bqlock(bp->b_qindex); - mtx_lock(olock); - bremfreel(bp); - if (olock != nlock) { - mtx_unlock(olock); - mtx_lock(nlock); + if (bp->b_qindex == qindex) { + bp->b_flags |= B_REUSE; + bp->b_flags &= ~B_REMFREE; + BUF_UNLOCK(bp); + return; } + bq = bufqueue_acquire(bp); + bq_remove(bq, bp); + BQ_UNLOCK(bq); + } + if (qindex == QUEUE_CLEAN) { + bd = &bdclean[bp->b_domain]; + if (bd->bd_lim != 0) + bq = &bd->bd_subq[PCPU_GET(cpuid)]; + else + bq = bd->bd_cleanq; } else - mtx_lock(nlock); - - if (bp->b_qindex != QUEUE_NONE) - panic("binsfree: free buffer onto another queue???"); - - bp->b_qindex = qindex; - if (bp->b_flags & B_AGE) - TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist); - else - TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp, b_freelist); -#ifdef INVARIANTS - bq_len[bp->b_qindex]++; -#endif - mtx_unlock(nlock); + bq = &bqdirty; + bq_insert(bq, bp, true); } /* @@ -1404,10 +1439,9 @@ buf_free(struct buf *bp) if (!LIST_EMPTY(&bp->b_dep)) buf_deallocate(bp); bufkva_free(bp); + atomic_add_int(&bdclean[bp->b_domain].bd_freebuffers, 1); BUF_UNLOCK(bp); uma_zfree(buf_zone, bp); - atomic_add_int(&numfreebuffers, 1); - bufspace_wakeup(); } /* @@ -1424,15 +1458,15 @@ buf_import(void *arg, void **store, int cnt, int domai struct buf *bp; int i; - mtx_lock(&bqlocks[QUEUE_EMPTY]); + BQ_LOCK(&bqempty); for (i = 0; i < cnt; i++) { - bp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]); + bp = TAILQ_FIRST(&bqempty.bq_queue); if (bp == NULL) break; - bremfreel(bp); + bq_remove(&bqempty, bp); store[i] = bp; } - mtx_unlock(&bqlocks[QUEUE_EMPTY]); + BQ_UNLOCK(&bqempty); return (i); } @@ -1445,10 +1479,21 @@ buf_import(void *arg, void **store, int cnt, int domai static void buf_release(void *arg, void **store, int cnt) { + struct bufqueue *bq; + struct buf *bp; int i; - for (i = 0; i < cnt; i++) - binsfree(store[i], QUEUE_EMPTY); + bq = &bqempty; + BQ_LOCK(bq); + for (i = 0; i < cnt; i++) { + bp = store[i]; + /* Inline bq_insert() to batch locking. */ + TAILQ_INSERT_TAIL(&bq->bq_queue, bp, b_freelist); + bp->b_flags &= ~(B_AGE | B_REUSE); + bq->bq_len++; + bp->b_qindex = bq->bq_index; + } + BQ_UNLOCK(bq); } /* @@ -1457,22 +1502,31 @@ buf_release(void *arg, void **store, int cnt) * Allocate an empty buffer header. */ static struct buf * -buf_alloc(void) +buf_alloc(struct bufdomain *bd) { struct buf *bp; + int freebufs; - bp = uma_zalloc(buf_zone, M_NOWAIT); + /* + * We can only run out of bufs in the buf zone if the average buf + * is less than BKVASIZE. In this case the actual wait/block will + * come from buf_reycle() failing to flush one of these small bufs. + */ + bp = NULL; + freebufs = atomic_fetchadd_int(&bd->bd_freebuffers, -1); + if (freebufs > 0) + bp = uma_zalloc(buf_zone, M_NOWAIT); if (bp == NULL) { - bufspace_daemonwakeup(); - atomic_add_int(&numbufallocfails, 1); + atomic_fetchadd_int(&bd->bd_freebuffers, 1); + bufspace_daemon_wakeup(bd); + counter_u64_add(numbufallocfails, 1); return (NULL); } - /* - * Wake-up the bufspace daemon on transition. + * Wake-up the bufspace daemon on transition below threshold. 
*/ - if (atomic_fetchadd_int(&numfreebuffers, -1) == lofreebuffers) - bufspace_daemonwakeup(); + if (freebufs == bd->bd_lofreebuffers) + bufspace_daemon_wakeup(bd); if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) panic("getnewbuf_empty: Locked buf %p on free queue.", bp); @@ -1488,6 +1542,7 @@ buf_alloc(void) KASSERT(bp->b_kvasize == 0, ("bp: %p still has kva\n", bp)); KASSERT(bp->b_bufsize == 0, ("bp: %p still has bufspace\n", bp)); + bp->b_domain = BD_DOMAIN(bd); bp->b_flags = 0; bp->b_ioflags = 0; bp->b_xflags = 0; @@ -1512,22 +1567,26 @@ buf_alloc(void) } /* - * buf_qrecycle: + * buf_recycle: * * Free a buffer from the given bufqueue. kva controls whether the * freed buf must own some kva resources. This is used for * defragmenting. */ static int -buf_qrecycle(int qindex, bool kva) +buf_recycle(struct bufdomain *bd, bool kva) { + struct bufqueue *bq; struct buf *bp, *nbp; if (kva) - atomic_add_int(&bufdefragcnt, 1); + counter_u64_add(bufdefragcnt, 1); nbp = NULL; - mtx_lock(&bqlocks[qindex]); - nbp = TAILQ_FIRST(&bufqueues[qindex]); + bq = bd->bd_cleanq; + BQ_LOCK(bq); + KASSERT(BQ_LOCKPTR(bq) == BD_LOCKPTR(bd), + ("buf_recycle: Locks don't match")); + nbp = TAILQ_FIRST(&bq->bq_queue); /* * Run scan, possibly freeing data and/or kva mappings on the fly @@ -1551,6 +1610,18 @@ buf_qrecycle(int qindex, bool kva) continue; /* + * Implement a second chance algorithm for frequently + * accessed buffers. + */ + if ((bp->b_flags & B_REUSE) != 0) { + TAILQ_REMOVE(&bq->bq_queue, bp, b_freelist); + TAILQ_INSERT_TAIL(&bq->bq_queue, bp, b_freelist); + bp->b_flags &= ~B_REUSE; + BUF_UNLOCK(bp); + continue; + } + + /* * Skip buffers with background writes in progress. */ if ((bp->b_vflags & BV_BKGRDINPROG) != 0) { @@ -1558,14 +1629,18 @@ buf_qrecycle(int qindex, bool kva) continue; } - KASSERT(bp->b_qindex == qindex, - ("getnewbuf: inconsistent queue %d bp %p", qindex, bp)); + KASSERT(bp->b_qindex == QUEUE_CLEAN, + ("buf_recycle: inconsistent queue %d bp %p", + bp->b_qindex, bp)); + KASSERT(bp->b_domain == BD_DOMAIN(bd), + ("getnewbuf: queue domain %d doesn't match request %d", + bp->b_domain, (int)BD_DOMAIN(bd))); /* * NOTE: nbp is now entirely invalid. We can only restart * the scan from this point on. */ - bremfreel(bp); - mtx_unlock(&bqlocks[qindex]); + bq_remove(bq, bp); + BQ_UNLOCK(bq); /* * Requeue the background write buffer with error and @@ -1573,70 +1648,21 @@ buf_qrecycle(int qindex, bool kva) */ if ((bp->b_vflags & BV_BKGRDERR) != 0) { bqrelse(bp); - mtx_lock(&bqlocks[qindex]); - nbp = TAILQ_FIRST(&bufqueues[qindex]); + BQ_LOCK(bq); + nbp = TAILQ_FIRST(&bq->bq_queue); continue; } bp->b_flags |= B_INVAL; brelse(bp); return (0); } - mtx_unlock(&bqlocks[qindex]); + bd->bd_wanted = 1; + BQ_UNLOCK(bq); return (ENOBUFS); } *** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
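For readers skimming the truncated diff: the B_REUSE handling added to the
buf_recycle() scan above reduces to the following second-chance step.  This
is a sketch only; buf_second_chance() is a hypothetical helper, and the
patch open-codes the same logic inside the scan loop with the bufqueue and
buf locks already held.

	/*
	 * A buf touched since it was queued (B_REUSE set) is rotated to
	 * the queue tail instead of being reclaimed, approximating LRU
	 * without taking the queue lock on every access.
	 */
	static bool
	buf_second_chance(struct bufqueue *bq, struct buf *bp)
	{

		if ((bp->b_flags & B_REUSE) == 0)
			return (false);
		TAILQ_REMOVE(&bq->bq_queue, bp, b_freelist);
		TAILQ_INSERT_TAIL(&bq->bq_queue, bp, b_freelist);
		bp->b_flags &= ~B_REUSE;
		BUF_UNLOCK(bp);
		return (true);	/* caller continues the scan */
	}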