From owner-svn-src-user@freebsd.org Tue Mar 13 19:28:04 2018 Return-Path: Delivered-To: svn-src-user@mailman.ysv.freebsd.org Received: from mx1.freebsd.org (mx1.freebsd.org [IPv6:2610:1c1:1:606c::19:1]) by mailman.ysv.freebsd.org (Postfix) with ESMTP id 943D2F4934F for ; Tue, 13 Mar 2018 19:28:04 +0000 (UTC) (envelope-from jeff@FreeBSD.org) Received: from mxrelay.nyi.freebsd.org (mxrelay.nyi.freebsd.org [IPv6:2610:1c1:1:606c::19:3]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (Client CN "mxrelay.nyi.freebsd.org", Issuer "Let's Encrypt Authority X3" (verified OK)) by mx1.freebsd.org (Postfix) with ESMTPS id 39D7770120; Tue, 13 Mar 2018 19:28:04 +0000 (UTC) (envelope-from jeff@FreeBSD.org) Received: from repo.freebsd.org (repo.freebsd.org [IPv6:2610:1c1:1:6068::e6a:0]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (Client did not present a certificate) by mxrelay.nyi.freebsd.org (Postfix) with ESMTPS id 30A9D1DF7B; Tue, 13 Mar 2018 19:28:04 +0000 (UTC) (envelope-from jeff@FreeBSD.org) Received: from repo.freebsd.org ([127.0.1.37]) by repo.freebsd.org (8.15.2/8.15.2) with ESMTP id w2DJS4KH096199; Tue, 13 Mar 2018 19:28:04 GMT (envelope-from jeff@FreeBSD.org) Received: (from jeff@localhost) by repo.freebsd.org (8.15.2/8.15.2/Submit) id w2DJS4Ax096198; Tue, 13 Mar 2018 19:28:04 GMT (envelope-from jeff@FreeBSD.org) Message-Id: <201803131928.w2DJS4Ax096198@repo.freebsd.org> X-Authentication-Warning: repo.freebsd.org: jeff set sender to jeff@FreeBSD.org using -f From: Jeff Roberson Date: Tue, 13 Mar 2018 19:28:04 +0000 (UTC) To: src-committers@freebsd.org, svn-src-user@freebsd.org Subject: svn commit: r330874 - user/jeff/numa/sys/kern X-SVN-Group: user X-SVN-Commit-Author: jeff X-SVN-Commit-Paths: user/jeff/numa/sys/kern X-SVN-Commit-Revision: 330874 X-SVN-Commit-Repository: base MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit X-BeenThere: svn-src-user@freebsd.org 
X-Mailman-Version: 2.1.25 Precedence: list List-Id: "SVN commit messages for the experimental " user" src tree" List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Tue, 13 Mar 2018 19:28:05 -0000 Author: jeff Date: Tue Mar 13 19:28:03 2018 New Revision: 330874 URL: https://svnweb.freebsd.org/changeset/base/330874 Log: Resolve a deadlock when softdep leaves too many buffers locked waiting on I/O. Add some new debugging output to bufqueues. Make sysctls R/W again. Reported by: pho, bde Modified: user/jeff/numa/sys/kern/vfs_bio.c Modified: user/jeff/numa/sys/kern/vfs_bio.c ============================================================================== --- user/jeff/numa/sys/kern/vfs_bio.c Tue Mar 13 18:33:50 2018 (r330873) +++ user/jeff/numa/sys/kern/vfs_bio.c Tue Mar 13 19:28:03 2018 (r330874) @@ -101,7 +101,52 @@ struct buf_ops buf_ops_bio = { .bop_bdflush = bufbdflush, }; -struct bufdomain; +struct bufqueue { + struct mtx_padalign bq_lock; + TAILQ_HEAD(, buf) bq_queue; + uint8_t bq_index; + uint16_t bq_subqueue; + int bq_len; +} __aligned(CACHE_LINE_SIZE); + +#define BQ_LOCKPTR(bq) (&(bq)->bq_lock) +#define BQ_LOCK(bq) mtx_lock(BQ_LOCKPTR((bq))) +#define BQ_UNLOCK(bq) mtx_unlock(BQ_LOCKPTR((bq))) +#define BQ_ASSERT_LOCKED(bq) mtx_assert(BQ_LOCKPTR((bq)), MA_OWNED) + +struct bufdomain { + struct bufqueue bd_subq[MAXCPU + 1]; /* Per-cpu sub queues + global */ + struct bufqueue bd_dirtyq; + struct bufqueue *bd_cleanq; + struct mtx_padalign bd_run_lock; + /* Constants */ + long bd_maxbufspace; + long bd_hibufspace; + long bd_lobufspace; + long bd_bufspacethresh; + int bd_hifreebuffers; + int bd_lofreebuffers; + int bd_hidirtybuffers; + int bd_lodirtybuffers; + int bd_dirtybufthresh; + int bd_lim; + /* atomics */ + int bd_wanted; + int __aligned(CACHE_LINE_SIZE) bd_numdirtybuffers; + int __aligned(CACHE_LINE_SIZE) bd_running; + long __aligned(CACHE_LINE_SIZE) bd_bufspace; + int __aligned(CACHE_LINE_SIZE) bd_freebuffers; +} 
__aligned(CACHE_LINE_SIZE); + +#define BD_LOCKPTR(bd) (&(bd)->bd_cleanq->bq_lock) +#define BD_LOCK(bd) mtx_lock(BD_LOCKPTR((bd))) +#define BD_UNLOCK(bd) mtx_unlock(BD_LOCKPTR((bd))) +#define BD_ASSERT_LOCKED(bd) mtx_assert(BD_LOCKPTR((bd)), MA_OWNED) +#define BD_RUN_LOCKPTR(bd) (&(bd)->bd_run_lock) +#define BD_RUN_LOCK(bd) mtx_lock(BD_RUN_LOCKPTR((bd))) +#define BD_RUN_UNLOCK(bd) mtx_unlock(BD_RUN_LOCKPTR((bd))) +#define BD_DOMAIN(bd) (bd - bdomain) + static struct buf *buf; /* buffer header pool */ extern struct buf *swbuf; /* Swap buffer header pool. */ caddr_t unmapped_buf; @@ -136,6 +181,15 @@ static int buf_import(void *, void **, int, int, int); static void buf_release(void *, void **, int); static void maxbcachebuf_adjust(void); static inline struct bufdomain *bufdomain(struct buf *); +static void bq_remove(struct bufqueue *bq, struct buf *bp); +static void bq_insert(struct bufqueue *bq, struct buf *bp, bool unlock); +static int buf_recycle(struct bufdomain *, bool kva); +static void bq_init(struct bufqueue *bq, int qindex, int cpu, + const char *lockname); +static void bd_init(struct bufdomain *bd); +static int bd_flushall(struct bufdomain *bd); +static int sysctl_bufdomain_long(SYSCTL_HANDLER_ARGS); +static int sysctl_bufdomain_int(SYSCTL_HANDLER_ARGS); static int sysctl_bufspace(SYSCTL_HANDLER_ARGS); int vmiodirenable = TRUE; @@ -150,23 +204,31 @@ static counter_u64_t bufkvaspace; SYSCTL_COUNTER_U64(_vfs, OID_AUTO, bufkvaspace, CTLFLAG_RD, &bufkvaspace, "Kernel virtual memory used for buffers"); static long maxbufspace; -SYSCTL_LONG(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RD, &maxbufspace, 0, +SYSCTL_PROC(_vfs, OID_AUTO, maxbufspace, + CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RW, &maxbufspace, + __offsetof(struct bufdomain, bd_maxbufspace), sysctl_bufdomain_long, "L", "Maximum allowed value of bufspace (including metadata)"); static long bufmallocspace; SYSCTL_LONG(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD, &bufmallocspace, 0, "Amount of malloced memory for 
buffers"); static long maxbufmallocspace; -SYSCTL_LONG(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RD, &maxbufmallocspace, +SYSCTL_LONG(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW, &maxbufmallocspace, 0, "Maximum amount of malloced memory for buffers"); static long lobufspace; -SYSCTL_LONG(_vfs, OID_AUTO, lobufspace, CTLFLAG_RD, &lobufspace, 0, +SYSCTL_PROC(_vfs, OID_AUTO, lobufspace, + CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RW, &lobufspace, + __offsetof(struct bufdomain, bd_lobufspace), sysctl_bufdomain_long, "L", "Minimum amount of buffers we want to have"); long hibufspace; -SYSCTL_LONG(_vfs, OID_AUTO, hibufspace, CTLFLAG_RD, &hibufspace, 0, +SYSCTL_PROC(_vfs, OID_AUTO, hibufspace, + CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RW, &hibufspace, + __offsetof(struct bufdomain, bd_hibufspace), sysctl_bufdomain_long, "L", "Maximum allowed value of bufspace (excluding metadata)"); long bufspacethresh; -SYSCTL_LONG(_vfs, OID_AUTO, bufspacethresh, CTLFLAG_RD, &bufspacethresh, - 0, "Bufspace consumed before waking the daemon to free some"); +SYSCTL_PROC(_vfs, OID_AUTO, bufspacethresh, + CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RW, &bufspacethresh, + __offsetof(struct bufdomain, bd_bufspacethresh), sysctl_bufdomain_long, "L", + "Bufspace consumed before waking the daemon to free some"); static counter_u64_t buffreekvacnt; SYSCTL_COUNTER_U64(_vfs, OID_AUTO, buffreekvacnt, CTLFLAG_RW, &buffreekvacnt, "Number of times we have freed the KVA space from some buffer"); @@ -198,22 +260,32 @@ SYSCTL_PROC(_vfs, OID_AUTO, numdirtybuffers, CTLTYPE_INT|CTLFLAG_MPSAFE|CTLFLAG_RD, NULL, 0, sysctl_numdirtybuffers, "I", "Number of buffers that are dirty (has unwritten changes) at the moment"); static int lodirtybuffers; -SYSCTL_INT(_vfs, OID_AUTO, lodirtybuffers, CTLFLAG_RD, &lodirtybuffers, 0, +SYSCTL_PROC(_vfs, OID_AUTO, lodirtybuffers, + CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RW, &lodirtybuffers, + __offsetof(struct bufdomain, bd_lodirtybuffers), sysctl_bufdomain_int, "L", "How many buffers we want to 
have free before bufdaemon can sleep"); static int hidirtybuffers; -SYSCTL_INT(_vfs, OID_AUTO, hidirtybuffers, CTLFLAG_RD, &hidirtybuffers, 0, +SYSCTL_PROC(_vfs, OID_AUTO, hidirtybuffers, + CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RW, &hidirtybuffers, + __offsetof(struct bufdomain, bd_hidirtybuffers), sysctl_bufdomain_int, "L", "When the number of dirty buffers is considered severe"); int dirtybufthresh; -SYSCTL_INT(_vfs, OID_AUTO, dirtybufthresh, CTLFLAG_RD, &dirtybufthresh, - 0, "Number of bdwrite to bawrite conversions to clear dirty buffers"); +SYSCTL_PROC(_vfs, OID_AUTO, dirtybufthresh, + CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RW, &dirtybufthresh, + __offsetof(struct bufdomain, bd_dirtybufthresh), sysctl_bufdomain_int, "L", + "Number of bdwrite to bawrite conversions to clear dirty buffers"); static int numfreebuffers; SYSCTL_INT(_vfs, OID_AUTO, numfreebuffers, CTLFLAG_RD, &numfreebuffers, 0, "Number of free buffers"); static int lofreebuffers; -SYSCTL_INT(_vfs, OID_AUTO, lofreebuffers, CTLFLAG_RD, &lofreebuffers, 0, +SYSCTL_PROC(_vfs, OID_AUTO, lofreebuffers, + CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RW, &lofreebuffers, + __offsetof(struct bufdomain, bd_lofreebuffers), sysctl_bufdomain_int, "L", "Target number of free buffers"); static int hifreebuffers; -SYSCTL_INT(_vfs, OID_AUTO, hifreebuffers, CTLFLAG_RD, &hifreebuffers, 0, +SYSCTL_PROC(_vfs, OID_AUTO, hifreebuffers, + CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RW, &hifreebuffers, + __offsetof(struct bufdomain, bd_hifreebuffers), sysctl_bufdomain_int, "L", "Threshold for clean buffer recycling"); static counter_u64_t getnewbufcalls; SYSCTL_COUNTER_U64(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RD, @@ -298,74 +370,19 @@ static int bdirtywait; #define QUEUE_CLEAN 3 /* non-B_DELWRI buffers */ #define QUEUE_SENTINEL 4 /* not an queue index, but mark for sentinel */ -struct bufqueue { - struct mtx_padalign bq_lock; - TAILQ_HEAD(, buf) bq_queue; - uint8_t bq_index; - uint16_t bq_subqueue; - int bq_len; -} __aligned(CACHE_LINE_SIZE); 
- -#define BQ_LOCKPTR(bq) (&(bq)->bq_lock) -#define BQ_LOCK(bq) mtx_lock(BQ_LOCKPTR((bq))) -#define BQ_UNLOCK(bq) mtx_unlock(BQ_LOCKPTR((bq))) -#define BQ_ASSERT_LOCKED(bq) mtx_assert(BQ_LOCKPTR((bq)), MA_OWNED) - -struct bufqueue __exclusive_cache_line bqempty; - -struct bufdomain { - struct bufqueue bd_subq[MAXCPU + 1]; /* Per-cpu sub queues + global */ - struct bufqueue bd_dirtyq; - struct bufqueue *bd_cleanq; - struct mtx_padalign bd_run_lock; - /* Constants */ - long bd_maxbufspace; - long bd_hibufspace; - long bd_lobufspace; - long bd_bufspacethresh; - int bd_hifreebuffers; - int bd_lofreebuffers; - int bd_hidirtybuffers; - int bd_lodirtybuffers; - int bd_dirtybufthresh; - int bd_lim; - /* atomics */ - int bd_wanted; - int __aligned(CACHE_LINE_SIZE) bd_numdirtybuffers; - int __aligned(CACHE_LINE_SIZE) bd_running; - long __aligned(CACHE_LINE_SIZE) bd_bufspace; - int __aligned(CACHE_LINE_SIZE) bd_freebuffers; -} __aligned(CACHE_LINE_SIZE); - -#define BD_LOCKPTR(bd) (&(bd)->bd_cleanq->bq_lock) -#define BD_LOCK(bd) mtx_lock(BD_LOCKPTR((bd))) -#define BD_UNLOCK(bd) mtx_unlock(BD_LOCKPTR((bd))) -#define BD_ASSERT_LOCKED(bd) mtx_assert(BD_LOCKPTR((bd)), MA_OWNED) -#define BD_RUN_LOCKPTR(bd) (&(bd)->bd_run_lock) -#define BD_RUN_LOCK(bd) mtx_lock(BD_RUN_LOCKPTR((bd))) -#define BD_RUN_UNLOCK(bd) mtx_unlock(BD_RUN_LOCKPTR((bd))) -#define BD_DOMAIN(bd) (bd - bdomain) - /* Maximum number of buffer domains. */ #define BUF_DOMAINS 8 -BITSET_DEFINE(bufdomainset, BUF_DOMAINS); struct bufdomainset bdlodirty; /* Domains > lodirty */ struct bufdomainset bdhidirty; /* Domains > hidirty */ /* Configured number of clean queues. 
*/ static int __read_mostly buf_domains; +BITSET_DEFINE(bufdomainset, BUF_DOMAINS); struct bufdomain __exclusive_cache_line bdomain[BUF_DOMAINS]; +struct bufqueue __exclusive_cache_line bqempty; -static void bq_remove(struct bufqueue *bq, struct buf *bp); -static void bq_insert(struct bufqueue *bq, struct buf *bp, bool unlock); -static int buf_recycle(struct bufdomain *, bool kva); -static void bq_init(struct bufqueue *bq, int qindex, int cpu, - const char *lockname); -static void bd_init(struct bufdomain *bd); -static int bd_flushall(struct bufdomain *bd); - /* * per-cpu empty buffer cache. */ @@ -405,6 +422,44 @@ sysctl_runningspace(SYSCTL_HANDLER_ARGS) return (error); } +static int +sysctl_bufdomain_int(SYSCTL_HANDLER_ARGS) +{ + int error; + int value; + int i; + + value = *(int *)arg1; + error = sysctl_handle_int(oidp, &value, 0, req); + if (error != 0 || req->newptr == NULL) + return (error); + *(int *)arg1 = value; + for (i = 0; i < buf_domains; i++) + *(int *)(((uintptr_t)&bdomain[i]) + arg2) = + value / buf_domains; + + return (error); +} + +static int +sysctl_bufdomain_long(SYSCTL_HANDLER_ARGS) +{ + long value; + int error; + int i; + + value = *(long *)arg1; + error = sysctl_handle_long(oidp, &value, 0, req); + if (error != 0 || req->newptr == NULL) + return (error); + *(long *)arg1 = value; + for (i = 0; i < buf_domains; i++) + *(long *)(((uintptr_t)&bdomain[i]) + arg2) = + value / buf_domains; + + return (error); +} + #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) static int @@ -770,6 +825,15 @@ bufspace_daemon(void *arg) if (buf_recycle(bd, false) != 0) { if (bd_flushall(bd)) continue; + /* + * Speedup dirty if we've run out of clean + * buffers. This is possible in particular + * because softdep may hold many bufs locked + * pending writes to other bufs which are + * marked for delayed write, exhausting + * clean space until they are written. 
+ */ + bd_speedup(); BD_LOCK(bd); if (bd->bd_wanted) { msleep(&bd->bd_wanted, BD_LOCKPTR(bd), @@ -5253,6 +5317,7 @@ DB_SHOW_COMMAND(buffer, db_show_buffer) } db_printf("\n"); } + BUF_LOCKPRINTINFO(bp); #if defined(FULL_BUF_TRACKING) db_printf("b_io_tracking: b_io_tcnt = %u\n", bp->b_io_tcnt); @@ -5267,13 +5332,14 @@ DB_SHOW_COMMAND(buffer, db_show_buffer) db_printf("b_io_tracking: %s\n", bp->b_io_tracking); #endif db_printf(" "); - BUF_LOCKPRINTINFO(bp); } DB_SHOW_COMMAND(bufqueues, bufqueues) { struct bufdomain *bd; - int i, j; + struct buf *bp; + long total; + int i, j, cnt; db_printf("bqempty: %d\n", bqempty.bq_len); @@ -5292,17 +5358,41 @@ DB_SHOW_COMMAND(bufqueues, bufqueues) db_printf("\n"); db_printf("\tnumdirtybuffers\t%d\n", bd->bd_numdirtybuffers); db_printf("\tlodirtybuffers\t%d\n", bd->bd_lodirtybuffers); - db_printf("\thidirtybufferss\t%d\n", bd->bd_hidirtybuffers); + db_printf("\thidirtybuffers\t%d\n", bd->bd_hidirtybuffers); db_printf("\tdirtybufthresh\t%d\n", bd->bd_dirtybufthresh); db_printf("\n"); - db_printf("\tcleanq count\t%d\n", bd->bd_cleanq->bq_len); - db_printf("\tdirtyq count\t%d\n", bd->bd_dirtyq.bq_len); + total = 0; + TAILQ_FOREACH(bp, &bd->bd_cleanq->bq_queue, b_freelist) + total += bp->b_bufsize; + db_printf("\tcleanq count\t%d (%ld)\n", + bd->bd_cleanq->bq_len, total); + total = 0; + TAILQ_FOREACH(bp, &bd->bd_dirtyq.bq_queue, b_freelist) + total += bp->b_bufsize; + db_printf("\tdirtyq count\t%d (%ld)\n", + bd->bd_dirtyq.bq_len, total); db_printf("\twakeup\t\t%d\n", bd->bd_wanted); db_printf("\tlim\t\t%d\n", bd->bd_lim); db_printf("\tCPU "); for (j = 0; j <= mp_maxid; j++) db_printf("%d, ", bd->bd_subq[j].bq_len); db_printf("\n"); + cnt = 0; + total = 0; + for (j = 0; j < nbuf; j++) + if (buf[j].b_domain == i && BUF_ISLOCKED(&buf[j])) { + cnt++; + total += buf[j].b_bufsize; + } + db_printf("\tLocked buffers: %d space %jd\n", cnt, total); + cnt = 0; + total = 0; + for (j = 0; j < nbuf; j++) + if (buf[j].b_domain == i) { + cnt++; + 
total += buf[j].b_bufsize; + } + db_printf("\tTotal buffers: %d space %jd\n", cnt, total); } }