From owner-svn-src-all@FreeBSD.ORG Tue May 5 10:34:44 2009 Return-Path: Delivered-To: svn-src-all@freebsd.org Received: from mx1.freebsd.org (mx1.freebsd.org [IPv6:2001:4f8:fff6::34]) by hub.freebsd.org (Postfix) with ESMTP id 584F51065670; Tue, 5 May 2009 10:34:44 +0000 (UTC) (envelope-from kib@FreeBSD.org) Received: from svn.freebsd.org (svn.freebsd.org [IPv6:2001:4f8:fff6::2c]) by mx1.freebsd.org (Postfix) with ESMTP id 42BD68FC0C; Tue, 5 May 2009 10:34:44 +0000 (UTC) (envelope-from kib@FreeBSD.org) Received: from svn.freebsd.org (localhost [127.0.0.1]) by svn.freebsd.org (8.14.3/8.14.3) with ESMTP id n45AYi2D074741; Tue, 5 May 2009 10:34:44 GMT (envelope-from kib@svn.freebsd.org) Received: (from kib@localhost) by svn.freebsd.org (8.14.3/8.14.3/Submit) id n45AYild074736; Tue, 5 May 2009 10:34:44 GMT (envelope-from kib@svn.freebsd.org) Message-Id: <200905051034.n45AYild074736@svn.freebsd.org> From: Konstantin Belousov Date: Tue, 5 May 2009 10:34:44 +0000 (UTC) To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-stable@freebsd.org, svn-src-stable-7@freebsd.org X-SVN-Group: stable-7 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cc: Subject: svn commit: r191813 - in stable/7/sys: . contrib/pf dev/ath/ath_hal dev/cxgb gnu/fs/xfs/FreeBSD kern sys ufs/ffs X-BeenThere: svn-src-all@freebsd.org X-Mailman-Version: 2.1.5 Precedence: list List-Id: "SVN commit messages for the entire src tree \(except for " user" and " projects" \)" List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Tue, 05 May 2009 10:34:44 -0000 Author: kib Date: Tue May 5 10:34:43 2009 New Revision: 191813 URL: http://svn.freebsd.org/changeset/base/191813 Log: MFC r189878: Fix two issues with bufdaemon, often causing the processes to hang in the "nbufkv" sleep. Do not retry request for the new block from ffs_bufwrite() when write is done from bufdaemon and there is a buffer shortage. In getnewbuf(), help bufdaemon to flush dirty buffers owned by the vnode locked by curthread. For MFC, default value for sysctl vfs.flushbufqtarget is set to -1, disabling the helpers. The TDP_BUFNEED flag value from HEAD conflicts with TDP_UPCALLING KSE bit, so it is moved to the end of allocated bits. Modified: stable/7/sys/ (props changed) stable/7/sys/contrib/pf/ (props changed) stable/7/sys/dev/ath/ath_hal/ (props changed) stable/7/sys/dev/cxgb/ (props changed) stable/7/sys/gnu/fs/xfs/FreeBSD/xfs_buf.c stable/7/sys/kern/vfs_bio.c stable/7/sys/sys/buf.h stable/7/sys/sys/proc.h stable/7/sys/ufs/ffs/ffs_vfsops.c Modified: stable/7/sys/gnu/fs/xfs/FreeBSD/xfs_buf.c ============================================================================== --- stable/7/sys/gnu/fs/xfs/FreeBSD/xfs_buf.c Tue May 5 09:24:20 2009 (r191812) +++ stable/7/sys/gnu/fs/xfs/FreeBSD/xfs_buf.c Tue May 5 10:34:43 2009 (r191813) @@ -81,7 +81,7 @@ xfs_buf_get_empty(size_t size, xfs_buft { struct buf *bp; - bp = geteblk(0); + bp = geteblk(0, 0); if (bp != NULL) { bp->b_bufsize = size; bp->b_bcount = size; @@ -101,7 +101,7 @@ xfs_buf_get_noaddr(size_t len, xfs_bufta if (len >= MAXPHYS) return (NULL); - bp = geteblk(len); + bp = geteblk(len, 0); if (bp != NULL) { KASSERT(BUF_REFCNT(bp) == 1, ("xfs_buf_get_empty: bp %p not locked",bp)); Modified: stable/7/sys/kern/vfs_bio.c ============================================================================== --- stable/7/sys/kern/vfs_bio.c Tue May 5 09:24:20 2009 (r191812) +++ stable/7/sys/kern/vfs_bio.c Tue May 5 10:34:43 2009 (r191813) @@ -105,7 +105,8 @@ static void vfs_setdirty_locked_object(s static void vfs_vmio_release(struct buf *bp); static int vfs_bio_clcheck(struct vnode *vp, int size, daddr_t lblkno, daddr_t blkno); -static int flushbufqueues(int, int); +static int buf_do_flush(struct vnode *vp); +static int flushbufqueues(struct vnode *, int, int); static void buf_daemon(void); static void bremfreel(struct buf *bp); @@ -187,6 +188,9 @@ SYSCTL_INT(_vfs, OID_AUTO, getnewbufcall static int getnewbufrestarts; SYSCTL_INT(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RW, &getnewbufrestarts, 0, "Number of times getnewbuf has had to restart a buffer aquisition"); +static int flushbufqtarget = -1; +SYSCTL_INT(_vfs, OID_AUTO, flushbufqtarget, CTLFLAG_RW, &flushbufqtarget, 0, + "Amount of work to do in flushbufqueues when helping bufdaemon"); /* * Wakeup point for bufdaemon, as well as indicator of whether it is already @@ -258,6 +262,7 @@ static struct mtx bpinlock; #define QUEUE_DIRTY_GIANT 3 /* B_DELWRI buffers that need giant */ #define QUEUE_EMPTYKVA 4 /* empty buffer headers w/KVA assignment */ #define QUEUE_EMPTY 5 /* empty buffer headers */ +#define QUEUE_SENTINEL 1024 /* not an queue index, but mark for sentinel */ /* Queues for free buffers with various properties */ static TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES] = { { 0 } }; @@ -1707,21 +1712,23 @@ vfs_bio_awrite(struct buf *bp) */ static struct buf * -getnewbuf(int slpflag, int slptimeo, int size, int maxsize) +getnewbuf(struct vnode *vp, int slpflag, int slptimeo, int size, int maxsize, + int gbflags) { + struct thread *td; struct buf *bp; struct buf *nbp; int defrag = 0; int nqindex; static int flushingbufs; + td = curthread; /* * We can't afford to block since we might be holding a vnode lock, * which may prevent system daemons from running. We deal with * low-memory situations by proactively returning memory and running * async I/O rather then sync I/O. */ - atomic_add_int(&getnewbufcalls, 1); atomic_subtract_int(&getnewbufrestarts, 1); restart: @@ -1953,8 +1960,9 @@ restart: */ if (bp == NULL) { - int flags; + int flags, norunbuf; char *waitmsg; + int fl; if (defrag) { flags = VFS_BIO_NEED_BUFSPACE; @@ -1972,9 +1980,35 @@ restart: mtx_unlock(&bqlock); bd_speedup(); /* heeeelp */ + if (gbflags & GB_NOWAIT_BD) + return (NULL); mtx_lock(&nblock); while (needsbuffer & flags) { + if (vp != NULL && (td->td_pflags & TDP_BUFNEED) == 0) { + mtx_unlock(&nblock); + /* + * getblk() is called with a vnode + * locked, and some majority of the + * dirty buffers may as well belong to + * the vnode. Flushing the buffers + * there would make a progress that + * cannot be achieved by the + * buf_daemon, that cannot lock the + * vnode. + */ + norunbuf = ~(TDP_BUFNEED | TDP_NORUNNINGBUF) | + (td->td_pflags & TDP_NORUNNINGBUF); + /* play bufdaemon */ + td->td_pflags |= TDP_BUFNEED | TDP_NORUNNINGBUF; + fl = buf_do_flush(vp); + td->td_pflags &= norunbuf; + mtx_lock(&nblock); + if (fl != 0) + continue; + if ((needsbuffer & flags) == 0) + break; + } if (msleep(&needsbuffer, &nblock, (PRIBIO + 4) | slpflag, waitmsg, slptimeo)) { mtx_unlock(&nblock); @@ -2043,6 +2077,35 @@ static struct kproc_desc buf_kp = { }; SYSINIT(bufdaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start, &buf_kp); +static int +buf_do_flush(struct vnode *vp) +{ + int flushed; + + flushed = flushbufqueues(vp, QUEUE_DIRTY, 0); + /* The list empty check here is slightly racy */ + if (!TAILQ_EMPTY(&bufqueues[QUEUE_DIRTY_GIANT])) { + mtx_lock(&Giant); + flushed += flushbufqueues(vp, QUEUE_DIRTY_GIANT, 0); + mtx_unlock(&Giant); + } + if (flushed == 0) { + /* + * Could not find any buffers without rollback + * dependencies, so just write the first one + * in the hopes of eventually making progress. + */ + flushbufqueues(vp, QUEUE_DIRTY, 1); + if (!TAILQ_EMPTY( + &bufqueues[QUEUE_DIRTY_GIANT])) { + mtx_lock(&Giant); + flushbufqueues(vp, QUEUE_DIRTY_GIANT, 1); + mtx_unlock(&Giant); + } + } + return (flushed); +} + static void buf_daemon() { @@ -2056,7 +2119,7 @@ buf_daemon() /* * This process is allowed to take the buffer cache to the limit */ - curthread->td_pflags |= TDP_NORUNNINGBUF; + curthread->td_pflags |= TDP_NORUNNINGBUF | TDP_BUFNEED; mtx_lock(&bdlock); for (;;) { bd_request = 0; @@ -2071,30 +2134,8 @@ buf_daemon() * normally would so they can run in parallel with our drain. */ while (numdirtybuffers > lodirtybuffers) { - int flushed; - - flushed = flushbufqueues(QUEUE_DIRTY, 0); - /* The list empty check here is slightly racy */ - if (!TAILQ_EMPTY(&bufqueues[QUEUE_DIRTY_GIANT])) { - mtx_lock(&Giant); - flushed += flushbufqueues(QUEUE_DIRTY_GIANT, 0); - mtx_unlock(&Giant); - } - if (flushed == 0) { - /* - * Could not find any buffers without rollback - * dependencies, so just write the first one - * in the hopes of eventually making progress. - */ - flushbufqueues(QUEUE_DIRTY, 1); - if (!TAILQ_EMPTY( - &bufqueues[QUEUE_DIRTY_GIANT])) { - mtx_lock(&Giant); - flushbufqueues(QUEUE_DIRTY_GIANT, 1); - mtx_unlock(&Giant); - } + if (buf_do_flush(NULL) == 0) break; - } uio_yield(); } @@ -2140,7 +2181,7 @@ SYSCTL_INT(_vfs, OID_AUTO, flushwithdeps 0, "Number of buffers flushed with dependecies that require rollbacks"); static int -flushbufqueues(int queue, int flushdeps) +flushbufqueues(struct vnode *lvp, int queue, int flushdeps) { struct thread *td = curthread; struct buf sentinel; @@ -2151,20 +2192,37 @@ flushbufqueues(int queue, int flushdeps) int flushed; int target; - target = numdirtybuffers - lodirtybuffers; - if (flushdeps && target > 2) - target /= 2; + if (lvp == NULL) { + target = numdirtybuffers - lodirtybuffers; + if (flushdeps && target > 2) + target /= 2; + } else + target = flushbufqtarget; flushed = 0; bp = NULL; + sentinel.b_qindex = QUEUE_SENTINEL; mtx_lock(&bqlock); - TAILQ_INSERT_TAIL(&bufqueues[queue], &sentinel, b_freelist); + TAILQ_INSERT_HEAD(&bufqueues[queue], &sentinel, b_freelist); while (flushed != target) { - bp = TAILQ_FIRST(&bufqueues[queue]); - if (bp == &sentinel) + bp = TAILQ_NEXT(&sentinel, b_freelist); + if (bp != NULL) { + TAILQ_REMOVE(&bufqueues[queue], &sentinel, b_freelist); + TAILQ_INSERT_AFTER(&bufqueues[queue], bp, &sentinel, + b_freelist); + } else break; - TAILQ_REMOVE(&bufqueues[queue], bp, b_freelist); - TAILQ_INSERT_TAIL(&bufqueues[queue], bp, b_freelist); - + /* + * Skip sentinels inserted by other invocations of the + * flushbufqueues(), taking care to not reorder them. + */ + if (bp->b_qindex == QUEUE_SENTINEL) + continue; + /* + * Only flush the buffers that belong to the + * vnode locked by the curthread. + */ + if (lvp != NULL && bp->b_vp != lvp) + continue; if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) continue; if (bp->b_pin_count > 0) { @@ -2212,16 +2270,28 @@ flushbufqueues(int queue, int flushdeps) BUF_UNLOCK(bp); continue; } - if (vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT, td) == 0) { + if (vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT | LK_CANRECURSE, + td) == 0) { mtx_unlock(&bqlock); CTR3(KTR_BUF, "flushbufqueue(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); - vfs_bio_awrite(bp); + if (curproc == bufdaemonproc) + vfs_bio_awrite(bp); + else { + bremfree(bp); + bwrite(bp); + } vn_finished_write(mp); VOP_UNLOCK(vp, 0, td); flushwithdeps += hasdeps; flushed++; - waitrunningbufspace(); + + /* + * Sleeping on runningbufspace while holding + * vnode lock leads to deadlock. + */ + if (curproc == bufdaemonproc) + waitrunningbufspace(); numdirtywakeup((lodirtybuffers + hidirtybuffers) / 2); mtx_lock(&bqlock); continue; @@ -2603,7 +2673,7 @@ loop: maxsize = vmio ? size + (offset & PAGE_MASK) : size; maxsize = imax(maxsize, bsize); - bp = getnewbuf(slpflag, slptimeo, size, maxsize); + bp = getnewbuf(vp, slpflag, slptimeo, size, maxsize, flags); if (bp == NULL) { if (slpflag || slptimeo) return NULL; @@ -2678,14 +2748,17 @@ loop: * set to B_INVAL. */ struct buf * -geteblk(int size) +geteblk(int size, int flags) { struct buf *bp; int maxsize; maxsize = (size + BKVAMASK) & ~BKVAMASK; - while ((bp = getnewbuf(0, 0, size, maxsize)) == 0) - continue; + while ((bp = getnewbuf(NULL, 0, 0, size, maxsize, flags)) == NULL) { + if ((flags & GB_NOWAIT_BD) && + (curthread->td_pflags & TDP_BUFNEED) != 0) + return (NULL); + } allocbuf(bp, size); bp->b_flags |= B_INVAL; /* b_dep cleared by getnewbuf() */ KASSERT(BUF_REFCNT(bp) == 1, ("geteblk: bp %p not locked",bp)); Modified: stable/7/sys/sys/buf.h ============================================================================== --- stable/7/sys/sys/buf.h Tue May 5 09:24:20 2009 (r191812) +++ stable/7/sys/sys/buf.h Tue May 5 10:34:43 2009 (r191813) @@ -475,6 +475,7 @@ buf_countdeps(struct buf *bp, int i) */ #define GB_LOCK_NOWAIT 0x0001 /* Fail if we block on a buf lock. */ #define GB_NOCREAT 0x0002 /* Don't create a buf if not found. */ +#define GB_NOWAIT_BD 0x0004 /* Do not wait for bufdaemon */ #ifdef _KERNEL extern int nbuf; /* The number of buffer headers */ @@ -519,7 +520,7 @@ struct buf * getpbuf(int *); struct buf *incore(struct bufobj *, daddr_t); struct buf *gbincore(struct bufobj *, daddr_t); struct buf *getblk(struct vnode *, daddr_t, int, int, int, int); -struct buf *geteblk(int); +struct buf *geteblk(int, int); int bufwait(struct buf *); int bufwrite(struct buf *); void bufdone(struct buf *); Modified: stable/7/sys/sys/proc.h ============================================================================== --- stable/7/sys/sys/proc.h Tue May 5 09:24:20 2009 (r191812) +++ stable/7/sys/sys/proc.h Tue May 5 10:34:43 2009 (r191813) @@ -380,6 +380,7 @@ do { \ #define TDP_INBDFLUSH 0x00100000 /* Already in BO_BDFLUSH, do not recurse */ #define TDP_IGNSUSP 0x00800000 /* Permission to ignore the MNTK_SUSPEND* */ #define TDP_AUDITREC 0x01000000 /* Audit record pending on thread */ +#define TDP_BUFNEED 0x02000000 /* Do not recurse into the buf flush */ /* * Reasons that the current thread can not be run yet. Modified: stable/7/sys/ufs/ffs/ffs_vfsops.c ============================================================================== --- stable/7/sys/ufs/ffs/ffs_vfsops.c Tue May 5 09:24:20 2009 (r191812) +++ stable/7/sys/ufs/ffs/ffs_vfsops.c Tue May 5 10:34:43 2009 (r191813) @@ -1842,7 +1842,9 @@ ffs_bufwrite(struct buf *bp) ("bufwrite: needs chained iodone (%p)", bp->b_iodone)); /* get a new block */ - newbp = geteblk(bp->b_bufsize); + newbp = geteblk(bp->b_bufsize, GB_NOWAIT_BD); + if (newbp == NULL) + goto normal_write; /* * set it to be identical to the old block. We have to @@ -1882,6 +1884,7 @@ ffs_bufwrite(struct buf *bp) } /* Let the normal bufwrite do the rest for us */ +normal_write: return (bufwrite(bp)); }