Skip site navigation (1) Skip section navigation (2)
Date:      Wed, 4 Feb 2009 16:33:09 +0200
From:      Kostik Belousov <kostikbel@gmail.com>
To:        Matt Burke <mattblists@icritical.com>
Cc:        stable@freebsd.org
Subject:   Re: 7.1-RELEASE I/O hang
Message-ID:  <20090204143309.GF9427@deviant.kiev.zoral.com.ua>
In-Reply-To: <49898E3D.7030609@icritical.com>
References:  <49898E3D.7030609@icritical.com>

next in thread | previous in thread | raw e-mail | index | archive | help

--H4SyuGOnfnj3aJqJ
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline
Content-Transfer-Encoding: quoted-printable

On Wed, Feb 04, 2009 at 12:46:53PM +0000, Matt Burke wrote:
> I have a machine with a PERC6/e controller. Attached to that are 3 disk
> shelves, each configured as individual 14-disk RAID10 arrays (the PERC
> annoyingly only lets you use 8 spans per array)
>=20
> I can run bonnie++ on the arrays individually with no problem.
> I can also run it across a gstripe of the arrays with no problem.
>=20
> However running it over the 3 arrays in parallel causes something I/O
> related in the kernel to hang.
>=20
> To define 'hang' better:
>=20
> It appears anything which needs disk io, even on a different controller
> (albeit the same mfi driver), will hang. A command like 'ps' cached in
> ram will work but bash hangs after execution, presumably while trying to
> write ~/.bash_history
>=20
> 'sysctl -a' works but trying to run 'sysctl kern.msgbuf' also hangs
>=20
> I've done some research and it seems the usual cause of bonnie++
> crashing a system is due to overflowing TCQ. camcontrol doesn't see any
> disks, so I've tried setting hw.mfi.max_cmds=3D32 in /boot/loader.conf but
> it hadn't made any difference.
>=20
> The bonnie++ invocation is this:
>=20
> (newfs devices mfid[2-3], mount)
> bonnie++ -s 64g -u root -p3
> bonnie++ -d /data/2 -s 64g -u root -y s >b2 2>&1 &
> bonnie++ -d /data/3 -s 64g -u root -y s >b3 2>&1 &
> bonnie++ -d /data/4 -s 64g -u root -y s >b4 2>&1 &
>=20
> and it always hangs on "Rewriting...". It's a fresh 7.1-RELEASE with
> nothing else running (devd, sshd, syslogd, etc)
>=20
>=20
> Any ideas?

Compile ddb into the kernel, and do "ps" from the ddb prompt. If there
are processes hung in the "nbufkv" state, then the patch below might
help.

Index: gnu/fs/xfs/FreeBSD/xfs_buf.c
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D
--- gnu/fs/xfs/FreeBSD/xfs_buf.c	(revision 188080)
+++ gnu/fs/xfs/FreeBSD/xfs_buf.c	(working copy)
@@ -81,7 +81,7 @@
 {
 	struct buf *bp;
=20
-	bp =3D geteblk(0);
+	bp =3D geteblk(0, 0);
 	if (bp !=3D NULL) {
 		bp->b_bufsize =3D size;
 		bp->b_bcount =3D size;
@@ -101,7 +101,7 @@
 	if (len >=3D MAXPHYS)
 		return (NULL);
=20
-	bp =3D geteblk(len);
+	bp =3D geteblk(len, 0);
 	if (bp !=3D NULL) {
 		KASSERT(BUF_REFCNT(bp) =3D=3D 1,
 			("xfs_buf_get_empty: bp %p not locked",bp));
Index: ufs/ffs/ffs_vfsops.c
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D
--- ufs/ffs/ffs_vfsops.c	(revision 188080)
+++ ufs/ffs/ffs_vfsops.c	(working copy)
@@ -1747,7 +1747,9 @@
 		    ("bufwrite: needs chained iodone (%p)", bp->b_iodone));
=20
 		/* get a new block */
-		newbp =3D geteblk(bp->b_bufsize);
+		newbp =3D geteblk(bp->b_bufsize, GB_NOWAIT_BD);
+		if (newbp =3D=3D NULL)
+			goto normal_write;
=20
 		/*
 		 * set it to be identical to the old block.  We have to
@@ -1787,6 +1789,7 @@
 	}
=20
 	/* Let the normal bufwrite do the rest for us */
+normal_write:
 	return (bufwrite(bp));
 }
=20
Index: kern/vfs_bio.c
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D
--- kern/vfs_bio.c	(revision 188080)
+++ kern/vfs_bio.c	(working copy)
@@ -105,7 +105,8 @@
 static void vfs_vmio_release(struct buf *bp);
 static int vfs_bio_clcheck(struct vnode *vp, int size,
 		daddr_t lblkno, daddr_t blkno);
-static int flushbufqueues(int, int);
+static int buf_do_flush(struct vnode *vp);
+static int flushbufqueues(struct vnode *, int, int);
 static void buf_daemon(void);
 static void bremfreel(struct buf *bp);
=20
@@ -258,6 +259,7 @@
 #define QUEUE_DIRTY_GIANT 3	/* B_DELWRI buffers that need giant */
 #define QUEUE_EMPTYKVA	4	/* empty buffer headers w/KVA assignment */
 #define QUEUE_EMPTY	5	/* empty buffer headers */
+#define QUEUE_SENTINEL	1024	/* not a queue index, but mark for sentinel */
=20
 /* Queues for free buffers with various properties */
 static TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES] =3D { { 0 } };
@@ -1703,21 +1705,23 @@
  */
=20
 static struct buf *
-getnewbuf(int slpflag, int slptimeo, int size, int maxsize)
+getnewbuf(struct vnode *vp, int slpflag, int slptimeo, int size, int maxsi=
ze,
+    int gbflags)
 {
+	struct thread *td;
 	struct buf *bp;
 	struct buf *nbp;
 	int defrag =3D 0;
 	int nqindex;
 	static int flushingbufs;
=20
+	td =3D curthread;
 	/*
 	 * We can't afford to block since we might be holding a vnode lock,
 	 * which may prevent system daemons from running.  We deal with
 	 * low-memory situations by proactively returning memory and running
 	 * async I/O rather then sync I/O.
 	 */
-
 	atomic_add_int(&getnewbufcalls, 1);
 	atomic_subtract_int(&getnewbufrestarts, 1);
 restart:
@@ -1949,8 +1953,9 @@
 	 */
=20
 	if (bp =3D=3D NULL) {
-		int flags;
+		int flags, norunbuf;
 		char *waitmsg;
+		int fl;
=20
 		if (defrag) {
 			flags =3D VFS_BIO_NEED_BUFSPACE;
@@ -1968,9 +1973,35 @@
 		mtx_unlock(&bqlock);
=20
 		bd_speedup();	/* heeeelp */
+		if (gbflags & GB_NOWAIT_BD)
+			return (NULL);
=20
 		mtx_lock(&nblock);
 		while (needsbuffer & flags) {
+			if (vp !=3D NULL && (td->td_pflags & TDP_BUFNEED) =3D=3D 0) {
+				mtx_unlock(&nblock);
+				/*
+				 * getblk() is called with a vnode
+				 * locked, and the majority of the
+				 * dirty buffers may well belong to
+				 * that vnode.  Flushing those
+				 * buffers here makes progress that
+				 * cannot be achieved by the
+				 * buf_daemon, which cannot lock the
+				 * vnode.
+				 */
+				norunbuf =3D ~(TDP_BUFNEED | TDP_NORUNNINGBUF) |
+				    (td->td_pflags & TDP_NORUNNINGBUF);
+				/* play bufdaemon */
+				td->td_pflags |=3D TDP_BUFNEED | TDP_NORUNNINGBUF;
+				fl =3D buf_do_flush(vp);
+				td->td_pflags &=3D norunbuf;
+				mtx_lock(&nblock);
+				if (fl !=3D 0)
+					continue;
+				if ((needsbuffer & flags) =3D=3D 0)
+					break;
+			}
 			if (msleep(&needsbuffer, &nblock,
 			    (PRIBIO + 4) | slpflag, waitmsg, slptimeo)) {
 				mtx_unlock(&nblock);
@@ -2039,6 +2070,35 @@
 };
 SYSINIT(bufdaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start, &buf_k=
p);
=20
+static int
+buf_do_flush(struct vnode *vp)
+{
+	int flushed;
+
+	flushed =3D flushbufqueues(vp, QUEUE_DIRTY, 0);
+	/* The list empty check here is slightly racy */
+	if (!TAILQ_EMPTY(&bufqueues[QUEUE_DIRTY_GIANT])) {
+		mtx_lock(&Giant);
+		flushed +=3D flushbufqueues(vp, QUEUE_DIRTY_GIANT, 0);
+		mtx_unlock(&Giant);
+	}
+	if (flushed =3D=3D 0) {
+		/*
+		 * Could not find any buffers without rollback
+		 * dependencies, so just write the first one
+		 * in the hopes of eventually making progress.
+		 */
+		flushbufqueues(vp, QUEUE_DIRTY, 1);
+		if (!TAILQ_EMPTY(
+			    &bufqueues[QUEUE_DIRTY_GIANT])) {
+			mtx_lock(&Giant);
+			flushbufqueues(vp, QUEUE_DIRTY_GIANT, 1);
+			mtx_unlock(&Giant);
+		}
+	}
+	return (flushed);
+}
+
 static void
 buf_daemon()
 {
@@ -2052,7 +2112,7 @@
 	/*
 	 * This process is allowed to take the buffer cache to the limit
 	 */
-	curthread->td_pflags |=3D TDP_NORUNNINGBUF;
+	curthread->td_pflags |=3D TDP_NORUNNINGBUF | TDP_BUFNEED;
 	mtx_lock(&bdlock);
 	for (;;) {
 		bd_request =3D 0;
@@ -2067,30 +2127,8 @@
 		 * normally would so they can run in parallel with our drain.
 		 */
 		while (numdirtybuffers > lodirtybuffers) {
-			int flushed;
-
-			flushed =3D flushbufqueues(QUEUE_DIRTY, 0);
-			/* The list empty check here is slightly racy */
-			if (!TAILQ_EMPTY(&bufqueues[QUEUE_DIRTY_GIANT])) {
-				mtx_lock(&Giant);
-				flushed +=3D flushbufqueues(QUEUE_DIRTY_GIANT, 0);
-				mtx_unlock(&Giant);
-			}
-			if (flushed =3D=3D 0) {
-				/*
-				 * Could not find any buffers without rollback
-				 * dependencies, so just write the first one
-				 * in the hopes of eventually making progress.
-				 */
-				flushbufqueues(QUEUE_DIRTY, 1);
-				if (!TAILQ_EMPTY(
-				    &bufqueues[QUEUE_DIRTY_GIANT])) {
-					mtx_lock(&Giant);
-					flushbufqueues(QUEUE_DIRTY_GIANT, 1);
-					mtx_unlock(&Giant);
-				}
+			if (buf_do_flush(NULL) =3D=3D 0)
 				break;
-			}
 			uio_yield();
 		}
=20
@@ -2136,7 +2174,7 @@
     0, "Number of buffers flushed with dependecies that require rollbacks"=
);
=20
 static int
-flushbufqueues(int queue, int flushdeps)
+flushbufqueues(struct vnode *lvp, int queue, int flushdeps)
 {
 	struct thread *td =3D curthread;
 	struct buf sentinel;
@@ -2147,20 +2185,37 @@
 	int flushed;
 	int target;
=20
-	target =3D numdirtybuffers - lodirtybuffers;
-	if (flushdeps && target > 2)
-		target /=3D 2;
+	if (lvp =3D=3D NULL) {
+		target =3D numdirtybuffers - lodirtybuffers;
+		if (flushdeps && target > 2)
+			target /=3D 2;
+	} else
+		target =3D 1;
 	flushed =3D 0;
 	bp =3D NULL;
+	sentinel.b_qindex =3D QUEUE_SENTINEL;
 	mtx_lock(&bqlock);
-	TAILQ_INSERT_TAIL(&bufqueues[queue], &sentinel, b_freelist);
+	TAILQ_INSERT_HEAD(&bufqueues[queue], &sentinel, b_freelist);
 	while (flushed !=3D target) {
-		bp =3D TAILQ_FIRST(&bufqueues[queue]);
-		if (bp =3D=3D &sentinel)
+		bp =3D TAILQ_NEXT(&sentinel, b_freelist);
+		if (bp !=3D NULL) {
+			TAILQ_REMOVE(&bufqueues[queue], &sentinel, b_freelist);
+			TAILQ_INSERT_AFTER(&bufqueues[queue], bp, &sentinel,
+			    b_freelist);
+		} else
 			break;
-		TAILQ_REMOVE(&bufqueues[queue], bp, b_freelist);
-		TAILQ_INSERT_TAIL(&bufqueues[queue], bp, b_freelist);
-
+		/*
+		 * Skip sentinels inserted by other invocations of
+		 * flushbufqueues(), taking care not to reorder them.
+		 */
+		if (bp->b_qindex =3D=3D QUEUE_SENTINEL)
+			continue;
+		/*
+		 * Only flush the buffers that belong to the
+		 * vnode locked by the curthread.
+		 */
+		if (lvp !=3D NULL && bp->b_vp !=3D lvp)
+			continue;
 		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) !=3D 0)
 			continue;
 		if (bp->b_pin_count > 0) {
@@ -2208,16 +2263,28 @@
 			BUF_UNLOCK(bp);
 			continue;
 		}
-		if (vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT, td) =3D=3D 0) {
+		if (vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT | LK_CANRECURSE, td)
+		    =3D=3D 0) {
 			mtx_unlock(&bqlock);
 			CTR3(KTR_BUF, "flushbufqueue(%p) vp %p flags %X",
 			    bp, bp->b_vp, bp->b_flags);
-			vfs_bio_awrite(bp);
+			if (curproc =3D=3D bufdaemonproc)
+				vfs_bio_awrite(bp);
+			else {
+				bremfree(bp);
+				bwrite(bp);
+			}
 			vn_finished_write(mp);
 			VOP_UNLOCK(vp, 0, td);
 			flushwithdeps +=3D hasdeps;
 			flushed++;
-			waitrunningbufspace();
+
+			/*
+			 * Sleeping on runningbufspace while holding
+			 * vnode lock leads to deadlock.
+			 */
+			if (curproc =3D=3D bufdaemonproc)
+				waitrunningbufspace();
 			numdirtywakeup((lodirtybuffers + hidirtybuffers) / 2);
 			mtx_lock(&bqlock);
 			continue;
@@ -2599,7 +2666,7 @@
 		maxsize =3D vmio ? size + (offset & PAGE_MASK) : size;
 		maxsize =3D imax(maxsize, bsize);
=20
-		bp =3D getnewbuf(slpflag, slptimeo, size, maxsize);
+		bp =3D getnewbuf(vp, slpflag, slptimeo, size, maxsize, flags);
 		if (bp =3D=3D NULL) {
 			if (slpflag || slptimeo)
 				return NULL;
@@ -2674,14 +2741,17 @@
  * set to B_INVAL.
  */
 struct buf *
-geteblk(int size)
+geteblk(int size, int flags)
 {
 	struct buf *bp;
 	int maxsize;
=20
 	maxsize =3D (size + BKVAMASK) & ~BKVAMASK;
-	while ((bp =3D getnewbuf(0, 0, size, maxsize)) =3D=3D 0)
-		continue;
+	while ((bp =3D getnewbuf(NULL, 0, 0, size, maxsize, flags)) =3D=3D NULL) {
+		if ((flags & GB_NOWAIT_BD) &&
+		    (curthread->td_pflags & TDP_BUFNEED) !=3D 0)
+			return (NULL);
+	}
 	allocbuf(bp, size);
 	bp->b_flags |=3D B_INVAL;	/* b_dep cleared by getnewbuf() */
 	KASSERT(BUF_REFCNT(bp) =3D=3D 1, ("geteblk: bp %p not locked",bp));
Index: sys/proc.h
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D
--- sys/proc.h	(revision 188080)
+++ sys/proc.h	(working copy)
@@ -378,6 +378,7 @@
 #define	TDP_NORUNNINGBUF 0x00040000 /* Ignore runningbufspace check */
 #define	TDP_WAKEUP	0x00080000 /* Don't sleep in umtx cond_wait */
 #define	TDP_INBDFLUSH	0x00100000 /* Already in BO_BDFLUSH, do not recurse =
*/
+#define	TDP_BUFNEED	0x00200000 /* Do not recurse into the buf flush */
=20
 /*
  * Reasons that the current thread can not be run yet.
Index: sys/buf.h
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D
--- sys/buf.h	(revision 188080)
+++ sys/buf.h	(working copy)
@@ -475,6 +475,7 @@
  */
 #define	GB_LOCK_NOWAIT	0x0001		/* Fail if we block on a buf lock. */
 #define	GB_NOCREAT	0x0002		/* Don't create a buf if not found. */
+#define	GB_NOWAIT_BD	0x0004		/* Do not wait for bufdaemon */
=20
 #ifdef _KERNEL
 extern int	nbuf;			/* The number of buffer headers */
@@ -519,7 +520,7 @@
 struct buf *incore(struct bufobj *, daddr_t);
 struct buf *gbincore(struct bufobj *, daddr_t);
 struct buf *getblk(struct vnode *, daddr_t, int, int, int, int);
-struct buf *geteblk(int);
+struct buf *geteblk(int, int);
 int	bufwait(struct buf *);
 int	bufwrite(struct buf *);
 void	bufdone(struct buf *);

--H4SyuGOnfnj3aJqJ
Content-Type: application/pgp-signature
Content-Disposition: inline

-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.4.9 (FreeBSD)

iEYEARECAAYFAkmJpyMACgkQC3+MBN1Mb4hRqQCfanVd7jimz/10nzSqaJXM+R1J
NpwAoOOWhGu3xJFa/5EpN/nTlaQ38Hbg
=8is/
-----END PGP SIGNATURE-----

--H4SyuGOnfnj3aJqJ--



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?20090204143309.GF9427>