Date: Sun, 4 Nov 2001 17:12:39 -0800 (PST) From: Matthew Dillon <dillon@apollo.backplane.com> To: Mark Santcroos <marks@ripe.net>, current@FreeBSD.ORG Subject: patch #2 (was Re: buf_daemon() lockup) Message-ID: <200111050112.fA51Cdc42844@apollo.backplane.com> References: <20011101092118.A434@laptop.6bone.nl> <200111042259.fA4MxSc93566@apollo.backplane.com> <200111050006.fA506f309535@apollo.backplane.com> <200111050015.fA50Fdk09561@apollo.backplane.com>
next in thread | previous in thread | raw e-mail | index | archive | help
:
: Hmm.. that last patch didn't do it. I've noticed some errors on the
: console before the lockup:
:
:unexpected md driver lock: 0xe1813900: type VREG, usecount 2, writecount 1, refcount 3871, flags (VOBJBUF)
: tag VT_UFS, ino 4, on dev da0s1h (13, 131079) lock type inode: EXCL (count 1) by pid 6
Ok. I think these unexpected md driver lock messages are bogus... I'll
leave it to Poul to remove them. The syncer or buf_daemon can be flushing
buffers associated with the underlying file simultaneously with other
processes doing MD ops.
Here's a new patch. It's the same as the old one except I fixed a
missing B_NOWDRAIN flag in the clustering code, and I added B_NOWDRAIN
support to the nfs client code. I think the missing drain flag in the
clustering code was the problem. Try this patch.
-Matt
Index: dev/md/md.c
===================================================================
RCS file: /home/ncvs/src/sys/dev/md/md.c,v
retrieving revision 1.47
diff -u -r1.47 md.c
--- dev/md/md.c 2001/10/11 23:38:13 1.47
+++ dev/md/md.c 2001/11/04 23:54:18
@@ -388,13 +388,18 @@
auio.uio_td = curthread;
if (VOP_ISLOCKED(sc->vnode, NULL))
vprint("unexpected md driver lock", sc->vnode);
+ /*
+ * When reading set IO_DIRECT to try to avoid double-caching
+ * the data. When writing IO_DIRECT is not optimal, but we
+ * must set IO_NOWDRAIN to avoid a wdrain deadlock.
+ */
if (bp->bio_cmd == BIO_READ) {
vn_lock(sc->vnode, LK_EXCLUSIVE | LK_RETRY, curthread);
- error = VOP_READ(sc->vnode, &auio, 0, sc->cred);
+ error = VOP_READ(sc->vnode, &auio, IO_DIRECT, sc->cred);
} else {
(void) vn_start_write(sc->vnode, &mp, V_WAIT);
vn_lock(sc->vnode, LK_EXCLUSIVE | LK_RETRY, curthread);
- error = VOP_WRITE(sc->vnode, &auio, 0, sc->cred);
+ error = VOP_WRITE(sc->vnode, &auio, IO_NOWDRAIN, sc->cred);
vn_finished_write(mp);
}
VOP_UNLOCK(sc->vnode, 0, curthread);
Index: kern/vfs_bio.c
===================================================================
RCS file: /home/ncvs/src/sys/kern/vfs_bio.c,v
retrieving revision 1.291
diff -u -r1.291 vfs_bio.c
--- kern/vfs_bio.c 2001/10/21 06:26:55 1.291
+++ kern/vfs_bio.c 2001/11/04 23:41:19
@@ -758,11 +758,15 @@
int rtval = bufwait(bp);
brelse(bp);
return (rtval);
- } else {
+ } else if ((oldflags & B_NOWDRAIN) == 0) {
/*
* don't allow the async write to saturate the I/O
- * system. There is no chance of deadlock here because
- * we are blocking on I/O that is already in-progress.
+ * system. Deadlocks can occur only if a device strategy
+ * routine (like in MD) turns around and issues another
+ * high-level write, in which case B_NOWDRAIN is expected
+ * to be set. Otherwise we will not deadlock here because
+ * we are blocking waiting for I/O that is already in-progress
+ * to complete.
*/
waitrunningbufspace();
}
@@ -1286,7 +1290,8 @@
/* unlock */
BUF_UNLOCK(bp);
- bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF | B_DIRECT);
+ bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF |
+ B_DIRECT | B_NOWDRAIN);
bp->b_ioflags &= ~BIO_ORDERED;
if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY))
panic("brelse: not dirty");
Index: kern/vfs_cluster.c
===================================================================
RCS file: /home/ncvs/src/sys/kern/vfs_cluster.c,v
retrieving revision 1.114
diff -u -r1.114 vfs_cluster.c
--- kern/vfs_cluster.c 2001/10/25 22:49:48 1.114
+++ kern/vfs_cluster.c 2001/11/05 00:49:33
@@ -836,7 +836,7 @@
bp->b_data = (char *)((vm_offset_t)bp->b_data |
((vm_offset_t)tbp->b_data & PAGE_MASK));
bp->b_flags |= B_CLUSTER |
- (tbp->b_flags & (B_VMIO | B_NEEDCOMMIT));
+ (tbp->b_flags & (B_VMIO | B_NEEDCOMMIT | B_NOWDRAIN));
bp->b_iodone = cluster_callback;
pbgetvp(vp, bp);
/*
Index: nfsclient/nfs_bio.c
===================================================================
RCS file: /home/ncvs/src/sys/nfsclient/nfs_bio.c,v
retrieving revision 1.102
diff -u -r1.102 nfs_bio.c
--- nfsclient/nfs_bio.c 2001/10/11 23:38:16 1.102
+++ nfsclient/nfs_bio.c 2001/11/05 01:07:42
@@ -961,6 +961,12 @@
}
vfs_bio_set_validclean(bp, on, n);
}
+ /*
+ * If IO_NOWDRAIN then set B_NOWDRAIN (nfs-backed MD
+ * filesystem)
+ */
+ if (ioflag & IO_NOWDRAIN)
+ bp->b_flags |= B_NOWDRAIN;
/*
* If IO_SYNC do bwrite().
Index: sys/buf.h
===================================================================
RCS file: /home/ncvs/src/sys/sys/buf.h,v
retrieving revision 1.121
diff -u -r1.121 buf.h
--- sys/buf.h 2001/09/12 08:38:04 1.121
+++ sys/buf.h 2001/11/04 23:30:25
@@ -192,6 +192,11 @@
* the pages underlying the buffer. B_DIRECT is
* sticky until the buffer is released and typically
* only has an effect when B_RELBUF is also set.
+ *
+ * B_NOWDRAIN This flag should be set when a device (like MD)
+ * does a turn-around VOP_WRITE from its strategy
+ * routine. This flag prevents bwrite() from blocking
+ * in wdrain, avoiding a deadlock situation.
*/
#define B_AGE 0x00000001 /* Move to age queue when I/O done. */
@@ -204,7 +209,7 @@
#define B_DELWRI 0x00000080 /* Delay I/O until buffer reused. */
#define B_DONE 0x00000200 /* I/O completed. */
#define B_EINTR 0x00000400 /* I/O was interrupted */
-#define B_00000800 0x00000800 /* Available flag. */
+#define B_NOWDRAIN 0x00000800 /* Avoid wdrain deadlock */
#define B_SCANNED 0x00001000 /* VOP_FSYNC funcs mark written bufs */
#define B_INVAL 0x00002000 /* Does not contain valid info. */
#define B_LOCKED 0x00004000 /* Locked in core (not reusable). */
Index: sys/vnode.h
===================================================================
RCS file: /home/ncvs/src/sys/sys/vnode.h,v
retrieving revision 1.162
diff -u -r1.162 vnode.h
--- sys/vnode.h 2001/10/27 19:58:55 1.162
+++ sys/vnode.h 2001/11/04 23:27:40
@@ -222,6 +222,7 @@
#define IO_INVAL 0x40 /* invalidate after I/O */
#define IO_ASYNC 0x80 /* bawrite rather then bdwrite */
#define IO_DIRECT 0x100 /* attempt to bypass buffer cache */
+#define IO_NOWDRAIN 0x200 /* do not block on wdrain */
/*
* Modes. Some values same as Ixxx entries from inode.h for now.
Index: ufs/ufs/ufs_readwrite.c
===================================================================
RCS file: /home/ncvs/src/sys/ufs/ufs/ufs_readwrite.c,v
retrieving revision 1.82
diff -u -r1.82 ufs_readwrite.c
--- ufs/ufs/ufs_readwrite.c 2001/09/12 08:38:10 1.82
+++ ufs/ufs/ufs_readwrite.c 2001/11/04 23:29:15
@@ -511,6 +511,8 @@
break;
if (ioflag & IO_DIRECT)
bp->b_flags |= B_DIRECT;
+ if (ioflag & IO_NOWDRAIN)
+ bp->b_flags |= B_NOWDRAIN;
if (uio->uio_offset + xfersize > ip->i_size) {
ip->i_size = uio->uio_offset + xfersize;
To Unsubscribe: send mail to majordomo@FreeBSD.org
with "unsubscribe freebsd-current" in the body of the message
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?200111050112.fA51Cdc42844>
