Date: Wed, 12 Dec 2001 22:08:09 -0800 (PST) From: Matthew Dillon <dillon@apollo.backplane.com> To: Jordan Hubbard <jkh@winston.freebsd.org> Cc: Peter Wemm <peter@wemm.org>, Mike Smith <msmith@FreeBSD.ORG>, hackers@FreeBSD.ORG, msmith@mass.dis.org Subject: Found NFS data corruption bug... (was Re: NFS: How to make FreeBSD fall on its face in one easy step ) Message-ID: <200112130608.fBD689K49906@apollo.backplane.com> References: <58885.1008217148@winston.freebsd.org>
index | next in thread | previous in thread | raw e-mail
Ok, here is the latest patch for -stable. Note that Kirk comitted a
slightly modified version of the softupdates fix to -current already
(the VOP_FSYNC stuff), which I will be MFCing in 3 days.
This still doesn't fix all the problems the nfstest program that Jordan
posted finds, but it sure runs a hellofalot longer now before reporting
an error. 10,000+ tests now before failing (NFSv2 and NFSv3).
Bugs fixed:
* Possible SMP database corruption due to vm_pager_unmap_page()
not clearing the TLB for the other cpu's.
* When flusing a dirty buffer due to B_CACHE getting cleared,
we were accidently setting B_CACHE again (that is, bwrite() sets
B_CACHE), when we really want it to stay clear after the write
is complete. This resulted in a corrupt buffer.
* We have to call vtruncbuf() when ftruncate()ing to remove
any buffer cache buffers. This is still tentitive, I may
be able to remove it due to the second bug fix.
* vnode_pager_setsize() race against nfs_vinvalbuf()... we have
to set n_size before calling nfs_vinvalbuf or the NFS code
may recursively vnode_pager_setsize() to the original value
before the truncate. This is what was causing the user mmap
bus faults in the nfs tester program.
* Fix to softupdates (old version)
There are some general comments in there too. After I do more tests
and cleanups (maybe remove the vtruncbuf()) I will port it all to
-current, test, and commit. So far the stuff is simple enough that
a 3-day MFC will probably suffice.
All I can say is... holy shit!
-Matt
Index: kern/vfs_bio.c
===================================================================
RCS file: /home/ncvs/src/sys/kern/vfs_bio.c,v
retrieving revision 1.242.2.13
diff -u -r1.242.2.13 vfs_bio.c
--- kern/vfs_bio.c 2001/11/18 07:10:59 1.242.2.13
+++ kern/vfs_bio.c 2001/12/13 05:52:55
@@ -2189,9 +2189,22 @@
* to softupdates re-dirtying the buffer. In the latter
* case, B_CACHE is set after the first write completes,
* preventing further loops.
+ *
+ * NOTE! b*write() sets B_CACHE. If we cleared B_CACHE
+ * above while extending the buffer, we cannot allow the
+ * buffer to remain with B_CACHE set after the write
+ * completes or it will represent a corrupt state. To
+ * deal with this we set B_NOCACHE to scrap the buffer
+ * after the write.
+ *
+ * We might be able to do something fancy, like setting
+ * B_CACHE in bwrite() except if B_DELWRI is already set,
+ * so the below call doesn't set B_CACHE, but that gets real
+ * confusing. This is much easier.
*/
if ((bp->b_flags & (B_CACHE|B_DELWRI)) == B_DELWRI) {
+ bp->b_flags |= B_NOCACHE;
VOP_BWRITE(bp->b_vp, bp);
goto loop;
}
Index: nfs/nfs_bio.c
===================================================================
RCS file: /home/ncvs/src/sys/nfs/Attic/nfs_bio.c,v
retrieving revision 1.83.2.1
diff -u -r1.83.2.1 nfs_bio.c
--- nfs/nfs_bio.c 2001/11/18 07:10:59 1.83.2.1
+++ nfs/nfs_bio.c 2001/12/13 05:52:10
@@ -193,8 +193,14 @@
vm_page_set_validclean(m, 0, size - toff);
/* handled by vm_fault now */
/* vm_page_zero_invalid(m, TRUE); */
+ } else {
+ /*
+ * Read operation was short. If no error occured
+ * we may have hit a zero-fill section. We simply
+ * leave valid set to 0.
+ */
+ ;
}
-
if (i != ap->a_reqpage) {
/*
* Whether or not to leave the page activated is up in
@@ -896,14 +902,12 @@
else
bcount = np->n_size - (off_t)lbn * biosize;
}
-
- bp = nfs_getcacheblk(vp, lbn, bcount, p);
-
if (uio->uio_offset + n > np->n_size) {
np->n_size = uio->uio_offset + n;
np->n_flag |= NMODIFIED;
vnode_pager_setsize(vp, np->n_size);
}
+ bp = nfs_getcacheblk(vp, lbn, bcount, p);
}
if (!bp) {
@@ -1409,11 +1413,13 @@
io.iov_len = uiop->uio_resid = bp->b_bcount;
io.iov_base = bp->b_data;
uiop->uio_rw = UIO_READ;
+
switch (vp->v_type) {
case VREG:
uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE;
nfsstats.read_bios++;
error = nfs_readrpc(vp, uiop, cr);
+
if (!error) {
if (uiop->uio_resid) {
/*
@@ -1425,7 +1431,7 @@
* writes, but that is not possible any longer.
*/
int nread = bp->b_bcount - uiop->uio_resid;
- int left = bp->b_bcount - nread;
+ int left = uiop->uio_resid;
if (left > 0)
bzero((char *)bp->b_data + nread, left);
Index: nfs/nfs_vnops.c
===================================================================
RCS file: /home/ncvs/src/sys/nfs/Attic/nfs_vnops.c,v
retrieving revision 1.150.2.4
diff -u -r1.150.2.4 nfs_vnops.c
--- nfs/nfs_vnops.c 2001/08/05 00:23:58 1.150.2.4
+++ nfs/nfs_vnops.c 2001/12/13 04:23:51
@@ -710,7 +710,22 @@
*/
if (vp->v_mount->mnt_flag & MNT_RDONLY)
return (EROFS);
- vnode_pager_setsize(vp, vap->va_size);
+
+ /*
+ * We run vnode_pager_setsize() early (why?),
+ * we must set np->n_size now to avoid vinvalbuf
+ * V_SAVE races that might setsize a lower
+ * value.
+ */
+ tsize = np->n_size;
+ np->n_size = vap->va_size;
+#if 1
+ if (np->n_size < tsize)
+ error = vtruncbuf(vp, ap->a_cred, ap->a_p, vap->va_size, vp->v_mount->mnt_stat.f_iosize);
+ else
+#endif
+ vnode_pager_setsize(vp, vap->va_size);
+
if (np->n_flag & NMODIFIED) {
if (vap->va_size == 0)
error = nfs_vinvalbuf(vp, 0,
@@ -719,12 +734,12 @@
error = nfs_vinvalbuf(vp, V_SAVE,
ap->a_cred, ap->a_p, 1);
if (error) {
+ np->n_size = tsize;
vnode_pager_setsize(vp, np->n_size);
return (error);
}
}
- tsize = np->n_size;
- np->n_size = np->n_vattr.va_size = vap->va_size;
+ np->n_vattr.va_size = vap->va_size;
};
} else if ((vap->va_mtime.tv_sec != VNOVAL ||
vap->va_atime.tv_sec != VNOVAL) && (np->n_flag & NMODIFIED) &&
@@ -1119,10 +1134,12 @@
m_freem(mrep);
tsiz -= retlen;
if (v3) {
- if (eof || retlen == 0)
+ if (eof || retlen == 0) {
tsiz = 0;
- } else if (retlen < len)
+ }
+ } else if (retlen < len) {
tsiz = 0;
+ }
}
nfsmout:
return (error);
Index: ufs/ffs/ffs_inode.c
===================================================================
RCS file: /home/ncvs/src/sys/ufs/ffs/ffs_inode.c,v
retrieving revision 1.56.2.3
diff -u -r1.56.2.3 ffs_inode.c
--- ufs/ffs/ffs_inode.c 2001/11/20 20:27:27 1.56.2.3
+++ ufs/ffs/ffs_inode.c 2001/12/12 23:43:36
@@ -244,6 +244,10 @@
if (error) {
return (error);
}
+ if (length > 0 && DOINGSOFTDEP(ovp)) {
+ if ((error = VOP_FSYNC(ovp, cred, MNT_WAIT, p)) != 0)
+ return (error);
+ }
oip->i_size = length;
size = blksize(fs, oip, lbn);
if (ovp->v_type != VDIR)
@@ -359,6 +363,10 @@
if (newspace == 0)
panic("ffs_truncate: newspace");
if (oldspace - newspace > 0) {
+ /*
+ * XXX Softupdates, adjust queued directblock
+ * in place rather then the second FSYNC above?
+ */
/*
* Block number of space to be free'd is
* the old block # plus the number of frags
Index: vm/vnode_pager.c
===================================================================
RCS file: /home/ncvs/src/sys/vm/vnode_pager.c,v
retrieving revision 1.116.2.5
diff -u -r1.116.2.5 vnode_pager.c
--- vm/vnode_pager.c 2001/11/09 03:21:22 1.116.2.5
+++ vm/vnode_pager.c 2001/12/13 02:49:39
@@ -310,6 +310,20 @@
vm_pager_unmap_page(kva);
/*
+ * XXX work around SMP data integrity race
+ * by unmapping the page from user processes.
+ * The garbage we just cleared may be mapped
+ * to a user process running on another cpu
+ * and this code is not running through normal
+ * I/O channels which handle SMP issues for
+ * us, so unmap page to synchronize all cpus.
+ *
+ * XXX should vm_pager_unmap_page() have
+ * dealt with this?
+ */
+ vm_page_protect(m, VM_PROT_NONE);
+
+ /*
* Clear out partial-page dirty bits. This
* has the side effect of setting the valid
* bits, but that is ok. There are a bunch
To Unsubscribe: send mail to majordomo@FreeBSD.org
with "unsubscribe freebsd-hackers" in the body of the message
help
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?200112130608.fBD689K49906>
