Skip site navigation (1)Skip section navigation (2)
Date:      Sun, 5 Dec 1999 17:52:57 -0800 (PST)
From:      Alfred Perlstein <bright@wintelcom.net>
To:        Matthew Dillon <dillon@apollo.backplane.com>
Cc:        hackers@freebsd.org
Subject:   vfs_bio questions/nfs cluster commit
Message-ID:  <Pine.BSF.4.21.9912041342290.4557-100000@fw.wintelcom.net>

next in thread | raw e-mail | index | archive | help

I've been trying to workout mega-clusters for NFS, since afaik the
vfs_cluster code will only do 64k chunks and we can benifit greatly
by compacting ranges for commit RPCs.

The problem, is that it seems that NFS has been having issues, doing
large appends on my box (lptest 80 100000 > /nfsmount/foo) seems
to just get stuck even without my new code.  Is anyone else having
this problem on a recent -current?

It seems that taking out the sync rpc commit in nfs_writebp may
have been a mistake, it appears that it allows a buffer that _must_
be free'd to be delayed for commit, i'm not sure if the code makes
sure that the buffer is written sync if we need it to be?

I'm not sure it's ok to do this becasue afaik a written but uncommitted
bp is marked as delay-write? (B_DELWRI)  We don't really get our
buffers back until a buffer is committed.

Anyhow back to the commit clustering..
Does anyone want to take a look at this?

When a buffer is scheduled to be committed from nfs_doio() the
nfs_cluster_commit() function trys to find adjacent buffers and
include them in one giant mega-commit, then it frees the buffers.

Last thing, the paranioa check in nfs_subr.c looks bogus because
we are at splbio(), or is it essential that this check be done?

XXX: this code _almost_ works :)

Index: nfs_bio.c
===================================================================
RCS file: /home/ncvs/src/sys/nfs/nfs_bio.c,v
retrieving revision 1.80
diff -u -u -r1.80 nfs_bio.c
--- nfs_bio.c	1999/10/29 18:09:11	1.80
+++ nfs_bio.c	1999/12/06 05:01:33
@@ -62,6 +62,8 @@
 #include <nfs/nqnfs.h>
 #include <nfs/nfsnode.h>
 
+static int nfs_cluster_commit __P((struct vnode *vp, struct buf *bp, 
+					struct proc *p));
 static struct buf *nfs_getcacheblk __P((struct vnode *vp, daddr_t bn, int size,
 					struct proc *p));
 
@@ -1006,6 +1008,10 @@
 	nmp = VFSTONFS(mp);
 
 	if (nmp->nm_flag & NFSMNT_INT) {
+		/* 
+		 * don't spin on signals not applicable to interupting an
+		 * NFS operation, try short term non-interuptable requests
+		 */
 		bp = getblk(vp, bn, size, PCATCH, 0);
 		while (bp == (struct buf *)0) {
 			if (nfs_sigintr(nmp, (struct nfsreq *)0, p))
@@ -1347,27 +1353,9 @@
 	    /* 
 	     * If we only need to commit, try to commit
 	     */
-	    if (bp->b_flags & B_NEEDCOMMIT) {
-		    int retv;
-		    off_t off;
-
-		    off = ((u_quad_t)bp->b_blkno) * DEV_BSIZE + bp->b_dirtyoff;
-		    bp->b_flags |= B_WRITEINPROG;
-		    retv = nfs_commit(
-				bp->b_vp, off, bp->b_dirtyend-bp->b_dirtyoff,
-				bp->b_wcred, p);
-		    bp->b_flags &= ~B_WRITEINPROG;
-		    if (retv == 0) {
-			    bp->b_dirtyoff = bp->b_dirtyend = 0;
-			    bp->b_flags &= ~B_NEEDCOMMIT;
-			    bp->b_resid = 0;
-			    biodone(bp);
+	    if (bp->b_flags & B_NEEDCOMMIT && 
+			nfs_cluster_commit(vp, bp, p) == 0)
 			    return (0);
-		    }
-		    if (retv == NFSERR_STALEWRITEVERF) {
-			    nfs_clearcommit(bp->b_vp->v_mount);
-		    }
-	    }
 
 	    /*
 	     * Setup for actual write
@@ -1460,4 +1448,172 @@
 	    nfs_clearcommit(vp->v_mount);
 	biodone(bp);
 	return (error);
+}
+
+/*
+ * nfs_cluster_commit(vp, bp, p)
+ *
+ * This function is called on a NFS buffer that needs a commit RPC
+ * Even though the buffer may already be clustered, clustering is limited
+ * to 64k chunks, try to grab an even bigger range by scanning forward and
+ * backward in the file.
+ */
+static int
+nfs_cluster_commit(vp, bp, p)
+	struct vnode	*vp;
+	struct buf	*bp;
+	struct proc *p;
+{
+	struct buf *cbufs[16], *holdbp;
+	int commit_idx, retv, flags, pass, i, dirty_off, dirty_end, s, gap;
+	int pre_count=0, post_count=0;
+	u_quad_t	sblkno, eblkno, lblkno;
+		/* start, end, logical block number */
+
+	KASSERT(bp->b_flags & B_NEEDCOMMIT, ("NFS commit without B_NEEDCOMMIT"));
+
+	flags = B_DELWRI | B_NEEDCOMMIT;
+	gap = 0;
+	commit_idx = 0;
+	pass = 0;
+	sblkno = eblkno = (u_quad_t)bp->b_blkno;
+	dirty_end = bp->b_dirtyend;
+	dirty_off = bp->b_dirtyoff;
+	
+	s = splbio();
+	/*
+	 * first pass scan forward, second backwards from bp
+	 */
+pass:
+	lblkno = bp->b_blkno;
+	holdbp = bp;
+
+	/* 
+	 * don't scan too much and don't overflow buf pointer array 
+	 */
+	while (gap < 4 && commit_idx < (sizeof(cbufs)/sizeof(cbufs[0]))) {
+
+		/* scan forward or backward */
+		if (pass == 1) { /* backwards */
+			/*
+			 * backwards is easy just subtracting one from the lbn
+			 * should get us the previous buffer if it exists
+			 */
+			if (lblkno == 0)	
+				break;
+			else
+				lblkno--;
+		} else {
+			/*
+			 * when going forward make sure to remeber to account
+			 * for buffers that span more than one block
+			 * if holdbp == NULL then gbincore() failed, try
+			 * going forward only a single block
+			 */
+			if (lblkno == (u_quad_t)-1)
+				break;
+			else if (holdbp == NULL)
+				lblkno++;
+			else
+				lblkno += holdbp->b_bufsize / DEV_BSIZE;
+		}
+
+		/*
+		 * if the buffer isn't in memory bump the gap count
+		 * and try the next one
+		 */
+		holdbp = gbincore(vp, lblkno);
+		if (holdbp == NULL) {
+			gap++;
+			continue;
+		}
+		lblkno = holdbp->b_blkno;
+
+		/*
+		 * make sure flags and cred are the same as the first buffer
+		 * also make sure we can lock the buffer
+		 * if not we have to escape the loop
+		 */
+		if (((holdbp->b_flags & flags) != flags) ||
+			holdbp->b_wcred != bp->b_wcred ||
+			BUF_LOCK(holdbp, LK_EXCLUSIVE))
+			break;
+
+		gap = 0;	/* scan more */
+		
+		bremfree(holdbp);	/* buffer accounting */
+
+		holdbp->b_flags |= B_WRITEINPROG;
+		vfs_busy_pages(holdbp, 1);
+		cbufs[ commit_idx++ ] = holdbp;
+		if (pass == 1) {
+			/* going backwards so set start block and offset */
+ 			sblkno = lblkno;	
+			dirty_off = holdbp->b_dirtyoff;
+			++pre_count;
+		} else {
+			/* going forward so set end block and end pointer */
+			eblkno = lblkno;
+			dirty_end = holdbp->b_dirtyend;
+			++post_count;
+		}
+	}
+	gap = 0;
+	if (++pass == 1)
+		goto pass;
+
+	splx(s);
+
+	retv = nfs_commit(bp->b_vp, 
+		sblkno * DEV_BSIZE + dirty_off, 
+		(eblkno - sblkno) * DEV_BSIZE + dirty_end,
+		bp->b_wcred, p);
+
+	printf("nfs commit @%lld %d bytes, behind bufs = %d, forward bufs = %d\n",
+		sblkno * DEV_BSIZE + dirty_off, 
+		(int)(eblkno - sblkno) * DEV_BSIZE + dirty_end,
+		pre_count, post_count);
+
+	for (i = 0; i < commit_idx; i++) {
+		holdbp = cbufs[i]; 
+		holdbp->b_flags &= ~(B_WRITEINPROG|B_NEEDCOMMIT);
+		if (retv) {
+			/*
+			* Error, leave B_DELWRI intact
+			*/
+			vfs_unbusy_pages(holdbp);
+			brelse(holdbp);
+		} else {
+			/*
+			* Success, remove B_DELWRI ( bundirty() ).
+			*
+			* b_dirtyoff/b_dirtyend seem to be NFS 
+			* specific.  We should probably move that
+			* into bundirty(). XXX
+			*/
+			s = splbio();
+			vp->v_numoutput++;
+			holdbp->b_flags |= B_ASYNC;
+			bundirty(holdbp);
+			holdbp->b_flags &= ~(B_READ|B_DONE|B_ERROR);
+			holdbp->b_dirtyoff = holdbp->b_dirtyend = 0;
+			holdbp->b_resid = 0;
+			splx(s);
+			biodone(holdbp);
+		}
+	}
+	bp->b_flags &= ~B_WRITEINPROG;
+	if (retv == 0) {
+		bp->b_dirtyoff = bp->b_dirtyend = 0;
+		bp->b_flags &= ~B_NEEDCOMMIT;
+		bp->b_resid = 0;
+		biodone(bp);
+		return (0);
+	}
+
+	if (retv == NFSERR_STALEWRITEVERF)
+		nfs_clearcommit(bp->b_vp->v_mount);
+
+	return (retv);
+
 }
Index: nfs_subs.c
===================================================================
RCS file: /home/ncvs/src/sys/nfs/nfs_subs.c,v
retrieving revision 1.85
diff -u -u -r1.85 nfs_subs.c
--- nfs_subs.c	1999/10/03 12:18:26	1.85
+++ nfs_subs.c	1999/12/04 18:12:38
@@ -2156,18 +2156,14 @@
 nfs_clearcommit(mp)
 	struct mount *mp;
 {
-	register struct vnode *vp, *nvp;
-	register struct buf *bp, *nbp;
+	register struct vnode *vp;
+	register struct buf *bp;
 	int s;
 
 	s = splbio();
 loop:
-	for (vp = mp->mnt_vnodelist.lh_first; vp; vp = nvp) {
-		if (vp->v_mount != mp)	/* Paranoia */
-			goto loop;
-		nvp = vp->v_mntvnodes.le_next;
-		for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
-			nbp = TAILQ_NEXT(bp, b_vnbufs);
+	LIST_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) {
+		TAILQ_FOREACH(bp, &vp->v_dirtyblkhd, b_vnbufs) {
 			if (BUF_REFCNT(bp) == 0 &&
 			    (bp->b_flags & (B_DELWRI | B_NEEDCOMMIT))
 				== (B_DELWRI | B_NEEDCOMMIT))




To Unsubscribe: send mail to majordomo@FreeBSD.org
with "unsubscribe freebsd-hackers" in the body of the message




Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?Pine.BSF.4.21.9912041342290.4557-100000>