From owner-freebsd-hackers Sun Dec 5 17:23:54 1999 Delivered-To: freebsd-hackers@freebsd.org Received: from fw.wintelcom.net (ns1.wintelcom.net [209.1.153.20]) by hub.freebsd.org (Postfix) with ESMTP id DAC4A14C86 for ; Sun, 5 Dec 1999 17:23:46 -0800 (PST) (envelope-from bright@wintelcom.net) Received: from localhost (bright@localhost) by fw.wintelcom.net (8.9.3/8.9.3) with ESMTP id RAA15831; Sun, 5 Dec 1999 17:52:57 -0800 (PST) Date: Sun, 5 Dec 1999 17:52:57 -0800 (PST) From: Alfred Perlstein To: Matthew Dillon Cc: hackers@freebsd.org Subject: vfs_bio questions/nfs cluster commit Message-ID: MIME-Version: 1.0 Content-Type: TEXT/PLAIN; charset=US-ASCII Sender: owner-freebsd-hackers@FreeBSD.ORG Precedence: bulk X-Loop: FreeBSD.ORG I've been trying to workout mega-clusters for NFS, since afaik the vfs_cluster code will only do 64k chunks and we can benifit greatly by compacting ranges for commit RPCs. The problem, is that it seems that NFS has been having issues, doing large appends on my box (lptest 80 100000 > /nfsmount/foo) seems to just get stuck even without my new code. Is anyone else having this problem on a recent -current? It seems that taking out the sync rpc commit in nfs_writebp may have been a mistake, it appears that it allows a buffer that _must_ be free'd to be delayed for commit, i'm not sure if the code makes sure that the buffer is written sync if we need it to be? I'm not sure it's ok to do this becasue afaik a written but uncommitted bp is marked as delay-write? (B_DELWRI) We don't really get our buffers back until a buffer is committed. Anyhow back to the commit clustering.. Does anyone want to take a look at this? When a buffer is scheduled to be committed from nfs_doio() the nfs_cluster_commit() function trys to find adjacent buffers and include them in one giant mega-commit, then it frees the buffers. Last thing, the paranioa check in nfs_subr.c looks bogus because we are at splbio(), or is it essential that this check be done? XXX: this code _almost_ works :) Index: nfs_bio.c =================================================================== RCS file: /home/ncvs/src/sys/nfs/nfs_bio.c,v retrieving revision 1.80 diff -u -u -r1.80 nfs_bio.c --- nfs_bio.c 1999/10/29 18:09:11 1.80 +++ nfs_bio.c 1999/12/06 05:01:33 @@ -62,6 +62,8 @@ #include #include +static int nfs_cluster_commit __P((struct vnode *vp, struct buf *bp, + struct proc *p)); static struct buf *nfs_getcacheblk __P((struct vnode *vp, daddr_t bn, int size, struct proc *p)); @@ -1006,6 +1008,10 @@ nmp = VFSTONFS(mp); if (nmp->nm_flag & NFSMNT_INT) { + /* + * don't spin on signals not applicable to interupting an + * NFS operation, try short term non-interuptable requests + */ bp = getblk(vp, bn, size, PCATCH, 0); while (bp == (struct buf *)0) { if (nfs_sigintr(nmp, (struct nfsreq *)0, p)) @@ -1347,27 +1353,9 @@ /* * If we only need to commit, try to commit */ - if (bp->b_flags & B_NEEDCOMMIT) { - int retv; - off_t off; - - off = ((u_quad_t)bp->b_blkno) * DEV_BSIZE + bp->b_dirtyoff; - bp->b_flags |= B_WRITEINPROG; - retv = nfs_commit( - bp->b_vp, off, bp->b_dirtyend-bp->b_dirtyoff, - bp->b_wcred, p); - bp->b_flags &= ~B_WRITEINPROG; - if (retv == 0) { - bp->b_dirtyoff = bp->b_dirtyend = 0; - bp->b_flags &= ~B_NEEDCOMMIT; - bp->b_resid = 0; - biodone(bp); + if (bp->b_flags & B_NEEDCOMMIT && + nfs_cluster_commit(vp, bp, p) == 0) return (0); - } - if (retv == NFSERR_STALEWRITEVERF) { - nfs_clearcommit(bp->b_vp->v_mount); - } - } /* * Setup for actual write @@ -1460,4 +1448,172 @@ nfs_clearcommit(vp->v_mount); biodone(bp); return (error); +} + +/* + * nfs_cluster_commit(vp, bp, p) + * + * This function is called on a NFS buffer that needs a commit RPC + * Even though the buffer may already be clustered, clustering is limited + * to 64k chunks, try to grab an even bigger range by scanning forward and + * backward in the file. + */ +static int +nfs_cluster_commit(vp, bp, p) + struct vnode *vp; + struct buf *bp; + struct proc *p; +{ + struct buf *cbufs[16], *holdbp; + int commit_idx, retv, flags, pass, i, dirty_off, dirty_end, s, gap; + int pre_count=0, post_count=0; + u_quad_t sblkno, eblkno, lblkno; + /* start, end, logical block number */ + + KASSERT(bp->b_flags & B_NEEDCOMMIT, ("NFS commit without B_NEEDCOMMIT")); + + flags = B_DELWRI | B_NEEDCOMMIT; + gap = 0; + commit_idx = 0; + pass = 0; + sblkno = eblkno = (u_quad_t)bp->b_blkno; + dirty_end = bp->b_dirtyend; + dirty_off = bp->b_dirtyoff; + + s = splbio(); + /* + * first pass scan forward, second backwards from bp + */ +pass: + lblkno = bp->b_blkno; + holdbp = bp; + + /* + * don't scan too much and don't overflow buf pointer array + */ + while (gap < 4 && commit_idx < (sizeof(cbufs)/sizeof(cbufs[0]))) { + + /* scan forward or backward */ + if (pass == 1) { /* backwards */ + /* + * backwards is easy just subtracting one from the lbn + * should get us the previous buffer if it exists + */ + if (lblkno == 0) + break; + else + lblkno--; + } else { + /* + * when going forward make sure to remeber to account + * for buffers that span more than one block + * if holdbp == NULL then gbincore() failed, try + * going forward only a single block + */ + if (lblkno == (u_quad_t)-1) + break; + else if (holdbp == NULL) + lblkno++; + else + lblkno += holdbp->b_bufsize / DEV_BSIZE; + } + + /* + * if the buffer isn't in memory bump the gap count + * and try the next one + */ + holdbp = gbincore(vp, lblkno); + if (holdbp == NULL) { + gap++; + continue; + } + lblkno = holdbp->b_blkno; + + /* + * make sure flags and cred are the same as the first buffer + * also make sure we can lock the buffer + * if not we have to escape the loop + */ + if (((holdbp->b_flags & flags) != flags) || + holdbp->b_wcred != bp->b_wcred || + BUF_LOCK(holdbp, LK_EXCLUSIVE)) + break; + + gap = 0; /* scan more */ + + bremfree(holdbp); /* buffer accounting */ + + holdbp->b_flags |= B_WRITEINPROG; + vfs_busy_pages(holdbp, 1); + cbufs[ commit_idx++ ] = holdbp; + if (pass == 1) { + /* going backwards so set start block and offset */ + sblkno = lblkno; + dirty_off = holdbp->b_dirtyoff; + ++pre_count; + } else { + /* going forward so set end block and end pointer */ + eblkno = lblkno; + dirty_end = holdbp->b_dirtyend; + ++post_count; + } + } + gap = 0; + if (++pass == 1) + goto pass; + + splx(s); + + retv = nfs_commit(bp->b_vp, + sblkno * DEV_BSIZE + dirty_off, + (eblkno - sblkno) * DEV_BSIZE + dirty_end, + bp->b_wcred, p); + + printf("nfs commit @%lld %d bytes, behind bufs = %d, forward bufs = %d\n", + sblkno * DEV_BSIZE + dirty_off, + (int)(eblkno - sblkno) * DEV_BSIZE + dirty_end, + pre_count, post_count); + + for (i = 0; i < commit_idx; i++) { + holdbp = cbufs[i]; + holdbp->b_flags &= ~(B_WRITEINPROG|B_NEEDCOMMIT); + if (retv) { + /* + * Error, leave B_DELWRI intact + */ + vfs_unbusy_pages(holdbp); + brelse(holdbp); + } else { + /* + * Success, remove B_DELWRI ( bundirty() ). + * + * b_dirtyoff/b_dirtyend seem to be NFS + * specific. We should probably move that + * into bundirty(). XXX + */ + s = splbio(); + vp->v_numoutput++; + holdbp->b_flags |= B_ASYNC; + bundirty(holdbp); + holdbp->b_flags &= ~(B_READ|B_DONE|B_ERROR); + holdbp->b_dirtyoff = holdbp->b_dirtyend = 0; + holdbp->b_resid = 0; + splx(s); + biodone(holdbp); + } + } + bp->b_flags &= ~B_WRITEINPROG; + if (retv == 0) { + bp->b_dirtyoff = bp->b_dirtyend = 0; + bp->b_flags &= ~B_NEEDCOMMIT; + bp->b_resid = 0; + biodone(bp); + return (0); + } + + if (retv == NFSERR_STALEWRITEVERF) + nfs_clearcommit(bp->b_vp->v_mount); + + return (retv); + } Index: nfs_subs.c =================================================================== RCS file: /home/ncvs/src/sys/nfs/nfs_subs.c,v retrieving revision 1.85 diff -u -u -r1.85 nfs_subs.c --- nfs_subs.c 1999/10/03 12:18:26 1.85 +++ nfs_subs.c 1999/12/04 18:12:38 @@ -2156,18 +2156,14 @@ nfs_clearcommit(mp) struct mount *mp; { - register struct vnode *vp, *nvp; - register struct buf *bp, *nbp; + register struct vnode *vp; + register struct buf *bp; int s; s = splbio(); loop: - for (vp = mp->mnt_vnodelist.lh_first; vp; vp = nvp) { - if (vp->v_mount != mp) /* Paranoia */ - goto loop; - nvp = vp->v_mntvnodes.le_next; - for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { - nbp = TAILQ_NEXT(bp, b_vnbufs); + LIST_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) { + TAILQ_FOREACH(bp, &vp->v_dirtyblkhd, b_vnbufs) { if (BUF_REFCNT(bp) == 0 && (bp->b_flags & (B_DELWRI | B_NEEDCOMMIT)) == (B_DELWRI | B_NEEDCOMMIT)) To Unsubscribe: send mail to majordomo@FreeBSD.org with "unsubscribe freebsd-hackers" in the body of the message