Date: Mon, 27 Feb 2012 19:00:55 GMT
From: John Baldwin <jhb@FreeBSD.org>
To: Perforce Change Reviews <perforce@freebsd.org>
Subject: PERFORCE change 206989 for review
Message-ID: <201202271900.q1RJ0toJ064427@skunkworks.freebsd.org>
next in thread | raw e-mail | index | archive | help
http://p4web.freebsd.org/@@206989?ac=10 Change 206989 by jhb@jhb_jhbbsd on 2012/02/27 19:00:32 Import my current WIP to implement POSIX_FADV_WILLNEED for UFS. Affected files ... .. //depot/projects/fadvise/sys/kern/vfs_bio.c#5 edit .. //depot/projects/fadvise/sys/kern/vfs_cluster.c#3 edit .. //depot/projects/fadvise/sys/sys/buf.h#2 edit .. //depot/projects/fadvise/sys/ufs/ffs/ffs_vnops.c#3 edit Differences ... ==== //depot/projects/fadvise/sys/kern/vfs_bio.c#5 (text+ko) ==== @@ -2664,8 +2664,10 @@ if (error == ENOLCK) goto loop; /* We timed out or were interrupted. */ - else if (error) + else if (error) { + CTR4(KTR_BUF, "getblk(%p, %ld, %d) failed %d", vp, (long)blkno, size, error); return (NULL); + } /* * The buffer is locked. B_CACHE is cleared if the buffer is @@ -2787,8 +2789,16 @@ bp = getnewbuf(vp, slpflag, slptimeo, size, maxsize, flags); if (bp == NULL) { - if (slpflag || slptimeo) + /* + * XXX: Should this also return NULL if + * GB_NOWAIT_BD is set? + */ + if (slpflag || slptimeo) { + CTR3(KTR_BUF, + "getblk(%p, %ld, %d) failed getnewbuf()", + vp, (long)blkno, size); return NULL; + } goto loop; } ==== //depot/projects/fadvise/sys/kern/vfs_cluster.c#3 (text+ko) ==== @@ -39,6 +39,7 @@ #include <sys/param.h> #include <sys/systm.h> #include <sys/kernel.h> +#include <sys/ktr.h> #include <sys/proc.h> #include <sys/bio.h> #include <sys/buf.h> @@ -64,8 +65,10 @@ cluster_collectbufs(struct vnode *vp, struct buf *last_bp); static struct buf * cluster_rbuild(struct vnode *vp, u_quad_t filesize, daddr_t lbn, - daddr_t blkno, long size, int run, struct buf *fbp); + daddr_t blkno, long size, int run, struct buf *fbp, int gbflags); static void cluster_callback(struct buf *); +static void cluster_ra(struct vnode *vp, u_quad_t filesize, daddr_t flbn, + daddr_t elbn, long size, int racluster, int gbflags); static int write_behind = 1; SYSCTL_INT(_vfs, OID_AUTO, write_behind, CTLFLAG_RW, &write_behind, 0, @@ -75,6 +78,19 @@ SYSCTL_INT(_vfs, OID_AUTO, read_max, CTLFLAG_RW, 
&read_max, 0, "Cluster read-ahead max block count"); +SYSCTL_NODE(_vfs, OID_AUTO, cluster, CTLFLAG_RD, NULL, ""); + +static int ra_fails; +SYSCTL_INT(_vfs_cluster, OID_AUTO, ra_fails, CTLFLAG_RD, &ra_fails, 0, ""); +static int rbuild_fails; +SYSCTL_INT(_vfs_cluster, OID_AUTO, rbuild_fails, CTLFLAG_RD, &rbuild_fails, 0, + ""); +static int ra_clusters; +SYSCTL_INT(_vfs_cluster, OID_AUTO, ra_clusters, CTLFLAG_RD, &ra_clusters, 0, + ""); +static int ra_singles; +SYSCTL_INT(_vfs_cluster, OID_AUTO, ra_singles, CTLFLAG_RD, &ra_singles, 0, ""); + /* Page expended to mark partially backed buffers */ extern vm_page_t bogus_page; @@ -208,7 +224,7 @@ if (ncontig < nblks) nblks = ncontig; bp = cluster_rbuild(vp, filesize, lblkno, - blkno, size, nblks, bp); + blkno, size, nblks, bp, 0); lblkno += (bp->b_bufsize / size); } else { bp->b_flags |= B_RAM; @@ -236,11 +252,69 @@ /* * If we have been doing sequential I/O, then do some read-ahead. */ - while (lblkno < (origblkno + maxra)) { + cluster_ra(vp, filesize, lblkno, origblkno + maxra, size, racluster, 0); + + if (reqbp) + return (bufwait(reqbp)); + else + return (error); +} + +/* + * Perform asynchronous read-ahead clustering reads for contiguous blocks + * if possible. Returns the amount of I/O it attempted to schedule. + */ +long +cluster_readahead(vp, filesize, lblkno, size) + struct vnode *vp; + u_quad_t filesize; + daddr_t lblkno; + long size; +{ + int maxra, racluster; + + /* + * Try to limit the amount of read-ahead by a few + * ad-hoc parameters. This needs work!!! 
+ */ + racluster = vp->v_mount->mnt_iosize_max / size; + maxra = min(nbuf/8, read_max); + if (((u_quad_t)(lblkno + maxra + 1) * size) > filesize) + maxra = (filesize / size) - lblkno; + CTR3(KTR_BUF, "cluster_readahead(%p, %ld) using maxra %d", vp, lblkno, + maxra); + cluster_ra(vp, filesize, lblkno, lblkno + maxra, size, racluster, + /* GB_NOWAIT_BD | */ GB_LOCK_NOWAIT); + return (maxra * size); +} + +static void +cluster_ra(vp, filesize, flbn, elbn, size, racluster, gbflags) + struct vnode *vp; + u_quad_t filesize; + daddr_t flbn; + daddr_t elbn; + long size; + int racluster; + int gbflags; +{ + struct buf *rbp; + daddr_t blkno, lblkno; +#ifdef KTR + daddr_t old; +#endif + int error, ncontig; + + for (lblkno = flbn; lblkno < elbn; ) { +#ifdef KTR + old = lblkno; +#endif error = VOP_BMAP(vp, lblkno, NULL, &blkno, &ncontig, NULL); if (error) break; + CTR4(KTR_BUF, "cluster_ra: VOP_BMAP(%p, %ld) returned %ld, %d", + vp, lblkno, blkno, ncontig); if (blkno == -1) break; @@ -252,22 +326,46 @@ if (ncontig) { ncontig = min(ncontig + 1, racluster); rbp = cluster_rbuild(vp, filesize, lblkno, blkno, - size, ncontig, NULL); + size, ncontig, NULL, gbflags); + if (rbp == NULL) { + CTR2(KTR_BUF, "cluster_rbuild(%p, %ld) failed", + vp, lblkno); + lblkno += 1; + continue; + } lblkno += (rbp->b_bufsize / size); if (rbp->b_flags & B_DELWRI) { + CTR2(KTR_BUF, + "cluster_ra: cluster for %ld,%d has B_DELWRI", + old, rbp->b_bufsize / size); bqrelse(rbp); continue; } + CTR2(KTR_BUF, + "cluster_ra: scheduling cluster %ld,%d", + old, rbp->b_bufsize / size); + ra_clusters++; } else { - rbp = getblk(vp, lblkno, size, 0, 0, 0); + rbp = getblk(vp, lblkno, size, 0, 0, gbflags); lblkno += 1; + if (rbp == NULL) { + CTR2(KTR_BUF, + "cluster_ra: getblk(%p, %ld) failed", + vp, lblkno); + ra_fails++; + continue; + } if (rbp->b_flags & B_DELWRI) { + CTR1(KTR_BUF, + "cluster_ra: block %ld has B_DELWRI", old); bqrelse(rbp); continue; } rbp->b_flags |= B_ASYNC | B_RAM; rbp->b_iocmd = BIO_READ; 
rbp->b_blkno = blkno; + CTR1(KTR_BUF, "cluster_ra: scheduling block %ld", old); + ra_singles++; } if (rbp->b_flags & B_CACHE) { rbp->b_flags &= ~B_ASYNC; @@ -285,11 +383,6 @@ bstrategy(rbp); curthread->td_ru.ru_inblock++; } - - if (reqbp) - return (bufwait(reqbp)); - else - return (error); } /* @@ -298,7 +391,7 @@ * and then parcel them up into logical blocks in the buffer hash table. */ static struct buf * -cluster_rbuild(vp, filesize, lbn, blkno, size, run, fbp) +cluster_rbuild(vp, filesize, lbn, blkno, size, run, fbp, gbflags) struct vnode *vp; u_quad_t filesize; daddr_t lbn; @@ -306,6 +399,7 @@ long size; int run; struct buf *fbp; + int gbflags; { struct bufobj *bo; struct buf *bp, *tbp; @@ -329,8 +423,10 @@ tbp = fbp; tbp->b_iocmd = BIO_READ; } else { - tbp = getblk(vp, lbn, size, 0, 0, 0); - if (tbp->b_flags & B_CACHE) + tbp = getblk(vp, lbn, size, 0, 0, gbflags); + if (tbp == NULL) + rbuild_fails++; + if (tbp == NULL || tbp->b_flags & B_CACHE) return tbp; tbp->b_flags |= B_ASYNC | B_RAM; tbp->b_iocmd = BIO_READ; ==== //depot/projects/fadvise/sys/sys/buf.h#2 (text+ko) ==== @@ -504,6 +504,7 @@ int cluster_read(struct vnode *, u_quad_t, daddr_t, long, struct ucred *, long, int, struct buf **); +long cluster_readahead(struct vnode *, u_quad_t, daddr_t, long); int cluster_wbuild(struct vnode *, long, daddr_t, int); void cluster_write(struct vnode *, struct buf *, u_quad_t, int); void vfs_bio_set_valid(struct buf *, int base, int size); ==== //depot/projects/fadvise/sys/ufs/ffs/ffs_vnops.c#3 (text+ko) ==== @@ -70,6 +70,7 @@ #include <sys/buf.h> #include <sys/conf.h> #include <sys/extattr.h> +#include <sys/fcntl.h> #include <sys/kernel.h> #include <sys/limits.h> #include <sys/malloc.h> @@ -100,6 +101,7 @@ #ifdef DIRECTIO extern int ffs_rawread(struct vnode *vp, struct uio *uio, int *workdone); #endif +static vop_advise_t ffs_advise; static vop_fsync_t ffs_fsync; static vop_lock1_t ffs_lock; static vop_getpages_t ffs_getpages; @@ -124,6 +126,7 @@ .vop_fsync = 
ffs_fsync, .vop_getpages = ffs_getpages, .vop_lock1 = ffs_lock, + .vop_advise = ffs_advise, .vop_read = ffs_read, .vop_reallocblks = ffs_reallocblks, .vop_write = ffs_write, @@ -143,6 +146,7 @@ .vop_fsync = ffs_fsync, .vop_getpages = ffs_getpages, .vop_lock1 = ffs_lock, + .vop_advise = ffs_advise, .vop_read = ffs_read, .vop_reallocblks = ffs_reallocblks, .vop_write = ffs_write, @@ -399,6 +403,78 @@ #endif } +static int +ffs_advise(ap) + struct vop_advise_args /* { + struct vnode *a_vp; + off_t a_start; + off_t a_end; + int a_advice; + } */ *ap; +{ + struct vnode *vp; + struct inode *ip; + struct fs *fs; + off_t start, end; + size_t resid; + ufs_lbn_t lbn, endblkno; + long size, blkoffset; + int xfersize; + + switch (ap->a_advice) { + case POSIX_FADV_WILLNEED: + vp = ap->a_vp; + start = ap->a_start; + end = ap->a_end; + vn_lock(vp, LK_SHARED | LK_RETRY); + if (vp->v_iflag & VI_DOOMED) { + VOP_UNLOCK(vp, 0); + return (EBADF); + } + KASSERT(vp->v_type == VREG, ("FADV_WILLNEED on bad vnode")); + ip = VTOI(vp); + if (start >= ip->i_size) { + VOP_UNLOCK(vp, 0); + return (0); + } + if (end >= ip->i_size) + end = ip->i_size - 1; + resid = end - start + 1; + fs = ip->i_fs; + + /* HACK: Prefetch indirect blocks for this range. */ + endblkno = lblkno(fs, end); + for (lbn = NDADDR; lbn < endblkno; lbn += NINDIR(fs)) + breada(vp, &lbn, &fs->fs_bsize, 1, NOCRED); + + while (resid > 0) { + /* Limit the number of read ahead buffers. 
*/ + if (runningbufspace > hibufspace / 2) + break; + lbn = lblkno(fs, start); + size = blksize(fs, ip, lbn); + blkoffset = blkoff(fs, start); + if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) { + resid += blkoffset; + start -= blkoffset; + xfersize = cluster_readahead(vp, ip->i_size, + lbn, size); + } else { + xfersize = fs->fs_bsize - blkoffset; + if (resid < xfersize) + xfersize = resid; + breada(vp, &lbn, &xfersize, 1, NOCRED); + } + resid -= xfersize; + start += xfersize; + } + VOP_UNLOCK(vp, 0); + return (0); + default: + return (vop_stdadvise(ap)); + } +} + /* * Vnode op for reading. */
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201202271900.q1RJ0toJ064427>