Date: Tue, 28 Aug 2012 18:45:20 +0000 (UTC) From: John Baldwin <jhb@FreeBSD.org> To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-stable@freebsd.org, svn-src-stable-8@freebsd.org Subject: svn commit: r239788 - in stable/8: lib/libc/sys sys/kern sys/sys Message-ID: <201208281845.q7SIjKm4011026@svn.freebsd.org>
next in thread | raw e-mail | index | archive | help
Author: jhb Date: Tue Aug 28 18:45:20 2012 New Revision: 239788 URL: http://svn.freebsd.org/changeset/base/239788 Log: MFC 230782,237274: Refine the implementation of POSIX_FADV_NOREUSE to perform POSIX_FADV_DONTNEED requests on the currently accessed portion of the file on each read(2) or write(2) rather than using direct I/O. This gives much better performance including read-ahead and write clustering similar to normal read(2) and write(2) calls. If subsequent read(2) and write(2) calls are sequential, then the POSIX_FADV_DONTNEED requests will cover the entire sequentially-accessed range. Modified: stable/8/lib/libc/sys/posix_fadvise.2 stable/8/sys/kern/vfs_syscalls.c stable/8/sys/kern/vfs_vnops.c stable/8/sys/sys/file.h Directory Properties: stable/8/lib/libc/ (props changed) stable/8/lib/libc/stdtime/ (props changed) stable/8/lib/libc/sys/ (props changed) stable/8/lib/libc/uuid/ (props changed) stable/8/sys/ (props changed) stable/8/sys/amd64/include/xen/ (props changed) stable/8/sys/boot/ (props changed) stable/8/sys/cddl/ (props changed) stable/8/sys/cddl/contrib/opensolaris/ (props changed) stable/8/sys/compat/ (props changed) stable/8/sys/contrib/dev/acpica/ (props changed) stable/8/sys/contrib/pf/ (props changed) stable/8/sys/dev/ (props changed) stable/8/sys/dev/e1000/ (props changed) stable/8/sys/dev/sound/ (props changed) stable/8/sys/dev/sound/pci/ (props changed) stable/8/sys/dev/virtio/ (props changed) stable/8/sys/kern/ (props changed) stable/8/sys/sys/ (props changed) stable/8/sys/vm/ (props changed) Modified: stable/8/lib/libc/sys/posix_fadvise.2 ============================================================================== --- stable/8/lib/libc/sys/posix_fadvise.2 Tue Aug 28 18:44:56 2012 (r239787) +++ stable/8/lib/libc/sys/posix_fadvise.2 Tue Aug 28 18:45:20 2012 (r239788) @@ -28,7 +28,7 @@ .\" @(#)madvise.2 8.1 (Berkeley) 6/9/93 .\" $FreeBSD$ .\" -.Dd February 25, 2012 +.Dd June 19, 2012 .Dt POSIX_FADVISE 2 .Os .Sh NAME @@ -84,10 +84,9 @@ specified range and future access to thi .It Dv POSIX_FADV_NOREUSE Tells the system that the specified data will only be accessed once and then not reused. -Accesses to data within the specified range are treated as if the file -descriptor has the -.Dv O_DIRECT -flag enabled. +The system may decrease the in-memory priority of data once it has been +read or written. +Future access to this data may require a read operation. .El .Pp .Sh RETURN VALUES Modified: stable/8/sys/kern/vfs_syscalls.c ============================================================================== --- stable/8/sys/kern/vfs_syscalls.c Tue Aug 28 18:44:56 2012 (r239787) +++ stable/8/sys/kern/vfs_syscalls.c Tue Aug 28 18:45:20 2012 (r239788) @@ -4824,6 +4824,8 @@ kern_posix_fadvise(struct thread *td, in new->fa_advice = advice; new->fa_start = offset; new->fa_end = end; + new->fa_prevstart = 0; + new->fa_prevend = 0; fp->f_advice = new; new = fa; } Modified: stable/8/sys/kern/vfs_vnops.c ============================================================================== --- stable/8/sys/kern/vfs_vnops.c Tue Aug 28 18:44:56 2012 (r239787) +++ stable/8/sys/kern/vfs_vnops.c Tue Aug 28 18:45:20 2012 (r239788) @@ -512,6 +512,7 @@ vn_read(fp, uio, active_cred, flags, td) int error, ioflag; struct mtx *mtxp; int advice, vfslocked; + off_t offset, start, end; KASSERT(uio->uio_td == td, ("uio_td %p is not td %p", uio->uio_td, td)); @@ -551,19 +552,14 @@ vn_read(fp, uio, active_cred, flags, td) switch (advice) { case POSIX_FADV_NORMAL: case POSIX_FADV_SEQUENTIAL: + case POSIX_FADV_NOREUSE: ioflag |= sequential_heuristic(uio, fp); break; case POSIX_FADV_RANDOM: /* Disable read-ahead for random I/O. */ break; - case POSIX_FADV_NOREUSE: - /* - * Request the underlying FS to discard the buffers - * and pages after the I/O is complete. - */ - ioflag |= IO_DIRECT; - break; } + offset = uio->uio_offset; #ifdef MAC error = mac_vnode_check_read(active_cred, fp->f_cred, vp); @@ -580,6 +576,39 @@ vn_read(fp, uio, active_cred, flags, td) } fp->f_nextoff = uio->uio_offset; VOP_UNLOCK(vp, 0); + if (error == 0 && advice == POSIX_FADV_NOREUSE && + offset != uio->uio_offset) { + /* + * Use POSIX_FADV_DONTNEED to flush clean pages and + * buffers for the backing file after a + * POSIX_FADV_NOREUSE read(2). To optimize the common + * case of using POSIX_FADV_NOREUSE with sequential + * access, track the previous implicit DONTNEED + * request and grow this request to include the + * current read(2) in addition to the previous + * DONTNEED. With purely sequential access this will + * cause the DONTNEED requests to continously grow to + * cover all of the previously read regions of the + * file. This allows filesystem blocks that are + * accessed by multiple calls to read(2) to be flushed + * once the last read(2) finishes. + */ + start = offset; + end = uio->uio_offset - 1; + mtx_lock(mtxp); + if (fp->f_advice != NULL && + fp->f_advice->fa_advice == POSIX_FADV_NOREUSE) { + if (start != 0 && fp->f_advice->fa_prevend + 1 == start) + start = fp->f_advice->fa_prevstart; + else if (fp->f_advice->fa_prevstart != 0 && + fp->f_advice->fa_prevstart == end + 1) + end = fp->f_advice->fa_prevend; + fp->f_advice->fa_prevstart = start; + fp->f_advice->fa_prevend = end; + } + mtx_unlock(mtxp); + error = VOP_ADVISE(vp, start, end, POSIX_FADV_DONTNEED); + } VFS_UNLOCK_GIANT(vfslocked); return (error); } @@ -600,6 +629,7 @@ vn_write(fp, uio, active_cred, flags, td int error, ioflag, lock_flags; struct mtx *mtxp; int advice, vfslocked; + off_t offset, start, end; KASSERT(uio->uio_td == td, ("uio_td %p is not td %p", uio->uio_td, td)); @@ -634,6 +664,7 @@ vn_write(fp, uio, active_cred, flags, td if ((flags & FOF_OFFSET) == 0) uio->uio_offset = fp->f_offset; advice = POSIX_FADV_NORMAL; + mtxp = NULL; if (fp->f_advice != NULL) { mtxp = mtx_pool_find(mtxpool_sleep, fp); mtx_lock(mtxp); @@ -646,19 +677,14 @@ vn_write(fp, uio, active_cred, flags, td switch (advice) { case POSIX_FADV_NORMAL: case POSIX_FADV_SEQUENTIAL: + case POSIX_FADV_NOREUSE: ioflag |= sequential_heuristic(uio, fp); break; case POSIX_FADV_RANDOM: /* XXX: Is this correct? */ break; - case POSIX_FADV_NOREUSE: - /* - * Request the underlying FS to discard the buffers - * and pages after the I/O is complete. - */ - ioflag |= IO_DIRECT; - break; } + offset = uio->uio_offset; #ifdef MAC error = mac_vnode_check_write(active_cred, fp->f_cred, vp); @@ -671,6 +697,55 @@ vn_write(fp, uio, active_cred, flags, td VOP_UNLOCK(vp, 0); if (vp->v_type != VCHR) vn_finished_write(mp); + if (error == 0 && advice == POSIX_FADV_NOREUSE && + offset != uio->uio_offset) { + /* + * Use POSIX_FADV_DONTNEED to flush clean pages and + * buffers for the backing file after a + * POSIX_FADV_NOREUSE write(2). To optimize the + * common case of using POSIX_FADV_NOREUSE with + * sequential access, track the previous implicit + * DONTNEED request and grow this request to include + * the current write(2) in addition to the previous + * DONTNEED. With purely sequential access this will + * cause the DONTNEED requests to continously grow to + * cover all of the previously written regions of the + * file. + * + * Note that the blocks just written are almost + * certainly still dirty, so this only works when + * VOP_ADVISE() calls from subsequent writes push out + * the data written by this write(2) once the backing + * buffers are clean. However, as compared to forcing + * IO_DIRECT, this gives much saner behavior. Write + * clustering is still allowed, and clean pages are + * merely moved to the cache page queue rather than + * outright thrown away. This means a subsequent + * read(2) can still avoid hitting the disk if the + * pages have not been reclaimed. + * + * This does make POSIX_FADV_NOREUSE largely useless + * with non-sequential access. However, sequential + * access is the more common use case and the flag is + * merely advisory. + */ + start = offset; + end = uio->uio_offset - 1; + mtx_lock(mtxp); + if (fp->f_advice != NULL && + fp->f_advice->fa_advice == POSIX_FADV_NOREUSE) { + if (start != 0 && fp->f_advice->fa_prevend + 1 == start) + start = fp->f_advice->fa_prevstart; + else if (fp->f_advice->fa_prevstart != 0 && + fp->f_advice->fa_prevstart == end + 1) + end = fp->f_advice->fa_prevend; + fp->f_advice->fa_prevstart = start; + fp->f_advice->fa_prevend = end; + } + mtx_unlock(mtxp); + error = VOP_ADVISE(vp, start, end, POSIX_FADV_DONTNEED); + } + unlock: VFS_UNLOCK_GIANT(vfslocked); return (error); Modified: stable/8/sys/sys/file.h ============================================================================== --- stable/8/sys/sys/file.h Tue Aug 28 18:44:56 2012 (r239787) +++ stable/8/sys/sys/file.h Tue Aug 28 18:45:20 2012 (r239788) @@ -117,6 +117,8 @@ struct fadvise_info { int fa_advice; /* (f) FADV_* type. */ off_t fa_start; /* (f) Region start. */ off_t fa_end; /* (f) Region end. */ + off_t fa_prevstart; /* (f) Previous NOREUSE start. */ + off_t fa_prevend; /* (f) Previous NOREUSE end. */ }; struct file {
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201208281845.q7SIjKm4011026>