Skip site navigation (1) | Skip section navigation (2)
Date:      Tue, 28 Aug 2012 18:45:20 +0000 (UTC)
From:      John Baldwin <jhb@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-stable@freebsd.org, svn-src-stable-8@freebsd.org
Subject:   svn commit: r239788 - in stable/8: lib/libc/sys sys/kern sys/sys
Message-ID:  <201208281845.q7SIjKm4011026@svn.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: jhb
Date: Tue Aug 28 18:45:20 2012
New Revision: 239788
URL: http://svn.freebsd.org/changeset/base/239788

Log:
  MFC 230782,237274:
  Refine the implementation of POSIX_FADV_NOREUSE to perform
  POSIX_FADV_DONTNEED requests on the currently accessed portion of the
  file on each read(2) or write(2) rather than using direct I/O.  This
  gives much better performance including read-ahead and write clustering
  similar to normal read(2) and write(2) calls.
  
  If subsequent read(2) and write(2) calls are sequential, then the
  POSIX_FADV_DONTNEED requests will cover the entire sequentially-accessed
  range.

Modified:
  stable/8/lib/libc/sys/posix_fadvise.2
  stable/8/sys/kern/vfs_syscalls.c
  stable/8/sys/kern/vfs_vnops.c
  stable/8/sys/sys/file.h
Directory Properties:
  stable/8/lib/libc/   (props changed)
  stable/8/lib/libc/stdtime/   (props changed)
  stable/8/lib/libc/sys/   (props changed)
  stable/8/lib/libc/uuid/   (props changed)
  stable/8/sys/   (props changed)
  stable/8/sys/amd64/include/xen/   (props changed)
  stable/8/sys/boot/   (props changed)
  stable/8/sys/cddl/   (props changed)
  stable/8/sys/cddl/contrib/opensolaris/   (props changed)
  stable/8/sys/compat/   (props changed)
  stable/8/sys/contrib/dev/acpica/   (props changed)
  stable/8/sys/contrib/pf/   (props changed)
  stable/8/sys/dev/   (props changed)
  stable/8/sys/dev/e1000/   (props changed)
  stable/8/sys/dev/sound/   (props changed)
  stable/8/sys/dev/sound/pci/   (props changed)
  stable/8/sys/dev/virtio/   (props changed)
  stable/8/sys/kern/   (props changed)
  stable/8/sys/sys/   (props changed)
  stable/8/sys/vm/   (props changed)

Modified: stable/8/lib/libc/sys/posix_fadvise.2
==============================================================================
--- stable/8/lib/libc/sys/posix_fadvise.2	Tue Aug 28 18:44:56 2012	(r239787)
+++ stable/8/lib/libc/sys/posix_fadvise.2	Tue Aug 28 18:45:20 2012	(r239788)
@@ -28,7 +28,7 @@
 .\"	@(#)madvise.2	8.1 (Berkeley) 6/9/93
 .\" $FreeBSD$
 .\"
-.Dd February 25, 2012
+.Dd June 19, 2012
 .Dt POSIX_FADVISE 2
 .Os
 .Sh NAME
@@ -84,10 +84,9 @@ specified range and future access to thi
 .It Dv POSIX_FADV_NOREUSE
 Tells the system that the specified data will only be accessed once and
 then not reused.
-Accesses to data within the specified range are treated as if the file
-descriptor has the
-.Dv O_DIRECT
-flag enabled.
+The system may decrease the in-memory priority of data once it has been
+read or written.
+Future access to this data may require a read operation.
 .El
 .Pp
 .Sh RETURN VALUES

Modified: stable/8/sys/kern/vfs_syscalls.c
==============================================================================
--- stable/8/sys/kern/vfs_syscalls.c	Tue Aug 28 18:44:56 2012	(r239787)
+++ stable/8/sys/kern/vfs_syscalls.c	Tue Aug 28 18:45:20 2012	(r239788)
@@ -4824,6 +4824,8 @@ kern_posix_fadvise(struct thread *td, in
 			new->fa_advice = advice;
 			new->fa_start = offset;
 			new->fa_end = end;
+			new->fa_prevstart = 0;
+			new->fa_prevend = 0;
 			fp->f_advice = new;
 			new = fa;
 		}

Modified: stable/8/sys/kern/vfs_vnops.c
==============================================================================
--- stable/8/sys/kern/vfs_vnops.c	Tue Aug 28 18:44:56 2012	(r239787)
+++ stable/8/sys/kern/vfs_vnops.c	Tue Aug 28 18:45:20 2012	(r239788)
@@ -512,6 +512,7 @@ vn_read(fp, uio, active_cred, flags, td)
 	int error, ioflag;
 	struct mtx *mtxp;
 	int advice, vfslocked;
+	off_t offset, start, end;
 
 	KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
 	    uio->uio_td, td));
@@ -551,19 +552,14 @@ vn_read(fp, uio, active_cred, flags, td)
 	switch (advice) {
 	case POSIX_FADV_NORMAL:
 	case POSIX_FADV_SEQUENTIAL:
+	case POSIX_FADV_NOREUSE:
 		ioflag |= sequential_heuristic(uio, fp);
 		break;
 	case POSIX_FADV_RANDOM:
 		/* Disable read-ahead for random I/O. */
 		break;
-	case POSIX_FADV_NOREUSE:
-		/*
-		 * Request the underlying FS to discard the buffers
-		 * and pages after the I/O is complete.
-		 */
-		ioflag |= IO_DIRECT;
-		break;
 	}
+	offset = uio->uio_offset;
 
 #ifdef MAC
 	error = mac_vnode_check_read(active_cred, fp->f_cred, vp);
@@ -580,6 +576,39 @@ vn_read(fp, uio, active_cred, flags, td)
 	}
 	fp->f_nextoff = uio->uio_offset;
 	VOP_UNLOCK(vp, 0);
+	if (error == 0 && advice == POSIX_FADV_NOREUSE &&
+	    offset != uio->uio_offset) {
+		/*
+		 * Use POSIX_FADV_DONTNEED to flush clean pages and
+		 * buffers for the backing file after a
+		 * POSIX_FADV_NOREUSE read(2).  To optimize the common
+		 * case of using POSIX_FADV_NOREUSE with sequential
+		 * access, track the previous implicit DONTNEED
+		 * request and grow this request to include the
+		 * current read(2) in addition to the previous
+		 * DONTNEED.  With purely sequential access this will
+		 * cause the DONTNEED requests to continuously grow to
+		 * cover all of the previously read regions of the
+		 * file.  This allows filesystem blocks that are
+		 * accessed by multiple calls to read(2) to be flushed
+		 * once the last read(2) finishes.
+		 */
+		start = offset;
+		end = uio->uio_offset - 1;
+		mtx_lock(mtxp);
+		if (fp->f_advice != NULL &&
+		    fp->f_advice->fa_advice == POSIX_FADV_NOREUSE) {
+			if (start != 0 && fp->f_advice->fa_prevend + 1 == start)
+				start = fp->f_advice->fa_prevstart;
+			else if (fp->f_advice->fa_prevstart != 0 &&
+			    fp->f_advice->fa_prevstart == end + 1)
+				end = fp->f_advice->fa_prevend;
+			fp->f_advice->fa_prevstart = start;
+			fp->f_advice->fa_prevend = end;
+		}
+		mtx_unlock(mtxp);
+		error = VOP_ADVISE(vp, start, end, POSIX_FADV_DONTNEED);
+	}
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
@@ -600,6 +629,7 @@ vn_write(fp, uio, active_cred, flags, td
 	int error, ioflag, lock_flags;
 	struct mtx *mtxp;
 	int advice, vfslocked;
+	off_t offset, start, end;
 
 	KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
 	    uio->uio_td, td));
@@ -634,6 +664,7 @@ vn_write(fp, uio, active_cred, flags, td
 	if ((flags & FOF_OFFSET) == 0)
 		uio->uio_offset = fp->f_offset;
 	advice = POSIX_FADV_NORMAL;
+	mtxp = NULL;
 	if (fp->f_advice != NULL) {
 		mtxp = mtx_pool_find(mtxpool_sleep, fp);
 		mtx_lock(mtxp);
@@ -646,19 +677,14 @@ vn_write(fp, uio, active_cred, flags, td
 	switch (advice) {
 	case POSIX_FADV_NORMAL:
 	case POSIX_FADV_SEQUENTIAL:
+	case POSIX_FADV_NOREUSE:
 		ioflag |= sequential_heuristic(uio, fp);
 		break;
 	case POSIX_FADV_RANDOM:
 		/* XXX: Is this correct? */
 		break;
-	case POSIX_FADV_NOREUSE:
-		/*
-		 * Request the underlying FS to discard the buffers
-		 * and pages after the I/O is complete.
-		 */
-		ioflag |= IO_DIRECT;
-		break;
 	}
+	offset = uio->uio_offset;
 
 #ifdef MAC
 	error = mac_vnode_check_write(active_cred, fp->f_cred, vp);
@@ -671,6 +697,55 @@ vn_write(fp, uio, active_cred, flags, td
 	VOP_UNLOCK(vp, 0);
 	if (vp->v_type != VCHR)
 		vn_finished_write(mp);
+	if (error == 0 && advice == POSIX_FADV_NOREUSE &&
+	    offset != uio->uio_offset) {
+		/*
+		 * Use POSIX_FADV_DONTNEED to flush clean pages and
+		 * buffers for the backing file after a
+		 * POSIX_FADV_NOREUSE write(2).  To optimize the
+		 * common case of using POSIX_FADV_NOREUSE with
+		 * sequential access, track the previous implicit
+		 * DONTNEED request and grow this request to include
+		 * the current write(2) in addition to the previous
+		 * DONTNEED.  With purely sequential access this will
+		 * cause the DONTNEED requests to continuously grow to
+		 * cover all of the previously written regions of the
+		 * file.
+		 *
+		 * Note that the blocks just written are almost
+		 * certainly still dirty, so this only works when
+		 * VOP_ADVISE() calls from subsequent writes push out
+		 * the data written by this write(2) once the backing
+		 * buffers are clean.  However, as compared to forcing
+		 * IO_DIRECT, this gives much saner behavior.  Write
+		 * clustering is still allowed, and clean pages are
+		 * merely moved to the cache page queue rather than
+		 * outright thrown away.  This means a subsequent
+		 * read(2) can still avoid hitting the disk if the
+		 * pages have not been reclaimed.
+		 *
+		 * This does make POSIX_FADV_NOREUSE largely useless
+		 * with non-sequential access.  However, sequential
+		 * access is the more common use case and the flag is
+		 * merely advisory.
+		 */
+		start = offset;
+		end = uio->uio_offset - 1;
+		mtx_lock(mtxp);
+		if (fp->f_advice != NULL &&
+		    fp->f_advice->fa_advice == POSIX_FADV_NOREUSE) {
+			if (start != 0 && fp->f_advice->fa_prevend + 1 == start)
+				start = fp->f_advice->fa_prevstart;
+			else if (fp->f_advice->fa_prevstart != 0 &&
+			    fp->f_advice->fa_prevstart == end + 1)
+				end = fp->f_advice->fa_prevend;
+			fp->f_advice->fa_prevstart = start;
+			fp->f_advice->fa_prevend = end;
+		}
+		mtx_unlock(mtxp);
+		error = VOP_ADVISE(vp, start, end, POSIX_FADV_DONTNEED);
+	}
+	
 unlock:
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);

Modified: stable/8/sys/sys/file.h
==============================================================================
--- stable/8/sys/sys/file.h	Tue Aug 28 18:44:56 2012	(r239787)
+++ stable/8/sys/sys/file.h	Tue Aug 28 18:45:20 2012	(r239788)
@@ -117,6 +117,8 @@ struct fadvise_info {
 	int		fa_advice;	/* (f) FADV_* type. */
 	off_t		fa_start;	/* (f) Region start. */
 	off_t		fa_end;		/* (f) Region end. */
+	off_t		fa_prevstart;	/* (f) Previous NOREUSE start. */
+	off_t		fa_prevend;	/* (f) Previous NOREUSE end. */
 };
 
 struct file {



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201208281845.q7SIjKm4011026>