Date:      Fri, 4 Nov 2011 04:02:50 +0000 (UTC)
From:      John Baldwin <jhb@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject:   svn commit: r227070 - in head: lib/libc/sys sys/compat/freebsd32 sys/kern sys/sys sys/vm
Message-ID:  <201111040402.pA442oYZ000317@svn.freebsd.org>

Author: jhb
Date: Fri Nov  4 04:02:50 2011
New Revision: 227070
URL: http://svn.freebsd.org/changeset/base/227070

Log:
  Add the posix_fadvise(2) system call.  It is somewhat similar to
  madvise(2) except that it operates on a file descriptor instead of a
  memory region.  It is currently only supported on regular files.
  
  Just as with madvise(2), the advice given to posix_fadvise(2) can be
  divided into two types.  The first type provides hints about data access
  patterns; these are used in the file read and write routines to modify the
  I/O flags passed down to VOP_READ() and VOP_WRITE().  These modes are
  thus filesystem independent.  Note that to ease implementation (and
  since this API is only advisory anyway), only a single non-normal
  range is allowed per file descriptor.
  
  The second type of hint tells the OS whether or not the data will be
  used.  These hints are implemented via a new VOP_ADVISE().
  A default implementation is provided which does nothing for the WILLNEED
  request and attempts to move any clean pages to the cache page queue for
  the DONTNEED request.  This latter case required two other changes.
  First, a new V_CLEANONLY flag was added to vinvalbuf().  This requests
  vinvalbuf() to only flush clean buffers for the vnode from the buffer
  cache and to not remove any backing pages from the vnode.  This is
  used to ensure clean pages are not wired into the buffer cache before
  attempting to move them to the cache page queue.  The second change adds
  a new vm_object_page_cache() method.  This method is somewhat similar to
  vm_object_page_remove() except that instead of freeing each page in the
  specified range, it attempts to move clean pages to the cache queue if
  possible.
  
  To preserve the ABI of struct file, the f_cdevpriv pointer is now reused
  in a union to point to the currently active advice region if one is
  present for regular files.
  
  Reviewed by:	jilles, kib, arch@
  Approved by:	re (kib)
  MFC after:	1 month
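
For a sense of how the two kinds of hints described above are meant to be
used from userland, here is a minimal sketch (the path and buffer size are
arbitrary, and error handling is kept to a bare minimum):

#include <err.h>
#include <fcntl.h>
#include <unistd.h>

int
main(void)
{
	char buf[64 * 1024];
	ssize_t n;
	int fd;

	fd = open("/tmp/example.dat", O_RDONLY);	/* path is arbitrary */
	if (fd == -1)
		err(1, "open");

	/* First kind of hint: describe the expected access pattern. */
	if (posix_fadvise(fd, 0, 0, POSIX_FADV_SEQUENTIAL) != 0)
		warnx("POSIX_FADV_SEQUENTIAL hint not applied");

	while ((n = read(fd, buf, sizeof(buf))) > 0)
		;	/* consume the file sequentially */

	/* Second kind of hint: this data will not be needed again soon. */
	if (posix_fadvise(fd, 0, 0, POSIX_FADV_DONTNEED) != 0)
		warnx("POSIX_FADV_DONTNEED hint not applied");

	close(fd);
	return (0);
}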

Added:
  head/lib/libc/sys/posix_fadvise.2   (contents, props changed)
Modified:
  head/lib/libc/sys/Makefile.inc
  head/lib/libc/sys/Symbol.map
  head/lib/libc/sys/madvise.2
  head/sys/compat/freebsd32/freebsd32_misc.c
  head/sys/compat/freebsd32/syscalls.master
  head/sys/kern/syscalls.master
  head/sys/kern/vfs_default.c
  head/sys/kern/vfs_subr.c
  head/sys/kern/vfs_syscalls.c
  head/sys/kern/vfs_vnops.c
  head/sys/kern/vnode_if.src
  head/sys/sys/fcntl.h
  head/sys/sys/file.h
  head/sys/sys/param.h
  head/sys/sys/vnode.h
  head/sys/vm/vm_object.c
  head/sys/vm/vm_object.h

Modified: head/lib/libc/sys/Makefile.inc
==============================================================================
--- head/lib/libc/sys/Makefile.inc	Fri Nov  4 03:39:31 2011	(r227069)
+++ head/lib/libc/sys/Makefile.inc	Fri Nov  4 04:02:50 2011	(r227070)
@@ -96,7 +96,8 @@ MAN+=	abort2.2 accept.2 access.2 acct.2 
 	mq_setattr.2 \
 	msgctl.2 msgget.2 msgrcv.2 msgsnd.2 \
 	msync.2 munmap.2 nanosleep.2 nfssvc.2 ntp_adjtime.2 open.2 \
-	pathconf.2 pdfork.2 pipe.2 poll.2 posix_fallocate.2 posix_openpt.2 profil.2 \
+	pathconf.2 pdfork.2 pipe.2 poll.2 posix_fadvise.2 posix_fallocate.2 \
+	posix_openpt.2 profil.2 \
 	pselect.2 ptrace.2 quotactl.2 \
 	read.2 readlink.2 reboot.2 recv.2 rename.2 revoke.2 rfork.2 rmdir.2 \
 	rtprio.2

Modified: head/lib/libc/sys/Symbol.map
==============================================================================
--- head/lib/libc/sys/Symbol.map	Fri Nov  4 03:39:31 2011	(r227069)
+++ head/lib/libc/sys/Symbol.map	Fri Nov  4 04:02:50 2011	(r227070)
@@ -378,6 +378,10 @@ FBSD_1.2 {
 	setloginclass;
 };
 
+FBSD_1.3 {
+	posix_fadvise;
+};
+
 FBSDprivate_1.0 {
 	___acl_aclcheck_fd;
 	__sys___acl_aclcheck_fd;

Modified: head/lib/libc/sys/madvise.2
==============================================================================
--- head/lib/libc/sys/madvise.2	Fri Nov  4 03:39:31 2011	(r227069)
+++ head/lib/libc/sys/madvise.2	Fri Nov  4 04:02:50 2011	(r227070)
@@ -169,7 +169,8 @@ was specified and the process does not h
 .Xr mincore 2 ,
 .Xr mprotect 2 ,
 .Xr msync 2 ,
-.Xr munmap 2
+.Xr munmap 2 ,
+.Xr posix_fadvise 2
 .Sh STANDARDS
 The
 .Fn posix_madvise

Added: head/lib/libc/sys/posix_fadvise.2
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ head/lib/libc/sys/posix_fadvise.2	Fri Nov  4 04:02:50 2011	(r227070)
@@ -0,0 +1,139 @@
+.\" Copyright (c) 1991, 1993
+.\"	The Regents of the University of California.  All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\"    notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\"    notice, this list of conditions and the following disclaimer in the
+.\"    documentation and/or other materials provided with the distribution.
+.\" 4. Neither the name of the University nor the names of its contributors
+.\"    may be used to endorse or promote products derived from this software
+.\"    without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\"	@(#)madvise.2	8.1 (Berkeley) 6/9/93
+.\" $FreeBSD$
+.\"
+.Dd October 26, 2011
+.Dt POSIX_FADVISE 2
+.Os
+.Sh NAME
+.Nm posix_fadvise
+.Nd give advice about use of file data
+.Sh LIBRARY
+.Lb libc
+.Sh SYNOPSIS
+.In fcntl.h
+.Ft int
+.Fn posix_fadvise "int fd" "off_t offset" "off_t len" "int advice"
+.Sh DESCRIPTION
+The
+.Fn posix_fadvise
+system call
+allows a process to describe to the system its data access behavior for an
+open file descriptor
+.Fa fd .
+The advice covers the data starting at offset
+.Fa offset
+and continuing for
+.Fa len
+bytes.
+If
+.Fa len
+is zero,
+all data from
+.Fa offset
+to the end of the file is covered.
+.Pp
+The behavior is specified by the
+.Fa advice
+parameter and may be one of:
+.Bl -tag -width POSIX_FADV_SEQUENTIAL
+.It Dv POSIX_FADV_NORMAL
+Tells the system to revert to the default data access behavior.
+.It Dv POSIX_FADV_RANDOM
+Is a hint that file data will be accessed randomly,
+and prefetching is likely not advantageous.
+.It Dv POSIX_FADV_SEQUENTIAL
+Tells the system that file data will be accessed sequentially.
+This currently does nothing as the default behavior uses heuristics to
+detect sequential behavior.
+.It Dv POSIX_FADV_WILLNEED
+Tells the system that the specified data will be accessed in the near future.
+The system may initiate an asynchronous read of the data if it is not already
+present in memory.
+.It Dv POSIX_FADV_DONTNEED
+Tells the system that the specified data will not be accessed in the near
+future.
+The system may decrease the in-memory priority of clean data within the
+specified range and future access to this data may require a read operation.
+.It Dv POSIX_FADV_NOREUSE
+Tells the system that the specified data will only be accessed once and
+then not reused.
+Accesses to data within the specified range are treated as if the file
+descriptor has the
+.Dv O_DIRECT
+flag enabled.
+.El
+.Pp
+.Sh RETURN VALUES
+.Rv -std posix_fadvise
+.Sh ERRORS
+The
+.Fn posix_fadvise
+system call will fail if:
+.Bl -tag -width Er
+.It Bq Er EBADF
+The
+.Fa fd
+argument is not a valid file descriptor.
+.It Bq Er EINVAL
+The
+.Fa advice
+argument is not valid.
+.It Bq Er EINVAL
+The
+.Fa offset
+or
+.Fa len
+arguments are negative,
+or
+.Fa offset
++
+.Fa len
+is greater than the maximum file size.
+.It Bq Er ENODEV
+The
+.Fa fd
+argument does not refer to a regular file.
+.It Bq Er ESPIPE
+The
+.Fa fd
+argument is associated with a pipe or FIFO.
+.El
+.Sh SEE ALSO
+.Xr madvise 2
+.Sh STANDARDS
+The
+.Fn posix_fadvise
+interface conforms to
+.St -p1003.1-2001 .
+.Sh HISTORY
+The
+.Fn posix_fadvise
+system call first appeared in
+.Fx 10.0 .
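
The ERRORS section above can be exercised directly; a quick sketch, written
under the assumption that any non-zero result indicates failure and that the
specific error reported is the one documented:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	int fds[2];

	/* A pipe is not a regular file, so the call fails (ESPIPE). */
	if (pipe(fds) == 0 &&
	    posix_fadvise(fds[0], 0, 0, POSIX_FADV_NORMAL) != 0)
		printf("pipe rejected, as documented\n");

	/* Negative offsets and lengths are rejected (EINVAL). */
	if (posix_fadvise(STDIN_FILENO, -1, 0, POSIX_FADV_NORMAL) != 0)
		printf("negative offset rejected\n");

	return (0);
}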

Modified: head/sys/compat/freebsd32/freebsd32_misc.c
==============================================================================
--- head/sys/compat/freebsd32/freebsd32_misc.c	Fri Nov  4 03:39:31 2011	(r227069)
+++ head/sys/compat/freebsd32/freebsd32_misc.c	Fri Nov  4 04:02:50 2011	(r227070)
@@ -2835,3 +2835,16 @@ freebsd32_posix_fallocate(struct thread 
 	ap.len = PAIR32TO64(off_t, uap->len);
 	return (sys_posix_fallocate(td, &ap));
 }
+
+int
+freebsd32_posix_fadvise(struct thread *td,
+    struct freebsd32_posix_fadvise_args *uap)
+{
+	struct posix_fadvise_args ap;
+
+	ap.fd = uap->fd;
+	ap.offset = PAIR32TO64(off_t, uap->offset);
+	ap.len = PAIR32TO64(off_t, uap->len);
+	ap.advice = uap->advice;
+	return (sys_posix_fadvise(td, &ap));
+}

Modified: head/sys/compat/freebsd32/syscalls.master
==============================================================================
--- head/sys/compat/freebsd32/syscalls.master	Fri Nov  4 03:39:31 2011	(r227069)
+++ head/sys/compat/freebsd32/syscalls.master	Fri Nov  4 04:02:50 2011	(r227070)
@@ -991,4 +991,7 @@
 530	AUE_NULL	STD	{ int freebsd32_posix_fallocate(int fd,\
 				    uint32_t offset1, uint32_t offset2,\
 				    uint32_t len1, uint32_t len2); }
-531	AUE_NULL	UNIMPL	posix_fadvise
+531	AUE_NULL	STD	{ int freebsd32_posix_fadvise(int fd, \
+				    uint32_t offset1, uint32_t offset2,\
+				    uint32_t len1, uint32_t len2, \
+				    int advice); }

Modified: head/sys/kern/syscalls.master
==============================================================================
--- head/sys/kern/syscalls.master	Fri Nov  4 03:39:31 2011	(r227069)
+++ head/sys/kern/syscalls.master	Fri Nov  4 04:02:50 2011	(r227070)
@@ -947,6 +947,7 @@
 				    size_t outbuflen); }
 530	AUE_NULL	STD	{ int posix_fallocate(int fd, \
 				    off_t offset, off_t len); }
-531	AUE_NULL	UNIMPL	posix_fadvise
+531	AUE_NULL	STD	{ int posix_fadvise(int fd, off_t offset, \
+				    off_t len, int advice); }
 ; Please copy any additions and changes to the following compatability tables:
 ; sys/compat/freebsd32/syscalls.master

Modified: head/sys/kern/vfs_default.c
==============================================================================
--- head/sys/kern/vfs_default.c	Fri Nov  4 03:39:31 2011	(r227069)
+++ head/sys/kern/vfs_default.c	Fri Nov  4 04:02:50 2011	(r227070)
@@ -96,6 +96,7 @@ struct vop_vector default_vnodeops = {
 
 	.vop_access =		vop_stdaccess,
 	.vop_accessx =		vop_stdaccessx,
+	.vop_advise =		vop_stdadvise,
 	.vop_advlock =		vop_stdadvlock,
 	.vop_advlockasync =	vop_stdadvlockasync,
 	.vop_advlockpurge =	vop_stdadvlockpurge,
@@ -984,6 +985,58 @@ vop_stdallocate(struct vop_allocate_args
 	return (error);
 }
 
+int
+vop_stdadvise(struct vop_advise_args *ap)
+{
+	struct vnode *vp;
+	off_t start, end;
+	int error, vfslocked;
+
+	vp = ap->a_vp;
+	switch (ap->a_advice) {
+	case POSIX_FADV_WILLNEED:
+		/*
+		 * Do nothing for now.  Filesystems should provide a
+		 * custom method which starts an asynchronous read of
+		 * the requested region.
+		 */
+		error = 0;
+		break;
+	case POSIX_FADV_DONTNEED:
+		/*
+		 * Flush any open FS buffers and then remove pages
+		 * from the backing VM object.  Using vinvalbuf() here
+		 * is a bit heavy-handed as it flushes all buffers for
+		 * the given vnode, not just the buffers covering the
+		 * requested range.
+		 */
+		error = 0;
+		vfslocked = VFS_LOCK_GIANT(vp->v_mount);
+		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+		if (vp->v_iflag & VI_DOOMED) {
+			VOP_UNLOCK(vp, 0);
+			VFS_UNLOCK_GIANT(vfslocked);
+			break;
+		}
+		vinvalbuf(vp, V_CLEANONLY, 0, 0);
+		if (vp->v_object != NULL) {
+			start = trunc_page(ap->a_start);
+			end = round_page(ap->a_end);
+			VM_OBJECT_LOCK(vp->v_object);
+			vm_object_page_cache(vp->v_object, OFF_TO_IDX(start),
+			    OFF_TO_IDX(end));
+			VM_OBJECT_UNLOCK(vp->v_object);
+		}
+		VOP_UNLOCK(vp, 0);
+		VFS_UNLOCK_GIANT(vfslocked);
+		break;
+	default:
+		error = EINVAL;
+		break;
+	}
+	return (error);
+}
+
 /*
  * vfs default ops
  * used to fill the vfs function table to get reasonable default return values.
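
The comment in vop_stdadvise() notes that filesystems should supply their own
WILLNEED handling.  A rough sketch of how a filesystem might hook this up
("myfs" and its layout are hypothetical, and the usual kernel includes are
omitted):

/* In a filesystem's vnops file, e.g. sys/fs/myfs/myfs_vnops.c (hypothetical). */

static int
myfs_advise(struct vop_advise_args *ap)
{

	switch (ap->a_advice) {
	case POSIX_FADV_WILLNEED:
		/*
		 * A real filesystem would start asynchronous reads of
		 * the range [ap->a_start, ap->a_end] on ap->a_vp here.
		 */
		return (0);
	default:
		/* Fall back to the default implementation added above. */
		return (vop_stdadvise(ap));
	}
}

struct vop_vector myfs_vnodeops = {
	.vop_default =		&default_vnodeops,
	.vop_advise =		myfs_advise,
	/* ... remaining operations ... */
};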

Modified: head/sys/kern/vfs_subr.c
==============================================================================
--- head/sys/kern/vfs_subr.c	Fri Nov  4 03:39:31 2011	(r227069)
+++ head/sys/kern/vfs_subr.c	Fri Nov  4 04:02:50 2011	(r227070)
@@ -1191,7 +1191,7 @@ bufobj_invalbuf(struct bufobj *bo, int f
 	do {
 		error = flushbuflist(&bo->bo_clean,
 		    flags, bo, slpflag, slptimeo);
-		if (error == 0)
+		if (error == 0 && !(flags & V_CLEANONLY))
 			error = flushbuflist(&bo->bo_dirty,
 			    flags, bo, slpflag, slptimeo);
 		if (error != 0 && error != EAGAIN) {
@@ -1220,7 +1220,8 @@ bufobj_invalbuf(struct bufobj *bo, int f
 	/*
 	 * Destroy the copy in the VM cache, too.
 	 */
-	if (bo->bo_object != NULL && (flags & (V_ALT | V_NORMAL)) == 0) {
+	if (bo->bo_object != NULL &&
+	    (flags & (V_ALT | V_NORMAL | V_CLEANONLY)) == 0) {
 		VM_OBJECT_LOCK(bo->bo_object);
 		vm_object_page_remove(bo->bo_object, 0, 0, (flags & V_SAVE) ?
 		    OBJPR_CLEANONLY : 0);
@@ -1229,7 +1230,7 @@ bufobj_invalbuf(struct bufobj *bo, int f
 
 #ifdef INVARIANTS
 	BO_LOCK(bo);
-	if ((flags & (V_ALT | V_NORMAL)) == 0 &&
+	if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY)) == 0 &&
 	    (bo->bo_dirty.bv_cnt > 0 || bo->bo_clean.bv_cnt > 0))
 		panic("vinvalbuf: flush failed");
 	BO_UNLOCK(bo);

Modified: head/sys/kern/vfs_syscalls.c
==============================================================================
--- head/sys/kern/vfs_syscalls.c	Fri Nov  4 03:39:31 2011	(r227069)
+++ head/sys/kern/vfs_syscalls.c	Fri Nov  4 04:02:50 2011	(r227070)
@@ -86,6 +86,8 @@ __FBSDID("$FreeBSD$");
 #include <vm/vm_page.h>
 #include <vm/uma.h>
 
+static MALLOC_DEFINE(M_FADVISE, "fadvise", "posix_fadvise(2) information");
+
 SDT_PROVIDER_DEFINE(vfs);
 SDT_PROBE_DEFINE(vfs, , stat, mode, mode);
 SDT_PROBE_ARGTYPE(vfs, , stat, mode, 0, "char *");
@@ -4845,3 +4847,135 @@ sys_posix_fallocate(struct thread *td, s
 
 	return (kern_posix_fallocate(td, uap->fd, uap->offset, uap->len));
 }
+
+/*
+ * Unlike madvise(2), we do not make a best effort to remember every
+ * possible caching hint.  Instead, we remember the last setting with
+ * the exception that we will allow POSIX_FADV_NORMAL to adjust the
+ * region of any current setting.
+ */
+int
+sys_posix_fadvise(struct thread *td, struct posix_fadvise_args *uap)
+{
+	struct fadvise_info *fa, *new;
+	struct file *fp;
+	struct vnode *vp;
+	off_t end;
+	int error;
+
+	if (uap->offset < 0 || uap->len < 0 ||
+	    uap->offset > OFF_MAX - uap->len)
+		return (EINVAL);
+	switch (uap->advice) {
+	case POSIX_FADV_SEQUENTIAL:
+	case POSIX_FADV_RANDOM:
+	case POSIX_FADV_NOREUSE:
+		new = malloc(sizeof(*fa), M_FADVISE, M_WAITOK);
+		break;
+	case POSIX_FADV_NORMAL:
+	case POSIX_FADV_WILLNEED:
+	case POSIX_FADV_DONTNEED:
+		new = NULL;
+		break;
+	default:
+		return (EINVAL);
+	}
+	/* XXX: CAP_POSIX_FADVISE? */
+	error = fget(td, uap->fd, 0, &fp);
+	if (error != 0)
+		goto out;
+	
+	switch (fp->f_type) {
+	case DTYPE_VNODE:
+		break;
+	case DTYPE_PIPE:
+	case DTYPE_FIFO:
+		error = ESPIPE;
+		goto out;
+	default:
+		error = ENODEV;
+		goto out;
+	}
+	vp = fp->f_vnode;
+	if (vp->v_type != VREG) {
+		error = ENODEV;
+		goto out;
+	}
+	if (uap->len == 0)
+		end = OFF_MAX;
+	else
+		end = uap->offset + uap->len - 1;
+	switch (uap->advice) {
+	case POSIX_FADV_SEQUENTIAL:
+	case POSIX_FADV_RANDOM:
+	case POSIX_FADV_NOREUSE:
+		/*
+		 * Try to merge any existing non-standard region with
+		 * this new region if possible, otherwise create a new
+		 * non-standard region for this request.
+		 */
+		mtx_pool_lock(mtxpool_sleep, fp);
+		fa = fp->f_advice;
+		if (fa != NULL && fa->fa_advice == uap->advice &&
+		    ((fa->fa_start <= end && fa->fa_end >= uap->offset) ||
+		    (end != OFF_MAX && fa->fa_start == end + 1) ||
+		    (fa->fa_end != OFF_MAX && fa->fa_end + 1 == uap->offset))) {
+			if (uap->offset < fa->fa_start)
+				fa->fa_start = uap->offset;
+			if (end > fa->fa_end)
+				fa->fa_end = end;
+		} else {
+			new->fa_advice = uap->advice;
+			new->fa_start = uap->offset;
+			new->fa_end = end;
+			fp->f_advice = new;
+			new = fa;
+		}
+		mtx_pool_unlock(mtxpool_sleep, fp);
+		break;
+	case POSIX_FADV_NORMAL:
+		/*
+		 * If the "normal" region overlaps with an existing
+		 * non-standard region, trim or remove the
+		 * non-standard region.
+		 */
+		mtx_pool_lock(mtxpool_sleep, fp);
+		fa = fp->f_advice;
+		if (fa != NULL) {
+			if (uap->offset <= fa->fa_start &&
+			    end >= fa->fa_end) {
+				new = fa;
+				fp->f_advice = NULL;
+			} else if (uap->offset <= fa->fa_start &&
+			    end >= fa->fa_start)
+				fa->fa_start = end + 1;
+			else if (uap->offset <= fa->fa_end &&
+			    end >= fa->fa_end)
+				fa->fa_end = uap->offset - 1;
+			else if (uap->offset >= fa->fa_start &&
+			    end <= fa->fa_end) {
+				/*
+				 * If the "normal" region is a middle
+				 * portion of the existing
+				 * non-standard region, just remove
+				 * the whole thing rather than picking
+				 * one side or the other to
+				 * preserve.
+				 */
+				new = fa;
+				fp->f_advice = NULL;
+			}
+		}
+		mtx_pool_unlock(mtxpool_sleep, fp);
+		break;
+	case POSIX_FADV_WILLNEED:
+	case POSIX_FADV_DONTNEED:
+		error = VOP_ADVISE(vp, uap->offset, end, uap->advice);
+		break;
+	}
+out:
+	if (fp != NULL)
+		fdrop(fp, td);
+	free(new, M_FADVISE);
+	return (error);
+}
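
Since the merge/trim logic above is easy to misread, a small userland model
of the single-region bookkeeping may help (plain C, not kernel code; the
OFF_MAX edge cases are deliberately ignored):

#include <stdbool.h>
#include <stdio.h>

/*
 * Userland model of the bookkeeping above: at most one non-NORMAL
 * region is remembered per descriptor, and a new hint of the same
 * type either grows that region (when they overlap or touch) or
 * replaces it.  The OFF_MAX overflow guards in the kernel code are
 * omitted here for brevity.
 */
struct region {
	bool		valid;
	int		advice;
	long long	start;
	long long	end;
};

static void
remember(struct region *r, int advice, long long start, long long end)
{

	if (r->valid && r->advice == advice &&
	    start <= r->end + 1 && end + 1 >= r->start) {
		/* Overlapping or adjacent region of the same type: grow it. */
		if (start < r->start)
			r->start = start;
		if (end > r->end)
			r->end = end;
	} else {
		/* Otherwise the latest hint simply replaces the old one. */
		r->valid = true;
		r->advice = advice;
		r->start = start;
		r->end = end;
	}
}

int
main(void)
{
	struct region r = { .valid = false };

	remember(&r, 1, 0, 4095);	/* e.g. POSIX_FADV_RANDOM */
	remember(&r, 1, 4096, 8191);	/* adjacent: merged, not replaced */
	printf("advice %d covers [%lld, %lld]\n", r.advice, r.start, r.end);
	return (0);
}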

Modified: head/sys/kern/vfs_vnops.c
==============================================================================
--- head/sys/kern/vfs_vnops.c	Fri Nov  4 03:39:31 2011	(r227069)
+++ head/sys/kern/vfs_vnops.c	Fri Nov  4 04:02:50 2011	(r227070)
@@ -518,7 +518,7 @@ vn_read(fp, uio, active_cred, flags, td)
 	struct vnode *vp;
 	int error, ioflag;
 	struct mtx *mtxp;
-	int vfslocked;
+	int advice, vfslocked;
 
 	KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
 	    uio->uio_td, td));
@@ -529,27 +529,48 @@ vn_read(fp, uio, active_cred, flags, td)
 		ioflag |= IO_NDELAY;
 	if (fp->f_flag & O_DIRECT)
 		ioflag |= IO_DIRECT;
+	advice = POSIX_FADV_NORMAL;
 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 	/*
 	 * According to McKusick the vn lock was protecting f_offset here.
 	 * It is now protected by the FOFFSET_LOCKED flag.
 	 */
-	if ((flags & FOF_OFFSET) == 0) {
+	if ((flags & FOF_OFFSET) == 0 || fp->f_advice != NULL) {
 		mtxp = mtx_pool_find(mtxpool_sleep, fp);
 		mtx_lock(mtxp);
-		while(fp->f_vnread_flags & FOFFSET_LOCKED) {
-			fp->f_vnread_flags |= FOFFSET_LOCK_WAITING;
-			msleep(&fp->f_vnread_flags, mtxp, PUSER -1,
-			    "vnread offlock", 0);
+		if ((flags & FOF_OFFSET) == 0) {
+			while (fp->f_vnread_flags & FOFFSET_LOCKED) {
+				fp->f_vnread_flags |= FOFFSET_LOCK_WAITING;
+				msleep(&fp->f_vnread_flags, mtxp, PUSER -1,
+				    "vnread offlock", 0);
+			}
+			fp->f_vnread_flags |= FOFFSET_LOCKED;
+			uio->uio_offset = fp->f_offset;
 		}
-		fp->f_vnread_flags |= FOFFSET_LOCKED;
+		if (fp->f_advice != NULL &&
+		    uio->uio_offset >= fp->f_advice->fa_start &&
+		    uio->uio_offset + uio->uio_resid <= fp->f_advice->fa_end)
+			advice = fp->f_advice->fa_advice;
 		mtx_unlock(mtxp);
-		vn_lock(vp, LK_SHARED | LK_RETRY);
-		uio->uio_offset = fp->f_offset;
-	} else
-		vn_lock(vp, LK_SHARED | LK_RETRY);
+	}
+	vn_lock(vp, LK_SHARED | LK_RETRY);
 
-	ioflag |= sequential_heuristic(uio, fp);
+	switch (advice) {
+	case POSIX_FADV_NORMAL:
+	case POSIX_FADV_SEQUENTIAL:
+		ioflag |= sequential_heuristic(uio, fp);
+		break;
+	case POSIX_FADV_RANDOM:
+		/* Disable read-ahead for random I/O. */
+		break;
+	case POSIX_FADV_NOREUSE:
+		/*
+		 * Request the underlying FS to discard the buffers
+		 * and pages after the I/O is complete.
+		 */
+		ioflag |= IO_DIRECT;
+		break;
+	}
 
 #ifdef MAC
 	error = mac_vnode_check_read(active_cred, fp->f_cred, vp);
@@ -584,7 +605,8 @@ vn_write(fp, uio, active_cred, flags, td
 	struct vnode *vp;
 	struct mount *mp;
 	int error, ioflag, lock_flags;
-	int vfslocked;
+	struct mtx *mtxp;
+	int advice, vfslocked;
 
 	KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
 	    uio->uio_td, td));
@@ -618,7 +640,33 @@ vn_write(fp, uio, active_cred, flags, td
 	vn_lock(vp, lock_flags | LK_RETRY);
 	if ((flags & FOF_OFFSET) == 0)
 		uio->uio_offset = fp->f_offset;
-	ioflag |= sequential_heuristic(uio, fp);
+	advice = POSIX_FADV_NORMAL;
+	if (fp->f_advice != NULL) {
+		mtxp = mtx_pool_find(mtxpool_sleep, fp);
+		mtx_lock(mtxp);
+		if (fp->f_advice != NULL &&
+		    uio->uio_offset >= fp->f_advice->fa_start &&
+		    uio->uio_offset + uio->uio_resid <= fp->f_advice->fa_end)
+			advice = fp->f_advice->fa_advice;
+		mtx_unlock(mtxp);
+	}
+	switch (advice) {
+	case POSIX_FADV_NORMAL:
+	case POSIX_FADV_SEQUENTIAL:
+		ioflag |= sequential_heuristic(uio, fp);
+		break;
+	case POSIX_FADV_RANDOM:
+		/* XXX: Is this correct? */
+		break;
+	case POSIX_FADV_NOREUSE:
+		/*
+		 * Request the underlying FS to discard the buffers
+		 * and pages after the I/O is complete.
+		 */
+		ioflag |= IO_DIRECT;
+		break;
+	}
+
 #ifdef MAC
 	error = mac_vnode_check_write(active_cred, fp->f_cred, vp);
 	if (error == 0)
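
The effect of the advice in vn_read() is easiest to see for
POSIX_FADV_RANDOM: once a region is marked random, reads inside it skip
sequential_heuristic() and therefore get no read-ahead.  A sketch of that
usage pattern (the path, record size, and counts are made up):

#include <err.h>
#include <fcntl.h>
#include <stdlib.h>
#include <unistd.h>

int
main(void)
{
	char record[512];
	off_t where;
	int fd, i;

	fd = open("/tmp/index.db", O_RDONLY);	/* path is arbitrary */
	if (fd == -1)
		err(1, "open");

	/* Scattered point lookups: ask the kernel not to read ahead. */
	(void)posix_fadvise(fd, 0, 0, POSIX_FADV_RANDOM);

	for (i = 0; i < 100; i++) {
		where = (arc4random() % 100000) * (off_t)sizeof(record);
		if (pread(fd, record, sizeof(record), where) == -1)
			err(1, "pread");
	}
	close(fd);
	return (0);
}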

Modified: head/sys/kern/vnode_if.src
==============================================================================
--- head/sys/kern/vnode_if.src	Fri Nov  4 03:39:31 2011	(r227069)
+++ head/sys/kern/vnode_if.src	Fri Nov  4 04:02:50 2011	(r227070)
@@ -628,3 +628,12 @@ vop_allocate {
 	INOUT off_t *offset;
 	INOUT off_t *len;
 };
+
+%% advise	vp	U U U
+
+vop_advise {
+	IN struct vnode *vp;
+	IN off_t start;
+	IN off_t end;
+	IN int advice;
+};

Modified: head/sys/sys/fcntl.h
==============================================================================
--- head/sys/sys/fcntl.h	Fri Nov  4 03:39:31 2011	(r227069)
+++ head/sys/sys/fcntl.h	Fri Nov  4 04:02:50 2011	(r227070)
@@ -277,9 +277,17 @@ struct oflock {
 #define	LOCK_UN		0x08		/* unlock file */
 #endif
 
+#if __POSIX_VISIBLE >= 200112
 /*
- * XXX missing posix_fadvise() and POSIX_FADV_* macros.
+ * Advice to posix_fadvise
  */
+#define	POSIX_FADV_NORMAL	0	/* no special treatment */
+#define	POSIX_FADV_RANDOM	1	/* expect random page references */
+#define	POSIX_FADV_SEQUENTIAL	2	/* expect sequential page references */
+#define	POSIX_FADV_WILLNEED	3	/* will need these pages */
+#define	POSIX_FADV_DONTNEED	4	/* dont need these pages */
+#define	POSIX_FADV_NOREUSE	5	/* access data only once */
+#endif
 
 #ifndef _KERNEL
 __BEGIN_DECLS
@@ -293,6 +301,7 @@ int	flock(int, int);
 int	openat(int, const char *, int, ...);
 #endif
 #if __BSD_VISIBLE || __POSIX_VISIBLE >= 200112
+int	posix_fadvise(int, off_t, off_t, int);
 int	posix_fallocate(int, off_t, off_t);
 #endif
 __END_DECLS

Modified: head/sys/sys/file.h
==============================================================================
--- head/sys/sys/file.h	Fri Nov  4 03:39:31 2011	(r227069)
+++ head/sys/sys/file.h	Fri Nov  4 04:02:50 2011	(r227070)
@@ -122,6 +122,12 @@ struct fileops {
  * none	not locked
  */
 
+struct fadvise_info {
+	int		fa_advice;	/* (f) FADV_* type. */
+	off_t		fa_start;	/* (f) Region start. */
+	off_t		fa_end;		/* (f) Region end. */
+};
+
 struct file {
 	void		*f_data;	/* file descriptor specific data */
 	struct fileops	*f_ops;		/* File operations */
@@ -136,7 +142,11 @@ struct file {
 	 */
 	int		f_seqcount;	/* Count of sequential accesses. */
 	off_t		f_nextoff;	/* next expected read/write offset. */
-	struct cdev_privdata *f_cdevpriv; /* (d) Private data for the cdev. */
+	union {
+		struct cdev_privdata *fvn_cdevpriv;
+					/* (d) Private data for the cdev. */
+		struct fadvise_info *fvn_advice;
+	} f_vnun;
 	/*
 	 *  DFLAG_SEEKABLE specific fields
 	 */
@@ -147,6 +157,9 @@ struct file {
 	void		*f_label;	/* Place-holder for MAC label. */
 };
 
+#define	f_cdevpriv	f_vnun.fvn_cdevpriv
+#define	f_advice	f_vnun.fvn_advice
+
 #define	FOFFSET_LOCKED       0x1
 #define	FOFFSET_LOCK_WAITING 0x2		 
 

Modified: head/sys/sys/param.h
==============================================================================
--- head/sys/sys/param.h	Fri Nov  4 03:39:31 2011	(r227069)
+++ head/sys/sys/param.h	Fri Nov  4 04:02:50 2011	(r227070)
@@ -58,7 +58,7 @@
  *		in the range 5 to 9.
  */
 #undef __FreeBSD_version
-#define __FreeBSD_version 1000000	/* Master, propagated to newvers */
+#define __FreeBSD_version 1000001	/* Master, propagated to newvers */
 
 #ifdef _KERNEL
 #define	P_OSREL_SIGWAIT		700000

Modified: head/sys/sys/vnode.h
==============================================================================
--- head/sys/sys/vnode.h	Fri Nov  4 03:39:31 2011	(r227069)
+++ head/sys/sys/vnode.h	Fri Nov  4 04:02:50 2011	(r227070)
@@ -384,6 +384,7 @@ extern int		vttoif_tab[];
 #define	V_SAVE		0x0001	/* vinvalbuf: sync file first */
 #define	V_ALT		0x0002	/* vinvalbuf: invalidate only alternate bufs */
 #define	V_NORMAL	0x0004	/* vinvalbuf: invalidate only regular bufs */
+#define	V_CLEANONLY	0x0008	/* vinvalbuf: invalidate only clean bufs */
 #define	REVOKEALL	0x0001	/* vop_revoke: revoke all aliases */
 #define	V_WAIT		0x0001	/* vn_start_write: sleep for suspend */
 #define	V_NOWAIT	0x0002	/* vn_start_write: don't sleep for suspend */
@@ -685,6 +686,7 @@ int	vop_stdunlock(struct vop_unlock_args
 int	vop_nopoll(struct vop_poll_args *);
 int	vop_stdaccess(struct vop_access_args *ap);
 int	vop_stdaccessx(struct vop_accessx_args *ap);
+int	vop_stdadvise(struct vop_advise_args *ap);
 int	vop_stdadvlock(struct vop_advlock_args *ap);
 int	vop_stdadvlockasync(struct vop_advlockasync_args *ap);
 int	vop_stdadvlockpurge(struct vop_advlockpurge_args *ap);

Modified: head/sys/vm/vm_object.c
==============================================================================
--- head/sys/vm/vm_object.c	Fri Nov  4 03:39:31 2011	(r227069)
+++ head/sys/vm/vm_object.c	Fri Nov  4 04:02:50 2011	(r227070)
@@ -1863,6 +1863,60 @@ skipmemq:
 }
 
 /*
+ *	vm_object_page_cache:
+ *
+ *	For the given object, attempt to move the specified clean
+ *	pages to the cache queue.  If a page is wired for any reason,
+ *	then it will not be changed.  Pages are specified by the given
+ *	range ["start", "end").  As a special case, if "end" is zero,
+ *	then the range extends from "start" to the end of the object.
+ *	Any mappings to the specified pages are removed before the
+ *	pages are moved to the cache queue.
+ *
+ *	This operation should only be performed on objects that
+ *	contain managed pages.
+ *
+ *	The object must be locked.
+ */
+void
+vm_object_page_cache(vm_object_t object, vm_pindex_t start, vm_pindex_t end)
+{
+	struct mtx *mtx, *new_mtx;
+	vm_page_t p, next;
+
+	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
+	KASSERT((object->type != OBJT_DEVICE && object->type != OBJT_SG &&
+	    object->type != OBJT_PHYS),
+	    ("vm_object_page_cache: illegal object %p", object));
+	if (object->resident_page_count == 0)
+		return;
+	p = vm_page_find_least(object, start);
+
+	/*
+	 * Here, the variable "p" is either (1) the page with the least pindex
+	 * greater than or equal to the parameter "start" or (2) NULL. 
+	 */
+	mtx = NULL;
+	for (; p != NULL && (p->pindex < end || end == 0); p = next) {
+		next = TAILQ_NEXT(p, listq);
+
+		/*
+		 * Avoid releasing and reacquiring the same page lock.
+		 */
+		new_mtx = vm_page_lockptr(p);
+		if (mtx != new_mtx) {
+			if (mtx != NULL)
+				mtx_unlock(mtx);
+			mtx = new_mtx;
+			mtx_lock(mtx);
+		}
+		vm_page_try_to_cache(p);
+	}
+	if (mtx != NULL)
+		mtx_unlock(mtx);
+}
+
+/*
  *	Populate the specified range of the object with valid pages.  Returns
  *	TRUE if the range is successfully populated and FALSE otherwise.
  *

Modified: head/sys/vm/vm_object.h
==============================================================================
--- head/sys/vm/vm_object.h	Fri Nov  4 03:39:31 2011	(r227069)
+++ head/sys/vm/vm_object.h	Fri Nov  4 04:02:50 2011	(r227070)
@@ -223,6 +223,8 @@ void vm_object_destroy (vm_object_t);
 void vm_object_terminate (vm_object_t);
 void vm_object_set_writeable_dirty (vm_object_t);
 void vm_object_init (void);
+void vm_object_page_cache(vm_object_t object, vm_pindex_t start,
+    vm_pindex_t end);
 void vm_object_page_clean(vm_object_t object, vm_ooffset_t start,
     vm_ooffset_t end, int flags);
 void vm_object_page_remove(vm_object_t object, vm_pindex_t start,


