Skip site navigation (1)Skip section navigation (2)
Date:      Fri, 6 Jan 2012 19:29:16 +0000 (UTC)
From:      John Baldwin <jhb@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-stable@freebsd.org, svn-src-stable-9@freebsd.org
Subject:   svn commit: r229723 - in stable/9: . lib/libc/sys sys/compat/freebsd32 sys/kern sys/sys sys/vm
Message-ID:  <201201061929.q06JTG7d004261@svn.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: jhb
Date: Fri Jan  6 19:29:16 2012
New Revision: 229723
URL: http://svn.freebsd.org/changeset/base/229723

Log:
  MFC 227070,227341,227502:
  Add the posix_fadvise(2) system call.  It is somewhat similar to
  madvise(2) except that it operates on a file descriptor instead of a
  memory region.  It is currently only supported on regular files.
  
  Note that this adds a new VOP, so all filesystem modules must be
  recompiled.
  
  Approved by:	re (kib)

Added:
  stable/9/lib/libc/sys/posix_fadvise.2
     - copied unchanged from r227070, head/lib/libc/sys/posix_fadvise.2
Modified:
  stable/9/UPDATING
  stable/9/lib/libc/sys/Makefile.inc
  stable/9/lib/libc/sys/Symbol.map
  stable/9/lib/libc/sys/madvise.2
  stable/9/sys/compat/freebsd32/freebsd32_misc.c
  stable/9/sys/compat/freebsd32/syscalls.master
  stable/9/sys/kern/syscalls.master
  stable/9/sys/kern/vfs_default.c
  stable/9/sys/kern/vfs_subr.c
  stable/9/sys/kern/vfs_syscalls.c
  stable/9/sys/kern/vfs_vnops.c
  stable/9/sys/kern/vnode_if.src
  stable/9/sys/sys/fcntl.h
  stable/9/sys/sys/file.h
  stable/9/sys/sys/param.h
  stable/9/sys/sys/syscallsubr.h
  stable/9/sys/sys/unistd.h
  stable/9/sys/sys/vnode.h
  stable/9/sys/vm/vm_object.c
  stable/9/sys/vm/vm_object.h
Directory Properties:
  stable/9/lib/libc/   (props changed)
  stable/9/lib/libc/stdtime/   (props changed)
  stable/9/sys/   (props changed)
  stable/9/sys/amd64/include/xen/   (props changed)
  stable/9/sys/boot/   (props changed)
  stable/9/sys/boot/i386/efi/   (props changed)
  stable/9/sys/boot/ia64/efi/   (props changed)
  stable/9/sys/boot/ia64/ski/   (props changed)
  stable/9/sys/boot/powerpc/boot1.chrp/   (props changed)
  stable/9/sys/boot/powerpc/ofw/   (props changed)
  stable/9/sys/cddl/contrib/opensolaris/   (props changed)
  stable/9/sys/conf/   (props changed)
  stable/9/sys/contrib/dev/acpica/   (props changed)
  stable/9/sys/contrib/octeon-sdk/   (props changed)
  stable/9/sys/contrib/pf/   (props changed)
  stable/9/sys/contrib/x86emu/   (props changed)

Modified: stable/9/UPDATING
==============================================================================
--- stable/9/UPDATING	Fri Jan  6 19:27:51 2012	(r229722)
+++ stable/9/UPDATING	Fri Jan  6 19:29:16 2012	(r229723)
@@ -10,6 +10,10 @@ Items affecting the ports and packages s
 /usr/ports/UPDATING.  Please read that file before running portupgrade.
 
 20120106:
+	A new VOP_ADVISE() was added to support posix_fadvise(2).  All
+	filesystem modules must be recompiled.
+
+20120106:
 	The interface of the VOP_VPTOCNP(9) changed, now the returned
 	vnode shall be referenced, previously it was required to be
 	only held.  All in-tree filesystems are converted.

Modified: stable/9/lib/libc/sys/Makefile.inc
==============================================================================
--- stable/9/lib/libc/sys/Makefile.inc	Fri Jan  6 19:27:51 2012	(r229722)
+++ stable/9/lib/libc/sys/Makefile.inc	Fri Jan  6 19:29:16 2012	(r229723)
@@ -96,7 +96,8 @@ MAN+=	abort2.2 accept.2 access.2 acct.2 
 	mq_setattr.2 \
 	msgctl.2 msgget.2 msgrcv.2 msgsnd.2 \
 	msync.2 munmap.2 nanosleep.2 nfssvc.2 ntp_adjtime.2 open.2 \
-	pathconf.2 pdfork.2 pipe.2 poll.2 posix_fallocate.2 posix_openpt.2 profil.2 \
+	pathconf.2 pdfork.2 pipe.2 poll.2 posix_fadvise.2 posix_fallocate.2 \
+	posix_openpt.2 profil.2 \
 	pselect.2 ptrace.2 quotactl.2 \
 	read.2 readlink.2 reboot.2 recv.2 rename.2 revoke.2 rfork.2 rmdir.2 \
 	rtprio.2

Modified: stable/9/lib/libc/sys/Symbol.map
==============================================================================
--- stable/9/lib/libc/sys/Symbol.map	Fri Jan  6 19:27:51 2012	(r229722)
+++ stable/9/lib/libc/sys/Symbol.map	Fri Jan  6 19:29:16 2012	(r229723)
@@ -378,6 +378,10 @@ FBSD_1.2 {
 	setloginclass;
 };
 
+FBSD_1.3 {
+	posix_fadvise;
+};
+
 FBSDprivate_1.0 {
 	___acl_aclcheck_fd;
 	__sys___acl_aclcheck_fd;

Modified: stable/9/lib/libc/sys/madvise.2
==============================================================================
--- stable/9/lib/libc/sys/madvise.2	Fri Jan  6 19:27:51 2012	(r229722)
+++ stable/9/lib/libc/sys/madvise.2	Fri Jan  6 19:29:16 2012	(r229723)
@@ -169,7 +169,8 @@ was specified and the process does not h
 .Xr mincore 2 ,
 .Xr mprotect 2 ,
 .Xr msync 2 ,
-.Xr munmap 2
+.Xr munmap 2 ,
+.Xr posix_fadvise 2
 .Sh STANDARDS
 The
 .Fn posix_madvise

Copied: stable/9/lib/libc/sys/posix_fadvise.2 (from r227070, head/lib/libc/sys/posix_fadvise.2)
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ stable/9/lib/libc/sys/posix_fadvise.2	Fri Jan  6 19:29:16 2012	(r229723, copy of r227070, head/lib/libc/sys/posix_fadvise.2)
@@ -0,0 +1,139 @@
+.\" Copyright (c) 1991, 1993
+.\"	The Regents of the University of California.  All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\"    notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\"    notice, this list of conditions and the following disclaimer in the
+.\"    documentation and/or other materials provided with the distribution.
+.\" 4. Neither the name of the University nor the names of its contributors
+.\"    may be used to endorse or promote products derived from this software
+.\"    without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\"	@(#)madvise.2	8.1 (Berkeley) 6/9/93
+.\" $FreeBSD$
+.\"
+.Dd October 26, 2011
+.Dt POSIX_FADVISE 2
+.Os
+.Sh NAME
+.Nm posix_fadvise
+.Nd give advice about use of file data
+.Sh LIBRARY
+.Lb libc
+.Sh SYNOPSIS
+.In fcntl.h
+.Ft int
+.Fn posix_fadvise "int fd" "off_t offset" "off_t len" "int advice"
+.Sh DESCRIPTION
+The
+.Fn posix_fadvise
+system call
+allows a process to describe to the system its data access behavior for an
+open file descriptor
+.Fa fd .
+The advice covers the data starting at offset
+.Fa offset
+and continuing for
+.Fa len
+bytes.
+If
+.Fa len
+is zero,
+all data from
+.Fa offset
+to the end of the file is covered.
+.Pp
+The behavior is specified by the
+.Fa advice
+parameter and may be one of:
+.Bl -tag -width POSIX_FADV_SEQUENTIAL
+.It Dv POSIX_FADV_NORMAL
+Tells the system to revert to the default data access behavior.
+.It Dv POSIX_FADV_RANDOM
+Is a hint that file data will be accessed randomly,
+and prefetching is likely not advantageous.
+.It Dv POSIX_FADV_SEQUENTIAL
+Tells the system that file data will be accessed sequentially.
+This currently does nothing as the default behavior uses heuristics to
+detect sequential behavior.
+.It Dv POSIX_FADV_WILLNEED
+Tells the system that the specified data will be accessed in the near future.
+The system may initiate an asychronous read of the data if it is not already
+present in memory.
+.It Dv POSIX_FADV_DONTNEED
+Tells the system that the specified data will not be accessed in the near
+future.
+The system may decrease the in-memory priority of clean data within the
+specified range and future access to this data may require a read operation.
+.It Dv POSIX_FADV_NOREUSE
+Tells the system that the specified data will only be accessed once and
+then not reused.
+Accesses to data within the specified range are treated as if the file
+descriptor has the
+.Dv O_DIRECT
+flag enabled.
+.El
+.Pp
+.Sh RETURN VALUES
+.Rv -std posix_fadvise
+.Sh ERRORS
+The
+.Fn posix_fadvise
+system call will fail if:
+.Bl -tag -width Er
+.It Bq Er EBADF
+The
+.Fa fd
+argument is not a valid file descriptor.
+.It Bq Er EINVAL
+The
+.Fa advice
+argument is not valid.
+.It Bq Er EINVAL
+The
+.Fa offset
+or
+.Fa len
+arguments are negative,
+or
+.Fa offset
++
+.Fa len
+is greater than the maximum file size.
+.It Bq Er ENODEV
+The
+.Fa fd
+argument does not refer to a regular file.
+.It Bq Er ESPIPE
+The
+.Fa fd
+argument is associated with a pipe or FIFO.
+.El
+.Sh SEE ALSO
+.Xr madvise 2
+.Sh STANDARDS
+The
+.Fn posix_fadvise
+interface conforms to
+.St -p1003.1-2001 .
+.Sh HISTORY
+The
+.Fn posix_fadvise
+system call first appeared in
+.Fx 10.0 .

Modified: stable/9/sys/compat/freebsd32/freebsd32_misc.c
==============================================================================
--- stable/9/sys/compat/freebsd32/freebsd32_misc.c	Fri Jan  6 19:27:51 2012	(r229722)
+++ stable/9/sys/compat/freebsd32/freebsd32_misc.c	Fri Jan  6 19:29:16 2012	(r229723)
@@ -2808,10 +2808,16 @@ int
 freebsd32_posix_fallocate(struct thread *td,
     struct freebsd32_posix_fallocate_args *uap)
 {
-	struct posix_fallocate_args ap;
 
-	ap.fd = uap->fd;
-	ap.offset = PAIR32TO64(off_t, uap->offset);
-	ap.len = PAIR32TO64(off_t, uap->len);
-	return (sys_posix_fallocate(td, &ap));
+	return (kern_posix_fallocate(td, uap->fd,
+	    PAIR32TO64(off_t, uap->offset), PAIR32TO64(off_t, uap->len)));
+}
+
+int
+freebsd32_posix_fadvise(struct thread *td,
+    struct freebsd32_posix_fadvise_args *uap)
+{
+
+	return (kern_posix_fadvise(td, uap->fd, PAIR32TO64(off_t, uap->offset),
+	    PAIR32TO64(off_t, uap->len), uap->advice));
 }

Modified: stable/9/sys/compat/freebsd32/syscalls.master
==============================================================================
--- stable/9/sys/compat/freebsd32/syscalls.master	Fri Jan  6 19:27:51 2012	(r229722)
+++ stable/9/sys/compat/freebsd32/syscalls.master	Fri Jan  6 19:29:16 2012	(r229723)
@@ -991,4 +991,7 @@
 530	AUE_NULL	STD	{ int freebsd32_posix_fallocate(int fd,\
 				    uint32_t offset1, uint32_t offset2,\
 				    uint32_t len1, uint32_t len2); }
-531	AUE_NULL	UNIMPL	posix_fadvise
+531	AUE_NULL	STD	{ int freebsd32_posix_fadvise(int fd, \
+				    uint32_t offset1, uint32_t offset2,\
+				    uint32_t len1, uint32_t len2, \
+				    int advice); }

Modified: stable/9/sys/kern/syscalls.master
==============================================================================
--- stable/9/sys/kern/syscalls.master	Fri Jan  6 19:27:51 2012	(r229722)
+++ stable/9/sys/kern/syscalls.master	Fri Jan  6 19:29:16 2012	(r229723)
@@ -947,6 +947,7 @@
 				    size_t outbuflen); }
 530	AUE_NULL	STD	{ int posix_fallocate(int fd, \
 				    off_t offset, off_t len); }
-531	AUE_NULL	UNIMPL	posix_fadvise
+531	AUE_NULL	STD	{ int posix_fadvise(int fd, off_t offset, \
+				    off_t len, int advice); }
 ; Please copy any additions and changes to the following compatability tables:
 ; sys/compat/freebsd32/syscalls.master

Modified: stable/9/sys/kern/vfs_default.c
==============================================================================
--- stable/9/sys/kern/vfs_default.c	Fri Jan  6 19:27:51 2012	(r229722)
+++ stable/9/sys/kern/vfs_default.c	Fri Jan  6 19:29:16 2012	(r229723)
@@ -96,6 +96,7 @@ struct vop_vector default_vnodeops = {
 
 	.vop_access =		vop_stdaccess,
 	.vop_accessx =		vop_stdaccessx,
+	.vop_advise =		vop_stdadvise,
 	.vop_advlock =		vop_stdadvlock,
 	.vop_advlockasync =	vop_stdadvlockasync,
 	.vop_advlockpurge =	vop_stdadvlockpurge,
@@ -984,6 +985,58 @@ vop_stdallocate(struct vop_allocate_args
 	return (error);
 }
 
+int
+vop_stdadvise(struct vop_advise_args *ap)
+{
+	struct vnode *vp;
+	off_t start, end;
+	int error, vfslocked;
+
+	vp = ap->a_vp;
+	switch (ap->a_advice) {
+	case POSIX_FADV_WILLNEED:
+		/*
+		 * Do nothing for now.  Filesystems should provide a
+		 * custom method which starts an asynchronous read of
+		 * the requested region.
+		 */
+		error = 0;
+		break;
+	case POSIX_FADV_DONTNEED:
+		/*
+		 * Flush any open FS buffers and then remove pages
+		 * from the backing VM object.  Using vinvalbuf() here
+		 * is a bit heavy-handed as it flushes all buffers for
+		 * the given vnode, not just the buffers covering the
+		 * requested range.
+		 */
+		error = 0;
+		vfslocked = VFS_LOCK_GIANT(vp->v_mount);
+		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+		if (vp->v_iflag & VI_DOOMED) {
+			VOP_UNLOCK(vp, 0);
+			VFS_UNLOCK_GIANT(vfslocked);
+			break;
+		}
+		vinvalbuf(vp, V_CLEANONLY, 0, 0);
+		if (vp->v_object != NULL) {
+			start = trunc_page(ap->a_start);
+			end = round_page(ap->a_end);
+			VM_OBJECT_LOCK(vp->v_object);
+			vm_object_page_cache(vp->v_object, OFF_TO_IDX(start),
+			    OFF_TO_IDX(end));
+			VM_OBJECT_UNLOCK(vp->v_object);
+		}
+		VOP_UNLOCK(vp, 0);
+		VFS_UNLOCK_GIANT(vfslocked);
+		break;
+	default:
+		error = EINVAL;
+		break;
+	}
+	return (error);
+}
+
 /*
  * vfs default ops
  * used to fill the vfs function table to get reasonable default return values.

Modified: stable/9/sys/kern/vfs_subr.c
==============================================================================
--- stable/9/sys/kern/vfs_subr.c	Fri Jan  6 19:27:51 2012	(r229722)
+++ stable/9/sys/kern/vfs_subr.c	Fri Jan  6 19:29:16 2012	(r229723)
@@ -1191,7 +1191,7 @@ bufobj_invalbuf(struct bufobj *bo, int f
 	do {
 		error = flushbuflist(&bo->bo_clean,
 		    flags, bo, slpflag, slptimeo);
-		if (error == 0)
+		if (error == 0 && !(flags & V_CLEANONLY))
 			error = flushbuflist(&bo->bo_dirty,
 			    flags, bo, slpflag, slptimeo);
 		if (error != 0 && error != EAGAIN) {
@@ -1220,7 +1220,8 @@ bufobj_invalbuf(struct bufobj *bo, int f
 	/*
 	 * Destroy the copy in the VM cache, too.
 	 */
-	if (bo->bo_object != NULL && (flags & (V_ALT | V_NORMAL)) == 0) {
+	if (bo->bo_object != NULL &&
+	    (flags & (V_ALT | V_NORMAL | V_CLEANONLY)) == 0) {
 		VM_OBJECT_LOCK(bo->bo_object);
 		vm_object_page_remove(bo->bo_object, 0, 0, (flags & V_SAVE) ?
 		    OBJPR_CLEANONLY : 0);
@@ -1229,7 +1230,7 @@ bufobj_invalbuf(struct bufobj *bo, int f
 
 #ifdef INVARIANTS
 	BO_LOCK(bo);
-	if ((flags & (V_ALT | V_NORMAL)) == 0 &&
+	if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY)) == 0 &&
 	    (bo->bo_dirty.bv_cnt > 0 || bo->bo_clean.bv_cnt > 0))
 		panic("vinvalbuf: flush failed");
 	BO_UNLOCK(bo);

Modified: stable/9/sys/kern/vfs_syscalls.c
==============================================================================
--- stable/9/sys/kern/vfs_syscalls.c	Fri Jan  6 19:27:51 2012	(r229722)
+++ stable/9/sys/kern/vfs_syscalls.c	Fri Jan  6 19:29:16 2012	(r229723)
@@ -86,6 +86,8 @@ __FBSDID("$FreeBSD$");
 #include <vm/vm_page.h>
 #include <vm/uma.h>
 
+static MALLOC_DEFINE(M_FADVISE, "fadvise", "posix_fadvise(2) information");
+
 SDT_PROVIDER_DEFINE(vfs);
 SDT_PROBE_DEFINE(vfs, , stat, mode, mode);
 SDT_PROBE_ARGTYPE(vfs, , stat, mode, 0, "char *");
@@ -4765,7 +4767,7 @@ out:
 	return (error);
 }
 
-static int
+int
 kern_posix_fallocate(struct thread *td, int fd, off_t offset, off_t len)
 {
 	struct file *fp;
@@ -4859,3 +4861,140 @@ sys_posix_fallocate(struct thread *td, s
 
 	return (kern_posix_fallocate(td, uap->fd, uap->offset, uap->len));
 }
+
+/*
+ * Unlike madvise(2), we do not make a best effort to remember every
+ * possible caching hint.  Instead, we remember the last setting with
+ * the exception that we will allow POSIX_FADV_NORMAL to adjust the
+ * region of any current setting.
+ */
+int
+kern_posix_fadvise(struct thread *td, int fd, off_t offset, off_t len,
+    int advice)
+{
+	struct fadvise_info *fa, *new;
+	struct file *fp;
+	struct vnode *vp;
+	off_t end;
+	int error;
+
+	if (offset < 0 || len < 0 || offset > OFF_MAX - len)
+		return (EINVAL);
+	switch (advice) {
+	case POSIX_FADV_SEQUENTIAL:
+	case POSIX_FADV_RANDOM:
+	case POSIX_FADV_NOREUSE:
+		new = malloc(sizeof(*fa), M_FADVISE, M_WAITOK);
+		break;
+	case POSIX_FADV_NORMAL:
+	case POSIX_FADV_WILLNEED:
+	case POSIX_FADV_DONTNEED:
+		new = NULL;
+		break;
+	default:
+		return (EINVAL);
+	}
+	/* XXX: CAP_POSIX_FADVISE? */
+	error = fget(td, fd, 0, &fp);
+	if (error != 0)
+		goto out;
+	
+	switch (fp->f_type) {
+	case DTYPE_VNODE:
+		break;
+	case DTYPE_PIPE:
+	case DTYPE_FIFO:
+		error = ESPIPE;
+		goto out;
+	default:
+		error = ENODEV;
+		goto out;
+	}
+	vp = fp->f_vnode;
+	if (vp->v_type != VREG) {
+		error = ENODEV;
+		goto out;
+	}
+	if (len == 0)
+		end = OFF_MAX;
+	else
+		end = offset + len - 1;
+	switch (advice) {
+	case POSIX_FADV_SEQUENTIAL:
+	case POSIX_FADV_RANDOM:
+	case POSIX_FADV_NOREUSE:
+		/*
+		 * Try to merge any existing non-standard region with
+		 * this new region if possible, otherwise create a new
+		 * non-standard region for this request.
+		 */
+		mtx_pool_lock(mtxpool_sleep, fp);
+		fa = fp->f_advice;
+		if (fa != NULL && fa->fa_advice == advice &&
+		    ((fa->fa_start <= end && fa->fa_end >= offset) ||
+		    (end != OFF_MAX && fa->fa_start == end + 1) ||
+		    (fa->fa_end != OFF_MAX && fa->fa_end + 1 == offset))) {
+			if (offset < fa->fa_start)
+				fa->fa_start = offset;
+			if (end > fa->fa_end)
+				fa->fa_end = end;
+		} else {
+			new->fa_advice = advice;
+			new->fa_start = offset;
+			new->fa_end = end;
+			fp->f_advice = new;
+			new = fa;
+		}
+		mtx_pool_unlock(mtxpool_sleep, fp);
+		break;
+	case POSIX_FADV_NORMAL:
+		/*
+		 * If a the "normal" region overlaps with an existing
+		 * non-standard region, trim or remove the
+		 * non-standard region.
+		 */
+		mtx_pool_lock(mtxpool_sleep, fp);
+		fa = fp->f_advice;
+		if (fa != NULL) {
+			if (offset <= fa->fa_start && end >= fa->fa_end) {
+				new = fa;
+				fp->f_advice = NULL;
+			} else if (offset <= fa->fa_start &&
+ 			    end >= fa->fa_start)
+				fa->fa_start = end + 1;
+			else if (offset <= fa->fa_end && end >= fa->fa_end)
+				fa->fa_end = offset - 1;
+			else if (offset >= fa->fa_start && end <= fa->fa_end) {
+				/*
+				 * If the "normal" region is a middle
+				 * portion of the existing
+				 * non-standard region, just remove
+				 * the whole thing rather than picking
+				 * one side or the other to
+				 * preserve.
+				 */
+				new = fa;
+				fp->f_advice = NULL;
+			}
+		}
+		mtx_pool_unlock(mtxpool_sleep, fp);
+		break;
+	case POSIX_FADV_WILLNEED:
+	case POSIX_FADV_DONTNEED:
+		error = VOP_ADVISE(vp, offset, end, advice);
+		break;
+	}
+out:
+	if (fp != NULL)
+		fdrop(fp, td);
+	free(new, M_FADVISE);
+	return (error);
+}
+
+int
+sys_posix_fadvise(struct thread *td, struct posix_fadvise_args *uap)
+{
+
+	return (kern_posix_fadvise(td, uap->fd, uap->offset, uap->len,
+	    uap->advice));
+}

Modified: stable/9/sys/kern/vfs_vnops.c
==============================================================================
--- stable/9/sys/kern/vfs_vnops.c	Fri Jan  6 19:27:51 2012	(r229722)
+++ stable/9/sys/kern/vfs_vnops.c	Fri Jan  6 19:29:16 2012	(r229723)
@@ -518,7 +518,7 @@ vn_read(fp, uio, active_cred, flags, td)
 	struct vnode *vp;
 	int error, ioflag;
 	struct mtx *mtxp;
-	int vfslocked;
+	int advice, vfslocked;
 
 	KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
 	    uio->uio_td, td));
@@ -529,27 +529,48 @@ vn_read(fp, uio, active_cred, flags, td)
 		ioflag |= IO_NDELAY;
 	if (fp->f_flag & O_DIRECT)
 		ioflag |= IO_DIRECT;
+	advice = POSIX_FADV_NORMAL;
 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 	/*
 	 * According to McKusick the vn lock was protecting f_offset here.
 	 * It is now protected by the FOFFSET_LOCKED flag.
 	 */
-	if ((flags & FOF_OFFSET) == 0) {
+	if ((flags & FOF_OFFSET) == 0 || fp->f_advice != NULL) {
 		mtxp = mtx_pool_find(mtxpool_sleep, fp);
 		mtx_lock(mtxp);
-		while(fp->f_vnread_flags & FOFFSET_LOCKED) {
-			fp->f_vnread_flags |= FOFFSET_LOCK_WAITING;
-			msleep(&fp->f_vnread_flags, mtxp, PUSER -1,
-			    "vnread offlock", 0);
+		if ((flags & FOF_OFFSET) == 0) {
+			while (fp->f_vnread_flags & FOFFSET_LOCKED) {
+				fp->f_vnread_flags |= FOFFSET_LOCK_WAITING;
+				msleep(&fp->f_vnread_flags, mtxp, PUSER -1,
+				    "vnread offlock", 0);
+			}
+			fp->f_vnread_flags |= FOFFSET_LOCKED;
+			uio->uio_offset = fp->f_offset;
 		}
-		fp->f_vnread_flags |= FOFFSET_LOCKED;
+		if (fp->f_advice != NULL &&
+		    uio->uio_offset >= fp->f_advice->fa_start &&
+		    uio->uio_offset + uio->uio_resid <= fp->f_advice->fa_end)
+			advice = fp->f_advice->fa_advice;
 		mtx_unlock(mtxp);
-		vn_lock(vp, LK_SHARED | LK_RETRY);
-		uio->uio_offset = fp->f_offset;
-	} else
-		vn_lock(vp, LK_SHARED | LK_RETRY);
+	}
+	vn_lock(vp, LK_SHARED | LK_RETRY);
 
-	ioflag |= sequential_heuristic(uio, fp);
+	switch (advice) {
+	case POSIX_FADV_NORMAL:
+	case POSIX_FADV_SEQUENTIAL:
+		ioflag |= sequential_heuristic(uio, fp);
+		break;
+	case POSIX_FADV_RANDOM:
+		/* Disable read-ahead for random I/O. */
+		break;
+	case POSIX_FADV_NOREUSE:
+		/*
+		 * Request the underlying FS to discard the buffers
+		 * and pages after the I/O is complete.
+		 */
+		ioflag |= IO_DIRECT;
+		break;
+	}
 
 #ifdef MAC
 	error = mac_vnode_check_read(active_cred, fp->f_cred, vp);
@@ -584,7 +605,8 @@ vn_write(fp, uio, active_cred, flags, td
 	struct vnode *vp;
 	struct mount *mp;
 	int error, ioflag, lock_flags;
-	int vfslocked;
+	struct mtx *mtxp;
+	int advice, vfslocked;
 
 	KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
 	    uio->uio_td, td));
@@ -618,7 +640,33 @@ vn_write(fp, uio, active_cred, flags, td
 	vn_lock(vp, lock_flags | LK_RETRY);
 	if ((flags & FOF_OFFSET) == 0)
 		uio->uio_offset = fp->f_offset;
-	ioflag |= sequential_heuristic(uio, fp);
+	advice = POSIX_FADV_NORMAL;
+	if (fp->f_advice != NULL) {
+		mtxp = mtx_pool_find(mtxpool_sleep, fp);
+		mtx_lock(mtxp);
+		if (fp->f_advice != NULL &&
+		    uio->uio_offset >= fp->f_advice->fa_start &&
+		    uio->uio_offset + uio->uio_resid <= fp->f_advice->fa_end)
+			advice = fp->f_advice->fa_advice;
+		mtx_unlock(mtxp);
+	}
+	switch (advice) {
+	case POSIX_FADV_NORMAL:
+	case POSIX_FADV_SEQUENTIAL:
+		ioflag |= sequential_heuristic(uio, fp);
+		break;
+	case POSIX_FADV_RANDOM:
+		/* XXX: Is this correct? */
+		break;
+	case POSIX_FADV_NOREUSE:
+		/*
+		 * Request the underlying FS to discard the buffers
+		 * and pages after the I/O is complete.
+		 */
+		ioflag |= IO_DIRECT;
+		break;
+	}
+
 #ifdef MAC
 	error = mac_vnode_check_write(active_cred, fp->f_cred, vp);
 	if (error == 0)

Modified: stable/9/sys/kern/vnode_if.src
==============================================================================
--- stable/9/sys/kern/vnode_if.src	Fri Jan  6 19:27:51 2012	(r229722)
+++ stable/9/sys/kern/vnode_if.src	Fri Jan  6 19:29:16 2012	(r229723)
@@ -628,3 +628,12 @@ vop_allocate {
 	INOUT off_t *offset;
 	INOUT off_t *len;
 };
+
+%% advise	vp	U U U
+
+vop_advise {
+	IN struct vnode *vp;
+	IN off_t start;
+	IN off_t end;
+	IN int advice;
+};

Modified: stable/9/sys/sys/fcntl.h
==============================================================================
--- stable/9/sys/sys/fcntl.h	Fri Jan  6 19:27:51 2012	(r229722)
+++ stable/9/sys/sys/fcntl.h	Fri Jan  6 19:29:16 2012	(r229723)
@@ -277,9 +277,17 @@ struct oflock {
 #define	LOCK_UN		0x08		/* unlock file */
 #endif
 
+#if __POSIX_VISIBLE >= 200112
 /*
- * XXX missing posix_fadvise() and POSIX_FADV_* macros.
+ * Advice to posix_fadvise
  */
+#define	POSIX_FADV_NORMAL	0	/* no special treatment */
+#define	POSIX_FADV_RANDOM	1	/* expect random page references */
+#define	POSIX_FADV_SEQUENTIAL	2	/* expect sequential page references */
+#define	POSIX_FADV_WILLNEED	3	/* will need these pages */
+#define	POSIX_FADV_DONTNEED	4	/* dont need these pages */
+#define	POSIX_FADV_NOREUSE	5	/* access data only once */
+#endif
 
 #ifndef _KERNEL
 __BEGIN_DECLS
@@ -290,6 +298,7 @@ int	fcntl(int, int, ...);
 int	openat(int, const char *, int, ...);
 #endif
 #if __BSD_VISIBLE || __POSIX_VISIBLE >= 200112
+int	posix_fadvise(int, off_t, off_t, int);
 int	posix_fallocate(int, off_t, off_t);
 #endif
 #if __BSD_VISIBLE

Modified: stable/9/sys/sys/file.h
==============================================================================
--- stable/9/sys/sys/file.h	Fri Jan  6 19:27:51 2012	(r229722)
+++ stable/9/sys/sys/file.h	Fri Jan  6 19:29:16 2012	(r229723)
@@ -122,6 +122,12 @@ struct fileops {
  * none	not locked
  */
 
+struct fadvise_info {
+	int		fa_advice;	/* (f) FADV_* type. */
+	off_t		fa_start;	/* (f) Region start. */
+	off_t		fa_end;		/* (f) Region end. */
+};
+
 struct file {
 	void		*f_data;	/* file descriptor specific data */
 	struct fileops	*f_ops;		/* File operations */
@@ -136,7 +142,11 @@ struct file {
 	 */
 	int		f_seqcount;	/* Count of sequential accesses. */
 	off_t		f_nextoff;	/* next expected read/write offset. */
-	struct cdev_privdata *f_cdevpriv; /* (d) Private data for the cdev. */
+	union {
+		struct cdev_privdata *fvn_cdevpriv;
+					/* (d) Private data for the cdev. */
+		struct fadvise_info *fvn_advice;
+	} f_vnun;
 	/*
 	 *  DFLAG_SEEKABLE specific fields
 	 */
@@ -147,6 +157,9 @@ struct file {
 	void		*f_label;	/* Place-holder for MAC label. */
 };
 
+#define	f_cdevpriv	f_vnun.fvn_cdevpriv
+#define	f_advice	f_vnun.fvn_advice
+
 #define	FOFFSET_LOCKED       0x1
 #define	FOFFSET_LOCK_WAITING 0x2		 
 

Modified: stable/9/sys/sys/param.h
==============================================================================
--- stable/9/sys/sys/param.h	Fri Jan  6 19:27:51 2012	(r229722)
+++ stable/9/sys/sys/param.h	Fri Jan  6 19:29:16 2012	(r229723)
@@ -58,7 +58,7 @@
  *		in the range 5 to 9.
  */
 #undef __FreeBSD_version
-#define __FreeBSD_version 900500	/* Master, propagated to newvers */
+#define __FreeBSD_version 900501	/* Master, propagated to newvers */
 
 #ifdef _KERNEL
 #define	P_OSREL_SIGWAIT		700000

Modified: stable/9/sys/sys/syscallsubr.h
==============================================================================
--- stable/9/sys/sys/syscallsubr.h	Fri Jan  6 19:27:51 2012	(r229722)
+++ stable/9/sys/sys/syscallsubr.h	Fri Jan  6 19:29:16 2012	(r229723)
@@ -153,6 +153,10 @@ int	kern_openat(struct thread *td, int f
 int	kern_pathconf(struct thread *td, char *path, enum uio_seg pathseg,
 	    int name, u_long flags);
 int	kern_pipe(struct thread *td, int fildes[2]);
+int	kern_posix_fadvise(struct thread *td, int fd, off_t offset, off_t len,
+	    int advice);
+int	kern_posix_fallocate(struct thread *td, int fd, off_t offset,
+	    off_t len);
 int	kern_preadv(struct thread *td, int fd, struct uio *auio, off_t offset);
 int	kern_pselect(struct thread *td, int nd, fd_set *in, fd_set *ou,
 	    fd_set *ex, struct timeval *tvp, sigset_t *uset, int abi_nfdbits);

Modified: stable/9/sys/sys/unistd.h
==============================================================================
--- stable/9/sys/sys/unistd.h	Fri Jan  6 19:27:51 2012	(r229722)
+++ stable/9/sys/sys/unistd.h	Fri Jan  6 19:29:16 2012	(r229723)
@@ -49,7 +49,7 @@
  * the POSIX standard; however, if the relevant sysconf() function
  * returns -1, the functions may be stubbed out.
  */
-#define	_POSIX_ADVISORY_INFO		-1
+#define	_POSIX_ADVISORY_INFO		200112L
 #define	_POSIX_ASYNCHRONOUS_IO		0
 #define	_POSIX_CHOWN_RESTRICTED		1
 #define	_POSIX_CLOCK_SELECTION		-1

Modified: stable/9/sys/sys/vnode.h
==============================================================================
--- stable/9/sys/sys/vnode.h	Fri Jan  6 19:27:51 2012	(r229722)
+++ stable/9/sys/sys/vnode.h	Fri Jan  6 19:29:16 2012	(r229723)
@@ -384,6 +384,7 @@ extern int		vttoif_tab[];
 #define	V_SAVE		0x0001	/* vinvalbuf: sync file first */
 #define	V_ALT		0x0002	/* vinvalbuf: invalidate only alternate bufs */
 #define	V_NORMAL	0x0004	/* vinvalbuf: invalidate only regular bufs */
+#define	V_CLEANONLY	0x0008	/* vinvalbuf: invalidate only clean bufs */
 #define	REVOKEALL	0x0001	/* vop_revoke: revoke all aliases */
 #define	V_WAIT		0x0001	/* vn_start_write: sleep for suspend */
 #define	V_NOWAIT	0x0002	/* vn_start_write: don't sleep for suspend */
@@ -685,6 +686,7 @@ int	vop_stdunlock(struct vop_unlock_args
 int	vop_nopoll(struct vop_poll_args *);
 int	vop_stdaccess(struct vop_access_args *ap);
 int	vop_stdaccessx(struct vop_accessx_args *ap);
+int	vop_stdadvise(struct vop_advise_args *ap);
 int	vop_stdadvlock(struct vop_advlock_args *ap);
 int	vop_stdadvlockasync(struct vop_advlockasync_args *ap);
 int	vop_stdadvlockpurge(struct vop_advlockpurge_args *ap);

Modified: stable/9/sys/vm/vm_object.c
==============================================================================
--- stable/9/sys/vm/vm_object.c	Fri Jan  6 19:27:51 2012	(r229722)
+++ stable/9/sys/vm/vm_object.c	Fri Jan  6 19:29:16 2012	(r229723)
@@ -1878,6 +1878,60 @@ skipmemq:
 }
 
 /*
+ *	vm_object_page_cache:
+ *
+ *	For the given object, attempt to move the specified clean
+ *	pages to the cache queue.  If a page is wired for any reason,
+ *	then it will not be changed.  Pages are specified by the given
+ *	range ["start", "end").  As a special case, if "end" is zero,
+ *	then the range extends from "start" to the end of the object.
+ *	Any mappings to the specified pages are removed before the
+ *	pages are moved to the cache queue.
+ *
+ *	This operation should only be performed on objects that
+ *	contain managed pages.
+ *
+ *	The object must be locked.
+ */
+void
+vm_object_page_cache(vm_object_t object, vm_pindex_t start, vm_pindex_t end)
+{
+	struct mtx *mtx, *new_mtx;
+	vm_page_t p, next;
+
+	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
+	KASSERT((object->type != OBJT_DEVICE && object->type != OBJT_SG &&
+	    object->type != OBJT_PHYS),
+	    ("vm_object_page_cache: illegal object %p", object));
+	if (object->resident_page_count == 0)
+		return;
+	p = vm_page_find_least(object, start);
+
+	/*
+	 * Here, the variable "p" is either (1) the page with the least pindex
+	 * greater than or equal to the parameter "start" or (2) NULL. 
+	 */
+	mtx = NULL;
+	for (; p != NULL && (p->pindex < end || end == 0); p = next) {
+		next = TAILQ_NEXT(p, listq);
+
+		/*
+		 * Avoid releasing and reacquiring the same page lock.
+		 */
+		new_mtx = vm_page_lockptr(p);
+		if (mtx != new_mtx) {
+			if (mtx != NULL)
+				mtx_unlock(mtx);
+			mtx = new_mtx;
+			mtx_lock(mtx);
+		}
+		vm_page_try_to_cache(p);
+	}
+	if (mtx != NULL)
+		mtx_unlock(mtx);
+}
+
+/*
  *	Populate the specified range of the object with valid pages.  Returns
  *	TRUE if the range is successfully populated and FALSE otherwise.
  *

Modified: stable/9/sys/vm/vm_object.h
==============================================================================
--- stable/9/sys/vm/vm_object.h	Fri Jan  6 19:27:51 2012	(r229722)
+++ stable/9/sys/vm/vm_object.h	Fri Jan  6 19:29:16 2012	(r229723)
@@ -224,6 +224,8 @@ void vm_object_destroy (vm_object_t);
 void vm_object_terminate (vm_object_t);
 void vm_object_set_writeable_dirty (vm_object_t);
 void vm_object_init (void);
+void vm_object_page_cache(vm_object_t object, vm_pindex_t start,
+    vm_pindex_t end);
 void vm_object_page_clean(vm_object_t object, vm_ooffset_t start,
     vm_ooffset_t end, int flags);
 void vm_object_page_remove(vm_object_t object, vm_pindex_t start,



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201201061929.q06JTG7d004261>